beauvoir 0.0.2c → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +2 -2
- data/lib/beauvoir.rb +95 -96
- data/lib/beauvoir/name.rb +70 -0
- data/lib/beauvoir/statistics.rb +48 -0
- metadata +10 -8
- data/lib/name.rb +0 -95
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 37155ed7c902ddc28438c6b4ad54b6cb83bad8e0
|
4
|
+
data.tar.gz: de8d1e463cc33fb728718a3383b5d0cc4c2eaa1b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1e06f1a247c296122b17670309b74b5d6230bc08a5a4a609ab57a583bdc64726bc2ef4cf7fe82d985dd7c910b895f967c5a71ccd488de8003488180df5f80299
|
7
|
+
data.tar.gz: 94c3c3c697c23f0b43fdb49d4688689a02913c8db81eb46e7482e38cf850c0c76fcc01e92670400647cd92ee3e89b1c111fed1f78470e1d7e13adb09be076f51
|
data/README.md
CHANGED
@@ -12,9 +12,9 @@ This is pre-alpha software. The API will change, I guarantee it.
|
|
12
12
|
Caveats
|
13
13
|
-------
|
14
14
|
|
15
|
-
It's important to note that many people identify as neither a
|
15
|
+
It's important to note that many people identify as neither a man nor a woman. It's important, too, to note that many people who do identify as male or female have names for which most other people with that name identify as a different gender. All of these people deserve not to be misgendered.
|
16
16
|
|
17
|
-
Nevertheless, automatically classifying people by apparent gender can be a very useful tool to perform censuses of communities or publications to detect and quantify perhaps-invisible bias. VIDA is a pioneer in performing theses censuses, but their
|
17
|
+
Nevertheless, automatically classifying people by apparent gender can be a very useful tool to perform censuses of communities or publications to detect and quantify perhaps-invisible bias. VIDA is a pioneer in performing these censuses, but their "Count" is limited by a manual methodology that depends on hundreds of person-hours of labor. There is a place for more automated counts and Beauvoir can help, but if you plan to publish a count like this, you should be careful. Beauvoir's confidence thresholds are set very high by default on purpose. You shouldn't lower them unless you take other steps to make sure that you're very unlikely to misgender someone, and you should also be prepared to be responsive and respectful if you do. You should include your methodology, prominently. You might also consider emphasizing aggregate numbers over your mapping of individual people's names to genders.
|
18
18
|
|
19
19
|
Usage
|
20
20
|
-----
|
data/lib/beauvoir.rb
CHANGED
@@ -1,123 +1,122 @@
|
|
1
1
|
require 'csv'
|
2
2
|
require 'set'
|
3
|
-
require_relative './
|
3
|
+
require_relative './beauvoir/statistics'
|
4
|
+
require_relative './beauvoir/name'
|
4
5
|
|
5
|
-
|
6
|
-
|
7
|
-
|
6
|
+
module Beauvoir
|
7
|
+
class Categorizer
|
8
|
+
DEFAULT_PROPORTION_THRESHOLD = 0.99
|
9
|
+
DEFAULT_LOWER_CONFIDENCE_BOUND = 0.75
|
8
10
|
|
9
11
|
|
10
|
-
|
11
|
-
|
12
|
-
|
12
|
+
# these aren't writable once a Beauvoir is initialized, since their
|
13
|
+
# value is baked into Beauvoir's internal judgments of gender.
|
14
|
+
attr_reader :threshold, :lower_confidence_bound, :names_by_names, :names_genders
|
13
15
|
|
14
|
-
|
15
|
-
|
16
|
+
# Build a categorizer by loading the bundled per-country name counts and
# pre-computing a gender guess for every name.
#
# options:
#   :threshold              - minimum raw proportion for a confident guess
#                             (defaults to DEFAULT_PROPORTION_THRESHOLD)
#   :lower_confidence_bound - minimum lower confidence bound for a guess
#                             (defaults to DEFAULT_LOWER_CONFIDENCE_BOUND)
#   :country                - a single country code (String or Symbol)
#   :countries              - a list of country codes
#
# Raises ArgumentError when both :country and :countries are given.
def initialize(options={})
  countries = Set.new([:us, :uk])

  @threshold = options[:threshold] || DEFAULT_PROPORTION_THRESHOLD

  # TODO: what should this be in the default case? (0, i.e. ignore the lower bound?, some sensical value to
  # exclude a naive user from getting back nonsense? the bare minimum value for a loose significance level?)
  @lower_confidence_bound = options[:lower_confidence_bound] || DEFAULT_LOWER_CONFIDENCE_BOUND

  @names_by_names = {}
  # @country_totals = {}
  @names_genders = {}

  # Requested countries are intersected with the supported set, so an
  # unsupported code silently contributes no data.
  if options[:country] && !options[:countries]
    countries &= Set.new([options[:country].to_sym])
  elsif options[:countries] && !options[:country]
    countries &= Set.new(options[:countries].map(&:to_sym))
  elsif options[:countries] && options[:country]
    raise ArgumentError, "Specify either :country or :countries, not both."
  end

  #TODO: consider "piecewise" loading with stashing of already-loaded names
  # to avoid ~10sec delay when loading into memory
  #(e.g. seeking around the file?)
  data_dir = File.dirname(File.expand_path(__FILE__))
  countries.each do |country|
    csv_path = File.join(data_dir, "data/#{country}processed.csv")
    # CSV.foreach closes the file when iteration finishes; the previous
    # CSV.open(...).each form left the handle open.
    CSV.foreach(csv_path, :headers => true) do |row|
      name_str = Beauvoir::Categorizer.normalize(row["Name"])
      # Only allocate a Name the first time a normalized name is seen, so
      # counts from several countries accumulate on one object.
      name = (@names_by_names[name_str] ||= Name.new(name_str))
      name.male_count += row["count.male"].to_i
      name.female_count += row["count.female"].to_i
    end
  end

  # Thresholds are baked in here: each name's guess is computed once.
  @names_by_names.each_value do |name|
    @names_genders[name.name] = name.guess_gender(@threshold, @lower_confidence_bound)
  end
  self
end
|
56
|
-
self
|
57
|
-
end
|
58
56
|
|
59
|
-
|
60
|
-
(name.male_proportion > @threshold || name.female_proportion > @threshold) &&
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
57
|
+
# def sufficiently_confident(name)
|
58
|
+
# (name.male_proportion > @threshold || name.female_proportion > @threshold) &&
|
59
|
+
# name.lower > @lower_confidence_bound
|
60
|
+
# end
|
61
|
+
|
62
|
+
#
|
63
|
+
# Transform any name-like string into an unpadded, initial-cased first name.
|
64
|
+
# Should be a surjection, mapping many possible inputs (e.g. "Jeremy", "Jeremy.", "JEREMY", "Jeremy B. Merrill")
|
65
|
+
# onto one single name.
|
66
|
+
# This is used for two things:
|
67
|
+
# 1. Accepting differently-formatted/tokenized names from the user.
|
68
|
+
# 2. Dealing with differently-formatted names from the source agencies (e.g. "Mckinley" v. "McKinley", "Obrien", vs. "O'brien")
|
69
|
+
#
|
70
|
+
def self.normalize(name)
  # Work on a copy: String#delete (unlike tr!) never mutates the caller's
  # string, so frozen input and CSV row fields are left untouched.
  # Everything except ASCII letters, apostrophes, spaces and hyphens is dropped.
  cleaned = name.to_s.delete("^A-Za-z' \\-")
  # name.gsub!(/[^A-Za-z \-\']+/, '') #this I suspect is done more efficiently with String#tr

  # Keep only the first space-delimited token (the first name); leading
  # padding is skipped rather than yielding an empty token.
  first = cleaned.split(' ').first

  # Nothing name-like survived (e.g. "", "1234"): return an empty string,
  # which simply matches no known name, instead of crashing on nil.
  return '' if first.nil?

  first[0].upcase + first[1..-1].downcase
end
|
78
|
-
name[0].upcase + name[1..-1].downcase
|
79
|
-
end
|
80
78
|
|
81
|
-
|
82
|
-
|
83
|
-
|
79
|
+
# Look up the pre-computed gender guess for a name-like string.
# The input is normalized first; names with no stored guess yield :unknown.
def guess(name)
  key = Beauvoir::Categorizer.normalize(name)
  return :unknown unless @names_genders.key?(key)
  @names_genders[key]
end
|
84
82
|
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
83
|
+
# Return the stored Name's estimated_male_value for a name-like string,
# or nil when the normalized name has no entry.
def estimated_male_value(name)
  entry = @names_by_names[Beauvoir::Categorizer.normalize(name)]
  return nil unless entry
  entry.estimated_male_value
end
|
91
|
-
end
|
92
90
|
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
91
|
+
# Return the stored Name's estimated_female_value for a name-like string,
# or nil when the normalized name has no entry.
def estimated_female_value(name)
  entry = @names_by_names[Beauvoir::Categorizer.normalize(name)]
  return nil unless entry
  entry.estimated_female_value
end
|
99
|
-
end
|
100
98
|
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
99
|
+
# Return the stored Name's raw_male_proportion for a name-like string,
# or nil when the normalized name has no entry.
def raw_male_proportion(name)
  entry = @names_by_names[Beauvoir::Categorizer.normalize(name)]
  return nil unless entry
  entry.raw_male_proportion
end
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
106
|
+
# Return the stored Name's raw_female_proportion for a name-like string,
# or nil when the normalized name has no entry.
def raw_female_proportion(name)
  entry = @names_by_names[Beauvoir::Categorizer.normalize(name)]
  return nil unless entry
  entry.raw_female_proportion
end
|
114
|
-
end
|
115
113
|
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
114
|
+
def inspect
|
115
|
+
inspect_string = "#<#{self.class.name}:0x#{(self.object_id*2).to_s(16)} "
|
116
|
+
exclude = [:@names_by_names, :@names_genders]
|
117
|
+
fields = self.instance_variables - exclude
|
118
|
+
inspect_string << fields.map{|field| "#{field}=#{instance_variable_get(field)}"}.join(", ") << ">"
|
119
|
+
inspect_string
|
120
|
+
end
|
122
121
|
end
|
123
122
|
end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
|
2
|
+
module Beauvoir
  # Running tally of how often one first name was recorded as male and as
  # female, plus the logic that turns that tally into a gender guess.
  # The confidence-interval helpers come from Beauvoir::Statistics.
  class Name
    include Beauvoir::Statistics

    # Defaults for guess_gender / sufficiently_confident. They must be
    # resolvable from this class: constants defined only on
    # Beauvoir::Categorizer are not visible here, so relying on them raised
    # NameError whenever the defaults were actually used. The values match
    # Categorizer's defaults (0.99 and 0.75).
    DEFAULT_PROPORTION_THRESHOLD = 0.99
    DEFAULT_LOWER_CONFIDENCE_BOUND = 0.75

    attr_accessor :male_count, :female_count, :name

    # name    - the name this tally belongs to
    # options - stored as-is; not otherwise used by this class yet
    def initialize(name, options={})
      # default_options = {
      #   :significance_level => 0.95,
      # }
      @options = options #default_options.merge(options)

      @male_count = 0
      @female_count = 0
      @name = name
      # @significance_level = @options[:significance_level]
    end

    # Returns :male or :female when the tally is confident enough under the
    # given thresholds, otherwise :unknown.
    def guess_gender(threshold=DEFAULT_PROPORTION_THRESHOLD, lower_confidence_bound=DEFAULT_LOWER_CONFIDENCE_BOUND)
      if sufficiently_confident(threshold, lower_confidence_bound)
        gender
      else
        :unknown
      end
    end

    # Share of recorded occurrences that were female (0 when nothing is recorded).
    def raw_female_proportion
      return 0 unless self.total > 0
      @female_count / self.total
    end

    # Share of recorded occurrences that were male (0 when nothing is recorded).
    def raw_male_proportion
      return 0 unless self.total > 0
      @male_count / self.total
    end

    # Total recorded occurrences, as a Float so the proportions above use
    # floating-point division.
    def total
      (@male_count + @female_count).to_f
    end

    private
    # These methods are private for a reason.
    # You should use the guess_gender method instead.
    # (See README.md for more discussion.)
    def female?
      #pure proportions, so even the slightest greater proportion of one gender will affect this
      @female_count > @male_count
    end

    def male?
      #pure proportions, so even the slightest greater proportion of one gender will affect this
      @male_count > @female_count
    end

    # Majority gender by raw count; :unknown on an exact tie.
    def gender
      if female?
        :female
      elsif male?
        :male
      else
        :unknown
      end
    end

    # True when one gender's raw proportion exceeds +threshold+ AND the lower
    # confidence bound (Statistics#lower) exceeds +lower_confidence_bound+.
    def sufficiently_confident(threshold=DEFAULT_PROPORTION_THRESHOLD, lower_confidence_bound=DEFAULT_LOWER_CONFIDENCE_BOUND)
      (raw_male_proportion > threshold || raw_female_proportion > threshold) &&
        lower > lower_confidence_bound
    end
  end
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
require_relative './statistics'
|
2
|
+
|
3
|
+
module Beauvoir
  # Confidence-interval helpers mixed into objects that expose
  # @male_count, @female_count, #total, #raw_male_proportion and
  # #raw_female_proportion.
  #
  # Based on the Agresti-Coull estimated value and binomial confidence
  # interval, via:
  #  - http://codesequoia.wordpress.com/2010/12/06/unit-test-and-statistics/
  #  - http://stackoverflow.com/questions/3749125/how-should-i-order-these-helpful-scores/3752941#3752941
  module Statistics
    # Critical value used for the interval; 1.96 corresponds to a 95%
    # confidence level.
    MAGIC_STATISTICS_NUMBER = 1.96

    # The z value used by every formula below (currently a fixed constant).
    def z
      # TODO: https://github.com/clbustos/statsample/blob/1168d58b14a5095af0a639b4843b31433d40f105/lib/statsample/srs.rb
      #@significance_level #do stuff with this.
      MAGIC_STATISTICS_NUMBER
    end

    # Adjusted estimate of the female proportion.
    def estimated_female_value
      estimated_value_formula(@female_count)
    end

    # Adjusted estimate of the male proportion.
    def estimated_male_value
      estimated_value_formula(@male_count)
    end

    # Adjusted estimate for whichever gender has the larger count.
    def estimated_value
      larger_count = @male_count >= @female_count ? @male_count : @female_count
      estimated_value_formula(larger_count)
    end

    # Lower bound for the more common gender: the larger raw proportion
    # minus the interval half-width computed from the adjusted estimate.
    def lower
      z_score = z
      adjusted_total = total + z_score ** 2
      center = estimated_value
      margin = z_score * Math.sqrt(center * (1 - center) / adjusted_total)
      [raw_female_proportion, raw_male_proportion].max - margin
    end

    # (observed + z^2 / 2) / (total + z^2)
    def estimated_value_formula(observed)
      z_squared = z ** 2
      (observed + (z_squared / 2)) / (total + z_squared)
    end
  end
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: beauvoir
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jeremy B. Merrill
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2014-01-02 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: "Guess gender by a first name using more detailed, better\n sourced
|
14
14
|
data from Open Gender Tracker's Global Name Data.<br />\n Beauvoir
|
@@ -33,12 +33,13 @@ executables: []
|
|
33
33
|
extensions: []
|
34
34
|
extra_rdoc_files: []
|
35
35
|
files:
|
36
|
-
- lib/beauvoir.rb
|
37
|
-
- lib/name.rb
|
38
36
|
- LICENSE
|
39
37
|
- README.md
|
40
|
-
- lib/
|
38
|
+
- lib/beauvoir.rb
|
39
|
+
- lib/beauvoir/name.rb
|
40
|
+
- lib/beauvoir/statistics.rb
|
41
41
|
- lib/data/ukprocessed.csv
|
42
|
+
- lib/data/usprocessed.csv
|
42
43
|
homepage: http://rubygems.org/gems/beauvoir
|
43
44
|
licenses:
|
44
45
|
- MIT
|
@@ -54,13 +55,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
54
55
|
version: '0'
|
55
56
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
56
57
|
requirements:
|
57
|
-
- - '
|
58
|
+
- - '>='
|
58
59
|
- !ruby/object:Gem::Version
|
59
|
-
version:
|
60
|
+
version: '0'
|
60
61
|
requirements: []
|
61
62
|
rubyforge_project:
|
62
|
-
rubygems_version: 2.0
|
63
|
+
rubygems_version: 2.2.0
|
63
64
|
signing_key:
|
64
65
|
specification_version: 4
|
65
66
|
summary: Guess a person's gender by their first name
|
66
67
|
test_files: []
|
68
|
+
has_rdoc:
|
data/lib/name.rb
DELETED
@@ -1,95 +0,0 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
class Name
|
4
|
-
attr_accessor :male_count, :female_count, :name
|
5
|
-
|
6
|
-
def initialize(name, options={})
|
7
|
-
# default_options = {
|
8
|
-
# :significance_level => 0.95,
|
9
|
-
# }
|
10
|
-
@options = options #default_options.merge(options)
|
11
|
-
|
12
|
-
@male_count = 0
|
13
|
-
@female_count = 0
|
14
|
-
@name = name
|
15
|
-
# @significance_level = @options[:significance_level]
|
16
|
-
end
|
17
|
-
|
18
|
-
def male?
|
19
|
-
#pure proportions, so even the slightest greater proportion of one gender will affect this
|
20
|
-
@male_count > @female_count
|
21
|
-
end
|
22
|
-
|
23
|
-
def female?
|
24
|
-
@female_count > @male_count
|
25
|
-
end
|
26
|
-
|
27
|
-
def gender
|
28
|
-
if female?
|
29
|
-
:female
|
30
|
-
elsif male?
|
31
|
-
:male
|
32
|
-
else
|
33
|
-
:unknown
|
34
|
-
end
|
35
|
-
end
|
36
|
-
|
37
|
-
def female_proportion
|
38
|
-
return 0 unless self.total > 0
|
39
|
-
@female_count / self.total
|
40
|
-
end
|
41
|
-
|
42
|
-
def male_proportion
|
43
|
-
return 0 unless self.total > 0
|
44
|
-
@male_count / self.total
|
45
|
-
end
|
46
|
-
|
47
|
-
def total
|
48
|
-
(@male_count + @female_count).to_f
|
49
|
-
end
|
50
|
-
|
51
|
-
|
52
|
-
# fancy statistics!
|
53
|
-
#------------------
|
54
|
-
# implements Agresti-Coull estimated value and binomial confidence interval
|
55
|
-
# via:
|
56
|
-
# - http://codesequoia.wordpress.com/2010/12/06/unit-test-and-statistics/
|
57
|
-
# - http://stackoverflow.com/questions/3749125/how-should-i-order-these-helpful-scores/3752941#3752941
|
58
|
-
# -
|
59
|
-
#this is apparently related to alpha and related to the level of statistical significance we care about.
|
60
|
-
# 1.96 pertains to a 0.95 significance level.
|
61
|
-
#
|
62
|
-
MAGIC_STATISTICS_NUMBER = 1.96
|
63
|
-
|
64
|
-
def z
|
65
|
-
# TODO: https://github.com/clbustos/statsample/blob/1168d58b14a5095af0a639b4843b31433d40f105/lib/statsample/srs.rb
|
66
|
-
#@significance_level #do stuff with this.
|
67
|
-
MAGIC_STATISTICS_NUMBER
|
68
|
-
end
|
69
|
-
|
70
|
-
def estimated_female_value
|
71
|
-
estimated_value_formula(@female_count)
|
72
|
-
end
|
73
|
-
|
74
|
-
def estimated_male_value
|
75
|
-
estimated_value_formula(@male_count)
|
76
|
-
end
|
77
|
-
|
78
|
-
def estimated_value
|
79
|
-
estimated_value_formula([@male_count, @female_count].max)
|
80
|
-
end
|
81
|
-
|
82
|
-
# returns lower bound of higher of male/female
|
83
|
-
def lower
|
84
|
-
nt = total + z ** 2
|
85
|
-
interval = z * Math.sqrt(estimated_value * (1 - estimated_value) / nt)
|
86
|
-
[female_proportion, male_proportion].max - interval
|
87
|
-
end
|
88
|
-
|
89
|
-
private
|
90
|
-
|
91
|
-
def estimated_value_formula(observed)
|
92
|
-
nt = total + z ** 2
|
93
|
-
(observed + ((z ** 2) / 2)) / nt
|
94
|
-
end
|
95
|
-
end
|