name_gender_classifier 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 787939c630e3a02792909f4e987711b5ce2e336b515db92f349bed2ebe696309
|
4
|
+
data.tar.gz: 1ad33afd9ce5b28cdb1737345b7ba4d4b104b58fd0a85aa69d88cea2339f4403
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 54975df68d17a91d44292eac77e6ab4372c64f922ccd9ade1a2dee3ebe9b1a967f3a992d42b5b6367218a5cddbfce1559630b7013607b9189e377b46bc082afa
|
7
|
+
data.tar.gz: 52f02d8ae418868dc3f529cf5eaa357470192fb1b549bf8602797576268d7128e60fb99eafc61abc0c163e8b58ba7b87f925875048b517ae3675e3e9bec7af97
|
Binary file
|
@@ -0,0 +1,44 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'gdbm'
|
4
|
+
|
5
|
+
module NameGenderClassifier
  # Uses a GDBM database to retrieve gender classifications from {DB_NAME}.
  #
  # One handle is cached in the module-level +@gdbm+ variable. It is closed and
  # discarded after every {find} call and after every block given to {gdbm},
  # even when the lookup or the block raises.
  module DatabaseManager
    # Spec of the activated gem; nil when the library was loaded without
    # RubyGems activating it (e.g. through a bare load path).
    loaded_spec = Gem.loaded_specs['name_gender_classifier']

    # @return [String] the database location (which holds the classified names)
    DB_NAME = File.join(
      # Fallback assumes the database sits next to this file - TODO confirm
      # against the packaged layout if files are ever moved.
      loaded_spec ? "#{loaded_spec.gem_dir}/lib/name_gender_classifier" : __dir__,
      'classified_names_pt-br.db'
    ).freeze

    # Find in the database the value for a previously saved key. The key holds
    # the first name and the value the gender probability.
    #
    # The cached handle is always released afterwards, including when the
    # lookup raises.
    #
    # @param key [String, Symbol] a key to be searched in the database
    #
    # @return [Float, nil] the gender probability (value between 0 and 1, where
    #   0 <= male < 0.5 <= female <= 1) or nil when the key is not stored
    def self.find(key)
      value = gdbm[key.to_s]
      value ? value.to_f : nil
    ensure
      release
    end

    # With a block { |db| ... } allow to read multiple records with a single
    # database open request, or return the database instance for a single read
    # request.
    #
    # In block form the handle is released when the block finishes or raises.
    # Without a block the caller receives the cached, still open handle.
    #
    # @yieldparam db [GDBM] the open database instance
    # @return [GDBM, nil] the GDBM database instance or nil if used with a block
    def self.gdbm
      @gdbm ||= GDBM.new(DB_NAME)
      return @gdbm unless block_given?

      begin
        yield(@gdbm)
      ensure
        release
      end
      nil
    end

    # Close the cached handle (if any is still open) and forget it.
    #
    # The instance variable is cleared before closing so a failing close can
    # never leave a stale handle behind.
    #
    # @return [void]
    def self.release
      handle = @gdbm
      @gdbm = nil
      handle.close if handle && !handle.closed?
    end
    private_class_method :release
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module NameGenderClassifier
  # If no match is found in the database, this module is called to predict the
  # gender based on the first name suffix.
  module FallbackGenderDetector
    # @return [String] the locale prefix used to select the suffix tables
    LOCALE = 'PT_BR'

    # @return [Array<String>] male suffix terminations for pt-br
    PT_BR_MALE_SUFFIXES = %w[ard as el eu ex iz is o on or os ur us rge me pe se re vi].freeze
    # @return [Array<String>] female suffix terminations for pt-br
    PT_BR_FEMALE_SUFFIXES = %w[a ais are ari eis eme ere ese iko ime ire yse ise isse
                               oko uko ume quel bel cao ce de dis le li lis liz lse ne
                               nis nge ris riz sse].freeze

    # Try to guess the gender based on first name suffix.
    #
    # The female table is consulted first, so a name matching both tables
    # (e.g. ending in "quel", which also ends in the male suffix "el") is
    # reported as female.
    #
    # @param name [String, Symbol, nil] first name; it is converted to a
    #   lower-case string before matching
    #
    # @return [String, nil] 'female', 'male', or nil when no suffix matches
    def self.guess_gender(name)
      candidate = name.to_s.downcase

      return 'female' if candidate.end_with?(*const_get("#{LOCALE}_FEMALE_SUFFIXES"))
      return 'male' if candidate.end_with?(*const_get("#{LOCALE}_MALE_SUFFIXES"))

      nil
    end
  end
end
|
@@ -0,0 +1,108 @@
|
|
1
|
+
require 'iconv'
|
2
|
+
|
3
|
+
# Gender detector for first names.
|
4
|
+
module NameGenderClassifier
  # Return the gender(s) (probability) for the informed name(s). The result type will vary depending
  # on the parameter type:
  #
  # [String, Symbol] the gender (String) is returned.
  # [Array<String>] an array (Array<String>) with the genders is returned.
  # [Array<Object>] an array (Array<Object>) with the same objects and the newly assigned genders is returned.
  #
  # An empty array is returned unchanged; any other argument type yields nil.
  #
  # @param arg [String, Symbol, Array<String>, Array<Symbol>, Array<Object>] argument holding first
  #   name(s) information(s).
  # @param options [Hash] first_name_attribute: name of the method that returns the first name,
  #   gender_attribute: name of the method which will receive the gender assignment.
  #
  # @return [String, Array<String>, Array<Object>, nil] the gender classification for the passed first names
  def self.classify(arg, options = {})
    case arg
    when String, Symbol
      most_probable_gender(arg)
    when Array
      # Nothing to classify; avoids the misleading "missing method" message below.
      return arg if arg.empty?

      # Assumes that all elements within the array are of the same type as the first.
      sample = arg.first
      if sample.is_a?(String) || sample.is_a?(Symbol)
        classify_array(arg)
      else
        classify_objects(arg, options)
      end
    end
  end

  # Return the genders (probabilistically) for the informed names.
  #
  # Falsy entries (nil/false) are skipped, so the result can be shorter than
  # the input. Names that cannot be classified contribute a nil entry.
  #
  # @param array [Array<String>, Array<Symbol>] see {NameGenderClassifier.classify}
  #
  # @return [Array<String, nil>] see {NameGenderClassifier.classify}
  def self.classify_array(array)
    result = []
    # A single database handle serves every lookup of this batch.
    DatabaseManager.gdbm do |db|
      array.each do |name|
        result << most_probable_gender(name, db) if name
      end
    end

    result
  end

  # For each object in the array, it tries to classify the gender for object.first_name or
  # object.name (or equivalent method) and save it on object.gender (or equivalent method).
  #
  # A nil (or missing) option falls back to the default method names; this
  # applies to both :first_name_attribute and :gender_attribute.
  #
  # @param objects [Array<Object>] see {NameGenderClassifier.classify}
  # @param options see {NameGenderClassifier.classify}
  #
  # @return [Array<Object>] the objects with the assigned genders
  def self.classify_objects(objects, options = {})
    sample = objects[0]
    first_name_attribute = options[:first_name_attribute] ||
                           %i[first_name name].find { |reader| sample.respond_to?(reader) }

    if first_name_attribute.nil?
      puts 'The object doesn\'t have the methods \'name\' nor \'first_name\'. '\
           'Use #classify(arg, first_name_attribute: nil, gender_attribute: nil) '\
           'to inform which methods to lookup.'

      return objects
    end

    # `|| 'gender'` (rather than fetch with a default) so an explicit nil also
    # selects the default writer instead of producing a method named "=".
    gender_writer = "#{options[:gender_attribute] || 'gender'}="

    DatabaseManager.gdbm do |db|
      objects.each do |object|
        name = object.public_send(first_name_attribute)
        next unless name

        object.public_send(gender_writer, most_probable_gender(name, db))
      end
    end

    objects
  end

  # Remove whitespaces, secondary names, accents, digits and transform to string and lower case.
  #
  # @param name [#to_s] raw name
  #
  # @return [String, nil] the cleaned first name (only the letters a-z, possibly
  #   empty) or nil when the input holds no name at all
  def self.remove_unwanted_chars(name)
    # Enforce the string format, remove white spaces and discard secondary names
    first_name = name.to_s.strip.split.first
    return unless first_name

    # Transform to lower case and transliterate to ASCII
    ascii = Iconv.iconv('ascii//translit//ignore', 'utf-8', first_name.downcase)[0]

    # Keep letters only: \W would let digits and underscores through.
    ascii.downcase.gsub(/[^a-z]+/, '')
  end
  private_class_method :remove_unwanted_chars

  # Look the cleaned name up in the database and fall back to the suffix-based
  # detector when it is not stored.
  #
  # @param name [#to_s] raw first name
  # @param db [#[], nil] an already open database handle; when nil a one-off
  #   lookup through {DatabaseManager.find} is made
  #
  # @return [String, nil] the gender of the informed name
  def self.most_probable_gender(name, db = nil)
    cleaned = remove_unwanted_chars(name)
    # No letters left means there is nothing to look up or guess from.
    return if cleaned.nil? || cleaned.empty?

    fem_probability = db ? db[cleaned]&.to_f : DatabaseManager.find(cleaned)
    return FallbackGenderDetector.guess_gender(cleaned) unless fem_probability

    fem_probability >= 0.5 ? 'female' : 'male'
  end
  private_class_method :most_probable_gender
end
|
106
|
+
|
107
|
+
require 'name_gender_classifier/database_manager'
|
108
|
+
require 'name_gender_classifier/fallback_gender_detector'
|
metadata
ADDED
@@ -0,0 +1,75 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: name_gender_classifier
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Avantsoft
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2022-09-06 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: gdbm
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 2.0.0
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 2.0.0
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: iconv
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 1.0.0
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 1.0.0
|
41
|
+
description: Using primarily IBGE census data [2010], this gem classifies brazilian
|
42
|
+
first names as 'male' or 'female'.
|
43
|
+
email: hello@avantsoft.com.br
|
44
|
+
executables: []
|
45
|
+
extensions: []
|
46
|
+
extra_rdoc_files: []
|
47
|
+
files:
|
48
|
+
- lib/name_gender_classifier.rb
|
49
|
+
- lib/name_gender_classifier/classified_names_pt-br.db
|
50
|
+
- lib/name_gender_classifier/database_manager.rb
|
51
|
+
- lib/name_gender_classifier/fallback_gender_detector.rb
|
52
|
+
homepage: https://rubygems.org/gems/name_gender_classifier
|
53
|
+
licenses:
|
54
|
+
- MIT
|
55
|
+
metadata: {}
|
56
|
+
post_install_message:
|
57
|
+
rdoc_options: []
|
58
|
+
require_paths:
|
59
|
+
- lib
|
60
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
61
|
+
requirements:
|
62
|
+
- - ">="
|
63
|
+
- !ruby/object:Gem::Version
|
64
|
+
version: '0'
|
65
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
66
|
+
requirements:
|
67
|
+
- - ">="
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '0'
|
70
|
+
requirements: []
|
71
|
+
rubygems_version: 3.0.9
|
72
|
+
signing_key:
|
73
|
+
specification_version: 4
|
74
|
+
summary: Gender detection for brazilian first names.
|
75
|
+
test_files: []
|