name_gender_classifier 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 787939c630e3a02792909f4e987711b5ce2e336b515db92f349bed2ebe696309
4
+ data.tar.gz: 1ad33afd9ce5b28cdb1737345b7ba4d4b104b58fd0a85aa69d88cea2339f4403
5
+ SHA512:
6
+ metadata.gz: 54975df68d17a91d44292eac77e6ab4372c64f922ccd9ade1a2dee3ebe9b1a967f3a992d42b5b6367218a5cddbfce1559630b7013607b9189e377b46bc082afa
7
+ data.tar.gz: 52f02d8ae418868dc3f529cf5eaa357470192fb1b549bf8602797576268d7128e60fb99eafc61abc0c163e8b58ba7b87f925875048b517ae3675e3e9bec7af97
@@ -0,0 +1,44 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'gdbm'
4
+
5
module NameGenderClassifier
  # Uses a GDBM database to retrieve gender classifications from {DB_NAME}.
  #
  # Keys are first names and values hold the gender probability serialized as
  # a String (see {DatabaseManager.find}).
  module DatabaseManager
    # Resolved relative to this file (the database ships in the same directory)
    # so the library also loads when the gem was not activated through RubyGems,
    # a situation in which `Gem.loaded_specs['name_gender_classifier']` is nil.
    #
    # @return [String] the database location (which holds the classified names)
    DB_NAME = File.join(__dir__, 'classified_names_pt-br.db')

    # Find in the database the value for a previously saved key. The key holds the first name
    # and the value the gender probability.
    #
    # The database is opened for this single lookup and always closed afterwards,
    # even when the lookup raises.
    #
    # @param key [String, Symbol] a key to be searched in the database
    #
    # @return [Float, nil] the gender probability (value between 0 and 1, where
    #   0 <= male < 0.5 <= female <= 1) or nil when the key is not in the database
    def self.find(key)
      gdbm[key.to_s]&.to_f
    ensure
      close_database
    end

    # With a block { |db| ... } allow to read multiple records with a single database open request,
    # or return the database instance for a single read request.
    #
    # In the block form the database is closed when the block finishes, whether
    # it returns normally or raises. In the non-block form the caller receives
    # the open (memoized) instance.
    #
    # @yield [db] gives the database instance to the block
    # @return [GDBM, nil] the GDBM database instance or nil if used with a block
    def self.gdbm
      @gdbm ||= GDBM.new(DB_NAME)
      return @gdbm unless block_given?

      begin
        yield(@gdbm)
      ensure
        close_database
      end

      nil
    end

    # Close the memoized database handle (if any) and forget it, so the next
    # access opens a fresh one.
    #
    # @return [nil]
    def self.close_database
      @gdbm&.close
    ensure
      # Drop the reference even if closing failed; a closed handle must not be reused.
      @gdbm = nil
    end
    private_class_method :close_database
  end
end
@@ -0,0 +1,27 @@
1
+ # frozen_string_literal: true
2
+
3
+ module NameGenderClassifier
4
+ # If no match is found in the database, this module is called to predict the
5
+ # gender based on the first name suffix.
6
+ module FallbackGenderDetector
7
+ # @return [String] the locale
8
+ LOCALE = 'PT_BR'
9
+
10
+ # @return [String] male suffix terminations for pt-br
11
+ PT_BR_MALE_SUFFIXES = %w[ard as el eu ex iz is o on or os ur us rge me pe se re vi].freeze
12
+ # @return [String] female suffix terminations for pt-br
13
+ PT_BR_FEMALE_SUFFIXES = %w[a ais are ari eis eme ere ese iko ime ire yse ise isse
14
+ oko uko ume quel bel cao ce de dis le li lis liz lse ne
15
+ nis nge ris riz sse].freeze
16
+
17
+ # Try to guess the gender based on first name suffix.
18
+ #
19
+ # @param name [String] first name
20
+ #
21
+ # @return [String] the gender
22
+ def self.guess_gender(name)
23
+ return 'female' if const_get("#{LOCALE}_FEMALE_SUFFIXES").any? { |t| name.end_with?(t) }
24
+ return 'male' if const_get("#{LOCALE}_MALE_SUFFIXES").any? { |t| name.end_with?(t) }
25
+ end
26
+ end
27
+ end
@@ -0,0 +1,108 @@
1
+ require 'iconv'
2
+
3
# Gender detector for first names.
module NameGenderClassifier
  # Return the gender(s) (probability) for the informed name(s). The result type will vary depending
  # on the parameter type:
  #
  # [String, Symbol] the gender (String) is returned.
  # [Array<String>] an array (Array<String>) with the genders is returned.
  # [Array<Object>] an array (Array<Object>) with the same objects and the newly assigned genders is returned.
  #
  # For arrays, the first non-nil element decides whether the array is treated
  # as names or as objects. Any other argument type yields nil.
  #
  # @param arg [String, Symbol, Array<String>, Array<Symbol>, Array<Object>] argument holding first
  #   name(s) information(s).
  # @param options [Hash] first_name_attribute: name of the method that returns the first name,
  #   gender_attribute: name of the method which will receive the gender assignment.
  #
  # @return [String, Array<String>, Array<Object>, nil] the gender classification for the passed first names
  def self.classify(arg, options = {})
    case arg
    when String, Symbol
      most_probable_gender(arg)
    when Array
      # Assumes that all elements within the array are of the same type as the
      # first non-nil one (leading nils must not force the object code path).
      sample = arg.find { |item| !item.nil? }
      return classify_array(arg) if sample.is_a?(String) || sample.is_a?(Symbol)

      classify_objects(arg, options)
    end
  end

  # Return the genders (probabilistically) for the informed names.
  #
  # Falsy entries (nil/false) are skipped, so the result can be shorter than
  # the input. Names that cannot be classified contribute a nil entry.
  #
  # @param array [Array<String>, Array<Symbol>] see {NameGenderClassifier.classify}
  #
  # @return [Array<String, nil>] see {NameGenderClassifier.classify}
  def self.classify_array(array)
    names = array.select { |name| name }
    genders = []

    # A single database open request serves every lookup.
    DatabaseManager.gdbm do |db|
      genders = names.map { |name| most_probable_gender(name, db) }
    end

    genders
  end

  # For each object in the array, it tries to classify the gender for object.first_name or
  # object.name (or equivalent method) and save it on object.gender (or equivalent method).
  #
  # Nil entries and objects whose first name is nil/false are left untouched.
  # When no first-name method can be determined, a hint is printed and the
  # objects are returned unchanged.
  #
  # @param objects [Array<Object>] see {NameGenderClassifier.classify}
  # @param options see {NameGenderClassifier.classify}
  #
  # @return [Array<Object>] the objects with the assigned genders
  def self.classify_objects(objects, options = {})
    # Probe the first real object; an empty (or all-nil) array has nothing to classify.
    sample = objects.find { |object| !object.nil? }
    return objects if sample.nil?

    first_name_attribute = options[:first_name_attribute] ||
                           %i[first_name name].find { |attribute| sample.respond_to?(attribute) }

    if first_name_attribute.nil?
      puts 'The object doesn\'t have the methods \'name\' nor \'first_name\'. '\
           'Use #classify(arg, first_name_attribute: nil, gender_attribute: nil) '\
           'to inform which methods to lookup.'

      return objects
    end

    gender_writer = "#{options.fetch(:gender_attribute, 'gender')}="

    DatabaseManager.gdbm do |db|
      objects.each do |object|
        next if object.nil?

        name = object.public_send(first_name_attribute)
        next unless name

        object.public_send(gender_writer, most_probable_gender(name, db))
      end
    end

    objects
  end

  # Remove whitespaces, secondary names, accents, digits and transform to string and lower case.
  #
  # @param name [#to_s] the raw name
  #
  # @return [String, nil] the normalized first name, or nil when nothing is left
  def self.remove_unwanted_chars(name)
    # Enforce the string format, remove white spaces and discard secondary names
    first_name = name.to_s.split.first
    return unless first_name

    # Transform to lower case and transliterate accented characters to ASCII
    ascii = Iconv.iconv('ascii//translit//ignore', 'utf-8', first_name.downcase).first

    # Keep letters only: \W would let digits and underscores through.
    letters = ascii.gsub(/[^a-z]/i, '')
    letters unless letters.empty?
  end
  private_class_method :remove_unwanted_chars

  # Look the name up in the database and fall back to suffix detection when it
  # is not found.
  #
  # @param name [#to_s] the raw first name
  # @param db [#[], nil] an already opened database; when nil a one-off lookup
  #   is made through {DatabaseManager.find}
  #
  # @return [String, nil] the gender of the informed name
  def self.most_probable_gender(name, db = nil)
    normalized = remove_unwanted_chars(name)
    return unless normalized

    fem_probability = db ? db[normalized]&.to_f : DatabaseManager.find(normalized)
    return FallbackGenderDetector.guess_gender(normalized) unless fem_probability

    fem_probability >= 0.5 ? 'female' : 'male'
  end
  private_class_method :most_probable_gender
end
106
+
107
+ require 'name_gender_classifier/database_manager'
108
+ require 'name_gender_classifier/fallback_gender_detector'
metadata ADDED
@@ -0,0 +1,75 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: name_gender_classifier
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Avantsoft
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2022-09-06 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: gdbm
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: 2.0.0
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: 2.0.0
27
+ - !ruby/object:Gem::Dependency
28
+ name: iconv
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: 1.0.0
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: 1.0.0
41
+ description: Using primarily IBGE census data [2010], this gem classifies Brazilian
42
+ first names as 'male' or 'female'.
43
+ email: hello@avantsoft.com.br
44
+ executables: []
45
+ extensions: []
46
+ extra_rdoc_files: []
47
+ files:
48
+ - lib/name_gender_classifier.rb
49
+ - lib/name_gender_classifier/classified_names_pt-br.db
50
+ - lib/name_gender_classifier/database_manager.rb
51
+ - lib/name_gender_classifier/fallback_gender_detector.rb
52
+ homepage: https://rubygems.org/gems/name_gender_classifier
53
+ licenses:
54
+ - MIT
55
+ metadata: {}
56
+ post_install_message:
57
+ rdoc_options: []
58
+ require_paths:
59
+ - lib
60
+ required_ruby_version: !ruby/object:Gem::Requirement
61
+ requirements:
62
+ - - ">="
63
+ - !ruby/object:Gem::Version
64
+ version: '0'
65
+ required_rubygems_version: !ruby/object:Gem::Requirement
66
+ requirements:
67
+ - - ">="
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ requirements: []
71
+ rubygems_version: 3.0.9
72
+ signing_key:
73
+ specification_version: 4
74
+ summary: Gender detection for Brazilian first names.
75
+ test_files: []