name_gender_classifier 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 787939c630e3a02792909f4e987711b5ce2e336b515db92f349bed2ebe696309
4
+ data.tar.gz: 1ad33afd9ce5b28cdb1737345b7ba4d4b104b58fd0a85aa69d88cea2339f4403
5
+ SHA512:
6
+ metadata.gz: 54975df68d17a91d44292eac77e6ab4372c64f922ccd9ade1a2dee3ebe9b1a967f3a992d42b5b6367218a5cddbfce1559630b7013607b9189e377b46bc082afa
7
+ data.tar.gz: 52f02d8ae418868dc3f529cf5eaa357470192fb1b549bf8602797576268d7128e60fb99eafc61abc0c163e8b58ba7b87f925875048b517ae3675e3e9bec7af97
@@ -0,0 +1,44 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'gdbm'
4
+
5
module NameGenderClassifier
  # Uses a GDBM database to retrieve gender classification from {DB_NAME}.
  #
  # A single database handle is memoized in the module-level instance variable
  # +@gdbm+; it is opened lazily and released by {find} and by the block form
  # of {gdbm}.
  module DatabaseManager
    # @return [String] the database location (which holds the classified names).
    #   Resolved relative to this file, so it also works when the library is
    #   loaded from a plain path and +Gem.loaded_specs+ has no entry for it.
    DB_NAME = File.join(__dir__, 'classified_names_pt-br.db').freeze

    # Find in the database the value for a previously saved key. The key holds the first name
    # and the value the gender probability. The database handle is always released afterwards,
    # even when the lookup raises.
    #
    # @param key [String, Symbol] a key to be searched in the database
    #
    # @return [Float, nil] the gender probability (value between 0 and 1, where
    #   0 <= male < 0.5 <= female <= 1), or nil when the key is not stored
    def self.find(key)
      gdbm[key.to_s]&.to_f
    ensure
      close_connection
    end

    # With a block { |db| ... } allow to read multiple records with a single database open request,
    # or return the database instance for a single read request.
    #
    # In block form the handle is closed when the block finishes, including when it raises.
    # Without a block the caller receives the open handle and is responsible for closing it.
    #
    # @yield [db] gives the database instance to the block
    # @yieldparam db [GDBM] the open database
    # @return [GDBM, nil] the GDBM database instance or nil if used with a block
    def self.gdbm
      # A handle the caller closed on its own must not be served again.
      @gdbm = nil if @gdbm&.closed?
      @gdbm ||= GDBM.new(DB_NAME)
      return @gdbm unless block_given?

      begin
        yield(@gdbm)
      ensure
        close_connection
      end
      nil
    end

    # Close and forget the memoized handle. Safe to call when nothing is open
    # or when the handle has already been closed elsewhere.
    #
    # @return [void]
    def self.close_connection
      handle = @gdbm
      @gdbm = nil
      handle.close if handle && !handle.closed?
    end
    private_class_method :close_connection
  end
end
@@ -0,0 +1,27 @@
1
+ # frozen_string_literal: true
2
+
3
module NameGenderClassifier
  # If no match is found in the database, this module is called to predict the
  # gender based on the first name suffix.
  module FallbackGenderDetector
    # @return [String] the locale prefix used to pick the suffix tables below
    LOCALE = 'PT_BR'

    # @return [Array<String>] male suffix terminations for pt-br
    PT_BR_MALE_SUFFIXES = %w[ard as el eu ex iz is o on or os ur us rge me pe se re vi].freeze
    # @return [Array<String>] female suffix terminations for pt-br
    PT_BR_FEMALE_SUFFIXES = %w[a ais are ari eis eme ere ese iko ime ire yse ise isse
                               oko uko ume quel bel cao ce de dis le li lis liz lse ne
                               nis nge ris riz sse].freeze

    # Try to guess the gender based on first name suffix.
    #
    # Female suffixes are tested first, so a name matching both tables
    # (e.g. one ending in "iko", which also ends in the male suffix "o")
    # is reported as female. Matching is case-insensitive because the suffix
    # tables are lowercase.
    #
    # @param name [String, Symbol, nil] first name
    #
    # @return [String, nil] 'female', 'male', or nil when no suffix matches
    #   (including nil or empty input)
    def self.guess_gender(name)
      candidate = name.to_s.downcase

      return 'female' if candidate.end_with?(*const_get("#{LOCALE}_FEMALE_SUFFIXES"))
      return 'male' if candidate.end_with?(*const_get("#{LOCALE}_MALE_SUFFIXES"))

      nil
    end
  end
end
@@ -0,0 +1,108 @@
1
+ require 'iconv'
2
+
3
+ # Gender detector for first names.
4
module NameGenderClassifier
  # Return the gender(s) (probability) for the informed name(s). The result type will vary depending
  # on the parameter type:
  #
  # [String, Symbol] the gender (String) is returned.
  # [Array<String>] an array (Array<String>) with the genders is returned.
  # [Array<Object>] an array (Array<Object>) with the same objects and the newly assigned genders is returned.
  #
  # For arrays, the element type is decided from the first non-nil element; all other
  # elements are assumed to be of that same type. An array with no non-nil element is
  # returned untouched. Any other argument type yields nil.
  #
  # @param arg [String, Symbol, Array<String>, Array<Symbol>, Array<Object>] argument holding first
  #   name(s) information(s).
  # @param options [Hash] first_name_attribute: name of the method that returns the first name,
  #   gender_attribute: name of the method which will receive the gender assignment.
  #
  # @return [String, Array<String>, Array<Object>, nil] the gender classification for the passed first names
  def self.classify(arg, options = {})
    case arg
    when String, Symbol
      most_probable_gender(arg)
    when Array
      # Look past leading nils so [nil, 'Maria'] is still treated as a list of names.
      sample = arg.find { |element| !element.nil? }
      return arg if sample.nil?

      if sample.is_a?(String) || sample.is_a?(Symbol)
        classify_array(arg)
      else
        classify_objects(arg, options)
      end
    end
  end

  # Return the genders (probabilistically) for the informed names.
  #
  # nil entries are skipped, so the result can be shorter than the input. Entries that
  # cannot be classified contribute nil.
  #
  # @param array [Array<String>, Array<Symbol>] see {NameGenderClassifier.classify}
  #
  # @return [Array<String>] see {NameGenderClassifier.classify}
  def self.classify_array(array)
    genders = []

    # One database open for the whole batch.
    DatabaseManager.gdbm do |db|
      array.each do |name|
        genders << most_probable_gender(name, db) if name
      end
    end

    genders
  end

  # For each object in the array, it tries to classify the gender for object.first_name or
  # object.name (or equivalent method) and save it on object.gender (or equivalent method).
  #
  # The reader method is taken from options[:first_name_attribute] when given; otherwise it is
  # detected on the first non-nil object. nil objects and objects whose name is nil are skipped.
  # When no reader can be determined, a warning is written to standard error and the objects
  # are returned unchanged.
  #
  # @param objects [Array<Object>] see {NameGenderClassifier.classify}
  # @param options see {NameGenderClassifier.classify}
  #
  # @return [Array<Object>] the objects with the assigned genders
  def self.classify_objects(objects, options = {})
    sample = objects.find { |object| !object.nil? }
    return objects if sample.nil?

    first_name_attribute = options[:first_name_attribute] ||
                           (:first_name if sample.respond_to?(:first_name)) ||
                           (:name if sample.respond_to?(:name))

    if first_name_attribute.nil?
      warn 'The object doesn\'t have the methods \'name\' nor \'first_name\'. '\
           'Use #classify(arg, first_name_attribute: nil, gender_attribute: nil) '\
           'to inform which methods to lookup.'

      return objects
    end

    gender_writer = "#{options.fetch(:gender_attribute, 'gender')}="

    # One database open for the whole batch.
    DatabaseManager.gdbm do |db|
      objects.each do |object|
        next if object.nil?

        name = object.public_send(first_name_attribute)
        next unless name

        object.public_send(gender_writer, most_probable_gender(name, db))
      end
    end

    objects
  end

  # Remove whitespaces, secondary names, accents, digits and transform to string and lower case.
  #
  # @param name [#to_s] raw name, possibly with several words
  #
  # @return [String, nil] the first word reduced to the letters a-z, or nil when nothing is left
  def self.remove_unwanted_chars(name)
    # Enforce the string format, remove white spaces and discard secondary names
    first_name = name.to_s.strip.split(' ').first
    return unless first_name

    # Transform to lower case and transliterate to ASCII
    ascii = Iconv.iconv('ascii//translit//ignore', 'utf-8', first_name.downcase)[0]

    # Keep letters only: \W would let digits and underscores through.
    letters = ascii.gsub(/[^a-z]+/, '')
    letters unless letters.empty?
  end
  private_class_method :remove_unwanted_chars

  # Look the cleaned name up in the database (through +db+ when an open handle is supplied,
  # otherwise through DatabaseManager.find) and fall back to the suffix heuristic when the
  # name is not stored.
  #
  # @param name [#to_s] raw first name
  # @param db [#[], nil] an already open database handle, or nil to open one for this lookup
  #
  # @return [String, nil] the gender of the informed name
  def self.most_probable_gender(name, db = nil)
    cleaned = remove_unwanted_chars(name)
    return unless cleaned

    fem_probability = db ? db[cleaned]&.to_f : DatabaseManager.find(cleaned)
    return FallbackGenderDetector.guess_gender(cleaned) unless fem_probability

    fem_probability >= 0.5 ? 'female' : 'male'
  end
  private_class_method :most_probable_gender
end
106
+
107
+ require 'name_gender_classifier/database_manager'
108
+ require 'name_gender_classifier/fallback_gender_detector'
metadata ADDED
@@ -0,0 +1,75 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: name_gender_classifier
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Avantsoft
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2022-09-06 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: gdbm
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: 2.0.0
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: 2.0.0
27
+ - !ruby/object:Gem::Dependency
28
+ name: iconv
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: 1.0.0
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: 1.0.0
41
+ description: Using primarily IBGE census data [2010], this gem classifies brazilian
42
+ first names as 'male' or 'female'.
43
+ email: hello@avantsoft.com.br
44
+ executables: []
45
+ extensions: []
46
+ extra_rdoc_files: []
47
+ files:
48
+ - lib/name_gender_classifier.rb
49
+ - lib/name_gender_classifier/classified_names_pt-br.db
50
+ - lib/name_gender_classifier/database_manager.rb
51
+ - lib/name_gender_classifier/fallback_gender_detector.rb
52
+ homepage: https://rubygems.org/gems/name_gender_classifier
53
+ licenses:
54
+ - MIT
55
+ metadata: {}
56
+ post_install_message:
57
+ rdoc_options: []
58
+ require_paths:
59
+ - lib
60
+ required_ruby_version: !ruby/object:Gem::Requirement
61
+ requirements:
62
+ - - ">="
63
+ - !ruby/object:Gem::Version
64
+ version: '0'
65
+ required_rubygems_version: !ruby/object:Gem::Requirement
66
+ requirements:
67
+ - - ">="
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ requirements: []
71
+ rubygems_version: 3.0.9
72
+ signing_key:
73
+ specification_version: 4
74
+ summary: Gender detection for brazilian first names.
75
+ test_files: []