name_popularity 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,42 @@
1
+ # frozen_string_literal: true
2
+
3
require "set"

module NamePopularity
  # Cache memoizes, per gender and per threshold, the Set of names whose count
  # meets that threshold, so a dataset is only scanned on the first request
  # for a given (gender, threshold) pair.
  class Cache
    def initialize
      # Layout: { gender_sym => { threshold_int => Set["NAME", ...] } }
      @store = Hash.new { |by_gender, gender| by_gender[gender] = {} }
    end

    # Returns the Set of names meeting the threshold for the gender, building
    # and caching it on first use.
    #
    # @param gender [Symbol] a key accepted by Dataset.for, or :any to use the
    #   summed male + female count of each name
    # @param threshold [Integer, String] minimum count (inclusive); coerced
    #   with Kernel#Integer so 500 and "500" share one cache entry
    # @return [Set<String>] frozen set of qualifying names
    # @raise [ArgumentError, TypeError] if threshold cannot be coerced, or
    #   (ArgumentError, from Dataset.for) if the gender is unknown
    def fetch(gender, threshold)
      thr = Integer(threshold)
      @store[gender][thr] ||= build_set(gender, thr)
    end

    private

    # Collects every name whose count is >= threshold.
    # The result is frozen because the very same object is returned to every
    # caller of #fetch; a mutation by one caller would otherwise leak into
    # all later lookups.
    def build_set(gender, threshold)
      # Dataset#each_row returns an Enumerator when called without a block.
      counts = gender == :any ? combined_counts : Dataset.for(gender).each_row

      counts.each_with_object(Set.new) do |(name, count), names|
        names << name if count >= threshold
      end.freeze
    end

    # Sums the male and female counts per name: { "NAME" => total }.
    def combined_counts
      %i[male female].each_with_object(Hash.new(0)) do |gender, totals|
        Dataset.for(gender).each_row { |name, count| totals[name] += count }
      end
    end
  end
end
@@ -0,0 +1,49 @@
1
+ # frozen_string_literal: true
2
+
3
module NamePopularity
  # Dataset reads one of the packaged TSV files and yields [NAME, COUNT] rows.
  class Dataset
    DATA_DIR = File.expand_path("../../data", __dir__)

    # Source: SCB (Statistics Sweden) 2024
    FILES = {
      male: File.join(DATA_DIR, "male_names.tsv"),
      female: File.join(DATA_DIR, "female_names.tsv")
    }.freeze

    # Read as UTF-8 regardless of the process locale, and drop a leading BOM
    # so it cannot end up glued to the first name.
    FILE_ENCODING = "BOM|UTF-8"

    # Builds the Dataset for one of the packaged files.
    #
    # @param gender [Symbol] a key of FILES (:male or :female)
    # @return [Dataset]
    # @raise [ArgumentError] if gender is not a key of FILES
    def self.for(gender)
      path = FILES[gender]
      raise ArgumentError, "Unknown gender: #{gender.inspect}" unless path

      new(path)
    end

    # @param path [String] path of a tab-separated "NAME<TAB>COUNT" file
    def initialize(path)
      @path = path
    end

    # Iterates rows as [NAME, COUNT(Integer)].
    # Blank lines and lines without a tab-separated count are skipped.
    #
    # @yieldparam name [String] first column, as written in the file
    # @yieldparam count [Integer] second column parsed by #parse_count
    # @return [Enumerator] when called without a block
    def each_row
      return enum_for(:each_row) unless block_given?

      File.foreach(@path, encoding: FILE_ENCODING) do |line|
        # A blank line splits into [], so it falls through the same guard as
        # a line that has no count column.
        name, count_txt = line.strip.split("\t", 2)
        next unless name && count_txt

        yield(name, parse_count(count_txt))
      end
    end

    private

    # Handles thousands separators written as any whitespace (plain space,
    # no-break space, narrow no-break space, ...): "1 234" -> 1234.
    # Uses String#to_i, so text with no leading digits becomes 0.
    def parse_count(text)
      text.gsub(/[[:space:]]/, "").to_i
    end
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
module NamePopularity
  # Release version of the gem.
  VERSION = "0.1.0"
end
@@ -0,0 +1,32 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "name_popularity/version"
4
+ require_relative "name_popularity/cache"
5
+ require_relative "name_popularity/dataset"
6
+
7
module NamePopularity
  DEFAULT_THRESHOLD = 500

  class << self
    # Tells whether a name's combined male + female count reaches `threshold`.
    #
    # @param name [#to_s] name in any case; surrounding whitespace is stripped
    #   and the rest upcased before the lookup
    # @param threshold [Integer] minimum combined count, default 500
    # @param cache [#fetch] object answering fetch(:any, threshold) with a
    #   collection of names; defaults to a shared NamePopularity::Cache
    # @return [Boolean]
    def popular_name?(name, threshold: DEFAULT_THRESHOLD, cache: default_cache)
      # Normalize before touching the cache, then look the key up in the
      # names that qualify at this threshold.
      key = normalize(name)
      cache.fetch(:any, threshold).include?(key)
    end

    private

    # Canonical lookup form of a name: stringified, stripped, upcased.
    def normalize(name)
      text = name.to_s
      text.strip.upcase
    end

    # Lazily created cache shared by all calls that do not pass their own.
    def default_cache
      @default_cache ||= NamePopularity::Cache.new
    end
  end
end
metadata ADDED
@@ -0,0 +1,79 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: name_popularity
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Joel E. Svensson
8
+ bindir: bin
9
+ cert_chain: []
10
+ date: 1980-01-02 00:00:00.000000000 Z
11
+ dependencies:
12
+ - !ruby/object:Gem::Dependency
13
+ name: rake
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - "~>"
17
+ - !ruby/object:Gem::Version
18
+ version: '13'
19
+ type: :development
20
+ prerelease: false
21
+ version_requirements: !ruby/object:Gem::Requirement
22
+ requirements:
23
+ - - "~>"
24
+ - !ruby/object:Gem::Version
25
+ version: '13'
26
+ - !ruby/object:Gem::Dependency
27
+ name: rspec
28
+ requirement: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - "~>"
31
+ - !ruby/object:Gem::Version
32
+ version: '3.12'
33
+ type: :development
34
+ prerelease: false
35
+ version_requirements: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - "~>"
38
+ - !ruby/object:Gem::Version
39
+ version: '3.12'
40
+ description: Provides popular_name? and gender inference using compressed, threshold-keyed
41
+ caches backed by TSV datasets for male and female names.
42
+ email:
43
+ - joel.e.svensson@skiftet.org
44
+ executables: []
45
+ extensions: []
46
+ extra_rdoc_files: []
47
+ files:
48
+ - LICENSE
49
+ - README.md
50
+ - data/female_names.tsv
51
+ - data/male_names.tsv
52
+ - lib/name_popularity.rb
53
+ - lib/name_popularity/cache.rb
54
+ - lib/name_popularity/dataset.rb
55
+ - lib/name_popularity/version.rb
56
+ homepage: https://rubygems.org/gems/name_popularity
57
+ licenses:
58
+ - GPL-3.0-only
59
+ metadata:
60
+ allowed_push_host: https://rubygems.org
61
+ homepage_uri: https://rubygems.org/gems/name_popularity
62
+ rdoc_options: []
63
+ require_paths:
64
+ - lib
65
+ required_ruby_version: !ruby/object:Gem::Requirement
66
+ requirements:
67
+ - - ">="
68
+ - !ruby/object:Gem::Version
69
+ version: '3.1'
70
+ required_rubygems_version: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - ">="
73
+ - !ruby/object:Gem::Version
74
+ version: '0'
75
+ requirements: []
76
+ rubygems_version: 3.6.9
77
+ specification_version: 4
78
+ summary: Fast lookups for name popularity and likely gender from TSV datasets
79
+ test_files: []