name_popularity 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +21 -0
- data/README.md +42 -0
- data/data/female_names.tsv +91244 -0
- data/data/male_names.tsv +79125 -0
- data/lib/name_popularity/cache.rb +42 -0
- data/lib/name_popularity/dataset.rb +49 -0
- data/lib/name_popularity/version.rb +5 -0
- data/lib/name_popularity.rb +32 -0
- metadata +79 -0
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "set"

module NamePopularity
  # Cache maintains threshold-specific sets of popular names per gender.
  #
  # A set is built from Dataset the first time a (gender, threshold) pair is
  # requested and is memoized afterwards. Cached sets are frozen so that a
  # caller cannot accidentally mutate a result that later lookups share.
  class Cache
    # Genders whose counts are summed when :any is requested.
    COMBINED_GENDERS = %i[male female].freeze
    private_constant :COMBINED_GENDERS

    def initialize
      # structure: { gender_sym => { threshold_int => Set("NAME", ...) } }
      # A plain Hash (no default block) so that a failed lookup for an
      # unknown gender does not leave an empty entry behind.
      @store = {}
    end

    # Returns the Set of names meeting the threshold for the gender, building
    # and caching it on first use.
    #
    # @param gender [Symbol] :male, :female, or :any (male + female counts summed)
    # @param threshold [Integer, #to_int, String] coerced with Integer()
    # @return [Set<String>] frozen set of names whose count is >= threshold
    # @raise [ArgumentError, TypeError] if threshold cannot be coerced by Integer()
    # @raise [ArgumentError] propagated from Dataset.for for an unknown gender
    def fetch(gender, threshold)
      thr = Integer(threshold)
      cached = @store.dig(gender, thr)
      return cached if cached

      # Build first, store second: if building raises, nothing is cached.
      names = build_set(gender, thr).freeze
      (@store[gender] ||= {})[thr] = names
    end

    private

    # Dispatches to the combined or the single-gender builder.
    def build_set(gender, threshold)
      gender == :any ? combined_names(threshold) : names_for(gender, threshold)
    end

    # Names whose male + female counts, summed over every row, reach threshold.
    def combined_names(threshold)
      totals = Hash.new(0)
      COMBINED_GENDERS.each do |gender|
        Dataset.for(gender).each_row { |name, count| totals[name] += count }
      end

      totals.each_with_object(Set.new) do |(name, total), popular|
        popular << name if total >= threshold
      end
    end

    # Names having at least one row in the gender's dataset whose count
    # reaches threshold (rows are compared individually, not summed).
    def names_for(gender, threshold)
      popular = Set.new
      Dataset.for(gender).each_row do |name, count|
        popular << name if count >= threshold
      end
      popular
    end
  end
end
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module NamePopularity
  # Dataset reads one of the packaged TSV files and yields [NAME, COUNT].
  #
  # Every data row is expected to be "NAME<TAB>COUNT". Blank lines and lines
  # without a TAB-separated second field are skipped.
  class Dataset
    DATA_DIR = File.expand_path("../../data", __dir__)

    # Source: SCB (Statistics Sweden) 2024
    FILES = {
      male: File.join(DATA_DIR, "male_names.tsv"),
      female: File.join(DATA_DIR, "female_names.tsv")
    }.freeze

    # Read explicitly as UTF-8 (dropping a leading BOM if present) so that
    # non-ASCII names compare equal to UTF-8 input regardless of the process
    # locale / Encoding.default_external.
    FILE_ENCODING = "BOM|UTF-8"

    # Any Unicode space separator: regular space, no-break space, thin space,
    # narrow no-break space, ... TAB is deliberately NOT included, so extra
    # TAB-separated columns after the count do not get glued onto the number.
    DIGIT_GROUP_SPACES = /\p{Zs}/

    private_constant :FILE_ENCODING, :DIGIT_GROUP_SPACES

    # Builds a Dataset for a known gender.
    #
    # @param gender [Symbol] :male or :female
    # @return [Dataset]
    # @raise [ArgumentError] if gender has no entry in FILES
    def self.for(gender)
      path = FILES[gender]
      raise ArgumentError, "Unknown gender: #{gender.inspect}" unless path

      new(path)
    end

    # @param path [String] path of the TSV file to read
    def initialize(path)
      @path = path
    end

    # Iterates rows as [NAME, COUNT(Integer)].
    #
    # @yieldparam name [String] first TAB-separated field of the row
    # @yieldparam count [Integer] parsed second field (see #parse_count)
    # @return [Enumerator] when called without a block
    def each_row
      return enum_for(:each_row) unless block_given?

      File.foreach(@path, encoding: FILE_ENCODING) do |line|
        line = line.strip
        next if line.empty?

        name, count_txt = line.split("\t", 2)
        next unless name && count_txt

        yield(name, parse_count(count_txt))
      end
    end

    private

    # Handles space-grouped integers: "1 234" -> 1234, for any Unicode space
    # separator used as the thousands separator. Parsing is lenient
    # (String#to_i): text that does not start with digits yields 0.
    def parse_count(text)
      text.gsub(DIGIT_GROUP_SPACES, "").to_i
    end
  end
end
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "name_popularity/version"
|
|
4
|
+
require_relative "name_popularity/cache"
|
|
5
|
+
require_relative "name_popularity/dataset"
|
|
6
|
+
|
|
7
|
+
module NamePopularity
  DEFAULT_THRESHOLD = 500

  class << self
    # Determine if a name has at least `threshold` occurrences, using the
    # :any set of the cache (male + female counts summed).
    #
    # @param name [#to_s] name in any case; surrounding whitespace is stripped
    #   and the result is upcased before the lookup
    # @param threshold [Integer] minimum count, default DEFAULT_THRESHOLD (500)
    # @param cache [NamePopularity::Cache, nil] object responding to
    #   fetch(gender, threshold); nil falls back to the shared default cache
    # @return [Boolean] true if the normalized name is in the fetched set
    def popular_name?(name, threshold: DEFAULT_THRESHOLD, cache: default_cache)
      norm = normalize(name)
      # An explicit `cache: nil` (e.g. forwarded from an options hash) means
      # "no custom cache", not "call fetch on nil".
      popular_names = (cache || default_cache).fetch(:any, threshold)
      popular_names.include?(norm)
    end

    private

    # Canonical lookup form: String, stripped, upcased.
    def normalize(name)
      name.to_s.strip.upcase
    end

    # Lazily created cache shared by all calls that do not pass their own.
    def default_cache
      @default_cache ||= NamePopularity::Cache.new
    end
  end
end
|
metadata
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: name_popularity
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.1.0
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- Joel E. Svensson
|
|
8
|
+
bindir: bin
|
|
9
|
+
cert_chain: []
|
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
|
11
|
+
dependencies:
|
|
12
|
+
- !ruby/object:Gem::Dependency
|
|
13
|
+
name: rake
|
|
14
|
+
requirement: !ruby/object:Gem::Requirement
|
|
15
|
+
requirements:
|
|
16
|
+
- - "~>"
|
|
17
|
+
- !ruby/object:Gem::Version
|
|
18
|
+
version: '13'
|
|
19
|
+
type: :development
|
|
20
|
+
prerelease: false
|
|
21
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
22
|
+
requirements:
|
|
23
|
+
- - "~>"
|
|
24
|
+
- !ruby/object:Gem::Version
|
|
25
|
+
version: '13'
|
|
26
|
+
- !ruby/object:Gem::Dependency
|
|
27
|
+
name: rspec
|
|
28
|
+
requirement: !ruby/object:Gem::Requirement
|
|
29
|
+
requirements:
|
|
30
|
+
- - "~>"
|
|
31
|
+
- !ruby/object:Gem::Version
|
|
32
|
+
version: '3.12'
|
|
33
|
+
type: :development
|
|
34
|
+
prerelease: false
|
|
35
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
36
|
+
requirements:
|
|
37
|
+
- - "~>"
|
|
38
|
+
- !ruby/object:Gem::Version
|
|
39
|
+
version: '3.12'
|
|
40
|
+
description: Provides popular_name? and gender inference using compressed, threshold-keyed
|
|
41
|
+
caches backed by TSV datasets for male and female names.
|
|
42
|
+
email:
|
|
43
|
+
- joel.e.svensson@skiftet.org
|
|
44
|
+
executables: []
|
|
45
|
+
extensions: []
|
|
46
|
+
extra_rdoc_files: []
|
|
47
|
+
files:
|
|
48
|
+
- LICENSE
|
|
49
|
+
- README.md
|
|
50
|
+
- data/female_names.tsv
|
|
51
|
+
- data/male_names.tsv
|
|
52
|
+
- lib/name_popularity.rb
|
|
53
|
+
- lib/name_popularity/cache.rb
|
|
54
|
+
- lib/name_popularity/dataset.rb
|
|
55
|
+
- lib/name_popularity/version.rb
|
|
56
|
+
homepage: https://rubygems.org/gems/name_popularity
|
|
57
|
+
licenses:
|
|
58
|
+
- GPL-3.0-only
|
|
59
|
+
metadata:
|
|
60
|
+
allowed_push_host: https://rubygems.org
|
|
61
|
+
homepage_uri: https://rubygems.org/gems/name_popularity
|
|
62
|
+
rdoc_options: []
|
|
63
|
+
require_paths:
|
|
64
|
+
- lib
|
|
65
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
66
|
+
requirements:
|
|
67
|
+
- - ">="
|
|
68
|
+
- !ruby/object:Gem::Version
|
|
69
|
+
version: '3.1'
|
|
70
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
71
|
+
requirements:
|
|
72
|
+
- - ">="
|
|
73
|
+
- !ruby/object:Gem::Version
|
|
74
|
+
version: '0'
|
|
75
|
+
requirements: []
|
|
76
|
+
rubygems_version: 3.6.9
|
|
77
|
+
specification_version: 4
|
|
78
|
+
summary: Fast lookups for name popularity and likely gender from TSV datasets
|
|
79
|
+
test_files: []
|