iriq 0.0.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,322 @@
1
+ require "json"
2
+
3
+ module Iriq
4
+ # Streaming-friendly observer over a (potentially unbounded) corpus of IRIs.
5
+ # Maintains rolling aggregates and per-(host, prefix) frequency stats so
6
+ # that classification can improve as more data flows in.
7
+ #
8
+ # The deterministic, single-IRI API (Iriq.normalize/explain) is unchanged —
9
+ # Corpus#normalize and Corpus#explain are the corpus-informed variants.
10
+ #
11
+ # State lives in a Storage backend (Memory by default; Json or Sqlite when
12
+ # opened against a file). The classification logic on top is identical
13
+ # regardless of where the counters live.
14
+ class Corpus
15
+ # Type-based: position is "mostly variable" (UUIDs/integers/etc.).
16
+ VARIABLE_DOMINANCE_THRESHOLD = 0.8
17
+
18
+ # Cardinality-based: position has mostly distinct literal values, so the
19
+ # literal "type" is misleading — it's really a variable slot. We trigger
20
+ # on either:
21
+ # - very high cardinality fraction (most observations are singletons), OR
22
+ # - moderate cardinality fraction AND high absolute distinct count
23
+ # The second branch catches realistic streams where popular outliers
24
+ # bring the frac down but the long tail is clearly variable.
25
+ LITERAL_UNIQUENESS_THRESHOLD = 0.8
26
+ LITERAL_UNIQUENESS_MODERATE_THRESHOLD = 0.5
27
+ MIN_CARDINALITY_FOR_INFERENCE = 20
28
+
29
+ # Don't apply corpus heuristics until we have at least this many
30
+ # observations at a position — too easy to be wrong with tiny samples.
31
+ MIN_OBSERVATIONS_FOR_INFERENCE = 5
32
+
33
+ # Value-fraction at or above which a literal is considered the stable
34
+ # occupant of its position.
35
+ STABLE_LITERAL_THRESHOLD = 0.5
36
+
37
+ # Within a high-cardinality literal position (mostly singletons), a
38
+ # specific value qualifies as a "popular outlier" — and gets preserved
39
+ # as :stable_literal instead of being lumped into :corpus_inferred_variable
40
+ # — when its count is at least POPULAR_MIN_COUNT and its frequency is at
41
+ # least POPULAR_BASELINE_MULTIPLE × the uniform baseline (1/cardinality).
42
+ POPULAR_MIN_COUNT = 5
43
+ POPULAR_BASELINE_MULTIPLE = 3
44
+
45
+ attr_reader :storage
46
+
47
# Build a corpus on top of a storage backend.
#
# classifier:              kept for later classification and passed to the
#                          default backend.
# max_values_per_position: only used when the default Storage::Memory backend
#                          is created here; it is not applied to a backend
#                          supplied through `storage:`.
# storage:                 backend to use; when nil/false a Storage::Memory is
#                          created.
def initialize(classifier: SegmentClassifier::DEFAULT,
               max_values_per_position: PositionStats::DEFAULT_MAX_VALUES,
               storage: nil)
  @classifier = classifier
  @storage = storage
  return if @storage

  @storage = Storage::Memory.new(
    classifier: classifier,
    max_values_per_position: max_values_per_position,
  )
end
56
+
57
+ # Open a corpus against `path`. File extension picks the backend:
58
+ # `.db`/`.sqlite`/`.sqlite3` use SQLite (incremental writes); anything
59
+ # else uses JSON.
60
def self.open(path, classifier: SegmentClassifier::DEFAULT,
              max_values_per_position: PositionStats::DEFAULT_MAX_VALUES)
  # Backend selection is delegated entirely to Storage.open; the resulting
  # backend is then wrapped in a corpus that shares the same classifier.
  backend = Storage.open(
    path,
    classifier: classifier,
    max_values_per_position: max_values_per_position,
  )
  new(storage: backend, classifier: classifier)
end
67
+
68
+ # Observe a single IRI. Returns an Observation.
69
def observe(input)
  iri = coerce(input)
  entries = SegmentHints.derive(iri.path_segments, @classifier)
  raw_shape = PathShape.new(classifier: @classifier, hints: false).from_entries(entries)
  hinted_shape = PathShape.new(classifier: @classifier, hints: true).from_entries(entries)

  # Assigned inside the transaction block; the block's own return value is
  # not relied upon.
  cluster = nil
  @storage.transaction do |store|
    # Corpus-wide rolling aggregates.
    store.increment_host(iri.host)
    store.increment_path_length(iri.path_segments.size)
    store.increment_raw_shape(raw_shape)
    store.increment_fingerprint(hinted_shape)

    # Per-position stats are keyed by host plus the placeholder-ised prefix
    # of everything to the left of the segment being recorded.
    entries.reduce("") do |prefix, entry|
      store.observe_position(iri.host, prefix, entry[:value], entry[:type])
      "#{prefix}/#{placeholder(entry)}"
    end

    cluster_key, cluster_host, cluster_scheme, cluster_shape =
      Cluster.key_for(iri, classifier: @classifier, shape: hinted_shape)
    cluster = store.add_to_cluster(cluster_key, cluster_host, cluster_scheme, cluster_shape, iri)
  end

  Observation.new(corpus: self, identifier: iri, cluster: cluster)
end
94
+
95
+ # Corpus-informed normalization. Falls back to mechanical normalization
96
+ # when the corpus has no signal for a position.
97
def normalize(input)
  iri = coerce(input)
  # URNs and path-less identifiers carry no per-position signal, so they go
  # through the mechanical normalizer unchanged.
  if iri.urn? || iri.path_segments.empty?
    return Normalizer.normalize_identifier(iri)
  end

  path = annotate_segments(iri).map { |segment| corpus_token(segment) }.join("/")

  # Only scheme, host, port and the tokenised path are emitted here.
  authority = [
    (iri.scheme ? "#{iri.scheme}://" : nil),
    (iri.host || nil),
    (iri.port ? ":#{iri.port}" : nil),
  ].compact.join

  "#{authority}/#{path}"
end
109
+
110
+ # Per-segment explanation with corpus-informed `classification`.
111
+ # Returns an array of entries shaped like the Explanation rows plus
112
+ # `classification:` ∈ :stable_literal, :variable_identifier,
113
+ # :rare_literal, :ambiguous, :corpus_inferred_variable.
114
def explain(input)
  # :prefix is internal bookkeeping from annotate_segments and is dropped
  # from the rows handed back to the caller.
  annotate_segments(coerce(input)).map do |row|
    row.select { |key, _value| key != :prefix }
  end
end
120
+
121
# Aggregate counters, read straight from the storage backend.
def host_counts
  @storage.host_counts
end

def path_length_counts
  @storage.path_length_counts
end

def raw_shape_counts
  @storage.raw_shape_counts
end

def fingerprint_counts
  @storage.fingerprint_counts
end

# Yields each observed position's stats via the backend's own iterator.
# Intended for inspection tooling rather than the hot path.
def each_position_stats(&block)
  @storage.each_position_stats(&block)
end

# Clusters as reported by the backend.
def clusters
  @storage.clusters
end

# Backend's cluster_size.
def size
  @storage.cluster_size
end

# Position stats for one (host, prefix) pair; handy in tests and when
# debugging. Whatever the backend returns for an unobserved pair is passed
# through unchanged.
def stats_for(host, prefix)
  @storage.position_stats(host, prefix)
end
145
+
146
+ # Persist the corpus.
147
+ #
148
+ # save() → flush the backend in place (JSON writes its file,
149
+ # SQLite is already on disk).
150
+ # save(same_path) → same as save() — idempotent for the backend's path.
151
+ # save(other_path) → export to other_path as JSON, regardless of the
152
+ # live backend.
153
def save(path = nil)
  backend_path = @storage.respond_to?(:path) ? @storage.path : nil

  # Compare expanded paths, not raw strings: "./corpus.db" and "corpus.db"
  # name the same file, and treating them as different would send the call
  # down the export branch and write a JSON dump over the live backend file.
  in_place = path.nil? ||
             path == backend_path ||
             (!backend_path.nil? &&
               File.expand_path(path.to_s) == File.expand_path(backend_path.to_s))

  in_place ? @storage.save : write_json_dump(path)
end
161
+
162
# Closes the storage backend.
def close
  @storage.close
end

# Runs the given block through the backend's `batch`, so a bulk ingest can
# be grouped however that backend chooses to group it.
def batch(&block)
  @storage.batch(&block)
end
172
+
173
+ private
174
+
175
# Accepts an already-parsed Identifier as-is; anything else goes to the parser.
def coerce(input)
  return input if input.is_a?(Identifier)

  Parser.parse(input)
end
178
+
179
# Returns one hash per path segment: the hinted entry from SegmentHints plus
# the :prefix it was looked up under and its corpus :classification.
def annotate_segments(iri)
  prefix = ""
  SegmentHints.derive(iri.path_segments, @classifier).map do |entry|
    position = @storage.position_stats(iri.host, prefix)
    annotated = entry.merge(prefix: prefix, classification: classify(entry, position))
    # The next segment's lookup key extends the prefix with this segment's
    # placeholder form, mirroring how prefixes are built during observe.
    prefix = "#{prefix}/#{placeholder(entry)}"
    annotated
  end
end
192
+
193
# Prefix token for a segment: the literal value for non-variable entries,
# otherwise "{hint}" (or "{type}" when the entry has no hint).
def placeholder(entry)
  if entry[:variable]
    label = entry[:hint] || entry[:type]
    "{#{label}}"
  else
    entry[:value]
  end
end
198
+
199
# Corpus-informed classification for one segment entry.
#
# entry: hinted segment hash (reads :variable and :value).
# stats: position stats for the segment's (host, prefix), or nil.
#
# Returns one of :variable_identifier, :ambiguous, :rare_literal,
# :stable_literal, :corpus_inferred_variable.
def classify(entry, stats)
  # The classifier's own verdict wins for variable-typed segments.
  return :variable_identifier if entry[:variable]
  # Nothing observed at this position: no basis for a corpus judgement.
  return :ambiguous if stats.nil? || stats.total.zero?

  literal = entry[:value]
  observations = stats.total
  variable_share = stats.variable_fraction(@classifier)
  distinct_share = stats.cardinality.to_f / observations
  sufficient = observations >= MIN_OBSERVATIONS_FOR_INFERENCE
  literal_share = stats.value_fraction(literal)

  if sufficient && variable_share >= VARIABLE_DOMINANCE_THRESHOLD
    # Slot is dominated by variable types; a literal here is a special-case
    # outlier (e.g. /users/me) if it has been seen, otherwise unknown.
    return stats.value_counts.key?(literal) ? :rare_literal : :ambiguous
  end

  # This exact value dominates the slot: keep it however diverse the rest is.
  return :stable_literal if literal_share >= STABLE_LITERAL_THRESHOLD

  if sufficient && high_cardinality_literal_position?(stats, distinct_share)
    # Mostly-distinct literal slot: treated as a variable unless this value
    # stands far above the uniform baseline.
    return popular_outlier?(stats, literal) ? :stable_literal : :corpus_inferred_variable
  end

  return :stable_literal if stats.cardinality == 1

  stats.value_counts.key?(literal) ? :rare_literal : :ambiguous
end
234
+
235
# True when a literal position looks like a variable slot: either the
# distinct-value fraction is very high on its own, or it is moderate and the
# absolute number of distinct values is large.
def high_cardinality_literal_position?(stats, cardinality_frac)
  if cardinality_frac >= LITERAL_UNIQUENESS_THRESHOLD
    true
  else
    cardinality_frac >= LITERAL_UNIQUENESS_MODERATE_THRESHOLD &&
      stats.cardinality >= MIN_CARDINALITY_FOR_INFERENCE
  end
end
241
+
242
# True when `value` was seen at least POPULAR_MIN_COUNT times and its share
# of the position is at least POPULAR_BASELINE_MULTIPLE times the uniform
# share (1 / number of distinct values).
def popular_outlier?(stats, value)
  occurrences = stats.value_counts[value] || 0
  return false unless occurrences >= POPULAR_MIN_COUNT

  uniform_share = 1.0 / stats.cardinality
  stats.value_fraction(value) >= POPULAR_BASELINE_MULTIPLE * uniform_share
end
249
+
250
# Output token for an annotated segment: a placeholder when the segment was
# classified as variable (by the classifier or by the corpus), otherwise the
# segment's own value.
def corpus_token(entry)
  variable_kinds = %i[variable_identifier corpus_inferred_variable]
  return entry[:value] unless variable_kinds.include?(entry[:classification])

  placeholder_for_variable(entry)
end
258
+
259
# Placeholder text for a segment that is being replaced in normalized output.
#
# Classifier-variable entries use "{hint}" (or "{type}" without a hint).
# Corpus-inferred variables — literal by type, variable by observation — are
# named after the singularized last literal segment of their prefix, falling
# back to "{value}" when no usable name can be derived.
def placeholder_for_variable(entry)
  return "{#{entry[:hint] || entry[:type]}}" if entry[:variable]

  last_literal = entry[:prefix]
                 .split("/")
                 .reject { |segment| segment.empty? || segment.start_with?("{") }
                 .last
  base = last_literal ? Inflector.singularize(last_literal) : nil
  # Singularizing can consume the whole word (e.g. a segment that is just
  # "s"); an empty name would render as "{}", so treat it as no name at all.
  base = nil if base.nil? || base.empty?
  base ? "{#{base}}" : "{value}"
end
269
+
270
+ public
271
+
272
+ # --- Legacy dump/load (JSON shape) ------------------------------------
273
+ #
274
+ # The pre-Storage release exposed `Corpus#dump`, `Corpus#save(path)`, and
275
+ # `Corpus.load(path)` for JSON-backed persistence. Those names still work
276
+ # but are now thin wrappers around the appropriate Storage backend.
277
+
278
# Legacy JSON-shaped dump of the current state.
def dump
  view = memory_view
  view.to_dump
end

# Rebuilds an in-memory corpus from a dump hash. The value cap is read from
# the dump's "max_values_per_position" key when present.
def self.from_dump(h, classifier: SegmentClassifier::DEFAULT)
  cap = h.fetch("max_values_per_position", PositionStats::DEFAULT_MAX_VALUES)
  backend = Storage::Memory.new(classifier: classifier, max_values_per_position: cap)
  backend.load_dump!(h)
  new(storage: backend, classifier: classifier)
end

# Legacy name for Corpus.open.
def self.load(path, classifier: SegmentClassifier::DEFAULT)
  open(path, classifier: classifier)
end
292
+
293
+ private
294
+
295
# Writes the JSON dump to `path` via a sibling "<path>.tmp" file that is
# renamed into place once fully written.
#
# Raises whatever the serialization, write or rename raises; in the latter
# two cases the temporary file is removed first so a failed export does not
# leave a stray "<path>.tmp" behind.
def write_json_dump(path)
  # Serialize before touching the filesystem so a dump failure creates nothing.
  payload = JSON.generate(memory_view.to_dump)
  tmp = "#{path}.tmp"
  begin
    File.write(tmp, payload)
    File.rename(tmp, path)
  rescue StandardError
    File.delete(tmp) if File.exist?(tmp)
    raise
  end
end
300
+
301
+ # Materialize a Memory snapshot of the current state — used by dump for
302
+ # backends that don't natively know how to emit the JSON shape.
303
# Returns an object that responds to `to_dump` for the current state.
#
# If the live backend already responds to `to_dump` it is returned as-is.
# Otherwise a Storage::Memory is created and filled from the backend's
# public readers.
def memory_view
  return @storage if @storage.respond_to?(:to_dump)

  mem = Storage::Memory.new(
    classifier: @classifier,
    max_values_per_position: @storage.max_values_per_position,
  )
  # State is injected with instance_variable_set, so this depends on the
  # instance-variable names used inside Storage::Memory.
  # Counters are merged into Hash.new(0) so missing keys keep reading as 0.
  mem.instance_variable_set(:@host_counts, Hash.new(0).merge(@storage.host_counts))
  mem.instance_variable_set(:@path_length_counts, Hash.new(0).merge(@storage.path_length_counts))
  mem.instance_variable_set(:@raw_shape_counts, Hash.new(0).merge(@storage.raw_shape_counts))
  mem.instance_variable_set(:@fingerprint_counts, Hash.new(0).merge(@storage.fingerprint_counts))
  # Position stats are stored exactly as yielded by the backend; no copy is
  # made here.
  ps = {}
  @storage.each_position_stats { |key, stats| ps[key] = stats }
  mem.instance_variable_set(:@position_stats, ps)
  # Clusters are re-indexed by their key; the cluster objects themselves are
  # reused.
  clusters_h = @storage.clusters.each_with_object({}) { |c, h| h[c.key] = c }
  mem.instance_variable_set(:@clusters, clusters_h)
  mem
end
321
+ end
322
+ end
@@ -3,43 +3,27 @@ module Iriq
3
3
  #
4
4
  # Explanation.explain("https://foo.com/users/123")
5
5
  # # => [
6
- # # { value: "users", type: :literal, variable: false },
7
- # # { value: "123", type: :integer_id, variable: true },
6
+ # # { value: "users", type: :literal, variable: false, hint: nil },
7
+ # # { value: "123", type: :integer_id, variable: true, hint: "user_id" },
8
8
  # # ]
9
9
  module Explanation
10
10
  module_function
11
11
 
12
# Per-segment explanation for an IRI or URN.
#
# input:      an Identifier, or anything Parser.parse accepts.
# classifier: passed through to SegmentHints.derive.
#
# Returns whatever SegmentHints.derive produces for the path segments (or,
# for URNs, for the parts of the namespace-specific string).
def explain(input, classifier: SegmentClassifier::DEFAULT)
  iri = input
  iri = Parser.parse(input) unless input.is_a?(Identifier)
  return explain_urn(iri, classifier) if iri.urn?

  SegmentHints.derive(iri.path_segments, classifier)
end
21
21
 
22
- def segment_entry(segment, classifier)
23
- type = classifier.classify(segment)
24
- {
25
- value: segment,
26
- type: type,
27
- variable: classifier.variable?(type),
28
- }
29
- end
30
-
31
22
# Explains a URN: the namespace-specific string is split once on its first
# ":" (when it has one) and the resulting parts go through SegmentHints.derive.
# Returns [] when the URN has no namespace-specific string.
def explain_urn(iri, classifier)
  nss = iri.nss
  return [] unless nss

  parts = nss.include?(":") ? nss.split(":", 2) : [nss]
  SegmentHints.derive(parts, classifier)
end
44
28
  end
45
29
  end
@@ -0,0 +1,125 @@
1
+ module Iriq
2
+ # Pulls IRIs out of free text. URLs with an explicit scheme (and URNs) are
3
+ # always candidates; scheme-less hosts like "foo.com/x" are additionally
4
+ # matched when `scheme_less:` is true (the default) — see SCHEMELESS_TLDS.
5
+ #
6
+ # Iriq::Extractor.new.extract("Visit https://foo.com today.")
7
+ # # => [#<Iriq::Identifier https://foo.com>]
8
+ #
9
+ # Design draws on twitter-text and GFM autolink rules: scheme anchoring,
10
+ # iterative trailing-punct trim, balanced-paren preservation.
11
+ class Extractor
12
+ SCHEMES = %w[https http ftp wss ws].freeze
13
+
14
+ # Conservative TLD allow-list for scheme-less extraction. Limited to a
15
+ # small set of very common TLDs to keep false-positive rate low. A
16
+ # scheme-less candidate ALSO requires a `/path` to count, so plain
17
+ # `foo.com` in prose still won't match — only `foo.com/something`.
18
+ SCHEMELESS_TLDS = %w[com org net io ai dev co app gov edu].freeze
19
+
20
+ # Boundary chars — a URL ends at any of these (whitespace, angle
21
+ # brackets, quotes, backtick).
22
+ BOUNDARY = %r{[\s<>"'`]}.freeze
23
+
24
+ # Non-ASCII Unicode brackets and quotation marks that almost always
25
+ # terminate a URL in source text (e.g. `「URL」`). ASCII brackets are NOT
26
+ # listed here — those stay inside the URL match so the balanced-paren
27
+ # trim step can handle them (Wikipedia URLs like /Foo_(bar) survive).
28
+ NON_ASCII_BOUNDARY = (
29
+ "」』)】〉》〕〗〙〛⦆}]>" + # CJK closing brackets
30
+ "「『(【〈《〔〖〘〚⦅{[<" + # CJK opening brackets
31
+ "“”‘’„‟‚«»‹›" # Unicode quotation marks
32
+ ).chars.uniq.join.freeze
33
+
34
+ URL_CHAR_CLASS = %{[^\\s<>"'`,#{NON_ASCII_BOUNDARY}]+}.freeze
35
+
36
+ CANDIDATE_RE = %r{
37
+ (?<![\w/]) # not mid-word, not mid-path
38
+ (?:
39
+ (?i:#{SCHEMES.join("|")})://#{URL_CHAR_CLASS} # absolute URL
40
+ |
41
+ urn:[a-zA-Z0-9][a-zA-Z0-9\-]{0,30}:#{URL_CHAR_CLASS} # urn:NID:NSS
42
+ )
43
+ }xu.freeze
44
+
45
+ # Scheme-less alternative — same chars allowed as the absolute URL but
46
+ # requires a host with an allow-listed TLD AND a `/path` to keep prose
47
+ # noise low. The host part allows ASCII labels separated by dots; no
48
+ # Unicode hosts (those are too easily confused with prose).
49
+ SCHEMELESS_ALT = %{(?:[a-zA-Z0-9](?:[a-zA-Z0-9\\-]{0,61}[a-zA-Z0-9])?\\.)+(?i:#{SCHEMELESS_TLDS.join("|")})/#{URL_CHAR_CLASS}}.freeze
50
+
51
+ # Single-scan combined pattern used when scheme_less is on. One regex
52
+ # over the text is meaningfully cheaper than two.
53
+ COMBINED_RE = %r{
54
+ (?<![\w/.@])
55
+ (?:
56
+ (?i:#{SCHEMES.join("|")})://#{URL_CHAR_CLASS}
57
+ |
58
+ urn:[a-zA-Z0-9][a-zA-Z0-9\-]{0,30}:#{URL_CHAR_CLASS}
59
+ |
60
+ #{SCHEMELESS_ALT}
61
+ )
62
+ }xu.freeze
63
+
64
+ # Punctuation that's almost always sentence punctuation rather than part
65
+ # of a URL when it appears at the trailing edge.
66
+ TRAILING_PUNCT_RE = /[.,;:!?'"‘’“”]+\z/u.freeze
67
+
68
+ # Unmatched closing brackets that should be trimmed.
69
+ BRACKET_PAIRS = { ")" => "(", "]" => "[", "}" => "{" }.freeze
70
+
71
+ def initialize(scheme_less: true)
72
+ @scheme_less = scheme_less
73
+ end
74
+
75
+ def extract(text)
76
+ return [] if text.nil? || text.empty?
77
+
78
+ candidates = scan_candidates(text)
79
+ candidates.filter_map do |candidate|
80
+ trimmed = trim(candidate)
81
+ next nil if trimmed.empty?
82
+
83
+ begin
84
+ Parser.parse(trimmed)
85
+ rescue ParseError
86
+ nil
87
+ end
88
+ end
89
+ end
90
+
91
+ # Same as extract but returns only canonical strings, deduplicated,
92
+ # preserving first-seen order.
93
+ def extract_strings(text)
94
+ seen = {}
95
+ extract(text).each { |iri| seen[iri.canonical] ||= true }
96
+ seen.keys
97
+ end
98
+
99
+ private
100
+
101
+ # One regex scan over the text — combined pattern when scheme-less is
102
+ # on, scheme-anchored only otherwise.
103
+ def scan_candidates(text)
104
+ pattern = @scheme_less ? COMBINED_RE : CANDIDATE_RE
105
+ text.scan(pattern)
106
+ end
107
+
108
+ # Iteratively strip sentence punctuation and unmatched closing brackets
109
+ # until the candidate stabilizes.
110
+ def trim(candidate)
111
+ s = candidate.dup
112
+ loop do
113
+ before = s
114
+ s = s.sub(TRAILING_PUNCT_RE, "")
115
+ BRACKET_PAIRS.each do |close, open|
116
+ while s.end_with?(close) && s.count(close) > s.count(open)
117
+ s = s[0...-1]
118
+ end
119
+ end
120
+ break if s == before
121
+ end
122
+ s
123
+ end
124
+ end
125
+ end
@@ -43,9 +43,17 @@ module Iriq
43
43
  out << "#{scheme}://" if scheme
44
44
  out << host if host
45
45
  out << ":#{port}" if port
46
- out << "/" + path_segments.join("/") if path_segments.any?
47
- out << "?#{query}" if query && !query.empty?
48
- out << "##{fragment}" if fragment && !fragment.empty?
46
+ has_query = query && !query.empty?
47
+ has_fragment = fragment && !fragment.empty?
48
+ if path_segments.any?
49
+ out << "/" + path_segments.join("/")
50
+ elsif has_query || has_fragment
51
+ # RFC 3986: an authority with query/fragment but no path needs the
52
+ # implied "/" to be a valid URI.
53
+ out << "/"
54
+ end
55
+ out << "?#{query}" if has_query
56
+ out << "##{fragment}" if has_fragment
49
57
  out
50
58
  end
51
59
  end
@@ -0,0 +1,145 @@
1
+ require "set"
2
+
3
+ module Iriq
4
+ # Singularization with a swappable adapter.
5
+ #
6
+ # By default uses ActiveSupport's inflector if it can be required, otherwise
7
+ # falls back to BuiltinAdapter. Override globally:
8
+ #
9
+ # Iriq::Inflector.adapter = MyAdapter # must respond to .singularize(String)
10
+ #
11
+ # And reset to default with `Iriq::Inflector.reset_adapter!`.
12
module Inflector
  # Upper bound on cached words; the cache is emptied wholesale once it is
  # reached.
  CACHE_MAX = 10_000

  class << self
    # Singularizes `word` through the current adapter, memoizing the result.
    #
    # The cache keeps its own frozen copy of each String result and hands out
    # a fresh copy on every hit, so neither the caller's input nor a returned
    # value can be mutated into (or out of) the cache.
    def singularize(word)
      cache = (@cache ||= {})
      cached = cache[word]
      return cached.dup if cached

      cache.clear if cache.size >= CACHE_MAX
      result = adapter.singularize(word)
      # Adapters may return the very object they were given (the builtin one
      # does for unchanged words), so store a private copy rather than it.
      cache[word] = result.is_a?(String) ? result.dup.freeze : result
      result
    end

    # Current adapter; resolved lazily on first use.
    def adapter
      @adapter ||= default_adapter
    end

    # Installs a custom adapter (anything responding to `singularize`).
    def adapter=(value)
      @adapter = value
      @cache = {} # different adapter could singularize differently
    end

    # Forgets the adapter and cache; the default is re-resolved on next use.
    def reset_adapter!
      @adapter = nil
      @cache = {}
    end

    # ActiveSupport's inflector when it can be required, else the builtin
    # rule-based adapter. The require is deliberately lazy.
    def default_adapter
      require "active_support/inflector"
      ActiveSupportAdapter
    rescue LoadError
      BuiltinAdapter
    end
  end

  module ActiveSupportAdapter
    def self.singularize(word)
      ::ActiveSupport::Inflector.singularize(word.to_s)
    end
  end

  # Rule-based English singularizer. Rules are ordered most-specific-first
  # and adapted from ActiveSupport's default inflections.
  module BuiltinAdapter
    IRREGULARS = {
      "people" => "person",
      "children" => "child",
      "men" => "man",
      "women" => "woman",
      "mice" => "mouse",
      "geese" => "goose",
      "oxen" => "ox",
      "feet" => "foot",
      "teeth" => "tooth",
      "lives" => "life",
      "wives" => "wife",
      "moves" => "move",
      "zombies" => "zombie",
      # latin/greek plurals that don't fit a clean suffix rule
      "indices" => "index",
      "vertices" => "vertex",
      # -f/-fe words where the stem doesn't end in l/r/i
      "leaves" => "leaf",
      "calves" => "calf",
      "halves" => "half",
      "loaves" => "loaf",
      "hooves" => "hoof",
    }.freeze

    UNCOUNTABLE = Set.new(%w[
      news fish sheep deer series species equipment information
      money rice jeans police data media
    ]).freeze

    # [pattern, replacement] — first match wins.
    RULES = [
      [/(quiz)zes$/i, '\1'],
      [/(matri|appendi)ces$/i, '\1x'],
      [/(ox)en$/i, '\1'],
      [/(alias|status)(es)?$/i, '\1'],
      [/(octop|vir)(us|i)$/i, '\1us'],
      [/(cris|ax|test)es$/i, '\1is'],
      [/(shoe)s$/i, '\1'],
      [/(bus)(es)?$/i, '\1'],
      [/([ml])ice$/i, '\1ouse'],
      [/(x|ch|ss|sh)es$/i, '\1'],
      [/(m)ovies$/i, '\1ovie'],
      [/(s)eries$/i, '\1eries'],
      [/([^aeiouy]|qu)ies$/i, '\1y'],
      [/([lr])ves$/i, '\1f'],
      [/(tive)s$/i, '\1'],
      [/(hive)s$/i, '\1'],
      [/([^f])ves$/i, '\1fe'],
      [/((a)naly|(b)a|(d)iagno|(p)arenthe|(p)rogno|(s)ynop|(t)he)ses$/i, '\1sis'],
      [/([ti])a$/i, '\1um'],
      [/(n)ews$/i, '\1ews'],
      [/(o)es$/i, '\1'],
      [/(ss)$/i, '\1'],
      [/s$/i, ''],
    ].freeze

    # Singular form of `word`.
    #
    # nil is returned as nil; any other input is coerced with `to_s`, the
    # same way ActiveSupportAdapter treats its argument. Uncountable words
    # and words no rule matches come back unchanged.
    def self.singularize(word)
      return word if word.nil?

      word = word.to_s
      return word if word.empty?

      lower = word.downcase
      return word if UNCOUNTABLE.include?(lower)

      if (irr = IRREGULARS[lower])
        return preserve_case(word, irr)
      end

      RULES.each do |pattern, replacement|
        next unless word.match?(pattern)

        # Replacements are lowercase literals; re-apply the input's casing so
        # an all-caps word does not end in a stray lowercase suffix.
        return preserve_case(word, word.sub(pattern, replacement))
      end

      word
    end

    # Re-applies the casing of `original` to `lowered`: all-uppercase stays
    # all-uppercase, a leading capital stays capitalised, anything else is
    # returned as given.
    def self.preserve_case(original, lowered)
      if original == original.upcase
        lowered.upcase
      elsif original[0] == original[0].upcase
        lowered.sub(/\A./, &:upcase)
      else
        lowered
      end
    end
  end
end
145
+ end