iriq 0.0.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,268 @@
1
+ require "json"
2
+
3
+ module Iriq
4
+ # Streaming-friendly observer over a (potentially unbounded) corpus of IRIs.
5
+ # Maintains rolling aggregates and per-(host, prefix) frequency stats so
6
+ # that classification can improve as more data flows in.
7
+ #
8
+ # The deterministic, single-IRI API (Iriq.normalize/explain) is unchanged —
9
+ # Corpus#normalize and Corpus#explain are the corpus-informed variants.
10
class Corpus
  # Type-based: position is "mostly variable" (UUIDs/integers/etc.).
  VARIABLE_DOMINANCE_THRESHOLD = 0.8

  # Cardinality-based: position has mostly distinct literal values, so the
  # literal "type" is misleading — it's really a variable slot. We trigger
  # on either:
  #   - very high cardinality fraction (most observations are singletons), OR
  #   - moderate cardinality fraction AND high absolute distinct count
  # The second branch catches realistic streams where popular outliers
  # bring the frac down but the long tail is clearly variable.
  LITERAL_UNIQUENESS_THRESHOLD = 0.8
  LITERAL_UNIQUENESS_MODERATE_THRESHOLD = 0.5
  MIN_CARDINALITY_FOR_INFERENCE = 20

  # Don't apply corpus heuristics until we have at least this many
  # observations at a position — too easy to be wrong with tiny samples.
  MIN_OBSERVATIONS_FOR_INFERENCE = 5

  # Value-fraction at or above which a literal is considered the stable
  # occupant of its position.
  STABLE_LITERAL_THRESHOLD = 0.5

  # Within a high-cardinality literal position (mostly singletons), a
  # specific value qualifies as a "popular outlier" — and gets preserved
  # as :stable_literal instead of being lumped into :corpus_inferred_variable
  # — when its count is at least POPULAR_MIN_COUNT and its frequency is at
  # least POPULAR_BASELINE_MULTIPLE × the uniform baseline (1/cardinality).
  POPULAR_MIN_COUNT = 5
  POPULAR_BASELINE_MULTIPLE = 3

  attr_reader :host_counts, :path_length_counts, :raw_shape_counts,
              :fingerprint_counts, :position_stats

  # Rebuilds a corpus from the Hash produced by #dump (string keys, as they
  # come back from JSON.parse).
  #
  # @param h [Hash] dumped state; "max_values_per_position" is optional
  # @param classifier segment classifier used by the restored corpus
  # @return [Corpus]
  def self.from_dump(h, classifier: SegmentClassifier::DEFAULT)
    c = new(
      classifier: classifier,
      max_values_per_position: h.fetch("max_values_per_position", PositionStats::DEFAULT_MAX_VALUES),
    )
    c.instance_variable_set(:@host_counts, Hash.new(0).merge(h["host_counts"]))
    # JSON object keys are always strings; path lengths are integers.
    c.instance_variable_set(:@path_length_counts, Hash.new(0).merge(h["path_length_counts"].transform_keys(&:to_i)))
    c.instance_variable_set(:@raw_shape_counts, Hash.new(0).merge(h["raw_shape_counts"]))
    c.instance_variable_set(:@fingerprint_counts, Hash.new(0).merge(h["fingerprint_counts"]))
    stats = h["position_stats"].each_with_object({}) do |(host, prefix, sdump), acc|
      acc[[host, prefix]] = PositionStats.from_dump(sdump)
    end
    c.instance_variable_set(:@position_stats, stats)
    c.instance_variable_set(:@clusterer, Clusterer.from_dump(h["clusterer"], classifier: classifier))
    c
  end

  # Reads a JSON file written by #save and rebuilds the corpus from it.
  def self.load(path, classifier: SegmentClassifier::DEFAULT)
    from_dump(JSON.parse(File.read(path)), classifier: classifier)
  end

  # @param classifier segment classifier shared with the clusterer
  # @param max_values_per_position passed to every PositionStats created
  def initialize(classifier: SegmentClassifier::DEFAULT,
                 max_values_per_position: PositionStats::DEFAULT_MAX_VALUES)
    @classifier = classifier
    @max_values_per_position = max_values_per_position
    @host_counts = Hash.new(0)
    @path_length_counts = Hash.new(0)
    @raw_shape_counts = Hash.new(0)
    @fingerprint_counts = Hash.new(0)
    @position_stats = {}
    @clusterer = Clusterer.new(classifier: classifier)
  end

  # Observe a single IRI. Returns an Observation.
  def observe(input)
    iri = coerce(input)
    hinted_entries = SegmentHints.derive(iri.path_segments, @classifier)
    # record_aggregates already builds the hinted shape for its fingerprint
    # counter; reuse it rather than constructing the same shape a second time.
    hinted_shape = record_aggregates(iri, hinted_entries)
    cluster = @clusterer.add(iri, shape: hinted_shape)
    Observation.new(corpus: self, identifier: iri, cluster: cluster)
  end

  # Corpus-informed normalization. Falls back to mechanical normalization
  # when the IRI is a URN or has no path segments.
  #
  # The corpus-informed form is built from scheme, host, port and path only;
  # query and fragment are not part of the returned string.
  def normalize(input)
    iri = coerce(input)
    return Normalizer.normalize_identifier(iri) if iri.urn? || iri.path_segments.empty?

    tokens = annotate_segments(iri).map { |entry| corpus_token(entry) }
    out = +""
    out << "#{iri.scheme}://" if iri.scheme
    out << iri.host if iri.host
    out << ":#{iri.port}" if iri.port
    out << "/" << tokens.join("/")
    out
  end

  # Per-segment explanation with corpus-informed `classification`.
  # Returns an array of entries shaped like the Explanation rows plus
  # `classification:` ∈ :stable_literal, :variable_identifier,
  # :rare_literal, :ambiguous, :corpus_inferred_variable.
  def explain(input)
    iri = coerce(input)
    annotate_segments(iri).map do |entry|
      # :prefix is internal bookkeeping for placeholder derivation.
      entry.reject { |k, _| k == :prefix }
    end
  end

  def clusters
    @clusterer.clusters
  end

  def size
    @clusterer.size
  end

  # Stats for a given (host, prefix_shape) — useful for tests and
  # debugging. Returns nil if nothing has been observed there.
  def stats_for(host, prefix)
    @position_stats[[host, prefix]]
  end

  # JSON-friendly snapshot of the whole corpus state (see .from_dump).
  def dump
    {
      "host_counts" => @host_counts,
      "path_length_counts" => @path_length_counts.transform_keys(&:to_s),
      "raw_shape_counts" => @raw_shape_counts,
      "fingerprint_counts" => @fingerprint_counts,
      "max_values_per_position" => @max_values_per_position,
      "position_stats" => @position_stats.map { |(host, prefix), s| [host, prefix, s.dump] },
      "clusterer" => @clusterer.dump,
    }
  end

  # Writes the dump as JSON to `path`. The data goes to "<path>.tmp" first
  # and is renamed into place, so readers never see a half-written file.
  # If writing or renaming fails the temp file is removed and the error
  # propagates.
  def save(path)
    tmp = "#{path}.tmp"
    File.write(tmp, JSON.generate(dump))
    File.rename(tmp, path)
  ensure
    # After a successful rename the temp file no longer exists; this only
    # fires on failure paths.
    File.delete(tmp) if tmp && File.exist?(tmp)
  end

  private

  def coerce(input)
    input.is_a?(Identifier) ? input : Parser.parse(input)
  end

  # Updates every rolling counter for one IRI and returns the hinted
  # (fingerprint) shape so the caller can reuse it.
  def record_aggregates(iri, hinted_entries)
    @host_counts[iri.host] += 1 if iri.host
    @path_length_counts[iri.path_segments.size] += 1

    raw = PathShape.new(classifier: @classifier, hints: false).from_entries(hinted_entries)
    fp = PathShape.new(classifier: @classifier, hints: true).from_entries(hinted_entries)
    @raw_shape_counts[raw] += 1
    @fingerprint_counts[fp] += 1

    record_position_stats(iri, hinted_entries)
    fp
  end

  # Feeds each segment into the PositionStats keyed by (host, prefix), where
  # prefix is the placeholder-shaped path leading up to the segment.
  def record_position_stats(iri, hinted_entries)
    prefix = ""
    hinted_entries.each do |entry|
      key = [iri.host, prefix]
      stats = @position_stats[key] ||= PositionStats.new(max_values: @max_values_per_position)
      stats.observe(entry[:value], entry[:type])
      prefix = "#{prefix}/#{placeholder(entry)}"
    end
  end

  # Walks the IRI's segments and returns hint-derived entries enriched with
  # the position's :prefix string and a :classification symbol.
  def annotate_segments(iri)
    hinted = SegmentHints.derive(iri.path_segments, @classifier)
    prefix = ""
    hinted.map do |entry|
      stats = @position_stats[[iri.host, prefix]]
      out = entry.merge(
        prefix: prefix,
        classification: classify(entry, stats),
      )
      prefix = "#{prefix}/#{placeholder(entry)}"
      out
    end
  end

  # Prefix component for a segment: the literal value, or "{hint-or-type}"
  # when the classifier marked the segment as variable.
  def placeholder(entry)
    return entry[:value] unless entry[:variable]

    "{#{entry[:hint] || entry[:type]}}"
  end

  # Maps one entry plus the stats of its position to a classification
  # symbol. Without stats the classifier's own verdict is all we have.
  def classify(entry, stats)
    variable = entry[:variable]

    return variable ? :variable_identifier : :ambiguous if stats.nil? || stats.total.zero?
    return :variable_identifier if variable

    value = entry[:value]
    total = stats.total
    variable_frac = stats.variable_fraction(@classifier)
    cardinality_frac = stats.cardinality.to_f / total
    enough_data = total >= MIN_OBSERVATIONS_FOR_INFERENCE
    value_frac = stats.value_fraction(value)

    if enough_data && variable_frac >= VARIABLE_DOMINANCE_THRESHOLD
      # Position is dominated by variable types (UUIDs, integers, etc.).
      # A literal here is a special-case outlier (e.g. /users/me).
      stats.value_counts.key?(value) ? :rare_literal : :ambiguous
    elsif value_frac >= STABLE_LITERAL_THRESHOLD
      # This specific value dominates — preserve it regardless of how
      # diverse the rest of the position is.
      :stable_literal
    elsif enough_data && high_cardinality_literal_position?(stats, cardinality_frac)
      # High-cardinality literal position — usually a variable slot, but
      # recognize values that dramatically exceed the uniform baseline as
      # "popular outliers" (e.g. /workspaces/mainspace surviving in a slot
      # full of one-shot user-created workspace names).
      popular_outlier?(stats, value) ? :stable_literal : :corpus_inferred_variable
    elsif stats.cardinality == 1
      :stable_literal
    elsif stats.value_counts.key?(value)
      :rare_literal
    else
      :ambiguous
    end
  end

  def high_cardinality_literal_position?(stats, cardinality_frac)
    return true if cardinality_frac >= LITERAL_UNIQUENESS_THRESHOLD

    cardinality_frac >= LITERAL_UNIQUENESS_MODERATE_THRESHOLD &&
      stats.cardinality >= MIN_CARDINALITY_FOR_INFERENCE
  end

  def popular_outlier?(stats, value)
    count = stats.value_counts[value] || 0
    return false if count < POPULAR_MIN_COUNT

    baseline = 1.0 / stats.cardinality
    stats.value_fraction(value) >= POPULAR_BASELINE_MULTIPLE * baseline
  end

  # Output token for a segment: a placeholder for both kinds of variable
  # classification, the raw value otherwise.
  def corpus_token(entry)
    case entry[:classification]
    when :variable_identifier, :corpus_inferred_variable
      placeholder_for_variable(entry)
    else
      entry[:value]
    end
  end

  def placeholder_for_variable(entry)
    return placeholder(entry) if entry[:variable]

    # corpus-inferred variable: classifier said literal, corpus says
    # otherwise. Derive a hint from the prefix's last literal segment if
    # we can.
    last_literal = entry[:prefix].split("/").reverse_each.find do |segment|
      !segment.empty? && !segment.start_with?("{")
    end
    base = last_literal && Inflector.singularize(last_literal)
    # An empty singularization (e.g. of "s") would otherwise render "{}".
    base.to_s.empty? ? "{value}" : "{#{base}}"
  end
end
268
+ end
@@ -3,43 +3,27 @@ module Iriq
3
3
  #
4
4
  # Explanation.explain("https://foo.com/users/123")
5
5
  # # => [
6
- # # { value: "users", type: :literal, variable: false },
7
- # # { value: "123", type: :integer_id, variable: true },
6
+ # # { value: "users", type: :literal, variable: false, hint: nil },
7
+ # # { value: "123", type: :integer_id, variable: true, hint: "user_id" },
8
8
  # # ]
9
9
  module Explanation
10
10
  module_function
11
11
 
12
- def explain(input, classifier: SegmentClassifier.new)
12
+ def explain(input, classifier: SegmentClassifier::DEFAULT)
13
13
  iri = input.is_a?(Identifier) ? input : Parser.parse(input)
14
14
 
15
15
  if iri.urn?
16
16
  explain_urn(iri, classifier)
17
17
  else
18
- iri.path_segments.map { |s| segment_entry(s, classifier) }
18
+ SegmentHints.derive(iri.path_segments, classifier)
19
19
  end
20
20
  end
21
21
 
22
- def segment_entry(segment, classifier)
23
- type = classifier.classify(segment)
24
- {
25
- value: segment,
26
- type: type,
27
- variable: classifier.variable?(type),
28
- }
29
- end
30
-
31
22
  def explain_urn(iri, classifier)
32
23
  return [] unless iri.nss
33
24
 
34
- if iri.nss.include?(":")
35
- ns, value = iri.nss.split(":", 2)
36
- [
37
- { value: ns, type: :literal, variable: false },
38
- segment_entry(value, classifier),
39
- ]
40
- else
41
- [segment_entry(iri.nss, classifier)]
42
- end
25
+ parts = iri.nss.include?(":") ? iri.nss.split(":", 2) : [iri.nss]
26
+ SegmentHints.derive(parts, classifier)
43
27
  end
44
28
  end
45
29
  end
@@ -0,0 +1,125 @@
1
+ module Iriq
2
# Pulls IRIs out of free text.
#
#   Iriq::Extractor.new.extract("Visit https://foo.com today.")
#   # => [#<Iriq::Identifier https://foo.com>]
#
# Absolute URLs (schemes listed in SCHEMES) and URNs are always extracted.
# With `scheme_less: true` (the default) scheme-less candidates such as
# "foo.com/x" are extracted too, but only when the host ends in an
# allow-listed TLD and is followed by a `/path`; pass `scheme_less: false`
# for strictly scheme-anchored extraction.
#
# Design draws on twitter-text and GFM autolink rules: scheme anchoring,
# iterative trailing-punct trim, balanced-paren preservation.
class Extractor
  SCHEMES = %w[https http ftp wss ws].freeze

  # Conservative TLD allow-list for scheme-less extraction. Limited to a
  # small set of very common TLDs to keep false-positive rate low. A
  # scheme-less candidate ALSO requires a `/path` to count, so plain
  # `foo.com` in prose still won't match — only `foo.com/something`.
  SCHEMELESS_TLDS = %w[com org net io ai dev co app gov edu].freeze

  # Boundary chars — a URL ends at any of these (whitespace, angle
  # brackets, quotes, backtick).
  BOUNDARY = %r{[\s<>"'`]}.freeze

  # Non-ASCII Unicode brackets and quotation marks that almost always
  # terminate a URL in source text (e.g. `「URL」`). ASCII brackets are NOT
  # listed here — those stay inside the URL match so the balanced-paren
  # trim step can handle them (Wikipedia URLs like /Foo_(bar) survive).
  # The full-width forms are written as \u escapes because they are easily
  # confused with (or normalized into) their ASCII look-alikes.
  NON_ASCII_BOUNDARY = (
    "」』】〉》〕〗〙〛⦆\uFF09\uFF5D\uFF3D\uFF1E" + # CJK / full-width closing brackets
    "「『【〈《〔〖〘〚⦅\uFF08\uFF5B\uFF3B\uFF1C" + # CJK / full-width opening brackets
    "“”‘’„‟‚«»‹›"                                # Unicode quotation marks
  ).chars.uniq.join.freeze

  # Regexp source for "one or more URL characters". The boundary list is
  # escaped before interpolation so that no character in it can ever be
  # read as character-class syntax (`]`, `[`, `-`, `^`, `\`).
  URL_CHAR_CLASS = "[^\\s<>\"'`,#{Regexp.escape(NON_ASCII_BOUNDARY)}]+".freeze

  CANDIDATE_RE = %r{
    (?<![\w/])                                              # not mid-word, not mid-path
    (?:
      (?i:#{SCHEMES.join("|")})://#{URL_CHAR_CLASS}         # absolute URL
      |
      urn:[a-zA-Z0-9][a-zA-Z0-9\-]{0,30}:#{URL_CHAR_CLASS}  # urn:NID:NSS
    )
  }xu.freeze

  # Scheme-less alternative — same chars allowed as the absolute URL but
  # requires a host with an allow-listed TLD AND a `/path` to keep prose
  # noise low. The host part allows ASCII labels separated by dots; no
  # Unicode hosts (those are too easily confused with prose).
  SCHEMELESS_ALT = "(?:[a-zA-Z0-9](?:[a-zA-Z0-9\\-]{0,61}[a-zA-Z0-9])?\\.)+(?i:#{SCHEMELESS_TLDS.join("|")})/#{URL_CHAR_CLASS}".freeze

  # Single-scan combined pattern used when scheme_less is on. One regex
  # over the text is meaningfully cheaper than two. The lookbehind also
  # rejects a preceding "." or "@" so the scheme-less branch cannot start
  # in the middle of a host name or an e-mail address.
  COMBINED_RE = %r{
    (?<![\w/.@])
    (?:
      (?i:#{SCHEMES.join("|")})://#{URL_CHAR_CLASS}
      |
      urn:[a-zA-Z0-9][a-zA-Z0-9\-]{0,30}:#{URL_CHAR_CLASS}
      |
      #{SCHEMELESS_ALT}
    )
  }xu.freeze

  # Punctuation that's almost always sentence punctuation rather than part
  # of a URL when it appears at the trailing edge.
  TRAILING_PUNCT_RE = /[.,;:!?'"‘’“”]+\z/u.freeze

  # Unmatched closing brackets that should be trimmed.
  BRACKET_PAIRS = { ")" => "(", "]" => "[", "}" => "{" }.freeze

  # @param scheme_less [Boolean] also extract "host.tld/path" candidates
  def initialize(scheme_less: true)
    @scheme_less = scheme_less
  end

  # Extracts every parseable IRI from `text`, in order of appearance.
  # Candidates that trim down to nothing or that Parser rejects with
  # ParseError are dropped.
  #
  # @param text [String, nil]
  # @return [Array] whatever Parser.parse returns for each candidate
  def extract(text)
    return [] if text.nil? || text.empty?

    scan_candidates(text).filter_map do |candidate|
      trimmed = trim(candidate)
      next nil if trimmed.empty?

      begin
        Parser.parse(trimmed)
      rescue ParseError
        nil
      end
    end
  end

  # Same as extract but returns only canonical strings, deduplicated,
  # preserving first-seen order.
  def extract_strings(text)
    seen = {}
    extract(text).each { |iri| seen[iri.canonical] ||= true }
    seen.keys
  end

  private

  # One regex scan over the text — combined pattern when scheme-less is
  # on, scheme-anchored only otherwise. Neither pattern has capture groups,
  # so String#scan yields plain strings.
  def scan_candidates(text)
    pattern = @scheme_less ? COMBINED_RE : CANDIDATE_RE
    text.scan(pattern)
  end

  # Iteratively strip sentence punctuation and unmatched closing brackets
  # until the candidate stabilizes.
  def trim(candidate)
    s = candidate.dup
    loop do
      before = s
      s = s.sub(TRAILING_PUNCT_RE, "")
      BRACKET_PAIRS.each do |close, open|
        # A trailing closer is only noise when it has no opener to pair with.
        while s.end_with?(close) && s.count(close) > s.count(open)
          s = s[0...-1]
        end
      end
      break if s == before
    end
    s
  end
end
125
+ end
@@ -43,9 +43,17 @@ module Iriq
43
43
  out << "#{scheme}://" if scheme
44
44
  out << host if host
45
45
  out << ":#{port}" if port
46
- out << "/" + path_segments.join("/") if path_segments.any?
47
- out << "?#{query}" if query && !query.empty?
48
- out << "##{fragment}" if fragment && !fragment.empty?
46
+ has_query = query && !query.empty?
47
+ has_fragment = fragment && !fragment.empty?
48
+ if path_segments.any?
49
+ out << "/" + path_segments.join("/")
50
+ elsif has_query || has_fragment
51
+ # RFC 3986: an authority with query/fragment but no path needs the
52
+ # implied "/" to be a valid URI.
53
+ out << "/"
54
+ end
55
+ out << "?#{query}" if has_query
56
+ out << "##{fragment}" if has_fragment
49
57
  out
50
58
  end
51
59
  end
@@ -0,0 +1,145 @@
1
+ require "set"
2
+
3
+ module Iriq
4
+ # Singularization with a swappable adapter.
5
+ #
6
+ # By default uses ActiveSupport's inflector if it can be required, otherwise
7
+ # falls back to BuiltinAdapter. Override globally:
8
+ #
9
+ # Iriq::Inflector.adapter = MyAdapter # must respond to .singularize(String)
10
+ #
11
+ # And reset to default with `Iriq::Inflector.reset_adapter!`.
12
+ module Inflector
13
+ # Vocabulary is bounded in practice; cache + cap matches the
14
+ # SegmentClassifier strategy.
15
+ CACHE_MAX = 10_000
16
+
17
class << self
  # Singularizes `word` through the active adapter, memoizing truthy
  # results. When the memo table has reached CACHE_MAX entries it is emptied
  # before the new entry is stored.
  def singularize(word)
    memo = (@cache ||= {})
    hit = memo[word]
    return hit if hit

    memo.clear unless memo.size < CACHE_MAX
    memo[word] = adapter.singularize(word)
  end

  # The adapter in use; resolved lazily on first access.
  def adapter
    return @adapter if @adapter

    @adapter = default_adapter
  end

  # Installs a custom adapter (anything responding to .singularize).
  def adapter=(value)
    @adapter = value
    # Results memoized under the previous adapter may no longer be valid.
    @cache = {}
  end

  # Forgets the adapter and the memo table; the default adapter is chosen
  # again on the next call to #adapter.
  def reset_adapter!
    @adapter = nil
    @cache = {}
  end

  # ActiveSupportAdapter when active_support/inflector can be required,
  # BuiltinAdapter otherwise.
  def default_adapter
    begin
      require "active_support/inflector"
    rescue LoadError
      return BuiltinAdapter
    end
    ActiveSupportAdapter
  end
end
48
+
49
# Adapter that delegates to ActiveSupport's inflector.
module ActiveSupportAdapter
  class << self
    # @param word [#to_s] converted with #to_s before delegation
    def singularize(word)
      text = word.to_s
      ::ActiveSupport::Inflector.singularize(text)
    end
  end
end
54
+
55
+ # Rule-based English singularizer. Rules are ordered most-specific-first
56
+ # and adapted from ActiveSupport's default inflections.
57
module BuiltinAdapter
  # Whole-word exceptions, keyed by the lower-cased plural.
  IRREGULARS = {
    "people" => "person",
    "children" => "child",
    "men" => "man",
    "women" => "woman",
    "mice" => "mouse",
    "geese" => "goose",
    "oxen" => "ox",
    "feet" => "foot",
    "teeth" => "tooth",
    "lives" => "life",
    "wives" => "wife",
    "moves" => "move",
    "zombies" => "zombie",
    # latin/greek plurals that don't fit a clean suffix rule
    "indices" => "index",
    "vertices" => "vertex",
    # -f/-fe words where the stem doesn't end in l/r/i
    "leaves" => "leaf",
    "calves" => "calf",
    "halves" => "half",
    "loaves" => "loaf",
    "hooves" => "hoof",
  }.freeze

  # Words returned unchanged (compared lower-cased).
  UNCOUNTABLE = Set.new(%w[
    news fish sheep deer series species equipment information
    money rice jeans police data media
  ]).freeze

  # [pattern, replacement] — first match wins.
  #
  # Several rules accept the singular ending as well as the plural one
  # ("-sis"/"-ses", "-is"/"-es") so that a word that is already singular
  # maps to itself instead of falling through to the bare /s\z/ rule
  # ("analysis" must not become "analysi").
  RULES = [
    # Must precede the "-sis" rule, whose "(b)a" stem would otherwise turn
    # "databases" into "databasis".
    [/(database)s\z/i, '\1'],
    [/(quiz)zes\z/i, '\1'],
    [/(matri|appendi)ces\z/i, '\1x'],
    [/(ox)en\z/i, '\1'],
    [/(alias|status)(es)?\z/i, '\1'],
    [/(octop|vir)(us|i)\z/i, '\1us'],
    # Anchored to the whole word: unanchored it rewrites "taxes" to "taxis".
    [/\A(a)x[ie]s\z/i, '\1xis'],
    [/(cris|test)(is|es)\z/i, '\1is'],
    [/(shoe)s\z/i, '\1'],
    [/(bus)(es)?\z/i, '\1'],
    # Anchored to the whole word: unanchored it rewrites "slice" to "slouse".
    [/\A([ml])ice\z/i, '\1ouse'],
    [/(x|ch|ss|sh)es\z/i, '\1'],
    [/(m)ovies\z/i, '\1ovie'],
    [/(s)eries\z/i, '\1eries'],
    [/([^aeiouy]|qu)ies\z/i, '\1y'],
    [/([lr])ves\z/i, '\1f'],
    [/(tive)s\z/i, '\1'],
    [/(hive)s\z/i, '\1'],
    [/([^f])ves\z/i, '\1fe'],
    [/((a)naly|(b)a|(d)iagno|(p)arenthe|(p)rogno|(s)ynop|(t)he)(sis|ses)\z/i, '\1sis'],
    [/([ti])a\z/i, '\1um'],
    [/(n)ews\z/i, '\1ews'],
    [/(o)es\z/i, '\1'],
    [/(ss)\z/i, '\1'],
    [/s\z/i, ''],
  ].freeze

  # Returns the singular form of `word`.
  #
  # Lookup order: nil/empty passthrough, UNCOUNTABLE, IRREGULARS (with the
  # input's capitalisation carried over), then the first matching RULES
  # entry. A word no rule matches is returned unchanged.
  #
  # @param word [String, nil]
  # @return [String, nil]
  def self.singularize(word)
    return word if word.nil? || word.empty?

    lower = word.downcase
    return word if UNCOUNTABLE.include?(lower)

    irregular = IRREGULARS[lower]
    return preserve_case(word, irregular) if irregular

    pattern, replacement = RULES.find { |rule, _| word.match?(rule) }
    pattern ? word.sub(pattern, replacement) : word
  end

  # Re-applies the capitalisation style of `original` (ALL CAPS, Capitalised
  # or lower) to the lower-case replacement `lowered`.
  def self.preserve_case(original, lowered)
    if original == original.upcase
      lowered.upcase
    elsif original[0] == original[0].upcase
      lowered.sub(/\A./, &:upcase)
    else
      lowered
    end
  end
end
144
+ end
145
+ end
@@ -2,24 +2,27 @@ module Iriq
2
2
  # Produces a canonical, shape-aware string for an identifier.
3
3
  #
4
4
  # Normalizer.normalize("https://Foo.com:443/users/123")
5
- # # => "https://foo.com/users/{integer_id}"
5
+ # # => "https://foo.com/users/{user_id}"
6
6
  #
7
7
  # The form is intended for grouping/diffing — it is not a round-trippable URL.
8
8
  module Normalizer
9
9
  module_function
10
10
 
11
- def normalize(input, classifier: SegmentClassifier.new)
11
+ def normalize(input, classifier: SegmentClassifier::DEFAULT, hints: true)
12
12
  iri = input.is_a?(Identifier) ? input : Parser.parse(input)
13
- normalize_identifier(iri, classifier: classifier)
13
+ normalize_identifier(iri, classifier: classifier, hints: hints)
14
14
  end
15
15
 
16
- def normalize_identifier(iri, classifier: SegmentClassifier.new)
16
+ def normalize_identifier(iri, classifier: SegmentClassifier::DEFAULT, hints: true)
17
17
  if iri.urn?
18
- # urn:isbn:0451450523 -> urn:isbn:{integer_id}
19
18
  if iri.scheme == "urn" && iri.nss && iri.nss.include?(":")
20
19
  ns, value = iri.nss.split(":", 2)
21
- type = classifier.classify(value)
22
- shaped = classifier.variable?(type) ? "{#{type}}" : value
20
+ entry = SegmentHints.derive([ns, value], classifier).last
21
+ shaped = if entry[:variable]
22
+ "{#{(hints && entry[:hint]) || entry[:type]}}"
23
+ else
24
+ entry[:value]
25
+ end
23
26
  "urn:#{ns}:#{shaped}"
24
27
  else
25
28
  iri.canonical
@@ -29,7 +32,7 @@ module Iriq
29
32
  out << "#{iri.scheme}://" if iri.scheme
30
33
  out << iri.host if iri.host
31
34
  out << ":#{iri.port}" if iri.port
32
- out << PathShape.new(classifier: classifier).for(iri.path_segments)
35
+ out << PathShape.new(classifier: classifier, hints: hints).for(iri.path_segments)
33
36
  if iri.query_params && !iri.query_params.empty?
34
37
  out << "?" + shape_query(iri.query_params, classifier)
35
38
  end
@@ -0,0 +1,25 @@
1
+ module Iriq
2
+ # The result of Corpus#observe. Lightweight value object — heavy work
3
+ # (explanation, normalization) is deferred until you ask.
4
class Observation
  # The observed identifier and the cluster it was added to, exactly as
  # handed to the constructor.
  attr_reader :identifier
  attr_reader :cluster

  # @param corpus object answering #explain and #normalize for an identifier
  # @param identifier the observed identifier
  # @param cluster the cluster returned when the identifier was added
  def initialize(corpus:, identifier:, cluster:)
    @cluster = cluster
    @identifier = identifier
    @corpus = corpus
  end

  # Mechanical (corpus-independent) normalization of the identifier,
  # computed on first use and then reused.
  def fingerprint
    return @fingerprint if @fingerprint

    @fingerprint = Normalizer.normalize_identifier(@identifier)
  end

  # Corpus-informed explanation, computed on first use and then reused.
  def explanation
    return @explanation if @explanation

    @explanation = @corpus.explain(@identifier)
  end

  # Corpus-informed normalization; asks the corpus afresh on every call.
  def normalize
    @corpus.normalize(@identifier)
  end
end
25
+ end