iriq 0.0.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +25 -0
- data/CLAUDE.md +121 -0
- data/Gemfile.lock +8 -2
- data/Makefile +56 -0
- data/README.md +334 -39
- data/iriq.gemspec +4 -3
- data/lib/iriq/cli.rb +289 -100
- data/lib/iriq/cluster.rb +47 -0
- data/lib/iriq/clusterer.rb +29 -39
- data/lib/iriq/corpus.rb +322 -0
- data/lib/iriq/explanation.rb +6 -22
- data/lib/iriq/extractor.rb +125 -0
- data/lib/iriq/identifier.rb +11 -3
- data/lib/iriq/inflector.rb +145 -0
- data/lib/iriq/normalizer.rb +11 -8
- data/lib/iriq/observation.rb +25 -0
- data/lib/iriq/parser.rb +1 -1
- data/lib/iriq/path_shape.rb +27 -9
- data/lib/iriq/position_stats.rb +64 -0
- data/lib/iriq/segment_classifier.rb +31 -7
- data/lib/iriq/segment_hints.rb +32 -0
- data/lib/iriq/storage/json.rb +43 -0
- data/lib/iriq/storage/memory.rb +138 -0
- data/lib/iriq/storage/sqlite.rb +367 -0
- data/lib/iriq/storage.rb +35 -0
- data/lib/iriq/version.rb +1 -1
- data/lib/iriq.rb +11 -0
- metadata +29 -4
data/lib/iriq/corpus.rb
ADDED
|
@@ -0,0 +1,322 @@
|
|
|
1
|
+
require "json"
|
|
2
|
+
|
|
3
|
+
module Iriq
|
|
4
|
+
# Streaming-friendly observer over a (potentially unbounded) corpus of IRIs.
|
|
5
|
+
# Maintains rolling aggregates and per-(host, prefix) frequency stats so
|
|
6
|
+
# that classification can improve as more data flows in.
|
|
7
|
+
#
|
|
8
|
+
# The deterministic, single-IRI API (Iriq.normalize/explain) is unchanged —
|
|
9
|
+
# Corpus#normalize and Corpus#explain are the corpus-informed variants.
|
|
10
|
+
#
|
|
11
|
+
# State lives in a Storage backend (Memory by default; Json or Sqlite when
|
|
12
|
+
# opened against a file). The classification logic on top is identical
|
|
13
|
+
# regardless of where the counters live.
|
|
14
|
+
class Corpus
|
|
15
|
+
# Type-based: position is "mostly variable" (UUIDs/integers/etc.).
|
|
16
|
+
VARIABLE_DOMINANCE_THRESHOLD = 0.8
|
|
17
|
+
|
|
18
|
+
# Cardinality-based: position has mostly distinct literal values, so the
|
|
19
|
+
# literal "type" is misleading — it's really a variable slot. We trigger
|
|
20
|
+
# on either:
|
|
21
|
+
# - very high cardinality fraction (most observations are singletons), OR
|
|
22
|
+
# - moderate cardinality fraction AND high absolute distinct count
|
|
23
|
+
# The second branch catches realistic streams where popular outliers
|
|
24
|
+
# bring the frac down but the long tail is clearly variable.
|
|
25
|
+
LITERAL_UNIQUENESS_THRESHOLD = 0.8
|
|
26
|
+
LITERAL_UNIQUENESS_MODERATE_THRESHOLD = 0.5
|
|
27
|
+
MIN_CARDINALITY_FOR_INFERENCE = 20
|
|
28
|
+
|
|
29
|
+
# Don't apply corpus heuristics until we have at least this many
|
|
30
|
+
# observations at a position — too easy to be wrong with tiny samples.
|
|
31
|
+
MIN_OBSERVATIONS_FOR_INFERENCE = 5
|
|
32
|
+
|
|
33
|
+
# Value-fraction at or above which a literal is considered the stable
|
|
34
|
+
# occupant of its position.
|
|
35
|
+
STABLE_LITERAL_THRESHOLD = 0.5
|
|
36
|
+
|
|
37
|
+
# Within a high-cardinality literal position (mostly singletons), a
|
|
38
|
+
# specific value qualifies as a "popular outlier" — and gets preserved
|
|
39
|
+
# as :stable_literal instead of being lumped into :corpus_inferred_variable
|
|
40
|
+
# — when its count is at least POPULAR_MIN_COUNT and its frequency is at
|
|
41
|
+
# least POPULAR_BASELINE_MULTIPLE × the uniform baseline (1/cardinality).
|
|
42
|
+
POPULAR_MIN_COUNT = 5
|
|
43
|
+
POPULAR_BASELINE_MULTIPLE = 3
|
|
44
|
+
|
|
45
|
+
attr_reader :storage
|
|
46
|
+
|
|
47
|
+
def initialize(classifier: SegmentClassifier::DEFAULT,
|
|
48
|
+
max_values_per_position: PositionStats::DEFAULT_MAX_VALUES,
|
|
49
|
+
storage: nil)
|
|
50
|
+
@classifier = classifier
|
|
51
|
+
@storage = storage || Storage::Memory.new(
|
|
52
|
+
classifier: classifier,
|
|
53
|
+
max_values_per_position: max_values_per_position,
|
|
54
|
+
)
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
# Open a corpus against `path`. File extension picks the backend:
|
|
58
|
+
# `.db`/`.sqlite`/`.sqlite3` use SQLite (incremental writes); anything
|
|
59
|
+
# else uses JSON.
|
|
60
|
+
def self.open(path, classifier: SegmentClassifier::DEFAULT,
|
|
61
|
+
max_values_per_position: PositionStats::DEFAULT_MAX_VALUES)
|
|
62
|
+
storage = Storage.open(path,
|
|
63
|
+
classifier: classifier,
|
|
64
|
+
max_values_per_position: max_values_per_position)
|
|
65
|
+
new(classifier: classifier, storage: storage)
|
|
66
|
+
end
|
|
67
|
+
|
|
68
|
+
# Observe a single IRI. Returns an Observation.
|
|
69
|
+
def observe(input)
|
|
70
|
+
iri = coerce(input)
|
|
71
|
+
hinted_entries = SegmentHints.derive(iri.path_segments, @classifier)
|
|
72
|
+
raw_shape = PathShape.new(classifier: @classifier, hints: false).from_entries(hinted_entries)
|
|
73
|
+
hinted_shape = PathShape.new(classifier: @classifier, hints: true).from_entries(hinted_entries)
|
|
74
|
+
|
|
75
|
+
cluster = nil
|
|
76
|
+
@storage.transaction do |s|
|
|
77
|
+
s.increment_host(iri.host)
|
|
78
|
+
s.increment_path_length(iri.path_segments.size)
|
|
79
|
+
s.increment_raw_shape(raw_shape)
|
|
80
|
+
s.increment_fingerprint(hinted_shape)
|
|
81
|
+
|
|
82
|
+
prefix = ""
|
|
83
|
+
hinted_entries.each do |entry|
|
|
84
|
+
s.observe_position(iri.host, prefix, entry[:value], entry[:type])
|
|
85
|
+
prefix = "#{prefix}/#{placeholder(entry)}"
|
|
86
|
+
end
|
|
87
|
+
|
|
88
|
+
key, host, scheme, shape = Cluster.key_for(iri, classifier: @classifier, shape: hinted_shape)
|
|
89
|
+
cluster = s.add_to_cluster(key, host, scheme, shape, iri)
|
|
90
|
+
end
|
|
91
|
+
|
|
92
|
+
Observation.new(corpus: self, identifier: iri, cluster: cluster)
|
|
93
|
+
end
|
|
94
|
+
|
|
95
|
+
# Corpus-informed normalization. Falls back to mechanical normalization
|
|
96
|
+
# when the corpus has no signal for a position.
|
|
97
|
+
def normalize(input)
|
|
98
|
+
iri = coerce(input)
|
|
99
|
+
return Normalizer.normalize_identifier(iri) if iri.urn? || iri.path_segments.empty?
|
|
100
|
+
|
|
101
|
+
tokens = annotate_segments(iri).map { |entry| corpus_token(entry) }
|
|
102
|
+
out = +""
|
|
103
|
+
out << "#{iri.scheme}://" if iri.scheme
|
|
104
|
+
out << iri.host if iri.host
|
|
105
|
+
out << ":#{iri.port}" if iri.port
|
|
106
|
+
out << "/" << tokens.join("/")
|
|
107
|
+
out
|
|
108
|
+
end
|
|
109
|
+
|
|
110
|
+
# Per-segment explanation with corpus-informed `classification`.
|
|
111
|
+
# Returns an array of entries shaped like the Explanation rows plus
|
|
112
|
+
# `classification:` ∈ :stable_literal, :variable_identifier,
|
|
113
|
+
# :rare_literal, :ambiguous, :corpus_inferred_variable.
|
|
114
|
+
def explain(input)
|
|
115
|
+
iri = coerce(input)
|
|
116
|
+
annotate_segments(iri).map do |entry|
|
|
117
|
+
entry.reject { |k, _| k == :prefix }
|
|
118
|
+
end
|
|
119
|
+
end
|
|
120
|
+
|
|
121
|
+
def host_counts; @storage.host_counts; end
|
|
122
|
+
def path_length_counts; @storage.path_length_counts; end
|
|
123
|
+
def raw_shape_counts; @storage.raw_shape_counts; end
|
|
124
|
+
def fingerprint_counts; @storage.fingerprint_counts; end
|
|
125
|
+
|
|
126
|
+
# Iterates (host, prefix) → PositionStats over all observed positions.
|
|
127
|
+
# Used by inspection tooling; not part of the hot path.
|
|
128
|
+
def each_position_stats(&block)
|
|
129
|
+
@storage.each_position_stats(&block)
|
|
130
|
+
end
|
|
131
|
+
|
|
132
|
+
def clusters
|
|
133
|
+
@storage.clusters
|
|
134
|
+
end
|
|
135
|
+
|
|
136
|
+
def size
|
|
137
|
+
@storage.cluster_size
|
|
138
|
+
end
|
|
139
|
+
|
|
140
|
+
# Stats for a given (host, prefix_shape) — useful for tests and
|
|
141
|
+
# debugging. Returns nil if nothing has been observed there.
|
|
142
|
+
def stats_for(host, prefix)
|
|
143
|
+
@storage.position_stats(host, prefix)
|
|
144
|
+
end
|
|
145
|
+
|
|
146
|
+
# Persist the corpus.
#
#   save()            → flush the backend in place (JSON writes its file,
#                       SQLite is already on disk).
#   save(same_path)   → same as save() — idempotent for the backend's path.
#   save(other_path)  → export to other_path as JSON, regardless of the
#                       live backend.
#
# "Same path" is decided on expanded paths, so "corpus.db", "./corpus.db",
# an absolute spelling, or a Pathname all count as the backend's own file.
# Comparing the raw values would treat those as a different destination and
# rename a JSON export over the live backend file.
#
# @param path [String, Pathname, nil] destination; nil means "the backend".
def save(path = nil)
  backend_path = @storage.path if @storage.respond_to?(:path)

  targets_backend =
    path.nil? ||
    (!backend_path.nil? &&
      File.expand_path(path.to_s) == File.expand_path(backend_path.to_s))

  if targets_backend
    @storage.save
  else
    write_json_dump(path)
  end
end
|
|
161
|
+
|
|
162
|
+
def close
|
|
163
|
+
@storage.close
|
|
164
|
+
end
|
|
165
|
+
|
|
166
|
+
# Wrap many observations in a single backend transaction. For SQLite this
|
|
167
|
+
# turns thousands of fsyncs into one; for in-memory backends it's a
|
|
168
|
+
# no-op. Use when ingesting a batch.
|
|
169
|
+
def batch(&block)
|
|
170
|
+
@storage.batch(&block)
|
|
171
|
+
end
|
|
172
|
+
|
|
173
|
+
private
|
|
174
|
+
|
|
175
|
+
def coerce(input)
|
|
176
|
+
input.is_a?(Identifier) ? input : Parser.parse(input)
|
|
177
|
+
end
|
|
178
|
+
|
|
179
|
+
def annotate_segments(iri)
|
|
180
|
+
hinted = SegmentHints.derive(iri.path_segments, @classifier)
|
|
181
|
+
prefix = ""
|
|
182
|
+
hinted.map do |entry|
|
|
183
|
+
stats = @storage.position_stats(iri.host, prefix)
|
|
184
|
+
out = entry.merge(
|
|
185
|
+
prefix: prefix,
|
|
186
|
+
classification: classify(entry, stats),
|
|
187
|
+
)
|
|
188
|
+
prefix = "#{prefix}/#{placeholder(entry)}"
|
|
189
|
+
out
|
|
190
|
+
end
|
|
191
|
+
end
|
|
192
|
+
|
|
193
|
+
def placeholder(entry)
|
|
194
|
+
return entry[:value] unless entry[:variable]
|
|
195
|
+
|
|
196
|
+
"{#{entry[:hint] || entry[:type]}}"
|
|
197
|
+
end
|
|
198
|
+
|
|
199
|
+
def classify(entry, stats)
|
|
200
|
+
variable = entry[:variable]
|
|
201
|
+
|
|
202
|
+
return variable ? :variable_identifier : :ambiguous if stats.nil? || stats.total.zero?
|
|
203
|
+
return :variable_identifier if variable
|
|
204
|
+
|
|
205
|
+
value = entry[:value]
|
|
206
|
+
total = stats.total
|
|
207
|
+
variable_frac = stats.variable_fraction(@classifier)
|
|
208
|
+
cardinality_frac = stats.cardinality.to_f / total
|
|
209
|
+
enough_data = total >= MIN_OBSERVATIONS_FOR_INFERENCE
|
|
210
|
+
value_frac = stats.value_fraction(value)
|
|
211
|
+
|
|
212
|
+
if enough_data && variable_frac >= VARIABLE_DOMINANCE_THRESHOLD
|
|
213
|
+
# Position is dominated by variable types (UUIDs, integers, etc.).
|
|
214
|
+
# A literal here is a special-case outlier (e.g. /users/me).
|
|
215
|
+
stats.value_counts.key?(value) ? :rare_literal : :ambiguous
|
|
216
|
+
elsif value_frac >= STABLE_LITERAL_THRESHOLD
|
|
217
|
+
# This specific value dominates — preserve it regardless of how
|
|
218
|
+
# diverse the rest of the position is.
|
|
219
|
+
:stable_literal
|
|
220
|
+
elsif enough_data && high_cardinality_literal_position?(stats, cardinality_frac)
|
|
221
|
+
# High-cardinality literal position — usually a variable slot, but
|
|
222
|
+
# recognize values that dramatically exceed the uniform baseline as
|
|
223
|
+
# "popular outliers" (e.g. /workspaces/mainspace surviving in a slot
|
|
224
|
+
# full of one-shot user-created workspace names).
|
|
225
|
+
popular_outlier?(stats, value) ? :stable_literal : :corpus_inferred_variable
|
|
226
|
+
elsif stats.cardinality == 1
|
|
227
|
+
:stable_literal
|
|
228
|
+
elsif stats.value_counts.key?(value)
|
|
229
|
+
:rare_literal
|
|
230
|
+
else
|
|
231
|
+
:ambiguous
|
|
232
|
+
end
|
|
233
|
+
end
|
|
234
|
+
|
|
235
|
+
def high_cardinality_literal_position?(stats, cardinality_frac)
|
|
236
|
+
return true if cardinality_frac >= LITERAL_UNIQUENESS_THRESHOLD
|
|
237
|
+
|
|
238
|
+
cardinality_frac >= LITERAL_UNIQUENESS_MODERATE_THRESHOLD &&
|
|
239
|
+
stats.cardinality >= MIN_CARDINALITY_FOR_INFERENCE
|
|
240
|
+
end
|
|
241
|
+
|
|
242
|
+
def popular_outlier?(stats, value)
|
|
243
|
+
count = stats.value_counts[value] || 0
|
|
244
|
+
return false if count < POPULAR_MIN_COUNT
|
|
245
|
+
|
|
246
|
+
baseline = 1.0 / stats.cardinality
|
|
247
|
+
stats.value_fraction(value) >= POPULAR_BASELINE_MULTIPLE * baseline
|
|
248
|
+
end
|
|
249
|
+
|
|
250
|
+
def corpus_token(entry)
|
|
251
|
+
case entry[:classification]
|
|
252
|
+
when :variable_identifier, :corpus_inferred_variable
|
|
253
|
+
placeholder_for_variable(entry)
|
|
254
|
+
else
|
|
255
|
+
entry[:value]
|
|
256
|
+
end
|
|
257
|
+
end
|
|
258
|
+
|
|
259
|
+
# Placeholder token for a segment that normalizes to a variable slot.
#
# When the classifier itself flagged the segment as variable, the token is
# the segment's hint (or its type when there is no hint). Otherwise the
# segment is a corpus-inferred variable: the classifier said literal, the
# corpus says otherwise, so the name is derived from the last literal
# segment of the prefix, singularized ("/workspaces" → "{workspace}").
#
# @param entry [Hash] annotated segment; reads :variable, :hint, :type and
#   (for corpus-inferred variables) :prefix.
# @return [String] "{name}", or "{value}" when the prefix has no literal.
def placeholder_for_variable(entry)
  return "{#{entry[:hint] || entry[:type]}}" if entry[:variable]

  last_literal = entry[:prefix]
                 .split("/")
                 .reject { |segment| segment.empty? || segment.start_with?("{") }
                 .last
  return "{value}" unless last_literal

  base = Inflector.singularize(last_literal)
  # A literal such as "s" singularizes to "", which would yield the
  # meaningless token "{}" — keep the literal itself in that case.
  base = last_literal if base.nil? || base.empty?
  "{#{base}}"
end
|
|
269
|
+
|
|
270
|
+
public
|
|
271
|
+
|
|
272
|
+
# --- Legacy dump/load (JSON shape) ------------------------------------
|
|
273
|
+
#
|
|
274
|
+
# The pre-Storage release exposed `Corpus#dump`, `Corpus#save(path)`, and
|
|
275
|
+
# `Corpus.load(path)` for JSON-backed persistence. Those names still work
|
|
276
|
+
# but are now thin wrappers around the appropriate Storage backend.
|
|
277
|
+
|
|
278
|
+
def dump
|
|
279
|
+
memory_view.to_dump
|
|
280
|
+
end
|
|
281
|
+
|
|
282
|
+
def self.from_dump(h, classifier: SegmentClassifier::DEFAULT)
|
|
283
|
+
max_values = h.fetch("max_values_per_position", PositionStats::DEFAULT_MAX_VALUES)
|
|
284
|
+
storage = Storage::Memory.new(classifier: classifier, max_values_per_position: max_values)
|
|
285
|
+
storage.load_dump!(h)
|
|
286
|
+
new(classifier: classifier, storage: storage)
|
|
287
|
+
end
|
|
288
|
+
|
|
289
|
+
def self.load(path, classifier: SegmentClassifier::DEFAULT)
|
|
290
|
+
open(path, classifier: classifier)
|
|
291
|
+
end
|
|
292
|
+
|
|
293
|
+
private
|
|
294
|
+
|
|
295
|
+
# Export the current state to `path` as JSON.
#
# The dump is written to "<path>.tmp" and then renamed over `path`, so
# `path` is only replaced once the dump has been written in full. If the
# write or the rename raises, the temp file is removed instead of being
# left behind next to the target.
#
# @param path [String, Pathname] destination file.
# @raise [SystemCallError] when the temp file cannot be written or renamed.
def write_json_dump(path)
  tmp = "#{path}.tmp"
  File.write(tmp, JSON.generate(memory_view.to_dump))
  File.rename(tmp, path)
ensure
  # After a successful rename the temp file no longer exists; this only
  # fires on the failure paths.
  File.delete(tmp) if tmp && File.exist?(tmp)
end
|
|
300
|
+
|
|
301
|
+
# Materialize a Memory snapshot of the current state — used by dump for
|
|
302
|
+
# backends that don't natively know how to emit the JSON shape.
|
|
303
|
+
def memory_view
|
|
304
|
+
return @storage if @storage.respond_to?(:to_dump)
|
|
305
|
+
|
|
306
|
+
mem = Storage::Memory.new(
|
|
307
|
+
classifier: @classifier,
|
|
308
|
+
max_values_per_position: @storage.max_values_per_position,
|
|
309
|
+
)
|
|
310
|
+
mem.instance_variable_set(:@host_counts, Hash.new(0).merge(@storage.host_counts))
|
|
311
|
+
mem.instance_variable_set(:@path_length_counts, Hash.new(0).merge(@storage.path_length_counts))
|
|
312
|
+
mem.instance_variable_set(:@raw_shape_counts, Hash.new(0).merge(@storage.raw_shape_counts))
|
|
313
|
+
mem.instance_variable_set(:@fingerprint_counts, Hash.new(0).merge(@storage.fingerprint_counts))
|
|
314
|
+
ps = {}
|
|
315
|
+
@storage.each_position_stats { |key, stats| ps[key] = stats }
|
|
316
|
+
mem.instance_variable_set(:@position_stats, ps)
|
|
317
|
+
clusters_h = @storage.clusters.each_with_object({}) { |c, h| h[c.key] = c }
|
|
318
|
+
mem.instance_variable_set(:@clusters, clusters_h)
|
|
319
|
+
mem
|
|
320
|
+
end
|
|
321
|
+
end
|
|
322
|
+
end
|
data/lib/iriq/explanation.rb
CHANGED
|
@@ -3,43 +3,27 @@ module Iriq
|
|
|
3
3
|
#
|
|
4
4
|
# Explanation.explain("https://foo.com/users/123")
|
|
5
5
|
# # => [
|
|
6
|
-
# # { value: "users", type: :literal, variable: false },
|
|
7
|
-
# # { value: "123", type: :integer_id, variable: true },
|
|
6
|
+
# # { value: "users", type: :literal, variable: false, hint: nil },
|
|
7
|
+
# # { value: "123", type: :integer_id, variable: true, hint: "user_id" },
|
|
8
8
|
# # ]
|
|
9
9
|
module Explanation
|
|
10
10
|
module_function
|
|
11
11
|
|
|
12
|
-
def explain(input, classifier: SegmentClassifier
|
|
12
|
+
def explain(input, classifier: SegmentClassifier::DEFAULT)
|
|
13
13
|
iri = input.is_a?(Identifier) ? input : Parser.parse(input)
|
|
14
14
|
|
|
15
15
|
if iri.urn?
|
|
16
16
|
explain_urn(iri, classifier)
|
|
17
17
|
else
|
|
18
|
-
iri.path_segments
|
|
18
|
+
SegmentHints.derive(iri.path_segments, classifier)
|
|
19
19
|
end
|
|
20
20
|
end
|
|
21
21
|
|
|
22
|
-
def segment_entry(segment, classifier)
|
|
23
|
-
type = classifier.classify(segment)
|
|
24
|
-
{
|
|
25
|
-
value: segment,
|
|
26
|
-
type: type,
|
|
27
|
-
variable: classifier.variable?(type),
|
|
28
|
-
}
|
|
29
|
-
end
|
|
30
|
-
|
|
31
22
|
def explain_urn(iri, classifier)
|
|
32
23
|
return [] unless iri.nss
|
|
33
24
|
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
[
|
|
37
|
-
{ value: ns, type: :literal, variable: false },
|
|
38
|
-
segment_entry(value, classifier),
|
|
39
|
-
]
|
|
40
|
-
else
|
|
41
|
-
[segment_entry(iri.nss, classifier)]
|
|
42
|
-
end
|
|
25
|
+
parts = iri.nss.include?(":") ? iri.nss.split(":", 2) : [iri.nss]
|
|
26
|
+
SegmentHints.derive(parts, classifier)
|
|
43
27
|
end
|
|
44
28
|
end
|
|
45
29
|
end
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
module Iriq
|
|
2
|
+
# Pulls IRIs out of free text. Absolute URLs (explicit scheme) and URNs are
|
|
3
|
+
# always extracted; scheme-less hosts like "foo.com/x" are extracted only when
|
|
4
|
+
# `scheme_less` is on (the default), the TLD is allow-listed, and a /path follows.
|
|
5
|
+
#
|
|
6
|
+
# Iriq::Extractor.new.extract("Visit https://foo.com today.")
|
|
7
|
+
# # => [#<Iriq::Identifier https://foo.com>]
|
|
8
|
+
#
|
|
9
|
+
# Design draws on twitter-text and GFM autolink rules: scheme anchoring,
|
|
10
|
+
# iterative trailing-punct trim, balanced-paren preservation.
|
|
11
|
+
class Extractor
|
|
12
|
+
SCHEMES = %w[https http ftp wss ws].freeze
|
|
13
|
+
|
|
14
|
+
# Conservative TLD allow-list for scheme-less extraction. Limited to a
|
|
15
|
+
# small set of very common TLDs to keep false-positive rate low. A
|
|
16
|
+
# scheme-less candidate ALSO requires a `/path` to count, so plain
|
|
17
|
+
# `foo.com` in prose still won't match — only `foo.com/something`.
|
|
18
|
+
SCHEMELESS_TLDS = %w[com org net io ai dev co app gov edu].freeze
|
|
19
|
+
|
|
20
|
+
# Boundary chars — a URL ends at any of these (whitespace, angle
|
|
21
|
+
# brackets, quotes, backtick).
|
|
22
|
+
BOUNDARY = %r{[\s<>"'`]}.freeze
|
|
23
|
+
|
|
24
|
+
# Non-ASCII Unicode brackets and quotation marks that almost always
|
|
25
|
+
# terminate a URL in source text (e.g. `「URL」`). ASCII brackets are NOT
|
|
26
|
+
# listed here — those stay inside the URL match so the balanced-paren
|
|
27
|
+
# trim step can handle them (Wikipedia URLs like /Foo_(bar) survive).
|
|
28
|
+
NON_ASCII_BOUNDARY = (
|
|
29
|
+
"」』)】〉》〕〗〙〛⦆}]>" + # CJK closing brackets
|
|
30
|
+
"「『(【〈《〔〖〘〚⦅{[<" + # CJK opening brackets
|
|
31
|
+
"“”‘’„‟‚«»‹›" # Unicode quotation marks
|
|
32
|
+
).chars.uniq.join.freeze
|
|
33
|
+
|
|
34
|
+
URL_CHAR_CLASS = %{[^\\s<>"'`,#{NON_ASCII_BOUNDARY}]+}.freeze
|
|
35
|
+
|
|
36
|
+
CANDIDATE_RE = %r{
|
|
37
|
+
(?<![\w/]) # not mid-word, not mid-path
|
|
38
|
+
(?:
|
|
39
|
+
(?i:#{SCHEMES.join("|")})://#{URL_CHAR_CLASS} # absolute URL
|
|
40
|
+
|
|
|
41
|
+
urn:[a-zA-Z0-9][a-zA-Z0-9\-]{0,30}:#{URL_CHAR_CLASS} # urn:NID:NSS
|
|
42
|
+
)
|
|
43
|
+
}xu.freeze
|
|
44
|
+
|
|
45
|
+
# Scheme-less alternative — same chars allowed as the absolute URL but
|
|
46
|
+
# requires a host with an allow-listed TLD AND a `/path` to keep prose
|
|
47
|
+
# noise low. The host part allows ASCII labels separated by dots; no
|
|
48
|
+
# Unicode hosts (those are too easily confused with prose).
|
|
49
|
+
SCHEMELESS_ALT = %{(?:[a-zA-Z0-9](?:[a-zA-Z0-9\\-]{0,61}[a-zA-Z0-9])?\\.)+(?i:#{SCHEMELESS_TLDS.join("|")})/#{URL_CHAR_CLASS}}.freeze
|
|
50
|
+
|
|
51
|
+
# Single-scan combined pattern used when scheme_less is on. One regex
|
|
52
|
+
# over the text is meaningfully cheaper than two.
|
|
53
|
+
COMBINED_RE = %r{
|
|
54
|
+
(?<![\w/.@])
|
|
55
|
+
(?:
|
|
56
|
+
(?i:#{SCHEMES.join("|")})://#{URL_CHAR_CLASS}
|
|
57
|
+
|
|
|
58
|
+
urn:[a-zA-Z0-9][a-zA-Z0-9\-]{0,30}:#{URL_CHAR_CLASS}
|
|
59
|
+
|
|
|
60
|
+
#{SCHEMELESS_ALT}
|
|
61
|
+
)
|
|
62
|
+
}xu.freeze
|
|
63
|
+
|
|
64
|
+
# Punctuation that's almost always sentence punctuation rather than part
|
|
65
|
+
# of a URL when it appears at the trailing edge.
|
|
66
|
+
TRAILING_PUNCT_RE = /[.,;:!?'"‘’“”]+\z/u.freeze
|
|
67
|
+
|
|
68
|
+
# Unmatched closing brackets that should be trimmed.
|
|
69
|
+
BRACKET_PAIRS = { ")" => "(", "]" => "[", "}" => "{" }.freeze
|
|
70
|
+
|
|
71
|
+
def initialize(scheme_less: true)
|
|
72
|
+
@scheme_less = scheme_less
|
|
73
|
+
end
|
|
74
|
+
|
|
75
|
+
def extract(text)
|
|
76
|
+
return [] if text.nil? || text.empty?
|
|
77
|
+
|
|
78
|
+
candidates = scan_candidates(text)
|
|
79
|
+
candidates.filter_map do |candidate|
|
|
80
|
+
trimmed = trim(candidate)
|
|
81
|
+
next nil if trimmed.empty?
|
|
82
|
+
|
|
83
|
+
begin
|
|
84
|
+
Parser.parse(trimmed)
|
|
85
|
+
rescue ParseError
|
|
86
|
+
nil
|
|
87
|
+
end
|
|
88
|
+
end
|
|
89
|
+
end
|
|
90
|
+
|
|
91
|
+
# Same as extract but returns only canonical strings, deduplicated,
|
|
92
|
+
# preserving first-seen order.
|
|
93
|
+
def extract_strings(text)
|
|
94
|
+
seen = {}
|
|
95
|
+
extract(text).each { |iri| seen[iri.canonical] ||= true }
|
|
96
|
+
seen.keys
|
|
97
|
+
end
|
|
98
|
+
|
|
99
|
+
private
|
|
100
|
+
|
|
101
|
+
# One regex scan over the text — combined pattern when scheme-less is
|
|
102
|
+
# on, scheme-anchored only otherwise.
|
|
103
|
+
def scan_candidates(text)
|
|
104
|
+
pattern = @scheme_less ? COMBINED_RE : CANDIDATE_RE
|
|
105
|
+
text.scan(pattern)
|
|
106
|
+
end
|
|
107
|
+
|
|
108
|
+
# Iteratively strip sentence punctuation and unmatched closing brackets
|
|
109
|
+
# until the candidate stabilizes.
|
|
110
|
+
def trim(candidate)
|
|
111
|
+
s = candidate.dup
|
|
112
|
+
loop do
|
|
113
|
+
before = s
|
|
114
|
+
s = s.sub(TRAILING_PUNCT_RE, "")
|
|
115
|
+
BRACKET_PAIRS.each do |close, open|
|
|
116
|
+
while s.end_with?(close) && s.count(close) > s.count(open)
|
|
117
|
+
s = s[0...-1]
|
|
118
|
+
end
|
|
119
|
+
end
|
|
120
|
+
break if s == before
|
|
121
|
+
end
|
|
122
|
+
s
|
|
123
|
+
end
|
|
124
|
+
end
|
|
125
|
+
end
|
data/lib/iriq/identifier.rb
CHANGED
|
@@ -43,9 +43,17 @@ module Iriq
|
|
|
43
43
|
out << "#{scheme}://" if scheme
|
|
44
44
|
out << host if host
|
|
45
45
|
out << ":#{port}" if port
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
46
|
+
has_query = query && !query.empty?
|
|
47
|
+
has_fragment = fragment && !fragment.empty?
|
|
48
|
+
if path_segments.any?
|
|
49
|
+
out << "/" + path_segments.join("/")
|
|
50
|
+
elsif has_query || has_fragment
|
|
51
|
+
# RFC 3986: an authority with query/fragment but no path needs the
|
|
52
|
+
# implied "/" to be a valid URI.
|
|
53
|
+
out << "/"
|
|
54
|
+
end
|
|
55
|
+
out << "?#{query}" if has_query
|
|
56
|
+
out << "##{fragment}" if has_fragment
|
|
49
57
|
out
|
|
50
58
|
end
|
|
51
59
|
end
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
require "set"
|
|
2
|
+
|
|
3
|
+
module Iriq
|
|
4
|
+
# Singularization with a swappable adapter.
|
|
5
|
+
#
|
|
6
|
+
# By default uses ActiveSupport's inflector if it can be required, otherwise
|
|
7
|
+
# falls back to BuiltinAdapter. Override globally:
|
|
8
|
+
#
|
|
9
|
+
# Iriq::Inflector.adapter = MyAdapter # must respond to .singularize(String)
|
|
10
|
+
#
|
|
11
|
+
# And reset to default with `Iriq::Inflector.reset_adapter!`.
|
|
12
|
+
module Inflector
|
|
13
|
+
# Vocabulary is bounded in practice; cache + cap matches the
|
|
14
|
+
# SegmentClassifier strategy.
|
|
15
|
+
CACHE_MAX = 10_000
|
|
16
|
+
|
|
17
|
+
class << self
|
|
18
|
+
def singularize(word)
|
|
19
|
+
cache = (@cache ||= {})
|
|
20
|
+
cached = cache[word]
|
|
21
|
+
return cached if cached
|
|
22
|
+
|
|
23
|
+
cache.clear if cache.size >= CACHE_MAX
|
|
24
|
+
cache[word] = adapter.singularize(word)
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
def adapter
|
|
28
|
+
@adapter ||= default_adapter
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
def adapter=(value)
|
|
32
|
+
@adapter = value
|
|
33
|
+
@cache = {} # different adapter could singularize differently
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
def reset_adapter!
|
|
37
|
+
@adapter = nil
|
|
38
|
+
@cache = {}
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
def default_adapter
|
|
42
|
+
require "active_support/inflector"
|
|
43
|
+
ActiveSupportAdapter
|
|
44
|
+
rescue LoadError
|
|
45
|
+
BuiltinAdapter
|
|
46
|
+
end
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
module ActiveSupportAdapter
|
|
50
|
+
def self.singularize(word)
|
|
51
|
+
::ActiveSupport::Inflector.singularize(word.to_s)
|
|
52
|
+
end
|
|
53
|
+
end
|
|
54
|
+
|
|
55
|
+
# Rule-based English singularizer. Rules are ordered most-specific-first
# and adapted from ActiveSupport's default inflections.
#
# Lookup order in .singularize: uncountable words are returned unchanged,
# then the irregular table is consulted (preserving the input's casing),
# then the first matching entry of RULES is applied.
module BuiltinAdapter
  IRREGULARS = {
    "people" => "person",
    "children" => "child",
    "men" => "man",
    "women" => "woman",
    "mice" => "mouse",
    "geese" => "goose",
    "oxen" => "ox",
    "feet" => "foot",
    "teeth" => "tooth",
    "lives" => "life",
    "wives" => "wife",
    "moves" => "move",
    "zombies" => "zombie",
    # latin/greek plurals that don't fit a clean suffix rule
    "indices" => "index",
    "vertices" => "vertex",
    # -f/-fe words where the stem doesn't end in l/r/i
    "leaves" => "leaf",
    "calves" => "calf",
    "halves" => "half",
    "loaves" => "loaf",
    "hooves" => "hoof",
  }.freeze

  UNCOUNTABLE = Set.new(%w[
    news fish sheep deer series species equipment information
    money rice jeans police data media
  ]).freeze

  # [pattern, replacement] — first match wins.
  RULES = [
    # Must precede the "-bases → -basis" rule, which would otherwise turn
    # "databases" into "databasis".
    [/(database)s$/i, '\1'],
    [/(quiz)zes$/i, '\1'],
    [/(matri|appendi)ces$/i, '\1x'],
    [/(ox)en$/i, '\1'],
    [/(alias|status)(es)?$/i, '\1'],
    [/(octop|vir)(us|i)$/i, '\1us'],
    # Anchored: only the word "axes"/"axis" itself, so "taxes" and "faxes"
    # fall through to the generic -xes rule instead of becoming "taxis".
    [/\A(a)x[ie]s$/i, '\1xis'],
    # Accepts the already-singular -is form so "crisis" is not cut down to
    # "crisi" by the final bare-s rule.
    [/(cris|test)(is|es)$/i, '\1is'],
    [/(shoe)s$/i, '\1'],
    [/(bus)(es)?$/i, '\1'],
    # Anchored: only "mice"/"lice", so singular words such as "slice" or
    # "splice" are left alone.
    [/\A([ml])ice$/i, '\1ouse'],
    [/(x|ch|ss|sh)es$/i, '\1'],
    [/(m)ovies$/i, '\1ovie'],
    [/(s)eries$/i, '\1eries'],
    [/([^aeiouy]|qu)ies$/i, '\1y'],
    [/([lr])ves$/i, '\1f'],
    [/(tive)s$/i, '\1'],
    [/(hive)s$/i, '\1'],
    [/([^f])ves$/i, '\1fe'],
    # Accepts both -ses and the already-singular -sis ("analysis",
    # "thesis", "basis" stay intact).
    [/((a)naly|(b)a|(d)iagno|(p)arenthe|(p)rogno|(s)ynop|(t)he)(sis|ses)$/i, '\1sis'],
    [/([ti])a$/i, '\1um'],
    [/(n)ews$/i, '\1ews'],
    [/(o)es$/i, '\1'],
    [/(ss)$/i, '\1'],
    [/s$/i, ''],
  ].freeze

  # Singular form of `word`.
  #
  # @param word [String, nil]
  # @return [String, nil] `word` itself when it is nil, empty, uncountable,
  #   or matches no rule; otherwise the singularized string.
  def self.singularize(word)
    return word if word.nil? || word.empty?

    lower = word.downcase
    return word if UNCOUNTABLE.include?(lower)

    irregular = IRREGULARS[lower]
    return preserve_case(word, irregular) if irregular

    RULES.each do |pattern, replacement|
      return word.sub(pattern, replacement) if pattern.match?(word)
    end

    word
  end

  # Re-applies the casing of `original` (ALL CAPS, Capitalized, or as-is)
  # to the lowercase replacement taken from IRREGULARS.
  def self.preserve_case(original, lowered)
    if original == original.upcase
      lowered.upcase
    elsif original[0] == original[0].upcase
      lowered.sub(/\A./, &:upcase)
    else
      lowered
    end
  end
end
|
|
144
|
+
end
|
|
145
|
+
end
|