iriq 0.0.1 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/Gemfile.lock +2 -2
- data/README.md +227 -33
- data/lib/iriq/cli.rb +288 -100
- data/lib/iriq/cluster.rb +23 -0
- data/lib/iriq/clusterer.rb +32 -17
- data/lib/iriq/corpus.rb +268 -0
- data/lib/iriq/explanation.rb +6 -22
- data/lib/iriq/extractor.rb +125 -0
- data/lib/iriq/identifier.rb +11 -3
- data/lib/iriq/inflector.rb +145 -0
- data/lib/iriq/normalizer.rb +11 -8
- data/lib/iriq/observation.rb +25 -0
- data/lib/iriq/path_shape.rb +27 -9
- data/lib/iriq/position_stats.rb +64 -0
- data/lib/iriq/segment_classifier.rb +31 -7
- data/lib/iriq/segment_hints.rb +32 -0
- data/lib/iriq/version.rb +1 -1
- data/lib/iriq.rb +10 -0
- data/script/benchmark.rb +81 -0
- data/script/memory.rb +121 -0
- metadata +9 -1
data/lib/iriq/corpus.rb
ADDED
|
@@ -0,0 +1,268 @@
|
|
|
1
|
+
require "json"
|
|
2
|
+
|
|
3
|
+
module Iriq
  # Incremental observer for a stream of IRIs of any length.
  #
  # Every observed IRI updates a handful of running tallies (hosts, path
  # lengths, raw and hinted path shapes) plus one PositionStats bucket per
  # (host, prefix) pair. Those buckets let #normalize and #explain refine
  # the purely mechanical per-segment classification with what the corpus
  # has seen so far.
  #
  # The stateless single-IRI entry points (Iriq.normalize / Iriq.explain)
  # are untouched by this class.
  class Corpus
    # Share of variable-typed observations (UUIDs, integers, ...) at or above
    # which a position counts as "mostly variable".
    VARIABLE_DOMINANCE_THRESHOLD = 0.8

    # A literal-typed position is re-read as a variable slot when its values
    # are mostly distinct. Two triggers:
    #   * distinct/total >= LITERAL_UNIQUENESS_THRESHOLD, or
    #   * distinct/total >= LITERAL_UNIQUENESS_MODERATE_THRESHOLD together with
    #     at least MIN_CARDINALITY_FOR_INFERENCE distinct values.
    # The second trigger covers streams where a few popular values pull the
    # ratio down while the long tail is still clearly variable.
    LITERAL_UNIQUENESS_THRESHOLD = 0.8
    LITERAL_UNIQUENESS_MODERATE_THRESHOLD = 0.5
    MIN_CARDINALITY_FOR_INFERENCE = 20

    # Minimum observations at a position before corpus heuristics kick in;
    # smaller samples are too easy to misread.
    MIN_OBSERVATIONS_FOR_INFERENCE = 5

    # Share of a position's observations a single literal needs in order to
    # be treated as that position's stable occupant.
    STABLE_LITERAL_THRESHOLD = 0.5

    # Inside a high-cardinality literal position, a value is kept as a
    # :stable_literal ("popular outlier") when it was seen at least
    # POPULAR_MIN_COUNT times and its share is at least
    # POPULAR_BASELINE_MULTIPLE times the uniform share (1 / cardinality).
    POPULAR_MIN_COUNT = 5
    POPULAR_BASELINE_MULTIPLE = 3

    attr_reader :host_counts, :path_length_counts, :raw_shape_counts,
                :fingerprint_counts, :position_stats

    # classifier              - segment classifier shared with the clusterer.
    # max_values_per_position - forwarded to each PositionStats as max_values.
    def initialize(classifier: SegmentClassifier::DEFAULT,
                   max_values_per_position: PositionStats::DEFAULT_MAX_VALUES)
      @classifier = classifier
      @max_values_per_position = max_values_per_position
      @host_counts, @path_length_counts, @raw_shape_counts, @fingerprint_counts =
        Array.new(4) { Hash.new(0) }
      @position_stats = {}
      @clusterer = Clusterer.new(classifier: classifier)
    end

    # Feeds one IRI (String or Identifier) into the corpus: updates the
    # tallies, adds it to the clusterer under its hinted shape, and returns
    # an Observation wrapping the result.
    def observe(input)
      identifier = coerce(input)
      entries = SegmentHints.derive(identifier.path_segments, @classifier)
      record_aggregates(identifier, entries)

      shape = PathShape.new(classifier: @classifier, hints: true).from_entries(entries)
      Observation.new(
        corpus: self,
        identifier: identifier,
        cluster: @clusterer.add(identifier, shape: shape),
      )
    end

    # Normalizes using corpus knowledge. URNs and path-less IRIs are handed
    # to Normalizer.normalize_identifier; otherwise the result is
    # scheme/host/port followed by one token per path segment.
    def normalize(input)
      identifier = coerce(input)
      if identifier.urn? || identifier.path_segments.empty?
        return Normalizer.normalize_identifier(identifier)
      end

      path = annotate_segments(identifier).map { |row| corpus_token(row) }.join("/")

      pieces = []
      pieces << "#{identifier.scheme}://" if identifier.scheme
      pieces << identifier.host if identifier.host
      pieces << ":#{identifier.port}" if identifier.port
      pieces << "/" << path
      pieces.join
    end

    # One row per path segment: the SegmentHints entry plus a
    # `classification:` of :stable_literal, :variable_identifier,
    # :rare_literal, :ambiguous or :corpus_inferred_variable. The internal
    # :prefix key is stripped before returning.
    def explain(input)
      annotate_segments(coerce(input)).map do |row|
        row.select { |key, _| key != :prefix }
      end
    end

    # Clusters accumulated by the underlying clusterer.
    def clusters
      @clusterer.clusters
    end

    # Size as reported by the underlying clusterer.
    def size
      @clusterer.size
    end

    # PositionStats recorded for (host, prefix), or nil when that position
    # has never been observed. Mainly for tests and debugging.
    def stats_for(host, prefix)
      @position_stats[[host, prefix]]
    end

    # Plain Hash snapshot of the corpus, built so it can be passed to
    # JSON.generate (see #save) and read back by Corpus.from_dump.
    def dump
      {
        "host_counts" => @host_counts,
        "path_length_counts" => @path_length_counts.transform_keys(&:to_s),
        "raw_shape_counts" => @raw_shape_counts,
        "fingerprint_counts" => @fingerprint_counts,
        "max_values_per_position" => @max_values_per_position,
        "position_stats" => @position_stats.map { |key, stats| [*key, stats.dump] },
        "clusterer" => @clusterer.dump,
      }
    end

    # Writes the JSON snapshot to "<path>.tmp", then renames it onto `path`.
    def save(path)
      staging = "#{path}.tmp"
      File.write(staging, JSON.generate(dump))
      File.rename(staging, path)
    end

    # Rebuilds a Corpus from a Hash shaped like the output of #dump (string
    # keys, as produced by JSON.parse).
    def self.from_dump(h, classifier: SegmentClassifier::DEFAULT)
      corpus = new(
        classifier: classifier,
        max_values_per_position: h.fetch("max_values_per_position", PositionStats::DEFAULT_MAX_VALUES),
      )

      restored = {
        :@host_counts => Hash.new(0).merge(h["host_counts"]),
        # JSON object keys are strings; path lengths are integers.
        :@path_length_counts => Hash.new(0).merge(h["path_length_counts"].transform_keys(&:to_i)),
        :@raw_shape_counts => Hash.new(0).merge(h["raw_shape_counts"]),
        :@fingerprint_counts => Hash.new(0).merge(h["fingerprint_counts"]),
        :@position_stats => h["position_stats"].map { |host, prefix, stats_dump|
          [[host, prefix], PositionStats.from_dump(stats_dump)]
        }.to_h,
        :@clusterer => Clusterer.from_dump(h["clusterer"], classifier: classifier),
      }
      restored.each { |ivar, value| corpus.instance_variable_set(ivar, value) }
      corpus
    end

    # Reads a JSON snapshot from `path` and rebuilds the Corpus.
    def self.load(path, classifier: SegmentClassifier::DEFAULT)
      from_dump(JSON.parse(File.read(path)), classifier: classifier)
    end

    private

    # Accepts an Identifier as-is; anything else goes through Parser.parse.
    def coerce(input)
      return input if input.is_a?(Identifier)

      Parser.parse(input)
    end

    # Updates host, path-length, raw-shape and fingerprint tallies, then the
    # per-position stats.
    def record_aggregates(iri, hinted_entries)
      host = iri.host
      @host_counts[host] += 1 if host
      @path_length_counts[iri.path_segments.size] += 1

      # Both shapes are computed before either tally is touched.
      raw_shape, fingerprint = [false, true].map do |with_hints|
        PathShape.new(classifier: @classifier, hints: with_hints).from_entries(hinted_entries)
      end
      @raw_shape_counts[raw_shape] += 1
      @fingerprint_counts[fingerprint] += 1

      record_position_stats(iri, hinted_entries)
    end

    # Records each segment under the (host, prefix) key in effect before the
    # segment, then extends the prefix with that segment's placeholder.
    def record_position_stats(iri, hinted_entries)
      hinted_entries.inject("") do |prefix, entry|
        bucket = (@position_stats[[iri.host, prefix]] ||=
          PositionStats.new(max_values: @max_values_per_position))
        bucket.observe(entry[:value], entry[:type])
        "#{prefix}/#{placeholder(entry)}"
      end
    end

    # Returns the SegmentHints entries for the IRI, each merged with the
    # :prefix it sits under and its corpus :classification.
    def annotate_segments(iri)
      rows = []
      SegmentHints.derive(iri.path_segments, @classifier).inject("") do |prefix, entry|
        position = @position_stats[[iri.host, prefix]]
        rows << entry.merge(prefix: prefix, classification: classify(entry, position))
        "#{prefix}/#{placeholder(entry)}"
      end
      rows
    end

    # Prefix component for a segment: "{hint-or-type}" when the classifier
    # marked it variable, otherwise the literal value itself.
    def placeholder(entry)
      entry[:variable] ? "{#{entry[:hint] || entry[:type]}}" : entry[:value]
    end

    # Decides the corpus classification for one segment given the stats of
    # its position (nil when the position was never observed).
    def classify(entry, stats)
      flagged_variable = entry[:variable]

      if stats.nil? || stats.total.zero?
        return flagged_variable ? :variable_identifier : :ambiguous
      end
      return :variable_identifier if flagged_variable

      literal = entry[:value]
      observations = stats.total
      variable_share = stats.variable_fraction(@classifier)
      distinct_share = stats.cardinality.to_f / observations
      sufficient = observations >= MIN_OBSERVATIONS_FOR_INFERENCE
      literal_share = stats.value_fraction(literal)

      # Variable types dominate the position, so a literal here is a special
      # case (e.g. /users/me).
      if sufficient && variable_share >= VARIABLE_DOMINANCE_THRESHOLD
        return stats.value_counts.key?(literal) ? :rare_literal : :ambiguous
      end

      # This value dominates on its own, however diverse the rest is.
      return :stable_literal if literal_share >= STABLE_LITERAL_THRESHOLD

      # Mostly-distinct literals: treat as a variable slot unless this value
      # is a popular outlier (e.g. /workspaces/mainspace among one-off names).
      if sufficient && high_cardinality_literal_position?(stats, distinct_share)
        return popular_outlier?(stats, literal) ? :stable_literal : :corpus_inferred_variable
      end

      return :stable_literal if stats.cardinality == 1

      stats.value_counts.key?(literal) ? :rare_literal : :ambiguous
    end

    # True when the distinct-value ratio alone is very high, or moderately
    # high with enough distinct values in absolute terms.
    def high_cardinality_literal_position?(stats, cardinality_frac)
      cardinality_frac >= LITERAL_UNIQUENESS_THRESHOLD ||
        (cardinality_frac >= LITERAL_UNIQUENESS_MODERATE_THRESHOLD &&
          stats.cardinality >= MIN_CARDINALITY_FOR_INFERENCE)
    end

    # True when `value` was seen often enough and well above the uniform
    # share for its position.
    def popular_outlier?(stats, value)
      seen = stats.value_counts[value] || 0
      return false if seen < POPULAR_MIN_COUNT

      uniform_share = 1.0 / stats.cardinality
      stats.value_fraction(value) >= POPULAR_BASELINE_MULTIPLE * uniform_share
    end

    # Output token for an annotated segment: a placeholder for the two
    # variable classifications, the raw value for everything else.
    def corpus_token(entry)
      if %i[variable_identifier corpus_inferred_variable].include?(entry[:classification])
        placeholder_for_variable(entry)
      else
        entry[:value]
      end
    end

    # Placeholder for a segment classified as variable. Classifier-flagged
    # segments use their hint or type. Corpus-inferred ones (classifier said
    # literal) are named after the singularized last literal segment of the
    # prefix, falling back to "{value}" when the prefix has none.
    def placeholder_for_variable(entry)
      return placeholder(entry) if entry[:variable]

      anchor = entry[:prefix].split("/").reverse_each.find do |segment|
        !segment.empty? && !segment.start_with?("{")
      end
      singular = Inflector.singularize(anchor) if anchor
      singular ? "{#{singular}}" : "{value}"
    end
  end
end
|
data/lib/iriq/explanation.rb
CHANGED
|
@@ -3,43 +3,27 @@ module Iriq
|
|
|
3
3
|
#
|
|
4
4
|
# Explanation.explain("https://foo.com/users/123")
|
|
5
5
|
# # => [
|
|
6
|
-
# # { value: "users", type: :literal, variable: false },
|
|
7
|
-
# # { value: "123", type: :integer_id, variable: true },
|
|
6
|
+
# # { value: "users", type: :literal, variable: false, hint: nil },
|
|
7
|
+
# # { value: "123", type: :integer_id, variable: true, hint: "user_id" },
|
|
8
8
|
# # ]
|
|
9
9
|
module Explanation
|
|
10
10
|
module_function
|
|
11
11
|
|
|
12
|
-
def explain(input, classifier: SegmentClassifier
|
|
12
|
+
def explain(input, classifier: SegmentClassifier::DEFAULT)
|
|
13
13
|
iri = input.is_a?(Identifier) ? input : Parser.parse(input)
|
|
14
14
|
|
|
15
15
|
if iri.urn?
|
|
16
16
|
explain_urn(iri, classifier)
|
|
17
17
|
else
|
|
18
|
-
iri.path_segments
|
|
18
|
+
SegmentHints.derive(iri.path_segments, classifier)
|
|
19
19
|
end
|
|
20
20
|
end
|
|
21
21
|
|
|
22
|
-
def segment_entry(segment, classifier)
|
|
23
|
-
type = classifier.classify(segment)
|
|
24
|
-
{
|
|
25
|
-
value: segment,
|
|
26
|
-
type: type,
|
|
27
|
-
variable: classifier.variable?(type),
|
|
28
|
-
}
|
|
29
|
-
end
|
|
30
|
-
|
|
31
22
|
def explain_urn(iri, classifier)
|
|
32
23
|
return [] unless iri.nss
|
|
33
24
|
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
[
|
|
37
|
-
{ value: ns, type: :literal, variable: false },
|
|
38
|
-
segment_entry(value, classifier),
|
|
39
|
-
]
|
|
40
|
-
else
|
|
41
|
-
[segment_entry(iri.nss, classifier)]
|
|
42
|
-
end
|
|
25
|
+
parts = iri.nss.include?(":") ? iri.nss.split(":", 2) : [iri.nss]
|
|
26
|
+
SegmentHints.derive(parts, classifier)
|
|
43
27
|
end
|
|
44
28
|
end
|
|
45
29
|
end
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
module Iriq
  # Pulls IRIs out of free text.
  #
  # Absolute URLs (scheme in SCHEMES) and `urn:NID:NSS` identifiers are always
  # candidates. With `scheme_less: true` (the default), bare hosts are also
  # picked up, but only when the host ends in one of SCHEMELESS_TLDS and is
  # followed by a `/path` — plain "foo.com" in prose is never extracted.
  #
  #   Iriq::Extractor.new.extract("Visit https://foo.com today.")
  #   # => [#<Iriq::Identifier https://foo.com>]
  #
  # Design draws on twitter-text and GFM autolink rules: scheme anchoring,
  # iterative trailing-punct trim, balanced-paren preservation.
  class Extractor
    SCHEMES = %w[https http ftp wss ws].freeze

    # Conservative TLD allow-list for scheme-less extraction, kept small to
    # limit false positives.
    SCHEMELESS_TLDS = %w[com org net io ai dev co app gov edu].freeze

    # Boundary chars — a URL ends at any of these (whitespace, angle
    # brackets, quotes, backtick).
    BOUNDARY = %r{[\s<>"'`]}.freeze

    # Non-ASCII brackets and quotation marks that terminate a URL in running
    # text (e.g. `「URL」`). Every character here MUST be non-ASCII:
    #   * the string is interpolated into a regex character class, where an
    #     ASCII `]` would close the class early and corrupt the pattern;
    #   * ASCII ) ] } have to stay inside the candidate so #trim can apply its
    #     balanced-bracket rule (Wikipedia URLs like /Foo_(bar) survive).
    # The fullwidth forms are spelled as \u escapes so they cannot be
    # confused with their ASCII look-alikes.
    NON_ASCII_BOUNDARY = (
      "」』\uFF09】〉》〕〗〙〛⦆\uFF5D\uFF3D\uFF1E" + # closing: CJK + fullwidth ) } ] >
      "「『\uFF08【〈《〔〖〘〚⦅\uFF5B\uFF3B\uFF1C" + # opening: CJK + fullwidth ( { [ <
      "“”‘’„‟‚«»‹›"                                   # Unicode quotation marks
    ).chars.uniq.join.freeze

    # One-or-more characters that may appear inside a candidate.
    URL_CHAR_CLASS = %{[^\\s<>"'`,#{NON_ASCII_BOUNDARY}]+}.freeze

    # Scheme-anchored pattern: absolute URLs and URNs only.
    CANDIDATE_RE = %r{
      (?<![\w/])                                             # not mid-word, not mid-path
      (?:
        (?i:#{SCHEMES.join("|")})://#{URL_CHAR_CLASS}        # absolute URL
        |
        urn:[a-zA-Z0-9][a-zA-Z0-9\-]{0,30}:#{URL_CHAR_CLASS} # urn:NID:NSS
      )
    }xu.freeze

    # Scheme-less alternative: ASCII host labels separated by dots, an
    # allow-listed TLD, then a mandatory `/path`. No Unicode hosts.
    SCHEMELESS_ALT = %{(?:[a-zA-Z0-9](?:[a-zA-Z0-9\\-]{0,61}[a-zA-Z0-9])?\\.)+(?i:#{SCHEMELESS_TLDS.join("|")})/#{URL_CHAR_CLASS}}.freeze

    # Combined pattern used when scheme_less is on, so the text is scanned
    # once. The look-behind additionally rejects a preceding "." or "@".
    COMBINED_RE = %r{
      (?<![\w/.@])
      (?:
        (?i:#{SCHEMES.join("|")})://#{URL_CHAR_CLASS}
        |
        urn:[a-zA-Z0-9][a-zA-Z0-9\-]{0,30}:#{URL_CHAR_CLASS}
        |
        #{SCHEMELESS_ALT}
      )
    }xu.freeze

    # Punctuation treated as sentence punctuation when it trails a candidate.
    TRAILING_PUNCT_RE = /[.,;:!?'"‘’“”]+\z/u.freeze

    # Closing bracket => matching opener, for trimming unmatched closers.
    BRACKET_PAIRS = { ")" => "(", "]" => "[", "}" => "{" }.freeze

    # scheme_less - when true, also extract allow-listed bare hosts that
    #               carry a path (see SCHEMELESS_ALT).
    def initialize(scheme_less: true)
      @scheme_less = scheme_less
    end

    # Returns the parsed form of every candidate found in `text`, in order of
    # appearance. Candidates that trim down to nothing, or that Parser.parse
    # rejects with ParseError, are dropped. nil or empty text yields [].
    def extract(text)
      return [] if text.nil? || text.empty?

      scan_candidates(text).filter_map do |candidate|
        trimmed = trim(candidate)
        next if trimmed.empty?

        begin
          Parser.parse(trimmed)
        rescue ParseError
          nil
        end
      end
    end

    # Same as #extract but returns the `canonical` strings only,
    # de-duplicated, in first-seen order.
    def extract_strings(text)
      extract(text).map(&:canonical).uniq
    end

    private

    # One regex scan over the text — combined pattern when scheme-less is
    # on, scheme-anchored only otherwise. The patterns have no capture
    # groups, so String#scan returns whole-match strings.
    def scan_candidates(text)
      text.scan(@scheme_less ? COMBINED_RE : CANDIDATE_RE)
    end

    # Repeatedly strips trailing sentence punctuation and unmatched closing
    # brackets until a pass changes nothing. A closer is "unmatched" when the
    # candidate contains more of it than of its opener.
    def trim(candidate)
      s = candidate.dup
      loop do
        before = s
        s = s.sub(TRAILING_PUNCT_RE, "")
        BRACKET_PAIRS.each do |close, open|
          s = s[0...-1] while s.end_with?(close) && s.count(close) > s.count(open)
        end
        break if s == before
      end
      s
    end
  end
end
|
data/lib/iriq/identifier.rb
CHANGED
|
@@ -43,9 +43,17 @@ module Iriq
|
|
|
43
43
|
out << "#{scheme}://" if scheme
|
|
44
44
|
out << host if host
|
|
45
45
|
out << ":#{port}" if port
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
46
|
+
has_query = query && !query.empty?
|
|
47
|
+
has_fragment = fragment && !fragment.empty?
|
|
48
|
+
if path_segments.any?
|
|
49
|
+
out << "/" + path_segments.join("/")
|
|
50
|
+
elsif has_query || has_fragment
|
|
51
|
+
# RFC 3986: an authority with query/fragment but no path needs the
|
|
52
|
+
# implied "/" to be a valid URI.
|
|
53
|
+
out << "/"
|
|
54
|
+
end
|
|
55
|
+
out << "?#{query}" if has_query
|
|
56
|
+
out << "##{fragment}" if has_fragment
|
|
49
57
|
out
|
|
50
58
|
end
|
|
51
59
|
end
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
require "set"
|
|
2
|
+
|
|
3
|
+
module Iriq
  # Singularization with a swappable adapter.
  #
  # The default adapter is ActiveSupportAdapter when "active_support/inflector"
  # can be required, BuiltinAdapter otherwise. Override globally:
  #
  #   Iriq::Inflector.adapter = MyAdapter   # must respond to .singularize(String)
  #
  # and restore the default with `Iriq::Inflector.reset_adapter!`.
  module Inflector
    # Upper bound on memoized words; the whole cache is dropped once reached.
    CACHE_MAX = 10_000

    class << self
      # Returns the singular form of `word` according to the current adapter.
      # Truthy results are memoized per word.
      def singularize(word)
        cache = (@cache ||= {})
        cached = cache[word]
        return cached if cached

        cache.clear if cache.size >= CACHE_MAX
        cache[word] = adapter.singularize(word)
      end

      # Current adapter, resolved lazily on first use.
      def adapter
        @adapter ||= default_adapter
      end

      # Installs a custom adapter and drops the cache, since a different
      # adapter could singularize differently.
      def adapter=(value)
        @adapter = value
        @cache = {}
      end

      # Forgets any custom adapter (the default is re-resolved on next use)
      # and drops the cache.
      def reset_adapter!
        @adapter = nil
        @cache = {}
      end

      # ActiveSupportAdapter if its library loads, BuiltinAdapter otherwise.
      def default_adapter
        require "active_support/inflector"
        ActiveSupportAdapter
      rescue LoadError
        BuiltinAdapter
      end
    end

    # Thin wrapper over ActiveSupport::Inflector.singularize.
    module ActiveSupportAdapter
      def self.singularize(word)
        ::ActiveSupport::Inflector.singularize(word.to_s)
      end
    end

    # Rule-based English singularizer. Rules are ordered most-specific-first
    # and adapted from ActiveSupport's default inflections.
    module BuiltinAdapter
      IRREGULARS = {
        "people" => "person",
        "children" => "child",
        "men" => "man",
        "women" => "woman",
        "mice" => "mouse",
        "geese" => "goose",
        "oxen" => "ox",
        "feet" => "foot",
        "teeth" => "tooth",
        "lives" => "life",
        "wives" => "wife",
        "moves" => "move",
        "zombies" => "zombie",
        # latin/greek plurals that don't fit a clean suffix rule
        "indices" => "index",
        "vertices" => "vertex",
        # -f/-fe words where the stem doesn't end in l/r/i
        "leaves" => "leaf",
        "calves" => "calf",
        "halves" => "half",
        "loaves" => "loaf",
        "hooves" => "hoof",
      }.freeze

      UNCOUNTABLE = Set.new(%w[
        news fish sheep deer series species equipment information
        money rice jeans police data media
      ]).freeze

      # [pattern, replacement] — first match wins.
      RULES = [
        # Must precede the "(b)a...ses" rule, which would otherwise turn
        # "databases" into "databasis".
        [/(database)s$/i, '\1'],
        [/(quiz)zes$/i, '\1'],
        [/(matri|appendi)ces$/i, '\1x'],
        [/(ox)en$/i, '\1'],
        [/(alias|status)(es)?$/i, '\1'],
        [/(octop|vir)(us|i)$/i, '\1us'],
        # Anchored to the whole word so "taxes"/"faxes" fall through to the
        # generic -xes rule instead of becoming "taxis"/"faxis".
        [/^(a)x[ie]s$/i, '\1xis'],
        # Accepts the singular too, so "crisis" is not cut to "crisi" by the
        # final /s$/ rule.
        [/(cris|test)(is|es)$/i, '\1is'],
        [/(shoe)s$/i, '\1'],
        [/(bus)(es)?$/i, '\1'],
        # Anchored: only the words "mice"/"lice", not "slice" or "malice".
        [/^([ml])ice$/i, '\1ouse'],
        [/(x|ch|ss|sh)es$/i, '\1'],
        [/(m)ovies$/i, '\1ovie'],
        [/(s)eries$/i, '\1eries'],
        [/([^aeiouy]|qu)ies$/i, '\1y'],
        [/([lr])ves$/i, '\1f'],
        [/(tive)s$/i, '\1'],
        [/(hive)s$/i, '\1'],
        [/([^f])ves$/i, '\1fe'],
        # Accepts "-sis" as well as "-ses" so already-singular words
        # ("analysis", "thesis") come back unchanged.
        [/((a)naly|(b)a|(d)iagno|(p)arenthe|(p)rogno|(s)ynop|(t)he)(sis|ses)$/i, '\1sis'],
        [/([ti])a$/i, '\1um'],
        [/(n)ews$/i, '\1ews'],
        [/(o)es$/i, '\1'],
        [/(ss)$/i, '\1'],
        [/s$/i, ''],
      ].freeze

      # Returns the singular of `word`. nil and "" are returned as-is, as are
      # UNCOUNTABLE words. IRREGULARS are looked up case-insensitively and
      # re-cased like the input. Otherwise the first matching RULES entry is
      # applied; words matching no rule are returned unchanged.
      def self.singularize(word)
        return word if word.nil? || word.empty?

        lower = word.downcase
        return word if UNCOUNTABLE.include?(lower)

        irregular = IRREGULARS[lower]
        return preserve_case(word, irregular) if irregular

        pattern, replacement = RULES.find { |rule_pattern, _| word.match?(rule_pattern) }
        pattern ? word.sub(pattern, replacement) : word
      end

      # Re-applies the casing of `original` to the lower-case `lowered`:
      # all-caps stays all-caps, a leading capital is kept, anything else is
      # returned lower-case.
      def self.preserve_case(original, lowered)
        if original == original.upcase
          lowered.upcase
        elsif original[0] == original[0].upcase
          lowered.sub(/\A./, &:upcase)
        else
          lowered
        end
      end
    end
  end
end
|
data/lib/iriq/normalizer.rb
CHANGED
|
@@ -2,24 +2,27 @@ module Iriq
|
|
|
2
2
|
# Produces a canonical, shape-aware string for an identifier.
|
|
3
3
|
#
|
|
4
4
|
# Normalizer.normalize("https://Foo.com:443/users/123")
|
|
5
|
-
# # => "https://foo.com/users/{
|
|
5
|
+
# # => "https://foo.com/users/{user_id}"
|
|
6
6
|
#
|
|
7
7
|
# The form is intended for grouping/diffing — it is not a round-trippable URL.
|
|
8
8
|
module Normalizer
|
|
9
9
|
module_function
|
|
10
10
|
|
|
11
|
-
def normalize(input, classifier: SegmentClassifier
|
|
11
|
+
def normalize(input, classifier: SegmentClassifier::DEFAULT, hints: true)
|
|
12
12
|
iri = input.is_a?(Identifier) ? input : Parser.parse(input)
|
|
13
|
-
normalize_identifier(iri, classifier: classifier)
|
|
13
|
+
normalize_identifier(iri, classifier: classifier, hints: hints)
|
|
14
14
|
end
|
|
15
15
|
|
|
16
|
-
def normalize_identifier(iri, classifier: SegmentClassifier
|
|
16
|
+
def normalize_identifier(iri, classifier: SegmentClassifier::DEFAULT, hints: true)
|
|
17
17
|
if iri.urn?
|
|
18
|
-
# urn:isbn:0451450523 -> urn:isbn:{integer_id}
|
|
19
18
|
if iri.scheme == "urn" && iri.nss && iri.nss.include?(":")
|
|
20
19
|
ns, value = iri.nss.split(":", 2)
|
|
21
|
-
|
|
22
|
-
shaped =
|
|
20
|
+
entry = SegmentHints.derive([ns, value], classifier).last
|
|
21
|
+
shaped = if entry[:variable]
|
|
22
|
+
"{#{(hints && entry[:hint]) || entry[:type]}}"
|
|
23
|
+
else
|
|
24
|
+
entry[:value]
|
|
25
|
+
end
|
|
23
26
|
"urn:#{ns}:#{shaped}"
|
|
24
27
|
else
|
|
25
28
|
iri.canonical
|
|
@@ -29,7 +32,7 @@ module Iriq
|
|
|
29
32
|
out << "#{iri.scheme}://" if iri.scheme
|
|
30
33
|
out << iri.host if iri.host
|
|
31
34
|
out << ":#{iri.port}" if iri.port
|
|
32
|
-
out << PathShape.new(classifier: classifier).for(iri.path_segments)
|
|
35
|
+
out << PathShape.new(classifier: classifier, hints: hints).for(iri.path_segments)
|
|
33
36
|
if iri.query_params && !iri.query_params.empty?
|
|
34
37
|
out << "?" + shape_query(iri.query_params, classifier)
|
|
35
38
|
end
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
module Iriq
  # What Corpus#observe hands back. Holds only references; the fingerprint
  # and explanation are computed the first time they are requested.
  class Observation
    attr_reader :identifier, :cluster

    # corpus     - the Corpus that produced this observation; #explanation
    #              and #normalize delegate to it.
    # identifier - the observed identifier.
    # cluster    - the cluster the identifier was placed in.
    def initialize(corpus:, identifier:, cluster:)
      @corpus, @identifier, @cluster = corpus, identifier, cluster
    end

    # Result of Normalizer.normalize_identifier for the identifier (no corpus
    # involved); memoized once a truthy value is obtained.
    def fingerprint
      return @fingerprint if @fingerprint

      @fingerprint = Normalizer.normalize_identifier(@identifier)
    end

    # Result of the corpus's #explain for the identifier; memoized once a
    # truthy value is obtained.
    def explanation
      return @explanation if @explanation

      @explanation = @corpus.explain(@identifier)
    end

    # Result of the corpus's #normalize for the identifier. Not memoized —
    # the corpus is asked again on every call.
    def normalize
      @corpus.normalize(@identifier)
    end
  end
end
|