iriq 0.1.0 → 0.30.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +87 -0
- data/CLAUDE.md +208 -0
- data/Gemfile.lock +8 -2
- data/Makefile +113 -0
- data/README.md +249 -270
- data/completions/_iriq +52 -0
- data/completions/iriq.bash +70 -0
- data/docs/ARCHITECTURE.md +223 -0
- data/docs/ROADMAP.md +190 -0
- data/iriq.gemspec +5 -4
- data/lib/iriq/cli.rb +402 -49
- data/lib/iriq/cluster.rb +304 -8
- data/lib/iriq/clusterer.rb +19 -44
- data/lib/iriq/corpus.rb +417 -81
- data/lib/iriq/cross_host_shape.rb +37 -0
- data/lib/iriq/event.rb +22 -0
- data/lib/iriq/evidence.rb +114 -0
- data/lib/iriq/explanation.rb +1 -1
- data/lib/iriq/normalizer.rb +71 -29
- data/lib/iriq/parser.rb +1 -1
- data/lib/iriq/path_shape.rb +30 -24
- data/lib/iriq/position.rb +75 -0
- data/lib/iriq/position_stats.rb +74 -8
- data/lib/iriq/recognizer.rb +54 -0
- data/lib/iriq/recognizer_proposal.rb +167 -0
- data/lib/iriq/recognizers/date.rb +53 -0
- data/lib/iriq/recognizers/integer.rb +37 -0
- data/lib/iriq/recognizers/uuid.rb +16 -0
- data/lib/iriq/reducer.rb +37 -0
- data/lib/iriq/registrable_domain.rb +56 -0
- data/lib/iriq/segment_classifier.rb +475 -23
- data/lib/iriq/segment_hints.rb +9 -0
- data/lib/iriq/shape.rb +106 -0
- data/lib/iriq/specificity.rb +35 -0
- data/lib/iriq/storage/json.rb +43 -0
- data/lib/iriq/storage/memory.rb +209 -0
- data/lib/iriq/storage/sqlite.rb +546 -0
- data/lib/iriq/storage.rb +35 -0
- data/lib/iriq/synthesized_recognizer.rb +56 -0
- data/lib/iriq/trace.rb +294 -0
- data/lib/iriq/version.rb +1 -1
- data/lib/iriq.rb +18 -0
- metadata +44 -8
- data/script/benchmark.rb +0 -81
- data/script/memory.rb +0 -121
data/lib/iriq/corpus.rb
CHANGED
|
@@ -7,6 +7,10 @@ module Iriq
|
|
|
7
7
|
#
|
|
8
8
|
# The deterministic, single-IRI API (Iriq.normalize/explain) is unchanged —
|
|
9
9
|
# Corpus#normalize and Corpus#explain are the corpus-informed variants.
|
|
10
|
+
#
|
|
11
|
+
# State lives in a Storage backend (Memory by default; Json or Sqlite when
|
|
12
|
+
# opened against a file). The classification logic on top is identical
|
|
13
|
+
# regardless of where the counters live.
|
|
10
14
|
class Corpus
|
|
11
15
|
# Type-based: position is "mostly variable" (UUIDs/integers/etc.).
|
|
12
16
|
VARIABLE_DOMINANCE_THRESHOLD = 0.8
|
|
@@ -38,44 +42,274 @@ module Iriq
|
|
|
38
42
|
POPULAR_MIN_COUNT = 5
|
|
39
43
|
POPULAR_BASELINE_MULTIPLE = 3
|
|
40
44
|
|
|
41
|
-
|
|
42
|
-
|
|
45
|
+
HOST_STRATEGIES = %i[full registrable none].freeze
|
|
46
|
+
|
|
47
|
+
attr_reader :storage, :host_strategy, :classifier
|
|
43
48
|
|
|
44
49
|
def initialize(classifier: SegmentClassifier::DEFAULT,
               max_values_per_position: PositionStats::DEFAULT_MAX_VALUES,
               host_strategy: :full,
               storage: nil)
  # Validate before any state is assigned.
  unless HOST_STRATEGIES.include?(host_strategy)
    raise ArgumentError, "host_strategy must be one of #{HOST_STRATEGIES.inspect}"
  end

  @classifier = classifier
  @host_strategy = host_strategy
  # A caller-supplied backend wins; otherwise build an in-memory one from
  # the classifier and the per-position value cap.
  @storage = storage
  @storage ||= Storage::Memory.new(
    classifier: classifier,
    max_values_per_position: max_values_per_position,
  )
end
|
|
63
|
+
|
|
64
|
+
# Open a corpus against `path`. File extension picks the backend:
|
|
65
|
+
# `.db`/`.sqlite`/`.sqlite3` use SQLite (incremental writes); anything
|
|
66
|
+
# else uses JSON.
|
|
67
|
+
def self.open(path, classifier: SegmentClassifier::DEFAULT,
              max_values_per_position: PositionStats::DEFAULT_MAX_VALUES,
              host_strategy: :full)
  backend = Storage.open(path, classifier: classifier, max_values_per_position: max_values_per_position)

  new(classifier: classifier, storage: backend, host_strategy: host_strategy).tap do |corpus|
    # The re-apply hook is private, hence `send`. It only runs for backends
    # that can enumerate activated recognizers.
    corpus.send(:reapply_activated_recognizers!) if backend.respond_to?(:each_activated_recognizer)
  end
end
|
|
77
|
+
|
|
78
|
+
# Normalize the host for keying purposes. `:full` keeps the original
|
|
79
|
+
# host; `:registrable` collapses subdomains via the inline-PSL heuristic
|
|
80
|
+
# (api.foo.com + app.foo.com → foo.com); `:none` ignores host entirely
|
|
81
|
+
# so clusters group across all hosts by shape alone.
|
|
82
|
+
def effective_host(host)
  # :registrable delegates to RegistrableDomain; :none keys every host as "".
  return RegistrableDomain.for(host) if @host_strategy == :registrable
  return "" if @host_strategy == :none

  # Any other strategy (i.e. :full) keeps the host untouched.
  host
end
|
|
55
89
|
|
|
56
90
|
# Observe a single IRI. Returns an Observation.
|
|
91
|
+
#
|
|
92
|
+
# Internally: builds an Event list for the IRI, then applies each event
|
|
93
|
+
# through the Reducer registry inside a single storage transaction. The
|
|
94
|
+
# event list is transient today — a future commit can persist it and
|
|
95
|
+
# replay against alternate reducers / thresholds for re-runnable
|
|
96
|
+
# inference. See lib/iriq/event.rb and lib/iriq/reducer.rb.
|
|
57
97
|
def observe(input)
  iri = coerce(input)
  pending = events_for(iri)
  added_cluster = nil

  @storage.transaction do |store|
    pending.each do |event|
      outcome = Reducer.apply(event, store)
      # Only the ClusterAddition result is handed to the Observation.
      added_cluster = outcome if event.is_a?(Event::ClusterAddition)
    end
    # Backends without a source-IRI log simply skip this step.
    store.record_observation(iri.canonical) if store.respond_to?(:record_observation)
  end

  Observation.new(corpus: self, identifier: iri, cluster: added_cluster)
end
|
|
112
|
+
|
|
113
|
+
# Drop every materialized view (host counts, position stats, clusters,
|
|
114
|
+
# …) and rebuild them by replaying the source-IRI log through the
|
|
115
|
+
# current events + reducers pipeline. Useful for:
|
|
116
|
+
#
|
|
117
|
+
# - Tuning thresholds (swap a Corpus constant, call reinfer)
|
|
118
|
+
# - Swapping the classifier (open the Corpus with a different
|
|
119
|
+
# classifier, call reinfer — events are re-derived from raw IRIs)
|
|
120
|
+
# - Recovering after a Reducer-set change
|
|
121
|
+
#
|
|
122
|
+
# Wrapped in a single backend transaction so a failure mid-replay
|
|
123
|
+
# leaves the prior views intact.
|
|
124
|
+
def reinfer
  @storage.transaction do |store|
    # Copy the log out first; the views are cleared before the replay starts.
    logged = []
    store.each_observed_iri { |canonical| logged.push(canonical) }
    store.clear_materialized_views

    logged.each do |canonical|
      parsed = Parser.parse(canonical)
      events_for(parsed).each { |event| Reducer.apply(event, store) }
    end
  end

  nil
end
|
|
136
|
+
|
|
137
|
+
# Number of IRIs in the source-IRI log. The materialized views are
|
|
138
|
+
# derived from this log; reinfer replays it.
|
|
139
|
+
def observed_iri_count
  # Backends that do not expose a count report zero.
  @storage.respond_to?(:observed_iri_count) ? @storage.observed_iri_count : 0
end
|
|
143
|
+
|
|
144
|
+
# Scan observed values for shape patterns that recur frequently enough
|
|
145
|
+
# to suggest a new Recognizer. Returns RecognizerProposal records; nothing
|
|
146
|
+
# is automatically applied — the proposal carries enough evidence for a
|
|
147
|
+
# human to decide whether to bake the Recognizer in.
|
|
148
|
+
#
|
|
149
|
+
# Strategies are pluggable; the default set lives in
|
|
150
|
+
# Iriq::ProposalStrategy::DEFAULTS. Pass `strategies:` to limit / extend.
|
|
151
|
+
# Pass `min_observations:` / `min_coverage:` / `min_hosts:` to tune
|
|
152
|
+
# what passes the noise floor.
|
|
153
|
+
def propose_recognizers(strategies: ProposalStrategy::DEFAULTS, **opts)
  # Every strategy sees the same storage and options; results are
  # concatenated in strategy order.
  strategies.flat_map do |strategy|
    strategy.propose(@storage, **opts)
  end
end
|
|
156
|
+
|
|
157
|
+
# Promote a RecognizerProposal into a live Recognizer for this corpus.
|
|
158
|
+
#
|
|
159
|
+
# Mechanics:
|
|
160
|
+
# 1. Synthesize a SynthesizedRecognizer from the proposal's prefix.
|
|
161
|
+
# 2. Switch to a per-corpus classifier (if we were sharing the
|
|
162
|
+
# module-level DEFAULT) so activation doesn't leak to other
|
|
163
|
+
# corpora using the same default singleton.
|
|
164
|
+
# 3. Register the Recognizer on the classifier — the ensemble
|
|
165
|
+
# picks it up on the next classify() call.
|
|
166
|
+
# 4. Persist the activation in storage so reopens re-apply it.
|
|
167
|
+
# 5. Reinfer so existing observations get re-classified through
|
|
168
|
+
# the new Recognizer.
|
|
169
|
+
#
|
|
170
|
+
# Returns the synthesized Recognizer.
|
|
171
|
+
def activate_proposal(proposal)
  SynthesizedRecognizer.from_proposal(proposal).tap do |recognizer|
    # Detach from the shared default classifier before registering on it.
    ensure_per_corpus_classifier!
    @classifier.register_recognizer(recognizer)

    # Persist only on backends that support recording activations.
    @storage.record_activated_recognizer(recognizer.to_dump) if @storage.respond_to?(:record_activated_recognizer)

    reinfer
  end
end
|
|
181
|
+
|
|
182
|
+
# Convenience: activate every proposal whose confidence clears the
|
|
183
|
+
# given threshold. Returns the activated Recognizers. Confidence
|
|
184
|
+
# incorporates both per-position coverage AND cross-host
|
|
185
|
+
# corroboration — see RecognizerProposal#compute_confidence.
|
|
186
|
+
def activate_proposals_above(confidence_threshold, **propose_opts)
  eligible = propose_recognizers(**propose_opts).select do |candidate|
    candidate.confidence >= confidence_threshold
  end

  # Each activation goes through the single-proposal path.
  eligible.map { |candidate| activate_proposal(candidate) }
end
|
|
190
|
+
|
|
191
|
+
# Number of activated recognizers persisted with this corpus.
|
|
192
|
+
def activated_recognizer_count
  # Backends that do not expose a count report zero.
  @storage.respond_to?(:activated_recognizer_count) ? @storage.activated_recognizer_count : 0
end
|
|
196
|
+
|
|
197
|
+
# Route shapes that recur across `min_hosts` or more distinct hosts.
|
|
198
|
+
# Returns CrossHostShape records sorted by host_count desc, then by
|
|
199
|
+
# observation_count desc, then by shape (stable, deterministic).
|
|
200
|
+
#
|
|
201
|
+
# Cross-host recurrence is independent evidence of a real semantic
|
|
202
|
+
# pattern — two unrelated hosts inventing the same `/users/{integer}`
|
|
203
|
+
# structure by accident is unlikely. A natural follow-up is feeding
|
|
204
|
+
# this signal back into RecognizerProposal confidence: a proposal
|
|
205
|
+
# supported by N hosts is much stronger than one seen on a single
|
|
206
|
+
# host with the same per-position coverage.
|
|
207
|
+
def cross_host_shapes(min_hosts: 2)
  # Clusters with a nil or empty host (e.g. URNs) take no part.
  hosted = @storage.clusters.reject { |cluster| cluster.host.nil? || cluster.host.empty? }

  records = hosted.group_by(&:shape).filter_map do |shape, members|
    hosts = members.each_with_object(Set.new) { |cluster, seen| seen << cluster.host }
    next if hosts.size < min_hosts

    CrossHostShape.new(
      shape: shape,
      hosts: hosts,
      observation_count: members.sum(&:count),
    )
  end

  # Most hosts first, then most observations, then shape as the tie-break.
  records.sort_by { |record| [-record.host_count, -record.observation_count, record.shape] }
end
|
|
228
|
+
|
|
229
|
+
# Build the ordered Event list for `input` without applying it. Useful
|
|
230
|
+
# for inspection, tests, and future event-log persistence. Each call is
|
|
231
|
+
# pure — no storage side-effects.
|
|
232
|
+
def events_for(input)
  iri = coerce(input)
  entries = SegmentHints.derive(iri.path_segments, @classifier)
  raw_shape = PathShape.new(classifier: @classifier, hints: false).from_entries(entries)
  hinted_shape = PathShape.new(classifier: @classifier, hints: true).from_entries(entries)
  keying_host = effective_host(iri.host)

  # Whole-IRI facts come first, in a fixed order.
  events = [
    Event::HostSeen.new(keying_host),
    Event::PathLengthSeen.new(iri.path_segments.size),
    Event::RawShapeSeen.new(raw_shape),
    Event::FingerprintSeen.new(hinted_shape),
  ]

  # One PositionSeen per segment. Each position is keyed by the placeholder
  # prefix built from the segments before it.
  entries.inject("") do |prefix, entry|
    position = Position.path(host: keying_host, prefix: prefix)
    events << Event::PositionSeen.new(position, entry[:value], entry[:type])
    "#{prefix}/#{placeholder(entry)}"
  end

  # The cluster addition is always the last event.
  key, host, scheme, shape = Cluster.key_for(iri, classifier: @classifier, shape: hinted_shape, host: keying_host)
  events << Event::ClusterAddition.new(key, host, scheme, shape, iri)
end
|
|
65
260
|
|
|
66
261
|
# Corpus-informed normalization. Falls back to mechanical normalization
|
|
67
|
-
# when the corpus has no signal for a position.
|
|
262
|
+
# when the corpus has no signal for a position. Implemented as a thin
|
|
263
|
+
# call into Normalizer with `evidence: self`; the corpus-informed path
|
|
264
|
+
# and query rendering live in #render_path / #render_query below
|
|
265
|
+
# (the evidence-source interface).
|
|
68
266
|
def normalize(input)
  # This corpus is passed as the evidence source.
  Normalizer.normalize_identifier(coerce(input), classifier: @classifier, hints: true, evidence: self)
end
|
|
71
270
|
|
|
271
|
+
# Evidence-source interface — called by Normalizer when this Corpus is
|
|
272
|
+
# passed as `evidence:`. Renders the path using corpus-informed
|
|
273
|
+
# classifications (variability promotion, popular-outlier preservation).
|
|
274
|
+
# Always emits a leading "/" — empty path collapses to "/" to match
|
|
275
|
+
# mechanical output and anchor any trailing query.
|
|
276
|
+
def render_path(iri, _classifier, _hints)
  rendered = annotate_segments(iri).map { |segment| corpus_token(segment) }.join("/")
  # The leading slash is unconditional, so an empty path renders as "/".
  "/#{rendered}"
end
|
|
280
|
+
|
|
281
|
+
# Evidence-source interface — render the query string with
|
|
282
|
+
# cluster-inferred param types where available. The mechanical
|
|
283
|
+
# NullEvidenceSource provides the classifier-only fallback; this
|
|
284
|
+
# version prefers the cluster's observed type per param (dominant
|
|
285
|
+
# type_count, subject to the corpus thresholds).
|
|
286
|
+
def render_query(iri, _classifier = @classifier)
  # Locate the cluster this IRI would fall into under the current keying.
  shape_builder = PathShape.new(classifier: @classifier, hints: true)
  hinted_shape = shape_builder.from_entries(SegmentHints.derive(iri.path_segments, @classifier))
  key, = Cluster.key_for(iri, classifier: @classifier, shape: hinted_shape,
                              host: effective_host(iri.host))
  cluster = @storage.cluster_for(key)

  # Params are emitted in sorted key order.
  pairs = iri.query_params.keys.sort.map do |name|
    raw = iri.query_params[name].to_s
    rendered = render_param_value(raw, inferred_param_type(cluster, name, raw))
    "#{name}=#{rendered}"
  end
  pairs.join("&")
end
|
|
300
|
+
|
|
301
|
+
# Inferred params for the cluster `input` would fall into. Returns the
|
|
302
|
+
# same shape as Cluster#param_summary — useful for "what query params
|
|
303
|
+
# might this URL accept?" tooling. Empty array if no cluster has been
|
|
304
|
+
# observed for this shape yet.
|
|
305
|
+
def params_for(input)
  iri = coerce(input)
  shape_builder = PathShape.new(classifier: @classifier, hints: true)
  hinted_shape = shape_builder.from_entries(SegmentHints.derive(iri.path_segments, @classifier))
  key, = Cluster.key_for(iri, classifier: @classifier, shape: hinted_shape,
                              host: effective_host(iri.host))

  cluster = @storage.cluster_for(key)
  # No cluster for this key means there is nothing to summarize.
  return [] unless cluster

  cluster.param_summary
end
|
|
80
314
|
|
|
81
315
|
# Per-segment explanation with corpus-informed `classification`.
|
|
@@ -89,55 +323,93 @@ module Iriq
|
|
|
89
323
|
end
|
|
90
324
|
end
|
|
91
325
|
|
|
326
|
+
# Read-only views over the storage backend's counters.
def host_counts
  @storage.host_counts
end

def path_length_counts
  @storage.path_length_counts
end

def raw_shape_counts
  @storage.raw_shape_counts
end

def fingerprint_counts
  @storage.fingerprint_counts
end
|
|
330
|
+
|
|
331
|
+
# Iterates Position → PositionStats over all observed positions.
|
|
332
|
+
# Used by inspection tooling; not part of the hot path.
|
|
333
|
+
def each_position_stats(&block)
  # Iteration is delegated to the backend; the caller's block is passed
  # through untouched.
  @storage.each_position_stats(&block)
end
|
|
336
|
+
|
|
92
337
|
def clusters
  # Returned exactly as the storage backend hands them out.
  @storage.clusters
end
|
|
95
340
|
|
|
96
341
|
def size
  # The corpus size is the backend's cluster count.
  @storage.cluster_size
end
|
|
99
344
|
|
|
100
|
-
# Stats for a given (host,
|
|
345
|
+
# Stats for a given (host, path-prefix) — useful for tests and
|
|
101
346
|
# debugging. Returns nil if nothing has been observed there.
|
|
102
|
-
|
|
103
|
-
|
|
347
|
+
# Accepts either a Position or (host, prefix) for ergonomics.
|
|
348
|
+
def stats_for(host_or_position, prefix = nil)
  position =
    if host_or_position.is_a?(Position)
      host_or_position
    else
      # Build the path position from the (host, prefix) pair.
      Position.path(host: host_or_position, prefix: prefix)
    end

  @storage.position_stats(position)
end
|
|
105
352
|
|
|
106
|
-
|
|
353
|
+
# Persist the corpus.
|
|
354
|
+
#
|
|
355
|
+
# save() → flush the backend in place (JSON writes its file,
|
|
356
|
+
# SQLite is already on disk).
|
|
357
|
+
# save(same_path) → same as save() — idempotent for the backend's path.
|
|
358
|
+
# save(other_path) → export to other_path as JSON, regardless of the
|
|
359
|
+
# live backend.
|
|
360
|
+
def save(path = nil)
  # No explicit target: flush the live backend in place.
  return @storage.save if path.nil?

  backend_path = @storage.respond_to?(:path) ? @storage.path : nil

  # Compare expanded paths, not raw strings. Otherwise "./corpus.db" or a
  # relative spelling of the backend's own file counts as a different
  # target, and the JSON export is renamed over the live backend file.
  if backend_path && File.expand_path(path) == File.expand_path(backend_path)
    @storage.save
  else
    write_json_dump(path)
  end
end
|
|
107
368
|
|
|
108
|
-
def
|
|
109
|
-
|
|
369
|
+
def close
  # Closing is entirely the backend's job.
  @storage.close
end
|
|
372
|
+
|
|
373
|
+
# Wrap many observations in a single backend transaction. For SQLite this
|
|
374
|
+
# turns thousands of fsyncs into one; for in-memory backends it's a
|
|
375
|
+
# no-op. Use when ingesting a batch.
|
|
376
|
+
def batch(&block)
  # The backend decides what batching means; the block is passed through.
  @storage.batch(&block)
end
|
|
111
379
|
|
|
112
|
-
|
|
113
|
-
@host_counts[iri.host] += 1 if iri.host
|
|
114
|
-
@path_length_counts[iri.path_segments.size] += 1
|
|
380
|
+
private
|
|
115
381
|
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
382
|
+
# If we're still sharing the module-level DEFAULT classifier, switch
|
|
383
|
+
# to our own copy so register_recognizer doesn't leak into other
|
|
384
|
+
# corpora using the same default singleton.
|
|
385
|
+
def ensure_per_corpus_classifier!
  # Replace the classifier only while it still equals the shared DEFAULT;
  # a corpus-specific classifier is left alone.
  @classifier = SegmentClassifier.new if @classifier == SegmentClassifier::DEFAULT
end
|
|
123
390
|
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
391
|
+
# On Corpus.open, walk the stored activations and register each one
|
|
392
|
+
# on this corpus's classifier. Switches to a per-corpus classifier
|
|
393
|
+
# if any activations exist.
|
|
394
|
+
def reapply_activated_recognizers!
  stored = @storage.activated_recognizer_count
  # Nothing persisted: keep whatever classifier we already have.
  return if stored.zero?

  ensure_per_corpus_classifier!
  @storage.each_activated_recognizer do |dump|
    restored = SynthesizedRecognizer.from_dump(dump)
    @classifier.register_recognizer(restored)
  end
end
|
|
133
402
|
|
|
134
|
-
|
|
135
|
-
|
|
403
|
+
def coerce(input)
  # Identifier instances pass through; anything else goes to the parser.
  return input if input.is_a?(Identifier)

  Parser.parse(input)
end
|
|
406
|
+
|
|
136
407
|
def annotate_segments(iri)
|
|
137
408
|
hinted = SegmentHints.derive(iri.path_segments, @classifier)
|
|
138
409
|
prefix = ""
|
|
410
|
+
keying_host = effective_host(iri.host)
|
|
139
411
|
hinted.map do |entry|
|
|
140
|
-
stats = @position_stats
|
|
412
|
+
stats = @storage.position_stats(Position.path(host: keying_host, prefix: prefix))
|
|
141
413
|
out = entry.merge(
|
|
142
414
|
prefix: prefix,
|
|
143
415
|
classification: classify(entry, stats),
|
|
@@ -150,14 +422,28 @@ module Iriq
|
|
|
150
422
|
def placeholder(entry)
  if entry[:variable]
    # A hint, when present, wins over the type's display name.
    label = entry[:hint] || SegmentClassifier.display_type(entry[:type])
    "{#{label}}"
  else
    entry[:value]
  end
end
|
|
155
427
|
|
|
428
|
+
# Types whose values are often a small fixed set (or a single static
|
|
429
|
+
# value baked into a REST route). For these, run through the same
|
|
430
|
+
# cardinality / value-fraction analysis literals get — a dominant
|
|
431
|
+
# value gets preserved as :stable_literal instead of being
|
|
432
|
+
# placeholdered as a generic {version}/{slug}/etc.
|
|
433
|
+
#
|
|
434
|
+
# Slug + opaque_id are here because a lot of route literals
|
|
435
|
+
# accidentally match those shapes (`/users/{id}/create-new`,
|
|
436
|
+
# reference codes like `WK1234`). When a single value dominates the
|
|
437
|
+
# position, the literal is almost always the better display.
|
|
438
|
+
STABLE_VARIABLE_TYPES = %i[version locale currency boolean slug opaque_id].freeze
|
|
439
|
+
|
|
156
440
|
def classify(entry, stats)
|
|
157
441
|
variable = entry[:variable]
|
|
158
442
|
|
|
159
443
|
return variable ? :variable_identifier : :ambiguous if stats.nil? || stats.total.zero?
|
|
160
|
-
|
|
444
|
+
if variable && !STABLE_VARIABLE_TYPES.include?(entry[:type])
|
|
445
|
+
return :variable_identifier
|
|
446
|
+
end
|
|
161
447
|
|
|
162
448
|
value = entry[:value]
|
|
163
449
|
total = stats.total
|
|
@@ -166,6 +452,17 @@ module Iriq
|
|
|
166
452
|
enough_data = total >= MIN_OBSERVATIONS_FOR_INFERENCE
|
|
167
453
|
value_frac = stats.value_fraction(value)
|
|
168
454
|
|
|
455
|
+
# For STABLE_VARIABLE_TYPES (version, locale, currency, boolean, slug, opaque_id),
|
|
456
|
+
# a dominant value wins over the variable-dominance branch — a
|
|
457
|
+
# single-version /api/v1/... pattern stays as the literal `v1`
|
|
458
|
+
# rather than placeholdering to {version}. Without dominance,
|
|
459
|
+
# fall through to :variable_identifier (the per-type placeholder).
|
|
460
|
+
if variable
|
|
461
|
+
return :stable_literal if value_frac >= STABLE_LITERAL_THRESHOLD
|
|
462
|
+
|
|
463
|
+
return :variable_identifier
|
|
464
|
+
end
|
|
465
|
+
|
|
169
466
|
if enough_data && variable_frac >= VARIABLE_DOMINANCE_THRESHOLD
|
|
170
467
|
# Position is dominated by variable types (UUIDs, integers, etc.).
|
|
171
468
|
# A literal here is a special-case outlier (e.g. /users/me).
|
|
@@ -204,6 +501,28 @@ module Iriq
|
|
|
204
501
|
stats.value_fraction(value) >= POPULAR_BASELINE_MULTIPLE * baseline
|
|
205
502
|
end
|
|
206
503
|
|
|
504
|
+
def inferred_param_type(cluster, name, value)
  observed = cluster ? cluster.param_stats[name] : nil
  # Trust the cluster only once the param has enough observations.
  confident = observed && observed.total >= MIN_OBSERVATIONS_FOR_INFERENCE
  from_cluster = confident ? cluster.param_type(name) : nil

  # With no confident cluster type (or a nil one), classify the value itself.
  from_cluster || @classifier.classify(value)
end
|
|
515
|
+
|
|
516
|
+
def render_param_value(value, type)
  if type == :date
    # A date that canonicalizes is shown in that form, not as a placeholder.
    iso = SegmentClassifier.canonical_date(value)
    return iso if iso
  end

  return "{#{SegmentClassifier.display_type(type)}}" if @classifier.variable?(type)

  value
end
|
|
525
|
+
|
|
207
526
|
def corpus_token(entry)
|
|
208
527
|
case entry[:classification]
|
|
209
528
|
when :variable_identifier, :corpus_inferred_variable
|
|
@@ -214,7 +533,13 @@ module Iriq
|
|
|
214
533
|
end
|
|
215
534
|
|
|
216
535
|
def placeholder_for_variable(entry)
|
|
217
|
-
|
|
536
|
+
# Dates render in canonical ISO form rather than as a `{date}` placeholder
|
|
537
|
+
# — matches what mechanical Iriq.normalize does for path segments and
|
|
538
|
+
# what render_param_value does for query params.
|
|
539
|
+
if entry[:type] == :date && (canon = SegmentClassifier.canonical_date(entry[:value]))
|
|
540
|
+
return canon
|
|
541
|
+
end
|
|
542
|
+
return "{#{entry[:hint] || SegmentClassifier.display_type(entry[:type])}}" if entry[:variable]
|
|
218
543
|
|
|
219
544
|
# corpus-inferred variable: classifier said literal, corpus says
|
|
220
545
|
# otherwise. Derive a hint from the prefix's last literal segment if
|
|
@@ -226,43 +551,54 @@ module Iriq
|
|
|
226
551
|
|
|
227
552
|
public
|
|
228
553
|
|
|
554
|
+
# --- Legacy dump/load (JSON shape) ------------------------------------
|
|
555
|
+
#
|
|
556
|
+
# The pre-Storage release exposed `Corpus#dump`, `Corpus#save(path)`, and
|
|
557
|
+
# `Corpus.load(path)` for JSON-backed persistence. Those names still work
|
|
558
|
+
# but are now thin wrappers around the appropriate Storage backend.
|
|
559
|
+
|
|
229
560
|
def dump
  # Every backend is dumped through a view that responds to to_dump.
  snapshot = memory_view
  snapshot.to_dump
end
|
|
246
563
|
|
|
247
564
|
# Rebuild a corpus from a hash in the legacy JSON dump shape.
#
# h             - the dump hash; "max_values_per_position" is optional and
#                 defaults to PositionStats::DEFAULT_MAX_VALUES.
# classifier    - classifier handed to both the storage and the corpus.
# host_strategy - keying strategy for the new corpus (one of
#                 HOST_STRATEGIES). Defaults to :full, which is what this
#                 method always used before the keyword existed.
#
# Returns a Corpus backed by a Storage::Memory loaded from `h`.
def self.from_dump(h, classifier: SegmentClassifier::DEFAULT, host_strategy: :full)
  max_values = h.fetch("max_values_per_position", PositionStats::DEFAULT_MAX_VALUES)
  storage = Storage::Memory.new(classifier: classifier, max_values_per_position: max_values)
  storage.load_dump!(h)
  new(classifier: classifier, storage: storage, host_strategy: host_strategy)
end
|
|
263
570
|
|
|
264
571
|
# Legacy entry point kept for callers of the pre-Storage API; forwards to
# Corpus.open.
#
# Takes the same optional keywords as Corpus.open, with the same defaults,
# so a corpus can be reloaded with a non-default value cap or host keying
# strategy. Calls that pass only `classifier:` behave as before.
def self.load(path, classifier: SegmentClassifier::DEFAULT,
              max_values_per_position: PositionStats::DEFAULT_MAX_VALUES,
              host_strategy: :full)
  open(path,
       classifier: classifier,
       max_values_per_position: max_values_per_position,
       host_strategy: host_strategy)
end
|
|
574
|
+
|
|
575
|
+
private
|
|
576
|
+
|
|
577
|
+
# Write the JSON dump to `path` via a sibling "<path>.tmp" file that is
# renamed into place, so `path` is replaced in a single rename step.
#
# Raises whatever File.write / File.rename raise. The temp file is removed
# on failure, so an aborted export leaves no "<path>.tmp" behind.
def write_json_dump(path)
  tmp = "#{path}.tmp"
  File.write(tmp, JSON.generate(memory_view.to_dump))
  File.rename(tmp, path)
ensure
  # After a successful rename the temp file no longer exists, so this only
  # fires on the failure paths.
  File.delete(tmp) if tmp && File.exist?(tmp)
end
|
|
582
|
+
|
|
583
|
+
# Materialize a Memory snapshot of the current state — used by dump for
|
|
584
|
+
# backends that don't natively know how to emit the JSON shape.
|
|
585
|
+
def memory_view
  # A backend that responds to to_dump is used as-is.
  return @storage if @storage.respond_to?(:to_dump)

  snapshot = Storage::Memory.new(
    classifier: @classifier,
    max_values_per_position: @storage.max_values_per_position,
  )

  # Counter tables are copied into fresh hashes that default to 0.
  %i[host_counts path_length_counts raw_shape_counts fingerprint_counts].each do |table|
    snapshot.instance_variable_set(:"@#{table}", Hash.new(0).merge(@storage.public_send(table)))
  end

  position_stats = {}
  @storage.each_position_stats { |key, stats| position_stats[key] = stats }
  snapshot.instance_variable_set(:@position_stats, position_stats)

  # Clusters are indexed by their key.
  by_key = @storage.clusters.each_with_object({}) { |cluster, index| index[cluster.key] = cluster }
  snapshot.instance_variable_set(:@clusters, by_key)

  snapshot
end
|
|
267
603
|
end
|
|
268
604
|
end
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
require "set"
|
|
2
|
+
|
|
3
|
+
module Iriq
  # One route shape together with every host it was observed under.
  #
  # Produced by Corpus#cross_host_shapes: clusters that share a shape value
  # (e.g. "/users/{user_id}") but differ in host are folded into a single
  # record. The same structure recurring on several independent hosts is
  # treated as evidence of a real semantic pattern rather than a host-local
  # quirk.
  class CrossHostShape
    attr_reader :shape, :hosts, :observation_count

    # shape             - the shared shape value
    # hosts             - a Set (copied) or anything Set.new accepts
    # observation_count - total observations behind the record
    def initialize(shape:, hosts:, observation_count:)
      # Keep a private frozen Set so the caller's collection is never frozen
      # or shared.
      owned_hosts = hosts.is_a?(Set) ? hosts.dup : Set.new(hosts)

      @shape = shape
      @hosts = owned_hosts.freeze
      @observation_count = observation_count
    end

    # Number of distinct hosts backing this shape.
    def host_count
      hosts.size
    end

    # Plain-hash form; hosts are emitted as a sorted array.
    def to_h
      summary = { shape: shape }
      summary[:hosts] = hosts.to_a.sort
      summary[:host_count] = host_count
      summary[:observation_count] = observation_count
      summary
    end
  end
end
|
data/lib/iriq/event.rb
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
module Iriq
  # Observation-time facts produced by Corpus#observe ahead of any state
  # change. One observe(iri) call yields a short ordered list of Events, and
  # Reducers fold that list into the materialized views (host counts,
  # position stats, clusters, ...).
  #
  # The list is currently transient: built per observe(), applied, then
  # dropped. The types exist so a later change can persist the log and replay
  # it to rebuild the views without re-feeding source IRIs (the "re-runnable
  # inference" item in ROADMAP.md).
  #
  # Every Event is a plain Struct, so fields can be read by name or by
  # position and instances can be matched on their class.
  module Event
    # Whole-IRI facts.
    HostSeen        = Struct.new(:host)
    PathLengthSeen  = Struct.new(:length)
    RawShapeSeen    = Struct.new(:shape)
    FingerprintSeen = Struct.new(:shape)

    # One per path segment: where it sits, what was seen, how it classified.
    PositionSeen = Struct.new(:position, :value, :type)

    # The IRI joining the cluster identified by `key`.
    ClusterAddition = Struct.new(:key, :host, :scheme, :shape, :identifier)
  end
end
|