iriq 0.2.0 → 0.30.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +78 -0
- data/CLAUDE.md +128 -41
- data/Gemfile.lock +4 -4
- data/Makefile +80 -23
- data/README.md +225 -347
- data/completions/_iriq +52 -0
- data/completions/iriq.bash +70 -0
- data/docs/ARCHITECTURE.md +223 -0
- data/docs/ROADMAP.md +190 -0
- data/iriq.gemspec +2 -2
- data/lib/iriq/cli.rb +398 -46
- data/lib/iriq/cluster.rb +284 -12
- data/lib/iriq/corpus.rb +318 -36
- data/lib/iriq/cross_host_shape.rb +37 -0
- data/lib/iriq/event.rb +22 -0
- data/lib/iriq/evidence.rb +114 -0
- data/lib/iriq/explanation.rb +1 -1
- data/lib/iriq/normalizer.rb +71 -29
- data/lib/iriq/path_shape.rb +30 -24
- data/lib/iriq/position.rb +75 -0
- data/lib/iriq/position_stats.rb +74 -8
- data/lib/iriq/recognizer.rb +54 -0
- data/lib/iriq/recognizer_proposal.rb +167 -0
- data/lib/iriq/recognizers/date.rb +53 -0
- data/lib/iriq/recognizers/integer.rb +37 -0
- data/lib/iriq/recognizers/uuid.rb +16 -0
- data/lib/iriq/reducer.rb +37 -0
- data/lib/iriq/registrable_domain.rb +56 -0
- data/lib/iriq/segment_classifier.rb +475 -23
- data/lib/iriq/segment_hints.rb +9 -0
- data/lib/iriq/shape.rb +106 -0
- data/lib/iriq/specificity.rb +35 -0
- data/lib/iriq/storage/memory.rb +83 -12
- data/lib/iriq/storage/sqlite.rb +216 -37
- data/lib/iriq/synthesized_recognizer.rb +56 -0
- data/lib/iriq/trace.rb +294 -0
- data/lib/iriq/version.rb +1 -1
- data/lib/iriq.rb +17 -0
- metadata +22 -3
data/lib/iriq/corpus.rb
CHANGED
|
@@ -42,13 +42,20 @@ module Iriq
|
|
|
42
42
|
POPULAR_MIN_COUNT = 5
|
|
43
43
|
POPULAR_BASELINE_MULTIPLE = 3
|
|
44
44
|
|
|
45
|
-
|
|
45
|
+
HOST_STRATEGIES = %i[full registrable none].freeze
|
|
46
|
+
|
|
47
|
+
attr_reader :storage, :host_strategy, :classifier
|
|
46
48
|
|
|
47
49
|
def initialize(classifier: SegmentClassifier::DEFAULT,
|
|
48
50
|
max_values_per_position: PositionStats::DEFAULT_MAX_VALUES,
|
|
51
|
+
host_strategy: :full,
|
|
49
52
|
storage: nil)
|
|
50
|
-
|
|
51
|
-
|
|
53
|
+
raise ArgumentError, "host_strategy must be one of #{HOST_STRATEGIES.inspect}" \
|
|
54
|
+
unless HOST_STRATEGIES.include?(host_strategy)
|
|
55
|
+
|
|
56
|
+
@classifier = classifier
|
|
57
|
+
@host_strategy = host_strategy
|
|
58
|
+
@storage = storage || Storage::Memory.new(
|
|
52
59
|
classifier: classifier,
|
|
53
60
|
max_values_per_position: max_values_per_position,
|
|
54
61
|
)
|
|
@@ -58,53 +65,251 @@ module Iriq
|
|
|
58
65
|
# `.db`/`.sqlite`/`.sqlite3` use SQLite (incremental writes); anything
|
|
59
66
|
# else uses JSON.
|
|
60
67
|
def self.open(path, classifier: SegmentClassifier::DEFAULT,
|
|
61
|
-
max_values_per_position: PositionStats::DEFAULT_MAX_VALUES
|
|
68
|
+
max_values_per_position: PositionStats::DEFAULT_MAX_VALUES,
|
|
69
|
+
host_strategy: :full)
|
|
62
70
|
storage = Storage.open(path,
|
|
63
71
|
classifier: classifier,
|
|
64
72
|
max_values_per_position: max_values_per_position)
|
|
65
|
-
new(classifier: classifier, storage: storage)
|
|
73
|
+
corpus = new(classifier: classifier, storage: storage, host_strategy: host_strategy)
|
|
74
|
+
corpus.send(:reapply_activated_recognizers!) if storage.respond_to?(:each_activated_recognizer)
|
|
75
|
+
corpus
|
|
76
|
+
end
|
|
77
|
+
|
|
78
|
+
# Normalize the host for keying purposes. `:full` keeps the original
|
|
79
|
+
# host; `:registrable` collapses subdomains via the inline-PSL heuristic
|
|
80
|
+
# (api.foo.com + app.foo.com → foo.com); `:none` ignores host entirely
|
|
81
|
+
# so clusters group across all hosts by shape alone.
|
|
82
|
+
def effective_host(host)
|
|
83
|
+
case @host_strategy
|
|
84
|
+
when :registrable then RegistrableDomain.for(host)
|
|
85
|
+
when :none then ""
|
|
86
|
+
else host
|
|
87
|
+
end
|
|
66
88
|
end
|
|
67
89
|
|
|
68
90
|
# Observe a single IRI. Returns an Observation.
|
|
91
|
+
#
|
|
92
|
+
# Internally: builds an Event list for the IRI, then applies each event
|
|
93
|
+
# through the Reducer registry inside a single storage transaction. The
|
|
94
|
+
# event list is transient today — a future commit can persist it and
|
|
95
|
+
# replay against alternate reducers / thresholds for re-runnable
|
|
96
|
+
# inference. See lib/iriq/event.rb and lib/iriq/reducer.rb.
|
|
69
97
|
def observe(input)
|
|
98
|
+
iri = coerce(input)
|
|
99
|
+
events = events_for(iri)
|
|
100
|
+
cluster = nil
|
|
101
|
+
|
|
102
|
+
@storage.transaction do |s|
|
|
103
|
+
events.each do |e|
|
|
104
|
+
result = Reducer.apply(e, s)
|
|
105
|
+
cluster = result if e.is_a?(Event::ClusterAddition)
|
|
106
|
+
end
|
|
107
|
+
s.record_observation(iri.canonical) if s.respond_to?(:record_observation)
|
|
108
|
+
end
|
|
109
|
+
|
|
110
|
+
Observation.new(corpus: self, identifier: iri, cluster: cluster)
|
|
111
|
+
end
|
|
112
|
+
|
|
113
|
+
# Drop every materialized view (host counts, position stats, clusters,
|
|
114
|
+
# …) and rebuild them by replaying the source-IRI log through the
|
|
115
|
+
# current events + reducers pipeline. Useful for:
|
|
116
|
+
#
|
|
117
|
+
# - Tuning thresholds (swap a Corpus constant, call reinfer)
|
|
118
|
+
# - Swapping the classifier (open the Corpus with a different
|
|
119
|
+
# classifier, call reinfer — events are re-derived from raw IRIs)
|
|
120
|
+
# - Recovering after a Reducer-set change
|
|
121
|
+
#
|
|
122
|
+
# Wrapped in a single backend transaction so a failure mid-replay
|
|
123
|
+
# leaves the prior views intact.
|
|
124
|
+
def reinfer
|
|
125
|
+
@storage.transaction do |s|
|
|
126
|
+
iris = []
|
|
127
|
+
s.each_observed_iri { |canonical| iris << canonical }
|
|
128
|
+
s.clear_materialized_views
|
|
129
|
+
iris.each do |canonical|
|
|
130
|
+
iri = Parser.parse(canonical)
|
|
131
|
+
events_for(iri).each { |e| Reducer.apply(e, s) }
|
|
132
|
+
end
|
|
133
|
+
end
|
|
134
|
+
nil
|
|
135
|
+
end
|
|
136
|
+
|
|
137
|
+
# Number of IRIs in the source-IRI log. The materialized views are
|
|
138
|
+
# derived from this log; reinfer replays it.
|
|
139
|
+
def observed_iri_count
|
|
140
|
+
return @storage.observed_iri_count if @storage.respond_to?(:observed_iri_count)
|
|
141
|
+
0
|
|
142
|
+
end
|
|
143
|
+
|
|
144
|
+
# Scan observed values for shape patterns that recur frequently enough
|
|
145
|
+
# to suggest a new Recognizer. Returns RecognizerProposal records; nothing
|
|
146
|
+
# is automatically applied — the proposal carries enough evidence for a
|
|
147
|
+
# human to decide whether to bake the Recognizer in.
|
|
148
|
+
#
|
|
149
|
+
# Strategies are pluggable; the default set lives in
|
|
150
|
+
# Iriq::ProposalStrategy::DEFAULTS. Pass `strategies:` to limit / extend.
|
|
151
|
+
# Pass `min_observations:` / `min_coverage:` / `min_hosts:` to tune
|
|
152
|
+
# what passes the noise floor.
|
|
153
|
+
def propose_recognizers(strategies: ProposalStrategy::DEFAULTS, **opts)
|
|
154
|
+
strategies.flat_map { |s| s.propose(@storage, **opts) }
|
|
155
|
+
end
|
|
156
|
+
|
|
157
|
+
# Promote a RecognizerProposal into a live Recognizer for this corpus.
|
|
158
|
+
#
|
|
159
|
+
# Mechanics:
|
|
160
|
+
# 1. Synthesize a SynthesizedRecognizer from the proposal's prefix.
|
|
161
|
+
# 2. Switch to a per-corpus classifier (if we were sharing the
|
|
162
|
+
# module-level DEFAULT) so activation doesn't leak to other
|
|
163
|
+
# corpora using the same default singleton.
|
|
164
|
+
# 3. Register the Recognizer on the classifier — the ensemble
|
|
165
|
+
# picks it up on the next classify() call.
|
|
166
|
+
# 4. Persist the activation in storage so reopens re-apply it.
|
|
167
|
+
# 5. Reinfer so existing observations get re-classified through
|
|
168
|
+
# the new Recognizer.
|
|
169
|
+
#
|
|
170
|
+
# Returns the synthesized Recognizer.
|
|
171
|
+
def activate_proposal(proposal)
|
|
172
|
+
recognizer = SynthesizedRecognizer.from_proposal(proposal)
|
|
173
|
+
ensure_per_corpus_classifier!
|
|
174
|
+
@classifier.register_recognizer(recognizer)
|
|
175
|
+
if @storage.respond_to?(:record_activated_recognizer)
|
|
176
|
+
@storage.record_activated_recognizer(recognizer.to_dump)
|
|
177
|
+
end
|
|
178
|
+
reinfer
|
|
179
|
+
recognizer
|
|
180
|
+
end
|
|
181
|
+
|
|
182
|
+
# Convenience: activate every proposal whose confidence clears the
|
|
183
|
+
# given threshold. Returns the activated Recognizers. Confidence
|
|
184
|
+
# incorporates both per-position coverage AND cross-host
|
|
185
|
+
# corroboration — see RecognizerProposal#compute_confidence.
|
|
186
|
+
def activate_proposals_above(confidence_threshold, **propose_opts)
|
|
187
|
+
proposals = propose_recognizers(**propose_opts)
|
|
188
|
+
proposals.select { |p| p.confidence >= confidence_threshold }.map { |p| activate_proposal(p) }
|
|
189
|
+
end
|
|
190
|
+
|
|
191
|
+
# Number of activated recognizers persisted with this corpus.
|
|
192
|
+
def activated_recognizer_count
|
|
193
|
+
return @storage.activated_recognizer_count if @storage.respond_to?(:activated_recognizer_count)
|
|
194
|
+
0
|
|
195
|
+
end
|
|
196
|
+
|
|
197
|
+
# Route shapes that recur across `min_hosts` or more distinct hosts.
|
|
198
|
+
# Returns CrossHostShape records sorted by host_count desc, then by
|
|
199
|
+
# observation_count desc, then by shape (stable, deterministic).
|
|
200
|
+
#
|
|
201
|
+
# Cross-host recurrence is independent evidence of a real semantic
|
|
202
|
+
# pattern — two unrelated hosts inventing the same `/users/{integer}`
|
|
203
|
+
# structure by accident is unlikely. A natural follow-up is feeding
|
|
204
|
+
# this signal back into RecognizerProposal confidence: a proposal
|
|
205
|
+
# supported by N hosts is much stronger than one seen on a single
|
|
206
|
+
# host with the same per-position coverage.
|
|
207
|
+
def cross_host_shapes(min_hosts: 2)
|
|
208
|
+
by_shape = Hash.new { |h, k| h[k] = { hosts: Set.new, count: 0 } }
|
|
209
|
+
@storage.clusters.each do |cluster|
|
|
210
|
+
# Skip non-URL clusters (URN clusters have no host).
|
|
211
|
+
next if cluster.host.nil? || cluster.host.empty?
|
|
212
|
+
|
|
213
|
+
agg = by_shape[cluster.shape]
|
|
214
|
+
agg[:hosts] << cluster.host
|
|
215
|
+
agg[:count] += cluster.count
|
|
216
|
+
end
|
|
217
|
+
|
|
218
|
+
by_shape.filter_map do |shape, data|
|
|
219
|
+
next nil if data[:hosts].size < min_hosts
|
|
220
|
+
|
|
221
|
+
CrossHostShape.new(
|
|
222
|
+
shape: shape,
|
|
223
|
+
hosts: data[:hosts],
|
|
224
|
+
observation_count: data[:count],
|
|
225
|
+
)
|
|
226
|
+
end.sort_by { |s| [-s.host_count, -s.observation_count, s.shape] }
|
|
227
|
+
end
|
|
228
|
+
|
|
229
|
+
# Build the ordered Event list for `input` without applying it. Useful
|
|
230
|
+
# for inspection, tests, and future event-log persistence. Each call is
|
|
231
|
+
# pure — no storage side-effects.
|
|
232
|
+
def events_for(input)
|
|
70
233
|
iri = coerce(input)
|
|
71
234
|
hinted_entries = SegmentHints.derive(iri.path_segments, @classifier)
|
|
72
235
|
raw_shape = PathShape.new(classifier: @classifier, hints: false).from_entries(hinted_entries)
|
|
73
236
|
hinted_shape = PathShape.new(classifier: @classifier, hints: true).from_entries(hinted_entries)
|
|
237
|
+
keying_host = effective_host(iri.host)
|
|
74
238
|
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
prefix = ""
|
|
83
|
-
hinted_entries.each do |entry|
|
|
84
|
-
s.observe_position(iri.host, prefix, entry[:value], entry[:type])
|
|
85
|
-
prefix = "#{prefix}/#{placeholder(entry)}"
|
|
86
|
-
end
|
|
239
|
+
events = [
|
|
240
|
+
Event::HostSeen.new(keying_host),
|
|
241
|
+
Event::PathLengthSeen.new(iri.path_segments.size),
|
|
242
|
+
Event::RawShapeSeen.new(raw_shape),
|
|
243
|
+
Event::FingerprintSeen.new(hinted_shape),
|
|
244
|
+
]
|
|
87
245
|
|
|
88
|
-
|
|
89
|
-
|
|
246
|
+
prefix = ""
|
|
247
|
+
hinted_entries.each do |entry|
|
|
248
|
+
events << Event::PositionSeen.new(
|
|
249
|
+
Position.path(host: keying_host, prefix: prefix),
|
|
250
|
+
entry[:value], entry[:type],
|
|
251
|
+
)
|
|
252
|
+
prefix = "#{prefix}/#{placeholder(entry)}"
|
|
90
253
|
end
|
|
91
254
|
|
|
92
|
-
|
|
255
|
+
key, host, scheme, shape = Cluster.key_for(iri, classifier: @classifier, shape: hinted_shape, host: keying_host)
|
|
256
|
+
events << Event::ClusterAddition.new(key, host, scheme, shape, iri)
|
|
257
|
+
|
|
258
|
+
events
|
|
93
259
|
end
|
|
94
260
|
|
|
95
261
|
# Corpus-informed normalization. Falls back to mechanical normalization
|
|
96
|
-
# when the corpus has no signal for a position.
|
|
262
|
+
# when the corpus has no signal for a position. Implemented as a thin
|
|
263
|
+
# call into Normalizer with `evidence: self`; the corpus-informed path
|
|
264
|
+
# and query rendering live in #render_path / #render_query below
|
|
265
|
+
# (the evidence-source interface).
|
|
97
266
|
def normalize(input)
|
|
98
267
|
iri = coerce(input)
|
|
99
|
-
|
|
268
|
+
Normalizer.normalize_identifier(iri, classifier: @classifier, hints: true, evidence: self)
|
|
269
|
+
end
|
|
100
270
|
|
|
271
|
+
# Evidence-source interface — called by Normalizer when this Corpus is
|
|
272
|
+
# passed as `evidence:`. Renders the path using corpus-informed
|
|
273
|
+
# classifications (variability promotion, popular-outlier preservation).
|
|
274
|
+
# Always emits a leading "/" — empty path collapses to "/" to match
|
|
275
|
+
# mechanical output and anchor any trailing query.
|
|
276
|
+
def render_path(iri, _classifier, _hints)
|
|
101
277
|
tokens = annotate_segments(iri).map { |entry| corpus_token(entry) }
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
278
|
+
"/" + tokens.join("/")
|
|
279
|
+
end
|
|
280
|
+
|
|
281
|
+
# Evidence-source interface — render the query string with
|
|
282
|
+
# cluster-inferred param types where available. The mechanical
|
|
283
|
+
# NullEvidenceSource provides the classifier-only fallback; this
|
|
284
|
+
# version prefers the cluster's observed type per param (dominant
|
|
285
|
+
# type_count, subject to the corpus thresholds).
|
|
286
|
+
def render_query(iri, _classifier = @classifier)
|
|
287
|
+
hinted_shape = PathShape.new(classifier: @classifier, hints: true)
|
|
288
|
+
.from_entries(SegmentHints.derive(iri.path_segments, @classifier))
|
|
289
|
+
key, * = Cluster.key_for(iri, classifier: @classifier, shape: hinted_shape,
|
|
290
|
+
host: effective_host(iri.host))
|
|
291
|
+
cluster = @storage.cluster_for(key)
|
|
292
|
+
|
|
293
|
+
iri.query_params.keys.sort.map do |k|
|
|
294
|
+
v = iri.query_params[k].to_s
|
|
295
|
+
type = inferred_param_type(cluster, k, v)
|
|
296
|
+
shaped = render_param_value(v, type)
|
|
297
|
+
"#{k}=#{shaped}"
|
|
298
|
+
end.join("&")
|
|
299
|
+
end
|
|
300
|
+
|
|
301
|
+
# Inferred params for the cluster `input` would fall into. Returns the
|
|
302
|
+
# same shape as Cluster#param_summary — useful for "what query params
|
|
303
|
+
# might this URL accept?" tooling. Empty array if no cluster has been
|
|
304
|
+
# observed for this shape yet.
|
|
305
|
+
def params_for(input)
|
|
306
|
+
iri = coerce(input)
|
|
307
|
+
hinted_shape = PathShape.new(classifier: @classifier, hints: true)
|
|
308
|
+
.from_entries(SegmentHints.derive(iri.path_segments, @classifier))
|
|
309
|
+
key, * = Cluster.key_for(iri, classifier: @classifier, shape: hinted_shape,
|
|
310
|
+
host: effective_host(iri.host))
|
|
311
|
+
cluster = @storage.cluster_for(key)
|
|
312
|
+
cluster ? cluster.param_summary : []
|
|
108
313
|
end
|
|
109
314
|
|
|
110
315
|
# Per-segment explanation with corpus-informed `classification`.
|
|
@@ -123,7 +328,7 @@ module Iriq
|
|
|
123
328
|
def raw_shape_counts; @storage.raw_shape_counts; end
|
|
124
329
|
def fingerprint_counts; @storage.fingerprint_counts; end
|
|
125
330
|
|
|
126
|
-
# Iterates
|
|
331
|
+
# Iterates Position → PositionStats over all observed positions.
|
|
127
332
|
# Used by inspection tooling; not part of the hot path.
|
|
128
333
|
def each_position_stats(&block)
|
|
129
334
|
@storage.each_position_stats(&block)
|
|
@@ -137,10 +342,12 @@ module Iriq
|
|
|
137
342
|
@storage.cluster_size
|
|
138
343
|
end
|
|
139
344
|
|
|
140
|
-
# Stats for a given (host,
|
|
345
|
+
# Stats for a given (host, path-prefix) — useful for tests and
|
|
141
346
|
# debugging. Returns nil if nothing has been observed there.
|
|
142
|
-
|
|
143
|
-
|
|
347
|
+
# Accepts either a Position or (host, prefix) for ergonomics.
|
|
348
|
+
def stats_for(host_or_position, prefix = nil)
|
|
349
|
+
position = host_or_position.is_a?(Position) ? host_or_position : Position.path(host: host_or_position, prefix: prefix)
|
|
350
|
+
@storage.position_stats(position)
|
|
144
351
|
end
|
|
145
352
|
|
|
146
353
|
# Persist the corpus.
|
|
@@ -172,6 +379,27 @@ module Iriq
|
|
|
172
379
|
|
|
173
380
|
private
|
|
174
381
|
|
|
382
|
+
# If we're still sharing the module-level DEFAULT classifier, switch
|
|
383
|
+
# to our own copy so register_recognizer doesn't leak into other
|
|
384
|
+
# corpora using the same default singleton.
|
|
385
|
+
def ensure_per_corpus_classifier!
|
|
386
|
+
return if @classifier != SegmentClassifier::DEFAULT
|
|
387
|
+
|
|
388
|
+
@classifier = SegmentClassifier.new
|
|
389
|
+
end
|
|
390
|
+
|
|
391
|
+
# On Corpus.open, walk the stored activations and register each one
|
|
392
|
+
# on this corpus's classifier. Switches to a per-corpus classifier
|
|
393
|
+
# if any activations exist.
|
|
394
|
+
def reapply_activated_recognizers!
|
|
395
|
+
return if @storage.activated_recognizer_count.zero?
|
|
396
|
+
|
|
397
|
+
ensure_per_corpus_classifier!
|
|
398
|
+
@storage.each_activated_recognizer do |dump|
|
|
399
|
+
@classifier.register_recognizer(SynthesizedRecognizer.from_dump(dump))
|
|
400
|
+
end
|
|
401
|
+
end
|
|
402
|
+
|
|
175
403
|
def coerce(input)
|
|
176
404
|
input.is_a?(Identifier) ? input : Parser.parse(input)
|
|
177
405
|
end
|
|
@@ -179,8 +407,9 @@ module Iriq
|
|
|
179
407
|
def annotate_segments(iri)
|
|
180
408
|
hinted = SegmentHints.derive(iri.path_segments, @classifier)
|
|
181
409
|
prefix = ""
|
|
410
|
+
keying_host = effective_host(iri.host)
|
|
182
411
|
hinted.map do |entry|
|
|
183
|
-
stats = @storage.position_stats(
|
|
412
|
+
stats = @storage.position_stats(Position.path(host: keying_host, prefix: prefix))
|
|
184
413
|
out = entry.merge(
|
|
185
414
|
prefix: prefix,
|
|
186
415
|
classification: classify(entry, stats),
|
|
@@ -193,14 +422,28 @@ module Iriq
|
|
|
193
422
|
def placeholder(entry)
|
|
194
423
|
return entry[:value] unless entry[:variable]
|
|
195
424
|
|
|
196
|
-
"{#{entry[:hint] || entry[:type]}}"
|
|
425
|
+
"{#{entry[:hint] || SegmentClassifier.display_type(entry[:type])}}"
|
|
197
426
|
end
|
|
198
427
|
|
|
428
|
+
# Types whose values are often a small fixed set (or a single static
|
|
429
|
+
# value baked into a REST route). For these, run through the same
|
|
430
|
+
# cardinality / value-fraction analysis literals get — a dominant
|
|
431
|
+
# value gets preserved as :stable_literal instead of being
|
|
432
|
+
# placeholdered as a generic {version}/{slug}/etc.
|
|
433
|
+
#
|
|
434
|
+
# Slug + opaque_id are here because a lot of route literals
|
|
435
|
+
# accidentally match those shapes (`/users/{id}/create-new`,
|
|
436
|
+
# reference codes like `WK1234`). When a single value dominates the
|
|
437
|
+
# position, the literal is almost always the better display.
|
|
438
|
+
STABLE_VARIABLE_TYPES = %i[version locale currency boolean slug opaque_id].freeze
|
|
439
|
+
|
|
199
440
|
def classify(entry, stats)
|
|
200
441
|
variable = entry[:variable]
|
|
201
442
|
|
|
202
443
|
return variable ? :variable_identifier : :ambiguous if stats.nil? || stats.total.zero?
|
|
203
|
-
|
|
444
|
+
if variable && !STABLE_VARIABLE_TYPES.include?(entry[:type])
|
|
445
|
+
return :variable_identifier
|
|
446
|
+
end
|
|
204
447
|
|
|
205
448
|
value = entry[:value]
|
|
206
449
|
total = stats.total
|
|
@@ -209,6 +452,17 @@ module Iriq
|
|
|
209
452
|
enough_data = total >= MIN_OBSERVATIONS_FOR_INFERENCE
|
|
210
453
|
value_frac = stats.value_fraction(value)
|
|
211
454
|
|
|
455
|
+
# For STABLE_VARIABLE_TYPES (version, locale, currency, boolean, slug, opaque_id),
|
|
456
|
+
# a dominant value wins over the variable-dominance branch — a
|
|
457
|
+
# single-version /api/v1/... pattern stays as the literal `v1`
|
|
458
|
+
# rather than placeholdering to {version}. Without dominance,
|
|
459
|
+
# fall through to :variable_identifier (the per-type placeholder).
|
|
460
|
+
if variable
|
|
461
|
+
return :stable_literal if value_frac >= STABLE_LITERAL_THRESHOLD
|
|
462
|
+
|
|
463
|
+
return :variable_identifier
|
|
464
|
+
end
|
|
465
|
+
|
|
212
466
|
if enough_data && variable_frac >= VARIABLE_DOMINANCE_THRESHOLD
|
|
213
467
|
# Position is dominated by variable types (UUIDs, integers, etc.).
|
|
214
468
|
# A literal here is a special-case outlier (e.g. /users/me).
|
|
@@ -247,6 +501,28 @@ module Iriq
|
|
|
247
501
|
stats.value_fraction(value) >= POPULAR_BASELINE_MULTIPLE * baseline
|
|
248
502
|
end
|
|
249
503
|
|
|
504
|
+
def inferred_param_type(cluster, name, value)
|
|
505
|
+
# Prefer the cluster's confident type when we have enough samples;
|
|
506
|
+
# otherwise classify the current value directly. Cluster#param_type
|
|
507
|
+
# applies the :date quorum gate (see Cluster::DATE_CONFIDENCE_THRESHOLD).
|
|
508
|
+
stats = cluster && cluster.param_stats[name]
|
|
509
|
+
if stats && stats.total >= MIN_OBSERVATIONS_FOR_INFERENCE
|
|
510
|
+
cluster.param_type(name) || @classifier.classify(value)
|
|
511
|
+
else
|
|
512
|
+
@classifier.classify(value)
|
|
513
|
+
end
|
|
514
|
+
end
|
|
515
|
+
|
|
516
|
+
def render_param_value(value, type)
|
|
517
|
+
if type == :date && (canon = SegmentClassifier.canonical_date(value))
|
|
518
|
+
canon
|
|
519
|
+
elsif @classifier.variable?(type)
|
|
520
|
+
"{#{SegmentClassifier.display_type(type)}}"
|
|
521
|
+
else
|
|
522
|
+
value
|
|
523
|
+
end
|
|
524
|
+
end
|
|
525
|
+
|
|
250
526
|
def corpus_token(entry)
|
|
251
527
|
case entry[:classification]
|
|
252
528
|
when :variable_identifier, :corpus_inferred_variable
|
|
@@ -257,7 +533,13 @@ module Iriq
|
|
|
257
533
|
end
|
|
258
534
|
|
|
259
535
|
def placeholder_for_variable(entry)
|
|
260
|
-
|
|
536
|
+
# Dates render in canonical ISO form rather than as a `{date}` placeholder
|
|
537
|
+
# — matches what mechanical Iriq.normalize does for path segments and
|
|
538
|
+
# what render_param_value does for query params.
|
|
539
|
+
if entry[:type] == :date && (canon = SegmentClassifier.canonical_date(entry[:value]))
|
|
540
|
+
return canon
|
|
541
|
+
end
|
|
542
|
+
return "{#{entry[:hint] || SegmentClassifier.display_type(entry[:type])}}" if entry[:variable]
|
|
261
543
|
|
|
262
544
|
# corpus-inferred variable: classifier said literal, corpus says
|
|
263
545
|
# otherwise. Derive a hint from the prefix's last literal segment if
|
data/lib/iriq/cross_host_shape.rb
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
require "set"
|
|
2
|
+
|
|
3
|
+
module Iriq
|
|
4
|
+
# A route shape that recurs across multiple hosts.
|
|
5
|
+
#
|
|
6
|
+
# Emitted by Corpus#cross_host_shapes. The shape string ("/users/{user_id}")
|
|
7
|
+
# is the cluster's rendered placeholder form; two clusters with the same
|
|
8
|
+
# shape but different hosts coalesce into one CrossHostShape record.
|
|
9
|
+
#
|
|
10
|
+
# A shape appearing at N hosts is strong evidence of a semantic pattern
|
|
11
|
+
# rather than a host-local quirk — independent hosts are unlikely to
|
|
12
|
+
# invent the same `/users/{integer}` structure by accident. Future work
|
|
13
|
+
# can feed this signal into proposal confidence and corpus-informed
|
|
14
|
+
# normalization (raise weight when a Shape has cross-host support).
|
|
15
|
+
class CrossHostShape
|
|
16
|
+
attr_reader :shape, :hosts, :observation_count
|
|
17
|
+
|
|
18
|
+
def initialize(shape:, hosts:, observation_count:)
|
|
19
|
+
@shape = shape
|
|
20
|
+
@hosts = hosts.is_a?(Set) ? hosts.dup.freeze : Set.new(hosts).freeze
|
|
21
|
+
@observation_count = observation_count
|
|
22
|
+
end
|
|
23
|
+
|
|
24
|
+
def host_count
|
|
25
|
+
@hosts.size
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
def to_h
|
|
29
|
+
{
|
|
30
|
+
shape: @shape,
|
|
31
|
+
hosts: @hosts.to_a.sort,
|
|
32
|
+
host_count: host_count,
|
|
33
|
+
observation_count: @observation_count,
|
|
34
|
+
}
|
|
35
|
+
end
|
|
36
|
+
end
|
|
37
|
+
end
|
data/lib/iriq/event.rb
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
module Iriq
|
|
2
|
+
# Events are the atomic observation-time facts emitted by Corpus#observe
|
|
3
|
+
# before any state changes. A single observe(iri) call emits a small
|
|
4
|
+
# ordered list of Events; Reducers consume that list to update materialized
|
|
5
|
+
# views (host counts, position stats, clusters, etc.).
|
|
6
|
+
#
|
|
7
|
+
# Today the event list is transient — built fresh per observe(), applied,
|
|
8
|
+
# and discarded. The shape is in place so a future commit can persist the
|
|
9
|
+
# log and replay it to re-derive materialized views without re-feeding
|
|
10
|
+
# source IRIs (the "re-runnable inference" win from ROADMAP.md).
|
|
11
|
+
#
|
|
12
|
+
# Each Event is a Struct so callers can pattern-match on type and access
|
|
13
|
+
# fields positionally or by name.
|
|
14
|
+
module Event
|
|
15
|
+
HostSeen = Struct.new(:host)
|
|
16
|
+
PathLengthSeen = Struct.new(:length)
|
|
17
|
+
RawShapeSeen = Struct.new(:shape)
|
|
18
|
+
FingerprintSeen = Struct.new(:shape)
|
|
19
|
+
PositionSeen = Struct.new(:position, :value, :type)
|
|
20
|
+
ClusterAddition = Struct.new(:key, :host, :scheme, :shape, :identifier)
|
|
21
|
+
end
|
|
22
|
+
end
|
data/lib/iriq/evidence.rb
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
module Iriq
|
|
2
|
+
# Evidence is the structured substrate for explanation. Each Record
|
|
3
|
+
# captures one fact about the system's reasoning: "this segment
|
|
4
|
+
# classified as :integer because the Integer recognizer fired with
|
|
5
|
+
# specificity TYPED", "the IPv4 type collapses to {ip} by policy",
|
|
6
|
+
# "Position P is mostly variable because of corpus stats".
|
|
7
|
+
#
|
|
8
|
+
# Trace and Explanation are views over a list of Evidence records;
|
|
9
|
+
# the structured form is what programmatic consumers (test assertions,
|
|
10
|
+
# PR-diff annotators, downstream tooling) should build on. Human note
|
|
11
|
+
# strings emitted by Trace are derived from Evidence payloads, so
|
|
12
|
+
# adding a new note kind starts with adding a new Evidence shape.
|
|
13
|
+
#
|
|
14
|
+
# Two axes:
|
|
15
|
+
#
|
|
16
|
+
# subject_kind ∈ {:segment, :position, :cluster}
|
|
17
|
+
# What this Evidence is about. Today most Evidence is :segment
|
|
18
|
+
# (per-segment classification facts). :position and :cluster
|
|
19
|
+
# Evidence become load-bearing once corpus-informed Trace lands
|
|
20
|
+
# in a follow-up step.
|
|
21
|
+
#
|
|
22
|
+
# source ∈ {:lexical, :recognizer, :corpus, :neighbor, :policy}
|
|
23
|
+
# What kind of fact is being asserted.
|
|
24
|
+
# :lexical — pure shape match (e.g. "matches DATE_RE")
|
|
25
|
+
# :recognizer — a named Recognizer fired with confidence/specificity
|
|
26
|
+
# :corpus — aggregated counts/distributions support this
|
|
27
|
+
# :neighbor — adjacent context informed this (prior literal,
|
|
28
|
+
# param name hint)
|
|
29
|
+
# :policy — a normalization policy applied (ip umbrella
|
|
30
|
+
# collapse, canonical date, currency upcase)
|
|
31
|
+
module Evidence
|
|
32
|
+
SUBJECT_KINDS = %i[segment position cluster].freeze
|
|
33
|
+
SOURCES = %i[lexical recognizer corpus neighbor policy].freeze
|
|
34
|
+
|
|
35
|
+
# A single Evidence fact.
|
|
36
|
+
#
|
|
37
|
+
# subject_kind — :segment | :position | :cluster
|
|
38
|
+
# subject — kind-specific identity:
|
|
39
|
+
# :segment → { index:, value: }
|
|
40
|
+
# :position → Iriq::Position
|
|
41
|
+
# :cluster → cluster key (string)
|
|
42
|
+
# source — :lexical | :recognizer | :corpus | :neighbor | :policy
|
|
43
|
+
# payload — source-and-kind-specific structured data
|
|
44
|
+
# weight — optional float in [0,1] — contribution to the
|
|
45
|
+
# ultimate decision. Set when scoring is meaningful;
|
|
46
|
+
# nil otherwise.
|
|
47
|
+
# notes — optional human-readable strings. Trace renders
|
|
48
|
+
# these directly; programmatic consumers can ignore.
|
|
49
|
+
class Record
|
|
50
|
+
attr_reader :subject_kind, :subject, :source, :payload, :weight, :notes
|
|
51
|
+
|
|
52
|
+
def initialize(subject_kind:, subject:, source:, payload:, weight: nil, notes: [])
|
|
53
|
+
unless SUBJECT_KINDS.include?(subject_kind)
|
|
54
|
+
raise ArgumentError, "subject_kind must be one of #{SUBJECT_KINDS.inspect}"
|
|
55
|
+
end
|
|
56
|
+
unless SOURCES.include?(source)
|
|
57
|
+
raise ArgumentError, "source must be one of #{SOURCES.inspect}"
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
@subject_kind = subject_kind
|
|
61
|
+
@subject = subject
|
|
62
|
+
@source = source
|
|
63
|
+
@payload = payload || {}
|
|
64
|
+
@weight = weight
|
|
65
|
+
@notes = notes || []
|
|
66
|
+
end
|
|
67
|
+
|
|
68
|
+
def to_h
|
|
69
|
+
{
|
|
70
|
+
subject_kind: @subject_kind,
|
|
71
|
+
subject: subject_serialized,
|
|
72
|
+
source: @source,
|
|
73
|
+
payload: @payload,
|
|
74
|
+
weight: @weight,
|
|
75
|
+
notes: @notes,
|
|
76
|
+
}.compact
|
|
77
|
+
end
|
|
78
|
+
|
|
79
|
+
private
|
|
80
|
+
|
|
81
|
+
def subject_serialized
|
|
82
|
+
return @subject.to_h if @subject.respond_to?(:to_h) && !@subject.is_a?(Hash)
|
|
83
|
+
@subject
|
|
84
|
+
end
|
|
85
|
+
end
|
|
86
|
+
|
|
87
|
+
module_function
|
|
88
|
+
|
|
89
|
+
# Factories so call sites don't have to repeat subject_kind:.
|
|
90
|
+
def segment(index:, value:, source:, payload:, weight: nil, notes: [])
|
|
91
|
+
Record.new(
|
|
92
|
+
subject_kind: :segment,
|
|
93
|
+
subject: { index: index, value: value },
|
|
94
|
+
source: source, payload: payload, weight: weight, notes: notes,
|
|
95
|
+
)
|
|
96
|
+
end
|
|
97
|
+
|
|
98
|
+
def position(position:, source:, payload:, weight: nil, notes: [])
|
|
99
|
+
Record.new(
|
|
100
|
+
subject_kind: :position,
|
|
101
|
+
subject: position,
|
|
102
|
+
source: source, payload: payload, weight: weight, notes: notes,
|
|
103
|
+
)
|
|
104
|
+
end
|
|
105
|
+
|
|
106
|
+
def cluster(key:, source:, payload:, weight: nil, notes: [])
|
|
107
|
+
Record.new(
|
|
108
|
+
subject_kind: :cluster,
|
|
109
|
+
subject: key,
|
|
110
|
+
source: source, payload: payload, weight: weight, notes: notes,
|
|
111
|
+
)
|
|
112
|
+
end
|
|
113
|
+
end
|
|
114
|
+
end
|
data/lib/iriq/explanation.rb
CHANGED
|
@@ -4,7 +4,7 @@ module Iriq
|
|
|
4
4
|
# Explanation.explain("https://foo.com/users/123")
|
|
5
5
|
# # => [
|
|
6
6
|
# # { value: "users", type: :literal, variable: false, hint: nil },
|
|
7
|
-
# # { value: "123", type: :
|
|
7
|
+
# # { value: "123", type: :integer, variable: true, hint: "user_id" },
|
|
8
8
|
# # ]
|
|
9
9
|
module Explanation
|
|
10
10
|
module_function
|