iriq 0.1.0 → 0.30.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +87 -0
  3. data/CLAUDE.md +208 -0
  4. data/Gemfile.lock +8 -2
  5. data/Makefile +113 -0
  6. data/README.md +249 -270
  7. data/completions/_iriq +52 -0
  8. data/completions/iriq.bash +70 -0
  9. data/docs/ARCHITECTURE.md +223 -0
  10. data/docs/ROADMAP.md +190 -0
  11. data/iriq.gemspec +5 -4
  12. data/lib/iriq/cli.rb +402 -49
  13. data/lib/iriq/cluster.rb +304 -8
  14. data/lib/iriq/clusterer.rb +19 -44
  15. data/lib/iriq/corpus.rb +417 -81
  16. data/lib/iriq/cross_host_shape.rb +37 -0
  17. data/lib/iriq/event.rb +22 -0
  18. data/lib/iriq/evidence.rb +114 -0
  19. data/lib/iriq/explanation.rb +1 -1
  20. data/lib/iriq/normalizer.rb +71 -29
  21. data/lib/iriq/parser.rb +1 -1
  22. data/lib/iriq/path_shape.rb +30 -24
  23. data/lib/iriq/position.rb +75 -0
  24. data/lib/iriq/position_stats.rb +74 -8
  25. data/lib/iriq/recognizer.rb +54 -0
  26. data/lib/iriq/recognizer_proposal.rb +167 -0
  27. data/lib/iriq/recognizers/date.rb +53 -0
  28. data/lib/iriq/recognizers/integer.rb +37 -0
  29. data/lib/iriq/recognizers/uuid.rb +16 -0
  30. data/lib/iriq/reducer.rb +37 -0
  31. data/lib/iriq/registrable_domain.rb +56 -0
  32. data/lib/iriq/segment_classifier.rb +475 -23
  33. data/lib/iriq/segment_hints.rb +9 -0
  34. data/lib/iriq/shape.rb +106 -0
  35. data/lib/iriq/specificity.rb +35 -0
  36. data/lib/iriq/storage/json.rb +43 -0
  37. data/lib/iriq/storage/memory.rb +209 -0
  38. data/lib/iriq/storage/sqlite.rb +546 -0
  39. data/lib/iriq/storage.rb +35 -0
  40. data/lib/iriq/synthesized_recognizer.rb +56 -0
  41. data/lib/iriq/trace.rb +294 -0
  42. data/lib/iriq/version.rb +1 -1
  43. data/lib/iriq.rb +18 -0
  44. metadata +44 -8
  45. data/script/benchmark.rb +0 -81
  46. data/script/memory.rb +0 -121
data/lib/iriq/corpus.rb CHANGED
@@ -7,6 +7,10 @@ module Iriq
7
7
  #
8
8
  # The deterministic, single-IRI API (Iriq.normalize/explain) is unchanged —
9
9
  # Corpus#normalize and Corpus#explain are the corpus-informed variants.
10
+ #
11
+ # State lives in a Storage backend (Memory by default; Json or Sqlite when
12
+ # opened against a file). The classification logic on top is identical
13
+ # regardless of where the counters live.
10
14
  class Corpus
11
15
  # Type-based: position is "mostly variable" (UUIDs/integers/etc.).
12
16
  VARIABLE_DOMINANCE_THRESHOLD = 0.8
@@ -38,44 +42,274 @@ module Iriq
38
42
  POPULAR_MIN_COUNT = 5
39
43
  POPULAR_BASELINE_MULTIPLE = 3
40
44
 
41
- attr_reader :host_counts, :path_length_counts, :raw_shape_counts,
42
- :fingerprint_counts, :position_stats
45
+ HOST_STRATEGIES = %i[full registrable none].freeze
46
+
47
+ attr_reader :storage, :host_strategy, :classifier
43
48
 
44
49
  def initialize(classifier: SegmentClassifier::DEFAULT,
45
- max_values_per_position: PositionStats::DEFAULT_MAX_VALUES)
46
- @classifier = classifier
47
- @max_values_per_position = max_values_per_position
48
- @host_counts = Hash.new(0)
49
- @path_length_counts = Hash.new(0)
50
- @raw_shape_counts = Hash.new(0)
51
- @fingerprint_counts = Hash.new(0)
52
- @position_stats = {}
53
- @clusterer = Clusterer.new(classifier: classifier)
50
+ max_values_per_position: PositionStats::DEFAULT_MAX_VALUES,
51
+ host_strategy: :full,
52
+ storage: nil)
53
+ raise ArgumentError, "host_strategy must be one of #{HOST_STRATEGIES.inspect}" \
54
+ unless HOST_STRATEGIES.include?(host_strategy)
55
+
56
+ @classifier = classifier
57
+ @host_strategy = host_strategy
58
+ @storage = storage || Storage::Memory.new(
59
+ classifier: classifier,
60
+ max_values_per_position: max_values_per_position,
61
+ )
62
+ end
63
+
64
+ # Open a corpus against `path`. File extension picks the backend:
65
+ # `.db`/`.sqlite`/`.sqlite3` use SQLite (incremental writes); anything
66
+ # else uses JSON.
67
+ def self.open(path, classifier: SegmentClassifier::DEFAULT,
68
+ max_values_per_position: PositionStats::DEFAULT_MAX_VALUES,
69
+ host_strategy: :full)
70
+ storage = Storage.open(path,
71
+ classifier: classifier,
72
+ max_values_per_position: max_values_per_position)
73
+ corpus = new(classifier: classifier, storage: storage, host_strategy: host_strategy)
74
+ corpus.send(:reapply_activated_recognizers!) if storage.respond_to?(:each_activated_recognizer)
75
+ corpus
76
+ end
77
+
78
+ # Normalize the host for keying purposes. `:full` keeps the original
79
+ # host; `:registrable` collapses subdomains via the inline-PSL heuristic
80
+ # (api.foo.com + app.foo.com → foo.com); `:none` ignores host entirely
81
+ # so clusters group across all hosts by shape alone.
82
+ def effective_host(host)
83
+ case @host_strategy
84
+ when :registrable then RegistrableDomain.for(host)
85
+ when :none then ""
86
+ else host
87
+ end
54
88
  end
55
89
 
56
90
  # Observe a single IRI. Returns an Observation.
91
+ #
92
+ # Internally: builds an Event list for the IRI, then applies each event
93
+ # through the Reducer registry inside a single storage transaction. The
94
+ # event list is transient today — a future commit can persist it and
95
+ # replay against alternate reducers / thresholds for re-runnable
96
+ # inference. See lib/iriq/event.rb and lib/iriq/reducer.rb.
57
97
  def observe(input)
98
+ iri = coerce(input)
99
+ events = events_for(iri)
100
+ cluster = nil
101
+
102
+ @storage.transaction do |s|
103
+ events.each do |e|
104
+ result = Reducer.apply(e, s)
105
+ cluster = result if e.is_a?(Event::ClusterAddition)
106
+ end
107
+ s.record_observation(iri.canonical) if s.respond_to?(:record_observation)
108
+ end
109
+
110
+ Observation.new(corpus: self, identifier: iri, cluster: cluster)
111
+ end
112
+
113
+ # Drop every materialized view (host counts, position stats, clusters,
114
+ # …) and rebuild them by replaying the source-IRI log through the
115
+ # current events + reducers pipeline. Useful for:
116
+ #
117
+ # - Tuning thresholds (swap a Corpus constant, call reinfer)
118
+ # - Swapping the classifier (open the Corpus with a different
119
+ # classifier, call reinfer — events are re-derived from raw IRIs)
120
+ # - Recovering after a Reducer-set change
121
+ #
122
+ # Wrapped in a single backend transaction so a failure mid-replay
123
+ # leaves the prior views intact.
124
+ def reinfer
125
+ @storage.transaction do |s|
126
+ iris = []
127
+ s.each_observed_iri { |canonical| iris << canonical }
128
+ s.clear_materialized_views
129
+ iris.each do |canonical|
130
+ iri = Parser.parse(canonical)
131
+ events_for(iri).each { |e| Reducer.apply(e, s) }
132
+ end
133
+ end
134
+ nil
135
+ end
136
+
137
+ # Number of IRIs in the source-IRI log. The materialized views are
138
+ # derived from this log; reinfer replays it.
139
+ def observed_iri_count
140
+ return @storage.observed_iri_count if @storage.respond_to?(:observed_iri_count)
141
+ 0
142
+ end
143
+
144
+ # Scan observed values for shape patterns that recur frequently enough
145
+ # to suggest a new Recognizer. Returns RecognizerProposal records; nothing
146
+ # is automatically applied — the proposal carries enough evidence for a
147
+ # human to decide whether to bake the Recognizer in.
148
+ #
149
+ # Strategies are pluggable; the default set lives in
150
+ # Iriq::ProposalStrategy::DEFAULTS. Pass `strategies:` to limit / extend.
151
+ # Pass `min_observations:` / `min_coverage:` / `min_hosts:` to tune
152
+ # what passes the noise floor.
153
+ def propose_recognizers(strategies: ProposalStrategy::DEFAULTS, **opts)
154
+ strategies.flat_map { |s| s.propose(@storage, **opts) }
155
+ end
156
+
157
+ # Promote a RecognizerProposal into a live Recognizer for this corpus.
158
+ #
159
+ # Mechanics:
160
+ # 1. Synthesize a SynthesizedRecognizer from the proposal's prefix.
161
+ # 2. Switch to a per-corpus classifier (if we were sharing the
162
+ # module-level DEFAULT) so activation doesn't leak to other
163
+ # corpora using the same default singleton.
164
+ # 3. Register the Recognizer on the classifier — the ensemble
165
+ # picks it up on the next classify() call.
166
+ # 4. Persist the activation in storage so reopens re-apply it.
167
+ # 5. Reinfer so existing observations get re-classified through
168
+ # the new Recognizer.
169
+ #
170
+ # Returns the synthesized Recognizer.
171
+ def activate_proposal(proposal)
172
+ recognizer = SynthesizedRecognizer.from_proposal(proposal)
173
+ ensure_per_corpus_classifier!
174
+ @classifier.register_recognizer(recognizer)
175
+ if @storage.respond_to?(:record_activated_recognizer)
176
+ @storage.record_activated_recognizer(recognizer.to_dump)
177
+ end
178
+ reinfer
179
+ recognizer
180
+ end
181
+
182
+ # Convenience: activate every proposal whose confidence clears the
183
+ # given threshold. Returns the activated Recognizers. Confidence
184
+ # incorporates both per-position coverage AND cross-host
185
+ # corroboration — see RecognizerProposal#compute_confidence.
186
+ def activate_proposals_above(confidence_threshold, **propose_opts)
187
+ proposals = propose_recognizers(**propose_opts)
188
+ proposals.select { |p| p.confidence >= confidence_threshold }.map { |p| activate_proposal(p) }
189
+ end
190
+
191
+ # Number of activated recognizers persisted with this corpus.
192
+ def activated_recognizer_count
193
+ return @storage.activated_recognizer_count if @storage.respond_to?(:activated_recognizer_count)
194
+ 0
195
+ end
196
+
197
+ # Route shapes that recur across `min_hosts` or more distinct hosts.
198
+ # Returns CrossHostShape records sorted by host_count desc, then by
199
+ # observation_count desc, then by shape (stable, deterministic).
200
+ #
201
+ # Cross-host recurrence is independent evidence of a real semantic
202
+ # pattern — two unrelated hosts inventing the same `/users/{integer}`
203
+ # structure by accident is unlikely. A natural follow-up is feeding
204
+ # this signal back into RecognizerProposal confidence: a proposal
205
+ # supported by N hosts is much stronger than one seen on a single
206
+ # host with the same per-position coverage.
207
+ def cross_host_shapes(min_hosts: 2)
208
+ by_shape = Hash.new { |h, k| h[k] = { hosts: Set.new, count: 0 } }
209
+ @storage.clusters.each do |cluster|
210
+ # Skip non-URL clusters (URN clusters have no host).
211
+ next if cluster.host.nil? || cluster.host.empty?
212
+
213
+ agg = by_shape[cluster.shape]
214
+ agg[:hosts] << cluster.host
215
+ agg[:count] += cluster.count
216
+ end
217
+
218
+ by_shape.filter_map do |shape, data|
219
+ next nil if data[:hosts].size < min_hosts
220
+
221
+ CrossHostShape.new(
222
+ shape: shape,
223
+ hosts: data[:hosts],
224
+ observation_count: data[:count],
225
+ )
226
+ end.sort_by { |s| [-s.host_count, -s.observation_count, s.shape] }
227
+ end
228
+
229
+ # Build the ordered Event list for `input` without applying it. Useful
230
+ # for inspection, tests, and future event-log persistence. Each call is
231
+ # pure — no storage side-effects.
232
+ def events_for(input)
58
233
  iri = coerce(input)
59
234
  hinted_entries = SegmentHints.derive(iri.path_segments, @classifier)
60
- record_aggregates(iri, hinted_entries)
235
+ raw_shape = PathShape.new(classifier: @classifier, hints: false).from_entries(hinted_entries)
61
236
  hinted_shape = PathShape.new(classifier: @classifier, hints: true).from_entries(hinted_entries)
62
- cluster = @clusterer.add(iri, shape: hinted_shape)
63
- Observation.new(corpus: self, identifier: iri, cluster: cluster)
237
+ keying_host = effective_host(iri.host)
238
+
239
+ events = [
240
+ Event::HostSeen.new(keying_host),
241
+ Event::PathLengthSeen.new(iri.path_segments.size),
242
+ Event::RawShapeSeen.new(raw_shape),
243
+ Event::FingerprintSeen.new(hinted_shape),
244
+ ]
245
+
246
+ prefix = ""
247
+ hinted_entries.each do |entry|
248
+ events << Event::PositionSeen.new(
249
+ Position.path(host: keying_host, prefix: prefix),
250
+ entry[:value], entry[:type],
251
+ )
252
+ prefix = "#{prefix}/#{placeholder(entry)}"
253
+ end
254
+
255
+ key, host, scheme, shape = Cluster.key_for(iri, classifier: @classifier, shape: hinted_shape, host: keying_host)
256
+ events << Event::ClusterAddition.new(key, host, scheme, shape, iri)
257
+
258
+ events
64
259
  end
65
260
 
66
261
  # Corpus-informed normalization. Falls back to mechanical normalization
67
- # when the corpus has no signal for a position.
262
+ # when the corpus has no signal for a position. Implemented as a thin
263
+ # call into Normalizer with `evidence: self`; the corpus-informed path
264
+ # and query rendering live in #render_path / #render_query below
265
+ # (the evidence-source interface).
68
266
  def normalize(input)
69
267
  iri = coerce(input)
70
- return Normalizer.normalize_identifier(iri) if iri.urn? || iri.path_segments.empty?
268
+ Normalizer.normalize_identifier(iri, classifier: @classifier, hints: true, evidence: self)
269
+ end
71
270
 
271
+ # Evidence-source interface — called by Normalizer when this Corpus is
272
+ # passed as `evidence:`. Renders the path using corpus-informed
273
+ # classifications (variability promotion, popular-outlier preservation).
274
+ # Always emits a leading "/" — empty path collapses to "/" to match
275
+ # mechanical output and anchor any trailing query.
276
+ def render_path(iri, _classifier, _hints)
72
277
  tokens = annotate_segments(iri).map { |entry| corpus_token(entry) }
73
- out = +""
74
- out << "#{iri.scheme}://" if iri.scheme
75
- out << iri.host if iri.host
76
- out << ":#{iri.port}" if iri.port
77
- out << "/" << tokens.join("/")
78
- out
278
+ "/" + tokens.join("/")
279
+ end
280
+
281
+ # Evidence-source interface — render the query string with
282
+ # cluster-inferred param types where available. The mechanical
283
+ # NullEvidenceSource provides the classifier-only fallback; this
284
+ # version prefers the cluster's observed type per param (dominant
285
+ # type_count, subject to the corpus thresholds).
286
+ def render_query(iri, _classifier = @classifier)
287
+ hinted_shape = PathShape.new(classifier: @classifier, hints: true)
288
+ .from_entries(SegmentHints.derive(iri.path_segments, @classifier))
289
+ key, * = Cluster.key_for(iri, classifier: @classifier, shape: hinted_shape,
290
+ host: effective_host(iri.host))
291
+ cluster = @storage.cluster_for(key)
292
+
293
+ iri.query_params.keys.sort.map do |k|
294
+ v = iri.query_params[k].to_s
295
+ type = inferred_param_type(cluster, k, v)
296
+ shaped = render_param_value(v, type)
297
+ "#{k}=#{shaped}"
298
+ end.join("&")
299
+ end
300
+
301
+ # Inferred params for the cluster `input` would fall into. Returns the
302
+ # same shape as Cluster#param_summary — useful for "what query params
303
+ # might this URL accept?" tooling. Empty array if no cluster has been
304
+ # observed for this shape yet.
305
+ def params_for(input)
306
+ iri = coerce(input)
307
+ hinted_shape = PathShape.new(classifier: @classifier, hints: true)
308
+ .from_entries(SegmentHints.derive(iri.path_segments, @classifier))
309
+ key, * = Cluster.key_for(iri, classifier: @classifier, shape: hinted_shape,
310
+ host: effective_host(iri.host))
311
+ cluster = @storage.cluster_for(key)
312
+ cluster ? cluster.param_summary : []
79
313
  end
80
314
 
81
315
  # Per-segment explanation with corpus-informed `classification`.
@@ -89,55 +323,93 @@ module Iriq
89
323
  end
90
324
  end
91
325
 
326
+ def host_counts; @storage.host_counts; end
327
+ def path_length_counts; @storage.path_length_counts; end
328
+ def raw_shape_counts; @storage.raw_shape_counts; end
329
+ def fingerprint_counts; @storage.fingerprint_counts; end
330
+
331
+ # Iterates Position → PositionStats over all observed positions.
332
+ # Used by inspection tooling; not part of the hot path.
333
+ def each_position_stats(&block)
334
+ @storage.each_position_stats(&block)
335
+ end
336
+
92
337
  def clusters
93
- @clusterer.clusters
338
+ @storage.clusters
94
339
  end
95
340
 
96
341
  def size
97
- @clusterer.size
342
+ @storage.cluster_size
98
343
  end
99
344
 
100
- # Stats for a given (host, prefix_shape) — useful for tests and
345
+ # Stats for a given (host, path-prefix) — useful for tests and
101
346
  # debugging. Returns nil if nothing has been observed there.
102
- def stats_for(host, prefix)
103
- @position_stats[[host, prefix]]
347
+ # Accepts either a Position or (host, prefix) for ergonomics.
348
+ def stats_for(host_or_position, prefix = nil)
349
+ position = host_or_position.is_a?(Position) ? host_or_position : Position.path(host: host_or_position, prefix: prefix)
350
+ @storage.position_stats(position)
104
351
  end
105
352
 
106
- private
353
+ # Persist the corpus.
354
+ #
355
+ # save() → flush the backend in place (JSON writes its file,
356
+ # SQLite is already on disk).
357
+ # save(same_path) → same as save() — idempotent for the backend's path.
358
+ # save(other_path) → export to other_path as JSON, regardless of the
359
+ # live backend.
360
+ def save(path = nil)
361
+ backend_path = @storage.respond_to?(:path) ? @storage.path : nil
362
+ if path.nil? || path == backend_path
363
+ @storage.save
364
+ else
365
+ write_json_dump(path)
366
+ end
367
+ end
107
368
 
108
- def coerce(input)
109
- input.is_a?(Identifier) ? input : Parser.parse(input)
369
+ def close
370
+ @storage.close
371
+ end
372
+
373
+ # Wrap many observations in a single backend transaction. For SQLite this
374
+ # turns thousands of fsyncs into one; for in-memory backends it's a
375
+ # no-op. Use when ingesting a batch.
376
+ def batch(&block)
377
+ @storage.batch(&block)
110
378
  end
111
379
 
112
- def record_aggregates(iri, hinted_entries)
113
- @host_counts[iri.host] += 1 if iri.host
114
- @path_length_counts[iri.path_segments.size] += 1
380
+ private
115
381
 
116
- raw = PathShape.new(classifier: @classifier, hints: false).from_entries(hinted_entries)
117
- fp = PathShape.new(classifier: @classifier, hints: true).from_entries(hinted_entries)
118
- @raw_shape_counts[raw] += 1
119
- @fingerprint_counts[fp] += 1
382
+ # If we're still sharing the module-level DEFAULT classifier, switch
383
+ # to our own copy so register_recognizer doesn't leak into other
384
+ # corpora using the same default singleton.
385
+ def ensure_per_corpus_classifier!
386
+ return if @classifier != SegmentClassifier::DEFAULT
120
387
 
121
- record_position_stats(iri, hinted_entries)
388
+ @classifier = SegmentClassifier.new
122
389
  end
123
390
 
124
- def record_position_stats(iri, hinted_entries)
125
- prefix = ""
126
- hinted_entries.each do |entry|
127
- key = [iri.host, prefix]
128
- stats = @position_stats[key] ||= PositionStats.new(max_values: @max_values_per_position)
129
- stats.observe(entry[:value], entry[:type])
130
- prefix = "#{prefix}/#{placeholder(entry)}"
391
+ # On Corpus.open, walk the stored activations and register each one
392
+ # on this corpus's classifier. Switches to a per-corpus classifier
393
+ # if any activations exist.
394
+ def reapply_activated_recognizers!
395
+ return if @storage.activated_recognizer_count.zero?
396
+
397
+ ensure_per_corpus_classifier!
398
+ @storage.each_activated_recognizer do |dump|
399
+ @classifier.register_recognizer(SynthesizedRecognizer.from_dump(dump))
131
400
  end
132
401
  end
133
402
 
134
- # Walks the IRI's segments and returns hint-derived entries enriched with
135
- # the (host, prefix) PositionStats reference and a :classification symbol.
403
+ def coerce(input)
404
+ input.is_a?(Identifier) ? input : Parser.parse(input)
405
+ end
406
+
136
407
  def annotate_segments(iri)
137
408
  hinted = SegmentHints.derive(iri.path_segments, @classifier)
138
409
  prefix = ""
410
+ keying_host = effective_host(iri.host)
139
411
  hinted.map do |entry|
140
- stats = @position_stats[[iri.host, prefix]]
412
+ stats = @storage.position_stats(Position.path(host: keying_host, prefix: prefix))
141
413
  out = entry.merge(
142
414
  prefix: prefix,
143
415
  classification: classify(entry, stats),
@@ -150,14 +422,28 @@ module Iriq
150
422
  def placeholder(entry)
151
423
  return entry[:value] unless entry[:variable]
152
424
 
153
- "{#{entry[:hint] || entry[:type]}}"
425
+ "{#{entry[:hint] || SegmentClassifier.display_type(entry[:type])}}"
154
426
  end
155
427
 
428
+ # Types whose values are often a small fixed set (or a single static
429
+ # value baked into a REST route). For these, run through the same
430
+ # cardinality / value-fraction analysis literals get — a dominant
431
+ # value gets preserved as :stable_literal instead of being
432
+ # placeholdered as a generic {version}/{slug}/etc.
433
+ #
434
+ # Slug + opaque_id are here because a lot of route literals
435
+ # accidentally match those shapes (`/users/{id}/create-new`,
436
+ # reference codes like `WK1234`). When a single value dominates the
437
+ # position, the literal is almost always the better display.
438
+ STABLE_VARIABLE_TYPES = %i[version locale currency boolean slug opaque_id].freeze
439
+
156
440
  def classify(entry, stats)
157
441
  variable = entry[:variable]
158
442
 
159
443
  return variable ? :variable_identifier : :ambiguous if stats.nil? || stats.total.zero?
160
- return :variable_identifier if variable
444
+ if variable && !STABLE_VARIABLE_TYPES.include?(entry[:type])
445
+ return :variable_identifier
446
+ end
161
447
 
162
448
  value = entry[:value]
163
449
  total = stats.total
@@ -166,6 +452,17 @@ module Iriq
166
452
  enough_data = total >= MIN_OBSERVATIONS_FOR_INFERENCE
167
453
  value_frac = stats.value_fraction(value)
168
454
 
455
+ # For STABLE_VARIABLE_TYPES (version, locale, currency, boolean, slug, opaque_id),
456
+ # a dominant value wins over the variable-dominance branch — a
457
+ # single-version /api/v1/... pattern stays as the literal `v1`
458
+ # rather than placeholdering to {version}. Without dominance,
459
+ # fall through to :variable_identifier (the per-type placeholder).
460
+ if variable
461
+ return :stable_literal if value_frac >= STABLE_LITERAL_THRESHOLD
462
+
463
+ return :variable_identifier
464
+ end
465
+
169
466
  if enough_data && variable_frac >= VARIABLE_DOMINANCE_THRESHOLD
170
467
  # Position is dominated by variable types (UUIDs, integers, etc.).
171
468
  # A literal here is a special-case outlier (e.g. /users/me).
@@ -204,6 +501,28 @@ module Iriq
204
501
  stats.value_fraction(value) >= POPULAR_BASELINE_MULTIPLE * baseline
205
502
  end
206
503
 
504
+ def inferred_param_type(cluster, name, value)
505
+ # Prefer the cluster's confident type when we have enough samples;
506
+ # otherwise classify the current value directly. Cluster#param_type
507
+ # applies the :date quorum gate (see Cluster::DATE_CONFIDENCE_THRESHOLD).
508
+ stats = cluster && cluster.param_stats[name]
509
+ if stats && stats.total >= MIN_OBSERVATIONS_FOR_INFERENCE
510
+ cluster.param_type(name) || @classifier.classify(value)
511
+ else
512
+ @classifier.classify(value)
513
+ end
514
+ end
515
+
516
+ def render_param_value(value, type)
517
+ if type == :date && (canon = SegmentClassifier.canonical_date(value))
518
+ canon
519
+ elsif @classifier.variable?(type)
520
+ "{#{SegmentClassifier.display_type(type)}}"
521
+ else
522
+ value
523
+ end
524
+ end
525
+
207
526
  def corpus_token(entry)
208
527
  case entry[:classification]
209
528
  when :variable_identifier, :corpus_inferred_variable
@@ -214,7 +533,13 @@ module Iriq
214
533
  end
215
534
 
216
535
  def placeholder_for_variable(entry)
217
- return "{#{entry[:hint] || entry[:type]}}" if entry[:variable]
536
+ # Dates render in canonical ISO form rather than as a `{date}` placeholder
537
+ # — matches what mechanical Iriq.normalize does for path segments and
538
+ # what render_param_value does for query params.
539
+ if entry[:type] == :date && (canon = SegmentClassifier.canonical_date(entry[:value]))
540
+ return canon
541
+ end
542
+ return "{#{entry[:hint] || SegmentClassifier.display_type(entry[:type])}}" if entry[:variable]
218
543
 
219
544
  # corpus-inferred variable: classifier said literal, corpus says
220
545
  # otherwise. Derive a hint from the prefix's last literal segment if
@@ -226,43 +551,54 @@ module Iriq
226
551
 
227
552
  public
228
553
 
554
+ # --- Legacy dump/load (JSON shape) ------------------------------------
555
+ #
556
+ # The pre-Storage release exposed `Corpus#dump`, `Corpus#save(path)`, and
557
+ # `Corpus.load(path)` for JSON-backed persistence. Those names still work
558
+ # but are now thin wrappers around the appropriate Storage backend.
559
+
229
560
  def dump
230
- {
231
- "host_counts" => @host_counts,
232
- "path_length_counts" => @path_length_counts.transform_keys(&:to_s),
233
- "raw_shape_counts" => @raw_shape_counts,
234
- "fingerprint_counts" => @fingerprint_counts,
235
- "max_values_per_position" => @max_values_per_position,
236
- "position_stats" => @position_stats.map { |(host, prefix), s| [host, prefix, s.dump] },
237
- "clusterer" => @clusterer.dump,
238
- }
239
- end
240
-
241
- def save(path)
242
- tmp = "#{path}.tmp"
243
- File.write(tmp, JSON.generate(dump))
244
- File.rename(tmp, path)
561
+ memory_view.to_dump
245
562
  end
246
563
 
247
564
  def self.from_dump(h, classifier: SegmentClassifier::DEFAULT)
248
- c = new(
249
- classifier: classifier,
250
- max_values_per_position: h.fetch("max_values_per_position", PositionStats::DEFAULT_MAX_VALUES),
251
- )
252
- c.instance_variable_set(:@host_counts, Hash.new(0).merge(h["host_counts"]))
253
- c.instance_variable_set(:@path_length_counts, Hash.new(0).merge(h["path_length_counts"].transform_keys(&:to_i)))
254
- c.instance_variable_set(:@raw_shape_counts, Hash.new(0).merge(h["raw_shape_counts"]))
255
- c.instance_variable_set(:@fingerprint_counts, Hash.new(0).merge(h["fingerprint_counts"]))
256
- stats = h["position_stats"].each_with_object({}) do |(host, prefix, sdump), acc|
257
- acc[[host, prefix]] = PositionStats.from_dump(sdump)
258
- end
259
- c.instance_variable_set(:@position_stats, stats)
260
- c.instance_variable_set(:@clusterer, Clusterer.from_dump(h["clusterer"], classifier: classifier))
261
- c
565
+ max_values = h.fetch("max_values_per_position", PositionStats::DEFAULT_MAX_VALUES)
566
+ storage = Storage::Memory.new(classifier: classifier, max_values_per_position: max_values)
567
+ storage.load_dump!(h)
568
+ new(classifier: classifier, storage: storage)
262
569
  end
263
570
 
264
571
  def self.load(path, classifier: SegmentClassifier::DEFAULT)
265
- from_dump(JSON.parse(File.read(path)), classifier: classifier)
572
+ open(path, classifier: classifier)
573
+ end
574
+
575
+ private
576
+
577
+ def write_json_dump(path)
578
+ tmp = "#{path}.tmp"
579
+ File.write(tmp, JSON.generate(memory_view.to_dump))
580
+ File.rename(tmp, path)
581
+ end
582
+
583
+ # Materialize a Memory snapshot of the current state — used by dump for
584
+ # backends that don't natively know how to emit the JSON shape.
585
+ def memory_view
586
+ return @storage if @storage.respond_to?(:to_dump)
587
+
588
+ mem = Storage::Memory.new(
589
+ classifier: @classifier,
590
+ max_values_per_position: @storage.max_values_per_position,
591
+ )
592
+ mem.instance_variable_set(:@host_counts, Hash.new(0).merge(@storage.host_counts))
593
+ mem.instance_variable_set(:@path_length_counts, Hash.new(0).merge(@storage.path_length_counts))
594
+ mem.instance_variable_set(:@raw_shape_counts, Hash.new(0).merge(@storage.raw_shape_counts))
595
+ mem.instance_variable_set(:@fingerprint_counts, Hash.new(0).merge(@storage.fingerprint_counts))
596
+ ps = {}
597
+ @storage.each_position_stats { |key, stats| ps[key] = stats }
598
+ mem.instance_variable_set(:@position_stats, ps)
599
+ clusters_h = @storage.clusters.each_with_object({}) { |c, h| h[c.key] = c }
600
+ mem.instance_variable_set(:@clusters, clusters_h)
601
+ mem
266
602
  end
267
603
  end
268
604
  end
data/lib/iriq/cross_host_shape.rb ADDED
@@ -0,0 +1,37 @@
1
+ require "set"
2
+
3
+ module Iriq
4
+ # A route shape that recurs across multiple hosts.
5
+ #
6
+ # Emitted by Corpus#cross_host_shapes. The shape string ("/users/{user_id}")
7
+ # is the cluster's rendered placeholder form; two clusters with the same
8
+ # shape but different hosts coalesce into one CrossHostShape record.
9
+ #
10
+ # A shape appearing at N hosts is strong evidence of a semantic pattern
11
+ # rather than a host-local quirk — independent hosts are unlikely to
12
+ # invent the same `/users/{integer}` structure by accident. Future work
13
+ # can feed this signal into proposal confidence and corpus-informed
14
+ # normalization (raise weight when a Shape has cross-host support).
15
+ class CrossHostShape
16
+ attr_reader :shape, :hosts, :observation_count
17
+
18
+ def initialize(shape:, hosts:, observation_count:)
19
+ @shape = shape
20
+ @hosts = hosts.is_a?(Set) ? hosts.dup.freeze : Set.new(hosts).freeze
21
+ @observation_count = observation_count
22
+ end
23
+
24
+ def host_count
25
+ @hosts.size
26
+ end
27
+
28
+ def to_h
29
+ {
30
+ shape: @shape,
31
+ hosts: @hosts.to_a.sort,
32
+ host_count: host_count,
33
+ observation_count: @observation_count,
34
+ }
35
+ end
36
+ end
37
+ end
data/lib/iriq/event.rb ADDED
@@ -0,0 +1,22 @@
1
+ module Iriq
2
+ # Events are the atomic observation-time facts emitted by Corpus#observe
3
+ # before any state changes. A single observe(iri) call emits a small
4
+ # ordered list of Events; Reducers consume that list to update materialized
5
+ # views (host counts, position stats, clusters, etc.).
6
+ #
7
+ # Today the event list is transient — built fresh per observe(), applied,
8
+ # and discarded. The shape is in place so a future commit can persist the
9
+ # log and replay it to re-derive materialized views without re-feeding
10
+ # source IRIs (the "re-runnable inference" win from ROADMAP.md).
11
+ #
12
+ # Each Event is a Struct so callers can pattern-match on type and access
13
+ # fields positionally or by name.
14
+ module Event
15
+ HostSeen = Struct.new(:host)
16
+ PathLengthSeen = Struct.new(:length)
17
+ RawShapeSeen = Struct.new(:shape)
18
+ FingerprintSeen = Struct.new(:shape)
19
+ PositionSeen = Struct.new(:position, :value, :type)
20
+ ClusterAddition = Struct.new(:key, :host, :scheme, :shape, :identifier)
21
+ end
22
+ end