iriq 0.2.0 → 0.30.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/iriq/corpus.rb CHANGED
@@ -42,13 +42,20 @@ module Iriq
42
42
  POPULAR_MIN_COUNT = 5
43
43
  POPULAR_BASELINE_MULTIPLE = 3
44
44
 
45
- attr_reader :storage
45
+ HOST_STRATEGIES = %i[full registrable none].freeze
46
+
47
+ attr_reader :storage, :host_strategy, :classifier
46
48
 
47
49
  def initialize(classifier: SegmentClassifier::DEFAULT,
48
50
  max_values_per_position: PositionStats::DEFAULT_MAX_VALUES,
51
+ host_strategy: :full,
49
52
  storage: nil)
50
- @classifier = classifier
51
- @storage = storage || Storage::Memory.new(
53
+ raise ArgumentError, "host_strategy must be one of #{HOST_STRATEGIES.inspect}" \
54
+ unless HOST_STRATEGIES.include?(host_strategy)
55
+
56
+ @classifier = classifier
57
+ @host_strategy = host_strategy
58
+ @storage = storage || Storage::Memory.new(
52
59
  classifier: classifier,
53
60
  max_values_per_position: max_values_per_position,
54
61
  )
@@ -58,53 +65,251 @@ module Iriq
58
65
  # `.db`/`.sqlite`/`.sqlite3` use SQLite (incremental writes); anything
59
66
  # else uses JSON.
60
67
  def self.open(path, classifier: SegmentClassifier::DEFAULT,
61
- max_values_per_position: PositionStats::DEFAULT_MAX_VALUES)
68
+ max_values_per_position: PositionStats::DEFAULT_MAX_VALUES,
69
+ host_strategy: :full)
62
70
  storage = Storage.open(path,
63
71
  classifier: classifier,
64
72
  max_values_per_position: max_values_per_position)
65
- new(classifier: classifier, storage: storage)
73
+ corpus = new(classifier: classifier, storage: storage, host_strategy: host_strategy)
74
+ corpus.send(:reapply_activated_recognizers!) if storage.respond_to?(:each_activated_recognizer)
75
+ corpus
76
+ end
77
+
78
+ # Normalize the host for keying purposes. `:full` keeps the original
79
+ # host; `:registrable` collapses subdomains via the inline-PSL heuristic
80
+ # (api.foo.com + app.foo.com → foo.com); `:none` ignores host entirely
81
+ # so clusters group across all hosts by shape alone.
82
+ def effective_host(host)
83
+ case @host_strategy
84
+ when :registrable then RegistrableDomain.for(host)
85
+ when :none then ""
86
+ else host
87
+ end
66
88
  end
67
89
 
68
90
  # Observe a single IRI. Returns an Observation.
91
+ #
92
+ # Internally: builds an Event list for the IRI, then applies each event
93
+ # through the Reducer registry inside a single storage transaction. The
94
+ # event list is transient today — a future commit can persist it and
95
+ # replay against alternate reducers / thresholds for re-runnable
96
+ # inference. See lib/iriq/event.rb and lib/iriq/reducer.rb.
69
97
  def observe(input)
98
+ iri = coerce(input)
99
+ events = events_for(iri)
100
+ cluster = nil
101
+
102
+ @storage.transaction do |s|
103
+ events.each do |e|
104
+ result = Reducer.apply(e, s)
105
+ cluster = result if e.is_a?(Event::ClusterAddition)
106
+ end
107
+ s.record_observation(iri.canonical) if s.respond_to?(:record_observation)
108
+ end
109
+
110
+ Observation.new(corpus: self, identifier: iri, cluster: cluster)
111
+ end
112
+
113
+ # Drop every materialized view (host counts, position stats, clusters,
114
+ # …) and rebuild them by replaying the source-IRI log through the
115
+ # current events + reducers pipeline. Useful for:
116
+ #
117
+ # - Tuning thresholds (swap a Corpus constant, call reinfer)
118
+ # - Swapping the classifier (open the Corpus with a different
119
+ # classifier, call reinfer — events are re-derived from raw IRIs)
120
+ # - Recovering after a Reducer-set change
121
+ #
122
+ # Wrapped in a single backend transaction so a failure mid-replay
123
+ # leaves the prior views intact.
124
+ def reinfer
125
+ @storage.transaction do |s|
126
+ iris = []
127
+ s.each_observed_iri { |canonical| iris << canonical }
128
+ s.clear_materialized_views
129
+ iris.each do |canonical|
130
+ iri = Parser.parse(canonical)
131
+ events_for(iri).each { |e| Reducer.apply(e, s) }
132
+ end
133
+ end
134
+ nil
135
+ end
136
+
137
+ # Number of IRIs in the source-IRI log. The materialized views are
138
+ # derived from this log; reinfer replays it.
139
+ def observed_iri_count
140
+ return @storage.observed_iri_count if @storage.respond_to?(:observed_iri_count)
141
+ 0
142
+ end
143
+
144
+ # Scan observed values for shape patterns that recur frequently enough
145
+ # to suggest a new Recognizer. Returns RecognizerProposal records; nothing
146
+ # is automatically applied — the proposal carries enough evidence for a
147
+ # human to decide whether to bake the Recognizer in.
148
+ #
149
+ # Strategies are pluggable; the default set lives in
150
+ # Iriq::ProposalStrategy::DEFAULTS. Pass `strategies:` to limit / extend.
151
+ # Pass `min_observations:` / `min_coverage:` / `min_hosts:` to tune
152
+ # what passes the noise floor.
153
+ def propose_recognizers(strategies: ProposalStrategy::DEFAULTS, **opts)
154
+ strategies.flat_map { |s| s.propose(@storage, **opts) }
155
+ end
156
+
157
+ # Promote a RecognizerProposal into a live Recognizer for this corpus.
158
+ #
159
+ # Mechanics:
160
+ # 1. Synthesize a SynthesizedRecognizer from the proposal's prefix.
161
+ # 2. Switch to a per-corpus classifier (if we were sharing the
162
+ # module-level DEFAULT) so activation doesn't leak to other
163
+ # corpora using the same default singleton.
164
+ # 3. Register the Recognizer on the classifier — the ensemble
165
+ # picks it up on the next classify() call.
166
+ # 4. Persist the activation in storage so reopens re-apply it.
167
+ # 5. Reinfer so existing observations get re-classified through
168
+ # the new Recognizer.
169
+ #
170
+ # Returns the synthesized Recognizer.
171
+ def activate_proposal(proposal)
172
+ recognizer = SynthesizedRecognizer.from_proposal(proposal)
173
+ ensure_per_corpus_classifier!
174
+ @classifier.register_recognizer(recognizer)
175
+ if @storage.respond_to?(:record_activated_recognizer)
176
+ @storage.record_activated_recognizer(recognizer.to_dump)
177
+ end
178
+ reinfer
179
+ recognizer
180
+ end
181
+
182
+ # Convenience: activate every proposal whose confidence clears the
183
+ # given threshold. Returns the activated Recognizers. Confidence
184
+ # incorporates both per-position coverage AND cross-host
185
+ # corroboration — see RecognizerProposal#compute_confidence.
186
+ def activate_proposals_above(confidence_threshold, **propose_opts)
187
+ proposals = propose_recognizers(**propose_opts)
188
+ proposals.select { |p| p.confidence >= confidence_threshold }.map { |p| activate_proposal(p) }
189
+ end
190
+
191
+ # Number of activated recognizers persisted with this corpus.
192
+ def activated_recognizer_count
193
+ return @storage.activated_recognizer_count if @storage.respond_to?(:activated_recognizer_count)
194
+ 0
195
+ end
196
+
197
+ # Route shapes that recur across `min_hosts` or more distinct hosts.
198
+ # Returns CrossHostShape records sorted by host_count desc, then by
199
+ # observation_count desc, then by shape (stable, deterministic).
200
+ #
201
+ # Cross-host recurrence is independent evidence of a real semantic
202
+ # pattern — two unrelated hosts inventing the same `/users/{integer}`
203
+ # structure by accident is unlikely. A natural follow-up is feeding
204
+ # this signal back into RecognizerProposal confidence: a proposal
205
+ # supported by N hosts is much stronger than one seen on a single
206
+ # host with the same per-position coverage.
207
+ def cross_host_shapes(min_hosts: 2)
208
+ by_shape = Hash.new { |h, k| h[k] = { hosts: Set.new, count: 0 } }
209
+ @storage.clusters.each do |cluster|
210
+ # Skip non-URL clusters (URN clusters have no host).
211
+ next if cluster.host.nil? || cluster.host.empty?
212
+
213
+ agg = by_shape[cluster.shape]
214
+ agg[:hosts] << cluster.host
215
+ agg[:count] += cluster.count
216
+ end
217
+
218
+ by_shape.filter_map do |shape, data|
219
+ next nil if data[:hosts].size < min_hosts
220
+
221
+ CrossHostShape.new(
222
+ shape: shape,
223
+ hosts: data[:hosts],
224
+ observation_count: data[:count],
225
+ )
226
+ end.sort_by { |s| [-s.host_count, -s.observation_count, s.shape] }
227
+ end
228
+
229
+ # Build the ordered Event list for `input` without applying it. Useful
230
+ # for inspection, tests, and future event-log persistence. Each call is
231
+ # pure — no storage side-effects.
232
+ def events_for(input)
70
233
  iri = coerce(input)
71
234
  hinted_entries = SegmentHints.derive(iri.path_segments, @classifier)
72
235
  raw_shape = PathShape.new(classifier: @classifier, hints: false).from_entries(hinted_entries)
73
236
  hinted_shape = PathShape.new(classifier: @classifier, hints: true).from_entries(hinted_entries)
237
+ keying_host = effective_host(iri.host)
74
238
 
75
- cluster = nil
76
- @storage.transaction do |s|
77
- s.increment_host(iri.host)
78
- s.increment_path_length(iri.path_segments.size)
79
- s.increment_raw_shape(raw_shape)
80
- s.increment_fingerprint(hinted_shape)
81
-
82
- prefix = ""
83
- hinted_entries.each do |entry|
84
- s.observe_position(iri.host, prefix, entry[:value], entry[:type])
85
- prefix = "#{prefix}/#{placeholder(entry)}"
86
- end
239
+ events = [
240
+ Event::HostSeen.new(keying_host),
241
+ Event::PathLengthSeen.new(iri.path_segments.size),
242
+ Event::RawShapeSeen.new(raw_shape),
243
+ Event::FingerprintSeen.new(hinted_shape),
244
+ ]
87
245
 
88
- key, host, scheme, shape = Cluster.key_for(iri, classifier: @classifier, shape: hinted_shape)
89
- cluster = s.add_to_cluster(key, host, scheme, shape, iri)
246
+ prefix = ""
247
+ hinted_entries.each do |entry|
248
+ events << Event::PositionSeen.new(
249
+ Position.path(host: keying_host, prefix: prefix),
250
+ entry[:value], entry[:type],
251
+ )
252
+ prefix = "#{prefix}/#{placeholder(entry)}"
90
253
  end
91
254
 
92
- Observation.new(corpus: self, identifier: iri, cluster: cluster)
255
+ key, host, scheme, shape = Cluster.key_for(iri, classifier: @classifier, shape: hinted_shape, host: keying_host)
256
+ events << Event::ClusterAddition.new(key, host, scheme, shape, iri)
257
+
258
+ events
93
259
  end
94
260
 
95
261
  # Corpus-informed normalization. Falls back to mechanical normalization
96
- # when the corpus has no signal for a position.
262
+ # when the corpus has no signal for a position. Implemented as a thin
263
+ # call into Normalizer with `evidence: self`; the corpus-informed path
264
+ # and query rendering live in #render_path / #render_query below
265
+ # (the evidence-source interface).
97
266
  def normalize(input)
98
267
  iri = coerce(input)
99
- return Normalizer.normalize_identifier(iri) if iri.urn? || iri.path_segments.empty?
268
+ Normalizer.normalize_identifier(iri, classifier: @classifier, hints: true, evidence: self)
269
+ end
100
270
 
271
+ # Evidence-source interface — called by Normalizer when this Corpus is
272
+ # passed as `evidence:`. Renders the path using corpus-informed
273
+ # classifications (variability promotion, popular-outlier preservation).
274
+ # Always emits a leading "/" — empty path collapses to "/" to match
275
+ # mechanical output and anchor any trailing query.
276
+ def render_path(iri, _classifier, _hints)
101
277
  tokens = annotate_segments(iri).map { |entry| corpus_token(entry) }
102
- out = +""
103
- out << "#{iri.scheme}://" if iri.scheme
104
- out << iri.host if iri.host
105
- out << ":#{iri.port}" if iri.port
106
- out << "/" << tokens.join("/")
107
- out
278
+ "/" + tokens.join("/")
279
+ end
280
+
281
+ # Evidence-source interface — render the query string with
282
+ # cluster-inferred param types where available. The mechanical
283
+ # NullEvidenceSource provides the classifier-only fallback; this
284
+ # version prefers the cluster's observed type per param (dominant
285
+ # type_count, subject to the corpus thresholds).
286
+ def render_query(iri, _classifier = @classifier)
287
+ hinted_shape = PathShape.new(classifier: @classifier, hints: true)
288
+ .from_entries(SegmentHints.derive(iri.path_segments, @classifier))
289
+ key, * = Cluster.key_for(iri, classifier: @classifier, shape: hinted_shape,
290
+ host: effective_host(iri.host))
291
+ cluster = @storage.cluster_for(key)
292
+
293
+ iri.query_params.keys.sort.map do |k|
294
+ v = iri.query_params[k].to_s
295
+ type = inferred_param_type(cluster, k, v)
296
+ shaped = render_param_value(v, type)
297
+ "#{k}=#{shaped}"
298
+ end.join("&")
299
+ end
300
+
301
+ # Inferred params for the cluster `input` would fall into. Returns the
302
+ # same shape as Cluster#param_summary — useful for "what query params
303
+ # might this URL accept?" tooling. Empty array if no cluster has been
304
+ # observed for this shape yet.
305
+ def params_for(input)
306
+ iri = coerce(input)
307
+ hinted_shape = PathShape.new(classifier: @classifier, hints: true)
308
+ .from_entries(SegmentHints.derive(iri.path_segments, @classifier))
309
+ key, * = Cluster.key_for(iri, classifier: @classifier, shape: hinted_shape,
310
+ host: effective_host(iri.host))
311
+ cluster = @storage.cluster_for(key)
312
+ cluster ? cluster.param_summary : []
108
313
  end
109
314
 
110
315
  # Per-segment explanation with corpus-informed `classification`.
@@ -123,7 +328,7 @@ module Iriq
123
328
  def raw_shape_counts; @storage.raw_shape_counts; end
124
329
  def fingerprint_counts; @storage.fingerprint_counts; end
125
330
 
126
- # Iterates (host, prefix) → PositionStats over all observed positions.
331
+ # Iterates Position → PositionStats over all observed positions.
127
332
  # Used by inspection tooling; not part of the hot path.
128
333
  def each_position_stats(&block)
129
334
  @storage.each_position_stats(&block)
@@ -137,10 +342,12 @@ module Iriq
137
342
  @storage.cluster_size
138
343
  end
139
344
 
140
- # Stats for a given (host, prefix_shape) — useful for tests and
345
+ # Stats for a given (host, path-prefix) — useful for tests and
141
346
  # debugging. Returns nil if nothing has been observed there.
142
- def stats_for(host, prefix)
143
- @storage.position_stats(host, prefix)
347
+ # Accepts either a Position or (host, prefix) for ergonomics.
348
+ def stats_for(host_or_position, prefix = nil)
349
+ position = host_or_position.is_a?(Position) ? host_or_position : Position.path(host: host_or_position, prefix: prefix)
350
+ @storage.position_stats(position)
144
351
  end
145
352
 
146
353
  # Persist the corpus.
@@ -172,6 +379,27 @@ module Iriq
172
379
 
173
380
  private
174
381
 
382
+ # If we're still sharing the module-level DEFAULT classifier, switch
383
+ # to our own copy so register_recognizer doesn't leak into other
384
+ # corpora using the same default singleton.
385
+ def ensure_per_corpus_classifier!
386
+ return if @classifier != SegmentClassifier::DEFAULT
387
+
388
+ @classifier = SegmentClassifier.new
389
+ end
390
+
391
+ # On Corpus.open, walk the stored activations and register each one
392
+ # on this corpus's classifier. Switches to a per-corpus classifier
393
+ # if any activations exist.
394
+ def reapply_activated_recognizers!
395
+ return if @storage.activated_recognizer_count.zero?
396
+
397
+ ensure_per_corpus_classifier!
398
+ @storage.each_activated_recognizer do |dump|
399
+ @classifier.register_recognizer(SynthesizedRecognizer.from_dump(dump))
400
+ end
401
+ end
402
+
175
403
  def coerce(input)
176
404
  input.is_a?(Identifier) ? input : Parser.parse(input)
177
405
  end
@@ -179,8 +407,9 @@ module Iriq
179
407
  def annotate_segments(iri)
180
408
  hinted = SegmentHints.derive(iri.path_segments, @classifier)
181
409
  prefix = ""
410
+ keying_host = effective_host(iri.host)
182
411
  hinted.map do |entry|
183
- stats = @storage.position_stats(iri.host, prefix)
412
+ stats = @storage.position_stats(Position.path(host: keying_host, prefix: prefix))
184
413
  out = entry.merge(
185
414
  prefix: prefix,
186
415
  classification: classify(entry, stats),
@@ -193,14 +422,28 @@ module Iriq
193
422
  def placeholder(entry)
194
423
  return entry[:value] unless entry[:variable]
195
424
 
196
- "{#{entry[:hint] || entry[:type]}}"
425
+ "{#{entry[:hint] || SegmentClassifier.display_type(entry[:type])}}"
197
426
  end
198
427
 
428
+ # Types whose values are often a small fixed set (or a single static
429
+ # value baked into a REST route). For these, run through the same
430
+ # cardinality / value-fraction analysis literals get — a dominant
431
+ # value gets preserved as :stable_literal instead of being
432
+ # placeholdered as a generic {version}/{slug}/etc.
433
+ #
434
+ # Slug + opaque_id are here because a lot of route literals
435
+ # accidentally match those shapes (`/users/{id}/create-new`,
436
+ # reference codes like `WK1234`). When a single value dominates the
437
+ # position, the literal is almost always the better display.
438
+ STABLE_VARIABLE_TYPES = %i[version locale currency boolean slug opaque_id].freeze
439
+
199
440
  def classify(entry, stats)
200
441
  variable = entry[:variable]
201
442
 
202
443
  return variable ? :variable_identifier : :ambiguous if stats.nil? || stats.total.zero?
203
- return :variable_identifier if variable
444
+ if variable && !STABLE_VARIABLE_TYPES.include?(entry[:type])
445
+ return :variable_identifier
446
+ end
204
447
 
205
448
  value = entry[:value]
206
449
  total = stats.total
@@ -209,6 +452,17 @@ module Iriq
209
452
  enough_data = total >= MIN_OBSERVATIONS_FOR_INFERENCE
210
453
  value_frac = stats.value_fraction(value)
211
454
 
455
+ # For STABLE_VARIABLE_TYPES (version, locale, currency, boolean, slug, opaque_id),
456
+ # a dominant value wins over the variable-dominance branch — a
457
+ # single-version /api/v1/... pattern stays as the literal `v1`
458
+ # rather than placeholdering to {version}. Without dominance,
459
+ # fall through to :variable_identifier (the per-type placeholder).
460
+ if variable
461
+ return :stable_literal if value_frac >= STABLE_LITERAL_THRESHOLD
462
+
463
+ return :variable_identifier
464
+ end
465
+
212
466
  if enough_data && variable_frac >= VARIABLE_DOMINANCE_THRESHOLD
213
467
  # Position is dominated by variable types (UUIDs, integers, etc.).
214
468
  # A literal here is a special-case outlier (e.g. /users/me).
@@ -247,6 +501,28 @@ module Iriq
247
501
  stats.value_fraction(value) >= POPULAR_BASELINE_MULTIPLE * baseline
248
502
  end
249
503
 
504
+ def inferred_param_type(cluster, name, value)
505
+ # Prefer the cluster's confident type when we have enough samples;
506
+ # otherwise classify the current value directly. Cluster#param_type
507
+ # applies the :date quorum gate (see Cluster::DATE_CONFIDENCE_THRESHOLD).
508
+ stats = cluster && cluster.param_stats[name]
509
+ if stats && stats.total >= MIN_OBSERVATIONS_FOR_INFERENCE
510
+ cluster.param_type(name) || @classifier.classify(value)
511
+ else
512
+ @classifier.classify(value)
513
+ end
514
+ end
515
+
516
+ def render_param_value(value, type)
517
+ if type == :date && (canon = SegmentClassifier.canonical_date(value))
518
+ canon
519
+ elsif @classifier.variable?(type)
520
+ "{#{SegmentClassifier.display_type(type)}}"
521
+ else
522
+ value
523
+ end
524
+ end
525
+
250
526
  def corpus_token(entry)
251
527
  case entry[:classification]
252
528
  when :variable_identifier, :corpus_inferred_variable
@@ -257,7 +533,13 @@ module Iriq
257
533
  end
258
534
 
259
535
  def placeholder_for_variable(entry)
260
- return "{#{entry[:hint] || entry[:type]}}" if entry[:variable]
536
+ # Dates render in canonical ISO form rather than as a `{date}` placeholder
537
+ # — matches what mechanical Iriq.normalize does for path segments and
538
+ # what render_param_value does for query params.
539
+ if entry[:type] == :date && (canon = SegmentClassifier.canonical_date(entry[:value]))
540
+ return canon
541
+ end
542
+ return "{#{entry[:hint] || SegmentClassifier.display_type(entry[:type])}}" if entry[:variable]
261
543
 
262
544
  # corpus-inferred variable: classifier said literal, corpus says
263
545
  # otherwise. Derive a hint from the prefix's last literal segment if
@@ -0,0 +1,37 @@
1
+ require "set"
2
+
3
+ module Iriq
4
+ # A route shape that recurs across multiple hosts.
5
+ #
6
+ # Emitted by Corpus#cross_host_shapes. The shape string ("/users/{user_id}")
7
+ # is the cluster's rendered placeholder form; two clusters with the same
8
+ # shape but different hosts coalesce into one CrossHostShape record.
9
+ #
10
+ # A shape appearing at N hosts is strong evidence of a semantic pattern
11
+ # rather than a host-local quirk — independent hosts are unlikely to
12
+ # invent the same `/users/{integer}` structure by accident. Future work
13
+ # can feed this signal into proposal confidence and corpus-informed
14
+ # normalization (raise weight when a Shape has cross-host support).
15
+ class CrossHostShape
16
+ attr_reader :shape, :hosts, :observation_count
17
+
18
+ def initialize(shape:, hosts:, observation_count:)
19
+ @shape = shape
20
+ @hosts = hosts.is_a?(Set) ? hosts.dup.freeze : Set.new(hosts).freeze
21
+ @observation_count = observation_count
22
+ end
23
+
24
+ def host_count
25
+ @hosts.size
26
+ end
27
+
28
+ def to_h
29
+ {
30
+ shape: @shape,
31
+ hosts: @hosts.to_a.sort,
32
+ host_count: host_count,
33
+ observation_count: @observation_count,
34
+ }
35
+ end
36
+ end
37
+ end
data/lib/iriq/event.rb ADDED
@@ -0,0 +1,22 @@
1
+ module Iriq
2
+ # Events are the atomic observation-time facts emitted by Corpus#observe
3
+ # before any state changes. A single observe(iri) call emits a small
4
+ # ordered list of Events; Reducers consume that list to update materialized
5
+ # views (host counts, position stats, clusters, etc.).
6
+ #
7
+ # Today the event list is transient — built fresh per observe(), applied,
8
+ # and discarded. The shape is in place so a future commit can persist the
9
+ # log and replay it to re-derive materialized views without re-feeding
10
+ # source IRIs (the "re-runnable inference" win from ROADMAP.md).
11
+ #
12
+ # Each Event is a Struct so callers can pattern-match on type and access
13
+ # fields positionally or by name.
14
+ module Event
15
+ HostSeen = Struct.new(:host)
16
+ PathLengthSeen = Struct.new(:length)
17
+ RawShapeSeen = Struct.new(:shape)
18
+ FingerprintSeen = Struct.new(:shape)
19
+ PositionSeen = Struct.new(:position, :value, :type)
20
+ ClusterAddition = Struct.new(:key, :host, :scheme, :shape, :identifier)
21
+ end
22
+ end
@@ -0,0 +1,114 @@
1
+ module Iriq
2
+ # Evidence is the structured substrate for explanation. Each Record
3
+ # captures one fact about the system's reasoning: "this segment
4
+ # classified as :integer because the Integer recognizer fired with
5
+ # specificity TYPED", "the IPv4 type collapses to {ip} by policy",
6
+ # "Position P is mostly variable because of corpus stats".
7
+ #
8
+ # Trace and Explanation are views over a list of Evidence records;
9
+ # the structured form is what programmatic consumers (test assertions,
10
+ # PR-diff annotators, downstream tooling) should build on. Human note
11
+ # strings emitted by Trace are derived from Evidence payloads, so
12
+ # adding a new note kind starts with adding a new Evidence shape.
13
+ #
14
+ # Two axes:
15
+ #
16
+ # subject_kind ∈ {:segment, :position, :cluster}
17
+ # What this Evidence is about. Today most Evidence is :segment
18
+ # (per-segment classification facts). :position and :cluster
19
+ # Evidence become load-bearing once corpus-informed Trace lands
20
+ # in a follow-up step.
21
+ #
22
+ # source ∈ {:lexical, :recognizer, :corpus, :neighbor, :policy}
23
+ # What kind of fact is being asserted.
24
+ # :lexical — pure shape match (e.g. "matches DATE_RE")
25
+ # :recognizer — a named Recognizer fired with confidence/specificity
26
+ # :corpus — aggregated counts/distributions support this
27
+ # :neighbor — adjacent context informed this (prior literal,
28
+ # param name hint)
29
+ # :policy — a normalization policy applied (ip umbrella
30
+ # collapse, canonical date, currency upcase)
31
+ module Evidence
32
+ SUBJECT_KINDS = %i[segment position cluster].freeze
33
+ SOURCES = %i[lexical recognizer corpus neighbor policy].freeze
34
+
35
+ # A single Evidence fact.
36
+ #
37
+ # subject_kind — :segment | :position | :cluster
38
+ # subject — kind-specific identity:
39
+ # :segment → { index:, value: }
40
+ # :position → Iriq::Position
41
+ # :cluster → cluster key (string)
42
+ # source — :lexical | :recognizer | :corpus | :neighbor | :policy
43
+ # payload — source-and-kind-specific structured data
44
+ # weight — optional float in [0,1] — contribution to the
45
+ # ultimate decision. Set when scoring is meaningful;
46
+ # nil otherwise.
47
+ # notes — optional human-readable strings. Trace renders
48
+ # these directly; programmatic consumers can ignore.
49
+ class Record
50
+ attr_reader :subject_kind, :subject, :source, :payload, :weight, :notes
51
+
52
+ def initialize(subject_kind:, subject:, source:, payload:, weight: nil, notes: [])
53
+ unless SUBJECT_KINDS.include?(subject_kind)
54
+ raise ArgumentError, "subject_kind must be one of #{SUBJECT_KINDS.inspect}"
55
+ end
56
+ unless SOURCES.include?(source)
57
+ raise ArgumentError, "source must be one of #{SOURCES.inspect}"
58
+ end
59
+
60
+ @subject_kind = subject_kind
61
+ @subject = subject
62
+ @source = source
63
+ @payload = payload || {}
64
+ @weight = weight
65
+ @notes = notes || []
66
+ end
67
+
68
+ def to_h
69
+ {
70
+ subject_kind: @subject_kind,
71
+ subject: subject_serialized,
72
+ source: @source,
73
+ payload: @payload,
74
+ weight: @weight,
75
+ notes: @notes,
76
+ }.compact
77
+ end
78
+
79
+ private
80
+
81
+ def subject_serialized
82
+ return @subject.to_h if @subject.respond_to?(:to_h) && !@subject.is_a?(Hash)
83
+ @subject
84
+ end
85
+ end
86
+
87
+ module_function
88
+
89
+ # Factories so call sites don't have to repeat subject_kind:.
90
+ def segment(index:, value:, source:, payload:, weight: nil, notes: [])
91
+ Record.new(
92
+ subject_kind: :segment,
93
+ subject: { index: index, value: value },
94
+ source: source, payload: payload, weight: weight, notes: notes,
95
+ )
96
+ end
97
+
98
+ def position(position:, source:, payload:, weight: nil, notes: [])
99
+ Record.new(
100
+ subject_kind: :position,
101
+ subject: position,
102
+ source: source, payload: payload, weight: weight, notes: notes,
103
+ )
104
+ end
105
+
106
+ def cluster(key:, source:, payload:, weight: nil, notes: [])
107
+ Record.new(
108
+ subject_kind: :cluster,
109
+ subject: key,
110
+ source: source, payload: payload, weight: weight, notes: notes,
111
+ )
112
+ end
113
+ end
114
+ end
@@ -4,7 +4,7 @@ module Iriq
4
4
  # Explanation.explain("https://foo.com/users/123")
5
5
  # # => [
6
6
  # # { value: "users", type: :literal, variable: false, hint: nil },
7
- # # { value: "123", type: :integer_id, variable: true, hint: "user_id" },
7
+ # # { value: "123", type: :integer, variable: true, hint: "user_id" },
8
8
  # # ]
9
9
  module Explanation
10
10
  module_function