iriq 0.2.0 → 0.30.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,35 @@
module Iriq
  # Per-Recognizer claim strength. Higher specificity wins when multiple
  # Recognizers fire on the same segment; the ensemble picks the candidate
  # with the maximum (specificity × confidence).
  #
  # The bands below capture the current type taxonomy at coarse grain; they
  # are explicitly NOT linear "how confident" scores. They encode "how
  # surprising would it be for this Recognizer to fire by accident on a
  # different actual type." UUID's shape is so distinctive that a non-UUID
  # producing that string is vanishingly unlikely (SEMANTIC); a 4-digit
  # integer could plausibly be a year, an HTTP status, or an ID, so
  # `:integer` claims only TYPED.
  #
  # The calibration corpus tests in spec/iriq/calibration_spec.rb (and Go's
  # calibration_test.go) are the source of truth for whether these values
  # are well-chosen — adjust them and re-run to validate.
  module Specificity
    # Unambiguous semantic shapes — the regex effectively can't fire by
    # accident. (UUID, JWT, email with @, URL with ://, color hex.)
    SEMANTIC = 1.0

    # Restrictive structured patterns. Could collide with broader types
    # at edges. (date, file with known ext, ipv4, mime.)
    STRUCTURED = 0.8

    # Digit-shaped with an additional bound — range or allowlist — that
    # makes the shape alone meaningful. (timestamp, currency, country,
    # boolean.)
    BOUNDED = 0.7

    # Lexically broad but typed. (integer, float, version.)
    TYPED = 0.5

    # Generic pattern-based shape. (slug.)
    PATTERN = 0.3

    # Generic fallback shapes. (literal, opaque_id.)
    FALLBACK = 0.1
  end
end
@@ -9,11 +9,15 @@ module Iriq
9
9
  # increment_path_length(length)
10
10
  # increment_raw_shape(shape)
11
11
  # increment_fingerprint(shape)
12
- # observe_position(host, prefix, value, type)
12
+ # observe_position(position, value, type) # position is Iriq::Position
13
13
  # add_to_cluster(key, host, scheme, shape, identifier)
14
+ # record_observation(canonical) # append to source-IRI log
14
15
  #
15
16
  # host_counts / path_length_counts / raw_shape_counts / fingerprint_counts
16
- # position_stats(host, prefix)
17
+ # position_stats(position)
18
+ # each_position_stats { |position, stats| ... }
19
+ # each_observed_iri { |canonical| ... }
20
+ # clear_materialized_views # for reinfer
17
21
  # clusters / cluster_size
18
22
  #
19
23
  # transaction { ... } # backends may batch within
@@ -36,6 +40,15 @@ module Iriq
36
40
  @fingerprint_counts = Hash.new(0)
37
41
  @position_stats = {}
38
42
  @clusters = {}
43
+ # The source-IRI log. Persisted alongside materialized views; the
44
+ # log is the source of truth, the views are derived. Corpus#reinfer
45
+ # drops the views and replays the log through events + reducers.
46
+ @observed_iris = []
47
+ # Recognizers promoted from RecognizerProposal via
48
+ # Corpus#activate_proposal. Stored as {prefix, type, specificity}
49
+ # hashes so reopens can re-synthesize them onto the corpus's
50
+ # classifier.
51
+ @activated_recognizers = []
39
52
  end
40
53
 
41
54
  def transaction
@@ -70,17 +83,61 @@ module Iriq
70
83
  @fingerprint_counts[shape] += 1
71
84
  end
72
85
 
73
- def observe_position(host, prefix, value, type)
74
- stats = @position_stats[[host, prefix]] ||= PositionStats.new(max_values: @max_values_per_position)
86
+ def observe_position(position, value, type)
87
+ stats = @position_stats[position] ||= PositionStats.new(max_values: @max_values_per_position)
75
88
  stats.observe(value, type)
76
89
  end
77
90
 
78
91
  def add_to_cluster(key, host, scheme, shape, identifier)
79
- cluster = @clusters[key] ||= Cluster.new(key: key, host: host, scheme: scheme, shape: shape)
80
- cluster.add(identifier)
92
+ cluster = @clusters[key] ||= Cluster.new(
93
+ key: key, host: host, scheme: scheme, shape: shape,
94
+ max_values: @max_values_per_position,
95
+ )
96
+ cluster.add(identifier, classifier: @classifier)
81
97
  cluster
82
98
  end
83
99
 
100
+ # Append a canonical IRI to the source-IRI log. Called by Corpus#observe
101
+ # after the event reducers have applied; the log is the source of truth
102
+ # that Corpus#reinfer replays.
103
+ def record_observation(canonical)
104
+ @observed_iris << canonical
105
+ end
106
+
107
+ def each_observed_iri(&block)
108
+ @observed_iris.each(&block)
109
+ end
110
+
111
+ def observed_iri_count
112
+ @observed_iris.size
113
+ end
114
+
115
+ # --- Activated recognizers (Corpus#activate_proposal) -----------------
116
+
# Record a recognizer promoted via Corpus#activate_proposal.
#
# Entries are keyed by their "prefix": recording a prefix that is already
# present replaces the earlier entry in place instead of appending a
# duplicate. This matches the SQLite backend, which upserts with
# `ON CONFLICT(prefix) DO UPDATE`, so activated_recognizer_count and the
# set of recognizers re-synthesized on reopen agree across backends.
#
# dump - Hash with string keys "prefix", "type" and (optionally)
#        "specificity".
#
# Returns the list of recorded recognizer dumps.
def record_activated_recognizer(dump)
  existing = @activated_recognizers.index { |entry| entry["prefix"] == dump["prefix"] }
  if existing
    @activated_recognizers[existing] = dump
  else
    @activated_recognizers << dump
  end
  @activated_recognizers
end
120
+
121
+ def each_activated_recognizer(&block)
122
+ @activated_recognizers.each(&block)
123
+ end
124
+
125
+ def activated_recognizer_count
126
+ @activated_recognizers.size
127
+ end
128
+
129
+ # Drop every materialized view (host_counts, position_stats, clusters,
130
+ # …) without touching the source-IRI log. Corpus#reinfer calls this
131
+ # before replaying the log so views rebuild from scratch.
132
+ def clear_materialized_views
133
+ @host_counts = Hash.new(0)
134
+ @path_length_counts = Hash.new(0)
135
+ @raw_shape_counts = Hash.new(0)
136
+ @fingerprint_counts = Hash.new(0)
137
+ @position_stats = {}
138
+ @clusters = {}
139
+ end
140
+
84
141
  # --- Reads ------------------------------------------------------------
85
142
 
86
143
  def host_counts; @host_counts; end
@@ -88,8 +145,8 @@ module Iriq
88
145
  def raw_shape_counts; @raw_shape_counts; end
89
146
  def fingerprint_counts; @fingerprint_counts; end
90
147
 
91
- def position_stats(host, prefix)
92
- @position_stats[[host, prefix]]
148
+ def position_stats(position)
149
+ @position_stats[position]
93
150
  end
94
151
 
95
152
  def each_position_stats(&block)
@@ -104,6 +161,13 @@ module Iriq
104
161
  @clusters.size
105
162
  end
106
163
 
164
+ # O(1) lookup by cluster key — used by Corpus#normalize to pull the
165
+ # cluster's param_stats for the URL being normalized. nil if no cluster
166
+ # has been observed under this key yet.
167
+ def cluster_for(key)
168
+ @clusters[key]
169
+ end
170
+
107
171
  # --- Bulk load (used by JSON backend) --------------------------------
108
172
 
109
173
  def load_dump!(h)
@@ -112,11 +176,14 @@ module Iriq
112
176
  @raw_shape_counts = Hash.new(0).merge(h["raw_shape_counts"])
113
177
  @fingerprint_counts = Hash.new(0).merge(h["fingerprint_counts"])
114
178
  @max_values_per_position = h.fetch("max_values_per_position", PositionStats::DEFAULT_MAX_VALUES)
115
- @position_stats = h["position_stats"].each_with_object({}) do |(host, prefix, sdump), acc|
116
- acc[[host, prefix]] = PositionStats.from_dump(sdump)
179
+ @position_stats = h["position_stats"].each_with_object({}) do |entry, acc|
180
+ position = Position.from_dump(entry["position"])
181
+ acc[position] = PositionStats.from_dump(entry["stats"])
117
182
  end
118
183
  cdump = h.fetch("clusterer", { "clusters" => {} })
119
- @clusters = cdump["clusters"].transform_values { |c| Cluster.from_dump(c) }
184
+ @clusters = cdump["clusters"].transform_values { |c| Cluster.from_dump(c, max_values: @max_values_per_position) }
185
+ @observed_iris = h.fetch("observed_iris", [])
186
+ @activated_recognizers = h.fetch("activated_recognizers", [])
120
187
  self
121
188
  end
122
189
 
@@ -127,10 +194,14 @@ module Iriq
127
194
  "raw_shape_counts" => @raw_shape_counts,
128
195
  "fingerprint_counts" => @fingerprint_counts,
129
196
  "max_values_per_position" => @max_values_per_position,
130
- "position_stats" => @position_stats.map { |(host, prefix), s| [host, prefix, s.dump] },
197
+ "position_stats" => @position_stats.map { |pos, s|
198
+ { "position" => pos.to_dump, "stats" => s.dump }
199
+ },
131
200
  "clusterer" => {
132
201
  "clusters" => @clusters.transform_values(&:dump),
133
202
  },
203
+ "observed_iris" => @observed_iris,
204
+ "activated_recognizers" => @activated_recognizers,
134
205
  }
135
206
  end
136
207
  end
@@ -11,7 +11,7 @@ module Iriq
11
11
  # the existing `iriq --corpus c.db <url>` pattern works without a flock
12
12
  # at the application layer.
13
13
  class Sqlite
14
- SCHEMA_VERSION = 1
14
+ SCHEMA_VERSION = 4
15
15
 
16
16
  SCHEMA = <<~SQL.freeze
17
17
  CREATE TABLE IF NOT EXISTS meta (
@@ -34,25 +34,33 @@ module Iriq
34
34
  shape TEXT PRIMARY KEY,
35
35
  count INTEGER NOT NULL
36
36
  );
37
+ -- Position is (host, scope, locator). For scope='path' the locator
38
+ -- is the typed prefix; for scope='query' it's the param name.
39
+ -- Today only 'path' is observed here (query params live on the
40
+ -- cluster_* tables) — scope is in the schema so future commits
41
+ -- can fold query positions in without another migration.
37
42
  CREATE TABLE IF NOT EXISTS position_stats (
38
- host TEXT NOT NULL,
39
- prefix TEXT NOT NULL,
40
- total INTEGER NOT NULL DEFAULT 0,
41
- PRIMARY KEY (host, prefix)
43
+ host TEXT NOT NULL,
44
+ scope TEXT NOT NULL,
45
+ locator TEXT NOT NULL,
46
+ total INTEGER NOT NULL DEFAULT 0,
47
+ PRIMARY KEY (host, scope, locator)
42
48
  );
43
49
  CREATE TABLE IF NOT EXISTS position_values (
44
- host TEXT NOT NULL,
45
- prefix TEXT NOT NULL,
46
- value TEXT NOT NULL,
47
- count INTEGER NOT NULL,
48
- PRIMARY KEY (host, prefix, value)
50
+ host TEXT NOT NULL,
51
+ scope TEXT NOT NULL,
52
+ locator TEXT NOT NULL,
53
+ value TEXT NOT NULL,
54
+ count INTEGER NOT NULL,
55
+ PRIMARY KEY (host, scope, locator, value)
49
56
  );
50
57
  CREATE TABLE IF NOT EXISTS position_types (
51
- host TEXT NOT NULL,
52
- prefix TEXT NOT NULL,
53
- type TEXT NOT NULL,
54
- count INTEGER NOT NULL,
55
- PRIMARY KEY (host, prefix, type)
58
+ host TEXT NOT NULL,
59
+ scope TEXT NOT NULL,
60
+ locator TEXT NOT NULL,
61
+ type TEXT NOT NULL,
62
+ count INTEGER NOT NULL,
63
+ PRIMARY KEY (host, scope, locator, type)
56
64
  );
57
65
  CREATE TABLE IF NOT EXISTS clusters (
58
66
  key TEXT PRIMARY KEY,
@@ -75,6 +83,44 @@ module Iriq
75
83
  count INTEGER NOT NULL,
76
84
  PRIMARY KEY (cluster_key, position, value)
77
85
  );
86
+ CREATE TABLE IF NOT EXISTS cluster_params (
87
+ cluster_key TEXT NOT NULL,
88
+ name TEXT NOT NULL,
89
+ total INTEGER NOT NULL DEFAULT 0,
90
+ PRIMARY KEY (cluster_key, name)
91
+ );
92
+ CREATE TABLE IF NOT EXISTS cluster_param_values (
93
+ cluster_key TEXT NOT NULL,
94
+ name TEXT NOT NULL,
95
+ value TEXT NOT NULL,
96
+ count INTEGER NOT NULL,
97
+ PRIMARY KEY (cluster_key, name, value)
98
+ );
99
+ CREATE TABLE IF NOT EXISTS cluster_param_types (
100
+ cluster_key TEXT NOT NULL,
101
+ name TEXT NOT NULL,
102
+ type TEXT NOT NULL,
103
+ count INTEGER NOT NULL,
104
+ PRIMARY KEY (cluster_key, name, type)
105
+ );
106
+ -- Source-IRI log. The materialized views above are derived from
107
+ -- this log via events + reducers. Corpus#reinfer drops the views
108
+ -- and replays the log to rebuild them. id is monotonic so
109
+ -- iteration order is observation order.
110
+ CREATE TABLE IF NOT EXISTS observed_iris (
111
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
112
+ canonical TEXT NOT NULL
113
+ );
114
+ -- Recognizers promoted from RecognizerProposal via
115
+ -- Corpus#activate_proposal. Re-applied to the corpus's
116
+ -- classifier on Corpus.open so a reopen picks up its learned
117
+ -- patterns. Keyed by prefix; re-activating an existing prefix
118
+ -- overwrites its type and specificity (no duplicate row).
119
+ CREATE TABLE IF NOT EXISTS activated_recognizers (
120
+ prefix TEXT PRIMARY KEY,
121
+ type TEXT NOT NULL,
122
+ specificity REAL NOT NULL DEFAULT 1.0
123
+ );
78
124
  SQL
79
125
 
80
126
  attr_reader :path, :max_values_per_position
@@ -188,36 +234,38 @@ module Iriq
188
234
  upsert_shape("fingerprint_counts", shape)
189
235
  end
190
236
 
191
- def observe_position(host, prefix, value, type)
192
- host ||= ""
193
- @db.execute(<<~SQL, [host, prefix])
194
- INSERT INTO position_stats (host, prefix, total) VALUES (?, ?, 1)
195
- ON CONFLICT(host, prefix) DO UPDATE SET total = total + 1
237
+ def observe_position(position, value, type)
238
+ host = position.host || ""
239
+ scope = position.scope.to_s
240
+ locator = position.locator
241
+ @db.execute(<<~SQL, [host, scope, locator])
242
+ INSERT INTO position_stats (host, scope, locator, total) VALUES (?, ?, ?, 1)
243
+ ON CONFLICT(host, scope, locator) DO UPDATE SET total = total + 1
196
244
  SQL
197
245
 
198
246
  # Type counts are unbounded — always upsert.
199
- @db.execute(<<~SQL, [host, prefix, type.to_s])
200
- INSERT INTO position_types (host, prefix, type, count) VALUES (?, ?, ?, 1)
201
- ON CONFLICT(host, prefix, type) DO UPDATE SET count = count + 1
247
+ @db.execute(<<~SQL, [host, scope, locator, type.to_s])
248
+ INSERT INTO position_types (host, scope, locator, type, count) VALUES (?, ?, ?, ?, 1)
249
+ ON CONFLICT(host, scope, locator, type) DO UPDATE SET count = count + 1
202
250
  SQL
203
251
 
204
252
  # Value counts are capped at max_values_per_position. If the value
205
253
  # already exists, increment it; otherwise insert only when
206
254
  # cardinality is below the cap. Two-step rather than ON CONFLICT
207
255
  # because we need to enforce the cap on insert.
208
- @db.execute(<<~SQL, [host, prefix, value])
256
+ @db.execute(<<~SQL, [host, scope, locator, value])
209
257
  UPDATE position_values SET count = count + 1
210
- WHERE host = ? AND prefix = ? AND value = ?
258
+ WHERE host = ? AND scope = ? AND locator = ? AND value = ?
211
259
  SQL
212
260
  if @db.changes.zero?
213
261
  card = @db.get_first_value(
214
- "SELECT COUNT(*) FROM position_values WHERE host = ? AND prefix = ?",
215
- [host, prefix],
262
+ "SELECT COUNT(*) FROM position_values WHERE host = ? AND scope = ? AND locator = ?",
263
+ [host, scope, locator],
216
264
  )
217
265
  if card < @max_values_per_position
218
266
  @db.execute(
219
- "INSERT INTO position_values (host, prefix, value, count) VALUES (?, ?, ?, 1)",
220
- [host, prefix, value],
267
+ "INSERT INTO position_values (host, scope, locator, value, count) VALUES (?, ?, ?, ?, 1)",
268
+ [host, scope, locator, value],
221
269
  )
222
270
  end
223
271
  end
@@ -251,9 +299,99 @@ module Iriq
251
299
  SQL
252
300
  end
253
301
 
302
+ # Per-param stats (presence + value cardinality + type) — mirrors the
303
+ # in-memory Cluster#add path. Value table respects the same per-key
304
+ # cap as position_values.
305
+ (identifier.query_params || {}).each do |name, value|
306
+ v = value.to_s
307
+ type = @classifier.classify(v).to_s
308
+
309
+ @db.execute(<<~SQL, [key, name])
310
+ INSERT INTO cluster_params (cluster_key, name, total) VALUES (?, ?, 1)
311
+ ON CONFLICT(cluster_key, name) DO UPDATE SET total = total + 1
312
+ SQL
313
+ @db.execute(<<~SQL, [key, name, type])
314
+ INSERT INTO cluster_param_types (cluster_key, name, type, count) VALUES (?, ?, ?, 1)
315
+ ON CONFLICT(cluster_key, name, type) DO UPDATE SET count = count + 1
316
+ SQL
317
+
318
+ @db.execute(<<~SQL, [key, name, v])
319
+ UPDATE cluster_param_values SET count = count + 1
320
+ WHERE cluster_key = ? AND name = ? AND value = ?
321
+ SQL
322
+ if @db.changes.zero?
323
+ card = @db.get_first_value(
324
+ "SELECT COUNT(*) FROM cluster_param_values WHERE cluster_key = ? AND name = ?",
325
+ [key, name],
326
+ )
327
+ if card < @max_values_per_position
328
+ @db.execute(
329
+ "INSERT INTO cluster_param_values (cluster_key, name, value, count) VALUES (?, ?, ?, 1)",
330
+ [key, name, v],
331
+ )
332
+ end
333
+ end
334
+ end
335
+
254
336
  load_cluster(key)
255
337
  end
256
338
 
339
+ # Append a canonical IRI to the source-IRI log. Inside the same
340
+ # transaction as the event reducers, so the log and views stay
341
+ # consistent.
342
+ def record_observation(canonical)
343
+ @db.execute("INSERT INTO observed_iris (canonical) VALUES (?)", [canonical])
344
+ end
345
+
# Yield every canonical IRI from the source-IRI log, ordered by ascending
# id (the table comment describes id as monotonic, i.e. observation order).
#
# Without a block this returns an Enumerator rather than raising
# LocalJumpError, matching the in-memory backend, whose each_observed_iri
# delegates to Array#each.
def each_observed_iri
  return enum_for(:each_observed_iri) unless block_given?

  @db.execute("SELECT canonical FROM observed_iris ORDER BY id") do |row|
    yield row[0]
  end
end
351
+
352
+ def observed_iri_count
353
+ @db.get_first_value("SELECT COUNT(*) FROM observed_iris") || 0
354
+ end
355
+
356
+ # --- Activated recognizers --------------------------------------------
357
+
358
+ def record_activated_recognizer(dump)
359
+ @db.execute(<<~SQL, [dump["prefix"], dump["type"], dump.fetch("specificity", 1.0)])
360
+ INSERT INTO activated_recognizers (prefix, type, specificity) VALUES (?, ?, ?)
361
+ ON CONFLICT(prefix) DO UPDATE SET type = excluded.type, specificity = excluded.specificity
362
+ SQL
363
+ end
364
+
# Yield each activated recognizer as a Hash with string keys "prefix",
# "type" and "specificity", ordered by prefix.
#
# Without a block this returns an Enumerator rather than raising
# LocalJumpError, matching the in-memory backend, whose
# each_activated_recognizer delegates to Array#each.
def each_activated_recognizer
  return enum_for(:each_activated_recognizer) unless block_given?

  @db.execute("SELECT prefix, type, specificity FROM activated_recognizers ORDER BY prefix") do |row|
    yield({ "prefix" => row[0], "type" => row[1], "specificity" => row[2] })
  end
end
370
+
371
+ def activated_recognizer_count
372
+ @db.get_first_value("SELECT COUNT(*) FROM activated_recognizers") || 0
373
+ end
374
+
375
+ # Drop every materialized view without touching the source-IRI log.
376
+ # Corpus#reinfer calls this before replaying the log.
377
+ def clear_materialized_views
378
+ @db.execute_batch(<<~SQL)
379
+ DELETE FROM host_counts;
380
+ DELETE FROM path_length_counts;
381
+ DELETE FROM raw_shape_counts;
382
+ DELETE FROM fingerprint_counts;
383
+ DELETE FROM position_stats;
384
+ DELETE FROM position_values;
385
+ DELETE FROM position_types;
386
+ DELETE FROM clusters;
387
+ DELETE FROM cluster_examples;
388
+ DELETE FROM cluster_segments;
389
+ DELETE FROM cluster_params;
390
+ DELETE FROM cluster_param_values;
391
+ DELETE FROM cluster_param_types;
392
+ SQL
393
+ end
394
+
257
395
  # --- Reads ------------------------------------------------------------
258
396
 
259
397
  def host_counts
@@ -274,10 +412,13 @@ module Iriq
274
412
  rows_to_count_hash("fingerprint_counts", "shape")
275
413
  end
276
414
 
277
- def position_stats(host, prefix)
278
- host ||= ""
415
+ def position_stats(position)
416
+ host = position.host || ""
417
+ scope = position.scope.to_s
418
+ locator = position.locator
279
419
  total = @db.get_first_value(
280
- "SELECT total FROM position_stats WHERE host = ? AND prefix = ?", [host, prefix],
420
+ "SELECT total FROM position_stats WHERE host = ? AND scope = ? AND locator = ?",
421
+ [host, scope, locator],
281
422
  )
282
423
  return nil if total.nil?
283
424
 
@@ -286,13 +427,15 @@ module Iriq
286
427
 
287
428
  vc = Hash.new(0)
288
429
  @db.execute(
289
- "SELECT value, count FROM position_values WHERE host = ? AND prefix = ?", [host, prefix]
430
+ "SELECT value, count FROM position_values WHERE host = ? AND scope = ? AND locator = ?",
431
+ [host, scope, locator],
290
432
  ) { |r| vc[r[0]] = r[1] }
291
433
  stats.instance_variable_set(:@value_counts, vc)
292
434
 
293
435
  tc = Hash.new(0)
294
436
  @db.execute(
295
- "SELECT type, count FROM position_types WHERE host = ? AND prefix = ?", [host, prefix]
437
+ "SELECT type, count FROM position_types WHERE host = ? AND scope = ? AND locator = ?",
438
+ [host, scope, locator],
296
439
  ) { |r| tc[r[0].to_sym] = r[1] }
297
440
  stats.instance_variable_set(:@type_counts, tc)
298
441
 
@@ -301,10 +444,13 @@ module Iriq
301
444
 
302
445
  def each_position_stats
303
446
  seen = []
304
- @db.execute("SELECT DISTINCT host, prefix FROM position_stats ORDER BY ROWID") do |row|
447
+ @db.execute("SELECT DISTINCT host, scope, locator FROM position_stats ORDER BY ROWID") do |row|
305
448
  seen << row
306
449
  end
307
- seen.each { |host, prefix| yield [host, prefix], position_stats(host, prefix) }
450
+ seen.each do |host, scope, locator|
451
+ pos = Position.new(host: host, scope: scope.to_sym, locator: locator)
452
+ yield pos, position_stats(pos)
453
+ end
308
454
  end
309
455
 
310
456
  def clusters
@@ -319,6 +465,10 @@ module Iriq
319
465
  @db.get_first_value("SELECT COUNT(*) FROM clusters")
320
466
  end
321
467
 
468
+ def cluster_for(key)
469
+ load_cluster(key)
470
+ end
471
+
322
472
  private
323
473
 
324
474
  def upsert_shape(table, shape)
@@ -340,7 +490,10 @@ module Iriq
340
490
  )
341
491
  return nil unless row
342
492
 
343
- c = Cluster.new(key: row[0], host: row[1], scheme: row[2], shape: row[3])
493
+ c = Cluster.new(
494
+ key: row[0], host: row[1], scheme: row[2], shape: row[3],
495
+ max_values: @max_values_per_position,
496
+ )
344
497
  c.instance_variable_set(:@count, row[4])
345
498
 
346
499
  examples = []
@@ -360,6 +513,32 @@ module Iriq
360
513
  end
361
514
  c.instance_variable_set(:@segment_counts, seg_counts)
362
515
 
516
+ # Rebuild @param_stats from the three cluster_param_* tables.
517
+ params = {}
518
+ @db.execute(
519
+ "SELECT name, total FROM cluster_params WHERE cluster_key = ?", [key],
520
+ ) do |r|
521
+ # PositionStats.new already initializes empty Hash.new(0) for value
522
+ # and type counts; only @total needs filling here. The followup
523
+ # SELECTs below populate value/type rows in place.
524
+ stats = PositionStats.new(max_values: @max_values_per_position)
525
+ stats.instance_variable_set(:@total, r[1])
526
+ params[r[0]] = stats
527
+ end
528
+ @db.execute(
529
+ "SELECT name, value, count FROM cluster_param_values WHERE cluster_key = ?", [key],
530
+ ) do |r|
531
+ stats = params[r[0]] or next
532
+ stats.value_counts[r[1]] = r[2]
533
+ end
534
+ @db.execute(
535
+ "SELECT name, type, count FROM cluster_param_types WHERE cluster_key = ?", [key],
536
+ ) do |r|
537
+ stats = params[r[0]] or next
538
+ stats.type_counts[r[1].to_sym] = r[2]
539
+ end
540
+ c.instance_variable_set(:@param_stats, params)
541
+
363
542
  c
364
543
  end
365
544
  end
@@ -0,0 +1,56 @@
module Iriq
  # Recognizer built dynamically from a learned-prefix pattern.
  #
  # Used by Corpus#activate_proposal to promote a RecognizerProposal
  # into a live Recognizer that the classifier ensemble consults. Same
  # shape as the built-in Recognizers — uuid, date, integer — but the
  # pattern + type are supplied at construction instead of compiled-in.
  #
  #   r = SynthesizedRecognizer.new(prefix: "ghp_", type: :ghp)
  #   r.try("ghp_abcdef123")  # → {type: :ghp, confidence: 1.0, specificity: 1.0}
  #   r.try("ghp_")           # → nil (suffix must be at least one character)
  #
  # Pattern: `<prefix><[A-Za-z0-9]+>` — anchored, alphanumeric suffix
  # only. Matches the same shape PrefixUnderscoreId proposes from, so
  # round-trip (propose → activate → reinfer) reclassifies the same
  # values the proposal was derived from.
  #
  # Specificity defaults to SEMANTIC. A learned prefix is very specific
  # by construction (a distinctive literal prefix that recurred enough
  # to clear the proposal noise floor) — calling it as confident as a
  # built-in UUID is reasonable.
  class SynthesizedRecognizer < Recognizer
    attr_reader :prefix, :type, :specificity

    # Build a recognizer from a proposal, reading its `prefix` and
    # `suggested_type`. Specificity takes the default.
    def self.from_proposal(proposal)
      new(prefix: proposal.prefix, type: proposal.suggested_type)
    end

    # Rebuild a recognizer from the Hash produced by #to_dump. A missing
    # "specificity" key falls back to Specificity::SEMANTIC.
    def self.from_dump(h)
      new(
        prefix: h["prefix"],
        type: h["type"].to_sym,
        specificity: h.fetch("specificity", Specificity::SEMANTIC),
      )
    end

    # prefix      - non-empty String literal the segment must start with.
    # type        - Symbol reported for matching segments.
    # specificity - claim strength reported for matches.
    #
    # Raises ArgumentError if prefix is not a non-empty String or type is
    # not a Symbol.
    def initialize(prefix:, type:, specificity: Specificity::SEMANTIC)
      # Check the class as well as emptiness: the message promises a string,
      # and a Symbol/Integer prefix would otherwise slip through (or blow up
      # with NoMethodError) and only fail later inside #try.
      unless prefix.is_a?(String) && !prefix.empty?
        raise ArgumentError, "prefix must be a non-empty string"
      end
      raise ArgumentError, "type must be a symbol" unless type.is_a?(Symbol)

      # Keep a private frozen copy so later mutation of the caller's string
      # cannot make @prefix disagree with the pattern compiled from it.
      @prefix = prefix.dup.freeze
      @type = type
      @specificity = specificity
      @pattern = /\A#{Regexp.escape(@prefix)}[A-Za-z0-9]+\z/.freeze
    end

    # Returns {type:, confidence: 1.0, specificity:} when segment is the
    # prefix followed by one or more ASCII alphanumerics, nil otherwise
    # (including when segment is nil or not a String).
    def try(segment)
      return nil unless segment.is_a?(String)
      # start_with? is a cheap literal pre-check before running the regex.
      return nil unless segment.start_with?(@prefix) && @pattern.match?(segment)

      { type: @type, confidence: 1.0, specificity: @specificity }
    end

    # Serialize to a Hash with string keys; the type is stored as a String.
    def to_dump
      { "prefix" => @prefix, "type" => @type.to_s, "specificity" => @specificity }
    end
  end
end