iriq 0.1.0 → 0.30.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +87 -0
  3. data/CLAUDE.md +208 -0
  4. data/Gemfile.lock +8 -2
  5. data/Makefile +113 -0
  6. data/README.md +249 -270
  7. data/completions/_iriq +52 -0
  8. data/completions/iriq.bash +70 -0
  9. data/docs/ARCHITECTURE.md +223 -0
  10. data/docs/ROADMAP.md +190 -0
  11. data/iriq.gemspec +5 -4
  12. data/lib/iriq/cli.rb +402 -49
  13. data/lib/iriq/cluster.rb +304 -8
  14. data/lib/iriq/clusterer.rb +19 -44
  15. data/lib/iriq/corpus.rb +417 -81
  16. data/lib/iriq/cross_host_shape.rb +37 -0
  17. data/lib/iriq/event.rb +22 -0
  18. data/lib/iriq/evidence.rb +114 -0
  19. data/lib/iriq/explanation.rb +1 -1
  20. data/lib/iriq/normalizer.rb +71 -29
  21. data/lib/iriq/parser.rb +1 -1
  22. data/lib/iriq/path_shape.rb +30 -24
  23. data/lib/iriq/position.rb +75 -0
  24. data/lib/iriq/position_stats.rb +74 -8
  25. data/lib/iriq/recognizer.rb +54 -0
  26. data/lib/iriq/recognizer_proposal.rb +167 -0
  27. data/lib/iriq/recognizers/date.rb +53 -0
  28. data/lib/iriq/recognizers/integer.rb +37 -0
  29. data/lib/iriq/recognizers/uuid.rb +16 -0
  30. data/lib/iriq/reducer.rb +37 -0
  31. data/lib/iriq/registrable_domain.rb +56 -0
  32. data/lib/iriq/segment_classifier.rb +475 -23
  33. data/lib/iriq/segment_hints.rb +9 -0
  34. data/lib/iriq/shape.rb +106 -0
  35. data/lib/iriq/specificity.rb +35 -0
  36. data/lib/iriq/storage/json.rb +43 -0
  37. data/lib/iriq/storage/memory.rb +209 -0
  38. data/lib/iriq/storage/sqlite.rb +546 -0
  39. data/lib/iriq/storage.rb +35 -0
  40. data/lib/iriq/synthesized_recognizer.rb +56 -0
  41. data/lib/iriq/trace.rb +294 -0
  42. data/lib/iriq/version.rb +1 -1
  43. data/lib/iriq.rb +18 -0
  44. metadata +44 -8
  45. data/script/benchmark.rb +0 -81
  46. data/script/memory.rb +0 -121
@@ -0,0 +1,546 @@
1
+ require "sqlite3"
2
+
3
+ module Iriq
4
+ module Storage
5
+ # Sqlite is the incremental-write backend. Each observation translates
6
+ # to a handful of UPSERTs against a long-lived connection; nothing is
7
+ # materialized in memory beyond what reads explicitly ask for.
8
+ #
9
+ # WAL journaling lets multiple processes observe against the same file
10
+ # concurrently — the writer is serialized, readers are not blocked, and
11
+ # the existing `iriq --corpus c.db <url>` pattern works without a flock
12
+ # at the application layer.
13
+ class Sqlite
14
# Version stamp for the table layout below. setup! writes it into the
# meta table under 'schema_version' the first time a database is opened.
# NOTE(review): nothing visible in this file compares the stored value
# against this constant or migrates older layouts — confirm that is
# handled elsewhere.
SCHEMA_VERSION = 4

# DDL for every table this backend uses. setup! runs it through
# execute_batch; each statement is CREATE TABLE IF NOT EXISTS, so running
# it against an already-initialized database creates nothing new.
SCHEMA = <<~SQL.freeze
  CREATE TABLE IF NOT EXISTS meta (
  key TEXT PRIMARY KEY,
  value TEXT
  );
  CREATE TABLE IF NOT EXISTS host_counts (
  host TEXT PRIMARY KEY,
  count INTEGER NOT NULL
  );
  CREATE TABLE IF NOT EXISTS path_length_counts (
  length INTEGER PRIMARY KEY,
  count INTEGER NOT NULL
  );
  CREATE TABLE IF NOT EXISTS raw_shape_counts (
  shape TEXT PRIMARY KEY,
  count INTEGER NOT NULL
  );
  CREATE TABLE IF NOT EXISTS fingerprint_counts (
  shape TEXT PRIMARY KEY,
  count INTEGER NOT NULL
  );
  -- Position is (host, scope, locator). For scope='path' the locator
  -- is the typed prefix; for scope='query' it's the param name.
  -- Today only 'path' is observed here (query params live on the
  -- cluster_* tables) — scope is in the schema so future commits
  -- can fold query positions in without another migration.
  CREATE TABLE IF NOT EXISTS position_stats (
  host TEXT NOT NULL,
  scope TEXT NOT NULL,
  locator TEXT NOT NULL,
  total INTEGER NOT NULL DEFAULT 0,
  PRIMARY KEY (host, scope, locator)
  );
  CREATE TABLE IF NOT EXISTS position_values (
  host TEXT NOT NULL,
  scope TEXT NOT NULL,
  locator TEXT NOT NULL,
  value TEXT NOT NULL,
  count INTEGER NOT NULL,
  PRIMARY KEY (host, scope, locator, value)
  );
  CREATE TABLE IF NOT EXISTS position_types (
  host TEXT NOT NULL,
  scope TEXT NOT NULL,
  locator TEXT NOT NULL,
  type TEXT NOT NULL,
  count INTEGER NOT NULL,
  PRIMARY KEY (host, scope, locator, type)
  );
  CREATE TABLE IF NOT EXISTS clusters (
  key TEXT PRIMARY KEY,
  host TEXT,
  scheme TEXT,
  shape TEXT,
  count INTEGER NOT NULL DEFAULT 0,
  ord INTEGER NOT NULL
  );
  CREATE TABLE IF NOT EXISTS cluster_examples (
  cluster_key TEXT NOT NULL,
  position INTEGER NOT NULL,
  canonical TEXT NOT NULL,
  PRIMARY KEY (cluster_key, position)
  );
  CREATE TABLE IF NOT EXISTS cluster_segments (
  cluster_key TEXT NOT NULL,
  position INTEGER NOT NULL,
  value TEXT NOT NULL,
  count INTEGER NOT NULL,
  PRIMARY KEY (cluster_key, position, value)
  );
  CREATE TABLE IF NOT EXISTS cluster_params (
  cluster_key TEXT NOT NULL,
  name TEXT NOT NULL,
  total INTEGER NOT NULL DEFAULT 0,
  PRIMARY KEY (cluster_key, name)
  );
  CREATE TABLE IF NOT EXISTS cluster_param_values (
  cluster_key TEXT NOT NULL,
  name TEXT NOT NULL,
  value TEXT NOT NULL,
  count INTEGER NOT NULL,
  PRIMARY KEY (cluster_key, name, value)
  );
  CREATE TABLE IF NOT EXISTS cluster_param_types (
  cluster_key TEXT NOT NULL,
  name TEXT NOT NULL,
  type TEXT NOT NULL,
  count INTEGER NOT NULL,
  PRIMARY KEY (cluster_key, name, type)
  );
  -- Source-IRI log. The materialized views above are derived from
  -- this log via events + reducers. Corpus#reinfer drops the views
  -- and replays the log to rebuild them. id is monotonic so
  -- iteration order is observation order.
  CREATE TABLE IF NOT EXISTS observed_iris (
  id INTEGER PRIMARY KEY AUTOINCREMENT,
  canonical TEXT NOT NULL
  );
  -- Recognizers promoted from RecognizerProposal via
  -- Corpus#activate_proposal. Re-applied to the corpus's
  -- classifier on Corpus.open so a reopen picks up its learned
  -- patterns. Keyed by prefix; activating the same prefix twice
  -- is a no-op.
  CREATE TABLE IF NOT EXISTS activated_recognizers (
  prefix TEXT PRIMARY KEY,
  type TEXT NOT NULL,
  specificity REAL NOT NULL DEFAULT 1.0
  );
SQL
125
+
126
+ attr_reader :path, :max_values_per_position
127
+
128
# Builds a Sqlite storage for +path+ and prepares its schema.
#
# The instance is constructed first, then setup! is invoked on it; the
# instance itself (not setup!'s return value) is handed back.
def self.open(path, classifier: SegmentClassifier::DEFAULT,
              max_values_per_position: PositionStats::DEFAULT_MAX_VALUES)
  storage = new(
    path: path,
    classifier: classifier,
    max_values_per_position: max_values_per_position,
  )
  storage.setup!
  storage
end
132
+
133
# Opens the SQLite connection at +path+ and applies the connection
# PRAGMAs. No tables are created here; that is setup!'s job.
def initialize(path:, classifier: SegmentClassifier::DEFAULT,
               max_values_per_position: PositionStats::DEFAULT_MAX_VALUES)
  @path = path
  @classifier = classifier
  @max_values_per_position = max_values_per_position
  @in_batch = false
  @db = SQLite3::Database.new(path)

  # Applied strictly in this order. busy_timeout MUST be first: the
  # later PRAGMAs (journal_mode in particular) can themselves block on
  # the write lock under concurrent open, and with no busy_timeout set
  # they fail immediately with SQLITE_BUSY.
  [
    "PRAGMA busy_timeout = 30000",
    "PRAGMA journal_mode = WAL",
    "PRAGMA synchronous = NORMAL",
    "PRAGMA foreign_keys = ON",
  ].each { |pragma| @db.execute(pragma) }
end
149
+
150
# Creates the tables (idempotently) and reconciles the persisted
# metadata with this instance.
#
# On a fresh database the schema version and the value cap are written
# to meta. On an existing database the persisted cap replaces the one
# passed to the constructor, so every opener of a file agrees on it.
#
# The meta rows are written with INSERT OR IGNORE: two processes opening
# a brand-new file concurrently can both observe "no schema_version",
# and a plain INSERT would then fail the slower one with a primary-key
# violation. The slower opener instead adopts the winner's persisted cap.
#
# NOTE(review): the stored schema_version is only checked for presence
# here, never compared with SCHEMA_VERSION.
#
# Returns self.
def setup!
  @db.execute_batch(SCHEMA)
  existing = @db.get_first_value("SELECT value FROM meta WHERE key = 'schema_version'")

  lost_race = false
  if existing.nil?
    @db.execute("INSERT OR IGNORE INTO meta (key, value) VALUES ('schema_version', ?)",
                [SCHEMA_VERSION.to_s])
    @db.execute("INSERT OR IGNORE INTO meta (key, value) VALUES ('max_values_per_position', ?)",
                [@max_values_per_position.to_s])
    # Zero changes means another opener inserted the row first.
    lost_race = @db.changes.zero?
  end

  if existing || lost_race
    persisted = @db.get_first_value(
      "SELECT value FROM meta WHERE key = 'max_values_per_position'"
    )
    @max_values_per_position = (persisted || @max_values_per_position).to_i
  end
  self
end
164
+
165
# Runs the block inside a database transaction, yielding this storage.
#
# While an outer #batch is active this is a pure pass-through: the batch
# already wraps everything in one transaction. Errors raised by the block
# in that case propagate untouched — the rollback decision belongs to the
# batch. (Rolling back here would silently discard the whole outer batch
# even if the caller rescued the error and carried on.)
#
# Outside a batch: BEGIN, yield, COMMIT; on any StandardError the
# transaction is rolled back and the error re-raised.
def transaction
  return yield(self) if @in_batch

  begin
    @db.transaction
    yield self
    @db.commit
  rescue
    # Best-effort: the rollback itself may fail (e.g. BEGIN never
    # succeeded); the original error is the one worth surfacing.
    @db.rollback rescue nil
    raise
  end
end
177
+
178
# Wrap many observations in a single transaction. Cuts SQLite write
# overhead from O(observations) fsyncs to O(1).
#
# Nested calls simply yield — only the outermost batch owns the
# transaction. On any StandardError the transaction is rolled back and
# the error re-raised.
#
# The in-batch flag is always cleared on the way out, including when
# BEGIN itself fails (e.g. the database is busy). Otherwise a failed
# BEGIN would leave the flag set and turn every later #transaction and
# #batch call into a no-op with no real transaction behind it.
def batch
  return yield if @in_batch

  @in_batch = true
  begin
    @db.transaction
    yield
    @db.commit
  rescue
    # Best-effort rollback; surface the original error.
    @db.rollback rescue nil
    raise
  ensure
    @in_batch = false
  end
end
195
+
196
# Intentionally a no-op: writes are incremental UPSERTs that reach disk
# on commit, so there is nothing buffered to push out. Present so callers
# can flush explicitly; #close is what releases the connection.
def flush
  nil
end
199
+
200
# No-op kept for interface parity with the JSON backend; data is already
# persisted. The optional path argument is accepted and ignored.
def save(_path = nil)
  nil
end
203
+
204
# Releases the connection.
#
# Before closing, asks SQLite to checkpoint and truncate the WAL so the
# .db-wal sidecar doesn't grow unbounded across long-lived
# `iriq --corpus c.db` sessions. The checkpoint is best-effort: a failure
# there is swallowed and the connection is closed regardless.
def close
  begin
    @db.execute("PRAGMA wal_checkpoint(TRUNCATE)")
  rescue StandardError
    nil
  end
  @db.close
end
210
+
211
+ # --- Increments -------------------------------------------------------
212
+
213
# Adds one to the observation count for +host+, creating the row on first
# sight. A nil/false host is skipped entirely and nil is returned.
def increment_host(host)
  if host
    upsert = <<~SQL
      INSERT INTO host_counts (host, count) VALUES (?, 1)
      ON CONFLICT(host) DO UPDATE SET count = count + 1
    SQL
    @db.execute(upsert, host)
  end
end
221
+
222
# Adds one to the count of observations whose path has +length+
# segments, creating the row on first sight.
def increment_path_length(length)
  upsert = <<~SQL
    INSERT INTO path_length_counts (length, count) VALUES (?, 1)
    ON CONFLICT(length) DO UPDATE SET count = count + 1
  SQL
  @db.execute(upsert, length)
end
228
+
229
# Adds one to the count for +shape+ in the raw_shape_counts table
# (row created on first sight) by delegating to upsert_shape.
def increment_raw_shape(shape)
  upsert_shape("raw_shape_counts", shape)
end
232
+
233
# Adds one to the count for +shape+ in the fingerprint_counts table
# (row created on first sight) by delegating to upsert_shape.
def increment_fingerprint(shape)
  upsert_shape("fingerprint_counts", shape)
end
236
+
237
# Records one observation of +value+ (classified as +type+) at
# +position+.
#
# +position+ must respond to host, scope and locator; a nil host is
# stored as the empty string. Three tables are touched:
#   * position_stats  — total is always bumped;
#   * position_types  — per-type count is always bumped (unbounded);
#   * position_values — per-value count, capped at
#     max_values_per_position distinct values per position.
def observe_position(position, value, type)
  where = [position.host || "", position.scope.to_s, position.locator]

  @db.execute(<<~SQL, where)
    INSERT INTO position_stats (host, scope, locator, total) VALUES (?, ?, ?, 1)
    ON CONFLICT(host, scope, locator) DO UPDATE SET total = total + 1
  SQL

  @db.execute(<<~SQL, where + [type.to_s])
    INSERT INTO position_types (host, scope, locator, type, count) VALUES (?, ?, ?, ?, 1)
    ON CONFLICT(host, scope, locator, type) DO UPDATE SET count = count + 1
  SQL

  # The cap has to be enforced at insert time, so this is UPDATE-then-
  # maybe-INSERT rather than a single ON CONFLICT upsert.
  value_row = where + [value]
  @db.execute(<<~SQL, value_row)
    UPDATE position_values SET count = count + 1
    WHERE host = ? AND scope = ? AND locator = ? AND value = ?
  SQL
  return unless @db.changes.zero? # value already tracked; nothing more to do

  distinct_values = @db.get_first_value(
    "SELECT COUNT(*) FROM position_values WHERE host = ? AND scope = ? AND locator = ?",
    where,
  )
  return unless distinct_values < @max_values_per_position

  @db.execute(
    "INSERT INTO position_values (host, scope, locator, value, count) VALUES (?, ?, ?, ?, 1)",
    value_row,
  )
end
273
+
274
# Folds one observed identifier into the cluster identified by +key+ and
# returns the cluster as rebuilt from the database by load_cluster.
#
# +identifier+ must respond to canonical, path_segments and query_params
# (query_params may be nil). +host+, +scheme+ and +shape+ are stored only
# when the cluster row is first created; later calls just bump its count.
def add_to_cluster(key, host, scheme, shape, identifier)
  # Insert the cluster row if new (with a monotonic ord for stable
  # iteration), then bump its count.
  @db.execute(<<~SQL, [key, host, scheme, shape])
    INSERT INTO clusters (key, host, scheme, shape, count, ord)
    VALUES (?, ?, ?, ?, 1, (SELECT COALESCE(MAX(ord), 0) + 1 FROM clusters))
    ON CONFLICT(key) DO UPDATE SET count = count + 1
  SQL

  # Examples — capped at Cluster::MAX_EXAMPLES. The current example
  # count doubles as the next free position index.
  examples_count = @db.get_first_value(
    "SELECT COUNT(*) FROM cluster_examples WHERE cluster_key = ?", [key],
  )
  if examples_count < Cluster::MAX_EXAMPLES
    @db.execute(<<~SQL, [key, examples_count, identifier.canonical])
      INSERT INTO cluster_examples (cluster_key, position, canonical)
      VALUES (?, ?, ?)
    SQL
  end

  # Per-position segment counts — uncapped.
  identifier.path_segments.each_with_index do |seg, i|
    @db.execute(<<~SQL, [key, i, seg])
      INSERT INTO cluster_segments (cluster_key, position, value, count) VALUES (?, ?, ?, 1)
      ON CONFLICT(cluster_key, position, value) DO UPDATE SET count = count + 1
    SQL
  end

  # Per-param stats (presence + value cardinality + type) — mirrors the
  # in-memory Cluster#add path. Value table respects the same per-key
  # cap as position_values.
  (identifier.query_params || {}).each do |name, value|
    v = value.to_s
    # The param value is classified here, at write time, with the
    # classifier this storage was constructed with.
    type = @classifier.classify(v).to_s

    @db.execute(<<~SQL, [key, name])
      INSERT INTO cluster_params (cluster_key, name, total) VALUES (?, ?, 1)
      ON CONFLICT(cluster_key, name) DO UPDATE SET total = total + 1
    SQL
    @db.execute(<<~SQL, [key, name, type])
      INSERT INTO cluster_param_types (cluster_key, name, type, count) VALUES (?, ?, ?, 1)
      ON CONFLICT(cluster_key, name, type) DO UPDATE SET count = count + 1
    SQL

    # UPDATE first; only when no row was touched (a value not seen
    # before) consider inserting, and only while under the cap.
    @db.execute(<<~SQL, [key, name, v])
      UPDATE cluster_param_values SET count = count + 1
      WHERE cluster_key = ? AND name = ? AND value = ?
    SQL
    if @db.changes.zero?
      card = @db.get_first_value(
        "SELECT COUNT(*) FROM cluster_param_values WHERE cluster_key = ? AND name = ?",
        [key, name],
      )
      if card < @max_values_per_position
        @db.execute(
          "INSERT INTO cluster_param_values (cluster_key, name, value, count) VALUES (?, ?, ?, 1)",
          [key, name, v],
        )
      end
    end
  end

  # Re-read everything just written so the caller gets a Cluster that
  # reflects the persisted state.
  load_cluster(key)
end
338
+
339
# Append a canonical IRI to the source-IRI log (observed_iris). Inside
# the same transaction as the event reducers, so the log and views stay
# consistent. The row id is assigned by SQLite (AUTOINCREMENT), which is
# what each_observed_iri later orders by.
def record_observation(canonical)
  @db.execute("INSERT INTO observed_iris (canonical) VALUES (?)", [canonical])
end
345
+
346
# Yields every logged canonical IRI in observation order (ascending id).
#
# Without a block, returns an Enumerator over the same sequence instead
# of raising LocalJumpError part-way through the query.
def each_observed_iri
  return enum_for(:each_observed_iri) unless block_given?

  @db.execute("SELECT canonical FROM observed_iris ORDER BY id") do |row|
    yield row[0]
  end
end
351
+
352
# Number of rows in the source-IRI log (observed_iris); falls back to 0
# if the query yields no value.
def observed_iri_count
  @db.get_first_value("SELECT COUNT(*) FROM observed_iris") || 0
end
355
+
356
+ # --- Activated recognizers --------------------------------------------
357
+
358
# Persists an activated recognizer described by +dump+, a Hash with
# string keys "prefix", "type" and (optionally) "specificity", which
# defaults to 1.0. Recording a prefix that already exists overwrites its
# type and specificity.
def record_activated_recognizer(dump)
  prefix = dump["prefix"]
  type = dump["type"]
  specificity = dump.fetch("specificity", 1.0)

  @db.execute(<<~SQL, [prefix, type, specificity])
    INSERT INTO activated_recognizers (prefix, type, specificity) VALUES (?, ?, ?)
    ON CONFLICT(prefix) DO UPDATE SET type = excluded.type, specificity = excluded.specificity
  SQL
end
364
+
365
# Yields one Hash per persisted recognizer, ordered by prefix, with the
# string keys "prefix", "type" and "specificity" — the same shape
# record_activated_recognizer accepts.
#
# Without a block, returns an Enumerator over the same sequence instead
# of raising LocalJumpError part-way through the query.
def each_activated_recognizer
  return enum_for(:each_activated_recognizer) unless block_given?

  @db.execute("SELECT prefix, type, specificity FROM activated_recognizers ORDER BY prefix") do |row|
    yield({ "prefix" => row[0], "type" => row[1], "specificity" => row[2] })
  end
end
370
+
371
# Number of rows in activated_recognizers; falls back to 0 if the query
# yields no value.
def activated_recognizer_count
  @db.get_first_value("SELECT COUNT(*) FROM activated_recognizers") || 0
end
374
+
375
# Empties every derived table while leaving the source-IRI log
# (observed_iris), the activated recognizers and meta untouched.
# Corpus#reinfer calls this before replaying the log.
def clear_materialized_views
  derived_tables = %w[
    host_counts path_length_counts raw_shape_counts fingerprint_counts
    position_stats position_values position_types
    clusters cluster_examples cluster_segments
    cluster_params cluster_param_values cluster_param_types
  ]
  # One "DELETE FROM <table>;" line per table, in the order listed.
  @db.execute_batch(derived_tables.map { |table| "DELETE FROM #{table};\n" }.join)
end
394
+
395
+ # --- Reads ------------------------------------------------------------
396
+
397
# Returns a Hash of host => count read from host_counts. Built by
# rows_to_count_hash, so missing keys read as 0.
def host_counts
  rows_to_count_hash("host_counts", "host")
end
400
+
401
# Returns a Hash of path length => count read from path_length_counts.
# The Hash has a default of 0 for lengths that were never observed.
def path_length_counts
  counts = Hash.new(0)
  @db.execute("SELECT length, count FROM path_length_counts") do |length, count|
    counts[length] = count
  end
  counts
end
406
+
407
# Returns a Hash of shape => count read from raw_shape_counts. Built by
# rows_to_count_hash, so missing keys read as 0.
def raw_shape_counts
  rows_to_count_hash("raw_shape_counts", "shape")
end
410
+
411
# Returns a Hash of shape => count read from fingerprint_counts. Built
# by rows_to_count_hash, so missing keys read as 0.
def fingerprint_counts
  rows_to_count_hash("fingerprint_counts", "shape")
end
414
+
415
# Rebuilds the PositionStats for +position+ from the three position_*
# tables, or returns nil when the position has never been observed.
#
# +position+ must respond to host, scope and locator; a nil host is
# looked up as the empty string, matching how observe_position stores it.
# The stats object is populated by setting its @total, @value_counts and
# @type_counts directly; type names come back as symbols.
def position_stats(position)
  where = [position.host || "", position.scope.to_s, position.locator]

  total = @db.get_first_value(
    "SELECT total FROM position_stats WHERE host = ? AND scope = ? AND locator = ?",
    where,
  )
  return nil if total.nil?

  stats = PositionStats.new(max_values: @max_values_per_position)
  stats.instance_variable_set(:@total, total)

  value_counts = Hash.new(0)
  @db.execute(
    "SELECT value, count FROM position_values WHERE host = ? AND scope = ? AND locator = ?",
    where,
  ) { |value, count| value_counts[value] = count }
  stats.instance_variable_set(:@value_counts, value_counts)

  type_counts = Hash.new(0)
  @db.execute(
    "SELECT type, count FROM position_types WHERE host = ? AND scope = ? AND locator = ?",
    where,
  ) { |type, count| type_counts[type.to_sym] = count }
  stats.instance_variable_set(:@type_counts, type_counts)

  stats
end
444
+
445
# Yields (Position, PositionStats) for every tracked position, in ROWID
# order of position_stats.
#
# All position keys are read up front, before any block call, so the
# per-position lookups done by position_stats never run while the key
# query is still being iterated.
#
# Without a block, returns an Enumerator over the same pairs instead of
# raising LocalJumpError.
def each_position_stats
  return enum_for(:each_position_stats) unless block_given?

  keys = []
  @db.execute("SELECT DISTINCT host, scope, locator FROM position_stats ORDER BY ROWID") do |row|
    keys << row
  end
  keys.each do |host, scope, locator|
    pos = Position.new(host: host, scope: scope.to_sym, locator: locator)
    yield pos, position_stats(pos)
  end
end
455
+
456
# Returns every cluster as an Array, ordered by the ord column of the
# clusters table; each one is rebuilt by load_cluster.
def clusters
  key_rows = @db.execute("SELECT key FROM clusters ORDER BY ord")
  key_rows.map { |row| load_cluster(row[0]) }
end
463
+
464
# Number of rows in the clusters table.
def cluster_size
  @db.get_first_value("SELECT COUNT(*) FROM clusters")
end
467
+
468
# Public lookup of a single cluster by +key+; delegates to the private
# load_cluster, which returns nil when no such cluster row exists.
def cluster_for(key)
  load_cluster(key)
end
471
+
472
+ private
473
+
474
# Adds one to the count for +shape+ in +table+, creating the row on
# first sight. +table+ is interpolated straight into the SQL text, so it
# must only ever be one of the fixed table names passed by this class.
def upsert_shape(table, shape)
  statement = <<~SQL
    INSERT INTO #{table} (shape, count) VALUES (?, 1)
    ON CONFLICT(shape) DO UPDATE SET count = count + 1
  SQL
  @db.execute(statement, shape)
end
480
+
481
# Reads +table+ into a Hash of <key_col value> => count. The Hash has a
# default of 0 for absent keys. Both arguments are interpolated into the
# SQL text, so they must be fixed identifiers supplied by this class.
def rows_to_count_hash(table, key_col)
  counts = Hash.new(0)
  @db.execute("SELECT #{key_col}, count FROM #{table}") do |key, count|
    counts[key] = count
  end
  counts
end
486
+
487
# Rebuilds a Cluster for +key+ from the clusters / cluster_* tables, or
# returns nil when there is no clusters row for that key.
#
# The Cluster is constructed normally and then has its @count,
# @examples, @segment_counts and @param_stats set directly from the
# persisted rows. Examples are re-parsed from their canonical form with
# Parser.parse.
def load_cluster(key)
  header = @db.get_first_row(
    "SELECT key, host, scheme, shape, count FROM clusters WHERE key = ?", [key],
  )
  return nil unless header

  cluster_key, host, scheme, shape, count = header
  cluster = Cluster.new(
    key: cluster_key, host: host, scheme: scheme, shape: shape,
    max_values: @max_values_per_position,
  )
  cluster.instance_variable_set(:@count, count)

  examples = []
  @db.execute(
    "SELECT canonical FROM cluster_examples WHERE cluster_key = ? ORDER BY position", [key]
  ) { |row| examples << Parser.parse(row[0]) }
  cluster.instance_variable_set(:@examples, examples)

  # Array indexed by segment position; each slot is a value => count
  # Hash defaulting to 0.
  per_position = []
  @db.execute(
    "SELECT position, value, count FROM cluster_segments WHERE cluster_key = ? ORDER BY position",
    [key],
  ) do |position, value, seen|
    (per_position[position] ||= Hash.new(0))[value] = seen
  end
  cluster.instance_variable_set(:@segment_counts, per_position)

  # Query-param stats come from three tables: cluster_params provides
  # one PositionStats per param name (with its total); the value and
  # type tables then fill those objects in place.
  param_stats = {}
  @db.execute(
    "SELECT name, total FROM cluster_params WHERE cluster_key = ?", [key],
  ) do |name, total|
    # PositionStats.new already starts with empty value/type counts, so
    # only @total needs to be supplied here.
    stats = PositionStats.new(max_values: @max_values_per_position)
    stats.instance_variable_set(:@total, total)
    param_stats[name] = stats
  end
  @db.execute(
    "SELECT name, value, count FROM cluster_param_values WHERE cluster_key = ?", [key],
  ) do |name, value, seen|
    stats = param_stats[name]
    next unless stats # value row with no matching cluster_params row

    stats.value_counts[value] = seen
  end
  @db.execute(
    "SELECT name, type, count FROM cluster_param_types WHERE cluster_key = ?", [key],
  ) do |name, type, seen|
    stats = param_stats[name]
    next unless stats # type row with no matching cluster_params row

    stats.type_counts[type.to_sym] = seen
  end
  cluster.instance_variable_set(:@param_stats, param_stats)

  cluster
end
544
+ end
545
+ end
546
+ end
@@ -0,0 +1,35 @@
1
module Iriq
  # Storage is the persistence layer behind a Corpus: it owns every
  # counter and per-(host, prefix) frequency map, and the Corpus class
  # delegates its state to it.
  #
  # Backends:
  #
  #   Storage::Memory — purely in-memory; the original behavior.
  #   Storage::Json   — the Memory backend plus load/save against a JSON file.
  #   Storage::Sqlite — incremental UPSERTs against a SQLite database.
  #
  # Callers pick a backend implicitly through the file extension:
  # `.db`/`.sqlite`/`.sqlite3` selects Sqlite; `.json` — or any other
  # extension — selects Json.
  module Storage
    SQLITE_EXTS = %w[.db .sqlite .sqlite3].freeze

    module_function

    # Opens (or creates) the storage at `path`, choosing the backend from
    # the (case-insensitive) file extension. A nil `path` yields a Memory
    # backend. The Sqlite and Json backends are required lazily, only
    # when actually selected.
    def open(path, classifier: SegmentClassifier::DEFAULT,
             max_values_per_position: PositionStats::DEFAULT_MAX_VALUES)
      options = { classifier: classifier, max_values_per_position: max_values_per_position }
      return Memory.new(**options) if path.nil?

      extension = File.extname(path).downcase
      unless SQLITE_EXTS.include?(extension)
        require "iriq/storage/json"
        return Json.open(path, **options)
      end

      require "iriq/storage/sqlite"
      Sqlite.open(path, **options)
    end
  end
end
34
+
35
+ require "iriq/storage/memory"
@@ -0,0 +1,56 @@
1
module Iriq
  # A Recognizer whose pattern and type are supplied at construction
  # rather than compiled in.
  #
  # Corpus#activate_proposal uses it to promote a RecognizerProposal into
  # a live Recognizer consulted by the classifier ensemble. It has the
  # same shape as the built-in Recognizers (uuid, date, integer).
  #
  #   r = SynthesizedRecognizer.new(prefix: "ghp_", type: :ghp)
  #   r.try("ghp_abcdef123")  # → {type: :ghp, confidence: 1.0, specificity: 1.0}
  #
  # Matching rule: the whole segment must be the literal prefix followed
  # by one or more ASCII letters/digits (`<prefix><[A-Za-z0-9]+>`,
  # anchored at both ends). This is the same shape PrefixUnderscoreId
  # proposes from, so a propose → activate → reinfer round trip
  # reclassifies exactly the values the proposal came from.
  #
  # Specificity defaults to SEMANTIC: a learned prefix is a distinctive
  # literal that recurred often enough to clear the proposal noise floor,
  # so treating it as confidently as a built-in UUID is reasonable.
  class SynthesizedRecognizer < Recognizer
    attr_reader :prefix, :type, :specificity

    class << self
      # Builds a recognizer from a proposal's prefix and suggested_type,
      # with the default specificity.
      def from_proposal(proposal)
        new(prefix: proposal.prefix, type: proposal.suggested_type)
      end

      # Inverse of #to_dump: rebuilds a recognizer from a string-keyed
      # Hash. A missing "specificity" falls back to Specificity::SEMANTIC.
      def from_dump(h)
        specificity = h.fetch("specificity", Specificity::SEMANTIC)
        new(prefix: h["prefix"], type: h["type"].to_sym, specificity: specificity)
      end
    end

    # Raises ArgumentError when prefix is nil or empty, or when type is
    # not a Symbol.
    def initialize(prefix:, type:, specificity: Specificity::SEMANTIC)
      if prefix.nil? || prefix.empty?
        raise ArgumentError, "prefix must be a non-empty string"
      end
      unless type.is_a?(Symbol)
        raise ArgumentError, "type must be a symbol"
      end

      @prefix = prefix
      @type = type
      @specificity = specificity
      @pattern = /\A#{Regexp.escape(prefix)}[A-Za-z0-9]+\z/.freeze
    end

    # Returns a {type:, confidence:, specificity:} Hash when +segment+
    # matches, nil otherwise. The literal-prefix check runs before the
    # regexp match.
    def try(segment)
      matched = segment.start_with?(@prefix) && @pattern.match?(segment)
      return nil unless matched

      { type: @type, confidence: 1.0, specificity: @specificity }
    end

    # String-keyed Hash form of this recognizer; the type is stringified.
    # Accepted back by .from_dump.
    def to_dump
      {
        "prefix" => @prefix,
        "type" => @type.to_s,
        "specificity" => @specificity,
      }
    end
  end
end