iriq 0.1.0 → 0.30.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +87 -0
- data/CLAUDE.md +208 -0
- data/Gemfile.lock +8 -2
- data/Makefile +113 -0
- data/README.md +249 -270
- data/completions/_iriq +52 -0
- data/completions/iriq.bash +70 -0
- data/docs/ARCHITECTURE.md +223 -0
- data/docs/ROADMAP.md +190 -0
- data/iriq.gemspec +5 -4
- data/lib/iriq/cli.rb +402 -49
- data/lib/iriq/cluster.rb +304 -8
- data/lib/iriq/clusterer.rb +19 -44
- data/lib/iriq/corpus.rb +417 -81
- data/lib/iriq/cross_host_shape.rb +37 -0
- data/lib/iriq/event.rb +22 -0
- data/lib/iriq/evidence.rb +114 -0
- data/lib/iriq/explanation.rb +1 -1
- data/lib/iriq/normalizer.rb +71 -29
- data/lib/iriq/parser.rb +1 -1
- data/lib/iriq/path_shape.rb +30 -24
- data/lib/iriq/position.rb +75 -0
- data/lib/iriq/position_stats.rb +74 -8
- data/lib/iriq/recognizer.rb +54 -0
- data/lib/iriq/recognizer_proposal.rb +167 -0
- data/lib/iriq/recognizers/date.rb +53 -0
- data/lib/iriq/recognizers/integer.rb +37 -0
- data/lib/iriq/recognizers/uuid.rb +16 -0
- data/lib/iriq/reducer.rb +37 -0
- data/lib/iriq/registrable_domain.rb +56 -0
- data/lib/iriq/segment_classifier.rb +475 -23
- data/lib/iriq/segment_hints.rb +9 -0
- data/lib/iriq/shape.rb +106 -0
- data/lib/iriq/specificity.rb +35 -0
- data/lib/iriq/storage/json.rb +43 -0
- data/lib/iriq/storage/memory.rb +209 -0
- data/lib/iriq/storage/sqlite.rb +546 -0
- data/lib/iriq/storage.rb +35 -0
- data/lib/iriq/synthesized_recognizer.rb +56 -0
- data/lib/iriq/trace.rb +294 -0
- data/lib/iriq/version.rb +1 -1
- data/lib/iriq.rb +18 -0
- metadata +44 -8
- data/script/benchmark.rb +0 -81
- data/script/memory.rb +0 -121
|
@@ -0,0 +1,546 @@
|
|
|
1
|
+
require "sqlite3"
|
|
2
|
+
|
|
3
|
+
module Iriq
|
|
4
|
+
module Storage
|
|
5
|
+
# Sqlite is the incremental-write backend. Each observation translates
|
|
6
|
+
# to a handful of UPSERTs against a long-lived connection; nothing is
|
|
7
|
+
# materialized in memory beyond what reads explicitly ask for.
|
|
8
|
+
#
|
|
9
|
+
# WAL journaling lets multiple processes observe against the same file
|
|
10
|
+
# concurrently — the writer is serialized, readers are not blocked, and
|
|
11
|
+
# the existing `iriq --corpus c.db <url>` pattern works without a flock
|
|
12
|
+
# at the application layer.
|
|
13
|
+
class Sqlite
|
|
14
|
+
SCHEMA_VERSION = 4
|
|
15
|
+
|
|
16
|
+
SCHEMA = <<~SQL.freeze
|
|
17
|
+
CREATE TABLE IF NOT EXISTS meta (
|
|
18
|
+
key TEXT PRIMARY KEY,
|
|
19
|
+
value TEXT
|
|
20
|
+
);
|
|
21
|
+
CREATE TABLE IF NOT EXISTS host_counts (
|
|
22
|
+
host TEXT PRIMARY KEY,
|
|
23
|
+
count INTEGER NOT NULL
|
|
24
|
+
);
|
|
25
|
+
CREATE TABLE IF NOT EXISTS path_length_counts (
|
|
26
|
+
length INTEGER PRIMARY KEY,
|
|
27
|
+
count INTEGER NOT NULL
|
|
28
|
+
);
|
|
29
|
+
CREATE TABLE IF NOT EXISTS raw_shape_counts (
|
|
30
|
+
shape TEXT PRIMARY KEY,
|
|
31
|
+
count INTEGER NOT NULL
|
|
32
|
+
);
|
|
33
|
+
CREATE TABLE IF NOT EXISTS fingerprint_counts (
|
|
34
|
+
shape TEXT PRIMARY KEY,
|
|
35
|
+
count INTEGER NOT NULL
|
|
36
|
+
);
|
|
37
|
+
-- Position is (host, scope, locator). For scope='path' the locator
|
|
38
|
+
-- is the typed prefix; for scope='query' it's the param name.
|
|
39
|
+
-- Today only 'path' is observed here (query params live on the
|
|
40
|
+
-- cluster_* tables) — scope is in the schema so future commits
|
|
41
|
+
-- can fold query positions in without another migration.
|
|
42
|
+
CREATE TABLE IF NOT EXISTS position_stats (
|
|
43
|
+
host TEXT NOT NULL,
|
|
44
|
+
scope TEXT NOT NULL,
|
|
45
|
+
locator TEXT NOT NULL,
|
|
46
|
+
total INTEGER NOT NULL DEFAULT 0,
|
|
47
|
+
PRIMARY KEY (host, scope, locator)
|
|
48
|
+
);
|
|
49
|
+
CREATE TABLE IF NOT EXISTS position_values (
|
|
50
|
+
host TEXT NOT NULL,
|
|
51
|
+
scope TEXT NOT NULL,
|
|
52
|
+
locator TEXT NOT NULL,
|
|
53
|
+
value TEXT NOT NULL,
|
|
54
|
+
count INTEGER NOT NULL,
|
|
55
|
+
PRIMARY KEY (host, scope, locator, value)
|
|
56
|
+
);
|
|
57
|
+
CREATE TABLE IF NOT EXISTS position_types (
|
|
58
|
+
host TEXT NOT NULL,
|
|
59
|
+
scope TEXT NOT NULL,
|
|
60
|
+
locator TEXT NOT NULL,
|
|
61
|
+
type TEXT NOT NULL,
|
|
62
|
+
count INTEGER NOT NULL,
|
|
63
|
+
PRIMARY KEY (host, scope, locator, type)
|
|
64
|
+
);
|
|
65
|
+
CREATE TABLE IF NOT EXISTS clusters (
|
|
66
|
+
key TEXT PRIMARY KEY,
|
|
67
|
+
host TEXT,
|
|
68
|
+
scheme TEXT,
|
|
69
|
+
shape TEXT,
|
|
70
|
+
count INTEGER NOT NULL DEFAULT 0,
|
|
71
|
+
ord INTEGER NOT NULL
|
|
72
|
+
);
|
|
73
|
+
CREATE TABLE IF NOT EXISTS cluster_examples (
|
|
74
|
+
cluster_key TEXT NOT NULL,
|
|
75
|
+
position INTEGER NOT NULL,
|
|
76
|
+
canonical TEXT NOT NULL,
|
|
77
|
+
PRIMARY KEY (cluster_key, position)
|
|
78
|
+
);
|
|
79
|
+
CREATE TABLE IF NOT EXISTS cluster_segments (
|
|
80
|
+
cluster_key TEXT NOT NULL,
|
|
81
|
+
position INTEGER NOT NULL,
|
|
82
|
+
value TEXT NOT NULL,
|
|
83
|
+
count INTEGER NOT NULL,
|
|
84
|
+
PRIMARY KEY (cluster_key, position, value)
|
|
85
|
+
);
|
|
86
|
+
CREATE TABLE IF NOT EXISTS cluster_params (
|
|
87
|
+
cluster_key TEXT NOT NULL,
|
|
88
|
+
name TEXT NOT NULL,
|
|
89
|
+
total INTEGER NOT NULL DEFAULT 0,
|
|
90
|
+
PRIMARY KEY (cluster_key, name)
|
|
91
|
+
);
|
|
92
|
+
CREATE TABLE IF NOT EXISTS cluster_param_values (
|
|
93
|
+
cluster_key TEXT NOT NULL,
|
|
94
|
+
name TEXT NOT NULL,
|
|
95
|
+
value TEXT NOT NULL,
|
|
96
|
+
count INTEGER NOT NULL,
|
|
97
|
+
PRIMARY KEY (cluster_key, name, value)
|
|
98
|
+
);
|
|
99
|
+
CREATE TABLE IF NOT EXISTS cluster_param_types (
|
|
100
|
+
cluster_key TEXT NOT NULL,
|
|
101
|
+
name TEXT NOT NULL,
|
|
102
|
+
type TEXT NOT NULL,
|
|
103
|
+
count INTEGER NOT NULL,
|
|
104
|
+
PRIMARY KEY (cluster_key, name, type)
|
|
105
|
+
);
|
|
106
|
+
-- Source-IRI log. The materialized views above are derived from
|
|
107
|
+
-- this log via events + reducers. Corpus#reinfer drops the views
|
|
108
|
+
-- and replays the log to rebuild them. id is monotonic so
|
|
109
|
+
-- iteration order is observation order.
|
|
110
|
+
CREATE TABLE IF NOT EXISTS observed_iris (
|
|
111
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
112
|
+
canonical TEXT NOT NULL
|
|
113
|
+
);
|
|
114
|
+
-- Recognizers promoted from RecognizerProposal via
|
|
115
|
+
-- Corpus#activate_proposal. Re-applied to the corpus's
|
|
116
|
+
-- classifier on Corpus.open so a reopen picks up its learned
|
|
117
|
+
-- patterns. Keyed by prefix; activating the same prefix twice
|
|
118
|
+
-- is a no-op.
|
|
119
|
+
CREATE TABLE IF NOT EXISTS activated_recognizers (
|
|
120
|
+
prefix TEXT PRIMARY KEY,
|
|
121
|
+
type TEXT NOT NULL,
|
|
122
|
+
specificity REAL NOT NULL DEFAULT 1.0
|
|
123
|
+
);
|
|
124
|
+
SQL
|
|
125
|
+
|
|
126
|
+
attr_reader :path, :max_values_per_position
|
|
127
|
+
|
|
128
|
+
# Build a storage for the database at `path` and run `setup!` on it.
#
# Returns the new storage instance (not the result of `setup!`).
def self.open(path, classifier: SegmentClassifier::DEFAULT,
              max_values_per_position: PositionStats::DEFAULT_MAX_VALUES)
  storage = new(path: path, classifier: classifier, max_values_per_position: max_values_per_position)
  storage.setup!
  storage
end
|
|
132
|
+
|
|
133
|
+
# Open a connection to the SQLite file at `path` and configure it.
# Schema creation is not done here; see `setup!`.
def initialize(path:, classifier: SegmentClassifier::DEFAULT,
               max_values_per_position: PositionStats::DEFAULT_MAX_VALUES)
  @path = path
  @classifier = classifier
  @max_values_per_position = max_values_per_position
  @in_batch = false
  @db = SQLite3::Database.new(path)

  # Order matters. busy_timeout MUST be applied first: the other PRAGMAs
  # (journal_mode in particular) can themselves block on the write lock
  # under concurrent open, and without a busy timeout they fail
  # immediately with SQLITE_BUSY.
  [
    "PRAGMA busy_timeout = 30000",
    "PRAGMA journal_mode = WAL",
    "PRAGMA synchronous = NORMAL",
    "PRAGMA foreign_keys = ON",
  ].each { |pragma| @db.execute(pragma) }
end
|
|
149
|
+
|
|
150
|
+
# Create the schema if needed and reconcile the stored metadata.
#
# On a fresh database the schema version and this instance's
# max_values_per_position are written to `meta`. The cap stored in the
# database then becomes the effective one, so every process that opens
# the same file agrees on it.
#
# Returns self.
def setup!
  @db.execute_batch(SCHEMA)
  existing = @db.get_first_value("SELECT value FROM meta WHERE key = 'schema_version'")
  if existing.nil?
    # INSERT OR IGNORE rather than a plain INSERT: another process opening
    # the same fresh file can write these rows between our SELECT and our
    # INSERT, and a plain INSERT would then fail on the primary key.
    @db.execute("INSERT OR IGNORE INTO meta (key, value) VALUES ('schema_version', ?)",
                [SCHEMA_VERSION.to_s])
    @db.execute("INSERT OR IGNORE INTO meta (key, value) VALUES ('max_values_per_position', ?)",
                [@max_values_per_position.to_s])
  end

  # Always adopt the stored cap (ours if we just wrote it, the winner's
  # if we lost the race); fall back to the configured value if absent.
  stored = @db.get_first_value("SELECT value FROM meta WHERE key = 'max_values_per_position'")
  @max_values_per_position = (stored || @max_values_per_position).to_i
  self
end
|
|
164
|
+
|
|
165
|
+
# Run the block inside a database transaction, yielding self.
#
# Inside an outer `batch` this is a pure pass-through: the block runs
# with no BEGIN/COMMIT/ROLLBACK of its own, because the batch owns the
# single enclosing transaction. An error raised by the block propagates
# to the batch, which decides whether to roll back.
#
# Outside a batch the block is wrapped in BEGIN/COMMIT; on any
# StandardError the transaction is rolled back (best effort) and the
# error is re-raised.
def transaction
  return yield(self) if @in_batch

  begin
    @db.transaction
    yield self
    @db.commit
  rescue StandardError
    begin
      @db.rollback
    rescue StandardError
      # Best effort: there may be no active transaction to roll back
      # (e.g. BEGIN itself failed). The original error matters more.
      nil
    end
    raise
  end
end
|
|
177
|
+
|
|
178
|
+
# Wrap many observations in a single transaction. Cuts SQLite write
|
|
179
|
+
# overhead from O(observations) fsyncs to O(1).
|
|
180
|
+
# Run the block with every nested `transaction` call folded into one
# enclosing database transaction. Re-entrant: a nested `batch` just
# yields.
#
# On a StandardError the transaction is rolled back (best effort) and
# the error re-raised. The in-batch flag is always reset, including when
# BEGIN itself fails, so a failed batch cannot leave later `transaction`
# calls running without a transaction.
def batch
  return yield if @in_batch

  @in_batch = true
  begin
    @db.transaction
    yield
    @db.commit
  rescue StandardError
    begin
      @db.rollback
    rescue StandardError
      # Best effort: nothing to roll back if BEGIN never succeeded.
      nil
    end
    raise
  ensure
    @in_batch = false
  end
end
|
|
195
|
+
|
|
196
|
+
# Saving is automatic — incremental UPSERTs hit disk on commit. flush
|
|
197
|
+
# makes that explicit; close releases the connection.
|
|
198
|
+
# No-op: there is nothing buffered to write out. Returns nil.
def flush
  nil
end
|
|
199
|
+
|
|
200
|
+
# No-op kept for parity with the JSON backend; data is already
# persisted. The optional path argument is ignored. Returns nil.
def save(_path = nil)
  nil
end
|
|
203
|
+
|
|
204
|
+
# Release the connection.
def close
  # Checkpoint + truncate the WAL first so the .db-wal sidecar doesn't
  # grow unbounded across long-lived `iriq --corpus c.db` sessions. A
  # failing checkpoint is deliberately ignored; closing still proceeds.
  begin
    @db.execute("PRAGMA wal_checkpoint(TRUNCATE)")
  rescue StandardError
    nil
  end
  @db.close
end
|
|
210
|
+
|
|
211
|
+
# --- Increments -------------------------------------------------------
|
|
212
|
+
|
|
213
|
+
# Add one to the stored count for `host`, creating the row on first
# sight. Does nothing (and returns nil) when `host` is nil or false.
def increment_host(host)
  if host
    statement = <<~SQL
      INSERT INTO host_counts (host, count) VALUES (?, 1)
      ON CONFLICT(host) DO UPDATE SET count = count + 1
    SQL
    @db.execute(statement, host)
  end
end
|
|
221
|
+
|
|
222
|
+
# Add one to the stored count for `length`, creating the row on first
# sight.
def increment_path_length(length)
  statement = <<~SQL
    INSERT INTO path_length_counts (length, count) VALUES (?, 1)
    ON CONFLICT(length) DO UPDATE SET count = count + 1
  SQL
  @db.execute(statement, length)
end
|
|
228
|
+
|
|
229
|
+
# Add one to the count for `shape` in the raw_shape_counts table.
def increment_raw_shape(shape)
  table = "raw_shape_counts"
  upsert_shape(table, shape)
end
|
|
232
|
+
|
|
233
|
+
# Add one to the count for `shape` in the fingerprint_counts table.
def increment_fingerprint(shape)
  table = "fingerprint_counts"
  upsert_shape(table, shape)
end
|
|
236
|
+
|
|
237
|
+
def observe_position(position, value, type)
|
|
238
|
+
host = position.host || ""
|
|
239
|
+
scope = position.scope.to_s
|
|
240
|
+
locator = position.locator
|
|
241
|
+
@db.execute(<<~SQL, [host, scope, locator])
|
|
242
|
+
INSERT INTO position_stats (host, scope, locator, total) VALUES (?, ?, ?, 1)
|
|
243
|
+
ON CONFLICT(host, scope, locator) DO UPDATE SET total = total + 1
|
|
244
|
+
SQL
|
|
245
|
+
|
|
246
|
+
# Type counts are unbounded — always upsert.
|
|
247
|
+
@db.execute(<<~SQL, [host, scope, locator, type.to_s])
|
|
248
|
+
INSERT INTO position_types (host, scope, locator, type, count) VALUES (?, ?, ?, ?, 1)
|
|
249
|
+
ON CONFLICT(host, scope, locator, type) DO UPDATE SET count = count + 1
|
|
250
|
+
SQL
|
|
251
|
+
|
|
252
|
+
# Value counts are capped at max_values_per_position. If the value
|
|
253
|
+
# already exists, increment it; otherwise insert only when
|
|
254
|
+
# cardinality is below the cap. Two-step rather than ON CONFLICT
|
|
255
|
+
# because we need to enforce the cap on insert.
|
|
256
|
+
@db.execute(<<~SQL, [host, scope, locator, value])
|
|
257
|
+
UPDATE position_values SET count = count + 1
|
|
258
|
+
WHERE host = ? AND scope = ? AND locator = ? AND value = ?
|
|
259
|
+
SQL
|
|
260
|
+
if @db.changes.zero?
|
|
261
|
+
card = @db.get_first_value(
|
|
262
|
+
"SELECT COUNT(*) FROM position_values WHERE host = ? AND scope = ? AND locator = ?",
|
|
263
|
+
[host, scope, locator],
|
|
264
|
+
)
|
|
265
|
+
if card < @max_values_per_position
|
|
266
|
+
@db.execute(
|
|
267
|
+
"INSERT INTO position_values (host, scope, locator, value, count) VALUES (?, ?, ?, ?, 1)",
|
|
268
|
+
[host, scope, locator, value],
|
|
269
|
+
)
|
|
270
|
+
end
|
|
271
|
+
end
|
|
272
|
+
end
|
|
273
|
+
|
|
274
|
+
# Fold one observed identifier into the cluster stored under `key` and
# return the freshly loaded cluster.
#
# Writes, in order: the cluster row (created with the next `ord` on
# first sight, otherwise its count is bumped), an example while fewer
# than Cluster::MAX_EXAMPLES are stored, one count per path segment
# position, and per-query-param presence / type / value counts. New
# param values are only inserted below max_values_per_position.
def add_to_cluster(key, host, scheme, shape, identifier)
  @db.execute(<<~SQL, [key, host, scheme, shape])
    INSERT INTO clusters (key, host, scheme, shape, count, ord)
    VALUES (?, ?, ?, ?, 1, (SELECT COALESCE(MAX(ord), 0) + 1 FROM clusters))
    ON CONFLICT(key) DO UPDATE SET count = count + 1
  SQL

  # The current example count doubles as the next example's position.
  stored_examples = @db.get_first_value(
    "SELECT COUNT(*) FROM cluster_examples WHERE cluster_key = ?", [key],
  )
  if stored_examples < Cluster::MAX_EXAMPLES
    @db.execute(<<~SQL, [key, stored_examples, identifier.canonical])
      INSERT INTO cluster_examples (cluster_key, position, canonical)
      VALUES (?, ?, ?)
    SQL
  end

  # Segment counts are uncapped.
  identifier.path_segments.each_with_index do |segment, index|
    @db.execute(<<~SQL, [key, index, segment])
      INSERT INTO cluster_segments (cluster_key, position, value, count) VALUES (?, ?, ?, 1)
      ON CONFLICT(cluster_key, position, value) DO UPDATE SET count = count + 1
    SQL
  end

  query = identifier.query_params || {}
  query.each do |name, raw|
    text = raw.to_s
    kind = @classifier.classify(text).to_s

    @db.execute(<<~SQL, [key, name])
      INSERT INTO cluster_params (cluster_key, name, total) VALUES (?, ?, 1)
      ON CONFLICT(cluster_key, name) DO UPDATE SET total = total + 1
    SQL
    @db.execute(<<~SQL, [key, name, kind])
      INSERT INTO cluster_param_types (cluster_key, name, type, count) VALUES (?, ?, ?, 1)
      ON CONFLICT(cluster_key, name, type) DO UPDATE SET count = count + 1
    SQL

    # Capped like position values: increment if tracked, otherwise
    # insert only while there is room. `changes` must be read right
    # after the UPDATE.
    @db.execute(<<~SQL, [key, name, text])
      UPDATE cluster_param_values SET count = count + 1
      WHERE cluster_key = ? AND name = ? AND value = ?
    SQL
    next unless @db.changes.zero?

    distinct = @db.get_first_value(
      "SELECT COUNT(*) FROM cluster_param_values WHERE cluster_key = ? AND name = ?",
      [key, name],
    )
    next unless distinct < @max_values_per_position

    @db.execute(
      "INSERT INTO cluster_param_values (cluster_key, name, value, count) VALUES (?, ?, ?, 1)",
      [key, name, text],
    )
  end

  load_cluster(key)
end
|
|
338
|
+
|
|
339
|
+
# Append a canonical IRI to the source-IRI log. Inside the same
|
|
340
|
+
# transaction as the event reducers, so the log and views stay
|
|
341
|
+
# consistent.
|
|
342
|
+
# Insert `canonical` as a new row of the observed_iris table.
def record_observation(canonical)
  insert_sql = "INSERT INTO observed_iris (canonical) VALUES (?)"
  @db.execute(insert_sql, [canonical])
end
|
|
345
|
+
|
|
346
|
+
# Yield every logged canonical IRI, ordered by row id.
#
# Returns an Enumerator when called without a block, instead of raising
# LocalJumpError on the first row.
def each_observed_iri
  return enum_for(:each_observed_iri) unless block_given?

  @db.execute("SELECT canonical FROM observed_iris ORDER BY id") do |row|
    yield row[0]
  end
end
|
|
351
|
+
|
|
352
|
+
# Number of rows in observed_iris; 0 when the query yields no value.
def observed_iri_count
  rows = @db.get_first_value("SELECT COUNT(*) FROM observed_iris")
  rows || 0
end
|
|
355
|
+
|
|
356
|
+
# --- Activated recognizers --------------------------------------------
|
|
357
|
+
|
|
358
|
+
# Upsert a recognizer dump keyed by its "prefix". `dump` is a Hash with
# string keys "prefix" and "type" and an optional "specificity"
# (1.0 when absent). An existing row for the prefix has its type and
# specificity overwritten.
def record_activated_recognizer(dump)
  row = [dump["prefix"], dump["type"], dump.fetch("specificity", 1.0)]
  @db.execute(<<~SQL, row)
    INSERT INTO activated_recognizers (prefix, type, specificity) VALUES (?, ?, ?)
    ON CONFLICT(prefix) DO UPDATE SET type = excluded.type, specificity = excluded.specificity
  SQL
end
|
|
364
|
+
|
|
365
|
+
# Yield each stored recognizer as a Hash with string keys "prefix",
# "type" and "specificity", ordered by prefix.
#
# Returns an Enumerator when called without a block, instead of raising
# LocalJumpError on the first row.
def each_activated_recognizer
  return enum_for(:each_activated_recognizer) unless block_given?

  @db.execute("SELECT prefix, type, specificity FROM activated_recognizers ORDER BY prefix") do |row|
    yield({ "prefix" => row[0], "type" => row[1], "specificity" => row[2] })
  end
end
|
|
370
|
+
|
|
371
|
+
# Number of rows in activated_recognizers; 0 when the query yields no
# value.
def activated_recognizer_count
  rows = @db.get_first_value("SELECT COUNT(*) FROM activated_recognizers")
  rows || 0
end
|
|
374
|
+
|
|
375
|
+
# Drop every materialized view without touching the source-IRI log.
|
|
376
|
+
# Corpus#reinfer calls this before replaying the log.
|
|
377
|
+
# Empty every derived table in one batch. observed_iris, meta and
# activated_recognizers are not touched.
def clear_materialized_views
  derived_tables = %w[
    host_counts
    path_length_counts
    raw_shape_counts
    fingerprint_counts
    position_stats
    position_values
    position_types
    clusters
    cluster_examples
    cluster_segments
    cluster_params
    cluster_param_values
    cluster_param_types
  ]
  statements = derived_tables.map { |table| "DELETE FROM #{table};\n" }.join
  @db.execute_batch(statements)
end
|
|
394
|
+
|
|
395
|
+
# --- Reads ------------------------------------------------------------
|
|
396
|
+
|
|
397
|
+
# Contents of the host_counts table as a host => count Hash.
def host_counts
  table = "host_counts"
  rows_to_count_hash(table, "host")
end
|
|
400
|
+
|
|
401
|
+
# Contents of the path_length_counts table as a length => count Hash
# whose default value for missing lengths is 0.
def path_length_counts
  counts = Hash.new(0)
  @db.execute("SELECT length, count FROM path_length_counts") do |row|
    counts[row[0]] = row[1]
  end
  counts
end
|
|
406
|
+
|
|
407
|
+
# Contents of the raw_shape_counts table as a shape => count Hash.
def raw_shape_counts
  table = "raw_shape_counts"
  rows_to_count_hash(table, "shape")
end
|
|
410
|
+
|
|
411
|
+
# Contents of the fingerprint_counts table as a shape => count Hash.
def fingerprint_counts
  table = "fingerprint_counts"
  rows_to_count_hash(table, "shape")
end
|
|
414
|
+
|
|
415
|
+
# Rebuild a PositionStats for `position` from the three position_*
# tables, or return nil when the position has no position_stats row.
#
# The object is hydrated by setting @total, @value_counts and
# @type_counts directly. Type names come back from storage as strings
# and are converted to symbols. A nil host is looked up as "".
def position_stats(position)
  where = [position.host || "", position.scope.to_s, position.locator]
  observed = @db.get_first_value(
    "SELECT total FROM position_stats WHERE host = ? AND scope = ? AND locator = ?",
    where,
  )
  return nil if observed.nil?

  result = PositionStats.new(max_values: @max_values_per_position)
  result.instance_variable_set(:@total, observed)

  by_value = Hash.new(0)
  @db.execute(
    "SELECT value, count FROM position_values WHERE host = ? AND scope = ? AND locator = ?",
    where,
  ) do |row|
    by_value[row[0]] = row[1]
  end
  result.instance_variable_set(:@value_counts, by_value)

  by_type = Hash.new(0)
  @db.execute(
    "SELECT type, count FROM position_types WHERE host = ? AND scope = ? AND locator = ?",
    where,
  ) do |row|
    by_type[row[0].to_sym] = row[1]
  end
  result.instance_variable_set(:@type_counts, by_type)

  result
end
|
|
444
|
+
|
|
445
|
+
# Yield (Position, PositionStats) for every row of position_stats, in
# rowid order.
#
# The keys are read into memory before any per-position lookup runs.
# Returns an Enumerator when called without a block, instead of raising
# LocalJumpError on the first row.
def each_position_stats
  return enum_for(:each_position_stats) unless block_given?

  keys = []
  @db.execute("SELECT DISTINCT host, scope, locator FROM position_stats ORDER BY ROWID") do |row|
    keys << row
  end
  keys.each do |host, scope, locator|
    pos = Position.new(host: host, scope: scope.to_sym, locator: locator)
    yield pos, position_stats(pos)
  end
end
|
|
455
|
+
|
|
456
|
+
# Every stored cluster, fully loaded, ordered by `ord`.
def clusters
  [].tap do |loaded|
    @db.execute("SELECT key FROM clusters ORDER BY ord") do |row|
      loaded.push(load_cluster(row[0]))
    end
  end
end
|
|
463
|
+
|
|
464
|
+
# Number of rows in the clusters table.
def cluster_size
  count_sql = "SELECT COUNT(*) FROM clusters"
  @db.get_first_value(count_sql)
end
|
|
467
|
+
|
|
468
|
+
# Public lookup of a single cluster by key; returns whatever
# `load_cluster` returns for it.
def cluster_for(key)
  found = load_cluster(key)
  found
end
|
|
471
|
+
|
|
472
|
+
private
|
|
473
|
+
|
|
474
|
+
# Add one to the count for `shape` in `table`, creating the row on first
# sight. `table` is interpolated into the SQL text; the callers in this
# class pass fixed table names.
def upsert_shape(table, shape)
  statement = <<~SQL
    INSERT INTO #{table} (shape, count) VALUES (?, 1)
    ON CONFLICT(shape) DO UPDATE SET count = count + 1
  SQL
  @db.execute(statement, shape)
end
|
|
480
|
+
|
|
481
|
+
# Read `key_col` and `count` from every row of `table` into a Hash whose
# default value for missing keys is 0. Both names are interpolated into
# the SQL text.
def rows_to_count_hash(table, key_col)
  counts = Hash.new(0)
  query = "SELECT #{key_col}, count FROM #{table}"
  @db.execute(query) do |row|
    counts[row[0]] = row[1]
  end
  counts
end
|
|
486
|
+
|
|
487
|
+
# Rebuild the Cluster stored under `key`, or return nil when there is no
# such row in `clusters`.
#
# The Cluster is constructed normally and then hydrated by setting
# @count, @examples, @segment_counts and @param_stats directly from the
# cluster_* tables.
def load_cluster(key)
  header = @db.get_first_row(
    "SELECT key, host, scheme, shape, count FROM clusters WHERE key = ?", [key],
  )
  return nil unless header

  cluster = Cluster.new(
    key: header[0], host: header[1], scheme: header[2], shape: header[3],
    max_values: @max_values_per_position,
  )
  cluster.instance_variable_set(:@count, header[4])

  # Examples are stored as canonical strings and re-parsed on load.
  parsed_examples = []
  @db.execute(
    "SELECT canonical FROM cluster_examples WHERE cluster_key = ? ORDER BY position", [key]
  ) do |row|
    parsed_examples.push(Parser.parse(row[0]))
  end
  cluster.instance_variable_set(:@examples, parsed_examples)

  # Array indexed by segment position; each slot is a value => count Hash.
  per_position = []
  @db.execute(
    "SELECT position, value, count FROM cluster_segments WHERE cluster_key = ? ORDER BY position",
    [key],
  ) do |row|
    index = row[0]
    per_position[index] ||= Hash.new(0)
    per_position[index][row[1]] = row[2]
  end
  cluster.instance_variable_set(:@segment_counts, per_position)

  # Param stats come from three tables: totals create the entries, then
  # value and type rows are filled into them.
  param_stats = {}
  @db.execute(
    "SELECT name, total FROM cluster_params WHERE cluster_key = ?", [key],
  ) do |row|
    entry = PositionStats.new(max_values: @max_values_per_position)
    entry.instance_variable_set(:@total, row[1])
    param_stats[row[0]] = entry
  end
  @db.execute(
    "SELECT name, value, count FROM cluster_param_values WHERE cluster_key = ?", [key],
  ) do |row|
    entry = param_stats[row[0]]
    # Rows for a name with no cluster_params entry are skipped.
    entry.value_counts[row[1]] = row[2] if entry
  end
  @db.execute(
    "SELECT name, type, count FROM cluster_param_types WHERE cluster_key = ?", [key],
  ) do |row|
    entry = param_stats[row[0]]
    entry.type_counts[row[1].to_sym] = row[2] if entry
  end
  cluster.instance_variable_set(:@param_stats, param_stats)

  cluster
end
|
|
544
|
+
end
|
|
545
|
+
end
|
|
546
|
+
end
|
data/lib/iriq/storage.rb
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
module Iriq
|
|
2
|
+
# Storage is the persistence layer for a Corpus. It owns every counter and
|
|
3
|
+
# per-(host, prefix) frequency map; the Corpus class delegates state to it.
|
|
4
|
+
#
|
|
5
|
+
# Three concrete backends ship:
|
|
6
|
+
#
|
|
7
|
+
# Storage::Memory — in-memory only; matches the original behavior.
|
|
8
|
+
# Storage::Json — Memory backend wrapped with load/save against a JSON file.
|
|
9
|
+
# Storage::Sqlite — incremental UPSERTs against a SQLite database.
|
|
10
|
+
#
|
|
11
|
+
# File-extension dispatch keeps callers simple: `.json` (or anything else)
|
|
12
|
+
# picks Json, `.db`/`.sqlite`/`.sqlite3` picks Sqlite.
|
|
13
|
+
module Storage
  SQLITE_EXTS = %w[.db .sqlite .sqlite3].freeze

  module_function

  # Open (or create) the storage for `path`, choosing the backend from
  # the file extension (case-insensitive):
  #
  #   nil path                  -> Memory
  #   .db / .sqlite / .sqlite3  -> Sqlite
  #   anything else             -> Json
  #
  # The Sqlite and Json backends are required lazily, only when chosen.
  def open(path, classifier: SegmentClassifier::DEFAULT,
           max_values_per_position: PositionStats::DEFAULT_MAX_VALUES)
    options = { classifier: classifier, max_values_per_position: max_values_per_position }
    return Memory.new(**options) if path.nil?

    extension = File.extname(path).downcase
    unless SQLITE_EXTS.include?(extension)
      require "iriq/storage/json"
      return Json.open(path, **options)
    end

    require "iriq/storage/sqlite"
    Sqlite.open(path, **options)
  end
end
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
require "iriq/storage/memory"
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
module Iriq
|
|
2
|
+
# Recognizer built dynamically from a learned-prefix pattern.
|
|
3
|
+
#
|
|
4
|
+
# Used by Corpus#activate_proposal to promote a RecognizerProposal
|
|
5
|
+
# into a live Recognizer that the classifier ensemble consults. Same
|
|
6
|
+
# shape as the built-in Recognizers — uuid, date, integer — but the
|
|
7
|
+
# pattern + type are supplied at construction instead of compiled-in.
|
|
8
|
+
#
|
|
9
|
+
# r = SynthesizedRecognizer.new(prefix: "ghp_", type: :ghp)
|
|
10
|
+
# r.try("ghp_abcdef123") # → {type: :ghp, confidence: 1.0, specificity: 1.0}
|
|
11
|
+
#
|
|
12
|
+
# Pattern: `<prefix><[A-Za-z0-9]+>` — anchored, alphanumeric suffix
|
|
13
|
+
# only. Matches the same shape PrefixUnderscoreId proposes from, so
|
|
14
|
+
# round-trip (propose → activate → reinfer) reclassifies the same
|
|
15
|
+
# values the proposal was derived from.
|
|
16
|
+
#
|
|
17
|
+
# Specificity defaults to SEMANTIC. A learned prefix is very specific
|
|
18
|
+
# by construction (a distinctive literal prefix that recurred enough
|
|
19
|
+
# to clear the proposal noise floor) — calling it as confident as a
|
|
20
|
+
# built-in UUID is reasonable.
|
|
21
|
+
class SynthesizedRecognizer < Recognizer
  attr_reader :prefix, :type, :specificity

  # Build a recognizer from an object responding to `prefix` and
  # `suggested_type`, using the default specificity.
  def self.from_proposal(proposal)
    new(prefix: proposal.prefix, type: proposal.suggested_type)
  end

  # Inverse of #to_dump: `h` is a Hash with string keys "prefix", "type"
  # (converted to a Symbol) and an optional "specificity".
  def self.from_dump(h)
    new(
      prefix: h["prefix"],
      type: h["type"].to_sym,
      specificity: h.fetch("specificity", Specificity::SEMANTIC),
    )
  end

  # prefix      - non-empty String; matched literally (regexp-escaped).
  # type        - Symbol reported for matching segments.
  # specificity - value reported with each match.
  #
  # Raises ArgumentError when prefix is not a non-empty String or type is
  # not a Symbol.
  def initialize(prefix:, type:, specificity: Specificity::SEMANTIC)
    unless prefix.is_a?(String) && !prefix.empty?
      raise ArgumentError, "prefix must be a non-empty string"
    end
    raise ArgumentError, "type must be a symbol" unless type.is_a?(Symbol)

    # Keep a private frozen copy: @pattern is compiled from the prefix
    # once, so later mutation of the caller's string must not be able to
    # make @prefix and @pattern disagree.
    @prefix = prefix.dup.freeze
    @type = type
    @specificity = specificity
    @pattern = /\A#{Regexp.escape(@prefix)}[A-Za-z0-9]+\z/.freeze
  end

  # Returns {type:, confidence: 1.0, specificity:} when `segment` is the
  # prefix followed by one or more ASCII alphanumerics and nothing else;
  # nil otherwise.
  def try(segment)
    # start_with? is checked before running the regexp.
    return nil unless segment.start_with?(@prefix) && @pattern.match?(segment)

    { type: @type, confidence: 1.0, specificity: @specificity }
  end

  # Hash form with string keys; the type is stored as a String.
  def to_dump
    { "prefix" => @prefix, "type" => @type.to_s, "specificity" => @specificity }
  end
end
|
|
56
|
+
end
|