iriq 0.2.0 → 0.30.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +78 -0
- data/CLAUDE.md +128 -41
- data/Gemfile.lock +4 -4
- data/Makefile +80 -23
- data/README.md +225 -347
- data/completions/_iriq +52 -0
- data/completions/iriq.bash +70 -0
- data/docs/ARCHITECTURE.md +223 -0
- data/docs/ROADMAP.md +190 -0
- data/iriq.gemspec +2 -2
- data/lib/iriq/cli.rb +398 -46
- data/lib/iriq/cluster.rb +284 -12
- data/lib/iriq/corpus.rb +318 -36
- data/lib/iriq/cross_host_shape.rb +37 -0
- data/lib/iriq/event.rb +22 -0
- data/lib/iriq/evidence.rb +114 -0
- data/lib/iriq/explanation.rb +1 -1
- data/lib/iriq/normalizer.rb +71 -29
- data/lib/iriq/path_shape.rb +30 -24
- data/lib/iriq/position.rb +75 -0
- data/lib/iriq/position_stats.rb +74 -8
- data/lib/iriq/recognizer.rb +54 -0
- data/lib/iriq/recognizer_proposal.rb +167 -0
- data/lib/iriq/recognizers/date.rb +53 -0
- data/lib/iriq/recognizers/integer.rb +37 -0
- data/lib/iriq/recognizers/uuid.rb +16 -0
- data/lib/iriq/reducer.rb +37 -0
- data/lib/iriq/registrable_domain.rb +56 -0
- data/lib/iriq/segment_classifier.rb +475 -23
- data/lib/iriq/segment_hints.rb +9 -0
- data/lib/iriq/shape.rb +106 -0
- data/lib/iriq/specificity.rb +35 -0
- data/lib/iriq/storage/memory.rb +83 -12
- data/lib/iriq/storage/sqlite.rb +216 -37
- data/lib/iriq/synthesized_recognizer.rb +56 -0
- data/lib/iriq/trace.rb +294 -0
- data/lib/iriq/version.rb +1 -1
- data/lib/iriq.rb +17 -0
- metadata +22 -3
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
module Iriq
  # Claim strength attached to each Recognizer.
  #
  # When several Recognizers fire on the same segment, the ensemble keeps
  # the candidate with the largest specificity × confidence product, so a
  # higher band wins at equal confidence.
  #
  # The bands are a coarse-grained view of the current type taxonomy. They
  # are deliberately NOT linear "how confident" scores: each one answers
  # "how surprising would it be for this Recognizer to fire by accident on
  # a value of a different actual type?" A UUID's shape is distinctive
  # enough that a non-UUID producing it is vanishingly unlikely (SEMANTIC),
  # whereas a 4-digit integer could just as well be a year, an HTTP status
  # or an ID, so `:integer` only claims TYPED.
  #
  # The calibration corpus tests (spec/iriq/calibration_spec.rb, and
  # calibration_test.go on the Go side) are the source of truth for whether
  # these values are well chosen: change a band, re-run them, and validate.
  module Specificity
    # Unambiguous semantic shapes whose regex effectively cannot match by
    # accident: UUID, JWT, email with @, URL with ://, color hex.
    SEMANTIC = 1.0

    # Restrictive structured patterns that can still collide with broader
    # types at the edges: date, file with known ext, ipv4, mime.
    STRUCTURED = 0.8

    # Digit-shaped values carrying an extra bound (a range or an allowlist)
    # that makes the shape meaningful on its own: timestamp, currency,
    # country, boolean.
    BOUNDED = 0.7

    # Typed but lexically broad: integer, float, version.
    TYPED = 0.5

    # Generic pattern-based shape: slug.
    PATTERN = 0.3

    # Generic fallback shapes: literal, opaque_id.
    FALLBACK = 0.1
  end
end
|
data/lib/iriq/storage/memory.rb
CHANGED
|
@@ -9,11 +9,15 @@ module Iriq
|
|
|
9
9
|
# increment_path_length(length)
|
|
10
10
|
# increment_raw_shape(shape)
|
|
11
11
|
# increment_fingerprint(shape)
|
|
12
|
-
# observe_position(
|
|
12
|
+
# observe_position(position, value, type) # position is Iriq::Position
|
|
13
13
|
# add_to_cluster(key, host, scheme, shape, identifier)
|
|
14
|
+
# record_observation(canonical) # append to source-IRI log
|
|
14
15
|
#
|
|
15
16
|
# host_counts / path_length_counts / raw_shape_counts / fingerprint_counts
|
|
16
|
-
# position_stats(
|
|
17
|
+
# position_stats(position)
|
|
18
|
+
# each_position_stats { |position, stats| ... }
|
|
19
|
+
# each_observed_iri { |canonical| ... }
|
|
20
|
+
# clear_materialized_views # for reinfer
|
|
17
21
|
# clusters / cluster_size
|
|
18
22
|
#
|
|
19
23
|
# transaction { ... } # backends may batch within
|
|
@@ -36,6 +40,15 @@ module Iriq
|
|
|
36
40
|
@fingerprint_counts = Hash.new(0)
|
|
37
41
|
@position_stats = {}
|
|
38
42
|
@clusters = {}
|
|
43
|
+
# The source-IRI log. Persisted alongside materialized views; the
|
|
44
|
+
# log is the source of truth, the views are derived. Corpus#reinfer
|
|
45
|
+
# drops the views and replays the log through events + reducers.
|
|
46
|
+
@observed_iris = []
|
|
47
|
+
# Recognizers promoted from RecognizerProposal via
|
|
48
|
+
# Corpus#activate_proposal. Stored as {prefix, type, specificity}
|
|
49
|
+
# hashes so reopens can re-synthesize them onto the corpus's
|
|
50
|
+
# classifier.
|
|
51
|
+
@activated_recognizers = []
|
|
39
52
|
end
|
|
40
53
|
|
|
41
54
|
def transaction
|
|
@@ -70,17 +83,61 @@ module Iriq
|
|
|
70
83
|
@fingerprint_counts[shape] += 1
|
|
71
84
|
end
|
|
72
85
|
|
|
73
|
-
def observe_position(
|
|
74
|
-
stats = @position_stats[
|
|
86
|
+
# Record one (value, type) observation at +position+.
#
# A PositionStats bucket is created for the position on first sight, sized
# by @max_values_per_position; afterwards the same bucket is reused. Returns
# whatever the bucket's #observe returns.
def observe_position(position, value, type)
  @position_stats[position] ||= PositionStats.new(max_values: @max_values_per_position)
  @position_stats[position].observe(value, type)
end
|
|
77
90
|
|
|
78
91
|
# Add +identifier+ to the cluster stored under +key+, creating the cluster
# from (host, scheme, shape) the first time the key is seen. New clusters
# inherit this store's @max_values_per_position cap, and the store's
# @classifier is handed to Cluster#add. Returns the cluster.
def add_to_cluster(key, host, scheme, shape, identifier)
  @clusters[key] ||= Cluster.new(
    key: key,
    host: host,
    scheme: scheme,
    shape: shape,
    max_values: @max_values_per_position,
  )
  @clusters[key].tap { |target| target.add(identifier, classifier: @classifier) }
end
|
|
83
99
|
|
|
100
|
+
# Push one canonical IRI onto the end of the source-IRI log.
#
# Corpus#observe calls this once the event reducers have applied. The log
# is the source of truth; Corpus#reinfer replays it to rebuild the views.
def record_observation(canonical)
  @observed_iris.push(canonical)
end
|
|
106
|
+
|
|
107
|
+
# Iterate the source-IRI log in insertion order, passing each canonical IRI
# to the block. Without a block this is Array#each's block-less form, i.e.
# an Enumerator over the log.
def each_observed_iri(&block)
  log = @observed_iris
  log.each(&block)
end
|
|
110
|
+
|
|
111
|
+
# Number of entries currently held in the source-IRI log.
def observed_iri_count
  @observed_iris.length
end
|
|
114
|
+
|
|
115
|
+
# --- Activated recognizers (Corpus#activate_proposal) -----------------
|
|
116
|
+
|
|
117
|
+
# Remember an activated recognizer, given as its dump hash (string keys
# "prefix", "type", "specificity").
#
# Entries are keyed by "prefix", mirroring the SQLite backend where prefix
# is the primary key and a conflicting insert updates the row: recording a
# prefix that is already present replaces that entry in place (keeping its
# position) instead of appending a duplicate that would be re-synthesized
# twice on reopen. Returns the list of recorded dumps.
def record_activated_recognizer(dump)
  slot = @activated_recognizers.index { |known| known["prefix"] == dump["prefix"] }
  if slot
    @activated_recognizers[slot] = dump
  else
    @activated_recognizers << dump
  end
  @activated_recognizers
end
|
|
120
|
+
|
|
121
|
+
# Iterate the recorded recognizer dumps in the order they are stored,
# passing each dump hash to the block. Without a block this is Array#each's
# block-less form, i.e. an Enumerator.
def each_activated_recognizer(&block)
  dumps = @activated_recognizers
  dumps.each(&block)
end
|
|
124
|
+
|
|
125
|
+
# Number of recognizer dumps currently recorded.
def activated_recognizer_count
  @activated_recognizers.length
end
|
|
128
|
+
|
|
129
|
+
# Reset every materialized view to its empty state: the four counter
# hashes (each a fresh Hash defaulting to 0), the per-position stats and
# the clusters. The source-IRI log is left alone, so Corpus#reinfer can
# call this and then replay the log to rebuild the views from scratch.
def clear_materialized_views
  @host_counts, @path_length_counts, @raw_shape_counts, @fingerprint_counts =
    Array.new(4) { Hash.new(0) }
  @position_stats = {}
  @clusters = {}
end
|
|
140
|
+
|
|
84
141
|
# --- Reads ------------------------------------------------------------
|
|
85
142
|
|
|
86
143
|
def host_counts; @host_counts; end
|
|
@@ -88,8 +145,8 @@ module Iriq
|
|
|
88
145
|
def raw_shape_counts; @raw_shape_counts; end
|
|
89
146
|
def fingerprint_counts; @fingerprint_counts; end
|
|
90
147
|
|
|
91
|
-
def position_stats(
|
|
92
|
-
@position_stats[
|
|
148
|
+
# Look up the stats bucket recorded for +position+; nil when that position
# has no entry.
def position_stats(position)
  buckets = @position_stats
  buckets[position]
end
|
|
94
151
|
|
|
95
152
|
def each_position_stats(&block)
|
|
@@ -104,6 +161,13 @@ module Iriq
|
|
|
104
161
|
@clusters.size
|
|
105
162
|
end
|
|
106
163
|
|
|
164
|
+
# Hash lookup of a cluster by its key. Corpus#normalize uses it to fetch the
# cluster's param_stats for the URL being normalized. Returns nil when no
# cluster has been stored under +key+ yet.
def cluster_for(key)
  known = @clusters
  known[key]
end
|
|
170
|
+
|
|
107
171
|
# --- Bulk load (used by JSON backend) --------------------------------
|
|
108
172
|
|
|
109
173
|
def load_dump!(h)
|
|
@@ -112,11 +176,14 @@ module Iriq
|
|
|
112
176
|
@raw_shape_counts = Hash.new(0).merge(h["raw_shape_counts"])
|
|
113
177
|
@fingerprint_counts = Hash.new(0).merge(h["fingerprint_counts"])
|
|
114
178
|
@max_values_per_position = h.fetch("max_values_per_position", PositionStats::DEFAULT_MAX_VALUES)
|
|
115
|
-
@position_stats = h["position_stats"].each_with_object({}) do |
|
|
116
|
-
|
|
179
|
+
@position_stats = h["position_stats"].each_with_object({}) do |entry, acc|
|
|
180
|
+
position = Position.from_dump(entry["position"])
|
|
181
|
+
acc[position] = PositionStats.from_dump(entry["stats"])
|
|
117
182
|
end
|
|
118
183
|
cdump = h.fetch("clusterer", { "clusters" => {} })
|
|
119
|
-
@clusters = cdump["clusters"].transform_values { |c| Cluster.from_dump(c) }
|
|
184
|
+
@clusters = cdump["clusters"].transform_values { |c| Cluster.from_dump(c, max_values: @max_values_per_position) }
|
|
185
|
+
@observed_iris = h.fetch("observed_iris", [])
|
|
186
|
+
@activated_recognizers = h.fetch("activated_recognizers", [])
|
|
120
187
|
self
|
|
121
188
|
end
|
|
122
189
|
|
|
@@ -127,10 +194,14 @@ module Iriq
|
|
|
127
194
|
"raw_shape_counts" => @raw_shape_counts,
|
|
128
195
|
"fingerprint_counts" => @fingerprint_counts,
|
|
129
196
|
"max_values_per_position" => @max_values_per_position,
|
|
130
|
-
"position_stats" => @position_stats.map { |
|
|
197
|
+
"position_stats" => @position_stats.map { |pos, s|
|
|
198
|
+
{ "position" => pos.to_dump, "stats" => s.dump }
|
|
199
|
+
},
|
|
131
200
|
"clusterer" => {
|
|
132
201
|
"clusters" => @clusters.transform_values(&:dump),
|
|
133
202
|
},
|
|
203
|
+
"observed_iris" => @observed_iris,
|
|
204
|
+
"activated_recognizers" => @activated_recognizers,
|
|
134
205
|
}
|
|
135
206
|
end
|
|
136
207
|
end
|
data/lib/iriq/storage/sqlite.rb
CHANGED
|
@@ -11,7 +11,7 @@ module Iriq
|
|
|
11
11
|
# the existing `iriq --corpus c.db <url>` pattern works without a flock
|
|
12
12
|
# at the application layer.
|
|
13
13
|
class Sqlite
|
|
14
|
-
SCHEMA_VERSION =
|
|
14
|
+
SCHEMA_VERSION = 4
|
|
15
15
|
|
|
16
16
|
SCHEMA = <<~SQL.freeze
|
|
17
17
|
CREATE TABLE IF NOT EXISTS meta (
|
|
@@ -34,25 +34,33 @@ module Iriq
|
|
|
34
34
|
shape TEXT PRIMARY KEY,
|
|
35
35
|
count INTEGER NOT NULL
|
|
36
36
|
);
|
|
37
|
+
-- Position is (host, scope, locator). For scope='path' the locator
|
|
38
|
+
-- is the typed prefix; for scope='query' it's the param name.
|
|
39
|
+
-- Today only 'path' is observed here (query params live on the
|
|
40
|
+
-- cluster_* tables) — scope is in the schema so future commits
|
|
41
|
+
-- can fold query positions in without another migration.
|
|
37
42
|
CREATE TABLE IF NOT EXISTS position_stats (
|
|
38
|
-
host
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
43
|
+
host TEXT NOT NULL,
|
|
44
|
+
scope TEXT NOT NULL,
|
|
45
|
+
locator TEXT NOT NULL,
|
|
46
|
+
total INTEGER NOT NULL DEFAULT 0,
|
|
47
|
+
PRIMARY KEY (host, scope, locator)
|
|
42
48
|
);
|
|
43
49
|
CREATE TABLE IF NOT EXISTS position_values (
|
|
44
|
-
host
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
50
|
+
host TEXT NOT NULL,
|
|
51
|
+
scope TEXT NOT NULL,
|
|
52
|
+
locator TEXT NOT NULL,
|
|
53
|
+
value TEXT NOT NULL,
|
|
54
|
+
count INTEGER NOT NULL,
|
|
55
|
+
PRIMARY KEY (host, scope, locator, value)
|
|
49
56
|
);
|
|
50
57
|
CREATE TABLE IF NOT EXISTS position_types (
|
|
51
|
-
host
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
58
|
+
host TEXT NOT NULL,
|
|
59
|
+
scope TEXT NOT NULL,
|
|
60
|
+
locator TEXT NOT NULL,
|
|
61
|
+
type TEXT NOT NULL,
|
|
62
|
+
count INTEGER NOT NULL,
|
|
63
|
+
PRIMARY KEY (host, scope, locator, type)
|
|
56
64
|
);
|
|
57
65
|
CREATE TABLE IF NOT EXISTS clusters (
|
|
58
66
|
key TEXT PRIMARY KEY,
|
|
@@ -75,6 +83,44 @@ module Iriq
|
|
|
75
83
|
count INTEGER NOT NULL,
|
|
76
84
|
PRIMARY KEY (cluster_key, position, value)
|
|
77
85
|
);
|
|
86
|
+
CREATE TABLE IF NOT EXISTS cluster_params (
|
|
87
|
+
cluster_key TEXT NOT NULL,
|
|
88
|
+
name TEXT NOT NULL,
|
|
89
|
+
total INTEGER NOT NULL DEFAULT 0,
|
|
90
|
+
PRIMARY KEY (cluster_key, name)
|
|
91
|
+
);
|
|
92
|
+
CREATE TABLE IF NOT EXISTS cluster_param_values (
|
|
93
|
+
cluster_key TEXT NOT NULL,
|
|
94
|
+
name TEXT NOT NULL,
|
|
95
|
+
value TEXT NOT NULL,
|
|
96
|
+
count INTEGER NOT NULL,
|
|
97
|
+
PRIMARY KEY (cluster_key, name, value)
|
|
98
|
+
);
|
|
99
|
+
CREATE TABLE IF NOT EXISTS cluster_param_types (
|
|
100
|
+
cluster_key TEXT NOT NULL,
|
|
101
|
+
name TEXT NOT NULL,
|
|
102
|
+
type TEXT NOT NULL,
|
|
103
|
+
count INTEGER NOT NULL,
|
|
104
|
+
PRIMARY KEY (cluster_key, name, type)
|
|
105
|
+
);
|
|
106
|
+
-- Source-IRI log. The materialized views above are derived from
|
|
107
|
+
-- this log via events + reducers. Corpus#reinfer drops the views
|
|
108
|
+
-- and replays the log to rebuild them. id is monotonic so
|
|
109
|
+
-- iteration order is observation order.
|
|
110
|
+
CREATE TABLE IF NOT EXISTS observed_iris (
|
|
111
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
112
|
+
canonical TEXT NOT NULL
|
|
113
|
+
);
|
|
114
|
+
-- Recognizers promoted from RecognizerProposal via
|
|
115
|
+
-- Corpus#activate_proposal. Re-applied to the corpus's
|
|
116
|
+
-- classifier on Corpus.open so a reopen picks up its learned
|
|
117
|
+
-- patterns. Keyed by prefix; activating the same prefix twice
|
|
118
|
+
-- is a no-op.
|
|
119
|
+
CREATE TABLE IF NOT EXISTS activated_recognizers (
|
|
120
|
+
prefix TEXT PRIMARY KEY,
|
|
121
|
+
type TEXT NOT NULL,
|
|
122
|
+
specificity REAL NOT NULL DEFAULT 1.0
|
|
123
|
+
);
|
|
78
124
|
SQL
|
|
79
125
|
|
|
80
126
|
attr_reader :path, :max_values_per_position
|
|
@@ -188,36 +234,38 @@ module Iriq
|
|
|
188
234
|
upsert_shape("fingerprint_counts", shape)
|
|
189
235
|
end
|
|
190
236
|
|
|
191
|
-
def observe_position(
|
|
192
|
-
host
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
237
|
+
def observe_position(position, value, type)
|
|
238
|
+
host = position.host || ""
|
|
239
|
+
scope = position.scope.to_s
|
|
240
|
+
locator = position.locator
|
|
241
|
+
@db.execute(<<~SQL, [host, scope, locator])
|
|
242
|
+
INSERT INTO position_stats (host, scope, locator, total) VALUES (?, ?, ?, 1)
|
|
243
|
+
ON CONFLICT(host, scope, locator) DO UPDATE SET total = total + 1
|
|
196
244
|
SQL
|
|
197
245
|
|
|
198
246
|
# Type counts are unbounded — always upsert.
|
|
199
|
-
@db.execute(<<~SQL, [host,
|
|
200
|
-
INSERT INTO position_types (host,
|
|
201
|
-
ON CONFLICT(host,
|
|
247
|
+
@db.execute(<<~SQL, [host, scope, locator, type.to_s])
|
|
248
|
+
INSERT INTO position_types (host, scope, locator, type, count) VALUES (?, ?, ?, ?, 1)
|
|
249
|
+
ON CONFLICT(host, scope, locator, type) DO UPDATE SET count = count + 1
|
|
202
250
|
SQL
|
|
203
251
|
|
|
204
252
|
# Value counts are capped at max_values_per_position. If the value
|
|
205
253
|
# already exists, increment it; otherwise insert only when
|
|
206
254
|
# cardinality is below the cap. Two-step rather than ON CONFLICT
|
|
207
255
|
# because we need to enforce the cap on insert.
|
|
208
|
-
@db.execute(<<~SQL, [host,
|
|
256
|
+
@db.execute(<<~SQL, [host, scope, locator, value])
|
|
209
257
|
UPDATE position_values SET count = count + 1
|
|
210
|
-
WHERE host = ? AND
|
|
258
|
+
WHERE host = ? AND scope = ? AND locator = ? AND value = ?
|
|
211
259
|
SQL
|
|
212
260
|
if @db.changes.zero?
|
|
213
261
|
card = @db.get_first_value(
|
|
214
|
-
"SELECT COUNT(*) FROM position_values WHERE host = ? AND
|
|
215
|
-
[host,
|
|
262
|
+
"SELECT COUNT(*) FROM position_values WHERE host = ? AND scope = ? AND locator = ?",
|
|
263
|
+
[host, scope, locator],
|
|
216
264
|
)
|
|
217
265
|
if card < @max_values_per_position
|
|
218
266
|
@db.execute(
|
|
219
|
-
"INSERT INTO position_values (host,
|
|
220
|
-
[host,
|
|
267
|
+
"INSERT INTO position_values (host, scope, locator, value, count) VALUES (?, ?, ?, ?, 1)",
|
|
268
|
+
[host, scope, locator, value],
|
|
221
269
|
)
|
|
222
270
|
end
|
|
223
271
|
end
|
|
@@ -251,9 +299,99 @@ module Iriq
|
|
|
251
299
|
SQL
|
|
252
300
|
end
|
|
253
301
|
|
|
302
|
+
# Per-param stats (presence + value cardinality + type) — mirrors the
|
|
303
|
+
# in-memory Cluster#add path. Value table respects the same per-key
|
|
304
|
+
# cap as position_values.
|
|
305
|
+
(identifier.query_params || {}).each do |name, value|
|
|
306
|
+
v = value.to_s
|
|
307
|
+
type = @classifier.classify(v).to_s
|
|
308
|
+
|
|
309
|
+
@db.execute(<<~SQL, [key, name])
|
|
310
|
+
INSERT INTO cluster_params (cluster_key, name, total) VALUES (?, ?, 1)
|
|
311
|
+
ON CONFLICT(cluster_key, name) DO UPDATE SET total = total + 1
|
|
312
|
+
SQL
|
|
313
|
+
@db.execute(<<~SQL, [key, name, type])
|
|
314
|
+
INSERT INTO cluster_param_types (cluster_key, name, type, count) VALUES (?, ?, ?, 1)
|
|
315
|
+
ON CONFLICT(cluster_key, name, type) DO UPDATE SET count = count + 1
|
|
316
|
+
SQL
|
|
317
|
+
|
|
318
|
+
@db.execute(<<~SQL, [key, name, v])
|
|
319
|
+
UPDATE cluster_param_values SET count = count + 1
|
|
320
|
+
WHERE cluster_key = ? AND name = ? AND value = ?
|
|
321
|
+
SQL
|
|
322
|
+
if @db.changes.zero?
|
|
323
|
+
card = @db.get_first_value(
|
|
324
|
+
"SELECT COUNT(*) FROM cluster_param_values WHERE cluster_key = ? AND name = ?",
|
|
325
|
+
[key, name],
|
|
326
|
+
)
|
|
327
|
+
if card < @max_values_per_position
|
|
328
|
+
@db.execute(
|
|
329
|
+
"INSERT INTO cluster_param_values (cluster_key, name, value, count) VALUES (?, ?, ?, 1)",
|
|
330
|
+
[key, name, v],
|
|
331
|
+
)
|
|
332
|
+
end
|
|
333
|
+
end
|
|
334
|
+
end
|
|
335
|
+
|
|
254
336
|
load_cluster(key)
|
|
255
337
|
end
|
|
256
338
|
|
|
339
|
+
# Insert one canonical IRI as a new row of the observed_iris table (the
# source-IRI log). Meant to run inside the same transaction as the event
# reducers so the log and the views stay consistent.
def record_observation(canonical)
  insert_sql = "INSERT INTO observed_iris (canonical) VALUES (?)"
  @db.execute(insert_sql, [canonical])
end
|
|
345
|
+
|
|
346
|
+
# Yield every canonical IRI from the observed_iris table, ordered by id
# (i.e. in observation order).
#
# Called without a block it returns an Enumerator over the same sequence,
# like the in-memory backend's Array#each-based version, rather than
# raising LocalJumpError from the bare `yield` on the first row.
def each_observed_iri
  return enum_for(:each_observed_iri) unless block_given?

  @db.execute("SELECT canonical FROM observed_iris ORDER BY id") do |row|
    yield row[0]
  end
end
|
|
351
|
+
|
|
352
|
+
# Row count of the observed_iris table; 0 when the query yields no value.
def observed_iri_count
  counted = @db.get_first_value("SELECT COUNT(*) FROM observed_iris")
  counted || 0
end
|
|
355
|
+
|
|
356
|
+
# --- Activated recognizers --------------------------------------------
|
|
357
|
+
|
|
358
|
+
# Upsert an activated recognizer from its dump hash (string keys "prefix",
# "type" and optional "specificity", defaulting to 1.0). The row is keyed
# by prefix: a conflicting prefix has its type and specificity overwritten.
def record_activated_recognizer(dump)
  binds = [dump["prefix"], dump["type"], dump.fetch("specificity", 1.0)]
  upsert_sql = <<~SQL
    INSERT INTO activated_recognizers (prefix, type, specificity) VALUES (?, ?, ?)
    ON CONFLICT(prefix) DO UPDATE SET type = excluded.type, specificity = excluded.specificity
  SQL
  @db.execute(upsert_sql, binds)
end
|
|
364
|
+
|
|
365
|
+
# Yield every stored recognizer as a dump hash with string keys "prefix",
# "type" and "specificity", ordered by prefix.
#
# Called without a block it returns an Enumerator over the same hashes,
# like the in-memory backend's Array#each-based version, rather than
# raising LocalJumpError from the bare `yield` on the first row.
def each_activated_recognizer
  return enum_for(:each_activated_recognizer) unless block_given?

  @db.execute("SELECT prefix, type, specificity FROM activated_recognizers ORDER BY prefix") do |row|
    yield({ "prefix" => row[0], "type" => row[1], "specificity" => row[2] })
  end
end
|
|
370
|
+
|
|
371
|
+
# Row count of the activated_recognizers table; 0 when the query yields no
# value.
def activated_recognizer_count
  counted = @db.get_first_value("SELECT COUNT(*) FROM activated_recognizers")
  counted || 0
end
|
|
374
|
+
|
|
375
|
+
# Empty every materialized-view table in one batch, leaving the
# observed_iris log (and activated_recognizers) untouched. Corpus#reinfer
# calls this before replaying the log.
def clear_materialized_views
  view_tables = %w[
    host_counts path_length_counts raw_shape_counts fingerprint_counts
    position_stats position_values position_types
    clusters cluster_examples cluster_segments
    cluster_params cluster_param_values cluster_param_types
  ]
  # One "DELETE FROM <table>;" statement per line, in the order listed.
  @db.execute_batch(view_tables.map { |table| "DELETE FROM #{table};\n" }.join)
end
|
|
394
|
+
|
|
257
395
|
# --- Reads ------------------------------------------------------------
|
|
258
396
|
|
|
259
397
|
def host_counts
|
|
@@ -274,10 +412,13 @@ module Iriq
|
|
|
274
412
|
rows_to_count_hash("fingerprint_counts", "shape")
|
|
275
413
|
end
|
|
276
414
|
|
|
277
|
-
def position_stats(
|
|
278
|
-
host
|
|
415
|
+
def position_stats(position)
|
|
416
|
+
host = position.host || ""
|
|
417
|
+
scope = position.scope.to_s
|
|
418
|
+
locator = position.locator
|
|
279
419
|
total = @db.get_first_value(
|
|
280
|
-
"SELECT total FROM position_stats WHERE host = ? AND
|
|
420
|
+
"SELECT total FROM position_stats WHERE host = ? AND scope = ? AND locator = ?",
|
|
421
|
+
[host, scope, locator],
|
|
281
422
|
)
|
|
282
423
|
return nil if total.nil?
|
|
283
424
|
|
|
@@ -286,13 +427,15 @@ module Iriq
|
|
|
286
427
|
|
|
287
428
|
vc = Hash.new(0)
|
|
288
429
|
@db.execute(
|
|
289
|
-
"SELECT value, count FROM position_values WHERE host = ? AND
|
|
430
|
+
"SELECT value, count FROM position_values WHERE host = ? AND scope = ? AND locator = ?",
|
|
431
|
+
[host, scope, locator],
|
|
290
432
|
) { |r| vc[r[0]] = r[1] }
|
|
291
433
|
stats.instance_variable_set(:@value_counts, vc)
|
|
292
434
|
|
|
293
435
|
tc = Hash.new(0)
|
|
294
436
|
@db.execute(
|
|
295
|
-
"SELECT type, count FROM position_types WHERE host = ? AND
|
|
437
|
+
"SELECT type, count FROM position_types WHERE host = ? AND scope = ? AND locator = ?",
|
|
438
|
+
[host, scope, locator],
|
|
296
439
|
) { |r| tc[r[0].to_sym] = r[1] }
|
|
297
440
|
stats.instance_variable_set(:@type_counts, tc)
|
|
298
441
|
|
|
@@ -301,10 +444,13 @@ module Iriq
|
|
|
301
444
|
|
|
302
445
|
# Yield (Position, PositionStats) for every row of the position_stats
# table, in ROWID order. The stored scope string is converted back to a
# Symbol when the Position is rebuilt.
#
# Called without a block it returns an Enumerator instead of raising
# LocalJumpError from the bare `yield`.
def each_position_stats
  return enum_for(:each_position_stats) unless block_given?

  # Fetch all keys up front (block-less execute returns the rows as an
  # Array); presumably this keeps the per-position queries issued by
  # position_stats from running while this SELECT is still being stepped.
  keys = @db.execute("SELECT DISTINCT host, scope, locator FROM position_stats ORDER BY ROWID")
  keys.each do |host, scope, locator|
    pos = Position.new(host: host, scope: scope.to_sym, locator: locator)
    yield pos, position_stats(pos)
  end
end
|
|
309
455
|
|
|
310
456
|
def clusters
|
|
@@ -319,6 +465,10 @@ module Iriq
|
|
|
319
465
|
@db.get_first_value("SELECT COUNT(*) FROM clusters")
|
|
320
466
|
end
|
|
321
467
|
|
|
468
|
+
# Public lookup of a single cluster by key; delegates to the private
# load_cluster helper and returns whatever it returns for +key+.
def cluster_for(key)
  load_cluster(key)
end
|
|
471
|
+
|
|
322
472
|
private
|
|
323
473
|
|
|
324
474
|
def upsert_shape(table, shape)
|
|
@@ -340,7 +490,10 @@ module Iriq
|
|
|
340
490
|
)
|
|
341
491
|
return nil unless row
|
|
342
492
|
|
|
343
|
-
c = Cluster.new(
|
|
493
|
+
c = Cluster.new(
|
|
494
|
+
key: row[0], host: row[1], scheme: row[2], shape: row[3],
|
|
495
|
+
max_values: @max_values_per_position,
|
|
496
|
+
)
|
|
344
497
|
c.instance_variable_set(:@count, row[4])
|
|
345
498
|
|
|
346
499
|
examples = []
|
|
@@ -360,6 +513,32 @@ module Iriq
|
|
|
360
513
|
end
|
|
361
514
|
c.instance_variable_set(:@segment_counts, seg_counts)
|
|
362
515
|
|
|
516
|
+
# Rebuild @param_stats from the three cluster_param_* tables.
|
|
517
|
+
params = {}
|
|
518
|
+
@db.execute(
|
|
519
|
+
"SELECT name, total FROM cluster_params WHERE cluster_key = ?", [key],
|
|
520
|
+
) do |r|
|
|
521
|
+
# PositionStats.new already initializes empty Hash.new(0) for value
|
|
522
|
+
# and type counts; only @total needs filling here. The followup
|
|
523
|
+
# SELECTs below populate value/type rows in place.
|
|
524
|
+
stats = PositionStats.new(max_values: @max_values_per_position)
|
|
525
|
+
stats.instance_variable_set(:@total, r[1])
|
|
526
|
+
params[r[0]] = stats
|
|
527
|
+
end
|
|
528
|
+
@db.execute(
|
|
529
|
+
"SELECT name, value, count FROM cluster_param_values WHERE cluster_key = ?", [key],
|
|
530
|
+
) do |r|
|
|
531
|
+
stats = params[r[0]] or next
|
|
532
|
+
stats.value_counts[r[1]] = r[2]
|
|
533
|
+
end
|
|
534
|
+
@db.execute(
|
|
535
|
+
"SELECT name, type, count FROM cluster_param_types WHERE cluster_key = ?", [key],
|
|
536
|
+
) do |r|
|
|
537
|
+
stats = params[r[0]] or next
|
|
538
|
+
stats.type_counts[r[1].to_sym] = r[2]
|
|
539
|
+
end
|
|
540
|
+
c.instance_variable_set(:@param_stats, params)
|
|
541
|
+
|
|
363
542
|
c
|
|
364
543
|
end
|
|
365
544
|
end
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
module Iriq
  # Recognizer synthesized at runtime from a learned literal prefix.
  #
  # Corpus#activate_proposal uses this to promote a RecognizerProposal into
  # a live Recognizer that the classifier ensemble consults. It has the
  # same shape as the built-in Recognizers (uuid, date, integer), but the
  # pattern and type are supplied at construction instead of compiled in.
  #
  #   r = SynthesizedRecognizer.new(prefix: "ghp_", type: :ghp)
  #   r.try("ghp_abcdef123") # → {type: :ghp, confidence: 1.0, specificity: 1.0}
  #   r.try("ghp_")          # → nil (the suffix must be non-empty)
  #
  # Pattern: the literal prefix followed by one or more `[A-Za-z0-9]`
  # characters, anchored at both ends. This is the same shape
  # PrefixUnderscoreId proposes from, so a round trip (propose → activate →
  # reinfer) reclassifies the values the proposal was derived from.
  #
  # Specificity defaults to SEMANTIC. A learned prefix is very specific by
  # construction (a distinctive literal prefix that recurred often enough
  # to clear the proposal noise floor), so treating it as confidently as a
  # built-in UUID is reasonable.
  class SynthesizedRecognizer < Recognizer
    attr_reader :prefix, :type, :specificity

    # Build a recognizer from a proposal's +prefix+ and +suggested_type+,
    # with the default specificity.
    def self.from_proposal(proposal)
      new(prefix: proposal.prefix, type: proposal.suggested_type)
    end

    # @param prefix [String] non-empty literal prefix; matched verbatim
    #   (regex metacharacters are escaped)
    # @param type [Symbol] type reported for matching segments
    # @param specificity [Numeric] claim strength reported with each match
    # @raise [ArgumentError] if prefix is not a non-empty String or type is
    #   not a Symbol
    def initialize(prefix:, type:, specificity: Specificity::SEMANTIC)
      # Check the class as well as emptiness: the message promises a string,
      # and a non-String prefix would otherwise fail with NoMethodError here
      # or with TypeError later inside #try.
      unless prefix.is_a?(String) && !prefix.empty?
        raise ArgumentError, "prefix must be a non-empty string"
      end
      raise ArgumentError, "type must be a symbol" unless type.is_a?(Symbol)

      # Keep a private frozen copy so later mutation of the caller's string
      # cannot make @prefix disagree with the compiled @pattern.
      @prefix = prefix.dup.freeze
      @type = type
      @specificity = specificity
      @pattern = /\A#{Regexp.escape(@prefix)}[A-Za-z0-9]+\z/.freeze
    end

    # Try to claim +segment+.
    #
    # @return [Hash, nil] {type:, confidence: 1.0, specificity:} when the
    #   segment is the prefix followed by an alphanumeric suffix, else nil
    def try(segment)
      # start_with? is a cheap rejection before the regex runs.
      return nil unless segment.start_with?(@prefix) && @pattern.match?(segment)

      { type: @type, confidence: 1.0, specificity: @specificity }
    end

    # Serialize to a string-keyed hash; the type is stored as a String.
    def to_dump
      { "prefix" => @prefix, "type" => @type.to_s, "specificity" => @specificity }
    end

    # Rebuild a recognizer from a #to_dump hash. A missing "specificity"
    # falls back to Specificity::SEMANTIC.
    def self.from_dump(h)
      new(
        prefix: h["prefix"],
        type: h["type"].to_sym,
        specificity: h.fetch("specificity", Specificity::SEMANTIC),
      )
    end
  end
end
|