iriq 0.1.0 → 0.30.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +87 -0
- data/CLAUDE.md +208 -0
- data/Gemfile.lock +8 -2
- data/Makefile +113 -0
- data/README.md +249 -270
- data/completions/_iriq +52 -0
- data/completions/iriq.bash +70 -0
- data/docs/ARCHITECTURE.md +223 -0
- data/docs/ROADMAP.md +190 -0
- data/iriq.gemspec +5 -4
- data/lib/iriq/cli.rb +402 -49
- data/lib/iriq/cluster.rb +304 -8
- data/lib/iriq/clusterer.rb +19 -44
- data/lib/iriq/corpus.rb +417 -81
- data/lib/iriq/cross_host_shape.rb +37 -0
- data/lib/iriq/event.rb +22 -0
- data/lib/iriq/evidence.rb +114 -0
- data/lib/iriq/explanation.rb +1 -1
- data/lib/iriq/normalizer.rb +71 -29
- data/lib/iriq/parser.rb +1 -1
- data/lib/iriq/path_shape.rb +30 -24
- data/lib/iriq/position.rb +75 -0
- data/lib/iriq/position_stats.rb +74 -8
- data/lib/iriq/recognizer.rb +54 -0
- data/lib/iriq/recognizer_proposal.rb +167 -0
- data/lib/iriq/recognizers/date.rb +53 -0
- data/lib/iriq/recognizers/integer.rb +37 -0
- data/lib/iriq/recognizers/uuid.rb +16 -0
- data/lib/iriq/reducer.rb +37 -0
- data/lib/iriq/registrable_domain.rb +56 -0
- data/lib/iriq/segment_classifier.rb +475 -23
- data/lib/iriq/segment_hints.rb +9 -0
- data/lib/iriq/shape.rb +106 -0
- data/lib/iriq/specificity.rb +35 -0
- data/lib/iriq/storage/json.rb +43 -0
- data/lib/iriq/storage/memory.rb +209 -0
- data/lib/iriq/storage/sqlite.rb +546 -0
- data/lib/iriq/storage.rb +35 -0
- data/lib/iriq/synthesized_recognizer.rb +56 -0
- data/lib/iriq/trace.rb +294 -0
- data/lib/iriq/version.rb +1 -1
- data/lib/iriq.rb +18 -0
- metadata +44 -8
- data/script/benchmark.rb +0 -81
- data/script/memory.rb +0 -121
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
module Iriq
  # Claim-strength bands, one per Recognizer. When several Recognizers fire
  # on the same segment, the ensemble keeps the candidate with the largest
  # specificity × confidence product, so a higher band wins a tie in
  # confidence.
  #
  # These are coarse bands over the current type taxonomy, not a linear
  # "how confident" scale. Each band answers: how surprising would it be for
  # this Recognizer to match a value that is really some other type? A UUID
  # has such a distinctive shape that an accidental match is vanishingly
  # unlikely (SEMANTIC), whereas a 4-digit integer might just as well be a
  # year, an HTTP status, or an ID, which is why `:integer` only claims
  # TYPED.
  #
  # Whether the numbers are well chosen is decided by the calibration corpus
  # tests (spec/iriq/calibration_spec.rb, and calibration_test.go on the Go
  # side): change a value, re-run them, and see.
  module Specificity
    # Shapes with an unambiguous meaning; the pattern effectively cannot
    # match by accident.
    # Examples: UUID, JWT, email with @, URL with ://, color hex.
    SEMANTIC = 1.0

    # Restrictive, structured patterns that can still overlap a broader
    # type at the edges.
    # Examples: date, file with known ext, ipv4, mime.
    STRUCTURED = 0.8

    # Shapes narrowed by an additional bound (a range or an allowlist) so
    # that the shape alone carries meaning.
    # Examples: timestamp, currency, country, boolean.
    BOUNDED = 0.7

    # Typed, but lexically broad.
    # Examples: integer, float, version.
    TYPED = 0.5

    # Generic shapes recognised purely by pattern.
    # Example: slug.
    PATTERN = 0.3

    # Catch-all shapes used when nothing more specific applies.
    # Examples: literal, opaque_id.
    FALLBACK = 0.1
  end
end
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
require "json"
|
|
2
|
+
|
|
3
|
+
module Iriq
  module Storage
    # Json is the Memory backend bound to a JSON file: Json.open loads the
    # file's contents (when it exists and is non-empty) and #save writes the
    # current state back. The JSON shape is the same as the pre-Storage
    # release, so files round-trip across versions.
    #
    # NOTE(review): this class does not override #close, and Memory#close is
    # a no-op, so persisting on close is presumably driven by the caller
    # invoking #save first — confirm against Corpus.
    class Json < Memory
      # Path this store loads from / saves to by default; nil when the store
      # was built without one.
      attr_reader :path

      # path: - default target for #save (may be nil).
      # opts  - forwarded unchanged to Memory#initialize.
      def initialize(path: nil, **opts)
        super(**opts)
        @path = path
      end

      # Builds a store bound to `path` and loads the file if it is present
      # and non-empty. A missing or zero-byte file yields an empty store
      # that is still bound to `path`.
      #
      # Returns the new Json store.
      def self.open(path, **opts)
        s = new(path: path, **opts)
        # File.size? is nil for a missing or empty file, so one stat call
        # covers both "does it exist" and "is there anything to parse".
        s.load!(path) if File.size?(path)
        s
      end

      # Replaces this store's state with the dump parsed from `path` and
      # rebinds the store to that path. An empty file leaves both the state
      # and the bound path untouched.
      #
      # Raises whatever File.read / JSON.parse raise (e.g. Errno::ENOENT,
      # JSON::ParserError).
      # Returns self.
      def load!(path)
        data = File.read(path)
        return self if data.empty?

        load_dump!(JSON.parse(data))
        @path = path
        self
      end

      # Serializes the store to JSON and writes it via a sibling
      # "<target>.tmp" file that is then renamed over the target, so the
      # target is never left half-written. Defaults to the path given at
      # open(); pass an explicit path to write elsewhere (the bound path is
      # not changed).
      #
      # Raises ArgumentError when neither an argument nor a bound path is
      # available; I/O and serialization errors propagate after the
      # temporary file has been removed.
      def save(path = nil)
        target = path || @path
        raise ArgumentError, "no path provided" unless target

        tmp = "#{target}.tmp"
        begin
          File.write(tmp, JSON.generate(to_dump))
          File.rename(tmp, target)
        rescue StandardError
          # Don't leave a stale or partial temp file next to the target when
          # the write or the rename fails.
          File.delete(tmp) if File.exist?(tmp)
          raise
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
module Iriq
  module Storage
    # Memory is the canonical backend — every other backend either wraps it
    # (Json) or implements the same surface against an external store
    # (Sqlite).
    #
    # The contract is small enough to enumerate up top:
    #
    #   increment_host(host)
    #   increment_path_length(length)
    #   increment_raw_shape(shape)
    #   increment_fingerprint(shape)
    #   observe_position(position, value, type)   # position is Iriq::Position
    #   add_to_cluster(key, host, scheme, shape, identifier)
    #   record_observation(canonical)             # append to source-IRI log
    #
    #   host_counts / path_length_counts / raw_shape_counts / fingerprint_counts
    #   position_stats(position)
    #   each_position_stats { |position, stats| ... }
    #   each_observed_iri { |canonical| ... }
    #   clear_materialized_views                  # for reinfer
    #   clusters / cluster_size
    #
    #   transaction { ... }                       # backends may batch within
    #   flush                                     # commit pending writes (no-op for Memory)
    #   close                                     # release resources
    class Memory
      # Cap handed to every PositionStats and Cluster this store creates.
      attr_reader :max_values_per_position

      # Path of the underlying file, if any. Memory backends are unpathed;
      # Json/Sqlite override.
      def path
        nil
      end

      # classifier:              - passed to Cluster#add for every identifier
      #                            added through #add_to_cluster.
      # max_values_per_position: - see #max_values_per_position.
      def initialize(classifier: SegmentClassifier::DEFAULT,
                     max_values_per_position: PositionStats::DEFAULT_MAX_VALUES)
        @classifier = classifier
        @max_values_per_position = max_values_per_position
        @host_counts = Hash.new(0)
        @path_length_counts = Hash.new(0)
        @raw_shape_counts = Hash.new(0)
        @fingerprint_counts = Hash.new(0)
        @position_stats = {}
        @clusters = {}
        # The source-IRI log. Persisted alongside materialized views; the
        # log is the source of truth, the views are derived. Corpus#reinfer
        # drops the views and replays the log through events + reducers.
        @observed_iris = []
        # Recognizers promoted from RecognizerProposal via
        # Corpus#activate_proposal. Stored as {prefix, type, specificity}
        # hashes so reopens can re-synthesize them onto the corpus's
        # classifier.
        @activated_recognizers = []
      end

      # Yields this store to the block. Nothing is batched or rolled back
      # in memory; the method exists so callers can treat all backends
      # alike.
      def transaction
        yield self
      end

      # Runs the block as-is (nothing is yielded to it).
      def batch
        yield
      end

      # No pending writes to commit in memory.
      def flush; end

      # No resources to release in memory.
      def close; end

      # No-op for in-memory; subclasses override.
      def save(path = nil); end

      # --- Increments -------------------------------------------------------

      # Counts one observation of `host`; a nil host is ignored.
      def increment_host(host)
        @host_counts[host] += 1 if host
      end

      # Counts one observation of a path with `length` segments.
      def increment_path_length(length)
        @path_length_counts[length] += 1
      end

      # Counts one observation of the raw path shape.
      def increment_raw_shape(shape)
        @raw_shape_counts[shape] += 1
      end

      # Counts one observation of the fingerprint shape.
      def increment_fingerprint(shape)
        @fingerprint_counts[shape] += 1
      end

      # Records `value`/`type` at `position`, creating that position's
      # PositionStats (capped at max_values_per_position) on first use.
      def observe_position(position, value, type)
        stats = @position_stats[position] ||= PositionStats.new(max_values: @max_values_per_position)
        stats.observe(value, type)
      end

      # Adds `identifier` to the cluster stored under `key`, creating the
      # cluster from host/scheme/shape the first time the key is seen.
      #
      # Returns the Cluster.
      def add_to_cluster(key, host, scheme, shape, identifier)
        cluster = @clusters[key] ||= Cluster.new(
          key: key, host: host, scheme: scheme, shape: shape,
          max_values: @max_values_per_position,
        )
        cluster.add(identifier, classifier: @classifier)
        cluster
      end

      # Append a canonical IRI to the source-IRI log. Called by
      # Corpus#observe after the event reducers have applied; the log is the
      # source of truth that Corpus#reinfer replays.
      def record_observation(canonical)
        @observed_iris << canonical
      end

      # Iterates the source-IRI log in insertion order.
      def each_observed_iri(&block)
        @observed_iris.each(&block)
      end

      # Number of entries in the source-IRI log.
      def observed_iri_count
        @observed_iris.size
      end

      # --- Activated recognizers (Corpus#activate_proposal) -----------------

      # Appends one activated-recognizer dump to the list.
      def record_activated_recognizer(dump)
        @activated_recognizers << dump
      end

      # Iterates the activated-recognizer dumps in insertion order.
      def each_activated_recognizer(&block)
        @activated_recognizers.each(&block)
      end

      # Number of activated-recognizer dumps recorded.
      def activated_recognizer_count
        @activated_recognizers.size
      end

      # Drop every materialized view (host_counts, position_stats, clusters,
      # …) without touching the source-IRI log or the activated
      # recognizers. Corpus#reinfer calls this before replaying the log so
      # views rebuild from scratch.
      def clear_materialized_views
        @host_counts = Hash.new(0)
        @path_length_counts = Hash.new(0)
        @raw_shape_counts = Hash.new(0)
        @fingerprint_counts = Hash.new(0)
        @position_stats = {}
        @clusters = {}
      end

      # --- Reads ------------------------------------------------------------

      # The counter readers return the live internal hashes (default 0).
      def host_counts; @host_counts; end
      def path_length_counts; @path_length_counts; end
      def raw_shape_counts; @raw_shape_counts; end
      def fingerprint_counts; @fingerprint_counts; end

      # PositionStats recorded for `position`, or nil if none yet.
      def position_stats(position)
        @position_stats[position]
      end

      # Yields each (position, stats) pair.
      def each_position_stats(&block)
        @position_stats.each(&block)
      end

      # All clusters, as an array.
      def clusters
        @clusters.values
      end

      # Number of distinct cluster keys.
      def cluster_size
        @clusters.size
      end

      # O(1) lookup by cluster key — used by Corpus#normalize to pull the
      # cluster's param_stats for the URL being normalized. nil if no
      # cluster has been observed under this key yet.
      def cluster_for(key)
        @clusters[key]
      end

      # --- Bulk load (used by JSON backend) --------------------------------

      # Replaces the whole state with the contents of `h`, a Hash with the
      # string keys produced by #to_dump (typically the result of
      # JSON.parse).
      #
      # Every section is optional: a section that is absent or null loads
      # as empty, so partial dumps and dumps written before a section
      # existed still load instead of raising.
      #
      # Returns self.
      def load_dump!(h)
        @host_counts = Hash.new(0).merge(h["host_counts"] || {})
        # JSON object keys are always strings; path lengths are integers.
        @path_length_counts = Hash.new(0).merge((h["path_length_counts"] || {}).transform_keys(&:to_i))
        @raw_shape_counts = Hash.new(0).merge(h["raw_shape_counts"] || {})
        @fingerprint_counts = Hash.new(0).merge(h["fingerprint_counts"] || {})
        # Block form: the default constant is only resolved when the dump
        # does not carry its own cap.
        @max_values_per_position = h.fetch("max_values_per_position") { PositionStats::DEFAULT_MAX_VALUES }
        @position_stats = (h["position_stats"] || []).each_with_object({}) do |entry, acc|
          position = Position.from_dump(entry["position"])
          acc[position] = PositionStats.from_dump(entry["stats"])
        end
        cdump = h["clusterer"] || {}
        @clusters = (cdump["clusters"] || {}).transform_values do |c|
          Cluster.from_dump(c, max_values: @max_values_per_position)
        end
        @observed_iris = h["observed_iris"] || []
        @activated_recognizers = h["activated_recognizers"] || []
        self
      end

      # Builds the Hash form of the store (string keys throughout) that
      # #load_dump! reads back. The counter hashes and the two logs are the
      # live internal objects, not copies.
      def to_dump
        {
          "host_counts" => @host_counts,
          # Stringified so the keys survive a JSON round trip unchanged.
          "path_length_counts" => @path_length_counts.transform_keys(&:to_s),
          "raw_shape_counts" => @raw_shape_counts,
          "fingerprint_counts" => @fingerprint_counts,
          "max_values_per_position" => @max_values_per_position,
          "position_stats" => @position_stats.map { |pos, s|
            { "position" => pos.to_dump, "stats" => s.dump }
          },
          "clusterer" => {
            "clusters" => @clusters.transform_values(&:dump),
          },
          "observed_iris" => @observed_iris,
          "activated_recognizers" => @activated_recognizers,
        }
      end
    end
  end
end
|