iriq 0.1.0 → 0.30.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +87 -0
  3. data/CLAUDE.md +208 -0
  4. data/Gemfile.lock +8 -2
  5. data/Makefile +113 -0
  6. data/README.md +249 -270
  7. data/completions/_iriq +52 -0
  8. data/completions/iriq.bash +70 -0
  9. data/docs/ARCHITECTURE.md +223 -0
  10. data/docs/ROADMAP.md +190 -0
  11. data/iriq.gemspec +5 -4
  12. data/lib/iriq/cli.rb +402 -49
  13. data/lib/iriq/cluster.rb +304 -8
  14. data/lib/iriq/clusterer.rb +19 -44
  15. data/lib/iriq/corpus.rb +417 -81
  16. data/lib/iriq/cross_host_shape.rb +37 -0
  17. data/lib/iriq/event.rb +22 -0
  18. data/lib/iriq/evidence.rb +114 -0
  19. data/lib/iriq/explanation.rb +1 -1
  20. data/lib/iriq/normalizer.rb +71 -29
  21. data/lib/iriq/parser.rb +1 -1
  22. data/lib/iriq/path_shape.rb +30 -24
  23. data/lib/iriq/position.rb +75 -0
  24. data/lib/iriq/position_stats.rb +74 -8
  25. data/lib/iriq/recognizer.rb +54 -0
  26. data/lib/iriq/recognizer_proposal.rb +167 -0
  27. data/lib/iriq/recognizers/date.rb +53 -0
  28. data/lib/iriq/recognizers/integer.rb +37 -0
  29. data/lib/iriq/recognizers/uuid.rb +16 -0
  30. data/lib/iriq/reducer.rb +37 -0
  31. data/lib/iriq/registrable_domain.rb +56 -0
  32. data/lib/iriq/segment_classifier.rb +475 -23
  33. data/lib/iriq/segment_hints.rb +9 -0
  34. data/lib/iriq/shape.rb +106 -0
  35. data/lib/iriq/specificity.rb +35 -0
  36. data/lib/iriq/storage/json.rb +43 -0
  37. data/lib/iriq/storage/memory.rb +209 -0
  38. data/lib/iriq/storage/sqlite.rb +546 -0
  39. data/lib/iriq/storage.rb +35 -0
  40. data/lib/iriq/synthesized_recognizer.rb +56 -0
  41. data/lib/iriq/trace.rb +294 -0
  42. data/lib/iriq/version.rb +1 -1
  43. data/lib/iriq.rb +18 -0
  44. metadata +44 -8
  45. data/script/benchmark.rb +0 -81
  46. data/script/memory.rb +0 -121
@@ -0,0 +1,35 @@
1
module Iriq
  # Per-Recognizer claim strength. Higher specificity wins when multiple
  # Recognizers fire on the same segment; the ensemble picks the
  # max(specificity × confidence).
  #
  # The bands below capture the current type taxonomy at a coarse grain.
  # They are explicitly NOT linear "how confident" scores: they encode "how
  # surprising would it be for this Recognizer to fire by accident on a
  # different actual type." UUID's shape is so distinctive that a non-UUID
  # producing that string is vanishingly unlikely (SEMANTIC); a 4-digit
  # integer could plausibly be a year, an HTTP status, or an ID, so
  # `:integer` claims only TYPED.
  #
  # Calibration corpus tests in spec/iriq/calibration_spec.rb / Go's
  # calibration_test.go are the source of truth for whether these
  # values are well-chosen — adjust them and re-run to validate.
  module Specificity
    # Unambiguous semantic shapes — the regex effectively can't fire by
    # accident. (UUID, JWT, email with @, URL with ://, color hex.)
    SEMANTIC = 1.0

    # Restrictive structured patterns. Could collide with broader types
    # at the edges. (date, file with known ext, ipv4, mime.)
    STRUCTURED = 0.8

    # Digit-shaped with an additional bound — range or allowlist — that
    # makes the shape alone meaningful. (timestamp, currency, country,
    # boolean.)
    BOUNDED = 0.7

    # Lexically broad but typed. (integer, float, version.)
    TYPED = 0.5

    # Generic pattern-based shape. (slug.)
    PATTERN = 0.3

    # Generic fallback shapes. (literal, opaque_id.)
    FALLBACK = 0.1
  end
end
@@ -0,0 +1,43 @@
1
require "json"

module Iriq
  module Storage
    # Json is a Memory backend bound to a JSON file: .open loads the file
    # when it exists and is non-empty, and #save writes the current state
    # back out. #close is inherited unchanged from Memory, so persisting is
    # an explicit #save call.
    #
    # Same JSON shape as the pre-Storage release, so files round-trip across
    # versions.
    class Json < Memory
      # Path this store loads from / saves to by default (nil if unset).
      attr_reader :path

      # path: default target for #save; every other keyword is forwarded
      # to Memory#initialize.
      def initialize(path: nil, **opts)
        super(**opts)
        @path = path
      end

      # Build a store bound to +path+ and populate it from the file when the
      # file exists and is non-empty. A missing or zero-byte file yields an
      # empty store that still remembers +path+ for a later #save.
      def self.open(path, **opts)
        store = new(path: path, **opts)
        # File.size? is nil for both "does not exist" and "zero bytes".
        store.load!(path) if File.size?(path)
        store
      end

      # Replace this store's state with the JSON document at +path+ and
      # adopt +path+ as the default save target. An empty file is a no-op.
      # Returns self.
      def load!(path)
        data = File.read(path)
        return self if data.empty?

        load_dump!(JSON.parse(data))
        @path = path
        self
      end

      # save writes atomically (tmp + rename). Defaults to the path passed at
      # open(); pass an explicit path to write elsewhere.
      #
      # Raises ArgumentError when neither an argument nor a stored path is
      # available. If the write or the rename fails, the temp file is removed
      # before the error is re-raised so no stale "<target>.tmp" is left.
      def save(path = nil)
        target = path || @path
        raise ArgumentError, "no path provided" unless target

        payload = JSON.generate(to_dump)
        tmp = "#{target}.tmp"
        begin
          File.write(tmp, payload)
          File.rename(tmp, target)
        rescue StandardError
          File.delete(tmp) if File.exist?(tmp)
          raise
        end
      end
    end
  end
end
@@ -0,0 +1,209 @@
1
module Iriq
  module Storage
    # Memory is the canonical backend — every other backend either wraps it
    # (Json) or implements the same surface against an external store (Sqlite).
    #
    # The contract is small enough to enumerate up top:
    #
    #   increment_host(host)
    #   increment_path_length(length)
    #   increment_raw_shape(shape)
    #   increment_fingerprint(shape)
    #   observe_position(position, value, type)   # position is Iriq::Position
    #   add_to_cluster(key, host, scheme, shape, identifier)
    #   record_observation(canonical)             # append to source-IRI log
    #
    #   host_counts / path_length_counts / raw_shape_counts / fingerprint_counts
    #   position_stats(position)
    #   each_position_stats { |position, stats| ... }
    #   each_observed_iri { |canonical| ... }
    #   clear_materialized_views                  # for reinfer
    #   clusters / cluster_size
    #
    #   transaction { ... }                       # backends may batch within
    #   flush                                     # commit pending writes (no-op for Memory)
    #   close                                     # release resources
    class Memory
      attr_reader :max_values_per_position

      # Materialized count views. Each is a Hash whose default value is 0.
      attr_reader :host_counts, :path_length_counts, :raw_shape_counts, :fingerprint_counts

      # classifier: handed to Cluster#add for every identifier added.
      # max_values_per_position: cap forwarded to each PositionStats and
      # Cluster this store creates.
      def initialize(classifier: SegmentClassifier::DEFAULT,
                     max_values_per_position: PositionStats::DEFAULT_MAX_VALUES)
        @classifier = classifier
        @max_values_per_position = max_values_per_position
        reset_views
        # The source-IRI log. Persisted alongside materialized views; the
        # log is the source of truth, the views are derived. Corpus#reinfer
        # drops the views and replays the log through events + reducers.
        @observed_iris = []
        # Recognizers promoted from RecognizerProposal via
        # Corpus#activate_proposal. Stored as {prefix, type, specificity}
        # hashes so reopens can re-synthesize them onto the corpus's
        # classifier.
        @activated_recognizers = []
      end

      # Path of the underlying file, if any. Memory backends are unpathed;
      # Json/Sqlite override.
      def path
        nil
      end

      # Yields this store; in-memory writes need no batching.
      def transaction
        yield self
      end

      # Yields with no arguments; nothing to batch in memory.
      def batch
        yield
      end

      def flush; end

      def close; end

      # No-op for in-memory; subclasses override.
      def save(path = nil); end

      # --- Increments -------------------------------------------------------

      # Counts one occurrence of +host+; a nil host is ignored.
      def increment_host(host)
        @host_counts[host] += 1 if host
      end

      def increment_path_length(length)
        @path_length_counts[length] += 1
      end

      def increment_raw_shape(shape)
        @raw_shape_counts[shape] += 1
      end

      def increment_fingerprint(shape)
        @fingerprint_counts[shape] += 1
      end

      # Records (value, type) against +position+, creating its PositionStats
      # on first sight.
      def observe_position(position, value, type)
        stats = @position_stats[position] ||= PositionStats.new(max_values: @max_values_per_position)
        stats.observe(value, type)
      end

      # Adds +identifier+ to the cluster stored under +key+, creating the
      # cluster on first sight. Returns the cluster.
      def add_to_cluster(key, host, scheme, shape, identifier)
        cluster = @clusters[key] ||= Cluster.new(
          key: key, host: host, scheme: scheme, shape: shape,
          max_values: @max_values_per_position,
        )
        cluster.add(identifier, classifier: @classifier)
        cluster
      end

      # Append a canonical IRI to the source-IRI log. Called by Corpus#observe
      # after the event reducers have applied; the log is the source of truth
      # that Corpus#reinfer replays.
      def record_observation(canonical)
        @observed_iris << canonical
      end

      def each_observed_iri(&block)
        @observed_iris.each(&block)
      end

      def observed_iri_count
        @observed_iris.size
      end

      # --- Activated recognizers (Corpus#activate_proposal) -----------------

      def record_activated_recognizer(dump)
        @activated_recognizers << dump
      end

      def each_activated_recognizer(&block)
        @activated_recognizers.each(&block)
      end

      def activated_recognizer_count
        @activated_recognizers.size
      end

      # Drop every materialized view (host_counts, position_stats, clusters,
      # …) without touching the source-IRI log. Corpus#reinfer calls this
      # before replaying the log so views rebuild from scratch.
      def clear_materialized_views
        reset_views
      end

      # --- Reads ------------------------------------------------------------

      # PositionStats recorded for +position+, or nil if none.
      def position_stats(position)
        @position_stats[position]
      end

      def each_position_stats(&block)
        @position_stats.each(&block)
      end

      def clusters
        @clusters.values
      end

      def cluster_size
        @clusters.size
      end

      # O(1) lookup by cluster key — used by Corpus#normalize to pull the
      # cluster's param_stats for the URL being normalized. nil if no cluster
      # has been observed under this key yet.
      def cluster_for(key)
        @clusters[key]
      end

      # --- Bulk load (used by JSON backend) --------------------------------

      # Replace all state with the parsed dump +h+ (string keys, as produced
      # by #to_dump after a JSON round trip). Any section that is missing or
      # null is treated as empty, so partial or older dumps load instead of
      # raising. Returns self.
      def load_dump!(h)
        @host_counts = Hash.new(0).merge(h["host_counts"] || {})
        # JSON object keys are strings; path lengths are integers in memory.
        @path_length_counts = Hash.new(0).merge((h["path_length_counts"] || {}).transform_keys(&:to_i))
        @raw_shape_counts = Hash.new(0).merge(h["raw_shape_counts"] || {})
        @fingerprint_counts = Hash.new(0).merge(h["fingerprint_counts"] || {})
        # Block form: the default constant is only resolved when needed.
        @max_values_per_position = h.fetch("max_values_per_position") { PositionStats::DEFAULT_MAX_VALUES }
        @position_stats = (h["position_stats"] || []).each_with_object({}) do |entry, acc|
          position = Position.from_dump(entry["position"])
          acc[position] = PositionStats.from_dump(entry["stats"])
        end
        cluster_dumps = h.dig("clusterer", "clusters") || {}
        @clusters = cluster_dumps.transform_values do |c|
          Cluster.from_dump(c, max_values: @max_values_per_position)
        end
        @observed_iris = h["observed_iris"] || []
        @activated_recognizers = h["activated_recognizers"] || []
        self
      end

      # Serializable snapshot of the whole store, keyed by strings.
      def to_dump
        {
          "host_counts" => @host_counts,
          "path_length_counts" => @path_length_counts.transform_keys(&:to_s),
          "raw_shape_counts" => @raw_shape_counts,
          "fingerprint_counts" => @fingerprint_counts,
          "max_values_per_position" => @max_values_per_position,
          "position_stats" => @position_stats.map { |pos, s|
            { "position" => pos.to_dump, "stats" => s.dump }
          },
          "clusterer" => {
            "clusters" => @clusters.transform_values(&:dump),
          },
          "observed_iris" => @observed_iris,
          "activated_recognizers" => @activated_recognizers,
        }
      end

      private

      # Start every materialized view empty. Shared by #initialize and
      # #clear_materialized_views; leaves the source-IRI log and the
      # activated-recognizer list alone.
      def reset_views
        @host_counts = Hash.new(0)
        @path_length_counts = Hash.new(0)
        @raw_shape_counts = Hash.new(0)
        @fingerprint_counts = Hash.new(0)
        @position_stats = {}
        @clusters = {}
      end
    end
  end
end