iriq 0.1.0 → 0.30.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +87 -0
  3. data/CLAUDE.md +208 -0
  4. data/Gemfile.lock +8 -2
  5. data/Makefile +113 -0
  6. data/README.md +249 -270
  7. data/completions/_iriq +52 -0
  8. data/completions/iriq.bash +70 -0
  9. data/docs/ARCHITECTURE.md +223 -0
  10. data/docs/ROADMAP.md +190 -0
  11. data/iriq.gemspec +5 -4
  12. data/lib/iriq/cli.rb +402 -49
  13. data/lib/iriq/cluster.rb +304 -8
  14. data/lib/iriq/clusterer.rb +19 -44
  15. data/lib/iriq/corpus.rb +417 -81
  16. data/lib/iriq/cross_host_shape.rb +37 -0
  17. data/lib/iriq/event.rb +22 -0
  18. data/lib/iriq/evidence.rb +114 -0
  19. data/lib/iriq/explanation.rb +1 -1
  20. data/lib/iriq/normalizer.rb +71 -29
  21. data/lib/iriq/parser.rb +1 -1
  22. data/lib/iriq/path_shape.rb +30 -24
  23. data/lib/iriq/position.rb +75 -0
  24. data/lib/iriq/position_stats.rb +74 -8
  25. data/lib/iriq/recognizer.rb +54 -0
  26. data/lib/iriq/recognizer_proposal.rb +167 -0
  27. data/lib/iriq/recognizers/date.rb +53 -0
  28. data/lib/iriq/recognizers/integer.rb +37 -0
  29. data/lib/iriq/recognizers/uuid.rb +16 -0
  30. data/lib/iriq/reducer.rb +37 -0
  31. data/lib/iriq/registrable_domain.rb +56 -0
  32. data/lib/iriq/segment_classifier.rb +475 -23
  33. data/lib/iriq/segment_hints.rb +9 -0
  34. data/lib/iriq/shape.rb +106 -0
  35. data/lib/iriq/specificity.rb +35 -0
  36. data/lib/iriq/storage/json.rb +43 -0
  37. data/lib/iriq/storage/memory.rb +209 -0
  38. data/lib/iriq/storage/sqlite.rb +546 -0
  39. data/lib/iriq/storage.rb +35 -0
  40. data/lib/iriq/synthesized_recognizer.rb +56 -0
  41. data/lib/iriq/trace.rb +294 -0
  42. data/lib/iriq/version.rb +1 -1
  43. data/lib/iriq.rb +18 -0
  44. metadata +44 -8
  45. data/script/benchmark.rb +0 -81
  46. data/script/memory.rb +0 -121
data/lib/iriq/cluster.rb CHANGED
@@ -1,31 +1,79 @@
1
1
  module Iriq
2
2
  # A group of identifiers that share a host + shape key. Tracks examples and
3
3
  # per-position segment statistics so callers can ask which positions are
4
- # actually stable in practice (e.g. /users/ always literal, /{integer_id}
4
+ # actually stable in practice (e.g. /users/ always literal, /{integer}
5
5
  # always variable).
6
6
  class Cluster
7
- attr_reader :key, :host, :scheme, :shape, :examples, :count
7
+ attr_reader :key, :host, :scheme, :shape, :examples, :count, :param_stats, :max_values
8
+
9
+ # Structured Shape lazily derived from the first observed example —
10
+ # Iriq::Shape, or nil if no examples are present yet. Cached after the
11
+ # first call.
12
+ def shape_object(classifier: SegmentClassifier::DEFAULT)
13
+ return @shape_object if @shape_object
14
+ return nil if @examples.empty?
15
+
16
+ @shape_object = Shape.from_segments(@examples.first.path_segments, classifier: classifier)
17
+ end
8
18
 
9
19
  MAX_EXAMPLES = 10
10
20
 
11
- def initialize(key:, host:, scheme:, shape:)
21
+ # Share of date-typed observations required before the corpus promotes
22
+ # a param to :date. 8-digit IDs in the 1900..2100 range look like
23
+ # YYYYMMDD by accident — without quorum we'd canonicalize random IDs.
24
+ DATE_CONFIDENCE_THRESHOLD = 0.8
25
+
26
+ # `:number` umbrella thresholds. Promote a position to :number when
27
+ # the combined :integer + :float observations dominate (≥ 80%)
28
+ # AND neither subtype alone hits the strong threshold (we have a clear
29
+ # numeric pattern but it isn't purely ints or purely floats).
30
+ NUMBER_CONFIDENCE_THRESHOLD = 0.8
31
+ NUMBER_SUBTYPE_THRESHOLD = 0.8
32
+
33
+ # `:enum` thresholds. Promote a param to :enum when the corpus has seen
34
+ # enough samples to trust the bound, the value set is small, each value
35
+ # appears more than once (rules out singletons), and the tracked values
36
+ # account for nearly all observations (lets a few stragglers through).
37
+ ENUM_MIN_OBSERVATIONS = 20
38
+ ENUM_MAX_CARDINALITY = 10
39
+ ENUM_MIN_VALUE_COUNT = 2
40
+ ENUM_MIN_COVERAGE = 0.95
41
+
42
+ def initialize(key:, host:, scheme:, shape:, max_values: PositionStats::DEFAULT_MAX_VALUES)
12
43
  @key = key
13
44
  @host = host
14
45
  @scheme = scheme
15
46
  @shape = shape
47
+ @shape_object = nil
16
48
  @examples = []
49
+ @example_keys = Set.new
17
50
  @count = 0
18
51
  @segment_counts = []
52
+ @max_values = max_values
53
+ # Query-param stats keyed by param name. Each is a PositionStats — same
54
+ # cardinality cap, same type-counts machinery, just indexed by ?key=
55
+ # instead of by path position.
56
+ @param_stats = {}
19
57
  end
20
58
 
21
- def add(identifier)
59
+ def add(identifier, classifier: SegmentClassifier::DEFAULT)
22
60
  @count += 1
23
- @examples << identifier if @examples.size < MAX_EXAMPLES
61
+ if @examples.size < MAX_EXAMPLES
62
+ canon = identifier.canonical
63
+ @examples << identifier unless @example_keys.include?(canon)
64
+ @example_keys << canon
65
+ end
24
66
 
25
67
  identifier.path_segments.each_with_index do |seg, i|
26
68
  @segment_counts[i] ||= Hash.new(0)
27
69
  @segment_counts[i][seg] += 1
28
70
  end
71
+
72
+ return unless identifier.query_params
73
+ identifier.query_params.each do |name, value|
74
+ stats = @param_stats[name] ||= PositionStats.new(max_values: @max_values)
75
+ stats.observe(value.to_s, classifier.classify(value.to_s))
76
+ end
29
77
  end
30
78
 
31
79
  # Per-position summary:
@@ -52,9 +100,223 @@ module Iriq
52
100
  count: count,
53
101
  examples: examples.map(&:canonical),
54
102
  segments: segment_stats,
103
+ params: param_summary,
55
104
  }
56
105
  end
57
106
 
107
+ # Per-param summary, ordered by descending presence. Each entry is:
108
+ # { name: "page", count: N, type: :integer, cardinality: K, presence: 0.83 }
109
+ # presence is count / @count — the fraction of observations that had
110
+ # this param.
111
+ def param_summary
112
+ return [] if @param_stats.empty?
113
+
114
+ @param_stats.map { |name, _stats|
115
+ stats = @param_stats[name]
116
+ type = param_type(name)
117
+ row = {
118
+ name: name,
119
+ count: stats.total,
120
+ type: type,
121
+ cardinality: stats.cardinality,
122
+ presence: @count.positive? ? stats.total.to_f / @count : 0.0,
123
+ }
124
+ row[:values] = enum_values(stats) if type == :enum
125
+ # Verbose value distribution — fractions over tracked occurrences.
126
+ # Boolean and enum positions get the per-value breakdown (e.g.
127
+ # `true: 0.97, false: 0.03`). Number positions get the int-vs-float
128
+ # split via :subtype_distribution.
129
+ if type == :boolean || type == :enum
130
+ row[:value_distribution] = value_distribution(stats)
131
+ end
132
+ if type == :number
133
+ row[:subtype_distribution] = subtype_distribution(stats, %i[integer float])
134
+ end
135
+ # :file kind breakdown — derived from tracked value_counts at
136
+ # summary time. Best-effort: only reflects observations within
137
+ # the value-tracking cap.
138
+ if type == :file
139
+ row[:kind_distribution] = file_kind_distribution(stats)
140
+ end
141
+ if stats.numeric_count.positive?
142
+ row[:min] = stats.numeric_min
143
+ row[:max] = stats.numeric_max
144
+ row[:avg] = stats.numeric_avg
145
+ end
146
+ row
147
+ }.sort_by { |row| [-row[:count], row[:name]] }
148
+ end
149
+
150
+ # Returns the type the corpus is confident enough to call this param:
151
+ # stats.dominant_type after the :year, :http_status, :enum, :number and
152
+ # param-name overrides. A dominant :date below DATE_CONFIDENCE_THRESHOLD
153
+ # falls back to the most-common non-date type (or :literal). Shared
154
+ # by Cluster#param_summary and Corpus#inferred_param_type so both views
155
+ # agree on what the corpus "thinks" about a param.
156
+ def param_type(name)
157
+ stats = @param_stats[name]
158
+ return nil unless stats
159
+ return nil if stats.total.zero?
160
+
161
+ type = stats.dominant_type
162
+
163
+ # :year takes priority over :enum for numeric range columns —
164
+ # a "years 2020..2026" position is more useful described as a
165
+ # ranged year than as an enum of those specific values.
166
+ return :year if year_position?(type, stats)
167
+ # :http_status — 3-digit ints clustered in 100..599 are almost
168
+ # certainly HTTP statuses. Same shape as :year (range check) but
169
+ # tighter window. Useful for `?status=...` or path positions that
170
+ # echo a status code.
171
+ return :http_status if http_status_position?(type, stats)
172
+
173
+ # :enum check — bounded set of repeated values trumps the underlying
174
+ # value type. `?status=active|draft|archived` surfaces as :enum
175
+ # (with the value list) rather than :literal even though each value
176
+ # individually classifies as a literal. Skip the override when the
177
+ # dominant type is already specific (`:boolean` carries more meaning
178
+ # than a 2-value enum).
179
+ return :enum if enum?(stats) && type != :boolean
180
+
181
+ # :date gate — demote when there isn't enough date-typed quorum.
182
+ if type == :date
183
+ date_frac = stats.type_counts[:date].to_f / stats.total
184
+ return type if date_frac >= DATE_CONFIDENCE_THRESHOLD
185
+
186
+ return dominant_excluding(stats, :date) || :literal
187
+ end
188
+
189
+ # :number umbrella — promote when ints + floats together dominate
190
+ # but neither alone is the clear winner.
191
+ if type == :integer || type == :float
192
+ int_frac = stats.type_counts[:integer].to_f / stats.total
193
+ float_frac = stats.type_counts[:float].to_f / stats.total
194
+ if int_frac < NUMBER_SUBTYPE_THRESHOLD &&
195
+ float_frac < NUMBER_SUBTYPE_THRESHOLD &&
196
+ (int_frac + float_frac) >= NUMBER_CONFIDENCE_THRESHOLD
197
+ return :number
198
+ end
199
+ end
200
+
201
+ # Param-name fallback — `?phone=...` overrides a generic literal
202
+ # type with `:phone` when the value's shape was too weak to detect
203
+ # on its own. Only fires for overridable types (literal/opaque_id/slug).
204
+ if (hint = SegmentClassifier.param_name_hint(name, type))
205
+ return hint
206
+ end
207
+
208
+ type
209
+ end
210
+
211
+ YEAR_RANGE = 1900..2100
212
+ YEAR_MIN_OBSERVATIONS = 5
213
+ YEAR_MIN_DISTINCT = 2
214
+ YEAR_MAX_DISTINCT = 150
215
+
216
+ def year_position?(type, stats)
217
+ return false unless type == :integer
218
+ return false if stats.numeric_count.zero?
219
+ return false if stats.cardinality < YEAR_MIN_DISTINCT
220
+ return false if stats.cardinality > YEAR_MAX_DISTINCT
221
+ return false if stats.total < YEAR_MIN_OBSERVATIONS
222
+
223
+ YEAR_RANGE.cover?(stats.numeric_min) && YEAR_RANGE.cover?(stats.numeric_max)
224
+ end
225
+
226
+ HTTP_STATUS_RANGE = 100..599
227
+ HTTP_STATUS_MIN_OBSERVATIONS = 5
228
+ HTTP_STATUS_MIN_DISTINCT = 2
229
+ HTTP_STATUS_MAX_DISTINCT = 30
230
+
231
+ def http_status_position?(type, stats)
232
+ return false unless type == :integer
233
+ return false if stats.numeric_count.zero?
234
+ return false if stats.cardinality < HTTP_STATUS_MIN_DISTINCT
235
+ return false if stats.cardinality > HTTP_STATUS_MAX_DISTINCT
236
+ return false if stats.total < HTTP_STATUS_MIN_OBSERVATIONS
237
+
238
+ HTTP_STATUS_RANGE.cover?(stats.numeric_min) && HTTP_STATUS_RANGE.cover?(stats.numeric_max)
239
+ end
240
+
241
+ # True when stats shows a bounded set of repeated values worth treating
242
+ # as an enum. See ENUM_* constants at the top of this class.
243
+ def enum?(stats)
244
+ return false if stats.total < ENUM_MIN_OBSERVATIONS
245
+ return false if stats.cardinality.zero? || stats.cardinality > ENUM_MAX_CARDINALITY
246
+ return false if stats.value_counts.any? { |_, n| n < ENUM_MIN_VALUE_COUNT }
247
+
248
+ coverage = stats.value_counts.values.sum.to_f / stats.total
249
+ coverage >= ENUM_MIN_COVERAGE
250
+ end
251
+
252
+ # Distinct values tracked for this param, ordered by descending count
253
+ # (lex tie-break). Returned alongside :enum-typed rows in param_summary
254
+ # so verbose/explain consumers can render the value set.
255
+ def enum_values(stats)
256
+ stats.value_counts.sort_by { |v, n| [-n, v] }.map(&:first)
257
+ end
258
+
259
+ # value_distribution returns the fraction of total observations each
260
+ # tracked value represents, ordered by descending count then lex. Used
261
+ # by param_summary for :boolean and :enum positions so callers can
262
+ # render "true 97%, false 3%"-style breakdowns.
263
+ def value_distribution(stats)
264
+ return {} if stats.total.zero?
265
+
266
+ stats.value_counts.sort_by { |v, n| [-n, v] }.to_h.transform_values do |n|
267
+ (n.to_f / stats.total).round(4)
268
+ end
269
+ end
270
+
271
+ # subtype_distribution slices type_counts to a specific subset and
272
+ # returns the fraction each subtype represents. Used for the :number
273
+ # umbrella to expose the int-vs-float split.
274
+ def subtype_distribution(stats, subtypes)
275
+ return {} if stats.total.zero?
276
+
277
+ subtypes.each_with_object({}) do |t, out|
278
+ n = stats.type_counts[t] || 0
279
+ out[t] = (n.to_f / stats.total).round(4) if n.positive?
280
+ end
281
+ end
282
+
283
+ # file_kind_distribution buckets tracked values by file kind and
284
+ # returns the fraction each kind represents over tracked observations.
285
+ # `:unknown` covers values that classified as :file but whose extension
286
+ # isn't in the kind allowlist (shouldn't normally happen since the
287
+ # classifier already gates on the kind map). Sums to ~1.0 (rounding
288
+ # aside); values past the value_counts cap are not represented.
289
+ def file_kind_distribution(stats)
290
+ return {} if stats.value_counts.empty?
291
+
292
+ total = stats.value_counts.values.sum
293
+ return {} if total.zero?
294
+
295
+ kinds = Hash.new(0)
296
+ stats.value_counts.each do |value, n|
297
+ kind = SegmentClassifier.file_kind(value) || :unknown
298
+ kinds[kind] += n
299
+ end
300
+ kinds.sort_by { |k, n| [-n, k.to_s] }.to_h.transform_values do |n|
301
+ (n.to_f / total).round(4)
302
+ end
303
+ end
304
+
305
+ # Most common type in stats.type_counts excluding `skip` — lex tie-break
306
+ # so the choice is deterministic across runtimes.
307
+ def dominant_excluding(stats, skip)
308
+ best = nil
309
+ best_count = -1
310
+ stats.type_counts.each do |t, n|
311
+ next if t == skip
312
+ if n > best_count || (n == best_count && t.to_s < best.to_s)
313
+ best = t
314
+ best_count = n
315
+ end
316
+ end
317
+ best
318
+ end
319
+
58
320
  # JSON-friendly dump for persistence (distinct from #to_h which is a
59
321
  # display form). Examples are dumped as canonical strings and re-parsed
60
322
  # on load.
@@ -67,15 +329,49 @@ module Iriq
67
329
  "count" => count,
68
330
  "examples" => examples.map(&:canonical),
69
331
  "segment_counts" => @segment_counts.map { |h| h || {} },
332
+ "param_stats" => @param_stats.transform_values(&:dump),
70
333
  }
71
334
  end
72
335
 
73
- def self.from_dump(h)
74
- cluster = new(key: h["key"], host: h["host"], scheme: h["scheme"], shape: h["shape"])
336
+ def self.from_dump(h, max_values: PositionStats::DEFAULT_MAX_VALUES)
337
+ cluster = new(
338
+ key: h["key"], host: h["host"], scheme: h["scheme"], shape: h["shape"],
339
+ max_values: max_values,
340
+ )
75
341
  cluster.instance_variable_set(:@count, h["count"])
76
- cluster.instance_variable_set(:@examples, h["examples"].map { |s| Parser.parse(s) })
342
+ examples = h["examples"].map { |s| Parser.parse(s) }
343
+ cluster.instance_variable_set(:@examples, examples)
344
+ cluster.instance_variable_set(:@example_keys, examples.map(&:canonical).to_set)
77
345
  cluster.instance_variable_set(:@segment_counts, h["segment_counts"].map { |sub| Hash.new(0).merge(sub) })
346
+ params = (h["param_stats"] || {}).transform_values { |sd| PositionStats.from_dump(sd) }
347
+ cluster.instance_variable_set(:@param_stats, params)
78
348
  cluster
79
349
  end
350
+
351
+ # Shared cluster-key derivation. Returns [key, host, scheme, shape] —
352
+ # callers that already have a hinted shape can pass it in to skip the
353
+ # recomputation; URN inputs ignore the override and always derive their
354
+ # own shape from the NSS value. `host:` overrides iri.host — used by
355
+ # Corpus when host_strategy collapses subdomains or ignores the host.
356
+ def self.key_for(iri, classifier:, shape: nil, host: nil)
357
+ if iri.urn?
358
+ ns, value = (iri.nss || "").split(":", 2)
359
+ derived = value ? urn_value_shape(ns, value, classifier) : nil
360
+ key = "urn:#{ns}:#{derived}"
361
+ [key, nil, "urn", key]
362
+ else
363
+ shape ||= PathShape.new(classifier: classifier).for(iri.path_segments)
364
+ effective_host = host.nil? ? iri.host : host
365
+ key = "#{iri.scheme}://#{effective_host}#{shape}"
366
+ [key, effective_host, iri.scheme, shape]
367
+ end
368
+ end
369
+
370
+ def self.urn_value_shape(ns, value, classifier)
371
+ entry = SegmentHints.derive([ns, value], classifier).last
372
+ return entry[:value] unless entry[:variable]
373
+
374
+ "{#{entry[:hint] || entry[:type]}}"
375
+ end
80
376
  end
81
377
  end
data/lib/iriq/clusterer.rb CHANGED
@@ -3,31 +3,28 @@ module Iriq
3
3
  # `clusters` to read out the groups. `explain` annotates a single identifier
4
4
  # against the cluster it would fall into, including which positions are
5
5
  # stable across all observed members.
6
+ #
7
+ # Implemented as a thin wrapper over Storage::Memory — the same code path
8
+ # Corpus uses for the cluster portion of its state, so there's only one
9
+ # place that knows how clusters get stored.
6
10
  class Clusterer
7
11
  def initialize(classifier: SegmentClassifier::DEFAULT)
8
12
  @classifier = classifier
9
- @clusters = {}
13
+ @storage = Storage::Memory.new(classifier: classifier)
10
14
  end
11
15
 
12
16
  def add(input, shape: nil)
13
17
  iri = coerce(input)
14
- key, host, scheme, shape = cluster_key(iri, shape: shape)
15
- cluster = @clusters[key] ||= Cluster.new(
16
- key: key,
17
- host: host,
18
- scheme: scheme,
19
- shape: shape,
20
- )
21
- cluster.add(iri)
22
- cluster
18
+ key, host, scheme, derived = Cluster.key_for(iri, classifier: @classifier, shape: shape)
19
+ @storage.add_to_cluster(key, host, scheme, derived, iri)
23
20
  end
24
21
 
25
22
  def clusters
26
- @clusters.values
23
+ @storage.clusters
27
24
  end
28
25
 
29
26
  def size
30
- @clusters.size
27
+ @storage.cluster_size
31
28
  end
32
29
 
33
30
  # Returns a per-segment explanation for the input, merging classifier
@@ -36,8 +33,8 @@ module Iriq
36
33
  # would otherwise call them variable).
37
34
  def explain(input)
38
35
  iri = coerce(input)
39
- key, * = cluster_key(iri)
40
- cluster = @clusters[key]
36
+ key, * = Cluster.key_for(iri, classifier: @classifier)
37
+ cluster = clusters.find { |c| c.key == key }
41
38
  stats = cluster ? cluster.segment_stats : []
42
39
  hinted = SegmentHints.derive(iri.path_segments, @classifier)
43
40
 
@@ -50,43 +47,21 @@ module Iriq
50
47
  end
51
48
  end
52
49
 
53
- private
54
-
55
- def coerce(input)
56
- input.is_a?(Identifier) ? input : Parser.parse(input)
57
- end
58
-
59
- def cluster_key(iri, shape: nil)
60
- if iri.urn?
61
- ns, value = (iri.nss || "").split(":", 2)
62
- shape = value ? urn_value_shape(ns, value) : nil
63
- key = "urn:#{ns}:#{shape}"
64
- [key, nil, "urn", key]
65
- else
66
- shape ||= PathShape.new(classifier: @classifier).for(iri.path_segments)
67
- key = "#{iri.scheme}://#{iri.host}#{shape}"
68
- [key, iri.host, iri.scheme, shape]
69
- end
70
- end
71
-
72
- def urn_value_shape(ns, value)
73
- entry = SegmentHints.derive([ns, value], @classifier).last
74
- return entry[:value] unless entry[:variable]
75
-
76
- "{#{entry[:hint] || entry[:type]}}"
77
- end
78
-
79
- public
80
-
81
50
  def dump
82
- { "clusters" => @clusters.transform_values(&:dump) }
51
+ { "clusters" => clusters.each_with_object({}) { |c, h| h[c.key] = c.dump } }
83
52
  end
84
53
 
85
54
  def self.from_dump(h, classifier: SegmentClassifier::DEFAULT)
86
55
  c = new(classifier: classifier)
87
56
  restored = h["clusters"].transform_values { |cdump| Cluster.from_dump(cdump) }
88
- c.instance_variable_set(:@clusters, restored)
57
+ c.instance_variable_get(:@storage).instance_variable_set(:@clusters, restored)
89
58
  c
90
59
  end
60
+
61
+ private
62
+
63
+ def coerce(input)
64
+ input.is_a?(Identifier) ? input : Parser.parse(input)
65
+ end
91
66
  end
92
67
  end