iriq 0.1.0 → 0.30.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +87 -0
- data/CLAUDE.md +208 -0
- data/Gemfile.lock +8 -2
- data/Makefile +113 -0
- data/README.md +249 -270
- data/completions/_iriq +52 -0
- data/completions/iriq.bash +70 -0
- data/docs/ARCHITECTURE.md +223 -0
- data/docs/ROADMAP.md +190 -0
- data/iriq.gemspec +5 -4
- data/lib/iriq/cli.rb +402 -49
- data/lib/iriq/cluster.rb +304 -8
- data/lib/iriq/clusterer.rb +19 -44
- data/lib/iriq/corpus.rb +417 -81
- data/lib/iriq/cross_host_shape.rb +37 -0
- data/lib/iriq/event.rb +22 -0
- data/lib/iriq/evidence.rb +114 -0
- data/lib/iriq/explanation.rb +1 -1
- data/lib/iriq/normalizer.rb +71 -29
- data/lib/iriq/parser.rb +1 -1
- data/lib/iriq/path_shape.rb +30 -24
- data/lib/iriq/position.rb +75 -0
- data/lib/iriq/position_stats.rb +74 -8
- data/lib/iriq/recognizer.rb +54 -0
- data/lib/iriq/recognizer_proposal.rb +167 -0
- data/lib/iriq/recognizers/date.rb +53 -0
- data/lib/iriq/recognizers/integer.rb +37 -0
- data/lib/iriq/recognizers/uuid.rb +16 -0
- data/lib/iriq/reducer.rb +37 -0
- data/lib/iriq/registrable_domain.rb +56 -0
- data/lib/iriq/segment_classifier.rb +475 -23
- data/lib/iriq/segment_hints.rb +9 -0
- data/lib/iriq/shape.rb +106 -0
- data/lib/iriq/specificity.rb +35 -0
- data/lib/iriq/storage/json.rb +43 -0
- data/lib/iriq/storage/memory.rb +209 -0
- data/lib/iriq/storage/sqlite.rb +546 -0
- data/lib/iriq/storage.rb +35 -0
- data/lib/iriq/synthesized_recognizer.rb +56 -0
- data/lib/iriq/trace.rb +294 -0
- data/lib/iriq/version.rb +1 -1
- data/lib/iriq.rb +18 -0
- metadata +44 -8
- data/script/benchmark.rb +0 -81
- data/script/memory.rb +0 -121
data/lib/iriq/cli.rb
CHANGED
|
@@ -18,6 +18,8 @@ module Iriq
|
|
|
18
18
|
LARGE_BATCH_THRESHOLD = 10
|
|
19
19
|
|
|
20
20
|
USAGE = <<~TXT
|
|
21
|
+
iriq — find a URL's shape: the route template behind it (e.g. /users/{id}).
|
|
22
|
+
|
|
21
23
|
Usage: iriq [options] <input>
|
|
22
24
|
iriq [options] < text
|
|
23
25
|
iriq cluster [options] [file]
|
|
@@ -26,18 +28,49 @@ module Iriq
|
|
|
26
28
|
text via stdin.
|
|
27
29
|
|
|
28
30
|
Sections (combine freely):
|
|
29
|
-
-n, --normalize Shape
|
|
31
|
+
-n, --normalize Shape — variable parts become placeholders
|
|
32
|
+
-c, --canonical Clean form — tidy scheme/host, keep the values
|
|
30
33
|
-p, --parse Parsed fields
|
|
34
|
+
-e, --explain Annotated trace — per-segment notes about why
|
|
35
|
+
each placeholder / canonical value was chosen
|
|
31
36
|
|
|
32
37
|
Corpus + stats:
|
|
33
38
|
--corpus PATH Load/create a JSON corpus; observe and save atomically.
|
|
34
39
|
-n becomes corpus-informed once it has data.
|
|
40
|
+
--host MODE Host-keying strategy for clustering:
|
|
41
|
+
full (default), registrable (or reg) strips
|
|
42
|
+
subdomains, none ignores host entirely.
|
|
35
43
|
--stats Print rolling aggregates
|
|
44
|
+
--reinfer Replay the source-IRI log through the current
|
|
45
|
+
classifier + reducers; rebuilds materialized
|
|
46
|
+
views from scratch. Requires --corpus.
|
|
47
|
+
--propose-recognizers
|
|
48
|
+
Scan observed values for shape patterns that
|
|
49
|
+
recur enough to suggest a new Recognizer.
|
|
50
|
+
Combine with --json for structured output.
|
|
51
|
+
Requires --corpus.
|
|
52
|
+
--cross-host-shapes
|
|
53
|
+
List route shapes that recur across
|
|
54
|
+
multiple hosts. Combine with --min-hosts.
|
|
55
|
+
Requires --corpus.
|
|
56
|
+
--activate-above F With --propose-recognizers, promote every
|
|
57
|
+
proposal at or above CONFIDENCE F into a
|
|
58
|
+
live Recognizer on the corpus, then
|
|
59
|
+
reinfer. Confidence integrates coverage
|
|
60
|
+
and cross-host corroboration.
|
|
61
|
+
|
|
62
|
+
Thresholds (apply to --propose-recognizers / --cross-host-shapes):
|
|
63
|
+
--min-observations N proposal noise floor (default 20)
|
|
64
|
+
--min-coverage F proposal coverage floor (default 0.7)
|
|
65
|
+
--min-hosts N proposal: minimum hosts (default 1);
|
|
66
|
+
cross-host-shapes: minimum hosts to
|
|
67
|
+
list (default 2)
|
|
36
68
|
|
|
37
69
|
Other:
|
|
38
70
|
-h, --help Show this message
|
|
39
71
|
-j, --json Emit JSON instead of human-readable output
|
|
40
|
-
-
|
|
72
|
+
-J, --ndjson Newline-delimited JSON (one object per line). Implies --json.
|
|
73
|
+
-N, --no-hints Use {integer} placeholders instead of {user_id}
|
|
41
74
|
--no-scheme-less Skip foo.com/path extraction (explicit-scheme only)
|
|
42
75
|
-V, --version Print version
|
|
43
76
|
|
|
@@ -62,11 +95,22 @@ module Iriq
|
|
|
62
95
|
|
|
63
96
|
# Returns an integer exit code.
|
|
64
97
|
def run(argv)
|
|
98
|
+
# Pre-scan so an error during option parsing can still honor --json.
|
|
99
|
+
# Re-set authoritatively from opts once parsing succeeds.
|
|
100
|
+
@json = json_requested?(argv)
|
|
65
101
|
args, opts = parse_options(argv)
|
|
102
|
+
@json = opts[:json]
|
|
66
103
|
|
|
67
104
|
return print_usage(stdout, 0) if opts[:help]
|
|
68
105
|
return print_version if opts[:version]
|
|
69
106
|
|
|
107
|
+
# `iriq completion <shell>` short-circuits — no corpus, no IRI input,
|
|
108
|
+
# just emit the script bundled with the gem.
|
|
109
|
+
if args.first == "completion"
|
|
110
|
+
args.shift
|
|
111
|
+
return cmd_completion(args)
|
|
112
|
+
end
|
|
113
|
+
|
|
70
114
|
explicit_cluster = (args.first == "cluster")
|
|
71
115
|
args.shift if explicit_cluster
|
|
72
116
|
|
|
@@ -79,11 +123,17 @@ module Iriq
|
|
|
79
123
|
batch_mode = explicit_cluster || positional_is_file ||
|
|
80
124
|
(args.empty? && piped_stdin?)
|
|
81
125
|
|
|
82
|
-
return print_usage(stdout, 0) if args.empty? && !batch_mode
|
|
126
|
+
return print_usage(stdout, 0) if args.empty? && !batch_mode && !opts[:reinfer] && !opts[:propose] && !opts[:cross_host_shapes]
|
|
83
127
|
|
|
84
|
-
corpus = opts[:corpus] ? load_corpus(opts[:corpus]) : nil
|
|
128
|
+
corpus = opts[:corpus] ? load_corpus(opts[:corpus], host_strategy: opts[:host_strategy]) : nil
|
|
85
129
|
|
|
86
|
-
code = if
|
|
130
|
+
code = if opts[:reinfer]
|
|
131
|
+
cmd_reinfer(corpus, opts)
|
|
132
|
+
elsif opts[:propose]
|
|
133
|
+
cmd_propose(corpus, opts)
|
|
134
|
+
elsif opts[:cross_host_shapes]
|
|
135
|
+
cmd_cross_host_shapes(corpus, opts)
|
|
136
|
+
elsif batch_mode
|
|
87
137
|
cmd_batch(args, opts, corpus, explicit_cluster: explicit_cluster)
|
|
88
138
|
elsif opts[:stats]
|
|
89
139
|
cmd_stats(corpus, opts)
|
|
@@ -94,11 +144,9 @@ module Iriq
|
|
|
94
144
|
corpus.save(opts[:corpus]) if corpus && opts[:corpus]
|
|
95
145
|
code
|
|
96
146
|
rescue Iriq::ParseError => e
|
|
97
|
-
|
|
98
|
-
2
|
|
147
|
+
emit_error("parse_error", e.message, 2, human: "iriq: parse error: #{e.message}")
|
|
99
148
|
rescue OptionParser::ParseError => e
|
|
100
|
-
|
|
101
|
-
1
|
|
149
|
+
emit_error("option_error", e.message, 1)
|
|
102
150
|
end
|
|
103
151
|
|
|
104
152
|
def parseable_iri?(input)
|
|
@@ -113,22 +161,45 @@ module Iriq
|
|
|
113
161
|
def parse_options(argv)
|
|
114
162
|
opts = {
|
|
115
163
|
json: false,
|
|
164
|
+
ndjson: false,
|
|
116
165
|
help: false,
|
|
117
166
|
version: false,
|
|
118
167
|
hints: true,
|
|
119
168
|
sections: [],
|
|
120
|
-
corpus:
|
|
121
|
-
stats:
|
|
122
|
-
|
|
169
|
+
corpus: nil,
|
|
170
|
+
stats: false,
|
|
171
|
+
reinfer: false,
|
|
172
|
+
propose: false,
|
|
173
|
+
propose_min_obs: nil,
|
|
174
|
+
propose_min_coverage: nil,
|
|
175
|
+
# --min-hosts is generic: it applies to both --propose-recognizers
|
|
176
|
+
# (proposal threshold) and --cross-host-shapes (cross-host
|
|
177
|
+
# recurrence threshold).
|
|
178
|
+
min_hosts: nil,
|
|
179
|
+
activate_above: nil,
|
|
180
|
+
cross_host_shapes: false,
|
|
181
|
+
scheme_less: true,
|
|
182
|
+
host_strategy: :full,
|
|
123
183
|
}
|
|
124
184
|
parser = OptionParser.new do |o|
|
|
125
185
|
o.on("-p", "--parse") { opts[:sections] << :parse }
|
|
126
186
|
o.on("-n", "--normalize") { opts[:sections] << :normalize }
|
|
187
|
+
o.on("-c", "--canonical") { opts[:sections] << :canonical }
|
|
188
|
+
o.on("-e", "--explain") { opts[:sections] << :explain }
|
|
127
189
|
o.on("-j", "--json") { opts[:json] = true }
|
|
190
|
+
o.on("-J", "--ndjson") { opts[:json] = true; opts[:ndjson] = true }
|
|
128
191
|
o.on("--[no-]hints") { |v| opts[:hints] = v }
|
|
129
192
|
o.on("-N") { opts[:hints] = false }
|
|
130
193
|
o.on("--corpus PATH") { |v| opts[:corpus] = v }
|
|
194
|
+
o.on("--host MODE") { |v| opts[:host_strategy] = host_strategy_arg(v) }
|
|
131
195
|
o.on("--stats") { opts[:stats] = true }
|
|
196
|
+
o.on("--reinfer") { opts[:reinfer] = true }
|
|
197
|
+
o.on("--propose-recognizers") { opts[:propose] = true }
|
|
198
|
+
o.on("--min-observations N", Integer) { |v| opts[:propose_min_obs] = v }
|
|
199
|
+
o.on("--min-coverage F", Float) { |v| opts[:propose_min_coverage] = v }
|
|
200
|
+
o.on("--min-hosts N", Integer) { |v| opts[:min_hosts] = v }
|
|
201
|
+
o.on("--activate-above F", Float) { |v| opts[:activate_above] = v }
|
|
202
|
+
o.on("--cross-host-shapes") { opts[:cross_host_shapes] = true }
|
|
132
203
|
o.on("--[no-]scheme-less") { |v| opts[:scheme_less] = v }
|
|
133
204
|
o.on("-h", "--help") { opts[:help] = true }
|
|
134
205
|
o.on("-V", "--version") { opts[:version] = true }
|
|
@@ -149,10 +220,20 @@ module Iriq
|
|
|
149
220
|
end
|
|
150
221
|
end
|
|
151
222
|
|
|
152
|
-
def load_corpus(path)
|
|
153
|
-
|
|
223
|
+
def load_corpus(path, host_strategy: :full)
|
|
224
|
+
Corpus.open(path, host_strategy: host_strategy)
|
|
225
|
+
end
|
|
154
226
|
|
|
155
|
-
|
|
227
|
+
# Accept `--host=reg` as a short alias for the `registrable` mode.
|
|
228
|
+
HOST_STRATEGY_ALIASES = {
|
|
229
|
+
"full" => :full, "registrable" => :registrable, "reg" => :registrable, "none" => :none,
|
|
230
|
+
}.freeze
|
|
231
|
+
|
|
232
|
+
def host_strategy_arg(value)
|
|
233
|
+
mode = HOST_STRATEGY_ALIASES[value.to_s.downcase]
|
|
234
|
+
raise OptionParser::InvalidArgument, "--host: expected full|registrable|reg|none, got #{value.inspect}" unless mode
|
|
235
|
+
|
|
236
|
+
mode
|
|
156
237
|
end
|
|
157
238
|
|
|
158
239
|
def print_usage(io, code)
|
|
@@ -173,9 +254,13 @@ module Iriq
|
|
|
173
254
|
|
|
174
255
|
data = {}
|
|
175
256
|
data[:parse] = identifier_hash(iri) if sections.include?(:parse)
|
|
257
|
+
data[:canonical] = iri.canonical if sections.include?(:canonical)
|
|
176
258
|
if sections.include?(:normalize)
|
|
177
259
|
data[:normalize] = corpus ? corpus.normalize(iri) : Normalizer.normalize_identifier(iri, hints: opts[:hints])
|
|
178
260
|
end
|
|
261
|
+
if sections.include?(:explain)
|
|
262
|
+
data[:explain] = Trace.for(iri, hints: opts[:hints])
|
|
263
|
+
end
|
|
179
264
|
|
|
180
265
|
if opts[:json]
|
|
181
266
|
payload = sections.size == 1 ? data.values.first : data
|
|
@@ -192,12 +277,21 @@ module Iriq
|
|
|
192
277
|
# corpus is ephemeral unless --corpus was given.
|
|
193
278
|
def cmd_batch(args, opts, corpus, explicit_cluster: false)
|
|
194
279
|
corpus ||= Corpus.new
|
|
195
|
-
iris = extract_text(read_text(args.first), opts)
|
|
196
|
-
iris.each { |iri| corpus.observe(iri) }
|
|
197
280
|
|
|
281
|
+
# Per-IRI sections (-n/-p/-c/-e) are independent line to line, so we
|
|
282
|
+
# stream: read input lazily, extract per line, and emit each IRI as it
|
|
283
|
+
# arrives (flushed for live `tail -f | iriq -n` pipelines). The aggregate
|
|
284
|
+
# views below — stats, clusters, the deduped URL list — need the whole
|
|
285
|
+
# input, so they slurp.
|
|
198
286
|
if opts[:sections].any?
|
|
199
|
-
emit_per_iri_sections(
|
|
200
|
-
|
|
287
|
+
emit_per_iri_sections(lazy_iris(args.first, opts), opts, corpus)
|
|
288
|
+
return 0
|
|
289
|
+
end
|
|
290
|
+
|
|
291
|
+
iris = extract_text(read_text(args.first), opts)
|
|
292
|
+
corpus.batch { iris.each { |iri| corpus.observe(iri) } }
|
|
293
|
+
|
|
294
|
+
if opts[:stats]
|
|
201
295
|
emit_stats(corpus, opts)
|
|
202
296
|
elsif explicit_cluster || iris.size >= LARGE_BATCH_THRESHOLD
|
|
203
297
|
# Either the user asked for clusters explicitly, or the input is
|
|
@@ -209,36 +303,68 @@ module Iriq
|
|
|
209
303
|
0
|
|
210
304
|
end
|
|
211
305
|
|
|
212
|
-
#
|
|
213
|
-
#
|
|
214
|
-
|
|
306
|
+
# Lazily yield IRIs from the input, one input line at a time, so an
|
|
307
|
+
# unbounded stream flows through without being buffered in full. Matches
|
|
308
|
+
# whole-text extraction exactly: a candidate never spans a newline
|
|
309
|
+
# (URL_CHAR_CLASS excludes whitespace) and `extract` does not dedup.
|
|
310
|
+
def lazy_iris(path, opts)
|
|
311
|
+
extractor = Extractor.new(scheme_less: opts[:scheme_less])
|
|
312
|
+
input_lines(path).lazy.flat_map { |line| extractor.extract(line) }
|
|
313
|
+
end
|
|
314
|
+
|
|
315
|
+
def input_lines(path)
|
|
316
|
+
if path.nil? || path == "-"
|
|
317
|
+
stdin.each_line
|
|
318
|
+
else
|
|
319
|
+
File.foreach(path)
|
|
320
|
+
end
|
|
321
|
+
end
|
|
322
|
+
|
|
323
|
+
# Emit the requested sections (parse/normalize/explain) for each extracted
|
|
324
|
+
# IRI, observing each into `corpus` as it passes. `iris` may be a lazy
|
|
325
|
+
# enumerator; human and NDJSON output stream (flushed per IRI) while a single
|
|
326
|
+
# JSON array must be materialized. -n alone is the cleanest case: one line
|
|
327
|
+
# per URL.
|
|
328
|
+
def emit_per_iri_sections(iris, opts, corpus)
|
|
215
329
|
sections = opts[:sections]
|
|
216
|
-
payloads = iris.map { |iri| section_payload(iri, sections, opts) }
|
|
217
330
|
|
|
218
|
-
|
|
331
|
+
# A wrapping JSON array can't be emitted incrementally — collect it
|
|
332
|
+
# (force the lazy enumerator to a real Array so emit_json sees an array).
|
|
333
|
+
if opts[:json] && !opts[:ndjson]
|
|
334
|
+
payloads = iris.map { |iri| corpus.observe(iri); section_payload(iri, sections, opts) }.to_a
|
|
219
335
|
out = sections.size == 1 ? payloads.map(&:values).flatten(1) : payloads
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
336
|
+
return emit_json(out, opts)
|
|
337
|
+
end
|
|
338
|
+
|
|
339
|
+
iris.each_with_index do |iri, i|
|
|
340
|
+
corpus.observe(iri)
|
|
341
|
+
p = section_payload(iri, sections, opts)
|
|
342
|
+
if opts[:ndjson]
|
|
343
|
+
items = sections.size == 1 ? p.values : [p]
|
|
344
|
+
items.each { |item| stdout.puts JSON.generate(item) }
|
|
345
|
+
elsif sections == [:normalize] || sections == [:canonical]
|
|
346
|
+
# Most common case — keep it tight: one URL per line, no headers.
|
|
347
|
+
stdout.puts p[sections.first]
|
|
348
|
+
else
|
|
226
349
|
stdout.puts if i > 0
|
|
227
|
-
stdout.puts "# #{
|
|
350
|
+
stdout.puts "# #{iri.canonical}"
|
|
228
351
|
sections.each_with_index do |sec, j|
|
|
229
352
|
stdout.puts if j > 0 # blank line between sections within one IRI
|
|
230
353
|
case sec
|
|
231
354
|
when :parse then emit_parse_human(p[:parse])
|
|
355
|
+
when :canonical then stdout.puts p[:canonical]
|
|
232
356
|
when :normalize then stdout.puts p[:normalize]
|
|
233
357
|
end
|
|
234
358
|
end
|
|
235
359
|
end
|
|
360
|
+
stdout.flush
|
|
236
361
|
end
|
|
237
362
|
end
|
|
238
363
|
|
|
239
364
|
def section_payload(iri, sections, opts)
|
|
240
365
|
data = {}
|
|
241
366
|
data[:parse] = identifier_hash(iri) if sections.include?(:parse)
|
|
367
|
+
data[:canonical] = iri.canonical if sections.include?(:canonical)
|
|
242
368
|
data[:normalize] = Normalizer.normalize_identifier(iri, hints: opts[:hints]) if sections.include?(:normalize)
|
|
243
369
|
data
|
|
244
370
|
end
|
|
@@ -262,7 +388,7 @@ module Iriq
|
|
|
262
388
|
sorted = counts.sort_by { |k, c| [-c, first[k]] }
|
|
263
389
|
|
|
264
390
|
if opts[:json]
|
|
265
|
-
|
|
391
|
+
emit_json(sorted.map { |k, c| { iri: k, count: c } }, opts)
|
|
266
392
|
elsif sorted.all? { |_, c| c == 1 }
|
|
267
393
|
sorted.each { |k, _| stdout.puts k }
|
|
268
394
|
else
|
|
@@ -277,9 +403,152 @@ module Iriq
|
|
|
277
403
|
0
|
|
278
404
|
end
|
|
279
405
|
|
|
406
|
+
# --propose-recognizers: scan observed values for prefix patterns
|
|
407
|
+
# that recur enough to suggest a new Recognizer. Prints one block
|
|
408
|
+
# per proposal in human mode, or a JSON array under --json. With
|
|
409
|
+
# --activate-above F, every proposal at or above coverage F is
|
|
410
|
+
# promoted to a live Recognizer on the corpus's classifier and the
|
|
411
|
+
# corpus reinfers to apply the new classifier to existing
|
|
412
|
+
# observations.
|
|
413
|
+
def cmd_propose(corpus, opts)
|
|
414
|
+
return missing("--corpus") unless corpus
|
|
415
|
+
|
|
416
|
+
kwargs = {}
|
|
417
|
+
kwargs[:min_observations] = opts[:propose_min_obs] if opts[:propose_min_obs]
|
|
418
|
+
kwargs[:min_coverage] = opts[:propose_min_coverage] if opts[:propose_min_coverage]
|
|
419
|
+
kwargs[:min_hosts] = opts[:min_hosts] if opts[:min_hosts]
|
|
420
|
+
|
|
421
|
+
if opts[:activate_above]
|
|
422
|
+
activated = corpus.activate_proposals_above(opts[:activate_above], **kwargs)
|
|
423
|
+
if activated.empty?
|
|
424
|
+
stdout.puts "no proposals at or above coverage #{opts[:activate_above]}"
|
|
425
|
+
else
|
|
426
|
+
activated.each do |r|
|
|
427
|
+
stdout.puts "activated: #{r.type} (#{r.prefix})"
|
|
428
|
+
end
|
|
429
|
+
end
|
|
430
|
+
return 0
|
|
431
|
+
end
|
|
432
|
+
|
|
433
|
+
proposals = corpus.propose_recognizers(**kwargs)
|
|
434
|
+
|
|
435
|
+
if opts[:json]
|
|
436
|
+
stdout.puts JSON.generate(proposals.map(&:to_h))
|
|
437
|
+
return 0
|
|
438
|
+
end
|
|
439
|
+
|
|
440
|
+
if proposals.empty?
|
|
441
|
+
stdout.puts "no recognizer proposals (#{corpus.observed_iri_count} observations scanned)"
|
|
442
|
+
return 0
|
|
443
|
+
end
|
|
444
|
+
|
|
445
|
+
proposals.each_with_index do |p, i|
|
|
446
|
+
stdout.puts if i > 0
|
|
447
|
+
stdout.puts "proposal: #{p.suggested_type} (#{p.prefix})"
|
|
448
|
+
stdout.puts " strategy: #{p.strategy}"
|
|
449
|
+
stdout.puts " coverage: #{format('%.2f', p.coverage)}"
|
|
450
|
+
stdout.puts " confidence: #{format('%.2f', p.confidence)}"
|
|
451
|
+
stdout.puts " observations: #{p.observation_count}"
|
|
452
|
+
stdout.puts " hosts: #{p.hosts.to_a.sort.join(', ')}"
|
|
453
|
+
stdout.puts " positions: #{p.positions.size}"
|
|
454
|
+
stdout.puts " samples: #{p.sample_values.first(3).join(', ')}"
|
|
455
|
+
end
|
|
456
|
+
0
|
|
457
|
+
end
|
|
458
|
+
|
|
459
|
+
# --reinfer: drop the materialized views in the corpus and replay the
|
|
460
|
+
# source-IRI log through the current classifier + reducers. Prints a
|
|
461
|
+
# short before/after summary so the user can see what changed.
|
|
462
|
+
def cmd_reinfer(corpus, _opts)
|
|
463
|
+
return missing("--corpus") unless corpus
|
|
464
|
+
|
|
465
|
+
n = corpus.observed_iri_count
|
|
466
|
+
before = corpus.size
|
|
467
|
+
corpus.reinfer
|
|
468
|
+
after = corpus.size
|
|
469
|
+
|
|
470
|
+
stdout.puts "reinferred #{n} observation#{n == 1 ? '' : 's'}: " \
|
|
471
|
+
"#{before} → #{after} cluster#{after == 1 ? '' : 's'}"
|
|
472
|
+
0
|
|
473
|
+
end
|
|
474
|
+
|
|
475
|
+
# `completion <shell>` — emit the bundled shell-completion script.
|
|
476
|
+
# Scripts live in completions/{iriq.bash,_iriq} alongside the gem;
|
|
477
|
+
# Homebrew installs them automatically, but the user can also do
|
|
478
|
+
# `source <(iriq completion bash)` in their shell rc.
|
|
479
|
+
COMPLETIONS_DIR = File.expand_path("../../completions", __dir__).freeze
|
|
480
|
+
COMPLETION_FILES = {
|
|
481
|
+
"bash" => File.join(COMPLETIONS_DIR, "iriq.bash"),
|
|
482
|
+
"zsh" => File.join(COMPLETIONS_DIR, "_iriq"),
|
|
483
|
+
}.freeze
|
|
484
|
+
|
|
485
|
+
def cmd_completion(args)
|
|
486
|
+
shell = args.first || default_shell
|
|
487
|
+
path = COMPLETION_FILES[shell]
|
|
488
|
+
unless path
|
|
489
|
+
return emit_error("unknown_shell", "unknown shell #{shell.inspect} (try bash or zsh)", 1)
|
|
490
|
+
end
|
|
491
|
+
stdout.write(File.read(path))
|
|
492
|
+
0
|
|
493
|
+
end
|
|
494
|
+
|
|
495
|
+
def default_shell
|
|
496
|
+
shell = ENV["SHELL"].to_s
|
|
497
|
+
shell.empty? ? "bash" : File.basename(shell).sub(/\.exe\z/, "")
|
|
498
|
+
end
|
|
499
|
+
|
|
500
|
+
# --cross-host-shapes: list route shapes that recur across multiple
|
|
501
|
+
# hosts in the corpus. One block per shape in human mode, JSON array
|
|
502
|
+
# under --json. Tunable via --min-hosts (default 2).
|
|
503
|
+
def cmd_cross_host_shapes(corpus, opts)
|
|
504
|
+
return missing("--corpus") unless corpus
|
|
505
|
+
|
|
506
|
+
kwargs = {}
|
|
507
|
+
kwargs[:min_hosts] = opts[:min_hosts] if opts[:min_hosts]
|
|
508
|
+
shapes = corpus.cross_host_shapes(**kwargs)
|
|
509
|
+
|
|
510
|
+
if opts[:json]
|
|
511
|
+
stdout.puts JSON.generate(shapes.map(&:to_h))
|
|
512
|
+
return 0
|
|
513
|
+
end
|
|
514
|
+
|
|
515
|
+
if shapes.empty?
|
|
516
|
+
stdout.puts "no cross-host shapes (#{corpus.size} cluster#{corpus.size == 1 ? '' : 's'} scanned)"
|
|
517
|
+
return 0
|
|
518
|
+
end
|
|
519
|
+
|
|
520
|
+
shapes.each do |s|
|
|
521
|
+
host_list = s.hosts.to_a.sort.join(", ")
|
|
522
|
+
stdout.puts "#{s.shape} (#{s.host_count} host#{s.host_count == 1 ? '' : 's'}: #{host_list}) obs=#{s.observation_count}"
|
|
523
|
+
end
|
|
524
|
+
0
|
|
525
|
+
end
|
|
526
|
+
|
|
280
527
|
def missing(name)
|
|
281
|
-
|
|
282
|
-
|
|
528
|
+
emit_error("missing_argument", "missing argument <#{name}>", 1)
|
|
529
|
+
end
|
|
530
|
+
|
|
531
|
+
# Detect whether JSON output was requested by scanning raw argv. Used
|
|
532
|
+
# before option parsing completes (or when it fails) so errors can still
|
|
533
|
+
# honor --json. Handles bundled short flags like -nj.
|
|
534
|
+
def json_requested?(argv)
|
|
535
|
+
argv.any? do |a|
|
|
536
|
+
a == "--json" || a == "--ndjson" ||
|
|
537
|
+
(a.start_with?("-") && !a.start_with?("--") && a.match?(/[jJ]/))
|
|
538
|
+
end
|
|
539
|
+
end
|
|
540
|
+
|
|
541
|
+
# Emit an error to stderr and return its exit code. Under --json/--ndjson
|
|
542
|
+
# the error is a structured envelope ({"error":{"code","message"}}) so
|
|
543
|
+
# agents and pipelines get parseable output on the failure path; otherwise
|
|
544
|
+
# the plain "iriq: <human>" line (human defaults to "iriq: <message>").
|
|
545
|
+
def emit_error(code, message, exit_code, human: nil)
|
|
546
|
+
if @json
|
|
547
|
+
stderr.puts JSON.generate(error: { code: code, message: message })
|
|
548
|
+
else
|
|
549
|
+
stderr.puts(human || "iriq: #{message}")
|
|
550
|
+
end
|
|
551
|
+
exit_code
|
|
283
552
|
end
|
|
284
553
|
|
|
285
554
|
def read_input(path)
|
|
@@ -298,6 +567,9 @@ module Iriq
|
|
|
298
567
|
end
|
|
299
568
|
end
|
|
300
569
|
|
|
570
|
+
# Compact identifier hash for parse output (both JSON and human). Drops
|
|
571
|
+
# nil values and empty collections so URN dumps don't carry empty
|
|
572
|
+
# host/path/query slots, and URL dumps don't include null fragment/nss.
|
|
301
573
|
def identifier_hash(iri)
|
|
302
574
|
{
|
|
303
575
|
original: iri.original,
|
|
@@ -310,7 +582,20 @@ module Iriq
|
|
|
310
582
|
fragment: iri.fragment,
|
|
311
583
|
nss: iri.nss,
|
|
312
584
|
canonical: iri.canonical,
|
|
313
|
-
}
|
|
585
|
+
}.reject { |_, v| v.nil? || (v.respond_to?(:empty?) && v.empty?) }
|
|
586
|
+
end
|
|
587
|
+
|
|
588
|
+
# Emit a JSON payload to stdout. When --ndjson is set and the payload is
|
|
589
|
+
# an Array, write one object per line (newline-delimited JSON) instead of
|
|
590
|
+
# one wrapping array — friendlier for `jq -c`, streaming pipelines, and
|
|
591
|
+
# log ingest tools. Non-array payloads (single objects) emit the same
|
|
592
|
+
# under both flags.
|
|
593
|
+
def emit_json(payload, opts)
|
|
594
|
+
if opts[:ndjson] && payload.is_a?(Array)
|
|
595
|
+
payload.each { |item| stdout.puts JSON.generate(item) }
|
|
596
|
+
else
|
|
597
|
+
stdout.puts JSON.generate(payload)
|
|
598
|
+
end
|
|
314
599
|
end
|
|
315
600
|
|
|
316
601
|
def emit_sections(data, sections)
|
|
@@ -320,41 +605,106 @@ module Iriq
|
|
|
320
605
|
stdout.puts "# #{sec}" if multi
|
|
321
606
|
case sec
|
|
322
607
|
when :parse then emit_parse_human(data[:parse])
|
|
608
|
+
when :canonical then stdout.puts data[:canonical]
|
|
323
609
|
when :normalize then stdout.puts data[:normalize]
|
|
610
|
+
when :explain then emit_explain_human(data[:explain])
|
|
324
611
|
end
|
|
325
612
|
end
|
|
326
613
|
end
|
|
327
614
|
|
|
615
|
+
# Render the trace hash as a vertically-aligned per-segment table.
|
|
616
|
+
# path rows first, then query rows.
|
|
617
|
+
def emit_explain_human(trace)
|
|
618
|
+
stdout.puts trace[:normalized]
|
|
619
|
+
emit_trace_section("path", trace[:path])
|
|
620
|
+
emit_trace_section("query", trace[:query]) if trace[:query]
|
|
621
|
+
end
|
|
622
|
+
|
|
623
|
+
def emit_trace_section(label, rows)
|
|
624
|
+
return if rows.nil? || rows.empty?
|
|
625
|
+
|
|
626
|
+
stdout.puts
|
|
627
|
+
stdout.puts "#{label}:"
|
|
628
|
+
name_width = rows.map { |r| trace_label(r).length }.max
|
|
629
|
+
type_width = rows.map { |r| r[:type].to_s.length }.max
|
|
630
|
+
out_width = rows.map { |r| r[:output].to_s.length }.max
|
|
631
|
+
rows.each do |r|
|
|
632
|
+
stdout.puts " #{trace_label(r).ljust(name_width)} #{r[:type].to_s.ljust(type_width)} #{r[:output].to_s.ljust(out_width)}#{format_notes(r[:notes])}"
|
|
633
|
+
end
|
|
634
|
+
end
|
|
635
|
+
|
|
636
|
+
def trace_label(row)
|
|
637
|
+
# Path rows have :value, query rows have :name=:value.
|
|
638
|
+
row[:name] ? "#{row[:name]}=#{row[:value]}" : row[:value].to_s
|
|
639
|
+
end
|
|
640
|
+
|
|
641
|
+
def format_notes(notes)
|
|
642
|
+
return "" if notes.nil? || notes.empty?
|
|
643
|
+
" (" + notes.join("; ") + ")"
|
|
644
|
+
end
|
|
645
|
+
|
|
646
|
+
# Render the compact identifier_hash. Keys/values are already filtered;
|
|
647
|
+
# array/hash values get .inspect, everything else .to_s.
|
|
328
648
|
def emit_parse_human(h)
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
stdout.puts "port: #{h[:port]}" if h[:port]
|
|
334
|
-
stdout.puts "path_segments: #{h[:path_segments].inspect}" if h[:kind] == :url
|
|
335
|
-
stdout.puts "query_params: #{h[:query_params].inspect}" if h[:query_params] && !h[:query_params].empty?
|
|
336
|
-
stdout.puts "fragment: #{h[:fragment]}" if h[:fragment]
|
|
337
|
-
stdout.puts "nss: #{h[:nss]}" if h[:nss]
|
|
338
|
-
stdout.puts "canonical: #{h[:canonical]}"
|
|
649
|
+
h.each do |key, value|
|
|
650
|
+
rendered = value.is_a?(Array) || value.is_a?(Hash) ? value.inspect : value.to_s
|
|
651
|
+
stdout.puts "#{"#{key}:".ljust(15)}#{rendered}"
|
|
652
|
+
end
|
|
339
653
|
end
|
|
340
654
|
|
|
341
655
|
def emit_clusters(clusters, opts)
|
|
342
656
|
sorted = clusters.sort_by { |c| -c.count }
|
|
343
657
|
|
|
344
658
|
if opts[:json]
|
|
345
|
-
|
|
659
|
+
emit_json(sorted.map(&:to_h), opts)
|
|
346
660
|
else
|
|
347
661
|
sorted.each_with_index do |c, i|
|
|
348
662
|
stdout.puts if i > 0
|
|
349
663
|
host = c.host || "(urn)"
|
|
350
664
|
shape = opts[:hints] ? c.shape : raw_shape_for(c)
|
|
351
665
|
stdout.puts "[#{c.count}] #{host} #{shape}"
|
|
352
|
-
c.examples.first(3)
|
|
353
|
-
stdout.puts "
|
|
666
|
+
examples = c.examples.first(3)
|
|
667
|
+
examples.each { |e| stdout.puts " #{e.canonical}" }
|
|
668
|
+
remaining = c.count - examples.size
|
|
669
|
+
stdout.puts " + #{remaining} more" if remaining.positive?
|
|
670
|
+
emit_param_summary(c)
|
|
671
|
+
end
|
|
672
|
+
end
|
|
673
|
+
end
|
|
674
|
+
|
|
675
|
+
# One line per param: type, range (numeric), cardinality, presence.
|
|
676
|
+
# `page integer 1..100 avg 50.5 (10 distinct, 100%)`
|
|
677
|
+
def emit_param_summary(cluster)
|
|
678
|
+
rows = cluster.param_summary
|
|
679
|
+
return if rows.empty?
|
|
680
|
+
|
|
681
|
+
width = rows.map { |r| r[:name].length }.max
|
|
682
|
+
rows.each do |r|
|
|
683
|
+
bits = ["#{r[:type]}"]
|
|
684
|
+
if r[:min] && r[:max]
|
|
685
|
+
bits << format_range(r[:min], r[:max])
|
|
686
|
+
bits << "avg #{format_num(r[:avg])}" if r[:avg]
|
|
354
687
|
end
|
|
688
|
+
bits << "(#{r[:cardinality]} distinct, #{format_pct(r[:presence])})"
|
|
689
|
+
stdout.puts " #{r[:name].to_s.ljust(width)} #{bits.join(' ')}"
|
|
355
690
|
end
|
|
356
691
|
end
|
|
357
692
|
|
|
693
|
+
def format_range(lo, hi)
|
|
694
|
+
"#{format_num(lo)}..#{format_num(hi)}"
|
|
695
|
+
end
|
|
696
|
+
|
|
697
|
+
def format_num(n)
|
|
698
|
+
return n.to_s if n.is_a?(Integer)
|
|
699
|
+
whole = n.to_i
|
|
700
|
+
return whole.to_s if whole == n
|
|
701
|
+
n.round(2).to_s
|
|
702
|
+
end
|
|
703
|
+
|
|
704
|
+
def format_pct(frac)
|
|
705
|
+
"#{(frac * 100).round}%"
|
|
706
|
+
end
|
|
707
|
+
|
|
358
708
|
def raw_shape_for(cluster)
|
|
359
709
|
example = cluster.examples.first
|
|
360
710
|
return cluster.shape unless example
|
|
@@ -387,7 +737,10 @@ module Iriq
|
|
|
387
737
|
end
|
|
388
738
|
|
|
389
739
|
def top(hash)
|
|
390
|
-
|
|
740
|
+
# Lex tie-break on equal counts — Ruby Hash insertion order would
|
|
741
|
+
# otherwise diverge from Go's map iteration (which has no insertion
|
|
742
|
+
# order). Keeps Ruby ↔ Go --stats parity stable.
|
|
743
|
+
hash.sort_by { |k, n| [-n, k] }.first(TOP_N_STATS).to_h
|
|
391
744
|
end
|
|
392
745
|
end
|
|
393
746
|
end
|