iriq 0.2.0 → 0.30.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +78 -0
- data/CLAUDE.md +128 -41
- data/Gemfile.lock +4 -4
- data/Makefile +80 -23
- data/README.md +225 -347
- data/completions/_iriq +52 -0
- data/completions/iriq.bash +70 -0
- data/docs/ARCHITECTURE.md +223 -0
- data/docs/ROADMAP.md +190 -0
- data/iriq.gemspec +2 -2
- data/lib/iriq/cli.rb +398 -46
- data/lib/iriq/cluster.rb +284 -12
- data/lib/iriq/corpus.rb +318 -36
- data/lib/iriq/cross_host_shape.rb +37 -0
- data/lib/iriq/event.rb +22 -0
- data/lib/iriq/evidence.rb +114 -0
- data/lib/iriq/explanation.rb +1 -1
- data/lib/iriq/normalizer.rb +71 -29
- data/lib/iriq/path_shape.rb +30 -24
- data/lib/iriq/position.rb +75 -0
- data/lib/iriq/position_stats.rb +74 -8
- data/lib/iriq/recognizer.rb +54 -0
- data/lib/iriq/recognizer_proposal.rb +167 -0
- data/lib/iriq/recognizers/date.rb +53 -0
- data/lib/iriq/recognizers/integer.rb +37 -0
- data/lib/iriq/recognizers/uuid.rb +16 -0
- data/lib/iriq/reducer.rb +37 -0
- data/lib/iriq/registrable_domain.rb +56 -0
- data/lib/iriq/segment_classifier.rb +475 -23
- data/lib/iriq/segment_hints.rb +9 -0
- data/lib/iriq/shape.rb +106 -0
- data/lib/iriq/specificity.rb +35 -0
- data/lib/iriq/storage/memory.rb +83 -12
- data/lib/iriq/storage/sqlite.rb +216 -37
- data/lib/iriq/synthesized_recognizer.rb +56 -0
- data/lib/iriq/trace.rb +294 -0
- data/lib/iriq/version.rb +1 -1
- data/lib/iriq.rb +17 -0
- metadata +22 -3
data/lib/iriq/cli.rb
CHANGED
|
@@ -18,6 +18,8 @@ module Iriq
|
|
|
18
18
|
LARGE_BATCH_THRESHOLD = 10
|
|
19
19
|
|
|
20
20
|
USAGE = <<~TXT
|
|
21
|
+
iriq — find a URL's shape: the route template behind it (e.g. /users/{id}).
|
|
22
|
+
|
|
21
23
|
Usage: iriq [options] <input>
|
|
22
24
|
iriq [options] < text
|
|
23
25
|
iriq cluster [options] [file]
|
|
@@ -26,18 +28,49 @@ module Iriq
|
|
|
26
28
|
text via stdin.
|
|
27
29
|
|
|
28
30
|
Sections (combine freely):
|
|
29
|
-
-n, --normalize Shape
|
|
31
|
+
-n, --normalize Shape — variable parts become placeholders
|
|
32
|
+
-c, --canonical Clean form — tidy scheme/host, keep the values
|
|
30
33
|
-p, --parse Parsed fields
|
|
34
|
+
-e, --explain Annotated trace — per-segment notes about why
|
|
35
|
+
each placeholder / canonical value was chosen
|
|
31
36
|
|
|
32
37
|
Corpus + stats:
|
|
33
38
|
--corpus PATH Load/create a JSON corpus; observe and save atomically.
|
|
34
39
|
-n becomes corpus-informed once it has data.
|
|
40
|
+
--host MODE Host-keying strategy for clustering:
|
|
41
|
+
full (default), registrable (or reg) strips
|
|
42
|
+
subdomains, none ignores host entirely.
|
|
35
43
|
--stats Print rolling aggregates
|
|
44
|
+
--reinfer Replay the source-IRI log through the current
|
|
45
|
+
classifier + reducers; rebuilds materialized
|
|
46
|
+
views from scratch. Requires --corpus.
|
|
47
|
+
--propose-recognizers
|
|
48
|
+
Scan observed values for shape patterns that
|
|
49
|
+
recur enough to suggest a new Recognizer.
|
|
50
|
+
Combine with --json for structured output.
|
|
51
|
+
Requires --corpus.
|
|
52
|
+
--cross-host-shapes
|
|
53
|
+
List route shapes that recur across
|
|
54
|
+
multiple hosts. Combine with --min-hosts.
|
|
55
|
+
Requires --corpus.
|
|
56
|
+
--activate-above F With --propose-recognizers, promote every
|
|
57
|
+
proposal at or above CONFIDENCE F into a
|
|
58
|
+
live Recognizer on the corpus, then
|
|
59
|
+
reinfer. Confidence integrates coverage
|
|
60
|
+
and cross-host corroboration.
|
|
61
|
+
|
|
62
|
+
Thresholds (apply to --propose-recognizers / --cross-host-shapes):
|
|
63
|
+
--min-observations N proposal noise floor (default 20)
|
|
64
|
+
--min-coverage F proposal coverage floor (default 0.7)
|
|
65
|
+
--min-hosts N proposal: minimum hosts (default 1);
|
|
66
|
+
cross-host-shapes: minimum hosts to
|
|
67
|
+
list (default 2)
|
|
36
68
|
|
|
37
69
|
Other:
|
|
38
70
|
-h, --help Show this message
|
|
39
71
|
-j, --json Emit JSON instead of human-readable output
|
|
40
|
-
-
|
|
72
|
+
-J, --ndjson Newline-delimited JSON (one object per line). Implies --json.
|
|
73
|
+
-N, --no-hints Use {integer} placeholders instead of {user_id}
|
|
41
74
|
--no-scheme-less Skip foo.com/path extraction (explicit-scheme only)
|
|
42
75
|
-V, --version Print version
|
|
43
76
|
|
|
@@ -62,11 +95,22 @@ module Iriq
|
|
|
62
95
|
|
|
63
96
|
# Returns an integer exit code.
|
|
64
97
|
def run(argv)
|
|
98
|
+
# Pre-scan so an error during option parsing can still honor --json.
|
|
99
|
+
# Re-set authoritatively from opts once parsing succeeds.
|
|
100
|
+
@json = json_requested?(argv)
|
|
65
101
|
args, opts = parse_options(argv)
|
|
102
|
+
@json = opts[:json]
|
|
66
103
|
|
|
67
104
|
return print_usage(stdout, 0) if opts[:help]
|
|
68
105
|
return print_version if opts[:version]
|
|
69
106
|
|
|
107
|
+
# `iriq completion <shell>` short-circuits — no corpus, no IRI input,
|
|
108
|
+
# just emit the script bundled with the gem.
|
|
109
|
+
if args.first == "completion"
|
|
110
|
+
args.shift
|
|
111
|
+
return cmd_completion(args)
|
|
112
|
+
end
|
|
113
|
+
|
|
70
114
|
explicit_cluster = (args.first == "cluster")
|
|
71
115
|
args.shift if explicit_cluster
|
|
72
116
|
|
|
@@ -79,11 +123,17 @@ module Iriq
|
|
|
79
123
|
batch_mode = explicit_cluster || positional_is_file ||
|
|
80
124
|
(args.empty? && piped_stdin?)
|
|
81
125
|
|
|
82
|
-
return print_usage(stdout, 0) if args.empty? && !batch_mode
|
|
126
|
+
return print_usage(stdout, 0) if args.empty? && !batch_mode && !opts[:reinfer] && !opts[:propose] && !opts[:cross_host_shapes]
|
|
83
127
|
|
|
84
|
-
corpus = opts[:corpus] ? load_corpus(opts[:corpus]) : nil
|
|
128
|
+
corpus = opts[:corpus] ? load_corpus(opts[:corpus], host_strategy: opts[:host_strategy]) : nil
|
|
85
129
|
|
|
86
|
-
code = if
|
|
130
|
+
code = if opts[:reinfer]
|
|
131
|
+
cmd_reinfer(corpus, opts)
|
|
132
|
+
elsif opts[:propose]
|
|
133
|
+
cmd_propose(corpus, opts)
|
|
134
|
+
elsif opts[:cross_host_shapes]
|
|
135
|
+
cmd_cross_host_shapes(corpus, opts)
|
|
136
|
+
elsif batch_mode
|
|
87
137
|
cmd_batch(args, opts, corpus, explicit_cluster: explicit_cluster)
|
|
88
138
|
elsif opts[:stats]
|
|
89
139
|
cmd_stats(corpus, opts)
|
|
@@ -94,11 +144,9 @@ module Iriq
|
|
|
94
144
|
corpus.save(opts[:corpus]) if corpus && opts[:corpus]
|
|
95
145
|
code
|
|
96
146
|
rescue Iriq::ParseError => e
|
|
97
|
-
|
|
98
|
-
2
|
|
147
|
+
emit_error("parse_error", e.message, 2, human: "iriq: parse error: #{e.message}")
|
|
99
148
|
rescue OptionParser::ParseError => e
|
|
100
|
-
|
|
101
|
-
1
|
|
149
|
+
emit_error("option_error", e.message, 1)
|
|
102
150
|
end
|
|
103
151
|
|
|
104
152
|
def parseable_iri?(input)
|
|
@@ -113,22 +161,45 @@ module Iriq
|
|
|
113
161
|
def parse_options(argv)
|
|
114
162
|
opts = {
|
|
115
163
|
json: false,
|
|
164
|
+
ndjson: false,
|
|
116
165
|
help: false,
|
|
117
166
|
version: false,
|
|
118
167
|
hints: true,
|
|
119
168
|
sections: [],
|
|
120
|
-
corpus:
|
|
121
|
-
stats:
|
|
122
|
-
|
|
169
|
+
corpus: nil,
|
|
170
|
+
stats: false,
|
|
171
|
+
reinfer: false,
|
|
172
|
+
propose: false,
|
|
173
|
+
propose_min_obs: nil,
|
|
174
|
+
propose_min_coverage: nil,
|
|
175
|
+
# --min-hosts is generic: it applies to both --propose-recognizers
|
|
176
|
+
# (proposal threshold) and --cross-host-shapes (cross-host
|
|
177
|
+
# recurrence threshold).
|
|
178
|
+
min_hosts: nil,
|
|
179
|
+
activate_above: nil,
|
|
180
|
+
cross_host_shapes: false,
|
|
181
|
+
scheme_less: true,
|
|
182
|
+
host_strategy: :full,
|
|
123
183
|
}
|
|
124
184
|
parser = OptionParser.new do |o|
|
|
125
185
|
o.on("-p", "--parse") { opts[:sections] << :parse }
|
|
126
186
|
o.on("-n", "--normalize") { opts[:sections] << :normalize }
|
|
187
|
+
o.on("-c", "--canonical") { opts[:sections] << :canonical }
|
|
188
|
+
o.on("-e", "--explain") { opts[:sections] << :explain }
|
|
127
189
|
o.on("-j", "--json") { opts[:json] = true }
|
|
190
|
+
o.on("-J", "--ndjson") { opts[:json] = true; opts[:ndjson] = true }
|
|
128
191
|
o.on("--[no-]hints") { |v| opts[:hints] = v }
|
|
129
192
|
o.on("-N") { opts[:hints] = false }
|
|
130
193
|
o.on("--corpus PATH") { |v| opts[:corpus] = v }
|
|
194
|
+
o.on("--host MODE") { |v| opts[:host_strategy] = host_strategy_arg(v) }
|
|
131
195
|
o.on("--stats") { opts[:stats] = true }
|
|
196
|
+
o.on("--reinfer") { opts[:reinfer] = true }
|
|
197
|
+
o.on("--propose-recognizers") { opts[:propose] = true }
|
|
198
|
+
o.on("--min-observations N", Integer) { |v| opts[:propose_min_obs] = v }
|
|
199
|
+
o.on("--min-coverage F", Float) { |v| opts[:propose_min_coverage] = v }
|
|
200
|
+
o.on("--min-hosts N", Integer) { |v| opts[:min_hosts] = v }
|
|
201
|
+
o.on("--activate-above F", Float) { |v| opts[:activate_above] = v }
|
|
202
|
+
o.on("--cross-host-shapes") { opts[:cross_host_shapes] = true }
|
|
132
203
|
o.on("--[no-]scheme-less") { |v| opts[:scheme_less] = v }
|
|
133
204
|
o.on("-h", "--help") { opts[:help] = true }
|
|
134
205
|
o.on("-V", "--version") { opts[:version] = true }
|
|
@@ -149,8 +220,20 @@ module Iriq
|
|
|
149
220
|
end
|
|
150
221
|
end
|
|
151
222
|
|
|
152
|
-
def load_corpus(path)
|
|
153
|
-
Corpus.open(path)
|
|
223
|
+
def load_corpus(path, host_strategy: :full)
|
|
224
|
+
Corpus.open(path, host_strategy: host_strategy)
|
|
225
|
+
end
|
|
226
|
+
|
|
227
|
+
# Accept `--host=reg` as a short alias for the `registrable` mode.
|
|
228
|
+
HOST_STRATEGY_ALIASES = {
|
|
229
|
+
"full" => :full, "registrable" => :registrable, "reg" => :registrable, "none" => :none,
|
|
230
|
+
}.freeze
|
|
231
|
+
|
|
232
|
+
def host_strategy_arg(value)
|
|
233
|
+
mode = HOST_STRATEGY_ALIASES[value.to_s.downcase]
|
|
234
|
+
raise OptionParser::InvalidArgument, "--host: expected full|registrable|reg|none, got #{value.inspect}" unless mode
|
|
235
|
+
|
|
236
|
+
mode
|
|
154
237
|
end
|
|
155
238
|
|
|
156
239
|
def print_usage(io, code)
|
|
@@ -171,9 +254,13 @@ module Iriq
|
|
|
171
254
|
|
|
172
255
|
data = {}
|
|
173
256
|
data[:parse] = identifier_hash(iri) if sections.include?(:parse)
|
|
257
|
+
data[:canonical] = iri.canonical if sections.include?(:canonical)
|
|
174
258
|
if sections.include?(:normalize)
|
|
175
259
|
data[:normalize] = corpus ? corpus.normalize(iri) : Normalizer.normalize_identifier(iri, hints: opts[:hints])
|
|
176
260
|
end
|
|
261
|
+
if sections.include?(:explain)
|
|
262
|
+
data[:explain] = Trace.for(iri, hints: opts[:hints])
|
|
263
|
+
end
|
|
177
264
|
|
|
178
265
|
if opts[:json]
|
|
179
266
|
payload = sections.size == 1 ? data.values.first : data
|
|
@@ -190,12 +277,21 @@ module Iriq
|
|
|
190
277
|
# corpus is ephemeral unless --corpus was given.
|
|
191
278
|
def cmd_batch(args, opts, corpus, explicit_cluster: false)
|
|
192
279
|
corpus ||= Corpus.new
|
|
280
|
+
|
|
281
|
+
# Per-IRI sections (-n/-p/-c/-e) are independent line to line, so we
|
|
282
|
+
# stream: read input lazily, extract per line, and emit each IRI as it
|
|
283
|
+
# arrives (flushed for live `tail -f | iriq -n` pipelines). The aggregate
|
|
284
|
+
# views below — stats, clusters, the deduped URL list — need the whole
|
|
285
|
+
# input, so they slurp.
|
|
286
|
+
if opts[:sections].any?
|
|
287
|
+
emit_per_iri_sections(lazy_iris(args.first, opts), opts, corpus)
|
|
288
|
+
return 0
|
|
289
|
+
end
|
|
290
|
+
|
|
193
291
|
iris = extract_text(read_text(args.first), opts)
|
|
194
292
|
corpus.batch { iris.each { |iri| corpus.observe(iri) } }
|
|
195
293
|
|
|
196
|
-
if opts[:
|
|
197
|
-
emit_per_iri_sections(iris, opts)
|
|
198
|
-
elsif opts[:stats]
|
|
294
|
+
if opts[:stats]
|
|
199
295
|
emit_stats(corpus, opts)
|
|
200
296
|
elsif explicit_cluster || iris.size >= LARGE_BATCH_THRESHOLD
|
|
201
297
|
# Either the user asked for clusters explicitly, or the input is
|
|
@@ -207,36 +303,68 @@ module Iriq
|
|
|
207
303
|
0
|
|
208
304
|
end
|
|
209
305
|
|
|
210
|
-
#
|
|
211
|
-
#
|
|
212
|
-
|
|
306
|
+
# Lazily yield IRIs from the input, one input line at a time, so an
|
|
307
|
+
# unbounded stream flows through without being buffered in full. Matches
|
|
308
|
+
# whole-text extraction exactly: a candidate never spans a newline
|
|
309
|
+
# (URL_CHAR_CLASS excludes whitespace) and `extract` does not dedup.
|
|
310
|
+
def lazy_iris(path, opts)
|
|
311
|
+
extractor = Extractor.new(scheme_less: opts[:scheme_less])
|
|
312
|
+
input_lines(path).lazy.flat_map { |line| extractor.extract(line) }
|
|
313
|
+
end
|
|
314
|
+
|
|
315
|
+
def input_lines(path)
|
|
316
|
+
if path.nil? || path == "-"
|
|
317
|
+
stdin.each_line
|
|
318
|
+
else
|
|
319
|
+
File.foreach(path)
|
|
320
|
+
end
|
|
321
|
+
end
|
|
322
|
+
|
|
323
|
+
# Emit the requested sections (parse/canonical/normalize) for each extracted
|
|
324
|
+
# IRI, observing each into `corpus` as it passes. `iris` may be a lazy
|
|
325
|
+
# enumerator; human and NDJSON output stream (flushed per IRI) while a single
|
|
326
|
+
# JSON array must be materialized. -n alone is the cleanest case: one line
|
|
327
|
+
# per URL.
|
|
328
|
+
def emit_per_iri_sections(iris, opts, corpus)
|
|
213
329
|
sections = opts[:sections]
|
|
214
|
-
payloads = iris.map { |iri| section_payload(iri, sections, opts) }
|
|
215
330
|
|
|
216
|
-
|
|
331
|
+
# A wrapping JSON array can't be emitted incrementally — collect it
|
|
332
|
+
# (force the lazy enumerator to a real Array so emit_json sees an array).
|
|
333
|
+
if opts[:json] && !opts[:ndjson]
|
|
334
|
+
payloads = iris.map { |iri| corpus.observe(iri); section_payload(iri, sections, opts) }.to_a
|
|
217
335
|
out = sections.size == 1 ? payloads.map(&:values).flatten(1) : payloads
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
336
|
+
return emit_json(out, opts)
|
|
337
|
+
end
|
|
338
|
+
|
|
339
|
+
iris.each_with_index do |iri, i|
|
|
340
|
+
corpus.observe(iri)
|
|
341
|
+
p = section_payload(iri, sections, opts)
|
|
342
|
+
if opts[:ndjson]
|
|
343
|
+
items = sections.size == 1 ? p.values : [p]
|
|
344
|
+
items.each { |item| stdout.puts JSON.generate(item) }
|
|
345
|
+
elsif sections == [:normalize] || sections == [:canonical]
|
|
346
|
+
# Most common case — keep it tight: one URL per line, no headers.
|
|
347
|
+
stdout.puts p[sections.first]
|
|
348
|
+
else
|
|
224
349
|
stdout.puts if i > 0
|
|
225
|
-
stdout.puts "# #{
|
|
350
|
+
stdout.puts "# #{iri.canonical}"
|
|
226
351
|
sections.each_with_index do |sec, j|
|
|
227
352
|
stdout.puts if j > 0 # blank line between sections within one IRI
|
|
228
353
|
case sec
|
|
229
354
|
when :parse then emit_parse_human(p[:parse])
|
|
355
|
+
when :canonical then stdout.puts p[:canonical]
|
|
230
356
|
when :normalize then stdout.puts p[:normalize]
|
|
231
357
|
end
|
|
232
358
|
end
|
|
233
359
|
end
|
|
360
|
+
stdout.flush
|
|
234
361
|
end
|
|
235
362
|
end
|
|
236
363
|
|
|
237
364
|
def section_payload(iri, sections, opts)
|
|
238
365
|
data = {}
|
|
239
366
|
data[:parse] = identifier_hash(iri) if sections.include?(:parse)
|
|
367
|
+
data[:canonical] = iri.canonical if sections.include?(:canonical)
|
|
240
368
|
data[:normalize] = Normalizer.normalize_identifier(iri, hints: opts[:hints]) if sections.include?(:normalize)
|
|
241
369
|
data
|
|
242
370
|
end
|
|
@@ -260,7 +388,7 @@ module Iriq
|
|
|
260
388
|
sorted = counts.sort_by { |k, c| [-c, first[k]] }
|
|
261
389
|
|
|
262
390
|
if opts[:json]
|
|
263
|
-
|
|
391
|
+
emit_json(sorted.map { |k, c| { iri: k, count: c } }, opts)
|
|
264
392
|
elsif sorted.all? { |_, c| c == 1 }
|
|
265
393
|
sorted.each { |k, _| stdout.puts k }
|
|
266
394
|
else
|
|
@@ -275,9 +403,152 @@ module Iriq
|
|
|
275
403
|
0
|
|
276
404
|
end
|
|
277
405
|
|
|
406
|
+
# --propose-recognizers: scan observed values for prefix patterns
|
|
407
|
+
# that recur enough to suggest a new Recognizer. Prints one block
|
|
408
|
+
# per proposal in human mode, or a JSON array under --json. With
|
|
409
|
+
# --activate-above F, every proposal at or above confidence F is
|
|
410
|
+
# promoted to a live Recognizer on the corpus's classifier and the
|
|
411
|
+
# corpus reinfers to apply the new classifier to existing
|
|
412
|
+
# observations.
|
|
413
|
+
def cmd_propose(corpus, opts)
|
|
414
|
+
return missing("--corpus") unless corpus
|
|
415
|
+
|
|
416
|
+
kwargs = {}
|
|
417
|
+
kwargs[:min_observations] = opts[:propose_min_obs] if opts[:propose_min_obs]
|
|
418
|
+
kwargs[:min_coverage] = opts[:propose_min_coverage] if opts[:propose_min_coverage]
|
|
419
|
+
kwargs[:min_hosts] = opts[:min_hosts] if opts[:min_hosts]
|
|
420
|
+
|
|
421
|
+
if opts[:activate_above]
|
|
422
|
+
activated = corpus.activate_proposals_above(opts[:activate_above], **kwargs)
|
|
423
|
+
if activated.empty?
|
|
424
|
+
stdout.puts "no proposals at or above coverage #{opts[:activate_above]}"
|
|
425
|
+
else
|
|
426
|
+
activated.each do |r|
|
|
427
|
+
stdout.puts "activated: #{r.type} (#{r.prefix})"
|
|
428
|
+
end
|
|
429
|
+
end
|
|
430
|
+
return 0
|
|
431
|
+
end
|
|
432
|
+
|
|
433
|
+
proposals = corpus.propose_recognizers(**kwargs)
|
|
434
|
+
|
|
435
|
+
if opts[:json]
|
|
436
|
+
stdout.puts JSON.generate(proposals.map(&:to_h))
|
|
437
|
+
return 0
|
|
438
|
+
end
|
|
439
|
+
|
|
440
|
+
if proposals.empty?
|
|
441
|
+
stdout.puts "no recognizer proposals (#{corpus.observed_iri_count} observations scanned)"
|
|
442
|
+
return 0
|
|
443
|
+
end
|
|
444
|
+
|
|
445
|
+
proposals.each_with_index do |p, i|
|
|
446
|
+
stdout.puts if i > 0
|
|
447
|
+
stdout.puts "proposal: #{p.suggested_type} (#{p.prefix})"
|
|
448
|
+
stdout.puts " strategy: #{p.strategy}"
|
|
449
|
+
stdout.puts " coverage: #{format('%.2f', p.coverage)}"
|
|
450
|
+
stdout.puts " confidence: #{format('%.2f', p.confidence)}"
|
|
451
|
+
stdout.puts " observations: #{p.observation_count}"
|
|
452
|
+
stdout.puts " hosts: #{p.hosts.to_a.sort.join(', ')}"
|
|
453
|
+
stdout.puts " positions: #{p.positions.size}"
|
|
454
|
+
stdout.puts " samples: #{p.sample_values.first(3).join(', ')}"
|
|
455
|
+
end
|
|
456
|
+
0
|
|
457
|
+
end
|
|
458
|
+
|
|
459
|
+
# --reinfer: drop the materialized views in the corpus and replay the
|
|
460
|
+
# source-IRI log through the current classifier + reducers. Prints a
|
|
461
|
+
# short before/after summary so the user can see what changed.
|
|
462
|
+
def cmd_reinfer(corpus, _opts)
|
|
463
|
+
return missing("--corpus") unless corpus
|
|
464
|
+
|
|
465
|
+
n = corpus.observed_iri_count
|
|
466
|
+
before = corpus.size
|
|
467
|
+
corpus.reinfer
|
|
468
|
+
after = corpus.size
|
|
469
|
+
|
|
470
|
+
stdout.puts "reinferred #{n} observation#{n == 1 ? '' : 's'}: " \
|
|
471
|
+
"#{before} → #{after} cluster#{after == 1 ? '' : 's'}"
|
|
472
|
+
0
|
|
473
|
+
end
|
|
474
|
+
|
|
475
|
+
# `completion <shell>` — emit the bundled shell-completion script.
|
|
476
|
+
# Scripts live in completions/{iriq.bash,_iriq} alongside the gem;
|
|
477
|
+
# Homebrew installs them automatically, but the user can also do
|
|
478
|
+
# `source <(iriq completion bash)` in their shell rc.
|
|
479
|
+
COMPLETIONS_DIR = File.expand_path("../../completions", __dir__).freeze
|
|
480
|
+
COMPLETION_FILES = {
|
|
481
|
+
"bash" => File.join(COMPLETIONS_DIR, "iriq.bash"),
|
|
482
|
+
"zsh" => File.join(COMPLETIONS_DIR, "_iriq"),
|
|
483
|
+
}.freeze
|
|
484
|
+
|
|
485
|
+
def cmd_completion(args)
|
|
486
|
+
shell = args.first || default_shell
|
|
487
|
+
path = COMPLETION_FILES[shell]
|
|
488
|
+
unless path
|
|
489
|
+
return emit_error("unknown_shell", "unknown shell #{shell.inspect} (try bash or zsh)", 1)
|
|
490
|
+
end
|
|
491
|
+
stdout.write(File.read(path))
|
|
492
|
+
0
|
|
493
|
+
end
|
|
494
|
+
|
|
495
|
+
def default_shell
|
|
496
|
+
shell = ENV["SHELL"].to_s
|
|
497
|
+
shell.empty? ? "bash" : File.basename(shell).sub(/\.exe\z/, "")
|
|
498
|
+
end
|
|
499
|
+
|
|
500
|
+
# --cross-host-shapes: list route shapes that recur across multiple
|
|
501
|
+
# hosts in the corpus. One block per shape in human mode, JSON array
|
|
502
|
+
# under --json. Tunable via --min-hosts (default 2).
|
|
503
|
+
def cmd_cross_host_shapes(corpus, opts)
|
|
504
|
+
return missing("--corpus") unless corpus
|
|
505
|
+
|
|
506
|
+
kwargs = {}
|
|
507
|
+
kwargs[:min_hosts] = opts[:min_hosts] if opts[:min_hosts]
|
|
508
|
+
shapes = corpus.cross_host_shapes(**kwargs)
|
|
509
|
+
|
|
510
|
+
if opts[:json]
|
|
511
|
+
stdout.puts JSON.generate(shapes.map(&:to_h))
|
|
512
|
+
return 0
|
|
513
|
+
end
|
|
514
|
+
|
|
515
|
+
if shapes.empty?
|
|
516
|
+
stdout.puts "no cross-host shapes (#{corpus.size} cluster#{corpus.size == 1 ? '' : 's'} scanned)"
|
|
517
|
+
return 0
|
|
518
|
+
end
|
|
519
|
+
|
|
520
|
+
shapes.each do |s|
|
|
521
|
+
host_list = s.hosts.to_a.sort.join(", ")
|
|
522
|
+
stdout.puts "#{s.shape} (#{s.host_count} host#{s.host_count == 1 ? '' : 's'}: #{host_list}) obs=#{s.observation_count}"
|
|
523
|
+
end
|
|
524
|
+
0
|
|
525
|
+
end
|
|
526
|
+
|
|
278
527
|
def missing(name)
|
|
279
|
-
|
|
280
|
-
|
|
528
|
+
emit_error("missing_argument", "missing argument <#{name}>", 1)
|
|
529
|
+
end
|
|
530
|
+
|
|
531
|
+
# Detect whether JSON output was requested by scanning raw argv. Used
|
|
532
|
+
# before option parsing completes (or when it fails) so errors can still
|
|
533
|
+
# honor --json. Handles bundled short flags like -nj.
|
|
534
|
+
def json_requested?(argv)
|
|
535
|
+
argv.any? do |a|
|
|
536
|
+
a == "--json" || a == "--ndjson" ||
|
|
537
|
+
(a.start_with?("-") && !a.start_with?("--") && a.match?(/[jJ]/))
|
|
538
|
+
end
|
|
539
|
+
end
|
|
540
|
+
|
|
541
|
+
# Emit an error to stderr and return its exit code. Under --json/--ndjson
|
|
542
|
+
# the error is a structured envelope ({"error":{"code","message"}}) so
|
|
543
|
+
# agents and pipelines get parseable output on the failure path; otherwise
|
|
544
|
+
# the plain `human` line, printed as given (human defaults to "iriq: <message>").
|
|
545
|
+
def emit_error(code, message, exit_code, human: nil)
|
|
546
|
+
if @json
|
|
547
|
+
stderr.puts JSON.generate(error: { code: code, message: message })
|
|
548
|
+
else
|
|
549
|
+
stderr.puts(human || "iriq: #{message}")
|
|
550
|
+
end
|
|
551
|
+
exit_code
|
|
281
552
|
end
|
|
282
553
|
|
|
283
554
|
def read_input(path)
|
|
@@ -314,6 +585,19 @@ module Iriq
|
|
|
314
585
|
}.reject { |_, v| v.nil? || (v.respond_to?(:empty?) && v.empty?) }
|
|
315
586
|
end
|
|
316
587
|
|
|
588
|
+
# Emit a JSON payload to stdout. When --ndjson is set and the payload is
|
|
589
|
+
# an Array, write one object per line (newline-delimited JSON) instead of
|
|
590
|
+
# one wrapping array — friendlier for `jq -c`, streaming pipelines, and
|
|
591
|
+
# log ingest tools. Non-array payloads (single objects) emit the same
|
|
592
|
+
# under both flags.
|
|
593
|
+
def emit_json(payload, opts)
|
|
594
|
+
if opts[:ndjson] && payload.is_a?(Array)
|
|
595
|
+
payload.each { |item| stdout.puts JSON.generate(item) }
|
|
596
|
+
else
|
|
597
|
+
stdout.puts JSON.generate(payload)
|
|
598
|
+
end
|
|
599
|
+
end
|
|
600
|
+
|
|
317
601
|
def emit_sections(data, sections)
|
|
318
602
|
multi = sections.size > 1
|
|
319
603
|
sections.each_with_index do |sec, i|
|
|
@@ -321,41 +605,106 @@ module Iriq
|
|
|
321
605
|
stdout.puts "# #{sec}" if multi
|
|
322
606
|
case sec
|
|
323
607
|
when :parse then emit_parse_human(data[:parse])
|
|
608
|
+
when :canonical then stdout.puts data[:canonical]
|
|
324
609
|
when :normalize then stdout.puts data[:normalize]
|
|
610
|
+
when :explain then emit_explain_human(data[:explain])
|
|
325
611
|
end
|
|
326
612
|
end
|
|
327
613
|
end
|
|
328
614
|
|
|
615
|
+
# Render the trace hash as a vertically-aligned per-segment table.
|
|
616
|
+
# Path rows come first, then query rows.
|
|
617
|
+
def emit_explain_human(trace)
|
|
618
|
+
stdout.puts trace[:normalized]
|
|
619
|
+
emit_trace_section("path", trace[:path])
|
|
620
|
+
emit_trace_section("query", trace[:query]) if trace[:query]
|
|
621
|
+
end
|
|
622
|
+
|
|
623
|
+
def emit_trace_section(label, rows)
|
|
624
|
+
return if rows.nil? || rows.empty?
|
|
625
|
+
|
|
626
|
+
stdout.puts
|
|
627
|
+
stdout.puts "#{label}:"
|
|
628
|
+
name_width = rows.map { |r| trace_label(r).length }.max
|
|
629
|
+
type_width = rows.map { |r| r[:type].to_s.length }.max
|
|
630
|
+
out_width = rows.map { |r| r[:output].to_s.length }.max
|
|
631
|
+
rows.each do |r|
|
|
632
|
+
stdout.puts " #{trace_label(r).ljust(name_width)} #{r[:type].to_s.ljust(type_width)} #{r[:output].to_s.ljust(out_width)}#{format_notes(r[:notes])}"
|
|
633
|
+
end
|
|
634
|
+
end
|
|
635
|
+
|
|
636
|
+
def trace_label(row)
|
|
637
|
+
# Path rows have :value, query rows have :name=:value.
|
|
638
|
+
row[:name] ? "#{row[:name]}=#{row[:value]}" : row[:value].to_s
|
|
639
|
+
end
|
|
640
|
+
|
|
641
|
+
def format_notes(notes)
|
|
642
|
+
return "" if notes.nil? || notes.empty?
|
|
643
|
+
" (" + notes.join("; ") + ")"
|
|
644
|
+
end
|
|
645
|
+
|
|
646
|
+
# Render the compact identifier_hash. Keys/values are already filtered;
|
|
647
|
+
# array/hash values get .inspect, everything else .to_s.
|
|
329
648
|
def emit_parse_human(h)
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
stdout.puts "port: #{h[:port]}" if h[:port]
|
|
335
|
-
stdout.puts "path_segments: #{h[:path_segments].inspect}" if h[:kind] == :url
|
|
336
|
-
stdout.puts "query_params: #{h[:query_params].inspect}" if h[:query_params] && !h[:query_params].empty?
|
|
337
|
-
stdout.puts "fragment: #{h[:fragment]}" if h[:fragment]
|
|
338
|
-
stdout.puts "nss: #{h[:nss]}" if h[:nss]
|
|
339
|
-
stdout.puts "canonical: #{h[:canonical]}"
|
|
649
|
+
h.each do |key, value|
|
|
650
|
+
rendered = value.is_a?(Array) || value.is_a?(Hash) ? value.inspect : value.to_s
|
|
651
|
+
stdout.puts "#{"#{key}:".ljust(15)}#{rendered}"
|
|
652
|
+
end
|
|
340
653
|
end
|
|
341
654
|
|
|
342
655
|
def emit_clusters(clusters, opts)
|
|
343
656
|
sorted = clusters.sort_by { |c| -c.count }
|
|
344
657
|
|
|
345
658
|
if opts[:json]
|
|
346
|
-
|
|
659
|
+
emit_json(sorted.map(&:to_h), opts)
|
|
347
660
|
else
|
|
348
661
|
sorted.each_with_index do |c, i|
|
|
349
662
|
stdout.puts if i > 0
|
|
350
663
|
host = c.host || "(urn)"
|
|
351
664
|
shape = opts[:hints] ? c.shape : raw_shape_for(c)
|
|
352
665
|
stdout.puts "[#{c.count}] #{host} #{shape}"
|
|
353
|
-
c.examples.first(3)
|
|
354
|
-
stdout.puts "
|
|
666
|
+
examples = c.examples.first(3)
|
|
667
|
+
examples.each { |e| stdout.puts " #{e.canonical}" }
|
|
668
|
+
remaining = c.count - examples.size
|
|
669
|
+
stdout.puts " + #{remaining} more" if remaining.positive?
|
|
670
|
+
emit_param_summary(c)
|
|
671
|
+
end
|
|
672
|
+
end
|
|
673
|
+
end
|
|
674
|
+
|
|
675
|
+
# One line per param: type, range (numeric), cardinality, presence.
|
|
676
|
+
# `page integer 1..100 avg 50.5 (10 distinct, 100%)`
|
|
677
|
+
def emit_param_summary(cluster)
|
|
678
|
+
rows = cluster.param_summary
|
|
679
|
+
return if rows.empty?
|
|
680
|
+
|
|
681
|
+
width = rows.map { |r| r[:name].length }.max
|
|
682
|
+
rows.each do |r|
|
|
683
|
+
bits = ["#{r[:type]}"]
|
|
684
|
+
if r[:min] && r[:max]
|
|
685
|
+
bits << format_range(r[:min], r[:max])
|
|
686
|
+
bits << "avg #{format_num(r[:avg])}" if r[:avg]
|
|
355
687
|
end
|
|
688
|
+
bits << "(#{r[:cardinality]} distinct, #{format_pct(r[:presence])})"
|
|
689
|
+
stdout.puts " #{r[:name].to_s.ljust(width)} #{bits.join(' ')}"
|
|
356
690
|
end
|
|
357
691
|
end
|
|
358
692
|
|
|
693
|
+
def format_range(lo, hi)
|
|
694
|
+
"#{format_num(lo)}..#{format_num(hi)}"
|
|
695
|
+
end
|
|
696
|
+
|
|
697
|
+
def format_num(n)
|
|
698
|
+
return n.to_s if n.is_a?(Integer)
|
|
699
|
+
whole = n.to_i
|
|
700
|
+
return whole.to_s if whole == n
|
|
701
|
+
n.round(2).to_s
|
|
702
|
+
end
|
|
703
|
+
|
|
704
|
+
def format_pct(frac)
|
|
705
|
+
"#{(frac * 100).round}%"
|
|
706
|
+
end
|
|
707
|
+
|
|
359
708
|
def raw_shape_for(cluster)
|
|
360
709
|
example = cluster.examples.first
|
|
361
710
|
return cluster.shape unless example
|
|
@@ -388,7 +737,10 @@ module Iriq
|
|
|
388
737
|
end
|
|
389
738
|
|
|
390
739
|
def top(hash)
|
|
391
|
-
|
|
740
|
+
# Lex tie-break on equal counts — Ruby Hash insertion order would
|
|
741
|
+
# otherwise diverge from Go's map iteration (which has no insertion
|
|
742
|
+
# order). Keeps Ruby ↔ Go --stats parity stable.
|
|
743
|
+
hash.sort_by { |k, n| [-n, k] }.first(TOP_N_STATS).to_h
|
|
392
744
|
end
|
|
393
745
|
end
|
|
394
746
|
end
|