iriq 0.1.0 → 0.30.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +87 -0
  3. data/CLAUDE.md +208 -0
  4. data/Gemfile.lock +8 -2
  5. data/Makefile +113 -0
  6. data/README.md +249 -270
  7. data/completions/_iriq +52 -0
  8. data/completions/iriq.bash +70 -0
  9. data/docs/ARCHITECTURE.md +223 -0
  10. data/docs/ROADMAP.md +190 -0
  11. data/iriq.gemspec +5 -4
  12. data/lib/iriq/cli.rb +402 -49
  13. data/lib/iriq/cluster.rb +304 -8
  14. data/lib/iriq/clusterer.rb +19 -44
  15. data/lib/iriq/corpus.rb +417 -81
  16. data/lib/iriq/cross_host_shape.rb +37 -0
  17. data/lib/iriq/event.rb +22 -0
  18. data/lib/iriq/evidence.rb +114 -0
  19. data/lib/iriq/explanation.rb +1 -1
  20. data/lib/iriq/normalizer.rb +71 -29
  21. data/lib/iriq/parser.rb +1 -1
  22. data/lib/iriq/path_shape.rb +30 -24
  23. data/lib/iriq/position.rb +75 -0
  24. data/lib/iriq/position_stats.rb +74 -8
  25. data/lib/iriq/recognizer.rb +54 -0
  26. data/lib/iriq/recognizer_proposal.rb +167 -0
  27. data/lib/iriq/recognizers/date.rb +53 -0
  28. data/lib/iriq/recognizers/integer.rb +37 -0
  29. data/lib/iriq/recognizers/uuid.rb +16 -0
  30. data/lib/iriq/reducer.rb +37 -0
  31. data/lib/iriq/registrable_domain.rb +56 -0
  32. data/lib/iriq/segment_classifier.rb +475 -23
  33. data/lib/iriq/segment_hints.rb +9 -0
  34. data/lib/iriq/shape.rb +106 -0
  35. data/lib/iriq/specificity.rb +35 -0
  36. data/lib/iriq/storage/json.rb +43 -0
  37. data/lib/iriq/storage/memory.rb +209 -0
  38. data/lib/iriq/storage/sqlite.rb +546 -0
  39. data/lib/iriq/storage.rb +35 -0
  40. data/lib/iriq/synthesized_recognizer.rb +56 -0
  41. data/lib/iriq/trace.rb +294 -0
  42. data/lib/iriq/version.rb +1 -1
  43. data/lib/iriq.rb +18 -0
  44. metadata +44 -8
  45. data/script/benchmark.rb +0 -81
  46. data/script/memory.rb +0 -121
data/lib/iriq/cli.rb CHANGED
@@ -18,6 +18,8 @@ module Iriq
18
18
  LARGE_BATCH_THRESHOLD = 10
19
19
 
20
20
  USAGE = <<~TXT
21
+ iriq — find a URL's shape: the route template behind it (e.g. /users/{id}).
22
+
21
23
  Usage: iriq [options] <input>
22
24
  iriq [options] < text
23
25
  iriq cluster [options] [file]
@@ -26,18 +28,49 @@ module Iriq
26
28
  text via stdin.
27
29
 
28
30
  Sections (combine freely):
29
- -n, --normalize Shape-normalized form
31
+ -n, --normalize Shape — variable parts become placeholders
32
+ -c, --canonical Clean form — tidy scheme/host, keep the values
30
33
  -p, --parse Parsed fields
34
+ -e, --explain Annotated trace — per-segment notes about why
35
+ each placeholder / canonical value was chosen
31
36
 
32
37
  Corpus + stats:
33
38
  --corpus PATH Load/create a JSON corpus; observe and save atomically.
34
39
  -n becomes corpus-informed once it has data.
40
+ --host MODE Host-keying strategy for clustering:
41
+ full (default), registrable (or reg) strips
42
+ subdomains, none ignores host entirely.
35
43
  --stats Print rolling aggregates
44
+ --reinfer Replay the source-IRI log through the current
45
+ classifier + reducers; rebuilds materialized
46
+ views from scratch. Requires --corpus.
47
+ --propose-recognizers
48
+ Scan observed values for shape patterns that
49
+ recur enough to suggest a new Recognizer.
50
+ Combine with --json for structured output.
51
+ Requires --corpus.
52
+ --cross-host-shapes
53
+ List route shapes that recur across
54
+ multiple hosts. Combine with --min-hosts.
55
+ Requires --corpus.
56
+ --activate-above F With --propose-recognizers, promote every
57
+ proposal at or above CONFIDENCE F into a
58
+ live Recognizer on the corpus, then
59
+ reinfer. Confidence integrates coverage
60
+ and cross-host corroboration.
61
+
62
+ Thresholds (apply to --propose-recognizers / --cross-host-shapes):
63
+ --min-observations N proposal noise floor (default 20)
64
+ --min-coverage F proposal coverage floor (default 0.7)
65
+ --min-hosts N proposal: minimum hosts (default 1);
66
+ cross-host-shapes: minimum hosts to
67
+ list (default 2)
36
68
 
37
69
  Other:
38
70
  -h, --help Show this message
39
71
  -j, --json Emit JSON instead of human-readable output
40
- -N, --no-hints Use {integer_id} placeholders instead of {user_id}
72
+ -J, --ndjson Newline-delimited JSON (one object per line). Implies --json.
73
+ -N, --no-hints Use {integer} placeholders instead of {user_id}
41
74
  --no-scheme-less Skip foo.com/path extraction (explicit-scheme only)
42
75
  -V, --version Print version
43
76
 
@@ -62,11 +95,22 @@ module Iriq
62
95
 
63
96
  # Returns an integer exit code.
64
97
  def run(argv)
98
+ # Pre-scan so an error during option parsing can still honor --json.
99
+ # Re-set authoritatively from opts once parsing succeeds.
100
+ @json = json_requested?(argv)
65
101
  args, opts = parse_options(argv)
102
+ @json = opts[:json]
66
103
 
67
104
  return print_usage(stdout, 0) if opts[:help]
68
105
  return print_version if opts[:version]
69
106
 
107
+ # `iriq completion <shell>` short-circuits — no corpus, no IRI input,
108
+ # just emit the script bundled with the gem.
109
+ if args.first == "completion"
110
+ args.shift
111
+ return cmd_completion(args)
112
+ end
113
+
70
114
  explicit_cluster = (args.first == "cluster")
71
115
  args.shift if explicit_cluster
72
116
 
@@ -79,11 +123,17 @@ module Iriq
79
123
  batch_mode = explicit_cluster || positional_is_file ||
80
124
  (args.empty? && piped_stdin?)
81
125
 
82
- return print_usage(stdout, 0) if args.empty? && !batch_mode
126
+ return print_usage(stdout, 0) if args.empty? && !batch_mode && !opts[:reinfer] && !opts[:propose] && !opts[:cross_host_shapes]
83
127
 
84
- corpus = opts[:corpus] ? load_corpus(opts[:corpus]) : nil
128
+ corpus = opts[:corpus] ? load_corpus(opts[:corpus], host_strategy: opts[:host_strategy]) : nil
85
129
 
86
- code = if batch_mode
130
+ code = if opts[:reinfer]
131
+ cmd_reinfer(corpus, opts)
132
+ elsif opts[:propose]
133
+ cmd_propose(corpus, opts)
134
+ elsif opts[:cross_host_shapes]
135
+ cmd_cross_host_shapes(corpus, opts)
136
+ elsif batch_mode
87
137
  cmd_batch(args, opts, corpus, explicit_cluster: explicit_cluster)
88
138
  elsif opts[:stats]
89
139
  cmd_stats(corpus, opts)
@@ -94,11 +144,9 @@ module Iriq
94
144
  corpus.save(opts[:corpus]) if corpus && opts[:corpus]
95
145
  code
96
146
  rescue Iriq::ParseError => e
97
- stderr.puts "iriq: parse error: #{e.message}"
98
- 2
147
+ emit_error("parse_error", e.message, 2, human: "iriq: parse error: #{e.message}")
99
148
  rescue OptionParser::ParseError => e
100
- stderr.puts "iriq: #{e.message}"
101
- 1
149
+ emit_error("option_error", e.message, 1)
102
150
  end
103
151
 
104
152
  def parseable_iri?(input)
@@ -113,22 +161,45 @@ module Iriq
113
161
  def parse_options(argv)
114
162
  opts = {
115
163
  json: false,
164
+ ndjson: false,
116
165
  help: false,
117
166
  version: false,
118
167
  hints: true,
119
168
  sections: [],
120
- corpus: nil,
121
- stats: false,
122
- scheme_less: true,
169
+ corpus: nil,
170
+ stats: false,
171
+ reinfer: false,
172
+ propose: false,
173
+ propose_min_obs: nil,
174
+ propose_min_coverage: nil,
175
+ # --min-hosts is generic: it applies to both --propose-recognizers
176
+ # (proposal threshold) and --cross-host-shapes (cross-host
177
+ # recurrence threshold).
178
+ min_hosts: nil,
179
+ activate_above: nil,
180
+ cross_host_shapes: false,
181
+ scheme_less: true,
182
+ host_strategy: :full,
123
183
  }
124
184
  parser = OptionParser.new do |o|
125
185
  o.on("-p", "--parse") { opts[:sections] << :parse }
126
186
  o.on("-n", "--normalize") { opts[:sections] << :normalize }
187
+ o.on("-c", "--canonical") { opts[:sections] << :canonical }
188
+ o.on("-e", "--explain") { opts[:sections] << :explain }
127
189
  o.on("-j", "--json") { opts[:json] = true }
190
+ o.on("-J", "--ndjson") { opts[:json] = true; opts[:ndjson] = true }
128
191
  o.on("--[no-]hints") { |v| opts[:hints] = v }
129
192
  o.on("-N") { opts[:hints] = false }
130
193
  o.on("--corpus PATH") { |v| opts[:corpus] = v }
194
+ o.on("--host MODE") { |v| opts[:host_strategy] = host_strategy_arg(v) }
131
195
  o.on("--stats") { opts[:stats] = true }
196
+ o.on("--reinfer") { opts[:reinfer] = true }
197
+ o.on("--propose-recognizers") { opts[:propose] = true }
198
+ o.on("--min-observations N", Integer) { |v| opts[:propose_min_obs] = v }
199
+ o.on("--min-coverage F", Float) { |v| opts[:propose_min_coverage] = v }
200
+ o.on("--min-hosts N", Integer) { |v| opts[:min_hosts] = v }
201
+ o.on("--activate-above F", Float) { |v| opts[:activate_above] = v }
202
+ o.on("--cross-host-shapes") { opts[:cross_host_shapes] = true }
132
203
  o.on("--[no-]scheme-less") { |v| opts[:scheme_less] = v }
133
204
  o.on("-h", "--help") { opts[:help] = true }
134
205
  o.on("-V", "--version") { opts[:version] = true }
@@ -149,10 +220,20 @@ module Iriq
149
220
  end
150
221
  end
151
222
 
152
- def load_corpus(path)
153
- return Corpus.load(path) if File.exist?(path)
223
+ def load_corpus(path, host_strategy: :full)
224
+ Corpus.open(path, host_strategy: host_strategy)
225
+ end
154
226
 
155
- Corpus.new
227
+ # Accept `--host=reg` as a short alias for the `registrable` mode.
228
+ HOST_STRATEGY_ALIASES = {
229
+ "full" => :full, "registrable" => :registrable, "reg" => :registrable, "none" => :none,
230
+ }.freeze
231
+
232
+ def host_strategy_arg(value)
233
+ mode = HOST_STRATEGY_ALIASES[value.to_s.downcase]
234
+ raise OptionParser::InvalidArgument, "--host: expected full|registrable|reg|none, got #{value.inspect}" unless mode
235
+
236
+ mode
156
237
  end
157
238
 
158
239
  def print_usage(io, code)
@@ -173,9 +254,13 @@ module Iriq
173
254
 
174
255
  data = {}
175
256
  data[:parse] = identifier_hash(iri) if sections.include?(:parse)
257
+ data[:canonical] = iri.canonical if sections.include?(:canonical)
176
258
  if sections.include?(:normalize)
177
259
  data[:normalize] = corpus ? corpus.normalize(iri) : Normalizer.normalize_identifier(iri, hints: opts[:hints])
178
260
  end
261
+ if sections.include?(:explain)
262
+ data[:explain] = Trace.for(iri, hints: opts[:hints])
263
+ end
179
264
 
180
265
  if opts[:json]
181
266
  payload = sections.size == 1 ? data.values.first : data
@@ -192,12 +277,21 @@ module Iriq
192
277
  # corpus is ephemeral unless --corpus was given.
193
278
  def cmd_batch(args, opts, corpus, explicit_cluster: false)
194
279
  corpus ||= Corpus.new
195
- iris = extract_text(read_text(args.first), opts)
196
- iris.each { |iri| corpus.observe(iri) }
197
280
 
281
+ # Per-IRI sections (-n/-p/-c/-e) are independent line to line, so we
282
+ # stream: read input lazily, extract per line, and emit each IRI as it
283
+ # arrives (flushed for live `tail -f | iriq -n` pipelines). The aggregate
284
+ # views below — stats, clusters, the deduped URL list — need the whole
285
+ # input, so they slurp.
198
286
  if opts[:sections].any?
199
- emit_per_iri_sections(iris, opts)
200
- elsif opts[:stats]
287
+ emit_per_iri_sections(lazy_iris(args.first, opts), opts, corpus)
288
+ return 0
289
+ end
290
+
291
+ iris = extract_text(read_text(args.first), opts)
292
+ corpus.batch { iris.each { |iri| corpus.observe(iri) } }
293
+
294
+ if opts[:stats]
201
295
  emit_stats(corpus, opts)
202
296
  elsif explicit_cluster || iris.size >= LARGE_BATCH_THRESHOLD
203
297
  # Either the user asked for clusters explicitly, or the input is
@@ -209,36 +303,68 @@ module Iriq
209
303
  0
210
304
  end
211
305
 
212
- # Emit the requested sections (parse/normalize/explain) for each
213
- # extracted IRI. -n alone is the cleanest case: one line per URL.
214
- def emit_per_iri_sections(iris, opts)
306
+ # Lazily yield IRIs from the input, one input line at a time, so an
307
+ # unbounded stream flows through without being buffered in full. Matches
308
+ # whole-text extraction exactly: a candidate never spans a newline
309
+ # (URL_CHAR_CLASS excludes whitespace) and `extract` does not dedup.
310
+ def lazy_iris(path, opts)
311
+ extractor = Extractor.new(scheme_less: opts[:scheme_less])
312
+ input_lines(path).lazy.flat_map { |line| extractor.extract(line) }
313
+ end
314
+
315
+ def input_lines(path)
316
+ if path.nil? || path == "-"
317
+ stdin.each_line
318
+ else
319
+ File.foreach(path)
320
+ end
321
+ end
322
+
323
# Emit the requested sections (parse/canonical/normalize/explain) for each
# extracted IRI, observing each into `corpus` as it passes.
#
# iris   - an Array or a lazy enumerator of parsed IRIs.
# opts   - the parsed option hash; reads :sections, :json, :ndjson, :hints.
# corpus - receives `observe(iri)` once per IRI, before that IRI is emitted.
#
# Human and NDJSON output stream (flushed per IRI); a single wrapping JSON
# array has to be collected first. -n or -c alone is the tight case: one
# line per URL, no headers.
def emit_per_iri_sections(iris, opts, corpus)
  sections = opts[:sections]

  # A wrapping JSON array can't be emitted incrementally — collect it
  # (to_a forces a lazy enumerator so emit_json sees a real Array).
  if opts[:json] && !opts[:ndjson]
    payloads = iris.map do |iri|
      corpus.observe(iri)
      section_payload(iri, sections, opts)
    end.to_a
    out = sections.size == 1 ? payloads.map(&:values).flatten(1) : payloads
    return emit_json(out, opts)
  end

  # Plain #each with a manual counter: #each with a block always drives the
  # enumerator, lazy or not, so each IRI is emitted as soon as it is read.
  index = 0
  iris.each do |iri|
    corpus.observe(iri)
    payload = section_payload(iri, sections, opts)
    if opts[:ndjson]
      items = sections.size == 1 ? payload.values : [payload]
      items.each { |item| stdout.puts JSON.generate(item) }
    elsif sections == [:normalize] || sections == [:canonical]
      # Most common case — keep it tight: one URL per line, no headers.
      stdout.puts payload[sections.first]
    else
      stdout.puts if index > 0
      stdout.puts "# #{iri.canonical}"
      sections.each_with_index do |sec, j|
        stdout.puts if j > 0 # blank line between sections within one IRI
        case sec
        when :parse then emit_parse_human(payload[:parse])
        when :canonical then stdout.puts payload[:canonical]
        when :normalize then stdout.puts payload[:normalize]
        when :explain then emit_explain_human(payload[:explain])
        end
      end
    end
    stdout.flush
    index += 1
  end
end

# Build the payload for one IRI: a hash with one entry per requested
# section, keyed by section name. Sections that were not requested are
# absent, so callers can take `values` when exactly one was asked for.
def section_payload(iri, sections, opts)
  data = {}
  data[:parse] = identifier_hash(iri) if sections.include?(:parse)
  data[:canonical] = iri.canonical if sections.include?(:canonical)
  data[:normalize] = Normalizer.normalize_identifier(iri, hints: opts[:hints]) if sections.include?(:normalize)
  # -e/--explain is a per-IRI section too; build the same trace the
  # single-IRI path does so batch output does not silently drop it.
  data[:explain] = Trace.for(iri, hints: opts[:hints]) if sections.include?(:explain)
  data
end
@@ -262,7 +388,7 @@ module Iriq
262
388
  sorted = counts.sort_by { |k, c| [-c, first[k]] }
263
389
 
264
390
  if opts[:json]
265
- stdout.puts JSON.generate(sorted.map { |k, c| { iri: k, count: c } })
391
+ emit_json(sorted.map { |k, c| { iri: k, count: c } }, opts)
266
392
  elsif sorted.all? { |_, c| c == 1 }
267
393
  sorted.each { |k, _| stdout.puts k }
268
394
  else
@@ -277,9 +403,152 @@ module Iriq
277
403
  0
278
404
  end
279
405
 
406
# --propose-recognizers: ask the corpus for recognizer proposals and print
# them — one block per proposal in human mode, JSON under --json (one
# object per line under --ndjson).
#
# With --activate-above F the corpus promotes every proposal at or above
# confidence F via `activate_proposals_above`, and the activated
# recognizers are reported instead of the proposal list.
#
# The --min-observations / --min-coverage / --min-hosts thresholds are
# forwarded only when given, so the corpus defaults apply otherwise.
#
# Returns the process exit code (1 when no corpus was supplied).
def cmd_propose(corpus, opts)
  return missing("--corpus") unless corpus

  kwargs = {
    min_observations: opts[:propose_min_obs],
    min_coverage: opts[:propose_min_coverage],
    min_hosts: opts[:min_hosts],
  }.compact

  if opts[:activate_above]
    activated = corpus.activate_proposals_above(opts[:activate_above], **kwargs)
    if opts[:json]
      # Keep stdout machine-readable on this path too.
      emit_json(activated.map { |r| { type: r.type, prefix: r.prefix } }, opts)
    elsif activated.empty?
      stdout.puts "no proposals at or above confidence #{opts[:activate_above]}"
    else
      activated.each { |r| stdout.puts "activated: #{r.type} (#{r.prefix})" }
    end
    return 0
  end

  proposals = corpus.propose_recognizers(**kwargs)

  if opts[:json]
    # emit_json rather than a bare JSON.generate so --ndjson is honored.
    emit_json(proposals.map(&:to_h), opts)
    return 0
  end

  if proposals.empty?
    stdout.puts "no recognizer proposals (#{corpus.observed_iri_count} observations scanned)"
    return 0
  end

  proposals.each_with_index do |p, i|
    stdout.puts if i > 0
    stdout.puts "proposal: #{p.suggested_type} (#{p.prefix})"
    stdout.puts " strategy: #{p.strategy}"
    stdout.puts " coverage: #{format('%.2f', p.coverage)}"
    stdout.puts " confidence: #{format('%.2f', p.confidence)}"
    stdout.puts " observations: #{p.observation_count}"
    stdout.puts " hosts: #{p.hosts.to_a.sort.join(', ')}"
    stdout.puts " positions: #{p.positions.size}"
    stdout.puts " samples: #{p.sample_values.first(3).join(', ')}"
  end
  0
end
458
+
459
+ # --reinfer: drop the materialized views in the corpus and replay the
460
+ # source-IRI log through the current classifier + reducers. Prints a
461
+ # short before/after summary so the user can see what changed.
462
+ def cmd_reinfer(corpus, _opts)
463
+ return missing("--corpus") unless corpus
464
+
465
+ n = corpus.observed_iri_count
466
+ before = corpus.size
467
+ corpus.reinfer
468
+ after = corpus.size
469
+
470
+ stdout.puts "reinferred #{n} observation#{n == 1 ? '' : 's'}: " \
471
+ "#{before} → #{after} cluster#{after == 1 ? '' : 's'}"
472
+ 0
473
+ end
474
+
475
+ # `completion <shell>` — emit the bundled shell-completion script.
476
+ # Scripts live in completions/{iriq.bash,_iriq} alongside the gem;
477
+ # Homebrew installs them automatically, but the user can also do
478
+ # `source <(iriq completion bash)` in their shell rc.
479
+ COMPLETIONS_DIR = File.expand_path("../../completions", __dir__).freeze
480
+ COMPLETION_FILES = {
481
+ "bash" => File.join(COMPLETIONS_DIR, "iriq.bash"),
482
+ "zsh" => File.join(COMPLETIONS_DIR, "_iriq"),
483
+ }.freeze
484
+
485
# Write the completion script for the requested shell to stdout.
#
# args - remaining positionals after `completion`; the first names the
#        shell, falling back to `default_shell` when absent.
#
# Returns 0 on success, or 1 (via emit_error) when the shell has no entry
# in COMPLETION_FILES or its script cannot be read.
def cmd_completion(args)
  shell = args.first || default_shell
  path = COMPLETION_FILES[shell]
  unless path
    return emit_error("unknown_shell", "unknown shell #{shell.inspect} (try bash or zsh)", 1)
  end

  # Only the read is guarded: a missing or unreadable script (e.g. an
  # install that did not ship completions/) becomes a normal CLI error
  # instead of an uncaught Errno backtrace.
  begin
    script = File.read(path)
  rescue SystemCallError => e
    return emit_error("completion_unavailable", "cannot read #{shell} completion script: #{e.message}", 1)
  end

  stdout.write(script)
  0
end
494
+
495
+ def default_shell
496
+ shell = ENV["SHELL"].to_s
497
+ shell.empty? ? "bash" : File.basename(shell).sub(/\.exe\z/, "")
498
+ end
499
+
500
# --cross-host-shapes: list the route shapes the corpus reports as
# recurring across hosts. One line per shape in human mode; JSON under
# --json (one object per line under --ndjson).
#
# --min-hosts is forwarded only when given, so the corpus default applies
# otherwise.
#
# Returns the process exit code (1 when no corpus was supplied).
def cmd_cross_host_shapes(corpus, opts)
  return missing("--corpus") unless corpus

  kwargs = {}
  kwargs[:min_hosts] = opts[:min_hosts] if opts[:min_hosts]
  shapes = corpus.cross_host_shapes(**kwargs)

  if opts[:json]
    # emit_json rather than a bare JSON.generate so --ndjson is honored.
    emit_json(shapes.map(&:to_h), opts)
    return 0
  end

  if shapes.empty?
    cluster_count = corpus.size # read once; used for both count and plural
    stdout.puts "no cross-host shapes (#{cluster_count} cluster#{cluster_count == 1 ? '' : 's'} scanned)"
    return 0
  end

  shapes.each do |s|
    host_list = s.hosts.to_a.sort.join(", ")
    stdout.puts "#{s.shape} (#{s.host_count} host#{s.host_count == 1 ? '' : 's'}: #{host_list}) obs=#{s.observation_count}"
  end
  0
end
526
+
280
527
  def missing(name)
281
- stderr.puts "iriq: missing argument <#{name}>"
282
- 1
528
+ emit_error("missing_argument", "missing argument <#{name}>", 1)
529
+ end
530
+
531
# Detect whether JSON output was requested by scanning raw argv. Used
# before option parsing completes (or when it fails) so errors can still
# honor --json. Handles bundled short flags like -nj.
#
# Scanning stops at a bare "--": everything after it is positional input
# (OptionParser's end-of-options marker), so a file named e.g. "-j.txt"
# must not switch error output to JSON.
#
# Returns true or false.
def json_requested?(argv)
  option_args = argv.take_while { |a| a != "--" }
  option_args.any? do |a|
    a == "--json" || a == "--ndjson" ||
      (a.start_with?("-") && !a.start_with?("--") && a.match?(/[jJ]/))
  end
end
540
+
541
+ # Emit an error to stderr and return its exit code. Under --json/--ndjson
542
+ # the error is a structured envelope ({"error":{"code","message"}}) so
543
+ # agents and pipelines get parseable output on the failure path; otherwise
544
+ # the plain "iriq: <human>" line (human defaults to "iriq: <message>").
545
+ def emit_error(code, message, exit_code, human: nil)
546
+ if @json
547
+ stderr.puts JSON.generate(error: { code: code, message: message })
548
+ else
549
+ stderr.puts(human || "iriq: #{message}")
550
+ end
551
+ exit_code
283
552
  end
284
553
 
285
554
  def read_input(path)
@@ -298,6 +567,9 @@ module Iriq
298
567
  end
299
568
  end
300
569
 
570
+ # Compact identifier hash for parse output (both JSON and human). Drops
571
+ # nil values and empty collections so URN dumps don't carry empty
572
+ # host/path/query slots, and URL dumps don't include null fragment/nss.
301
573
  def identifier_hash(iri)
302
574
  {
303
575
  original: iri.original,
@@ -310,7 +582,20 @@ module Iriq
310
582
  fragment: iri.fragment,
311
583
  nss: iri.nss,
312
584
  canonical: iri.canonical,
313
- }
585
+ }.reject { |_, v| v.nil? || (v.respond_to?(:empty?) && v.empty?) }
586
+ end
587
+
588
+ # Emit a JSON payload to stdout. When --ndjson is set and the payload is
589
+ # an Array, write one object per line (newline-delimited JSON) instead of
590
+ # one wrapping array — friendlier for `jq -c`, streaming pipelines, and
591
+ # log ingest tools. Non-array payloads (single objects) emit the same
592
+ # under both flags.
593
+ def emit_json(payload, opts)
594
+ if opts[:ndjson] && payload.is_a?(Array)
595
+ payload.each { |item| stdout.puts JSON.generate(item) }
596
+ else
597
+ stdout.puts JSON.generate(payload)
598
+ end
314
599
  end
315
600
 
316
601
  def emit_sections(data, sections)
@@ -320,41 +605,106 @@ module Iriq
320
605
  stdout.puts "# #{sec}" if multi
321
606
  case sec
322
607
  when :parse then emit_parse_human(data[:parse])
608
+ when :canonical then stdout.puts data[:canonical]
323
609
  when :normalize then stdout.puts data[:normalize]
610
+ when :explain then emit_explain_human(data[:explain])
324
611
  end
325
612
  end
326
613
  end
327
614
 
615
+ # Render the trace hash as a vertically-aligned per-segment table.
616
+ # path rows first, then query rows.
617
+ def emit_explain_human(trace)
618
+ stdout.puts trace[:normalized]
619
+ emit_trace_section("path", trace[:path])
620
+ emit_trace_section("query", trace[:query]) if trace[:query]
621
+ end
622
+
623
+ def emit_trace_section(label, rows)
624
+ return if rows.nil? || rows.empty?
625
+
626
+ stdout.puts
627
+ stdout.puts "#{label}:"
628
+ name_width = rows.map { |r| trace_label(r).length }.max
629
+ type_width = rows.map { |r| r[:type].to_s.length }.max
630
+ out_width = rows.map { |r| r[:output].to_s.length }.max
631
+ rows.each do |r|
632
+ stdout.puts " #{trace_label(r).ljust(name_width)} #{r[:type].to_s.ljust(type_width)} #{r[:output].to_s.ljust(out_width)}#{format_notes(r[:notes])}"
633
+ end
634
+ end
635
+
636
+ def trace_label(row)
637
+ # Path rows have :value, query rows have :name=:value.
638
+ row[:name] ? "#{row[:name]}=#{row[:value]}" : row[:value].to_s
639
+ end
640
+
641
+ def format_notes(notes)
642
+ return "" if notes.nil? || notes.empty?
643
+ " (" + notes.join("; ") + ")"
644
+ end
645
+
646
+ # Render the compact identifier_hash. Keys/values are already filtered;
647
+ # array/hash values get .inspect, everything else .to_s.
328
648
  def emit_parse_human(h)
329
- stdout.puts "original: #{h[:original]}"
330
- stdout.puts "kind: #{h[:kind]}"
331
- stdout.puts "scheme: #{h[:scheme]}" if h[:scheme]
332
- stdout.puts "host: #{h[:host]}" if h[:host]
333
- stdout.puts "port: #{h[:port]}" if h[:port]
334
- stdout.puts "path_segments: #{h[:path_segments].inspect}" if h[:kind] == :url
335
- stdout.puts "query_params: #{h[:query_params].inspect}" if h[:query_params] && !h[:query_params].empty?
336
- stdout.puts "fragment: #{h[:fragment]}" if h[:fragment]
337
- stdout.puts "nss: #{h[:nss]}" if h[:nss]
338
- stdout.puts "canonical: #{h[:canonical]}"
649
+ h.each do |key, value|
650
+ rendered = value.is_a?(Array) || value.is_a?(Hash) ? value.inspect : value.to_s
651
+ stdout.puts "#{"#{key}:".ljust(15)}#{rendered}"
652
+ end
339
653
  end
340
654
 
341
655
  def emit_clusters(clusters, opts)
342
656
  sorted = clusters.sort_by { |c| -c.count }
343
657
 
344
658
  if opts[:json]
345
- stdout.puts JSON.generate(sorted.map(&:to_h))
659
+ emit_json(sorted.map(&:to_h), opts)
346
660
  else
347
661
  sorted.each_with_index do |c, i|
348
662
  stdout.puts if i > 0
349
663
  host = c.host || "(urn)"
350
664
  shape = opts[:hints] ? c.shape : raw_shape_for(c)
351
665
  stdout.puts "[#{c.count}] #{host} #{shape}"
352
- c.examples.first(3).each { |e| stdout.puts " #{e.canonical}" }
353
- stdout.puts " + #{c.count - 3} more" if c.count > 3
666
+ examples = c.examples.first(3)
667
+ examples.each { |e| stdout.puts " #{e.canonical}" }
668
+ remaining = c.count - examples.size
669
+ stdout.puts " + #{remaining} more" if remaining.positive?
670
+ emit_param_summary(c)
671
+ end
672
+ end
673
+ end
674
+
675
+ # One line per param: type, range (numeric), cardinality, presence.
676
+ # `page integer 1..100 avg 50.5 (10 distinct, 100%)`
677
+ def emit_param_summary(cluster)
678
+ rows = cluster.param_summary
679
+ return if rows.empty?
680
+
681
+ width = rows.map { |r| r[:name].length }.max
682
+ rows.each do |r|
683
+ bits = ["#{r[:type]}"]
684
+ if r[:min] && r[:max]
685
+ bits << format_range(r[:min], r[:max])
686
+ bits << "avg #{format_num(r[:avg])}" if r[:avg]
354
687
  end
688
+ bits << "(#{r[:cardinality]} distinct, #{format_pct(r[:presence])})"
689
+ stdout.puts " #{r[:name].to_s.ljust(width)} #{bits.join(' ')}"
355
690
  end
356
691
  end
357
692
 
693
+ def format_range(lo, hi)
694
+ "#{format_num(lo)}..#{format_num(hi)}"
695
+ end
696
+
697
# Render a number for the param summary: Integers as-is, whole-valued
# non-Integers without a fractional part, everything else rounded to two
# decimals.
#
# n - an Integer or other Numeric.
#
# Returns a String.
def format_num(n)
  return n.to_s if n.is_a?(Integer)
  # NaN and ±Infinity have no integer form — Float#to_i raises
  # FloatDomainError on them — so print them verbatim.
  return n.to_s if n.respond_to?(:finite?) && !n.finite?

  whole = n.to_i
  return whole.to_s if whole == n

  n.round(2).to_s
end
703
+
704
+ def format_pct(frac)
705
+ "#{(frac * 100).round}%"
706
+ end
707
+
358
708
  def raw_shape_for(cluster)
359
709
  example = cluster.examples.first
360
710
  return cluster.shape unless example
@@ -387,7 +737,10 @@ module Iriq
387
737
  end
388
738
 
389
739
  def top(hash)
390
- hash.sort_by { |_, n| -n }.first(TOP_N_STATS).to_h
740
+ # Lex tie-break on equal counts — Ruby Hash insertion order would
741
+ # otherwise diverge from Go's map iteration (which has no insertion
742
+ # order). Keeps Ruby ↔ Go --stats parity stable.
743
+ hash.sort_by { |k, n| [-n, k] }.first(TOP_N_STATS).to_h
391
744
  end
392
745
  end
393
746
  end