iriq 0.2.0 → 0.30.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/iriq/cli.rb CHANGED
@@ -18,6 +18,8 @@ module Iriq
18
18
  LARGE_BATCH_THRESHOLD = 10
19
19
 
20
20
  USAGE = <<~TXT
21
+ iriq — find a URL's shape: the route template behind it (e.g. /users/{id}).
22
+
21
23
  Usage: iriq [options] <input>
22
24
  iriq [options] < text
23
25
  iriq cluster [options] [file]
@@ -26,18 +28,49 @@ module Iriq
26
28
  text via stdin.
27
29
 
28
30
  Sections (combine freely):
29
- -n, --normalize Shape-normalized form
31
+ -n, --normalize Shape — variable parts become placeholders
32
+ -c, --canonical Clean form — tidy scheme/host, keep the values
30
33
  -p, --parse Parsed fields
34
+ -e, --explain Annotated trace — per-segment notes about why
35
+ each placeholder / canonical value was chosen
31
36
 
32
37
  Corpus + stats:
33
38
  --corpus PATH Load/create a JSON corpus; observe and save atomically.
34
39
  -n becomes corpus-informed once it has data.
40
+ --host MODE Host-keying strategy for clustering:
41
+ full (default), registrable (or reg) strips
42
+ subdomains, none ignores host entirely.
35
43
  --stats Print rolling aggregates
44
+ --reinfer Replay the source-IRI log through the current
45
+ classifier + reducers; rebuilds materialized
46
+ views from scratch. Requires --corpus.
47
+ --propose-recognizers
48
+ Scan observed values for shape patterns that
49
+ recur enough to suggest a new Recognizer.
50
+ Combine with --json for structured output.
51
+ Requires --corpus.
52
+ --cross-host-shapes
53
+ List route shapes that recur across
54
+ multiple hosts. Combine with --min-hosts.
55
+ Requires --corpus.
56
+ --activate-above F With --propose-recognizers, promote every
57
+ proposal at or above CONFIDENCE F into a
58
+ live Recognizer on the corpus, then
59
+ reinfer. Confidence integrates coverage
60
+ and cross-host corroboration.
61
+
62
+ Thresholds (apply to --propose-recognizers / --cross-host-shapes):
63
+ --min-observations N proposal noise floor (default 20)
64
+ --min-coverage F proposal coverage floor (default 0.7)
65
+ --min-hosts N proposal: minimum hosts (default 1);
66
+ cross-host-shapes: minimum hosts to
67
+ list (default 2)
36
68
 
37
69
  Other:
38
70
  -h, --help Show this message
39
71
  -j, --json Emit JSON instead of human-readable output
40
- -N, --no-hints Use {integer_id} placeholders instead of {user_id}
72
+ -J, --ndjson Newline-delimited JSON (one object per line). Implies --json.
73
+ -N, --no-hints Use {integer} placeholders instead of {user_id}
41
74
  --no-scheme-less Skip foo.com/path extraction (explicit-scheme only)
42
75
  -V, --version Print version
43
76
 
@@ -62,11 +95,22 @@ module Iriq
62
95
 
63
96
  # Returns an integer exit code.
64
97
  def run(argv)
98
+ # Pre-scan so an error during option parsing can still honor --json.
99
+ # Re-set authoritatively from opts once parsing succeeds.
100
+ @json = json_requested?(argv)
65
101
  args, opts = parse_options(argv)
102
+ @json = opts[:json]
66
103
 
67
104
  return print_usage(stdout, 0) if opts[:help]
68
105
  return print_version if opts[:version]
69
106
 
107
+ # `iriq completion <shell>` short-circuits — no corpus, no IRI input,
108
+ # just emit the script bundled with the gem.
109
+ if args.first == "completion"
110
+ args.shift
111
+ return cmd_completion(args)
112
+ end
113
+
70
114
  explicit_cluster = (args.first == "cluster")
71
115
  args.shift if explicit_cluster
72
116
 
@@ -79,11 +123,17 @@ module Iriq
79
123
  batch_mode = explicit_cluster || positional_is_file ||
80
124
  (args.empty? && piped_stdin?)
81
125
 
82
- return print_usage(stdout, 0) if args.empty? && !batch_mode
126
+ return print_usage(stdout, 0) if args.empty? && !batch_mode && !opts[:reinfer] && !opts[:propose] && !opts[:cross_host_shapes]
83
127
 
84
- corpus = opts[:corpus] ? load_corpus(opts[:corpus]) : nil
128
+ corpus = opts[:corpus] ? load_corpus(opts[:corpus], host_strategy: opts[:host_strategy]) : nil
85
129
 
86
- code = if batch_mode
130
+ code = if opts[:reinfer]
131
+ cmd_reinfer(corpus, opts)
132
+ elsif opts[:propose]
133
+ cmd_propose(corpus, opts)
134
+ elsif opts[:cross_host_shapes]
135
+ cmd_cross_host_shapes(corpus, opts)
136
+ elsif batch_mode
87
137
  cmd_batch(args, opts, corpus, explicit_cluster: explicit_cluster)
88
138
  elsif opts[:stats]
89
139
  cmd_stats(corpus, opts)
@@ -94,11 +144,9 @@ module Iriq
94
144
  corpus.save(opts[:corpus]) if corpus && opts[:corpus]
95
145
  code
96
146
  rescue Iriq::ParseError => e
97
- stderr.puts "iriq: parse error: #{e.message}"
98
- 2
147
+ emit_error("parse_error", e.message, 2, human: "iriq: parse error: #{e.message}")
99
148
  rescue OptionParser::ParseError => e
100
- stderr.puts "iriq: #{e.message}"
101
- 1
149
+ emit_error("option_error", e.message, 1)
102
150
  end
103
151
 
104
152
  def parseable_iri?(input)
@@ -113,22 +161,45 @@ module Iriq
113
161
  def parse_options(argv)
114
162
  opts = {
115
163
  json: false,
164
+ ndjson: false,
116
165
  help: false,
117
166
  version: false,
118
167
  hints: true,
119
168
  sections: [],
120
- corpus: nil,
121
- stats: false,
122
- scheme_less: true,
169
+ corpus: nil,
170
+ stats: false,
171
+ reinfer: false,
172
+ propose: false,
173
+ propose_min_obs: nil,
174
+ propose_min_coverage: nil,
175
+ # --min-hosts is generic: it applies to both --propose-recognizers
176
+ # (proposal threshold) and --cross-host-shapes (cross-host
177
+ # recurrence threshold).
178
+ min_hosts: nil,
179
+ activate_above: nil,
180
+ cross_host_shapes: false,
181
+ scheme_less: true,
182
+ host_strategy: :full,
123
183
  }
124
184
  parser = OptionParser.new do |o|
125
185
  o.on("-p", "--parse") { opts[:sections] << :parse }
126
186
  o.on("-n", "--normalize") { opts[:sections] << :normalize }
187
+ o.on("-c", "--canonical") { opts[:sections] << :canonical }
188
+ o.on("-e", "--explain") { opts[:sections] << :explain }
127
189
  o.on("-j", "--json") { opts[:json] = true }
190
+ o.on("-J", "--ndjson") { opts[:json] = true; opts[:ndjson] = true }
128
191
  o.on("--[no-]hints") { |v| opts[:hints] = v }
129
192
  o.on("-N") { opts[:hints] = false }
130
193
  o.on("--corpus PATH") { |v| opts[:corpus] = v }
194
+ o.on("--host MODE") { |v| opts[:host_strategy] = host_strategy_arg(v) }
131
195
  o.on("--stats") { opts[:stats] = true }
196
+ o.on("--reinfer") { opts[:reinfer] = true }
197
+ o.on("--propose-recognizers") { opts[:propose] = true }
198
+ o.on("--min-observations N", Integer) { |v| opts[:propose_min_obs] = v }
199
+ o.on("--min-coverage F", Float) { |v| opts[:propose_min_coverage] = v }
200
+ o.on("--min-hosts N", Integer) { |v| opts[:min_hosts] = v }
201
+ o.on("--activate-above F", Float) { |v| opts[:activate_above] = v }
202
+ o.on("--cross-host-shapes") { opts[:cross_host_shapes] = true }
132
203
  o.on("--[no-]scheme-less") { |v| opts[:scheme_less] = v }
133
204
  o.on("-h", "--help") { opts[:help] = true }
134
205
  o.on("-V", "--version") { opts[:version] = true }
@@ -149,8 +220,20 @@ module Iriq
149
220
  end
150
221
  end
151
222
 
152
- def load_corpus(path)
153
- Corpus.open(path)
223
+ def load_corpus(path, host_strategy: :full)
224
+ Corpus.open(path, host_strategy: host_strategy)
225
+ end
226
+
227
+ # Accept `--host=reg` as a short alias for the `registrable` mode.
228
+ HOST_STRATEGY_ALIASES = {
229
+ "full" => :full, "registrable" => :registrable, "reg" => :registrable, "none" => :none,
230
+ }.freeze
231
+
232
+ def host_strategy_arg(value)
233
+ mode = HOST_STRATEGY_ALIASES[value.to_s.downcase]
234
+ raise OptionParser::InvalidArgument, "--host: expected full|registrable|reg|none, got #{value.inspect}" unless mode
235
+
236
+ mode
154
237
  end
155
238
 
156
239
  def print_usage(io, code)
@@ -171,9 +254,13 @@ module Iriq
171
254
 
172
255
  data = {}
173
256
  data[:parse] = identifier_hash(iri) if sections.include?(:parse)
257
+ data[:canonical] = iri.canonical if sections.include?(:canonical)
174
258
  if sections.include?(:normalize)
175
259
  data[:normalize] = corpus ? corpus.normalize(iri) : Normalizer.normalize_identifier(iri, hints: opts[:hints])
176
260
  end
261
+ if sections.include?(:explain)
262
+ data[:explain] = Trace.for(iri, hints: opts[:hints])
263
+ end
177
264
 
178
265
  if opts[:json]
179
266
  payload = sections.size == 1 ? data.values.first : data
@@ -190,12 +277,21 @@ module Iriq
190
277
  # corpus is ephemeral unless --corpus was given.
191
278
  def cmd_batch(args, opts, corpus, explicit_cluster: false)
192
279
  corpus ||= Corpus.new
280
+
281
+ # Per-IRI sections (-n/-p/-c/-e) are independent line to line, so we
282
+ # stream: read input lazily, extract per line, and emit each IRI as it
283
+ # arrives (flushed for live `tail -f | iriq -n` pipelines). The aggregate
284
+ # views below — stats, clusters, the deduped URL list — need the whole
285
+ # input, so they slurp.
286
+ if opts[:sections].any?
287
+ emit_per_iri_sections(lazy_iris(args.first, opts), opts, corpus)
288
+ return 0
289
+ end
290
+
193
291
  iris = extract_text(read_text(args.first), opts)
194
292
  corpus.batch { iris.each { |iri| corpus.observe(iri) } }
195
293
 
196
- if opts[:sections].any?
197
- emit_per_iri_sections(iris, opts)
198
- elsif opts[:stats]
294
+ if opts[:stats]
199
295
  emit_stats(corpus, opts)
200
296
  elsif explicit_cluster || iris.size >= LARGE_BATCH_THRESHOLD
201
297
  # Either the user asked for clusters explicitly, or the input is
@@ -207,36 +303,68 @@ module Iriq
207
303
  0
208
304
  end
209
305
 
210
- # Emit the requested sections (parse/normalize/explain) for each
211
- # extracted IRI. -n alone is the cleanest case: one line per URL.
212
- def emit_per_iri_sections(iris, opts)
306
+ # Lazily yield IRIs from the input, one input line at a time, so an
307
+ # unbounded stream flows through without being buffered in full. Matches
308
+ # whole-text extraction exactly: a candidate never spans a newline
309
+ # (URL_CHAR_CLASS excludes whitespace) and `extract` does not dedup.
310
+ def lazy_iris(path, opts)
311
+ extractor = Extractor.new(scheme_less: opts[:scheme_less])
312
+ input_lines(path).lazy.flat_map { |line| extractor.extract(line) }
313
+ end
314
+
315
+ def input_lines(path)
316
+ if path.nil? || path == "-"
317
+ stdin.each_line
318
+ else
319
+ File.foreach(path)
320
+ end
321
+ end
322
+
323
+ # Emit the requested sections (parse/canonical/normalize) for each extracted
324
+ # IRI, observing each into `corpus` as it passes. `iris` may be a lazy
325
+ # enumerator; human and NDJSON output stream (flushed per IRI) while a single
326
+ # JSON array must be materialized. -n alone is the cleanest case: one line
327
+ # per URL.
328
+ def emit_per_iri_sections(iris, opts, corpus)
213
329
  sections = opts[:sections]
214
- payloads = iris.map { |iri| section_payload(iri, sections, opts) }
215
330
 
216
- if opts[:json]
331
+ # A wrapping JSON array can't be emitted incrementally — collect it
332
+ # (force the lazy enumerator to a real Array so emit_json sees an array).
333
+ if opts[:json] && !opts[:ndjson]
334
+ payloads = iris.map { |iri| corpus.observe(iri); section_payload(iri, sections, opts) }.to_a
217
335
  out = sections.size == 1 ? payloads.map(&:values).flatten(1) : payloads
218
- stdout.puts JSON.generate(out)
219
- elsif sections == [:normalize]
220
- # Most common case — keep it tight: one URL per line, no headers.
221
- payloads.each { |p| stdout.puts p[:normalize] }
222
- else
223
- payloads.each_with_index do |p, i|
336
+ return emit_json(out, opts)
337
+ end
338
+
339
+ iris.each_with_index do |iri, i|
340
+ corpus.observe(iri)
341
+ p = section_payload(iri, sections, opts)
342
+ if opts[:ndjson]
343
+ items = sections.size == 1 ? p.values : [p]
344
+ items.each { |item| stdout.puts JSON.generate(item) }
345
+ elsif sections == [:normalize] || sections == [:canonical]
346
+ # Most common case — keep it tight: one URL per line, no headers.
347
+ stdout.puts p[sections.first]
348
+ else
224
349
  stdout.puts if i > 0
225
- stdout.puts "# #{iris[i].canonical}"
350
+ stdout.puts "# #{iri.canonical}"
226
351
  sections.each_with_index do |sec, j|
227
352
  stdout.puts if j > 0 # blank line between sections within one IRI
228
353
  case sec
229
354
  when :parse then emit_parse_human(p[:parse])
355
+ when :canonical then stdout.puts p[:canonical]
230
356
  when :normalize then stdout.puts p[:normalize]
231
357
  end
232
358
  end
233
359
  end
360
+ stdout.flush
234
361
  end
235
362
  end
236
363
 
237
364
  def section_payload(iri, sections, opts)
238
365
  data = {}
239
366
  data[:parse] = identifier_hash(iri) if sections.include?(:parse)
367
+ data[:canonical] = iri.canonical if sections.include?(:canonical)
240
368
  data[:normalize] = Normalizer.normalize_identifier(iri, hints: opts[:hints]) if sections.include?(:normalize)
241
369
  data
242
370
  end
@@ -260,7 +388,7 @@ module Iriq
260
388
  sorted = counts.sort_by { |k, c| [-c, first[k]] }
261
389
 
262
390
  if opts[:json]
263
- stdout.puts JSON.generate(sorted.map { |k, c| { iri: k, count: c } })
391
+ emit_json(sorted.map { |k, c| { iri: k, count: c } }, opts)
264
392
  elsif sorted.all? { |_, c| c == 1 }
265
393
  sorted.each { |k, _| stdout.puts k }
266
394
  else
@@ -275,9 +403,152 @@ module Iriq
275
403
  0
276
404
  end
277
405
 
406
+ # --propose-recognizers: scan observed values for prefix patterns
407
+ # that recur enough to suggest a new Recognizer. Prints one block
408
+ # per proposal in human mode, or a JSON array under --json. With
409
+ # --activate-above F, every proposal at or above confidence F is
410
+ # promoted to a live Recognizer on the corpus's classifier and the
411
+ # corpus reinfers to apply the new classifier to existing
412
+ # observations.
413
+ def cmd_propose(corpus, opts)
414
+ return missing("--corpus") unless corpus
415
+
416
+ kwargs = {}
417
+ kwargs[:min_observations] = opts[:propose_min_obs] if opts[:propose_min_obs]
418
+ kwargs[:min_coverage] = opts[:propose_min_coverage] if opts[:propose_min_coverage]
419
+ kwargs[:min_hosts] = opts[:min_hosts] if opts[:min_hosts]
420
+
421
+ if opts[:activate_above]
422
+ activated = corpus.activate_proposals_above(opts[:activate_above], **kwargs)
423
+ if activated.empty?
424
+ stdout.puts "no proposals at or above coverage #{opts[:activate_above]}"
425
+ else
426
+ activated.each do |r|
427
+ stdout.puts "activated: #{r.type} (#{r.prefix})"
428
+ end
429
+ end
430
+ return 0
431
+ end
432
+
433
+ proposals = corpus.propose_recognizers(**kwargs)
434
+
435
+ if opts[:json]
436
+ stdout.puts JSON.generate(proposals.map(&:to_h))
437
+ return 0
438
+ end
439
+
440
+ if proposals.empty?
441
+ stdout.puts "no recognizer proposals (#{corpus.observed_iri_count} observations scanned)"
442
+ return 0
443
+ end
444
+
445
+ proposals.each_with_index do |p, i|
446
+ stdout.puts if i > 0
447
+ stdout.puts "proposal: #{p.suggested_type} (#{p.prefix})"
448
+ stdout.puts " strategy: #{p.strategy}"
449
+ stdout.puts " coverage: #{format('%.2f', p.coverage)}"
450
+ stdout.puts " confidence: #{format('%.2f', p.confidence)}"
451
+ stdout.puts " observations: #{p.observation_count}"
452
+ stdout.puts " hosts: #{p.hosts.to_a.sort.join(', ')}"
453
+ stdout.puts " positions: #{p.positions.size}"
454
+ stdout.puts " samples: #{p.sample_values.first(3).join(', ')}"
455
+ end
456
+ 0
457
+ end
458
+
459
+ # --reinfer: drop the materialized views in the corpus and replay the
460
+ # source-IRI log through the current classifier + reducers. Prints a
461
+ # short before/after summary so the user can see what changed.
462
+ def cmd_reinfer(corpus, _opts)
463
+ return missing("--corpus") unless corpus
464
+
465
+ n = corpus.observed_iri_count
466
+ before = corpus.size
467
+ corpus.reinfer
468
+ after = corpus.size
469
+
470
+ stdout.puts "reinferred #{n} observation#{n == 1 ? '' : 's'}: " \
471
+ "#{before} → #{after} cluster#{after == 1 ? '' : 's'}"
472
+ 0
473
+ end
474
+
475
+ # `completion <shell>` — emit the bundled shell-completion script.
476
+ # Scripts live in completions/{iriq.bash,_iriq} alongside the gem;
477
+ # Homebrew installs them automatically, but the user can also do
478
+ # `source <(iriq completion bash)` in their shell rc.
479
+ COMPLETIONS_DIR = File.expand_path("../../completions", __dir__).freeze
480
+ COMPLETION_FILES = {
481
+ "bash" => File.join(COMPLETIONS_DIR, "iriq.bash"),
482
+ "zsh" => File.join(COMPLETIONS_DIR, "_iriq"),
483
+ }.freeze
484
+
485
+ def cmd_completion(args)
486
+ shell = args.first || default_shell
487
+ path = COMPLETION_FILES[shell]
488
+ unless path
489
+ return emit_error("unknown_shell", "unknown shell #{shell.inspect} (try bash or zsh)", 1)
490
+ end
491
+ stdout.write(File.read(path))
492
+ 0
493
+ end
494
+
495
+ def default_shell
496
+ shell = ENV["SHELL"].to_s
497
+ shell.empty? ? "bash" : File.basename(shell).sub(/\.exe\z/, "")
498
+ end
499
+
500
+ # --cross-host-shapes: list route shapes that recur across multiple
501
+ # hosts in the corpus. One block per shape in human mode, JSON array
502
+ # under --json. Tunable via --min-hosts (default 2).
503
+ def cmd_cross_host_shapes(corpus, opts)
504
+ return missing("--corpus") unless corpus
505
+
506
+ kwargs = {}
507
+ kwargs[:min_hosts] = opts[:min_hosts] if opts[:min_hosts]
508
+ shapes = corpus.cross_host_shapes(**kwargs)
509
+
510
+ if opts[:json]
511
+ stdout.puts JSON.generate(shapes.map(&:to_h))
512
+ return 0
513
+ end
514
+
515
+ if shapes.empty?
516
+ stdout.puts "no cross-host shapes (#{corpus.size} cluster#{corpus.size == 1 ? '' : 's'} scanned)"
517
+ return 0
518
+ end
519
+
520
+ shapes.each do |s|
521
+ host_list = s.hosts.to_a.sort.join(", ")
522
+ stdout.puts "#{s.shape} (#{s.host_count} host#{s.host_count == 1 ? '' : 's'}: #{host_list}) obs=#{s.observation_count}"
523
+ end
524
+ 0
525
+ end
526
+
278
527
  def missing(name)
279
- stderr.puts "iriq: missing argument <#{name}>"
280
- 1
528
+ emit_error("missing_argument", "missing argument <#{name}>", 1)
529
+ end
530
+
531
+ # Detect whether JSON output was requested by scanning raw argv. Used
532
+ # before option parsing completes (or when it fails) so errors can still
533
+ # honor --json. Handles bundled short flags like -nj.
534
+ def json_requested?(argv)
535
+ argv.any? do |a|
536
+ a == "--json" || a == "--ndjson" ||
537
+ (a.start_with?("-") && !a.start_with?("--") && a.match?(/[jJ]/))
538
+ end
539
+ end
540
+
541
+ # Emit an error to stderr and return its exit code. Under --json/--ndjson
542
+ # the error is a structured envelope ({"error":{"code","message"}}) so
543
+ # agents and pipelines get parseable output on the failure path; otherwise
544
+ # the plain <human> line (human defaults to "iriq: <message>").
545
+ def emit_error(code, message, exit_code, human: nil)
546
+ if @json
547
+ stderr.puts JSON.generate(error: { code: code, message: message })
548
+ else
549
+ stderr.puts(human || "iriq: #{message}")
550
+ end
551
+ exit_code
281
552
  end
282
553
 
283
554
  def read_input(path)
@@ -314,6 +585,19 @@ module Iriq
314
585
  }.reject { |_, v| v.nil? || (v.respond_to?(:empty?) && v.empty?) }
315
586
  end
316
587
 
588
+ # Emit a JSON payload to stdout. When --ndjson is set and the payload is
589
+ # an Array, write one object per line (newline-delimited JSON) instead of
590
+ # one wrapping array — friendlier for `jq -c`, streaming pipelines, and
591
+ # log ingest tools. Non-array payloads (single objects) emit the same
592
+ # under both flags.
593
+ def emit_json(payload, opts)
594
+ if opts[:ndjson] && payload.is_a?(Array)
595
+ payload.each { |item| stdout.puts JSON.generate(item) }
596
+ else
597
+ stdout.puts JSON.generate(payload)
598
+ end
599
+ end
600
+
317
601
  def emit_sections(data, sections)
318
602
  multi = sections.size > 1
319
603
  sections.each_with_index do |sec, i|
@@ -321,41 +605,106 @@ module Iriq
321
605
  stdout.puts "# #{sec}" if multi
322
606
  case sec
323
607
  when :parse then emit_parse_human(data[:parse])
608
+ when :canonical then stdout.puts data[:canonical]
324
609
  when :normalize then stdout.puts data[:normalize]
610
+ when :explain then emit_explain_human(data[:explain])
325
611
  end
326
612
  end
327
613
  end
328
614
 
615
+ # Render the trace hash as a vertically-aligned per-segment table.
616
+ # Path rows first, then query rows.
617
+ def emit_explain_human(trace)
618
+ stdout.puts trace[:normalized]
619
+ emit_trace_section("path", trace[:path])
620
+ emit_trace_section("query", trace[:query]) if trace[:query]
621
+ end
622
+
623
+ def emit_trace_section(label, rows)
624
+ return if rows.nil? || rows.empty?
625
+
626
+ stdout.puts
627
+ stdout.puts "#{label}:"
628
+ name_width = rows.map { |r| trace_label(r).length }.max
629
+ type_width = rows.map { |r| r[:type].to_s.length }.max
630
+ out_width = rows.map { |r| r[:output].to_s.length }.max
631
+ rows.each do |r|
632
+ stdout.puts " #{trace_label(r).ljust(name_width)} #{r[:type].to_s.ljust(type_width)} #{r[:output].to_s.ljust(out_width)}#{format_notes(r[:notes])}"
633
+ end
634
+ end
635
+
636
+ def trace_label(row)
637
+ # Path rows have :value, query rows have :name=:value.
638
+ row[:name] ? "#{row[:name]}=#{row[:value]}" : row[:value].to_s
639
+ end
640
+
641
+ def format_notes(notes)
642
+ return "" if notes.nil? || notes.empty?
643
+ " (" + notes.join("; ") + ")"
644
+ end
645
+
646
+ # Render the compact identifier_hash. Keys/values are already filtered;
647
+ # array/hash values get .inspect, everything else .to_s.
329
648
  def emit_parse_human(h)
330
- stdout.puts "original: #{h[:original]}"
331
- stdout.puts "kind: #{h[:kind]}"
332
- stdout.puts "scheme: #{h[:scheme]}" if h[:scheme]
333
- stdout.puts "host: #{h[:host]}" if h[:host]
334
- stdout.puts "port: #{h[:port]}" if h[:port]
335
- stdout.puts "path_segments: #{h[:path_segments].inspect}" if h[:kind] == :url
336
- stdout.puts "query_params: #{h[:query_params].inspect}" if h[:query_params] && !h[:query_params].empty?
337
- stdout.puts "fragment: #{h[:fragment]}" if h[:fragment]
338
- stdout.puts "nss: #{h[:nss]}" if h[:nss]
339
- stdout.puts "canonical: #{h[:canonical]}"
649
+ h.each do |key, value|
650
+ rendered = value.is_a?(Array) || value.is_a?(Hash) ? value.inspect : value.to_s
651
+ stdout.puts "#{"#{key}:".ljust(15)}#{rendered}"
652
+ end
340
653
  end
341
654
 
342
655
  def emit_clusters(clusters, opts)
343
656
  sorted = clusters.sort_by { |c| -c.count }
344
657
 
345
658
  if opts[:json]
346
- stdout.puts JSON.generate(sorted.map(&:to_h))
659
+ emit_json(sorted.map(&:to_h), opts)
347
660
  else
348
661
  sorted.each_with_index do |c, i|
349
662
  stdout.puts if i > 0
350
663
  host = c.host || "(urn)"
351
664
  shape = opts[:hints] ? c.shape : raw_shape_for(c)
352
665
  stdout.puts "[#{c.count}] #{host} #{shape}"
353
- c.examples.first(3).each { |e| stdout.puts " #{e.canonical}" }
354
- stdout.puts " + #{c.count - 3} more" if c.count > 3
666
+ examples = c.examples.first(3)
667
+ examples.each { |e| stdout.puts " #{e.canonical}" }
668
+ remaining = c.count - examples.size
669
+ stdout.puts " + #{remaining} more" if remaining.positive?
670
+ emit_param_summary(c)
671
+ end
672
+ end
673
+ end
674
+
675
+ # One line per param: type, range (numeric), cardinality, presence.
676
+ # `page integer 1..100 avg 50.5 (10 distinct, 100%)`
677
+ def emit_param_summary(cluster)
678
+ rows = cluster.param_summary
679
+ return if rows.empty?
680
+
681
+ width = rows.map { |r| r[:name].length }.max
682
+ rows.each do |r|
683
+ bits = ["#{r[:type]}"]
684
+ if r[:min] && r[:max]
685
+ bits << format_range(r[:min], r[:max])
686
+ bits << "avg #{format_num(r[:avg])}" if r[:avg]
355
687
  end
688
+ bits << "(#{r[:cardinality]} distinct, #{format_pct(r[:presence])})"
689
+ stdout.puts " #{r[:name].to_s.ljust(width)} #{bits.join(' ')}"
356
690
  end
357
691
  end
358
692
 
693
+ def format_range(lo, hi)
694
+ "#{format_num(lo)}..#{format_num(hi)}"
695
+ end
696
+
697
+ def format_num(n)
698
+ return n.to_s if n.is_a?(Integer)
699
+ whole = n.to_i
700
+ return whole.to_s if whole == n
701
+ n.round(2).to_s
702
+ end
703
+
704
+ def format_pct(frac)
705
+ "#{(frac * 100).round}%"
706
+ end
707
+
359
708
  def raw_shape_for(cluster)
360
709
  example = cluster.examples.first
361
710
  return cluster.shape unless example
@@ -388,7 +737,10 @@ module Iriq
388
737
  end
389
738
 
390
739
  def top(hash)
391
- hash.sort_by { |_, n| -n }.first(TOP_N_STATS).to_h
740
+ # Lex tie-break on equal counts — Ruby Hash insertion order would
741
+ # otherwise diverge from Go's map iteration (which has no insertion
742
+ # order). Keeps Ruby ↔ Go --stats parity stable.
743
+ hash.sort_by { |k, n| [-n, k] }.first(TOP_N_STATS).to_h
392
744
  end
393
745
  end
394
746
  end