iriq 0.0.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/iriq/cli.rb CHANGED
@@ -1,32 +1,55 @@
1
1
  require "json"
2
2
  require "optparse"
3
+ require "stringio"
3
4
 
4
5
  module Iriq
5
- # Tiny CLI wrapper around the public API. Construct with explicit IO so
6
- # specs can run it without shelling out.
6
+ # Flag-driven CLI. The default action for an input is a combined parse +
7
+ # normalize summary; the -p/-n flags select individual
8
+ # sections. The only subcommand is `cluster`, which is structurally
9
+ # different (many inputs, not one). Construct with explicit IO so specs
10
+ # can run it without shelling out.
7
11
  class CLI
8
- COMMANDS = %w[parse normalize explain classify cluster help version].freeze
12
+ SECTION_FLAGS = %i[parse normalize].freeze
13
+ TOP_N_STATS = 10
14
+
15
+ # When extraction yields this many or more IRIs, the default pipe
16
+ # output switches from a URL list to clusters — a longer list is
17
+ # easier to read as route-shape groups.
18
+ LARGE_BATCH_THRESHOLD = 10
9
19
 
10
20
  USAGE = <<~TXT
11
- Usage: iriq <command> [options] [args]
21
+ Usage: iriq [options] <input>
22
+ iriq [options] < text
23
+ iriq cluster [options] [file]
24
+
25
+ <input> may be an IRI, a file path (extracted automatically), or piped
26
+ text via stdin.
27
+
28
+ Sections (combine freely):
29
+ -n, --normalize Shape-normalized form
30
+ -p, --parse Parsed fields
12
31
 
13
- Commands:
14
- parse <input> Parse an identifier and print its fields
15
- normalize <input> Print the shape-normalized form
16
- explain <input> Annotate each path segment
17
- classify <segment> Classify a single segment
18
- cluster [file] Cluster identifiers from FILE or stdin (one per line)
19
- help Show this message
20
- version Print version
32
+ Corpus + stats:
33
+ --corpus PATH Load/create a JSON corpus; observe and save atomically.
34
+ -n becomes corpus-informed once it has data.
35
+ --stats Print rolling aggregates
21
36
 
22
- Options:
23
- -j, --json Emit JSON instead of human-readable output
24
- -h, --help Show this message
37
+ Other:
38
+ -h, --help Show this message
39
+ -j, --json Emit JSON instead of human-readable output
40
+ -N, --no-hints Use {integer_id} placeholders instead of {user_id}
41
+ --no-scheme-less Skip foo.com/path extraction (explicit-scheme only)
42
+ -V, --version Print version
43
+
44
+ Subcommands:
45
+ cluster [file] Force cluster view (default for ≥10 IRIs anyway)
25
46
 
26
47
  Examples:
27
- iriq parse https://foo.com/users/123
28
- iriq normalize foo.com/users/456
29
- echo "https://foo.com/users/1\\nhttps://foo.com/users/2" | iriq cluster
48
+ iriq foo.com/users/456
49
+ iriq -n https://foo.com/users/123
50
+ iriq ./access.log # auto-detect file → extract URLs
51
+ cat README.md | iriq -n # one normalized URL per line
52
+ cat README.md | iriq --corpus c.json
30
53
  TXT
31
54
 
32
55
  attr_reader :stdin, :stdout, :stderr
@@ -41,16 +64,35 @@ module Iriq
41
64
  def run(argv)
42
65
  args, opts = parse_options(argv)
43
66
 
44
- cmd = args.shift
45
- return print_usage(stdout, 0) if cmd.nil? || cmd == "help" || opts[:help]
67
+ return print_usage(stdout, 0) if opts[:help]
68
+ return print_version if opts[:version]
69
+
70
+ explicit_cluster = (args.first == "cluster")
71
+ args.shift if explicit_cluster
72
+
73
+ # Auto-detect: a positional argument that isn't parseable as an IRI
74
+ # but IS an existing file gets treated as a file to extract from. This
75
+ # is what makes `iriq ./access.log` and `iriq /var/log/foo.log` Just
76
+ # Work without a separate --extract flag.
77
+ positional_is_file = args.first && File.file?(args.first) && !parseable_iri?(args.first)
78
+
79
+ batch_mode = explicit_cluster || positional_is_file ||
80
+ (args.empty? && piped_stdin?)
81
+
82
+ return print_usage(stdout, 0) if args.empty? && !batch_mode
83
+
84
+ corpus = opts[:corpus] ? load_corpus(opts[:corpus]) : nil
46
85
 
47
- unless COMMANDS.include?(cmd)
48
- stderr.puts "iriq: unknown command #{cmd.inspect}"
49
- print_usage(stderr, 1)
50
- return 1
86
+ code = if batch_mode
87
+ cmd_batch(args, opts, corpus, explicit_cluster: explicit_cluster)
88
+ elsif opts[:stats]
89
+ cmd_stats(corpus, opts)
90
+ else
91
+ cmd_summary(args, opts, corpus)
51
92
  end
52
93
 
53
- send("cmd_#{cmd}", args, opts)
94
+ corpus.save(opts[:corpus]) if corpus && opts[:corpus]
95
+ code
54
96
  rescue Iriq::ParseError => e
55
97
  stderr.puts "iriq: parse error: #{e.message}"
56
98
  2
@@ -59,89 +101,180 @@ module Iriq
59
101
  1
60
102
  end
61
103
 
104
+ def parseable_iri?(input)
105
+ Iriq.parse(input)
106
+ true
107
+ rescue Iriq::ParseError
108
+ false
109
+ end
110
+
62
111
  private
63
112
 
64
113
  def parse_options(argv)
65
- opts = { json: false, help: false }
114
+ opts = {
115
+ json: false,
116
+ help: false,
117
+ version: false,
118
+ hints: true,
119
+ sections: [],
120
+ corpus: nil,
121
+ stats: false,
122
+ scheme_less: true,
123
+ }
66
124
  parser = OptionParser.new do |o|
67
- o.on("-j", "--json") { opts[:json] = true }
68
- o.on("-h", "--help") { opts[:help] = true }
125
+ o.on("-p", "--parse") { opts[:sections] << :parse }
126
+ o.on("-n", "--normalize") { opts[:sections] << :normalize }
127
+ o.on("-j", "--json") { opts[:json] = true }
128
+ o.on("--[no-]hints") { |v| opts[:hints] = v }
129
+ o.on("-N") { opts[:hints] = false }
130
+ o.on("--corpus PATH") { |v| opts[:corpus] = v }
131
+ o.on("--stats") { opts[:stats] = true }
132
+ o.on("--[no-]scheme-less") { |v| opts[:scheme_less] = v }
133
+ o.on("-h", "--help") { opts[:help] = true }
134
+ o.on("-V", "--version") { opts[:version] = true }
69
135
  end
70
136
  args = parser.parse(argv)
71
137
  [args, opts]
72
138
  end
73
139
 
74
- def print_usage(io, code)
75
- io.puts USAGE
76
- code
140
+ def piped_stdin?
141
+ # StringIO is the test injection point; treat it as "piped" only when
142
+ # it actually has content. Real stdin: tty? tells us.
143
+ if stdin.is_a?(StringIO)
144
+ stdin.size.positive?
145
+ elsif stdin.respond_to?(:tty?)
146
+ !stdin.tty?
147
+ else
148
+ true
149
+ end
77
150
  end
78
151
 
79
- def require_arg!(args, name)
80
- return args.first if args.first
152
+ def load_corpus(path)
153
+ return Corpus.load(path) if File.exist?(path)
81
154
 
82
- stderr.puts "iriq: missing argument <#{name}>"
83
- throw :missing_arg, 1
155
+ Corpus.new
156
+ end
157
+
158
+ def print_usage(io, code)
159
+ io.puts USAGE
160
+ code
84
161
  end
85
162
 
86
- def cmd_version(_args, _opts)
163
+ def print_version
87
164
  stdout.puts Iriq::VERSION
88
165
  0
89
166
  end
90
167
 
91
- def cmd_parse(args, opts)
92
- input = args.first or return missing(:input)
93
- iri = Iriq.parse(input)
94
- emit_parse(iri, opts)
168
+ def cmd_summary(args, opts, corpus)
169
+ input = args.first or return missing(:input)
170
+ iri = Iriq.parse(input)
171
+ corpus&.observe(iri)
172
+ sections = opts[:sections].empty? ? SECTION_FLAGS : opts[:sections]
173
+
174
+ data = {}
175
+ data[:parse] = identifier_hash(iri) if sections.include?(:parse)
176
+ if sections.include?(:normalize)
177
+ data[:normalize] = corpus ? corpus.normalize(iri) : Normalizer.normalize_identifier(iri, hints: opts[:hints])
178
+ end
179
+
180
+ if opts[:json]
181
+ payload = sections.size == 1 ? data.values.first : data
182
+ stdout.puts JSON.generate(payload)
183
+ else
184
+ emit_sections(data, sections)
185
+ end
95
186
  0
96
187
  end
97
188
 
98
- def cmd_normalize(args, opts)
99
- input = args.first or return missing(:input)
100
- out = Iriq.normalize(input)
101
- opts[:json] ? stdout.puts(JSON.generate(normalized: out)) : stdout.puts(out)
189
+ # Used for the `cluster` subcommand, file arguments, and implicit piped batch mode. Reads
190
+ # the whole input as text and runs it through the extractor — so a file
191
+ # of URLs (one per line) and a file of prose with URLs both work. The
192
+ # corpus is ephemeral unless --corpus was given.
193
+ def cmd_batch(args, opts, corpus, explicit_cluster: false)
194
+ corpus ||= Corpus.new
195
+ iris = extract_text(read_text(args.first), opts)
196
+ iris.each { |iri| corpus.observe(iri) }
197
+
198
+ if opts[:sections].any?
199
+ emit_per_iri_sections(iris, opts)
200
+ elsif opts[:stats]
201
+ emit_stats(corpus, opts)
202
+ elsif explicit_cluster || iris.size >= LARGE_BATCH_THRESHOLD
203
+ # Either the user asked for clusters explicitly, or the input is
204
+ # big enough that the cluster summary beats a long URL list.
205
+ emit_clusters(corpus.clusters, opts)
206
+ else
207
+ emit_url_list(iris, opts)
208
+ end
102
209
  0
103
210
  end
104
211
 
105
- def cmd_explain(args, opts)
106
- input = args.first or return missing(:input)
107
- rows = Iriq.explain(input)
212
+ # Emit the requested sections (parse/normalize) for each
213
+ # extracted IRI. -n alone is the cleanest case: one line per URL.
214
+ def emit_per_iri_sections(iris, opts)
215
+ sections = opts[:sections]
216
+ payloads = iris.map { |iri| section_payload(iri, sections, opts) }
217
+
108
218
  if opts[:json]
109
- stdout.puts JSON.generate(rows)
219
+ out = sections.size == 1 ? payloads.map(&:values).flatten(1) : payloads
220
+ stdout.puts JSON.generate(out)
221
+ elsif sections == [:normalize]
222
+ # Most common case — keep it tight: one URL per line, no headers.
223
+ payloads.each { |p| stdout.puts p[:normalize] }
110
224
  else
111
- rows.each do |r|
112
- mark = r[:variable] ? "*" : " "
113
- stdout.printf("%s %-12s %s\n", mark, r[:type], r[:value])
225
+ payloads.each_with_index do |p, i|
226
+ stdout.puts if i > 0
227
+ stdout.puts "# #{iris[i].canonical}"
228
+ sections.each_with_index do |sec, j|
229
+ stdout.puts if j > 0 # blank line between sections within one IRI
230
+ case sec
231
+ when :parse then emit_parse_human(p[:parse])
232
+ when :normalize then stdout.puts p[:normalize]
233
+ end
234
+ end
114
235
  end
115
236
  end
116
- 0
117
237
  end
118
238
 
119
- def cmd_classify(args, opts)
120
- seg = args.first or return missing(:segment)
121
- type = SegmentClassifier.new.classify(seg)
122
- opts[:json] ? stdout.puts(JSON.generate(value: seg, type: type)) : stdout.puts(type)
123
- 0
239
+ def section_payload(iri, sections, opts)
240
+ data = {}
241
+ data[:parse] = identifier_hash(iri) if sections.include?(:parse)
242
+ data[:normalize] = Normalizer.normalize_identifier(iri, hints: opts[:hints]) if sections.include?(:normalize)
243
+ data
124
244
  end
125
245
 
126
- def cmd_cluster(args, opts)
127
- lines = read_input(args.first)
128
- clusterer = Clusterer.new
129
- lines.each do |line|
130
- line = line.strip
131
- next if line.empty?
246
+ def extract_text(text, opts)
247
+ Extractor.new(scheme_less: opts[:scheme_less]).extract(text)
248
+ end
132
249
 
133
- begin
134
- clusterer.add(line)
135
- rescue Iriq::ParseError => e
136
- stderr.puts "iriq: skipped #{line.inspect}: #{e.message}"
137
- end
250
+ # Emit a deduplicated list of IRIs with occurrence counts, sorted desc
251
+ # by count then by first-seen order. If every IRI is a singleton the
252
+ # `[1]` prefix is omitted — just print the URLs.
253
+ def emit_url_list(iris, opts)
254
+ counts = Hash.new(0)
255
+ first = {}
256
+ iris.each_with_index do |iri, i|
257
+ key = iri.canonical
258
+ counts[key] += 1
259
+ first[key] ||= i
260
+ end
261
+
262
+ sorted = counts.sort_by { |k, c| [-c, first[k]] }
263
+
264
+ if opts[:json]
265
+ stdout.puts JSON.generate(sorted.map { |k, c| { iri: k, count: c } })
266
+ elsif sorted.all? { |_, c| c == 1 }
267
+ sorted.each { |k, _| stdout.puts k }
268
+ else
269
+ sorted.each { |k, c| stdout.puts "[#{c}] #{k}" }
138
270
  end
139
- emit_clusters(clusterer.clusters, opts)
140
- 0
141
271
  end
142
272
 
143
- def cmd_help(_args, _opts)
144
- print_usage(stdout, 0)
273
+ def cmd_stats(corpus, opts)
274
+ return missing("--corpus") unless corpus
275
+
276
+ emit_stats(corpus, opts)
277
+ 0
145
278
  end
146
279
 
147
280
  def missing(name)
@@ -157,49 +290,104 @@ module Iriq
157
290
  end
158
291
  end
159
292
 
160
- def emit_parse(iri, opts)
161
- if opts[:json]
162
- stdout.puts JSON.generate(
163
- original: iri.original,
164
- kind: iri.kind,
165
- scheme: iri.scheme,
166
- host: iri.host,
167
- port: iri.port,
168
- path_segments: iri.path_segments,
169
- query_params: iri.query_params,
170
- fragment: iri.fragment,
171
- nss: iri.nss,
172
- canonical: iri.canonical,
173
- )
293
+ def read_text(path)
294
+ if path.nil? || path == "-"
295
+ stdin.read
174
296
  else
175
- stdout.puts "original: #{iri.original}"
176
- stdout.puts "kind: #{iri.kind}"
177
- stdout.puts "scheme: #{iri.scheme}" if iri.scheme
178
- stdout.puts "host: #{iri.host}" if iri.host
179
- stdout.puts "port: #{iri.port}" if iri.port
180
- stdout.puts "path_segments: #{iri.path_segments.inspect}" if iri.url?
181
- unless iri.query_params.empty?
182
- stdout.puts "query_params: #{iri.query_params.inspect}"
297
+ File.read(path)
298
+ end
299
+ end
300
+
301
+ def identifier_hash(iri)
302
+ {
303
+ original: iri.original,
304
+ kind: iri.kind,
305
+ scheme: iri.scheme,
306
+ host: iri.host,
307
+ port: iri.port,
308
+ path_segments: iri.path_segments,
309
+ query_params: iri.query_params,
310
+ fragment: iri.fragment,
311
+ nss: iri.nss,
312
+ canonical: iri.canonical,
313
+ }
314
+ end
315
+
316
+ def emit_sections(data, sections)
317
+ multi = sections.size > 1
318
+ sections.each_with_index do |sec, i|
319
+ stdout.puts if i > 0
320
+ stdout.puts "# #{sec}" if multi
321
+ case sec
322
+ when :parse then emit_parse_human(data[:parse])
323
+ when :normalize then stdout.puts data[:normalize]
183
324
  end
184
- stdout.puts "fragment: #{iri.fragment}" if iri.fragment
185
- stdout.puts "nss: #{iri.nss}" if iri.nss
186
- stdout.puts "canonical: #{iri.canonical}"
187
325
  end
188
326
  end
189
327
 
328
+ def emit_parse_human(h)
329
+ stdout.puts "original: #{h[:original]}"
330
+ stdout.puts "kind: #{h[:kind]}"
331
+ stdout.puts "scheme: #{h[:scheme]}" if h[:scheme]
332
+ stdout.puts "host: #{h[:host]}" if h[:host]
333
+ stdout.puts "port: #{h[:port]}" if h[:port]
334
+ stdout.puts "path_segments: #{h[:path_segments].inspect}" if h[:kind] == :url
335
+ stdout.puts "query_params: #{h[:query_params].inspect}" if h[:query_params] && !h[:query_params].empty?
336
+ stdout.puts "fragment: #{h[:fragment]}" if h[:fragment]
337
+ stdout.puts "nss: #{h[:nss]}" if h[:nss]
338
+ stdout.puts "canonical: #{h[:canonical]}"
339
+ end
340
+
190
341
  def emit_clusters(clusters, opts)
191
342
  sorted = clusters.sort_by { |c| -c.count }
192
343
 
193
344
  if opts[:json]
194
345
  stdout.puts JSON.generate(sorted.map(&:to_h))
195
346
  else
196
- sorted.each do |c|
197
- host = c.host || "(urn)"
198
- stdout.puts "[#{c.count}] #{host} #{c.shape}"
347
+ sorted.each_with_index do |c, i|
348
+ stdout.puts if i > 0
349
+ host = c.host || "(urn)"
350
+ shape = opts[:hints] ? c.shape : raw_shape_for(c)
351
+ stdout.puts "[#{c.count}] #{host} #{shape}"
199
352
  c.examples.first(3).each { |e| stdout.puts " #{e.canonical}" }
200
353
  stdout.puts " + #{c.count - 3} more" if c.count > 3
201
354
  end
202
355
  end
203
356
  end
357
+
358
+ def raw_shape_for(cluster)
359
+ example = cluster.examples.first
360
+ return cluster.shape unless example
361
+
362
+ PathShape.for(example.path_segments, hints: false)
363
+ end
364
+
365
+ def emit_stats(corpus, opts)
366
+ payload = {
367
+ observations: corpus.host_counts.values.sum,
368
+ clusters: corpus.size,
369
+ hosts: top(corpus.host_counts),
370
+ shapes: top(corpus.fingerprint_counts),
371
+ raw_shapes: top(corpus.raw_shape_counts),
372
+ }
373
+
374
+ if opts[:json]
375
+ stdout.puts JSON.generate(payload)
376
+ else
377
+ stdout.puts "observations: #{payload[:observations]}"
378
+ stdout.puts "clusters: #{payload[:clusters]}"
379
+ stdout.puts
380
+ stdout.puts "top hosts:"
381
+ payload[:hosts].each { |h, n| stdout.puts " #{n.to_s.rjust(6)} #{h}" }
382
+ stdout.puts
383
+ stdout.puts "top shapes:"
384
+ shapes = opts[:hints] ? payload[:shapes] : payload[:raw_shapes]
385
+ shapes.each { |s, n| stdout.puts " #{n.to_s.rjust(6)} #{s}" }
386
+ end
387
+ end
388
+
389
+ def top(hash)
390
+ hash.sort_by { |_, n| -n }.first(TOP_N_STATS).to_h
391
+ end
204
392
  end
205
393
  end
data/lib/iriq/cluster.rb CHANGED
@@ -54,5 +54,28 @@ module Iriq
54
54
  segments: segment_stats,
55
55
  }
56
56
  end
57
+
58
+ # JSON-friendly dump for persistence (distinct from #to_h which is a
59
+ # display form). Examples are dumped as canonical strings and re-parsed
60
+ # on load.
61
+ def dump
62
+ {
63
+ "key" => key,
64
+ "host" => host,
65
+ "scheme" => scheme,
66
+ "shape" => shape,
67
+ "count" => count,
68
+ "examples" => examples.map(&:canonical),
69
+ "segment_counts" => @segment_counts.map { |h| h || {} },
70
+ }
71
+ end
72
+
73
+ def self.from_dump(h)
74
+ cluster = new(key: h["key"], host: h["host"], scheme: h["scheme"], shape: h["shape"])
75
+ cluster.instance_variable_set(:@count, h["count"])
76
+ cluster.instance_variable_set(:@examples, h["examples"].map { |s| Parser.parse(s) })
77
+ cluster.instance_variable_set(:@segment_counts, h["segment_counts"].map { |sub| Hash.new(0).merge(sub) })
78
+ cluster
79
+ end
57
80
  end
58
81
  end
@@ -4,14 +4,14 @@ module Iriq
4
4
  # against the cluster it would fall into, including which positions are
5
5
  # stable across all observed members.
6
6
  class Clusterer
7
- def initialize(classifier: SegmentClassifier.new)
7
+ def initialize(classifier: SegmentClassifier::DEFAULT)
8
8
  @classifier = classifier
9
9
  @clusters = {}
10
10
  end
11
11
 
12
- def add(input)
12
+ def add(input, shape: nil)
13
13
  iri = coerce(input)
14
- key, host, scheme, shape = cluster_key(iri)
14
+ key, host, scheme, shape = cluster_key(iri, shape: shape)
15
15
  cluster = @clusters[key] ||= Cluster.new(
16
16
  key: key,
17
17
  host: host,
@@ -39,16 +39,14 @@ module Iriq
39
39
  key, * = cluster_key(iri)
40
40
  cluster = @clusters[key]
41
41
  stats = cluster ? cluster.segment_stats : []
42
+ hinted = SegmentHints.derive(iri.path_segments, @classifier)
42
43
 
43
- iri.path_segments.each_with_index.map do |seg, i|
44
- type = @classifier.classify(seg)
44
+ hinted.each_with_index.map do |entry, i|
45
45
  stable = stats[i] && stats[i][:stable]
46
- {
47
- value: seg,
48
- type: type,
49
- variable: !stable && @classifier.variable?(type),
46
+ entry.merge(
47
+ variable: !stable && entry[:variable],
50
48
  stable: !!stable,
51
- }
49
+ )
52
50
  end
53
51
  end
54
52
 
@@ -58,20 +56,37 @@ module Iriq
58
56
  input.is_a?(Identifier) ? input : Parser.parse(input)
59
57
  end
60
58
 
61
- def cluster_key(iri)
59
+ def cluster_key(iri, shape: nil)
62
60
  if iri.urn?
63
61
  ns, value = (iri.nss || "").split(":", 2)
64
- shape = if value
65
- type = @classifier.classify(value)
66
- @classifier.variable?(type) ? "{#{type}}" : value
67
- end
68
- key = "urn:#{ns}:#{shape}"
62
+ shape = value ? urn_value_shape(ns, value) : nil
63
+ key = "urn:#{ns}:#{shape}"
69
64
  [key, nil, "urn", key]
70
65
  else
71
- shape = PathShape.new(classifier: @classifier).for(iri.path_segments)
66
+ shape ||= PathShape.new(classifier: @classifier).for(iri.path_segments)
72
67
  key = "#{iri.scheme}://#{iri.host}#{shape}"
73
68
  [key, iri.host, iri.scheme, shape]
74
69
  end
75
70
  end
71
+
72
+ def urn_value_shape(ns, value)
73
+ entry = SegmentHints.derive([ns, value], @classifier).last
74
+ return entry[:value] unless entry[:variable]
75
+
76
+ "{#{entry[:hint] || entry[:type]}}"
77
+ end
78
+
79
+ public
80
+
81
+ def dump
82
+ { "clusters" => @clusters.transform_values(&:dump) }
83
+ end
84
+
85
+ def self.from_dump(h, classifier: SegmentClassifier::DEFAULT)
86
+ c = new(classifier: classifier)
87
+ restored = h["clusters"].transform_values { |cdump| Cluster.from_dump(cdump) }
88
+ c.instance_variable_set(:@clusters, restored)
89
+ c
90
+ end
76
91
  end
77
92
  end