iriq 0.0.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +25 -0
- data/CLAUDE.md +121 -0
- data/Gemfile.lock +8 -2
- data/Makefile +56 -0
- data/README.md +334 -39
- data/iriq.gemspec +4 -3
- data/lib/iriq/cli.rb +289 -100
- data/lib/iriq/cluster.rb +47 -0
- data/lib/iriq/clusterer.rb +29 -39
- data/lib/iriq/corpus.rb +322 -0
- data/lib/iriq/explanation.rb +6 -22
- data/lib/iriq/extractor.rb +125 -0
- data/lib/iriq/identifier.rb +11 -3
- data/lib/iriq/inflector.rb +145 -0
- data/lib/iriq/normalizer.rb +11 -8
- data/lib/iriq/observation.rb +25 -0
- data/lib/iriq/parser.rb +1 -1
- data/lib/iriq/path_shape.rb +27 -9
- data/lib/iriq/position_stats.rb +64 -0
- data/lib/iriq/segment_classifier.rb +31 -7
- data/lib/iriq/segment_hints.rb +32 -0
- data/lib/iriq/storage/json.rb +43 -0
- data/lib/iriq/storage/memory.rb +138 -0
- data/lib/iriq/storage/sqlite.rb +367 -0
- data/lib/iriq/storage.rb +35 -0
- data/lib/iriq/version.rb +1 -1
- data/lib/iriq.rb +11 -0
- metadata +29 -4
data/lib/iriq/cli.rb
CHANGED
|
@@ -1,32 +1,55 @@
|
|
|
1
1
|
require "json"
|
|
2
2
|
require "optparse"
|
|
3
|
+
require "stringio"
|
|
3
4
|
|
|
4
5
|
module Iriq
|
|
5
|
-
#
|
|
6
|
-
#
|
|
6
|
+
# Flag-driven CLI. The default action for an input is a combined parse +
|
|
7
|
+
# normalize summary; the -p/-n flags select individual
|
|
8
|
+
# sections. The only subcommand is `cluster`, which is structurally
|
|
9
|
+
# different (many inputs, not one). Construct with explicit IO so specs
|
|
10
|
+
# can run it without shelling out.
|
|
7
11
|
class CLI
|
|
8
|
-
|
|
12
|
+
SECTION_FLAGS = %i[parse normalize].freeze
|
|
13
|
+
TOP_N_STATS = 10
|
|
14
|
+
|
|
15
|
+
# When extraction yields this many or more IRIs, the default pipe
|
|
16
|
+
# output switches from a URL list to clusters — a longer list is
|
|
17
|
+
# easier to read as route-shape groups.
|
|
18
|
+
LARGE_BATCH_THRESHOLD = 10
|
|
9
19
|
|
|
10
20
|
USAGE = <<~TXT
|
|
11
|
-
Usage: iriq
|
|
21
|
+
Usage: iriq [options] <input>
|
|
22
|
+
iriq [options] < text
|
|
23
|
+
iriq cluster [options] [file]
|
|
24
|
+
|
|
25
|
+
<input> may be an IRI, a file path (extracted automatically), or piped
|
|
26
|
+
text via stdin.
|
|
27
|
+
|
|
28
|
+
Sections (combine freely):
|
|
29
|
+
-n, --normalize Shape-normalized form
|
|
30
|
+
-p, --parse Parsed fields
|
|
12
31
|
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
classify <segment> Classify a single segment
|
|
18
|
-
cluster [file] Cluster identifiers from FILE or stdin (one per line)
|
|
19
|
-
help Show this message
|
|
20
|
-
version Print version
|
|
32
|
+
Corpus + stats:
|
|
33
|
+
--corpus PATH Load/create a JSON corpus; observe and save atomically.
|
|
34
|
+
-n becomes corpus-informed once it has data.
|
|
35
|
+
--stats Print rolling aggregates
|
|
21
36
|
|
|
22
|
-
|
|
23
|
-
-
|
|
24
|
-
-
|
|
37
|
+
Other:
|
|
38
|
+
-h, --help Show this message
|
|
39
|
+
-j, --json Emit JSON instead of human-readable output
|
|
40
|
+
-N, --no-hints Use {integer_id} placeholders instead of {user_id}
|
|
41
|
+
--no-scheme-less Skip foo.com/path extraction (explicit-scheme only)
|
|
42
|
+
-V, --version Print version
|
|
43
|
+
|
|
44
|
+
Subcommands:
|
|
45
|
+
cluster [file] Force cluster view (default for ≥10 IRIs anyway)
|
|
25
46
|
|
|
26
47
|
Examples:
|
|
27
|
-
iriq
|
|
28
|
-
iriq
|
|
29
|
-
|
|
48
|
+
iriq foo.com/users/456
|
|
49
|
+
iriq -n https://foo.com/users/123
|
|
50
|
+
iriq ./access.log # auto-detect file → extract URLs
|
|
51
|
+
cat README.md | iriq -n # one normalized URL per line
|
|
52
|
+
cat README.md | iriq --corpus c.json
|
|
30
53
|
TXT
|
|
31
54
|
|
|
32
55
|
attr_reader :stdin, :stdout, :stderr
|
|
@@ -41,16 +64,35 @@ module Iriq
|
|
|
41
64
|
def run(argv)
|
|
42
65
|
args, opts = parse_options(argv)
|
|
43
66
|
|
|
44
|
-
|
|
45
|
-
return
|
|
67
|
+
return print_usage(stdout, 0) if opts[:help]
|
|
68
|
+
return print_version if opts[:version]
|
|
69
|
+
|
|
70
|
+
explicit_cluster = (args.first == "cluster")
|
|
71
|
+
args.shift if explicit_cluster
|
|
72
|
+
|
|
73
|
+
# Auto-detect: a positional argument that isn't parseable as an IRI
|
|
74
|
+
# but IS an existing file gets treated as a file to extract from. This
|
|
75
|
+
# is what makes `iriq ./access.log` and `iriq /var/log/foo.log` Just
|
|
76
|
+
# Work without a separate --extract flag.
|
|
77
|
+
positional_is_file = args.first && File.file?(args.first) && !parseable_iri?(args.first)
|
|
78
|
+
|
|
79
|
+
batch_mode = explicit_cluster || positional_is_file ||
|
|
80
|
+
(args.empty? && piped_stdin?)
|
|
81
|
+
|
|
82
|
+
return print_usage(stdout, 0) if args.empty? && !batch_mode
|
|
83
|
+
|
|
84
|
+
corpus = opts[:corpus] ? load_corpus(opts[:corpus]) : nil
|
|
46
85
|
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
86
|
+
code = if batch_mode
|
|
87
|
+
cmd_batch(args, opts, corpus, explicit_cluster: explicit_cluster)
|
|
88
|
+
elsif opts[:stats]
|
|
89
|
+
cmd_stats(corpus, opts)
|
|
90
|
+
else
|
|
91
|
+
cmd_summary(args, opts, corpus)
|
|
51
92
|
end
|
|
52
93
|
|
|
53
|
-
|
|
94
|
+
corpus.save(opts[:corpus]) if corpus && opts[:corpus]
|
|
95
|
+
code
|
|
54
96
|
rescue Iriq::ParseError => e
|
|
55
97
|
stderr.puts "iriq: parse error: #{e.message}"
|
|
56
98
|
2
|
|
@@ -59,89 +101,178 @@ module Iriq
|
|
|
59
101
|
1
|
|
60
102
|
end
|
|
61
103
|
|
|
104
|
+
# Report whether +input+ can be parsed as an IRI.
#
# Returns true when Iriq.parse accepts the input and false when it raises
# Iriq::ParseError. Any other exception propagates to the caller.
def parseable_iri?(input)
  begin
    Iriq.parse(input)
  rescue Iriq::ParseError
    return false
  end

  true
end
|
|
110
|
+
|
|
62
111
|
private
|
|
63
112
|
|
|
64
113
|
def parse_options(argv)
|
|
65
|
-
opts = {
|
|
114
|
+
opts = {
|
|
115
|
+
json: false,
|
|
116
|
+
help: false,
|
|
117
|
+
version: false,
|
|
118
|
+
hints: true,
|
|
119
|
+
sections: [],
|
|
120
|
+
corpus: nil,
|
|
121
|
+
stats: false,
|
|
122
|
+
scheme_less: true,
|
|
123
|
+
}
|
|
66
124
|
parser = OptionParser.new do |o|
|
|
67
|
-
o.on("-
|
|
68
|
-
o.on("-
|
|
125
|
+
o.on("-p", "--parse") { opts[:sections] << :parse }
|
|
126
|
+
o.on("-n", "--normalize") { opts[:sections] << :normalize }
|
|
127
|
+
o.on("-j", "--json") { opts[:json] = true }
|
|
128
|
+
o.on("--[no-]hints") { |v| opts[:hints] = v }
|
|
129
|
+
o.on("-N") { opts[:hints] = false }
|
|
130
|
+
o.on("--corpus PATH") { |v| opts[:corpus] = v }
|
|
131
|
+
o.on("--stats") { opts[:stats] = true }
|
|
132
|
+
o.on("--[no-]scheme-less") { |v| opts[:scheme_less] = v }
|
|
133
|
+
o.on("-h", "--help") { opts[:help] = true }
|
|
134
|
+
o.on("-V", "--version") { opts[:version] = true }
|
|
69
135
|
end
|
|
70
136
|
args = parser.parse(argv)
|
|
71
137
|
[args, opts]
|
|
72
138
|
end
|
|
73
139
|
|
|
74
|
-
def
|
|
75
|
-
|
|
76
|
-
|
|
140
|
+
# Decide whether there is piped input waiting on stdin.
#
# A StringIO (the injection point used by specs) only counts as piped when
# it holds content, and is checked first. Otherwise an object that answers
# tty? is piped exactly when it is not a terminal; anything that cannot
# answer tty? is assumed to be piped.
def piped_stdin?
  return stdin.size.positive? if stdin.is_a?(StringIO)
  return true unless stdin.respond_to?(:tty?)

  !stdin.tty?
end
|
|
78
151
|
|
|
79
|
-
def
|
|
80
|
-
|
|
152
|
+
# Open the corpus at +path+ by delegating to Corpus.open and return
# whatever that call returns.
def load_corpus(path)
  Corpus.open(path)
end
|
|
81
155
|
|
|
82
|
-
|
|
83
|
-
|
|
156
|
+
# Write the USAGE text to +io+ and hand back +code+ unchanged, so callers
# can `return print_usage(io, code)` to print and exit in one step.
def print_usage(io, code)
  code.tap { io.puts USAGE }
end
|
|
85
160
|
|
|
86
|
-
def
|
|
161
|
+
# Print Iriq::VERSION on stdout. Returns exit code 0.
def print_version
  version = Iriq::VERSION
  stdout.puts(version)
  0
end
|
|
90
165
|
|
|
91
|
-
def
|
|
92
|
-
input
|
|
93
|
-
iri
|
|
94
|
-
|
|
166
|
+
# Single-input summary: parse one IRI and print the requested sections.
#
# args   - positional arguments; the first one is the IRI to parse.
# opts   - option hash from parse_options (:sections, :json, :hints).
# corpus - optional corpus; when present the IRI is observed and the
#          normalize section comes from corpus.normalize instead of
#          Normalizer.normalize_identifier.
#
# With no section flags, every section in SECTION_FLAGS is shown. The section
# list is de-duplicated so that repeating a flag (e.g. `-n -n`) neither prints
# a section twice nor defeats the "single section => bare JSON value" rule.
#
# Returns 0, or the result of missing(:input) when no input was given.
# Iriq.parse errors propagate to the caller.
def cmd_summary(args, opts, corpus)
  input = args.first
  return missing(:input) unless input

  iri = Iriq.parse(input)
  corpus&.observe(iri)

  requested = opts[:sections].empty? ? SECTION_FLAGS : opts[:sections]
  sections = requested.uniq

  data = {}
  data[:parse] = identifier_hash(iri) if sections.include?(:parse)
  if sections.include?(:normalize)
    data[:normalize] = corpus ? corpus.normalize(iri) : Normalizer.normalize_identifier(iri, hints: opts[:hints])
  end

  if opts[:json]
    # A single section is emitted as its bare value rather than a wrapper hash.
    payload = sections.size == 1 ? data.values.first : data
    stdout.puts JSON.generate(payload)
  else
    emit_sections(data, sections)
  end
  0
end
|
|
97
186
|
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
187
|
+
# Batch entry point, shared by the explicit `cluster` subcommand and implicit
# batch mode. The input is read as one blob of text and passed through the
# extractor, so a file listing URLs and a file of prose containing URLs are
# both accepted. When no corpus was supplied (no --corpus), a fresh Corpus.new
# is used for this run only.
#
# Output precedence: per-IRI sections when any section flag was given, then
# --stats, then clusters (explicit request, or at least LARGE_BATCH_THRESHOLD
# extracted IRIs), otherwise the plain URL list. Returns exit code 0.
def cmd_batch(args, opts, corpus, explicit_cluster: false)
  corpus ||= Corpus.new
  iris = extract_text(read_text(args.first), opts)
  corpus.batch do
    iris.each { |iri| corpus.observe(iri) }
  end

  # Clusters win when asked for explicitly, or when the input is big enough
  # that a cluster summary beats a long URL list.
  show_clusters = explicit_cluster || iris.size >= LARGE_BATCH_THRESHOLD

  if opts[:sections].any?
    emit_per_iri_sections(iris, opts)
  elsif opts[:stats]
    emit_stats(corpus, opts)
  elsif show_clusters
    emit_clusters(corpus.clusters, opts)
  else
    emit_url_list(iris, opts)
  end

  0
end
|
|
104
209
|
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
210
|
+
# Emit the requested sections (parse/normalize) for each
|
|
211
|
+
# extracted IRI. -n alone is the cleanest case: one line per URL.
|
|
212
|
+
def emit_per_iri_sections(iris, opts)
|
|
213
|
+
sections = opts[:sections]
|
|
214
|
+
payloads = iris.map { |iri| section_payload(iri, sections, opts) }
|
|
215
|
+
|
|
108
216
|
if opts[:json]
|
|
109
|
-
|
|
217
|
+
out = sections.size == 1 ? payloads.map(&:values).flatten(1) : payloads
|
|
218
|
+
stdout.puts JSON.generate(out)
|
|
219
|
+
elsif sections == [:normalize]
|
|
220
|
+
# Most common case — keep it tight: one URL per line, no headers.
|
|
221
|
+
payloads.each { |p| stdout.puts p[:normalize] }
|
|
110
222
|
else
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
stdout.
|
|
223
|
+
payloads.each_with_index do |p, i|
|
|
224
|
+
stdout.puts if i > 0
|
|
225
|
+
stdout.puts "# #{iris[i].canonical}"
|
|
226
|
+
sections.each_with_index do |sec, j|
|
|
227
|
+
stdout.puts if j > 0 # blank line between sections within one IRI
|
|
228
|
+
case sec
|
|
229
|
+
when :parse then emit_parse_human(p[:parse])
|
|
230
|
+
when :normalize then stdout.puts p[:normalize]
|
|
231
|
+
end
|
|
232
|
+
end
|
|
114
233
|
end
|
|
115
234
|
end
|
|
116
|
-
0
|
|
117
235
|
end
|
|
118
236
|
|
|
119
|
-
def
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
237
|
+
# Build the per-IRI payload hash for batch output.
#
# Only the sections named in +sections+ are filled in: :parse comes from
# identifier_hash, :normalize from Normalizer.normalize_identifier (honouring
# opts[:hints]). Keys are always inserted in parse-then-normalize order.
def section_payload(iri, sections, opts)
  {}.tap do |payload|
    payload[:parse] = identifier_hash(iri) if sections.include?(:parse)
    if sections.include?(:normalize)
      payload[:normalize] = Normalizer.normalize_identifier(iri, hints: opts[:hints])
    end
  end
end
|
|
125
243
|
|
|
126
|
-
def
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
lines.each do |line|
|
|
130
|
-
line = line.strip
|
|
131
|
-
next if line.empty?
|
|
244
|
+
# Run +text+ through an Extractor configured with opts[:scheme_less] and
# return whatever Extractor#extract yields for it.
def extract_text(text, opts)
  extractor = Extractor.new(scheme_less: opts[:scheme_less])
  extractor.extract(text)
end
|
|
132
247
|
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
248
|
+
# Emit a deduplicated list of IRIs with occurrence counts, sorted desc
|
|
249
|
+
# by count then by first-seen order. If every IRI is a singleton the
|
|
250
|
+
# `[1]` prefix is omitted — just print the URLs.
|
|
251
|
+
def emit_url_list(iris, opts)
|
|
252
|
+
counts = Hash.new(0)
|
|
253
|
+
first = {}
|
|
254
|
+
iris.each_with_index do |iri, i|
|
|
255
|
+
key = iri.canonical
|
|
256
|
+
counts[key] += 1
|
|
257
|
+
first[key] ||= i
|
|
258
|
+
end
|
|
259
|
+
|
|
260
|
+
sorted = counts.sort_by { |k, c| [-c, first[k]] }
|
|
261
|
+
|
|
262
|
+
if opts[:json]
|
|
263
|
+
stdout.puts JSON.generate(sorted.map { |k, c| { iri: k, count: c } })
|
|
264
|
+
elsif sorted.all? { |_, c| c == 1 }
|
|
265
|
+
sorted.each { |k, _| stdout.puts k }
|
|
266
|
+
else
|
|
267
|
+
sorted.each { |k, c| stdout.puts "[#{c}] #{k}" }
|
|
138
268
|
end
|
|
139
|
-
emit_clusters(clusterer.clusters, opts)
|
|
140
|
-
0
|
|
141
269
|
end
|
|
142
270
|
|
|
143
|
-
def
|
|
144
|
-
|
|
271
|
+
# Handle --stats outside batch mode: print the corpus aggregates.
#
# Returns 0 after emitting stats, or the result of missing("--corpus") when
# no corpus was loaded.
def cmd_stats(corpus, opts)
  if corpus
    emit_stats(corpus, opts)
    0
  else
    missing("--corpus")
  end
end
|
|
146
277
|
|
|
147
278
|
def missing(name)
|
|
@@ -157,49 +288,107 @@ module Iriq
|
|
|
157
288
|
end
|
|
158
289
|
end
|
|
159
290
|
|
|
160
|
-
def
|
|
161
|
-
if
|
|
162
|
-
|
|
163
|
-
original: iri.original,
|
|
164
|
-
kind: iri.kind,
|
|
165
|
-
scheme: iri.scheme,
|
|
166
|
-
host: iri.host,
|
|
167
|
-
port: iri.port,
|
|
168
|
-
path_segments: iri.path_segments,
|
|
169
|
-
query_params: iri.query_params,
|
|
170
|
-
fragment: iri.fragment,
|
|
171
|
-
nss: iri.nss,
|
|
172
|
-
canonical: iri.canonical,
|
|
173
|
-
)
|
|
291
|
+
def read_text(path)
|
|
292
|
+
if path.nil? || path == "-"
|
|
293
|
+
stdin.read
|
|
174
294
|
else
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
295
|
+
File.read(path)
|
|
296
|
+
end
|
|
297
|
+
end
|
|
298
|
+
|
|
299
|
+
# Build the compact hash used for parse output, in both JSON and human form.
#
# Each field is read from +iri+ in a fixed order; fields whose value is nil
# or an empty collection/string are left out, so URN output carries no empty
# host/path/query slots and URL output carries no null fragment/nss.
def identifier_hash(iri)
  fields = %i[original kind scheme host port path_segments query_params fragment nss canonical]

  fields.each_with_object({}) do |field, compact|
    value = iri.public_send(field)
    next if value.nil?
    next if value.respond_to?(:empty?) && value.empty?

    compact[field] = value
  end
end
|
|
316
|
+
|
|
317
|
+
def emit_sections(data, sections)
|
|
318
|
+
multi = sections.size > 1
|
|
319
|
+
sections.each_with_index do |sec, i|
|
|
320
|
+
stdout.puts if i > 0
|
|
321
|
+
stdout.puts "# #{sec}" if multi
|
|
322
|
+
case sec
|
|
323
|
+
when :parse then emit_parse_human(data[:parse])
|
|
324
|
+
when :normalize then stdout.puts data[:normalize]
|
|
183
325
|
end
|
|
184
|
-
stdout.puts "fragment: #{iri.fragment}" if iri.fragment
|
|
185
|
-
stdout.puts "nss: #{iri.nss}" if iri.nss
|
|
186
|
-
stdout.puts "canonical: #{iri.canonical}"
|
|
187
326
|
end
|
|
188
327
|
end
|
|
189
328
|
|
|
329
|
+
# Print a parse hash (as built by identifier_hash) on stdout as
# "label: value" lines.
#
# original, kind and canonical are always printed; the remaining fields are
# printed only when present. identifier_hash drops empty collections, so a
# URL with no path arrives here without a :path_segments key; fall back to an
# empty list so the line reads "path_segments: []" rather than
# "path_segments: nil".
#
# h - Hash with symbol keys (:original, :kind, :scheme, :host, :port,
#     :path_segments, :query_params, :fragment, :nss, :canonical).
def emit_parse_human(h)
  stdout.puts "original: #{h[:original]}"
  stdout.puts "kind: #{h[:kind]}"
  stdout.puts "scheme: #{h[:scheme]}" if h[:scheme]
  stdout.puts "host: #{h[:host]}" if h[:host]
  stdout.puts "port: #{h[:port]}" if h[:port]
  stdout.puts "path_segments: #{(h[:path_segments] || []).inspect}" if h[:kind] == :url
  stdout.puts "query_params: #{h[:query_params].inspect}" if h[:query_params] && !h[:query_params].empty?
  stdout.puts "fragment: #{h[:fragment]}" if h[:fragment]
  stdout.puts "nss: #{h[:nss]}" if h[:nss]
  stdout.puts "canonical: #{h[:canonical]}"
end
|
|
341
|
+
|
|
190
342
|
def emit_clusters(clusters, opts)
|
|
191
343
|
sorted = clusters.sort_by { |c| -c.count }
|
|
192
344
|
|
|
193
345
|
if opts[:json]
|
|
194
346
|
stdout.puts JSON.generate(sorted.map(&:to_h))
|
|
195
347
|
else
|
|
196
|
-
sorted.
|
|
197
|
-
|
|
198
|
-
|
|
348
|
+
sorted.each_with_index do |c, i|
|
|
349
|
+
stdout.puts if i > 0
|
|
350
|
+
host = c.host || "(urn)"
|
|
351
|
+
shape = opts[:hints] ? c.shape : raw_shape_for(c)
|
|
352
|
+
stdout.puts "[#{c.count}] #{host} #{shape}"
|
|
199
353
|
c.examples.first(3).each { |e| stdout.puts " #{e.canonical}" }
|
|
200
354
|
stdout.puts " + #{c.count - 3} more" if c.count > 3
|
|
201
355
|
end
|
|
202
356
|
end
|
|
203
357
|
end
|
|
358
|
+
|
|
359
|
+
# Shape string to display for +cluster+ when hints are turned off.
#
# Recomputes the shape from the cluster's first example via
# PathShape.for(..., hints: false); a cluster with no examples falls back to
# its stored shape.
def raw_shape_for(cluster)
  first_example = cluster.examples.first

  if first_example
    PathShape.for(first_example.path_segments, hints: false)
  else
    cluster.shape
  end
end
|
|
365
|
+
|
|
366
|
+
# Print the corpus aggregates: total observations (sum of the host counts),
# number of clusters, and the top hosts and shapes as selected by `top`.
#
# With opts[:json] the whole payload, including both hinted and raw shape
# tables, is written as one JSON document. Otherwise a human-readable report
# is printed, showing hinted shapes when opts[:hints] is set and raw shapes
# when it is not.
def emit_stats(corpus, opts)
  payload = {
    observations: corpus.host_counts.values.sum,
    clusters: corpus.size,
    hosts: top(corpus.host_counts),
    shapes: top(corpus.fingerprint_counts),
    raw_shapes: top(corpus.raw_shape_counts),
  }

  return stdout.puts(JSON.generate(payload)) if opts[:json]

  # One right-aligned count per row, followed by its label.
  print_rows = lambda do |rows|
    rows.each { |label, n| stdout.puts " #{n.to_s.rjust(6)} #{label}" }
  end

  stdout.puts "observations: #{payload[:observations]}"
  stdout.puts "clusters: #{payload[:clusters]}"
  stdout.puts
  stdout.puts "top hosts:"
  print_rows.call(payload[:hosts])
  stdout.puts
  stdout.puts "top shapes:"
  print_rows.call(opts[:hints] ? payload[:shapes] : payload[:raw_shapes])
end
|
|
389
|
+
|
|
390
|
+
# Return the TOP_N_STATS entries of +hash+ with the largest counts, as a new
# Hash ordered from highest to lowest count.
#
# sort_by alone is not stable, so entries with equal counts are explicitly
# tie-broken by their insertion order in +hash+. This keeps the report (and
# which entries survive the top-N cut) deterministic, the same way
# emit_url_list tie-breaks by first-seen order.
#
# hash - Hash mapping a label to a numeric count.
def top(hash)
  hash.each_with_index
      .sort_by { |(_, n), position| [-n, position] }
      .first(TOP_N_STATS)
      .map(&:first)
      .to_h
end
|
|
204
393
|
end
|
|
205
394
|
end
|
data/lib/iriq/cluster.rb
CHANGED
|
@@ -54,5 +54,52 @@ module Iriq
|
|
|
54
54
|
segments: segment_stats,
|
|
55
55
|
}
|
|
56
56
|
end
|
|
57
|
+
|
|
58
|
+
# Serialize this cluster to a string-keyed Hash suitable for JSON
# persistence. This is deliberately separate from #to_h, which is the
# display form. Examples are written as their canonical strings (from_dump
# parses them back), and nil entries in the per-position segment counts are
# written as empty hashes.
def dump
  segment_counts = @segment_counts.map { |counts| counts || {} }
  example_strings = examples.map(&:canonical)

  {
    "key" => key,
    "host" => host,
    "scheme" => scheme,
    "shape" => shape,
    "count" => count,
    "examples" => example_strings,
    "segment_counts" => segment_counts,
  }
end
|
|
72
|
+
|
|
73
|
+
# Rebuild a cluster from the Hash produced by #dump.
#
# The identifying fields go through the normal constructor; the count,
# examples (re-parsed from their canonical strings with Parser.parse) and
# per-position segment counts (restored as hashes defaulting to 0) are then
# written straight into the new instance.
def self.from_dump(h)
  new(key: h["key"], host: h["host"], scheme: h["scheme"], shape: h["shape"]).tap do |cluster|
    examples = h["examples"].map { |canonical| Parser.parse(canonical) }
    segment_counts = h["segment_counts"].map { |counts| Hash.new(0).merge(counts) }

    cluster.instance_variable_set(:@count, h["count"])
    cluster.instance_variable_set(:@examples, examples)
    cluster.instance_variable_set(:@segment_counts, segment_counts)
  end
end
|
|
80
|
+
|
|
81
|
+
# Derive the cluster key for +iri+. Returns [key, host, scheme, shape].
#
# URLs: the key is "scheme://host" followed by the path shape. A caller that
# already has a hinted shape may pass it as +shape+ to skip recomputing it.
#
# URNs: +shape+ is ignored. The NSS is split into namespace and value, the
# value's shape is derived with urn_value_shape, and the key doubles as the
# shape; host is nil and scheme is "urn".
def self.key_for(iri, classifier:, shape: nil)
  unless iri.urn?
    path_shape = shape || PathShape.new(classifier: classifier).for(iri.path_segments)
    return ["#{iri.scheme}://#{iri.host}#{path_shape}", iri.host, iri.scheme, path_shape]
  end

  ns, value = (iri.nss || "").split(":", 2)
  derived = urn_value_shape(ns, value, classifier) if value
  key = "urn:#{ns}:#{derived}"
  [key, nil, "urn", key]
end
|
|
97
|
+
|
|
98
|
+
# Shape of the value part of a URN.
#
# Feeds [ns, value] through SegmentHints.derive and looks at the last entry
# (the one for +value+): a variable entry becomes a "{...}" placeholder named
# after its hint, or its type when there is no hint; a non-variable entry is
# returned as its literal value.
def self.urn_value_shape(ns, value, classifier)
  entry = SegmentHints.derive([ns, value], classifier).last

  if entry[:variable]
    "{#{entry[:hint] || entry[:type]}}"
  else
    entry[:value]
  end
end
|
|
57
104
|
end
|
|
58
105
|
end
|
data/lib/iriq/clusterer.rb
CHANGED
|
@@ -3,31 +3,28 @@ module Iriq
|
|
|
3
3
|
# `clusters` to read out the groups. `explain` annotates a single identifier
|
|
4
4
|
# against the cluster it would fall into, including which positions are
|
|
5
5
|
# stable across all observed members.
|
|
6
|
+
#
|
|
7
|
+
# Implemented as a thin wrapper over Storage::Memory — the same code path
|
|
8
|
+
# Corpus uses for the cluster portion of its state, so there's only one
|
|
9
|
+
# place that knows how clusters get stored.
|
|
6
10
|
class Clusterer
|
|
7
|
-
def initialize(classifier: SegmentClassifier
|
|
11
|
+
def initialize(classifier: SegmentClassifier::DEFAULT)
|
|
8
12
|
@classifier = classifier
|
|
9
|
-
@
|
|
13
|
+
@storage = Storage::Memory.new(classifier: classifier)
|
|
10
14
|
end
|
|
11
15
|
|
|
12
|
-
def add(input)
|
|
16
|
+
def add(input, shape: nil)
|
|
13
17
|
iri = coerce(input)
|
|
14
|
-
key, host, scheme,
|
|
15
|
-
|
|
16
|
-
key: key,
|
|
17
|
-
host: host,
|
|
18
|
-
scheme: scheme,
|
|
19
|
-
shape: shape,
|
|
20
|
-
)
|
|
21
|
-
cluster.add(iri)
|
|
22
|
-
cluster
|
|
18
|
+
key, host, scheme, derived = Cluster.key_for(iri, classifier: @classifier, shape: shape)
|
|
19
|
+
@storage.add_to_cluster(key, host, scheme, derived, iri)
|
|
23
20
|
end
|
|
24
21
|
|
|
25
22
|
def clusters
|
|
26
|
-
@clusters
|
|
23
|
+
@storage.clusters
|
|
27
24
|
end
|
|
28
25
|
|
|
29
26
|
def size
|
|
30
|
-
@
|
|
27
|
+
@storage.cluster_size
|
|
31
28
|
end
|
|
32
29
|
|
|
33
30
|
# Returns a per-segment explanation for the input, merging classifier
|
|
@@ -36,42 +33,35 @@ module Iriq
|
|
|
36
33
|
# would otherwise call them variable).
|
|
37
34
|
def explain(input)
|
|
38
35
|
iri = coerce(input)
|
|
39
|
-
key, * =
|
|
40
|
-
cluster =
|
|
36
|
+
key, * = Cluster.key_for(iri, classifier: @classifier)
|
|
37
|
+
cluster = clusters.find { |c| c.key == key }
|
|
41
38
|
stats = cluster ? cluster.segment_stats : []
|
|
39
|
+
hinted = SegmentHints.derive(iri.path_segments, @classifier)
|
|
42
40
|
|
|
43
|
-
|
|
44
|
-
type = @classifier.classify(seg)
|
|
41
|
+
hinted.each_with_index.map do |entry, i|
|
|
45
42
|
stable = stats[i] && stats[i][:stable]
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
type: type,
|
|
49
|
-
variable: !stable && @classifier.variable?(type),
|
|
43
|
+
entry.merge(
|
|
44
|
+
variable: !stable && entry[:variable],
|
|
50
45
|
stable: !!stable,
|
|
51
|
-
|
|
46
|
+
)
|
|
52
47
|
end
|
|
53
48
|
end
|
|
54
49
|
|
|
50
|
+
# Serialize every cluster, keyed by cluster key, under a single "clusters"
# entry. Each value is that cluster's own #dump.
def dump
  by_key = clusters.map { |cluster| [cluster.key, cluster.dump] }.to_h
  { "clusters" => by_key }
end
|
|
53
|
+
|
|
54
|
+
# Rebuild a Clusterer from the Hash produced by #dump.
#
# Every dumped cluster is restored with Cluster.from_dump and the resulting
# key => cluster hash is written directly into the new clusterer's storage
# object (its @clusters instance variable).
def self.from_dump(h, classifier: SegmentClassifier::DEFAULT)
  clusterer = new(classifier: classifier)
  by_key = h["clusters"].transform_values { |cluster_dump| Cluster.from_dump(cluster_dump) }

  storage = clusterer.instance_variable_get(:@storage)
  storage.instance_variable_set(:@clusters, by_key)
  clusterer
end
|
|
60
|
+
|
|
55
61
|
private
|
|
56
62
|
|
|
57
63
|
# Accept either an already-parsed Identifier or raw input: Identifier
# instances pass through untouched, anything else goes to Parser.parse.
def coerce(input)
  return input if input.is_a?(Identifier)

  Parser.parse(input)
end
|
|
60
|
-
|
|
61
|
-
def cluster_key(iri)
|
|
62
|
-
if iri.urn?
|
|
63
|
-
ns, value = (iri.nss || "").split(":", 2)
|
|
64
|
-
shape = if value
|
|
65
|
-
type = @classifier.classify(value)
|
|
66
|
-
@classifier.variable?(type) ? "{#{type}}" : value
|
|
67
|
-
end
|
|
68
|
-
key = "urn:#{ns}:#{shape}"
|
|
69
|
-
[key, nil, "urn", key]
|
|
70
|
-
else
|
|
71
|
-
shape = PathShape.new(classifier: @classifier).for(iri.path_segments)
|
|
72
|
-
key = "#{iri.scheme}://#{iri.host}#{shape}"
|
|
73
|
-
[key, iri.host, iri.scheme, shape]
|
|
74
|
-
end
|
|
75
|
-
end
|
|
76
66
|
end
|
|
77
67
|
end
|