iriq 0.0.1 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/Gemfile.lock +2 -2
- data/README.md +227 -33
- data/lib/iriq/cli.rb +288 -100
- data/lib/iriq/cluster.rb +23 -0
- data/lib/iriq/clusterer.rb +32 -17
- data/lib/iriq/corpus.rb +268 -0
- data/lib/iriq/explanation.rb +6 -22
- data/lib/iriq/extractor.rb +125 -0
- data/lib/iriq/identifier.rb +11 -3
- data/lib/iriq/inflector.rb +145 -0
- data/lib/iriq/normalizer.rb +11 -8
- data/lib/iriq/observation.rb +25 -0
- data/lib/iriq/path_shape.rb +27 -9
- data/lib/iriq/position_stats.rb +64 -0
- data/lib/iriq/segment_classifier.rb +31 -7
- data/lib/iriq/segment_hints.rb +32 -0
- data/lib/iriq/version.rb +1 -1
- data/lib/iriq.rb +10 -0
- data/script/benchmark.rb +81 -0
- data/script/memory.rb +121 -0
- metadata +9 -1
data/lib/iriq/cli.rb
CHANGED
|
@@ -1,32 +1,55 @@
|
|
|
1
1
|
require "json"
|
|
2
2
|
require "optparse"
|
|
3
|
+
require "stringio"
|
|
3
4
|
|
|
4
5
|
module Iriq
|
|
5
|
-
#
|
|
6
|
-
#
|
|
6
|
+
# Flag-driven CLI. The default action for an input is a combined parse +
|
|
7
|
+
# normalize summary; the -p/-n flags select individual
|
|
8
|
+
# sections. The only subcommand is `cluster`, which is structurally
|
|
9
|
+
# different (many inputs, not one). Construct with explicit IO so specs
|
|
10
|
+
# can run it without shelling out.
|
|
7
11
|
class CLI
|
|
8
|
-
|
|
12
|
+
SECTION_FLAGS = %i[parse normalize].freeze
|
|
13
|
+
TOP_N_STATS = 10
|
|
14
|
+
|
|
15
|
+
# When extraction yields this many or more IRIs, the default pipe
|
|
16
|
+
# output switches from a URL list to clusters — a longer list is
|
|
17
|
+
# easier to read as route-shape groups.
|
|
18
|
+
LARGE_BATCH_THRESHOLD = 10
|
|
9
19
|
|
|
10
20
|
USAGE = <<~TXT
|
|
11
|
-
Usage: iriq
|
|
21
|
+
Usage: iriq [options] <input>
|
|
22
|
+
iriq [options] < text
|
|
23
|
+
iriq cluster [options] [file]
|
|
24
|
+
|
|
25
|
+
<input> may be an IRI, a file path (extracted automatically), or piped
|
|
26
|
+
text via stdin.
|
|
27
|
+
|
|
28
|
+
Sections (combine freely):
|
|
29
|
+
-n, --normalize Shape-normalized form
|
|
30
|
+
-p, --parse Parsed fields
|
|
12
31
|
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
classify <segment> Classify a single segment
|
|
18
|
-
cluster [file] Cluster identifiers from FILE or stdin (one per line)
|
|
19
|
-
help Show this message
|
|
20
|
-
version Print version
|
|
32
|
+
Corpus + stats:
|
|
33
|
+
--corpus PATH Load/create a JSON corpus; observe and save atomically.
|
|
34
|
+
-n becomes corpus-informed once it has data.
|
|
35
|
+
--stats Print rolling aggregates
|
|
21
36
|
|
|
22
|
-
|
|
23
|
-
-
|
|
24
|
-
-
|
|
37
|
+
Other:
|
|
38
|
+
-h, --help Show this message
|
|
39
|
+
-j, --json Emit JSON instead of human-readable output
|
|
40
|
+
-N, --no-hints Use {integer_id} placeholders instead of {user_id}
|
|
41
|
+
--no-scheme-less Skip foo.com/path extraction (explicit-scheme only)
|
|
42
|
+
-V, --version Print version
|
|
43
|
+
|
|
44
|
+
Subcommands:
|
|
45
|
+
cluster [file] Force cluster view (default for ≥10 IRIs anyway)
|
|
25
46
|
|
|
26
47
|
Examples:
|
|
27
|
-
iriq
|
|
28
|
-
iriq
|
|
29
|
-
|
|
48
|
+
iriq foo.com/users/456
|
|
49
|
+
iriq -n https://foo.com/users/123
|
|
50
|
+
iriq ./access.log # auto-detect file → extract URLs
|
|
51
|
+
cat README.md | iriq -n # one normalized URL per line
|
|
52
|
+
cat README.md | iriq --corpus c.json
|
|
30
53
|
TXT
|
|
31
54
|
|
|
32
55
|
attr_reader :stdin, :stdout, :stderr
|
|
@@ -41,16 +64,35 @@ module Iriq
|
|
|
41
64
|
def run(argv)
|
|
42
65
|
args, opts = parse_options(argv)
|
|
43
66
|
|
|
44
|
-
|
|
45
|
-
return
|
|
67
|
+
return print_usage(stdout, 0) if opts[:help]
|
|
68
|
+
return print_version if opts[:version]
|
|
69
|
+
|
|
70
|
+
explicit_cluster = (args.first == "cluster")
|
|
71
|
+
args.shift if explicit_cluster
|
|
72
|
+
|
|
73
|
+
# Auto-detect: a positional argument that isn't parseable as an IRI
|
|
74
|
+
# but IS an existing file gets treated as a file to extract from. This
|
|
75
|
+
# is what makes `iriq ./access.log` and `iriq /var/log/foo.log` Just
|
|
76
|
+
# Work without a separate --extract flag.
|
|
77
|
+
positional_is_file = args.first && File.file?(args.first) && !parseable_iri?(args.first)
|
|
78
|
+
|
|
79
|
+
batch_mode = explicit_cluster || positional_is_file ||
|
|
80
|
+
(args.empty? && piped_stdin?)
|
|
81
|
+
|
|
82
|
+
return print_usage(stdout, 0) if args.empty? && !batch_mode
|
|
83
|
+
|
|
84
|
+
corpus = opts[:corpus] ? load_corpus(opts[:corpus]) : nil
|
|
46
85
|
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
86
|
+
code = if batch_mode
|
|
87
|
+
cmd_batch(args, opts, corpus, explicit_cluster: explicit_cluster)
|
|
88
|
+
elsif opts[:stats]
|
|
89
|
+
cmd_stats(corpus, opts)
|
|
90
|
+
else
|
|
91
|
+
cmd_summary(args, opts, corpus)
|
|
51
92
|
end
|
|
52
93
|
|
|
53
|
-
|
|
94
|
+
corpus.save(opts[:corpus]) if corpus && opts[:corpus]
|
|
95
|
+
code
|
|
54
96
|
rescue Iriq::ParseError => e
|
|
55
97
|
stderr.puts "iriq: parse error: #{e.message}"
|
|
56
98
|
2
|
|
@@ -59,89 +101,180 @@ module Iriq
|
|
|
59
101
|
1
|
|
60
102
|
end
|
|
61
103
|
|
|
104
|
+
def parseable_iri?(input)
|
|
105
|
+
Iriq.parse(input)
|
|
106
|
+
true
|
|
107
|
+
rescue Iriq::ParseError
|
|
108
|
+
false
|
|
109
|
+
end
|
|
110
|
+
|
|
62
111
|
private
|
|
63
112
|
|
|
64
113
|
def parse_options(argv)
|
|
65
|
-
opts = {
|
|
114
|
+
opts = {
|
|
115
|
+
json: false,
|
|
116
|
+
help: false,
|
|
117
|
+
version: false,
|
|
118
|
+
hints: true,
|
|
119
|
+
sections: [],
|
|
120
|
+
corpus: nil,
|
|
121
|
+
stats: false,
|
|
122
|
+
scheme_less: true,
|
|
123
|
+
}
|
|
66
124
|
parser = OptionParser.new do |o|
|
|
67
|
-
o.on("-
|
|
68
|
-
o.on("-
|
|
125
|
+
o.on("-p", "--parse") { opts[:sections] << :parse }
|
|
126
|
+
o.on("-n", "--normalize") { opts[:sections] << :normalize }
|
|
127
|
+
o.on("-j", "--json") { opts[:json] = true }
|
|
128
|
+
o.on("--[no-]hints") { |v| opts[:hints] = v }
|
|
129
|
+
o.on("-N") { opts[:hints] = false }
|
|
130
|
+
o.on("--corpus PATH") { |v| opts[:corpus] = v }
|
|
131
|
+
o.on("--stats") { opts[:stats] = true }
|
|
132
|
+
o.on("--[no-]scheme-less") { |v| opts[:scheme_less] = v }
|
|
133
|
+
o.on("-h", "--help") { opts[:help] = true }
|
|
134
|
+
o.on("-V", "--version") { opts[:version] = true }
|
|
69
135
|
end
|
|
70
136
|
args = parser.parse(argv)
|
|
71
137
|
[args, opts]
|
|
72
138
|
end
|
|
73
139
|
|
|
74
|
-
def
|
|
75
|
-
|
|
76
|
-
|
|
140
|
+
def piped_stdin?
|
|
141
|
+
# StringIO is the test injection point; treat it as "piped" only when
|
|
142
|
+
# it actually has content. Real stdin: tty? tells us.
|
|
143
|
+
if stdin.is_a?(StringIO)
|
|
144
|
+
stdin.size.positive?
|
|
145
|
+
elsif stdin.respond_to?(:tty?)
|
|
146
|
+
!stdin.tty?
|
|
147
|
+
else
|
|
148
|
+
true
|
|
149
|
+
end
|
|
77
150
|
end
|
|
78
151
|
|
|
79
|
-
def
|
|
80
|
-
return
|
|
152
|
+
def load_corpus(path)
|
|
153
|
+
return Corpus.load(path) if File.exist?(path)
|
|
81
154
|
|
|
82
|
-
|
|
83
|
-
|
|
155
|
+
Corpus.new
|
|
156
|
+
end
|
|
157
|
+
|
|
158
|
+
def print_usage(io, code)
|
|
159
|
+
io.puts USAGE
|
|
160
|
+
code
|
|
84
161
|
end
|
|
85
162
|
|
|
86
|
-
def
|
|
163
|
+
def print_version
|
|
87
164
|
stdout.puts Iriq::VERSION
|
|
88
165
|
0
|
|
89
166
|
end
|
|
90
167
|
|
|
91
|
-
def
|
|
92
|
-
input
|
|
93
|
-
iri
|
|
94
|
-
|
|
168
|
+
def cmd_summary(args, opts, corpus)
|
|
169
|
+
input = args.first or return missing(:input)
|
|
170
|
+
iri = Iriq.parse(input)
|
|
171
|
+
corpus&.observe(iri)
|
|
172
|
+
sections = opts[:sections].empty? ? SECTION_FLAGS : opts[:sections]
|
|
173
|
+
|
|
174
|
+
data = {}
|
|
175
|
+
data[:parse] = identifier_hash(iri) if sections.include?(:parse)
|
|
176
|
+
if sections.include?(:normalize)
|
|
177
|
+
data[:normalize] = corpus ? corpus.normalize(iri) : Normalizer.normalize_identifier(iri, hints: opts[:hints])
|
|
178
|
+
end
|
|
179
|
+
|
|
180
|
+
if opts[:json]
|
|
181
|
+
payload = sections.size == 1 ? data.values.first : data
|
|
182
|
+
stdout.puts JSON.generate(payload)
|
|
183
|
+
else
|
|
184
|
+
emit_sections(data, sections)
|
|
185
|
+
end
|
|
95
186
|
0
|
|
96
187
|
end
|
|
97
188
|
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
189
|
+
# Used for the `cluster` subcommand and implicit piped batch mode. Reads
|
|
190
|
+
# the whole input as text and runs it through the extractor — so a file
|
|
191
|
+
# of URLs (one per line) and a file of prose with URLs both work. The
|
|
192
|
+
# corpus is ephemeral unless --corpus was given.
|
|
193
|
+
def cmd_batch(args, opts, corpus, explicit_cluster: false)
|
|
194
|
+
corpus ||= Corpus.new
|
|
195
|
+
iris = extract_text(read_text(args.first), opts)
|
|
196
|
+
iris.each { |iri| corpus.observe(iri) }
|
|
197
|
+
|
|
198
|
+
if opts[:sections].any?
|
|
199
|
+
emit_per_iri_sections(iris, opts)
|
|
200
|
+
elsif opts[:stats]
|
|
201
|
+
emit_stats(corpus, opts)
|
|
202
|
+
elsif explicit_cluster || iris.size >= LARGE_BATCH_THRESHOLD
|
|
203
|
+
# Either the user asked for clusters explicitly, or the input is
|
|
204
|
+
# big enough that the cluster summary beats a long URL list.
|
|
205
|
+
emit_clusters(corpus.clusters, opts)
|
|
206
|
+
else
|
|
207
|
+
emit_url_list(iris, opts)
|
|
208
|
+
end
|
|
102
209
|
0
|
|
103
210
|
end
|
|
104
211
|
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
212
|
+
# Emit the requested sections (parse/normalize) for each
|
|
213
|
+
# extracted IRI. -n alone is the cleanest case: one line per URL.
|
|
214
|
+
def emit_per_iri_sections(iris, opts)
|
|
215
|
+
sections = opts[:sections]
|
|
216
|
+
payloads = iris.map { |iri| section_payload(iri, sections, opts) }
|
|
217
|
+
|
|
108
218
|
if opts[:json]
|
|
109
|
-
|
|
219
|
+
out = sections.size == 1 ? payloads.map(&:values).flatten(1) : payloads
|
|
220
|
+
stdout.puts JSON.generate(out)
|
|
221
|
+
elsif sections == [:normalize]
|
|
222
|
+
# Most common case — keep it tight: one URL per line, no headers.
|
|
223
|
+
payloads.each { |p| stdout.puts p[:normalize] }
|
|
110
224
|
else
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
stdout.
|
|
225
|
+
payloads.each_with_index do |p, i|
|
|
226
|
+
stdout.puts if i > 0
|
|
227
|
+
stdout.puts "# #{iris[i].canonical}"
|
|
228
|
+
sections.each_with_index do |sec, j|
|
|
229
|
+
stdout.puts if j > 0 # blank line between sections within one IRI
|
|
230
|
+
case sec
|
|
231
|
+
when :parse then emit_parse_human(p[:parse])
|
|
232
|
+
when :normalize then stdout.puts p[:normalize]
|
|
233
|
+
end
|
|
234
|
+
end
|
|
114
235
|
end
|
|
115
236
|
end
|
|
116
|
-
0
|
|
117
237
|
end
|
|
118
238
|
|
|
119
|
-
def
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
239
|
+
def section_payload(iri, sections, opts)
|
|
240
|
+
data = {}
|
|
241
|
+
data[:parse] = identifier_hash(iri) if sections.include?(:parse)
|
|
242
|
+
data[:normalize] = Normalizer.normalize_identifier(iri, hints: opts[:hints]) if sections.include?(:normalize)
|
|
243
|
+
data
|
|
124
244
|
end
|
|
125
245
|
|
|
126
|
-
def
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
lines.each do |line|
|
|
130
|
-
line = line.strip
|
|
131
|
-
next if line.empty?
|
|
246
|
+
def extract_text(text, opts)
|
|
247
|
+
Extractor.new(scheme_less: opts[:scheme_less]).extract(text)
|
|
248
|
+
end
|
|
132
249
|
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
250
|
+
# Emit a deduplicated list of IRIs with occurrence counts, sorted desc
|
|
251
|
+
# by count then by first-seen order. If every IRI is a singleton the
|
|
252
|
+
# `[1]` prefix is omitted — just print the URLs.
|
|
253
|
+
def emit_url_list(iris, opts)
|
|
254
|
+
counts = Hash.new(0)
|
|
255
|
+
first = {}
|
|
256
|
+
iris.each_with_index do |iri, i|
|
|
257
|
+
key = iri.canonical
|
|
258
|
+
counts[key] += 1
|
|
259
|
+
first[key] ||= i
|
|
260
|
+
end
|
|
261
|
+
|
|
262
|
+
sorted = counts.sort_by { |k, c| [-c, first[k]] }
|
|
263
|
+
|
|
264
|
+
if opts[:json]
|
|
265
|
+
stdout.puts JSON.generate(sorted.map { |k, c| { iri: k, count: c } })
|
|
266
|
+
elsif sorted.all? { |_, c| c == 1 }
|
|
267
|
+
sorted.each { |k, _| stdout.puts k }
|
|
268
|
+
else
|
|
269
|
+
sorted.each { |k, c| stdout.puts "[#{c}] #{k}" }
|
|
138
270
|
end
|
|
139
|
-
emit_clusters(clusterer.clusters, opts)
|
|
140
|
-
0
|
|
141
271
|
end
|
|
142
272
|
|
|
143
|
-
def
|
|
144
|
-
|
|
273
|
+
def cmd_stats(corpus, opts)
|
|
274
|
+
return missing("--corpus") unless corpus
|
|
275
|
+
|
|
276
|
+
emit_stats(corpus, opts)
|
|
277
|
+
0
|
|
145
278
|
end
|
|
146
279
|
|
|
147
280
|
def missing(name)
|
|
@@ -157,49 +290,104 @@ module Iriq
|
|
|
157
290
|
end
|
|
158
291
|
end
|
|
159
292
|
|
|
160
|
-
def
|
|
161
|
-
if
|
|
162
|
-
|
|
163
|
-
original: iri.original,
|
|
164
|
-
kind: iri.kind,
|
|
165
|
-
scheme: iri.scheme,
|
|
166
|
-
host: iri.host,
|
|
167
|
-
port: iri.port,
|
|
168
|
-
path_segments: iri.path_segments,
|
|
169
|
-
query_params: iri.query_params,
|
|
170
|
-
fragment: iri.fragment,
|
|
171
|
-
nss: iri.nss,
|
|
172
|
-
canonical: iri.canonical,
|
|
173
|
-
)
|
|
293
|
+
def read_text(path)
|
|
294
|
+
if path.nil? || path == "-"
|
|
295
|
+
stdin.read
|
|
174
296
|
else
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
297
|
+
File.read(path)
|
|
298
|
+
end
|
|
299
|
+
end
|
|
300
|
+
|
|
301
|
+
def identifier_hash(iri)
|
|
302
|
+
{
|
|
303
|
+
original: iri.original,
|
|
304
|
+
kind: iri.kind,
|
|
305
|
+
scheme: iri.scheme,
|
|
306
|
+
host: iri.host,
|
|
307
|
+
port: iri.port,
|
|
308
|
+
path_segments: iri.path_segments,
|
|
309
|
+
query_params: iri.query_params,
|
|
310
|
+
fragment: iri.fragment,
|
|
311
|
+
nss: iri.nss,
|
|
312
|
+
canonical: iri.canonical,
|
|
313
|
+
}
|
|
314
|
+
end
|
|
315
|
+
|
|
316
|
+
def emit_sections(data, sections)
|
|
317
|
+
multi = sections.size > 1
|
|
318
|
+
sections.each_with_index do |sec, i|
|
|
319
|
+
stdout.puts if i > 0
|
|
320
|
+
stdout.puts "# #{sec}" if multi
|
|
321
|
+
case sec
|
|
322
|
+
when :parse then emit_parse_human(data[:parse])
|
|
323
|
+
when :normalize then stdout.puts data[:normalize]
|
|
183
324
|
end
|
|
184
|
-
stdout.puts "fragment: #{iri.fragment}" if iri.fragment
|
|
185
|
-
stdout.puts "nss: #{iri.nss}" if iri.nss
|
|
186
|
-
stdout.puts "canonical: #{iri.canonical}"
|
|
187
325
|
end
|
|
188
326
|
end
|
|
189
327
|
|
|
328
|
+
def emit_parse_human(h)
|
|
329
|
+
stdout.puts "original: #{h[:original]}"
|
|
330
|
+
stdout.puts "kind: #{h[:kind]}"
|
|
331
|
+
stdout.puts "scheme: #{h[:scheme]}" if h[:scheme]
|
|
332
|
+
stdout.puts "host: #{h[:host]}" if h[:host]
|
|
333
|
+
stdout.puts "port: #{h[:port]}" if h[:port]
|
|
334
|
+
stdout.puts "path_segments: #{h[:path_segments].inspect}" if h[:kind] == :url
|
|
335
|
+
stdout.puts "query_params: #{h[:query_params].inspect}" if h[:query_params] && !h[:query_params].empty?
|
|
336
|
+
stdout.puts "fragment: #{h[:fragment]}" if h[:fragment]
|
|
337
|
+
stdout.puts "nss: #{h[:nss]}" if h[:nss]
|
|
338
|
+
stdout.puts "canonical: #{h[:canonical]}"
|
|
339
|
+
end
|
|
340
|
+
|
|
190
341
|
def emit_clusters(clusters, opts)
|
|
191
342
|
sorted = clusters.sort_by { |c| -c.count }
|
|
192
343
|
|
|
193
344
|
if opts[:json]
|
|
194
345
|
stdout.puts JSON.generate(sorted.map(&:to_h))
|
|
195
346
|
else
|
|
196
|
-
sorted.
|
|
197
|
-
|
|
198
|
-
|
|
347
|
+
sorted.each_with_index do |c, i|
|
|
348
|
+
stdout.puts if i > 0
|
|
349
|
+
host = c.host || "(urn)"
|
|
350
|
+
shape = opts[:hints] ? c.shape : raw_shape_for(c)
|
|
351
|
+
stdout.puts "[#{c.count}] #{host} #{shape}"
|
|
199
352
|
c.examples.first(3).each { |e| stdout.puts " #{e.canonical}" }
|
|
200
353
|
stdout.puts " + #{c.count - 3} more" if c.count > 3
|
|
201
354
|
end
|
|
202
355
|
end
|
|
203
356
|
end
|
|
357
|
+
|
|
358
|
+
def raw_shape_for(cluster)
|
|
359
|
+
example = cluster.examples.first
|
|
360
|
+
return cluster.shape unless example
|
|
361
|
+
|
|
362
|
+
PathShape.for(example.path_segments, hints: false)
|
|
363
|
+
end
|
|
364
|
+
|
|
365
|
+
def emit_stats(corpus, opts)
|
|
366
|
+
payload = {
|
|
367
|
+
observations: corpus.host_counts.values.sum,
|
|
368
|
+
clusters: corpus.size,
|
|
369
|
+
hosts: top(corpus.host_counts),
|
|
370
|
+
shapes: top(corpus.fingerprint_counts),
|
|
371
|
+
raw_shapes: top(corpus.raw_shape_counts),
|
|
372
|
+
}
|
|
373
|
+
|
|
374
|
+
if opts[:json]
|
|
375
|
+
stdout.puts JSON.generate(payload)
|
|
376
|
+
else
|
|
377
|
+
stdout.puts "observations: #{payload[:observations]}"
|
|
378
|
+
stdout.puts "clusters: #{payload[:clusters]}"
|
|
379
|
+
stdout.puts
|
|
380
|
+
stdout.puts "top hosts:"
|
|
381
|
+
payload[:hosts].each { |h, n| stdout.puts " #{n.to_s.rjust(6)} #{h}" }
|
|
382
|
+
stdout.puts
|
|
383
|
+
stdout.puts "top shapes:"
|
|
384
|
+
shapes = opts[:hints] ? payload[:shapes] : payload[:raw_shapes]
|
|
385
|
+
shapes.each { |s, n| stdout.puts " #{n.to_s.rjust(6)} #{s}" }
|
|
386
|
+
end
|
|
387
|
+
end
|
|
388
|
+
|
|
389
|
+
def top(hash)
|
|
390
|
+
hash.sort_by { |_, n| -n }.first(TOP_N_STATS).to_h
|
|
391
|
+
end
|
|
204
392
|
end
|
|
205
393
|
end
|
data/lib/iriq/cluster.rb
CHANGED
|
@@ -54,5 +54,28 @@ module Iriq
|
|
|
54
54
|
segments: segment_stats,
|
|
55
55
|
}
|
|
56
56
|
end
|
|
57
|
+
|
|
58
|
+
# JSON-friendly dump for persistence (distinct from #to_h which is a
|
|
59
|
+
# display form). Examples are dumped as canonical strings and re-parsed
|
|
60
|
+
# on load.
|
|
61
|
+
def dump
|
|
62
|
+
{
|
|
63
|
+
"key" => key,
|
|
64
|
+
"host" => host,
|
|
65
|
+
"scheme" => scheme,
|
|
66
|
+
"shape" => shape,
|
|
67
|
+
"count" => count,
|
|
68
|
+
"examples" => examples.map(&:canonical),
|
|
69
|
+
"segment_counts" => @segment_counts.map { |h| h || {} },
|
|
70
|
+
}
|
|
71
|
+
end
|
|
72
|
+
|
|
73
|
+
def self.from_dump(h)
|
|
74
|
+
cluster = new(key: h["key"], host: h["host"], scheme: h["scheme"], shape: h["shape"])
|
|
75
|
+
cluster.instance_variable_set(:@count, h["count"])
|
|
76
|
+
cluster.instance_variable_set(:@examples, h["examples"].map { |s| Parser.parse(s) })
|
|
77
|
+
cluster.instance_variable_set(:@segment_counts, h["segment_counts"].map { |sub| Hash.new(0).merge(sub) })
|
|
78
|
+
cluster
|
|
79
|
+
end
|
|
57
80
|
end
|
|
58
81
|
end
|
data/lib/iriq/clusterer.rb
CHANGED
|
@@ -4,14 +4,14 @@ module Iriq
|
|
|
4
4
|
# against the cluster it would fall into, including which positions are
|
|
5
5
|
# stable across all observed members.
|
|
6
6
|
class Clusterer
|
|
7
|
-
def initialize(classifier: SegmentClassifier
|
|
7
|
+
def initialize(classifier: SegmentClassifier::DEFAULT)
|
|
8
8
|
@classifier = classifier
|
|
9
9
|
@clusters = {}
|
|
10
10
|
end
|
|
11
11
|
|
|
12
|
-
def add(input)
|
|
12
|
+
def add(input, shape: nil)
|
|
13
13
|
iri = coerce(input)
|
|
14
|
-
key, host, scheme, shape = cluster_key(iri)
|
|
14
|
+
key, host, scheme, shape = cluster_key(iri, shape: shape)
|
|
15
15
|
cluster = @clusters[key] ||= Cluster.new(
|
|
16
16
|
key: key,
|
|
17
17
|
host: host,
|
|
@@ -39,16 +39,14 @@ module Iriq
|
|
|
39
39
|
key, * = cluster_key(iri)
|
|
40
40
|
cluster = @clusters[key]
|
|
41
41
|
stats = cluster ? cluster.segment_stats : []
|
|
42
|
+
hinted = SegmentHints.derive(iri.path_segments, @classifier)
|
|
42
43
|
|
|
43
|
-
|
|
44
|
-
type = @classifier.classify(seg)
|
|
44
|
+
hinted.each_with_index.map do |entry, i|
|
|
45
45
|
stable = stats[i] && stats[i][:stable]
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
type: type,
|
|
49
|
-
variable: !stable && @classifier.variable?(type),
|
|
46
|
+
entry.merge(
|
|
47
|
+
variable: !stable && entry[:variable],
|
|
50
48
|
stable: !!stable,
|
|
51
|
-
|
|
49
|
+
)
|
|
52
50
|
end
|
|
53
51
|
end
|
|
54
52
|
|
|
@@ -58,20 +56,37 @@ module Iriq
|
|
|
58
56
|
input.is_a?(Identifier) ? input : Parser.parse(input)
|
|
59
57
|
end
|
|
60
58
|
|
|
61
|
-
def cluster_key(iri)
|
|
59
|
+
def cluster_key(iri, shape: nil)
|
|
62
60
|
if iri.urn?
|
|
63
61
|
ns, value = (iri.nss || "").split(":", 2)
|
|
64
|
-
shape =
|
|
65
|
-
|
|
66
|
-
@classifier.variable?(type) ? "{#{type}}" : value
|
|
67
|
-
end
|
|
68
|
-
key = "urn:#{ns}:#{shape}"
|
|
62
|
+
shape = value ? urn_value_shape(ns, value) : nil
|
|
63
|
+
key = "urn:#{ns}:#{shape}"
|
|
69
64
|
[key, nil, "urn", key]
|
|
70
65
|
else
|
|
71
|
-
shape
|
|
66
|
+
shape ||= PathShape.new(classifier: @classifier).for(iri.path_segments)
|
|
72
67
|
key = "#{iri.scheme}://#{iri.host}#{shape}"
|
|
73
68
|
[key, iri.host, iri.scheme, shape]
|
|
74
69
|
end
|
|
75
70
|
end
|
|
71
|
+
|
|
72
|
+
def urn_value_shape(ns, value)
|
|
73
|
+
entry = SegmentHints.derive([ns, value], @classifier).last
|
|
74
|
+
return entry[:value] unless entry[:variable]
|
|
75
|
+
|
|
76
|
+
"{#{entry[:hint] || entry[:type]}}"
|
|
77
|
+
end
|
|
78
|
+
|
|
79
|
+
public
|
|
80
|
+
|
|
81
|
+
def dump
|
|
82
|
+
{ "clusters" => @clusters.transform_values(&:dump) }
|
|
83
|
+
end
|
|
84
|
+
|
|
85
|
+
def self.from_dump(h, classifier: SegmentClassifier::DEFAULT)
|
|
86
|
+
c = new(classifier: classifier)
|
|
87
|
+
restored = h["clusters"].transform_values { |cdump| Cluster.from_dump(cdump) }
|
|
88
|
+
c.instance_variable_set(:@clusters, restored)
|
|
89
|
+
c
|
|
90
|
+
end
|
|
76
91
|
end
|
|
77
92
|
end
|