iriq 0.0.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/iriq/cli.rb CHANGED
@@ -1,32 +1,55 @@
1
1
  require "json"
2
2
  require "optparse"
3
+ require "stringio"
3
4
 
4
5
  module Iriq
5
- # Tiny CLI wrapper around the public API. Construct with explicit IO so
6
- # specs can run it without shelling out.
6
+ # Flag-driven CLI. The default action for an input is a combined parse +
7
+ # normalize summary; the -p/-n flags select individual
8
+ # sections. The only subcommand is `cluster`, which is structurally
9
+ # different (many inputs, not one). Construct with explicit IO so specs
10
+ # can run it without shelling out.
7
11
  class CLI
8
- COMMANDS = %w[parse normalize explain classify cluster help version].freeze
12
+ SECTION_FLAGS = %i[parse normalize].freeze
13
+ TOP_N_STATS = 10
14
+
15
+ # When extraction yields this many or more IRIs, the default pipe
16
+ # output switches from a URL list to clusters — a longer list is
17
+ # easier to read as route-shape groups.
18
+ LARGE_BATCH_THRESHOLD = 10
9
19
 
10
20
  USAGE = <<~TXT
11
- Usage: iriq <command> [options] [args]
21
+ Usage: iriq [options] <input>
22
+ iriq [options] < text
23
+ iriq cluster [options] [file]
24
+
25
+ <input> may be an IRI, a file path (extracted automatically), or piped
26
+ text via stdin.
27
+
28
+ Sections (combine freely):
29
+ -n, --normalize Shape-normalized form
30
+ -p, --parse Parsed fields
12
31
 
13
- Commands:
14
- parse <input> Parse an identifier and print its fields
15
- normalize <input> Print the shape-normalized form
16
- explain <input> Annotate each path segment
17
- classify <segment> Classify a single segment
18
- cluster [file] Cluster identifiers from FILE or stdin (one per line)
19
- help Show this message
20
- version Print version
32
+ Corpus + stats:
33
+ --corpus PATH Load/create a JSON corpus; observe and save atomically.
34
+ -n becomes corpus-informed once it has data.
35
+ --stats Print rolling aggregates
21
36
 
22
- Options:
23
- -j, --json Emit JSON instead of human-readable output
24
- -h, --help Show this message
37
+ Other:
38
+ -h, --help Show this message
39
+ -j, --json Emit JSON instead of human-readable output
40
+ -N, --no-hints Use {integer_id} placeholders instead of {user_id}
41
+ --no-scheme-less Skip foo.com/path extraction (explicit-scheme only)
42
+ -V, --version Print version
43
+
44
+ Subcommands:
45
+ cluster [file] Force cluster view (default for ≥10 IRIs anyway)
25
46
 
26
47
  Examples:
27
- iriq parse https://foo.com/users/123
28
- iriq normalize foo.com/users/456
29
- echo "https://foo.com/users/1\\nhttps://foo.com/users/2" | iriq cluster
48
+ iriq foo.com/users/456
49
+ iriq -n https://foo.com/users/123
50
+ iriq ./access.log # auto-detect file → extract URLs
51
+ cat README.md | iriq -n # one normalized URL per line
52
+ cat README.md | iriq --corpus c.json
30
53
  TXT
31
54
 
32
55
  attr_reader :stdin, :stdout, :stderr
@@ -41,16 +64,35 @@ module Iriq
41
64
  def run(argv)
42
65
  args, opts = parse_options(argv)
43
66
 
44
- cmd = args.shift
45
- return print_usage(stdout, 0) if cmd.nil? || cmd == "help" || opts[:help]
67
+ return print_usage(stdout, 0) if opts[:help]
68
+ return print_version if opts[:version]
69
+
70
+ explicit_cluster = (args.first == "cluster")
71
+ args.shift if explicit_cluster
72
+
73
+ # Auto-detect: a positional argument that isn't parseable as an IRI
74
+ # but IS an existing file gets treated as a file to extract from. This
75
+ # is what makes `iriq ./access.log` and `iriq /var/log/foo.log` Just
76
+ # Work without a separate --extract flag.
77
+ positional_is_file = args.first && File.file?(args.first) && !parseable_iri?(args.first)
78
+
79
+ batch_mode = explicit_cluster || positional_is_file ||
80
+ (args.empty? && piped_stdin?)
81
+
82
+ return print_usage(stdout, 0) if args.empty? && !batch_mode
83
+
84
+ corpus = opts[:corpus] ? load_corpus(opts[:corpus]) : nil
46
85
 
47
- unless COMMANDS.include?(cmd)
48
- stderr.puts "iriq: unknown command #{cmd.inspect}"
49
- print_usage(stderr, 1)
50
- return 1
86
+ code = if batch_mode
87
+ cmd_batch(args, opts, corpus, explicit_cluster: explicit_cluster)
88
+ elsif opts[:stats]
89
+ cmd_stats(corpus, opts)
90
+ else
91
+ cmd_summary(args, opts, corpus)
51
92
  end
52
93
 
53
- send("cmd_#{cmd}", args, opts)
94
+ corpus.save(opts[:corpus]) if corpus && opts[:corpus]
95
+ code
54
96
  rescue Iriq::ParseError => e
55
97
  stderr.puts "iriq: parse error: #{e.message}"
56
98
  2
@@ -59,89 +101,178 @@ module Iriq
59
101
  1
60
102
  end
61
103
 
104
+ def parseable_iri?(input)
105
+ Iriq.parse(input)
106
+ true
107
+ rescue Iriq::ParseError
108
+ false
109
+ end
110
+
62
111
  private
63
112
 
64
113
  def parse_options(argv)
65
- opts = { json: false, help: false }
114
+ opts = {
115
+ json: false,
116
+ help: false,
117
+ version: false,
118
+ hints: true,
119
+ sections: [],
120
+ corpus: nil,
121
+ stats: false,
122
+ scheme_less: true,
123
+ }
66
124
  parser = OptionParser.new do |o|
67
- o.on("-j", "--json") { opts[:json] = true }
68
- o.on("-h", "--help") { opts[:help] = true }
125
+ o.on("-p", "--parse") { opts[:sections] << :parse }
126
+ o.on("-n", "--normalize") { opts[:sections] << :normalize }
127
+ o.on("-j", "--json") { opts[:json] = true }
128
+ o.on("--[no-]hints") { |v| opts[:hints] = v }
129
+ o.on("-N") { opts[:hints] = false }
130
+ o.on("--corpus PATH") { |v| opts[:corpus] = v }
131
+ o.on("--stats") { opts[:stats] = true }
132
+ o.on("--[no-]scheme-less") { |v| opts[:scheme_less] = v }
133
+ o.on("-h", "--help") { opts[:help] = true }
134
+ o.on("-V", "--version") { opts[:version] = true }
69
135
  end
70
136
  args = parser.parse(argv)
71
137
  [args, opts]
72
138
  end
73
139
 
74
- def print_usage(io, code)
75
- io.puts USAGE
76
- code
140
+ def piped_stdin?
141
+ # StringIO is the test injection point; treat it as "piped" only when
142
+ # it actually has content. Real stdin: tty? tells us.
143
+ if stdin.is_a?(StringIO)
144
+ stdin.size.positive?
145
+ elsif stdin.respond_to?(:tty?)
146
+ !stdin.tty?
147
+ else
148
+ true
149
+ end
77
150
  end
78
151
 
79
- def require_arg!(args, name)
80
- return args.first if args.first
152
+ def load_corpus(path)
153
+ Corpus.open(path)
154
+ end
81
155
 
82
- stderr.puts "iriq: missing argument <#{name}>"
83
- throw :missing_arg, 1
156
+ def print_usage(io, code)
157
+ io.puts USAGE
158
+ code
84
159
  end
85
160
 
86
- def cmd_version(_args, _opts)
161
+ def print_version
87
162
  stdout.puts Iriq::VERSION
88
163
  0
89
164
  end
90
165
 
91
- def cmd_parse(args, opts)
92
- input = args.first or return missing(:input)
93
- iri = Iriq.parse(input)
94
- emit_parse(iri, opts)
166
+ def cmd_summary(args, opts, corpus)
167
+ input = args.first or return missing(:input)
168
+ iri = Iriq.parse(input)
169
+ corpus&.observe(iri)
170
+ sections = opts[:sections].empty? ? SECTION_FLAGS : opts[:sections]
171
+
172
+ data = {}
173
+ data[:parse] = identifier_hash(iri) if sections.include?(:parse)
174
+ if sections.include?(:normalize)
175
+ data[:normalize] = corpus ? corpus.normalize(iri) : Normalizer.normalize_identifier(iri, hints: opts[:hints])
176
+ end
177
+
178
+ if opts[:json]
179
+ payload = sections.size == 1 ? data.values.first : data
180
+ stdout.puts JSON.generate(payload)
181
+ else
182
+ emit_sections(data, sections)
183
+ end
95
184
  0
96
185
  end
97
186
 
98
- def cmd_normalize(args, opts)
99
- input = args.first or return missing(:input)
100
- out = Iriq.normalize(input)
101
- opts[:json] ? stdout.puts(JSON.generate(normalized: out)) : stdout.puts(out)
187
+ # Used for the `cluster` subcommand, auto-detected file arguments, and implicit piped batch mode. Reads
188
+ # the whole input as text and runs it through the extractor — so a file
189
+ # of URLs (one per line) and a file of prose with URLs both work. The
190
+ # corpus is ephemeral unless --corpus was given.
191
+ def cmd_batch(args, opts, corpus, explicit_cluster: false)
192
+ corpus ||= Corpus.new
193
+ iris = extract_text(read_text(args.first), opts)
194
+ corpus.batch { iris.each { |iri| corpus.observe(iri) } }
195
+
196
+ if opts[:sections].any?
197
+ emit_per_iri_sections(iris, opts)
198
+ elsif opts[:stats]
199
+ emit_stats(corpus, opts)
200
+ elsif explicit_cluster || iris.size >= LARGE_BATCH_THRESHOLD
201
+ # Either the user asked for clusters explicitly, or the input is
202
+ # big enough that the cluster summary beats a long URL list.
203
+ emit_clusters(corpus.clusters, opts)
204
+ else
205
+ emit_url_list(iris, opts)
206
+ end
102
207
  0
103
208
  end
104
209
 
105
- def cmd_explain(args, opts)
106
- input = args.first or return missing(:input)
107
- rows = Iriq.explain(input)
210
+ # Emit the requested sections (parse/normalize) for each
211
+ # extracted IRI. -n alone is the cleanest case: one line per URL.
212
+ def emit_per_iri_sections(iris, opts)
213
+ sections = opts[:sections]
214
+ payloads = iris.map { |iri| section_payload(iri, sections, opts) }
215
+
108
216
  if opts[:json]
109
- stdout.puts JSON.generate(rows)
217
+ out = sections.size == 1 ? payloads.map(&:values).flatten(1) : payloads
218
+ stdout.puts JSON.generate(out)
219
+ elsif sections == [:normalize]
220
+ # Most common case — keep it tight: one URL per line, no headers.
221
+ payloads.each { |p| stdout.puts p[:normalize] }
110
222
  else
111
- rows.each do |r|
112
- mark = r[:variable] ? "*" : " "
113
- stdout.printf("%s %-12s %s\n", mark, r[:type], r[:value])
223
+ payloads.each_with_index do |p, i|
224
+ stdout.puts if i > 0
225
+ stdout.puts "# #{iris[i].canonical}"
226
+ sections.each_with_index do |sec, j|
227
+ stdout.puts if j > 0 # blank line between sections within one IRI
228
+ case sec
229
+ when :parse then emit_parse_human(p[:parse])
230
+ when :normalize then stdout.puts p[:normalize]
231
+ end
232
+ end
114
233
  end
115
234
  end
116
- 0
117
235
  end
118
236
 
119
- def cmd_classify(args, opts)
120
- seg = args.first or return missing(:segment)
121
- type = SegmentClassifier.new.classify(seg)
122
- opts[:json] ? stdout.puts(JSON.generate(value: seg, type: type)) : stdout.puts(type)
123
- 0
237
+ def section_payload(iri, sections, opts)
238
+ data = {}
239
+ data[:parse] = identifier_hash(iri) if sections.include?(:parse)
240
+ data[:normalize] = Normalizer.normalize_identifier(iri, hints: opts[:hints]) if sections.include?(:normalize)
241
+ data
124
242
  end
125
243
 
126
- def cmd_cluster(args, opts)
127
- lines = read_input(args.first)
128
- clusterer = Clusterer.new
129
- lines.each do |line|
130
- line = line.strip
131
- next if line.empty?
244
+ def extract_text(text, opts)
245
+ Extractor.new(scheme_less: opts[:scheme_less]).extract(text)
246
+ end
132
247
 
133
- begin
134
- clusterer.add(line)
135
- rescue Iriq::ParseError => e
136
- stderr.puts "iriq: skipped #{line.inspect}: #{e.message}"
137
- end
248
+ # Emit a deduplicated list of IRIs with occurrence counts, sorted desc
249
+ # by count then by first-seen order. If every IRI is a singleton the
250
+ # `[1]` prefix is omitted — just print the URLs.
251
+ def emit_url_list(iris, opts)
252
+ counts = Hash.new(0)
253
+ first = {}
254
+ iris.each_with_index do |iri, i|
255
+ key = iri.canonical
256
+ counts[key] += 1
257
+ first[key] ||= i
258
+ end
259
+
260
+ sorted = counts.sort_by { |k, c| [-c, first[k]] }
261
+
262
+ if opts[:json]
263
+ stdout.puts JSON.generate(sorted.map { |k, c| { iri: k, count: c } })
264
+ elsif sorted.all? { |_, c| c == 1 }
265
+ sorted.each { |k, _| stdout.puts k }
266
+ else
267
+ sorted.each { |k, c| stdout.puts "[#{c}] #{k}" }
138
268
  end
139
- emit_clusters(clusterer.clusters, opts)
140
- 0
141
269
  end
142
270
 
143
- def cmd_help(_args, _opts)
144
- print_usage(stdout, 0)
271
+ def cmd_stats(corpus, opts)
272
+ return missing("--corpus") unless corpus
273
+
274
+ emit_stats(corpus, opts)
275
+ 0
145
276
  end
146
277
 
147
278
  def missing(name)
@@ -157,49 +288,107 @@ module Iriq
157
288
  end
158
289
  end
159
290
 
160
- def emit_parse(iri, opts)
161
- if opts[:json]
162
- stdout.puts JSON.generate(
163
- original: iri.original,
164
- kind: iri.kind,
165
- scheme: iri.scheme,
166
- host: iri.host,
167
- port: iri.port,
168
- path_segments: iri.path_segments,
169
- query_params: iri.query_params,
170
- fragment: iri.fragment,
171
- nss: iri.nss,
172
- canonical: iri.canonical,
173
- )
291
+ def read_text(path)
292
+ if path.nil? || path == "-"
293
+ stdin.read
174
294
  else
175
- stdout.puts "original: #{iri.original}"
176
- stdout.puts "kind: #{iri.kind}"
177
- stdout.puts "scheme: #{iri.scheme}" if iri.scheme
178
- stdout.puts "host: #{iri.host}" if iri.host
179
- stdout.puts "port: #{iri.port}" if iri.port
180
- stdout.puts "path_segments: #{iri.path_segments.inspect}" if iri.url?
181
- unless iri.query_params.empty?
182
- stdout.puts "query_params: #{iri.query_params.inspect}"
295
+ File.read(path)
296
+ end
297
+ end
298
+
299
+ # Compact identifier hash for parse output (both JSON and human). Drops
300
+ # nil values and empty collections so URN dumps don't carry empty
301
+ # host/path/query slots, and URL dumps don't include null fragment/nss.
302
+ def identifier_hash(iri)
303
+ {
304
+ original: iri.original,
305
+ kind: iri.kind,
306
+ scheme: iri.scheme,
307
+ host: iri.host,
308
+ port: iri.port,
309
+ path_segments: iri.path_segments,
310
+ query_params: iri.query_params,
311
+ fragment: iri.fragment,
312
+ nss: iri.nss,
313
+ canonical: iri.canonical,
314
+ }.reject { |_, v| v.nil? || (v.respond_to?(:empty?) && v.empty?) }
315
+ end
316
+
317
+ def emit_sections(data, sections)
318
+ multi = sections.size > 1
319
+ sections.each_with_index do |sec, i|
320
+ stdout.puts if i > 0
321
+ stdout.puts "# #{sec}" if multi
322
+ case sec
323
+ when :parse then emit_parse_human(data[:parse])
324
+ when :normalize then stdout.puts data[:normalize]
183
325
  end
184
- stdout.puts "fragment: #{iri.fragment}" if iri.fragment
185
- stdout.puts "nss: #{iri.nss}" if iri.nss
186
- stdout.puts "canonical: #{iri.canonical}"
187
326
  end
188
327
  end
189
328
 
329
+ def emit_parse_human(h)
330
+ stdout.puts "original: #{h[:original]}"
331
+ stdout.puts "kind: #{h[:kind]}"
332
+ stdout.puts "scheme: #{h[:scheme]}" if h[:scheme]
333
+ stdout.puts "host: #{h[:host]}" if h[:host]
334
+ stdout.puts "port: #{h[:port]}" if h[:port]
335
+ stdout.puts "path_segments: #{h[:path_segments].inspect}" if h[:kind] == :url
336
+ stdout.puts "query_params: #{h[:query_params].inspect}" if h[:query_params] && !h[:query_params].empty?
337
+ stdout.puts "fragment: #{h[:fragment]}" if h[:fragment]
338
+ stdout.puts "nss: #{h[:nss]}" if h[:nss]
339
+ stdout.puts "canonical: #{h[:canonical]}"
340
+ end
341
+
190
342
  def emit_clusters(clusters, opts)
191
343
  sorted = clusters.sort_by { |c| -c.count }
192
344
 
193
345
  if opts[:json]
194
346
  stdout.puts JSON.generate(sorted.map(&:to_h))
195
347
  else
196
- sorted.each do |c|
197
- host = c.host || "(urn)"
198
- stdout.puts "[#{c.count}] #{host} #{c.shape}"
348
+ sorted.each_with_index do |c, i|
349
+ stdout.puts if i > 0
350
+ host = c.host || "(urn)"
351
+ shape = opts[:hints] ? c.shape : raw_shape_for(c)
352
+ stdout.puts "[#{c.count}] #{host} #{shape}"
199
353
  c.examples.first(3).each { |e| stdout.puts " #{e.canonical}" }
200
354
  stdout.puts " + #{c.count - 3} more" if c.count > 3
201
355
  end
202
356
  end
203
357
  end
358
+
359
+ def raw_shape_for(cluster)
360
+ example = cluster.examples.first
361
+ return cluster.shape unless example
362
+
363
+ PathShape.for(example.path_segments, hints: false)
364
+ end
365
+
366
+ def emit_stats(corpus, opts)
367
+ payload = {
368
+ observations: corpus.host_counts.values.sum,
369
+ clusters: corpus.size,
370
+ hosts: top(corpus.host_counts),
371
+ shapes: top(corpus.fingerprint_counts),
372
+ raw_shapes: top(corpus.raw_shape_counts),
373
+ }
374
+
375
+ if opts[:json]
376
+ stdout.puts JSON.generate(payload)
377
+ else
378
+ stdout.puts "observations: #{payload[:observations]}"
379
+ stdout.puts "clusters: #{payload[:clusters]}"
380
+ stdout.puts
381
+ stdout.puts "top hosts:"
382
+ payload[:hosts].each { |h, n| stdout.puts " #{n.to_s.rjust(6)} #{h}" }
383
+ stdout.puts
384
+ stdout.puts "top shapes:"
385
+ shapes = opts[:hints] ? payload[:shapes] : payload[:raw_shapes]
386
+ shapes.each { |s, n| stdout.puts " #{n.to_s.rjust(6)} #{s}" }
387
+ end
388
+ end
389
+
390
+ def top(hash)
391
+ hash.sort_by { |_, n| -n }.first(TOP_N_STATS).to_h
392
+ end
204
393
  end
205
394
  end
data/lib/iriq/cluster.rb CHANGED
@@ -54,5 +54,52 @@ module Iriq
54
54
  segments: segment_stats,
55
55
  }
56
56
  end
57
+
58
+ # JSON-friendly dump for persistence (distinct from #to_h which is a
59
+ # display form). Examples are dumped as canonical strings and re-parsed
60
+ # on load.
61
+ def dump
62
+ {
63
+ "key" => key,
64
+ "host" => host,
65
+ "scheme" => scheme,
66
+ "shape" => shape,
67
+ "count" => count,
68
+ "examples" => examples.map(&:canonical),
69
+ "segment_counts" => @segment_counts.map { |h| h || {} },
70
+ }
71
+ end
72
+
73
+ def self.from_dump(h)
74
+ cluster = new(key: h["key"], host: h["host"], scheme: h["scheme"], shape: h["shape"])
75
+ cluster.instance_variable_set(:@count, h["count"])
76
+ cluster.instance_variable_set(:@examples, h["examples"].map { |s| Parser.parse(s) })
77
+ cluster.instance_variable_set(:@segment_counts, h["segment_counts"].map { |sub| Hash.new(0).merge(sub) })
78
+ cluster
79
+ end
80
+
81
+ # Shared cluster-key derivation. Returns [key, host, scheme, shape] —
82
+ # callers that already have a hinted shape can pass it in to skip the
83
+ # recomputation; URN inputs ignore the override and always derive their
84
+ # own shape from the NSS value.
85
+ def self.key_for(iri, classifier:, shape: nil)
86
+ if iri.urn?
87
+ ns, value = (iri.nss || "").split(":", 2)
88
+ derived = value ? urn_value_shape(ns, value, classifier) : nil
89
+ key = "urn:#{ns}:#{derived}"
90
+ [key, nil, "urn", key]
91
+ else
92
+ shape ||= PathShape.new(classifier: classifier).for(iri.path_segments)
93
+ key = "#{iri.scheme}://#{iri.host}#{shape}"
94
+ [key, iri.host, iri.scheme, shape]
95
+ end
96
+ end
97
+
98
+ def self.urn_value_shape(ns, value, classifier)
99
+ entry = SegmentHints.derive([ns, value], classifier).last
100
+ return entry[:value] unless entry[:variable]
101
+
102
+ "{#{entry[:hint] || entry[:type]}}"
103
+ end
57
104
  end
58
105
  end
@@ -3,31 +3,28 @@ module Iriq
3
3
  # `clusters` to read out the groups. `explain` annotates a single identifier
4
4
  # against the cluster it would fall into, including which positions are
5
5
  # stable across all observed members.
6
+ #
7
+ # Implemented as a thin wrapper over Storage::Memory — the same code path
8
+ # Corpus uses for the cluster portion of its state, so there's only one
9
+ # place that knows how clusters get stored.
6
10
  class Clusterer
7
- def initialize(classifier: SegmentClassifier.new)
11
+ def initialize(classifier: SegmentClassifier::DEFAULT)
8
12
  @classifier = classifier
9
- @clusters = {}
13
+ @storage = Storage::Memory.new(classifier: classifier)
10
14
  end
11
15
 
12
- def add(input)
16
+ def add(input, shape: nil)
13
17
  iri = coerce(input)
14
- key, host, scheme, shape = cluster_key(iri)
15
- cluster = @clusters[key] ||= Cluster.new(
16
- key: key,
17
- host: host,
18
- scheme: scheme,
19
- shape: shape,
20
- )
21
- cluster.add(iri)
22
- cluster
18
+ key, host, scheme, derived = Cluster.key_for(iri, classifier: @classifier, shape: shape)
19
+ @storage.add_to_cluster(key, host, scheme, derived, iri)
23
20
  end
24
21
 
25
22
  def clusters
26
- @clusters.values
23
+ @storage.clusters
27
24
  end
28
25
 
29
26
  def size
30
- @clusters.size
27
+ @storage.cluster_size
31
28
  end
32
29
 
33
30
  # Returns a per-segment explanation for the input, merging classifier
@@ -36,42 +33,35 @@ module Iriq
36
33
  # would otherwise call them variable).
37
34
  def explain(input)
38
35
  iri = coerce(input)
39
- key, * = cluster_key(iri)
40
- cluster = @clusters[key]
36
+ key, * = Cluster.key_for(iri, classifier: @classifier)
37
+ cluster = clusters.find { |c| c.key == key }
41
38
  stats = cluster ? cluster.segment_stats : []
39
+ hinted = SegmentHints.derive(iri.path_segments, @classifier)
42
40
 
43
- iri.path_segments.each_with_index.map do |seg, i|
44
- type = @classifier.classify(seg)
41
+ hinted.each_with_index.map do |entry, i|
45
42
  stable = stats[i] && stats[i][:stable]
46
- {
47
- value: seg,
48
- type: type,
49
- variable: !stable && @classifier.variable?(type),
43
+ entry.merge(
44
+ variable: !stable && entry[:variable],
50
45
  stable: !!stable,
51
- }
46
+ )
52
47
  end
53
48
  end
54
49
 
50
+ def dump
51
+ { "clusters" => clusters.each_with_object({}) { |c, h| h[c.key] = c.dump } }
52
+ end
53
+
54
+ def self.from_dump(h, classifier: SegmentClassifier::DEFAULT)
55
+ c = new(classifier: classifier)
56
+ restored = h["clusters"].transform_values { |cdump| Cluster.from_dump(cdump) }
57
+ c.instance_variable_get(:@storage).instance_variable_set(:@clusters, restored)
58
+ c
59
+ end
60
+
55
61
  private
56
62
 
57
63
  def coerce(input)
58
64
  input.is_a?(Identifier) ? input : Parser.parse(input)
59
65
  end
60
-
61
- def cluster_key(iri)
62
- if iri.urn?
63
- ns, value = (iri.nss || "").split(":", 2)
64
- shape = if value
65
- type = @classifier.classify(value)
66
- @classifier.variable?(type) ? "{#{type}}" : value
67
- end
68
- key = "urn:#{ns}:#{shape}"
69
- [key, nil, "urn", key]
70
- else
71
- shape = PathShape.new(classifier: @classifier).for(iri.path_segments)
72
- key = "#{iri.scheme}://#{iri.host}#{shape}"
73
- [key, iri.host, iri.scheme, shape]
74
- end
75
- end
76
66
  end
77
67
  end