iriq 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/iriq/corpus.rb CHANGED
@@ -7,6 +7,10 @@ module Iriq
7
7
  #
8
8
  # The deterministic, single-IRI API (Iriq.normalize/explain) is unchanged —
9
9
  # Corpus#normalize and Corpus#explain are the corpus-informed variants.
10
+ #
11
+ # State lives in a Storage backend (Memory by default; Json or Sqlite when
12
+ # opened against a file). The classification logic on top is identical
13
+ # regardless of where the counters live.
10
14
  class Corpus
11
15
  # Type-based: position is "mostly variable" (UUIDs/integers/etc.).
12
16
  VARIABLE_DOMINANCE_THRESHOLD = 0.8
@@ -38,28 +42,53 @@ module Iriq
38
42
  POPULAR_MIN_COUNT = 5
39
43
  POPULAR_BASELINE_MULTIPLE = 3
40
44
 
41
- attr_reader :host_counts, :path_length_counts, :raw_shape_counts,
42
- :fingerprint_counts, :position_stats
45
+ attr_reader :storage
43
46
 
44
47
  def initialize(classifier: SegmentClassifier::DEFAULT,
45
- max_values_per_position: PositionStats::DEFAULT_MAX_VALUES)
46
- @classifier = classifier
47
- @max_values_per_position = max_values_per_position
48
- @host_counts = Hash.new(0)
49
- @path_length_counts = Hash.new(0)
50
- @raw_shape_counts = Hash.new(0)
51
- @fingerprint_counts = Hash.new(0)
52
- @position_stats = {}
53
- @clusterer = Clusterer.new(classifier: classifier)
48
+ max_values_per_position: PositionStats::DEFAULT_MAX_VALUES,
49
+ storage: nil)
50
+ @classifier = classifier
51
+ @storage = storage || Storage::Memory.new(
52
+ classifier: classifier,
53
+ max_values_per_position: max_values_per_position,
54
+ )
55
+ end
56
+
57
+ # Open a corpus against `path`. File extension picks the backend:
58
+ # `.db`/`.sqlite`/`.sqlite3` use SQLite (incremental writes); anything
59
+ # else uses JSON.
60
+ def self.open(path, classifier: SegmentClassifier::DEFAULT,
61
+ max_values_per_position: PositionStats::DEFAULT_MAX_VALUES)
62
+ storage = Storage.open(path,
63
+ classifier: classifier,
64
+ max_values_per_position: max_values_per_position)
65
+ new(classifier: classifier, storage: storage)
54
66
  end
55
67
 
56
68
  # Observe a single IRI. Returns an Observation.
57
69
  def observe(input)
58
70
  iri = coerce(input)
59
71
  hinted_entries = SegmentHints.derive(iri.path_segments, @classifier)
60
- record_aggregates(iri, hinted_entries)
72
+ raw_shape = PathShape.new(classifier: @classifier, hints: false).from_entries(hinted_entries)
61
73
  hinted_shape = PathShape.new(classifier: @classifier, hints: true).from_entries(hinted_entries)
62
- cluster = @clusterer.add(iri, shape: hinted_shape)
74
+
75
+ cluster = nil
76
+ @storage.transaction do |s|
77
+ s.increment_host(iri.host)
78
+ s.increment_path_length(iri.path_segments.size)
79
+ s.increment_raw_shape(raw_shape)
80
+ s.increment_fingerprint(hinted_shape)
81
+
82
+ prefix = ""
83
+ hinted_entries.each do |entry|
84
+ s.observe_position(iri.host, prefix, entry[:value], entry[:type])
85
+ prefix = "#{prefix}/#{placeholder(entry)}"
86
+ end
87
+
88
+ key, host, scheme, shape = Cluster.key_for(iri, classifier: @classifier, shape: hinted_shape)
89
+ cluster = s.add_to_cluster(key, host, scheme, shape, iri)
90
+ end
91
+
63
92
  Observation.new(corpus: self, identifier: iri, cluster: cluster)
64
93
  end
65
94
 
@@ -89,55 +118,69 @@ module Iriq
89
118
  end
90
119
  end
91
120
 
121
+ def host_counts; @storage.host_counts; end
122
+ def path_length_counts; @storage.path_length_counts; end
123
+ def raw_shape_counts; @storage.raw_shape_counts; end
124
+ def fingerprint_counts; @storage.fingerprint_counts; end
125
+
126
+ # Iterates (host, prefix) → PositionStats over all observed positions.
127
+ # Used by inspection tooling; not part of the hot path.
128
+ def each_position_stats(&block)
129
+ @storage.each_position_stats(&block)
130
+ end
131
+
92
132
  def clusters
93
- @clusterer.clusters
133
+ @storage.clusters
94
134
  end
95
135
 
96
136
  def size
97
- @clusterer.size
137
+ @storage.cluster_size
98
138
  end
99
139
 
100
140
  # Stats for a given (host, prefix_shape) — useful for tests and
101
141
  # debugging. Returns nil if nothing has been observed there.
102
142
  def stats_for(host, prefix)
103
- @position_stats[[host, prefix]]
143
+ @storage.position_stats(host, prefix)
104
144
  end
105
145
 
106
- private
107
-
108
- def coerce(input)
109
- input.is_a?(Identifier) ? input : Parser.parse(input)
146
+ # Persist the corpus.
147
+ #
148
+ # save() → flush the backend in place (JSON writes its file,
149
+ # SQLite is already on disk).
150
+ # save(same_path) → same as save() — idempotent for the backend's path.
151
+ # save(other_path) → export to other_path as JSON, regardless of the
152
+ # live backend.
153
+ def save(path = nil)
154
+ backend_path = @storage.respond_to?(:path) ? @storage.path : nil
155
+ if path.nil? || path == backend_path
156
+ @storage.save
157
+ else
158
+ write_json_dump(path)
159
+ end
110
160
  end
111
161
 
112
- def record_aggregates(iri, hinted_entries)
113
- @host_counts[iri.host] += 1 if iri.host
114
- @path_length_counts[iri.path_segments.size] += 1
115
-
116
- raw = PathShape.new(classifier: @classifier, hints: false).from_entries(hinted_entries)
117
- fp = PathShape.new(classifier: @classifier, hints: true).from_entries(hinted_entries)
118
- @raw_shape_counts[raw] += 1
119
- @fingerprint_counts[fp] += 1
162
+ def close
163
+ @storage.close
164
+ end
120
165
 
121
- record_position_stats(iri, hinted_entries)
166
+ # Wrap many observations in a single backend transaction. For SQLite this
167
+ # turns thousands of fsyncs into one; for in-memory backends it's a
168
+ # no-op. Use when ingesting a batch.
169
+ def batch(&block)
170
+ @storage.batch(&block)
122
171
  end
123
172
 
124
- def record_position_stats(iri, hinted_entries)
125
- prefix = ""
126
- hinted_entries.each do |entry|
127
- key = [iri.host, prefix]
128
- stats = @position_stats[key] ||= PositionStats.new(max_values: @max_values_per_position)
129
- stats.observe(entry[:value], entry[:type])
130
- prefix = "#{prefix}/#{placeholder(entry)}"
131
- end
173
+ private
174
+
175
+ def coerce(input)
176
+ input.is_a?(Identifier) ? input : Parser.parse(input)
132
177
  end
133
178
 
134
- # Walks the IRI's segments and returns hint-derived entries enriched with
135
- # the (host, prefix) PositionStats reference and a :classification symbol.
136
179
  def annotate_segments(iri)
137
180
  hinted = SegmentHints.derive(iri.path_segments, @classifier)
138
181
  prefix = ""
139
182
  hinted.map do |entry|
140
- stats = @position_stats[[iri.host, prefix]]
183
+ stats = @storage.position_stats(iri.host, prefix)
141
184
  out = entry.merge(
142
185
  prefix: prefix,
143
186
  classification: classify(entry, stats),
@@ -226,43 +269,54 @@ module Iriq
226
269
 
227
270
  public
228
271
 
272
+ # --- Legacy dump/load (JSON shape) ------------------------------------
273
+ #
274
+ # The pre-Storage release exposed `Corpus#dump`, `Corpus#save(path)`, and
275
+ # `Corpus.load(path)` for JSON-backed persistence. Those names still work
276
+ # but are now thin wrappers around the appropriate Storage backend.
277
+
229
278
  def dump
230
- {
231
- "host_counts" => @host_counts,
232
- "path_length_counts" => @path_length_counts.transform_keys(&:to_s),
233
- "raw_shape_counts" => @raw_shape_counts,
234
- "fingerprint_counts" => @fingerprint_counts,
235
- "max_values_per_position" => @max_values_per_position,
236
- "position_stats" => @position_stats.map { |(host, prefix), s| [host, prefix, s.dump] },
237
- "clusterer" => @clusterer.dump,
238
- }
279
+ memory_view.to_dump
239
280
  end
240
281
 
241
- def save(path)
282
+ def self.from_dump(h, classifier: SegmentClassifier::DEFAULT)
283
+ max_values = h.fetch("max_values_per_position", PositionStats::DEFAULT_MAX_VALUES)
284
+ storage = Storage::Memory.new(classifier: classifier, max_values_per_position: max_values)
285
+ storage.load_dump!(h)
286
+ new(classifier: classifier, storage: storage)
287
+ end
288
+
289
+ def self.load(path, classifier: SegmentClassifier::DEFAULT)
290
+ open(path, classifier: classifier)
291
+ end
292
+
293
+ private
294
+
295
+ def write_json_dump(path)
242
296
  tmp = "#{path}.tmp"
243
- File.write(tmp, JSON.generate(dump))
297
+ File.write(tmp, JSON.generate(memory_view.to_dump))
244
298
  File.rename(tmp, path)
245
299
  end
246
300
 
247
- def self.from_dump(h, classifier: SegmentClassifier::DEFAULT)
248
- c = new(
249
- classifier: classifier,
250
- max_values_per_position: h.fetch("max_values_per_position", PositionStats::DEFAULT_MAX_VALUES),
251
- )
252
- c.instance_variable_set(:@host_counts, Hash.new(0).merge(h["host_counts"]))
253
- c.instance_variable_set(:@path_length_counts, Hash.new(0).merge(h["path_length_counts"].transform_keys(&:to_i)))
254
- c.instance_variable_set(:@raw_shape_counts, Hash.new(0).merge(h["raw_shape_counts"]))
255
- c.instance_variable_set(:@fingerprint_counts, Hash.new(0).merge(h["fingerprint_counts"]))
256
- stats = h["position_stats"].each_with_object({}) do |(host, prefix, sdump), acc|
257
- acc[[host, prefix]] = PositionStats.from_dump(sdump)
258
- end
259
- c.instance_variable_set(:@position_stats, stats)
260
- c.instance_variable_set(:@clusterer, Clusterer.from_dump(h["clusterer"], classifier: classifier))
261
- c
262
- end
301
+ # Materialize a Memory snapshot of the current state — used by dump for
302
+ # backends that don't natively know how to emit the JSON shape.
303
+ def memory_view
304
+ return @storage if @storage.respond_to?(:to_dump)
263
305
 
264
- def self.load(path, classifier: SegmentClassifier::DEFAULT)
265
- from_dump(JSON.parse(File.read(path)), classifier: classifier)
306
+ mem = Storage::Memory.new(
307
+ classifier: @classifier,
308
+ max_values_per_position: @storage.max_values_per_position,
309
+ )
310
+ mem.instance_variable_set(:@host_counts, Hash.new(0).merge(@storage.host_counts))
311
+ mem.instance_variable_set(:@path_length_counts, Hash.new(0).merge(@storage.path_length_counts))
312
+ mem.instance_variable_set(:@raw_shape_counts, Hash.new(0).merge(@storage.raw_shape_counts))
313
+ mem.instance_variable_set(:@fingerprint_counts, Hash.new(0).merge(@storage.fingerprint_counts))
314
+ ps = {}
315
+ @storage.each_position_stats { |key, stats| ps[key] = stats }
316
+ mem.instance_variable_set(:@position_stats, ps)
317
+ clusters_h = @storage.clusters.each_with_object({}) { |c, h| h[c.key] = c }
318
+ mem.instance_variable_set(:@clusters, clusters_h)
319
+ mem
266
320
  end
267
321
  end
268
322
  end
data/lib/iriq/parser.rb CHANGED
@@ -3,7 +3,7 @@ module Iriq
3
3
  #
4
4
  # Intentionally NOT a full RFC 3986 / 3987 / WHATWG URL implementation. We
5
5
  # accept enough of the common shapes (URLs, scheme-less hosts, URNs, raw
6
- # Unicode hosts and paths) to support normalization and clustering.
6
+ # Unicode hosts and paths) to support extraction, normalization, and clustering.
7
7
  module Parser
8
8
  SCHEME_RE = /\A([a-zA-Z][a-zA-Z0-9+\-.]*):/.freeze
9
9
 
@@ -0,0 +1,43 @@
1
+ require "json"
2
+
module Iriq
  module Storage
    # Json is the Memory backend bound to a file path: state is read from a
    # JSON file by Json.open / #load! and written back by an explicit #save.
    # The on-disk shape is whatever Memory#to_dump emits and
    # Memory#load_dump! accepts.
    #
    # NOTE(review): #close is not overridden here, so it is the inherited
    # Memory#close. Nothing in this class writes the file on close — callers
    # have to call #save themselves. Confirm that is the intended contract.
    class Json < Memory
      # Path this backend reads from / writes to by default (may be nil).
      attr_reader :path

      # path - default target for #save; remaining keyword options are
      #        forwarded untouched to Memory#initialize.
      def initialize(path: nil, **opts)
        super(**opts)
        @path = path
      end

      # Build a backend for `path`, loading the file first when it exists and
      # is non-empty. A missing or zero-byte file yields an empty backend that
      # still remembers `path` for a later #save.
      def self.open(path, **opts)
        s = new(path: path, **opts)
        s.load!(path) if File.exist?(path) && File.size(path).positive?
        s
      end

      # Replace the in-memory state with the parsed contents of `path` and
      # adopt it as the default save target. An empty file is left alone (no
      # state change, @path not updated). Returns self.
      #
      # Raises whatever File.read / JSON.parse raise for unreadable or
      # malformed input.
      def load!(path)
        data = File.read(path)
        return self if data.empty?

        load_dump!(JSON.parse(data))
        @path = path
        self
      end

      # Write the current state as JSON to `path` (or, when omitted, to the
      # path given at construction / last #load!).
      #
      # The payload is written to "<target>.tmp" and then renamed over the
      # target, so readers never observe a half-written file. If writing or
      # renaming fails, the temporary file is removed again and the original
      # error is re-raised.
      #
      # Raises ArgumentError when no target path is known.
      def save(path = nil)
        target = path || @path
        raise ArgumentError, "no path provided" unless target

        tmp = "#{target}.tmp"
        begin
          File.write(tmp, JSON.generate(to_dump))
          File.rename(tmp, target)
        rescue StandardError
          # Don't leave a stale temp file next to the target on failure.
          File.delete(tmp) if File.exist?(tmp)
          raise
        end
      end
    end
  end
end
@@ -0,0 +1,138 @@
module Iriq
  module Storage
    # Memory is the canonical backend — every other backend either wraps it
    # (Json) or implements the same surface against an external store (Sqlite).
    #
    # The contract is small enough to enumerate up top:
    #
    #   increment_host(host)
    #   increment_path_length(length)
    #   increment_raw_shape(shape)
    #   increment_fingerprint(shape)
    #   observe_position(host, prefix, value, type)
    #   add_to_cluster(key, host, scheme, shape, identifier)
    #
    #   host_counts / path_length_counts / raw_shape_counts / fingerprint_counts
    #   position_stats(host, prefix) / each_position_stats
    #   clusters / cluster_size
    #
    #   transaction { |storage| ... }  # backends may batch within
    #   batch { ... }                  # plain yield here
    #   flush                          # commit pending writes (no-op for Memory)
    #   save(path = nil)               # no-op for Memory
    #   close                          # release resources (no-op for Memory)
    #
    #   load_dump!(hash) / to_dump     # JSON-shaped bulk import / export
    class Memory
      # The counter readers return the live internal hashes (default value 0),
      # not copies.
      attr_reader :max_values_per_position,
                  :host_counts, :path_length_counts,
                  :raw_shape_counts, :fingerprint_counts

      # Path of the underlying file, if any. Memory backends are unpathed;
      # Json/Sqlite override.
      def path
        nil
      end

      # classifier              - stored on the instance; not otherwise used
      #                           by the methods in this class.
      # max_values_per_position - passed as `max_values:` to every
      #                           PositionStats this backend creates.
      def initialize(classifier: SegmentClassifier::DEFAULT,
                     max_values_per_position: PositionStats::DEFAULT_MAX_VALUES)
        @classifier = classifier
        @max_values_per_position = max_values_per_position
        @host_counts = Hash.new(0)
        @path_length_counts = Hash.new(0)
        @raw_shape_counts = Hash.new(0)
        @fingerprint_counts = Hash.new(0)
        @position_stats = {}
        @clusters = {}
      end

      # Yields this backend to the block and returns the block's value.
      def transaction
        yield self
      end

      # Yields (without arguments) and returns the block's value.
      def batch
        yield
      end

      def flush; end
      def close; end

      # No-op for in-memory; subclasses override.
      def save(path = nil); end

      # --- Increments -------------------------------------------------------

      # A nil host is not counted.
      def increment_host(host)
        @host_counts[host] += 1 if host
      end

      def increment_path_length(length)
        @path_length_counts[length] += 1
      end

      def increment_raw_shape(shape)
        @raw_shape_counts[shape] += 1
      end

      def increment_fingerprint(shape)
        @fingerprint_counts[shape] += 1
      end

      # Record `value`/`type` in the PositionStats for (host, prefix),
      # creating that PositionStats on first use.
      def observe_position(host, prefix, value, type)
        stats = @position_stats[[host, prefix]] ||= PositionStats.new(max_values: @max_values_per_position)
        stats.observe(value, type)
      end

      # Add `identifier` to the cluster stored under `key`, creating the
      # cluster from key/host/scheme/shape on first use. Returns the cluster.
      def add_to_cluster(key, host, scheme, shape, identifier)
        cluster = @clusters[key] ||= Cluster.new(key: key, host: host, scheme: scheme, shape: shape)
        cluster.add(identifier)
        cluster
      end

      # --- Reads ------------------------------------------------------------

      # PositionStats for (host, prefix), or nil when nothing was observed.
      def position_stats(host, prefix)
        @position_stats[[host, prefix]]
      end

      # Iterates the position table; each entry is keyed by [host, prefix].
      def each_position_stats(&block)
        @position_stats.each(&block)
      end

      def clusters
        @clusters.values
      end

      def cluster_size
        @clusters.size
      end

      # --- Bulk load (used by JSON backend) --------------------------------

      # Replace this backend's state with the contents of dump hash `h` (the
      # shape produced by #to_dump, with string keys). Returns self.
      #
      # Every section is optional: a missing or null counter table,
      # "position_stats" list, "clusterer" entry or nested "clusters" table is
      # treated as empty, and a missing "max_values_per_position" falls back
      # to PositionStats::DEFAULT_MAX_VALUES. All new state is built before
      # any instance variable is assigned, so an error raised while decoding
      # leaves the previous state untouched.
      def load_dump!(h)
        host_counts        = counter_from(h["host_counts"])
        # Path lengths are stringified in the dump; restore integer keys.
        path_length_counts = counter_from((h["path_length_counts"] || {}).transform_keys(&:to_i))
        raw_shape_counts   = counter_from(h["raw_shape_counts"])
        fingerprint_counts = counter_from(h["fingerprint_counts"])
        # Block form keeps the default lazy: it is only looked up when the
        # key is actually absent.
        max_values = h.fetch("max_values_per_position") { PositionStats::DEFAULT_MAX_VALUES }

        position_stats = (h["position_stats"] || []).each_with_object({}) do |(host, prefix, sdump), acc|
          acc[[host, prefix]] = PositionStats.from_dump(sdump)
        end

        cluster_dumps = (h["clusterer"] || {})["clusters"] || {}
        clusters = cluster_dumps.transform_values { |c| Cluster.from_dump(c) }

        @host_counts = host_counts
        @path_length_counts = path_length_counts
        @raw_shape_counts = raw_shape_counts
        @fingerprint_counts = fingerprint_counts
        @max_values_per_position = max_values
        @position_stats = position_stats
        @clusters = clusters
        self
      end

      # JSON-shaped snapshot of the current state (string keys throughout;
      # path-length keys are stringified). Counter tables are the live
      # hashes, not copies.
      def to_dump
        {
          "host_counts" => @host_counts,
          "path_length_counts" => @path_length_counts.transform_keys(&:to_s),
          "raw_shape_counts" => @raw_shape_counts,
          "fingerprint_counts" => @fingerprint_counts,
          "max_values_per_position" => @max_values_per_position,
          "position_stats" => @position_stats.map { |(host, prefix), s| [host, prefix, s.dump] },
          "clusterer" => {
            "clusters" => @clusters.transform_values(&:dump),
          },
        }
      end

      private

      # Fresh zero-default counter hash seeded from `source` (nil → empty).
      def counter_from(source)
        Hash.new(0).merge(source || {})
      end
    end
  end
end