iriq 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,367 @@
1
+ require "sqlite3"
2
+
3
+ module Iriq
4
+ module Storage
5
+ # Sqlite is the incremental-write backend. Each observation translates
6
+ # to a handful of UPSERTs against a long-lived connection; nothing is
7
+ # materialized in memory beyond what reads explicitly ask for.
8
+ #
9
+ # WAL journaling lets multiple processes observe against the same file
10
+ # concurrently — the writer is serialized, readers are not blocked, and
11
+ # the existing `iriq --corpus c.db <url>` pattern works without a flock
12
+ # at the application layer.
13
+ class Sqlite
14
+ SCHEMA_VERSION = 1
15
+
16
+ SCHEMA = <<~SQL.freeze
17
+ CREATE TABLE IF NOT EXISTS meta (
18
+ key TEXT PRIMARY KEY,
19
+ value TEXT
20
+ );
21
+ CREATE TABLE IF NOT EXISTS host_counts (
22
+ host TEXT PRIMARY KEY,
23
+ count INTEGER NOT NULL
24
+ );
25
+ CREATE TABLE IF NOT EXISTS path_length_counts (
26
+ length INTEGER PRIMARY KEY,
27
+ count INTEGER NOT NULL
28
+ );
29
+ CREATE TABLE IF NOT EXISTS raw_shape_counts (
30
+ shape TEXT PRIMARY KEY,
31
+ count INTEGER NOT NULL
32
+ );
33
+ CREATE TABLE IF NOT EXISTS fingerprint_counts (
34
+ shape TEXT PRIMARY KEY,
35
+ count INTEGER NOT NULL
36
+ );
37
+ CREATE TABLE IF NOT EXISTS position_stats (
38
+ host TEXT NOT NULL,
39
+ prefix TEXT NOT NULL,
40
+ total INTEGER NOT NULL DEFAULT 0,
41
+ PRIMARY KEY (host, prefix)
42
+ );
43
+ CREATE TABLE IF NOT EXISTS position_values (
44
+ host TEXT NOT NULL,
45
+ prefix TEXT NOT NULL,
46
+ value TEXT NOT NULL,
47
+ count INTEGER NOT NULL,
48
+ PRIMARY KEY (host, prefix, value)
49
+ );
50
+ CREATE TABLE IF NOT EXISTS position_types (
51
+ host TEXT NOT NULL,
52
+ prefix TEXT NOT NULL,
53
+ type TEXT NOT NULL,
54
+ count INTEGER NOT NULL,
55
+ PRIMARY KEY (host, prefix, type)
56
+ );
57
+ CREATE TABLE IF NOT EXISTS clusters (
58
+ key TEXT PRIMARY KEY,
59
+ host TEXT,
60
+ scheme TEXT,
61
+ shape TEXT,
62
+ count INTEGER NOT NULL DEFAULT 0,
63
+ ord INTEGER NOT NULL
64
+ );
65
+ CREATE TABLE IF NOT EXISTS cluster_examples (
66
+ cluster_key TEXT NOT NULL,
67
+ position INTEGER NOT NULL,
68
+ canonical TEXT NOT NULL,
69
+ PRIMARY KEY (cluster_key, position)
70
+ );
71
+ CREATE TABLE IF NOT EXISTS cluster_segments (
72
+ cluster_key TEXT NOT NULL,
73
+ position INTEGER NOT NULL,
74
+ value TEXT NOT NULL,
75
+ count INTEGER NOT NULL,
76
+ PRIMARY KEY (cluster_key, position, value)
77
+ );
78
+ SQL
79
+
80
+ attr_reader :path, :max_values_per_position
81
+
82
# Builds a storage for +path+, runs setup! on it, and returns the storage
# itself (not setup!'s return value).
def self.open(path, classifier: SegmentClassifier::DEFAULT,
              max_values_per_position: PositionStats::DEFAULT_MAX_VALUES)
  storage = new(path: path, classifier: classifier, max_values_per_position: max_values_per_position)
  storage.setup!
  storage
end
86
+
87
# Opens the SQLite connection at +path+ and applies the connection PRAGMAs.
# The schema is not touched here; that happens in setup!.
def initialize(path:, classifier: SegmentClassifier::DEFAULT,
               max_values_per_position: PositionStats::DEFAULT_MAX_VALUES)
  @path = path
  @classifier = classifier
  @max_values_per_position = max_values_per_position
  @in_batch = false
  @db = SQLite3::Database.new(path)

  # Order matters: busy_timeout MUST be first. The later PRAGMAs
  # (journal_mode in particular) can block on the write lock under
  # concurrent open, and without a busy timeout they fail immediately
  # with SQLITE_BUSY.
  [
    "PRAGMA busy_timeout = 30000",
    "PRAGMA journal_mode = WAL",
    "PRAGMA synchronous = NORMAL",
    "PRAGMA foreign_keys = ON",
  ].each { |pragma| @db.execute(pragma) }
end
103
+
104
# Creates the schema if needed and reconciles the metadata rows.
#
# On a database without a schema_version row, the version and this
# instance's max_values_per_position are recorded. On an initialized
# database, the stored max_values_per_position replaces the
# constructor-supplied one (falling back to the constructor value when the
# row is absent).
#
# Returns self.
def setup!
  @db.execute_batch(SCHEMA)
  existing = @db.get_first_value("SELECT value FROM meta WHERE key = 'schema_version'")
  if existing.nil?
    # INSERT OR IGNORE rather than a plain INSERT: another process may
    # initialize the same file between the SELECT above and these writes,
    # and a plain INSERT would then fail on the meta primary key.
    @db.execute("INSERT OR IGNORE INTO meta (key, value) VALUES ('schema_version', ?)", SCHEMA_VERSION.to_s)
    @db.execute("INSERT OR IGNORE INTO meta (key, value) VALUES ('max_values_per_position', ?)",
                @max_values_per_position.to_s)
    # Our own cap was written, so the in-memory value is already right.
    return self if @db.changes.positive?
  end

  # Either the database was already initialized or a concurrent opener won
  # the race: the persisted cap takes precedence over the constructor's.
  stored = @db.get_first_value("SELECT value FROM meta WHERE key = 'max_values_per_position'")
  @max_values_per_position = (stored || @max_values_per_position).to_i
  self
end
118
+
119
# Runs the block inside a database transaction and returns the block's
# value. The block receives this storage.
#
# While an outer batch is active this is a pure pass-through: no BEGIN,
# COMMIT or ROLLBACK is issued here, so a failure inside the block leaves
# the outer batch's transaction for batch itself to roll back.
#
# Outside a batch, the transaction is committed when the block returns and
# rolled back (best-effort) when the block or the commit raises; the
# original error is re-raised.
def transaction
  return yield(self) if @in_batch

  @db.transaction
  begin
    result = yield self
    @db.commit
    result
  rescue StandardError
    # Best-effort: a failed rollback must not mask the original error.
    @db.rollback rescue nil
    raise
  end
end
131
+
132
+ # Wrap many observations in a single transaction. Cuts SQLite write
133
+ # overhead from O(observations) fsyncs to O(1).
134
def batch
  # Re-entrant: a nested batch simply runs inside the outer one.
  return yield if @in_batch

  # Open the transaction BEFORE flipping @in_batch. If BEGIN raises (for
  # example SQLITE_BUSY), the flag must stay false; otherwise every later
  # transaction/batch call would run with no transaction at all.
  @db.transaction
  @in_batch = true
  begin
    result = yield
    @db.commit
    result
  rescue StandardError
    # Best-effort: a failed rollback must not mask the original error.
    @db.rollback rescue nil
    raise
  ensure
    @in_batch = false
  end
end
149
+
150
+ # Saving is automatic — incremental UPSERTs hit disk on commit. flush
151
+ # makes that explicit; close releases the connection.
152
def flush
  # Intentionally does nothing and returns nil.
  nil
end
153
+
154
# No-op kept for parity with the JSON backend: data is already persisted.
# The optional path argument is accepted and ignored. Returns nil.
def save(_path = nil)
  nil
end
157
+
158
# Closes the connection, returning whatever the underlying close returns.
def close
  # Checkpoint and truncate the WAL first so the .db-wal sidecar does not
  # grow unbounded across long-lived `iriq --corpus c.db` sessions. This is
  # best-effort: any StandardError here is swallowed so close still runs.
  begin
    @db.execute("PRAGMA wal_checkpoint(TRUNCATE)")
  rescue StandardError
    nil
  end

  @db.close
end
164
+
165
+ # --- Increments -------------------------------------------------------
166
+
167
# Adds one to the stored count for +host+, creating the row on first sight.
# A nil (or false) host is ignored and nil is returned.
def increment_host(host)
  if host
    statement = <<~SQL
      INSERT INTO host_counts (host, count) VALUES (?, 1)
      ON CONFLICT(host) DO UPDATE SET count = count + 1
    SQL
    @db.execute(statement, host)
  end
end
175
+
176
# Adds one to the stored count for a path of +length+ segments, creating
# the row when that length has not been recorded yet.
def increment_path_length(length)
  statement = <<~SQL
    INSERT INTO path_length_counts (length, count) VALUES (?, 1)
    ON CONFLICT(length) DO UPDATE SET count = count + 1
  SQL
  @db.execute(statement, length)
end
182
+
183
# Bumps the counter for +shape+ in the raw_shape_counts table.
def increment_raw_shape(shape)
  table = "raw_shape_counts"
  upsert_shape(table, shape)
end
186
+
187
# Bumps the counter for +shape+ in the fingerprint_counts table.
def increment_fingerprint(shape)
  table = "fingerprint_counts"
  upsert_shape(table, shape)
end
190
+
191
# Records one observation of +value+ (classified as +type+) at the position
# identified by (+host+, +prefix+). A nil host is stored as "".
#
# Three tables are touched: the position total, the per-type count, and the
# per-value count. Only the per-value table is capped.
def observe_position(host, prefix, value, type)
  host_key = host || ""

  @db.execute(<<~SQL, [host_key, prefix])
    INSERT INTO position_stats (host, prefix, total) VALUES (?, ?, 1)
    ON CONFLICT(host, prefix) DO UPDATE SET total = total + 1
  SQL

  # Type counts are unbounded, so a plain upsert is enough.
  @db.execute(<<~SQL, [host_key, prefix, type.to_s])
    INSERT INTO position_types (host, prefix, type, count) VALUES (?, ?, ?, 1)
    ON CONFLICT(host, prefix, type) DO UPDATE SET count = count + 1
  SQL

  # Value counts are capped at max_values_per_position, which ON CONFLICT
  # cannot express: try the increment first and only consider inserting
  # when no existing row was updated.
  @db.execute(<<~SQL, [host_key, prefix, value])
    UPDATE position_values SET count = count + 1
    WHERE host = ? AND prefix = ? AND value = ?
  SQL
  return unless @db.changes.zero?

  distinct_values = @db.get_first_value(
    "SELECT COUNT(*) FROM position_values WHERE host = ? AND prefix = ?",
    [host_key, prefix],
  )
  # At the cap: the new value is dropped (the total above still counted it).
  return unless distinct_values < @max_values_per_position

  @db.execute(
    "INSERT INTO position_values (host, prefix, value, count) VALUES (?, ?, ?, 1)",
    [host_key, prefix, value],
  )
end
225
+
226
# Records +identifier+ as a member of the cluster +key+ and returns the
# cluster as reloaded by load_cluster.
#
# +host+, +scheme+ and +shape+ are only written when the cluster row is
# created; an existing row just has its count bumped.
def add_to_cluster(key, host, scheme, shape, identifier)
  # New clusters receive the next ord (current max + 1) so iteration order
  # stays stable; existing clusters only get count + 1.
  @db.execute(<<~SQL, [key, host, scheme, shape])
    INSERT INTO clusters (key, host, scheme, shape, count, ord)
    VALUES (?, ?, ?, ?, 1, (SELECT COALESCE(MAX(ord), 0) + 1 FROM clusters))
    ON CONFLICT(key) DO UPDATE SET count = count + 1
  SQL

  # Examples are capped at Cluster::MAX_EXAMPLES; the current number of
  # stored examples doubles as the next free position.
  stored_examples = @db.get_first_value(
    "SELECT COUNT(*) FROM cluster_examples WHERE cluster_key = ?", [key],
  )
  if stored_examples < Cluster::MAX_EXAMPLES
    @db.execute(<<~SQL, [key, stored_examples, identifier.canonical])
      INSERT INTO cluster_examples (cluster_key, position, canonical)
      VALUES (?, ?, ?)
    SQL
  end

  # Per-position segment counts are not capped.
  identifier.path_segments.each.with_index do |segment, index|
    @db.execute(<<~SQL, [key, index, segment])
      INSERT INTO cluster_segments (cluster_key, position, value, count) VALUES (?, ?, ?, 1)
      ON CONFLICT(cluster_key, position, value) DO UPDATE SET count = count + 1
    SQL
  end

  load_cluster(key)
end
256
+
257
+ # --- Reads ------------------------------------------------------------
258
+
259
# Returns a Hash of host => count read from the host_counts table.
def host_counts
  table = "host_counts"
  rows_to_count_hash(table, "host")
end
262
+
263
# Returns a Hash of path length => count. Lengths that were never recorded
# read as 0 because the Hash is built with a default of 0.
def path_length_counts
  Hash.new(0).tap do |counts|
    @db.execute("SELECT length, count FROM path_length_counts") do |(length, count)|
      counts[length] = count
    end
  end
end
268
+
269
# Returns a Hash of shape => count read from the raw_shape_counts table.
def raw_shape_counts
  table = "raw_shape_counts"
  rows_to_count_hash(table, "shape")
end
272
+
273
# Returns a Hash of shape => count read from the fingerprint_counts table.
def fingerprint_counts
  table = "fingerprint_counts"
  rows_to_count_hash(table, "shape")
end
276
+
277
# Rebuilds the PositionStats for (+host+, +prefix+) from the three position
# tables, or returns nil when no total has been recorded for that pair.
# A nil host is looked up as "".
def position_stats(host, prefix)
  scope = [host || "", prefix]

  total = @db.get_first_value(
    "SELECT total FROM position_stats WHERE host = ? AND prefix = ?", scope,
  )
  return nil if total.nil?

  value_counts = Hash.new(0)
  @db.execute(
    "SELECT value, count FROM position_values WHERE host = ? AND prefix = ?", scope
  ) { |value, count| value_counts[value] = count }

  # Types are stored as text and handed back as symbols.
  type_counts = Hash.new(0)
  @db.execute(
    "SELECT type, count FROM position_types WHERE host = ? AND prefix = ?", scope
  ) { |type, count| type_counts[type.to_sym] = count }

  # The loaded state is written straight into the object's instance
  # variables rather than going through its public interface.
  PositionStats.new(max_values: @max_values_per_position).tap do |stats|
    stats.instance_variable_set(:@total, total)
    stats.instance_variable_set(:@value_counts, value_counts)
    stats.instance_variable_set(:@type_counts, type_counts)
  end
end
301
+
302
# Yields ([host, prefix], stats) for every recorded position, in ROWID
# order. All keys are read before the first yield, so the per-position
# lookups never run while the key query is still being stepped.
def each_position_stats
  pairs = @db.execute("SELECT DISTINCT host, prefix FROM position_stats ORDER BY ROWID")
  pairs.each do |pair|
    host, prefix = pair
    yield [host, prefix], position_stats(host, prefix)
  end
end
309
+
310
# Returns every cluster, loaded one by one via load_cluster, ordered by ord.
def clusters
  keys = @db.execute("SELECT key FROM clusters ORDER BY ord").map(&:first)
  keys.map { |key| load_cluster(key) }
end
317
+
318
# Returns the number of rows in the clusters table.
def cluster_size
  count_sql = "SELECT COUNT(*) FROM clusters"
  @db.get_first_value(count_sql)
end
321
+
322
+ private
323
+
324
# Adds one to the count for +shape+ in +table+, inserting the row if absent.
# +table+ is interpolated into the SQL text, so it must be a trusted table
# name (the callers in this file pass fixed literals); +shape+ is bound as a
# parameter.
def upsert_shape(table, shape)
  statement = <<~SQL
    INSERT INTO #{table} (shape, count) VALUES (?, 1)
    ON CONFLICT(shape) DO UPDATE SET count = count + 1
  SQL
  @db.execute(statement, shape)
end
330
+
331
# Reads (+key_col+, count) pairs from +table+ into a Hash whose missing keys
# read as 0. Both arguments are interpolated into the SQL text, so they must
# be trusted identifiers.
def rows_to_count_hash(table, key_col)
  rows = @db.execute("SELECT #{key_col}, count FROM #{table}")
  rows.each_with_object(Hash.new(0)) do |(key, count), counts|
    counts[key] = count
  end
end
336
+
337
# Rebuilds the Cluster stored under +key+, or returns nil when no such row
# exists. Examples are re-parsed from their canonical strings in position
# order; segment counts become an Array (indexed by segment position) of
# value => count Hashes.
def load_cluster(key)
  record = @db.get_first_row(
    "SELECT key, host, scheme, shape, count FROM clusters WHERE key = ?", [key],
  )
  return nil unless record

  cluster_key, host, scheme, shape, total = record

  examples = []
  @db.execute(
    "SELECT canonical FROM cluster_examples WHERE cluster_key = ? ORDER BY position", [key]
  ) { |example_row| examples << Parser.parse(example_row[0]) }

  # Positions with no stored rows stay nil in this array.
  segment_counts = []
  @db.execute(
    "SELECT position, value, count FROM cluster_segments WHERE cluster_key = ? ORDER BY position",
    [key],
  ) do |position, value, seen|
    (segment_counts[position] ||= Hash.new(0))[value] = seen
  end

  # The loaded state is written straight into the object's instance
  # variables rather than going through its public interface.
  Cluster.new(key: cluster_key, host: host, scheme: scheme, shape: shape).tap do |cluster|
    cluster.instance_variable_set(:@count, total)
    cluster.instance_variable_set(:@examples, examples)
    cluster.instance_variable_set(:@segment_counts, segment_counts)
  end
end
365
+ end
366
+ end
367
+ end
@@ -0,0 +1,35 @@
1
+ module Iriq
2
+ # Storage is the persistence layer for a Corpus. It owns every counter and
3
+ # per-(host, prefix) frequency map; the Corpus class delegates state to it.
4
+ #
5
+ # Three concrete backends ship:
6
+ #
7
+ # Storage::Memory — in-memory only; matches the original behavior.
8
+ # Storage::Json — Memory backend wrapped with load/save against a JSON file.
9
+ # Storage::Sqlite — incremental UPSERTs against a SQLite database.
10
+ #
11
+ # File-extension dispatch keeps callers simple: `.json` (or anything else)
12
+ # picks Json, `.db`/`.sqlite`/`.sqlite3` picks Sqlite.
13
+ module Storage
14
+ SQLITE_EXTS = %w[.db .sqlite .sqlite3].freeze
15
+
16
+ module_function
17
+
18
+ # Opens (or creates) a storage at `path`, picking the backend by extension.
19
+ # If `path` is nil, returns a Memory backend.
20
# Returns the storage backend for +path+:
#   nil path                       -> Memory
#   .db / .sqlite / .sqlite3 ext   -> Sqlite (extension match is case-insensitive)
#   anything else                  -> Json
# The file-backed backends are required lazily, only when selected.
def open(path, classifier: SegmentClassifier::DEFAULT,
         max_values_per_position: PositionStats::DEFAULT_MAX_VALUES)
  options = { classifier: classifier, max_values_per_position: max_values_per_position }
  return Memory.new(**options) if path.nil?

  sqlite = SQLITE_EXTS.include?(File.extname(path).downcase)
  require(sqlite ? "iriq/storage/sqlite" : "iriq/storage/json")
  (sqlite ? Sqlite : Json).open(path, **options)
end
32
+ end
33
+ end
34
+
35
+ require "iriq/storage/memory"
data/lib/iriq/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Iriq
2
- VERSION = "0.1.0"
2
+ VERSION = "0.2.0"
3
3
  end
data/lib/iriq.rb CHANGED
@@ -12,6 +12,7 @@ require "iriq/cluster"
12
12
  require "iriq/clusterer"
13
13
  require "iriq/position_stats"
14
14
  require "iriq/observation"
15
+ require "iriq/storage"
15
16
  require "iriq/corpus"
16
17
  require "iriq/extractor"
17
18
  require "iriq/cli"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: iriq
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Daniel Pepper
@@ -65,17 +65,32 @@ dependencies:
65
65
  - - ">="
66
66
  - !ruby/object:Gem::Version
67
67
  version: '0.22'
68
- description: Semantic IRI/URI/URL/URN parsing, normalization, classification, and
69
- clustering.
68
+ - !ruby/object:Gem::Dependency
69
+ name: sqlite3
70
+ requirement: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - ">="
73
+ - !ruby/object:Gem::Version
74
+ version: '1.6'
75
+ type: :development
76
+ prerelease: false
77
+ version_requirements: !ruby/object:Gem::Requirement
78
+ requirements:
79
+ - - ">="
80
+ - !ruby/object:Gem::Version
81
+ version: '1.6'
82
+ description: IRI extraction, normalization, and clustering.
70
83
  executables:
71
84
  - iriq
72
85
  extensions: []
73
86
  extra_rdoc_files: []
74
87
  files:
75
88
  - CHANGELOG.md
89
+ - CLAUDE.md
76
90
  - Gemfile
77
91
  - Gemfile.lock
78
92
  - LICENSE.txt
93
+ - Makefile
79
94
  - README.md
80
95
  - exe/iriq
81
96
  - iriq.gemspec
@@ -96,9 +111,11 @@ files:
96
111
  - lib/iriq/position_stats.rb
97
112
  - lib/iriq/segment_classifier.rb
98
113
  - lib/iriq/segment_hints.rb
114
+ - lib/iriq/storage.rb
115
+ - lib/iriq/storage/json.rb
116
+ - lib/iriq/storage/memory.rb
117
+ - lib/iriq/storage/sqlite.rb
99
118
  - lib/iriq/version.rb
100
- - script/benchmark.rb
101
- - script/memory.rb
102
119
  homepage: https://github.com/dpep/iriq
103
120
  licenses:
104
121
  - MIT
@@ -119,5 +136,5 @@ required_rubygems_version: !ruby/object:Gem::Requirement
119
136
  requirements: []
120
137
  rubygems_version: 3.6.9
121
138
  specification_version: 4
122
- summary: Semantic IRI normalization and clustering.
139
+ summary: IRI extraction, normalization, and clustering.
123
140
  test_files: []
data/script/benchmark.rb DELETED
@@ -1,81 +0,0 @@
1
- #!/usr/bin/env ruby
2
- # Performance benchmark for the main hot paths in Iriq.
3
- #
4
- # Usage:
5
- # bundle exec script/benchmark.rb # default sizes
6
- # bundle exec script/benchmark.rb 50000 # custom "large" size
7
- #
8
- # Inputs are generated deterministically from IriGenerator so results are
9
- # comparable across runs.
10
-
11
- require "benchmark"
12
- require "tempfile"
13
-
14
- $LOAD_PATH.unshift File.expand_path("../lib", __dir__)
15
- $LOAD_PATH.unshift File.expand_path("../spec/support", __dir__)
16
- require "iriq"
17
- require "iri_generator"
18
-
19
- LARGE = Integer(ARGV[0] || 10_000)
20
- SMALL = [LARGE / 10, 1_000].min
21
- HUGE = LARGE * 10
22
-
23
- puts "Iriq benchmark — Ruby #{RUBY_VERSION}, Iriq #{Iriq::VERSION}"
24
- puts "Sizes: small=#{SMALL}, large=#{LARGE}, huge=#{HUGE}"
25
- puts
26
-
27
- small_urls = IriGenerator.urls(count: SMALL, seed: 1)
28
- large_urls = IriGenerator.urls(count: LARGE, seed: 1)
29
- huge_urls = IriGenerator.urls(count: HUGE, seed: 1)
30
-
31
- # ~ LARGE URLs embedded in prose
32
- text_blob = small_urls.map { |u| "Some prose about #{u} here, also random words." }.join(" ") * (LARGE / SMALL)
33
- puts "Text blob: #{text_blob.bytesize / 1024} KB (~#{LARGE} URLs embedded)"
34
- puts
35
-
36
- results = {}
37
- Benchmark.bm(42) do |x|
38
- results[:parse] = x.report("parse #{LARGE} URLs") { large_urls.each { |u| Iriq.parse(u) } }
39
- results[:normalize] = x.report("normalize #{LARGE} URLs (deterministic)") { large_urls.each { |u| Iriq.normalize(u) } }
40
- results[:explain] = x.report("explain #{LARGE} URLs (deterministic)") { large_urls.each { |u| Iriq.explain(u) } }
41
- results[:extract] = x.report("extract from ~#{text_blob.bytesize / 1024} KB text") { Iriq.extract(text_blob) }
42
-
43
- results[:observe_small] = x.report("Corpus.observe #{SMALL} URLs") do
44
- c = Iriq::Corpus.new
45
- small_urls.each { |u| c.observe(u) }
46
- end
47
- results[:observe_large] = x.report("Corpus.observe #{LARGE} URLs") do
48
- c = Iriq::Corpus.new
49
- large_urls.each { |u| c.observe(u) }
50
- end
51
- results[:observe_huge] = x.report("Corpus.observe #{HUGE} URLs") do
52
- c = Iriq::Corpus.new
53
- huge_urls.each { |u| c.observe(u) }
54
- end
55
-
56
- results[:roundtrip] = x.report("Corpus save+load (#{LARGE} observations)") do
57
- c = Iriq::Corpus.new
58
- large_urls.each { |u| c.observe(u) }
59
- Tempfile.open(["iriq-bench", ".json"]) do |f|
60
- c.save(f.path)
61
- Iriq::Corpus.load(f.path)
62
- end
63
- end
64
- end
65
-
66
- puts
67
- puts "Throughput summary:"
68
- [
69
- [:parse, LARGE, "URLs/s"],
70
- [:normalize, LARGE, "URLs/s"],
71
- [:explain, LARGE, "URLs/s"],
72
- [:observe_small, SMALL, "URLs/s"],
73
- [:observe_large, LARGE, "URLs/s"],
74
- [:observe_huge, HUGE, "URLs/s"],
75
- ].each do |key, n, unit|
76
- per_sec = n / results[key].real
77
- printf(" %-30s %12s %s\n", key, per_sec.round.to_s, unit)
78
- end
79
-
80
- extract_mb = text_blob.bytesize / (1024.0 * 1024.0)
81
- printf(" %-30s %12s MB/s\n", :extract, (extract_mb / results[:extract].real).round(2).to_s)
data/script/memory.rb DELETED
@@ -1,121 +0,0 @@
1
- #!/usr/bin/env ruby
2
- # Memory profile for the main code paths in Iriq.
3
- #
4
- # Usage:
5
- # bundle exec script/memory.rb # default sizes
6
- # bundle exec script/memory.rb 50000 # custom corpus size
7
- #
8
- # Reports retained memory per operation, cache footprints, and memory
9
- # growth across corpus sizes (to verify linear scaling — no leaks).
10
-
11
- require "objspace"
12
-
13
- $LOAD_PATH.unshift File.expand_path("../lib", __dir__)
14
- $LOAD_PATH.unshift File.expand_path("../spec/support", __dir__)
15
- require "iriq"
16
- require "iri_generator"
17
-
18
- CORPUS_SIZE = Integer(ARGV[0] || 10_000)
19
- SIZES = [1_000, 10_000, 100_000].uniq.sort
20
- SIZES << CORPUS_SIZE unless SIZES.include?(CORPUS_SIZE)
21
- SIZES.sort!
22
-
23
- # Bytes → KB / MB string for display.
24
# Formats a byte count for display: plain bytes below 1024, KB with one
# decimal below 1024 * 1024, otherwise MB with two decimals.
def fmt_bytes(n)
  kilobyte = 1024
  megabyte = kilobyte * kilobyte

  return "#{n} B" if n < kilobyte
  return format("%.1f KB", n / 1024.0) if n < megabyte

  format("%.2f MB", n / (1024.0 * 1024.0))
end
33
-
34
- # Run a block in isolation: GC before + after, return delta in bytes.
35
# Runs the block between two GC-then-measure snapshots of
# ObjectSpace.memsize_of_all and returns [byte delta, block result].
def measure_retained(&block)
  snapshot = lambda do
    GC.start
    ObjectSpace.memsize_of_all
  end

  baseline = snapshot.call
  outcome = block.call
  [snapshot.call - baseline, outcome]
end
43
-
44
- # Reset caches so each scenario starts clean.
45
# Empties the @cache of the default segment classifier and, when it has
# been set, the @cache of the inflector.
def reset_caches
  classifier_cache = Iriq::SegmentClassifier::DEFAULT.instance_variable_get(:@cache)
  classifier_cache.clear

  inflector_cache = Iriq::Inflector.instance_variable_get(:@cache)
  inflector_cache&.clear
end
49
-
50
- puts "Iriq memory profile — Ruby #{RUBY_VERSION}, Iriq #{Iriq::VERSION}"
51
- puts
52
-
53
- # -- Section 1: memory growth across corpus sizes --
54
- puts "── corpus retained memory by N (verifies linear growth) ──"
55
- printf(" %-12s %-14s %-14s %-10s\n", "N obs", "retained", "per obs", "allocs")
56
- SIZES.each do |n|
57
- reset_caches
58
- urls = IriGenerator.urls(count: n, seed: 1)
59
- alloc_before = GC.stat(:total_allocated_objects)
60
- retained, _ = measure_retained do
61
- c = Iriq::Corpus.new
62
- urls.each { |u| c.observe(u) }
63
- c
64
- end
65
- alloc_total = GC.stat(:total_allocated_objects) - alloc_before
66
- printf(" %-12s %-14s %-14s %-10s\n", n, fmt_bytes(retained), fmt_bytes(retained / n), alloc_total)
67
- end
68
- puts
69
-
70
- # -- Section 2: corpus state breakdown at CORPUS_SIZE --
71
- puts "── corpus state breakdown at N=#{CORPUS_SIZE} ──"
72
- reset_caches
73
- urls = IriGenerator.urls(count: CORPUS_SIZE, seed: 1)
74
- corpus = Iriq::Corpus.new
75
- urls.each { |u| corpus.observe(u) }
76
- puts " unique hosts: #{corpus.host_counts.size}"
77
- puts " unique fingerprints: #{corpus.fingerprint_counts.size}"
78
- puts " unique raw shapes: #{corpus.raw_shape_counts.size}"
79
- puts " clusters: #{corpus.size}"
80
- puts " position_stats entries: #{corpus.position_stats.size}"
81
- puts " total observed values: #{corpus.position_stats.sum { |_, s| s.value_counts.size }}"
82
- puts
83
-
84
- # -- Section 3: cache footprints --
85
- puts "── memoization caches ──"
86
- classifier_cache = Iriq::SegmentClassifier::DEFAULT.instance_variable_get(:@cache)
87
- inflector_cache = Iriq::Inflector.instance_variable_get(:@cache) || {}
88
- puts " classifier cache: #{classifier_cache.size} entries (cap #{Iriq::SegmentClassifier::CACHE_MAX})"
89
- puts " inflector cache: #{inflector_cache.size} entries (cap #{Iriq::Inflector::CACHE_MAX})"
90
- puts
91
-
92
- # -- Section 4: per-operation memory cost --
93
- puts "── retained memory per operation (N=#{CORPUS_SIZE}) ──"
94
- urls = IriGenerator.urls(count: CORPUS_SIZE, seed: 1)
95
- text_blob = urls.map { |u| "Some prose about #{u} here." }.join(" ")
96
-
97
- [
98
- ["parse #{CORPUS_SIZE} URLs (discarded after)", ->{ urls.each { |u| Iriq.parse(u) } }],
99
- ["normalize #{CORPUS_SIZE} URLs", ->{ urls.each { |u| Iriq.normalize(u) } }],
100
- ["explain #{CORPUS_SIZE} URLs", ->{ urls.each { |u| Iriq.explain(u) } }],
101
- ["extract from #{fmt_bytes(text_blob.bytesize)} prose", ->{ Iriq.extract(text_blob) }],
102
- ["Corpus.observe #{CORPUS_SIZE} URLs", ->{ c = Iriq::Corpus.new; urls.each { |u| c.observe(u) }; c }],
103
- ].each do |label, op|
104
- reset_caches
105
- retained, _ = measure_retained(&op)
106
- printf(" %-50s %s\n", label, fmt_bytes(retained))
107
- end
108
- puts
109
-
110
- # -- Section 5: persistence overhead --
111
- puts "── save/load roundtrip (N=#{CORPUS_SIZE}) ──"
112
- require "tempfile"
113
- reset_caches
114
- corpus = Iriq::Corpus.new
115
- urls.each { |u| corpus.observe(u) }
116
- Tempfile.open(["iriq-mem", ".json"]) do |f|
117
- corpus.save(f.path)
118
- bytes = File.size(f.path)
119
- puts " JSON file on disk: #{fmt_bytes(bytes)}"
120
- puts " ratio: #{format("%.2f bytes/obs", bytes.to_f / CORPUS_SIZE)}"
121
- end