iriq 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/CLAUDE.md +121 -0
- data/Gemfile.lock +8 -2
- data/Makefile +56 -0
- data/README.md +112 -11
- data/iriq.gemspec +4 -3
- data/lib/iriq/cli.rb +6 -5
- data/lib/iriq/cluster.rb +24 -0
- data/lib/iriq/clusterer.rb +19 -44
- data/lib/iriq/corpus.rb +123 -69
- data/lib/iriq/parser.rb +1 -1
- data/lib/iriq/storage/json.rb +43 -0
- data/lib/iriq/storage/memory.rb +138 -0
- data/lib/iriq/storage/sqlite.rb +367 -0
- data/lib/iriq/storage.rb +35 -0
- data/lib/iriq/version.rb +1 -1
- data/lib/iriq.rb +1 -0
- metadata +23 -6
- data/script/benchmark.rb +0 -81
- data/script/memory.rb +0 -121
|
@@ -0,0 +1,367 @@
|
|
|
1
|
+
require "sqlite3"
|
|
2
|
+
|
|
3
|
+
module Iriq
|
|
4
|
+
module Storage
|
|
5
|
+
# Sqlite is the incremental-write backend. Each observation translates
|
|
6
|
+
# to a handful of UPSERTs against a long-lived connection; nothing is
|
|
7
|
+
# materialized in memory beyond what reads explicitly ask for.
|
|
8
|
+
#
|
|
9
|
+
# WAL journaling lets multiple processes observe against the same file
|
|
10
|
+
# concurrently — the writer is serialized, readers are not blocked, and
|
|
11
|
+
# the existing `iriq --corpus c.db <url>` pattern works without a flock
|
|
12
|
+
# at the application layer.
|
|
13
|
+
class Sqlite
|
|
14
|
+
SCHEMA_VERSION = 1
|
|
15
|
+
|
|
16
|
+
SCHEMA = <<~SQL.freeze
|
|
17
|
+
CREATE TABLE IF NOT EXISTS meta (
|
|
18
|
+
key TEXT PRIMARY KEY,
|
|
19
|
+
value TEXT
|
|
20
|
+
);
|
|
21
|
+
CREATE TABLE IF NOT EXISTS host_counts (
|
|
22
|
+
host TEXT PRIMARY KEY,
|
|
23
|
+
count INTEGER NOT NULL
|
|
24
|
+
);
|
|
25
|
+
CREATE TABLE IF NOT EXISTS path_length_counts (
|
|
26
|
+
length INTEGER PRIMARY KEY,
|
|
27
|
+
count INTEGER NOT NULL
|
|
28
|
+
);
|
|
29
|
+
CREATE TABLE IF NOT EXISTS raw_shape_counts (
|
|
30
|
+
shape TEXT PRIMARY KEY,
|
|
31
|
+
count INTEGER NOT NULL
|
|
32
|
+
);
|
|
33
|
+
CREATE TABLE IF NOT EXISTS fingerprint_counts (
|
|
34
|
+
shape TEXT PRIMARY KEY,
|
|
35
|
+
count INTEGER NOT NULL
|
|
36
|
+
);
|
|
37
|
+
CREATE TABLE IF NOT EXISTS position_stats (
|
|
38
|
+
host TEXT NOT NULL,
|
|
39
|
+
prefix TEXT NOT NULL,
|
|
40
|
+
total INTEGER NOT NULL DEFAULT 0,
|
|
41
|
+
PRIMARY KEY (host, prefix)
|
|
42
|
+
);
|
|
43
|
+
CREATE TABLE IF NOT EXISTS position_values (
|
|
44
|
+
host TEXT NOT NULL,
|
|
45
|
+
prefix TEXT NOT NULL,
|
|
46
|
+
value TEXT NOT NULL,
|
|
47
|
+
count INTEGER NOT NULL,
|
|
48
|
+
PRIMARY KEY (host, prefix, value)
|
|
49
|
+
);
|
|
50
|
+
CREATE TABLE IF NOT EXISTS position_types (
|
|
51
|
+
host TEXT NOT NULL,
|
|
52
|
+
prefix TEXT NOT NULL,
|
|
53
|
+
type TEXT NOT NULL,
|
|
54
|
+
count INTEGER NOT NULL,
|
|
55
|
+
PRIMARY KEY (host, prefix, type)
|
|
56
|
+
);
|
|
57
|
+
CREATE TABLE IF NOT EXISTS clusters (
|
|
58
|
+
key TEXT PRIMARY KEY,
|
|
59
|
+
host TEXT,
|
|
60
|
+
scheme TEXT,
|
|
61
|
+
shape TEXT,
|
|
62
|
+
count INTEGER NOT NULL DEFAULT 0,
|
|
63
|
+
ord INTEGER NOT NULL
|
|
64
|
+
);
|
|
65
|
+
CREATE TABLE IF NOT EXISTS cluster_examples (
|
|
66
|
+
cluster_key TEXT NOT NULL,
|
|
67
|
+
position INTEGER NOT NULL,
|
|
68
|
+
canonical TEXT NOT NULL,
|
|
69
|
+
PRIMARY KEY (cluster_key, position)
|
|
70
|
+
);
|
|
71
|
+
CREATE TABLE IF NOT EXISTS cluster_segments (
|
|
72
|
+
cluster_key TEXT NOT NULL,
|
|
73
|
+
position INTEGER NOT NULL,
|
|
74
|
+
value TEXT NOT NULL,
|
|
75
|
+
count INTEGER NOT NULL,
|
|
76
|
+
PRIMARY KEY (cluster_key, position, value)
|
|
77
|
+
);
|
|
78
|
+
SQL
|
|
79
|
+
|
|
80
|
+
attr_reader :path, :max_values_per_position
|
|
81
|
+
|
|
82
|
+
# Opens the database at +path+, applies the schema via #setup!, and returns
# the ready storage instance.
#
# If #setup! raises, the freshly opened connection is closed before the
# error propagates so a failed open does not leak a database handle.
#
# path                    - location of the SQLite file (passed to #initialize)
# classifier              - forwarded unchanged to #initialize
# max_values_per_position - forwarded unchanged to #initialize
#
# Returns the initialized storage. Re-raises whatever #setup! raised.
def self.open(path, classifier: SegmentClassifier::DEFAULT,
              max_values_per_position: PositionStats::DEFAULT_MAX_VALUES)
  storage = new(path: path, classifier: classifier, max_values_per_position: max_values_per_position)
  begin
    storage.setup!
  rescue StandardError
    # Best-effort cleanup: the setup failure is the error worth reporting,
    # so a secondary failure while closing is deliberately ignored.
    begin
      storage.close
    rescue StandardError
      nil
    end
    raise
  end
  storage
end
|
|
86
|
+
|
|
87
|
+
# Stores the configuration, opens a SQLite3 connection to +path+, and applies
# the connection-level PRAGMAs. The schema is not touched here; see #setup!.
def initialize(path:, classifier: SegmentClassifier::DEFAULT,
               max_values_per_position: PositionStats::DEFAULT_MAX_VALUES)
  @path = path
  @classifier = classifier
  @max_values_per_position = max_values_per_position
  @in_batch = false
  @db = SQLite3::Database.new(path)

  # Order matters. busy_timeout MUST be applied first: the later PRAGMAs
  # (journal_mode in particular) can block on the write lock when several
  # processes open the file at once, and without a busy timeout they fail
  # immediately with SQLITE_BUSY.
  [
    "PRAGMA busy_timeout = 30000",
    "PRAGMA journal_mode = WAL",
    "PRAGMA synchronous = NORMAL",
    "PRAGMA foreign_keys = ON",
  ].each { |pragma| @db.execute(pragma) }
end
|
|
103
|
+
|
|
104
|
+
# Creates the schema if needed and reconciles the per-position value cap with
# whatever is recorded in the `meta` table.
#
# On a database with no `schema_version` row, the schema version and this
# instance's cap are written. The cap is then read back from `meta`, so an
# existing database (or a concurrent opener that got there first) wins over
# the value passed to the constructor.
#
# Returns self.
def setup!
  @db.execute_batch(SCHEMA)

  if @db.get_first_value("SELECT value FROM meta WHERE key = 'schema_version'").nil?
    # OR IGNORE: two processes opening a brand-new file can both observe the
    # missing row; a plain INSERT would make the slower one fail with a
    # primary-key violation on `meta.key`.
    @db.execute("INSERT OR IGNORE INTO meta (key, value) VALUES ('schema_version', ?)", SCHEMA_VERSION.to_s)
    @db.execute("INSERT OR IGNORE INTO meta (key, value) VALUES ('max_values_per_position', ?)",
                @max_values_per_position.to_s)
  end

  # Always adopt the stored cap; fall back to the constructor value only when
  # the row is absent.
  stored_cap = @db.get_first_value("SELECT value FROM meta WHERE key = 'max_values_per_position'")
  @max_values_per_position = (stored_cap || @max_values_per_position).to_i
  self
end
|
|
118
|
+
|
|
119
|
+
# Runs the block inside a database transaction and returns the block's value.
#
# While an outer #batch is active this is a pure pass-through: the batch owns
# the single enclosing transaction, so nothing is begun, committed, or rolled
# back here. In particular, a failure inside the block must NOT roll back the
# batch's transaction from underneath it; the exception simply propagates and
# the batch decides what to do.
#
# Outside a batch, the transaction is committed when the block returns and
# rolled back (best effort) when it raises; the exception is re-raised.
#
# Yields self.
def transaction
  return yield(self) if @in_batch

  begin
    @db.transaction
    result = yield self
    @db.commit
    result
  rescue StandardError
    # Rollback is best effort: BEGIN itself may have failed, in which case
    # there is nothing to roll back and the original error is what matters.
    begin
      @db.rollback
    rescue StandardError
      nil
    end
    raise
  end
end
|
|
131
|
+
|
|
132
|
+
# Wrap many observations in a single transaction. Cuts SQLite write
|
|
133
|
+
# overhead from O(observations) fsyncs to O(1).
|
|
134
|
+
# Runs the block with every nested #transaction / #batch folded into one
# enclosing database transaction, and returns the block's value.
#
# A nested call (while a batch is already active) just yields. Otherwise the
# transaction is committed when the block returns and rolled back (best
# effort) when it raises; the exception is re-raised.
#
# The @in_batch flag is set inside the begin/ensure so that it is always
# reset, including when BEGIN itself fails. A flag left stuck at true would
# silently turn every later #transaction and #batch into a pass-through with
# no transaction at all.
def batch
  return yield if @in_batch

  begin
    @in_batch = true
    @db.transaction
    result = yield
    @db.commit
    result
  rescue StandardError
    # Best effort: there may be no open transaction if BEGIN failed.
    begin
      @db.rollback
    rescue StandardError
      nil
    end
    raise
  ensure
    @in_batch = false
  end
end
|
|
149
|
+
|
|
150
|
+
# Saving is automatic — incremental UPSERTs hit disk on commit. flush
|
|
151
|
+
# makes that explicit; close releases the connection.
|
|
152
|
+
# Intentional no-op: this backend has nothing buffered to write out.
# Returns nil.
def flush
  nil
end
|
|
153
|
+
|
|
154
|
+
# Intentional no-op kept for parity with the JSON backend: the data is
# already persisted, so the optional path argument is ignored. Returns nil.
def save(_path = nil)
  nil
end
|
|
157
|
+
|
|
158
|
+
# Releases the database connection.
#
# Before closing, the WAL is checkpointed and truncated so the .db-wal
# sidecar doesn't grow unbounded across long-lived `iriq --corpus c.db`
# sessions. The checkpoint is best effort: a failure there is swallowed and
# the connection is closed regardless.
def close
  begin
    @db.execute("PRAGMA wal_checkpoint(TRUNCATE)")
  rescue StandardError
    nil
  end
  @db.close
end
|
|
164
|
+
|
|
165
|
+
# --- Increments -------------------------------------------------------
|
|
166
|
+
|
|
167
|
+
# Adds one to the counter for +host+, creating the row at 1 when it is new.
# Does nothing (returns nil) when +host+ is nil or false.
def increment_host(host)
  return nil unless host

  upsert = <<~SQL
    INSERT INTO host_counts (host, count) VALUES (?, 1)
    ON CONFLICT(host) DO UPDATE SET count = count + 1
  SQL
  @db.execute(upsert, host)
end
|
|
175
|
+
|
|
176
|
+
# Adds one to the counter for paths of the given +length+, creating the row
# at 1 when that length has not been recorded before.
def increment_path_length(length)
  upsert = <<~SQL
    INSERT INTO path_length_counts (length, count) VALUES (?, 1)
    ON CONFLICT(length) DO UPDATE SET count = count + 1
  SQL
  @db.execute(upsert, length)
end
|
|
182
|
+
|
|
183
|
+
# Adds one to the raw-shape counter for +shape+ (table: raw_shape_counts).
def increment_raw_shape(shape)
  table = "raw_shape_counts"
  upsert_shape(table, shape)
end
|
|
186
|
+
|
|
187
|
+
# Adds one to the fingerprint counter for +shape+ (table: fingerprint_counts).
def increment_fingerprint(shape)
  table = "fingerprint_counts"
  upsert_shape(table, shape)
end
|
|
190
|
+
|
|
191
|
+
# Records one observation of +value+ (classified as +type+) at the position
# identified by (host, prefix). A nil host is stored as the empty string.
#
# Three tables are touched, in this order:
#   1. position_stats  - total observations for the position
#   2. position_types  - per-type counts (uncapped)
#   3. position_values - per-value counts, capped at max_values_per_position
def observe_position(host, prefix, value, type)
  host = host || ""
  position_key = [host, prefix]

  @db.execute(<<~SQL, position_key)
    INSERT INTO position_stats (host, prefix, total) VALUES (?, ?, 1)
    ON CONFLICT(host, prefix) DO UPDATE SET total = total + 1
  SQL

  # Type counts are unbounded, so a plain upsert is enough.
  @db.execute(<<~SQL, position_key + [type.to_s])
    INSERT INTO position_types (host, prefix, type, count) VALUES (?, ?, ?, 1)
    ON CONFLICT(host, prefix, type) DO UPDATE SET count = count + 1
  SQL

  # Value counts are capped. Try to bump an existing row first; only when no
  # row was changed do we consider inserting, and only while the number of
  # distinct values stored for this position is still below the cap. This is
  # two-step rather than ON CONFLICT because the cap has to be enforced at
  # insert time.
  value_key = position_key + [value]
  @db.execute(<<~SQL, value_key)
    UPDATE position_values SET count = count + 1
    WHERE host = ? AND prefix = ? AND value = ?
  SQL
  return unless @db.changes.zero?

  distinct_values = @db.get_first_value(
    "SELECT COUNT(*) FROM position_values WHERE host = ? AND prefix = ?",
    position_key,
  )
  return unless distinct_values < @max_values_per_position

  @db.execute(
    "INSERT INTO position_values (host, prefix, value, count) VALUES (?, ?, ?, 1)",
    value_key,
  )
end
|
|
225
|
+
|
|
226
|
+
# Records +identifier+ as a member of the cluster identified by +key+ and
# returns the cluster as reloaded from the database.
#
# Steps, in order:
#   1. Upsert the cluster row: a new cluster starts at count 1 with the next
#      `ord` (current MAX(ord) + 1, for stable iteration order); an existing
#      one has its count bumped.
#   2. Store identifier.canonical as an example while fewer than
#      Cluster::MAX_EXAMPLES are stored; the example's position is the number
#      of examples stored before it.
#   3. Upsert a per-position count for every path segment (uncapped).
def add_to_cluster(key, host, scheme, shape, identifier)
  cluster_upsert = <<~SQL
    INSERT INTO clusters (key, host, scheme, shape, count, ord)
    VALUES (?, ?, ?, ?, 1, (SELECT COALESCE(MAX(ord), 0) + 1 FROM clusters))
    ON CONFLICT(key) DO UPDATE SET count = count + 1
  SQL
  @db.execute(cluster_upsert, [key, host, scheme, shape])

  stored_examples = @db.get_first_value(
    "SELECT COUNT(*) FROM cluster_examples WHERE cluster_key = ?", [key],
  )
  unless stored_examples >= Cluster::MAX_EXAMPLES
    example_insert = <<~SQL
      INSERT INTO cluster_examples (cluster_key, position, canonical)
      VALUES (?, ?, ?)
    SQL
    @db.execute(example_insert, [key, stored_examples, identifier.canonical])
  end

  segment_upsert = <<~SQL
    INSERT INTO cluster_segments (cluster_key, position, value, count) VALUES (?, ?, ?, 1)
    ON CONFLICT(cluster_key, position, value) DO UPDATE SET count = count + 1
  SQL
  identifier.path_segments.each_with_index do |segment, index|
    @db.execute(segment_upsert, [key, index, segment])
  end

  load_cluster(key)
end
|
|
256
|
+
|
|
257
|
+
# --- Reads ------------------------------------------------------------
|
|
258
|
+
|
|
259
|
+
# Returns a Hash (default 0) mapping each stored host to its count.
def host_counts
  table, key_column = "host_counts", "host"
  rows_to_count_hash(table, key_column)
end
|
|
262
|
+
|
|
263
|
+
# Returns a Hash (default 0) mapping each stored path length to its count.
# Goes through the shared helper, which issues
# "SELECT length, count FROM path_length_counts".
def path_length_counts
  rows_to_count_hash("path_length_counts", "length")
end
|
|
268
|
+
|
|
269
|
+
# Returns a Hash (default 0) mapping each stored raw shape to its count.
def raw_shape_counts
  table, key_column = "raw_shape_counts", "shape"
  rows_to_count_hash(table, key_column)
end
|
|
272
|
+
|
|
273
|
+
# Returns a Hash (default 0) mapping each stored fingerprint shape to its count.
def fingerprint_counts
  table, key_column = "fingerprint_counts", "shape"
  rows_to_count_hash(table, key_column)
end
|
|
276
|
+
|
|
277
|
+
# Rebuilds the PositionStats for the position (host, prefix) from the
# database, or returns nil when no position_stats row exists for it. A nil
# host is looked up as the empty string.
#
# The PositionStats object is populated by assigning its @total,
# @value_counts and @type_counts instance variables directly; type names are
# converted back to symbols.
def position_stats(host, prefix)
  position_key = [host || "", prefix]

  observed = @db.get_first_value(
    "SELECT total FROM position_stats WHERE host = ? AND prefix = ?", position_key,
  )
  return nil if observed.nil?

  value_counts = Hash.new(0)
  @db.execute(
    "SELECT value, count FROM position_values WHERE host = ? AND prefix = ?", position_key
  ) { |row| value_counts[row[0]] = row[1] }

  type_counts = Hash.new(0)
  @db.execute(
    "SELECT type, count FROM position_types WHERE host = ? AND prefix = ?", position_key
  ) { |row| type_counts[row[0].to_sym] = row[1] }

  PositionStats.new(max_values: @max_values_per_position).tap do |stats|
    stats.instance_variable_set(:@total, observed)
    stats.instance_variable_set(:@value_counts, value_counts)
    stats.instance_variable_set(:@type_counts, type_counts)
  end
end
|
|
301
|
+
|
|
302
|
+
# Yields `[host, prefix], stats` for every stored position, in ROWID order,
# where stats is the value returned by #position_stats for that position.
#
# All (host, prefix) keys are read before the first yield, so the per-position
# lookups do not run while the key query is still being stepped.
#
# Without a block, returns an Enumerator over the same pairs instead of
# raising LocalJumpError.
def each_position_stats
  return enum_for(:each_position_stats) unless block_given?

  position_keys = @db.execute("SELECT DISTINCT host, prefix FROM position_stats ORDER BY ROWID")
  position_keys.each { |host, prefix| yield [host, prefix], position_stats(host, prefix) }
end
|
|
309
|
+
|
|
310
|
+
# Returns every stored cluster as an Array, ordered by `ord`; each one is
# materialized through load_cluster.
def clusters
  [].tap do |loaded|
    @db.execute("SELECT key FROM clusters ORDER BY ord") do |row|
      loaded << load_cluster(row[0])
    end
  end
end
|
|
317
|
+
|
|
318
|
+
# Returns the number of rows in the clusters table.
def cluster_size
  count_query = "SELECT COUNT(*) FROM clusters"
  @db.get_first_value(count_query)
end
|
|
321
|
+
|
|
322
|
+
private
|
|
323
|
+
|
|
324
|
+
# Adds one to the counter for +shape+ in +table+, creating the row at 1 when
# the shape is new. +table+ is interpolated into the SQL text, so it must be
# a trusted table name; only +shape+ is bound as a parameter.
def upsert_shape(table, shape)
  upsert = <<~SQL
    INSERT INTO #{table} (shape, count) VALUES (?, 1)
    ON CONFLICT(shape) DO UPDATE SET count = count + 1
  SQL
  @db.execute(upsert, shape)
end
|
|
330
|
+
|
|
331
|
+
# Reads every (key_col, count) row of +table+ into a Hash whose default value
# is 0. Both +table+ and +key_col+ are interpolated into the SQL text, so they
# must be trusted identifiers.
def rows_to_count_hash(table, key_col)
  counts = Hash.new(0)
  query = "SELECT #{key_col}, count FROM #{table}"
  @db.execute(query) do |row|
    counts[row[0]] = row[1]
  end
  counts
end
|
|
336
|
+
|
|
337
|
+
# Rebuilds the Cluster stored under +key+, or returns nil when there is no
# such row in the clusters table.
#
# The Cluster is created from the row's key/host/scheme/shape and then has
# its @count, @examples and @segment_counts instance variables assigned
# directly:
#   - @examples:       stored canonical strings, re-parsed with Parser.parse,
#                      in stored position order
#   - @segment_counts: Array indexed by segment position; each entry is a
#                      Hash (default 0) of segment value => count
def load_cluster(key)
  record = @db.get_first_row(
    "SELECT key, host, scheme, shape, count FROM clusters WHERE key = ?", [key],
  )
  return nil unless record

  cluster = Cluster.new(key: record[0], host: record[1], scheme: record[2], shape: record[3])
  cluster.instance_variable_set(:@count, record[4])

  parsed_examples = []
  @db.execute(
    "SELECT canonical FROM cluster_examples WHERE cluster_key = ? ORDER BY position", [key]
  ) do |example_row|
    parsed_examples << Parser.parse(example_row[0])
  end
  cluster.instance_variable_set(:@examples, parsed_examples)

  counts_by_position = []
  @db.execute(
    "SELECT position, value, count FROM cluster_segments WHERE cluster_key = ? ORDER BY position",
    [key],
  ) do |segment_row|
    position, segment, seen = segment_row[0], segment_row[1], segment_row[2]
    bucket = (counts_by_position[position] ||= Hash.new(0))
    bucket[segment] = seen
  end
  cluster.instance_variable_set(:@segment_counts, counts_by_position)

  cluster
end
|
|
365
|
+
end
|
|
366
|
+
end
|
|
367
|
+
end
|
data/lib/iriq/storage.rb
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
module Iriq
|
|
2
|
+
# Storage is the persistence layer for a Corpus. It owns every counter and
|
|
3
|
+
# per-(host, prefix) frequency map; the Corpus class delegates state to it.
|
|
4
|
+
#
|
|
5
|
+
# Three concrete backends ship:
|
|
6
|
+
#
|
|
7
|
+
# Storage::Memory — in-memory only; matches the original behavior.
|
|
8
|
+
# Storage::Json — Memory backend wrapped with load/save against a JSON file.
|
|
9
|
+
# Storage::Sqlite — incremental UPSERTs against a SQLite database.
|
|
10
|
+
#
|
|
11
|
+
# File-extension dispatch keeps callers simple: `.json` (or anything else)
|
|
12
|
+
# picks Json, `.db`/`.sqlite`/`.sqlite3` picks Sqlite.
|
|
13
|
+
module Storage
|
|
14
|
+
SQLITE_EXTS = %w[.db .sqlite .sqlite3].freeze
|
|
15
|
+
|
|
16
|
+
module_function
|
|
17
|
+
|
|
18
|
+
# Opens (or creates) a storage at `path`, picking the backend by extension.
|
|
19
|
+
# If `path` is nil, returns a Memory backend.
|
|
20
|
+
# Opens (or creates) a storage for +path+ and returns it.
#
#   - nil path                          -> Memory backend
#   - extension listed in SQLITE_EXTS   -> Sqlite backend (case-insensitive)
#   - anything else                     -> Json backend
#
# The Sqlite and Json backends are required lazily, only when selected.
def open(path, classifier: SegmentClassifier::DEFAULT,
         max_values_per_position: PositionStats::DEFAULT_MAX_VALUES)
  options = { classifier: classifier, max_values_per_position: max_values_per_position }
  return Memory.new(**options) if path.nil?

  extension = File.extname(path).downcase
  unless SQLITE_EXTS.include?(extension)
    require "iriq/storage/json"
    return Json.open(path, **options)
  end

  require "iriq/storage/sqlite"
  Sqlite.open(path, **options)
end
|
|
32
|
+
end
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
require "iriq/storage/memory"
|
data/lib/iriq/version.rb
CHANGED
data/lib/iriq.rb
CHANGED
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: iriq
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.0
|
|
4
|
+
version: 0.2.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Daniel Pepper
|
|
@@ -65,17 +65,32 @@ dependencies:
|
|
|
65
65
|
- - ">="
|
|
66
66
|
- !ruby/object:Gem::Version
|
|
67
67
|
version: '0.22'
|
|
68
|
-
|
|
69
|
-
|
|
68
|
+
- !ruby/object:Gem::Dependency
|
|
69
|
+
name: sqlite3
|
|
70
|
+
requirement: !ruby/object:Gem::Requirement
|
|
71
|
+
requirements:
|
|
72
|
+
- - ">="
|
|
73
|
+
- !ruby/object:Gem::Version
|
|
74
|
+
version: '1.6'
|
|
75
|
+
type: :development
|
|
76
|
+
prerelease: false
|
|
77
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
78
|
+
requirements:
|
|
79
|
+
- - ">="
|
|
80
|
+
- !ruby/object:Gem::Version
|
|
81
|
+
version: '1.6'
|
|
82
|
+
description: IRI extraction, normalization, and clustering.
|
|
70
83
|
executables:
|
|
71
84
|
- iriq
|
|
72
85
|
extensions: []
|
|
73
86
|
extra_rdoc_files: []
|
|
74
87
|
files:
|
|
75
88
|
- CHANGELOG.md
|
|
89
|
+
- CLAUDE.md
|
|
76
90
|
- Gemfile
|
|
77
91
|
- Gemfile.lock
|
|
78
92
|
- LICENSE.txt
|
|
93
|
+
- Makefile
|
|
79
94
|
- README.md
|
|
80
95
|
- exe/iriq
|
|
81
96
|
- iriq.gemspec
|
|
@@ -96,9 +111,11 @@ files:
|
|
|
96
111
|
- lib/iriq/position_stats.rb
|
|
97
112
|
- lib/iriq/segment_classifier.rb
|
|
98
113
|
- lib/iriq/segment_hints.rb
|
|
114
|
+
- lib/iriq/storage.rb
|
|
115
|
+
- lib/iriq/storage/json.rb
|
|
116
|
+
- lib/iriq/storage/memory.rb
|
|
117
|
+
- lib/iriq/storage/sqlite.rb
|
|
99
118
|
- lib/iriq/version.rb
|
|
100
|
-
- script/benchmark.rb
|
|
101
|
-
- script/memory.rb
|
|
102
119
|
homepage: https://github.com/dpep/iriq
|
|
103
120
|
licenses:
|
|
104
121
|
- MIT
|
|
@@ -119,5 +136,5 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
119
136
|
requirements: []
|
|
120
137
|
rubygems_version: 3.6.9
|
|
121
138
|
specification_version: 4
|
|
122
|
-
summary:
|
|
139
|
+
summary: IRI extraction, normalization, and clustering.
|
|
123
140
|
test_files: []
|
data/script/benchmark.rb
DELETED
|
@@ -1,81 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env ruby
|
|
2
|
-
# Performance benchmark for the main hot paths in Iriq.
|
|
3
|
-
#
|
|
4
|
-
# Usage:
|
|
5
|
-
# bundle exec script/benchmark.rb # default sizes
|
|
6
|
-
# bundle exec script/benchmark.rb 50000 # custom "large" size
|
|
7
|
-
#
|
|
8
|
-
# Inputs are generated deterministically from IriGenerator so results are
|
|
9
|
-
# comparable across runs.
|
|
10
|
-
|
|
11
|
-
require "benchmark"
|
|
12
|
-
require "tempfile"
|
|
13
|
-
|
|
14
|
-
$LOAD_PATH.unshift File.expand_path("../lib", __dir__)
|
|
15
|
-
$LOAD_PATH.unshift File.expand_path("../spec/support", __dir__)
|
|
16
|
-
require "iriq"
|
|
17
|
-
require "iri_generator"
|
|
18
|
-
|
|
19
|
-
LARGE = Integer(ARGV[0] || 10_000)
|
|
20
|
-
SMALL = [LARGE / 10, 1_000].min
|
|
21
|
-
HUGE = LARGE * 10
|
|
22
|
-
|
|
23
|
-
puts "Iriq benchmark — Ruby #{RUBY_VERSION}, Iriq #{Iriq::VERSION}"
|
|
24
|
-
puts "Sizes: small=#{SMALL}, large=#{LARGE}, huge=#{HUGE}"
|
|
25
|
-
puts
|
|
26
|
-
|
|
27
|
-
small_urls = IriGenerator.urls(count: SMALL, seed: 1)
|
|
28
|
-
large_urls = IriGenerator.urls(count: LARGE, seed: 1)
|
|
29
|
-
huge_urls = IriGenerator.urls(count: HUGE, seed: 1)
|
|
30
|
-
|
|
31
|
-
# ~ LARGE URLs embedded in prose
|
|
32
|
-
text_blob = small_urls.map { |u| "Some prose about #{u} here, also random words." }.join(" ") * (LARGE / SMALL)
|
|
33
|
-
puts "Text blob: #{text_blob.bytesize / 1024} KB (~#{LARGE} URLs embedded)"
|
|
34
|
-
puts
|
|
35
|
-
|
|
36
|
-
results = {}
|
|
37
|
-
Benchmark.bm(42) do |x|
|
|
38
|
-
results[:parse] = x.report("parse #{LARGE} URLs") { large_urls.each { |u| Iriq.parse(u) } }
|
|
39
|
-
results[:normalize] = x.report("normalize #{LARGE} URLs (deterministic)") { large_urls.each { |u| Iriq.normalize(u) } }
|
|
40
|
-
results[:explain] = x.report("explain #{LARGE} URLs (deterministic)") { large_urls.each { |u| Iriq.explain(u) } }
|
|
41
|
-
results[:extract] = x.report("extract from ~#{text_blob.bytesize / 1024} KB text") { Iriq.extract(text_blob) }
|
|
42
|
-
|
|
43
|
-
results[:observe_small] = x.report("Corpus.observe #{SMALL} URLs") do
|
|
44
|
-
c = Iriq::Corpus.new
|
|
45
|
-
small_urls.each { |u| c.observe(u) }
|
|
46
|
-
end
|
|
47
|
-
results[:observe_large] = x.report("Corpus.observe #{LARGE} URLs") do
|
|
48
|
-
c = Iriq::Corpus.new
|
|
49
|
-
large_urls.each { |u| c.observe(u) }
|
|
50
|
-
end
|
|
51
|
-
results[:observe_huge] = x.report("Corpus.observe #{HUGE} URLs") do
|
|
52
|
-
c = Iriq::Corpus.new
|
|
53
|
-
huge_urls.each { |u| c.observe(u) }
|
|
54
|
-
end
|
|
55
|
-
|
|
56
|
-
results[:roundtrip] = x.report("Corpus save+load (#{LARGE} observations)") do
|
|
57
|
-
c = Iriq::Corpus.new
|
|
58
|
-
large_urls.each { |u| c.observe(u) }
|
|
59
|
-
Tempfile.open(["iriq-bench", ".json"]) do |f|
|
|
60
|
-
c.save(f.path)
|
|
61
|
-
Iriq::Corpus.load(f.path)
|
|
62
|
-
end
|
|
63
|
-
end
|
|
64
|
-
end
|
|
65
|
-
|
|
66
|
-
puts
|
|
67
|
-
puts "Throughput summary:"
|
|
68
|
-
[
|
|
69
|
-
[:parse, LARGE, "URLs/s"],
|
|
70
|
-
[:normalize, LARGE, "URLs/s"],
|
|
71
|
-
[:explain, LARGE, "URLs/s"],
|
|
72
|
-
[:observe_small, SMALL, "URLs/s"],
|
|
73
|
-
[:observe_large, LARGE, "URLs/s"],
|
|
74
|
-
[:observe_huge, HUGE, "URLs/s"],
|
|
75
|
-
].each do |key, n, unit|
|
|
76
|
-
per_sec = n / results[key].real
|
|
77
|
-
printf(" %-30s %12s %s\n", key, per_sec.round.to_s, unit)
|
|
78
|
-
end
|
|
79
|
-
|
|
80
|
-
extract_mb = text_blob.bytesize / (1024.0 * 1024.0)
|
|
81
|
-
printf(" %-30s %12s MB/s\n", :extract, (extract_mb / results[:extract].real).round(2).to_s)
|
data/script/memory.rb
DELETED
|
@@ -1,121 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env ruby
|
|
2
|
-
# Memory profile for the main code paths in Iriq.
|
|
3
|
-
#
|
|
4
|
-
# Usage:
|
|
5
|
-
# bundle exec script/memory.rb # default sizes
|
|
6
|
-
# bundle exec script/memory.rb 50000 # custom corpus size
|
|
7
|
-
#
|
|
8
|
-
# Reports retained memory per operation, cache footprints, and memory
|
|
9
|
-
# growth across corpus sizes (to verify linear scaling — no leaks).
|
|
10
|
-
|
|
11
|
-
require "objspace"
|
|
12
|
-
|
|
13
|
-
$LOAD_PATH.unshift File.expand_path("../lib", __dir__)
|
|
14
|
-
$LOAD_PATH.unshift File.expand_path("../spec/support", __dir__)
|
|
15
|
-
require "iriq"
|
|
16
|
-
require "iri_generator"
|
|
17
|
-
|
|
18
|
-
CORPUS_SIZE = Integer(ARGV[0] || 10_000)
|
|
19
|
-
SIZES = [1_000, 10_000, 100_000].uniq.sort
|
|
20
|
-
SIZES << CORPUS_SIZE unless SIZES.include?(CORPUS_SIZE)
|
|
21
|
-
SIZES.sort!
|
|
22
|
-
|
|
23
|
-
# Bytes → KB / MB string for display.
|
|
24
|
-
# Formats a byte count as a short human-readable string: "N B" below 1 KiB,
# one-decimal "KB" below 1 MiB, two-decimal "MB" otherwise.
#
# The unit is chosen from the magnitude of +n+, so negative values (a
# retained-memory delta can be negative when memory shrinks between the two
# measurements) are scaled like their positive counterparts and keep their
# sign, instead of always being printed in bytes.
def fmt_bytes(n)
  magnitude = n.abs
  if magnitude < 1024
    "#{n} B"
  elsif magnitude < 1024 * 1024
    format("%.1f KB", n / 1024.0)
  else
    format("%.2f MB", n / (1024.0 * 1024.0))
  end
end
|
|
33
|
-
|
|
34
|
-
# Run a block in isolation: GC before + after, return delta in bytes.
|
|
35
|
-
# Runs the block between two GC-then-measure snapshots and returns
# [delta_bytes, block_result], where delta_bytes is the change in
# ObjectSpace.memsize_of_all across the call (it may be negative).
def measure_retained(&block)
  snapshot = lambda do
    GC.start
    ObjectSpace.memsize_of_all
  end

  baseline = snapshot.call
  outcome = block.call
  [snapshot.call - baseline, outcome]
end
|
|
43
|
-
|
|
44
|
-
# Reset caches so each scenario starts clean.
|
|
45
|
-
# Empties the memoization caches so each scenario starts clean: the @cache of
# the default segment classifier, and the @cache of Iriq::Inflector when that
# one has been set (it may still be nil).
def reset_caches
  classifier_cache = Iriq::SegmentClassifier::DEFAULT.instance_variable_get(:@cache)
  classifier_cache.clear

  inflector_cache = Iriq::Inflector.instance_variable_get(:@cache)
  inflector_cache&.clear
end
|
|
49
|
-
|
|
50
|
-
puts "Iriq memory profile — Ruby #{RUBY_VERSION}, Iriq #{Iriq::VERSION}"
|
|
51
|
-
puts
|
|
52
|
-
|
|
53
|
-
# -- Section 1: memory growth across corpus sizes --
|
|
54
|
-
puts "── corpus retained memory by N (verifies linear growth) ──"
|
|
55
|
-
printf(" %-12s %-14s %-14s %-10s\n", "N obs", "retained", "per obs", "allocs")
|
|
56
|
-
SIZES.each do |n|
|
|
57
|
-
reset_caches
|
|
58
|
-
urls = IriGenerator.urls(count: n, seed: 1)
|
|
59
|
-
alloc_before = GC.stat(:total_allocated_objects)
|
|
60
|
-
retained, _ = measure_retained do
|
|
61
|
-
c = Iriq::Corpus.new
|
|
62
|
-
urls.each { |u| c.observe(u) }
|
|
63
|
-
c
|
|
64
|
-
end
|
|
65
|
-
alloc_total = GC.stat(:total_allocated_objects) - alloc_before
|
|
66
|
-
printf(" %-12s %-14s %-14s %-10s\n", n, fmt_bytes(retained), fmt_bytes(retained / n), alloc_total)
|
|
67
|
-
end
|
|
68
|
-
puts
|
|
69
|
-
|
|
70
|
-
# -- Section 2: corpus state breakdown at CORPUS_SIZE --
|
|
71
|
-
puts "── corpus state breakdown at N=#{CORPUS_SIZE} ──"
|
|
72
|
-
reset_caches
|
|
73
|
-
urls = IriGenerator.urls(count: CORPUS_SIZE, seed: 1)
|
|
74
|
-
corpus = Iriq::Corpus.new
|
|
75
|
-
urls.each { |u| corpus.observe(u) }
|
|
76
|
-
puts " unique hosts: #{corpus.host_counts.size}"
|
|
77
|
-
puts " unique fingerprints: #{corpus.fingerprint_counts.size}"
|
|
78
|
-
puts " unique raw shapes: #{corpus.raw_shape_counts.size}"
|
|
79
|
-
puts " clusters: #{corpus.size}"
|
|
80
|
-
puts " position_stats entries: #{corpus.position_stats.size}"
|
|
81
|
-
puts " total observed values: #{corpus.position_stats.sum { |_, s| s.value_counts.size }}"
|
|
82
|
-
puts
|
|
83
|
-
|
|
84
|
-
# -- Section 3: cache footprints --
|
|
85
|
-
puts "── memoization caches ──"
|
|
86
|
-
classifier_cache = Iriq::SegmentClassifier::DEFAULT.instance_variable_get(:@cache)
|
|
87
|
-
inflector_cache = Iriq::Inflector.instance_variable_get(:@cache) || {}
|
|
88
|
-
puts " classifier cache: #{classifier_cache.size} entries (cap #{Iriq::SegmentClassifier::CACHE_MAX})"
|
|
89
|
-
puts " inflector cache: #{inflector_cache.size} entries (cap #{Iriq::Inflector::CACHE_MAX})"
|
|
90
|
-
puts
|
|
91
|
-
|
|
92
|
-
# -- Section 4: per-operation memory cost --
|
|
93
|
-
puts "── retained memory per operation (N=#{CORPUS_SIZE}) ──"
|
|
94
|
-
urls = IriGenerator.urls(count: CORPUS_SIZE, seed: 1)
|
|
95
|
-
text_blob = urls.map { |u| "Some prose about #{u} here." }.join(" ")
|
|
96
|
-
|
|
97
|
-
[
|
|
98
|
-
["parse #{CORPUS_SIZE} URLs (discarded after)", ->{ urls.each { |u| Iriq.parse(u) } }],
|
|
99
|
-
["normalize #{CORPUS_SIZE} URLs", ->{ urls.each { |u| Iriq.normalize(u) } }],
|
|
100
|
-
["explain #{CORPUS_SIZE} URLs", ->{ urls.each { |u| Iriq.explain(u) } }],
|
|
101
|
-
["extract from #{fmt_bytes(text_blob.bytesize)} prose", ->{ Iriq.extract(text_blob) }],
|
|
102
|
-
["Corpus.observe #{CORPUS_SIZE} URLs", ->{ c = Iriq::Corpus.new; urls.each { |u| c.observe(u) }; c }],
|
|
103
|
-
].each do |label, op|
|
|
104
|
-
reset_caches
|
|
105
|
-
retained, _ = measure_retained(&op)
|
|
106
|
-
printf(" %-50s %s\n", label, fmt_bytes(retained))
|
|
107
|
-
end
|
|
108
|
-
puts
|
|
109
|
-
|
|
110
|
-
# -- Section 5: persistence overhead --
|
|
111
|
-
puts "── save/load roundtrip (N=#{CORPUS_SIZE}) ──"
|
|
112
|
-
require "tempfile"
|
|
113
|
-
reset_caches
|
|
114
|
-
corpus = Iriq::Corpus.new
|
|
115
|
-
urls.each { |u| corpus.observe(u) }
|
|
116
|
-
Tempfile.open(["iriq-mem", ".json"]) do |f|
|
|
117
|
-
corpus.save(f.path)
|
|
118
|
-
bytes = File.size(f.path)
|
|
119
|
-
puts " JSON file on disk: #{fmt_bytes(bytes)}"
|
|
120
|
-
puts " ratio: #{format("%.2f bytes/obs", bytes.to_f / CORPUS_SIZE)}"
|
|
121
|
-
end
|