iriq 0.1.0 → 0.30.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +87 -0
  3. data/CLAUDE.md +208 -0
  4. data/Gemfile.lock +8 -2
  5. data/Makefile +113 -0
  6. data/README.md +249 -270
  7. data/completions/_iriq +52 -0
  8. data/completions/iriq.bash +70 -0
  9. data/docs/ARCHITECTURE.md +223 -0
  10. data/docs/ROADMAP.md +190 -0
  11. data/iriq.gemspec +5 -4
  12. data/lib/iriq/cli.rb +402 -49
  13. data/lib/iriq/cluster.rb +304 -8
  14. data/lib/iriq/clusterer.rb +19 -44
  15. data/lib/iriq/corpus.rb +417 -81
  16. data/lib/iriq/cross_host_shape.rb +37 -0
  17. data/lib/iriq/event.rb +22 -0
  18. data/lib/iriq/evidence.rb +114 -0
  19. data/lib/iriq/explanation.rb +1 -1
  20. data/lib/iriq/normalizer.rb +71 -29
  21. data/lib/iriq/parser.rb +1 -1
  22. data/lib/iriq/path_shape.rb +30 -24
  23. data/lib/iriq/position.rb +75 -0
  24. data/lib/iriq/position_stats.rb +74 -8
  25. data/lib/iriq/recognizer.rb +54 -0
  26. data/lib/iriq/recognizer_proposal.rb +167 -0
  27. data/lib/iriq/recognizers/date.rb +53 -0
  28. data/lib/iriq/recognizers/integer.rb +37 -0
  29. data/lib/iriq/recognizers/uuid.rb +16 -0
  30. data/lib/iriq/reducer.rb +37 -0
  31. data/lib/iriq/registrable_domain.rb +56 -0
  32. data/lib/iriq/segment_classifier.rb +475 -23
  33. data/lib/iriq/segment_hints.rb +9 -0
  34. data/lib/iriq/shape.rb +106 -0
  35. data/lib/iriq/specificity.rb +35 -0
  36. data/lib/iriq/storage/json.rb +43 -0
  37. data/lib/iriq/storage/memory.rb +209 -0
  38. data/lib/iriq/storage/sqlite.rb +546 -0
  39. data/lib/iriq/storage.rb +35 -0
  40. data/lib/iriq/synthesized_recognizer.rb +56 -0
  41. data/lib/iriq/trace.rb +294 -0
  42. data/lib/iriq/version.rb +1 -1
  43. data/lib/iriq.rb +18 -0
  44. metadata +44 -8
  45. data/script/benchmark.rb +0 -81
  46. data/script/memory.rb +0 -121
data/lib/iriq/trace.rb ADDED
@@ -0,0 +1,294 @@
1
module Iriq
  # Produces an annotated trace explaining how an identifier got
  # normalized — segment by segment, with notes for each non-obvious
  # transformation (currency upcase, IP umbrella, hint suppression,
  # canonical date, param-name lift, etc.).
  #
  #   Trace.for("https://shop.com/pricing/usd?currency=eur")
  #   # => {
  #   #   input: "...",
  #   #   normalized: "https://shop.com/pricing/USD?currency=EUR",
  #   #   scheme: "https", host: "shop.com",
  #   #   path: [...per-segment rows...],
  #   #   query: [...per-param rows...],
  #   # }
  #
  # Each row is `{ value, type, output, notes }` for path entries and
  # `{ name, value, type, output, notes }` for query entries. The string
  # notes are rendered from structured Iriq::Evidence::Record values;
  # callers that want the structured form can use Trace.evidence_for.
  module Trace
    module_function

    # Note attached when a semantic type is surfaced under its display
    # name instead of the neighbor-derived hint it would otherwise get.
    HINT_NOTE_TEMPLATE = "semantic type — surfaced as {%s}, not {%s}".freeze

    # Render-ready Trace output. The public format consumers depend on.
    #
    # input      - an Identifier, or anything Parser.parse accepts.
    # classifier - classifier used for every segment and query value.
    # hints      - when false, placeholders use the display type rather
    #              than the derived hint name.
    #
    # Returns a Hash with :input, :normalized, :scheme, :host, :path, plus
    # :port when the identifier has one and :query when a non-URN
    # identifier carries query params.
    def for(input, classifier: SegmentClassifier::DEFAULT, hints: true)
      iri = coerce(input)
      normalized = Normalizer.normalize_identifier(iri, classifier: classifier, hints: hints)

      out = {
        input: iri.canonical,
        normalized: normalized,
        scheme: iri.scheme,
        host: iri.host,
      }
      out[:port] = iri.port if iri.port

      if iri.urn?
        out[:path] = urn_rows(iri, classifier, hints)
      else
        out[:path] = path_rows(iri.path_segments, classifier, hints)
        params = iri.query_params
        out[:query] = query_rows(params, classifier) if params && !params.empty?
      end

      out
    end

    # Structured Evidence list for `input`. Each segment + query param
    # contributes one classification Evidence plus zero or more
    # transformation Evidence records (canonical date, IP umbrella
    # collapse, param-name hint, hint suppression).
    #
    # Position + Cluster Evidence are not emitted here — they belong to
    # corpus-informed trace (Corpus#trace), which a follow-up step lands.
    def evidence_for(input, classifier: SegmentClassifier::DEFAULT, hints: true)
      iri = coerce(input)
      segments = iri.urn? ? urn_parts(iri) : (iri.path_segments || [])
      records = []

      SegmentHints.derive(segments, classifier).each_with_index do |entry, i|
        records.concat(segment_evidence(entry, segments, i, classifier, hints))
      end

      params = iri.urn? ? nil : iri.query_params
      if params && !params.empty?
        # Sorted so the record order does not depend on param order.
        params.keys.sort.each do |k|
          records.concat(query_param_evidence(k, params[k].to_s, classifier))
        end
      end

      records
    end

    # ── Evidence builders ────────────────────────────────────────────────

    # Evidence for one path/URN segment: always a :recognizer record, and
    # for variable segments either a canonicalization record (date /
    # currency) or the placeholder decorations (IP umbrella, hint
    # suppression).
    def segment_evidence(entry, segments, idx, classifier, hints)
      value = entry[:value]
      type = entry[:type]

      records = [
        Evidence.segment(
          index: idx, value: value,
          source: :recognizer,
          payload: { type: type, variable: entry[:variable], hint: entry[:hint] },
        ),
      ]
      return records unless entry[:variable]

      # A nil result means "this type has no canonical form for this
      # value", in which case the segment is rendered as a placeholder and
      # may carry placeholder decorations instead.
      canonical = canonicalization_evidence(idx, value, type)
      records.concat(canonical || placeholder_decoration_evidence(entry, segments, idx, classifier, hints))
    end

    # Decorations for a segment that is rendered as a placeholder: the
    # ipv4/ipv6 → ip collapse, and (when hints are on, the entry has no
    # hint and its type is not hint-eligible) a hint-suppression record
    # naming the hint the preceding literal would have produced.
    def placeholder_decoration_evidence(entry, segments, idx, classifier, hints)
      type = entry[:type]
      value = entry[:value]
      out = []

      out << ip_umbrella_evidence(idx, value, type) if type == :ipv4 || type == :ipv6

      return out unless hints && entry[:hint].nil?
      return out if SegmentHints::HINT_ELIGIBLE_TYPES.include?(type)

      would_be = would_be_hint(segments, idx, type, classifier)
      return out unless would_be

      display = SegmentClassifier.display_type(type)
      out << Evidence.segment(
        index: idx, value: value,
        source: :neighbor,
        payload: { rule: :hint_suppression, surfaced: display, would_be: would_be, semantic_type: type },
        notes: [format(HINT_NOTE_TEMPLATE, display, would_be)],
      )
      out
    end

    # Evidence for one query param. The param name may lift the base type
    # (recorded as a :neighbor record); the :recognizer record then carries
    # the effective type and whether `classifier` — the same classifier
    # used for path segments — treats it as variable.
    def query_param_evidence(name, value, classifier)
      records = []
      base_type = classifier.classify(value)
      effective = base_type

      if (hint = SegmentClassifier.param_name_hint(name, base_type))
        effective = hint
        records << Evidence.segment(
          index: name, value: value,
          source: :neighbor,
          payload: { rule: :param_name_hint, name: name, before: base_type, after: hint },
          notes: ["param-name hint (`#{name}=`) lifted #{base_type} → #{hint}"],
        )
      end

      records << Evidence.segment(
        index: name, value: value,
        source: :recognizer,
        # Ask the caller's classifier, not DEFAULT, so custom classifiers
        # are honoured for query params exactly as they are for segments.
        payload: { type: effective, variable: classifier.variable?(effective) },
      )

      canonical = canonicalization_evidence(name, value, effective)
      if canonical
        records.concat(canonical)
      elsif effective == :ipv4 || effective == :ipv6
        records << ip_umbrella_evidence(name, value, effective)
      end

      records
    end

    # Canonical-form policy evidence for :date and :currency values.
    #
    # Returns nil when `type` is neither, or when SegmentClassifier yields
    # no canonical form for `value`; [] when the value is already
    # canonical; otherwise a one-element Array with the :policy record.
    def canonicalization_evidence(index, value, type)
      rule, label, canon =
        case type
        when :date
          [:canonical_date, "canonical date", SegmentClassifier.canonical_date(value)]
        when :currency
          [:canonical_currency, "currency upcase", SegmentClassifier.canonical_currency(value)]
        end
      return nil unless canon
      return [] if canon == value

      [
        Evidence.segment(
          index: index, value: value,
          source: :policy,
          payload: { rule: rule, before: value, after: canon },
          notes: ["#{label} (#{value} → #{canon})"],
        ),
      ]
    end

    # Policy record noting that an ipv4/ipv6 value is surfaced under the
    # shared `ip` umbrella type.
    def ip_umbrella_evidence(index, value, type)
      Evidence.segment(
        index: index, value: value,
        source: :policy,
        payload: { rule: :ip_umbrella_collapse, from: type, to: :ip },
        notes: ["ip umbrella collapse (#{type} → ip)"],
      )
    end

    # ── View rendering (Evidence → Trace.for hash) ───────────────────────

    # Rows for the path segments of a hierarchical identifier.
    def path_rows(segments, classifier, hints)
      return [] if segments.nil? || segments.empty?

      segment_rows(segments, classifier, hints)
    end

    # Rows for the parts of a URN's namespace-specific string.
    def urn_rows(iri, classifier, hints)
      parts = urn_parts(iri)
      return [] if parts.empty?

      segment_rows(parts, classifier, hints)
    end

    # Shared by path_rows and urn_rows: derive entries, build evidence per
    # entry and render each into a row hash.
    def segment_rows(segments, classifier, hints)
      SegmentHints.derive(segments, classifier).each_with_index.map do |entry, i|
        render_segment_row(entry, segment_evidence(entry, segments, i, classifier, hints), hints)
      end
    end

    # Rows for query params, ordered by param name.
    def query_rows(params, classifier)
      params.keys.sort.map do |k|
        v = params[k].to_s
        render_query_row(k, v, query_param_evidence(k, v, classifier))
      end
    end

    # Render one segment row. Non-variable segments pass through; variable
    # ones show their canonical form when a canonicalization record
    # exists, otherwise a `{placeholder}`.
    def render_segment_row(entry, evidence, hints)
      notes = collect_notes(evidence)
      value = entry[:value]
      type = entry[:type]

      return { value: value, type: type, output: value, notes: notes } unless entry[:variable]

      canon_policy = find_payload(evidence, :canonical_date) || find_payload(evidence, :canonical_currency)
      if canon_policy
        return { value: value, type: type, output: canon_policy[:after], notes: notes }
      end

      placeholder =
        if hints && entry[:hint]
          entry[:hint].to_s
        else
          SegmentClassifier.display_type(type).to_s
        end
      { value: value, type: type, output: "{#{placeholder}}", notes: notes }
    end

    # Render one query row from its evidence. Type and variability come
    # from the :recognizer record (so they reflect the classifier that
    # produced the evidence); DEFAULT is consulted only when the evidence
    # carries no such record or flag.
    def render_query_row(name, value, evidence)
      notes = collect_notes(evidence)
      cls = find_evidence(evidence, source: :recognizer)
      effective = cls ? cls.payload[:type] : SegmentClassifier::DEFAULT.classify(value)
      variable =
        if cls && cls.payload.key?(:variable)
          cls.payload[:variable]
        else
          SegmentClassifier::DEFAULT.variable?(effective)
        end
      canon_policy = find_payload(evidence, :canonical_date) || find_payload(evidence, :canonical_currency)

      output =
        if canon_policy
          canon_policy[:after]
        elsif variable
          "{#{SegmentClassifier.display_type(effective)}}"
        else
          value
        end

      { name: name, value: value, type: effective, output: output, notes: notes }
    end

    # ── Helpers ──────────────────────────────────────────────────────────

    # Returns `input` unchanged when it is already an Identifier, otherwise
    # the result of Parser.parse(input).
    def coerce(input)
      input.is_a?(Identifier) ? input : Parser.parse(input)
    end

    # Splits a URN's nss at the first ":" into at most two parts.
    # Returns [] when the identifier has no nss.
    def urn_parts(iri)
      return [] unless iri.nss
      iri.nss.include?(":") ? iri.nss.split(":", 2) : [iri.nss]
    end

    # All notes from the given evidence records, flattened in order.
    def collect_notes(evidence)
      evidence.flat_map(&:notes)
    end

    # First evidence record with the given source, or nil.
    def find_evidence(evidence, source:)
      evidence.find { |r| r.source == source }
    end

    # Payload of the first :policy record carrying `rule`, or nil.
    def find_payload(evidence, rule)
      r = evidence.find { |e| e.source == :policy && e.payload[:rule] == rule }
      r&.payload
    end

    # Hint name the segment at `idx` would have received from its left
    # neighbor: the singularized previous segment plus "_uuid" for :uuid
    # types and "_id" otherwise. Returns nil for the first segment or when
    # the previous segment does not classify as :literal.
    def would_be_hint(segments, idx, type, classifier)
      return nil if idx.zero?

      prev = segments[idx - 1]
      return nil unless classifier.classify(prev) == :literal

      base = Inflector.singularize(prev)
      suffix = type == :uuid ? "_uuid" : "_id"
      "#{base}#{suffix}"
    end
  end
end
data/lib/iriq/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Iriq
2
- VERSION = "0.1.0"
2
+ VERSION = "0.30.2"
3
3
  end
data/lib/iriq.rb CHANGED
@@ -3,15 +3,33 @@ require "iriq/errors"
3
3
  require "iriq/inflector"
4
4
  require "iriq/identifier"
5
5
  require "iriq/parser"
6
+ require "iriq/specificity"
7
+ require "iriq/recognizer"
8
+ require "iriq/recognizers/uuid"
9
+ require "iriq/recognizers/date"
10
+ require "iriq/recognizers/integer"
6
11
  require "iriq/segment_classifier"
7
12
  require "iriq/segment_hints"
13
+ require "iriq/shape"
8
14
  require "iriq/path_shape"
9
15
  require "iriq/normalizer"
10
16
  require "iriq/explanation"
17
+ require "iriq/evidence"
18
+ require "iriq/trace"
11
19
  require "iriq/cluster"
12
20
  require "iriq/clusterer"
13
21
  require "iriq/position_stats"
22
+ require "set"
23
+
14
24
  require "iriq/observation"
25
+ require "iriq/position"
26
+ require "iriq/event"
27
+ require "iriq/reducer"
28
+ require "iriq/registrable_domain"
29
+ require "iriq/storage"
30
+ require "iriq/recognizer_proposal"
31
+ require "iriq/synthesized_recognizer"
32
+ require "iriq/cross_host_shape"
15
33
  require "iriq/corpus"
16
34
  require "iriq/extractor"
17
35
  require "iriq/cli"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: iriq
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.30.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Daniel Pepper
@@ -65,18 +65,37 @@ dependencies:
65
65
  - - ">="
66
66
  - !ruby/object:Gem::Version
67
67
  version: '0.22'
68
- description: Semantic IRI/URI/URL/URN parsing, normalization, classification, and
69
- clustering.
68
+ - !ruby/object:Gem::Dependency
69
+ name: sqlite3
70
+ requirement: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - ">="
73
+ - !ruby/object:Gem::Version
74
+ version: '1.6'
75
+ type: :development
76
+ prerelease: false
77
+ version_requirements: !ruby/object:Gem::Requirement
78
+ requirements:
79
+ - - ">="
80
+ - !ruby/object:Gem::Version
81
+ version: '1.6'
82
+ description: IRI extraction, normalization, and clustering.
70
83
  executables:
71
84
  - iriq
72
85
  extensions: []
73
86
  extra_rdoc_files: []
74
87
  files:
75
88
  - CHANGELOG.md
89
+ - CLAUDE.md
76
90
  - Gemfile
77
91
  - Gemfile.lock
78
92
  - LICENSE.txt
93
+ - Makefile
79
94
  - README.md
95
+ - completions/_iriq
96
+ - completions/iriq.bash
97
+ - docs/ARCHITECTURE.md
98
+ - docs/ROADMAP.md
80
99
  - exe/iriq
81
100
  - iriq.gemspec
82
101
  - lib/iriq.rb
@@ -84,7 +103,10 @@ files:
84
103
  - lib/iriq/cluster.rb
85
104
  - lib/iriq/clusterer.rb
86
105
  - lib/iriq/corpus.rb
106
+ - lib/iriq/cross_host_shape.rb
87
107
  - lib/iriq/errors.rb
108
+ - lib/iriq/event.rb
109
+ - lib/iriq/evidence.rb
88
110
  - lib/iriq/explanation.rb
89
111
  - lib/iriq/extractor.rb
90
112
  - lib/iriq/identifier.rb
@@ -93,12 +115,26 @@ files:
93
115
  - lib/iriq/observation.rb
94
116
  - lib/iriq/parser.rb
95
117
  - lib/iriq/path_shape.rb
118
+ - lib/iriq/position.rb
96
119
  - lib/iriq/position_stats.rb
120
+ - lib/iriq/recognizer.rb
121
+ - lib/iriq/recognizer_proposal.rb
122
+ - lib/iriq/recognizers/date.rb
123
+ - lib/iriq/recognizers/integer.rb
124
+ - lib/iriq/recognizers/uuid.rb
125
+ - lib/iriq/reducer.rb
126
+ - lib/iriq/registrable_domain.rb
97
127
  - lib/iriq/segment_classifier.rb
98
128
  - lib/iriq/segment_hints.rb
129
+ - lib/iriq/shape.rb
130
+ - lib/iriq/specificity.rb
131
+ - lib/iriq/storage.rb
132
+ - lib/iriq/storage/json.rb
133
+ - lib/iriq/storage/memory.rb
134
+ - lib/iriq/storage/sqlite.rb
135
+ - lib/iriq/synthesized_recognizer.rb
136
+ - lib/iriq/trace.rb
99
137
  - lib/iriq/version.rb
100
- - script/benchmark.rb
101
- - script/memory.rb
102
138
  homepage: https://github.com/dpep/iriq
103
139
  licenses:
104
140
  - MIT
@@ -110,14 +146,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
110
146
  requirements:
111
147
  - - ">="
112
148
  - !ruby/object:Gem::Version
113
- version: '3.2'
149
+ version: '3.4'
114
150
  required_rubygems_version: !ruby/object:Gem::Requirement
115
151
  requirements:
116
152
  - - ">="
117
153
  - !ruby/object:Gem::Version
118
154
  version: '0'
119
155
  requirements: []
120
- rubygems_version: 3.6.9
156
+ rubygems_version: 4.0.11
121
157
  specification_version: 4
122
- summary: Semantic IRI normalization and clustering.
158
+ summary: IRI extraction, normalization, and clustering.
123
159
  test_files: []
data/script/benchmark.rb DELETED
@@ -1,81 +0,0 @@
1
- #!/usr/bin/env ruby
2
- # Performance benchmark for the main hot paths in Iriq.
3
- #
4
- # Usage:
5
- # bundle exec script/benchmark.rb # default sizes
6
- # bundle exec script/benchmark.rb 50000 # custom "large" size
7
- #
8
- # Inputs are generated deterministically from IriGenerator so results are
9
- # comparable across runs.
10
-
11
- require "benchmark"
12
- require "tempfile"
13
-
14
- $LOAD_PATH.unshift File.expand_path("../lib", __dir__)
15
- $LOAD_PATH.unshift File.expand_path("../spec/support", __dir__)
16
- require "iriq"
17
- require "iri_generator"
18
-
19
- LARGE = Integer(ARGV[0] || 10_000)
20
- SMALL = [LARGE / 10, 1_000].min
21
- HUGE = LARGE * 10
22
-
23
- puts "Iriq benchmark — Ruby #{RUBY_VERSION}, Iriq #{Iriq::VERSION}"
24
- puts "Sizes: small=#{SMALL}, large=#{LARGE}, huge=#{HUGE}"
25
- puts
26
-
27
- small_urls = IriGenerator.urls(count: SMALL, seed: 1)
28
- large_urls = IriGenerator.urls(count: LARGE, seed: 1)
29
- huge_urls = IriGenerator.urls(count: HUGE, seed: 1)
30
-
31
- # ~ LARGE URLs embedded in prose
32
- text_blob = small_urls.map { |u| "Some prose about #{u} here, also random words." }.join(" ") * (LARGE / SMALL)
33
- puts "Text blob: #{text_blob.bytesize / 1024} KB (~#{LARGE} URLs embedded)"
34
- puts
35
-
36
- results = {}
37
- Benchmark.bm(42) do |x|
38
- results[:parse] = x.report("parse #{LARGE} URLs") { large_urls.each { |u| Iriq.parse(u) } }
39
- results[:normalize] = x.report("normalize #{LARGE} URLs (deterministic)") { large_urls.each { |u| Iriq.normalize(u) } }
40
- results[:explain] = x.report("explain #{LARGE} URLs (deterministic)") { large_urls.each { |u| Iriq.explain(u) } }
41
- results[:extract] = x.report("extract from ~#{text_blob.bytesize / 1024} KB text") { Iriq.extract(text_blob) }
42
-
43
- results[:observe_small] = x.report("Corpus.observe #{SMALL} URLs") do
44
- c = Iriq::Corpus.new
45
- small_urls.each { |u| c.observe(u) }
46
- end
47
- results[:observe_large] = x.report("Corpus.observe #{LARGE} URLs") do
48
- c = Iriq::Corpus.new
49
- large_urls.each { |u| c.observe(u) }
50
- end
51
- results[:observe_huge] = x.report("Corpus.observe #{HUGE} URLs") do
52
- c = Iriq::Corpus.new
53
- huge_urls.each { |u| c.observe(u) }
54
- end
55
-
56
- results[:roundtrip] = x.report("Corpus save+load (#{LARGE} observations)") do
57
- c = Iriq::Corpus.new
58
- large_urls.each { |u| c.observe(u) }
59
- Tempfile.open(["iriq-bench", ".json"]) do |f|
60
- c.save(f.path)
61
- Iriq::Corpus.load(f.path)
62
- end
63
- end
64
- end
65
-
66
- puts
67
- puts "Throughput summary:"
68
- [
69
- [:parse, LARGE, "URLs/s"],
70
- [:normalize, LARGE, "URLs/s"],
71
- [:explain, LARGE, "URLs/s"],
72
- [:observe_small, SMALL, "URLs/s"],
73
- [:observe_large, LARGE, "URLs/s"],
74
- [:observe_huge, HUGE, "URLs/s"],
75
- ].each do |key, n, unit|
76
- per_sec = n / results[key].real
77
- printf(" %-30s %12s %s\n", key, per_sec.round.to_s, unit)
78
- end
79
-
80
- extract_mb = text_blob.bytesize / (1024.0 * 1024.0)
81
- printf(" %-30s %12s MB/s\n", :extract, (extract_mb / results[:extract].real).round(2).to_s)
data/script/memory.rb DELETED
@@ -1,121 +0,0 @@
1
- #!/usr/bin/env ruby
2
- # Memory profile for the main code paths in Iriq.
3
- #
4
- # Usage:
5
- # bundle exec script/memory.rb # default sizes
6
- # bundle exec script/memory.rb 50000 # custom corpus size
7
- #
8
- # Reports retained memory per operation, cache footprints, and memory
9
- # growth across corpus sizes (to verify linear scaling — no leaks).
10
-
11
- require "objspace"
12
-
13
- $LOAD_PATH.unshift File.expand_path("../lib", __dir__)
14
- $LOAD_PATH.unshift File.expand_path("../spec/support", __dir__)
15
- require "iriq"
16
- require "iri_generator"
17
-
18
- CORPUS_SIZE = Integer(ARGV[0] || 10_000)
19
- SIZES = [1_000, 10_000, 100_000].uniq.sort
20
- SIZES << CORPUS_SIZE unless SIZES.include?(CORPUS_SIZE)
21
- SIZES.sort!
22
-
23
- # Bytes → KB / MB string for display.
24
- def fmt_bytes(n)
25
- if n < 1024
26
- "#{n} B"
27
- elsif n < 1024 * 1024
28
- format("%.1f KB", n / 1024.0)
29
- else
30
- format("%.2f MB", n / (1024.0 * 1024.0))
31
- end
32
- end
33
-
34
- # Run a block in isolation: GC before + after, return delta in bytes.
35
- def measure_retained(&block)
36
- GC.start
37
- before = ObjectSpace.memsize_of_all
38
- result = block.call
39
- GC.start
40
- after = ObjectSpace.memsize_of_all
41
- [after - before, result]
42
- end
43
-
44
- # Reset caches so each scenario starts clean.
45
- def reset_caches
46
- Iriq::SegmentClassifier::DEFAULT.instance_variable_get(:@cache).clear
47
- Iriq::Inflector.instance_variable_get(:@cache)&.clear
48
- end
49
-
50
- puts "Iriq memory profile — Ruby #{RUBY_VERSION}, Iriq #{Iriq::VERSION}"
51
- puts
52
-
53
- # -- Section 1: memory growth across corpus sizes --
54
- puts "── corpus retained memory by N (verifies linear growth) ──"
55
- printf(" %-12s %-14s %-14s %-10s\n", "N obs", "retained", "per obs", "allocs")
56
- SIZES.each do |n|
57
- reset_caches
58
- urls = IriGenerator.urls(count: n, seed: 1)
59
- alloc_before = GC.stat(:total_allocated_objects)
60
- retained, _ = measure_retained do
61
- c = Iriq::Corpus.new
62
- urls.each { |u| c.observe(u) }
63
- c
64
- end
65
- alloc_total = GC.stat(:total_allocated_objects) - alloc_before
66
- printf(" %-12s %-14s %-14s %-10s\n", n, fmt_bytes(retained), fmt_bytes(retained / n), alloc_total)
67
- end
68
- puts
69
-
70
- # -- Section 2: corpus state breakdown at CORPUS_SIZE --
71
- puts "── corpus state breakdown at N=#{CORPUS_SIZE} ──"
72
- reset_caches
73
- urls = IriGenerator.urls(count: CORPUS_SIZE, seed: 1)
74
- corpus = Iriq::Corpus.new
75
- urls.each { |u| corpus.observe(u) }
76
- puts " unique hosts: #{corpus.host_counts.size}"
77
- puts " unique fingerprints: #{corpus.fingerprint_counts.size}"
78
- puts " unique raw shapes: #{corpus.raw_shape_counts.size}"
79
- puts " clusters: #{corpus.size}"
80
- puts " position_stats entries: #{corpus.position_stats.size}"
81
- puts " total observed values: #{corpus.position_stats.sum { |_, s| s.value_counts.size }}"
82
- puts
83
-
84
- # -- Section 3: cache footprints --
85
- puts "── memoization caches ──"
86
- classifier_cache = Iriq::SegmentClassifier::DEFAULT.instance_variable_get(:@cache)
87
- inflector_cache = Iriq::Inflector.instance_variable_get(:@cache) || {}
88
- puts " classifier cache: #{classifier_cache.size} entries (cap #{Iriq::SegmentClassifier::CACHE_MAX})"
89
- puts " inflector cache: #{inflector_cache.size} entries (cap #{Iriq::Inflector::CACHE_MAX})"
90
- puts
91
-
92
- # -- Section 4: per-operation memory cost --
93
- puts "── retained memory per operation (N=#{CORPUS_SIZE}) ──"
94
- urls = IriGenerator.urls(count: CORPUS_SIZE, seed: 1)
95
- text_blob = urls.map { |u| "Some prose about #{u} here." }.join(" ")
96
-
97
- [
98
- ["parse #{CORPUS_SIZE} URLs (discarded after)", ->{ urls.each { |u| Iriq.parse(u) } }],
99
- ["normalize #{CORPUS_SIZE} URLs", ->{ urls.each { |u| Iriq.normalize(u) } }],
100
- ["explain #{CORPUS_SIZE} URLs", ->{ urls.each { |u| Iriq.explain(u) } }],
101
- ["extract from #{fmt_bytes(text_blob.bytesize)} prose", ->{ Iriq.extract(text_blob) }],
102
- ["Corpus.observe #{CORPUS_SIZE} URLs", ->{ c = Iriq::Corpus.new; urls.each { |u| c.observe(u) }; c }],
103
- ].each do |label, op|
104
- reset_caches
105
- retained, _ = measure_retained(&op)
106
- printf(" %-50s %s\n", label, fmt_bytes(retained))
107
- end
108
- puts
109
-
110
- # -- Section 5: persistence overhead --
111
- puts "── save/load roundtrip (N=#{CORPUS_SIZE}) ──"
112
- require "tempfile"
113
- reset_caches
114
- corpus = Iriq::Corpus.new
115
- urls.each { |u| corpus.observe(u) }
116
- Tempfile.open(["iriq-mem", ".json"]) do |f|
117
- corpus.save(f.path)
118
- bytes = File.size(f.path)
119
- puts " JSON file on disk: #{fmt_bytes(bytes)}"
120
- puts " ratio: #{format("%.2f bytes/obs", bytes.to_f / CORPUS_SIZE)}"
121
- end