iriq 0.1.0 → 0.30.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +87 -0
- data/CLAUDE.md +208 -0
- data/Gemfile.lock +8 -2
- data/Makefile +113 -0
- data/README.md +249 -270
- data/completions/_iriq +52 -0
- data/completions/iriq.bash +70 -0
- data/docs/ARCHITECTURE.md +223 -0
- data/docs/ROADMAP.md +190 -0
- data/iriq.gemspec +5 -4
- data/lib/iriq/cli.rb +402 -49
- data/lib/iriq/cluster.rb +304 -8
- data/lib/iriq/clusterer.rb +19 -44
- data/lib/iriq/corpus.rb +417 -81
- data/lib/iriq/cross_host_shape.rb +37 -0
- data/lib/iriq/event.rb +22 -0
- data/lib/iriq/evidence.rb +114 -0
- data/lib/iriq/explanation.rb +1 -1
- data/lib/iriq/normalizer.rb +71 -29
- data/lib/iriq/parser.rb +1 -1
- data/lib/iriq/path_shape.rb +30 -24
- data/lib/iriq/position.rb +75 -0
- data/lib/iriq/position_stats.rb +74 -8
- data/lib/iriq/recognizer.rb +54 -0
- data/lib/iriq/recognizer_proposal.rb +167 -0
- data/lib/iriq/recognizers/date.rb +53 -0
- data/lib/iriq/recognizers/integer.rb +37 -0
- data/lib/iriq/recognizers/uuid.rb +16 -0
- data/lib/iriq/reducer.rb +37 -0
- data/lib/iriq/registrable_domain.rb +56 -0
- data/lib/iriq/segment_classifier.rb +475 -23
- data/lib/iriq/segment_hints.rb +9 -0
- data/lib/iriq/shape.rb +106 -0
- data/lib/iriq/specificity.rb +35 -0
- data/lib/iriq/storage/json.rb +43 -0
- data/lib/iriq/storage/memory.rb +209 -0
- data/lib/iriq/storage/sqlite.rb +546 -0
- data/lib/iriq/storage.rb +35 -0
- data/lib/iriq/synthesized_recognizer.rb +56 -0
- data/lib/iriq/trace.rb +294 -0
- data/lib/iriq/version.rb +1 -1
- data/lib/iriq.rb +18 -0
- metadata +44 -8
- data/script/benchmark.rb +0 -81
- data/script/memory.rb +0 -121
data/lib/iriq/trace.rb
ADDED
|
@@ -0,0 +1,294 @@
|
|
|
1
|
+
module Iriq
  # Produces an annotated trace explaining how an identifier got
  # normalized — segment by segment, with notes for each non-obvious
  # transformation (currency upcase, IP umbrella, hint suppression,
  # canonical date, param-name lift, etc.).
  #
  #   Trace.for("https://shop.com/pricing/usd?currency=eur")
  #   # => {
  #   #      input: "...",
  #   #      normalized: "https://shop.com/pricing/USD?currency=EUR",
  #   #      scheme: "https", host: "shop.com",
  #   #      path: [...per-segment rows...],
  #   #      query: [...per-param rows...],
  #   #    }
  #
  # Each row is `{ value, type, output, notes }` for path entries and
  # `{ name, value, type, output, notes }` for query entries. The string
  # notes are rendered from structured Iriq::Evidence::Record values;
  # callers that want the structured form can use Trace.evidence_for.
  module Trace
    module_function

    HINT_NOTE_TEMPLATE = "semantic type — surfaced as {%s}, not {%s}".freeze

    # Render-ready Trace output. The public format consumers depend on.
    #
    # @param input [Identifier, Object] an Identifier, or anything
    #   Parser.parse accepts
    # @param classifier [#classify] classifier used for segments and query
    #   values; defaults to SegmentClassifier::DEFAULT
    # @param hints [Boolean] forwarded to the normalizer and to path-row
    #   rendering; query rows do not consult it
    # @return [Hash] :input, :normalized, :scheme, :host, :path, plus :port
    #   when the identifier has one and :query when a non-URN identifier has
    #   query params
    def for(input, classifier: SegmentClassifier::DEFAULT, hints: true)
      iri = coerce(input)
      normalized = Normalizer.normalize_identifier(iri, classifier: classifier, hints: hints)

      out = {
        input: iri.canonical,
        normalized: normalized,
        scheme: iri.scheme,
        host: iri.host,
      }
      out[:port] = iri.port if iri.port

      if iri.urn?
        out[:path] = urn_rows(iri, classifier, hints)
      else
        out[:path] = path_rows(iri.path_segments, classifier, hints)
        if iri.query_params && !iri.query_params.empty?
          out[:query] = query_rows(iri.query_params, classifier)
        end
      end

      out
    end

    # Structured Evidence list for `input`. Each segment + query param
    # contributes one classification Evidence plus zero or more
    # transformation Evidence records (canonical date, IP umbrella
    # collapse, param-name hint, hint suppression).
    #
    # Position + Cluster Evidence are not emitted here — they belong to
    # corpus-informed trace (Corpus#trace), which a follow-up step lands.
    #
    # @return [Array] segment records in path order, followed by query-param
    #   records in sorted key order
    def evidence_for(input, classifier: SegmentClassifier::DEFAULT, hints: true)
      iri = coerce(input)
      records = []
      segments = iri.urn? ? urn_parts(iri) : (iri.path_segments || [])
      entries = SegmentHints.derive(segments, classifier)

      entries.each_with_index do |entry, i|
        records.concat(segment_evidence(entry, segments, i, classifier, hints))
      end

      if !iri.urn? && iri.query_params && !iri.query_params.empty?
        iri.query_params.keys.sort.each do |k|
          records.concat(query_param_evidence(k, iri.query_params[k].to_s, classifier))
        end
      end

      records
    end

    # ── Evidence builders ────────────────────────────────────────────────

    # Evidence for one path/URN entry: the classification record first, then
    # (for variable entries only) either a canonicalization record or the
    # placeholder decorations.
    #
    # `entry` is read via entry[:value], entry[:type], entry[:variable] and
    # entry[:hint].
    def segment_evidence(entry, segments, idx, classifier, hints)
      records = [
        Evidence.segment(
          index: idx, value: entry[:value],
          source: :recognizer,
          payload: { type: entry[:type], variable: entry[:variable], hint: entry[:hint] },
        ),
      ]
      return records unless entry[:variable]

      # A non-nil result means the value has a canonical form (possibly
      # identical to the input); in that case no placeholder decoration is
      # emitted.
      canonical = canonicalization_evidence(idx, entry[:value], entry[:type])
      records.concat(canonical || placeholder_decoration_evidence(entry, segments, idx, classifier, hints))
    end

    # Decorations for an entry rendered as a placeholder: the IP umbrella
    # collapse and, when hints are enabled, a note about a neighbor-derived
    # hint that was not applied.
    def placeholder_decoration_evidence(entry, segments, idx, classifier, hints)
      type = entry[:type]
      out = ip_umbrella_evidence(idx, entry[:value], type)

      if hints && entry[:hint].nil? && !SegmentHints::HINT_ELIGIBLE_TYPES.include?(type)
        if (would_be = would_be_hint(segments, idx, type, classifier))
          display = SegmentClassifier.display_type(type)
          out << Evidence.segment(
            index: idx, value: entry[:value],
            source: :neighbor,
            payload: { rule: :hint_suppression, surfaced: display, would_be: would_be, semantic_type: type },
            notes: [format(HINT_NOTE_TEMPLATE, display, would_be)],
          )
        end
      end

      out
    end

    # Evidence for one query parameter. A param-name hint record (if any)
    # precedes the classification record so rendered notes keep that order.
    #
    # Variability is decided by the supplied classifier — the same object
    # that produced the base type — rather than unconditionally by
    # SegmentClassifier::DEFAULT.
    def query_param_evidence(name, value, classifier)
      records = []
      base_type = classifier.classify(value)
      effective = base_type

      if (hint = SegmentClassifier.param_name_hint(name, base_type))
        effective = hint
        records << Evidence.segment(
          index: name, value: value,
          source: :neighbor,
          payload: { rule: :param_name_hint, name: name, before: base_type, after: hint },
          notes: ["param-name hint (`#{name}=`) lifted #{base_type} → #{hint}"],
        )
      end

      records << Evidence.segment(
        index: name, value: value,
        source: :recognizer,
        payload: { type: effective, variable: variable_type?(classifier, effective) },
      )

      canonical = canonicalization_evidence(name, value, effective)
      records.concat(canonical || ip_umbrella_evidence(name, value, effective))
    end

    # Policy evidence for date/currency canonicalization.
    #
    # @return [nil] when `type` is not :date/:currency or no canonical form
    #   is available
    # @return [Array] empty when the value is already canonical, otherwise a
    #   single policy record describing the rewrite
    def canonicalization_evidence(index, value, type)
      rule, label, canon =
        case type
        when :date then [:canonical_date, "canonical date", SegmentClassifier.canonical_date(value)]
        when :currency then [:canonical_currency, "currency upcase", SegmentClassifier.canonical_currency(value)]
        end
      return nil unless canon
      return [] if canon == value

      [
        Evidence.segment(
          index: index, value: value,
          source: :policy,
          payload: { rule: rule, before: value, after: canon },
          notes: ["#{label} (#{value} → #{canon})"],
        ),
      ]
    end

    # Policy evidence for the ipv4/ipv6 → ip umbrella collapse; empty for
    # every other type.
    def ip_umbrella_evidence(index, value, type)
      return [] unless type == :ipv4 || type == :ipv6

      [
        Evidence.segment(
          index: index, value: value,
          source: :policy,
          payload: { rule: :ip_umbrella_collapse, from: type, to: :ip },
          notes: ["ip umbrella collapse (#{type} → ip)"],
        ),
      ]
    end

    # ── View rendering (Evidence → Trace.for hash) ───────────────────────

    # Rows for URL path segments; empty when there are none.
    def path_rows(segments, classifier, hints)
      return [] if segments.nil? || segments.empty?

      segment_rows(segments, classifier, hints)
    end

    # Rows for the parts of a URN's namespace-specific string.
    def urn_rows(iri, classifier, hints)
      parts = urn_parts(iri)
      return [] if parts.empty?

      segment_rows(parts, classifier, hints)
    end

    # Shared by path_rows and urn_rows: derive entries, build evidence per
    # entry, and render one row each.
    def segment_rows(parts, classifier, hints)
      SegmentHints.derive(parts, classifier).each_with_index.map do |entry, i|
        render_segment_row(entry, segment_evidence(entry, parts, i, classifier, hints), hints)
      end
    end

    # Rows for query params, in sorted key order. Values are stringified.
    def query_rows(params, classifier)
      params.keys.sort.map do |k|
        v = params[k].to_s
        render_query_row(k, v, query_param_evidence(k, v, classifier), classifier: classifier)
      end
    end

    # One path row. Output is the raw value for non-variable entries, the
    # canonical form when a canonicalization record exists, otherwise a
    # `{placeholder}` named after the hint (when hints are on and one
    # exists) or the display type.
    def render_segment_row(entry, evidence, hints)
      notes = collect_notes(evidence)
      value = entry[:value]
      type = entry[:type]

      return { value: value, type: type, output: value, notes: notes } unless entry[:variable]

      canon_policy = find_payload(evidence, :canonical_date) || find_payload(evidence, :canonical_currency)
      if canon_policy
        return { value: value, type: type, output: canon_policy[:after], notes: notes }
      end

      placeholder = hints && entry[:hint] ? entry[:hint].to_s : SegmentClassifier.display_type(type).to_s
      { value: value, type: type, output: "{#{placeholder}}", notes: notes }
    end

    # One query row. The effective type comes from the recognizer record
    # when present, otherwise from classifying the value directly.
    #
    # @param classifier [#classify] optional; defaults to
    #   SegmentClassifier::DEFAULT so existing three-argument callers keep
    #   working
    def render_query_row(name, value, evidence, classifier: SegmentClassifier::DEFAULT)
      notes = collect_notes(evidence)
      cls = find_evidence(evidence, source: :recognizer)
      effective = cls ? cls.payload[:type] : classifier.classify(value)
      canon_policy = find_payload(evidence, :canonical_date) || find_payload(evidence, :canonical_currency)

      output =
        if canon_policy
          canon_policy[:after]
        elsif variable_type?(classifier, effective)
          "{#{SegmentClassifier.display_type(effective)}}"
        else
          value
        end

      { name: name, value: value, type: effective, output: output, notes: notes }
    end

    # ── Helpers ──────────────────────────────────────────────────────────

    # Pass Identifiers through; parse everything else.
    def coerce(input)
      input.is_a?(Identifier) ? input : Parser.parse(input)
    end

    # Split a URN's nss on the first ":" only; an nss without ":" is a
    # single part, and a missing nss yields no parts.
    def urn_parts(iri)
      return [] unless iri.nss
      iri.nss.include?(":") ? iri.nss.split(":", 2) : [iri.nss]
    end

    # Flatten the notes of every record, preserving record order.
    def collect_notes(evidence)
      evidence.flat_map(&:notes)
    end

    # First record with the given source, or nil.
    def find_evidence(evidence, source:)
      evidence.find { |r| r.source == source }
    end

    # Payload of the first :policy record for `rule`, or nil.
    def find_payload(evidence, rule)
      r = evidence.find { |e| e.source == :policy && e.payload[:rule] == rule }
      r&.payload
    end

    # Whether `type` is variable according to `classifier`. Falls back to
    # SegmentClassifier::DEFAULT for classifiers that only implement
    # `classify`.
    def variable_type?(classifier, type)
      judge = classifier.respond_to?(:variable?) ? classifier : SegmentClassifier::DEFAULT
      judge.variable?(type)
    end

    # Name a neighbor-derived hint would have had: the singularized previous
    # segment plus "_uuid" (for :uuid) or "_id". Nil for the first segment
    # or when the previous segment is not classified :literal.
    def would_be_hint(segments, idx, type, classifier)
      return nil if idx.zero?

      prev = segments[idx - 1]
      return nil unless classifier.classify(prev) == :literal

      base = Inflector.singularize(prev)
      suffix = type == :uuid ? "_uuid" : "_id"
      "#{base}#{suffix}"
    end
  end
end
|
data/lib/iriq/version.rb
CHANGED
data/lib/iriq.rb
CHANGED
|
@@ -3,15 +3,33 @@ require "iriq/errors"
|
|
|
3
3
|
require "iriq/inflector"
|
|
4
4
|
require "iriq/identifier"
|
|
5
5
|
require "iriq/parser"
|
|
6
|
+
require "iriq/specificity"
|
|
7
|
+
require "iriq/recognizer"
|
|
8
|
+
require "iriq/recognizers/uuid"
|
|
9
|
+
require "iriq/recognizers/date"
|
|
10
|
+
require "iriq/recognizers/integer"
|
|
6
11
|
require "iriq/segment_classifier"
|
|
7
12
|
require "iriq/segment_hints"
|
|
13
|
+
require "iriq/shape"
|
|
8
14
|
require "iriq/path_shape"
|
|
9
15
|
require "iriq/normalizer"
|
|
10
16
|
require "iriq/explanation"
|
|
17
|
+
require "iriq/evidence"
|
|
18
|
+
require "iriq/trace"
|
|
11
19
|
require "iriq/cluster"
|
|
12
20
|
require "iriq/clusterer"
|
|
13
21
|
require "iriq/position_stats"
|
|
22
|
+
require "set"
|
|
23
|
+
|
|
14
24
|
require "iriq/observation"
|
|
25
|
+
require "iriq/position"
|
|
26
|
+
require "iriq/event"
|
|
27
|
+
require "iriq/reducer"
|
|
28
|
+
require "iriq/registrable_domain"
|
|
29
|
+
require "iriq/storage"
|
|
30
|
+
require "iriq/recognizer_proposal"
|
|
31
|
+
require "iriq/synthesized_recognizer"
|
|
32
|
+
require "iriq/cross_host_shape"
|
|
15
33
|
require "iriq/corpus"
|
|
16
34
|
require "iriq/extractor"
|
|
17
35
|
require "iriq/cli"
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: iriq
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.0
|
|
4
|
+
version: 0.30.2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Daniel Pepper
|
|
@@ -65,18 +65,37 @@ dependencies:
|
|
|
65
65
|
- - ">="
|
|
66
66
|
- !ruby/object:Gem::Version
|
|
67
67
|
version: '0.22'
|
|
68
|
-
|
|
69
|
-
|
|
68
|
+
- !ruby/object:Gem::Dependency
|
|
69
|
+
name: sqlite3
|
|
70
|
+
requirement: !ruby/object:Gem::Requirement
|
|
71
|
+
requirements:
|
|
72
|
+
- - ">="
|
|
73
|
+
- !ruby/object:Gem::Version
|
|
74
|
+
version: '1.6'
|
|
75
|
+
type: :development
|
|
76
|
+
prerelease: false
|
|
77
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
78
|
+
requirements:
|
|
79
|
+
- - ">="
|
|
80
|
+
- !ruby/object:Gem::Version
|
|
81
|
+
version: '1.6'
|
|
82
|
+
description: IRI extraction, normalization, and clustering.
|
|
70
83
|
executables:
|
|
71
84
|
- iriq
|
|
72
85
|
extensions: []
|
|
73
86
|
extra_rdoc_files: []
|
|
74
87
|
files:
|
|
75
88
|
- CHANGELOG.md
|
|
89
|
+
- CLAUDE.md
|
|
76
90
|
- Gemfile
|
|
77
91
|
- Gemfile.lock
|
|
78
92
|
- LICENSE.txt
|
|
93
|
+
- Makefile
|
|
79
94
|
- README.md
|
|
95
|
+
- completions/_iriq
|
|
96
|
+
- completions/iriq.bash
|
|
97
|
+
- docs/ARCHITECTURE.md
|
|
98
|
+
- docs/ROADMAP.md
|
|
80
99
|
- exe/iriq
|
|
81
100
|
- iriq.gemspec
|
|
82
101
|
- lib/iriq.rb
|
|
@@ -84,7 +103,10 @@ files:
|
|
|
84
103
|
- lib/iriq/cluster.rb
|
|
85
104
|
- lib/iriq/clusterer.rb
|
|
86
105
|
- lib/iriq/corpus.rb
|
|
106
|
+
- lib/iriq/cross_host_shape.rb
|
|
87
107
|
- lib/iriq/errors.rb
|
|
108
|
+
- lib/iriq/event.rb
|
|
109
|
+
- lib/iriq/evidence.rb
|
|
88
110
|
- lib/iriq/explanation.rb
|
|
89
111
|
- lib/iriq/extractor.rb
|
|
90
112
|
- lib/iriq/identifier.rb
|
|
@@ -93,12 +115,26 @@ files:
|
|
|
93
115
|
- lib/iriq/observation.rb
|
|
94
116
|
- lib/iriq/parser.rb
|
|
95
117
|
- lib/iriq/path_shape.rb
|
|
118
|
+
- lib/iriq/position.rb
|
|
96
119
|
- lib/iriq/position_stats.rb
|
|
120
|
+
- lib/iriq/recognizer.rb
|
|
121
|
+
- lib/iriq/recognizer_proposal.rb
|
|
122
|
+
- lib/iriq/recognizers/date.rb
|
|
123
|
+
- lib/iriq/recognizers/integer.rb
|
|
124
|
+
- lib/iriq/recognizers/uuid.rb
|
|
125
|
+
- lib/iriq/reducer.rb
|
|
126
|
+
- lib/iriq/registrable_domain.rb
|
|
97
127
|
- lib/iriq/segment_classifier.rb
|
|
98
128
|
- lib/iriq/segment_hints.rb
|
|
129
|
+
- lib/iriq/shape.rb
|
|
130
|
+
- lib/iriq/specificity.rb
|
|
131
|
+
- lib/iriq/storage.rb
|
|
132
|
+
- lib/iriq/storage/json.rb
|
|
133
|
+
- lib/iriq/storage/memory.rb
|
|
134
|
+
- lib/iriq/storage/sqlite.rb
|
|
135
|
+
- lib/iriq/synthesized_recognizer.rb
|
|
136
|
+
- lib/iriq/trace.rb
|
|
99
137
|
- lib/iriq/version.rb
|
|
100
|
-
- script/benchmark.rb
|
|
101
|
-
- script/memory.rb
|
|
102
138
|
homepage: https://github.com/dpep/iriq
|
|
103
139
|
licenses:
|
|
104
140
|
- MIT
|
|
@@ -110,14 +146,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
|
110
146
|
requirements:
|
|
111
147
|
- - ">="
|
|
112
148
|
- !ruby/object:Gem::Version
|
|
113
|
-
version: '3.
|
|
149
|
+
version: '3.4'
|
|
114
150
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
115
151
|
requirements:
|
|
116
152
|
- - ">="
|
|
117
153
|
- !ruby/object:Gem::Version
|
|
118
154
|
version: '0'
|
|
119
155
|
requirements: []
|
|
120
|
-
rubygems_version:
|
|
156
|
+
rubygems_version: 4.0.11
|
|
121
157
|
specification_version: 4
|
|
122
|
-
summary:
|
|
158
|
+
summary: IRI extraction, normalization, and clustering.
|
|
123
159
|
test_files: []
|
data/script/benchmark.rb
DELETED
|
@@ -1,81 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env ruby
|
|
2
|
-
# Performance benchmark for the main hot paths in Iriq.
|
|
3
|
-
#
|
|
4
|
-
# Usage:
|
|
5
|
-
# bundle exec script/benchmark.rb # default sizes
|
|
6
|
-
# bundle exec script/benchmark.rb 50000 # custom "large" size
|
|
7
|
-
#
|
|
8
|
-
# Inputs are generated deterministically from IriGenerator so results are
|
|
9
|
-
# comparable across runs.
|
|
10
|
-
|
|
11
|
-
require "benchmark"
|
|
12
|
-
require "tempfile"
|
|
13
|
-
|
|
14
|
-
$LOAD_PATH.unshift File.expand_path("../lib", __dir__)
|
|
15
|
-
$LOAD_PATH.unshift File.expand_path("../spec/support", __dir__)
|
|
16
|
-
require "iriq"
|
|
17
|
-
require "iri_generator"
|
|
18
|
-
|
|
19
|
-
# Benchmark sizes. LARGE comes from the first CLI argument (default
# 10_000); SMALL and HUGE are derived from it.
LARGE = Integer(ARGV[0] || 10_000)
abort "size must be a positive integer (got #{LARGE})" unless LARGE.positive?

# Clamp to at least 1 so `LARGE / SMALL` below cannot divide by zero when
# LARGE < 10.
SMALL = (LARGE / 10).clamp(1, 1_000)
HUGE = LARGE * 10

puts "Iriq benchmark — Ruby #{RUBY_VERSION}, Iriq #{Iriq::VERSION}"
puts "Sizes: small=#{SMALL}, large=#{LARGE}, huge=#{HUGE}"
puts

small_urls = IriGenerator.urls(count: SMALL, seed: 1)
large_urls = IriGenerator.urls(count: LARGE, seed: 1)
huge_urls = IriGenerator.urls(count: HUGE, seed: 1)

# ~ LARGE URLs embedded in prose
text_blob = small_urls.map { |u| "Some prose about #{u} here, also random words." }.join(" ") * (LARGE / SMALL)
puts "Text blob: #{text_blob.bytesize / 1024} KB (~#{LARGE} URLs embedded)"
puts

# Each report's Benchmark::Tms is kept so throughput can be derived below.
results = {}
Benchmark.bm(42) do |x|
  results[:parse] = x.report("parse #{LARGE} URLs") { large_urls.each { |u| Iriq.parse(u) } }
  results[:normalize] = x.report("normalize #{LARGE} URLs (deterministic)") { large_urls.each { |u| Iriq.normalize(u) } }
  results[:explain] = x.report("explain #{LARGE} URLs (deterministic)") { large_urls.each { |u| Iriq.explain(u) } }
  results[:extract] = x.report("extract from ~#{text_blob.bytesize / 1024} KB text") { Iriq.extract(text_blob) }

  results[:observe_small] = x.report("Corpus.observe #{SMALL} URLs") do
    c = Iriq::Corpus.new
    small_urls.each { |u| c.observe(u) }
  end
  results[:observe_large] = x.report("Corpus.observe #{LARGE} URLs") do
    c = Iriq::Corpus.new
    large_urls.each { |u| c.observe(u) }
  end
  results[:observe_huge] = x.report("Corpus.observe #{HUGE} URLs") do
    c = Iriq::Corpus.new
    huge_urls.each { |u| c.observe(u) }
  end

  results[:roundtrip] = x.report("Corpus save+load (#{LARGE} observations)") do
    c = Iriq::Corpus.new
    large_urls.each { |u| c.observe(u) }
    Tempfile.open(["iriq-bench", ".json"]) do |f|
      c.save(f.path)
      Iriq::Corpus.load(f.path)
    end
  end
end

# Rate helper: nil when no time elapsed, so a too-fast run prints "n/a"
# instead of raising FloatDomainError on Infinity#round.
per_second = ->(amount, seconds) { seconds.positive? ? amount / seconds : nil }

puts
puts "Throughput summary:"
[
  [:parse, LARGE, "URLs/s"],
  [:normalize, LARGE, "URLs/s"],
  [:explain, LARGE, "URLs/s"],
  [:observe_small, SMALL, "URLs/s"],
  [:observe_large, LARGE, "URLs/s"],
  [:observe_huge, HUGE, "URLs/s"],
].each do |key, n, unit|
  rate = per_second.call(n, results[key].real)
  printf(" %-30s %12s %s\n", key, rate ? rate.round.to_s : "n/a", unit)
end

extract_mb = text_blob.bytesize / (1024.0 * 1024.0)
extract_rate = per_second.call(extract_mb, results[:extract].real)
printf(" %-30s %12s MB/s\n", :extract, extract_rate ? extract_rate.round(2).to_s : "n/a")
|
data/script/memory.rb
DELETED
|
@@ -1,121 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env ruby
|
|
2
|
-
# Memory profile for the main code paths in Iriq.
|
|
3
|
-
#
|
|
4
|
-
# Usage:
|
|
5
|
-
# bundle exec script/memory.rb # default sizes
|
|
6
|
-
# bundle exec script/memory.rb 50000 # custom corpus size
|
|
7
|
-
#
|
|
8
|
-
# Reports retained memory per operation, cache footprints, and memory
|
|
9
|
-
# growth across corpus sizes (to verify linear scaling — no leaks).
|
|
10
|
-
|
|
11
|
-
require "objspace"
|
|
12
|
-
|
|
13
|
-
$LOAD_PATH.unshift File.expand_path("../lib", __dir__)
|
|
14
|
-
$LOAD_PATH.unshift File.expand_path("../spec/support", __dir__)
|
|
15
|
-
require "iriq"
|
|
16
|
-
require "iri_generator"
|
|
17
|
-
|
|
18
|
-
# Corpus size for the detailed sections: first CLI argument, default 10_000.
CORPUS_SIZE = Integer(ARGV[0] || 10_000)
# A size of zero would make the per-observation division in the growth
# table raise ZeroDivisionError, so reject non-positive sizes up front.
abort "corpus size must be a positive integer (got #{CORPUS_SIZE})" unless CORPUS_SIZE.positive?

# Sizes for the growth table: the fixed ladder plus CORPUS_SIZE, sorted and
# de-duplicated. Built in one expression so the constant is never mutated.
SIZES = [1_000, 10_000, 100_000, CORPUS_SIZE].uniq.sort.freeze
|
|
22
|
-
|
|
23
|
-
# Bytes → B / KB / MB string for display.
#
# The unit is chosen by magnitude, so negative deltas (memory that shrank
# between two measurements) are scaled the same way as positive ones
# instead of always being printed in raw bytes.
#
# @param n [Numeric] byte count, possibly negative
# @return [String] e.g. "512 B", "1.5 KB", "-2.00 MB"
def fmt_bytes(n)
  magnitude = n.abs
  if magnitude < 1024
    "#{n} B"
  elsif magnitude < 1024 * 1024
    format("%.1f KB", n / 1024.0)
  else
    format("%.2f MB", n / (1024.0 * 1024.0))
  end
end
|
|
33
|
-
|
|
34
|
-
# Measure how much ObjectSpace.memsize_of_all changes across a block.
# A GC pass runs before the baseline is taken and again after the block
# finishes, before the second reading.
#
# Returns a two-element array: [byte delta, the block's return value].
def measure_retained(&work)
  GC.start
  baseline = ObjectSpace.memsize_of_all
  outcome = work.call
  GC.start
  [ObjectSpace.memsize_of_all - baseline, outcome]
end
|
|
43
|
-
|
|
44
|
-
# Empty the memoization caches so every scenario starts from a clean slate.
# The classifier's @cache is cleared unconditionally; the inflector's
# @cache is cleared only when it is set.
def reset_caches
  classifier_cache = Iriq::SegmentClassifier::DEFAULT.instance_variable_get(:@cache)
  classifier_cache.clear

  inflector_cache = Iriq::Inflector.instance_variable_get(:@cache)
  inflector_cache.clear unless inflector_cache.nil?
end
|
|
49
|
-
|
|
50
|
-
puts "Iriq memory profile — Ruby #{RUBY_VERSION}, Iriq #{Iriq::VERSION}"
puts

# -- Section 1: memory growth across corpus sizes --
# One table row per size: retained bytes, retained bytes per observation,
# and the number of objects allocated while observing.
row_format = " %-12s %-14s %-14s %-10s\n"
puts "── corpus retained memory by N (verifies linear growth) ──"
print format(row_format, "N obs", "retained", "per obs", "allocs")
SIZES.each do |n|
  reset_caches
  sample = IriGenerator.urls(count: n, seed: 1)
  allocated_at_start = GC.stat(:total_allocated_objects)
  retained, = measure_retained do
    sample.each_with_object(Iriq::Corpus.new) { |u, c| c.observe(u) }
  end
  allocated = GC.stat(:total_allocated_objects) - allocated_at_start
  print format(row_format, n, fmt_bytes(retained), fmt_bytes(retained / n), allocated)
end
puts

# -- Section 2: corpus state breakdown at CORPUS_SIZE --
puts "── corpus state breakdown at N=#{CORPUS_SIZE} ──"
reset_caches
urls = IriGenerator.urls(count: CORPUS_SIZE, seed: 1)
corpus = Iriq::Corpus.new
urls.each { |u| corpus.observe(u) }
# Each figure is computed right before its line is printed.
[
  [" unique hosts: ", -> { corpus.host_counts.size }],
  [" unique fingerprints: ", -> { corpus.fingerprint_counts.size }],
  [" unique raw shapes: ", -> { corpus.raw_shape_counts.size }],
  [" clusters: ", -> { corpus.size }],
  [" position_stats entries: ", -> { corpus.position_stats.size }],
  [" total observed values: ", -> { corpus.position_stats.sum { |_, s| s.value_counts.size } }],
].each do |label, figure|
  puts "#{label}#{figure.call}"
end
puts

# -- Section 3: cache footprints --
puts "── memoization caches ──"
classifier_cache = Iriq::SegmentClassifier::DEFAULT.instance_variable_get(:@cache)
inflector_cache = Iriq::Inflector.instance_variable_get(:@cache) || {}
puts " classifier cache: #{classifier_cache.size} entries (cap #{Iriq::SegmentClassifier::CACHE_MAX})"
puts " inflector cache: #{inflector_cache.size} entries (cap #{Iriq::Inflector::CACHE_MAX})"
puts

# -- Section 4: per-operation memory cost --
puts "── retained memory per operation (N=#{CORPUS_SIZE}) ──"
urls = IriGenerator.urls(count: CORPUS_SIZE, seed: 1)
text_blob = urls.map { |u| "Some prose about #{u} here." }.join(" ")

# Label → operation, measured in insertion order with caches reset first.
operations = {
  "parse #{CORPUS_SIZE} URLs (discarded after)" => -> { urls.each { |u| Iriq.parse(u) } },
  "normalize #{CORPUS_SIZE} URLs" => -> { urls.each { |u| Iriq.normalize(u) } },
  "explain #{CORPUS_SIZE} URLs" => -> { urls.each { |u| Iriq.explain(u) } },
  "extract from #{fmt_bytes(text_blob.bytesize)} prose" => -> { Iriq.extract(text_blob) },
  "Corpus.observe #{CORPUS_SIZE} URLs" => -> { urls.each_with_object(Iriq::Corpus.new) { |u, c| c.observe(u) } },
}
operations.each_pair do |label, op|
  reset_caches
  retained, = measure_retained(&op)
  print format(" %-50s %s\n", label, fmt_bytes(retained))
end
puts

# -- Section 5: persistence overhead --
# Saves a freshly built corpus to a temp file and reports the file size.
puts "── save/load roundtrip (N=#{CORPUS_SIZE}) ──"
require "tempfile"
reset_caches
corpus = Iriq::Corpus.new
urls.each { |u| corpus.observe(u) }
Tempfile.open(["iriq-mem", ".json"]) do |file|
  corpus.save(file.path)
  size_on_disk = File.size(file.path)
  puts " JSON file on disk: #{fmt_bytes(size_on_disk)}"
  puts " ratio: #{format("%.2f bytes/obs", size_on_disk.to_f / CORPUS_SIZE)}"
end
|