iriq 0.2.0 → 0.30.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +78 -0
- data/CLAUDE.md +128 -41
- data/Gemfile.lock +4 -4
- data/Makefile +80 -23
- data/README.md +225 -347
- data/completions/_iriq +52 -0
- data/completions/iriq.bash +70 -0
- data/docs/ARCHITECTURE.md +223 -0
- data/docs/ROADMAP.md +190 -0
- data/iriq.gemspec +2 -2
- data/lib/iriq/cli.rb +398 -46
- data/lib/iriq/cluster.rb +284 -12
- data/lib/iriq/corpus.rb +318 -36
- data/lib/iriq/cross_host_shape.rb +37 -0
- data/lib/iriq/event.rb +22 -0
- data/lib/iriq/evidence.rb +114 -0
- data/lib/iriq/explanation.rb +1 -1
- data/lib/iriq/normalizer.rb +71 -29
- data/lib/iriq/path_shape.rb +30 -24
- data/lib/iriq/position.rb +75 -0
- data/lib/iriq/position_stats.rb +74 -8
- data/lib/iriq/recognizer.rb +54 -0
- data/lib/iriq/recognizer_proposal.rb +167 -0
- data/lib/iriq/recognizers/date.rb +53 -0
- data/lib/iriq/recognizers/integer.rb +37 -0
- data/lib/iriq/recognizers/uuid.rb +16 -0
- data/lib/iriq/reducer.rb +37 -0
- data/lib/iriq/registrable_domain.rb +56 -0
- data/lib/iriq/segment_classifier.rb +475 -23
- data/lib/iriq/segment_hints.rb +9 -0
- data/lib/iriq/shape.rb +106 -0
- data/lib/iriq/specificity.rb +35 -0
- data/lib/iriq/storage/memory.rb +83 -12
- data/lib/iriq/storage/sqlite.rb +216 -37
- data/lib/iriq/synthesized_recognizer.rb +56 -0
- data/lib/iriq/trace.rb +294 -0
- data/lib/iriq/version.rb +1 -1
- data/lib/iriq.rb +17 -0
- metadata +22 -3
data/lib/iriq/trace.rb
ADDED
|
@@ -0,0 +1,294 @@
|
|
|
1
|
+
module Iriq
|
|
2
|
+
# Produces an annotated trace explaining how an identifier got
|
|
3
|
+
# normalized — segment by segment, with notes for each non-obvious
|
|
4
|
+
# transformation (currency upcase, IP umbrella, hint suppression,
|
|
5
|
+
# canonical date, param-name lift, etc.).
|
|
6
|
+
#
|
|
7
|
+
# Trace.for("https://shop.com/pricing/usd?currency=eur")
|
|
8
|
+
# # => {
|
|
9
|
+
# # input: "...",
|
|
10
|
+
# # normalized: "https://shop.com/pricing/USD?currency=EUR",
|
|
11
|
+
# # scheme: "https", host: "shop.com",
|
|
12
|
+
# # path: [...per-segment rows...],
|
|
13
|
+
# # query: [...per-param rows...],
|
|
14
|
+
# # }
|
|
15
|
+
#
|
|
16
|
+
# Each row is `{ value, type, output, notes }` for path entries and
|
|
17
|
+
# `{ name, value, type, output, notes }` for query entries. The string
|
|
18
|
+
# notes are rendered from structured Iriq::Evidence::Record values;
|
|
19
|
+
# callers that want the structured form can use Trace.evidence_for.
|
|
20
|
+
module Trace
|
|
21
|
+
module_function
|
|
22
|
+
|
|
23
|
+
HINT_NOTE_TEMPLATE = "semantic type — surfaced as {%s}, not {%s}".freeze
|
|
24
|
+
|
|
25
|
+
# Render-ready Trace output: the public hash format consumers depend on.
#
# @param input an Identifier (used as-is) or anything Parser.parse accepts
# @param classifier forwarded to the normalizer and to the row builders
# @param hints forwarded to the normalizer and to the path/URN row
#   builders (query rows do not receive it)
# @return [Hash] :input, :normalized, :scheme and :host always; :port only
#   when the identifier has one; :path always; :query only for non-URN
#   identifiers that carry at least one query param
def for(input, classifier: SegmentClassifier::DEFAULT, hints: true)
  identifier = coerce(input)
  normalized = Normalizer.normalize_identifier(identifier, classifier: classifier, hints: hints)

  trace = {
    input: identifier.canonical,
    normalized: normalized,
    scheme: identifier.scheme,
    host: identifier.host,
  }

  port = identifier.port
  trace[:port] = port if port

  # URNs have no query section; their path rows come from the NSS parts.
  if identifier.urn?
    trace[:path] = urn_rows(identifier, classifier, hints)
    return trace
  end

  trace[:path] = path_rows(identifier.path_segments, classifier, hints)

  params = identifier.query_params
  trace[:query] = query_rows(params, classifier) if params && !params.empty?

  trace
end
|
|
49
|
+
|
|
50
|
+
# Structured Evidence list for `input`. Every path segment (or URN part)
# and every query param contributes one classification Evidence plus zero
# or more transformation Evidence records (canonical date, IP umbrella
# collapse, param-name hint, hint suppression).
#
# Position + Cluster Evidence are not emitted here — they belong to the
# corpus-informed trace (Corpus#trace), which a follow-up step lands.
#
# @param input an Identifier (used as-is) or anything Parser.parse accepts
# @param classifier forwarded to SegmentHints.derive and the evidence builders
# @param hints forwarded to the per-segment evidence builder
# @return [Array] segment records in segment order, followed by query-param
#   records in sorted key order (query params are skipped for URNs)
def evidence_for(input, classifier: SegmentClassifier::DEFAULT, hints: true)
  identifier = coerce(input)
  is_urn = identifier.urn?
  segments = is_urn ? urn_parts(identifier) : (identifier.path_segments || [])

  records = SegmentHints.derive(segments, classifier).each_with_index.flat_map do |entry, index|
    segment_evidence(entry, segments, index, classifier, hints)
  end

  params = is_urn ? nil : identifier.query_params
  if params && !params.empty?
    params.keys.sort.each do |key|
      records.concat(query_param_evidence(key, params[key].to_s, classifier))
    end
  end

  records
end
|
|
75
|
+
|
|
76
|
+
# ── Evidence builders ────────────────────────────────────────────────
|
|
77
|
+
|
|
78
|
+
# Evidence records for one path segment / URN part.
#
# Always returns the :recognizer classification record first. For variable
# entries it then adds either a :policy record describing a canonical
# rewrite (date or currency, only when the canonical form differs from the
# raw value) or, when no canonical form applies, whatever
# placeholder_decoration_evidence returns.
#
# @param entry [Hash] reads :value, :type, :variable and :hint
# @param segments the full segment list (forwarded for neighbor lookups)
# @param idx position of `entry` within `segments`
# @param classifier forwarded to placeholder_decoration_evidence
# @param hints forwarded to placeholder_decoration_evidence
# @return [Array] Evidence records
def segment_evidence(entry, segments, idx, classifier, hints)
  raw = entry[:value]
  kind = entry[:type]

  classification = Evidence.segment(
    index: idx, value: raw,
    source: :recognizer,
    payload: { type: kind, variable: entry[:variable], hint: entry[:hint] },
  )
  return [classification] unless entry[:variable]

  canonical =
    case kind
    when :date then SegmentClassifier.canonical_date(raw)
    when :currency then SegmentClassifier.canonical_currency(raw)
    end

  # No canonical form available: fall back to placeholder decorations.
  unless canonical
    return [classification] + placeholder_decoration_evidence(entry, segments, idx, classifier, hints)
  end

  # Already canonical: nothing was rewritten, so there is nothing to report.
  return [classification] if canonical == raw

  policy =
    if kind == :date
      Evidence.segment(
        index: idx, value: raw,
        source: :policy,
        payload: { rule: :canonical_date, before: raw, after: canonical },
        notes: ["canonical date (#{raw} → #{canonical})"],
      )
    else
      Evidence.segment(
        index: idx, value: raw,
        source: :policy,
        payload: { rule: :canonical_currency, before: raw, after: canonical },
        notes: ["currency upcase (#{raw} → #{canonical})"],
      )
    end

  [classification, policy]
end
|
|
114
|
+
|
|
115
|
+
# Extra Evidence for a variable segment that is rendered as a placeholder.
#
# Emits up to two records:
# * a :policy record when the type is :ipv4 or :ipv6 (collapsed to :ip);
# * a :neighbor record when hints are enabled, the entry carries no hint,
#   its type is not in SegmentHints::HINT_ELIGIBLE_TYPES, and would_be_hint
#   can still derive a name from the preceding segment.
#
# @param entry [Hash] reads :type, :value and :hint
# @param segments the full segment list
# @param idx position of `entry` within `segments`
# @param classifier forwarded to would_be_hint
# @param hints whether hint-related evidence should be considered at all
# @return [Array] zero, one or two Evidence records
def placeholder_decoration_evidence(entry, segments, idx, classifier, hints)
  kind = entry[:type]
  decorations = []

  if %i[ipv4 ipv6].include?(kind)
    decorations << Evidence.segment(
      index: idx, value: entry[:value],
      source: :policy,
      payload: { rule: :ip_umbrella_collapse, from: kind, to: :ip },
      notes: ["ip umbrella collapse (#{kind} → ip)"],
    )
  end

  suppressible = hints && entry[:hint].nil? && !SegmentHints::HINT_ELIGIBLE_TYPES.include?(kind)
  return decorations unless suppressible

  candidate = would_be_hint(segments, idx, kind, classifier)
  return decorations unless candidate

  shown = SegmentClassifier.display_type(kind)
  decorations << Evidence.segment(
    index: idx, value: entry[:value],
    source: :neighbor,
    payload: { rule: :hint_suppression, surfaced: shown, would_be: candidate, semantic_type: kind },
    notes: [format(HINT_NOTE_TEMPLATE, shown, candidate)],
  )

  decorations
end
|
|
142
|
+
|
|
143
|
+
# Evidence records for one query parameter.
#
# Record order: an optional :neighbor record (param-name hint lifted the
# type), then the :recognizer classification record, then at most one
# :policy record (canonical date, canonical currency, or IP umbrella
# collapse). Records are indexed by the param name rather than a position.
#
# @param name the query param key
# @param value [String] the query param value
# @param classifier object responding to #classify
# @return [Array] Evidence records
def query_param_evidence(name, value, classifier)
  detected = classifier.classify(value)
  lifted = SegmentClassifier.param_name_hint(name, detected)
  effective = lifted || detected

  records = []

  if lifted
    records << Evidence.segment(
      index: name, value: value,
      source: :neighbor,
      payload: { rule: :param_name_hint, name: name, before: detected, after: lifted },
      notes: ["param-name hint (`#{name}=`) lifted #{detected} → #{lifted}"],
    )
  end

  # NOTE(review): variability is asked of SegmentClassifier::DEFAULT, not of
  # the `classifier` argument — confirm this is intended for custom classifiers.
  records << Evidence.segment(
    index: name, value: value,
    source: :recognizer,
    payload: { type: effective, variable: SegmentClassifier::DEFAULT.variable?(effective) },
  )

  case effective
  when :date
    canonical = SegmentClassifier.canonical_date(value)
    if canonical && canonical != value
      records << Evidence.segment(
        index: name, value: value,
        source: :policy,
        payload: { rule: :canonical_date, before: value, after: canonical },
        notes: ["canonical date (#{value} → #{canonical})"],
      )
    end
  when :currency
    canonical = SegmentClassifier.canonical_currency(value)
    if canonical && canonical != value
      records << Evidence.segment(
        index: name, value: value,
        source: :policy,
        payload: { rule: :canonical_currency, before: value, after: canonical },
        notes: ["currency upcase (#{value} → #{canonical})"],
      )
    end
  when :ipv4, :ipv6
    records << Evidence.segment(
      index: name, value: value,
      source: :policy,
      payload: { rule: :ip_umbrella_collapse, from: effective, to: :ip },
      notes: ["ip umbrella collapse (#{effective} → ip)"],
    )
  end

  records
end
|
|
193
|
+
|
|
194
|
+
# ── View rendering (Evidence → Trace.for hash) ───────────────────────
|
|
195
|
+
|
|
196
|
+
# Renders one trace row per path segment.
#
# @param segments [Array, nil] raw path segments
# @param classifier forwarded to SegmentHints.derive and segment_evidence
# @param hints forwarded to segment_evidence and render_segment_row
# @return [Array<Hash>] rows in segment order; [] for nil or empty input
def path_rows(segments, classifier, hints)
  if segments.nil? || segments.empty?
    []
  else
    SegmentHints.derive(segments, classifier).map.with_index do |entry, position|
      records = segment_evidence(entry, segments, position, classifier, hints)
      render_segment_row(entry, records, hints)
    end
  end
end
|
|
205
|
+
|
|
206
|
+
# Renders one trace row per URN part (see urn_parts).
#
# The parts are rendered exactly like path segments, so this hands them to
# path_rows, which also returns [] when there are no parts.
#
# @param iri object responding to #nss
# @param classifier forwarded to path_rows
# @param hints forwarded to path_rows
# @return [Array<Hash>] rows in part order
def urn_rows(iri, classifier, hints)
  path_rows(urn_parts(iri), classifier, hints)
end
|
|
216
|
+
|
|
217
|
+
# Renders one trace row per query parameter, in sorted key order.
#
# @param params [Hash] query params; each value is stringified with #to_s
# @param classifier forwarded to query_param_evidence
# @return [Array<Hash>] rows as produced by render_query_row
def query_rows(params, classifier)
  params.keys.sort.map do |key|
    raw = params[key].to_s
    render_query_row(key, raw, query_param_evidence(key, raw, classifier))
  end
end
|
|
224
|
+
|
|
225
|
+
# Turns a segment entry plus its Evidence into a `{ value, type, output,
# notes }` row.
#
# Output is chosen as follows:
# * non-variable entry            → the raw value;
# * canonical date/currency found → that policy record's :after value;
# * otherwise                     → "{placeholder}", where the placeholder is
#   the entry's hint (when hints are on and a hint exists) or the display
#   type of the entry's type.
#
# @param entry [Hash] reads :value, :type, :variable and :hint
# @param evidence [Array] Evidence records for this segment
# @param hints whether the entry's hint may be used as the placeholder
# @return [Hash]
def render_segment_row(entry, evidence, hints)
  row = {
    value: entry[:value],
    type: entry[:type],
    output: entry[:value],
    notes: collect_notes(evidence),
  }
  return row unless entry[:variable]

  canonical = find_payload(evidence, :canonical_date) || find_payload(evidence, :canonical_currency)
  return row.merge(output: canonical[:after]) if canonical

  label =
    if hints && entry[:hint]
      entry[:hint].to_s
    else
      SegmentClassifier.display_type(entry[:type]).to_s
    end

  row.merge(output: "{#{label}}")
end
|
|
240
|
+
|
|
241
|
+
# Turns a query param plus its Evidence into a `{ name, value, type,
# output, notes }` row.
#
# The type comes from the :recognizer record's payload when one is present,
# otherwise from SegmentClassifier::DEFAULT.classify(value). Output is the
# canonical policy's :after value when a canonical date/currency record
# exists, a "{display_type}" placeholder when the default classifier
# considers the type variable, and the raw value otherwise.
#
# @param name the query param key
# @param value [String] the query param value
# @param evidence [Array] Evidence records for this param
# @return [Hash]
def render_query_row(name, value, evidence)
  notes = collect_notes(evidence)
  classification = find_evidence(evidence, source: :recognizer)
  effective =
    if classification
      classification.payload[:type]
    else
      SegmentClassifier::DEFAULT.classify(value)
    end
  canonical = find_payload(evidence, :canonical_date) || find_payload(evidence, :canonical_currency)

  # Start from the literal value and override only when a rewrite applies.
  row = { name: name, value: value, type: effective, output: value, notes: notes }
  if canonical
    row[:output] = canonical[:after]
  elsif SegmentClassifier::DEFAULT.variable?(effective)
    row[:output] = "{#{SegmentClassifier.display_type(effective)}}"
  end

  row
end
|
|
258
|
+
|
|
259
|
+
# ── Helpers ──────────────────────────────────────────────────────────
|
|
260
|
+
|
|
261
|
+
# Returns `input` untouched when it already is an Identifier; anything else
# is handed to Parser.parse and the parse result is returned.
def coerce(input)
  return input if input.is_a?(Identifier)

  Parser.parse(input)
end
|
|
264
|
+
|
|
265
|
+
# Splits a URN's namespace-specific string into trace segments.
#
# @param iri object responding to #nss
# @return [Array<String>] [] when nss is nil/false; a one-element array
#   when nss contains no ":"; otherwise the two pieces around the first ":"
def urn_parts(iri)
  nss = iri.nss
  return [] unless nss
  return [nss] unless nss.include?(":")

  nss.split(":", 2)
end
|
|
269
|
+
|
|
270
|
+
# Gathers the notes of every Evidence record into one flat list, keeping
# record order.
#
# @param evidence [Array] objects responding to #notes
# @return [Array] all notes, flattened one level
def collect_notes(evidence)
  evidence.flat_map { |record| record.notes }
end
|
|
273
|
+
|
|
274
|
+
# First Evidence record whose #source equals `source`, or nil when no
# record matches.
#
# @param evidence [Array] objects responding to #source
# @param source the source tag to look for (e.g. :recognizer)
def find_evidence(evidence, source:)
  evidence.find do |record|
    record.source == source
  end
end
|
|
277
|
+
|
|
278
|
+
# Payload of the first :policy Evidence record whose payload[:rule] equals
# `rule`, or nil when there is no such record.
#
# @param evidence [Array] objects responding to #source and #payload
# @param rule the policy rule name (e.g. :canonical_date)
def find_payload(evidence, rule)
  match = evidence.find do |record|
    record.source == :policy && record.payload[:rule] == rule
  end

  match.payload unless match.nil?
end
|
|
282
|
+
|
|
283
|
+
# Name a segment would have received from its left neighbor.
#
# Returns nil for the first segment and whenever the preceding segment is
# not classified as :literal. Otherwise the preceding segment is
# singularized via Inflector.singularize and suffixed with "_uuid" (for
# :uuid types) or "_id" (for every other type).
#
# @param segments [Array] the full segment list
# @param idx [Integer] position of the segment being examined
# @param type the segment's classified type
# @param classifier object responding to #classify
# @return [String, nil]
def would_be_hint(segments, idx, type, classifier)
  return nil if idx.zero?

  neighbor = segments[idx - 1]
  return nil if classifier.classify(neighbor) != :literal

  suffix =
    case type
    when :uuid then "_uuid"
    else "_id"
    end

  "#{Inflector.singularize(neighbor)}#{suffix}"
end
|
|
293
|
+
end
|
|
294
|
+
end
|
data/lib/iriq/version.rb
CHANGED
data/lib/iriq.rb
CHANGED
|
@@ -3,16 +3,33 @@ require "iriq/errors"
|
|
|
3
3
|
require "iriq/inflector"
|
|
4
4
|
require "iriq/identifier"
|
|
5
5
|
require "iriq/parser"
|
|
6
|
+
require "iriq/specificity"
|
|
7
|
+
require "iriq/recognizer"
|
|
8
|
+
require "iriq/recognizers/uuid"
|
|
9
|
+
require "iriq/recognizers/date"
|
|
10
|
+
require "iriq/recognizers/integer"
|
|
6
11
|
require "iriq/segment_classifier"
|
|
7
12
|
require "iriq/segment_hints"
|
|
13
|
+
require "iriq/shape"
|
|
8
14
|
require "iriq/path_shape"
|
|
9
15
|
require "iriq/normalizer"
|
|
10
16
|
require "iriq/explanation"
|
|
17
|
+
require "iriq/evidence"
|
|
18
|
+
require "iriq/trace"
|
|
11
19
|
require "iriq/cluster"
|
|
12
20
|
require "iriq/clusterer"
|
|
13
21
|
require "iriq/position_stats"
|
|
22
|
+
require "set"
|
|
23
|
+
|
|
14
24
|
require "iriq/observation"
|
|
25
|
+
require "iriq/position"
|
|
26
|
+
require "iriq/event"
|
|
27
|
+
require "iriq/reducer"
|
|
28
|
+
require "iriq/registrable_domain"
|
|
15
29
|
require "iriq/storage"
|
|
30
|
+
require "iriq/recognizer_proposal"
|
|
31
|
+
require "iriq/synthesized_recognizer"
|
|
32
|
+
require "iriq/cross_host_shape"
|
|
16
33
|
require "iriq/corpus"
|
|
17
34
|
require "iriq/extractor"
|
|
18
35
|
require "iriq/cli"
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: iriq
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.2.0
|
|
4
|
+
version: 0.30.2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Daniel Pepper
|
|
@@ -92,6 +92,10 @@ files:
|
|
|
92
92
|
- LICENSE.txt
|
|
93
93
|
- Makefile
|
|
94
94
|
- README.md
|
|
95
|
+
- completions/_iriq
|
|
96
|
+
- completions/iriq.bash
|
|
97
|
+
- docs/ARCHITECTURE.md
|
|
98
|
+
- docs/ROADMAP.md
|
|
95
99
|
- exe/iriq
|
|
96
100
|
- iriq.gemspec
|
|
97
101
|
- lib/iriq.rb
|
|
@@ -99,7 +103,10 @@ files:
|
|
|
99
103
|
- lib/iriq/cluster.rb
|
|
100
104
|
- lib/iriq/clusterer.rb
|
|
101
105
|
- lib/iriq/corpus.rb
|
|
106
|
+
- lib/iriq/cross_host_shape.rb
|
|
102
107
|
- lib/iriq/errors.rb
|
|
108
|
+
- lib/iriq/event.rb
|
|
109
|
+
- lib/iriq/evidence.rb
|
|
103
110
|
- lib/iriq/explanation.rb
|
|
104
111
|
- lib/iriq/extractor.rb
|
|
105
112
|
- lib/iriq/identifier.rb
|
|
@@ -108,13 +115,25 @@ files:
|
|
|
108
115
|
- lib/iriq/observation.rb
|
|
109
116
|
- lib/iriq/parser.rb
|
|
110
117
|
- lib/iriq/path_shape.rb
|
|
118
|
+
- lib/iriq/position.rb
|
|
111
119
|
- lib/iriq/position_stats.rb
|
|
120
|
+
- lib/iriq/recognizer.rb
|
|
121
|
+
- lib/iriq/recognizer_proposal.rb
|
|
122
|
+
- lib/iriq/recognizers/date.rb
|
|
123
|
+
- lib/iriq/recognizers/integer.rb
|
|
124
|
+
- lib/iriq/recognizers/uuid.rb
|
|
125
|
+
- lib/iriq/reducer.rb
|
|
126
|
+
- lib/iriq/registrable_domain.rb
|
|
112
127
|
- lib/iriq/segment_classifier.rb
|
|
113
128
|
- lib/iriq/segment_hints.rb
|
|
129
|
+
- lib/iriq/shape.rb
|
|
130
|
+
- lib/iriq/specificity.rb
|
|
114
131
|
- lib/iriq/storage.rb
|
|
115
132
|
- lib/iriq/storage/json.rb
|
|
116
133
|
- lib/iriq/storage/memory.rb
|
|
117
134
|
- lib/iriq/storage/sqlite.rb
|
|
135
|
+
- lib/iriq/synthesized_recognizer.rb
|
|
136
|
+
- lib/iriq/trace.rb
|
|
118
137
|
- lib/iriq/version.rb
|
|
119
138
|
homepage: https://github.com/dpep/iriq
|
|
120
139
|
licenses:
|
|
@@ -127,14 +146,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
|
127
146
|
requirements:
|
|
128
147
|
- - ">="
|
|
129
148
|
- !ruby/object:Gem::Version
|
|
130
|
-
version: '3.
|
|
149
|
+
version: '3.4'
|
|
131
150
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
132
151
|
requirements:
|
|
133
152
|
- - ">="
|
|
134
153
|
- !ruby/object:Gem::Version
|
|
135
154
|
version: '0'
|
|
136
155
|
requirements: []
|
|
137
|
-
rubygems_version:
|
|
156
|
+
rubygems_version: 4.0.11
|
|
138
157
|
specification_version: 4
|
|
139
158
|
summary: IRI extraction, normalization, and clustering.
|
|
140
159
|
test_files: []
|