iriq 0.0.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +25 -0
- data/CLAUDE.md +121 -0
- data/Gemfile.lock +8 -2
- data/Makefile +56 -0
- data/README.md +334 -39
- data/iriq.gemspec +4 -3
- data/lib/iriq/cli.rb +289 -100
- data/lib/iriq/cluster.rb +47 -0
- data/lib/iriq/clusterer.rb +29 -39
- data/lib/iriq/corpus.rb +322 -0
- data/lib/iriq/explanation.rb +6 -22
- data/lib/iriq/extractor.rb +125 -0
- data/lib/iriq/identifier.rb +11 -3
- data/lib/iriq/inflector.rb +145 -0
- data/lib/iriq/normalizer.rb +11 -8
- data/lib/iriq/observation.rb +25 -0
- data/lib/iriq/parser.rb +1 -1
- data/lib/iriq/path_shape.rb +27 -9
- data/lib/iriq/position_stats.rb +64 -0
- data/lib/iriq/segment_classifier.rb +31 -7
- data/lib/iriq/segment_hints.rb +32 -0
- data/lib/iriq/storage/json.rb +43 -0
- data/lib/iriq/storage/memory.rb +138 -0
- data/lib/iriq/storage/sqlite.rb +367 -0
- data/lib/iriq/storage.rb +35 -0
- data/lib/iriq/version.rb +1 -1
- data/lib/iriq.rb +11 -0
- metadata +29 -4
data/README.md
CHANGED
|
@@ -3,19 +3,70 @@ Iriq
|
|
|
3
3
|

|
|
4
4
|
[](https://codecov.io/gh/dpep/iriq)
|
|
5
5
|
|
|
6
|
-
|
|
6
|
+
IRI extraction, normalization, and clustering.
|
|
7
7
|
|
|
8
|
-
Iriq
|
|
9
|
-
forms, classifies path and query components,
|
|
10
|
-
and
|
|
8
|
+
Iriq pulls IRIs out of free text, parses them, normalizes them into
|
|
9
|
+
canonical shape-aware forms, classifies their path and query components,
|
|
10
|
+
and clusters similar identifiers — surfacing what's stable vs. unique.
|
|
11
|
+
|
|
12
|
+
Ships as both a **command-line tool** (`iriq`) and a **library** (Ruby and
|
|
13
|
+
Go — same behavior, enforced by parity tests).
|
|
14
|
+
|
|
15
|
+
## Install
|
|
16
|
+
|
|
17
|
+
The CLI is available three ways. Pick whichever fits your workflow:
|
|
18
|
+
|
|
19
|
+
```sh
|
|
20
|
+
# Homebrew (recommended)
|
|
21
|
+
brew install dpep/tools/iriq
|
|
22
|
+
|
|
23
|
+
# RubyGems — installs the CLI shim and the library
|
|
24
|
+
gem install iriq
|
|
25
|
+
|
|
26
|
+
# Go — installs the CLI binary into $GOBIN
|
|
27
|
+
go install github.com/dpep/iriq/cmd/iriq@latest
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
For library use, depend on whichever runtime you're working in:
|
|
11
31
|
|
|
12
32
|
```ruby
|
|
13
|
-
|
|
33
|
+
# Gemfile
|
|
34
|
+
gem "iriq"
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
```go
|
|
38
|
+
import "github.com/dpep/iriq"
|
|
14
39
|
```
|
|
15
40
|
|
|
16
|
-
##
|
|
41
|
+
## CLI quick start
|
|
42
|
+
|
|
43
|
+
```
|
|
44
|
+
$ iriq https://foo.com/users/123
|
|
45
|
+
# parse
|
|
46
|
+
original: https://foo.com/users/123
|
|
47
|
+
kind: url
|
|
48
|
+
scheme: https
|
|
49
|
+
host: foo.com
|
|
50
|
+
path_segments: ["users", "123"]
|
|
51
|
+
canonical: https://foo.com/users/123
|
|
52
|
+
|
|
53
|
+
# normalize
|
|
54
|
+
https://foo.com/users/{user_id}
|
|
55
|
+
|
|
56
|
+
$ iriq -n https://foo.com/users/123
|
|
57
|
+
https://foo.com/users/{user_id}
|
|
58
|
+
|
|
59
|
+
$ cat access.log | iriq # extract → URL list (or clusters at scale)
|
|
60
|
+
$ cat access.log | iriq --stats # rolling aggregates
|
|
61
|
+
$ iriq ./access.log -n # file auto-detected → normalize each found URL
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
Full CLI reference is below under [CLI](#cli).
|
|
65
|
+
|
|
66
|
+
## Library quick start
|
|
17
67
|
|
|
18
68
|
```ruby
|
|
69
|
+
# Ruby
|
|
19
70
|
iri = Iriq.parse("https://foo.com/users/123")
|
|
20
71
|
iri.scheme # => "https"
|
|
21
72
|
iri.host # => "foo.com"
|
|
@@ -23,17 +74,56 @@ iri.path_segments # => ["users", "123"]
|
|
|
23
74
|
iri.canonical # => "https://foo.com/users/123"
|
|
24
75
|
|
|
25
76
|
Iriq.normalize("https://foo.com/users/123")
|
|
26
|
-
# => "https://foo.com/users/{
|
|
77
|
+
# => "https://foo.com/users/{user_id}"
|
|
27
78
|
|
|
28
79
|
Iriq.explain("https://foo.com/users/123/orders/456")
|
|
29
80
|
# => [
|
|
30
|
-
# { value: "users", type: :literal, variable: false },
|
|
31
|
-
# { value: "123", type: :integer_id, variable: true },
|
|
32
|
-
# { value: "orders", type: :literal, variable: false },
|
|
33
|
-
# { value: "456", type: :integer_id, variable: true },
|
|
81
|
+
# { value: "users", type: :literal, variable: false, hint: nil },
|
|
82
|
+
# { value: "123", type: :integer_id, variable: true, hint: "user_id" },
|
|
83
|
+
# { value: "orders", type: :literal, variable: false, hint: nil },
|
|
84
|
+
# { value: "456", type: :integer_id, variable: true, hint: "order_id" },
|
|
34
85
|
# ]
|
|
35
86
|
```
|
|
36
87
|
|
|
88
|
+
```go
|
|
89
|
+
// Go (same surface)
|
|
90
|
+
iri, _ := iriq.Parse("https://foo.com/users/123")
|
|
91
|
+
iri.Scheme // "https"
|
|
92
|
+
iri.Host // "foo.com"
|
|
93
|
+
iri.PathSegments // []string{"users", "123"}
|
|
94
|
+
iri.Canonical() // "https://foo.com/users/123"
|
|
95
|
+
|
|
96
|
+
norm, _ := iriq.Normalize("https://foo.com/users/123")
|
|
97
|
+
// "https://foo.com/users/{user_id}"
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
The Ruby gem is the reference implementation; Go mirrors its API and is
|
|
101
|
+
kept in sync via JSON fixtures plus a CLI parity harness. See
|
|
102
|
+
[CLAUDE.md](CLAUDE.md) for the dev process.
|
|
103
|
+
|
|
104
|
+
Pass `hints: false` to `Iriq.normalize` (or `PathShape`) for mechanical
|
|
105
|
+
placeholders (`{integer_id}` instead of `{user_id}`).
|
|
106
|
+
|
|
107
|
+
## RESTful hints
|
|
108
|
+
|
|
109
|
+
When a variable segment follows a literal one, Iriq derives a hint by
|
|
110
|
+
singularizing the literal and suffixing `_id` (or `_uuid` for UUIDs). This is
|
|
111
|
+
what produces `{user_id}` from `/users/123` and `{order_id}` from
|
|
112
|
+
`/orders/456`. Singularization uses `Iriq::Inflector`, which delegates to a
|
|
113
|
+
swappable adapter:
|
|
114
|
+
|
|
115
|
+
```ruby
|
|
116
|
+
# Default: ActiveSupport::Inflector if `active_support/inflector` is loadable,
|
|
117
|
+
# otherwise a built-in adapter with rules adapted from ActiveSupport.
|
|
118
|
+
|
|
119
|
+
Iriq::Inflector.singularize("categories") # => "category"
|
|
120
|
+
Iriq::Inflector.singularize("people") # => "person"
|
|
121
|
+
|
|
122
|
+
# Override:
|
|
123
|
+
Iriq::Inflector.adapter = MyAdapter # must respond to .singularize(String)
|
|
124
|
+
Iriq::Inflector.reset_adapter!
|
|
125
|
+
```
|
|
126
|
+
|
|
37
127
|
## Supported inputs
|
|
38
128
|
|
|
39
129
|
| Input | Notes |
|
|
@@ -69,7 +159,7 @@ clusterer.add("https://foo.com/users/456")
|
|
|
69
159
|
clusterer.add("https://foo.com/users/789/orders/1")
|
|
70
160
|
|
|
71
161
|
clusterer.clusters.map(&:shape)
|
|
72
|
-
# => ["/users/{
|
|
162
|
+
# => ["/users/{user_id}", "/users/{user_id}/orders/{order_id}"]
|
|
73
163
|
|
|
74
164
|
clusterer.clusters.first.segment_stats
|
|
75
165
|
# => [
|
|
@@ -79,8 +169,8 @@ clusterer.clusters.first.segment_stats
|
|
|
79
169
|
|
|
80
170
|
clusterer.explain("https://foo.com/users/999")
|
|
81
171
|
# => [
|
|
82
|
-
# { value: "users", type: :literal, variable: false, stable: true },
|
|
83
|
-
# { value: "999", type: :integer_id, variable: true, stable: false },
|
|
172
|
+
# { value: "users", type: :literal, variable: false, hint: nil, stable: true },
|
|
173
|
+
# { value: "999", type: :integer_id, variable: true, hint: "user_id", stable: false },
|
|
84
174
|
# ]
|
|
85
175
|
```
|
|
86
176
|
|
|
@@ -89,6 +179,104 @@ a position the classifier *would* call variable but that is empirically
|
|
|
89
179
|
constant across all members of the cluster will be reported with
|
|
90
180
|
`stable: true, variable: false`.
|
|
91
181
|
|
|
182
|
+
## Corpus (streaming + learning)
|
|
183
|
+
|
|
184
|
+
For processing many identifiers — possibly an unbounded stream — use
|
|
185
|
+
`Iriq::Corpus`. It maintains rolling aggregates and per-(host, prefix)
|
|
186
|
+
frequency stats so classification improves as more data comes in.
|
|
187
|
+
|
|
188
|
+
```ruby
|
|
189
|
+
corpus = Iriq::Corpus.new
|
|
190
|
+
|
|
191
|
+
iris.each do |iri|
|
|
192
|
+
obs = corpus.observe(iri)
|
|
193
|
+
obs.fingerprint # deterministic shape: "https://foo.com/users/{user_id}"
|
|
194
|
+
obs.cluster # the Iriq::Cluster this fell into
|
|
195
|
+
obs.explanation # per-segment annotations with corpus-informed classification
|
|
196
|
+
end
|
|
197
|
+
|
|
198
|
+
corpus.host_counts # { "foo.com" => 1234, "bar.com" => 7 }
|
|
199
|
+
corpus.path_length_counts # { 2 => 800, 3 => 434 }
|
|
200
|
+
corpus.fingerprint_counts # shape → count
|
|
201
|
+
corpus.raw_shape_counts # hint-free shape → count
|
|
202
|
+
corpus.clusters # Iriq::Cluster instances
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
### Deterministic vs. corpus-informed normalization
|
|
206
|
+
|
|
207
|
+
```ruby
|
|
208
|
+
Iriq.normalize("https://foo.com/users/me")
|
|
209
|
+
# => "https://foo.com/users/me" # mechanical: "me" is a literal
|
|
210
|
+
|
|
211
|
+
corpus.normalize("https://foo.com/users/me")
|
|
212
|
+
# => depends on what the corpus has seen
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
If many `/users/{integer_id}` paths flow in alongside a handful of
|
|
216
|
+
`/users/me`, the cluster `/users/me` is preserved (mechanical clustering
|
|
217
|
+
keeps literal routes distinct). If many *distinct literal handles*
|
|
218
|
+
(`/users/alice`, `/users/bob`, `/users/carol`, ...) flow in, the corpus
|
|
219
|
+
promotes that position to a `{user}` placeholder:
|
|
220
|
+
|
|
221
|
+
```ruby
|
|
222
|
+
%w[alice bob carol dave erin frank gina hank ivan jane].each do |name|
|
|
223
|
+
corpus.observe("https://foo.com/users/#{name}/profile")
|
|
224
|
+
end
|
|
225
|
+
|
|
226
|
+
corpus.normalize("https://foo.com/users/alice/profile")
|
|
227
|
+
# => "https://foo.com/users/{user}/profile"
|
|
228
|
+
```
|
|
229
|
+
|
|
230
|
+
### Explainability
|
|
231
|
+
|
|
232
|
+
Each row of `corpus.explain(...)` (and `observation.explanation`) carries a
|
|
233
|
+
`classification:` symbol on top of the deterministic fields:
|
|
234
|
+
|
|
235
|
+
| Classification | Meaning |
|
|
236
|
+
| --------------------------- | ---------------------------------------------------- |
|
|
237
|
+
| `:stable_literal` | Literal value dominates this position |
|
|
238
|
+
| `:variable_identifier` | Classifier said variable (uuid, integer, etc.) |
|
|
239
|
+
| `:rare_literal` | Literal seen here, but not dominant |
|
|
240
|
+
| `:corpus_inferred_variable` | Classifier said literal, but position has high entropy |
|
|
241
|
+
| `:ambiguous` | Insufficient signal — never seen, or mixed |
|
|
242
|
+
|
|
243
|
+
## Extracting IRIs from text
|
|
244
|
+
|
|
245
|
+
`Iriq::Extractor` is what powers pipe-mode in the CLI. It picks up explicit-
|
|
246
|
+
scheme URLs (`http`, `https`, `ftp`, `ws`, `wss`, `urn`) and `foo.com/path`-
|
|
247
|
+
style scheme-less URLs (small TLD allow-list, required path). Trims trailing
|
|
248
|
+
sentence punctuation iteratively and preserves balanced parens
|
|
249
|
+
(`https://en.wikipedia.org/wiki/Ruby_(programming_language)` stays intact;
|
|
250
|
+
`(see https://foo.com)` drops the outer paren).
|
|
251
|
+
|
|
252
|
+
```ruby
|
|
253
|
+
Iriq.extract("Visit https://foo.com today, also hit foo.com/users.")
|
|
254
|
+
# => [#<Iriq::Identifier https://foo.com>,
|
|
255
|
+
# #<Iriq::Identifier https://foo.com/users>]
|
|
256
|
+
|
|
257
|
+
# Disable scheme-less:
|
|
258
|
+
Iriq::Extractor.new(scheme_less: false).extract("hit foo.com/users today")
|
|
259
|
+
# => []
|
|
260
|
+
```
|
|
261
|
+
|
|
262
|
+
Known limitations (intentional):
|
|
263
|
+
|
|
264
|
+
- Comma is a URL boundary, so query strings like `?q=37.7,-122.4` truncate.
|
|
265
|
+
Trade-off picked to keep CSV-shaped text working.
|
|
266
|
+
- No HTML entity decoding (`&amp;` stays as-is).
|
|
267
|
+
- Scheme-less mode skips bare hostnames without a path (too noisy in prose).
|
|
268
|
+
|
|
269
|
+
### Memory bounds
|
|
270
|
+
|
|
271
|
+
- Per-position `value_counts` is capped (`max_values_per_position`, default
|
|
272
|
+
1000) — once full, `total` keeps growing but only existing keys count up.
|
|
273
|
+
- Cluster examples are capped at `Iriq::Cluster::MAX_EXAMPLES`.
|
|
274
|
+
- No raw IRI strings are retained outside the bounded cluster examples.
|
|
275
|
+
|
|
276
|
+
```ruby
|
|
277
|
+
Iriq::Corpus.new(max_values_per_position: 200)
|
|
278
|
+
```
|
|
279
|
+
|
|
92
280
|
## Object model
|
|
93
281
|
|
|
94
282
|
| Class | Responsibility |
|
|
@@ -96,51 +284,137 @@ constant across all members of the cluster will be reported with
|
|
|
96
284
|
| `Iriq::Parser` | String → `Identifier` |
|
|
97
285
|
| `Iriq::Identifier` | Structured fields + `canonical` reconstruction |
|
|
98
286
|
| `Iriq::SegmentClassifier` | Single segment → type symbol |
|
|
99
|
-
| `Iriq::PathShape` | Segments → `/users/{
|
|
287
|
+
| `Iriq::PathShape` | Segments → `/users/{user_id}` route shape |
|
|
288
|
+
| `Iriq::SegmentHints` | Derives `user_id`-style hints from neighbors |
|
|
289
|
+
| `Iriq::Inflector` | Singularization with swappable adapter (AS or built-in) |
|
|
100
290
|
| `Iriq::Normalizer` | Identifier → canonical, shape-aware string |
|
|
101
|
-
| `Iriq::Explanation` | Per-segment `{value, type, variable}`
|
|
291
|
+
| `Iriq::Explanation` | Per-segment `{value, type, variable, hint}` rows |
|
|
102
292
|
| `Iriq::Cluster` | One host + shape group, with examples & stats |
|
|
103
293
|
| `Iriq::Clusterer` | Many identifiers → `Cluster` set + explain |
|
|
294
|
+
| `Iriq::PositionStats` | Capped value/type frequencies for one position |
|
|
295
|
+
| `Iriq::Observation` | What `Corpus#observe` returns |
|
|
296
|
+
| `Iriq::Corpus` | Streaming observer with rolling aggregates + learning |
|
|
297
|
+
| `Iriq::Extractor` | Pulls IRIs out of free text (scheme-anchored) |
|
|
104
298
|
|
|
105
299
|
## CLI
|
|
106
300
|
|
|
107
|
-
Installing the gem
|
|
301
|
+
Installing the gem installs an `iriq` executable. Two main modes:
|
|
302
|
+
|
|
303
|
+
**Single input** — combined parse + normalize summary; trim with section
|
|
304
|
+
flags (`-p`, `-n`).
|
|
108
305
|
|
|
109
306
|
```
|
|
110
|
-
$ iriq
|
|
111
|
-
|
|
307
|
+
$ iriq foo.com/users/456
|
|
308
|
+
# parse
|
|
309
|
+
original: foo.com/users/456
|
|
112
310
|
kind: url
|
|
113
311
|
scheme: https
|
|
114
312
|
host: foo.com
|
|
115
|
-
path_segments: ["users", "
|
|
116
|
-
canonical: https://foo.com/users/
|
|
313
|
+
path_segments: ["users", "456"]
|
|
314
|
+
canonical: https://foo.com/users/456
|
|
117
315
|
|
|
118
|
-
|
|
119
|
-
https://foo.com/
|
|
316
|
+
# normalize
|
|
317
|
+
https://foo.com/users/{user_id}
|
|
120
318
|
|
|
121
|
-
$ iriq
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
literal orders
|
|
125
|
-
* integer_id 456
|
|
319
|
+
$ iriq -n https://foo.com/users/123
|
|
320
|
+
https://foo.com/users/{user_id}
|
|
321
|
+
```
|
|
126
322
|
|
|
127
|
-
|
|
128
|
-
|
|
323
|
+
**Piped stdin** — extraction runs by default. Output auto-switches: small
|
|
324
|
+
inputs get a deduplicated URL list, larger inputs (≥ 10 IRIs) get the
|
|
325
|
+
cluster view via an ephemeral corpus. Section flags work too — emit one
|
|
326
|
+
normalized URL / parsed record per extracted IRI.
|
|
129
327
|
|
|
130
|
-
$ cat urls.txt | iriq cluster
|
|
131
|
-
[2] foo.com /users/{integer_id}
|
|
132
|
-
https://foo.com/users/1
|
|
133
|
-
https://foo.com/users/2
|
|
134
|
-
[1] foo.com /posts/{slug}/edit
|
|
135
|
-
https://foo.com/posts/abc-123/edit
|
|
136
328
|
```
|
|
329
|
+
$ cat short.txt | iriq
|
|
330
|
+
[2] https://github.com/dpep/iriq
|
|
331
|
+
[1] https://foo.com/users
|
|
332
|
+
|
|
333
|
+
$ cat short.txt | iriq -n # normalized URL per line
|
|
334
|
+
https://github.com/dpep/iriq
|
|
335
|
+
https://foo.com/users
|
|
336
|
+
|
|
337
|
+
$ cat access.log | iriq # ≥ 10 IRIs → cluster view
|
|
338
|
+
[190] docs.example.com /users/{user_id}
|
|
339
|
+
[186] app.example.com /users/{user_id}
|
|
340
|
+
...
|
|
341
|
+
|
|
342
|
+
$ cat README.md | iriq --stats # rolling aggregates
|
|
343
|
+
$ cat README.md | iriq cluster # force cluster view
|
|
344
|
+
$ cat README.md | iriq --corpus c.json # persist into a corpus
|
|
345
|
+
```
|
|
346
|
+
|
|
347
|
+
`--corpus PATH` makes the corpus survive across invocations. The file
|
|
348
|
+
extension picks the storage backend:
|
|
137
349
|
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
350
|
+
- `.json` — a single atomically-written JSON file (default). Best for small
|
|
351
|
+
corpora and when you want the data human-readable.
|
|
352
|
+
- `.db` / `.sqlite` / `.sqlite3` — a SQLite database with WAL journaling.
|
|
353
|
+
Each observation is an incremental UPSERT, so multiple `iriq --corpus`
|
|
354
|
+
processes can write concurrently without clobbering each other, and the
|
|
355
|
+
cost of opening doesn't scale with corpus size.
|
|
356
|
+
|
|
357
|
+
Once the corpus has data, `-n` becomes corpus-informed:
|
|
358
|
+
|
|
359
|
+
```
|
|
360
|
+
$ for n in alice bob carol dave erin frank gina hank ivan jane; do
|
|
361
|
+
iriq --corpus c.db https://foo.com/users/$n/profile >/dev/null
|
|
362
|
+
done
|
|
363
|
+
|
|
364
|
+
$ iriq -n --corpus c.db https://foo.com/users/zoe/profile
|
|
365
|
+
https://foo.com/users/{user}/profile # mechanical would keep "zoe"
|
|
366
|
+
```
|
|
367
|
+
|
|
368
|
+
Library: `Iriq::Corpus.open("c.db")` (or `iriq.OpenCorpus("c.db")` in Go)
|
|
369
|
+
dispatches on the same extension rules. `corpus.save("export.json")`
|
|
370
|
+
exports any backend as JSON.
|
|
371
|
+
|
|
372
|
+
Flags:
|
|
373
|
+
|
|
374
|
+
| Flag | Effect |
|
|
375
|
+
| ------------------- | ------------------------------------------------------- |
|
|
376
|
+
| `-p, --parse` | Show parsed fields |
|
|
377
|
+
| `-n, --normalize` | Show the shape-normalized form |
|
|
378
|
+
| `-j, --json` | Emit JSON |
|
|
379
|
+
| `-N, --no-hints` | Use `{integer_id}` etc. instead of `{user_id}` |
|
|
380
|
+
| `--no-scheme-less` | Skip `foo.com/path`-style extraction (explicit-scheme only) |
|
|
381
|
+
| `--corpus PATH` | Load/create a corpus at PATH (`.json` or `.db`/`.sqlite`/`.sqlite3`) |
|
|
382
|
+
| `--stats` | Print rolling aggregates |
|
|
383
|
+
| `-V, --version` | Print version |
|
|
384
|
+
|
|
385
|
+
A positional argument that doesn't parse as an IRI but IS an existing
|
|
386
|
+
file is read and extracted from automatically — `iriq ./access.log` and
|
|
387
|
+
`iriq /var/log/foo.log` Just Work. (Bare filenames like `README.md`
|
|
388
|
+
may still parse as a URL; pipe with `cat` to disambiguate.)
|
|
141
389
|
|
|
142
390
|
Exit codes: `0` success, `1` usage error, `2` parse error.
|
|
143
391
|
|
|
392
|
+
## Performance
|
|
393
|
+
|
|
394
|
+
Measured on the deterministic `IriGenerator` fixture (Ruby 3.4.9, single
|
|
395
|
+
thread):
|
|
396
|
+
|
|
397
|
+
| Operation | Throughput |
|
|
398
|
+
| ------------------------ | ------------ |
|
|
399
|
+
| `Iriq.parse` | ~260k URLs/s |
|
|
400
|
+
| `Iriq.normalize` | ~148k URLs/s |
|
|
401
|
+
| `Iriq.explain` | ~205k URLs/s |
|
|
402
|
+
| `Iriq.extract` (prose) | ~9.6 MB/s |
|
|
403
|
+
| `Corpus#observe` | ~80k URLs/s |
|
|
404
|
+
| Corpus save/load (10k) | ~135 ms |
|
|
405
|
+
|
|
406
|
+
Linear scaling holds through 100k observations; per-observation retained
|
|
407
|
+
memory amortizes to ~100 bytes at that scale. Memoization caches are
|
|
408
|
+
bounded by `CACHE_MAX = 10_000` (cleared when full) — overhead is a few
|
|
409
|
+
hundred KB regardless of corpus size.
|
|
410
|
+
|
|
411
|
+
Re-run anytime with:
|
|
412
|
+
|
|
413
|
+
```
|
|
414
|
+
bundle exec script/benchmark.rb # throughput
|
|
415
|
+
bundle exec script/memory.rb # retained memory + cache footprints
|
|
416
|
+
```
|
|
417
|
+
|
|
144
418
|
## Limitations (intentional)
|
|
145
419
|
|
|
146
420
|
This is an MVP. Iriq does **not**:
|
|
@@ -158,6 +432,25 @@ For richer IRI handling, see `addressable`. Iriq's focus is the analysis
|
|
|
158
432
|
side: classification, normalization, and clustering — not a complete URL
|
|
159
433
|
implementation.
|
|
160
434
|
|
|
435
|
+
----
|
|
436
|
+
## Go port
|
|
437
|
+
|
|
438
|
+
A Go implementation lives under [`go/`](go/) — same public surface, same
|
|
439
|
+
behavior, ~10× faster CLI on extraction-heavy workloads. The Ruby gem is
|
|
440
|
+
the reference; the Go port stays in sync via golden JSON fixtures
|
|
441
|
+
(`spec/fixtures/`) and a CLI parity harness (`script/cli_parity.sh`), both
|
|
442
|
+
checked in CI.
|
|
443
|
+
|
|
444
|
+
```go
|
|
445
|
+
import "github.com/dpep/iriq"
|
|
446
|
+
|
|
447
|
+
iri, _ := iriq.Parse("https://foo.com/users/123")
|
|
448
|
+
norm, _ := iriq.Normalize("https://foo.com/users/123")
|
|
449
|
+
// "https://foo.com/users/{user_id}"
|
|
450
|
+
```
|
|
451
|
+
|
|
452
|
+
See [`go/README.md`](go/README.md) for the full API table and porting workflow.
|
|
453
|
+
|
|
161
454
|
----
|
|
162
455
|
## Contributing
|
|
163
456
|
|
|
@@ -166,6 +459,8 @@ Yes please :)
|
|
|
166
459
|
1. Fork it
|
|
167
460
|
1. Create your feature branch (`git checkout -b my-feature`)
|
|
168
461
|
1. Ensure the tests pass (`bundle exec rspec`)
|
|
462
|
+
1. If you changed library behavior, port the change to Go (or open an
|
|
463
|
+
issue) and regenerate fixtures: `bundle exec ruby script/generate_fixtures.rb`
|
|
169
464
|
1. Commit your changes (`git commit -am 'awesome new feature'`)
|
|
170
465
|
1. Push your branch (`git push origin my-feature`)
|
|
171
466
|
1. Create a Pull Request
|
data/iriq.gemspec
CHANGED
|
@@ -4,13 +4,13 @@ Gem::Specification.new do |s|
|
|
|
4
4
|
s.name = "iriq"
|
|
5
5
|
s.version = Iriq::VERSION
|
|
6
6
|
s.authors = ["Daniel Pepper"]
|
|
7
|
-
s.description = "
|
|
8
|
-
s.files = `git ls-files * ':!:spec'`.split("\n")
|
|
7
|
+
s.description = "IRI extraction, normalization, and clustering."
|
|
8
|
+
s.files = `git ls-files * ':!:spec' ':!:script' ':!:cmd' ':!:bin' ':!:*.go' ':!:go.mod' ':!:go.sum'`.split("\n")
|
|
9
9
|
s.bindir = "exe"
|
|
10
10
|
s.executables = ["iriq"]
|
|
11
11
|
s.homepage = "https://github.com/dpep/iriq"
|
|
12
12
|
s.license = "MIT"
|
|
13
|
-
s.summary = "
|
|
13
|
+
s.summary = "IRI extraction, normalization, and clustering."
|
|
14
14
|
|
|
15
15
|
s.required_ruby_version = ">= 3.2"
|
|
16
16
|
|
|
@@ -18,4 +18,5 @@ Gem::Specification.new do |s|
|
|
|
18
18
|
s.add_development_dependency 'rspec', '>= 3.10'
|
|
19
19
|
s.add_development_dependency 'rspec-debugging'
|
|
20
20
|
s.add_development_dependency 'simplecov', '>= 0.22'
|
|
21
|
+
s.add_development_dependency 'sqlite3', '>= 1.6'
|
|
21
22
|
end
|