iriq 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +2 -0
- data/Gemfile +3 -0
- data/Gemfile.lock +97 -0
- data/LICENSE.txt +21 -0
- data/README.md +171 -0
- data/exe/iriq +4 -0
- data/iriq.gemspec +21 -0
- data/lib/iriq/cli.rb +205 -0
- data/lib/iriq/cluster.rb +58 -0
- data/lib/iriq/clusterer.rb +77 -0
- data/lib/iriq/errors.rb +4 -0
- data/lib/iriq/explanation.rb +45 -0
- data/lib/iriq/identifier.rb +64 -0
- data/lib/iriq/normalizer.rb +49 -0
- data/lib/iriq/parser.rb +141 -0
- data/lib/iriq/path_shape.rb +27 -0
- data/lib/iriq/segment_classifier.rb +54 -0
- data/lib/iriq/version.rb +3 -0
- data/lib/iriq.rb +27 -0
- metadata +115 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: d1f6ebe248ed57192adc8f5c9600dfe14d9d0c85160dd5d6588d7fbfd7995e72
|
|
4
|
+
data.tar.gz: d80356a646effb078ebe78b62417534746ded550fd4145faa53998aa35866bde
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: d355eef90433cef5cd807d9c68d6259ad49d79dd87d842770fded377a535ed39e4d24228594b24ce96015f8c9e7246bdc96d7bcfa58f3cdf3350bb389bff211f
|
|
7
|
+
data.tar.gz: b0e1ddf1c6bcebadbe2578fe0760076a8401fd4fc2cd9a5f345b651371cf2fb47ebaf302565df61b05a389e0253f3febdbccc4e2389cf7d2e810abc8e4156a40
|
data/CHANGELOG.md
ADDED
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
PATH
|
|
2
|
+
remote: .
|
|
3
|
+
specs:
|
|
4
|
+
iriq (0.0.1)
|
|
5
|
+
|
|
6
|
+
GEM
|
|
7
|
+
remote: https://rubygems.org/
|
|
8
|
+
specs:
|
|
9
|
+
date (3.5.1)
|
|
10
|
+
debug (1.11.1)
|
|
11
|
+
irb (~> 1.10)
|
|
12
|
+
reline (>= 0.3.8)
|
|
13
|
+
diff-lcs (1.6.2)
|
|
14
|
+
docile (1.4.1)
|
|
15
|
+
erb (6.0.4)
|
|
16
|
+
io-console (0.8.2)
|
|
17
|
+
irb (1.17.0)
|
|
18
|
+
pp (>= 0.6.0)
|
|
19
|
+
prism (>= 1.3.0)
|
|
20
|
+
rdoc (>= 4.0.0)
|
|
21
|
+
reline (>= 0.4.2)
|
|
22
|
+
pp (0.6.3)
|
|
23
|
+
prettyprint
|
|
24
|
+
prettyprint (0.2.0)
|
|
25
|
+
prism (1.9.0)
|
|
26
|
+
psych (5.3.1)
|
|
27
|
+
date
|
|
28
|
+
stringio
|
|
29
|
+
rdoc (7.2.0)
|
|
30
|
+
erb
|
|
31
|
+
psych (>= 4.0.0)
|
|
32
|
+
tsort
|
|
33
|
+
reline (0.6.3)
|
|
34
|
+
io-console (~> 0.5)
|
|
35
|
+
rspec (3.13.2)
|
|
36
|
+
rspec-core (~> 3.13.0)
|
|
37
|
+
rspec-expectations (~> 3.13.0)
|
|
38
|
+
rspec-mocks (~> 3.13.0)
|
|
39
|
+
rspec-core (3.13.6)
|
|
40
|
+
rspec-support (~> 3.13.0)
|
|
41
|
+
rspec-debugging (0.0.4)
|
|
42
|
+
rspec-expectations (>= 3)
|
|
43
|
+
rspec-expectations (3.13.5)
|
|
44
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
|
45
|
+
rspec-support (~> 3.13.0)
|
|
46
|
+
rspec-mocks (3.13.8)
|
|
47
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
|
48
|
+
rspec-support (~> 3.13.0)
|
|
49
|
+
rspec-support (3.13.7)
|
|
50
|
+
simplecov (0.22.0)
|
|
51
|
+
docile (~> 1.1)
|
|
52
|
+
simplecov-html (~> 0.11)
|
|
53
|
+
simplecov_json_formatter (~> 0.1)
|
|
54
|
+
simplecov-html (0.13.2)
|
|
55
|
+
simplecov_json_formatter (0.1.4)
|
|
56
|
+
stringio (3.2.0)
|
|
57
|
+
tsort (0.2.0)
|
|
58
|
+
|
|
59
|
+
PLATFORMS
|
|
60
|
+
ruby
|
|
61
|
+
|
|
62
|
+
DEPENDENCIES
|
|
63
|
+
debug (>= 1)
|
|
64
|
+
iriq!
|
|
65
|
+
rspec (>= 3.10)
|
|
66
|
+
rspec-debugging
|
|
67
|
+
simplecov (>= 0.22)
|
|
68
|
+
|
|
69
|
+
CHECKSUMS
|
|
70
|
+
date (3.5.1) sha256=750d06384d7b9c15d562c76291407d89e368dda4d4fff957eb94962d325a0dc0
|
|
71
|
+
debug (1.11.1) sha256=2e0b0ac6119f2207a6f8ac7d4a73ca8eb4e440f64da0a3136c30343146e952b6
|
|
72
|
+
diff-lcs (1.6.2) sha256=9ae0d2cba7d4df3075fe8cd8602a8604993efc0dfa934cff568969efb1909962
|
|
73
|
+
docile (1.4.1) sha256=96159be799bfa73cdb721b840e9802126e4e03dfc26863db73647204c727f21e
|
|
74
|
+
erb (6.0.4) sha256=38e3803694be357fe2bfe312487c74beaf9fb4e5beb3e22498952fe1645b95d9
|
|
75
|
+
io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
|
|
76
|
+
irb (1.17.0) sha256=168c4ddb93d8a361a045c41d92b2952c7a118fa73f23fe14e55609eb7a863aae
|
|
77
|
+
iriq (0.0.1)
|
|
78
|
+
pp (0.6.3) sha256=2951d514450b93ccfeb1df7d021cae0da16e0a7f95ee1e2273719669d0ab9df6
|
|
79
|
+
prettyprint (0.2.0) sha256=2bc9e15581a94742064a3cc8b0fb9d45aae3d03a1baa6ef80922627a0766f193
|
|
80
|
+
prism (1.9.0) sha256=7b530c6a9f92c24300014919c9dcbc055bf4cdf51ec30aed099b06cd6674ef85
|
|
81
|
+
psych (5.3.1) sha256=eb7a57cef10c9d70173ff74e739d843ac3b2c019a003de48447b2963d81b1974
|
|
82
|
+
rdoc (7.2.0) sha256=8650f76cd4009c3b54955eb5d7e3a075c60a57276766ebf36f9085e8c9f23192
|
|
83
|
+
reline (0.6.3) sha256=1198b04973565b36ec0f11542ab3f5cfeeec34823f4e54cebde90968092b1835
|
|
84
|
+
rspec (3.13.2) sha256=206284a08ad798e61f86d7ca3e376718d52c0bc944626b2349266f239f820587
|
|
85
|
+
rspec-core (3.13.6) sha256=a8823c6411667b60a8bca135364351dda34cd55e44ff94c4be4633b37d828b2d
|
|
86
|
+
rspec-debugging (0.0.4) sha256=7a8e2dc240c140f0ed27b452a5661a56474ee8cf7b84c5bcbefd827ad36f0a6f
|
|
87
|
+
rspec-expectations (3.13.5) sha256=33a4d3a1d95060aea4c94e9f237030a8f9eae5615e9bd85718fe3a09e4b58836
|
|
88
|
+
rspec-mocks (3.13.8) sha256=086ad3d3d17533f4237643de0b5c42f04b66348c28bf6b9c2d3f4a3b01af1d47
|
|
89
|
+
rspec-support (3.13.7) sha256=0640e5570872aafefd79867901deeeeb40b0c9875a36b983d85f54fb7381c47c
|
|
90
|
+
simplecov (0.22.0) sha256=fe2622c7834ff23b98066bb0a854284b2729a569ac659f82621fc22ef36213a5
|
|
91
|
+
simplecov-html (0.13.2) sha256=bd0b8e54e7c2d7685927e8d6286466359b6f16b18cb0df47b508e8d73c777246
|
|
92
|
+
simplecov_json_formatter (0.1.4) sha256=529418fbe8de1713ac2b2d612aa3daa56d316975d307244399fa4838c601b428
|
|
93
|
+
stringio (3.2.0) sha256=c37cb2e58b4ffbd33fe5cd948c05934af997b36e0b6ca6fdf43afa234cf222e1
|
|
94
|
+
tsort (0.2.0) sha256=9650a793f6859a43b6641671278f79cfead60ac714148aabe4e3f0060480089f
|
|
95
|
+
|
|
96
|
+
BUNDLED WITH
|
|
97
|
+
4.0.9
|
data/LICENSE.txt
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
The MIT License (MIT)
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Daniel Pepper
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
data/README.md
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
Iriq
|
|
2
|
+
======
|
|
3
|
+

|
|
4
|
+
[Code coverage](https://codecov.io/gh/dpep/iriq)
|
|
5
|
+
|
|
6
|
+
Semantic IRI / URI / URL / URN normalization and clustering for Ruby.
|
|
7
|
+
|
|
8
|
+
Iriq parses resource identifiers, normalizes them into canonical IRI-like
|
|
9
|
+
forms, classifies path and query components, clusters similar identifiers,
|
|
10
|
+
and explains which parts are stable vs. unique.
|
|
11
|
+
|
|
12
|
+
```ruby
|
|
13
|
+
require "iriq"
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
## Quick start
|
|
17
|
+
|
|
18
|
+
```ruby
|
|
19
|
+
iri = Iriq.parse("https://foo.com/users/123")
|
|
20
|
+
iri.scheme # => "https"
|
|
21
|
+
iri.host # => "foo.com"
|
|
22
|
+
iri.path_segments # => ["users", "123"]
|
|
23
|
+
iri.canonical # => "https://foo.com/users/123"
|
|
24
|
+
|
|
25
|
+
Iriq.normalize("https://foo.com/users/123")
|
|
26
|
+
# => "https://foo.com/users/{integer_id}"
|
|
27
|
+
|
|
28
|
+
Iriq.explain("https://foo.com/users/123/orders/456")
|
|
29
|
+
# => [
|
|
30
|
+
# { value: "users", type: :literal, variable: false },
|
|
31
|
+
# { value: "123", type: :integer_id, variable: true },
|
|
32
|
+
# { value: "orders", type: :literal, variable: false },
|
|
33
|
+
# { value: "456", type: :integer_id, variable: true },
|
|
34
|
+
# ]
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Supported inputs
|
|
38
|
+
|
|
39
|
+
| Input | Notes |
|
|
40
|
+
| ------------------------------------ | ------------------------------------------------ |
|
|
41
|
+
| `https://foo.com/users/123` | Standard URL |
|
|
42
|
+
| `foo.com/users/456` | Scheme-less; `https://` is assumed |
|
|
43
|
+
| `urn:isbn:0451450523` | URN — `scheme` and `nss` are populated |
|
|
44
|
+
| `https://例え.テスト/こんにちは` | Unicode IRI — display form preserved |
|
|
45
|
+
| `HTTPS://Foo.com:443/A` | Scheme + host lowercased; default port dropped |
|
|
46
|
+
| `https://foo.com/a/./b/../c` | Dot segments normalized |
|
|
47
|
+
|
|
48
|
+
## Segment classification
|
|
49
|
+
|
|
50
|
+
`Iriq::SegmentClassifier` returns one of:
|
|
51
|
+
|
|
52
|
+
- `:literal` — plain word (`users`, `orders`, `Profile`, `こんにちは`)
|
|
53
|
+
- `:integer_id` — pure digits below the timestamp range (`1`, `123`, `42`)
|
|
54
|
+
- `:uuid` — `f47ac10b-58cc-4372-a567-0e02b2c3d479`
|
|
55
|
+
- `:date` — `2024-05-23`
|
|
56
|
+
- `:timestamp` — ISO 8601, or 10/13-digit UNIX epoch
|
|
57
|
+
- `:hash` — 32+ hex chars (md5 / sha)
|
|
58
|
+
- `:slug` — `my-cool-post`, `my_cool_post`
|
|
59
|
+
- `:opaque_id` — short alphanumeric mix that doesn't fit elsewhere
|
|
60
|
+
|
|
61
|
+
Heuristics are deterministic and ordered — the first matching rule wins.
|
|
62
|
+
|
|
63
|
+
## Clustering
|
|
64
|
+
|
|
65
|
+
```ruby
|
|
66
|
+
clusterer = Iriq::Clusterer.new
|
|
67
|
+
clusterer.add("https://foo.com/users/123")
|
|
68
|
+
clusterer.add("https://foo.com/users/456")
|
|
69
|
+
clusterer.add("https://foo.com/users/789/orders/1")
|
|
70
|
+
|
|
71
|
+
clusterer.clusters.map(&:shape)
|
|
72
|
+
# => ["/users/{integer_id}", "/users/{integer_id}/orders/{integer_id}"]
|
|
73
|
+
|
|
74
|
+
clusterer.clusters.first.segment_stats
|
|
75
|
+
# => [
|
|
76
|
+
# { position: 0, stable: true, values: { "users" => 2 } },
|
|
77
|
+
# { position: 1, stable: false, values: { "123" => 1, "456" => 1 } },
|
|
78
|
+
# ]
|
|
79
|
+
|
|
80
|
+
clusterer.explain("https://foo.com/users/999")
|
|
81
|
+
# => [
|
|
82
|
+
# { value: "users", type: :literal, variable: false, stable: true },
|
|
83
|
+
# { value: "999", type: :integer_id, variable: true, stable: false },
|
|
84
|
+
# ]
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
The clusterer combines classifier output with what it has actually observed:
|
|
88
|
+
a position the classifier *would* call variable but that is empirically
|
|
89
|
+
constant across all members of the cluster will be reported with
|
|
90
|
+
`stable: true, variable: false`.
|
|
91
|
+
|
|
92
|
+
## Object model
|
|
93
|
+
|
|
94
|
+
| Class | Responsibility |
|
|
95
|
+
| --------------------------- | ---------------------------------------------------- |
|
|
96
|
+
| `Iriq::Parser` | String → `Identifier` |
|
|
97
|
+
| `Iriq::Identifier` | Structured fields + `canonical` reconstruction |
|
|
98
|
+
| `Iriq::SegmentClassifier` | Single segment → type symbol |
|
|
99
|
+
| `Iriq::PathShape` | Segments → `/users/{integer_id}` route shape |
|
|
100
|
+
| `Iriq::Normalizer` | Identifier → canonical, shape-aware string |
|
|
101
|
+
| `Iriq::Explanation` | Per-segment `{value, type, variable}` annotations |
|
|
102
|
+
| `Iriq::Cluster` | One host + shape group, with examples & stats |
|
|
103
|
+
| `Iriq::Clusterer` | Many identifiers → `Cluster` set + explain |
|
|
104
|
+
|
|
105
|
+
## CLI
|
|
106
|
+
|
|
107
|
+
Installing the gem also installs an `iriq` executable.
|
|
108
|
+
|
|
109
|
+
```
|
|
110
|
+
$ iriq parse https://foo.com/users/123
|
|
111
|
+
original: https://foo.com/users/123
|
|
112
|
+
kind: url
|
|
113
|
+
scheme: https
|
|
114
|
+
host: foo.com
|
|
115
|
+
path_segments: ["users", "123"]
|
|
116
|
+
canonical: https://foo.com/users/123
|
|
117
|
+
|
|
118
|
+
$ iriq normalize foo.com/posts/2024-05-23/hello-world
|
|
119
|
+
https://foo.com/posts/{date}/{slug}
|
|
120
|
+
|
|
121
|
+
$ iriq explain https://foo.com/users/123/orders/456
|
|
122
|
+
literal users
|
|
123
|
+
* integer_id 123
|
|
124
|
+
literal orders
|
|
125
|
+
* integer_id 456
|
|
126
|
+
|
|
127
|
+
$ iriq classify f47ac10b-58cc-4372-a567-0e02b2c3d479
|
|
128
|
+
uuid
|
|
129
|
+
|
|
130
|
+
$ cat urls.txt | iriq cluster
|
|
131
|
+
[2] foo.com /users/{integer_id}
|
|
132
|
+
https://foo.com/users/1
|
|
133
|
+
https://foo.com/users/2
|
|
134
|
+
[1] foo.com /posts/{slug}/edit
|
|
135
|
+
https://foo.com/posts/abc-123/edit
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
Add `--json` to any command for machine-readable output. `iriq cluster` reads
|
|
139
|
+
identifiers (one per line) from a file argument or stdin; lines that fail to
|
|
140
|
+
parse are skipped with a warning on stderr.
|
|
141
|
+
|
|
142
|
+
Exit codes: `0` success, `1` usage error, `2` parse error.
|
|
143
|
+
|
|
144
|
+
## Limitations (intentional)
|
|
145
|
+
|
|
146
|
+
This is an MVP. Iriq does **not**:
|
|
147
|
+
|
|
148
|
+
- Implement RFC 3986, RFC 3987, or the WHATWG URL standard fully.
|
|
149
|
+
- Convert between Unicode (IRI) and punycode (URI) — the display form is
|
|
150
|
+
preserved as-is.
|
|
151
|
+
- Percent-encode or decode path/query bytes. Bytes are kept as written.
|
|
152
|
+
- Validate scheme-specific structure beyond URL vs. URN.
|
|
153
|
+
- Resolve relative references against a base URL.
|
|
154
|
+
- Round-trip `canonical` back to the exact original byte-for-byte (whitespace
|
|
155
|
+
is stripped, default ports are dropped, dot segments are collapsed).
|
|
156
|
+
|
|
157
|
+
For richer IRI handling, see `addressable`. Iriq's focus is the analysis
|
|
158
|
+
side: classification, normalization, and clustering — not a complete URL
|
|
159
|
+
implementation.
|
|
160
|
+
|
|
161
|
+
----
|
|
162
|
+
## Contributing
|
|
163
|
+
|
|
164
|
+
Yes please :)
|
|
165
|
+
|
|
166
|
+
1. Fork it
|
|
167
|
+
1. Create your feature branch (`git checkout -b my-feature`)
|
|
168
|
+
1. Ensure the tests pass (`bundle exec rspec`)
|
|
169
|
+
1. Commit your changes (`git commit -am 'awesome new feature'`)
|
|
170
|
+
1. Push your branch (`git push origin my-feature`)
|
|
171
|
+
1. Create a Pull Request
|
data/exe/iriq
ADDED
data/iriq.gemspec
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
require_relative "lib/iriq/version"
|
|
2
|
+
|
|
3
|
+
# Packaging manifest for the iriq gem. The version constant is supplied by
# lib/iriq/version, which is loaded before this block runs.
Gem::Specification.new do |spec|
  # Identity and registry metadata.
  spec.name = "iriq"
  spec.version = Iriq::VERSION
  spec.authors = ["Daniel Pepper"]
  spec.license = "MIT"
  spec.homepage = "https://github.com/dpep/iriq"
  spec.summary = "Semantic IRI normalization and clustering."
  spec.description = "Semantic IRI/URI/URL/URN parsing, normalization, classification, and clustering."

  spec.required_ruby_version = ">= 3.2"

  # Package contents come from git; the ':!:spec' pathspec excludes spec/.
  spec.files = `git ls-files * ':!:spec'`.split("\n")
  spec.bindir = "exe"
  spec.executables = ["iriq"]

  # Development-only tooling: [name, optional version requirement].
  [
    ["debug", ">= 1"],
    ["rspec", ">= 3.10"],
    ["rspec-debugging"],
    ["simplecov", ">= 0.22"],
  ].each { |requirement| spec.add_development_dependency(*requirement) }
end
|
data/lib/iriq/cli.rb
ADDED
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
require "json"
|
|
2
|
+
require "optparse"
|
|
3
|
+
|
|
4
|
+
module Iriq
  # Command-line front end for the public Iriq API.
  #
  # The three IO streams are injected through the constructor so specs can
  # drive the CLI in-process (for example with StringIO) instead of shelling
  # out. `run` never exits the process itself; it returns the exit status:
  # 0 on success, 1 on a usage error, 2 when an identifier fails to parse.
  class CLI
    COMMANDS = %w[parse normalize explain classify cluster help version].freeze

    # Number of example identifiers printed per cluster in text mode.
    PREVIEW_LIMIT = 3

    # The pipeline example uses printf rather than echo: a plain
    # `echo "a\nb"` does not expand "\n" in every shell, which would hand
    # both URLs to `cluster` as a single line.
    USAGE = <<~TXT
      Usage: iriq <command> [options] [args]

      Commands:
        parse <input>       Parse an identifier and print its fields
        normalize <input>   Print the shape-normalized form
        explain <input>     Annotate each path segment
        classify <segment>  Classify a single segment
        cluster [file]      Cluster identifiers from FILE or stdin (one per line)
        help                Show this message
        version             Print version

      Options:
        -j, --json          Emit JSON instead of human-readable output
        -h, --help          Show this message

      Examples:
        iriq parse https://foo.com/users/123
        iriq normalize foo.com/users/456
        printf "https://foo.com/users/1\\nhttps://foo.com/users/2\\n" | iriq cluster
    TXT

    attr_reader :stdin, :stdout, :stderr

    # stdin/stdout/stderr default to the process streams; pass IO-like
    # objects (anything responding to read / puts / printf) to capture them.
    def initialize(stdin: $stdin, stdout: $stdout, stderr: $stderr)
      @stdin = stdin
      @stdout = stdout
      @stderr = stderr
    end

    # Runs a single invocation described by +argv+ (an Array of Strings).
    #
    # Returns an Integer exit code. Iriq::ParseError is reported on stderr
    # and mapped to 2; option-parsing errors are reported and mapped to 1.
    def run(argv)
      args, opts = parse_options(argv)

      cmd = args.shift
      return print_usage(stdout, 0) if cmd.nil? || cmd == "help" || opts[:help]

      unless COMMANDS.include?(cmd)
        stderr.puts "iriq: unknown command #{cmd.inspect}"
        return print_usage(stderr, 1)
      end

      # require_arg! aborts a command by throwing :missing_arg with the exit
      # code; without this catch the throw would surface as UncaughtThrowError.
      catch(:missing_arg) { send("cmd_#{cmd}", args, opts) }
    rescue Iriq::ParseError => e
      stderr.puts "iriq: parse error: #{e.message}"
      2
    rescue OptionParser::ParseError => e
      stderr.puts "iriq: #{e.message}"
      1
    end

    private

    # Splits argv into [positional_args, options]. argv itself is not mutated.
    def parse_options(argv)
      opts = { json: false, help: false }
      parser = OptionParser.new do |o|
        o.on("-j", "--json") { opts[:json] = true }
        o.on("-h", "--help") { opts[:help] = true }
      end
      args = parser.parse(argv)
      [args, opts]
    end

    # Writes the usage text to +io+ and returns +code+ so callers can use the
    # call as their return value.
    def print_usage(io, code)
      io.puts USAGE
      code
    end

    # Returns the first positional argument. When it is absent, reports the
    # problem on stderr and unwinds to `run` with exit code 1.
    def require_arg!(args, name)
      return args.first if args.first

      stderr.puts "iriq: missing argument <#{name}>"
      throw :missing_arg, 1
    end

    def cmd_version(_args, _opts)
      stdout.puts Iriq::VERSION
      0
    end

    def cmd_parse(args, opts)
      input = require_arg!(args, :input)
      emit_parse(Iriq.parse(input), opts)
      0
    end

    def cmd_normalize(args, opts)
      input = require_arg!(args, :input)
      out = Iriq.normalize(input)
      opts[:json] ? stdout.puts(JSON.generate(normalized: out)) : stdout.puts(out)
      0
    end

    def cmd_explain(args, opts)
      input = require_arg!(args, :input)
      rows = Iriq.explain(input)
      if opts[:json]
        stdout.puts JSON.generate(rows)
      else
        rows.each do |r|
          # "*" flags segments that are reported as variable.
          mark = r[:variable] ? "*" : " "
          stdout.printf("%s %-12s %s\n", mark, r[:type], r[:value])
        end
      end
      0
    end

    def cmd_classify(args, opts)
      seg = require_arg!(args, :segment)
      type = SegmentClassifier.new.classify(seg)
      opts[:json] ? stdout.puts(JSON.generate(value: seg, type: type)) : stdout.puts(type)
      0
    end

    # Clusters identifiers read from a file or stdin. Blank lines are ignored
    # and lines that fail to parse are skipped with a warning on stderr. An
    # unreadable input file is a usage error (exit code 1) rather than an
    # uncaught exception.
    def cmd_cluster(args, opts)
      begin
        lines = read_input(args.first)
      rescue SystemCallError => e
        stderr.puts "iriq: cannot read #{args.first.inspect}: #{e.message}"
        return 1
      end

      clusterer = Clusterer.new
      lines.each do |line|
        line = line.strip
        next if line.empty?

        begin
          clusterer.add(line)
        rescue Iriq::ParseError => e
          stderr.puts "iriq: skipped #{line.inspect}: #{e.message}"
        end
      end
      emit_clusters(clusterer.clusters, opts)
      0
    end

    def cmd_help(_args, _opts)
      print_usage(stdout, 0)
    end

    # Returns the input as an Array of lines; a nil path or "-" means stdin.
    def read_input(path)
      if path.nil? || path == "-"
        stdin.read.lines
      else
        File.readlines(path)
      end
    end

    # Prints a parsed identifier as one JSON object, or as "field: value"
    # lines that omit fields which are not set.
    def emit_parse(iri, opts)
      if opts[:json]
        stdout.puts JSON.generate(
          original: iri.original,
          kind: iri.kind,
          scheme: iri.scheme,
          host: iri.host,
          port: iri.port,
          path_segments: iri.path_segments,
          query_params: iri.query_params,
          fragment: iri.fragment,
          nss: iri.nss,
          canonical: iri.canonical,
        )
      else
        stdout.puts "original: #{iri.original}"
        stdout.puts "kind: #{iri.kind}"
        stdout.puts "scheme: #{iri.scheme}" if iri.scheme
        stdout.puts "host: #{iri.host}" if iri.host
        stdout.puts "port: #{iri.port}" if iri.port
        stdout.puts "path_segments: #{iri.path_segments.inspect}" if iri.url?
        unless iri.query_params.empty?
          stdout.puts "query_params: #{iri.query_params.inspect}"
        end
        stdout.puts "fragment: #{iri.fragment}" if iri.fragment
        stdout.puts "nss: #{iri.nss}" if iri.nss
        stdout.puts "canonical: #{iri.canonical}"
      end
    end

    # Prints clusters from largest to smallest. Text mode shows at most
    # PREVIEW_LIMIT examples per cluster plus a "+ N more" line.
    def emit_clusters(clusters, opts)
      # sort_by is not guaranteed to be stable, so ties on count are broken
      # by arrival order to keep the output deterministic.
      sorted = clusters.sort_by.with_index { |c, i| [-c.count, i] }

      if opts[:json]
        stdout.puts JSON.generate(sorted.map(&:to_h))
      else
        sorted.each do |c|
          host = c.host || "(urn)"
          stdout.puts "[#{c.count}] #{host} #{c.shape}"
          c.examples.first(PREVIEW_LIMIT).each { |e| stdout.puts " #{e.canonical}" }
          stdout.puts " + #{c.count - PREVIEW_LIMIT} more" if c.count > PREVIEW_LIMIT
        end
      end
    end
  end
end
|
data/lib/iriq/cluster.rb
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
module Iriq
  # One bucket of identifiers that share a cluster key. Besides a running
  # member count it keeps a bounded list of example members and, for every
  # path position, a tally of the segment values seen there, so callers can
  # tell which positions have only ever held a single value.
  class Cluster
    # Upper bound on retained example identifiers; `count` is not capped.
    MAX_EXAMPLES = 10

    attr_reader :key, :host, :scheme, :shape, :examples, :count

    def initialize(key:, host:, scheme:, shape:)
      @key, @host, @scheme, @shape = key, host, scheme, shape
      @count = 0
      @examples = []
      # Index = path position; element = Hash of segment value => occurrences.
      @segment_counts = []
    end

    # Records one member. +identifier+ must respond to `path_segments`.
    def add(identifier)
      @count += 1
      @examples.push(identifier) unless @examples.size >= MAX_EXAMPLES

      identifier.path_segments.each_with_index do |segment, position|
        tally = (@segment_counts[position] ||= Hash.new(0))
        tally[segment] += 1
      end
    end

    # One Hash per observed path position, for example:
    #   { position: 0, stable: true,  values: { "users" => 3 } }
    #   { position: 1, stable: false, values: { "1" => 1, "2" => 1, "3" => 1 } }
    # `stable` is true when exactly one distinct value was seen at that
    # position. `values` is a copy, so callers cannot alter the tallies.
    def segment_stats
      @segment_counts.each_with_index.map do |tally, position|
        distinct = tally.size
        { position: position, stable: distinct == 1, values: tally.dup }
      end
    end

    # Plain-Hash form of the cluster; examples are rendered through their
    # `canonical` method.
    def to_h
      rendered = examples.map(&:canonical)
      {
        key: key,
        host: host,
        scheme: scheme,
        shape: shape,
        count: count,
        examples: rendered,
        segments: segment_stats,
      }
    end
  end
end
|
data/lib/iriq/clusterer.rb
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
module Iriq
  # Sorts identifiers into Cluster buckets. URLs are keyed by scheme, host
  # and path shape; URNs by namespace and the shape of the remaining value.
  # Feed inputs through `add`, read the buckets back with `clusters`, and use
  # `explain` to annotate one identifier against what its bucket has seen.
  class Clusterer
    def initialize(classifier: SegmentClassifier.new)
      @classifier = classifier
      # cluster key (String) => Cluster
      @clusters = {}
    end

    # Files +input+ (an Identifier, or anything Parser.parse accepts) under
    # its cluster, creating the cluster on first sight. Returns the Cluster.
    def add(input)
      identifier = coerce(input)
      key, host, scheme, shape = cluster_key(identifier)

      @clusters[key] ||= Cluster.new(key: key, host: host, scheme: scheme, shape: shape)
      bucket = @clusters[key]
      bucket.add(identifier)
      bucket
    end

    # All clusters created so far.
    def clusters
      @clusters.values
    end

    # Number of clusters (not the number of identifiers added).
    def size
      @clusters.size
    end

    # Builds one row per path segment of +input+:
    #   { value:, type:, variable:, stable: }
    # `stable` reflects the matching cluster's statistics for that position
    # (false when no cluster or no statistic exists). A stable position is
    # always reported with `variable: false`; otherwise `variable` is the
    # classifier's verdict for the segment's type.
    def explain(input)
      identifier = coerce(input)
      key, = cluster_key(identifier)
      bucket = @clusters[key]
      observed = bucket ? bucket.segment_stats : []

      identifier.path_segments.each_with_index.map do |segment, position|
        kind = @classifier.classify(segment)
        stat = observed[position]
        pinned = stat && stat[:stable]
        {
          value: segment,
          type: kind,
          variable: pinned ? false : @classifier.variable?(kind),
          stable: pinned ? true : false,
        }
      end
    end

    private

    # Identifiers pass through untouched; anything else goes to the parser.
    def coerce(input)
      return input if input.is_a?(Identifier)

      Parser.parse(input)
    end

    # Returns [key, host, scheme, shape] for +iri+. For URNs the host is nil
    # and the key doubles as the shape.
    def cluster_key(iri)
      unless iri.urn?
        shape = PathShape.new(classifier: @classifier).for(iri.path_segments)
        return ["#{iri.scheme}://#{iri.host}#{shape}", iri.host, iri.scheme, shape]
      end

      namespace, specific = (iri.nss || "").split(":", 2)
      placeholder =
        if specific
          kind = @classifier.classify(specific)
          @classifier.variable?(kind) ? "{#{kind}}" : specific
        end
      urn_key = "urn:#{namespace}:#{placeholder}"
      [urn_key, nil, "urn", urn_key]
    end
  end
end
|
data/lib/iriq/errors.rb
ADDED
data/lib/iriq/explanation.rb
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
module Iriq
  # Annotates each part of a single identifier with its classified type.
  #
  #   Explanation.explain("https://foo.com/users/123")
  #   # => [
  #   #   { value: "users", type: :literal,    variable: false },
  #   #   { value: "123",   type: :integer_id, variable: true  },
  #   # ]
  module Explanation
    module_function

    # +input+ may be an Identifier or anything Parser.parse accepts. URLs
    # yield one row per path segment; URNs are handled by `explain_urn`.
    def explain(input, classifier: SegmentClassifier.new)
      iri =
        case input
        when Identifier then input
        else Parser.parse(input)
        end

      return explain_urn(iri, classifier) if iri.urn?

      iri.path_segments.map { |segment| segment_entry(segment, classifier) }
    end

    # Row for one segment: its value, the classifier's type for it, and
    # whether the classifier treats that type as variable.
    def segment_entry(segment, classifier)
      kind = classifier.classify(segment)
      { value: segment, type: kind, variable: classifier.variable?(kind) }
    end

    # Rows for a URN. With no NSS the result is empty. An NSS containing ":"
    # is split once: the part before the colon is reported as a fixed
    # literal and the remainder is classified. Otherwise the whole NSS is
    # classified as a single entry.
    def explain_urn(iri, classifier)
      return [] unless iri.nss
      return [segment_entry(iri.nss, classifier)] unless iri.nss.include?(":")

      namespace, specific = iri.nss.split(":", 2)
      [
        { value: namespace, type: :literal, variable: false },
        segment_entry(specific, classifier),
      ]
    end
  end
end
|
data/lib/iriq/identifier.rb
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
module Iriq
  # Value object holding the fields extracted from one input string, plus
  # the untouched original.
  #
  # URN-style inputs (`urn:isbn:0451450523`) carry only `scheme` and `nss`
  # (the Namespace Specific String); host and path stay nil.
  class Identifier
    attr_reader :original, :scheme, :host, :port, :path,
                :path_segments, :query, :query_params, :fragment,
                :nss, :kind

    # Only +original+ is required. +kind+ is :url unless stated otherwise.
    def initialize(original:, scheme: nil, host: nil, port: nil, path: nil,
                   path_segments: [], query: nil, query_params: {},
                   fragment: nil, nss: nil, kind: :url)
      @original = original
      @kind = kind
      @scheme = scheme
      @host = host
      @port = port
      @path = path
      @path_segments = path_segments
      @query = query
      @query_params = query_params
      @fragment = fragment
      @nss = nss
    end

    def urn?
      :urn == kind
    end

    def url?
      :url == kind
    end

    # Reassembles an IRI-like string from the stored fields. Values are
    # emitted as stored: no punycode or percent-encoding pass is applied.
    # Components that are unset are left out, as are empty query and
    # fragment strings.
    def canonical
      return "urn:#{nss}" if urn?

      text = "".dup
      text << "#{scheme}://" if scheme
      text << host if host
      text << ":#{port}" if port
      text << "/#{path_segments.join("/")}" if path_segments.any?
      text << "?#{query}" unless !query || query.empty?
      text << "##{fragment}" unless !fragment || fragment.empty?
      text
    end

    alias_method :to_s, :canonical

    # Identifiers are equal when their canonical strings match; `eql?` and
    # `hash` follow the same rule so instances work as Hash keys.
    def ==(other)
      return false unless other.is_a?(Identifier)

      other.canonical == canonical
    end

    alias_method :eql?, :==

    def hash
      canonical.hash
    end
  end
end
|
data/lib/iriq/normalizer.rb
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
module Iriq
  # Renders an identifier as a shape string in which variable parts are
  # replaced by `{type}` placeholders.
  #
  #   Normalizer.normalize("https://Foo.com:443/users/123")
  #   # => "https://foo.com/users/{integer_id}"
  #
  # The result is meant for grouping and diffing; it is not a URL that can
  # be resolved or round-tripped.
  module Normalizer
    module_function

    # +input+ may be an Identifier or anything Parser.parse accepts.
    def normalize(input, classifier: SegmentClassifier.new)
      iri =
        case input
        when Identifier then input
        else Parser.parse(input)
        end

      normalize_identifier(iri, classifier: classifier)
    end

    # Shapes an already-parsed identifier.
    #
    # URNs with a "namespace:value" NSS keep the namespace and replace the
    # value when its type is variable (urn:isbn:0451450523 ->
    # urn:isbn:{integer_id}); any other URN falls back to `canonical`.
    # URLs keep scheme, host and port, get a shaped path from PathShape and a
    # shaped query string; the fragment is not included.
    def normalize_identifier(iri, classifier: SegmentClassifier.new)
      if iri.urn?
        shapeable = iri.scheme == "urn" && iri.nss && iri.nss.include?(":")
        return iri.canonical unless shapeable

        namespace, specific = iri.nss.split(":", 2)
        kind = classifier.classify(specific)
        tail = classifier.variable?(kind) ? "{#{kind}}" : specific
        return "urn:#{namespace}:#{tail}"
      end

      shaped = "".dup
      shaped << "#{iri.scheme}://" if iri.scheme
      shaped << iri.host if iri.host
      shaped << ":#{iri.port}" if iri.port
      shaped << PathShape.new(classifier: classifier).for(iri.path_segments)

      params = iri.query_params
      shaped << "?" + shape_query(params, classifier) unless !params || params.empty?
      shaped
    end

    # Builds "k=v&k=v" with keys in sorted order; each value whose type is
    # variable is replaced by a `{type}` placeholder.
    def shape_query(params, classifier)
      pairs = params.keys.sort.map do |name|
        raw = params[name]
        kind = classifier.classify(raw.to_s)
        rendered = classifier.variable?(kind) ? "{#{kind}}" : raw
        "#{name}=#{rendered}"
      end
      pairs.join("&")
    end
  end
end
|
data/lib/iriq/parser.rb
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
module Iriq
  # Lightweight, Unicode-aware parser for URL/IRI/URN inputs.
  #
  # Intentionally NOT a full RFC 3986 / 3987 / WHATWG URL implementation. We
  # accept enough of the common shapes (URLs, scheme-less hosts, URNs, raw
  # Unicode hosts and paths) to support normalization and clustering.
  module Parser
    # Leading "scheme:" prefix; capture group 1 is the scheme name.
    SCHEME_RE = /\A([a-zA-Z][a-zA-Z0-9+\-.]*):/.freeze

    # Matches a host-ish first token before the first slash. We deliberately
    # allow any non-ASCII character so IRIs work without punycode.
    HOSTISH_RE = %r{
      \A
      (?<host>[^/?#\s:]+\.[^/?#\s:]+|localhost)   # something.something or localhost
      (?::(?<port>\d+))?
      (?<rest>[/?#].*)?
      \z
    }x.freeze

    # Ports that are dropped from the parsed result when they match the scheme.
    DEFAULT_PORTS = {
      "http" => 80,
      "https" => 443,
      "ftp" => 21,
      "ws" => 80,
      "wss" => 443,
    }.freeze

    module_function

    # Parses +input+ into an Identifier.
    #
    # Handled shapes, in order:
    # * "urn:..."            -> URN identifier
    # * "scheme://..."       -> authority URL
    # * "host.tld:1234/..."  -> scheme-less host with port, assumed https
    # * "scheme:opaque"      -> opaque identifier (kind :urn)
    # * "host.tld/..."       -> scheme-less host, assumed https
    #
    # Raises ParseError when +input+ is nil, not a String, blank, or matches
    # none of the shapes above.
    def parse(input)
      raise ParseError, "input is nil" if input.nil?
      raise ParseError, "input must be a String" unless input.is_a?(String)

      stripped = input.strip
      raise ParseError, "input is empty" if stripped.empty?

      if (m = stripped.match(SCHEME_RE))
        scheme = m[1].downcase
        rest = stripped[m[0].length..]

        if scheme == "urn"
          parse_urn(input, rest)
        elsif rest.start_with?("//")
          parse_authority_url(input, scheme, rest[2..])
        elsif (hostish = stripped.match(HOSTISH_RE)) && hostish[:port]
          # "example.com:8080/x" and "localhost:3000" also satisfy SCHEME_RE
          # (dots are legal scheme characters), which previously made them
          # opaque identifiers with scheme "example.com". A dotted host (or
          # localhost) followed by a purely numeric port is a scheme-less
          # authority, so treat it like the scheme-less branch below.
          parse_authority_url(input, "https", stripped)
        else
          # opaque scheme like mailto:foo@bar — keep nss, mark as urn-ish so we
          # don't pretend we know its host/path layout.
          Identifier.new(original: input, scheme: scheme, nss: rest, kind: :urn)
        end
      else
        # No scheme. If it looks like a hostname, assume https.
        if HOSTISH_RE.match?(stripped)
          parse_authority_url(input, "https", stripped)
        else
          raise ParseError, "cannot parse #{input.inspect}: no scheme and no host-like prefix"
        end
      end
    end

    # Builds a URN identifier from the text following "urn:".
    #
    # Raises ParseError when nothing follows the scheme.
    def parse_urn(original, rest)
      raise ParseError, "urn missing namespace" if rest.nil? || rest.empty?

      Identifier.new(original: original, scheme: "urn", nss: rest, kind: :urn)
    end

    # Builds a URL identifier from +remainder+, the text after "scheme://"
    # (or the whole input for scheme-less hosts).
    #
    # The host is lowercased, a port equal to the scheme's default is dropped,
    # and the path is rebuilt from its normalized segments.
    #
    # Raises ParseError when no host can be extracted.
    def parse_authority_url(original, scheme, remainder)
      # Fallback pattern accepts hosts HOSTISH_RE rejects (no dot, not
      # localhost), e.g. single-label hosts.
      m = remainder.match(HOSTISH_RE) || remainder.match(%r{\A(?<host>[^/?#]+?)(?::(?<port>\d+))?(?<rest>[/?#].*)?\z})
      raise ParseError, "cannot parse authority from #{original.inspect}" unless m

      host = m[:host].downcase
      port = m[:port]&.to_i
      port = nil if port && DEFAULT_PORTS[scheme] == port

      rest = m[:rest] || ""
      path, query, fragment = split_path_query_fragment(rest)
      segments = path_segments(path)

      Identifier.new(
        original: original,
        scheme: scheme,
        host: host,
        port: port,
        path: "/" + segments.join("/"),
        path_segments: segments,
        query: query,
        query_params: parse_query(query),
        fragment: fragment,
        kind: :url,
      )
    end

    # Splits "path?query#fragment" into [path, query, fragment]. The fragment
    # is cut off first, so a "?" inside the fragment stays in the fragment.
    # Missing parts are nil.
    def split_path_query_fragment(rest)
      path = rest
      query = nil
      fragment = nil

      if (idx = path.index("#"))
        fragment = path[(idx + 1)..]
        path = path[0...idx]
      end

      if (idx = path.index("?"))
        query = path[(idx + 1)..]
        path = path[0...idx]
      end

      [path, query, fragment]
    end

    # Apply dot-segment normalization (RFC 3986 §5.2.4, lightweight version)
    # and drop empty segments from leading/trailing/duplicate slashes.
    #
    # Returns an Array of segment Strings (empty for nil, "" or "/").
    def path_segments(path)
      return [] if path.nil? || path.empty? || path == "/"

      raw = path.sub(%r{\A/}, "").split("/")
      out = []
      raw.each do |seg|
        case seg
        when "", "."
          next
        when ".."
          # Popping an empty array is a no-op, so ".." never climbs above root.
          out.pop
        else
          out << seg
        end
      end
      out
    end

    # Parses "a=1&b=2" into a Hash of raw (undecoded) strings. Pairs with an
    # empty key are skipped, a key without "=" maps to nil, and a repeated key
    # keeps its last value.
    def parse_query(query)
      return {} if query.nil? || query.empty?

      query.split("&").each_with_object({}) do |pair, acc|
        k, v = pair.split("=", 2)
        next if k.nil? || k.empty?

        acc[k] = v
      end
    end
  end
end
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
module Iriq
  # Renders a list of path segments as a route shape, substituting a
  # `{type}` placeholder for every segment the classifier deems variable.
  #
  #   PathShape.for(["users", "123", "orders", "456"])
  #   # => "/users/{integer_id}/orders/{integer_id}"
  class PathShape
    # One-shot convenience: builds a PathShape with +classifier+ and shapes
    # +segments+ with it.
    def self.for(segments, classifier: SegmentClassifier.new)
      new(classifier: classifier).for(segments)
    end

    # classifier - object responding to #classify(segment) and
    #              #variable?(type).
    def initialize(classifier: SegmentClassifier.new)
      @classifier = classifier
    end

    # Returns the shape String for +segments+; nil or an empty list yields "/".
    def for(segments)
      if segments.nil? || segments.empty?
        "/"
      else
        shaped = segments.map { |segment| shape_segment(segment) }
        "/" + shaped.join("/")
      end
    end

    # Returns +segment+ unchanged when its type is not variable, otherwise
    # the "{type}" placeholder for it.
    def shape_segment(segment)
      kind = @classifier.classify(segment)
      return segment unless @classifier.variable?(kind)

      "{#{kind}}"
    end
  end
end
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
module Iriq
  # Heuristic classifier for single path segments and query values.
  #
  # #classify answers with one of the symbols in TYPES. The rules are tried
  # in a fixed order and the first one that matches decides the type.
  class SegmentClassifier
    TYPES = %i[literal integer_id uuid date timestamp hash slug opaque_id].freeze

    UUID_RE = /\A\h{8}-\h{4}-\h{4}-\h{4}-\h{12}\z/.freeze
    INTEGER_RE = /\A\d+\z/.freeze
    DATE_RE = /\A\d{4}-\d{2}-\d{2}\z/.freeze
    ISO_TIME_RE = /\A\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}(:\d{2})?(\.\d+)?(Z|[+\-]\d{2}:?\d{2})?\z/.freeze
    HASH_RE = /\A\h{32,}\z/.freeze
    SLUG_RE = /\A[a-z0-9]+(?:[-_][a-z0-9]+)+\z/.freeze
    LITERAL_RE = /\A[\p{L}][\p{L}\p{M}_]*\z/u.freeze
    OPAQUE_RE = /\A[A-Za-z0-9_\-.~]{4,}\z/.freeze

    # Integer ranges treated as UNIX timestamps: seconds from 1_000_000_000
    # (September 2001) up to the largest 10-digit value, and the matching
    # 13-digit millisecond range.
    TS_SECONDS_RANGE = 1_000_000_000..9_999_999_999
    TS_MILLIS_RANGE = 1_000_000_000_000..9_999_999_999_999

    # Returns the type Symbol for +segment+. nil, empty, and unmatched
    # segments are all :literal.
    def classify(segment)
      return :literal if segment.nil? || segment.empty?

      # Order matters: earlier patterns shadow later ones (an all-digit
      # string is an integer even when it is long enough to look like a
      # hash; "2024-01" is a slug, not a literal).
      return :uuid if UUID_RE === segment
      return :date if DATE_RE === segment
      return :timestamp if ISO_TIME_RE === segment
      return classify_integer(segment) if INTEGER_RE === segment
      return :hash if HASH_RE === segment
      return :slug if SLUG_RE === segment
      return :literal if LITERAL_RE === segment
      return :opaque_id if OPAQUE_RE === segment

      :literal
    end

    # Every type other than :literal counts as variable for shape/explain.
    def variable?(type)
      type != :literal
    end

    private

    # Distinguishes timestamp-looking integers from plain integer ids.
    def classify_integer(segment)
      number = segment.to_i
      if TS_MILLIS_RANGE.cover?(number) || TS_SECONDS_RANGE.cover?(number)
        :timestamp
      else
        :integer_id
      end
    end
  end
end
|
data/lib/iriq/version.rb
ADDED
data/lib/iriq.rb
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
require "iriq/version"
|
|
2
|
+
require "iriq/errors"
|
|
3
|
+
require "iriq/identifier"
|
|
4
|
+
require "iriq/parser"
|
|
5
|
+
require "iriq/segment_classifier"
|
|
6
|
+
require "iriq/path_shape"
|
|
7
|
+
require "iriq/normalizer"
|
|
8
|
+
require "iriq/explanation"
|
|
9
|
+
require "iriq/cluster"
|
|
10
|
+
require "iriq/clusterer"
|
|
11
|
+
require "iriq/cli"
|
|
12
|
+
|
|
13
|
+
module Iriq
  # Top-level convenience entry points; each one forwards +input+ unchanged
  # to the component that implements it.

  # Delegates to Parser.parse.
  def self.parse(input)
    Parser.parse(input)
  end

  # Delegates to Normalizer.normalize (with its default classifier).
  def self.normalize(input)
    Normalizer.normalize(input)
  end

  # Delegates to Explanation.explain.
  def self.explain(input)
    Explanation.explain(input)
  end
end
|
metadata
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: iriq
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.0.1
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- Daniel Pepper
|
|
8
|
+
bindir: exe
|
|
9
|
+
cert_chain: []
|
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
|
11
|
+
dependencies:
|
|
12
|
+
- !ruby/object:Gem::Dependency
|
|
13
|
+
name: debug
|
|
14
|
+
requirement: !ruby/object:Gem::Requirement
|
|
15
|
+
requirements:
|
|
16
|
+
- - ">="
|
|
17
|
+
- !ruby/object:Gem::Version
|
|
18
|
+
version: '1'
|
|
19
|
+
type: :development
|
|
20
|
+
prerelease: false
|
|
21
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
22
|
+
requirements:
|
|
23
|
+
- - ">="
|
|
24
|
+
- !ruby/object:Gem::Version
|
|
25
|
+
version: '1'
|
|
26
|
+
- !ruby/object:Gem::Dependency
|
|
27
|
+
name: rspec
|
|
28
|
+
requirement: !ruby/object:Gem::Requirement
|
|
29
|
+
requirements:
|
|
30
|
+
- - ">="
|
|
31
|
+
- !ruby/object:Gem::Version
|
|
32
|
+
version: '3.10'
|
|
33
|
+
type: :development
|
|
34
|
+
prerelease: false
|
|
35
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
36
|
+
requirements:
|
|
37
|
+
- - ">="
|
|
38
|
+
- !ruby/object:Gem::Version
|
|
39
|
+
version: '3.10'
|
|
40
|
+
- !ruby/object:Gem::Dependency
|
|
41
|
+
name: rspec-debugging
|
|
42
|
+
requirement: !ruby/object:Gem::Requirement
|
|
43
|
+
requirements:
|
|
44
|
+
- - ">="
|
|
45
|
+
- !ruby/object:Gem::Version
|
|
46
|
+
version: '0'
|
|
47
|
+
type: :development
|
|
48
|
+
prerelease: false
|
|
49
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
50
|
+
requirements:
|
|
51
|
+
- - ">="
|
|
52
|
+
- !ruby/object:Gem::Version
|
|
53
|
+
version: '0'
|
|
54
|
+
- !ruby/object:Gem::Dependency
|
|
55
|
+
name: simplecov
|
|
56
|
+
requirement: !ruby/object:Gem::Requirement
|
|
57
|
+
requirements:
|
|
58
|
+
- - ">="
|
|
59
|
+
- !ruby/object:Gem::Version
|
|
60
|
+
version: '0.22'
|
|
61
|
+
type: :development
|
|
62
|
+
prerelease: false
|
|
63
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
64
|
+
requirements:
|
|
65
|
+
- - ">="
|
|
66
|
+
- !ruby/object:Gem::Version
|
|
67
|
+
version: '0.22'
|
|
68
|
+
description: Semantic IRI/URI/URL/URN parsing, normalization, classification, and
|
|
69
|
+
clustering.
|
|
70
|
+
executables:
|
|
71
|
+
- iriq
|
|
72
|
+
extensions: []
|
|
73
|
+
extra_rdoc_files: []
|
|
74
|
+
files:
|
|
75
|
+
- CHANGELOG.md
|
|
76
|
+
- Gemfile
|
|
77
|
+
- Gemfile.lock
|
|
78
|
+
- LICENSE.txt
|
|
79
|
+
- README.md
|
|
80
|
+
- exe/iriq
|
|
81
|
+
- iriq.gemspec
|
|
82
|
+
- lib/iriq.rb
|
|
83
|
+
- lib/iriq/cli.rb
|
|
84
|
+
- lib/iriq/cluster.rb
|
|
85
|
+
- lib/iriq/clusterer.rb
|
|
86
|
+
- lib/iriq/errors.rb
|
|
87
|
+
- lib/iriq/explanation.rb
|
|
88
|
+
- lib/iriq/identifier.rb
|
|
89
|
+
- lib/iriq/normalizer.rb
|
|
90
|
+
- lib/iriq/parser.rb
|
|
91
|
+
- lib/iriq/path_shape.rb
|
|
92
|
+
- lib/iriq/segment_classifier.rb
|
|
93
|
+
- lib/iriq/version.rb
|
|
94
|
+
homepage: https://github.com/dpep/iriq
|
|
95
|
+
licenses:
|
|
96
|
+
- MIT
|
|
97
|
+
metadata: {}
|
|
98
|
+
rdoc_options: []
|
|
99
|
+
require_paths:
|
|
100
|
+
- lib
|
|
101
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
102
|
+
requirements:
|
|
103
|
+
- - ">="
|
|
104
|
+
- !ruby/object:Gem::Version
|
|
105
|
+
version: '3.2'
|
|
106
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
107
|
+
requirements:
|
|
108
|
+
- - ">="
|
|
109
|
+
- !ruby/object:Gem::Version
|
|
110
|
+
version: '0'
|
|
111
|
+
requirements: []
|
|
112
|
+
rubygems_version: 3.6.9
|
|
113
|
+
specification_version: 4
|
|
114
|
+
summary: Semantic IRI normalization and clustering.
|
|
115
|
+
test_files: []
|