iriq 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: d1f6ebe248ed57192adc8f5c9600dfe14d9d0c85160dd5d6588d7fbfd7995e72
4
+ data.tar.gz: d80356a646effb078ebe78b62417534746ded550fd4145faa53998aa35866bde
5
+ SHA512:
6
+ metadata.gz: d355eef90433cef5cd807d9c68d6259ad49d79dd87d842770fded377a535ed39e4d24228594b24ce96015f8c9e7246bdc96d7bcfa58f3cdf3350bb389bff211f
7
+ data.tar.gz: b0e1ddf1c6bcebadbe2578fe0760076a8401fd4fc2cd9a5f345b651371cf2fb47ebaf302565df61b05a389e0253f3febdbccc4e2389cf7d2e810abc8e4156a40
data/CHANGELOG.md ADDED
@@ -0,0 +1,2 @@
1
+ ### 0.0.1 (2026-05-24)
2
+ - prototype
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source "https://rubygems.org"
2
+
3
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,97 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ iriq (0.0.1)
5
+
6
+ GEM
7
+ remote: https://rubygems.org/
8
+ specs:
9
+ date (3.5.1)
10
+ debug (1.11.1)
11
+ irb (~> 1.10)
12
+ reline (>= 0.3.8)
13
+ diff-lcs (1.6.2)
14
+ docile (1.4.1)
15
+ erb (6.0.4)
16
+ io-console (0.8.2)
17
+ irb (1.17.0)
18
+ pp (>= 0.6.0)
19
+ prism (>= 1.3.0)
20
+ rdoc (>= 4.0.0)
21
+ reline (>= 0.4.2)
22
+ pp (0.6.3)
23
+ prettyprint
24
+ prettyprint (0.2.0)
25
+ prism (1.9.0)
26
+ psych (5.3.1)
27
+ date
28
+ stringio
29
+ rdoc (7.2.0)
30
+ erb
31
+ psych (>= 4.0.0)
32
+ tsort
33
+ reline (0.6.3)
34
+ io-console (~> 0.5)
35
+ rspec (3.13.2)
36
+ rspec-core (~> 3.13.0)
37
+ rspec-expectations (~> 3.13.0)
38
+ rspec-mocks (~> 3.13.0)
39
+ rspec-core (3.13.6)
40
+ rspec-support (~> 3.13.0)
41
+ rspec-debugging (0.0.4)
42
+ rspec-expectations (>= 3)
43
+ rspec-expectations (3.13.5)
44
+ diff-lcs (>= 1.2.0, < 2.0)
45
+ rspec-support (~> 3.13.0)
46
+ rspec-mocks (3.13.8)
47
+ diff-lcs (>= 1.2.0, < 2.0)
48
+ rspec-support (~> 3.13.0)
49
+ rspec-support (3.13.7)
50
+ simplecov (0.22.0)
51
+ docile (~> 1.1)
52
+ simplecov-html (~> 0.11)
53
+ simplecov_json_formatter (~> 0.1)
54
+ simplecov-html (0.13.2)
55
+ simplecov_json_formatter (0.1.4)
56
+ stringio (3.2.0)
57
+ tsort (0.2.0)
58
+
59
+ PLATFORMS
60
+ ruby
61
+
62
+ DEPENDENCIES
63
+ debug (>= 1)
64
+ iriq!
65
+ rspec (>= 3.10)
66
+ rspec-debugging
67
+ simplecov (>= 0.22)
68
+
69
+ CHECKSUMS
70
+ date (3.5.1) sha256=750d06384d7b9c15d562c76291407d89e368dda4d4fff957eb94962d325a0dc0
71
+ debug (1.11.1) sha256=2e0b0ac6119f2207a6f8ac7d4a73ca8eb4e440f64da0a3136c30343146e952b6
72
+ diff-lcs (1.6.2) sha256=9ae0d2cba7d4df3075fe8cd8602a8604993efc0dfa934cff568969efb1909962
73
+ docile (1.4.1) sha256=96159be799bfa73cdb721b840e9802126e4e03dfc26863db73647204c727f21e
74
+ erb (6.0.4) sha256=38e3803694be357fe2bfe312487c74beaf9fb4e5beb3e22498952fe1645b95d9
75
+ io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
76
+ irb (1.17.0) sha256=168c4ddb93d8a361a045c41d92b2952c7a118fa73f23fe14e55609eb7a863aae
77
+ iriq (0.0.1)
78
+ pp (0.6.3) sha256=2951d514450b93ccfeb1df7d021cae0da16e0a7f95ee1e2273719669d0ab9df6
79
+ prettyprint (0.2.0) sha256=2bc9e15581a94742064a3cc8b0fb9d45aae3d03a1baa6ef80922627a0766f193
80
+ prism (1.9.0) sha256=7b530c6a9f92c24300014919c9dcbc055bf4cdf51ec30aed099b06cd6674ef85
81
+ psych (5.3.1) sha256=eb7a57cef10c9d70173ff74e739d843ac3b2c019a003de48447b2963d81b1974
82
+ rdoc (7.2.0) sha256=8650f76cd4009c3b54955eb5d7e3a075c60a57276766ebf36f9085e8c9f23192
83
+ reline (0.6.3) sha256=1198b04973565b36ec0f11542ab3f5cfeeec34823f4e54cebde90968092b1835
84
+ rspec (3.13.2) sha256=206284a08ad798e61f86d7ca3e376718d52c0bc944626b2349266f239f820587
85
+ rspec-core (3.13.6) sha256=a8823c6411667b60a8bca135364351dda34cd55e44ff94c4be4633b37d828b2d
86
+ rspec-debugging (0.0.4) sha256=7a8e2dc240c140f0ed27b452a5661a56474ee8cf7b84c5bcbefd827ad36f0a6f
87
+ rspec-expectations (3.13.5) sha256=33a4d3a1d95060aea4c94e9f237030a8f9eae5615e9bd85718fe3a09e4b58836
88
+ rspec-mocks (3.13.8) sha256=086ad3d3d17533f4237643de0b5c42f04b66348c28bf6b9c2d3f4a3b01af1d47
89
+ rspec-support (3.13.7) sha256=0640e5570872aafefd79867901deeeeb40b0c9875a36b983d85f54fb7381c47c
90
+ simplecov (0.22.0) sha256=fe2622c7834ff23b98066bb0a854284b2729a569ac659f82621fc22ef36213a5
91
+ simplecov-html (0.13.2) sha256=bd0b8e54e7c2d7685927e8d6286466359b6f16b18cb0df47b508e8d73c777246
92
+ simplecov_json_formatter (0.1.4) sha256=529418fbe8de1713ac2b2d612aa3daa56d316975d307244399fa4838c601b428
93
+ stringio (3.2.0) sha256=c37cb2e58b4ffbd33fe5cd948c05934af997b36e0b6ca6fdf43afa234cf222e1
94
+ tsort (0.2.0) sha256=9650a793f6859a43b6641671278f79cfead60ac714148aabe4e3f0060480089f
95
+
96
+ BUNDLED WITH
97
+ 4.0.9
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2026 Daniel Pepper
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,171 @@
1
+ Iriq
2
+ ======
3
+ ![Gem](https://img.shields.io/gem/dt/iriq?style=plastic)
4
+ [![codecov](https://codecov.io/gh/dpep/iriq/branch/main/graph/badge.svg)](https://codecov.io/gh/dpep/iriq)
5
+
6
+ Semantic IRI / URI / URL / URN normalization and clustering for Ruby.
7
+
8
+ Iriq parses resource identifiers, normalizes them into canonical IRI-like
9
+ forms, classifies path and query components, clusters similar identifiers,
10
+ and explains which parts are stable vs. unique.
11
+
12
+ ```ruby
13
+ require "iriq"
14
+ ```
15
+
16
+ ## Quick start
17
+
18
+ ```ruby
19
+ iri = Iriq.parse("https://foo.com/users/123")
20
+ iri.scheme # => "https"
21
+ iri.host # => "foo.com"
22
+ iri.path_segments # => ["users", "123"]
23
+ iri.canonical # => "https://foo.com/users/123"
24
+
25
+ Iriq.normalize("https://foo.com/users/123")
26
+ # => "https://foo.com/users/{integer_id}"
27
+
28
+ Iriq.explain("https://foo.com/users/123/orders/456")
29
+ # => [
30
+ # { value: "users", type: :literal, variable: false },
31
+ # { value: "123", type: :integer_id, variable: true },
32
+ # { value: "orders", type: :literal, variable: false },
33
+ # { value: "456", type: :integer_id, variable: true },
34
+ # ]
35
+ ```
36
+
37
+ ## Supported inputs
38
+
39
+ | Input | Notes |
40
+ | ------------------------------------ | ------------------------------------------------ |
41
+ | `https://foo.com/users/123` | Standard URL |
42
+ | `foo.com/users/456` | Scheme-less; `https://` is assumed |
43
+ | `urn:isbn:0451450523` | URN — `scheme` and `nss` are populated |
44
+ | `https://例え.テスト/こんにちは` | Unicode IRI — display form preserved |
45
+ | `HTTPS://Foo.com:443/A` | Scheme + host lowercased; default port dropped |
46
+ | `https://foo.com/a/./b/../c` | Dot segments normalized |
47
+
48
+ ## Segment classification
49
+
50
+ `Iriq::SegmentClassifier` returns one of:
51
+
52
+ - `:literal` — plain word (`users`, `orders`, `Profile`, `こんにちは`)
53
+ - `:integer_id` — pure digits that fall outside the UNIX-timestamp ranges (`1`, `123`, `42`)
54
+ - `:uuid` — `f47ac10b-58cc-4372-a567-0e02b2c3d479`
55
+ - `:date` — `2024-05-23`
56
+ - `:timestamp` — ISO 8601, or 10/13-digit UNIX epoch
57
+ - `:hash` — 32+ hex chars (md5 / sha)
58
+ - `:slug` — `my-cool-post`, `my_cool_post`
59
+ - `:opaque_id` — short alphanumeric mix that doesn't fit elsewhere
60
+
61
+ Heuristics are deterministic and ordered — the first matching rule wins.
62
+
63
+ ## Clustering
64
+
65
+ ```ruby
66
+ clusterer = Iriq::Clusterer.new
67
+ clusterer.add("https://foo.com/users/123")
68
+ clusterer.add("https://foo.com/users/456")
69
+ clusterer.add("https://foo.com/users/789/orders/1")
70
+
71
+ clusterer.clusters.map(&:shape)
72
+ # => ["/users/{integer_id}", "/users/{integer_id}/orders/{integer_id}"]
73
+
74
+ clusterer.clusters.first.segment_stats
75
+ # => [
76
+ # { position: 0, stable: true, values: { "users" => 2 } },
77
+ # { position: 1, stable: false, values: { "123" => 1, "456" => 1 } },
78
+ # ]
79
+
80
+ clusterer.explain("https://foo.com/users/999")
81
+ # => [
82
+ # { value: "users", type: :literal, variable: false, stable: true },
83
+ # { value: "999", type: :integer_id, variable: true, stable: false },
84
+ # ]
85
+ ```
86
+
87
+ The clusterer combines classifier output with what it has actually observed:
88
+ a position the classifier *would* call variable but that is empirically
89
+ constant across all members of the cluster will be reported with
90
+ `stable: true, variable: false`.
91
+
92
+ ## Object model
93
+
94
+ | Class | Responsibility |
95
+ | --------------------------- | ---------------------------------------------------- |
96
+ | `Iriq::Parser` | String → `Identifier` |
97
+ | `Iriq::Identifier` | Structured fields + `canonical` reconstruction |
98
+ | `Iriq::SegmentClassifier` | Single segment → type symbol |
99
+ | `Iriq::PathShape` | Segments → `/users/{integer_id}` route shape |
100
+ | `Iriq::Normalizer` | Identifier → canonical, shape-aware string |
101
+ | `Iriq::Explanation` | Per-segment `{value, type, variable}` annotations |
102
+ | `Iriq::Cluster` | One host + shape group, with examples & stats |
103
+ | `Iriq::Clusterer` | Many identifiers → `Cluster` set + explain |
104
+
105
+ ## CLI
106
+
107
+ Installing the gem also installs an `iriq` executable.
108
+
109
+ ```
110
+ $ iriq parse https://foo.com/users/123
111
+ original: https://foo.com/users/123
112
+ kind: url
113
+ scheme: https
114
+ host: foo.com
115
+ path_segments: ["users", "123"]
116
+ canonical: https://foo.com/users/123
117
+
118
+ $ iriq normalize foo.com/posts/2024-05-23/hello-world
119
+ https://foo.com/posts/{date}/{slug}
120
+
121
+ $ iriq explain https://foo.com/users/123/orders/456
122
+ literal users
123
+ * integer_id 123
124
+ literal orders
125
+ * integer_id 456
126
+
127
+ $ iriq classify f47ac10b-58cc-4372-a567-0e02b2c3d479
128
+ uuid
129
+
130
+ $ cat urls.txt | iriq cluster
131
+ [2] foo.com /users/{integer_id}
132
+ https://foo.com/users/1
133
+ https://foo.com/users/2
134
+ [1] foo.com /posts/{slug}/edit
135
+ https://foo.com/posts/abc-123/edit
136
+ ```
137
+
138
+ Add `--json` to any command for machine-readable output. `iriq cluster` reads
139
+ identifiers (one per line) from a file argument or stdin; lines that fail to
140
+ parse are skipped with a warning on stderr.
141
+
142
+ Exit codes: `0` success, `1` usage error, `2` parse error.
143
+
144
+ ## Limitations (intentional)
145
+
146
+ This is an MVP. Iriq does **not**:
147
+
148
+ - Implement RFC 3986, RFC 3987, or the WHATWG URL standard fully.
149
+ - Convert between Unicode (IRI) and punycode (URI) — the display form is
150
+ preserved as-is.
151
+ - Percent-encode or decode path/query bytes. Bytes are kept as written.
152
+ - Validate scheme-specific structure beyond URL vs. URN.
153
+ - Resolve relative references against a base URL.
154
+ - Round-trip `canonical` back to the exact original byte-for-byte (whitespace
155
+ is stripped, default ports are dropped, dot segments are collapsed).
156
+
157
+ For richer IRI handling, see `addressable`. Iriq's focus is the analysis
158
+ side: classification, normalization, and clustering — not a complete URL
159
+ implementation.
160
+
161
+ ----
162
+ ## Contributing
163
+
164
+ Yes please :)
165
+
166
+ 1. Fork it
167
+ 1. Create your feature branch (`git checkout -b my-feature`)
168
+ 1. Ensure the tests pass (`bundle exec rspec`)
169
+ 1. Commit your changes (`git commit -am 'awesome new feature'`)
170
+ 1. Push your branch (`git push origin my-feature`)
171
+ 1. Create a Pull Request
data/exe/iriq ADDED
@@ -0,0 +1,4 @@
1
#!/usr/bin/env ruby
# Command-line entry point: hands ARGV to Iriq::CLI and exits the process
# with whatever status #run returns.
require "iriq"

status = Iriq::CLI.new.run(ARGV)
exit status
data/iriq.gemspec ADDED
@@ -0,0 +1,21 @@
1
+ require_relative "lib/iriq/version"
2
+
3
# Gem metadata for iriq. The packaged file list is taken from `git ls-files`
# with the `:!:spec` pathspec applied.
Gem::Specification.new do |spec|
  spec.name        = "iriq"
  spec.version     = Iriq::VERSION
  spec.summary     = "Semantic IRI normalization and clustering."
  spec.description = "Semantic IRI/URI/URL/URN parsing, normalization, classification, and clustering."
  spec.authors     = ["Daniel Pepper"]
  spec.homepage    = "https://github.com/dpep/iriq"
  spec.license     = "MIT"

  spec.required_ruby_version = ">= 3.2"

  # The `iriq` executable lives in exe/.
  spec.bindir      = "exe"
  spec.executables = ["iriq"]
  spec.files       = `git ls-files * ':!:spec'`.split("\n")

  # Development-only tooling.
  spec.add_development_dependency "debug", ">= 1"
  spec.add_development_dependency "rspec", ">= 3.10"
  spec.add_development_dependency "rspec-debugging"
  spec.add_development_dependency "simplecov", ">= 0.22"
end
data/lib/iriq/cli.rb ADDED
@@ -0,0 +1,205 @@
1
+ require "json"
2
+ require "optparse"
3
+
4
module Iriq
  # Tiny CLI wrapper around the public API. Construct with explicit IO so
  # specs can run it without shelling out.
  #
  # #run returns the process exit code: 0 on success, 1 for usage errors
  # (unknown command, bad option, missing argument, unreadable input file),
  # 2 when an identifier fails to parse.
  class CLI
    COMMANDS = %w[parse normalize explain classify cluster help version].freeze

    # The cluster example uses printf rather than echo: bash's builtin echo
    # does not expand "\n", so the echo form would feed a single bogus line.
    USAGE = <<~TXT
      Usage: iriq <command> [options] [args]

      Commands:
        parse <input>        Parse an identifier and print its fields
        normalize <input>    Print the shape-normalized form
        explain <input>      Annotate each path segment
        classify <segment>   Classify a single segment
        cluster [file]       Cluster identifiers from FILE or stdin (one per line)
        help                 Show this message
        version              Print version

      Options:
        -j, --json           Emit JSON instead of human-readable output
        -h, --help           Show this message

      Examples:
        iriq parse https://foo.com/users/123
        iriq normalize foo.com/users/456
        printf "https://foo.com/users/1\\nhttps://foo.com/users/2\\n" | iriq cluster
    TXT

    attr_reader :stdin, :stdout, :stderr

    # stdin/stdout/stderr default to the process streams; pass StringIO (or
    # any IO-like object) to capture output in tests.
    def initialize(stdin: $stdin, stdout: $stdout, stderr: $stderr)
      @stdin = stdin
      @stdout = stdout
      @stderr = stderr
    end

    # Parses options, dispatches to the matching cmd_* method and returns an
    # integer exit code. Never raises for option, parse or file errors; those
    # are reported on stderr and mapped to an exit code instead.
    def run(argv)
      args, opts = parse_options(argv)

      cmd = args.shift
      return print_usage(stdout, 0) if cmd.nil? || cmd == "help" || opts[:help]

      unless COMMANDS.include?(cmd)
        stderr.puts "iriq: unknown command #{cmd.inspect}"
        return print_usage(stderr, 1)
      end

      # require_arg! aborts a command with `throw :missing_arg, <code>`;
      # catching it here turns the thrown value into the exit code. `cmd` is
      # whitelisted above, so the dynamic dispatch cannot reach other methods.
      catch(:missing_arg) { send("cmd_#{cmd}", args, opts) }
    rescue OptionParser::ParseError => e
      stderr.puts "iriq: #{e.message}"
      1
    rescue SystemCallError => e
      # e.g. Errno::ENOENT / Errno::EACCES when `cluster <file>` can't be read.
      stderr.puts "iriq: #{e.message}"
      1
    rescue Iriq::ParseError => e
      stderr.puts "iriq: parse error: #{e.message}"
      2
    end

    private

    # Returns [positional_args, opts]. Raises OptionParser::ParseError on an
    # unknown flag (handled in #run). Does not mutate argv.
    def parse_options(argv)
      opts = { json: false, help: false }
      parser = OptionParser.new do |o|
        o.on("-j", "--json") { opts[:json] = true }
        o.on("-h", "--help") { opts[:help] = true }
      end
      args = parser.parse(argv)
      [args, opts]
    end

    # Writes the usage text to +io+ and returns +code+ so callers can
    # `return print_usage(...)`.
    def print_usage(io, code)
      io.puts USAGE
      code
    end

    # Returns the first positional argument. When it is absent, reports the
    # problem on stderr and unwinds to #run, which returns exit code 1.
    def require_arg!(args, name)
      value = args.first
      return value if value

      throw :missing_arg, missing(name)
    end

    def cmd_version(_args, _opts)
      stdout.puts Iriq::VERSION
      0
    end

    def cmd_parse(args, opts)
      input = require_arg!(args, :input)
      emit_parse(Iriq.parse(input), opts)
      0
    end

    def cmd_normalize(args, opts)
      input = require_arg!(args, :input)
      out = Iriq.normalize(input)
      opts[:json] ? stdout.puts(JSON.generate(normalized: out)) : stdout.puts(out)
      0
    end

    def cmd_explain(args, opts)
      input = require_arg!(args, :input)
      rows = Iriq.explain(input)
      if opts[:json]
        stdout.puts JSON.generate(rows)
      else
        rows.each do |r|
          # "*" flags segments reported as variable.
          mark = r[:variable] ? "*" : " "
          stdout.printf("%s %-12s %s\n", mark, r[:type], r[:value])
        end
      end
      0
    end

    def cmd_classify(args, opts)
      seg = require_arg!(args, :segment)
      type = SegmentClassifier.new.classify(seg)
      opts[:json] ? stdout.puts(JSON.generate(value: seg, type: type)) : stdout.puts(type)
      0
    end

    # Reads identifiers (one per line) from the file argument or stdin.
    # Blank lines are ignored; unparseable lines are skipped with a warning
    # on stderr rather than aborting the whole run.
    def cmd_cluster(args, opts)
      lines = read_input(args.first)
      clusterer = Clusterer.new
      lines.each do |line|
        line = line.strip
        next if line.empty?

        begin
          clusterer.add(line)
        rescue Iriq::ParseError => e
          stderr.puts "iriq: skipped #{line.inspect}: #{e.message}"
        end
      end
      emit_clusters(clusterer.clusters, opts)
      0
    end

    def cmd_help(_args, _opts)
      print_usage(stdout, 0)
    end

    # Reports a missing positional argument and returns the usage-error code.
    def missing(name)
      stderr.puts "iriq: missing argument <#{name}>"
      1
    end

    # nil or "-" means stdin; anything else is treated as a file path.
    # File errors (SystemCallError) propagate to #run.
    def read_input(path)
      if path.nil? || path == "-"
        stdin.read.lines
      else
        File.readlines(path)
      end
    end

    def emit_parse(iri, opts)
      if opts[:json]
        stdout.puts JSON.generate(
          original: iri.original,
          kind: iri.kind,
          scheme: iri.scheme,
          host: iri.host,
          port: iri.port,
          path_segments: iri.path_segments,
          query_params: iri.query_params,
          fragment: iri.fragment,
          nss: iri.nss,
          canonical: iri.canonical,
        )
      else
        # Human-readable form omits fields that are unset or empty.
        stdout.puts "original: #{iri.original}"
        stdout.puts "kind: #{iri.kind}"
        stdout.puts "scheme: #{iri.scheme}" if iri.scheme
        stdout.puts "host: #{iri.host}" if iri.host
        stdout.puts "port: #{iri.port}" if iri.port
        stdout.puts "path_segments: #{iri.path_segments.inspect}" if iri.url?
        unless iri.query_params.empty?
          stdout.puts "query_params: #{iri.query_params.inspect}"
        end
        stdout.puts "fragment: #{iri.fragment}" if iri.fragment
        stdout.puts "nss: #{iri.nss}" if iri.nss
        stdout.puts "canonical: #{iri.canonical}"
      end
    end

    # Prints clusters largest-first. Enumerable#sort_by is not stable, so the
    # original insertion index is used as a tie-breaker to keep the order of
    # equal-sized clusters deterministic.
    def emit_clusters(clusters, opts)
      sorted = clusters.each_with_index.sort_by { |c, i| [-c.count, i] }.map(&:first)

      if opts[:json]
        stdout.puts JSON.generate(sorted.map(&:to_h))
      else
        sorted.each do |c|
          host = c.host || "(urn)"
          stdout.puts "[#{c.count}] #{host} #{c.shape}"
          c.examples.first(3).each { |e| stdout.puts " #{e.canonical}" }
          stdout.puts " + #{c.count - 3} more" if c.count > 3
        end
      end
    end
  end
end
@@ -0,0 +1,58 @@
1
module Iriq
  # Accumulates the identifiers that share one cluster key. Alongside a
  # running count it keeps up to MAX_EXAMPLES sample identifiers and, for each
  # path position, a tally of the segment values seen there, so callers can
  # tell which positions were constant in practice.
  class Cluster
    MAX_EXAMPLES = 10

    attr_reader :key, :host, :scheme, :shape, :examples, :count

    def initialize(key:, host:, scheme:, shape:)
      @key, @host, @scheme, @shape = key, host, scheme, shape
      @examples = []
      @count = 0
      @segment_counts = [] # index = path position, value = { segment => occurrences }
    end

    # Records one more member. Only the first MAX_EXAMPLES identifiers are
    # retained as examples; every member contributes to the count and tallies.
    def add(identifier)
      @count += 1
      @examples.push(identifier) unless @examples.size >= MAX_EXAMPLES

      identifier.path_segments.each_with_index do |segment, position|
        tally = (@segment_counts[position] ||= Hash.new(0))
        tally[segment] += 1
      end
    end

    # One hash per path position, e.g.
    #   { position: 1, stable: false, values: { "1" => 1, "2" => 1 } }
    # `stable` is true when exactly one distinct value was seen at that
    # position. `values` is a copy, so callers cannot mutate the tallies.
    def segment_stats
      stats = []
      @segment_counts.each_with_index do |tally, position|
        stats << { position: position, stable: tally.size == 1, values: tally.dup }
      end
      stats
    end

    # Plain-hash form; examples are rendered via their canonical strings.
    def to_h
      canonical_examples = examples.map { |example| example.canonical }
      {
        key: key,
        host: host,
        scheme: scheme,
        shape: shape,
        count: count,
        examples: canonical_examples,
        segments: segment_stats,
      }
    end
  end
end
@@ -0,0 +1,77 @@
1
module Iriq
  # Groups many identifiers by host + path shape. Use `add` to feed inputs and
  # `clusters` to read out the groups. `explain` annotates a single identifier
  # against the cluster it would fall into, including which positions are
  # stable across all observed members.
  class Clusterer
    def initialize(classifier: SegmentClassifier.new)
      @classifier = classifier
      @clusters = {} # cluster key => Cluster
    end

    # Parses +input+ (String or Identifier), files it under its cluster
    # (creating the cluster on first sight) and returns that Cluster.
    # Raises ParseError when a String input cannot be parsed.
    def add(input)
      iri = coerce(input)
      key, host, scheme, shape = cluster_key(iri)
      cluster = @clusters[key] ||= Cluster.new(
        key: key,
        host: host,
        scheme: scheme,
        shape: shape,
      )
      cluster.add(iri)
      cluster
    end

    def clusters
      @clusters.values
    end

    # Number of distinct clusters (not the number of identifiers added).
    def size
      @clusters.size
    end

    # Returns a per-segment explanation for the input, merging classifier
    # output with what we've observed in its cluster (i.e. positions that
    # are factually stable get marked variable: false even if classifier
    # would otherwise call them variable).
    #
    # A position only counts as stable when the cluster saw exactly one value
    # there AND the input carries that same value. An input whose value was
    # never observed is itself evidence that the position varies, so it must
    # not be reported as stable.
    def explain(input)
      iri = coerce(input)
      key, * = cluster_key(iri)
      cluster = @clusters[key]
      stats = cluster ? cluster.segment_stats : []

      iri.path_segments.each_with_index.map do |seg, i|
        type = @classifier.classify(seg)
        stat = stats[i]
        stable = !!(stat && stat[:stable] && stat[:values].key?(seg))
        {
          value: seg,
          type: type,
          variable: !stable && @classifier.variable?(type),
          stable: stable,
        }
      end
    end

    private

    def coerce(input)
      input.is_a?(Identifier) ? input : Parser.parse(input)
    end

    # Returns [key, host, scheme, shape] for an identifier.
    #
    # Opaque identifiers (kind :urn) are keyed as "<scheme>:<ns>:<shaped value>".
    # The identifier's real scheme is used, so `mailto:x` and `urn:x` do not
    # land in the same cluster; it falls back to "urn" only when no scheme is
    # set. When the NSS has no ":" the key is "<scheme>:<nss>" with no
    # trailing colon.
    def cluster_key(iri)
      if iri.urn?
        prefix = iri.scheme || "urn"
        ns, value = (iri.nss || "").split(":", 2)
        shaped = if value
          type = @classifier.classify(value)
          @classifier.variable?(type) ? "{#{type}}" : value
        end
        key = [prefix, ns, shaped].compact.join(":")
        [key, nil, prefix, key]
      else
        shape = PathShape.new(classifier: @classifier).for(iri.path_segments)
        key = "#{iri.scheme}://#{iri.host}#{shape}"
        [key, iri.host, iri.scheme, shape]
      end
    end
  end
end
@@ -0,0 +1,4 @@
1
module Iriq
  # Base class for the errors defined by this library.
  class Error < StandardError
  end

  # Raised when an input cannot be parsed into an identifier.
  class ParseError < Error
  end
end
@@ -0,0 +1,45 @@
1
module Iriq
  # Annotates each component of one identifier with its classified type and
  # whether that type counts as variable.
  #
  #   Explanation.explain("https://foo.com/users/123")
  #   # => [
  #   #      { value: "users", type: :literal,    variable: false },
  #   #      { value: "123",   type: :integer_id, variable: true },
  #   #    ]
  module Explanation
    module_function

    # Accepts a String (parsed first) or an already-parsed Identifier.
    # URL-kind identifiers yield one entry per path segment; URN-kind ones
    # are handled by explain_urn.
    def explain(input, classifier: SegmentClassifier.new)
      iri = input.is_a?(Identifier) ? input : Parser.parse(input)
      return explain_urn(iri, classifier) if iri.urn?

      iri.path_segments.map { |segment| segment_entry(segment, classifier) }
    end

    # Builds the { value:, type:, variable: } hash for a single segment.
    def segment_entry(segment, classifier)
      kind = classifier.classify(segment)
      { value: segment, type: kind, variable: classifier.variable?(kind) }
    end

    # "ns:value" yields a literal entry for the namespace followed by a
    # classified entry for the value; an NSS without ":" is classified as a
    # whole; a missing NSS yields [].
    def explain_urn(iri, classifier)
      return [] unless iri.nss
      return [segment_entry(iri.nss, classifier)] unless iri.nss.include?(":")

      namespace, value = iri.nss.split(":", 2)
      namespace_entry = { value: namespace, type: :literal, variable: false }
      [namespace_entry, segment_entry(value, classifier)]
    end
  end
end
@@ -0,0 +1,64 @@
1
module Iriq
  # Parsed identifier. Stores the original input alongside the structured fields
  # extracted by the parser.
  #
  # For opaque inputs (kind :urn) only `scheme` and `nss` (the Namespace
  # Specific String) are populated; host/path are nil. That covers real URNs
  # (`urn:isbn:0451450523`) as well as other scheme-without-authority forms
  # such as `mailto:foo@bar`, which the parser also tags as :urn.
  class Identifier
    attr_reader :original, :scheme, :host, :port, :path,
                :path_segments, :query, :query_params, :fragment,
                :nss, :kind

    def initialize(original:, scheme: nil, host: nil, port: nil, path: nil,
                   path_segments: [], query: nil, query_params: {},
                   fragment: nil, nss: nil, kind: :url)
      @original = original
      @scheme = scheme
      @host = host
      @port = port
      @path = path
      @path_segments = path_segments
      @query = query
      @query_params = query_params
      @fragment = fragment
      @nss = nss
      @kind = kind
    end

    def urn?
      kind == :urn
    end

    def url?
      kind == :url
    end

    # Rebuild a canonical IRI-like string from the parsed fields. Preserves
    # Unicode display form (no punycode / percent-encoding pass).
    #
    # Opaque identifiers are rendered as "<scheme>:<nss>" using their own
    # scheme, so `mailto:foo@bar` stays `mailto:foo@bar` instead of being
    # rewritten to `urn:foo@bar`. "urn" is used only when no scheme is set.
    #
    # For URLs, empty query/fragment strings are omitted, and a path is only
    # emitted when there is at least one segment.
    def canonical
      return "#{scheme || 'urn'}:#{nss}" if urn?

      out = +""
      out << "#{scheme}://" if scheme
      out << host if host
      out << ":#{port}" if port
      out << "/" + path_segments.join("/") if path_segments.any?
      out << "?#{query}" if query && !query.empty?
      out << "##{fragment}" if fragment && !fragment.empty?
      out
    end

    alias to_s canonical

    # Identifiers are equal when their canonical strings match; `eql?` and
    # `hash` follow the same rule so instances work as Hash keys.
    def ==(other)
      other.is_a?(Identifier) && other.canonical == canonical
    end
    alias eql? ==

    def hash
      canonical.hash
    end
  end
end
@@ -0,0 +1,49 @@
1
module Iriq
  # Renders an identifier as a shape-aware string in which variable parts are
  # replaced by `{type}` placeholders.
  #
  #   Normalizer.normalize("https://Foo.com:443/users/123")
  #   # => "https://foo.com/users/{integer_id}"
  #
  # The result is meant for grouping and diffing, not for round-tripping.
  module Normalizer
    module_function

    # Accepts a String (parsed first) or an Identifier.
    def normalize(input, classifier: SegmentClassifier.new)
      identifier = input.is_a?(Identifier) ? input : Parser.parse(input)
      normalize_identifier(identifier, classifier: classifier)
    end

    def normalize_identifier(iri, classifier: SegmentClassifier.new)
      if iri.urn?
        # Only real "urn:<ns>:<value>" inputs get a shaped value, e.g.
        # urn:isbn:0451450523 -> urn:isbn:{integer_id}. Anything else falls
        # back to the identifier's canonical string.
        namespaced = iri.scheme == "urn" && iri.nss && iri.nss.include?(":")
        return iri.canonical unless namespaced

        namespace, value = iri.nss.split(":", 2)
        kind = classifier.classify(value)
        placeholder = classifier.variable?(kind) ? "{#{kind}}" : value
        return "urn:#{namespace}:#{placeholder}"
      end

      buffer = +""
      buffer << "#{iri.scheme}://" if iri.scheme
      buffer << iri.host if iri.host
      buffer << ":#{iri.port}" if iri.port
      buffer << PathShape.new(classifier: classifier).for(iri.path_segments)

      params = iri.query_params
      buffer << "?" + shape_query(params, classifier) unless params.nil? || params.empty?
      buffer
    end

    # Serializes query params sorted by key, with variable values replaced by
    # `{type}` placeholders.
    def shape_query(params, classifier)
      pairs = params.keys.sort.map do |name|
        raw = params[name]
        kind = classifier.classify(raw.to_s)
        rendered = classifier.variable?(kind) ? "{#{kind}}" : raw
        "#{name}=#{rendered}"
      end
      pairs.join("&")
    end
  end
end
@@ -0,0 +1,141 @@
1
module Iriq
  # Lightweight, Unicode-aware parser for URL/IRI/URN inputs.
  #
  # Intentionally NOT a full RFC 3986 / 3987 / WHATWG URL implementation. We
  # accept enough of the common shapes (URLs, scheme-less hosts, URNs, raw
  # Unicode hosts and paths) to support normalization and clustering.
  module Parser
    SCHEME_RE = /\A([a-zA-Z][a-zA-Z0-9+\-.]*):/.freeze

    # Matches a host-ish first token before the first slash. We deliberately
    # allow any non-ASCII character so IRIs work without punycode.
    HOSTISH_RE = %r{
      \A
      (?<host>[^/?#\s:]+\.[^/?#\s:]+|localhost)   # something.something or localhost
      (?::(?<port>\d+))?
      (?<rest>[/?#].*)?
      \z
    }x.freeze

    # Looser fallback for authorities HOSTISH_RE rejects (e.g. a dot-less host
    # after an explicit "scheme://").
    AUTHORITY_FALLBACK_RE = %r{\A(?<host>[^/?#]+?)(?::(?<port>\d+))?(?<rest>[/?#].*)?\z}.freeze

    DEFAULT_PORTS = {
      "http" => 80,
      "https" => 443,
      "ftp" => 21,
      "ws" => 80,
      "wss" => 443,
    }.freeze

    module_function

    # Parses +input+ into an Identifier.
    #
    # - "scheme://..."            -> URL with that scheme
    # - "urn:..."                 -> URN (scheme + nss)
    # - "scheme:rest" (no "//")   -> opaque identifier, kind :urn
    # - "host[:port][/...]"       -> URL, https assumed
    #
    # Raises ParseError for nil, non-String, blank, or unrecognizable input.
    # `original` on the result is the input exactly as given (unstripped).
    def parse(input)
      raise ParseError, "input is nil" if input.nil?
      raise ParseError, "input must be a String" unless input.is_a?(String)

      stripped = input.strip
      raise ParseError, "input is empty" if stripped.empty?

      scheme_match = stripped.match(SCHEME_RE)
      # "foo.com:8080/x" and "localhost:3000" also look like "<scheme>:<rest>"
      # to SCHEME_RE. When the whole input reads as host + numeric port,
      # treat it as a scheme-less URL rather than an opaque identifier whose
      # scheme is the host name.
      scheme_match = nil if scheme_match && host_with_port?(stripped)

      if scheme_match
        scheme = scheme_match[1].downcase
        rest = stripped[scheme_match[0].length..]

        if scheme == "urn"
          parse_urn(input, rest)
        elsif rest.start_with?("//")
          parse_authority_url(input, scheme, rest[2..])
        else
          # opaque scheme like mailto:foo@bar — keep nss, mark as urn-ish so we
          # don't pretend we know its host/path layout.
          Identifier.new(original: input, scheme: scheme, nss: rest, kind: :urn)
        end
      elsif HOSTISH_RE.match?(stripped)
        # No scheme, but it looks like a hostname: assume https.
        parse_authority_url(input, "https", stripped)
      else
        raise ParseError, "cannot parse #{input.inspect}: no scheme and no host-like prefix"
      end
    end

    # True when +text+ as a whole matches HOSTISH_RE and carries an explicit
    # numeric port.
    def host_with_port?(text)
      m = HOSTISH_RE.match(text)
      !m.nil? && !m[:port].nil?
    end

    def parse_urn(original, rest)
      raise ParseError, "urn missing namespace" if rest.nil? || rest.empty?

      Identifier.new(original: original, scheme: "urn", nss: rest, kind: :urn)
    end

    # Parses "host[:port][/path][?query][#fragment]" (everything after any
    # "scheme://"). The host is lowercased, the scheme's default port is
    # dropped, and the path is rebuilt from the normalized segments.
    def parse_authority_url(original, scheme, remainder)
      m = remainder.match(HOSTISH_RE) || remainder.match(AUTHORITY_FALLBACK_RE)
      raise ParseError, "cannot parse authority from #{original.inspect}" unless m

      host = m[:host].downcase
      port = m[:port]&.to_i
      port = nil if port && DEFAULT_PORTS[scheme] == port

      rest = m[:rest] || ""
      path, query, fragment = split_path_query_fragment(rest)
      segments = path_segments(path)

      Identifier.new(
        original: original,
        scheme: scheme,
        host: host,
        port: port,
        path: "/" + segments.join("/"),
        path_segments: segments,
        query: query,
        query_params: parse_query(query),
        fragment: fragment,
        kind: :url,
      )
    end

    # Splits off the fragment first (everything after the first "#"), then
    # the query (everything after the first "?" in what remains). Returns
    # [path, query, fragment]; query and fragment are nil when absent.
    def split_path_query_fragment(rest)
      path = rest
      query = nil
      fragment = nil

      if (idx = path.index("#"))
        fragment = path[(idx + 1)..]
        path = path[0...idx]
      end

      if (idx = path.index("?"))
        query = path[(idx + 1)..]
        path = path[0...idx]
      end

      [path, query, fragment]
    end

    # Apply dot-segment normalization (RFC 3986 §5.2.4, lightweight version)
    # and drop empty segments from leading/trailing/duplicate slashes.
    def path_segments(path)
      return [] if path.nil? || path.empty? || path == "/"

      raw = path.sub(%r{\A/}, "").split("/")
      out = []
      raw.each do |seg|
        case seg
        when "", "."
          next
        when ".."
          out.pop # no-op when already at the root
        else
          out << seg
        end
      end
      out
    end

    # Splits "a=1&b=2" into { "a" => "1", "b" => "2" }. Pairs with an empty
    # key are skipped, a key without "=" maps to nil, and a repeated key keeps
    # its last value. No percent-decoding is performed.
    def parse_query(query)
      return {} if query.nil? || query.empty?

      query.split("&").each_with_object({}) do |pair, acc|
        k, v = pair.split("=", 2)
        next if k.nil? || k.empty?

        acc[k] = v
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
module Iriq
  # Turns a list of path segments into a route-shape string, substituting a
  # `{type}` placeholder for every segment the classifier deems variable.
  #
  #   PathShape.for(["users", "123", "orders", "456"])
  #   # => "/users/{integer_id}/orders/{integer_id}"
  class PathShape
    # Convenience wrapper: builds a throwaway instance and shapes +segments+.
    def self.for(segments, classifier: SegmentClassifier.new)
      new(classifier: classifier).for(segments)
    end

    def initialize(classifier: SegmentClassifier.new)
      @classifier = classifier
    end

    # nil or an empty list shapes to the root path "/".
    def for(segments)
      if segments.nil? || segments.empty?
        "/"
      else
        shaped = segments.map { |segment| shape_segment(segment) }
        "/" + shaped.join("/")
      end
    end

    # Returns the segment unchanged when its type is not variable, otherwise
    # the "{type}" placeholder.
    def shape_segment(segment)
      kind = @classifier.classify(segment)
      return segment unless @classifier.variable?(kind)

      "{#{kind}}"
    end
  end
end
@@ -0,0 +1,54 @@
1
module Iriq
  # Heuristic classifier for individual path segments and query values.
  #
  # `classify` returns one symbol from TYPES. Rules are tried in a fixed
  # order and the first match wins, so the narrow formats (UUID, date,
  # ISO time) are tested before the broad ones (integer, hash, slug,
  # opaque).
  class SegmentClassifier
    TYPES = %i[literal integer_id uuid date timestamp hash slug opaque_id].freeze

    # Regexp literals are already frozen on the Ruby versions this gem
    # supports (>= 3.2), so no explicit `.freeze` is needed.
    UUID_RE     = /\A\h{8}-\h{4}-\h{4}-\h{4}-\h{12}\z/
    INTEGER_RE  = /\A\d+\z/
    DATE_RE     = /\A\d{4}-\d{2}-\d{2}\z/
    ISO_TIME_RE = /\A\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}(:\d{2})?(\.\d+)?(Z|[+\-]\d{2}:?\d{2})?\z/
    HASH_RE     = /\A\h{32,}\z/
    SLUG_RE     = /\A[a-z0-9]+(?:[-_][a-z0-9]+)+\z/
    LITERAL_RE  = /\A[\p{L}][\p{L}\p{M}_]*\z/u
    OPAQUE_RE   = /\A[A-Za-z0-9_\-.~]{4,}\z/

    # Plausible UNIX timestamps (10 digit seconds or 13 digit ms) from
    # roughly 2001 onward.
    TS_SECONDS_RANGE = 1_000_000_000..9_999_999_999
    TS_MILLIS_RANGE  = 1_000_000_000_000..9_999_999_999_999

    # Classify one segment (or query value).
    #
    # segment - String to inspect; nil and "" are treated as :literal.
    #
    # Returns a Symbol from TYPES. Anything that matches no rule falls
    # back to :literal.
    def classify(segment)
      return :literal if segment.nil? || segment.empty?

      case segment
      when UUID_RE     then :uuid
      when DATE_RE     then :date
      when ISO_TIME_RE then :timestamp
      when INTEGER_RE  then classify_integer(segment)
      when HASH_RE     then :hash
      when SLUG_RE     then :slug
      when LITERAL_RE  then :literal
      when OPAQUE_RE   then :opaque_id
      else :literal
      end
    end

    # Anything except :literal is considered variable for shape/explain.
    def variable?(type)
      type != :literal
    end

    private

    # Decide between :timestamp and :integer_id for an all-digit segment.
    def classify_integer(segment)
      # A zero-padded value such as "0001700000000" has the numeric value
      # of an epoch timestamp but not its 10/13-digit form; it is a padded
      # ID. No value starting with "0" can be a 10- or 13-digit number
      # inside the ranges above, so this never rejects a real timestamp.
      return :integer_id if segment.start_with?("0")

      value = segment.to_i
      return :timestamp if TS_SECONDS_RANGE.cover?(value) || TS_MILLIS_RANGE.cover?(value)

      :integer_id
    end
  end
end
@@ -0,0 +1,3 @@
1
+ module Iriq
2
+ VERSION = "0.0.1" # version string of this gem release
3
+ end
data/lib/iriq.rb ADDED
@@ -0,0 +1,27 @@
1
+ require "iriq/version"
2
+ require "iriq/errors"
3
+ require "iriq/identifier"
4
+ require "iriq/parser"
5
+ require "iriq/segment_classifier"
6
+ require "iriq/path_shape"
7
+ require "iriq/normalizer"
8
+ require "iriq/explanation"
9
+ require "iriq/cluster"
10
+ require "iriq/clusterer"
11
+ require "iriq/cli"
12
+
13
# Top-level convenience API. Each method hands its argument straight to
# the component of the same name and returns whatever that component
# returns.
module Iriq
  # Delegates to Parser.parse.
  def self.parse(input)
    Parser.parse(input)
  end

  # Delegates to Normalizer.normalize.
  def self.normalize(input)
    Normalizer.normalize(input)
  end

  # Delegates to Explanation.explain.
  def self.explain(input)
    Explanation.explain(input)
  end
end
metadata ADDED
@@ -0,0 +1,115 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: iriq
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Daniel Pepper
8
+ bindir: exe
9
+ cert_chain: []
10
+ date: 1980-01-02 00:00:00.000000000 Z
11
+ dependencies:
12
+ - !ruby/object:Gem::Dependency
13
+ name: debug
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - ">="
17
+ - !ruby/object:Gem::Version
18
+ version: '1'
19
+ type: :development
20
+ prerelease: false
21
+ version_requirements: !ruby/object:Gem::Requirement
22
+ requirements:
23
+ - - ">="
24
+ - !ruby/object:Gem::Version
25
+ version: '1'
26
+ - !ruby/object:Gem::Dependency
27
+ name: rspec
28
+ requirement: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: '3.10'
33
+ type: :development
34
+ prerelease: false
35
+ version_requirements: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - ">="
38
+ - !ruby/object:Gem::Version
39
+ version: '3.10'
40
+ - !ruby/object:Gem::Dependency
41
+ name: rspec-debugging
42
+ requirement: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - ">="
45
+ - !ruby/object:Gem::Version
46
+ version: '0'
47
+ type: :development
48
+ prerelease: false
49
+ version_requirements: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ - !ruby/object:Gem::Dependency
55
+ name: simplecov
56
+ requirement: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ version: '0.22'
61
+ type: :development
62
+ prerelease: false
63
+ version_requirements: !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - ">="
66
+ - !ruby/object:Gem::Version
67
+ version: '0.22'
68
+ description: Semantic IRI/URI/URL/URN parsing, normalization, classification, and
69
+ clustering.
70
+ executables:
71
+ - iriq
72
+ extensions: []
73
+ extra_rdoc_files: []
74
+ files:
75
+ - CHANGELOG.md
76
+ - Gemfile
77
+ - Gemfile.lock
78
+ - LICENSE.txt
79
+ - README.md
80
+ - exe/iriq
81
+ - iriq.gemspec
82
+ - lib/iriq.rb
83
+ - lib/iriq/cli.rb
84
+ - lib/iriq/cluster.rb
85
+ - lib/iriq/clusterer.rb
86
+ - lib/iriq/errors.rb
87
+ - lib/iriq/explanation.rb
88
+ - lib/iriq/identifier.rb
89
+ - lib/iriq/normalizer.rb
90
+ - lib/iriq/parser.rb
91
+ - lib/iriq/path_shape.rb
92
+ - lib/iriq/segment_classifier.rb
93
+ - lib/iriq/version.rb
94
+ homepage: https://github.com/dpep/iriq
95
+ licenses:
96
+ - MIT
97
+ metadata: {}
98
+ rdoc_options: []
99
+ require_paths:
100
+ - lib
101
+ required_ruby_version: !ruby/object:Gem::Requirement
102
+ requirements:
103
+ - - ">="
104
+ - !ruby/object:Gem::Version
105
+ version: '3.2'
106
+ required_rubygems_version: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ requirements: []
112
+ rubygems_version: 3.6.9
113
+ specification_version: 4
114
+ summary: Semantic IRI normalization and clustering.
115
+ test_files: []