iriq 0.1.0 → 0.30.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +87 -0
  3. data/CLAUDE.md +208 -0
  4. data/Gemfile.lock +8 -2
  5. data/Makefile +113 -0
  6. data/README.md +249 -270
  7. data/completions/_iriq +52 -0
  8. data/completions/iriq.bash +70 -0
  9. data/docs/ARCHITECTURE.md +223 -0
  10. data/docs/ROADMAP.md +190 -0
  11. data/iriq.gemspec +5 -4
  12. data/lib/iriq/cli.rb +402 -49
  13. data/lib/iriq/cluster.rb +304 -8
  14. data/lib/iriq/clusterer.rb +19 -44
  15. data/lib/iriq/corpus.rb +417 -81
  16. data/lib/iriq/cross_host_shape.rb +37 -0
  17. data/lib/iriq/event.rb +22 -0
  18. data/lib/iriq/evidence.rb +114 -0
  19. data/lib/iriq/explanation.rb +1 -1
  20. data/lib/iriq/normalizer.rb +71 -29
  21. data/lib/iriq/parser.rb +1 -1
  22. data/lib/iriq/path_shape.rb +30 -24
  23. data/lib/iriq/position.rb +75 -0
  24. data/lib/iriq/position_stats.rb +74 -8
  25. data/lib/iriq/recognizer.rb +54 -0
  26. data/lib/iriq/recognizer_proposal.rb +167 -0
  27. data/lib/iriq/recognizers/date.rb +53 -0
  28. data/lib/iriq/recognizers/integer.rb +37 -0
  29. data/lib/iriq/recognizers/uuid.rb +16 -0
  30. data/lib/iriq/reducer.rb +37 -0
  31. data/lib/iriq/registrable_domain.rb +56 -0
  32. data/lib/iriq/segment_classifier.rb +475 -23
  33. data/lib/iriq/segment_hints.rb +9 -0
  34. data/lib/iriq/shape.rb +106 -0
  35. data/lib/iriq/specificity.rb +35 -0
  36. data/lib/iriq/storage/json.rb +43 -0
  37. data/lib/iriq/storage/memory.rb +209 -0
  38. data/lib/iriq/storage/sqlite.rb +546 -0
  39. data/lib/iriq/storage.rb +35 -0
  40. data/lib/iriq/synthesized_recognizer.rb +56 -0
  41. data/lib/iriq/trace.rb +294 -0
  42. data/lib/iriq/version.rb +1 -1
  43. data/lib/iriq.rb +18 -0
  44. metadata +44 -8
  45. data/script/benchmark.rb +0 -81
  46. data/script/memory.rb +0 -121
@@ -1,24 +1,199 @@
1
+ require "set"
2
+
1
3
  module Iriq
2
4
  # Heuristic classifier for individual path segments and query values.
3
5
  #
4
6
  # Returns a symbol from the known TYPES set. Order matters: the first
5
7
  # matching rule wins.
6
8
  class SegmentClassifier
7
- TYPES = %i[literal integer_id uuid date timestamp hash slug opaque_id].freeze
9
+ # `:number` is a corpus-only umbrella surfaced by Cluster#param_type
10
+ # when both `:integer` and `:float` are observed at the same position
11
+ # without either hitting a clear majority. The classifier never returns
12
+ # `:number` for an individual value — every value is unambiguously one
13
+ # or the other.
14
+ #
15
+ # `:enum` is similarly corpus-only — it surfaces when a position has a
16
+ # bounded set of distinct values observed across enough samples (see
17
+ # Cluster::ENUM_* thresholds).
18
+ TYPES = %i[literal integer float number uuid date year timestamp hash slug
19
+ ipv4 ipv6 url email boolean version locale currency phone jwt mime
20
+ file color coordinate country base64 http_status enum opaque_id].freeze
8
21
 
9
- UUID_RE = /\A\h{8}-\h{4}-\h{4}-\h{4}-\h{12}\z/.freeze
10
- INTEGER_RE = /\A\d+\z/.freeze
11
- DATE_RE = /\A\d{4}-\d{2}-\d{2}\z/.freeze
22
+ # A float requires a decimal point and digits on both sides. Sign is
23
+ # optional. Bare integers and 4+ char hex/UUID-shaped tokens fall through
24
+ # to their own rules.
25
+ FLOAT_RE = /\A-?\d+\.\d+\z/.freeze
26
+ # ISO 8601 timestamp shapes (RFC 3339-ish). Date-only forms live on
27
+ # Recognizers::Date / Recognizers::Integer.
12
28
  ISO_TIME_RE = /\A\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}(:\d{2})?(\.\d+)?(Z|[+\-]\d{2}:?\d{2})?\z/.freeze
13
29
  HASH_RE = /\A\h{32,}\z/.freeze
14
30
  SLUG_RE = /\A[a-z0-9]+(?:[-_][a-z0-9]+)+\z/.freeze
15
31
  LITERAL_RE = /\A[\p{L}][\p{L}\p{M}_]*\z/u.freeze
16
32
  OPAQUE_RE = /\A[A-Za-z0-9_\-.~]{4,}\z/.freeze
17
33
 
18
- # Plausible UNIX timestamps (10 digit seconds or 13 digit ms) from
19
- # roughly 2001 onward.
20
- TS_SECONDS_RANGE = 1_000_000_000..9_999_999_999
21
- TS_MILLIS_RANGE = 1_000_000_000_000..9_999_999_999_999
34
+ # Dotted-quad shape; per-octet bounds are validated in classify_ipv4.
35
+ IPV4_RE = /\A\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\z/.freeze
36
+ # IPv6: matches either the full eight-group form (`a:b:c:d:e:f:g:h`)
37
+ # or any compressed form containing `::`. Rejects bare hex / integers
38
+ # / single-colon strings so we don't shadow :integer, :hash, etc.
39
+ # Doesn't accept IPv4-mapped variants (`::ffff:192.0.2.1`) — common
40
+ # IPv6 traffic in URLs doesn't use them.
41
+ IPV6_RE = /\A(?:[0-9a-fA-F]{1,4}(?::[0-9a-fA-F]{1,4}){7}|(?=[0-9a-fA-F:]*::)[0-9a-fA-F:]{2,})\z/.freeze
42
+ # URL-as-value: a scheme prefix followed by something non-empty.
43
+ # Used for query params like ?redirect=https://foo.com/bar.
44
+ URL_RE = %r{\A[a-zA-Z][a-zA-Z0-9+.\-]*://\S+\z}.freeze
45
+ # Scheme-less URL — `foo.com/path`, `sub.foo.com/`, etc. Requires a
46
+ # dotted host with a TLD-like suffix (≥2 letters) followed by a slash
47
+ # to disambiguate from filenames like `image.png` or version strings
48
+ # like `1.2.3`.
49
+ SCHEMELESS_URL_RE = %r{\A[a-zA-Z0-9\-]+(?:\.[a-zA-Z0-9\-]+)*\.[a-zA-Z]{2,}/\S*\z}.freeze
50
+ # Simplified email — local@host.tld, no leading/trailing dots in either
51
+ # part. Not RFC 5322 compliant; covers the common shape.
52
+ EMAIL_RE = /\A[A-Za-z0-9._%+\-]+@[A-Za-z0-9](?:[A-Za-z0-9\-]*[A-Za-z0-9])?(?:\.[A-Za-z0-9](?:[A-Za-z0-9\-]*[A-Za-z0-9])?)+\z/.freeze
53
+
54
+ # Boolean literal — case-insensitive. `0`/`1` look like integers from a
55
+ # single value alone; the corpus's :enum detection picks them up when
56
+ # they appear as a bounded value set on a param.
57
+ BOOLEAN_RE = /\A(?:true|false)\z/i.freeze
58
+ # SemVer-ish version tag with explicit `v` prefix. Without the prefix
59
+ # `1.2.3` looks like a float / opaque blob; the `v` keeps it
60
+ # unambiguous from a single value.
61
+ VERSION_RE = /\Av\d+(?:\.\d+)*(?:[-+][A-Za-z0-9.\-]+)?\z/.freeze
62
+ # BCP 47-ish locale: 2-3 letter language + separator + 2-4 char region
63
+ # or script. Real-world subtags: ISO 3166-1 region (`US`, `CA`, 2 letters
64
+ # / 3 digits), ISO 15924 script (`Hans`, 4 letters). The bare 2/3-letter
65
+ # case is handled via LOCALE_LANGUAGE_CODES below so we don't
66
+ # over-classify random short words. A trailing helper (classify_locale_pair)
67
+ # also confirms the language portion is in the allowlist — otherwise
68
+ # things like `by-locale` would wrongly promote to :locale.
69
+ LOCALE_RE = /\A([a-z]{2,3})[-_]([A-Za-z0-9]{2,4})\z/.freeze
70
+ # Inline ISO 639-1 (subset) — the language codes we'll accept as a
71
+ # standalone locale segment. Bare `en` / `fr` / `ja` etc. classify as
72
+ # :locale; tokens not in the list (like the 2-letter literal `to` or
73
+ # `if`) stay as :literal. Curated for the languages that show up in
74
+ # real `?lang=` traffic; expandable as needed.
75
+ LOCALE_LANGUAGE_CODES = %w[
76
+ ar bg bn ca cs da de el en es et fa fi fr gu he hi hr hu id it
77
+ ja ka kk km kn ko lt lv mk ml mr ms my nb nl no pa pl pt ro ru
78
+ sk sl sr sv sw ta te th tl tr uk ur vi zh
79
+ ].to_set.freeze
80
+ # 2 letters only — 3-letter slot is handled by CURRENCY_RE (ISO 4217
81
+ # codes are 3 chars; ISO 639-2 language codes are too, but we don't
82
+ # ship that list and would shadow currencies for ambiguous strings).
83
+ LOCALE_BARE_RE = /\A[a-z]{2}\z/.freeze
84
+ # ISO 4217 currency codes — inline allowlist of the ~30 most-used
85
+ # codes covers the bulk of real traffic. Three-letter all-caps
86
+ # strings (`FAQ`, `FOO`) would otherwise leak into the currency type
87
+ # if we relied on shape alone.
88
+ CURRENCY_CODES = %w[
89
+ USD EUR GBP JPY CNY CHF CAD AUD NZD HKD SGD
90
+ INR KRW MXN BRL ZAR SEK NOK DKK PLN CZK HUF
91
+ RUB TRY ILS AED SAR THB IDR PHP VND TWD MYR
92
+ NGN EGP
93
+ ].to_set.freeze
94
+ CURRENCY_RE = /\A[A-Za-z]{3}\z/.freeze
95
+ # E.164 phone number — leading `+` then 1-3 digit country code, then up
96
+ # to 14 more digits. Allows separators (space, dash, dot, parens) but
97
+ # they don't count toward digit length. A standalone `+15551234567` and
98
+ # `+1 (555) 123-4567` both classify; bare digit blobs without `+`
99
+ # stay as :integer / :opaque_id (too ambiguous from a single value).
100
+ PHONE_RE = %r{\A\+(?:[ \-.()\d]){7,20}\z}.freeze
101
+ # NANP phone without `+` — `555-666-7777`, `555.666.7777`, `(555) 666-7777`.
102
+ # The area-code + exchange leading-digit constraint (first digit 2-9 in
103
+ # both) is what makes this safe to add without shadowing :integer —
104
+ # bare digit blobs / dotted numerics fall through. Only matches the
105
+ # 10-digit NANP shape; international formats need the explicit `+`.
106
+ PHONE_NANP_RE = /\A\(?([2-9]\d{2})\)?[ \-.]?([2-9]\d{2})[ \-.]?(\d{4})\z/.freeze
107
+ # JWT: three base64url-encoded segments separated by dots, header
108
+ # normally starts with `eyJ` (base64url of `{"`); JWT_RE only requires `ey`.
109
+ JWT_RE = /\Aey[A-Za-z0-9_\-]+\.[A-Za-z0-9_\-]+\.[A-Za-z0-9_\-]+\z/.freeze
110
+ # MIME / media type — RFC 2046 top-level types plus a subtype. The
111
+ # subtype side is permissive (letters/digits/+-.) so `application/vnd.api+json`
112
+ # and `image/svg+xml` both match.
113
+ MIME_RE = %r{\A(?:text|image|video|audio|application|multipart|message|font|model)/[A-Za-z0-9!#$&^_+\-.]+\z}.freeze
114
+ # File — `name.ext` shape where ext is in FILE_EXTENSIONS. The stem
115
+ # can be a slug, opaque-id, or literal; the meaningful signal is the
116
+ # extension. Per-extension grouping (image / document / data / etc.)
117
+ # surfaces via SegmentClassifier.file_kind for verbose displays.
118
+ FILE_RE = /\A([A-Za-z0-9][A-Za-z0-9_\-.~]*)\.([A-Za-z0-9]{1,8})\z/.freeze
119
+ # Allowlist of common file extensions, keyed by kind. The kind is
120
+ # surfaced via file_kind for verbose output; the type itself is just
121
+ # `:file`. Keep this list curated — random 1-8 char endings can shadow
122
+ # legitimate semantic types (`fr_CA.us`, `1.2.3`).
123
+ FILE_EXTENSIONS = {
124
+ image: %w[png jpg jpeg gif webp svg bmp tiff tif ico avif heic heif],
125
+ document: %w[pdf doc docx xls xlsx ppt pptx odt ods odp rtf epub],
126
+ data: %w[csv tsv json xml yaml yml parquet sqlite db ndjson jsonl],
127
+ text: %w[txt md log markdown rst],
128
+ web: %w[html htm css js mjs cjs ts jsx tsx],
129
+ audio: %w[mp3 wav ogg flac aac m4a opus],
130
+ video: %w[mp4 mov avi mkv webm flv wmv m4v],
131
+ archive: %w[zip tar gz bz2 7z rar xz tgz],
132
+ code: %w[rb py go java c cc cpp h hpp sh swift kt rs],
133
+ }.freeze
134
+ # Reverse map ext → kind for O(1) lookup. Lowercase keys; classify
135
+ # downcases before consulting.
136
+ FILE_EXTENSION_KIND = FILE_EXTENSIONS.each_with_object({}) { |(kind, exts), h|
137
+ exts.each { |e| h[e] = kind }
138
+ }.freeze
139
+
140
+ # Hex color — `#fff`, `#ffffff`, `#ffffff80` (with alpha). 3/4/6/8
141
+ # hex chars after the leading `#`. Other color formats (named, rgb(),
142
+ # hsl()) aren't recognized yet; this is the only one common in URL
143
+ # path/query positions.
144
+ COLOR_HEX_RE = /\A#([0-9a-fA-F]{3}|[0-9a-fA-F]{4}|[0-9a-fA-F]{6}|[0-9a-fA-F]{8})\z/.freeze
145
+ # Coordinate pair — `lat,lng`, both signed decimals. The extractor's
146
+ # comma boundary means this only survives when present at classify
147
+ # time (e.g. query values fed in already-parsed). Each component
148
+ # validated for plausible lat/lng range in classify_coordinate.
149
+ COORDINATE_RE = /\A(-?\d+(?:\.\d+)?),(-?\d+(?:\.\d+)?)\z/.freeze
150
+ # ISO 3166-1 alpha-2 — 2 letters, validated against the inline
151
+ # allowlist below (so random 2-letter uppercase tokens like `OK` or
152
+ # `ZZ` don't unconditionally promote). Lowercase tokens are routed
153
+ # through :locale by LOCALE_BARE_RE.
154
+ COUNTRY_RE = /\A[A-Z]{2}\z/.freeze
155
+ COUNTRY_CODES = %w[
156
+ AD AE AF AG AL AM AO AR AT AU AZ
157
+ BA BB BD BE BG BH BJ BM BN BO BR BS BT BW BY BZ
158
+ CA CD CG CH CI CL CM CN CO CR CU CY CZ
159
+ DE DJ DK DM DO DZ
160
+ EC EE EG ER ES ET
161
+ FI FJ FK FM FO FR
162
+ GA GB GE GH GI GL GM GN GR GT GU GW GY
163
+ HK HN HR HT HU
164
+ ID IE IL IM IN IQ IR IS IT
165
+ JM JO JP
166
+ KE KG KH KM KN KP KR KW KY KZ
167
+ LA LB LC LI LK LR LS LT LU LV LY
168
+ MA MC MD ME MG MK ML MM MN MO MR MT MU MV MW MX MY MZ
169
+ NA NE NG NI NL NO NP NR NU NZ
170
+ OM
171
+ PA PE PF PG PH PK PL PR PT PW PY
172
+ QA
173
+ RE RO RS RU RW
174
+ SA SB SC SD SE SG SI SK SL SM SN SO SR SS ST SV SY SZ
175
+ TD TG TH TJ TM TN TO TR TT TV TW TZ
176
+ UA UG US UY UZ
177
+ VA VC VE VG VI VN VU
178
+ WS
179
+ YE
180
+ ZA ZM ZW
181
+ ].to_set.freeze
182
+ # Standard base64 — at least 16 chars, made up of base64 alphabet,
183
+ # AND contains one of the disambiguating chars (`+`, `/`, trailing
184
+ # `=` padding) so we don't shadow plain alphanumeric :opaque_id
185
+ # blobs. URL-safe base64 (which uses `-`/`_`) overlaps too heavily
186
+ # with :slug to discriminate from shape alone.
187
+ BASE64_RE = %r{\A[A-Za-z0-9+/]{16,}={0,2}\z}.freeze
188
+
189
+ # HTTP status — bare 3-digit integer in the 100..599 window. Same
190
+ # corpus-promotion pattern as :year: a single 3-digit int is ambiguous,
191
+ # but a position whose values cluster inside the HTTP status window is
192
+ # almost certainly statuses. See Cluster#param_type for the promotion.
193
+ HTTP_STATUS_RANGE = 100..599
194
+ # Plausible year — 4-digit integer in the 1900..2100 window. Checked
195
+ # on the integer path (formerly classify_integer; presumably Recognizers::Integer now — confirm) so we don't shadow shorter / longer ints.
196
+ YEAR_RANGE = 1900..2100
22
197
 
23
198
  # Bounded memoization: classification of a given string is pure, so
24
199
  # repeat segments (e.g. /users in countless paths) can be cached. Cap
@@ -28,6 +203,11 @@ module Iriq
28
203
 
29
204
  def initialize
30
205
  @cache = {}
206
+ # The recognizer ensemble consulted at classify time. Starts with
207
+ # the built-in three (uuid, date, integer); Corpus#activate_proposal
208
+ # appends SynthesizedRecognizer instances at runtime so a corpus
209
+ # picks up its learned patterns without classifier surgery.
210
+ @recognizers = [Recognizers::UUID, Recognizers::DATE, Recognizers::INTEGER]
31
211
  end
32
212
 
33
213
  def classify(segment)
@@ -40,6 +220,22 @@ module Iriq
40
220
  @cache[segment] = compute_classification(segment)
41
221
  end
42
222
 
223
+ # Append a Recognizer to the ensemble. Called by Corpus#activate_proposal
224
+ # to promote a learned RecognizerProposal into a live Recognizer.
225
+ # Busts the classify cache so subsequent classify() calls see the
226
+ # new Recognizer.
227
def register_recognizer(recognizer)
  @recognizers.push(recognizer)
  # Cached classifications were computed without this recognizer; drop them
  # so later classify() calls consult the extended ensemble.
  @cache.clear
  # Hand the argument back to the caller.
  recognizer
end
232
+
233
+ # Snapshot of the live ensemble. Useful for tests and tooling that
234
+ # want to inspect which Recognizers a corpus is consulting.
235
def recognizers
  # Fresh array with the same elements, so callers cannot mutate the live list.
  [*@recognizers]
end
238
+
43
239
  # Anything except :literal is considered variable for shape/explain.
44
240
  def variable?(type)
45
241
  type != :literal
@@ -48,25 +244,153 @@ module Iriq
48
244
  private
49
245
 
50
246
  def compute_classification(segment)
51
- case segment
52
- when UUID_RE then :uuid
53
- when DATE_RE then :date
54
- when ISO_TIME_RE then :timestamp
55
- when INTEGER_RE then classify_integer(segment)
56
- when HASH_RE then :hash
57
- when SLUG_RE then :slug
58
- when LITERAL_RE then :literal
59
- when OPAQUE_RE then :opaque_id
60
- else :literal
247
+ # Cheap composition checks short-circuit regex matches that can't
248
+ # possibly fire. Each `_RE` test below is preceded by a `String#include?`
249
+ # / `start_with?` / `size` guard so a literal like "users" walks
250
+ # past 20-odd `_RE`s in O(len) instead of O(len * n_regexes).
251
+ size = segment.size
252
+ first = segment.getbyte(0)
253
+ digit0 = first && first >= 0x30 && first <= 0x39
254
+ has_dash = segment.include?("-")
255
+ has_dot = segment.include?(".")
256
+ has_colon = segment.include?(":")
257
+ has_slash = segment.include?("/")
258
+ has_at = segment.include?("@")
259
+ has_sep = has_dash || segment.include?("_")
260
+ has_comma = segment.include?(",")
261
+
262
+ # Scored ensemble over the live Recognizer list — built-ins +
263
+ # anything Corpus#activate_proposal has registered for this
264
+ # classifier instance.
265
+ if (v = Recognizer.ensemble(segment, *@recognizers))
266
+ return v[:type]
267
+ end
268
+
269
+ # Network / structured-value types take precedence over the generic
270
+ # OPAQUE_RE catch-all (which would otherwise grab IPv4) and the
271
+ # LITERAL fallback (which today swallows email + URL + IPv6).
272
+ return :jwt if segment.start_with?("ey") && segment.count(".") == 2 && JWT_RE.match?(segment)
273
+ return classify_color(segment) if first == 0x23 && COLOR_HEX_RE.match?(segment) # '#'
274
+ return :url if has_colon && segment.include?("://") && URL_RE.match?(segment)
275
+ return :email if has_at && EMAIL_RE.match?(segment)
276
+ return :mime if has_slash && MIME_RE.match?(segment)
277
+ return :url if has_dot && has_slash && SCHEMELESS_URL_RE.match?(segment)
278
+ return classify_ipv4(segment) if digit0 && has_dot && IPV4_RE.match?(segment)
279
+ return :ipv6 if has_colon && IPV6_RE.match?(segment)
280
+ return classify_coordinate(segment) if has_comma && COORDINATE_RE.match?(segment)
281
+ return :hash if size >= 32 && HASH_RE.match?(segment)
282
+ return :version if first == 0x76 && VERSION_RE.match?(segment) # 'v'
283
+ return :boolean if (size >= 4 && size <= 5) && BOOLEAN_RE.match?(segment)
284
+ return classify_locale_pair(segment) if has_sep && LOCALE_RE.match?(segment)
285
+ return classify_locale(segment) if size == 2 && LOCALE_BARE_RE.match?(segment)
286
+ return :timestamp if has_colon && ISO_TIME_RE.match?(segment)
287
+ return classify_phone(segment) if first == 0x2B && PHONE_RE.match?(segment) # '+'
288
+ return :phone if (has_dash || has_dot || segment.include?("(")) && PHONE_NANP_RE.match?(segment)
289
+ return :float if has_dot && FLOAT_RE.match?(segment)
290
+ return classify_currency(segment) if size == 3 && CURRENCY_RE.match?(segment)
291
+ return classify_country(segment) if size == 2 && COUNTRY_RE.match?(segment)
292
+ return :base64 if size >= 16 && (segment.include?("=") || segment.include?("+") || segment.include?("/")) && BASE64_RE.match?(segment)
293
+ return classify_file(segment) if has_dot && FILE_RE.match?(segment)
294
+ return :slug if has_sep && SLUG_RE.match?(segment)
295
+ return :literal if LITERAL_RE.match?(segment)
296
+ return :opaque_id if OPAQUE_RE.match?(segment)
297
+
298
+ :literal
299
+ end
300
+
301
+ # IPV4_RE only checks shape (1-3 digits between dots). Validate each
302
+ # octet ≤ 255; on failure fall back to :opaque_id so we don't promote
303
+ # garbage like `999.999.999.999` to :ipv4.
304
def classify_ipv4(segment)
  # Only the numeric range of each dot-separated part is checked here;
  # every part must convert to an integer within 0..255.
  octets = segment.split(".")
  in_range = octets.all? { |octet| octet.to_i.between?(0, 255) }
  in_range ? :ipv4 : :opaque_id
end
309
+
310
+ # Validate E.164-shaped phone: count digits (ignoring separators) and
311
+ # ensure between 7 and 15 inclusive. The shape regex permits a wide
312
+ # range — the digit count is the meaningful guardrail.
313
def classify_phone(segment)
  # Separators are ignored: only the characters 0-9 are tallied.
  digit_total = segment.count("0-9")
  (7..15).cover?(digit_total) ? :phone : :opaque_id
end
319
+
320
+ # Color — only the hex form is recognized for now. Returns :color
321
+ # when the value matches COLOR_HEX_RE. Future extensions (named
322
+ # colors, rgb(), hsl()) can plug in via classify_color without
323
+ # rearranging compute_classification.
324
def classify_color(segment)
  # Hex notation is the only color form checked; anything else is opaque.
  COLOR_HEX_RE.match?(segment) ? :color : :opaque_id
end
329
+
330
+ # Coordinate pair — both numbers must land in plausible lat/lng
331
+ # bounds: latitude ±90, longitude ±180. We accept either ordering
332
+ # (lat,lng OR lng,lat) by checking both. Pairs outside the range
333
+ # fall back to :opaque_id so generic CSV-shaped values aren't
334
+ # promoted.
335
def classify_coordinate(segment)
  pair = COORDINATE_RE.match(segment)
  return :opaque_id unless pair

  first, second = pair[1].to_f, pair[2].to_f
  # Either ordering is accepted: (lat, lng) or (lng, lat).
  lat_then_lng = first.between?(-90, 90) && second.between?(-180, 180)
  lng_then_lat = first.between?(-180, 180) && second.between?(-90, 90)

  (lat_then_lng || lng_then_lat) ? :coordinate : :opaque_id
end
346
+
347
+ # Country — promote to :country only when the 2-letter token is in
348
+ # the ISO 3166-1 alpha-2 allowlist. Otherwise fall through to
349
+ # :literal (matches LITERAL_RE).
350
def classify_country(segment)
  # Membership in the allowlist decides; any other token is a plain literal.
  COUNTRY_CODES.include?(segment) ? :country : :literal
end
355
+
356
+ # File classification — only promote when the trailing extension is
357
+ # in the allowlist. Otherwise fall through to the slug/literal/opaque
358
+ # rules so `1.2.3` (version) and `fr_CA.us` (locale-shaped opaque) don't
359
+ # get pulled in by the FILE_RE shape.
360
def classify_file(segment)
  # Pull the trailing `.ext` (1-8 alphanumerics) and look it up lowercased,
  # because the extension table is keyed by lowercase strings.
  suffix = segment[/\.([A-Za-z0-9]{1,8})\z/, 1]
  return :file if suffix && FILE_EXTENSION_KIND.key?(suffix.downcase)

  # Unknown extension: slug-shaped values stay :slug, everything else is opaque.
  SLUG_RE.match?(segment) ? :slug : :opaque_id
end
367
+
368
+ # Three-letter shape — only call it :currency if it's actually in the
369
+ # ISO 4217 allowlist (case-insensitive). Otherwise fall through to the
370
+ # literal/opaque rules.
371
def classify_currency(segment)
  # The allowlist holds uppercase codes, so compare case-insensitively.
  return :currency if CURRENCY_CODES.include?(segment.upcase)

  LITERAL_RE.match?(segment) ? :literal : :opaque_id
end
377
+
378
+ # Bare 2-letter lowercase token — only :locale when it's a known
379
+ # ISO 639-1 code. Otherwise it's a regular literal (`if`, `to`, `of`).
380
def classify_locale(segment)
  # Allowlisted language codes are locales; other tokens remain literals.
  LOCALE_LANGUAGE_CODES.include?(segment) ? :locale : :literal
end
63
385
 
64
- def classify_integer(segment)
65
- n = segment.to_i
66
- return :timestamp if TS_MILLIS_RANGE.cover?(n)
67
- return :timestamp if TS_SECONDS_RANGE.cover?(n)
386
+ # Dashed/underscored locale form (`en-US`, `zh-Hans`). Only promote to
387
+ # :locale when the language portion is in the ISO 639-1 allowlist —
388
+ # otherwise tokens like `by-locale` would slip through.
389
+ def classify_locale_pair(segment)
390
+ lang = segment[/\A[a-z]{2,3}/]
391
+ return :locale if LOCALE_LANGUAGE_CODES.include?(lang)
68
392
 
69
- :integer_id
393
+ segment.match?(SLUG_RE) ? :slug : :literal
70
394
  end
71
395
 
72
396
  public
@@ -74,5 +398,133 @@ module Iriq
74
398
  # Shared singleton — preferred default for callers that don't bring
75
399
  # their own classifier (saves a per-call allocation).
76
400
  DEFAULT = new
401
+
402
+ # Display name for a type in `--normalize` placeholders. Collapses
403
+ # `:ipv4` and `:ipv6` to `:ip` (callers that want the specific family
404
+ # read it off the classifier directly or via cluster stats).
405
def self.display_type(type)
  # Both IP families share one display name; every other type is shown as-is.
  case type
  when :ipv4, :ipv6 then :ip
  else type
  end
end
410
+
411
+ # Return the kind (`:image`/`:document`/`:data`/...) for a file-shaped
412
+ # value, or nil if the value isn't a recognized file. Used by verbose
413
+ # displays to subdivide `:file` without polluting the top-level type
414
+ # taxonomy.
415
def self.file_kind(value)
  return nil if value.nil?

  # Trailing `.ext` (1-8 alphanumerics); nil when the value has none.
  suffix = value[/\.([A-Za-z0-9]{1,8})\z/, 1]
  suffix ? FILE_EXTENSION_KIND[suffix.downcase] : nil
end
420
+
421
+ # Return the kind (`:hex` for now — placeholder for future named /
422
+ # rgb / hsl support) of a color-shaped value, or nil if the value
423
+ # isn't a recognized color. Used by verbose displays alongside the
424
+ # `:color` type itself.
425
def self.color_kind(value)
  # :hex is the only kind reported; nil covers nil input and non-matches.
  return nil if value.nil?

  COLOR_HEX_RE.match?(value) ? :hex : nil
end
431
+
432
+ # Param-name hints — when a value's classifier output is too generic
433
+ # (`:literal`, `:opaque_id`, `:slug`) to be informative, the param name
434
+ # can supply the type. `?phone=unknown` becomes `:phone` even though
435
+ # `unknown` is a literal. Only "safe" string-shaped types are in the
436
+ # map; numeric types (`:integer`, `:year`, `:http_status`) are handled
437
+ # by range analysis instead.
438
+ PARAM_NAME_HINTS = {
439
+ "phone" => :phone,
440
+ "tel" => :phone,
441
+ "telephone" => :phone,
442
+ "mobile" => :phone,
443
+ "cell" => :phone,
444
+ "email" => :email,
445
+ "e_mail" => :email,
446
+ "mail" => :email,
447
+ "locale" => :locale,
448
+ "lang" => :locale,
449
+ "language" => :locale,
450
+ "currency" => :currency,
451
+ "cur" => :currency,
452
+ "curr" => :currency,
453
+ "url" => :url,
454
+ "uri" => :url,
455
+ "redirect" => :url,
456
+ "redirect_url" => :url,
457
+ "return_to" => :url,
458
+ "return_url" => :url,
459
+ "callback" => :url,
460
+ "callback_url" => :url,
461
+ "next_url" => :url,
462
+ "jwt" => :jwt,
463
+ "bearer" => :jwt,
464
+ "auth_token" => :jwt,
465
+ "mime" => :mime,
466
+ "content_type" => :mime,
467
+ "media_type" => :mime,
468
+ "color" => :color,
469
+ "colour" => :color,
470
+ "bg" => :color,
471
+ "background" => :color,
472
+ "fg" => :color,
473
+ "foreground" => :color,
474
+ "coords" => :coordinate,
475
+ "coordinates" => :coordinate,
476
+ "geo" => :coordinate,
477
+ "location" => :coordinate,
478
+ "position" => :coordinate,
479
+ "latlng" => :coordinate,
480
+ "latlon" => :coordinate,
481
+ "country" => :country,
482
+ "country_code" => :country,
483
+ "nation" => :country,
484
+ }.freeze
485
+ # Types the param-name hint is allowed to override. Anything more
486
+ # specific (`:integer`, `:uuid`, etc.) already carries useful info —
487
+ # the classifier wins.
488
+ PARAM_HINT_OVERRIDABLE = %i[literal opaque_id slug].to_set.freeze
489
+
490
+ # Return a hinted type for a param name when the resolved value type
491
+ # is generic. Nil when no hint applies. Both Cluster#param_type (for
492
+ # the corpus path) and Normalizer.shape_query (for one-shot rendering)
493
+ # consult this so corpus + one-shot agree on the override.
494
def self.param_name_hint(name, current_type)
  return nil if name.nil?
  # A hint may only replace one of the generic (overridable) types.
  return nil unless PARAM_HINT_OVERRIDABLE.include?(current_type)

  # Lookup is case-insensitive and accepts symbols via to_s.
  lookup_key = name.to_s.downcase
  PARAM_NAME_HINTS[lookup_key]
end
499
+
500
+ # Canonicalize a currency code to uppercase ISO 4217. Returns nil if
501
+ # the value isn't a known code. Used by --normalize so /pricing/usd and
502
+ # /pricing/USD both render as /pricing/USD.
503
def self.canonical_currency(value)
  # Non-String input (nil included) has no canonical form. This mirrors the
  # guard in canonical_date and avoids calling #upcase on values that do not
  # respond to it.
  return nil unless value.is_a?(String)

  code = value.upcase
  CURRENCY_CODES.include?(code) ? code : nil
end
508
+
509
+ # Canonicalize a recognized date string to ISO 8601 (YYYY-MM-DD). Returns
510
+ # nil if the value isn't one of our accepted date forms. Used by --normalize
511
+ # so /events/2024/01/15 and /events/20240115 both render as
512
+ # /events/2024-01-15 in the output.
513
def self.canonical_date(value)
  # Only Strings can be canonicalized; this also rejects nil.
  return nil unless value.is_a?(String)

  iso = Recognizers::Date.canonical(value)
  return iso if iso

  # Compact YYYYMMDD is matched by the Integer recognizer's pattern, but it
  # renders in the same hyphenated form as the other date shapes.
  return nil unless Recognizers::Integer::COMPACT_DATE_PATTERN.match?(value)

  year = value[0, 4]
  month = value[4, 2]
  day = value[6, 2]
  Recognizers::Date.plausible?(year, month, day) ? "#{year}-#{month}-#{day}" : nil
end
77
529
  end
78
530
  end
@@ -1,8 +1,16 @@
1
+ require "set"
2
+
1
3
  module Iriq
2
4
  # Walks a segment list and annotates each entry with the type, whether it's
3
5
  # variable, and a RESTful "hint" (e.g. `user_id`) when a variable segment
4
6
  # follows a literal one — `/users/123` ⇒ hint `user_id`.
5
7
  module SegmentHints
8
+ # Only ID-shaped types get the noun-singularize hint. Semantic types
9
+ # (version, locale, currency, date, etc.) are more informative as
10
+ # `{type}` than as `{noun}_id` — `/api/v1/...` should render
11
+ # `{version}`, not `{api_id}`.
12
+ HINT_ELIGIBLE_TYPES = %i[integer uuid hash opaque_id slug].to_set.freeze
13
+
6
14
  module_function
7
15
 
8
16
  def derive(segments, classifier)
@@ -20,6 +28,7 @@ module Iriq
20
28
 
21
29
  def hint_for(segments, i, type, variable, classifier)
22
30
  return nil unless variable && i > 0
31
+ return nil unless HINT_ELIGIBLE_TYPES.include?(type)
23
32
 
24
33
  prev = segments[i - 1]
25
34
  return nil unless classifier.classify(prev) == :literal
data/lib/iriq/shape.rb ADDED
@@ -0,0 +1,106 @@
1
module Iriq
  # Structured route shape: an ordered list of typed segment entries plus
  # rendering methods that produce the various string forms (placeholder,
  # canonical dates, raw types, etc.).
  #
  # Each entry is a Hash read through the keys :value, :type, :variable and
  # :hint. Equality and hashing are defined on the default rendered form, so
  # two Shapes compare equal exactly when `render` returns the same string.
  class Shape
    attr_reader :entries

    # Build a Shape from raw path segments using the given classifier.
    # A nil segment list is treated as empty.
    def self.from_segments(segments, classifier: SegmentClassifier::DEFAULT)
      new(entries: SegmentHints.derive(segments || [], classifier))
    end

    # Build a Shape from already-derived entries. Nil is treated as empty.
    def self.from_entries(entries)
      new(entries: entries || [])
    end

    # Rebuild a Shape from the Hash produced by #to_dump. Keys are
    # symbolized; only the :type value is converted to a Symbol, all other
    # values are kept as given. A nil dump, a missing "entries" key, or an
    # entry whose type is nil are tolerated instead of raising.
    def self.from_dump(h)
      dumped = (h && h["entries"]) || []
      restored = dumped.map do |entry|
        entry.each_with_object({}) do |(k, v), acc|
          key = k.to_sym
          acc[key] = (key == :type && v) ? v.to_sym : v
        end
      end
      new(entries: restored)
    end

    # @param entries [Array<Hash>, nil] entry hashes; nil becomes an empty list
    def initialize(entries:)
      @entries = entries || []
    end

    def empty?
      @entries.empty?
    end

    # Render to the placeholder form — "/users/{user_id}" etc. An empty
    # shape renders as "/".
    #
    # hints:                use an entry's :hint as the placeholder when present
    # canonical_dates:      print :date entries as their canonical value
    # canonical_currencies: print :currency entries as their canonical value
    def render(hints: true, canonical_dates: false, canonical_currencies: false)
      return "/" if empty?

      parts = @entries.map do |entry|
        render_entry(entry, hints: hints, canonical_dates: canonical_dates,
                            canonical_currencies: canonical_currencies)
      end
      "/" + parts.join("/")
    end

    def to_s
      render
    end
    alias inspect to_s

    # Two Shapes are equal when their default renderings match.
    def ==(other)
      other.is_a?(Shape) && other.render == render
    end
    alias eql? ==

    # Consistent with #==: derived from the default rendering.
    def hash
      render.hash
    end

    # Serializable form: entry keys become Strings, values are left as-is.
    def to_dump
      { "entries" => @entries.map { |e| e.transform_keys(&:to_s) } }
    end

    private

    # Render one entry: the literal value for non-variable entries, a
    # canonical value when requested and available, otherwise "{placeholder}".
    def render_entry(entry, hints:, canonical_dates:, canonical_currencies:)
      return entry[:value] unless entry[:variable]

      if canonical_dates && entry[:type] == :date &&
         (canon = SegmentClassifier.canonical_date(entry[:value]))
        return canon
      end

      if canonical_currencies && entry[:type] == :currency &&
         (canon = SegmentClassifier.canonical_currency(entry[:value]))
        return canon
      end

      # The hint wins over the display type only when hints are enabled.
      placeholder = (hints && entry[:hint]) || SegmentClassifier.display_type(entry[:type])
      "{#{placeholder}}"
    end
  end
end