iriq 0.2.0 → 0.30.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/completions/_iriq ADDED
@@ -0,0 +1,52 @@
1
+ #compdef iriq
2
+ # Zsh completion for the `iriq` CLI.
3
+ #
4
+ # Install (pick one):
5
+ # - Persist via Homebrew: brew install dpep/tools/iriq drops this file
6
+ # into Homebrew's zsh site-functions dir automatically.
7
+ # - Try in the current shell:
8
+ # source <(iriq completion zsh)
9
+ # - Or copy this file into a directory listed in $fpath and run
10
+ # `compinit` (typically run by your zshrc).
11
+
12
+ _iriq() {
13
+ local context state state_descr line  # set by _arguments for the ->state actions
14
+ typeset -A opt_args
15
+ # Option specs first; both positional specs defer to the `case $state` below.
16
+ _arguments -C \
17
+ '(-h --help)'{-h,--help}'[show usage]' \
18
+ '(-V --version)'{-V,--version}'[print version]' \
19
+ '(-p --parse)'{-p,--parse}'[parsed fields section]' \
20
+ '(-n --normalize)'{-n,--normalize}'[normalized section]' \
21
+ '(-c --canonical)'{-c,--canonical}'[canonical form section]' \
22
+ '(-e --explain)'{-e,--explain}'[annotated trace section]' \
23
+ '(-j --json)'{-j,--json}'[JSON output]' \
24
+ '(-J --ndjson)'{-J,--ndjson}'[newline-delimited JSON]' \
25
+ '(-N --no-hints)'{-N,--no-hints}'[use {type} placeholders, not {hint}]' \
26
+ '--hints[enable hint placeholders]' \
27
+ '--no-scheme-less[skip schemeless URL extraction]' \
28
+ '--scheme-less[enable schemeless URL extraction]' \
29
+ '--corpus[load/create a JSON or SQLite corpus]:corpus path:_files -g "*.(json|db|sqlite|sqlite3)"' \
30
+ '--host[host-keying strategy for clustering]:strategy:(full registrable reg none)' \
31
+ '--stats[print rolling aggregates]' \
32
+ '--reinfer[replay the source-IRI log]' \
33
+ '--propose-recognizers[propose new Recognizers from observed shapes]' \
34
+ '--cross-host-shapes[list route shapes seen across multiple hosts]' \
35
+ '--activate-above[auto-activate proposals at or above this confidence]:F:' \
36
+ '--min-observations[proposal threshold]:N:' \
37
+ '--min-coverage[proposal threshold]:F:' \
38
+ '--min-hosts[threshold for proposals and cross-host shapes]:N:' \
39
+ '1:command or file:->first' \
40
+ '*:file:->rest' \
41
+ && return 0
42
+ # Like the bash script, the word right after `completion` is a shell name.
43
+ case $state in
44
+ first)
45
+ _alternative 'commands:command:(cluster completion)' 'files:file:_files' ;;
46
+ rest)
47
+ if [[ ${words[CURRENT-1]} == completion ]]; then _alternative 'shells:shell:(bash zsh)'
48
+ else _files; fi ;;
49
+ esac
50
+ }
51
+
52
+ _iriq "$@"
@@ -0,0 +1,70 @@
1
+ # Bash completion for the `iriq` CLI.
2
+ #
3
+ # Install (pick one):
4
+ # - Persist via Homebrew: brew install dpep/tools/iriq automatically
5
+ # drops this script into Homebrew's bash-completion dir.
6
+ # - Try it out in the current shell:
7
+ # source <(iriq completion bash)
8
+ # - Persist to ~/.bashrc:
9
+ # echo 'source <(iriq completion bash)' >> ~/.bashrc
10
+ # - Or write to your system's bash completion dir:
11
+ # iriq completion bash > /usr/local/etc/bash_completion.d/iriq
12
+
13
+ _iriq() {  # completion entry point, registered below with `complete -F`
14
+ local cur prev words cword
15
+ _init_completion 2>/dev/null || {  # fallback: read the words directly
16
+ cur="${COMP_WORDS[COMP_CWORD]}"
17
+ prev="${COMP_WORDS[COMP_CWORD-1]}"
18
+ }
19
+ # Argument completion for flags that take a value.
20
+ case "$prev" in
21
+ --corpus)
22
+ # Same extension filter as the zsh script; directories stay completable.
23
+ _iriq_files '@(json|db|sqlite|sqlite3)'
24
+ return
25
+ ;;
26
+ --host)
27
+ COMPREPLY=( $(compgen -W "full registrable reg none" -- "$cur") )
28
+ return
29
+ ;;
30
+ --min-observations|--min-hosts|--min-coverage|--activate-above)
31
+ # Numeric argument — no completion candidates.
32
+ return
33
+ ;;
34
+ completion)
35
+ COMPREPLY=( $(compgen -W "bash zsh" -- "$cur") )
36
+ return
37
+ ;;
38
+ esac
39
+
40
+ # If the current token starts with `-`, complete flags.
41
+ if [[ "$cur" == -* ]]; then
42
+ local flags="-h --help -V --version -p --parse -n --normalize -c --canonical -e --explain
43
+ -j --json -J --ndjson -N --no-hints --hints --no-scheme-less
44
+ --scheme-less --corpus --host --stats --reinfer
45
+ --propose-recognizers --activate-above --cross-host-shapes
46
+ --min-observations --min-coverage --min-hosts"
47
+ COMPREPLY=( $(compgen -W "$flags" -- "$cur") )
48
+ return
49
+ fi
50
+ # The first word may be a subcommand or a file (e.g. `iriq ./access.log`).
51
+ if [[ $COMP_CWORD -eq 1 ]]; then
52
+ COMPREPLY=( $(compgen -W "cluster completion" -- "$cur") )
53
+ fi
54
+ # File candidates for every positional (e.g. `iriq cluster <file>`), appended.
55
+ _iriq_files
56
+ }
57
+
58
+ # Append file candidates for the caller's $cur to COMPREPLY: _filedir (optional
59
+ # extension extglob in $1) when bash-completion is loaded, else plain compgen -f.
60
+ _iriq_files() {
61
+ if declare -F _filedir >/dev/null; then
62
+ _filedir "$@"
63
+ else
64
+ local IFS=$'\n'  # split compgen output on newlines only, so spaces in names survive
65
+ compopt -o filenames 2>/dev/null
66
+ COMPREPLY+=( $(compgen -f -- "$cur") )
67
+ fi
68
+ }
69
+
70
+ complete -F _iriq iriq
@@ -0,0 +1,223 @@
1
+ # iriq architecture
2
+
3
+ As of v0.28.0, this describes the system that's actually in the repo —
4
+ Phase 1 of the rearchitecture roadmap (target model below) is complete,
5
+ and Phase 2 has added the learning layer on top. Originally written as
6
+ the *target* model at the start of Phase 1; the implementation has
7
+ caught up.
8
+
9
+ `ROADMAP.md` tracks what's still pending (inter-position correlations,
10
+ near-shape clustering, and Phase 3 productize items).
11
+
12
+ For the inner details, read the source: `lib/iriq/{recognizer,position,
13
+ shape,evidence,event,reducer,recognizer_proposal,synthesized_recognizer,
14
+ cross_host_shape,corpus}.rb` and their Go counterparts at the repo root.
15
+
16
+ ## Core principle
17
+
18
+ The system pivots around **Position** (a slot in a host's structure) and
19
+ **Evidence** (what we know about a Position). Strings are observations;
20
+ types are inferences. Everything composable hangs off these two nouns.
21
+
22
+ ## Core types
23
+
24
+ ### Identifier (existing, mostly unchanged)
25
+
26
+ A parsed IRI: scheme, host, port, path_segments, query_params, fragment,
27
+ plus `kind ∈ {:url, :urn}` and the urn `nss`. Produced by `Parser` from a
28
+ candidate string; produced in bulk by `Extractor` from free text.
29
+
30
+ ### Position (new)
31
+
32
+ A stable identity for a slot in a host's structure.
33
+
34
+ ```
35
+ Position {
36
+ host: string # normalized per host_strategy (full|registrable|none)
37
+ scope: :path | :query
38
+ locator: path_prefix # for :path — the typed-shape prefix that led here
39
+ | param_name # for :query — the ?key= name
40
+ }
41
+ ```
42
+
43
+ Two Identifiers occupy the *same* Position when their hosts agree and their
44
+ typed prefix agrees. "Typed prefix" means the prefix rendered with inferred
45
+ types (`/users/{integer}/...`), not raw values. This makes Position identity
46
+ robust to the specific values seen.
47
+
48
+ ### Recognizer (new)
49
+
50
+ A pluggable classifier for a single type (or a small family of types).
51
+
52
+ ```
53
+ Recognizer#try(string) -> { type:, confidence:, canonical:, notes: } | nil
54
+ ```
55
+
56
+ - `type` is a symbol from the recognized vocabulary (uuid, date, integer, …).
57
+ - `confidence` is in `[0, 1]`. Calibrated against
58
+ `spec/fixtures/calibration/`.
59
+ - `canonical` is the canonical form (e.g. ISO date for `:date`, uppercased
60
+ ISO 4217 for `:currency`). `nil` means "use the input as-is".
61
+ - `notes` is an optional array of strings the Trace view may surface.
62
+
63
+ Recognizers are independent. There is no global ordering. The
64
+ **SegmentClassifier becomes an ensemble** that runs each Recognizer and
65
+ picks the highest-confidence answer above a floor. `:literal` is the
66
+ fallback when no Recognizer fires above the floor.
67
+
68
+ ### Evidence (new)
69
+
70
+ The universal substrate for explanation. A small fixed schema, emitted by
71
+ both classification and inference.
72
+
73
+ ```
74
+ Evidence {
75
+ position: Position
76
+ kind: :lexical | :corpus | :recognizer | :neighbor | :policy
77
+ payload: { ... kind-specific fields ... }
78
+ weight: float in [0, 1] # contribution to the final inference
79
+ }
80
+ ```
81
+
82
+ Examples:
83
+ - `:recognizer` payload: `{ name:, type:, lexical_confidence: }`
84
+ - `:corpus` payload: `{ observations:, distinct_values:, cardinality_frac: }`
85
+ - `:neighbor` payload: `{ prior_literal:, inferred_hint: }`
86
+ - `:policy` payload: `{ rule:, applied: }` (e.g. "ipv4→ip umbrella collapse")
87
+
88
+ `Trace` and `Explanation` are *views* over a list of Evidence. They render
89
+ strings for human consumption but the data shape is shared.
90
+
91
+ ### Shape (new — replaces PathShape strings as the identity)
92
+
93
+ An ordered list of typed Positions for a single Identifier path:
94
+
95
+ ```
96
+ Shape { positions: [Position], inferred_types: [Symbol] }
97
+ ```
98
+
99
+ `Shape#render(:string)` produces the human-readable form (`/users/{user_id}`)
100
+ that today's `PathShape` produces. Cluster identity is the structural Shape,
101
+ not the string. String renderings are derived for display, not for keying.
102
+
103
+ ### Cluster (existing; refactored)
104
+
105
+ Same role: a group of Identifiers sharing (host, Shape). Internals shift to
106
+ key on structural Shape; aggregation (`PositionStats`) is unchanged.
107
+ Inference (variability promotion, enum detection, etc.) moves *out* of
108
+ Cluster and into its own stage so it can be re-run over stored evidence
109
+ without re-feeding observations.
110
+
111
+ ### Reducer (new)
112
+
113
+ A function over the event stream that maintains a materialized view.
114
+
115
+ ```
116
+ Reducer#apply(event, state) -> state
117
+ ```
118
+
119
+ Examples that ship in v1:
120
+ - `HostCountsReducer`
121
+ - `PositionStatsReducer`
122
+ - `ClusterReducer`
123
+ - `FingerprintReducer`
124
+
125
+ Storage backends store the event log (or, equivalently, the materialized
126
+ views — backends choose). Adding a new metric is: write a Reducer, declare
127
+ its state shape; no other module changes.
128
+
129
+ ## Pipeline (target end-state)
130
+
131
+ ```
132
+ text
133
+ └─► Extractor ─► [candidate strings]
134
+ └─► Parser ─► Identifier
135
+ └─► Recognition ─► annotated segments + per-segment Evidence
136
+ └─► Events ─► Storage event log
137
+ └─► Reducers ─► materialized views
138
+ (PositionStats, Clusters, ...)
139
+ ├─► Inference ─► per-Position type + Evidence
140
+ └─► Rendering ─► normalized string / Shape view
141
+ ```
142
+
143
+ Each stage is a separate module with input/output contracts. Inference
144
+ re-reads materialized views; it does not require re-running the pipeline.
145
+
146
+ ## What this fixes (vs. today's code)
147
+
148
+ - **No more string-fingerprint cluster keys.** Classifier tuning no longer
149
+ fractures clusters.
150
+ - **No more order-dependent first-match classifier.** Add a Recognizer
151
+ without considering global ordering.
152
+ - **`SegmentClassifier` stops being a god module.** Recognizers own their
153
+ patterns and canonical forms; reference data (locales, currencies,
154
+ countries) lives in `ReferenceData::*`; display names / variability
155
+ predicate live in a small `Policy` / `Naming` module.
156
+ - **One Normalizer.** Mechanical mode is "NullEvidence" — no special case.
157
+ - **Two-axis type taxonomy.** Lexical shape and semantic role can grow
158
+ independently. Cardinality character is a *property of Position*, not of
159
+ type — `:year`, `:http_status`, `:enum` collapse into "integer-shape +
160
+ bounded-cardinality" expressed via Evidence.
161
+ - **Re-runnable inference.** Threshold tuning no longer requires
162
+ re-observation.
163
+ - **One explanation substrate.** Trace, Explanation, future schema export,
164
+ future PR-diff annotator are all views over Evidence.
165
+
166
+ ## What stays the same
167
+
168
+ - `Identifier` as the parsed record.
169
+ - `Parser` + `Extractor` — already cleanly separated.
170
+ - `Storage::*` backends (Memory, Json, Sqlite). Internals change to persist
171
+ events rather than ad-hoc counters, but the file-extension routing API
172
+ (`Corpus.open(path)`) stays.
173
+ - Public CLI surface (`exe/iriq`, the Go `cmd/iriq` binary).
174
+ - The four `Iriq.*` module methods: `parse`, `normalize`, `explain`,
175
+ `extract`.
176
+
177
+ ## Extensibility (where it stands)
178
+
179
+ - **Recognizer registry**: per-classifier and mutable as of v0.26.
180
+ `SegmentClassifier#register_recognizer` appends to the instance's
181
+ ensemble; `SegmentClassifier::DEFAULT` is the module-level singleton
182
+ that fresh corpora share, and the first `Corpus#activate_proposal`
183
+ call swaps to a private classifier so activations don't leak.
184
+ External users registering their own Recognizer subclasses works
185
+ today via the same API; an *external* registry / discovery surface
186
+ (load Recognizers from a config file or env var) is still future
187
+ work.
188
+ - **ProposalStrategy**: pluggable via `Iriq::ProposalStrategy::DEFAULTS`.
189
+ Adding a strategy = define a class with `#propose(storage, **opts)`
190
+ and append. v1 ships one strategy (PrefixUnderscoreId); next-segment
191
+ / cross-position correlation strategies are pending Phase 2 work.
192
+ - **Reducer registry**: the dispatch table `Iriq::Reducer::DEFAULTS`
193
+ maps `Event` subclasses to reducer lambdas. Adding a metric = define
194
+ an `Event` subtype, write a Reducer, register it. The registry is
195
+ exposed as a constant; safe to monkeypatch in user code, though no
196
+ public registration API ships yet.
197
+ - **Storage backends**: three ship (Memory, JSON, SQLite). Adding a
198
+ fourth = implement the `Storage` interface (lib/iriq/storage/memory.rb
199
+ is the canonical reference), wire it into `Storage.open` extension
200
+ routing, add it to `script/cli_parity.sh`.
201
+
202
+ ## Learning layer (Phase 2, on top of the substrate above)
203
+
204
+ The Phase 1 substrate gives us typed observations, structured Shape,
205
+ and re-runnable inference. Phase 2 builds the learning pipeline on top:
206
+
207
+ - **Source-IRI log** (v0.21). Storage persists every observed canonical
208
+ IRI alongside the materialized views. `Corpus#reinfer` drops the
209
+ views and replays the log through the current classifier + reducers
210
+ — lets us tune thresholds or swap the classifier without re-feeding.
211
+ - **RecognizerProposal** (v0.23). A struct describing a learned
212
+ pattern: prefix / suggested_type / positions / hosts / coverage /
213
+ confidence / observation_count / sample_values / strategy.
214
+ Emitted by ProposalStrategy implementations; not auto-applied.
215
+ - **SynthesizedRecognizer** (v0.26). Built from a proposal's prefix;
216
+ regex `^<prefix>[A-Za-z0-9]+$`, `Specificity::SEMANTIC`. Same
217
+ Recognizer interface as the built-ins (UUID, Date, Integer) — the
218
+ ensemble doesn't know the difference.
219
+ - **CrossHostShape** (v0.27). Read-side: route shapes that recur
220
+ across multiple hosts. Independent evidence of semantic pattern.
221
+ - **Confidence formula** (v0.28). `min(1.0, coverage + 0.05 *
222
+ (host_count - 1))`. Single-host proposals are unchanged; cross-host
223
+ proposals get boosted. `--activate-above F` checks confidence.
data/docs/ROADMAP.md ADDED
@@ -0,0 +1,190 @@
1
+ # iriq roadmap
2
+
3
+ A multi-quarter plan to lift iriq from a single-pass string classifier into an
4
+ event-driven inference system over typed Positions. Pre-release; we will break
5
+ APIs freely and bump minor versions per shipped feature.
6
+
7
+ Source of the plan: principal-engineer architectural review on 2026-05-30,
8
+ captured in the project history.
9
+
10
+ ## Where to pick up next time
11
+
12
+ Last updated: 2026-05-31, current version **v0.28.0**.
13
+
14
+ **Phase 1**: complete (v0.11 → v0.19). The structural foundation —
15
+ Recognizer ensemble, Position, Shape, Evidence, single Normalizer,
16
+ events + reducers — is all in place.
17
+
18
+ **Phase 2**: 5 of 7 items shipped (v0.21 → v0.28).
19
+ - ✅ Re-runnable inference (library + CLI)
20
+ - ✅ Learned recognizers (library + propose CLI + auto-activate CLI)
21
+ - ✅ Cross-host learning (catalog + confidence integration)
22
+ - ⏭️ **Inter-position correlations** — next up. When slot A's value
23
+ predicts slot B's type, surface that ("after `/orgs/{org_id}/` the
24
+ next segment is almost always `users` or `repos`"). Read-side query
25
+ over existing cluster + position stats; no storage changes needed.
26
+ Estimated scope: similar to cross-host shape learning. One commit.
27
+ - ⏭️ **Near-shape clustering** — edit-distance over Shapes catches
28
+ near-duplicates that today require exact match. Bigger scope — needs
29
+ a Shape distance metric, candidate-pair search, and CLI for
30
+ inspection. Two or three commits.
31
+
32
+ **Phase 3**: not started. Schema export, PII mode, streaming observatory,
33
+ external recognizer registry.
34
+
35
+ **Housekeeping done**: Ruby 3.3 dropped (4d49509), shell completion
36
+ with brew auto-install (v0.25), homebrew skill updated, README has the
37
+ full learning-loop documentation, all CLI flags reflected in `-h` and
38
+ completion scripts. Homebrew formula tracks 0.28.0.
39
+
40
+ **Recommended next commit**: inter-position correlations. The
41
+ read-side seam is clean (walk clusters, group segments by their
42
+ predecessor) and the payoff is concrete (next-segment hints for
43
+ corpus-informed normalization). Falls out of the existing data with
44
+ no schema changes.
45
+
46
+ ## Operating ground rules (decisions on file)
47
+
48
+ - **Backward compatibility:** none required while pre-1.0. Break what you must.
49
+ Bump the minor version per feature.
50
+ - **Public API surface:** narrow on purpose. The four `Iriq.*` module methods
51
+ and the CLI are the contract; everything else may move freely. Dev speed
52
+ beats API stability for now.
53
+ - **Ruby ↔ Go parity:** still the goal. The current discipline (Ruby-first,
54
+ fixture regen, Go-mirror, parity test) stays in force as the *acceptance
55
+ gate*; ordering within a single feature commit is implementer's call.
56
+ - **Performance:** no hard target. Don't be wasteful. Watch out for obvious
57
+ regressions (per-segment allocation in hot paths, redundant classify calls,
58
+ etc.) but no benchmark gates yet.
59
+ - **Extensibility:** internal-only is fine for v1. Design the seams so an
60
+ external recognizer registry is a small follow-up, not a redesign.
61
+ - **Calibration data:** we generate and check in our own. See task #3.
62
+ - **Commit discipline:** one feature, one commit, immediate push. Tests must
63
+ pass: `bundle exec rspec && go test ./... && script/cli_parity.sh`.
64
+
65
+ ## Phase 1 — Year 1: structural foundation
66
+
67
+ Status: **complete**. All 9 tasks shipped in a single session, v0.11.0 → v0.19.0.
68
+
69
+ Goal: stop being a 554-line `SegmentClassifier` doing five jobs. Land the
70
+ substrate that the year-2 learning work compounds on.
71
+
72
+ Ordered task list (each was a commit checkpoint, each bumped a minor version):
73
+
74
+ 1. ✅ **Docs + plan** — this file and `ARCHITECTURE.md`. Shipped at 866525f.
75
+ 2. ✅ **Recognizer interface (uuid, date, integer)** — carved three
76
+ Recognizers out of `SegmentClassifier`. No behavior change; first-match
77
+ equivalent. Shipped in v0.12.0 (b3cc889).
78
+ 3. ✅ **Calibration corpus** — 202 labeled segments across 25 types at
79
+ `spec/fixtures/calibration/segments.json`, generated by
80
+ `script/build_calibration.rb`. Loaded by both Ruby (`calibration_spec`)
81
+ and Go (`calibration_test.go`) as a regression suite + future
82
+ calibration target. Shipped in v0.13.0 (f9eb980).
83
+ 4. ✅ **Scored ensemble** — added Specificity bands (SEMANTIC, STRUCTURED,
84
+ BOUNDED, TYPED, PATTERN, FALLBACK) on each Recognizer's Verdict and an
85
+ Ensemble helper that picks max(specificity × confidence) with stable
86
+ earlier-wins tie-break. Shipped in v0.14.0 (924f1ba).
87
+ 5. ✅ **Position promoted to first-class** — Position is { host, scope ∈
88
+ {:path,:query}, locator } — a typed slot in a host's URL structure.
89
+ Storage contracts now take Position; SQLite schema gained a `scope`
90
+ column (bumped to v2). Shipped in v0.15.0 (954e7a6).
91
+ 6. ✅ **Structured Shape** — Shape is a value object: an ordered list of
92
+ typed segment entries plus Render() / Equal() over the structured form.
93
+ PathShape became a thin wrapper. Cluster gained `#shape_object`.
94
+ Shipped in v0.16.0 (97a60c2).
95
+ 7. ✅ **Evidence records + Trace as view** — Evidence is the structured
96
+ substrate for explanation: three subject kinds (segment / position /
97
+ cluster), five sources (lexical / recognizer / corpus / neighbor /
98
+ policy). Ruby Trace was rewritten to build Evidence internally and
99
+ render it into the same `{value, type, output, notes}` hash. Go ships
100
+ the Evidence types but kept Trace internals hand-coded (output parity
101
+ preserved). Shipped in v0.17.0 (9e1133e).
102
+ 8. ✅ **One Normalizer** — folded mechanical Normalizer and
103
+ corpus-informed Corpus#normalize into one entry point with an evidence
104
+ source. NullEvidence provides classifier-only behavior; Corpus
105
+ implements the same RenderPath / RenderQuery interface for
106
+ corpus-informed rendering. Shipped in v0.18.0 (24ad168).
107
+ 9. ✅ **Events + reducers** — Corpus.observe now builds an ordered Event
108
+ list (HostSeen, PathLengthSeen, RawShapeSeen, FingerprintSeen,
109
+ per-segment PositionSeen, ClusterAddition) and applies each through
110
+ the Reducer registry. Adding a new metric = define a new Event +
111
+ Reducer; no other module changes. Event list is transient today;
112
+ future commit can persist it for re-runnable inference. Shipped in
113
+ v0.19.0 (891fd12).
114
+
115
+ End of Phase 1 deliverable: the same CLI surface, but the internals are an
116
+ evidence-driven inference system over typed Positions with pluggable
117
+ recognizers and re-runnable-inference-ready event emission.
118
+
119
+ ## Phase 2 — Year 2: learning that compounds on the new substrate
120
+
121
+ Each of these is a non-trivial multi-PR initiative.
122
+
123
+ - ✅ **Re-runnable inference (library)** — source-IRI log persisted alongside
124
+ materialized views; `Corpus#reinfer` drops the views and replays the log
125
+ through events + reducers. Lets users tune thresholds or swap the
126
+ classifier without re-feeding IRIs. Shipped in v0.21.0.
127
+ - ✅ **Re-runnable inference (CLI)** — `iriq --corpus PATH --reinfer`
128
+ replays the log and prints a before/after summary. Parity scenarios
129
+ added for JSON + SQLite backends. Shipped in v0.22.0.
130
+ - **Inter-position correlations** — when slot A's value predicts slot B's
131
+ type, surface that. Catches things like "the segment after `/orgs/{org_id}/`
132
+ is almost always `users` or `repos`".
133
+ - ✅ **Learned recognizers** — full loop:
134
+ * `Corpus#propose_recognizers` scans observed values via pluggable
135
+ ProposalStrategy strategies (v0.23.0). v1 ships PrefixUnderscoreId
136
+ (detects `ghp_…`, `cus_…`, `sk_…` shapes at slug/opaque_id
137
+ positions).
138
+ * `iriq --corpus PATH --propose-recognizers [--json]` CLI flag with
139
+ tunable `--min-observations` / `--min-coverage` / `--min-hosts`
140
+ thresholds (v0.24.0).
141
+ * `Corpus#activate_proposal(p)` promotes a proposal into a live
142
+ SynthesizedRecognizer on the corpus's classifier, persists it,
143
+ and reinfers. Reopens re-apply via `Corpus.open`. Doesn't leak
144
+ to the module-level DEFAULT classifier — corpora are isolated
145
+ (v0.26.0).
146
+ * `iriq --corpus PATH --propose-recognizers --activate-above F`
147
+ auto-activates every proposal at or above F (v0.26.0; F was matched against coverage then, and against confidence since v0.28.0).
148
+ - ✅ **Cross-host learning** — full integration:
149
+ * `Corpus#cross_host_shapes(min_hosts:)` lists route shapes that
150
+ recur across multiple hosts (independent evidence of semantic
151
+ pattern, not host-local quirk). CLI `--cross-host-shapes`. v0.27.0.
152
+ * Cross-host count wires into `RecognizerProposal#confidence`:
153
+ each additional host beyond the first adds 0.05 (capped at 1.0).
154
+ Single-host proposals get `confidence == coverage`; cross-host
155
+ proposals get boosted. `--activate-above F` checks confidence
156
+ (not raw coverage). Proposals are sorted by confidence desc.
157
+ v0.28.0.
158
+ - **Near-shape clustering** — edit-distance over Shapes catches
159
+ near-duplicates that today require exact match.
160
+
161
+ ## Phase 3 — Year 3: productize
162
+
163
+ Speculative; revisit after Phase 2.
164
+
165
+ - Exported learned schemas (OpenAPI-ish: routes per host with inferred
166
+ types per Position).
167
+ - Privacy/PII mode (recognizer-driven redaction policies).
168
+ - Streaming observatory with shape-drift alerts.
169
+ - External recognizer registry (the public surface for what Phase 1 keeps
170
+ internal).
171
+
172
+ ## Risks tracked
173
+
174
+ - **Parity tax compounds.** Every refactor is 2× surface. If a refactor's Go
175
+ port slips a release, mark it explicitly in CHANGELOG and treat it as tech
176
+ debt — don't let it sit.
177
+ - **Confidence calibration is theater without truth data.** Task #3 ships
178
+ before #4 for that reason.
179
+ - **Recognizer-set versioning.** Once we ship pluggable recognizers, stored
180
+ corpora need to know which recognizer-set produced them. Plan: stamp
181
+ `recognizer_set_id` into `PositionStats` at observation time. Defer
182
+ implementation until Phase 2.
183
+
184
+ ## How to use this document
185
+
186
+ - Update it as decisions change. It is not a frozen artifact.
187
+ - Each completed Phase 1 task: tick it off, add a one-line "shipped in vX.Y.Z"
188
+ pointer. Don't delete history.
189
+ - Bigger plan changes (reorder, scope cut, scope grow): leave the prior
190
+ list intact and append a "Plan change YYYY-MM-DD" note explaining why.
data/iriq.gemspec CHANGED
@@ -5,14 +5,14 @@ Gem::Specification.new do |s|
5
5
  s.version = Iriq::VERSION
6
6
  s.authors = ["Daniel Pepper"]
7
7
  s.description = "IRI extraction, normalization, and clustering."
8
- s.files = `git ls-files * ':!:spec' ':!:script' ':!:cmd' ':!:bin' ':!:*.go' ':!:go.mod' ':!:go.sum'`.split("\n")
8
+ s.files = `git ls-files * ':!:spec' ':!:script' ':!:bin' ':!:rust' ':!:go'`.split("\n")
9
9
  s.bindir = "exe"
10
10
  s.executables = ["iriq"]
11
11
  s.homepage = "https://github.com/dpep/iriq"
12
12
  s.license = "MIT"
13
13
  s.summary = "IRI extraction, normalization, and clustering."
14
14
 
15
- s.required_ruby_version = ">= 3.2"
15
+ s.required_ruby_version = ">= 3.4"
16
16
 
17
17
  s.add_development_dependency 'debug', '>= 1'
18
18
  s.add_development_dependency 'rspec', '>= 3.10'