iriq 0.1.0 → 0.30.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +87 -0
- data/CLAUDE.md +208 -0
- data/Gemfile.lock +8 -2
- data/Makefile +113 -0
- data/README.md +249 -270
- data/completions/_iriq +52 -0
- data/completions/iriq.bash +70 -0
- data/docs/ARCHITECTURE.md +223 -0
- data/docs/ROADMAP.md +190 -0
- data/iriq.gemspec +5 -4
- data/lib/iriq/cli.rb +402 -49
- data/lib/iriq/cluster.rb +304 -8
- data/lib/iriq/clusterer.rb +19 -44
- data/lib/iriq/corpus.rb +417 -81
- data/lib/iriq/cross_host_shape.rb +37 -0
- data/lib/iriq/event.rb +22 -0
- data/lib/iriq/evidence.rb +114 -0
- data/lib/iriq/explanation.rb +1 -1
- data/lib/iriq/normalizer.rb +71 -29
- data/lib/iriq/parser.rb +1 -1
- data/lib/iriq/path_shape.rb +30 -24
- data/lib/iriq/position.rb +75 -0
- data/lib/iriq/position_stats.rb +74 -8
- data/lib/iriq/recognizer.rb +54 -0
- data/lib/iriq/recognizer_proposal.rb +167 -0
- data/lib/iriq/recognizers/date.rb +53 -0
- data/lib/iriq/recognizers/integer.rb +37 -0
- data/lib/iriq/recognizers/uuid.rb +16 -0
- data/lib/iriq/reducer.rb +37 -0
- data/lib/iriq/registrable_domain.rb +56 -0
- data/lib/iriq/segment_classifier.rb +475 -23
- data/lib/iriq/segment_hints.rb +9 -0
- data/lib/iriq/shape.rb +106 -0
- data/lib/iriq/specificity.rb +35 -0
- data/lib/iriq/storage/json.rb +43 -0
- data/lib/iriq/storage/memory.rb +209 -0
- data/lib/iriq/storage/sqlite.rb +546 -0
- data/lib/iriq/storage.rb +35 -0
- data/lib/iriq/synthesized_recognizer.rb +56 -0
- data/lib/iriq/trace.rb +294 -0
- data/lib/iriq/version.rb +1 -1
- data/lib/iriq.rb +18 -0
- metadata +44 -8
- data/script/benchmark.rb +0 -81
- data/script/memory.rb +0 -121
data/completions/_iriq
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
#compdef iriq
|
|
2
|
+
# Zsh completion for the `iriq` CLI.
|
|
3
|
+
#
|
|
4
|
+
# Install (pick one):
|
|
5
|
+
# - Persist via Homebrew: brew install dpep/tools/iriq drops this file
|
|
6
|
+
# into Homebrew's zsh site-functions dir automatically.
|
|
7
|
+
# - Try in the current shell:
|
|
8
|
+
# source <(iriq completion zsh)
|
|
9
|
+
# - Or copy this file into a directory listed in $fpath and run
|
|
10
|
+
# `compinit` (typically run by your zshrc).
|
|
11
|
+
|
|
12
|
+
_iriq() {
|
|
13
|
+
local context state state_descr line
|
|
14
|
+
typeset -A opt_args
|
|
15
|
+
|
|
16
|
+
_arguments -C \
|
|
17
|
+
'(-h --help)'{-h,--help}'[show usage]' \
|
|
18
|
+
'(-V --version)'{-V,--version}'[print version]' \
|
|
19
|
+
'(-p --parse)'{-p,--parse}'[parsed fields section]' \
|
|
20
|
+
'(-n --normalize)'{-n,--normalize}'[normalized section]' \
|
|
21
|
+
'(-c --canonical)'{-c,--canonical}'[canonical form section]' \
|
|
22
|
+
'(-e --explain)'{-e,--explain}'[annotated trace section]' \
|
|
23
|
+
'(-j --json)'{-j,--json}'[JSON output]' \
|
|
24
|
+
'(-J --ndjson)'{-J,--ndjson}'[newline-delimited JSON]' \
|
|
25
|
+
'(-N --no-hints)'{-N,--no-hints}'[use {type} placeholders, not {hint}]' \
|
|
26
|
+
'--hints[enable hint placeholders]' \
|
|
27
|
+
'--no-scheme-less[skip schemeless URL extraction]' \
|
|
28
|
+
'--scheme-less[enable schemeless URL extraction]' \
|
|
29
|
+
'--corpus[load/create a JSON or SQLite corpus]:corpus path:_files -g "*.(json|db|sqlite|sqlite3)"' \
|
|
30
|
+
'--host[host-keying strategy for clustering]:strategy:(full registrable reg none)' \
|
|
31
|
+
'--stats[print rolling aggregates]' \
|
|
32
|
+
'--reinfer[replay the source-IRI log]' \
|
|
33
|
+
'--propose-recognizers[propose new Recognizers from observed shapes]' \
|
|
34
|
+
'--cross-host-shapes[list route shapes seen across multiple hosts]' \
|
|
35
|
+
'--activate-above[auto-activate proposals at or above this confidence]:F:' \
|
|
36
|
+
'--min-observations[proposal threshold]:N:' \
|
|
37
|
+
'--min-coverage[proposal threshold]:F:' \
|
|
38
|
+
'--min-hosts[threshold for proposals and cross-host shapes]:N:' \
|
|
39
|
+
'1:command or file:->first' \
|
|
40
|
+
'*:file:_files' \
|
|
41
|
+
&& return 0
|
|
42
|
+
|
|
43
|
+
case $state in
|
|
44
|
+
first)
|
|
45
|
+
_alternative \
|
|
46
|
+
'commands:command:(cluster completion)' \
|
|
47
|
+
'files:file:_files'
|
|
48
|
+
;;
|
|
49
|
+
esac
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
_iriq "$@"
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# Bash completion for the `iriq` CLI.
|
|
2
|
+
#
|
|
3
|
+
# Install (pick one):
|
|
4
|
+
# - Persist via Homebrew: brew install dpep/tools/iriq automatically
|
|
5
|
+
# drops this script into Homebrew's bash-completion dir.
|
|
6
|
+
# - Try it out in the current shell:
|
|
7
|
+
# source <(iriq completion bash)
|
|
8
|
+
# - Persist to ~/.bashrc:
|
|
9
|
+
# echo 'source <(iriq completion bash)' >> ~/.bashrc
|
|
10
|
+
# - Or write to your system's bash completion dir:
|
|
11
|
+
# iriq completion bash > /usr/local/etc/bash_completion.d/iriq
|
|
12
|
+
|
|
13
|
+
_iriq() {
|
|
14
|
+
local cur prev words cword
|
|
15
|
+
_init_completion 2>/dev/null || {
|
|
16
|
+
cur="${COMP_WORDS[COMP_CWORD]}"
|
|
17
|
+
prev="${COMP_WORDS[COMP_CWORD-1]}"
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
# Argument completion for flags that take a value.
|
|
21
|
+
case "$prev" in
|
|
22
|
+
--corpus)
|
|
23
|
+
# Corpus paths are file-shaped. _filedir is called with no extension
|
|
24
|
+
# filter, so it offers every file and directory rather than only
|
|
25
|
+
# *.json / *.db / *.sqlite / *.sqlite3 corpus files.
|
|
26
|
+
_filedir
|
|
27
|
+
return
|
|
28
|
+
;;
|
|
29
|
+
--host)
|
|
30
|
+
COMPREPLY=( $(compgen -W "full registrable reg none" -- "$cur") )
|
|
31
|
+
return
|
|
32
|
+
;;
|
|
33
|
+
--min-observations|--min-hosts|--min-coverage|--activate-above)
|
|
34
|
+
# Numeric argument — no completion candidates.
|
|
35
|
+
return
|
|
36
|
+
;;
|
|
37
|
+
completion)
|
|
38
|
+
COMPREPLY=( $(compgen -W "bash zsh" -- "$cur") )
|
|
39
|
+
return
|
|
40
|
+
;;
|
|
41
|
+
esac
|
|
42
|
+
|
|
43
|
+
# If the current token starts with `-`, complete flags.
|
|
44
|
+
if [[ "$cur" == -* ]]; then
|
|
45
|
+
local flags="-h --help -V --version -p --parse -n --normalize -c --canonical -e --explain
|
|
46
|
+
-j --json -J --ndjson -N --no-hints --hints --no-scheme-less
|
|
47
|
+
--scheme-less --corpus --host --stats --reinfer
|
|
48
|
+
--propose-recognizers --activate-above --cross-host-shapes
|
|
49
|
+
--min-observations --min-coverage --min-hosts"
|
|
50
|
+
COMPREPLY=( $(compgen -W "$flags" -- "$cur") )
|
|
51
|
+
return
|
|
52
|
+
fi
|
|
53
|
+
|
|
54
|
+
# First non-flag positional may be a subcommand or a file/IRI.
|
|
55
|
+
if [[ $COMP_CWORD -eq 1 ]]; then
|
|
56
|
+
COMPREPLY=( $(compgen -W "cluster completion" -- "$cur") )
|
|
57
|
+
# Also offer files for the auto-extract path (iriq ./access.log).
|
|
58
|
+
local files
|
|
59
|
+
files=$(compgen -f -- "$cur")
|
|
60
|
+
if [[ -n "$files" ]]; then
|
|
61
|
+
COMPREPLY+=( $files )
|
|
62
|
+
fi
|
|
63
|
+
return
|
|
64
|
+
fi
|
|
65
|
+
|
|
66
|
+
# Otherwise fall back to file completion (e.g. `iriq cluster <file>`).
|
|
67
|
+
_filedir
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
complete -F _iriq iriq
|
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
# iriq architecture
|
|
2
|
+
|
|
3
|
+
As of v0.28.0, this describes the system that's actually in the repo —
|
|
4
|
+
Phase 1 of the rearchitecture roadmap (target model below) is complete,
|
|
5
|
+
and Phase 2 has added the learning layer on top. Originally written as
|
|
6
|
+
the *target* model at the start of Phase 1; the implementation has
|
|
7
|
+
caught up.
|
|
8
|
+
|
|
9
|
+
`ROADMAP.md` tracks what's still pending (inter-position correlations,
|
|
10
|
+
near-shape clustering, and Phase 3 productize items).
|
|
11
|
+
|
|
12
|
+
For the inner details, read the source: `lib/iriq/{recognizer,position,
|
|
13
|
+
shape,evidence,event,reducer,recognizer_proposal,synthesized_recognizer,
|
|
14
|
+
cross_host_shape,corpus}.rb` and their Go counterparts at the repo root.
|
|
15
|
+
|
|
16
|
+
## Core principle
|
|
17
|
+
|
|
18
|
+
The system pivots around **Position** (a slot in a host's structure) and
|
|
19
|
+
**Evidence** (what we know about a Position). Strings are observations;
|
|
20
|
+
types are inferences. Everything composable hangs off these two nouns.
|
|
21
|
+
|
|
22
|
+
## Core types
|
|
23
|
+
|
|
24
|
+
### Identifier (existing, mostly unchanged)
|
|
25
|
+
|
|
26
|
+
A parsed IRI: scheme, host, port, path_segments, query_params, fragment,
|
|
27
|
+
plus `kind ∈ {:url, :urn}` and the urn `nss`. Produced by `Parser` from a
|
|
28
|
+
candidate string; produced in bulk by `Extractor` from free text.
|
|
29
|
+
|
|
30
|
+
### Position (new)
|
|
31
|
+
|
|
32
|
+
A stable identity for a slot in a host's structure.
|
|
33
|
+
|
|
34
|
+
```
|
|
35
|
+
Position {
|
|
36
|
+
host: string # normalized per host_strategy (full|registrable|none)
|
|
37
|
+
scope: :path | :query
|
|
38
|
+
locator: path_prefix # for :path — the typed-shape prefix that led here
|
|
39
|
+
| param_name # for :query — the ?key= name
|
|
40
|
+
}
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
Two Identifiers occupy the *same* Position when their hosts agree and their
|
|
44
|
+
typed prefix agrees. "Typed prefix" means the prefix rendered with inferred
|
|
45
|
+
types (`/users/{integer}/...`), not raw values. This makes Position identity
|
|
46
|
+
robust to the specific values seen.
|
|
47
|
+
|
|
48
|
+
### Recognizer (new)
|
|
49
|
+
|
|
50
|
+
A pluggable classifier for a single (or small family of) types.
|
|
51
|
+
|
|
52
|
+
```
|
|
53
|
+
Recognizer#try(string) -> { type:, confidence:, canonical:, notes: } | nil
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
- `type` is a symbol from the recognized vocabulary (uuid, date, integer, …).
|
|
57
|
+
- `confidence` is in `[0, 1]`. Calibrated against
|
|
58
|
+
`spec/fixtures/calibration/`.
|
|
59
|
+
- `canonical` is the canonical form (e.g. ISO date for `:date`, uppercased
|
|
60
|
+
ISO 4217 for `:currency`). `nil` means "use the input as-is".
|
|
61
|
+
- `notes` is an optional array of strings the Trace view may surface.
|
|
62
|
+
|
|
63
|
+
Recognizers are independent. There is no global ordering. The
|
|
64
|
+
**SegmentClassifier becomes an ensemble** that runs each Recognizer and
|
|
65
|
+
picks the highest-confidence answer above a floor. `:literal` is the
|
|
66
|
+
fallback when no Recognizer fires above the floor.
|
|
67
|
+
|
|
68
|
+
### Evidence (new)
|
|
69
|
+
|
|
70
|
+
The universal substrate for explanation. A small fixed schema, emitted by
|
|
71
|
+
both classification and inference.
|
|
72
|
+
|
|
73
|
+
```
|
|
74
|
+
Evidence {
|
|
75
|
+
position: Position
|
|
76
|
+
kind: :lexical | :corpus | :recognizer | :neighbor | :policy
|
|
77
|
+
payload: { ... kind-specific fields ... }
|
|
78
|
+
weight: float in [0, 1] # contribution to the final inference
|
|
79
|
+
}
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
Examples:
|
|
83
|
+
- `:recognizer` payload: `{ name:, type:, lexical_confidence: }`
|
|
84
|
+
- `:corpus` payload: `{ observations:, distinct_values:, cardinality_frac: }`
|
|
85
|
+
- `:neighbor` payload: `{ prior_literal:, inferred_hint: }`
|
|
86
|
+
- `:policy` payload: `{ rule:, applied: }` (e.g. "ipv4→ip umbrella collapse")
|
|
87
|
+
|
|
88
|
+
`Trace` and `Explanation` are *views* over a list of Evidence. They render
|
|
89
|
+
strings for human consumption but the data shape is shared.
|
|
90
|
+
|
|
91
|
+
### Shape (new — replaces PathShape strings as the identity)
|
|
92
|
+
|
|
93
|
+
An ordered list of typed Positions for a single Identifier path:
|
|
94
|
+
|
|
95
|
+
```
|
|
96
|
+
Shape { positions: [Position], inferred_types: [Symbol] }
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
`Shape#render(:string)` produces the human-readable form (`/users/{user_id}`)
|
|
100
|
+
that today's `PathShape` produces. Cluster identity is the structural Shape,
|
|
101
|
+
not the string. String renderings are derived for display, not for keying.
|
|
102
|
+
|
|
103
|
+
### Cluster (existing; refactored)
|
|
104
|
+
|
|
105
|
+
Same role: a group of Identifiers sharing (host, Shape). Internals shift to
|
|
106
|
+
key on structural Shape; aggregation (`PositionStats`) is unchanged.
|
|
107
|
+
Inference (variability promotion, enum detection, etc.) moves *out* of
|
|
108
|
+
Cluster and into its own stage so it can be re-run over stored evidence
|
|
109
|
+
without re-feeding observations.
|
|
110
|
+
|
|
111
|
+
### Reducer (new)
|
|
112
|
+
|
|
113
|
+
A function over the event stream that maintains a materialized view.
|
|
114
|
+
|
|
115
|
+
```
|
|
116
|
+
Reducer#apply(event, state) -> state
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
Examples ship in v1:
|
|
120
|
+
- `HostCountsReducer`
|
|
121
|
+
- `PositionStatsReducer`
|
|
122
|
+
- `ClusterReducer`
|
|
123
|
+
- `FingerprintReducer`
|
|
124
|
+
|
|
125
|
+
Storage backends store the event log (or, equivalently, the materialized
|
|
126
|
+
views — backends choose). Adding a new metric is: write a Reducer, declare
|
|
127
|
+
its state shape; no other module changes.
|
|
128
|
+
|
|
129
|
+
## Pipeline (target end-state)
|
|
130
|
+
|
|
131
|
+
```
|
|
132
|
+
text
|
|
133
|
+
└─► Extractor ─► [candidate strings]
|
|
134
|
+
└─► Parser ─► Identifier
|
|
135
|
+
└─► Recognition ─► annotated segments + per-segment Evidence
|
|
136
|
+
└─► Events ─► Storage event log
|
|
137
|
+
└─► Reducers ─► materialized views
|
|
138
|
+
(PositionStats, Clusters, ...)
|
|
139
|
+
├─► Inference ─► per-Position type + Evidence
|
|
140
|
+
└─► Rendering ─► normalized string / Shape view
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
Each stage is a separate module with input/output contracts. Inference
|
|
144
|
+
re-reads materialized views; it does not require re-running the pipeline.
|
|
145
|
+
|
|
146
|
+
## What this fixes (vs. today's code)
|
|
147
|
+
|
|
148
|
+
- **No more string-fingerprint cluster keys.** Classifier tuning no longer
|
|
149
|
+
fractures clusters.
|
|
150
|
+
- **No more order-dependent first-match classifier.** Add a Recognizer
|
|
151
|
+
without considering global ordering.
|
|
152
|
+
- **`SegmentClassifier` stops being a god module.** Recognizers own their
|
|
153
|
+
patterns and canonical forms; reference data (locales, currencies,
|
|
154
|
+
countries) lives in `ReferenceData::*`; display names / variability
|
|
155
|
+
predicate live in a small `Policy` / `Naming` module.
|
|
156
|
+
- **One Normalizer.** Mechanical mode is "NullEvidence" — no special case.
|
|
157
|
+
- **Two-axis type taxonomy.** Lexical shape and semantic role can grow
|
|
158
|
+
independently. Cardinality character is a *property of Position*, not of
|
|
159
|
+
type — `:year`, `:http_status`, `:enum` collapse into "integer-shape +
|
|
160
|
+
bounded-cardinality" expressed via Evidence.
|
|
161
|
+
- **Re-runnable inference.** Threshold tuning no longer requires
|
|
162
|
+
re-observation.
|
|
163
|
+
- **One explanation substrate.** Trace, Explanation, future schema export,
|
|
164
|
+
future PR-diff annotator are all views over Evidence.
|
|
165
|
+
|
|
166
|
+
## What stays the same
|
|
167
|
+
|
|
168
|
+
- `Identifier` as the parsed record.
|
|
169
|
+
- `Parser` + `Extractor` — already cleanly separated.
|
|
170
|
+
- `Storage::*` backends (Memory, Json, Sqlite). Internals change to persist
|
|
171
|
+
events rather than ad-hoc counters, but the file-extension routing API
|
|
172
|
+
(`Corpus.open(path)`) stays.
|
|
173
|
+
- Public CLI surface (`exe/iriq`, the Go `cmd/iriq` binary).
|
|
174
|
+
- The four `Iriq.*` module methods: `parse`, `normalize`, `explain`,
|
|
175
|
+
`extract`.
|
|
176
|
+
|
|
177
|
+
## Extensibility (where it stands)
|
|
178
|
+
|
|
179
|
+
- **Recognizer registry**: per-classifier and mutable as of v0.26.
|
|
180
|
+
`SegmentClassifier#register_recognizer` appends to the instance's
|
|
181
|
+
ensemble; `SegmentClassifier::DEFAULT` is the module-level singleton
|
|
182
|
+
that fresh corpora share, and the first `Corpus#activate_proposal`
|
|
183
|
+
call swaps to a private classifier so activations don't leak.
|
|
184
|
+
External users registering their own Recognizer subclasses works
|
|
185
|
+
today via the same API; an *external* registry / discovery surface
|
|
186
|
+
(load Recognizers from a config file or env var) is still future
|
|
187
|
+
work.
|
|
188
|
+
- **ProposalStrategy**: pluggable via `Iriq::ProposalStrategy::DEFAULTS`.
|
|
189
|
+
Adding a strategy = define a class with `#propose(storage, **opts)`
|
|
190
|
+
and append. v1 ships one strategy (PrefixUnderscoreId); next-segment
|
|
191
|
+
/ cross-position correlation strategies are pending Phase 2 work.
|
|
192
|
+
- **Reducer registry**: the dispatch table `Iriq::Reducer::DEFAULTS`
|
|
193
|
+
maps `Event` subclasses to reducer lambdas. Adding a metric = define
|
|
194
|
+
an `Event` subtype, write a Reducer, register it. The registry is
|
|
195
|
+
exposed as a constant; safe to monkeypatch in user code, though no
|
|
196
|
+
public registration API ships yet.
|
|
197
|
+
- **Storage backends**: three ship (Memory, JSON, SQLite). Adding a
|
|
198
|
+
fourth = implement the `Storage` interface (lib/iriq/storage/memory.rb
|
|
199
|
+
is the canonical reference), wire it into `Storage.open` extension
|
|
200
|
+
routing, add it to `script/cli_parity.sh`.
|
|
201
|
+
|
|
202
|
+
## Learning layer (Phase 2, on top of the substrate above)
|
|
203
|
+
|
|
204
|
+
The Phase 1 substrate gives us typed observations, structured Shape,
|
|
205
|
+
and re-runnable inference. Phase 2 builds the learning pipeline on top:
|
|
206
|
+
|
|
207
|
+
- **Source-IRI log** (v0.21). Storage persists every observed canonical
|
|
208
|
+
IRI alongside the materialized views. `Corpus#reinfer` drops the
|
|
209
|
+
views and replays the log through the current classifier + reducers
|
|
210
|
+
— lets us tune thresholds or swap the classifier without re-feeding.
|
|
211
|
+
- **RecognizerProposal** (v0.23). A struct describing a learned
|
|
212
|
+
pattern: prefix / suggested_type / positions / hosts / coverage /
|
|
213
|
+
confidence / observation_count / sample_values / strategy.
|
|
214
|
+
Emitted by ProposalStrategy implementations; not auto-applied.
|
|
215
|
+
- **SynthesizedRecognizer** (v0.26). Built from a proposal's prefix;
|
|
216
|
+
regex `^<prefix>[A-Za-z0-9]+$`, `Specificity::SEMANTIC`. Same
|
|
217
|
+
Recognizer interface as the built-ins (UUID, Date, Integer) — the
|
|
218
|
+
ensemble doesn't know the difference.
|
|
219
|
+
- **CrossHostShape** (v0.27). Read-side: route shapes that recur
|
|
220
|
+
across multiple hosts. Independent evidence of semantic pattern.
|
|
221
|
+
- **Confidence formula** (v0.28). `min(1.0, coverage + 0.05 *
|
|
222
|
+
(host_count - 1))`. Single-host proposals are unchanged; cross-host
|
|
223
|
+
proposals get boosted. `--activate-above F` checks confidence.
|
data/docs/ROADMAP.md
ADDED
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
# iriq roadmap
|
|
2
|
+
|
|
3
|
+
A multi-quarter plan to lift iriq from a single-pass string classifier into an
|
|
4
|
+
event-driven inference system over typed Positions. Pre-release; we will break
|
|
5
|
+
APIs freely and bump minor versions per shipped feature.
|
|
6
|
+
|
|
7
|
+
Source of the plan: principal-engineer architectural review on 2026-05-30,
|
|
8
|
+
captured in the project history.
|
|
9
|
+
|
|
10
|
+
## Where to pick up next time
|
|
11
|
+
|
|
12
|
+
Last updated: 2026-05-31, current version **v0.28.0**.
|
|
13
|
+
|
|
14
|
+
**Phase 1**: complete (v0.11 → v0.19). The structural foundation —
|
|
15
|
+
Recognizer ensemble, Position, Shape, Evidence, single Normalizer,
|
|
16
|
+
events + reducers — is all in place.
|
|
17
|
+
|
|
18
|
+
**Phase 2**: 5 of 7 items shipped (v0.21 → v0.28).
|
|
19
|
+
- ✅ Re-runnable inference (library + CLI)
|
|
20
|
+
- ✅ Learned recognizers (library + propose CLI + auto-activate CLI)
|
|
21
|
+
- ✅ Cross-host learning (catalog + confidence integration)
|
|
22
|
+
- ⏭️ **Inter-position correlations** — next up. When slot A's value
|
|
23
|
+
predicts slot B's type, surface that ("after `/orgs/{org_id}/` the
|
|
24
|
+
next segment is almost always `users` or `repos`"). Read-side query
|
|
25
|
+
over existing cluster + position stats; no storage changes needed.
|
|
26
|
+
Estimated scope: similar to cross-host shape learning. One commit.
|
|
27
|
+
- ⏭️ **Near-shape clustering** — edit-distance over Shapes catches
|
|
28
|
+
near-duplicates that today require exact match. Bigger scope — needs
|
|
29
|
+
a Shape distance metric, candidate-pair search, and CLI for
|
|
30
|
+
inspection. Two or three commits.
|
|
31
|
+
|
|
32
|
+
**Phase 3**: not started. Schema export, PII mode, streaming observatory,
|
|
33
|
+
external recognizer registry.
|
|
34
|
+
|
|
35
|
+
**Housekeeping done**: Ruby 3.3 dropped (4d49509), shell completion
|
|
36
|
+
with brew auto-install (v0.25), homebrew skill updated, README has the
|
|
37
|
+
full learning-loop documentation, all CLI flags reflected in `-h` and
|
|
38
|
+
completion scripts. Homebrew formula tracks 0.28.0.
|
|
39
|
+
|
|
40
|
+
**Recommended next commit**: inter-position correlations. The
|
|
41
|
+
read-side seam is clean (walk clusters, group segments by their
|
|
42
|
+
predecessor) and the payoff is concrete (next-segment hints for
|
|
43
|
+
corpus-informed normalization). Falls out of the existing data with
|
|
44
|
+
no schema changes.
|
|
45
|
+
|
|
46
|
+
## Operating ground rules (decisions on file)
|
|
47
|
+
|
|
48
|
+
- **Backward compatibility:** none required while pre-1.0. Break what you must.
|
|
49
|
+
Bump the minor version per feature.
|
|
50
|
+
- **Public API surface:** narrow on purpose. The four `Iriq.*` module methods
|
|
51
|
+
and the CLI are the contract; everything else may move freely. Dev speed
|
|
52
|
+
beats API stability for now.
|
|
53
|
+
- **Ruby ↔ Go parity:** still the goal. The current discipline (Ruby-first,
|
|
54
|
+
fixture regen, Go-mirror, parity test) stays in force as the *acceptance
|
|
55
|
+
gate*; ordering within a single feature commit is implementer's call.
|
|
56
|
+
- **Performance:** no hard target. Don't be wasteful. Watch out for obvious
|
|
57
|
+
regressions (per-segment allocation in hot paths, redundant classify calls,
|
|
58
|
+
etc.) but no benchmark gates yet.
|
|
59
|
+
- **Extensibility:** internal-only is fine for v1. Design the seams so an
|
|
60
|
+
external recognizer registry is a small follow-up, not a redesign.
|
|
61
|
+
- **Calibration data:** we generate and check in our own. See task #3.
|
|
62
|
+
- **Commit discipline:** one feature, one commit, immediate push. Tests must
|
|
63
|
+
pass: `bundle exec rspec && go test ./... && script/cli_parity.sh`.
|
|
64
|
+
|
|
65
|
+
## Phase 1 — Year 1: structural foundation
|
|
66
|
+
|
|
67
|
+
Status: **complete**. All 9 tasks shipped in a single session, v0.11.0 → v0.19.0.
|
|
68
|
+
|
|
69
|
+
Goal: stop being a 554-line `SegmentClassifier` doing five jobs. Land the
|
|
70
|
+
substrate that the year-2 learning work compounds on.
|
|
71
|
+
|
|
72
|
+
Ordered task list (each was a commit checkpoint, each bumped a minor version):
|
|
73
|
+
|
|
74
|
+
1. ✅ **Docs + plan** — this file and `ARCHITECTURE.md`. Shipped at 866525f.
|
|
75
|
+
2. ✅ **Recognizer interface (uuid, date, integer)** — carved three
|
|
76
|
+
Recognizers out of `SegmentClassifier`. No behavior change; first-match
|
|
77
|
+
equivalent. Shipped in v0.12.0 (b3cc889).
|
|
78
|
+
3. ✅ **Calibration corpus** — 202 labeled segments across 25 types at
|
|
79
|
+
`spec/fixtures/calibration/segments.json`, generated by
|
|
80
|
+
`script/build_calibration.rb`. Loaded by both Ruby (`calibration_spec`)
|
|
81
|
+
and Go (`calibration_test.go`) as a regression suite + future
|
|
82
|
+
calibration target. Shipped in v0.13.0 (f9eb980).
|
|
83
|
+
4. ✅ **Scored ensemble** — added Specificity bands (SEMANTIC, STRUCTURED,
|
|
84
|
+
BOUNDED, TYPED, PATTERN, FALLBACK) on each Recognizer's Verdict and an
|
|
85
|
+
Ensemble helper that picks max(specificity × confidence) with stable
|
|
86
|
+
earlier-wins tie-break. Shipped in v0.14.0 (924f1ba).
|
|
87
|
+
5. ✅ **Position promoted to first-class** — Position is { host, scope ∈
|
|
88
|
+
{:path,:query}, locator } — a typed slot in a host's URL structure.
|
|
89
|
+
Storage contracts now take Position; SQLite schema gained a `scope`
|
|
90
|
+
column (bumped to v2). Shipped in v0.15.0 (954e7a6).
|
|
91
|
+
6. ✅ **Structured Shape** — Shape is a value object: an ordered list of
|
|
92
|
+
typed segment entries plus Render() / Equal() over the structured form.
|
|
93
|
+
PathShape became a thin wrapper. Cluster gained `#shape_object`.
|
|
94
|
+
Shipped in v0.16.0 (97a60c2).
|
|
95
|
+
7. ✅ **Evidence records + Trace as view** — Evidence is the structured
|
|
96
|
+
substrate for explanation: three subject kinds (segment / position /
|
|
97
|
+
cluster), five sources (lexical / recognizer / corpus / neighbor /
|
|
98
|
+
policy). Ruby Trace was rewritten to build Evidence internally and
|
|
99
|
+
render it into the same `{value, type, output, notes}` hash. Go ships
|
|
100
|
+
the Evidence types but kept Trace internals hand-coded (output parity
|
|
101
|
+
preserved). Shipped in v0.17.0 (9e1133e).
|
|
102
|
+
8. ✅ **One Normalizer** — folded mechanical Normalizer and
|
|
103
|
+
corpus-informed Corpus#normalize into one entry point with an evidence
|
|
104
|
+
source. NullEvidence provides classifier-only behavior; Corpus
|
|
105
|
+
implements the same RenderPath / RenderQuery interface for
|
|
106
|
+
corpus-informed rendering. Shipped in v0.18.0 (24ad168).
|
|
107
|
+
9. ✅ **Events + reducers** — Corpus.observe now builds an ordered Event
|
|
108
|
+
list (HostSeen, PathLengthSeen, RawShapeSeen, FingerprintSeen,
|
|
109
|
+
per-segment PositionSeen, ClusterAddition) and applies each through
|
|
110
|
+
the Reducer registry. Adding a new metric = define a new Event +
|
|
111
|
+
Reducer; no other module changes. Event list is transient today;
|
|
112
|
+
future commit can persist it for re-runnable inference. Shipped in
|
|
113
|
+
v0.19.0 (891fd12).
|
|
114
|
+
|
|
115
|
+
End of Phase 1 deliverable: the same CLI surface, but the internals are an
|
|
116
|
+
evidence-driven inference system over typed Positions with pluggable
|
|
117
|
+
recognizers and re-runnable-inference-ready event emission.
|
|
118
|
+
|
|
119
|
+
## Phase 2 — Year 2: learning that compounds on the new substrate
|
|
120
|
+
|
|
121
|
+
Each of these is a non-trivial multi-PR initiative.
|
|
122
|
+
|
|
123
|
+
- ✅ **Re-runnable inference (library)** — source-IRI log persisted alongside
|
|
124
|
+
materialized views; `Corpus#reinfer` drops the views and replays the log
|
|
125
|
+
through events + reducers. Lets users tune thresholds or swap the
|
|
126
|
+
classifier without re-feeding IRIs. Shipped in v0.21.0.
|
|
127
|
+
- ✅ **Re-runnable inference (CLI)** — `iriq --corpus PATH --reinfer`
|
|
128
|
+
replays the log and prints a before/after summary. Parity scenarios
|
|
129
|
+
added for JSON + SQLite backends. Shipped in v0.22.0.
|
|
130
|
+
- **Inter-position correlations** — when slot A's value predicts slot B's
|
|
131
|
+
type, surface that. Catches things like "the segment after `/orgs/{org_id}/`
|
|
132
|
+
is almost always `users` or `repos`".
|
|
133
|
+
- ✅ **Learned recognizers** — full loop:
|
|
134
|
+
* `Corpus#propose_recognizers` scans observed values via pluggable
|
|
135
|
+
ProposalStrategy implementations (v0.23.0). v1 ships PrefixUnderscoreId
|
|
136
|
+
(detects `ghp_…`, `cus_…`, `sk_…` shapes at slug/opaque_id
|
|
137
|
+
positions).
|
|
138
|
+
* `iriq --corpus PATH --propose-recognizers [--json]` CLI flag with
|
|
139
|
+
tunable `--min-observations` / `--min-coverage` / `--min-hosts`
|
|
140
|
+
thresholds (v0.24.0).
|
|
141
|
+
* `Corpus#activate_proposal(p)` promotes a proposal into a live
|
|
142
|
+
SynthesizedRecognizer on the corpus's classifier, persists it,
|
|
143
|
+
and reinfers. Reopening via `Corpus.open` re-applies it. Doesn't leak
|
|
144
|
+
to the module-level DEFAULT classifier — corpora are isolated
|
|
145
|
+
(v0.26.0).
|
|
146
|
+
* `iriq --corpus PATH --propose-recognizers --activate-above F`
|
|
147
|
+
auto-activates every proposal at or above F (v0.26.0). F was compared against coverage then; since v0.28.0 it is compared against confidence.
|
|
148
|
+
- ✅ **Cross-host learning** — full integration:
|
|
149
|
+
* `Corpus#cross_host_shapes(min_hosts:)` lists route shapes that
|
|
150
|
+
recur across multiple hosts (independent evidence of semantic
|
|
151
|
+
pattern, not host-local quirk). CLI `--cross-host-shapes`. v0.27.0.
|
|
152
|
+
* Cross-host count wires into `RecognizerProposal#confidence`:
|
|
153
|
+
each additional host beyond the first adds 0.05 (capped at 1.0).
|
|
154
|
+
Single-host proposals get `confidence == coverage`; cross-host
|
|
155
|
+
proposals get boosted. `--activate-above F` checks confidence
|
|
156
|
+
(not raw coverage). Proposals are sorted by confidence desc.
|
|
157
|
+
v0.28.0.
|
|
158
|
+
- **Near-shape clustering** — edit-distance over Shapes catches
|
|
159
|
+
near-duplicates that today require exact match.
|
|
160
|
+
|
|
161
|
+
## Phase 3 — Year 3: productize
|
|
162
|
+
|
|
163
|
+
Speculative; revisit after Phase 2.
|
|
164
|
+
|
|
165
|
+
- Exported learned schemas (OpenAPI-ish: routes per host with inferred
|
|
166
|
+
types per Position).
|
|
167
|
+
- Privacy/PII mode (recognizer-driven redaction policies).
|
|
168
|
+
- Streaming observatory with shape-drift alerts.
|
|
169
|
+
- External recognizer registry (the public surface for what Phase 1 keeps
|
|
170
|
+
internal).
|
|
171
|
+
|
|
172
|
+
## Risks tracked
|
|
173
|
+
|
|
174
|
+
- **Parity tax compounds.** Every refactor is 2× surface. If a refactor's Go
|
|
175
|
+
port slips a release, mark it explicitly in CHANGELOG and treat it as tech
|
|
176
|
+
debt — don't let it sit.
|
|
177
|
+
- **Confidence calibration is theater without truth data.** Task #3 ships
|
|
178
|
+
before #4 for that reason.
|
|
179
|
+
- **Recognizer-set versioning.** Once we ship pluggable recognizers, stored
|
|
180
|
+
corpora need to know which recognizer-set produced them. Plan: stamp
|
|
181
|
+
`recognizer_set_id` into `PositionStats` at observation time. Defer
|
|
182
|
+
implementation until Phase 2.
|
|
183
|
+
|
|
184
|
+
## How to use this document
|
|
185
|
+
|
|
186
|
+
- Update it as decisions change. It is not a frozen artifact.
|
|
187
|
+
- Each completed Phase 1 task: tick it off, add a one-line "shipped in vX.Y.Z"
|
|
188
|
+
pointer. Don't delete history.
|
|
189
|
+
- Bigger plan changes (reorder, scope cut, scope grow): leave the prior
|
|
190
|
+
list intact and append a "Plan change YYYY-MM-DD" note explaining why.
|
data/iriq.gemspec
CHANGED
|
@@ -4,18 +4,19 @@ Gem::Specification.new do |s|
|
|
|
4
4
|
s.name = "iriq"
|
|
5
5
|
s.version = Iriq::VERSION
|
|
6
6
|
s.authors = ["Daniel Pepper"]
|
|
7
|
-
s.description = "
|
|
8
|
-
s.files = `git ls-files * ':!:spec'`.split("\n")
|
|
7
|
+
s.description = "IRI extraction, normalization, and clustering."
|
|
8
|
+
s.files = `git ls-files * ':!:spec' ':!:script' ':!:bin' ':!:rust' ':!:go'`.split("\n")
|
|
9
9
|
s.bindir = "exe"
|
|
10
10
|
s.executables = ["iriq"]
|
|
11
11
|
s.homepage = "https://github.com/dpep/iriq"
|
|
12
12
|
s.license = "MIT"
|
|
13
|
-
s.summary = "
|
|
13
|
+
s.summary = "IRI extraction, normalization, and clustering."
|
|
14
14
|
|
|
15
|
-
s.required_ruby_version = ">= 3.
|
|
15
|
+
s.required_ruby_version = ">= 3.4"
|
|
16
16
|
|
|
17
17
|
s.add_development_dependency 'debug', '>= 1'
|
|
18
18
|
s.add_development_dependency 'rspec', '>= 3.10'
|
|
19
19
|
s.add_development_dependency 'rspec-debugging'
|
|
20
20
|
s.add_development_dependency 'simplecov', '>= 0.22'
|
|
21
|
+
s.add_development_dependency 'sqlite3', '>= 1.6'
|
|
21
22
|
end
|