iriq 0.1.0 → 0.30.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +87 -0
  3. data/CLAUDE.md +208 -0
  4. data/Gemfile.lock +8 -2
  5. data/Makefile +113 -0
  6. data/README.md +249 -270
  7. data/completions/_iriq +52 -0
  8. data/completions/iriq.bash +70 -0
  9. data/docs/ARCHITECTURE.md +223 -0
  10. data/docs/ROADMAP.md +190 -0
  11. data/iriq.gemspec +5 -4
  12. data/lib/iriq/cli.rb +402 -49
  13. data/lib/iriq/cluster.rb +304 -8
  14. data/lib/iriq/clusterer.rb +19 -44
  15. data/lib/iriq/corpus.rb +417 -81
  16. data/lib/iriq/cross_host_shape.rb +37 -0
  17. data/lib/iriq/event.rb +22 -0
  18. data/lib/iriq/evidence.rb +114 -0
  19. data/lib/iriq/explanation.rb +1 -1
  20. data/lib/iriq/normalizer.rb +71 -29
  21. data/lib/iriq/parser.rb +1 -1
  22. data/lib/iriq/path_shape.rb +30 -24
  23. data/lib/iriq/position.rb +75 -0
  24. data/lib/iriq/position_stats.rb +74 -8
  25. data/lib/iriq/recognizer.rb +54 -0
  26. data/lib/iriq/recognizer_proposal.rb +167 -0
  27. data/lib/iriq/recognizers/date.rb +53 -0
  28. data/lib/iriq/recognizers/integer.rb +37 -0
  29. data/lib/iriq/recognizers/uuid.rb +16 -0
  30. data/lib/iriq/reducer.rb +37 -0
  31. data/lib/iriq/registrable_domain.rb +56 -0
  32. data/lib/iriq/segment_classifier.rb +475 -23
  33. data/lib/iriq/segment_hints.rb +9 -0
  34. data/lib/iriq/shape.rb +106 -0
  35. data/lib/iriq/specificity.rb +35 -0
  36. data/lib/iriq/storage/json.rb +43 -0
  37. data/lib/iriq/storage/memory.rb +209 -0
  38. data/lib/iriq/storage/sqlite.rb +546 -0
  39. data/lib/iriq/storage.rb +35 -0
  40. data/lib/iriq/synthesized_recognizer.rb +56 -0
  41. data/lib/iriq/trace.rb +294 -0
  42. data/lib/iriq/version.rb +1 -1
  43. data/lib/iriq.rb +18 -0
  44. metadata +44 -8
  45. data/script/benchmark.rb +0 -81
  46. data/script/memory.rb +0 -121
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 95d6bc09f7de65bcb4acc5db3ad68d1b83c326360d913c45aede66038a100461
4
- data.tar.gz: ae58d39b77fce3041cc5561b575dea52706b38f4cfab11d4881cf2f644f6cc59
3
+ metadata.gz: 598c04e3c1777787ae9e5d1be98e2bc68d441e2020ebe7743bdf8075b20fdaec
4
+ data.tar.gz: 396ad6b0b0acffb76b7bc2b4e31792b02ac65749c6ac769fd70a47ce5d806496
5
5
  SHA512:
6
- metadata.gz: a8fa85f112d9766ff4e9e4ad60c1043ce9701fe42e6dbd01f70da39fa0dd554bff573f40c858d88ab1e1539de0c1f017609535cc57e8e4fcc6651f0d20697e60
7
- data.tar.gz: 97c714e664874c08278a305b8beff470b68bd2b7e4f58977cc44cf2e3716e619d17011a33d77b8ea9356eec0715acb6c815252551edf64db16eec4599f816274
6
+ metadata.gz: 16637ff46f4648a2cfc14404ed074d3cedc6fc08cabf0c46a6fa7a39553b8b78020907fde1d9024005cec3a4f2cdb4cb3e999802ec866a942c20c15be6c7af34
7
+ data.tar.gz: 458aa6deba73a571a07801bb3df445a4d08500a55c3d1d8c50b11df0811dbc785270fd1ef908ebbb58c6f53dfb1ecfc7013fe137e216e88bca81c9bc56d21fa4
data/CHANGELOG.md CHANGED
@@ -1,3 +1,90 @@
1
+ ### 0.30.2 (2026-06-23)
2
+ - Piped stdin and `--file` now **stream** the per-IRI sections (`-n`/`-p`/`-c`/`-e`) line by line, flushing each IRI as it's processed — `tail -f access.log | iriq -n` is live and memory stays bounded on huge inputs. Output is byte-identical to before; the aggregate views (deduped URL list, clusters, `--stats`) still read the whole input. Ruby, Go, and Rust.
3
+
4
+ ### 0.30.1 (2026-06-21)
5
+ - Batch sections (`--normalize` etc.) are now corpus-informed when `--corpus` is supplied, matching single-input behavior.
6
+ - Added a CLI end-to-end test suite (sections, formats, batch/cluster, subcommands) and a `make check` Rust gate + pre-push hook.
7
+
8
+ ### 0.30.0 (2026-06-21)
9
+ - Rust consolidated into a single crate (library + `iriq` binary) with SQLite always on by default — no separate sqlite build.
10
+ - Go moved into the `go/` subdirectory; import path is now `github.com/dpep/iriq/go`.
11
+
12
+ ### 0.11.0 (2026-05-27)
13
+ - New classifier types: `:color` (hex form `#fff`/`#ffffff`/`#ffffff80`), `:coordinate` (`lat,lng` pair with plausible-range validation), `:country` (ISO 3166-1 alpha-2, allowlisted), `:base64` (≥16 chars with `+`/`/`/`=` to disambiguate from `:opaque_id`).
14
+ - `SegmentClassifier.color_kind(value)` / `ColorKind(value)` returns `:hex` for hex-shaped colors — placeholder for future named / rgb / hsl support, mirrors the file_kind pattern.
15
+ - Param-name hint map extended: `color`/`bg`/`fg`/`background`/`foreground` → `:color`, `coords`/`coordinates`/`geo`/`location`/`position`/`latlng` → `:coordinate`, `country`/`country_code`/`nation` → `:country`.
16
+ - `-J` is now a short alias for `--ndjson` (combinable: `iriq -nJ < file`).
17
+ - New CLI `-e/--explain` flag — annotated normalization trace. For each path segment / query param, shows the value, type, output (placeholder or canonical value), and notes for every non-obvious transformation (hint suppression for semantic types, currency upcase, IP umbrella collapse, canonical date, param-name lift). JSON via `-e -j` returns the same structure.
18
+ - Library API: `Iriq::Trace.for(input)` (Ruby) / `iriq.Trace(input)` (Go) returns the same trace data structure.
19
+ - Classifier perf: each regex test is now gated on a cheap composition check (`String#include?` / `IndexByte` / `size`) so a literal like `"users"` skips ~20 regex matches instead of walking the full chain. Measured: Ruby normalize +12%, extract +27%; Go CLI wall time -25%.
20
+
21
+ ### 0.10.0 (2026-05-27)
22
+ - New classifier type `:file` — `name.ext` shape where `ext` is in a curated allowlist spanning image / document / data / text / web / audio / video / archive / code kinds. `image.png` and `report.pdf` classify as `:file` instead of falling through to `:opaque_id`. The per-extension kind (`:image`, `:document`, etc.) is surfaced via `SegmentClassifier.file_kind(value)` / `FileKindOf(value)` for verbose displays.
23
+ - `Cluster#param_summary` adds `:kind_distribution` for `:file`-typed params — buckets observed values by kind. Best-effort: only reflects values within the tracking cap.
24
+ - New phone format: NANP-style `555-666-7777`, `555.666.7777`, `(555) 666-7777`. Leading area-code + exchange digits constrained to 2-9 so dotted version strings / digit blobs don't shadow. The `+` E.164 form still covers international.
25
+ - Param-name hints — when a value's type is generic (`:literal`, `:opaque_id`, `:slug`), the param name can supply the type. `?phone=unknown` becomes `{phone}` and `?email=tbd` becomes `{email}`. Hint map covers phone/email/locale/currency/url/jwt/mime variations. Specific value types (e.g. `?phone=12345` → `:integer`) still win.
26
+
27
+ ### 0.9.0 (2026-05-27)
28
+ - Semantic types (`:version`, `:locale`, `:currency`, `:date`, `:boolean`, `:timestamp`, etc.) now surface as `{type}` placeholders instead of being run through the noun-singularize hint. `/api/v1/status` renders `/api/{version}/status` rather than the misleading `/api/{api_id}/status`. Only ID-shaped types (`:integer`, `:uuid`, `:hash`, `:opaque_id`, `:slug`) keep the `{noun_id}` form.
29
+ - `--normalize` collapses `:ipv4` and `:ipv6` to `{ip}` in placeholder form (previously rendered as `{ipv4}` / `{ipv6}`). The classifier still tracks the specific family; cluster summary keeps the distinct types.
30
+ - `--normalize` canonicalizes currency segments and params to ISO 4217 upper case — `/pricing/usd` → `/pricing/USD`, `?currency=eur` → `?currency=EUR`. Mirrors the existing date canonicalization (`canonical_currencies: true` flag on `PathShape`).
31
+ - `LOCALE_RE` tightened: the region/script portion now caps at 2-4 alphanumeric chars and the language portion is validated against the ISO 639-1 allowlist — `by-locale` no longer wrongly classifies as `:locale`.
32
+ - New classifier types: `:phone` (E.164 — `+` then 7-15 digits with optional separators), `:jwt` (three base64url segments separated by dots), `:mime` (RFC 2046 top-level type + subtype, e.g. `image/png`, `application/vnd.api+json`).
33
+ - New corpus-promoted type `:http_status` — integer positions whose observed range falls inside 100..599 with ≥2 distinct values and ≥5 samples get promoted. Same range-analysis pattern as `:year`.
34
+ - Scheme-less URL detection: query values like `?redirect=foo.com/path` classify as `:url`. Requires a dotted host with a TLD-like ≥2-letter suffix followed by a slash, so `image.png` stays as `:opaque_id`.
35
+ - `Cluster#param_summary` adds two new fields:
36
+ - `:value_distribution` — fractions per tracked value, for `:boolean` and `:enum` positions (e.g. `{ "true" => 0.97, "false" => 0.03 }`). Same data already in `value_counts`, surfaced as ratios.
37
+ - `:subtype_distribution` — int-vs-float split for `:number` positions (e.g. `{ integer: 0.4, float: 0.6 }`).
38
+ - `:boolean` now wins over `:enum` when the dominant type is boolean — a position of pure `true`/`false` stays `:boolean` rather than being demoted to a 2-value enum.
39
+
40
+ ### 0.8.0 (2026-05-27)
41
+ - **Breaking**: `:numeric` umbrella renamed to `:number` (Ruby) / `TypeNumeric` → `TypeNumber` (Go). Same semantics.
42
+ - New classifier types: `:boolean` (`true`/`false`, any case), `:version` (`v1`, `v2.0.1`, `v1.2.3-beta` — requires the `v` prefix), `:locale` (BCP 47-ish full forms like `en-US`/`fr_CA`, plus bare 2-letter language codes from an inline ISO 639-1 allowlist of ~55 entries — `en`, `fr`, `ja`, etc.), `:currency` (3-letter codes from an inline ISO 4217 allowlist of ~35 entries).
43
+ - `:year` is now corpus-only: an `:integer` position whose observed min/max land in 1900..2100 with ≥2 distinct values gets promoted. A single 4-digit integer in isolation classifies as `:integer` — only range analysis across observations is reliable.
44
+ - `PositionStats` now tracks `numeric_min` / `numeric_max` / `numeric_sum` / `numeric_count` for `:integer`/`:float` observations. `Cluster#param_summary` surfaces `min` / `max` / `avg` on any param with numeric observations.
45
+ - Shape-y variable types (`:version`, `:locale`, `:currency`, `:boolean`) now respect the stable-literal rule: a single dominant value at a position (`v1` only across many observations) stays as the literal `v1` instead of being placeholdered as `{version}`. High cardinality at the same position falls back to `{version}` / `{locale}` / etc. as expected.
46
+ - 0/1 booleans still classify as `:integer` individually; the existing `:enum` umbrella catches `?flag=0` / `?flag=1` patterns when they cluster.
47
+
48
+ ### 0.7.0 (2026-05-27)
49
+ - **Breaking**: `:integer_id` classifier type renamed to `:integer` (Ruby) / `TypeIntegerID` → `TypeInteger` (Go). The "ID" semantics live in the hints layer (which still produces `{user_id}` placeholders); the classifier now reflects pure shape. Update any direct `.classify(...) == :integer_id` checks, dump-file consumers, and persisted corpora — the type symbol changed in `type_counts` and raw shape strings (e.g. `/users/{integer_id}` → `/users/{integer}`).
50
+ - New `:enum` umbrella (corpus-only): when a param has a small bounded set of repeated values (default ≥20 observations, ≤10 distinct, each ≥2 occurrences, ≥95% coverage), `Cluster#param_type` returns `:enum` and `param_summary` includes the value list under `:values`. Normalize output keeps the `{enum}` placeholder — values aren't inlined.
51
+ - `iriq --host=full|registrable|reg|none` CLI flag plumbs `Corpus#host_strategy` from the command line. `reg` is a short alias for `registrable`.
52
+
53
+ ### 0.6.0 (2026-05-27)
54
+ - New classifier types: `:ipv4`, `:ipv6`, `:url`, `:email` (Ruby) / `TypeIPv4`, `TypeIPv6`, `TypeURL`, `TypeEmail` (Go). Slotted before the generic `:opaque_id` / `:literal` catch-alls so URL params like `?redirect=https://foo.com/...`, `?email=alice@example.com`, `?ip=192.168.1.1`, `?gateway=fe80::1` get distinct types instead of falling through.
55
+ - IPv4 validates octets ≤ 255 — out-of-range dotted-quads fall back to `:opaque_id`.
56
+ - IPv6 accepts the full eight-group form and any compressed form containing `::`. IPv4-mapped variants (`::ffff:192.0.2.1`) are not recognized.
57
+
58
+ ### 0.5.0 (2026-05-27)
59
+ - Float values now classify as `:float` instead of falling through to `:opaque_id` (Ruby `:float` / Go `TypeFloat`). Regex requires digits on both sides of the decimal — `3.14`, `-2.5`, `1.0` match; `.5`, `1.`, `1e10` do not.
60
+ - New `:numeric` umbrella (corpus-only): when a cluster sees both `:integer_id` and `:float` observations at the same param with neither subtype hitting the 80% confidence threshold, the param surfaces as `:numeric` in `param_summary` and renders as `{numeric}` in `Corpus#normalize` output. The classifier itself never returns `:numeric` directly — individual values are always specifically int or float.
61
+ - `Corpus.new(host_strategy: ...)` knob controls how host is keyed into clusters: `:full` (default, unchanged), `:registrable` (strip subdomains, so `api.foo.com` and `app.foo.com` cluster as `foo.com`), `:none` (ignore host, group all observations by shape alone). `:registrable` uses an inline allowlist of ~70 common multi-label TLDs (`co.uk`, `com.au`, `co.jp`, etc.) — niche multi-label suffixes like `.priv.no` will be over-stripped.
62
+
63
+ ### 0.4.0 (2026-05-27)
64
+ - Query-param clustering: each `Cluster` now tracks per-param presence, value cardinality, and type via `param_stats`. Surfaced on `cluster.to_h[:params]` (and the JSON cluster view), persisted in both JSON and SQLite backends.
65
+ - `Corpus#normalize` (Ruby) / `Corpus.NormalizeIdentifier` (Go) now include query params, rendered with corpus-informed types when available (falls back to mechanical classification otherwise).
66
+ - New `corpus.params_for(url)` / `Corpus.ParamsFor(url)` — returns the inferred params for the cluster `url` would fall into. Useful for "what params might this URL accept?" tooling.
67
+ - Date detection expanded to include `YYYY/MM/DD` and `YYYYMMDD` (with year/month/day sanity bounds) alongside the existing `YYYY-MM-DD`.
68
+ - `SegmentClassifier.canonical_date(value)` / `CanonicalDate(value)` returns the ISO form for any recognized date.
69
+ - `--normalize` output canonicalizes recognized date values to `YYYY-MM-DD` (path segments and query params). Cluster keys still use `{date}` placeholders so dated routes still group together.
70
+ - `PositionStats::DEFAULT_MAX_VALUES` is now the value cap for `cluster.param_stats[name]` too.
71
+
72
+ ### 0.3.0 (2026-05-25)
73
+ - Go: SQLite backend is now opt-in via `-tags sqlite`. Default `go install` and the `iriq` Homebrew formula ship a slim binary (~30% smaller) with JSON corpora only. SQLite users compile with `-tags sqlite` or install `dpep/tools/iriq-sqlite`.
74
+ - Makefile: `release` / `release-sqlite` targets strip debug symbols and use `-trimpath` for reproducible builds.
75
+ - CLI: `iriq --help` reports the active build (slim vs sqlite).
76
+ - Slim build returns a friendly error when a `.db` corpus path is opened, pointing at the iriq-sqlite formula.
77
+ - `PositionStats::DEFAULT_MAX_VALUES` / `DefaultMaxValuesPerPosition` raised from 1000 → 5000. Existing corpora keep whatever cap they were created with (the cap is persisted in the dump / SQLite meta table); only freshly-constructed corpora pick up the new default.
78
+
79
+ ### 0.2.0 (2026-05-25)
80
+ - Corpus storage backends: JSON (default) and SQLite, dispatched by file extension
81
+ - Go: `iriq.OpenCorpus(path)`; Ruby: `Iriq::Corpus.open(path)`
82
+ - SQLite backend: incremental UPSERTs, WAL mode, concurrent-safe via busy_timeout + BEGIN IMMEDIATE; checkpoints on close so the WAL sidecar doesn't grow unbounded
83
+ - Batch mode: `corpus.batch { ... }` (Ruby) / `corpus.Batch(fn)` (Go) wraps many observations in one transaction
84
+ - Clusterer now wraps the in-memory Storage backend; only one cluster code path
85
+ - script/bench_storage.sh — JSON vs SQLite timing across single-process, incremental, and concurrent workloads
86
+ - **Breaking (Go)**: `Corpus.HostCounts` / `PathLengthCounts` / `RawShapeCounts` / `FingerprintCounts` are methods now, not fields
87
+
1
88
  ### 0.1.0 (2026-05-24)
2
89
  - CLI: auto-detect file argument, retire --extract flag
3
90
  - CLI: section flags work in pipe mode + clean up help text
data/CLAUDE.md ADDED
@@ -0,0 +1,208 @@
1
+ # Iriq development conventions
2
+
3
+ > **⚠️ Behavior changes touch ALL THREE runtimes.** Ruby is the reference; Go
4
+ > + Rust mirror it. Before committing any change to
5
+ > parser/normalizer/extractor/CLI/etc:
6
+ >
7
+ > 1. Update Ruby + specs.
8
+ > 2. `bundle exec ruby script/generate_fixtures.rb` (regenerate JSON parity fixtures).
9
+ > 3. Port the change to Go (the Go module lives in `go/`).
10
+ > 4. `go -C go test ./...` — fixture tests should still pass.
11
+ > 5. `make build && script/cli_parity.sh` — Ruby ↔ Go CLI parity should still pass.
12
+ > 6. Port the change to Rust under `rust/`.
13
+ > 7. `cd rust && cargo test --workspace` — Rust fixture tests should still pass
14
+ > (SQLite is a default feature).
15
+ > 8. `cd rust && cargo build --release --bin iriq && cd .. && script/rust_parity.sh`
16
+ > — Rust ↔ Go CLI parity (covers Ruby transitively).
17
+ > 9. Commit the regenerated fixtures alongside the code change.
18
+ >
19
+ > CI's parity + Rust jobs will fail if any step is skipped. The **Rust gate**
20
+ > (fmt + clippy + tests) is automated — run `make hooks` once to install the
21
+ > committed pre-push hook that runs `make check`. Full multi-runtime pre-push
22
+ > for a behavior change:
23
+ > `bundle exec rspec && go -C go test ./... && script/cli_parity.sh && make check && script/rust_parity.sh`.
24
+
25
+ ## Repo layout — Ruby at the root, Go and Rust in subdirs
26
+
27
+ The Ruby gem lives at the repo root (it's the reference implementation and the
28
+ published gem); the two mirror implementations are compartmentalized into
29
+ `go/` and `rust/`. Earlier the Go code was intermixed at the root; it now sits
30
+ in `go/`, symmetric with `rust/`, so the root reads as "Ruby + two ports."
31
+
32
+ ```
33
+ iriq/
34
+ lib/ exe/ spec/ ← Ruby gem (library, CLI, specs) — the reference
35
+ completions/ ← shell-completion scripts shipped by the gem
36
+ iriq.gemspec
37
+ Gemfile
38
+
39
+ go/ ← Go module github.com/dpep/iriq/go
40
+ go.mod go.sum
41
+ *.go ← Go package `iriq`
42
+ cmd/iriq/ ← Go CLI binary
43
+ completions/ ← Go's own embedded copy (go:embed can't reach ../)
44
+
45
+ rust/ ← Cargo workspace
46
+ Cargo.toml ← workspace root
47
+ iriq/ ← one crate: library + `iriq` CLI binary; inlines completions
48
+ REPORT.md ← Go → Rust port spike notes + perf
49
+ target/ ← Rust build artifacts (gitignored)
50
+
51
+ bin/ ← built Go binary (gitignored)
52
+ script/ ← shared dev scripts (fixture gen, parity, benches)
53
+ spec/fixtures/ ← golden JSON shared by Ruby specs + Go + Rust tests
54
+ .github/workflows/ ← Ruby CI, Go CI, Rust CI, parity CIs
55
+ ```
56
+
57
+ Notes on this layout:
58
+
59
+ - Go's import path is now `github.com/dpep/iriq/go` (the `/go` suffix matches
60
+ the subdir). Consumers import `github.com/dpep/iriq/go`.
61
+ - One version tag (`vX.Y.Z`) serves all three runtimes — Ruby's gemspec, Go's
62
+ module, and Rust's `Cargo.toml` use the same tag stream.
63
+ - The gemspec ships only Ruby + `completions/`, excluding `go/` and `rust/`:
64
+ `git ls-files * ':!:spec' ':!:script' ':!:bin' ':!:rust' ':!:go'`.
65
+ - Completion scripts exist in three places (gem root `completions/`, `go/completions/`
66
+ for `go:embed`, and inlined in the Rust CLI) — keep them in sync like fixtures.
67
+
68
+ ## Building
69
+
70
+ ```sh
71
+ # Ruby gem
72
+ bundle install
73
+ bundle exec exe/iriq --help # runs the CLI from source
74
+
75
+ # Go binary — convenience targets in the Makefile
76
+ make build # → ./bin/iriq
77
+ make install # go install into $GOBIN
78
+ make uninstall # remove from $GOBIN
79
+ make clean # remove ./bin/
80
+ make test # go test ./... inside go/, with and without -tags sqlite
81
+
82
+ # Rust — one crate (library + `iriq` binary), SQLite bundled by default
83
+ cd rust && cargo build --release --bin iriq # → ./rust/target/release/iriq
84
+ cd rust && cargo install --path iriq # install into ~/.cargo/bin
85
+ cd rust && cargo test --workspace
86
+
87
+ # Via Homebrew (builds the Rust CLI from main)
88
+ brew install dpep/tools/iriq
89
+
90
+ # Via crates.io
91
+ cargo install iriq
92
+ ```
93
+
94
+ ## Keeping the three runtimes in sync
95
+
96
+ Ruby is the **reference implementation**. Go and Rust mirror its public API
97
+ and behavior. Three layers of parity testing keep them aligned:
98
+
99
+ 1. **Golden JSON fixtures** (`spec/fixtures/*.json`)
100
+ Generated by `script/generate_fixtures.rb` from the Ruby implementation
101
+ over a curated set of inputs. Go's `fixtures_test.go` and Rust's
102
+ `rust/iriq/tests/fixtures.rs` both load each file and assert the same
103
+ outputs.
104
+
105
+ 2. **Ruby ↔ Go CLI parity harness** (`script/cli_parity.sh`)
106
+ Runs the same input through `bundle exec exe/iriq` and the Go binary and
107
+ diffs stdout. Lives in CI as the `Ruby ↔ Go parity` job.
108
+
109
+ 3. **Rust ↔ Go CLI parity harness** (`script/rust_parity.sh`)
110
+ Same idea — runs every Phase 1 + Phase 2 scenario (single-input,
111
+ pipe-mode, JSON corpus, SQLite corpus, --stats, --reinfer,
112
+ --propose-recognizers, --cross-host-shapes, --host=reg) through the
113
+ Go and Rust binaries and diffs stdout. Lives in CI as the
114
+ `Rust ↔ Go parity` job. Rust transitively inherits Ruby parity via Go.
115
+
116
+ When changing behavior:
117
+
118
+ 1. Update the Ruby code + specs first.
119
+ 2. Regenerate fixtures: `bundle exec ruby script/generate_fixtures.rb`.
120
+ 3. Port the change to Go.
121
+ 4. `go -C go test ./...` (uses the updated fixtures).
122
+ 5. `script/cli_parity.sh` should pass.
123
+ 6. Port the change to Rust under `rust/`.
124
+ 7. `cd rust && cargo test --workspace`.
125
+ 8. `cd rust && cargo build --release --bin iriq && cd .. && script/rust_parity.sh` should pass.
126
+ 9. Commit fixtures with the change — CI will fail if they're stale.
127
+
128
+ ## Tests
129
+
130
+ ```sh
131
+ bundle exec rspec # Ruby suite (305+ examples)
132
+ go -C go test ./... # Go suite (native + fixture parity)
133
+ script/cli_parity.sh # Ruby ↔ Go CLI parity
134
+ cd rust && cargo test --workspace
135
+ cd rust && cargo fmt --check # formatting (CI-gated)
136
+ cd rust && cargo clippy --workspace --all-targets -- -D warnings
137
+ make check # the three Rust checks above, in one shot
138
+ script/rust_parity.sh # Rust ↔ Go CLI parity (~59 scenarios)
139
+ ```
140
+
141
+ ## Releases
142
+
143
+ Versioning is single-stream: one `vX.Y.Z` covers all three runtimes. Bump the
144
+ version strings for all three runtimes **together** — the `--version` parity checks fail
145
+ if they drift:
146
+
147
+ 1. `lib/iriq/version.rb` (`VERSION`), `go/version.go` (`Version`), and the two
148
+ `version = "X.Y.Z"` / `pub const VERSION` lines in `rust/iriq/Cargo.toml` and
149
+ `rust/iriq/src/lib.rs` — same string.
150
+ 2. `Gemfile.lock` — re-resolve so the pinned `iriq (X.Y.Z)` matches
151
+ (`bundle install`, or it regenerates on the next `bundle exec`). Commit it.
152
+ 3. Run `cd rust && cargo update -p iriq` to refresh `Cargo.lock`.
153
+ 4. Tag `vX.Y.Z` and push. Go consumers pick it up via
154
+ `go get github.com/dpep/iriq/go@vX.Y.Z`.
155
+ 5. `gem push iriq-X.Y.Z.gem` to publish to RubyGems.
156
+ 6. `cd rust && cargo publish -p iriq` to publish to crates.io (the crate ships
157
+ both the library and the `iriq` binary).
158
+
159
+ ### Keep Homebrew in sync — bump on EVERY version change
160
+
161
+ The tap (`~/code/lib/homebrew-tools`) ships a single `Formula/iriq.rb` that
162
+ builds the Rust CLI (`cargo install --path rust/iriq`) from `branch: "main"`.
163
+ SQLite is on by default (the `iriq` crate's `default` feature set), so there is
164
+ no longer a separate `iriq-sqlite` formula.
165
+
166
+ The formula pins a static `version "X.Y.Z"` label. Because the build tracks
167
+ `main` rather than a tagged tarball, `brew upgrade` only rebuilds when that
168
+ label changes. So on every bump here, update the `version` string in
169
+ `Formula/iriq.rb` to match `version.rb`, then commit + push the tap. Leaving it
170
+ stale means brew users never get the new code even though it's already on `main`.
171
+
172
+ ## Corpus storage backends
173
+
174
+ The `Corpus` class delegates state to a `Storage` backend; three backends ship:
175
+
176
+ - **Memory** — default, in-process only.
177
+ - **JSON** — Memory wrapped with atomic load/save against a JSON file
178
+ (`.json` by default). Same shape both runtimes have always written.
179
+ - **SQLite** — incremental UPSERTs against a `.db` / `.sqlite` / `.sqlite3`
180
+ file with WAL journaling. Supports concurrent observers and avoids
181
+ loading the whole corpus into memory.
182
+
183
+ `Corpus.open(path)` (Ruby) / `iriq.OpenCorpus(path)` (Go) picks the backend
184
+ by file extension. `corpus.save(other_path)` exports as JSON regardless of
185
+ the live backend; `corpus.save(same_path)` is idempotent (no clobbering a
186
+ SQLite file with JSON, etc.).
187
+
188
+ The Ruby `sqlite3` gem is loaded lazily (only when a `.db` path is opened),
189
+ keeping the iriq install footprint minimal for users that stick with JSON.
190
+ On the Go side we use `modernc.org/sqlite` (pure Go — no cgo). The Rust
191
+ side uses `rusqlite` with the `bundled` feature (statically links C SQLite,
192
+ ~3-4 MB binary cost). Schema v4 is shared across all three runtimes — a
193
+ `.db` written by any binary opens cleanly in any other.
194
+
195
+ When adding a new backend, replicate the contract in all three languages
196
+ and add parity scenarios in `script/cli_parity.sh`'s `corpus_pair`
197
+ section + `script/rust_parity.sh`'s `corpus_pair`.
198
+
199
+ ## What lives where in scripts
200
+
201
+ - `script/benchmark.rb` — Ruby-only throughput benchmark.
202
+ - `script/memory.rb` — Ruby-only memory profile.
203
+ - `script/generate_fixtures.rb` — produces `spec/fixtures/*.json` for cross-runtime parity.
204
+ - `script/cli_parity.sh` — Ruby ↔ Go CLI diff.
205
+ - `script/rust_parity.sh` — Rust ↔ Go CLI diff.
206
+ - `script/bench_three_way.sh` — Go vs Rust wall-clock comparison.
207
+ - `script/bench_compare.sh` — Ruby vs Go CLI wall-time comparison.
208
+ - `script/bench_storage.sh` — JSON vs SQLite backend timing (single-process, incremental, concurrent).
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- iriq (0.1.0)
4
+ iriq (0.30.2)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
@@ -19,6 +19,7 @@ GEM
19
19
  prism (>= 1.3.0)
20
20
  rdoc (>= 4.0.0)
21
21
  reline (>= 0.4.2)
22
+ mini_portile2 (2.8.9)
22
23
  pp (0.6.3)
23
24
  prettyprint
24
25
  prettyprint (0.2.0)
@@ -53,6 +54,8 @@ GEM
53
54
  simplecov_json_formatter (~> 0.1)
54
55
  simplecov-html (0.13.2)
55
56
  simplecov_json_formatter (0.1.4)
57
+ sqlite3 (2.9.5)
58
+ mini_portile2 (~> 2.8.0)
56
59
  stringio (3.2.0)
57
60
  tsort (0.2.0)
58
61
 
@@ -65,6 +68,7 @@ DEPENDENCIES
65
68
  rspec (>= 3.10)
66
69
  rspec-debugging
67
70
  simplecov (>= 0.22)
71
+ sqlite3 (>= 1.6)
68
72
 
69
73
  CHECKSUMS
70
74
  date (3.5.1) sha256=750d06384d7b9c15d562c76291407d89e368dda4d4fff957eb94962d325a0dc0
@@ -74,7 +78,8 @@ CHECKSUMS
74
78
  erb (6.0.4) sha256=38e3803694be357fe2bfe312487c74beaf9fb4e5beb3e22498952fe1645b95d9
75
79
  io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
76
80
  irb (1.17.0) sha256=168c4ddb93d8a361a045c41d92b2952c7a118fa73f23fe14e55609eb7a863aae
77
- iriq (0.1.0)
81
+ iriq (0.30.2)
82
+ mini_portile2 (2.8.9) sha256=0cd7c7f824e010c072e33f68bc02d85a00aeb6fce05bb4819c03dfd3c140c289
78
83
  pp (0.6.3) sha256=2951d514450b93ccfeb1df7d021cae0da16e0a7f95ee1e2273719669d0ab9df6
79
84
  prettyprint (0.2.0) sha256=2bc9e15581a94742064a3cc8b0fb9d45aae3d03a1baa6ef80922627a0766f193
80
85
  prism (1.9.0) sha256=7b530c6a9f92c24300014919c9dcbc055bf4cdf51ec30aed099b06cd6674ef85
@@ -90,6 +95,7 @@ CHECKSUMS
90
95
  simplecov (0.22.0) sha256=fe2622c7834ff23b98066bb0a854284b2729a569ac659f82621fc22ef36213a5
91
96
  simplecov-html (0.13.2) sha256=bd0b8e54e7c2d7685927e8d6286466359b6f16b18cb0df47b508e8d73c777246
92
97
  simplecov_json_formatter (0.1.4) sha256=529418fbe8de1713ac2b2d612aa3daa56d316975d307244399fa4838c601b428
98
+ sqlite3 (2.9.5) sha256=04572973a3f943ad50a8adfffc8dd752a5f06e4c3db2026f71838fed8a982606
93
99
  stringio (3.2.0) sha256=c37cb2e58b4ffbd33fe5cd948c05934af997b36e0b6ca6fdf43afa234cf222e1
94
100
  tsort (0.2.0) sha256=9650a793f6859a43b6641671278f79cfead60ac714148aabe4e3f0060480089f
95
101
 
data/Makefile ADDED
@@ -0,0 +1,113 @@
1
+ # Iriq Go binary — build/install/clean/uninstall helpers, plus the Rust gate (check/fmt/hooks).
2
+ #
3
+ # make - same as `make help`
4
+ # make build - dev build into ./bin/iriq (no SQLite, debug info)
5
+ # make build-sqlite - dev build with SQLite backend included
6
+ # make release - stripped + trimpath build (no SQLite)
7
+ # make release-sqlite - stripped + trimpath build with SQLite
8
+ # make install - go install into $GOBIN
9
+ # make test - go test ./... (both tag states)
10
+ # make clean - remove ./bin/
11
+ # make uninstall - remove the binary from $GOBIN
12
+ #
13
+ # The default build excludes the SQLite backend to keep the binary lean.
14
+ # Pass `-tags sqlite` (or use the *-sqlite targets) to compile it in. The
15
+ # CLI's `--version` output tells you which backends are baked in.
16
+ #
17
+ # Ruby gem build/install is handled by Bundler/RubyGems; see CLAUDE.md.
18
+
19
+ GO ?= go
20
+ GO_DIR := go
21
+ BIN_DIR := bin
22
+ BIN := $(BIN_DIR)/iriq
23
+ # Absolute output path: builds run inside $(GO_DIR) via `go -C`, so a
24
+ # relative -o would land under go/. Keep the binary at the repo-root bin/.
25
+ ABS_BIN := $(CURDIR)/$(BIN)
26
+ PKG := ./cmd/iriq
27
+
28
+ # Rust crate lives under rust/; CI gates fmt + clippy + tests there.
29
+ CARGO ?= cargo
30
+ RUST_DIR := rust
31
+
32
+ # Release flags strip the symbol table (-s), debug info (-w), and bake
33
+ # reproducible paths (-trimpath). Drops binary size ~30% with no
34
+ # functional impact; panic stack traces keep function names and file:line
35
+ # (the runtime reads both from pclntab, which -s/-w leave in place).
36
+ RELEASE_FLAGS := -ldflags "-s -w" -trimpath
37
+
38
+ # Resolve $GOBIN, falling back to $GOPATH/bin (Go's default install location).
39
+ GOBIN := $(shell $(GO) env GOBIN)
40
+ ifeq ($(GOBIN),)
41
+ GOBIN := $(shell $(GO) env GOPATH)/bin
42
+ endif
43
+ INSTALLED := $(GOBIN)/iriq
44
+
45
+ .DEFAULT_GOAL := help
46
+ .PHONY: help build build-sqlite release release-sqlite install test clean uninstall check fmt hooks
47
+
48
+ help:
49
+ @echo "Iriq Go targets:"
50
+ @echo " make build slim dev build into $(BIN)"
51
+ @echo " make build-sqlite dev build with SQLite backend"
52
+ @echo " make release stripped slim build into $(BIN)"
53
+ @echo " make release-sqlite stripped build with SQLite backend"
54
+ @echo " make install go install into $(GOBIN)"
55
+ @echo " make test run go test ./... in both tag states"
56
+ @echo " make check Rust gate: cargo fmt --check + clippy + test (run before merging)"
57
+ @echo " make fmt cargo fmt the Rust crate"
58
+ @echo " make hooks enable the committed git hooks (pre-push runs 'make check')"
59
+ @echo " make clean remove $(BIN_DIR)/"
60
+ @echo " make uninstall remove $(INSTALLED)"
61
+
62
+ build:
63
+ @mkdir -p $(BIN_DIR)
64
+ $(GO) -C $(GO_DIR) build -o $(ABS_BIN) $(PKG)
65
+ @echo "built $(BIN) (slim, debug)"
66
+
67
+ build-sqlite:
68
+ @mkdir -p $(BIN_DIR)
69
+ $(GO) -C $(GO_DIR) build -tags sqlite -o $(ABS_BIN) $(PKG)
70
+ @echo "built $(BIN) (sqlite, debug)"
71
+
72
+ release:
73
+ @mkdir -p $(BIN_DIR)
74
+ $(GO) -C $(GO_DIR) build $(RELEASE_FLAGS) -o $(ABS_BIN) $(PKG)
75
+ @echo "built $(BIN) (slim, stripped)"
76
+
77
+ release-sqlite:
78
+ @mkdir -p $(BIN_DIR)
79
+ $(GO) -C $(GO_DIR) build -tags sqlite $(RELEASE_FLAGS) -o $(ABS_BIN) $(PKG)
80
+ @echo "built $(BIN) (sqlite, stripped)"
81
+
82
+ install:
83
+ $(GO) -C $(GO_DIR) install $(PKG)
84
+ @echo "installed $(INSTALLED)"
85
+
86
+ test:
87
+ $(GO) -C $(GO_DIR) test ./...
88
+ $(GO) -C $(GO_DIR) test -tags sqlite ./...
89
+
90
+ # The Rust gate — mirrors CI's Rust job. Run before merging/pushing (the
91
+ # pre-push hook runs this for you once `make hooks` is enabled).
92
+ check:
93
+ cd $(RUST_DIR) && $(CARGO) fmt --check
94
+ cd $(RUST_DIR) && $(CARGO) clippy --workspace --all-targets -- -D warnings
95
+ cd $(RUST_DIR) && $(CARGO) test --workspace
96
+
97
+ fmt:
98
+ cd $(RUST_DIR) && $(CARGO) fmt
99
+
100
+ hooks:
101
+ git config core.hooksPath .githooks
102
+ @echo "git hooks enabled (.githooks) — pre-push now runs 'make check'"
103
+
104
+ clean:
105
+ rm -rf $(BIN_DIR)
106
+ @echo "removed $(BIN_DIR)/"
107
+
108
+ uninstall:
109
+ @if [ -f "$(INSTALLED)" ]; then \
110
+ rm "$(INSTALLED)" && echo "removed $(INSTALLED)"; \
111
+ else \
112
+ echo "not installed at $(INSTALLED)"; \
113
+ fi