iriq 0.0.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +25 -0
- data/CLAUDE.md +121 -0
- data/Gemfile.lock +8 -2
- data/Makefile +56 -0
- data/README.md +334 -39
- data/iriq.gemspec +4 -3
- data/lib/iriq/cli.rb +289 -100
- data/lib/iriq/cluster.rb +47 -0
- data/lib/iriq/clusterer.rb +29 -39
- data/lib/iriq/corpus.rb +322 -0
- data/lib/iriq/explanation.rb +6 -22
- data/lib/iriq/extractor.rb +125 -0
- data/lib/iriq/identifier.rb +11 -3
- data/lib/iriq/inflector.rb +145 -0
- data/lib/iriq/normalizer.rb +11 -8
- data/lib/iriq/observation.rb +25 -0
- data/lib/iriq/parser.rb +1 -1
- data/lib/iriq/path_shape.rb +27 -9
- data/lib/iriq/position_stats.rb +64 -0
- data/lib/iriq/segment_classifier.rb +31 -7
- data/lib/iriq/segment_hints.rb +32 -0
- data/lib/iriq/storage/json.rb +43 -0
- data/lib/iriq/storage/memory.rb +138 -0
- data/lib/iriq/storage/sqlite.rb +367 -0
- data/lib/iriq/storage.rb +35 -0
- data/lib/iriq/version.rb +1 -1
- data/lib/iriq.rb +11 -0
- metadata +29 -4
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: e629988f23137ecb0c0f4737e246f65a949fa2843f4bd16d244566ca76dc37ed
|
|
4
|
+
data.tar.gz: c36cae38205a6a6f63a8a38e40849c922bbb6cf7046e9599aaaefc37fb443303
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 75d0329756d16dd7b9c8e5ca7b8ba447aa51c4d3c34a1ec31549bb6e6725338bfbdb2ee4badb18c52342c8ae0ece5f005e9288c617fa5e140d7cfff400274561
|
|
7
|
+
data.tar.gz: cb670e665a2e67feeb5f1bcad157a27e15da1aa61efc53e4ae66388a67ee498ef7caa1605be85d27e087fda44c399a9f93b6701242d1a22dec2653b667918c6d
|
data/CHANGELOG.md
CHANGED
|
@@ -1,2 +1,27 @@
|
|
|
1
|
+
### 0.2.0 (2026-05-25)
|
|
2
|
+
- Corpus storage backends: JSON (default) and SQLite, dispatched by file extension
|
|
3
|
+
- Go: `iriq.OpenCorpus(path)`; Ruby: `Iriq::Corpus.open(path)`
|
|
4
|
+
- SQLite backend: incremental UPSERTs, WAL mode, concurrent-safe via busy_timeout + BEGIN IMMEDIATE; checkpoints on close so the WAL sidecar doesn't grow unbounded
|
|
5
|
+
- Batch mode: `corpus.batch { ... }` (Ruby) / `corpus.Batch(fn)` (Go) wraps many observations in one transaction
|
|
6
|
+
- Clusterer now wraps the in-memory Storage backend; only one cluster code path
|
|
7
|
+
- script/bench_storage.sh — JSON vs SQLite timing across single-process, incremental, and concurrent workloads
|
|
8
|
+
- **Breaking (Go)**: `Corpus.HostCounts` / `PathLengthCounts` / `RawShapeCounts` / `FingerprintCounts` are methods now, not fields
|
|
9
|
+
|
|
10
|
+
### 0.1.0 (2026-05-24)
|
|
11
|
+
- CLI: auto-detect file argument, retire --extract flag
|
|
12
|
+
- CLI: section flags work in pipe mode + clean up help text
|
|
13
|
+
- script/memory.rb — track retained memory + cache footprints
|
|
14
|
+
- Perf: classifier + inflector memoization, singleton classifier, combined extractor regex
|
|
15
|
+
- Perf: derive SegmentHints once per Corpus.observe (~2x faster)
|
|
16
|
+
- script/benchmark.rb — measure the main hot paths
|
|
17
|
+
- README: replace fabricated example numbers with real fixture output
|
|
18
|
+
- Pipe mode: extraction by default, auto-switch to cluster view at scale
|
|
19
|
+
- Iriq::Extractor — pull IRIs out of free text
|
|
20
|
+
- E2E spec: pipe IriGenerator stream through real iriq binary
|
|
21
|
+
- IriGenerator fixture + popular-outlier heuristic
|
|
22
|
+
- CLI --corpus persistence, pipe batch mode, --stats, E2E specs
|
|
23
|
+
- Streaming Corpus with rolling stats and learning
|
|
24
|
+
- RESTful hints, flag-based CLI, swappable inflector
|
|
25
|
+
|
|
1
26
|
### 0.0.1 (2026-05-24)
|
|
2
27
|
- prototype
|
data/CLAUDE.md
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
# Iriq development conventions
|
|
2
|
+
|
|
3
|
+
## Repo layout — Ruby and Go intermixed at the root
|
|
4
|
+
|
|
5
|
+
We chose to mix Ruby and Go at the repo root rather than nest the Go module
|
|
6
|
+
under `/go/`. The signal is "both implementations are peers, not one-is-primary."
|
|
7
|
+
|
|
8
|
+
```
|
|
9
|
+
iriq/
|
|
10
|
+
lib/ exe/ spec/ ← Ruby gem (library, CLI, specs)
|
|
11
|
+
iriq.gemspec
|
|
12
|
+
Gemfile
|
|
13
|
+
|
|
14
|
+
go.mod ← module github.com/dpep/iriq
|
|
15
|
+
*.go ← Go package `iriq` at the root
|
|
16
|
+
cmd/iriq/ ← Go CLI binary
|
|
17
|
+
bin/ ← built Go binary (gitignored)
|
|
18
|
+
|
|
19
|
+
script/ ← shared dev scripts (fixture gen, parity, benches)
|
|
20
|
+
spec/fixtures/ ← golden JSON shared by Ruby specs + Go tests
|
|
21
|
+
.github/workflows/ ← Ruby CI, Go CI, parity CI
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
Trade-offs of this layout:
|
|
25
|
+
|
|
26
|
+
- Clean import path: `github.com/dpep/iriq` (no `/go/` artifact in consumers' code).
|
|
27
|
+
- One version tag (`vX.Y.Z`) serves both runtimes — Ruby's gemspec and Go's
|
|
28
|
+
module use the same tag stream.
|
|
29
|
+
- Root `ls` is busier (~15 `.go` files next to Ruby ones), accepted in exchange.
|
|
30
|
+
- The gemspec explicitly excludes Go files so `gem build` doesn't ship them:
|
|
31
|
+
`git ls-files * ':!:spec' ':!:script' ':!:cmd' ':!:bin' ':!:*.go' ':!:go.mod' ':!:go.sum'`.
|
|
32
|
+
|
|
33
|
+
## Building
|
|
34
|
+
|
|
35
|
+
```sh
|
|
36
|
+
# Ruby gem
|
|
37
|
+
bundle install
|
|
38
|
+
bundle exec exe/iriq --help # runs the CLI from source
|
|
39
|
+
|
|
40
|
+
# Go binary — convenience targets in the Makefile
|
|
41
|
+
make build # → ./bin/iriq
|
|
42
|
+
make install # go install into $GOBIN
|
|
43
|
+
make uninstall # remove from $GOBIN
|
|
44
|
+
make clean # remove ./bin/
|
|
45
|
+
make test # go test ./...
|
|
46
|
+
|
|
47
|
+
# Both via Homebrew
|
|
48
|
+
brew install dpep/tools/iriq # uses the Ruby gem under the hood
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## Keeping Ruby and Go in sync
|
|
52
|
+
|
|
53
|
+
The Ruby gem is the **reference implementation**. Go mirrors its public API
|
|
54
|
+
and behavior. Two layers of parity testing keep them aligned:
|
|
55
|
+
|
|
56
|
+
1. **Golden JSON fixtures** (`spec/fixtures/*.json`)
|
|
57
|
+
Generated by `script/generate_fixtures.rb` from the Ruby implementation
|
|
58
|
+
over a curated set of inputs. Go's `fixtures_test.go` loads each file
|
|
59
|
+
and asserts the same outputs from the Go side.
|
|
60
|
+
|
|
61
|
+
2. **CLI parity harness** (`script/cli_parity.sh`)
|
|
62
|
+
Runs the same input through `bundle exec exe/iriq` and the Go binary and
|
|
63
|
+
diffs stdout. Lives in CI as the `Ruby ↔ Go parity` job.
|
|
64
|
+
|
|
65
|
+
When changing behavior:
|
|
66
|
+
|
|
67
|
+
1. Update the Ruby code + specs first.
|
|
68
|
+
2. Regenerate fixtures: `bundle exec ruby script/generate_fixtures.rb`.
|
|
69
|
+
3. Port the change to Go.
|
|
70
|
+
4. `go test ./...` (uses the updated fixtures).
|
|
71
|
+
5. `script/cli_parity.sh` should pass.
|
|
72
|
+
6. Commit fixtures with the change — CI will fail if they're stale.
|
|
73
|
+
|
|
74
|
+
## Tests
|
|
75
|
+
|
|
76
|
+
```sh
|
|
77
|
+
bundle exec rspec # Ruby suite (305+ examples)
|
|
78
|
+
go test ./... # Go suite (native + fixture parity tests)
|
|
79
|
+
script/cli_parity.sh # CLI parity (13+ scenarios)
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
## Releases
|
|
83
|
+
|
|
84
|
+
- One version tag covers both runtimes — bump `lib/iriq/version.rb` (and
|
|
85
|
+
optionally a matching constant on the Go side if we add one), tag `vX.Y.Z`,
|
|
86
|
+
push.
|
|
87
|
+
- `gem push iriq-X.Y.Z.gem` to publish to RubyGems.
|
|
88
|
+
- Update `Formula/iriq.rb` in the homebrew-tools tap to the new version.
|
|
89
|
+
- Go consumers pick up the tag automatically via `go get @vX.Y.Z`.
|
|
90
|
+
|
|
91
|
+
## Corpus storage backends
|
|
92
|
+
|
|
93
|
+
The `Corpus` class delegates state to a `Storage` backend; three backends ship:
|
|
94
|
+
|
|
95
|
+
- **Memory** — default, in-process only.
|
|
96
|
+
- **JSON** — Memory wrapped with atomic load/save against a JSON file
|
|
97
|
+
(`.json` by default). Same shape both runtimes have always written.
|
|
98
|
+
- **SQLite** — incremental UPSERTs against a `.db` / `.sqlite` / `.sqlite3`
|
|
99
|
+
file with WAL journaling. Supports concurrent observers and avoids
|
|
100
|
+
loading the whole corpus into memory.
|
|
101
|
+
|
|
102
|
+
`Corpus.open(path)` (Ruby) / `iriq.OpenCorpus(path)` (Go) picks the backend
|
|
103
|
+
by file extension. `corpus.save(other_path)` exports as JSON regardless of
|
|
104
|
+
the live backend; `corpus.save(same_path)` is idempotent (no clobbering a
|
|
105
|
+
SQLite file with JSON, etc.).
|
|
106
|
+
|
|
107
|
+
The Ruby `sqlite3` gem is loaded lazily (only when a `.db` path is opened),
|
|
108
|
+
keeping the iriq install footprint minimal for users that stick with JSON.
|
|
109
|
+
On the Go side we use `modernc.org/sqlite` (pure Go — no cgo).
|
|
110
|
+
|
|
111
|
+
When adding a new backend, replicate the contract in both languages and
|
|
112
|
+
add a parity scenario in `script/cli_parity.sh`'s `corpus_pair` section.
|
|
113
|
+
|
|
114
|
+
## What lives where in scripts
|
|
115
|
+
|
|
116
|
+
- `script/benchmark.rb` — Ruby-only throughput benchmark.
|
|
117
|
+
- `script/memory.rb` — Ruby-only memory profile.
|
|
118
|
+
- `script/generate_fixtures.rb` — produces `spec/fixtures/*.json` for cross-runtime parity.
|
|
119
|
+
- `script/cli_parity.sh` — Ruby ↔ Go CLI diff.
|
|
120
|
+
- `script/bench_compare.sh` — Ruby vs Go CLI wall-time comparison.
|
|
121
|
+
- `script/bench_storage.sh` — JSON vs SQLite backend timing (single-process, incremental, concurrent).
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
iriq (0.0.1)
|
|
4
|
+
iriq (0.2.0)
|
|
5
5
|
|
|
6
6
|
GEM
|
|
7
7
|
remote: https://rubygems.org/
|
|
@@ -19,6 +19,7 @@ GEM
|
|
|
19
19
|
prism (>= 1.3.0)
|
|
20
20
|
rdoc (>= 4.0.0)
|
|
21
21
|
reline (>= 0.4.2)
|
|
22
|
+
mini_portile2 (2.8.9)
|
|
22
23
|
pp (0.6.3)
|
|
23
24
|
prettyprint
|
|
24
25
|
prettyprint (0.2.0)
|
|
@@ -53,6 +54,8 @@ GEM
|
|
|
53
54
|
simplecov_json_formatter (~> 0.1)
|
|
54
55
|
simplecov-html (0.13.2)
|
|
55
56
|
simplecov_json_formatter (0.1.4)
|
|
57
|
+
sqlite3 (2.9.4)
|
|
58
|
+
mini_portile2 (~> 2.8.0)
|
|
56
59
|
stringio (3.2.0)
|
|
57
60
|
tsort (0.2.0)
|
|
58
61
|
|
|
@@ -65,6 +68,7 @@ DEPENDENCIES
|
|
|
65
68
|
rspec (>= 3.10)
|
|
66
69
|
rspec-debugging
|
|
67
70
|
simplecov (>= 0.22)
|
|
71
|
+
sqlite3 (>= 1.6)
|
|
68
72
|
|
|
69
73
|
CHECKSUMS
|
|
70
74
|
date (3.5.1) sha256=750d06384d7b9c15d562c76291407d89e368dda4d4fff957eb94962d325a0dc0
|
|
@@ -74,7 +78,8 @@ CHECKSUMS
|
|
|
74
78
|
erb (6.0.4) sha256=38e3803694be357fe2bfe312487c74beaf9fb4e5beb3e22498952fe1645b95d9
|
|
75
79
|
io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
|
|
76
80
|
irb (1.17.0) sha256=168c4ddb93d8a361a045c41d92b2952c7a118fa73f23fe14e55609eb7a863aae
|
|
77
|
-
iriq (0.0.1)
|
|
81
|
+
iriq (0.2.0)
|
|
82
|
+
mini_portile2 (2.8.9) sha256=0cd7c7f824e010c072e33f68bc02d85a00aeb6fce05bb4819c03dfd3c140c289
|
|
78
83
|
pp (0.6.3) sha256=2951d514450b93ccfeb1df7d021cae0da16e0a7f95ee1e2273719669d0ab9df6
|
|
79
84
|
prettyprint (0.2.0) sha256=2bc9e15581a94742064a3cc8b0fb9d45aae3d03a1baa6ef80922627a0766f193
|
|
80
85
|
prism (1.9.0) sha256=7b530c6a9f92c24300014919c9dcbc055bf4cdf51ec30aed099b06cd6674ef85
|
|
@@ -90,6 +95,7 @@ CHECKSUMS
|
|
|
90
95
|
simplecov (0.22.0) sha256=fe2622c7834ff23b98066bb0a854284b2729a569ac659f82621fc22ef36213a5
|
|
91
96
|
simplecov-html (0.13.2) sha256=bd0b8e54e7c2d7685927e8d6286466359b6f16b18cb0df47b508e8d73c777246
|
|
92
97
|
simplecov_json_formatter (0.1.4) sha256=529418fbe8de1713ac2b2d612aa3daa56d316975d307244399fa4838c601b428
|
|
98
|
+
sqlite3 (2.9.4) sha256=6161c5b9c17886b289558e6c8082b28a22a814736d2433c9a67f4c6bfcde5c97
|
|
93
99
|
stringio (3.2.0) sha256=c37cb2e58b4ffbd33fe5cd948c05934af997b36e0b6ca6fdf43afa234cf222e1
|
|
94
100
|
tsort (0.2.0) sha256=9650a793f6859a43b6641671278f79cfead60ac714148aabe4e3f0060480089f
|
|
95
101
|
|
data/Makefile
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
# Iriq Go binary — build/install/clean/uninstall helpers.
|
|
2
|
+
#
|
|
3
|
+
# make - same as `make help`
|
|
4
|
+
# make build - build into ./bin/iriq
|
|
5
|
+
# make install - go install into $GOBIN (defaults to $GOPATH/bin)
|
|
6
|
+
# make test - go test ./...
|
|
7
|
+
# make clean - remove ./bin/
|
|
8
|
+
# make uninstall - remove the binary from $GOBIN
|
|
9
|
+
#
|
|
10
|
+
# Ruby gem build/install is handled by Bundler/RubyGems; see CLAUDE.md.
|
|
11
|
+
|
|
12
|
+
GO ?= go
|
|
13
|
+
BIN_DIR := bin
|
|
14
|
+
BIN := $(BIN_DIR)/iriq
|
|
15
|
+
PKG := ./cmd/iriq
|
|
16
|
+
|
|
17
|
+
# Resolve $GOBIN, falling back to $GOPATH/bin (Go's default install location).
|
|
18
|
+
GOBIN := $(shell $(GO) env GOBIN)
|
|
19
|
+
ifeq ($(GOBIN),)
|
|
20
|
+
GOBIN := $(shell $(GO) env GOPATH)/bin
|
|
21
|
+
endif
|
|
22
|
+
INSTALLED := $(GOBIN)/iriq
|
|
23
|
+
|
|
24
|
+
.DEFAULT_GOAL := help
|
|
25
|
+
.PHONY: help build install test clean uninstall
|
|
26
|
+
|
|
27
|
+
help:
|
|
28
|
+
@echo "Iriq Go targets:"
|
|
29
|
+
@echo " make build build into $(BIN)"
|
|
30
|
+
@echo " make install go install into $(GOBIN)"
|
|
31
|
+
@echo " make test run go test ./..."
|
|
32
|
+
@echo " make clean remove $(BIN_DIR)/"
|
|
33
|
+
@echo " make uninstall remove $(INSTALLED)"
|
|
34
|
+
|
|
35
|
+
build:
|
|
36
|
+
@mkdir -p $(BIN_DIR)
|
|
37
|
+
$(GO) build -o $(BIN) $(PKG)
|
|
38
|
+
@echo "built $(BIN)"
|
|
39
|
+
|
|
40
|
+
install:
|
|
41
|
+
$(GO) install $(PKG)
|
|
42
|
+
@echo "installed $(INSTALLED)"
|
|
43
|
+
|
|
44
|
+
test:
|
|
45
|
+
$(GO) test ./...
|
|
46
|
+
|
|
47
|
+
clean:
|
|
48
|
+
rm -rf $(BIN_DIR)
|
|
49
|
+
@echo "removed $(BIN_DIR)/"
|
|
50
|
+
|
|
51
|
+
uninstall:
|
|
52
|
+
@if [ -f "$(INSTALLED)" ]; then \
|
|
53
|
+
rm "$(INSTALLED)" && echo "removed $(INSTALLED)"; \
|
|
54
|
+
else \
|
|
55
|
+
echo "not installed at $(INSTALLED)"; \
|
|
56
|
+
fi
|