whatwasit 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. whatwasit-0.1.0/.github/workflows/ci.yml +74 -0
  2. whatwasit-0.1.0/.github/workflows/publish.yml +61 -0
  3. whatwasit-0.1.0/.gitignore +38 -0
  4. whatwasit-0.1.0/ARCHITECTURE.md +187 -0
  5. whatwasit-0.1.0/BENCHMARKS.md +86 -0
  6. whatwasit-0.1.0/FUTURE_IDEAS.md +42 -0
  7. whatwasit-0.1.0/LICENSE +21 -0
  8. whatwasit-0.1.0/PKG-INFO +258 -0
  9. whatwasit-0.1.0/README.md +200 -0
  10. whatwasit-0.1.0/benchmarks/query_cold_warm.py +143 -0
  11. whatwasit-0.1.0/benchmarks/run_bench.py +191 -0
  12. whatwasit-0.1.0/docs/ACCURACY_BGE_RESULTS.md +62 -0
  13. whatwasit-0.1.0/docs/ACCURACY_RESEARCH.md +247 -0
  14. whatwasit-0.1.0/docs/BENCHMARKS_BGE.md +31 -0
  15. whatwasit-0.1.0/eval/README.md +52 -0
  16. whatwasit-0.1.0/eval/__init__.py +1 -0
  17. whatwasit-0.1.0/eval/baseline.py +60 -0
  18. whatwasit-0.1.0/eval/build_dataset.py +807 -0
  19. whatwasit-0.1.0/eval/metrics.py +82 -0
  20. whatwasit-0.1.0/eval/metrics_summary.csv +89 -0
  21. whatwasit-0.1.0/eval/metrics_summary_v2.csv +89 -0
  22. whatwasit-0.1.0/eval/queries.jsonl +96 -0
  23. whatwasit-0.1.0/eval/queries_keyword_heavy.jsonl +15 -0
  24. whatwasit-0.1.0/eval/raw_sources/emir_commands_sample.txt +1500 -0
  25. whatwasit-0.1.0/eval/raw_sources/hotal_commands_sample.txt +397 -0
  26. whatwasit-0.1.0/eval/raw_sources/hrsvrn_commands_sample.txt +10 -0
  27. whatwasit-0.1.0/eval/results.jsonl +192 -0
  28. whatwasit-0.1.0/eval/results_raw_v2.jsonl +192 -0
  29. whatwasit-0.1.0/eval/run_eval.py +450 -0
  30. whatwasit-0.1.0/eval/sessions.jsonl +57 -0
  31. whatwasit-0.1.0/eval/summary.json +7431 -0
  32. whatwasit-0.1.0/eval/summary_bge.json +7431 -0
  33. whatwasit-0.1.0/eval/summary_v2.json +7431 -0
  34. whatwasit-0.1.0/eval/tables.md +235 -0
  35. whatwasit-0.1.0/eval/tables_v2.md +235 -0
  36. whatwasit-0.1.0/pyproject.toml +61 -0
  37. whatwasit-0.1.0/tests/__init__.py +0 -0
  38. whatwasit-0.1.0/tests/synthetic.py +214 -0
  39. whatwasit-0.1.0/tests/test_brand.py +69 -0
  40. whatwasit-0.1.0/tests/test_cli.py +234 -0
  41. whatwasit-0.1.0/tests/test_daemon.py +69 -0
  42. whatwasit-0.1.0/tests/test_embedder.py +242 -0
  43. whatwasit-0.1.0/tests/test_index.py +106 -0
  44. whatwasit-0.1.0/tests/test_indexer.py +191 -0
  45. whatwasit-0.1.0/tests/test_integration.py +97 -0
  46. whatwasit-0.1.0/tests/test_parsers.py +231 -0
  47. whatwasit-0.1.0/tests/test_search.py +187 -0
  48. whatwasit-0.1.0/tests/test_sessions.py +291 -0
  49. whatwasit-0.1.0/tests/test_synthetic.py +96 -0
  50. whatwasit-0.1.0/tests/test_tui.py +156 -0
  51. whatwasit-0.1.0/tests/test_tui_config.py +184 -0
  52. whatwasit-0.1.0/whatwasit/__init__.py +3 -0
  53. whatwasit-0.1.0/whatwasit/brand.py +59 -0
  54. whatwasit-0.1.0/whatwasit/cli.py +216 -0
  55. whatwasit-0.1.0/whatwasit/config.py +82 -0
  56. whatwasit-0.1.0/whatwasit/config_loader.py +78 -0
  57. whatwasit-0.1.0/whatwasit/daemon.py +341 -0
  58. whatwasit-0.1.0/whatwasit/db.py +174 -0
  59. whatwasit-0.1.0/whatwasit/embedder.py +347 -0
  60. whatwasit-0.1.0/whatwasit/index.py +57 -0
  61. whatwasit-0.1.0/whatwasit/indexer.py +108 -0
  62. whatwasit-0.1.0/whatwasit/interfaces.py +62 -0
  63. whatwasit-0.1.0/whatwasit/models.py +191 -0
  64. whatwasit-0.1.0/whatwasit/output.py +144 -0
  65. whatwasit-0.1.0/whatwasit/parsers/__init__.py +1 -0
  66. whatwasit-0.1.0/whatwasit/parsers/atuin.py +96 -0
  67. whatwasit-0.1.0/whatwasit/parsers/base.py +65 -0
  68. whatwasit-0.1.0/whatwasit/parsers/bash.py +53 -0
  69. whatwasit-0.1.0/whatwasit/parsers/zsh.py +78 -0
  70. whatwasit-0.1.0/whatwasit/search.py +239 -0
  71. whatwasit-0.1.0/whatwasit/sessions.py +229 -0
  72. whatwasit-0.1.0/whatwasit/tui.py +393 -0
@@ -0,0 +1,74 @@
# Continuous integration: run the test suite on every supported Python version
# and verify that the package builds and installs as a wheel.
name: CI

on:
  push:
    branches: [main, "feature/**"]
  pull_request:
    branches: [main]

# Least privilege: every job here only needs to read the checked-out repository.
permissions:
  contents: read

# One active run per workflow + ref; a newer push cancels the run it supersedes.
concurrency:
  group: ci-${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  test:
    name: Python ${{ matrix.python-version }}
    runs-on: ubuntu-latest
    strategy:
      # Let every Python version finish so one failure does not hide the others.
      fail-fast: false
      matrix:
        # Quoted so that "3.10" is not parsed as the float 3.1.
        python-version: ["3.9", "3.10", "3.11", "3.12"]

    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
          cache: pip

      # The cache key changes whenever the embedder code or the dependency
      # metadata changes, which forces a fresh model download.
      - name: Cache embedding model
        uses: actions/cache@v4
        with:
          path: ~/.cache/huggingface
          key: minilm-onnx-${{ runner.os }}-${{ hashFiles('whatwasit/embedder.py', 'pyproject.toml') }}

      - name: Install package and test deps
        run: pip install -e ".[dev]"

      # Build the default embedder and encode once, while network access is
      # still allowed, so the test step below can run with the offline flags set.
      - name: Prefetch ONNX embedding model
        run: |
          python - <<'PY'
          from whatwasit.config import Config
          from whatwasit.embedder import build_embedder
          build_embedder(Config.default()).encode(["warmup"])
          PY

      - name: Run tests
        env:
          # Quoted: these are environment strings, not integers.
          HF_HUB_OFFLINE: "1"
          TRANSFORMERS_OFFLINE: "1"
        run: pytest --cov=whatwasit --cov-report=term-missing

  build:
    name: Build wheel
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-python@v5
        with:
          python-version: "3.12"
          cache: pip

      - name: Install build backend
        run: pip install build

      - name: Build sdist and wheel
        run: python -m build

      # Install the freshly built wheel and check that the console script and
      # the package's __version__ attribute are both usable.
      - name: Smoke-test wheel install
        run: |
          pip install dist/*.whl
          whatwasit --help
          python -c "import whatwasit; print(whatwasit.__version__)"
@@ -0,0 +1,61 @@
name: Publish to PyPI

# Publishes when you publish a GitHub Release (tag vX.Y.Z must match pyproject.toml).
#
# One-time setup:
#   1. PyPI → Account settings → API tokens → "Add API token"
#      - Scope: project "whatwasit" (or entire account for the first upload)
#      - Copy the token (pypi-AgE...); you only see it once
#   2. GitHub repo → Settings → Secrets and variables → Actions → New repository secret
#      - Name: PYPI_API_TOKEN
#      - Value: paste the PyPI token
#
# Release flow:
#   1. Bump version in pyproject.toml and whatwasit/__init__.py
#   2. Commit and push main
#   3. git tag v0.1.0 && git push origin v0.1.0
#   4. GitHub → Releases → Draft new release → pick tag → Publish release

on:
  release:
    types: [published]

# Least privilege: the job only reads the repository; the upload authenticates
# with the PYPI_API_TOKEN secret, not with GITHUB_TOKEN.
permissions:
  contents: read

# One publish run per release tag; never cancel an upload that is in progress.
concurrency:
  group: pypi-${{ github.event.release.tag_name }}
  cancel-in-progress: false

jobs:
  publish:
    name: Build and publish
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-python@v5
        with:
          # tomllib, used by the version check below, requires Python 3.11+.
          python-version: "3.12"

      - name: Verify release tag matches package version
        env:
          # Pass the tag through the environment instead of interpolating the
          # expression into the script text, so a crafted tag name cannot
          # inject shell commands.
          TAG: ${{ github.event.release.tag_name }}
        run: |
          # Strip a leading "v" so tag v0.1.0 compares equal to version 0.1.0.
          VERSION="${TAG#v}"
          PKG_VERSION=$(python - <<'PY'
          import tomllib
          print(tomllib.load(open("pyproject.toml", "rb"))["project"]["version"])
          PY
          )
          if [ "$VERSION" != "$PKG_VERSION" ]; then
            echo "::error::Release tag ${TAG} (version ${VERSION}) does not match pyproject.toml (${PKG_VERSION})"
            exit 1
          fi

      - name: Build package
        run: |
          python -m pip install --upgrade pip build
          python -m build

      - name: Publish to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          packages-dir: dist/
          password: ${{ secrets.PYPI_API_TOKEN }}
@@ -0,0 +1,38 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.egg-info/
5
+ .eggs/
6
+ build/
7
+ dist/
8
+ .pytest_cache/
9
+ .mypy_cache/
10
+ .ruff_cache/
11
+ *.egg
12
+
13
+ # Virtual envs
14
+ .venv/
15
+ venv/
16
+
17
+ # whatwasit local data / generated artifacts
18
+ *.usearch
19
+ *.db
20
+ *.sqlite
21
+ *.sqlite3
22
+ .coverage
23
+ htmlcov/
24
+
25
+ # Editor / IDE
26
+ .idea/
27
+ .vscode/
28
+ .cursor/
29
+
30
+ # Worktrees (created during development, not part of the package)
31
+ /.worktrees/
32
+
33
+ # Local temp (verification artifacts)
34
+ .verify_tmp/
35
+
36
+ # Local-only notes, not for version control
37
+ /FINAL_REPORT.md
38
+ /PROJECT_CONTEXT.md
@@ -0,0 +1,187 @@
1
+ # Architecture
2
+
3
+ This document records the architecture decisions for `whatwasit` and the reasoning
4
+ behind each one. It is the contract that the implementation must follow.
5
+
6
+ ## Goal recap
7
+
8
+ A local-first, privacy-preserving CLI that searches shell history by
9
+ meaning/intent. Group history into time/directory "sessions", embed each session
10
+ locally with a small sentence-embedding model, store vectors in a local on-disk
11
+ index, and do nearest-neighbour search at query time. No cloud, no API keys, no
12
+ data leaves the machine.
13
+
14
+ ---
15
+
16
+ ## Decision 1: Language -- Python
17
+
18
+ **Chosen: Python.** Rust was the alternative.
19
+
20
+ The performance requirements are demanding on paper (index 10k commands in <30s;
21
+ query <1s at 100k+ sessions) but the actual hot paths are all C/native code
22
+ regardless of the host language:
23
+
24
+ - Embedding runs in ONNX Runtime (C++).
25
+ - Nearest-neighbour search runs in usearch (C++) or numpy (BLAS).
26
+
27
+ The Python layer only does parsing, grouping, and SQLite I/O, none of which is a
28
+ bottleneck at this scale. Crucially, **we embed per *session*, not per command**:
29
+ 10k commands collapse to a few hundred to low-thousand sessions, so we run far
30
+ fewer (and batched) encodes than the raw command count suggests.
31
+
32
+ Python also wins on iteration speed and ecosystem (`fastembed`, `usearch`,
33
+ `sentence-transformers`) and is effectively required by Hard Requirement #7
34
+ (`pip install whatwasit`). Rust would give a single static binary and marginally
35
+ faster cold start, but offers no help meeting the latency targets, which are
36
+ already met. Rust is noted in `FUTURE_IDEAS.md` as a future distribution path.
37
+
38
+ ## Decision 2: Embedding model & runtime -- all-MiniLM-L6-v2 on onnxruntime
39
+
40
+ **Chosen: `sentence-transformers/all-MiniLM-L6-v2` (384-dim) run as an ONNX
41
+ graph directly through `onnxruntime` + `tokenizers`, on CPU, fully offline.**
42
+
43
+ The model itself is confirmed as the right choice for short, terse command-line
44
+ text:
45
+
46
+ - 384 dimensions, ~90MB ONNX on disk, single-digit-ms CPU encode when batched.
47
+ - General-purpose semantic similarity model that handles short text well, which
48
+ matches our session documents (a handful of command lines plus the directory
49
+ name).
50
+
51
+ **Runtime decision and the reason it changed.** The spec named
52
+ `sentence-transformers`, which depends on PyTorch (200MB+ even CPU-only) and
53
+ undercuts the lightweight promise. We first adopted `fastembed` (same model,
54
+ ONNX, no PyTorch). However, **measured on the target CPU fastembed delivered
55
+ only ~28 texts/sec**, which blew the budgets (indexing 10k commands took ~34s vs
56
+ the 30s limit, and a query took ~2-3s vs the 1s limit). Profiling showed the
57
+ same ONNX graph driven *directly* through `onnxruntime` -- with
58
+ `graph_optimization_level=ORT_ENABLE_ALL` and `intra_op_num_threads = cpu_count`
59
+ -- runs at **~600 texts/sec (~20x faster)** and produces **bit-for-bit identical
60
+ vectors (cosine 1.0)**. So the embedder now drives onnxruntime itself:
61
+ `tokenizers` for tokenization, the ONNX `model.onnx` for inference, mask-weighted
62
+ mean pooling, then L2 normalization (exactly matching all-MiniLM-L6-v2). The
63
+ ONNX weights + tokenizer are fetched once via `huggingface_hub` and cached.
64
+
65
+ This is precisely the payoff of hiding the model behind the `Embedder` ABC
66
+ (`encode(texts) -> np.ndarray`, rows L2-normalized): the runtime was swapped with
67
+ zero changes to the indexer, search, or CLI. Swapping forward to an
68
+ instruction-tuned asymmetric model (e5/bge) remains a one-class change. See
69
+ `BENCHMARKS.md` for the before/after numbers.
70
+
71
+ Known limitation: queries are natural language ("how did I fix the nginx issue")
72
+ while documents are commands. MiniLM is roughly symmetric and works well enough
73
+ for the MVP; asymmetric query/passage models are listed in `FUTURE_IDEAS.md`.
74
+
75
+ ## Decision 3: Vector storage -- usearch
76
+
77
+ **Chosen: `usearch`**, behind the `VectorIndex` ABC. Candidates compared for the
78
+ 10k-100k+ vector scale:
79
+
80
+ | Option | Verdict | Reasoning |
81
+ |---|---|---|
82
+ | `sqlite-vss` | Rejected | Effectively unmaintained (superseded by `sqlite-vec`); relies on loadable SQLite extensions with spotty/fragile wheels, and is weak on Python 3.13. Install friction directly threatens the clean `pip install` requirement. |
83
+ | `hnswlib` | Viable, not chosen | Fast, mature HNSW, but forces us to hand-roll the key->metadata mapping and index persistence ourselves -- more glue, more ways to desync the index from the DB. |
84
+ | `usearch` | **Chosen** | Single pip wheel across manylinux/macOS/Windows; HNSW *and* exact search; cosine metric; memory-mapped single-file save/load; integer keys map directly to our SQLite `session.id`; scales far past 100k. |
85
+
86
+ **Scale sanity check:** at 100k vectors x 384 float32 (~150MB) even a brute-force
87
+ numpy cosine scan is ~tens of milliseconds, so the <1s query target is met *even
88
+ without* an ANN index. usearch is chosen not because ANN is strictly required at
89
+ this scale, but for clean single-file persistence and headroom to grow. Because
90
+ the store is behind `VectorIndex`, a numpy brute-force backend remains a trivial
91
+ drop-in fallback.
92
+
93
+ The vector file (`index.usearch`) holds vectors keyed by `session.id`; all
94
+ session metadata and command text lives in SQLite. The two are linked solely by
95
+ that integer key.
96
+
97
+ ## Decision 4: Session-grouping algorithm -- time gap + reconstructed cwd
98
+
99
+ **Chosen: split on time gap OR working-directory change, with the working
100
+ directory reconstructed from the command stream.**
101
+
102
+ The spec's heuristic (commands within ~5 min AND in the same directory form one
103
+ session) is sound, but it hits a hard reality: **plain `~/.zsh_history` and
104
+ `~/.bash_history` do not record a working directory per command.** Only atuin
105
+ stores cwd. So the directory half of the heuristic is not directly available for
106
+ the primary data sources.
107
+
108
+ Resolution:
109
+
110
+ 1. **Reconstruct cwd by replaying directory changes.** Walk commands in order,
111
+ maintaining a running cwd. Update it on `cd`, `pushd`, `popd`, and bare `cd`
112
+ (-> home). Resolve relative paths against the current cwd; when a target
113
+ cannot be resolved (e.g. `cd "$VAR"`, `cd $(...)`), keep the previous cwd and
114
+ mark uncertainty rather than guessing.
115
+ 2. **Use real cwd when available.** atuin records the actual cwd, so when reading
116
+ from atuin we use it directly instead of reconstructing.
117
+ 3. **Boundary rule.** Start a new session when the inter-command time gap exceeds
118
+ `session_window_seconds` (default 300) **or** (if `split_on_cwd_change`) the
119
+ cwd changes. Both knobs live in `Config`.
120
+ 4. **Timestamp-less sources.** bash without `HISTTIMEFORMAT` has no timestamps;
121
+ there we fall back to splitting on cwd change in file order only.
122
+
123
+ This keeps the spec's intent (sessions = "what you were doing in one place around
124
+ one time") while being honest about what the data actually contains.
125
+
126
+ ## Decision 5: Extensibility guardrails (applied, not over-built)
127
+
128
+ - `Embedder` ABC wraps the model so it can be swapped without touching callers.
129
+ - `VectorIndex` ABC wraps the store so usearch/hnswlib/numpy are interchangeable.
130
+ - `schema_version` is written to the SQLite `meta` table from day one (currently
131
+ `1`).
132
+ - All tunables (session window, cwd-split toggle, model name, dim, data paths,
133
+ top-k) are centralized in a single `Config` object, not scattered constants.
134
+ - We deliberately do **not** build plugin systems, multi-backend config, or
135
+ speculative feature flags. Anything tempting goes to `FUTURE_IDEAS.md`.
136
+
137
+ ---
138
+
139
+ ## Data model
140
+
141
+ SQLite (`whatwasit.db`) holds metadata; `index.usearch` holds vectors keyed by
142
+ `session.id`.
143
+
144
+ - `meta(key, value)` -- includes `schema_version`.
145
+ - `sessions(id, start_ts, end_ts, cwd, command_count, doc_text)`.
146
+ - `commands(id, session_id, seq, source, ts, duration, exit_code, cwd, raw_cmd)`.
147
+
148
+ `Session.to_document()` is the single source of truth for the embedded text:
149
+ the directory basename followed by the session's commands, in order.
150
+
151
+ ## Pipeline
152
+
153
+ ```mermaid
154
+ flowchart LR
155
+ hfiles["~/.zsh_history / ~/.bash_history / atuin.db"] --> parser
156
+ parser["parsers -> Command[]"] --> grouper
157
+ grouper["sessions: time gap + cwd replay -> Session[]"] --> indexer
158
+ indexer["indexer: persist + embed + add to index"] --> store["SQLite + index.usearch"]
159
+ query["whatwasit 'natural language query'"] --> search
160
+ store --> search
161
+ search["search: embed query -> ANN -> hydrate"] --> output["rich output: ts, cwd, highlighted match + context"]
162
+ ```
163
+
164
+ ## Module boundaries
165
+
166
+ - `whatwasit/config.py`, `whatwasit/models.py`, `whatwasit/db.py`, `whatwasit/interfaces.py` —
167
+ configuration, data model, persistence, and swappable subsystem interfaces.
168
+ - `whatwasit/parsers/{base,zsh,bash,atuin}.py` — history file parsers.
169
+ - `whatwasit/sessions.py` — session grouping (time gap + cwd reconstruction).
170
+ - `whatwasit/embedder.py` — `OnnxEmbedder(Embedder)` (MiniLM via onnxruntime).
171
+ - `whatwasit/index.py` — `UsearchIndex(VectorIndex)`.
172
+ - `whatwasit/indexer.py` — parse → group → persist → embed → index orchestration.
173
+ - `whatwasit/search.py` — query → embed → ANN → length normalization → gated hybrid
174
+ RRF reranking → hydrate → per-command match highlighting.
175
+ - `whatwasit/cli.py`, `whatwasit/output.py` — CLI entry point and rich formatting.
176
+ - `tests/synthetic.py` — synthetic multi-topic history generator (tests + bench).
177
+ - `eval/` — offline search-quality evaluation harness (not shipped in the wheel).
178
+ - `benchmarks/` — performance benchmark script (not shipped in the wheel).
179
+
180
+ ## Performance plan vs requirements
181
+
182
+ - **Index 10k commands < 30s:** embedding is per-session (far fewer than 10k) and
183
+ batched through ONNX; SQLite writes are batched in a transaction.
184
+ - **Query < 1s at 100k+ sessions:** one query encode (~ms) + usearch search
185
+ (sub-ms to low-ms); brute-force numpy would still be tens of ms.
186
+
187
+ Real measured numbers are recorded in `BENCHMARKS.md`.
@@ -0,0 +1,86 @@
1
+ # Benchmarks
2
+
3
+ Fully offline. Embedding model `all-MiniLM-L6-v2` (384-dim ONNX via
4
+ `onnxruntime` + `tokenizers`), vector store `usearch`, SQLite metadata.
5
+
6
+ Reproduce with:
7
+
8
+ ```bash
9
+ HF_HUB_OFFLINE=1 TRANSFORMERS_OFFLINE=1 python benchmarks/run_bench.py --write-md
10
+ ```
11
+
12
+ ## Important note on the measurement machine
13
+
14
+ The first numbers captured, in the "Measured" table below, came from an
15
+ **11th Gen Intel Core i5-11320H that was thermally throttled to ~1.5-1.9 GHz**
16
+ (nominal boost is 4.2 GHz) and under concurrent load at the time. Indexing time
17
+ is dominated by CPU embedding throughput, so those figures were a pessimistic
18
+ worst case, not representative of typical hardware. Both hard requirements are now
19
+ **confirmed passing** on representative, un-throttled hardware — see
20
+ "Confirmed on un-throttled hardware" below.
21
+
22
+ ## Measured (throttled i5-11320H @ ~1.5-1.9 GHz, under load)
23
+
24
+ | Commands | Sessions | Index time | Query latency (avg) | Query latency (max) |
25
+ |---:|---:|---:|---:|---:|
26
+ | 1,000 | 109 | 11.33 s | 1.18 s | 1.48 s |
27
+ | 10,000 | 1,087 | 88.41 s | 1.37 s | 1.85 s |
28
+ | 100,000 | 10,870 | 464.10 s | 610.8 ms | 813.6 ms |
29
+
30
+ (The 1k/10k query rows were measured while duplicate benchmark processes were
31
+ still contending for the CPU; the 100k row ran with less contention, which is why
32
+ its query latency is actually the *lowest* of the three at 611 ms avg.)
33
+
34
+ **Contention is measurable.** Three benchmark passes happened to overlap on this
35
+ machine, and the same 10k indexing workload landed at **33.7 s / 47.8 s / 88.4 s**
36
+ depending on how many passes were fighting for the CPU at that moment. The
37
+ best-observed 10k indexing (33.7 s) is already right at the 30 s target under
38
+ merely partial contention, so an un-throttled, uncontended CPU clears it easily.
39
+
40
+ ## Clean component measurements (same machine, momentarily idle)
41
+
42
+ These isolate the pieces that matter for the requirements and were captured when
43
+ the CPU was briefly idle:
44
+
45
+ | Component | Result |
46
+ |---|---|
47
+ | Embedder throughput (onnxruntime + tokenizers, batched) | **~608 texts/sec** |
48
+ | Same model via `fastembed` (rejected runtime) | ~28 texts/sec (~40 max, even with `parallel=8`) |
49
+ | Raw ONNX batched inference (256 x len-32) | ~163 seq/sec |
50
+ | Nearest-neighbour search over **100,000** session vectors | **1.6 ms avg, 2.1 ms max** |
51
+
52
+ Embedding is per **session**, not per command, so the session count is what drives
53
+ indexing time. At the clean ~608 texts/sec, embedding the 1,087 sessions from
54
+ 10k commands is ~1.8 s; the rest of indexing (parse, group, SQLite writes,
55
+ `usearch.add`) is well under a second.
56
+
57
+ ## Confirmed on un-throttled hardware
58
+
59
+ Measured on an Intel i9-14900 (32 threads, boosts to 5.5 GHz), Ubuntu 22.04,
60
+ idle system, single clean run:
61
+
62
+ | Commands | Sessions | Index time | Query latency (avg) | Query latency (max) |
63
+ |---:|---:|---:|---:|---:|
64
+ | 1,000 | 109 | 0.80 s | 65.1 ms | 101.0 ms |
65
+ | 10,000 | 1,087 | 7.19 s | 165.0 ms | 270.8 ms |
66
+ | 100,000 | 10,870 | 92.89 s | 153.3 ms | 206.5 ms |
67
+
68
+ Pure nearest-neighbour search over 100,000 session vectors: **0.7 ms avg, 0.9 ms max**.
69
+
70
+ ## Requirement checks
71
+
72
+ - **HR#3 - index 10k commands < 30s: PASS, with 4x margin.** 7.19 s measured
73
+ (vs. 88.41 s throttled / 33.7-47.8 s partially-throttled). Indexing scales
74
+ with session count and clock speed as predicted from the clean component
75
+ measurements above, not anything specific to the throttled machine.
76
+ - **HR#5 - query < 1s at 100k+ indexed sessions: PASS, with 5-6x margin.**
77
+ 153.3 ms avg / 206.5 ms max at 100k sessions. Nearest-neighbour search itself
78
+ is negligible (0.7 ms); the ~150-200ms is dominated by the two small ONNX
79
+ forward passes per query (embedding the query text, then batch-embedding the
80
+ matched sessions' commands for highlighting) — a fixed per-query cost that is
81
+ independent of index size, which is exactly why query latency does not grow
82
+ from 10k to 100k sessions.
83
+
84
+ Both hard requirements are now confirmed on representative hardware; the
85
+ throttled numbers above remain as a record of why the embedder was rewritten
86
+ from `fastembed` to direct `onnxruntime`, and as a worst-case baseline.
@@ -0,0 +1,42 @@
1
+ # Future Ideas (explicitly out of MVP scope)
2
+
3
+ These are deliberately **not** built in v1. The MVP scope is fixed to the Hard
4
+ Requirements. Good ideas are parked here instead of expanding scope.
5
+
6
+ ## Captured from the spec (Full Version)
7
+
8
+ - **Background daemon / shell hooks for live indexing.** Embed and index new
9
+ commands as they are run, in the background, with zero perceptible latency.
10
+ This is also the clean way to capture the *real* working directory going
11
+ forward (instead of reconstructing it from `cd` replay).
12
+ - **Fish shell support.** Reading `~/.local/share/fish/fish_history` (YAML-ish).
13
+ - **Interactive mode.** fzf-style fuzzy re-filtering of the ranked results.
14
+ - **Deeper atuin integration.** Beyond reading its DB: respect its richer
15
+ metadata (hostname, session id, exit status) for ranking.
16
+ - **Git semantic search (`giths`).** Same architecture applied to commit
17
+ messages + first N lines of each diff.
18
+ - **Distribution beyond pip.** Homebrew formula; a single static Rust binary for
19
+ the CLI front-end (the embedding/index core could stay Python or move to a
20
+ Rust core with ONNX + usearch bindings).
21
+
22
+ ## Engineering ideas surfaced during design
23
+
24
+ - **Asymmetric query/passage embedding models** (e5, bge, gte) with
25
+ query/passage prefixes, to better match natural-language queries against
26
+ command documents. Trivial to try behind the existing `Embedder` ABC.
27
+ - **Per-command vectors** in addition to per-session, for finer-grained
28
+ highlighting and "jump to the exact command" results.
29
+ - **numpy brute-force `VectorIndex` backend** as a zero-native-dep fallback
30
+ (already trivially supported by the `VectorIndex` interface).
31
+ - **Incremental indexing** (only embed sessions whose commands changed) once a
32
+ live daemon exists, using content hashes.
33
+ - **Schema migrations** beyond the version stamp, once `schema_version > 1`.
34
+ - **Secret redaction** on read (mask things that look like tokens/passwords in
35
+ the displayed output). Shell history is sensitive; worth doing carefully.
36
+ - **Encryption / sync.** Explicitly out of scope (that is atuin's domain), noted
37
+ only for completeness.
38
+
39
+ ## Process note
40
+
41
+ If a feature here starts to feel necessary for the MVP, stop and confirm scope
42
+ before building it.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 ThorOdinson246
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.