whatwasit 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- whatwasit-0.1.0/.github/workflows/ci.yml +74 -0
- whatwasit-0.1.0/.github/workflows/publish.yml +61 -0
- whatwasit-0.1.0/.gitignore +38 -0
- whatwasit-0.1.0/ARCHITECTURE.md +187 -0
- whatwasit-0.1.0/BENCHMARKS.md +86 -0
- whatwasit-0.1.0/FUTURE_IDEAS.md +42 -0
- whatwasit-0.1.0/LICENSE +21 -0
- whatwasit-0.1.0/PKG-INFO +258 -0
- whatwasit-0.1.0/README.md +200 -0
- whatwasit-0.1.0/benchmarks/query_cold_warm.py +143 -0
- whatwasit-0.1.0/benchmarks/run_bench.py +191 -0
- whatwasit-0.1.0/docs/ACCURACY_BGE_RESULTS.md +62 -0
- whatwasit-0.1.0/docs/ACCURACY_RESEARCH.md +247 -0
- whatwasit-0.1.0/docs/BENCHMARKS_BGE.md +31 -0
- whatwasit-0.1.0/eval/README.md +52 -0
- whatwasit-0.1.0/eval/__init__.py +1 -0
- whatwasit-0.1.0/eval/baseline.py +60 -0
- whatwasit-0.1.0/eval/build_dataset.py +807 -0
- whatwasit-0.1.0/eval/metrics.py +82 -0
- whatwasit-0.1.0/eval/metrics_summary.csv +89 -0
- whatwasit-0.1.0/eval/metrics_summary_v2.csv +89 -0
- whatwasit-0.1.0/eval/queries.jsonl +96 -0
- whatwasit-0.1.0/eval/queries_keyword_heavy.jsonl +15 -0
- whatwasit-0.1.0/eval/raw_sources/emir_commands_sample.txt +1500 -0
- whatwasit-0.1.0/eval/raw_sources/hotal_commands_sample.txt +397 -0
- whatwasit-0.1.0/eval/raw_sources/hrsvrn_commands_sample.txt +10 -0
- whatwasit-0.1.0/eval/results.jsonl +192 -0
- whatwasit-0.1.0/eval/results_raw_v2.jsonl +192 -0
- whatwasit-0.1.0/eval/run_eval.py +450 -0
- whatwasit-0.1.0/eval/sessions.jsonl +57 -0
- whatwasit-0.1.0/eval/summary.json +7431 -0
- whatwasit-0.1.0/eval/summary_bge.json +7431 -0
- whatwasit-0.1.0/eval/summary_v2.json +7431 -0
- whatwasit-0.1.0/eval/tables.md +235 -0
- whatwasit-0.1.0/eval/tables_v2.md +235 -0
- whatwasit-0.1.0/pyproject.toml +61 -0
- whatwasit-0.1.0/tests/__init__.py +0 -0
- whatwasit-0.1.0/tests/synthetic.py +214 -0
- whatwasit-0.1.0/tests/test_brand.py +69 -0
- whatwasit-0.1.0/tests/test_cli.py +234 -0
- whatwasit-0.1.0/tests/test_daemon.py +69 -0
- whatwasit-0.1.0/tests/test_embedder.py +242 -0
- whatwasit-0.1.0/tests/test_index.py +106 -0
- whatwasit-0.1.0/tests/test_indexer.py +191 -0
- whatwasit-0.1.0/tests/test_integration.py +97 -0
- whatwasit-0.1.0/tests/test_parsers.py +231 -0
- whatwasit-0.1.0/tests/test_search.py +187 -0
- whatwasit-0.1.0/tests/test_sessions.py +291 -0
- whatwasit-0.1.0/tests/test_synthetic.py +96 -0
- whatwasit-0.1.0/tests/test_tui.py +156 -0
- whatwasit-0.1.0/tests/test_tui_config.py +184 -0
- whatwasit-0.1.0/whatwasit/__init__.py +3 -0
- whatwasit-0.1.0/whatwasit/brand.py +59 -0
- whatwasit-0.1.0/whatwasit/cli.py +216 -0
- whatwasit-0.1.0/whatwasit/config.py +82 -0
- whatwasit-0.1.0/whatwasit/config_loader.py +78 -0
- whatwasit-0.1.0/whatwasit/daemon.py +341 -0
- whatwasit-0.1.0/whatwasit/db.py +174 -0
- whatwasit-0.1.0/whatwasit/embedder.py +347 -0
- whatwasit-0.1.0/whatwasit/index.py +57 -0
- whatwasit-0.1.0/whatwasit/indexer.py +108 -0
- whatwasit-0.1.0/whatwasit/interfaces.py +62 -0
- whatwasit-0.1.0/whatwasit/models.py +191 -0
- whatwasit-0.1.0/whatwasit/output.py +144 -0
- whatwasit-0.1.0/whatwasit/parsers/__init__.py +1 -0
- whatwasit-0.1.0/whatwasit/parsers/atuin.py +96 -0
- whatwasit-0.1.0/whatwasit/parsers/base.py +65 -0
- whatwasit-0.1.0/whatwasit/parsers/bash.py +53 -0
- whatwasit-0.1.0/whatwasit/parsers/zsh.py +78 -0
- whatwasit-0.1.0/whatwasit/search.py +239 -0
- whatwasit-0.1.0/whatwasit/sessions.py +229 -0
- whatwasit-0.1.0/whatwasit/tui.py +393 -0
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main, "feature/**"]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
concurrency:
|
|
10
|
+
group: ci-${{ github.workflow }}-${{ github.ref }}
|
|
11
|
+
cancel-in-progress: true
|
|
12
|
+
|
|
13
|
+
jobs:
|
|
14
|
+
test:
|
|
15
|
+
name: Python ${{ matrix.python-version }}
|
|
16
|
+
runs-on: ubuntu-latest
|
|
17
|
+
strategy:
|
|
18
|
+
fail-fast: false
|
|
19
|
+
matrix:
|
|
20
|
+
python-version: ["3.9", "3.10", "3.11", "3.12"]
|
|
21
|
+
|
|
22
|
+
steps:
|
|
23
|
+
- uses: actions/checkout@v4
|
|
24
|
+
|
|
25
|
+
- uses: actions/setup-python@v5
|
|
26
|
+
with:
|
|
27
|
+
python-version: ${{ matrix.python-version }}
|
|
28
|
+
cache: pip
|
|
29
|
+
|
|
30
|
+
- name: Cache embedding model
|
|
31
|
+
uses: actions/cache@v4
|
|
32
|
+
with:
|
|
33
|
+
path: ~/.cache/huggingface
|
|
34
|
+
key: minilm-onnx-${{ runner.os }}-${{ hashFiles('whatwasit/embedder.py', 'pyproject.toml') }}
|
|
35
|
+
|
|
36
|
+
- name: Install package and test deps
|
|
37
|
+
run: pip install -e ".[dev]"
|
|
38
|
+
|
|
39
|
+
- name: Prefetch ONNX embedding model
|
|
40
|
+
run: |
|
|
41
|
+
python - <<'PY'
|
|
42
|
+
from whatwasit.config import Config
|
|
43
|
+
from whatwasit.embedder import build_embedder
|
|
44
|
+
build_embedder(Config.default()).encode(["warmup"])
|
|
45
|
+
PY
|
|
46
|
+
|
|
47
|
+
- name: Run tests
|
|
48
|
+
env:
|
|
49
|
+
HF_HUB_OFFLINE: "1"
|
|
50
|
+
TRANSFORMERS_OFFLINE: "1"
|
|
51
|
+
run: pytest --cov=whatwasit --cov-report=term-missing
|
|
52
|
+
|
|
53
|
+
build:
|
|
54
|
+
name: Build wheel
|
|
55
|
+
runs-on: ubuntu-latest
|
|
56
|
+
steps:
|
|
57
|
+
- uses: actions/checkout@v4
|
|
58
|
+
|
|
59
|
+
- uses: actions/setup-python@v5
|
|
60
|
+
with:
|
|
61
|
+
python-version: "3.12"
|
|
62
|
+
cache: pip
|
|
63
|
+
|
|
64
|
+
- name: Install build backend
|
|
65
|
+
run: pip install build
|
|
66
|
+
|
|
67
|
+
- name: Build sdist and wheel
|
|
68
|
+
run: python -m build
|
|
69
|
+
|
|
70
|
+
- name: Smoke-test wheel install
|
|
71
|
+
run: |
|
|
72
|
+
pip install dist/*.whl
|
|
73
|
+
whatwasit --help
|
|
74
|
+
python -c "import whatwasit; print(whatwasit.__version__)"
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
# Publishes when you publish a GitHub Release (tag vX.Y.Z must match pyproject.toml).
|
|
4
|
+
#
|
|
5
|
+
# One-time setup:
|
|
6
|
+
# 1. PyPI → Account settings → API tokens → "Add API token"
|
|
7
|
+
# - Scope: project "whatwasit" (or entire account for the first upload)
|
|
8
|
+
# - Copy the token (pypi-AgE...); you only see it once
|
|
9
|
+
# 2. GitHub repo → Settings → Secrets and variables → Actions → New repository secret
|
|
10
|
+
# - Name: PYPI_API_TOKEN
|
|
11
|
+
# - Value: paste the PyPI token
|
|
12
|
+
#
|
|
13
|
+
# Release flow:
|
|
14
|
+
# 1. Bump version in pyproject.toml and whatwasit/__init__.py
|
|
15
|
+
# 2. Commit and push main
|
|
16
|
+
# 3. git tag v0.1.0 && git push origin v0.1.0
|
|
17
|
+
# 4. GitHub → Releases → Draft new release → pick tag → Publish release
|
|
18
|
+
|
|
19
|
+
on:
|
|
20
|
+
release:
|
|
21
|
+
types: [published]
|
|
22
|
+
|
|
23
|
+
concurrency:
|
|
24
|
+
group: pypi-${{ github.event.release.tag_name }}
|
|
25
|
+
cancel-in-progress: false
|
|
26
|
+
|
|
27
|
+
jobs:
|
|
28
|
+
publish:
|
|
29
|
+
name: Build and publish
|
|
30
|
+
runs-on: ubuntu-latest
|
|
31
|
+
steps:
|
|
32
|
+
- uses: actions/checkout@v4
|
|
33
|
+
|
|
34
|
+
- uses: actions/setup-python@v5
|
|
35
|
+
with:
|
|
36
|
+
python-version: "3.12"
|
|
37
|
+
|
|
38
|
+
- name: Verify release tag matches package version
|
|
39
|
+
run: |
|
|
40
|
+
TAG="${{ github.event.release.tag_name }}"
|
|
41
|
+
VERSION="${TAG#v}"
|
|
42
|
+
PKG_VERSION=$(python - <<'PY'
|
|
43
|
+
import tomllib
|
|
44
|
+
print(tomllib.load(open("pyproject.toml", "rb"))["project"]["version"])
|
|
45
|
+
PY
|
|
46
|
+
)
|
|
47
|
+
if [ "$VERSION" != "$PKG_VERSION" ]; then
|
|
48
|
+
echo "::error::Release tag ${TAG} (version ${VERSION}) does not match pyproject.toml (${PKG_VERSION})"
|
|
49
|
+
exit 1
|
|
50
|
+
fi
|
|
51
|
+
|
|
52
|
+
- name: Build package
|
|
53
|
+
run: |
|
|
54
|
+
python -m pip install --upgrade pip build
|
|
55
|
+
python -m build
|
|
56
|
+
|
|
57
|
+
- name: Publish to PyPI
|
|
58
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
59
|
+
with:
|
|
60
|
+
packages-dir: dist/
|
|
61
|
+
password: ${{ secrets.PYPI_API_TOKEN }}
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*.egg-info/
|
|
5
|
+
.eggs/
|
|
6
|
+
build/
|
|
7
|
+
dist/
|
|
8
|
+
.pytest_cache/
|
|
9
|
+
.mypy_cache/
|
|
10
|
+
.ruff_cache/
|
|
11
|
+
*.egg
|
|
12
|
+
|
|
13
|
+
# Virtual envs
|
|
14
|
+
.venv/
|
|
15
|
+
venv/
|
|
16
|
+
|
|
17
|
+
# whatwasit local data / generated artifacts
|
|
18
|
+
*.usearch
|
|
19
|
+
*.db
|
|
20
|
+
*.sqlite
|
|
21
|
+
*.sqlite3
|
|
22
|
+
.coverage
|
|
23
|
+
htmlcov/
|
|
24
|
+
|
|
25
|
+
# Editor / IDE
|
|
26
|
+
.idea/
|
|
27
|
+
.vscode/
|
|
28
|
+
.cursor/
|
|
29
|
+
|
|
30
|
+
# Worktrees (created during development, not part of the package)
|
|
31
|
+
/.worktrees/
|
|
32
|
+
|
|
33
|
+
# Local temp (verification artifacts)
|
|
34
|
+
.verify_tmp/
|
|
35
|
+
|
|
36
|
+
# Local-only notes, not for version control
|
|
37
|
+
/FINAL_REPORT.md
|
|
38
|
+
/PROJECT_CONTEXT.md
|
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
# Architecture
|
|
2
|
+
|
|
3
|
+
This document records the architecture decisions for `whatwasit` and the reasoning
|
|
4
|
+
behind each one. It is the contract that the implementation must follow.
|
|
5
|
+
|
|
6
|
+
## Goal recap
|
|
7
|
+
|
|
8
|
+
A local-first, privacy-preserving CLI that searches shell history by
|
|
9
|
+
meaning/intent. Group history into time/directory "sessions", embed each session
|
|
10
|
+
locally with a small sentence-embedding model, store vectors in a local on-disk
|
|
11
|
+
index, and do nearest-neighbour search at query time. No cloud, no API keys, no
|
|
12
|
+
data leaves the machine.
|
|
13
|
+
|
|
14
|
+
---
|
|
15
|
+
|
|
16
|
+
## Decision 1: Language -- Python
|
|
17
|
+
|
|
18
|
+
**Chosen: Python.** Rust was the alternative.
|
|
19
|
+
|
|
20
|
+
The performance requirements are demanding on paper (index 10k commands in <30s;
|
|
21
|
+
query <1s at 100k+ sessions) but the actual hot paths are all C/native code
|
|
22
|
+
regardless of the host language:
|
|
23
|
+
|
|
24
|
+
- Embedding runs in ONNX Runtime (C++).
|
|
25
|
+
- Nearest-neighbour search runs in usearch (C++) or numpy (BLAS).
|
|
26
|
+
|
|
27
|
+
The Python layer only does parsing, grouping, and SQLite I/O, none of which is a
|
|
28
|
+
bottleneck at this scale. Crucially, **we embed per *session*, not per command**:
|
|
29
|
+
10k commands collapse to a few hundred to low-thousand sessions, so we run far
|
|
30
|
+
fewer (and batched) encodes than the raw command count suggests.
|
|
31
|
+
|
|
32
|
+
Python also wins on iteration speed and ecosystem (`fastembed`, `usearch`,
|
|
33
|
+
`sentence-transformers`) and is effectively required by Hard Requirement #7
|
|
34
|
+
(`pip install whatwasit`). Rust would give a single static binary and marginally
|
|
35
|
+
faster cold start, but offers no help meeting the latency targets, which are
|
|
36
|
+
already met. Rust is noted in `FUTURE_IDEAS.md` as a future distribution path.
|
|
37
|
+
|
|
38
|
+
## Decision 2: Embedding model & runtime -- all-MiniLM-L6-v2 on onnxruntime
|
|
39
|
+
|
|
40
|
+
**Chosen: `sentence-transformers/all-MiniLM-L6-v2` (384-dim) run as an ONNX
|
|
41
|
+
graph directly through `onnxruntime` + `tokenizers`, on CPU, fully offline.**
|
|
42
|
+
|
|
43
|
+
The model itself is confirmed as the right choice for short, terse command-line
|
|
44
|
+
text:
|
|
45
|
+
|
|
46
|
+
- 384 dimensions, ~90MB ONNX on disk, single-digit-ms CPU encode when batched.
|
|
47
|
+
- General-purpose semantic similarity model that handles short text well, which
|
|
48
|
+
matches our session documents (a handful of command lines plus the directory
|
|
49
|
+
name).
|
|
50
|
+
|
|
51
|
+
**Runtime decision and the reason it changed.** The spec named
|
|
52
|
+
`sentence-transformers`, which depends on PyTorch (200MB+ even CPU-only) and
|
|
53
|
+
undercuts the lightweight promise. We first adopted `fastembed` (same model,
|
|
54
|
+
ONNX, no PyTorch). However, **measured on the target CPU fastembed delivered
|
|
55
|
+
only ~28 texts/sec**, which blew the budgets (indexing 10k commands took ~34s vs
|
|
56
|
+
the 30s limit, and a query took ~2-3s vs the 1s limit). Profiling showed the
|
|
57
|
+
same ONNX graph driven *directly* through `onnxruntime` -- with
|
|
58
|
+
`graph_optimization_level=ORT_ENABLE_ALL` and `intra_op_num_threads = cpu_count`
|
|
59
|
+
-- runs at **~600 texts/sec (~20x faster)** and produces **bit-for-bit identical
|
|
60
|
+
vectors (cosine 1.0)**. So the embedder now drives onnxruntime itself:
|
|
61
|
+
`tokenizers` for tokenization, the ONNX `model.onnx` for inference, mask-weighted
|
|
62
|
+
mean pooling, then L2 normalization (exactly matching all-MiniLM-L6-v2). The
|
|
63
|
+
ONNX weights + tokenizer are fetched once via `huggingface_hub` and cached.
|
|
64
|
+
|
|
65
|
+
This is precisely the payoff of hiding the model behind the `Embedder` ABC
|
|
66
|
+
(`encode(texts) -> np.ndarray`, rows L2-normalized): the runtime was swapped with
|
|
67
|
+
zero changes to the indexer, search, or CLI. Swapping forward to an
|
|
68
|
+
instruction-tuned asymmetric model (e5/bge) remains a one-class change. See
|
|
69
|
+
`BENCHMARKS.md` for the before/after numbers.
|
|
70
|
+
|
|
71
|
+
Known limitation: queries are natural language ("how did I fix the nginx issue")
|
|
72
|
+
while documents are commands. MiniLM is roughly symmetric and works well enough
|
|
73
|
+
for the MVP; asymmetric query/passage models are listed in `FUTURE_IDEAS.md`.
|
|
74
|
+
|
|
75
|
+
## Decision 3: Vector storage -- usearch
|
|
76
|
+
|
|
77
|
+
**Chosen: `usearch`**, behind the `VectorIndex` ABC. Candidates compared for the
|
|
78
|
+
10k-100k+ vector scale:
|
|
79
|
+
|
|
80
|
+
| Option | Verdict | Reasoning |
|
|
81
|
+
|---|---|---|
|
|
82
|
+
| `sqlite-vss` | Rejected | Effectively unmaintained (superseded by `sqlite-vec`); relies on loadable SQLite extensions with spotty/fragile wheels, and is weak on Python 3.13. Install friction directly threatens the clean `pip install` requirement. |
|
|
83
|
+
| `hnswlib` | Viable, not chosen | Fast, mature HNSW, but forces us to hand-roll the key->metadata mapping and index persistence ourselves -- more glue, more ways to desync the index from the DB. |
|
|
84
|
+
| `usearch` | **Chosen** | Single pip wheel across manylinux/macOS/Windows; HNSW *and* exact search; cosine metric; memory-mapped single-file save/load; integer keys map directly to our SQLite `session.id`; scales far past 100k. |
|
|
85
|
+
|
|
86
|
+
**Scale sanity check:** at 100k vectors x 384 float32 (~150MB) even a brute-force
|
|
87
|
+
numpy cosine scan is ~tens of milliseconds, so the <1s query target is met *even
|
|
88
|
+
without* an ANN index. usearch is chosen not because ANN is strictly required at
|
|
89
|
+
this scale, but for clean single-file persistence and headroom to grow. Because
|
|
90
|
+
the store is behind `VectorIndex`, a numpy brute-force backend remains a trivial
|
|
91
|
+
drop-in fallback.
|
|
92
|
+
|
|
93
|
+
The vector file (`index.usearch`) holds vectors keyed by `session.id`; all
|
|
94
|
+
session metadata and command text live in SQLite. The two are linked solely by
|
|
95
|
+
that integer key.
|
|
96
|
+
|
|
97
|
+
## Decision 4: Session-grouping algorithm -- time gap + reconstructed cwd
|
|
98
|
+
|
|
99
|
+
**Chosen: split on time gap OR working-directory change, with the working
|
|
100
|
+
directory reconstructed from the command stream.**
|
|
101
|
+
|
|
102
|
+
The spec's heuristic (commands within ~5 min AND in the same directory form one
|
|
103
|
+
session) is sound, but it hits a hard reality: **plain `~/.zsh_history` and
|
|
104
|
+
`~/.bash_history` do not record a working directory per command.** Only atuin
|
|
105
|
+
stores cwd. So the directory half of the heuristic is not directly available for
|
|
106
|
+
the primary data sources.
|
|
107
|
+
|
|
108
|
+
Resolution:
|
|
109
|
+
|
|
110
|
+
1. **Reconstruct cwd by replaying directory changes.** Walk commands in order,
|
|
111
|
+
maintaining a running cwd. Update it on `cd`, `pushd`, `popd`, and bare `cd`
|
|
112
|
+
(-> home). Resolve relative paths against the current cwd; when a target
|
|
113
|
+
cannot be resolved (e.g. `cd "$VAR"`, `cd $(...)`), keep the previous cwd and
|
|
114
|
+
mark uncertainty rather than guessing.
|
|
115
|
+
2. **Use real cwd when available.** atuin records the actual cwd, so when reading
|
|
116
|
+
from atuin we use it directly instead of reconstructing.
|
|
117
|
+
3. **Boundary rule.** Start a new session when the inter-command time gap exceeds
|
|
118
|
+
`session_window_seconds` (default 300) **or** (if `split_on_cwd_change`) the
|
|
119
|
+
cwd changes. Both knobs live in `Config`.
|
|
120
|
+
4. **Timestamp-less sources.** bash without `HISTTIMEFORMAT` has no timestamps;
|
|
121
|
+
there we fall back to splitting on cwd change in file order only.
|
|
122
|
+
|
|
123
|
+
This keeps the spec's intent (sessions = "what you were doing in one place around
|
|
124
|
+
one time") while being honest about what the data actually contains.
|
|
125
|
+
|
|
126
|
+
## Decision 5: Extensibility guardrails (applied, not over-built)
|
|
127
|
+
|
|
128
|
+
- `Embedder` ABC wraps the model so it can be swapped without touching callers.
|
|
129
|
+
- `VectorIndex` ABC wraps the store so usearch/hnswlib/numpy are interchangeable.
|
|
130
|
+
- `schema_version` is written to the SQLite `meta` table from day one (currently
|
|
131
|
+
`1`).
|
|
132
|
+
- All tunables (session window, cwd-split toggle, model name, dim, data paths,
|
|
133
|
+
top-k) are centralized in a single `Config` object, not scattered constants.
|
|
134
|
+
- We deliberately do **not** build plugin systems, multi-backend config, or
|
|
135
|
+
speculative feature flags. Anything tempting goes to `FUTURE_IDEAS.md`.
|
|
136
|
+
|
|
137
|
+
---
|
|
138
|
+
|
|
139
|
+
## Data model
|
|
140
|
+
|
|
141
|
+
SQLite (`whatwasit.db`) holds metadata; `index.usearch` holds vectors keyed by
|
|
142
|
+
`session.id`.
|
|
143
|
+
|
|
144
|
+
- `meta(key, value)` -- includes `schema_version`.
|
|
145
|
+
- `sessions(id, start_ts, end_ts, cwd, command_count, doc_text)`.
|
|
146
|
+
- `commands(id, session_id, seq, source, ts, duration, exit_code, cwd, raw_cmd)`.
|
|
147
|
+
|
|
148
|
+
`Session.to_document()` is the single source of truth for the embedded text:
|
|
149
|
+
the directory basename followed by the session's commands, in order.
|
|
150
|
+
|
|
151
|
+
## Pipeline
|
|
152
|
+
|
|
153
|
+
```mermaid
|
|
154
|
+
flowchart LR
|
|
155
|
+
hfiles["~/.zsh_history / ~/.bash_history / atuin.db"] --> parser
|
|
156
|
+
parser["parsers -> Command[]"] --> grouper
|
|
157
|
+
grouper["sessions: time gap + cwd replay -> Session[]"] --> indexer
|
|
158
|
+
indexer["indexer: persist + embed + add to index"] --> store["SQLite + index.usearch"]
|
|
159
|
+
query["whatwasit 'natural language query'"] --> search
|
|
160
|
+
store --> search
|
|
161
|
+
search["search: embed query -> ANN -> hydrate"] --> output["rich output: ts, cwd, highlighted match + context"]
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
## Module boundaries
|
|
165
|
+
|
|
166
|
+
- `whatwasit/config.py`, `whatwasit/models.py`, `whatwasit/db.py`, `whatwasit/interfaces.py` —
|
|
167
|
+
configuration, data model, persistence, and swappable subsystem interfaces.
|
|
168
|
+
- `whatwasit/parsers/{base,zsh,bash,atuin}.py` — history file parsers.
|
|
169
|
+
- `whatwasit/sessions.py` — session grouping (time gap + cwd reconstruction).
|
|
170
|
+
- `whatwasit/embedder.py` — `OnnxEmbedder(Embedder)` (MiniLM via onnxruntime).
|
|
171
|
+
- `whatwasit/index.py` — `UsearchIndex(VectorIndex)`.
|
|
172
|
+
- `whatwasit/indexer.py` — parse → group → persist → embed → index orchestration.
|
|
173
|
+
- `whatwasit/search.py` — query → embed → ANN → length normalization → gated hybrid
|
|
174
|
+
RRF reranking → hydrate → per-command match highlighting.
|
|
175
|
+
- `whatwasit/cli.py`, `whatwasit/output.py` — CLI entry point and rich formatting.
|
|
176
|
+
- `tests/synthetic.py` — synthetic multi-topic history generator (tests + bench).
|
|
177
|
+
- `eval/` — offline search-quality evaluation harness (not shipped in the wheel).
|
|
178
|
+
- `benchmarks/` — performance benchmark script (not shipped in the wheel).
|
|
179
|
+
|
|
180
|
+
## Performance plan vs requirements
|
|
181
|
+
|
|
182
|
+
- **Index 10k commands < 30s:** embedding is per-session (far fewer than 10k) and
|
|
183
|
+
batched through ONNX; SQLite writes are batched in a transaction.
|
|
184
|
+
- **Query < 1s at 100k+ sessions:** one query encode (~ms) + usearch search
|
|
185
|
+
(sub-ms to low-ms); brute-force numpy would still be tens of ms.
|
|
186
|
+
|
|
187
|
+
Real measured numbers are recorded in `BENCHMARKS.md`.
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
# Benchmarks
|
|
2
|
+
|
|
3
|
+
Fully offline. Embedding model `all-MiniLM-L6-v2` (384-dim ONNX via
|
|
4
|
+
`onnxruntime` + `tokenizers`), vector store `usearch`, SQLite metadata.
|
|
5
|
+
|
|
6
|
+
Reproduce with:
|
|
7
|
+
|
|
8
|
+
```bash
|
|
9
|
+
HF_HUB_OFFLINE=1 TRANSFORMERS_OFFLINE=1 python benchmarks/run_bench.py --write-md
|
|
10
|
+
```
|
|
11
|
+
|
|
12
|
+
## Important note on the measurement machine
|
|
13
|
+
|
|
14
|
+
The first numbers captured, in the "Measured" table below, came from an
|
|
15
|
+
**11th Gen Intel Core i5-11320H that was thermally throttled to ~1.5-1.9 GHz**
|
|
16
|
+
(nominal boost is 4.2 GHz) and under concurrent load at the time. Indexing time
|
|
17
|
+
is dominated by CPU embedding throughput, so those figures were a pessimistic
|
|
18
|
+
lower bound on performance, not representative of typical hardware. Both hard requirements are now
|
|
19
|
+
**confirmed passing** on representative, un-throttled hardware — see
|
|
20
|
+
"Confirmed on un-throttled hardware" below.
|
|
21
|
+
|
|
22
|
+
## Measured (throttled i5-11320H @ ~1.5-1.9 GHz, under load)
|
|
23
|
+
|
|
24
|
+
| Commands | Sessions | Index time | Query latency (avg) | Query latency (max) |
|
|
25
|
+
|---:|---:|---:|---:|---:|
|
|
26
|
+
| 1,000 | 109 | 11.33 s | 1.18 s | 1.48 s |
|
|
27
|
+
| 10,000 | 1,087 | 88.41 s | 1.37 s | 1.85 s |
|
|
28
|
+
| 100,000 | 10,870 | 464.10 s | 610.8 ms | 813.6 ms |
|
|
29
|
+
|
|
30
|
+
(The 1k/10k query rows were measured while duplicate benchmark processes were
|
|
31
|
+
still contending for the CPU; the 100k row ran with less contention, which is why
|
|
32
|
+
its query latency is actually the *lowest* of the three at 611 ms avg.)
|
|
33
|
+
|
|
34
|
+
**Contention is measurable.** Three benchmark passes happened to overlap on this
|
|
35
|
+
machine, and the same 10k indexing workload landed at **33.7 s / 47.8 s / 88.4 s**
|
|
36
|
+
depending on how many passes were fighting for the CPU at that moment. The
|
|
37
|
+
best-observed 10k indexing (33.7 s) is already right at the 30 s target under
|
|
38
|
+
merely partial contention, so an un-throttled, uncontended CPU clears it easily.
|
|
39
|
+
|
|
40
|
+
## Clean component measurements (same machine, momentarily idle)
|
|
41
|
+
|
|
42
|
+
These isolate the pieces that matter for the requirements and were captured when
|
|
43
|
+
the CPU was briefly idle:
|
|
44
|
+
|
|
45
|
+
| Component | Result |
|
|
46
|
+
|---|---|
|
|
47
|
+
| Embedder throughput (onnxruntime + tokenizers, batched) | **~608 texts/sec** |
|
|
48
|
+
| Same model via `fastembed` (rejected runtime) | ~28 texts/sec (~40 max, even with `parallel=8`) |
|
|
49
|
+
| Raw ONNX batched inference (256 x len-32) | ~163 seq/sec |
|
|
50
|
+
| Nearest-neighbour search over **100,000** session vectors | **1.6 ms avg, 2.1 ms max** |
|
|
51
|
+
|
|
52
|
+
Embedding is per **session**, not per command, so the session count is what drives
|
|
53
|
+
indexing time. At the clean ~608 texts/sec, embedding the 1,087 sessions from
|
|
54
|
+
10k commands is ~1.8 s; the rest of indexing (parse, group, SQLite writes,
|
|
55
|
+
`usearch.add`) is well under a second.
|
|
56
|
+
|
|
57
|
+
## Confirmed on un-throttled hardware
|
|
58
|
+
|
|
59
|
+
Measured on an Intel i9-14900 (32 threads, boosts to 5.5 GHz), Ubuntu 22.04,
|
|
60
|
+
idle system, single clean run:
|
|
61
|
+
|
|
62
|
+
| Commands | Sessions | Index time | Query latency (avg) | Query latency (max) |
|
|
63
|
+
|---:|---:|---:|---:|---:|
|
|
64
|
+
| 1,000 | 109 | 0.80 s | 65.1 ms | 101.0 ms |
|
|
65
|
+
| 10,000 | 1,087 | 7.19 s | 165.0 ms | 270.8 ms |
|
|
66
|
+
| 100,000 | 10,870 | 92.89 s | 153.3 ms | 206.5 ms |
|
|
67
|
+
|
|
68
|
+
Pure nearest-neighbour search over 100,000 session vectors: **0.7 ms avg, 0.9 ms max**.
|
|
69
|
+
|
|
70
|
+
## Requirement checks
|
|
71
|
+
|
|
72
|
+
- **HR#3 - index 10k commands < 30s: PASS, with 4x margin.** 7.19 s measured
|
|
73
|
+
(vs. 88.41 s throttled under heavy contention / 33.7-47.8 s throttled under partial contention). Indexing scales
|
|
74
|
+
with session count and clock speed as predicted from the clean component
|
|
75
|
+
measurements above, not anything specific to the throttled machine.
|
|
76
|
+
- **HR#5 - query < 1s at 100k+ indexed sessions: PASS, with 5-6x margin.**
|
|
77
|
+
153.3 ms avg / 206.5 ms max at 100k sessions. Nearest-neighbour search itself
|
|
78
|
+
is negligible (0.7 ms); the ~150-200 ms is dominated by the two small ONNX
|
|
79
|
+
forward passes per query (embedding the query text, then batch-embedding the
|
|
80
|
+
matched sessions' commands for highlighting) — a fixed per-query cost that is
|
|
81
|
+
independent of index size, which is exactly why query latency does not grow
|
|
82
|
+
from 10k to 100k sessions.
|
|
83
|
+
|
|
84
|
+
Both hard requirements are now confirmed on representative hardware; the
|
|
85
|
+
throttled numbers above remain as a record of why the embedder was rewritten
|
|
86
|
+
from `fastembed` to direct `onnxruntime`, and as a worst-case lower bound.
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# Future Ideas (explicitly out of MVP scope)
|
|
2
|
+
|
|
3
|
+
These are deliberately **not** built in v1. The MVP scope is fixed to the Hard
|
|
4
|
+
Requirements. Good ideas are parked here instead of expanding scope.
|
|
5
|
+
|
|
6
|
+
## Captured from the spec (Full Version)
|
|
7
|
+
|
|
8
|
+
- **Background daemon / shell hooks for live indexing.** Embed and index new
|
|
9
|
+
commands as they are run, in the background, with zero perceptible latency.
|
|
10
|
+
This is also the clean way to capture the *real* working directory going
|
|
11
|
+
forward (instead of reconstructing it from `cd` replay).
|
|
12
|
+
- **Fish shell support.** Reading `~/.local/share/fish/fish_history` (YAML-ish).
|
|
13
|
+
- **Interactive mode.** fzf-style fuzzy re-filtering of the ranked results.
|
|
14
|
+
- **Deeper atuin integration.** Beyond reading its DB: respect its richer
|
|
15
|
+
metadata (hostname, session id, exit status) for ranking.
|
|
16
|
+
- **Git semantic search (`giths`).** Same architecture applied to commit
|
|
17
|
+
messages + first N lines of each diff.
|
|
18
|
+
- **Distribution beyond pip.** Homebrew formula; a single static Rust binary for
|
|
19
|
+
the CLI front-end (the embedding/index core could stay Python or move to a
|
|
20
|
+
Rust core with ONNX + usearch bindings).
|
|
21
|
+
|
|
22
|
+
## Engineering ideas surfaced during design
|
|
23
|
+
|
|
24
|
+
- **Asymmetric query/passage embedding models** (e5, bge, gte) with
|
|
25
|
+
query/passage prefixes, to better match natural-language queries against
|
|
26
|
+
command documents. Trivial to try behind the existing `Embedder` ABC.
|
|
27
|
+
- **Per-command vectors** in addition to per-session, for finer-grained
|
|
28
|
+
highlighting and "jump to the exact command" results.
|
|
29
|
+
- **numpy brute-force `VectorIndex` backend** as a zero-native-dep fallback
|
|
30
|
+
(already trivially supported by the `VectorIndex` interface).
|
|
31
|
+
- **Incremental indexing** (only embed sessions whose commands changed) once a
|
|
32
|
+
live daemon exists, using content hashes.
|
|
33
|
+
- **Schema migrations** beyond the version stamp, once `schema_version > 1`.
|
|
34
|
+
- **Secret redaction** on read (mask things that look like tokens/passwords in
|
|
35
|
+
the displayed output). Shell history is sensitive; worth doing carefully.
|
|
36
|
+
- **Encryption / sync.** Explicitly out of scope (that is atuin's domain), noted
|
|
37
|
+
only for completeness.
|
|
38
|
+
|
|
39
|
+
## Process note
|
|
40
|
+
|
|
41
|
+
If a feature here starts to feel necessary for the MVP, stop and confirm scope
|
|
42
|
+
before building it.
|
whatwasit-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 ThorOdinson246
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|