chunkhound-index-compactor 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1 @@
1
+ tests/fixtures/shopware-cli-chunks.duckdb filter=lfs diff=lfs merge=lfs -text
@@ -0,0 +1,53 @@
1
+ # Python bytecode
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # Distribution / packaging
7
+ *.egg-info/
8
+ *.egg
9
+ build/
10
+ dist/
11
+ .eggs/
12
+
13
+ # Virtual environments
14
+ .venv/
15
+ venv/
16
+
17
+ # Node (prettier toolchain)
18
+ node_modules/
19
+
20
+ # Tooling caches
21
+ .pytest_cache/
22
+ .ruff_cache/
23
+ .mypy_cache/
24
+
25
+ # Coverage
26
+ .coverage
27
+ .coverage.*
28
+ htmlcov/
29
+ coverage.xml
30
+
31
+ # IDE
32
+ .idea/
33
+ .vscode/
34
+ *.swp
35
+ *.swo
36
+
37
+ # OS
38
+ .DS_Store
39
+ Thumbs.db
40
+
41
+ # Compactor-produced artifacts (any location)
42
+ *.compacted.duckdb
43
+ *.bak
44
+
45
+ # Superpowers plans and specs stay untracked
46
+ docs/superpowers/plans/
47
+ docs/superpowers/specs/
48
+
49
+ # .claude is excluded except for the shared skills and project settings
50
+ .claude/*
51
+ !.claude/hook-contexts/
52
+ !.claude/skills/
53
+ !.claude/settings.json
@@ -0,0 +1,38 @@
1
+ # Mirrors the local checks documented in CONTRIBUTING.md §Local checks (except `ruff format --check`, which has no hook here).
2
+ # Install once per clone: `pre-commit install`.
3
+ # Run against the whole tree on demand: `pre-commit run --all-files`.
4
+
5
+ repos:
6
+ - repo: https://github.com/astral-sh/ruff-pre-commit
7
+ rev: v0.15.13
8
+ hooks:
9
+ - id: ruff
10
+ # Default invocation is `ruff check` without --fix.
11
+ # Auto-fix is off by project policy; run `uv run ruff check --fix` manually.
12
+
13
+ - repo: https://github.com/crate-ci/typos
14
+ rev: v1.46.2
15
+ hooks:
16
+ - id: typos
17
+
18
+ - repo: https://github.com/rbubley/mirrors-prettier
19
+ rev: v3.8.3
20
+ hooks:
21
+ - id: prettier
22
+ args: [--check]
23
+
24
+ - repo: local
25
+ hooks:
26
+ - id: mypy
27
+ name: mypy
28
+ entry: uv run mypy src/
29
+ language: system
30
+ files: ^(src/.*\.py|pyproject\.toml)$
31
+ pass_filenames: false
32
+
33
+ - id: pytest
34
+ name: pytest
35
+ entry: uv run pytest
36
+ language: system
37
+ always_run: true
38
+ pass_filenames: false
@@ -0,0 +1,17 @@
1
+ node_modules/
2
+ dist/
3
+ .venv/
4
+ .pytest_cache/
5
+ .ruff_cache/
6
+ .mypy_cache/
7
+
8
+ src/
9
+ tests/
10
+
11
+ # Markdown is hand-formatted. Prettier mangles identifiers containing
12
+ # underscores (treats them as emphasis) and pads tables past printWidth.
13
+ **/*.md
14
+
15
+ uv.lock
16
+ package-lock.json
17
+ LICENSE
@@ -0,0 +1,7 @@
1
+ {
2
+ "printWidth": 100,
3
+ "proseWrap": "preserve",
4
+ "tabWidth": 2,
5
+ "trailingComma": "all",
6
+ "endOfLine": "lf"
7
+ }
@@ -0,0 +1,13 @@
1
+ [files]
2
+ extend-exclude = [
3
+ "tests/fixtures/",
4
+ "dist/",
5
+ "node_modules/",
6
+ "uv.lock",
7
+ "package-lock.json",
8
+ "CHANGELOG.md",
9
+ ]
10
+
11
+ [default.extend-words]
12
+ # "unparseable" is a valid English variant; we use it in test names.
13
+ unparseable = "unparseable"
@@ -0,0 +1,79 @@
1
+ # AGENTS.md
2
+
3
+ Operational navigation for LLM coding agents. Human docs: `README.md`, `docs/architecture.md`, `docs/benchmarks.md`, `docs/out-of-scope.md`, `CONTRIBUTING.md`.
4
+
5
+ ## Layout
6
+
7
+ ```
8
+ chunkhound-index-compactor/
9
+ ├── pyproject.toml
10
+ ├── package.json # prettier dev dep (Node)
11
+ ├── .prettierrc.json
12
+ ├── .prettierignore
13
+ ├── .typos.toml
14
+ ├── .github/workflows/ # ci.yml, rolling.yml, release.yml
15
+ ├── README.md
16
+ ├── AGENTS.md
17
+ ├── CLAUDE.md # @AGENTS.md
18
+ ├── CONTRIBUTING.md # dev tooling, CI, release process
19
+ ├── CHANGELOG.md
20
+ ├── LICENSE
21
+ ├── docs/
22
+ │ ├── architecture.md # pipeline, RAM asymmetry, recipe table, vss bundling, ChunkHound compat, refused inputs
23
+ │ ├── benchmarks.md # empirical baseline (1.25 TB ChunkHound index + fixture cross-check)
24
+ │ └── out-of-scope.md # rejected approaches (DiskANN, hnsw_compact_index, M/M0/ef_*, ...)
25
+ ├── src/chunkhound_index_compactor/
26
+ │ ├── __init__.py # public API re-exports
27
+ │ ├── __main__.py # python -m entry
28
+ │ ├── cli.py # Typer app
29
+ │ └── core.py # compaction logic
30
+ └── tests/
31
+ ├── conftest.py # fixtures: populated_db, bloated_db, hnsw_db, shopware_cli_index
32
+ ├── fixtures/ # committed real-world DB artifacts (provenance in conftest.py)
33
+ ├── test_core.py
34
+ ├── test_cli.py
35
+ ├── test_extensions.py
36
+ ├── test_rebuild.py
37
+ └── test_human_size.py
38
+ ```
39
+
40
+ ## Module → symbols
41
+
42
+ | Module | Public | Private |
43
+ |---|---|---|
44
+ | `core.py` | `compact_database`, `restore_indexes`, `replace_with_compacted`, `human_size`, `CompactionResult`, `RestoreResult` | `_topological_order`, `_referenced_tables`, `_reject_unsupported_objects`, `_capture_hnsw_recipes`, `_write_recipe_table`, `_load_bundled_extension`, `_bundled_extension_path`, `_escape_sql_literal`, `RECIPE_TABLE` constant, regexes `_HNSW_RE`, `_HNSW_COLUMN_RE`, `_FK_REFERENCES_RE` |
45
+ | `cli.py` | `app` (Typer), `compact`, `restore` commands; `DefaultCommandGroup` routes bare args to `compact` | (none) |
46
+ | `__main__.py` | `app()` invocation | (none) |
47
+ | `__init__.py` | re-exports from `core` | (none) |
48
+
49
+ ## When to modify
50
+
51
+ | Task | File / symbol |
52
+ |---|---|
53
+ | Rebuild SQL sequence | `core.py` → `compact_database()` |
54
+ | FK ordering | `core.py` → `_topological_order()` / `_referenced_tables()` |
55
+ | Schema/view rejection | `core.py` → `_reject_unsupported_objects()` |
56
+ | HNSW metric recovery / recipe table schema | `core.py` → `_capture_hnsw_recipes()` / `_write_recipe_table()` / `RECIPE_TABLE` |
57
+ | Index restore | `core.py` → `restore_indexes()` |
58
+ | Atomic replace / backup suffix | `core.py` → `replace_with_compacted()` |
59
+ | CLI args / commands / output strings | `cli.py` (`DefaultCommandGroup`, `compact`, `restore`) |
60
+ | Byte formatting | `core.py` → `human_size()` |
61
+ | New public export | `core.py` + `__init__.py` `__all__` |
62
+ | Pipeline narrative, design rationale, refused-input reasoning | `docs/architecture.md` (not here) |
63
+ | Empirical baseline / scale numbers | `docs/benchmarks.md` (not here) |
64
+ | Rejected approaches (DiskANN, hnsw_compact_index, M/M0/ef_*, etc.) | `docs/out-of-scope.md` (not here) |
65
+
66
+ ## Invariants enforced by code
67
+
68
+ - HNSW metric must survive rebuild. Catalog DDL strips `WITH (...)`, so the metric is read from `pragma_hnsw_index_info()` in `_capture_hnsw_recipes`. (architecture.md §ChunkHound compatibility)
69
+ - SQL DDL is built by string interpolation (no parameter binding); escape literals via `_escape_sql_literal`, wrap table and index names in double quotes. (architecture.md §Compaction pipeline)
70
+ - Public-API exceptions (`ValueError`, `FileNotFoundError`, `FileExistsError`) enumerated at README §Library Usage; refused inputs reasoned at architecture.md §Not supported (and why).
71
+ - Reading the source never loads its HNSW into RAM; building the destination HNSW dominates peak RAM. `--skip-hnsw` is the small-RAM unlock; `restore` is a separate-machine step. (architecture.md §RAM cost asymmetry)
72
+
73
+ ## Build / verify
74
+
75
+ - Setup, local-check commands, tooling configs, CI workflow details, and release process at `CONTRIBUTING.md` §Setup, §Local checks, §CI workflows, §Release process.
76
+
77
+ ## Runtime deps
78
+
79
+ - Authoritative constraints at `pyproject.toml`. Load-bearing context: `duckdb` range matches `chunkhound` to stay file-format-compatible; `duckdb-extension-vss>=1.5.2` pins `duckdb==1.5.2` transitively. Python `>=3.10,<3.14`.
@@ -0,0 +1,20 @@
1
+ # Changelog
2
+
3
+ ## [0.1.0] - 2026-05-20
4
+
5
+ ### Added
6
+ - Initial release of `chunkhound-index-compactor`.
7
+ - `chunkhound-index-compactor` CLI (Typer-based) with a `compact` default command and a `restore` subcommand, routed via `DefaultCommandGroup` so `chunkhound-index-compactor SOURCE [TARGET]` still works.
8
+ - `compact_database(source, target, *, skip_hnsw=False)`: rebuild a DuckDB database into a fresh file via a foreign-key-ordered streaming rebuild. Captures the source schema, recreates sequences/tables/indexes in a freshly-allocated file, computes a foreign-key-topological table order, and inserts one table at a time parent-before-child. This sidesteps the FK race that breaks `ATTACH` + `COPY FROM DATABASE` on large FK-bearing databases (e.g. ChunkHound indexes at scale) while still dropping orphaned blocks.
9
+ - HNSW indexes are recreated with the metric recovered from `pragma_hnsw_index_info()`. The catalog DDL strips the `WITH (...)` clause, so a verbatim rebuild would silently reset a `cosine` index to the `l2sq` default and leave it dead (queries fall back to brute force).
10
+ - `--skip-hnsw` flag / `skip_hnsw=True` parameter: rebuild without vector indexes (RAM-flat, smallest output) and record what was stripped in a `_compactor_hnsw_recipe` table inside the output.
11
+ - `restore` CLI command / `restore_indexes()` function: rebuild the stripped HNSW indexes in place from the recipe table, idempotently, on a RAM-capable machine.
12
+ - `replace_with_compacted()`: atomic swap with `.bak` backup.
13
+ - `human_size()`: binary-prefix byte formatting.
14
+ - `CompactionResult` and `RestoreResult` dataclasses.
15
+ - `--replace` flag for in-place compaction with backup.
16
+ - Bundled `vss.duckdb_extension` binary from `duckdb-extension-vss` is `LOAD`ed directly from disk when an HNSW index is present, so compaction of ChunkHound and other vector-search DuckDBs works offline out of the box.
17
+
18
+ ### Fail-hard
19
+ - Sources with non-`main` schemas, views, or foreign-key cycles raise `ValueError` rather than silently dropping objects.
20
+ - On any failure after the target file is created, the partial target is unlinked. A half-written multi-GB file is worse than nothing.
@@ -0,0 +1 @@
1
+ @AGENTS.md
@@ -0,0 +1,85 @@
1
+ # Contributing
2
+
3
+ ## Setup
4
+
5
+ ```bash
6
+ uv sync --extra dev # Python toolchain (mypy, pytest, ruff, typos, pre-commit)
7
+ npm install # Node toolchain (prettier; one-time per clone)
8
+ uv run pre-commit install # wires the git pre-commit hook (one-time per clone)
9
+ ```
10
+
11
+ Python 3.10 through 3.13 supported. macOS and Linux are tested.
12
+
13
+ The pre-commit hook runs ruff, typos, prettier, mypy, and pytest on every commit. Bypass with `git commit --no-verify` when you need to ship a WIP commit; CI still runs the same checks.
14
+
15
+ ## Local checks
16
+
17
+ ```bash
18
+ uv run pytest
19
+ uv run ruff check src/ tests/
20
+ uv run ruff format --check src/ tests/
21
+ uv run mypy src/
22
+ uv run typos
23
+ npx prettier --check .
24
+ ```
25
+
26
+ Apply prettier fixes with `npm run format:fix`. CI runs the same check commands listed above.
27
+
28
+ ### Tooling notes
29
+
30
+ - **Ruff**: `E W F I B C4 UP ARG SIM PTH`, line length 100, `E501` ignored.
31
+ - **MyPy**: strict; `tests.*` relaxed; `duckdb`, `duckdb_extension_vss` missing-imports ignored.
32
+ - **Pytest**: discovers `tests/`. Fixtures bundle real ChunkHound DB samples (see `tests/conftest.py`).
33
+ - **Typos**: config at `.typos.toml`.
34
+ - **Prettier**: config at `.prettierrc.json` and `.prettierignore`. Markdown is excluded because prettier mangles identifiers containing underscores and bloats markdown tables past the 100-column line limit.
35
+
36
+ ## CI workflows
37
+
38
+ Three workflows under `.github/workflows/`. All third-party actions are SHA-pinned with `# vX.Y.Z` comments so Renovate can update them later.
39
+
40
+ ### ci.yml
41
+
42
+ Runs on every push to `main` and every PR targeting `main`. Six parallel jobs:
43
+
44
+ | Job | What |
45
+ |-------------|---------------------------------------------------------------------------------------|
46
+ | `lint` | `ruff check` and `ruff format --check` |
47
+ | `typecheck` | `mypy src/` |
48
+ | `test` | `pytest` with coverage on a Python 3.10 / 3.11 / 3.12 / 3.13 matrix (`ubuntu-latest`) |
49
+ | `typos` | `typos` against the repo |
50
+ | `prettier` | `prettier --check .` |
51
+ | `build` | `uv build`; uploads wheel + sdist as a `dist` artifact (14-day retention) |
52
+
53
+ Coverage is reported but not gated. The `build` job runs independently of the others, so PR reviewers can download a wheel even when other jobs fail.
54
+
55
+ ### rolling.yml
56
+
57
+ After `ci.yml` succeeds on `main`, deletes the existing `rolling` tag plus release and recreates them at the new HEAD SHA. Marked prerelease and not-latest so it never shadows a tagged release.
58
+
59
+ Install the current main-branch build from the rolling asset:
60
+
61
+ ```bash
62
+ uv tool install https://github.com/it-bens/chunkhound-index-compactor/releases/download/rolling/chunkhound_index_compactor-<version>-py3-none-any.whl
63
+ ```
64
+
65
+ ### release.yml
66
+
67
+ Triggers on a `v*.*.*` tag push or manual `workflow_dispatch` against a tag ref.
68
+
69
+ 1. Verifies the ref is a tag.
70
+ 2. Verifies the tag (minus the `v` prefix) matches `pyproject.toml`'s `version`.
71
+ 3. Builds wheel + sdist.
72
+ 4. Publishes to PyPI via Trusted Publisher (OIDC; no PyPI token in the repo).
73
+ 5. Creates a GitHub Release with the wheel + sdist attached. Release notes are not auto-generated; edit them on GitHub afterward.
74
+
75
+ ## Release process
76
+
77
+ To cut a release:
78
+
79
+ 1. Bump `version` in `pyproject.toml` (for example, `0.1.0` to `0.2.0`) and update `CHANGELOG.md`.
80
+ 2. Commit, push, wait for CI to pass on `main`.
81
+ 3. Tag and push: `git tag v0.2.0 && git push origin v0.2.0`.
82
+ 4. The release workflow publishes to PyPI and creates a GitHub Release.
83
+ 5. Open the Release on GitHub and write the release notes.
84
+
85
+ To re-trigger from a tag manually (for example, after a transient PyPI failure): GitHub Actions, then Release, then Run workflow, then pick the tag from the ref dropdown.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Martin Bens
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,17 @@
1
+ Metadata-Version: 2.4
2
+ Name: chunkhound-index-compactor
3
+ Version: 0.1.0
4
+ Summary: Compact a DuckDB database (ChunkHound index or otherwise) by rebuilding it into a fresh file
5
+ License-Expression: MIT
6
+ License-File: LICENSE
7
+ Requires-Python: <3.14,>=3.10
8
+ Requires-Dist: duckdb-extension-vss>=1.5.2
9
+ Requires-Dist: duckdb<1.5.3.dev0,>=1.4.0
10
+ Requires-Dist: typer>=0.25
11
+ Provides-Extra: dev
12
+ Requires-Dist: mypy>=2.1; extra == 'dev'
13
+ Requires-Dist: pre-commit>=4.5; extra == 'dev'
14
+ Requires-Dist: pytest-cov>=7.0; extra == 'dev'
15
+ Requires-Dist: pytest>=9.0; extra == 'dev'
16
+ Requires-Dist: ruff>=0.15; extra == 'dev'
17
+ Requires-Dist: typos>=1.46; extra == 'dev'
@@ -0,0 +1,103 @@
1
+ # ChunkHound Index Compactor
2
+
3
+ Compact a [DuckDB](https://duckdb.org) database by rebuilding it into a fresh file. The motivating use case was shrinking a bloated [ChunkHound](https://github.com/chunkhound/chunkhound) index (whose per-batch HNSW re-serialization leaves large amounts of orphaned-but-counted blocks), but the implementation is fully generic; it works on any single-schema DuckDB file.
4
+
5
+ ## ⚡ Quick Start
6
+
7
+ ```bash
8
+ uvx chunkhound-index-compactor path/to/db.duckdb
9
+ # writes path/to/db.duckdb.compacted
10
+
11
+ uvx chunkhound-index-compactor path/to/db.duckdb --replace
12
+ # swaps in the compacted copy and keeps the original at path/to/db.duckdb.bak
13
+
14
+ uvx chunkhound-index-compactor path/to/db.duckdb --skip-hnsw
15
+ # skips rebuilding vector indexes (RAM-flat, smallest output); restore them later
16
+ uvx chunkhound-index-compactor restore path/to/db.duckdb.compacted
17
+ ```
18
+
19
+ The source is opened read-only, but an active writer holds the file lock. Close any process writing to the database before running.
20
+
21
+ ## 🖥️ CLI Usage
22
+
23
+ ```
24
+ $ chunkhound-index-compactor --help
25
+ Usage: chunkhound-index-compactor [OPTIONS] COMMAND [ARGS]...
26
+
27
+ Commands:
28
+ compact Compact a DuckDB database by rebuilding it into a fresh file. (default)
29
+ restore Rebuild HNSW vector indexes in a --skip-hnsw artifact, in place.
30
+ ```
31
+
32
+ A bare invocation routes to `compact`, so `chunkhound-index-compactor SOURCE` still works:
33
+
34
+ ```
35
+ chunkhound-index-compactor SOURCE [TARGET] [--replace] [--skip-hnsw]
36
+ chunkhound-index-compactor restore DATABASE
37
+ ```
38
+
39
+ | Argument / Option | Meaning |
40
+ |-------------------|-----------------------------------------------------------------------------------|
41
+ | `SOURCE` | Path to the existing DuckDB file (required) |
42
+ | `TARGET` | Path for the compacted output [default: `<source>.compacted`] |
43
+ | `--replace` | After success, replace source with the compacted file (original → `<source>.bak`) |
44
+ | `--skip-hnsw` | Do not rebuild vector indexes; write a recipe table for later `restore` |
45
+
46
+ With `--skip-hnsw`, the output has no vector index and falls back to a brute-force scan (correct, just unaccelerated) until you run `restore`. Rebuilding the HNSW is the memory-dominant step, so `--skip-hnsw` lets you compact on a small machine and `restore` on a RAM-capable one. See [docs/benchmarks.md](docs/benchmarks.md) for peak-RAM numbers and [docs/architecture.md §RAM cost asymmetry](docs/architecture.md#ram-cost-asymmetry) for why.
47
+
48
+ ## 🐍 Library Usage
49
+
50
+ ```python
51
+ from pathlib import Path
52
+ from chunkhound_index_compactor import compact_database, restore_indexes, replace_with_compacted
53
+
54
+ result = compact_database(Path("big.duckdb"), Path("small.duckdb"))
55
+ print(f"{result.source_size} -> {result.target_size} ({result.delta_pct:+.1f}%)")
56
+
57
+ # Small-RAM path: skip the vector index, restore it later on a bigger machine.
58
+ compact_database(Path("big.duckdb"), Path("small-no-hnsw.duckdb"), skip_hnsw=True)
59
+ restored = restore_indexes(Path("small-no-hnsw.duckdb"))
60
+ print(f"restored: {restored.restored}")
61
+
62
+ # Optional: swap in place with .bak backup
63
+ backup = replace_with_compacted(result.source, result.target)
64
+ ```
65
+
66
+ `compact_database()` raises:
67
+ - `ValueError`: `target` resolves to the same path as `source`, the source has a non-`main` schema or a view, or the FK graph has a cycle.
68
+ - `FileNotFoundError`: `source` does not exist.
69
+ - `FileExistsError`: `target` already exists.
70
+
71
+ `restore_indexes()` raises:
72
+ - `FileNotFoundError`: `database` does not exist.
73
+ - `ValueError`: `database` has no `_compactor_hnsw_recipe` table (not a `--skip-hnsw` artifact).
74
+
75
+ `replace_with_compacted()` raises `FileExistsError` if `<source>.bak` already exists. It refuses to overwrite an existing backup.
76
+
77
+ ## 🚫 Not Supported
78
+
79
+ The tool fails hard rather than silently dropping anything it cannot reproduce:
80
+
81
+ - Non-`main` schemas and views (raise `ValueError`).
82
+ - Foreign-key cycles among tables (raise `ValueError`).
83
+ - HNSW tuning parameters other than `metric` (`M`, `M0`, `ef_construction`, `ef_search`) are the one documented exception: they are not recoverable from a built index, so the index is rebuilt at the `vss` defaults instead of failing.
84
+
85
+ See [docs/architecture.md](docs/architecture.md#not-supported-and-why) for the reasoning, and [docs/out-of-scope.md](docs/out-of-scope.md) for approaches considered and not pursued.
86
+
87
+ ## 🏗️ Development
88
+
89
+ Setup, local checks, CI, and release process: [CONTRIBUTING.md](CONTRIBUTING.md).
90
+
91
+ ## ⚖️ License
92
+
93
+ MIT
94
+
95
+ ---
96
+
97
+ > [!NOTE]
98
+ > Yes, an AI wrote this README. And the code, the docs, the tests, and
99
+ > the `.claude/skills` it now uses to write the next round. Yes, a human
100
+ > told it to keep the emojis. The human has ADHD, which, as it turns
101
+ > out, means his brain was already doing attention re-routing and
102
+ > context-window thrashing before LLMs made it cool. They call him ...
103
+ > LLMartin. The emojis are a feature.
@@ -0,0 +1,29 @@
1
+ {
2
+ "name": "chunkhound-index-compactor-devtools",
3
+ "lockfileVersion": 3,
4
+ "requires": true,
5
+ "packages": {
6
+ "": {
7
+ "name": "chunkhound-index-compactor-devtools",
8
+ "devDependencies": {
9
+ "prettier": "3.8.3"
10
+ }
11
+ },
12
+ "node_modules/prettier": {
13
+ "version": "3.8.3",
14
+ "resolved": "https://registry.npmjs.org/prettier/-/prettier-3.8.3.tgz",
15
+ "integrity": "sha512-7igPTM53cGHMW8xWuVTydi2KO233VFiTNyF5hLJqpilHfmn8C8gPf+PS7dUT64YcXFbiMGZxS9pCSxL/Dxm/Jw==",
16
+ "dev": true,
17
+ "license": "MIT",
18
+ "bin": {
19
+ "prettier": "bin/prettier.cjs"
20
+ },
21
+ "engines": {
22
+ "node": ">=14"
23
+ },
24
+ "funding": {
25
+ "url": "https://github.com/prettier/prettier?sponsor=1"
26
+ }
27
+ }
28
+ }
29
+ }
@@ -0,0 +1,12 @@
1
+ {
2
+ "name": "chunkhound-index-compactor-devtools",
3
+ "private": true,
4
+ "description": "Local dev tooling (prettier). Not published. Not a runtime dep.",
5
+ "scripts": {
6
+ "format": "prettier --check .",
7
+ "format:fix": "prettier --write ."
8
+ },
9
+ "devDependencies": {
10
+ "prettier": "3.8.3"
11
+ }
12
+ }
@@ -0,0 +1,68 @@
1
+ [project]
2
+ name = "chunkhound-index-compactor"
3
+ version = "0.1.0"
4
+ description = "Compact a DuckDB database (ChunkHound index or otherwise) by rebuilding it into a fresh file"
5
+ requires-python = ">=3.10,<3.14"
6
+ license = "MIT"
7
+ dependencies = [
8
+ "duckdb>=1.4.0,<1.5.3.dev0",
9
+ "duckdb-extension-vss>=1.5.2",
10
+ "typer>=0.25",
11
+ ]
12
+
13
+ [project.optional-dependencies]
14
+ dev = [
15
+ "mypy>=2.1",
16
+ "pre-commit>=4.5",
17
+ "pytest>=9.0",
18
+ "pytest-cov>=7.0",
19
+ "ruff>=0.15",
20
+ "typos>=1.46",
21
+ ]
22
+
23
+ [project.scripts]
24
+ chunkhound-index-compactor = "chunkhound_index_compactor.cli:app"
25
+
26
+ [build-system]
27
+ requires = ["hatchling"]
28
+ build-backend = "hatchling.build"
29
+
30
+ [tool.hatch.build.targets.wheel]
31
+ packages = ["src/chunkhound_index_compactor"]
32
+
33
+ [tool.hatch.build.targets.sdist]
34
+ exclude = ["tests/", "docs/", ".github/", ".claude/**"]
35
+
36
+ [tool.pytest.ini_options]
37
+ testpaths = ["tests"]
38
+ addopts = ["-v", "--tb=short", "--strict-markers"]
39
+
40
+ [tool.ruff]
41
+ target-version = "py310"
42
+ line-length = 100
43
+
44
+ [tool.ruff.lint]
45
+ select = ["E", "W", "F", "I", "B", "C4", "UP", "ARG", "SIM", "PTH"]
46
+ ignore = ["E501"]
47
+
48
+ [tool.ruff.lint.isort]
49
+ known-first-party = ["chunkhound_index_compactor"]
50
+
51
+ [tool.ruff.format]
52
+ quote-style = "double"
53
+ indent-style = "space"
54
+
55
+ [tool.mypy]
56
+ python_version = "3.10"
57
+ strict = true
58
+ warn_return_any = true
59
+ warn_unused_ignores = true
60
+
61
+ [[tool.mypy.overrides]]
62
+ module = ["tests.*"]
63
+ disallow_untyped_defs = false
64
+ disallow_untyped_calls = false
65
+
66
+ [[tool.mypy.overrides]]
67
+ module = ["duckdb", "duckdb_extension_vss"]
68
+ ignore_missing_imports = true
@@ -0,0 +1,19 @@
1
+ """Compact a DuckDB database by rebuilding it into a fresh file."""
2
+
3
+ from .core import (
4
+ CompactionResult,
5
+ RestoreResult,
6
+ compact_database,
7
+ human_size,
8
+ replace_with_compacted,
9
+ restore_indexes,
10
+ )
11
+
12
+ __all__ = [
13
+ "CompactionResult",
14
+ "RestoreResult",
15
+ "compact_database",
16
+ "human_size",
17
+ "replace_with_compacted",
18
+ "restore_indexes",
19
+ ]
@@ -0,0 +1,5 @@
1
+ """Allow running as `python -m chunkhound_index_compactor`."""
2
+
3
+ from .cli import app
4
+
5
+ app()