chunkhound-index-compactor 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chunkhound_index_compactor-0.1.0/.gitattributes +1 -0
- chunkhound_index_compactor-0.1.0/.gitignore +53 -0
- chunkhound_index_compactor-0.1.0/.pre-commit-config.yaml +38 -0
- chunkhound_index_compactor-0.1.0/.prettierignore +17 -0
- chunkhound_index_compactor-0.1.0/.prettierrc.json +7 -0
- chunkhound_index_compactor-0.1.0/.typos.toml +13 -0
- chunkhound_index_compactor-0.1.0/AGENTS.md +79 -0
- chunkhound_index_compactor-0.1.0/CHANGELOG.md +20 -0
- chunkhound_index_compactor-0.1.0/CLAUDE.md +1 -0
- chunkhound_index_compactor-0.1.0/CONTRIBUTING.md +85 -0
- chunkhound_index_compactor-0.1.0/LICENSE +21 -0
- chunkhound_index_compactor-0.1.0/PKG-INFO +17 -0
- chunkhound_index_compactor-0.1.0/README.md +103 -0
- chunkhound_index_compactor-0.1.0/package-lock.json +29 -0
- chunkhound_index_compactor-0.1.0/package.json +12 -0
- chunkhound_index_compactor-0.1.0/pyproject.toml +68 -0
- chunkhound_index_compactor-0.1.0/src/chunkhound_index_compactor/__init__.py +19 -0
- chunkhound_index_compactor-0.1.0/src/chunkhound_index_compactor/__main__.py +5 -0
- chunkhound_index_compactor-0.1.0/src/chunkhound_index_compactor/cli.py +123 -0
- chunkhound_index_compactor-0.1.0/src/chunkhound_index_compactor/core.py +368 -0
- chunkhound_index_compactor-0.1.0/uv.lock +720 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
tests/fixtures/shopware-cli-chunks.duckdb filter=lfs diff=lfs merge=lfs -text
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# Python bytecode
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# Distribution / packaging
|
|
7
|
+
*.egg-info/
|
|
8
|
+
*.egg
|
|
9
|
+
build/
|
|
10
|
+
dist/
|
|
11
|
+
.eggs/
|
|
12
|
+
|
|
13
|
+
# Virtual environments
|
|
14
|
+
.venv/
|
|
15
|
+
venv/
|
|
16
|
+
|
|
17
|
+
# Node (prettier toolchain)
|
|
18
|
+
node_modules/
|
|
19
|
+
|
|
20
|
+
# Tooling caches
|
|
21
|
+
.pytest_cache/
|
|
22
|
+
.ruff_cache/
|
|
23
|
+
.mypy_cache/
|
|
24
|
+
|
|
25
|
+
# Coverage
|
|
26
|
+
.coverage
|
|
27
|
+
.coverage.*
|
|
28
|
+
htmlcov/
|
|
29
|
+
coverage.xml
|
|
30
|
+
|
|
31
|
+
# IDE
|
|
32
|
+
.idea/
|
|
33
|
+
.vscode/
|
|
34
|
+
*.swp
|
|
35
|
+
*.swo
|
|
36
|
+
|
|
37
|
+
# OS
|
|
38
|
+
.DS_Store
|
|
39
|
+
Thumbs.db
|
|
40
|
+
|
|
41
|
+
# Compactor-produced artifacts (any location)
|
|
42
|
+
*.compacted.duckdb
|
|
43
|
+
*.bak
|
|
44
|
+
|
|
45
|
+
# Superpowers plans and specs stay untracked
|
|
46
|
+
docs/superpowers/plans/
|
|
47
|
+
docs/superpowers/specs/
|
|
48
|
+
|
|
49
|
+
# .claude is excluded except for the shared hook contexts, skills, and project settings
|
|
50
|
+
.claude/*
|
|
51
|
+
!.claude/hook-contexts/
|
|
52
|
+
!.claude/skills/
|
|
53
|
+
!.claude/settings.json
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# Mirrors the local checks documented in CONTRIBUTING.md §Local checks.
|
|
2
|
+
# Install once per clone: `pre-commit install`.
|
|
3
|
+
# Run against the whole tree on demand: `pre-commit run --all-files`.
|
|
4
|
+
|
|
5
|
+
repos:
|
|
6
|
+
- repo: https://github.com/astral-sh/ruff-pre-commit
|
|
7
|
+
rev: v0.15.13
|
|
8
|
+
hooks:
|
|
9
|
+
- id: ruff
|
|
10
|
+
# Default invocation is `ruff check` without --fix.
|
|
11
|
+
# Auto-fix is off by project policy; run `uv run ruff check --fix` manually.
|
|
12
|
+
|
|
13
|
+
- repo: https://github.com/crate-ci/typos
|
|
14
|
+
rev: v1.46.2
|
|
15
|
+
hooks:
|
|
16
|
+
- id: typos
|
|
17
|
+
|
|
18
|
+
- repo: https://github.com/rbubley/mirrors-prettier
|
|
19
|
+
rev: v3.8.3
|
|
20
|
+
hooks:
|
|
21
|
+
- id: prettier
|
|
22
|
+
args: [--check]
|
|
23
|
+
|
|
24
|
+
- repo: local
|
|
25
|
+
hooks:
|
|
26
|
+
- id: mypy
|
|
27
|
+
name: mypy
|
|
28
|
+
entry: uv run mypy src/
|
|
29
|
+
language: system
|
|
30
|
+
files: ^(src/.*\.py|pyproject\.toml)$
|
|
31
|
+
pass_filenames: false
|
|
32
|
+
|
|
33
|
+
- id: pytest
|
|
34
|
+
name: pytest
|
|
35
|
+
entry: uv run pytest
|
|
36
|
+
language: system
|
|
37
|
+
always_run: true
|
|
38
|
+
pass_filenames: false
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
node_modules/
|
|
2
|
+
dist/
|
|
3
|
+
.venv/
|
|
4
|
+
.pytest_cache/
|
|
5
|
+
.ruff_cache/
|
|
6
|
+
.mypy_cache/
|
|
7
|
+
|
|
8
|
+
src/
|
|
9
|
+
tests/
|
|
10
|
+
|
|
11
|
+
# Markdown is hand-formatted. Prettier mangles identifiers containing
|
|
12
|
+
# underscores (treats them as emphasis) and pads tables past printWidth.
|
|
13
|
+
**/*.md
|
|
14
|
+
|
|
15
|
+
uv.lock
|
|
16
|
+
package-lock.json
|
|
17
|
+
LICENSE
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
[files]
|
|
2
|
+
extend-exclude = [
|
|
3
|
+
"tests/fixtures/",
|
|
4
|
+
"dist/",
|
|
5
|
+
"node_modules/",
|
|
6
|
+
"uv.lock",
|
|
7
|
+
"package-lock.json",
|
|
8
|
+
"CHANGELOG.md",
|
|
9
|
+
]
|
|
10
|
+
|
|
11
|
+
[default.extend-words]
|
|
12
|
+
# "unparseable" is a valid English variant; we use it in test names.
|
|
13
|
+
unparseable = "unparseable"
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
# AGENTS.md
|
|
2
|
+
|
|
3
|
+
Operational navigation for LLM coding agents. Human docs: `README.md`, `docs/architecture.md`, `docs/benchmarks.md`, `docs/out-of-scope.md`, `CONTRIBUTING.md`.
|
|
4
|
+
|
|
5
|
+
## Layout
|
|
6
|
+
|
|
7
|
+
```
|
|
8
|
+
chunkhound-index-compactor/
|
|
9
|
+
├── pyproject.toml
|
|
10
|
+
├── package.json # prettier dev dep (Node)
|
|
11
|
+
├── .prettierrc.json
|
|
12
|
+
├── .prettierignore
|
|
13
|
+
├── .typos.toml
|
|
14
|
+
├── .github/workflows/ # ci.yml, rolling.yml, release.yml
|
|
15
|
+
├── README.md
|
|
16
|
+
├── AGENTS.md
|
|
17
|
+
├── CLAUDE.md # @AGENTS.md
|
|
18
|
+
├── CONTRIBUTING.md # dev tooling, CI, release process
|
|
19
|
+
├── CHANGELOG.md
|
|
20
|
+
├── LICENSE
|
|
21
|
+
├── docs/
|
|
22
|
+
│ ├── architecture.md # pipeline, RAM asymmetry, recipe table, vss bundling, ChunkHound compat, refused inputs
|
|
23
|
+
│ ├── benchmarks.md # empirical baseline (1.25 TB ChunkHound index + fixture cross-check)
|
|
24
|
+
│ └── out-of-scope.md # rejected approaches (DiskANN, hnsw_compact_index, M/M0/ef_*, ...)
|
|
25
|
+
├── src/chunkhound_index_compactor/
|
|
26
|
+
│ ├── __init__.py # public API re-exports
|
|
27
|
+
│ ├── __main__.py # python -m entry
|
|
28
|
+
│ ├── cli.py # Typer app
|
|
29
|
+
│ └── core.py # compaction logic
|
|
30
|
+
└── tests/
|
|
31
|
+
├── conftest.py # fixtures: populated_db, bloated_db, hnsw_db, shopware_cli_index
|
|
32
|
+
├── fixtures/ # committed real-world DB artifacts (provenance in conftest.py)
|
|
33
|
+
├── test_core.py
|
|
34
|
+
├── test_cli.py
|
|
35
|
+
├── test_extensions.py
|
|
36
|
+
├── test_rebuild.py
|
|
37
|
+
└── test_human_size.py
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Module → symbols
|
|
41
|
+
|
|
42
|
+
| Module | Public | Private |
|
|
43
|
+
|---|---|---|
|
|
44
|
+
| `core.py` | `compact_database`, `restore_indexes`, `replace_with_compacted`, `human_size`, `CompactionResult`, `RestoreResult` | `_topological_order`, `_referenced_tables`, `_reject_unsupported_objects`, `_capture_hnsw_recipes`, `_write_recipe_table`, `_load_bundled_extension`, `_bundled_extension_path`, `_escape_sql_literal`, `RECIPE_TABLE` constant, regexes `_HNSW_RE`, `_HNSW_COLUMN_RE`, `_FK_REFERENCES_RE` |
|
|
45
|
+
| `cli.py` | `app` (Typer), `compact`, `restore` commands; `DefaultCommandGroup` routes bare args to `compact` | (none) |
|
|
46
|
+
| `__main__.py` | `app()` invocation | (none) |
|
|
47
|
+
| `__init__.py` | re-exports from `core` | (none) |
|
|
48
|
+
|
|
49
|
+
## When to modify
|
|
50
|
+
|
|
51
|
+
| Task | File / symbol |
|
|
52
|
+
|---|---|
|
|
53
|
+
| Rebuild SQL sequence | `core.py` → `compact_database()` |
|
|
54
|
+
| FK ordering | `core.py` → `_topological_order()` / `_referenced_tables()` |
|
|
55
|
+
| Schema/view rejection | `core.py` → `_reject_unsupported_objects()` |
|
|
56
|
+
| HNSW metric recovery / recipe table schema | `core.py` → `_capture_hnsw_recipes()` / `_write_recipe_table()` / `RECIPE_TABLE` |
|
|
57
|
+
| Index restore | `core.py` → `restore_indexes()` |
|
|
58
|
+
| Atomic replace / backup suffix | `core.py` → `replace_with_compacted()` |
|
|
59
|
+
| CLI args / commands / output strings | `cli.py` (`DefaultCommandGroup`, `compact`, `restore`) |
|
|
60
|
+
| Byte formatting | `core.py` → `human_size()` |
|
|
61
|
+
| New public export | `core.py` + `__init__.py` `__all__` |
|
|
62
|
+
| Pipeline narrative, design rationale, refused-input reasoning | `docs/architecture.md` (not here) |
|
|
63
|
+
| Empirical baseline / scale numbers | `docs/benchmarks.md` (not here) |
|
|
64
|
+
| Rejected approaches (DiskANN, hnsw_compact_index, M/M0/ef_*, etc.) | `docs/out-of-scope.md` (not here) |
|
|
65
|
+
|
|
66
|
+
## Invariants enforced by code
|
|
67
|
+
|
|
68
|
+
- HNSW metric must survive rebuild. Catalog DDL strips `WITH (...)`, so the metric is read from `pragma_hnsw_index_info()` in `_capture_hnsw_recipes`. (architecture.md §ChunkHound compatibility)
|
|
69
|
+
- SQL DDL is built by string interpolation (no parameter binding); escape literals via `_escape_sql_literal`, wrap table and index names in double quotes. (architecture.md §Compaction pipeline)
|
|
70
|
+
- Public-API exceptions (`ValueError`, `FileNotFoundError`, `FileExistsError`) enumerated at README §Library Usage; refused inputs reasoned at architecture.md §Not supported (and why).
|
|
71
|
+
- Reading the source never loads its HNSW into RAM; building the destination HNSW dominates peak RAM. `--skip-hnsw` is the small-RAM unlock; `restore` is a separate-machine step. (architecture.md §RAM cost asymmetry)
|
|
72
|
+
|
|
73
|
+
## Build / verify
|
|
74
|
+
|
|
75
|
+
- Setup, local-check commands, tooling configs, CI workflow details, and release process at `CONTRIBUTING.md` §Setup, §Local checks, §CI workflows, §Release process.
|
|
76
|
+
|
|
77
|
+
## Runtime deps
|
|
78
|
+
|
|
79
|
+
- Authoritative constraints at `pyproject.toml`. Load-bearing context: `duckdb` range matches `chunkhound` to stay file-format-compatible; `duckdb-extension-vss>=1.5.2` pins `duckdb==1.5.2` transitively. Python `>=3.10,<3.14`.
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## [0.1.0] - 2026-05-20
|
|
4
|
+
|
|
5
|
+
### Added
|
|
6
|
+
- Initial release of `chunkhound-index-compactor`.
|
|
7
|
+
- `chunkhound-index-compactor` CLI (Typer-based) with a `compact` default command and a `restore` subcommand, routed via `DefaultCommandGroup` so `chunkhound-index-compactor SOURCE [TARGET]` still works.
|
|
8
|
+
- `compact_database(source, target, *, skip_hnsw=False)`: rebuild a DuckDB database into a fresh file via a foreign-key-ordered streaming rebuild. Captures the source schema, recreates sequences/tables/indexes in a freshly-allocated file, computes a foreign-key-topological table order, and inserts one table at a time parent-before-child. This sidesteps the FK race that breaks `ATTACH` + `COPY FROM DATABASE` on large FK-bearing databases (e.g. ChunkHound indexes at scale) while still dropping orphaned blocks.
|
|
9
|
+
- HNSW indexes are recreated with the metric recovered from `pragma_hnsw_index_info()`. The catalog DDL strips the `WITH (...)` clause, so a verbatim rebuild would silently reset a `cosine` index to the `l2sq` default and leave it dead (queries fall back to brute force).
|
|
10
|
+
- `--skip-hnsw` flag / `skip_hnsw=True` parameter: rebuild without vector indexes (RAM-flat, smallest output) and record what was stripped in a `_compactor_hnsw_recipe` table inside the output.
|
|
11
|
+
- `restore` CLI command / `restore_indexes()` function: rebuild the stripped HNSW indexes in place from the recipe table, idempotently, on a RAM-capable machine.
|
|
12
|
+
- `replace_with_compacted()`: atomic swap with `.bak` backup.
|
|
13
|
+
- `human_size()`: binary-prefix byte formatting.
|
|
14
|
+
- `CompactionResult` and `RestoreResult` dataclasses.
|
|
15
|
+
- `--replace` flag for in-place compaction with backup.
|
|
16
|
+
- Bundled `vss.duckdb_extension` binary from `duckdb-extension-vss` is `LOAD`ed directly from disk when an HNSW index is present, so compaction of ChunkHound and other vector-search DuckDBs works offline out of the box.
|
|
17
|
+
|
|
18
|
+
### Fail-hard
|
|
19
|
+
- Sources with non-`main` schemas, views, or foreign-key cycles raise `ValueError` rather than silently dropping objects.
|
|
20
|
+
- On any failure after the target file is created, the partial target is unlinked. A half-written multi-GB file is worse than nothing.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
@AGENTS.md
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
# Contributing
|
|
2
|
+
|
|
3
|
+
## Setup
|
|
4
|
+
|
|
5
|
+
```bash
|
|
6
|
+
uv sync --extra dev # Python toolchain (mypy, pytest, ruff, typos, pre-commit)
|
|
7
|
+
npm install # Node toolchain (prettier; one-time per clone)
|
|
8
|
+
uv run pre-commit install # wires the git pre-commit hook (one-time per clone)
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
Python 3.10 through 3.13 supported. macOS and Linux are tested.
|
|
12
|
+
|
|
13
|
+
The pre-commit hook runs ruff, typos, prettier, mypy, and pytest on every commit. Bypass with `git commit --no-verify` when you need to ship a WIP commit; CI still runs the same checks.
|
|
14
|
+
|
|
15
|
+
## Local checks
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
uv run pytest
|
|
19
|
+
uv run ruff check src/ tests/
|
|
20
|
+
uv run ruff format --check src/ tests/
|
|
21
|
+
uv run mypy src/
|
|
22
|
+
uv run typos
|
|
23
|
+
npx prettier --check .
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
Apply prettier fixes with `npm run format:fix`. CI runs the same commands.
|
|
27
|
+
|
|
28
|
+
### Tooling notes
|
|
29
|
+
|
|
30
|
+
- **Ruff**: `E W F I B C4 UP ARG SIM PTH`, line length 100, `E501` ignored.
|
|
31
|
+
- **MyPy**: strict; `tests.*` relaxed; `duckdb`, `duckdb_extension_vss` missing-imports ignored.
|
|
32
|
+
- **Pytest**: discovers `tests/`. Fixtures bundle real ChunkHound DB samples (see `tests/conftest.py`).
|
|
33
|
+
- **Typos**: config at `.typos.toml`.
|
|
34
|
+
- **Prettier**: config at `.prettierrc.json` and `.prettierignore`. Markdown is excluded because prettier mangles identifiers containing underscores and bloats markdown tables past the 100-column line limit.
|
|
35
|
+
|
|
36
|
+
## CI workflows
|
|
37
|
+
|
|
38
|
+
Three workflows under `.github/workflows/`. All third-party actions are SHA-pinned with `# vX.Y.Z` comments so Renovate can update them later.
|
|
39
|
+
|
|
40
|
+
### ci.yml
|
|
41
|
+
|
|
42
|
+
Runs on every push to `main` and every PR targeting `main`. Six parallel jobs:
|
|
43
|
+
|
|
44
|
+
| Job | What |
|
|
45
|
+
|-------------|---------------------------------------------------------------------------------------|
|
|
46
|
+
| `lint` | `ruff check` and `ruff format --check` |
|
|
47
|
+
| `typecheck` | `mypy src/` |
|
|
48
|
+
| `test` | `pytest` with coverage on a Python 3.10 / 3.11 / 3.12 / 3.13 matrix (`ubuntu-latest`) |
|
|
49
|
+
| `typos` | `typos` against the repo |
|
|
50
|
+
| `prettier` | `prettier --check .` |
|
|
51
|
+
| `build` | `uv build`; uploads wheel + sdist as a `dist` artifact (14-day retention) |
|
|
52
|
+
|
|
53
|
+
Coverage is reported but not gated. The `build` job runs independently of the others, so PR reviewers can download a wheel even when other jobs fail.
|
|
54
|
+
|
|
55
|
+
### rolling.yml
|
|
56
|
+
|
|
57
|
+
After `ci.yml` succeeds on `main`, deletes the existing `rolling` tag plus release and recreates them at the new HEAD SHA. Marked prerelease and not-latest so it never shadows a tagged release.
|
|
58
|
+
|
|
59
|
+
Install the current main-branch build from the rolling asset:
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
uv tool install https://github.com/it-bens/chunkhound-index-compactor/releases/download/rolling/chunkhound_index_compactor-<version>-py3-none-any.whl
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
### release.yml
|
|
66
|
+
|
|
67
|
+
Triggers on a `v*.*.*` tag push or manual `workflow_dispatch` against a tag ref.
|
|
68
|
+
|
|
69
|
+
1. Verifies the ref is a tag.
|
|
70
|
+
2. Verifies the tag (minus the `v` prefix) matches `pyproject.toml`'s `version`.
|
|
71
|
+
3. Builds wheel + sdist.
|
|
72
|
+
4. Publishes to PyPI via Trusted Publisher (OIDC; no PyPI token in the repo).
|
|
73
|
+
5. Creates a GitHub Release with the wheel + sdist attached. Release notes are not auto-generated; edit them on GitHub afterward.
|
|
74
|
+
|
|
75
|
+
## Release process
|
|
76
|
+
|
|
77
|
+
To cut a release:
|
|
78
|
+
|
|
79
|
+
1. Bump `version` in `pyproject.toml` (for example, `0.1.0` to `0.2.0`) and update `CHANGELOG.md`.
|
|
80
|
+
2. Commit, push, wait for CI to pass on `main`.
|
|
81
|
+
3. Tag and push: `git tag v0.2.0 && git push origin v0.2.0`.
|
|
82
|
+
4. The release workflow publishes to PyPI and creates a GitHub Release.
|
|
83
|
+
5. Open the Release on GitHub and write the release notes.
|
|
84
|
+
|
|
85
|
+
To re-trigger from a tag manually (for example, after a transient PyPI failure): GitHub Actions, then Release, then Run workflow, then pick the tag from the ref dropdown.
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Martin Bens
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: chunkhound-index-compactor
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Compact a DuckDB database (ChunkHound index or otherwise) by rebuilding it into a fresh file
|
|
5
|
+
License-Expression: MIT
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
Requires-Python: <3.14,>=3.10
|
|
8
|
+
Requires-Dist: duckdb-extension-vss>=1.5.2
|
|
9
|
+
Requires-Dist: duckdb<1.5.3.dev0,>=1.4.0
|
|
10
|
+
Requires-Dist: typer>=0.25
|
|
11
|
+
Provides-Extra: dev
|
|
12
|
+
Requires-Dist: mypy>=2.1; extra == 'dev'
|
|
13
|
+
Requires-Dist: pre-commit>=4.5; extra == 'dev'
|
|
14
|
+
Requires-Dist: pytest-cov>=7.0; extra == 'dev'
|
|
15
|
+
Requires-Dist: pytest>=9.0; extra == 'dev'
|
|
16
|
+
Requires-Dist: ruff>=0.15; extra == 'dev'
|
|
17
|
+
Requires-Dist: typos>=1.46; extra == 'dev'
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
# ChunkHound Index Compactor
|
|
2
|
+
|
|
3
|
+
Compact a [DuckDB](https://duckdb.org) database by rebuilding it into a fresh file. The motivating use case was shrinking a bloated [ChunkHound](https://github.com/chunkhound/chunkhound) index (whose per-batch HNSW re-serialization leaves large amounts of orphaned-but-counted blocks), but the implementation is fully generic; it works on any single-schema DuckDB file.
|
|
4
|
+
|
|
5
|
+
## ⚡ Quick Start
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
uvx chunkhound-index-compactor path/to/db.duckdb
|
|
9
|
+
# writes path/to/db.duckdb.compacted
|
|
10
|
+
|
|
11
|
+
uvx chunkhound-index-compactor path/to/db.duckdb --replace
|
|
12
|
+
# swaps in the compacted copy and keeps the original at path/to/db.duckdb.bak
|
|
13
|
+
|
|
14
|
+
uvx chunkhound-index-compactor path/to/db.duckdb --skip-hnsw
|
|
15
|
+
# skips rebuilding vector indexes (RAM-flat, smallest output); restore them later
|
|
16
|
+
uvx chunkhound-index-compactor restore path/to/db.duckdb.compacted
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
The source is opened read-only, but opening it still fails while another process holds the file's write lock. Close any process writing to the database before running.
|
|
20
|
+
|
|
21
|
+
## 🖥️ CLI Usage
|
|
22
|
+
|
|
23
|
+
```
|
|
24
|
+
$ chunkhound-index-compactor --help
|
|
25
|
+
Usage: chunkhound-index-compactor [OPTIONS] COMMAND [ARGS]...
|
|
26
|
+
|
|
27
|
+
Commands:
|
|
28
|
+
compact Compact a DuckDB database by rebuilding it into a fresh file. (default)
|
|
29
|
+
restore Rebuild HNSW vector indexes in a --skip-hnsw artifact, in place.
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
A bare invocation routes to `compact`, so `chunkhound-index-compactor SOURCE` still works:
|
|
33
|
+
|
|
34
|
+
```
|
|
35
|
+
chunkhound-index-compactor SOURCE [TARGET] [--replace] [--skip-hnsw]
|
|
36
|
+
chunkhound-index-compactor restore DATABASE
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
| Argument / Option | Meaning |
|
|
40
|
+
|-------------------|-----------------------------------------------------------------------------------|
|
|
41
|
+
| `SOURCE` | Path to the existing DuckDB file (required) |
|
|
42
|
+
| `TARGET` | Path for the compacted output [default: `<source>.compacted`] |
|
|
43
|
+
| `--replace` | After success, replace source with the compacted file (original → `<source>.bak`) |
|
|
44
|
+
| `--skip-hnsw` | Do not rebuild vector indexes; write a recipe table for later `restore` |
|
|
45
|
+
|
|
46
|
+
With `--skip-hnsw`, the output has no vector index, so queries fall back to a brute-force scan (correct, just unaccelerated) until you run `restore`. Rebuilding the HNSW index is the memory-dominant step, so `--skip-hnsw` lets you compact on a small machine and `restore` on a RAM-capable one. See [docs/benchmarks.md](docs/benchmarks.md) for peak-RAM numbers and [docs/architecture.md §RAM cost asymmetry](docs/architecture.md#ram-cost-asymmetry) for why.
|
|
47
|
+
|
|
48
|
+
## 🐍 Library Usage
|
|
49
|
+
|
|
50
|
+
```python
|
|
51
|
+
from pathlib import Path
|
|
52
|
+
from chunkhound_index_compactor import compact_database, restore_indexes, replace_with_compacted
|
|
53
|
+
|
|
54
|
+
result = compact_database(Path("big.duckdb"), Path("small.duckdb"))
|
|
55
|
+
print(f"{result.source_size} -> {result.target_size} ({result.delta_pct:+.1f}%)")
|
|
56
|
+
|
|
57
|
+
# Small-RAM alternative (use instead of the call above; the target must not exist yet): skip the vector index, restore it later on a bigger machine.
|
|
58
|
+
compact_database(Path("big.duckdb"), Path("small.duckdb"), skip_hnsw=True)
|
|
59
|
+
restored = restore_indexes(Path("small.duckdb"))
|
|
60
|
+
print(f"restored: {restored.restored}")
|
|
61
|
+
|
|
62
|
+
# Optional: swap in place with .bak backup
|
|
63
|
+
backup = replace_with_compacted(result.source, result.target)
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
`compact_database()` raises:
|
|
67
|
+
- `ValueError`: `target` resolves to the same path as `source`, the source has a non-`main` schema or a view, or the FK graph has a cycle.
|
|
68
|
+
- `FileNotFoundError`: `source` does not exist.
|
|
69
|
+
- `FileExistsError`: `target` already exists.
|
|
70
|
+
|
|
71
|
+
`restore_indexes()` raises:
|
|
72
|
+
- `FileNotFoundError`: `database` does not exist.
|
|
73
|
+
- `ValueError`: `database` has no `_compactor_hnsw_recipe` table (not a `--skip-hnsw` artifact).
|
|
74
|
+
|
|
75
|
+
`replace_with_compacted()` raises `FileExistsError` if `<source>.bak` already exists. It refuses to overwrite an existing backup.
|
|
76
|
+
|
|
77
|
+
## 🚫 Not Supported
|
|
78
|
+
|
|
79
|
+
The tool fails hard rather than silently dropping anything it cannot reproduce:
|
|
80
|
+
|
|
81
|
+
- Non-`main` schemas and views (raise `ValueError`).
|
|
82
|
+
- Foreign-key cycles among tables (raise `ValueError`).
|
|
83
|
+
- HNSW tuning parameters other than `metric` (`M`, `M0`, `ef_construction`, `ef_search`) are not preserved; they are not recoverable from a built index, so indexes are rebuilt at the `vss` defaults.
|
|
84
|
+
|
|
85
|
+
See [docs/architecture.md](docs/architecture.md#not-supported-and-why) for the reasoning, and [docs/out-of-scope.md](docs/out-of-scope.md) for approaches considered and not pursued.
|
|
86
|
+
|
|
87
|
+
## 🏗️ Development
|
|
88
|
+
|
|
89
|
+
Setup, local checks, CI, and release process: [CONTRIBUTING.md](CONTRIBUTING.md).
|
|
90
|
+
|
|
91
|
+
## ⚖️ License
|
|
92
|
+
|
|
93
|
+
MIT
|
|
94
|
+
|
|
95
|
+
---
|
|
96
|
+
|
|
97
|
+
> [!NOTE]
|
|
98
|
+
> Yes, an AI wrote this README. And the code, the docs, the tests, and
|
|
99
|
+
> the `.claude/skills` it now uses to write the next round. Yes, a human
|
|
100
|
+
> told it to keep the emojis. The human has ADHD, which, as it turns
|
|
101
|
+
> out, means his brain was already doing attention re-routing and
|
|
102
|
+
> context-window thrashing before LLMs made it cool. They call him ...
|
|
103
|
+
> LLMartin. The emojis are a feature.
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "chunkhound-index-compactor-devtools",
|
|
3
|
+
"lockfileVersion": 3,
|
|
4
|
+
"requires": true,
|
|
5
|
+
"packages": {
|
|
6
|
+
"": {
|
|
7
|
+
"name": "chunkhound-index-compactor-devtools",
|
|
8
|
+
"devDependencies": {
|
|
9
|
+
"prettier": "3.8.3"
|
|
10
|
+
}
|
|
11
|
+
},
|
|
12
|
+
"node_modules/prettier": {
|
|
13
|
+
"version": "3.8.3",
|
|
14
|
+
"resolved": "https://registry.npmjs.org/prettier/-/prettier-3.8.3.tgz",
|
|
15
|
+
"integrity": "sha512-7igPTM53cGHMW8xWuVTydi2KO233VFiTNyF5hLJqpilHfmn8C8gPf+PS7dUT64YcXFbiMGZxS9pCSxL/Dxm/Jw==",
|
|
16
|
+
"dev": true,
|
|
17
|
+
"license": "MIT",
|
|
18
|
+
"bin": {
|
|
19
|
+
"prettier": "bin/prettier.cjs"
|
|
20
|
+
},
|
|
21
|
+
"engines": {
|
|
22
|
+
"node": ">=14"
|
|
23
|
+
},
|
|
24
|
+
"funding": {
|
|
25
|
+
"url": "https://github.com/prettier/prettier?sponsor=1"
|
|
26
|
+
}
|
|
27
|
+
}
|
|
28
|
+
}
|
|
29
|
+
}
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "chunkhound-index-compactor-devtools",
|
|
3
|
+
"private": true,
|
|
4
|
+
"description": "Local dev tooling (prettier). Not published. Not a runtime dep.",
|
|
5
|
+
"scripts": {
|
|
6
|
+
"format": "prettier --check .",
|
|
7
|
+
"format:fix": "prettier --write ."
|
|
8
|
+
},
|
|
9
|
+
"devDependencies": {
|
|
10
|
+
"prettier": "3.8.3"
|
|
11
|
+
}
|
|
12
|
+
}
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "chunkhound-index-compactor"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Compact a DuckDB database (ChunkHound index or otherwise) by rebuilding it into a fresh file"
|
|
5
|
+
requires-python = ">=3.10,<3.14"
|
|
6
|
+
license = "MIT"
|
|
7
|
+
dependencies = [
|
|
8
|
+
"duckdb>=1.4.0,<1.5.3.dev0",
|
|
9
|
+
"duckdb-extension-vss>=1.5.2",
|
|
10
|
+
"typer>=0.25",
|
|
11
|
+
]
|
|
12
|
+
|
|
13
|
+
[project.optional-dependencies]
|
|
14
|
+
dev = [
|
|
15
|
+
"mypy>=2.1",
|
|
16
|
+
"pre-commit>=4.5",
|
|
17
|
+
"pytest>=9.0",
|
|
18
|
+
"pytest-cov>=7.0",
|
|
19
|
+
"ruff>=0.15",
|
|
20
|
+
"typos>=1.46",
|
|
21
|
+
]
|
|
22
|
+
|
|
23
|
+
[project.scripts]
|
|
24
|
+
chunkhound-index-compactor = "chunkhound_index_compactor.cli:app"
|
|
25
|
+
|
|
26
|
+
[build-system]
|
|
27
|
+
requires = ["hatchling"]
|
|
28
|
+
build-backend = "hatchling.build"
|
|
29
|
+
|
|
30
|
+
[tool.hatch.build.targets.wheel]
|
|
31
|
+
packages = ["src/chunkhound_index_compactor"]
|
|
32
|
+
|
|
33
|
+
[tool.hatch.build.targets.sdist]
|
|
34
|
+
exclude = ["tests/", "docs/", ".github/", ".claude/**"]
|
|
35
|
+
|
|
36
|
+
[tool.pytest.ini_options]
|
|
37
|
+
testpaths = ["tests"]
|
|
38
|
+
addopts = ["-v", "--tb=short", "--strict-markers"]
|
|
39
|
+
|
|
40
|
+
[tool.ruff]
|
|
41
|
+
target-version = "py310"
|
|
42
|
+
line-length = 100
|
|
43
|
+
|
|
44
|
+
[tool.ruff.lint]
|
|
45
|
+
select = ["E", "W", "F", "I", "B", "C4", "UP", "ARG", "SIM", "PTH"]
|
|
46
|
+
ignore = ["E501"]
|
|
47
|
+
|
|
48
|
+
[tool.ruff.lint.isort]
|
|
49
|
+
known-first-party = ["chunkhound_index_compactor"]
|
|
50
|
+
|
|
51
|
+
[tool.ruff.format]
|
|
52
|
+
quote-style = "double"
|
|
53
|
+
indent-style = "space"
|
|
54
|
+
|
|
55
|
+
[tool.mypy]
|
|
56
|
+
python_version = "3.10"
|
|
57
|
+
strict = true
|
|
58
|
+
warn_return_any = true
|
|
59
|
+
warn_unused_ignores = true
|
|
60
|
+
|
|
61
|
+
[[tool.mypy.overrides]]
|
|
62
|
+
module = ["tests.*"]
|
|
63
|
+
disallow_untyped_defs = false
|
|
64
|
+
disallow_untyped_calls = false
|
|
65
|
+
|
|
66
|
+
[[tool.mypy.overrides]]
|
|
67
|
+
module = ["duckdb", "duckdb_extension_vss"]
|
|
68
|
+
ignore_missing_imports = true
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""Compact a DuckDB database by copying it into a fresh file."""
|
|
2
|
+
|
|
3
|
+
from .core import (
|
|
4
|
+
CompactionResult,
|
|
5
|
+
RestoreResult,
|
|
6
|
+
compact_database,
|
|
7
|
+
human_size,
|
|
8
|
+
replace_with_compacted,
|
|
9
|
+
restore_indexes,
|
|
10
|
+
)
|
|
11
|
+
|
|
12
|
+
__all__ = [
|
|
13
|
+
"CompactionResult",
|
|
14
|
+
"RestoreResult",
|
|
15
|
+
"compact_database",
|
|
16
|
+
"human_size",
|
|
17
|
+
"replace_with_compacted",
|
|
18
|
+
"restore_indexes",
|
|
19
|
+
]
|