simplevecdb 2.4.0__tar.gz → 2.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- simplevecdb-2.6.0/.bandit +9 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/.gitignore +4 -2
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/CHANGELOG.md +160 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/PKG-INFO +39 -3
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/README.md +19 -2
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/docs/CHANGELOG.md +160 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/docs/api/encryption.md +53 -6
- simplevecdb-2.6.0/lefthook.yml +39 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/pyproject.toml +39 -3
- simplevecdb-2.6.0/scripts/bump_version.py +88 -0
- simplevecdb-2.6.0/scripts/check_version_sync.py +92 -0
- simplevecdb-2.6.0/scripts/track_metrics.py +82 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/src/simplevecdb/__init__.py +13 -1
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/src/simplevecdb/async_core.py +59 -10
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/src/simplevecdb/config.py +18 -3
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/src/simplevecdb/constants.py +6 -1
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/src/simplevecdb/core.py +335 -67
- simplevecdb-2.6.0/src/simplevecdb/embeddings/__init__.py +12 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/src/simplevecdb/embeddings/models.py +38 -9
- simplevecdb-2.6.0/src/simplevecdb/embeddings/server.py +637 -0
- simplevecdb-2.6.0/src/simplevecdb/encryption.py +672 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/src/simplevecdb/engine/catalog.py +328 -150
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/src/simplevecdb/engine/clustering.py +13 -2
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/src/simplevecdb/engine/quantization.py +42 -5
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/src/simplevecdb/engine/search.py +118 -59
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/src/simplevecdb/engine/usearch_index.py +115 -56
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/src/simplevecdb/integrations/langchain.py +27 -5
- simplevecdb-2.6.0/src/simplevecdb/integrations/llamaindex.py +355 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/src/simplevecdb/logging.py +9 -39
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/src/simplevecdb/utils.py +162 -1
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/integration/test_langchain.py +1 -1
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/integration/test_llamaindex.py +4 -4
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/integration/test_rag.py +12 -3
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/integration/test_server.py +15 -10
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/core/test_core_additional_coverage.py +2 -2
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/core/test_initialization.py +2 -2
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/core/test_missing_coverage.py +6 -7
- simplevecdb-2.6.0/tests/unit/core/test_v25_correctness.py +267 -0
- simplevecdb-2.6.0/tests/unit/core/test_v25_features.py +344 -0
- simplevecdb-2.6.0/tests/unit/core/test_v25_robustness.py +306 -0
- simplevecdb-2.6.0/tests/unit/core/test_v26_safety.py +182 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/embeddings/test_models.py +1 -0
- simplevecdb-2.6.0/tests/unit/embeddings/test_repo_id_validation.py +100 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/embeddings/test_server.py +33 -25
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/embeddings/test_server_coverage.py +1 -2
- simplevecdb-2.6.0/tests/unit/embeddings/test_v25_enhancements.py +380 -0
- simplevecdb-2.6.0/tests/unit/engine/test_v26_quantization_clustering.py +133 -0
- simplevecdb-2.6.0/tests/unit/integrations/test_llamaindex_review_pass_3.py +154 -0
- simplevecdb-2.6.0/tests/unit/integrations/test_llamaindex_v26.py +190 -0
- simplevecdb-2.6.0/tests/unit/test_async_v26.py +115 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/test_catalog_coverage.py +10 -12
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/test_core.py +6 -7
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/test_encryption_coverage.py +0 -1
- simplevecdb-2.6.0/tests/unit/test_encryption_salt.py +98 -0
- simplevecdb-2.6.0/tests/unit/test_encryption_v1_format.py +220 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/test_error_handling.py +0 -16
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/test_search_missing_coverage.py +2 -3
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/test_usearch_index_missing_coverage.py +2 -3
- simplevecdb-2.6.0/tests/unit/test_v26_encryption_review_pass_3.py +214 -0
- simplevecdb-2.6.0/tests/unit/test_v26_misc.py +184 -0
- simplevecdb-2.6.0/tests/unit/test_v26_review_pass_3.py +270 -0
- simplevecdb-2.6.0/tests/unit/test_v26_review_pass_4.py +179 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/uv.lock +245 -227
- simplevecdb-2.4.0/.bandit +0 -9
- simplevecdb-2.4.0/.claude/settings.local.json +0 -8
- simplevecdb-2.4.0/.pre-commit-config.yaml +0 -37
- simplevecdb-2.4.0/src/simplevecdb/embeddings/__init__.py +0 -0
- simplevecdb-2.4.0/src/simplevecdb/embeddings/server.py +0 -374
- simplevecdb-2.4.0/src/simplevecdb/encryption.py +0 -417
- simplevecdb-2.4.0/src/simplevecdb/integrations/llamaindex.py +0 -227
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/.env.example +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/.github/FUNDING.yml +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/.github/ISSUE_TEMPLATE/config.yml +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/.github/dependabot.yml +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/.github/workflows/ci.yml +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/.github/workflows/publish.yml +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/.github/workflows/security.yml +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/.github/workflows/update-sponsors.yml +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/.python-version +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/CODE_OF_CONDUCT.md +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/CONTRIBUTING.md +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/LICENSE +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/SECURITY.md +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/docs/CONTRIBUTING.md +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/docs/ENV_SETUP.md +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/docs/LICENSE +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/docs/api/async.md +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/docs/api/config.md +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/docs/api/core.md +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/docs/api/embeddings.md +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/docs/api/engine/catalog.md +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/docs/api/engine/quantization.md +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/docs/api/engine/search.md +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/docs/api/integrations.md +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/docs/api/types.md +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/docs/benchmarks.md +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/docs/examples.md +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/docs/guides/clustering.md +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/docs/index.md +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/examples/auto_embed.py +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/examples/backend_benchmark.py +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/examples/embeddings/perf_benchmark.py +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/examples/quant_benchmark.py +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/examples/rag/langchain_rag.ipynb +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/examples/rag/llama_rag.ipynb +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/examples/rag/ollama_rag.ipynb +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/examples/smoke_test.py +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/mkdocs.yml +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/src/simplevecdb/engine/__init__.py +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/src/simplevecdb/integrations/__init__.py +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/src/simplevecdb/types.py +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/conftest.py +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/integration/test_v21_features.py +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/perf/test_batch_detection.py +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/perf/test_performance.py +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/core/__init__.py +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/core/test_batch_detection.py +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/core/test_factory_methods.py +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/core/test_filters.py +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/core/test_quantization.py +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/core/test_similarity_search.py +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/embeddings/__init__.py +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/integrations/__init__.py +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/integrations/test_langchain_coverage.py +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/integrations/test_llamaindex_coverage.py +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/test_async.py +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/test_async_coverage.py +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/test_clustering.py +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/test_config.py +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/test_cross_collection_search.py +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/test_encryption.py +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/test_hierarchy.py +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/test_multi_collection.py +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/test_search.py +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/test_search_coverage.py +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/test_streaming.py +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/test_types.py +0 -0
- {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/test_utils.py +0 -0
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
exclude_dirs:
|
|
2
|
+
|
|
3
|
+
- /tests
|
|
4
|
+
- /examples
|
|
5
|
+
|
|
6
|
+
skips:
|
|
7
|
+
|
|
8
|
+
- B104 # 0.0.0.0 binding: SERVER_HOST defaults to 127.0.0.1; bandit can't see runtime defaults, so the warning is a false positive on this codebase. Keep it skipped only because the default is safe — if anyone introduces a hardcoded "0.0.0.0", remove this skip.
|
|
9
|
+
- B608 # SQL injection false positive: table names are validated via _validate_table_name()
|
|
@@ -21,16 +21,18 @@ build/
|
|
|
21
21
|
*.db
|
|
22
22
|
*.sqlite
|
|
23
23
|
|
|
24
|
-
#
|
|
24
|
+
# Agentic CLI tools (per-developer state)
|
|
25
25
|
.opencode/
|
|
26
26
|
opencode.json
|
|
27
|
+
.claude/
|
|
28
|
+
.codex
|
|
27
29
|
|
|
28
30
|
# Project specific
|
|
29
31
|
simplevecdb_plan.md
|
|
30
32
|
AGENTS.md
|
|
31
33
|
htmlcov/
|
|
32
34
|
site/
|
|
33
|
-
|
|
35
|
+
htmlcov/
|
|
34
36
|
.coverage
|
|
35
37
|
NEXT_UPDATES.md
|
|
36
38
|
pro_pack/
|
|
@@ -5,6 +5,166 @@ All notable changes to SimpleVecDB will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [2.6.0] - 2026-05-06
|
|
9
|
+
|
|
10
|
+
### Review pass 3 — final correctness/security pass before tag
|
|
11
|
+
|
|
12
|
+
#### Critical fixes
|
|
13
|
+
|
|
14
|
+
- **`UsearchIndex.save` lost-update race** — the `_dirty = False` clear was outside the `file_lock` window, so a concurrent `add()` between `os.replace()` and the dirty-flag clear could be silently overwritten. Moved inside `file_lock`.
|
|
15
|
+
- **`UsearchIndex.save` data fsync on `O_RDONLY` fd** — `fsync(2)` on a read-only file descriptor has implementation-defined behavior on Linux (some kernels return `EBADF`, swallowed by the warning branch). Switched to `O_RDWR` so the data fsync is guaranteed.
|
|
16
|
+
- **`_rebuild_index_locked` bare `conn.execute`** — replaced the bare `self.conn.execute("SELECT id FROM ...")` with the new `CatalogManager.list_all_ids()`, which routes the read through `self._lock` instead of relying on RLock re-entrancy from a single caller.
|
|
17
|
+
- **PBKDF2 iteration bump** — raised from 480 000 to 600 000 iterations to match the OWASP 2024 minimum for PBKDF2-HMAC-SHA256.
|
|
18
|
+
- **AES-GCM AAD now binds the v1 header** — `encrypt_file` / `decrypt_file` pass the magic+version bytes as `associated_data`, so any tampering with the header (including downgrade attempts) fails authentication instead of silently succeeding.
|
|
19
|
+
- **Bounded normalize-key cache** — `_NORMALIZE_KEY_CACHE` is now an LRU capped at 64 entries, serialized by a `threading.Lock`. Long-running multi-tenant processes no longer leak derived key material indefinitely.
|
|
20
|
+
- **LlamaIndex `delete()` no longer swallows `sqlite3.DatabaseError`** — narrowed the exception in the metadata-fallback path to `(TypeError, NotImplementedError)`. A locked DB, closed connection, or schema mismatch now propagates to the caller instead of becoming a silent no-op.
|
|
21
|
+
- **Hybrid-search RRF rank symmetry** — vector candidates now use the original HNSW position as their RRF rank (via `enumerate(vector_keys_list)`), matching how keyword candidates use raw BM25 position. Previously, a metadata filter that rejected vector candidates inflated surviving vector scores relative to keyword scores, corrupting result ordering.
|
|
22
|
+
- **`add_documents` FTS sentinel guard** — added a defense-in-depth check that raises `RuntimeError` if any `-1` sentinel rowid remains in `real_ids` before the FTS upsert. Prevents a hypothetical retry-loop interaction from corrupting the FTS index with rowid `-1`.
|
|
23
|
+
|
|
24
|
+
#### Important fixes
|
|
25
|
+
|
|
26
|
+
- **`delete_collection` TOCTOU** — moved the `list_collections()` existence check inside the `with self._lock:` block so two concurrent `delete_collection(name)` calls cannot both pass the check; the second now sees a clean `KeyError` instead of a SQLite error.
|
|
27
|
+
- **Salt sidecar `O_EXCL` guard** — `_resolve_salt(create_if_missing=True)` now creates the sidecar with `O_CREAT | O_EXCL`. If two processes race, the loser reads the winner's salt; if a sidecar already exists out-of-band, it is preserved instead of being clobbered (which would have rendered an existing DB unreadable).
|
|
28
|
+
- **`encrypt_index_file` v0→v1 sidecar migration** — re-encrypting a legacy v0 blob (no sidecar) now creates a fresh sidecar, completing the migration path to per-DB salts. Previously, `is_first_encryption` was keyed on `.enc` presence rather than `.salt` presence.
|
|
29
|
+
- **LlamaIndex legacy-collection warning** — `SimpleVecDBLlamaStore.__init__` now emits a one-shot `DeprecationWarning` when it detects rows lacking `_simplevecdb_node_id`, telling the operator to call `migrate_node_id_metadata()` and noting the inherent limitation that pre-2.6 rows can only be stamped with `str(doc_id)` (the original LlamaIndex node ids were never persisted).
|
|
30
|
+
- **INT8 quantization range break softened** — instead of raising `ValueError` on `max(|x|) > 1.0 + 1e-5`, the strategy now emits a one-shot `DeprecationWarning` and clips. Restores backwards compatibility for callers that relied on the prior silent-clip behavior.
|
|
31
|
+
- **`scripts/check_version_sync.py` now validates `CHANGELOG.md`** — the hook fails if the latest CHANGELOG entry header does not match `pyproject.toml`'s version, preventing a release from shipping with a stale changelog.
|
|
32
|
+
|
|
33
|
+
#### Test coverage added (review pass 3 gaps)
|
|
34
|
+
|
|
35
|
+
- `tests/unit/test_v26_review_pass_3.py` — covers parent-directory fsync on save, `.tmp` cleanup on save failure, `db._lock is catalog._lock` shared-RLock identity, adversarial inputs to `_validate_table_name`, hybrid-search RRF rank symmetry under filter, and same-text-different-id deduplication.
|
|
36
|
+
- `tests/unit/test_v26_encryption_review_pass_3.py` — covers nonce uniqueness across saves, wrong-key decrypt does not create the output file, AAD-bound header tampering fails authentication, salt sidecar O_EXCL preservation, and v0→v1 migration round-trip.
|
|
37
|
+
- `tests/unit/integrations/test_llamaindex_review_pass_3.py` — covers the `add → query` round-trip preserving the original LlamaIndex node id, end-to-end migration-then-delete on v2.5-shaped data, the legacy-collection `DeprecationWarning` at `__init__` time, and that `sqlite3.DatabaseError` from the metadata-fallback path now propagates instead of being swallowed.
|
|
38
|
+
|
|
39
|
+
### Fixed (concurrency & durability)
|
|
40
|
+
|
|
41
|
+
- **Atomic `UsearchIndex.save`** — now writes to a sibling `.tmp`, fsyncs, then `os.replace()`s onto the live path and fsyncs the parent directory. A crash mid-save can no longer corrupt the only copy of the index. Also moved the `_dirty` short-circuit inside `_write_lock` so a concurrent `add` cannot have its dirty flag silently cleared.
|
|
42
|
+
- **Atomic `rebuild_index`** — builds the new index at a sibling `.rebuild` path and atomically swaps it onto the live path; the old index remains the canonical copy until the swap succeeds.
|
|
43
|
+
- **Atomic encrypted save** — `encrypt_file` / `decrypt_file` now write to a sibling `.tmp`, fsync, set mode `0o600`, then `os.replace()`. `encrypt_index_file` only unlinks the plaintext after the encrypted output is durably on disk. A torn write can no longer leave the index unrecoverable.
|
|
44
|
+
- **`VectorDB`-level `RLock`** — a single re-entrant lock now serializes the `_collections` cache (no more check-then-insert TOCTOU on `collection()`) and is shared with every `CatalogManager` so all `with self.conn:` blocks across collections cannot interleave on the shared `sqlite3.Connection`. Reads remain lock-free at the SQLite level via WAL.
|
|
45
|
+
- **`AsyncVectorDB.close` drains** — switched from `executor.shutdown(wait=False)` to `wait=True` so in-flight pool tasks finish their cursors before the SQLite connection is closed. Pending (not-yet-started) work is still cancelled.
|
|
46
|
+
- **`set_parent` cycle check is transactional** — descendant lookup and parent UPDATE now run inside the same `with self._lock, self.conn:` block, closing a TOCTOU window where a concurrent edge could form a cycle.
|
|
47
|
+
- **Cluster persistence** — `_ensure_cluster_table`, `save_cluster_state`, `delete_cluster_state` now use `with self._lock, self.conn:` instead of bare `conn.commit()`; an exception during the execute is properly rolled back.
|
|
48
|
+
- **`add_documents` ID recovery is correct under upsert** — replaced the `last_insert_rowid()` arithmetic (which silently returned wrong IDs for batches mixing explicit and `None` IDs because UPSERTs do not advance the auto-increment counter) with a single `INSERT … RETURNING id` for the auto-ID rows. Explicit-ID rows still take the upsert path.
|
|
49
|
+
- **`delete_collection` closes cached indexes first** — any `VectorCollection` instances cached for the deleted name have their `UsearchIndex` closed before the file is unlinked, so a stale mmap view cannot race the unlink.
|
|
50
|
+
|
|
51
|
+
### Changed
|
|
52
|
+
|
|
53
|
+
- **`upsert_fts_rows` / `delete_fts_rows` are now `_upsert_fts_rows` / `_delete_fts_rows`** (private). The FTS shadow table must be updated inside the same transaction as the main table or it can desync on crash; the rename signals the contract.
|
|
54
|
+
- **`get_legacy_vectors`, `drop_legacy_vec_table`** now validate the supplied table name via `_validate_table_name` before interpolating into SQL.
|
|
55
|
+
|
|
56
|
+
### Added
|
|
57
|
+
|
|
58
|
+
- **Declared `python-dotenv` dependency** — `simplevecdb.config` already imported and called `load_dotenv` at package import; the missing dependency would raise `ImportError` on a clean install of the base package without optional extras.
|
|
59
|
+
|
|
60
|
+
### Fixed (correctness & quality)
|
|
61
|
+
|
|
62
|
+
- **RRF deduplication keys by document ID, not text** — `hybrid_search` previously deduped by `doc.page_content`, silently merging two distinct documents that happened to share text into one inflated-score result.
|
|
63
|
+
- **NaN/Inf guard at insert** — `add_texts` and `add_texts_streaming` reject non-finite vectors instead of feeding them to HNSW, which would produce undefined neighbours and could corrupt the graph.
|
|
64
|
+
- **`normalize_l2` handles subnormals** — replaced the exact `norm == 0` compare with a `< 1e-12` check (matching the existing usearch_index guard); subnormal floats no longer produce wildly large normalized vectors.
|
|
65
|
+
- **Silhouette score samples on large collections** — `silhouette_score` is O(n²); now caps the evaluation sample at `SILHOUETTE_MAX_SAMPLE = 10_000`. Large collections no longer OOM.
|
|
66
|
+
- **MMR maintains the selected matrix incrementally** — replaced per-iteration `np.stack(selected_embs)` with `np.vstack` of a running matrix. O(k²·d) wasted allocations dropped to O(k·d).
|
|
67
|
+
- **`_parse_bool_env` treats `KEY=` as unset** — empty strings now fall through to the default; previously they were truthy because `"".strip()` is not in the falsey set.
|
|
68
|
+
- **LangChain async methods use `asyncio.to_thread`** — `aadd_texts` / `asimilarity_search` / `amax_marginal_relevance_search` no longer block the event loop.
|
|
69
|
+
- **LlamaIndex `delete()` survives a process restart** — node IDs are persisted into document metadata under `_simplevecdb_node_id`; `delete()` falls back to a metadata query when the in-memory `_id_map` is empty.
|
|
70
|
+
- **LlamaIndex query results carry stable node IDs** — replaced `str(hash(page_content))` (process-randomized, collision-prone) with the persisted `_simplevecdb_node_id`.
|
|
71
|
+
- **`AsyncVectorDB.collection` accepts `store_embeddings`** — async callers can now enable embedding storage (required for `rebuild_index()`); previously they had no way to set it.
|
|
72
|
+
|
|
73
|
+
### Security
|
|
74
|
+
|
|
75
|
+
- **API key comparison uses `hmac.compare_digest`** — the prior `token not in allowed_keys` short-circuit leaked key prefixes via response time.
|
|
76
|
+
- **SQLCipher PRAGMA key always uses the `x'hex'` form** — every key path now goes through `_normalize_key` first, eliminating string interpolation of user-supplied passphrase characters into a quoted PRAGMA argument.
|
|
77
|
+
- **`is_database_encrypted` rejects zero-byte files** — previously a missing/empty DB looked like an unencrypted DB because `sqlite3.connect` would create a fresh one.
|
|
78
|
+
|
|
79
|
+
### Changed (tooling)
|
|
80
|
+
|
|
81
|
+
- **Ruff and mypy targets aligned with `requires-python>=3.10`** — both were `py312`, hiding 3.10/3.11 incompatibilities. Cleaned three resulting `F401` unused-import warnings (`signal` in models.py, `_batched` and `constants` re-imports).
|
|
82
|
+
- **Pre-commit version-sync hook** — `__init__.py` derives `__version__` dynamically via `importlib.metadata`, so `check_version_sync.py` was failing on every commit looking for a literal `__version__ = "x.y.z"` line that does not exist. The hook now validates only `pyproject.toml`'s version field. `bump_version.py` similarly stops trying to rewrite `__init__.py` and uses an anchored regex to update only the canonical version field.
|
|
83
|
+
|
|
84
|
+
### Security (2.6.0 final)
|
|
85
|
+
|
|
86
|
+
- **Per-DB random PBKDF2 salt** — encrypted databases and index files now generate a random 16-byte salt at creation time, written to a `<resource>.salt` sidecar with mode `0o600`. The previous fixed `b"simplevecdb-sqlcipher-key"` salt let an attacker precompute one rainbow table that broke every simplevecdb installation with the same passphrase. Pre-2.6.0 encrypted resources keep working unchanged: when no sidecar exists, the loader falls back to the legacy fixed salt automatically.
|
|
87
|
+
- **HuggingFace `repo_id` allowlist + `trust_remote_code=False`** — the embeddings server validates model names against a strict regex (`namespace/name` with `[A-Za-z0-9_.-]` only) before passing them to `snapshot_download` / `SentenceTransformer`, blocking path traversal and local-filesystem inputs. `SentenceTransformer` is constructed with `trust_remote_code=False` so a malicious model card cannot trigger arbitrary downloaded Python on load.
|
|
88
|
+
- **CORS is opt-in** — the server no longer adds CORS middleware unless `EMBEDDING_SERVER_CORS_ORIGINS` is set. When the operator does set wildcard origins (`["*"]`), `allow_credentials` is forced off so the spec-violating wildcard-with-credentials combo can't be produced.
|
|
89
|
+
|
|
90
|
+
### Migration helpers
|
|
91
|
+
|
|
92
|
+
- **`SimpleVecDBLlamaStore.migrate_node_id_metadata()`** — backfills `_simplevecdb_node_id` for documents inserted before 2.6.0. Pre-2.6.0 versions did not persist the LlamaIndex node_id into metadata, so `delete()` could not find the right row after a process restart. Idempotent — already-stamped rows are skipped.
|
|
93
|
+
|
|
94
|
+
### Added (hygiene & polish)
|
|
95
|
+
|
|
96
|
+
- **`ClusterResult` and `ClusterTagCallback` exported from `simplevecdb`** — they were return/argument types of public methods but had no public import path; users had to reach into `simplevecdb.types`.
|
|
97
|
+
- **`NullHandler` attached to the package's root logger** at import time, per the Python logging HOWTO. Idempotent — duplicate calls do not stack handlers.
|
|
98
|
+
- **`SimpleVecDBLlamaStore.delete_nodes` raises `NotImplementedError`** when called with `filters`, instead of silently dropping the filter portion and pretending the deletion succeeded.
|
|
99
|
+
- **Recursive CTE depth bound as a parameter** in `get_descendants` / `get_ancestors`. The previous f-string interpolation was safe only because of the `int()` coercion; binding the depth as a SQL parameter means a future refactor that drops the coercion can no longer introduce an injection.
|
|
100
|
+
- **`Config.from_env()` documented** as returning the import-time-frozen instance; setting env vars after import does not refresh.
|
|
101
|
+
- **`ModelRegistry(allow_unlisted=...)` defaults to `False`** to match the secure-by-default config setting; programmatic instantiations no longer get an open registry by accident.
|
|
102
|
+
- **`/v1/usage` returns aggregated totals when auth is disabled** instead of leaking the per-IP buckets to anyone who hits the endpoint.
|
|
103
|
+
- **Server validates `EMBEDDING_SERVER_MAX_REQUEST_ITEMS <= _MAX_ENCODE_BATCH` at startup** so an out-of-range env var fails fast at boot rather than per request.
|
|
104
|
+
- **`pyproject.toml` gains `[project.urls]`, `classifiers`, and `keywords`** for a useful PyPI listing.
|
|
105
|
+
- **`.bandit` documents the B104 skip** and warns that any future `0.0.0.0` binding requires removing the skip.
|
|
106
|
+
- **Encrypted file format now carries a 3-byte header** (`'SV' + version`) so future format changes are detectable. `decrypt_file` accepts both the new v1 format and the v0 (pre-2.6.0) format, so existing encrypted indexes still load without re-encryption.
|
|
107
|
+
|
|
108
|
+
### Fixed (review pass 2)
|
|
109
|
+
|
|
110
|
+
- **NaN/Inf rejection no longer leaves orphan catalog rows** — `add_texts` and `_process_streaming_batch` now validate vectors *before* the SQLite insert. Previously the catalog row committed first and a non-finite vector then raised, leaving rows visible via `get_documents_by_ids` but unreachable through similarity search.
|
|
111
|
+
- **`VectorCollection.__repr__` no longer issues SQL** — the previous `count()` call would raise `ProgrammingError` after `close()`, breaking debuggers and exception formatters that auto-stringify objects. The earlier 2.6.0 fix only covered `VectorDB.__repr__`.
|
|
112
|
+
- **`EMBEDDING_SERVER_MAX_REQUEST_ITEMS` validation runs at module import** — the guard was previously inside `run_server()` and was bypassed under any non-CLI ASGI deployment (gunicorn, programmatic uvicorn).
|
|
113
|
+
- **LlamaIndex empty-`node_id` path is atomic** — `SimpleVecDBLlamaStore.add` now generates a UUID for nodes that arrive without a `node_id` and stamps it into metadata *before* the row insert, so the metadata commit is in the same SQLite transaction as the catalog row. Previously a separate `UPDATE` followed `add_texts`; a crash in the gap left rows un-stampable and cross-restart `delete()` silently no-op'd.
|
|
114
|
+
- **Catalog read paths serialize on `self._lock`** — `get_documents_by_ids`, `get_embeddings_by_ids`, `get_documents_and_embeddings_by_ids`, `find_ids_by_texts`, `find_ids_by_filter`, `keyword_search`, `count`, `get_all_docs_with_text`, `check_legacy_sqlite_vec`, `get_legacy_vectors`, `get_children`, `get_parent`, `get_descendants`, `get_ancestors`, `load_cluster_state`, `list_cluster_states`, and `VectorDB.list_collections` now acquire the connection-level lock around `conn.execute`. `sqlite3.Connection` is not safe for concurrent statement execution from multiple threads even under WAL.
|
|
115
|
+
- **`rebuild_index` is fully serialized** — the entire fetch + build + swap now runs inside `with self._lock:` so concurrent `add` / `delete` cannot mutate the catalog mid-rebuild and produce a stale snapshot.
|
|
116
|
+
- **`_ensure_cluster_table` double-checked under lock** — the `_cluster_table_ready` flag is now re-checked inside the lock and set inside the `with` block. Concurrent first-callers no longer both run the DDL.
|
|
117
|
+
- **`utils.file_lock` opens via `os.open(O_CREAT | O_RDWR, 0o600)`** — no truncation of stale lock files from a crashed prior run, restricted permissions on the lock sentinel.
|
|
118
|
+
|
|
119
|
+
## [2.5.0] - 2026-04-07
|
|
120
|
+
|
|
121
|
+
### Added
|
|
122
|
+
|
|
123
|
+
- **`delete_collection(name)`** — drop a collection's SQLite tables, FTS index, and usearch file in one call. Available on both `VectorDB` and `AsyncVectorDB`.
|
|
124
|
+
- **`store_embeddings` parameter** on `collection()` — opt into storing embedding BLOBs in SQLite (default `False`). Leaving BLOBs off saves ~2x storage; MMR transparently fetches vectors from the usearch index when BLOBs are absent.
|
|
125
|
+
- **`async_retry_on_lock` decorator** — async variant of `retry_on_lock` using `asyncio.sleep` instead of `time.sleep`, avoiding executor thread blocking.
|
|
126
|
+
- **`file_lock` context manager** — advisory cross-process file locking (`fcntl`/`msvcrt`) for usearch index files. Prevents corruption from concurrent processes.
|
|
127
|
+
- **`__repr__`** on `VectorDB`, `VectorCollection`, `AsyncVectorDB`, `AsyncVectorCollection` for debuggable string representations.
|
|
128
|
+
- **FLOAT16 quantization** fully implemented in `serialize()`/`deserialize()` — was previously defined in the enum but raised `ValueError` at runtime.
|
|
129
|
+
- **Pagination** on `get_documents(limit=, offset=)` and catalog methods (`find_ids_by_filter`, `find_ids_by_texts`) — previously returned unbounded result sets.
|
|
130
|
+
- **Embeddings server enhancements:**
|
|
131
|
+
- Graceful shutdown with SIGTERM/SIGINT draining (10s timeout)
|
|
132
|
+
- CORS middleware with configurable origins for browser-based clients
|
|
133
|
+
- Model warm-up on startup (skip with `--no-warmup`)
|
|
134
|
+
- Input validation: rejects empty strings (422) and texts exceeding 100k chars (413)
|
|
135
|
+
- Proper `argparse` CLI with `--host`, `--port`, `--no-warmup`, `--help`
|
|
136
|
+
- Startup banner logging config summary (host, port, model, auth, rate limits)
|
|
137
|
+
- Nested token array normalization (`list[list[int]]` input format)
|
|
138
|
+
- Async executor offload for `embed_texts` (non-blocking event loop)
|
|
139
|
+
- OpenAPI version synced from package metadata
|
|
140
|
+
- Module `__init__.py` exports (`embed_texts`, `get_embedder`, `load_model`, `app`, `run_server`)
|
|
141
|
+
|
|
142
|
+
### Fixed
|
|
143
|
+
|
|
144
|
+
- **`delete_by_ids` ordering** — SQLite deletion now happens first (transactional, can rollback), then usearch. Previously usearch removed first, leaving orphaned catalog entries on SQLite failure.
|
|
145
|
+
- **`_matches_filter` string semantics** — now uses exact equality, consistent with SQL `build_filter_clause`. Was using substring match (`value in str(meta_value)`).
|
|
146
|
+
- **`list_collections`** — scans `sqlite_master` for persisted collection tables instead of returning only session-cached names. Works across reopened databases.
|
|
147
|
+
- **WAL mode for encrypted databases** — `PRAGMA journal_mode=WAL` and `PRAGMA synchronous=NORMAL` now set for SQLCipher connections (was only set for unencrypted).
|
|
148
|
+
- **`collection()` cache key** — includes `distance_strategy` and `quantization` in cache key (sync version). Previously cached by name only, silently ignoring differing params on cache hit.
|
|
149
|
+
- **`_ensure_fts_table`** — retries up to 3 times on transient "database is locked" errors instead of permanently disabling FTS on first failure.
|
|
150
|
+
- **Connection health check** — `SELECT 1` probe after connection creation; raises `RuntimeError` immediately on corrupt databases.
|
|
151
|
+
|
|
152
|
+
### Improved
|
|
153
|
+
|
|
154
|
+
- **Usearch batch operations** — `add()`, `remove()`, and `get()` now use batch usearch APIs instead of per-key loops. Significant speedup for large operations.
|
|
155
|
+
- **Filtered search iterative deepening** — replaces fixed `k*3` overfetch with adaptive doubling (up to `k*30`). Highly selective filters now reliably return `k` results.
|
|
156
|
+
- **Memory-map heuristic** — uses file size threshold (50MB) instead of inaccurate `file_size // 100` vector count estimate for mmap vs load decision.
|
|
157
|
+
- **Apple chip detection** — uses `platform.processor()` instead of spawning a `sysctl` subprocess.
|
|
158
|
+
|
|
159
|
+
### Removed
|
|
160
|
+
|
|
161
|
+
- **Duplicate `_dim` property** — removed in favor of the public `dim` property.
|
|
162
|
+
|
|
163
|
+
### Breaking Changes
|
|
164
|
+
|
|
165
|
+
- String metadata filters now use exact equality (was substring match).
|
|
166
|
+
- `store_embeddings` defaults to `False` — `rebuild_index()` requires `store_embeddings=True` or re-adding documents.
|
|
167
|
+
|
|
8
168
|
## [2.4.0] - 2026-03-22
|
|
9
169
|
|
|
10
170
|
### Added
|
|
@@ -1,14 +1,33 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: simplevecdb
|
|
3
|
-
Version: 2.4.0
|
|
3
|
+
Version: 2.6.0
|
|
4
4
|
Summary: Dead-simple local vector database powered by usearch HNSW.
|
|
5
|
+
Project-URL: Homepage, https://github.com/CoderDayton/simplevecdb
|
|
6
|
+
Project-URL: Repository, https://github.com/CoderDayton/simplevecdb
|
|
7
|
+
Project-URL: Issues, https://github.com/CoderDayton/simplevecdb/issues
|
|
8
|
+
Project-URL: Changelog, https://github.com/CoderDayton/simplevecdb/blob/main/CHANGELOG.md
|
|
5
9
|
Author-email: Dayton Dunbar <coderdayton14@gmail.com>
|
|
6
10
|
License: MIT
|
|
7
11
|
License-File: LICENSE
|
|
12
|
+
Keywords: embeddings,hnsw,langchain,llamaindex,rag,similarity-search,sqlite,usearch,vector-database,vectordb
|
|
13
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
22
|
+
Classifier: Topic :: Database
|
|
23
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
24
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
25
|
+
Classifier: Typing :: Typed
|
|
8
26
|
Requires-Python: >=3.10
|
|
9
27
|
Requires-Dist: cryptography>=41.0
|
|
10
28
|
Requires-Dist: hdbscan>=0.8.33
|
|
11
29
|
Requires-Dist: numpy>=1.24
|
|
30
|
+
Requires-Dist: python-dotenv>=1.0
|
|
12
31
|
Requires-Dist: scikit-learn>=1.3.0
|
|
13
32
|
Requires-Dist: sqlcipher3-binary>=0.5.0
|
|
14
33
|
Requires-Dist: sqlite-vec>=0.1.6
|
|
@@ -169,10 +188,13 @@ hybrid = collection.hybrid_search("powerhouse cell", k=2)
|
|
|
169
188
|
**Optional: Run embeddings server (OpenAI-compatible)**
|
|
170
189
|
|
|
171
190
|
```bash
|
|
172
|
-
simplevecdb-server --port 8000
|
|
191
|
+
simplevecdb-server --port 8000 # Default model, auto warm-up
|
|
192
|
+
simplevecdb-server --host 0.0.0.0 --port 9000 # Bind to all interfaces
|
|
193
|
+
simplevecdb-server --no-warmup # Skip model preload on startup
|
|
194
|
+
simplevecdb-server --help # Show all options
|
|
173
195
|
```
|
|
174
196
|
|
|
175
|
-
See [Setup Guide](ENV_SETUP.md) for configuration: model registry, rate limits, API keys, CUDA optimization.
|
|
197
|
+
See [Setup Guide](ENV_SETUP.md) for configuration: model registry, rate limits, API keys, CORS, CUDA optimization.
|
|
176
198
|
|
|
177
199
|
### Option 3: With LangChain or LlamaIndex
|
|
178
200
|
|
|
@@ -331,6 +353,10 @@ docs = collection.get_documents(filter_dict={"category": "tech"})
|
|
|
331
353
|
for doc_id, text, metadata in docs:
|
|
332
354
|
print(f"[{doc_id}] {text[:50]}...")
|
|
333
355
|
|
|
356
|
+
# Paginated access (v2.5+)
|
|
357
|
+
page1 = collection.get_documents(limit=100)
|
|
358
|
+
page2 = collection.get_documents(limit=100, offset=100)
|
|
359
|
+
|
|
334
360
|
# Fetch stored embeddings
|
|
335
361
|
embeddings = collection.get_embeddings_by_ids([1, 2, 3])
|
|
336
362
|
|
|
@@ -342,6 +368,9 @@ collection.update_metadata([
|
|
|
342
368
|
|
|
343
369
|
# Quick stats
|
|
344
370
|
print(f"Collection has {collection.count()} documents, dim={collection.dim}")
|
|
371
|
+
|
|
372
|
+
# Delete an entire collection (v2.5+)
|
|
373
|
+
db.delete_collection("old_data")
|
|
345
374
|
```
|
|
346
375
|
|
|
347
376
|
### Vector Clustering (v2.2+)
|
|
@@ -384,6 +413,10 @@ Supports K-means, MiniBatch K-means, and HDBSCAN. See [Clustering Guide](https:/
|
|
|
384
413
|
| **Cluster Persistence** | ✅ | Save/load cluster centroids for fast assignment (v2.2+) |
|
|
385
414
|
| **Public Catalog API** | ✅ | `get_documents`, `get_embeddings_by_ids`, `update_metadata` (v2.4+) |
|
|
386
415
|
| **Executor Injection** | ✅ | Share thread pool across async instances for ONNX safety (v2.4+) |
|
|
416
|
+
| **Collection Management** | ✅ | `delete_collection()`, paginated `get_documents(limit=, offset=)` (v2.5+) |
|
|
417
|
+
| **Cross-Process Safety** | ✅ | Advisory file locking on usearch index files (v2.5+) |
|
|
418
|
+
| **FLOAT16 Quantization** | ✅ | Half-precision storage with 2x compression (v2.5+) |
|
|
419
|
+
| **Embeddings Server** | ✅ | CORS, graceful shutdown, input validation, model warm-up (v2.5+) |
|
|
387
420
|
|
|
388
421
|
## Performance Benchmarks
|
|
389
422
|
|
|
@@ -456,6 +489,9 @@ pip install torch --index-url https://download.pytorch.org/whl/cu118
|
|
|
456
489
|
- [x] Vector clustering and auto-tagging (v2.2)
|
|
457
490
|
- [x] Public catalog API for document management (v2.4)
|
|
458
491
|
- [x] Async executor injection for thread-safe sharing (v2.4)
|
|
492
|
+
- [x] Collection management: `delete_collection()`, pagination (v2.5)
|
|
493
|
+
- [x] Cross-process file locking and connection health checks (v2.5)
|
|
494
|
+
- [x] Embeddings server hardening: CORS, graceful shutdown, input validation (v2.5)
|
|
459
495
|
- [ ] Incremental clustering (online learning)
|
|
460
496
|
- [ ] Cluster visualization exports
|
|
461
497
|
|
|
@@ -140,10 +140,13 @@ hybrid = collection.hybrid_search("powerhouse cell", k=2)
|
|
|
140
140
|
**Optional: Run embeddings server (OpenAI-compatible)**
|
|
141
141
|
|
|
142
142
|
```bash
|
|
143
|
-
simplevecdb-server --port 8000
|
|
143
|
+
simplevecdb-server --port 8000 # Default model, auto warm-up
|
|
144
|
+
simplevecdb-server --host 0.0.0.0 --port 9000 # Bind to all interfaces
|
|
145
|
+
simplevecdb-server --no-warmup # Skip model preload on startup
|
|
146
|
+
simplevecdb-server --help # Show all options
|
|
144
147
|
```
|
|
145
148
|
|
|
146
|
-
See [Setup Guide](ENV_SETUP.md) for configuration: model registry, rate limits, API keys, CUDA optimization.
|
|
149
|
+
See [Setup Guide](ENV_SETUP.md) for configuration: model registry, rate limits, API keys, CORS, CUDA optimization.
|
|
147
150
|
|
|
148
151
|
### Option 3: With LangChain or LlamaIndex
|
|
149
152
|
|
|
@@ -302,6 +305,10 @@ docs = collection.get_documents(filter_dict={"category": "tech"})
|
|
|
302
305
|
for doc_id, text, metadata in docs:
|
|
303
306
|
print(f"[{doc_id}] {text[:50]}...")
|
|
304
307
|
|
|
308
|
+
# Paginated access (v2.5+)
|
|
309
|
+
page1 = collection.get_documents(limit=100)
|
|
310
|
+
page2 = collection.get_documents(limit=100, offset=100)
|
|
311
|
+
|
|
305
312
|
# Fetch stored embeddings
|
|
306
313
|
embeddings = collection.get_embeddings_by_ids([1, 2, 3])
|
|
307
314
|
|
|
@@ -313,6 +320,9 @@ collection.update_metadata([
|
|
|
313
320
|
|
|
314
321
|
# Quick stats
|
|
315
322
|
print(f"Collection has {collection.count()} documents, dim={collection.dim}")
|
|
323
|
+
|
|
324
|
+
# Delete an entire collection (v2.5+)
|
|
325
|
+
db.delete_collection("old_data")
|
|
316
326
|
```
|
|
317
327
|
|
|
318
328
|
### Vector Clustering (v2.2+)
|
|
@@ -355,6 +365,10 @@ Supports K-means, MiniBatch K-means, and HDBSCAN. See [Clustering Guide](https:/
|
|
|
355
365
|
| **Cluster Persistence** | ✅ | Save/load cluster centroids for fast assignment (v2.2+) |
|
|
356
366
|
| **Public Catalog API** | ✅ | `get_documents`, `get_embeddings_by_ids`, `update_metadata` (v2.4+) |
|
|
357
367
|
| **Executor Injection** | ✅ | Share thread pool across async instances for ONNX safety (v2.4+) |
|
|
368
|
+
| **Collection Management** | ✅ | `delete_collection()`, paginated `get_documents(limit=, offset=)` (v2.5+) |
|
|
369
|
+
| **Cross-Process Safety** | ✅ | Advisory file locking on usearch index files (v2.5+) |
|
|
370
|
+
| **FLOAT16 Quantization** | ✅ | Half-precision storage with 2x compression (v2.5+) |
|
|
371
|
+
| **Embeddings Server** | ✅ | CORS, graceful shutdown, input validation, model warm-up (v2.5+) |
|
|
358
372
|
|
|
359
373
|
## Performance Benchmarks
|
|
360
374
|
|
|
@@ -427,6 +441,9 @@ pip install torch --index-url https://download.pytorch.org/whl/cu118
|
|
|
427
441
|
- [x] Vector clustering and auto-tagging (v2.2)
|
|
428
442
|
- [x] Public catalog API for document management (v2.4)
|
|
429
443
|
- [x] Async executor injection for thread-safe sharing (v2.4)
|
|
444
|
+
- [x] Collection management: `delete_collection()`, pagination (v2.5)
|
|
445
|
+
- [x] Cross-process file locking and connection health checks (v2.5)
|
|
446
|
+
- [x] Embeddings server hardening: CORS, graceful shutdown, input validation (v2.5)
|
|
430
447
|
- [ ] Incremental clustering (online learning)
|
|
431
448
|
- [ ] Cluster visualization exports
|
|
432
449
|
|
|
@@ -5,6 +5,166 @@ All notable changes to SimpleVecDB will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [2.6.0] - 2026-05-06
|
|
9
|
+
|
|
10
|
+
### Review pass 3 — final correctness/security pass before tag
|
|
11
|
+
|
|
12
|
+
#### Critical fixes
|
|
13
|
+
|
|
14
|
+
- **`UsearchIndex.save` lost-update race** — the `_dirty = False` clear was outside the `file_lock` window, so the dirty flag set by a concurrent `add()` between `os.replace()` and the clear could be silently overwritten. Moved inside `file_lock`.
|
|
15
|
+
- **`UsearchIndex.save` data fsync on `O_RDONLY` fd** — `fsync(2)` on a read-only file descriptor has implementation-defined behavior on Linux (some kernels return `EBADF`, swallowed by the warning branch). Switched to `O_RDWR` so the data fsync is guaranteed.
|
|
16
|
+
- **`_rebuild_index_locked` bare `conn.execute`** — replaced the bare `self.conn.execute("SELECT id FROM ...")` with the new `CatalogManager.list_all_ids()`, which routes the read through `self._lock` instead of relying on RLock re-entrancy from a single caller.
|
|
17
|
+
- **PBKDF2 iteration bump** — raised from 480 000 → 600 000 to match the OWASP 2024 minimum for PBKDF2-HMAC-SHA256.
|
|
18
|
+
- **AES-GCM AAD now binds the v1 header** — `encrypt_file` / `decrypt_file` pass the magic+version bytes as `associated_data`, so any tampering with the header (including downgrade attempts) fails authentication instead of silently succeeding.
|
|
19
|
+
- **Bounded normalize-key cache** — `_NORMALIZE_KEY_CACHE` is now an LRU capped at 64 entries, serialized by a `threading.Lock`. Long-running multi-tenant processes no longer leak derived key material indefinitely.
|
|
20
|
+
- **LlamaIndex `delete()` no longer swallows `sqlite3.DatabaseError`** — narrowed the exception in the metadata-fallback path to `(TypeError, NotImplementedError)`. A locked DB, closed connection, or schema mismatch now propagates to the caller instead of becoming a silent no-op.
|
|
21
|
+
- **Hybrid-search RRF rank symmetry** — vector candidates now use the original HNSW position as their RRF rank (via `enumerate(vector_keys_list)`), matching how keyword candidates use raw BM25 position. Previously, a metadata filter that rejected vector candidates inflated surviving vector scores relative to keyword scores, corrupting result ordering.
|
|
22
|
+
- **`add_documents` FTS sentinel guard** — added a defense-in-depth check that raises `RuntimeError` if any `-1` sentinel rowid remains in `real_ids` before the FTS upsert. Prevents a hypothetical retry-loop interaction from corrupting the FTS index with rowid `-1`.
|
|
23
|
+
|
|
24
|
+
#### Important fixes
|
|
25
|
+
|
|
26
|
+
- **`delete_collection` TOCTOU** — moved the `list_collections()` existence check inside the `with self._lock:` block so two concurrent `delete_collection(name)` calls cannot both pass the check; the second now sees a clean `KeyError` instead of a SQLite error.
|
|
27
|
+
- **Salt sidecar `O_EXCL` guard** — `_resolve_salt(create_if_missing=True)` now creates the sidecar with `O_CREAT | O_EXCL`. If two processes race, the loser reads the winner's salt; if a sidecar already exists out-of-band, it is preserved instead of being clobbered (which would have rendered an existing DB unreadable).
|
|
28
|
+
- **`encrypt_index_file` v0→v1 sidecar migration** — re-encrypting a legacy v0 blob (no sidecar) now creates a fresh sidecar, completing the migration path to per-DB salts. Previously, `is_first_encryption` was keyed on `.enc` presence rather than `.salt` presence.
|
|
29
|
+
- **LlamaIndex legacy-collection warning** — `SimpleVecDBLlamaStore.__init__` now emits a one-shot `DeprecationWarning` when it detects rows lacking `_simplevecdb_node_id`, telling the operator to call `migrate_node_id_metadata()` and noting the inherent limitation that pre-2.6 rows can only be stamped with `str(doc_id)` (the original LlamaIndex node ids were never persisted).
|
|
30
|
+
- **INT8 quantization range break softened** — instead of raising `ValueError` on `max(|x|) > 1.0 + 1e-5`, the strategy now emits a one-shot `DeprecationWarning` and clips. Restores backwards compatibility for callers that relied on the prior silent-clip behavior.
|
|
31
|
+
- **`scripts/check_version_sync.py` now validates `CHANGELOG.md`** — the hook fails if the latest CHANGELOG entry header does not match `pyproject.toml`'s version, preventing a release from shipping with a stale changelog.
|
|
32
|
+
|
|
33
|
+
#### Test coverage added (review pass 3 gaps)
|
|
34
|
+
|
|
35
|
+
- `tests/unit/test_v26_review_pass_3.py` — covers parent-directory fsync on save, `.tmp` cleanup on save failure, `db._lock is catalog._lock` shared-RLock identity, adversarial inputs to `_validate_table_name`, hybrid-search RRF rank symmetry under filter, and same-text-different-id deduplication.
|
|
36
|
+
- `tests/unit/test_v26_encryption_review_pass_3.py` — covers nonce uniqueness across saves, wrong-key decrypt does not create the output file, AAD-bound header tampering fails authentication, salt sidecar O_EXCL preservation, and v0→v1 migration round-trip.
|
|
37
|
+
- `tests/unit/integrations/test_llamaindex_review_pass_3.py` — covers the `add → query` round-trip preserving the original LlamaIndex node id, end-to-end migration-then-delete on v2.5-shaped data, the legacy-collection `DeprecationWarning` at `__init__` time, and that `sqlite3.DatabaseError` from the metadata-fallback path now propagates instead of being swallowed.
|
|
38
|
+
|
|
39
|
+
### Fixed (concurrency & durability)
|
|
40
|
+
|
|
41
|
+
- **Atomic `UsearchIndex.save`** — now writes to a sibling `.tmp`, fsyncs, then `os.replace()`s onto the live path and fsyncs the parent directory. A crash mid-save can no longer corrupt the only copy of the index. Also moved the `_dirty` short-circuit inside `_write_lock` so a concurrent `add` cannot have its dirty flag silently cleared.
|
|
42
|
+
- **Atomic `rebuild_index`** — builds the new index at a sibling `.rebuild` path and atomically swaps it onto the live path; the old index remains the canonical copy until the swap succeeds.
|
|
43
|
+
- **Atomic encrypted save** — `encrypt_file` / `decrypt_file` now write to a sibling `.tmp`, fsync, set mode `0o600`, then `os.replace()`. `encrypt_index_file` only unlinks the plaintext after the encrypted output is durably on disk. A torn write can no longer leave the index unrecoverable.
|
|
44
|
+
- **`VectorDB`-level `RLock`** — a single re-entrant lock now serializes the `_collections` cache (no more check-then-insert TOCTOU on `collection()`) and is shared with every `CatalogManager` so all `with self.conn:` blocks across collections cannot interleave on the shared `sqlite3.Connection`. Reads remain lock-free at the SQLite level via WAL.
|
|
45
|
+
- **`AsyncVectorDB.close` drains** — switched from `executor.shutdown(wait=False)` to `wait=True` so in-flight pool tasks finish their cursors before the SQLite connection is closed. Pending (not-yet-started) work is still cancelled.
|
|
46
|
+
- **`set_parent` cycle check is transactional** — descendant lookup and parent UPDATE now run inside the same `with self._lock, self.conn:` block, closing a TOCTOU window where a concurrent edge could form a cycle.
|
|
47
|
+
- **Cluster persistence** — `_ensure_cluster_table`, `save_cluster_state`, `delete_cluster_state` now use `with self._lock, self.conn:` instead of bare `conn.commit()`; an exception during the execute is properly rolled back.
|
|
48
|
+
- **`add_documents` ID recovery is correct under upsert** — replaced the `last_insert_rowid()` arithmetic (which silently returned wrong IDs for batches mixing explicit and `None` IDs because UPSERTs do not advance the auto-increment counter) with a single `INSERT … RETURNING id` for the auto-ID rows. Explicit-ID rows still take the upsert path.
|
|
49
|
+
- **`delete_collection` closes cached indexes first** — any `VectorCollection` instances cached for the deleted name have their `UsearchIndex` closed before the file is unlinked, so a stale mmap view cannot race the unlink.
|
|
50
|
+
|
|
51
|
+
### Changed
|
|
52
|
+
|
|
53
|
+
- **`upsert_fts_rows` / `delete_fts_rows` are now `_upsert_fts_rows` / `_delete_fts_rows`** (private). The FTS shadow table must be updated inside the same transaction as the main table or it can desync on crash; the rename signals the contract.
|
|
54
|
+
- **`get_legacy_vectors`, `drop_legacy_vec_table`** now validate the supplied table name via `_validate_table_name` before interpolating into SQL.
|
|
55
|
+
|
|
56
|
+
### Added
|
|
57
|
+
|
|
58
|
+
- **Declared `python-dotenv` dependency** — `simplevecdb.config` already imported and called `load_dotenv` at package import; the missing dependency would `ImportError` on a clean install of the base package without optional extras.
|
|
59
|
+
|
|
60
|
+
### Fixed (correctness & quality)
|
|
61
|
+
|
|
62
|
+
- **RRF deduplication keys by document ID, not text** — `hybrid_search` previously deduped by `doc.page_content`, silently merging two distinct documents that happened to share text into one inflated-score result.
|
|
63
|
+
- **NaN/Inf guard at insert** — `add_texts` and `add_texts_streaming` reject non-finite vectors instead of feeding them to HNSW, which would produce undefined neighbours and could corrupt the graph.
|
|
64
|
+
- **`normalize_l2` handles subnormals** — replaced the exact `norm == 0` compare with a `< 1e-12` check (matching the existing usearch_index guard); subnormal floats no longer produce wildly large normalized vectors.
|
|
65
|
+
- **Silhouette score samples on large collections** — `silhouette_score` is O(n²); now caps the evaluation sample at `SILHOUETTE_MAX_SAMPLE = 10_000`. Large collections no longer OOM.
|
|
66
|
+
- **MMR maintains the selected matrix incrementally** — replaced per-iteration `np.stack(selected_embs)` with `np.vstack` of a running matrix. O(k²·d) wasted allocations dropped to O(k·d).
|
|
67
|
+
- **`_parse_bool_env` treats `KEY=` as unset** — empty strings now fall through to the default; previously they were truthy because `"".strip()` is not in the falsey set.
|
|
68
|
+
- **LangChain async methods use `asyncio.to_thread`** — `aadd_texts` / `asimilarity_search` / `amax_marginal_relevance_search` no longer block the event loop.
|
|
69
|
+
- **LlamaIndex `delete()` survives a process restart** — node IDs are persisted into document metadata under `_simplevecdb_node_id`; `delete()` falls back to a metadata query when the in-memory `_id_map` is empty.
|
|
70
|
+
- **LlamaIndex query results carry stable node IDs** — replaced `str(hash(page_content))` (process-randomized, collision-prone) with the persisted `_simplevecdb_node_id`.
|
|
71
|
+
- **`AsyncVectorDB.collection` accepts `store_embeddings`** — async callers can now enable embedding storage (required for `rebuild_index()`); previously they had no way to set it.
|
|
72
|
+
|
|
73
|
+
### Security
|
|
74
|
+
|
|
75
|
+
- **API key comparison uses `hmac.compare_digest`** — the prior `token not in allowed_keys` short-circuit leaked key prefixes via response time.
|
|
76
|
+
- **SQLCipher PRAGMA key always uses the `x'hex'` form** — every key path now goes through `_normalize_key` first, eliminating string interpolation of user-supplied passphrase characters into a quoted PRAGMA argument.
|
|
77
|
+
- **`is_database_encrypted` rejects zero-byte files** — previously a missing/empty DB looked like an unencrypted DB because `sqlite3.connect` would create a fresh one.
|
|
78
|
+
|
|
79
|
+
### Changed (tooling)
|
|
80
|
+
|
|
81
|
+
- **Ruff and mypy targets aligned with `requires-python>=3.10`** — both were `py312`, hiding 3.10/3.11 incompatibilities. Cleaned three resulting `F401` unused-import warnings (`signal` in models.py, `_batched` and `constants` re-imports).
|
|
82
|
+
- **Pre-commit version-sync hook** — `__init__.py` derives `__version__` dynamically via `importlib.metadata`, so `check_version_sync.py` was failing on every commit looking for a literal `__version__ = "x.y.z"` line that does not exist. The hook now validates only `pyproject.toml`'s version field. `bump_version.py` similarly stops trying to rewrite `__init__.py` and uses an anchored regex to update only the canonical version field.
|
|
83
|
+
|
|
84
|
+
### Security (2.6.0 final)
|
|
85
|
+
|
|
86
|
+
- **Per-DB random PBKDF2 salt** — encrypted databases and index files now generate a random 16-byte salt at creation time, written to a `<resource>.salt` sidecar with mode `0o600`. The previous fixed `b"simplevecdb-sqlcipher-key"` salt let an attacker precompute one rainbow table that broke every simplevecdb installation with the same passphrase. Pre-2.6.0 encrypted resources keep working unchanged: when no sidecar exists, the loader falls back to the legacy fixed salt automatically.
|
|
87
|
+
- **HuggingFace `repo_id` allowlist + `trust_remote_code=False`** — the embeddings server validates model names against a strict regex (`namespace/name` with `[A-Za-z0-9_.-]` only) before passing them to `snapshot_download` / `SentenceTransformer`, blocking path traversal and local-filesystem inputs. `SentenceTransformer` is constructed with `trust_remote_code=False` so a malicious model card cannot trigger arbitrary downloaded Python on load.
|
|
88
|
+
- **CORS is opt-in** — the server no longer adds CORS middleware unless `EMBEDDING_SERVER_CORS_ORIGINS` is set. When the operator does set wildcard origins (`["*"]`), `allow_credentials` is forced off so the spec-violating wildcard-with-credentials combo can't be produced.
|
|
89
|
+
|
|
90
|
+
### Migration helpers
|
|
91
|
+
|
|
92
|
+
- **`SimpleVecDBLlamaStore.migrate_node_id_metadata()`** — backfills `_simplevecdb_node_id` for documents inserted before 2.6.0. Pre-2.6.0 versions did not persist the LlamaIndex node_id into metadata, so `delete()` could not find the right row after a process restart. Idempotent — already-stamped rows are skipped.
|
|
93
|
+
|
|
94
|
+
### Added (hygiene & polish)
|
|
95
|
+
|
|
96
|
+
- **`ClusterResult` and `ClusterTagCallback` exported from `simplevecdb`** — they were return/argument types of public methods but had no public import path; users had to reach into `simplevecdb.types`.
|
|
97
|
+
- **`NullHandler` attached to the package's root logger** at import time, per the Python logging HOWTO. Idempotent — duplicate calls do not stack handlers.
|
|
98
|
+
- **`SimpleVecDBLlamaStore.delete_nodes` raises `NotImplementedError`** when called with `filters`, instead of silently dropping the filter portion and pretending the deletion succeeded.
|
|
99
|
+
- **Recursive CTE depth bound as a parameter** in `get_descendants` / `get_ancestors`. The previous f-string interpolation was safe due to `int()` coercion, but a future refactor that dropped the coercion would have opened an injection path; binding the depth as a query parameter removes that risk.
|
|
100
|
+
- **`Config.from_env()` documented** as returning the import-time-frozen instance; setting env vars after import does not refresh.
|
|
101
|
+
- **`ModelRegistry(allow_unlisted=...)` defaults to `False`** to match the secure-by-default config setting; programmatic instantiations no longer get an open registry by accident.
|
|
102
|
+
- **`/v1/usage` returns aggregated totals when auth is disabled** instead of leaking the per-IP buckets to anyone who hits the endpoint.
|
|
103
|
+
- **Server validates `EMBEDDING_SERVER_MAX_REQUEST_ITEMS <= _MAX_ENCODE_BATCH` at startup** so an out-of-range env var fails fast at boot rather than per request.
|
|
104
|
+
- **`pyproject.toml` gains `[project.urls]`, `classifiers`, and `keywords`** for a useful PyPI listing.
|
|
105
|
+
- **`.bandit` documents the B104 skip** and warns that any future `0.0.0.0` binding requires removing the skip.
|
|
106
|
+
- **Encrypted file format now carries a 3-byte header** (`'SV' + version`) so future format changes are detectable. `decrypt_file` accepts both the new v1 format and the v0 (pre-2.6.0) format, so existing encrypted indexes still load without re-encryption.
|
|
107
|
+
|
|
108
|
+
### Fixed (review pass 2)
|
|
109
|
+
|
|
110
|
+
- **NaN/Inf rejection no longer leaves orphan catalog rows** — `add_texts` and `_process_streaming_batch` now validate vectors *before* the SQLite insert. Previously the catalog row committed first and a non-finite vector then raised, leaving rows visible via `get_documents_by_ids` but unreachable through similarity search.
|
|
111
|
+
- **`VectorCollection.__repr__` no longer issues SQL** — the previous `count()` call would raise `ProgrammingError` after `close()`, breaking debuggers and exception formatters that auto-stringify objects. The earlier 2.6.0 fix only covered `VectorDB.__repr__`.
|
|
112
|
+
- **`EMBEDDING_SERVER_MAX_REQUEST_ITEMS` validation runs at module import** — the guard was previously inside `run_server()` and was bypassed under any non-CLI ASGI deployment (gunicorn, programmatic uvicorn).
|
|
113
|
+
- **LlamaIndex empty-`node_id` path is atomic** — `SimpleVecDBLlamaStore.add` now generates a UUID for nodes that arrive without a `node_id` and stamps it into metadata *before* the row insert, so the metadata commit is in the same SQLite transaction as the catalog row. Previously a separate `UPDATE` followed `add_texts`; a crash in the gap left rows un-stampable and cross-restart `delete()` silently no-op'd.
|
|
114
|
+
- **Catalog read paths serialize on `self._lock`** — `get_documents_by_ids`, `get_embeddings_by_ids`, `get_documents_and_embeddings_by_ids`, `find_ids_by_texts`, `find_ids_by_filter`, `keyword_search`, `count`, `get_all_docs_with_text`, `check_legacy_sqlite_vec`, `get_legacy_vectors`, `get_children`, `get_parent`, `get_descendants`, `get_ancestors`, `load_cluster_state`, `list_cluster_states`, and `VectorDB.list_collections` now acquire the connection-level lock around `conn.execute`. `sqlite3.Connection` is not safe for concurrent statement execution from multiple threads even under WAL.
|
|
115
|
+
- **`rebuild_index` is fully serialized** — the entire fetch + build + swap now runs inside `with self._lock:` so concurrent `add` / `delete` cannot mutate the catalog mid-rebuild and produce a stale snapshot.
|
|
116
|
+
- **`_ensure_cluster_table` double-checked under lock** — the `_cluster_table_ready` flag is now re-checked inside the lock and set inside the `with` block. Concurrent first-callers no longer both run the DDL.
|
|
117
|
+
- **`utils.file_lock` opens via `os.open(O_CREAT | O_RDWR, 0o600)`** — no truncation of stale lock files from a crashed prior run, restricted permissions on the lock sentinel.
|
|
118
|
+
|
|
119
|
+
## [2.5.0] - 2026-04-07
|
|
120
|
+
|
|
121
|
+
### Added
|
|
122
|
+
|
|
123
|
+
- **`delete_collection(name)`** — drop a collection's SQLite tables, FTS index, and usearch file in one call. Available on both `VectorDB` and `AsyncVectorDB`.
|
|
124
|
+
- **`store_embeddings` parameter** on `collection()` — opt into storing embedding BLOBs in SQLite (default `False`). Leaving it off saves ~2x storage; MMR transparently fetches vectors from the usearch index when BLOBs are absent.
|
|
125
|
+
- **`async_retry_on_lock` decorator** — async variant of `retry_on_lock` using `asyncio.sleep` instead of `time.sleep`, avoiding executor thread blocking.
|
|
126
|
+
- **`file_lock` context manager** — advisory cross-process file locking (`fcntl`/`msvcrt`) for usearch index files. Prevents corruption from concurrent processes.
|
|
127
|
+
- **`__repr__`** on `VectorDB`, `VectorCollection`, `AsyncVectorDB`, `AsyncVectorCollection` for debuggable string representations.
|
|
128
|
+
- **FLOAT16 quantization** fully implemented in `serialize()`/`deserialize()` — was previously defined in the enum but raised `ValueError` at runtime.
|
|
129
|
+
- **Pagination** on `get_documents(limit=, offset=)` and catalog methods (`find_ids_by_filter`, `find_ids_by_texts`) — previously returned unbounded result sets.
|
|
130
|
+
- **Embeddings server enhancements:**
|
|
131
|
+
- Graceful shutdown with SIGTERM/SIGINT draining (10s timeout)
|
|
132
|
+
- CORS middleware with configurable origins for browser-based clients
|
|
133
|
+
- Model warm-up on startup (skip with `--no-warmup`)
|
|
134
|
+
- Input validation: rejects empty strings (422) and texts exceeding 100k chars (413)
|
|
135
|
+
- Proper `argparse` CLI with `--host`, `--port`, `--no-warmup`, `--help`
|
|
136
|
+
- Startup banner logging config summary (host, port, model, auth, rate limits)
|
|
137
|
+
- Nested token array normalization (`list[list[int]]` input format)
|
|
138
|
+
- Async executor offload for `embed_texts` (non-blocking event loop)
|
|
139
|
+
- OpenAPI version synced from package metadata
|
|
140
|
+
- Module `__init__.py` exports (`embed_texts`, `get_embedder`, `load_model`, `app`, `run_server`)
|
|
141
|
+
|
|
142
|
+
### Fixed
|
|
143
|
+
|
|
144
|
+
- **`delete_by_ids` ordering** — SQLite deletion now happens first (transactional, can rollback), then usearch. Previously usearch removed first, leaving orphaned catalog entries on SQLite failure.
|
|
145
|
+
- **`_matches_filter` string semantics** — now uses exact equality, consistent with SQL `build_filter_clause`. Was using substring match (`value in str(meta_value)`).
|
|
146
|
+
- **`list_collections`** — scans `sqlite_master` for persisted collection tables instead of returning only session-cached names. Works across reopened databases.
|
|
147
|
+
- **WAL mode for encrypted databases** — `PRAGMA journal_mode=WAL` and `PRAGMA synchronous=NORMAL` now set for SQLCipher connections (was only set for unencrypted).
|
|
148
|
+
- **`collection()` cache key** — includes `distance_strategy` and `quantization` in cache key (sync version). Previously cached by name only, silently ignoring differing params on cache hit.
|
|
149
|
+
- **`_ensure_fts_table`** — retries up to 3 times on transient "database is locked" errors instead of permanently disabling FTS on first failure.
|
|
150
|
+
- **Connection health check** — `SELECT 1` probe after connection creation; raises `RuntimeError` immediately on corrupt databases.
|
|
151
|
+
|
|
152
|
+
### Improved
|
|
153
|
+
|
|
154
|
+
- **Usearch batch operations** — `add()`, `remove()`, and `get()` now use batch usearch APIs instead of per-key loops. Significant speedup for large operations.
|
|
155
|
+
- **Filtered search iterative deepening** — replaces fixed `k*3` overfetch with adaptive doubling (up to `k*30`). Highly selective filters now reliably return `k` results.
|
|
156
|
+
- **Memory-map heuristic** — uses file size threshold (50MB) instead of inaccurate `file_size // 100` vector count estimate for mmap vs load decision.
|
|
157
|
+
- **Apple chip detection** — uses `platform.processor()` instead of spawning a `sysctl` subprocess.
|
|
158
|
+
|
|
159
|
+
### Removed
|
|
160
|
+
|
|
161
|
+
- **Duplicate `_dim` property** — removed in favor of the public `dim` property.
|
|
162
|
+
|
|
163
|
+
### Breaking Changes
|
|
164
|
+
|
|
165
|
+
- String metadata filters now use exact equality (was substring match).
|
|
166
|
+
- `store_embeddings` defaults to `False` — `rebuild_index()` requires `store_embeddings=True` or re-adding documents.
|
|
167
|
+
|
|
8
168
|
## [2.4.0] - 2026-03-22
|
|
9
169
|
|
|
10
170
|
### Added
|
|
@@ -80,12 +80,57 @@ db = VectorDB("secure.db", encryption_key=encryption_key)
|
|
|
80
80
|
With encryption enabled, files are stored as:
|
|
81
81
|
|
|
82
82
|
```
|
|
83
|
-
mydb.db
|
|
84
|
-
mydb.db.
|
|
83
|
+
mydb.db # SQLCipher encrypted SQLite database
|
|
84
|
+
mydb.db.salt # 16-byte random salt sidecar (mode 0o600)
|
|
85
|
+
mydb.db.default.usearch.enc # AES-256-GCM encrypted usearch index (v1)
|
|
86
|
+
mydb.db.default.usearch.enc.salt # 16-byte salt sidecar for the index
|
|
85
87
|
```
|
|
86
88
|
|
|
87
89
|
When opened, the index is decrypted to memory (or a temp file). On `save()` or `close()`, the index is re-encrypted.
|
|
88
90
|
|
|
91
|
+
### Per-DB random salt (2.6.0+)
|
|
92
|
+
|
|
93
|
+
Each encrypted database and each encrypted index file gets its own
|
|
94
|
+
random 16-byte salt, written to a sibling `.salt` file with mode
|
|
95
|
+
`0o600`. The salt is the second input to PBKDF2-HMAC-SHA256, so two
|
|
96
|
+
databases that share the same passphrase derive **different** keys.
|
|
97
|
+
|
|
98
|
+
The sidecar is created with `O_CREAT | O_EXCL` so two processes opening
|
|
99
|
+
the same fresh database concurrently cannot race to write conflicting
|
|
100
|
+
salts; the loser reads the winner's salt and proceeds. An existing
|
|
101
|
+
sidecar is never overwritten — clobbering it would render the database
|
|
102
|
+
permanently unreadable with the original passphrase.
|
|
103
|
+
|
|
104
|
+
Pre-2.6.0 databases continue to open with a fixed legacy salt when no
|
|
105
|
+
sidecar is present, so existing on-disk data keeps working unchanged.
|
|
106
|
+
|
|
107
|
+
### v1 index file format (2.6.0+)
|
|
108
|
+
|
|
109
|
+
Index files written by 2.6.0+ start with a 3-byte header (2-byte magic plus 1-byte version), followed by the nonce and the ciphertext:
|
|
110
|
+
|
|
111
|
+
```
|
|
112
|
+
magic = b"SV" (2 bytes)
|
|
113
|
+
version = 0x01 (1 byte)
|
|
114
|
+
nonce = 12 bytes
|
|
115
|
+
ciphertext + GCM tag
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
The header bytes are bound into the AES-GCM **associated_data**, so
|
|
119
|
+
any tampering with the magic or version (including a downgrade attempt
|
|
120
|
+
that strips them) fails authentication on decrypt. Pre-2.6.0 (v0) blobs
|
|
121
|
+
have no header and continue to decrypt successfully — `decrypt_file`
|
|
122
|
+
detects the format automatically.
|
|
123
|
+
|
|
124
|
+
### Atomic durability
|
|
125
|
+
|
|
126
|
+
`encrypt_file` and `decrypt_file` write to a sibling `.tmp` file,
|
|
127
|
+
`fsync()` the data, set mode `0o600`, then `os.replace()` onto the
|
|
128
|
+
target. The parent directory is also fsynced so the rename itself is
|
|
129
|
+
durable on POSIX. A crash mid-write leaves only the orphan temp file —
|
|
130
|
+
the live target is never torn. `encrypt_index_file` only unlinks the
|
|
131
|
+
plaintext after the encrypted output is durably on disk, so an
|
|
132
|
+
interrupted re-encryption never destroys data.
|
|
133
|
+
|
|
89
134
|
## Performance
|
|
90
135
|
|
|
91
136
|
### Search Operations
|
|
@@ -135,10 +180,12 @@ except EncryptionUnavailableError:
|
|
|
135
180
|
|
|
136
181
|
## Security Notes
|
|
137
182
|
|
|
138
|
-
- **SQLCipher** uses AES-256-CBC with HMAC-SHA512 for authentication
|
|
139
|
-
- **Index encryption** uses AES-256-GCM with random 96-bit nonces
|
|
140
|
-
- **Key derivation** uses PBKDF2-SHA256 with
|
|
141
|
-
- **
|
|
183
|
+
- **SQLCipher** uses AES-256-CBC with HMAC-SHA512 for authentication.
|
|
184
|
+
- **Index encryption** uses AES-256-GCM with random 96-bit nonces (`secrets.token_bytes`); each save generates a fresh nonce.
|
|
185
|
+
- **Key derivation** uses PBKDF2-HMAC-SHA256 with **600,000 iterations** (OWASP 2024 recommendation) and a per-DB random salt.
|
|
186
|
+
- **v1 file format** binds the magic+version header bytes into AES-GCM `associated_data`, defeating header tampering and downgrade attacks.
|
|
187
|
+
- **Derived keys** are cached in a bounded LRU (max 64 entries, access serialized by a thread lock) so repeat opens within a process avoid the 600,000-iteration PBKDF2 cost, while the bound keeps key material from accumulating without limit in long-running multi-tenant processes.
|
|
188
|
+
- **The encryption key is held in memory** during database usage.
|
|
142
189
|
|
|
143
190
|
## API Reference
|
|
144
191
|
|