simplevecdb 2.4.0__tar.gz → 2.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (140) hide show
  1. simplevecdb-2.6.0/.bandit +9 -0
  2. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/.gitignore +4 -2
  3. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/CHANGELOG.md +160 -0
  4. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/PKG-INFO +39 -3
  5. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/README.md +19 -2
  6. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/docs/CHANGELOG.md +160 -0
  7. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/docs/api/encryption.md +53 -6
  8. simplevecdb-2.6.0/lefthook.yml +39 -0
  9. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/pyproject.toml +39 -3
  10. simplevecdb-2.6.0/scripts/bump_version.py +88 -0
  11. simplevecdb-2.6.0/scripts/check_version_sync.py +92 -0
  12. simplevecdb-2.6.0/scripts/track_metrics.py +82 -0
  13. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/src/simplevecdb/__init__.py +13 -1
  14. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/src/simplevecdb/async_core.py +59 -10
  15. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/src/simplevecdb/config.py +18 -3
  16. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/src/simplevecdb/constants.py +6 -1
  17. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/src/simplevecdb/core.py +335 -67
  18. simplevecdb-2.6.0/src/simplevecdb/embeddings/__init__.py +12 -0
  19. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/src/simplevecdb/embeddings/models.py +38 -9
  20. simplevecdb-2.6.0/src/simplevecdb/embeddings/server.py +637 -0
  21. simplevecdb-2.6.0/src/simplevecdb/encryption.py +672 -0
  22. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/src/simplevecdb/engine/catalog.py +328 -150
  23. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/src/simplevecdb/engine/clustering.py +13 -2
  24. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/src/simplevecdb/engine/quantization.py +42 -5
  25. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/src/simplevecdb/engine/search.py +118 -59
  26. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/src/simplevecdb/engine/usearch_index.py +115 -56
  27. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/src/simplevecdb/integrations/langchain.py +27 -5
  28. simplevecdb-2.6.0/src/simplevecdb/integrations/llamaindex.py +355 -0
  29. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/src/simplevecdb/logging.py +9 -39
  30. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/src/simplevecdb/utils.py +162 -1
  31. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/integration/test_langchain.py +1 -1
  32. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/integration/test_llamaindex.py +4 -4
  33. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/integration/test_rag.py +12 -3
  34. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/integration/test_server.py +15 -10
  35. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/core/test_core_additional_coverage.py +2 -2
  36. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/core/test_initialization.py +2 -2
  37. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/core/test_missing_coverage.py +6 -7
  38. simplevecdb-2.6.0/tests/unit/core/test_v25_correctness.py +267 -0
  39. simplevecdb-2.6.0/tests/unit/core/test_v25_features.py +344 -0
  40. simplevecdb-2.6.0/tests/unit/core/test_v25_robustness.py +306 -0
  41. simplevecdb-2.6.0/tests/unit/core/test_v26_safety.py +182 -0
  42. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/embeddings/test_models.py +1 -0
  43. simplevecdb-2.6.0/tests/unit/embeddings/test_repo_id_validation.py +100 -0
  44. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/embeddings/test_server.py +33 -25
  45. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/embeddings/test_server_coverage.py +1 -2
  46. simplevecdb-2.6.0/tests/unit/embeddings/test_v25_enhancements.py +380 -0
  47. simplevecdb-2.6.0/tests/unit/engine/test_v26_quantization_clustering.py +133 -0
  48. simplevecdb-2.6.0/tests/unit/integrations/test_llamaindex_review_pass_3.py +154 -0
  49. simplevecdb-2.6.0/tests/unit/integrations/test_llamaindex_v26.py +190 -0
  50. simplevecdb-2.6.0/tests/unit/test_async_v26.py +115 -0
  51. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/test_catalog_coverage.py +10 -12
  52. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/test_core.py +6 -7
  53. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/test_encryption_coverage.py +0 -1
  54. simplevecdb-2.6.0/tests/unit/test_encryption_salt.py +98 -0
  55. simplevecdb-2.6.0/tests/unit/test_encryption_v1_format.py +220 -0
  56. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/test_error_handling.py +0 -16
  57. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/test_search_missing_coverage.py +2 -3
  58. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/test_usearch_index_missing_coverage.py +2 -3
  59. simplevecdb-2.6.0/tests/unit/test_v26_encryption_review_pass_3.py +214 -0
  60. simplevecdb-2.6.0/tests/unit/test_v26_misc.py +184 -0
  61. simplevecdb-2.6.0/tests/unit/test_v26_review_pass_3.py +270 -0
  62. simplevecdb-2.6.0/tests/unit/test_v26_review_pass_4.py +179 -0
  63. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/uv.lock +245 -227
  64. simplevecdb-2.4.0/.bandit +0 -9
  65. simplevecdb-2.4.0/.claude/settings.local.json +0 -8
  66. simplevecdb-2.4.0/.pre-commit-config.yaml +0 -37
  67. simplevecdb-2.4.0/src/simplevecdb/embeddings/__init__.py +0 -0
  68. simplevecdb-2.4.0/src/simplevecdb/embeddings/server.py +0 -374
  69. simplevecdb-2.4.0/src/simplevecdb/encryption.py +0 -417
  70. simplevecdb-2.4.0/src/simplevecdb/integrations/llamaindex.py +0 -227
  71. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/.env.example +0 -0
  72. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/.github/FUNDING.yml +0 -0
  73. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  74. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  75. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  76. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/.github/dependabot.yml +0 -0
  77. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/.github/workflows/ci.yml +0 -0
  78. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/.github/workflows/publish.yml +0 -0
  79. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/.github/workflows/security.yml +0 -0
  80. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/.github/workflows/update-sponsors.yml +0 -0
  81. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/.python-version +0 -0
  82. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/CODE_OF_CONDUCT.md +0 -0
  83. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/CONTRIBUTING.md +0 -0
  84. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/LICENSE +0 -0
  85. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/SECURITY.md +0 -0
  86. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/docs/CONTRIBUTING.md +0 -0
  87. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/docs/ENV_SETUP.md +0 -0
  88. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/docs/LICENSE +0 -0
  89. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/docs/api/async.md +0 -0
  90. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/docs/api/config.md +0 -0
  91. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/docs/api/core.md +0 -0
  92. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/docs/api/embeddings.md +0 -0
  93. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/docs/api/engine/catalog.md +0 -0
  94. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/docs/api/engine/quantization.md +0 -0
  95. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/docs/api/engine/search.md +0 -0
  96. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/docs/api/integrations.md +0 -0
  97. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/docs/api/types.md +0 -0
  98. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/docs/benchmarks.md +0 -0
  99. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/docs/examples.md +0 -0
  100. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/docs/guides/clustering.md +0 -0
  101. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/docs/index.md +0 -0
  102. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/examples/auto_embed.py +0 -0
  103. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/examples/backend_benchmark.py +0 -0
  104. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/examples/embeddings/perf_benchmark.py +0 -0
  105. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/examples/quant_benchmark.py +0 -0
  106. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/examples/rag/langchain_rag.ipynb +0 -0
  107. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/examples/rag/llama_rag.ipynb +0 -0
  108. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/examples/rag/ollama_rag.ipynb +0 -0
  109. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/examples/smoke_test.py +0 -0
  110. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/mkdocs.yml +0 -0
  111. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/src/simplevecdb/engine/__init__.py +0 -0
  112. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/src/simplevecdb/integrations/__init__.py +0 -0
  113. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/src/simplevecdb/types.py +0 -0
  114. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/conftest.py +0 -0
  115. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/integration/test_v21_features.py +0 -0
  116. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/perf/test_batch_detection.py +0 -0
  117. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/perf/test_performance.py +0 -0
  118. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/core/__init__.py +0 -0
  119. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/core/test_batch_detection.py +0 -0
  120. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/core/test_factory_methods.py +0 -0
  121. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/core/test_filters.py +0 -0
  122. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/core/test_quantization.py +0 -0
  123. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/core/test_similarity_search.py +0 -0
  124. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/embeddings/__init__.py +0 -0
  125. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/integrations/__init__.py +0 -0
  126. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/integrations/test_langchain_coverage.py +0 -0
  127. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/integrations/test_llamaindex_coverage.py +0 -0
  128. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/test_async.py +0 -0
  129. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/test_async_coverage.py +0 -0
  130. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/test_clustering.py +0 -0
  131. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/test_config.py +0 -0
  132. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/test_cross_collection_search.py +0 -0
  133. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/test_encryption.py +0 -0
  134. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/test_hierarchy.py +0 -0
  135. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/test_multi_collection.py +0 -0
  136. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/test_search.py +0 -0
  137. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/test_search_coverage.py +0 -0
  138. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/test_streaming.py +0 -0
  139. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/test_types.py +0 -0
  140. {simplevecdb-2.4.0 → simplevecdb-2.6.0}/tests/unit/test_utils.py +0 -0
@@ -0,0 +1,9 @@
1
+ exclude_dirs:
2
+
3
+ - /tests
4
+ - /examples
5
+
6
+ skips:
7
+
8
+ - B104 # 0.0.0.0 binding: SERVER_HOST defaults to 127.0.0.1; bandit can't see runtime defaults, so the warning is a false positive on this codebase. Keep it skipped only because the default is safe — if anyone introduces a hardcoded "0.0.0.0", remove this skip.
9
+ - B608 # SQL injection false positive: table names are validated via _validate_table_name()
@@ -21,16 +21,18 @@ build/
21
21
  *.db
22
22
  *.sqlite
23
23
 
24
- # OpenCode
24
+ # Agentic CLI tools (per-developer state)
25
25
  .opencode/
26
26
  opencode.json
27
+ .claude/
28
+ .codex
27
29
 
28
30
  # Project specific
29
31
  simplevecdb_plan.md
30
32
  AGENTS.md
31
33
  htmlcov/
32
34
  site/
33
- scripts
35
+ htmlcov/
34
36
  .coverage
35
37
  NEXT_UPDATES.md
36
38
  pro_pack/
@@ -5,6 +5,166 @@ All notable changes to SimpleVecDB will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [2.6.0] - 2026-05-06
9
+
10
+ ### Review pass 3 — final correctness/security pass before tag
11
+
12
+ #### Critical fixes
13
+
14
+ - **`UsearchIndex.save` lost-update race** — the `_dirty = False` clear was outside the `file_lock` window, so a concurrent `add()` between `os.replace()` and the dirty-flag clear could be silently overwritten. Moved inside `file_lock`.
15
+ - **`UsearchIndex.save` data fsync on `O_RDONLY` fd** — `fsync(2)` on a read-only file descriptor has implementation-defined behavior on Linux (some kernels return `EBADF`, swallowed by the warning branch). Switched to `O_RDWR` so the data fsync is guaranteed.
16
+ - **`_rebuild_index_locked` bare `conn.execute`** — replaced the bare `self.conn.execute("SELECT id FROM ...")` with the new `CatalogManager.list_all_ids()`, which routes the read through `self._lock` instead of relying on RLock re-entrancy from a single caller.
17
+ - **PBKDF2 iteration bump** — raised the iteration count from 480 000 to 600 000 to match the OWASP 2024 minimum for PBKDF2-HMAC-SHA256.
18
+ - **AES-GCM AAD now binds the v1 header** — `encrypt_file` / `decrypt_file` pass the magic+version bytes as `associated_data`, so any tampering with the header (including downgrade attempts) fails authentication instead of silently succeeding.
19
+ - **Bounded normalize-key cache** — `_NORMALIZE_KEY_CACHE` is now an LRU capped at 64 entries, serialized by a `threading.Lock`. Long-running multi-tenant processes no longer leak derived key material indefinitely.
20
+ - **LlamaIndex `delete()` no longer swallows `sqlite3.DatabaseError`** — narrowed the exception in the metadata-fallback path to `(TypeError, NotImplementedError)`. A locked DB, closed connection, or schema mismatch now propagates to the caller instead of becoming a silent no-op.
21
+ - **Hybrid-search RRF rank symmetry** — vector candidates now use the original HNSW position as their RRF rank (via `enumerate(vector_keys_list)`), matching how keyword candidates use raw BM25 position. Previously, a metadata filter that rejected vector candidates inflated surviving vector scores relative to keyword scores, corrupting result ordering.
22
+ - **`add_documents` FTS sentinel guard** — added a defense-in-depth check that raises `RuntimeError` if any `-1` sentinel rowid remains in `real_ids` before the FTS upsert. Prevents a hypothetical retry-loop interaction from corrupting the FTS index with rowid `-1`.
23
+
24
+ #### Important fixes
25
+
26
+ - **`delete_collection` TOCTOU** — moved the `list_collections()` existence check inside the `with self._lock:` block so two concurrent `delete_collection(name)` calls cannot both pass the check; the second now sees a clean `KeyError` instead of a SQLite error.
27
+ - **Salt sidecar `O_EXCL` guard** — `_resolve_salt(create_if_missing=True)` now creates the sidecar with `O_CREAT | O_EXCL`. If two processes race, the loser reads the winner's salt; if a sidecar already exists out-of-band, it is preserved instead of being clobbered (which would have rendered an existing DB unreadable).
28
+ - **`encrypt_index_file` v0→v1 sidecar migration** — re-encrypting a legacy v0 blob (no sidecar) now creates a fresh sidecar, completing the migration path to per-DB salts. Previously, `is_first_encryption` was keyed on `.enc` presence rather than `.salt` presence.
29
+ - **LlamaIndex legacy-collection warning** — `SimpleVecDBLlamaStore.__init__` now emits a one-shot `DeprecationWarning` when it detects rows lacking `_simplevecdb_node_id`, telling the operator to call `migrate_node_id_metadata()` and noting the inherent limitation that pre-2.6 rows can only be stamped with `str(doc_id)` (the original LlamaIndex node ids were never persisted).
30
+ - **INT8 quantization out-of-range check softened** — instead of raising `ValueError` when `max(|x|) > 1.0 + 1e-5`, the strategy now emits a one-shot `DeprecationWarning` and clips the values. This restores backwards compatibility for callers that relied on the prior silent-clip behavior.
31
+ - **`scripts/check_version_sync.py` now validates `CHANGELOG.md`** — the hook fails if the latest CHANGELOG entry header does not match `pyproject.toml`'s version, preventing a release from shipping with a stale changelog.
32
+
33
+ #### Test coverage added (review pass 3 gaps)
34
+
35
+ - `tests/unit/test_v26_review_pass_3.py` — covers parent-directory fsync on save, `.tmp` cleanup on save failure, `db._lock is catalog._lock` shared-RLock identity, adversarial inputs to `_validate_table_name`, hybrid-search RRF rank symmetry under filter, and same-text-different-id deduplication.
36
+ - `tests/unit/test_v26_encryption_review_pass_3.py` — covers nonce uniqueness across saves, wrong-key decrypt does not create the output file, AAD-bound header tampering fails authentication, salt sidecar O_EXCL preservation, and v0→v1 migration round-trip.
37
+ - `tests/unit/integrations/test_llamaindex_review_pass_3.py` — covers the `add → query` round-trip preserving the original LlamaIndex node id, end-to-end migration-then-delete on v2.5-shaped data, the legacy-collection `DeprecationWarning` at `__init__` time, and that `sqlite3.DatabaseError` from the metadata-fallback path now propagates instead of being swallowed.
38
+
39
+ ### Fixed (concurrency & durability)
40
+
41
+ - **Atomic `UsearchIndex.save`** — now writes to a sibling `.tmp`, fsyncs, then `os.replace()`s onto the live path and fsyncs the parent directory. A crash mid-save can no longer corrupt the only copy of the index. Also moved the `_dirty` short-circuit inside `_write_lock` so a concurrent `add` cannot have its dirty flag silently cleared.
42
+ - **Atomic `rebuild_index`** — builds the new index at a sibling `.rebuild` path and atomically swaps it onto the live path; the old index remains the canonical copy until the swap succeeds.
43
+ - **Atomic encrypted save** — `encrypt_file` / `decrypt_file` now write to a sibling `.tmp`, fsync, set mode `0o600`, then `os.replace()`. `encrypt_index_file` only unlinks the plaintext after the encrypted output is durably on disk. A torn write can no longer leave the index unrecoverable.
44
+ - **`VectorDB`-level `RLock`** — a single re-entrant lock now serializes the `_collections` cache (no more check-then-insert TOCTOU on `collection()`) and is shared with every `CatalogManager` so all `with self.conn:` blocks across collections cannot interleave on the shared `sqlite3.Connection`. Reads remain lock-free at the SQLite level via WAL.
45
+ - **`AsyncVectorDB.close` drains** — switched from `executor.shutdown(wait=False)` to `wait=True` so in-flight pool tasks finish their cursors before the SQLite connection is closed. Pending (not-yet-started) work is still cancelled.
46
+ - **`set_parent` cycle check is transactional** — descendant lookup and parent UPDATE now run inside the same `with self._lock, self.conn:` block, closing a TOCTOU window where a concurrent edge could form a cycle.
47
+ - **Cluster persistence** — `_ensure_cluster_table`, `save_cluster_state`, `delete_cluster_state` now use `with self._lock, self.conn:` instead of bare `conn.commit()`; an exception during the execute is properly rolled back.
48
+ - **`add_documents` ID recovery is correct under upsert** — replaced the `last_insert_rowid()` arithmetic (which silently returned wrong IDs for batches mixing explicit and `None` IDs because UPSERTs do not advance the auto-increment counter) with a single `INSERT … RETURNING id` for the auto-ID rows. Explicit-ID rows still take the upsert path.
49
+ - **`delete_collection` closes cached indexes first** — any `VectorCollection` instances cached for the deleted name have their `UsearchIndex` closed before the file is unlinked, so a stale mmap view cannot race the unlink.
50
+
51
+ ### Changed
52
+
53
+ - **`upsert_fts_rows` / `delete_fts_rows` are now `_upsert_fts_rows` / `_delete_fts_rows`** (private). The FTS shadow table must be updated inside the same transaction as the main table or it can desync on crash; the rename signals the contract.
54
+ - **`get_legacy_vectors`, `drop_legacy_vec_table`** now validate the supplied table name via `_validate_table_name` before interpolating into SQL.
55
+
56
+ ### Added
57
+
58
+ - **Declared `python-dotenv` dependency** — `simplevecdb.config` already imported and called `load_dotenv` at package import; the missing dependency would `ImportError` on a clean install of the base package without optional extras.
59
+
60
+ ### Fixed (correctness & quality)
61
+
62
+ - **RRF deduplication keys by document ID, not text** — `hybrid_search` previously deduped by `doc.page_content`, silently merging two distinct documents that happened to share text into one inflated-score result.
63
+ - **NaN/Inf guard at insert** — `add_texts` and `add_texts_streaming` reject non-finite vectors instead of feeding them to HNSW, which would produce undefined neighbours and could corrupt the graph.
64
+ - **`normalize_l2` handles subnormals** — replaced the exact `norm == 0` compare with a `< 1e-12` check (matching the existing usearch_index guard); subnormal floats no longer produce wildly large normalized vectors.
65
+ - **Silhouette score samples on large collections** — `silhouette_score` is O(n²); now caps the evaluation sample at `SILHOUETTE_MAX_SAMPLE = 10_000`. Large collections no longer OOM.
66
+ - **MMR maintains the selected matrix incrementally** — replaced per-iteration `np.stack(selected_embs)` with `np.vstack` of a running matrix. O(k²·d) wasted allocations dropped to O(k·d).
67
+ - **`_parse_bool_env` treats `KEY=` as unset** — empty strings now fall through to the default; previously they were truthy because `"".strip()` is not in the falsey set.
68
+ - **LangChain async methods use `asyncio.to_thread`** — `aadd_texts` / `asimilarity_search` / `amax_marginal_relevance_search` no longer block the event loop.
69
+ - **LlamaIndex `delete()` survives a process restart** — node IDs are persisted into document metadata under `_simplevecdb_node_id`; `delete()` falls back to a metadata query when the in-memory `_id_map` is empty.
70
+ - **LlamaIndex query results carry stable node IDs** — replaced `str(hash(page_content))` (process-randomized, collision-prone) with the persisted `_simplevecdb_node_id`.
71
+ - **`AsyncVectorDB.collection` accepts `store_embeddings`** — async callers can now enable embedding storage (required for `rebuild_index()`); previously they had no way to set it.
72
+
73
+ ### Security
74
+
75
+ - **API key comparison uses `hmac.compare_digest`** — the prior `token not in allowed_keys` short-circuit leaked key prefixes via response time.
76
+ - **SQLCipher PRAGMA key always uses the `x'hex'` form** — every key path now goes through `_normalize_key` first, eliminating string interpolation of user-supplied passphrase characters into a quoted PRAGMA argument.
77
+ - **`is_database_encrypted` rejects zero-byte files** — previously a missing/empty DB looked like an unencrypted DB because `sqlite3.connect` would create a fresh one.
78
+
79
+ ### Changed (tooling)
80
+
81
+ - **Ruff and mypy targets aligned with `requires-python>=3.10`** — both were `py312`, hiding 3.10/3.11 incompatibilities. Cleaned three resulting `F401` unused-import warnings (`signal` in models.py, `_batched` and `constants` re-imports).
82
+ - **Pre-commit version-sync hook** — `__init__.py` derives `__version__` dynamically via `importlib.metadata`, so `check_version_sync.py` was failing on every commit looking for a literal `__version__ = "x.y.z"` line that does not exist. The hook now validates only `pyproject.toml`'s version field. `bump_version.py` similarly stops trying to rewrite `__init__.py` and uses an anchored regex to update only the canonical version field.
83
+
84
+ ### Security (2.6.0 final)
85
+
86
+ - **Per-DB random PBKDF2 salt** — encrypted databases and index files now generate a random 16-byte salt at creation time, written to a `<resource>.salt` sidecar with mode `0o600`. The previous fixed `b"simplevecdb-sqlcipher-key"` salt let an attacker precompute one rainbow table that broke every simplevecdb installation with the same passphrase. Pre-2.6.0 encrypted resources keep working unchanged: when no sidecar exists, the loader falls back to the legacy fixed salt automatically.
87
+ - **HuggingFace `repo_id` allowlist + `trust_remote_code=False`** — the embeddings server validates model names against a strict regex (`namespace/name` with `[A-Za-z0-9_.-]` only) before passing them to `snapshot_download` / `SentenceTransformer`, blocking path traversal and local-filesystem inputs. `SentenceTransformer` is constructed with `trust_remote_code=False` so a malicious model card cannot trigger arbitrary downloaded Python on load.
88
+ - **CORS is opt-in** — the server no longer adds CORS middleware unless `EMBEDDING_SERVER_CORS_ORIGINS` is set. When the operator does set wildcard origins (`["*"]`), `allow_credentials` is forced off so the spec-violating wildcard-with-credentials combo can't be produced.
89
+
90
+ ### Migration helpers
91
+
92
+ - **`SimpleVecDBLlamaStore.migrate_node_id_metadata()`** — backfills `_simplevecdb_node_id` for documents inserted before 2.6.0. Pre-2.6.0 versions did not persist the LlamaIndex node_id into metadata, so `delete()` could not find the right row after a process restart. Idempotent — already-stamped rows are skipped.
93
+
94
+ ### Added (hygiene & polish)
95
+
96
+ - **`ClusterResult` and `ClusterTagCallback` exported from `simplevecdb`** — they were return/argument types of public methods but had no public import path; users had to reach into `simplevecdb.types`.
97
+ - **`NullHandler` attached to the package's root logger** at import time, per the Python logging HOWTO. Idempotent — duplicate calls do not stack handlers.
98
+ - **`SimpleVecDBLlamaStore.delete_nodes` raises `NotImplementedError`** when called with `filters`, instead of silently dropping the filter portion and pretending the deletion succeeded.
99
+ - **Recursive CTE depth bound as a parameter** in `get_descendants` / `get_ancestors`. The previous f-string interpolation was safe because of the `int()` coercion, but passing the depth as a bound parameter removes the risk of a future refactor turning it into an injection vector.
100
+ - **`Config.from_env()` documented** as returning the import-time-frozen instance; setting env vars after import does not refresh.
101
+ - **`ModelRegistry(allow_unlisted=...)` defaults to `False`** to match the secure-by-default config setting; programmatic instantiations no longer get an open registry by accident.
102
+ - **`/v1/usage` returns aggregated totals when auth is disabled** instead of leaking the per-IP buckets to anyone who hits the endpoint.
103
+ - **Server validates `EMBEDDING_SERVER_MAX_REQUEST_ITEMS <= _MAX_ENCODE_BATCH` at startup** so an out-of-range env var fails fast at boot rather than per request.
104
+ - **`pyproject.toml` gains `[project.urls]`, `classifiers`, and `keywords`** for a useful PyPI listing.
105
+ - **`.bandit` documents the B104 skip** and warns that any future `0.0.0.0` binding requires removing the skip.
106
+ - **Encrypted file format now carries a 3-byte header** (`'SV' + version`) so future format changes are detectable. `decrypt_file` accepts both the new v1 format and the v0 (pre-2.6.0) format, so existing encrypted indexes still load without re-encryption.
107
+
108
+ ### Fixed (review pass 2)
109
+
110
+ - **NaN/Inf rejection no longer leaves orphan catalog rows** — `add_texts` and `_process_streaming_batch` now validate vectors *before* the SQLite insert. Previously the catalog row committed first and a non-finite vector then raised, leaving rows visible via `get_documents_by_ids` but unreachable through similarity search.
111
+ - **`VectorCollection.__repr__` no longer issues SQL** — the previous `count()` call would raise `ProgrammingError` after `close()`, breaking debuggers and exception formatters that auto-stringify objects. The 2.6.0 fix only covered `VectorDB.__repr__`.
112
+ - **`EMBEDDING_SERVER_MAX_REQUEST_ITEMS` validation runs at module import** — the guard was previously inside `run_server()` and was bypassed under any non-CLI ASGI deployment (gunicorn, programmatic uvicorn).
113
+ - **LlamaIndex empty-`node_id` path is atomic** — `SimpleVecDBLlamaStore.add` now generates a UUID for nodes that arrive without a `node_id` and stamps it into metadata *before* the row insert, so the metadata commit is in the same SQLite transaction as the catalog row. Previously a separate `UPDATE` followed `add_texts`; a crash in the gap left rows un-stampable and cross-restart `delete()` silently no-op'd.
114
+ - **Catalog read paths serialize on `self._lock`** — `get_documents_by_ids`, `get_embeddings_by_ids`, `get_documents_and_embeddings_by_ids`, `find_ids_by_texts`, `find_ids_by_filter`, `keyword_search`, `count`, `get_all_docs_with_text`, `check_legacy_sqlite_vec`, `get_legacy_vectors`, `get_children`, `get_parent`, `get_descendants`, `get_ancestors`, `load_cluster_state`, `list_cluster_states`, and `VectorDB.list_collections` now acquire the connection-level lock around `conn.execute`. `sqlite3.Connection` is not safe for concurrent statement execution from multiple threads even under WAL.
115
+ - **`rebuild_index` is fully serialized** — the entire fetch + build + swap now runs inside `with self._lock:` so concurrent `add` / `delete` cannot mutate the catalog mid-rebuild and produce a stale snapshot.
116
+ - **`_ensure_cluster_table` double-checked under lock** — the `_cluster_table_ready` flag is now re-checked inside the lock and set inside the `with` block. Concurrent first-callers no longer both run the DDL.
117
+ - **`utils.file_lock` opens via `os.open(O_CREAT | O_RDWR, 0o600)`** — no truncation of stale lock files from a crashed prior run, restricted permissions on the lock sentinel.
118
+
119
+ ## [2.5.0] - 2026-04-07
120
+
121
+ ### Added
122
+
123
+ - **`delete_collection(name)`** — drop a collection's SQLite tables, FTS index, and usearch file in one call. Available on both `VectorDB` and `AsyncVectorDB`.
124
+ - **`store_embeddings` parameter** on `collection()` — opt into storing embedding BLOBs in SQLite (default `False`). Leaving it off saves ~2x storage; MMR transparently fetches vectors from the usearch index when BLOBs are absent.
125
+ - **`async_retry_on_lock` decorator** — async variant of `retry_on_lock` using `asyncio.sleep` instead of `time.sleep`, avoiding executor thread blocking.
126
+ - **`file_lock` context manager** — advisory cross-process file locking (`fcntl`/`msvcrt`) for usearch index files. Prevents corruption from concurrent processes.
127
+ - **`__repr__`** on `VectorDB`, `VectorCollection`, `AsyncVectorDB`, `AsyncVectorCollection` for debuggable string representations.
128
+ - **FLOAT16 quantization** fully implemented in `serialize()`/`deserialize()` — was previously defined in the enum but raised `ValueError` at runtime.
129
+ - **Pagination** on `get_documents(limit=, offset=)` and catalog methods (`find_ids_by_filter`, `find_ids_by_texts`) — previously returned unbounded result sets.
130
+ - **Embeddings server enhancements:**
131
+ - Graceful shutdown with SIGTERM/SIGINT draining (10s timeout)
132
+ - CORS middleware with configurable origins for browser-based clients
133
+ - Model warm-up on startup (skip with `--no-warmup`)
134
+ - Input validation: rejects empty strings (422) and texts exceeding 100k chars (413)
135
+ - Proper `argparse` CLI with `--host`, `--port`, `--no-warmup`, `--help`
136
+ - Startup banner logging config summary (host, port, model, auth, rate limits)
137
+ - Nested token array normalization (`list[list[int]]` input format)
138
+ - Async executor offload for `embed_texts` (non-blocking event loop)
139
+ - OpenAPI version synced from package metadata
140
+ - Module `__init__.py` exports (`embed_texts`, `get_embedder`, `load_model`, `app`, `run_server`)
141
+
142
+ ### Fixed
143
+
144
+ - **`delete_by_ids` ordering** — SQLite deletion now happens first (transactional, can rollback), then usearch. Previously usearch removed first, leaving orphaned catalog entries on SQLite failure.
145
+ - **`_matches_filter` string semantics** — now uses exact equality, consistent with SQL `build_filter_clause`. Previously it used a substring match (`value in str(meta_value)`).
146
+ - **`list_collections`** — scans `sqlite_master` for persisted collection tables instead of returning only session-cached names. Works across reopened databases.
147
+ - **WAL mode for encrypted databases** — `PRAGMA journal_mode=WAL` and `PRAGMA synchronous=NORMAL` now set for SQLCipher connections (was only set for unencrypted).
148
+ - **`collection()` cache key** — includes `distance_strategy` and `quantization` in cache key (sync version). Previously cached by name only, silently ignoring differing params on cache hit.
149
+ - **`_ensure_fts_table`** — retries up to 3 times on transient "database is locked" errors instead of permanently disabling FTS on first failure.
150
+ - **Connection health check** — `SELECT 1` probe after connection creation; raises `RuntimeError` immediately on corrupt databases.
151
+
152
+ ### Improved
153
+
154
+ - **Usearch batch operations** — `add()`, `remove()`, and `get()` now use batch usearch APIs instead of per-key loops. Significant speedup for large operations.
155
+ - **Filtered search iterative deepening** — replaces fixed `k*3` overfetch with adaptive doubling (up to `k*30`). Highly selective filters now reliably return `k` results.
156
+ - **Memory-map heuristic** — uses file size threshold (50MB) instead of inaccurate `file_size // 100` vector count estimate for mmap vs load decision.
157
+ - **Apple chip detection** — uses `platform.processor()` instead of spawning a `sysctl` subprocess.
158
+
159
+ ### Removed
160
+
161
+ - **Duplicate `_dim` property** — removed in favor of the public `dim` property.
162
+
163
+ ### Breaking Changes
164
+
165
+ - String metadata filters now use exact equality (was substring match).
166
+ - `store_embeddings` defaults to `False` — `rebuild_index()` requires `store_embeddings=True` or re-adding documents.
167
+
8
168
  ## [2.4.0] - 2026-03-22
9
169
 
10
170
  ### Added
@@ -1,14 +1,33 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: simplevecdb
3
- Version: 2.4.0
3
+ Version: 2.6.0
4
4
  Summary: Dead-simple local vector database powered by usearch HNSW.
5
+ Project-URL: Homepage, https://github.com/CoderDayton/simplevecdb
6
+ Project-URL: Repository, https://github.com/CoderDayton/simplevecdb
7
+ Project-URL: Issues, https://github.com/CoderDayton/simplevecdb/issues
8
+ Project-URL: Changelog, https://github.com/CoderDayton/simplevecdb/blob/main/CHANGELOG.md
5
9
  Author-email: Dayton Dunbar <coderdayton14@gmail.com>
6
10
  License: MIT
7
11
  License-File: LICENSE
12
+ Keywords: embeddings,hnsw,langchain,llamaindex,rag,similarity-search,sqlite,usearch,vector-database,vectordb
13
+ Classifier: Development Status :: 5 - Production/Stable
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Operating System :: OS Independent
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Programming Language :: Python :: 3.13
22
+ Classifier: Topic :: Database
23
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
24
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
25
+ Classifier: Typing :: Typed
8
26
  Requires-Python: >=3.10
9
27
  Requires-Dist: cryptography>=41.0
10
28
  Requires-Dist: hdbscan>=0.8.33
11
29
  Requires-Dist: numpy>=1.24
30
+ Requires-Dist: python-dotenv>=1.0
12
31
  Requires-Dist: scikit-learn>=1.3.0
13
32
  Requires-Dist: sqlcipher3-binary>=0.5.0
14
33
  Requires-Dist: sqlite-vec>=0.1.6
@@ -169,10 +188,13 @@ hybrid = collection.hybrid_search("powerhouse cell", k=2)
169
188
  **Optional: Run embeddings server (OpenAI-compatible)**
170
189
 
171
190
  ```bash
172
- simplevecdb-server --port 8000
191
+ simplevecdb-server --port 8000 # Default model, auto warm-up
192
+ simplevecdb-server --host 0.0.0.0 --port 9000 # Bind to all interfaces
193
+ simplevecdb-server --no-warmup # Skip model preload on startup
194
+ simplevecdb-server --help # Show all options
173
195
  ```
174
196
 
175
- See [Setup Guide](ENV_SETUP.md) for configuration: model registry, rate limits, API keys, CUDA optimization.
197
+ See [Setup Guide](ENV_SETUP.md) for configuration: model registry, rate limits, API keys, CORS, CUDA optimization.
176
198
 
177
199
  ### Option 3: With LangChain or LlamaIndex
178
200
 
@@ -331,6 +353,10 @@ docs = collection.get_documents(filter_dict={"category": "tech"})
331
353
  for doc_id, text, metadata in docs:
332
354
  print(f"[{doc_id}] {text[:50]}...")
333
355
 
356
+ # Paginated access (v2.5+)
357
+ page1 = collection.get_documents(limit=100)
358
+ page2 = collection.get_documents(limit=100, offset=100)
359
+
334
360
  # Fetch stored embeddings
335
361
  embeddings = collection.get_embeddings_by_ids([1, 2, 3])
336
362
 
@@ -342,6 +368,9 @@ collection.update_metadata([
342
368
 
343
369
  # Quick stats
344
370
  print(f"Collection has {collection.count()} documents, dim={collection.dim}")
371
+
372
+ # Delete an entire collection (v2.5+)
373
+ db.delete_collection("old_data")
345
374
  ```
346
375
 
347
376
  ### Vector Clustering (v2.2+)
@@ -384,6 +413,10 @@ Supports K-means, MiniBatch K-means, and HDBSCAN. See [Clustering Guide](https:/
384
413
  | **Cluster Persistence** | ✅ | Save/load cluster centroids for fast assignment (v2.2+) |
385
414
  | **Public Catalog API** | ✅ | `get_documents`, `get_embeddings_by_ids`, `update_metadata` (v2.4+) |
386
415
  | **Executor Injection** | ✅ | Share thread pool across async instances for ONNX safety (v2.4+) |
416
+ | **Collection Management** | ✅ | `delete_collection()`, paginated `get_documents(limit=, offset=)` (v2.5+) |
417
+ | **Cross-Process Safety** | ✅ | Advisory file locking on usearch index files (v2.5+) |
418
+ | **FLOAT16 Quantization** | ✅ | Half-precision storage with 2x compression (v2.5+) |
419
+ | **Embeddings Server** | ✅ | CORS, graceful shutdown, input validation, model warm-up (v2.5+) |
387
420
 
388
421
  ## Performance Benchmarks
389
422
 
@@ -456,6 +489,9 @@ pip install torch --index-url https://download.pytorch.org/whl/cu118
456
489
  - [x] Vector clustering and auto-tagging (v2.2)
457
490
  - [x] Public catalog API for document management (v2.4)
458
491
  - [x] Async executor injection for thread-safe sharing (v2.4)
492
+ - [x] Collection management: `delete_collection()`, pagination (v2.5)
493
+ - [x] Cross-process file locking and connection health checks (v2.5)
494
+ - [x] Embeddings server hardening: CORS, graceful shutdown, input validation (v2.5)
459
495
  - [ ] Incremental clustering (online learning)
460
496
  - [ ] Cluster visualization exports
461
497
 
@@ -140,10 +140,13 @@ hybrid = collection.hybrid_search("powerhouse cell", k=2)
140
140
  **Optional: Run embeddings server (OpenAI-compatible)**
141
141
 
142
142
  ```bash
143
- simplevecdb-server --port 8000
143
+ simplevecdb-server --port 8000 # Default model, auto warm-up
144
+ simplevecdb-server --host 0.0.0.0 --port 9000 # Bind to all interfaces
145
+ simplevecdb-server --no-warmup # Skip model preload on startup
146
+ simplevecdb-server --help # Show all options
144
147
  ```
145
148
 
146
- See [Setup Guide](ENV_SETUP.md) for configuration: model registry, rate limits, API keys, CUDA optimization.
149
+ See [Setup Guide](ENV_SETUP.md) for configuration: model registry, rate limits, API keys, CORS, CUDA optimization.
147
150
 
148
151
  ### Option 3: With LangChain or LlamaIndex
149
152
 
@@ -302,6 +305,10 @@ docs = collection.get_documents(filter_dict={"category": "tech"})
302
305
  for doc_id, text, metadata in docs:
303
306
  print(f"[{doc_id}] {text[:50]}...")
304
307
 
308
+ # Paginated access (v2.5+)
309
+ page1 = collection.get_documents(limit=100)
310
+ page2 = collection.get_documents(limit=100, offset=100)
311
+
305
312
  # Fetch stored embeddings
306
313
  embeddings = collection.get_embeddings_by_ids([1, 2, 3])
307
314
 
@@ -313,6 +320,9 @@ collection.update_metadata([
313
320
 
314
321
  # Quick stats
315
322
  print(f"Collection has {collection.count()} documents, dim={collection.dim}")
323
+
324
+ # Delete an entire collection (v2.5+)
325
+ db.delete_collection("old_data")
316
326
  ```
317
327
 
318
328
  ### Vector Clustering (v2.2+)
@@ -355,6 +365,10 @@ Supports K-means, MiniBatch K-means, and HDBSCAN. See [Clustering Guide](https:/
355
365
  | **Cluster Persistence** | ✅ | Save/load cluster centroids for fast assignment (v2.2+) |
356
366
  | **Public Catalog API** | ✅ | `get_documents`, `get_embeddings_by_ids`, `update_metadata` (v2.4+) |
357
367
  | **Executor Injection** | ✅ | Share thread pool across async instances for ONNX safety (v2.4+) |
368
+ | **Collection Management** | ✅ | `delete_collection()`, paginated `get_documents(limit=, offset=)` (v2.5+) |
369
+ | **Cross-Process Safety** | ✅ | Advisory file locking on usearch index files (v2.5+) |
370
+ | **FLOAT16 Quantization** | ✅ | Half-precision storage with 2x compression (v2.5+) |
371
+ | **Embeddings Server** | ✅ | CORS, graceful shutdown, input validation, model warm-up (v2.5+) |
358
372
 
359
373
  ## Performance Benchmarks
360
374
 
@@ -427,6 +441,9 @@ pip install torch --index-url https://download.pytorch.org/whl/cu118
427
441
  - [x] Vector clustering and auto-tagging (v2.2)
428
442
  - [x] Public catalog API for document management (v2.4)
429
443
  - [x] Async executor injection for thread-safe sharing (v2.4)
444
+ - [x] Collection management: `delete_collection()`, pagination (v2.5)
445
+ - [x] Cross-process file locking and connection health checks (v2.5)
446
+ - [x] Embeddings server hardening: CORS, graceful shutdown, input validation (v2.5)
430
447
  - [ ] Incremental clustering (online learning)
431
448
  - [ ] Cluster visualization exports
432
449
 
@@ -5,6 +5,166 @@ All notable changes to SimpleVecDB will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [2.6.0] - 2026-05-06
9
+
10
+ ### Review pass 3 — final correctness/security pass before tag
11
+
12
+ #### Critical fixes
13
+
14
+ - **`UsearchIndex.save` lost-update race** — the `_dirty = False` clear was outside the `file_lock` window, so a concurrent `add()` between `os.replace()` and the dirty-flag clear could be silently overwritten. Moved inside `file_lock`.
15
+ - **`UsearchIndex.save` data fsync on `O_RDONLY` fd** — `fsync(2)` on a read-only file descriptor has implementation-defined behavior on Linux (some kernels return `EBADF`, swallowed by the warning branch). Switched to `O_RDWR` so the data fsync is guaranteed.
16
+ - **`_rebuild_index_locked` bare `conn.execute`** — replaced the bare `self.conn.execute("SELECT id FROM ...")` with the new `CatalogManager.list_all_ids()`, which routes the read through `self._lock` instead of relying on RLock re-entrancy from a single caller.
17
+ - **PBKDF2 iteration bump** — raised from 480,000 to 600,000 iterations to match the OWASP 2024 minimum for PBKDF2-HMAC-SHA256.
18
+ - **AES-GCM AAD now binds the v1 header** — `encrypt_file` / `decrypt_file` pass the magic+version bytes as `associated_data`, so any tampering with the header (including downgrade attempts) fails authentication instead of silently succeeding.
19
+ - **Bounded normalize-key cache** — `_NORMALIZE_KEY_CACHE` is now an LRU capped at 64 entries, serialized by a `threading.Lock`. Long-running multi-tenant processes no longer leak derived key material indefinitely.
20
+ - **LlamaIndex `delete()` no longer swallows `sqlite3.DatabaseError`** — narrowed the exception in the metadata-fallback path to `(TypeError, NotImplementedError)`. A locked DB, closed connection, or schema mismatch now propagates to the caller instead of becoming a silent no-op.
21
+ - **Hybrid-search RRF rank symmetry** — vector candidates now use the original HNSW position as their RRF rank (via `enumerate(vector_keys_list)`), matching how keyword candidates use raw BM25 position. Previously, a metadata filter that rejected vector candidates inflated surviving vector scores relative to keyword scores, corrupting result ordering.
22
+ - **`add_documents` FTS sentinel guard** — added a defense-in-depth check that raises `RuntimeError` if any `-1` sentinel rowid remains in `real_ids` before the FTS upsert. Prevents a hypothetical retry-loop interaction from corrupting the FTS index with rowid `-1`.
23
+
24
+ #### Important fixes
25
+
26
+ - **`delete_collection` TOCTOU** — moved the `list_collections()` existence check inside the `with self._lock:` block so two concurrent `delete_collection(name)` calls cannot both pass the check; the second now sees a clean `KeyError` instead of a SQLite error.
27
+ - **Salt sidecar `O_EXCL` guard** — `_resolve_salt(create_if_missing=True)` now creates the sidecar with `O_CREAT | O_EXCL`. If two processes race, the loser reads the winner's salt; if a sidecar already exists out-of-band, it is preserved instead of being clobbered (which would have rendered an existing DB unreadable).
28
+ - **`encrypt_index_file` v0→v1 sidecar migration** — re-encrypting a legacy v0 blob (no sidecar) now creates a fresh sidecar, completing the migration path to per-DB salts. Previously, `is_first_encryption` was keyed on `.enc` presence rather than `.salt` presence.
29
+ - **LlamaIndex legacy-collection warning** — `SimpleVecDBLlamaStore.__init__` now emits a one-shot `DeprecationWarning` when it detects rows lacking `_simplevecdb_node_id`, telling the operator to call `migrate_node_id_metadata()` and noting the inherent limitation that pre-2.6 rows can only be stamped with `str(doc_id)` (the original LlamaIndex node ids were never persisted).
30
+ - **INT8 quantization range break softened** — instead of raising `ValueError` on `max(|x|) > 1.0 + 1e-5`, the strategy now emits a one-shot `DeprecationWarning` and clips. Restores backwards compatibility for callers that relied on the prior silent-clip behavior.
31
+ - **`scripts/check_version_sync.py` now validates `CHANGELOG.md`** — the hook fails if the latest CHANGELOG entry header does not match `pyproject.toml`'s version, preventing a release from shipping with a stale changelog.
32
+
33
+ #### Test coverage added (review pass 3 gaps)
34
+
35
+ - `tests/unit/test_v26_review_pass_3.py` — covers parent-directory fsync on save, `.tmp` cleanup on save failure, `db._lock is catalog._lock` shared-RLock identity, adversarial inputs to `_validate_table_name`, hybrid-search RRF rank symmetry under filter, and same-text-different-id deduplication.
36
+ - `tests/unit/test_v26_encryption_review_pass_3.py` — covers nonce uniqueness across saves, wrong-key decrypt does not create the output file, AAD-bound header tampering fails authentication, salt sidecar O_EXCL preservation, and v0→v1 migration round-trip.
37
+ - `tests/unit/integrations/test_llamaindex_review_pass_3.py` — covers the `add → query` round-trip preserving the original LlamaIndex node id, end-to-end migration-then-delete on v2.5-shaped data, the legacy-collection `DeprecationWarning` at `__init__` time, and that `sqlite3.DatabaseError` from the metadata-fallback path now propagates instead of being swallowed.
38
+
39
+ ### Fixed (concurrency & durability)
40
+
41
+ - **Atomic `UsearchIndex.save`** — now writes to a sibling `.tmp`, fsyncs, then `os.replace()`s onto the live path and fsyncs the parent directory. A crash mid-save can no longer corrupt the only copy of the index. Also moved the `_dirty` short-circuit inside `_write_lock` so a concurrent `add` cannot have its dirty flag silently cleared.
42
+ - **Atomic `rebuild_index`** — builds the new index at a sibling `.rebuild` path and atomically swaps it onto the live path; the old index remains the canonical copy until the swap succeeds.
43
+ - **Atomic encrypted save** — `encrypt_file` / `decrypt_file` now write to a sibling `.tmp`, fsync, set mode `0o600`, then `os.replace()`. `encrypt_index_file` only unlinks the plaintext after the encrypted output is durably on disk. A torn write can no longer leave the index unrecoverable.
44
+ - **`VectorDB`-level `RLock`** — a single re-entrant lock now serializes the `_collections` cache (no more check-then-insert TOCTOU on `collection()`) and is shared with every `CatalogManager` so all `with self.conn:` blocks across collections cannot interleave on the shared `sqlite3.Connection`. Reads remain lock-free at the SQLite level via WAL.
45
+ - **`AsyncVectorDB.close` drains** — switched from `executor.shutdown(wait=False)` to `wait=True` so in-flight pool tasks finish their cursors before the SQLite connection is closed. Pending (not-yet-started) work is still cancelled.
46
+ - **`set_parent` cycle check is transactional** — descendant lookup and parent UPDATE now run inside the same `with self._lock, self.conn:` block, closing a TOCTOU window where a concurrent edge could form a cycle.
47
+ - **Cluster persistence** — `_ensure_cluster_table`, `save_cluster_state`, `delete_cluster_state` now use `with self._lock, self.conn:` instead of bare `conn.commit()`; an exception during the execute is properly rolled back.
48
+ - **`add_documents` ID recovery is correct under upsert** — replaced the `last_insert_rowid()` arithmetic (which silently returned wrong IDs for batches mixing explicit and `None` IDs because UPSERTs do not advance the auto-increment counter) with a single `INSERT … RETURNING id` for the auto-ID rows. Explicit-ID rows still take the upsert path.
49
+ - **`delete_collection` closes cached indexes first** — any `VectorCollection` instances cached for the deleted name have their `UsearchIndex` closed before the file is unlinked, so a stale mmap view cannot race the unlink.
50
+
51
+ ### Changed
52
+
53
+ - **`upsert_fts_rows` / `delete_fts_rows` are now `_upsert_fts_rows` / `_delete_fts_rows`** (private). The FTS shadow table must be updated inside the same transaction as the main table or it can desync on crash; the rename signals the contract.
54
+ - **`get_legacy_vectors`, `drop_legacy_vec_table`** now validate the supplied table name via `_validate_table_name` before interpolating into SQL.
55
+
56
+ ### Added
57
+
58
+ - **Declared `python-dotenv` dependency** — `simplevecdb.config` already imported and called `load_dotenv` at package import; because the dependency was undeclared, a clean install of the base package without optional extras raised `ImportError` at import time.
59
+
60
+ ### Fixed (correctness & quality)
61
+
62
+ - **RRF deduplication keys by document ID, not text** — `hybrid_search` previously deduped by `doc.page_content`, silently merging two distinct documents that happened to share text into one inflated-score result.
63
+ - **NaN/Inf guard at insert** — `add_texts` and `add_texts_streaming` reject non-finite vectors instead of feeding them to HNSW, which would produce undefined neighbours and could corrupt the graph.
64
+ - **`normalize_l2` handles subnormals** — replaced the exact `norm == 0` compare with a `< 1e-12` check (matching the existing usearch_index guard); subnormal floats no longer produce wildly large normalized vectors.
65
+ - **Silhouette score samples on large collections** — `silhouette_score` is O(n²); now caps the evaluation sample at `SILHOUETTE_MAX_SAMPLE = 10_000`. Large collections no longer OOM.
66
+ - **MMR maintains the selected matrix incrementally** — replaced per-iteration `np.stack(selected_embs)` with `np.vstack` of a running matrix. O(k²·d) wasted allocations dropped to O(k·d).
67
+ - **`_parse_bool_env` treats `KEY=` as unset** — empty strings now fall through to the default; previously they were truthy because `"".strip()` is not in the falsey set.
68
+ - **LangChain async methods use `asyncio.to_thread`** — `aadd_texts` / `asimilarity_search` / `amax_marginal_relevance_search` no longer block the event loop.
69
+ - **LlamaIndex `delete()` survives a process restart** — node IDs are persisted into document metadata under `_simplevecdb_node_id`; `delete()` falls back to a metadata query when the in-memory `_id_map` is empty.
70
+ - **LlamaIndex query results carry stable node IDs** — replaced `str(hash(page_content))` (process-randomized, collision-prone) with the persisted `_simplevecdb_node_id`.
71
+ - **`AsyncVectorDB.collection` accepts `store_embeddings`** — async callers can now enable embedding storage (required for `rebuild_index()`); previously they had no way to set it.
72
+
73
+ ### Security
74
+
75
+ - **API key comparison uses `hmac.compare_digest`** — the prior `token not in allowed_keys` short-circuit leaked key prefixes via response time.
76
+ - **SQLCipher PRAGMA key always uses the `x'hex'` form** — every key path now goes through `_normalize_key` first, eliminating string interpolation of user-supplied passphrase characters into a quoted PRAGMA argument.
77
+ - **`is_database_encrypted` rejects zero-byte files** — previously a missing/empty DB looked like an unencrypted DB because `sqlite3.connect` would create a fresh one.
78
+
79
+ ### Changed (tooling)
80
+
81
+ - **Ruff and mypy targets aligned with `requires-python>=3.10`** — both were `py312`, hiding 3.10/3.11 incompatibilities. Cleaned three resulting `F401` unused-import warnings (`signal` in models.py, `_batched` and `constants` re-imports).
82
+ - **Pre-commit version-sync hook** — `__init__.py` derives `__version__` dynamically via `importlib.metadata`, so `check_version_sync.py` was failing on every commit looking for a literal `__version__ = "x.y.z"` line that does not exist. The hook now validates only `pyproject.toml`'s version field. `bump_version.py` similarly stops trying to rewrite `__init__.py` and uses an anchored regex to update only the canonical version field.
83
+
84
+ ### Security (2.6.0 final)
85
+
86
+ - **Per-DB random PBKDF2 salt** — encrypted databases and index files now generate a random 16-byte salt at creation time, written to a `<resource>.salt` sidecar with mode `0o600`. The previous fixed `b"simplevecdb-sqlcipher-key"` salt let an attacker precompute one rainbow table that broke every simplevecdb installation with the same passphrase. Pre-2.6.0 encrypted resources keep working unchanged: when no sidecar exists, the loader falls back to the legacy fixed salt automatically.
87
+ - **HuggingFace `repo_id` allowlist + `trust_remote_code=False`** — the embeddings server validates model names against a strict regex (`namespace/name` with `[A-Za-z0-9_.-]` only) before passing them to `snapshot_download` / `SentenceTransformer`, blocking path traversal and local-filesystem inputs. `SentenceTransformer` is constructed with `trust_remote_code=False` so a malicious model card cannot trigger arbitrary downloaded Python on load.
88
+ - **CORS is opt-in** — the server no longer adds CORS middleware unless `EMBEDDING_SERVER_CORS_ORIGINS` is set. When the operator does set wildcard origins (`["*"]`), `allow_credentials` is forced off so the spec-violating wildcard-with-credentials combo can't be produced.
89
+
90
+ ### Migration helpers
91
+
92
+ - **`SimpleVecDBLlamaStore.migrate_node_id_metadata()`** — backfills `_simplevecdb_node_id` for documents inserted before 2.6.0. Pre-2.6.0 versions did not persist the LlamaIndex node_id into metadata, so `delete()` could not find the right row after a process restart. Idempotent — already-stamped rows are skipped.
93
+
94
+ ### Added (hygiene & polish)
95
+
96
+ - **`ClusterResult` and `ClusterTagCallback` exported from `simplevecdb`** — they were return/argument types of public methods but had no public import path; users had to reach into `simplevecdb.types`.
97
+ - **`NullHandler` attached to the package's root logger** at import time, per the Python logging HOWTO. Idempotent — duplicate calls do not stack handlers.
98
+ - **`SimpleVecDBLlamaStore.delete_nodes` raises `NotImplementedError`** when called with `filters`, instead of silently dropping the filter portion and pretending the deletion succeeded.
99
+ - **Recursive CTE depth bound as a parameter** in `get_descendants` / `get_ancestors`. The previous f-string interpolation was safe only because of an `int()` coercion; binding the depth as a SQL parameter means a future refactor that drops the coercion can no longer introduce an injection.
100
+ - **`Config.from_env()` documented** as returning the import-time-frozen instance; setting env vars after import does not refresh.
101
+ - **`ModelRegistry(allow_unlisted=...)` defaults to `False`** to match the secure-by-default config setting; programmatic instantiations no longer get an open registry by accident.
102
+ - **`/v1/usage` returns aggregated totals when auth is disabled** instead of leaking the per-IP buckets to anyone who hits the endpoint.
103
+ - **Server validates `EMBEDDING_SERVER_MAX_REQUEST_ITEMS <= _MAX_ENCODE_BATCH` at startup** so an out-of-range env var fails fast at boot rather than per request.
104
+ - **`pyproject.toml` gains `[project.urls]`, `classifiers`, and `keywords`** for a useful PyPI listing.
105
+ - **`.bandit` documents the B104 skip** and warns that any future `0.0.0.0` binding requires removing the skip.
106
+ - **Encrypted file format now carries a 3-byte header** (`'SV' + version`) so future format changes are detectable. `decrypt_file` accepts both the new v1 format and the v0 (pre-2.6.0) format, so existing encrypted indexes still load without re-encryption.
107
+
108
+ ### Fixed (review pass 2)
109
+
110
+ - **NaN/Inf rejection no longer leaves orphan catalog rows** — `add_texts` and `_process_streaming_batch` now validate vectors *before* the SQLite insert. Previously the catalog row committed first and a non-finite vector then raised, leaving rows visible via `get_documents_by_ids` but unreachable through similarity search.
111
+ - **`VectorCollection.__repr__` no longer issues SQL** — the previous `count()` call would raise `ProgrammingError` after `close()`, breaking debuggers and exception formatters that auto-stringify objects. The earlier 2.6.0 fix only covered `VectorDB.__repr__`.
112
+ - **`EMBEDDING_SERVER_MAX_REQUEST_ITEMS` validation runs at module import** — the guard was previously inside `run_server()` and was bypassed under any non-CLI ASGI deployment (gunicorn, programmatic uvicorn).
113
+ - **LlamaIndex empty-`node_id` path is atomic** — `SimpleVecDBLlamaStore.add` now generates a UUID for nodes that arrive without a `node_id` and stamps it into metadata *before* the row insert, so the metadata commit is in the same SQLite transaction as the catalog row. Previously a separate `UPDATE` followed `add_texts`; a crash in the gap left rows un-stampable and cross-restart `delete()` silently no-op'd.
114
+ - **Catalog read paths serialize on `self._lock`** — `get_documents_by_ids`, `get_embeddings_by_ids`, `get_documents_and_embeddings_by_ids`, `find_ids_by_texts`, `find_ids_by_filter`, `keyword_search`, `count`, `get_all_docs_with_text`, `check_legacy_sqlite_vec`, `get_legacy_vectors`, `get_children`, `get_parent`, `get_descendants`, `get_ancestors`, `load_cluster_state`, `list_cluster_states`, and `VectorDB.list_collections` now acquire the connection-level lock around `conn.execute`. `sqlite3.Connection` is not safe for concurrent statement execution from multiple threads even under WAL.
115
+ - **`rebuild_index` is fully serialized** — the entire fetch + build + swap now runs inside `with self._lock:` so concurrent `add` / `delete` cannot mutate the catalog mid-rebuild and produce a stale snapshot.
116
+ - **`_ensure_cluster_table` double-checked under lock** — the `_cluster_table_ready` flag is now re-checked inside the lock and set inside the `with` block. Concurrent first-callers no longer both run the DDL.
117
+ - **`utils.file_lock` opens via `os.open(O_CREAT | O_RDWR, 0o600)`** — no truncation of stale lock files from a crashed prior run, restricted permissions on the lock sentinel.
118
+
119
+ ## [2.5.0] - 2026-04-07
120
+
121
+ ### Added
122
+
123
+ - **`delete_collection(name)`** — drop a collection's SQLite tables, FTS index, and usearch file in one call. Available on both `VectorDB` and `AsyncVectorDB`.
124
+ - **`store_embeddings` parameter** on `collection()` — opt into storing embedding BLOBs in SQLite (default `False`). Leaving it off reduces storage by roughly 2x; MMR transparently fetches vectors from the usearch index when BLOBs are absent.
125
+ - **`async_retry_on_lock` decorator** — async variant of `retry_on_lock` using `asyncio.sleep` instead of `time.sleep`, avoiding executor thread blocking.
126
+ - **`file_lock` context manager** — advisory cross-process file locking (`fcntl`/`msvcrt`) for usearch index files. Prevents corruption from concurrent processes.
127
+ - **`__repr__`** on `VectorDB`, `VectorCollection`, `AsyncVectorDB`, `AsyncVectorCollection` for debuggable string representations.
128
+ - **FLOAT16 quantization** fully implemented in `serialize()`/`deserialize()` — was previously defined in the enum but raised `ValueError` at runtime.
129
+ - **Pagination** on `get_documents(limit=, offset=)` and catalog methods (`find_ids_by_filter`, `find_ids_by_texts`) — previously returned unbounded result sets.
130
+ - **Embeddings server enhancements:**
131
+ - Graceful shutdown with SIGTERM/SIGINT draining (10s timeout)
132
+ - CORS middleware with configurable origins for browser-based clients
133
+ - Model warm-up on startup (skip with `--no-warmup`)
134
+ - Input validation: rejects empty strings (422) and texts exceeding 100k chars (413)
135
+ - Proper `argparse` CLI with `--host`, `--port`, `--no-warmup`, `--help`
136
+ - Startup banner logging config summary (host, port, model, auth, rate limits)
137
+ - Nested token array normalization (`list[list[int]]` input format)
138
+ - Async executor offload for `embed_texts` (non-blocking event loop)
139
+ - OpenAPI version synced from package metadata
140
+ - Module `__init__.py` exports (`embed_texts`, `get_embedder`, `load_model`, `app`, `run_server`)
141
+
142
+ ### Fixed
143
+
144
+ - **`delete_by_ids` ordering** — SQLite deletion now happens first (transactional, can roll back), then usearch. Previously the usearch removal ran first, leaving orphaned catalog entries if the SQLite deletion then failed.
145
+ - **`_matches_filter` string semantics** — now uses exact equality, consistent with SQL `build_filter_clause`. Was using substring match (`value in str(meta_value)`).
146
+ - **`list_collections`** — scans `sqlite_master` for persisted collection tables instead of returning only session-cached names. Works across reopened databases.
147
+ - **WAL mode for encrypted databases** — `PRAGMA journal_mode=WAL` and `PRAGMA synchronous=NORMAL` now set for SQLCipher connections (was only set for unencrypted).
148
+ - **`collection()` cache key** — includes `distance_strategy` and `quantization` in cache key (sync version). Previously cached by name only, silently ignoring differing params on cache hit.
149
+ - **`_ensure_fts_table`** — retries up to 3 times on transient "database is locked" errors instead of permanently disabling FTS on first failure.
150
+ - **Connection health check** — `SELECT 1` probe after connection creation; raises `RuntimeError` immediately on corrupt databases.
151
+
152
+ ### Improved
153
+
154
+ - **Usearch batch operations** — `add()`, `remove()`, and `get()` now use batch usearch APIs instead of per-key loops. Significant speedup for large operations.
155
+ - **Filtered search iterative deepening** — replaces fixed `k*3` overfetch with adaptive doubling (up to `k*30`). Highly selective filters now reliably return `k` results.
156
+ - **Memory-map heuristic** — uses file size threshold (50MB) instead of inaccurate `file_size // 100` vector count estimate for mmap vs load decision.
157
+ - **Apple chip detection** — uses `platform.processor()` instead of spawning a `sysctl` subprocess.
158
+
159
+ ### Removed
160
+
161
+ - **Duplicate `_dim` property** — removed in favor of the public `dim` property.
162
+
163
+ ### Breaking Changes
164
+
165
+ - String metadata filters now use exact equality (was substring match).
166
+ - `store_embeddings` defaults to `False` — `rebuild_index()` requires `store_embeddings=True` or re-adding documents.
167
+
8
168
  ## [2.4.0] - 2026-03-22
9
169
 
10
170
  ### Added
@@ -80,12 +80,57 @@ db = VectorDB("secure.db", encryption_key=encryption_key)
80
80
  With encryption enabled, files are stored as:
81
81
 
82
82
  ```
83
- mydb.db # SQLCipher encrypted SQLite database
84
- mydb.db.default.usearch.enc # AES-256-GCM encrypted usearch index
83
+ mydb.db # SQLCipher encrypted SQLite database
84
+ mydb.db.salt # 16-byte random salt sidecar (mode 0o600)
85
+ mydb.db.default.usearch.enc # AES-256-GCM encrypted usearch index (v1)
86
+ mydb.db.default.usearch.enc.salt # 16-byte salt sidecar for the index
85
87
  ```
86
88
 
87
89
  When opened, the index is decrypted to memory (or a temp file). On `save()` or `close()`, the index is re-encrypted.
88
90
 
91
+ ### Per-DB random salt (2.6.0+)
92
+
93
+ Each encrypted database and each encrypted index file gets its own
94
+ random 16-byte salt, written to a sibling `.salt` file with mode
95
+ `0o600`. The salt is the second input to PBKDF2-HMAC-SHA256, so two
96
+ databases that share the same passphrase derive **different** keys.
97
+
98
+ The sidecar is created with `O_CREAT | O_EXCL` so two processes opening
99
+ the same fresh database concurrently cannot race to write conflicting
100
+ salts; the loser reads the winner's salt and proceeds. An existing
101
+ sidecar is never overwritten — clobbering it would render the database
102
+ permanently unreadable with the original passphrase.
103
+
104
+ Pre-2.6.0 databases continue to open with a fixed legacy salt when no
105
+ sidecar is present, so existing on-disk data keeps working unchanged.
106
+
107
+ ### v1 index file format (2.6.0+)
108
+
109
+ Index files written by 2.6.0+ start with a 3-byte version header:
110
+
111
+ ```
112
+ magic = b"SV" (2 bytes)
113
+ version = 0x01 (1 byte)
114
+ nonce = 12 bytes
115
+ ciphertext + GCM tag
116
+ ```
117
+
118
+ The header bytes are bound into the AES-GCM **associated_data**, so
119
+ any tampering with the magic or version (including a downgrade attempt
120
+ that strips them) fails authentication on decrypt. Pre-2.6.0 (v0) blobs
121
+ have no header and continue to decrypt successfully — `decrypt_file`
122
+ detects the format automatically.
123
+
124
+ ### Atomic durability
125
+
126
+ `encrypt_file` and `decrypt_file` write to a sibling `.tmp` file,
127
+ `fsync()` the data, set mode `0o600`, then `os.replace()` onto the
128
+ target. The parent directory is also fsynced so the rename itself is
129
+ durable on POSIX. A crash mid-write leaves only the orphan temp file —
130
+ the live target is never torn. `encrypt_index_file` only unlinks the
131
+ plaintext after the encrypted output is durably on disk, so an
132
+ interrupted re-encryption never destroys data.
133
+
89
134
  ## Performance
90
135
 
91
136
  ### Search Operations
@@ -135,10 +180,12 @@ except EncryptionUnavailableError:
135
180
 
136
181
  ## Security Notes
137
182
 
138
- - **SQLCipher** uses AES-256-CBC with HMAC-SHA512 for authentication
139
- - **Index encryption** uses AES-256-GCM with random 96-bit nonces
140
- - **Key derivation** uses PBKDF2-SHA256 with 480,000 iterations (OWASP 2023 recommendation)
141
- - **The encryption key is held in memory** during database usage
183
+ - **SQLCipher** uses AES-256-CBC with HMAC-SHA512 for authentication.
184
+ - **Index encryption** uses AES-256-GCM with random 96-bit nonces (`secrets.token_bytes`); each save generates a fresh nonce.
185
+ - **Key derivation** uses PBKDF2-HMAC-SHA256 with **600,000 iterations** (OWASP 2024 recommendation) and a per-DB random salt.
186
+ - **v1 file format** binds the magic+version header bytes into AES-GCM `associated_data`, defeating header tampering and downgrade attacks.
187
+ - **Derived keys** are cached in a bounded LRU (max 64 entries, serialized by a thread lock) so repeat opens within a process avoid the 600k-iter cost without leaking key material in long-running multi-tenant processes.
188
+ - **The encryption key is held in memory** during database usage.
142
189
 
143
190
  ## API Reference
144
191