simplevecdb 2.3.0__tar.gz → 2.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119) hide show
  1. {simplevecdb-2.3.0/docs → simplevecdb-2.5.0}/CHANGELOG.md +182 -80
  2. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/PKG-INFO +48 -4
  3. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/README.md +47 -3
  4. {simplevecdb-2.3.0 → simplevecdb-2.5.0/docs}/CHANGELOG.md +36 -0
  5. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/pyproject.toml +1 -1
  6. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/src/simplevecdb/__init__.py +12 -2
  7. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/src/simplevecdb/async_core.py +174 -14
  8. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/src/simplevecdb/constants.py +17 -1
  9. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/src/simplevecdb/core.py +218 -30
  10. simplevecdb-2.5.0/src/simplevecdb/embeddings/__init__.py +12 -0
  11. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/src/simplevecdb/embeddings/models.py +33 -5
  12. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/src/simplevecdb/embeddings/server.py +206 -38
  13. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/src/simplevecdb/engine/catalog.py +152 -80
  14. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/src/simplevecdb/engine/quantization.py +6 -0
  15. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/src/simplevecdb/engine/search.py +68 -27
  16. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/src/simplevecdb/engine/usearch_index.py +64 -49
  17. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/src/simplevecdb/utils.py +155 -1
  18. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/integration/test_langchain.py +1 -1
  19. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/integration/test_server.py +10 -8
  20. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/core/test_core_additional_coverage.py +1 -1
  21. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/core/test_filters.py +5 -5
  22. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/core/test_initialization.py +2 -2
  23. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/core/test_missing_coverage.py +1 -1
  24. simplevecdb-2.5.0/tests/unit/core/test_v25_correctness.py +265 -0
  25. simplevecdb-2.5.0/tests/unit/core/test_v25_features.py +344 -0
  26. simplevecdb-2.5.0/tests/unit/core/test_v25_robustness.py +312 -0
  27. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/embeddings/test_server.py +33 -25
  28. simplevecdb-2.5.0/tests/unit/embeddings/test_v25_enhancements.py +382 -0
  29. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/test_core.py +6 -6
  30. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/uv.lock +1 -1
  31. simplevecdb-2.3.0/src/simplevecdb/embeddings/__init__.py +0 -0
  32. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/.bandit +0 -0
  33. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/.env.example +0 -0
  34. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/.github/FUNDING.yml +0 -0
  35. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  36. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  37. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  38. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/.github/dependabot.yml +0 -0
  39. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/.github/workflows/ci.yml +0 -0
  40. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/.github/workflows/publish.yml +0 -0
  41. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/.github/workflows/security.yml +0 -0
  42. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/.github/workflows/update-sponsors.yml +0 -0
  43. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/.gitignore +0 -0
  44. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/.pre-commit-config.yaml +0 -0
  45. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/.python-version +0 -0
  46. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/CODE_OF_CONDUCT.md +0 -0
  47. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/CONTRIBUTING.md +0 -0
  48. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/LICENSE +0 -0
  49. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/SECURITY.md +0 -0
  50. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/docs/CONTRIBUTING.md +0 -0
  51. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/docs/ENV_SETUP.md +0 -0
  52. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/docs/LICENSE +0 -0
  53. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/docs/api/async.md +0 -0
  54. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/docs/api/config.md +0 -0
  55. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/docs/api/core.md +0 -0
  56. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/docs/api/embeddings.md +0 -0
  57. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/docs/api/encryption.md +0 -0
  58. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/docs/api/engine/catalog.md +0 -0
  59. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/docs/api/engine/quantization.md +0 -0
  60. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/docs/api/engine/search.md +0 -0
  61. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/docs/api/integrations.md +0 -0
  62. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/docs/api/types.md +0 -0
  63. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/docs/benchmarks.md +0 -0
  64. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/docs/examples.md +0 -0
  65. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/docs/guides/clustering.md +0 -0
  66. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/docs/index.md +0 -0
  67. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/examples/auto_embed.py +0 -0
  68. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/examples/backend_benchmark.py +0 -0
  69. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/examples/embeddings/perf_benchmark.py +0 -0
  70. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/examples/quant_benchmark.py +0 -0
  71. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/examples/rag/langchain_rag.ipynb +0 -0
  72. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/examples/rag/llama_rag.ipynb +0 -0
  73. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/examples/rag/ollama_rag.ipynb +0 -0
  74. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/examples/smoke_test.py +0 -0
  75. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/mkdocs.yml +0 -0
  76. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/src/simplevecdb/config.py +0 -0
  77. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/src/simplevecdb/encryption.py +0 -0
  78. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/src/simplevecdb/engine/__init__.py +0 -0
  79. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/src/simplevecdb/engine/clustering.py +0 -0
  80. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/src/simplevecdb/integrations/__init__.py +0 -0
  81. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/src/simplevecdb/integrations/langchain.py +0 -0
  82. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/src/simplevecdb/integrations/llamaindex.py +0 -0
  83. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/src/simplevecdb/logging.py +0 -0
  84. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/src/simplevecdb/types.py +0 -0
  85. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/conftest.py +0 -0
  86. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/integration/test_llamaindex.py +0 -0
  87. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/integration/test_rag.py +0 -0
  88. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/integration/test_v21_features.py +0 -0
  89. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/perf/test_batch_detection.py +0 -0
  90. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/perf/test_performance.py +0 -0
  91. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/core/__init__.py +0 -0
  92. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/core/test_batch_detection.py +0 -0
  93. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/core/test_factory_methods.py +0 -0
  94. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/core/test_quantization.py +0 -0
  95. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/core/test_similarity_search.py +0 -0
  96. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/embeddings/__init__.py +0 -0
  97. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/embeddings/test_models.py +0 -0
  98. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/embeddings/test_server_coverage.py +0 -0
  99. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/integrations/__init__.py +0 -0
  100. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/integrations/test_langchain_coverage.py +0 -0
  101. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/integrations/test_llamaindex_coverage.py +0 -0
  102. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/test_async.py +0 -0
  103. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/test_async_coverage.py +0 -0
  104. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/test_catalog_coverage.py +0 -0
  105. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/test_clustering.py +0 -0
  106. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/test_config.py +0 -0
  107. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/test_cross_collection_search.py +0 -0
  108. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/test_encryption.py +0 -0
  109. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/test_encryption_coverage.py +0 -0
  110. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/test_error_handling.py +0 -0
  111. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/test_hierarchy.py +0 -0
  112. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/test_multi_collection.py +0 -0
  113. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/test_search.py +0 -0
  114. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/test_search_coverage.py +0 -0
  115. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/test_search_missing_coverage.py +0 -0
  116. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/test_streaming.py +0 -0
  117. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/test_types.py +0 -0
  118. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/test_usearch_index_missing_coverage.py +0 -0
  119. {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/test_utils.py +0 -0
@@ -5,106 +5,202 @@ All notable changes to SimpleVecDB will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
- ## [2.2.0] - 2026-01-17
8
+ ## [2.5.0] - 2026-04-07
9
9
 
10
10
  ### Added
11
11
 
12
- - **Vector Clustering & Auto-Tagging** - Discover natural groupings in embeddings
13
- - `VectorCollection.cluster()` - Cluster documents by semantic similarity
14
- - **K-means**: Classic centroid-based clustering for balanced clusters
15
- - **MiniBatch K-means**: Scalable variant for large datasets (default)
16
- - **HDBSCAN**: Density-based clustering that auto-discovers cluster count
17
- - `VectorCollection.auto_tag()` - Generate descriptive tags for clusters
18
- - TF-IDF method (default): Extract keywords with highest TF-IDF scores
19
- - Frequency method: Extract most common words per cluster
20
- - Custom callback: Implement custom tagging logic (e.g., LLM-based)
21
- - `VectorCollection.assign_cluster_metadata()` - Persist cluster IDs to document metadata
22
- - `VectorCollection.get_cluster_members()` - Retrieve all documents in a cluster
23
-
24
- - **Cluster Quality Metrics** - Evaluate clustering results
25
- - `ClusterResult.inertia` - Sum of squared distances to centroids (K-means only, lower is better)
26
- - `ClusterResult.silhouette_score` - Cluster separation metric (-1 to 1, higher is better)
27
- - `ClusterResult.metrics()` - Get all metrics as dictionary
28
-
29
- - **Cluster Persistence** - Save and reuse cluster configurations
30
- - `VectorCollection.save_cluster()` - Save cluster centroids and metadata to database
31
- - `VectorCollection.load_cluster()` - Load saved cluster configuration
32
- - `VectorCollection.list_clusters()` - List all saved cluster configurations
33
- - `VectorCollection.delete_cluster()` - Delete a saved cluster configuration
34
- - `VectorCollection.assign_to_cluster()` - Assign new documents to saved clusters without re-clustering
35
-
36
- - **Async Clustering Support** - Full async/await parity for all clustering operations
37
- - `AsyncVectorCollection.cluster()`, `auto_tag()`, `assign_cluster_metadata()`, `get_cluster_members()`
38
- - `AsyncVectorCollection.save_cluster()`, `load_cluster()`, `list_clusters()`, `delete_cluster()`, `assign_to_cluster()`
39
-
40
- - **New Dependencies** - Now included in standard installation
41
- - `scikit-learn>=1.3.0` - K-means, MiniBatch K-means, silhouette score
42
- - `hdbscan>=0.8.33` - Density-based clustering
43
- - `sqlcipher3-binary>=0.5.0` - Encryption support (previously optional)
44
- - `cryptography>=41.0` - Encryption utilities (previously optional)
45
-
46
- - **Documentation**
47
- - New comprehensive clustering guide: `docs/guides/clustering.md`
48
- - Algorithm comparison and selection guide
49
- - Quality metrics interpretation
50
- - Cluster persistence workflows
51
- - Use cases: product categorization, topic discovery, customer segmentation, duplicate detection
52
- - Best practices and troubleshooting
53
- - New types reference: `docs/api/types.md`
54
- - Complete `ClusterResult` API documentation
55
- - `Document`, `DistanceStrategy`, `Quantization`, `ClusterAlgorithm` reference
56
- - Updated README.md and docs/index.md with clustering sections
57
- - Enhanced `docs/api/core.md` with clustering examples
12
+ - **`delete_collection(name)`** — drop a collection's SQLite tables, FTS index, and usearch file in one call. Available on both `VectorDB` and `AsyncVectorDB`.
13
+ - **`store_embeddings` parameter** on `collection()` — opt into storing embedding BLOBs in SQLite (default `False`). The default saves ~2x storage; MMR transparently fetches vectors from the usearch index when BLOBs are absent.
14
+ - **`async_retry_on_lock` decorator** — async variant of `retry_on_lock` using `asyncio.sleep` instead of `time.sleep`, avoiding executor thread blocking.
15
+ - **`file_lock` context manager** — advisory cross-process file locking (`fcntl`/`msvcrt`) for usearch index files. Prevents corruption from concurrent processes.
16
+ - **`__repr__`** on `VectorDB`, `VectorCollection`, `AsyncVectorDB`, `AsyncVectorCollection` for debuggable string representations.
17
+ - **FLOAT16 quantization** — fully implemented in `serialize()`/`deserialize()`; it was previously defined in the enum but raised `ValueError` at runtime.
18
+ - **Pagination** on `get_documents(limit=, offset=)` and catalog methods (`find_ids_by_filter`, `find_ids_by_texts`) — previously returned unbounded result sets.
19
+ - **Embeddings server enhancements:**
20
+ - Graceful shutdown with SIGTERM/SIGINT draining (10s timeout)
21
+ - CORS middleware with configurable origins for browser-based clients
22
+ - Model warm-up on startup (skip with `--no-warmup`)
23
+ - Input validation: rejects empty strings (422) and texts exceeding 100k chars (413)
24
+ - Proper `argparse` CLI with `--host`, `--port`, `--no-warmup`, `--help`
25
+ - Startup banner logging config summary (host, port, model, auth, rate limits)
26
+ - Nested token array normalization (`list[list[int]]` input format)
27
+ - Async executor offload for `embed_texts` (non-blocking event loop)
28
+ - OpenAPI version synced from package metadata
29
+ - Module `__init__.py` exports (`embed_texts`, `get_embedder`, `load_model`, `app`, `run_server`)
30
+
31
+ ### Fixed
32
+
33
+ - **`delete_by_ids` ordering** — SQLite deletion now happens first (transactional, can rollback), then usearch. Previously usearch removed first, leaving orphaned catalog entries on SQLite failure.
34
+ - **`_matches_filter` string semantics** — now uses exact equality, consistent with SQL `build_filter_clause`. Previously used substring match (`value in str(meta_value)`).
35
+ - **`list_collections`** — scans `sqlite_master` for persisted collection tables instead of returning only session-cached names. Works across reopened databases.
36
+ - **WAL mode for encrypted databases** — `PRAGMA journal_mode=WAL` and `PRAGMA synchronous=NORMAL` are now set for SQLCipher connections (previously only set for unencrypted connections).
37
+ - **`collection()` cache key** — includes `distance_strategy` and `quantization` in cache key (sync version). Previously cached by name only, silently ignoring differing params on cache hit.
38
+ - **`_ensure_fts_table`** retries up to 3 times on transient "database is locked" errors instead of permanently disabling FTS on first failure.
39
+ - **Connection health check** — `SELECT 1` probe after connection creation; raises `RuntimeError` immediately on corrupt databases.
40
+
41
+ ### Improved
42
+
43
+ - **Usearch batch operations** — `add()`, `remove()`, and `get()` now use batch usearch APIs instead of per-key loops. Significant speedup for large operations.
44
+ - **Filtered search iterative deepening** — replaces fixed `k*3` overfetch with adaptive doubling (up to `k*30`). Highly selective filters now reliably return `k` results.
45
+ - **Memory-map heuristic** — uses file size threshold (50MB) instead of inaccurate `file_size // 100` vector count estimate for mmap vs load decision.
46
+ - **Apple chip detection** — uses `platform.processor()` instead of spawning a `sysctl` subprocess.
47
+
48
+ ### Removed
49
+
50
+ - **Duplicate `_dim` property** — removed in favor of the public `dim` property.
51
+
52
+ ### Breaking Changes
53
+
54
+ - String metadata filters now use exact equality (was substring match).
55
+ - `store_embeddings` defaults to `False` — `rebuild_index()` requires `store_embeddings=True` or re-adding documents.
56
+
57
+ ## [2.4.0] - 2026-03-22
58
+
59
+ ### Added
60
+
61
+ - **Public catalog API on VectorCollection + AsyncVectorCollection:**
62
+ - `get_documents(filter_dict=)` — replaces private `_catalog` access
63
+ - `get_embeddings_by_ids(ids)` — fetch stored embeddings
64
+ - `update_metadata(updates)` — batch metadata merge
65
+ - `count()`, `save()`, `dim` property — async wrappers
66
+ - `add_texts(parent_ids=, threads=)` — full param support on async
67
+ - `rebuild_index`, `get_children/parent/descendants/ancestors`, `set_parent` — async hierarchy API
68
+ - **Executor injection on AsyncVectorDB** — accept optional `executor` keyword argument so consumers can share a single-threaded executor for ONNX/usearch thread safety; `close()` only shuts down executor when `_owns_executor` is True
69
+ - **Safety constants** in `constants.py`: `SEARCH_COLLECTION_TIMEOUT`, `EXECUTOR_SHUTDOWN_TIMEOUT`, `MAX_HIERARCHY_DEPTH`
70
+
71
+ ### Fixed
72
+
73
+ - **VectorDB.close()** now calls `conn.close()` — was leaking file descriptors when `save()` succeeded but connection was never closed
74
+ - **VectorDB.close()** wraps `save()` in `try/finally` so `conn.close()` always runs even if index serialization fails
75
+ - **add_documents ID recovery** uses `last_insert_rowid()` arithmetic instead of `ORDER BY id DESC LIMIT N`, which raced under concurrent inserts
76
+ - **String metadata filter** uses exact equality (`=`) instead of `LIKE` substring match — `{"type": "doc"}` no longer matches `"markdown_doc"`
77
+ - **update_metadata_batch** wrapped in single transaction (`with self.conn`) to prevent partial commits on crash
78
+ - **rebuild_index** uses `if x is not None` instead of `x or default` so passing `connectivity=0` no longer silently uses the default
79
+ - **search_collections** parallel futures now have a 30s timeout — one hung collection can no longer block the entire cross-collection search
80
+ - **AsyncVectorDB.close()** uses `shutdown(wait=False, cancel_futures=True)` instead of blocking `shutdown(wait=True)` which could hang forever on stuck tasks
81
+ - **Recursive CTE safety cap** — `get_descendants`/`get_ancestors` apply `MAX_HIERARCHY_DEPTH=100` when `max_depth=None` to prevent infinite recursion from parent_id cycles
82
+ - **RateLimiter cleanup** capped to 500 evictions per call to bound lock hold time under high bucket counts
83
+ - **HuggingFace download** now uses `etag_timeout=30` with local-cache fallback on network failure
84
+ - **embed_texts** rejects batches over 10,000 texts to prevent unbounded CPU time
85
+ - **retry_on_lock** adds `total_timeout=10s` budget — gives up early if cumulative sleep would exceed the budget
58
86
 
59
87
  ### Changed
60
88
 
61
- - **pyproject.toml**: Updated `scikit-learn` minimum version from `1.0` to `1.3.0` for improved clustering stability
89
+ - **`__version__`** now read from package metadata via `importlib.metadata` (single source of truth in `pyproject.toml`)
90
+ - **Upsert in usearch_index** separates conflict detection from removal for clearer flow
62
91
 
63
- ### Testing
92
+ ## [2.3.0] - 2026-03-08
64
93
 
65
- - Added 26 clustering tests in `tests/unit/test_clustering.py`:
66
- - 16 core clustering tests (algorithms, auto-tagging, metadata persistence, edge cases)
67
- - 4 cluster metrics tests (inertia, silhouette, metrics method)
68
- - 6 cluster persistence tests (save/load/list/delete/assign)
69
- - Added 3 async clustering tests in `tests/unit/test_async.py`
70
- - Total test count: 305 (up from 292)
94
+ ### Breaking Changes
71
95
 
72
- ### Installation
96
+ - **Integration dependencies are now optional.** LangChain and LlamaIndex packages are no longer installed by default. Install with `pip install simplevecdb[integrations]` to use them. Existing users upgrading from v2.2.x will see a clear ImportError with migration instructions.
73
97
 
74
- Clustering and encryption are now included by default:
98
+ ### Added
75
99
 
76
- ```bash
77
- pip install simplevecdb
78
- ```
100
+ - **`[integrations]` optional extra** — Install LangChain and LlamaIndex dependencies only when needed, reducing default install footprint
101
+ - **Runtime import guards** in integration modules with v2.3.0 migration messaging
102
+ - **Lazy `__getattr__` loading** in `integrations/__init__.py` — integration classes are only imported when accessed
103
+ - **Input validation guards** on search methods:
104
+ - `similarity_search`, `similarity_search_batch`, `keyword_search`, `hybrid_search` now reject `k <= 0`
105
+ - `add_texts` validates length consistency of `metadatas`, `embeddings`, `ids`, and `parent_ids` against `texts`
106
+ - **NaN/Inf validation** for float values in metadata filters (`utils.validate_filter`)
107
+ - **Empty list rejection** for list filter values
108
+ - **Double-close protection** on `VectorDB` with `_closed` flag
109
+ - **Context manager protocol** (`__enter__`/`__exit__`) on `VectorDB`
110
+ - **Table name validation** in `check_migration` (defense-in-depth against SQL injection)
111
+ - **Graceful per-future error handling** in `search_collections`
112
+ - **Adaptive batch search threshold** — queries below `USEARCH_BATCH_THRESHOLD` (10) use sequential search to avoid batch overhead
79
113
 
80
- No extra installation steps required!
114
+ ### Changed
81
115
 
82
- ### Example
116
+ - **Python dev target changed to 3.12** (`.python-version`), `requires-python` remains `>= "3.10"`
117
+ - **Version bumped to 2.3.0**
118
+ - **Performance: MMR search vectorized** — pre-normalize embeddings once, use `sel_matrix @ emb` matrix-vector multiply instead of Python inner loop, O(1) `list.pop` replaces O(n) `list.remove`, hoist `1 - lambda_mult` loop invariant
119
+ - **Performance: merged SQL round-trips in MMR** — new `get_documents_and_embeddings_by_ids` fetches text, metadata, and embeddings in a single query (previously two separate SELECTs)
120
+ - **Performance: `get_parent` collapsed** from 2 sequential SELECTs to 1 self-JOIN
121
+ - **Performance: `add_documents` ID recovery** — skip redundant `SELECT ORDER BY DESC` when explicit IDs are provided; removed unnecessary `list(texts)` copy
122
+ - **Performance: FLOAT serialization** — `np.asarray().tobytes()` replaces `struct.pack` with per-element Python loop (single C memcpy)
123
+ - **Performance: `np.array` → `np.asarray`** on every search and insert path to avoid unnecessary copies
124
+ - **Performance: SQL placeholder strings** — `",".join(["?"] * len(ids))` replaces generator expression across all 9 call sites
125
+ - **Performance: batched numpy conversion** in `add_texts` — single `np.asarray` call instead of per-item conversion
126
+ - **Performance: compact JSON separators** in catalog serialization
127
+ - **Performance: deduplicated `.tolist()` calls** in search engine
128
+ - **Performance: `np.unique(ravel())`** for batch key collection in `similarity_search_batch`
129
+ - **Performance: usearch upsert** — skip contains-check loop on empty index, cache `int(key)` once per iteration
130
+ - **Performance: cluster table DDL** — `_cluster_table_ready` flag skips `CREATE TABLE IF NOT EXISTS` on repeated calls; cached `_cluster_table_name`
131
+ - **`_normalize_key`** now delegates to `_derive_key` instead of duplicating PBKDF2 logic
132
+ - **HNSW defaults** in `usearch_index.py` now sourced from `constants.py` (removed local duplicates)
133
+ - **Collection name regex** uses `constants.COLLECTION_NAME_PATTERN` instead of hardcoded pattern
134
+ - **`VectorDB` defaults** for `distance_strategy` and `quantization` sourced from `constants.DEFAULT_DISTANCE_STRATEGY` / `constants.DEFAULT_QUANTIZATION`
135
+ - **`_batched` utility** moved from `core.py` to `utils.py` for reuse; now used in `catalog.py` batch updates
136
+ - **`auto_tag`** uses `defaultdict(list)` instead of manual if-not-in pattern
137
+ - **`import random`** hoisted to module level in `utils.py` (was inside retry loop)
138
+ - **Streaming placeholder bug fixed** — `_process_streaming_batch` now correctly detects `None` placeholders (previously used empty list `[]`, preventing auto-embedding replacement)
139
+ - **README updated** to document `pip install simplevecdb[integrations]` installation
140
+
141
+ ### Removed
142
+
143
+ - LangChain and LlamaIndex packages from core `[project.dependencies]` (moved to `[project.optional-dependencies] integrations`)
144
+ - Duplicated HNSW default constants from `usearch_index.py` (now single source in `constants.py`)
145
+ - Unused `struct` import from `quantization.py`
146
+ - Unused `itertools` import from `core.py`
147
+
148
+ ## [2.2.1] - 2026-01-27
83
149
 
84
- ```python
85
- from simplevecdb import VectorDB
150
+ ### Changed
86
151
 
87
- db = VectorDB("products.db")
88
- collection = db.collection("items")
152
+ - Moved integration dependencies (langchain-core, langchain-openai, llama-index) from dev to main dependencies for easier installation
153
+ - Added bandit to dev dependencies for security linting in pre-commit
154
+ - Cleaned up duplicate dev dependency definitions
89
155
 
90
- # Cluster documents
91
- result = collection.cluster(n_clusters=5, algorithm="minibatch_kmeans")
156
+ ## [2.2.0] - 2026-01-26
92
157
 
93
- # Generate tags and persist
94
- tags = collection.auto_tag(result, method="tfidf", n_keywords=3)
95
- collection.assign_cluster_metadata(result, tags)
158
+ ### Added
96
159
 
97
- # Save for fast assignment of new documents
98
- collection.save_cluster("categories", result, metadata={"tags": tags})
160
+ - Version 2.2.0 release
99
161
 
100
- # Later: assign new documents without re-clustering
101
- new_ids = collection.add_texts(new_texts, embeddings=new_embeddings)
102
- collection.assign_to_cluster("categories", new_ids)
162
+ ## [2.1.0] - 2026-01-01
103
163
 
104
- # Evaluate quality
105
- print(f"Silhouette Score: {result.silhouette_score:.2f}") # 0.62
106
- print(f"Inertia: {result.inertia:.2f}") # 1523.45
107
- ```
164
+ ### Added
165
+
166
+ - **SQLCipher Encryption Support** - Full at-rest encryption for sensitive data:
167
+ - `VectorDB(path, encryption_key="...")` enables AES-256 page-level database encryption
168
+ - Uses SQLCipher for transparent SQLite encryption (PRAGMA key)
169
+ - Usearch index files encrypted with AES-256-GCM (`.usearch.enc`)
170
+ - Zero performance overhead during search (decrypt on load, encrypt on save only)
171
+ - Key derivation: PBKDF2-SHA256 with 480,000 iterations for passphrases
172
+ - Install with `pip install simplevecdb[encryption]`
173
+
174
+ - **New encryption module** (`simplevecdb.encryption`):
175
+ - `create_encrypted_connection()` - SQLCipher connection factory
176
+ - `is_database_encrypted()` - Check if a database file is encrypted
177
+ - `encrypt_index_file()` / `decrypt_index_file()` - Index file encryption
178
+ - `EncryptionError` / `EncryptionUnavailableError` - New exception types
179
+
180
+ - **Streaming Insert API** - Memory-efficient large-scale ingestion:
181
+ - `collection.add_texts_streaming(iterable)` - Process from any iterator/generator
182
+ - Configurable `batch_size` parameter (default: config.EMBEDDING_BATCH_SIZE)
183
+ - Yields `StreamingProgress` after each batch for monitoring
184
+ - Optional `on_progress` callback for custom logging/UI updates
185
+ - New types: `StreamingProgress`, `ProgressCallback`
186
+
187
+ - **Hierarchical Document Relationships** - Parent/child document structure:
188
+ - `parent_ids` parameter in `add_texts()` to link documents
189
+ - `get_children(doc_id)` - Get direct child documents
190
+ - `get_parent(doc_id)` - Get parent document
191
+ - `get_descendants(doc_id, max_depth)` - Recursive children traversal
192
+ - `get_ancestors(doc_id, max_depth)` - Path to root
193
+ - `set_parent(doc_id, parent_id)` - Update relationships
194
+ - Uses SQLite recursive CTE for efficient traversal
195
+ - Auto-migrates existing databases (adds `parent_id` column)
196
+
197
+ ### Changed
198
+
199
+ - `check_migration()` now gracefully handles encrypted databases (returns `needs_migration=False`)
200
+
201
+ ### Dependencies
202
+
203
+ - New optional dependency group `[encryption]`: `sqlcipher3-binary>=0.5.0`, `cryptography>=41.0`
108
204
 
109
205
  ## [2.0.0] - 2025-12-23
110
206
 
@@ -473,6 +569,12 @@ Benchmarks on i9-13900K & RTX 4090 with 10k vectors (384-dim):
473
569
  - **Documentation**: https://coderdayton.github.io/simplevecdb/
474
570
  - **License**: MIT
475
571
 
572
+ [2.4.0]: https://github.com/coderdayton/simplevecdb/releases/tag/v2.4.0
573
+ [2.3.0]: https://github.com/coderdayton/simplevecdb/releases/tag/v2.3.0
574
+ [2.2.1]: https://github.com/coderdayton/simplevecdb/releases/tag/v2.2.1
575
+ [2.2.0]: https://github.com/coderdayton/simplevecdb/releases/tag/v2.2.0
576
+ [2.1.0]: https://github.com/coderdayton/simplevecdb/releases/tag/v2.1.0
577
+ [2.0.0]: https://github.com/coderdayton/simplevecdb/releases/tag/v2.0.0
476
578
  [1.3.0]: https://github.com/coderdayton/simplevecdb/releases/tag/v1.3.0
477
579
  [1.2.0]: https://github.com/coderdayton/simplevecdb/releases/tag/v1.2.0
478
580
  [1.1.1]: https://github.com/coderdayton/simplevecdb/releases/tag/v1.1.1
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: simplevecdb
3
- Version: 2.3.0
3
+ Version: 2.5.0
4
4
  Summary: Dead-simple local vector database powered by usearch HNSW.
5
5
  Author-email: Dayton Dunbar <coderdayton14@gmail.com>
6
6
  License: MIT
@@ -43,7 +43,7 @@ SimpleVecDB brings **Chroma-like simplicity** to a single **SQLite file**. Built
43
43
  - **Zero Infrastructure** — Just a `.db` file. No Docker, no Redis, no cloud bills.
44
44
  - **Blazing Fast** — 10-100x faster search via usearch HNSW. Adaptive: brute-force for <10k vectors (perfect recall), HNSW for larger collections.
45
45
  - **Truly Portable** — Runs anywhere SQLite runs: Linux, macOS, Windows, even WASM.
46
- - **Async Ready** — Full async/await support for web servers and concurrent workloads.
46
+ - **Async Ready** — Full async/await support with optional executor injection for thread-safe ONNX/usearch sharing.
47
47
  - **Batteries Included** — Optional FastAPI embeddings server + LangChain/LlamaIndex integrations via `[integrations]` extra.
48
48
  - **Production Ready** — Hybrid search (BM25 + vector), metadata filtering, multi-collection support, and automatic hardware acceleration.
49
49
 
@@ -169,10 +169,13 @@ hybrid = collection.hybrid_search("powerhouse cell", k=2)
169
169
  **Optional: Run embeddings server (OpenAI-compatible)**
170
170
 
171
171
  ```bash
172
- simplevecdb-server --port 8000
172
+ simplevecdb-server --port 8000 # Default model, auto warm-up
173
+ simplevecdb-server --host 0.0.0.0 --port 9000 # Bind to all interfaces
174
+ simplevecdb-server --no-warmup # Skip model preload on startup
175
+ simplevecdb-server --help # Show all options
173
176
  ```
174
177
 
175
- See [Setup Guide](ENV_SETUP.md) for configuration: model registry, rate limits, API keys, CUDA optimization.
178
+ See [Setup Guide](ENV_SETUP.md) for configuration: model registry, rate limits, API keys, CORS, CUDA optimization.
176
179
 
177
180
  ### Option 3: With LangChain or LlamaIndex
178
181
 
@@ -321,6 +324,36 @@ parent = collection.get_parent(child_ids[0])
321
324
  descendants = collection.get_descendants(parent_ids[0])
322
325
  ```
323
326
 
327
+ ### Document Management (v2.4+)
328
+
329
+ Query and update documents without touching private internals:
330
+
331
+ ```python
332
+ # Get all documents (with optional metadata filter)
333
+ docs = collection.get_documents(filter_dict={"category": "tech"})
334
+ for doc_id, text, metadata in docs:
335
+ print(f"[{doc_id}] {text[:50]}...")
336
+
337
+ # Paginated access (v2.5+)
338
+ page1 = collection.get_documents(limit=100)
339
+ page2 = collection.get_documents(limit=100, offset=100)
340
+
341
+ # Fetch stored embeddings
342
+ embeddings = collection.get_embeddings_by_ids([1, 2, 3])
343
+
344
+ # Batch update metadata (shallow merge)
345
+ collection.update_metadata([
346
+ (1, {"reviewed": True}),
347
+ (2, {"reviewed": True, "score": 0.95}),
348
+ ])
349
+
350
+ # Quick stats
351
+ print(f"Collection has {collection.count()} documents, dim={collection.dim}")
352
+
353
+ # Delete an entire collection (v2.5+)
354
+ db.delete_collection("old_data")
355
+ ```
356
+
324
357
  ### Vector Clustering (v2.2+)
325
358
 
326
359
  Discover natural groupings in your embeddings:
@@ -359,6 +392,12 @@ Supports K-means, MiniBatch K-means, and HDBSCAN. See [Clustering Guide](https:/
359
392
  | **Document Hierarchies** | ✅ | Parent/child relationships for chunked docs |
360
393
  | **Vector Clustering** | ✅ | K-means, MiniBatch K-means, HDBSCAN with auto-tagging (v2.2+) |
361
394
  | **Cluster Persistence** | ✅ | Save/load cluster centroids for fast assignment (v2.2+) |
395
+ | **Public Catalog API** | ✅ | `get_documents`, `get_embeddings_by_ids`, `update_metadata` (v2.4+) |
396
+ | **Executor Injection** | ✅ | Share thread pool across async instances for ONNX safety (v2.4+) |
397
+ | **Collection Management** | ✅ | `delete_collection()`, paginated `get_documents(limit=, offset=)` (v2.5+) |
398
+ | **Cross-Process Safety** | ✅ | Advisory file locking on usearch index files (v2.5+) |
399
+ | **FLOAT16 Quantization** | ✅ | Half-precision storage with 2x compression (v2.5+) |
400
+ | **Embeddings Server** | ✅ | CORS, graceful shutdown, input validation, model warm-up (v2.5+) |
362
401
 
363
402
  ## Performance Benchmarks
364
403
 
@@ -429,6 +468,11 @@ pip install torch --index-url https://download.pytorch.org/whl/cu118
429
468
  - [x] Hierarchical document relationships (parent/child)
430
469
  - [x] Cross-collection search
431
470
  - [x] Vector clustering and auto-tagging (v2.2)
471
+ - [x] Public catalog API for document management (v2.4)
472
+ - [x] Async executor injection for thread-safe sharing (v2.4)
473
+ - [x] Collection management: `delete_collection()`, pagination (v2.5)
474
+ - [x] Cross-process file locking and connection health checks (v2.5)
475
+ - [x] Embeddings server hardening: CORS, graceful shutdown, input validation (v2.5)
432
476
  - [ ] Incremental clustering (online learning)
433
477
  - [ ] Cluster visualization exports
434
478
 
@@ -14,7 +14,7 @@ SimpleVecDB brings **Chroma-like simplicity** to a single **SQLite file**. Built
14
14
  - **Zero Infrastructure** — Just a `.db` file. No Docker, no Redis, no cloud bills.
15
15
  - **Blazing Fast** — 10-100x faster search via usearch HNSW. Adaptive: brute-force for <10k vectors (perfect recall), HNSW for larger collections.
16
16
  - **Truly Portable** — Runs anywhere SQLite runs: Linux, macOS, Windows, even WASM.
17
- - **Async Ready** — Full async/await support for web servers and concurrent workloads.
17
+ - **Async Ready** — Full async/await support with optional executor injection for thread-safe ONNX/usearch sharing.
18
18
  - **Batteries Included** — Optional FastAPI embeddings server + LangChain/LlamaIndex integrations via `[integrations]` extra.
19
19
  - **Production Ready** — Hybrid search (BM25 + vector), metadata filtering, multi-collection support, and automatic hardware acceleration.
20
20
 
@@ -140,10 +140,13 @@ hybrid = collection.hybrid_search("powerhouse cell", k=2)
140
140
  **Optional: Run embeddings server (OpenAI-compatible)**
141
141
 
142
142
  ```bash
143
- simplevecdb-server --port 8000
143
+ simplevecdb-server --port 8000 # Default model, auto warm-up
144
+ simplevecdb-server --host 0.0.0.0 --port 9000 # Bind to all interfaces
145
+ simplevecdb-server --no-warmup # Skip model preload on startup
146
+ simplevecdb-server --help # Show all options
144
147
  ```
145
148
 
146
- See [Setup Guide](ENV_SETUP.md) for configuration: model registry, rate limits, API keys, CUDA optimization.
149
+ See [Setup Guide](ENV_SETUP.md) for configuration: model registry, rate limits, API keys, CORS, CUDA optimization.
147
150
 
148
151
  ### Option 3: With LangChain or LlamaIndex
149
152
 
@@ -292,6 +295,36 @@ parent = collection.get_parent(child_ids[0])
292
295
  descendants = collection.get_descendants(parent_ids[0])
293
296
  ```
294
297
 
298
+ ### Document Management (v2.4+)
299
+
300
+ Query and update documents without touching private internals:
301
+
302
+ ```python
303
+ # Get all documents (with optional metadata filter)
304
+ docs = collection.get_documents(filter_dict={"category": "tech"})
305
+ for doc_id, text, metadata in docs:
306
+ print(f"[{doc_id}] {text[:50]}...")
307
+
308
+ # Paginated access (v2.5+)
309
+ page1 = collection.get_documents(limit=100)
310
+ page2 = collection.get_documents(limit=100, offset=100)
311
+
312
+ # Fetch stored embeddings
313
+ embeddings = collection.get_embeddings_by_ids([1, 2, 3])
314
+
315
+ # Batch update metadata (shallow merge)
316
+ collection.update_metadata([
317
+ (1, {"reviewed": True}),
318
+ (2, {"reviewed": True, "score": 0.95}),
319
+ ])
320
+
321
+ # Quick stats
322
+ print(f"Collection has {collection.count()} documents, dim={collection.dim}")
323
+
324
+ # Delete an entire collection (v2.5+)
325
+ db.delete_collection("old_data")
326
+ ```
327
+
295
328
  ### Vector Clustering (v2.2+)
296
329
 
297
330
  Discover natural groupings in your embeddings:
@@ -330,6 +363,12 @@ Supports K-means, MiniBatch K-means, and HDBSCAN. See [Clustering Guide](https:/
330
363
  | **Document Hierarchies** | ✅ | Parent/child relationships for chunked docs |
331
364
  | **Vector Clustering** | ✅ | K-means, MiniBatch K-means, HDBSCAN with auto-tagging (v2.2+) |
332
365
  | **Cluster Persistence** | ✅ | Save/load cluster centroids for fast assignment (v2.2+) |
366
+ | **Public Catalog API** | ✅ | `get_documents`, `get_embeddings_by_ids`, `update_metadata` (v2.4+) |
367
+ | **Executor Injection** | ✅ | Share thread pool across async instances for ONNX safety (v2.4+) |
368
+ | **Collection Management** | ✅ | `delete_collection()`, paginated `get_documents(limit=, offset=)` (v2.5+) |
369
+ | **Cross-Process Safety** | ✅ | Advisory file locking on usearch index files (v2.5+) |
370
+ | **FLOAT16 Quantization** | ✅ | Half-precision storage with 2x compression (v2.5+) |
371
+ | **Embeddings Server** | ✅ | CORS, graceful shutdown, input validation, model warm-up (v2.5+) |
333
372
 
334
373
  ## Performance Benchmarks
335
374
 
@@ -400,6 +439,11 @@ pip install torch --index-url https://download.pytorch.org/whl/cu118
400
439
  - [x] Hierarchical document relationships (parent/child)
401
440
  - [x] Cross-collection search
402
441
  - [x] Vector clustering and auto-tagging (v2.2)
442
+ - [x] Public catalog API for document management (v2.4)
443
+ - [x] Async executor injection for thread-safe sharing (v2.4)
444
+ - [x] Collection management: `delete_collection()`, pagination (v2.5)
445
+ - [x] Cross-process file locking and connection health checks (v2.5)
446
+ - [x] Embeddings server hardening: CORS, graceful shutdown, input validation (v2.5)
403
447
  - [ ] Incremental clustering (online learning)
404
448
  - [ ] Cluster visualization exports
405
449
 
@@ -5,6 +5,41 @@ All notable changes to SimpleVecDB will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [2.4.0] - 2026-03-22
9
+
10
+ ### Added
11
+
12
+ - **Public catalog API on VectorCollection + AsyncVectorCollection:**
13
+ - `get_documents(filter_dict=)` — replaces private `_catalog` access
14
+ - `get_embeddings_by_ids(ids)` — fetch stored embeddings
15
+ - `update_metadata(updates)` — batch metadata merge
16
+ - `count()`, `save()`, `dim` property — async wrappers
17
+ - `add_texts(parent_ids=, threads=)` — full param support on async
18
+ - `rebuild_index`, `get_children/parent/descendants/ancestors`, `set_parent` — async hierarchy API
19
+ - **Executor injection on AsyncVectorDB** — accepts an optional `executor` keyword argument so consumers can share a single-threaded executor for ONNX/usearch thread safety; `close()` only shuts down the executor when `_owns_executor` is True
20
+ - **Safety constants** in `constants.py`: `SEARCH_COLLECTION_TIMEOUT`, `EXECUTOR_SHUTDOWN_TIMEOUT`, `MAX_HIERARCHY_DEPTH`
21
+
22
+ ### Fixed
23
+
24
+ - **VectorDB.close()** now calls `conn.close()` — previously the connection was never closed, leaking file descriptors even when `save()` succeeded
25
+ - **VectorDB.close()** wraps `save()` in `try/finally` so `conn.close()` always runs even if index serialization fails
26
+ - **add_documents ID recovery** uses `last_insert_rowid()` arithmetic instead of `ORDER BY id DESC LIMIT N`, which raced under concurrent inserts
27
+ - **String metadata filter** uses exact equality (`=`) instead of `LIKE` substring match — `{"type": "doc"}` no longer matches `"markdown_doc"`
28
+ - **update_metadata_batch** wrapped in single transaction (`with self.conn`) to prevent partial commits on crash
29
+ - **rebuild_index** uses `if x is not None` instead of `x or default` so passing `connectivity=0` no longer silently uses the default
30
+ - **search_collections** parallel futures now have a 30s timeout — one hung collection can no longer block the entire cross-collection search
31
+ - **AsyncVectorDB.close()** uses `shutdown(wait=False, cancel_futures=True)` instead of blocking `shutdown(wait=True)` which could hang forever on stuck tasks
32
+ - **Recursive CTE safety cap** — `get_descendants`/`get_ancestors` apply `MAX_HIERARCHY_DEPTH=100` when `max_depth=None` to prevent infinite recursion from parent_id cycles
33
+ - **RateLimiter cleanup** capped to 500 evictions per call to bound lock hold time under high bucket counts
34
+ - **HuggingFace download** now uses `etag_timeout=30` with local-cache fallback on network failure
35
+ - **embed_texts** rejects batches over 10,000 texts to prevent unbounded CPU time
36
+ - **retry_on_lock** adds `total_timeout=10s` budget — gives up early if cumulative sleep would exceed the budget
37
+
38
+ ### Changed
39
+
40
+ - **`__version__`** now read from package metadata via `importlib.metadata` (single source of truth in `pyproject.toml`)
41
+ - **Upsert in usearch_index** separates conflict detection from removal for clearer flow
42
+
8
43
  ## [2.3.0] - 2026-03-08
9
44
 
10
45
  ### Breaking Changes
@@ -485,6 +520,7 @@ Benchmarks on i9-13900K & RTX 4090 with 10k vectors (384-dim):
485
520
  - **Documentation**: https://coderdayton.github.io/simplevecdb/
486
521
  - **License**: MIT
487
522
 
523
+ [2.4.0]: https://github.com/coderdayton/simplevecdb/releases/tag/v2.4.0
488
524
  [2.3.0]: https://github.com/coderdayton/simplevecdb/releases/tag/v2.3.0
489
525
  [2.2.1]: https://github.com/coderdayton/simplevecdb/releases/tag/v2.2.1
490
526
  [2.2.0]: https://github.com/coderdayton/simplevecdb/releases/tag/v2.2.0
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "simplevecdb"
3
- version = "2.3.0"
3
+ version = "2.5.0"
4
4
  description = "Dead-simple local vector database powered by usearch HNSW."
5
5
  authors = [{ name = "Dayton Dunbar", email = "coderdayton14@gmail.com" }]
6
6
  license = { text = "MIT" }
@@ -16,10 +16,18 @@ try:
16
16
  except ImportError:
17
17
  pass
18
18
  from .logging import get_logger, configure_logging, log_operation
19
- from .utils import DatabaseLockedError, retry_on_lock, validate_filter
19
+ from .utils import (
20
+ DatabaseLockedError,
21
+ async_retry_on_lock,
22
+ file_lock,
23
+ retry_on_lock,
24
+ validate_filter,
25
+ )
20
26
  from .encryption import EncryptionError, EncryptionUnavailableError
21
27
 
22
- __version__ = "2.3.0"
28
+ from importlib.metadata import version as _pkg_version
29
+
30
+ __version__ = _pkg_version("simplevecdb")
23
31
  __all__ = [
24
32
  # Core classes
25
33
  "VectorDB",
@@ -47,6 +55,8 @@ __all__ = [
47
55
  "MigrationRequiredError",
48
56
  "EncryptionError",
49
57
  "EncryptionUnavailableError",
58
+ "async_retry_on_lock",
59
+ "file_lock",
50
60
  "retry_on_lock",
51
61
  "validate_filter",
52
62
  ]