simplevecdb 2.3.0__tar.gz → 2.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {simplevecdb-2.3.0/docs → simplevecdb-2.5.0}/CHANGELOG.md +182 -80
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/PKG-INFO +48 -4
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/README.md +47 -3
- {simplevecdb-2.3.0 → simplevecdb-2.5.0/docs}/CHANGELOG.md +36 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/pyproject.toml +1 -1
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/src/simplevecdb/__init__.py +12 -2
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/src/simplevecdb/async_core.py +174 -14
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/src/simplevecdb/constants.py +17 -1
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/src/simplevecdb/core.py +218 -30
- simplevecdb-2.5.0/src/simplevecdb/embeddings/__init__.py +12 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/src/simplevecdb/embeddings/models.py +33 -5
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/src/simplevecdb/embeddings/server.py +206 -38
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/src/simplevecdb/engine/catalog.py +152 -80
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/src/simplevecdb/engine/quantization.py +6 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/src/simplevecdb/engine/search.py +68 -27
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/src/simplevecdb/engine/usearch_index.py +64 -49
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/src/simplevecdb/utils.py +155 -1
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/integration/test_langchain.py +1 -1
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/integration/test_server.py +10 -8
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/core/test_core_additional_coverage.py +1 -1
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/core/test_filters.py +5 -5
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/core/test_initialization.py +2 -2
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/core/test_missing_coverage.py +1 -1
- simplevecdb-2.5.0/tests/unit/core/test_v25_correctness.py +265 -0
- simplevecdb-2.5.0/tests/unit/core/test_v25_features.py +344 -0
- simplevecdb-2.5.0/tests/unit/core/test_v25_robustness.py +312 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/embeddings/test_server.py +33 -25
- simplevecdb-2.5.0/tests/unit/embeddings/test_v25_enhancements.py +382 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/test_core.py +6 -6
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/uv.lock +1 -1
- simplevecdb-2.3.0/src/simplevecdb/embeddings/__init__.py +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/.bandit +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/.env.example +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/.github/FUNDING.yml +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/.github/ISSUE_TEMPLATE/config.yml +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/.github/dependabot.yml +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/.github/workflows/ci.yml +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/.github/workflows/publish.yml +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/.github/workflows/security.yml +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/.github/workflows/update-sponsors.yml +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/.gitignore +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/.pre-commit-config.yaml +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/.python-version +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/CODE_OF_CONDUCT.md +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/CONTRIBUTING.md +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/LICENSE +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/SECURITY.md +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/docs/CONTRIBUTING.md +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/docs/ENV_SETUP.md +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/docs/LICENSE +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/docs/api/async.md +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/docs/api/config.md +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/docs/api/core.md +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/docs/api/embeddings.md +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/docs/api/encryption.md +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/docs/api/engine/catalog.md +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/docs/api/engine/quantization.md +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/docs/api/engine/search.md +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/docs/api/integrations.md +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/docs/api/types.md +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/docs/benchmarks.md +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/docs/examples.md +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/docs/guides/clustering.md +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/docs/index.md +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/examples/auto_embed.py +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/examples/backend_benchmark.py +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/examples/embeddings/perf_benchmark.py +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/examples/quant_benchmark.py +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/examples/rag/langchain_rag.ipynb +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/examples/rag/llama_rag.ipynb +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/examples/rag/ollama_rag.ipynb +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/examples/smoke_test.py +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/mkdocs.yml +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/src/simplevecdb/config.py +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/src/simplevecdb/encryption.py +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/src/simplevecdb/engine/__init__.py +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/src/simplevecdb/engine/clustering.py +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/src/simplevecdb/integrations/__init__.py +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/src/simplevecdb/integrations/langchain.py +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/src/simplevecdb/integrations/llamaindex.py +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/src/simplevecdb/logging.py +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/src/simplevecdb/types.py +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/conftest.py +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/integration/test_llamaindex.py +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/integration/test_rag.py +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/integration/test_v21_features.py +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/perf/test_batch_detection.py +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/perf/test_performance.py +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/core/__init__.py +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/core/test_batch_detection.py +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/core/test_factory_methods.py +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/core/test_quantization.py +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/core/test_similarity_search.py +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/embeddings/__init__.py +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/embeddings/test_models.py +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/embeddings/test_server_coverage.py +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/integrations/__init__.py +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/integrations/test_langchain_coverage.py +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/integrations/test_llamaindex_coverage.py +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/test_async.py +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/test_async_coverage.py +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/test_catalog_coverage.py +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/test_clustering.py +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/test_config.py +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/test_cross_collection_search.py +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/test_encryption.py +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/test_encryption_coverage.py +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/test_error_handling.py +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/test_hierarchy.py +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/test_multi_collection.py +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/test_search.py +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/test_search_coverage.py +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/test_search_missing_coverage.py +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/test_streaming.py +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/test_types.py +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/test_usearch_index_missing_coverage.py +0 -0
- {simplevecdb-2.3.0 → simplevecdb-2.5.0}/tests/unit/test_utils.py +0 -0
|
@@ -5,106 +5,202 @@ All notable changes to SimpleVecDB will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
-
## [2.
|
|
8
|
+
## [2.5.0] - 2026-04-07
|
|
9
9
|
|
|
10
10
|
### Added
|
|
11
11
|
|
|
12
|
-
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
-
|
|
22
|
-
-
|
|
23
|
-
|
|
24
|
-
-
|
|
25
|
-
-
|
|
26
|
-
-
|
|
27
|
-
- `
|
|
28
|
-
|
|
29
|
-
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
- **
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
- **
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
12
|
+
- **`delete_collection(name)`** — drop a collection's SQLite tables, FTS index, and usearch file in one call. Available on both `VectorDB` and `AsyncVectorDB`.
|
|
13
|
+
- **`store_embeddings` parameter** on `collection()` — opt into storing embedding BLOBs in SQLite (default `False`). Leaving it off roughly halves storage (~2x savings); MMR transparently fetches vectors from the usearch index when BLOBs are absent.
|
|
14
|
+
- **`async_retry_on_lock` decorator** — async variant of `retry_on_lock` using `asyncio.sleep` instead of `time.sleep`, avoiding executor thread blocking.
|
|
15
|
+
- **`file_lock` context manager** — advisory cross-process file locking (`fcntl`/`msvcrt`) for usearch index files. Prevents corruption from concurrent processes.
|
|
16
|
+
- **`__repr__`** on `VectorDB`, `VectorCollection`, `AsyncVectorDB`, `AsyncVectorCollection` for debuggable string representations.
|
|
17
|
+
- **FLOAT16 quantization** fully implemented in `serialize()`/`deserialize()` — was previously defined in the enum but raised `ValueError` at runtime.
|
|
18
|
+
- **Pagination** on `get_documents(limit=, offset=)` and catalog methods (`find_ids_by_filter`, `find_ids_by_texts`) — previously returned unbounded result sets.
|
|
19
|
+
- **Embeddings server enhancements:**
|
|
20
|
+
- Graceful shutdown with SIGTERM/SIGINT draining (10s timeout)
|
|
21
|
+
- CORS middleware with configurable origins for browser-based clients
|
|
22
|
+
- Model warm-up on startup (skip with `--no-warmup`)
|
|
23
|
+
- Input validation: rejects empty strings (422) and texts exceeding 100k chars (413)
|
|
24
|
+
- Proper `argparse` CLI with `--host`, `--port`, `--no-warmup`, `--help`
|
|
25
|
+
- Startup banner logging config summary (host, port, model, auth, rate limits)
|
|
26
|
+
- Nested token array normalization (`list[list[int]]` input format)
|
|
27
|
+
- Async executor offload for `embed_texts` (non-blocking event loop)
|
|
28
|
+
- OpenAPI version synced from package metadata
|
|
29
|
+
- Module `__init__.py` exports (`embed_texts`, `get_embedder`, `load_model`, `app`, `run_server`)
|
|
30
|
+
|
|
31
|
+
### Fixed
|
|
32
|
+
|
|
33
|
+
- **`delete_by_ids` ordering** — SQLite deletion now happens first (transactional, can roll back), then usearch. Previously, usearch entries were removed first, leaving orphaned catalog entries on SQLite failure.
|
|
34
|
+
- **`_matches_filter` string semantics** — now uses exact equality, consistent with SQL `build_filter_clause`. Previously used a substring match (`value in str(meta_value)`).
|
|
35
|
+
- **`list_collections`** — scans `sqlite_master` for persisted collection tables instead of returning only session-cached names. Works across reopened databases.
|
|
36
|
+
- **WAL mode for encrypted databases** — `PRAGMA journal_mode=WAL` and `PRAGMA synchronous=NORMAL` now set for SQLCipher connections (was only set for unencrypted).
|
|
37
|
+
- **`collection()` cache key** — includes `distance_strategy` and `quantization` in cache key (sync version). Previously cached by name only, silently ignoring differing params on cache hit.
|
|
38
|
+
- **`_ensure_fts_table`** — retries up to 3 times on transient "database is locked" errors instead of permanently disabling FTS on first failure.
|
|
39
|
+
- **Connection health check** — `SELECT 1` probe after connection creation; raises `RuntimeError` immediately on corrupt databases.
|
|
40
|
+
|
|
41
|
+
### Improved
|
|
42
|
+
|
|
43
|
+
- **Usearch batch operations** — `add()`, `remove()`, and `get()` now use batch usearch APIs instead of per-key loops. Significant speedup for large operations.
|
|
44
|
+
- **Filtered search iterative deepening** — replaces fixed `k*3` overfetch with adaptive doubling (up to `k*30`). Highly selective filters now reliably return `k` results.
|
|
45
|
+
- **Memory-map heuristic** — uses file size threshold (50MB) instead of inaccurate `file_size // 100` vector count estimate for mmap vs load decision.
|
|
46
|
+
- **Apple chip detection** — uses `platform.processor()` instead of spawning a `sysctl` subprocess.
|
|
47
|
+
|
|
48
|
+
### Removed
|
|
49
|
+
|
|
50
|
+
- **Duplicate `_dim` property** — removed in favor of the public `dim` property.
|
|
51
|
+
|
|
52
|
+
### Breaking Changes
|
|
53
|
+
|
|
54
|
+
- String metadata filters now use exact equality (was substring match).
|
|
55
|
+
- `store_embeddings` defaults to `False` — `rebuild_index()` requires `store_embeddings=True` or re-adding documents.
|
|
56
|
+
|
|
57
|
+
## [2.4.0] - 2026-03-22
|
|
58
|
+
|
|
59
|
+
### Added
|
|
60
|
+
|
|
61
|
+
- **Public catalog API on VectorCollection + AsyncVectorCollection:**
|
|
62
|
+
- `get_documents(filter_dict=)` — replaces private `_catalog` access
|
|
63
|
+
- `get_embeddings_by_ids(ids)` — fetch stored embeddings
|
|
64
|
+
- `update_metadata(updates)` — batch metadata merge
|
|
65
|
+
- `count()`, `save()`, `dim` property — async wrappers
|
|
66
|
+
- `add_texts(parent_ids=, threads=)` — full param support on async
|
|
67
|
+
- `rebuild_index`, `get_children/parent/descendants/ancestors`, `set_parent` — async hierarchy API
|
|
68
|
+
- **Executor injection on AsyncVectorDB** — accept optional `executor` keyword argument so consumers can share a single-threaded executor for ONNX/usearch thread safety; `close()` only shuts down executor when `_owns_executor` is True
|
|
69
|
+
- **Safety constants** in `constants.py`: `SEARCH_COLLECTION_TIMEOUT`, `EXECUTOR_SHUTDOWN_TIMEOUT`, `MAX_HIERARCHY_DEPTH`
|
|
70
|
+
|
|
71
|
+
### Fixed
|
|
72
|
+
|
|
73
|
+
- **VectorDB.close()** now calls `conn.close()` — was leaking file descriptors when `save()` succeeded but connection was never closed
|
|
74
|
+
- **VectorDB.close()** wraps `save()` in `try/finally` so `conn.close()` always runs even if index serialization fails
|
|
75
|
+
- **add_documents ID recovery** uses `last_insert_rowid()` arithmetic instead of `ORDER BY id DESC LIMIT N`, which raced under concurrent inserts
|
|
76
|
+
- **String metadata filter** uses exact equality (`=`) instead of `LIKE` substring match — `{"type": "doc"}` no longer matches `"markdown_doc"`
|
|
77
|
+
- **update_metadata_batch** wrapped in single transaction (`with self.conn`) to prevent partial commits on crash
|
|
78
|
+
- **rebuild_index** uses `if x is not None` instead of `x or default` so passing `connectivity=0` no longer silently uses the default
|
|
79
|
+
- **search_collections** parallel futures now have a 30s timeout — one hung collection can no longer block the entire cross-collection search
|
|
80
|
+
- **AsyncVectorDB.close()** uses `shutdown(wait=False, cancel_futures=True)` instead of blocking `shutdown(wait=True)` which could hang forever on stuck tasks
|
|
81
|
+
- **Recursive CTE safety cap** — `get_descendants`/`get_ancestors` apply `MAX_HIERARCHY_DEPTH=100` when `max_depth=None` to prevent infinite recursion from parent_id cycles
|
|
82
|
+
- **RateLimiter cleanup** capped to 500 evictions per call to bound lock hold time under high bucket counts
|
|
83
|
+
- **HuggingFace download** now uses `etag_timeout=30` with local-cache fallback on network failure
|
|
84
|
+
- **embed_texts** rejects batches over 10,000 texts to prevent unbounded CPU time
|
|
85
|
+
- **retry_on_lock** adds `total_timeout=10s` budget — gives up early if cumulative sleep would exceed the budget
|
|
58
86
|
|
|
59
87
|
### Changed
|
|
60
88
|
|
|
61
|
-
-
|
|
89
|
+
- **`__version__`** now read from package metadata via `importlib.metadata` (single source of truth in `pyproject.toml`)
|
|
90
|
+
- **Upsert in usearch_index** separates conflict detection from removal for clearer flow
|
|
62
91
|
|
|
63
|
-
|
|
92
|
+
## [2.3.0] - 2026-03-08
|
|
64
93
|
|
|
65
|
-
|
|
66
|
-
- 16 core clustering tests (algorithms, auto-tagging, metadata persistence, edge cases)
|
|
67
|
-
- 4 cluster metrics tests (inertia, silhouette, metrics method)
|
|
68
|
-
- 6 cluster persistence tests (save/load/list/delete/assign)
|
|
69
|
-
- Added 3 async clustering tests in `tests/unit/test_async.py`
|
|
70
|
-
- Total test count: 305 (up from 292)
|
|
94
|
+
### Breaking Changes
|
|
71
95
|
|
|
72
|
-
|
|
96
|
+
- **Integration dependencies are now optional.** LangChain and LlamaIndex packages are no longer installed by default. Install with `pip install simplevecdb[integrations]` to use them. Existing users upgrading from v2.2.x will see a clear ImportError with migration instructions.
|
|
73
97
|
|
|
74
|
-
|
|
98
|
+
### Added
|
|
75
99
|
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
100
|
+
- **`[integrations]` optional extra** — Install LangChain and LlamaIndex dependencies only when needed, reducing default install footprint
|
|
101
|
+
- **Runtime import guards** in integration modules with v2.3.0 migration messaging
|
|
102
|
+
- **Lazy `__getattr__` loading** in `integrations/__init__.py` — integration classes are only imported when accessed
|
|
103
|
+
- **Input validation guards** on search methods:
|
|
104
|
+
- `similarity_search`, `similarity_search_batch`, `keyword_search`, `hybrid_search` now reject `k <= 0`
|
|
105
|
+
- `add_texts` validates length consistency of `metadatas`, `embeddings`, `ids`, and `parent_ids` against `texts`
|
|
106
|
+
- **NaN/Inf validation** for float values in metadata filters (`utils.validate_filter`)
|
|
107
|
+
- **Empty list rejection** for list filter values
|
|
108
|
+
- **Double-close protection** on `VectorDB` with `_closed` flag
|
|
109
|
+
- **Context manager protocol** (`__enter__`/`__exit__`) on `VectorDB`
|
|
110
|
+
- **Table name validation** in `check_migration` (defense-in-depth against SQL injection)
|
|
111
|
+
- **Graceful per-future error handling** in `search_collections`
|
|
112
|
+
- **Adaptive batch search threshold** — queries below `USEARCH_BATCH_THRESHOLD` (10) use sequential search to avoid batch overhead
|
|
79
113
|
|
|
80
|
-
|
|
114
|
+
### Changed
|
|
81
115
|
|
|
82
|
-
|
|
116
|
+
- **Python dev target changed to 3.12** (`.python-version`); `requires-python` remains `>=3.10`
|
|
117
|
+
- **Version bumped to 2.3.0**
|
|
118
|
+
- **Performance: MMR search vectorized** — pre-normalize embeddings once, use `sel_matrix @ emb` matrix-vector multiply instead of Python inner loop, O(1) `list.pop` replaces O(n) `list.remove`, hoist `1 - lambda_mult` loop invariant
|
|
119
|
+
- **Performance: merged SQL round-trips in MMR** — new `get_documents_and_embeddings_by_ids` fetches text, metadata, and embeddings in a single query (previously two separate SELECTs)
|
|
120
|
+
- **Performance: `get_parent` collapsed** from 2 sequential SELECTs to 1 self-JOIN
|
|
121
|
+
- **Performance: `add_documents` ID recovery** — skip redundant `SELECT ORDER BY DESC` when explicit IDs are provided; removed unnecessary `list(texts)` copy
|
|
122
|
+
- **Performance: FLOAT serialization** — `np.asarray().tobytes()` replaces `struct.pack` with per-element Python loop (single C memcpy)
|
|
123
|
+
- **Performance: `np.array` → `np.asarray`** on every search and insert path to avoid unnecessary copies
|
|
124
|
+
- **Performance: SQL placeholder strings** — `",".join(["?"] * len(ids))` replaces generator expression across all 9 call sites
|
|
125
|
+
- **Performance: batched numpy conversion** in `add_texts` — single `np.asarray` call instead of per-item conversion
|
|
126
|
+
- **Performance: compact JSON separators** in catalog serialization
|
|
127
|
+
- **Performance: deduplicated `.tolist()` calls** in search engine
|
|
128
|
+
- **Performance: `np.unique(ravel())`** for batch key collection in `similarity_search_batch`
|
|
129
|
+
- **Performance: usearch upsert** — skip contains-check loop on empty index, cache `int(key)` once per iteration
|
|
130
|
+
- **Performance: cluster table DDL** — `_cluster_table_ready` flag skips `CREATE TABLE IF NOT EXISTS` on repeated calls; cached `_cluster_table_name`
|
|
131
|
+
- **`_normalize_key`** now delegates to `_derive_key` instead of duplicating PBKDF2 logic
|
|
132
|
+
- **HNSW defaults** in `usearch_index.py` now sourced from `constants.py` (removed local duplicates)
|
|
133
|
+
- **Collection name regex** uses `constants.COLLECTION_NAME_PATTERN` instead of hardcoded pattern
|
|
134
|
+
- **`VectorDB` defaults** for `distance_strategy` and `quantization` sourced from `constants.DEFAULT_DISTANCE_STRATEGY` / `constants.DEFAULT_QUANTIZATION`
|
|
135
|
+
- **`_batched` utility** moved from `core.py` to `utils.py` for reuse; now used in `catalog.py` batch updates
|
|
136
|
+
- **`auto_tag`** uses `defaultdict(list)` instead of manual if-not-in pattern
|
|
137
|
+
- **`import random`** hoisted to module level in `utils.py` (was inside retry loop)
|
|
138
|
+
- **Streaming placeholder bug fixed** — `_process_streaming_batch` now correctly detects `None` placeholders (previously used empty list `[]`, preventing auto-embedding replacement)
|
|
139
|
+
- **README updated** to document `pip install simplevecdb[integrations]` installation
|
|
140
|
+
|
|
141
|
+
### Removed
|
|
142
|
+
|
|
143
|
+
- LangChain and LlamaIndex packages from core `[project.dependencies]` (moved to `[project.optional-dependencies] integrations`)
|
|
144
|
+
- Duplicated HNSW default constants from `usearch_index.py` (now single source in `constants.py`)
|
|
145
|
+
- Unused `struct` import from `quantization.py`
|
|
146
|
+
- Unused `itertools` import from `core.py`
|
|
147
|
+
|
|
148
|
+
## [2.2.1] - 2026-01-27
|
|
83
149
|
|
|
84
|
-
|
|
85
|
-
from simplevecdb import VectorDB
|
|
150
|
+
### Changed
|
|
86
151
|
|
|
87
|
-
|
|
88
|
-
|
|
152
|
+
- Moved integration dependencies (langchain-core, langchain-openai, llama-index) from dev to main dependencies for easier installation
|
|
153
|
+
- Added bandit to dev dependencies for security linting in pre-commit
|
|
154
|
+
- Cleaned up duplicate dev dependency definitions
|
|
89
155
|
|
|
90
|
-
|
|
91
|
-
result = collection.cluster(n_clusters=5, algorithm="minibatch_kmeans")
|
|
156
|
+
## [2.2.0] - 2026-01-26
|
|
92
157
|
|
|
93
|
-
|
|
94
|
-
tags = collection.auto_tag(result, method="tfidf", n_keywords=3)
|
|
95
|
-
collection.assign_cluster_metadata(result, tags)
|
|
158
|
+
### Added
|
|
96
159
|
|
|
97
|
-
|
|
98
|
-
collection.save_cluster("categories", result, metadata={"tags": tags})
|
|
160
|
+
- Version 2.2.0 release
|
|
99
161
|
|
|
100
|
-
|
|
101
|
-
new_ids = collection.add_texts(new_texts, embeddings=new_embeddings)
|
|
102
|
-
collection.assign_to_cluster("categories", new_ids)
|
|
162
|
+
## [2.1.0] - 2026-01-01
|
|
103
163
|
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
164
|
+
### Added
|
|
165
|
+
|
|
166
|
+
- **SQLCipher Encryption Support** - Full at-rest encryption for sensitive data:
|
|
167
|
+
- `VectorDB(path, encryption_key="...")` enables AES-256 page-level database encryption
|
|
168
|
+
- Uses SQLCipher for transparent SQLite encryption (PRAGMA key)
|
|
169
|
+
- Usearch index files encrypted with AES-256-GCM (`.usearch.enc`)
|
|
170
|
+
- Zero performance overhead during search (decrypt on load, encrypt on save only)
|
|
171
|
+
- Key derivation: PBKDF2-SHA256 with 480,000 iterations for passphrases
|
|
172
|
+
- Install with `pip install simplevecdb[encryption]`
|
|
173
|
+
|
|
174
|
+
- **New encryption module** (`simplevecdb.encryption`):
|
|
175
|
+
- `create_encrypted_connection()` - SQLCipher connection factory
|
|
176
|
+
- `is_database_encrypted()` - Check if a database file is encrypted
|
|
177
|
+
- `encrypt_index_file()` / `decrypt_index_file()` - Index file encryption
|
|
178
|
+
- `EncryptionError` / `EncryptionUnavailableError` - New exception types
|
|
179
|
+
|
|
180
|
+
- **Streaming Insert API** - Memory-efficient large-scale ingestion:
|
|
181
|
+
- `collection.add_texts_streaming(iterable)` - Process from any iterator/generator
|
|
182
|
+
- Configurable `batch_size` parameter (default: config.EMBEDDING_BATCH_SIZE)
|
|
183
|
+
- Yields `StreamingProgress` after each batch for monitoring
|
|
184
|
+
- Optional `on_progress` callback for custom logging/UI updates
|
|
185
|
+
- New types: `StreamingProgress`, `ProgressCallback`
|
|
186
|
+
|
|
187
|
+
- **Hierarchical Document Relationships** - Parent/child document structure:
|
|
188
|
+
- `parent_ids` parameter in `add_texts()` to link documents
|
|
189
|
+
- `get_children(doc_id)` - Get direct child documents
|
|
190
|
+
- `get_parent(doc_id)` - Get parent document
|
|
191
|
+
- `get_descendants(doc_id, max_depth)` - Recursive children traversal
|
|
192
|
+
- `get_ancestors(doc_id, max_depth)` - Path to root
|
|
193
|
+
- `set_parent(doc_id, parent_id)` - Update relationships
|
|
194
|
+
- Uses SQLite recursive CTE for efficient traversal
|
|
195
|
+
- Auto-migrates existing databases (adds `parent_id` column)
|
|
196
|
+
|
|
197
|
+
### Changed
|
|
198
|
+
|
|
199
|
+
- `check_migration()` now gracefully handles encrypted databases (returns `needs_migration=False`)
|
|
200
|
+
|
|
201
|
+
### Dependencies
|
|
202
|
+
|
|
203
|
+
- New optional dependency group `[encryption]`: `sqlcipher3-binary>=0.5.0`, `cryptography>=41.0`
|
|
108
204
|
|
|
109
205
|
## [2.0.0] - 2025-12-23
|
|
110
206
|
|
|
@@ -473,6 +569,12 @@ Benchmarks on i9-13900K & RTX 4090 with 10k vectors (384-dim):
|
|
|
473
569
|
- **Documentation**: https://coderdayton.github.io/simplevecdb/
|
|
474
570
|
- **License**: MIT
|
|
475
571
|
|
|
572
|
+
[2.4.0]: https://github.com/coderdayton/simplevecdb/releases/tag/v2.4.0
|
|
573
|
+
[2.3.0]: https://github.com/coderdayton/simplevecdb/releases/tag/v2.3.0
|
|
574
|
+
[2.2.1]: https://github.com/coderdayton/simplevecdb/releases/tag/v2.2.1
|
|
575
|
+
[2.2.0]: https://github.com/coderdayton/simplevecdb/releases/tag/v2.2.0
|
|
576
|
+
[2.1.0]: https://github.com/coderdayton/simplevecdb/releases/tag/v2.1.0
|
|
577
|
+
[2.0.0]: https://github.com/coderdayton/simplevecdb/releases/tag/v2.0.0
|
|
476
578
|
[1.3.0]: https://github.com/coderdayton/simplevecdb/releases/tag/v1.3.0
|
|
477
579
|
[1.2.0]: https://github.com/coderdayton/simplevecdb/releases/tag/v1.2.0
|
|
478
580
|
[1.1.1]: https://github.com/coderdayton/simplevecdb/releases/tag/v1.1.1
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: simplevecdb
|
|
3
|
-
Version: 2.3.0
|
|
3
|
+
Version: 2.5.0
|
|
4
4
|
Summary: Dead-simple local vector database powered by usearch HNSW.
|
|
5
5
|
Author-email: Dayton Dunbar <coderdayton14@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -43,7 +43,7 @@ SimpleVecDB brings **Chroma-like simplicity** to a single **SQLite file**. Built
|
|
|
43
43
|
- **Zero Infrastructure** — Just a `.db` file. No Docker, no Redis, no cloud bills.
|
|
44
44
|
- **Blazing Fast** — 10-100x faster search via usearch HNSW. Adaptive: brute-force for <10k vectors (perfect recall), HNSW for larger collections.
|
|
45
45
|
- **Truly Portable** — Runs anywhere SQLite runs: Linux, macOS, Windows, even WASM.
|
|
46
|
-
- **Async Ready** — Full async/await support
|
|
46
|
+
- **Async Ready** — Full async/await support with optional executor injection for thread-safe ONNX/usearch sharing.
|
|
47
47
|
- **Batteries Included** — Optional FastAPI embeddings server + LangChain/LlamaIndex integrations via `[integrations]` extra.
|
|
48
48
|
- **Production Ready** — Hybrid search (BM25 + vector), metadata filtering, multi-collection support, and automatic hardware acceleration.
|
|
49
49
|
|
|
@@ -169,10 +169,13 @@ hybrid = collection.hybrid_search("powerhouse cell", k=2)
|
|
|
169
169
|
**Optional: Run embeddings server (OpenAI-compatible)**
|
|
170
170
|
|
|
171
171
|
```bash
|
|
172
|
-
simplevecdb-server --port 8000
|
|
172
|
+
simplevecdb-server --port 8000 # Default model, auto warm-up
|
|
173
|
+
simplevecdb-server --host 0.0.0.0 --port 9000 # Bind to all interfaces
|
|
174
|
+
simplevecdb-server --no-warmup # Skip model preload on startup
|
|
175
|
+
simplevecdb-server --help # Show all options
|
|
173
176
|
```
|
|
174
177
|
|
|
175
|
-
See [Setup Guide](ENV_SETUP.md) for configuration: model registry, rate limits, API keys, CUDA optimization.
|
|
178
|
+
See [Setup Guide](ENV_SETUP.md) for configuration: model registry, rate limits, API keys, CORS, CUDA optimization.
|
|
176
179
|
|
|
177
180
|
### Option 3: With LangChain or LlamaIndex
|
|
178
181
|
|
|
@@ -321,6 +324,36 @@ parent = collection.get_parent(child_ids[0])
|
|
|
321
324
|
descendants = collection.get_descendants(parent_ids[0])
|
|
322
325
|
```
|
|
323
326
|
|
|
327
|
+
### Document Management (v2.4+)
|
|
328
|
+
|
|
329
|
+
Query and update documents without touching private internals:
|
|
330
|
+
|
|
331
|
+
```python
|
|
332
|
+
# Get all documents (with optional metadata filter)
|
|
333
|
+
docs = collection.get_documents(filter_dict={"category": "tech"})
|
|
334
|
+
for doc_id, text, metadata in docs:
|
|
335
|
+
    print(f"[{doc_id}] {text[:50]}...")
|
|
336
|
+
|
|
337
|
+
# Paginated access (v2.5+)
|
|
338
|
+
page1 = collection.get_documents(limit=100)
|
|
339
|
+
page2 = collection.get_documents(limit=100, offset=100)
|
|
340
|
+
|
|
341
|
+
# Fetch stored embeddings
|
|
342
|
+
embeddings = collection.get_embeddings_by_ids([1, 2, 3])
|
|
343
|
+
|
|
344
|
+
# Batch update metadata (shallow merge)
|
|
345
|
+
collection.update_metadata([
|
|
346
|
+
(1, {"reviewed": True}),
|
|
347
|
+
(2, {"reviewed": True, "score": 0.95}),
|
|
348
|
+
])
|
|
349
|
+
|
|
350
|
+
# Quick stats
|
|
351
|
+
print(f"Collection has {collection.count()} documents, dim={collection.dim}")
|
|
352
|
+
|
|
353
|
+
# Delete an entire collection (v2.5+)
|
|
354
|
+
db.delete_collection("old_data")
|
|
355
|
+
```
|
|
356
|
+
|
|
324
357
|
### Vector Clustering (v2.2+)
|
|
325
358
|
|
|
326
359
|
Discover natural groupings in your embeddings:
|
|
@@ -359,6 +392,12 @@ Supports K-means, MiniBatch K-means, and HDBSCAN. See [Clustering Guide](https:/
|
|
|
359
392
|
| **Document Hierarchies** | ✅ | Parent/child relationships for chunked docs |
|
|
360
393
|
| **Vector Clustering** | ✅ | K-means, MiniBatch K-means, HDBSCAN with auto-tagging (v2.2+) |
|
|
361
394
|
| **Cluster Persistence** | ✅ | Save/load cluster centroids for fast assignment (v2.2+) |
|
|
395
|
+
| **Public Catalog API** | ✅ | `get_documents`, `get_embeddings_by_ids`, `update_metadata` (v2.4+) |
|
|
396
|
+
| **Executor Injection** | ✅ | Share thread pool across async instances for ONNX safety (v2.4+) |
|
|
397
|
+
| **Collection Management** | ✅ | `delete_collection()`, paginated `get_documents(limit=, offset=)` (v2.5+) |
|
|
398
|
+
| **Cross-Process Safety** | ✅ | Advisory file locking on usearch index files (v2.5+) |
|
|
399
|
+
| **FLOAT16 Quantization** | ✅ | Half-precision storage with 2x compression (v2.5+) |
|
|
400
|
+
| **Embeddings Server** | ✅ | CORS, graceful shutdown, input validation, model warm-up (v2.5+) |
|
|
362
401
|
|
|
363
402
|
## Performance Benchmarks
|
|
364
403
|
|
|
@@ -429,6 +468,11 @@ pip install torch --index-url https://download.pytorch.org/whl/cu118
|
|
|
429
468
|
- [x] Hierarchical document relationships (parent/child)
|
|
430
469
|
- [x] Cross-collection search
|
|
431
470
|
- [x] Vector clustering and auto-tagging (v2.2)
|
|
471
|
+
- [x] Public catalog API for document management (v2.4)
|
|
472
|
+
- [x] Async executor injection for thread-safe sharing (v2.4)
|
|
473
|
+
- [x] Collection management: `delete_collection()`, pagination (v2.5)
|
|
474
|
+
- [x] Cross-process file locking and connection health checks (v2.5)
|
|
475
|
+
- [x] Embeddings server hardening: CORS, graceful shutdown, input validation (v2.5)
|
|
432
476
|
- [ ] Incremental clustering (online learning)
|
|
433
477
|
- [ ] Cluster visualization exports
|
|
434
478
|
|
|
@@ -14,7 +14,7 @@ SimpleVecDB brings **Chroma-like simplicity** to a single **SQLite file**. Built
|
|
|
14
14
|
- **Zero Infrastructure** — Just a `.db` file. No Docker, no Redis, no cloud bills.
|
|
15
15
|
- **Blazing Fast** — 10-100x faster search via usearch HNSW. Adaptive: brute-force for <10k vectors (perfect recall), HNSW for larger collections.
|
|
16
16
|
- **Truly Portable** — Runs anywhere SQLite runs: Linux, macOS, Windows, even WASM.
|
|
17
|
-
- **Async Ready** — Full async/await support
|
|
17
|
+
- **Async Ready** — Full async/await support with optional executor injection for thread-safe ONNX/usearch sharing.
|
|
18
18
|
- **Batteries Included** — Optional FastAPI embeddings server + LangChain/LlamaIndex integrations via `[integrations]` extra.
|
|
19
19
|
- **Production Ready** — Hybrid search (BM25 + vector), metadata filtering, multi-collection support, and automatic hardware acceleration.
|
|
20
20
|
|
|
@@ -140,10 +140,13 @@ hybrid = collection.hybrid_search("powerhouse cell", k=2)
|
|
|
140
140
|
**Optional: Run embeddings server (OpenAI-compatible)**
|
|
141
141
|
|
|
142
142
|
```bash
|
|
143
|
-
simplevecdb-server --port 8000
|
|
143
|
+
simplevecdb-server --port 8000 # Default model, auto warm-up
|
|
144
|
+
simplevecdb-server --host 0.0.0.0 --port 9000 # Bind to all interfaces
|
|
145
|
+
simplevecdb-server --no-warmup # Skip model preload on startup
|
|
146
|
+
simplevecdb-server --help # Show all options
|
|
144
147
|
```
|
|
145
148
|
|
|
146
|
-
See [Setup Guide](ENV_SETUP.md) for configuration: model registry, rate limits, API keys, CUDA optimization.
|
|
149
|
+
See [Setup Guide](ENV_SETUP.md) for configuration: model registry, rate limits, API keys, CORS, CUDA optimization.
|
|
147
150
|
|
|
148
151
|
### Option 3: With LangChain or LlamaIndex
|
|
149
152
|
|
|
@@ -292,6 +295,36 @@ parent = collection.get_parent(child_ids[0])
|
|
|
292
295
|
descendants = collection.get_descendants(parent_ids[0])
|
|
293
296
|
```
|
|
294
297
|
|
|
298
|
+
### Document Management (v2.4+)
|
|
299
|
+
|
|
300
|
+
Query and update documents without touching private internals:
|
|
301
|
+
|
|
302
|
+
```python
|
|
303
|
+
# Get all documents (with optional metadata filter)
|
|
304
|
+
docs = collection.get_documents(filter_dict={"category": "tech"})
|
|
305
|
+
for doc_id, text, metadata in docs:
|
|
306
|
+
    print(f"[{doc_id}] {text[:50]}...")
|
|
307
|
+
|
|
308
|
+
# Paginated access (v2.5+)
|
|
309
|
+
page1 = collection.get_documents(limit=100)
|
|
310
|
+
page2 = collection.get_documents(limit=100, offset=100)
|
|
311
|
+
|
|
312
|
+
# Fetch stored embeddings
|
|
313
|
+
embeddings = collection.get_embeddings_by_ids([1, 2, 3])
|
|
314
|
+
|
|
315
|
+
# Batch update metadata (shallow merge)
|
|
316
|
+
collection.update_metadata([
|
|
317
|
+
(1, {"reviewed": True}),
|
|
318
|
+
(2, {"reviewed": True, "score": 0.95}),
|
|
319
|
+
])
|
|
320
|
+
|
|
321
|
+
# Quick stats
|
|
322
|
+
print(f"Collection has {collection.count()} documents, dim={collection.dim}")
|
|
323
|
+
|
|
324
|
+
# Delete an entire collection (v2.5+)
|
|
325
|
+
db.delete_collection("old_data")
|
|
326
|
+
```
|
|
327
|
+
|
|
295
328
|
### Vector Clustering (v2.2+)
|
|
296
329
|
|
|
297
330
|
Discover natural groupings in your embeddings:
|
|
@@ -330,6 +363,12 @@ Supports K-means, MiniBatch K-means, and HDBSCAN. See [Clustering Guide](https:/
|
|
|
330
363
|
| **Document Hierarchies** | ✅ | Parent/child relationships for chunked docs |
|
|
331
364
|
| **Vector Clustering** | ✅ | K-means, MiniBatch K-means, HDBSCAN with auto-tagging (v2.2+) |
|
|
332
365
|
| **Cluster Persistence** | ✅ | Save/load cluster centroids for fast assignment (v2.2+) |
|
|
366
|
+
| **Public Catalog API** | ✅ | `get_documents`, `get_embeddings_by_ids`, `update_metadata` (v2.4+) |
|
|
367
|
+
| **Executor Injection** | ✅ | Share thread pool across async instances for ONNX safety (v2.4+) |
|
|
368
|
+
| **Collection Management** | ✅ | `delete_collection()`, paginated `get_documents(limit=, offset=)` (v2.5+) |
|
|
369
|
+
| **Cross-Process Safety** | ✅ | Advisory file locking on usearch index files (v2.5+) |
|
|
370
|
+
| **FLOAT16 Quantization** | ✅ | Half-precision storage with 2x compression (v2.5+) |
|
|
371
|
+
| **Embeddings Server** | ✅ | CORS, graceful shutdown, input validation, model warm-up (v2.5+) |
|
|
333
372
|
|
|
334
373
|
## Performance Benchmarks
|
|
335
374
|
|
|
@@ -400,6 +439,11 @@ pip install torch --index-url https://download.pytorch.org/whl/cu118
|
|
|
400
439
|
- [x] Hierarchical document relationships (parent/child)
|
|
401
440
|
- [x] Cross-collection search
|
|
402
441
|
- [x] Vector clustering and auto-tagging (v2.2)
|
|
442
|
+
- [x] Public catalog API for document management (v2.4)
|
|
443
|
+
- [x] Async executor injection for thread-safe sharing (v2.4)
|
|
444
|
+
- [x] Collection management: `delete_collection()`, pagination (v2.5)
|
|
445
|
+
- [x] Cross-process file locking and connection health checks (v2.5)
|
|
446
|
+
- [x] Embeddings server hardening: CORS, graceful shutdown, input validation (v2.5)
|
|
403
447
|
- [ ] Incremental clustering (online learning)
|
|
404
448
|
- [ ] Cluster visualization exports
|
|
405
449
|
|
|
@@ -5,6 +5,41 @@ All notable changes to SimpleVecDB will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [2.4.0] - 2026-03-22
|
|
9
|
+
|
|
10
|
+
### Added
|
|
11
|
+
|
|
12
|
+
- **Public catalog API on VectorCollection + AsyncVectorCollection:**
|
|
13
|
+
- `get_documents(filter_dict=)` — replaces private `_catalog` access
|
|
14
|
+
- `get_embeddings_by_ids(ids)` — fetch stored embeddings
|
|
15
|
+
- `update_metadata(updates)` — batch metadata merge
|
|
16
|
+
- `count()`, `save()`, `dim` property — async wrappers
|
|
17
|
+
- `add_texts(parent_ids=, threads=)` — full param support on async
|
|
18
|
+
- `rebuild_index`, `get_children/parent/descendants/ancestors`, `set_parent` — async hierarchy API
|
|
19
|
+
- **Executor injection on AsyncVectorDB** — accepts an optional `executor` keyword argument so consumers can share a single-threaded executor for ONNX/usearch thread safety; `close()` only shuts down the executor when `_owns_executor` is True
|
|
20
|
+
- **Safety constants** in `constants.py`: `SEARCH_COLLECTION_TIMEOUT`, `EXECUTOR_SHUTDOWN_TIMEOUT`, `MAX_HIERARCHY_DEPTH`
|
|
21
|
+
|
|
22
|
+
### Fixed
|
|
23
|
+
|
|
24
|
+
- **VectorDB.close()** now calls `conn.close()` — was leaking file descriptors when `save()` succeeded but connection was never closed
|
|
25
|
+
- **VectorDB.close()** wraps `save()` in `try/finally` so `conn.close()` always runs even if index serialization fails
|
|
26
|
+
- **add_documents ID recovery** uses `last_insert_rowid()` arithmetic instead of `ORDER BY id DESC LIMIT N`, which raced under concurrent inserts
|
|
27
|
+
- **String metadata filter** uses exact equality (`=`) instead of `LIKE` substring match — `{"type": "doc"}` no longer matches `"markdown_doc"`
|
|
28
|
+
- **update_metadata_batch** wrapped in single transaction (`with self.conn`) to prevent partial commits on crash
|
|
29
|
+
- **rebuild_index** uses `if x is not None` instead of `x or default` so passing `connectivity=0` no longer silently uses the default
|
|
30
|
+
- **search_collections** parallel futures now have a 30s timeout — one hung collection can no longer block the entire cross-collection search
|
|
31
|
+
- **AsyncVectorDB.close()** uses `shutdown(wait=False, cancel_futures=True)` instead of blocking `shutdown(wait=True)` which could hang forever on stuck tasks
|
|
32
|
+
- **Recursive CTE safety cap** — `get_descendants`/`get_ancestors` apply `MAX_HIERARCHY_DEPTH=100` when `max_depth=None` to prevent infinite recursion from parent_id cycles
|
|
33
|
+
- **RateLimiter cleanup** capped to 500 evictions per call to bound lock hold time under high bucket counts
|
|
34
|
+
- **HuggingFace download** now uses `etag_timeout=30` with local-cache fallback on network failure
|
|
35
|
+
- **embed_texts** rejects batches over 10,000 texts to prevent unbounded CPU time
|
|
36
|
+
- **retry_on_lock** adds `total_timeout=10s` budget — gives up early if cumulative sleep would exceed the budget
|
|
37
|
+
|
|
38
|
+
### Changed
|
|
39
|
+
|
|
40
|
+
- **`__version__`** now read from package metadata via `importlib.metadata` (single source of truth in `pyproject.toml`)
|
|
41
|
+
- **Upsert in usearch_index** separates conflict detection from removal for clearer flow
|
|
42
|
+
|
|
8
43
|
## [2.3.0] - 2026-03-08
|
|
9
44
|
|
|
10
45
|
### Breaking Changes
|
|
@@ -485,6 +520,7 @@ Benchmarks on i9-13900K & RTX 4090 with 10k vectors (384-dim):
|
|
|
485
520
|
- **Documentation**: https://coderdayton.github.io/simplevecdb/
|
|
486
521
|
- **License**: MIT
|
|
487
522
|
|
|
523
|
+
[2.4.0]: https://github.com/coderdayton/simplevecdb/releases/tag/v2.4.0
|
|
488
524
|
[2.3.0]: https://github.com/coderdayton/simplevecdb/releases/tag/v2.3.0
|
|
489
525
|
[2.2.1]: https://github.com/coderdayton/simplevecdb/releases/tag/v2.2.1
|
|
490
526
|
[2.2.0]: https://github.com/coderdayton/simplevecdb/releases/tag/v2.2.0
|
|
@@ -16,10 +16,18 @@ try:
|
|
|
16
16
|
except ImportError:
|
|
17
17
|
pass
|
|
18
18
|
from .logging import get_logger, configure_logging, log_operation
|
|
19
|
-
from .utils import
|
|
19
|
+
from .utils import (
|
|
20
|
+
DatabaseLockedError,
|
|
21
|
+
async_retry_on_lock,
|
|
22
|
+
file_lock,
|
|
23
|
+
retry_on_lock,
|
|
24
|
+
validate_filter,
|
|
25
|
+
)
|
|
20
26
|
from .encryption import EncryptionError, EncryptionUnavailableError
|
|
21
27
|
|
|
22
|
-
|
|
28
|
+
from importlib.metadata import version as _pkg_version
|
|
29
|
+
|
|
30
|
+
__version__ = _pkg_version("simplevecdb")
|
|
23
31
|
__all__ = [
|
|
24
32
|
# Core classes
|
|
25
33
|
"VectorDB",
|
|
@@ -47,6 +55,8 @@ __all__ = [
|
|
|
47
55
|
"MigrationRequiredError",
|
|
48
56
|
"EncryptionError",
|
|
49
57
|
"EncryptionUnavailableError",
|
|
58
|
+
"async_retry_on_lock",
|
|
59
|
+
"file_lock",
|
|
50
60
|
"retry_on_lock",
|
|
51
61
|
"validate_filter",
|
|
52
62
|
]
|