simplevecdb 2.4.0__tar.gz → 2.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120)
  1. {simplevecdb-2.4.0/docs → simplevecdb-2.5.0}/CHANGELOG.md +49 -0
  2. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/PKG-INFO +20 -3
  3. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/README.md +19 -2
  4. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/pyproject.toml +1 -1
  5. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/src/simplevecdb/__init__.py +9 -1
  6. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/src/simplevecdb/async_core.py +26 -3
  7. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/src/simplevecdb/constants.py +2 -1
  8. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/src/simplevecdb/core.py +159 -29
  9. simplevecdb-2.5.0/src/simplevecdb/embeddings/__init__.py +12 -0
  10. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/src/simplevecdb/embeddings/server.py +194 -33
  11. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/src/simplevecdb/engine/catalog.py +93 -25
  12. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/src/simplevecdb/engine/quantization.py +6 -0
  13. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/src/simplevecdb/engine/search.py +68 -27
  14. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/src/simplevecdb/engine/usearch_index.py +63 -51
  15. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/src/simplevecdb/utils.py +140 -1
  16. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/tests/integration/test_langchain.py +1 -1
  17. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/tests/integration/test_server.py +10 -8
  18. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/tests/unit/core/test_core_additional_coverage.py +1 -1
  19. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/tests/unit/core/test_initialization.py +2 -2
  20. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/tests/unit/core/test_missing_coverage.py +1 -1
  21. simplevecdb-2.5.0/tests/unit/core/test_v25_correctness.py +265 -0
  22. simplevecdb-2.5.0/tests/unit/core/test_v25_features.py +344 -0
  23. simplevecdb-2.5.0/tests/unit/core/test_v25_robustness.py +312 -0
  24. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/tests/unit/embeddings/test_server.py +33 -25
  25. simplevecdb-2.5.0/tests/unit/embeddings/test_v25_enhancements.py +382 -0
  26. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/tests/unit/test_core.py +6 -6
  27. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/uv.lock +1 -1
  28. simplevecdb-2.4.0/.claude/settings.local.json +0 -8
  29. simplevecdb-2.4.0/src/simplevecdb/embeddings/__init__.py +0 -0
  30. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/.bandit +0 -0
  31. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/.env.example +0 -0
  32. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/.github/FUNDING.yml +0 -0
  33. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  34. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  35. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  36. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/.github/dependabot.yml +0 -0
  37. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/.github/workflows/ci.yml +0 -0
  38. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/.github/workflows/publish.yml +0 -0
  39. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/.github/workflows/security.yml +0 -0
  40. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/.github/workflows/update-sponsors.yml +0 -0
  41. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/.gitignore +0 -0
  42. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/.pre-commit-config.yaml +0 -0
  43. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/.python-version +0 -0
  44. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/CODE_OF_CONDUCT.md +0 -0
  45. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/CONTRIBUTING.md +0 -0
  46. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/LICENSE +0 -0
  47. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/SECURITY.md +0 -0
  48. {simplevecdb-2.4.0 → simplevecdb-2.5.0/docs}/CHANGELOG.md +0 -0
  49. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/docs/CONTRIBUTING.md +0 -0
  50. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/docs/ENV_SETUP.md +0 -0
  51. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/docs/LICENSE +0 -0
  52. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/docs/api/async.md +0 -0
  53. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/docs/api/config.md +0 -0
  54. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/docs/api/core.md +0 -0
  55. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/docs/api/embeddings.md +0 -0
  56. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/docs/api/encryption.md +0 -0
  57. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/docs/api/engine/catalog.md +0 -0
  58. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/docs/api/engine/quantization.md +0 -0
  59. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/docs/api/engine/search.md +0 -0
  60. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/docs/api/integrations.md +0 -0
  61. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/docs/api/types.md +0 -0
  62. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/docs/benchmarks.md +0 -0
  63. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/docs/examples.md +0 -0
  64. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/docs/guides/clustering.md +0 -0
  65. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/docs/index.md +0 -0
  66. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/examples/auto_embed.py +0 -0
  67. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/examples/backend_benchmark.py +0 -0
  68. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/examples/embeddings/perf_benchmark.py +0 -0
  69. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/examples/quant_benchmark.py +0 -0
  70. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/examples/rag/langchain_rag.ipynb +0 -0
  71. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/examples/rag/llama_rag.ipynb +0 -0
  72. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/examples/rag/ollama_rag.ipynb +0 -0
  73. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/examples/smoke_test.py +0 -0
  74. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/mkdocs.yml +0 -0
  75. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/src/simplevecdb/config.py +0 -0
  76. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/src/simplevecdb/embeddings/models.py +0 -0
  77. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/src/simplevecdb/encryption.py +0 -0
  78. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/src/simplevecdb/engine/__init__.py +0 -0
  79. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/src/simplevecdb/engine/clustering.py +0 -0
  80. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/src/simplevecdb/integrations/__init__.py +0 -0
  81. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/src/simplevecdb/integrations/langchain.py +0 -0
  82. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/src/simplevecdb/integrations/llamaindex.py +0 -0
  83. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/src/simplevecdb/logging.py +0 -0
  84. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/src/simplevecdb/types.py +0 -0
  85. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/tests/conftest.py +0 -0
  86. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/tests/integration/test_llamaindex.py +0 -0
  87. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/tests/integration/test_rag.py +0 -0
  88. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/tests/integration/test_v21_features.py +0 -0
  89. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/tests/perf/test_batch_detection.py +0 -0
  90. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/tests/perf/test_performance.py +0 -0
  91. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/tests/unit/core/__init__.py +0 -0
  92. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/tests/unit/core/test_batch_detection.py +0 -0
  93. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/tests/unit/core/test_factory_methods.py +0 -0
  94. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/tests/unit/core/test_filters.py +0 -0
  95. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/tests/unit/core/test_quantization.py +0 -0
  96. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/tests/unit/core/test_similarity_search.py +0 -0
  97. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/tests/unit/embeddings/__init__.py +0 -0
  98. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/tests/unit/embeddings/test_models.py +0 -0
  99. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/tests/unit/embeddings/test_server_coverage.py +0 -0
  100. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/tests/unit/integrations/__init__.py +0 -0
  101. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/tests/unit/integrations/test_langchain_coverage.py +0 -0
  102. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/tests/unit/integrations/test_llamaindex_coverage.py +0 -0
  103. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/tests/unit/test_async.py +0 -0
  104. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/tests/unit/test_async_coverage.py +0 -0
  105. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/tests/unit/test_catalog_coverage.py +0 -0
  106. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/tests/unit/test_clustering.py +0 -0
  107. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/tests/unit/test_config.py +0 -0
  108. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/tests/unit/test_cross_collection_search.py +0 -0
  109. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/tests/unit/test_encryption.py +0 -0
  110. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/tests/unit/test_encryption_coverage.py +0 -0
  111. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/tests/unit/test_error_handling.py +0 -0
  112. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/tests/unit/test_hierarchy.py +0 -0
  113. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/tests/unit/test_multi_collection.py +0 -0
  114. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/tests/unit/test_search.py +0 -0
  115. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/tests/unit/test_search_coverage.py +0 -0
  116. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/tests/unit/test_search_missing_coverage.py +0 -0
  117. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/tests/unit/test_streaming.py +0 -0
  118. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/tests/unit/test_types.py +0 -0
  119. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/tests/unit/test_usearch_index_missing_coverage.py +0 -0
  120. {simplevecdb-2.4.0 → simplevecdb-2.5.0}/tests/unit/test_utils.py +0 -0
@@ -5,6 +5,55 @@ All notable changes to SimpleVecDB will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [2.5.0] - 2026-04-07
9
+
10
+ ### Added
11
+
12
+ - **`delete_collection(name)`** — drop a collection's SQLite tables, FTS index, and usearch file in one call. Available on both `VectorDB` and `AsyncVectorDB`.
13
+ - **`store_embeddings` parameter** on `collection()` — opt into storing embedding BLOBs in SQLite (default `False`). Leaving it disabled roughly halves on-disk storage; MMR transparently fetches vectors from the usearch index when BLOBs are absent.
14
+ - **`async_retry_on_lock` decorator** — async variant of `retry_on_lock` using `asyncio.sleep` instead of `time.sleep`, avoiding executor thread blocking.
15
+ - **`file_lock` context manager** — advisory cross-process file locking (`fcntl`/`msvcrt`) for usearch index files. Prevents corruption from concurrent processes.
16
+ - **`__repr__`** on `VectorDB`, `VectorCollection`, `AsyncVectorDB`, `AsyncVectorCollection` for debuggable string representations.
17
+ - **FLOAT16 quantization** fully implemented in `serialize()`/`deserialize()` — was previously defined in the enum but raised `ValueError` at runtime.
18
+ - **Pagination** on `get_documents(limit=, offset=)` and catalog methods (`find_ids_by_filter`, `find_ids_by_texts`) — previously returned unbounded result sets.
19
+ - **Embeddings server enhancements:**
20
+ - Graceful shutdown with SIGTERM/SIGINT draining (10s timeout)
21
+ - CORS middleware with configurable origins for browser-based clients
22
+ - Model warm-up on startup (skip with `--no-warmup`)
23
+ - Input validation: rejects empty strings (422) and texts exceeding 100k chars (413)
24
+ - Proper `argparse` CLI with `--host`, `--port`, `--no-warmup`, `--help`
25
+ - Startup banner logging config summary (host, port, model, auth, rate limits)
26
+ - Nested token array normalization (`list[list[int]]` input format)
27
+ - Async executor offload for `embed_texts` (non-blocking event loop)
28
+ - OpenAPI version synced from package metadata
29
+ - Module `__init__.py` exports (`embed_texts`, `get_embedder`, `load_model`, `app`, `run_server`)
30
+
31
+ ### Fixed
32
+
33
+ - **`delete_by_ids` ordering** — SQLite deletion now happens first (transactional, can rollback), then usearch. Previously usearch removed first, leaving orphaned catalog entries on SQLite failure.
34
+ - **`_matches_filter` string semantics** — now uses exact equality, consistent with SQL `build_filter_clause`. Was using substring match (`value in str(meta_value)`).
35
+ - **`list_collections`** — scans `sqlite_master` for persisted collection tables instead of returning only session-cached names. Works across reopened databases.
36
+ - **WAL mode for encrypted databases** — `PRAGMA journal_mode=WAL` and `PRAGMA synchronous=NORMAL` now set for SQLCipher connections (was only set for unencrypted).
37
+ - **`collection()` cache key** — includes `distance_strategy` and `quantization` in cache key (sync version). Previously cached by name only, silently ignoring differing params on cache hit.
38
+ - **`_ensure_fts_table`** — retries up to 3 times on transient "database is locked" errors instead of permanently disabling FTS on first failure.
39
+ - **Connection health check** — `SELECT 1` probe after connection creation; raises `RuntimeError` immediately on corrupt databases.
40
+
41
+ ### Improved
42
+
43
+ - **Usearch batch operations** — `add()`, `remove()`, and `get()` now use batch usearch APIs instead of per-key loops. Significant speedup for large operations.
44
+ - **Filtered search iterative deepening** — replaces fixed `k*3` overfetch with adaptive doubling (up to `k*30`). Highly selective filters now reliably return `k` results.
45
+ - **Memory-map heuristic** — uses file size threshold (50MB) instead of inaccurate `file_size // 100` vector count estimate for mmap vs load decision.
46
+ - **Apple chip detection** — uses `platform.processor()` instead of spawning a `sysctl` subprocess.
47
+
48
+ ### Removed
49
+
50
+ - **Duplicate `_dim` property** — removed in favor of the public `dim` property.
51
+
52
+ ### Breaking Changes
53
+
54
+ - String metadata filters now use exact equality (was substring match).
55
+ - `store_embeddings` now defaults to `False` — `rebuild_index()` only works on collections created with `store_embeddings=True`, or after re-adding documents with it enabled.
56
+
8
57
  ## [2.4.0] - 2026-03-22
9
58
 
10
59
  ### Added
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: simplevecdb
3
- Version: 2.4.0
3
+ Version: 2.5.0
4
4
  Summary: Dead-simple local vector database powered by usearch HNSW.
5
5
  Author-email: Dayton Dunbar <coderdayton14@gmail.com>
6
6
  License: MIT
@@ -169,10 +169,13 @@ hybrid = collection.hybrid_search("powerhouse cell", k=2)
169
169
  **Optional: Run embeddings server (OpenAI-compatible)**
170
170
 
171
171
  ```bash
172
- simplevecdb-server --port 8000
172
+ simplevecdb-server --port 8000 # Default model, auto warm-up
173
+ simplevecdb-server --host 0.0.0.0 --port 9000 # Bind to all interfaces
174
+ simplevecdb-server --no-warmup # Skip model preload on startup
175
+ simplevecdb-server --help # Show all options
173
176
  ```
174
177
 
175
- See [Setup Guide](ENV_SETUP.md) for configuration: model registry, rate limits, API keys, CUDA optimization.
178
+ See [Setup Guide](ENV_SETUP.md) for configuration: model registry, rate limits, API keys, CORS, CUDA optimization.
176
179
 
177
180
  ### Option 3: With LangChain or LlamaIndex
178
181
 
@@ -331,6 +334,10 @@ docs = collection.get_documents(filter_dict={"category": "tech"})
331
334
  for doc_id, text, metadata in docs:
332
335
  print(f"[{doc_id}] {text[:50]}...")
333
336
 
337
+ # Paginated access (v2.5+)
338
+ page1 = collection.get_documents(limit=100)
339
+ page2 = collection.get_documents(limit=100, offset=100)
340
+
334
341
  # Fetch stored embeddings
335
342
  embeddings = collection.get_embeddings_by_ids([1, 2, 3])
336
343
 
@@ -342,6 +349,9 @@ collection.update_metadata([
342
349
 
343
350
  # Quick stats
344
351
  print(f"Collection has {collection.count()} documents, dim={collection.dim}")
352
+
353
+ # Delete an entire collection (v2.5+)
354
+ db.delete_collection("old_data")
345
355
  ```
346
356
 
347
357
  ### Vector Clustering (v2.2+)
@@ -384,6 +394,10 @@ Supports K-means, MiniBatch K-means, and HDBSCAN. See [Clustering Guide](https:/
384
394
  | **Cluster Persistence** | ✅ | Save/load cluster centroids for fast assignment (v2.2+) |
385
395
  | **Public Catalog API** | ✅ | `get_documents`, `get_embeddings_by_ids`, `update_metadata` (v2.4+) |
386
396
  | **Executor Injection** | ✅ | Share thread pool across async instances for ONNX safety (v2.4+) |
397
+ | **Collection Management** | ✅ | `delete_collection()`, paginated `get_documents(limit=, offset=)` (v2.5+) |
398
+ | **Cross-Process Safety** | ✅ | Advisory file locking on usearch index files (v2.5+) |
399
+ | **FLOAT16 Quantization** | ✅ | Half-precision storage with 2x compression (v2.5+) |
400
+ | **Embeddings Server** | ✅ | CORS, graceful shutdown, input validation, model warm-up (v2.5+) |
387
401
 
388
402
  ## Performance Benchmarks
389
403
 
@@ -456,6 +470,9 @@ pip install torch --index-url https://download.pytorch.org/whl/cu118
456
470
  - [x] Vector clustering and auto-tagging (v2.2)
457
471
  - [x] Public catalog API for document management (v2.4)
458
472
  - [x] Async executor injection for thread-safe sharing (v2.4)
473
+ - [x] Collection management: `delete_collection()`, pagination (v2.5)
474
+ - [x] Cross-process file locking and connection health checks (v2.5)
475
+ - [x] Embeddings server hardening: CORS, graceful shutdown, input validation (v2.5)
459
476
  - [ ] Incremental clustering (online learning)
460
477
  - [ ] Cluster visualization exports
461
478
 
@@ -140,10 +140,13 @@ hybrid = collection.hybrid_search("powerhouse cell", k=2)
140
140
  **Optional: Run embeddings server (OpenAI-compatible)**
141
141
 
142
142
  ```bash
143
- simplevecdb-server --port 8000
143
+ simplevecdb-server --port 8000 # Default model, auto warm-up
144
+ simplevecdb-server --host 0.0.0.0 --port 9000 # Bind to all interfaces
145
+ simplevecdb-server --no-warmup # Skip model preload on startup
146
+ simplevecdb-server --help # Show all options
144
147
  ```
145
148
 
146
- See [Setup Guide](ENV_SETUP.md) for configuration: model registry, rate limits, API keys, CUDA optimization.
149
+ See [Setup Guide](ENV_SETUP.md) for configuration: model registry, rate limits, API keys, CORS, CUDA optimization.
147
150
 
148
151
  ### Option 3: With LangChain or LlamaIndex
149
152
 
@@ -302,6 +305,10 @@ docs = collection.get_documents(filter_dict={"category": "tech"})
302
305
  for doc_id, text, metadata in docs:
303
306
  print(f"[{doc_id}] {text[:50]}...")
304
307
 
308
+ # Paginated access (v2.5+)
309
+ page1 = collection.get_documents(limit=100)
310
+ page2 = collection.get_documents(limit=100, offset=100)
311
+
305
312
  # Fetch stored embeddings
306
313
  embeddings = collection.get_embeddings_by_ids([1, 2, 3])
307
314
 
@@ -313,6 +320,9 @@ collection.update_metadata([
313
320
 
314
321
  # Quick stats
315
322
  print(f"Collection has {collection.count()} documents, dim={collection.dim}")
323
+
324
+ # Delete an entire collection (v2.5+)
325
+ db.delete_collection("old_data")
316
326
  ```
317
327
 
318
328
  ### Vector Clustering (v2.2+)
@@ -355,6 +365,10 @@ Supports K-means, MiniBatch K-means, and HDBSCAN. See [Clustering Guide](https:/
355
365
  | **Cluster Persistence** | ✅ | Save/load cluster centroids for fast assignment (v2.2+) |
356
366
  | **Public Catalog API** | ✅ | `get_documents`, `get_embeddings_by_ids`, `update_metadata` (v2.4+) |
357
367
  | **Executor Injection** | ✅ | Share thread pool across async instances for ONNX safety (v2.4+) |
368
+ | **Collection Management** | ✅ | `delete_collection()`, paginated `get_documents(limit=, offset=)` (v2.5+) |
369
+ | **Cross-Process Safety** | ✅ | Advisory file locking on usearch index files (v2.5+) |
370
+ | **FLOAT16 Quantization** | ✅ | Half-precision storage with 2x compression (v2.5+) |
371
+ | **Embeddings Server** | ✅ | CORS, graceful shutdown, input validation, model warm-up (v2.5+) |
358
372
 
359
373
  ## Performance Benchmarks
360
374
 
@@ -427,6 +441,9 @@ pip install torch --index-url https://download.pytorch.org/whl/cu118
427
441
  - [x] Vector clustering and auto-tagging (v2.2)
428
442
  - [x] Public catalog API for document management (v2.4)
429
443
  - [x] Async executor injection for thread-safe sharing (v2.4)
444
+ - [x] Collection management: `delete_collection()`, pagination (v2.5)
445
+ - [x] Cross-process file locking and connection health checks (v2.5)
446
+ - [x] Embeddings server hardening: CORS, graceful shutdown, input validation (v2.5)
430
447
  - [ ] Incremental clustering (online learning)
431
448
  - [ ] Cluster visualization exports
432
449
 
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "simplevecdb"
3
- version = "2.4.0"
3
+ version = "2.5.0"
4
4
  description = "Dead-simple local vector database powered by usearch HNSW."
5
5
  authors = [{ name = "Dayton Dunbar", email = "coderdayton14@gmail.com" }]
6
6
  license = { text = "MIT" }
@@ -16,7 +16,13 @@ try:
16
16
  except ImportError:
17
17
  pass
18
18
  from .logging import get_logger, configure_logging, log_operation
19
- from .utils import DatabaseLockedError, retry_on_lock, validate_filter
19
+ from .utils import (
20
+ DatabaseLockedError,
21
+ async_retry_on_lock,
22
+ file_lock,
23
+ retry_on_lock,
24
+ validate_filter,
25
+ )
20
26
  from .encryption import EncryptionError, EncryptionUnavailableError
21
27
 
22
28
  from importlib.metadata import version as _pkg_version
@@ -49,6 +55,8 @@ __all__ = [
49
55
  "MigrationRequiredError",
50
56
  "EncryptionError",
51
57
  "EncryptionUnavailableError",
58
+ "async_retry_on_lock",
59
+ "file_lock",
52
60
  "retry_on_lock",
53
61
  "validate_filter",
54
62
  ]
@@ -60,6 +60,9 @@ class AsyncVectorCollection:
60
60
  """Collection name."""
61
61
  return self._collection.name
62
62
 
63
+ def __repr__(self) -> str:
64
+ return f"AsyncVectorCollection(name={self._collection.name!r})"
65
+
63
66
  async def add_texts(
64
67
  self,
65
68
  texts: Sequence[str],
@@ -210,15 +213,20 @@ class AsyncVectorCollection:
210
213
  async def get_documents(
211
214
  self,
212
215
  filter_dict: dict[str, Any] | None = None,
216
+ *,
217
+ limit: int | None = None,
218
+ offset: int | None = None,
213
219
  ) -> list[tuple[int, str, dict[str, Any]]]:
214
- """Get all documents with text content and metadata.
220
+ """Get documents with text content and metadata.
215
221
 
216
222
  See VectorCollection.get_documents for full documentation.
217
223
  """
218
224
  loop = asyncio.get_running_loop()
219
225
  return await loop.run_in_executor(
220
226
  self._executor,
221
- lambda: self._collection.get_documents(filter_dict=filter_dict),
227
+ lambda: self._collection.get_documents(
228
+ filter_dict=filter_dict, limit=limit, offset=offset
229
+ ),
222
230
  )
223
231
 
224
232
  async def get_embeddings_by_ids(self, ids: Sequence[int]) -> dict[int, Any]:
@@ -599,9 +607,21 @@ class AsyncVectorDB:
599
607
  return self._collections[cache_key]
600
608
 
601
609
  def list_collections(self) -> list[str]:
602
- """Return names of all initialized collections."""
610
+ """Return names of all persisted collections in the database."""
603
611
  return self._db.list_collections()
604
612
 
613
+ async def delete_collection(self, name: str) -> None:
614
+ """Delete a collection and all its data."""
615
+ loop = asyncio.get_running_loop()
616
+ await loop.run_in_executor(
617
+ self._executor, lambda: self._db.delete_collection(name)
618
+ )
619
+ # Evict from async-level cache too
620
+ with self._collections_lock:
621
+ keys_to_remove = [k for k in self._collections if k[0] == name]
622
+ for k in keys_to_remove:
623
+ del self._collections[k]
624
+
605
625
  async def search_collections(
606
626
  self,
607
627
  query: Sequence[float],
@@ -644,6 +664,9 @@ class AsyncVectorDB:
644
664
  self._executor, lambda: self._db.vacuum(checkpoint_wal)
645
665
  )
646
666
 
667
+ def __repr__(self) -> str:
668
+ return f"AsyncVectorDB(path={self._db.path!r})"
669
+
647
670
  async def close(self) -> None:
648
671
  """Close the database connection and shutdown executor."""
649
672
  try:
@@ -74,7 +74,8 @@ USEARCH_BRUTEFORCE_THRESHOLD = 10000
74
74
  # - Instant startup (no full load into RAM)
75
75
  # - Lower memory footprint (OS manages page cache)
76
76
  # - Slight latency increase for cold pages (acceptable trade-off)
77
- USEARCH_MMAP_THRESHOLD = 100000
77
+ # Threshold in bytes — 50MB covers ~30k 384-dim f32 vectors.
78
+ USEARCH_MMAP_THRESHOLD = 50 * 1024 * 1024 # 50 MB
78
79
 
79
80
  # Batch search threshold: auto-batch queries when > this count
80
81
  # usearch batch search provides ~10x throughput for multi-query workloads
@@ -178,6 +178,7 @@ class VectorCollection:
178
178
  distance_strategy: DistanceStrategy,
179
179
  quantization: Quantization,
180
180
  encryption_key: str | bytes | None = None,
181
+ store_embeddings: bool = False,
181
182
  ):
182
183
  self.conn = conn
183
184
  self._db_path = db_path
@@ -186,6 +187,7 @@ class VectorCollection:
186
187
  self.quantization = quantization
187
188
  self._quantizer = QuantizationStrategy(quantization)
188
189
  self._encryption_key = encryption_key
190
+ self._store_embeddings = store_embeddings
189
191
 
190
192
  # Sanitize name to prevent issues
191
193
  if not re.match(constants.COLLECTION_NAME_PATTERN, name):
@@ -397,12 +399,12 @@ class VectorCollection:
397
399
  batch_ids = ids[batch_start:batch_end] if ids else None
398
400
  batch_parent_ids = parent_ids[batch_start:batch_end] if parent_ids else None
399
401
 
400
- # Add to SQLite metadata store (with embeddings for MMR support)
402
+ # Add to SQLite metadata store
401
403
  doc_ids = self._catalog.add_documents(
402
404
  batch_texts,
403
405
  list(batch_metas),
404
406
  batch_ids,
405
- embeddings=batch_embeds,
407
+ embeddings=batch_embeds if self._store_embeddings else None,
406
408
  parent_ids=batch_parent_ids,
407
409
  )
408
410
 
@@ -748,12 +750,13 @@ class VectorCollection:
748
750
  if not ids_list:
749
751
  return
750
752
 
751
- # Delete from usearch
752
- self._index.remove(ids_list)
753
-
754
- # Delete from SQLite
753
+ # Delete from SQLite first (transactional, can rollback on failure)
755
754
  self._catalog.delete_by_ids(ids_list)
756
755
 
756
+ # Then remove from usearch (if this fails, catalog is clean and
757
+ # rebuild_index() can recover the index from stored data)
758
+ self._index.remove(ids_list)
759
+
757
760
  def remove_texts(
758
761
  self,
759
762
  texts: Sequence[str] | None = None,
@@ -846,6 +849,13 @@ class VectorCollection:
846
849
  # Fetch embeddings from SQLite
847
850
  embeddings_map = self._catalog.get_embeddings_by_ids(all_ids)
848
851
 
852
+ if not embeddings_map and not self._store_embeddings:
853
+ raise RuntimeError(
854
+ "Cannot rebuild index: no embeddings stored in SQLite. "
855
+ "Create the collection with store_embeddings=True to enable "
856
+ "rebuild_index(), or re-add documents with store_embeddings=True."
857
+ )
858
+
849
859
  # Filter to only docs with embeddings
850
860
  valid_pairs = [
851
861
  (doc_id, emb)
@@ -1260,7 +1270,7 @@ class VectorCollection:
1260
1270
 
1261
1271
  centroids = None
1262
1272
  if centroids_bytes is not None:
1263
- dim = self._dim
1273
+ dim = self.dim
1264
1274
  if dim:
1265
1275
  centroids = np.frombuffer(centroids_bytes, dtype=np.float32).reshape(
1266
1276
  n_clusters, dim
@@ -1353,19 +1363,26 @@ class VectorCollection:
1353
1363
  def get_documents(
1354
1364
  self,
1355
1365
  filter_dict: dict[str, Any] | None = None,
1366
+ *,
1367
+ limit: int | None = None,
1368
+ offset: int | None = None,
1356
1369
  ) -> list[tuple[int, str, dict[str, Any]]]:
1357
- """Get all documents with text content and metadata.
1370
+ """Get documents with text content and metadata.
1358
1371
 
1359
1372
  Args:
1360
1373
  filter_dict: Optional metadata filter to narrow results.
1374
+ limit: Maximum number of documents to return (None = all).
1375
+ offset: Number of documents to skip (None = 0).
1361
1376
 
1362
1377
  Returns:
1363
- List of (doc_id, text, metadata) tuples.
1378
+ List of (doc_id, text, metadata) tuples, ordered by ID.
1364
1379
  """
1365
1380
  filter_builder = self._catalog.build_filter_clause if filter_dict else None
1366
1381
  return self._catalog.get_all_docs_with_text(
1367
1382
  filter_dict=filter_dict,
1368
1383
  filter_builder=filter_builder,
1384
+ limit=limit,
1385
+ offset=offset,
1369
1386
  )
1370
1387
 
1371
1388
  def get_embeddings_by_ids(self, ids: Sequence[int]) -> dict[int, Any]:
@@ -1395,10 +1412,11 @@ class VectorCollection:
1395
1412
  """Vector dimension (None if no vectors added yet)."""
1396
1413
  return self._index.ndim
1397
1414
 
1398
- @property
1399
- def _dim(self) -> int | None:
1400
- """Vector dimension (None if no vectors added yet)."""
1401
- return self._index.ndim
1415
+ def __repr__(self) -> str:
1416
+ return (
1417
+ f"VectorCollection(name={self.name!r}, dim={self.dim}, "
1418
+ f"size={self.count()}, distance={self.distance_strategy.value})"
1419
+ )
1402
1420
 
1403
1421
 
1404
1422
  class VectorDB:
@@ -1452,7 +1470,7 @@ class VectorDB:
1452
1470
  self.quantization = quantization
1453
1471
  self.auto_migrate = auto_migrate
1454
1472
  self._encryption_key = encryption_key
1455
- self._collections: dict[str, VectorCollection] = {}
1473
+ self._collections: dict[tuple, VectorCollection] = {}
1456
1474
 
1457
1475
  # Create connection (encrypted or plain)
1458
1476
  if encryption_key is not None:
@@ -1467,6 +1485,8 @@ class VectorDB:
1467
1485
  check_same_thread=False,
1468
1486
  timeout=30.0,
1469
1487
  )
1488
+ self.conn.execute("PRAGMA journal_mode=WAL")
1489
+ self.conn.execute("PRAGMA synchronous=NORMAL")
1470
1490
  self._encrypted = True
1471
1491
  _logger.info("Opened encrypted database: %s", self.path)
1472
1492
  else:
@@ -1477,6 +1497,13 @@ class VectorDB:
1477
1497
  self.conn.execute("PRAGMA synchronous=NORMAL")
1478
1498
  self._encrypted = False
1479
1499
 
1500
+ # Verify connection is healthy
1501
+ try:
1502
+ self.conn.execute("SELECT 1")
1503
+ except sqlite3.DatabaseError as e:
1504
+ self.conn.close()
1505
+ raise RuntimeError(f"Database health check failed: {e}") from e
1506
+
1480
1507
  # Check for required migration before allowing collection access
1481
1508
  if not auto_migrate and self.path != ":memory:":
1482
1509
  migration_info = VectorDB.check_migration(self.path)
@@ -1491,22 +1518,103 @@ class VectorDB:
1491
1518
 
1492
1519
  def list_collections(self) -> list[str]:
1493
1520
  """
1494
- Return names of all initialized collections.
1521
+ Return names of all persisted collections in the database.
1495
1522
 
1496
- Only returns collections that have been accessed via `collection()` in this
1497
- session. Does not scan the database for collections created in previous sessions.
1523
+ Scans the database schema for collection tables, returning both
1524
+ collections accessed this session and those created in previous sessions.
1498
1525
 
1499
1526
  Returns:
1500
- List of collection names currently cached in this VectorDB instance.
1527
+ Sorted list of collection names stored in this database.
1501
1528
 
1502
1529
  Example:
1503
1530
  >>> db = VectorDB("app.db")
1504
1531
  >>> db.collection("users")
1505
- >>> db.collection("products")
1506
- >>> db.list_collections()
1507
- ['users', 'products']
1532
+ >>> db.close()
1533
+ >>> db2 = VectorDB("app.db")
1534
+ >>> db2.list_collections()
1535
+ ['users']
1536
+ """
1537
+ rows = self.conn.execute(
1538
+ "SELECT name FROM sqlite_master WHERE type='table' "
1539
+ "AND (name = 'tinyvec_items' OR name LIKE 'items_%')"
1540
+ ).fetchall()
1541
+ # Collect all table names, then filter out FTS/cluster derivatives.
1542
+ # FTS5 creates shadow tables: items_<name>_fts, items_<name>_fts_data,
1543
+ # items_<name>_fts_idx, items_<name>_fts_content, items_<name>_fts_docsize,
1544
+ # items_<name>_fts_config. Cluster tables: items_<name>_clusters.
1545
+ # We identify derivatives by checking if a suffix is <coll>_fts*
1546
+ # or <coll>_clusters for some other known collection suffix.
1547
+ all_suffixes: set[str] = set()
1548
+ has_default = False
1549
+ for (table_name,) in rows:
1550
+ if table_name == "tinyvec_items":
1551
+ has_default = True
1552
+ elif table_name.startswith("items_"):
1553
+ all_suffixes.add(table_name[6:])
1554
+
1555
+ # A suffix is a real collection if no other suffix is a prefix of it
1556
+ # followed by _fts* or _clusters.
1557
+ _fts_suffixes = ("_fts", "_fts_data", "_fts_idx", "_fts_content",
1558
+ "_fts_docsize", "_fts_config")
1559
+ derivative_suffixes: set[str] = set()
1560
+ for s in all_suffixes:
1561
+ for fts in _fts_suffixes:
1562
+ derivative_suffixes.add(f"{s}{fts}")
1563
+ derivative_suffixes.add(f"{s}_clusters")
1564
+
1565
+ names: list[str] = []
1566
+ if has_default:
1567
+ names.append("default")
1568
+ for s in sorted(all_suffixes - derivative_suffixes):
1569
+ names.append(s)
1570
+ return names
1571
+
1572
+ def delete_collection(self, name: str) -> None:
1508
1573
  """
1509
- return list(self._collections.keys())
1574
+ Delete a collection and all its data.
1575
+
1576
+ Drops the SQLite tables (items, FTS, clusters) and deletes
1577
+ the usearch index file. Removes the collection from the cache.
1578
+
1579
+ Args:
1580
+ name: Collection name to delete.
1581
+
1582
+ Raises:
1583
+ ValueError: If the collection name is invalid.
1584
+ KeyError: If the collection does not exist.
1585
+ """
1586
+ if not re.match(constants.COLLECTION_NAME_PATTERN, name):
1587
+ raise ValueError(
1588
+ f"Invalid collection name '{name}'. Must be alphanumeric + underscores."
1589
+ )
1590
+ if name not in self.list_collections():
1591
+ raise KeyError(f"Collection '{name}' does not exist.")
1592
+
1593
+ table_name = "tinyvec_items" if name == "default" else f"items_{name}"
1594
+ fts_table = f"{table_name}_fts"
1595
+ cluster_table = f"{table_name}_clusters"
1596
+
1597
+ # Drop SQLite tables
1598
+ self.conn.execute(f"DROP TABLE IF EXISTS {fts_table}")
1599
+ self.conn.execute(f"DROP TABLE IF EXISTS {cluster_table}")
1600
+ self.conn.execute(f"DROP TABLE IF EXISTS {table_name}")
1601
+ self.conn.commit()
1602
+
1603
+ # Delete usearch index file (and encrypted variant if present)
1604
+ if self.path != ":memory:":
1605
+ index_path = Path(self.path + f".{name}.usearch")
1606
+ if index_path.exists():
1607
+ index_path.unlink()
1608
+ encrypted_path = Path(str(index_path) + ".enc")
1609
+ if encrypted_path.exists():
1610
+ encrypted_path.unlink()
1611
+
1612
+ # Remove from cache (match any tuple key with this name)
1613
+ keys_to_remove = [k for k in self._collections if k[0] == name]
1614
+ for k in keys_to_remove:
1615
+ del self._collections[k]
1616
+
1617
+ _logger.info("Deleted collection: %s", name)
1510
1618
 
1511
1619
  def search_collections(
1512
1620
  self,
@@ -1562,14 +1670,28 @@ class VectorDB:
1562
1670
  # Resolve and validate collections
1563
1671
  targets: list[VectorCollection] = []
1564
1672
  dims: set[int | None] = set()
1673
+ # Validate explicit collection names exist in DB
1674
+ if collections is not None:
1675
+ persisted = set(self.list_collections())
1676
+ for name in target_names:
1677
+ if name not in persisted:
1678
+ # Check cache too (collection may exist but not yet persisted)
1679
+ if not any(k[0] == name for k in self._collections):
1680
+ raise KeyError(
1681
+ f"Collection '{name}' not initialized. "
1682
+ f"Call db.collection('{name}') first."
1683
+ )
1684
+
1565
1685
  for name in target_names:
1566
- if name not in self._collections:
1567
- raise KeyError(
1568
- f"Collection '{name}' not initialized. Call db.collection('{name}') first."
1569
- )
1570
- coll = self._collections[name]
1686
+ # Find cached collection by name (may have any strategy/quantization)
1687
+ matched = [v for k, v in self._collections.items() if k[0] == name]
1688
+ if matched:
1689
+ coll = matched[0]
1690
+ else:
1691
+ # Auto-initialize with defaults for persisted but uncached collections
1692
+ coll = self.collection(name)
1571
1693
  targets.append(coll)
1572
- dims.add(coll._dim)
1694
+ dims.add(coll.dim)
1573
1695
 
1574
1696
  # Check dimension consistency (ignore None for empty collections)
1575
1697
  dims.discard(None)
@@ -1629,6 +1751,7 @@ class VectorDB:
1629
1751
  name: str = "default",
1630
1752
  distance_strategy: DistanceStrategy | None = None,
1631
1753
  quantization: Quantization | None = None,
1754
+ store_embeddings: bool = False,
1632
1755
  ) -> VectorCollection:
1633
1756
  """
1634
1757
  Get or create a named collection.
@@ -1640,6 +1763,9 @@ class VectorDB:
1640
1763
  name: Collection name (alphanumeric + underscore only).
1641
1764
  distance_strategy: Override database-level distance metric.
1642
1765
  quantization: Override database-level quantization.
1766
+ store_embeddings: If True, store embeddings as BLOBs in SQLite
1767
+ alongside the usearch index. Required for rebuild_index().
1768
+ Default False to save ~2x storage.
1643
1769
 
1644
1770
  Returns:
1645
1771
  VectorCollection instance.
@@ -1647,7 +1773,7 @@ class VectorDB:
1647
1773
  Raises:
1648
1774
  ValueError: If collection name contains invalid characters.
1649
1775
  """
1650
- cache_key = name
1776
+ cache_key = (name, distance_strategy, quantization, store_embeddings)
1651
1777
  if cache_key not in self._collections:
1652
1778
  self._collections[cache_key] = VectorCollection(
1653
1779
  conn=self.conn,
@@ -1656,6 +1782,7 @@ class VectorDB:
1656
1782
  distance_strategy=distance_strategy or self.distance_strategy,
1657
1783
  quantization=quantization or self.quantization,
1658
1784
  encryption_key=self._encryption_key,
1785
+ store_embeddings=store_embeddings,
1659
1786
  )
1660
1787
  return self._collections[cache_key]
1661
1788
 
@@ -1844,6 +1971,9 @@ MIGRATION ROLLBACK INSTRUCTIONS:
1844
1971
  for collection in self._collections.values():
1845
1972
  collection.save()
1846
1973
 
1974
+ def __repr__(self) -> str:
1975
+ return f"VectorDB(path={self.path!r}, collections={self.list_collections()})"
1976
+
1847
1977
  def close(self) -> None:
1848
1978
  """Close the database connection and save indexes."""
1849
1979
  if getattr(self, "_closed", False):
@@ -0,0 +1,12 @@
1
+ """Embeddings module — local embedding models and OpenAI-compatible server."""
2
+
3
+ from .models import embed_texts, get_embedder, load_model
4
+ from .server import app, run_server
5
+
6
+ __all__ = [
7
+ "app",
8
+ "embed_texts",
9
+ "get_embedder",
10
+ "load_model",
11
+ "run_server",
12
+ ]