betterdb-retrieval 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- betterdb_retrieval-0.1.0/.gitignore +9 -0
- betterdb_retrieval-0.1.0/CHANGELOG.md +11 -0
- betterdb_retrieval-0.1.0/PKG-INFO +99 -0
- betterdb_retrieval-0.1.0/README.md +82 -0
- betterdb_retrieval-0.1.0/RELEASE_NOTES.md +52 -0
- betterdb_retrieval-0.1.0/betterdb_retrieval/__init__.py +96 -0
- betterdb_retrieval-0.1.0/betterdb_retrieval/discovery.py +34 -0
- betterdb_retrieval-0.1.0/betterdb_retrieval/fields.py +6 -0
- betterdb_retrieval-0.1.0/betterdb_retrieval/ft_create.py +187 -0
- betterdb_retrieval-0.1.0/betterdb_retrieval/ft_search.py +51 -0
- betterdb_retrieval-0.1.0/betterdb_retrieval/health.py +46 -0
- betterdb_retrieval-0.1.0/betterdb_retrieval/prometheus_metrics.py +91 -0
- betterdb_retrieval-0.1.0/betterdb_retrieval/retriever.py +422 -0
- betterdb_retrieval-0.1.0/betterdb_retrieval/schema.py +46 -0
- betterdb_retrieval-0.1.0/betterdb_retrieval/telemetry.py +24 -0
- betterdb_retrieval-0.1.0/eval/__init__.py +1 -0
- betterdb_retrieval-0.1.0/eval/longmemeval/__init__.py +6 -0
- betterdb_retrieval-0.1.0/eval/longmemeval/__main__.py +13 -0
- betterdb_retrieval-0.1.0/eval/longmemeval/adapter.py +90 -0
- betterdb_retrieval-0.1.0/eval/longmemeval/dataset.py +24 -0
- betterdb_retrieval-0.1.0/eval/longmemeval/embed.py +123 -0
- betterdb_retrieval-0.1.0/eval/longmemeval/fixture.json +130 -0
- betterdb_retrieval-0.1.0/eval/longmemeval/judge.py +72 -0
- betterdb_retrieval-0.1.0/eval/longmemeval/openai_http.py +84 -0
- betterdb_retrieval-0.1.0/eval/longmemeval/reader.py +78 -0
- betterdb_retrieval-0.1.0/eval/longmemeval/run.py +100 -0
- betterdb_retrieval-0.1.0/eval/longmemeval/runner.py +230 -0
- betterdb_retrieval-0.1.0/eval/longmemeval/store.py +319 -0
- betterdb_retrieval-0.1.0/eval/longmemeval/types.py +84 -0
- betterdb_retrieval-0.1.0/pyproject.toml +44 -0
- betterdb_retrieval-0.1.0/tests/__init__.py +0 -0
- betterdb_retrieval-0.1.0/tests/conftest.py +42 -0
- betterdb_retrieval-0.1.0/tests/test_discovery.py +89 -0
- betterdb_retrieval-0.1.0/tests/test_ft_create.py +341 -0
- betterdb_retrieval-0.1.0/tests/test_ft_search.py +61 -0
- betterdb_retrieval-0.1.0/tests/test_health.py +105 -0
- betterdb_retrieval-0.1.0/tests/test_index_lifecycle.py +162 -0
- betterdb_retrieval-0.1.0/tests/test_longmemeval.py +54 -0
- betterdb_retrieval-0.1.0/tests/test_prometheus.py +49 -0
- betterdb_retrieval-0.1.0/tests/test_query.py +236 -0
- betterdb_retrieval-0.1.0/tests/test_telemetry.py +132 -0
- betterdb_retrieval-0.1.0/tests/test_upsert_delete.py +259 -0
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## [0.1.0] - 2026-06-23
|
|
4
|
+
|
|
5
|
+
### Added
|
|
6
|
+
|
|
7
|
+
- Initial release. Python equivalent of the TypeScript `@betterdb/retrieval`.
|
|
8
|
+
- `Retriever` — index lifecycle (`create_index` / `drop_index` / `describe_index` / `health`), `upsert` / `delete`, and `query` (vector + filtered + `hybrid="rerank"` KNN search).
|
|
9
|
+
- Typed `RetrievalSchema` (tag / numeric / text fields, HNSW & FLAT vector specs).
|
|
10
|
+
- Shared discovery registry `register` / `unregister`, ownership-checked against the `__betterdb:caches` hash.
|
|
11
|
+
- Observability seams: `RetrievalMetrics` / `RetrievalTracer` protocols and `create_prometheus_metrics` (built on `prometheus-client`, which is installed as a regular dependency).
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: betterdb-retrieval
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Developer-facing retrieval SDK over Valkey Search: index lifecycle, upsert, and vector + filtered query.
|
|
5
|
+
Project-URL: Repository, https://github.com/BetterDB-inc/monitor
|
|
6
|
+
License: MIT
|
|
7
|
+
Keywords: embeddings,knn,rag,redis,retrieval,valkey,valkey-search,vector-search
|
|
8
|
+
Requires-Python: >=3.11
|
|
9
|
+
Requires-Dist: betterdb-valkey-search-kit>=0.1.0
|
|
10
|
+
Requires-Dist: prometheus-client>=0.19.0
|
|
11
|
+
Provides-Extra: dev
|
|
12
|
+
Requires-Dist: pytest-asyncio>=0.23.0; extra == 'dev'
|
|
13
|
+
Requires-Dist: pytest>=8.0.0; extra == 'dev'
|
|
14
|
+
Provides-Extra: eval
|
|
15
|
+
Requires-Dist: valkey>=6.0.0; extra == 'eval'
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
|
|
18
|
+
# @betterdb/retrieval (Python)
|
|
19
|
+
|
|
20
|
+
`betterdb-retrieval` — developer-facing retrieval SDK over [Valkey Search](https://valkey.io/topics/search/) (`FT.*`): typed index schema, idempotent index lifecycle, upsert/delete, and vector + filtered + hybrid query. This is the Python equivalent of the TypeScript `@betterdb/retrieval` package, built on [`betterdb-valkey-search-kit`](../valkey-search-kit-py/).
|
|
21
|
+
|
|
22
|
+
## Installation
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
pip install betterdb-retrieval valkey
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
Requires a Valkey server with the [Valkey Search](https://valkey.io/topics/search/) module loaded.
|
|
29
|
+
|
|
30
|
+
## Quick start
|
|
31
|
+
|
|
32
|
+
```python
|
|
33
|
+
from valkey.asyncio import Valkey
|
|
34
|
+
|
|
35
|
+
from betterdb_retrieval import Retriever, UpsertEntry
|
|
36
|
+
|
|
37
|
+
client = Valkey.from_url("redis://localhost:6379")
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
async def embed(text: str) -> list[float]:
|
|
41
|
+
... # return an embedding
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
retriever = Retriever(
|
|
45
|
+
client=client,
|
|
46
|
+
name="docs",
|
|
47
|
+
schema={
|
|
48
|
+
"fields": {
|
|
49
|
+
"category": {"type": "tag"},
|
|
50
|
+
"year": {"type": "numeric", "sortable": True},
|
|
51
|
+
},
|
|
52
|
+
"vector": {"algorithm": "hnsw", "metric": "cosine"},
|
|
53
|
+
},
|
|
54
|
+
embed_fn=embed,
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
# Create the index if it doesn't exist (idempotent; dims resolved from embed_fn).
|
|
58
|
+
await retriever.create_index()
|
|
59
|
+
|
|
60
|
+
await retriever.upsert([
|
|
61
|
+
UpsertEntry(
|
|
62
|
+
id="doc1",
|
|
63
|
+
text="Valkey is a high-performance key-value store",
|
|
64
|
+
fields={"category": "db", "year": 2024},
|
|
65
|
+
),
|
|
66
|
+
])
|
|
67
|
+
|
|
68
|
+
hits = await retriever.query(
|
|
69
|
+
text="fast in-memory database",
|
|
70
|
+
k=5,
|
|
71
|
+
filter={"category": "db"},
|
|
72
|
+
)
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
## Retriever API
|
|
76
|
+
|
|
77
|
+
- `create_index()` — create the index if absent (idempotent). Vector dimension is taken from `schema["vector"]["dims"]` or resolved by probing `embed_fn`.
|
|
78
|
+
- `upsert(entries)` — embed each entry's `text` and write it as a hash with its `fields`.
|
|
79
|
+
- `delete(ids)` — delete documents by id.
|
|
80
|
+
- `query(*, k, text=None, vector=None, filter=None, hybrid=None)` — KNN search. Provide `text` (embedded for you) or a precomputed `vector`, a positive `k`, an optional `filter` (tag/numeric fields), and `hybrid="rerank"` to post-process hits through a `rerank_fn`. Returns `list[QueryHit]`.
|
|
81
|
+
- `describe_index()` / `health()` — index stats: doc count, indexing state, dimension, percent indexed, and an optional estimated recall.
|
|
82
|
+
- `drop_index()` — drop the index (no-op if it doesn't exist).
|
|
83
|
+
- `register()` / `unregister()` — publish/remove a discovery marker in the shared `__betterdb:caches` registry, ownership-checked so it never clobbers a foreign cache type.
|
|
84
|
+
|
|
85
|
+
> `QueryHit.score` is the raw KNN vector **distance** (lower is closer), not a similarity — rank ascending.
|
|
86
|
+
|
|
87
|
+
## Observability
|
|
88
|
+
|
|
89
|
+
Pass `metrics` (a `RetrievalMetrics`) and/or `tracer` (a `RetrievalTracer`) to instrument every operation. `create_prometheus_metrics()` provides a ready-made [prometheus-client](https://github.com/prometheus/client_python) implementation.
|
|
90
|
+
|
|
91
|
+
## Development
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
uv run --extra dev pytest tests -q
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
## License
|
|
98
|
+
|
|
99
|
+
MIT
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
# @betterdb/retrieval (Python)
|
|
2
|
+
|
|
3
|
+
`betterdb-retrieval` — developer-facing retrieval SDK over [Valkey Search](https://valkey.io/topics/search/) (`FT.*`): typed index schema, idempotent index lifecycle, upsert/delete, and vector + filtered + hybrid query. This is the Python equivalent of the TypeScript `@betterdb/retrieval` package, built on [`betterdb-valkey-search-kit`](../valkey-search-kit-py/).
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install betterdb-retrieval valkey
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
Requires a Valkey server with the [Valkey Search](https://valkey.io/topics/search/) module loaded.
|
|
12
|
+
|
|
13
|
+
## Quick start
|
|
14
|
+
|
|
15
|
+
```python
|
|
16
|
+
from valkey.asyncio import Valkey
|
|
17
|
+
|
|
18
|
+
from betterdb_retrieval import Retriever, UpsertEntry
|
|
19
|
+
|
|
20
|
+
client = Valkey.from_url("redis://localhost:6379")
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
async def embed(text: str) -> list[float]:
|
|
24
|
+
... # return an embedding
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
retriever = Retriever(
|
|
28
|
+
client=client,
|
|
29
|
+
name="docs",
|
|
30
|
+
schema={
|
|
31
|
+
"fields": {
|
|
32
|
+
"category": {"type": "tag"},
|
|
33
|
+
"year": {"type": "numeric", "sortable": True},
|
|
34
|
+
},
|
|
35
|
+
"vector": {"algorithm": "hnsw", "metric": "cosine"},
|
|
36
|
+
},
|
|
37
|
+
embed_fn=embed,
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
# Create the index if it doesn't exist (idempotent; dims resolved from embed_fn).
|
|
41
|
+
await retriever.create_index()
|
|
42
|
+
|
|
43
|
+
await retriever.upsert([
|
|
44
|
+
UpsertEntry(
|
|
45
|
+
id="doc1",
|
|
46
|
+
text="Valkey is a high-performance key-value store",
|
|
47
|
+
fields={"category": "db", "year": 2024},
|
|
48
|
+
),
|
|
49
|
+
])
|
|
50
|
+
|
|
51
|
+
hits = await retriever.query(
|
|
52
|
+
text="fast in-memory database",
|
|
53
|
+
k=5,
|
|
54
|
+
filter={"category": "db"},
|
|
55
|
+
)
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## Retriever API
|
|
59
|
+
|
|
60
|
+
- `create_index()` — create the index if absent (idempotent). Vector dimension is taken from `schema["vector"]["dims"]` or resolved by probing `embed_fn`.
|
|
61
|
+
- `upsert(entries)` — embed each entry's `text` and write it as a hash with its `fields`.
|
|
62
|
+
- `delete(ids)` — delete documents by id.
|
|
63
|
+
- `query(*, k, text=None, vector=None, filter=None, hybrid=None)` — KNN search. Provide `text` (embedded for you) or a precomputed `vector`, a positive `k`, an optional `filter` (tag/numeric fields), and `hybrid="rerank"` to post-process hits through a `rerank_fn`. Returns `list[QueryHit]`.
|
|
64
|
+
- `describe_index()` / `health()` — index stats: doc count, indexing state, dimension, percent indexed, and an optional estimated recall.
|
|
65
|
+
- `drop_index()` — drop the index (no-op if it doesn't exist).
|
|
66
|
+
- `register()` / `unregister()` — publish/remove a discovery marker in the shared `__betterdb:caches` registry, ownership-checked so it never clobbers a foreign cache type.
|
|
67
|
+
|
|
68
|
+
> `QueryHit.score` is the raw KNN vector **distance** (lower is closer), not a similarity — rank ascending.
|
|
69
|
+
|
|
70
|
+
## Observability
|
|
71
|
+
|
|
72
|
+
Pass `metrics` (a `RetrievalMetrics`) and/or `tracer` (a `RetrievalTracer`) to instrument every operation. `create_prometheus_metrics()` provides a ready-made [prometheus-client](https://github.com/prometheus/client_python) implementation.
|
|
73
|
+
|
|
74
|
+
## Development
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
uv run --extra dev pytest tests -q
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## License
|
|
81
|
+
|
|
82
|
+
MIT
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
# betterdb-retrieval v0.1.0
|
|
2
|
+
|
|
3
|
+
Python port of `@betterdb/retrieval`. Developer-facing retrieval SDK over
|
|
4
|
+
Valkey Search — typed schema, idempotent index lifecycle, upsert/delete, and
|
|
5
|
+
vector + filtered + hybrid query, with built-in observability seams.
|
|
6
|
+
|
|
7
|
+
Requires Valkey 8+ with the **valkey-search** module (vector index support).
|
|
8
|
+
Works with ElastiCache for Valkey, Memorystore for Valkey, and MemoryDB.
|
|
9
|
+
|
|
10
|
+
Built on [`betterdb-valkey-search-kit`](https://pypi.org/project/betterdb-valkey-search-kit/).
|
|
11
|
+
|
|
12
|
+
---
|
|
13
|
+
|
|
14
|
+
## Installation
|
|
15
|
+
|
|
16
|
+
```sh
|
|
17
|
+
pip install betterdb-retrieval
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
---
|
|
21
|
+
|
|
22
|
+
## What's included
|
|
23
|
+
|
|
24
|
+
### Retriever
|
|
25
|
+
|
|
26
|
+
| Method | Description |
|
|
27
|
+
|---|---|
|
|
28
|
+
| `create_index()` | Create or attach to the vector index (idempotent) |
|
|
29
|
+
| `upsert(...)` | Insert or update a document with its vector and fields |
|
|
30
|
+
| `delete(ids)` | Delete documents by id |
|
|
31
|
+
| `query(...)` | Vector KNN search with optional tag/numeric filters and `hybrid="rerank"` reranking |
|
|
32
|
+
| `health()` | Index name, doc count, vector dimension |
|
|
33
|
+
|
|
34
|
+
### Schema & fields
|
|
35
|
+
|
|
36
|
+
Typed `RetrievalSchema` with TAG / NUMERIC / TEXT / vector field builders, validated
|
|
37
|
+
against the live `FT.INFO` to tolerate version skew.
|
|
38
|
+
|
|
39
|
+
### Discovery
|
|
40
|
+
|
|
41
|
+
Shared discovery registry with atomic register/unregister (EVAL compare-and-set).
|
|
42
|
+
|
|
43
|
+
### Observability
|
|
44
|
+
|
|
45
|
+
- `RetrievalMetrics` / `RetrievalTracer` instrumentation seams
|
|
46
|
+
- Prometheus metrics for query latency and result counts
|
|
47
|
+
|
|
48
|
+
---
|
|
49
|
+
|
|
50
|
+
## Full changelog
|
|
51
|
+
|
|
52
|
+
See the repository history for detailed changes.
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
"""Developer-facing retrieval SDK over Valkey Search.
|
|
2
|
+
|
|
3
|
+
Async Python port of the TypeScript ``@betterdb/retrieval`` package: index
|
|
4
|
+
lifecycle, upsert, and vector + filtered query backed by Valkey Search (FT.*).
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from .discovery import (
|
|
10
|
+
REGISTRY_KEY,
|
|
11
|
+
RETRIEVAL_CACHE_TYPE,
|
|
12
|
+
RetrievalMarker,
|
|
13
|
+
build_retrieval_marker,
|
|
14
|
+
)
|
|
15
|
+
from .fields import SCORE_FIELD, TEXT_FIELD
|
|
16
|
+
from .ft_create import (
|
|
17
|
+
build_ft_create_args,
|
|
18
|
+
index_name,
|
|
19
|
+
key_prefix,
|
|
20
|
+
resolve_vector_field_name,
|
|
21
|
+
)
|
|
22
|
+
from .ft_search import QueryFilter, build_ft_search_query
|
|
23
|
+
from .health import IndexHealthSnapshot, RecallEstimator, parse_percent_indexed
|
|
24
|
+
from .prometheus_metrics import (
|
|
25
|
+
PrometheusRetrievalMetrics,
|
|
26
|
+
create_prometheus_metrics,
|
|
27
|
+
)
|
|
28
|
+
from .retriever import (
|
|
29
|
+
EmbedFn,
|
|
30
|
+
IndexDescription,
|
|
31
|
+
QueryHit,
|
|
32
|
+
RerankFn,
|
|
33
|
+
Retriever,
|
|
34
|
+
RetrieverClient,
|
|
35
|
+
UpsertEntry,
|
|
36
|
+
)
|
|
37
|
+
from .schema import (
|
|
38
|
+
FieldSpec,
|
|
39
|
+
FtCapabilities,
|
|
40
|
+
RetrievalSchema,
|
|
41
|
+
VectorAlgorithm,
|
|
42
|
+
VectorMetric,
|
|
43
|
+
VectorSpec,
|
|
44
|
+
)
|
|
45
|
+
from .telemetry import (
|
|
46
|
+
RetrievalMetrics,
|
|
47
|
+
RetrievalOperation,
|
|
48
|
+
RetrievalSpan,
|
|
49
|
+
RetrievalTracer,
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
__all__ = [
|
|
53
|
+
# schema
|
|
54
|
+
"FieldSpec",
|
|
55
|
+
"VectorMetric",
|
|
56
|
+
"VectorAlgorithm",
|
|
57
|
+
"VectorSpec",
|
|
58
|
+
"RetrievalSchema",
|
|
59
|
+
"FtCapabilities",
|
|
60
|
+
# ft-create
|
|
61
|
+
"build_ft_create_args",
|
|
62
|
+
"index_name",
|
|
63
|
+
"key_prefix",
|
|
64
|
+
"resolve_vector_field_name",
|
|
65
|
+
# fields
|
|
66
|
+
"TEXT_FIELD",
|
|
67
|
+
"SCORE_FIELD",
|
|
68
|
+
# ft-search
|
|
69
|
+
"build_ft_search_query",
|
|
70
|
+
"QueryFilter",
|
|
71
|
+
# retriever
|
|
72
|
+
"Retriever",
|
|
73
|
+
"RetrieverClient",
|
|
74
|
+
"IndexDescription",
|
|
75
|
+
"EmbedFn",
|
|
76
|
+
"UpsertEntry",
|
|
77
|
+
"RerankFn",
|
|
78
|
+
"QueryHit",
|
|
79
|
+
# discovery
|
|
80
|
+
"build_retrieval_marker",
|
|
81
|
+
"REGISTRY_KEY",
|
|
82
|
+
"RETRIEVAL_CACHE_TYPE",
|
|
83
|
+
"RetrievalMarker",
|
|
84
|
+
# health
|
|
85
|
+
"parse_percent_indexed",
|
|
86
|
+
"IndexHealthSnapshot",
|
|
87
|
+
"RecallEstimator",
|
|
88
|
+
# telemetry
|
|
89
|
+
"RetrievalMetrics",
|
|
90
|
+
"RetrievalTracer",
|
|
91
|
+
"RetrievalSpan",
|
|
92
|
+
"RetrievalOperation",
|
|
93
|
+
# prometheus
|
|
94
|
+
"create_prometheus_metrics",
|
|
95
|
+
"PrometheusRetrievalMetrics",
|
|
96
|
+
]
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TypedDict
|
|
4
|
+
|
|
5
|
+
from .ft_create import index_name
|
|
6
|
+
|
|
7
|
+
REGISTRY_KEY = "__betterdb:caches"
|
|
8
|
+
RETRIEVAL_PROTOCOL_VERSION = 1
|
|
9
|
+
RETRIEVAL_CACHE_TYPE = "retrieval"
|
|
10
|
+
# TODO: sync with pyproject.toml rather than hardcoding — this drifts on a
|
|
11
|
+
# version bump.
|
|
12
|
+
RETRIEVAL_VERSION = "0.1.0"
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class RetrievalMarker(TypedDict):
    """Shape of the discovery marker produced by ``build_retrieval_marker``."""

    # Marker kind discriminator.
    type: str
    # Name the marker was built for.
    prefix: str
    # Version string supplied by the caller.
    version: str
    # Integer revision of the marker protocol.
    protocol_version: int
    # Names of the operations the publisher advertises.
    capabilities: list[str]
    # Name of the search index associated with ``prefix``.
    index_name: str
    # Start timestamp as a string; the exact format is chosen by the caller.
    started_at: str
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def build_retrieval_marker(name: str, version: str, started_at: str) -> RetrievalMarker:
    """Assemble the discovery marker for the retriever called *name*.

    ``version`` and ``started_at`` are copied through unchanged; the index
    name is derived from *name* via ``index_name``, so any error that helper
    raises for an invalid name propagates to the caller.
    """
    derived_index = index_name(name)
    return RetrievalMarker(
        type=RETRIEVAL_CACHE_TYPE,
        prefix=name,
        version=version,
        protocol_version=RETRIEVAL_PROTOCOL_VERSION,
        # A fresh list per call so callers can mutate their marker safely.
        capabilities=["upsert", "query", "delete"],
        index_name=derived_index,
        started_at=started_at,
    )
|
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from .fields import RESERVED_FIELD_NAMES
|
|
4
|
+
from .schema import FieldSpec, FtCapabilities, RetrievalSchema, VectorSpec
|
|
5
|
+
|
|
6
|
+
_HNSW_DEFAULTS = {"m": 16, "efConstruction": 200, "efRuntime": 10}
|
|
7
|
+
|
|
8
|
+
_METRIC_MAP = {"cosine": "COSINE", "l2": "L2", "ip": "IP"}
|
|
9
|
+
|
|
10
|
+
_ALGORITHM_MAP = {"hnsw": "HNSW", "flat": "FLAT"}
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _is_positive_int(value: object) -> bool:
|
|
14
|
+
"""Mirror ``Number.isInteger(x) && x > 0`` (accepts integral floats)."""
|
|
15
|
+
if isinstance(value, bool):
|
|
16
|
+
return False
|
|
17
|
+
if isinstance(value, int):
|
|
18
|
+
return value > 0
|
|
19
|
+
if isinstance(value, float):
|
|
20
|
+
return value.is_integer() and value > 0
|
|
21
|
+
return False
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _require_dims(dims: object) -> int:
    """Return *dims* as an ``int``, or raise if it is not a positive integer.

    Raises:
        ValueError: when *dims* is ``None`` or fails ``_is_positive_int``.
    """
    if dims is not None and _is_positive_int(dims):
        # Integral floats (e.g. 384.0) are accepted and normalised to int.
        return int(dims)  # type: ignore[arg-type]
    raise ValueError(f"dims must be a positive integer to build FT.CREATE args, got: {dims}")
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _validate_field_names(fields: dict[str, FieldSpec], vector_field_name: str) -> None:
|
|
31
|
+
for name in fields:
|
|
32
|
+
if len(name) == 0:
|
|
33
|
+
raise ValueError("Invalid field name: empty field name is not allowed")
|
|
34
|
+
if name == vector_field_name:
|
|
35
|
+
raise ValueError(
|
|
36
|
+
f"Field name '{name}' collides with the vector field name '{vector_field_name}'"
|
|
37
|
+
)
|
|
38
|
+
if name in RESERVED_FIELD_NAMES:
|
|
39
|
+
raise ValueError(f"Field name '{name}' is reserved and cannot be used in the schema")
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _validate_text_field_capabilities(
|
|
43
|
+
fields: dict[str, FieldSpec], capabilities: FtCapabilities | None
|
|
44
|
+
) -> None:
|
|
45
|
+
if not (capabilities is not None and capabilities.get("textFields") is False):
|
|
46
|
+
return
|
|
47
|
+
text_field_names = [name for name, spec in fields.items() if spec.get("type") == "text"]
|
|
48
|
+
if text_field_names:
|
|
49
|
+
raise ValueError(f"Text fields require valkey-search >= 1.2: {', '.join(text_field_names)}")
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _validate_flat_hnsw_params(vector: VectorSpec) -> None:
|
|
53
|
+
if vector.get("algorithm") != "flat":
|
|
54
|
+
return
|
|
55
|
+
if vector.get("m") is not None:
|
|
56
|
+
raise ValueError("FLAT algorithm does not support 'm' parameter")
|
|
57
|
+
if vector.get("efConstruction") is not None:
|
|
58
|
+
raise ValueError("FLAT algorithm does not support 'efConstruction' parameter")
|
|
59
|
+
if vector.get("efRuntime") is not None:
|
|
60
|
+
raise ValueError("FLAT algorithm does not support 'efRuntime' parameter")
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _build_field_args(name: str, spec: FieldSpec) -> list[str]:
|
|
64
|
+
field_type = spec.get("type")
|
|
65
|
+
if field_type == "text":
|
|
66
|
+
return [name, "TEXT"]
|
|
67
|
+
if field_type == "tag":
|
|
68
|
+
args = [name, "TAG"]
|
|
69
|
+
separator = spec.get("separator")
|
|
70
|
+
if separator is not None:
|
|
71
|
+
args.extend(["SEPARATOR", separator])
|
|
72
|
+
return args
|
|
73
|
+
args = [name, "NUMERIC"]
|
|
74
|
+
if spec.get("sortable") is True:
|
|
75
|
+
args.append("SORTABLE")
|
|
76
|
+
return args
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def resolve_vector_field_name(vector: VectorSpec) -> str:
    """Return the hash field that stores the embedding for *vector*.

    Falls back to ``"embedding"`` when ``fieldName`` is absent or ``None``.

    Raises:
        ValueError: if ``fieldName`` is set but empty or whitespace-only.
    """
    configured = vector.get("fieldName")
    if configured is None:
        return "embedding"
    if not configured.strip():
        raise ValueError(
            f"Vector field name must not be empty or whitespace-only, got: '{configured}'"
        )
    return configured
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def _build_vector_args(vector: VectorSpec, dims: int) -> list[str]:
    """Return the FT.CREATE SCHEMA tokens for the vector field.

    Args:
        vector: Vector spec; ``algorithm`` and ``metric`` are required, and
            ``m`` / ``efConstruction`` / ``efRuntime`` optionally override the
            HNSW defaults.
        dims: Vector dimension, already validated by the caller.

    Returns:
        ``[field, "VECTOR", ALGO, <attr count>, *attrs]`` where the attribute
        count is derived from the attribute list (6 for FLAT, 12 for HNSW).

    Raises:
        ValueError: if ``algorithm`` or ``metric`` is missing or unsupported.
            The metric used to escape as a bare ``KeyError``; it is now
            reported the same way as an unsupported algorithm.
    """
    field_name = resolve_vector_field_name(vector)

    algorithm = vector.get("algorithm")
    if algorithm not in _ALGORITHM_MAP:
        raise ValueError(
            f"Vector algorithm must be one of {sorted(_ALGORITHM_MAP)}, got: {algorithm!r}"
        )

    metric = vector.get("metric")
    if metric not in _METRIC_MAP:
        raise ValueError(f"Vector metric must be one of {sorted(_METRIC_MAP)}, got: {metric!r}")

    attrs = [
        "TYPE",
        "FLOAT32",
        "DIM",
        str(dims),
        "DISTANCE_METRIC",
        _METRIC_MAP[metric],
    ]

    if algorithm != "flat":
        # HNSW tuning knobs: an explicit value wins, otherwise use the default.
        hnsw_params = (
            ("m", "M"),
            ("efConstruction", "EF_CONSTRUCTION"),
            ("efRuntime", "EF_RUNTIME"),
        )
        for key, token in hnsw_params:
            configured = vector.get(key)
            value = _HNSW_DEFAULTS[key] if configured is None else configured
            attrs.extend([token, str(value)])

    # The count token must equal the number of attribute tokens that follow.
    return [field_name, "VECTOR", _ALGORITHM_MAP[algorithm], str(len(attrs)), *attrs]
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def index_name(name: str) -> str:
    """Return the search index name for *name*, i.e. ``"<name>:idx"``.

    Raises:
        ValueError: if *name* is empty or whitespace-only.
    """
    if not name.strip():
        raise ValueError(f"Index name must not be empty or whitespace-only, got: '{name}'")
    return name + ":idx"
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def key_prefix(name: str) -> str:
    """Return the key prefix for documents of *name*, i.e. ``"<name>:"``.

    Raises:
        ValueError: if *name* is empty or whitespace-only.
    """
    if not name.strip():
        raise ValueError(f"Index name must not be empty or whitespace-only, got: '{name}'")
    return name + ":"
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def build_ft_create_args(
    name: str,
    schema: RetrievalSchema,
    capabilities: FtCapabilities | None = None,
) -> list[str]:
    """Build the argument list for an ``FT.CREATE`` call, command name excluded.

    Validation runs before any argument is produced, in this order: index
    name, vector dimension, vector field name, field names, text-field
    capabilities, FLAT-only parameter check. Each failure raises
    ``ValueError``.

    The result indexes hashes under ``key_prefix(name)`` and lists the
    regular fields in declaration order followed by the vector field.
    """
    if not name.strip():
        raise ValueError(f"Index name must not be empty or whitespace-only, got: '{name}'")

    vector_spec = schema["vector"]
    dims = _require_dims(vector_spec.get("dims"))
    vector_field_name = resolve_vector_field_name(vector_spec)

    declared_fields = schema["fields"]
    _validate_field_names(declared_fields, vector_field_name)
    _validate_text_field_capabilities(declared_fields, capabilities)
    _validate_flat_hnsw_params(vector_spec)

    schema_tokens: list[str] = [
        token
        for declared_name, spec in declared_fields.items()
        for token in _build_field_args(declared_name, spec)
    ]
    schema_tokens.extend(_build_vector_args(vector_spec, dims))

    header = [index_name(name), "ON", "HASH", "PREFIX", "1", key_prefix(name), "SCHEMA"]
    return header + schema_tokens
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import math
|
|
4
|
+
|
|
5
|
+
from betterdb_valkey_search_kit import escape_tag
|
|
6
|
+
|
|
7
|
+
from .fields import SCORE_FIELD
|
|
8
|
+
from .ft_create import resolve_vector_field_name
|
|
9
|
+
from .schema import RetrievalSchema
|
|
10
|
+
|
|
11
|
+
QueryFilter = dict[str, "str | int | float"]
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _is_number(value: object) -> bool:
|
|
15
|
+
return isinstance(value, (int, float)) and not isinstance(value, bool)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _build_filter_clause(field: str, value: str | int | float, schema: RetrievalSchema) -> str:
|
|
19
|
+
spec = schema["fields"].get(field)
|
|
20
|
+
if spec is None:
|
|
21
|
+
raise ValueError(f"Cannot filter on unknown field '{field}'")
|
|
22
|
+
field_type = spec.get("type")
|
|
23
|
+
if field_type == "tag":
|
|
24
|
+
return f"@{field}:{{{escape_tag(str(value))}}}"
|
|
25
|
+
if field_type == "numeric":
|
|
26
|
+
if not _is_number(value):
|
|
27
|
+
raise ValueError(
|
|
28
|
+
f"Numeric filter on field '{field}' requires a number, got: {type(value).__name__}"
|
|
29
|
+
)
|
|
30
|
+
if isinstance(value, float) and not math.isfinite(value):
|
|
31
|
+
raise ValueError(
|
|
32
|
+
f"Numeric filter on field '{field}' requires a finite number, got: {value!r}"
|
|
33
|
+
)
|
|
34
|
+
return f"@{field}:[{value} {value}]"
|
|
35
|
+
raise ValueError(
|
|
36
|
+
f"Cannot filter on TEXT field '{field}'; only tag and numeric fields are filterable"
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def build_ft_search_query(
    schema: RetrievalSchema,
    k: int,
    filter: QueryFilter | None = None,
) -> str:
    """Build the KNN query string for ``FT.SEARCH``.

    The pre-filter is ``*`` when no clauses are produced; otherwise the
    clauses are joined with spaces inside parentheses. The query vector is
    referenced as the ``$vec`` parameter and the distance is aliased to
    ``SCORE_FIELD``. *k* is interpolated as given, without validation.
    """
    vector_field = resolve_vector_field_name(schema["vector"])

    if filter is None:
        clauses: list[str] = []
    else:
        clauses = [
            _build_filter_clause(field_name, wanted, schema)
            for field_name, wanted in filter.items()
        ]

    prefilter = "(" + " ".join(clauses) + ")" if clauses else "*"
    return f"{prefilter}=>[KNN {k} @{vector_field} $vec AS {SCORE_FIELD}]"
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import Any, Callable, Optional
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass
class IndexHealthSnapshot:
    """Point-in-time statistics describing one search index."""

    # Index name.
    name: str
    # Document count.
    num_docs: int
    # Indexing state label; the set of possible values is not defined in this module.
    indexing_state: str
    # Vector dimension.
    dims: int
    # Indexing progress; presumably on the 0-100 scale — TODO confirm against the producer.
    percent_indexed: float
    # Optional recall estimate; None when not provided.
    estimated_recall: Optional[float] = None
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
RecallEstimator = Callable[[IndexHealthSnapshot], float]
|
|
18
|
+
|
|
19
|
+
_PERCENT_INDEXED_KEYS = ("percent_indexed", "backfill_complete_percent")
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _s(x: Any) -> str:
|
|
23
|
+
if isinstance(x, bytes):
|
|
24
|
+
try:
|
|
25
|
+
return x.decode()
|
|
26
|
+
except UnicodeDecodeError:
|
|
27
|
+
return ""
|
|
28
|
+
return str(x)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def parse_percent_indexed(info: list[Any]) -> float:
|
|
32
|
+
"""Extract the percent-indexed value from a raw FT.INFO reply.
|
|
33
|
+
|
|
34
|
+
valkey-search/RediSearch report either a 0-1 fraction or a 0-100
|
|
35
|
+
percentage depending on the version; both are normalized to 0-100. Returns
|
|
36
|
+
0 if the field is absent or unparseable.
|
|
37
|
+
"""
|
|
38
|
+
for i in range(0, len(info) - 1, 2):
|
|
39
|
+
if _s(info[i]) not in _PERCENT_INDEXED_KEYS:
|
|
40
|
+
continue
|
|
41
|
+
try:
|
|
42
|
+
value = float(_s(info[i + 1]))
|
|
43
|
+
except ValueError:
|
|
44
|
+
return 0.0
|
|
45
|
+
return value * 100 if value <= 1 else value
|
|
46
|
+
return 0.0
|