betterdb-retrieval 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. betterdb_retrieval-0.1.0/.gitignore +9 -0
  2. betterdb_retrieval-0.1.0/CHANGELOG.md +11 -0
  3. betterdb_retrieval-0.1.0/PKG-INFO +99 -0
  4. betterdb_retrieval-0.1.0/README.md +82 -0
  5. betterdb_retrieval-0.1.0/RELEASE_NOTES.md +52 -0
  6. betterdb_retrieval-0.1.0/betterdb_retrieval/__init__.py +96 -0
  7. betterdb_retrieval-0.1.0/betterdb_retrieval/discovery.py +34 -0
  8. betterdb_retrieval-0.1.0/betterdb_retrieval/fields.py +6 -0
  9. betterdb_retrieval-0.1.0/betterdb_retrieval/ft_create.py +187 -0
  10. betterdb_retrieval-0.1.0/betterdb_retrieval/ft_search.py +51 -0
  11. betterdb_retrieval-0.1.0/betterdb_retrieval/health.py +46 -0
  12. betterdb_retrieval-0.1.0/betterdb_retrieval/prometheus_metrics.py +91 -0
  13. betterdb_retrieval-0.1.0/betterdb_retrieval/retriever.py +422 -0
  14. betterdb_retrieval-0.1.0/betterdb_retrieval/schema.py +46 -0
  15. betterdb_retrieval-0.1.0/betterdb_retrieval/telemetry.py +24 -0
  16. betterdb_retrieval-0.1.0/eval/__init__.py +1 -0
  17. betterdb_retrieval-0.1.0/eval/longmemeval/__init__.py +6 -0
  18. betterdb_retrieval-0.1.0/eval/longmemeval/__main__.py +13 -0
  19. betterdb_retrieval-0.1.0/eval/longmemeval/adapter.py +90 -0
  20. betterdb_retrieval-0.1.0/eval/longmemeval/dataset.py +24 -0
  21. betterdb_retrieval-0.1.0/eval/longmemeval/embed.py +123 -0
  22. betterdb_retrieval-0.1.0/eval/longmemeval/fixture.json +130 -0
  23. betterdb_retrieval-0.1.0/eval/longmemeval/judge.py +72 -0
  24. betterdb_retrieval-0.1.0/eval/longmemeval/openai_http.py +84 -0
  25. betterdb_retrieval-0.1.0/eval/longmemeval/reader.py +78 -0
  26. betterdb_retrieval-0.1.0/eval/longmemeval/run.py +100 -0
  27. betterdb_retrieval-0.1.0/eval/longmemeval/runner.py +230 -0
  28. betterdb_retrieval-0.1.0/eval/longmemeval/store.py +319 -0
  29. betterdb_retrieval-0.1.0/eval/longmemeval/types.py +84 -0
  30. betterdb_retrieval-0.1.0/pyproject.toml +44 -0
  31. betterdb_retrieval-0.1.0/tests/__init__.py +0 -0
  32. betterdb_retrieval-0.1.0/tests/conftest.py +42 -0
  33. betterdb_retrieval-0.1.0/tests/test_discovery.py +89 -0
  34. betterdb_retrieval-0.1.0/tests/test_ft_create.py +341 -0
  35. betterdb_retrieval-0.1.0/tests/test_ft_search.py +61 -0
  36. betterdb_retrieval-0.1.0/tests/test_health.py +105 -0
  37. betterdb_retrieval-0.1.0/tests/test_index_lifecycle.py +162 -0
  38. betterdb_retrieval-0.1.0/tests/test_longmemeval.py +54 -0
  39. betterdb_retrieval-0.1.0/tests/test_prometheus.py +49 -0
  40. betterdb_retrieval-0.1.0/tests/test_query.py +236 -0
  41. betterdb_retrieval-0.1.0/tests/test_telemetry.py +132 -0
  42. betterdb_retrieval-0.1.0/tests/test_upsert_delete.py +259 -0
@@ -0,0 +1,9 @@
1
+ __pycache__/
2
+ *.pyc
3
+ *.pyo
4
+ .venv/
5
+ dist/
6
+ *.egg-info/
7
+ .cache/
8
+ .ruff_cache/
9
+ .pytest_cache/
@@ -0,0 +1,11 @@
1
+ # Changelog
2
+
3
+ ## [0.1.0] - 2026-06-23
4
+
5
+ ### Added
6
+
7
+ - Initial release. Python equivalent of the TypeScript `@betterdb/retrieval`.
8
+ - `Retriever` — index lifecycle (`create_index` / `drop_index` / `describe_index` / `health`), `upsert` / `delete`, and `query` (vector + filtered + `hybrid="rerank"` KNN search).
9
+ - Typed `RetrievalSchema` (tag / numeric / text fields, HNSW & FLAT vector specs).
10
+ - Shared discovery registry `register` / `unregister`, ownership-checked against the `__betterdb:caches` hash.
11
+ - Observability seams: `RetrievalMetrics` / `RetrievalTracer` protocols and `create_prometheus_metrics` (backed by `prometheus-client`, which is installed as a regular dependency).
@@ -0,0 +1,99 @@
1
+ Metadata-Version: 2.4
2
+ Name: betterdb-retrieval
3
+ Version: 0.1.0
4
+ Summary: Developer-facing retrieval SDK over Valkey Search: index lifecycle, upsert, and vector + filtered query.
5
+ Project-URL: Repository, https://github.com/BetterDB-inc/monitor
6
+ License: MIT
7
+ Keywords: embeddings,knn,rag,redis,retrieval,valkey,valkey-search,vector-search
8
+ Requires-Python: >=3.11
9
+ Requires-Dist: betterdb-valkey-search-kit>=0.1.0
10
+ Requires-Dist: prometheus-client>=0.19.0
11
+ Provides-Extra: dev
12
+ Requires-Dist: pytest-asyncio>=0.23.0; extra == 'dev'
13
+ Requires-Dist: pytest>=8.0.0; extra == 'dev'
14
+ Provides-Extra: eval
15
+ Requires-Dist: valkey>=6.0.0; extra == 'eval'
16
+ Description-Content-Type: text/markdown
17
+
18
+ # @betterdb/retrieval (Python)
19
+
20
+ `betterdb-retrieval` — developer-facing retrieval SDK over [Valkey Search](https://valkey.io/topics/search/) (`FT.*`): typed index schema, idempotent index lifecycle, upsert/delete, and vector + filtered + hybrid query. This is the Python equivalent of the TypeScript `@betterdb/retrieval` package, built on [`betterdb-valkey-search-kit`](https://pypi.org/project/betterdb-valkey-search-kit/).
21
+
22
+ ## Installation
23
+
24
+ ```bash
25
+ pip install betterdb-retrieval valkey
26
+ ```
27
+
28
+ Requires a Valkey server with the [Valkey Search](https://valkey.io/topics/search/) module loaded.
29
+
30
+ ## Quick start
31
+
32
+ ```python
33
+ from valkey.asyncio import Valkey
34
+
35
+ from betterdb_retrieval import Retriever, UpsertEntry
36
+
37
+ client = Valkey.from_url("redis://localhost:6379")
38
+
39
+
40
+ async def embed(text: str) -> list[float]:
41
+ ... # return an embedding
42
+
43
+
44
+ retriever = Retriever(
45
+ client=client,
46
+ name="docs",
47
+ schema={
48
+ "fields": {
49
+ "category": {"type": "tag"},
50
+ "year": {"type": "numeric", "sortable": True},
51
+ },
52
+ "vector": {"algorithm": "hnsw", "metric": "cosine"},
53
+ },
54
+ embed_fn=embed,
55
+ )
56
+
57
+ # Inside an async function: create the index if it doesn't exist (idempotent; dims resolved from embed_fn).
58
+ await retriever.create_index()
59
+
60
+ await retriever.upsert([
61
+ UpsertEntry(
62
+ id="doc1",
63
+ text="Valkey is a high-performance key-value store",
64
+ fields={"category": "db", "year": 2024},
65
+ ),
66
+ ])
67
+
68
+ hits = await retriever.query(
69
+ text="fast in-memory database",
70
+ k=5,
71
+ filter={"category": "db"},
72
+ )
73
+ ```
74
+
75
+ ## Retriever API
76
+
77
+ - `create_index()` — create the index if absent (idempotent). Vector dimension is taken from `schema["vector"]["dims"]` or resolved by probing `embed_fn`.
78
+ - `upsert(entries)` — embed each entry's `text` and write it as a hash with its `fields`.
79
+ - `delete(ids)` — delete documents by id.
80
+ - `query(*, k, text=None, vector=None, filter=None, hybrid=None)` — KNN search. Provide `text` (embedded for you) or a precomputed `vector`, a positive `k`, an optional `filter` (tag/numeric fields), and `hybrid="rerank"` to post-process hits through a `rerank_fn`. Returns `list[QueryHit]`.
81
+ - `describe_index()` / `health()` — index stats: doc count, indexing state, dimension, percent indexed, and an optional estimated recall.
82
+ - `drop_index()` — drop the index (no-op if it doesn't exist).
83
+ - `register()` / `unregister()` — publish/remove a discovery marker in the shared `__betterdb:caches` registry, ownership-checked so it never clobbers a foreign cache type.
84
+
85
+ > `QueryHit.score` is the raw KNN vector **distance** (lower is closer), not a similarity — rank ascending.
86
+
87
+ ## Observability
88
+
89
+ Pass `metrics` (a `RetrievalMetrics`) and/or `tracer` (a `RetrievalTracer`) to instrument every operation. `create_prometheus_metrics()` provides a ready-made [prometheus-client](https://github.com/prometheus/client_python) implementation.
90
+
91
+ ## Development
92
+
93
+ ```bash
94
+ uv run --extra dev pytest tests -q
95
+ ```
96
+
97
+ ## License
98
+
99
+ MIT
@@ -0,0 +1,82 @@
1
+ # @betterdb/retrieval (Python)
2
+
3
+ `betterdb-retrieval` — developer-facing retrieval SDK over [Valkey Search](https://valkey.io/topics/search/) (`FT.*`): typed index schema, idempotent index lifecycle, upsert/delete, and vector + filtered + hybrid query. This is the Python equivalent of the TypeScript `@betterdb/retrieval` package, built on [`betterdb-valkey-search-kit`](https://pypi.org/project/betterdb-valkey-search-kit/).
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ pip install betterdb-retrieval valkey
9
+ ```
10
+
11
+ Requires a Valkey server with the [Valkey Search](https://valkey.io/topics/search/) module loaded.
12
+
13
+ ## Quick start
14
+
15
+ ```python
16
+ from valkey.asyncio import Valkey
17
+
18
+ from betterdb_retrieval import Retriever, UpsertEntry
19
+
20
+ client = Valkey.from_url("redis://localhost:6379")
21
+
22
+
23
+ async def embed(text: str) -> list[float]:
24
+ ... # return an embedding
25
+
26
+
27
+ retriever = Retriever(
28
+ client=client,
29
+ name="docs",
30
+ schema={
31
+ "fields": {
32
+ "category": {"type": "tag"},
33
+ "year": {"type": "numeric", "sortable": True},
34
+ },
35
+ "vector": {"algorithm": "hnsw", "metric": "cosine"},
36
+ },
37
+ embed_fn=embed,
38
+ )
39
+
40
+ # Inside an async function: create the index if it doesn't exist (idempotent; dims resolved from embed_fn).
41
+ await retriever.create_index()
42
+
43
+ await retriever.upsert([
44
+ UpsertEntry(
45
+ id="doc1",
46
+ text="Valkey is a high-performance key-value store",
47
+ fields={"category": "db", "year": 2024},
48
+ ),
49
+ ])
50
+
51
+ hits = await retriever.query(
52
+ text="fast in-memory database",
53
+ k=5,
54
+ filter={"category": "db"},
55
+ )
56
+ ```
57
+
58
+ ## Retriever API
59
+
60
+ - `create_index()` — create the index if absent (idempotent). Vector dimension is taken from `schema["vector"]["dims"]` or resolved by probing `embed_fn`.
61
+ - `upsert(entries)` — embed each entry's `text` and write it as a hash with its `fields`.
62
+ - `delete(ids)` — delete documents by id.
63
+ - `query(*, k, text=None, vector=None, filter=None, hybrid=None)` — KNN search. Provide `text` (embedded for you) or a precomputed `vector`, a positive `k`, an optional `filter` (tag/numeric fields), and `hybrid="rerank"` to post-process hits through a `rerank_fn`. Returns `list[QueryHit]`.
64
+ - `describe_index()` / `health()` — index stats: doc count, indexing state, dimension, percent indexed, and an optional estimated recall.
65
+ - `drop_index()` — drop the index (no-op if it doesn't exist).
66
+ - `register()` / `unregister()` — publish/remove a discovery marker in the shared `__betterdb:caches` registry, ownership-checked so it never clobbers a foreign cache type.
67
+
68
+ > `QueryHit.score` is the raw KNN vector **distance** (lower is closer), not a similarity — rank ascending.
69
+
70
+ ## Observability
71
+
72
+ Pass `metrics` (a `RetrievalMetrics`) and/or `tracer` (a `RetrievalTracer`) to instrument every operation. `create_prometheus_metrics()` provides a ready-made [prometheus-client](https://github.com/prometheus/client_python) implementation.
73
+
74
+ ## Development
75
+
76
+ ```bash
77
+ uv run --extra dev pytest tests -q
78
+ ```
79
+
80
+ ## License
81
+
82
+ MIT
@@ -0,0 +1,52 @@
1
+ # betterdb-retrieval v0.1.0
2
+
3
+ Python port of `@betterdb/retrieval`. Developer-facing retrieval SDK over
4
+ Valkey Search — typed schema, idempotent index lifecycle, upsert/delete, and
5
+ vector + filtered + hybrid query, with built-in observability seams.
6
+
7
+ Requires Valkey 8+ with the **valkey-search** module (vector index support).
8
+ Works with ElastiCache for Valkey, Memorystore for Valkey, and MemoryDB.
9
+
10
+ Built on [`betterdb-valkey-search-kit`](https://pypi.org/project/betterdb-valkey-search-kit/).
11
+
12
+ ---
13
+
14
+ ## Installation
15
+
16
+ ```sh
17
+ pip install betterdb-retrieval
18
+ ```
19
+
20
+ ---
21
+
22
+ ## What's included
23
+
24
+ ### Retriever
25
+
26
+ | Method | Description |
27
+ |---|---|
28
+ | `create_index()` | Create or attach to the vector index (idempotent) |
29
+ | `upsert(...)` | Insert or update a document with its vector and fields |
30
+ | `delete(ids)` | Delete documents by id |
31
+ | `query(...)` | Vector and filtered KNN search, with optional `hybrid="rerank"` post-processing of the hits |
32
+ | `health()` | Index name, doc count, indexing state, vector dimension, percent indexed, and an optional estimated recall |
33
+
34
+ ### Schema & fields
35
+
36
+ Typed `RetrievalSchema` with TAG / NUMERIC / TEXT / vector field builders, validated
37
+ against the live `FT.INFO` to tolerate version skew.
38
+
39
+ ### Discovery
40
+
41
+ Shared discovery registry with atomic register/unregister (EVAL compare-and-set).
42
+
43
+ ### Observability
44
+
45
+ - `RetrievalMetrics` / `RetrievalTracer` instrumentation seams
46
+ - Prometheus metrics for query latency and result counts
47
+
48
+ ---
49
+
50
+ ## Full changelog
51
+
52
+ See the repository history for detailed changes.
@@ -0,0 +1,96 @@
1
+ """Developer-facing retrieval SDK over Valkey Search.
2
+
3
+ Async Python port of the TypeScript ``@betterdb/retrieval`` package: index
4
+ lifecycle, upsert, and vector + filtered query backed by Valkey Search (FT.*).
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from .discovery import (
10
+ REGISTRY_KEY,
11
+ RETRIEVAL_CACHE_TYPE,
12
+ RetrievalMarker,
13
+ build_retrieval_marker,
14
+ )
15
+ from .fields import SCORE_FIELD, TEXT_FIELD
16
+ from .ft_create import (
17
+ build_ft_create_args,
18
+ index_name,
19
+ key_prefix,
20
+ resolve_vector_field_name,
21
+ )
22
+ from .ft_search import QueryFilter, build_ft_search_query
23
+ from .health import IndexHealthSnapshot, RecallEstimator, parse_percent_indexed
24
+ from .prometheus_metrics import (
25
+ PrometheusRetrievalMetrics,
26
+ create_prometheus_metrics,
27
+ )
28
+ from .retriever import (
29
+ EmbedFn,
30
+ IndexDescription,
31
+ QueryHit,
32
+ RerankFn,
33
+ Retriever,
34
+ RetrieverClient,
35
+ UpsertEntry,
36
+ )
37
+ from .schema import (
38
+ FieldSpec,
39
+ FtCapabilities,
40
+ RetrievalSchema,
41
+ VectorAlgorithm,
42
+ VectorMetric,
43
+ VectorSpec,
44
+ )
45
+ from .telemetry import (
46
+ RetrievalMetrics,
47
+ RetrievalOperation,
48
+ RetrievalSpan,
49
+ RetrievalTracer,
50
+ )
51
+
52
+ __all__ = [
53
+ # schema
54
+ "FieldSpec",
55
+ "VectorMetric",
56
+ "VectorAlgorithm",
57
+ "VectorSpec",
58
+ "RetrievalSchema",
59
+ "FtCapabilities",
60
+ # ft-create
61
+ "build_ft_create_args",
62
+ "index_name",
63
+ "key_prefix",
64
+ "resolve_vector_field_name",
65
+ # fields
66
+ "TEXT_FIELD",
67
+ "SCORE_FIELD",
68
+ # ft-search
69
+ "build_ft_search_query",
70
+ "QueryFilter",
71
+ # retriever
72
+ "Retriever",
73
+ "RetrieverClient",
74
+ "IndexDescription",
75
+ "EmbedFn",
76
+ "UpsertEntry",
77
+ "RerankFn",
78
+ "QueryHit",
79
+ # discovery
80
+ "build_retrieval_marker",
81
+ "REGISTRY_KEY",
82
+ "RETRIEVAL_CACHE_TYPE",
83
+ "RetrievalMarker",
84
+ # health
85
+ "parse_percent_indexed",
86
+ "IndexHealthSnapshot",
87
+ "RecallEstimator",
88
+ # telemetry
89
+ "RetrievalMetrics",
90
+ "RetrievalTracer",
91
+ "RetrievalSpan",
92
+ "RetrievalOperation",
93
+ # prometheus
94
+ "create_prometheus_metrics",
95
+ "PrometheusRetrievalMetrics",
96
+ ]
@@ -0,0 +1,34 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TypedDict
4
+
5
+ from .ft_create import index_name
6
+
7
+ REGISTRY_KEY = "__betterdb:caches"
8
+ RETRIEVAL_PROTOCOL_VERSION = 1
9
+ RETRIEVAL_CACHE_TYPE = "retrieval"
10
+ # TODO: sync with pyproject.toml rather than hardcoding — this drifts on a
11
+ # version bump.
12
+ RETRIEVAL_VERSION = "0.1.0"
13
+
14
+
15
class RetrievalMarker(TypedDict):
    """Typed shape of the discovery marker returned by ``build_retrieval_marker``."""

    # Cache-type discriminator for the entry.
    type: str
    # Retriever name the marker was built for.
    prefix: str
    # Version string supplied by the caller.
    version: str
    # Integer revision of the marker protocol.
    protocol_version: int
    # Names of the operations the retriever advertises.
    capabilities: list[str]
    # Search index name derived from the retriever name.
    index_name: str
    # Start timestamp as a string; the format is chosen by the caller.
    started_at: str
23
+
24
+
25
def build_retrieval_marker(name: str, version: str, started_at: str) -> RetrievalMarker:
    """Assemble the discovery marker describing a retriever.

    Args:
        name: Retriever name; stored as ``prefix`` and used to derive ``index_name``.
        version: Version string recorded verbatim in the marker.
        started_at: Start timestamp recorded verbatim in the marker.

    Returns:
        A ``RetrievalMarker`` advertising the upsert, query and delete capabilities.

    Raises:
        ValueError: If ``name`` is empty or whitespace-only (raised by ``index_name``).
    """
    derived_index = index_name(name)
    marker: RetrievalMarker = {
        "type": RETRIEVAL_CACHE_TYPE,
        "prefix": name,
        "version": version,
        "protocol_version": RETRIEVAL_PROTOCOL_VERSION,
        "capabilities": ["upsert", "query", "delete"],
        "index_name": derived_index,
        "started_at": started_at,
    }
    return marker
@@ -0,0 +1,6 @@
1
+ from __future__ import annotations
2
+
3
+ TEXT_FIELD = "__text"
4
+ SCORE_FIELD = "__score"
5
+
6
+ RESERVED_FIELD_NAMES: tuple[str, ...] = (TEXT_FIELD, SCORE_FIELD)
@@ -0,0 +1,187 @@
1
+ from __future__ import annotations
2
+
3
+ from .fields import RESERVED_FIELD_NAMES
4
+ from .schema import FieldSpec, FtCapabilities, RetrievalSchema, VectorSpec
5
+
6
+ _HNSW_DEFAULTS = {"m": 16, "efConstruction": 200, "efRuntime": 10}
7
+
8
+ _METRIC_MAP = {"cosine": "COSINE", "l2": "L2", "ip": "IP"}
9
+
10
+ _ALGORITHM_MAP = {"hnsw": "HNSW", "flat": "FLAT"}
11
+
12
+
13
+ def _is_positive_int(value: object) -> bool:
14
+ """Mirror ``Number.isInteger(x) && x > 0`` (accepts integral floats)."""
15
+ if isinstance(value, bool):
16
+ return False
17
+ if isinstance(value, int):
18
+ return value > 0
19
+ if isinstance(value, float):
20
+ return value.is_integer() and value > 0
21
+ return False
22
+
23
+
24
def _require_dims(dims: object) -> int:
    """Return ``dims`` as an ``int`` after checking it is a positive whole number.

    Raises:
        ValueError: If ``dims`` is ``None`` or anything ``_is_positive_int`` rejects.
    """
    # _is_positive_int already returns False for None, so one check covers both.
    if _is_positive_int(dims):
        return int(dims)  # type: ignore[arg-type]
    raise ValueError(f"dims must be a positive integer to build FT.CREATE args, got: {dims}")
28
+
29
+
30
def _validate_field_names(fields: dict[str, FieldSpec], vector_field_name: str) -> None:
    """Reject schema field names that cannot be indexed safely.

    Names are checked in iteration order and the first offending name raises.

    Raises:
        ValueError: If a name is empty, equals ``vector_field_name``, or is one
            of ``RESERVED_FIELD_NAMES``.
    """
    for candidate in fields:
        if len(candidate) == 0:
            problem = "Invalid field name: empty field name is not allowed"
        elif candidate == vector_field_name:
            problem = (
                f"Field name '{candidate}' collides with the vector field name "
                f"'{vector_field_name}'"
            )
        elif candidate in RESERVED_FIELD_NAMES:
            problem = f"Field name '{candidate}' is reserved and cannot be used in the schema"
        else:
            continue
        raise ValueError(problem)
40
+
41
+
42
+ def _validate_text_field_capabilities(
43
+ fields: dict[str, FieldSpec], capabilities: FtCapabilities | None
44
+ ) -> None:
45
+ if not (capabilities is not None and capabilities.get("textFields") is False):
46
+ return
47
+ text_field_names = [name for name, spec in fields.items() if spec.get("type") == "text"]
48
+ if text_field_names:
49
+ raise ValueError(f"Text fields require valkey-search >= 1.2: {', '.join(text_field_names)}")
50
+
51
+
52
+ def _validate_flat_hnsw_params(vector: VectorSpec) -> None:
53
+ if vector.get("algorithm") != "flat":
54
+ return
55
+ if vector.get("m") is not None:
56
+ raise ValueError("FLAT algorithm does not support 'm' parameter")
57
+ if vector.get("efConstruction") is not None:
58
+ raise ValueError("FLAT algorithm does not support 'efConstruction' parameter")
59
+ if vector.get("efRuntime") is not None:
60
+ raise ValueError("FLAT algorithm does not support 'efRuntime' parameter")
61
+
62
+
63
+ def _build_field_args(name: str, spec: FieldSpec) -> list[str]:
64
+ field_type = spec.get("type")
65
+ if field_type == "text":
66
+ return [name, "TEXT"]
67
+ if field_type == "tag":
68
+ args = [name, "TAG"]
69
+ separator = spec.get("separator")
70
+ if separator is not None:
71
+ args.extend(["SEPARATOR", separator])
72
+ return args
73
+ args = [name, "NUMERIC"]
74
+ if spec.get("sortable") is True:
75
+ args.append("SORTABLE")
76
+ return args
77
+
78
+
79
def resolve_vector_field_name(vector: VectorSpec) -> str:
    """Return the hash field that stores the embedding.

    Falls back to ``"embedding"`` when ``fieldName`` is absent or ``None``.
    A configured name is returned exactly as given (not stripped).

    Raises:
        ValueError: If ``fieldName`` is empty or whitespace-only.
    """
    configured = vector.get("fieldName")
    if configured is None:
        return "embedding"
    if configured.strip():
        return configured
    raise ValueError(
        f"Vector field name must not be empty or whitespace-only, got: '{configured}'"
    )
88
+
89
+
90
def _build_vector_args(vector: VectorSpec, dims: int) -> list[str]:
    """Build the ``VECTOR`` field tokens for FT.CREATE.

    Args:
        vector: Vector spec. ``algorithm`` must be a key of ``_ALGORITHM_MAP``
            and ``metric`` a key of ``_METRIC_MAP``. For HNSW, ``m``,
            ``efConstruction`` and ``efRuntime`` default to ``_HNSW_DEFAULTS``
            when absent or ``None``.
        dims: Embedding dimension, emitted as ``DIM``.

    Returns:
        ``[field, "VECTOR", ALGO, attribute_count, *attributes]``. The
        attribute count is 6 for FLAT and 12 for HNSW.

    Raises:
        ValueError: If the field name is blank, or the algorithm or metric is
            missing or unsupported.
    """
    field_name = resolve_vector_field_name(vector)

    algorithm = vector.get("algorithm")
    if algorithm not in _ALGORITHM_MAP:
        raise ValueError(
            f"Vector algorithm must be one of {sorted(_ALGORITHM_MAP)}, got: {algorithm!r}"
        )
    # Validate the metric the same way as the algorithm: a bad or missing value
    # previously escaped as a bare KeyError from the lookup.
    metric = vector.get("metric")
    if metric not in _METRIC_MAP:
        raise ValueError(
            f"Vector metric must be one of {sorted(_METRIC_MAP)}, got: {metric!r}"
        )

    attributes = [
        "TYPE",
        "FLOAT32",
        "DIM",
        str(dims),
        "DISTANCE_METRIC",
        _METRIC_MAP[metric],
    ]
    if algorithm != "flat":
        # HNSW tuning knobs, in the order FT.CREATE receives them.
        for token, key in (
            ("M", "m"),
            ("EF_CONSTRUCTION", "efConstruction"),
            ("EF_RUNTIME", "efRuntime"),
        ):
            setting = vector.get(key)
            if setting is None:
                setting = _HNSW_DEFAULTS[key]
            attributes += [token, str(setting)]

    # The count token must equal the number of attribute tokens that follow.
    return [field_name, "VECTOR", _ALGORITHM_MAP[algorithm], str(len(attributes)), *attributes]
142
+
143
+
144
def index_name(name: str) -> str:
    """Return the search index name for a retriever: ``<name>:idx``.

    Raises:
        ValueError: If ``name`` is empty or whitespace-only.
    """
    if not name.strip():
        raise ValueError(f"Index name must not be empty or whitespace-only, got: '{name}'")
    return "{}:idx".format(name)
148
+
149
+
150
def key_prefix(name: str) -> str:
    """Return the key prefix for a retriever's documents: ``<name>:``.

    Raises:
        ValueError: If ``name`` is empty or whitespace-only.
    """
    if not name.strip():
        raise ValueError(f"Index name must not be empty or whitespace-only, got: '{name}'")
    return "{}:".format(name)
154
+
155
+
156
def build_ft_create_args(
    name: str,
    schema: RetrievalSchema,
    capabilities: FtCapabilities | None = None,
) -> list[str]:
    """Build the full FT.CREATE argument list for a retriever index.

    Args:
        name: Retriever name; yields the index name and the hash key prefix.
        schema: Retrieval schema with ``fields`` and ``vector``; the vector
            spec must already carry a positive integer ``dims``.
        capabilities: Optional server capabilities used to reject text fields
            when they are reported as unsupported.

    Returns:
        ``[index, "ON", "HASH", "PREFIX", "1", prefix, "SCHEMA", ...]``
        followed by the field tokens and then the vector tokens.

    Raises:
        ValueError: If the name is blank or any schema validation fails.
    """
    if not name.strip():
        raise ValueError(f"Index name must not be empty or whitespace-only, got: '{name}'")

    vector_spec = schema["vector"]
    dims = _require_dims(vector_spec.get("dims"))
    vector_field = resolve_vector_field_name(vector_spec)

    declared_fields = schema["fields"]
    _validate_field_names(declared_fields, vector_field)
    _validate_text_field_capabilities(declared_fields, capabilities)
    _validate_flat_hnsw_params(vector_spec)

    # Regular fields first, in schema order, then the vector field last.
    schema_args: list[str] = [
        token
        for field, spec in declared_fields.items()
        for token in _build_field_args(field, spec)
    ]
    schema_args.extend(_build_vector_args(vector_spec, dims))

    return [
        index_name(name),
        "ON",
        "HASH",
        "PREFIX",
        "1",
        key_prefix(name),
        "SCHEMA",
        *schema_args,
    ]
@@ -0,0 +1,51 @@
1
+ from __future__ import annotations
2
+
3
+ import math
4
+
5
+ from betterdb_valkey_search_kit import escape_tag
6
+
7
+ from .fields import SCORE_FIELD
8
+ from .ft_create import resolve_vector_field_name
9
+ from .schema import RetrievalSchema
10
+
11
+ QueryFilter = dict[str, "str | int | float"]
12
+
13
+
14
+ def _is_number(value: object) -> bool:
15
+ return isinstance(value, (int, float)) and not isinstance(value, bool)
16
+
17
+
18
def _build_filter_clause(field: str, value: str | int | float, schema: RetrievalSchema) -> str:
    """Render one filter entry as a query clause.

    Tag fields become ``@field:{escaped}`` (the value is stringified and passed
    through ``escape_tag``); numeric fields become the exact-match range
    ``@field:[value value]``.

    Raises:
        ValueError: If the field is not in the schema, if a numeric filter
            value is not a finite number, or if the field is neither tag nor
            numeric.
    """
    spec = schema["fields"].get(field)
    if spec is None:
        raise ValueError(f"Cannot filter on unknown field '{field}'")

    kind = spec.get("type")
    if kind == "tag":
        escaped = escape_tag(str(value))
        return f"@{field}:{{{escaped}}}"

    if kind != "numeric":
        raise ValueError(
            f"Cannot filter on TEXT field '{field}'; only tag and numeric fields are filterable"
        )

    if not _is_number(value):
        raise ValueError(
            f"Numeric filter on field '{field}' requires a number, got: {type(value).__name__}"
        )
    if isinstance(value, float) and not math.isfinite(value):
        raise ValueError(
            f"Numeric filter on field '{field}' requires a finite number, got: {value!r}"
        )
    return f"@{field}:[{value} {value}]"
38
+
39
+
40
def build_ft_search_query(
    schema: RetrievalSchema,
    k: int,
    filter: QueryFilter | None = None,
) -> str:
    """Build the KNN query string for FT.SEARCH.

    Args:
        schema: Retrieval schema; supplies the vector field name and the
            field types used to render filters.
        k: Number of neighbours, interpolated into the ``KNN`` clause as-is.
        filter: Optional field-to-value mapping; clauses are joined with
            spaces inside parentheses. ``None`` or an empty mapping yields the
            match-all prefix ``*``.

    Returns:
        ``<prefilter>=>[KNN k @<vector field> $vec AS <score field>]``.
    """
    vector_field = resolve_vector_field_name(schema["vector"])
    conditions: list[str] = (
        []
        if filter is None
        else [_build_filter_clause(name, wanted, schema) for name, wanted in filter.items()]
    )
    prefilter = "(" + " ".join(conditions) + ")" if conditions else "*"
    return f"{prefilter}=>[KNN {k} @{vector_field} $vec AS {SCORE_FIELD}]"
@@ -0,0 +1,46 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Any, Callable, Optional
5
+
6
+
7
@dataclass
class IndexHealthSnapshot:
    """Point-in-time statistics describing a search index."""

    # Index name.
    name: str
    # Document count.
    num_docs: int
    # Indexing state, as a string.
    indexing_state: str
    # Vector dimension.
    dims: int
    # Percent of the index that has been built.
    percent_indexed: float
    # Estimated recall; stays None unless a value is supplied.
    estimated_recall: Optional[float] = None
15
+
16
+
17
+ RecallEstimator = Callable[[IndexHealthSnapshot], float]
18
+
19
+ _PERCENT_INDEXED_KEYS = ("percent_indexed", "backfill_complete_percent")
20
+
21
+
22
+ def _s(x: Any) -> str:
23
+ if isinstance(x, bytes):
24
+ try:
25
+ return x.decode()
26
+ except UnicodeDecodeError:
27
+ return ""
28
+ return str(x)
29
+
30
+
31
def parse_percent_indexed(info: list[Any]) -> float:
    """Extract the percent-indexed value from a raw FT.INFO reply.

    ``info`` is read as a flat ``[key, value, key, value, ...]`` list, and the
    first key found in ``_PERCENT_INDEXED_KEYS`` decides the result.
    valkey-search/RediSearch report either a 0-1 fraction or a 0-100
    percentage depending on the version; both are normalized to 0-100.

    Args:
        info: Raw FT.INFO reply; keys and values may be ``str`` or ``bytes``.

    Returns:
        The percentage, or ``0.0`` if the field is absent, unparseable, or not
        a finite number.
    """
    import math  # function-scope: this module has no top-level ``math`` import

    for i in range(0, len(info) - 1, 2):
        if _s(info[i]) not in _PERCENT_INDEXED_KEYS:
            continue
        try:
            value = float(_s(info[i + 1]))
        except ValueError:
            return 0.0
        # float() accepts "nan" and "inf"; without this guard they would be
        # handed back to the caller as a bogus percentage.
        if not math.isfinite(value):
            return 0.0
        # Values up to 1 are treated as fractions and scaled to a percentage.
        return value * 100 if value <= 1 else value
    return 0.0