patchvec 0.5.8__tar.gz → 0.5.8.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {patchvec-0.5.8/patchvec.egg-info → patchvec-0.5.8.1}/PKG-INFO +1 -1
- {patchvec-0.5.8 → patchvec-0.5.8.1}/README.md +78 -52
- {patchvec-0.5.8 → patchvec-0.5.8.1/patchvec.egg-info}/PKG-INFO +1 -1
- {patchvec-0.5.8 → patchvec-0.5.8.1}/pave/main.py +1 -1
- {patchvec-0.5.8 → patchvec-0.5.8.1}/pave/preprocess.py +2 -1
- {patchvec-0.5.8 → patchvec-0.5.8.1}/pave/service.py +9 -7
- {patchvec-0.5.8 → patchvec-0.5.8.1}/pave/stores/base.py +3 -1
- {patchvec-0.5.8 → patchvec-0.5.8.1}/pave/stores/qdrant_store.py +2 -1
- {patchvec-0.5.8 → patchvec-0.5.8.1}/pave/stores/txtai_store.py +3 -8
- {patchvec-0.5.8 → patchvec-0.5.8.1}/setup.py +1 -1
- {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_cli.py +31 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_txtai_store.py +30 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/ABOUT.md +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/LICENSE +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/MANIFEST.in +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/config.yml.example +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/patchvec.egg-info/SOURCES.txt +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/patchvec.egg-info/dependency_links.txt +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/patchvec.egg-info/entry_points.txt +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/patchvec.egg-info/requires.txt +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/patchvec.egg-info/top_level.txt +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/pave/__init__.py +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/pave/assets/__init__.py +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/pave/assets/patchvec_icon_192.png +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/pave/assets/ui.html +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/pave/auth.py +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/pave/cli.py +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/pave/config.py +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/pave/embedders/__init__.py +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/pave/embedders/base.py +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/pave/embedders/factory.py +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/pave/embedders/openai_emb.py +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/pave/embedders/sbert_emb.py +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/pave/embedders/txtai_emb.py +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/pave/log.py +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/pave/meta_store.py +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/pave/metrics.py +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/pave/schemas.py +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/pave/stores/__init__.py +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/pave/stores/factory.py +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/pave/ui.py +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/requirements-cpu.txt +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/setup.cfg +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_admin_tenants.py +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_auth.py +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_auth_api.py +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_collection_rename.py +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_collections.py +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_config_runtime.py +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_csv_ingest.py +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_csv_ingest_api.py +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_data_export.py +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_delete_document.py +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_docid_default.py +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_health.py +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_ingest_errors.py +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_ingest_size_limit.py +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_lazy_app.py +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_list_collections.py +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_log.py +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_meta_store.py +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_metrics.py +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_request_id.py +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_search_errors.py +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_tenant_rate_limit.py +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_txtai_concurrent_upsert.py +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_txtai_store_cache_race.py +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_txtai_store_catalog_metrics.py +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_txtai_store_close_race.py +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_txtai_store_filters.py +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_txtai_store_meta_fetch_scope.py +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_txtai_store_sql_safety.py +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_ui.py +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_upload_search_csv.py +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_upload_search_pdf.py +0 -0
- {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_upload_search_txt.py +0 -0
|
@@ -1,31 +1,39 @@
|
|
|
1
1
|
<!-- (C) 2025, 2026 Rodrigo Rodrigues da Silva <rodrigo@flowlexi.com> -->
|
|
2
2
|
<!-- SPDX-License-Identifier: AGPL-3.0-or-later -->
|
|
3
3
|
|
|
4
|
-
# 🍰 PatchVec —
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
document id, page,
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
- **
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
- **
|
|
20
|
-
|
|
4
|
+
# 🍰 PatchVec — Vector Search You Can Understand
|
|
5
|
+
|
|
6
|
+
PatchVec is a single-process vector search engine that ingests your
|
|
7
|
+
documents, chunks and embeds them, and gives you semantic search with
|
|
8
|
+
full provenance — document id, page, character offset, and the exact
|
|
9
|
+
snippet that matched. No cluster, no managed service, no
|
|
10
|
+
opaque pipelines.
|
|
11
|
+
|
|
12
|
+
Drop a file in, search it, see exactly what came back and why.
|
|
13
|
+
|
|
14
|
+
## ⚙️ Why PatchVec
|
|
15
|
+
|
|
16
|
+
- **Ingest files, not embeddings** — hand it a PDF, CSV, or TXT and
|
|
17
|
+
PatchVec chunks, embeds, and indexes it. No preprocessing pipeline
|
|
18
|
+
to build.
|
|
19
|
+
- **Full provenance on every hit** — every search result traces back
|
|
20
|
+
to a document, page, and character offset. Latency and request
|
|
21
|
+
traceability are built into every response.
|
|
22
|
+
- **Multi-tenant by default** — tenant/collection namespacing is
|
|
23
|
+
built in, not bolted on.
|
|
24
|
+
- **REST, CLI, or embed it** — run as an HTTP service, script via
|
|
25
|
+
the CLI, or import the library directly in your Python app.
|
|
26
|
+
- **Pluggable embeddings** — swap models per collection; wire in
|
|
27
|
+
local or hosted embedding backends.
|
|
21
28
|
|
|
22
29
|
## 🧭 Workflows
|
|
23
30
|
|
|
24
31
|
### 🐳 Docker workflow (prebuilt images)
|
|
25
32
|
|
|
26
|
-
Pull the image that fits your hardware from the
|
|
27
|
-
Container Registry
|
|
28
|
-
|
|
33
|
+
Pull the image that fits your hardware from the
|
|
34
|
+
[Flowlexi Container Registry](https://gitlab.com/flowlexi/patchvec/container_registry)
|
|
35
|
+
on GitLab (CUDA builds publish as `latest-gpu`, CPU-only as
|
|
36
|
+
`latest-cpu`).
|
|
29
37
|
|
|
30
38
|
```bash
|
|
31
39
|
docker pull registry.gitlab.com/flowlexi/patchvec/patchvec:latest-gpu
|
|
@@ -66,20 +74,20 @@ local configuration directory.
|
|
|
66
74
|
**Requires Python 3.10–3.14.**
|
|
67
75
|
|
|
68
76
|
```bash
|
|
69
|
-
mkdir -p ~/pv && cd ~/pv
|
|
77
|
+
mkdir -p ~/pv && cd ~/pv # or wherever
|
|
70
78
|
python -m venv .venv-pv
|
|
71
79
|
source .venv-pv/bin/activate
|
|
72
80
|
python -m pip install --upgrade pip
|
|
73
81
|
pip install "patchvec[cpu]"
|
|
74
82
|
|
|
75
83
|
# grab the default configs
|
|
76
|
-
curl -LO https://raw.githubusercontent.com/
|
|
77
|
-
curl -LO https://raw.githubusercontent.com/
|
|
84
|
+
curl -LO https://raw.githubusercontent.com/rodrigopitanga/patchvec/main/config.yml.example
|
|
85
|
+
curl -LO https://raw.githubusercontent.com/rodrigopitanga/patchvec/main/tenants.yml.example
|
|
78
86
|
cp config.yml.example config.yml
|
|
79
87
|
cp tenants.yml.example tenants.yml
|
|
80
88
|
|
|
81
89
|
# sample demo corpus
|
|
82
|
-
curl -LO https://raw.githubusercontent.com/
|
|
90
|
+
curl -LO https://raw.githubusercontent.com/rodrigopitanga/patchvec/main/demo/20k_leagues.txt
|
|
83
91
|
|
|
84
92
|
# point Patchvec at the config directory and set a local admin key
|
|
85
93
|
export PATCHVEC_CONFIG="$HOME/pv/config.yml"
|
|
@@ -129,8 +137,34 @@ curl -H "Authorization: Bearer $PATCHVEC_GLOBAL_KEY" \
|
|
|
129
137
|
"http://localhost:8086/collections/demo/books/search?q=captain+nemo&k=3"
|
|
130
138
|
```
|
|
131
139
|
|
|
132
|
-
|
|
133
|
-
|
|
140
|
+
Every hit comes back with provenance you can trace, plus latency
|
|
141
|
+
and request id for observability:
|
|
142
|
+
|
|
143
|
+
```json
|
|
144
|
+
{
|
|
145
|
+
"matches": [
|
|
146
|
+
{
|
|
147
|
+
"id": "verne-20k::chunk_42",
|
|
148
|
+
"score": 0.82,
|
|
149
|
+
"text": "Captain Nemo conducted me to the central staircase ...",
|
|
150
|
+
"tenant": "demo",
|
|
151
|
+
"collection": "books",
|
|
152
|
+
"match_reason": "semantic",
|
|
153
|
+
"meta": {
|
|
154
|
+
"docid": "verne-20k",
|
|
155
|
+
"filename": "20k_leagues.txt",
|
|
156
|
+
"offset": 21000,
|
|
157
|
+
"lang": "en",
|
|
158
|
+
"ingested_at": "2026-03-07T12:00:00Z"
|
|
159
|
+
}
|
|
160
|
+
}
|
|
161
|
+
],
|
|
162
|
+
"latency_ms": 12.4,
|
|
163
|
+
"request_id": "req-5f3a-b812"
|
|
164
|
+
}
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
The Swagger UI is available at `http://localhost:8086/`.
|
|
134
168
|
|
|
135
169
|
Health and metrics endpoints are available at `/health` and `/metrics`.
|
|
136
170
|
|
|
@@ -147,10 +181,15 @@ though), or explicitly delete the document and then ingest it again.
|
|
|
147
181
|
CLI (re-ingest to replace):
|
|
148
182
|
|
|
149
183
|
```bash
|
|
150
|
-
|
|
151
|
-
cp demo/20k_leagues.txt demo/20k_leagues_mod.txt
|
|
152
|
-
echo "THE END" >> demo/20k_leagues_mod.txt
|
|
184
|
+
# initial ingest
|
|
153
185
|
pavecli ingest demo books 20k_leagues.txt --docid=verne-20k
|
|
186
|
+
|
|
187
|
+
# modify the content (filename can change — docid is what matters)
|
|
188
|
+
cp 20k_leagues.txt 20k_leagues_v2.txt
|
|
189
|
+
echo "THE END" >> 20k_leagues_v2.txt
|
|
190
|
+
|
|
191
|
+
# re-ingest with the same docid to replace the indexed content
|
|
192
|
+
pavecli ingest demo books 20k_leagues_v2.txt --docid=verne-20k
|
|
154
193
|
```
|
|
155
194
|
|
|
156
195
|
REST (delete then ingest):
|
|
@@ -169,37 +208,24 @@ curl -H "Authorization: Bearer $PATCHVEC_GLOBAL_KEY" \
|
|
|
169
208
|
|
|
170
209
|
### 🛠️ Developer workflow
|
|
171
210
|
|
|
172
|
-
Building from source relies on
|
|
173
|
-
make serve`, `make test`, etc.).
|
|
174
|
-
task claiming rules live in
|
|
175
|
-
|
|
211
|
+
Building from source relies on `Makefile` shortcuts (`make install-dev`,
|
|
212
|
+
`USE_CPU=1 make serve`, `make test`, `make check`, etc.).
|
|
213
|
+
The full contributor workflow, target reference, and task claiming rules live in
|
|
214
|
+
[CONTRIBUTING.md](CONTRIBUTING.md). Performance benchmarks are documented in
|
|
215
|
+
[README-benchmarks.md](README-benchmarks.md).
|
|
176
216
|
|
|
177
217
|
## Logging
|
|
178
218
|
|
|
179
|
-
PatchVec
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
Controlled by `log.level` in `config.yml` (`DEBUG`, `INFO`, `WARNING`; default
|
|
183
|
-
`INFO`). Namespace-level overrides:
|
|
219
|
+
PatchVec writes human-readable logs to stderr and optionally emits
|
|
220
|
+
structured JSON lines (one per search/ingest/delete) for production
|
|
221
|
+
observability. Enable the ops stream in `config.yml`:
|
|
184
222
|
|
|
185
223
|
```yaml
|
|
186
224
|
log:
|
|
187
|
-
|
|
188
|
-
debug: [pave.stores] # force DEBUG for specific namespaces
|
|
189
|
-
watch: [txtai] # one level more verbose than base
|
|
190
|
-
quiet: [uvicorn] # one level quieter (uvicorn is quieted by default)
|
|
225
|
+
ops_log: stdout # null (off) | stdout | /path/to/ops.jsonl
|
|
191
226
|
```
|
|
192
227
|
|
|
193
|
-
|
|
194
|
-
search, ingest, delete, rename — written to a configurable destination. Off by
|
|
195
|
-
default; `stdout` is recommended for Docker/12-factor deployments (PatchVec
|
|
196
|
-
already uses `stderr` for the dev stream):
|
|
197
|
-
|
|
198
|
-
```yaml
|
|
199
|
-
log:
|
|
200
|
-
ops_log: null # null (off) | stdout | /path/to/ops.jsonl
|
|
201
|
-
access_log: null # uvicorn access log: null (off) | stdout | /path
|
|
202
|
-
```
|
|
228
|
+
See `config.yml.example` for the full logging configuration.
|
|
203
229
|
|
|
204
230
|
## 🗺️ Roadmap
|
|
205
231
|
|
|
@@ -143,8 +143,9 @@ def preprocess(filename: str, content: bytes, csv_options: dict[str, Any] \
|
|
|
143
143
|
yield f"page_{i}", text, {"page": i}
|
|
144
144
|
elif ext == "txt":
|
|
145
145
|
text = content.decode("utf-8", errors="ignore")
|
|
146
|
+
step = max(TXT_CHUNK_SIZE - TXT_CHUNK_OVERLAP, 1)
|
|
146
147
|
for i, chunk in enumerate(_chunks(text)):
|
|
147
|
-
yield f"chunk_{i}", chunk, {"
|
|
148
|
+
yield f"chunk_{i}", chunk, {"offset": i * step}
|
|
148
149
|
elif ext == "csv" or mt == "text/csv":
|
|
149
150
|
yield from _preprocess_csv(filename, content, csv_options or {})
|
|
150
151
|
return
|
|
@@ -160,17 +160,19 @@ def ingest_document(store, tenant: str, collection: str, filename: str, content:
|
|
|
160
160
|
if baseid and store.has_doc(tenant, collection, baseid):
|
|
161
161
|
purged = store.purge_doc(tenant, collection, baseid)
|
|
162
162
|
m_inc("purge_total", purged)
|
|
163
|
-
|
|
163
|
+
meta_from_call = metadata or {}
|
|
164
|
+
now = datetime.now(tz.utc).isoformat(timespec="seconds")
|
|
165
|
+
now = now.replace("+00:00", "Z")
|
|
166
|
+
doc_meta = {
|
|
167
|
+
"docid": baseid, "filename": filename,
|
|
168
|
+
"ingested_at": now, **meta_from_call,
|
|
169
|
+
}
|
|
164
170
|
records = []
|
|
165
171
|
for local_id, text, extra in preprocess(
|
|
166
172
|
filename, content, csv_options=csv_options
|
|
167
173
|
):
|
|
168
174
|
rid = f"{baseid}::{local_id}"
|
|
169
|
-
|
|
170
|
-
now = now.replace("+00:00", "Z")
|
|
171
|
-
meta = {"docid": baseid, "filename": filename, "ingested_at": now}
|
|
172
|
-
meta.update(meta_doc)
|
|
173
|
-
meta.update(extra)
|
|
175
|
+
meta = {**doc_meta, **extra}
|
|
174
176
|
records.append((rid, text, meta))
|
|
175
177
|
if not records:
|
|
176
178
|
return {
|
|
@@ -178,7 +180,7 @@ def ingest_document(store, tenant: str, collection: str, filename: str, content:
|
|
|
178
180
|
"code": "no_text_extracted",
|
|
179
181
|
"error": "no text extracted",
|
|
180
182
|
}
|
|
181
|
-
count = store.index_records(tenant, collection, baseid, records)
|
|
183
|
+
count = store.index_records(tenant, collection, baseid, records, doc_meta)
|
|
182
184
|
m_inc("documents_indexed_total", 1.0)
|
|
183
185
|
m_inc("chunks_indexed_total", float(count or 0))
|
|
184
186
|
latency_ms = round((_time.perf_counter() - _t0) * 1000, 2)
|
|
@@ -58,7 +58,9 @@ class BaseStore(ABC):
|
|
|
58
58
|
|
|
59
59
|
@abstractmethod
|
|
60
60
|
def index_records(self, tenant: str, collection: str, docid: str,
|
|
61
|
-
records: Iterable[Record]
|
|
61
|
+
records: Iterable[Record],
|
|
62
|
+
doc_meta: dict[str, Any] | None = None
|
|
63
|
+
) -> int: ...
|
|
62
64
|
|
|
63
65
|
@abstractmethod
|
|
64
66
|
def search(self, tenant: str, collection: str, query: str, k: int = 5,
|
|
@@ -29,7 +29,8 @@ class QdrantStore(BaseStore):
|
|
|
29
29
|
def purge_doc(self, tenant: str, collection: str, docid: str) -> int:
|
|
30
30
|
raise NotImplementedError("to be implemented")
|
|
31
31
|
|
|
32
|
-
def index_records(self, tenant: str, collection: str, docid: str,
|
|
32
|
+
def index_records(self, tenant: str, collection: str, docid: str,
|
|
33
|
+
records: Iterable[Record], doc_meta: dict | None = None) -> int:
|
|
33
34
|
raise NotImplementedError("to be implemented")
|
|
34
35
|
|
|
35
36
|
def search(self, tenant: str, collection: str, text: str, k: int = 5,
|
|
@@ -525,7 +525,9 @@ class TxtaiStore(BaseStore):
|
|
|
525
525
|
return None
|
|
526
526
|
|
|
527
527
|
def index_records(self, tenant: str, collection: str, docid: str,
|
|
528
|
-
records: Iterable[Record]
|
|
528
|
+
records: Iterable[Record],
|
|
529
|
+
doc_meta: dict[str, Any] | None = None
|
|
530
|
+
) -> int:
|
|
529
531
|
"""
|
|
530
532
|
Ingests records as (rid, text, meta). Guarantees non-null text, coerces
|
|
531
533
|
dict-records, updates SQLite metadata, saves index. Thread critical.
|
|
@@ -537,7 +539,6 @@ class TxtaiStore(BaseStore):
|
|
|
537
539
|
em = self._emb[key]
|
|
538
540
|
prepared: list[tuple[str, Any, str]] = []
|
|
539
541
|
chunk_rows: list[tuple[str, str | None, dict[str, Any]]] = []
|
|
540
|
-
doc_meta: dict[str, Any] = {}
|
|
541
542
|
|
|
542
543
|
for r in records:
|
|
543
544
|
if isinstance(r, dict):
|
|
@@ -569,12 +570,6 @@ class TxtaiStore(BaseStore):
|
|
|
569
570
|
md = {}
|
|
570
571
|
|
|
571
572
|
md["docid"] = docid
|
|
572
|
-
# Capture first occurrence of doc-level meta fields
|
|
573
|
-
if not doc_meta:
|
|
574
|
-
doc_meta = {
|
|
575
|
-
k: v for k, v in md.items()
|
|
576
|
-
if k not in ("chunk", "page", "position", "section")
|
|
577
|
-
}
|
|
578
573
|
|
|
579
574
|
try:
|
|
580
575
|
safe_meta = self._sanit_meta_dict(md)
|
|
@@ -17,7 +17,7 @@ long_description, long_type = read_long_description()
|
|
|
17
17
|
|
|
18
18
|
setup(
|
|
19
19
|
name="patchvec", # external name
|
|
20
|
-
version="0.5.8",
|
|
20
|
+
version="0.5.8.1",
|
|
21
21
|
description="Patchvec — A lightweight, pluggable vector search microservice.",
|
|
22
22
|
long_description=long_description,
|
|
23
23
|
long_description_content_type="text/markdown",
|
|
@@ -32,6 +32,37 @@ def test_cli_ingest_on_fresh_collection_with_empty_index_dir(cli_env, tmp_path):
|
|
|
32
32
|
and c[3] == "DOC1" for c in store.calls)
|
|
33
33
|
assert ("save", tenant, coll) in store.calls
|
|
34
34
|
|
|
35
|
+
|
|
36
|
+
def test_cli_ingest_passes_doc_meta_through_wrapper(cli_env, tmp_path):
|
|
37
|
+
pvcli, store, _ = cli_env
|
|
38
|
+
tenant, coll = "acme", "metawrap"
|
|
39
|
+
sample = tmp_path / "meta.txt"
|
|
40
|
+
sample.write_text("conteúdo de teste", encoding="utf-8")
|
|
41
|
+
|
|
42
|
+
pvcli.main_cli(["create-collection", tenant, coll])
|
|
43
|
+
pvcli.main_cli(
|
|
44
|
+
[
|
|
45
|
+
"ingest", tenant, coll, str(sample),
|
|
46
|
+
"--docid", "DOCMETA",
|
|
47
|
+
"--metadata", '{"lang":"pt","source":"cli"}',
|
|
48
|
+
]
|
|
49
|
+
)
|
|
50
|
+
|
|
51
|
+
calls = [
|
|
52
|
+
c for c in store.calls
|
|
53
|
+
if c[0] == "index_records" and c[1] == tenant
|
|
54
|
+
and c[2] == coll and c[3] == "DOCMETA"
|
|
55
|
+
]
|
|
56
|
+
assert calls
|
|
57
|
+
doc_meta = calls[-1][5]
|
|
58
|
+
assert isinstance(doc_meta, dict)
|
|
59
|
+
assert doc_meta["docid"] == "DOCMETA"
|
|
60
|
+
assert doc_meta["lang"] == "pt"
|
|
61
|
+
assert doc_meta["source"] == "cli"
|
|
62
|
+
assert doc_meta["filename"].endswith("meta.txt")
|
|
63
|
+
assert doc_meta["ingested_at"].endswith("Z")
|
|
64
|
+
|
|
65
|
+
|
|
35
66
|
def test_cli_reingest_same_docid_triggers_purge(cli_env, tmp_path):
|
|
36
67
|
pvcli, store, _ = cli_env
|
|
37
68
|
tenant, coll = "acme", "reupcli"
|
|
@@ -99,6 +99,36 @@ def test_meta_json_and_filters(store):
|
|
|
99
99
|
|
|
100
100
|
# Ensure meta was JSON-encoded internally (FakeEmbeddings asserts this)
|
|
101
101
|
|
|
102
|
+
|
|
103
|
+
def test_doc_level_meta_persists_in_documents_table(store):
|
|
104
|
+
tenant, coll, docid = "acme", "docmeta", "DOCMETA"
|
|
105
|
+
recs = [
|
|
106
|
+
{
|
|
107
|
+
"id": "0",
|
|
108
|
+
"content": "Documento com metadados.",
|
|
109
|
+
"metadata": {"lang": "pt", "chunk": 0},
|
|
110
|
+
},
|
|
111
|
+
]
|
|
112
|
+
doc_meta = {
|
|
113
|
+
"docid": docid,
|
|
114
|
+
"filename": "meta.txt",
|
|
115
|
+
"lang": "pt",
|
|
116
|
+
"source": "api",
|
|
117
|
+
}
|
|
118
|
+
|
|
119
|
+
n = store.index_records(tenant, coll, docid, recs, doc_meta=doc_meta)
|
|
120
|
+
assert n == 1
|
|
121
|
+
|
|
122
|
+
col_db = store.impl._dbs[(tenant, coll)]
|
|
123
|
+
conn = col_db._conn
|
|
124
|
+
assert conn is not None
|
|
125
|
+
row = conn.execute(
|
|
126
|
+
"SELECT meta_json FROM documents WHERE docid=?",
|
|
127
|
+
(docid,),
|
|
128
|
+
).fetchone()
|
|
129
|
+
assert row is not None and row[0]
|
|
130
|
+
assert json.loads(row[0]) == doc_meta
|
|
131
|
+
|
|
102
132
|
def test_purge_doc_removes_ids(store):
|
|
103
133
|
recs = [
|
|
104
134
|
{"id": "y::0", "content": "primeiro", "metadata": {}},
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|