patchvec 0.5.8__tar.gz → 0.5.8.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. {patchvec-0.5.8/patchvec.egg-info → patchvec-0.5.8.1}/PKG-INFO +1 -1
  2. {patchvec-0.5.8 → patchvec-0.5.8.1}/README.md +78 -52
  3. {patchvec-0.5.8 → patchvec-0.5.8.1/patchvec.egg-info}/PKG-INFO +1 -1
  4. {patchvec-0.5.8 → patchvec-0.5.8.1}/pave/main.py +1 -1
  5. {patchvec-0.5.8 → patchvec-0.5.8.1}/pave/preprocess.py +2 -1
  6. {patchvec-0.5.8 → patchvec-0.5.8.1}/pave/service.py +9 -7
  7. {patchvec-0.5.8 → patchvec-0.5.8.1}/pave/stores/base.py +3 -1
  8. {patchvec-0.5.8 → patchvec-0.5.8.1}/pave/stores/qdrant_store.py +2 -1
  9. {patchvec-0.5.8 → patchvec-0.5.8.1}/pave/stores/txtai_store.py +3 -8
  10. {patchvec-0.5.8 → patchvec-0.5.8.1}/setup.py +1 -1
  11. {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_cli.py +31 -0
  12. {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_txtai_store.py +30 -0
  13. {patchvec-0.5.8 → patchvec-0.5.8.1}/ABOUT.md +0 -0
  14. {patchvec-0.5.8 → patchvec-0.5.8.1}/LICENSE +0 -0
  15. {patchvec-0.5.8 → patchvec-0.5.8.1}/MANIFEST.in +0 -0
  16. {patchvec-0.5.8 → patchvec-0.5.8.1}/config.yml.example +0 -0
  17. {patchvec-0.5.8 → patchvec-0.5.8.1}/patchvec.egg-info/SOURCES.txt +0 -0
  18. {patchvec-0.5.8 → patchvec-0.5.8.1}/patchvec.egg-info/dependency_links.txt +0 -0
  19. {patchvec-0.5.8 → patchvec-0.5.8.1}/patchvec.egg-info/entry_points.txt +0 -0
  20. {patchvec-0.5.8 → patchvec-0.5.8.1}/patchvec.egg-info/requires.txt +0 -0
  21. {patchvec-0.5.8 → patchvec-0.5.8.1}/patchvec.egg-info/top_level.txt +0 -0
  22. {patchvec-0.5.8 → patchvec-0.5.8.1}/pave/__init__.py +0 -0
  23. {patchvec-0.5.8 → patchvec-0.5.8.1}/pave/assets/__init__.py +0 -0
  24. {patchvec-0.5.8 → patchvec-0.5.8.1}/pave/assets/patchvec_icon_192.png +0 -0
  25. {patchvec-0.5.8 → patchvec-0.5.8.1}/pave/assets/ui.html +0 -0
  26. {patchvec-0.5.8 → patchvec-0.5.8.1}/pave/auth.py +0 -0
  27. {patchvec-0.5.8 → patchvec-0.5.8.1}/pave/cli.py +0 -0
  28. {patchvec-0.5.8 → patchvec-0.5.8.1}/pave/config.py +0 -0
  29. {patchvec-0.5.8 → patchvec-0.5.8.1}/pave/embedders/__init__.py +0 -0
  30. {patchvec-0.5.8 → patchvec-0.5.8.1}/pave/embedders/base.py +0 -0
  31. {patchvec-0.5.8 → patchvec-0.5.8.1}/pave/embedders/factory.py +0 -0
  32. {patchvec-0.5.8 → patchvec-0.5.8.1}/pave/embedders/openai_emb.py +0 -0
  33. {patchvec-0.5.8 → patchvec-0.5.8.1}/pave/embedders/sbert_emb.py +0 -0
  34. {patchvec-0.5.8 → patchvec-0.5.8.1}/pave/embedders/txtai_emb.py +0 -0
  35. {patchvec-0.5.8 → patchvec-0.5.8.1}/pave/log.py +0 -0
  36. {patchvec-0.5.8 → patchvec-0.5.8.1}/pave/meta_store.py +0 -0
  37. {patchvec-0.5.8 → patchvec-0.5.8.1}/pave/metrics.py +0 -0
  38. {patchvec-0.5.8 → patchvec-0.5.8.1}/pave/schemas.py +0 -0
  39. {patchvec-0.5.8 → patchvec-0.5.8.1}/pave/stores/__init__.py +0 -0
  40. {patchvec-0.5.8 → patchvec-0.5.8.1}/pave/stores/factory.py +0 -0
  41. {patchvec-0.5.8 → patchvec-0.5.8.1}/pave/ui.py +0 -0
  42. {patchvec-0.5.8 → patchvec-0.5.8.1}/requirements-cpu.txt +0 -0
  43. {patchvec-0.5.8 → patchvec-0.5.8.1}/setup.cfg +0 -0
  44. {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_admin_tenants.py +0 -0
  45. {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_auth.py +0 -0
  46. {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_auth_api.py +0 -0
  47. {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_collection_rename.py +0 -0
  48. {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_collections.py +0 -0
  49. {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_config_runtime.py +0 -0
  50. {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_csv_ingest.py +0 -0
  51. {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_csv_ingest_api.py +0 -0
  52. {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_data_export.py +0 -0
  53. {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_delete_document.py +0 -0
  54. {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_docid_default.py +0 -0
  55. {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_health.py +0 -0
  56. {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_ingest_errors.py +0 -0
  57. {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_ingest_size_limit.py +0 -0
  58. {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_lazy_app.py +0 -0
  59. {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_list_collections.py +0 -0
  60. {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_log.py +0 -0
  61. {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_meta_store.py +0 -0
  62. {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_metrics.py +0 -0
  63. {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_request_id.py +0 -0
  64. {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_search_errors.py +0 -0
  65. {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_tenant_rate_limit.py +0 -0
  66. {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_txtai_concurrent_upsert.py +0 -0
  67. {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_txtai_store_cache_race.py +0 -0
  68. {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_txtai_store_catalog_metrics.py +0 -0
  69. {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_txtai_store_close_race.py +0 -0
  70. {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_txtai_store_filters.py +0 -0
  71. {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_txtai_store_meta_fetch_scope.py +0 -0
  72. {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_txtai_store_sql_safety.py +0 -0
  73. {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_ui.py +0 -0
  74. {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_upload_search_csv.py +0 -0
  75. {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_upload_search_pdf.py +0 -0
  76. {patchvec-0.5.8 → patchvec-0.5.8.1}/tests/test_upload_search_txt.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: patchvec
3
- Version: 0.5.8
3
+ Version: 0.5.8.1
4
4
  Summary: Patchvec — A lightweight, pluggable vector search microservice.
5
5
  Author: Rodrigo Rodrigues da Silva
6
6
  Author-email: rodrigo@flowlexi.com
@@ -1,31 +1,39 @@
1
1
  <!-- (C) 2025, 2026 Rodrigo Rodrigues da Silva <rodrigo@flowlexi.com> -->
2
2
  <!-- SPDX-License-Identifier: AGPL-3.0-or-later -->
3
3
 
4
- # 🍰 PatchVec — Lightweight, Pluggable Vector Search Microservice
5
-
6
- Patchvec is a compact vector store built for people who want provenance and fast
7
- iteration on RAG plumbing. No black boxes, no hidden pipelines: every chunk records
8
- document id, page, and byte offsets, and you can swap embeddings or storage backends per
9
- collection.
10
-
11
- ## ⚙️ Core capabilities
12
-
13
- - **Docker images** — prebuilt CPU/GPU images published to the GitLab Container
14
- Registry.
15
- - **Tenants and collections** — isolation by tenant with per-collection configuration.
16
- - **Pluggable embeddings** — choose the embedding adapter per collection; wire in local
17
- or hosted models.
18
- - **REST and CLI** — production use over HTTP, quick experiments with the bundled CLI.
19
- - **Deterministic provenance** every hit returns doc id, page, offset, and snippet for
20
- traceability.
4
+ # 🍰 PatchVec — Vector Search You Can Understand
5
+
6
+ PatchVec is a single-process vector search engine that ingests your
7
+ documents, chunks and embeds them, and gives you semantic search with
8
+ full provenance — document id, page, character offset, and the exact
9
+ snippet that matched. No cluster, no managed service, no
10
+ opaque pipelines.
11
+
12
+ Drop a file in, search it, see exactly what came back and why.
13
+
14
+ ## ⚙️ Why PatchVec
15
+
16
+ - **Ingest files, not embeddings** — hand it a PDF, CSV, or TXT and
17
+ PatchVec chunks, embeds, and indexes it. No preprocessing pipeline
18
+ to build.
19
+ **Full provenance on every hit** — every search result traces back
20
+ to a document, page, and character offset. Latency and request
21
+ traceability are built into every response.
22
+ - **Multi-tenant by default** — tenant/collection namespacing is
23
+ built in, not bolted on.
24
+ - **REST, CLI, or embed it** — run as an HTTP service, script via
25
+ the CLI, or import the library directly in your Python app.
26
+ - **Pluggable embeddings** — swap models per collection; wire in
27
+ local or hosted embedding backends.
21
28
 
22
29
  ## 🧭 Workflows
23
30
 
24
31
  ### 🐳 Docker workflow (prebuilt images)
25
32
 
26
- Pull the image that fits your hardware from the [https://gitlab.com/flowlexi](Flowlexi)
27
- Container Registry on Gitlab (CUDA builds publish as `latest-gpu`, CPU-only as `latest-
28
- cpu`).
33
+ Pull the image that fits your hardware from the
34
+ [Flowlexi Container Registry](https://gitlab.com/flowlexi/patchvec/container_registry)
35
+ on GitLab (CUDA builds publish as `latest-gpu`, CPU-only as
36
+ `latest-cpu`).
29
37
 
30
38
  ```bash
31
39
  docker pull registry.gitlab.com/flowlexi/patchvec/patchvec:latest-gpu
@@ -66,20 +74,20 @@ local configuration directory.
66
74
  **Requires Python 3.10–3.14.**
67
75
 
68
76
  ```bash
69
- mkdir -p ~/pv && cd ~/pv #or wherever
77
+ mkdir -p ~/pv && cd ~/pv # or wherever
70
78
  python -m venv .venv-pv
71
79
  source .venv-pv/bin/activate
72
80
  python -m pip install --upgrade pip
73
81
  pip install "patchvec[cpu]"
74
82
 
75
83
  # grab the default configs
76
- curl -LO https://raw.githubusercontent.com/patchvec/patchvec/main/config.yml.example
77
- curl -LO https://raw.githubusercontent.com/patchvec/patchvec/main/tenants.yml.example
84
+ curl -LO https://raw.githubusercontent.com/rodrigopitanga/patchvec/main/config.yml.example
85
+ curl -LO https://raw.githubusercontent.com/rodrigopitanga/patchvec/main/tenants.yml.example
78
86
  cp config.yml.example config.yml
79
87
  cp tenants.yml.example tenants.yml
80
88
 
81
89
  # sample demo corpus
82
- curl -LO https://raw.githubusercontent.com/patchvec/patchvec/main/demo/20k_leagues.txt
90
+ curl -LO https://raw.githubusercontent.com/rodrigopitanga/patchvec/main/demo/20k_leagues.txt
83
91
 
84
92
  # point Patchvec at the config directory and set a local admin key
85
93
  export PATCHVEC_CONFIG="$HOME/pv/config.yml"
@@ -129,8 +137,34 @@ curl -H "Authorization: Bearer $PATCHVEC_GLOBAL_KEY" \
129
137
  "http://localhost:8086/collections/demo/books/search?q=captain+nemo&k=3"
130
138
  ```
131
139
 
132
- There is a simple Swagger UI available at the root of the server. Just point your
133
- browser to `http://localhost:8086/`
140
+ Every hit comes back with provenance you can trace, plus latency
141
+ and request id for observability:
142
+
143
+ ```json
144
+ {
145
+ "matches": [
146
+ {
147
+ "id": "verne-20k::chunk_42",
148
+ "score": 0.82,
149
+ "text": "Captain Nemo conducted me to the central staircase ...",
150
+ "tenant": "demo",
151
+ "collection": "books",
152
+ "match_reason": "semantic",
153
+ "meta": {
154
+ "docid": "verne-20k",
155
+ "filename": "20k_leagues.txt",
156
+ "offset": 21000,
157
+ "lang": "en",
158
+ "ingested_at": "2026-03-07T12:00:00Z"
159
+ }
160
+ }
161
+ ],
162
+ "latency_ms": 12.4,
163
+ "request_id": "req-5f3a-b812"
164
+ }
165
+ ```
166
+
167
+ The Swagger UI is available at `http://localhost:8086/`.
134
168
 
135
169
  Health and metrics endpoints are available at `/health` and `/metrics`.
136
170
 
@@ -147,10 +181,15 @@ though), or explicitly delete the document and then ingest it again.
147
181
  CLI (re-ingest to replace):
148
182
 
149
183
  ```bash
150
- pavecli ingest demo books demo/20k_leagues.txt --docid=verne-20k
151
- cp demo/20k_leagues.txt demo/20k_leagues_mod.txt
152
- echo "THE END" >> demo/20k_leagues_mod.txt
184
+ # initial ingest
153
185
  pavecli ingest demo books 20k_leagues.txt --docid=verne-20k
186
+
187
+ # modify the content (filename can change — docid is what matters)
188
+ cp 20k_leagues.txt 20k_leagues_v2.txt
189
+ echo "THE END" >> 20k_leagues_v2.txt
190
+
191
+ # re-ingest with the same docid to replace the indexed content
192
+ pavecli ingest demo books 20k_leagues_v2.txt --docid=verne-20k
154
193
  ```
155
194
 
156
195
  REST (delete then ingest):
@@ -169,37 +208,24 @@ curl -H "Authorization: Bearer $PATCHVEC_GLOBAL_KEY" \
169
208
 
170
209
  ### 🛠️ Developer workflow
171
210
 
172
- Building from source relies on the `Makefile` shortcuts (`make install-dev`, `USE_CPU=1
173
- make serve`, `make test`, etc.). The full contributor workflow, target reference, and
174
- task claiming rules live in [CONTRIBUTING.md](CONTRIBUTING.md). Performance benchmarks
175
- are documented in [README-benchmarks.md](README-benchmarks.md).
211
+ Building from source relies on `Makefile` shortcuts (`make install-dev`,
212
+ `USE_CPU=1 make serve`, `make test`, `make check`, etc.).
213
+ The full contributor workflow, target reference, and task claiming rules live in
214
+ [CONTRIBUTING.md](CONTRIBUTING.md). Performance benchmarks are documented in
215
+ [README-benchmarks.md](README-benchmarks.md).
176
216
 
177
217
  ## Logging
178
218
 
179
- PatchVec emits two independent log streams.
180
-
181
- **Dev stream** (stderr, always on): human-readable text, colored in TTY.
182
- Controlled by `log.level` in `config.yml` (`DEBUG`, `INFO`, `WARNING`; default
183
- `INFO`). Namespace-level overrides:
219
+ PatchVec writes human-readable logs to stderr and optionally emits
220
+ structured JSON lines (one per search/ingest/delete) for production
221
+ observability. Enable the ops stream in `config.yml`:
184
222
 
185
223
  ```yaml
186
224
  log:
187
- level: INFO
188
- debug: [pave.stores] # force DEBUG for specific namespaces
189
- watch: [txtai] # one level more verbose than base
190
- quiet: [uvicorn] # one level quieter (uvicorn is quieted by default)
225
+ ops_log: stdout # null (off) | stdout | /path/to/ops.jsonl
191
226
  ```
192
227
 
193
- **Ops stream**: one JSON line per operation —
194
- search, ingest, delete, rename — written to a configurable destination. Off by
195
- default; `stdout` is recommended for Docker/12-factor deployments (PatchVec
196
- already uses `stderr` for the dev stream):
197
-
198
- ```yaml
199
- log:
200
- ops_log: null # null (off) | stdout | /path/to/ops.jsonl
201
- access_log: null # uvicorn access log: null (off) | stdout | /path
202
- ```
228
+ See `config.yml.example` for the full logging configuration.
203
229
 
204
230
  ## 🗺️ Roadmap
205
231
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: patchvec
3
- Version: 0.5.8
3
+ Version: 0.5.8.1
4
4
  Summary: Patchvec — A lightweight, pluggable vector search microservice.
5
5
  Author: Rodrigo Rodrigues da Silva
6
6
  Author-email: rodrigo@flowlexi.com
@@ -40,7 +40,7 @@ from pave.ui import attach_ui
40
40
  import pave.log as ops_log
41
41
  from pave.log import ops_event
42
42
 
43
- VERSION = "0.5.8"
43
+ VERSION = "0.5.8.1"
44
44
 
45
45
 
46
46
  def _hw_info() -> dict:
@@ -143,8 +143,9 @@ def preprocess(filename: str, content: bytes, csv_options: dict[str, Any] \
143
143
  yield f"page_{i}", text, {"page": i}
144
144
  elif ext == "txt":
145
145
  text = content.decode("utf-8", errors="ignore")
146
+ step = max(TXT_CHUNK_SIZE - TXT_CHUNK_OVERLAP, 1)
146
147
  for i, chunk in enumerate(_chunks(text)):
147
- yield f"chunk_{i}", chunk, {"chunk": i}
148
+ yield f"chunk_{i}", chunk, {"offset": i * step}
148
149
  elif ext == "csv" or mt == "text/csv":
149
150
  yield from _preprocess_csv(filename, content, csv_options or {})
150
151
  return
@@ -160,17 +160,19 @@ def ingest_document(store, tenant: str, collection: str, filename: str, content:
160
160
  if baseid and store.has_doc(tenant, collection, baseid):
161
161
  purged = store.purge_doc(tenant, collection, baseid)
162
162
  m_inc("purge_total", purged)
163
- meta_doc = metadata or {}
163
+ meta_from_call = metadata or {}
164
+ now = datetime.now(tz.utc).isoformat(timespec="seconds")
165
+ now = now.replace("+00:00", "Z")
166
+ doc_meta = {
167
+ "docid": baseid, "filename": filename,
168
+ "ingested_at": now, **meta_from_call,
169
+ }
164
170
  records = []
165
171
  for local_id, text, extra in preprocess(
166
172
  filename, content, csv_options=csv_options
167
173
  ):
168
174
  rid = f"{baseid}::{local_id}"
169
- now = datetime.now(tz.utc).isoformat(timespec="seconds")
170
- now = now.replace("+00:00", "Z")
171
- meta = {"docid": baseid, "filename": filename, "ingested_at": now}
172
- meta.update(meta_doc)
173
- meta.update(extra)
175
+ meta = {**doc_meta, **extra}
174
176
  records.append((rid, text, meta))
175
177
  if not records:
176
178
  return {
@@ -178,7 +180,7 @@ def ingest_document(store, tenant: str, collection: str, filename: str, content:
178
180
  "code": "no_text_extracted",
179
181
  "error": "no text extracted",
180
182
  }
181
- count = store.index_records(tenant, collection, baseid, records)
183
+ count = store.index_records(tenant, collection, baseid, records, doc_meta)
182
184
  m_inc("documents_indexed_total", 1.0)
183
185
  m_inc("chunks_indexed_total", float(count or 0))
184
186
  latency_ms = round((_time.perf_counter() - _t0) * 1000, 2)
@@ -58,7 +58,9 @@ class BaseStore(ABC):
58
58
 
59
59
  @abstractmethod
60
60
  def index_records(self, tenant: str, collection: str, docid: str,
61
- records: Iterable[Record]) -> int: ...
61
+ records: Iterable[Record],
62
+ doc_meta: dict[str, Any] | None = None
63
+ ) -> int: ...
62
64
 
63
65
  @abstractmethod
64
66
  def search(self, tenant: str, collection: str, query: str, k: int = 5,
@@ -29,7 +29,8 @@ class QdrantStore(BaseStore):
29
29
  def purge_doc(self, tenant: str, collection: str, docid: str) -> int:
30
30
  raise NotImplementedError("to be implemented")
31
31
 
32
- def index_records(self, tenant: str, collection: str, docid: str, records: Iterable[Record]) -> int:
32
+ def index_records(self, tenant: str, collection: str, docid: str,
33
+ records: Iterable[Record], doc_meta: dict | None = None) -> int:
33
34
  raise NotImplementedError("to be implemented")
34
35
 
35
36
  def search(self, tenant: str, collection: str, text: str, k: int = 5,
@@ -525,7 +525,9 @@ class TxtaiStore(BaseStore):
525
525
  return None
526
526
 
527
527
  def index_records(self, tenant: str, collection: str, docid: str,
528
- records: Iterable[Record]) -> int:
528
+ records: Iterable[Record],
529
+ doc_meta: dict[str, Any] | None = None
530
+ ) -> int:
529
531
  """
530
532
  Ingests records as (rid, text, meta). Guarantees non-null text, coerces
531
533
  dict-records, updates SQLite metadata, saves index. Thread critical.
@@ -537,7 +539,6 @@ class TxtaiStore(BaseStore):
537
539
  em = self._emb[key]
538
540
  prepared: list[tuple[str, Any, str]] = []
539
541
  chunk_rows: list[tuple[str, str | None, dict[str, Any]]] = []
540
- doc_meta: dict[str, Any] = {}
541
542
 
542
543
  for r in records:
543
544
  if isinstance(r, dict):
@@ -569,12 +570,6 @@ class TxtaiStore(BaseStore):
569
570
  md = {}
570
571
 
571
572
  md["docid"] = docid
572
- # Capture first occurrence of doc-level meta fields
573
- if not doc_meta:
574
- doc_meta = {
575
- k: v for k, v in md.items()
576
- if k not in ("chunk", "page", "position", "section")
577
- }
578
573
 
579
574
  try:
580
575
  safe_meta = self._sanit_meta_dict(md)
@@ -17,7 +17,7 @@ long_description, long_type = read_long_description()
17
17
 
18
18
  setup(
19
19
  name="patchvec", # external name
20
- version="0.5.8",
20
+ version="0.5.8.1",
21
21
  description="Patchvec — A lightweight, pluggable vector search microservice.",
22
22
  long_description=long_description,
23
23
  long_description_content_type="text/markdown",
@@ -32,6 +32,37 @@ def test_cli_ingest_on_fresh_collection_with_empty_index_dir(cli_env, tmp_path):
32
32
  and c[3] == "DOC1" for c in store.calls)
33
33
  assert ("save", tenant, coll) in store.calls
34
34
 
35
+
36
+ def test_cli_ingest_passes_doc_meta_through_wrapper(cli_env, tmp_path):
37
+ pvcli, store, _ = cli_env
38
+ tenant, coll = "acme", "metawrap"
39
+ sample = tmp_path / "meta.txt"
40
+ sample.write_text("conteúdo de teste", encoding="utf-8")
41
+
42
+ pvcli.main_cli(["create-collection", tenant, coll])
43
+ pvcli.main_cli(
44
+ [
45
+ "ingest", tenant, coll, str(sample),
46
+ "--docid", "DOCMETA",
47
+ "--metadata", '{"lang":"pt","source":"cli"}',
48
+ ]
49
+ )
50
+
51
+ calls = [
52
+ c for c in store.calls
53
+ if c[0] == "index_records" and c[1] == tenant
54
+ and c[2] == coll and c[3] == "DOCMETA"
55
+ ]
56
+ assert calls
57
+ doc_meta = calls[-1][5]
58
+ assert isinstance(doc_meta, dict)
59
+ assert doc_meta["docid"] == "DOCMETA"
60
+ assert doc_meta["lang"] == "pt"
61
+ assert doc_meta["source"] == "cli"
62
+ assert doc_meta["filename"].endswith("meta.txt")
63
+ assert doc_meta["ingested_at"].endswith("Z")
64
+
65
+
35
66
  def test_cli_reingest_same_docid_triggers_purge(cli_env, tmp_path):
36
67
  pvcli, store, _ = cli_env
37
68
  tenant, coll = "acme", "reupcli"
@@ -99,6 +99,36 @@ def test_meta_json_and_filters(store):
99
99
 
100
100
  # Ensure meta was JSON-encoded internally (FakeEmbeddings asserts this)
101
101
 
102
+
103
+ def test_doc_level_meta_persists_in_documents_table(store):
104
+ tenant, coll, docid = "acme", "docmeta", "DOCMETA"
105
+ recs = [
106
+ {
107
+ "id": "0",
108
+ "content": "Documento com metadados.",
109
+ "metadata": {"lang": "pt", "chunk": 0},
110
+ },
111
+ ]
112
+ doc_meta = {
113
+ "docid": docid,
114
+ "filename": "meta.txt",
115
+ "lang": "pt",
116
+ "source": "api",
117
+ }
118
+
119
+ n = store.index_records(tenant, coll, docid, recs, doc_meta=doc_meta)
120
+ assert n == 1
121
+
122
+ col_db = store.impl._dbs[(tenant, coll)]
123
+ conn = col_db._conn
124
+ assert conn is not None
125
+ row = conn.execute(
126
+ "SELECT meta_json FROM documents WHERE docid=?",
127
+ (docid,),
128
+ ).fetchone()
129
+ assert row is not None and row[0]
130
+ assert json.loads(row[0]) == doc_meta
131
+
102
132
  def test_purge_doc_removes_ids(store):
103
133
  recs = [
104
134
  {"id": "y::0", "content": "primeiro", "metadata": {}},
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes