patchvec-0.5.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pave/metrics.py ADDED
@@ -0,0 +1,52 @@
1
+ # (C) 2025 Rodrigo Rodrigues da Silva <rodrigopitanga@posteo.net>
2
+ # SPDX-License-Identifier: GPL-3.0-or-later
3
+ from __future__ import annotations
4
+ import time, threading
5
+ from typing import Dict, Any
6
+
7
# --- in-process metrics registry (module-level singleton) ---

# Process start time, used to report uptime_seconds.
_started = time.time()
# Guards every read/write of the mutable module state below.
_lock = threading.Lock()
# Monotonically increasing counters; inc() may create keys beyond these seeds.
_counters: Dict[str, float] = {
    "requests_total": 0.0,
    "collections_created_total": 0.0,
    "collections_deleted_total": 0.0,
    "documents_indexed_total": 0.0,
    "chunks_indexed_total": 0.0,
    "purge_total": 0.0,
    "search_total": 0.0,
    "errors_total": 0.0,
}

# Message of the most recently recorded error, if any.
_last_error: str | None = None

def inc(name: str, value: float = 1.0) -> None:
    """Add *value* to counter *name*, creating the counter on first use."""
    with _lock:
        _counters[name] = _counters.get(name, 0.0) + value

def set_error(msg: str) -> None:
    """Record *msg* as the last error and bump ``errors_total``."""
    global _last_error
    with _lock:
        _last_error = msg
        _counters["errors_total"] = _counters.get("errors_total", 0.0) + 1.0

def snapshot(extra: Dict[str, Any] | None = None) -> Dict[str, Any]:
    """Return a point-in-time copy of all counters plus uptime and last error.

    *extra* entries (if given) are merged in last and may shadow built-ins.
    """
    with _lock:
        data = dict(_counters)
        data.update({
            "uptime_seconds": time.time() - _started,
            "last_error": _last_error,
        })
    if extra:
        data.update(extra)
    return data

def _escape_label_value(val: str) -> str:
    """Escape a Prometheus label value: backslash, double quote, newline."""
    return val.replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n")

def to_prometheus(extra: Dict[str, Any] | None = None, build: Dict[str, str] | None = None) -> str:
    """Render the current snapshot in Prometheus text exposition format.

    Numeric snapshot entries become ``patchvec_<name> <value>`` sample lines;
    *build* (if given) is emitted as a ``patchvec_build_info`` gauge carrying
    one label per key/value pair, with label values escaped per the
    exposition-format rules.
    """
    lines: list[str] = []
    snap = snapshot(extra)
    for key, val in snap.items():
        # Non-numeric entries (e.g. last_error) have no sample representation.
        if isinstance(val, (int, float)):
            lines.append(f"patchvec_{key} {float(val)}")
    if build:
        labels = ",".join(
            f'{k}="{_escape_label_value(str(v))}"' for k, v in build.items()
        )
        lines.append(f"patchvec_build_info{{{labels}}} 1")
    return "\n".join(lines) + "\n"
pave/preprocess.py ADDED
@@ -0,0 +1,151 @@
1
+ # (C) 2025 Rodrigo Rodrigues da Silva <rodrigopitanga@posteo.net>
2
+ # SPDX-License-Identifier: GPL-3.0-or-later
3
+
4
+ from __future__ import annotations
5
+ import io, csv, mimetypes
6
+ from .config import CFG
7
+ from typing import Any, Dict, Iterable, Iterator, List, Optional, Tuple
8
+ from pypdf import PdfReader
9
+
10
+ TXT_CHUNK_SIZE = int(CFG.get("preprocess.txt_chunk_size", 1000))
11
+ TXT_CHUNK_OVERLAP = int(CFG.get("preprocess.txt_chunk_overlap", 200))
12
+
13
def _chunks(text: str, size: int = TXT_CHUNK_SIZE, overlap: int = TXT_CHUNK_OVERLAP):
    """Yield successive windows of *text* of length *size*, advancing by size - overlap."""
    src = text or ""
    # Stride is clamped to at least 1 so the generator always terminates.
    stride = max(size - overlap, 1)
    for start in range(0, len(src), stride):
        yield src[start : start + size]
20
+
21
+ def _csv_parse_col_spec(spec: str) -> tuple[list[str], list[int]]:
22
+ names: list[str] = []
23
+ idxs: list[int] = []
24
+ if not spec:
25
+ return names, idxs
26
+ for tok in (t.strip() for t in spec.split(",") if t.strip()):
27
+ if tok.isdigit():
28
+ i = int(tok)
29
+ if i <= 0:
30
+ raise ValueError("CSV column indices are 1-based")
31
+ idxs.append(i - 1)
32
+ else:
33
+ names.append(tok)
34
+ return names, idxs
35
+
36
+ def _csv_stringify_row(row: Dict[str, Any], keys: List[str]) -> str:
37
+ return "\n".join(f"{k}: {'' if row.get(k) is None else row.get(k)}" for k in keys)
38
+
39
def _preprocess_csv(filename: str, content: bytes, csv_options: Dict[str, Any]) -> Iterator[Tuple[str, str, Dict[str, Any]]]:
    """Yield one (local_id, text, extra_meta) triple per CSV data row.

    csv_options:
      - has_header: "auto" | "yes" | "no" (default "auto" -> sniffed)
      - meta_cols: comma-separated column names / 1-based indices whose
        values are copied into each row's metadata
      - include_cols: columns rendered into the chunk text; defaults to
        every column NOT listed in meta_cols
    """
    has_header = (csv_options.get("has_header") or "auto").lower() # auto|yes|no
    meta_spec = csv_options.get("meta_cols") or ""
    inc_spec = csv_options.get("include_cols") or ""

    meta_names, meta_idxs = _csv_parse_col_spec(meta_spec)
    inc_names , inc_idxs = _csv_parse_col_spec(inc_spec)

    # decode: UTF-8 first, latin-1 as a never-failing single-byte fallback
    try:
        text = content.decode("utf-8")
    except UnicodeDecodeError:
        text = content.decode("latin-1")

    sio = io.StringIO(text)
    sniffer = csv.Sniffer()
    # Sniff delimiter/quoting from a 4 KiB prefix; fall back to excel dialect.
    try:
        dialect = sniffer.sniff(text[:4096])
    except Exception:
        dialect = csv.excel

    reader = csv.reader(sio, dialect)
    first = next(reader, None)
    if first is None:
        return

    # Decide whether the first row is a header (explicit option wins; "auto"
    # defers to the sniffer and falls back to headerless on any failure).
    header_row: Optional[List[str]] = None
    if has_header == "yes":
        header_row = [str(h).strip() for h in first]
    elif has_header == "no":
        header_row = None
    else:
        try:
            header_row = [str(h).strip() for h in first] if sniffer.has_header(text[:4096]) else None
        except Exception:
            header_row = None

    # Without a header, synthesize col_0..col_N names and keep the first row
    # as data.
    if header_row is not None:
        cols = header_row
        data_rows = reader
    else:
        cols = [f"col_{i}" for i in range(len(first))]
        data_rows = [first, *list(reader)]

    ncols = len(cols)
    name_to_idx = {c: i for i, c in enumerate(cols)}

    # refuse if names referenced but no header
    if (meta_names or inc_names) and header_row is None:
        raise ValueError("CSV has no header but column names were provided. Use 1-based indices or supply a header.")

    def resolve(names: list[str], idxs: list[int]) -> List[str]:
        # Map names and 1-based indices to column names, preserving the
        # spec order and dropping duplicates (first occurrence wins).
        out: list[str] = []
        for nm in names:
            if nm not in name_to_idx:
                raise ValueError(f"CSV column '{nm}' not found in header")
            out.append(nm)
        for i in idxs:
            if i < 0 or i >= ncols:
                raise ValueError(f"CSV column index {i+1} out of range (1..{ncols})")
            out.append(cols[i])
        seen = set(); out2=[]
        for k in out:
            if k not in seen:
                seen.add(k); out2.append(k)
        return out2

    meta_keys = resolve(meta_names, meta_idxs)
    if inc_names or inc_idxs:
        include_keys = resolve(inc_names, inc_idxs)
    else:
        # DEFAULT: include all columns EXCEPT meta
        meta_set = set(meta_keys)
        include_keys = [c for c in cols if c not in meta_set]

    rowno = 0
    for row in data_rows:
        rowno += 1
        # Normalize ragged rows to exactly ncols cells (pad short, trim long).
        if len(row) < ncols:
            row = row + [""] * (ncols - len(row))
        elif len(row) > ncols:
            row = row[:ncols]
        asdict = {cols[i]: row[i] for i in range(ncols)}
        text_part = _csv_stringify_row(asdict, include_keys)
        extra = {k: asdict.get(k, "") for k in meta_keys}
        # "row" metadata is 1-based; the yielded local id below is 0-based.
        extra["row"] = rowno
        extra["has_header"] = bool(header_row is not None)
        yield (f"row_{rowno-1}", text_part, extra)
127
+
128
def preprocess(filename: str, content: bytes,
               csv_options: Dict[str, Any] | None = None) -> Iterator[Tuple[str, str, Dict[str, Any]]]:
    """Dispatch *content* to a type-specific chunker based on the file extension.

    Yields (local_id, text, extra_meta) triples:
      - PDF: one chunk per page ("page_<i>", {"page": i})
      - TXT: fixed-size character chunks with overlap ("chunk_<i>", {"chunk": i})
      - CSV: one chunk per data row, rendered as newline-joined "key: value" lines

    Raises:
        ValueError: for unsupported file types.
    """
    mt, _ = mimetypes.guess_type(filename)
    ext = filename.lower().rsplit(".", 1)[-1] if "." in filename else ""
    if ext == "pdf":
        reader = PdfReader(io.BytesIO(content))
        for i, page in enumerate(reader.pages):
            # extract_text() may return None (e.g. image-only pages).
            text = page.extract_text() or ""
            yield f"page_{i}", text, {"page": i}
    elif ext == "txt":
        text = content.decode("utf-8", errors="ignore")
        for i, chunk in enumerate(_chunks(text)):
            yield f"chunk_{i}", chunk, {"chunk": i}
    elif ext == "csv" or mt == "text/csv":
        # CSV is also accepted by MIME type when the extension is unusual.
        yield from _preprocess_csv(filename, content, csv_options or {})
    else:
        raise ValueError(f"unsupported file type: {ext or 'unknown'}")
pave/service.py ADDED
@@ -0,0 +1,92 @@
1
+ # (C) 2025 Rodrigo Rodrigues da Silva <rodrigopitanga@posteo.net>
2
+ # SPDX-License-Identifier: GPL-3.0-or-later
3
+
4
+ from __future__ import annotations
5
+ import uuid, json, re
6
+ from typing import Dict, Any, Iterable, Tuple, List
7
+ from datetime import datetime, timezone as tz
8
+ from pave.preprocess import preprocess
9
+ from pave.metrics import inc as m_inc
10
+
11
+ # Pure-ish service functions operating on a store adapter
12
+
13
def create_collection(store, tenant: str, name: str) -> Dict[str, Any]:
    """Create (or re-open) collection *name* for *tenant*, persist it, and report success."""
    store.load_or_init(tenant, name)
    store.save(tenant, name)
    m_inc("collections_created_total", 1.0)
    result: Dict[str, Any] = {"ok": True, "tenant": tenant, "collection": name}
    return result
22
+
23
def delete_collection(store, tenant: str, name: str) -> Dict[str, Any]:
    """Drop collection *name* for *tenant* and report what was removed."""
    store.delete_collection(tenant, name)
    m_inc("collections_deleted_total", 1.0)
    result: Dict[str, Any] = {"ok": True, "tenant": tenant, "deleted": name}
    return result
31
+
32
+ def _default_docid(filename: str) -> str:
33
+ # Uppercase
34
+ base = filename.upper()
35
+ # replace space and dot with underscore
36
+ base = base.replace(" ", "_").replace(".", "_")
37
+ # replace all non A-Z0-9_ with underscore
38
+ base = re.sub(r"[^A-Z0-9_]", "_", base)
39
+ # collapse multiple underscores
40
+ base = re.sub(r"_+", "_", base).strip("_")
41
+ if base != '': return base
42
+ return "PVDOC_"+str(uuid.uuid4())
43
+
44
def ingest_document(store, tenant: str, collection: str, filename: str, content: bytes,
                    docid: str | None, metadata: Dict[str, Any] | None,
                    csv_options: Dict[str, Any] | None = None) -> Dict[str, Any]:
    """Chunk *content*, attach metadata, and (re)index it under one docid.

    An existing document with the same docid is purged first, making ingest
    an upsert at document granularity. Returns {"ok": False, "error": ...}
    when the preprocessor yields no text at all.
    """
    # Caller-supplied docid wins; otherwise derive one from the filename.
    baseid = docid or _default_docid(filename)
    # Re-ingest semantics: drop any previously indexed chunks for this docid.
    if baseid and store.has_doc(tenant, collection, baseid):
        purged = store.purge_doc(tenant, collection, baseid)
        m_inc("purge_total", purged)
    meta_doc = metadata or {}
    records = []
    for local_id, text, extra in preprocess(filename, content, csv_options=csv_options):
        # Record ids are namespaced by document: "<docid>::<local_id>".
        rid = f"{baseid}::{local_id}"
        # UTC timestamp with a trailing "Z" instead of "+00:00".
        now = datetime.now(tz.utc).isoformat(timespec="seconds").replace("+00:00", "Z")
        meta = {"docid": baseid, "filename": filename, "ingested_at": now}
        # Precedence (last update wins): chunk extra > doc metadata > builtins.
        meta.update(meta_doc)
        meta.update(extra)
        records.append((rid, text, meta))
    if not records:
        return {"ok": False, "error": "no text extracted"}
    count = store.index_records(tenant, collection, baseid, records)
    m_inc("documents_indexed_total", 1.0)
    m_inc("chunks_indexed_total", float(count or 0))
    return {
        "ok": True,
        "tenant": tenant,
        "collection": collection,
        "docid": baseid,
        "chunks": count
    }
72
+
73
def do_search(store, tenant: str, collection: str, q: str, k: int = 5,
              filters: Dict[str, Any] | None = None, include_common: bool = False,
              common_tenant: str | None = None, common_collection: str | None = None
              ) -> Dict[str, Any]:
    """Run a vector search, optionally merging results from a shared "common" collection."""
    m_inc("search_total", 1.0)
    merge_common = bool(include_common and common_tenant and common_collection)
    if not merge_common:
        hits = store.search(tenant, collection, q, k, filters=filters)
        m_inc("matches_total", float(len(hits) or 0))
        return {"matches": hits}
    # Over-fetch from both collections, then keep the k best by score.
    fetch_k = max(10, k * 2)
    pooled: List[Dict[str, Any]] = [
        *store.search(tenant, collection, q, fetch_k, filters=filters),
        *store.search(common_tenant, common_collection, q, fetch_k, filters=filters),
    ]
    from heapq import nlargest
    best = nlargest(k, pooled, key=lambda m: m["score"])
    m_inc("matches_total", float(len(best) or 0))
    return {"matches": best}
@@ -0,0 +1 @@
1
+ # pkg
pave/stores/base.py ADDED
@@ -0,0 +1,33 @@
1
+ # (C) 2025 Rodrigo Rodrigues da Silva <rodrigopitanga@posteo.net>
2
+ # SPDX-License-Identifier: GPL-3.0-or-later
3
+
4
+ from __future__ import annotations
5
+ from abc import ABC, abstractmethod
6
+ from typing import Iterable, Dict, Any, List
7
+
8
+
9
# Canonical record shape exchanged with store adapters.
Record = tuple[str, str, Dict[str, Any]] # (rid, text, meta)

class BaseStore(ABC):
    """Abstract vector-store adapter.

    Collections are addressed by (tenant, collection) and documents by docid;
    concrete backends implement persistence, indexing and search.
    """

    # Ensure the collection exists and is ready for use (create or open).
    @abstractmethod
    def load_or_init(self, tenant: str, collection: str) -> None: ...

    # Persist any pending state for the collection.
    @abstractmethod
    def save(self, tenant: str, collection: str) -> None: ...

    # Remove the collection and everything in it.
    @abstractmethod
    def delete_collection(self, tenant: str, collection: str) -> None: ...

    # True when *docid* is already indexed in the collection.
    @abstractmethod
    def has_doc(self, tenant: str, collection: str, docid: str) -> bool: ...

    # Delete all chunks of *docid*; returns a count of removed entries
    # (callers feed it into the purge_total metric).
    @abstractmethod
    def purge_doc(self, tenant: str, collection: str, docid: str) -> int: ...

    # Index (rid, text, meta) records for *docid*; returns the number indexed.
    @abstractmethod
    def index_records(self, tenant: str, collection: str, docid: str,
                      records: Iterable[Record]) -> int: ...

    # Return up to *k* matches for *query*, optionally filtered by metadata.
    @abstractmethod
    def search(self, tenant: str, collection: str, query: str, k: int = 5,
               filters: Dict[str, Any] | None = None) -> List[Dict[str, Any]]: ...
pave/stores/factory.py ADDED
@@ -0,0 +1,18 @@
1
+ # (C) 2025 Rodrigo Rodrigues da Silva <rodrigopitanga@posteo.net>
2
+ # SPDX-License-Identifier: GPL-3.0-or-later
3
+
4
+ from __future__ import annotations
5
+ from .base import BaseStore
6
+ from ..config import CFG
7
+
8
+ def get_store(cfg: CFG = CFG) -> BaseStore:
9
+ stype = cfg.get("vector_store.type")
10
+ match stype:
11
+ case "default" | "txtai": # vendor-neutral default
12
+ from .txtai_store import TxtaiStore
13
+ return TxtaiStore()
14
+ case "qdrant":
15
+ from .qdrant_store import QdrantStore
16
+ return QdrantStore()
17
+ case _:
18
+ raise RuntimeError(f"Unknown vector_store.type: {stype}")
@@ -0,0 +1,26 @@
1
+ # (C) 2025 Rodrigo Rodrigues da Silva <rodrigopitanga@posteo.net>
2
+ # SPDX-License-Identifier: GPL-3.0-or-later
3
+ from __future__ import annotations
4
+ from typing import Iterable, Dict, Any, List
5
+ from .base import BaseStore, Record
6
+
7
class QdrantStore(BaseStore):
    """Stub adapter for Qdrant. To be implemented.

    Every method raises NotImplementedError; the class exists so the factory
    can be wired up before the backend is written.
    """

    def load_or_init(self, tenant: str, collection: str) -> None:
        raise NotImplementedError("to be implemented")

    def save(self, tenant: str, collection: str) -> None:
        raise NotImplementedError("to be implemented")

    def delete_collection(self, tenant: str, collection: str) -> None:
        raise NotImplementedError("to be implemented")

    def has_doc(self, tenant: str, collection: str, docid: str) -> bool:
        # Previously missing: BaseStore declares has_doc as abstract, so
        # without this stub QdrantStore() itself raised TypeError ("can't
        # instantiate abstract class") in the factory instead of a clean
        # NotImplementedError at call time.
        raise NotImplementedError("to be implemented")

    def purge_doc(self, tenant: str, collection: str, docid: str) -> int:
        raise NotImplementedError("to be implemented")

    def index_records(self, tenant: str, collection: str, docid: str, records: Iterable[Record]) -> int:
        raise NotImplementedError("to be implemented")

    def search(self, tenant: str, collection: str, text: str, k: int = 5, filters: Dict[str, Any] | None = None) -> List[Dict[str, Any]]:
        raise NotImplementedError("to be implemented")