patchvec 0.5.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- patchvec-0.5.6.dist-info/METADATA +115 -0
- patchvec-0.5.6.dist-info/RECORD +28 -0
- patchvec-0.5.6.dist-info/WHEEL +5 -0
- patchvec-0.5.6.dist-info/entry_points.txt +3 -0
- patchvec-0.5.6.dist-info/licenses/LICENSE +9 -0
- patchvec-0.5.6.dist-info/top_level.txt +1 -0
- pave/__init__.py +6 -0
- pave/assets/patchvec_icon_192.png +0 -0
- pave/assets/ui.html +125 -0
- pave/auth.py +108 -0
- pave/cli.py +97 -0
- pave/config.py +240 -0
- pave/embedders/__init__.py +1 -0
- pave/embedders/base.py +12 -0
- pave/embedders/factory.py +21 -0
- pave/embedders/openai_emb.py +30 -0
- pave/embedders/sbert_emb.py +24 -0
- pave/embedders/txtai_emb.py +58 -0
- pave/main.py +303 -0
- pave/metrics.py +52 -0
- pave/preprocess.py +151 -0
- pave/service.py +92 -0
- pave/stores/__init__.py +1 -0
- pave/stores/base.py +33 -0
- pave/stores/factory.py +18 -0
- pave/stores/qdrant_store.py +26 -0
- pave/stores/txtai_store.py +445 -0
- pave/ui.py +175 -0
pave/metrics.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
# (C) 2025 Rodrigo Rodrigues da Silva <rodrigopitanga@posteo.net>
|
|
2
|
+
# SPDX-License-Identifier: GPL-3.0-or-later
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
import time, threading
|
|
5
|
+
from typing import Dict, Any
|
|
6
|
+
|
|
7
|
+
# Process start time; snapshot() reports it as uptime_seconds.
_started = time.time()
# Guards every read/write of _counters and _last_error below.
_lock = threading.Lock()
# Monotonically increasing counters, keyed by metric name. inc() may add
# further names at runtime; these are just the well-known ones.
_counters: Dict[str, float] = {
    "requests_total": 0.0,
    "collections_created_total": 0.0,
    "collections_deleted_total": 0.0,
    "documents_indexed_total": 0.0,
    "chunks_indexed_total": 0.0,
    "purge_total": 0.0,
    "search_total": 0.0,
    "errors_total": 0.0,
}

# Most recent error message recorded via set_error(), or None if none yet.
_last_error: str | None = None
|
|
21
|
+
|
|
22
|
+
def inc(name: str, value: float = 1.0):
    """Atomically add *value* to counter *name*, creating it at 0.0 if absent."""
    with _lock:
        current = _counters.get(name, 0.0)
        _counters[name] = current + value
|
|
25
|
+
|
|
26
|
+
def set_error(msg: str):
    """Record *msg* as the latest error and bump the errors_total counter."""
    global _last_error
    with _lock:
        _last_error = msg
        previous = _counters.get("errors_total", 0.0)
        _counters["errors_total"] = previous + 1.0
|
|
31
|
+
|
|
32
|
+
def snapshot(extra: Dict[str, Any] | None = None) -> Dict[str, Any]:
    """Return a point-in-time copy of all counters plus uptime and last error.

    Entries from *extra*, when given, are merged in last and may override
    the built-in keys.
    """
    with _lock:
        data: Dict[str, Any] = dict(_counters)
        data["uptime_seconds"] = time.time() - _started
        data["last_error"] = _last_error
        if extra:
            data.update(extra)
        return data
|
|
42
|
+
|
|
43
|
+
def to_prometheus(extra: Dict[str, Any] | None = None, build: Dict[str, str] | None = None) -> str:
    """Render the current metric snapshot in Prometheus text exposition format.

    Only numeric snapshot entries become `patchvec_<name> <value>` lines;
    *build*, when given, is emitted as a `patchvec_build_info` gauge whose
    labels carry the build metadata.
    """
    lines = [
        f"patchvec_{name} {float(value)}"
        for name, value in snapshot(extra).items()
        if isinstance(value, (int, float))
    ]
    if build:
        rendered = ",".join(f'{key}="{val}"' for key, val in build.items())
        lines.append(f"patchvec_build_info{{{rendered}}} 1")
    return "\n".join(lines) + "\n"
|
pave/preprocess.py
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
# (C) 2025 Rodrigo Rodrigues da Silva <rodrigopitanga@posteo.net>
|
|
2
|
+
# SPDX-License-Identifier: GPL-3.0-or-later
|
|
3
|
+
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
import io, csv, mimetypes
|
|
6
|
+
from .config import CFG
|
|
7
|
+
from typing import Any, Dict, Iterable, Iterator, List, Optional, Tuple
|
|
8
|
+
from pypdf import PdfReader
|
|
9
|
+
|
|
10
|
+
TXT_CHUNK_SIZE = int(CFG.get("preprocess.txt_chunk_size", 1000))
|
|
11
|
+
TXT_CHUNK_OVERLAP = int(CFG.get("preprocess.txt_chunk_overlap", 200))
|
|
12
|
+
|
|
13
|
+
def _chunks(text: str, size: int = TXT_CHUNK_SIZE, overlap: int = TXT_CHUNK_OVERLAP):
|
|
14
|
+
text = text or ""
|
|
15
|
+
step = max(size - overlap, 1)
|
|
16
|
+
i = 0
|
|
17
|
+
while i < len(text):
|
|
18
|
+
yield text[i : i + size]
|
|
19
|
+
i += step
|
|
20
|
+
|
|
21
|
+
def _csv_parse_col_spec(spec: str) -> tuple[list[str], list[int]]:
|
|
22
|
+
names: list[str] = []
|
|
23
|
+
idxs: list[int] = []
|
|
24
|
+
if not spec:
|
|
25
|
+
return names, idxs
|
|
26
|
+
for tok in (t.strip() for t in spec.split(",") if t.strip()):
|
|
27
|
+
if tok.isdigit():
|
|
28
|
+
i = int(tok)
|
|
29
|
+
if i <= 0:
|
|
30
|
+
raise ValueError("CSV column indices are 1-based")
|
|
31
|
+
idxs.append(i - 1)
|
|
32
|
+
else:
|
|
33
|
+
names.append(tok)
|
|
34
|
+
return names, idxs
|
|
35
|
+
|
|
36
|
+
def _csv_stringify_row(row: Dict[str, Any], keys: List[str]) -> str:
|
|
37
|
+
return "\n".join(f"{k}: {'' if row.get(k) is None else row.get(k)}" for k in keys)
|
|
38
|
+
|
|
39
|
+
def _preprocess_csv(filename: str, content: bytes, csv_options: Dict[str, Any]) -> Iterator[Tuple[str, str, Dict[str, Any]]]:
    """Yield one (local_id, text, extra_meta) triple per CSV data row.

    csv_options keys (all optional, string-valued):
      - has_header:   "auto" | "yes" | "no" (default "auto" = sniff it)
      - meta_cols:    columns copied into per-row metadata
      - include_cols: columns rendered into the row text (default: all
                      columns except the meta ones)

    Column specs accept header names and/or 1-based indices
    (see _csv_parse_col_spec). Raises ValueError for unknown names,
    out-of-range indices, or names used against a headerless file.
    NOTE(review): *filename* is currently unused here — kept for signature
    symmetry with the other preprocessors.
    """
    has_header = (csv_options.get("has_header") or "auto").lower()  # auto|yes|no
    meta_spec = csv_options.get("meta_cols") or ""
    inc_spec = csv_options.get("include_cols") or ""

    meta_names, meta_idxs = _csv_parse_col_spec(meta_spec)
    inc_names, inc_idxs = _csv_parse_col_spec(inc_spec)

    # decode: try UTF-8 first, fall back to latin-1 (which never fails).
    try:
        text = content.decode("utf-8")
    except UnicodeDecodeError:
        text = content.decode("latin-1")

    sio = io.StringIO(text)
    sniffer = csv.Sniffer()
    # Sniff the dialect from a 4 KiB prefix; fall back to the standard
    # Excel dialect when sniffing fails (e.g. single-column files).
    try:
        dialect = sniffer.sniff(text[:4096])
    except Exception:
        dialect = csv.excel

    reader = csv.reader(sio, dialect)
    first = next(reader, None)
    if first is None:
        # Completely empty input: nothing to yield.
        return

    # Decide whether the first row is a header, honoring an explicit
    # yes/no before consulting the sniffer.
    header_row: Optional[List[str]] = None
    if has_header == "yes":
        header_row = [str(h).strip() for h in first]
    elif has_header == "no":
        header_row = None
    else:
        try:
            header_row = [str(h).strip() for h in first] if sniffer.has_header(text[:4096]) else None
        except Exception:
            header_row = None

    if header_row is not None:
        cols = header_row
        data_rows = reader
    else:
        # No header: synthesize col_0..col_N names and treat the first
        # row as data again.
        cols = [f"col_{i}" for i in range(len(first))]
        data_rows = [first, *list(reader)]

    ncols = len(cols)
    name_to_idx = {c: i for i, c in enumerate(cols)}

    # refuse if names referenced but no header
    if (meta_names or inc_names) and header_row is None:
        raise ValueError("CSV has no header but column names were provided. Use 1-based indices or supply a header.")

    def resolve(names: list[str], idxs: list[int]) -> List[str]:
        # Map a (names, indices) spec onto actual header names,
        # validating each entry and de-duplicating while preserving order.
        out: list[str] = []
        for nm in names:
            if nm not in name_to_idx:
                raise ValueError(f"CSV column '{nm}' not found in header")
            out.append(nm)
        for i in idxs:
            if i < 0 or i >= ncols:
                raise ValueError(f"CSV column index {i+1} out of range (1..{ncols})")
            out.append(cols[i])
        seen = set(); out2 = []
        for k in out:
            if k not in seen:
                seen.add(k); out2.append(k)
        return out2

    meta_keys = resolve(meta_names, meta_idxs)
    if inc_names or inc_idxs:
        include_keys = resolve(inc_names, inc_idxs)
    else:
        # DEFAULT: include all columns EXCEPT meta
        meta_set = set(meta_keys)
        include_keys = [c for c in cols if c not in meta_set]

    rowno = 0
    for row in data_rows:
        rowno += 1
        # Normalize ragged rows to exactly ncols cells: pad short rows
        # with "", truncate long ones.
        if len(row) < ncols:
            row = row + [""] * (ncols - len(row))
        elif len(row) > ncols:
            row = row[:ncols]
        asdict = {cols[i]: row[i] for i in range(ncols)}
        text_part = _csv_stringify_row(asdict, include_keys)
        extra = {k: asdict.get(k, "") for k in meta_keys}
        extra["row"] = rowno
        extra["has_header"] = bool(header_row is not None)
        # local_id is zero-based even though the "row" metadata is 1-based.
        yield (f"row_{rowno-1}", text_part, extra)
|
|
127
|
+
|
|
128
|
+
def preprocess(filename: str, content: bytes,
               csv_options: Dict[str, Any] | None = None
               ) -> Iterator[Tuple[str, str, Dict[str, Any]]]:
    """
    Yields (local_id, text, extra_meta):
    - PDF: one chunk per page
    - TXT: charcount-based chunks
    - CSV: one chunk per row ("; " join)
    """
    mime_type, _encoding = mimetypes.guess_type(filename)
    lowered = filename.lower()
    ext = lowered.rsplit(".", 1)[-1] if "." in lowered else ""
    if ext == "pdf":
        pdf = PdfReader(io.BytesIO(content))
        for page_no, page in enumerate(pdf.pages):
            yield f"page_{page_no}", page.extract_text() or "", {"page": page_no}
    elif ext == "txt":
        decoded = content.decode("utf-8", errors="ignore")
        for chunk_no, piece in enumerate(_chunks(decoded)):
            yield f"chunk_{chunk_no}", piece, {"chunk": chunk_no}
    elif ext == "csv" or mime_type == "text/csv":
        yield from _preprocess_csv(filename, content, csv_options or {})
    else:
        raise ValueError(f"unsupported file type: {ext or 'unknown'}")
|
pave/service.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
# (C) 2025 Rodrigo Rodrigues da Silva <rodrigopitanga@posteo.net>
|
|
2
|
+
# SPDX-License-Identifier: GPL-3.0-or-later
|
|
3
|
+
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
import uuid, json, re
|
|
6
|
+
from typing import Dict, Any, Iterable, Tuple, List
|
|
7
|
+
from datetime import datetime, timezone as tz
|
|
8
|
+
from pave.preprocess import preprocess
|
|
9
|
+
from pave.metrics import inc as m_inc
|
|
10
|
+
|
|
11
|
+
# Pure-ish service functions operating on a store adapter
|
|
12
|
+
|
|
13
|
+
def create_collection(store, tenant: str, name: str) -> Dict[str, Any]:
    """Initialize and persist collection *name* for *tenant* on *store*."""
    store.load_or_init(tenant, name)
    store.save(tenant, name)
    m_inc("collections_created_total", 1.0)
    result: Dict[str, Any] = {"ok": True, "tenant": tenant, "collection": name}
    return result
|
|
22
|
+
|
|
23
|
+
def delete_collection(store, tenant: str, name: str) -> Dict[str, Any]:
    """Drop collection *name* for *tenant* from *store*."""
    store.delete_collection(tenant, name)
    m_inc("collections_deleted_total", 1.0)
    result: Dict[str, Any] = {"ok": True, "tenant": tenant, "deleted": name}
    return result
|
|
31
|
+
|
|
32
|
+
def _default_docid(filename: str) -> str:
|
|
33
|
+
# Uppercase
|
|
34
|
+
base = filename.upper()
|
|
35
|
+
# replace space and dot with underscore
|
|
36
|
+
base = base.replace(" ", "_").replace(".", "_")
|
|
37
|
+
# replace all non A-Z0-9_ with underscore
|
|
38
|
+
base = re.sub(r"[^A-Z0-9_]", "_", base)
|
|
39
|
+
# collapse multiple underscores
|
|
40
|
+
base = re.sub(r"_+", "_", base).strip("_")
|
|
41
|
+
if base != '': return base
|
|
42
|
+
return "PVDOC_"+str(uuid.uuid4())
|
|
43
|
+
|
|
44
|
+
def ingest_document(store, tenant: str, collection: str, filename: str, content: bytes,
                    docid: str | None, metadata: Dict[str, Any] | None,
                    csv_options: Dict[str, Any] | None = None) -> Dict[str, Any]:
    """Preprocess *content* into chunks and index them under a single docid.

    Falls back to a filename-derived docid when *docid* is None. If the
    docid already exists, its previous chunks are purged first
    (replace-on-reingest). Per-chunk metadata layering: base fields
    (docid/filename/ingested_at) < document *metadata* < preprocessor
    extras. Returns {"ok": False, "error": ...} when nothing could be
    extracted, otherwise a summary dict with the chunk count.
    """
    baseid = docid or _default_docid(filename)
    if baseid and store.has_doc(tenant, collection, baseid):
        purged = store.purge_doc(tenant, collection, baseid)
        m_inc("purge_total", purged)
    meta_doc = metadata or {}
    # Single timestamp for the whole document: all chunks share one
    # ingested_at value (the old per-chunk call could straddle a second
    # boundary and stamp chunks of the same doc differently).
    now = datetime.now(tz.utc).isoformat(timespec="seconds").replace("+00:00", "Z")
    records = []
    for local_id, text, extra in preprocess(filename, content, csv_options=csv_options):
        rid = f"{baseid}::{local_id}"
        meta = {"docid": baseid, "filename": filename, "ingested_at": now}
        meta.update(meta_doc)
        meta.update(extra)
        records.append((rid, text, meta))
    if not records:
        return {"ok": False, "error": "no text extracted"}
    count = store.index_records(tenant, collection, baseid, records)
    m_inc("documents_indexed_total", 1.0)
    m_inc("chunks_indexed_total", float(count or 0))
    return {
        "ok": True,
        "tenant": tenant,
        "collection": collection,
        "docid": baseid,
        "chunks": count,
    }
|
|
72
|
+
|
|
73
|
+
def do_search(store, tenant: str, collection: str, q: str, k: int = 5,
              filters: Dict[str, Any] | None = None, include_common: bool = False,
              common_tenant: str | None = None, common_collection: str | None = None
              ) -> Dict[str, Any]:
    """Search one collection, optionally merged with a shared "common" one.

    When merging, each source is over-fetched (at least 10, or 2*k) and the
    combined pool is reduced to the k highest-scoring matches.
    """
    m_inc("search_total", 1.0)
    merge_common = bool(include_common and common_tenant and common_collection)
    if not merge_common:
        hits = store.search(tenant, collection, q, k, filters=filters)
        m_inc("matches_total", float(len(hits) or 0))
        return {"matches": hits}
    from heapq import nlargest
    fetch_k = max(10, k * 2)
    pool: List[Dict[str, Any]] = []
    for src_tenant, src_collection in ((tenant, collection),
                                       (common_tenant, common_collection)):
        pool.extend(store.search(src_tenant, src_collection, q, fetch_k,
                                 filters=filters))
    best = nlargest(k, pool, key=lambda m: m["score"])
    m_inc("matches_total", float(len(best) or 0))
    return {"matches": best}
|
pave/stores/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# pkg
|
pave/stores/base.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# (C) 2025 Rodrigo Rodrigues da Silva <rodrigopitanga@posteo.net>
|
|
2
|
+
# SPDX-License-Identifier: GPL-3.0-or-later
|
|
3
|
+
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
from abc import ABC, abstractmethod
|
|
6
|
+
from typing import Iterable, Dict, Any, List
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
# One indexable unit: the chunk's id, its text payload, and its metadata.
Record = tuple[str, str, Dict[str, Any]]  # (rid, text, meta)

class BaseStore(ABC):
    """Abstract vector-store adapter; every operation is scoped by (tenant, collection)."""

    @abstractmethod
    def load_or_init(self, tenant: str, collection: str) -> None:
        """Open the collection, creating it if it does not yet exist."""
        ...

    @abstractmethod
    def save(self, tenant: str, collection: str) -> None:
        """Persist the collection's current state."""
        ...

    @abstractmethod
    def delete_collection(self, tenant: str, collection: str) -> None:
        """Remove the collection and its contents."""
        ...

    @abstractmethod
    def has_doc(self, tenant: str, collection: str, docid: str) -> bool:
        """Return True when any chunk of *docid* is stored in the collection."""
        ...

    @abstractmethod
    def purge_doc(self, tenant: str, collection: str, docid: str) -> int:
        """Delete every chunk of *docid*; return the number removed."""
        ...

    @abstractmethod
    def index_records(self, tenant: str, collection: str, docid: str,
                      records: Iterable[Record]) -> int:
        """Index *records* under *docid*; return the number indexed."""
        ...

    @abstractmethod
    def search(self, tenant: str, collection: str, query: str, k: int = 5,
               filters: Dict[str, Any] | None = None) -> List[Dict[str, Any]]:
        """Return up to *k* matches for *query*, optionally filtered by metadata."""
        ...
|
pave/stores/factory.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# (C) 2025 Rodrigo Rodrigues da Silva <rodrigopitanga@posteo.net>
|
|
2
|
+
# SPDX-License-Identifier: GPL-3.0-or-later
|
|
3
|
+
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
from .base import BaseStore
|
|
6
|
+
from ..config import CFG
|
|
7
|
+
|
|
8
|
+
def get_store(cfg: CFG = CFG) -> BaseStore:
|
|
9
|
+
stype = cfg.get("vector_store.type")
|
|
10
|
+
match stype:
|
|
11
|
+
case "default" | "txtai": # vendor-neutral default
|
|
12
|
+
from .txtai_store import TxtaiStore
|
|
13
|
+
return TxtaiStore()
|
|
14
|
+
case "qdrant":
|
|
15
|
+
from .qdrant_store import QdrantStore
|
|
16
|
+
return QdrantStore()
|
|
17
|
+
case _:
|
|
18
|
+
raise RuntimeError(f"Unknown vector_store.type: {stype}")
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# (C) 2025 Rodrigo Rodrigues da Silva <rodrigopitanga@posteo.net>
|
|
2
|
+
# SPDX-License-Identifier: GPL-3.0-or-later
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
from typing import Iterable, Dict, Any, List
|
|
5
|
+
from .base import BaseStore, Record
|
|
6
|
+
|
|
7
|
+
class QdrantStore(BaseStore):
    """Stub adapter for Qdrant. To be implemented.

    Every method raises NotImplementedError; the class exists so the
    factory can already offer the "qdrant" backend option.
    """

    def load_or_init(self, tenant: str, collection: str) -> None:
        raise NotImplementedError("to be implemented")

    def save(self, tenant: str, collection: str) -> None:
        raise NotImplementedError("to be implemented")

    def delete_collection(self, tenant: str, collection: str) -> None:
        raise NotImplementedError("to be implemented")

    def has_doc(self, tenant: str, collection: str, docid: str) -> bool:
        # BaseStore declares has_doc as abstract; the original stub omitted
        # it, so QdrantStore could not be instantiated at all (TypeError
        # from ABC machinery) — this override must exist even as a stub.
        raise NotImplementedError("to be implemented")

    def purge_doc(self, tenant: str, collection: str, docid: str) -> int:
        raise NotImplementedError("to be implemented")

    def index_records(self, tenant: str, collection: str, docid: str, records: Iterable[Record]) -> int:
        raise NotImplementedError("to be implemented")

    def search(self, tenant: str, collection: str, text: str, k: int = 5, filters: Dict[str, Any] | None = None) -> List[Dict[str, Any]]:
        raise NotImplementedError("to be implemented")
|