pg-raggraph 0.3.0a2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pg_raggraph/__init__.py +1432 -0
- pg_raggraph/answer.py +140 -0
- pg_raggraph/chunking.py +494 -0
- pg_raggraph/cli.py +496 -0
- pg_raggraph/config.py +237 -0
- pg_raggraph/db.py +346 -0
- pg_raggraph/embedding.py +123 -0
- pg_raggraph/evolution.py +256 -0
- pg_raggraph/extraction.py +328 -0
- pg_raggraph/mcp_server.py +204 -0
- pg_raggraph/models.py +221 -0
- pg_raggraph/reranker.py +117 -0
- pg_raggraph/resolution.py +83 -0
- pg_raggraph/retrieval.py +691 -0
- pg_raggraph/server.py +449 -0
- pg_raggraph/sql/__init__.py +0 -0
- pg_raggraph/sql/migrations/001_embedded_content.sql +33 -0
- pg_raggraph/sql/migrations/002_evolution_tracking.sql +99 -0
- pg_raggraph/sql/migrations/README.md +10 -0
- pg_raggraph/sql/migrations/__init__.py +0 -0
- pg_raggraph/sql/schema.sql +218 -0
- pg_raggraph/static/index.html +170 -0
- pg_raggraph-0.3.0a2.dist-info/METADATA +390 -0
- pg_raggraph-0.3.0a2.dist-info/RECORD +27 -0
- pg_raggraph-0.3.0a2.dist-info/WHEEL +4 -0
- pg_raggraph-0.3.0a2.dist-info/entry_points.txt +2 -0
- pg_raggraph-0.3.0a2.dist-info/licenses/LICENSE +21 -0
pg_raggraph/__init__.py
ADDED
|
@@ -0,0 +1,1432 @@
|
|
|
1
|
+
"""pg-raggraph — PostgreSQL-native GraphRAG."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import logging
|
|
7
|
+
import os
|
|
8
|
+
import re
|
|
9
|
+
from datetime import datetime
|
|
10
|
+
from importlib.metadata import PackageNotFoundError
|
|
11
|
+
from importlib.metadata import version as _pkg_version
|
|
12
|
+
|
|
13
|
+
try:
|
|
14
|
+
__version__ = _pkg_version("pg-raggraph")
|
|
15
|
+
except PackageNotFoundError:
|
|
16
|
+
# Editable install without installed metadata (rare). Mirror pyproject.
|
|
17
|
+
__version__ = "0.3.0a2"
|
|
18
|
+
|
|
19
|
+
from pg_raggraph.config import PGRGConfig
|
|
20
|
+
from pg_raggraph.models import QueryResult
|
|
21
|
+
|
|
22
|
+
# Canonical extension allowlist for ingestion. Mirrored by the FastAPI server
|
|
23
|
+
# and the MCP server so all surfaces accept the same set. Stored as a tuple
|
|
24
|
+
# so it's compatible with str.endswith() in the directory walker.
|
|
25
|
+
INGEST_ALLOWED_EXTS: tuple[str, ...] = (
|
|
26
|
+
".md",
|
|
27
|
+
".txt",
|
|
28
|
+
".py",
|
|
29
|
+
".ts",
|
|
30
|
+
".js",
|
|
31
|
+
".tsx",
|
|
32
|
+
".jsx",
|
|
33
|
+
".go",
|
|
34
|
+
".rs",
|
|
35
|
+
".java",
|
|
36
|
+
".rst",
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
__all__ = [
|
|
40
|
+
"GraphRAG",
|
|
41
|
+
"INGEST_ALLOWED_EXTS",
|
|
42
|
+
"PGRGConfig",
|
|
43
|
+
"QueryResult",
|
|
44
|
+
"__version__",
|
|
45
|
+
]
|
|
46
|
+
|
|
47
|
+
logger = logging.getLogger("pg_raggraph")
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _json_default(obj):
    """Serialize values that the stdlib ``json`` encoder rejects.

    ``datetime`` instances become ISO 8601 strings, which keeps them
    queryable from JSONB (e.g. ``metadata->>'effective_from'``). Any other
    value is rendered with ``str()`` so that unusual metadata never aborts
    an ingest.
    """
    return obj.isoformat() if isinstance(obj, datetime) else str(obj)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class _JSONLogFormatter(logging.Formatter):
    """Render log records as single-line JSON using only the stdlib.

    Every record yields the keys ``ts``, ``level``, ``logger`` and ``msg``;
    ``exc_info`` is added when the record carries exception information.
    Values passed through ``extra={...}`` on a log call are copied to the top
    level of the object, but can never overwrite the reserved keys.
    """

    _RESERVED = frozenset({"ts", "level", "logger", "msg", "exc_info"})

    def format(self, record: logging.LogRecord) -> str:  # noqa: D401
        """Return ``record`` encoded as a JSON object string."""
        # Attributes that logging itself stores on a LogRecord. Whatever else
        # shows up in the record's __dict__ is treated as a caller "extra".
        builtin_attrs = (
            "args",
            "msg",
            "name",
            "exc_info",
            "exc_text",
            "stack_info",
            "lineno",
            "module",
            "filename",
            "pathname",
            "funcName",
            "process",
            "processName",
            "thread",
            "threadName",
            "created",
            "msecs",
            "relativeCreated",
            "levelname",
            "levelno",
            "asctime",
            "message",
            "taskName",
        )

        document: dict = {}
        document["ts"] = self.formatTime(record, "%Y-%m-%dT%H:%M:%S%z")
        document["level"] = record.levelname
        document["logger"] = record.name
        document["msg"] = record.getMessage()
        if record.exc_info:
            document["exc_info"] = self.formatException(record.exc_info)

        # Merge extras (logger.info("...", extra={"request_id": x}) patterns).
        for key, value in record.__dict__.items():
            if key in self._RESERVED or key.startswith("_") or key in builtin_attrs:
                continue
            try:
                # Probe only: checks that the value is JSON-serializable.
                json.dumps(value)
            except (TypeError, ValueError):
                document[key] = repr(value)
            else:
                document[key] = value
        return json.dumps(document, default=str)
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
_logging_configured = False
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def _configure_logging() -> None:
    """Configure the ``pg_raggraph`` logger once, honoring ``PGRG_LOG_FORMAT``.

    Default (env unset or anything other than "json"): leave existing
    handlers alone, so the caller's logging setup wins. When
    ``PGRG_LOG_FORMAT=json`` AND no handlers are attached to the
    ``pg_raggraph`` logger yet, install a single ``StreamHandler`` with the
    JSON formatter at ``PGRG_LOG_LEVEL`` (default INFO).

    An unknown or malformed ``PGRG_LOG_LEVEL`` falls back to INFO instead of
    raising, because this function runs at import time.
    """
    global _logging_configured
    if _logging_configured:
        return
    # Configuration is attempted exactly once, whichever branch is taken.
    _logging_configured = True

    fmt = os.environ.get("PGRG_LOG_FORMAT", "").strip().lower()
    if fmt != "json":
        return
    if logger.handlers:
        # Caller already wired their own handler; respect it.
        return

    # Strip like PGRG_LOG_FORMAT so " debug " is not silently ignored.
    level_name = os.environ.get("PGRG_LOG_LEVEL", "INFO").strip().upper()
    level = getattr(logging, level_name, logging.INFO)
    if not isinstance(level, int):
        # e.g. "BASIC_FORMAT" resolves to a str attribute of the logging
        # module; passing it to setLevel() would raise ValueError.
        level = logging.INFO

    handler = logging.StreamHandler()
    handler.setFormatter(_JSONLogFormatter())
    handler.setLevel(level)
    logger.addHandler(handler)
    logger.setLevel(level)
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
_configure_logging()
|
|
154
|
+
|
|
155
|
+
# ``\Z`` (not ``$``) anchors the end of the pattern: ``$`` also matches just
# before a trailing newline, which would let values such as "tenant\n" through.
_NAMESPACE_RE = re.compile(r"^[a-zA-Z0-9_\-\.]{1,64}\Z")


def _validate_namespace(ns: str) -> None:
    """Raise ``ValueError`` unless ``ns`` is a well-formed namespace.

    A valid namespace is 1-64 characters long and consists solely of ASCII
    letters, digits, hyphens, underscores and dots.

    Args:
        ns: Candidate namespace string.

    Raises:
        ValueError: If ``ns`` is empty, too long, or contains any other
            character (including a trailing newline).
    """
    # fullmatch() requires the whole string to be consumed by the pattern.
    if not _NAMESPACE_RE.fullmatch(ns):
        raise ValueError(
            f"Invalid namespace '{ns}'. Must be 1-64 chars, "
            "alphanumeric/hyphens/underscores/dots only."
        )
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
class GraphRAG:
|
|
167
|
+
"""Main entry point for pg-raggraph.
|
|
168
|
+
|
|
169
|
+
Usage:
|
|
170
|
+
async with GraphRAG("postgresql://localhost/mydb") as rag:
|
|
171
|
+
await rag.ingest(["./docs/"])
|
|
172
|
+
result = await rag.query("How does auth work?")
|
|
173
|
+
for chunk in result.chunks:
|
|
174
|
+
print(chunk.content)
|
|
175
|
+
"""
|
|
176
|
+
|
|
177
|
+
def __init__(self, dsn: str | None = None, *, reranker=None, **kwargs):
    """Build the configuration and reset all lazily created collaborators.

    Args:
        dsn: PostgreSQL connection string. Optional; it can also be set
            via the PGRG_DSN env var or kwargs["dsn"]. A truthy value here
            replaces any "dsn" keyword.
        reranker: Optional Reranker (see the pg_raggraph.reranker.Reranker
            protocol) injected by power users. If None, a
            FastEmbedReranker is lazy-loaded from config.rerank_model
            on first use of rerank=True.
        **kwargs: Any PGRGConfig field. See docs/Config-Reference.md
            for the full list.
    """
    settings = {**kwargs, "dsn": dsn} if dsn else kwargs
    self.config = PGRGConfig(**settings)

    # Created on demand: the database by connect(), the embedder by
    # _get_embedder(), and the shared LLM provider (closed with the
    # instance) by the ingest methods.
    self._db = self._embedder = self._llm = None

    # An injected reranker is used as-is; otherwise one is lazy-loaded from
    # config.rerank_model on the first rerank=True call.
    self._reranker = reranker

    # PR-209: cooperative shutdown signal for long-running ingest loops.
    # Left unset here because it must be created on the running asyncio
    # loop (inside ingest()), not at __init__ time.
    self._shutdown_event = None
|
|
203
|
+
|
|
204
|
+
def request_shutdown(self) -> None:
|
|
205
|
+
"""Signal in-progress ingest loops to drain gracefully.
|
|
206
|
+
|
|
207
|
+
Already-running per-file transactions finish; queued files become
|
|
208
|
+
no-ops counted as skipped. Safe to call from a SIGTERM/SIGINT handler::
|
|
209
|
+
|
|
210
|
+
import asyncio, signal
|
|
211
|
+
from pg_raggraph import GraphRAG
|
|
212
|
+
|
|
213
|
+
rag = GraphRAG(...)
|
|
214
|
+
loop = asyncio.get_running_loop()
|
|
215
|
+
for sig in (signal.SIGTERM, signal.SIGINT):
|
|
216
|
+
loop.add_signal_handler(sig, rag.request_shutdown)
|
|
217
|
+
|
|
218
|
+
Idempotent. Safe to call before ingest() starts (no-op).
|
|
219
|
+
"""
|
|
220
|
+
if self._shutdown_event is not None:
|
|
221
|
+
self._shutdown_event.set()
|
|
222
|
+
|
|
223
|
+
@staticmethod
def _redact_dsn(dsn) -> str:
    """Return ``dsn`` as text with any embedded password masked."""
    text = str(dsn)
    # URI form: scheme://user:password@host/...
    text = re.sub(r"(://[^:/@\s]*):[^/\s]*@", r"\1:***@", text)
    # Keyword/value form and URI query parameters: password=...
    return re.sub(
        r"(?i)(password\s*=\s*)('(?:[^'\\]|\\.)*'|[^\s&]+)",
        r"\1***",
        text,
    )

async def connect(self):
    """Create the ``Database`` for this instance and open its connection.

    Raises:
        ConnectionError: If the database connection cannot be
            established. The original exception is chained as the cause.
            The DSN in the message has its password masked so that
            credentials do not end up in logs or tracebacks.
    """
    from pg_raggraph.db import Database

    self._db = Database(self.config)
    try:
        await self._db.connect()
    except Exception as e:
        raise ConnectionError(
            f"Cannot connect to PostgreSQL at {self._redact_dsn(self.config.dsn)}. "
            f"Is the database running? Error: {e}"
        ) from e
|
|
234
|
+
|
|
235
|
+
async def close(self):
|
|
236
|
+
if self._db:
|
|
237
|
+
await self._db.close()
|
|
238
|
+
self._db = None
|
|
239
|
+
if self._llm is not None and hasattr(self._llm, "aclose"):
|
|
240
|
+
await self._llm.aclose()
|
|
241
|
+
self._llm = None
|
|
242
|
+
self._embedder = None
|
|
243
|
+
|
|
244
|
+
async def __aenter__(self):
    """Enter ``async with``: connect, then return this instance."""
    await self.connect()
    return self
|
|
247
|
+
|
|
248
|
+
async def __aexit__(self, *exc):
    """Leave ``async with``: close the instance.

    The exception details in ``exc`` are ignored and nothing is returned,
    so an exception raised inside the block is not suppressed.
    """
    await self.close()
|
|
250
|
+
|
|
251
|
+
@property
|
|
252
|
+
def db(self):
|
|
253
|
+
if self._db is None:
|
|
254
|
+
raise RuntimeError("Not connected. Call connect() or use async with.")
|
|
255
|
+
return self._db
|
|
256
|
+
|
|
257
|
+
def _get_embedder(self):
|
|
258
|
+
if self._embedder is None:
|
|
259
|
+
from pg_raggraph.embedding import get_embedding_provider
|
|
260
|
+
|
|
261
|
+
self._embedder = get_embedding_provider(self.config)
|
|
262
|
+
return self._embedder
|
|
263
|
+
|
|
264
|
+
async def ingest(
    self,
    paths: list[str],
    namespace: str | None = None,
    on_progress=None,
    *,
    metadata: dict | None = None,
):
    """Ingest documents from file paths with parallel processing.

    Optimizations:
    - Parallel LLM extraction (extract_concurrency, default 8)
    - Batched entity embeddings (1 call instead of N)
    - Parallel document processing (doc_concurrency, default 4)
    - Content hash dedup

    Args:
        paths: File or directory paths to ingest. Directories are walked
            recursively; only files whose names end with one of
            ``INGEST_ALLOWED_EXTS`` are picked up, and well-known
            vendored/build/cache directories are skipped. Paths that
            point directly at a file are taken as-is.
        namespace: Namespace for data isolation. Defaults to
            ``config.namespace``.
        on_progress: Optional callback(message: str) for progress updates.
        metadata: Per-ingest evolution hints applied to every file in this
            call. Optional keys: ``effective_from``, ``effective_to``,
            ``retracted``, ``retracted_at``, ``retraction_reason``,
            ``version_label``, ``supersedes_document_id``. When
            ``version_label``, ``supersedes_document_id``, or
            ``retraction_reason`` is present, a ``document_versions`` row
            is also created mirroring the document's evolution metadata.

    Returns:
        A dict of integer counters for this call: ``ingested``,
        ``skipped`` (the per-file helper returned nothing, or shutdown was
        requested before the file started), ``failed``, ``degraded``
        (ingested, but the per-file result was flagged as degraded),
        ``entities`` and ``rels``.

    Raises:
        ValueError: If the namespace is invalid.
        FileNotFoundError: If an entry of ``paths`` is neither a file nor
            a directory.
    """
    import asyncio

    from pg_raggraph.chunking import chunk_document, content_hash
    from pg_raggraph.extraction import extract_from_chunks, get_llm_provider

    ns = namespace or self.config.namespace
    _validate_namespace(ns)
    # PR-215: apply nice_level here (was previously in config init,
    # which surprised callers by mutating process priority on import).
    self.config.apply_nice_level()
    embedder = self._get_embedder()

    def _progress(msg: str):
        logger.info(msg)
        if on_progress:
            on_progress(msg)

    # Directories to skip when walking — avoid vendored code, build artifacts,
    # model checkpoints, etc.
    SKIP_DIRS = {
        ".git",
        ".venv",
        "venv",
        "node_modules",
        "target",  # Rust build
        "dist",
        "build",
        "__pycache__",
        ".pytest_cache",
        ".ruff_cache",
        ".mypy_cache",
        ".tox",
        "checkpoints",
        "models",
        ".cargo",
        ".idea",
        ".vscode",
        "site-packages",
        ".autonomy",
        "skill-output",
    }
    SUPPORTED_EXTS = INGEST_ALLOWED_EXTS

    # Collect and validate file paths
    file_paths = []
    for p in paths:
        if os.path.isdir(p):
            for root, dirs, files in os.walk(p):
                # Prune skipped dirs in-place so we don't descend into them
                dirs[:] = [d for d in dirs if d not in SKIP_DIRS]
                file_paths.extend(
                    os.path.join(root, f) for f in files if f.endswith(SUPPORTED_EXTS)
                )
        elif os.path.isfile(p):
            file_paths.append(p)
        else:
            raise FileNotFoundError(f"Path not found: {p}")

    # Counters reported back to the caller; updated by the per-file tasks.
    stats = {
        "ingested": 0,
        "skipped": 0,
        "failed": 0,
        "degraded": 0,
        "entities": 0,
        "rels": 0,
    }

    if not file_paths:
        logger.warning("No supported files found in provided paths.")
        return stats

    _progress(f"Found {len(file_paths)} files to process.")

    # PR-209: lazily create the shutdown event on the running loop.
    # request_shutdown() can be called before this without error (no-op);
    # once an ingest is in flight, it observes the event and drains.
    if self._shutdown_event is None:
        self._shutdown_event = asyncio.Event()

    # Process documents in parallel batches
    doc_sem = asyncio.Semaphore(self.config.doc_concurrency)
    # LLM is optional — without it, ingest stores chunks+embeddings only
    # (pure vector RAG mode). Reuse the shared provider if already created
    # so the connection pool is shared across ingest() calls.
    llm = None
    if not self.config.skip_extraction and self.config.llm_base_url:
        if self._llm is None:
            try:
                self._llm = get_llm_provider(self.config)
            except Exception as e:
                logger.warning(f"LLM provider unavailable, skipping extraction: {e}")
        llm = self._llm
    if llm is None:
        _progress("Extraction disabled — ingesting as pure vector RAG.")

    async def _process_file(idx: int, file_path: str):
        # Retry on transient serialization / deadlock errors from
        # concurrent ingestion. Exponential backoff, max 3 attempts.
        async with doc_sem:
            # PR-209: drain gracefully on shutdown. Files queued behind
            # the semaphore become no-ops once request_shutdown() is
            # observed; in-flight files (already past this check) finish
            # their transaction normally.
            if self._shutdown_event is not None and self._shutdown_event.is_set():
                stats["skipped"] += 1
                return
            attempt = 0
            while True:
                attempt += 1
                try:
                    r = await self._ingest_one_file(
                        file_path,
                        ns,
                        embedder,
                        llm,
                        content_hash,
                        chunk_document,
                        extract_from_chunks,
                        metadata=metadata,
                    )
                    if r:
                        stats["ingested"] += 1
                        stats["entities"] += r["entities"]
                        stats["rels"] += r["rels"]
                        if r.get("degraded"):
                            stats["degraded"] += 1
                        deg_note = (
                            " (extraction failed, vector-only)" if r.get("degraded") else ""
                        )
                        _progress(
                            f"[{idx}/{len(file_paths)}] "
                            f"{os.path.basename(file_path)}: "
                            f"{r['entities']} entities, {r['rels']} rels{deg_note}"
                        )
                    else:
                        stats["skipped"] += 1
                    return
                except Exception as e:
                    # Postgres deadlock = SQLSTATE 40P01, serialization = 40001.
                    # Prefer the structured sqlstate attribute (psycopg3) over
                    # string matching, which breaks on non-English PG builds.
                    sqlstate = getattr(e, "sqlstate", None)
                    msg = str(e)
                    transient = sqlstate in ("40P01", "40001") or (
                        sqlstate is None
                        and (
                            "40P01" in msg
                            or "40001" in msg
                            or "deadlock detected" in msg
                            or "could not serialize" in msg
                        )
                    )
                    if transient and attempt < 3:
                        backoff = 0.2 * (2 ** (attempt - 1))
                        logger.info(
                            f"Retry {attempt}/3 after {backoff:.1f}s for "
                            f"{file_path}: {msg[:80]}"
                        )
                        await asyncio.sleep(backoff)
                        continue
                    logger.warning(f"Failed {file_path}: {e}")
                    stats["failed"] += 1
                    return

    await asyncio.gather(*[_process_file(i + 1, fp) for i, fp in enumerate(file_paths)])

    notes = []
    if stats["failed"]:
        notes.append(f"{stats['failed']} failed")
    if stats["degraded"]:
        notes.append(f"{stats['degraded']} degraded (vector-only, extraction error)")
    suffix = f", {', '.join(notes)}" if notes else ""
    _progress(
        f"Done: {stats['ingested']} ingested, {stats['skipped']} skipped"
        f"{suffix}. "
        f"{stats['entities']} entities, {stats['rels']} relationships."
    )
    # Hand the counters back so callers do not have to parse the log line.
    return stats
|
|
469
|
+
|
|
470
|
+
async def ingest_records(
    self,
    records,
    namespace: str | None = None,
    on_progress=None,
):
    """Ingest documents from in-memory records — no disk roundtrip.

    Use this when your source data lives in another database, an API,
    a queue, or anywhere that's not the filesystem. The classic
    pattern for same-database CRM/ERP pipelines:

        with psycopg.connect(crm_dsn) as conn:
            rows = conn.execute("SELECT note_id, note_text, ... FROM ...").fetchall()
        records = [
            {
                "text": format_doc(row),
                "source_id": f"sales_note:{row['note_id']}",
                "metadata": {"order_id": row["order_id"], "status": row["status"]},
            }
            for row in rows
        ]
        await rag.ingest_records(records, namespace="sales_calls")

    Args:
        records: Iterable of dicts. Each dict must have:
            - ``text`` (str, required): document content
            - ``source_id`` (str, required): stable logical identifier
              used for content-hash dedup AND stale-doc cleanup. Use a
              scheme like ``"sales_note:42"`` or ``"jira:PROJ-1234"``.
              Re-ingesting the same source_id with new text replaces
              the prior version atomically.
            - ``metadata`` (dict, optional): per-record metadata.
              Persisted as JSONB on ``documents.metadata`` (queryable
              via ``metadata->>'foo'``). Evolution-tracking keys
              (``effective_from``, ``effective_to``, ``retracted``,
              ``version_label``, ``supersedes_document_id``) are ALSO
              written to dedicated columns. Other keys are stored only
              in the JSONB.
            - ``entities`` (list of dict, optional): caller-known
              entities to seed the graph. Each: ``{"name": "...",
              "entity_type": "...", "description": "...", "properties":
              {...}}``. ``name`` is required; the rest are optional.
              Entity resolution merges these with LLM-extracted
              entities of the same name. Linked to every chunk.
            - ``relationships`` (list of dict, optional): caller-known
              graph edges. Each: ``{"src": "EntityName1",
              "dst": "EntityName2", "rel_type": "...", "weight": 1.0,
              "description": "..."}``. ``src`` and ``dst`` are
              required and must match either a caller-supplied or
              LLM-extracted entity name.
            - ``skip_llm`` (bool, optional, default False): skip LLM
              extraction for this document. Useful when the caller's
              ``entities``/``relationships`` already cover what
              they care about and the LLM would just add noise / cost.
            - ``pre_chunked`` (list of dict, optional): bypass
              pg-raggraph's chunker AND embedder. Each entry:
              ``{"content": str, "embedded_content": str (optional),
              "embedding": list[float] (must match config.embedding_dim),
              "metadata": dict (optional), "token_count": int (optional)}``.
              Use when an upstream tool (e.g. chunkshop's full pipeline)
              already chunked + embedded the document. The ``text``
              field still drives LLM entity/relationship extraction;
              set it to a sensible reconstruction of the document.
              See docs/cookbook/chunkshop-integration.md Pattern C.
        namespace: Namespace for data isolation. Defaults to
            ``config.namespace``.
        on_progress: Optional callback(message: str) for progress.

    Returns:
        A dict of integer counters with the same shape as ``ingest()``:
        ``ingested``, ``skipped``, ``failed``, ``degraded``, ``entities``
        and ``rels``.

    Raises:
        ValueError: If the namespace is invalid, or a record lacks a
            non-empty ``text`` or ``source_id``.
        TypeError: If a record is not a dict.

    Example (CRM with known FK relationships):

        records = [{
            "text": format_doc(row),
            "source_id": f"sales_note:{row['note_id']}",
            "metadata": {"order_id": row["order_id"], "status": row["status"]},
            "entities": [
                {"name": row["company_name"], "entity_type": "Customer"},
                {"name": row["product_name"], "entity_type": "Product"},
                {"name": row["salesperson_name"], "entity_type": "Salesperson"},
            ],
            "relationships": [
                {"src": row["company_name"], "dst": row["product_name"],
                 "rel_type": "BOUGHT"},
                {"src": row["salesperson_name"], "dst": row["company_name"],
                 "rel_type": "SOLD_TO"},
            ],
        } for row in crm_rows]
        await rag.ingest_records(records, namespace="sales_calls")
    """
    import asyncio

    from pg_raggraph.chunking import chunk_document, content_hash
    from pg_raggraph.extraction import extract_from_chunks, get_llm_provider

    # Materialize once: the records are validated, counted and then iterated.
    records = list(records)
    ns = namespace or self.config.namespace
    _validate_namespace(ns)
    self.config.apply_nice_level()
    embedder = self._get_embedder()

    def _progress(msg: str):
        logger.info(msg)
        if on_progress:
            on_progress(msg)

    # Validate input shape (per-record, fail fast on the first bad row).
    for i, rec in enumerate(records):
        if not isinstance(rec, dict):
            raise TypeError(f"records[{i}] must be a dict, got {type(rec).__name__}")
        if not rec.get("text"):
            raise ValueError(f"records[{i}] missing required 'text' field")
        if not rec.get("source_id"):
            raise ValueError(f"records[{i}] missing required 'source_id' field")

    # Counters reported back to the caller; updated by the per-record tasks.
    stats = {
        "ingested": 0,
        "skipped": 0,
        "failed": 0,
        "degraded": 0,
        "entities": 0,
        "rels": 0,
    }

    if not records:
        _progress("No records to process.")
        return stats

    _progress(f"Processing {len(records)} records (in-memory ingest).")

    # Created lazily so the event is constructed while the loop is running.
    if self._shutdown_event is None:
        self._shutdown_event = asyncio.Event()

    doc_sem = asyncio.Semaphore(self.config.doc_concurrency)
    # The LLM is optional; the shared provider is reused when it exists.
    llm = None
    if not self.config.skip_extraction and self.config.llm_base_url:
        if self._llm is None:
            try:
                self._llm = get_llm_provider(self.config)
            except Exception as e:
                logger.warning(f"LLM provider unavailable, skipping extraction: {e}")
        llm = self._llm
    if llm is None:
        _progress("Extraction disabled — ingesting as pure vector RAG.")

    async def _process_record(idx: int, rec: dict):
        async with doc_sem:
            # Records still queued when shutdown is requested are skipped.
            if self._shutdown_event is not None and self._shutdown_event.is_set():
                stats["skipped"] += 1
                return
            attempt = 0
            while True:
                attempt += 1
                try:
                    rec_meta = rec.get("metadata")
                    rec_entities = rec.get("entities")
                    rec_rels = rec.get("relationships")
                    rec_skip_llm = bool(rec.get("skip_llm", False))
                    rec_pre_chunked = rec.get("pre_chunked")
                    r = await self._ingest_one_content(
                        rec["text"],
                        source_id=rec["source_id"],
                        ns=ns,
                        embedder=embedder,
                        llm=llm,
                        content_hash_fn=content_hash,
                        chunk_document_fn=chunk_document,
                        extract_from_chunks_fn=extract_from_chunks,
                        metadata=rec_meta,
                        known_entities=rec_entities,
                        known_relationships=rec_rels,
                        skip_llm_for_this_doc=rec_skip_llm,
                        pre_chunked=rec_pre_chunked,
                    )
                    if r:
                        stats["ingested"] += 1
                        stats["entities"] += r["entities"]
                        stats["rels"] += r["rels"]
                        if r.get("degraded"):
                            stats["degraded"] += 1
                        deg_note = (
                            " (extraction failed, vector-only)" if r.get("degraded") else ""
                        )
                        _progress(
                            f"[{idx}/{len(records)}] {rec['source_id']}: "
                            f"{r['entities']} entities, {r['rels']} rels{deg_note}"
                        )
                    else:
                        stats["skipped"] += 1
                    return
                except Exception as e:
                    # Postgres deadlock = SQLSTATE 40P01, serialization = 40001.
                    # The message is only inspected when no sqlstate
                    # attribute is available on the exception.
                    sqlstate = getattr(e, "sqlstate", None)
                    msg = str(e)
                    transient = sqlstate in ("40P01", "40001") or (
                        sqlstate is None
                        and (
                            "40P01" in msg
                            or "40001" in msg
                            or "deadlock detected" in msg
                            or "could not serialize" in msg
                        )
                    )
                    if transient and attempt < 3:
                        backoff = 0.2 * (2 ** (attempt - 1))
                        # Logged like the file-based ingest path so retries
                        # are visible for in-memory ingests as well.
                        logger.info(
                            f"Retry {attempt}/3 after {backoff:.1f}s for "
                            f"{rec['source_id']}: {msg[:80]}"
                        )
                        await asyncio.sleep(backoff)
                        continue
                    logger.warning(f"Failed {rec['source_id']}: {e}")
                    stats["failed"] += 1
                    return

    await asyncio.gather(*[_process_record(i + 1, rec) for i, rec in enumerate(records)])

    notes_msg = []
    if stats["failed"]:
        notes_msg.append(f"{stats['failed']} failed")
    if stats["degraded"]:
        notes_msg.append(f"{stats['degraded']} degraded")
    suffix = f", {', '.join(notes_msg)}" if notes_msg else ""
    _progress(
        f"Done: {stats['ingested']} ingested, {stats['skipped']} skipped"
        f"{suffix}. {stats['entities']} entities, {stats['rels']} relationships."
    )
    # Hand the counters back, as the docstring promises.
    return stats
|
|
692
|
+
|
|
693
|
+
async def _ingest_one_file(
|
|
694
|
+
self,
|
|
695
|
+
file_path,
|
|
696
|
+
ns,
|
|
697
|
+
embedder,
|
|
698
|
+
llm,
|
|
699
|
+
content_hash_fn,
|
|
700
|
+
chunk_document_fn,
|
|
701
|
+
extract_from_chunks_fn,
|
|
702
|
+
*,
|
|
703
|
+
metadata: dict | None = None,
|
|
704
|
+
):
|
|
705
|
+
"""Read a file from disk and ingest it.
|
|
706
|
+
|
|
707
|
+
Thin wrapper over `_ingest_one_content` — for in-memory ingest
|
|
708
|
+
(SQL → pgrg in the same database, no disk roundtrip) call
|
|
709
|
+
`ingest_records` instead, which routes directly to
|
|
710
|
+
`_ingest_one_content`.
|
|
711
|
+
"""
|
|
712
|
+
try:
|
|
713
|
+
with open(file_path, encoding="utf-8") as f:
|
|
714
|
+
content = f.read()
|
|
715
|
+
except (UnicodeDecodeError, ValueError):
|
|
716
|
+
logger.warning(f"Skipping non-UTF-8 file: {file_path}")
|
|
717
|
+
return None
|
|
718
|
+
return await self._ingest_one_content(
|
|
719
|
+
content,
|
|
720
|
+
source_id=file_path,
|
|
721
|
+
ns=ns,
|
|
722
|
+
embedder=embedder,
|
|
723
|
+
llm=llm,
|
|
724
|
+
content_hash_fn=content_hash_fn,
|
|
725
|
+
chunk_document_fn=chunk_document_fn,
|
|
726
|
+
extract_from_chunks_fn=extract_from_chunks_fn,
|
|
727
|
+
metadata=metadata,
|
|
728
|
+
)
|
|
729
|
+
|
|
730
|
+
async def _ingest_one_content(
    self,
    content: str,
    *,
    source_id: str,
    ns,
    embedder,
    llm,
    content_hash_fn,
    chunk_document_fn,
    extract_from_chunks_fn,
    metadata: dict | None = None,
    known_entities: list[dict] | None = None,
    known_relationships: list[dict] | None = None,
    skip_llm_for_this_doc: bool = False,
    pre_chunked: list[dict] | None = None,
):
    """Ingest a single document from in-memory content with all DB
    writes in a single transaction.

    Using db.transaction() ensures all chunks/entities/relationships for
    one doc commit atomically, and chunk_id from INSERT is immediately
    visible to entity_chunks INSERT on the same connection (no pool
    commit propagation race).

    ``source_id`` serves the same role as ``source_path`` in file-based
    ingest: it's the logical identifier for dedup (combined with
    content_hash) and stale-doc cleanup. Use a stable string —
    e.g. ``"sales_note:42"`` — so re-ingests of the same record
    replace the prior version atomically.

    ``known_entities`` and ``known_relationships`` let callers seed
    the graph with structured edges they already have (e.g. FK-derived
    relationships from a CRM). Each known entity is linked to every
    chunk of the document; each known relationship is linked to the
    first chunk. They merge with LLM-extracted entities/relationships
    via the entity-resolution path — same name across both sources
    resolves to the same row.

    ``metadata`` is persisted to ``documents.metadata`` JSONB as a
    whole (in addition to the evolution-tracking columns). Query via
    ``metadata->>'foo'`` after ingest.

    ``skip_llm_for_this_doc`` skips LLM extraction for this document
    only — useful when the caller's known_entities/known_relationships
    already cover everything they care about and the LLM would just
    add noise (or cost).

    ``pre_chunked`` lets callers bypass pg-raggraph's chunker AND
    embedder. Use when the chunks + embeddings already exist
    upstream (e.g. chunkshop end-to-end pipeline → pg-raggraph
    graph layer; see docs/cookbook/chunkshop-integration.md
    Pattern C). Each list entry is a dict::

        {
          "content": "<original chunk text>",                 # required
          "embedded_content": "<text given to the embedder>", # optional
          "embedding": [float, ...],                          # required (dim)
          "metadata": {...},                                  # optional (merged)
          "token_count": int,                                 # optional
        }

    When ``pre_chunked`` is set, ``content`` is still used as the
    full-document text input for LLM entity/relationship extraction
    — set it to a sensible reconstruction (e.g. join all chunks
    with newlines) so the LLM sees the document.

    Returns:
        ``None`` when a document with the same namespace and content
        hash already exists (nothing is written). Otherwise a dict with
        ``"entities"`` and ``"rels"`` counts. Every return made after
        extraction has been attempted also carries ``"degraded"``
        (True when LLM extraction raised and the document was ingested
        as pure vector data). The "no chunks" early returns happen
        before extraction and carry only the two counts.

    Raises:
        ValueError: a ``pre_chunked`` entry lacks ``content`` /
            ``embedding`` or its embedding length differs from
            ``config.embedding_dim``; a ``known_entities`` entry has no
            ``name``; a ``known_relationships`` entry lacks ``src`` or
            ``dst``.
    """
    # Use the source_id as the chunker's path hint so .md/.py-style
    # extension detection still works for callers that pass
    # filename-shaped IDs. For non-filename IDs the chunker falls back
    # to content-based detection (e.g. markdown headings).
    file_path = source_id

    # Delta check (read-only)
    c_hash = content_hash_fn(content)
    existing = await self.db.fetch_one(
        "SELECT id FROM documents WHERE namespace = %s AND content_hash = %s",
        (ns, c_hash),
    )
    if existing:
        logger.debug(f"Skipped (unchanged): {file_path}")
        return None

    # Chunk (no DB) — caller can pre-chunk to bypass pg-raggraph's
    # chunker AND embedder (e.g. chunkshop Pattern C, where the upstream
    # pipeline already chunked + embedded + extracted metadata).
    from pg_raggraph.chunking import token_count as _token_count

    if pre_chunked is not None:
        chunks = []
        chunk_embeddings = []
        for i, pc in enumerate(pre_chunked):
            if "content" not in pc or "embedding" not in pc:
                raise ValueError(f"pre_chunked[{i}] must include 'content' and 'embedding'")
            emb = pc["embedding"]
            if len(emb) != self.config.embedding_dim:
                raise ValueError(
                    f"pre_chunked[{i}].embedding has dim {len(emb)} but "
                    f"config.embedding_dim={self.config.embedding_dim}. "
                    "Configure GraphRAG with embedding_dim matching the "
                    "upstream embedder, or re-embed at the upstream layer."
                )
            body = pc["content"]
            emb_content = pc.get("embedded_content") or body
            meta = dict(pc.get("metadata") or {})
            meta.setdefault("source_path", file_path)
            meta.setdefault("chunk_index", i)
            chunks.append(
                {
                    "content": body,
                    "embedded_content": emb_content,
                    "token_count": pc.get("token_count") or _token_count(emb_content),
                    "content_hash": pc.get("content_hash") or content_hash_fn(body),
                    "metadata": meta,
                }
            )
            chunk_embeddings.append(emb)
        if not chunks:
            return {"entities": 0, "rels": 0}
    else:
        chunks = chunk_document_fn(content, source_path=file_path, config=self.config)
        if not chunks:
            return {"entities": 0, "rels": 0}

        # Batch embed all chunks. Use embedded_content so the embedder sees
        # heading prefix (hierarchy strategy) or any future neighbor/summary
        # decoration; for auto strategy this equals content.
        texts = [c["embedded_content"] for c in chunks]
        chunk_embeddings = await embedder.embed(texts)

    # Extract entities/relationships via LLM (cache reads OK outside txn).
    # If llm is None or skip_llm_for_this_doc is set, skip extraction
    # entirely — pure vector RAG mode (with whatever known_entities /
    # known_relationships the caller provides as the only graph signal).
    extraction_degraded = False
    if llm is None or skip_llm_for_this_doc:
        from pg_raggraph.models import ExtractionResult

        extraction_results = [ExtractionResult() for _ in chunks]
    else:
        try:
            extraction_results = await extract_from_chunks_fn(
                chunks, llm, self.db, self.config
            )
        except Exception as e:
            # Deliberate best-effort: a failed extraction must not lose the
            # document, so fall back to vector-only and flag it as degraded.
            logger.warning(f"Extraction failed for {file_path}, ingesting as pure vector: {e}")
            from pg_raggraph.models import ExtractionResult

            extraction_results = [ExtractionResult() for _ in chunks]
            extraction_degraded = True

    # Dedupe entities by name, build per-chunk entity/rel lists
    unique_entities = {}
    chunk_to_entities = []
    chunk_to_rels = []

    for i, extraction in enumerate(extraction_results):
        entity_names = []
        for ent in extraction.entities:
            if ent.name not in unique_entities:
                unique_entities[ent.name] = {
                    "entity_type": ent.entity_type,
                    "description": ent.description,
                    "chunks": [i],
                }
            else:
                unique_entities[ent.name]["chunks"].append(i)
                existing_desc = unique_entities[ent.name]["description"]
                if ent.description and ent.description not in existing_desc:
                    unique_entities[ent.name]["description"] += " " + ent.description
            entity_names.append(ent.name)
        chunk_to_entities.append(entity_names)
        chunk_to_rels.append(
            [
                (r.source, r.target, r.rel_type, r.description, r.weight)
                for r in extraction.relationships
            ]
        )

    # Guarantee one slot per chunk. The write loops below already tolerate
    # an extraction result shorter than `chunks`, but the known-entity /
    # known-relationship merge indexes these lists by chunk position and
    # would otherwise raise IndexError on a short result.
    for _ in range(len(chunks) - len(chunk_to_entities)):
        chunk_to_entities.append([])
        chunk_to_rels.append([])

    # Merge caller-supplied known entities and relationships.
    # Known entities are document-level: linked to every chunk.
    # Known relationships attach to chunk[0] (only one anchor point
    # is needed; graph traversal queries entities, not chunks).
    if known_entities:
        all_chunk_idxs = list(range(len(chunks)))
        for ke in known_entities:
            if not ke.get("name"):
                raise ValueError("known_entities entries must include a non-empty 'name'")
            name = ke["name"]
            ke_desc = ke.get("description", "") or ""
            ke_type = ke.get("entity_type", "ENTITY")
            if name not in unique_entities:
                unique_entities[name] = {
                    "entity_type": ke_type,
                    "description": ke_desc,
                    "chunks": list(all_chunk_idxs),
                }
            else:
                # LLM also found this entity. Caller's domain knowledge
                # WINS on entity_type and (if non-empty) description —
                # the user explicitly tagged this as a Customer/Product/
                # whatever, so don't let the LLM's generic "company"
                # classification overwrite the caller's intent.
                if ke_type and ke_type != "ENTITY":
                    unique_entities[name]["entity_type"] = ke_type
                if ke_desc:
                    unique_entities[name]["description"] = ke_desc
                existing = set(unique_entities[name]["chunks"])
                existing.update(all_chunk_idxs)
                unique_entities[name]["chunks"] = sorted(existing)
            # Reflect in chunk_to_entities so entity_chunks links are written.
            for ci in all_chunk_idxs:
                if name not in chunk_to_entities[ci]:
                    chunk_to_entities[ci].append(name)

    if known_relationships:
        for kr in known_relationships:
            if not (kr.get("src") and kr.get("dst")):
                raise ValueError("known_relationships entries must include 'src' and 'dst'")
            rel_tuple = (
                kr["src"],
                kr["dst"],
                kr.get("rel_type", "RELATED_TO"),
                kr.get("description", "") or "",
                float(kr.get("weight", 1.0)),
            )
            # Anchor on chunk[0] — relationships are document-level.
            chunk_to_rels[0].append(rel_tuple)

    # Batch embed entities (no DB)
    if unique_entities:
        entity_names_list = list(unique_entities.keys())
        entity_texts = [
            f"{name} {unique_entities[name]['description']}" for name in entity_names_list
        ]
        entity_embeddings = await embedder.embed(entity_texts)
    else:
        entity_names_list = []
        entity_embeddings = []

    # All DB writes in a single transaction
    from pg_raggraph.resolution import resolve_entity

    async with self.db.transaction() as tx:
        # Incremental update: if source_path exists with a DIFFERENT hash,
        # the file has changed. Delete the stale document inside the same
        # transaction as the new insert so any failure mid-ingest rolls
        # back both the delete and the insert — the old version stays
        # visible until the new one commits. FK cascades take care of
        # chunks and the entity/relationship provenance joins. Call
        # prune_orphans() afterwards to clean up unreferenced entities.
        stale = await tx.fetch_one(
            "SELECT id FROM documents "
            "WHERE namespace = %s AND source_path = %s AND content_hash != %s",
            (ns, file_path, c_hash),
        )
        if stale:
            await tx.execute("DELETE FROM documents WHERE id = %s", (stale["id"],))
            logger.info(f"Replaced stale version of {file_path}")

        # Insert document with any caller-supplied evolution metadata.
        # ON CONFLICT uses COALESCE so a re-ingest without metadata doesn't
        # clobber previously-stored evolution fields. For `retracted` we
        # distinguish "absent from meta" (preserve prior value) from
        # "explicitly True/False" (apply the caller's value, including
        # un-retracting). COALESCE can't express this for booleans, so we
        # pass a separate `retracted_explicit` flag and gate the SET on it
        # via CASE WHEN.
        meta = metadata or {}
        eff_from = meta.get("effective_from")
        eff_to = meta.get("effective_to")
        retracted_explicit = "retracted" in meta and meta["retracted"] is not None
        # Value for fresh INSERT: the caller's value if explicit, else
        # False (matches the column DEFAULT). On UPDATE the CASE WHEN
        # below decides whether to apply it at all.
        retracted_value = bool(meta["retracted"]) if retracted_explicit else False
        version_label = meta.get("version_label")
        supersedes_doc = meta.get("supersedes_document_id")

        # Persist arbitrary caller metadata to documents.metadata JSONB.
        # The dedicated evolution columns (effective_from etc.) ALSO get
        # the same fields, so callers can query either way.
        # Re-ingest merges (caller intent: add new keys, update changed
        # keys, leave untouched keys alone) — implemented via JSONB
        # concat in the ON CONFLICT branch.
        # Use _json_default so datetime values in metadata (e.g.
        # effective_from / effective_to from evolution-tracking ingests)
        # serialize to ISO strings instead of crashing the ingest.
        doc_metadata_json = json.dumps(meta, default=_json_default) if meta else "{}"

        doc_id = await tx.insert_returning_id(
            "INSERT INTO documents "
            "(namespace, content_hash, source_path, metadata, "
            " effective_from, effective_to, retracted, version_label) "
            "VALUES (%s, %s, %s, %s::jsonb, %s, %s, %s, %s) "
            "ON CONFLICT (namespace, content_hash) DO UPDATE "
            "SET source_path = EXCLUDED.source_path, "
            "    metadata = documents.metadata || EXCLUDED.metadata, "
            "    effective_from = COALESCE("
            "EXCLUDED.effective_from, documents.effective_from), "
            "    effective_to = COALESCE("
            "EXCLUDED.effective_to, documents.effective_to), "
            "    retracted = CASE WHEN %s "
            "THEN EXCLUDED.retracted ELSE documents.retracted END, "
            "    version_label = COALESCE("
            "EXCLUDED.version_label, documents.version_label) "
            "RETURNING id",
            (
                ns,
                c_hash,
                file_path,
                doc_metadata_json,
                eff_from,
                eff_to,
                retracted_value,
                version_label,
                retracted_explicit,
            ),
        )

        # If caller supplied version info or a supersession edge, create a
        # document_versions row for authoritative multi-version tracking.
        if version_label or supersedes_doc or meta.get("retraction_reason"):
            await tx.execute(
                "INSERT INTO document_versions "
                "(namespace, document_id, version_label, effective_from, effective_to, "
                " supersedes_document_id, retracted, retracted_at, retraction_reason) "
                "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)",
                (
                    ns,
                    doc_id,
                    version_label,
                    eff_from,
                    eff_to,
                    supersedes_doc,
                    retracted_value,
                    meta.get("retracted_at"),
                    meta.get("retraction_reason"),
                ),
            )

        # Insert all chunks
        chunk_ids = []
        for i, chunk in enumerate(chunks):
            chunk_id = await tx.insert_returning_id(
                "INSERT INTO chunks "
                "(document_id, content, embedded_content, embedding, token_count, metadata) "
                "VALUES (%s, %s, %s, %s, %s, %s) RETURNING id",
                (
                    doc_id,
                    chunk["content"],
                    chunk["embedded_content"],
                    chunk_embeddings[i],
                    chunk["token_count"],
                    json.dumps(chunk["metadata"], default=_json_default),
                ),
            )
            chunk_ids.append(chunk_id)

        if not unique_entities:
            # Vector-only outcome. Report `degraded` here too: a failed
            # extraction yields no entities, so this is the path a degraded
            # ingest normally takes and the flag must not be lost.
            return {"entities": 0, "rels": 0, "degraded": extraction_degraded}

        # Resolve and insert entities (tx duck-types the db interface)
        entity_name_to_id = {}
        for name, emb in zip(entity_names_list, entity_embeddings):
            info = unique_entities[name]
            eid = await resolve_entity(
                name=name,
                entity_type=info["entity_type"],
                description=info["description"],
                embedding=emb,
                namespace=ns,
                db=tx,
                config=self.config,
            )
            entity_name_to_id[name] = eid

        # Insert entity_chunks links
        for i, chunk_id in enumerate(chunk_ids):
            if i >= len(chunk_to_entities):
                break
            seen = set()
            for ent_name in chunk_to_entities[i]:
                if ent_name in seen or ent_name not in entity_name_to_id:
                    continue
                seen.add(ent_name)
                await tx.execute(
                    "INSERT INTO entity_chunks (entity_id, chunk_id, confidence) "
                    "VALUES (%s, %s, %s) ON CONFLICT DO NOTHING",
                    (entity_name_to_id[ent_name], chunk_id, 1.0),
                )

        # Insert relationships and their chunk links
        rel_count = 0
        for i, chunk_id in enumerate(chunk_ids):
            if i >= len(chunk_to_rels):
                break
            for rel in chunk_to_rels[i]:
                # Relationships whose endpoints were not resolved to an
                # entity in this document are skipped.
                src_id = entity_name_to_id.get(rel[0])
                dst_id = entity_name_to_id.get(rel[1])
                if not (src_id and dst_id):
                    continue
                rel_id = await tx.insert_returning_id(
                    "INSERT INTO relationships "
                    "(namespace, src_id, dst_id, rel_type, weight, description) "
                    "VALUES (%s, %s, %s, %s, %s, %s) RETURNING id",
                    (ns, src_id, dst_id, rel[2], rel[4], rel[3]),
                )
                await tx.execute(
                    "INSERT INTO relationship_chunks "
                    "(relationship_id, chunk_id, confidence) "
                    "VALUES (%s, %s, %s) ON CONFLICT DO NOTHING",
                    (rel_id, chunk_id, 1.0),
                )
                rel_count += 1

    return {
        "entities": len(unique_entities),
        "rels": rel_count,
        "degraded": extraction_degraded,
    }
|
|
1151
|
+
|
|
1152
|
+
async def query(
    self,
    question: str,
    mode: str = "smart",
    namespace: str | None = None,
    *,
    as_of: datetime | None = None,
    version_filter: str | None = None,
    evolution_aware: bool | None = None,
    rerank: bool = False,
) -> QueryResult:
    """Retrieve context for ``question`` from the knowledge graph.

    Supported ``mode`` values:
        smart (default) - confidence-triggered routing (naive → boost → expand)
        naive           - vector + BM25 only (fastest)
        naive_boost     - naive + 1-hop graph boost re-ranking
        local           - vector seed → graph expansion via entity neighbors
        global          - relationship-centric retrieval
        hybrid          - local + global combined

    Keyword-only evolution options:
        as_of: time-travel filter; only documents whose effective window
            contains this timestamp are considered.
        version_filter: only documents with a matching version_label.
        evolution_aware: False ignores evolution_tier for this query
            (classic retrieval); None defers to the config.
        rerank: True fetches ``top_k * rerank_factor`` candidates and
            re-ranks them with a cross-encoder before trimming to
            ``top_k``. Adds ~30-80 ms p50 latency, zero per-query LLM
            cost. The model and factor come from
            ``PGRGConfig.rerank_model`` and ``rerank_factor``.

    The namespace defaults to ``config.namespace`` and is validated
    before any retrieval work starts.
    """
    from pg_raggraph.retrieval import query as retrieval_query

    target_ns = namespace or self.config.namespace
    _validate_namespace(target_ns)
    embedder = self._get_embedder()

    # Over-fetch only when a rerank pass will trim the list afterwards.
    if rerank:
        candidate_k = self.config.top_k * self.config.rerank_factor
    else:
        candidate_k = None

    hits = await retrieval_query(
        question=question,
        db=self.db,
        embedder=embedder,
        config=self.config,
        mode=mode,
        namespace=target_ns,
        as_of=as_of,
        version_filter=version_filter,
        evolution_aware=evolution_aware,
        top_k_override=candidate_k,
    )
    if not rerank:
        return hits

    from pg_raggraph.reranker import FastEmbedReranker, apply_reranker

    # The reranker is built once on first use and reused afterwards.
    if self._reranker is None:
        self._reranker = FastEmbedReranker(self.config.rerank_model)
    return await apply_reranker(self._reranker, question, hits, self.config.top_k)
|
|
1210
|
+
|
|
1211
|
+
async def ask(
    self,
    question: str,
    mode: str = "smart",
    namespace: str | None = None,
    *,
    as_of: datetime | None = None,
    version_filter: str | None = None,
    evolution_aware: bool | None = None,
    short_answer: bool = False,
    rerank: bool = False,
) -> QueryResult:
    """Retrieve context for ``question`` and synthesize an answer.

    Retrieval is delegated to `query` with the same mode, namespace and
    evolution options; the result is then passed to ``generate_answer``
    and the produced text is stored on ``result.answer``. When no LLM
    base URL is configured, or the provider cannot be constructed,
    ``generate_answer`` receives ``None`` as the LLM — per the original
    contract this falls back to a top-chunk summary so the library
    stays useful as pure vector RAG.

    ``short_answer=True`` asks the LLM for a short factoid answer
    (≤10 tokens, single phrase) instead of a paragraph; useful for
    SQuAD-style benchmarks where gold answers are short strings.

    ``rerank=True`` re-ranks the retrieved chunks with a cross-encoder
    before answer generation. Adds ~30-80 ms p50 latency, zero
    per-query LLM cost.
    """
    from pg_raggraph.answer import generate_answer

    retrieved = await self.query(
        question,
        mode=mode,
        namespace=namespace,
        as_of=as_of,
        version_filter=version_filter,
        evolution_aware=evolution_aware,
        rerank=rerank,
    )

    # Reuse the shared LLM client (same pool as ingestion), creating it
    # lazily the first time an answer is requested.
    llm_configured = bool(self.config.llm_base_url)
    if llm_configured and self._llm is None:
        try:
            from pg_raggraph.extraction import get_llm_provider

            self._llm = get_llm_provider(self.config)
        except Exception as exc:
            # Best-effort: leave self._llm as None and let answer
            # generation run without an LLM.
            logger.warning(f"LLM provider unavailable: {exc}")
    answer_llm = self._llm if llm_configured else None

    retrieved.answer = await generate_answer(
        question, retrieved, answer_llm, self.config, short_answer=short_answer
    )
    return retrieved
|
|
1263
|
+
|
|
1264
|
+
async def status(self, namespace: str | None = None) -> dict:
    """Return graph statistics for a namespace.

    The namespace defaults to ``config.namespace``. ``schema_version``
    and ``embedding_dim`` are read from the meta store and reported as
    0 when the stored value is missing or empty.
    """
    ns = namespace or self.config.namespace

    schema_version = int(await self.db.get_meta("schema_version") or 0)
    embedding_dim = int(await self.db.get_meta("embedding_dim") or 0)
    document_total = await self.db.count("documents", ns)

    # Chunks table has no namespace column — scope via documents join.
    chunk_row = await self.db.fetch_one(
        "SELECT count(*) AS cnt FROM chunks c "
        "JOIN documents d ON d.id = c.document_id "
        "WHERE d.namespace = %s",
        (ns,),
    )
    chunk_total = chunk_row["cnt"]

    entity_total = await self.db.count("entities", ns)
    relationship_total = await self.db.count("relationships", ns)

    return {
        "schema_version": schema_version,
        "embedding_dim": embedding_dim,
        "namespace": ns,
        "documents": document_total,
        "chunks": chunk_total,
        "entities": entity_total,
        "relationships": relationship_total,
    }
|
|
1284
|
+
|
|
1285
|
+
async def delete(self, namespace: str):
    """Delete all data in a namespace.

    Removes the namespace's documents, entities and relationships. The
    three deletes run inside one transaction so a failure part-way
    through rolls everything back instead of leaving the namespace
    half-deleted (e.g. documents gone but entities still present).

    Args:
        namespace: namespace to wipe; validated before anything is
            deleted.
    """
    _validate_namespace(namespace)
    async with self.db.transaction() as tx:
        await tx.execute("DELETE FROM documents WHERE namespace = %s", (namespace,))
        await tx.execute("DELETE FROM entities WHERE namespace = %s", (namespace,))
        await tx.execute("DELETE FROM relationships WHERE namespace = %s", (namespace,))
|
|
1291
|
+
|
|
1292
|
+
async def delete_document(self, source_path: str, namespace: str | None = None) -> int:
    """Delete a document and all its chunks by source path.

    Entities and relationships are left in place — they may be referenced
    by other documents. Use `prune_orphans()` to clean up any entities
    that become unreferenced.

    Args:
        source_path: the ``source_path`` / ``source_id`` the document
            was ingested under.
        namespace: namespace to search; defaults to ``config.namespace``.

    Returns:
        Number of document rows deleted. The DELETE removes every row
        matching the namespace and source path, so every RETURNING row
        is counted rather than only the first.
    """
    ns = namespace or self.config.namespace
    _validate_namespace(ns)
    async with self.db.transaction() as tx:
        deleted_rows = await tx.fetch_all(
            "DELETE FROM documents WHERE namespace = %s AND source_path = %s RETURNING id",
            (ns, source_path),
        )
    return len(deleted_rows or ())
|
|
1308
|
+
|
|
1309
|
+
async def delete_entity(self, entity_id: int) -> bool:
    """Delete the entity with the given id.

    Only the ``entities`` row is deleted here; its relationships are
    presumably removed by FK cascade — confirm against schema.sql.

    Returns:
        True when a row was deleted, False when no entity had that id.
    """
    deleted_row = await self.db.fetch_one(
        "DELETE FROM entities WHERE id = %s RETURNING id", (entity_id,)
    )
    if deleted_row is None:
        return False
    return True
|
|
1315
|
+
|
|
1316
|
+
async def merge_entities(self, keep_id: int, merge_ids: list[int]) -> dict:
    """Fold the entities in ``merge_ids`` into the canonical ``keep_id``.

    Inside one transaction this repoints relationships and entity_chunks
    at ``keep_id``, removes self-loops and duplicate edges produced by
    the rewrite, and finally deletes the merged entities.

    Returns:
        ``{"kept": keep_id, "merged_count": len(merge_ids)}``.

    Raises:
        ValueError: ``merge_ids`` is empty; ``keep_id`` appears in
            ``merge_ids`` (would delete the canonical entity); any of
            the ids does not exist; or the entities span more than one
            namespace.
    """
    if not merge_ids:
        raise ValueError("merge_ids must not be empty")
    if keep_id in merge_ids:
        raise ValueError(
            f"keep_id {keep_id} must not appear in merge_ids — "
            "that would delete the canonical entity"
        )

    involved_ids = [keep_id, *merge_ids]

    # Executed strictly in this order once validation has passed.
    merge_steps = [
        # Repoint relationships. After rewriting src_id and dst_id, any
        # edge whose src and dst both collapse to keep_id becomes a
        # self-loop, and edges sharing (src, dst, rel_type) become
        # duplicates; both are cleaned up by the next two statements.
        (
            "UPDATE relationships SET src_id = %s WHERE src_id = ANY(%s)",
            (keep_id, merge_ids),
        ),
        (
            "UPDATE relationships SET dst_id = %s WHERE dst_id = ANY(%s)",
            (keep_id, merge_ids),
        ),
        # Drop self-loops created by the merge.
        (
            "DELETE FROM relationships WHERE src_id = dst_id AND (src_id = %s OR dst_id = %s)",
            (keep_id, keep_id),
        ),
        # Collapse duplicate edges (keep the lowest id per group).
        (
            "DELETE FROM relationships a USING relationships b "
            "WHERE a.id > b.id AND a.src_id = b.src_id AND "
            "a.dst_id = b.dst_id AND a.rel_type = b.rel_type AND "
            "a.namespace = b.namespace AND (a.src_id = %s OR a.dst_id = %s)",
            (keep_id, keep_id),
        ),
        # Copy entity_chunks rows from merged entities to keep_id,
        # deduping via ON CONFLICT, then delete the old rows.
        (
            "INSERT INTO entity_chunks (entity_id, chunk_id, confidence, provenance) "
            "SELECT %s, chunk_id, confidence, provenance FROM entity_chunks "
            "WHERE entity_id = ANY(%s) "
            "ON CONFLICT DO NOTHING",
            (keep_id, merge_ids),
        ),
        (
            "DELETE FROM entity_chunks WHERE entity_id = ANY(%s)",
            (merge_ids,),
        ),
        # Delete merged entities.
        (
            "DELETE FROM entities WHERE id = ANY(%s)",
            (merge_ids,),
        ),
    ]

    async with self.db.transaction() as tx:
        # Verify all entities exist and share a namespace. Cross-namespace
        # merges are almost always a bug.
        entity_rows = await tx.fetch_all(
            "SELECT id, namespace FROM entities WHERE id = ANY(%s)",
            (involved_ids,),
        )
        present = {row["id"] for row in entity_rows}
        absent = set(involved_ids) - present
        if absent:
            raise ValueError(f"entities not found: {sorted(absent)}")
        seen_namespaces = {row["namespace"] for row in entity_rows}
        if len(seen_namespaces) > 1:
            raise ValueError(f"cross-namespace merge refused: {sorted(seen_namespaces)}")

        for statement, params in merge_steps:
            await tx.execute(statement, params)

    return {"kept": keep_id, "merged_count": len(merge_ids)}
|
|
1393
|
+
|
|
1394
|
+
async def prune_orphans(self, namespace: str | None = None) -> dict:
    """Remove entities and relationships that no chunk links to.

    The namespace defaults to ``config.namespace`` and is validated
    first. An entity is an orphan when its id is absent from
    ``entity_chunks``; a relationship when its id is absent from
    ``relationship_chunks``.

    Returns:
        ``{"entities_pruned": int, "relationships_pruned": int}`` —
        the orphan counts measured just before the deletes ran.
    """
    ns = namespace or self.config.namespace
    _validate_namespace(ns)
    scope = (ns,)

    # Count first, then delete — gives a clean int return value that's
    # easy to assert on in tests and log in production.
    orphan_entities = await self.db.fetch_one(
        "SELECT count(*) AS cnt FROM entities WHERE namespace = %s "
        "AND id NOT IN (SELECT DISTINCT entity_id FROM entity_chunks)",
        scope,
    )
    orphan_relationships = await self.db.fetch_one(
        "SELECT count(*) AS cnt FROM relationships WHERE namespace = %s "
        "AND id NOT IN (SELECT DISTINCT relationship_id FROM relationship_chunks)",
        scope,
    )

    summary = {
        "entities_pruned": orphan_entities["cnt"] if orphan_entities else 0,
        "relationships_pruned": orphan_relationships["cnt"] if orphan_relationships else 0,
    }

    await self.db.execute(
        "DELETE FROM entities WHERE namespace = %s AND id NOT IN "
        "(SELECT DISTINCT entity_id FROM entity_chunks)",
        scope,
    )
    await self.db.execute(
        "DELETE FROM relationships WHERE namespace = %s AND id NOT IN "
        "(SELECT DISTINCT relationship_id FROM relationship_chunks)",
        scope,
    )
    return summary
|
|
1426
|
+
|
|
1427
|
+
async def tune_scoring_weights(self, **kwargs):
    """Grid-search scoring weights against a gold QA set.

    Delegates to ``tune_scoring_weights`` in ``pg_raggraph.evolution``,
    passing this instance first and forwarding every keyword argument
    unchanged; see src/pg_raggraph/evolution.py for the accepted args.
    """
    from pg_raggraph.evolution import tune_scoring_weights as _tune

    tuned = await _tune(self, **kwargs)
    return tuned
|