pg-raggraph 0.3.0a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1432 @@
1
+ """pg-raggraph — PostgreSQL-native GraphRAG."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import logging
7
+ import os
8
+ import re
9
+ from datetime import datetime
10
+ from importlib.metadata import PackageNotFoundError
11
+ from importlib.metadata import version as _pkg_version
12
+
13
try:
    # Preferred source of truth: the version recorded in the installed
    # distribution's metadata.
    __version__ = _pkg_version("pg-raggraph")
except PackageNotFoundError:
    # Editable install without installed metadata (rare). Mirror pyproject.
    # NOTE: this literal has to be bumped by hand on every release.
    __version__ = "0.3.0a2"
18
+
19
+ from pg_raggraph.config import PGRGConfig
20
+ from pg_raggraph.models import QueryResult
21
+
22
# Single source of truth for which file extensions the directory walker will
# pick up. The FastAPI server and the MCP server mirror this allowlist so
# every surface accepts the same set. It is deliberately a tuple (not a
# list/set) because str.endswith() only accepts a str or a tuple of str.
INGEST_ALLOWED_EXTS: tuple[str, ...] = (
    # prose / documentation
    ".md", ".txt",
    # source code
    ".py", ".ts", ".js", ".tsx", ".jsx", ".go", ".rs", ".java",
    # reStructuredText
    ".rst",
)
38
+
39
# Names re-exported by ``from pg_raggraph import *``; this is the package's
# declared public surface.
__all__ = [
    "GraphRAG",
    "INGEST_ALLOWED_EXTS",
    "PGRGConfig",
    "QueryResult",
    "__version__",
]

# Package-level logger. _configure_logging() attaches the optional JSON
# handler to this logger (not to the root logger).
logger = logging.getLogger("pg_raggraph")
48
+
49
+
50
def _json_default(obj):
    """Serialize values that ``json.dumps`` cannot encode on its own.

    Intended to be passed as ``default=`` to ``json.dumps``.

    * ``datetime`` instances become ISO 8601 strings, which keeps them
      queryable from JSONB (``metadata->>'effective_from'``).
    * Every other object is rendered with ``str(obj)`` so that an unusual
      metadata value degrades to text instead of aborting an ingest.
    """
    return obj.isoformat() if isinstance(obj, datetime) else str(obj)
60
+
61
+
62
class _JSONLogFormatter(logging.Formatter):
    """Minimal stdlib-only JSON formatter for log aggregator pipelines.

    No extra dep. Output shape matches the common Datadog / ELK / Loki
    expectation: `ts`, `level`, `logger`, `msg`, plus `exc_info` when present.
    Honors `extra={...}` on log calls — anything extra is merged at the top
    level (keeping `ts`, `level`, `logger`, `msg` reserved).
    """

    # Keys this formatter emits itself; an ``extra`` using one of these names
    # is dropped rather than allowed to overwrite the canonical field.
    _RESERVED = frozenset({"ts", "level", "logger", "msg", "exc_info"})

    # Attributes the logging module puts on every LogRecord. They are not
    # user-supplied extras, so they are never merged into the payload.
    # Kept as a class-level frozenset so the per-attribute membership test in
    # format() is a hash lookup instead of a linear scan on every iteration.
    _STD_ATTRS = frozenset(
        {
            "args",
            "msg",
            "name",
            "exc_info",
            "exc_text",
            "stack_info",
            "lineno",
            "module",
            "filename",
            "pathname",
            "funcName",
            "process",
            "processName",
            "thread",
            "threadName",
            "created",
            "msecs",
            "relativeCreated",
            "levelname",
            "levelno",
            "asctime",
            "message",
            "taskName",
        }
    )

    def format(self, record: logging.LogRecord) -> str:  # noqa: D401
        """Render ``record`` as a single-line JSON object.

        Args:
            record: The log record to serialize.

        Returns:
            A JSON string with ``ts``, ``level``, ``logger`` and ``msg``,
            ``exc_info`` when the record carries exception info, and any
            user-supplied ``extra`` attributes merged at the top level.
        """
        payload: dict = {
            "ts": self.formatTime(record, "%Y-%m-%dT%H:%M:%S%z"),
            "level": record.levelname,
            "logger": record.name,
            "msg": record.getMessage(),
        }
        if record.exc_info:
            payload["exc_info"] = self.formatException(record.exc_info)
        # Merge extras (logger.info("...", extra={"request_id": x}) patterns).
        for key, value in record.__dict__.items():
            if key in self._RESERVED or key in self._STD_ATTRS or key.startswith("_"):
                continue
            try:
                # Probe serializability per value so one bad extra cannot
                # take down the whole log line.
                json.dumps(value)
            except (TypeError, ValueError):
                payload[key] = repr(value)
            else:
                payload[key] = value
        return json.dumps(payload, default=str)
120
+
121
+
122
# Process-wide latch: set once _configure_logging() has made its decision so
# repeated calls (or re-imports) never stack additional handlers.
_logging_configured = False


def _configure_logging() -> None:
    """Idempotent package-logger configuration honoring PGRG_LOG_FORMAT.

    Default (env unset or anything other than "json"): leave existing handlers
    alone — caller's logging setup wins. When PGRG_LOG_FORMAT=json AND no
    handlers are attached to the pg_raggraph logger yet, install a single
    StreamHandler with the JSON formatter at PGRG_LOG_LEVEL (default INFO).

    PGRG_LOG_LEVEL is matched case-insensitively with surrounding whitespace
    ignored. An unknown name — or a name that resolves to something on the
    ``logging`` module that is not a numeric level — falls back to INFO
    instead of raising during import.
    """
    global _logging_configured
    if _logging_configured:
        return
    fmt = os.environ.get("PGRG_LOG_FORMAT", "").strip().lower()
    if fmt != "json":
        _logging_configured = True
        return
    if logger.handlers:
        # Caller already wired their own handler; respect it.
        _logging_configured = True
        return
    level_name = os.environ.get("PGRG_LOG_LEVEL", "INFO").strip().upper()
    level = getattr(logging, level_name, logging.INFO)
    if not isinstance(level, int):
        # e.g. "BASIC_FORMAT" resolves to a format string, which
        # Handler.setLevel() would reject with ValueError.
        level = logging.INFO
    handler = logging.StreamHandler()
    handler.setFormatter(_JSONLogFormatter())
    handler.setLevel(level)
    logger.addHandler(handler)
    logger.setLevel(level)
    _logging_configured = True


_configure_logging()
154
+
155
# Allowed namespace alphabet: ASCII letters, digits, underscore, hyphen, dot;
# between 1 and 64 characters.
_NAMESPACE_RE = re.compile(r"^[a-zA-Z0-9_\-\.]{1,64}$")


def _validate_namespace(ns: str) -> None:
    """Reject namespace strings outside the allowed alphabet/length.

    Args:
        ns: Candidate namespace.

    Raises:
        ValueError: If ``ns`` is empty, longer than 64 characters, or contains
            anything other than alphanumerics, hyphens, underscores or dots.
    """
    # fullmatch, not match: with match(), the trailing ``$`` also matches just
    # before a final newline, so "tenant\n" would slip through validation.
    if not _NAMESPACE_RE.fullmatch(ns):
        raise ValueError(
            f"Invalid namespace '{ns}'. Must be 1-64 chars, "
            "alphanumeric/hyphens/underscores/dots only."
        )
164
+
165
+
166
+ class GraphRAG:
167
+ """Main entry point for pg-raggraph.
168
+
169
+ Usage:
170
+ async with GraphRAG("postgresql://localhost/mydb") as rag:
171
+ await rag.ingest(["./docs/"])
172
+ result = await rag.query("How does auth work?")
173
+ for chunk in result.chunks:
174
+ print(chunk.content)
175
+ """
176
+
177
def __init__(self, dsn: str | None = None, *, reranker=None, **kwargs):
    """Build the configuration and initialise lazily-created collaborators.

    Nothing is opened here: the database, embedder and LLM provider are all
    created later, on connect() or first use.

    Args:
        dsn: PostgreSQL connection string. When truthy it overrides any
            ``dsn`` passed through ``kwargs``; otherwise the value comes
            from ``kwargs["dsn"]`` or whatever PGRGConfig resolves itself
            (e.g. the PGRG_DSN env var).
        reranker: Optional Reranker (see pg_raggraph.reranker.Reranker
            protocol) injected by power users. If None, a
            FastEmbedReranker is lazy-loaded from config.rerank_model
            on first use of rerank=True.
        **kwargs: Any PGRGConfig field. See docs/Config-Reference.md
            for the full list.
    """
    config_fields = {**kwargs, "dsn": dsn} if dsn else kwargs
    self.config = PGRGConfig(**config_fields)

    # An injected reranker is used as-is; None means "lazy-load from
    # config.rerank_model on the first rerank=True call".
    self._reranker = reranker

    # Collaborators that are created on demand.
    self._db = None
    self._embedder = None
    self._llm = None  # Shared LLM provider; closed with the instance

    # PR-209: cooperative shutdown signal for long-running ingest loops.
    # Left as None here and created inside ingest(), because the event has
    # to be created on the running asyncio loop, not at __init__ time.
    self._shutdown_event = None
203
+
204
def request_shutdown(self) -> None:
    """Ask any in-flight ingest loop to wind down cooperatively.

    Per-file transactions that are already running are allowed to finish;
    files still queued are turned into no-ops and counted as skipped.
    The method only sets an event, so it can be registered directly as a
    SIGTERM/SIGINT handler::

        import asyncio, signal
        from pg_raggraph import GraphRAG

        rag = GraphRAG(...)
        loop = asyncio.get_running_loop()
        for sig in (signal.SIGTERM, signal.SIGINT):
            loop.add_signal_handler(sig, rag.request_shutdown)

    Calling it repeatedly is harmless, and calling it before any ingest has
    created the event does nothing.
    """
    event = self._shutdown_event
    if event is None:
        # No ingest has started yet, so there is nothing to signal.
        return
    event.set()
222
+
223
async def connect(self):
    """Create the Database wrapper and open its connection.

    Raises:
        ConnectionError: If the underlying connect call fails for any
            reason. The original exception is chained as ``__cause__``.
            The DSN shown in the message has its password masked so that
            credentials do not end up in logs or tracebacks.
    """
    from pg_raggraph.db import Database

    self._db = Database(self.config)
    try:
        await self._db.connect()
    except Exception as e:
        # Mask secrets before echoing the DSN back to the caller.
        # URI form:      postgresql://user:secret@host/db  -> user:***@host
        safe_dsn = re.sub(
            r"(://[^:/?#@\s]*):[^@/?#\s]*@",
            r"\1:***@",
            str(self.config.dsn),
        )
        # Keyword / query-parameter form:  password=secret  /  password='a b'
        safe_dsn = re.sub(
            r"(?i)(password\s*=\s*)('(?:[^'\\]|\\.)*'|[^\s&]+)",
            r"\1***",
            safe_dsn,
        )
        raise ConnectionError(
            f"Cannot connect to PostgreSQL at {safe_dsn}. "
            f"Is the database running? Error: {e}"
        ) from e
234
+
235
async def close(self):
    """Release the database handle and the shared LLM provider.

    Safe to call when nothing was opened. Cleanup is exhaustive: a failure
    while closing the database no longer prevents the LLM client from being
    closed or the cached references from being cleared. The first error
    still propagates to the caller once cleanup has run.
    """
    try:
        if self._db:
            await self._db.close()
    finally:
        self._db = None
        try:
            # Providers without an async close hook are simply dropped.
            if self._llm is not None and hasattr(self._llm, "aclose"):
                await self._llm.aclose()
        finally:
            self._llm = None
            self._embedder = None
243
+
244
async def __aenter__(self):
    """Enter ``async with``: connect, then hand back this instance."""
    await self.connect()
    return self
247
+
248
async def __aexit__(self, *exc):
    """Leave ``async with``: close resources; never suppresses exceptions."""
    await self.close()
    return None
250
+
251
@property
def db(self):
    """The database handle set by connect().

    Raises:
        RuntimeError: If no database handle is currently set.
    """
    handle = self._db
    if handle is not None:
        return handle
    raise RuntimeError("Not connected. Call connect() or use async with.")
256
+
257
def _get_embedder(self):
    """Return the embedding provider, creating and caching it on first use."""
    if self._embedder is not None:
        return self._embedder

    # Imported lazily so the embedding module is only loaded when an
    # embedder is actually needed.
    from pg_raggraph.embedding import get_embedding_provider

    self._embedder = get_embedding_provider(self.config)
    return self._embedder
263
+
264
async def ingest(
    self,
    paths: list[str],
    namespace: str | None = None,
    on_progress=None,
    *,
    metadata: dict | None = None,
):
    """Ingest documents from file paths with parallel processing.

    Optimizations:
    - Parallel LLM extraction (extract_concurrency, default 8)
    - Batched entity embeddings (1 call instead of N)
    - Parallel document processing (doc_concurrency, default 4)
    - Content hash dedup

    Args:
        paths: File or directory paths to ingest. A single ``str`` /
            ``os.PathLike`` is accepted and treated as a one-element list.
            Directories are walked recursively, keeping only files whose
            name ends with one of ``INGEST_ALLOWED_EXTS``; explicitly
            listed files are taken as-is.
        namespace: Namespace for data isolation. Defaults to
            ``config.namespace``.
        on_progress: Optional callback(message: str) for progress updates.
        metadata: Per-ingest evolution hints applied to every file in this
            call. Optional keys: ``effective_from``, ``effective_to``,
            ``retracted``, ``retracted_at``, ``retraction_reason``,
            ``version_label``, ``supersedes_document_id``. When
            ``version_label``, ``supersedes_document_id``, or
            ``retraction_reason`` is present, a ``document_versions`` row
            is also created mirroring the document's evolution metadata.

    Returns:
        Dict of counters for this call with the keys ``ingested``,
        ``skipped``, ``failed``, ``degraded``, ``entities`` and ``rels``.
        All zero when no supported file was found.

    Raises:
        ValueError: If the namespace is invalid.
        FileNotFoundError: If an entry of ``paths`` is neither a file nor
            a directory.
    """
    import asyncio

    from pg_raggraph.chunking import chunk_document, content_hash
    from pg_raggraph.extraction import extract_from_chunks, get_llm_provider

    # A bare string would otherwise be iterated character by character —
    # "./docs" would start by walking "." (the whole working directory).
    if isinstance(paths, (str, os.PathLike)):
        paths = [paths]

    ns = namespace or self.config.namespace
    _validate_namespace(ns)
    # PR-215: apply nice_level here (was previously in config init,
    # which surprised callers by mutating process priority on import).
    self.config.apply_nice_level()
    embedder = self._get_embedder()

    def _progress(msg: str):
        logger.info(msg)
        if on_progress:
            on_progress(msg)

    # Directories to skip when walking — avoid vendored code, build artifacts,
    # model checkpoints, etc.
    SKIP_DIRS = {
        ".git",
        ".venv",
        "venv",
        "node_modules",
        "target",  # Rust build
        "dist",
        "build",
        "__pycache__",
        ".pytest_cache",
        ".ruff_cache",
        ".mypy_cache",
        ".tox",
        "checkpoints",
        "models",
        ".cargo",
        ".idea",
        ".vscode",
        "site-packages",
        ".autonomy",
        "skill-output",
    }
    SUPPORTED_EXTS = INGEST_ALLOWED_EXTS

    # Created up front so every exit path returns the same shape.
    stats = {
        "ingested": 0,
        "skipped": 0,
        "failed": 0,
        "degraded": 0,
        "entities": 0,
        "rels": 0,
    }

    # Collect and validate file paths
    file_paths = []
    for p in paths:
        if os.path.isdir(p):
            for root, dirs, files in os.walk(p):
                # Prune skipped dirs in-place so we don't descend into them
                dirs[:] = [d for d in dirs if d not in SKIP_DIRS]
                for f in files:
                    if f.endswith(SUPPORTED_EXTS):
                        file_paths.append(os.path.join(root, f))
        elif os.path.isfile(p):
            file_paths.append(p)
        else:
            raise FileNotFoundError(f"Path not found: {p}")

    if not file_paths:
        logger.warning("No supported files found in provided paths.")
        return stats

    _progress(f"Found {len(file_paths)} files to process.")

    # PR-209: lazily create the shutdown event on the running loop.
    # request_shutdown() can be called before this without error (no-op);
    # once an ingest is in flight, it observes the event and drains.
    if self._shutdown_event is None:
        self._shutdown_event = asyncio.Event()

    # Process documents in parallel batches
    doc_sem = asyncio.Semaphore(self.config.doc_concurrency)
    # LLM is optional — without it, ingest stores chunks+embeddings only
    # (pure vector RAG mode). Reuse the shared provider if already created
    # so the connection pool is shared across ingest() calls.
    llm = None
    if not self.config.skip_extraction and self.config.llm_base_url:
        if self._llm is None:
            try:
                self._llm = get_llm_provider(self.config)
            except Exception as e:
                logger.warning(f"LLM provider unavailable, skipping extraction: {e}")
        llm = self._llm
    if llm is None:
        _progress("Extraction disabled — ingesting as pure vector RAG.")

    async def _process_file(idx: int, file_path: str):
        # Retry on transient serialization / deadlock errors from
        # concurrent ingestion. Exponential backoff, max 3 attempts.
        async with doc_sem:
            # PR-209: drain gracefully on shutdown. Files queued behind
            # the semaphore become no-ops once request_shutdown() is
            # observed; in-flight files (already past this check) finish
            # their transaction normally.
            if self._shutdown_event is not None and self._shutdown_event.is_set():
                stats["skipped"] += 1
                return
            attempt = 0
            while True:
                attempt += 1
                try:
                    r = await self._ingest_one_file(
                        file_path,
                        ns,
                        embedder,
                        llm,
                        content_hash,
                        chunk_document,
                        extract_from_chunks,
                        metadata=metadata,
                    )
                    if r:
                        stats["ingested"] += 1
                        stats["entities"] += r["entities"]
                        stats["rels"] += r["rels"]
                        if r.get("degraded"):
                            stats["degraded"] += 1
                        deg_note = (
                            " (extraction failed, vector-only)" if r.get("degraded") else ""
                        )
                        _progress(
                            f"[{idx}/{len(file_paths)}] "
                            f"{os.path.basename(file_path)}: "
                            f"{r['entities']} entities, {r['rels']} rels{deg_note}"
                        )
                    else:
                        # None means unchanged content or an unreadable file.
                        stats["skipped"] += 1
                    return
                except Exception as e:
                    # Postgres deadlock = SQLSTATE 40P01, serialization = 40001.
                    # Prefer the structured sqlstate attribute (psycopg3) over
                    # string matching, which breaks on non-English PG builds.
                    sqlstate = getattr(e, "sqlstate", None)
                    msg = str(e)
                    transient = sqlstate in ("40P01", "40001") or (
                        sqlstate is None
                        and (
                            "40P01" in msg
                            or "40001" in msg
                            or "deadlock detected" in msg
                            or "could not serialize" in msg
                        )
                    )
                    if transient and attempt < 3:
                        backoff = 0.2 * (2 ** (attempt - 1))
                        logger.info(
                            f"Retry {attempt}/3 after {backoff:.1f}s for "
                            f"{file_path}: {msg[:80]}"
                        )
                        await asyncio.sleep(backoff)
                        continue
                    logger.warning(f"Failed {file_path}: {e}")
                    stats["failed"] += 1
                    return

    await asyncio.gather(*[_process_file(i + 1, fp) for i, fp in enumerate(file_paths)])

    notes = []
    if stats["failed"]:
        notes.append(f"{stats['failed']} failed")
    if stats["degraded"]:
        notes.append(f"{stats['degraded']} degraded (vector-only, extraction error)")
    suffix = f", {', '.join(notes)}" if notes else ""
    _progress(
        f"Done: {stats['ingested']} ingested, {stats['skipped']} skipped"
        f"{suffix}. "
        f"{stats['entities']} entities, {stats['rels']} relationships."
    )
    return stats
469
+
470
async def ingest_records(
    self,
    records,
    namespace: str | None = None,
    on_progress=None,
):
    """Ingest documents from in-memory records — no disk roundtrip.

    Use this when your source data lives in another database, an API,
    a queue, or anywhere that's not the filesystem. The classic
    pattern for same-database CRM/ERP pipelines:

        with psycopg.connect(crm_dsn) as conn:
            rows = conn.execute("SELECT note_id, note_text, ... FROM ...").fetchall()
        records = [
            {
                "text": format_doc(row),
                "source_id": f"sales_note:{row['note_id']}",
                "metadata": {"order_id": row["order_id"], "status": row["status"]},
            }
            for row in rows
        ]
        await rag.ingest_records(records, namespace="sales_calls")

    Args:
        records: Iterable of dicts (materialized into a list up front).
            Each dict must have:
            - ``text`` (str, required): document content
            - ``source_id`` (str, required): stable logical identifier
              used for content-hash dedup AND stale-doc cleanup. Use a
              scheme like ``"sales_note:42"`` or ``"jira:PROJ-1234"``.
              Re-ingesting the same source_id with new text replaces
              the prior version atomically.
            - ``metadata`` (dict, optional): per-record metadata.
              Persisted as JSONB on ``documents.metadata`` (queryable
              via ``metadata->>'foo'``). Evolution-tracking keys
              (``effective_from``, ``effective_to``, ``retracted``,
              ``version_label``, ``supersedes_document_id``) are ALSO
              written to dedicated columns. Other keys are stored only
              in the JSONB.
            - ``entities`` (list of dict, optional): caller-known
              entities to seed the graph. Each: ``{"name": "...",
              "entity_type": "...", "description": "...", "properties":
              {...}}``. ``name`` is required; the rest are optional.
              Entity resolution merges these with LLM-extracted
              entities of the same name. Linked to every chunk.
            - ``relationships`` (list of dict, optional): caller-known
              graph edges. Each: ``{"src": "EntityName1",
              "dst": "EntityName2", "rel_type": "...", "weight": 1.0,
              "description": "..."}``. ``src`` and ``dst`` are
              required and must match either a caller-supplied or
              LLM-extracted entity name.
            - ``skip_llm`` (bool, optional, default False): skip LLM
              extraction for this document. Useful when the caller's
              ``entities`` / ``relationships`` already cover what
              they care about and the LLM would just add noise / cost.
            - ``pre_chunked`` (list of dict, optional): bypass
              pg-raggraph's chunker AND embedder. Each entry:
              ``{"content": str, "embedded_content": str (optional),
              "embedding": list[float] (must match config.embedding_dim),
              "metadata": dict (optional), "token_count": int (optional)}``.
              Use when an upstream tool (e.g. chunkshop's full pipeline)
              already chunked + embedded the document. The ``text``
              field still drives LLM entity/relationship extraction;
              set it to a sensible reconstruction of the document.
              See docs/cookbook/chunkshop-integration.md Pattern C.
        namespace: Namespace for data isolation. Defaults to
            ``config.namespace``.
        on_progress: Optional callback(message: str) for progress.

    Returns:
        Dict of counters for this call with the keys ``ingested``,
        ``skipped``, ``failed``, ``degraded``, ``entities`` and ``rels``
        (same shape as ``ingest()``). All zero for an empty input.

    Raises:
        ValueError: If the namespace is invalid, or a record lacks a
            non-empty ``text`` or ``source_id``.
        TypeError: If a record is not a dict.

    Example (CRM with known FK relationships):

        records = [{
            "text": format_doc(row),
            "source_id": f"sales_note:{row['note_id']}",
            "metadata": {"order_id": row["order_id"], "status": row["status"]},
            "entities": [
                {"name": row["company_name"], "entity_type": "Customer"},
                {"name": row["product_name"], "entity_type": "Product"},
                {"name": row["salesperson_name"], "entity_type": "Salesperson"},
            ],
            "relationships": [
                {"src": row["company_name"], "dst": row["product_name"],
                 "rel_type": "BOUGHT"},
                {"src": row["salesperson_name"], "dst": row["company_name"],
                 "rel_type": "SOLD_TO"},
            ],
        } for row in crm_rows]
        await rag.ingest_records(records, namespace="sales_calls")
    """
    import asyncio

    from pg_raggraph.chunking import chunk_document, content_hash
    from pg_raggraph.extraction import extract_from_chunks, get_llm_provider

    # Materialize once: the input may be a generator, and it is traversed
    # for validation, for len() in progress messages, and for processing.
    records = list(records)
    ns = namespace or self.config.namespace
    _validate_namespace(ns)
    self.config.apply_nice_level()
    embedder = self._get_embedder()

    def _progress(msg: str):
        logger.info(msg)
        if on_progress:
            on_progress(msg)

    # Validate input shape (per-record, fail fast on the first bad row).
    for i, rec in enumerate(records):
        if not isinstance(rec, dict):
            raise TypeError(f"records[{i}] must be a dict, got {type(rec).__name__}")
        if not rec.get("text"):
            raise ValueError(f"records[{i}] missing required 'text' field")
        if not rec.get("source_id"):
            raise ValueError(f"records[{i}] missing required 'source_id' field")

    # Created up front so every exit path returns the same shape.
    stats = {
        "ingested": 0,
        "skipped": 0,
        "failed": 0,
        "degraded": 0,
        "entities": 0,
        "rels": 0,
    }

    if not records:
        _progress("No records to process.")
        return stats

    _progress(f"Processing {len(records)} records (in-memory ingest).")

    # Created lazily here rather than in __init__ so it is built while an
    # event loop is running; request_shutdown() sets it to drain the queue.
    if self._shutdown_event is None:
        self._shutdown_event = asyncio.Event()

    doc_sem = asyncio.Semaphore(self.config.doc_concurrency)
    # LLM is optional; reuse the provider cached on the instance if present.
    llm = None
    if not self.config.skip_extraction and self.config.llm_base_url:
        if self._llm is None:
            try:
                self._llm = get_llm_provider(self.config)
            except Exception as e:
                logger.warning(f"LLM provider unavailable, skipping extraction: {e}")
        llm = self._llm
    if llm is None:
        _progress("Extraction disabled — ingesting as pure vector RAG.")

    async def _process_record(idx: int, rec: dict):
        async with doc_sem:
            # Records still queued when shutdown is requested become no-ops.
            if self._shutdown_event is not None and self._shutdown_event.is_set():
                stats["skipped"] += 1
                return
            attempt = 0
            while True:
                attempt += 1
                try:
                    rec_meta = rec.get("metadata")
                    rec_entities = rec.get("entities")
                    rec_rels = rec.get("relationships")
                    rec_skip_llm = bool(rec.get("skip_llm", False))
                    rec_pre_chunked = rec.get("pre_chunked")
                    r = await self._ingest_one_content(
                        rec["text"],
                        source_id=rec["source_id"],
                        ns=ns,
                        embedder=embedder,
                        llm=llm,
                        content_hash_fn=content_hash,
                        chunk_document_fn=chunk_document,
                        extract_from_chunks_fn=extract_from_chunks,
                        metadata=rec_meta,
                        known_entities=rec_entities,
                        known_relationships=rec_rels,
                        skip_llm_for_this_doc=rec_skip_llm,
                        pre_chunked=rec_pre_chunked,
                    )
                    if r:
                        stats["ingested"] += 1
                        stats["entities"] += r["entities"]
                        stats["rels"] += r["rels"]
                        if r.get("degraded"):
                            stats["degraded"] += 1
                        deg_note = (
                            " (extraction failed, vector-only)" if r.get("degraded") else ""
                        )
                        _progress(
                            f"[{idx}/{len(records)}] {rec['source_id']}: "
                            f"{r['entities']} entities, {r['rels']} rels{deg_note}"
                        )
                    else:
                        stats["skipped"] += 1
                    return
                except Exception as e:
                    # Postgres deadlock = SQLSTATE 40P01, serialization = 40001.
                    # Structured sqlstate first; message matching only as a
                    # fallback for exceptions that carry no sqlstate.
                    sqlstate = getattr(e, "sqlstate", None)
                    msg = str(e)
                    transient = sqlstate in ("40P01", "40001") or (
                        sqlstate is None
                        and (
                            "40P01" in msg
                            or "40001" in msg
                            or "deadlock detected" in msg
                            or "could not serialize" in msg
                        )
                    )
                    if transient and attempt < 3:
                        backoff = 0.2 * (2 ** (attempt - 1))
                        # Same retry trace as the file-based ingest path.
                        logger.info(
                            f"Retry {attempt}/3 after {backoff:.1f}s for "
                            f"{rec['source_id']}: {msg[:80]}"
                        )
                        await asyncio.sleep(backoff)
                        continue
                    logger.warning(f"Failed {rec['source_id']}: {e}")
                    stats["failed"] += 1
                    return

    await asyncio.gather(*[_process_record(i + 1, rec) for i, rec in enumerate(records)])

    notes_msg = []
    if stats["failed"]:
        notes_msg.append(f"{stats['failed']} failed")
    if stats["degraded"]:
        notes_msg.append(f"{stats['degraded']} degraded")
    suffix = f", {', '.join(notes_msg)}" if notes_msg else ""
    _progress(
        f"Done: {stats['ingested']} ingested, {stats['skipped']} skipped"
        f"{suffix}. {stats['entities']} entities, {stats['rels']} relationships."
    )
    return stats
692
+
693
async def _ingest_one_file(
    self,
    file_path,
    ns,
    embedder,
    llm,
    content_hash_fn,
    chunk_document_fn,
    extract_from_chunks_fn,
    *,
    metadata: dict | None = None,
):
    """Load one file as UTF-8 text and pass it to `_ingest_one_content`.

    The file path doubles as the document's ``source_id``. Callers whose
    data is already in memory should use `ingest_records`, which goes to
    `_ingest_one_content` directly without touching disk.

    Returns:
        Whatever `_ingest_one_content` returns, or ``None`` when the file
        cannot be decoded as UTF-8 (a warning is logged in that case).
    """
    try:
        with open(file_path, encoding="utf-8") as handle:
            text = handle.read()
    except ValueError:
        # UnicodeDecodeError is a ValueError subclass, so this one clause
        # covers both decode failures and other ValueErrors from open/read.
        logger.warning(f"Skipping non-UTF-8 file: {file_path}")
        return None

    outcome = await self._ingest_one_content(
        text,
        source_id=file_path,
        ns=ns,
        embedder=embedder,
        llm=llm,
        content_hash_fn=content_hash_fn,
        chunk_document_fn=chunk_document_fn,
        extract_from_chunks_fn=extract_from_chunks_fn,
        metadata=metadata,
    )
    return outcome
729
+
730
+ async def _ingest_one_content(
731
+ self,
732
+ content: str,
733
+ *,
734
+ source_id: str,
735
+ ns,
736
+ embedder,
737
+ llm,
738
+ content_hash_fn,
739
+ chunk_document_fn,
740
+ extract_from_chunks_fn,
741
+ metadata: dict | None = None,
742
+ known_entities: list[dict] | None = None,
743
+ known_relationships: list[dict] | None = None,
744
+ skip_llm_for_this_doc: bool = False,
745
+ pre_chunked: list[dict] | None = None,
746
+ ):
747
+ """Ingest a single document from in-memory content with all DB
748
+ writes in a single transaction.
749
+
750
+ Using db.transaction() ensures all chunks/entities/relationships for
751
+ one doc commit atomically, and chunk_id from INSERT is immediately
752
+ visible to entity_chunks INSERT on the same connection (no pool
753
+ commit propagation race).
754
+
755
+ ``source_id`` serves the same role as ``source_path`` in file-based
756
+ ingest: it's the logical identifier for dedup (combined with
757
+ content_hash) and stale-doc cleanup. Use a stable string —
758
+ e.g. ``"sales_note:42"`` — so re-ingests of the same record
759
+ replace the prior version atomically.
760
+
761
+ ``known_entities`` and ``known_relationships`` let callers seed
762
+ the graph with structured edges they already have (e.g. FK-derived
763
+ relationships from a CRM). Each known entity is linked to every
764
+ chunk of the document; each known relationship is linked to the
765
+ first chunk. They merge with LLM-extracted entities/relationships
766
+ via the entity-resolution path — same name across both sources
767
+ resolves to the same row.
768
+
769
+ ``metadata`` is now persisted to ``documents.metadata`` JSONB as a
770
+ whole (in addition to the evolution-tracking columns). Query via
771
+ ``metadata->>'foo'`` after ingest.
772
+
773
+ ``skip_llm_for_this_doc`` skips LLM extraction for this document
774
+ only — useful when the caller's known_entities/known_relationships
775
+ already cover everything they care about and the LLM would just
776
+ add noise (or cost).
777
+
778
+ ``pre_chunked`` lets callers bypass pg-raggraph's chunker AND
779
+ embedder. Use when the chunks + embeddings already exist
780
+ upstream (e.g. chunkshop end-to-end pipeline → pg-raggraph
781
+ graph layer; see docs/cookbook/chunkshop-integration.md
782
+ Pattern C). Each list entry is a dict::
783
+
784
+ {
785
+ "content": "<original chunk text>", # required
786
+ "embedded_content": "<text given to the embedder>", # optional
787
+ "embedding": [float, ...], # required (dim)
788
+ "metadata": {...}, # optional (merged)
789
+ "token_count": int, # optional
790
+ }
791
+
792
+ When ``pre_chunked`` is set, ``content`` is still used as the
793
+ full-document text input for LLM entity/relationship extraction
794
+ — set it to a sensible reconstruction (e.g. join all chunks
795
+ with newlines) so the LLM sees the document.
796
+ """
797
+ # Use the source_id as the chunker's path hint so .md/.py-style
798
+ # extension detection still works for callers that pass
799
+ # filename-shaped IDs. For non-filename IDs the chunker falls back
800
+ # to content-based detection (e.g. markdown headings).
801
+ file_path = source_id
802
+
803
+ # Delta check (read-only)
804
+ c_hash = content_hash_fn(content)
805
+ existing = await self.db.fetch_one(
806
+ "SELECT id FROM documents WHERE namespace = %s AND content_hash = %s",
807
+ (ns, c_hash),
808
+ )
809
+ if existing:
810
+ logger.debug(f"Skipped (unchanged): {file_path}")
811
+ return None
812
+
813
+ # Chunk (no DB) — caller can pre-chunk to bypass pg-raggraph's
814
+ # chunker AND embedder (e.g. chunkshop Pattern C, where the upstream
815
+ # pipeline already chunked + embedded + extracted metadata).
816
+ from pg_raggraph.chunking import token_count as _token_count
817
+
818
+ if pre_chunked is not None:
819
+ chunks = []
820
+ chunk_embeddings = []
821
+ for i, pc in enumerate(pre_chunked):
822
+ if "content" not in pc or "embedding" not in pc:
823
+ raise ValueError(f"pre_chunked[{i}] must include 'content' and 'embedding'")
824
+ emb = pc["embedding"]
825
+ if len(emb) != self.config.embedding_dim:
826
+ raise ValueError(
827
+ f"pre_chunked[{i}].embedding has dim {len(emb)} but "
828
+ f"config.embedding_dim={self.config.embedding_dim}. "
829
+ "Configure GraphRAG with embedding_dim matching the "
830
+ "upstream embedder, or re-embed at the upstream layer."
831
+ )
832
+ body = pc["content"]
833
+ emb_content = pc.get("embedded_content") or body
834
+ meta = dict(pc.get("metadata") or {})
835
+ meta.setdefault("source_path", file_path)
836
+ meta.setdefault("chunk_index", i)
837
+ chunks.append(
838
+ {
839
+ "content": body,
840
+ "embedded_content": emb_content,
841
+ "token_count": pc.get("token_count") or _token_count(emb_content),
842
+ "content_hash": pc.get("content_hash") or content_hash_fn(body),
843
+ "metadata": meta,
844
+ }
845
+ )
846
+ chunk_embeddings.append(emb)
847
+ if not chunks:
848
+ return {"entities": 0, "rels": 0}
849
+ else:
850
+ chunks = chunk_document_fn(content, source_path=file_path, config=self.config)
851
+ if not chunks:
852
+ return {"entities": 0, "rels": 0}
853
+
854
+ # Batch embed all chunks. Use embedded_content so the embedder sees
855
+ # heading prefix (hierarchy strategy) or any future neighbor/summary
856
+ # decoration; for auto strategy this equals content.
857
+ texts = [c["embedded_content"] for c in chunks]
858
+ chunk_embeddings = await embedder.embed(texts)
859
+
860
+ # Extract entities/relationships via LLM (cache reads OK outside txn).
861
+ # If llm is None or skip_llm_for_this_doc is set, skip extraction
862
+ # entirely — pure vector RAG mode (with whatever known_entities /
863
+ # known_relationships the caller provides as the only graph signal).
864
+ extraction_degraded = False
865
+ if llm is None or skip_llm_for_this_doc:
866
+ from pg_raggraph.models import ExtractionResult
867
+
868
+ extraction_results = [ExtractionResult() for _ in chunks]
869
+ else:
870
+ try:
871
+ extraction_results = await extract_from_chunks_fn(
872
+ chunks, llm, self.db, self.config
873
+ )
874
+ except Exception as e:
875
+ logger.warning(f"Extraction failed for {file_path}, ingesting as pure vector: {e}")
876
+ from pg_raggraph.models import ExtractionResult
877
+
878
+ extraction_results = [ExtractionResult() for _ in chunks]
879
+ extraction_degraded = True
880
+
881
+ # Dedupe entities by name, build per-chunk entity/rel lists
882
+ unique_entities = {}
883
+ chunk_to_entities = []
884
+ chunk_to_rels = []
885
+
886
+ for i, extraction in enumerate(extraction_results):
887
+ entity_names = []
888
+ for ent in extraction.entities:
889
+ if ent.name not in unique_entities:
890
+ unique_entities[ent.name] = {
891
+ "entity_type": ent.entity_type,
892
+ "description": ent.description,
893
+ "chunks": [i],
894
+ }
895
+ else:
896
+ unique_entities[ent.name]["chunks"].append(i)
897
+ existing_desc = unique_entities[ent.name]["description"]
898
+ if ent.description and ent.description not in existing_desc:
899
+ unique_entities[ent.name]["description"] += " " + ent.description
900
+ entity_names.append(ent.name)
901
+ chunk_to_entities.append(entity_names)
902
+ chunk_to_rels.append(
903
+ [
904
+ (r.source, r.target, r.rel_type, r.description, r.weight)
905
+ for r in extraction.relationships
906
+ ]
907
+ )
908
+
909
+ # Merge caller-supplied known entities and relationships.
910
+ # Known entities are document-level: linked to every chunk.
911
+ # Known relationships attach to chunk[0] (only one anchor point
912
+ # is needed; graph traversal queries entities, not chunks).
913
+ if known_entities:
914
+ all_chunk_idxs = list(range(len(chunks)))
915
+ for ke in known_entities:
916
+ if not ke.get("name"):
917
+ raise ValueError("known_entities entries must include a non-empty 'name'")
918
+ name = ke["name"]
919
+ ke_desc = ke.get("description", "") or ""
920
+ ke_type = ke.get("entity_type", "ENTITY")
921
+ if name not in unique_entities:
922
+ unique_entities[name] = {
923
+ "entity_type": ke_type,
924
+ "description": ke_desc,
925
+ "chunks": list(all_chunk_idxs),
926
+ }
927
+ else:
928
+ # LLM also found this entity. Caller's domain knowledge
929
+ # WINS on entity_type and (if non-empty) description —
930
+ # the user explicitly tagged this as a Customer/Product/
931
+ # whatever, so don't let the LLM's generic "company"
932
+ # classification overwrite the caller's intent.
933
+ if ke_type and ke_type != "ENTITY":
934
+ unique_entities[name]["entity_type"] = ke_type
935
+ if ke_desc:
936
+ unique_entities[name]["description"] = ke_desc
937
+ existing = set(unique_entities[name]["chunks"])
938
+ existing.update(all_chunk_idxs)
939
+ unique_entities[name]["chunks"] = sorted(existing)
940
+ # Reflect in chunk_to_entities so entity_chunks links are written.
941
+ for ci in all_chunk_idxs:
942
+ if name not in chunk_to_entities[ci]:
943
+ chunk_to_entities[ci].append(name)
944
+
945
+ if known_relationships:
946
+ for kr in known_relationships:
947
+ if not (kr.get("src") and kr.get("dst")):
948
+ raise ValueError("known_relationships entries must include 'src' and 'dst'")
949
+ rel_tuple = (
950
+ kr["src"],
951
+ kr["dst"],
952
+ kr.get("rel_type", "RELATED_TO"),
953
+ kr.get("description", "") or "",
954
+ float(kr.get("weight", 1.0)),
955
+ )
956
+ # Anchor on chunk[0] — relationships are document-level.
957
+ chunk_to_rels[0].append(rel_tuple)
958
+
959
+ # Batch embed entities (no DB)
960
+ if unique_entities:
961
+ entity_names_list = list(unique_entities.keys())
962
+ entity_texts = [
963
+ f"{name} {unique_entities[name]['description']}" for name in entity_names_list
964
+ ]
965
+ entity_embeddings = await embedder.embed(entity_texts)
966
+ else:
967
+ entity_names_list = []
968
+ entity_embeddings = []
969
+
970
+ # All DB writes in a single transaction
971
+ from pg_raggraph.resolution import resolve_entity
972
+
973
+ async with self.db.transaction() as tx:
974
+ # Incremental update: if source_path exists with a DIFFERENT hash,
975
+ # the file has changed. Delete the stale document inside the same
976
+ # transaction as the new insert so any failure mid-ingest rolls
977
+ # back both the delete and the insert — the old version stays
978
+ # visible until the new one commits. FK cascades take care of
979
+ # chunks and the entity/relationship provenance joins. Call
980
+ # prune_orphans() afterwards to clean up unreferenced entities.
981
+ stale = await tx.fetch_one(
982
+ "SELECT id FROM documents "
983
+ "WHERE namespace = %s AND source_path = %s AND content_hash != %s",
984
+ (ns, file_path, c_hash),
985
+ )
986
+ if stale:
987
+ await tx.execute("DELETE FROM documents WHERE id = %s", (stale["id"],))
988
+ logger.info(f"Replaced stale version of {file_path}")
989
+
990
+ # Insert document with any caller-supplied evolution metadata.
991
+ # ON CONFLICT uses COALESCE so a re-ingest without metadata doesn't
992
+ # clobber previously-stored evolution fields. For `retracted` we
993
+ # distinguish "absent from meta" (preserve prior value) from
994
+ # "explicitly True/False" (apply the caller's value, including
995
+ # un-retracting). COALESCE can't express this for booleans, so we
996
+ # pass a separate `retracted_explicit` flag and gate the SET on it
997
+ # via CASE WHEN.
998
+ meta = metadata or {}
999
+ eff_from = meta.get("effective_from")
1000
+ eff_to = meta.get("effective_to")
1001
+ retracted_explicit = "retracted" in meta and meta["retracted"] is not None
1002
+ # Value for fresh INSERT: the caller's value if explicit, else
1003
+ # False (matches the column DEFAULT). On UPDATE the CASE WHEN
1004
+ # below decides whether to apply it at all.
1005
+ retracted_value = bool(meta["retracted"]) if retracted_explicit else False
1006
+ version_label = meta.get("version_label")
1007
+ supersedes_doc = meta.get("supersedes_document_id")
1008
+
1009
+ # Persist arbitrary caller metadata to documents.metadata JSONB.
1010
+ # The dedicated evolution columns (effective_from etc.) ALSO get
1011
+ # the same fields, so callers can query either way.
1012
+ # Re-ingest merges (caller intent: add new keys, update changed
1013
+ # keys, leave untouched keys alone) — implemented via JSONB
1014
+ # concat in the ON CONFLICT branch.
1015
+ # Use _json_default so datetime values in metadata (e.g.
1016
+ # effective_from / effective_to from evolution-tracking ingests)
1017
+ # serialize to ISO strings instead of crashing the ingest.
1018
+ doc_metadata_json = json.dumps(meta, default=_json_default) if meta else "{}"
1019
+
1020
+ doc_id = await tx.insert_returning_id(
1021
+ "INSERT INTO documents "
1022
+ "(namespace, content_hash, source_path, metadata, "
1023
+ " effective_from, effective_to, retracted, version_label) "
1024
+ "VALUES (%s, %s, %s, %s::jsonb, %s, %s, %s, %s) "
1025
+ "ON CONFLICT (namespace, content_hash) DO UPDATE "
1026
+ "SET source_path = EXCLUDED.source_path, "
1027
+ " metadata = documents.metadata || EXCLUDED.metadata, "
1028
+ " effective_from = COALESCE("
1029
+ "EXCLUDED.effective_from, documents.effective_from), "
1030
+ " effective_to = COALESCE("
1031
+ "EXCLUDED.effective_to, documents.effective_to), "
1032
+ " retracted = CASE WHEN %s "
1033
+ "THEN EXCLUDED.retracted ELSE documents.retracted END, "
1034
+ " version_label = COALESCE("
1035
+ "EXCLUDED.version_label, documents.version_label) "
1036
+ "RETURNING id",
1037
+ (
1038
+ ns,
1039
+ c_hash,
1040
+ file_path,
1041
+ doc_metadata_json,
1042
+ eff_from,
1043
+ eff_to,
1044
+ retracted_value,
1045
+ version_label,
1046
+ retracted_explicit,
1047
+ ),
1048
+ )
1049
+
1050
+ # If caller supplied version info or a supersession edge, create a
1051
+ # document_versions row for authoritative multi-version tracking.
1052
+ if version_label or supersedes_doc or meta.get("retraction_reason"):
1053
+ await tx.execute(
1054
+ "INSERT INTO document_versions "
1055
+ "(namespace, document_id, version_label, effective_from, effective_to, "
1056
+ " supersedes_document_id, retracted, retracted_at, retraction_reason) "
1057
+ "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)",
1058
+ (
1059
+ ns,
1060
+ doc_id,
1061
+ version_label,
1062
+ eff_from,
1063
+ eff_to,
1064
+ supersedes_doc,
1065
+ retracted_value,
1066
+ meta.get("retracted_at"),
1067
+ meta.get("retraction_reason"),
1068
+ ),
1069
+ )
1070
+
1071
+ # Insert all chunks
1072
+ chunk_ids = []
1073
+ for i, chunk in enumerate(chunks):
1074
+ chunk_id = await tx.insert_returning_id(
1075
+ "INSERT INTO chunks "
1076
+ "(document_id, content, embedded_content, embedding, token_count, metadata) "
1077
+ "VALUES (%s, %s, %s, %s, %s, %s) RETURNING id",
1078
+ (
1079
+ doc_id,
1080
+ chunk["content"],
1081
+ chunk["embedded_content"],
1082
+ chunk_embeddings[i],
1083
+ chunk["token_count"],
1084
+ json.dumps(chunk["metadata"], default=_json_default),
1085
+ ),
1086
+ )
1087
+ chunk_ids.append(chunk_id)
1088
+
1089
+ if not unique_entities:
1090
+ return {"entities": 0, "rels": 0}
1091
+
1092
+ # Resolve and insert entities (tx duck-types the db interface)
1093
+ entity_name_to_id = {}
1094
+ for name, emb in zip(entity_names_list, entity_embeddings):
1095
+ info = unique_entities[name]
1096
+ eid = await resolve_entity(
1097
+ name=name,
1098
+ entity_type=info["entity_type"],
1099
+ description=info["description"],
1100
+ embedding=emb,
1101
+ namespace=ns,
1102
+ db=tx,
1103
+ config=self.config,
1104
+ )
1105
+ entity_name_to_id[name] = eid
1106
+
1107
+ # Insert entity_chunks links
1108
+ for i, chunk_id in enumerate(chunk_ids):
1109
+ if i >= len(chunk_to_entities):
1110
+ break
1111
+ seen = set()
1112
+ for ent_name in chunk_to_entities[i]:
1113
+ if ent_name in seen or ent_name not in entity_name_to_id:
1114
+ continue
1115
+ seen.add(ent_name)
1116
+ await tx.execute(
1117
+ "INSERT INTO entity_chunks (entity_id, chunk_id, confidence) "
1118
+ "VALUES (%s, %s, %s) ON CONFLICT DO NOTHING",
1119
+ (entity_name_to_id[ent_name], chunk_id, 1.0),
1120
+ )
1121
+
1122
+ # Insert relationships and their chunk links
1123
+ rel_count = 0
1124
+ for i, chunk_id in enumerate(chunk_ids):
1125
+ if i >= len(chunk_to_rels):
1126
+ break
1127
+ for rel in chunk_to_rels[i]:
1128
+ src_id = entity_name_to_id.get(rel[0])
1129
+ dst_id = entity_name_to_id.get(rel[1])
1130
+ if not (src_id and dst_id):
1131
+ continue
1132
+ rel_id = await tx.insert_returning_id(
1133
+ "INSERT INTO relationships "
1134
+ "(namespace, src_id, dst_id, rel_type, weight, description) "
1135
+ "VALUES (%s, %s, %s, %s, %s, %s) RETURNING id",
1136
+ (ns, src_id, dst_id, rel[2], rel[4], rel[3]),
1137
+ )
1138
+ await tx.execute(
1139
+ "INSERT INTO relationship_chunks "
1140
+ "(relationship_id, chunk_id, confidence) "
1141
+ "VALUES (%s, %s, %s) ON CONFLICT DO NOTHING",
1142
+ (rel_id, chunk_id, 1.0),
1143
+ )
1144
+ rel_count += 1
1145
+
1146
+ return {
1147
+ "entities": len(unique_entities),
1148
+ "rels": rel_count,
1149
+ "degraded": extraction_degraded,
1150
+ }
1151
+
1152
async def query(
    self,
    question: str,
    mode: str = "smart",
    namespace: str | None = None,
    *,
    as_of: datetime | None = None,
    version_filter: str | None = None,
    evolution_aware: bool | None = None,
    rerank: bool = False,
) -> QueryResult:
    """Retrieve context for ``question`` from the knowledge graph.

    Args:
        question: Natural-language query text.
        mode: Retrieval strategy, forwarded unchanged to the retrieval layer:
            smart (default) - confidence-triggered routing (naive → boost → expand)
            naive           - vector + BM25 only (fastest)
            naive_boost     - naive + 1-hop graph boost re-ranking
            local           - vector seed → graph expansion via entity neighbors
            global          - relationship-centric retrieval
            hybrid          - local + global combined
        namespace: Namespace to search. A falsy value falls back to
            ``self.config.namespace``. The chosen namespace is passed
            through ``_validate_namespace`` before any retrieval happens.
        as_of: Time-travel filter — restrict to documents whose effective
            window contains this timestamp.
        version_filter: Restrict to documents with a matching version_label.
        evolution_aware: When False, ignore evolution_tier for this query
            (forces classic retrieval). When None, honors config.
        rerank: When True, retrieval is asked for
            ``config.top_k * config.rerank_factor`` candidates, which are
            then re-ranked with a cross-encoder (``config.rerank_model``)
            and trimmed back to ``config.top_k``. Adds ~30-80 ms p50
            latency, zero per-query LLM cost.

    Returns:
        The QueryResult produced by retrieval, re-ranked when requested.
    """
    from pg_raggraph.retrieval import query as run_retrieval

    target_ns = namespace or self.config.namespace
    _validate_namespace(target_ns)
    embedder = self._get_embedder()

    # Over-fetch only when a re-rank pass will trim the list afterwards.
    if rerank:
        candidate_k = self.config.top_k * self.config.rerank_factor
    else:
        candidate_k = None

    retrieved = await run_retrieval(
        question=question,
        db=self.db,
        embedder=embedder,
        config=self.config,
        mode=mode,
        namespace=target_ns,
        as_of=as_of,
        version_filter=version_filter,
        evolution_aware=evolution_aware,
        top_k_override=candidate_k,
    )
    if not rerank:
        return retrieved

    from pg_raggraph.reranker import FastEmbedReranker, apply_reranker

    # The cross-encoder is built on first use and cached on the instance.
    if self._reranker is None:
        self._reranker = FastEmbedReranker(self.config.rerank_model)
    return await apply_reranker(self._reranker, question, retrieved, self.config.top_k)
1210
+
1211
async def ask(
    self,
    question: str,
    mode: str = "smart",
    namespace: str | None = None,
    *,
    as_of: datetime | None = None,
    version_filter: str | None = None,
    evolution_aware: bool | None = None,
    short_answer: bool = False,
    rerank: bool = False,
) -> QueryResult:
    """Retrieve context for ``question`` and synthesize an answer.

    Retrieval is delegated to ``self.query`` with the same mode, namespace,
    evolution filters and ``rerank`` flag. The result is then handed to
    ``generate_answer`` together with the shared LLM client, or with
    ``None`` when ``config.llm_base_url`` is unset or the provider could
    not be created — the documented fallback in that case is a top-chunk
    summary, so the library stays useful as pure vector RAG.

    Args:
        short_answer: Ask the LLM for a short factoid answer (≤10 tokens,
            single phrase) instead of a paragraph. Useful for SQuAD-style
            benchmarks where gold answers are short strings.
        rerank: Re-rank retrieved chunks with a cross-encoder before the
            answer is generated. Adds ~30-80 ms p50 latency, zero
            per-query LLM cost.

    Returns:
        The QueryResult from retrieval with ``answer`` populated.
    """
    from pg_raggraph.answer import generate_answer

    retrieved = await self.query(
        question,
        mode=mode,
        namespace=namespace,
        as_of=as_of,
        version_filter=version_filter,
        evolution_aware=evolution_aware,
        rerank=rerank,
    )

    # Reuse the client cached on the instance (same pool as ingestion);
    # build it lazily the first time an LLM endpoint is configured.
    llm_configured = bool(self.config.llm_base_url)
    if llm_configured and self._llm is None:
        try:
            from pg_raggraph.extraction import get_llm_provider

            self._llm = get_llm_provider(self.config)
        except Exception as e:
            # Best-effort: a missing provider degrades to the no-LLM answer path.
            logger.warning(f"LLM provider unavailable: {e}")
    answer_llm = self._llm if llm_configured else None

    retrieved.answer = await generate_answer(
        question, retrieved, answer_llm, self.config, short_answer=short_answer
    )
    return retrieved
1263
+
1264
async def status(self, namespace: str | None = None) -> dict:
    """Return graph statistics for one namespace.

    Args:
        namespace: Namespace to report on; a falsy value falls back to
            ``self.config.namespace``.

    Returns:
        A dict with ``schema_version`` and ``embedding_dim`` (read from the
        meta store, 0 when the stored value is missing or falsy), the
        resolved ``namespace``, and row counts for ``documents``,
        ``chunks``, ``entities`` and ``relationships``.
    """
    ns = namespace or self.config.namespace
    db = self.db

    schema_version = int(await db.get_meta("schema_version") or 0)
    embedding_dim = int(await db.get_meta("embedding_dim") or 0)
    document_count = await db.count("documents", ns)

    # Chunks table has no namespace column — scope via documents join.
    chunk_row = await db.fetch_one(
        "SELECT count(*) AS cnt FROM chunks c "
        "JOIN documents d ON d.id = c.document_id "
        "WHERE d.namespace = %s",
        (ns,),
    )
    chunk_count = chunk_row["cnt"]

    entity_count = await db.count("entities", ns)
    relationship_count = await db.count("relationships", ns)

    return {
        "schema_version": schema_version,
        "embedding_dim": embedding_dim,
        "namespace": ns,
        "documents": document_count,
        "chunks": chunk_count,
        "entities": entity_count,
        "relationships": relationship_count,
    }
1284
+
1285
async def delete(self, namespace: str):
    """Delete all data in a namespace.

    Removes every document, entity and relationship row whose
    ``namespace`` column equals ``namespace``. The three DELETEs run
    inside a single transaction so a failure partway through rolls the
    whole operation back instead of leaving the namespace half-deleted.

    Args:
        namespace: Namespace to wipe. Checked with ``_validate_namespace``
            before any statement is issued.
    """
    _validate_namespace(namespace)
    async with self.db.transaction() as tx:
        # Same statement order as before: documents, then entities, then
        # relationships.
        # NOTE(review): chunks and the provenance join tables are presumably
        # removed by FK cascade from these rows — confirm against the schema.
        await tx.execute("DELETE FROM documents WHERE namespace = %s", (namespace,))
        await tx.execute("DELETE FROM entities WHERE namespace = %s", (namespace,))
        await tx.execute("DELETE FROM relationships WHERE namespace = %s", (namespace,))
1291
+
1292
async def delete_document(self, source_path: str, namespace: str | None = None) -> int:
    """Delete documents (and, via the schema, their chunks) by source path.

    Entities and relationships are left in place — they may be referenced
    by other documents. Use `prune_orphans()` to clean up any entities
    that become unreferenced.

    Args:
        source_path: Value matched against ``documents.source_path``.
        namespace: Namespace to delete from; a falsy value falls back to
            ``self.config.namespace``. Checked with ``_validate_namespace``.

    Returns:
        Number of documents deleted. Every row returned by the DELETE is
        counted, so the value is accurate even if more than one document
        shares the same source path (previously it was capped at 1).
    """
    ns = namespace or self.config.namespace
    _validate_namespace(ns)
    async with self.db.transaction() as tx:
        deleted_rows = await tx.fetch_all(
            "DELETE FROM documents WHERE namespace = %s AND source_path = %s RETURNING id",
            (ns, source_path),
        )
    return len(deleted_rows) if deleted_rows else 0
1308
+
1309
async def delete_entity(self, entity_id: int) -> bool:
    """Delete the entity with the given id.

    Only the ``entities`` row is deleted here.
    NOTE(review): its relationships are presumably removed by FK cascade —
    confirm against the schema.

    Returns:
        True when a row was deleted, False when no entity had that id.
    """
    deleted_row = await self.db.fetch_one(
        "DELETE FROM entities WHERE id = %s RETURNING id", (entity_id,)
    )
    if deleted_row is None:
        return False
    return True
1315
+
1316
async def merge_entities(self, keep_id: int, merge_ids: list[int]) -> dict:
    """Merge one or more entities into a canonical one.

    Rewrites relationships and entity_chunks to point at `keep_id`,
    deduplicates any resulting duplicate edges, drops self-loops that
    the merge creates, then deletes the merged entities. Everything runs
    inside one transaction.

    Args:
        keep_id: Id of the entity that survives the merge.
        merge_ids: Ids of the entities folded into ``keep_id``. Repeated
            ids are collapsed (first occurrence wins) so each entity is
            processed and counted once.

    Returns:
        ``{"kept": keep_id, "merged_count": <number of distinct merged ids>}``.

    Raises:
        ValueError: if ``merge_ids`` is empty, if ``keep_id`` appears in
            ``merge_ids`` (would delete the canonical entity), if any id
            does not exist, or if the entities span more than one namespace.
    """
    if not merge_ids:
        raise ValueError("merge_ids must not be empty")
    if keep_id in merge_ids:
        raise ValueError(
            f"keep_id {keep_id} must not appear in merge_ids — "
            "that would delete the canonical entity"
        )

    # Collapse repeated ids, preserving order, so merged_count reports the
    # number of entities merged rather than the number of arguments passed.
    unique_merge_ids = list(dict.fromkeys(merge_ids))
    all_ids = [keep_id, *unique_merge_ids]

    async with self.db.transaction() as tx:
        # Verify all entities exist and share a namespace. Cross-namespace
        # merges are almost always a bug.
        rows = await tx.fetch_all(
            "SELECT id, namespace FROM entities WHERE id = ANY(%s)",
            (all_ids,),
        )
        found_ids = {r["id"] for r in rows}
        missing = set(all_ids) - found_ids
        if missing:
            raise ValueError(f"entities not found: {sorted(missing)}")
        namespaces = {r["namespace"] for r in rows}
        if len(namespaces) > 1:
            raise ValueError(f"cross-namespace merge refused: {sorted(namespaces)}")

        # Repoint relationships. After rewriting src_id and dst_id, any
        # edge whose src and dst both collapse to keep_id becomes a
        # self-loop — delete those. Remaining duplicates (same src, dst,
        # rel_type after the rewrite) collapse to one row each.
        await tx.execute(
            "UPDATE relationships SET src_id = %s WHERE src_id = ANY(%s)",
            (keep_id, unique_merge_ids),
        )
        await tx.execute(
            "UPDATE relationships SET dst_id = %s WHERE dst_id = ANY(%s)",
            (keep_id, unique_merge_ids),
        )
        # Drop self-loops on keep_id (this matches any keep_id -> keep_id
        # edge, including ones the merge just created).
        await tx.execute(
            "DELETE FROM relationships WHERE src_id = dst_id AND (src_id = %s OR dst_id = %s)",
            (keep_id, keep_id),
        )
        # Collapse duplicate edges (keep the lowest id per group).
        # NOTE(review): relationship_chunks rows of the dropped duplicates
        # are not repointed to the surviving edge here — confirm that
        # losing that provenance is intended.
        await tx.execute(
            "DELETE FROM relationships a USING relationships b "
            "WHERE a.id > b.id AND a.src_id = b.src_id AND "
            "a.dst_id = b.dst_id AND a.rel_type = b.rel_type AND "
            "a.namespace = b.namespace AND (a.src_id = %s OR a.dst_id = %s)",
            (keep_id, keep_id),
        )

        # Copy entity_chunks rows from merged entities to keep_id,
        # deduping via ON CONFLICT, then delete the old rows.
        await tx.execute(
            "INSERT INTO entity_chunks (entity_id, chunk_id, confidence, provenance) "
            "SELECT %s, chunk_id, confidence, provenance FROM entity_chunks "
            "WHERE entity_id = ANY(%s) "
            "ON CONFLICT DO NOTHING",
            (keep_id, unique_merge_ids),
        )
        await tx.execute(
            "DELETE FROM entity_chunks WHERE entity_id = ANY(%s)",
            (unique_merge_ids,),
        )

        # Delete merged entities.
        await tx.execute("DELETE FROM entities WHERE id = ANY(%s)", (unique_merge_ids,))

    return {"kept": keep_id, "merged_count": len(unique_merge_ids)}
1393
+
1394
async def prune_orphans(self, namespace: str | None = None) -> dict:
    """Delete entities and relationships with no chunk links.

    An entity is an orphan when its id does not appear in
    ``entity_chunks``; a relationship is an orphan when its id does not
    appear in ``relationship_chunks``. Both counts and both DELETEs run
    inside one transaction, so a failure cannot leave entities pruned but
    relationships untouched.

    Args:
        namespace: Namespace to prune; a falsy value falls back to
            ``self.config.namespace``. Checked with ``_validate_namespace``.

    Returns:
        ``{"entities_pruned": int, "relationships_pruned": int}`` — the
        orphan counts measured immediately before the deletes.
    """
    ns = namespace or self.config.namespace
    _validate_namespace(ns)
    async with self.db.transaction() as tx:
        # Count first, then delete — gives a clean int return value that's
        # easy to assert on in tests and log in production.
        ent_row = await tx.fetch_one(
            "SELECT count(*) AS cnt FROM entities WHERE namespace = %s "
            "AND id NOT IN (SELECT DISTINCT entity_id FROM entity_chunks)",
            (ns,),
        )
        rel_row = await tx.fetch_one(
            "SELECT count(*) AS cnt FROM relationships WHERE namespace = %s "
            "AND id NOT IN (SELECT DISTINCT relationship_id FROM relationship_chunks)",
            (ns,),
        )
        entities_pruned = ent_row["cnt"] if ent_row else 0
        relationships_pruned = rel_row["cnt"] if rel_row else 0
        await tx.execute(
            "DELETE FROM entities WHERE namespace = %s AND id NOT IN "
            "(SELECT DISTINCT entity_id FROM entity_chunks)",
            (ns,),
        )
        await tx.execute(
            "DELETE FROM relationships WHERE namespace = %s AND id NOT IN "
            "(SELECT DISTINCT relationship_id FROM relationship_chunks)",
            (ns,),
        )
    return {
        "entities_pruned": entities_pruned,
        "relationships_pruned": relationships_pruned,
    }
1426
+
1427
async def tune_scoring_weights(self, **kwargs):
    """Grid-search scoring weights against a gold QA set.

    Thin wrapper: forwards this instance and every keyword argument to
    ``pg_raggraph.evolution.tune_scoring_weights`` and returns its result.
    See src/pg_raggraph/evolution.py:tune_scoring_weights for args.
    """
    from pg_raggraph.evolution import tune_scoring_weights as run_grid_search

    outcome = await run_grid_search(self, **kwargs)
    return outcome