ltcai 3.5.0 → 3.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,217 @@
1
+ """Browser & web ingestion — Knowledge Graph inputs, not standalone features.
2
+
3
+ v3.6.0 Knowledge Graph First: a public URL or an open browser tab is just another
4
+ source that converges into the Knowledge Graph through the unified ingestion
5
+ pipeline. Everything runs on the **local runtime** — the server fetches/reads
6
+ locally, stores into local SQLite, and never uploads to a cloud service.
7
+
8
+ Two layers, both feeding ``IngestionPipeline.ingest``:
9
+
10
+ * ``POST /api/browser/read-url`` — the runtime fetches a public URL locally,
11
+ extracts readable text, stores it as ``source_type=web_url``. Fails gracefully
12
+ on blocked / login-required pages.
13
+ * ``POST /api/browser/ingest-current-tab`` — accepts a payload from the local
14
+ browser extension (url/title/text/selected_text/html), sanitizes + size-limits
15
+ it, stores it as ``source_type=browser_tab``.
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ from html.parser import HTMLParser
21
+ from typing import Any, Callable, Optional, Tuple
22
+ from urllib.parse import urlparse
23
+
24
+ from fastapi import APIRouter, HTTPException, Request
25
+ from pydantic import BaseModel
26
+
27
+ from latticeai.services.ingestion import IngestionItem
28
+
29
# Size caps are expressed in bytes; the timeout is expressed in seconds.
MAX_TAB_BYTES = 4 * 1024 ** 2  # 4 MB per captured tab payload
MAX_URL_FETCH_BYTES = 4 * 1024 ** 2  # 4 MB cap on a fetched page
URL_FETCH_TIMEOUT = 12.0  # seconds
32
+
33
+
34
class BrowserFetchError(Exception):
    """Raised when a URL cannot be read.

    Used for blocked or login-required pages, transport failures such as
    timeouts, and unsupported responses.
    """
36
+
37
+
38
+ # ── readable-text extraction ─────────────────────────────────────────────────
39
_SKIP_TAGS = {"script", "style", "noscript", "template", "svg", "head"}


class _TextExtractor(HTMLParser):
    """Collect the page title and the visible text of an HTML document.

    Text inside any ``_SKIP_TAGS`` element is ignored. A line break is emitted
    when a block-level element closes and at every ``<br>``.
    """

    # Closing one of these elements ends the current line of output.
    _BREAK_TAGS = frozenset(
        {"p", "div", "br", "li", "h1", "h2", "h3", "h4", "h5", "h6", "section", "article"}
    )

    def __init__(self) -> None:
        super().__init__(convert_charrefs=True)
        self._skip_depth = 0  # > 0 while inside at least one skipped element
        self._chunks: list[str] = []
        self.title: str = ""
        self._in_title = False

    def handle_starttag(self, tag, attrs):
        """Track skipped regions and the title; emit the break for ``<br>``."""
        if tag in _SKIP_TAGS:
            self._skip_depth += 1
        if tag == "title":
            self._in_title = True
        if tag == "br":
            # <br> is a void element: the usual "<br>" spelling never produces
            # an end-tag callback, so the break has to be emitted here.
            self._chunks.append("\n")

    def handle_endtag(self, tag):
        """Leave skipped regions and the title; break the line after blocks."""
        if tag in _SKIP_TAGS and self._skip_depth > 0:
            self._skip_depth -= 1
        if tag == "title":
            self._in_title = False
        if tag in self._BREAK_TAGS:
            self._chunks.append("\n")

    def handle_data(self, data):
        """Append text to the title and/or to the visible-text chunks."""
        if self._in_title:
            self.title += data
        if self._skip_depth == 0:
            text = data.strip()
            if text:
                self._chunks.append(text)

    def text(self) -> str:
        """Return the collected text, one non-empty stripped line per row."""
        raw = " ".join(self._chunks)
        # Split on the emitted breaks, trim each line, and drop empty ones.
        lines = [ln.strip() for ln in raw.replace("\r", "").split("\n")]
        return "\n".join([ln for ln in lines if ln]).strip()
77
+
78
+
79
def extract_readable_text(html: str) -> Tuple[str, str]:
    """Return ``(title, readable_text)`` from an HTML string. Never raises.

    Args:
        html: Markup to parse; ``None`` or ``""`` yields ``("", "")``.

    Returns:
        The stripped ``<title>`` text and the extracted visible text. If
        parsing fails part-way, whatever was collected up to that point is
        returned.
    """
    parser = _TextExtractor()
    try:
        parser.feed(html or "")
        # feed() may hold back trailing text (for example an unterminated
        # "&" sequence such as "AT&T"); close() forces it to be processed.
        parser.close()
    except Exception:  # noqa: BLE001 — malformed HTML must still yield best-effort text
        pass
    return parser.title.strip(), parser.text()
87
+
88
+
89
def _default_fetch_url(url: str) -> Tuple[str, str]:
    """Fetch a URL on the local runtime and extract readable text.

    The response is streamed and reading stops once ``MAX_URL_FETCH_BYTES``
    have been received, so an oversized page is truncated rather than being
    downloaded in full.

    Args:
        url: Absolute http(s) URL to fetch.

    Returns:
        ``(title, text)``; the title falls back to ``url`` when the page has
        no title.

    Raises:
        BrowserFetchError: On a transport failure or invalid URL, on an HTTP
            status of 400 or above, or when the declared content type is
            neither HTML nor text.
    """
    import httpx

    # NOTE(review): whatever host the URL names is contacted, loopback and
    # private addresses included — confirm that is acceptable for this route.
    received = bytearray()
    try:
        with httpx.Client(
            follow_redirects=True,
            timeout=URL_FETCH_TIMEOUT,
            headers={"User-Agent": "LatticeAI-local/3.6 (+local-first knowledge graph)"},
        ) as client:
            with client.stream("GET", url) as resp:
                status = resp.status_code
                if status in (401, 403):
                    raise BrowserFetchError(
                        "The page is login-required or blocked (HTTP %s)." % status
                    )
                if status >= 400:
                    raise BrowserFetchError(f"The page returned HTTP {status}.")
                content_type = resp.headers.get("content-type", "")
                kind = content_type.lower()
                # A missing content type is let through; only a declared
                # non-text type is rejected.
                if kind and "html" not in kind and "text" not in kind:
                    raise BrowserFetchError(f"Unsupported content type: {content_type}.")
                charset = resp.charset_encoding or "utf-8"
                for chunk in resp.iter_bytes():
                    received.extend(chunk)
                    if len(received) >= MAX_URL_FETCH_BYTES:
                        break  # cap reached: stop reading the body
    except (httpx.HTTPError, httpx.InvalidURL) as exc:
        # InvalidURL is not an HTTPError subclass, so it is named explicitly.
        raise BrowserFetchError(f"Could not reach the page: {exc}") from exc

    payload = bytes(received[:MAX_URL_FETCH_BYTES])
    try:
        body = payload.decode(charset, "replace")
    except LookupError:
        # The server declared a charset Python does not know.
        body = payload.decode("utf-8", "replace")
    title, text = extract_readable_text(body)
    return (title or url, text)
118
+
119
+
120
def _validate_http_url(url: str) -> str:
    """Return ``url`` stripped of surrounding whitespace, or reject it.

    Args:
        url: Candidate URL from the request body; may be ``None`` or empty.

    Returns:
        The stripped URL, otherwise unchanged.

    Raises:
        HTTPException: 400 when the URL is empty, cannot be parsed, uses a
            scheme other than http/https, or names no host.
    """
    url = (url or "").strip()
    if not url:
        raise HTTPException(status_code=400, detail="url is required.")
    try:
        parsed = urlparse(url)
    except ValueError as exc:
        # urlparse raises for inputs such as an unbalanced IPv6 bracket.
        raise HTTPException(status_code=400, detail="Malformed URL.") from exc
    if parsed.scheme not in ("http", "https"):
        raise HTTPException(status_code=400, detail="Only http(s) URLs are supported.")
    # netloc can be non-empty while naming no host, e.g. "http://:80".
    if not parsed.netloc or not parsed.hostname:
        raise HTTPException(status_code=400, detail="Malformed URL.")
    return url
130
+
131
+
132
+ # ── request models ───────────────────────────────────────────────────────────
133
class ReadUrlRequest(BaseModel):
    # Request body for POST /api/browser/read-url.

    # Page to fetch; the route accepts http(s) URLs only.
    url: str
    # Passed through unchanged to the ingested item.
    workspace_id: Optional[str] = None
136
+
137
+
138
class IngestTabRequest(BaseModel):
    # Request body for POST /api/browser/ingest-current-tab.

    # Address of the captured tab; the route accepts http(s) URLs only.
    url: str
    title: Optional[str] = None
    # Content candidates. The route uses ``text`` first, then text extracted
    # from ``html``, then ``selected_text``.
    text: Optional[str] = None
    selected_text: Optional[str] = None
    html: Optional[str] = None
    # Both values below are passed through unchanged to the ingested item.
    captured_at: Optional[str] = None
    workspace_id: Optional[str] = None
146
+
147
+
148
def create_browser_router(
    *,
    pipeline: Any,
    require_user: Callable[[Request], str],
    fetch_url: Optional[Callable[[str], Tuple[str, str]]] = None,
    max_tab_bytes: int = MAX_TAB_BYTES,
) -> APIRouter:
    """Build the router exposing the two browser/web ingestion endpoints.

    Args:
        pipeline: Ingestion pipeline. It must provide ``available()`` and
            ``ingest(item, user_email=...)`` returning an object with
            ``as_dict()``. ``None`` makes both routes answer 503.
        require_user: Called with the request; its return value is used as the
            item owner and as ``user_email``.
        fetch_url: Optional replacement for the default URL fetcher. It takes
            a URL, returns ``(title, text)``, and reports an unreadable page by
            raising :class:`BrowserFetchError`.
        max_tab_bytes: Upper bound, in UTF-8 bytes, applied separately to each
            of ``text``, ``html`` and ``selected_text`` of a captured tab.

    Returns:
        The configured :class:`fastapi.APIRouter`.
    """
    # Imported locally so the module's top-level import block is unchanged.
    from fastapi.concurrency import run_in_threadpool

    router = APIRouter()
    _fetch = fetch_url or _default_fetch_url

    def _require_pipeline():
        # 503 rather than 500: ingestion is switched off, not broken.
        if pipeline is None or not pipeline.available():
            raise HTTPException(status_code=503, detail="Knowledge Graph ingestion is disabled.")

    @router.post("/api/browser/read-url")
    async def read_url(req: ReadUrlRequest, request: Request):
        """Fetch a public URL locally and ingest it as a web_url source."""
        user = require_user(request)
        _require_pipeline()
        url = _validate_http_url(req.url)
        try:
            # The fetcher is a blocking call; running it in the threadpool
            # keeps the event loop free while the page downloads.
            title, text = await run_in_threadpool(_fetch, url)
        except BrowserFetchError as exc:
            # Graceful failure — not a 5xx; the page was simply unreadable.
            raise HTTPException(status_code=422, detail=str(exc)) from exc
        if not (text or "").strip():
            return {"status": "empty", "source_type": "web_url", "url": url,
                    "detail": "No readable text was extracted from the page."}
        res = pipeline.ingest(
            IngestionItem(
                source_type="web_url", title=title, text=text, source_uri=url,
                owner=user, workspace_id=req.workspace_id,
            ),
            user_email=user,
        )
        return res.as_dict()

    @router.post("/api/browser/ingest-current-tab")
    async def ingest_current_tab(req: IngestTabRequest, request: Request):
        """Ingest a payload captured from the local browser extension."""
        user = require_user(request)
        _require_pipeline()
        url = _validate_http_url(req.url)
        # Sanitize: reject an oversized field before doing any extraction work.
        for value in (req.text, req.html, req.selected_text):
            if value and len(value.encode("utf-8", "ignore")) > max_tab_bytes:
                raise HTTPException(status_code=413, detail="Captured payload is too large.")
        # Preference order: explicit text, text extracted from html, selection.
        text = (req.text or "").strip()
        html_title = ""
        if not text and req.html:
            html_title, text = extract_readable_text(req.html)
        if not text:
            text = (req.selected_text or "").strip()
        if not text:
            raise HTTPException(status_code=400, detail="No text, html, or selected_text provided.")
        res = pipeline.ingest(
            IngestionItem(
                source_type="browser_tab",
                # When html was parsed, prefer its <title> over the bare URL.
                title=req.title or html_title or url,
                text=text,
                source_uri=url,
                captured_at=req.captured_at,
                owner=user,
                workspace_id=req.workspace_id,
                metadata={"has_selection": bool(req.selected_text)},
            ),
            user_email=user,
        )
        return res.as_dict()

    return router
@@ -0,0 +1,93 @@
1
+ """Knowledge Graph portability routes — local export / import / backup / restore.
2
+
3
+ Reads (export, status) require a signed-in user. Mutating operations (import,
4
+ backup, restore, file export) require admin because the graph is machine-global,
5
+ not workspace-scoped. Nothing here touches a cloud service.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from typing import Any, Callable, Optional
11
+
12
+ from fastapi import APIRouter, HTTPException, Request
13
+ from pydantic import BaseModel
14
+
15
+
16
class ImportRequest(BaseModel):
    # Request body for POST /api/knowledge-graph/import.

    # Artifact handed to the portability service's ``import_data``.
    artifact: dict
    # Forwarded as ``mode``; the default is "merge".
    mode: str = "merge"
    # Forwarded as ``dry_run``.
    dry_run: bool = False
20
+
21
+
22
class BackupRequest(BaseModel):
    # Request body for POST /api/knowledge-graph/backup.

    # Forwarded as-is to the service's ``backup``; may be omitted.
    path: Optional[str] = None
24
+
25
+
26
class RestoreRequest(BaseModel):
    # Request body for POST /api/knowledge-graph/restore.

    # Forwarded to the service's ``restore``; required.
    path: str
    # Forwarded as ``verify``; defaults to True.
    verify: bool = True
29
+
30
+
31
def create_portability_router(
    *,
    service: Any,
    require_user: Callable[[Request], str],
    require_admin: Callable[[Request], Any],
) -> APIRouter:
    """Build the router for Knowledge Graph export/import/backup/restore.

    Status, provenance and JSON export are guarded by ``require_user``; file
    export, import, backup and restore are guarded by ``require_admin``.

    Args:
        service: Portability service. It must provide ``available()`` plus the
            methods the routes call (``snapshot_metadata``,
            ``recent_ingestions``, ``export``, ``export_to_file``,
            ``import_data``, ``backup``, ``restore``). ``None`` makes every
            route answer 503.
        require_user: Called with the request before read operations.
        require_admin: Called with the request before mutating operations.

    Returns:
        The configured :class:`fastapi.APIRouter`.
    """
    router = APIRouter()

    def _require_service():
        # 503 rather than 500: the graph is switched off, not broken.
        if service is None or not service.available():
            raise HTTPException(status_code=503, detail="Knowledge Graph is disabled.")

    @router.get("/api/knowledge-graph/portability")
    async def portability_status(request: Request):
        require_user(request)
        _require_service()
        return service.snapshot_metadata()

    @router.get("/api/knowledge-graph/provenance")
    async def recent_provenance(request: Request, limit: int = 50, source_type: Optional[str] = None):
        """Recent ingestions (provenance trail) for the ingestion-sources UI."""
        require_user(request)
        _require_service()
        return service.recent_ingestions(limit=limit, source_type=source_type)

    @router.post("/api/knowledge-graph/export")
    async def export_graph(request: Request):
        """Logical JSON export of the whole graph (read-only)."""
        require_user(request)
        _require_service()
        return service.export()

    @router.post("/api/knowledge-graph/export-file")
    async def export_graph_file(request: Request):
        require_admin(request)
        _require_service()
        return service.export_to_file()

    @router.post("/api/knowledge-graph/import")
    async def import_graph(req: ImportRequest, request: Request):
        require_admin(request)
        _require_service()
        try:
            return service.import_data(req.artifact, mode=req.mode, dry_run=req.dry_run)
        except ValueError as exc:
            raise HTTPException(status_code=400, detail=str(exc)) from exc

    @router.post("/api/knowledge-graph/backup")
    async def backup_graph(req: BackupRequest, request: Request):
        require_admin(request)
        _require_service()
        try:
            return service.backup(req.path)
        except (ValueError, FileNotFoundError) as exc:
            # Same mapping as restore: a bad caller-supplied path is a client
            # error, not a server fault.
            raise HTTPException(status_code=400, detail=str(exc)) from exc

    @router.post("/api/knowledge-graph/restore")
    async def restore_graph(req: RestoreRequest, request: Request):
        require_admin(request)
        _require_service()
        try:
            return service.restore(req.path, verify=req.verify)
        except (ValueError, FileNotFoundError) as exc:
            raise HTTPException(status_code=400, detail=str(exc)) from exc

    return router
@@ -11,7 +11,7 @@ from copy import deepcopy
11
11
  from typing import Any, Dict, List, Optional
12
12
 
13
13
 
14
- MARKETPLACE_VERSION = "3.5.0"
14
+ MARKETPLACE_VERSION = "3.6.0"
15
15
  TEMPLATE_KINDS = ("plugin", "workflow", "agent")
16
16
 
17
17
 
@@ -14,7 +14,7 @@ from datetime import datetime
14
14
  from typing import Any, Callable, Dict, List, Optional
15
15
 
16
16
 
17
- MULTI_AGENT_VERSION = "3.5.0"
17
+ MULTI_AGENT_VERSION = "3.6.0"
18
18
 
19
19
  AGENT_ROLES = ("researcher", "planner", "executor", "reviewer", "release")
20
20
  CORE_PIPELINE = ("planner", "executor", "reviewer")
@@ -18,7 +18,7 @@ from pathlib import Path
18
18
  from typing import Any, Callable, Dict, Iterable, List, Optional
19
19
 
20
20
 
21
- WORKSPACE_OS_VERSION = "3.5.0"
21
+ WORKSPACE_OS_VERSION = "3.6.0"
22
22
 
23
23
  # Workspace types separate single-user Personal workspaces from shared
24
24
  # Organization workspaces. Both keep the same local-first JSON store; the type
@@ -120,7 +120,11 @@ from latticeai.core.builtin_hooks import register_builtin_hook_runners
120
120
  from latticeai.api.agent_registry import create_agent_registry_router
121
121
  from latticeai.core.agent_registry import AgentRegistry
122
122
  from latticeai.api.memory import create_memory_router
123
+ from latticeai.api.browser import create_browser_router
124
+ from latticeai.api.portability import create_portability_router
123
125
  from latticeai.services.memory_service import MemoryService
126
+ from latticeai.services.ingestion import IngestionPipeline
127
+ from latticeai.services.kg_portability import KGPortabilityService
124
128
  from latticeai.services.tool_dispatch import (
125
129
  LOCAL_WRITE_BLOCKED_PREFIXES as _LOCAL_WRITE_BLOCKED_PREFIXES,
126
130
  TOOL_GOVERNANCE,
@@ -322,6 +326,23 @@ MEMORY_SERVICE = MemoryService(
322
326
  enable_graph=ENABLE_GRAPH,
323
327
  history_file=HISTORY_FILE,
324
328
  )
329
+ # ── v3.6.0 unified ingestion pipeline: the single write-side seam into the
330
+ # Knowledge Graph. Every new source (web URL, browser tab, …) flows through this
331
+ # so pre_tool/post_tool hooks fire on ingestion and provenance is captured
332
+ # uniformly. Existing direct ingest callers keep working; new paths converge here.
333
+ INGESTION_PIPELINE = IngestionPipeline(
334
+ KNOWLEDGE_GRAPH,
335
+ hooks=HOOKS_REGISTRY,
336
+ enable_graph=ENABLE_GRAPH,
337
+ audit=lambda action, detail, user: append_audit_event(action, user_email=user, **detail),
338
+ )
339
+ # ── v3.6.0 Knowledge Graph portability: local export / import / backup / restore.
340
+ # The graph is the user's durable asset, so it must be portable with no cloud.
341
+ KG_PORTABILITY = KGPortabilityService(
342
+ knowledge_graph=KNOWLEDGE_GRAPH,
343
+ data_dir=DATA_DIR,
344
+ enable_graph=ENABLE_GRAPH,
345
+ )
325
346
 
326
347
  def _require_graph():
327
348
  if not ENABLE_GRAPH or KNOWLEDGE_GRAPH is None:
@@ -1508,6 +1529,17 @@ app.include_router(create_memory_router(
1508
1529
  append_audit_event=append_audit_event,
1509
1530
  ))
1510
1531
 
1532
+ app.include_router(create_browser_router(
1533
+ pipeline=INGESTION_PIPELINE,
1534
+ require_user=require_user,
1535
+ ))
1536
+
1537
+ app.include_router(create_portability_router(
1538
+ service=KG_PORTABILITY,
1539
+ require_user=require_user,
1540
+ require_admin=require_admin,
1541
+ ))
1542
+
1511
1543
  app.include_router(create_garden_router(gardener=gardener, require_user=require_user))
1512
1544
  app.include_router(create_setup_router(model_router=router, require_user=require_user))
1513
1545
 
@@ -0,0 +1,271 @@
1
+ """Unified ingestion pipeline — the single write-side seam into the Knowledge Graph.
2
+
3
+ v3.6.0 Knowledge Graph First principle: *no data source bypasses the Knowledge
4
+ Graph and no source creates an isolated silo*. Every source — local files,
5
+ connected folders, PDFs/Markdown/text/code, web URLs, browser tabs — is
6
+ normalized into one :class:`IngestionItem` and pushed through one
7
+ :meth:`IngestionPipeline.ingest` entrypoint:
8
+
9
+ Source → normalize → content hash → (file | text) ingest → provenance
10
+
11
+ The pipeline is deliberately thin. It owns normalization, idempotency reporting,
12
+ provenance capture, and — crucially — routing every ingest through the shared
13
+ ``dispatch_tool`` lifecycle so ``pre_tool``/``post_tool`` hooks fire on data
14
+ ingestion exactly as they do on tool calls. The heavy graph construction lives in
15
+ :class:`knowledge_graph.KnowledgeGraphStore` (``ingest_document`` for files,
16
+ ``ingest_source`` for text/web), which this module composes rather than
17
+ re-implements.
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import hashlib
23
+ from dataclasses import dataclass, field
24
+ from datetime import datetime, timezone
25
+ from pathlib import Path
26
+ from typing import Any, Dict, List, Optional
27
+
28
+ from latticeai.core.hooks import dispatch_tool
29
+
30
# Source types that arrive as a file on disk (read via ingest_document).
FILE_SOURCE_TYPES = frozenset(("file", "local_file", "upload", "pdf"))
# Source types that arrive as extracted text (read via ingest_source).
TEXT_SOURCE_TYPES = frozenset(
    ("web_url", "browser_tab", "text", "markdown", "note", "code", "clipboard")
)

DEFAULT_MAX_TEXT_BYTES = 5 * 1024 ** 2  # 5 MB of extracted text per item
38
+
39
+
40
def _now_iso() -> str:
    """Return the current UTC time as an ISO-8601 string with its offset."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat()
42
+
43
+
44
@dataclass
class IngestionItem:
    """One unit of content to ingest, in the shape shared by all source types."""

    source_type: str
    title: Optional[str] = None
    # Payload: text/web sources carry ``text``; file sources carry ``path``.
    text: Optional[str] = None
    path: Optional[str] = None
    source_uri: Optional[str] = None
    mime_type: Optional[str] = None
    owner: Optional[str] = None
    workspace_id: Optional[str] = None
    permissions: Optional[Dict[str, Any]] = None
    captured_at: Optional[str] = None
    modified_at: Optional[str] = None
    conversation_id: Optional[str] = None
    agent_used: Optional[str] = None
    # Each instance gets its own dict.
    metadata: Dict[str, Any] = field(default_factory=dict)
62
+
63
+
64
@dataclass
class IngestionResult:
    """What one ingestion produced, including provenance and idempotency."""

    # One of: ok | unavailable | blocked | failed.
    status: str
    source_type: str
    node_id: Optional[str] = None
    source_node_id: Optional[str] = None
    content_hash: Optional[str] = None
    title: Optional[str] = None
    chunk_ids: List[str] = field(default_factory=list)
    chunk_count: int = 0
    duplicate: bool = False
    embedded: bool = False
    # One of: indexed | skipped | failed | pending.
    indexing_status: str = "pending"
    provenance_id: Optional[str] = None
    detail: Optional[str] = None

    def as_dict(self) -> Dict[str, Any]:
        """Return every field as a plain dict, keys in declaration order."""
        exported = (
            "status",
            "source_type",
            "node_id",
            "source_node_id",
            "content_hash",
            "title",
            "chunk_ids",
            "chunk_count",
            "duplicate",
            "embedded",
            "indexing_status",
            "provenance_id",
            "detail",
        )
        return {name: getattr(self, name) for name in exported}
98
+
99
+
100
class IngestionPipeline:
    """Single normalized entrypoint that feeds every source into the graph.

    ``ingest`` routes an :class:`IngestionItem` to the store's file or text
    ingest method via ``dispatch_tool``, records provenance, and reports the
    outcome as an :class:`IngestionResult`.
    """

    def __init__(
        self,
        knowledge_graph: Any,
        *,
        hooks: Any = None,
        enable_graph: bool = True,
        audit: Optional[Any] = None,
        max_text_bytes: int = DEFAULT_MAX_TEXT_BYTES,
        pipeline_name: str = "unified-ingestion",
    ) -> None:
        """Store the collaborators.

        Args:
            knowledge_graph: Graph store; ``None`` makes the pipeline unavailable.
            hooks: Passed as the first argument to ``dispatch_tool``.
            enable_graph: When false the pipeline reports itself unavailable.
            audit: Optional callable invoked as ``audit(action, detail, user_email)``
                after a successful ingest; any exception it raises is swallowed.
            max_text_bytes: Largest accepted text payload, in UTF-8 bytes.
            pipeline_name: Recorded as ``pipeline`` on each provenance entry.
        """
        self._kg = knowledge_graph
        self._hooks = hooks
        self._enable = bool(enable_graph)
        self._audit = audit
        self._max_text_bytes = int(max_text_bytes)
        self._pipeline_name = pipeline_name

    def available(self) -> bool:
        """Return True when the graph is enabled and a store is attached."""
        return self._enable and self._kg is not None

    # ── public API ───────────────────────────────────────────────────────────
    def ingest(self, item: "IngestionItem", *, user_email: Optional[str] = None) -> "IngestionResult":
        """Normalize, route through dispatch_tool, and record provenance.

        Args:
            item: The content to ingest.
            user_email: Acting user; also the owner when ``item.owner`` is unset.

        Returns:
            An :class:`IngestionResult` whose ``status`` is ``unavailable`` when
            the graph is off, ``blocked`` when dispatch raised
            ``PermissionError``, ``failed`` when it raised anything else, and
            ``ok`` otherwise.
        """
        # A missing or whitespace-only source type falls back to "text".
        source_type = str(item.source_type or "").strip() or "text"
        if not self.available():
            return IngestionResult(
                status="unavailable",
                source_type=source_type,
                indexing_status="skipped",
                detail="Knowledge Graph is disabled (LATTICEAI_ENABLE_GRAPH).",
            )

        captured_at = item.captured_at or _now_iso()
        owner = item.owner or user_email
        tool_name = f"kg_ingest.{source_type}"
        # Arguments handed to dispatch_tool; the payload text is not included.
        args = {
            "source_type": source_type,
            "source_uri": item.source_uri,
            "owner": owner,
            "workspace_id": item.workspace_id,
        }

        def _run() -> Dict[str, Any]:
            # File route: a file source type, or a path given with no text.
            if source_type in FILE_SOURCE_TYPES or (item.path and not item.text):
                return self._ingest_file(
                    item, source_type=source_type, owner=owner, captured_at=captured_at
                )
            return self._ingest_text(
                item, source_type=source_type, owner=owner, captured_at=captured_at
            )

        try:
            raw = dispatch_tool(
                self._hooks, tool_name, args, _run,
                user_email=user_email, workspace_id=item.workspace_id, source="ingestion",
            )
        except PermissionError as exc:
            return IngestionResult(
                status="blocked", source_type=source_type,
                indexing_status="skipped", detail=str(exc),
            )
        except Exception as exc:  # noqa: BLE001 — surface as a failed result, never crash the caller
            # Also covers FileNotFoundError from the file route.
            return IngestionResult(
                status="failed", source_type=source_type,
                indexing_status="failed", detail=str(exc),
            )

        node_id = raw.get("node_id")
        source_node_id = raw.get("source_node_id")
        content_hash = raw.get("content_hash") or raw.get("sha256")
        chunk_ids = list(raw.get("chunk_ids") or [])
        duplicate = bool(raw.get("duplicate"))
        embedded = bool(self._kg.node_is_embedded(node_id)) if node_id else False
        title = raw.get("title") or item.title

        prov = self._kg.record_provenance(
            node_id=node_id,
            source_type=source_type,
            pipeline=self._pipeline_name,
            source_uri=item.source_uri,
            content_hash=content_hash,
            title=title,
            owner=owner,
            workspace_id=item.workspace_id,
            captured_at=captured_at,
            modified_at=item.modified_at,
            embedded=embedded,
            linked=bool(source_node_id),
            duplicate=duplicate,
            agent_used=item.agent_used,
            chunk_count=len(chunk_ids),
            permissions=item.permissions,
            metadata=item.metadata,
        )
        if self._audit is not None:
            try:
                self._audit(
                    "kg_ingest",
                    {
                        "source_type": source_type, "node_id": node_id,
                        "content_hash": content_hash, "duplicate": duplicate,
                    },
                    user_email,
                )
            except Exception:  # noqa: BLE001 — audit must never break ingestion
                pass

        return IngestionResult(
            status="ok",
            source_type=source_type,
            node_id=node_id,
            source_node_id=source_node_id,
            content_hash=content_hash,
            title=title,
            chunk_ids=chunk_ids,
            chunk_count=len(chunk_ids),
            duplicate=duplicate,
            embedded=embedded,
            indexing_status="indexed",
            # Tolerate a store that returns nothing from record_provenance.
            provenance_id=(prov or {}).get("id"),
        )

    # ── routing helpers ──────────────────────────────────────────────────────
    def _ingest_text(self, item, *, source_type, owner, captured_at) -> Dict[str, Any]:
        """Send a text payload to the store's ``ingest_source``.

        Raises:
            ValueError: When the UTF-8 encoded text exceeds the configured limit.
        """
        text = item.text or ""
        if len(text.encode("utf-8", "ignore")) > self._max_text_bytes:
            raise ValueError(
                f"Text payload exceeds the {self._max_text_bytes // (1024 * 1024)}MB ingestion limit."
            )
        title = item.title or item.source_uri or source_type
        return self._kg.ingest_source(
            source_type=source_type,
            title=title,
            text=text,
            source_uri=item.source_uri,
            owner=owner,
            workspace_id=item.workspace_id,
            permissions=item.permissions,
            captured_at=captured_at,
            modified_at=item.modified_at,
            conversation_id=item.conversation_id,
            # Item metadata wins over the mime_type entry on a key clash.
            metadata={"mime_type": item.mime_type, **(item.metadata or {})},
        )

    def _ingest_file(self, item, *, source_type, owner, captured_at) -> Dict[str, Any]:
        """Send a file on disk to the store's ``ingest_document``.

        Raises:
            ValueError: When the item has no path.
            FileNotFoundError: When the path does not exist.
        """
        if not item.path:
            raise ValueError("File ingestion requires a path.")
        path = Path(item.path)
        if not path.exists():
            raise FileNotFoundError(f"File not found: {path}")
        return self._kg.ingest_document(
            path,
            original_filename=item.title or path.name,
            mime_type=item.mime_type,
            uploader=owner,
            conversation_id=item.conversation_id,
            extracted=item.metadata.get("extracted") if item.metadata else None,
            source_type=source_type,
            source_uri=item.source_uri or str(path),
            captured_at=captured_at,
            modified_at=item.modified_at,
            owner=owner,
            workspace_id=item.workspace_id,
            permissions=item.permissions,
        )
267
+
268
+
269
def content_hash_text(text: str) -> str:
    """Return the SHA-256 hex digest of ``text`` encoded as UTF-8.

    A falsy ``text`` is hashed as the empty string, and characters that cannot
    be encoded are dropped.
    """
    payload = text if text else ""
    digest = hashlib.sha256()
    digest.update(payload.encode("utf-8", "ignore"))
    return digest.hexdigest()