ltcai 3.5.0 → 3.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +62 -28
- package/docs/CARRYOVER_AUDIT_v3.6.0.md +61 -0
- package/docs/CHANGELOG.md +32 -0
- package/docs/HANDOVER_v3.6.0.md +46 -0
- package/docs/RUNTIME_HOOK_COVERAGE_v3.6.0.md +49 -0
- package/docs/architecture.md +13 -12
- package/docs/kg-schema.md +55 -0
- package/docs/privacy.md +18 -2
- package/docs/security-model.md +17 -0
- package/kg_schema.py +46 -0
- package/knowledge_graph.py +520 -1
- package/latticeai/__init__.py +1 -1
- package/latticeai/api/browser.py +217 -0
- package/latticeai/api/portability.py +93 -0
- package/latticeai/core/marketplace.py +1 -1
- package/latticeai/core/multi_agent.py +1 -1
- package/latticeai/core/workspace_os.py +1 -1
- package/latticeai/server_app.py +32 -0
- package/latticeai/services/ingestion.py +271 -0
- package/latticeai/services/kg_portability.py +177 -0
- package/package.json +3 -2
- package/scripts/build_vsix.mjs +72 -0
- package/static/v3/asset-manifest.json +7 -7
- package/static/v3/js/{app.d086489d.js → app.c541f955.js} +1 -1
- package/static/v3/js/core/{api.12b568ad.js → api.33d6320e.js} +38 -0
- package/static/v3/js/core/api.js +38 -0
- package/static/v3/js/core/{routes.d214b399.js → routes.2ce3815a.js} +1 -1
- package/static/v3/js/core/routes.js +1 -1
- package/static/v3/js/core/{shell.d05266f5.js → shell.8c163e0e.js} +2 -2
- package/static/v3/js/views/knowledge-graph.a96040a5.js +513 -0
- package/static/v3/js/views/knowledge-graph.js +293 -17
- package/static/v3/js/views/knowledge-graph.a14ea7e7.js +0 -237
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
"""Browser & web ingestion — Knowledge Graph inputs, not standalone features.
|
|
2
|
+
|
|
3
|
+
v3.6.0 Knowledge Graph First: a public URL or an open browser tab is just another
|
|
4
|
+
source that converges into the Knowledge Graph through the unified ingestion
|
|
5
|
+
pipeline. Everything runs on the **local runtime** — the server fetches/reads
|
|
6
|
+
locally, stores into local SQLite, and never uploads to a cloud service.
|
|
7
|
+
|
|
8
|
+
Two layers, both feeding ``IngestionPipeline.ingest``:
|
|
9
|
+
|
|
10
|
+
* ``POST /api/browser/read-url`` — the runtime fetches a public URL locally,
|
|
11
|
+
extracts readable text, stores it as ``source_type=web_url``. Fails gracefully
|
|
12
|
+
on blocked / login-required pages.
|
|
13
|
+
* ``POST /api/browser/ingest-current-tab`` — accepts a payload from the local
|
|
14
|
+
browser extension (url/title/text/selected_text/html), sanitizes + size-limits
|
|
15
|
+
it, stores it as ``source_type=browser_tab``.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
from html.parser import HTMLParser
|
|
21
|
+
from typing import Any, Callable, Optional, Tuple
|
|
22
|
+
from urllib.parse import urlparse
|
|
23
|
+
|
|
24
|
+
from fastapi import APIRouter, HTTPException, Request
|
|
25
|
+
from pydantic import BaseModel
|
|
26
|
+
|
|
27
|
+
from latticeai.services.ingestion import IngestionItem
|
|
28
|
+
|
|
29
|
+
# Size ceilings (in bytes) and the network timeout used by browser/web ingestion.
MAX_TAB_BYTES = 4 << 20  # 4 MB ceiling for a captured tab payload
MAX_URL_FETCH_BYTES = 4 << 20  # 4 MB ceiling on the body of a fetched page
URL_FETCH_TIMEOUT = 12.0  # seconds allowed for the HTTP fetch
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class BrowserFetchError(Exception):
    """Signal that a page could not be retrieved or read.

    Raised for blocked or login-required pages, timeouts and other
    unreadable responses so callers can report a graceful failure.
    """
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# ── readable-text extraction ─────────────────────────────────────────────────
|
|
39
|
+
_SKIP_TAGS = {"script", "style", "noscript", "template", "svg", "head"}


class _TextExtractor(HTMLParser):
    """Collect the document title and the visible text of an HTML page.

    Text inside any tag listed in ``_SKIP_TAGS`` is ignored. The first
    ``<title>`` found outside inline SVG becomes :attr:`title`; later titles
    (for example accessible names of SVG icons) are not appended to it.
    """

    # Tags that end a line of text so paragraphs stay separated in the output.
    _BREAK_TAGS = frozenset(
        {"p", "div", "br", "li", "h1", "h2", "h3", "h4", "section", "article"}
    )

    def __init__(self) -> None:
        super().__init__(convert_charrefs=True)
        self._skip_depth = 0
        self._svg_depth = 0
        self._chunks: list[str] = []
        self.title: str = ""
        self._in_title = False
        self._title_done = False

    def handle_starttag(self, tag, attrs):
        if tag == "body":
            # ``</head>`` may legally be omitted; once the body opens, nothing
            # from the head can still be open, so stop skipping.
            self._skip_depth = 0
            self._svg_depth = 0
        if tag in _SKIP_TAGS:
            self._skip_depth += 1
        if tag == "svg":
            self._svg_depth += 1
        elif tag == "title":
            # Only the first document-level title counts as the page title.
            if self._svg_depth == 0 and not self._title_done:
                self._in_title = True
        elif tag == "br":
            # ``<br>`` is a void element: it has no end tag, so the line
            # break has to be emitted when the start tag is seen.
            self._chunks.append("\n")

    def handle_endtag(self, tag):
        if tag in _SKIP_TAGS and self._skip_depth > 0:
            self._skip_depth -= 1
        if tag == "svg" and self._svg_depth > 0:
            self._svg_depth -= 1
        if tag == "title":
            if self._in_title:
                self._title_done = True
            self._in_title = False
        if tag in self._BREAK_TAGS:
            self._chunks.append("\n")

    def handle_data(self, data):
        if self._in_title:
            self.title += data
        if self._skip_depth == 0:
            text = data.strip()
            if text:
                self._chunks.append(text)

    def text(self) -> str:
        """Return the collected text, one non-empty stripped line per row."""
        raw = " ".join(self._chunks)
        # Trim every line and drop empty ones while keeping paragraph breaks.
        lines = [ln.strip() for ln in raw.replace("\r", "").split("\n")]
        return "\n".join([ln for ln in lines if ln]).strip()


def extract_readable_text(html: str) -> Tuple[str, str]:
    """Return ``(title, readable_text)`` from an HTML string. Never raises.

    ``None`` or an empty string yields ``("", "")``. Parsing is best-effort:
    if the parser fails, whatever was collected up to that point is returned.
    """
    parser = _TextExtractor()
    try:
        parser.feed(html or "")
        # ``close()`` flushes buffered trailing text; without it a tail such
        # as ``Q&A`` (an unterminated ``&``) is silently dropped.
        parser.close()
    except Exception:  # noqa: BLE001 — malformed HTML must still yield best-effort text
        pass
    return parser.title.strip(), parser.text()
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _default_fetch_url(url: str) -> Tuple[str, str]:
    """Fetch a public URL on the local runtime and extract readable text.

    The response is streamed and reading stops once ``MAX_URL_FETCH_BYTES``
    have been received, so an oversized page is truncated instead of being
    loaded into memory in full.

    Returns:
        ``(title, text)``; the title falls back to ``url`` when the page has none.

    Raises:
        BrowserFetchError: on any non-success (unreachable or invalid URL,
            timeout, HTTP 4xx/5xx, non-text content type) so the route can
            fail gracefully.
    """
    import httpx

    payload = bytearray()
    encoding = "utf-8"
    try:
        with httpx.Client(
            follow_redirects=True, timeout=URL_FETCH_TIMEOUT,
            headers={"User-Agent": "LatticeAI-local/3.6 (+local-first knowledge graph)"},
        ) as client, client.stream("GET", url) as resp:
            # Status and headers are available before the body is read, so
            # unusable responses are rejected without downloading anything.
            if resp.status_code in (401, 403):
                raise BrowserFetchError(
                    "The page is login-required or blocked (HTTP %s)." % resp.status_code
                )
            if resp.status_code >= 400:
                raise BrowserFetchError(f"The page returned HTTP {resp.status_code}.")
            content_type = resp.headers.get("content-type", "")
            if content_type and "html" not in content_type and "text" not in content_type:
                raise BrowserFetchError(f"Unsupported content type: {content_type}.")
            encoding = resp.encoding or "utf-8"
            for chunk in resp.iter_bytes():
                payload += chunk
                if len(payload) >= MAX_URL_FETCH_BYTES:
                    break  # cap reached: stop reading; the context exit closes the stream
    except (httpx.HTTPError, httpx.InvalidURL) as exc:
        # InvalidURL is not an HTTPError subclass, so it is listed explicitly.
        raise BrowserFetchError(f"Could not reach the page: {exc}") from exc

    raw = bytes(payload[:MAX_URL_FETCH_BYTES])
    try:
        body = raw.decode(encoding, "replace")
    except LookupError:
        # The server declared a charset Python does not know.
        body = raw.decode("utf-8", "replace")
    title, text = extract_readable_text(body)
    return (title or url, text)
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def _validate_http_url(url: str) -> str:
    """Return ``url`` stripped of surrounding whitespace if it is a usable http(s) URL.

    Raises:
        HTTPException: status 400 when the URL is empty, cannot be parsed,
            uses a scheme other than http/https, or has no host.
    """
    url = (url or "").strip()
    if not url:
        raise HTTPException(status_code=400, detail="url is required.")
    try:
        # urlparse raises ValueError for inputs such as "http://[::1"; that is
        # a client error, not a server fault.
        parsed = urlparse(url)
        hostname = parsed.hostname
    except ValueError:
        raise HTTPException(status_code=400, detail="Malformed URL.") from None
    if parsed.scheme not in ("http", "https"):
        raise HTTPException(status_code=400, detail="Only http(s) URLs are supported.")
    # A netloc such as ":80" or "user@" carries no host and cannot be fetched.
    if not parsed.netloc or not hostname:
        raise HTTPException(status_code=400, detail="Malformed URL.")
    return url
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
# ── request models ───────────────────────────────────────────────────────────
|
|
133
|
+
class ReadUrlRequest(BaseModel):
    """Request body for ``POST /api/browser/read-url``."""

    # Address of the page the runtime should fetch.
    url: str
    # Optional workspace the ingested page is attributed to.
    workspace_id: Optional[str] = None
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
class IngestTabRequest(BaseModel):
    """Request body for ``POST /api/browser/ingest-current-tab``.

    Carries what the browser extension captured from the open tab. Only
    ``url`` is mandatory; the content fields are all optional.
    """

    # Address of the captured tab.
    url: str
    # Page title as reported by the extension.
    title: Optional[str] = None
    # Already-extracted page text.
    text: Optional[str] = None
    # Text the user had selected in the tab.
    selected_text: Optional[str] = None
    # Raw page markup.
    html: Optional[str] = None
    # Capture timestamp supplied by the extension.
    captured_at: Optional[str] = None
    # Optional workspace the capture is attributed to.
    workspace_id: Optional[str] = None
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def create_browser_router(
    *,
    pipeline: Any,
    require_user: Callable[[Request], str],
    fetch_url: Optional[Callable[[str], Tuple[str, str]]] = None,
    max_tab_bytes: int = MAX_TAB_BYTES,
) -> APIRouter:
    """Build the router exposing the browser / web ingestion endpoints.

    Args:
        pipeline: Ingestion pipeline exposing ``available()`` and ``ingest()``.
        require_user: Resolves the signed-in user for a request; its return
            value is used as both owner and ``user_email``.
        fetch_url: Optional replacement for the default URL fetcher; takes a
            URL and returns ``(title, text)``.
        max_tab_bytes: UTF-8 size limit applied to each of the ``text``,
            ``html`` and ``selected_text`` fields of a captured tab.

    Returns:
        A router with ``POST /api/browser/read-url`` and
        ``POST /api/browser/ingest-current-tab``.
    """
    from fastapi.concurrency import run_in_threadpool

    router = APIRouter()
    _fetch = fetch_url or _default_fetch_url

    def _require_pipeline():
        if pipeline is None or not pipeline.available():
            raise HTTPException(status_code=503, detail="Knowledge Graph ingestion is disabled.")

    @router.post("/api/browser/read-url")
    async def read_url(req: ReadUrlRequest, request: Request):
        """Fetch a public URL locally and ingest it as a web_url source."""
        user = require_user(request)
        _require_pipeline()
        url = _validate_http_url(req.url)
        try:
            # The fetcher is synchronous network I/O; running it in a worker
            # thread keeps the event loop free while the page is fetched.
            title, text = await run_in_threadpool(_fetch, url)
        except BrowserFetchError as exc:
            # Graceful failure — not a 5xx; the page was simply unreadable.
            raise HTTPException(status_code=422, detail=str(exc)) from exc
        if not (text or "").strip():
            return {"status": "empty", "source_type": "web_url", "url": url,
                    "detail": "No readable text was extracted from the page."}
        res = pipeline.ingest(
            IngestionItem(
                source_type="web_url", title=title, text=text, source_uri=url,
                owner=user, workspace_id=req.workspace_id,
            ),
            user_email=user,
        )
        return res.as_dict()

    @router.post("/api/browser/ingest-current-tab")
    async def ingest_current_tab(req: IngestTabRequest, request: Request):
        """Ingest a payload captured from the local browser extension."""
        user = require_user(request)
        _require_pipeline()
        url = _validate_http_url(req.url)
        # Sanitize: reject an oversized payload before doing any work.
        for value in (req.text, req.html, req.selected_text):
            if value and len(value.encode("utf-8", "ignore")) > max_tab_bytes:
                raise HTTPException(status_code=413, detail="Captured payload is too large.")
        # Content preference: explicit text, then text parsed from the HTML,
        # then the user's selection.
        text = (req.text or "").strip()
        html_title = ""
        if not text and req.html:
            html_title, text = extract_readable_text(req.html)
        if not text:
            text = (req.selected_text or "").strip()
        if not text:
            raise HTTPException(status_code=400, detail="No text, html, or selected_text provided.")
        res = pipeline.ingest(
            IngestionItem(
                source_type="browser_tab",
                # Prefer the extension's title, then the one parsed from the
                # captured HTML, and only then fall back to the URL.
                title=req.title or html_title or url,
                text=text,
                source_uri=url,
                captured_at=req.captured_at,
                owner=user,
                workspace_id=req.workspace_id,
                metadata={"has_selection": bool(req.selected_text)},
            ),
            user_email=user,
        )
        return res.as_dict()

    return router
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
"""Knowledge Graph portability routes — local export / import / backup / restore.
|
|
2
|
+
|
|
3
|
+
Reads (export, status) require a signed-in user. Mutating operations (import,
|
|
4
|
+
backup, restore, file export) require admin because the graph is machine-global,
|
|
5
|
+
not workspace-scoped. Nothing here touches a cloud service.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from typing import Any, Callable, Optional
|
|
11
|
+
|
|
12
|
+
from fastapi import APIRouter, HTTPException, Request
|
|
13
|
+
from pydantic import BaseModel
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class ImportRequest(BaseModel):
    """Request body for ``POST /api/knowledge-graph/import``."""

    # Previously exported graph artifact to load.
    artifact: dict
    # Import strategy handed to the portability service; defaults to "merge".
    mode: str = "merge"
    # Forwarded to the service as ``dry_run``.
    dry_run: bool = False
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class BackupRequest(BaseModel):
    """Request body for ``POST /api/knowledge-graph/backup``."""

    # Destination handed to the service; ``None`` lets the service decide.
    path: Optional[str] = None
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class RestoreRequest(BaseModel):
    """Request body for ``POST /api/knowledge-graph/restore``."""

    # Location of the backup to restore from.
    path: str
    # Forwarded to the service as ``verify``; on by default.
    verify: bool = True
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def create_portability_router(
    *,
    service: Any,
    require_user: Callable[[Request], str],
    require_admin: Callable[[Request], Any],
) -> APIRouter:
    """Build the router for Knowledge Graph export / import / backup / restore.

    Args:
        service: Portability service; every route answers 503 when it is
            ``None`` or reports itself unavailable.
        require_user: Guard for the read routes (status, provenance, export).
        require_admin: Guard for the routes that write files or mutate the
            graph (export-file, import, backup, restore).

    Returns:
        A router exposing the ``/api/knowledge-graph/*`` portability routes.
    """
    router = APIRouter()
    # Ceiling for the provenance listing so one request cannot ask for an
    # unbounded (or negative) number of rows.
    max_provenance_limit = 1000

    def _require_service():
        if service is None or not service.available():
            raise HTTPException(status_code=503, detail="Knowledge Graph is disabled.")

    @router.get("/api/knowledge-graph/portability")
    async def portability_status(request: Request):
        """Return the service's snapshot metadata."""
        require_user(request)
        _require_service()
        return service.snapshot_metadata()

    @router.get("/api/knowledge-graph/provenance")
    async def recent_provenance(request: Request, limit: int = 50, source_type: Optional[str] = None):
        """Recent ingestions (provenance trail) for the ingestion-sources UI."""
        require_user(request)
        _require_service()
        bounded = max(1, min(limit, max_provenance_limit))
        return service.recent_ingestions(limit=bounded, source_type=source_type)

    @router.post("/api/knowledge-graph/export")
    async def export_graph(request: Request):
        """Logical JSON export of the whole graph (read-only)."""
        require_user(request)
        _require_service()
        return service.export()

    @router.post("/api/knowledge-graph/export-file")
    async def export_graph_file(request: Request):
        """Have the service write the export to a file (admin only)."""
        require_admin(request)
        _require_service()
        return service.export_to_file()

    @router.post("/api/knowledge-graph/import")
    async def import_graph(req: ImportRequest, request: Request):
        """Import an exported artifact (admin only); bad input maps to 400."""
        require_admin(request)
        _require_service()
        try:
            return service.import_data(req.artifact, mode=req.mode, dry_run=req.dry_run)
        except ValueError as exc:
            raise HTTPException(status_code=400, detail=str(exc)) from exc

    @router.post("/api/knowledge-graph/backup")
    async def backup_graph(req: BackupRequest, request: Request):
        """Back the graph up (admin only); an unusable path maps to 400."""
        require_admin(request)
        _require_service()
        try:
            return service.backup(req.path)
        except (ValueError, OSError) as exc:
            # The destination is caller-supplied, so a bad path is reported
            # as a client error, matching the restore route.
            raise HTTPException(status_code=400, detail=str(exc)) from exc

    @router.post("/api/knowledge-graph/restore")
    async def restore_graph(req: RestoreRequest, request: Request):
        """Restore the graph from a backup (admin only); bad input maps to 400."""
        require_admin(request)
        _require_service()
        try:
            return service.restore(req.path, verify=req.verify)
        except (ValueError, FileNotFoundError) as exc:
            raise HTTPException(status_code=400, detail=str(exc)) from exc

    return router
|
|
@@ -14,7 +14,7 @@ from datetime import datetime
|
|
|
14
14
|
from typing import Any, Callable, Dict, List, Optional
|
|
15
15
|
|
|
16
16
|
|
|
17
|
-
MULTI_AGENT_VERSION = "3.
|
|
17
|
+
MULTI_AGENT_VERSION = "3.6.0"
|
|
18
18
|
|
|
19
19
|
AGENT_ROLES = ("researcher", "planner", "executor", "reviewer", "release")
|
|
20
20
|
CORE_PIPELINE = ("planner", "executor", "reviewer")
|
|
@@ -18,7 +18,7 @@ from pathlib import Path
|
|
|
18
18
|
from typing import Any, Callable, Dict, Iterable, List, Optional
|
|
19
19
|
|
|
20
20
|
|
|
21
|
-
WORKSPACE_OS_VERSION = "3.
|
|
21
|
+
WORKSPACE_OS_VERSION = "3.6.0"
|
|
22
22
|
|
|
23
23
|
# Workspace types separate single-user Personal workspaces from shared
|
|
24
24
|
# Organization workspaces. Both keep the same local-first JSON store; the type
|
package/latticeai/server_app.py
CHANGED
|
@@ -120,7 +120,11 @@ from latticeai.core.builtin_hooks import register_builtin_hook_runners
|
|
|
120
120
|
from latticeai.api.agent_registry import create_agent_registry_router
|
|
121
121
|
from latticeai.core.agent_registry import AgentRegistry
|
|
122
122
|
from latticeai.api.memory import create_memory_router
|
|
123
|
+
from latticeai.api.browser import create_browser_router
|
|
124
|
+
from latticeai.api.portability import create_portability_router
|
|
123
125
|
from latticeai.services.memory_service import MemoryService
|
|
126
|
+
from latticeai.services.ingestion import IngestionPipeline
|
|
127
|
+
from latticeai.services.kg_portability import KGPortabilityService
|
|
124
128
|
from latticeai.services.tool_dispatch import (
|
|
125
129
|
LOCAL_WRITE_BLOCKED_PREFIXES as _LOCAL_WRITE_BLOCKED_PREFIXES,
|
|
126
130
|
TOOL_GOVERNANCE,
|
|
@@ -322,6 +326,23 @@ MEMORY_SERVICE = MemoryService(
|
|
|
322
326
|
enable_graph=ENABLE_GRAPH,
|
|
323
327
|
history_file=HISTORY_FILE,
|
|
324
328
|
)
|
|
329
|
+
# ── v3.6.0 unified ingestion pipeline: the single write-side seam into the
|
|
330
|
+
# Knowledge Graph. Every new source (web URL, browser tab, …) flows through this
|
|
331
|
+
# so pre_tool/post_tool hooks fire on ingestion and provenance is captured
|
|
332
|
+
# uniformly. Existing direct ingest callers keep working; new paths converge here.
|
|
333
|
+
INGESTION_PIPELINE = IngestionPipeline(
|
|
334
|
+
KNOWLEDGE_GRAPH,
|
|
335
|
+
hooks=HOOKS_REGISTRY,
|
|
336
|
+
enable_graph=ENABLE_GRAPH,
|
|
337
|
+
audit=lambda action, detail, user: append_audit_event(action, user_email=user, **detail),
|
|
338
|
+
)
|
|
339
|
+
# ── v3.6.0 Knowledge Graph portability: local export / import / backup / restore.
|
|
340
|
+
# The graph is the user's durable asset, so it must be portable with no cloud.
|
|
341
|
+
KG_PORTABILITY = KGPortabilityService(
|
|
342
|
+
knowledge_graph=KNOWLEDGE_GRAPH,
|
|
343
|
+
data_dir=DATA_DIR,
|
|
344
|
+
enable_graph=ENABLE_GRAPH,
|
|
345
|
+
)
|
|
325
346
|
|
|
326
347
|
def _require_graph():
|
|
327
348
|
if not ENABLE_GRAPH or KNOWLEDGE_GRAPH is None:
|
|
@@ -1508,6 +1529,17 @@ app.include_router(create_memory_router(
|
|
|
1508
1529
|
append_audit_event=append_audit_event,
|
|
1509
1530
|
))
|
|
1510
1531
|
|
|
1532
|
+
app.include_router(create_browser_router(
|
|
1533
|
+
pipeline=INGESTION_PIPELINE,
|
|
1534
|
+
require_user=require_user,
|
|
1535
|
+
))
|
|
1536
|
+
|
|
1537
|
+
app.include_router(create_portability_router(
|
|
1538
|
+
service=KG_PORTABILITY,
|
|
1539
|
+
require_user=require_user,
|
|
1540
|
+
require_admin=require_admin,
|
|
1541
|
+
))
|
|
1542
|
+
|
|
1511
1543
|
app.include_router(create_garden_router(gardener=gardener, require_user=require_user))
|
|
1512
1544
|
app.include_router(create_setup_router(model_router=router, require_user=require_user))
|
|
1513
1545
|
|
|
@@ -0,0 +1,271 @@
|
|
|
1
|
+
"""Unified ingestion pipeline — the single write-side seam into the Knowledge Graph.
|
|
2
|
+
|
|
3
|
+
v3.6.0 Knowledge Graph First principle: *no data source bypasses the Knowledge
|
|
4
|
+
Graph and no source creates an isolated silo*. Every source — local files,
|
|
5
|
+
connected folders, PDFs/Markdown/text/code, web URLs, browser tabs — is
|
|
6
|
+
normalized into one :class:`IngestionItem` and pushed through one
|
|
7
|
+
:meth:`IngestionPipeline.ingest` entrypoint:
|
|
8
|
+
|
|
9
|
+
Source → normalize → content hash → (file | text) ingest → provenance
|
|
10
|
+
|
|
11
|
+
The pipeline is deliberately thin. It owns normalization, idempotency reporting,
|
|
12
|
+
provenance capture, and — crucially — routing every ingest through the shared
|
|
13
|
+
``dispatch_tool`` lifecycle so ``pre_tool``/``post_tool`` hooks fire on data
|
|
14
|
+
ingestion exactly as they do on tool calls. The heavy graph construction lives in
|
|
15
|
+
:class:`knowledge_graph.KnowledgeGraphStore` (``ingest_document`` for files,
|
|
16
|
+
``ingest_source`` for text/web), which this module composes rather than
|
|
17
|
+
re-implements.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import hashlib
|
|
23
|
+
from dataclasses import dataclass, field
|
|
24
|
+
from datetime import datetime, timezone
|
|
25
|
+
from pathlib import Path
|
|
26
|
+
from typing import Any, Dict, List, Optional
|
|
27
|
+
|
|
28
|
+
from latticeai.core.hooks import dispatch_tool
|
|
29
|
+
|
|
30
|
+
# File-backed source types: routed to ``ingest_document`` with a path on disk.
FILE_SOURCE_TYPES = frozenset(("file", "local_file", "upload", "pdf"))
# Text-backed source types: routed to ``ingest_source`` as extracted text.
TEXT_SOURCE_TYPES = frozenset(
    ("web_url", "browser_tab", "text", "markdown", "note", "code", "clipboard")
)

DEFAULT_MAX_TEXT_BYTES = 5 << 20  # 5 MB of extracted text per item
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _now_iso() -> str:
|
|
41
|
+
return datetime.now(timezone.utc).isoformat()
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@dataclass
class IngestionItem:
    """One unit of content to ingest, in the shape shared by all source types.

    Only ``source_type`` is required. Text-like sources carry ``text``;
    file-like sources carry ``path``.
    """

    # Kind of source, e.g. "web_url", "browser_tab" or "file".
    source_type: str
    title: Optional[str] = None
    # Content for text/web sources.
    text: Optional[str] = None
    # Location on disk for file sources.
    path: Optional[str] = None
    source_uri: Optional[str] = None
    mime_type: Optional[str] = None
    owner: Optional[str] = None
    workspace_id: Optional[str] = None
    permissions: Optional[Dict[str, Any]] = None
    captured_at: Optional[str] = None
    modified_at: Optional[str] = None
    conversation_id: Optional[str] = None
    agent_used: Optional[str] = None
    # Free-form extras; every item gets its own dict.
    metadata: Dict[str, Any] = field(default_factory=dict)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
@dataclass
class IngestionResult:
    """What happened to one ingested item, with provenance and duplicate info."""

    # One of: ok | unavailable | blocked | failed
    status: str
    source_type: str
    node_id: Optional[str] = None
    source_node_id: Optional[str] = None
    content_hash: Optional[str] = None
    title: Optional[str] = None
    chunk_ids: List[str] = field(default_factory=list)
    chunk_count: int = 0
    duplicate: bool = False
    embedded: bool = False
    # One of: indexed | skipped | failed | pending
    indexing_status: str = "pending"
    provenance_id: Optional[str] = None
    detail: Optional[str] = None

    def as_dict(self) -> Dict[str, Any]:
        """Return the result as a plain dict keyed by field name."""
        keys = (
            "status",
            "source_type",
            "node_id",
            "source_node_id",
            "content_hash",
            "title",
            "chunk_ids",
            "chunk_count",
            "duplicate",
            "embedded",
            "indexing_status",
            "provenance_id",
            "detail",
        )
        return {key: getattr(self, key) for key in keys}
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
class IngestionPipeline:
    """Single normalized entrypoint that feeds every source into the graph.

    ``ingest`` never raises: every outcome, including a failure of the
    provenance bookkeeping that follows a successful write, is reported
    through the returned :class:`IngestionResult`.
    """

    def __init__(
        self,
        knowledge_graph: Any,
        *,
        hooks: Any = None,
        enable_graph: bool = True,
        audit: Optional[Any] = None,
        max_text_bytes: int = DEFAULT_MAX_TEXT_BYTES,
        pipeline_name: str = "unified-ingestion",
    ) -> None:
        """Store the collaborators.

        Args:
            knowledge_graph: Graph store the pipeline writes into.
            hooks: Hook registry handed to ``dispatch_tool``.
            enable_graph: When false the pipeline reports itself unavailable.
            audit: Optional callable invoked as ``audit(action, detail, user)``.
            max_text_bytes: UTF-8 size limit for a text payload.
            pipeline_name: Name recorded with each provenance entry.
        """
        self._kg = knowledge_graph
        self._hooks = hooks
        self._enable = bool(enable_graph)
        self._audit = audit
        self._max_text_bytes = int(max_text_bytes)
        self._pipeline_name = pipeline_name

    def available(self) -> bool:
        """Return whether the graph is enabled and a store is configured."""
        return self._enable and self._kg is not None

    # ── public API ───────────────────────────────────────────────────────────
    def ingest(self, item: IngestionItem, *, user_email: Optional[str] = None) -> IngestionResult:
        """Normalize, hash, route through dispatch_tool, and record provenance.

        Returns:
            A result whose ``status`` is ``unavailable`` (graph disabled),
            ``blocked`` (``PermissionError`` from the dispatch), ``failed``
            (any other error during the dispatch) or ``ok``. An ``ok`` result
            carries a ``detail`` message and no ``provenance_id`` when the
            content was written but provenance could not be recorded.
        """
        source_type = str(item.source_type or "text").strip()
        if not self.available():
            return IngestionResult(
                status="unavailable", source_type=source_type,
                indexing_status="skipped",
                detail="Knowledge Graph is disabled (LATTICEAI_ENABLE_GRAPH).",
            )

        captured_at = item.captured_at or _now_iso()
        owner = item.owner or user_email
        tool_name = f"kg_ingest.{source_type}"
        args = {
            "source_type": source_type,
            "source_uri": item.source_uri,
            "owner": owner,
            "workspace_id": item.workspace_id,
        }

        def _run() -> Dict[str, Any]:
            # File-type sources, and items that only carry a path, are read
            # from disk; everything else is ingested as text.
            if source_type in FILE_SOURCE_TYPES or (item.path and not item.text):
                return self._ingest_file(item, source_type=source_type, owner=owner, captured_at=captured_at)
            return self._ingest_text(item, source_type=source_type, owner=owner, captured_at=captured_at)

        try:
            raw = dispatch_tool(
                self._hooks, tool_name, args, _run,
                user_email=user_email, workspace_id=item.workspace_id, source="ingestion",
            )
        except PermissionError as exc:
            return IngestionResult(
                status="blocked", source_type=source_type,
                indexing_status="skipped", detail=str(exc),
            )
        except Exception as exc:  # noqa: BLE001 — surface as a failed result, never crash the caller
            # Missing files, oversized payloads and store errors all land here.
            return IngestionResult(
                status="failed", source_type=source_type,
                indexing_status="failed", detail=str(exc),
            )

        node_id = raw.get("node_id")
        content_hash = raw.get("content_hash") or raw.get("sha256")
        chunk_ids = list(raw.get("chunk_ids") or [])
        title = raw.get("title") or item.title
        duplicate = bool(raw.get("duplicate"))

        embedded = False
        provenance_id = None
        detail = None
        try:
            embedded = bool(self._kg.node_is_embedded(node_id)) if node_id else False
            prov = self._kg.record_provenance(
                node_id=node_id,
                source_type=source_type,
                pipeline=self._pipeline_name,
                source_uri=item.source_uri,
                content_hash=content_hash,
                title=title,
                owner=owner,
                workspace_id=item.workspace_id,
                captured_at=captured_at,
                modified_at=item.modified_at,
                embedded=embedded,
                linked=bool(raw.get("source_node_id")),
                duplicate=duplicate,
                agent_used=item.agent_used,
                chunk_count=len(chunk_ids),
                permissions=item.permissions,
                metadata=item.metadata,
            )
            provenance_id = (prov or {}).get("id")
        except Exception as exc:  # noqa: BLE001 — the content is already stored; report, don't raise
            detail = f"Ingested, but provenance could not be recorded: {exc}"

        if self._audit is not None:
            try:
                self._audit(
                    "kg_ingest",
                    {
                        "source_type": source_type, "node_id": node_id,
                        "content_hash": content_hash, "duplicate": duplicate,
                    },
                    user_email,
                )
            except Exception:  # noqa: BLE001 — audit must never break ingestion
                pass

        return IngestionResult(
            status="ok",
            source_type=source_type,
            node_id=node_id,
            source_node_id=raw.get("source_node_id"),
            content_hash=content_hash,
            title=title,
            chunk_ids=chunk_ids,
            chunk_count=len(chunk_ids),
            duplicate=duplicate,
            embedded=embedded,
            indexing_status="indexed",
            provenance_id=provenance_id,
            detail=detail,
        )

    # ── routing helpers ──────────────────────────────────────────────────────
    def _ingest_text(self, item, *, source_type, owner, captured_at) -> Dict[str, Any]:
        """Hand a text payload to ``ingest_source``.

        Raises:
            ValueError: if the UTF-8 encoded text exceeds the configured limit.
        """
        text = item.text or ""
        if len(text.encode("utf-8", "ignore")) > self._max_text_bytes:
            raise ValueError(
                f"Text payload exceeds the {self._max_text_bytes // (1024 * 1024)}MB ingestion limit."
            )
        title = item.title or item.source_uri or source_type
        return self._kg.ingest_source(
            source_type=source_type,
            title=title,
            text=text,
            source_uri=item.source_uri,
            owner=owner,
            workspace_id=item.workspace_id,
            permissions=item.permissions,
            captured_at=captured_at,
            modified_at=item.modified_at,
            conversation_id=item.conversation_id,
            # Caller-supplied metadata wins over the derived mime_type key.
            metadata={"mime_type": item.mime_type, **(item.metadata or {})},
        )

    def _ingest_file(self, item, *, source_type, owner, captured_at) -> Dict[str, Any]:
        """Hand a file on disk to ``ingest_document``.

        Raises:
            ValueError: if the item has no path.
            FileNotFoundError: if the path does not exist.
        """
        if not item.path:
            raise ValueError("File ingestion requires a path.")
        path = Path(item.path)
        if not path.exists():
            raise FileNotFoundError(f"File not found: {path}")
        return self._kg.ingest_document(
            path,
            original_filename=item.title or path.name,
            mime_type=item.mime_type,
            uploader=owner,
            conversation_id=item.conversation_id,
            extracted=item.metadata.get("extracted") if item.metadata else None,
            source_type=source_type,
            source_uri=item.source_uri or str(path),
            captured_at=captured_at,
            modified_at=item.modified_at,
            owner=owner,
            workspace_id=item.workspace_id,
            permissions=item.permissions,
        )
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
def content_hash_text(text: str) -> str:
    """Return the hex SHA-256 digest of ``text`` encoded as UTF-8.

    ``None`` and the empty string both hash as the empty payload.
    NOTE(review): meant to match the store's hashing scheme — confirm against
    the store implementation.
    """
    payload = (text or "").encode("utf-8", "ignore")
    digest = hashlib.sha256()
    digest.update(payload)
    return digest.hexdigest()
|