ltcai 3.4.1 → 3.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. package/README.md +206 -247
  2. package/docs/CARRYOVER_AUDIT_v3.6.0.md +61 -0
  3. package/docs/CHANGELOG.md +32 -0
  4. package/docs/HANDOVER_v3.6.0.md +46 -0
  5. package/docs/RUNTIME_HOOK_COVERAGE_v3.5.0.md +56 -0
  6. package/docs/RUNTIME_HOOK_COVERAGE_v3.6.0.md +49 -0
  7. package/docs/architecture.md +13 -12
  8. package/docs/kg-schema.md +55 -0
  9. package/docs/privacy.md +18 -2
  10. package/docs/security-model.md +17 -0
  11. package/kg_schema.py +46 -0
  12. package/knowledge_graph.py +520 -1
  13. package/latticeai/__init__.py +1 -1
  14. package/latticeai/api/auth.py +37 -9
  15. package/latticeai/api/browser.py +217 -0
  16. package/latticeai/api/chat.py +4 -1
  17. package/latticeai/api/computer_use.py +21 -8
  18. package/latticeai/api/portability.py +93 -0
  19. package/latticeai/api/tools.py +29 -26
  20. package/latticeai/core/config.py +3 -0
  21. package/latticeai/core/marketplace.py +1 -1
  22. package/latticeai/core/multi_agent.py +1 -1
  23. package/latticeai/core/oidc.py +205 -0
  24. package/latticeai/core/security.py +59 -5
  25. package/latticeai/core/workspace_os.py +1 -1
  26. package/latticeai/server_app.py +39 -0
  27. package/latticeai/services/ingestion.py +271 -0
  28. package/latticeai/services/kg_portability.py +177 -0
  29. package/package.json +5 -4
  30. package/requirements.txt +1 -0
  31. package/scripts/build_vsix.mjs +72 -0
  32. package/scripts/check_python.py +87 -0
  33. package/static/css/reference/account.css +1 -1
  34. package/static/css/reference/admin.css +1 -1
  35. package/static/css/reference/base.css +8 -5
  36. package/static/css/reference/chat.css +8 -8
  37. package/static/css/reference/graph.css +2 -2
  38. package/static/css/responsive.css +2 -2
  39. package/static/v3/asset-manifest.json +9 -9
  40. package/static/v3/css/{lattice.shell.6ceea7c8.css → lattice.shell.8fcc9d33.css} +2 -1
  41. package/static/v3/css/lattice.shell.css +2 -1
  42. package/static/v3/js/{app.d086489d.js → app.c541f955.js} +1 -1
  43. package/static/v3/js/core/{api.12b568ad.js → api.33d6320e.js} +38 -0
  44. package/static/v3/js/core/api.js +38 -0
  45. package/static/v3/js/core/{routes.d214b399.js → routes.2ce3815a.js} +1 -1
  46. package/static/v3/js/core/routes.js +1 -1
  47. package/static/v3/js/core/{shell.d05266f5.js → shell.8c163e0e.js} +2 -2
  48. package/static/v3/js/views/knowledge-graph.a96040a5.js +513 -0
  49. package/static/v3/js/views/knowledge-graph.js +293 -17
  50. package/static/workspace.css +1 -1
  51. package/tools/__init__.py +276 -0
  52. package/tools/commands.py +188 -0
  53. package/tools/computer.py +185 -0
  54. package/tools/documents.py +243 -0
  55. package/tools/filesystem.py +560 -0
  56. package/tools/knowledge.py +97 -0
  57. package/tools/local_files.py +69 -0
  58. package/tools/network.py +66 -0
  59. package/static/v3/js/views/knowledge-graph.a14ea7e7.js +0 -237
  60. package/tools.py +0 -1525
@@ -0,0 +1,205 @@
1
+ """Self-contained OIDC ID-token validation.
2
+
3
+ A JWT is ``base64url(header).base64url(claims).base64url(signature)``. The login
4
+ flow must *never* trust a decoded payload on its own — without verifying the
5
+ signature an attacker can forge any ``email``/``sub`` claim. This module verifies
6
+ the RSA signature against the provider's published JWKS and then validates the
7
+ standard registered claims plus the login ``nonce``.
8
+
9
+ Design goals:
10
+
11
+ * No third-party JWT dependency — only the standard library plus ``cryptography``
12
+ (already a transitive dependency, and pinned explicitly in ``pyproject.toml``).
13
+ * **Fail-closed**: any anomaly raises :class:`OIDCValidationError`; the caller
14
+ must reject the login. There is no "best effort accept".
15
+ * Asymmetric algorithms only (``RS256``/``RS384``/``RS512``). ``alg: none`` and
16
+ symmetric ``HS*`` tokens are rejected outright — the classic OIDC bypasses.
17
+ * Pure and injectable: :func:`verify_id_token` takes the JWKS and clock as
18
+ arguments so every rejection path is unit-testable offline.
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ import base64
24
+ import json
25
+ import time
26
+ from typing import Any, Dict, List, Optional
27
+
28
+
29
class OIDCValidationError(Exception):
    """Signal that an OIDC ID token was rejected during validation.

    The validation helpers in this module raise this type when a token fails a
    check; callers are expected to treat it as "reject the login".
    """
31
+
32
+
33
+ # Asymmetric RSA algorithms only. Excluding HS*/none is deliberate: an attacker
34
+ # who can set ``alg`` must not be able to downgrade to a symmetric or unsigned
35
+ # token. Maps JWT alg name → cryptography hash name.
36
+ _ALLOWED_ALGS: Dict[str, str] = {"RS256": "sha256", "RS384": "sha384", "RS512": "sha512"}
37
+
38
+
39
+ def _b64url_decode(segment: str) -> bytes:
40
+ if not isinstance(segment, str) or not segment:
41
+ raise OIDCValidationError("empty JWT segment")
42
+ padding = "=" * (-len(segment) % 4)
43
+ try:
44
+ return base64.urlsafe_b64decode(segment + padding)
45
+ except Exception as exc: # malformed base64
46
+ raise OIDCValidationError(f"invalid base64url segment: {exc}")
47
+
48
+
49
def _b64url_uint(value: str) -> int:
    """Decode a base64url segment and read its bytes as a big-endian unsigned int."""
    raw = _b64url_decode(value)
    return int.from_bytes(raw, byteorder="big")
51
+
52
+
53
+ def _split(token: str) -> List[str]:
54
+ parts = str(token or "").split(".")
55
+ if len(parts) != 3 or not all(parts):
56
+ raise OIDCValidationError("malformed JWT: expected three non-empty segments")
57
+ return parts
58
+
59
+
60
def decode_unverified_header(token: str) -> Dict[str, Any]:
    """Return the JWT header as a dict WITHOUT verifying the signature.

    The result is untrusted input: it is only suitable for choosing which key
    and algorithm to attempt verification with.

    Raises:
        OIDCValidationError: if the token is malformed, the header segment is
            not valid base64url/JSON, or the decoded JSON is not an object.
    """
    encoded_header, _payload, _signature = _split(token)
    try:
        parsed = json.loads(_b64url_decode(encoded_header))
    except Exception as exc:
        raise OIDCValidationError(f"invalid JWT header: {exc}")
    if isinstance(parsed, dict):
        return parsed
    raise OIDCValidationError("JWT header is not an object")
70
+
71
+
72
def _public_key_from_jwk(jwk: Dict[str, Any]):
    """Build a ``cryptography`` RSA public key from an RSA JWK.

    Args:
        jwk: A JWK dict with ``kty == "RSA"`` and base64url ``n`` (modulus) and
            ``e`` (public exponent) members.

    Returns:
        The public key object produced by ``RSAPublicNumbers.public_key()``.

    Raises:
        OIDCValidationError: if the JWK is not an RSA key, lacks ``n``/``e``,
            carries undecodable values, or describes numbers that
            ``cryptography`` refuses to turn into a key.
    """
    if not isinstance(jwk, dict) or jwk.get("kty") != "RSA":
        raise OIDCValidationError("unsupported JWK key type (RSA required)")
    if not jwk.get("n") or not jwk.get("e"):
        raise OIDCValidationError("JWK missing modulus/exponent")
    from cryptography.hazmat.primitives.asymmetric.rsa import RSAPublicNumbers

    exponent = _b64url_uint(jwk["e"])
    modulus = _b64url_uint(jwk["n"])
    try:
        return RSAPublicNumbers(exponent, modulus).public_key()
    except (TypeError, ValueError) as exc:
        # cryptography rejects nonsensical numbers with ValueError. Convert it
        # so the module keeps its single fail-closed exception type and the
        # caller's key loop can move on to the next candidate.
        raise OIDCValidationError(f"invalid RSA public numbers in JWK: {exc}") from exc
81
+
82
+
83
+ def _candidate_keys(jwks: Any, kid: Optional[str]) -> List[Dict[str, Any]]:
84
+ keys = jwks.get("keys") if isinstance(jwks, dict) else jwks
85
+ if not isinstance(keys, list) or not keys:
86
+ raise OIDCValidationError("JWKS contains no keys")
87
+ rsa_keys = [k for k in keys if isinstance(k, dict) and k.get("kty") == "RSA"]
88
+ if kid:
89
+ matched = [k for k in rsa_keys if k.get("kid") == kid]
90
+ if not matched:
91
+ raise OIDCValidationError("no JWKS key matches the token 'kid'")
92
+ return matched
93
+ return rsa_keys
94
+
95
+
96
def _verify_signature(token: str, jwks: Any) -> Dict[str, Any]:
    """Verify the token's RSA signature and return its claims.

    The claims JSON is parsed only after a key has verified the signature, so
    unauthenticated payload content is never returned.

    Args:
        token: Compact-serialized JWT.
        jwks: JWKS document or list of JWKs to verify against.

    Returns:
        The decoded claims object.

    Raises:
        OIDCValidationError: on a malformed token, a disallowed ``alg``, no
            candidate key verifying the signature, or invalid claims JSON.
    """
    header_b64, payload_b64, sig_b64 = _split(token)
    header = decode_unverified_header(token)
    alg = header.get("alg")
    # The header is attacker-controlled JSON: ``alg`` may be a list or object,
    # which would make the dict-membership test raise TypeError (unhashable).
    if not isinstance(alg, str) or alg not in _ALLOWED_ALGS:
        # Rejects 'none' and symmetric HS* — the canonical signature-bypass attacks.
        raise OIDCValidationError(f"unsupported or unsafe JWT alg: {alg!r}")

    from cryptography.exceptions import InvalidSignature
    from cryptography.hazmat.primitives import hashes
    from cryptography.hazmat.primitives.asymmetric import padding

    hash_obj = {"sha256": hashes.SHA256(), "sha384": hashes.SHA384(), "sha512": hashes.SHA512()}[
        _ALLOWED_ALGS[alg]
    ]
    signature = _b64url_decode(sig_b64)
    try:
        # The signature covers the exact ASCII bytes of "<header>.<payload>".
        signing_input = f"{header_b64}.{payload_b64}".encode("ascii")
    except UnicodeEncodeError as exc:
        # The payload segment has not been decoded yet, so non-ASCII characters
        # in it would otherwise escape here as a non-OIDC exception.
        raise OIDCValidationError("JWT contains non-ASCII characters") from exc

    for jwk in _candidate_keys(jwks, header.get("kid")):
        try:
            public_key = _public_key_from_jwk(jwk)
            public_key.verify(signature, signing_input, padding.PKCS1v15(), hash_obj)
        except (InvalidSignature, OIDCValidationError, ValueError):
            # A bad signature OR an unusable key: try the next candidate. If
            # none verifies, the function falls through to the final rejection.
            continue
        # Signature verified — now it is safe to parse the claims.
        try:
            claims = json.loads(_b64url_decode(payload_b64))
        except Exception as exc:
            raise OIDCValidationError(f"invalid JWT claims JSON: {exc}") from exc
        if not isinstance(claims, dict):
            raise OIDCValidationError("JWT claims are not an object")
        return claims

    raise OIDCValidationError("signature verification failed against all JWKS keys")
130
+
131
+
132
def _numeric_date(claims: Dict[str, Any], name: str) -> Optional[int]:
    """Return claim ``name`` as an int, ``None`` if absent; reject non-numeric values."""
    value = claims.get(name)
    if value is None:
        return None
    if isinstance(value, bool):
        # bool is an int subclass; ``true`` is not a timestamp.
        raise OIDCValidationError(f"token {name!r} is not a numeric date")
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError) as exc:
        raise OIDCValidationError(f"token {name!r} is not a numeric date") from exc


def verify_id_token(
    id_token: str,
    *,
    jwks: Any,
    issuer: str,
    audience: str,
    nonce: Optional[str] = None,
    now: Optional[float] = None,
    leeway: int = 60,
) -> Dict[str, Any]:
    """Verify an OIDC ID token and return its claims, or raise ``OIDCValidationError``.

    Validates, in order: signature (against ``jwks``, RSA only), ``iss``, ``aud``
    (and ``azp`` when multi-audience), ``exp``, ``iat``/``nbf``, and ``nonce``.
    All checks are fail-closed.

    Args:
        id_token: Compact-serialized ID token.
        jwks: Provider JWKS document or list of JWKs.
        issuer: Expected ``iss`` value; must be non-empty.
        audience: Expected audience (the client id); must be non-empty.
        nonce: Expected ``nonce``; the check is skipped when ``None``.
        now: Current time in epoch seconds; defaults to ``time.time()``.
        leeway: Clock-skew tolerance in seconds applied to ``exp``/``iat``/``nbf``.

    Returns:
        The verified claims.

    Raises:
        OIDCValidationError: if any check fails, including time claims that
            are present but not numeric.
    """
    if not id_token:
        raise OIDCValidationError("missing id_token")
    if not issuer:
        raise OIDCValidationError("issuer not configured")
    if not audience:
        raise OIDCValidationError("audience (client_id) not configured")

    claims = _verify_signature(id_token, jwks)
    current = int(now if now is not None else time.time())

    if claims.get("iss") != issuer:
        raise OIDCValidationError("issuer mismatch")

    aud = claims.get("aud")
    audiences = aud if isinstance(aud, list) else [aud]
    if audience not in audiences:
        raise OIDCValidationError("audience mismatch")
    if isinstance(aud, list) and len(aud) > 1 and claims.get("azp") not in (None, audience):
        raise OIDCValidationError("azp (authorized party) mismatch")

    # Time claims come from token JSON; a string or list must be rejected as a
    # validation failure rather than crash ``int()`` with an unrelated exception.
    exp = _numeric_date(claims, "exp")
    if exp is None:
        raise OIDCValidationError("token missing 'exp'")
    if current > exp + leeway:
        raise OIDCValidationError("token expired")

    iat = _numeric_date(claims, "iat")
    if iat is not None and iat - leeway > current:
        raise OIDCValidationError("token 'iat' is in the future")

    nbf = _numeric_date(claims, "nbf")
    if nbf is not None and nbf - leeway > current:
        raise OIDCValidationError("token not yet valid ('nbf')")

    if nonce is not None and claims.get("nonce") != nonce:
        raise OIDCValidationError("nonce mismatch")

    return claims
186
+
187
+
188
async def fetch_jwks(jwks_uri: str, *, timeout: float = 15.0) -> Dict[str, Any]:
    """Download the provider's JWKS document from ``jwks_uri``.

    Performs a single HTTP GET with ``httpx`` and returns the parsed JSON body.
    Non-2xx responses surface as the exception raised by ``raise_for_status()``.

    Raises:
        OIDCValidationError: if ``jwks_uri`` is empty.
    """
    if not jwks_uri:
        raise OIDCValidationError("discovery document has no 'jwks_uri'")
    import httpx

    async with httpx.AsyncClient() as http:
        response = await http.get(jwks_uri, timeout=timeout)
        response.raise_for_status()
        return response.json()
198
+
199
+
200
+ __all__ = [
201
+ "OIDCValidationError",
202
+ "verify_id_token",
203
+ "fetch_jwks",
204
+ "decode_unverified_header",
205
+ ]
@@ -35,12 +35,66 @@ def host_is_loopback(host: str) -> bool:
35
35
  return False
36
36
 
37
37
 
38
+ # ── Trusted-proxy handling ────────────────────────────────────────────────────
39
+ # ``client_ip`` is the key used for IP rate limiting (login / register) and for
40
+ # audit logging. A forwarded header (``X-Forwarded-For`` / ``CF-Connecting-IP``)
41
+ # is *client-controllable*, so honoring it unconditionally lets anyone spoof
42
+ # their source IP and bypass per-IP rate limits. We therefore trust those headers
43
+ # ONLY when the direct peer is a configured trusted proxy (e.g. the Cloudflare /
44
+ # Vercel edge in front of the app). Default: no trusted proxies → use the peer
45
+ # address, which is the safe, local-first behaviour.
46
+ _FORWARDED_HEADERS = ("CF-Connecting-IP", "X-Forwarded-For")
47
+ _trusted_proxies: List["ipaddress._BaseNetwork"] = []
48
+
49
+
50
def configure_trusted_proxies(values) -> int:
    """Replace the trusted-proxy allowlist and return how many entries parsed.

    ``values`` is either a comma-separated string or an iterable of IPs/CIDRs.
    Blank and unparseable entries are dropped. An empty or falsy value leaves
    the allowlist empty, which turns forwarded-header trust off.
    """
    global _trusted_proxies
    if isinstance(values, str):
        raw_entries = values.split(",")
    else:
        raw_entries = [str(entry) for entry in (values or [])]

    parsed = []
    for entry in [raw.strip() for raw in raw_entries]:
        if entry:
            try:
                # strict=False accepts CIDRs written with host bits set.
                parsed.append(ipaddress.ip_network(entry, strict=False))
            except ValueError:
                pass
    _trusted_proxies = parsed
    return len(parsed)
71
+
72
+
73
+ def _peer_is_trusted_proxy(peer: str) -> bool:
74
+ if not peer or not _trusted_proxies:
75
+ return False
76
+ try:
77
+ addr = ipaddress.ip_address(peer)
78
+ except ValueError:
79
+ return False
80
+ return any(addr in net for net in _trusted_proxies)
81
+
82
+
38
83
def client_ip(request) -> str:
    """Return the client address used for IP rate limiting and audit logging.

    Forwarded headers are consulted only when the direct peer is a configured
    trusted proxy; otherwise the client-supplied header is ignored so per-IP
    rate limits cannot be spoofed.

    When a header holds a chain of addresses it is read right-to-left and the
    first address that is NOT itself a trusted proxy is returned. Proxies append
    to ``X-Forwarded-For``, so the leftmost entries are whatever the client
    sent; only the hops added by trusted proxies are reliable. Every proxy hop
    in front of the app must therefore be listed in the trusted-proxy allowlist.

    Returns:
        The resolved client IP, the peer address, or ``"unknown"`` when the
        request has no client information.
    """
    peer = request.client.host if request.client else ""
    if _peer_is_trusted_proxy(peer):
        for header in _FORWARDED_HEADERS:
            val = request.headers.get(header)
            if not val:
                continue
            innermost_valid = ""
            for hop in reversed(val.split(",")):
                candidate = hop.strip()
                try:
                    ipaddress.ip_address(candidate)
                except ValueError:
                    # Nothing to the left of an unparseable hop can be trusted.
                    break
                innermost_valid = candidate
                if not _peer_is_trusted_proxy(candidate):
                    return candidate
            if innermost_valid:
                # Every parsed hop was a trusted proxy: the request originated
                # inside the trusted network, so report the furthest hop seen.
                return innermost_valid
    return peer or "unknown"
44
98
 
45
99
 
46
100
  _FILE_MAGIC: Dict[str, List[bytes]] = {
@@ -18,7 +18,7 @@ from pathlib import Path
18
18
  from typing import Any, Callable, Dict, Iterable, List, Optional
19
19
 
20
20
 
21
- WORKSPACE_OS_VERSION = "3.4.1"
21
+ WORKSPACE_OS_VERSION = "3.6.0"
22
22
 
23
23
  # Workspace types separate single-user Personal workspaces from shared
24
24
  # Organization workspaces. Both keep the same local-first JSON store; the type
@@ -40,6 +40,7 @@ from latticeai.core.security import (
40
40
  verify_password,
41
41
  host_is_loopback as _host_is_loopback_impl,
42
42
  client_ip as _client_ip_impl,
43
+ configure_trusted_proxies as _configure_trusted_proxies,
43
44
  bytes_match_extension as _bytes_match_extension_impl,
44
45
  redact_secret_text as _redact_secret_text,
45
46
  check_ip_rate_limit as _check_ip_rate_limit,
@@ -119,7 +120,11 @@ from latticeai.core.builtin_hooks import register_builtin_hook_runners
119
120
  from latticeai.api.agent_registry import create_agent_registry_router
120
121
  from latticeai.core.agent_registry import AgentRegistry
121
122
  from latticeai.api.memory import create_memory_router
123
+ from latticeai.api.browser import create_browser_router
124
+ from latticeai.api.portability import create_portability_router
122
125
  from latticeai.services.memory_service import MemoryService
126
+ from latticeai.services.ingestion import IngestionPipeline
127
+ from latticeai.services.kg_portability import KGPortabilityService
123
128
  from latticeai.services.tool_dispatch import (
124
129
  LOCAL_WRITE_BLOCKED_PREFIXES as _LOCAL_WRITE_BLOCKED_PREFIXES,
125
130
  TOOL_GOVERNANCE,
@@ -157,6 +162,12 @@ from datetime import datetime
157
162
  CONFIG = Config.from_env()
158
163
  APP_VERSION = WORKSPACE_OS_VERSION
159
164
 
165
+ # Forwarded headers (X-Forwarded-For / CF-Connecting-IP) are only honoured for
166
+ # IP rate limiting when the direct peer is one of these trusted proxies. Empty by
167
+ # default (local-first): the peer address is used and client-supplied headers are
168
+ # ignored, so per-IP rate limits cannot be spoofed.
169
+ _configure_trusted_proxies(CONFIG.trusted_proxies)
170
+
160
171
  APP_MODE = CONFIG.app_mode
161
172
  IS_PUBLIC_MODE = CONFIG.is_public
162
173
  DEFAULT_HOST = CONFIG.host
@@ -315,6 +326,23 @@ MEMORY_SERVICE = MemoryService(
315
326
  enable_graph=ENABLE_GRAPH,
316
327
  history_file=HISTORY_FILE,
317
328
  )
329
+ # ── v3.6.0 unified ingestion pipeline: the single write-side seam into the
330
+ # Knowledge Graph. Every new source (web URL, browser tab, …) flows through this
331
+ # so pre_tool/post_tool hooks fire on ingestion and provenance is captured
332
+ # uniformly. Existing direct ingest callers keep working; new paths converge here.
333
+ INGESTION_PIPELINE = IngestionPipeline(
334
+ KNOWLEDGE_GRAPH,
335
+ hooks=HOOKS_REGISTRY,
336
+ enable_graph=ENABLE_GRAPH,
337
+ audit=lambda action, detail, user: append_audit_event(action, user_email=user, **detail),
338
+ )
339
+ # ── v3.6.0 Knowledge Graph portability: local export / import / backup / restore.
340
+ # The graph is the user's durable asset, so it must be portable with no cloud.
341
+ KG_PORTABILITY = KGPortabilityService(
342
+ knowledge_graph=KNOWLEDGE_GRAPH,
343
+ data_dir=DATA_DIR,
344
+ enable_graph=ENABLE_GRAPH,
345
+ )
318
346
 
319
347
  def _require_graph():
320
348
  if not ENABLE_GRAPH or KNOWLEDGE_GRAPH is None:
@@ -1501,6 +1529,17 @@ app.include_router(create_memory_router(
1501
1529
  append_audit_event=append_audit_event,
1502
1530
  ))
1503
1531
 
1532
+ app.include_router(create_browser_router(
1533
+ pipeline=INGESTION_PIPELINE,
1534
+ require_user=require_user,
1535
+ ))
1536
+
1537
+ app.include_router(create_portability_router(
1538
+ service=KG_PORTABILITY,
1539
+ require_user=require_user,
1540
+ require_admin=require_admin,
1541
+ ))
1542
+
1504
1543
  app.include_router(create_garden_router(gardener=gardener, require_user=require_user))
1505
1544
  app.include_router(create_setup_router(model_router=router, require_user=require_user))
1506
1545
 
@@ -0,0 +1,271 @@
1
+ """Unified ingestion pipeline — the single write-side seam into the Knowledge Graph.
2
+
3
+ v3.6.0 Knowledge Graph First principle: *no data source bypasses the Knowledge
4
+ Graph and no source creates an isolated silo*. Every source — local files,
5
+ connected folders, PDFs/Markdown/text/code, web URLs, browser tabs — is
6
+ normalized into one :class:`IngestionItem` and pushed through one
7
+ :meth:`IngestionPipeline.ingest` entrypoint:
8
+
9
+ Source → normalize → content hash → (file | text) ingest → provenance
10
+
11
+ The pipeline is deliberately thin. It owns normalization, idempotency reporting,
12
+ provenance capture, and — crucially — routing every ingest through the shared
13
+ ``dispatch_tool`` lifecycle so ``pre_tool``/``post_tool`` hooks fire on data
14
+ ingestion exactly as they do on tool calls. The heavy graph construction lives in
15
+ :class:`knowledge_graph.KnowledgeGraphStore` (``ingest_document`` for files,
16
+ ``ingest_source`` for text/web), which this module composes rather than
17
+ re-implements.
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import hashlib
23
+ from dataclasses import dataclass, field
24
+ from datetime import datetime, timezone
25
+ from pathlib import Path
26
+ from typing import Any, Dict, List, Optional
27
+
28
+ from latticeai.core.hooks import dispatch_tool
29
+
30
+ # Source types that arrive as a file on disk (read via ingest_document).
31
+ FILE_SOURCE_TYPES = frozenset({"file", "local_file", "upload", "pdf"})
32
+ # Source types that arrive as extracted text (read via ingest_source).
33
+ TEXT_SOURCE_TYPES = frozenset(
34
+ {"web_url", "browser_tab", "text", "markdown", "note", "code", "clipboard"}
35
+ )
36
+
37
+ DEFAULT_MAX_TEXT_BYTES = 5 * 1024 * 1024 # 5 MB of extracted text per item
38
+
39
+
40
+ def _now_iso() -> str:
41
+ return datetime.now(timezone.utc).isoformat()
42
+
43
+
44
@dataclass
class IngestionItem:
    """One unit of content to ingest, in the shape shared by every source type.

    Only ``source_type`` is required. Text-like sources carry their content in
    ``text``; file sources point at it with ``path``.
    """

    # What kind of source this is (see FILE_SOURCE_TYPES / TEXT_SOURCE_TYPES).
    source_type: str
    title: Optional[str] = None
    # Extracted content for text/web sources.
    text: Optional[str] = None
    # Location on disk for file sources.
    path: Optional[str] = None
    source_uri: Optional[str] = None
    mime_type: Optional[str] = None
    owner: Optional[str] = None
    workspace_id: Optional[str] = None
    permissions: Optional[Dict[str, Any]] = None
    captured_at: Optional[str] = None
    modified_at: Optional[str] = None
    conversation_id: Optional[str] = None
    agent_used: Optional[str] = None
    # Free-form extras; each instance gets its own dict.
    metadata: Dict[str, Any] = field(default_factory=dict)
62
+
63
+
64
@dataclass
class IngestionResult:
    """Outcome of a single ingestion: status, graph ids, idempotency and provenance."""

    # One of: ok | unavailable | blocked | failed
    status: str
    source_type: str
    node_id: Optional[str] = None
    source_node_id: Optional[str] = None
    content_hash: Optional[str] = None
    title: Optional[str] = None
    chunk_ids: List[str] = field(default_factory=list)
    chunk_count: int = 0
    duplicate: bool = False
    embedded: bool = False
    # One of: indexed | skipped | failed | pending
    indexing_status: str = "pending"
    provenance_id: Optional[str] = None
    detail: Optional[str] = None

    def as_dict(self) -> Dict[str, Any]:
        """Return the result as a plain dict keyed by field name, in field order."""
        exported = (
            "status",
            "source_type",
            "node_id",
            "source_node_id",
            "content_hash",
            "title",
            "chunk_ids",
            "chunk_count",
            "duplicate",
            "embedded",
            "indexing_status",
            "provenance_id",
            "detail",
        )
        return {name: getattr(self, name) for name in exported}
98
+
99
+
100
class IngestionPipeline:
    """Single normalized entrypoint that feeds every source into the graph.

    The pipeline wraps a knowledge-graph store and routes each ingest through
    ``dispatch_tool`` under the tool name ``kg_ingest.<source_type>``, then
    records provenance and (optionally) an audit event.
    """

    def __init__(
        self,
        knowledge_graph: Any,
        *,
        hooks: Any = None,
        enable_graph: bool = True,
        audit: Optional[Any] = None,
        max_text_bytes: int = DEFAULT_MAX_TEXT_BYTES,
        pipeline_name: str = "unified-ingestion",
    ) -> None:
        """Store collaborators and limits.

        Args:
            knowledge_graph: Store providing ``ingest_source``, ``ingest_document``,
                ``node_is_embedded`` and ``record_provenance``.
            hooks: Hook registry passed through to ``dispatch_tool``.
            enable_graph: When false, every ingest reports ``unavailable``.
            audit: Optional callable invoked as ``audit(action, detail, user_email)``.
            max_text_bytes: Upper bound on the UTF-8 size of a text payload.
            pipeline_name: Name recorded on each provenance entry.
        """
        self._kg = knowledge_graph
        self._hooks = hooks
        self._enable = bool(enable_graph)
        self._audit = audit
        self._max_text_bytes = int(max_text_bytes)
        self._pipeline_name = pipeline_name

    def available(self) -> bool:
        """Return True when the graph is enabled and a store is attached."""
        return self._enable and self._kg is not None

    # ── public API ───────────────────────────────────────────────────────────
    def ingest(self, item: IngestionItem, *, user_email: Optional[str] = None) -> IngestionResult:
        """Normalize, route through dispatch_tool, and record provenance.

        Failures are reported through the returned result rather than raised:
        ``unavailable`` when the graph is off, ``blocked`` when the dispatch
        raises ``PermissionError``, and ``failed`` for any other error or when
        the store returns no usable result.

        Args:
            item: The normalized thing to ingest.
            user_email: Acting user; also the owner when ``item.owner`` is unset.
        """
        # A blank or whitespace-only source type falls back to "text" so the
        # hook tool name is never the bare prefix "kg_ingest.".
        source_type = str(item.source_type or "").strip() or "text"
        if not self.available():
            return IngestionResult(
                status="unavailable", source_type=source_type,
                indexing_status="skipped",
                detail="Knowledge Graph is disabled (LATTICEAI_ENABLE_GRAPH).",
            )

        captured_at = item.captured_at or _now_iso()
        owner = item.owner or user_email
        tool_name = f"kg_ingest.{source_type}"
        # Only the keys are read by the hook payload, so this dict is safe/cheap.
        args = {
            "source_type": source_type,
            "source_uri": item.source_uri,
            "owner": owner,
            "workspace_id": item.workspace_id,
        }

        def _run() -> Dict[str, Any]:
            # File route: a declared file source type, or a path with no text.
            if source_type in FILE_SOURCE_TYPES or (item.path and not item.text):
                return self._ingest_file(item, source_type=source_type, owner=owner, captured_at=captured_at)
            return self._ingest_text(item, source_type=source_type, owner=owner, captured_at=captured_at)

        try:
            raw = dispatch_tool(
                self._hooks, tool_name, args, _run,
                user_email=user_email, workspace_id=item.workspace_id, source="ingestion",
            )
        except PermissionError as exc:
            return IngestionResult(
                status="blocked", source_type=source_type,
                indexing_status="skipped", detail=str(exc),
            )
        except Exception as exc:  # noqa: BLE001 — surface as a failed result, never crash the caller
            # Covers FileNotFoundError from the file route as well.
            return IngestionResult(
                status="failed", source_type=source_type,
                indexing_status="failed", detail=str(exc),
            )

        if not callable(getattr(raw, "get", None)):
            # The store returned nothing usable (e.g. None); there is no node to
            # attach provenance to, so report a failure instead of crashing.
            return IngestionResult(
                status="failed", source_type=source_type,
                indexing_status="failed",
                detail="Knowledge Graph returned no ingestion result.",
            )

        node_id = raw.get("node_id")
        source_node_id = raw.get("source_node_id")
        content_hash = raw.get("content_hash") or raw.get("sha256")
        chunk_ids = list(raw.get("chunk_ids") or [])
        duplicate = bool(raw.get("duplicate"))
        title = raw.get("title") or item.title

        embedded = False
        provenance_id = None
        detail = None
        try:
            embedded = bool(self._kg.node_is_embedded(node_id)) if node_id else False
            prov = self._kg.record_provenance(
                node_id=node_id,
                source_type=source_type,
                pipeline=self._pipeline_name,
                source_uri=item.source_uri,
                content_hash=content_hash,
                title=title,
                owner=owner,
                workspace_id=item.workspace_id,
                captured_at=captured_at,
                modified_at=item.modified_at,
                embedded=embedded,
                linked=bool(source_node_id),
                duplicate=duplicate,
                agent_used=item.agent_used,
                chunk_count=len(chunk_ids),
                permissions=item.permissions,
                metadata=item.metadata,
            )
            provenance_id = prov.get("id") if prov else None
        except Exception as exc:  # noqa: BLE001 — content is already in the graph
            # The ingest itself succeeded; report the bookkeeping failure in
            # ``detail`` rather than crash the caller.
            detail = f"Ingested, but provenance was not recorded: {exc}"

        if self._audit is not None:
            try:
                self._audit(
                    "kg_ingest",
                    {
                        "source_type": source_type, "node_id": node_id,
                        "content_hash": content_hash, "duplicate": duplicate,
                    },
                    user_email,
                )
            except Exception:  # noqa: BLE001 — audit must never break ingestion
                pass

        return IngestionResult(
            status="ok",
            source_type=source_type,
            node_id=node_id,
            source_node_id=source_node_id,
            content_hash=content_hash,
            title=title,
            chunk_ids=chunk_ids,
            chunk_count=len(chunk_ids),
            duplicate=duplicate,
            embedded=embedded,
            indexing_status="indexed",
            provenance_id=provenance_id,
            detail=detail,
        )

    # ── routing helpers ──────────────────────────────────────────────────────
    def _ingest_text(
        self,
        item: IngestionItem,
        *,
        source_type: str,
        owner: Optional[str],
        captured_at: str,
    ) -> Dict[str, Any]:
        """Send a text payload to ``ingest_source`` after enforcing the size cap.

        Raises:
            ValueError: if the UTF-8 encoded text exceeds ``max_text_bytes``.
        """
        text = item.text or ""
        if len(text.encode("utf-8", "ignore")) > self._max_text_bytes:
            raise ValueError(
                f"Text payload exceeds the {self._max_text_bytes // (1024 * 1024)}MB ingestion limit."
            )
        title = item.title or item.source_uri or source_type
        return self._kg.ingest_source(
            source_type=source_type,
            title=title,
            text=text,
            source_uri=item.source_uri,
            owner=owner,
            workspace_id=item.workspace_id,
            permissions=item.permissions,
            captured_at=captured_at,
            modified_at=item.modified_at,
            conversation_id=item.conversation_id,
            # Caller metadata wins over the derived mime_type on key collision.
            metadata={"mime_type": item.mime_type, **(item.metadata or {})},
        )

    def _ingest_file(
        self,
        item: IngestionItem,
        *,
        source_type: str,
        owner: Optional[str],
        captured_at: str,
    ) -> Dict[str, Any]:
        """Send an on-disk file to ``ingest_document``.

        Raises:
            ValueError: if the item has no ``path``.
            FileNotFoundError: if the path does not exist.
        """
        if not item.path:
            raise ValueError("File ingestion requires a path.")
        path = Path(item.path)
        if not path.exists():
            raise FileNotFoundError(f"File not found: {path}")
        return self._kg.ingest_document(
            path,
            original_filename=item.title or path.name,
            mime_type=item.mime_type,
            uploader=owner,
            conversation_id=item.conversation_id,
            extracted=item.metadata.get("extracted") if item.metadata else None,
            source_type=source_type,
            source_uri=item.source_uri or str(path),
            captured_at=captured_at,
            modified_at=item.modified_at,
            owner=owner,
            workspace_id=item.workspace_id,
            permissions=item.permissions,
        )
267
+
268
+
269
def content_hash_text(text: str) -> str:
    """Return the hex SHA-256 digest of ``text`` encoded as UTF-8.

    A falsy ``text`` hashes as the empty string; characters that cannot be
    encoded are dropped (``errors="ignore"``).
    """
    payload = (text or "").encode("utf-8", "ignore")
    digest = hashlib.sha256(payload)
    return digest.hexdigest()