ltcai 3.4.1 → 3.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +206 -247
- package/docs/CARRYOVER_AUDIT_v3.6.0.md +61 -0
- package/docs/CHANGELOG.md +32 -0
- package/docs/HANDOVER_v3.6.0.md +46 -0
- package/docs/RUNTIME_HOOK_COVERAGE_v3.5.0.md +56 -0
- package/docs/RUNTIME_HOOK_COVERAGE_v3.6.0.md +49 -0
- package/docs/architecture.md +13 -12
- package/docs/kg-schema.md +55 -0
- package/docs/privacy.md +18 -2
- package/docs/security-model.md +17 -0
- package/kg_schema.py +46 -0
- package/knowledge_graph.py +520 -1
- package/latticeai/__init__.py +1 -1
- package/latticeai/api/auth.py +37 -9
- package/latticeai/api/browser.py +217 -0
- package/latticeai/api/chat.py +4 -1
- package/latticeai/api/computer_use.py +21 -8
- package/latticeai/api/portability.py +93 -0
- package/latticeai/api/tools.py +29 -26
- package/latticeai/core/config.py +3 -0
- package/latticeai/core/marketplace.py +1 -1
- package/latticeai/core/multi_agent.py +1 -1
- package/latticeai/core/oidc.py +205 -0
- package/latticeai/core/security.py +59 -5
- package/latticeai/core/workspace_os.py +1 -1
- package/latticeai/server_app.py +39 -0
- package/latticeai/services/ingestion.py +271 -0
- package/latticeai/services/kg_portability.py +177 -0
- package/package.json +5 -4
- package/requirements.txt +1 -0
- package/scripts/build_vsix.mjs +72 -0
- package/scripts/check_python.py +87 -0
- package/static/css/reference/account.css +1 -1
- package/static/css/reference/admin.css +1 -1
- package/static/css/reference/base.css +8 -5
- package/static/css/reference/chat.css +8 -8
- package/static/css/reference/graph.css +2 -2
- package/static/css/responsive.css +2 -2
- package/static/v3/asset-manifest.json +9 -9
- package/static/v3/css/{lattice.shell.6ceea7c8.css → lattice.shell.8fcc9d33.css} +2 -1
- package/static/v3/css/lattice.shell.css +2 -1
- package/static/v3/js/{app.d086489d.js → app.c541f955.js} +1 -1
- package/static/v3/js/core/{api.12b568ad.js → api.33d6320e.js} +38 -0
- package/static/v3/js/core/api.js +38 -0
- package/static/v3/js/core/{routes.d214b399.js → routes.2ce3815a.js} +1 -1
- package/static/v3/js/core/routes.js +1 -1
- package/static/v3/js/core/{shell.d05266f5.js → shell.8c163e0e.js} +2 -2
- package/static/v3/js/views/knowledge-graph.a96040a5.js +513 -0
- package/static/v3/js/views/knowledge-graph.js +293 -17
- package/static/workspace.css +1 -1
- package/tools/__init__.py +276 -0
- package/tools/commands.py +188 -0
- package/tools/computer.py +185 -0
- package/tools/documents.py +243 -0
- package/tools/filesystem.py +560 -0
- package/tools/knowledge.py +97 -0
- package/tools/local_files.py +69 -0
- package/tools/network.py +66 -0
- package/static/v3/js/views/knowledge-graph.a14ea7e7.js +0 -237
- package/tools.py +0 -1525
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
"""Self-contained OIDC ID-token validation.
|
|
2
|
+
|
|
3
|
+
A JWT is ``base64url(header).base64url(claims).base64url(signature)``. The login
|
|
4
|
+
flow must *never* trust a decoded payload on its own — without verifying the
|
|
5
|
+
signature an attacker can forge any ``email``/``sub`` claim. This module verifies
|
|
6
|
+
the RSA signature against the provider's published JWKS and then validates the
|
|
7
|
+
standard registered claims plus the login ``nonce``.
|
|
8
|
+
|
|
9
|
+
Design goals:
|
|
10
|
+
|
|
11
|
+
* No third-party JWT dependency — only the standard library plus ``cryptography``
|
|
12
|
+
(already a transitive dependency, and pinned explicitly in ``pyproject.toml``).
|
|
13
|
+
* **Fail-closed**: any anomaly raises :class:`OIDCValidationError`; the caller
|
|
14
|
+
must reject the login. There is no "best effort accept".
|
|
15
|
+
* Asymmetric algorithms only (``RS256``/``RS384``/``RS512``). ``alg: none`` and
|
|
16
|
+
symmetric ``HS*`` tokens are rejected outright — the classic OIDC bypasses.
|
|
17
|
+
* Pure and injectable: :func:`verify_id_token` takes the JWKS and clock as
|
|
18
|
+
arguments so every rejection path is unit-testable offline.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
import base64
|
|
24
|
+
import json
|
|
25
|
+
import time
|
|
26
|
+
from typing import Any, Dict, List, Optional
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class OIDCValidationError(Exception):
    """Signal that an OIDC ID token was rejected by a validation step.

    The module is designed fail-closed: callers are expected to treat this
    exception as "reject the login".
    """
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# Asymmetric RSA algorithms only. Excluding HS*/none is deliberate: an attacker
|
|
34
|
+
# who can set ``alg`` must not be able to downgrade to a symmetric or unsigned
|
|
35
|
+
# token. Maps JWT alg name → cryptography hash name.
|
|
36
|
+
_ALLOWED_ALGS: Dict[str, str] = {"RS256": "sha256", "RS384": "sha384", "RS512": "sha512"}
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _b64url_decode(segment: str) -> bytes:
|
|
40
|
+
if not isinstance(segment, str) or not segment:
|
|
41
|
+
raise OIDCValidationError("empty JWT segment")
|
|
42
|
+
padding = "=" * (-len(segment) % 4)
|
|
43
|
+
try:
|
|
44
|
+
return base64.urlsafe_b64decode(segment + padding)
|
|
45
|
+
except Exception as exc: # malformed base64
|
|
46
|
+
raise OIDCValidationError(f"invalid base64url segment: {exc}")
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _b64url_uint(value: str) -> int:
    """Interpret a base64url string as an unsigned big-endian integer."""
    raw = _b64url_decode(value)
    return int.from_bytes(raw, byteorder="big")
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _split(token: str) -> List[str]:
|
|
54
|
+
parts = str(token or "").split(".")
|
|
55
|
+
if len(parts) != 3 or not all(parts):
|
|
56
|
+
raise OIDCValidationError("malformed JWT: expected three non-empty segments")
|
|
57
|
+
return parts
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def decode_unverified_header(token: str) -> Dict[str, Any]:
    """Return the JWT header as a dict WITHOUT verifying the signature.

    The result is untrusted input; it is only meant for selecting the
    verification key. Raises ``OIDCValidationError`` when the header segment
    cannot be decoded, is not valid JSON, or is not a JSON object.
    """
    first_segment, _, _ = _split(token)
    try:
        decoded = json.loads(_b64url_decode(first_segment))
    except Exception as exc:
        raise OIDCValidationError(f"invalid JWT header: {exc}")
    if isinstance(decoded, dict):
        return decoded
    raise OIDCValidationError("JWT header is not an object")
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _public_key_from_jwk(jwk: Dict[str, Any]):
    """Build an RSA public key object from a single JWK.

    Args:
        jwk: One entry of a JWKS ``keys`` array. Must have ``kty == "RSA"``
            and base64url-encoded ``n`` (modulus) and ``e`` (exponent).

    Returns:
        The public key object produced by ``cryptography``.

    Raises:
        OIDCValidationError: If the JWK is not an RSA key, lacks ``n``/``e``,
            or carries values that do not form a valid RSA public key.
    """
    if not isinstance(jwk, dict) or jwk.get("kty") != "RSA":
        raise OIDCValidationError("unsupported JWK key type (RSA required)")
    if not jwk.get("n") or not jwk.get("e"):
        raise OIDCValidationError("JWK missing modulus/exponent")
    from cryptography.hazmat.primitives.asymmetric.rsa import RSAPublicNumbers

    exponent = _b64url_uint(jwk["e"])
    modulus = _b64url_uint(jwk["n"])
    try:
        return RSAPublicNumbers(exponent, modulus).public_key()
    except (TypeError, ValueError) as exc:
        # cryptography rejects nonsensical numbers (e.g. an even or tiny
        # exponent) with ValueError; keep the module's fail-closed contract by
        # surfacing that as a validation error instead of a raw exception.
        raise OIDCValidationError(f"invalid RSA JWK parameters: {exc}") from exc
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _candidate_keys(jwks: Any, kid: Optional[str]) -> List[Dict[str, Any]]:
|
|
84
|
+
keys = jwks.get("keys") if isinstance(jwks, dict) else jwks
|
|
85
|
+
if not isinstance(keys, list) or not keys:
|
|
86
|
+
raise OIDCValidationError("JWKS contains no keys")
|
|
87
|
+
rsa_keys = [k for k in keys if isinstance(k, dict) and k.get("kty") == "RSA"]
|
|
88
|
+
if kid:
|
|
89
|
+
matched = [k for k in rsa_keys if k.get("kid") == kid]
|
|
90
|
+
if not matched:
|
|
91
|
+
raise OIDCValidationError("no JWKS key matches the token 'kid'")
|
|
92
|
+
return matched
|
|
93
|
+
return rsa_keys
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def _verify_signature(token: str, jwks: Any) -> Dict[str, Any]:
    """Verify a JWT's RSA signature against ``jwks`` and return its claims.

    The header is read unverified only to choose the algorithm and key. The
    payload is parsed only after a candidate key has verified the signature.

    Args:
        token: The compact-serialized JWT.
        jwks: A JWKS document (dict with ``keys``) or a bare list of JWKs.

    Returns:
        The claims object decoded from the payload segment.

    Raises:
        OIDCValidationError: If the token is malformed, uses an algorithm
            outside ``_ALLOWED_ALGS``, no candidate key verifies the
            signature, or the payload is not a JSON object.
    """
    header_b64, payload_b64, sig_b64 = _split(token)
    header = decode_unverified_header(token)
    alg = header.get("alg")
    # The header is attacker-controlled JSON: ``alg`` may be a list or dict,
    # which would raise TypeError (unhashable) on the dict membership test.
    if not isinstance(alg, str) or alg not in _ALLOWED_ALGS:
        # Rejects 'none' and symmetric HS* — the canonical signature-bypass attacks.
        raise OIDCValidationError(f"unsupported or unsafe JWT alg: {alg!r}")

    from cryptography.exceptions import InvalidSignature
    from cryptography.hazmat.primitives import hashes
    from cryptography.hazmat.primitives.asymmetric import padding

    hash_obj = {"sha256": hashes.SHA256(), "sha384": hashes.SHA384(), "sha512": hashes.SHA512()}[
        _ALLOWED_ALGS[alg]
    ]
    signature = _b64url_decode(sig_b64)
    try:
        signing_input = f"{header_b64}.{payload_b64}".encode("ascii")
    except UnicodeEncodeError as exc:
        # A non-ASCII payload segment must be a validation failure, not an
        # unhandled encoding error.
        raise OIDCValidationError("malformed JWT: non-ASCII characters in token") from exc

    for jwk in _candidate_keys(jwks, header.get("kid")):
        try:
            public_key = _public_key_from_jwk(jwk)
            public_key.verify(signature, signing_input, padding.PKCS1v15(), hash_obj)
        except (InvalidSignature, OIDCValidationError, ValueError, TypeError):
            # A non-matching or malformed key must not stop the remaining
            # candidates from being tried; falling through still fails closed.
            continue
        # Signature verified — now it is safe to parse the claims.
        try:
            claims = json.loads(_b64url_decode(payload_b64))
        except Exception as exc:
            raise OIDCValidationError(f"invalid JWT claims JSON: {exc}") from exc
        if not isinstance(claims, dict):
            raise OIDCValidationError("JWT claims are not an object")
        return claims

    raise OIDCValidationError("signature verification failed against all JWKS keys")
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def _numeric_date(claims: Dict[str, Any], name: str) -> Optional[int]:
    """Return claim ``name`` as integer seconds, or ``None`` when it is absent.

    Raises ``OIDCValidationError`` when the claim is present but cannot be
    converted to an integer, so malformed timestamps fail closed.
    """
    value = claims.get(name)
    if value is None:
        return None
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError) as exc:
        raise OIDCValidationError(f"token '{name}' is not a valid timestamp") from exc


def verify_id_token(
    id_token: str,
    *,
    jwks: Any,
    issuer: str,
    audience: str,
    nonce: Optional[str] = None,
    now: Optional[float] = None,
    leeway: int = 60,
) -> Dict[str, Any]:
    """Verify an OIDC ID token and return its claims, or raise ``OIDCValidationError``.

    Validates, in order: signature (against ``jwks``, RSA only), ``iss``, ``aud``
    (and ``azp`` when multi-audience), ``exp``, ``iat``/``nbf``, and ``nonce``.
    All checks are fail-closed.

    Args:
        id_token: The compact-serialized ID token.
        jwks: The provider's JWKS document (or bare key list).
        issuer: Expected ``iss`` value; must be non-empty.
        audience: Expected audience (the client id); must be non-empty.
        nonce: When not ``None``, the token's ``nonce`` claim must equal it.
        now: Current time in epoch seconds; defaults to ``time.time()``.
        leeway: Clock-skew tolerance in seconds applied to the time claims.

    Returns:
        The verified claims.

    Raises:
        OIDCValidationError: On any missing configuration, signature failure,
            claim mismatch, or malformed/expired/not-yet-valid time claim.
    """
    if not id_token:
        raise OIDCValidationError("missing id_token")
    if not issuer:
        raise OIDCValidationError("issuer not configured")
    if not audience:
        raise OIDCValidationError("audience (client_id) not configured")

    claims = _verify_signature(id_token, jwks)
    current = int(now if now is not None else time.time())

    if claims.get("iss") != issuer:
        raise OIDCValidationError("issuer mismatch")

    aud = claims.get("aud")
    audiences = aud if isinstance(aud, list) else [aud]
    if audience not in audiences:
        raise OIDCValidationError("audience mismatch")
    if isinstance(aud, list) and len(aud) > 1 and claims.get("azp") not in (None, audience):
        raise OIDCValidationError("azp (authorized party) mismatch")

    # Time claims are coerced through _numeric_date so a non-numeric value is
    # reported as a validation error rather than a raw ValueError/TypeError.
    exp = _numeric_date(claims, "exp")
    if exp is None:
        raise OIDCValidationError("token missing 'exp'")
    if current > exp + leeway:
        raise OIDCValidationError("token expired")

    iat = _numeric_date(claims, "iat")
    if iat is not None and iat - leeway > current:
        raise OIDCValidationError("token 'iat' is in the future")

    nbf = _numeric_date(claims, "nbf")
    if nbf is not None and nbf - leeway > current:
        raise OIDCValidationError("token not yet valid ('nbf')")

    if nonce is not None and claims.get("nonce") != nonce:
        raise OIDCValidationError("nonce mismatch")

    return claims
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
async def fetch_jwks(jwks_uri: str, *, timeout: float = 15.0) -> Dict[str, Any]:
    """Download and JSON-decode the JWKS document at ``jwks_uri``.

    Raises ``OIDCValidationError`` when no URI is supplied; HTTP errors from
    ``raise_for_status`` propagate unchanged.
    """
    if not jwks_uri:
        raise OIDCValidationError("discovery document has no 'jwks_uri'")
    import httpx

    async with httpx.AsyncClient() as http:
        response = await http.get(jwks_uri, timeout=timeout)
        response.raise_for_status()
        return response.json()
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
__all__ = [
|
|
201
|
+
"OIDCValidationError",
|
|
202
|
+
"verify_id_token",
|
|
203
|
+
"fetch_jwks",
|
|
204
|
+
"decode_unverified_header",
|
|
205
|
+
]
|
|
@@ -35,12 +35,66 @@ def host_is_loopback(host: str) -> bool:
|
|
|
35
35
|
return False
|
|
36
36
|
|
|
37
37
|
|
|
38
|
+
# ── Trusted-proxy handling ────────────────────────────────────────────────────
|
|
39
|
+
# ``client_ip`` is the key used for IP rate limiting (login / register) and for
|
|
40
|
+
# audit logging. A forwarded header (``X-Forwarded-For`` / ``CF-Connecting-IP``)
|
|
41
|
+
# is *client-controllable*, so honoring it unconditionally lets anyone spoof
|
|
42
|
+
# their source IP and bypass per-IP rate limits. We therefore trust those headers
|
|
43
|
+
# ONLY when the direct peer is a configured trusted proxy (e.g. the Cloudflare /
|
|
44
|
+
# Vercel edge in front of the app). Default: no trusted proxies → use the peer
|
|
45
|
+
# address, which is the safe, local-first behaviour.
|
|
46
|
+
_FORWARDED_HEADERS = ("CF-Connecting-IP", "X-Forwarded-For")
|
|
47
|
+
_trusted_proxies: List["ipaddress._BaseNetwork"] = []
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def configure_trusted_proxies(values) -> int:
    """Replace the trusted-proxy allowlist and return how many entries parsed.

    ``values`` may be a comma-separated string or an iterable of IPs/CIDRs.
    Blank and unparseable entries are dropped. An empty or falsy value leaves
    the allowlist empty, which disables forwarded-header trust.
    """
    global _trusted_proxies
    if isinstance(values, str):
        raw_entries = values.split(",")
    else:
        raw_entries = [str(entry) for entry in (values or [])]

    parsed: List["ipaddress._BaseNetwork"] = []
    for entry in (candidate.strip() for candidate in raw_entries):
        if entry:
            try:
                # strict=False lets a host address with a prefix (10.1.2.3/24)
                # stand for its containing network.
                parsed.append(ipaddress.ip_network(entry, strict=False))
            except ValueError:
                pass
    _trusted_proxies = parsed
    return len(parsed)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _peer_is_trusted_proxy(peer: str) -> bool:
|
|
74
|
+
if not peer or not _trusted_proxies:
|
|
75
|
+
return False
|
|
76
|
+
try:
|
|
77
|
+
addr = ipaddress.ip_address(peer)
|
|
78
|
+
except ValueError:
|
|
79
|
+
return False
|
|
80
|
+
return any(addr in net for net in _trusted_proxies)
|
|
81
|
+
|
|
82
|
+
|
|
38
83
|
def client_ip(request) -> str:
    """Return the client address used for rate limiting and audit logging.

    Forwarded headers are consulted only when the direct peer is a trusted
    proxy; otherwise the peer address is returned and client-supplied headers
    are ignored. Each header value is read from right to left, skipping hops
    that are themselves trusted proxies, and the first remaining address is
    returned. The leftmost entries of ``X-Forwarded-For`` are whatever the
    client chose to send when a proxy appends to the header, so they must not
    be taken at face value.

    Args:
        request: A request object exposing ``client.host`` and ``headers.get``.

    Returns:
        The selected IP address as text, the peer address when no forwarded
        address qualifies, or ``"unknown"`` when there is no peer.
    """
    peer = request.client.host if request.client else ""
    # Only a trusted proxy's forwarded headers are honoured; otherwise the
    # client-supplied header is ignored so per-IP rate limits cannot be spoofed.
    if _peer_is_trusted_proxy(peer):
        for header in _FORWARDED_HEADERS:
            val = request.headers.get(header)
            if not val:
                continue
            for hop in reversed(val.split(",")):
                candidate = hop.strip()
                try:
                    ipaddress.ip_address(candidate)
                except ValueError:
                    # An unparseable hop breaks the chain of custody; nothing
                    # further to the left can be trusted from this header.
                    break
                if _peer_is_trusted_proxy(candidate):
                    continue
                return candidate
    return peer or "unknown"
|
|
44
98
|
|
|
45
99
|
|
|
46
100
|
_FILE_MAGIC: Dict[str, List[bytes]] = {
|
|
@@ -18,7 +18,7 @@ from pathlib import Path
|
|
|
18
18
|
from typing import Any, Callable, Dict, Iterable, List, Optional
|
|
19
19
|
|
|
20
20
|
|
|
21
|
-
WORKSPACE_OS_VERSION = "3.
|
|
21
|
+
WORKSPACE_OS_VERSION = "3.6.0"
|
|
22
22
|
|
|
23
23
|
# Workspace types separate single-user Personal workspaces from shared
|
|
24
24
|
# Organization workspaces. Both keep the same local-first JSON store; the type
|
package/latticeai/server_app.py
CHANGED
|
@@ -40,6 +40,7 @@ from latticeai.core.security import (
|
|
|
40
40
|
verify_password,
|
|
41
41
|
host_is_loopback as _host_is_loopback_impl,
|
|
42
42
|
client_ip as _client_ip_impl,
|
|
43
|
+
configure_trusted_proxies as _configure_trusted_proxies,
|
|
43
44
|
bytes_match_extension as _bytes_match_extension_impl,
|
|
44
45
|
redact_secret_text as _redact_secret_text,
|
|
45
46
|
check_ip_rate_limit as _check_ip_rate_limit,
|
|
@@ -119,7 +120,11 @@ from latticeai.core.builtin_hooks import register_builtin_hook_runners
|
|
|
119
120
|
from latticeai.api.agent_registry import create_agent_registry_router
|
|
120
121
|
from latticeai.core.agent_registry import AgentRegistry
|
|
121
122
|
from latticeai.api.memory import create_memory_router
|
|
123
|
+
from latticeai.api.browser import create_browser_router
|
|
124
|
+
from latticeai.api.portability import create_portability_router
|
|
122
125
|
from latticeai.services.memory_service import MemoryService
|
|
126
|
+
from latticeai.services.ingestion import IngestionPipeline
|
|
127
|
+
from latticeai.services.kg_portability import KGPortabilityService
|
|
123
128
|
from latticeai.services.tool_dispatch import (
|
|
124
129
|
LOCAL_WRITE_BLOCKED_PREFIXES as _LOCAL_WRITE_BLOCKED_PREFIXES,
|
|
125
130
|
TOOL_GOVERNANCE,
|
|
@@ -157,6 +162,12 @@ from datetime import datetime
|
|
|
157
162
|
CONFIG = Config.from_env()
|
|
158
163
|
APP_VERSION = WORKSPACE_OS_VERSION
|
|
159
164
|
|
|
165
|
+
# Forwarded headers (X-Forwarded-For / CF-Connecting-IP) are only honoured for
|
|
166
|
+
# IP rate limiting when the direct peer is one of these trusted proxies. Empty by
|
|
167
|
+
# default (local-first): the peer address is used and client-supplied headers are
|
|
168
|
+
# ignored, so per-IP rate limits cannot be spoofed.
|
|
169
|
+
_configure_trusted_proxies(CONFIG.trusted_proxies)
|
|
170
|
+
|
|
160
171
|
APP_MODE = CONFIG.app_mode
|
|
161
172
|
IS_PUBLIC_MODE = CONFIG.is_public
|
|
162
173
|
DEFAULT_HOST = CONFIG.host
|
|
@@ -315,6 +326,23 @@ MEMORY_SERVICE = MemoryService(
|
|
|
315
326
|
enable_graph=ENABLE_GRAPH,
|
|
316
327
|
history_file=HISTORY_FILE,
|
|
317
328
|
)
|
|
329
|
+
# ── v3.6.0 unified ingestion pipeline: the single write-side seam into the
|
|
330
|
+
# Knowledge Graph. Every new source (web URL, browser tab, …) flows through this
|
|
331
|
+
# so pre_tool/post_tool hooks fire on ingestion and provenance is captured
|
|
332
|
+
# uniformly. Existing direct ingest callers keep working; new paths converge here.
|
|
333
|
+
INGESTION_PIPELINE = IngestionPipeline(
|
|
334
|
+
KNOWLEDGE_GRAPH,
|
|
335
|
+
hooks=HOOKS_REGISTRY,
|
|
336
|
+
enable_graph=ENABLE_GRAPH,
|
|
337
|
+
audit=lambda action, detail, user: append_audit_event(action, user_email=user, **detail),
|
|
338
|
+
)
|
|
339
|
+
# ── v3.6.0 Knowledge Graph portability: local export / import / backup / restore.
|
|
340
|
+
# The graph is the user's durable asset, so it must be portable with no cloud.
|
|
341
|
+
KG_PORTABILITY = KGPortabilityService(
|
|
342
|
+
knowledge_graph=KNOWLEDGE_GRAPH,
|
|
343
|
+
data_dir=DATA_DIR,
|
|
344
|
+
enable_graph=ENABLE_GRAPH,
|
|
345
|
+
)
|
|
318
346
|
|
|
319
347
|
def _require_graph():
|
|
320
348
|
if not ENABLE_GRAPH or KNOWLEDGE_GRAPH is None:
|
|
@@ -1501,6 +1529,17 @@ app.include_router(create_memory_router(
|
|
|
1501
1529
|
append_audit_event=append_audit_event,
|
|
1502
1530
|
))
|
|
1503
1531
|
|
|
1532
|
+
app.include_router(create_browser_router(
|
|
1533
|
+
pipeline=INGESTION_PIPELINE,
|
|
1534
|
+
require_user=require_user,
|
|
1535
|
+
))
|
|
1536
|
+
|
|
1537
|
+
app.include_router(create_portability_router(
|
|
1538
|
+
service=KG_PORTABILITY,
|
|
1539
|
+
require_user=require_user,
|
|
1540
|
+
require_admin=require_admin,
|
|
1541
|
+
))
|
|
1542
|
+
|
|
1504
1543
|
app.include_router(create_garden_router(gardener=gardener, require_user=require_user))
|
|
1505
1544
|
app.include_router(create_setup_router(model_router=router, require_user=require_user))
|
|
1506
1545
|
|
|
@@ -0,0 +1,271 @@
|
|
|
1
|
+
"""Unified ingestion pipeline — the single write-side seam into the Knowledge Graph.
|
|
2
|
+
|
|
3
|
+
v3.6.0 Knowledge Graph First principle: *no data source bypasses the Knowledge
|
|
4
|
+
Graph and no source creates an isolated silo*. Every source — local files,
|
|
5
|
+
connected folders, PDFs/Markdown/text/code, web URLs, browser tabs — is
|
|
6
|
+
normalized into one :class:`IngestionItem` and pushed through one
|
|
7
|
+
:meth:`IngestionPipeline.ingest` entrypoint:
|
|
8
|
+
|
|
9
|
+
Source → normalize → content hash → (file | text) ingest → provenance
|
|
10
|
+
|
|
11
|
+
The pipeline is deliberately thin. It owns normalization, idempotency reporting,
|
|
12
|
+
provenance capture, and — crucially — routing every ingest through the shared
|
|
13
|
+
``dispatch_tool`` lifecycle so ``pre_tool``/``post_tool`` hooks fire on data
|
|
14
|
+
ingestion exactly as they do on tool calls. The heavy graph construction lives in
|
|
15
|
+
:class:`knowledge_graph.KnowledgeGraphStore` (``ingest_document`` for files,
|
|
16
|
+
``ingest_source`` for text/web), which this module composes rather than
|
|
17
|
+
re-implements.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import hashlib
|
|
23
|
+
from dataclasses import dataclass, field
|
|
24
|
+
from datetime import datetime, timezone
|
|
25
|
+
from pathlib import Path
|
|
26
|
+
from typing import Any, Dict, List, Optional
|
|
27
|
+
|
|
28
|
+
from latticeai.core.hooks import dispatch_tool
|
|
29
|
+
|
|
30
|
+
# Source types that arrive as a file on disk (read via ingest_document).
|
|
31
|
+
FILE_SOURCE_TYPES = frozenset({"file", "local_file", "upload", "pdf"})
|
|
32
|
+
# Source types that arrive as extracted text (read via ingest_source).
|
|
33
|
+
TEXT_SOURCE_TYPES = frozenset(
|
|
34
|
+
{"web_url", "browser_tab", "text", "markdown", "note", "code", "clipboard"}
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
DEFAULT_MAX_TEXT_BYTES = 5 * 1024 * 1024 # 5 MB of extracted text per item
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _now_iso() -> str:
|
|
41
|
+
return datetime.now(timezone.utc).isoformat()
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@dataclass
class IngestionItem:
    """One unit of content to ingest, in the shape shared by all source types.

    Only ``source_type`` is required. Text-like sources carry their content in
    ``text``; file-like sources point at it with ``path``.
    """

    source_type: str
    title: Optional[str] = None
    # Extracted content for text/web sources.
    text: Optional[str] = None
    # Location on disk for file sources.
    path: Optional[str] = None
    source_uri: Optional[str] = None
    mime_type: Optional[str] = None
    owner: Optional[str] = None
    workspace_id: Optional[str] = None
    permissions: Optional[Dict[str, Any]] = None
    captured_at: Optional[str] = None
    modified_at: Optional[str] = None
    conversation_id: Optional[str] = None
    agent_used: Optional[str] = None
    # Free-form extras; each instance gets its own dict.
    metadata: Dict[str, Any] = field(default_factory=dict)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
@dataclass
class IngestionResult:
    """Outcome of a single ingestion, with provenance and idempotency details."""

    # One of: ok | unavailable | blocked | failed
    status: str
    source_type: str
    node_id: Optional[str] = None
    source_node_id: Optional[str] = None
    content_hash: Optional[str] = None
    title: Optional[str] = None
    chunk_ids: List[str] = field(default_factory=list)
    chunk_count: int = 0
    duplicate: bool = False
    embedded: bool = False
    # One of: indexed | skipped | failed | pending
    indexing_status: str = "pending"
    provenance_id: Optional[str] = None
    detail: Optional[str] = None

    def as_dict(self) -> Dict[str, Any]:
        """Return the result as a plain dict keyed by field name."""
        exported = (
            "status",
            "source_type",
            "node_id",
            "source_node_id",
            "content_hash",
            "title",
            "chunk_ids",
            "chunk_count",
            "duplicate",
            "embedded",
            "indexing_status",
            "provenance_id",
            "detail",
        )
        return {name: getattr(self, name) for name in exported}
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
class IngestionPipeline:
    """Single normalized entrypoint that feeds every source into the graph.

    Each :class:`IngestionItem` is routed through ``dispatch_tool`` under the
    tool name ``kg_ingest.<source_type>``, then a provenance record and an
    optional audit event are written. :meth:`ingest` reports every outcome as
    an :class:`IngestionResult` rather than raising.
    """

    def __init__(
        self,
        knowledge_graph: Any,
        *,
        hooks: Any = None,
        enable_graph: bool = True,
        audit: Optional[Any] = None,
        max_text_bytes: int = DEFAULT_MAX_TEXT_BYTES,
        pipeline_name: str = "unified-ingestion",
    ) -> None:
        """Store the collaborators used by :meth:`ingest`.

        Args:
            knowledge_graph: Store providing ``ingest_source``,
                ``ingest_document``, ``node_is_embedded`` and
                ``record_provenance``; may be ``None`` (pipeline unavailable).
            hooks: Hook registry handed to ``dispatch_tool``.
            enable_graph: When false, every ingest reports ``unavailable``.
            audit: Optional callable ``(action, detail, user_email)``.
            max_text_bytes: Upper bound on the UTF-8 size of a text payload.
            pipeline_name: Name written into each provenance record.
        """
        self._kg = knowledge_graph
        self._hooks = hooks
        self._enable = bool(enable_graph)
        self._audit = audit
        self._max_text_bytes = int(max_text_bytes)
        self._pipeline_name = pipeline_name

    def available(self) -> bool:
        """Return True when the graph is enabled and a store is attached."""
        return self._enable and self._kg is not None

    # ── public API ───────────────────────────────────────────────────────────
    def ingest(self, item: IngestionItem, *, user_email: Optional[str] = None) -> IngestionResult:
        """Normalize, hash, route through dispatch_tool, and record provenance.

        Args:
            item: The content to ingest.
            user_email: Acting user; also the owner when ``item.owner`` is unset.

        Returns:
            An :class:`IngestionResult` whose ``status`` is ``ok``,
            ``unavailable`` (graph disabled), ``blocked`` (``PermissionError``
            raised during dispatch) or ``failed`` (any other error, including a
            failure while recording provenance after the graph write — in that
            case the node details are still reported).
        """
        source_type = str(item.source_type or "text").strip()
        if not self.available():
            return IngestionResult(
                status="unavailable", source_type=source_type,
                indexing_status="skipped",
                detail="Knowledge Graph is disabled (LATTICEAI_ENABLE_GRAPH).",
            )

        captured_at = item.captured_at or _now_iso()
        owner = item.owner or user_email
        tool_name = f"kg_ingest.{source_type}"
        # Small descriptive payload handed to dispatch_tool alongside the call;
        # the content itself is never copied into it.
        args = {
            "source_type": source_type,
            "source_uri": item.source_uri,
            "owner": owner,
            "workspace_id": item.workspace_id,
        }

        def _run() -> Dict[str, Any]:
            # File-typed sources, or items that carry only a path, are read
            # from disk; everything else is treated as extracted text.
            if source_type in FILE_SOURCE_TYPES or (item.path and not item.text):
                return self._ingest_file(item, source_type=source_type, owner=owner, captured_at=captured_at)
            return self._ingest_text(item, source_type=source_type, owner=owner, captured_at=captured_at)

        try:
            raw = dispatch_tool(
                self._hooks, tool_name, args, _run,
                user_email=user_email, workspace_id=item.workspace_id, source="ingestion",
            )
        except PermissionError as exc:
            return IngestionResult(
                status="blocked", source_type=source_type,
                indexing_status="skipped", detail=str(exc),
            )
        except Exception as exc:  # noqa: BLE001 — surface as a failed result, never crash the caller
            return IngestionResult(
                status="failed", source_type=source_type,
                indexing_status="failed", detail=str(exc),
            )

        if not isinstance(raw, dict):
            # Everything below reads the result as a mapping; anything else
            # would otherwise crash the caller with AttributeError.
            return IngestionResult(
                status="failed", source_type=source_type,
                indexing_status="failed",
                detail=f"Ingestion returned an unexpected result type: {type(raw).__name__}",
            )

        node_id = raw.get("node_id")
        content_hash = raw.get("content_hash") or raw.get("sha256")
        chunk_ids = list(raw.get("chunk_ids") or [])
        title = raw.get("title") or item.title
        duplicate = bool(raw.get("duplicate"))
        source_node_id = raw.get("source_node_id")

        try:
            embedded = bool(self._kg.node_is_embedded(node_id)) if node_id else False
            prov = self._kg.record_provenance(
                node_id=node_id,
                source_type=source_type,
                pipeline=self._pipeline_name,
                source_uri=item.source_uri,
                content_hash=content_hash,
                title=title,
                owner=owner,
                workspace_id=item.workspace_id,
                captured_at=captured_at,
                modified_at=item.modified_at,
                embedded=embedded,
                linked=bool(source_node_id),
                duplicate=duplicate,
                agent_used=item.agent_used,
                chunk_count=len(chunk_ids),
                permissions=item.permissions,
                metadata=item.metadata,
            )
        except Exception as exc:  # noqa: BLE001 — same never-crash contract as the dispatch above
            # The graph write already happened, so report the node details
            # while flagging that provenance could not be recorded.
            return IngestionResult(
                status="failed",
                source_type=source_type,
                node_id=node_id,
                source_node_id=source_node_id,
                content_hash=content_hash,
                title=title,
                chunk_ids=chunk_ids,
                chunk_count=len(chunk_ids),
                duplicate=duplicate,
                indexing_status="indexed",
                detail=f"Provenance recording failed: {exc}",
            )

        if self._audit is not None:
            try:
                self._audit(
                    "kg_ingest",
                    {
                        "source_type": source_type, "node_id": node_id,
                        "content_hash": content_hash, "duplicate": duplicate,
                    },
                    user_email,
                )
            except Exception:  # noqa: BLE001 — audit must never break ingestion
                pass

        return IngestionResult(
            status="ok",
            source_type=source_type,
            node_id=node_id,
            source_node_id=source_node_id,
            content_hash=content_hash,
            title=title,
            chunk_ids=chunk_ids,
            chunk_count=len(chunk_ids),
            duplicate=duplicate,
            embedded=embedded,
            indexing_status="indexed",
            provenance_id=prov.get("id") if isinstance(prov, dict) else None,
        )

    # ── routing helpers ──────────────────────────────────────────────────────
    def _ingest_text(self, item, *, source_type, owner, captured_at) -> Dict[str, Any]:
        """Send a text payload to ``ingest_source`` after enforcing the size cap.

        Raises ``ValueError`` when the UTF-8 encoding of the text exceeds
        ``max_text_bytes``.
        """
        text = item.text or ""
        if len(text.encode("utf-8", "ignore")) > self._max_text_bytes:
            raise ValueError(
                f"Text payload exceeds the {self._max_text_bytes // (1024 * 1024)}MB ingestion limit."
            )
        title = item.title or item.source_uri or source_type
        return self._kg.ingest_source(
            source_type=source_type,
            title=title,
            text=text,
            source_uri=item.source_uri,
            owner=owner,
            workspace_id=item.workspace_id,
            permissions=item.permissions,
            captured_at=captured_at,
            modified_at=item.modified_at,
            conversation_id=item.conversation_id,
            metadata={"mime_type": item.mime_type, **(item.metadata or {})},
        )

    def _ingest_file(self, item, *, source_type, owner, captured_at) -> Dict[str, Any]:
        """Send an on-disk file to ``ingest_document``.

        Raises ``ValueError`` when the item has no path and
        ``FileNotFoundError`` when the path does not exist.
        """
        if not item.path:
            raise ValueError("File ingestion requires a path.")
        path = Path(item.path)
        if not path.exists():
            raise FileNotFoundError(f"File not found: {path}")
        return self._kg.ingest_document(
            path,
            original_filename=item.title or path.name,
            mime_type=item.mime_type,
            uploader=owner,
            conversation_id=item.conversation_id,
            extracted=item.metadata.get("extracted") if item.metadata else None,
            source_type=source_type,
            source_uri=item.source_uri or str(path),
            captured_at=captured_at,
            modified_at=item.modified_at,
            owner=owner,
            workspace_id=item.workspace_id,
            permissions=item.permissions,
        )
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
def content_hash_text(text: str) -> str:
    """Return the hex SHA-256 of ``text`` encoded as UTF-8.

    A falsy ``text`` hashes as the empty string, and characters that cannot be
    encoded are dropped. Intended to match the store's hashing scheme.
    """
    payload = (text or "").encode("utf-8", "ignore")
    digest = hashlib.sha256()
    digest.update(payload)
    return digest.hexdigest()
|