ltcai 2.2.2 → 3.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +66 -27
- package/codex_telegram_bot.py +6 -2
- package/docs/CHANGELOG.md +154 -0
- package/docs/V3_BACKEND_ARCHITECTURE.md +138 -0
- package/docs/V3_FRONTEND.md +136 -0
- package/knowledge_graph.py +649 -21
- package/latticeai/__init__.py +1 -1
- package/latticeai/api/admin.py +47 -0
- package/latticeai/api/agents.py +54 -31
- package/latticeai/api/auth.py +1 -1
- package/latticeai/api/chat.py +10 -2
- package/latticeai/api/search.py +236 -0
- package/latticeai/api/static_routes.py +21 -2
- package/latticeai/core/config.py +16 -0
- package/latticeai/core/embedding_providers.py +502 -0
- package/latticeai/core/local_embeddings.py +86 -0
- package/latticeai/core/logging_safety.py +62 -0
- package/latticeai/core/workspace_os.py +1 -1
- package/latticeai/server_app.py +49 -1
- package/latticeai/services/agent_runtime.py +245 -0
- package/latticeai/services/search_service.py +346 -0
- package/package.json +8 -4
- package/static/account.html +9 -4
- package/static/activity.html +4 -4
- package/static/admin.html +8 -3
- package/static/agents.html +4 -4
- package/static/chat.html +16 -11
- package/static/css/reference/account.css +439 -0
- package/static/css/reference/admin.css +610 -0
- package/static/css/reference/base.css +1658 -0
- package/static/{lattice-reference.css → css/reference/chat.css} +271 -3633
- package/static/css/reference/graph.css +1016 -0
- package/static/css/responsive.css +248 -1
- package/static/css/tokens.css +132 -126
- package/static/favicon.ico +0 -0
- package/static/graph.html +9 -4
- package/static/manifest.json +3 -3
- package/static/platform.css +1 -1
- package/static/plugins.html +4 -4
- package/static/scripts/account.js +4 -4
- package/static/scripts/chat.js +227 -77
- package/static/scripts/workspace.js +78 -0
- package/static/sw.js +5 -3
- package/static/v3/css/lattice.base.css +128 -0
- package/static/v3/css/lattice.components.css +447 -0
- package/static/v3/css/lattice.shell.css +407 -0
- package/static/v3/css/lattice.tokens.css +132 -0
- package/static/v3/css/lattice.views.css +277 -0
- package/static/v3/index.html +40 -0
- package/static/v3/js/app.js +26 -0
- package/static/v3/js/core/api.js +327 -0
- package/static/v3/js/core/components.js +215 -0
- package/static/v3/js/core/dom.js +148 -0
- package/static/v3/js/core/fixtures.js +171 -0
- package/static/v3/js/core/router.js +37 -0
- package/static/v3/js/core/routes.js +73 -0
- package/static/v3/js/core/shell.js +363 -0
- package/static/v3/js/core/store.js +113 -0
- package/static/v3/js/views/admin-audit.js +185 -0
- package/static/v3/js/views/admin-permissions.js +178 -0
- package/static/v3/js/views/admin-policies.js +103 -0
- package/static/v3/js/views/admin-private-vpc.js +138 -0
- package/static/v3/js/views/admin-security.js +181 -0
- package/static/v3/js/views/admin-users.js +168 -0
- package/static/v3/js/views/agents.js +194 -0
- package/static/v3/js/views/chat.js +450 -0
- package/static/v3/js/views/files.js +180 -0
- package/static/v3/js/views/home.js +119 -0
- package/static/v3/js/views/hybrid-search.js +195 -0
- package/static/v3/js/views/knowledge-graph.js +238 -0
- package/static/v3/js/views/models.js +247 -0
- package/static/v3/js/views/my-computer.js +237 -0
- package/static/v3/js/views/pipeline.js +161 -0
- package/static/v3/js/views/settings.js +258 -0
- package/static/workflows.html +4 -4
- package/static/workspace.css +408 -14
- package/static/workspace.html +43 -24
- package/telegram_bot.py +18 -14
package/knowledge_graph.py
CHANGED
|
@@ -16,6 +16,7 @@ import platform
|
|
|
16
16
|
import re
|
|
17
17
|
import shutil
|
|
18
18
|
import sqlite3
|
|
19
|
+
import time
|
|
19
20
|
import zipfile
|
|
20
21
|
from collections import Counter
|
|
21
22
|
from datetime import datetime
|
|
@@ -30,6 +31,8 @@ except Exception: # pragma: no cover - v2 schema is optional at import time
|
|
|
30
31
|
EdgeType = None # type: ignore[assignment]
|
|
31
32
|
_exec_script = None # type: ignore[assignment]
|
|
32
33
|
|
|
34
|
+
from latticeai.core.local_embeddings import LocalEmbeddingModel
|
|
35
|
+
|
|
33
36
|
# Default read source for the graph queries: v2 reconstruction views.
|
|
34
37
|
# Override with LATTICEAI_KG_READ_V2=0 to fall back to the legacy tables.
|
|
35
38
|
_READ_FROM_V2_DEFAULT = os.getenv("LATTICEAI_KG_READ_V2", "1") != "0"
|
|
@@ -806,11 +809,16 @@ def _topic_candidates(text: str, limit: int = 8) -> List[str]:
|
|
|
806
809
|
|
|
807
810
|
|
|
808
811
|
class KnowledgeGraphStore:
|
|
809
|
-
def __init__(self, db_path: Path, blob_dir: Path):
|
|
812
|
+
def __init__(self, db_path: Path, blob_dir: Path, embedder: Any = None):
|
|
810
813
|
self.db_path = Path(db_path)
|
|
811
814
|
self.blob_dir = Path(blob_dir)
|
|
812
815
|
self.db_path.parent.mkdir(parents=True, exist_ok=True)
|
|
813
816
|
self.blob_dir.mkdir(parents=True, exist_ok=True)
|
|
817
|
+
# The embedder is swappable behind a fixed interface
|
|
818
|
+
# (model_id/dim/embed/encode/decode/similarity). Defaults to the
|
|
819
|
+
# deterministic, offline hash model so the store works with no config;
|
|
820
|
+
# server_app injects a provider-backed embedder from Config.
|
|
821
|
+
self._embedding_model = embedder if embedder is not None else LocalEmbeddingModel()
|
|
814
822
|
self._init_db()
|
|
815
823
|
# Read graph queries from the v2 projection (kgv2_* views) when available.
|
|
816
824
|
# Toggle off (e.g. in tests) to compare against the legacy tables.
|
|
@@ -909,6 +917,31 @@ class KnowledgeGraphStore:
|
|
|
909
917
|
UNIQUE(source_id, relative_path),
|
|
910
918
|
FOREIGN KEY(source_id) REFERENCES knowledge_sources(id) ON DELETE CASCADE
|
|
911
919
|
);
|
|
920
|
+
CREATE TABLE IF NOT EXISTS vector_embeddings (
|
|
921
|
+
item_id TEXT PRIMARY KEY,
|
|
922
|
+
item_type TEXT NOT NULL,
|
|
923
|
+
source_node TEXT NOT NULL,
|
|
924
|
+
text_hash TEXT NOT NULL,
|
|
925
|
+
embedding BLOB NOT NULL,
|
|
926
|
+
embedding_dim INTEGER NOT NULL,
|
|
927
|
+
embedding_model TEXT NOT NULL,
|
|
928
|
+
metadata_json TEXT NOT NULL CHECK (json_valid(metadata_json)),
|
|
929
|
+
indexed_at TEXT NOT NULL,
|
|
930
|
+
FOREIGN KEY(source_node) REFERENCES nodes(id) ON DELETE CASCADE
|
|
931
|
+
);
|
|
932
|
+
CREATE TABLE IF NOT EXISTS vector_index_operations (
|
|
933
|
+
id TEXT PRIMARY KEY,
|
|
934
|
+
operation TEXT NOT NULL,
|
|
935
|
+
status TEXT NOT NULL,
|
|
936
|
+
requested_at TEXT NOT NULL,
|
|
937
|
+
started_at TEXT,
|
|
938
|
+
completed_at TEXT,
|
|
939
|
+
items_total INTEGER NOT NULL DEFAULT 0,
|
|
940
|
+
items_indexed INTEGER NOT NULL DEFAULT 0,
|
|
941
|
+
items_skipped INTEGER NOT NULL DEFAULT 0,
|
|
942
|
+
error_message TEXT,
|
|
943
|
+
metadata_json TEXT NOT NULL CHECK (json_valid(metadata_json))
|
|
944
|
+
);
|
|
912
945
|
CREATE INDEX IF NOT EXISTS idx_nodes_type ON nodes(type);
|
|
913
946
|
CREATE INDEX IF NOT EXISTS idx_edges_from ON edges(from_node);
|
|
914
947
|
CREATE INDEX IF NOT EXISTS idx_edges_to ON edges(to_node);
|
|
@@ -917,6 +950,10 @@ class KnowledgeGraphStore:
|
|
|
917
950
|
CREATE INDEX IF NOT EXISTS idx_local_file_index_source ON local_file_index(source_id);
|
|
918
951
|
CREATE INDEX IF NOT EXISTS idx_local_file_index_status ON local_file_index(status);
|
|
919
952
|
CREATE INDEX IF NOT EXISTS idx_local_file_index_graph_node ON local_file_index(graph_node_id);
|
|
953
|
+
CREATE INDEX IF NOT EXISTS idx_vector_embeddings_type ON vector_embeddings(item_type);
|
|
954
|
+
CREATE INDEX IF NOT EXISTS idx_vector_embeddings_source ON vector_embeddings(source_node);
|
|
955
|
+
CREATE INDEX IF NOT EXISTS idx_vector_embeddings_model ON vector_embeddings(embedding_model);
|
|
956
|
+
CREATE INDEX IF NOT EXISTS idx_vector_index_operations_requested ON vector_index_operations(requested_at);
|
|
920
957
|
"""
|
|
921
958
|
)
|
|
922
959
|
conn.execute(
|
|
@@ -1198,6 +1235,15 @@ class KnowledgeGraphStore:
|
|
|
1198
1235
|
# dual-write: project into the v2 graph on the same transaction
|
|
1199
1236
|
self._v2_project_node(conn, node_id, node_type, title_s, summary_s, meta_json,
|
|
1200
1237
|
created_at=now, updated_at=now)
|
|
1238
|
+
if node_type != "Chunk":
|
|
1239
|
+
self._upsert_vector_item(
|
|
1240
|
+
conn,
|
|
1241
|
+
item_id=node_id,
|
|
1242
|
+
item_type="node",
|
|
1243
|
+
source_node=node_id,
|
|
1244
|
+
text=self._vector_text_for_node(title=title_s, summary=summary_s, metadata=metadata),
|
|
1245
|
+
metadata={"node_type": node_type, **(metadata or {})},
|
|
1246
|
+
)
|
|
1201
1247
|
return node_id
|
|
1202
1248
|
|
|
1203
1249
|
def _upsert_edge(
|
|
@@ -1227,6 +1273,110 @@ class KnowledgeGraphStore:
|
|
|
1227
1273
|
edge_id=edge_id, created_at=now)
|
|
1228
1274
|
return edge_id
|
|
1229
1275
|
|
|
1276
|
+
def _vector_text_for_node(
    self,
    *,
    title: str,
    summary: str = "",
    metadata: Optional[Dict[str, Any]] = None,
) -> str:
    """Compose the text that gets embedded for a graph node.

    The result is the title, the summary and a space-joined selection of
    metadata values, separated by newlines and passed through
    ``_clean_text``.

    Args:
        title: Node title; falsy values contribute an empty string.
        summary: Node summary; falsy values contribute an empty string.
        metadata: Optional node metadata. Only the whitelisted keys below
            are read, and only when their value is truthy.

    Returns:
        The cleaned text to embed.
    """
    fields = metadata or {}
    # Only these metadata keys contribute to the embedded text.
    searchable_keys = (
        "filename", "relative_path", "file_path", "conversation_id", "source",
        "category", "ext", "role",
    )
    extras = [str(found) for found in map(fields.get, searchable_keys) if found]
    sections = (str(title or ""), str(summary or ""), " ".join(extras))
    return _clean_text("\n".join(sections))
|
|
1293
|
+
|
|
1294
|
+
def _upsert_vector_item(
    self,
    conn: sqlite3.Connection,
    *,
    item_id: str,
    item_type: str,
    source_node: str,
    text: str,
    metadata: Optional[Dict[str, Any]] = None,
) -> bool:
    """Write or refresh the embedding row for one indexable item.

    The text is normalised with ``_clean_text``. When fewer than two
    characters remain, any existing row for ``item_id`` is deleted. A row
    whose stored text hash, dimension and model id all match the current
    embedder is left untouched.

    Args:
        conn: Open connection; the caller owns the transaction.
        item_id: Primary key of the vector row.
        item_type: Stored verbatim in the ``item_type`` column.
        source_node: Stored verbatim in the ``source_node`` column.
        text: Raw text to embed.
        metadata: Serialised with ``_json`` into ``metadata_json``.

    Returns:
        True when a row was inserted or updated; False when the item was
        removed for being too short or skipped because it is unchanged.
    """
    model = self._embedding_model
    cleaned = _clean_text(text)
    if len(cleaned) < 2:
        # Nothing meaningful to embed: drop whatever was indexed before.
        conn.execute("DELETE FROM vector_embeddings WHERE item_id=?", (item_id,))
        return False

    digest = _sha256_text(cleaned)
    current = conn.execute(
        """
        SELECT text_hash, embedding_dim, embedding_model
        FROM vector_embeddings
        WHERE item_id=?
        """,
        (item_id,),
    ).fetchone()
    if current:
        unchanged = (
            current["text_hash"] == digest
            and current["embedding_dim"] == model.dim
            and current["embedding_model"] == model.model_id
        )
        if unchanged:
            return False

    # The hash covers the full cleaned text; only the first 50,000
    # characters are handed to the embedder.
    blob = model.encode(model.embed(cleaned[:50_000]))
    values = (
        item_id,
        item_type,
        source_node,
        digest,
        blob,
        model.dim,
        model.model_id,
        _json(metadata),
        _now(),
    )
    conn.execute(
        """
        INSERT INTO vector_embeddings(
            item_id, item_type, source_node, text_hash, embedding,
            embedding_dim, embedding_model, metadata_json, indexed_at
        )
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
        ON CONFLICT(item_id) DO UPDATE SET
            item_type=excluded.item_type,
            source_node=excluded.source_node,
            text_hash=excluded.text_hash,
            embedding=excluded.embedding,
            embedding_dim=excluded.embedding_dim,
            embedding_model=excluded.embedding_model,
            metadata_json=excluded.metadata_json,
            indexed_at=excluded.indexed_at
        """,
        values,
    )
    return True
|
|
1355
|
+
|
|
1356
|
+
def _upsert_chunk(
    self,
    conn: sqlite3.Connection,
    *,
    chunk_id: str,
    source_node: str,
    text: str,
    metadata: Optional[Dict[str, Any]] = None,
) -> None:
    """Store a chunk row and index its text in the vector table.

    Args:
        conn: Open connection; the caller owns the transaction.
        chunk_id: Chunk primary key, also used as the vector item id.
        source_node: Id of the node the chunk was cut from.
        text: Chunk text, stored as-is and passed on for embedding.
        metadata: Optional chunk metadata; ``None`` is treated as ``{}``.
    """
    chunk_meta = metadata or {}
    insert_sql = (
        "INSERT OR REPLACE INTO chunks(id, source_node, text, metadata_json, created_at) "
        "VALUES (?, ?, ?, ?, ?)"
    )
    conn.execute(insert_sql, (chunk_id, source_node, text, _json(chunk_meta), _now()))

    # The vector row points at the chunk itself; the owning node is kept
    # in the vector metadata instead.
    vector_meta = dict(chunk_meta)
    vector_meta["parent_source_node"] = source_node
    self._upsert_vector_item(
        conn,
        item_id=chunk_id,
        item_type="chunk",
        source_node=chunk_id,
        text=text,
        metadata=vector_meta,
    )
|
|
1379
|
+
|
|
1230
1380
|
# ── Local folder sources → Graph RAG ──────────────────────────────────
|
|
1231
1381
|
|
|
1232
1382
|
def discover_local_roots(self) -> Dict[str, Any]:
|
|
@@ -2052,16 +2202,12 @@ class KnowledgeGraphStore:
|
|
|
2052
2202
|
summary=chunk[:500],
|
|
2053
2203
|
metadata={"index": index, "source_node": file_node_id, "source_id": source_id},
|
|
2054
2204
|
)
|
|
2055
|
-
|
|
2056
|
-
|
|
2057
|
-
|
|
2058
|
-
|
|
2059
|
-
|
|
2060
|
-
|
|
2061
|
-
chunk,
|
|
2062
|
-
_json({"index": index, "source_node": file_node_id, "source_id": source_id}),
|
|
2063
|
-
_now(),
|
|
2064
|
-
),
|
|
2205
|
+
self._upsert_chunk(
|
|
2206
|
+
conn,
|
|
2207
|
+
chunk_id=chunk_id,
|
|
2208
|
+
source_node=file_node_id,
|
|
2209
|
+
text=chunk,
|
|
2210
|
+
metadata={"index": index, "source_node": file_node_id, "source_id": source_id},
|
|
2065
2211
|
)
|
|
2066
2212
|
self._upsert_edge(conn, file_node_id, chunk_id, "포함함", weight=0.7, metadata={"source": "local_scan"})
|
|
2067
2213
|
|
|
@@ -2494,11 +2640,12 @@ class KnowledgeGraphStore:
|
|
|
2494
2640
|
summary=chunk[:500],
|
|
2495
2641
|
metadata={"index": index, "source_node": node_id},
|
|
2496
2642
|
)
|
|
2497
|
-
|
|
2498
|
-
|
|
2499
|
-
|
|
2500
|
-
|
|
2501
|
-
|
|
2643
|
+
self._upsert_chunk(
|
|
2644
|
+
conn,
|
|
2645
|
+
chunk_id=chunk_id,
|
|
2646
|
+
source_node=node_id,
|
|
2647
|
+
text=chunk,
|
|
2648
|
+
metadata={"index": index, "source_node": node_id},
|
|
2502
2649
|
)
|
|
2503
2650
|
self._upsert_edge(conn, node_id, chunk_id, "포함함")
|
|
2504
2651
|
|
|
@@ -2621,11 +2768,12 @@ class KnowledgeGraphStore:
|
|
|
2621
2768
|
summary=chunk[:500],
|
|
2622
2769
|
metadata={"index": index, "source_node": file_id},
|
|
2623
2770
|
)
|
|
2624
|
-
|
|
2625
|
-
|
|
2626
|
-
|
|
2627
|
-
|
|
2628
|
-
|
|
2771
|
+
self._upsert_chunk(
|
|
2772
|
+
conn,
|
|
2773
|
+
chunk_id=chunk_id,
|
|
2774
|
+
source_node=file_id,
|
|
2775
|
+
text=chunk,
|
|
2776
|
+
metadata={"index": index, "source_node": file_id},
|
|
2629
2777
|
)
|
|
2630
2778
|
self._upsert_edge(conn, file_id, chunk_id, "포함함")
|
|
2631
2779
|
|
|
@@ -3168,6 +3316,486 @@ class KnowledgeGraphStore:
|
|
|
3168
3316
|
]
|
|
3169
3317
|
return {"node_id": node_id, "neighbors": nodes, "edges": edges}
|
|
3170
3318
|
|
|
3319
|
+
def get_node(self, node_id: str) -> Dict[str, Any]:
    """Return one graph node together with its edge count.

    Args:
        node_id: Id of the node; surrounding whitespace is ignored.

    Returns:
        A dict with ``id``, ``type``, ``title``, ``summary``, ``metadata``
        (decoded with ``_safe_loads``), ``updated_at`` and ``degree`` (the
        number of edges where the node is either endpoint).

    Raises:
        ValueError: If ``node_id`` is empty or no such node exists.
    """
    wanted = str(node_id or "").strip()
    if not wanted:
        raise ValueError("node_id required")

    node_table, edge_table = self._read_tables()
    node_sql = f"""
        SELECT id, type, title, summary, metadata_json, updated_at
        FROM {node_table}
        WHERE id=?
        """
    degree_sql = f"SELECT COUNT(*) AS c FROM {edge_table} WHERE from_node=? OR to_node=?"

    with self._connect() as conn:
        record = conn.execute(node_sql, (wanted,)).fetchone()
        if not record:
            raise ValueError(f"graph node not found: {wanted}")
        edge_count = conn.execute(degree_sql, (wanted, wanted)).fetchone()["c"]

        node: Dict[str, Any] = {
            column: record[column] for column in ("id", "type", "title", "summary")
        }
        node["metadata"] = _safe_loads(record["metadata_json"])
        node["updated_at"] = record["updated_at"]
        node["degree"] = edge_count
        return node
|
|
3348
|
+
|
|
3349
|
+
def relationship_search(
    self,
    *,
    query: str = "",
    node_id: str = "",
    relationship_type: str = "",
    limit: int = 30,
) -> Dict[str, Any]:
    """Search edges by endpoint, relationship type and/or free text.

    All supplied filters are AND-ed together. ``relationship_type`` is a
    substring match on the edge type. ``query`` is a substring match on
    the edge type, the edge metadata JSON and the titles and summaries of
    both endpoints. Search text is matched literally: ``%``, ``_`` and
    ``\\`` in user input are escaped so they are not LIKE wildcards.

    Args:
        query: Free-text substring; blank means no text filter.
        node_id: Restrict to edges touching this node (either endpoint).
        relationship_type: Substring of the edge type to match.
        limit: Maximum rows, clamped to 1..200 (falsy values mean 30).

    Returns:
        A dict echoing the normalised ``query``, ``node_id`` and
        ``relationship_type`` plus ``relationships``: edges ordered by
        weight, then creation time (both descending), then id, each with
        nested ``source`` and ``target`` node summaries.
    """

    def _contains_pattern(term: str) -> str:
        # Escape the escape character first so it is not doubled twice.
        escaped = term.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
        return f"%{escaped}%"

    like_clause = " LIKE ? ESCAPE '\\'"

    query = str(query or "").strip()
    node_id = str(node_id or "").strip()
    relationship_type = str(relationship_type or "").strip()
    limit = max(1, min(int(limit or 30), 200))
    nt, et = self._read_tables()

    where: List[str] = []
    params: List[Any] = []
    if node_id:
        where.append("(e.from_node=? OR e.to_node=?)")
        params.extend([node_id, node_id])
    if relationship_type:
        where.append("e.type" + like_clause)
        params.append(_contains_pattern(relationship_type))
    if query:
        searched_columns = (
            "e.type", "e.metadata_json", "src.title", "dst.title",
            "src.summary", "dst.summary",
        )
        where.append(
            "(" + " OR ".join(column + like_clause for column in searched_columns) + ")"
        )
        params.extend([_contains_pattern(query)] * len(searched_columns))
    where_sql = "WHERE " + " AND ".join(where) if where else ""

    with self._connect() as conn:
        rows = conn.execute(
            f"""
            SELECT
                e.id, e.from_node, e.to_node, e.type, e.weight, e.metadata_json, e.created_at,
                src.type AS source_type, src.title AS source_title, src.summary AS source_summary,
                src.metadata_json AS source_metadata,
                dst.type AS target_type, dst.title AS target_title, dst.summary AS target_summary,
                dst.metadata_json AS target_metadata
            FROM {et} e
            JOIN {nt} src ON src.id=e.from_node
            JOIN {nt} dst ON dst.id=e.to_node
            {where_sql}
            ORDER BY e.weight DESC, e.created_at DESC, e.id ASC
            LIMIT ?
            """,
            (*params, limit),
        ).fetchall()
        return {
            "query": query,
            "node_id": node_id,
            "relationship_type": relationship_type,
            "relationships": [
                {
                    "id": row["id"],
                    "type": row["type"],
                    "weight": row["weight"],
                    "metadata": _safe_loads(row["metadata_json"]),
                    "created_at": row["created_at"],
                    "source": {
                        "id": row["from_node"],
                        "type": row["source_type"],
                        "title": row["source_title"],
                        "summary": row["source_summary"],
                        "metadata": _safe_loads(row["source_metadata"]),
                    },
                    "target": {
                        "id": row["to_node"],
                        "type": row["target_type"],
                        "title": row["target_title"],
                        "summary": row["target_summary"],
                        "metadata": _safe_loads(row["target_metadata"]),
                    },
                }
                for row in rows
            ],
        }
|
|
3423
|
+
|
|
3424
|
+
def traverse(self, node_id: str, *, depth: int = 1, limit: int = 100) -> Dict[str, Any]:
    """Breadth-first expansion of the graph around ``node_id``.

    Each round fetches the edges touching the current frontier (highest
    weight first, at most ``limit * 3`` rows) and adds unseen endpoints
    until ``limit`` nodes have been visited.

    Args:
        node_id: Root node id; surrounding whitespace is ignored.
        depth: Number of expansion rounds, clamped to 0..4. ``0`` returns
            only the root. ``None`` or ``""`` fall back to 1.
        limit: Maximum nodes visited, clamped to 1..500 (falsy means 100).

    Returns:
        A dict with ``root``, the effective ``depth``, the visited
        ``nodes`` (most recently updated first) and the collected ``edges``.

    Raises:
        ValueError: If ``node_id`` is empty.
    """
    node_id = str(node_id or "").strip()
    if not node_id:
        raise ValueError("node_id required")
    # Only a missing depth defaults to 1. An explicit 0 must stay 0; the
    # previous ``int(depth or 1)`` silently turned it into 1.
    if depth is None or depth == "":
        depth = 1
    depth = max(0, min(int(depth), 4))
    limit = max(1, min(int(limit or 100), 500))
    nt, et = self._read_tables()
    visited = {node_id}
    frontier = {node_id}
    edges_by_id: Dict[str, Dict[str, Any]] = {}
    with self._connect() as conn:
        for _ in range(depth):
            if not frontier or len(visited) >= limit:
                break
            placeholders = ",".join("?" * len(frontier))
            rows = conn.execute(
                f"""
                SELECT id, from_node, to_node, type, weight, metadata_json
                FROM {et}
                WHERE from_node IN ({placeholders}) OR to_node IN ({placeholders})
                ORDER BY weight DESC, id ASC
                LIMIT ?
                """,
                (*frontier, *frontier, limit * 3),
            ).fetchall()
            next_frontier = set()
            for row in rows:
                # NOTE(review): every fetched edge is kept, even when the
                # node limit stops its far endpoint from being visited, so
                # an edge may reference a node absent from "nodes".
                edges_by_id[row["id"]] = {
                    "id": row["id"],
                    "from": row["from_node"],
                    "to": row["to_node"],
                    "type": row["type"],
                    "weight": row["weight"],
                    "metadata": _safe_loads(row["metadata_json"]),
                }
                for candidate in (row["from_node"], row["to_node"]):
                    if candidate not in visited and len(visited) < limit:
                        visited.add(candidate)
                        next_frontier.add(candidate)
            frontier = next_frontier
        placeholders = ",".join("?" * len(visited))
        node_rows = conn.execute(
            f"""
            SELECT id, type, title, summary, metadata_json, updated_at
            FROM {nt}
            WHERE id IN ({placeholders})
            ORDER BY updated_at DESC, id ASC
            """,
            list(visited),
        ).fetchall()
        return {
            "root": node_id,
            "depth": depth,
            "nodes": [
                {
                    "id": row["id"],
                    "type": row["type"],
                    "title": row["title"],
                    "summary": row["summary"],
                    "metadata": _safe_loads(row["metadata_json"]),
                    "updated_at": row["updated_at"],
                }
                for row in node_rows
            ],
            "edges": list(edges_by_id.values()),
        }
|
|
3490
|
+
|
|
3491
|
+
def _iter_vector_source_items(
|
|
3492
|
+
self,
|
|
3493
|
+
conn: sqlite3.Connection,
|
|
3494
|
+
*,
|
|
3495
|
+
include_nodes: bool = True,
|
|
3496
|
+
include_chunks: bool = True,
|
|
3497
|
+
) -> List[Dict[str, Any]]:
|
|
3498
|
+
items: List[Dict[str, Any]] = []
|
|
3499
|
+
if include_nodes:
|
|
3500
|
+
for row in conn.execute(
|
|
3501
|
+
"""
|
|
3502
|
+
SELECT id, type, title, summary, metadata_json
|
|
3503
|
+
FROM nodes
|
|
3504
|
+
WHERE type <> 'Chunk'
|
|
3505
|
+
ORDER BY updated_at DESC, id ASC
|
|
3506
|
+
"""
|
|
3507
|
+
).fetchall():
|
|
3508
|
+
metadata = _safe_loads(row["metadata_json"])
|
|
3509
|
+
text = self._vector_text_for_node(
|
|
3510
|
+
title=row["title"],
|
|
3511
|
+
summary=row["summary"] or "",
|
|
3512
|
+
metadata=metadata,
|
|
3513
|
+
)
|
|
3514
|
+
if text:
|
|
3515
|
+
items.append({
|
|
3516
|
+
"item_id": row["id"],
|
|
3517
|
+
"item_type": "node",
|
|
3518
|
+
"source_node": row["id"],
|
|
3519
|
+
"text": text,
|
|
3520
|
+
"metadata": {"node_type": row["type"], **metadata},
|
|
3521
|
+
})
|
|
3522
|
+
if include_chunks:
|
|
3523
|
+
for row in conn.execute(
|
|
3524
|
+
"""
|
|
3525
|
+
SELECT c.id, c.source_node AS parent_source_node, c.text, c.metadata_json
|
|
3526
|
+
FROM chunks c
|
|
3527
|
+
JOIN nodes n ON n.id=c.id
|
|
3528
|
+
ORDER BY c.created_at DESC, c.id ASC
|
|
3529
|
+
"""
|
|
3530
|
+
).fetchall():
|
|
3531
|
+
metadata = _safe_loads(row["metadata_json"])
|
|
3532
|
+
text = _clean_text(row["text"] or "")
|
|
3533
|
+
if text:
|
|
3534
|
+
items.append({
|
|
3535
|
+
"item_id": row["id"],
|
|
3536
|
+
"item_type": "chunk",
|
|
3537
|
+
"source_node": row["id"],
|
|
3538
|
+
"text": text,
|
|
3539
|
+
"metadata": {**metadata, "parent_source_node": row["parent_source_node"]},
|
|
3540
|
+
})
|
|
3541
|
+
return items
|
|
3542
|
+
|
|
3543
|
+
def rebuild_vector_index(
    self,
    *,
    full: bool = False,
    include_nodes: bool = True,
    include_chunks: bool = True,
) -> Dict[str, Any]:
    """Rebuild the derived vector index without mutating graph content.

    An operation row is recorded in ``vector_index_operations`` and marked
    ``completed`` or ``failed``.

    Args:
        full: When true, delete the existing vectors of the selected item
            types first; otherwise only missing or changed items are
            re-embedded.
        include_nodes: Process node items.
        include_chunks: Process chunk items.

    Returns:
        A summary dict with the operation id, item counters, the duration
        in milliseconds and the embedder's model id and dimension.

    Raises:
        Exception: Any error raised while rebuilding is re-raised after
            the failure has been recorded.
    """
    op_id = f"vector-op:{_sha256_text(f'{time.time()}:{os.getpid()}')[:24]}"
    requested_at = _now()
    started = time.perf_counter()
    operation = "rebuild_full" if full else "rebuild_incremental"

    def elapsed_ms() -> float:
        return round((time.perf_counter() - started) * 1000, 2)

    try:
        with self._connect() as conn:
            conn.execute(
                """
                INSERT INTO vector_index_operations(
                    id, operation, status, requested_at, started_at, metadata_json
                )
                VALUES (?, ?, 'running', ?, ?, ?)
                """,
                (
                    op_id,
                    operation,
                    requested_at,
                    requested_at,
                    _json({"include_nodes": include_nodes, "include_chunks": include_chunks}),
                ),
            )
            if full:
                # Only fixed literals are interpolated into the SQL here.
                selected = ((include_nodes, "'node'"), (include_chunks, "'chunk'"))
                wiped_types = [literal for enabled, literal in selected if enabled]
                if wiped_types:
                    conn.execute(
                        f"DELETE FROM vector_embeddings WHERE item_type IN ({','.join(wiped_types)})"
                    )
            items = self._iter_vector_source_items(
                conn,
                include_nodes=include_nodes,
                include_chunks=include_chunks,
            )
            outcomes = [self._upsert_vector_item(conn, **item) for item in items]
            indexed = sum(1 for changed in outcomes if changed)
            skipped = len(outcomes) - indexed
            duration_ms = elapsed_ms()
            conn.execute(
                """
                UPDATE vector_index_operations
                SET status='completed', completed_at=?, items_total=?,
                    items_indexed=?, items_skipped=?, metadata_json=?
                WHERE id=?
                """,
                (
                    _now(),
                    len(items),
                    indexed,
                    skipped,
                    _json({
                        "include_nodes": include_nodes,
                        "include_chunks": include_chunks,
                        "duration_ms": duration_ms,
                        "embedding_model": self._embedding_model.model_id,
                        "embedding_dim": self._embedding_model.dim,
                    }),
                    op_id,
                ),
            )
            return {
                "status": "completed",
                "operation_id": op_id,
                "full": bool(full),
                "items_total": len(items),
                "items_indexed": indexed,
                "items_skipped": skipped,
                "duration_ms": duration_ms,
                "embedding_model": self._embedding_model.model_id,
                "embedding_dim": self._embedding_model.dim,
            }
    except Exception as exc:
        duration_ms = elapsed_ms()
        # The upsert covers both cases: the 'running' row may or may not
        # still exist when the failure is recorded.
        with self._connect() as conn:
            conn.execute(
                """
                INSERT INTO vector_index_operations(
                    id, operation, status, requested_at, started_at, completed_at,
                    error_message, metadata_json
                )
                VALUES (?, ?, 'failed', ?, ?, ?, ?, ?)
                ON CONFLICT(id) DO UPDATE SET
                    status='failed',
                    completed_at=excluded.completed_at,
                    error_message=excluded.error_message,
                    metadata_json=excluded.metadata_json
                """,
                (
                    op_id,
                    operation,
                    requested_at,
                    requested_at,
                    _now(),
                    str(exc),
                    _json({"duration_ms": duration_ms}),
                ),
            )
        raise
|
|
3652
|
+
|
|
3653
|
+
def index_status(self) -> Dict[str, Any]:
    """Report how well the vector index covers the current graph content.

    Every source item from ``_iter_vector_source_items`` is compared with
    its stored vector row:

    * ``missing``: indexable item with no vector row;
    * ``stale``: a row exists but its text hash, dimension or model id
      differs from what the current embedder would produce;
    * ``ready``: the row is up to date;
    * ``unindexable``: the cleaned text is shorter than two characters
      and no row exists. ``_upsert_vector_item`` never stores such items,
      so they are not counted as pending; otherwise the status could
      never return to ``ready``.

    Returns:
        A dict with the overall ``status`` (``"ready"`` or
        ``"needs_reindex"``), storage details, the counters above,
        per-type vector counts and the five most recent index operations.
    """
    # Same minimum text length that _upsert_vector_item enforces.
    min_indexable_chars = 2
    with self._connect() as conn:
        vector_counts = {
            row["item_type"]: row["count"]
            for row in conn.execute(
                "SELECT item_type, COUNT(*) AS count FROM vector_embeddings GROUP BY item_type"
            )
        }
        source_items = self._iter_vector_source_items(conn)
        vector_rows = {
            row["item_id"]: row
            for row in conn.execute(
                """
                SELECT item_id, text_hash, embedding_dim, embedding_model, indexed_at
                FROM vector_embeddings
                """
            ).fetchall()
        }
        latest_rows = conn.execute(
            """
            SELECT id, operation, status, requested_at, started_at, completed_at,
                   items_total, items_indexed, items_skipped, error_message, metadata_json
            FROM vector_index_operations
            ORDER BY requested_at DESC, id DESC
            LIMIT 5
            """
        ).fetchall()
        missing = stale = ready = unindexable = 0
        for item in source_items:
            vector_row = vector_rows.get(item["item_id"])
            cleaned = _clean_text(item["text"])
            indexable = len(cleaned) >= min_indexable_chars
            if vector_row is None:
                if indexable:
                    missing += 1
                else:
                    # A rebuild would skip this item as well.
                    unindexable += 1
                continue
            if (
                not indexable
                or vector_row["text_hash"] != _sha256_text(cleaned)
                or vector_row["embedding_dim"] != self._embedding_model.dim
                or vector_row["embedding_model"] != self._embedding_model.model_id
            ):
                stale += 1
            else:
                ready += 1
        pending = missing + stale
        return {
            "status": "ready" if pending == 0 else "needs_reindex",
            "storage": {
                "db_path": str(self.db_path),
                "backend": "sqlite",
                "embedding_model": self._embedding_model.model_id,
                "embedding_dim": self._embedding_model.dim,
            },
            "source_items": len(source_items),
            "indexed_items": sum(vector_counts.values()),
            "ready_items": ready,
            "missing_items": missing,
            "stale_items": stale,
            "pending_items": pending,
            "unindexable_items": unindexable,
            "by_item_type": vector_counts,
            "operations": [
                {
                    "id": row["id"],
                    "operation": row["operation"],
                    "status": row["status"],
                    "requested_at": row["requested_at"],
                    "started_at": row["started_at"],
                    "completed_at": row["completed_at"],
                    "items_total": row["items_total"],
                    "items_indexed": row["items_indexed"],
                    "items_skipped": row["items_skipped"],
                    "error_message": row["error_message"],
                    "metadata": _safe_loads(row["metadata_json"]),
                }
                for row in latest_rows
            ],
        }
|
|
3727
|
+
|
|
3728
|
+
def vector_search(
    self,
    query: str,
    *,
    limit: int = 30,
    min_score: float = 0.0,
    max_candidates: int = 10_000,
) -> Dict[str, Any]:
    """Rank indexed items by embedding similarity to ``query``.

    The search is a linear scan: the most recently indexed rows for the
    current embedding model and dimension (up to ``max_candidates``) are
    loaded, scored against the query vector and sorted in Python.

    Args:
        query: Search text; a blank query returns no matches.
        limit: Maximum matches, clamped to 1..100 (falsy means 30).
        min_score: Matches scoring below this value are dropped.
        max_candidates: Rows to scan, at least ``limit`` and at most
            50,000 (falsy means 10,000).

    Returns:
        ``{"query", "matches"}`` for a blank query; otherwise a dict that
        also carries ``embedding_model`` and ``embedding_dim``. Matches
        are ordered by score, then ``updated_at``, both descending. Chunk
        matches report their parent node's id, title and metadata where
        available.
    """
    query = str(query or "").strip()
    limit = max(1, min(int(limit or 30), 100))
    min_score = float(min_score or 0.0)
    if not query:
        return {"query": query, "matches": []}

    model = self._embedding_model
    query_vector = model.embed(query)
    max_candidates = max(limit, min(int(max_candidates or 10_000), 50_000))

    with self._connect() as conn:
        candidates = conn.execute(
            """
            SELECT
                ve.item_id, ve.item_type, ve.source_node, ve.embedding,
                ve.embedding_dim, ve.embedding_model, ve.metadata_json AS vector_metadata,
                n.type AS node_type, n.title AS node_title, n.summary AS node_summary,
                n.metadata_json AS node_metadata, n.updated_at AS node_updated_at,
                c.text AS chunk_text, c.source_node AS parent_node_id,
                pn.type AS parent_type, pn.title AS parent_title,
                pn.summary AS parent_summary, pn.metadata_json AS parent_metadata,
                pn.updated_at AS parent_updated_at
            FROM vector_embeddings ve
            LEFT JOIN nodes n ON n.id=ve.source_node
            LEFT JOIN chunks c ON c.id=ve.item_id
            LEFT JOIN nodes pn ON pn.id=c.source_node
            WHERE ve.embedding_model=? AND ve.embedding_dim=?
            ORDER BY ve.indexed_at DESC
            LIMIT ?
            """,
            (model.model_id, model.dim, max_candidates),
        ).fetchall()

        matches: List[Dict[str, Any]] = []
        for hit in candidates:
            stored_vector = model.decode(hit["embedding"], hit["embedding_dim"])
            score = model.similarity(query_vector, stored_vector)
            if score < min_score:
                continue

            is_chunk = hit["item_type"] == "chunk"

            def prefer_parent(parent_column: str, own_column: str) -> Any:
                # Chunks show their parent's value when one is present.
                if is_chunk and hit[parent_column]:
                    return hit[parent_column]
                return hit[own_column]

            snippet = prefer_parent("chunk_text", "node_summary")
            parent_metadata = _safe_loads(hit["parent_metadata"])
            node_metadata = _safe_loads(hit["node_metadata"])
            base_metadata = parent_metadata if is_chunk else node_metadata
            matches.append(
                {
                    "id": hit["item_id"],
                    "node_id": prefer_parent("parent_node_id", "source_node"),
                    "item_type": hit["item_type"],
                    "type": "Chunk" if is_chunk else hit["node_type"],
                    "title": prefer_parent("parent_title", "node_title"),
                    "summary": _clean_text(snippet or "")[:1000],
                    "score": round(float(score), 6),
                    "metadata": {
                        **base_metadata,
                        "vector": _safe_loads(hit["vector_metadata"]),
                        "parent_node_id": hit["parent_node_id"],
                        "parent_type": hit["parent_type"],
                    },
                    "updated_at": prefer_parent("parent_updated_at", "node_updated_at"),
                }
            )

        def rank_key(match: Dict[str, Any]) -> Any:
            return (match["score"], match.get("updated_at") or "")

        matches.sort(key=rank_key, reverse=True)
        return {
            "query": query,
            "embedding_model": model.model_id,
            "embedding_dim": model.dim,
            "matches": matches[:limit],
        }
|
|
3798
|
+
|
|
3171
3799
|
def delete_conversation(self, conversation_id: str) -> Dict[str, Any]:
|
|
3172
3800
|
conversation_id = str(conversation_id or "").strip()
|
|
3173
3801
|
if not conversation_id:
|