ltcai 3.5.0 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +73 -35
- package/docs/CARRYOVER_AUDIT_v3.6.0.md +61 -0
- package/docs/CHANGELOG.md +32 -0
- package/docs/HANDOVER_v3.6.0.md +46 -0
- package/docs/RUNTIME_HOOK_COVERAGE_v3.6.0.md +49 -0
- package/docs/V4_BRAIN_ARCHITECTURE.md +322 -0
- package/docs/V4_DIGITAL_BRAIN_RECOVERY.md +509 -0
- package/docs/V4_IMPLEMENTATION_PLAN.md +470 -0
- package/docs/architecture.md +13 -12
- package/docs/kg-schema.md +102 -53
- package/docs/privacy.md +18 -2
- package/docs/security-model.md +17 -0
- package/kg_schema.py +139 -10
- package/knowledge_graph.py +874 -26
- package/knowledge_graph_api.py +11 -127
- package/latticeai/__init__.py +1 -1
- package/latticeai/api/admin.py +1 -1
- package/latticeai/api/agents.py +7 -1
- package/latticeai/api/auth.py +27 -4
- package/latticeai/api/browser.py +217 -0
- package/latticeai/api/chat.py +112 -76
- package/latticeai/api/health.py +1 -1
- package/latticeai/api/hooks.py +1 -1
- package/latticeai/api/knowledge_graph.py +146 -0
- package/latticeai/api/local_files.py +1 -1
- package/latticeai/api/mcp.py +23 -11
- package/latticeai/api/memory.py +1 -1
- package/latticeai/api/models.py +1 -1
- package/latticeai/api/network.py +81 -0
- package/latticeai/api/portability.py +93 -0
- package/latticeai/api/realtime.py +1 -1
- package/latticeai/api/search.py +26 -2
- package/latticeai/api/security_dashboard.py +2 -3
- package/latticeai/api/setup.py +2 -2
- package/latticeai/api/static_routes.py +2 -4
- package/latticeai/api/tools.py +3 -0
- package/latticeai/api/workflow_designer.py +46 -0
- package/latticeai/api/workspace.py +71 -49
- package/latticeai/app_factory.py +1710 -0
- package/latticeai/brain/__init__.py +18 -0
- package/latticeai/brain/context.py +213 -0
- package/latticeai/brain/conversations.py +236 -0
- package/latticeai/brain/identity.py +175 -0
- package/latticeai/brain/memory.py +102 -0
- package/latticeai/brain/network.py +205 -0
- package/latticeai/core/agent.py +31 -7
- package/latticeai/core/audit.py +0 -7
- package/latticeai/core/config.py +1 -1
- package/latticeai/core/context_builder.py +1 -2
- package/latticeai/core/enterprise.py +1 -1
- package/latticeai/core/graph_curator.py +2 -2
- package/latticeai/core/marketplace.py +1 -1
- package/latticeai/core/mcp_registry.py +791 -0
- package/latticeai/core/model_compat.py +1 -1
- package/latticeai/core/model_resolution.py +0 -1
- package/latticeai/core/multi_agent.py +238 -4
- package/latticeai/core/security.py +1 -1
- package/latticeai/core/sessions.py +37 -7
- package/latticeai/core/workflow_engine.py +114 -2
- package/latticeai/core/workspace_os.py +58 -10
- package/latticeai/models/__init__.py +7 -0
- package/latticeai/models/router.py +779 -0
- package/latticeai/server_app.py +29 -1504
- package/latticeai/services/agent_runtime.py +1 -0
- package/latticeai/services/app_context.py +75 -14
- package/latticeai/services/ingestion.py +318 -0
- package/latticeai/services/kg_portability.py +207 -0
- package/latticeai/services/memory_service.py +39 -11
- package/latticeai/services/model_runtime.py +2 -5
- package/latticeai/services/platform_runtime.py +100 -23
- package/latticeai/services/search_service.py +17 -8
- package/latticeai/services/tool_dispatch.py +12 -2
- package/latticeai/services/triggers.py +241 -0
- package/latticeai/services/upload_service.py +37 -12
- package/latticeai/services/workspace_service.py +31 -0
- package/llm_router.py +29 -772
- package/ltcai_cli.py +1 -2
- package/mcp_registry.py +25 -788
- package/p_reinforce.py +124 -14
- package/package.json +11 -8
- package/scripts/build_vsix.mjs +72 -0
- package/scripts/bump_version.py +99 -0
- package/scripts/generate_diagrams.py +0 -1
- package/scripts/lint_v3.mjs +82 -18
- package/scripts/validate_release_artifacts.py +0 -1
- package/scripts/wheel_smoke.py +142 -0
- package/server.py +11 -7
- package/setup_wizard.py +1142 -0
- package/static/account.html +2 -4
- package/static/admin.html +3 -5
- package/static/chat.html +3 -6
- package/static/graph.html +2 -4
- package/static/sw.js +81 -52
- package/static/v3/asset-manifest.json +20 -19
- package/static/v3/css/{lattice.base.e4cdd05d.css → lattice.base.49deefb5.css} +1 -1
- package/static/v3/css/lattice.base.css +1 -1
- package/static/v3/css/{lattice.components.9b49d614.css → lattice.components.cde18231.css} +1 -1
- package/static/v3/css/lattice.components.css +1 -1
- package/static/v3/css/{lattice.shell.8fcc9d33.css → lattice.shell.29d36d85.css} +1 -1
- package/static/v3/css/lattice.shell.css +1 -1
- package/static/v3/css/{lattice.tokens.e7018963.css → lattice.tokens.304cbc40.css} +3 -0
- package/static/v3/css/lattice.tokens.css +3 -0
- package/static/v3/css/{lattice.views.22f69117.css → lattice.views.0a18b6c5.css} +2 -2
- package/static/v3/css/lattice.views.css +2 -2
- package/static/v3/index.html +3 -4
- package/static/v3/js/{app.d086489d.js → app.356e6452.js} +1 -1
- package/static/v3/js/core/{api.12b568ad.js → api.7a308b89.js} +39 -1
- package/static/v3/js/core/api.js +38 -0
- package/static/v3/js/core/{routes.d214b399.js → routes.7222343d.js} +22 -22
- package/static/v3/js/core/routes.js +22 -22
- package/static/v3/js/core/{shell.d05266f5.js → shell.a1657f20.js} +4 -4
- package/static/v3/js/core/shell.js +1 -1
- package/static/v3/js/core/{store.34ebd5e6.js → store.204a08b2.js} +1 -1
- package/static/v3/js/core/store.js +1 -1
- package/static/v3/js/views/graph-canvas.17c15d65.js +509 -0
- package/static/v3/js/views/graph-canvas.js +509 -0
- package/static/v3/js/views/{hybrid-search.b22b97e0.js → hybrid-search.2fb63ed9.js} +1 -2
- package/static/v3/js/views/hybrid-search.js +1 -2
- package/static/v3/js/views/knowledge-graph.5e40cbeb.js +509 -0
- package/static/v3/js/views/knowledge-graph.js +326 -54
- package/static/vendor/chart.umd.min.js +20 -0
- package/static/vendor/fonts/inter-latin-300-normal.woff2 +0 -0
- package/static/vendor/fonts/inter-latin-400-normal.woff2 +0 -0
- package/static/vendor/fonts/inter-latin-500-normal.woff2 +0 -0
- package/static/vendor/fonts/inter-latin-600-normal.woff2 +0 -0
- package/static/vendor/fonts/inter-latin-700-normal.woff2 +0 -0
- package/static/vendor/fonts/inter-latin-800-normal.woff2 +0 -0
- package/static/vendor/fonts/inter.css +44 -0
- package/static/vendor/icons/tabler-icons.min.css +4 -0
- package/static/vendor/icons/tabler-icons.woff2 +0 -0
- package/static/vendor/marked.min.js +69 -0
- package/static/workspace.html +2 -2
- package/telegram_bot.py +1 -2
- package/tools/commands.py +4 -2
- package/tools/computer.py +1 -1
- package/tools/documents.py +1 -3
- package/tools/filesystem.py +0 -4
- package/tools/knowledge.py +1 -3
- package/tools/network.py +1 -3
- package/codex_telegram_bot.py +0 -195
- package/docs/assets/v3.4.0/agent-run.png +0 -0
- package/docs/assets/v3.4.0/agents.png +0 -0
- package/docs/assets/v3.4.0/before/chat-before.png +0 -0
- package/docs/assets/v3.4.0/before/files-before.png +0 -0
- package/docs/assets/v3.4.0/chat.png +0 -0
- package/docs/assets/v3.4.0/connect-folder.png +0 -0
- package/docs/assets/v3.4.0/files.png +0 -0
- package/docs/assets/v3.4.0/home.png +0 -0
- package/docs/assets/v3.4.0/hooks-dispatch.png +0 -0
- package/docs/assets/v3.4.0/knowledge-graph.png +0 -0
- package/docs/assets/v3.4.0/local-agent.png +0 -0
- package/docs/assets/v3.4.0/memory.png +0 -0
- package/docs/assets/v3.4.0/settings.png +0 -0
- package/docs/assets/v3.4.0/vision-input.png +0 -0
- package/docs/assets/v3.4.0/workflows.png +0 -0
- package/docs/assets/v3.4.1/e2e_runtime_log.txt +0 -42
- package/docs/assets/v3.4.1/hooks-dispatch.png +0 -0
- package/docs/assets/v3.4.1/local-agent.png +0 -0
- package/docs/images/admin-dashboard.png +0 -0
- package/docs/images/architecture.png +0 -0
- package/docs/images/enterprise.png +0 -0
- package/docs/images/graph.png +0 -0
- package/docs/images/hero.gif +0 -0
- package/docs/images/knowledge-graph.png +0 -0
- package/docs/images/lattice-ai-demo.gif +0 -0
- package/docs/images/lattice-ai-hero.png +0 -0
- package/docs/images/logo.svg +0 -33
- package/docs/images/mobile-responsive.png +0 -0
- package/docs/images/model-recommendation.png +0 -0
- package/docs/images/onboarding.png +0 -0
- package/docs/images/organization.png +0 -0
- package/docs/images/pipeline.png +0 -0
- package/docs/images/screenshot-admin.png +0 -0
- package/docs/images/screenshot-chat.png +0 -0
- package/docs/images/screenshot-graph.png +0 -0
- package/docs/images/skills.png +0 -0
- package/docs/images/workspace-dark.png +0 -0
- package/docs/images/workspace-light.png +0 -0
- package/docs/images/workspace.png +0 -0
- package/requirements.txt +0 -16
- package/static/v3/js/views/knowledge-graph.a14ea7e7.js +0 -237
package/knowledge_graph.py
CHANGED
|
@@ -942,6 +942,31 @@ class KnowledgeGraphStore:
|
|
|
942
942
|
error_message TEXT,
|
|
943
943
|
metadata_json TEXT NOT NULL CHECK (json_valid(metadata_json))
|
|
944
944
|
);
|
|
945
|
+
-- v3.6.0 Knowledge Graph First: per-ingestion provenance trail.
|
|
946
|
+
-- Append-only audit of where every graph node came from, when it
|
|
947
|
+
-- was captured, how it was processed, and whether it was embedded /
|
|
948
|
+
-- linked / used by an agent. get_provenance() returns the latest row.
|
|
949
|
+
CREATE TABLE IF NOT EXISTS ingestion_provenance (
|
|
950
|
+
id TEXT PRIMARY KEY,
|
|
951
|
+
node_id TEXT NOT NULL,
|
|
952
|
+
source_type TEXT NOT NULL,
|
|
953
|
+
source_uri TEXT,
|
|
954
|
+
content_hash TEXT,
|
|
955
|
+
title TEXT,
|
|
956
|
+
pipeline TEXT NOT NULL,
|
|
957
|
+
owner TEXT,
|
|
958
|
+
workspace_id TEXT,
|
|
959
|
+
captured_at TEXT,
|
|
960
|
+
modified_at TEXT,
|
|
961
|
+
embedded INTEGER NOT NULL DEFAULT 0,
|
|
962
|
+
linked INTEGER NOT NULL DEFAULT 0,
|
|
963
|
+
duplicate INTEGER NOT NULL DEFAULT 0,
|
|
964
|
+
agent_used TEXT,
|
|
965
|
+
chunk_count INTEGER NOT NULL DEFAULT 0,
|
|
966
|
+
permissions_json TEXT NOT NULL DEFAULT '{}' CHECK (json_valid(permissions_json)),
|
|
967
|
+
metadata_json TEXT NOT NULL DEFAULT '{}' CHECK (json_valid(metadata_json)),
|
|
968
|
+
created_at TEXT NOT NULL
|
|
969
|
+
);
|
|
945
970
|
CREATE INDEX IF NOT EXISTS idx_nodes_type ON nodes(type);
|
|
946
971
|
CREATE INDEX IF NOT EXISTS idx_edges_from ON edges(from_node);
|
|
947
972
|
CREATE INDEX IF NOT EXISTS idx_edges_to ON edges(to_node);
|
|
@@ -954,6 +979,10 @@ class KnowledgeGraphStore:
|
|
|
954
979
|
CREATE INDEX IF NOT EXISTS idx_vector_embeddings_source ON vector_embeddings(source_node);
|
|
955
980
|
CREATE INDEX IF NOT EXISTS idx_vector_embeddings_model ON vector_embeddings(embedding_model);
|
|
956
981
|
CREATE INDEX IF NOT EXISTS idx_vector_index_operations_requested ON vector_index_operations(requested_at);
|
|
982
|
+
CREATE INDEX IF NOT EXISTS idx_provenance_node ON ingestion_provenance(node_id);
|
|
983
|
+
CREATE INDEX IF NOT EXISTS idx_provenance_source_type ON ingestion_provenance(source_type);
|
|
984
|
+
CREATE INDEX IF NOT EXISTS idx_provenance_hash ON ingestion_provenance(content_hash);
|
|
985
|
+
CREATE INDEX IF NOT EXISTS idx_provenance_created ON ingestion_provenance(created_at);
|
|
957
986
|
"""
|
|
958
987
|
)
|
|
959
988
|
conn.execute(
|
|
@@ -961,6 +990,62 @@ class KnowledgeGraphStore:
|
|
|
961
990
|
("schema_version", str(GRAPH_SCHEMA_VERSION)),
|
|
962
991
|
)
|
|
963
992
|
self._init_v2_schema()
|
|
993
|
+
self._init_fts()
|
|
994
|
+
|
|
995
|
+
# ── FTS5 keyword index (v4) ──────────────────────────────────────────
|
|
996
|
+
# Replaces LIKE '%q%' table scans for keyword search. The trigram
|
|
997
|
+
# tokenizer is required (not just FTS5): unicode61 indexes whole tokens
|
|
998
|
+
# and would silently break Korean substring recall ('프로젝트' must match
|
|
999
|
+
# '프로젝트를'). Without trigram support the store honestly reports
|
|
1000
|
+
# fts_enabled=False and the LIKE path remains authoritative.
|
|
1001
|
+
_FTS_SQL = """
|
|
1002
|
+
CREATE VIRTUAL TABLE IF NOT EXISTS node_fts USING fts5(
|
|
1003
|
+
node_id UNINDEXED, title, summary, metadata, tokenize='trigram'
|
|
1004
|
+
);
|
|
1005
|
+
CREATE TRIGGER IF NOT EXISTS node_fts_ai AFTER INSERT ON nodes BEGIN
|
|
1006
|
+
INSERT INTO node_fts(node_id, title, summary, metadata)
|
|
1007
|
+
VALUES (new.id, new.title, COALESCE(new.summary, ''), new.metadata_json);
|
|
1008
|
+
END;
|
|
1009
|
+
CREATE TRIGGER IF NOT EXISTS node_fts_au AFTER UPDATE ON nodes BEGIN
|
|
1010
|
+
DELETE FROM node_fts WHERE node_id = old.id;
|
|
1011
|
+
INSERT INTO node_fts(node_id, title, summary, metadata)
|
|
1012
|
+
VALUES (new.id, new.title, COALESCE(new.summary, ''), new.metadata_json);
|
|
1013
|
+
END;
|
|
1014
|
+
CREATE TRIGGER IF NOT EXISTS node_fts_ad AFTER DELETE ON nodes BEGIN
|
|
1015
|
+
DELETE FROM node_fts WHERE node_id = old.id;
|
|
1016
|
+
END;
|
|
1017
|
+
"""
|
|
1018
|
+
|
|
1019
|
+
def _init_fts(self) -> None:
    """Create the trigram FTS5 keyword index and record whether it is usable.

    Runs ``self._FTS_SQL`` (the ``node_fts`` virtual table plus its sync
    triggers), back-fills the index from ``nodes`` when it is still empty,
    and publishes the outcome in ``self._fts_enabled``.

    The flag is raised only after the ``with`` block has exited without
    error. Setting it inside the block would leave it ``True`` if leaving
    the connection context itself raised ``sqlite3.OperationalError``
    (presumably a failed commit of the back-fill -- depends on what
    ``_connect`` returns), so the store would advertise an index while
    logging that it is unavailable.
    """
    self._fts_enabled = False
    try:
        with self._connect() as conn:
            conn.executescript(self._FTS_SQL)
            fts_count = conn.execute("SELECT count(*) AS c FROM node_fts").fetchone()["c"]
            if fts_count == 0:
                # Empty index: seed it from the rows already in ``nodes``
                # (the triggers only cover writes made from now on).
                conn.execute(
                    "INSERT INTO node_fts(node_id, title, summary, metadata) "
                    "SELECT id, title, COALESCE(summary, ''), metadata_json FROM nodes"
                )
    except sqlite3.OperationalError as exc:
        # FTS5/trigram not compiled into this SQLite build (or the setup
        # could not be completed). LIKE search stays authoritative; the
        # capability is reported, never faked.
        logging.info("FTS5 trigram index unavailable (%s); keyword search uses LIKE scans.", exc)
    else:
        self._fts_enabled = True
|
|
1035
|
+
|
|
1036
|
+
def _fts_match_ids(self, conn: sqlite3.Connection, query: str, limit: int) -> List[str]:
    """Return node ids matching ``query`` in ``node_fts``, ordered by rank.

    The query is sent as a single double-quoted FTS5 phrase (embedded
    double quotes are doubled). An empty list is returned when the FTS
    index is not enabled on this store, when the query is shorter than
    three characters, or when SQLite raises ``OperationalError``.
    """
    if not getattr(self, "_fts_enabled", False):
        return []
    if len(query) < 3:
        return []

    # Quote the whole query so FTS5 treats it as one literal phrase.
    phrase = '"' + query.replace('"', '""') + '"'
    try:
        cursor = conn.execute(
            'SELECT node_id FROM node_fts WHERE node_fts MATCH ? ORDER BY rank LIMIT ?',
            (phrase, limit),
        )
        hits = cursor.fetchall()
    except sqlite3.OperationalError:
        return []

    matched: List[str] = []
    for hit in hits:
        matched.append(hit["node_id"])
    return matched
|
|
964
1049
|
|
|
965
1050
|
# SQL views that reconstruct the *exact* legacy row shape on top of the
|
|
966
1051
|
# normalized v2 tables, so the read methods run unchanged against either
|
|
@@ -1099,26 +1184,40 @@ class KnowledgeGraphStore:
|
|
|
1099
1184
|
self, conn: sqlite3.Connection, node_id: str, node_type: str, title: str,
|
|
1100
1185
|
summary: Optional[str], metadata_json: Optional[str],
|
|
1101
1186
|
*, created_at: Optional[str] = None, updated_at: Optional[str] = None,
|
|
1187
|
+
owner: Optional[str] = None, workspace_id: Optional[str] = None,
|
|
1188
|
+
visibility: Optional[str] = None,
|
|
1102
1189
|
) -> None:
|
|
1103
1190
|
if KGStoreV2 is None:
|
|
1104
1191
|
return
|
|
1105
1192
|
ts = updated_at or _now()
|
|
1106
1193
|
norm_type = NodeType.from_legacy(node_type).value if NodeType is not None else node_type
|
|
1194
|
+
# Scope resolution: explicit param > metadata hints > legacy-global.
|
|
1195
|
+
# 'legacy' (not 'private') marks unscoped rows — the column default
|
|
1196
|
+
# must never silently privatize previously machine-shared data.
|
|
1197
|
+
meta = _safe_loads(metadata_json) if metadata_json else {}
|
|
1198
|
+
owner = owner or meta.get("user_email") or meta.get("owner") or None
|
|
1199
|
+
workspace_id = workspace_id or meta.get("workspace_id") or None
|
|
1200
|
+
visibility = visibility or ("legacy" if workspace_id is None else "workspace")
|
|
1107
1201
|
try:
|
|
1108
1202
|
conn.execute(
|
|
1109
1203
|
"""
|
|
1110
1204
|
INSERT INTO nodes_v2(id, type, legacy_type, label, summary, attrs,
|
|
1111
|
-
owner_id,
|
|
1112
|
-
importance_score)
|
|
1113
|
-
VALUES (?, ?, ?, ?, ?, ?,
|
|
1205
|
+
owner_id, workspace_id, visibility,
|
|
1206
|
+
created_at, updated_at, importance_score)
|
|
1207
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 0.0)
|
|
1114
1208
|
ON CONFLICT(id) DO UPDATE SET
|
|
1115
1209
|
type=excluded.type, legacy_type=excluded.legacy_type,
|
|
1116
1210
|
label=excluded.label, summary=excluded.summary,
|
|
1117
|
-
attrs=excluded.attrs, updated_at=excluded.updated_at
|
|
1211
|
+
attrs=excluded.attrs, updated_at=excluded.updated_at,
|
|
1212
|
+
owner_id=COALESCE(excluded.owner_id, nodes_v2.owner_id),
|
|
1213
|
+
workspace_id=COALESCE(excluded.workspace_id, nodes_v2.workspace_id),
|
|
1214
|
+
visibility=CASE WHEN excluded.visibility != 'legacy'
|
|
1215
|
+
THEN excluded.visibility
|
|
1216
|
+
ELSE nodes_v2.visibility END
|
|
1118
1217
|
""",
|
|
1119
1218
|
(node_id, norm_type, node_type, title, summary,
|
|
1120
1219
|
metadata_json if metadata_json is not None else "{}",
|
|
1121
|
-
created_at or ts, ts),
|
|
1220
|
+
owner, workspace_id, visibility, created_at or ts, ts),
|
|
1122
1221
|
)
|
|
1123
1222
|
except Exception as ex:
|
|
1124
1223
|
logging.debug("knowledge_graph: v2 node projection skipped (%s): %s", node_id, ex)
|
|
@@ -1140,8 +1239,7 @@ class KnowledgeGraphStore:
|
|
|
1140
1239
|
INSERT INTO edges_v2(id, source, target, type, legacy_type, weight,
|
|
1141
1240
|
confidence, evidence, metadata, created_by, created_at)
|
|
1142
1241
|
VALUES (?, ?, ?, ?, ?, ?, ?, '[]', ?, 'legacy', ?)
|
|
1143
|
-
ON CONFLICT(source, target, legacy_type) DO UPDATE SET
|
|
1144
|
-
type=excluded.type,
|
|
1242
|
+
ON CONFLICT(source, target, type, legacy_type) DO UPDATE SET
|
|
1145
1243
|
weight=max(edges_v2.weight, excluded.weight),
|
|
1146
1244
|
confidence=excluded.confidence,
|
|
1147
1245
|
metadata=excluded.metadata
|
|
@@ -1149,9 +1247,125 @@ class KnowledgeGraphStore:
|
|
|
1149
1247
|
(eid, from_node, to_node, norm_type, edge_type, float(weight),
|
|
1150
1248
|
confidence, meta_str, created_at or _now()),
|
|
1151
1249
|
)
|
|
1250
|
+
# Temporal record: every observation of this relationship is kept
|
|
1251
|
+
# (the UNIQUE upsert + weight=max alone would erase recurrence).
|
|
1252
|
+
row = conn.execute(
|
|
1253
|
+
"SELECT id FROM edges_v2 WHERE source=? AND target=? AND type=? AND legacy_type=?",
|
|
1254
|
+
(from_node, to_node, norm_type, edge_type),
|
|
1255
|
+
).fetchone()
|
|
1256
|
+
if row is not None:
|
|
1257
|
+
conn.execute(
|
|
1258
|
+
"INSERT INTO edge_occurrences(edge_id, observed_at, weight, source) VALUES (?, ?, ?, ?)",
|
|
1259
|
+
(row["id"], created_at or _now(), float(weight),
|
|
1260
|
+
_safe_loads(meta_str).get("source")),
|
|
1261
|
+
)
|
|
1152
1262
|
except Exception as ex:
|
|
1153
1263
|
logging.debug("knowledge_graph: v2 edge projection skipped (%s->%s): %s", from_node, to_node, ex)
|
|
1154
1264
|
|
|
1265
|
+
def curate(self, *, max_documents: int = 200, max_new_nodes: int = 8) -> Dict[str, Any]:
    """Run the graph curator once and persist the topics it promotes.

    Steps, in order:

    1. Read the most recently updated content nodes (at most
       ``max_documents``, clamped to 1..2000) and the lower-cased titles of
       all existing ``Topic`` / ``Concept`` nodes.
    2. Hand both to ``auto_build_graph_overlay`` together with
       ``max_new_nodes`` (clamped to 1..50).
    3. For every promotion, upsert a ``Topic`` node with id
       ``topic:<slug(label)>``, write its importance into
       ``nodes_v2.importance_score``, and add a ``MENTIONS`` edge
       (weight 0.6) from each of the first ten listed sources that was part
       of the scanned set.

    Returns a dict with ``status``, ``documents_scanned``,
    ``candidates_total``, ``promoted`` (one entry per topic), the first 50
    ``skipped`` entries and ``skipped_total``.
    """
    from latticeai.core.graph_curator import auto_build_graph_overlay

    content_types = (
        "Document", "File", "CodeFile", "Message", "AIResponse",
        "Chat", "Page", "Slide", "Spreadsheet",
    )
    # Node types reported to the curator as "file"; everything else is "chat".
    file_kinds = {"Document", "File", "CodeFile", "Spreadsheet"}

    node_table, _ = self._read_tables()
    with self._connect() as conn:
        marks = ",".join(["?"] * len(content_types))
        recent = conn.execute(
            f"""
            SELECT id, type, title, summary FROM {node_table}
            WHERE type IN ({marks})
            ORDER BY updated_at DESC, id ASC LIMIT ?
            """,
            (*content_types, max(1, min(int(max_documents), 2000))),
        ).fetchall()
        existing_labels = set()
        for known in conn.execute(
            f"SELECT title FROM {node_table} WHERE type IN ('Topic', 'Concept')"
        ).fetchall():
            existing_labels.add(str(known["title"] or "").strip().lower())

    documents = []
    for item in recent:
        if item["type"] in file_kinds:
            kind = "file"
        else:
            kind = "chat"
        documents.append({
            "id": item["id"],
            "text": f"{item['title']} {item['summary'] or ''}",
            "kind": kind,
        })

    overlay = auto_build_graph_overlay(
        documents,
        existing_node_labels=existing_labels,
        max_new_nodes=max(1, min(int(max_new_nodes), 50)),
    )

    # Only nodes that were actually scanned may receive a MENTIONS edge.
    scanned_ids = {item["id"] for item in recent}
    promoted: List[Dict[str, Any]] = []
    with self._connect() as conn:
        for promo in overlay["promotions"]:
            label = promo["label"]
            topic_id = f"topic:{_slug(label)}"
            self._upsert_node(
                conn, topic_id, "Topic", label,
                metadata={
                    "curated": True,
                    "importance": promo["importance"],
                    "aliases": promo["aliases"],
                    "source": "graph_curator",
                },
            )
            conn.execute(
                "UPDATE nodes_v2 SET importance_score=? WHERE id=?",
                (float(promo["importance"]), topic_id),
            )
            linked = 0
            for source_id in promo["sources"][:10]:
                if source_id not in scanned_ids:
                    continue
                self._upsert_edge(
                    conn, source_id, topic_id, "MENTIONS",
                    weight=0.6, metadata={"source": "graph_curator"},
                )
                linked += 1
            promoted.append({
                "node_id": topic_id,
                "label": label,
                "importance": promo["importance"],
                "linked_sources": linked,
            })

    skipped = overlay["skipped"]
    return {
        "status": "ok",
        "documents_scanned": len(documents),
        "candidates_total": overlay["candidates_total"],
        "promoted": promoted,
        "skipped": skipped[:50],
        "skipped_total": len(skipped),
    }
|
|
1351
|
+
|
|
1352
|
+
def mark_superseded(self, old_node_id: str, new_node_id: str) -> Dict[str, Any]:
    """Point ``old_node_id`` at its replacement ``new_node_id``.

    Both ids must already exist in ``nodes_v2``; the old id is checked
    first. The old row is not deleted: only its ``superseded_by`` and
    ``updated_at`` columns are written.

    Raises:
        FileNotFoundError: carrying the first id that has no ``nodes_v2`` row.

    Returns:
        ``{"status": "ok", "node_id": old_node_id, "superseded_by": new_node_id}``
    """
    with self._connect() as conn:
        for candidate in (old_node_id, new_node_id):
            found = conn.execute("SELECT 1 FROM nodes_v2 WHERE id=?", (candidate,)).fetchone()
            if found:
                continue
            raise FileNotFoundError(candidate)
        stamp = _now()
        conn.execute(
            "UPDATE nodes_v2 SET superseded_by=?, updated_at=? WHERE id=?",
            (new_node_id, stamp, old_node_id),
        )
    result: Dict[str, Any] = {"status": "ok"}
    result["node_id"] = old_node_id
    result["superseded_by"] = new_node_id
    return result
|
|
1368
|
+
|
|
1155
1369
|
def _v2_delete_nodes(self, conn: sqlite3.Connection, ids) -> None:
|
|
1156
1370
|
"""Mirror legacy node deletions into v2 (edges_v2 cascade on the FK)."""
|
|
1157
1371
|
if KGStoreV2 is None:
|
|
@@ -1212,6 +1426,9 @@ class KnowledgeGraphStore:
|
|
|
1212
1426
|
summary: str = "",
|
|
1213
1427
|
metadata: Optional[Dict[str, Any]] = None,
|
|
1214
1428
|
raw: Optional[Dict[str, Any]] = None,
|
|
1429
|
+
owner: Optional[str] = None,
|
|
1430
|
+
workspace_id: Optional[str] = None,
|
|
1431
|
+
visibility: Optional[str] = None,
|
|
1215
1432
|
) -> str:
|
|
1216
1433
|
now = _now()
|
|
1217
1434
|
# Canonical stored values, computed once and shared with the v2
|
|
@@ -1234,7 +1451,8 @@ class KnowledgeGraphStore:
|
|
|
1234
1451
|
)
|
|
1235
1452
|
# dual-write: project into the v2 graph on the same transaction
|
|
1236
1453
|
self._v2_project_node(conn, node_id, node_type, title_s, summary_s, meta_json,
|
|
1237
|
-
created_at=now, updated_at=now
|
|
1454
|
+
created_at=now, updated_at=now,
|
|
1455
|
+
owner=owner, workspace_id=workspace_id, visibility=visibility)
|
|
1238
1456
|
if node_type != "Chunk":
|
|
1239
1457
|
self._upsert_vector_item(
|
|
1240
1458
|
conn,
|
|
@@ -1255,6 +1473,16 @@ class KnowledgeGraphStore:
|
|
|
1255
1473
|
weight: float = 1.0,
|
|
1256
1474
|
metadata: Optional[Dict[str, Any]] = None,
|
|
1257
1475
|
) -> str:
|
|
1476
|
+
# v4 write door: every new edge stores the canonical EdgeType value —
|
|
1477
|
+
# free-string types (e.g. '포함함', '언급함') are normalized here, so no
|
|
1478
|
+
# caller can mint new legacy taxonomy. The original label survives in
|
|
1479
|
+
# metadata.legacy_label for traceability.
|
|
1480
|
+
if EdgeType is not None:
|
|
1481
|
+
canonical = EdgeType.from_legacy(edge_type).value
|
|
1482
|
+
if canonical != edge_type:
|
|
1483
|
+
metadata = dict(metadata or {})
|
|
1484
|
+
metadata.setdefault("legacy_label", edge_type)
|
|
1485
|
+
edge_type = canonical
|
|
1258
1486
|
edge_id = f"edge:{_sha256_text(f'{from_node}|{edge_type}|{to_node}')[:24]}"
|
|
1259
1487
|
now = _now()
|
|
1260
1488
|
meta_json = _json(metadata) # canonical string shared with the projection
|
|
@@ -2703,12 +2931,20 @@ class KnowledgeGraphStore:
|
|
|
2703
2931
|
uploader: Optional[str] = None,
|
|
2704
2932
|
conversation_id: Optional[str] = None,
|
|
2705
2933
|
extracted: Optional[Dict[str, Any]] = None,
|
|
2934
|
+
source_type: Optional[str] = None,
|
|
2935
|
+
source_uri: Optional[str] = None,
|
|
2936
|
+
captured_at: Optional[str] = None,
|
|
2937
|
+
modified_at: Optional[str] = None,
|
|
2938
|
+
owner: Optional[str] = None,
|
|
2939
|
+
workspace_id: Optional[str] = None,
|
|
2940
|
+
permissions: Optional[Dict[str, Any]] = None,
|
|
2706
2941
|
) -> Dict[str, Any]:
|
|
2707
2942
|
path = Path(path)
|
|
2708
2943
|
data = path.read_bytes()
|
|
2709
2944
|
digest = _sha256_bytes(data)
|
|
2710
2945
|
ext = path.suffix.lower()
|
|
2711
2946
|
filename = original_filename or path.name
|
|
2947
|
+
captured_at = captured_at or _now()
|
|
2712
2948
|
blob_path = self.blob_dir / digest[:2] / f"{digest}{ext}"
|
|
2713
2949
|
blob_path.parent.mkdir(parents=True, exist_ok=True)
|
|
2714
2950
|
if not blob_path.exists():
|
|
@@ -2723,8 +2959,16 @@ class KnowledgeGraphStore:
|
|
|
2723
2959
|
"mime_type": mime_type,
|
|
2724
2960
|
"bytes": len(data),
|
|
2725
2961
|
"sha256": digest,
|
|
2962
|
+
"content_hash": digest,
|
|
2726
2963
|
"blob_path": str(blob_path),
|
|
2727
2964
|
"uploader": uploader,
|
|
2965
|
+
"owner": owner or uploader,
|
|
2966
|
+
"workspace_id": workspace_id,
|
|
2967
|
+
"permissions": permissions or {},
|
|
2968
|
+
"source_type": source_type or "file",
|
|
2969
|
+
"source_uri": source_uri or str(path),
|
|
2970
|
+
"captured_at": captured_at,
|
|
2971
|
+
"modified_at": modified_at,
|
|
2728
2972
|
"conversation_id": conversation_id,
|
|
2729
2973
|
"extracted": {k: v for k, v in (extracted or {}).items() if k != "content"},
|
|
2730
2974
|
"structure": doc_meta,
|
|
@@ -2732,8 +2976,11 @@ class KnowledgeGraphStore:
|
|
|
2732
2976
|
full_text = f"{filename}\n{text}"
|
|
2733
2977
|
concepts = _extract_concepts(full_text, limit=15)
|
|
2734
2978
|
triples = _extract_triples(full_text, concepts)
|
|
2979
|
+
chunk_ids: List[str] = []
|
|
2980
|
+
source_node_id: Optional[str] = None
|
|
2735
2981
|
|
|
2736
2982
|
with self._connect() as conn:
|
|
2983
|
+
duplicate = self._node_exists(conn, file_id)
|
|
2737
2984
|
# ── Document 노드 (점: 명사 — 파일) ────────────────────────────────
|
|
2738
2985
|
self._upsert_node(
|
|
2739
2986
|
conn, file_id, "Document", filename,
|
|
@@ -2742,6 +2989,15 @@ class KnowledgeGraphStore:
|
|
|
2742
2989
|
)
|
|
2743
2990
|
self._ingest_structure_nodes(conn, file_id, filename, doc_meta)
|
|
2744
2991
|
|
|
2992
|
+
# ── SOURCE 노드 + indexed_from (v3.6.0, source_type 지정 시) ──────
|
|
2993
|
+
if source_type:
|
|
2994
|
+
source_node_id = self._attach_source_node(
|
|
2995
|
+
conn, file_id,
|
|
2996
|
+
source_type=source_type, source_uri=source_uri or str(path),
|
|
2997
|
+
title=filename, content_hash=digest, captured_at=captured_at,
|
|
2998
|
+
extra={"owner": owner or uploader, "workspace_id": workspace_id, "ext": ext},
|
|
2999
|
+
)
|
|
3000
|
+
|
|
2745
3001
|
# ── Person 노드 + 동사형 엣지 ─────────────────────────────────────
|
|
2746
3002
|
if uploader:
|
|
2747
3003
|
person_id = f"person:{_slug(uploader)}"
|
|
@@ -2762,6 +3018,7 @@ class KnowledgeGraphStore:
|
|
|
2762
3018
|
# ── RAG chunks (검색용, 그래프 비표시) ────────────────────────────
|
|
2763
3019
|
for index, chunk in enumerate(_chunks(text)):
|
|
2764
3020
|
chunk_id = f"chunk:{_sha256_text(f'{file_id}:{index}:{chunk}')[:24]}"
|
|
3021
|
+
chunk_ids.append(chunk_id)
|
|
2765
3022
|
self._upsert_node(
|
|
2766
3023
|
conn, chunk_id, "Chunk",
|
|
2767
3024
|
f"{filename} chunk {index + 1}",
|
|
@@ -2816,7 +3073,18 @@ class KnowledgeGraphStore:
|
|
|
2816
3073
|
# 선: Document가 Task/Decision을 "포함함"
|
|
2817
3074
|
self._upsert_edge(conn, file_id, sem_id, "포함함", weight=0.9)
|
|
2818
3075
|
|
|
2819
|
-
return {
|
|
3076
|
+
return {
|
|
3077
|
+
"node_id": file_id,
|
|
3078
|
+
"type": "Document",
|
|
3079
|
+
"sha256": digest,
|
|
3080
|
+
"content_hash": digest,
|
|
3081
|
+
"source_node_id": source_node_id,
|
|
3082
|
+
"chunk_ids": chunk_ids,
|
|
3083
|
+
"chunk_count": len(chunk_ids),
|
|
3084
|
+
"duplicate": duplicate,
|
|
3085
|
+
"captured_at": captured_at,
|
|
3086
|
+
"metadata": metadata,
|
|
3087
|
+
}
|
|
2820
3088
|
|
|
2821
3089
|
def ingest_event(
|
|
2822
3090
|
self,
|
|
@@ -2854,6 +3122,513 @@ class KnowledgeGraphStore:
|
|
|
2854
3122
|
self._upsert_edge(conn, person_id, event_id, "triggered", metadata={"event_type": event_type})
|
|
2855
3123
|
return {"node_id": event_id, "type": event_type}
|
|
2856
3124
|
|
|
3125
|
+
# ── v3.6.0 Knowledge Graph First: unified source ingestion + provenance ──────
|
|
3126
|
+
def _node_exists(self, conn: sqlite3.Connection, node_id: str) -> bool:
    """Return True when the ``nodes`` table has a row whose id is ``node_id``."""
    cursor = conn.execute("SELECT 1 FROM nodes WHERE id = ?", (node_id,))
    return cursor.fetchone() is not None
|
|
3129
|
+
|
|
3130
|
+
def node_is_embedded(self, node_id: str) -> bool:
    """Report whether ``vector_embeddings`` holds at least one row for ``node_id``.

    The lookup matches on the ``item_id`` column and stops at the first hit.
    """
    with self._connect() as conn:
        hit = conn.execute(
            "SELECT 1 FROM vector_embeddings WHERE item_id = ? LIMIT 1",
            (node_id,),
        ).fetchone()
    if hit is None:
        return False
    return True
|
|
3138
|
+
|
|
3139
|
+
def _attach_source_node(
    self,
    conn: sqlite3.Connection,
    content_node_id: str,
    *,
    source_type: str,
    source_uri: Optional[str] = None,
    title: Optional[str] = None,
    content_hash: Optional[str] = None,
    captured_at: Optional[str] = None,
    extra: Optional[Dict[str, Any]] = None,
) -> str:
    """Upsert the ``Source`` node for a content node and link the two.

    The source id is ``source:`` plus the first 24 characters of the hash of
    ``"{source_type}|{key}"``, where ``key`` is the first truthy value among
    ``source_uri``, ``content_hash`` and ``content_node_id``. Re-ingesting the
    same origin therefore resolves to the same source id.

    Args:
        conn: Open connection the upserts are issued on.
        content_node_id: Id of the content node that gets the
            ``indexed_from`` edge pointing at the source.
        source_type: Origin kind; stored in the node and edge metadata.
        source_uri: Origin location, when there is one.
        title: Preferred label for the source node.
        content_hash: Hash of the ingested content.
        captured_at: Capture timestamp; ``_now()`` is used when falsy.
        extra: Additional metadata, merged last so it wins on key collisions.

    Returns:
        The id of the source node.
    """
    identity = source_uri or content_hash or content_node_id
    digest = _sha256_text(f"{source_type}|{identity}")
    source_id = f"source:{digest[:24]}"

    source_meta: Dict[str, Any] = {
        "source_type": source_type,
        "source_uri": source_uri,
        "content_hash": content_hash,
        "captured_at": captured_at or _now(),
    }
    # Caller-supplied extras go in last and override the base keys above.
    source_meta.update(extra or {})

    self._upsert_node(
        conn,
        source_id,
        "Source",
        title or source_uri or source_type,
        summary=str(source_uri or title or source_type)[:400],
        metadata=source_meta,
    )
    # Edge: the content node was "indexed from" this source.
    self._upsert_edge(
        conn,
        content_node_id,
        source_id,
        "indexed_from",
        weight=1.0,
        metadata={"source_type": source_type},
    )
    return source_id
|
|
3177
|
+
|
|
3178
|
+
def ingest_source(
    self,
    *,
    source_type: str,
    title: str,
    text: str,
    source_uri: Optional[str] = None,
    owner: Optional[str] = None,
    workspace_id: Optional[str] = None,
    permissions: Optional[Dict[str, Any]] = None,
    captured_at: Optional[str] = None,
    modified_at: Optional[str] = None,
    conversation_id: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Ingest non-file text (URL, browser tab, note, raw text) into the graph.

    In one connection this upserts, in order: a ``Document`` content node
    whose id is derived from the hash of ``source_type|source_uri|text``; a
    ``Source`` node linked via ``indexed_from``; an optional ``Person`` owner
    and ``Chat`` conversation node; one ``Chunk`` node per text chunk; the
    extracted concept nodes and their relations; and the Task/Decision style
    items returned by ``_semantic_items``.

    Args:
        source_type: Origin kind; falls back to ``"text"`` when falsy.
        title: Display title; falls back to ``source_uri`` then ``source_type``.
        text: Body text to chunk and analyse.
        source_uri: Origin location, when there is one.
        owner: Owner identifier; also stored as the Person node's ``email``.
        workspace_id: Workspace scope recorded in the metadata.
        permissions: Permission mapping recorded in the metadata.
        captured_at: Capture timestamp; ``_now()`` is used when falsy.
        modified_at: Source modification timestamp.
        conversation_id: Conversation to link to the content node.
        metadata: Extra node metadata; merged last, so it overrides the
            computed keys on collision.

    Returns:
        A dict with ``node_id``, ``type``, ``source_node_id``,
        ``content_hash``, ``chunk_ids``, ``chunk_count``, ``duplicate``
        (whether the content node already existed before this call) and
        ``captured_at``.
    """
    source_type = str(source_type or "text")
    text = str(text or "")
    title = _clean_text(str(title or source_uri or source_type))[:240] or source_type
    captured_at = captured_at or _now()

    content_hash = _sha256_text(f"{source_type}|{source_uri or ''}|{text}")
    content_id = f"webdoc:{content_hash[:24]}"
    full_text = f"{title}\n{text}"

    node_meta: Dict[str, Any] = {
        "source_type": source_type,
        "source_uri": source_uri,
        "content_hash": content_hash,
        "title": title,
        "captured_at": captured_at,
        "modified_at": modified_at,
        "owner": owner,
        "workspace_id": workspace_id,
        "permissions": permissions or {},
        "chars": len(text),
    }
    # Caller metadata is merged last and wins on key collisions.
    node_meta.update(metadata or {})

    concepts = _extract_concepts(full_text, limit=15)
    triples = _extract_triples(full_text, concepts)
    chunk_ids: List[str] = []

    with self._connect() as conn:
        # Must be checked before the upsert below creates the node.
        duplicate = self._node_exists(conn, content_id)

        # Content node (the document itself).
        self._upsert_node(
            conn,
            content_id,
            "Document",
            title,
            summary=(text or title)[:500],
            metadata=node_meta,
            raw=node_meta,
        )

        # Source node plus the indexed_from edge (origin tracking).
        source_node_id = self._attach_source_node(
            conn,
            content_id,
            source_type=source_type,
            source_uri=source_uri,
            title=title,
            content_hash=content_hash,
            captured_at=captured_at,
            extra={"owner": owner, "workspace_id": workspace_id},
        )

        # Owner (Person) node and its edge to the content.
        if owner:
            person_id = f"person:{_slug(owner)}"
            self._upsert_node(conn, person_id, "Person", owner, metadata={"email": owner})
            self._upsert_edge(conn, person_id, content_id, "업로드함", weight=1.0)

        # Conversation link.
        if conversation_id:
            conv_id = f"conversation:{_slug(conversation_id)}"
            self._upsert_node(conn, conv_id, "Chat", conversation_id)
            self._upsert_edge(conn, conv_id, content_id, "언급함", weight=0.8)

        # RAG chunks.
        for position, piece in enumerate(_chunks(text)):
            piece_digest = _sha256_text(f"{content_id}:{position}:{piece}")
            chunk_id = f"chunk:{piece_digest[:24]}"
            chunk_ids.append(chunk_id)
            self._upsert_node(
                conn,
                chunk_id,
                "Chunk",
                f"{title} chunk {position + 1}",
                summary=piece[:500],
                metadata={"index": position, "source_node": content_id},
            )
            self._upsert_chunk(
                conn,
                chunk_id=chunk_id,
                source_node=content_id,
                text=piece,
                metadata={"index": position, "source_node": content_id},
            )
            self._upsert_edge(conn, content_id, chunk_id, "포함함")

        # Concept / Feature / Error / Code nodes and their edges.
        concept_ids: Dict[str, str] = {}
        for concept in concepts:
            node_type = _classify_node_type(concept, full_text)
            concept_node_id = f"{node_type.lower()}:{_slug(concept)}"
            concept_ids[concept.lower()] = concept_node_id
            self._upsert_node(
                conn,
                concept_node_id,
                node_type,
                concept,
                metadata={"auto_extracted": True, "source_type": source_type},
            )
            self._upsert_edge(conn, content_id, concept_node_id, "포함함", weight=0.8)

        # Relations between extracted concepts; skipped unless both ends
        # resolved to two different concept nodes.
        for triple in triples:
            subject_id = concept_ids.get(triple["subject"].lower())
            object_id = concept_ids.get(triple["object"].lower())
            if not subject_id or not object_id or subject_id == object_id:
                continue
            self._upsert_edge(
                conn,
                subject_id,
                object_id,
                triple["relation"],
                weight=1.0,
                metadata={"context": triple.get("context", "")[:240]},
            )

        # Task / Decision nodes.
        for item in _semantic_items(text):
            sem_type = item["type"]
            sem_title = item["title"]
            sem_digest = _sha256_text(f"{content_id}:{sem_type}:{sem_title}")
            sem_id = f"{sem_type.lower()}:{sem_digest[:24]}"
            self._upsert_node(
                conn,
                sem_id,
                sem_type,
                sem_title,
                summary=item["summary"],
                metadata={"auto_extracted": True, "source_node": content_id},
                raw=item,
            )
            self._upsert_edge(conn, content_id, sem_id, "포함함", weight=0.9)

    return {
        "node_id": content_id,
        "type": "Document",
        "source_node_id": source_node_id,
        "content_hash": content_hash,
        "chunk_ids": chunk_ids,
        "chunk_count": len(chunk_ids),
        "duplicate": duplicate,
        "captured_at": captured_at,
    }
|
|
3295
|
+
|
|
3296
|
+
def record_provenance(
    self,
    *,
    node_id: str,
    source_type: str,
    pipeline: str = "unified-ingestion",
    source_uri: Optional[str] = None,
    content_hash: Optional[str] = None,
    title: Optional[str] = None,
    owner: Optional[str] = None,
    workspace_id: Optional[str] = None,
    captured_at: Optional[str] = None,
    modified_at: Optional[str] = None,
    embedded: bool = False,
    linked: bool = False,
    duplicate: bool = False,
    agent_used: Optional[str] = None,
    chunk_count: int = 0,
    permissions: Optional[Dict[str, Any]] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Write one ``ingestion_provenance`` row for an ingested node.

    The row id is ``prov:`` plus the first 24 characters of the hash of
    ``"{node_id}|{content_hash}|{now}"``, and the row is written with
    ``INSERT OR REPLACE``. Boolean flags are stored as 0/1; ``permissions``
    and ``metadata`` are serialised with ``_json``.

    NOTE(review): two calls for the same node and content hash that observe
    the same ``_now()`` value would compute the same id, so the later row
    would presumably replace the earlier one — confirm ``_now()`` resolution
    and the table's key constraint.

    Returns:
        ``{"id": ..., "node_id": ..., "created_at": ...}`` for the new row.
    """
    created_at = _now()
    digest = _sha256_text(f"{node_id}|{content_hash or ''}|{created_at}")
    prov_id = f"prov:{digest[:24]}"

    insert_sql = """
        INSERT OR REPLACE INTO ingestion_provenance(
            id, node_id, source_type, source_uri, content_hash, title, pipeline,
            owner, workspace_id, captured_at, modified_at, embedded, linked,
            duplicate, agent_used, chunk_count, permissions_json, metadata_json, created_at)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """
    with self._connect() as conn:
        values = (
            prov_id,
            node_id,
            source_type,
            source_uri,
            content_hash,
            title,
            pipeline,
            owner,
            workspace_id,
            captured_at,
            modified_at,
            int(bool(embedded)),
            int(bool(linked)),
            int(bool(duplicate)),
            agent_used,
            int(chunk_count or 0),
            _json(permissions or {}),
            _json(metadata or {}),
            created_at,
        )
        conn.execute(insert_sql, values)
    return {"id": prov_id, "node_id": node_id, "created_at": created_at}
|
|
3338
|
+
|
|
3339
|
+
@staticmethod
def _provenance_row(row: sqlite3.Row) -> Dict[str, Any]:
    """Convert one ``ingestion_provenance`` row into a plain dict.

    Text columns are copied as-is, the 0/1 flag columns become ``bool``, and
    the two ``*_json`` columns are decoded with ``_safe_loads`` and exposed
    as ``permissions`` / ``metadata``.
    """
    record: Dict[str, Any] = {
        column: row[column]
        for column in (
            "id",
            "node_id",
            "source_type",
            "source_uri",
            "content_hash",
            "title",
            "pipeline",
            "owner",
            "workspace_id",
            "captured_at",
            "modified_at",
        )
    }
    for flag in ("embedded", "linked", "duplicate"):
        record[flag] = bool(row[flag])
    record["agent_used"] = row["agent_used"]
    record["chunk_count"] = row["chunk_count"]
    record["permissions"] = _safe_loads(row["permissions_json"])
    record["metadata"] = _safe_loads(row["metadata_json"])
    record["created_at"] = row["created_at"]
    return record
|
|
3362
|
+
|
|
3363
|
+
def get_provenance(self, node_id: str) -> Optional[Dict[str, Any]]:
    """Fetch the newest provenance record for ``node_id``.

    "Newest" is decided by ``created_at`` descending, with ``rowid``
    descending as the tie-breaker.

    Returns:
        The row converted by ``_provenance_row``, or ``None`` when the node
        has no provenance rows.
    """
    sql = (
        "SELECT * FROM ingestion_provenance WHERE node_id = ? "
        "ORDER BY created_at DESC, rowid DESC LIMIT 1"
    )
    with self._connect() as conn:
        latest = conn.execute(sql, (node_id,)).fetchone()
    if not latest:
        return None
    return self._provenance_row(latest)
|
|
3372
|
+
|
|
3373
|
+
def list_provenance(self, *, limit: int = 100, source_type: Optional[str] = None) -> Dict[str, Any]:
    """List recent provenance records, newest first.

    Args:
        limit: Maximum number of rows; a falsy value means 100, and the
            result is clamped to the range 1..1000.
        source_type: When truthy, only rows with this ``source_type`` are
            returned.

    Returns:
        ``{"items": [...], "count": n}`` where each item is produced by
        ``_provenance_row``.
    """
    limit = min(int(limit or 100), 1000)
    if limit < 1:
        limit = 1

    ordering = "ORDER BY created_at DESC, rowid DESC LIMIT ?"
    if source_type:
        sql = "SELECT * FROM ingestion_provenance WHERE source_type = ? " + ordering
        params = (source_type, limit)
    else:
        sql = "SELECT * FROM ingestion_provenance " + ordering
        params = (limit,)

    with self._connect() as conn:
        records = conn.execute(sql, params).fetchall()
    items = [self._provenance_row(record) for record in records]
    return {"items": items, "count": len(records)}
|
|
3390
|
+
|
|
3391
|
+
def provenance_coverage(self) -> Dict[str, Any]:
    """Report how many graph nodes have at least one provenance record.

    The node table name comes from ``self._read_tables()``. A node counts as
    covered when its id appears as a ``node_id`` in ``ingestion_provenance``.
    Nodes ingested before provenance existed legitimately count as uncovered.

    Returns:
        A dict with:
        ``total_nodes``: row count of the node table.
        ``nodes_with_provenance``: number of covered nodes.
        ``coverage_ratio``: covered / total rounded to 4 places, or ``None``
        when the node table is empty.
        ``uncovered_by_type``: node type -> uncovered count, for the 20 types
        with the most uncovered nodes.
        ``provenance_by_source_type``: source type -> provenance row count.
    """
    nt, _ = self._read_tables()
    # NULL node_ids are filtered out of the subquery on purpose: in SQL,
    # ``x NOT IN (subquery)`` is never true once the subquery yields a NULL,
    # which would make every node look covered in ``uncovered_by_type``.
    provenance_ids = (
        "SELECT DISTINCT node_id FROM ingestion_provenance WHERE node_id IS NOT NULL"
    )
    with self._connect() as conn:
        total = conn.execute(f"SELECT COUNT(*) FROM {nt}").fetchone()[0]
        covered = conn.execute(
            f"SELECT COUNT(*) FROM {nt} WHERE id IN ({provenance_ids})"
        ).fetchone()[0]
        uncovered_by_type = {
            row["type"]: row["c"]
            for row in conn.execute(
                f"""
                SELECT type, COUNT(*) AS c FROM {nt}
                WHERE id NOT IN ({provenance_ids})
                GROUP BY type ORDER BY c DESC LIMIT 20
                """
            ).fetchall()
        }
        by_source = {
            row["source_type"]: row["c"]
            for row in conn.execute(
                "SELECT source_type, COUNT(*) AS c FROM ingestion_provenance GROUP BY source_type"
            ).fetchall()
        }
    return {
        "total_nodes": total,
        "nodes_with_provenance": covered,
        "coverage_ratio": round(covered / total, 4) if total else None,
        "uncovered_by_type": uncovered_by_type,
        "provenance_by_source_type": by_source,
    }
|
|
3425
|
+
|
|
3426
|
+
def provenance_stats(self) -> Dict[str, Any]:
    """Summarise the ``ingestion_provenance`` table for the status surface.

    Returns:
        A dict with ``total`` (row count), ``by_source_type`` (source type ->
        row count), ``embedded`` (rows with ``embedded = 1``), ``duplicates``
        (rows with ``duplicate = 1``) and ``last_ingested_at`` (the greatest
        ``created_at``, or ``None`` when the table is empty).
    """
    with self._connect() as conn:

        def count(sql: str) -> int:
            # Every counting query aliases its single column as "c".
            return conn.execute(sql).fetchone()["c"]

        total = count("SELECT COUNT(*) AS c FROM ingestion_provenance")

        by_source = {}
        grouped = conn.execute(
            "SELECT source_type, COUNT(*) AS c FROM ingestion_provenance GROUP BY source_type"
        ).fetchall()
        for group in grouped:
            by_source[group["source_type"]] = group["c"]

        embedded = count("SELECT COUNT(*) AS c FROM ingestion_provenance WHERE embedded = 1")
        duplicates = count("SELECT COUNT(*) AS c FROM ingestion_provenance WHERE duplicate = 1")
        newest = conn.execute(
            "SELECT created_at FROM ingestion_provenance ORDER BY created_at DESC LIMIT 1"
        ).fetchone()

    last_ingested_at = newest["created_at"] if newest else None
    return {
        "total": total,
        "by_source_type": by_source,
        "embedded": embedded,
        "duplicates": duplicates,
        "last_ingested_at": last_ingested_at,
    }
|
|
3452
|
+
|
|
3453
|
+
# ── v3.6.0 portability: logical export / import + binary backup ──────────────
|
|
3454
|
+
def schema_versions(self) -> Dict[str, Any]:
    """Collect the version stamps written by exports and checked by imports.

    ``embed_dim`` and ``kg_v2_schema_version`` are read from ``kg_schema``;
    when that import fails for any reason the fallbacks 1024 and 2 are used.
    """
    try:
        from kg_schema import EMBED_DIM as embed_dim, KG_SCHEMA_V2_VERSION as kg_v2_version
    except Exception:  # pragma: no cover - kg_schema always importable in practice
        embed_dim = 1024
        kg_v2_version = 2

    versions: Dict[str, Any] = {}
    versions["graph_schema_version"] = GRAPH_SCHEMA_VERSION
    versions["kg_v2_schema_version"] = kg_v2_version
    versions["projection_version"] = _PROJECTION_VERSION
    versions["embed_dim"] = embed_dim
    return versions
|
|
3466
|
+
|
|
3467
|
+
def export_graph_data(self, *, workspace_id: Optional[str] = None) -> Dict[str, Any]:
    """Export the graph tables as plain lists of row dicts.

    The tables read are ``nodes``, ``edges``, ``chunks``,
    ``knowledge_sources`` and ``ingestion_provenance`` (returned under the
    key ``provenance``). Vector embeddings are not part of this export; use
    :meth:`backup_database` for a binary copy that includes them.

    When ``workspace_id`` is truthy the export is filtered: only nodes whose
    ``nodes_v2`` row has that workspace or a NULL workspace are kept, edges
    are kept only when both endpoints survive, and chunks / provenance rows
    are kept only when their node survives. ``knowledge_sources`` is exported
    unfiltered in both cases.

    Returns:
        The five row lists plus ``counts``, a mapping of each list's name to
        its length.
    """
    with self._connect() as conn:

        def fetch_all(table: str):
            cursor = conn.execute(f"SELECT * FROM {table}")
            return [dict(record) for record in cursor.fetchall()]

        nodes = fetch_all("nodes")
        edges = fetch_all("edges")
        chunks = fetch_all("chunks")
        sources = fetch_all("knowledge_sources")
        provenance = fetch_all("ingestion_provenance")

        if workspace_id:
            scoped_rows = conn.execute(
                "SELECT id FROM nodes_v2 WHERE workspace_id = ? OR workspace_id IS NULL",
                (workspace_id,),
            ).fetchall()
            keep_ids = {scoped["id"] for scoped in scoped_rows}

            nodes = [node for node in nodes if node["id"] in keep_ids]
            edges = [
                edge
                for edge in edges
                if edge["from_node"] in keep_ids and edge["to_node"] in keep_ids
            ]
            chunks = [chunk for chunk in chunks if chunk["source_node"] in keep_ids]
            provenance = [entry for entry in provenance if entry["node_id"] in keep_ids]

        data: Dict[str, Any] = {
            "nodes": nodes,
            "edges": edges,
            "chunks": chunks,
            "knowledge_sources": sources,
            "provenance": provenance,
        }

    # Computed from the five lists before the "counts" key itself is added.
    data["counts"] = {name: len(items) for name, items in data.items()}
    return data
|
|
3515
|
+
|
|
3516
|
+
def import_graph_data(
    self, data: Dict[str, Any], *, mode: str = "merge", dry_run: bool = False
) -> Dict[str, Any]:
    """Import a logical export back into the store.

    Args:
        data: Artifact with optional ``nodes``, ``edges``, ``chunks``,
            ``knowledge_sources``, ``provenance`` lists and an optional
            ``header`` dict carrying ``graph_schema_version``.
        mode: ``"merge"`` upserts on top of existing data (id collisions
            update); ``"replace"`` calls ``clear_all()`` first. Any other
            value behaves like ``"merge"``.
        dry_run: When true, return the plan without writing anything.

    Returns:
        The plan: ``mode`` plus the number of rows per section, with
        ``dry_run: True`` or ``imported: True`` added.

    Raises:
        ValueError: The artifact's integer ``graph_schema_version`` is newer
            than this build's ``GRAPH_SCHEMA_VERSION``.

    NOTE(review): in ``replace`` mode ``clear_all()`` runs before the import
    connection is opened, so a failure while importing presumably leaves the
    graph cleared or partially populated — confirm whether callers back up
    first.
    """
    nodes = data.get("nodes") or []
    edges = data.get("edges") or []
    chunks = data.get("chunks") or []
    sources = data.get("knowledge_sources") or []
    provenance = data.get("provenance") or []

    header = data.get("header") or {}
    incoming_schema = header.get("graph_schema_version")
    if isinstance(incoming_schema, int) and incoming_schema > GRAPH_SCHEMA_VERSION:
        raise ValueError(
            f"Artifact graph_schema_version {incoming_schema} is newer than this "
            f"build ({GRAPH_SCHEMA_VERSION}); refusing to import."
        )

    plan = {
        "mode": mode,
        "nodes": len(nodes),
        "edges": len(edges),
        "chunks": len(chunks),
        "knowledge_sources": len(sources),
        "provenance": len(provenance),
    }
    if dry_run:
        plan["dry_run"] = True
        return plan

    if mode == "replace":
        self.clear_all()

    with self._connect() as conn:
        for n in nodes:
            self._upsert_node(
                conn, n["id"], n["type"], n.get("title") or "",
                summary=n.get("summary") or "",
                metadata=_safe_loads(n.get("metadata_json")),
                raw=_safe_loads(n.get("raw_json")),
            )
        for c in chunks:
            self._upsert_chunk(
                conn, chunk_id=c["id"], source_node=c["source_node"],
                text=c.get("text") or "", metadata=_safe_loads(c.get("metadata_json")),
            )
        for e in edges:
            # Only a missing weight defaults to 1.0. ``value or 1.0`` would
            # also rewrite an explicit 0 / 0.0 weight to 1.0, making an
            # export -> import round trip lossy.
            raw_weight = e.get("weight")
            weight = 1.0 if raw_weight is None or raw_weight == "" else float(raw_weight)
            self._upsert_edge(
                conn, e["from_node"], e["to_node"], e["type"],
                weight=weight,
                metadata=_safe_loads(e.get("metadata_json")),
            )
        for s in sources:
            conn.execute(
                """
                INSERT OR REPLACE INTO knowledge_sources(
                    id, root_path, os_type, drive_id, label, status, include_ocr,
                    watch_enabled, consent_json, created_at, updated_at, last_scanned_at)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """,
                (
                    s["id"], s["root_path"], s["os_type"], s.get("drive_id"), s.get("label"),
                    s.get("status") or "active", int(s.get("include_ocr") or 0),
                    int(s.get("watch_enabled") or 0), s.get("consent_json") or "{}",
                    s.get("created_at") or _now(), s.get("updated_at") or _now(),
                    s.get("last_scanned_at"),
                ),
            )
        for p in provenance:
            conn.execute(
                """
                INSERT OR REPLACE INTO ingestion_provenance(
                    id, node_id, source_type, source_uri, content_hash, title, pipeline,
                    owner, workspace_id, captured_at, modified_at, embedded, linked,
                    duplicate, agent_used, chunk_count, permissions_json, metadata_json, created_at)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """,
                (
                    p["id"], p["node_id"], p["source_type"], p.get("source_uri"),
                    p.get("content_hash"), p.get("title"), p.get("pipeline") or "import",
                    p.get("owner"), p.get("workspace_id"), p.get("captured_at"),
                    p.get("modified_at"), int(p.get("embedded") or 0), int(p.get("linked") or 0),
                    int(p.get("duplicate") or 0), p.get("agent_used"), int(p.get("chunk_count") or 0),
                    p.get("permissions_json") or "{}", p.get("metadata_json") or "{}",
                    p.get("created_at") or _now(),
                ),
            )
    plan["imported"] = True
    return plan
|
|
3611
|
+
|
|
3612
|
+
def backup_database(self, dest_path) -> Path:
    """Write a standalone snapshot of the live database to ``dest_path``.

    Missing parent directories are created and an existing file at the
    destination is deleted first. The snapshot is then produced by a full
    WAL checkpoint followed by ``VACUUM INTO``, so it is a complete copy of
    the database, including the ``vector_embeddings`` BLOBs.

    Args:
        dest_path: Destination file path (anything ``Path`` accepts).

    Returns:
        The destination as a ``Path``.
    """
    target = Path(dest_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    # VACUUM INTO requires the target to not exist.
    if target.exists():
        target.unlink()

    connection = self._connect()
    try:
        connection.execute("PRAGMA wal_checkpoint(FULL)")
        connection.execute("VACUUM INTO ?", (str(target),))
    finally:
        # Always release the connection, even when the snapshot fails.
        connection.close()
    return target
|
|
3631
|
+
|
|
2857
3632
|
def _ingest_structure_nodes(
|
|
2858
3633
|
self,
|
|
2859
3634
|
conn: sqlite3.Connection,
|
|
@@ -3044,6 +3819,13 @@ class KnowledgeGraphStore:
|
|
|
3044
3819
|
"Feature", # 소프트웨어 기능
|
|
3045
3820
|
"Task", # 할 일
|
|
3046
3821
|
"Decision", # 결정 사항
|
|
3822
|
+
# v3.6.0 Knowledge Graph First — 1급 엔티티를 그래프에 노출
|
|
3823
|
+
"Source", # 수집 출처 (파일/URL/브라우저 탭/git)
|
|
3824
|
+
"Repository", # git 저장소
|
|
3825
|
+
"Meeting", # 회의
|
|
3826
|
+
"Organization", # 조직
|
|
3827
|
+
"Workflow", # 워크플로우
|
|
3828
|
+
"Agent", # 에이전트
|
|
3047
3829
|
)
|
|
3048
3830
|
|
|
3049
3831
|
def list_documents(self, limit: int = 200) -> Dict[str, Any]:
|
|
@@ -3091,7 +3873,40 @@ class KnowledgeGraphStore:
|
|
|
3091
3873
|
"generated_at": datetime.now().isoformat(timespec="seconds"),
|
|
3092
3874
|
}
|
|
3093
3875
|
|
|
3094
|
-
def
|
|
3876
|
+
def workspaces_of(self, node_ids) -> Dict[str, Optional[str]]:
    """Map node ids to their workspace scope (``None`` = legacy-global).

    Falsy ids are dropped and the rest are stringified and de-duplicated.
    Ids without a ``nodes_v2`` row are absent from the result.

    The lookup is issued in batches so that a large id list cannot exceed
    SQLite's limit on bound parameters per statement. Previously such a list
    raised inside the query, the error was swallowed, and ``{}`` was
    returned, which callers treat as "every node is legacy-global".

    Args:
        node_ids: Iterable of node ids.

    Returns:
        ``{node_id: workspace_id}`` for every id found in ``nodes_v2``, or
        ``{}`` when there are no ids or the lookup fails.
    """
    ids = list(dict.fromkeys(str(i) for i in node_ids if i))
    if not ids:
        return {}
    # Stays below the lowest default SQLITE_MAX_VARIABLE_NUMBER (999).
    batch_size = 500
    scopes: Dict[str, Optional[str]] = {}
    with self._connect() as conn:
        try:
            for start in range(0, len(ids), batch_size):
                batch = ids[start:start + batch_size]
                placeholders = ",".join("?" for _ in batch)
                for row in conn.execute(
                    f"SELECT id, workspace_id FROM nodes_v2 WHERE id IN ({placeholders})", batch
                ).fetchall():
                    scopes[row["id"]] = row["workspace_id"]
        except Exception:
            # Best effort, kept from the original: any lookup failure yields
            # an empty mapping. NOTE(review): callers that treat a missing
            # scope as legacy-global therefore fail open here.
            return {}
    return scopes
|
|
3892
|
+
|
|
3893
|
+
def filter_scoped_nodes(self, items, allowed_workspaces, *, id_key: str = "id"):
    """Drop items scoped to a workspace the caller is not a member of.

    ``allowed_workspaces=None`` means no scoping (single-user / no-auth
    mode) and every item is returned. Otherwise an item is kept when
    ``workspaces_of`` reports no workspace for it (legacy-global rows stay
    visible to everyone) or reports a workspace in ``allowed_workspaces``.

    Args:
        items: Iterable of dict-like items; it is materialised once, so a
            generator is handled correctly.
        allowed_workspaces: Iterable of workspace ids, or ``None``.
        id_key: Key under which each item stores its node id.

    Returns:
        A new list with the visible items, in their original order.
    """
    # Materialise first: the ids are read once for the scope lookup and the
    # items are walked again for filtering, which would exhaust a generator.
    items = list(items)
    if allowed_workspaces is None:
        return items
    allowed = set(allowed_workspaces)
    scopes = self.workspaces_of([item.get(id_key) for item in items])
    visible = []
    for item in items:
        node_id = item.get(id_key)
        # workspaces_of stringifies ids and skips falsy ones; look the scope
        # up the same way so non-string ids resolve to their row.
        scope = scopes.get(str(node_id)) if node_id else None
        if scope is None or scope in allowed:
            visible.append(item)
    return visible
|
|
3908
|
+
|
|
3909
|
+
def graph(self, limit: int = 300, *, allowed_workspaces=None) -> Dict[str, Any]:
|
|
3095
3910
|
limit = max(1, min(int(limit or 300), 2000))
|
|
3096
3911
|
visible = ",".join(f"'{t}'" for t in self._GRAPH_VISIBLE_TYPES)
|
|
3097
3912
|
nt, et = self._read_tables()
|
|
@@ -3141,6 +3956,11 @@ class KnowledgeGraphStore:
|
|
|
3141
3956
|
for row in edge_rows
|
|
3142
3957
|
]
|
|
3143
3958
|
|
|
3959
|
+
if allowed_workspaces is not None:
|
|
3960
|
+
nodes = self.filter_scoped_nodes(nodes, allowed_workspaces)
|
|
3961
|
+
kept_ids = {node["id"] for node in nodes}
|
|
3962
|
+
edges = [e for e in edges if e["from"] in kept_ids and e["to"] in kept_ids]
|
|
3963
|
+
|
|
3144
3964
|
degree_map: Dict[str, int] = {}
|
|
3145
3965
|
now = datetime.now()
|
|
3146
3966
|
node_by_id = {node["id"]: node for node in nodes}
|
|
@@ -3216,16 +4036,32 @@ class KnowledgeGraphStore:
|
|
|
3216
4036
|
with self._connect() as conn:
|
|
3217
4037
|
rows = []
|
|
3218
4038
|
if query:
|
|
3219
|
-
|
|
3220
|
-
|
|
3221
|
-
|
|
3222
|
-
|
|
3223
|
-
|
|
3224
|
-
|
|
3225
|
-
|
|
3226
|
-
|
|
3227
|
-
|
|
3228
|
-
|
|
4039
|
+
fts_ids = self._fts_match_ids(conn, query, limit)
|
|
4040
|
+
if fts_ids:
|
|
4041
|
+
placeholders = ",".join("?" for _ in fts_ids)
|
|
4042
|
+
by_id = {
|
|
4043
|
+
row["id"]: row
|
|
4044
|
+
for row in conn.execute(
|
|
4045
|
+
f"""
|
|
4046
|
+
SELECT id, type, title, summary, metadata_json, updated_at
|
|
4047
|
+
FROM {nt} WHERE id IN ({placeholders})
|
|
4048
|
+
""",
|
|
4049
|
+
fts_ids,
|
|
4050
|
+
).fetchall()
|
|
4051
|
+
}
|
|
4052
|
+
# Preserve FTS bm25 rank order.
|
|
4053
|
+
rows = [by_id[i] for i in fts_ids if i in by_id]
|
|
4054
|
+
else:
|
|
4055
|
+
rows = conn.execute(
|
|
4056
|
+
f"""
|
|
4057
|
+
SELECT id, type, title, summary, metadata_json, updated_at
|
|
4058
|
+
FROM {nt}
|
|
4059
|
+
WHERE title LIKE ? OR summary LIKE ? OR metadata_json LIKE ?
|
|
4060
|
+
ORDER BY updated_at DESC, id ASC
|
|
4061
|
+
LIMIT ?
|
|
4062
|
+
""",
|
|
4063
|
+
(q, q, q, limit),
|
|
4064
|
+
).fetchall()
|
|
3229
4065
|
|
|
3230
4066
|
if len(rows) < limit:
|
|
3231
4067
|
terms = _topic_candidates(query, limit=8)
|
|
@@ -3260,6 +4096,10 @@ class KnowledgeGraphStore:
|
|
|
3260
4096
|
} else 0
|
|
3261
4097
|
return (hits, type_boost, row["updated_at"] or "")
|
|
3262
4098
|
|
|
4099
|
+
# Deterministic contract: rows with equal relevance order by id ASC
|
|
4100
|
+
# (stable sort preserves the pre-sort under reverse=True), matching
|
|
4101
|
+
# the legacy LIKE path regardless of FTS bm25 tie ordering.
|
|
4102
|
+
rows = sorted(rows, key=lambda r: r["id"])
|
|
3263
4103
|
rows = sorted(rows, key=score, reverse=True)[:limit]
|
|
3264
4104
|
return {
|
|
3265
4105
|
"query": query,
|
|
@@ -3744,6 +4584,9 @@ class KnowledgeGraphStore:
|
|
|
3744
4584
|
"backend": "sqlite",
|
|
3745
4585
|
"embedding_model": self._embedding_model.model_id,
|
|
3746
4586
|
"embedding_dim": self._embedding_model.dim,
|
|
4587
|
+
# Honest capability report: trigram FTS5 keyword index, or
|
|
4588
|
+
# LIKE-scan fallback when this SQLite build lacks it.
|
|
4589
|
+
"fts_enabled": bool(getattr(self, "_fts_enabled", False)),
|
|
3747
4590
|
},
|
|
3748
4591
|
"source_items": len(source_items),
|
|
3749
4592
|
"indexed_items": sum(vector_counts.values()),
|
|
@@ -3847,21 +4690,26 @@ class KnowledgeGraphStore:
|
|
|
3847
4690
|
return {"status": "skipped", "removed_nodes": 0}
|
|
3848
4691
|
conv_id = f"conversation:{_slug(conversation_id)}"
|
|
3849
4692
|
with self._connect() as conn:
|
|
4693
|
+
# Edge rows may carry the legacy lowercase label (pre-v4) or the
|
|
4694
|
+
# canonical EdgeType value (v4 write door) — match both.
|
|
3850
4695
|
direct_ids = [
|
|
3851
4696
|
row["to_node"]
|
|
3852
4697
|
for row in conn.execute(
|
|
3853
|
-
"SELECT to_node FROM edges WHERE from_node=? AND type
|
|
4698
|
+
"SELECT to_node FROM edges WHERE from_node=? AND type IN ('contains', 'CONTAINS')",
|
|
3854
4699
|
(conv_id,),
|
|
3855
4700
|
)
|
|
3856
4701
|
]
|
|
3857
4702
|
remove_ids = set(direct_ids)
|
|
4703
|
+
child_types = [
|
|
4704
|
+
"has_chunk", "implies", "contains_signal", "has_page",
|
|
4705
|
+
"has_slide", "has_sheet", "contains_image",
|
|
4706
|
+
]
|
|
4707
|
+
child_types += [t.upper() for t in child_types]
|
|
4708
|
+
placeholders = ",".join("?" for _ in child_types)
|
|
3858
4709
|
for source_id in list(direct_ids):
|
|
3859
4710
|
for row in conn.execute(
|
|
3860
|
-
""
|
|
3861
|
-
|
|
3862
|
-
WHERE from_node=? AND type IN ('has_chunk', 'implies', 'contains_signal', 'has_page', 'has_slide', 'has_sheet', 'contains_image')
|
|
3863
|
-
""",
|
|
3864
|
-
(source_id,),
|
|
4711
|
+
f"SELECT to_node FROM edges WHERE from_node=? AND type IN ({placeholders})",
|
|
4712
|
+
(source_id, *child_types),
|
|
3865
4713
|
):
|
|
3866
4714
|
remove_ids.add(row["to_node"])
|
|
3867
4715
|
remove_ids.add(conv_id)
|