ltcai 3.5.0 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (181) hide show
  1. package/README.md +73 -35
  2. package/docs/CARRYOVER_AUDIT_v3.6.0.md +61 -0
  3. package/docs/CHANGELOG.md +32 -0
  4. package/docs/HANDOVER_v3.6.0.md +46 -0
  5. package/docs/RUNTIME_HOOK_COVERAGE_v3.6.0.md +49 -0
  6. package/docs/V4_BRAIN_ARCHITECTURE.md +322 -0
  7. package/docs/V4_DIGITAL_BRAIN_RECOVERY.md +509 -0
  8. package/docs/V4_IMPLEMENTATION_PLAN.md +470 -0
  9. package/docs/architecture.md +13 -12
  10. package/docs/kg-schema.md +102 -53
  11. package/docs/privacy.md +18 -2
  12. package/docs/security-model.md +17 -0
  13. package/kg_schema.py +139 -10
  14. package/knowledge_graph.py +874 -26
  15. package/knowledge_graph_api.py +11 -127
  16. package/latticeai/__init__.py +1 -1
  17. package/latticeai/api/admin.py +1 -1
  18. package/latticeai/api/agents.py +7 -1
  19. package/latticeai/api/auth.py +27 -4
  20. package/latticeai/api/browser.py +217 -0
  21. package/latticeai/api/chat.py +112 -76
  22. package/latticeai/api/health.py +1 -1
  23. package/latticeai/api/hooks.py +1 -1
  24. package/latticeai/api/knowledge_graph.py +146 -0
  25. package/latticeai/api/local_files.py +1 -1
  26. package/latticeai/api/mcp.py +23 -11
  27. package/latticeai/api/memory.py +1 -1
  28. package/latticeai/api/models.py +1 -1
  29. package/latticeai/api/network.py +81 -0
  30. package/latticeai/api/portability.py +93 -0
  31. package/latticeai/api/realtime.py +1 -1
  32. package/latticeai/api/search.py +26 -2
  33. package/latticeai/api/security_dashboard.py +2 -3
  34. package/latticeai/api/setup.py +2 -2
  35. package/latticeai/api/static_routes.py +2 -4
  36. package/latticeai/api/tools.py +3 -0
  37. package/latticeai/api/workflow_designer.py +46 -0
  38. package/latticeai/api/workspace.py +71 -49
  39. package/latticeai/app_factory.py +1710 -0
  40. package/latticeai/brain/__init__.py +18 -0
  41. package/latticeai/brain/context.py +213 -0
  42. package/latticeai/brain/conversations.py +236 -0
  43. package/latticeai/brain/identity.py +175 -0
  44. package/latticeai/brain/memory.py +102 -0
  45. package/latticeai/brain/network.py +205 -0
  46. package/latticeai/core/agent.py +31 -7
  47. package/latticeai/core/audit.py +0 -7
  48. package/latticeai/core/config.py +1 -1
  49. package/latticeai/core/context_builder.py +1 -2
  50. package/latticeai/core/enterprise.py +1 -1
  51. package/latticeai/core/graph_curator.py +2 -2
  52. package/latticeai/core/marketplace.py +1 -1
  53. package/latticeai/core/mcp_registry.py +791 -0
  54. package/latticeai/core/model_compat.py +1 -1
  55. package/latticeai/core/model_resolution.py +0 -1
  56. package/latticeai/core/multi_agent.py +238 -4
  57. package/latticeai/core/security.py +1 -1
  58. package/latticeai/core/sessions.py +37 -7
  59. package/latticeai/core/workflow_engine.py +114 -2
  60. package/latticeai/core/workspace_os.py +58 -10
  61. package/latticeai/models/__init__.py +7 -0
  62. package/latticeai/models/router.py +779 -0
  63. package/latticeai/server_app.py +29 -1504
  64. package/latticeai/services/agent_runtime.py +1 -0
  65. package/latticeai/services/app_context.py +75 -14
  66. package/latticeai/services/ingestion.py +318 -0
  67. package/latticeai/services/kg_portability.py +207 -0
  68. package/latticeai/services/memory_service.py +39 -11
  69. package/latticeai/services/model_runtime.py +2 -5
  70. package/latticeai/services/platform_runtime.py +100 -23
  71. package/latticeai/services/search_service.py +17 -8
  72. package/latticeai/services/tool_dispatch.py +12 -2
  73. package/latticeai/services/triggers.py +241 -0
  74. package/latticeai/services/upload_service.py +37 -12
  75. package/latticeai/services/workspace_service.py +31 -0
  76. package/llm_router.py +29 -772
  77. package/ltcai_cli.py +1 -2
  78. package/mcp_registry.py +25 -788
  79. package/p_reinforce.py +124 -14
  80. package/package.json +11 -8
  81. package/scripts/build_vsix.mjs +72 -0
  82. package/scripts/bump_version.py +99 -0
  83. package/scripts/generate_diagrams.py +0 -1
  84. package/scripts/lint_v3.mjs +82 -18
  85. package/scripts/validate_release_artifacts.py +0 -1
  86. package/scripts/wheel_smoke.py +142 -0
  87. package/server.py +11 -7
  88. package/setup_wizard.py +1142 -0
  89. package/static/account.html +2 -4
  90. package/static/admin.html +3 -5
  91. package/static/chat.html +3 -6
  92. package/static/graph.html +2 -4
  93. package/static/sw.js +81 -52
  94. package/static/v3/asset-manifest.json +20 -19
  95. package/static/v3/css/{lattice.base.e4cdd05d.css → lattice.base.49deefb5.css} +1 -1
  96. package/static/v3/css/lattice.base.css +1 -1
  97. package/static/v3/css/{lattice.components.9b49d614.css → lattice.components.cde18231.css} +1 -1
  98. package/static/v3/css/lattice.components.css +1 -1
  99. package/static/v3/css/{lattice.shell.8fcc9d33.css → lattice.shell.29d36d85.css} +1 -1
  100. package/static/v3/css/lattice.shell.css +1 -1
  101. package/static/v3/css/{lattice.tokens.e7018963.css → lattice.tokens.304cbc40.css} +3 -0
  102. package/static/v3/css/lattice.tokens.css +3 -0
  103. package/static/v3/css/{lattice.views.22f69117.css → lattice.views.0a18b6c5.css} +2 -2
  104. package/static/v3/css/lattice.views.css +2 -2
  105. package/static/v3/index.html +3 -4
  106. package/static/v3/js/{app.d086489d.js → app.356e6452.js} +1 -1
  107. package/static/v3/js/core/{api.12b568ad.js → api.7a308b89.js} +39 -1
  108. package/static/v3/js/core/api.js +38 -0
  109. package/static/v3/js/core/{routes.d214b399.js → routes.7222343d.js} +22 -22
  110. package/static/v3/js/core/routes.js +22 -22
  111. package/static/v3/js/core/{shell.d05266f5.js → shell.a1657f20.js} +4 -4
  112. package/static/v3/js/core/shell.js +1 -1
  113. package/static/v3/js/core/{store.34ebd5e6.js → store.204a08b2.js} +1 -1
  114. package/static/v3/js/core/store.js +1 -1
  115. package/static/v3/js/views/graph-canvas.17c15d65.js +509 -0
  116. package/static/v3/js/views/graph-canvas.js +509 -0
  117. package/static/v3/js/views/{hybrid-search.b22b97e0.js → hybrid-search.2fb63ed9.js} +1 -2
  118. package/static/v3/js/views/hybrid-search.js +1 -2
  119. package/static/v3/js/views/knowledge-graph.5e40cbeb.js +509 -0
  120. package/static/v3/js/views/knowledge-graph.js +326 -54
  121. package/static/vendor/chart.umd.min.js +20 -0
  122. package/static/vendor/fonts/inter-latin-300-normal.woff2 +0 -0
  123. package/static/vendor/fonts/inter-latin-400-normal.woff2 +0 -0
  124. package/static/vendor/fonts/inter-latin-500-normal.woff2 +0 -0
  125. package/static/vendor/fonts/inter-latin-600-normal.woff2 +0 -0
  126. package/static/vendor/fonts/inter-latin-700-normal.woff2 +0 -0
  127. package/static/vendor/fonts/inter-latin-800-normal.woff2 +0 -0
  128. package/static/vendor/fonts/inter.css +44 -0
  129. package/static/vendor/icons/tabler-icons.min.css +4 -0
  130. package/static/vendor/icons/tabler-icons.woff2 +0 -0
  131. package/static/vendor/marked.min.js +69 -0
  132. package/static/workspace.html +2 -2
  133. package/telegram_bot.py +1 -2
  134. package/tools/commands.py +4 -2
  135. package/tools/computer.py +1 -1
  136. package/tools/documents.py +1 -3
  137. package/tools/filesystem.py +0 -4
  138. package/tools/knowledge.py +1 -3
  139. package/tools/network.py +1 -3
  140. package/codex_telegram_bot.py +0 -195
  141. package/docs/assets/v3.4.0/agent-run.png +0 -0
  142. package/docs/assets/v3.4.0/agents.png +0 -0
  143. package/docs/assets/v3.4.0/before/chat-before.png +0 -0
  144. package/docs/assets/v3.4.0/before/files-before.png +0 -0
  145. package/docs/assets/v3.4.0/chat.png +0 -0
  146. package/docs/assets/v3.4.0/connect-folder.png +0 -0
  147. package/docs/assets/v3.4.0/files.png +0 -0
  148. package/docs/assets/v3.4.0/home.png +0 -0
  149. package/docs/assets/v3.4.0/hooks-dispatch.png +0 -0
  150. package/docs/assets/v3.4.0/knowledge-graph.png +0 -0
  151. package/docs/assets/v3.4.0/local-agent.png +0 -0
  152. package/docs/assets/v3.4.0/memory.png +0 -0
  153. package/docs/assets/v3.4.0/settings.png +0 -0
  154. package/docs/assets/v3.4.0/vision-input.png +0 -0
  155. package/docs/assets/v3.4.0/workflows.png +0 -0
  156. package/docs/assets/v3.4.1/e2e_runtime_log.txt +0 -42
  157. package/docs/assets/v3.4.1/hooks-dispatch.png +0 -0
  158. package/docs/assets/v3.4.1/local-agent.png +0 -0
  159. package/docs/images/admin-dashboard.png +0 -0
  160. package/docs/images/architecture.png +0 -0
  161. package/docs/images/enterprise.png +0 -0
  162. package/docs/images/graph.png +0 -0
  163. package/docs/images/hero.gif +0 -0
  164. package/docs/images/knowledge-graph.png +0 -0
  165. package/docs/images/lattice-ai-demo.gif +0 -0
  166. package/docs/images/lattice-ai-hero.png +0 -0
  167. package/docs/images/logo.svg +0 -33
  168. package/docs/images/mobile-responsive.png +0 -0
  169. package/docs/images/model-recommendation.png +0 -0
  170. package/docs/images/onboarding.png +0 -0
  171. package/docs/images/organization.png +0 -0
  172. package/docs/images/pipeline.png +0 -0
  173. package/docs/images/screenshot-admin.png +0 -0
  174. package/docs/images/screenshot-chat.png +0 -0
  175. package/docs/images/screenshot-graph.png +0 -0
  176. package/docs/images/skills.png +0 -0
  177. package/docs/images/workspace-dark.png +0 -0
  178. package/docs/images/workspace-light.png +0 -0
  179. package/docs/images/workspace.png +0 -0
  180. package/requirements.txt +0 -16
  181. package/static/v3/js/views/knowledge-graph.a14ea7e7.js +0 -237
@@ -942,6 +942,31 @@ class KnowledgeGraphStore:
942
942
  error_message TEXT,
943
943
  metadata_json TEXT NOT NULL CHECK (json_valid(metadata_json))
944
944
  );
945
+ -- v3.6.0 Knowledge Graph First: per-ingestion provenance trail.
946
+ -- Append-only audit of where every graph node came from, when it
947
+ -- was captured, how it was processed, and whether it was embedded /
948
+ -- linked / used by an agent. get_provenance() returns the latest row.
949
+ CREATE TABLE IF NOT EXISTS ingestion_provenance (
950
+ id TEXT PRIMARY KEY,
951
+ node_id TEXT NOT NULL,
952
+ source_type TEXT NOT NULL,
953
+ source_uri TEXT,
954
+ content_hash TEXT,
955
+ title TEXT,
956
+ pipeline TEXT NOT NULL,
957
+ owner TEXT,
958
+ workspace_id TEXT,
959
+ captured_at TEXT,
960
+ modified_at TEXT,
961
+ embedded INTEGER NOT NULL DEFAULT 0,
962
+ linked INTEGER NOT NULL DEFAULT 0,
963
+ duplicate INTEGER NOT NULL DEFAULT 0,
964
+ agent_used TEXT,
965
+ chunk_count INTEGER NOT NULL DEFAULT 0,
966
+ permissions_json TEXT NOT NULL DEFAULT '{}' CHECK (json_valid(permissions_json)),
967
+ metadata_json TEXT NOT NULL DEFAULT '{}' CHECK (json_valid(metadata_json)),
968
+ created_at TEXT NOT NULL
969
+ );
945
970
  CREATE INDEX IF NOT EXISTS idx_nodes_type ON nodes(type);
946
971
  CREATE INDEX IF NOT EXISTS idx_edges_from ON edges(from_node);
947
972
  CREATE INDEX IF NOT EXISTS idx_edges_to ON edges(to_node);
@@ -954,6 +979,10 @@ class KnowledgeGraphStore:
954
979
  CREATE INDEX IF NOT EXISTS idx_vector_embeddings_source ON vector_embeddings(source_node);
955
980
  CREATE INDEX IF NOT EXISTS idx_vector_embeddings_model ON vector_embeddings(embedding_model);
956
981
  CREATE INDEX IF NOT EXISTS idx_vector_index_operations_requested ON vector_index_operations(requested_at);
982
+ CREATE INDEX IF NOT EXISTS idx_provenance_node ON ingestion_provenance(node_id);
983
+ CREATE INDEX IF NOT EXISTS idx_provenance_source_type ON ingestion_provenance(source_type);
984
+ CREATE INDEX IF NOT EXISTS idx_provenance_hash ON ingestion_provenance(content_hash);
985
+ CREATE INDEX IF NOT EXISTS idx_provenance_created ON ingestion_provenance(created_at);
957
986
  """
958
987
  )
959
988
  conn.execute(
@@ -961,6 +990,62 @@ class KnowledgeGraphStore:
961
990
  ("schema_version", str(GRAPH_SCHEMA_VERSION)),
962
991
  )
963
992
  self._init_v2_schema()
993
+ self._init_fts()
994
+
995
+ # ── FTS5 keyword index (v4) ──────────────────────────────────────────
996
+ # Replaces LIKE '%q%' table scans for keyword search. The trigram
997
+ # tokenizer is required (not just FTS5): unicode61 indexes whole tokens
998
+ # and would silently break Korean substring recall ('프로젝트' must match
999
+ # '프로젝트를'). Without trigram support the store honestly reports
1000
+ # fts_enabled=False and the LIKE path remains authoritative.
1001
+ _FTS_SQL = """
1002
+ CREATE VIRTUAL TABLE IF NOT EXISTS node_fts USING fts5(
1003
+ node_id UNINDEXED, title, summary, metadata, tokenize='trigram'
1004
+ );
1005
+ CREATE TRIGGER IF NOT EXISTS node_fts_ai AFTER INSERT ON nodes BEGIN
1006
+ INSERT INTO node_fts(node_id, title, summary, metadata)
1007
+ VALUES (new.id, new.title, COALESCE(new.summary, ''), new.metadata_json);
1008
+ END;
1009
+ CREATE TRIGGER IF NOT EXISTS node_fts_au AFTER UPDATE ON nodes BEGIN
1010
+ DELETE FROM node_fts WHERE node_id = old.id;
1011
+ INSERT INTO node_fts(node_id, title, summary, metadata)
1012
+ VALUES (new.id, new.title, COALESCE(new.summary, ''), new.metadata_json);
1013
+ END;
1014
+ CREATE TRIGGER IF NOT EXISTS node_fts_ad AFTER DELETE ON nodes BEGIN
1015
+ DELETE FROM node_fts WHERE node_id = old.id;
1016
+ END;
1017
+ """
1018
+
1019
def _init_fts(self) -> None:
    """Create the trigram FTS5 keyword index and record whether it is usable.

    Runs the ``_FTS_SQL`` script (virtual table plus sync triggers) and, when
    the index is still empty, backfills it from the existing ``nodes`` rows.
    ``self._fts_enabled`` ends up True only if all of that succeeded; on a
    ``sqlite3.OperationalError`` it stays False and the failure is logged.
    """
    # Pessimistic default: the flag is flipped only after setup succeeds.
    self._fts_enabled = False
    backfill_sql = (
        "INSERT INTO node_fts(node_id, title, summary, metadata) "
        "SELECT id, title, COALESCE(summary, ''), metadata_json FROM nodes"
    )
    try:
        with self._connect() as conn:
            conn.executescript(self._FTS_SQL)
            indexed = conn.execute("SELECT count(*) AS c FROM node_fts").fetchone()
            # An empty index means the triggers have never fired for the
            # rows already stored, so copy them in once.
            if indexed["c"] == 0:
                conn.execute(backfill_sql)
        self._fts_enabled = True
    except sqlite3.OperationalError as exc:
        # FTS5/trigram not compiled into this SQLite build. LIKE search
        # stays authoritative; the capability is reported, never faked.
        logging.info("FTS5 trigram index unavailable (%s); keyword search uses LIKE scans.", exc)
1035
+
1036
def _fts_match_ids(self, conn: sqlite3.Connection, query: str, limit: int) -> List[str]:
    """Return ranked node ids matching ``query`` in the trigram FTS index.

    The query is matched as a single quoted FTS5 phrase, so operators and
    punctuation inside it are treated as literal text.

    Args:
        conn: Open connection whose rows support access by column name.
        query: Raw user search text.
        limit: Maximum number of ids to return.

    Returns:
        Node ids ordered by FTS rank. An empty list is returned when the
        index is disabled, the query is missing or shorter than three
        characters, or SQLite rejects the MATCH.
    """
    if not getattr(self, "_fts_enabled", False):
        return []
    # A missing query degrades to "no matches" like every other failure
    # path here, instead of raising from len().
    if not query or len(query) < 3:
        return []
    # Double embedded quotes and wrap the whole text so it is one phrase.
    phrase = '"' + query.replace('"', '""') + '"'
    try:
        rows = conn.execute(
            'SELECT node_id FROM node_fts WHERE node_fts MATCH ? ORDER BY rank LIMIT ?',
            (phrase, limit),
        ).fetchall()
    except sqlite3.OperationalError:
        # Best effort: the caller falls back to its non-FTS search path.
        return []
    return [row["node_id"] for row in rows]
964
1049
 
965
1050
  # SQL views that reconstruct the *exact* legacy row shape on top of the
966
1051
  # normalized v2 tables, so the read methods run unchanged against either
@@ -1099,26 +1184,40 @@ class KnowledgeGraphStore:
1099
1184
  self, conn: sqlite3.Connection, node_id: str, node_type: str, title: str,
1100
1185
  summary: Optional[str], metadata_json: Optional[str],
1101
1186
  *, created_at: Optional[str] = None, updated_at: Optional[str] = None,
1187
+ owner: Optional[str] = None, workspace_id: Optional[str] = None,
1188
+ visibility: Optional[str] = None,
1102
1189
  ) -> None:
1103
1190
  if KGStoreV2 is None:
1104
1191
  return
1105
1192
  ts = updated_at or _now()
1106
1193
  norm_type = NodeType.from_legacy(node_type).value if NodeType is not None else node_type
1194
+ # Scope resolution: explicit param > metadata hints > legacy-global.
1195
+ # 'legacy' (not 'private') marks unscoped rows — the column default
1196
+ # must never silently privatize previously machine-shared data.
1197
+ meta = _safe_loads(metadata_json) if metadata_json else {}
1198
+ owner = owner or meta.get("user_email") or meta.get("owner") or None
1199
+ workspace_id = workspace_id or meta.get("workspace_id") or None
1200
+ visibility = visibility or ("legacy" if workspace_id is None else "workspace")
1107
1201
  try:
1108
1202
  conn.execute(
1109
1203
  """
1110
1204
  INSERT INTO nodes_v2(id, type, legacy_type, label, summary, attrs,
1111
- owner_id, visibility, created_at, updated_at,
1112
- importance_score)
1113
- VALUES (?, ?, ?, ?, ?, ?, NULL, 'private', ?, ?, 0.0)
1205
+ owner_id, workspace_id, visibility,
1206
+ created_at, updated_at, importance_score)
1207
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 0.0)
1114
1208
  ON CONFLICT(id) DO UPDATE SET
1115
1209
  type=excluded.type, legacy_type=excluded.legacy_type,
1116
1210
  label=excluded.label, summary=excluded.summary,
1117
- attrs=excluded.attrs, updated_at=excluded.updated_at
1211
+ attrs=excluded.attrs, updated_at=excluded.updated_at,
1212
+ owner_id=COALESCE(excluded.owner_id, nodes_v2.owner_id),
1213
+ workspace_id=COALESCE(excluded.workspace_id, nodes_v2.workspace_id),
1214
+ visibility=CASE WHEN excluded.visibility != 'legacy'
1215
+ THEN excluded.visibility
1216
+ ELSE nodes_v2.visibility END
1118
1217
  """,
1119
1218
  (node_id, norm_type, node_type, title, summary,
1120
1219
  metadata_json if metadata_json is not None else "{}",
1121
- created_at or ts, ts),
1220
+ owner, workspace_id, visibility, created_at or ts, ts),
1122
1221
  )
1123
1222
  except Exception as ex:
1124
1223
  logging.debug("knowledge_graph: v2 node projection skipped (%s): %s", node_id, ex)
@@ -1140,8 +1239,7 @@ class KnowledgeGraphStore:
1140
1239
  INSERT INTO edges_v2(id, source, target, type, legacy_type, weight,
1141
1240
  confidence, evidence, metadata, created_by, created_at)
1142
1241
  VALUES (?, ?, ?, ?, ?, ?, ?, '[]', ?, 'legacy', ?)
1143
- ON CONFLICT(source, target, legacy_type) DO UPDATE SET
1144
- type=excluded.type,
1242
+ ON CONFLICT(source, target, type, legacy_type) DO UPDATE SET
1145
1243
  weight=max(edges_v2.weight, excluded.weight),
1146
1244
  confidence=excluded.confidence,
1147
1245
  metadata=excluded.metadata
@@ -1149,9 +1247,125 @@ class KnowledgeGraphStore:
1149
1247
  (eid, from_node, to_node, norm_type, edge_type, float(weight),
1150
1248
  confidence, meta_str, created_at or _now()),
1151
1249
  )
1250
+ # Temporal record: every observation of this relationship is kept
1251
+ # (the UNIQUE upsert + weight=max alone would erase recurrence).
1252
+ row = conn.execute(
1253
+ "SELECT id FROM edges_v2 WHERE source=? AND target=? AND type=? AND legacy_type=?",
1254
+ (from_node, to_node, norm_type, edge_type),
1255
+ ).fetchone()
1256
+ if row is not None:
1257
+ conn.execute(
1258
+ "INSERT INTO edge_occurrences(edge_id, observed_at, weight, source) VALUES (?, ?, ?, ?)",
1259
+ (row["id"], created_at or _now(), float(weight),
1260
+ _safe_loads(meta_str).get("source")),
1261
+ )
1152
1262
  except Exception as ex:
1153
1263
  logging.debug("knowledge_graph: v2 edge projection skipped (%s->%s): %s", from_node, to_node, ex)
1154
1264
 
1265
def curate(self, *, max_documents: int = 200, max_new_nodes: int = 8) -> Dict[str, Any]:
    """Run on-demand graph curation (T4.4) and report what was promoted.

    Recent content nodes are handed to
    ``latticeai.core.graph_curator.auto_build_graph_overlay``. Every
    promotion it returns becomes a ``Topic`` node with ``MENTIONS`` edges
    from its source nodes, and its importance is written to
    ``nodes_v2.importance_score`` when the v2 store is available.

    Args:
        max_documents: Number of most recently updated content nodes to
            scan; clamped to the range 1..2000.
        max_new_nodes: Upper bound on promotions requested from the
            curator; clamped to the range 1..50.

    Returns:
        A dict with ``status``, ``documents_scanned``, ``candidates_total``,
        ``promoted`` (one entry per Topic written), ``skipped`` (the first
        50 curator skip records) and ``skipped_total``.
    """
    from latticeai.core.graph_curator import auto_build_graph_overlay

    content_types = (
        "Document", "File", "CodeFile", "Message", "AIResponse",
        "Chat", "Page", "Slide", "Spreadsheet",
    )
    # Node types the curator should treat as file-like rather than chat-like.
    file_like = {"Document", "File", "CodeFile", "Spreadsheet"}
    nt, _ = self._read_tables()
    with self._connect() as conn:
        placeholders = ",".join("?" for _ in content_types)
        rows = conn.execute(
            f"""
            SELECT id, type, title, summary FROM {nt}
            WHERE type IN ({placeholders})
            ORDER BY updated_at DESC, id ASC LIMIT ?
            """,
            (*content_types, max(1, min(int(max_documents), 2000))),
        ).fetchall()
        # Lower-cased labels of existing topics/concepts, passed to the
        # curator as existing_node_labels.
        existing_labels = {
            str(row["title"] or "").strip().lower()
            for row in conn.execute(
                f"SELECT title FROM {nt} WHERE type IN ('Topic', 'Concept')"
            ).fetchall()
        }
    documents = [
        {
            "id": row["id"],
            "text": f"{row['title']} {row['summary'] or ''}",
            "kind": "file" if row["type"] in file_like else "chat",
        }
        for row in rows
    ]
    overlay = auto_build_graph_overlay(
        documents,
        existing_node_labels=existing_labels,
        max_new_nodes=max(1, min(int(max_new_nodes), 50)),
    )
    promoted: List[Dict[str, Any]] = []
    with self._connect() as conn:
        # Only link to nodes that were actually part of this scan.
        valid_ids = {row["id"] for row in rows}
        for promo in overlay["promotions"]:
            topic_id = f"topic:{_slug(promo['label'])}"
            self._upsert_node(
                conn, topic_id, "Topic", promo["label"],
                metadata={
                    "curated": True,
                    "importance": promo["importance"],
                    "aliases": promo["aliases"],
                    "source": "graph_curator",
                },
            )
            # The v2 projection helpers no-op when KGStoreV2 is missing, so
            # the direct nodes_v2 write must be skipped in that case too
            # rather than failing the whole curation run.
            if KGStoreV2 is not None:
                conn.execute(
                    "UPDATE nodes_v2 SET importance_score=? WHERE id=?",
                    (float(promo["importance"]), topic_id),
                )
            linked = 0
            for source_id in promo["sources"][:10]:
                if source_id in valid_ids:
                    self._upsert_edge(
                        conn, source_id, topic_id, "MENTIONS",
                        weight=0.6, metadata={"source": "graph_curator"},
                    )
                    linked += 1
            promoted.append({
                "node_id": topic_id,
                "label": promo["label"],
                "importance": promo["importance"],
                "linked_sources": linked,
            })
    return {
        "status": "ok",
        "documents_scanned": len(documents),
        "candidates_total": overlay["candidates_total"],
        "promoted": promoted,
        "skipped": overlay["skipped"][:50],
        "skipped_total": len(overlay["skipped"]),
    }
1351
+
1352
def mark_superseded(self, old_node_id: str, new_node_id: str) -> Dict[str, Any]:
    """Mark ``old_node_id`` as replaced by ``new_node_id`` in ``nodes_v2``.

    Nothing is deleted: only ``superseded_by`` and ``updated_at`` of the old
    row are written, so the old node remains available and the revision
    chain can be followed through ``nodes_v2.superseded_by``.

    Raises:
        FileNotFoundError: If either id has no row in ``nodes_v2``; the
            missing id is the exception argument.
    """
    with self._connect() as conn:
        # Both ends of the link must exist before anything is written.
        for candidate in (old_node_id, new_node_id):
            found = conn.execute(
                "SELECT 1 FROM nodes_v2 WHERE id=?", (candidate,)
            ).fetchone()
            if found is None:
                raise FileNotFoundError(candidate)
        conn.execute(
            "UPDATE nodes_v2 SET superseded_by=?, updated_at=? WHERE id=?",
            (new_node_id, _now(), old_node_id),
        )
    result: Dict[str, Any] = {"status": "ok"}
    result["node_id"] = old_node_id
    result["superseded_by"] = new_node_id
    return result
1368
+
1155
1369
  def _v2_delete_nodes(self, conn: sqlite3.Connection, ids) -> None:
1156
1370
  """Mirror legacy node deletions into v2 (edges_v2 cascade on the FK)."""
1157
1371
  if KGStoreV2 is None:
@@ -1212,6 +1426,9 @@ class KnowledgeGraphStore:
1212
1426
  summary: str = "",
1213
1427
  metadata: Optional[Dict[str, Any]] = None,
1214
1428
  raw: Optional[Dict[str, Any]] = None,
1429
+ owner: Optional[str] = None,
1430
+ workspace_id: Optional[str] = None,
1431
+ visibility: Optional[str] = None,
1215
1432
  ) -> str:
1216
1433
  now = _now()
1217
1434
  # Canonical stored values, computed once and shared with the v2
@@ -1234,7 +1451,8 @@ class KnowledgeGraphStore:
1234
1451
  )
1235
1452
  # dual-write: project into the v2 graph on the same transaction
1236
1453
  self._v2_project_node(conn, node_id, node_type, title_s, summary_s, meta_json,
1237
- created_at=now, updated_at=now)
1454
+ created_at=now, updated_at=now,
1455
+ owner=owner, workspace_id=workspace_id, visibility=visibility)
1238
1456
  if node_type != "Chunk":
1239
1457
  self._upsert_vector_item(
1240
1458
  conn,
@@ -1255,6 +1473,16 @@ class KnowledgeGraphStore:
1255
1473
  weight: float = 1.0,
1256
1474
  metadata: Optional[Dict[str, Any]] = None,
1257
1475
  ) -> str:
1476
+ # v4 write door: every new edge stores the canonical EdgeType value —
1477
+ # free-string types (e.g. '포함함', '언급함') are normalized here, so no
1478
+ # caller can mint new legacy taxonomy. The original label survives in
1479
+ # metadata.legacy_label for traceability.
1480
+ if EdgeType is not None:
1481
+ canonical = EdgeType.from_legacy(edge_type).value
1482
+ if canonical != edge_type:
1483
+ metadata = dict(metadata or {})
1484
+ metadata.setdefault("legacy_label", edge_type)
1485
+ edge_type = canonical
1258
1486
  edge_id = f"edge:{_sha256_text(f'{from_node}|{edge_type}|{to_node}')[:24]}"
1259
1487
  now = _now()
1260
1488
  meta_json = _json(metadata) # canonical string shared with the projection
@@ -2703,12 +2931,20 @@ class KnowledgeGraphStore:
2703
2931
  uploader: Optional[str] = None,
2704
2932
  conversation_id: Optional[str] = None,
2705
2933
  extracted: Optional[Dict[str, Any]] = None,
2934
+ source_type: Optional[str] = None,
2935
+ source_uri: Optional[str] = None,
2936
+ captured_at: Optional[str] = None,
2937
+ modified_at: Optional[str] = None,
2938
+ owner: Optional[str] = None,
2939
+ workspace_id: Optional[str] = None,
2940
+ permissions: Optional[Dict[str, Any]] = None,
2706
2941
  ) -> Dict[str, Any]:
2707
2942
  path = Path(path)
2708
2943
  data = path.read_bytes()
2709
2944
  digest = _sha256_bytes(data)
2710
2945
  ext = path.suffix.lower()
2711
2946
  filename = original_filename or path.name
2947
+ captured_at = captured_at or _now()
2712
2948
  blob_path = self.blob_dir / digest[:2] / f"{digest}{ext}"
2713
2949
  blob_path.parent.mkdir(parents=True, exist_ok=True)
2714
2950
  if not blob_path.exists():
@@ -2723,8 +2959,16 @@ class KnowledgeGraphStore:
2723
2959
  "mime_type": mime_type,
2724
2960
  "bytes": len(data),
2725
2961
  "sha256": digest,
2962
+ "content_hash": digest,
2726
2963
  "blob_path": str(blob_path),
2727
2964
  "uploader": uploader,
2965
+ "owner": owner or uploader,
2966
+ "workspace_id": workspace_id,
2967
+ "permissions": permissions or {},
2968
+ "source_type": source_type or "file",
2969
+ "source_uri": source_uri or str(path),
2970
+ "captured_at": captured_at,
2971
+ "modified_at": modified_at,
2728
2972
  "conversation_id": conversation_id,
2729
2973
  "extracted": {k: v for k, v in (extracted or {}).items() if k != "content"},
2730
2974
  "structure": doc_meta,
@@ -2732,8 +2976,11 @@ class KnowledgeGraphStore:
2732
2976
  full_text = f"{filename}\n{text}"
2733
2977
  concepts = _extract_concepts(full_text, limit=15)
2734
2978
  triples = _extract_triples(full_text, concepts)
2979
+ chunk_ids: List[str] = []
2980
+ source_node_id: Optional[str] = None
2735
2981
 
2736
2982
  with self._connect() as conn:
2983
+ duplicate = self._node_exists(conn, file_id)
2737
2984
  # ── Document 노드 (점: 명사 — 파일) ────────────────────────────────
2738
2985
  self._upsert_node(
2739
2986
  conn, file_id, "Document", filename,
@@ -2742,6 +2989,15 @@ class KnowledgeGraphStore:
2742
2989
  )
2743
2990
  self._ingest_structure_nodes(conn, file_id, filename, doc_meta)
2744
2991
 
2992
+ # ── SOURCE 노드 + indexed_from (v3.6.0, source_type 지정 시) ──────
2993
+ if source_type:
2994
+ source_node_id = self._attach_source_node(
2995
+ conn, file_id,
2996
+ source_type=source_type, source_uri=source_uri or str(path),
2997
+ title=filename, content_hash=digest, captured_at=captured_at,
2998
+ extra={"owner": owner or uploader, "workspace_id": workspace_id, "ext": ext},
2999
+ )
3000
+
2745
3001
  # ── Person 노드 + 동사형 엣지 ─────────────────────────────────────
2746
3002
  if uploader:
2747
3003
  person_id = f"person:{_slug(uploader)}"
@@ -2762,6 +3018,7 @@ class KnowledgeGraphStore:
2762
3018
  # ── RAG chunks (검색용, 그래프 비표시) ────────────────────────────
2763
3019
  for index, chunk in enumerate(_chunks(text)):
2764
3020
  chunk_id = f"chunk:{_sha256_text(f'{file_id}:{index}:{chunk}')[:24]}"
3021
+ chunk_ids.append(chunk_id)
2765
3022
  self._upsert_node(
2766
3023
  conn, chunk_id, "Chunk",
2767
3024
  f"{filename} chunk {index + 1}",
@@ -2816,7 +3073,18 @@ class KnowledgeGraphStore:
2816
3073
  # 선: Document가 Task/Decision을 "포함함"
2817
3074
  self._upsert_edge(conn, file_id, sem_id, "포함함", weight=0.9)
2818
3075
 
2819
- return {"node_id": file_id, "sha256": digest, "metadata": metadata}
3076
+ return {
3077
+ "node_id": file_id,
3078
+ "type": "Document",
3079
+ "sha256": digest,
3080
+ "content_hash": digest,
3081
+ "source_node_id": source_node_id,
3082
+ "chunk_ids": chunk_ids,
3083
+ "chunk_count": len(chunk_ids),
3084
+ "duplicate": duplicate,
3085
+ "captured_at": captured_at,
3086
+ "metadata": metadata,
3087
+ }
2820
3088
 
2821
3089
  def ingest_event(
2822
3090
  self,
@@ -2854,6 +3122,513 @@ class KnowledgeGraphStore:
2854
3122
  self._upsert_edge(conn, person_id, event_id, "triggered", metadata={"event_type": event_type})
2855
3123
  return {"node_id": event_id, "type": event_type}
2856
3124
 
3125
+ # ── v3.6.0 Knowledge Graph First: unified source ingestion + provenance ──────
3126
def _node_exists(self, conn: sqlite3.Connection, node_id: str) -> bool:
    """Report whether the ``nodes`` table already holds a row with this id.

    Runs on the caller's connection so the check shares its transaction.
    """
    cursor = conn.execute("SELECT 1 FROM nodes WHERE id = ?", (node_id,))
    return cursor.fetchone() is not None
3129
+
3130
def node_is_embedded(self, node_id: str) -> bool:
    """Tell whether ``node_id`` has at least one stored vector embedding.

    Looks for a ``vector_embeddings`` row whose ``item_id`` equals
    ``node_id``; one match is enough, so the query stops at the first hit.
    """
    lookup = "SELECT 1 FROM vector_embeddings WHERE item_id = ? LIMIT 1"
    with self._connect() as conn:
        hit = conn.execute(lookup, (node_id,)).fetchone()
    return hit is not None
3138
+
3139
def _attach_source_node(
    self,
    conn: sqlite3.Connection,
    content_node_id: str,
    *,
    source_type: str,
    source_uri: Optional[str] = None,
    title: Optional[str] = None,
    content_hash: Optional[str] = None,
    captured_at: Optional[str] = None,
    extra: Optional[Dict[str, Any]] = None,
) -> str:
    """Upsert the ``Source`` node for a content node and link the two.

    The source id hashes ``source_type`` together with the first non-empty
    value among ``source_uri``, ``content_hash`` and ``content_node_id``, so
    calls that describe the same origin resolve to the same node id. An
    ``indexed_from`` edge is then upserted from the content node to the
    source node.

    Args:
        conn: Open connection the upserts run on.
        content_node_id: Id of the ingested content node.
        source_type: Kind of origin; stored in node and edge metadata.
        source_uri: Origin location, preferred as the identity key.
        title: Preferred label for the source node.
        content_hash: Fallback identity key when no URI is given.
        captured_at: Capture timestamp; ``_now()`` is used when empty.
        extra: Additional metadata merged over the standard keys.

    Returns:
        The id of the source node.
    """
    identity = source_uri or content_hash or content_node_id
    digest = _sha256_text(f"{source_type}|{identity}")
    source_id = f"source:{digest[:24]}"

    source_meta: Dict[str, Any] = {
        "source_type": source_type,
        "source_uri": source_uri,
        "content_hash": content_hash,
        "captured_at": captured_at or _now(),
    }
    # Extras are applied last, so they win when a key collides.
    if extra:
        source_meta.update(extra)

    self._upsert_node(
        conn,
        source_id,
        "Source",
        title or source_uri or source_type,
        summary=str(source_uri or title or source_type)[:400],
        metadata=source_meta,
    )
    # Edge: the content node was "indexed from" this source.
    self._upsert_edge(
        conn,
        content_node_id,
        source_id,
        "indexed_from",
        weight=1.0,
        metadata={"source_type": source_type},
    )
    return source_id
3177
+
3178
def ingest_source(
    self,
    *,
    source_type: str,
    title: str,
    text: str,
    source_uri: Optional[str] = None,
    owner: Optional[str] = None,
    workspace_id: Optional[str] = None,
    permissions: Optional[Dict[str, Any]] = None,
    captured_at: Optional[str] = None,
    modified_at: Optional[str] = None,
    conversation_id: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Ingest non-file text (URL, browser tab, note, raw text) into the graph.

    Writes, on a single connection: a ``Document`` content node whose id is
    derived from a hash of ``source_type``, ``source_uri`` and ``text``; a
    ``Source`` node linked through ``indexed_from``; optional ``Person`` and
    ``Chat`` nodes for ``owner`` and ``conversation_id``; one ``Chunk`` node
    per text chunk; and the auto-extracted concept and Task/Decision nodes
    with their edges.

    Returns:
        A dict with ``node_id``, ``type``, ``source_node_id``,
        ``content_hash``, ``chunk_ids``, ``chunk_count``, ``duplicate``
        (``True`` when the content node already existed before this call)
        and ``captured_at``.
    """
    # Normalise inputs before anything is hashed or stored.
    source_type = str(source_type or "text")
    text = str(text or "")
    raw_title = str(title or source_uri or source_type)
    title = _clean_text(raw_title)[:240] or source_type
    captured_at = captured_at or _now()

    content_hash = _sha256_text(f"{source_type}|{source_uri or ''}|{text}")
    content_id = f"webdoc:{content_hash[:24]}"
    full_text = f"{title}\n{text}"

    node_meta: Dict[str, Any] = {
        "source_type": source_type,
        "source_uri": source_uri,
        "content_hash": content_hash,
        "title": title,
        "captured_at": captured_at,
        "modified_at": modified_at,
        "owner": owner,
        "workspace_id": workspace_id,
        "permissions": permissions or {},
        "chars": len(text),
    }
    # Caller metadata is merged last, so it overrides the standard keys.
    if metadata:
        node_meta.update(metadata)

    concepts = _extract_concepts(full_text, limit=15)
    triples = _extract_triples(full_text, concepts)
    chunk_ids: List[str] = []

    with self._connect() as conn:
        # Must be read before the upsert below creates the node.
        duplicate = self._node_exists(conn, content_id)

        # Content node (the document itself).
        self._upsert_node(
            conn,
            content_id,
            "Document",
            title,
            summary=(text or title)[:500],
            metadata=node_meta,
            raw=node_meta,
        )

        # Source node plus indexed_from edge (origin tracking).
        source_node_id = self._attach_source_node(
            conn,
            content_id,
            source_type=source_type,
            source_uri=source_uri,
            title=title,
            content_hash=content_hash,
            captured_at=captured_at,
            extra={"owner": owner, "workspace_id": workspace_id},
        )

        # Owner (Person) node and its verb-style edge to the content.
        if owner:
            person_id = f"person:{_slug(owner)}"
            self._upsert_node(conn, person_id, "Person", owner, metadata={"email": owner})
            self._upsert_edge(conn, person_id, content_id, "업로드함", weight=1.0)

        # Link to the conversation the content was mentioned in.
        if conversation_id:
            conv_id = f"conversation:{_slug(conversation_id)}"
            self._upsert_node(conn, conv_id, "Chat", conversation_id)
            self._upsert_edge(conn, conv_id, content_id, "언급함", weight=0.8)

        # RAG chunks.
        for position, piece in enumerate(_chunks(text), start=1):
            index = position - 1
            chunk_id = f"chunk:{_sha256_text(f'{content_id}:{index}:{piece}')[:24]}"
            chunk_ids.append(chunk_id)
            self._upsert_node(
                conn,
                chunk_id,
                "Chunk",
                f"{title} chunk {position}",
                summary=piece[:500],
                metadata={"index": index, "source_node": content_id},
            )
            self._upsert_chunk(
                conn,
                chunk_id=chunk_id,
                source_node=content_id,
                text=piece,
                metadata={"index": index, "source_node": content_id},
            )
            self._upsert_edge(conn, content_id, chunk_id, "포함함")

        # Concept / Feature / Error / Code nodes and their edges.
        concept_ids: Dict[str, str] = {}
        for concept in concepts:
            node_type = _classify_node_type(concept, full_text)
            concept_node_id = f"{node_type.lower()}:{_slug(concept)}"
            concept_ids[concept.lower()] = concept_node_id
            self._upsert_node(
                conn,
                concept_node_id,
                node_type,
                concept,
                metadata={"auto_extracted": True, "source_type": source_type},
            )
            self._upsert_edge(conn, content_id, concept_node_id, "포함함", weight=0.8)

        # Relations between extracted concepts; skip unknown ends and self-loops.
        for triple in triples:
            subject_id = concept_ids.get(triple["subject"].lower())
            object_id = concept_ids.get(triple["object"].lower())
            if not subject_id or not object_id or subject_id == object_id:
                continue
            self._upsert_edge(
                conn,
                subject_id,
                object_id,
                triple["relation"],
                weight=1.0,
                metadata={"context": triple.get("context", "")[:240]},
            )

        # Task / Decision nodes.
        for item in _semantic_items(text):
            sem_type = item["type"]
            sem_title = item["title"]
            sem_digest = _sha256_text(f"{content_id}:{sem_type}:{sem_title}")
            sem_id = f"{sem_type.lower()}:{sem_digest[:24]}"
            self._upsert_node(
                conn,
                sem_id,
                sem_type,
                sem_title,
                summary=item["summary"],
                metadata={"auto_extracted": True, "source_node": content_id},
                raw=item,
            )
            self._upsert_edge(conn, content_id, sem_id, "포함함", weight=0.9)

    return {
        "node_id": content_id,
        "type": "Document",
        "source_node_id": source_node_id,
        "content_hash": content_hash,
        "chunk_ids": chunk_ids,
        "chunk_count": len(chunk_ids),
        "duplicate": duplicate,
        "captured_at": captured_at,
    }
3295
+
3296
def record_provenance(
    self,
    *,
    node_id: str,
    source_type: str,
    pipeline: str = "unified-ingestion",
    source_uri: Optional[str] = None,
    content_hash: Optional[str] = None,
    title: Optional[str] = None,
    owner: Optional[str] = None,
    workspace_id: Optional[str] = None,
    captured_at: Optional[str] = None,
    modified_at: Optional[str] = None,
    embedded: bool = False,
    linked: bool = False,
    duplicate: bool = False,
    agent_used: Optional[str] = None,
    chunk_count: int = 0,
    permissions: Optional[Dict[str, Any]] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Write one ``ingestion_provenance`` row describing an ingested node.

    The row id is a hash of ``node_id``, ``content_hash`` and the current
    ``_now()`` value; the row is written with ``INSERT OR REPLACE``.
    Boolean flags are stored as 0/1 and ``permissions`` / ``metadata`` are
    serialised with ``_json``.

    NOTE(review): two calls for the same node and hash that observe the same
    ``_now()`` value produce the same id and the later one replaces the
    earlier — confirm the timestamp resolution is fine enough for callers.

    Returns:
        ``{"id", "node_id", "created_at"}`` for the written row.
    """
    created_at = _now()
    basis = f"{node_id}|{content_hash or ''}|{created_at}"
    prov_id = f"prov:{_sha256_text(basis)[:24]}"

    insert_sql = """
        INSERT OR REPLACE INTO ingestion_provenance(
            id, node_id, source_type, source_uri, content_hash, title, pipeline,
            owner, workspace_id, captured_at, modified_at, embedded, linked,
            duplicate, agent_used, chunk_count, permissions_json, metadata_json, created_at)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    """
    # Values in the same order as the column list above.
    values = (
        prov_id,
        node_id,
        source_type,
        source_uri,
        content_hash,
        title,
        pipeline,
        owner,
        workspace_id,
        captured_at,
        modified_at,
        1 if embedded else 0,
        1 if linked else 0,
        1 if duplicate else 0,
        agent_used,
        int(chunk_count or 0),
        _json(permissions or {}),
        _json(metadata or {}),
        created_at,
    )
    with self._connect() as conn:
        conn.execute(insert_sql, values)
    return {"id": prov_id, "node_id": node_id, "created_at": created_at}
3338
+
3339
@staticmethod
def _provenance_row(row: sqlite3.Row) -> Dict[str, Any]:
    """Convert an ``ingestion_provenance`` row into a plain dict.

    Most columns are copied unchanged; ``embedded``, ``linked`` and
    ``duplicate`` become ``bool``; ``permissions_json`` and
    ``metadata_json`` are decoded with ``_safe_loads`` and exposed as
    ``permissions`` and ``metadata``.
    """
    passthrough = (
        "id",
        "node_id",
        "source_type",
        "source_uri",
        "content_hash",
        "title",
        "pipeline",
        "owner",
        "workspace_id",
        "captured_at",
        "modified_at",
    )
    record: Dict[str, Any] = {column: row[column] for column in passthrough}
    for flag in ("embedded", "linked", "duplicate"):
        record[flag] = bool(row[flag])
    record["agent_used"] = row["agent_used"]
    record["chunk_count"] = row["chunk_count"]
    record["permissions"] = _safe_loads(row["permissions_json"])
    record["metadata"] = _safe_loads(row["metadata_json"])
    record["created_at"] = row["created_at"]
    return record
3362
+
3363
def get_provenance(self, node_id: str) -> Optional[Dict[str, Any]]:
    """Fetch the newest provenance record for ``node_id``.

    Rows are ordered by ``created_at`` and then ``rowid``, both descending;
    the first one is converted with ``_provenance_row``. Returns ``None``
    when the node has no provenance rows.
    """
    sql = (
        "SELECT * FROM ingestion_provenance WHERE node_id = ? "
        "ORDER BY created_at DESC, rowid DESC LIMIT 1"
    )
    with self._connect() as conn:
        latest = conn.execute(sql, (node_id,)).fetchone()
    if latest is None:
        return None
    return self._provenance_row(latest)
3372
+
3373
def list_provenance(self, *, limit: int = 100, source_type: Optional[str] = None) -> Dict[str, Any]:
    """List provenance records, newest first.

    ``limit`` is clamped to the range 1..1000 (a falsy value means 100).
    When ``source_type`` is given only records of that type are returned.

    Returns:
        ``{"items": [...], "count": n}`` where each item comes from
        ``_provenance_row``.
    """
    capped = int(limit or 100)
    capped = min(capped, 1000)
    capped = max(capped, 1)

    sql = "SELECT * FROM ingestion_provenance "
    params: List[Any] = []
    if source_type:
        sql += "WHERE source_type = ? "
        params.append(source_type)
    sql += "ORDER BY created_at DESC, rowid DESC LIMIT ?"
    params.append(capped)

    with self._connect() as conn:
        fetched = conn.execute(sql, params).fetchall()
    records = [self._provenance_row(entry) for entry in fetched]
    return {"items": records, "count": len(fetched)}
3390
+
3391
def provenance_coverage(self) -> Dict[str, Any]:
    """Report how many nodes have at least one provenance record.

    Counts all rows of the node table returned by ``_read_tables()``, the
    subset whose id appears in ``ingestion_provenance``, the uncovered
    nodes grouped by type (top 20), and provenance rows per source type.

    Uncovered nodes are selected with ``NOT EXISTS`` rather than
    ``NOT IN``: with ``NOT IN``, a single provenance row whose ``node_id``
    is NULL makes the predicate unknown for every node, so the breakdown
    would come back empty. Ties in the breakdown are ordered by type name
    so the 20-row cut-off is deterministic.

    Returns:
        A dict with ``total_nodes``, ``nodes_with_provenance``,
        ``coverage_ratio`` (``None`` when there are no nodes),
        ``uncovered_by_type`` and ``provenance_by_source_type``.
    """
    nt, _ = self._read_tables()
    with self._connect() as conn:
        total = conn.execute(f"SELECT COUNT(*) FROM {nt}").fetchone()[0]
        covered = conn.execute(
            f"SELECT COUNT(*) FROM {nt} WHERE id IN (SELECT DISTINCT node_id FROM ingestion_provenance)"
        ).fetchone()[0]
        uncovered_rows = conn.execute(
            f"""
            SELECT n.type AS type, COUNT(*) AS c FROM {nt} AS n
            WHERE NOT EXISTS (
                SELECT 1 FROM ingestion_provenance AS p WHERE p.node_id = n.id
            )
            GROUP BY n.type ORDER BY c DESC, n.type ASC LIMIT 20
            """
        ).fetchall()
        source_rows = conn.execute(
            "SELECT source_type, COUNT(*) AS c FROM ingestion_provenance GROUP BY source_type"
        ).fetchall()

    uncovered_by_type = {row["type"]: row["c"] for row in uncovered_rows}
    by_source = {row["source_type"]: row["c"] for row in source_rows}
    return {
        "total_nodes": total,
        "nodes_with_provenance": covered,
        "coverage_ratio": round(covered / total, 4) if total else None,
        "uncovered_by_type": uncovered_by_type,
        "provenance_by_source_type": by_source,
    }
3425
+
3426
def provenance_stats(self) -> Dict[str, Any]:
    """Summarise the ``ingestion_provenance`` table.

    Returns:
        A dict with the total row count (``total``), rows per source type
        (``by_source_type``), the number of rows flagged ``embedded`` and
        ``duplicate`` (``embedded``, ``duplicates``), and the greatest
        ``created_at`` value (``last_ingested_at``, ``None`` when the table
        is empty).
    """
    with self._connect() as conn:

        def count(sql: str) -> int:
            # Every counting query aliases its result column as "c".
            return conn.execute(sql).fetchone()["c"]

        total = count("SELECT COUNT(*) AS c FROM ingestion_provenance")
        grouped = conn.execute(
            "SELECT source_type, COUNT(*) AS c FROM ingestion_provenance GROUP BY source_type"
        ).fetchall()
        embedded = count("SELECT COUNT(*) AS c FROM ingestion_provenance WHERE embedded = 1")
        duplicates = count("SELECT COUNT(*) AS c FROM ingestion_provenance WHERE duplicate = 1")
        newest = conn.execute(
            "SELECT created_at FROM ingestion_provenance ORDER BY created_at DESC LIMIT 1"
        ).fetchone()

    by_source: Dict[str, int] = {}
    for entry in grouped:
        by_source[entry["source_type"]] = entry["c"]

    last_ingested_at = newest["created_at"] if newest else None
    return {
        "total": total,
        "by_source_type": by_source,
        "embedded": embedded,
        "duplicates": duplicates,
        "last_ingested_at": last_ingested_at,
    }
3452
+
3453
+ # ── v3.6.0 portability: logical export / import + binary backup ──────────────
3454
def schema_versions(self) -> Dict[str, Any]:
    """Collect the version numbers stamped on exports and checked on import.

    ``embed_dim`` and ``kg_v2_schema_version`` are read from ``kg_schema``;
    if that lookup fails for any reason the fallbacks 1024 and 2 are used.
    """
    try:
        import kg_schema as _kg_schema

        embed_dim = _kg_schema.EMBED_DIM
        kg_v2_version = _kg_schema.KG_SCHEMA_V2_VERSION
    except Exception:  # pragma: no cover - kg_schema always importable in practice
        embed_dim, kg_v2_version = 1024, 2

    versions: Dict[str, Any] = {}
    versions["graph_schema_version"] = GRAPH_SCHEMA_VERSION
    versions["kg_v2_schema_version"] = kg_v2_version
    versions["projection_version"] = _PROJECTION_VERSION
    versions["embed_dim"] = embed_dim
    return versions
3466
+
3467
def export_graph_data(self, *, workspace_id: Optional[str] = None) -> Dict[str, Any]:
    """Dump the graph tables as plain dicts for a logical export.

    Exports every row of ``nodes``, ``edges``, ``chunks``,
    ``knowledge_sources`` and ``ingestion_provenance``; vector embeddings
    are not part of the result.

    When ``workspace_id`` is given, the kept node ids are those whose
    ``nodes_v2.workspace_id`` equals it or is NULL. Nodes, chunks and
    provenance are restricted to those ids, and an edge survives only when
    both endpoints do. ``knowledge_sources`` is exported unfiltered in both
    modes.

    Returns:
        A dict with the five row lists plus ``counts``, the length of each
        list keyed by the same names.
    """
    with self._connect() as conn:

        def dump(table: str) -> List[Dict[str, Any]]:
            return [dict(record) for record in conn.execute(f"SELECT * FROM {table}").fetchall()]

        # nodes_v2 is only consulted when a workspace filter is requested.
        keep_ids = None
        if workspace_id:
            scoped = conn.execute(
                "SELECT id FROM nodes_v2 WHERE workspace_id = ? OR workspace_id IS NULL",
                (workspace_id,),
            ).fetchall()
            keep_ids = {entry["id"] for entry in scoped}

        nodes = dump("nodes")
        edges = dump("edges")
        chunks = dump("chunks")
        provenance = dump("ingestion_provenance")
        sources = dump("knowledge_sources")

        if keep_ids is not None:
            nodes = [node for node in nodes if node["id"] in keep_ids]
            edges = [
                edge for edge in edges
                if edge["from_node"] in keep_ids and edge["to_node"] in keep_ids
            ]
            chunks = [chunk for chunk in chunks if chunk["source_node"] in keep_ids]
            provenance = [entry for entry in provenance if entry["node_id"] in keep_ids]

    data: Dict[str, Any] = {
        "nodes": nodes,
        "edges": edges,
        "chunks": chunks,
        "knowledge_sources": sources,
        "provenance": provenance,
    }
    # Computed before "counts" itself is added, so it covers only the row lists.
    data["counts"] = {name: len(items) for name, items in data.items()}
    return data
3515
+
3516
def import_graph_data(
    self, data: Dict[str, Any], *, mode: str = "merge", dry_run: bool = False
) -> Dict[str, Any]:
    """Import a logical export back into the store.

    Args:
        data: Export artifact with optional ``nodes``, ``edges``,
            ``chunks``, ``knowledge_sources``, ``provenance`` lists and an
            optional ``header`` carrying ``graph_schema_version``.
        mode: ``"replace"`` calls ``clear_all()`` before writing; any other
            value upserts on top of the existing data (id collisions
            update).
        dry_run: When true, return the plan without writing anything.

    Returns:
        The plan: ``mode`` plus the number of rows per section, with
        ``dry_run: True`` or ``imported: True`` added.

    Raises:
        ValueError: If the header's integer ``graph_schema_version`` is
            greater than this build's ``GRAPH_SCHEMA_VERSION``.
    """
    nodes = data.get("nodes") or []
    edges = data.get("edges") or []
    chunks = data.get("chunks") or []
    sources = data.get("knowledge_sources") or []
    provenance = data.get("provenance") or []

    header = data.get("header") or {}
    incoming_schema = header.get("graph_schema_version")
    if isinstance(incoming_schema, int) and incoming_schema > GRAPH_SCHEMA_VERSION:
        raise ValueError(
            f"Artifact graph_schema_version {incoming_schema} is newer than this "
            f"build ({GRAPH_SCHEMA_VERSION}); refusing to import."
        )

    plan = {
        "mode": mode,
        "nodes": len(nodes),
        "edges": len(edges),
        "chunks": len(chunks),
        "knowledge_sources": len(sources),
        "provenance": len(provenance),
    }
    if dry_run:
        plan["dry_run"] = True
        return plan

    if mode == "replace":
        self.clear_all()

    with self._connect() as conn:
        # Nodes first, then chunks and edges that refer to them.
        for n in nodes:
            self._upsert_node(
                conn, n["id"], n["type"], n.get("title") or "",
                summary=n.get("summary") or "",
                metadata=_safe_loads(n.get("metadata_json")),
                raw=_safe_loads(n.get("raw_json")),
            )
        for c in chunks:
            self._upsert_chunk(
                conn, chunk_id=c["id"], source_node=c["source_node"],
                text=c.get("text") or "", metadata=_safe_loads(c.get("metadata_json")),
            )
        for e in edges:
            # Only a missing weight falls back to 1.0. Using `or 1.0` here
            # would also rewrite an exported weight of 0 to 1.0.
            raw_weight = e.get("weight")
            weight = 1.0 if raw_weight is None or raw_weight == "" else float(raw_weight)
            self._upsert_edge(
                conn, e["from_node"], e["to_node"], e["type"],
                weight=weight,
                metadata=_safe_loads(e.get("metadata_json")),
            )
        for s in sources:
            conn.execute(
                """
                INSERT OR REPLACE INTO knowledge_sources(
                    id, root_path, os_type, drive_id, label, status, include_ocr,
                    watch_enabled, consent_json, created_at, updated_at, last_scanned_at)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """,
                (
                    s["id"], s["root_path"], s["os_type"], s.get("drive_id"), s.get("label"),
                    s.get("status") or "active", int(s.get("include_ocr") or 0),
                    int(s.get("watch_enabled") or 0), s.get("consent_json") or "{}",
                    s.get("created_at") or _now(), s.get("updated_at") or _now(),
                    s.get("last_scanned_at"),
                ),
            )
        for p in provenance:
            conn.execute(
                """
                INSERT OR REPLACE INTO ingestion_provenance(
                    id, node_id, source_type, source_uri, content_hash, title, pipeline,
                    owner, workspace_id, captured_at, modified_at, embedded, linked,
                    duplicate, agent_used, chunk_count, permissions_json, metadata_json, created_at)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """,
                (
                    p["id"], p["node_id"], p["source_type"], p.get("source_uri"),
                    p.get("content_hash"), p.get("title"), p.get("pipeline") or "import",
                    p.get("owner"), p.get("workspace_id"), p.get("captured_at"),
                    p.get("modified_at"), int(p.get("embedded") or 0), int(p.get("linked") or 0),
                    int(p.get("duplicate") or 0), p.get("agent_used"), int(p.get("chunk_count") or 0),
                    p.get("permissions_json") or "{}", p.get("metadata_json") or "{}",
                    p.get("created_at") or _now(),
                ),
            )
    plan["imported"] = True
    return plan
3611
+
3612
def backup_database(self, dest_path) -> Path:
    """Write a standalone snapshot of the live DB to ``dest_path``.

    Runs a full WAL checkpoint and then ``VACUUM INTO``, which writes a
    compacted copy of the whole database to a new file.

    The snapshot is first written to a sibling ``<name>.tmp`` file and only
    moved over ``dest_path`` once ``VACUUM INTO`` has succeeded. An existing
    snapshot at ``dest_path`` is therefore kept if the backup fails,
    instead of being deleted up front.

    Args:
        dest_path: Target file; parent directories are created as needed.

    Returns:
        ``dest_path`` as a ``Path``.

    Raises:
        Whatever the connection raises while checkpointing or vacuuming;
        the staging file is removed before the error propagates.
    """
    dest = Path(dest_path)
    dest.parent.mkdir(parents=True, exist_ok=True)

    staging = dest.with_name(dest.name + ".tmp")
    if staging.exists():
        staging.unlink()  # VACUUM INTO requires the target to not exist

    conn = self._connect()
    try:
        # Drain the pragma's result row so no statement is still active
        # when VACUUM runs.
        conn.execute("PRAGMA wal_checkpoint(FULL)").fetchall()
        conn.execute("VACUUM INTO ?", (str(staging),))
    except Exception:
        if staging.exists():
            staging.unlink()
        raise
    finally:
        conn.close()

    # Swap in the finished snapshot only after it is complete.
    staging.replace(dest)
    return dest
3631
+
2857
3632
  def _ingest_structure_nodes(
2858
3633
  self,
2859
3634
  conn: sqlite3.Connection,
@@ -3044,6 +3819,13 @@ class KnowledgeGraphStore:
3044
3819
  "Feature", # 소프트웨어 기능
3045
3820
  "Task", # 할 일
3046
3821
  "Decision", # 결정 사항
3822
+ # v3.6.0 Knowledge Graph First — 1급 엔티티를 그래프에 노출
3823
+ "Source", # 수집 출처 (파일/URL/브라우저 탭/git)
3824
+ "Repository", # git 저장소
3825
+ "Meeting", # 회의
3826
+ "Organization", # 조직
3827
+ "Workflow", # 워크플로우
3828
+ "Agent", # 에이전트
3047
3829
  )
3048
3830
 
3049
3831
  def list_documents(self, limit: int = 200) -> Dict[str, Any]:
@@ -3091,7 +3873,40 @@ class KnowledgeGraphStore:
3091
3873
  "generated_at": datetime.now().isoformat(timespec="seconds"),
3092
3874
  }
3093
3875
 
3094
- def graph(self, limit: int = 300) -> Dict[str, Any]:
3876
def workspaces_of(self, node_ids) -> Dict[str, Optional[str]]:
    """Map node ids to their workspace scope (None = legacy-global).

    Ids are stringified, falsy ones are dropped and duplicates collapsed.
    ``nodes_v2`` is then queried in fixed-size batches. A single ``IN``
    list with one placeholder per id can exceed SQLite's bound-variable
    limit on large inputs; the resulting error used to be swallowed into
    an empty mapping, which callers read as "no node is scoped".

    Args:
        node_ids: Iterable of node ids.

    Returns:
        ``{id: workspace_id}`` for every id found in ``nodes_v2``; ids not
        present are absent from the mapping. An empty dict is returned
        when there is nothing to look up or the query fails with a SQLite
        error (for example when ``nodes_v2`` does not exist).
    """
    ids = list(dict.fromkeys(str(i) for i in node_ids if i))
    if not ids:
        return {}

    batch_size = 500  # comfortably below SQLite's smallest default variable limit (999)
    scopes: Dict[str, Optional[str]] = {}
    with self._connect() as conn:
        try:
            for start in range(0, len(ids), batch_size):
                batch = ids[start:start + batch_size]
                placeholders = ",".join("?" for _ in batch)
                for row in conn.execute(
                    f"SELECT id, workspace_id FROM nodes_v2 WHERE id IN ({placeholders})",
                    batch,
                ):
                    scopes[row["id"]] = row["workspace_id"]
        except sqlite3.Error:
            # Best-effort lookup: report no scopes rather than fail the caller.
            return {}
    return scopes
3892
+
3893
def filter_scoped_nodes(self, items, allowed_workspaces, *, id_key: str = "id"):
    """Drop items scoped to a workspace the caller is not a member of.

    ``items`` is materialised once up front, so generators and other
    one-shot iterables are filtered correctly. Previously the input was
    walked twice and a generator produced an empty result.

    Args:
        items: Iterable of dict-like items carrying a node id.
        allowed_workspaces: Workspace ids the caller may see, or ``None``
            to disable scoping entirely.
        id_key: Key under which each item stores its node id.

    Returns:
        A new list with the items whose scope, as reported by
        ``workspaces_of``, is ``None`` (no workspace recorded) or is in
        ``allowed_workspaces``. With ``allowed_workspaces=None`` all items
        are returned.
    """
    items = list(items)
    if allowed_workspaces is None:
        return items

    allowed = set(allowed_workspaces)
    scopes = self.workspaces_of([item.get(id_key) for item in items])

    visible = []
    for item in items:
        scope = scopes.get(item.get(id_key))
        # Items without a recorded workspace stay visible to everyone.
        if scope is None or scope in allowed:
            visible.append(item)
    return visible
3908
+
3909
+ def graph(self, limit: int = 300, *, allowed_workspaces=None) -> Dict[str, Any]:
3095
3910
  limit = max(1, min(int(limit or 300), 2000))
3096
3911
  visible = ",".join(f"'{t}'" for t in self._GRAPH_VISIBLE_TYPES)
3097
3912
  nt, et = self._read_tables()
@@ -3141,6 +3956,11 @@ class KnowledgeGraphStore:
3141
3956
  for row in edge_rows
3142
3957
  ]
3143
3958
 
3959
+ if allowed_workspaces is not None:
3960
+ nodes = self.filter_scoped_nodes(nodes, allowed_workspaces)
3961
+ kept_ids = {node["id"] for node in nodes}
3962
+ edges = [e for e in edges if e["from"] in kept_ids and e["to"] in kept_ids]
3963
+
3144
3964
  degree_map: Dict[str, int] = {}
3145
3965
  now = datetime.now()
3146
3966
  node_by_id = {node["id"]: node for node in nodes}
@@ -3216,16 +4036,32 @@ class KnowledgeGraphStore:
3216
4036
  with self._connect() as conn:
3217
4037
  rows = []
3218
4038
  if query:
3219
- rows = conn.execute(
3220
- f"""
3221
- SELECT id, type, title, summary, metadata_json, updated_at
3222
- FROM {nt}
3223
- WHERE title LIKE ? OR summary LIKE ? OR metadata_json LIKE ?
3224
- ORDER BY updated_at DESC, id ASC
3225
- LIMIT ?
3226
- """,
3227
- (q, q, q, limit),
3228
- ).fetchall()
4039
+ fts_ids = self._fts_match_ids(conn, query, limit)
4040
+ if fts_ids:
4041
+ placeholders = ",".join("?" for _ in fts_ids)
4042
+ by_id = {
4043
+ row["id"]: row
4044
+ for row in conn.execute(
4045
+ f"""
4046
+ SELECT id, type, title, summary, metadata_json, updated_at
4047
+ FROM {nt} WHERE id IN ({placeholders})
4048
+ """,
4049
+ fts_ids,
4050
+ ).fetchall()
4051
+ }
4052
+ # Preserve FTS bm25 rank order.
4053
+ rows = [by_id[i] for i in fts_ids if i in by_id]
4054
+ else:
4055
+ rows = conn.execute(
4056
+ f"""
4057
+ SELECT id, type, title, summary, metadata_json, updated_at
4058
+ FROM {nt}
4059
+ WHERE title LIKE ? OR summary LIKE ? OR metadata_json LIKE ?
4060
+ ORDER BY updated_at DESC, id ASC
4061
+ LIMIT ?
4062
+ """,
4063
+ (q, q, q, limit),
4064
+ ).fetchall()
3229
4065
 
3230
4066
  if len(rows) < limit:
3231
4067
  terms = _topic_candidates(query, limit=8)
@@ -3260,6 +4096,10 @@ class KnowledgeGraphStore:
3260
4096
  } else 0
3261
4097
  return (hits, type_boost, row["updated_at"] or "")
3262
4098
 
4099
+ # Deterministic contract: rows with equal relevance order by id ASC
4100
+ # (stable sort preserves the pre-sort under reverse=True), matching
4101
+ # the legacy LIKE path regardless of FTS bm25 tie ordering.
4102
+ rows = sorted(rows, key=lambda r: r["id"])
3263
4103
  rows = sorted(rows, key=score, reverse=True)[:limit]
3264
4104
  return {
3265
4105
  "query": query,
@@ -3744,6 +4584,9 @@ class KnowledgeGraphStore:
3744
4584
  "backend": "sqlite",
3745
4585
  "embedding_model": self._embedding_model.model_id,
3746
4586
  "embedding_dim": self._embedding_model.dim,
4587
+ # Honest capability report: trigram FTS5 keyword index, or
4588
+ # LIKE-scan fallback when this SQLite build lacks it.
4589
+ "fts_enabled": bool(getattr(self, "_fts_enabled", False)),
3747
4590
  },
3748
4591
  "source_items": len(source_items),
3749
4592
  "indexed_items": sum(vector_counts.values()),
@@ -3847,21 +4690,26 @@ class KnowledgeGraphStore:
3847
4690
  return {"status": "skipped", "removed_nodes": 0}
3848
4691
  conv_id = f"conversation:{_slug(conversation_id)}"
3849
4692
  with self._connect() as conn:
4693
+ # Edge rows may carry the legacy lowercase label (pre-v4) or the
4694
+ # canonical EdgeType value (v4 write door) — match both.
3850
4695
  direct_ids = [
3851
4696
  row["to_node"]
3852
4697
  for row in conn.execute(
3853
- "SELECT to_node FROM edges WHERE from_node=? AND type='contains'",
4698
+ "SELECT to_node FROM edges WHERE from_node=? AND type IN ('contains', 'CONTAINS')",
3854
4699
  (conv_id,),
3855
4700
  )
3856
4701
  ]
3857
4702
  remove_ids = set(direct_ids)
4703
+ child_types = [
4704
+ "has_chunk", "implies", "contains_signal", "has_page",
4705
+ "has_slide", "has_sheet", "contains_image",
4706
+ ]
4707
+ child_types += [t.upper() for t in child_types]
4708
+ placeholders = ",".join("?" for _ in child_types)
3858
4709
  for source_id in list(direct_ids):
3859
4710
  for row in conn.execute(
3860
- """
3861
- SELECT to_node FROM edges
3862
- WHERE from_node=? AND type IN ('has_chunk', 'implies', 'contains_signal', 'has_page', 'has_slide', 'has_sheet', 'contains_image')
3863
- """,
3864
- (source_id,),
4711
+ f"SELECT to_node FROM edges WHERE from_node=? AND type IN ({placeholders})",
4712
+ (source_id, *child_types),
3865
4713
  ):
3866
4714
  remove_ids.add(row["to_node"])
3867
4715
  remove_ids.add(conv_id)