ltcai 3.6.0 → 4.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238) hide show
  1. package/README.md +39 -31
  2. package/docs/CHANGELOG.md +64 -0
  3. package/docs/REALTIME_COLLABORATION.md +3 -3
  4. package/docs/V3_FRONTEND.md +9 -8
  5. package/docs/V4_BRAIN_ARCHITECTURE.md +322 -0
  6. package/docs/V4_DIGITAL_BRAIN_RECOVERY.md +552 -0
  7. package/docs/V4_IMPLEMENTATION_PLAN.md +470 -0
  8. package/docs/kg-schema.md +51 -53
  9. package/docs/spec-vs-impl.md +10 -10
  10. package/kg_schema.py +2 -520
  11. package/knowledge_graph.py +37 -4629
  12. package/knowledge_graph_api.py +11 -127
  13. package/latticeai/__init__.py +1 -1
  14. package/latticeai/api/admin.py +16 -17
  15. package/latticeai/api/agents.py +20 -7
  16. package/latticeai/api/auth.py +46 -15
  17. package/latticeai/api/chat.py +112 -76
  18. package/latticeai/api/health.py +1 -1
  19. package/latticeai/api/hooks.py +1 -1
  20. package/latticeai/api/invitations.py +100 -0
  21. package/latticeai/api/knowledge_graph.py +139 -0
  22. package/latticeai/api/local_files.py +1 -1
  23. package/latticeai/api/mcp.py +23 -11
  24. package/latticeai/api/memory.py +1 -1
  25. package/latticeai/api/models.py +1 -1
  26. package/latticeai/api/network.py +81 -0
  27. package/latticeai/api/plugins.py +3 -6
  28. package/latticeai/api/realtime.py +5 -8
  29. package/latticeai/api/search.py +26 -2
  30. package/latticeai/api/security_dashboard.py +2 -3
  31. package/latticeai/api/setup.py +2 -2
  32. package/latticeai/api/static_routes.py +11 -16
  33. package/latticeai/api/tools.py +3 -0
  34. package/latticeai/api/ui_redirects.py +26 -0
  35. package/latticeai/api/workflow_designer.py +85 -6
  36. package/latticeai/api/workspace.py +93 -57
  37. package/latticeai/app_factory.py +1781 -0
  38. package/latticeai/brain/__init__.py +18 -0
  39. package/latticeai/brain/_kg_common.py +1123 -0
  40. package/latticeai/brain/context.py +213 -0
  41. package/latticeai/brain/conversations.py +236 -0
  42. package/latticeai/brain/discovery.py +1455 -0
  43. package/latticeai/brain/documents.py +218 -0
  44. package/latticeai/brain/identity.py +175 -0
  45. package/latticeai/brain/ingest.py +644 -0
  46. package/latticeai/brain/memory.py +102 -0
  47. package/latticeai/brain/network.py +205 -0
  48. package/latticeai/brain/projection.py +561 -0
  49. package/latticeai/brain/provenance.py +401 -0
  50. package/latticeai/brain/retrieval.py +1316 -0
  51. package/latticeai/brain/schema.py +640 -0
  52. package/latticeai/brain/store.py +216 -0
  53. package/latticeai/brain/write_master.py +225 -0
  54. package/latticeai/core/agent.py +31 -7
  55. package/latticeai/core/audit.py +0 -7
  56. package/latticeai/core/config.py +1 -1
  57. package/latticeai/core/context_builder.py +1 -2
  58. package/latticeai/core/enterprise.py +1 -1
  59. package/latticeai/core/graph_curator.py +2 -2
  60. package/latticeai/core/invitations.py +131 -0
  61. package/latticeai/core/marketplace.py +1 -1
  62. package/latticeai/core/mcp_registry.py +791 -0
  63. package/latticeai/core/model_compat.py +1 -1
  64. package/latticeai/core/model_resolution.py +0 -1
  65. package/latticeai/core/multi_agent.py +238 -4
  66. package/latticeai/core/policy.py +54 -0
  67. package/latticeai/core/realtime.py +65 -44
  68. package/latticeai/core/security.py +1 -1
  69. package/latticeai/core/sessions.py +66 -10
  70. package/latticeai/core/users.py +147 -0
  71. package/latticeai/core/workflow_engine.py +114 -2
  72. package/latticeai/core/workspace_os.py +477 -29
  73. package/latticeai/models/__init__.py +7 -0
  74. package/latticeai/models/router.py +779 -0
  75. package/latticeai/server_app.py +29 -1536
  76. package/latticeai/services/agent_runtime.py +243 -4
  77. package/latticeai/services/app_context.py +75 -14
  78. package/latticeai/services/ingestion.py +47 -0
  79. package/latticeai/services/kg_portability.py +33 -3
  80. package/latticeai/services/memory_service.py +39 -11
  81. package/latticeai/services/model_runtime.py +2 -5
  82. package/latticeai/services/platform_runtime.py +100 -23
  83. package/latticeai/services/run_executor.py +328 -0
  84. package/latticeai/services/search_service.py +17 -8
  85. package/latticeai/services/tool_dispatch.py +12 -2
  86. package/latticeai/services/triggers.py +241 -0
  87. package/latticeai/services/upload_service.py +37 -12
  88. package/latticeai/services/workspace_service.py +55 -16
  89. package/llm_router.py +29 -772
  90. package/ltcai_cli.py +1 -2
  91. package/mcp_registry.py +25 -788
  92. package/p_reinforce.py +124 -14
  93. package/package.json +10 -20
  94. package/scripts/bump_version.py +99 -0
  95. package/scripts/generate_diagrams.py +0 -1
  96. package/scripts/lint_v3.mjs +105 -18
  97. package/scripts/validate_release_artifacts.py +0 -1
  98. package/scripts/wheel_smoke.py +142 -0
  99. package/server.py +11 -7
  100. package/setup_wizard.py +1142 -0
  101. package/static/sw.js +81 -52
  102. package/static/v3/asset-manifest.json +33 -25
  103. package/static/v3/css/{lattice.base.e4cdd05d.css → lattice.base.49deefb5.css} +1 -1
  104. package/static/v3/css/lattice.base.css +1 -1
  105. package/static/v3/css/{lattice.components.9b49d614.css → lattice.components.cde18231.css} +1 -1
  106. package/static/v3/css/lattice.components.css +1 -1
  107. package/static/v3/css/{lattice.shell.8fcc9d33.css → lattice.shell.29d36d85.css} +1 -1
  108. package/static/v3/css/lattice.shell.css +1 -1
  109. package/static/v3/css/{lattice.tokens.e7018963.css → lattice.tokens.304cbc40.css} +3 -0
  110. package/static/v3/css/lattice.tokens.css +3 -0
  111. package/static/v3/css/{lattice.views.22f69117.css → lattice.views.0a18b6c5.css} +2 -2
  112. package/static/v3/css/lattice.views.css +2 -2
  113. package/static/v3/index.html +3 -4
  114. package/static/v3/js/{app.c541f955.js → app.c5c80c46.js} +1 -1
  115. package/static/v3/js/core/{api.33d6320e.js → api.ba0fbf14.js} +58 -1
  116. package/static/v3/js/core/api.js +57 -0
  117. package/static/v3/js/core/i18n.880e1fec.js +575 -0
  118. package/static/v3/js/core/i18n.js +575 -0
  119. package/static/v3/js/core/routes.37522821.js +101 -0
  120. package/static/v3/js/core/routes.js +71 -63
  121. package/static/v3/js/core/{shell.8c163e0e.js → shell.e3f6bbfa.js} +68 -39
  122. package/static/v3/js/core/shell.js +66 -37
  123. package/static/v3/js/core/{store.34ebd5e6.js → store.7b2aa044.js} +11 -1
  124. package/static/v3/js/core/store.js +11 -1
  125. package/static/v3/js/views/account.eff40715.js +143 -0
  126. package/static/v3/js/views/account.js +143 -0
  127. package/static/v3/js/views/activity.0d271ef9.js +67 -0
  128. package/static/v3/js/views/activity.js +67 -0
  129. package/static/v3/js/views/{admin-users.03bac88c.js → admin-users.f7ac7b43.js} +4 -6
  130. package/static/v3/js/views/admin-users.js +4 -6
  131. package/static/v3/js/views/{agents.014d0b74.js → agents.17c5288d.js} +35 -12
  132. package/static/v3/js/views/agents.js +35 -12
  133. package/static/v3/js/views/{chat.e6dd7dd0.js → chat.e250e2cc.js} +23 -0
  134. package/static/v3/js/views/chat.js +23 -0
  135. package/static/v3/js/views/graph-canvas.17c15d65.js +509 -0
  136. package/static/v3/js/views/graph-canvas.js +509 -0
  137. package/static/v3/js/views/{hybrid-search.b22b97e0.js → hybrid-search.2fb63ed9.js} +1 -2
  138. package/static/v3/js/views/hybrid-search.js +1 -2
  139. package/static/v3/js/views/{knowledge-graph.a96040a5.js → knowledge-graph.4d09c537.js} +60 -44
  140. package/static/v3/js/views/knowledge-graph.js +60 -44
  141. package/static/v3/js/views/network.52a4f181.js +97 -0
  142. package/static/v3/js/views/network.js +97 -0
  143. package/static/v3/js/views/{planning.9ac3e313.js → planning.4876fd77.js} +26 -5
  144. package/static/v3/js/views/planning.js +26 -5
  145. package/static/v3/js/views/runs.b63b2afa.js +144 -0
  146. package/static/v3/js/views/runs.js +144 -0
  147. package/static/v3/js/views/{settings.8631fa5e.js → settings.b7140634.js} +7 -8
  148. package/static/v3/js/views/settings.js +7 -8
  149. package/static/v3/js/views/snapshots.6f5db095.js +135 -0
  150. package/static/v3/js/views/snapshots.js +135 -0
  151. package/static/v3/js/views/{workflows.26c57290.js → workflows.7752225a.js} +87 -2
  152. package/static/v3/js/views/workflows.js +87 -2
  153. package/static/v3/js/views/workspace-admin.c466029b.js +156 -0
  154. package/static/v3/js/views/workspace-admin.js +156 -0
  155. package/static/vendor/chart.umd.min.js +20 -0
  156. package/static/vendor/fonts/inter-latin-300-normal.woff2 +0 -0
  157. package/static/vendor/fonts/inter-latin-400-normal.woff2 +0 -0
  158. package/static/vendor/fonts/inter-latin-500-normal.woff2 +0 -0
  159. package/static/vendor/fonts/inter-latin-600-normal.woff2 +0 -0
  160. package/static/vendor/fonts/inter-latin-700-normal.woff2 +0 -0
  161. package/static/vendor/fonts/inter-latin-800-normal.woff2 +0 -0
  162. package/static/vendor/fonts/inter.css +44 -0
  163. package/static/vendor/icons/tabler-icons.min.css +4 -0
  164. package/static/vendor/icons/tabler-icons.woff2 +0 -0
  165. package/static/vendor/marked.min.js +69 -0
  166. package/telegram_bot.py +1 -2
  167. package/tools/commands.py +4 -2
  168. package/tools/computer.py +1 -1
  169. package/tools/documents.py +1 -3
  170. package/tools/filesystem.py +0 -4
  171. package/tools/knowledge.py +1 -3
  172. package/tools/network.py +1 -3
  173. package/codex_telegram_bot.py +0 -195
  174. package/docs/assets/v3.4.0/agent-run.png +0 -0
  175. package/docs/assets/v3.4.0/agents.png +0 -0
  176. package/docs/assets/v3.4.0/before/chat-before.png +0 -0
  177. package/docs/assets/v3.4.0/before/files-before.png +0 -0
  178. package/docs/assets/v3.4.0/chat.png +0 -0
  179. package/docs/assets/v3.4.0/connect-folder.png +0 -0
  180. package/docs/assets/v3.4.0/files.png +0 -0
  181. package/docs/assets/v3.4.0/home.png +0 -0
  182. package/docs/assets/v3.4.0/hooks-dispatch.png +0 -0
  183. package/docs/assets/v3.4.0/knowledge-graph.png +0 -0
  184. package/docs/assets/v3.4.0/local-agent.png +0 -0
  185. package/docs/assets/v3.4.0/memory.png +0 -0
  186. package/docs/assets/v3.4.0/settings.png +0 -0
  187. package/docs/assets/v3.4.0/vision-input.png +0 -0
  188. package/docs/assets/v3.4.0/workflows.png +0 -0
  189. package/docs/assets/v3.4.1/e2e_runtime_log.txt +0 -42
  190. package/docs/assets/v3.4.1/hooks-dispatch.png +0 -0
  191. package/docs/assets/v3.4.1/local-agent.png +0 -0
  192. package/docs/images/admin-dashboard.png +0 -0
  193. package/docs/images/architecture.png +0 -0
  194. package/docs/images/enterprise.png +0 -0
  195. package/docs/images/graph.png +0 -0
  196. package/docs/images/hero.gif +0 -0
  197. package/docs/images/knowledge-graph.png +0 -0
  198. package/docs/images/lattice-ai-demo.gif +0 -0
  199. package/docs/images/lattice-ai-hero.png +0 -0
  200. package/docs/images/logo.svg +0 -33
  201. package/docs/images/mobile-responsive.png +0 -0
  202. package/docs/images/model-recommendation.png +0 -0
  203. package/docs/images/onboarding.png +0 -0
  204. package/docs/images/organization.png +0 -0
  205. package/docs/images/pipeline.png +0 -0
  206. package/docs/images/screenshot-admin.png +0 -0
  207. package/docs/images/screenshot-chat.png +0 -0
  208. package/docs/images/screenshot-graph.png +0 -0
  209. package/docs/images/skills.png +0 -0
  210. package/docs/images/workspace-dark.png +0 -0
  211. package/docs/images/workspace-light.png +0 -0
  212. package/docs/images/workspace.png +0 -0
  213. package/requirements.txt +0 -16
  214. package/static/account.html +0 -115
  215. package/static/activity.html +0 -73
  216. package/static/admin.html +0 -488
  217. package/static/agents.html +0 -139
  218. package/static/chat.html +0 -844
  219. package/static/css/reference/account.css +0 -439
  220. package/static/css/reference/admin.css +0 -610
  221. package/static/css/reference/base.css +0 -1661
  222. package/static/css/reference/chat.css +0 -4623
  223. package/static/css/reference/graph.css +0 -1016
  224. package/static/css/responsive.css +0 -861
  225. package/static/graph.html +0 -124
  226. package/static/platform.css +0 -104
  227. package/static/plugins.html +0 -136
  228. package/static/scripts/account.js +0 -238
  229. package/static/scripts/admin.js +0 -1614
  230. package/static/scripts/chat.js +0 -5081
  231. package/static/scripts/graph.js +0 -1804
  232. package/static/scripts/platform.js +0 -64
  233. package/static/scripts/ux.js +0 -167
  234. package/static/scripts/workspace.js +0 -948
  235. package/static/v3/js/core/routes.2ce3815a.js +0 -93
  236. package/static/workflows.html +0 -146
  237. package/static/workspace.css +0 -1121
  238. package/static/workspace.html +0 -357
@@ -1,4633 +1,41 @@
1
- """
2
- SQLite knowledge graph for Lattice AI workspace memory.
1
+ """Compatibility shim for the v4 brain store.
3
2
 
4
- The graph keeps raw event JSON, normalized node metadata, and edges in one
5
- portable database so it can later migrate to Neo4j/Postgres without changing
6
- the ingestion contract.
3
+ The implementation now lives under :mod:`latticeai.brain`. Root imports are
4
+ kept for older integrations and tests.
7
5
  """
8
6
 
9
- import asyncio
10
- import hashlib
11
- import json
12
- import logging
13
- import math
14
- import os
15
- import platform
16
- import re
17
- import shutil
18
- import sqlite3
19
- import time
20
- import zipfile
21
- from collections import Counter
22
- from datetime import datetime
23
- from pathlib import Path
24
- from typing import Any, Dict, Iterable, List, Optional, Tuple
25
-
26
- try:
27
- from kg_schema import KGStoreV2, NodeType, EdgeType, _exec_script
28
- except Exception: # pragma: no cover - v2 schema is optional at import time
29
- KGStoreV2 = None # type: ignore[assignment]
30
- NodeType = None # type: ignore[assignment]
31
- EdgeType = None # type: ignore[assignment]
32
- _exec_script = None # type: ignore[assignment]
33
-
34
- from latticeai.core.local_embeddings import LocalEmbeddingModel
35
-
36
- # Default read source for the graph queries: v2 reconstruction views.
37
- # Override with LATTICEAI_KG_READ_V2=0 to fall back to the legacy tables.
38
- _READ_FROM_V2_DEFAULT = os.getenv("LATTICEAI_KG_READ_V2", "1") != "0"
39
-
40
- # Bump when the v2 projection layout changes (columns, normalization rules).
41
- # On init, a stale projection is dropped and rebuilt from the authoritative
42
- # legacy tables — safe because nodes_v2/edges_v2 only ever hold a derived view.
43
- # v4: summary nullable + verbatim (byte-faithful) projection of legacy values.
44
- _PROJECTION_VERSION = 4
45
-
46
- _llm_router_ref = None
47
-
48
- def set_llm_router(router_instance):
49
- global _llm_router_ref
50
- _llm_router_ref = router_instance
51
-
52
-
53
- GRAPH_SCHEMA_VERSION = 1
54
-
55
- LOCAL_TEXT_EXTENSIONS = {".txt", ".md"}
56
- LOCAL_CODE_EXTENSIONS = {
57
- ".py", ".js", ".ts", ".tsx", ".jsx", ".html", ".css", ".json",
58
- ".yaml", ".yml", ".xml", ".sql", ".sh", ".zsh", ".toml", ".ini",
59
- }
60
- LOCAL_DOCUMENT_EXTENSIONS = {".pdf", ".docx"}
61
- LOCAL_SPREADSHEET_EXTENSIONS = {".xlsx", ".csv"}
62
- LOCAL_SLIDE_EXTENSIONS = {".pptx"}
63
- LOCAL_IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".webp"}
64
- LOCAL_SUPPORTED_EXTENSIONS = (
65
- LOCAL_TEXT_EXTENSIONS
66
- | LOCAL_CODE_EXTENSIONS
67
- | LOCAL_DOCUMENT_EXTENSIONS
68
- | LOCAL_SPREADSHEET_EXTENSIONS
69
- | LOCAL_SLIDE_EXTENSIONS
70
- | LOCAL_IMAGE_EXTENSIONS
7
+ from latticeai.brain._kg_common import ( # noqa: F401
8
+ EDGE_VERB,
9
+ GRAPH_SCHEMA_VERSION,
10
+ LOCAL_CODE_EXTENSIONS,
11
+ LOCAL_DOCUMENT_EXTENSIONS,
12
+ LOCAL_IMAGE_EXTENSIONS,
13
+ LOCAL_SIZE_LIMITS,
14
+ LOCAL_SLIDE_EXTENSIONS,
15
+ LOCAL_SPREADSHEET_EXTENSIONS,
16
+ LOCAL_SUPPORTED_EXTENSIONS,
17
+ LOCAL_TEXT_EXTENSIONS,
18
+ _KG_DB_FORMAT_VERSION,
19
+ _PROJECTION_VERSION,
20
+ _extract_concepts,
21
+ _extract_concepts_rules,
22
+ _extract_triples,
23
+ _extract_triples_rules,
24
+ _slug,
25
+ set_llm_router,
71
26
  )
72
-
73
- LOCAL_SIZE_LIMITS = {
74
- "text": 4_000_000,
75
- "code": 4_000_000,
76
- "pdf": 50_000_000,
77
- "document": 50_000_000,
78
- "spreadsheet": 50_000_000,
79
- "slide_deck": 50_000_000,
80
- "image": 100_000_000,
81
- }
82
-
83
- COMMON_EXCLUDED_DIRS = {
84
- ".git", "node_modules", ".venv", "venv", "env", "__pycache__",
85
- ".pytest_cache", ".mypy_cache", ".ruff_cache", ".next", ".nuxt",
86
- ".turbo", "dist", "build", "target", "out", "coverage", ".cache",
87
- ".config", ".ssh", ".gnupg", ".docker", ".kube", ".aws", ".azure",
88
- ".npm", ".pnpm-store", ".yarn", ".bun", ".cargo", ".rustup", ".pyenv",
89
- ".conda", ".local", ".claude", ".codex", ".cursor", ".copilot",
90
- ".antigravity", ".antigravity-ide",
91
- }
92
-
93
- COMMON_EXCLUDED_FILE_NAMES = {
94
- ".env", ".env.local", ".env.production", ".env.development",
95
- "id_rsa", "id_ed25519", "authorized_keys", "known_hosts",
96
- "credentials.json", "service-account.json", "token.json", "secrets.json",
97
- "cookies", "login data", "history", "web data", ".ds_store", "thumbs.db",
98
- }
99
- COMMON_EXCLUDED_FILE_SUFFIXES = {
100
- ".pem", ".key", ".p12", ".pfx", ".kdbx", ".wallet", ".sqlite", ".db",
101
- ".exe", ".dll", ".sys", ".msi", ".dmg", ".pkg", ".app", ".zip", ".tar",
102
- ".gz", ".7z", ".rar", ".mp4", ".mov", ".mp3", ".wav", ".tmp", ".bak",
103
- ".lock",
104
- }
105
- SENSITIVE_PATH_KEYWORDS = {
106
- "secret", "secrets", "token", "password", "passwd", "credential",
107
- "credentials", "private", "key", "wallet", "recovery", "seed",
108
- "mnemonic", "cookie", "session", "auth", "oauth", "certificate",
109
- "cert", "api_key", "apikey",
110
- }
111
-
112
- MACOS_EXCLUDED_PREFIXES = (
113
- "/System", "/Library", "/Applications", "/private", "/tmp", "/var",
114
- )
115
- WINDOWS_EXCLUDED_NAMES = {
116
- "windows", "program files", "program files (x86)", "programdata", "appdata",
117
- "$recycle.bin", "system volume information", "recovery", "perflogs",
118
- "intel", "amd", "nvidia",
119
- }
120
- LINUX_EXCLUDED_PREFIXES = (
121
- "/bin", "/boot", "/dev", "/etc", "/lib", "/lib64", "/proc", "/root",
122
- "/run", "/sbin", "/sys", "/tmp", "/usr", "/var", "/snap", "/lost+found",
123
- )
124
-
125
-
126
- def _now() -> str:
127
- return datetime.now().isoformat()
128
-
129
-
130
- def _parse_iso(raw: Optional[str]) -> Optional[datetime]:
131
- if not raw:
132
- return None
133
- try:
134
- return datetime.fromisoformat(str(raw))
135
- except (TypeError, ValueError):
136
- return None
137
-
138
-
139
- def _recency_score(updated_at: Optional[str], *, now: Optional[datetime] = None, half_life_days: float = 14.0) -> float:
140
- stamp = _parse_iso(updated_at)
141
- if not stamp:
142
- return 0.0
143
- now = now or datetime.now()
144
- age_days = max(0.0, (now - stamp).total_seconds() / 86400.0)
145
- decay = math.log(2) / max(0.1, half_life_days)
146
- return math.exp(-decay * age_days)
147
-
148
-
149
- def _json(data: Optional[Dict[str, Any]]) -> str:
150
- return json.dumps(data or {}, ensure_ascii=False, sort_keys=True)
151
-
152
-
153
- def _safe_loads(raw: Optional[str]) -> Dict[str, Any]:
154
- """Tolerantly parse a metadata_json column — returns {} on corrupt rows."""
155
- if not raw:
156
- return {}
157
- try:
158
- value = json.loads(raw)
159
- return value if isinstance(value, dict) else {}
160
- except (json.JSONDecodeError, TypeError) as e:
161
- logging.warning("knowledge_graph: corrupt metadata_json (%s) — using empty dict", e)
162
- return {}
163
-
164
-
165
- def _slug(text: str, max_len: int = 96) -> str:
166
- value = re.sub(r"\s+", " ", str(text or "")).strip().lower()
167
- value = re.sub(r"[^0-9a-zA-Z가-힣._:@/-]+", "-", value).strip("-")
168
- return (value or "untitled")[:max_len]
169
-
170
-
171
- def _sha256_bytes(data: bytes) -> str:
172
- return hashlib.sha256(data).hexdigest()
173
-
174
-
175
- def _sha256_text(text: str) -> str:
176
- return hashlib.sha256(text.encode("utf-8", errors="replace")).hexdigest()
177
-
178
-
179
- def _safe_iso_from_stat_mtime(mtime: float) -> str:
180
- try:
181
- return datetime.fromtimestamp(float(mtime)).isoformat()
182
- except (TypeError, ValueError, OSError):
183
- return ""
184
-
185
-
186
- def _path_fingerprint(path: Path) -> str:
187
- return _sha256_text(str(path.expanduser().resolve()))[:24]
188
-
189
-
190
- def _is_relative_to(path: Path, base: Path) -> bool:
191
- try:
192
- path.relative_to(base)
193
- return True
194
- except ValueError:
195
- return False
196
-
197
-
198
- def _path_parts_lower(path: Path) -> List[str]:
199
- return [part.lower() for part in path.parts if part and part not in {os.sep, path.anchor}]
200
-
201
-
202
- def _current_os_type() -> str:
203
- system = platform.system().lower()
204
- if system.startswith("darwin"):
205
- return "macos"
206
- if system.startswith("windows"):
207
- return "windows"
208
- if system.startswith("linux"):
209
- return "linux"
210
- return system or "unknown"
211
-
212
-
213
- def _drive_id_for_path(path: Path) -> str:
214
- resolved = path.expanduser().resolve()
215
- if resolved.drive:
216
- return resolved.drive.upper()
217
- parts = resolved.parts
218
- if len(parts) >= 3 and parts[1] == "Volumes":
219
- return f"/Volumes/{parts[2]}"
220
- if len(parts) >= 3 and parts[1] == "media":
221
- return f"/media/{parts[2]}"
222
- if len(parts) >= 3 and parts[1] == "mnt":
223
- return f"/mnt/{parts[2]}"
224
- return resolved.anchor or "/"
225
-
226
-
227
- def _file_category(ext: str) -> str:
228
- ext = (ext or "").lower()
229
- if ext in LOCAL_CODE_EXTENSIONS:
230
- return "code"
231
- if ext in LOCAL_TEXT_EXTENSIONS:
232
- return "text"
233
- if ext == ".pdf":
234
- return "pdf"
235
- if ext in LOCAL_DOCUMENT_EXTENSIONS:
236
- return "document"
237
- if ext in LOCAL_SPREADSHEET_EXTENSIONS:
238
- return "spreadsheet"
239
- if ext in LOCAL_SLIDE_EXTENSIONS:
240
- return "slide_deck"
241
- if ext in LOCAL_IMAGE_EXTENSIONS:
242
- return "image"
243
- return "unsupported"
244
-
245
-
246
- def _node_type_for_category(category: str) -> str:
247
- return {
248
- "code": "CodeFile",
249
- "spreadsheet": "Spreadsheet",
250
- "slide_deck": "SlideDeck",
251
- "image": "Image",
252
- "unsupported": "File",
253
- }.get(category, "Document")
254
-
255
-
256
- def _parser_type_for_category(category: str, ext: str) -> str:
257
- if category in {"text", "code"}:
258
- return "plain_text"
259
- if category == "spreadsheet" and ext == ".csv":
260
- return "csv_text"
261
- if category == "image":
262
- return "image_ocr"
263
- return ext.lstrip(".") or category
264
-
265
-
266
- def _size_limit_for_category(category: str) -> int:
267
- return LOCAL_SIZE_LIMITS.get(category, LOCAL_SIZE_LIMITS["document"])
268
-
269
-
270
- def _is_hidden_path(path: Path, root: Optional[Path] = None) -> bool:
271
- parts: Iterable[str]
272
- if root is not None:
273
- try:
274
- parts = path.relative_to(root).parts
275
- except ValueError:
276
- parts = path.parts
277
- else:
278
- parts = path.parts
279
- return any(part.startswith(".") and part not in {".", ".."} for part in parts)
280
-
281
-
282
- def _excluded_directory_reason(path: Path, *, root: Optional[Path] = None, os_type: Optional[str] = None) -> Optional[str]:
283
- os_type = os_type or _current_os_type()
284
- name = path.name.lower()
285
- if name in COMMON_EXCLUDED_DIRS:
286
- return "excluded_folder"
287
- if _is_hidden_path(path, root):
288
- return "hidden_folder"
289
- parts = _path_parts_lower(path)
290
- if os_type == "windows" and any(part in WINDOWS_EXCLUDED_NAMES for part in parts):
291
- return "system_folder"
292
- normalized = path.as_posix()
293
- root_normalized = root.as_posix() if root else ""
294
-
295
- def _prefix_blocks(prefixes: Tuple[str, ...]) -> bool:
296
- for prefix in prefixes:
297
- path_under_prefix = normalized == prefix or normalized.startswith(f"{prefix}/")
298
- root_under_prefix = bool(root_normalized) and (
299
- root_normalized == prefix or root_normalized.startswith(f"{prefix}/")
300
- )
301
- if path_under_prefix and not root_under_prefix:
302
- return True
303
- return False
304
-
305
- if os_type == "macos":
306
- home_library = Path.home() / "Library"
307
- try:
308
- root_is_library = bool(root) and _is_relative_to(root.expanduser().resolve(), home_library.expanduser().resolve())
309
- if _is_relative_to(path.expanduser().resolve(), home_library.expanduser().resolve()) and not root_is_library:
310
- return "user_library"
311
- except OSError:
312
- pass
313
- if _prefix_blocks(MACOS_EXCLUDED_PREFIXES):
314
- return "system_folder"
315
- if os_type == "linux":
316
- if _prefix_blocks(LINUX_EXCLUDED_PREFIXES):
317
- return "system_folder"
318
- return None
319
-
320
-
321
- def _sensitive_file_reason(path: Path, *, root: Optional[Path] = None) -> Optional[str]:
322
- name = path.name.lower()
323
- suffix = path.suffix.lower()
324
- if name in COMMON_EXCLUDED_FILE_NAMES or suffix in COMMON_EXCLUDED_FILE_SUFFIXES:
325
- return "sensitive_or_excluded_file"
326
- try:
327
- rel_text = path.relative_to(root).as_posix().lower() if root else path.as_posix().lower()
328
- except ValueError:
329
- rel_text = path.as_posix().lower()
330
- tokens = re.split(r"[^0-9a-zA-Z_가-힣]+", rel_text)
331
- if any(token in SENSITIVE_PATH_KEYWORDS for token in tokens):
332
- return "sensitive_name"
333
- return None
334
-
335
-
336
- def _root_warning(path: Path, os_type: str) -> Optional[str]:
337
- resolved = path.expanduser().resolve()
338
- home = Path.home().expanduser().resolve()
339
- if os_type == "macos" and resolved == home:
340
- return "홈 전체에는 설정/숨김 폴더가 포함될 수 있습니다. 문서, 데스크탑, 다운로드, 프로젝트 폴더부터 추가하는 것을 권장합니다."
341
- if os_type == "linux" and resolved.as_posix() == "/":
342
- return "루트 디렉터리에는 시스템 파일이 포함되어 있습니다. 일반 사용자 폴더나 마운트된 데이터 폴더를 권장합니다."
343
- if os_type == "windows" and str(resolved).rstrip("\\/").upper() in {"C:", "C:\\"}:
344
- return "C드라이브에는 Windows 시스템 파일과 앱 설정 파일이 포함되어 있습니다. 하위 폴더를 선택하는 것을 권장합니다."
345
- return None
346
-
347
-
348
- def _sample_file(path: Path, root: Path, status: str, reason: str = "") -> Dict[str, Any]:
349
- try:
350
- rel = path.relative_to(root).as_posix()
351
- except ValueError:
352
- rel = path.name
353
- try:
354
- stat = path.stat()
355
- size = stat.st_size if path.is_file() else None
356
- modified_at = _safe_iso_from_stat_mtime(stat.st_mtime)
357
- except OSError:
358
- size = None
359
- modified_at = ""
360
- return {
361
- "path": str(path),
362
- "relative_path": rel,
363
- "name": path.name,
364
- "extension": path.suffix.lower(),
365
- "status": status,
366
- "reason": reason,
367
- "size_bytes": size,
368
- "modified_at": modified_at,
369
- }
370
-
371
-
372
- def _clean_text(text: str) -> str:
373
- return re.sub(r"\s+", " ", str(text or "")).strip()
374
-
375
-
376
- def _chunks(text: str, size: int = 1200, overlap: int = 160) -> List[str]:
377
- cleaned = str(text or "").strip()
378
- if not cleaned:
379
- return []
380
- chunks: List[str] = []
381
- start = 0
382
- while start < len(cleaned):
383
- end = min(len(cleaned), start + size)
384
- chunks.append(cleaned[start:end])
385
- if end >= len(cleaned):
386
- break
387
- start = max(0, end - overlap)
388
- return chunks
389
-
390
-
391
- _LLM_EXTRACT_CONCEPT_PROMPT = """Extract the key concepts from the following text.
392
- Return ONLY a JSON array of objects, each with "concept" (string) and "importance" (float 0-1).
393
- Extract up to {limit} concepts. Focus on named entities, technical terms, and domain-specific nouns.
394
- Do NOT include common words, stop words, or generic terms.
395
-
396
- Text:
397
- {text}
398
-
399
- JSON:"""
400
-
401
- _LLM_EXTRACT_TRIPLE_PROMPT = """Extract relationship triples from the following text.
402
- Return ONLY a JSON array of objects, each with:
403
- - "subject": source concept (string)
404
- - "relation": relationship verb (string, Korean or English)
405
- - "object": target concept (string)
406
- - "evidence": the sentence supporting this triple (string, max 240 chars)
407
- - "confidence": how confident you are (float 0-1)
408
-
409
- Extract up to {limit} triples. Focus on meaningful semantic relationships.
410
-
411
- Text:
412
- {text}
413
-
414
- Concepts already identified: {concepts}
415
-
416
- JSON:"""
417
-
418
- ENABLE_LLM_EXTRACTION = os.getenv("LATTICEAI_LLM_EXTRACTION", "true").lower() in ("1", "true", "yes")
419
-
420
-
421
- def _llm_extract_concepts(text: str, limit: int = 12) -> Optional[List[str]]:
422
- if not ENABLE_LLM_EXTRACTION or not _llm_router_ref:
423
- return None
424
- if not _llm_router_ref.current_model_id:
425
- return None
426
- prompt = _LLM_EXTRACT_CONCEPT_PROMPT.format(text=text[:3000], limit=limit)
427
- try:
428
- loop = asyncio.get_event_loop()
429
- if loop.is_running():
430
- import concurrent.futures
431
- with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
432
- future = pool.submit(asyncio.run, _llm_router_ref.generate(prompt, max_tokens=1024, temperature=0.1))
433
- raw = future.result(timeout=30)
434
- else:
435
- raw = asyncio.run(_llm_router_ref.generate(prompt, max_tokens=1024, temperature=0.1))
436
- raw = raw.strip()
437
- if raw.startswith("```"):
438
- raw = re.sub(r"^```(?:json)?\s*", "", raw)
439
- raw = re.sub(r"\s*```$", "", raw)
440
- parsed = json.loads(raw)
441
- if isinstance(parsed, list):
442
- concepts = []
443
- for item in parsed[:limit]:
444
- if isinstance(item, dict) and "concept" in item:
445
- concepts.append(item["concept"])
446
- elif isinstance(item, str):
447
- concepts.append(item)
448
- return concepts if concepts else None
449
- except Exception as e:
450
- logging.debug("LLM concept extraction failed (falling back to rules): %s", e)
451
- return None
452
-
453
-
454
- def _llm_extract_triples(text: str, concepts: List[str], limit: int = 20) -> Optional[List[Dict[str, str]]]:
455
- if not ENABLE_LLM_EXTRACTION or not _llm_router_ref:
456
- return None
457
- if not _llm_router_ref.current_model_id:
458
- return None
459
- prompt = _LLM_EXTRACT_TRIPLE_PROMPT.format(
460
- text=text[:3000], limit=limit,
461
- concepts=", ".join(concepts[:15]),
462
- )
463
- try:
464
- loop = asyncio.get_event_loop()
465
- if loop.is_running():
466
- import concurrent.futures
467
- with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
468
- future = pool.submit(asyncio.run, _llm_router_ref.generate(prompt, max_tokens=2048, temperature=0.1))
469
- raw = future.result(timeout=30)
470
- else:
471
- raw = asyncio.run(_llm_router_ref.generate(prompt, max_tokens=2048, temperature=0.1))
472
- raw = raw.strip()
473
- if raw.startswith("```"):
474
- raw = re.sub(r"^```(?:json)?\s*", "", raw)
475
- raw = re.sub(r"\s*```$", "", raw)
476
- parsed = json.loads(raw)
477
- if isinstance(parsed, list):
478
- triples = []
479
- for item in parsed[:limit]:
480
- if isinstance(item, dict) and "subject" in item and "object" in item:
481
- triples.append({
482
- "subject": str(item["subject"]),
483
- "relation": str(item.get("relation", "관련됨")),
484
- "object": str(item["object"]),
485
- "context": str(item.get("evidence", ""))[:240],
486
- "confidence": float(item.get("confidence", 0.8)),
487
- })
488
- return triples if triples else None
489
- except Exception as e:
490
- logging.debug("LLM triple extraction failed (falling back to rules): %s", e)
491
- return None
492
-
493
-
494
- _CONCEPT_STOP: set = {
495
- # English stop words
496
- "the", "and", "for", "with", "this", "that", "from", "into", "which",
497
- "are", "was", "were", "has", "have", "had", "can", "will", "would",
498
- "could", "should", "may", "might", "must", "shall", "being", "been",
499
- "also", "just", "then", "than", "when", "where", "what", "how", "why",
500
- "its", "their", "your", "our", "you", "they", "them", "these", "those",
501
- "use", "used", "using", "based", "like", "such", "via", "per", "let",
502
- "yes", "not", "but", "are", "all", "any", "out", "new", "get", "set",
503
- # Korean stop words
504
- "사용자", "내용", "파일", "채팅", "답변", "입니다", "그리고", "처럼",
505
- "있어", "없어", "이야", "이다", "한다", "하다", "되다", "됩니다",
506
- "경우", "방법", "부분", "상태", "정도", "결과", "이후", "이전",
507
- "그것", "이것", "저것", "여기", "거기", "저기", "우리", "저희",
508
- "기능", "서버", "모델", "설정", "설명", "버전", "지원", "사용", "실행",
509
- "todo", "fixme", "note", "참고", "주의", "warning",
510
- }
511
-
512
-
513
def _extract_concepts(text: str, limit: int = 12) -> List[str]:
    """Return concepts for *text*, preferring the LLM extractor.

    The rule-based extractor is used whenever the LLM path yields a falsy
    result (``None`` or an empty list).
    """
    return _llm_extract_concepts(text, limit) or _extract_concepts_rules(text, limit)
519
-
520
-
521
- def _extract_concepts_rules(text: str, limit: int = 12) -> List[str]:
522
- """Extract meaningful named concepts from text (rule-based).
523
-
524
- Priority order:
525
- 1. Backtick / quoted terms (explicitly technical)
526
- 2. Multi-word proper nouns (Lattice AI, GPT-4o, Claude Sonnet)
527
- 3. Single capitalized proper nouns not at sentence start (Claude, Python, FastAPI)
528
- 4. Korean compound technical terms (멀티모달, 에이전트, 그래프RAG)
529
- 5. Hyphenated / versioned identifiers (gpt-4o, mlx-vlm, gemma-4)
530
- """
531
- text = str(text or "")
532
- seen: dict = {} # concept_lower → original form
533
-
534
- def _add(term: str) -> None:
535
- key = term.strip().lower()
536
- if (
537
- key
538
- and key not in _CONCEPT_STOP
539
- and not key.isdigit()
540
- and len(key) >= 2
541
- ):
542
- seen.setdefault(key, term.strip())
543
-
544
- # 1. Backtick-quoted code/term (highest confidence)
545
- for m in re.findall(r'`([^`]{2,40})`', text):
546
- if not re.search(r'[\(\)\[\]{}]', m): # skip code expressions
547
- _add(m)
548
-
549
- # 2. Double/single quoted terms
550
- for m in re.findall(r'"([^"]{2,40})"', text):
551
- _add(m)
552
-
553
- # 3. Multi-word English proper nouns (Title Case or ALL-CAPS first word, 2–4 words).
554
- # Pattern A: Mixed-case first word — "Lattice AI", "Tool Use", "Graph RAG"
555
- for m in re.findall(
556
- r'([A-Z][a-z]{1,20}(?:\s+(?:[A-Z]{2,10}|[A-Z][a-z0-9]{1,20}|\d[\w.]{0,6})){1,3})',
557
- text,
558
- ):
559
- _add(m)
560
- # Pattern B: ALL-CAPS first word — "VS Code", "MCP Server", "GPT-4o Mini"
561
- for m in re.findall(
562
- r'([A-Z]{2,6}(?:\s+(?:[A-Z]{2,10}|[A-Z][a-z0-9]{1,20})){1,2})',
563
- text,
564
- ):
565
- _add(m)
566
-
567
- # 4. Single capitalized proper noun.
568
- # Use ASCII-boundary lookaround instead of \b so Korean particles
569
- # (와, 의, 는 …) after an English word don't block the match.
570
- all_caps_words = re.findall(r'(?<![A-Za-z0-9])([A-Z][A-Za-z0-9]{2,24})(?![A-Za-z0-9])', text)
571
- freq: Dict[str, int] = {}
572
- for w in all_caps_words:
573
- freq[w] = freq.get(w, 0) + 1
574
- sentence_starts = set(re.findall(r'(?:^|(?<=[.!?])\s+)([A-Z][a-z]+)', text))
575
- for m, cnt in freq.items():
576
- if m.lower() in _CONCEPT_STOP:
577
- continue
578
- if cnt >= 2 or m not in sentence_starts:
579
- _add(m)
580
-
581
- # 5. Korean technical compound nouns (3–12 chars, no common particles)
582
- for m in re.findall(r'[가-힣]{2,12}(?:AI|LLM|API|UI|RAG|bot|Bot|기능|모델|서버|에이전트|파이프라인|워크플로)', text):
583
- _add(m)
584
- # Korean standalone terms that appear after topic markers (은/는/이/가 앞)
585
- for m in re.findall(r'([가-힣]{2,12})(?:은|는|이|가|을|를|의|에서|으로|와|과)', text):
586
- if m.lower() not in _CONCEPT_STOP and len(m) >= 2:
587
- # Only add if it's non-trivial (has 3+ chars or appears multiple times)
588
- cnt = text.count(m)
589
- if len(m) >= 3 or cnt >= 2:
590
- _add(m)
591
-
592
- # 6. Hyphenated / versioned identifiers (gpt-4o, gemma-4, mlx-vlm)
593
- for m in re.findall(r'\b([a-zA-Z][a-zA-Z0-9]*(?:-[a-zA-Z0-9.]+)+)\b', text):
594
- if len(m) >= 4:
595
- _add(m)
596
-
597
- # De-duplicate: remove shorter if ALL its occurrences in the source text
598
- # are followed immediately by the suffix that forms the longer concept.
599
- # "Lattice" → dropped when every occurrence is "Lattice AI"
600
- # "Claude" → kept because it appears as just "Claude" too.
601
- values = list(seen.values())
602
- values_lower = [v.lower() for v in values]
603
- keep = set(range(len(values)))
604
- for i, v in enumerate(values):
605
- vl = v.lower()
606
- for j, wl in enumerate(values_lower):
607
- if i == j or j not in keep:
608
- continue
609
- # Check if vl is a word-prefix of wl
610
- suffix = wl[len(vl):]
611
- if not (wl.startswith(vl) and re.match(r'^[\s\-]', suffix)):
612
- continue
613
- # Count occurrences of v NOT followed by the suffix
614
- suffix_stripped = suffix.lstrip(" -")
615
- # Escape for regex
616
- pattern_with_suffix = re.escape(v) + r'[\s\-]+' + re.escape(suffix_stripped)
617
- pattern_alone = re.escape(v) + r'(?![\s\-]*' + re.escape(suffix_stripped) + r')'
618
- alone_count = len(re.findall(pattern_alone, text, re.IGNORECASE))
619
- if alone_count == 0:
620
- # Shorter term never appears alone → safe to remove
621
- keep.discard(i)
622
- break
623
-
624
- final = [values[i] for i in range(len(values)) if i in keep]
625
- return final[:limit]
626
-
627
-
628
# ──────────────────────────────────────────────────────────────────────────────
# Node type taxonomy (nodes = nouns)
# ──────────────────────────────────────────────────────────────────────────────
# Chat     — conversation session
# Document — file (PDF, PPT, Word, Excel, image, etc.)
# Concept  — concept, idea, or technical term
# Person   — person (the user or a mentioned individual)
# Error    — error, bug, or exception
# Code     — code snippet, function, or class
# Feature  — software feature
# Task     — to-do or action item
# Decision — decision that was made
640
-
641
# Edge type vocabulary (edges = verbs — past-tense predicates).
# Maps each Korean edge label to a regex of Korean/English trigger words.
# Insertion order matters: _infer_edge returns the first label whose pattern
# matches, so earlier entries take precedence over later ones.
EDGE_VERB = {
    "언급함": r"언급|mention|refer|cited",  # mentioned
    "포함함": r"포함|include|consist|구성|탑재|contains",  # included
    "해결함": r"해결|resolv|fix|수정|고쳤|closed",  # resolved
    "의존함": r"의존|depend|require|필요|based on",  # depended on
    "설명함": r"설명|explain|describe|정의|란|이란|means",  # explained
    "비교함": r"비교|versus|vs\.?|차이|다르|compare",  # compared
    "사용함": r"사용|use|활용|이용|apply",  # used
    "연결함": r"연결|connect|통합|integrate|연동|link",  # connected
    "확장함": r"확장|extend|플러그인|plugin|addon",  # extended
    "생성함": r"생성|만들|create|generate|build|produced",  # created
    "대체함": r"대체|replace|instead|alternative",  # replaced
    "지원함": r"지원|support|제공|provide|offer",  # supported
    "발생함": r"발생|occur|throw|raise|triggered",  # occurred
    "관련됨": r"관련|related|associated|연관",  # related (also the fallback label)
}
658
-
659
-
660
def _infer_edge(sentence: str) -> str:
    """Pick the edge label whose trigger pattern first matches *sentence*.

    Patterns are tried in ``EDGE_VERB`` order against the lower-cased sentence;
    "관련됨" is returned when none of them matches.
    """
    lowered = sentence.lower()
    return next(
        (label for label, pattern in EDGE_VERB.items() if re.search(pattern, lowered)),
        "관련됨",
    )
667
-
668
-
669
- # Technical words that cannot be person names
670
- _NOT_PERSON_WORDS: set = {
671
- "use", "api", "rag", "sdk", "ide", "cli", "llm", "mcp", "ui", "ux",
672
- "new", "old", "get", "set", "run", "add", "fix", "tool", "code",
673
- "base", "core", "data", "file", "test", "type", "mode", "view",
674
- }
675
-
676
-
677
- def _classify_node_type(concept: str, text: str) -> str:
678
- """Classify a concept into the node taxonomy.
679
-
680
- Term-level signals take priority; then a tight ±60-char window is used
681
- so distant keywords don't cause mis-classification.
682
- """
683
- term = concept.lower()
684
-
685
- # ── Term-level signals (highest confidence) ───────────────────────────
686
- if re.search(r'(?:error|exception|traceback|오류|에러|버그)$', term, re.I):
687
- return "Error"
688
- if re.search(r'error|exception|err\b', term, re.I) and len(concept) < 30:
689
- return "Error"
690
- if re.search(r'\(\)|\.py$|\.js$|\.ts$|\.go$|::\w', term):
691
- return "Code"
692
-
693
- # Person: "First Last" pattern, neither word is a known technical term
694
- if re.match(r'^[A-Z][a-z]{1,15} [A-Z][a-z]{1,15}$', concept):
695
- words = term.split()
696
- if not any(w in _NOT_PERSON_WORDS for w in words):
697
- return "Person"
698
-
699
- # ── Windowed context (±60 chars) — NOT used for Error to avoid false positives
700
- idx = text.lower().find(term)
701
- if idx >= 0:
702
- win = text[max(0, idx - 60): idx + len(concept) + 60].lower()
703
- if re.search(r'def |class |function|함수|클래스|메서드|import', win):
704
- return "Code"
705
- # Feature: concept appears DIRECTLY adjacent to 기능/feature keyword
706
- if (
707
- len(concept) <= 12
708
- and re.search(
709
- rf'{re.escape(term)}.{{0,8}}(?:기능|feature)|(?:기능|feature).{{0,8}}{re.escape(term)}',
710
- win,
711
- )
712
- ):
713
- return "Feature"
714
-
715
- return "Concept"
716
-
717
-
718
def _extract_triples(
    text: str,
    concepts: List[str],
    limit: int = 20,
) -> List[Dict[str, str]]:
    """Return triples for *text*, preferring the LLM extractor.

    The rule-based extractor is used whenever the LLM path yields a falsy
    result (``None`` or an empty list).
    """
    return _llm_extract_triples(text, concepts, limit) or _extract_triples_rules(text, concepts, limit)
728
-
729
-
730
- def _extract_triples_rules(
731
- text: str,
732
- concepts: List[str],
733
- limit: int = 20,
734
- ) -> List[Dict[str, str]]:
735
- """Extract (subject, verb-edge, object, context) triples from text (rule-based).
736
-
737
- For each sentence containing ≥2 concepts, infer the verb-form edge label
738
- from surrounding context and create a directed triple.
739
- """
740
- if len(concepts) < 2:
741
- return []
742
-
743
- concept_lower = {c.lower(): c for c in concepts}
744
- triples: List[Dict[str, str]] = []
745
- seen_pairs: set = set()
746
-
747
- # Split on sentence boundaries
748
- sentences = re.split(r'(?<=[.!?\n])\s+|\n{2,}', text)
749
- for sent in sentences:
750
- sent = sent.strip()
751
- if len(sent) < 8:
752
- continue
753
- sent_lower = sent.lower()
754
-
755
- present = [concept_lower[k] for k in concept_lower if k in sent_lower]
756
- if len(present) < 2:
757
- continue
758
-
759
- edge = _infer_edge(sent)
760
-
761
- for i in range(len(present) - 1):
762
- subj, obj = present[i], present[i + 1]
763
- # Deduplicate by (subj, obj) regardless of direction for same edge
764
- pair_key = tuple(sorted([subj.lower(), obj.lower()])) + (edge,)
765
- if pair_key in seen_pairs:
766
- continue
767
- seen_pairs.add(pair_key)
768
- triples.append({
769
- "subject": subj,
770
- "relation": edge, # verb form (동사)
771
- "object": obj,
772
- "context": sent[:240],
773
- })
774
- if len(triples) >= limit:
775
- return triples
776
-
777
- return triples
778
-
779
-
780
- def _semantic_items(text: str) -> List[Dict[str, str]]:
781
- """Extract explicit decision / task items from text."""
782
- items: List[Dict[str, str]] = []
783
- for raw_line in str(text or "").splitlines():
784
- line = _clean_text(raw_line)
785
- if len(line) < 6:
786
- continue
787
- lowered = line.lower()
788
- if re.search(r"(결정|확정|하기로|decided|decision)", lowered):
789
- items.append({"type": "Decision", "title": line[:120], "summary": line[:500]})
790
- if re.search(r"(todo|해야|하자|진행|구현|수정|확인|next|task|\[ \])", lowered):
791
- items.append({"type": "Task", "title": line[:120], "summary": line[:500]})
792
- return items[:8]
793
-
794
-
795
def _topic_candidates(text: str, limit: int = 8) -> List[str]:
    """Return compact keyword candidates for fallback graph search.

    Concept extraction is tried first. If it yields nothing, plain tokens
    (ASCII identifiers of 3+ characters or Hangul runs of 2–12 characters) are
    collected in first-seen order, skipping stop words.
    """
    concepts = _extract_concepts(text, limit=limit)
    if concepts:
        return concepts[:limit]

    picked: Dict[str, str] = {}
    for token in re.findall(r"[A-Za-z][A-Za-z0-9_.:-]{2,}|[가-힣]{2,12}", str(text or "")):
        lowered = token.lower()
        if lowered.isdigit() or lowered in _CONCEPT_STOP:
            continue
        picked.setdefault(lowered, token)
        if len(picked) >= limit:
            break
    return list(picked.values())[:limit]
809
-
810
-
811
- class KnowledgeGraphStore:
812
def __init__(self, db_path: Path, blob_dir: Path, embedder: Any = None):
    """Open (creating if needed) the graph database and blob directory.

    Args:
        db_path: SQLite database file; its parent directory is created.
        blob_dir: Directory for blobs; created if missing.
        embedder: Optional embedding model. When omitted, a
            ``LocalEmbeddingModel`` is used so the store works unconfigured.
    """
    self.db_path = Path(db_path)
    self.blob_dir = Path(blob_dir)
    for directory in (self.db_path.parent, self.blob_dir):
        directory.mkdir(parents=True, exist_ok=True)

    # The embedder is swappable behind a fixed interface
    # (model_id/dim/embed/encode/decode/similarity).
    if embedder is None:
        embedder = LocalEmbeddingModel()
    self._embedding_model = embedder

    self._init_db()

    # Graph reads go through the kgv2_* views when the v2 store is available;
    # the flag can be switched off to compare against the legacy tables.
    self._read_from_v2 = KGStoreV2 is not None and _READ_FROM_V2_DEFAULT
826
-
827
- def _read_tables(self) -> tuple:
828
- """Return (nodes_table, edges_table) for read queries.
829
-
830
- Same read code runs against the legacy tables or the v2 reconstruction
831
- views, so the two paths are equivalent by construction.
832
- """
833
- if self._read_from_v2:
834
- return ("kgv2_nodes", "kgv2_edges")
835
- return ("nodes", "edges")
836
-
837
- def _connect(self) -> sqlite3.Connection:
838
- conn = sqlite3.connect(str(self.db_path))
839
- conn.row_factory = sqlite3.Row
840
- conn.execute("PRAGMA journal_mode=WAL")
841
- conn.execute("PRAGMA foreign_keys=ON")
842
- return conn
843
-
844
def _init_db(self) -> None:
    """Create the legacy schema if missing, then initialize the v2 projection.

    Every statement uses ``IF NOT EXISTS``, so re-running against an existing
    database leaves existing tables and indexes in place. The schema version
    is (re)stamped in ``graph_meta`` on each call.
    """
    with self._connect() as conn:
        conn.executescript(
            """
            CREATE TABLE IF NOT EXISTS graph_meta (
                key TEXT PRIMARY KEY,
                value TEXT NOT NULL
            );
            CREATE TABLE IF NOT EXISTS nodes (
                id TEXT PRIMARY KEY,
                type TEXT NOT NULL,
                title TEXT NOT NULL,
                summary TEXT,
                metadata_json TEXT NOT NULL CHECK (json_valid(metadata_json)),
                raw_json TEXT NOT NULL CHECK (json_valid(raw_json)),
                created_at TEXT NOT NULL,
                updated_at TEXT NOT NULL
            );
            CREATE TABLE IF NOT EXISTS edges (
                id TEXT PRIMARY KEY,
                from_node TEXT NOT NULL,
                to_node TEXT NOT NULL,
                type TEXT NOT NULL,
                weight REAL NOT NULL DEFAULT 1.0,
                metadata_json TEXT NOT NULL CHECK (json_valid(metadata_json)),
                created_at TEXT NOT NULL,
                UNIQUE(from_node, to_node, type),
                FOREIGN KEY(from_node) REFERENCES nodes(id) ON DELETE CASCADE,
                FOREIGN KEY(to_node) REFERENCES nodes(id) ON DELETE CASCADE
            );
            CREATE TABLE IF NOT EXISTS chunks (
                id TEXT PRIMARY KEY,
                source_node TEXT NOT NULL,
                text TEXT NOT NULL,
                metadata_json TEXT NOT NULL CHECK (json_valid(metadata_json)),
                created_at TEXT NOT NULL,
                FOREIGN KEY(source_node) REFERENCES nodes(id) ON DELETE CASCADE
            );
            CREATE TABLE IF NOT EXISTS knowledge_sources (
                id TEXT PRIMARY KEY,
                root_path TEXT NOT NULL UNIQUE,
                os_type TEXT NOT NULL,
                drive_id TEXT,
                label TEXT,
                status TEXT NOT NULL,
                include_ocr INTEGER NOT NULL DEFAULT 0,
                watch_enabled INTEGER NOT NULL DEFAULT 0,
                consent_json TEXT NOT NULL CHECK (json_valid(consent_json)),
                created_at TEXT NOT NULL,
                updated_at TEXT NOT NULL,
                last_scanned_at TEXT
            );
            CREATE TABLE IF NOT EXISTS local_file_index (
                id TEXT PRIMARY KEY,
                source_id TEXT NOT NULL,
                os_type TEXT NOT NULL,
                drive_id TEXT,
                root_path TEXT NOT NULL,
                file_path TEXT NOT NULL,
                relative_path TEXT NOT NULL,
                file_name TEXT NOT NULL,
                extension TEXT NOT NULL,
                size_bytes INTEGER,
                modified_at TEXT,
                sha256 TEXT,
                last_scanned_at TEXT,
                last_indexed_at TEXT,
                parser_type TEXT,
                status TEXT NOT NULL,
                error_message TEXT,
                graph_node_id TEXT,
                deleted INTEGER NOT NULL DEFAULT 0,
                metadata_json TEXT NOT NULL CHECK (json_valid(metadata_json)),
                UNIQUE(source_id, relative_path),
                FOREIGN KEY(source_id) REFERENCES knowledge_sources(id) ON DELETE CASCADE
            );
            CREATE TABLE IF NOT EXISTS vector_embeddings (
                item_id TEXT PRIMARY KEY,
                item_type TEXT NOT NULL,
                source_node TEXT NOT NULL,
                text_hash TEXT NOT NULL,
                embedding BLOB NOT NULL,
                embedding_dim INTEGER NOT NULL,
                embedding_model TEXT NOT NULL,
                metadata_json TEXT NOT NULL CHECK (json_valid(metadata_json)),
                indexed_at TEXT NOT NULL,
                FOREIGN KEY(source_node) REFERENCES nodes(id) ON DELETE CASCADE
            );
            CREATE TABLE IF NOT EXISTS vector_index_operations (
                id TEXT PRIMARY KEY,
                operation TEXT NOT NULL,
                status TEXT NOT NULL,
                requested_at TEXT NOT NULL,
                started_at TEXT,
                completed_at TEXT,
                items_total INTEGER NOT NULL DEFAULT 0,
                items_indexed INTEGER NOT NULL DEFAULT 0,
                items_skipped INTEGER NOT NULL DEFAULT 0,
                error_message TEXT,
                metadata_json TEXT NOT NULL CHECK (json_valid(metadata_json))
            );
            -- v3.6.0 Knowledge Graph First: per-ingestion provenance trail.
            -- Append-only audit of where every graph node came from, when it
            -- was captured, how it was processed, and whether it was embedded /
            -- linked / used by an agent. get_provenance() returns the latest row.
            CREATE TABLE IF NOT EXISTS ingestion_provenance (
                id TEXT PRIMARY KEY,
                node_id TEXT NOT NULL,
                source_type TEXT NOT NULL,
                source_uri TEXT,
                content_hash TEXT,
                title TEXT,
                pipeline TEXT NOT NULL,
                owner TEXT,
                workspace_id TEXT,
                captured_at TEXT,
                modified_at TEXT,
                embedded INTEGER NOT NULL DEFAULT 0,
                linked INTEGER NOT NULL DEFAULT 0,
                duplicate INTEGER NOT NULL DEFAULT 0,
                agent_used TEXT,
                chunk_count INTEGER NOT NULL DEFAULT 0,
                permissions_json TEXT NOT NULL DEFAULT '{}' CHECK (json_valid(permissions_json)),
                metadata_json TEXT NOT NULL DEFAULT '{}' CHECK (json_valid(metadata_json)),
                created_at TEXT NOT NULL
            );
            CREATE INDEX IF NOT EXISTS idx_nodes_type ON nodes(type);
            CREATE INDEX IF NOT EXISTS idx_edges_from ON edges(from_node);
            CREATE INDEX IF NOT EXISTS idx_edges_to ON edges(to_node);
            CREATE INDEX IF NOT EXISTS idx_chunks_source ON chunks(source_node);
            CREATE INDEX IF NOT EXISTS idx_knowledge_sources_root ON knowledge_sources(root_path);
            CREATE INDEX IF NOT EXISTS idx_local_file_index_source ON local_file_index(source_id);
            CREATE INDEX IF NOT EXISTS idx_local_file_index_status ON local_file_index(status);
            CREATE INDEX IF NOT EXISTS idx_local_file_index_graph_node ON local_file_index(graph_node_id);
            CREATE INDEX IF NOT EXISTS idx_vector_embeddings_type ON vector_embeddings(item_type);
            CREATE INDEX IF NOT EXISTS idx_vector_embeddings_source ON vector_embeddings(source_node);
            CREATE INDEX IF NOT EXISTS idx_vector_embeddings_model ON vector_embeddings(embedding_model);
            CREATE INDEX IF NOT EXISTS idx_vector_index_operations_requested ON vector_index_operations(requested_at);
            CREATE INDEX IF NOT EXISTS idx_provenance_node ON ingestion_provenance(node_id);
            CREATE INDEX IF NOT EXISTS idx_provenance_source_type ON ingestion_provenance(source_type);
            CREATE INDEX IF NOT EXISTS idx_provenance_hash ON ingestion_provenance(content_hash);
            CREATE INDEX IF NOT EXISTS idx_provenance_created ON ingestion_provenance(created_at);
            """
        )
        # Record the schema version this code expects.
        conn.execute(
            "INSERT OR REPLACE INTO graph_meta(key, value) VALUES (?, ?)",
            ("schema_version", str(GRAPH_SCHEMA_VERSION)),
        )
    # NOTE(review): placed after the `with` block on the assumption that the
    # v2 init, which opens its own connection, must run once the write above
    # has been committed — the source indentation was not recoverable; confirm.
    self._init_v2_schema()
993
-
994
# SQL views that reconstruct the *exact* legacy row shape on top of the
# normalized v2 tables, so the read methods run unchanged against either
# source. The projection stores the raw legacy type string in ``legacy_type``
# and promotes summary + metadata to first-class columns (no more
# ``attrs._kg`` passthrough / ``evidence`` abuse), so these views are
# byte-faithful to the legacy nodes/edges tables.
# Column mapping: nodes_v2.label → title, nodes_v2.attrs → metadata_json,
# edges_v2.source/target → from_node/to_node, edges_v2.metadata →
# metadata_json; ``type`` falls back to the normalized type when
# ``legacy_type`` is NULL.
_V2_VIEWS_SQL = """
CREATE VIEW IF NOT EXISTS kgv2_nodes AS
SELECT id,
COALESCE(legacy_type, type) AS type,
label AS title,
summary,
attrs AS metadata_json,
created_at, updated_at
FROM nodes_v2;
CREATE VIEW IF NOT EXISTS kgv2_edges AS
SELECT id, source AS from_node, target AS to_node,
COALESCE(legacy_type, type) AS type,
weight,
metadata AS metadata_json,
created_at
FROM edges_v2;
"""
1017
-
1018
def _init_v2_schema(self) -> None:
    """Create the normalized v2 tables and reconstruction views.

    When the stored projection version differs from ``_PROJECTION_VERSION``
    the v2 tables and views are dropped first and the backfill is forced.
    Drop, create, views, backfill and the version stamp all run on one
    connection after an explicit ``BEGIN``. Only v2 tables/views and the
    ``projection_version`` key are written here; the legacy ``nodes`` and
    ``edges`` tables are not. Any exception is logged as a warning and
    swallowed. A no-op when the v2 store or script helper is unavailable.
    """
    if KGStoreV2 is None or _exec_script is None:
        return

    # Views are dropped before the tables they select from.
    drop_projection = (
        "DROP VIEW IF EXISTS kgv2_edges",
        "DROP VIEW IF EXISTS kgv2_nodes",
        "DROP TABLE IF EXISTS edges_v2",
        "DROP TABLE IF EXISTS nodes_v2",
    )
    try:
        with self._connect() as conn:
            conn.execute("BEGIN")
            needs_rebuild = self._projection_version(conn) != _PROJECTION_VERSION
            if needs_rebuild:
                # The projection is derived data, so it can be recreated
                # with the current column layout.
                for statement in drop_projection:
                    conn.execute(statement)
            # Passing conn makes init_schema work on this same connection.
            KGStoreV2(self.db_path).init_schema(conn=conn)
            _exec_script(conn, self._V2_VIEWS_SQL)
            self._backfill_v2_on(conn, force=needs_rebuild)
            # Stamp the version last, alongside the backfill.
            conn.execute(
                "INSERT OR REPLACE INTO kg_meta(key, value) VALUES ('projection_version', ?)",
                (str(_PROJECTION_VERSION),),
            )
    except Exception as exc:
        logging.warning("knowledge_graph: v2 schema init/backfill skipped: %s", exc)
1057
-
1058
- def _projection_version(self, conn: sqlite3.Connection) -> int:
1059
- """Return the stored v2 projection layout version (0 if unknown).
1060
-
1061
- A fresh DB (kg_meta absent) raises ``sqlite3.OperationalError`` here and
1062
- is correctly treated as version 0 → rebuild. Only sqlite errors are
1063
- swallowed so a real bug doesn't masquerade as a stale projection.
1064
- """
1065
- try:
1066
- row = conn.execute(
1067
- "SELECT value FROM kg_meta WHERE key='projection_version'"
1068
- ).fetchone()
1069
- return int(row["value"]) if row and row["value"] is not None else 0
1070
- except sqlite3.Error:
1071
- return 0
1072
-
1073
- def _backfill_v2_if_needed(self, *, force: bool = False) -> None:
1074
- """Project legacy nodes/edges into v2 on a fresh transaction.
1075
-
1076
- Thin wrapper around :meth:`_backfill_v2_on` for callers (tests, ad-hoc
1077
- re-sync) that aren't already inside the migration transaction.
1078
- """
1079
- try:
1080
- with self._connect() as conn:
1081
- self._backfill_v2_on(conn, force=force)
1082
- except Exception as ex:
1083
- logging.warning("knowledge_graph: v2 backfill skipped: %s", ex)
1084
-
1085
- def _backfill_v2_on(self, conn: sqlite3.Connection, *, force: bool = False) -> None:
1086
- """Project legacy nodes/edges into the normalized v2 tables on ``conn``.
1087
-
1088
- Non-destructive to legacy. ``force`` rebuilds unconditionally (used after
1089
- a layout migration); otherwise it only projects when v2 is empty. The v2
1090
- graph is a derived projection, so clearing + rebuilding it is always safe.
1091
- Idempotent: no-ops once v2 carries the current projection. Copies the
1092
- legacy column values **verbatim** so the kgv2_* views are byte-faithful.
1093
- """
1094
- legacy_nodes = conn.execute("SELECT COUNT(*) FROM nodes").fetchone()[0]
1095
- if legacy_nodes == 0:
1096
- return
1097
- v2_nodes = conn.execute("SELECT COUNT(*) FROM nodes_v2").fetchone()[0]
1098
- if v2_nodes > 0 and not force:
1099
- return # current projection already present
1100
- # (re)project: clear v2 graph (not authoritative) and rebuild
1101
- conn.execute("DELETE FROM edges_v2")
1102
- conn.execute("DELETE FROM nodes_v2")
1103
- n = e = 0
1104
- for r in conn.execute(
1105
- "SELECT id, type, title, summary, metadata_json, created_at, updated_at FROM nodes"
1106
- ).fetchall():
1107
- self._v2_project_node(
1108
- conn, r["id"], r["type"], r["title"], r["summary"], r["metadata_json"],
1109
- created_at=r["created_at"], updated_at=r["updated_at"],
1110
- )
1111
- n += 1
1112
- for r in conn.execute(
1113
- "SELECT id, from_node, to_node, type, weight, metadata_json, created_at FROM edges"
1114
- ).fetchall():
1115
- self._v2_project_edge(
1116
- conn, r["from_node"], r["to_node"], r["type"], float(r["weight"] or 1.0),
1117
- r["metadata_json"], edge_id=r["id"], created_at=r["created_at"],
1118
- )
1119
- e += 1
1120
- logging.info("knowledge_graph: projected legacy → v2 (%d nodes, %d edges)", n, e)
1121
-
1122
- # ── v2 dual-write projection (normalized type, byte-faithful legacy values) ──
1123
- # The projection stores the legacy ``title``/``summary``/``metadata_json``
1124
- # values it is handed VERBATIM (no truncation or JSON re-encoding) so the
1125
- # kgv2_* views reproduce the legacy rows exactly. Callers (_upsert_* and the
1126
- # backfill) pass the already-canonical legacy column values.
1127
def _v2_project_node(
    self, conn: sqlite3.Connection, node_id: str, node_type: str, title: str,
    summary: Optional[str], metadata_json: Optional[str],
    *, created_at: Optional[str] = None, updated_at: Optional[str] = None,
) -> None:
    """Upsert one legacy node into ``nodes_v2`` on ``conn``.

    ``title``, ``summary`` and ``metadata_json`` are stored as given
    (``metadata_json`` defaults to "{}" when ``None``). The raw legacy type is
    kept in ``legacy_type`` next to the normalized type. Database errors are
    logged at debug level and swallowed. A no-op without the v2 store.
    """
    if KGStoreV2 is None:
        return

    stamp = updated_at or _now()
    if NodeType is not None:
        normalized = NodeType.from_legacy(node_type).value
    else:
        normalized = node_type
    params = (
        node_id, normalized, node_type, title, summary,
        "{}" if metadata_json is None else metadata_json,
        created_at or stamp, stamp,
    )
    try:
        conn.execute(
            """
            INSERT INTO nodes_v2(id, type, legacy_type, label, summary, attrs,
            owner_id, visibility, created_at, updated_at,
            importance_score)
            VALUES (?, ?, ?, ?, ?, ?, NULL, 'private', ?, ?, 0.0)
            ON CONFLICT(id) DO UPDATE SET
            type=excluded.type, legacy_type=excluded.legacy_type,
            label=excluded.label, summary=excluded.summary,
            attrs=excluded.attrs, updated_at=excluded.updated_at
            """,
            params,
        )
    except Exception as exc:
        logging.debug("knowledge_graph: v2 node projection skipped (%s): %s", node_id, exc)
1154
-
1155
def _v2_project_edge(
    self, conn: sqlite3.Connection, from_node: str, to_node: str, edge_type: str,
    weight: float, metadata_json: Optional[str],
    *, edge_id: Optional[str] = None, created_at: Optional[str] = None,
) -> None:
    """Upsert one legacy edge into ``edges_v2`` on ``conn``.

    ``metadata_json`` is stored as given ("{}" when ``None``) and the raw
    legacy type is kept in ``legacy_type`` next to the normalized type. On a
    (source, target, legacy_type) conflict the larger weight is kept and
    type, confidence and metadata are refreshed. This projection is
    best-effort: failures are logged at debug level and never raised, so they
    cannot abort the legacy write that triggered them.

    Args:
        edge_id: Explicit id; derived from a hash of the endpoints and type
            when omitted.
        created_at: Creation timestamp; defaults to the current time.
    """
    if KGStoreV2 is None:
        return
    eid = edge_id or f"edge:{_sha256_text(f'{from_node}|{edge_type}|{to_node}')[:24]}"
    norm_type = EdgeType.from_legacy(edge_type).value if EdgeType is not None else edge_type
    meta_str = metadata_json if metadata_json is not None else "{}"
    # A malformed or non-numeric "confidence" in the metadata must not escape
    # this best-effort path; fall back to the default instead.
    try:
        confidence = float(_safe_loads(meta_str).get("confidence", 1.0))
    except (AttributeError, TypeError, ValueError):
        confidence = 1.0
    try:
        conn.execute(
            """
            INSERT INTO edges_v2(id, source, target, type, legacy_type, weight,
            confidence, evidence, metadata, created_by, created_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, '[]', ?, 'legacy', ?)
            ON CONFLICT(source, target, legacy_type) DO UPDATE SET
            type=excluded.type,
            weight=max(edges_v2.weight, excluded.weight),
            confidence=excluded.confidence,
            metadata=excluded.metadata
            """,
            (eid, from_node, to_node, norm_type, edge_type, float(weight),
             confidence, meta_str, created_at or _now()),
        )
    except Exception as ex:
        logging.debug("knowledge_graph: v2 edge projection skipped (%s->%s): %s", from_node, to_node, ex)
1183
-
1184
def _v2_delete_nodes(self, conn: sqlite3.Connection, ids) -> None:
    """Delete the given node ids from ``nodes_v2`` to mirror a legacy delete.

    ``ids`` may be any iterable and is materialized once. Nothing runs when
    it is empty or the v2 store is unavailable. Database errors are logged at
    debug level and swallowed.
    """
    if KGStoreV2 is None:
        return
    targets = list(ids)
    if not targets:
        return
    placeholders = ",".join(["?"] * len(targets))
    try:
        conn.execute(f"DELETE FROM nodes_v2 WHERE id IN ({placeholders})", targets)
    except Exception as exc:
        logging.debug("knowledge_graph: v2 node delete mirror skipped: %s", exc)
1196
-
1197
def _v2_delete_edges_from(self, conn: sqlite3.Connection, node_id: str) -> None:
    """Delete every ``edges_v2`` row whose source is ``node_id``.

    Mirrors a legacy ``DELETE FROM edges WHERE from_node=?``. Database errors
    are logged at debug level and swallowed; a no-op without the v2 store.
    """
    if KGStoreV2 is None:
        return
    statement = "DELETE FROM edges_v2 WHERE source=?"
    try:
        conn.execute(statement, (node_id,))
    except Exception as exc:
        logging.debug("knowledge_graph: v2 edge delete mirror skipped: %s", exc)
1205
-
1206
def _v2_sync_report(self) -> Dict[str, Any]:
    """Compare legacy and v2 node/edge id sets and report any drift.

    Returns counts for both sides plus sorted lists of ids missing from, or
    extra in, the v2 projection. ``in_sync`` is True only when the node id
    sets and the edge id sets match exactly. Without the v2 store the report
    is ``{"available": False, "in_sync": True}``.
    """
    if KGStoreV2 is None:
        return {"available": False, "in_sync": True}

    def _ids(connection: sqlite3.Connection, table: str) -> set:
        return {row[0] for row in connection.execute(f"SELECT id FROM {table}")}

    with self._connect() as conn:
        legacy_nodes = _ids(conn, "nodes")
        v2_nodes = _ids(conn, "nodes_v2")
        legacy_edges = _ids(conn, "edges")
        v2_edges = _ids(conn, "edges_v2")

    nodes_match = legacy_nodes == v2_nodes
    edges_match = legacy_edges == v2_edges
    return {
        "available": True,
        "in_sync": nodes_match and edges_match,
        "nodes_legacy": len(legacy_nodes),
        "nodes_v2": len(v2_nodes),
        "edges_legacy": len(legacy_edges),
        "edges_v2": len(v2_edges),
        "nodes_missing_from_v2": sorted(legacy_nodes - v2_nodes),
        "nodes_extra_in_v2": sorted(v2_nodes - legacy_nodes),
        "edges_missing_from_v2": sorted(legacy_edges - v2_edges),
        "edges_extra_in_v2": sorted(v2_edges - legacy_edges),
    }
1234
-
1235
def _upsert_node(
    self,
    conn: sqlite3.Connection,
    node_id: str,
    node_type: str,
    title: str,
    summary: str = "",
    metadata: Optional[Dict[str, Any]] = None,
    raw: Optional[Dict[str, Any]] = None,
) -> str:
    """Insert or update a legacy node, mirror it into v2, and index its text.

    The title is capped at 240 characters and the summary at 1000. On an id
    conflict the title, summary, metadata, raw payload and ``updated_at`` are
    replaced; the stored ``type`` and ``created_at`` are left as they were.
    Nodes of type "Chunk" get no vector item.

    Returns:
        ``node_id``, unchanged.
    """
    stamp = _now()
    # Stored strings are computed once and handed to the v2 projection too,
    # so both sides hold identical values.
    stored_title = title[:240]
    stored_summary = summary[:1000]
    stored_meta = _json(metadata)
    stored_raw = _json(raw)

    conn.execute(
        """
        INSERT INTO nodes(id, type, title, summary, metadata_json, raw_json, created_at, updated_at)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
        ON CONFLICT(id) DO UPDATE SET
        title=excluded.title,
        summary=excluded.summary,
        metadata_json=excluded.metadata_json,
        raw_json=excluded.raw_json,
        updated_at=excluded.updated_at
        """,
        (node_id, node_type, stored_title, stored_summary, stored_meta, stored_raw, stamp, stamp),
    )

    # Dual-write into the v2 graph on the same connection.
    self._v2_project_node(
        conn, node_id, node_type, stored_title, stored_summary, stored_meta,
        created_at=stamp, updated_at=stamp,
    )

    if node_type == "Chunk":
        return node_id

    vector_text = self._vector_text_for_node(
        title=stored_title, summary=stored_summary, metadata=metadata,
    )
    self._upsert_vector_item(
        conn,
        item_id=node_id,
        item_type="node",
        source_node=node_id,
        text=vector_text,
        metadata={"node_type": node_type, **(metadata or {})},
    )
    return node_id
1277
-
1278
- def _upsert_edge(
1279
- self,
1280
- conn: sqlite3.Connection,
1281
- from_node: str,
1282
- to_node: str,
1283
- edge_type: str,
1284
- weight: float = 1.0,
1285
- metadata: Optional[Dict[str, Any]] = None,
1286
- ) -> str:
1287
- edge_id = f"edge:{_sha256_text(f'{from_node}|{edge_type}|{to_node}')[:24]}"
1288
- now = _now()
1289
- meta_json = _json(metadata) # canonical string shared with the projection
1290
- conn.execute(
1291
- """
1292
- INSERT INTO edges(id, from_node, to_node, type, weight, metadata_json, created_at)
1293
- VALUES (?, ?, ?, ?, ?, ?, ?)
1294
- ON CONFLICT(from_node, to_node, type) DO UPDATE SET
1295
- weight=max(edges.weight, excluded.weight),
1296
- metadata_json=excluded.metadata_json
1297
- """,
1298
- (edge_id, from_node, to_node, edge_type, float(weight), meta_json, now),
1299
- )
1300
- # dual-write: project into the v2 graph on the same transaction
1301
- self._v2_project_edge(conn, from_node, to_node, edge_type, float(weight), meta_json,
1302
- edge_id=edge_id, created_at=now)
1303
- return edge_id
1304
-
1305
- def _vector_text_for_node(
1306
- self,
1307
- *,
1308
- title: str,
1309
- summary: str = "",
1310
- metadata: Optional[Dict[str, Any]] = None,
1311
- ) -> str:
1312
- metadata = metadata or {}
1313
- meta_parts = []
1314
- for key in (
1315
- "filename", "relative_path", "file_path", "conversation_id", "source",
1316
- "category", "ext", "role",
1317
- ):
1318
- value = metadata.get(key)
1319
- if value:
1320
- meta_parts.append(str(value))
1321
- return _clean_text("\n".join([str(title or ""), str(summary or ""), " ".join(meta_parts)]))
1322
-
1323
- def _upsert_vector_item(
1324
- self,
1325
- conn: sqlite3.Connection,
1326
- *,
1327
- item_id: str,
1328
- item_type: str,
1329
- source_node: str,
1330
- text: str,
1331
- metadata: Optional[Dict[str, Any]] = None,
1332
- ) -> bool:
1333
- text = _clean_text(text)
1334
- if len(text) < 2:
1335
- conn.execute("DELETE FROM vector_embeddings WHERE item_id=?", (item_id,))
1336
- return False
1337
- text_hash = _sha256_text(text)
1338
- existing = conn.execute(
1339
- """
1340
- SELECT text_hash, embedding_dim, embedding_model
1341
- FROM vector_embeddings
1342
- WHERE item_id=?
1343
- """,
1344
- (item_id,),
1345
- ).fetchone()
1346
- if (
1347
- existing
1348
- and existing["text_hash"] == text_hash
1349
- and existing["embedding_dim"] == self._embedding_model.dim
1350
- and existing["embedding_model"] == self._embedding_model.model_id
1351
- ):
1352
- return False
1353
- embedding = self._embedding_model.encode(self._embedding_model.embed(text[:50_000]))
1354
- conn.execute(
1355
- """
1356
- INSERT INTO vector_embeddings(
1357
- item_id, item_type, source_node, text_hash, embedding,
1358
- embedding_dim, embedding_model, metadata_json, indexed_at
1359
- )
1360
- VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
1361
- ON CONFLICT(item_id) DO UPDATE SET
1362
- item_type=excluded.item_type,
1363
- source_node=excluded.source_node,
1364
- text_hash=excluded.text_hash,
1365
- embedding=excluded.embedding,
1366
- embedding_dim=excluded.embedding_dim,
1367
- embedding_model=excluded.embedding_model,
1368
- metadata_json=excluded.metadata_json,
1369
- indexed_at=excluded.indexed_at
1370
- """,
1371
- (
1372
- item_id,
1373
- item_type,
1374
- source_node,
1375
- text_hash,
1376
- embedding,
1377
- self._embedding_model.dim,
1378
- self._embedding_model.model_id,
1379
- _json(metadata),
1380
- _now(),
1381
- ),
1382
- )
1383
- return True
1384
-
1385
- def _upsert_chunk(
1386
- self,
1387
- conn: sqlite3.Connection,
1388
- *,
1389
- chunk_id: str,
1390
- source_node: str,
1391
- text: str,
1392
- metadata: Optional[Dict[str, Any]] = None,
1393
- ) -> None:
1394
- metadata = metadata or {}
1395
- conn.execute(
1396
- "INSERT OR REPLACE INTO chunks(id, source_node, text, metadata_json, created_at) "
1397
- "VALUES (?, ?, ?, ?, ?)",
1398
- (chunk_id, source_node, text, _json(metadata), _now()),
1399
- )
1400
- self._upsert_vector_item(
1401
- conn,
1402
- item_id=chunk_id,
1403
- item_type="chunk",
1404
- source_node=chunk_id,
1405
- text=text,
1406
- metadata={**metadata, "parent_source_node": source_node},
1407
- )
1408
-
1409
- # ── Local folder sources → Graph RAG ──────────────────────────────────
1410
-
1411
- def discover_local_roots(self) -> Dict[str, Any]:
1412
- """Return safe, cross-platform starting points for structure browsing."""
1413
- os_type = _current_os_type()
1414
- home = Path.home().expanduser()
1415
- roots: List[Dict[str, Any]] = []
1416
- seen: set = set()
1417
-
1418
- def add(label: str, path: Path, kind: str, *, recommended: bool = True, warning: Optional[str] = None) -> None:
1419
- try:
1420
- resolved = path.expanduser().resolve()
1421
- except OSError:
1422
- resolved = path.expanduser()
1423
- key = str(resolved)
1424
- if key in seen or not resolved.exists():
1425
- return
1426
- seen.add(key)
1427
- roots.append({
1428
- "id": f"{kind}:{_path_fingerprint(resolved)}",
1429
- "label": label,
1430
- "path": key,
1431
- "kind": kind,
1432
- "recommended": recommended,
1433
- "warning": warning or _root_warning(resolved, os_type),
1434
- })
1435
-
1436
- add("홈", home, "home", warning=_root_warning(home, os_type))
1437
- for name, label in (
1438
- ("Documents", "문서"),
1439
- ("Desktop", "데스크탑"),
1440
- ("Downloads", "다운로드"),
1441
- ("Pictures", "사진"),
1442
- ("Projects", "프로젝트"),
1443
- ):
1444
- add(label, home / name, name.lower())
1445
-
1446
- if os_type == "macos":
1447
- volumes = Path("/Volumes")
1448
- if volumes.exists():
1449
- try:
1450
- for volume in sorted(volumes.iterdir(), key=lambda p: p.name.lower()):
1451
- add(volume.name, volume, "volume", recommended=False)
1452
- except OSError:
1453
- pass
1454
- elif os_type == "windows":
1455
- for letter in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
1456
- drive = Path(f"{letter}:\\")
1457
- if drive.exists():
1458
- add(f"{letter}: 드라이브", drive, "drive", recommended=(letter != "C"))
1459
- for env_name, label in (("OneDrive", "OneDrive"), ("OneDriveCommercial", "OneDrive")):
1460
- raw = os.environ.get(env_name)
1461
- if raw:
1462
- add(label, Path(raw), "cloud", recommended=False)
1463
- elif os_type == "linux":
1464
- for base in (Path("/mnt"), Path("/media")):
1465
- add(str(base), base, "mounts", recommended=False)
1466
- try:
1467
- if base.exists():
1468
- for mounted in sorted(base.iterdir(), key=lambda p: p.name.lower()):
1469
- add(mounted.name, mounted, "volume", recommended=False)
1470
- except OSError:
1471
- pass
1472
-
1473
- return {
1474
- "os_type": os_type,
1475
- "computer": platform.node() or "local",
1476
- "roots": roots,
1477
- "privacy_notice": "처음에는 드라이브와 폴더 구조만 확인하며, 파일 내용은 사용자가 동의한 뒤에만 읽습니다.",
1478
- }
1479
-
1480
- def preview_local_tree(self, path: Path, *, max_items: int = 200) -> Dict[str, Any]:
1481
- """List one folder level using metadata only; file contents are not read."""
1482
- root = Path(path).expanduser().resolve()
1483
- if not root.exists():
1484
- raise ValueError(f"경로가 존재하지 않습니다: {path}")
1485
- if not root.is_dir():
1486
- raise ValueError(f"폴더가 아닙니다: {path}")
1487
-
1488
- os_type = _current_os_type()
1489
- max_items = max(1, min(int(max_items or 200), 1000))
1490
- items: List[Dict[str, Any]] = []
1491
- inaccessible = 0
1492
- try:
1493
- children = sorted(root.iterdir(), key=lambda p: (not p.is_dir(), p.name.lower()))
1494
- except PermissionError as exc:
1495
- return {
1496
- "path": str(root),
1497
- "items": [],
1498
- "error": f"접근 권한 없음: {exc}",
1499
- "privacy_notice": "현재 단계에서는 파일 내용을 읽지 않고, 폴더와 파일의 이름/크기/수정일만 확인합니다.",
1500
- }
1501
-
1502
- for child in children[:max_items]:
1503
- try:
1504
- is_dir = child.is_dir()
1505
- stat = child.stat()
1506
- reason = _excluded_directory_reason(child, root=root, os_type=os_type) if is_dir else _sensitive_file_reason(child, root=root)
1507
- items.append({
1508
- "name": child.name,
1509
- "path": str(child),
1510
- "type": "directory" if is_dir else "file",
1511
- "extension": "" if is_dir else child.suffix.lower(),
1512
- "size_bytes": None if is_dir else stat.st_size,
1513
- "modified_at": _safe_iso_from_stat_mtime(stat.st_mtime),
1514
- "hidden": _is_hidden_path(child, root),
1515
- "accessible": True,
1516
- "excluded_reason": reason,
1517
- })
1518
- except PermissionError:
1519
- inaccessible += 1
1520
- items.append({
1521
- "name": child.name,
1522
- "path": str(child),
1523
- "type": "unknown",
1524
- "accessible": False,
1525
- "excluded_reason": "permission_denied",
1526
- })
1527
- except OSError as exc:
1528
- inaccessible += 1
1529
- items.append({
1530
- "name": child.name,
1531
- "path": str(child),
1532
- "type": "unknown",
1533
- "accessible": False,
1534
- "excluded_reason": str(exc),
1535
- })
1536
-
1537
- return {
1538
- "path": str(root),
1539
- "os_type": os_type,
1540
- "items": items,
1541
- "truncated": len(children) > max_items,
1542
- "inaccessible": inaccessible,
1543
- "warning": _root_warning(root, os_type),
1544
- "privacy_notice": "현재 단계에서는 파일 내용을 읽지 않고, 폴더와 파일의 이름/크기/수정일만 확인합니다.",
1545
- }
1546
-
1547
- def _iter_local_scan_entries(self, root: Path, *, max_files: int) -> Iterable[Dict[str, Any]]:
1548
- os_type = _current_os_type()
1549
- stack = [root]
1550
- files_seen = 0
1551
- while stack:
1552
- current = stack.pop()
1553
- try:
1554
- children = sorted(current.iterdir(), key=lambda p: (not p.is_dir(), p.name.lower()))
1555
- except PermissionError as exc:
1556
- yield {"kind": "inaccessible_dir", "path": current, "reason": f"permission_denied: {exc}"}
1557
- continue
1558
- except OSError as exc:
1559
- yield {"kind": "inaccessible_dir", "path": current, "reason": str(exc)}
1560
- continue
1561
-
1562
- for child in children:
1563
- if child.is_symlink():
1564
- yield {"kind": "excluded", "path": child, "reason": "symlink"}
1565
- continue
1566
- try:
1567
- if child.is_dir():
1568
- reason = _excluded_directory_reason(child, root=root, os_type=os_type)
1569
- if reason:
1570
- yield {"kind": "excluded_dir", "path": child, "reason": reason}
1571
- else:
1572
- stack.append(child)
1573
- continue
1574
- if not child.is_file():
1575
- yield {"kind": "excluded", "path": child, "reason": "not_regular_file"}
1576
- continue
1577
- stat = child.stat()
1578
- except PermissionError as exc:
1579
- yield {"kind": "inaccessible_file", "path": child, "reason": f"permission_denied: {exc}"}
1580
- continue
1581
- except OSError as exc:
1582
- yield {"kind": "inaccessible_file", "path": child, "reason": str(exc)}
1583
- continue
1584
-
1585
- files_seen += 1
1586
- if files_seen > max_files:
1587
- yield {"kind": "limit_reached", "path": child, "reason": "max_files"}
1588
- return
1589
- yield {"kind": "file", "path": child, "stat": stat}
1590
-
1591
- def _local_file_decision(self, path: Path, root: Path, stat: os.stat_result) -> Dict[str, Any]:
1592
- ext = path.suffix.lower()
1593
- category = _file_category(ext)
1594
- parser_type = _parser_type_for_category(category, ext)
1595
- sensitive_reason = _sensitive_file_reason(path, root=root)
1596
- if sensitive_reason:
1597
- return {
1598
- "status": "sensitive_blocked",
1599
- "reason": sensitive_reason,
1600
- "category": category,
1601
- "parser_type": parser_type,
1602
- "indexable": False,
1603
- }
1604
- if category == "unsupported":
1605
- return {
1606
- "status": "unsupported",
1607
- "reason": "unsupported_extension",
1608
- "category": category,
1609
- "parser_type": parser_type,
1610
- "indexable": False,
1611
- }
1612
- limit = _size_limit_for_category(category)
1613
- if stat.st_size > limit:
1614
- return {
1615
- "status": "too_large",
1616
- "reason": f"size>{limit}",
1617
- "category": category,
1618
- "parser_type": parser_type,
1619
- "indexable": False,
1620
- }
1621
- return {
1622
- "status": "pending",
1623
- "reason": "",
1624
- "category": category,
1625
- "parser_type": parser_type,
1626
- "indexable": True,
1627
- }
1628
-
1629
- def audit_local_folder(self, path: Path, *, include_ocr: bool = False, max_files: int = 50_000) -> Dict[str, Any]:
1630
- """Safety-check a folder using metadata only; file bodies are not read."""
1631
- root = Path(path).expanduser().resolve()
1632
- if not root.exists():
1633
- raise ValueError(f"경로가 존재하지 않습니다: {path}")
1634
- if not root.is_dir():
1635
- raise ValueError(f"폴더가 아닙니다: {path}")
1636
-
1637
- os_type = _current_os_type()
1638
- max_files = max(1, min(int(max_files or 50_000), 200_000))
1639
- status_counts: Counter = Counter()
1640
- category_counts: Counter = Counter()
1641
- extension_counts: Counter = Counter()
1642
- allowed_samples: List[Dict[str, Any]] = []
1643
- excluded_samples: List[Dict[str, Any]] = []
1644
- total_files = 0
1645
- readable_files = 0
1646
- inaccessible = 0
1647
- excluded_dirs = 0
1648
- limit_reached = False
1649
-
1650
- for entry in self._iter_local_scan_entries(root, max_files=max_files):
1651
- kind = entry["kind"]
1652
- path_obj = entry["path"]
1653
- if kind == "limit_reached":
1654
- limit_reached = True
1655
- break
1656
- if kind == "excluded_dir":
1657
- excluded_dirs += 1
1658
- if len(excluded_samples) < 25:
1659
- excluded_samples.append(_sample_file(path_obj, root, "excluded", entry.get("reason", "")))
1660
- continue
1661
- if kind in {"inaccessible_dir", "inaccessible_file"}:
1662
- inaccessible += 1
1663
- status_counts["failed"] += 1
1664
- if len(excluded_samples) < 25:
1665
- excluded_samples.append(_sample_file(path_obj, root, "failed", entry.get("reason", "")))
1666
- continue
1667
- if kind == "excluded":
1668
- status_counts["excluded"] += 1
1669
- if len(excluded_samples) < 25:
1670
- excluded_samples.append(_sample_file(path_obj, root, "excluded", entry.get("reason", "")))
1671
- continue
1672
- if kind != "file":
1673
- continue
1674
-
1675
- total_files += 1
1676
- stat = entry["stat"]
1677
- decision = self._local_file_decision(path_obj, root, stat)
1678
- status = decision["status"]
1679
- category = decision["category"]
1680
- ext = path_obj.suffix.lower() or "(none)"
1681
- category_counts[category] += 1
1682
- extension_counts[ext] += 1
1683
- if decision["indexable"]:
1684
- readable_files += 1
1685
- status_counts["readable"] += 1
1686
- if len(allowed_samples) < 25:
1687
- allowed_samples.append(_sample_file(path_obj, root, "readable"))
1688
- else:
1689
- status_counts[status] += 1
1690
- if len(excluded_samples) < 25:
1691
- excluded_samples.append(_sample_file(path_obj, root, status, decision["reason"]))
1692
-
1693
- doc_weight = category_counts["pdf"] * 1.4 + category_counts["document"] * 0.9 + category_counts["slide_deck"] * 1.0
1694
- sheet_weight = category_counts["spreadsheet"] * 0.6
1695
- ocr_weight = category_counts["image"] * (1.8 if include_ocr else 0.1)
1696
- estimated_seconds = round(readable_files * 0.04 + doc_weight + sheet_weight + ocr_weight, 1)
1697
-
1698
- return {
1699
- "path": str(root),
1700
- "source_id": f"source:{_path_fingerprint(root)}",
1701
- "os_type": os_type,
1702
- "drive_id": _drive_id_for_path(root),
1703
- "warning": _root_warning(root, os_type),
1704
- "privacy_notice": "현재 단계에서는 파일 내용을 읽지 않고, 폴더와 파일의 이름/크기/수정일만 확인합니다.",
1705
- "include_ocr_requested": bool(include_ocr),
1706
- "summary": {
1707
- "total_files": total_files,
1708
- "readable_files": readable_files,
1709
- "excluded_files": int(
1710
- status_counts["excluded"]
1711
- + status_counts["sensitive_blocked"]
1712
- + status_counts["too_large"]
1713
- + status_counts["unsupported"]
1714
- ),
1715
- "sensitive_files": int(status_counts["sensitive_blocked"]),
1716
- "too_large_files": int(status_counts["too_large"]),
1717
- "unsupported_files": int(status_counts["unsupported"]),
1718
- "image_ocr_candidates": int(category_counts["image"]),
1719
- "inaccessible_items": inaccessible,
1720
- "excluded_dirs": excluded_dirs,
1721
- "estimated_seconds": estimated_seconds,
1722
- "storage_root": str(self.db_path.parent),
1723
- "limit_reached": limit_reached,
1724
- },
1725
- "by_status": dict(status_counts),
1726
- "by_category": dict(category_counts),
1727
- "by_extension": dict(extension_counts.most_common(40)),
1728
- "allowed_samples": allowed_samples,
1729
- "excluded_samples": excluded_samples,
1730
- "consent_required": {
1731
- "knowledge_source": True,
1732
- "image_ocr": bool(category_counts["image"]),
1733
- "watch": True,
1734
- "sensitive_files_default_excluded": True,
1735
- },
1736
- }
1737
-
1738
- def local_sources(self) -> Dict[str, Any]:
1739
- with self._connect() as conn:
1740
- sources = [
1741
- {
1742
- "id": row["id"],
1743
- "root_path": row["root_path"],
1744
- "os_type": row["os_type"],
1745
- "drive_id": row["drive_id"],
1746
- "label": row["label"],
1747
- "status": row["status"],
1748
- "include_ocr": bool(row["include_ocr"]),
1749
- "watch_enabled": bool(row["watch_enabled"]),
1750
- "consent": _safe_loads(row["consent_json"]),
1751
- "created_at": row["created_at"],
1752
- "updated_at": row["updated_at"],
1753
- "last_scanned_at": row["last_scanned_at"],
1754
- }
1755
- for row in conn.execute(
1756
- """
1757
- SELECT id, root_path, os_type, drive_id, label, status, include_ocr,
1758
- watch_enabled, consent_json, created_at, updated_at, last_scanned_at
1759
- FROM knowledge_sources
1760
- ORDER BY updated_at DESC, id ASC
1761
- """
1762
- )
1763
- ]
1764
- status_rows = conn.execute(
1765
- "SELECT source_id, status, COUNT(*) AS count FROM local_file_index GROUP BY source_id, status"
1766
- ).fetchall()
1767
- counts: Dict[str, Dict[str, int]] = {}
1768
- for row in status_rows:
1769
- counts.setdefault(row["source_id"], {})[row["status"]] = row["count"]
1770
- for source in sources:
1771
- source["file_status"] = counts.get(source["id"], {})
1772
- return {"sources": sources}
1773
-
1774
- def set_local_source_watch(self, source_id: str, enabled: bool) -> Dict[str, Any]:
1775
- source_id = str(source_id or "").strip()
1776
- if not source_id:
1777
- raise ValueError("source_id required")
1778
- with self._connect() as conn:
1779
- row = conn.execute(
1780
- "SELECT id FROM knowledge_sources WHERE id=?",
1781
- (source_id,),
1782
- ).fetchone()
1783
- if not row:
1784
- raise ValueError(f"knowledge source not found: {source_id}")
1785
- conn.execute(
1786
- "UPDATE knowledge_sources SET watch_enabled=?, updated_at=? WHERE id=?",
1787
- (1 if enabled else 0, _now(), source_id),
1788
- )
1789
- return {"source_id": source_id, "watch_enabled": bool(enabled)}
1790
-
1791
- def remove_local_source(self, source_id: str) -> Dict[str, Any]:
1792
- """Remove one approved local source and its derived graph projection.
1793
-
1794
- This is intentionally non-destructive for user files: only the LatticeAI
1795
- index rows, graph nodes, edges, and chunks derived from the source are
1796
- removed. The original folder and files are never touched.
1797
- """
1798
- source_id = str(source_id or "").strip()
1799
- if not source_id:
1800
- raise ValueError("source_id required")
1801
- with self._connect() as conn:
1802
- source = conn.execute(
1803
- "SELECT id, root_path FROM knowledge_sources WHERE id=?",
1804
- (source_id,),
1805
- ).fetchone()
1806
- if not source:
1807
- raise ValueError(f"knowledge source not found: {source_id}")
1808
- rows = conn.execute(
1809
- "SELECT graph_node_id FROM local_file_index WHERE source_id=? AND graph_node_id IS NOT NULL",
1810
- (source_id,),
1811
- ).fetchall()
1812
- graph_node_ids = [row["graph_node_id"] for row in rows if row["graph_node_id"]]
1813
- for graph_node_id in graph_node_ids:
1814
- self._delete_local_file_graph(conn, graph_node_id)
1815
- conn.execute("DELETE FROM local_file_index WHERE source_id=?", (source_id,))
1816
- conn.execute("DELETE FROM knowledge_sources WHERE id=?", (source_id,))
1817
- self._cleanup_local_graph_orphans(conn, source_id)
1818
- return {
1819
- "source_id": source_id,
1820
- "root_path": source["root_path"],
1821
- "removed_graph_nodes": len(graph_node_ids),
1822
- }
1823
-
1824
- def _extract_local_file_text(self, path: Path, category: str, *, include_ocr: bool) -> Tuple[str, Dict[str, Any]]:
1825
- ext = path.suffix.lower()
1826
- meta: Dict[str, Any] = {"parser": _parser_type_for_category(category, ext)}
1827
- text = ""
1828
- if category in {"text", "code"} or ext == ".csv":
1829
- text = path.read_text(encoding="utf-8", errors="replace")
1830
- elif ext == ".pdf":
1831
- import pdfplumber
1832
- with pdfplumber.open(str(path)) as pdf:
1833
- meta["pages"] = len(pdf.pages)
1834
- text = "\n\n".join((page.extract_text() or "") for page in pdf.pages)
1835
- elif ext == ".docx":
1836
- from docx import Document
1837
- doc = Document(str(path))
1838
- paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
1839
- table_lines = []
1840
- for table in doc.tables:
1841
- for row in table.rows:
1842
- cells = [_clean_text(cell.text) for cell in row.cells]
1843
- if any(cells):
1844
- table_lines.append("\t".join(cells))
1845
- meta["paragraphs"] = len(paragraphs)
1846
- meta["tables"] = len(doc.tables)
1847
- meta["table_rows"] = len(table_lines)
1848
- text = "\n\n".join([*paragraphs, *table_lines])
1849
- elif ext == ".xlsx":
1850
- from openpyxl import load_workbook
1851
- wb = load_workbook(str(path), read_only=True, data_only=True)
1852
- rows_all = []
1853
- non_empty_rows = 0
1854
- non_empty_cells = 0
1855
- char_count = 0
1856
- for ws in wb.worksheets:
1857
- sheet_rows = []
1858
- for row in ws.iter_rows(values_only=True):
1859
- cells = [str(cell).strip() if cell is not None else "" for cell in row]
1860
- if not any(cells):
1861
- continue
1862
- line = "\t".join(cells)
1863
- non_empty_rows += 1
1864
- non_empty_cells += sum(1 for cell in cells if cell)
1865
- sheet_rows.append(line)
1866
- char_count += len(line) + 1
1867
- if char_count > 200_000:
1868
- break
1869
- if sheet_rows:
1870
- rows_all.append(f"[Sheet: {ws.title}]")
1871
- rows_all.extend(sheet_rows)
1872
- if char_count > 200_000:
1873
- break
1874
- meta["sheets"] = len(wb.worksheets)
1875
- meta["rows"] = non_empty_rows
1876
- meta["cells"] = non_empty_cells
1877
- text = "\n".join(rows_all)
1878
- elif ext == ".pptx":
1879
- from pptx import Presentation
1880
- prs = Presentation(str(path))
1881
- slides_text = []
1882
- for index, slide in enumerate(prs.slides, 1):
1883
- parts = []
1884
- for shape in slide.shapes:
1885
- if getattr(shape, "has_text_frame", False):
1886
- slide_text = shape.text_frame.text.strip()
1887
- if slide_text:
1888
- parts.append(slide_text)
1889
- if parts:
1890
- slides_text.append(f"[Slide {index}]\n" + "\n".join(parts))
1891
- meta["slides"] = len(prs.slides)
1892
- meta["text_slides"] = len(slides_text)
1893
- text = "\n\n".join(slides_text)
1894
- elif category == "image":
1895
- from PIL import Image
1896
- with Image.open(str(path)) as image:
1897
- meta.update({
1898
- "width": image.width,
1899
- "height": image.height,
1900
- "format": image.format,
1901
- "mode": image.mode,
1902
- "ocr_enabled": bool(include_ocr),
1903
- })
1904
- if include_ocr:
1905
- try:
1906
- import pytesseract
1907
- text = pytesseract.image_to_string(image)
1908
- meta["ocr_chars"] = len(text)
1909
- except Exception as exc: # pragma: no cover - depends on local OCR runtime
1910
- meta["ocr_error"] = str(exc)
1911
- text = ""
1912
- return text[:200_000], meta
1913
-
1914
- def _ensure_local_hierarchy(
1915
- self,
1916
- conn: sqlite3.Connection,
1917
- *,
1918
- source_id: str,
1919
- root: Path,
1920
- file_path: Path,
1921
- os_type: str,
1922
- drive_id: str,
1923
- ) -> str:
1924
- computer_label = platform.node() or "내 컴퓨터"
1925
- computer_id = f"computer:{_slug(computer_label)}"
1926
- drive_node_id = f"drive:{_sha256_text(f'{os_type}:{drive_id}')[:24]}"
1927
- root_folder_id = f"folder:{_sha256_text(f'{source_id}:root')[:24]}"
1928
- self._upsert_node(conn, computer_id, "Computer", computer_label, metadata={"os_type": os_type})
1929
- self._upsert_node(conn, drive_node_id, "Drive", drive_id, metadata={"os_type": os_type, "drive_id": drive_id})
1930
- self._upsert_edge(conn, computer_id, drive_node_id, "포함함", metadata={"source": "local_scan"})
1931
- self._upsert_node(
1932
- conn,
1933
- root_folder_id,
1934
- "Folder",
1935
- root.name or str(root),
1936
- summary=str(root),
1937
- metadata={"source_id": source_id, "path": str(root), "root": True},
1938
- )
1939
- self._upsert_edge(conn, drive_node_id, root_folder_id, "포함함", metadata={"source": "local_scan"})
1940
-
1941
- try:
1942
- relative_parent = file_path.parent.relative_to(root)
1943
- except ValueError:
1944
- relative_parent = Path()
1945
- parent_id = root_folder_id
1946
- current_path = root
1947
- for part in relative_parent.parts:
1948
- current_path = current_path / part
1949
- folder_id = f"folder:{_sha256_text(f'{source_id}:{current_path.as_posix()}')[:24]}"
1950
- self._upsert_node(
1951
- conn,
1952
- folder_id,
1953
- "Folder",
1954
- part,
1955
- summary=str(current_path),
1956
- metadata={"source_id": source_id, "path": str(current_path), "root": False},
1957
- )
1958
- self._upsert_edge(conn, parent_id, folder_id, "포함함", metadata={"source": "local_scan"})
1959
- parent_id = folder_id
1960
- return parent_id
1961
-
1962
- def _upsert_local_file_index(
1963
- self,
1964
- conn: sqlite3.Connection,
1965
- *,
1966
- source_id: str,
1967
- root: Path,
1968
- file_path: Path,
1969
- stat: Optional[os.stat_result],
1970
- os_type: str,
1971
- drive_id: str,
1972
- status: str,
1973
- parser_type: str,
1974
- sha256: Optional[str] = None,
1975
- graph_node_id: Optional[str] = None,
1976
- error_message: Optional[str] = None,
1977
- metadata: Optional[Dict[str, Any]] = None,
1978
- ) -> str:
1979
- try:
1980
- relative_path = file_path.relative_to(root).as_posix()
1981
- except ValueError:
1982
- relative_path = file_path.name
1983
- index_id = f"local-index:{_sha256_text(f'{source_id}:{relative_path}')[:24]}"
1984
- now = _now()
1985
- size = stat.st_size if stat else None
1986
- modified_at = _safe_iso_from_stat_mtime(stat.st_mtime) if stat else ""
1987
- conn.execute(
1988
- """
1989
- INSERT INTO local_file_index(
1990
- id, source_id, os_type, drive_id, root_path, file_path, relative_path,
1991
- file_name, extension, size_bytes, modified_at, sha256, last_scanned_at,
1992
- last_indexed_at, parser_type, status, error_message, graph_node_id,
1993
- deleted, metadata_json
1994
- )
1995
- VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
1996
- ON CONFLICT(source_id, relative_path) DO UPDATE SET
1997
- os_type=excluded.os_type,
1998
- drive_id=excluded.drive_id,
1999
- root_path=excluded.root_path,
2000
- file_path=excluded.file_path,
2001
- file_name=excluded.file_name,
2002
- extension=excluded.extension,
2003
- size_bytes=excluded.size_bytes,
2004
- modified_at=excluded.modified_at,
2005
- sha256=excluded.sha256,
2006
- last_scanned_at=excluded.last_scanned_at,
2007
- last_indexed_at=excluded.last_indexed_at,
2008
- parser_type=excluded.parser_type,
2009
- status=excluded.status,
2010
- error_message=excluded.error_message,
2011
- graph_node_id=excluded.graph_node_id,
2012
- deleted=excluded.deleted,
2013
- metadata_json=excluded.metadata_json
2014
- """,
2015
- (
2016
- index_id, source_id, os_type, drive_id, str(root), str(file_path), relative_path,
2017
- file_path.name, file_path.suffix.lower(), size, modified_at, sha256, now,
2018
- now if status == "indexed" else None, parser_type, status, error_message,
2019
- graph_node_id, 0 if status != "deleted" else 1, _json(metadata),
2020
- ),
2021
- )
2022
- return index_id
2023
-
2024
- def _delete_local_file_graph(self, conn: sqlite3.Connection, file_node_id: Optional[str]) -> None:
2025
- if not file_node_id:
2026
- return
2027
-
2028
- file_row = conn.execute(
2029
- "SELECT metadata_json FROM nodes WHERE id=?",
2030
- (file_node_id,),
2031
- ).fetchone()
2032
- source_id = None
2033
- if file_row:
2034
- source_id = _safe_loads(file_row["metadata_json"]).get("source_id")
2035
-
2036
- linked_rows = conn.execute(
2037
- """
2038
- SELECT n.id, n.type, n.metadata_json
2039
- FROM edges e
2040
- JOIN nodes n ON n.id=e.to_node
2041
- WHERE e.from_node=?
2042
- """,
2043
- (file_node_id,),
2044
- ).fetchall()
2045
- owned_ids: set = set()
2046
- auto_candidate_ids: set = set()
2047
- for row in linked_rows:
2048
- metadata = _safe_loads(row["metadata_json"])
2049
- if row["type"] in {"Chunk", "ImageText", "Section"} or metadata.get("source_node") == file_node_id:
2050
- owned_ids.add(row["id"])
2051
- elif metadata.get("auto_extracted") and metadata.get("source") == "local_folder":
2052
- auto_candidate_ids.add(row["id"])
2053
-
2054
- conn.execute("DELETE FROM chunks WHERE source_node=?", (file_node_id,))
2055
- conn.execute("DELETE FROM edges WHERE from_node=? OR to_node=?", (file_node_id, file_node_id))
2056
- conn.execute("DELETE FROM nodes WHERE id=?", (file_node_id,))
2057
- self._v2_delete_nodes(conn, [file_node_id])
2058
-
2059
- def delete_nodes(node_ids: set) -> None:
2060
- if not node_ids:
2061
- return
2062
- placeholders = ",".join("?" * len(node_ids))
2063
- params = list(node_ids)
2064
- conn.execute(f"DELETE FROM chunks WHERE source_node IN ({placeholders})", params)
2065
- conn.execute(f"DELETE FROM edges WHERE from_node IN ({placeholders}) OR to_node IN ({placeholders})", params * 2)
2066
- conn.execute(f"DELETE FROM nodes WHERE id IN ({placeholders})", params)
2067
- self._v2_delete_nodes(conn, params)
2068
-
2069
- delete_nodes(owned_ids)
2070
-
2071
- removable_auto_ids: set = set()
2072
- for node_id in auto_candidate_ids:
2073
- remaining_edges = conn.execute(
2074
- "SELECT from_node, to_node FROM edges WHERE from_node=? OR to_node=?",
2075
- (node_id, node_id),
2076
- ).fetchall()
2077
- if all(
2078
- (row["from_node"] in auto_candidate_ids and row["to_node"] in auto_candidate_ids)
2079
- for row in remaining_edges
2080
- ):
2081
- removable_auto_ids.add(node_id)
2082
- delete_nodes(removable_auto_ids)
2083
- if source_id:
2084
- self._cleanup_local_graph_orphans(conn, str(source_id))
2085
-
2086
- def _cleanup_local_graph_orphans(self, conn: sqlite3.Connection, source_id: str) -> None:
2087
- while True:
2088
- folder_rows = conn.execute(
2089
- "SELECT id, metadata_json FROM nodes WHERE type='Folder'"
2090
- ).fetchall()
2091
- leaf_ids = []
2092
- for row in folder_rows:
2093
- metadata = _safe_loads(row["metadata_json"])
2094
- if metadata.get("source_id") != source_id:
2095
- continue
2096
- has_children = conn.execute(
2097
- "SELECT 1 FROM edges WHERE from_node=? LIMIT 1",
2098
- (row["id"],),
2099
- ).fetchone()
2100
- if not has_children:
2101
- leaf_ids.append(row["id"])
2102
- if not leaf_ids:
2103
- break
2104
- placeholders = ",".join("?" * len(leaf_ids))
2105
- conn.execute(f"DELETE FROM edges WHERE from_node IN ({placeholders}) OR to_node IN ({placeholders})", leaf_ids * 2)
2106
- conn.execute(f"DELETE FROM nodes WHERE id IN ({placeholders})", leaf_ids)
2107
- self._v2_delete_nodes(conn, leaf_ids)
2108
-
2109
- for node_type in ("Drive", "Computer"):
2110
- rows = conn.execute("SELECT id FROM nodes WHERE type=?", (node_type,)).fetchall()
2111
- removable = []
2112
- for row in rows:
2113
- has_children = conn.execute(
2114
- "SELECT 1 FROM edges WHERE from_node=? LIMIT 1",
2115
- (row["id"],),
2116
- ).fetchone()
2117
- if not has_children:
2118
- removable.append(row["id"])
2119
- if removable:
2120
- placeholders = ",".join("?" * len(removable))
2121
- conn.execute(f"DELETE FROM edges WHERE from_node IN ({placeholders}) OR to_node IN ({placeholders})", removable * 2)
2122
- conn.execute(f"DELETE FROM nodes WHERE id IN ({placeholders})", removable)
2123
- self._v2_delete_nodes(conn, removable)
2124
-
2125
- def _local_file_index_has_extracted_text(self, row: sqlite3.Row) -> bool:
2126
- metadata = _safe_loads(row["metadata_json"])
2127
- parser = metadata.get("parser") if isinstance(metadata, dict) else {}
2128
- if not isinstance(parser, dict):
2129
- return False
2130
- try:
2131
- return int(parser.get("extracted_chars") or 0) > 0
2132
- except (TypeError, ValueError):
2133
- return False
2134
-
2135
    def _upsert_local_file_node(
        self,
        conn: sqlite3.Connection,
        *,
        source_id: str,
        root: Path,
        file_path: Path,
        stat: os.stat_result,
        os_type: str,
        drive_id: str,
        sha256: str,
        category: str,
        parser_type: str,
        text: str,
        parser_meta: Dict[str, Any],
    ) -> str:
        """Create or rebuild the graph nodes and edges for one local file.

        The file node id is derived from a hash of ``source_id`` and the path
        relative to ``root``. Before rebuilding, the chunk rows for that node,
        its child nodes of type Chunk/ImageText/Section and all of its outgoing
        edges are deleted. The method then upserts the file node, links it to
        its parent folder, and adds an ImageText node (images only), Chunk
        nodes, auto-extracted concept nodes, concept-to-concept edges and
        semantic item nodes.

        Args:
            conn: Open connection used for every read and write.
            source_id: Id of the knowledge source the file belongs to.
            root: Root folder of the source; used to compute the relative path.
            file_path: Path of the file being indexed.
            stat: Stat result supplying size and modification time metadata.
            os_type: Passed through to ``_ensure_local_hierarchy``.
            drive_id: Passed through to ``_ensure_local_hierarchy``.
            sha256: Content digest stored in the node metadata.
            category: File category; selects the node type and the image branch.
            parser_type: Parser identifier stored in the node metadata.
            text: Extracted text; cleaned with ``_clean_text`` before use.
            parser_meta: Parser details stored under the ``parser`` metadata key.

        Returns:
            The id of the file node.

        Raises:
            ValueError: If the cleaned text is empty.
        """
        text = _clean_text(text)
        if not text:
            raise ValueError("텍스트 추출 결과가 비어 있습니다.")
        try:
            relative_path = file_path.relative_to(root).as_posix()
        except ValueError:
            # file_path is not located under root: fall back to the bare name.
            relative_path = file_path.name
        file_node_id = f"local-file:{_sha256_text(f'{source_id}:{relative_path}')[:24]}"
        parent_folder_id = self._ensure_local_hierarchy(
            conn,
            source_id=source_id,
            root=root,
            file_path=file_path,
            os_type=os_type,
            drive_id=drive_id,
        )
        # Collect the derived child nodes of a previous indexing run so they
        # can be removed before the file is rebuilt.
        child_rows = conn.execute(
            """
            SELECT e.to_node AS id
            FROM edges e
            JOIN nodes n ON n.id=e.to_node
            WHERE e.from_node=? AND n.type IN ('Chunk', 'ImageText', 'Section')
            """,
            (file_node_id,),
        ).fetchall()
        child_ids = [row["id"] for row in child_rows]
        conn.execute("DELETE FROM chunks WHERE source_node=?", (file_node_id,))
        if child_ids:
            placeholders = ",".join("?" * len(child_ids))
            conn.execute(f"DELETE FROM nodes WHERE id IN ({placeholders})", child_ids)
            self._v2_delete_nodes(conn, child_ids)
        # All outgoing edges of the file node are dropped, not only those that
        # lead to the child nodes removed above.
        conn.execute("DELETE FROM edges WHERE from_node=?", (file_node_id,))
        self._v2_delete_edges_from(conn, file_node_id)

        metadata = {
            "source": "local_folder",
            "source_id": source_id,
            "root_path": str(root),
            "file_path": str(file_path),
            "relative_path": relative_path,
            "filename": file_path.name,
            "ext": file_path.suffix.lower(),
            "category": category,
            "parser_type": parser_type,
            "bytes": stat.st_size,
            "modified_at": _safe_iso_from_stat_mtime(stat.st_mtime),
            "sha256": sha256,
            "parser": parser_meta,
        }
        # The same dict is stored both as metadata and as the raw payload.
        self._upsert_node(
            conn,
            file_node_id,
            _node_type_for_category(category),
            file_path.name,
            summary=text[:700],
            metadata=metadata,
            raw=metadata,
        )
        self._upsert_edge(conn, parent_folder_id, file_node_id, "포함함", weight=1.0, metadata={"source": "local_scan"})

        target_for_concepts = text
        # text is guaranteed non-empty here (checked at the top), so this
        # branch effectively depends on the category alone.
        if category == "image" and text:
            image_text_id = f"imagetext:{_sha256_text(f'{file_node_id}:ocr')[:24]}"
            self._upsert_node(
                conn,
                image_text_id,
                "ImageText",
                f"{file_path.name} OCR",
                summary=_clean_text(text)[:700],
                metadata={"source_node": file_node_id, "source_id": source_id, "chars": len(text)},
            )
            self._upsert_edge(conn, file_node_id, image_text_id, "포함함", weight=0.8, metadata={"source": "ocr"})

        # One Chunk node plus one chunk row per text chunk; the chunk id hashes
        # the file node id, the chunk index and the chunk text.
        for index, chunk in enumerate(_chunks(text)):
            chunk_id = f"chunk:{_sha256_text(f'{file_node_id}:{index}:{chunk}')[:24]}"
            self._upsert_node(
                conn,
                chunk_id,
                "Chunk",
                f"{file_path.name} chunk {index + 1}",
                summary=chunk[:500],
                metadata={"index": index, "source_node": file_node_id, "source_id": source_id},
            )
            self._upsert_chunk(
                conn,
                chunk_id=chunk_id,
                source_node=file_node_id,
                text=chunk,
                metadata={"index": index, "source_node": file_node_id, "source_id": source_id},
            )
            self._upsert_edge(conn, file_node_id, chunk_id, "포함함", weight=0.7, metadata={"source": "local_scan"})

        # Concept ids are built from the classified type and a slug of the
        # concept, without the file id.
        concepts = _extract_concepts(target_for_concepts, limit=18)
        concept_ids: Dict[str, str] = {}
        for concept in concepts:
            node_t = _classify_node_type(concept, target_for_concepts)
            concept_id = f"{node_t.lower()}:{_slug(concept)}"
            concept_ids[concept.lower()] = concept_id
            self._upsert_node(
                conn,
                concept_id,
                node_t,
                concept,
                metadata={"auto_extracted": True, "source": "local_folder", "source_id": source_id},
            )
            self._upsert_edge(conn, file_node_id, concept_id, "언급함", weight=0.75, metadata={"source": "local_scan"})

        # Only triples whose subject and object are both known, distinct
        # concepts become edges.
        for triple in _extract_triples(target_for_concepts, concepts, limit=20):
            subj_id = concept_ids.get(triple["subject"].lower())
            obj_id = concept_ids.get(triple["object"].lower())
            if subj_id and obj_id and subj_id != obj_id:
                self._upsert_edge(
                    conn,
                    subj_id,
                    obj_id,
                    triple["relation"],
                    weight=0.9,
                    metadata={"context": triple.get("context", "")[:240], "source_id": source_id},
                )

        # Semantic items get ids that include the file node id.
        # NOTE(review): these nodes are not among the child types deleted
        # above, so items from an earlier version of the file may persist —
        # confirm whether that is intended.
        for item in _semantic_items(target_for_concepts):
            sem_type = item["type"]
            sem_title = item["title"]
            sem_id = f"{sem_type.lower()}:{_sha256_text(f'{file_node_id}:{sem_type}:{sem_title}')[:24]}"
            self._upsert_node(
                conn,
                sem_id,
                sem_type,
                sem_title,
                summary=item["summary"],
                metadata={"auto_extracted": True, "source_node": file_node_id, "filename": file_path.name},
                raw=item,
            )
            self._upsert_edge(conn, file_node_id, sem_id, "포함함", weight=0.9)

        return file_node_id
2287
-
2288
    def index_local_folder(
        self,
        path: Path,
        *,
        include_ocr: bool = False,
        watch_enabled: bool = False,
        user_email: Optional[str] = None,
        consent: Optional[Dict[str, Any]] = None,
        max_files: int = 5_000,
    ) -> Dict[str, Any]:
        """Read approved files from a local folder and connect them to Graph RAG.

        Registers (or updates) the folder as a knowledge source, walks it via
        ``_iter_local_scan_entries`` and, for every file entry, either skips
        it, records why it was not indexed, or rebuilds its graph nodes through
        ``_upsert_local_file_node``. Every outcome is written to
        ``local_file_index``. Unless the file limit was reached, index rows
        whose path was not seen in this scan are marked deleted and their graph
        data is removed.

        Args:
            path: Folder to index; expanded and resolved before use.
            include_ocr: Forwarded to ``_extract_local_file_text`` and stored
                on the source.
            watch_enabled: Stored on the source record and in the consent
                payload.
            user_email: Recorded as ``approved_by`` in the consent payload.
            consent: Extra consent fields; these override the defaults built
                here because they are merged last.
            max_files: Scan limit, clamped to the range 1..50_000 (a falsy
                value falls back to 5_000).

        Returns:
            A dict with ``status``, a ``source`` description, per-outcome
            ``counts``, up to 100 ``indexed_nodes``, up to 50 ``errors`` and a
            user-facing ``notice``.

        Raises:
            ValueError: If ``path`` does not exist or is not a directory.
        """
        root = Path(path).expanduser().resolve()
        if not root.exists():
            raise ValueError(f"경로가 존재하지 않습니다: {path}")
        if not root.is_dir():
            raise ValueError(f"폴더가 아닙니다: {path}")

        os_type = _current_os_type()
        drive_id = _drive_id_for_path(root)
        source_id = f"source:{_path_fingerprint(root)}"
        now = _now()
        max_files = max(1, min(int(max_files or 5_000), 50_000))
        consent_payload = {
            "approved_at": now,
            "approved_by": user_email,
            "knowledge_source": True,
            "include_ocr": bool(include_ocr),
            "watch_enabled": bool(watch_enabled),
            "sensitive_files_default_excluded": True,
            **(consent or {}),
        }
        counts: Counter = Counter()
        seen_relative_paths: set = set()
        indexed_nodes: List[str] = []
        errors: List[Dict[str, str]] = []
        limit_reached = False

        with self._connect() as conn:
            # Register the source with status "scanning"; it is switched to
            # "active" once the scan below has finished.
            conn.execute(
                """
                INSERT INTO knowledge_sources(
                    id, root_path, os_type, drive_id, label, status, include_ocr,
                    watch_enabled, consent_json, created_at, updated_at, last_scanned_at
                )
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                ON CONFLICT(id) DO UPDATE SET
                    root_path=excluded.root_path,
                    os_type=excluded.os_type,
                    drive_id=excluded.drive_id,
                    label=excluded.label,
                    status=excluded.status,
                    include_ocr=excluded.include_ocr,
                    watch_enabled=excluded.watch_enabled,
                    consent_json=excluded.consent_json,
                    updated_at=excluded.updated_at,
                    last_scanned_at=excluded.last_scanned_at
                """,
                (
                    source_id, str(root), os_type, drive_id, root.name or str(root), "scanning",
                    1 if include_ocr else 0, 1 if watch_enabled else 0, _json(consent_payload),
                    now, now, now,
                ),
            )

            for entry in self._iter_local_scan_entries(root, max_files=max_files):
                kind = entry["kind"]
                file_path = entry["path"]
                if kind == "limit_reached":
                    # Stop scanning; the deleted-file sweep below is skipped
                    # because the set of seen paths is incomplete.
                    counts["limit_reached"] += 1
                    limit_reached = True
                    break
                if kind in {"excluded_dir", "excluded"}:
                    counts["excluded"] += 1
                    continue
                if kind in {"inaccessible_dir", "inaccessible_file"}:
                    counts["failed"] += 1
                    errors.append({"path": str(file_path), "error": entry.get("reason", "inaccessible")})
                    continue
                if kind != "file":
                    continue

                stat = entry["stat"]
                try:
                    relative_path = file_path.relative_to(root).as_posix()
                except ValueError:
                    relative_path = file_path.name
                seen_relative_paths.add(relative_path)
                modified_at = _safe_iso_from_stat_mtime(stat.st_mtime)
                # Previous index row for this path, if any.
                existing = conn.execute(
                    """
                    SELECT size_bytes, modified_at, sha256, graph_node_id, status, metadata_json
                    FROM local_file_index
                    WHERE source_id=? AND relative_path=?
                    """,
                    (source_id, relative_path),
                ).fetchone()
                decision = self._local_file_decision(file_path, root, stat)
                parser_type = decision["parser_type"]
                if not decision["indexable"]:
                    # Not indexable: drop any graph data from an earlier scan
                    # and record the decision's status and reason.
                    counts[decision["status"]] += 1
                    if existing and existing["graph_node_id"]:
                        self._delete_local_file_graph(conn, existing["graph_node_id"])
                    self._upsert_local_file_index(
                        conn,
                        source_id=source_id,
                        root=root,
                        file_path=file_path,
                        stat=stat,
                        os_type=os_type,
                        drive_id=drive_id,
                        status=decision["status"],
                        parser_type=parser_type,
                        metadata={"reason": decision["reason"], "category": decision["category"]},
                    )
                    continue

                # Fast path: same size and modification time as the indexed
                # row, so the file is not read at all.
                if (
                    existing
                    and existing["status"] == "indexed"
                    and existing["graph_node_id"]
                    and self._local_file_index_has_extracted_text(existing)
                    and existing["size_bytes"] == stat.st_size
                    and existing["modified_at"] == modified_at
                ):
                    counts["skipped_unchanged"] += 1
                    self._upsert_local_file_index(
                        conn,
                        source_id=source_id,
                        root=root,
                        file_path=file_path,
                        stat=stat,
                        os_type=os_type,
                        drive_id=drive_id,
                        status="indexed",
                        parser_type=parser_type,
                        sha256=existing["sha256"],
                        graph_node_id=existing["graph_node_id"],
                        metadata={**_safe_loads(existing["metadata_json"]), "category": decision["category"], "unchanged": True},
                    )
                    continue

                try:
                    # The whole file is read into memory to compute its digest.
                    data = file_path.read_bytes()
                    digest = _sha256_bytes(data)
                except Exception as exc:
                    # Unreadable file: record the failure and remove graph data
                    # that may exist from an earlier successful scan.
                    counts["failed"] += 1
                    errors.append({"path": str(file_path), "error": str(exc)})
                    if existing and existing["graph_node_id"]:
                        self._delete_local_file_graph(conn, existing["graph_node_id"])
                    self._upsert_local_file_index(
                        conn,
                        source_id=source_id,
                        root=root,
                        file_path=file_path,
                        stat=stat,
                        os_type=os_type,
                        drive_id=drive_id,
                        status="failed",
                        parser_type=parser_type,
                        error_message=str(exc),
                        metadata={"category": decision["category"]},
                    )
                    continue

                # Second skip path: size or mtime changed but the content
                # digest is identical to the indexed one.
                if (
                    existing
                    and existing["sha256"] == digest
                    and existing["graph_node_id"]
                    and self._local_file_index_has_extracted_text(existing)
                ):
                    counts["skipped_unchanged"] += 1
                    self._upsert_local_file_index(
                        conn,
                        source_id=source_id,
                        root=root,
                        file_path=file_path,
                        stat=stat,
                        os_type=os_type,
                        drive_id=drive_id,
                        status="indexed",
                        parser_type=parser_type,
                        sha256=digest,
                        graph_node_id=existing["graph_node_id"],
                        metadata={**_safe_loads(existing["metadata_json"]), "category": decision["category"], "sha256_unchanged": True},
                    )
                    continue

                try:
                    text, parser_meta = self._extract_local_file_text(
                        file_path,
                        decision["category"],
                        include_ocr=include_ocr,
                    )
                    text = _clean_text(text)
                    # extracted_chars is what _local_file_index_has_extracted_text
                    # reads on later scans.
                    parser_meta = {**parser_meta, "extracted_chars": len(text)}
                    if not text:
                        counts["skipped_empty_text"] += 1
                        if existing and existing["graph_node_id"]:
                            self._delete_local_file_graph(conn, existing["graph_node_id"])
                        self._upsert_local_file_index(
                            conn,
                            source_id=source_id,
                            root=root,
                            file_path=file_path,
                            stat=stat,
                            os_type=os_type,
                            drive_id=drive_id,
                            status="skipped_empty_text",
                            parser_type=parser_type,
                            sha256=digest,
                            error_message="텍스트 추출 결과가 비어 있습니다.",
                            metadata={"category": decision["category"], "parser": parser_meta},
                        )
                        continue
                    graph_node_id = self._upsert_local_file_node(
                        conn,
                        source_id=source_id,
                        root=root,
                        file_path=file_path,
                        stat=stat,
                        os_type=os_type,
                        drive_id=drive_id,
                        sha256=digest,
                        category=decision["category"],
                        parser_type=parser_type,
                        text=text,
                        parser_meta=parser_meta,
                    )
                    self._upsert_local_file_index(
                        conn,
                        source_id=source_id,
                        root=root,
                        file_path=file_path,
                        stat=stat,
                        os_type=os_type,
                        drive_id=drive_id,
                        status="indexed",
                        parser_type=parser_type,
                        sha256=digest,
                        graph_node_id=graph_node_id,
                        metadata={"category": decision["category"], "parser": parser_meta},
                    )
                    counts["indexed"] += 1
                    indexed_nodes.append(graph_node_id)
                except Exception as exc:
                    # Best-effort per file: any extraction or graph error is
                    # recorded and the scan moves on to the next entry.
                    counts["failed"] += 1
                    errors.append({"path": str(file_path), "error": str(exc)})
                    if existing and existing["graph_node_id"]:
                        self._delete_local_file_graph(conn, existing["graph_node_id"])
                    self._upsert_local_file_index(
                        conn,
                        source_id=source_id,
                        root=root,
                        file_path=file_path,
                        stat=stat,
                        os_type=os_type,
                        drive_id=drive_id,
                        status="failed",
                        parser_type=parser_type,
                        sha256=digest,
                        error_message=str(exc),
                        metadata={"category": decision["category"]},
                    )

            if not limit_reached:
                # Index rows whose path was not seen in this scan are treated
                # as deleted files.
                # NOTE(review): the SELECT does not filter on the deleted flag,
                # so rows already marked deleted by an earlier scan appear to
                # be revisited and counted again — confirm this is intended.
                existing_rows = {
                    row["relative_path"]: row["graph_node_id"]
                    for row in conn.execute(
                        "SELECT relative_path, graph_node_id FROM local_file_index WHERE source_id=?",
                        (source_id,),
                    )
                }
                deleted_paths = set(existing_rows) - seen_relative_paths
                for relative_path in deleted_paths:
                    # graph_node_id may be NULL here; presumably
                    # _delete_local_file_graph tolerates None — verify.
                    self._delete_local_file_graph(conn, existing_rows.get(relative_path))
                    conn.execute(
                        """
                        UPDATE local_file_index
                        SET status='deleted', deleted=1, last_scanned_at=?, error_message=NULL, graph_node_id=NULL
                        WHERE source_id=? AND relative_path=?
                        """,
                        (_now(), source_id, relative_path),
                    )
                counts["deleted"] = len(deleted_paths)
            conn.execute(
                """
                UPDATE knowledge_sources
                SET status='active', updated_at=?, last_scanned_at=?
                WHERE id=?
                """,
                (_now(), _now(), source_id),
            )

        return {
            "status": "ok",
            "source": {
                "id": source_id,
                "root_path": str(root),
                "os_type": os_type,
                "drive_id": drive_id,
                "include_ocr": bool(include_ocr),
                "watch_enabled": bool(watch_enabled),
            },
            "counts": dict(counts),
            "indexed_nodes": indexed_nodes[:100],
            "errors": errors[:50],
            "notice": "Lattice AI는 사용자가 선택한 폴더만 AI 지식으로 변환합니다.",
        }
2596
-
2597
    def ingest_message(
        self,
        role: str,
        content: str,
        *,
        user_email: Optional[str] = None,
        user_nickname: Optional[str] = None,
        source: Optional[str] = None,
        conversation_id: Optional[str] = None,
        raw: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, Any]:
        """Store one chat message in the graph together with derived nodes.

        Upserts a Chat node for the conversation, an optional Person node, the
        message node itself (``AIResponse`` when ``role`` is ``"assistant"``,
        otherwise ``Message``), Chunk nodes and chunk rows, auto-extracted
        concept nodes, concept-to-concept edges and semantic item nodes.

        Args:
            role: Message role; only ``"assistant"`` changes the node type.
            content: Message text; coerced to ``str`` (``None`` becomes ``""``).
            user_email: Preferred key for the Person node.
            user_nickname: Display name; also the Person key when no email.
            source: Free-form origin label stored in metadata.
            conversation_id: Conversation key; ``'default'`` is used for the
                Chat node id when it is missing.
            raw: Raw payload stored on the message node; defaults to the
                metadata dict.

        Returns:
            ``{"node_id": <message node id>, "type": <node type>}``.
        """
        content = str(content or "")
        # The message id hashes role, content, conversation and user, so an
        # identical message resolves to the same node id.
        digest = _sha256_text("|".join([role or "", content, conversation_id or "", user_email or ""]))[:24]
        node_type = "AIResponse" if role == "assistant" else "Message"
        node_id = f"{node_type.lower()}:{digest}"
        conv_id = f"conversation:{_slug(conversation_id or 'default')}"
        metadata = {
            "role": role,
            "source": source,
            "conversation_id": conversation_id,
            "user_email": user_email,
            "user_nickname": user_nickname,
            "chars": len(content),
        }
        concepts = _extract_concepts(content)
        triples = _extract_triples(content, concepts)
        semantic = _semantic_items(content)

        with self._connect() as conn:
            # ── 1. Chat node (node = noun: one per conversation session) ─────
            # One Chat node per conversation_id. The title passed here is the
            # first 80 chars of the message given to THIS call, so it is
            # re-supplied on every call (what is persisted depends on
            # _upsert_node).
            chat_title = _clean_text(content)[:80] or (conversation_id or "대화")
            self._upsert_node(
                conn, conv_id, "Chat",
                chat_title,
                summary=_clean_text(content)[:400],
                metadata={"source": source, "conversation_id": conversation_id},
            )

            # ── 2. Person node (node = noun: a person) ───────────────────────
            person_id = None
            if user_email or user_nickname:
                person_key = user_email or user_nickname or "unknown"
                person_id = f"person:{_slug(person_key)}"
                self._upsert_node(
                    conn, person_id, "Person",
                    user_nickname or user_email or "Unknown",
                    metadata={"email": user_email, "nickname": user_nickname},
                )
                # Edge = verb: the Person "작성함" (authored) the Chat
                self._upsert_edge(conn, person_id, conv_id, "작성함",
                                  weight=1.0, metadata={"role": role})

            # ── 3. Raw message node (for RAG retrieval, hidden in the graph) ─
            self._upsert_node(
                conn, node_id, node_type,
                _clean_text(content)[:80] or role,
                summary=_clean_text(content)[:500],
                metadata=metadata,
                raw=raw or metadata,
            )
            # Edge: the Chat "포함함" (contains) the message
            self._upsert_edge(conn, conv_id, node_id, "포함함",
                              weight=0.3, metadata={"role": role})

            # ── 4. RAG chunks (for retrieval, hidden in the graph) ───────────
            for index, chunk in enumerate(_chunks(content)):
                chunk_id = f"chunk:{_sha256_text(f'{node_id}:{index}:{chunk}')[:24]}"
                self._upsert_node(
                    conn, chunk_id, "Chunk",
                    f"chunk {index + 1}",
                    summary=chunk[:500],
                    metadata={"index": index, "source_node": node_id},
                )
                self._upsert_chunk(
                    conn,
                    chunk_id=chunk_id,
                    source_node=node_id,
                    text=chunk,
                    metadata={"index": index, "source_node": node_id},
                )
                self._upsert_edge(conn, node_id, chunk_id, "포함함")

            # ── 5. Concept / Feature / Error / Code nodes (node = noun) ──────
            concept_ids: Dict[str, str] = {}
            for concept in concepts:
                node_t = _classify_node_type(concept, content)
                cid = f"{node_t.lower()}:{_slug(concept)}"
                concept_ids[concept.lower()] = cid
                self._upsert_node(
                    conn, cid, node_t, concept,
                    metadata={"auto_extracted": True, "source": source},
                )
                # Edge: the Chat "언급함" (mentions) the concept
                self._upsert_edge(conn, conv_id, cid, "언급함",
                                  weight=0.7, metadata={"source": source})

            # ── 6. Concept–Concept edges (edge = verb form) ──────────────────
            for triple in triples:
                subj_id = concept_ids.get(triple["subject"].lower())
                obj_id = concept_ids.get(triple["object"].lower())
                # Only link two distinct concepts that were both extracted above.
                if subj_id and obj_id and subj_id != obj_id:
                    self._upsert_edge(
                        conn, subj_id, obj_id,
                        triple["relation"],  # verb-form label
                        weight=1.0,
                        metadata={"context": triple.get("context", "")[:240]},
                    )

            # ── 7. Task / Decision nodes (node = noun) ───────────────────────
            for item in semantic:
                sem_type = item["type"]
                sem_title = item["title"]
                # The id is scoped to the conversation, not to this message.
                sem_id = f"{sem_type.lower()}:{_sha256_text(f'{conv_id}:{sem_type}:{sem_title}')[:24]}"
                self._upsert_node(
                    conn, sem_id, sem_type, sem_title,
                    summary=item["summary"],
                    metadata={"auto_extracted": True, "source_node": node_id},
                    raw=item,
                )
                # Edge: the Chat "생성함" (created) the Task/Decision
                self._upsert_edge(conn, conv_id, sem_id, "생성함", weight=0.9)
                # The Task/Decision "언급함" (mentions) up to three of the
                # extracted concepts
                for cid in list(concept_ids.values())[:3]:
                    self._upsert_edge(conn, sem_id, cid, "언급함", weight=0.6)

        return {"node_id": node_id, "type": node_type}
2725
-
2726
    def ingest_document(
        self,
        path: Path,
        *,
        original_filename: Optional[str] = None,
        mime_type: Optional[str] = None,
        uploader: Optional[str] = None,
        conversation_id: Optional[str] = None,
        extracted: Optional[Dict[str, Any]] = None,
        source_type: Optional[str] = None,
        source_uri: Optional[str] = None,
        captured_at: Optional[str] = None,
        modified_at: Optional[str] = None,
        owner: Optional[str] = None,
        workspace_id: Optional[str] = None,
        permissions: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, Any]:
        """Ingest a file as a Document node with chunks and derived nodes.

        The file is read, hashed and copied into ``self.blob_dir`` under a name
        derived from its digest (skipped when that blob already exists). The
        Document node id is derived from the same digest. A Source node is
        attached only when ``source_type`` is given.

        Args:
            path: File to ingest.
            original_filename: Display name; defaults to ``path.name``.
            mime_type: Stored in metadata.
            uploader: When set, a Person node is linked to the document.
            conversation_id: When set, a Chat node is linked to the document.
            extracted: Extraction result; ``content`` (or ``preview``) supplies
                the text, and all keys except ``content`` are kept in metadata.
            source_type: Enables the Source node; metadata records ``"file"``
                when it is missing.
            source_uri: Origin URI; defaults to ``str(path)``.
            captured_at: Capture timestamp; defaults to ``_now()``.
            modified_at: Stored in metadata.
            owner: Stored in metadata; defaults to ``uploader``.
            workspace_id: Stored in metadata.
            permissions: Stored in metadata; defaults to ``{}``.

        Returns:
            A dict with the document node id, type, digest (as ``sha256`` and
            ``content_hash``), ``source_node_id`` (``None`` without
            ``source_type``), chunk ids and count, ``duplicate`` (the node
            already existed before this call), ``captured_at`` and the stored
            metadata.
        """
        path = Path(path)
        data = path.read_bytes()
        digest = _sha256_bytes(data)
        ext = path.suffix.lower()
        filename = original_filename or path.name
        captured_at = captured_at or _now()
        # Blob path is keyed by digest, with the first two hex chars as a
        # subdirectory.
        blob_path = self.blob_dir / digest[:2] / f"{digest}{ext}"
        blob_path.parent.mkdir(parents=True, exist_ok=True)
        if not blob_path.exists():
            shutil.copyfile(path, blob_path)

        doc_meta = self._document_structure(path, ext)
        text = str((extracted or {}).get("content") or (extracted or {}).get("preview") or "")
        file_id = f"file:{digest[:24]}"
        metadata = {
            "filename": filename,
            "ext": ext,
            "mime_type": mime_type,
            "bytes": len(data),
            "sha256": digest,
            "content_hash": digest,
            "blob_path": str(blob_path),
            "uploader": uploader,
            "owner": owner or uploader,
            "workspace_id": workspace_id,
            "permissions": permissions or {},
            "source_type": source_type or "file",
            "source_uri": source_uri or str(path),
            "captured_at": captured_at,
            "modified_at": modified_at,
            "conversation_id": conversation_id,
            "extracted": {k: v for k, v in (extracted or {}).items() if k != "content"},
            "structure": doc_meta,
        }
        # Concepts and triples are extracted from the filename plus the text;
        # the semantic items further below use the text alone.
        full_text = f"{filename}\n{text}"
        concepts = _extract_concepts(full_text, limit=15)
        triples = _extract_triples(full_text, concepts)
        chunk_ids: List[str] = []
        source_node_id: Optional[str] = None

        with self._connect() as conn:
            # Checked before the upsert so it reports the pre-existing state.
            duplicate = self._node_exists(conn, file_id)
            # ── Document node (node = noun: the file) ────────────────────────
            self._upsert_node(
                conn, file_id, "Document", filename,
                summary=(text or filename)[:500],
                metadata=metadata, raw=metadata,
            )
            self._ingest_structure_nodes(conn, file_id, filename, doc_meta)

            # ── SOURCE node + indexed_from (v3.6.0, when source_type is set) ─
            if source_type:
                source_node_id = self._attach_source_node(
                    conn, file_id,
                    source_type=source_type, source_uri=source_uri or str(path),
                    title=filename, content_hash=digest, captured_at=captured_at,
                    extra={"owner": owner or uploader, "workspace_id": workspace_id, "ext": ext},
                )

            # ── Person node + verb-form edge ─────────────────────────────────
            if uploader:
                person_id = f"person:{_slug(uploader)}"
                self._upsert_node(
                    conn, person_id, "Person", uploader,
                    metadata={"email": uploader},
                )
                # Edge = verb: the Person "업로드함" (uploaded) the Document
                self._upsert_edge(conn, person_id, file_id, "업로드함", weight=1.0)

            # ── Link to the Chat node ────────────────────────────────────────
            if conversation_id:
                conv_id = f"conversation:{_slug(conversation_id)}"
                self._upsert_node(conn, conv_id, "Chat", conversation_id)
                # Edge = verb: the Chat "언급함" (mentions) the Document
                self._upsert_edge(conn, conv_id, file_id, "언급함", weight=0.8)

            # ── RAG chunks (for retrieval, not shown in the graph) ───────────
            for index, chunk in enumerate(_chunks(text)):
                chunk_id = f"chunk:{_sha256_text(f'{file_id}:{index}:{chunk}')[:24]}"
                chunk_ids.append(chunk_id)
                self._upsert_node(
                    conn, chunk_id, "Chunk",
                    f"{filename} chunk {index + 1}",
                    summary=chunk[:500],
                    metadata={"index": index, "source_node": file_id},
                )
                self._upsert_chunk(
                    conn,
                    chunk_id=chunk_id,
                    source_node=file_id,
                    text=chunk,
                    metadata={"index": index, "source_node": file_id},
                )
                self._upsert_edge(conn, file_id, chunk_id, "포함함")

            # ── Concept / Feature / Error / Code nodes + verb-form edges ─────
            concept_ids: Dict[str, str] = {}
            for concept in concepts:
                node_t = _classify_node_type(concept, full_text)
                cid = f"{node_t.lower()}:{_slug(concept)}"
                concept_ids[concept.lower()] = cid
                self._upsert_node(
                    conn, cid, node_t, concept,
                    metadata={"auto_extracted": True, "source_file": filename},
                )
                # Edge = verb: the Document "포함함" (contains) the Concept
                self._upsert_edge(conn, file_id, cid, "포함함", weight=0.8)

            # ── Concept–Concept edges (edge = verb form) ─────────────────────
            for triple in triples:
                subj_id = concept_ids.get(triple["subject"].lower())
                obj_id = concept_ids.get(triple["object"].lower())
                # Only link two distinct concepts that were both extracted above.
                if subj_id and obj_id and subj_id != obj_id:
                    self._upsert_edge(
                        conn, subj_id, obj_id,
                        triple["relation"],
                        weight=1.0,
                        metadata={"context": triple.get("context", "")[:240]},
                    )

            # ── Task / Decision nodes ────────────────────────────────────────
            for item in _semantic_items(text):
                sem_type = item["type"]
                sem_title = item["title"]
                sem_id = f"{sem_type.lower()}:{_sha256_text(f'{file_id}:{sem_type}:{sem_title}')[:24]}"
                self._upsert_node(
                    conn, sem_id, sem_type, sem_title,
                    summary=item["summary"],
                    metadata={"auto_extracted": True, "source_node": file_id, "filename": filename},
                    raw=item,
                )
                # Edge: the Document "포함함" (contains) the Task/Decision
                self._upsert_edge(conn, file_id, sem_id, "포함함", weight=0.9)

        return {
            "node_id": file_id,
            "type": "Document",
            "sha256": digest,
            "content_hash": digest,
            "source_node_id": source_node_id,
            "chunk_ids": chunk_ids,
            "chunk_count": len(chunk_ids),
            "duplicate": duplicate,
            "captured_at": captured_at,
            "metadata": metadata,
        }
2889
-
2890
- def ingest_event(
2891
- self,
2892
- event_type: str,
2893
- title: str,
2894
- *,
2895
- user_email: Optional[str] = None,
2896
- user_nickname: Optional[str] = None,
2897
- source: Optional[str] = None,
2898
- conversation_id: Optional[str] = None,
2899
- metadata: Optional[Dict[str, Any]] = None,
2900
- ) -> Dict[str, Any]:
2901
- event_type = str(event_type or "Event")
2902
- title = str(title or event_type)
2903
- payload = {
2904
- "event_type": event_type,
2905
- "title": title,
2906
- "user_email": user_email,
2907
- "user_nickname": user_nickname,
2908
- "source": source,
2909
- "conversation_id": conversation_id,
2910
- "metadata": metadata or {},
2911
- "timestamp": _now(),
2912
- }
2913
- event_id = f"event:{_sha256_text(_json(payload))[:24]}"
2914
- conv_id = f"conversation:{_slug(conversation_id or 'default')}"
2915
- with self._connect() as conn:
2916
- self._upsert_node(conn, event_id, event_type, title, summary=title, metadata=payload, raw=payload)
2917
- self._upsert_node(conn, conv_id, "Conversation", conversation_id or "Default conversation", metadata={"source": source})
2918
- self._upsert_edge(conn, conv_id, event_id, "has_event", metadata={"source": source})
2919
- if user_email or user_nickname:
2920
- person_key = user_email or user_nickname or "unknown"
2921
- person_id = f"person:{_slug(person_key)}"
2922
- self._upsert_node(conn, person_id, "Person", user_nickname or user_email or "Unknown user", metadata={"email": user_email})
2923
- self._upsert_edge(conn, person_id, event_id, "triggered", metadata={"event_type": event_type})
2924
- return {"node_id": event_id, "type": event_type}
2925
-
2926
- # ── v3.6.0 Knowledge Graph First: unified source ingestion + provenance ──────
2927
- def _node_exists(self, conn: sqlite3.Connection, node_id: str) -> bool:
2928
- row = conn.execute("SELECT 1 FROM nodes WHERE id = ?", (node_id,)).fetchone()
2929
- return row is not None
2930
-
2931
- def node_is_embedded(self, node_id: str) -> bool:
2932
- """True when a vector embedding exists for ``node_id`` (RAG-ready)."""
2933
- with self._connect() as conn:
2934
- row = conn.execute(
2935
- "SELECT 1 FROM vector_embeddings WHERE item_id = ? LIMIT 1",
2936
- (node_id,),
2937
- ).fetchone()
2938
- return row is not None
2939
-
2940
- def _attach_source_node(
2941
- self,
2942
- conn: sqlite3.Connection,
2943
- content_node_id: str,
2944
- *,
2945
- source_type: str,
2946
- source_uri: Optional[str] = None,
2947
- title: Optional[str] = None,
2948
- content_hash: Optional[str] = None,
2949
- captured_at: Optional[str] = None,
2950
- extra: Optional[Dict[str, Any]] = None,
2951
- ) -> str:
2952
- """Create the SOURCE node for an ingested item and link it via INDEXED_FROM.
2953
-
2954
- Every ingested content node points at exactly one SOURCE node, so the
2955
- graph is always able to explain *where* a node came from. The source id
2956
- is derived from (source_type, source_uri | content_hash) so re-ingesting
2957
- the same origin reuses the same SOURCE node (idempotent).
2958
- """
2959
- key = source_uri or content_hash or content_node_id
2960
- source_id = f"source:{_sha256_text(f'{source_type}|{key}')[:24]}"
2961
- meta = {
2962
- "source_type": source_type,
2963
- "source_uri": source_uri,
2964
- "content_hash": content_hash,
2965
- "captured_at": captured_at or _now(),
2966
- **(extra or {}),
2967
- }
2968
- label = title or source_uri or source_type
2969
- self._upsert_node(
2970
- conn, source_id, "Source", label,
2971
- summary=str(source_uri or title or source_type)[:400],
2972
- metadata=meta,
2973
- )
2974
- # 선: 콘텐츠 노드가 "이 출처에서 색인됨" (indexed_from → SOURCE)
2975
- self._upsert_edge(conn, content_node_id, source_id, "indexed_from",
2976
- weight=1.0, metadata={"source_type": source_type})
2977
- return source_id
2978
-
2979
def ingest_source(
    self,
    *,
    source_type: str,
    title: str,
    text: str,
    source_uri: Optional[str] = None,
    owner: Optional[str] = None,
    workspace_id: Optional[str] = None,
    permissions: Optional[Dict[str, Any]] = None,
    captured_at: Optional[str] = None,
    modified_at: Optional[str] = None,
    conversation_id: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Unified text/web ingestion: one shape for URL, browser tab, note, text.

    Creates a content ``Document`` node (idempotent by content hash), a
    ``Source`` node linked via ``indexed_from``, RAG chunks, and extracted
    Concept/Task/Decision nodes — mirroring ingest_document for non-file
    sources. Returns the full set of ids the caller needs to record
    provenance, including ``duplicate`` (was the content already indexed).

    The content hash covers ``source_type``, ``source_uri`` and ``text`` (not
    the title), and the content node id is ``webdoc:`` plus the first 24 hex
    characters of that hash. Keys in ``metadata`` are merged last into the
    node metadata, so they override the built-in keys on collision.

    Returns a dict with ``node_id``, ``type`` (always ``"Document"``),
    ``source_node_id``, ``content_hash``, ``chunk_ids``, ``chunk_count``,
    ``duplicate`` and ``captured_at``.
    """
    # Normalise inputs; a missing title falls back to the URI, then the type.
    source_type = str(source_type or "text")
    text = str(text or "")
    title = _clean_text(str(title or source_uri or source_type))[:240] or source_type
    captured_at = captured_at or _now()
    content_hash = _sha256_text(f"{source_type}|{source_uri or ''}|{text}")
    content_id = f"webdoc:{content_hash[:24]}"
    full_text = f"{title}\n{text}"
    node_meta = {
        "source_type": source_type,
        "source_uri": source_uri,
        "content_hash": content_hash,
        "title": title,
        "captured_at": captured_at,
        "modified_at": modified_at,
        "owner": owner,
        "workspace_id": workspace_id,
        "permissions": permissions or {},
        "chars": len(text),
        **(metadata or {}),
    }
    # Concept/triple extraction runs on title + body, before the DB is opened.
    concepts = _extract_concepts(full_text, limit=15)
    triples = _extract_triples(full_text, concepts)
    chunk_ids: List[str] = []

    with self._connect() as conn:
        # Must be checked before the upsert below creates the node.
        duplicate = self._node_exists(conn, content_id)
        # ── Content node (point: noun — the document) ───────────────────
        self._upsert_node(
            conn, content_id, "Document", title,
            summary=(text or title)[:500],
            metadata=node_meta, raw=node_meta,
        )
        # ── SOURCE node + indexed_from edge (source tracking) ───────────
        source_node_id = self._attach_source_node(
            conn, content_id,
            source_type=source_type, source_uri=source_uri, title=title,
            content_hash=content_hash, captured_at=captured_at,
            extra={"owner": owner, "workspace_id": workspace_id},
        )
        # ── Owner (Person) + verb-style edge ────────────────────────────
        if owner:
            person_id = f"person:{_slug(owner)}"
            self._upsert_node(conn, person_id, "Person", owner, metadata={"email": owner})
            # Edge label "업로드함" means "uploaded".
            self._upsert_edge(conn, person_id, content_id, "업로드함", weight=1.0)
        # ── Conversation link ───────────────────────────────────────────
        if conversation_id:
            conv_id = f"conversation:{_slug(conversation_id)}"
            self._upsert_node(conn, conv_id, "Chat", conversation_id)
            # Edge label "언급함" means "mentioned".
            self._upsert_edge(conn, conv_id, content_id, "언급함", weight=0.8)
        # ── RAG chunks ──────────────────────────────────────────────────
        # Chunks are built from the body only (the title is not included).
        for index, chunk in enumerate(_chunks(text)):
            chunk_id = f"chunk:{_sha256_text(f'{content_id}:{index}:{chunk}')[:24]}"
            chunk_ids.append(chunk_id)
            self._upsert_node(
                conn, chunk_id, "Chunk", f"{title} chunk {index + 1}",
                summary=chunk[:500], metadata={"index": index, "source_node": content_id},
            )
            self._upsert_chunk(conn, chunk_id=chunk_id, source_node=content_id,
                               text=chunk, metadata={"index": index, "source_node": content_id})
            # Edge label "포함함" means "contains".
            self._upsert_edge(conn, content_id, chunk_id, "포함함")
        # ── Concept / Feature / Error / Code nodes + edges ──────────────
        # Maps lower-cased concept text to its node id so triples can be resolved.
        concept_ids: Dict[str, str] = {}
        for concept in concepts:
            node_t = _classify_node_type(concept, full_text)
            cid = f"{node_t.lower()}:{_slug(concept)}"
            concept_ids[concept.lower()] = cid
            self._upsert_node(conn, cid, node_t, concept,
                              metadata={"auto_extracted": True, "source_type": source_type})
            self._upsert_edge(conn, content_id, cid, "포함함", weight=0.8)
        for triple in triples:
            subj_id = concept_ids.get(triple["subject"].lower())
            obj_id = concept_ids.get(triple["object"].lower())
            # Only link triples whose both ends are known, distinct concepts.
            if subj_id and obj_id and subj_id != obj_id:
                self._upsert_edge(conn, subj_id, obj_id, triple["relation"],
                                  weight=1.0, metadata={"context": triple.get("context", "")[:240]})
        # ── Task / Decision nodes ───────────────────────────────────────
        for item in _semantic_items(text):
            sem_type = item["type"]
            sem_title = item["title"]
            # The id is scoped to this document, so equal titles in different
            # documents produce separate nodes.
            sem_id = f"{sem_type.lower()}:{_sha256_text(f'{content_id}:{sem_type}:{sem_title}')[:24]}"
            self._upsert_node(conn, sem_id, sem_type, sem_title, summary=item["summary"],
                              metadata={"auto_extracted": True, "source_node": content_id}, raw=item)
            self._upsert_edge(conn, content_id, sem_id, "포함함", weight=0.9)

    return {
        "node_id": content_id,
        "type": "Document",
        "source_node_id": source_node_id,
        "content_hash": content_hash,
        "chunk_ids": chunk_ids,
        "chunk_count": len(chunk_ids),
        "duplicate": duplicate,
        "captured_at": captured_at,
    }
3096
-
3097
def record_provenance(
    self,
    *,
    node_id: str,
    source_type: str,
    pipeline: str = "unified-ingestion",
    source_uri: Optional[str] = None,
    content_hash: Optional[str] = None,
    title: Optional[str] = None,
    owner: Optional[str] = None,
    workspace_id: Optional[str] = None,
    captured_at: Optional[str] = None,
    modified_at: Optional[str] = None,
    embedded: bool = False,
    linked: bool = False,
    duplicate: bool = False,
    agent_used: Optional[str] = None,
    chunk_count: int = 0,
    permissions: Optional[Dict[str, Any]] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Write one audit-trail row describing how ``node_id`` was ingested.

    The row id is ``prov:`` plus the first 24 hex characters of a hash over
    the node id, the content hash and the creation timestamp. The row is
    written with ``INSERT OR REPLACE``, so a second call producing the same
    id overwrites the earlier row.

    Returns a dict with the new row ``id``, the ``node_id`` and ``created_at``.
    """
    stamp = _now()
    digest = _sha256_text(f"{node_id}|{content_hash or ''}|{stamp}")
    record_id = f"prov:{digest[:24]}"
    statement = """
        INSERT OR REPLACE INTO ingestion_provenance(
            id, node_id, source_type, source_uri, content_hash, title, pipeline,
            owner, workspace_id, captured_at, modified_at, embedded, linked,
            duplicate, agent_used, chunk_count, permissions_json, metadata_json, created_at)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """
    with self._connect() as conn:
        # Booleans are stored as 0/1 integers.
        embedded_flag, linked_flag, duplicate_flag = (
            1 if flag else 0 for flag in (embedded, linked, duplicate)
        )
        values = (
            record_id, node_id, source_type, source_uri, content_hash, title, pipeline,
            owner, workspace_id, captured_at, modified_at,
            embedded_flag, linked_flag, duplicate_flag,
            agent_used, int(chunk_count or 0),
            _json(permissions or {}), _json(metadata or {}), stamp,
        )
        conn.execute(statement, values)
    return {"id": record_id, "node_id": node_id, "created_at": stamp}
3139
-
3140
@staticmethod
def _provenance_row(row: sqlite3.Row) -> Dict[str, Any]:
    """Convert one ``ingestion_provenance`` row into a plain dict.

    Text columns are copied as-is, the three 0/1 flag columns become
    booleans, and the two ``*_json`` columns are decoded with
    ``_safe_loads`` and exposed without the ``_json`` suffix.
    """
    record: Dict[str, Any] = {}
    for column in (
        "id", "node_id", "source_type", "source_uri", "content_hash", "title",
        "pipeline", "owner", "workspace_id", "captured_at", "modified_at",
    ):
        record[column] = row[column]
    for flag in ("embedded", "linked", "duplicate"):
        record[flag] = bool(row[flag])
    record["agent_used"] = row["agent_used"]
    record["chunk_count"] = row["chunk_count"]
    record["permissions"] = _safe_loads(row["permissions_json"])
    record["metadata"] = _safe_loads(row["metadata_json"])
    record["created_at"] = row["created_at"]
    return record
3163
-
3164
def get_provenance(self, node_id: str) -> Optional[Dict[str, Any]]:
    """Fetch the newest provenance record for ``node_id``.

    Rows are ordered by ``created_at`` and then ``rowid``, both descending.
    Returns ``None`` when the node has no provenance row.
    """
    statement = (
        "SELECT * FROM ingestion_provenance WHERE node_id = ? "
        "ORDER BY created_at DESC, rowid DESC LIMIT 1"
    )
    with self._connect() as conn:
        latest = conn.execute(statement, (node_id,)).fetchone()
    if not latest:
        return None
    return self._provenance_row(latest)
3173
-
3174
def list_provenance(self, *, limit: int = 100, source_type: Optional[str] = None) -> Dict[str, Any]:
    """List provenance records newest first, optionally for one source type.

    ``limit`` is clamped to the range 1..1000; a falsy limit means 100.
    Returns ``{"items": [...], "count": n}`` where ``count`` is the number
    of items returned.
    """
    capped = max(1, min(int(limit or 100), 1000))
    if source_type:
        statement = (
            "SELECT * FROM ingestion_provenance WHERE source_type = ? "
            "ORDER BY created_at DESC, rowid DESC LIMIT ?"
        )
        bindings: tuple = (source_type, capped)
    else:
        statement = (
            "SELECT * FROM ingestion_provenance "
            "ORDER BY created_at DESC, rowid DESC LIMIT ?"
        )
        bindings = (capped,)
    with self._connect() as conn:
        records = conn.execute(statement, bindings).fetchall()
    items = [self._provenance_row(record) for record in records]
    return {"items": items, "count": len(records)}
3191
-
3192
def provenance_stats(self) -> Dict[str, Any]:
    """Summarise the ``ingestion_provenance`` table.

    Returns the total row count, a per-``source_type`` count, the number of
    rows flagged ``embedded`` and ``duplicate``, and the newest
    ``created_at`` value (``None`` when the table is empty).
    """
    with self._connect() as conn:

        def scalar(statement: str) -> Any:
            # Every counting query aliases its single value as "c".
            return conn.execute(statement).fetchone()["c"]

        total = scalar("SELECT COUNT(*) AS c FROM ingestion_provenance")
        per_source: Dict[Any, Any] = {}
        grouped = conn.execute(
            "SELECT source_type, COUNT(*) AS c FROM ingestion_provenance GROUP BY source_type"
        ).fetchall()
        for entry in grouped:
            per_source[entry["source_type"]] = entry["c"]
        embedded = scalar("SELECT COUNT(*) AS c FROM ingestion_provenance WHERE embedded = 1")
        duplicates = scalar("SELECT COUNT(*) AS c FROM ingestion_provenance WHERE duplicate = 1")
        newest = conn.execute(
            "SELECT created_at FROM ingestion_provenance ORDER BY created_at DESC LIMIT 1"
        ).fetchone()
    last_ingested_at = newest["created_at"] if newest else None
    return {
        "total": total,
        "by_source_type": per_source,
        "embedded": embedded,
        "duplicates": duplicates,
        "last_ingested_at": last_ingested_at,
    }
3218
-
3219
- # ── v3.6.0 portability: logical export / import + binary backup ──────────────
3220
def schema_versions(self) -> Dict[str, Any]:
    """Report the schema/projection versions and embedding dimension.

    An exporter stamps these values and an importer validates against them.
    When ``kg_schema`` cannot be imported, the embedding dimension falls back
    to 1024 and the v2 schema version to 2.
    """
    try:
        from kg_schema import EMBED_DIM as embed_dim, KG_SCHEMA_V2_VERSION as v2_version
    except Exception:  # pragma: no cover - kg_schema always importable in practice
        embed_dim = 1024
        v2_version = 2
    versions: Dict[str, Any] = {}
    versions["graph_schema_version"] = GRAPH_SCHEMA_VERSION
    versions["kg_v2_schema_version"] = v2_version
    versions["projection_version"] = _PROJECTION_VERSION
    versions["embed_dim"] = embed_dim
    return versions
3232
-
3233
def export_graph_data(self) -> Dict[str, Any]:
    """Dump the graph tables as lists of plain row dicts.

    Covers nodes, edges, chunks, knowledge sources and ingestion provenance
    (exposed under the key ``provenance``). Vector embeddings are not part
    of this export; use :meth:`backup_database` for a binary copy that
    includes them. A ``counts`` entry maps each exported key to its row count.
    """
    # (result key, table name) pairs, in the order they are read and returned.
    sections = (
        ("nodes", "nodes"),
        ("edges", "edges"),
        ("chunks", "chunks"),
        ("knowledge_sources", "knowledge_sources"),
        ("provenance", "ingestion_provenance"),
    )
    snapshot: Dict[str, Any] = {}
    with self._connect() as conn:
        for key, table in sections:
            fetched = conn.execute(f"SELECT * FROM {table}").fetchall()
            snapshot[key] = [dict(record) for record in fetched]
    # Computed before "counts" is inserted, so it only covers the row lists.
    tallies = {key: len(items) for key, items in snapshot.items()}
    snapshot["counts"] = tallies
    return snapshot
3252
-
3253
def import_graph_data(
    self, data: Dict[str, Any], *, mode: str = "merge", dry_run: bool = False
) -> Dict[str, Any]:
    """Import a logical export back into the store.

    ``mode='merge'`` upserts on top of existing data (id collisions update);
    ``mode='replace'`` clears the graph first via ``clear_all()``. Any other
    mode value is treated like ``'merge'``. ``dry_run=True`` reports the plan
    without writing. Refuses artifacts from a NEWER graph schema than this
    build.

    Args:
        data: Mapping with optional ``nodes``, ``edges``, ``chunks``,
            ``knowledge_sources``, ``provenance`` lists and an optional
            ``header`` carrying ``graph_schema_version``.
        mode: ``"merge"`` or ``"replace"``.
        dry_run: When true, only the per-section counts are returned.

    Returns:
        The plan dict (mode plus per-section counts), with ``dry_run`` or
        ``imported`` set to ``True``.

    Raises:
        ValueError: If the artifact's integer ``graph_schema_version`` is
            greater than ``GRAPH_SCHEMA_VERSION``.
    """
    nodes = data.get("nodes") or []
    edges = data.get("edges") or []
    chunks = data.get("chunks") or []
    sources = data.get("knowledge_sources") or []
    provenance = data.get("provenance") or []

    header = data.get("header") or {}
    incoming_schema = header.get("graph_schema_version")
    if isinstance(incoming_schema, int) and incoming_schema > GRAPH_SCHEMA_VERSION:
        raise ValueError(
            f"Artifact graph_schema_version {incoming_schema} is newer than this "
            f"build ({GRAPH_SCHEMA_VERSION}); refusing to import."
        )

    plan = {
        "mode": mode,
        "nodes": len(nodes),
        "edges": len(edges),
        "chunks": len(chunks),
        "knowledge_sources": len(sources),
        "provenance": len(provenance),
    }
    if dry_run:
        plan["dry_run"] = True
        return plan

    # NOTE(review): the clear happens before the import connection is opened,
    # so a failed import in replace mode appears to leave the graph emptied.
    if mode == "replace":
        self.clear_all()

    with self._connect() as conn:
        for n in nodes:
            self._upsert_node(
                conn, n["id"], n["type"], n.get("title") or "",
                summary=n.get("summary") or "",
                metadata=_safe_loads(n.get("metadata_json")),
                raw=_safe_loads(n.get("raw_json")),
            )
        for c in chunks:
            self._upsert_chunk(
                conn, chunk_id=c["id"], source_node=c["source_node"],
                text=c.get("text") or "", metadata=_safe_loads(c.get("metadata_json")),
            )
        for e in edges:
            # Default only a missing/blank weight to 1.0. The previous
            # ``float(weight or 1.0)`` also rewrote an explicit 0.0 to 1.0,
            # which made an export/import round trip lossy.
            raw_weight = e.get("weight")
            weight = 1.0 if raw_weight is None or raw_weight == "" else float(raw_weight)
            self._upsert_edge(
                conn, e["from_node"], e["to_node"], e["type"],
                weight=weight,
                metadata=_safe_loads(e.get("metadata_json")),
            )
        for s in sources:
            conn.execute(
                """
                INSERT OR REPLACE INTO knowledge_sources(
                    id, root_path, os_type, drive_id, label, status, include_ocr,
                    watch_enabled, consent_json, created_at, updated_at, last_scanned_at)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """,
                (
                    s["id"], s["root_path"], s["os_type"], s.get("drive_id"), s.get("label"),
                    s.get("status") or "active", int(s.get("include_ocr") or 0),
                    int(s.get("watch_enabled") or 0), s.get("consent_json") or "{}",
                    s.get("created_at") or _now(), s.get("updated_at") or _now(),
                    s.get("last_scanned_at"),
                ),
            )
        for p in provenance:
            conn.execute(
                """
                INSERT OR REPLACE INTO ingestion_provenance(
                    id, node_id, source_type, source_uri, content_hash, title, pipeline,
                    owner, workspace_id, captured_at, modified_at, embedded, linked,
                    duplicate, agent_used, chunk_count, permissions_json, metadata_json, created_at)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """,
                (
                    p["id"], p["node_id"], p["source_type"], p.get("source_uri"),
                    p.get("content_hash"), p.get("title"), p.get("pipeline") or "import",
                    p.get("owner"), p.get("workspace_id"), p.get("captured_at"),
                    p.get("modified_at"), int(p.get("embedded") or 0), int(p.get("linked") or 0),
                    int(p.get("duplicate") or 0), p.get("agent_used"), int(p.get("chunk_count") or 0),
                    p.get("permissions_json") or "{}", p.get("metadata_json") or "{}",
                    p.get("created_at") or _now(),
                ),
            )
    plan["imported"] = True
    return plan
3348
-
3349
def backup_database(self, dest_path) -> Path:
    """Snapshot the live database into a single file at ``dest_path``.

    Runs a full WAL checkpoint and then ``VACUUM INTO`` the destination, so
    the result is one standalone database file that can be restored by a
    plain file copy. Missing parent directories are created, and an existing
    file at the destination is removed first because ``VACUUM INTO`` requires
    the target to not exist.

    Returns the destination as a ``Path``.
    """
    target = Path(dest_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    if target.exists():
        target.unlink()  # VACUUM INTO requires the target to not exist
    connection = self._connect()
    try:
        steps = (
            ("PRAGMA wal_checkpoint(FULL)", ()),
            ("VACUUM INTO ?", (str(target),)),
        )
        for statement, bindings in steps:
            connection.execute(statement, bindings)
    finally:
        connection.close()
    return target
3368
-
3369
- def _ingest_structure_nodes(
3370
- self,
3371
- conn: sqlite3.Connection,
3372
- file_id: str,
3373
- filename: str,
3374
- structure: Dict[str, Any],
3375
- ) -> None:
3376
- for slide in structure.get("slides") or []:
3377
- index = slide.get("index")
3378
- slide_id = f"slide:{_sha256_text(f'{file_id}:slide:{index}')[:24]}"
3379
- title = f"{filename} slide {index}"
3380
- summary = "\n".join(slide.get("texts") or [])[:800]
3381
- self._upsert_node(conn, slide_id, "Slide", title, summary=summary, metadata=slide)
3382
- self._upsert_edge(conn, file_id, slide_id, "has_slide")
3383
- for text in slide.get("texts") or []:
3384
- for topic in _topic_candidates(text, limit=4):
3385
- topic_id = f"topic:{_slug(topic)}"
3386
- self._upsert_node(conn, topic_id, "Topic", topic, metadata={"auto_extracted": True})
3387
- self._upsert_edge(conn, slide_id, topic_id, "discusses", weight=0.6)
3388
-
3389
- for page in structure.get("pages") or []:
3390
- index = page.get("index")
3391
- page_id = f"page:{_sha256_text(f'{file_id}:page:{index}')[:24]}"
3392
- title = f"{filename} page {index}"
3393
- self._upsert_node(conn, page_id, "Page", title, summary=page.get("preview") or "", metadata=page)
3394
- self._upsert_edge(conn, file_id, page_id, "has_page")
3395
- for topic in _topic_candidates(page.get("preview") or "", limit=4):
3396
- topic_id = f"topic:{_slug(topic)}"
3397
- self._upsert_node(conn, topic_id, "Topic", topic, metadata={"auto_extracted": True})
3398
- self._upsert_edge(conn, page_id, topic_id, "discusses", weight=0.6)
3399
-
3400
- for sheet in (structure.get("sheets") or []):
3401
- sheet_title = sheet.get("title")
3402
- sheet_id = f"sheet:{_sha256_text(f'{file_id}:sheet:{sheet_title}')[:24]}"
3403
- self._upsert_node(conn, sheet_id, "Sheet", f"{filename} / {sheet_title}", metadata=sheet)
3404
- self._upsert_edge(conn, file_id, sheet_id, "has_sheet")
3405
-
3406
- for image in (structure.get("images") or []):
3407
- image_key = image.get("sha256") or _sha256_text(json.dumps(image, ensure_ascii=False, sort_keys=True))
3408
- image_id = f"image:{str(image_key)[:24]}"
3409
- title_parts = [filename, "image"]
3410
- if image.get("page"):
3411
- title_parts.append(f"page {image.get('page')}")
3412
- if image.get("name"):
3413
- title_parts.append(str(image.get("name")).split("/")[-1])
3414
- self._upsert_node(conn, image_id, "Image", " / ".join(title_parts), metadata=image)
3415
- self._upsert_edge(conn, file_id, image_id, "contains_image")
3416
-
3417
- def _document_structure(self, path: Path, ext: str) -> Dict[str, Any]:
3418
- try:
3419
- if ext == ".pptx":
3420
- return self._pptx_structure(path)
3421
- if ext == ".pdf":
3422
- return self._pdf_structure(path)
3423
- if ext == ".docx":
3424
- return self._docx_structure(path)
3425
- if ext == ".xlsx":
3426
- return self._xlsx_structure(path)
3427
- except Exception as exc:
3428
- return {"error": str(exc)}
3429
- return {}
3430
-
3431
- def _pptx_structure(self, path: Path) -> Dict[str, Any]:
3432
- result: Dict[str, Any] = {"slides": [], "images": []}
3433
- try:
3434
- from PIL import Image
3435
- from pptx import Presentation
3436
- prs = Presentation(str(path))
3437
- for slide_index, slide in enumerate(prs.slides, start=1):
3438
- slide_info = {"index": slide_index, "shapes": [], "texts": []}
3439
- for shape_index, shape in enumerate(slide.shapes, start=1):
3440
- shape_info = {
3441
- "index": shape_index,
3442
- "name": getattr(shape, "name", ""),
3443
- "shape_type": str(getattr(shape, "shape_type", "")),
3444
- "bbox": {
3445
- "left": int(getattr(shape, "left", 0) or 0),
3446
- "top": int(getattr(shape, "top", 0) or 0),
3447
- "width": int(getattr(shape, "width", 0) or 0),
3448
- "height": int(getattr(shape, "height", 0) or 0),
3449
- },
3450
- }
3451
- if getattr(shape, "has_text_frame", False):
3452
- text = shape.text_frame.text.strip()
3453
- if text:
3454
- shape_info["text"] = text[:1000]
3455
- slide_info["texts"].append(text)
3456
- slide_info["shapes"].append(shape_info)
3457
- result["slides"].append(slide_info)
3458
- with zipfile.ZipFile(path) as zf:
3459
- for name in zf.namelist():
3460
- if not name.startswith("ppt/media/"):
3461
- continue
3462
- data = zf.read(name)
3463
- image_info: Dict[str, Any] = {
3464
- "name": name,
3465
- "bytes": len(data),
3466
- "sha256": _sha256_bytes(data),
3467
- }
3468
- try:
3469
- from io import BytesIO
3470
- with Image.open(BytesIO(data)) as img:
3471
- image_info.update({"width": img.width, "height": img.height, "format": img.format})
3472
- except Exception:
3473
- pass
3474
- result["images"].append(image_info)
3475
- except Exception as exc:
3476
- result["error"] = str(exc)
3477
- return result
3478
-
3479
- def _pdf_structure(self, path: Path) -> Dict[str, Any]:
3480
- result: Dict[str, Any] = {"pages": [], "images": []}
3481
- try:
3482
- import pdfplumber
3483
- with pdfplumber.open(str(path)) as pdf:
3484
- metadata = dict(pdf.metadata or {})
3485
- result["metadata"] = {str(k): str(v) for k, v in metadata.items()}
3486
- for page_index, page in enumerate(pdf.pages, start=1):
3487
- text = page.extract_text() or ""
3488
- page_info = {
3489
- "index": page_index,
3490
- "width": float(page.width or 0),
3491
- "height": float(page.height or 0),
3492
- "chars": len(text),
3493
- "preview": _clean_text(text)[:500],
3494
- "image_count": len(page.images or []),
3495
- }
3496
- result["pages"].append(page_info)
3497
- for image_index, image in enumerate(page.images or [], start=1):
3498
- result["images"].append({
3499
- "page": page_index,
3500
- "index": image_index,
3501
- "name": image.get("name"),
3502
- "width": image.get("width"),
3503
- "height": image.get("height"),
3504
- "bbox": {
3505
- "x0": image.get("x0"),
3506
- "top": image.get("top"),
3507
- "x1": image.get("x1"),
3508
- "bottom": image.get("bottom"),
3509
- },
3510
- })
3511
- except Exception as exc:
3512
- result["error"] = str(exc)
3513
- return result
3514
-
3515
def _docx_structure(self, path: Path) -> Dict[str, Any]:
    """Summarise a Word document.

    Returns the number of non-empty paragraphs, up to 80 headings (style
    name plus text capped at 240 characters) and the table count. Errors are
    not caught here; they propagate to the caller.
    """
    from docx import Document

    document = Document(str(path))
    heading_rows = []
    paragraph_total = 0
    for paragraph in document.paragraphs:
        content = paragraph.text.strip()
        if content:
            paragraph_total += 1
            style_name = getattr(paragraph.style, "name", "")
            # A heading is any paragraph whose style name starts with "heading".
            if style_name.lower().startswith("heading"):
                heading_rows.append({"style": style_name, "text": content[:240]})
    summary: Dict[str, Any] = {
        "paragraphs": paragraph_total,
        "headings": heading_rows[:80],
        "tables": len(document.tables),
    }
    return summary
3529
-
3530
def _xlsx_structure(self, path: Path) -> Dict[str, Any]:
    """List the worksheets of an Excel workbook with their dimensions.

    Returns ``{"sheets": [{"title", "max_row", "max_column"}, ...]}``.
    Errors are not caught here; they propagate to the caller.
    """
    from openpyxl import load_workbook

    wb = load_workbook(str(path), read_only=True, data_only=True)
    try:
        sheets = [
            {"title": ws.title, "max_row": ws.max_row, "max_column": ws.max_column}
            for ws in wb.worksheets
        ]
    finally:
        # A read-only workbook keeps the file open until it is closed
        # explicitly; without this the handle leaked on every call.
        wb.close()
    return {"sheets": sheets}
3537
-
3538
# ── Node types shown in the graph (point = noun) ────────────────────────
# Message / AIResponse / Chunk are stored only for RAG retrieval and are
# hidden from the graph.
_GRAPH_VISIBLE_TYPES = (
    "Computer",  # this computer
    "Drive",  # drive / volume
    "Folder",  # folder
    "File",  # regular file
    "Chat",  # chat session
    "Document",  # document file (PDF, PPT, Word, Excel, image)
    "CodeFile",  # source-code file
    "Spreadsheet",  # Excel / CSV
    "SlideDeck",  # presentation
    "Image",  # image
    "ImageText",  # OCR text
    "Concept",  # concept / idea / technical term
    "Person",  # person
    "Error",  # error / bug
    "Code",  # code / function
    "Feature",  # software feature
    "Task",  # to-do item
    "Decision",  # decision
    # v3.6.0 Knowledge Graph First — expose first-class entities in the graph
    "Source",  # ingestion source (file / URL / browser tab / git)
    "Repository",  # git repository
    "Meeting",  # meeting
    "Organization",  # organization
    "Workflow",  # workflow
    "Agent",  # agent
)
3567
-
3568
def list_documents(self, limit: int = 200) -> Dict[str, Any]:
    """List ingested ``Document`` nodes with their ingest + index state.

    Powers the Files view: every accepted upload and every indexed local
    document becomes a ``Document`` node. A document is reported ``indexed``
    once its retrieval chunks exist (searchable in Chat / Hybrid Search).

    Args:
        limit: Maximum number of documents, clamped to 1..1000 (a falsy
            value means 200).

    Returns:
        ``{"documents": [...], "total": n, "generated_at": iso_timestamp}``,
        newest ``updated_at`` first.
    """
    limit = max(1, min(int(limit or 200), 1000))
    nt, _ = self._read_tables()
    documents: List[Dict[str, Any]] = []
    with self._connect() as conn:
        rows = conn.execute(
            f"SELECT id, title, summary, metadata_json, created_at, updated_at "
            f"FROM {nt} WHERE type='Document' ORDER BY updated_at DESC, id ASC LIMIT ?",
            (limit,),
        ).fetchall()
        for row in rows:
            meta = _safe_loads(row["metadata_json"]) or {}
            extracted = meta.get("extracted") or {}
            node_id = row["id"]
            # Chunks reference their document by id inside metadata_json.
            # Escape LIKE wildcards so an id containing "_" or "%" matches
            # only itself instead of acting as a pattern and over-counting.
            escaped_id = (
                str(node_id)
                .replace("\\", "\\\\")
                .replace("%", "\\%")
                .replace("_", "\\_")
            )
            chunk_count = conn.execute(
                f"SELECT COUNT(*) AS c FROM {nt} WHERE type='Chunk' "
                "AND metadata_json LIKE ? ESCAPE '\\'",
                (f"%{escaped_id}%",),
            ).fetchone()["c"]
            chunk_total = int(chunk_count or 0)
            documents.append({
                "id": node_id,
                "filename": meta.get("filename") or row["title"],
                "ext": meta.get("ext"),
                "mime_type": meta.get("mime_type"),
                "bytes": meta.get("bytes"),
                "sha256": meta.get("sha256"),
                "uploader": meta.get("uploader"),
                "chars": extracted.get("chars"),
                "chunks": chunk_total,
                "indexed": chunk_total > 0,
                "ingest_state": "indexed" if chunk_total > 0 else "ingested",
                "created_at": row["created_at"],
                "updated_at": row["updated_at"],
            })
    return {
        "documents": documents,
        "total": len(documents),
        "generated_at": datetime.now().isoformat(timespec="seconds"),
    }
3612
-
3613
def graph(self, limit: int = 300) -> Dict[str, Any]:
    """Return the visible graph: recent nodes, edges among them, and metrics.

    Selects up to ``limit`` (clamped to 1..2000) nodes whose type is in
    ``_GRAPH_VISIBLE_TYPES``, newest ``updated_at`` first, plus every edge
    whose endpoints are both in that selection. Each node gets an
    ``importance`` score, a per-type normalised ``importance_norm`` and a
    ``graph_metrics`` entry merged into its metadata.

    Returns ``{"nodes": [...], "edges": [...]}``.
    """
    limit = max(1, min(int(limit or 300), 2000))
    # Quoted, comma-separated type list for the SQL IN (...) clauses below;
    # built from the class constant, not from caller input.
    visible = ",".join(f"'{t}'" for t in self._GRAPH_VISIBLE_TYPES)
    nt, et = self._read_tables()
    with self._connect() as conn:
        nodes = [
            {
                "id": row["id"],
                "type": row["type"],
                "title": row["title"],
                "summary": row["summary"],
                "metadata": _safe_loads(row["metadata_json"]),
                "updated_at": row["updated_at"],
            }
            for row in conn.execute(
                f"SELECT id, type, title, summary, metadata_json, updated_at FROM {nt} WHERE type IN ({visible}) ORDER BY updated_at DESC, id ASC LIMIT ?",
                (limit,),
            )
        ]
        node_ids = {node["id"] for node in nodes}
        edges: List[Dict[str, Any]] = []
        if node_ids:
            # Both sub-selects repeat the node query (same filter, order and
            # limit), so only edges between two selected nodes are returned.
            edge_rows = conn.execute(
                f"""
                SELECT id, from_node, to_node, type, weight, metadata_json
                FROM {et}
                WHERE from_node IN (
                    SELECT id FROM {nt} WHERE type IN ({visible})
                    ORDER BY updated_at DESC, id ASC LIMIT ?
                )
                AND to_node IN (
                    SELECT id FROM {nt} WHERE type IN ({visible})
                    ORDER BY updated_at DESC, id ASC LIMIT ?
                )
                ORDER BY weight DESC, created_at DESC, id ASC
                """,
                (limit, limit),
            ).fetchall()
            edges = [
                {
                    "id": row["id"],
                    "from": row["from_node"],
                    "to": row["to_node"],
                    "type": row["type"],
                    "weight": row["weight"],
                    "metadata": _safe_loads(row["metadata_json"]),
                }
                for row in edge_rows
            ]

    degree_map: Dict[str, int] = {}
    now = datetime.now()
    node_by_id = {node["id"]: node for node in nodes}
    # Per-topic accumulator: weighted mention count and distinct conversations.
    topic_metrics: Dict[str, Dict[str, Any]] = {}

    for edge in edges:
        # Degree counts both directions, so each edge adds one to each endpoint.
        degree_map[edge["from"]] = degree_map.get(edge["from"], 0) + 1
        degree_map[edge["to"]] = degree_map.get(edge["to"], 0) + 1
        from_node = node_by_id.get(edge["from"])
        to_node = node_by_id.get(edge["to"])
        if not from_node or not to_node:
            continue
        # Consider each endpoint in turn as the potential Topic node.
        # NOTE(review): "Topic" and "Conversation" do not appear in
        # _GRAPH_VISIBLE_TYPES as listed in this file, so with that list the
        # Topic-specific branches here and below look unreachable — confirm.
        for topic_node, other_node in ((from_node, to_node), (to_node, from_node)):
            if topic_node["type"] != "Topic":
                continue
            metrics = topic_metrics.setdefault(topic_node["id"], {
                "mention_count": 0.0,
                "conversation_ids": set(),
            })
            if edge["type"] in {"mentions", "discusses"}:
                # Each mention counts at least 0.5; a falsy weight counts as 1.0.
                metrics["mention_count"] += max(0.5, float(edge.get("weight") or 1.0))
            other_meta = other_node.get("metadata") or {}
            conversation_id = other_meta.get("conversation_id")
            if other_node["type"] == "Conversation":
                conversation_id = other_node["id"]
            if conversation_id:
                metrics["conversation_ids"].add(str(conversation_id))

    # Highest raw importance seen per node type, used for normalisation.
    type_max_raw: Dict[str, float] = {}
    for node in nodes:
        degree = degree_map.get(node["id"], 0)
        recency = _recency_score(node.get("updated_at"), now=now)
        metrics = {
            "degree": degree,
            "recency_score": round(recency, 4),
        }
        if node["type"] == "Topic":
            topic_stat = topic_metrics.get(node["id"], {})
            mention_count = float(topic_stat.get("mention_count") or 0.0)
            conversation_count = len(topic_stat.get("conversation_ids") or ())
            # Topics: log-damped mentions and conversations, plus recency and degree.
            raw_importance = (
                math.log1p(mention_count) * 2.8
                + math.log1p(conversation_count) * 2.2
                + recency * 1.4
                + math.sqrt(max(0, degree)) * 0.45
            )
            metrics.update({
                "mention_count": round(mention_count, 2),
                "conversation_count": conversation_count,
            })
        else:
            # Other types: log-damped degree plus recency.
            raw_importance = math.log1p(max(0, degree)) * 1.4 + recency * 0.9

        metrics["importance_raw"] = round(raw_importance, 4)
        node["importance"] = round(raw_importance, 4)
        # Temporary unrounded value; removed again in the normalisation pass.
        node["_raw_importance"] = raw_importance
        node["metadata"] = {**(node.get("metadata") or {}), "graph_metrics": metrics}
        type_max_raw[node["type"]] = max(type_max_raw.get(node["type"], 0.0), raw_importance)

    for node in nodes:
        # The 0.0001 floor avoids dividing by zero when a type's maximum is 0.
        max_raw = max(type_max_raw.get(node["type"], 0.0), 0.0001)
        importance_norm = min(1.0, (node.get("_raw_importance") or 0.0) / max_raw)
        node["importance_norm"] = round(importance_norm, 4)
        node["metadata"]["graph_metrics"]["importance_norm"] = node["importance_norm"]
        node.pop("_raw_importance", None)
    return {"nodes": nodes, "edges": edges}
3729
-
3730
def search(self, query: str, limit: int = 30) -> Dict[str, Any]:
    """Find nodes whose title, summary or metadata contain the query.

    First matches the whole query as a substring. When that yields fewer than
    ``limit`` rows, it also matches each topic candidate extracted from the
    query and merges the results by node id. Results are ranked by the number
    of candidate terms hit, then a boost for document-like types, then
    ``updated_at``. ``limit`` is clamped to 1..100.

    Returns ``{"query": <normalised query>, "matches": [...]}``.
    """
    query = str(query or "").strip()
    whole_pattern = f"%{query}%"
    limit = max(1, min(int(limit or 30), 100))
    nt, _ = self._read_tables()
    with self._connect() as conn:
        found = []
        if query:
            found = conn.execute(
                f"""
                SELECT id, type, title, summary, metadata_json, updated_at
                FROM {nt}
                WHERE title LIKE ? OR summary LIKE ? OR metadata_json LIKE ?
                ORDER BY updated_at DESC, id ASC
                LIMIT ?
                """,
                (whole_pattern, whole_pattern, whole_pattern, limit),
            ).fetchall()

        if len(found) < limit:
            terms = _topic_candidates(query, limit=8)
            if terms:
                per_term = "(title LIKE ? OR summary LIKE ? OR metadata_json LIKE ?)"
                where_clause = " OR ".join(per_term for _term in terms)
                bindings: List[str] = [f"%{term}%" for term in terms for _column in range(3)]
                fallback = conn.execute(
                    f"""
                    SELECT id, type, title, summary, metadata_json, updated_at
                    FROM {nt}
                    WHERE {where_clause}
                    ORDER BY updated_at DESC, id ASC
                    LIMIT ?
                    """,
                    (*bindings, limit * 3),
                ).fetchall()
                # Whole-query hits keep their position; fallback rows are
                # appended only for ids not already present.
                merged = {row["id"]: row for row in found}
                for row in fallback:
                    if row["id"] not in merged:
                        merged[row["id"]] = row
                found = list(merged.values())

    scoring_terms = set(_topic_candidates(query, limit=12))
    boosted_types = {
        "Decision", "Task", "File", "Document", "CodeFile",
        "Spreadsheet", "SlideDeck", "Image", "ImageText", "Page", "Slide",
    }

    def rank(row: sqlite3.Row) -> tuple:
        searchable = f"{row['title']} {row['summary']} {row['metadata_json']}".lower()
        term_hits = sum(1 for term in scoring_terms if term.lower() in searchable)
        boost = 1 if row["type"] in boosted_types else 0
        return (term_hits, boost, row["updated_at"] or "")

    ranked = sorted(found, key=rank, reverse=True)[:limit]
    matches = []
    for row in ranked:
        matches.append({
            "id": row["id"],
            "type": row["type"],
            "title": row["title"],
            "summary": row["summary"],
            "metadata": _safe_loads(row["metadata_json"]),
            "updated_at": row["updated_at"],
        })
    return {"query": query, "matches": matches}
3797
-
3798
def context_for_query(self, query: str, limit: int = 6) -> str:
    """Build a compact, graph-backed context block for chat generation.

    Uses :meth:`search` first. When that finds nothing, it looks up nodes
    whose title or metadata contain one of the query's topic candidates
    (three rows per topic). Each match becomes one line of the form
    ``- [type] title | source=... | summary``. Returns ``""`` for an empty
    query or when nothing matches.
    """
    query = str(query or "").strip()
    if not query:
        return ""

    matches = self.search(query, limit).get("matches", [])
    if not matches:
        topics = _topic_candidates(query, limit=4)
        if topics:
            nt, _ = self._read_tables()
            with self._connect() as conn:
                candidates = []
                for topic in topics:
                    topic_pattern = f"%{topic}%"
                    candidates.extend(conn.execute(
                        f"""
                        SELECT id, type, title, summary, metadata_json
                        FROM {nt}
                        WHERE title LIKE ? OR metadata_json LIKE ?
                        ORDER BY updated_at DESC, id ASC
                        LIMIT 3
                        """,
                        (topic_pattern, topic_pattern),
                    ).fetchall())
            # De-duplicate by node id, keeping first occurrences, up to limit.
            used_ids = set()
            matches = []
            for row in candidates:
                if row["id"] not in used_ids:
                    used_ids.add(row["id"])
                    matches.append({
                        "id": row["id"],
                        "type": row["type"],
                        "title": row["title"],
                        "summary": row["summary"],
                        "metadata": _safe_loads(row["metadata_json"]),
                    })
                    if len(matches) >= limit:
                        break

    def describe(match: Dict[str, Any]) -> str:
        meta = match.get("metadata") or {}
        # First available of these metadata keys labels the source; the node
        # id is the fallback.
        origin = match["id"]
        for key in ("relative_path", "filename", "conversation_id", "source"):
            if meta.get(key):
                origin = meta.get(key)
                break
        digest = _clean_text(match.get("summary") or "")[:700]
        return f"- [{match['type']}] {match['title']} | source={origin} | {digest}"

    return "\n".join(describe(match) for match in matches[:limit])
3849
-
3850
- def neighbors(self, node_id: str) -> Dict[str, Any]:
3851
- """Return direct neighbors (1-hop) of a node."""
3852
- nt, et = self._read_tables()
3853
- with self._connect() as conn:
3854
- edge_rows = conn.execute(
3855
- f"SELECT from_node, to_node, type, weight FROM {et} WHERE from_node=? OR to_node=? ORDER BY id ASC",
3856
- (node_id, node_id),
3857
- ).fetchall()
3858
- neighbor_ids: set = set()
3859
- edges = []
3860
- for row in edge_rows:
3861
- neighbor_ids.add(row["from_node"])
3862
- neighbor_ids.add(row["to_node"])
3863
- edges.append({"from": row["from_node"], "to": row["to_node"], "type": row["type"], "weight": row["weight"]})
3864
- neighbor_ids.discard(node_id)
3865
- nodes = []
3866
- if neighbor_ids:
3867
- placeholders = ",".join("?" * len(neighbor_ids))
3868
- nodes = [
3869
- {
3870
- "id": row["id"],
3871
- "type": row["type"],
3872
- "title": row["title"],
3873
- "summary": row["summary"],
3874
- "metadata": _safe_loads(row["metadata_json"]),
3875
- }
3876
- for row in conn.execute(
3877
- f"SELECT id, type, title, summary, metadata_json FROM {nt} WHERE id IN ({placeholders}) ORDER BY id ASC",
3878
- list(neighbor_ids),
3879
- )
3880
- ]
3881
- return {"node_id": node_id, "neighbors": nodes, "edges": edges}
3882
-
3883
- def get_node(self, node_id: str) -> Dict[str, Any]:
3884
- node_id = str(node_id or "").strip()
3885
- if not node_id:
3886
- raise ValueError("node_id required")
3887
- nt, et = self._read_tables()
3888
- with self._connect() as conn:
3889
- row = conn.execute(
3890
- f"""
3891
- SELECT id, type, title, summary, metadata_json, updated_at
3892
- FROM {nt}
3893
- WHERE id=?
3894
- """,
3895
- (node_id,),
3896
- ).fetchone()
3897
- if not row:
3898
- raise ValueError(f"graph node not found: {node_id}")
3899
- degree = conn.execute(
3900
- f"SELECT COUNT(*) AS c FROM {et} WHERE from_node=? OR to_node=?",
3901
- (node_id, node_id),
3902
- ).fetchone()["c"]
3903
- return {
3904
- "id": row["id"],
3905
- "type": row["type"],
3906
- "title": row["title"],
3907
- "summary": row["summary"],
3908
- "metadata": _safe_loads(row["metadata_json"]),
3909
- "updated_at": row["updated_at"],
3910
- "degree": degree,
3911
- }
3912
-
3913
- def relationship_search(
3914
- self,
3915
- *,
3916
- query: str = "",
3917
- node_id: str = "",
3918
- relationship_type: str = "",
3919
- limit: int = 30,
3920
- ) -> Dict[str, Any]:
3921
- query = str(query or "").strip()
3922
- node_id = str(node_id or "").strip()
3923
- relationship_type = str(relationship_type or "").strip()
3924
- limit = max(1, min(int(limit or 30), 200))
3925
- nt, et = self._read_tables()
3926
- where = []
3927
- params: List[Any] = []
3928
- if node_id:
3929
- where.append("(e.from_node=? OR e.to_node=?)")
3930
- params.extend([node_id, node_id])
3931
- if relationship_type:
3932
- where.append("e.type LIKE ?")
3933
- params.append(f"%{relationship_type}%")
3934
- if query:
3935
- where.append(
3936
- "(e.type LIKE ? OR e.metadata_json LIKE ? OR src.title LIKE ? OR dst.title LIKE ? OR src.summary LIKE ? OR dst.summary LIKE ?)"
3937
- )
3938
- params.extend([f"%{query}%"] * 6)
3939
- where_sql = "WHERE " + " AND ".join(where) if where else ""
3940
- with self._connect() as conn:
3941
- rows = conn.execute(
3942
- f"""
3943
- SELECT
3944
- e.id, e.from_node, e.to_node, e.type, e.weight, e.metadata_json, e.created_at,
3945
- src.type AS source_type, src.title AS source_title, src.summary AS source_summary,
3946
- src.metadata_json AS source_metadata,
3947
- dst.type AS target_type, dst.title AS target_title, dst.summary AS target_summary,
3948
- dst.metadata_json AS target_metadata
3949
- FROM {et} e
3950
- JOIN {nt} src ON src.id=e.from_node
3951
- JOIN {nt} dst ON dst.id=e.to_node
3952
- {where_sql}
3953
- ORDER BY e.weight DESC, e.created_at DESC, e.id ASC
3954
- LIMIT ?
3955
- """,
3956
- (*params, limit),
3957
- ).fetchall()
3958
- return {
3959
- "query": query,
3960
- "node_id": node_id,
3961
- "relationship_type": relationship_type,
3962
- "relationships": [
3963
- {
3964
- "id": row["id"],
3965
- "type": row["type"],
3966
- "weight": row["weight"],
3967
- "metadata": _safe_loads(row["metadata_json"]),
3968
- "created_at": row["created_at"],
3969
- "source": {
3970
- "id": row["from_node"],
3971
- "type": row["source_type"],
3972
- "title": row["source_title"],
3973
- "summary": row["source_summary"],
3974
- "metadata": _safe_loads(row["source_metadata"]),
3975
- },
3976
- "target": {
3977
- "id": row["to_node"],
3978
- "type": row["target_type"],
3979
- "title": row["target_title"],
3980
- "summary": row["target_summary"],
3981
- "metadata": _safe_loads(row["target_metadata"]),
3982
- },
3983
- }
3984
- for row in rows
3985
- ],
3986
- }
3987
-
3988
- def traverse(self, node_id: str, *, depth: int = 1, limit: int = 100) -> Dict[str, Any]:
3989
- node_id = str(node_id or "").strip()
3990
- if not node_id:
3991
- raise ValueError("node_id required")
3992
- depth = max(0, min(int(depth or 1), 4))
3993
- limit = max(1, min(int(limit or 100), 500))
3994
- nt, et = self._read_tables()
3995
- visited = {node_id}
3996
- frontier = {node_id}
3997
- edges_by_id: Dict[str, Dict[str, Any]] = {}
3998
- with self._connect() as conn:
3999
- for _ in range(depth):
4000
- if not frontier or len(visited) >= limit:
4001
- break
4002
- placeholders = ",".join("?" * len(frontier))
4003
- rows = conn.execute(
4004
- f"""
4005
- SELECT id, from_node, to_node, type, weight, metadata_json
4006
- FROM {et}
4007
- WHERE from_node IN ({placeholders}) OR to_node IN ({placeholders})
4008
- ORDER BY weight DESC, id ASC
4009
- LIMIT ?
4010
- """,
4011
- (*frontier, *frontier, limit * 3),
4012
- ).fetchall()
4013
- next_frontier = set()
4014
- for row in rows:
4015
- edges_by_id[row["id"]] = {
4016
- "id": row["id"],
4017
- "from": row["from_node"],
4018
- "to": row["to_node"],
4019
- "type": row["type"],
4020
- "weight": row["weight"],
4021
- "metadata": _safe_loads(row["metadata_json"]),
4022
- }
4023
- for candidate in (row["from_node"], row["to_node"]):
4024
- if candidate not in visited and len(visited) < limit:
4025
- visited.add(candidate)
4026
- next_frontier.add(candidate)
4027
- frontier = next_frontier
4028
- placeholders = ",".join("?" * len(visited))
4029
- node_rows = conn.execute(
4030
- f"""
4031
- SELECT id, type, title, summary, metadata_json, updated_at
4032
- FROM {nt}
4033
- WHERE id IN ({placeholders})
4034
- ORDER BY updated_at DESC, id ASC
4035
- """,
4036
- list(visited),
4037
- ).fetchall()
4038
- return {
4039
- "root": node_id,
4040
- "depth": depth,
4041
- "nodes": [
4042
- {
4043
- "id": row["id"],
4044
- "type": row["type"],
4045
- "title": row["title"],
4046
- "summary": row["summary"],
4047
- "metadata": _safe_loads(row["metadata_json"]),
4048
- "updated_at": row["updated_at"],
4049
- }
4050
- for row in node_rows
4051
- ],
4052
- "edges": list(edges_by_id.values()),
4053
- }
4054
-
4055
- def _iter_vector_source_items(
4056
- self,
4057
- conn: sqlite3.Connection,
4058
- *,
4059
- include_nodes: bool = True,
4060
- include_chunks: bool = True,
4061
- ) -> List[Dict[str, Any]]:
4062
- items: List[Dict[str, Any]] = []
4063
- if include_nodes:
4064
- for row in conn.execute(
4065
- """
4066
- SELECT id, type, title, summary, metadata_json
4067
- FROM nodes
4068
- WHERE type <> 'Chunk'
4069
- ORDER BY updated_at DESC, id ASC
4070
- """
4071
- ).fetchall():
4072
- metadata = _safe_loads(row["metadata_json"])
4073
- text = self._vector_text_for_node(
4074
- title=row["title"],
4075
- summary=row["summary"] or "",
4076
- metadata=metadata,
4077
- )
4078
- if text:
4079
- items.append({
4080
- "item_id": row["id"],
4081
- "item_type": "node",
4082
- "source_node": row["id"],
4083
- "text": text,
4084
- "metadata": {"node_type": row["type"], **metadata},
4085
- })
4086
- if include_chunks:
4087
- for row in conn.execute(
4088
- """
4089
- SELECT c.id, c.source_node AS parent_source_node, c.text, c.metadata_json
4090
- FROM chunks c
4091
- JOIN nodes n ON n.id=c.id
4092
- ORDER BY c.created_at DESC, c.id ASC
4093
- """
4094
- ).fetchall():
4095
- metadata = _safe_loads(row["metadata_json"])
4096
- text = _clean_text(row["text"] or "")
4097
- if text:
4098
- items.append({
4099
- "item_id": row["id"],
4100
- "item_type": "chunk",
4101
- "source_node": row["id"],
4102
- "text": text,
4103
- "metadata": {**metadata, "parent_source_node": row["parent_source_node"]},
4104
- })
4105
- return items
4106
-
4107
- def rebuild_vector_index(
4108
- self,
4109
- *,
4110
- full: bool = False,
4111
- include_nodes: bool = True,
4112
- include_chunks: bool = True,
4113
- ) -> Dict[str, Any]:
4114
- """Rebuild the derived vector index without mutating graph content."""
4115
- op_id = f"vector-op:{_sha256_text(f'{time.time()}:{os.getpid()}')[:24]}"
4116
- requested_at = _now()
4117
- started = time.perf_counter()
4118
- try:
4119
- with self._connect() as conn:
4120
- conn.execute(
4121
- """
4122
- INSERT INTO vector_index_operations(
4123
- id, operation, status, requested_at, started_at, metadata_json
4124
- )
4125
- VALUES (?, ?, 'running', ?, ?, ?)
4126
- """,
4127
- (
4128
- op_id,
4129
- "rebuild_full" if full else "rebuild_incremental",
4130
- requested_at,
4131
- requested_at,
4132
- _json({"include_nodes": include_nodes, "include_chunks": include_chunks}),
4133
- ),
4134
- )
4135
- if full:
4136
- filters = []
4137
- if include_nodes:
4138
- filters.append("'node'")
4139
- if include_chunks:
4140
- filters.append("'chunk'")
4141
- if filters:
4142
- conn.execute(f"DELETE FROM vector_embeddings WHERE item_type IN ({','.join(filters)})")
4143
- items = self._iter_vector_source_items(
4144
- conn,
4145
- include_nodes=include_nodes,
4146
- include_chunks=include_chunks,
4147
- )
4148
- indexed = skipped = 0
4149
- for item in items:
4150
- changed = self._upsert_vector_item(conn, **item)
4151
- if changed:
4152
- indexed += 1
4153
- else:
4154
- skipped += 1
4155
- duration_ms = round((time.perf_counter() - started) * 1000, 2)
4156
- conn.execute(
4157
- """
4158
- UPDATE vector_index_operations
4159
- SET status='completed', completed_at=?, items_total=?,
4160
- items_indexed=?, items_skipped=?, metadata_json=?
4161
- WHERE id=?
4162
- """,
4163
- (
4164
- _now(),
4165
- len(items),
4166
- indexed,
4167
- skipped,
4168
- _json({
4169
- "include_nodes": include_nodes,
4170
- "include_chunks": include_chunks,
4171
- "duration_ms": duration_ms,
4172
- "embedding_model": self._embedding_model.model_id,
4173
- "embedding_dim": self._embedding_model.dim,
4174
- }),
4175
- op_id,
4176
- ),
4177
- )
4178
- return {
4179
- "status": "completed",
4180
- "operation_id": op_id,
4181
- "full": bool(full),
4182
- "items_total": len(items),
4183
- "items_indexed": indexed,
4184
- "items_skipped": skipped,
4185
- "duration_ms": duration_ms,
4186
- "embedding_model": self._embedding_model.model_id,
4187
- "embedding_dim": self._embedding_model.dim,
4188
- }
4189
- except Exception as exc:
4190
- duration_ms = round((time.perf_counter() - started) * 1000, 2)
4191
- with self._connect() as conn:
4192
- conn.execute(
4193
- """
4194
- INSERT INTO vector_index_operations(
4195
- id, operation, status, requested_at, started_at, completed_at,
4196
- error_message, metadata_json
4197
- )
4198
- VALUES (?, ?, 'failed', ?, ?, ?, ?, ?)
4199
- ON CONFLICT(id) DO UPDATE SET
4200
- status='failed',
4201
- completed_at=excluded.completed_at,
4202
- error_message=excluded.error_message,
4203
- metadata_json=excluded.metadata_json
4204
- """,
4205
- (
4206
- op_id,
4207
- "rebuild_full" if full else "rebuild_incremental",
4208
- requested_at,
4209
- requested_at,
4210
- _now(),
4211
- str(exc),
4212
- _json({"duration_ms": duration_ms}),
4213
- ),
4214
- )
4215
- raise
4216
-
4217
- def index_status(self) -> Dict[str, Any]:
4218
- with self._connect() as conn:
4219
- vector_counts = {
4220
- row["item_type"]: row["count"]
4221
- for row in conn.execute(
4222
- "SELECT item_type, COUNT(*) AS count FROM vector_embeddings GROUP BY item_type"
4223
- )
4224
- }
4225
- source_items = self._iter_vector_source_items(conn)
4226
- vector_rows = {
4227
- row["item_id"]: row
4228
- for row in conn.execute(
4229
- """
4230
- SELECT item_id, text_hash, embedding_dim, embedding_model, indexed_at
4231
- FROM vector_embeddings
4232
- """
4233
- ).fetchall()
4234
- }
4235
- latest_rows = conn.execute(
4236
- """
4237
- SELECT id, operation, status, requested_at, started_at, completed_at,
4238
- items_total, items_indexed, items_skipped, error_message, metadata_json
4239
- FROM vector_index_operations
4240
- ORDER BY requested_at DESC, id DESC
4241
- LIMIT 5
4242
- """
4243
- ).fetchall()
4244
- missing = stale = ready = 0
4245
- for item in source_items:
4246
- vector_row = vector_rows.get(item["item_id"])
4247
- expected_hash = _sha256_text(_clean_text(item["text"]))
4248
- if not vector_row:
4249
- missing += 1
4250
- elif (
4251
- vector_row["text_hash"] != expected_hash
4252
- or vector_row["embedding_dim"] != self._embedding_model.dim
4253
- or vector_row["embedding_model"] != self._embedding_model.model_id
4254
- ):
4255
- stale += 1
4256
- else:
4257
- ready += 1
4258
- pending = missing + stale
4259
- return {
4260
- "status": "ready" if pending == 0 else "needs_reindex",
4261
- "storage": {
4262
- "db_path": str(self.db_path),
4263
- "backend": "sqlite",
4264
- "embedding_model": self._embedding_model.model_id,
4265
- "embedding_dim": self._embedding_model.dim,
4266
- },
4267
- "source_items": len(source_items),
4268
- "indexed_items": sum(vector_counts.values()),
4269
- "ready_items": ready,
4270
- "missing_items": missing,
4271
- "stale_items": stale,
4272
- "pending_items": pending,
4273
- "by_item_type": vector_counts,
4274
- "operations": [
4275
- {
4276
- "id": row["id"],
4277
- "operation": row["operation"],
4278
- "status": row["status"],
4279
- "requested_at": row["requested_at"],
4280
- "started_at": row["started_at"],
4281
- "completed_at": row["completed_at"],
4282
- "items_total": row["items_total"],
4283
- "items_indexed": row["items_indexed"],
4284
- "items_skipped": row["items_skipped"],
4285
- "error_message": row["error_message"],
4286
- "metadata": _safe_loads(row["metadata_json"]),
4287
- }
4288
- for row in latest_rows
4289
- ],
4290
- }
4291
-
4292
- def vector_search(
4293
- self,
4294
- query: str,
4295
- *,
4296
- limit: int = 30,
4297
- min_score: float = 0.0,
4298
- max_candidates: int = 10_000,
4299
- ) -> Dict[str, Any]:
4300
- query = str(query or "").strip()
4301
- limit = max(1, min(int(limit or 30), 100))
4302
- min_score = float(min_score or 0.0)
4303
- if not query:
4304
- return {"query": query, "matches": []}
4305
- query_vector = self._embedding_model.embed(query)
4306
- max_candidates = max(limit, min(int(max_candidates or 10_000), 50_000))
4307
- with self._connect() as conn:
4308
- rows = conn.execute(
4309
- """
4310
- SELECT
4311
- ve.item_id, ve.item_type, ve.source_node, ve.embedding,
4312
- ve.embedding_dim, ve.embedding_model, ve.metadata_json AS vector_metadata,
4313
- n.type AS node_type, n.title AS node_title, n.summary AS node_summary,
4314
- n.metadata_json AS node_metadata, n.updated_at AS node_updated_at,
4315
- c.text AS chunk_text, c.source_node AS parent_node_id,
4316
- pn.type AS parent_type, pn.title AS parent_title,
4317
- pn.summary AS parent_summary, pn.metadata_json AS parent_metadata,
4318
- pn.updated_at AS parent_updated_at
4319
- FROM vector_embeddings ve
4320
- LEFT JOIN nodes n ON n.id=ve.source_node
4321
- LEFT JOIN chunks c ON c.id=ve.item_id
4322
- LEFT JOIN nodes pn ON pn.id=c.source_node
4323
- WHERE ve.embedding_model=? AND ve.embedding_dim=?
4324
- ORDER BY ve.indexed_at DESC
4325
- LIMIT ?
4326
- """,
4327
- (self._embedding_model.model_id, self._embedding_model.dim, max_candidates),
4328
- ).fetchall()
4329
- scored = []
4330
- for row in rows:
4331
- vector = self._embedding_model.decode(row["embedding"], row["embedding_dim"])
4332
- score = self._embedding_model.similarity(query_vector, vector)
4333
- if score < min_score:
4334
- continue
4335
- is_chunk = row["item_type"] == "chunk"
4336
- summary = row["chunk_text"] if is_chunk and row["chunk_text"] else row["node_summary"]
4337
- parent_metadata = _safe_loads(row["parent_metadata"])
4338
- node_metadata = _safe_loads(row["node_metadata"])
4339
- scored.append({
4340
- "id": row["item_id"],
4341
- "node_id": row["parent_node_id"] if is_chunk and row["parent_node_id"] else row["source_node"],
4342
- "item_type": row["item_type"],
4343
- "type": "Chunk" if is_chunk else row["node_type"],
4344
- "title": row["parent_title"] if is_chunk and row["parent_title"] else row["node_title"],
4345
- "summary": _clean_text(summary or "")[:1000],
4346
- "score": round(float(score), 6),
4347
- "metadata": {
4348
- **(parent_metadata if is_chunk else node_metadata),
4349
- "vector": _safe_loads(row["vector_metadata"]),
4350
- "parent_node_id": row["parent_node_id"],
4351
- "parent_type": row["parent_type"],
4352
- },
4353
- "updated_at": row["parent_updated_at"] if is_chunk and row["parent_updated_at"] else row["node_updated_at"],
4354
- })
4355
- scored.sort(key=lambda item: (item["score"], item.get("updated_at") or ""), reverse=True)
4356
- return {
4357
- "query": query,
4358
- "embedding_model": self._embedding_model.model_id,
4359
- "embedding_dim": self._embedding_model.dim,
4360
- "matches": scored[:limit],
4361
- }
4362
-
4363
- def delete_conversation(self, conversation_id: str) -> Dict[str, Any]:
4364
- conversation_id = str(conversation_id or "").strip()
4365
- if not conversation_id:
4366
- return {"status": "skipped", "removed_nodes": 0}
4367
- conv_id = f"conversation:{_slug(conversation_id)}"
4368
- with self._connect() as conn:
4369
- direct_ids = [
4370
- row["to_node"]
4371
- for row in conn.execute(
4372
- "SELECT to_node FROM edges WHERE from_node=? AND type='contains'",
4373
- (conv_id,),
4374
- )
4375
- ]
4376
- remove_ids = set(direct_ids)
4377
- for source_id in list(direct_ids):
4378
- for row in conn.execute(
4379
- """
4380
- SELECT to_node FROM edges
4381
- WHERE from_node=? AND type IN ('has_chunk', 'implies', 'contains_signal', 'has_page', 'has_slide', 'has_sheet', 'contains_image')
4382
- """,
4383
- (source_id,),
4384
- ):
4385
- remove_ids.add(row["to_node"])
4386
- remove_ids.add(conv_id)
4387
- for node_id in remove_ids:
4388
- conn.execute("DELETE FROM nodes WHERE id=?", (node_id,))
4389
- if KGStoreV2 is not None:
4390
- conn.execute("DELETE FROM nodes_v2 WHERE id=?", (node_id,)) # edges_v2 cascade
4391
- conn.execute(
4392
- """
4393
- DELETE FROM nodes
4394
- WHERE type='Topic'
4395
- AND id NOT IN (SELECT to_node FROM edges)
4396
- AND id NOT IN (SELECT from_node FROM edges)
4397
- """
4398
- )
4399
- if KGStoreV2 is not None:
4400
- conn.execute(
4401
- """
4402
- DELETE FROM nodes_v2
4403
- WHERE legacy_type='Topic'
4404
- AND id NOT IN (SELECT target FROM edges_v2)
4405
- AND id NOT IN (SELECT source FROM edges_v2)
4406
- """
4407
- )
4408
- return {"status": "ok", "conversation_id": conversation_id, "removed_nodes": len(remove_ids)}
4409
-
4410
- def clear_all(self) -> Dict[str, Any]:
4411
- with self._connect() as conn:
4412
- counts = {
4413
- "nodes": conn.execute("SELECT COUNT(*) AS c FROM nodes").fetchone()["c"],
4414
- "edges": conn.execute("SELECT COUNT(*) AS c FROM edges").fetchone()["c"],
4415
- "chunks": conn.execute("SELECT COUNT(*) AS c FROM chunks").fetchone()["c"],
4416
- "knowledge_sources": conn.execute("SELECT COUNT(*) AS c FROM knowledge_sources").fetchone()["c"],
4417
- "local_file_index": conn.execute("SELECT COUNT(*) AS c FROM local_file_index").fetchone()["c"],
4418
- }
4419
- conn.execute("DELETE FROM local_file_index")
4420
- conn.execute("DELETE FROM knowledge_sources")
4421
- conn.execute("DELETE FROM chunks")
4422
- conn.execute("DELETE FROM edges")
4423
- conn.execute("DELETE FROM nodes")
4424
- if KGStoreV2 is not None:
4425
- conn.execute("DELETE FROM edges_v2")
4426
- conn.execute("DELETE FROM nodes_v2")
4427
- if self.blob_dir.exists():
4428
- shutil.rmtree(self.blob_dir, ignore_errors=True)
4429
- self.blob_dir.mkdir(parents=True, exist_ok=True)
4430
- return {"status": "ok", "removed": counts}
4431
-
4432
- def stats(self) -> Dict[str, Any]:
4433
- nt, et = self._read_tables()
4434
- with self._connect() as conn:
4435
- node_counts = {
4436
- row["type"]: row["count"]
4437
- for row in conn.execute(f"SELECT type, COUNT(*) AS count FROM {nt} GROUP BY type")
4438
- }
4439
- edge_counts = {
4440
- row["type"]: row["count"]
4441
- for row in conn.execute(f"SELECT type, COUNT(*) AS count FROM {et} GROUP BY type")
4442
- }
4443
- local_sources = conn.execute("SELECT COUNT(*) AS c FROM knowledge_sources").fetchone()["c"]
4444
- local_file_status = {
4445
- row["status"]: row["count"]
4446
- for row in conn.execute("SELECT status, COUNT(*) AS count FROM local_file_index GROUP BY status")
4447
- }
4448
- v2 = None
4449
- if KGStoreV2 is not None:
4450
- try:
4451
- v2 = KGStoreV2(self.db_path).stats()
4452
- except Exception as e:
4453
- v2 = {"available": False, "error": str(e)}
4454
- return {
4455
- "db_path": str(self.db_path),
4456
- "schema_version": GRAPH_SCHEMA_VERSION,
4457
- "v2_schema_available": KGStoreV2 is not None,
4458
- "nodes": node_counts,
4459
- "edges": edge_counts,
4460
- "local_sources": local_sources,
4461
- "local_file_status": local_file_status,
4462
- "v2": v2,
4463
- }
4464
-
4465
- def search_for_document_generation(self, query: str, limit: int = 10) -> List[Dict[str, Any]]:
4466
- """Hybrid retrieval optimized for document generation.
4467
-
4468
- Scoring: 0.5*text_relevance + 0.3*graph_relationship + 0.2*recency
4469
- Returns nodes with rich context for document generation prompts.
4470
- """
4471
- query = str(query or "").strip()
4472
- if not query:
4473
- return []
4474
- limit = max(1, min(int(limit or 10), 50))
4475
- terms = _topic_candidates(query, limit=12)
4476
- now = datetime.now()
4477
- nt, et = self._read_tables()
4478
-
4479
- with self._connect() as conn:
4480
- candidate_rows = []
4481
- seen_ids = set()
4482
-
4483
- if query:
4484
- q = f"%{query}%"
4485
- rows = conn.execute(
4486
- f"""
4487
- SELECT id, type, title, summary, metadata_json, updated_at
4488
- FROM {nt}
4489
- WHERE (title LIKE ? OR summary LIKE ? OR metadata_json LIKE ?)
4490
- AND type IN ('Document', 'File', 'CodeFile', 'SlideDeck',
4491
- 'Spreadsheet', 'Image', 'ImageText', 'Chat',
4492
- 'Decision', 'Task', 'Concept', 'Feature',
4493
- 'Page', 'Slide')
4494
- ORDER BY updated_at DESC, id ASC
4495
- LIMIT ?
4496
- """,
4497
- (q, q, q, limit * 5),
4498
- ).fetchall()
4499
- for row in rows:
4500
- if row["id"] not in seen_ids:
4501
- seen_ids.add(row["id"])
4502
- candidate_rows.append(row)
4503
-
4504
- for term in terms:
4505
- t = f"%{term}%"
4506
- rows = conn.execute(
4507
- f"""
4508
- SELECT id, type, title, summary, metadata_json, updated_at
4509
- FROM {nt}
4510
- WHERE (title LIKE ? OR summary LIKE ? OR metadata_json LIKE ?)
4511
- AND type IN ('Document', 'File', 'CodeFile', 'SlideDeck',
4512
- 'Spreadsheet', 'Image', 'ImageText', 'Chat',
4513
- 'Decision', 'Task', 'Concept', 'Feature',
4514
- 'Page', 'Slide')
4515
- ORDER BY updated_at DESC, id ASC
4516
- LIMIT ?
4517
- """,
4518
- (t, t, t, limit * 3),
4519
- ).fetchall()
4520
- for row in rows:
4521
- if row["id"] not in seen_ids:
4522
- seen_ids.add(row["id"])
4523
- candidate_rows.append(row)
4524
-
4525
- scored_results = []
4526
- for row in candidate_rows:
4527
- haystack = f"{row['title']} {row['summary']} {row['metadata_json']}".lower()
4528
-
4529
- text_hits = sum(1 for term in terms if term.lower() in haystack)
4530
- text_score = min(1.0, text_hits / max(len(terms), 1))
4531
-
4532
- edge_count = conn.execute(
4533
- f"SELECT COUNT(*) AS c FROM {et} WHERE from_node=? OR to_node=?",
4534
- (row["id"], row["id"]),
4535
- ).fetchone()["c"]
4536
- graph_score = min(1.0, math.log1p(edge_count) / 4.0)
4537
-
4538
- recency = _recency_score(row["updated_at"], now=now, half_life_days=14.0)
4539
-
4540
- doc_type_boost = 1.2 if row["type"] in (
4541
- "Document", "File", "SlideDeck", "Decision",
4542
- ) else 1.0
4543
-
4544
- hybrid_score = (
4545
- 0.5 * text_score
4546
- + 0.3 * graph_score
4547
- + 0.2 * recency
4548
- ) * doc_type_boost
4549
-
4550
- meta = _safe_loads(row["metadata_json"])
4551
- neighbor_concepts = []
4552
- neighbor_rows = conn.execute(
4553
- f"""
4554
- SELECT n.title, n.type FROM {et} e
4555
- JOIN {nt} n ON n.id = CASE WHEN e.from_node = ? THEN e.to_node ELSE e.from_node END
4556
- WHERE (e.from_node = ? OR e.to_node = ?)
4557
- AND n.type IN ('Concept', 'Feature', 'Decision', 'Task')
4558
- LIMIT 8
4559
- """,
4560
- (row["id"], row["id"], row["id"]),
4561
- ).fetchall()
4562
- for nr in neighbor_rows:
4563
- neighbor_concepts.append({"title": nr["title"], "type": nr["type"]})
4564
-
4565
- scored_results.append({
4566
- "id": row["id"],
4567
- "type": row["type"],
4568
- "title": row["title"],
4569
- "summary": row["summary"],
4570
- "metadata": meta,
4571
- "updated_at": row["updated_at"],
4572
- "hybrid_score": round(hybrid_score, 4),
4573
- "scores": {
4574
- "text": round(text_score, 4),
4575
- "graph": round(graph_score, 4),
4576
- "recency": round(recency, 4),
4577
- },
4578
- "related_concepts": neighbor_concepts,
4579
- })
4580
-
4581
- scored_results.sort(key=lambda x: x["hybrid_score"], reverse=True)
4582
- return scored_results[:limit]
4583
-
4584
- def multi_hop_context(self, node_ids: List[str], max_hops: int = 2) -> Dict[str, Any]:
4585
- """Multi-hop graph traversal from seed nodes for richer context."""
4586
- visited_nodes = set()
4587
- visited_edges = set()
4588
- all_nodes = []
4589
- all_edges = []
4590
- frontier = set(node_ids)
4591
- nt, et = self._read_tables()
4592
-
4593
- with self._connect() as conn:
4594
- for hop in range(max_hops):
4595
- if not frontier:
4596
- break
4597
- next_frontier = set()
4598
- for nid in frontier:
4599
- if nid in visited_nodes:
4600
- continue
4601
- visited_nodes.add(nid)
4602
- row = conn.execute(
4603
- f"SELECT id, type, title, summary, metadata_json, updated_at FROM {nt} WHERE id=?",
4604
- (nid,),
4605
- ).fetchone()
4606
- if row:
4607
- all_nodes.append({
4608
- "id": row["id"], "type": row["type"],
4609
- "title": row["title"], "summary": row["summary"],
4610
- "metadata": _safe_loads(row["metadata_json"]),
4611
- "hop": hop,
4612
- })
4613
- edge_rows = conn.execute(
4614
- f"""
4615
- SELECT id, from_node, to_node, type, weight
4616
- FROM {et} WHERE from_node=? OR to_node=?
4617
- ORDER BY id ASC
4618
- """,
4619
- (nid, nid),
4620
- ).fetchall()
4621
- for er in edge_rows:
4622
- if er["id"] not in visited_edges:
4623
- visited_edges.add(er["id"])
4624
- all_edges.append({
4625
- "from": er["from_node"], "to": er["to_node"],
4626
- "type": er["type"], "weight": er["weight"],
4627
- })
4628
- other = er["to_node"] if er["from_node"] == nid else er["from_node"]
4629
- if other not in visited_nodes:
4630
- next_frontier.add(other)
4631
- frontier = next_frontier
4632
-
4633
- return {"nodes": all_nodes, "edges": all_edges}
27
+ from latticeai.brain.store import KnowledgeGraphStore
28
+
29
+ __all__ = [
30
+ "KnowledgeGraphStore",
31
+ "GRAPH_SCHEMA_VERSION",
32
+ "EDGE_VERB",
33
+ "_PROJECTION_VERSION",
34
+ "_KG_DB_FORMAT_VERSION",
35
+ "set_llm_router",
36
+ "_slug",
37
+ "_extract_concepts",
38
+ "_extract_concepts_rules",
39
+ "_extract_triples",
40
+ "_extract_triples_rules",
41
+ ]