ltcai 4.3.3 → 4.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138)
  1. package/README.md +53 -20
  2. package/docs/CHANGELOG.md +122 -0
  3. package/docs/V4_4_0_EXTRACTION_REPORT.md +239 -0
  4. package/docs/V4_5_0_GEMMA_RUNTIME_COMPATIBILITY_REPORT.md +49 -0
  5. package/docs/V4_5_0_GRAPH_UX_REPORT.md +34 -0
  6. package/docs/V4_5_0_MODEL_RUNTIME_UX_REPORT.md +40 -0
  7. package/docs/V4_5_0_ONBOARDING_REPORT.md +31 -0
  8. package/docs/V4_5_0_PRODUCT_EXPERIENCE_RECOVERY_REPORT.md +49 -0
  9. package/docs/V4_5_0_VALIDATION_REPORT.md +60 -0
  10. package/docs/V4_5_1_GRAPH_EXPERIENCE_REPORT.md +33 -0
  11. package/docs/V4_5_1_MODEL_EXPERIENCE_REPORT.md +37 -0
  12. package/docs/V4_5_1_NAVIGATION_REPORT.md +37 -0
  13. package/docs/V4_5_1_ONBOARDING_REPORT.md +29 -0
  14. package/docs/V4_5_1_PRODUCT_REIMAGINING_REPORT.md +61 -0
  15. package/docs/V4_5_1_RC_ARTIFACTS.md +44 -0
  16. package/docs/V4_5_1_UX_REPORT.md +45 -0
  17. package/docs/V4_5_1_VALIDATION_REPORT.md +54 -0
  18. package/docs/V4_5_1_VISUAL_DESIGN_REPORT.md +30 -0
  19. package/docs/V4_DIGITAL_BRAIN_RECOVERY.md +16 -16
  20. package/docs/architecture.md +8 -4
  21. package/frontend/src/App.tsx +152 -91
  22. package/frontend/src/api/client.ts +83 -1
  23. package/frontend/src/components/FirstRunGuide.tsx +99 -0
  24. package/frontend/src/components/primitives.tsx +131 -25
  25. package/frontend/src/components/ui/badge.tsx +2 -2
  26. package/frontend/src/components/ui/button.tsx +7 -7
  27. package/frontend/src/components/ui/card.tsx +5 -5
  28. package/frontend/src/components/ui/input.tsx +1 -1
  29. package/frontend/src/components/ui/textarea.tsx +1 -1
  30. package/frontend/src/pages/Act.tsx +58 -28
  31. package/frontend/src/pages/Ask.tsx +51 -19
  32. package/frontend/src/pages/Brain.tsx +60 -42
  33. package/frontend/src/pages/Capture.tsx +24 -24
  34. package/frontend/src/pages/Library.tsx +222 -32
  35. package/frontend/src/pages/System.tsx +56 -34
  36. package/frontend/src/routes.ts +15 -13
  37. package/frontend/src/store/appStore.ts +8 -1
  38. package/frontend/src/styles.css +666 -36
  39. package/lattice_brain/__init__.py +38 -23
  40. package/lattice_brain/_kg_common.py +11 -1
  41. package/lattice_brain/context.py +212 -2
  42. package/lattice_brain/conversations.py +234 -1
  43. package/lattice_brain/discovery.py +11 -1
  44. package/lattice_brain/documents.py +11 -1
  45. package/lattice_brain/graph/__init__.py +28 -0
  46. package/lattice_brain/graph/_kg_common.py +1123 -0
  47. package/lattice_brain/graph/curator.py +473 -0
  48. package/lattice_brain/graph/discovery.py +1455 -0
  49. package/lattice_brain/graph/documents.py +218 -0
  50. package/lattice_brain/graph/identity.py +175 -0
  51. package/lattice_brain/graph/ingest.py +644 -0
  52. package/lattice_brain/graph/network.py +205 -0
  53. package/lattice_brain/graph/projection.py +571 -0
  54. package/lattice_brain/graph/provenance.py +401 -0
  55. package/lattice_brain/graph/retrieval.py +1341 -0
  56. package/lattice_brain/graph/schema.py +640 -0
  57. package/lattice_brain/graph/store.py +237 -0
  58. package/lattice_brain/graph/write_master.py +225 -0
  59. package/lattice_brain/identity.py +11 -13
  60. package/lattice_brain/ingest.py +11 -1
  61. package/lattice_brain/ingestion.py +318 -0
  62. package/lattice_brain/memory.py +100 -1
  63. package/lattice_brain/network.py +11 -1
  64. package/lattice_brain/portability.py +431 -0
  65. package/lattice_brain/projection.py +11 -1
  66. package/lattice_brain/provenance.py +11 -1
  67. package/lattice_brain/retrieval.py +11 -1
  68. package/lattice_brain/runtime/__init__.py +32 -0
  69. package/lattice_brain/runtime/agent_runtime.py +569 -0
  70. package/lattice_brain/runtime/hooks.py +754 -0
  71. package/lattice_brain/runtime/multi_agent.py +795 -0
  72. package/lattice_brain/schema.py +11 -1
  73. package/lattice_brain/store.py +10 -2
  74. package/lattice_brain/workflow.py +461 -0
  75. package/lattice_brain/write_master.py +11 -1
  76. package/latticeai/__init__.py +1 -1
  77. package/latticeai/api/agents.py +2 -2
  78. package/latticeai/api/browser.py +1 -1
  79. package/latticeai/api/chat.py +1 -1
  80. package/latticeai/api/computer_use.py +1 -1
  81. package/latticeai/api/hooks.py +2 -2
  82. package/latticeai/api/mcp.py +1 -1
  83. package/latticeai/api/models.py +107 -18
  84. package/latticeai/api/tools.py +1 -1
  85. package/latticeai/api/workflow_designer.py +2 -2
  86. package/latticeai/app_factory.py +4 -4
  87. package/latticeai/brain/__init__.py +24 -6
  88. package/latticeai/brain/_kg_common.py +11 -1117
  89. package/latticeai/brain/context.py +12 -208
  90. package/latticeai/brain/conversations.py +12 -231
  91. package/latticeai/brain/discovery.py +13 -1451
  92. package/latticeai/brain/documents.py +13 -214
  93. package/latticeai/brain/identity.py +11 -169
  94. package/latticeai/brain/ingest.py +13 -640
  95. package/latticeai/brain/memory.py +12 -97
  96. package/latticeai/brain/network.py +12 -200
  97. package/latticeai/brain/projection.py +13 -567
  98. package/latticeai/brain/provenance.py +13 -397
  99. package/latticeai/brain/retrieval.py +13 -1337
  100. package/latticeai/brain/schema.py +12 -635
  101. package/latticeai/brain/store.py +13 -233
  102. package/latticeai/brain/write_master.py +13 -221
  103. package/latticeai/core/agent.py +1 -1
  104. package/latticeai/core/agent_registry.py +2 -2
  105. package/latticeai/core/builtin_hooks.py +2 -2
  106. package/latticeai/core/graph_curator.py +6 -468
  107. package/latticeai/core/hooks.py +6 -749
  108. package/latticeai/core/marketplace.py +1 -1
  109. package/latticeai/core/model_compat.py +250 -0
  110. package/latticeai/core/multi_agent.py +6 -790
  111. package/latticeai/core/workflow_engine.py +6 -456
  112. package/latticeai/core/workspace_os.py +1 -1
  113. package/latticeai/models/router.py +136 -32
  114. package/latticeai/services/agent_runtime.py +6 -564
  115. package/latticeai/services/ingestion.py +6 -313
  116. package/latticeai/services/kg_portability.py +6 -426
  117. package/latticeai/services/model_catalog.py +2 -2
  118. package/latticeai/services/model_recommendation.py +8 -1
  119. package/latticeai/services/model_runtime.py +18 -3
  120. package/latticeai/services/platform_runtime.py +3 -3
  121. package/latticeai/services/run_executor.py +1 -1
  122. package/latticeai/services/upload_service.py +1 -1
  123. package/p_reinforce.py +1 -1
  124. package/package.json +1 -1
  125. package/scripts/build_frontend_assets.mjs +12 -1
  126. package/scripts/bump_version.py +1 -1
  127. package/scripts/wheel_smoke.py +7 -0
  128. package/src-tauri/Cargo.lock +1 -1
  129. package/src-tauri/Cargo.toml +1 -1
  130. package/src-tauri/tauri.conf.json +1 -1
  131. package/static/app/asset-manifest.json +5 -5
  132. package/static/app/assets/index-3G8qcrIS.js +336 -0
  133. package/static/app/assets/index-3G8qcrIS.js.map +1 -0
  134. package/static/app/assets/index-C0wYZp7k.css +2 -0
  135. package/static/app/index.html +2 -2
  136. package/static/app/assets/index-CHHal8Zl.css +0 -2
  137. package/static/app/assets/index-pdzil9ac.js +0 -333
  138. package/static/app/assets/index-pdzil9ac.js.map +0 -1
@@ -1,1455 +1,17 @@
1
- from __future__ import annotations
1
+ """Deprecated shim: physically moved to lattice_brain.graph.discovery.
2
2
 
3
- # ruff: noqa: F403,F405
3
+ Kept only for the compatibility window. The module aliases itself to the
4
+ physical module so identity, singletons, and monkeypatching are preserved.
5
+ """
4
6
 
5
- from ._kg_common import * # noqa: F403,F401
7
+ import sys
8
+ import warnings
6
9
 
10
+ import lattice_brain.graph.discovery as _impl
7
11
 
8
- class KnowledgeGraphDiscoveryMixin:
9
- def discover_local_roots(self) -> Dict[str, Any]:
10
- """Return safe, cross-platform starting points for structure browsing."""
11
- os_type = _current_os_type()
12
- home = Path.home().expanduser()
13
- roots: List[Dict[str, Any]] = []
14
- seen: set = set()
15
-
16
- def add(
17
- label: str,
18
- path: Path,
19
- kind: str,
20
- *,
21
- recommended: bool = True,
22
- warning: Optional[str] = None,
23
- ) -> None:
24
- try:
25
- resolved = path.expanduser().resolve()
26
- except OSError:
27
- resolved = path.expanduser()
28
- key = str(resolved)
29
- if key in seen or not resolved.exists():
30
- return
31
- seen.add(key)
32
- roots.append(
33
- {
34
- "id": f"{kind}:{_path_fingerprint(resolved)}",
35
- "label": label,
36
- "path": key,
37
- "kind": kind,
38
- "recommended": recommended,
39
- "warning": warning or _root_warning(resolved, os_type),
40
- }
41
- )
42
-
43
- add("홈", home, "home", warning=_root_warning(home, os_type))
44
- for name, label in (
45
- ("Documents", "문서"),
46
- ("Desktop", "데스크탑"),
47
- ("Downloads", "다운로드"),
48
- ("Pictures", "사진"),
49
- ("Projects", "프로젝트"),
50
- ):
51
- add(label, home / name, name.lower())
52
-
53
- if os_type == "macos":
54
- volumes = Path("/Volumes")
55
- if volumes.exists():
56
- try:
57
- for volume in sorted(
58
- volumes.iterdir(), key=lambda p: p.name.lower()
59
- ):
60
- add(volume.name, volume, "volume", recommended=False)
61
- except OSError:
62
- pass
63
- elif os_type == "windows":
64
- for letter in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
65
- drive = Path(f"{letter}:\\")
66
- if drive.exists():
67
- add(
68
- f"{letter}: 드라이브",
69
- drive,
70
- "drive",
71
- recommended=(letter != "C"),
72
- )
73
- for env_name, label in (
74
- ("OneDrive", "OneDrive"),
75
- ("OneDriveCommercial", "OneDrive"),
76
- ):
77
- raw = os.environ.get(env_name)
78
- if raw:
79
- add(label, Path(raw), "cloud", recommended=False)
80
- elif os_type == "linux":
81
- for base in (Path("/mnt"), Path("/media")):
82
- add(str(base), base, "mounts", recommended=False)
83
- try:
84
- if base.exists():
85
- for mounted in sorted(
86
- base.iterdir(), key=lambda p: p.name.lower()
87
- ):
88
- add(mounted.name, mounted, "volume", recommended=False)
89
- except OSError:
90
- pass
91
-
92
- return {
93
- "os_type": os_type,
94
- "computer": platform.node() or "local",
95
- "roots": roots,
96
- "privacy_notice": "처음에는 드라이브와 폴더 구조만 확인하며, 파일 내용은 사용자가 동의한 뒤에만 읽습니다.",
97
- }
98
-
99
- def preview_local_tree(self, path: Path, *, max_items: int = 200) -> Dict[str, Any]:
100
- """List one folder level using metadata only; file contents are not read."""
101
- root = Path(path).expanduser().resolve()
102
- if not root.exists():
103
- raise ValueError(f"경로가 존재하지 않습니다: {path}")
104
- if not root.is_dir():
105
- raise ValueError(f"폴더가 아닙니다: {path}")
106
-
107
- os_type = _current_os_type()
108
- max_items = max(1, min(int(max_items or 200), 1000))
109
- items: List[Dict[str, Any]] = []
110
- inaccessible = 0
111
- try:
112
- children = sorted(
113
- root.iterdir(), key=lambda p: (not p.is_dir(), p.name.lower())
114
- )
115
- except PermissionError as exc:
116
- return {
117
- "path": str(root),
118
- "items": [],
119
- "error": f"접근 권한 없음: {exc}",
120
- "privacy_notice": "현재 단계에서는 파일 내용을 읽지 않고, 폴더와 파일의 이름/크기/수정일만 확인합니다.",
121
- }
122
-
123
- for child in children[:max_items]:
124
- try:
125
- is_dir = child.is_dir()
126
- stat = child.stat()
127
- reason = (
128
- _excluded_directory_reason(child, root=root, os_type=os_type)
129
- if is_dir
130
- else _sensitive_file_reason(child, root=root)
131
- )
132
- items.append(
133
- {
134
- "name": child.name,
135
- "path": str(child),
136
- "type": "directory" if is_dir else "file",
137
- "extension": "" if is_dir else child.suffix.lower(),
138
- "size_bytes": None if is_dir else stat.st_size,
139
- "modified_at": _safe_iso_from_stat_mtime(stat.st_mtime),
140
- "hidden": _is_hidden_path(child, root),
141
- "accessible": True,
142
- "excluded_reason": reason,
143
- }
144
- )
145
- except PermissionError:
146
- inaccessible += 1
147
- items.append(
148
- {
149
- "name": child.name,
150
- "path": str(child),
151
- "type": "unknown",
152
- "accessible": False,
153
- "excluded_reason": "permission_denied",
154
- }
155
- )
156
- except OSError as exc:
157
- inaccessible += 1
158
- items.append(
159
- {
160
- "name": child.name,
161
- "path": str(child),
162
- "type": "unknown",
163
- "accessible": False,
164
- "excluded_reason": str(exc),
165
- }
166
- )
167
-
168
- return {
169
- "path": str(root),
170
- "os_type": os_type,
171
- "items": items,
172
- "truncated": len(children) > max_items,
173
- "inaccessible": inaccessible,
174
- "warning": _root_warning(root, os_type),
175
- "privacy_notice": "현재 단계에서는 파일 내용을 읽지 않고, 폴더와 파일의 이름/크기/수정일만 확인합니다.",
176
- }
177
-
178
- def _iter_local_scan_entries(
179
- self, root: Path, *, max_files: int
180
- ) -> Iterable[Dict[str, Any]]:
181
- os_type = _current_os_type()
182
- stack = [root]
183
- files_seen = 0
184
- while stack:
185
- current = stack.pop()
186
- try:
187
- children = sorted(
188
- current.iterdir(), key=lambda p: (not p.is_dir(), p.name.lower())
189
- )
190
- except PermissionError as exc:
191
- yield {
192
- "kind": "inaccessible_dir",
193
- "path": current,
194
- "reason": f"permission_denied: {exc}",
195
- }
196
- continue
197
- except OSError as exc:
198
- yield {"kind": "inaccessible_dir", "path": current, "reason": str(exc)}
199
- continue
200
-
201
- for child in children:
202
- if child.is_symlink():
203
- yield {"kind": "excluded", "path": child, "reason": "symlink"}
204
- continue
205
- try:
206
- if child.is_dir():
207
- reason = _excluded_directory_reason(
208
- child, root=root, os_type=os_type
209
- )
210
- if reason:
211
- yield {
212
- "kind": "excluded_dir",
213
- "path": child,
214
- "reason": reason,
215
- }
216
- else:
217
- stack.append(child)
218
- continue
219
- if not child.is_file():
220
- yield {
221
- "kind": "excluded",
222
- "path": child,
223
- "reason": "not_regular_file",
224
- }
225
- continue
226
- stat = child.stat()
227
- except PermissionError as exc:
228
- yield {
229
- "kind": "inaccessible_file",
230
- "path": child,
231
- "reason": f"permission_denied: {exc}",
232
- }
233
- continue
234
- except OSError as exc:
235
- yield {
236
- "kind": "inaccessible_file",
237
- "path": child,
238
- "reason": str(exc),
239
- }
240
- continue
241
-
242
- files_seen += 1
243
- if files_seen > max_files:
244
- yield {
245
- "kind": "limit_reached",
246
- "path": child,
247
- "reason": "max_files",
248
- }
249
- return
250
- yield {"kind": "file", "path": child, "stat": stat}
251
-
252
- def _local_file_decision(
253
- self, path: Path, root: Path, stat: os.stat_result
254
- ) -> Dict[str, Any]:
255
- ext = path.suffix.lower()
256
- category = _file_category(ext)
257
- parser_type = _parser_type_for_category(category, ext)
258
- sensitive_reason = _sensitive_file_reason(path, root=root)
259
- if sensitive_reason:
260
- return {
261
- "status": "sensitive_blocked",
262
- "reason": sensitive_reason,
263
- "category": category,
264
- "parser_type": parser_type,
265
- "indexable": False,
266
- }
267
- if category == "unsupported":
268
- return {
269
- "status": "unsupported",
270
- "reason": "unsupported_extension",
271
- "category": category,
272
- "parser_type": parser_type,
273
- "indexable": False,
274
- }
275
- limit = _size_limit_for_category(category)
276
- if stat.st_size > limit:
277
- return {
278
- "status": "too_large",
279
- "reason": f"size>{limit}",
280
- "category": category,
281
- "parser_type": parser_type,
282
- "indexable": False,
283
- }
284
- return {
285
- "status": "pending",
286
- "reason": "",
287
- "category": category,
288
- "parser_type": parser_type,
289
- "indexable": True,
290
- }
291
-
292
- def audit_local_folder(
293
- self, path: Path, *, include_ocr: bool = False, max_files: int = 50_000
294
- ) -> Dict[str, Any]:
295
- """Safety-check a folder using metadata only; file bodies are not read."""
296
- root = Path(path).expanduser().resolve()
297
- if not root.exists():
298
- raise ValueError(f"경로가 존재하지 않습니다: {path}")
299
- if not root.is_dir():
300
- raise ValueError(f"폴더가 아닙니다: {path}")
301
-
302
- os_type = _current_os_type()
303
- max_files = max(1, min(int(max_files or 50_000), 200_000))
304
- status_counts: Counter = Counter()
305
- category_counts: Counter = Counter()
306
- extension_counts: Counter = Counter()
307
- allowed_samples: List[Dict[str, Any]] = []
308
- excluded_samples: List[Dict[str, Any]] = []
309
- total_files = 0
310
- readable_files = 0
311
- inaccessible = 0
312
- excluded_dirs = 0
313
- limit_reached = False
314
-
315
- for entry in self._iter_local_scan_entries(root, max_files=max_files):
316
- kind = entry["kind"]
317
- path_obj = entry["path"]
318
- if kind == "limit_reached":
319
- limit_reached = True
320
- break
321
- if kind == "excluded_dir":
322
- excluded_dirs += 1
323
- if len(excluded_samples) < 25:
324
- excluded_samples.append(
325
- _sample_file(
326
- path_obj, root, "excluded", entry.get("reason", "")
327
- )
328
- )
329
- continue
330
- if kind in {"inaccessible_dir", "inaccessible_file"}:
331
- inaccessible += 1
332
- status_counts["failed"] += 1
333
- if len(excluded_samples) < 25:
334
- excluded_samples.append(
335
- _sample_file(path_obj, root, "failed", entry.get("reason", ""))
336
- )
337
- continue
338
- if kind == "excluded":
339
- status_counts["excluded"] += 1
340
- if len(excluded_samples) < 25:
341
- excluded_samples.append(
342
- _sample_file(
343
- path_obj, root, "excluded", entry.get("reason", "")
344
- )
345
- )
346
- continue
347
- if kind != "file":
348
- continue
349
-
350
- total_files += 1
351
- stat = entry["stat"]
352
- decision = self._local_file_decision(path_obj, root, stat)
353
- status = decision["status"]
354
- category = decision["category"]
355
- ext = path_obj.suffix.lower() or "(none)"
356
- category_counts[category] += 1
357
- extension_counts[ext] += 1
358
- if decision["indexable"]:
359
- readable_files += 1
360
- status_counts["readable"] += 1
361
- if len(allowed_samples) < 25:
362
- allowed_samples.append(_sample_file(path_obj, root, "readable"))
363
- else:
364
- status_counts[status] += 1
365
- if len(excluded_samples) < 25:
366
- excluded_samples.append(
367
- _sample_file(path_obj, root, status, decision["reason"])
368
- )
369
-
370
- doc_weight = (
371
- category_counts["pdf"] * 1.4
372
- + category_counts["document"] * 0.9
373
- + category_counts["slide_deck"] * 1.0
374
- )
375
- sheet_weight = category_counts["spreadsheet"] * 0.6
376
- ocr_weight = category_counts["image"] * (1.8 if include_ocr else 0.1)
377
- estimated_seconds = round(
378
- readable_files * 0.04 + doc_weight + sheet_weight + ocr_weight, 1
379
- )
380
-
381
- return {
382
- "path": str(root),
383
- "source_id": f"source:{_path_fingerprint(root)}",
384
- "os_type": os_type,
385
- "drive_id": _drive_id_for_path(root),
386
- "warning": _root_warning(root, os_type),
387
- "privacy_notice": "현재 단계에서는 파일 내용을 읽지 않고, 폴더와 파일의 이름/크기/수정일만 확인합니다.",
388
- "include_ocr_requested": bool(include_ocr),
389
- "summary": {
390
- "total_files": total_files,
391
- "readable_files": readable_files,
392
- "excluded_files": int(
393
- status_counts["excluded"]
394
- + status_counts["sensitive_blocked"]
395
- + status_counts["too_large"]
396
- + status_counts["unsupported"]
397
- ),
398
- "sensitive_files": int(status_counts["sensitive_blocked"]),
399
- "too_large_files": int(status_counts["too_large"]),
400
- "unsupported_files": int(status_counts["unsupported"]),
401
- "image_ocr_candidates": int(category_counts["image"]),
402
- "inaccessible_items": inaccessible,
403
- "excluded_dirs": excluded_dirs,
404
- "estimated_seconds": estimated_seconds,
405
- "storage_root": str(self.db_path.parent),
406
- "limit_reached": limit_reached,
407
- },
408
- "by_status": dict(status_counts),
409
- "by_category": dict(category_counts),
410
- "by_extension": dict(extension_counts.most_common(40)),
411
- "allowed_samples": allowed_samples,
412
- "excluded_samples": excluded_samples,
413
- "consent_required": {
414
- "knowledge_source": True,
415
- "image_ocr": bool(category_counts["image"]),
416
- "watch": True,
417
- "sensitive_files_default_excluded": True,
418
- },
419
- }
420
-
421
- def local_sources(self) -> Dict[str, Any]:
422
- with self._connect() as conn:
423
- sources = [
424
- {
425
- "id": row["id"],
426
- "root_path": row["root_path"],
427
- "os_type": row["os_type"],
428
- "drive_id": row["drive_id"],
429
- "label": row["label"],
430
- "status": row["status"],
431
- "include_ocr": bool(row["include_ocr"]),
432
- "watch_enabled": bool(row["watch_enabled"]),
433
- "consent": _safe_loads(row["consent_json"]),
434
- "created_at": row["created_at"],
435
- "updated_at": row["updated_at"],
436
- "last_scanned_at": row["last_scanned_at"],
437
- }
438
- for row in conn.execute(
439
- """
440
- SELECT id, root_path, os_type, drive_id, label, status, include_ocr,
441
- watch_enabled, consent_json, created_at, updated_at, last_scanned_at
442
- FROM knowledge_sources
443
- ORDER BY updated_at DESC, id ASC
444
- """
445
- )
446
- ]
447
- status_rows = conn.execute(
448
- "SELECT source_id, status, COUNT(*) AS count FROM local_file_index GROUP BY source_id, status"
449
- ).fetchall()
450
- counts: Dict[str, Dict[str, int]] = {}
451
- for row in status_rows:
452
- counts.setdefault(row["source_id"], {})[row["status"]] = row["count"]
453
- for source in sources:
454
- source["file_status"] = counts.get(source["id"], {})
455
- return {"sources": sources}
456
-
457
- def set_local_source_watch(self, source_id: str, enabled: bool) -> Dict[str, Any]:
458
- source_id = str(source_id or "").strip()
459
- if not source_id:
460
- raise ValueError("source_id required")
461
- with self._connect() as conn:
462
- row = conn.execute(
463
- "SELECT id FROM knowledge_sources WHERE id=?",
464
- (source_id,),
465
- ).fetchone()
466
- if not row:
467
- raise ValueError(f"knowledge source not found: {source_id}")
468
- conn.execute(
469
- "UPDATE knowledge_sources SET watch_enabled=?, updated_at=? WHERE id=?",
470
- (1 if enabled else 0, _now(), source_id),
471
- )
472
- return {"source_id": source_id, "watch_enabled": bool(enabled)}
473
-
474
- def remove_local_source(self, source_id: str) -> Dict[str, Any]:
475
- """Remove one approved local source and its derived graph projection.
476
-
477
- This is intentionally non-destructive for user files: only the LatticeAI
478
- index rows, graph nodes, edges, and chunks derived from the source are
479
- removed. The original folder and files are never touched.
480
- """
481
- source_id = str(source_id or "").strip()
482
- if not source_id:
483
- raise ValueError("source_id required")
484
- with self._connect() as conn:
485
- source = conn.execute(
486
- "SELECT id, root_path FROM knowledge_sources WHERE id=?",
487
- (source_id,),
488
- ).fetchone()
489
- if not source:
490
- raise ValueError(f"knowledge source not found: {source_id}")
491
- rows = conn.execute(
492
- "SELECT graph_node_id FROM local_file_index WHERE source_id=? AND graph_node_id IS NOT NULL",
493
- (source_id,),
494
- ).fetchall()
495
- graph_node_ids = [
496
- row["graph_node_id"] for row in rows if row["graph_node_id"]
497
- ]
498
- for graph_node_id in graph_node_ids:
499
- self._delete_local_file_graph(conn, graph_node_id)
500
- conn.execute("DELETE FROM local_file_index WHERE source_id=?", (source_id,))
501
- conn.execute("DELETE FROM knowledge_sources WHERE id=?", (source_id,))
502
- self._cleanup_local_graph_orphans(conn, source_id)
503
- return {
504
- "source_id": source_id,
505
- "root_path": source["root_path"],
506
- "removed_graph_nodes": len(graph_node_ids),
507
- }
508
-
509
- def _extract_local_file_text(
510
- self, path: Path, category: str, *, include_ocr: bool
511
- ) -> Tuple[str, Dict[str, Any]]:
512
- ext = path.suffix.lower()
513
- meta: Dict[str, Any] = {"parser": _parser_type_for_category(category, ext)}
514
- text = ""
515
- if category in {"text", "code"} or ext == ".csv":
516
- text = path.read_text(encoding="utf-8", errors="replace")
517
- elif ext == ".pdf":
518
- import pdfplumber
519
-
520
- with pdfplumber.open(str(path)) as pdf:
521
- meta["pages"] = len(pdf.pages)
522
- text = "\n\n".join((page.extract_text() or "") for page in pdf.pages)
523
- elif ext == ".docx":
524
- from docx import Document
525
-
526
- doc = Document(str(path))
527
- paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
528
- table_lines = []
529
- for table in doc.tables:
530
- for row in table.rows:
531
- cells = [_clean_text(cell.text) for cell in row.cells]
532
- if any(cells):
533
- table_lines.append("\t".join(cells))
534
- meta["paragraphs"] = len(paragraphs)
535
- meta["tables"] = len(doc.tables)
536
- meta["table_rows"] = len(table_lines)
537
- text = "\n\n".join([*paragraphs, *table_lines])
538
- elif ext == ".xlsx":
539
- from openpyxl import load_workbook
540
-
541
- wb = load_workbook(str(path), read_only=True, data_only=True)
542
- rows_all = []
543
- non_empty_rows = 0
544
- non_empty_cells = 0
545
- char_count = 0
546
- for ws in wb.worksheets:
547
- sheet_rows = []
548
- for row in ws.iter_rows(values_only=True):
549
- cells = [
550
- str(cell).strip() if cell is not None else "" for cell in row
551
- ]
552
- if not any(cells):
553
- continue
554
- line = "\t".join(cells)
555
- non_empty_rows += 1
556
- non_empty_cells += sum(1 for cell in cells if cell)
557
- sheet_rows.append(line)
558
- char_count += len(line) + 1
559
- if char_count > 200_000:
560
- break
561
- if sheet_rows:
562
- rows_all.append(f"[Sheet: {ws.title}]")
563
- rows_all.extend(sheet_rows)
564
- if char_count > 200_000:
565
- break
566
- meta["sheets"] = len(wb.worksheets)
567
- meta["rows"] = non_empty_rows
568
- meta["cells"] = non_empty_cells
569
- text = "\n".join(rows_all)
570
- elif ext == ".pptx":
571
- from pptx import Presentation
572
-
573
- prs = Presentation(str(path))
574
- slides_text = []
575
- for index, slide in enumerate(prs.slides, 1):
576
- parts = []
577
- for shape in slide.shapes:
578
- if getattr(shape, "has_text_frame", False):
579
- slide_text = shape.text_frame.text.strip()
580
- if slide_text:
581
- parts.append(slide_text)
582
- if parts:
583
- slides_text.append(f"[Slide {index}]\n" + "\n".join(parts))
584
- meta["slides"] = len(prs.slides)
585
- meta["text_slides"] = len(slides_text)
586
- text = "\n\n".join(slides_text)
587
- elif category == "image":
588
- from PIL import Image
589
-
590
- with Image.open(str(path)) as image:
591
- meta.update(
592
- {
593
- "width": image.width,
594
- "height": image.height,
595
- "format": image.format,
596
- "mode": image.mode,
597
- "ocr_enabled": bool(include_ocr),
598
- }
599
- )
600
- if include_ocr:
601
- try:
602
- import pytesseract
603
-
604
- text = pytesseract.image_to_string(image)
605
- meta["ocr_chars"] = len(text)
606
- except (
607
- Exception
608
- ) as exc: # pragma: no cover - depends on local OCR runtime
609
- meta["ocr_error"] = str(exc)
610
- text = ""
611
- return text[:200_000], meta
612
-
613
- def _ensure_local_hierarchy(
614
- self,
615
- conn: sqlite3.Connection,
616
- *,
617
- source_id: str,
618
- root: Path,
619
- file_path: Path,
620
- os_type: str,
621
- drive_id: str,
622
- ) -> str:
623
- computer_label = platform.node() or "내 컴퓨터"
624
- computer_id = f"computer:{_slug(computer_label)}"
625
- drive_node_id = f"drive:{_sha256_text(f'{os_type}:{drive_id}')[:24]}"
626
- root_folder_id = f"folder:{_sha256_text(f'{source_id}:root')[:24]}"
627
- self._upsert_node(
628
- conn, computer_id, "Computer", computer_label, metadata={"os_type": os_type}
629
- )
630
- self._upsert_node(
631
- conn,
632
- drive_node_id,
633
- "Drive",
634
- drive_id,
635
- metadata={"os_type": os_type, "drive_id": drive_id},
636
- )
637
- self._upsert_edge(
638
- conn,
639
- computer_id,
640
- drive_node_id,
641
- "포함함",
642
- metadata={"source": "local_scan"},
643
- )
644
- self._upsert_node(
645
- conn,
646
- root_folder_id,
647
- "Folder",
648
- root.name or str(root),
649
- summary=str(root),
650
- metadata={"source_id": source_id, "path": str(root), "root": True},
651
- )
652
- self._upsert_edge(
653
- conn,
654
- drive_node_id,
655
- root_folder_id,
656
- "포함함",
657
- metadata={"source": "local_scan"},
658
- )
659
-
660
- try:
661
- relative_parent = file_path.parent.relative_to(root)
662
- except ValueError:
663
- relative_parent = Path()
664
- parent_id = root_folder_id
665
- current_path = root
666
- for part in relative_parent.parts:
667
- current_path = current_path / part
668
- folder_id = (
669
- f"folder:{_sha256_text(f'{source_id}:{current_path.as_posix()}')[:24]}"
670
- )
671
- self._upsert_node(
672
- conn,
673
- folder_id,
674
- "Folder",
675
- part,
676
- summary=str(current_path),
677
- metadata={
678
- "source_id": source_id,
679
- "path": str(current_path),
680
- "root": False,
681
- },
682
- )
683
- self._upsert_edge(
684
- conn, parent_id, folder_id, "포함함", metadata={"source": "local_scan"}
685
- )
686
- parent_id = folder_id
687
- return parent_id
688
-
689
- def _upsert_local_file_index(
690
- self,
691
- conn: sqlite3.Connection,
692
- *,
693
- source_id: str,
694
- root: Path,
695
- file_path: Path,
696
- stat: Optional[os.stat_result],
697
- os_type: str,
698
- drive_id: str,
699
- status: str,
700
- parser_type: str,
701
- sha256: Optional[str] = None,
702
- graph_node_id: Optional[str] = None,
703
- error_message: Optional[str] = None,
704
- metadata: Optional[Dict[str, Any]] = None,
705
- ) -> str:
706
- try:
707
- relative_path = file_path.relative_to(root).as_posix()
708
- except ValueError:
709
- relative_path = file_path.name
710
- index_id = f"local-index:{_sha256_text(f'{source_id}:{relative_path}')[:24]}"
711
- now = _now()
712
- size = stat.st_size if stat else None
713
- modified_at = _safe_iso_from_stat_mtime(stat.st_mtime) if stat else ""
714
- conn.execute(
715
- """
716
- INSERT INTO local_file_index(
717
- id, source_id, os_type, drive_id, root_path, file_path, relative_path,
718
- file_name, extension, size_bytes, modified_at, sha256, last_scanned_at,
719
- last_indexed_at, parser_type, status, error_message, graph_node_id,
720
- deleted, metadata_json
721
- )
722
- VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
723
- ON CONFLICT(source_id, relative_path) DO UPDATE SET
724
- os_type=excluded.os_type,
725
- drive_id=excluded.drive_id,
726
- root_path=excluded.root_path,
727
- file_path=excluded.file_path,
728
- file_name=excluded.file_name,
729
- extension=excluded.extension,
730
- size_bytes=excluded.size_bytes,
731
- modified_at=excluded.modified_at,
732
- sha256=excluded.sha256,
733
- last_scanned_at=excluded.last_scanned_at,
734
- last_indexed_at=excluded.last_indexed_at,
735
- parser_type=excluded.parser_type,
736
- status=excluded.status,
737
- error_message=excluded.error_message,
738
- graph_node_id=excluded.graph_node_id,
739
- deleted=excluded.deleted,
740
- metadata_json=excluded.metadata_json
741
- """,
742
- (
743
- index_id,
744
- source_id,
745
- os_type,
746
- drive_id,
747
- str(root),
748
- str(file_path),
749
- relative_path,
750
- file_path.name,
751
- file_path.suffix.lower(),
752
- size,
753
- modified_at,
754
- sha256,
755
- now,
756
- now if status == "indexed" else None,
757
- parser_type,
758
- status,
759
- error_message,
760
- graph_node_id,
761
- 0 if status != "deleted" else 1,
762
- _json(metadata),
763
- ),
764
- )
765
- return index_id
766
-
767
def _upsert_local_file_node(
    self,
    conn: sqlite3.Connection,
    *,
    source_id: str,
    root: Path,
    file_path: Path,
    stat: os.stat_result,
    os_type: str,
    drive_id: str,
    sha256: str,
    category: str,
    parser_type: str,
    text: str,
    parser_meta: Dict[str, Any],
) -> str:
    """Rebuild the graph sub-tree that represents one scanned local file.

    The text is cleaned first. The file's previous Chunk / ImageText / Section
    children, its chunk rows and its outgoing edges are then removed, and the
    file node, an optional ImageText node (images only), chunk nodes,
    auto-extracted concept nodes, concept-to-concept triples and semantic
    items are written again.

    Returns:
        The file node id (``local-file:<hash prefix>``), derived from
        ``source_id`` and the file's path relative to ``root``.

    Raises:
        ValueError: if the cleaned text is empty.
    """
    body = _clean_text(text)
    if not body:
        raise ValueError("텍스트 추출 결과가 비어 있습니다.")

    file_name = file_path.name
    extension = file_path.suffix.lower()
    try:
        relative_path = file_path.relative_to(root).as_posix()
    except ValueError:
        # file_path is not located under root: fall back to the bare name.
        relative_path = file_name

    path_digest = _sha256_text(f"{source_id}:{relative_path}")
    file_node_id = f"local-file:{path_digest[:24]}"
    parent_folder_id = self._ensure_local_hierarchy(
        conn,
        source_id=source_id,
        root=root,
        file_path=file_path,
        os_type=os_type,
        drive_id=drive_id,
    )

    # Drop what an earlier scan attached to this file before writing it anew.
    stale_children = [
        row["id"]
        for row in conn.execute(
            """
            SELECT e.to_node AS id
            FROM edges e
            JOIN nodes n ON n.id=e.to_node
            WHERE e.from_node=? AND n.type IN ('Chunk', 'ImageText', 'Section')
            """,
            (file_node_id,),
        ).fetchall()
    ]
    conn.execute("DELETE FROM chunks WHERE source_node=?", (file_node_id,))
    if stale_children:
        marks = ",".join("?" for _ in stale_children)
        conn.execute(f"DELETE FROM nodes WHERE id IN ({marks})", stale_children)
        self._v2_delete_nodes(conn, stale_children)
    conn.execute("DELETE FROM edges WHERE from_node=?", (file_node_id,))
    self._v2_delete_edges_from(conn, file_node_id)

    # The same dict is stored as both the node metadata and its raw payload.
    file_meta = {
        "source": "local_folder",
        "source_id": source_id,
        "root_path": str(root),
        "file_path": str(file_path),
        "relative_path": relative_path,
        "filename": file_name,
        "ext": extension,
        "category": category,
        "parser_type": parser_type,
        "bytes": stat.st_size,
        "modified_at": _safe_iso_from_stat_mtime(stat.st_mtime),
        "sha256": sha256,
        "parser": parser_meta,
    }
    self._upsert_node(
        conn,
        file_node_id,
        _node_type_for_category(category),
        file_name,
        summary=body[:700],
        metadata=file_meta,
        raw=file_meta,
    )
    self._upsert_edge(
        conn,
        parent_folder_id,
        file_node_id,
        "포함함",
        weight=1.0,
        metadata={"source": "local_scan"},
    )

    if category == "image" and body:
        # Images additionally get a dedicated node carrying the OCR text.
        ocr_digest = _sha256_text(f"{file_node_id}:ocr")
        image_text_id = f"imagetext:{ocr_digest[:24]}"
        self._upsert_node(
            conn,
            image_text_id,
            "ImageText",
            f"{file_name} OCR",
            summary=_clean_text(body)[:700],
            metadata={
                "source_node": file_node_id,
                "source_id": source_id,
                "chars": len(body),
            },
        )
        self._upsert_edge(
            conn,
            file_node_id,
            image_text_id,
            "포함함",
            weight=0.8,
            metadata={"source": "ocr"},
        )

    for position, piece in enumerate(_chunks(body)):
        piece_digest = _sha256_text(f"{file_node_id}:{position}:{piece}")
        chunk_id = f"chunk:{piece_digest[:24]}"
        chunk_meta = {
            "index": position,
            "source_node": file_node_id,
            "source_id": source_id,
        }
        self._upsert_node(
            conn,
            chunk_id,
            "Chunk",
            f"{file_name} chunk {position + 1}",
            summary=piece[:500],
            metadata=dict(chunk_meta),
        )
        self._upsert_chunk(
            conn,
            chunk_id=chunk_id,
            source_node=file_node_id,
            text=piece,
            metadata=dict(chunk_meta),
        )
        self._upsert_edge(
            conn,
            file_node_id,
            chunk_id,
            "포함함",
            weight=0.7,
            metadata={"source": "local_scan"},
        )

    # Concepts are keyed by lower-cased label so triples can resolve them.
    concepts = _extract_concepts(body, limit=18)
    concept_ids: Dict[str, str] = {}
    for label in concepts:
        concept_type = _classify_node_type(label, body)
        concept_id = f"{concept_type.lower()}:{_slug(label)}"
        concept_ids[label.lower()] = concept_id
        self._upsert_node(
            conn,
            concept_id,
            concept_type,
            label,
            metadata={
                "auto_extracted": True,
                "source": "local_folder",
                "source_id": source_id,
            },
        )
        self._upsert_edge(
            conn,
            file_node_id,
            concept_id,
            "언급함",
            weight=0.75,
            metadata={"source": "local_scan"},
        )

    for triple in _extract_triples(body, concepts, limit=20):
        subject_id = concept_ids.get(triple["subject"].lower())
        object_id = concept_ids.get(triple["object"].lower())
        # Only link two distinct concepts that were both extracted above.
        if not (subject_id and object_id) or subject_id == object_id:
            continue
        self._upsert_edge(
            conn,
            subject_id,
            object_id,
            triple["relation"],
            weight=0.9,
            metadata={
                "context": triple.get("context", "")[:240],
                "source_id": source_id,
            },
        )

    for item in _semantic_items(body):
        item_type = item["type"]
        item_title = item["title"]
        item_digest = _sha256_text(f"{file_node_id}:{item_type}:{item_title}")
        item_id = f"{item_type.lower()}:{item_digest[:24]}"
        self._upsert_node(
            conn,
            item_id,
            item_type,
            item_title,
            summary=item["summary"],
            metadata={
                "auto_extracted": True,
                "source_node": file_node_id,
                "filename": file_name,
            },
            raw=item,
        )
        self._upsert_edge(conn, file_node_id, item_id, "포함함", weight=0.9)

    return file_node_id
970
-
971
def _delete_local_file_graph(
    self, conn: sqlite3.Connection, file_node_id: Optional[str]
) -> None:
    """Remove a file node and the graph content that depends on it.

    Nodes reached by an outgoing edge of the file are split into two groups:

    * owned nodes (type Chunk / ImageText / Section, or metadata
      ``source_node`` equal to this file) are always deleted;
    * auto-extracted ``local_folder`` nodes are deleted only when every edge
      they still have, after the file and its owned nodes are gone, connects
      two members of that same candidate group.

    Afterwards, if the file node's metadata carried a ``source_id``,
    ``_cleanup_local_graph_orphans`` is run for it. Does nothing when
    ``file_node_id`` is falsy.
    """
    if not file_node_id:
        return

    def purge(node_ids: set) -> None:
        """Delete chunk rows, edges and node rows for ``node_ids``."""
        if not node_ids:
            return
        ids = list(node_ids)
        marks = ",".join("?" for _ in ids)
        conn.execute(f"DELETE FROM chunks WHERE source_node IN ({marks})", ids)
        conn.execute(
            f"DELETE FROM edges WHERE from_node IN ({marks}) OR to_node IN ({marks})",
            ids + ids,
        )
        conn.execute(f"DELETE FROM nodes WHERE id IN ({marks})", ids)
        self._v2_delete_nodes(conn, ids)

    # Read the owning source before the file node row disappears.
    node_row = conn.execute(
        "SELECT metadata_json FROM nodes WHERE id=?",
        (file_node_id,),
    ).fetchone()
    source_id = None
    if node_row:
        source_id = _safe_loads(node_row["metadata_json"]).get("source_id")

    neighbours = conn.execute(
        """
        SELECT n.id, n.type, n.metadata_json
        FROM edges e
        JOIN nodes n ON n.id=e.to_node
        WHERE e.from_node=?
        """,
        (file_node_id,),
    ).fetchall()
    owned: set = set()
    shared_candidates: set = set()
    for neighbour in neighbours:
        meta = _safe_loads(neighbour["metadata_json"])
        if neighbour["type"] in {"Chunk", "ImageText", "Section"}:
            owned.add(neighbour["id"])
        elif meta.get("source_node") == file_node_id:
            owned.add(neighbour["id"])
        elif meta.get("auto_extracted") and meta.get("source") == "local_folder":
            shared_candidates.add(neighbour["id"])

    conn.execute("DELETE FROM chunks WHERE source_node=?", (file_node_id,))
    conn.execute(
        "DELETE FROM edges WHERE from_node=? OR to_node=?",
        (file_node_id, file_node_id),
    )
    conn.execute("DELETE FROM nodes WHERE id=?", (file_node_id,))
    self._v2_delete_nodes(conn, [file_node_id])

    purge(owned)

    # A candidate survives as soon as one remaining edge leaves the group.
    unreferenced: set = set()
    for candidate in shared_candidates:
        leftover_edges = conn.execute(
            "SELECT from_node, to_node FROM edges WHERE from_node=? OR to_node=?",
            (candidate, candidate),
        ).fetchall()
        linked_outside = any(
            edge["from_node"] not in shared_candidates
            or edge["to_node"] not in shared_candidates
            for edge in leftover_edges
        )
        if not linked_outside:
            unreferenced.add(candidate)
    purge(unreferenced)

    if source_id:
        self._cleanup_local_graph_orphans(conn, str(source_id))
1051
-
1052
def _cleanup_local_graph_orphans(
    self, conn: sqlite3.Connection, source_id: str
) -> None:
    """Prune hierarchy nodes that no longer have any outgoing edge.

    Folder nodes whose metadata ``source_id`` equals ``source_id`` are removed
    repeatedly until none of them is childless, so emptied parents are
    collected on later passes. Drive nodes and then Computer nodes without
    outgoing edges are removed afterwards; that step is not filtered by
    source.

    Compared with issuing one edge lookup per node, the childless test is
    folded into the SELECT with ``NOT EXISTS``, so each pass costs a single
    query. Folder rows whose metadata does not decode to a dict are skipped
    instead of raising.
    """

    def childless(node_type: str) -> list:
        """Return rows of ``node_type`` that have no outgoing edge."""
        return conn.execute(
            """
            SELECT n.id, n.metadata_json
            FROM nodes n
            WHERE n.type=?
              AND NOT EXISTS (SELECT 1 FROM edges e WHERE e.from_node=n.id)
            """,
            (node_type,),
        ).fetchall()

    def purge(node_ids: list) -> None:
        """Delete the given nodes together with every edge touching them."""
        placeholders = ",".join("?" * len(node_ids))
        conn.execute(
            f"DELETE FROM edges WHERE from_node IN ({placeholders}) OR to_node IN ({placeholders})",
            node_ids * 2,
        )
        conn.execute(f"DELETE FROM nodes WHERE id IN ({placeholders})", node_ids)
        self._v2_delete_nodes(conn, node_ids)

    while True:
        leaf_ids = []
        for row in childless("Folder"):
            metadata = _safe_loads(row["metadata_json"])
            if isinstance(metadata, dict) and metadata.get("source_id") == source_id:
                leaf_ids.append(row["id"])
        if not leaf_ids:
            break
        # Removing leaves may leave their parents childless on the next pass.
        purge(leaf_ids)

    # Drives are handled before computers so that a computer whose last drive
    # was just removed is seen as childless as well.
    for node_type in ("Drive", "Computer"):
        removable = [row["id"] for row in childless(node_type)]
        if removable:
            purge(removable)
1102
-
1103
def _local_file_index_has_extracted_text(self, row: sqlite3.Row) -> bool:
    """Report whether an index row recorded a positive extracted-text length.

    Reads ``parser.extracted_chars`` from the row's ``metadata_json``. Returns
    False when the metadata or its ``parser`` entry is not a dict, or when the
    stored value is missing or cannot be converted to an integer.
    """
    payload = _safe_loads(row["metadata_json"])
    if not isinstance(payload, dict):
        return False
    parser_info = payload.get("parser")
    if not isinstance(parser_info, dict):
        return False
    try:
        char_count = int(parser_info.get("extracted_chars") or 0)
    except (TypeError, ValueError):
        return False
    return char_count > 0
1112
-
1113
def index_local_folder(
    self,
    path: Path,
    *,
    include_ocr: bool = False,
    watch_enabled: bool = False,
    user_email: Optional[str] = None,
    consent: Optional[Dict[str, Any]] = None,
    max_files: int = 5_000,
) -> Dict[str, Any]:
    """Read approved files from a local folder and connect them to Graph RAG.

    The folder is registered (or refreshed) as a knowledge source, every scan
    entry is classified, and each indexable file is either skipped as
    unchanged or re-extracted and written to the graph. Unless the file limit
    stopped the scan, index rows whose path was not seen are marked deleted
    and their graph content is removed.

    Args:
        path: Folder to scan; expanded and resolved before use.
        include_ocr: Forwarded to text extraction and stored on the source.
        watch_enabled: Stored on the source and in the consent payload.
        user_email: Recorded as ``approved_by`` in the consent payload.
        consent: Extra consent fields; they override the defaults built here.
        max_files: Scan limit, clamped to the range 1..50_000.

    Returns:
        A dict with ``status``, ``source``, ``counts``, up to 100
        ``indexed_nodes``, up to 50 ``errors`` and a ``notice`` string.

    Raises:
        ValueError: if the path does not exist or is not a directory.
    """
    root = Path(path).expanduser().resolve()
    if not root.exists():
        raise ValueError(f"경로가 존재하지 않습니다: {path}")
    if not root.is_dir():
        raise ValueError(f"폴더가 아닙니다: {path}")

    os_type = _current_os_type()
    drive_id = _drive_id_for_path(root)
    source_id = f"source:{_path_fingerprint(root)}"
    now = _now()
    max_files = max(1, min(int(max_files or 5_000), 50_000))
    ocr_flag = bool(include_ocr)
    watch_flag = bool(watch_enabled)

    consent_payload: Dict[str, Any] = {
        "approved_at": now,
        "approved_by": user_email,
        "knowledge_source": True,
        "include_ocr": ocr_flag,
        "watch_enabled": watch_flag,
        "sensitive_files_default_excluded": True,
    }
    # Caller-supplied consent fields win over the defaults above.
    consent_payload.update(consent or {})

    counts: Counter = Counter()
    seen_relative_paths: set = set()
    indexed_nodes: List[str] = []
    errors: List[Dict[str, str]] = []
    limit_reached = False

    with self._connect() as conn:

        def record(target: Path, target_stat: os.stat_result, **fields: Any) -> None:
            """Upsert the index row for one file with the shared source fields."""
            self._upsert_local_file_index(
                conn,
                source_id=source_id,
                root=root,
                file_path=target,
                stat=target_stat,
                os_type=os_type,
                drive_id=drive_id,
                **fields,
            )

        def drop_graph(previous: Any) -> None:
            """Delete the graph content referenced by an earlier index row."""
            if previous and previous["graph_node_id"]:
                self._delete_local_file_graph(conn, previous["graph_node_id"])

        conn.execute(
            """
            INSERT INTO knowledge_sources(
                id, root_path, os_type, drive_id, label, status, include_ocr,
                watch_enabled, consent_json, created_at, updated_at, last_scanned_at
            )
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(id) DO UPDATE SET
                root_path=excluded.root_path,
                os_type=excluded.os_type,
                drive_id=excluded.drive_id,
                label=excluded.label,
                status=excluded.status,
                include_ocr=excluded.include_ocr,
                watch_enabled=excluded.watch_enabled,
                consent_json=excluded.consent_json,
                updated_at=excluded.updated_at,
                last_scanned_at=excluded.last_scanned_at
            """,
            (
                source_id,
                str(root),
                os_type,
                drive_id,
                root.name or str(root),
                "scanning",
                int(ocr_flag),
                int(watch_flag),
                _json(consent_payload),
                now,
                now,
                now,
            ),
        )

        for entry in self._iter_local_scan_entries(root, max_files=max_files):
            kind = entry["kind"]
            file_path = entry["path"]
            if kind == "limit_reached":
                counts["limit_reached"] += 1
                limit_reached = True
                break
            if kind != "file":
                if kind in {"excluded_dir", "excluded"}:
                    counts["excluded"] += 1
                elif kind in {"inaccessible_dir", "inaccessible_file"}:
                    counts["failed"] += 1
                    errors.append(
                        {
                            "path": str(file_path),
                            "error": entry.get("reason", "inaccessible"),
                        }
                    )
                continue

            stat = entry["stat"]
            try:
                relative_path = file_path.relative_to(root).as_posix()
            except ValueError:
                relative_path = file_path.name
            seen_relative_paths.add(relative_path)
            modified_at = _safe_iso_from_stat_mtime(stat.st_mtime)
            existing = conn.execute(
                """
                SELECT size_bytes, modified_at, sha256, graph_node_id, status, metadata_json
                FROM local_file_index
                WHERE source_id=? AND relative_path=?
                """,
                (source_id, relative_path),
            ).fetchone()
            decision = self._local_file_decision(file_path, root, stat)
            parser_type = decision["parser_type"]

            # 1) Files the decision step rejects keep an index row only.
            if not decision["indexable"]:
                counts[decision["status"]] += 1
                drop_graph(existing)
                record(
                    file_path,
                    stat,
                    status=decision["status"],
                    parser_type=parser_type,
                    metadata={
                        "reason": decision["reason"],
                        "category": decision["category"],
                    },
                )
                continue

            # 2) Same size and mtime as the indexed row: skip without reading.
            if (
                existing
                and existing["status"] == "indexed"
                and existing["graph_node_id"]
                and self._local_file_index_has_extracted_text(existing)
                and existing["size_bytes"] == stat.st_size
                and existing["modified_at"] == modified_at
            ):
                counts["skipped_unchanged"] += 1
                record(
                    file_path,
                    stat,
                    status="indexed",
                    parser_type=parser_type,
                    sha256=existing["sha256"],
                    graph_node_id=existing["graph_node_id"],
                    metadata={
                        **_safe_loads(existing["metadata_json"]),
                        "category": decision["category"],
                        "unchanged": True,
                    },
                )
                continue

            # 3) Read and hash the file; an unreadable file is recorded as failed.
            try:
                data = file_path.read_bytes()
                digest = _sha256_bytes(data)
            except Exception as exc:
                counts["failed"] += 1
                errors.append({"path": str(file_path), "error": str(exc)})
                drop_graph(existing)
                record(
                    file_path,
                    stat,
                    status="failed",
                    parser_type=parser_type,
                    error_message=str(exc),
                    metadata={"category": decision["category"]},
                )
                continue

            # 4) Identical content hash: keep the existing graph node.
            if (
                existing
                and existing["sha256"] == digest
                and existing["graph_node_id"]
                and self._local_file_index_has_extracted_text(existing)
            ):
                counts["skipped_unchanged"] += 1
                record(
                    file_path,
                    stat,
                    status="indexed",
                    parser_type=parser_type,
                    sha256=digest,
                    graph_node_id=existing["graph_node_id"],
                    metadata={
                        **_safe_loads(existing["metadata_json"]),
                        "category": decision["category"],
                        "sha256_unchanged": True,
                    },
                )
                continue

            # 5) Extract text and (re)build the graph for this file.
            try:
                text, parser_meta = self._extract_local_file_text(
                    file_path,
                    decision["category"],
                    include_ocr=include_ocr,
                )
                text = _clean_text(text)
                parser_meta = {**parser_meta, "extracted_chars": len(text)}
                if not text:
                    counts["skipped_empty_text"] += 1
                    drop_graph(existing)
                    record(
                        file_path,
                        stat,
                        status="skipped_empty_text",
                        parser_type=parser_type,
                        sha256=digest,
                        error_message="텍스트 추출 결과가 비어 있습니다.",
                        metadata={
                            "category": decision["category"],
                            "parser": parser_meta,
                        },
                    )
                    continue
                graph_node_id = self._upsert_local_file_node(
                    conn,
                    source_id=source_id,
                    root=root,
                    file_path=file_path,
                    stat=stat,
                    os_type=os_type,
                    drive_id=drive_id,
                    sha256=digest,
                    category=decision["category"],
                    parser_type=parser_type,
                    text=text,
                    parser_meta=parser_meta,
                )
                record(
                    file_path,
                    stat,
                    status="indexed",
                    parser_type=parser_type,
                    sha256=digest,
                    graph_node_id=graph_node_id,
                    metadata={
                        "category": decision["category"],
                        "parser": parser_meta,
                    },
                )
                counts["indexed"] += 1
                indexed_nodes.append(graph_node_id)
            except Exception as exc:
                counts["failed"] += 1
                errors.append({"path": str(file_path), "error": str(exc)})
                drop_graph(existing)
                record(
                    file_path,
                    stat,
                    status="failed",
                    parser_type=parser_type,
                    sha256=digest,
                    error_message=str(exc),
                    metadata={"category": decision["category"]},
                )

        # A truncated scan cannot tell missing files from unvisited ones, so
        # deletions are only reconciled after a complete walk.
        if not limit_reached:
            existing_rows: Dict[str, Any] = {}
            for row in conn.execute(
                "SELECT relative_path, graph_node_id FROM local_file_index WHERE source_id=?",
                (source_id,),
            ):
                existing_rows[row["relative_path"]] = row["graph_node_id"]
            deleted_paths = set(existing_rows) - seen_relative_paths
            for relative_path in deleted_paths:
                self._delete_local_file_graph(conn, existing_rows.get(relative_path))
                conn.execute(
                    """
                    UPDATE local_file_index
                    SET status='deleted', deleted=1, last_scanned_at=?, error_message=NULL, graph_node_id=NULL
                    WHERE source_id=? AND relative_path=?
                    """,
                    (_now(), source_id, relative_path),
                )
            counts["deleted"] = len(deleted_paths)

        conn.execute(
            """
            UPDATE knowledge_sources
            SET status='active', updated_at=?, last_scanned_at=?
            WHERE id=?
            """,
            (_now(), _now(), source_id),
        )

    source_summary = {
        "id": source_id,
        "root_path": str(root),
        "os_type": os_type,
        "drive_id": drive_id,
        "include_ocr": ocr_flag,
        "watch_enabled": watch_flag,
    }
    return {
        "status": "ok",
        "source": source_summary,
        "counts": dict(counts),
        "indexed_nodes": indexed_nodes[:100],
        "errors": errors[:50],
        "notice": "Lattice AI는 사용자가 선택한 폴더만 AI 지식으로 변환합니다.",
    }
12
# Tell importers that this legacy module path is deprecated and name the
# replacement import. stacklevel=2 is meant to attribute the warning to the
# code that triggered this module's execution rather than to this line.
warnings.warn(
    "latticeai.brain.discovery is deprecated; import lattice_brain.graph.discovery instead",
    DeprecationWarning,
    stacklevel=2,
)
# Rebind this module's entry in sys.modules to _impl so later lookups of this
# module name resolve to that object.
# NOTE(review): _impl is defined above this chunk; presumably it is the
# imported lattice_brain.graph.discovery module -- confirm.
sys.modules[__name__] = _impl