ltcai 4.3.3 → 4.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +53 -20
- package/docs/CHANGELOG.md +122 -0
- package/docs/V4_4_0_EXTRACTION_REPORT.md +239 -0
- package/docs/V4_5_0_GEMMA_RUNTIME_COMPATIBILITY_REPORT.md +49 -0
- package/docs/V4_5_0_GRAPH_UX_REPORT.md +34 -0
- package/docs/V4_5_0_MODEL_RUNTIME_UX_REPORT.md +40 -0
- package/docs/V4_5_0_ONBOARDING_REPORT.md +31 -0
- package/docs/V4_5_0_PRODUCT_EXPERIENCE_RECOVERY_REPORT.md +49 -0
- package/docs/V4_5_0_VALIDATION_REPORT.md +60 -0
- package/docs/V4_5_1_GRAPH_EXPERIENCE_REPORT.md +33 -0
- package/docs/V4_5_1_MODEL_EXPERIENCE_REPORT.md +37 -0
- package/docs/V4_5_1_NAVIGATION_REPORT.md +37 -0
- package/docs/V4_5_1_ONBOARDING_REPORT.md +29 -0
- package/docs/V4_5_1_PRODUCT_REIMAGINING_REPORT.md +61 -0
- package/docs/V4_5_1_RC_ARTIFACTS.md +44 -0
- package/docs/V4_5_1_UX_REPORT.md +45 -0
- package/docs/V4_5_1_VALIDATION_REPORT.md +54 -0
- package/docs/V4_5_1_VISUAL_DESIGN_REPORT.md +30 -0
- package/docs/V4_DIGITAL_BRAIN_RECOVERY.md +16 -16
- package/docs/architecture.md +8 -4
- package/frontend/src/App.tsx +152 -91
- package/frontend/src/api/client.ts +83 -1
- package/frontend/src/components/FirstRunGuide.tsx +99 -0
- package/frontend/src/components/primitives.tsx +131 -25
- package/frontend/src/components/ui/badge.tsx +2 -2
- package/frontend/src/components/ui/button.tsx +7 -7
- package/frontend/src/components/ui/card.tsx +5 -5
- package/frontend/src/components/ui/input.tsx +1 -1
- package/frontend/src/components/ui/textarea.tsx +1 -1
- package/frontend/src/pages/Act.tsx +58 -28
- package/frontend/src/pages/Ask.tsx +51 -19
- package/frontend/src/pages/Brain.tsx +60 -42
- package/frontend/src/pages/Capture.tsx +24 -24
- package/frontend/src/pages/Library.tsx +222 -32
- package/frontend/src/pages/System.tsx +56 -34
- package/frontend/src/routes.ts +15 -13
- package/frontend/src/store/appStore.ts +8 -1
- package/frontend/src/styles.css +666 -36
- package/lattice_brain/__init__.py +38 -23
- package/lattice_brain/_kg_common.py +11 -1
- package/lattice_brain/context.py +212 -2
- package/lattice_brain/conversations.py +234 -1
- package/lattice_brain/discovery.py +11 -1
- package/lattice_brain/documents.py +11 -1
- package/lattice_brain/graph/__init__.py +28 -0
- package/lattice_brain/graph/_kg_common.py +1123 -0
- package/lattice_brain/graph/curator.py +473 -0
- package/lattice_brain/graph/discovery.py +1455 -0
- package/lattice_brain/graph/documents.py +218 -0
- package/lattice_brain/graph/identity.py +175 -0
- package/lattice_brain/graph/ingest.py +644 -0
- package/lattice_brain/graph/network.py +205 -0
- package/lattice_brain/graph/projection.py +571 -0
- package/lattice_brain/graph/provenance.py +401 -0
- package/lattice_brain/graph/retrieval.py +1341 -0
- package/lattice_brain/graph/schema.py +640 -0
- package/lattice_brain/graph/store.py +237 -0
- package/lattice_brain/graph/write_master.py +225 -0
- package/lattice_brain/identity.py +11 -13
- package/lattice_brain/ingest.py +11 -1
- package/lattice_brain/ingestion.py +318 -0
- package/lattice_brain/memory.py +100 -1
- package/lattice_brain/network.py +11 -1
- package/lattice_brain/portability.py +431 -0
- package/lattice_brain/projection.py +11 -1
- package/lattice_brain/provenance.py +11 -1
- package/lattice_brain/retrieval.py +11 -1
- package/lattice_brain/runtime/__init__.py +32 -0
- package/lattice_brain/runtime/agent_runtime.py +569 -0
- package/lattice_brain/runtime/hooks.py +754 -0
- package/lattice_brain/runtime/multi_agent.py +795 -0
- package/lattice_brain/schema.py +11 -1
- package/lattice_brain/store.py +10 -2
- package/lattice_brain/workflow.py +461 -0
- package/lattice_brain/write_master.py +11 -1
- package/latticeai/__init__.py +1 -1
- package/latticeai/api/agents.py +2 -2
- package/latticeai/api/browser.py +1 -1
- package/latticeai/api/chat.py +1 -1
- package/latticeai/api/computer_use.py +1 -1
- package/latticeai/api/hooks.py +2 -2
- package/latticeai/api/mcp.py +1 -1
- package/latticeai/api/models.py +107 -18
- package/latticeai/api/tools.py +1 -1
- package/latticeai/api/workflow_designer.py +2 -2
- package/latticeai/app_factory.py +4 -4
- package/latticeai/brain/__init__.py +24 -6
- package/latticeai/brain/_kg_common.py +11 -1117
- package/latticeai/brain/context.py +12 -208
- package/latticeai/brain/conversations.py +12 -231
- package/latticeai/brain/discovery.py +13 -1451
- package/latticeai/brain/documents.py +13 -214
- package/latticeai/brain/identity.py +11 -169
- package/latticeai/brain/ingest.py +13 -640
- package/latticeai/brain/memory.py +12 -97
- package/latticeai/brain/network.py +12 -200
- package/latticeai/brain/projection.py +13 -567
- package/latticeai/brain/provenance.py +13 -397
- package/latticeai/brain/retrieval.py +13 -1337
- package/latticeai/brain/schema.py +12 -635
- package/latticeai/brain/store.py +13 -233
- package/latticeai/brain/write_master.py +13 -221
- package/latticeai/core/agent.py +1 -1
- package/latticeai/core/agent_registry.py +2 -2
- package/latticeai/core/builtin_hooks.py +2 -2
- package/latticeai/core/graph_curator.py +6 -468
- package/latticeai/core/hooks.py +6 -749
- package/latticeai/core/marketplace.py +1 -1
- package/latticeai/core/model_compat.py +250 -0
- package/latticeai/core/multi_agent.py +6 -790
- package/latticeai/core/workflow_engine.py +6 -456
- package/latticeai/core/workspace_os.py +1 -1
- package/latticeai/models/router.py +136 -32
- package/latticeai/services/agent_runtime.py +6 -564
- package/latticeai/services/ingestion.py +6 -313
- package/latticeai/services/kg_portability.py +6 -426
- package/latticeai/services/model_catalog.py +2 -2
- package/latticeai/services/model_recommendation.py +8 -1
- package/latticeai/services/model_runtime.py +18 -3
- package/latticeai/services/platform_runtime.py +3 -3
- package/latticeai/services/run_executor.py +1 -1
- package/latticeai/services/upload_service.py +1 -1
- package/p_reinforce.py +1 -1
- package/package.json +1 -1
- package/scripts/build_frontend_assets.mjs +12 -1
- package/scripts/bump_version.py +1 -1
- package/scripts/wheel_smoke.py +7 -0
- package/src-tauri/Cargo.lock +1 -1
- package/src-tauri/Cargo.toml +1 -1
- package/src-tauri/tauri.conf.json +1 -1
- package/static/app/asset-manifest.json +5 -5
- package/static/app/assets/index-3G8qcrIS.js +336 -0
- package/static/app/assets/index-3G8qcrIS.js.map +1 -0
- package/static/app/assets/index-C0wYZp7k.css +2 -0
- package/static/app/index.html +2 -2
- package/static/app/assets/index-CHHal8Zl.css +0 -2
- package/static/app/assets/index-pdzil9ac.js +0 -333
- package/static/app/assets/index-pdzil9ac.js.map +0 -1
|
@@ -0,0 +1,1455 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
# ruff: noqa: F403,F405
|
|
4
|
+
|
|
5
|
+
from ._kg_common import * # noqa: F403,F401
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class KnowledgeGraphDiscoveryMixin:
|
|
9
|
+
def discover_local_roots(self) -> Dict[str, Any]:
    """Return safe, cross-platform starting points for structure browsing.

    Candidate roots are the home directory, a fixed set of well-known home
    sub-folders, and OS-specific locations: ``/Volumes`` entries on macOS,
    drive letters and OneDrive environment paths on Windows, ``/mnt`` and
    ``/media`` (and their children) on Linux.  Candidates that do not exist,
    cannot be probed, or resolve to an already-listed path are skipped.

    Returns:
        A dict with ``os_type``, ``computer`` (host name or ``"local"``),
        ``roots`` (list of root descriptors) and ``privacy_notice``.
    """
    os_type = _current_os_type()
    home = Path.home().expanduser()
    roots: List[Dict[str, Any]] = []
    seen: set = set()

    def path_exists(candidate: Path) -> bool:
        # Path.exists() only swallows "not found"-style errors on many Python
        # versions; PermissionError and similar OSErrors can propagate.  A
        # root we cannot even probe is treated as absent so that one bad
        # mount point never hides the remaining roots.
        try:
            return candidate.exists()
        except OSError:
            return False

    def add(
        label: str,
        path: Path,
        kind: str,
        *,
        recommended: bool = True,
        warning: Optional[str] = None,
    ) -> None:
        """Append one root descriptor unless it is missing or a duplicate."""
        try:
            resolved = path.expanduser().resolve()
        except OSError:
            # Fall back to the unresolved path; existence is checked below.
            resolved = path.expanduser()
        key = str(resolved)
        if key in seen or not path_exists(resolved):
            return
        seen.add(key)
        roots.append(
            {
                "id": f"{kind}:{_path_fingerprint(resolved)}",
                "label": label,
                "path": key,
                "kind": kind,
                "recommended": recommended,
                "warning": warning or _root_warning(resolved, os_type),
            }
        )

    add("홈", home, "home", warning=_root_warning(home, os_type))
    for name, label in (
        ("Documents", "문서"),
        ("Desktop", "데스크탑"),
        ("Downloads", "다운로드"),
        ("Pictures", "사진"),
        ("Projects", "프로젝트"),
    ):
        add(label, home / name, name.lower())

    if os_type == "macos":
        volumes = Path("/Volumes")
        if path_exists(volumes):
            try:
                for volume in sorted(
                    volumes.iterdir(), key=lambda p: p.name.lower()
                ):
                    add(volume.name, volume, "volume", recommended=False)
            except OSError:
                # Listing /Volumes is best-effort.
                pass
    elif os_type == "windows":
        for letter in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
            drive = Path(f"{letter}:\\")
            if path_exists(drive):
                add(
                    f"{letter}: 드라이브",
                    drive,
                    "drive",
                    # The C: drive is listed but not recommended.
                    recommended=(letter != "C"),
                )
        for env_name, label in (
            ("OneDrive", "OneDrive"),
            ("OneDriveCommercial", "OneDrive"),
        ):
            raw = os.environ.get(env_name)
            if raw:
                add(label, Path(raw), "cloud", recommended=False)
    elif os_type == "linux":
        for base in (Path("/mnt"), Path("/media")):
            add(str(base), base, "mounts", recommended=False)
            try:
                if path_exists(base):
                    for mounted in sorted(
                        base.iterdir(), key=lambda p: p.name.lower()
                    ):
                        add(mounted.name, mounted, "volume", recommended=False)
            except OSError:
                # Listing mount points is best-effort.
                pass

    return {
        "os_type": os_type,
        "computer": platform.node() or "local",
        "roots": roots,
        "privacy_notice": "처음에는 드라이브와 폴더 구조만 확인하며, 파일 내용은 사용자가 동의한 뒤에만 읽습니다.",
    }
|
|
98
|
+
|
|
99
|
+
def preview_local_tree(self, path: Path, *, max_items: int = 200) -> Dict[str, Any]:
    """List one folder level using metadata only; file contents are not read.

    Args:
        path: Folder to list; ``~`` is expanded and the path is resolved.
        max_items: Maximum number of children to describe.  Falsy values fall
            back to 200 and the result is clamped to the range 1..1000.

    Returns:
        A dict with the resolved ``path``, per-child ``items`` (directories
        first, then case-insensitive by name), a ``truncated`` flag, the
        ``inaccessible`` child count, a root ``warning`` and a
        ``privacy_notice``.  When the folder itself cannot be listed because
        of permissions, a reduced dict with an ``error`` message and empty
        ``items`` is returned instead.

    Raises:
        ValueError: If the path does not exist or is not a directory.
    """
    root = Path(path).expanduser().resolve()
    if not root.exists():
        raise ValueError(f"경로가 존재하지 않습니다: {path}")
    if not root.is_dir():
        raise ValueError(f"폴더가 아닙니다: {path}")

    os_type = _current_os_type()
    max_items = max(1, min(int(max_items or 200), 1000))
    privacy_notice = "현재 단계에서는 파일 내용을 읽지 않고, 폴더와 파일의 이름/크기/수정일만 확인합니다."
    items: List[Dict[str, Any]] = []
    inaccessible = 0

    def sort_key(entry: Path) -> Tuple[bool, str]:
        # is_dir() can raise for a single unreadable child.  Sorting must not
        # fail because of it: the child is ordered with the files here and is
        # reported as inaccessible by the per-item handling below, instead of
        # the whole folder being reported as permission-denied.
        try:
            is_directory = entry.is_dir()
        except OSError:
            is_directory = False
        return (not is_directory, entry.name.lower())

    try:
        children = sorted(root.iterdir(), key=sort_key)
    except PermissionError as exc:
        return {
            "path": str(root),
            "items": [],
            "error": f"접근 권한 없음: {exc}",
            "privacy_notice": privacy_notice,
        }

    for child in children[:max_items]:
        try:
            is_dir = child.is_dir()
            stat = child.stat()
            # Directories are checked against the exclusion rules, files
            # against the sensitive-file rules.
            reason = (
                _excluded_directory_reason(child, root=root, os_type=os_type)
                if is_dir
                else _sensitive_file_reason(child, root=root)
            )
            items.append(
                {
                    "name": child.name,
                    "path": str(child),
                    "type": "directory" if is_dir else "file",
                    "extension": "" if is_dir else child.suffix.lower(),
                    "size_bytes": None if is_dir else stat.st_size,
                    "modified_at": _safe_iso_from_stat_mtime(stat.st_mtime),
                    "hidden": _is_hidden_path(child, root),
                    "accessible": True,
                    "excluded_reason": reason,
                }
            )
        except PermissionError:
            inaccessible += 1
            items.append(
                {
                    "name": child.name,
                    "path": str(child),
                    "type": "unknown",
                    "accessible": False,
                    "excluded_reason": "permission_denied",
                }
            )
        except OSError as exc:
            inaccessible += 1
            items.append(
                {
                    "name": child.name,
                    "path": str(child),
                    "type": "unknown",
                    "accessible": False,
                    "excluded_reason": str(exc),
                }
            )

    return {
        "path": str(root),
        "os_type": os_type,
        "items": items,
        "truncated": len(children) > max_items,
        "inaccessible": inaccessible,
        "warning": _root_warning(root, os_type),
        "privacy_notice": privacy_notice,
    }
|
|
177
|
+
|
|
178
|
+
def _iter_local_scan_entries(
    self, root: Path, *, max_files: int
) -> Iterable[Dict[str, Any]]:
    """Walk ``root`` depth-first and yield one event dict per finding.

    Every yielded dict has ``kind`` and ``path``.  Kinds are:

    * ``"file"`` - a regular file, with its ``stat`` result attached;
    * ``"excluded"`` - a symlink or a non-regular file (``reason`` says which);
    * ``"excluded_dir"`` - a directory rejected by the exclusion rules;
    * ``"inaccessible_dir"`` / ``"inaccessible_file"`` - an OS error occurred;
    * ``"limit_reached"`` - more than ``max_files`` files were seen; the
      generator stops right after yielding it.

    Symlinks are never followed and excluded directories are not descended
    into.
    """
    os_type = _current_os_type()
    stack = [root]
    files_seen = 0
    while stack:
        current = stack.pop()
        try:
            children = sorted(
                current.iterdir(), key=lambda p: (not p.is_dir(), p.name.lower())
            )
        except PermissionError as exc:
            yield {
                "kind": "inaccessible_dir",
                "path": current,
                "reason": f"permission_denied: {exc}",
            }
            continue
        except OSError as exc:
            yield {"kind": "inaccessible_dir", "path": current, "reason": str(exc)}
            continue

        for child in children:
            try:
                # is_symlink() performs an lstat and can raise like any other
                # probe, so it lives inside the per-child handler: a single
                # unreadable entry must not abort the whole scan.
                if child.is_symlink():
                    yield {"kind": "excluded", "path": child, "reason": "symlink"}
                    continue
                if child.is_dir():
                    reason = _excluded_directory_reason(
                        child, root=root, os_type=os_type
                    )
                    if reason:
                        yield {
                            "kind": "excluded_dir",
                            "path": child,
                            "reason": reason,
                        }
                    else:
                        # Queue the directory for a later iteration.
                        stack.append(child)
                    continue
                if not child.is_file():
                    yield {
                        "kind": "excluded",
                        "path": child,
                        "reason": "not_regular_file",
                    }
                    continue
                stat = child.stat()
            except PermissionError as exc:
                yield {
                    "kind": "inaccessible_file",
                    "path": child,
                    "reason": f"permission_denied: {exc}",
                }
                continue
            except OSError as exc:
                yield {
                    "kind": "inaccessible_file",
                    "path": child,
                    "reason": str(exc),
                }
                continue

            files_seen += 1
            if files_seen > max_files:
                # The file that crosses the limit is reported only through
                # this marker; it is not yielded as a "file" event.
                yield {
                    "kind": "limit_reached",
                    "path": child,
                    "reason": "max_files",
                }
                return
            yield {"kind": "file", "path": child, "stat": stat}
|
|
251
|
+
|
|
252
|
+
def _local_file_decision(
    self, path: Path, root: Path, stat: os.stat_result
) -> Dict[str, Any]:
    """Classify one file as indexable or not, from its name and size only.

    The checks run in this order and the first match wins: sensitive file,
    unsupported extension, size above the category limit.  A file passing
    all of them gets status ``"pending"`` and ``indexable`` set to True.

    Returns:
        A dict with ``status``, ``reason``, ``category``, ``parser_type``
        and ``indexable``.
    """
    extension = path.suffix.lower()
    category = _file_category(extension)
    parser_type = _parser_type_for_category(category, extension)

    def verdict(status: str, reason: str, indexable: bool = False) -> Dict[str, Any]:
        # All outcomes share the same shape; only these three fields vary.
        return {
            "status": status,
            "reason": reason,
            "category": category,
            "parser_type": parser_type,
            "indexable": indexable,
        }

    blocked_reason = _sensitive_file_reason(path, root=root)
    if blocked_reason:
        return verdict("sensitive_blocked", blocked_reason)

    if category == "unsupported":
        return verdict("unsupported", "unsupported_extension")

    size_cap = _size_limit_for_category(category)
    if stat.st_size > size_cap:
        return verdict("too_large", f"size>{size_cap}")

    return verdict("pending", "", indexable=True)
|
|
291
|
+
|
|
292
|
+
def audit_local_folder(
    self, path: Path, *, include_ocr: bool = False, max_files: int = 50_000
) -> Dict[str, Any]:
    """Safety-check a folder using metadata only; file bodies are not read.

    Walks the folder through ``_iter_local_scan_entries``, classifies each
    regular file with ``_local_file_decision`` and aggregates counts, up to
    25 sample entries per list and a rough processing-time estimate.

    Args:
        path: Folder to audit; ``~`` is expanded and the path is resolved.
        include_ocr: Whether image OCR was requested; it only changes the
            weight of images in the time estimate and is echoed back.
        max_files: File cap for the walk.  Falsy values fall back to 50,000
            and the result is clamped to the range 1..200,000.

    Raises:
        ValueError: If the path does not exist or is not a directory.
    """
    root = Path(path).expanduser().resolve()
    if not root.exists():
        raise ValueError(f"경로가 존재하지 않습니다: {path}")
    if not root.is_dir():
        raise ValueError(f"폴더가 아닙니다: {path}")

    os_type = _current_os_type()
    max_files = max(1, min(int(max_files or 50_000), 200_000))
    sample_cap = 25

    by_status: Counter = Counter()
    by_category: Counter = Counter()
    by_extension: Counter = Counter()
    allowed_samples: List[Dict[str, Any]] = []
    excluded_samples: List[Dict[str, Any]] = []
    total_files = 0
    readable_files = 0
    inaccessible = 0
    excluded_dirs = 0
    limit_reached = False

    def remember_excluded(target: Path, status: str, reason: str) -> None:
        # Keep only the first few rejected entries as examples.
        if len(excluded_samples) < sample_cap:
            excluded_samples.append(_sample_file(target, root, status, reason))

    for entry in self._iter_local_scan_entries(root, max_files=max_files):
        kind = entry["kind"]
        target = entry["path"]

        if kind == "limit_reached":
            limit_reached = True
            break

        if kind == "excluded_dir":
            excluded_dirs += 1
            remember_excluded(target, "excluded", entry.get("reason", ""))
        elif kind in {"inaccessible_dir", "inaccessible_file"}:
            inaccessible += 1
            by_status["failed"] += 1
            remember_excluded(target, "failed", entry.get("reason", ""))
        elif kind == "excluded":
            by_status["excluded"] += 1
            remember_excluded(target, "excluded", entry.get("reason", ""))
        elif kind == "file":
            total_files += 1
            decision = self._local_file_decision(target, root, entry["stat"])
            by_category[decision["category"]] += 1
            by_extension[target.suffix.lower() or "(none)"] += 1
            if decision["indexable"]:
                readable_files += 1
                by_status["readable"] += 1
                if len(allowed_samples) < sample_cap:
                    allowed_samples.append(_sample_file(target, root, "readable"))
            else:
                by_status[decision["status"]] += 1
                remember_excluded(target, decision["status"], decision["reason"])
        # Any other kind is ignored.

    # Heuristic time estimate: a per-file base cost plus per-category weights.
    doc_weight = (
        by_category["pdf"] * 1.4
        + by_category["document"] * 0.9
        + by_category["slide_deck"] * 1.0
    )
    sheet_weight = by_category["spreadsheet"] * 0.6
    ocr_weight = by_category["image"] * (1.8 if include_ocr else 0.1)
    estimated_seconds = round(
        readable_files * 0.04 + doc_weight + sheet_weight + ocr_weight, 1
    )

    summary = {
        "total_files": total_files,
        "readable_files": readable_files,
        "excluded_files": int(
            by_status["excluded"]
            + by_status["sensitive_blocked"]
            + by_status["too_large"]
            + by_status["unsupported"]
        ),
        "sensitive_files": int(by_status["sensitive_blocked"]),
        "too_large_files": int(by_status["too_large"]),
        "unsupported_files": int(by_status["unsupported"]),
        "image_ocr_candidates": int(by_category["image"]),
        "inaccessible_items": inaccessible,
        "excluded_dirs": excluded_dirs,
        "estimated_seconds": estimated_seconds,
        "storage_root": str(self.db_path.parent),
        "limit_reached": limit_reached,
    }
    consent_required = {
        "knowledge_source": True,
        "image_ocr": bool(by_category["image"]),
        "watch": True,
        "sensitive_files_default_excluded": True,
    }
    return {
        "path": str(root),
        "source_id": f"source:{_path_fingerprint(root)}",
        "os_type": os_type,
        "drive_id": _drive_id_for_path(root),
        "warning": _root_warning(root, os_type),
        "privacy_notice": "현재 단계에서는 파일 내용을 읽지 않고, 폴더와 파일의 이름/크기/수정일만 확인합니다.",
        "include_ocr_requested": bool(include_ocr),
        "summary": summary,
        "by_status": dict(by_status),
        "by_category": dict(by_category),
        "by_extension": dict(by_extension.most_common(40)),
        "allowed_samples": allowed_samples,
        "excluded_samples": excluded_samples,
        "consent_required": consent_required,
    }
|
|
420
|
+
|
|
421
|
+
def local_sources(self) -> Dict[str, Any]:
    """List every registered knowledge source with per-status file counts.

    Sources come from ``knowledge_sources`` ordered by most recent update
    (ties broken by id).  Each source gets a ``file_status`` mapping of
    status to row count taken from ``local_file_index``; sources without
    indexed files get an empty mapping.

    Returns:
        ``{"sources": [...]}``.
    """
    sources: List[Dict[str, Any]] = []
    with self._connect() as conn:
        cursor = conn.execute(
            """
            SELECT id, root_path, os_type, drive_id, label, status, include_ocr,
            watch_enabled, consent_json, created_at, updated_at, last_scanned_at
            FROM knowledge_sources
            ORDER BY updated_at DESC, id ASC
            """
        )
        for record in cursor:
            sources.append(
                {
                    "id": record["id"],
                    "root_path": record["root_path"],
                    "os_type": record["os_type"],
                    "drive_id": record["drive_id"],
                    "label": record["label"],
                    "status": record["status"],
                    # Stored flags are exposed as booleans.
                    "include_ocr": bool(record["include_ocr"]),
                    "watch_enabled": bool(record["watch_enabled"]),
                    "consent": _safe_loads(record["consent_json"]),
                    "created_at": record["created_at"],
                    "updated_at": record["updated_at"],
                    "last_scanned_at": record["last_scanned_at"],
                }
            )
        tallies = conn.execute(
            "SELECT source_id, status, COUNT(*) AS count FROM local_file_index GROUP BY source_id, status"
        ).fetchall()

    # Regroup the flat (source, status, count) rows by source id.
    per_source: Dict[str, Dict[str, int]] = {}
    for tally in tallies:
        bucket = per_source.setdefault(tally["source_id"], {})
        bucket[tally["status"]] = tally["count"]

    for entry in sources:
        entry["file_status"] = per_source.get(entry["id"], {})
    return {"sources": sources}
|
|
456
|
+
|
|
457
|
+
def set_local_source_watch(self, source_id: str, enabled: bool) -> Dict[str, Any]:
    """Turn folder watching on or off for one knowledge source.

    Args:
        source_id: Id of the source; surrounding whitespace is ignored.
        enabled: Desired watch state; stored as 1 or 0.

    Returns:
        A dict with the normalized ``source_id`` and the new
        ``watch_enabled`` boolean.

    Raises:
        ValueError: If the id is empty or no such source exists.
    """
    wanted_id = str(source_id or "").strip()
    if wanted_id == "":
        raise ValueError("source_id required")

    with self._connect() as conn:
        existing = conn.execute(
            "SELECT id FROM knowledge_sources WHERE id=?",
            (wanted_id,),
        ).fetchone()
        if not existing:
            raise ValueError(f"knowledge source not found: {wanted_id}")
        watch_flag = 1 if enabled else 0
        conn.execute(
            "UPDATE knowledge_sources SET watch_enabled=?, updated_at=? WHERE id=?",
            (watch_flag, _now(), wanted_id),
        )

    result: Dict[str, Any] = {"source_id": wanted_id}
    result["watch_enabled"] = bool(enabled)
    return result
|
|
473
|
+
|
|
474
|
+
def remove_local_source(self, source_id: str) -> Dict[str, Any]:
    """Forget one approved local source together with its derived graph data.

    The index rows of the source, the graph projection of each indexed file
    and finally the source record itself are deleted, followed by an orphan
    clean-up pass.  Only database content is changed here; this method
    performs no file-system operation on the source folder.

    Args:
        source_id: Id of the source; surrounding whitespace is ignored.

    Returns:
        A dict with ``source_id``, the source's ``root_path`` and
        ``removed_graph_nodes`` (number of file graph nodes processed).

    Raises:
        ValueError: If the id is empty or no such source exists.
    """
    wanted_id = str(source_id or "").strip()
    if wanted_id == "":
        raise ValueError("source_id required")

    with self._connect() as conn:
        record = conn.execute(
            "SELECT id, root_path FROM knowledge_sources WHERE id=?",
            (wanted_id,),
        ).fetchone()
        if not record:
            raise ValueError(f"knowledge source not found: {wanted_id}")

        indexed = conn.execute(
            "SELECT graph_node_id FROM local_file_index WHERE source_id=? AND graph_node_id IS NOT NULL",
            (wanted_id,),
        ).fetchall()
        # Collect the ids first, skipping empty values the SQL filter lets through.
        node_ids = []
        for item in indexed:
            node_id = item["graph_node_id"]
            if node_id:
                node_ids.append(node_id)

        for node_id in node_ids:
            self._delete_local_file_graph(conn, node_id)
        conn.execute("DELETE FROM local_file_index WHERE source_id=?", (wanted_id,))
        conn.execute("DELETE FROM knowledge_sources WHERE id=?", (wanted_id,))
        self._cleanup_local_graph_orphans(conn, wanted_id)

    return {
        "source_id": wanted_id,
        "root_path": record["root_path"],
        "removed_graph_nodes": len(node_ids),
    }
|
|
508
|
+
|
|
509
|
+
def _extract_local_file_text(
    self, path: Path, category: str, *, include_ocr: bool
) -> Tuple[str, Dict[str, Any]]:
    """Read one file and return its extracted text plus parser metadata.

    Supported inputs: text/code files and ``.csv`` (decoded as UTF-8 with
    replacement characters), ``.pdf``, ``.docx``, ``.xlsx``, ``.pptx`` and
    images.  Images yield text only when ``include_ocr`` is true and OCR
    succeeds; an OCR failure is recorded in ``meta["ocr_error"]`` rather
    than raised.  Anything else yields an empty string.

    Third-party parsers are imported lazily, so a missing optional
    dependency only fails for the file type that needs it.

    Returns:
        ``(text, meta)`` where ``text`` is truncated to 200,000 characters
        and ``meta`` always contains ``"parser"`` plus format-specific
        counters.
    """
    ext = path.suffix.lower()
    meta: Dict[str, Any] = {"parser": _parser_type_for_category(category, ext)}
    text = ""
    if category in {"text", "code"} or ext == ".csv":
        text = path.read_text(encoding="utf-8", errors="replace")
    elif ext == ".pdf":
        import pdfplumber

        with pdfplumber.open(str(path)) as pdf:
            meta["pages"] = len(pdf.pages)
            text = "\n\n".join((page.extract_text() or "") for page in pdf.pages)
    elif ext == ".docx":
        from docx import Document

        doc = Document(str(path))
        paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
        table_lines = []
        for table in doc.tables:
            for row in table.rows:
                cells = [_clean_text(cell.text) for cell in row.cells]
                if any(cells):
                    table_lines.append("\t".join(cells))
        meta["paragraphs"] = len(paragraphs)
        meta["tables"] = len(doc.tables)
        meta["table_rows"] = len(table_lines)
        text = "\n\n".join([*paragraphs, *table_lines])
    elif ext == ".xlsx":
        from openpyxl import load_workbook

        wb = load_workbook(str(path), read_only=True, data_only=True)
        try:
            rows_all = []
            non_empty_rows = 0
            non_empty_cells = 0
            char_count = 0
            for ws in wb.worksheets:
                sheet_rows = []
                for row in ws.iter_rows(values_only=True):
                    cells = [
                        str(cell).strip() if cell is not None else "" for cell in row
                    ]
                    if not any(cells):
                        continue
                    line = "\t".join(cells)
                    non_empty_rows += 1
                    non_empty_cells += sum(1 for cell in cells if cell)
                    sheet_rows.append(line)
                    char_count += len(line) + 1
                    if char_count > 200_000:
                        # Enough text collected; stop reading this sheet.
                        break
                if sheet_rows:
                    rows_all.append(f"[Sheet: {ws.title}]")
                    rows_all.extend(sheet_rows)
                if char_count > 200_000:
                    # ...and skip the remaining sheets as well.
                    break
            meta["sheets"] = len(wb.worksheets)
            meta["rows"] = non_empty_rows
            meta["cells"] = non_empty_cells
            text = "\n".join(rows_all)
        finally:
            # A read-only workbook keeps the file open until it is closed
            # explicitly; release the handle even when parsing fails.
            wb.close()
    elif ext == ".pptx":
        from pptx import Presentation

        prs = Presentation(str(path))
        slides_text = []
        for index, slide in enumerate(prs.slides, 1):
            parts = []
            for shape in slide.shapes:
                if getattr(shape, "has_text_frame", False):
                    slide_text = shape.text_frame.text.strip()
                    if slide_text:
                        parts.append(slide_text)
            if parts:
                slides_text.append(f"[Slide {index}]\n" + "\n".join(parts))
        meta["slides"] = len(prs.slides)
        meta["text_slides"] = len(slides_text)
        text = "\n\n".join(slides_text)
    elif category == "image":
        from PIL import Image

        with Image.open(str(path)) as image:
            meta.update(
                {
                    "width": image.width,
                    "height": image.height,
                    "format": image.format,
                    "mode": image.mode,
                    "ocr_enabled": bool(include_ocr),
                }
            )
            if include_ocr:
                try:
                    import pytesseract

                    text = pytesseract.image_to_string(image)
                    meta["ocr_chars"] = len(text)
                except (
                    Exception
                ) as exc:  # pragma: no cover - depends on local OCR runtime
                    # OCR is best-effort: record the failure, keep the metadata.
                    meta["ocr_error"] = str(exc)
                    text = ""
    return text[:200_000], meta
|
|
612
|
+
|
|
613
|
+
def _ensure_local_hierarchy(
    self,
    conn: sqlite3.Connection,
    *,
    source_id: str,
    root: Path,
    file_path: Path,
    os_type: str,
    drive_id: str,
) -> str:
    """Upsert the Computer > Drive > Folder chain leading to a file.

    Nodes are upserted for the computer, the drive, the source's root folder
    and every intermediate folder between ``root`` and the parent of
    ``file_path``, each linked to its parent with a "포함함" edge.  When
    ``file_path`` is not located under ``root`` no intermediate folders are
    created.

    Returns:
        The node id of the folder that directly contains ``file_path`` (the
        root folder id when there are no intermediate folders).
    """
    computer_label = platform.node() or "내 컴퓨터"
    computer_id = f"computer:{_slug(computer_label)}"
    drive_node_id = f"drive:{_sha256_text(f'{os_type}:{drive_id}')[:24]}"
    root_folder_id = f"folder:{_sha256_text(f'{source_id}:root')[:24]}"

    def link(parent: str, child: str) -> None:
        # Containment edge used at every level of the hierarchy.
        self._upsert_edge(
            conn, parent, child, "포함함", metadata={"source": "local_scan"}
        )

    self._upsert_node(
        conn, computer_id, "Computer", computer_label, metadata={"os_type": os_type}
    )
    self._upsert_node(
        conn,
        drive_node_id,
        "Drive",
        drive_id,
        metadata={"os_type": os_type, "drive_id": drive_id},
    )
    link(computer_id, drive_node_id)

    self._upsert_node(
        conn,
        root_folder_id,
        "Folder",
        # A root such as "/" has an empty name; fall back to the full path.
        root.name or str(root),
        summary=str(root),
        metadata={"source_id": source_id, "path": str(root), "root": True},
    )
    link(drive_node_id, root_folder_id)

    try:
        folder_parts = file_path.parent.relative_to(root).parts
    except ValueError:
        # The file is outside the root: attach it to the root folder.
        folder_parts = Path().parts

    containing_id = root_folder_id
    walked = root
    for segment in folder_parts:
        walked = walked / segment
        digest = _sha256_text(f"{source_id}:{walked.as_posix()}")
        segment_id = f"folder:{digest[:24]}"
        self._upsert_node(
            conn,
            segment_id,
            "Folder",
            segment,
            summary=str(walked),
            metadata={
                "source_id": source_id,
                "path": str(walked),
                "root": False,
            },
        )
        link(containing_id, segment_id)
        containing_id = segment_id
    return containing_id
|
|
688
|
+
|
|
689
|
+
def _upsert_local_file_index(
    self,
    conn: sqlite3.Connection,
    *,
    source_id: str,
    root: Path,
    file_path: Path,
    stat: Optional[os.stat_result],
    os_type: str,
    drive_id: str,
    status: str,
    parser_type: str,
    sha256: Optional[str] = None,
    graph_node_id: Optional[str] = None,
    error_message: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> str:
    """Insert or refresh the ``local_file_index`` row describing one file.

    Rows are keyed by ``(source_id, relative_path)``. When a row with that
    key already exists, every column other than ``id``, ``source_id`` and
    ``relative_path`` is overwritten with the values supplied here.

    Args:
        conn: Open SQLite connection the statement is executed on.
        source_id: Identifier of the knowledge source the file belongs to.
        root: Root folder of the source; used to derive the relative path.
        file_path: Path of the file being recorded.
        stat: Stat result for the file, or ``None`` when unavailable (size
            is then stored as ``None`` and the modification time as ``""``).
        os_type: Operating-system label stored on the row.
        drive_id: Drive label stored on the row.
        status: Scan outcome; ``"indexed"`` stamps ``last_indexed_at`` and
            ``"deleted"`` sets the ``deleted`` flag to 1.
        parser_type: Parser label stored on the row.
        sha256: Content digest, if one was computed.
        graph_node_id: Graph node created for the file, if any.
        error_message: Failure description, if any.
        metadata: Extra data serialized into ``metadata_json``.

    Returns:
        The ``local-index:`` identifier derived from ``source_id`` and the
        file's relative path.
    """
    # A file that is not under ``root`` is keyed by its bare name instead.
    try:
        rel = file_path.relative_to(root).as_posix()
    except ValueError:
        rel = file_path.name

    key_digest = _sha256_text(f"{source_id}:{rel}")[:24]
    row_id = f"local-index:{key_digest}"
    scanned_at = _now()

    if stat:
        size_bytes = stat.st_size
        mtime_iso = _safe_iso_from_stat_mtime(stat.st_mtime)
    else:
        size_bytes = None
        mtime_iso = ""

    # Only a successful index run counts as "indexed at".
    if status == "indexed":
        indexed_at = scanned_at
    else:
        indexed_at = None
    deleted_flag = 1 if status == "deleted" else 0

    values = (
        row_id,
        source_id,
        os_type,
        drive_id,
        str(root),
        str(file_path),
        rel,
        file_path.name,
        file_path.suffix.lower(),
        size_bytes,
        mtime_iso,
        sha256,
        scanned_at,
        indexed_at,
        parser_type,
        status,
        error_message,
        graph_node_id,
        deleted_flag,
        _json(metadata),
    )
    conn.execute(
        """
        INSERT INTO local_file_index(
        id, source_id, os_type, drive_id, root_path, file_path, relative_path,
        file_name, extension, size_bytes, modified_at, sha256, last_scanned_at,
        last_indexed_at, parser_type, status, error_message, graph_node_id,
        deleted, metadata_json
        )
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        ON CONFLICT(source_id, relative_path) DO UPDATE SET
        os_type=excluded.os_type,
        drive_id=excluded.drive_id,
        root_path=excluded.root_path,
        file_path=excluded.file_path,
        file_name=excluded.file_name,
        extension=excluded.extension,
        size_bytes=excluded.size_bytes,
        modified_at=excluded.modified_at,
        sha256=excluded.sha256,
        last_scanned_at=excluded.last_scanned_at,
        last_indexed_at=excluded.last_indexed_at,
        parser_type=excluded.parser_type,
        status=excluded.status,
        error_message=excluded.error_message,
        graph_node_id=excluded.graph_node_id,
        deleted=excluded.deleted,
        metadata_json=excluded.metadata_json
        """,
        values,
    )
    return row_id
|
|
766
|
+
|
|
767
|
+
def _upsert_local_file_node(
    self,
    conn: sqlite3.Connection,
    *,
    source_id: str,
    root: Path,
    file_path: Path,
    stat: os.stat_result,
    os_type: str,
    drive_id: str,
    sha256: str,
    category: str,
    parser_type: str,
    text: str,
    parser_meta: Dict[str, Any],
) -> str:
    """Write one local file and everything derived from its text to the graph.

    Steps, in order: ensure the Computer/Drive/Folder hierarchy exists, drop
    the file's previous Chunk/ImageText/Section children and outgoing edges,
    upsert the file node and its folder edge, then add an ImageText node (for
    images), chunk nodes, extracted concept nodes, concept-to-concept triples
    and semantic-item nodes.

    Args:
        conn: Open SQLite connection used for every statement.
        source_id: Identifier of the knowledge source the file belongs to.
        root: Root folder of the source.
        file_path: Path of the file being indexed.
        stat: Stat result supplying the size and modification time.
        os_type: Operating-system label forwarded to the hierarchy helper.
        drive_id: Drive label forwarded to the hierarchy helper.
        sha256: Content digest stored in the node metadata.
        category: File category; selects the node type and the image branch.
        parser_type: Parser label stored in the node metadata.
        text: Extracted text; it is cleaned again here before use.
        parser_meta: Parser details stored under the ``parser`` metadata key.

    Returns:
        The ``local-file:`` node identifier for the file.

    Raises:
        ValueError: If the cleaned text is empty.
    """
    text = _clean_text(text)
    if not text:
        raise ValueError("텍스트 추출 결과가 비어 있습니다.")

    # A file that is not under ``root`` is keyed by its bare name instead.
    try:
        rel_path = file_path.relative_to(root).as_posix()
    except ValueError:
        rel_path = file_path.name
    node_key = _sha256_text(f"{source_id}:{rel_path}")[:24]
    file_node_id = f"local-file:{node_key}"

    folder_id = self._ensure_local_hierarchy(
        conn,
        source_id=source_id,
        root=root,
        file_path=file_path,
        os_type=os_type,
        drive_id=drive_id,
    )

    # Clear what a previous indexing run attached to this file node.
    stale_children = [
        found["id"]
        for found in conn.execute(
            """
            SELECT e.to_node AS id
            FROM edges e
            JOIN nodes n ON n.id=e.to_node
            WHERE e.from_node=? AND n.type IN ('Chunk', 'ImageText', 'Section')
            """,
            (file_node_id,),
        ).fetchall()
    ]
    conn.execute("DELETE FROM chunks WHERE source_node=?", (file_node_id,))
    if stale_children:
        marks = ",".join("?" for _ in stale_children)
        conn.execute(f"DELETE FROM nodes WHERE id IN ({marks})", stale_children)
        self._v2_delete_nodes(conn, stale_children)
    conn.execute("DELETE FROM edges WHERE from_node=?", (file_node_id,))
    self._v2_delete_edges_from(conn, file_node_id)

    # The same dict is stored both as the node metadata and as its raw payload.
    file_meta = {
        "source": "local_folder",
        "source_id": source_id,
        "root_path": str(root),
        "file_path": str(file_path),
        "relative_path": rel_path,
        "filename": file_path.name,
        "ext": file_path.suffix.lower(),
        "category": category,
        "parser_type": parser_type,
        "bytes": stat.st_size,
        "modified_at": _safe_iso_from_stat_mtime(stat.st_mtime),
        "sha256": sha256,
        "parser": parser_meta,
    }
    self._upsert_node(
        conn,
        file_node_id,
        _node_type_for_category(category),
        file_path.name,
        summary=text[:700],
        metadata=file_meta,
        raw=file_meta,
    )
    self._upsert_edge(
        conn,
        folder_id,
        file_node_id,
        "포함함",
        weight=1.0,
        metadata={"source": "local_scan"},
    )

    # Images get an extra node holding the recognised text.
    if category == "image" and text:
        ocr_key = _sha256_text(f"{file_node_id}:ocr")[:24]
        image_text_id = f"imagetext:{ocr_key}"
        self._upsert_node(
            conn,
            image_text_id,
            "ImageText",
            f"{file_path.name} OCR",
            summary=_clean_text(text)[:700],
            metadata={
                "source_node": file_node_id,
                "source_id": source_id,
                "chars": len(text),
            },
        )
        self._upsert_edge(
            conn,
            file_node_id,
            image_text_id,
            "포함함",
            weight=0.8,
            metadata={"source": "ocr"},
        )

    # One node, one chunk row and one edge per text chunk.
    for position, piece in enumerate(_chunks(text)):
        piece_key = _sha256_text(f"{file_node_id}:{position}:{piece}")[:24]
        chunk_id = f"chunk:{piece_key}"
        piece_meta = {
            "index": position,
            "source_node": file_node_id,
            "source_id": source_id,
        }
        self._upsert_node(
            conn,
            chunk_id,
            "Chunk",
            f"{file_path.name} chunk {position + 1}",
            summary=piece[:500],
            metadata=piece_meta,
        )
        # The chunk row receives its own copy of the metadata dict.
        self._upsert_chunk(
            conn,
            chunk_id=chunk_id,
            source_node=file_node_id,
            text=piece,
            metadata=dict(piece_meta),
        )
        self._upsert_edge(
            conn,
            file_node_id,
            chunk_id,
            "포함함",
            weight=0.7,
            metadata={"source": "local_scan"},
        )

    # Concepts mentioned by the file; ids are remembered by lower-cased label
    # so that triples can be wired between them below.
    concepts = _extract_concepts(text, limit=18)
    concept_ids: Dict[str, str] = {}
    for label in concepts:
        concept_type = _classify_node_type(label, text)
        concept_id = f"{concept_type.lower()}:{_slug(label)}"
        concept_ids[label.lower()] = concept_id
        self._upsert_node(
            conn,
            concept_id,
            concept_type,
            label,
            metadata={
                "auto_extracted": True,
                "source": "local_folder",
                "source_id": source_id,
            },
        )
        self._upsert_edge(
            conn,
            file_node_id,
            concept_id,
            "언급함",
            weight=0.75,
            metadata={"source": "local_scan"},
        )

    for triple in _extract_triples(text, concepts, limit=20):
        subject_id = concept_ids.get(triple["subject"].lower())
        object_id = concept_ids.get(triple["object"].lower())
        # Skip triples with an unknown endpoint or a self-loop.
        if not subject_id or not object_id or subject_id == object_id:
            continue
        self._upsert_edge(
            conn,
            subject_id,
            object_id,
            triple["relation"],
            weight=0.9,
            metadata={
                "context": triple.get("context", "")[:240],
                "source_id": source_id,
            },
        )

    for item in _semantic_items(text):
        sem_type = item["type"]
        sem_title = item["title"]
        sem_key = _sha256_text(f"{file_node_id}:{sem_type}:{sem_title}")[:24]
        sem_id = f"{sem_type.lower()}:{sem_key}"
        self._upsert_node(
            conn,
            sem_id,
            sem_type,
            sem_title,
            summary=item["summary"],
            metadata={
                "auto_extracted": True,
                "source_node": file_node_id,
                "filename": file_path.name,
            },
            raw=item,
        )
        self._upsert_edge(conn, file_node_id, sem_id, "포함함", weight=0.9)

    return file_node_id
|
|
970
|
+
|
|
971
|
+
def _delete_local_file_graph(
    self, conn: sqlite3.Connection, file_node_id: Optional[str]
) -> None:
    """Remove a local file node and the graph content that depends on it.

    Nodes reached by an outgoing edge of the file are split into two groups:
    nodes the file owns (Chunk/ImageText/Section nodes, or nodes whose
    metadata names this file as ``source_node``) and auto-extracted
    ``local_folder`` nodes that other files may share. Owned nodes are always
    deleted. A shared candidate is deleted only when every edge it still has
    after the file is gone connects two candidates from this same file.
    Finally, empty hierarchy nodes of the file's source are cleaned up.

    Args:
        conn: Open SQLite connection used for every statement.
        file_node_id: Node to delete; a falsy value makes this a no-op.
    """
    if not file_node_id:
        return

    def purge(node_ids: set) -> None:
        """Delete the nodes, their chunk rows and every edge touching them."""
        if not node_ids:
            return
        id_list = list(node_ids)
        marks = ",".join("?" for _ in id_list)
        conn.execute(
            f"DELETE FROM chunks WHERE source_node IN ({marks})", id_list
        )
        conn.execute(
            f"DELETE FROM edges WHERE from_node IN ({marks}) OR to_node IN ({marks})",
            id_list + id_list,
        )
        conn.execute(f"DELETE FROM nodes WHERE id IN ({marks})", id_list)
        self._v2_delete_nodes(conn, id_list)

    # Remember the source before the node row disappears.
    node_row = conn.execute(
        "SELECT metadata_json FROM nodes WHERE id=?",
        (file_node_id,),
    ).fetchone()
    if node_row:
        source_id = _safe_loads(node_row["metadata_json"]).get("source_id")
    else:
        source_id = None

    neighbours = conn.execute(
        """
        SELECT n.id, n.type, n.metadata_json
        FROM edges e
        JOIN nodes n ON n.id=e.to_node
        WHERE e.from_node=?
        """,
        (file_node_id,),
    ).fetchall()

    owned_ids: set = set()
    shared_candidates: set = set()
    owned_types = {"Chunk", "ImageText", "Section"}
    for neighbour in neighbours:
        info = _safe_loads(neighbour["metadata_json"])
        if neighbour["type"] in owned_types or info.get("source_node") == file_node_id:
            owned_ids.add(neighbour["id"])
            continue
        if info.get("auto_extracted") and info.get("source") == "local_folder":
            shared_candidates.add(neighbour["id"])

    # Drop the file node itself along with its chunks and edges.
    conn.execute("DELETE FROM chunks WHERE source_node=?", (file_node_id,))
    conn.execute(
        "DELETE FROM edges WHERE from_node=? OR to_node=?",
        (file_node_id, file_node_id),
    )
    conn.execute("DELETE FROM nodes WHERE id=?", (file_node_id,))
    self._v2_delete_nodes(conn, [file_node_id])

    purge(owned_ids)

    # A candidate survives if any remaining edge leads outside the candidate
    # set, i.e. something else in the graph still refers to it.
    removable: set = set()
    for candidate in shared_candidates:
        touching = conn.execute(
            "SELECT from_node, to_node FROM edges WHERE from_node=? OR to_node=?",
            (candidate, candidate),
        ).fetchall()
        referenced_elsewhere = any(
            edge["from_node"] not in shared_candidates
            or edge["to_node"] not in shared_candidates
            for edge in touching
        )
        if not referenced_elsewhere:
            removable.add(candidate)
    purge(removable)

    if source_id:
        self._cleanup_local_graph_orphans(conn, str(source_id))
|
|
1051
|
+
|
|
1052
|
+
def _cleanup_local_graph_orphans(
    self, conn: sqlite3.Connection, source_id: str
) -> None:
    """Prune hierarchy nodes that no longer lead anywhere.

    Folder nodes whose metadata carries ``source_id`` and that have no
    outgoing edge are deleted, and the pass is repeated until no such folder
    remains, so emptied parents are removed too. Afterwards every Drive node
    and then every Computer node without an outgoing edge is deleted; that
    last step is not restricted to ``source_id``.

    Args:
        conn: Open SQLite connection used for every statement.
        source_id: Knowledge source whose folders are pruned.
    """

    def is_childless(node_id: str) -> bool:
        """Return True when the node has no outgoing edge."""
        hit = conn.execute(
            "SELECT 1 FROM edges WHERE from_node=? LIMIT 1",
            (node_id,),
        ).fetchone()
        return not hit

    def drop(node_ids: list) -> None:
        """Delete the nodes and every edge that touches them."""
        marks = ",".join("?" for _ in node_ids)
        conn.execute(
            f"DELETE FROM edges WHERE from_node IN ({marks}) OR to_node IN ({marks})",
            node_ids + node_ids,
        )
        conn.execute(f"DELETE FROM nodes WHERE id IN ({marks})", node_ids)
        self._v2_delete_nodes(conn, node_ids)

    # Peel empty folders layer by layer until a pass finds none.
    while True:
        empty_folders = []
        for folder in conn.execute(
            "SELECT id, metadata_json FROM nodes WHERE type='Folder'"
        ).fetchall():
            owner = _safe_loads(folder["metadata_json"]).get("source_id")
            if owner == source_id and is_childless(folder["id"]):
                empty_folders.append(folder["id"])
        if not empty_folders:
            break
        drop(empty_folders)

    # Drives are handled before computers so that a computer emptied by the
    # drive pass is caught by the following one.
    for node_type in ("Drive", "Computer"):
        candidates = conn.execute(
            "SELECT id FROM nodes WHERE type=?", (node_type,)
        ).fetchall()
        childless = [found["id"] for found in candidates if is_childless(found["id"])]
        if childless:
            drop(childless)
|
|
1102
|
+
|
|
1103
|
+
def _local_file_index_has_extracted_text(self, row: sqlite3.Row) -> bool:
    """Tell whether an index row records a non-empty text extraction.

    The row's ``metadata_json`` is decoded and its ``parser`` entry is
    inspected for a positive ``extracted_chars`` count.

    Args:
        row: Index row exposing a ``metadata_json`` column.

    Returns:
        True only when ``parser.extracted_chars`` converts to an integer
        greater than zero; False when the entry is missing, malformed or
        not numeric.
    """
    payload = _safe_loads(row["metadata_json"])
    if isinstance(payload, dict):
        parser_info = payload.get("parser")
    else:
        parser_info = {}

    if isinstance(parser_info, dict):
        try:
            char_count = int(parser_info.get("extracted_chars") or 0)
        except (TypeError, ValueError):
            return False
        return char_count > 0
    return False
|
|
1112
|
+
|
|
1113
|
+
def index_local_folder(
    self,
    path: Path,
    *,
    include_ocr: bool = False,
    watch_enabled: bool = False,
    user_email: Optional[str] = None,
    consent: Optional[Dict[str, Any]] = None,
    max_files: int = 5_000,
) -> Dict[str, Any]:
    """Read approved files from a local folder and connect them to Graph RAG.

    The folder is registered (or refreshed) as a knowledge source, each entry
    yielded by ``_iter_local_scan_entries`` is classified, and every
    indexable file is hashed, text-extracted and written to the graph. A file
    whose size and modification time, or whose SHA-256, match the previous
    scan keeps its existing graph node and is not extracted again. Unless the
    scan stopped at ``max_files``, index rows that were still live but were
    not seen this time are marked deleted and their graph nodes are removed.

    Args:
        path: Folder to scan; ``~`` is expanded and the path is resolved.
        include_ocr: Forwarded to text extraction and stored on the source.
        watch_enabled: Stored on the source and echoed in the result.
        user_email: Recorded as ``approved_by`` in the consent payload.
        consent: Extra consent fields; they override the default fields.
        max_files: Scan limit, clamped to the range 1..50_000.

    Returns:
        A dict with ``status``, ``source``, per-scan ``counts``, at most 100
        ``indexed_nodes``, at most 50 ``errors`` and a ``notice`` string.

    Raises:
        ValueError: If ``path`` does not exist or is not a directory.
    """
    root = Path(path).expanduser().resolve()
    if not root.exists():
        raise ValueError(f"경로가 존재하지 않습니다: {path}")
    if not root.is_dir():
        raise ValueError(f"폴더가 아닙니다: {path}")

    os_type = _current_os_type()
    drive_id = _drive_id_for_path(root)
    source_id = f"source:{_path_fingerprint(root)}"
    now = _now()
    max_files = max(1, min(int(max_files or 5_000), 50_000))
    consent_payload = {
        "approved_at": now,
        "approved_by": user_email,
        "knowledge_source": True,
        "include_ocr": bool(include_ocr),
        "watch_enabled": bool(watch_enabled),
        "sensitive_files_default_excluded": True,
        **(consent or {}),
    }
    counts: Counter = Counter()
    seen_relative_paths: set = set()
    indexed_nodes: List[str] = []
    errors: List[Dict[str, str]] = []
    limit_reached = False

    # Keyword arguments shared by every index/node write for this source.
    source_args = {
        "source_id": source_id,
        "root": root,
        "os_type": os_type,
        "drive_id": drive_id,
    }

    with self._connect() as conn:
        conn.execute(
            """
            INSERT INTO knowledge_sources(
            id, root_path, os_type, drive_id, label, status, include_ocr,
            watch_enabled, consent_json, created_at, updated_at, last_scanned_at
            )
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(id) DO UPDATE SET
            root_path=excluded.root_path,
            os_type=excluded.os_type,
            drive_id=excluded.drive_id,
            label=excluded.label,
            status=excluded.status,
            include_ocr=excluded.include_ocr,
            watch_enabled=excluded.watch_enabled,
            consent_json=excluded.consent_json,
            updated_at=excluded.updated_at,
            last_scanned_at=excluded.last_scanned_at
            """,
            (
                source_id,
                str(root),
                os_type,
                drive_id,
                root.name or str(root),
                "scanning",
                1 if include_ocr else 0,
                1 if watch_enabled else 0,
                _json(consent_payload),
                now,
                now,
                now,
            ),
        )

        for entry in self._iter_local_scan_entries(root, max_files=max_files):
            kind = entry["kind"]
            file_path = entry["path"]
            if kind == "limit_reached":
                counts["limit_reached"] += 1
                limit_reached = True
                break
            if kind in {"excluded_dir", "excluded"}:
                counts["excluded"] += 1
                continue
            if kind in {"inaccessible_dir", "inaccessible_file"}:
                counts["failed"] += 1
                errors.append(
                    {
                        "path": str(file_path),
                        "error": entry.get("reason", "inaccessible"),
                    }
                )
                continue
            if kind != "file":
                continue

            stat = entry["stat"]
            # A file that is not under ``root`` is keyed by its bare name.
            try:
                relative_path = file_path.relative_to(root).as_posix()
            except ValueError:
                relative_path = file_path.name
            seen_relative_paths.add(relative_path)
            modified_at = _safe_iso_from_stat_mtime(stat.st_mtime)
            existing = conn.execute(
                """
                SELECT size_bytes, modified_at, sha256, graph_node_id, status, metadata_json
                FROM local_file_index
                WHERE source_id=? AND relative_path=?
                """,
                (source_id, relative_path),
            ).fetchone()
            # Graph node left by a previous scan, if there is one.
            stale_node_id = existing["graph_node_id"] if existing else None

            decision = self._local_file_decision(file_path, root, stat)
            parser_type = decision["parser_type"]
            category = decision["category"]
            file_args = {
                **source_args,
                "file_path": file_path,
                "stat": stat,
                "parser_type": parser_type,
            }

            if not decision["indexable"]:
                counts[decision["status"]] += 1
                if stale_node_id:
                    self._delete_local_file_graph(conn, stale_node_id)
                self._upsert_local_file_index(
                    conn,
                    **file_args,
                    status=decision["status"],
                    metadata={
                        "reason": decision["reason"],
                        "category": category,
                    },
                )
                continue

            # The previous graph node can be kept only when it exists and the
            # earlier run actually extracted some text.
            reusable_graph = bool(
                stale_node_id
            ) and self._local_file_index_has_extracted_text(existing)

            # Cheap check first: same size and mtime means no need to read.
            if (
                reusable_graph
                and existing["status"] == "indexed"
                and existing["size_bytes"] == stat.st_size
                and existing["modified_at"] == modified_at
            ):
                counts["skipped_unchanged"] += 1
                self._upsert_local_file_index(
                    conn,
                    **file_args,
                    status="indexed",
                    sha256=existing["sha256"],
                    graph_node_id=stale_node_id,
                    metadata={
                        **_safe_loads(existing["metadata_json"]),
                        "category": category,
                        "unchanged": True,
                    },
                )
                continue

            # Best effort: any read failure is recorded and the scan goes on.
            try:
                digest = _sha256_bytes(file_path.read_bytes())
            except Exception as exc:
                counts["failed"] += 1
                errors.append({"path": str(file_path), "error": str(exc)})
                if stale_node_id:
                    self._delete_local_file_graph(conn, stale_node_id)
                self._upsert_local_file_index(
                    conn,
                    **file_args,
                    status="failed",
                    error_message=str(exc),
                    metadata={"category": category},
                )
                continue

            # Touched but byte-identical content: keep the existing graph.
            if reusable_graph and existing["sha256"] == digest:
                counts["skipped_unchanged"] += 1
                self._upsert_local_file_index(
                    conn,
                    **file_args,
                    status="indexed",
                    sha256=digest,
                    graph_node_id=stale_node_id,
                    metadata={
                        **_safe_loads(existing["metadata_json"]),
                        "category": category,
                        "sha256_unchanged": True,
                    },
                )
                continue

            # Best effort: an extraction or graph failure for one file is
            # recorded on its index row and must not abort the whole scan.
            try:
                text, parser_meta = self._extract_local_file_text(
                    file_path,
                    category,
                    include_ocr=include_ocr,
                )
                text = _clean_text(text)
                parser_meta = {**parser_meta, "extracted_chars": len(text)}
                if not text:
                    counts["skipped_empty_text"] += 1
                    if stale_node_id:
                        self._delete_local_file_graph(conn, stale_node_id)
                    self._upsert_local_file_index(
                        conn,
                        **file_args,
                        status="skipped_empty_text",
                        sha256=digest,
                        error_message="텍스트 추출 결과가 비어 있습니다.",
                        metadata={
                            "category": category,
                            "parser": parser_meta,
                        },
                    )
                    continue
                graph_node_id = self._upsert_local_file_node(
                    conn,
                    **file_args,
                    sha256=digest,
                    category=category,
                    text=text,
                    parser_meta=parser_meta,
                )
                self._upsert_local_file_index(
                    conn,
                    **file_args,
                    status="indexed",
                    sha256=digest,
                    graph_node_id=graph_node_id,
                    metadata={
                        "category": category,
                        "parser": parser_meta,
                    },
                )
                counts["indexed"] += 1
                indexed_nodes.append(graph_node_id)
            except Exception as exc:
                counts["failed"] += 1
                errors.append({"path": str(file_path), "error": str(exc)})
                if stale_node_id:
                    self._delete_local_file_graph(conn, stale_node_id)
                self._upsert_local_file_index(
                    conn,
                    **file_args,
                    status="failed",
                    sha256=digest,
                    error_message=str(exc),
                    metadata={"category": category},
                )

        finished_at = _now()
        # A truncated scan has not seen every file, so absence proves nothing.
        if not limit_reached:
            # Only rows that are still live can be newly deleted. Rows that an
            # earlier scan already marked deleted are left alone so they are
            # not counted again on every rescan.
            live_rows = {
                row["relative_path"]: row["graph_node_id"]
                for row in conn.execute(
                    "SELECT relative_path, graph_node_id FROM local_file_index "
                    "WHERE source_id=? AND COALESCE(deleted, 0)=0",
                    (source_id,),
                )
            }
            deleted_paths = set(live_rows) - seen_relative_paths
            for relative_path in deleted_paths:
                self._delete_local_file_graph(conn, live_rows.get(relative_path))
                conn.execute(
                    """
                    UPDATE local_file_index
                    SET status='deleted', deleted=1, last_scanned_at=?, error_message=NULL, graph_node_id=NULL
                    WHERE source_id=? AND relative_path=?
                    """,
                    (finished_at, source_id, relative_path),
                )
            counts["deleted"] = len(deleted_paths)
        conn.execute(
            """
            UPDATE knowledge_sources
            SET status='active', updated_at=?, last_scanned_at=?
            WHERE id=?
            """,
            (finished_at, finished_at, source_id),
        )

    return {
        "status": "ok",
        "source": {
            "id": source_id,
            "root_path": str(root),
            "os_type": os_type,
            "drive_id": drive_id,
            "include_ocr": bool(include_ocr),
            "watch_enabled": bool(watch_enabled),
        },
        "counts": dict(counts),
        "indexed_nodes": indexed_nodes[:100],
        "errors": errors[:50],
        "notice": "Lattice AI는 사용자가 선택한 폴더만 AI 지식으로 변환합니다.",
    }
|