ltcai 4.0.0 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +42 -33
- package/desktop/electron/main.cjs +44 -0
- package/docs/CHANGELOG.md +106 -0
- package/docs/REALTIME_COLLABORATION.md +3 -3
- package/docs/V3_FRONTEND.md +9 -8
- package/docs/V4_1_FRONTEND_ARCHITECTURE_REVIEW.md +65 -0
- package/docs/V4_1_FRONTEND_MIGRATION_REPORT.md +70 -0
- package/docs/V4_1_VALIDATION_REPORT.md +47 -0
- package/docs/V4_DIGITAL_BRAIN_RECOVERY.md +95 -45
- package/docs/kg-schema.md +6 -2
- package/docs/spec-vs-impl.md +10 -10
- package/frontend/index.html +24 -0
- package/frontend/openapi.json +14190 -0
- package/frontend/src/App.tsx +184 -0
- package/frontend/src/api/client.ts +317 -0
- package/frontend/src/api/openapi.ts +16637 -0
- package/frontend/src/components/primitives.tsx +204 -0
- package/frontend/src/components/ui/badge.tsx +27 -0
- package/frontend/src/components/ui/button.tsx +37 -0
- package/frontend/src/components/ui/card.tsx +22 -0
- package/frontend/src/components/ui/input.tsx +16 -0
- package/frontend/src/components/ui/textarea.tsx +16 -0
- package/frontend/src/lib/utils.ts +33 -0
- package/frontend/src/main.tsx +23 -0
- package/frontend/src/pages/Act.tsx +245 -0
- package/frontend/src/pages/Ask.tsx +200 -0
- package/frontend/src/pages/Brain.tsx +267 -0
- package/frontend/src/pages/Capture.tsx +158 -0
- package/frontend/src/pages/Library.tsx +187 -0
- package/frontend/src/pages/System.tsx +344 -0
- package/frontend/src/routes.ts +85 -0
- package/frontend/src/store/appStore.ts +54 -0
- package/frontend/src/styles.css +107 -0
- package/kg_schema.py +2 -603
- package/knowledge_graph.py +37 -4958
- package/latticeai/__init__.py +1 -1
- package/latticeai/api/admin.py +15 -16
- package/latticeai/api/agents.py +13 -6
- package/latticeai/api/auth.py +19 -11
- package/latticeai/api/invitations.py +100 -0
- package/latticeai/api/knowledge_graph.py +4 -11
- package/latticeai/api/plugins.py +3 -6
- package/latticeai/api/realtime.py +4 -7
- package/latticeai/api/setup.py +5 -4
- package/latticeai/api/static_routes.py +13 -16
- package/latticeai/api/ui_redirects.py +26 -0
- package/latticeai/api/workflow_designer.py +39 -6
- package/latticeai/api/workspace.py +24 -10
- package/latticeai/app_factory.py +88 -17
- package/latticeai/brain/_kg_common.py +1123 -0
- package/latticeai/brain/discovery.py +1455 -0
- package/latticeai/brain/documents.py +218 -0
- package/latticeai/brain/ingest.py +644 -0
- package/latticeai/brain/projection.py +561 -0
- package/latticeai/brain/provenance.py +401 -0
- package/latticeai/brain/retrieval.py +1316 -0
- package/latticeai/brain/schema.py +640 -0
- package/latticeai/brain/store.py +216 -0
- package/latticeai/brain/write_master.py +225 -0
- package/latticeai/core/invitations.py +131 -0
- package/latticeai/core/marketplace.py +1 -1
- package/latticeai/core/multi_agent.py +1 -1
- package/latticeai/core/policy.py +54 -0
- package/latticeai/core/realtime.py +65 -44
- package/latticeai/core/sessions.py +31 -5
- package/latticeai/core/users.py +147 -0
- package/latticeai/core/workspace_os.py +420 -20
- package/latticeai/services/agent_runtime.py +242 -4
- package/latticeai/services/run_executor.py +328 -0
- package/latticeai/services/workspace_service.py +27 -19
- package/package.json +54 -27
- package/scripts/build_frontend_assets.mjs +38 -0
- package/scripts/bump_version.py +1 -1
- package/scripts/export_openapi.py +31 -0
- package/scripts/lint_frontend.mjs +86 -0
- package/scripts/run_python.mjs +47 -0
- package/src-tauri/Cargo.lock +4833 -0
- package/src-tauri/Cargo.toml +19 -0
- package/src-tauri/build.rs +3 -0
- package/src-tauri/capabilities/default.json +7 -0
- package/src-tauri/src/main.rs +78 -0
- package/src-tauri/tauri.conf.json +36 -0
- package/static/app/asset-manifest.json +32 -0
- package/static/app/assets/core-CwxXejkd.js +2 -0
- package/static/app/assets/core-CwxXejkd.js.map +1 -0
- package/static/app/assets/index-CJRAzNnf.js +333 -0
- package/static/app/assets/index-CJRAzNnf.js.map +1 -0
- package/static/app/assets/index-CSwBBgf4.css +2 -0
- package/static/app/index.html +25 -0
- package/static/manifest.json +2 -2
- package/static/sw.js +4 -4
- package/scripts/build_v3_assets.mjs +0 -170
- package/scripts/lint_v3.mjs +0 -97
- package/static/account.html +0 -113
- package/static/activity.html +0 -73
- package/static/admin.html +0 -486
- package/static/agents.html +0 -139
- package/static/chat.html +0 -841
- package/static/css/reference/account.css +0 -439
- package/static/css/reference/admin.css +0 -610
- package/static/css/reference/base.css +0 -1661
- package/static/css/reference/chat.css +0 -4623
- package/static/css/reference/graph.css +0 -1016
- package/static/css/responsive.css +0 -861
- package/static/graph.html +0 -122
- package/static/platform.css +0 -104
- package/static/plugins.html +0 -136
- package/static/scripts/account.js +0 -238
- package/static/scripts/admin.js +0 -1614
- package/static/scripts/chat.js +0 -5081
- package/static/scripts/graph.js +0 -1804
- package/static/scripts/platform.js +0 -64
- package/static/scripts/ux.js +0 -167
- package/static/scripts/workspace.js +0 -948
- package/static/v3/asset-manifest.json +0 -56
- package/static/v3/css/lattice.base.49deefb5.css +0 -128
- package/static/v3/css/lattice.base.css +0 -128
- package/static/v3/css/lattice.components.cde18231.css +0 -472
- package/static/v3/css/lattice.components.css +0 -472
- package/static/v3/css/lattice.shell.29d36d85.css +0 -452
- package/static/v3/css/lattice.shell.css +0 -452
- package/static/v3/css/lattice.tokens.304cbc40.css +0 -135
- package/static/v3/css/lattice.tokens.css +0 -135
- package/static/v3/css/lattice.views.0a18b6c5.css +0 -360
- package/static/v3/css/lattice.views.css +0 -360
- package/static/v3/index.html +0 -68
- package/static/v3/js/app.356e6452.js +0 -26
- package/static/v3/js/app.js +0 -26
- package/static/v3/js/core/api.7a308b89.js +0 -568
- package/static/v3/js/core/api.js +0 -568
- package/static/v3/js/core/components.f25b3b93.js +0 -230
- package/static/v3/js/core/components.js +0 -230
- package/static/v3/js/core/dom.a2773eb0.js +0 -148
- package/static/v3/js/core/dom.js +0 -148
- package/static/v3/js/core/router.584570f2.js +0 -37
- package/static/v3/js/core/router.js +0 -37
- package/static/v3/js/core/routes.7222343d.js +0 -93
- package/static/v3/js/core/routes.js +0 -93
- package/static/v3/js/core/shell.a1657f20.js +0 -391
- package/static/v3/js/core/shell.js +0 -391
- package/static/v3/js/core/store.204a08b2.js +0 -113
- package/static/v3/js/core/store.js +0 -113
- package/static/v3/js/views/admin-audit.660a1fb1.js +0 -185
- package/static/v3/js/views/admin-audit.js +0 -185
- package/static/v3/js/views/admin-permissions.a7ae5f09.js +0 -177
- package/static/v3/js/views/admin-permissions.js +0 -177
- package/static/v3/js/views/admin-policies.3658fd86.js +0 -102
- package/static/v3/js/views/admin-policies.js +0 -102
- package/static/v3/js/views/admin-private-vpc.7d342d36.js +0 -135
- package/static/v3/js/views/admin-private-vpc.js +0 -135
- package/static/v3/js/views/admin-security.07c66b72.js +0 -180
- package/static/v3/js/views/admin-security.js +0 -180
- package/static/v3/js/views/admin-users.03bac88c.js +0 -168
- package/static/v3/js/views/admin-users.js +0 -168
- package/static/v3/js/views/agents.014d0b74.js +0 -541
- package/static/v3/js/views/agents.js +0 -541
- package/static/v3/js/views/chat.e6dd7dd0.js +0 -601
- package/static/v3/js/views/chat.js +0 -601
- package/static/v3/js/views/files.adad14c1.js +0 -365
- package/static/v3/js/views/files.js +0 -365
- package/static/v3/js/views/graph-canvas.17c15d65.js +0 -509
- package/static/v3/js/views/graph-canvas.js +0 -509
- package/static/v3/js/views/home.24f8b8ae.js +0 -200
- package/static/v3/js/views/home.js +0 -200
- package/static/v3/js/views/hooks.37895880.js +0 -220
- package/static/v3/js/views/hooks.js +0 -220
- package/static/v3/js/views/hybrid-search.2fb63ed9.js +0 -194
- package/static/v3/js/views/hybrid-search.js +0 -194
- package/static/v3/js/views/knowledge-graph.5e40cbeb.js +0 -509
- package/static/v3/js/views/knowledge-graph.js +0 -509
- package/static/v3/js/views/marketplace.ab0583d4.js +0 -141
- package/static/v3/js/views/marketplace.js +0 -141
- package/static/v3/js/views/mcp.99b5c6a7.js +0 -114
- package/static/v3/js/views/mcp.js +0 -114
- package/static/v3/js/views/memory.4ebdf474.js +0 -147
- package/static/v3/js/views/memory.js +0 -147
- package/static/v3/js/views/models.a1ffa147.js +0 -256
- package/static/v3/js/views/models.js +0 -256
- package/static/v3/js/views/my-computer.d9d9ae1c.js +0 -463
- package/static/v3/js/views/my-computer.js +0 -463
- package/static/v3/js/views/pipeline.c522f1ce.js +0 -157
- package/static/v3/js/views/pipeline.js +0 -157
- package/static/v3/js/views/planning.9ac3e313.js +0 -153
- package/static/v3/js/views/planning.js +0 -153
- package/static/v3/js/views/settings.8631fa5e.js +0 -318
- package/static/v3/js/views/settings.js +0 -318
- package/static/v3/js/views/skills.c6c2f965.js +0 -109
- package/static/v3/js/views/skills.js +0 -109
- package/static/v3/js/views/tools.e4f11276.js +0 -108
- package/static/v3/js/views/tools.js +0 -108
- package/static/v3/js/views/workflows.26c57290.js +0 -128
- package/static/v3/js/views/workflows.js +0 -128
- package/static/workflows.html +0 -146
- package/static/workspace.css +0 -1121
- package/static/workspace.html +0 -357
|
@@ -0,0 +1,1455 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
# ruff: noqa: F403,F405
|
|
4
|
+
|
|
5
|
+
from ._kg_common import * # noqa: F403,F401
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class KnowledgeGraphDiscoveryMixin:
|
|
9
|
+
def discover_local_roots(self) -> Dict[str, Any]:
    """Return safe, cross-platform starting points for structure browsing.

    Candidates are the home folder, a fixed set of well-known folders under
    it, and OS-specific locations: ``/Volumes`` entries on macOS, drive
    letters and OneDrive folders on Windows, ``/mnt`` and ``/media`` plus
    their direct children on Linux. Candidates that do not exist, cannot be
    probed, or resolve to an already listed path are left out.

    Returns:
        A dict with ``os_type``, ``computer`` (``platform.node()`` or
        ``"local"``), ``roots`` (one dict per accepted candidate) and a
        user-facing ``privacy_notice``.
    """
    os_type = _current_os_type()
    home = Path.home().expanduser()
    roots: List[Dict[str, Any]] = []
    seen: set = set()

    def exists(candidate: Path) -> bool:
        """Probe ``candidate`` without letting an OS error escape."""
        # Path.exists() only swallows "not found"-style errors; others
        # (e.g. a permission error) propagate. One unreadable candidate
        # must not abort the whole discovery, so treat it as missing.
        try:
            return candidate.exists()
        except OSError:
            return False

    def add(
        label: str,
        path: Path,
        kind: str,
        *,
        recommended: bool = True,
        warning: Optional[str] = None,
    ) -> None:
        """Append ``path`` to ``roots`` unless missing or already listed."""
        try:
            resolved = path.expanduser().resolve()
        except OSError:
            # Fall back to the unresolved path when resolution fails.
            resolved = path.expanduser()
        key = str(resolved)
        if key in seen or not exists(resolved):
            return
        seen.add(key)
        roots.append(
            {
                "id": f"{kind}:{_path_fingerprint(resolved)}",
                "label": label,
                "path": key,
                "kind": kind,
                "recommended": recommended,
                "warning": warning or _root_warning(resolved, os_type),
            }
        )

    add("홈", home, "home", warning=_root_warning(home, os_type))
    for name, label in (
        ("Documents", "문서"),
        ("Desktop", "데스크탑"),
        ("Downloads", "다운로드"),
        ("Pictures", "사진"),
        ("Projects", "프로젝트"),
    ):
        add(label, home / name, name.lower())

    if os_type == "macos":
        volumes = Path("/Volumes")
        if exists(volumes):
            try:
                for volume in sorted(
                    volumes.iterdir(), key=lambda p: p.name.lower()
                ):
                    add(volume.name, volume, "volume", recommended=False)
            except OSError:
                # Listing /Volumes is best-effort; keep what was found.
                pass
    elif os_type == "windows":
        for letter in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
            drive = Path(f"{letter}:\\")
            if exists(drive):
                add(
                    f"{letter}: 드라이브",
                    drive,
                    "drive",
                    # C: is listed but not recommended (presumably because
                    # it is the system drive - confirm with product intent).
                    recommended=(letter != "C"),
                )
        for env_name, label in (
            ("OneDrive", "OneDrive"),
            ("OneDriveCommercial", "OneDrive"),
        ):
            raw = os.environ.get(env_name)
            if raw:
                add(label, Path(raw), "cloud", recommended=False)
    elif os_type == "linux":
        for base in (Path("/mnt"), Path("/media")):
            add(str(base), base, "mounts", recommended=False)
            try:
                if exists(base):
                    for mounted in sorted(
                        base.iterdir(), key=lambda p: p.name.lower()
                    ):
                        add(mounted.name, mounted, "volume", recommended=False)
            except OSError:
                # Listing a mount base is best-effort; keep what was found.
                pass

    return {
        "os_type": os_type,
        "computer": platform.node() or "local",
        "roots": roots,
        "privacy_notice": "처음에는 드라이브와 폴더 구조만 확인하며, 파일 내용은 사용자가 동의한 뒤에만 읽습니다.",
    }
|
|
98
|
+
|
|
99
|
+
def preview_local_tree(self, path: Path, *, max_items: int = 200) -> Dict[str, Any]:
    """List one folder level using metadata only; file contents are not read.

    Args:
        path: Folder to list; ``~`` is expanded and the path is resolved.
        max_items: Maximum number of children to return. A falsy value
            falls back to 200 and the result is clamped to 1..1000.

    Returns:
        A dict with the resolved ``path``, ``os_type``, ``items`` (folders
        first, then files, each sorted by lower-cased name), ``truncated``,
        ``inaccessible`` (count of children that could not be inspected),
        ``warning`` and ``privacy_notice``. When the folder itself cannot
        be listed for lack of permission, a reduced dict with ``path``,
        empty ``items``, ``error`` and ``privacy_notice`` is returned.

    Raises:
        ValueError: If the path does not exist or is not a directory.
    """
    root = Path(path).expanduser().resolve()
    if not root.exists():
        raise ValueError(f"경로가 존재하지 않습니다: {path}")
    if not root.is_dir():
        raise ValueError(f"폴더가 아닙니다: {path}")

    os_type = _current_os_type()
    max_items = max(1, min(int(max_items or 200), 1000))
    privacy_notice = "현재 단계에서는 파일 내용을 읽지 않고, 폴더와 파일의 이름/크기/수정일만 확인합니다."
    items: List[Dict[str, Any]] = []
    inaccessible = 0

    def sort_key(entry: Path):
        """Order folders before files without failing on unreadable entries."""
        # is_dir() can propagate OSError for a single bad child. Such a
        # child is sorted with the files here and reported individually by
        # the per-item handler below, instead of failing the whole listing.
        try:
            is_directory = entry.is_dir()
        except OSError:
            is_directory = False
        return (not is_directory, entry.name.lower())

    try:
        children = sorted(root.iterdir(), key=sort_key)
    except PermissionError as exc:
        return {
            "path": str(root),
            "items": [],
            "error": f"접근 권한 없음: {exc}",
            "privacy_notice": privacy_notice,
        }

    for child in children[:max_items]:
        try:
            is_dir = child.is_dir()
            stat = child.stat()
            reason = (
                _excluded_directory_reason(child, root=root, os_type=os_type)
                if is_dir
                else _sensitive_file_reason(child, root=root)
            )
            items.append(
                {
                    "name": child.name,
                    "path": str(child),
                    "type": "directory" if is_dir else "file",
                    "extension": "" if is_dir else child.suffix.lower(),
                    "size_bytes": None if is_dir else stat.st_size,
                    "modified_at": _safe_iso_from_stat_mtime(stat.st_mtime),
                    "hidden": _is_hidden_path(child, root),
                    "accessible": True,
                    "excluded_reason": reason,
                }
            )
        except OSError as exc:
            # Permission problems get a fixed reason code; any other OS
            # error is reported with its own message.
            inaccessible += 1
            items.append(
                {
                    "name": child.name,
                    "path": str(child),
                    "type": "unknown",
                    "accessible": False,
                    "excluded_reason": (
                        "permission_denied"
                        if isinstance(exc, PermissionError)
                        else str(exc)
                    ),
                }
            )

    return {
        "path": str(root),
        "os_type": os_type,
        "items": items,
        "truncated": len(children) > max_items,
        "inaccessible": inaccessible,
        "warning": _root_warning(root, os_type),
        "privacy_notice": privacy_notice,
    }
|
|
177
|
+
|
|
178
|
+
def _iter_local_scan_entries(
    self, root: Path, *, max_files: int
) -> Iterable[Dict[str, Any]]:
    """Walk ``root`` with an explicit stack and yield one event per entry.

    Every yielded dict has ``kind`` and ``path``; all kinds except
    ``"file"`` also carry ``reason``:

    * ``"inaccessible_dir"`` - a directory whose children could not be listed.
    * ``"excluded"`` - a symlink, or an entry that is neither a directory
      nor a regular file.
    * ``"excluded_dir"`` - a directory rejected by
      ``_excluded_directory_reason``; it is not descended into.
    * ``"inaccessible_file"`` - an entry that could not be inspected.
    * ``"file"`` - a regular file, with its ``stat`` result attached.
    * ``"limit_reached"`` - emitted once in place of the file that would
      exceed ``max_files``; iteration stops afterwards.

    Args:
        root: Directory to start from.
        max_files: Maximum number of ``"file"`` events to yield.
    """
    os_type = _current_os_type()

    def sort_key(entry: Path):
        """Order folders before files without failing on unreadable entries."""
        # is_dir() can propagate OSError for a single bad child. Treat that
        # child as a non-directory for ordering, so the per-child handler
        # below reports it and its readable siblings are still visited.
        try:
            is_directory = entry.is_dir()
        except OSError:
            is_directory = False
        return (not is_directory, entry.name.lower())

    stack = [root]
    files_seen = 0
    while stack:
        current = stack.pop()
        try:
            children = sorted(current.iterdir(), key=sort_key)
        except PermissionError as exc:
            yield {
                "kind": "inaccessible_dir",
                "path": current,
                "reason": f"permission_denied: {exc}",
            }
            continue
        except OSError as exc:
            yield {"kind": "inaccessible_dir", "path": current, "reason": str(exc)}
            continue

        for child in children:
            try:
                # Symlinks are reported but never followed or descended into.
                if child.is_symlink():
                    yield {"kind": "excluded", "path": child, "reason": "symlink"}
                    continue
                if child.is_dir():
                    reason = _excluded_directory_reason(
                        child, root=root, os_type=os_type
                    )
                    if reason:
                        yield {
                            "kind": "excluded_dir",
                            "path": child,
                            "reason": reason,
                        }
                    else:
                        stack.append(child)
                    continue
                if not child.is_file():
                    yield {
                        "kind": "excluded",
                        "path": child,
                        "reason": "not_regular_file",
                    }
                    continue
                stat = child.stat()
            except PermissionError as exc:
                yield {
                    "kind": "inaccessible_file",
                    "path": child,
                    "reason": f"permission_denied: {exc}",
                }
                continue
            except OSError as exc:
                yield {
                    "kind": "inaccessible_file",
                    "path": child,
                    "reason": str(exc),
                }
                continue

            files_seen += 1
            if files_seen > max_files:
                # The file that crosses the cap is not yielded as a file.
                yield {
                    "kind": "limit_reached",
                    "path": child,
                    "reason": "max_files",
                }
                return
            yield {"kind": "file", "path": child, "stat": stat}
|
|
251
|
+
|
|
252
|
+
def _local_file_decision(
    self, path: Path, root: Path, stat: os.stat_result
) -> Dict[str, Any]:
    """Decide from name and size alone whether one file may be indexed.

    Checks run in this order: sensitive-file rule, unsupported category,
    per-category size limit. The first check that fails determines the
    status; a file passing all of them is ``"pending"`` and indexable.

    Args:
        path: The file being judged.
        root: Root passed through to the sensitive-file check.
        stat: Stat result of ``path``; only ``st_size`` is read.

    Returns:
        A dict with ``status``, ``reason``, ``category``, ``parser_type``
        and ``indexable``.
    """
    suffix = path.suffix.lower()
    category = _file_category(suffix)
    parser_type = _parser_type_for_category(category, suffix)

    def verdict(status: str, reason: str, indexable: bool = False) -> Dict[str, Any]:
        # Single place that fixes the key order of the decision dict.
        return {
            "status": status,
            "reason": reason,
            "category": category,
            "parser_type": parser_type,
            "indexable": indexable,
        }

    blocked_because = _sensitive_file_reason(path, root=root)
    if blocked_because:
        return verdict("sensitive_blocked", blocked_because)

    if category == "unsupported":
        return verdict("unsupported", "unsupported_extension")

    max_bytes = _size_limit_for_category(category)
    if stat.st_size > max_bytes:
        return verdict("too_large", f"size>{max_bytes}")

    return verdict("pending", "", True)
|
|
291
|
+
|
|
292
|
+
def audit_local_folder(
    self, path: Path, *, include_ocr: bool = False, max_files: int = 50_000
) -> Dict[str, Any]:
    """Safety-check a folder using metadata only; file bodies are not read.

    Consumes the events of ``_iter_local_scan_entries`` and tallies them by
    status, category and extension, keeping at most 25 sample entries for
    each of the allowed and excluded groups.

    Args:
        path: Folder to audit; ``~`` is expanded and the path is resolved.
        include_ocr: Whether OCR was requested; it only changes the weight
            images get in the time estimate.
        max_files: File cap for the scan. A falsy value falls back to
            50,000 and the result is clamped to 1..200,000.

    Returns:
        A report dict with identification fields, a ``summary`` of counts,
        ``by_status`` / ``by_category`` / ``by_extension`` breakdowns (the
        latter limited to the 40 most common), the sample lists, and the
        ``consent_required`` flags.

    Raises:
        ValueError: If the path does not exist or is not a directory.
    """
    root = Path(path).expanduser().resolve()
    if not root.exists():
        raise ValueError(f"경로가 존재하지 않습니다: {path}")
    if not root.is_dir():
        raise ValueError(f"폴더가 아닙니다: {path}")

    os_type = _current_os_type()
    max_files = max(1, min(int(max_files or 50_000), 200_000))
    sample_cap = 25
    status_tally: Counter = Counter()
    category_tally: Counter = Counter()
    extension_tally: Counter = Counter()
    allowed_samples: List[Dict[str, Any]] = []
    excluded_samples: List[Dict[str, Any]] = []
    total_files = 0
    readable_files = 0
    inaccessible = 0
    excluded_dirs = 0
    limit_reached = False

    def keep_excluded_sample(target: Path, status: str, reason: str) -> None:
        # Record a rejected entry while the sample list still has room.
        if len(excluded_samples) < sample_cap:
            excluded_samples.append(_sample_file(target, root, status, reason))

    for entry in self._iter_local_scan_entries(root, max_files=max_files):
        kind = entry["kind"]
        target = entry["path"]
        if kind == "limit_reached":
            limit_reached = True
            break

        if kind == "excluded_dir":
            excluded_dirs += 1
            keep_excluded_sample(target, "excluded", entry.get("reason", ""))
        elif kind in {"inaccessible_dir", "inaccessible_file"}:
            inaccessible += 1
            status_tally["failed"] += 1
            keep_excluded_sample(target, "failed", entry.get("reason", ""))
        elif kind == "excluded":
            status_tally["excluded"] += 1
            keep_excluded_sample(target, "excluded", entry.get("reason", ""))
        elif kind == "file":
            total_files += 1
            decision = self._local_file_decision(target, root, entry["stat"])
            # Every scanned file counts toward category and extension,
            # whether or not it ends up indexable.
            category_tally[decision["category"]] += 1
            extension_tally[target.suffix.lower() or "(none)"] += 1
            if decision["indexable"]:
                readable_files += 1
                status_tally["readable"] += 1
                if len(allowed_samples) < sample_cap:
                    allowed_samples.append(_sample_file(target, root, "readable"))
            else:
                status_tally[decision["status"]] += 1
                keep_excluded_sample(target, decision["status"], decision["reason"])
        # Any other event kind is ignored.

    # Rough processing-time estimate built from per-category weights.
    doc_weight = (
        category_tally["pdf"] * 1.4
        + category_tally["document"] * 0.9
        + category_tally["slide_deck"] * 1.0
    )
    sheet_weight = category_tally["spreadsheet"] * 0.6
    ocr_weight = category_tally["image"] * (1.8 if include_ocr else 0.1)
    estimated_seconds = round(
        readable_files * 0.04 + doc_weight + sheet_weight + ocr_weight, 1
    )

    summary = {
        "total_files": total_files,
        "readable_files": readable_files,
        "excluded_files": int(
            status_tally["excluded"]
            + status_tally["sensitive_blocked"]
            + status_tally["too_large"]
            + status_tally["unsupported"]
        ),
        "sensitive_files": int(status_tally["sensitive_blocked"]),
        "too_large_files": int(status_tally["too_large"]),
        "unsupported_files": int(status_tally["unsupported"]),
        "image_ocr_candidates": int(category_tally["image"]),
        "inaccessible_items": inaccessible,
        "excluded_dirs": excluded_dirs,
        "estimated_seconds": estimated_seconds,
        "storage_root": str(self.db_path.parent),
        "limit_reached": limit_reached,
    }
    consent_required = {
        "knowledge_source": True,
        "image_ocr": bool(category_tally["image"]),
        "watch": True,
        "sensitive_files_default_excluded": True,
    }
    return {
        "path": str(root),
        "source_id": f"source:{_path_fingerprint(root)}",
        "os_type": os_type,
        "drive_id": _drive_id_for_path(root),
        "warning": _root_warning(root, os_type),
        "privacy_notice": "현재 단계에서는 파일 내용을 읽지 않고, 폴더와 파일의 이름/크기/수정일만 확인합니다.",
        "include_ocr_requested": bool(include_ocr),
        "summary": summary,
        "by_status": dict(status_tally),
        "by_category": dict(category_tally),
        "by_extension": dict(extension_tally.most_common(40)),
        "allowed_samples": allowed_samples,
        "excluded_samples": excluded_samples,
        "consent_required": consent_required,
    }
|
|
420
|
+
|
|
421
|
+
def local_sources(self) -> Dict[str, Any]:
    """List all rows of ``knowledge_sources`` with per-status file counts.

    Returns:
        ``{"sources": [...]}`` ordered by ``updated_at`` descending, then
        ``id``. Each source carries its stored columns (with ``include_ocr``
        and ``watch_enabled`` as booleans and ``consent_json`` decoded into
        ``consent``) plus ``file_status``, a mapping of status to the number
        of ``local_file_index`` rows for that source (empty when none).
    """

    def describe(row) -> Dict[str, Any]:
        # Convert one knowledge_sources row into its API representation.
        return {
            "id": row["id"],
            "root_path": row["root_path"],
            "os_type": row["os_type"],
            "drive_id": row["drive_id"],
            "label": row["label"],
            "status": row["status"],
            "include_ocr": bool(row["include_ocr"]),
            "watch_enabled": bool(row["watch_enabled"]),
            "consent": _safe_loads(row["consent_json"]),
            "created_at": row["created_at"],
            "updated_at": row["updated_at"],
            "last_scanned_at": row["last_scanned_at"],
        }

    with self._connect() as conn:
        source_cursor = conn.execute(
            """
            SELECT id, root_path, os_type, drive_id, label, status, include_ocr,
                   watch_enabled, consent_json, created_at, updated_at, last_scanned_at
            FROM knowledge_sources
            ORDER BY updated_at DESC, id ASC
            """
        )
        sources = []
        for source_row in source_cursor:
            sources.append(describe(source_row))
        tally_rows = conn.execute(
            "SELECT source_id, status, COUNT(*) AS count FROM local_file_index GROUP BY source_id, status"
        ).fetchall()

    # Group the (source_id, status, count) rows by source.
    per_source: Dict[str, Dict[str, int]] = {}
    for tally in tally_rows:
        bucket = per_source.setdefault(tally["source_id"], {})
        bucket[tally["status"]] = tally["count"]

    for source in sources:
        source["file_status"] = per_source.get(source["id"], {})
    return {"sources": sources}
|
|
456
|
+
|
|
457
|
+
def set_local_source_watch(self, source_id: str, enabled: bool) -> Dict[str, Any]:
    """Store the watch flag for one registered knowledge source.

    Args:
        source_id: Id of the source; surrounding whitespace is ignored.
        enabled: New watch state, stored as 1 or 0.

    Returns:
        ``{"source_id": <normalized id>, "watch_enabled": <bool>}``.

    Raises:
        ValueError: If ``source_id`` is empty or no such source exists.
    """
    normalized_id = str(source_id or "").strip()
    if not normalized_id:
        raise ValueError("source_id required")

    flag = 1 if enabled else 0
    with self._connect() as conn:
        existing = conn.execute(
            "SELECT id FROM knowledge_sources WHERE id=?",
            (normalized_id,),
        ).fetchone()
        if not existing:
            raise ValueError(f"knowledge source not found: {normalized_id}")
        # updated_at is refreshed together with the flag.
        conn.execute(
            "UPDATE knowledge_sources SET watch_enabled=?, updated_at=? WHERE id=?",
            (flag, _now(), normalized_id),
        )

    result: Dict[str, Any] = {"source_id": normalized_id}
    result["watch_enabled"] = bool(enabled)
    return result
|
|
473
|
+
|
|
474
|
+
def remove_local_source(self, source_id: str) -> Dict[str, Any]:
    """Remove one approved local source and its derived graph projection.

    User files are never touched: only the ``local_file_index`` rows, the
    ``knowledge_sources`` row, and the graph data removed through
    ``_delete_local_file_graph`` / ``_cleanup_local_graph_orphans`` are
    affected.

    Args:
        source_id: Id of the source; surrounding whitespace is ignored.

    Returns:
        A dict with the normalized ``source_id``, the source's
        ``root_path`` and ``removed_graph_nodes`` (number of indexed files
        that had a graph node id).

    Raises:
        ValueError: If ``source_id`` is empty or no such source exists.
    """
    wanted = str(source_id or "").strip()
    if not wanted:
        raise ValueError("source_id required")

    removed = 0
    with self._connect() as conn:
        source = conn.execute(
            "SELECT id, root_path FROM knowledge_sources WHERE id=?",
            (wanted,),
        ).fetchone()
        if not source:
            raise ValueError(f"knowledge source not found: {wanted}")

        indexed = conn.execute(
            "SELECT graph_node_id FROM local_file_index WHERE source_id=? AND graph_node_id IS NOT NULL",
            (wanted,),
        ).fetchall()
        for record in indexed:
            node_id = record["graph_node_id"]
            # Empty ids are skipped as well as the NULLs filtered in SQL.
            if not node_id:
                continue
            self._delete_local_file_graph(conn, node_id)
            removed += 1

        # Index rows first, then the source row, then orphan cleanup.
        conn.execute("DELETE FROM local_file_index WHERE source_id=?", (wanted,))
        conn.execute("DELETE FROM knowledge_sources WHERE id=?", (wanted,))
        self._cleanup_local_graph_orphans(conn, wanted)

    return {
        "source_id": wanted,
        "root_path": source["root_path"],
        "removed_graph_nodes": removed,
    }
|
|
508
|
+
|
|
509
|
+
def _extract_local_file_text(
    self, path: Path, category: str, *, include_ocr: bool
) -> Tuple[str, Dict[str, Any]]:
    """Read the text of one local file with the reader matching its type.

    Args:
        path: File to read.
        category: Category label of the file. ``"text"``, ``"code"`` and
            ``"image"`` select a branch directly; otherwise the file
            extension (``.csv``, ``.pdf``, ``.docx``, ``.xlsx``, ``.pptx``)
            decides.
        include_ocr: Run OCR on images when true; otherwise only image
            metadata is collected and the text stays empty.

    Returns:
        ``(text, meta)``. ``text`` is cut to 200,000 characters and is empty
        when no branch matches. ``meta`` always has ``parser`` plus
        format-specific counters.

    Errors raised by the readers (including missing optional packages) are
    not handled here, except OCR failures, which are stored in
    ``meta["ocr_error"]``.
    """
    ext = path.suffix.lower()
    meta: Dict[str, Any] = {"parser": _parser_type_for_category(category, ext)}
    text = ""
    if category in {"text", "code"} or ext == ".csv":
        # Undecodable bytes are replaced rather than failing the file.
        text = path.read_text(encoding="utf-8", errors="replace")
    elif ext == ".pdf":
        import pdfplumber

        with pdfplumber.open(str(path)) as pdf:
            meta["pages"] = len(pdf.pages)
            text = "\n\n".join((page.extract_text() or "") for page in pdf.pages)
    elif ext == ".docx":
        from docx import Document

        doc = Document(str(path))
        paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
        table_lines = []
        for table in doc.tables:
            for row in table.rows:
                cells = [_clean_text(cell.text) for cell in row.cells]
                if any(cells):
                    table_lines.append("\t".join(cells))
        meta["paragraphs"] = len(paragraphs)
        meta["tables"] = len(doc.tables)
        meta["table_rows"] = len(table_lines)
        text = "\n\n".join([*paragraphs, *table_lines])
    elif ext == ".xlsx":
        from openpyxl import load_workbook

        wb = load_workbook(str(path), read_only=True, data_only=True)
        try:
            rows_all = []
            non_empty_rows = 0
            non_empty_cells = 0
            char_count = 0
            for ws in wb.worksheets:
                sheet_rows = []
                for row in ws.iter_rows(values_only=True):
                    cells = [
                        str(cell).strip() if cell is not None else "" for cell in row
                    ]
                    if not any(cells):
                        continue
                    line = "\t".join(cells)
                    non_empty_rows += 1
                    non_empty_cells += sum(1 for cell in cells if cell)
                    sheet_rows.append(line)
                    char_count += len(line) + 1
                    # Stop reading once the text cap is exceeded.
                    if char_count > 200_000:
                        break
                if sheet_rows:
                    rows_all.append(f"[Sheet: {ws.title}]")
                    rows_all.extend(sheet_rows)
                if char_count > 200_000:
                    break
            meta["sheets"] = len(wb.worksheets)
        finally:
            # A read-only workbook keeps the file open until closed
            # explicitly; release the handle even when reading fails.
            wb.close()
        meta["rows"] = non_empty_rows
        meta["cells"] = non_empty_cells
        text = "\n".join(rows_all)
    elif ext == ".pptx":
        from pptx import Presentation

        prs = Presentation(str(path))
        slides_text = []
        for index, slide in enumerate(prs.slides, 1):
            parts = []
            for shape in slide.shapes:
                if getattr(shape, "has_text_frame", False):
                    slide_text = shape.text_frame.text.strip()
                    if slide_text:
                        parts.append(slide_text)
            if parts:
                slides_text.append(f"[Slide {index}]\n" + "\n".join(parts))
        meta["slides"] = len(prs.slides)
        meta["text_slides"] = len(slides_text)
        text = "\n\n".join(slides_text)
    elif category == "image":
        from PIL import Image

        with Image.open(str(path)) as image:
            meta.update(
                {
                    "width": image.width,
                    "height": image.height,
                    "format": image.format,
                    "mode": image.mode,
                    "ocr_enabled": bool(include_ocr),
                }
            )
            if include_ocr:
                try:
                    import pytesseract

                    text = pytesseract.image_to_string(image)
                    meta["ocr_chars"] = len(text)
                except Exception as exc:  # pragma: no cover - depends on local OCR runtime
                    # OCR is best-effort: keep the metadata, note the error.
                    meta["ocr_error"] = str(exc)
                    text = ""
    return text[:200_000], meta
|
|
612
|
+
|
|
613
|
+
def _ensure_local_hierarchy(
    self,
    conn: sqlite3.Connection,
    *,
    source_id: str,
    root: Path,
    file_path: Path,
    os_type: str,
    drive_id: str,
) -> str:
    """Upsert the Computer -> Drive -> Folder chain above ``file_path``.

    One node is upserted for this machine, one for the drive, one for the
    scan root, and one for every directory between the root and the file's
    parent. Each level is linked to the next with a "포함함" edge.

    Args:
        conn: Open SQLite connection passed to every upsert.
        source_id: Knowledge-source id; it salts the folder node ids.
        root: Scan root directory of the source.
        file_path: File whose ancestor folders must exist in the graph.
        os_type: Operating-system label stored on Computer/Drive nodes.
        drive_id: Drive identifier stored on (and naming) the Drive node.

    Returns:
        Node id of the folder that directly contains ``file_path``. This is
        the root folder id when the file sits directly in ``root`` or is
        not located under ``root`` at all.
    """

    def contain(owner_id: str, member_id: str) -> None:
        # Every hierarchy link shares the same relation label and provenance.
        self._upsert_edge(
            conn, owner_id, member_id, "포함함", metadata={"source": "local_scan"}
        )

    host_name = platform.node() or "내 컴퓨터"
    host_id = f"computer:{_slug(host_name)}"
    drive_digest = _sha256_text(f"{os_type}:{drive_id}")[:24]
    drive_node = f"drive:{drive_digest}"
    root_digest = _sha256_text(f"{source_id}:root")[:24]
    root_node = f"folder:{root_digest}"

    # Machine and drive levels.
    self._upsert_node(
        conn, host_id, "Computer", host_name, metadata={"os_type": os_type}
    )
    self._upsert_node(
        conn,
        drive_node,
        "Drive",
        drive_id,
        metadata={"os_type": os_type, "drive_id": drive_id},
    )
    contain(host_id, drive_node)

    # Scan root; falls back to the full path as title when it has no name
    # component (e.g. a filesystem root).
    root_text = str(root)
    self._upsert_node(
        conn,
        root_node,
        "Folder",
        root.name or root_text,
        summary=root_text,
        metadata={"source_id": source_id, "path": root_text, "root": True},
    )
    contain(drive_node, root_node)

    # Directories between the root and the file's parent, outermost first.
    try:
        nested_parts = file_path.parent.relative_to(root).parts
    except ValueError:
        # File is not under the root: attach it straight to the root folder.
        nested_parts = ()

    containing_id = root_node
    walked = root
    for segment in nested_parts:
        walked = walked / segment
        walked_text = str(walked)
        segment_digest = _sha256_text(f"{source_id}:{walked.as_posix()}")[:24]
        segment_id = f"folder:{segment_digest}"
        self._upsert_node(
            conn,
            segment_id,
            "Folder",
            segment,
            summary=walked_text,
            metadata={
                "source_id": source_id,
                "path": walked_text,
                "root": False,
            },
        )
        contain(containing_id, segment_id)
        containing_id = segment_id
    return containing_id
|
|
688
|
+
|
|
689
|
+
def _upsert_local_file_index(
    self,
    conn: sqlite3.Connection,
    *,
    source_id: str,
    root: Path,
    file_path: Path,
    stat: Optional[os.stat_result],
    os_type: str,
    drive_id: str,
    status: str,
    parser_type: str,
    sha256: Optional[str] = None,
    graph_node_id: Optional[str] = None,
    error_message: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> str:
    """Insert or refresh the ``local_file_index`` row for one scanned file.

    Rows are keyed by ``(source_id, relative_path)``; on conflict every
    column except the id and the key itself is overwritten with the new
    values.

    Args:
        conn: Open SQLite connection.
        source_id: Knowledge-source id the file belongs to.
        root: Scan root; used to derive the relative path.
        file_path: Absolute path of the file.
        stat: Stat result for the file, or ``None`` when unavailable (size
            is then stored as NULL and the modification time as "").
        os_type: Operating-system label.
        drive_id: Drive identifier.
        status: Scan outcome. ``"indexed"`` also stamps ``last_indexed_at``
            and ``"deleted"`` sets the ``deleted`` flag.
        parser_type: Name of the parser selected for the file.
        sha256: Content digest, when one was computed.
        graph_node_id: Graph node created for the file, if any.
        error_message: Failure or skip explanation, if any.
        metadata: Extra payload serialised with ``_json``.

    Returns:
        The id computed for the index row.
    """
    try:
        rel = file_path.relative_to(root).as_posix()
    except ValueError:
        # Outside the root: fall back to the bare file name as the key.
        rel = file_path.name
    row_id = "local-index:" + _sha256_text(f"{source_id}:{rel}")[:24]
    scanned_at = _now()

    if stat:
        size_bytes = stat.st_size
        mtime_iso = _safe_iso_from_stat_mtime(stat.st_mtime)
    else:
        size_bytes = None
        mtime_iso = ""

    # Only a successful index run counts as "indexed at".
    indexed_at = scanned_at if status == "indexed" else None
    tombstone = 1 if status == "deleted" else 0

    statement = """
        INSERT INTO local_file_index(
            id, source_id, os_type, drive_id, root_path, file_path, relative_path,
            file_name, extension, size_bytes, modified_at, sha256, last_scanned_at,
            last_indexed_at, parser_type, status, error_message, graph_node_id,
            deleted, metadata_json
        )
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        ON CONFLICT(source_id, relative_path) DO UPDATE SET
            os_type=excluded.os_type,
            drive_id=excluded.drive_id,
            root_path=excluded.root_path,
            file_path=excluded.file_path,
            file_name=excluded.file_name,
            extension=excluded.extension,
            size_bytes=excluded.size_bytes,
            modified_at=excluded.modified_at,
            sha256=excluded.sha256,
            last_scanned_at=excluded.last_scanned_at,
            last_indexed_at=excluded.last_indexed_at,
            parser_type=excluded.parser_type,
            status=excluded.status,
            error_message=excluded.error_message,
            graph_node_id=excluded.graph_node_id,
            deleted=excluded.deleted,
            metadata_json=excluded.metadata_json
        """
    # Positional values, in the same order as the column list above.
    values = (
        row_id,
        source_id,
        os_type,
        drive_id,
        str(root),
        str(file_path),
        rel,
        file_path.name,
        file_path.suffix.lower(),
        size_bytes,
        mtime_iso,
        sha256,
        scanned_at,
        indexed_at,
        parser_type,
        status,
        error_message,
        graph_node_id,
        tombstone,
        _json(metadata),
    )
    conn.execute(statement, values)
    return row_id
|
|
766
|
+
|
|
767
|
+
def _upsert_local_file_node(
    self,
    conn: sqlite3.Connection,
    *,
    source_id: str,
    root: Path,
    file_path: Path,
    stat: os.stat_result,
    os_type: str,
    drive_id: str,
    sha256: str,
    category: str,
    parser_type: str,
    text: str,
    parser_meta: Dict[str, Any],
) -> str:
    """Create or rebuild the graph sub-tree for one local file.

    Steps, in order:

    1. Ensure the Computer/Drive/Folder hierarchy above the file exists.
    2. Purge what a previous index run derived from this file: its chunk
       rows, the nodes it owned, and all of its outgoing edges.
    3. Upsert the file node and link it to its parent folder.
    4. Derive child content from ``text``: an OCR node for images, chunk
       nodes/rows, concept nodes with "언급함" edges, concept-to-concept
       relation edges, and semantic-item nodes.

    Args:
        conn: Open SQLite connection.
        source_id: Knowledge-source id the file belongs to.
        root: Scan root of the source.
        file_path: Absolute path of the file.
        stat: Stat result; size and mtime are stored in the node metadata.
        os_type: Operating-system label (forwarded to the hierarchy).
        drive_id: Drive identifier (forwarded to the hierarchy).
        sha256: Content digest stored in the node metadata.
        category: File category; selects the node type and enables the
            OCR node when it equals ``"image"``.
        parser_type: Name of the parser that produced ``text``.
        text: Extracted text; cleaned again here before use.
        parser_meta: Parser diagnostics stored under ``metadata["parser"]``.

    Returns:
        The node id of the file.

    Raises:
        ValueError: If the cleaned text is empty.
    """
    text = _clean_text(text)
    if not text:
        raise ValueError("텍스트 추출 결과가 비어 있습니다.")
    try:
        relative_path = file_path.relative_to(root).as_posix()
    except ValueError:
        relative_path = file_path.name
    file_node_id = f"local-file:{_sha256_text(f'{source_id}:{relative_path}')[:24]}"
    parent_folder_id = self._ensure_local_hierarchy(
        conn,
        source_id=source_id,
        root=root,
        file_path=file_path,
        os_type=os_type,
        drive_id=drive_id,
    )

    # --- Purge derivatives of a previous index run -------------------------
    # A node is owned by this file when it is a Chunk/ImageText/Section
    # child, or when it was auto-extracted from this very file (semantic
    # items carry ``source_node``). The latter used to survive re-indexing
    # as edge-less orphans whenever the item disappeared from the file;
    # _delete_local_file_graph already treats ``source_node`` as ownership.
    # Shared concept nodes carry no ``source_node`` and are left alone.
    linked_rows = conn.execute(
        """
        SELECT e.to_node AS id, n.type AS type, n.metadata_json AS metadata_json
        FROM edges e
        JOIN nodes n ON n.id=e.to_node
        WHERE e.from_node=?
        """,
        (file_node_id,),
    ).fetchall()
    owned_types = {"Chunk", "ImageText", "Section"}
    stale_ids: List[str] = []
    seen_ids: set = set()
    for row in linked_rows:
        node_id = row["id"]
        if node_id in seen_ids:
            continue
        owned = row["type"] in owned_types
        if not owned:
            node_meta = _safe_loads(row["metadata_json"])
            owned = bool(
                isinstance(node_meta, dict)
                and node_meta.get("auto_extracted")
                and node_meta.get("source_node") == file_node_id
            )
        if owned:
            seen_ids.add(node_id)
            stale_ids.append(node_id)

    conn.execute("DELETE FROM chunks WHERE source_node=?", (file_node_id,))
    if stale_ids:
        placeholders = ",".join("?" * len(stale_ids))
        conn.execute(f"DELETE FROM nodes WHERE id IN ({placeholders})", stale_ids)
        self._v2_delete_nodes(conn, stale_ids)
    conn.execute("DELETE FROM edges WHERE from_node=?", (file_node_id,))
    self._v2_delete_edges_from(conn, file_node_id)

    # --- File node ----------------------------------------------------------
    metadata = {
        "source": "local_folder",
        "source_id": source_id,
        "root_path": str(root),
        "file_path": str(file_path),
        "relative_path": relative_path,
        "filename": file_path.name,
        "ext": file_path.suffix.lower(),
        "category": category,
        "parser_type": parser_type,
        "bytes": stat.st_size,
        "modified_at": _safe_iso_from_stat_mtime(stat.st_mtime),
        "sha256": sha256,
        "parser": parser_meta,
    }
    self._upsert_node(
        conn,
        file_node_id,
        _node_type_for_category(category),
        file_path.name,
        summary=text[:700],
        metadata=metadata,
        raw=metadata,
    )
    self._upsert_edge(
        conn,
        parent_folder_id,
        file_node_id,
        "포함함",
        weight=1.0,
        metadata={"source": "local_scan"},
    )

    # --- OCR node (images only; text is known to be non-empty here) --------
    if category == "image":
        image_text_id = f"imagetext:{_sha256_text(f'{file_node_id}:ocr')[:24]}"
        self._upsert_node(
            conn,
            image_text_id,
            "ImageText",
            f"{file_path.name} OCR",
            summary=_clean_text(text)[:700],
            metadata={
                "source_node": file_node_id,
                "source_id": source_id,
                "chars": len(text),
            },
        )
        self._upsert_edge(
            conn,
            file_node_id,
            image_text_id,
            "포함함",
            weight=0.8,
            metadata={"source": "ocr"},
        )

    # --- Chunks: one graph node plus one chunks-table row each -------------
    for index, chunk in enumerate(_chunks(text)):
        chunk_id = f"chunk:{_sha256_text(f'{file_node_id}:{index}:{chunk}')[:24]}"
        self._upsert_node(
            conn,
            chunk_id,
            "Chunk",
            f"{file_path.name} chunk {index + 1}",
            summary=chunk[:500],
            metadata={
                "index": index,
                "source_node": file_node_id,
                "source_id": source_id,
            },
        )
        self._upsert_chunk(
            conn,
            chunk_id=chunk_id,
            source_node=file_node_id,
            text=chunk,
            metadata={
                "index": index,
                "source_node": file_node_id,
                "source_id": source_id,
            },
        )
        self._upsert_edge(
            conn,
            file_node_id,
            chunk_id,
            "포함함",
            weight=0.7,
            metadata={"source": "local_scan"},
        )

    # --- Concepts: ids are derived from the concept itself, not the file ---
    concepts = _extract_concepts(text, limit=18)
    concept_ids: Dict[str, str] = {}
    for concept in concepts:
        node_t = _classify_node_type(concept, text)
        concept_id = f"{node_t.lower()}:{_slug(concept)}"
        concept_ids[concept.lower()] = concept_id
        self._upsert_node(
            conn,
            concept_id,
            node_t,
            concept,
            metadata={
                "auto_extracted": True,
                "source": "local_folder",
                "source_id": source_id,
            },
        )
        self._upsert_edge(
            conn,
            file_node_id,
            concept_id,
            "언급함",
            weight=0.75,
            metadata={"source": "local_scan"},
        )

    # --- Relations between two distinct concepts found in this file --------
    for triple in _extract_triples(text, concepts, limit=20):
        subj_id = concept_ids.get(triple["subject"].lower())
        obj_id = concept_ids.get(triple["object"].lower())
        if subj_id and obj_id and subj_id != obj_id:
            self._upsert_edge(
                conn,
                subj_id,
                obj_id,
                triple["relation"],
                weight=0.9,
                metadata={
                    "context": triple.get("context", "")[:240],
                    "source_id": source_id,
                },
            )

    # --- Semantic items: ids are salted with the file node id --------------
    for item in _semantic_items(text):
        sem_type = item["type"]
        sem_title = item["title"]
        sem_digest = _sha256_text(f"{file_node_id}:{sem_type}:{sem_title}")[:24]
        sem_id = f"{sem_type.lower()}:{sem_digest}"
        self._upsert_node(
            conn,
            sem_id,
            sem_type,
            sem_title,
            summary=item["summary"],
            metadata={
                "auto_extracted": True,
                "source_node": file_node_id,
                "filename": file_path.name,
            },
            raw=item,
        )
        self._upsert_edge(conn, file_node_id, sem_id, "포함함", weight=0.9)

    return file_node_id
|
|
970
|
+
|
|
971
|
+
def _delete_local_file_graph(
    self, conn: sqlite3.Connection, file_node_id: Optional[str]
) -> None:
    """Remove a file node and everything that was derived only from it.

    Nodes reached by an outgoing edge of the file fall into two groups:

    * owned - Chunk/ImageText/Section nodes, or nodes whose metadata
      names this file as ``source_node``; always deleted.
    * shared candidates - auto-extracted ``local_folder`` nodes; deleted
      only if, once the file is gone, every edge still touching them
      connects two candidates of this same file.

    Afterwards the source's now-empty folders are pruned via
    ``_cleanup_local_graph_orphans`` when the file node recorded a
    ``source_id``.

    Args:
        conn: Open SQLite connection.
        file_node_id: Node id of the file; a falsy value is a no-op.
    """
    if not file_node_id:
        return

    def purge(doomed: set) -> None:
        # Delete the given nodes together with their chunk rows and any
        # edge touching them.
        if not doomed:
            return
        marks = ",".join("?" * len(doomed))
        ids = list(doomed)
        conn.execute(f"DELETE FROM chunks WHERE source_node IN ({marks})", ids)
        conn.execute(
            f"DELETE FROM edges WHERE from_node IN ({marks}) OR to_node IN ({marks})",
            ids * 2,
        )
        conn.execute(f"DELETE FROM nodes WHERE id IN ({marks})", ids)
        self._v2_delete_nodes(conn, ids)

    # Remember the owning source before the file node disappears.
    own_row = conn.execute(
        "SELECT metadata_json FROM nodes WHERE id=?",
        (file_node_id,),
    ).fetchone()
    owning_source = (
        _safe_loads(own_row["metadata_json"]).get("source_id") if own_row else None
    )

    # Classify every node the file points at.
    derived_types = {"Chunk", "ImageText", "Section"}
    owned: set = set()
    shared: set = set()
    for linked in conn.execute(
        """
        SELECT n.id, n.type, n.metadata_json
        FROM edges e
        JOIN nodes n ON n.id=e.to_node
        WHERE e.from_node=?
        """,
        (file_node_id,),
    ).fetchall():
        meta = _safe_loads(linked["metadata_json"])
        if (
            linked["type"] in derived_types
            or meta.get("source_node") == file_node_id
        ):
            owned.add(linked["id"])
            continue
        if meta.get("auto_extracted") and meta.get("source") == "local_folder":
            shared.add(linked["id"])

    # Drop the file itself first so its edges no longer count below.
    conn.execute("DELETE FROM chunks WHERE source_node=?", (file_node_id,))
    conn.execute(
        "DELETE FROM edges WHERE from_node=? OR to_node=?",
        (file_node_id, file_node_id),
    )
    conn.execute("DELETE FROM nodes WHERE id=?", (file_node_id,))
    self._v2_delete_nodes(conn, [file_node_id])

    purge(owned)

    # A shared candidate survives as soon as one remaining edge reaches
    # outside this file's candidate set.
    orphaned: set = set()
    for candidate in shared:
        touching = conn.execute(
            "SELECT from_node, to_node FROM edges WHERE from_node=? OR to_node=?",
            (candidate, candidate),
        ).fetchall()
        still_referenced = any(
            edge["from_node"] not in shared or edge["to_node"] not in shared
            for edge in touching
        )
        if not still_referenced:
            orphaned.add(candidate)
    purge(orphaned)

    if owning_source:
        self._cleanup_local_graph_orphans(conn, str(owning_source))
|
|
1051
|
+
|
|
1052
|
+
def _cleanup_local_graph_orphans(
    self, conn: sqlite3.Connection, source_id: str
) -> None:
    """Prune hierarchy nodes that no longer contain anything.

    Folder nodes of ``source_id`` without outgoing edges are deleted
    repeatedly until none is left, so a chain of emptied parents collapses
    bottom-up. Then every Drive node, followed by every Computer node,
    without outgoing edges is deleted; that last step is not restricted to
    ``source_id``.

    Args:
        conn: Open SQLite connection.
        source_id: Knowledge source whose folders are examined.
    """

    def is_childless(node_id: str) -> bool:
        outgoing = conn.execute(
            "SELECT 1 FROM edges WHERE from_node=? LIMIT 1",
            (node_id,),
        ).fetchone()
        return not outgoing

    def drop(node_ids: list) -> None:
        # Remove the nodes and every edge that touches them.
        marks = ",".join("?" * len(node_ids))
        conn.execute(
            f"DELETE FROM edges WHERE from_node IN ({marks}) OR to_node IN ({marks})",
            node_ids * 2,
        )
        conn.execute(f"DELETE FROM nodes WHERE id IN ({marks})", node_ids)
        self._v2_delete_nodes(conn, node_ids)

    def empty_folders() -> list:
        folders = conn.execute(
            "SELECT id, metadata_json FROM nodes WHERE type='Folder'"
        ).fetchall()
        return [
            folder["id"]
            for folder in folders
            if _safe_loads(folder["metadata_json"]).get("source_id") == source_id
            and is_childless(folder["id"])
        ]

    # Deleting a leaf can turn its parent into a leaf, hence the re-scan.
    leaves = empty_folders()
    while leaves:
        drop(leaves)
        leaves = empty_folders()

    # Drives are handled before computers so a computer whose last drive was
    # just removed is caught in the same call.
    for level in ("Drive", "Computer"):
        level_rows = conn.execute(
            "SELECT id FROM nodes WHERE type=?", (level,)
        ).fetchall()
        childless = [entry["id"] for entry in level_rows if is_childless(entry["id"])]
        if childless:
            drop(childless)
|
|
1102
|
+
|
|
1103
|
+
def _local_file_index_has_extracted_text(self, row: sqlite3.Row) -> bool:
    """Tell whether an index row records a non-empty text extraction.

    Reads ``metadata_json -> parser -> extracted_chars`` from the row.

    Args:
        row: A ``local_file_index`` row exposing ``metadata_json``.

    Returns:
        ``True`` only when ``extracted_chars`` converts to an integer
        greater than zero; ``False`` when the metadata or parser section is
        missing or malformed.
    """
    payload = _safe_loads(row["metadata_json"])
    if isinstance(payload, dict):
        parser_info = payload.get("parser")
    else:
        parser_info = {}

    if isinstance(parser_info, dict):
        try:
            extracted = int(parser_info.get("extracted_chars") or 0)
        except (TypeError, ValueError):
            return False
        return extracted > 0
    return False
|
|
1112
|
+
|
|
1113
|
+
def index_local_folder(
    self,
    path: Path,
    *,
    include_ocr: bool = False,
    watch_enabled: bool = False,
    user_email: Optional[str] = None,
    consent: Optional[Dict[str, Any]] = None,
    max_files: int = 5_000,
) -> Dict[str, Any]:
    """Read approved files from a local folder and connect them to Graph RAG.

    The folder is registered (or refreshed) as a knowledge source and then
    walked with ``_iter_local_scan_entries``. Each file ends up in exactly
    one state, recorded in ``local_file_index``:

    * not indexable (per ``_local_file_decision``): any previous graph for
      the file is removed and the decision's status is stored;
    * unchanged: same size and mtime as an already indexed row, or same
      SHA-256 after reading it; the existing graph is kept;
    * indexed: text is extracted and the file graph is (re)built;
    * ``skipped_empty_text``: extraction produced no text;
    * ``failed``: reading or indexing raised; the error is collected.

    When the scan was not truncated by ``max_files``, index rows whose file
    was not seen are tombstoned and their graph removed. Rows tombstoned by
    an earlier scan are left untouched and not counted again.

    Args:
        path: Folder to index; ``~`` is expanded and the path resolved.
        include_ocr: Forwarded to text extraction and stored on the source.
        watch_enabled: Stored on the source and in the consent record.
        user_email: Recorded as ``approved_by`` in the consent record.
        consent: Extra consent fields; they override the defaults.
        max_files: Scan limit, clamped to the range 1..50_000 (a falsy
            value means 5_000).

    Returns:
        A summary dict with ``status``, ``source``, ``counts``, up to 100
        ``indexed_nodes``, up to 50 ``errors`` and a ``notice``.

    Raises:
        ValueError: If the path does not exist or is not a directory.
    """
    root = Path(path).expanduser().resolve()
    if not root.exists():
        raise ValueError(f"경로가 존재하지 않습니다: {path}")
    if not root.is_dir():
        raise ValueError(f"폴더가 아닙니다: {path}")

    os_type = _current_os_type()
    drive_id = _drive_id_for_path(root)
    source_id = f"source:{_path_fingerprint(root)}"
    now = _now()
    max_files = max(1, min(int(max_files or 5_000), 50_000))
    consent_payload = {
        "approved_at": now,
        "approved_by": user_email,
        "knowledge_source": True,
        "include_ocr": bool(include_ocr),
        "watch_enabled": bool(watch_enabled),
        "sensitive_files_default_excluded": True,
        **(consent or {}),
    }
    counts: Counter = Counter()
    seen_relative_paths: set = set()
    indexed_nodes: List[str] = []
    errors: List[Dict[str, str]] = []
    limit_reached = False

    with self._connect() as conn:
        # Register the source as "scanning"; it is flipped to "active" once
        # the walk below has finished.
        conn.execute(
            """
            INSERT INTO knowledge_sources(
                id, root_path, os_type, drive_id, label, status, include_ocr,
                watch_enabled, consent_json, created_at, updated_at, last_scanned_at
            )
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(id) DO UPDATE SET
                root_path=excluded.root_path,
                os_type=excluded.os_type,
                drive_id=excluded.drive_id,
                label=excluded.label,
                status=excluded.status,
                include_ocr=excluded.include_ocr,
                watch_enabled=excluded.watch_enabled,
                consent_json=excluded.consent_json,
                updated_at=excluded.updated_at,
                last_scanned_at=excluded.last_scanned_at
            """,
            (
                source_id,
                str(root),
                os_type,
                drive_id,
                root.name or str(root),
                "scanning",
                1 if include_ocr else 0,
                1 if watch_enabled else 0,
                _json(consent_payload),
                now,
                now,
                now,
            ),
        )

        for entry in self._iter_local_scan_entries(root, max_files=max_files):
            kind = entry["kind"]
            file_path = entry["path"]
            if kind == "limit_reached":
                counts["limit_reached"] += 1
                limit_reached = True
                break
            if kind in {"excluded_dir", "excluded"}:
                counts["excluded"] += 1
                continue
            if kind in {"inaccessible_dir", "inaccessible_file"}:
                counts["failed"] += 1
                errors.append(
                    {
                        "path": str(file_path),
                        "error": entry.get("reason", "inaccessible"),
                    }
                )
                continue
            if kind != "file":
                continue

            stat = entry["stat"]
            try:
                relative_path = file_path.relative_to(root).as_posix()
            except ValueError:
                relative_path = file_path.name
            seen_relative_paths.add(relative_path)
            modified_at = _safe_iso_from_stat_mtime(stat.st_mtime)
            existing = conn.execute(
                """
                SELECT size_bytes, modified_at, sha256, graph_node_id, status, metadata_json
                FROM local_file_index
                WHERE source_id=? AND relative_path=?
                """,
                (source_id, relative_path),
            ).fetchone()
            decision = self._local_file_decision(file_path, root, stat)
            parser_type = decision["parser_type"]
            category = decision["category"]

            # Keyword arguments shared by every index write for this file.
            common = {
                "source_id": source_id,
                "root": root,
                "file_path": file_path,
                "stat": stat,
                "os_type": os_type,
                "drive_id": drive_id,
                "parser_type": parser_type,
            }
            # Graph node left behind by a previous run, if any.
            stale_graph_id = existing["graph_node_id"] if existing else None
            # The old graph may be kept only if it exists and the earlier
            # run actually extracted text.
            reusable = bool(
                existing
                and stale_graph_id
                and self._local_file_index_has_extracted_text(existing)
            )

            if not decision["indexable"]:
                counts[decision["status"]] += 1
                if stale_graph_id:
                    self._delete_local_file_graph(conn, stale_graph_id)
                self._upsert_local_file_index(
                    conn,
                    status=decision["status"],
                    metadata={"reason": decision["reason"], "category": category},
                    **common,
                )
                continue

            # Cheap check: same size and mtime as the indexed row.
            if (
                reusable
                and existing["status"] == "indexed"
                and existing["size_bytes"] == stat.st_size
                and existing["modified_at"] == modified_at
            ):
                counts["skipped_unchanged"] += 1
                self._upsert_local_file_index(
                    conn,
                    status="indexed",
                    sha256=existing["sha256"],
                    graph_node_id=stale_graph_id,
                    metadata={
                        **_safe_loads(existing["metadata_json"]),
                        "category": category,
                        "unchanged": True,
                    },
                    **common,
                )
                continue

            # Deliberately broad: one unreadable file must not abort the scan.
            try:
                digest = _sha256_bytes(file_path.read_bytes())
            except Exception as exc:
                counts["failed"] += 1
                errors.append({"path": str(file_path), "error": str(exc)})
                if stale_graph_id:
                    self._delete_local_file_graph(conn, stale_graph_id)
                self._upsert_local_file_index(
                    conn,
                    status="failed",
                    error_message=str(exc),
                    metadata={"category": category},
                    **common,
                )
                continue

            # Content check: stat changed but the bytes did not.
            if reusable and existing["sha256"] == digest:
                counts["skipped_unchanged"] += 1
                self._upsert_local_file_index(
                    conn,
                    status="indexed",
                    sha256=digest,
                    graph_node_id=stale_graph_id,
                    metadata={
                        **_safe_loads(existing["metadata_json"]),
                        "category": category,
                        "sha256_unchanged": True,
                    },
                    **common,
                )
                continue

            # Deliberately broad for the same reason as above.
            try:
                text, parser_meta = self._extract_local_file_text(
                    file_path,
                    category,
                    include_ocr=include_ocr,
                )
                text = _clean_text(text)
                parser_meta = {**parser_meta, "extracted_chars": len(text)}
                if not text:
                    counts["skipped_empty_text"] += 1
                    if stale_graph_id:
                        self._delete_local_file_graph(conn, stale_graph_id)
                    self._upsert_local_file_index(
                        conn,
                        status="skipped_empty_text",
                        sha256=digest,
                        error_message="텍스트 추출 결과가 비어 있습니다.",
                        metadata={"category": category, "parser": parser_meta},
                        **common,
                    )
                    continue
                graph_node_id = self._upsert_local_file_node(
                    conn,
                    source_id=source_id,
                    root=root,
                    file_path=file_path,
                    stat=stat,
                    os_type=os_type,
                    drive_id=drive_id,
                    sha256=digest,
                    category=category,
                    parser_type=parser_type,
                    text=text,
                    parser_meta=parser_meta,
                )
                self._upsert_local_file_index(
                    conn,
                    status="indexed",
                    sha256=digest,
                    graph_node_id=graph_node_id,
                    metadata={"category": category, "parser": parser_meta},
                    **common,
                )
                counts["indexed"] += 1
                indexed_nodes.append(graph_node_id)
            except Exception as exc:
                counts["failed"] += 1
                errors.append({"path": str(file_path), "error": str(exc)})
                if stale_graph_id:
                    self._delete_local_file_graph(conn, stale_graph_id)
                self._upsert_local_file_index(
                    conn,
                    status="failed",
                    sha256=digest,
                    error_message=str(exc),
                    metadata={"category": category},
                    **common,
                )

        if not limit_reached:
            # A truncated walk did not see the whole tree, so a missing path
            # proves nothing; the sweep only runs after a complete walk.
            known_rows = conn.execute(
                "SELECT relative_path, graph_node_id, deleted FROM local_file_index WHERE source_id=?",
                (source_id,),
            ).fetchall()
            newly_deleted = 0
            for row in known_rows:
                relative_path = row["relative_path"]
                if relative_path in seen_relative_paths:
                    continue
                if row["deleted"] and not row["graph_node_id"]:
                    # Already tombstoned by an earlier scan: counting it
                    # again would inflate counts["deleted"] on every rescan.
                    continue
                self._delete_local_file_graph(conn, row["graph_node_id"])
                conn.execute(
                    """
                    UPDATE local_file_index
                    SET status='deleted', deleted=1, last_scanned_at=?, error_message=NULL, graph_node_id=NULL
                    WHERE source_id=? AND relative_path=?
                    """,
                    (_now(), source_id, relative_path),
                )
                newly_deleted += 1
            counts["deleted"] = newly_deleted

        # One timestamp keeps updated_at and last_scanned_at identical.
        finished_at = _now()
        conn.execute(
            """
            UPDATE knowledge_sources
            SET status='active', updated_at=?, last_scanned_at=?
            WHERE id=?
            """,
            (finished_at, finished_at, source_id),
        )

    return {
        "status": "ok",
        "source": {
            "id": source_id,
            "root_path": str(root),
            "os_type": os_type,
            "drive_id": drive_id,
            "include_ocr": bool(include_ocr),
            "watch_enabled": bool(watch_enabled),
        },
        "counts": dict(counts),
        "indexed_nodes": indexed_nodes[:100],
        "errors": errors[:50],
        "notice": "Lattice AI는 사용자가 선택한 폴더만 AI 지식으로 변환합니다.",
    }
|