ltcai 3.6.0 → 4.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +39 -31
- package/docs/CHANGELOG.md +64 -0
- package/docs/REALTIME_COLLABORATION.md +3 -3
- package/docs/V3_FRONTEND.md +9 -8
- package/docs/V4_BRAIN_ARCHITECTURE.md +322 -0
- package/docs/V4_DIGITAL_BRAIN_RECOVERY.md +552 -0
- package/docs/V4_IMPLEMENTATION_PLAN.md +470 -0
- package/docs/kg-schema.md +51 -53
- package/docs/spec-vs-impl.md +10 -10
- package/kg_schema.py +2 -520
- package/knowledge_graph.py +37 -4629
- package/knowledge_graph_api.py +11 -127
- package/latticeai/__init__.py +1 -1
- package/latticeai/api/admin.py +16 -17
- package/latticeai/api/agents.py +20 -7
- package/latticeai/api/auth.py +46 -15
- package/latticeai/api/chat.py +112 -76
- package/latticeai/api/health.py +1 -1
- package/latticeai/api/hooks.py +1 -1
- package/latticeai/api/invitations.py +100 -0
- package/latticeai/api/knowledge_graph.py +139 -0
- package/latticeai/api/local_files.py +1 -1
- package/latticeai/api/mcp.py +23 -11
- package/latticeai/api/memory.py +1 -1
- package/latticeai/api/models.py +1 -1
- package/latticeai/api/network.py +81 -0
- package/latticeai/api/plugins.py +3 -6
- package/latticeai/api/realtime.py +5 -8
- package/latticeai/api/search.py +26 -2
- package/latticeai/api/security_dashboard.py +2 -3
- package/latticeai/api/setup.py +2 -2
- package/latticeai/api/static_routes.py +11 -16
- package/latticeai/api/tools.py +3 -0
- package/latticeai/api/ui_redirects.py +26 -0
- package/latticeai/api/workflow_designer.py +85 -6
- package/latticeai/api/workspace.py +93 -57
- package/latticeai/app_factory.py +1781 -0
- package/latticeai/brain/__init__.py +18 -0
- package/latticeai/brain/_kg_common.py +1123 -0
- package/latticeai/brain/context.py +213 -0
- package/latticeai/brain/conversations.py +236 -0
- package/latticeai/brain/discovery.py +1455 -0
- package/latticeai/brain/documents.py +218 -0
- package/latticeai/brain/identity.py +175 -0
- package/latticeai/brain/ingest.py +644 -0
- package/latticeai/brain/memory.py +102 -0
- package/latticeai/brain/network.py +205 -0
- package/latticeai/brain/projection.py +561 -0
- package/latticeai/brain/provenance.py +401 -0
- package/latticeai/brain/retrieval.py +1316 -0
- package/latticeai/brain/schema.py +640 -0
- package/latticeai/brain/store.py +216 -0
- package/latticeai/brain/write_master.py +225 -0
- package/latticeai/core/agent.py +31 -7
- package/latticeai/core/audit.py +0 -7
- package/latticeai/core/config.py +1 -1
- package/latticeai/core/context_builder.py +1 -2
- package/latticeai/core/enterprise.py +1 -1
- package/latticeai/core/graph_curator.py +2 -2
- package/latticeai/core/invitations.py +131 -0
- package/latticeai/core/marketplace.py +1 -1
- package/latticeai/core/mcp_registry.py +791 -0
- package/latticeai/core/model_compat.py +1 -1
- package/latticeai/core/model_resolution.py +0 -1
- package/latticeai/core/multi_agent.py +238 -4
- package/latticeai/core/policy.py +54 -0
- package/latticeai/core/realtime.py +65 -44
- package/latticeai/core/security.py +1 -1
- package/latticeai/core/sessions.py +66 -10
- package/latticeai/core/users.py +147 -0
- package/latticeai/core/workflow_engine.py +114 -2
- package/latticeai/core/workspace_os.py +477 -29
- package/latticeai/models/__init__.py +7 -0
- package/latticeai/models/router.py +779 -0
- package/latticeai/server_app.py +29 -1536
- package/latticeai/services/agent_runtime.py +243 -4
- package/latticeai/services/app_context.py +75 -14
- package/latticeai/services/ingestion.py +47 -0
- package/latticeai/services/kg_portability.py +33 -3
- package/latticeai/services/memory_service.py +39 -11
- package/latticeai/services/model_runtime.py +2 -5
- package/latticeai/services/platform_runtime.py +100 -23
- package/latticeai/services/run_executor.py +328 -0
- package/latticeai/services/search_service.py +17 -8
- package/latticeai/services/tool_dispatch.py +12 -2
- package/latticeai/services/triggers.py +241 -0
- package/latticeai/services/upload_service.py +37 -12
- package/latticeai/services/workspace_service.py +55 -16
- package/llm_router.py +29 -772
- package/ltcai_cli.py +1 -2
- package/mcp_registry.py +25 -788
- package/p_reinforce.py +124 -14
- package/package.json +10 -20
- package/scripts/bump_version.py +99 -0
- package/scripts/generate_diagrams.py +0 -1
- package/scripts/lint_v3.mjs +105 -18
- package/scripts/validate_release_artifacts.py +0 -1
- package/scripts/wheel_smoke.py +142 -0
- package/server.py +11 -7
- package/setup_wizard.py +1142 -0
- package/static/sw.js +81 -52
- package/static/v3/asset-manifest.json +33 -25
- package/static/v3/css/{lattice.base.e4cdd05d.css → lattice.base.49deefb5.css} +1 -1
- package/static/v3/css/lattice.base.css +1 -1
- package/static/v3/css/{lattice.components.9b49d614.css → lattice.components.cde18231.css} +1 -1
- package/static/v3/css/lattice.components.css +1 -1
- package/static/v3/css/{lattice.shell.8fcc9d33.css → lattice.shell.29d36d85.css} +1 -1
- package/static/v3/css/lattice.shell.css +1 -1
- package/static/v3/css/{lattice.tokens.e7018963.css → lattice.tokens.304cbc40.css} +3 -0
- package/static/v3/css/lattice.tokens.css +3 -0
- package/static/v3/css/{lattice.views.22f69117.css → lattice.views.0a18b6c5.css} +2 -2
- package/static/v3/css/lattice.views.css +2 -2
- package/static/v3/index.html +3 -4
- package/static/v3/js/{app.c541f955.js → app.c5c80c46.js} +1 -1
- package/static/v3/js/core/{api.33d6320e.js → api.ba0fbf14.js} +58 -1
- package/static/v3/js/core/api.js +57 -0
- package/static/v3/js/core/i18n.880e1fec.js +575 -0
- package/static/v3/js/core/i18n.js +575 -0
- package/static/v3/js/core/routes.37522821.js +101 -0
- package/static/v3/js/core/routes.js +71 -63
- package/static/v3/js/core/{shell.8c163e0e.js → shell.e3f6bbfa.js} +68 -39
- package/static/v3/js/core/shell.js +66 -37
- package/static/v3/js/core/{store.34ebd5e6.js → store.7b2aa044.js} +11 -1
- package/static/v3/js/core/store.js +11 -1
- package/static/v3/js/views/account.eff40715.js +143 -0
- package/static/v3/js/views/account.js +143 -0
- package/static/v3/js/views/activity.0d271ef9.js +67 -0
- package/static/v3/js/views/activity.js +67 -0
- package/static/v3/js/views/{admin-users.03bac88c.js → admin-users.f7ac7b43.js} +4 -6
- package/static/v3/js/views/admin-users.js +4 -6
- package/static/v3/js/views/{agents.014d0b74.js → agents.17c5288d.js} +35 -12
- package/static/v3/js/views/agents.js +35 -12
- package/static/v3/js/views/{chat.e6dd7dd0.js → chat.e250e2cc.js} +23 -0
- package/static/v3/js/views/chat.js +23 -0
- package/static/v3/js/views/graph-canvas.17c15d65.js +509 -0
- package/static/v3/js/views/graph-canvas.js +509 -0
- package/static/v3/js/views/{hybrid-search.b22b97e0.js → hybrid-search.2fb63ed9.js} +1 -2
- package/static/v3/js/views/hybrid-search.js +1 -2
- package/static/v3/js/views/{knowledge-graph.a96040a5.js → knowledge-graph.4d09c537.js} +60 -44
- package/static/v3/js/views/knowledge-graph.js +60 -44
- package/static/v3/js/views/network.52a4f181.js +97 -0
- package/static/v3/js/views/network.js +97 -0
- package/static/v3/js/views/{planning.9ac3e313.js → planning.4876fd77.js} +26 -5
- package/static/v3/js/views/planning.js +26 -5
- package/static/v3/js/views/runs.b63b2afa.js +144 -0
- package/static/v3/js/views/runs.js +144 -0
- package/static/v3/js/views/{settings.8631fa5e.js → settings.b7140634.js} +7 -8
- package/static/v3/js/views/settings.js +7 -8
- package/static/v3/js/views/snapshots.6f5db095.js +135 -0
- package/static/v3/js/views/snapshots.js +135 -0
- package/static/v3/js/views/{workflows.26c57290.js → workflows.7752225a.js} +87 -2
- package/static/v3/js/views/workflows.js +87 -2
- package/static/v3/js/views/workspace-admin.c466029b.js +156 -0
- package/static/v3/js/views/workspace-admin.js +156 -0
- package/static/vendor/chart.umd.min.js +20 -0
- package/static/vendor/fonts/inter-latin-300-normal.woff2 +0 -0
- package/static/vendor/fonts/inter-latin-400-normal.woff2 +0 -0
- package/static/vendor/fonts/inter-latin-500-normal.woff2 +0 -0
- package/static/vendor/fonts/inter-latin-600-normal.woff2 +0 -0
- package/static/vendor/fonts/inter-latin-700-normal.woff2 +0 -0
- package/static/vendor/fonts/inter-latin-800-normal.woff2 +0 -0
- package/static/vendor/fonts/inter.css +44 -0
- package/static/vendor/icons/tabler-icons.min.css +4 -0
- package/static/vendor/icons/tabler-icons.woff2 +0 -0
- package/static/vendor/marked.min.js +69 -0
- package/telegram_bot.py +1 -2
- package/tools/commands.py +4 -2
- package/tools/computer.py +1 -1
- package/tools/documents.py +1 -3
- package/tools/filesystem.py +0 -4
- package/tools/knowledge.py +1 -3
- package/tools/network.py +1 -3
- package/codex_telegram_bot.py +0 -195
- package/docs/assets/v3.4.0/agent-run.png +0 -0
- package/docs/assets/v3.4.0/agents.png +0 -0
- package/docs/assets/v3.4.0/before/chat-before.png +0 -0
- package/docs/assets/v3.4.0/before/files-before.png +0 -0
- package/docs/assets/v3.4.0/chat.png +0 -0
- package/docs/assets/v3.4.0/connect-folder.png +0 -0
- package/docs/assets/v3.4.0/files.png +0 -0
- package/docs/assets/v3.4.0/home.png +0 -0
- package/docs/assets/v3.4.0/hooks-dispatch.png +0 -0
- package/docs/assets/v3.4.0/knowledge-graph.png +0 -0
- package/docs/assets/v3.4.0/local-agent.png +0 -0
- package/docs/assets/v3.4.0/memory.png +0 -0
- package/docs/assets/v3.4.0/settings.png +0 -0
- package/docs/assets/v3.4.0/vision-input.png +0 -0
- package/docs/assets/v3.4.0/workflows.png +0 -0
- package/docs/assets/v3.4.1/e2e_runtime_log.txt +0 -42
- package/docs/assets/v3.4.1/hooks-dispatch.png +0 -0
- package/docs/assets/v3.4.1/local-agent.png +0 -0
- package/docs/images/admin-dashboard.png +0 -0
- package/docs/images/architecture.png +0 -0
- package/docs/images/enterprise.png +0 -0
- package/docs/images/graph.png +0 -0
- package/docs/images/hero.gif +0 -0
- package/docs/images/knowledge-graph.png +0 -0
- package/docs/images/lattice-ai-demo.gif +0 -0
- package/docs/images/lattice-ai-hero.png +0 -0
- package/docs/images/logo.svg +0 -33
- package/docs/images/mobile-responsive.png +0 -0
- package/docs/images/model-recommendation.png +0 -0
- package/docs/images/onboarding.png +0 -0
- package/docs/images/organization.png +0 -0
- package/docs/images/pipeline.png +0 -0
- package/docs/images/screenshot-admin.png +0 -0
- package/docs/images/screenshot-chat.png +0 -0
- package/docs/images/screenshot-graph.png +0 -0
- package/docs/images/skills.png +0 -0
- package/docs/images/workspace-dark.png +0 -0
- package/docs/images/workspace-light.png +0 -0
- package/docs/images/workspace.png +0 -0
- package/requirements.txt +0 -16
- package/static/account.html +0 -115
- package/static/activity.html +0 -73
- package/static/admin.html +0 -488
- package/static/agents.html +0 -139
- package/static/chat.html +0 -844
- package/static/css/reference/account.css +0 -439
- package/static/css/reference/admin.css +0 -610
- package/static/css/reference/base.css +0 -1661
- package/static/css/reference/chat.css +0 -4623
- package/static/css/reference/graph.css +0 -1016
- package/static/css/responsive.css +0 -861
- package/static/graph.html +0 -124
- package/static/platform.css +0 -104
- package/static/plugins.html +0 -136
- package/static/scripts/account.js +0 -238
- package/static/scripts/admin.js +0 -1614
- package/static/scripts/chat.js +0 -5081
- package/static/scripts/graph.js +0 -1804
- package/static/scripts/platform.js +0 -64
- package/static/scripts/ux.js +0 -167
- package/static/scripts/workspace.js +0 -948
- package/static/v3/js/core/routes.2ce3815a.js +0 -93
- package/static/workflows.html +0 -146
- package/static/workspace.css +0 -1121
- package/static/workspace.html +0 -357
|
@@ -0,0 +1,1455 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
# ruff: noqa: F403,F405
|
|
4
|
+
|
|
5
|
+
from ._kg_common import * # noqa: F403,F401
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class KnowledgeGraphDiscoveryMixin:
|
|
9
|
+
def discover_local_roots(self) -> Dict[str, Any]:
    """Return safe, cross-platform starting points for structure browsing.

    The home directory and a fixed list of well-known sub-folders are always
    probed. Depending on the detected OS, mounted volumes (macOS
    ``/Volumes``, Linux ``/mnt`` and ``/media``), drive letters and OneDrive
    folders (Windows) are probed as well. Candidates that do not exist, or
    whose resolved path was already registered, are skipped.

    Returns:
        A dict with ``os_type``, ``computer`` (``platform.node()`` or
        ``"local"``), the collected ``roots`` and a user-facing
        ``privacy_notice``.
    """
    platform_name = _current_os_type()
    home_dir = Path.home().expanduser()
    found: List[Dict[str, Any]] = []
    known_paths: set = set()

    def by_name(candidate: Path) -> str:
        # Case-insensitive ordering for directory listings.
        return candidate.name.lower()

    def register(
        label: str,
        path: Path,
        kind: str,
        *,
        recommended: bool = True,
        warning: Optional[str] = None,
    ) -> None:
        """Record one root unless it is missing or already registered."""
        try:
            target = path.expanduser().resolve()
        except OSError:
            # Resolution failed; fall back to the merely expanded path.
            target = path.expanduser()
        path_text = str(target)
        if path_text in known_paths:
            return
        if not target.exists():
            return
        known_paths.add(path_text)
        entry = {
            "id": f"{kind}:{_path_fingerprint(target)}",
            "label": label,
            "path": path_text,
            "kind": kind,
            "recommended": recommended,
            # An explicit warning wins; otherwise derive one from the path.
            "warning": warning or _root_warning(target, platform_name),
        }
        found.append(entry)

    register("홈", home_dir, "home", warning=_root_warning(home_dir, platform_name))
    well_known_folders = (
        ("Documents", "문서"),
        ("Desktop", "데스크탑"),
        ("Downloads", "다운로드"),
        ("Pictures", "사진"),
        ("Projects", "프로젝트"),
    )
    for folder_name, folder_label in well_known_folders:
        register(folder_label, home_dir / folder_name, folder_name.lower())

    if platform_name == "macos":
        volumes_dir = Path("/Volumes")
        if volumes_dir.exists():
            try:
                for volume in sorted(volumes_dir.iterdir(), key=by_name):
                    register(volume.name, volume, "volume", recommended=False)
            except OSError:
                # Best effort: an unreadable /Volumes just yields fewer roots.
                pass
    elif platform_name == "windows":
        for letter in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
            drive = Path(f"{letter}:\\")
            if not drive.exists():
                continue
            register(
                f"{letter}: 드라이브",
                drive,
                "drive",
                recommended=(letter != "C"),
            )
        # Both environment variables are presented under the same label.
        for env_name in ("OneDrive", "OneDriveCommercial"):
            configured = os.environ.get(env_name)
            if configured:
                register("OneDrive", Path(configured), "cloud", recommended=False)
    elif platform_name == "linux":
        for mount_base in (Path("/mnt"), Path("/media")):
            register(str(mount_base), mount_base, "mounts", recommended=False)
            try:
                if not mount_base.exists():
                    continue
                for mounted in sorted(mount_base.iterdir(), key=by_name):
                    register(mounted.name, mounted, "volume", recommended=False)
            except OSError:
                # Best effort: skip mount bases that cannot be listed.
                pass

    return {
        "os_type": platform_name,
        "computer": platform.node() or "local",
        "roots": found,
        "privacy_notice": "처음에는 드라이브와 폴더 구조만 확인하며, 파일 내용은 사용자가 동의한 뒤에만 읽습니다.",
    }
|
|
98
|
+
|
|
99
|
+
def preview_local_tree(self, path: Path, *, max_items: int = 200) -> Dict[str, Any]:
    """List one folder level using metadata only; file contents are not read.

    Args:
        path: Folder to list. It is user-expanded and resolved first.
        max_items: Maximum number of children to describe. Falsy values fall
            back to 200 and the result is clamped to the range 1..1000.

    Returns:
        A dict with ``path``, ``os_type``, ``items``, ``truncated``,
        ``inaccessible``, ``warning`` and ``privacy_notice``. When the folder
        itself cannot be listed, ``items`` is empty and an ``error`` message
        is included as well.

    Raises:
        ValueError: If the resolved path does not exist or is not a folder.
    """
    root = Path(path).expanduser().resolve()
    if not root.exists():
        raise ValueError(f"경로가 존재하지 않습니다: {path}")
    if not root.is_dir():
        raise ValueError(f"폴더가 아닙니다: {path}")

    os_type = _current_os_type()
    max_items = max(1, min(int(max_items or 200), 1000))
    privacy_notice = "현재 단계에서는 파일 내용을 읽지 않고, 폴더와 파일의 이름/크기/수정일만 확인합니다."

    def listing_failed(message: str) -> Dict[str, Any]:
        # Same keys as the success payload plus "error", so callers can read
        # either response uniformly.
        return {
            "path": str(root),
            "os_type": os_type,
            "items": [],
            "truncated": False,
            "inaccessible": 0,
            "warning": _root_warning(root, os_type),
            "error": message,
            "privacy_notice": privacy_notice,
        }

    try:
        # Directories first, then case-insensitive name order.
        children = sorted(
            root.iterdir(), key=lambda p: (not p.is_dir(), p.name.lower())
        )
    except PermissionError as exc:
        return listing_failed(f"접근 권한 없음: {exc}")
    except OSError as exc:
        # Listing can fail for reasons other than permissions; report the
        # failure instead of letting it propagate, as the recursive scanner
        # does.
        return listing_failed(f"폴더를 읽을 수 없습니다: {exc}")

    items: List[Dict[str, Any]] = []
    inaccessible = 0
    for child in children[:max_items]:
        try:
            is_dir = child.is_dir()
            stat = child.stat()
            reason = (
                _excluded_directory_reason(child, root=root, os_type=os_type)
                if is_dir
                else _sensitive_file_reason(child, root=root)
            )
            items.append(
                {
                    "name": child.name,
                    "path": str(child),
                    "type": "directory" if is_dir else "file",
                    "extension": "" if is_dir else child.suffix.lower(),
                    "size_bytes": None if is_dir else stat.st_size,
                    "modified_at": _safe_iso_from_stat_mtime(stat.st_mtime),
                    "hidden": _is_hidden_path(child, root),
                    "accessible": True,
                    "excluded_reason": reason,
                }
            )
        except OSError as exc:
            # PermissionError is an OSError subclass; it keeps its fixed
            # reason code while other failures report their message.
            inaccessible += 1
            items.append(
                {
                    "name": child.name,
                    "path": str(child),
                    "type": "unknown",
                    "accessible": False,
                    "excluded_reason": (
                        "permission_denied"
                        if isinstance(exc, PermissionError)
                        else str(exc)
                    ),
                }
            )

    return {
        "path": str(root),
        "os_type": os_type,
        "items": items,
        "truncated": len(children) > max_items,
        "inaccessible": inaccessible,
        "warning": _root_warning(root, os_type),
        "privacy_notice": privacy_notice,
    }
|
|
177
|
+
|
|
178
|
+
def _iter_local_scan_entries(
    self, root: Path, *, max_files: int
) -> Iterable[Dict[str, Any]]:
    """Walk ``root`` iteratively and yield one classification dict per entry.

    Every yielded dict has a ``kind`` and a ``path``. Kinds are:
    ``inaccessible_dir`` / ``inaccessible_file`` (with ``reason``),
    ``excluded`` (symlinks and non-regular files), ``excluded_dir``
    (directories rejected by ``_excluded_directory_reason``), ``file`` (with
    the ``stat`` result) and ``limit_reached``. The generator stops after
    yielding ``limit_reached``, which happens on the first regular file
    beyond ``max_files``.
    """
    os_type = _current_os_type()

    def failure(kind: str, target: Path, exc: OSError) -> Dict[str, Any]:
        # Permission problems carry a fixed prefix; other errors are verbatim.
        if isinstance(exc, PermissionError):
            detail = f"permission_denied: {exc}"
        else:
            detail = str(exc)
        return {"kind": kind, "path": target, "reason": detail}

    pending = [root]
    regular_files = 0
    while pending:
        directory = pending.pop()
        try:
            # Directories first, then case-insensitive name order.
            listing = sorted(
                directory.iterdir(), key=lambda p: (not p.is_dir(), p.name.lower())
            )
        except OSError as exc:
            yield failure("inaccessible_dir", directory, exc)
            continue

        for entry in listing:
            # Symlinks are reported as excluded and are not descended into.
            if entry.is_symlink():
                yield {"kind": "excluded", "path": entry, "reason": "symlink"}
                continue
            try:
                if entry.is_dir():
                    skip_reason = _excluded_directory_reason(
                        entry, root=root, os_type=os_type
                    )
                    if not skip_reason:
                        pending.append(entry)
                    else:
                        yield {
                            "kind": "excluded_dir",
                            "path": entry,
                            "reason": skip_reason,
                        }
                    continue
                if not entry.is_file():
                    yield {
                        "kind": "excluded",
                        "path": entry,
                        "reason": "not_regular_file",
                    }
                    continue
                entry_stat = entry.stat()
            except OSError as exc:
                yield failure("inaccessible_file", entry, exc)
                continue

            regular_files += 1
            if regular_files > max_files:
                yield {
                    "kind": "limit_reached",
                    "path": entry,
                    "reason": "max_files",
                }
                return
            yield {"kind": "file", "path": entry, "stat": entry_stat}
|
|
251
|
+
|
|
252
|
+
def _local_file_decision(
    self, path: Path, root: Path, stat: os.stat_result
) -> Dict[str, Any]:
    """Decide from metadata alone whether one file may be indexed.

    Checks run in this order: sensitive-file rule, unsupported category, then
    the per-category size limit compared against ``stat.st_size``.

    Returns:
        A dict with ``status`` (``sensitive_blocked``, ``unsupported``,
        ``too_large`` or ``pending``), ``reason``, ``category``,
        ``parser_type`` and ``indexable`` (true only for ``pending``).
    """
    suffix = path.suffix.lower()
    category = _file_category(suffix)
    parser_type = _parser_type_for_category(category, suffix)

    def verdict(status: str, reason: str, *, indexable: bool = False) -> Dict[str, Any]:
        # All outcomes share the same shape; only these three fields vary.
        return {
            "status": status,
            "reason": reason,
            "category": category,
            "parser_type": parser_type,
            "indexable": indexable,
        }

    blocked_because = _sensitive_file_reason(path, root=root)
    if blocked_because:
        return verdict("sensitive_blocked", blocked_because)

    if category == "unsupported":
        return verdict("unsupported", "unsupported_extension")

    size_cap = _size_limit_for_category(category)
    if stat.st_size > size_cap:
        return verdict("too_large", f"size>{size_cap}")

    return verdict("pending", "", indexable=True)
|
|
291
|
+
|
|
292
|
+
def audit_local_folder(
    self, path: Path, *, include_ocr: bool = False, max_files: int = 50_000
) -> Dict[str, Any]:
    """Safety-check a folder using metadata only; file bodies are not read.

    Walks the folder with ``_iter_local_scan_entries``, classifies each
    regular file with ``_local_file_decision`` and aggregates counts, up to
    25 allowed and 25 excluded samples, and a rough time estimate.

    Args:
        path: Folder to audit. It is user-expanded and resolved first.
        include_ocr: Only affects the weight images get in the time estimate
            and is echoed back as ``include_ocr_requested``.
        max_files: Scan cap. Falsy values fall back to 50,000 and the result
            is clamped to the range 1..200,000.

    Raises:
        ValueError: If the resolved path does not exist or is not a folder.
    """
    root = Path(path).expanduser().resolve()
    if not root.exists():
        raise ValueError(f"경로가 존재하지 않습니다: {path}")
    if not root.is_dir():
        raise ValueError(f"폴더가 아닙니다: {path}")

    os_type = _current_os_type()
    max_files = max(1, min(int(max_files or 50_000), 200_000))
    sample_cap = 25
    by_status: Counter = Counter()
    by_category: Counter = Counter()
    by_extension: Counter = Counter()
    allowed_samples: List[Dict[str, Any]] = []
    excluded_samples: List[Dict[str, Any]] = []
    total_files = 0
    readable_files = 0
    inaccessible = 0
    excluded_dirs = 0
    limit_reached = False

    def keep_excluded_sample(target: Path, status: str, reason: str) -> None:
        # Samples are only built while there is still room for them.
        if len(excluded_samples) < sample_cap:
            excluded_samples.append(_sample_file(target, root, status, reason))

    for entry in self._iter_local_scan_entries(root, max_files=max_files):
        kind = entry["kind"]
        target = entry["path"]

        if kind == "limit_reached":
            limit_reached = True
            break

        if kind == "excluded_dir":
            excluded_dirs += 1
            keep_excluded_sample(target, "excluded", entry.get("reason", ""))
        elif kind in {"inaccessible_dir", "inaccessible_file"}:
            inaccessible += 1
            by_status["failed"] += 1
            keep_excluded_sample(target, "failed", entry.get("reason", ""))
        elif kind == "excluded":
            by_status["excluded"] += 1
            keep_excluded_sample(target, "excluded", entry.get("reason", ""))
        elif kind == "file":
            total_files += 1
            decision = self._local_file_decision(target, root, entry["stat"])
            status = decision["status"]
            extension = target.suffix.lower() or "(none)"
            by_category[decision["category"]] += 1
            by_extension[extension] += 1
            if not decision["indexable"]:
                by_status[status] += 1
                keep_excluded_sample(target, status, decision["reason"])
                continue
            readable_files += 1
            by_status["readable"] += 1
            if len(allowed_samples) < sample_cap:
                allowed_samples.append(_sample_file(target, root, "readable"))
        # Any other kind is ignored.

    # Heuristic per-file weights (seconds) for the processing-time estimate.
    doc_weight = (
        by_category["pdf"] * 1.4
        + by_category["document"] * 0.9
        + by_category["slide_deck"] * 1.0
    )
    sheet_weight = by_category["spreadsheet"] * 0.6
    image_count = by_category["image"]
    ocr_weight = image_count * (1.8 if include_ocr else 0.1)
    estimated_seconds = round(
        readable_files * 0.04 + doc_weight + sheet_weight + ocr_weight, 1
    )

    sensitive_total = int(by_status["sensitive_blocked"])
    too_large_total = int(by_status["too_large"])
    unsupported_total = int(by_status["unsupported"])
    summary = {
        "total_files": total_files,
        "readable_files": readable_files,
        "excluded_files": int(
            by_status["excluded"]
            + by_status["sensitive_blocked"]
            + by_status["too_large"]
            + by_status["unsupported"]
        ),
        "sensitive_files": sensitive_total,
        "too_large_files": too_large_total,
        "unsupported_files": unsupported_total,
        "image_ocr_candidates": int(image_count),
        "inaccessible_items": inaccessible,
        "excluded_dirs": excluded_dirs,
        "estimated_seconds": estimated_seconds,
        "storage_root": str(self.db_path.parent),
        "limit_reached": limit_reached,
    }

    return {
        "path": str(root),
        "source_id": f"source:{_path_fingerprint(root)}",
        "os_type": os_type,
        "drive_id": _drive_id_for_path(root),
        "warning": _root_warning(root, os_type),
        "privacy_notice": "현재 단계에서는 파일 내용을 읽지 않고, 폴더와 파일의 이름/크기/수정일만 확인합니다.",
        "include_ocr_requested": bool(include_ocr),
        "summary": summary,
        "by_status": dict(by_status),
        "by_category": dict(by_category),
        "by_extension": dict(by_extension.most_common(40)),
        "allowed_samples": allowed_samples,
        "excluded_samples": excluded_samples,
        "consent_required": {
            "knowledge_source": True,
            "image_ocr": bool(image_count),
            "watch": True,
            "sensitive_files_default_excluded": True,
        },
    }
|
|
420
|
+
|
|
421
|
+
def local_sources(self) -> Dict[str, Any]:
    """Return every registered knowledge source with per-status file counts.

    Sources are read from ``knowledge_sources`` (most recently updated first)
    and each one gets a ``file_status`` mapping of status to row count taken
    from ``local_file_index``; sources without indexed files get ``{}``.

    Returns:
        ``{"sources": [...]}``.
    """
    sources: List[Dict[str, Any]] = []
    with self._connect() as conn:
        source_cursor = conn.execute(
            """
            SELECT id, root_path, os_type, drive_id, label, status, include_ocr,
                   watch_enabled, consent_json, created_at, updated_at, last_scanned_at
            FROM knowledge_sources
            ORDER BY updated_at DESC, id ASC
            """
        )
        for record in source_cursor:
            sources.append(
                {
                    "id": record["id"],
                    "root_path": record["root_path"],
                    "os_type": record["os_type"],
                    "drive_id": record["drive_id"],
                    "label": record["label"],
                    "status": record["status"],
                    # Stored as integers; exposed as booleans.
                    "include_ocr": bool(record["include_ocr"]),
                    "watch_enabled": bool(record["watch_enabled"]),
                    "consent": _safe_loads(record["consent_json"]),
                    "created_at": record["created_at"],
                    "updated_at": record["updated_at"],
                    "last_scanned_at": record["last_scanned_at"],
                }
            )
        tally_rows = conn.execute(
            "SELECT source_id, status, COUNT(*) AS count FROM local_file_index GROUP BY source_id, status"
        ).fetchall()

    # source_id -> {status: count}
    tallies: Dict[str, Dict[str, int]] = {}
    for tally in tally_rows:
        per_source = tallies.setdefault(tally["source_id"], {})
        per_source[tally["status"]] = tally["count"]

    for source in sources:
        source["file_status"] = tallies.get(source["id"], {})
    return {"sources": sources}
|
|
456
|
+
|
|
457
|
+
def set_local_source_watch(self, source_id: str, enabled: bool) -> Dict[str, Any]:
    """Turn the watch flag of one knowledge source on or off.

    Args:
        source_id: Identifier of the source; surrounding whitespace is
            stripped before lookup.
        enabled: Desired flag; stored as 1 or 0.

    Returns:
        ``{"source_id": <normalized id>, "watch_enabled": <bool>}``.

    Raises:
        ValueError: If ``source_id`` is blank or no such source exists.
    """
    source_id = str(source_id or "").strip()
    if not source_id:
        raise ValueError("source_id required")

    watch_flag = bool(enabled)
    with self._connect() as conn:
        existing = conn.execute(
            "SELECT id FROM knowledge_sources WHERE id=?",
            (source_id,),
        ).fetchone()
        if not existing:
            raise ValueError(f"knowledge source not found: {source_id}")
        # updated_at is refreshed together with the flag.
        update_params = (int(watch_flag), _now(), source_id)
        conn.execute(
            "UPDATE knowledge_sources SET watch_enabled=?, updated_at=? WHERE id=?",
            update_params,
        )
    return {"source_id": source_id, "watch_enabled": watch_flag}
|
|
473
|
+
|
|
474
|
+
def remove_local_source(self, source_id: str) -> Dict[str, Any]:
|
|
475
|
+
"""Remove one approved local source and its derived graph projection.
|
|
476
|
+
|
|
477
|
+
This is intentionally non-destructive for user files: only the LatticeAI
|
|
478
|
+
index rows, graph nodes, edges, and chunks derived from the source are
|
|
479
|
+
removed. The original folder and files are never touched.
|
|
480
|
+
"""
|
|
481
|
+
source_id = str(source_id or "").strip()
|
|
482
|
+
if not source_id:
|
|
483
|
+
raise ValueError("source_id required")
|
|
484
|
+
with self._connect() as conn:
|
|
485
|
+
source = conn.execute(
|
|
486
|
+
"SELECT id, root_path FROM knowledge_sources WHERE id=?",
|
|
487
|
+
(source_id,),
|
|
488
|
+
).fetchone()
|
|
489
|
+
if not source:
|
|
490
|
+
raise ValueError(f"knowledge source not found: {source_id}")
|
|
491
|
+
rows = conn.execute(
|
|
492
|
+
"SELECT graph_node_id FROM local_file_index WHERE source_id=? AND graph_node_id IS NOT NULL",
|
|
493
|
+
(source_id,),
|
|
494
|
+
).fetchall()
|
|
495
|
+
graph_node_ids = [
|
|
496
|
+
row["graph_node_id"] for row in rows if row["graph_node_id"]
|
|
497
|
+
]
|
|
498
|
+
for graph_node_id in graph_node_ids:
|
|
499
|
+
self._delete_local_file_graph(conn, graph_node_id)
|
|
500
|
+
conn.execute("DELETE FROM local_file_index WHERE source_id=?", (source_id,))
|
|
501
|
+
conn.execute("DELETE FROM knowledge_sources WHERE id=?", (source_id,))
|
|
502
|
+
self._cleanup_local_graph_orphans(conn, source_id)
|
|
503
|
+
return {
|
|
504
|
+
"source_id": source_id,
|
|
505
|
+
"root_path": source["root_path"],
|
|
506
|
+
"removed_graph_nodes": len(graph_node_ids),
|
|
507
|
+
}
|
|
508
|
+
|
|
509
|
+
def _extract_local_file_text(
    self, path: Path, category: str, *, include_ocr: bool
) -> Tuple[str, Dict[str, Any]]:
    """Extract plain text and parser metadata from a single local file.

    The parser is chosen from ``category`` and the lowercase file suffix:
    text/code/CSV files are read as UTF-8 (undecodable bytes replaced),
    ``.pdf`` / ``.docx`` / ``.xlsx`` / ``.pptx`` use their respective
    libraries (imported lazily so they are only required when needed), and
    images are opened with Pillow and optionally run through OCR.

    Args:
        path: File to read.
        category: File category decided by the caller (e.g. ``"text"``,
            ``"code"``, ``"image"``).
        include_ocr: When true, image files are passed to ``pytesseract``.

    Returns:
        ``(text, meta)`` where ``text`` is truncated to 200,000 characters
        and ``meta`` always contains ``"parser"`` plus format-specific
        counters. Unsupported formats yield an empty string.
    """
    max_chars = 200_000
    ext = path.suffix.lower()
    meta: Dict[str, Any] = {"parser": _parser_type_for_category(category, ext)}
    text = ""
    if category in {"text", "code"} or ext == ".csv":
        text = path.read_text(encoding="utf-8", errors="replace")
    elif ext == ".pdf":
        import pdfplumber

        with pdfplumber.open(str(path)) as pdf:
            meta["pages"] = len(pdf.pages)
            text = "\n\n".join((page.extract_text() or "") for page in pdf.pages)
    elif ext == ".docx":
        from docx import Document

        doc = Document(str(path))
        paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
        table_lines = []
        for table in doc.tables:
            for row in table.rows:
                cells = [_clean_text(cell.text) for cell in row.cells]
                if any(cells):
                    table_lines.append("\t".join(cells))
        meta["paragraphs"] = len(paragraphs)
        meta["tables"] = len(doc.tables)
        meta["table_rows"] = len(table_lines)
        text = "\n\n".join([*paragraphs, *table_lines])
    elif ext == ".xlsx":
        from openpyxl import load_workbook

        wb = load_workbook(str(path), read_only=True, data_only=True)
        # A read-only workbook keeps the underlying file open until it is
        # closed explicitly, so release it even if iteration fails.
        try:
            rows_all = []
            non_empty_rows = 0
            non_empty_cells = 0
            char_count = 0
            for ws in wb.worksheets:
                sheet_rows = []
                for row in ws.iter_rows(values_only=True):
                    cells = [
                        str(cell).strip() if cell is not None else "" for cell in row
                    ]
                    if not any(cells):
                        continue
                    line = "\t".join(cells)
                    non_empty_rows += 1
                    non_empty_cells += sum(1 for cell in cells if cell)
                    sheet_rows.append(line)
                    char_count += len(line) + 1
                    # Stop reading once the output budget is exhausted.
                    if char_count > max_chars:
                        break
                if sheet_rows:
                    rows_all.append(f"[Sheet: {ws.title}]")
                    rows_all.extend(sheet_rows)
                if char_count > max_chars:
                    break
            meta["sheets"] = len(wb.worksheets)
        finally:
            wb.close()
        meta["rows"] = non_empty_rows
        meta["cells"] = non_empty_cells
        text = "\n".join(rows_all)
    elif ext == ".pptx":
        from pptx import Presentation

        prs = Presentation(str(path))
        slides_text = []
        for index, slide in enumerate(prs.slides, 1):
            parts = []
            for shape in slide.shapes:
                if getattr(shape, "has_text_frame", False):
                    slide_text = shape.text_frame.text.strip()
                    if slide_text:
                        parts.append(slide_text)
            if parts:
                slides_text.append(f"[Slide {index}]\n" + "\n".join(parts))
        meta["slides"] = len(prs.slides)
        meta["text_slides"] = len(slides_text)
        text = "\n\n".join(slides_text)
    elif category == "image":
        from PIL import Image

        with Image.open(str(path)) as image:
            meta.update(
                {
                    "width": image.width,
                    "height": image.height,
                    "format": image.format,
                    "mode": image.mode,
                    "ocr_enabled": bool(include_ocr),
                }
            )
            if include_ocr:
                # OCR is best-effort: a missing or broken OCR runtime is
                # recorded in the metadata instead of failing the file.
                try:
                    import pytesseract

                    text = pytesseract.image_to_string(image)
                    meta["ocr_chars"] = len(text)
                except (
                    Exception
                ) as exc:  # pragma: no cover - depends on local OCR runtime
                    meta["ocr_error"] = str(exc)
                    text = ""
    return text[:max_chars], meta
|
|
612
|
+
|
|
613
|
+
def _ensure_local_hierarchy(
    self,
    conn: sqlite3.Connection,
    *,
    source_id: str,
    root: Path,
    file_path: Path,
    os_type: str,
    drive_id: str,
) -> str:
    """Upsert the Computer -> Drive -> Folder chain leading to a file.

    Creates (or refreshes) a Computer node, a Drive node, the root Folder
    node of the source and one Folder node per directory level between
    ``root`` and the parent directory of ``file_path``, linking each level
    to the next with a "포함함" edge.

    Returns:
        The node id of the folder that directly contains ``file_path``
        (the root folder id when the file sits in ``root`` or lies outside
        of it).
    """

    def link(parent: str, child: str) -> None:
        # A fresh metadata dict is built for every edge.
        self._upsert_edge(
            conn, parent, child, "포함함", metadata={"source": "local_scan"}
        )

    host_label = platform.node() or "내 컴퓨터"
    host_node = f"computer:{_slug(host_label)}"
    drive_node = f"drive:{_sha256_text(f'{os_type}:{drive_id}')[:24]}"
    root_node = f"folder:{_sha256_text(f'{source_id}:root')[:24]}"

    self._upsert_node(
        conn, host_node, "Computer", host_label, metadata={"os_type": os_type}
    )
    self._upsert_node(
        conn,
        drive_node,
        "Drive",
        drive_id,
        metadata={"os_type": os_type, "drive_id": drive_id},
    )
    link(host_node, drive_node)
    self._upsert_node(
        conn,
        root_node,
        "Folder",
        root.name or str(root),
        summary=str(root),
        metadata={"source_id": source_id, "path": str(root), "root": True},
    )
    link(drive_node, root_node)

    # Directories between the root and the file; empty when the file is
    # not located under the root.
    try:
        nested_dirs = file_path.parent.relative_to(root).parts
    except ValueError:
        nested_dirs = Path().parts

    containing_node = root_node
    walked = root
    for dir_name in nested_dirs:
        walked = walked / dir_name
        digest = _sha256_text(f"{source_id}:{walked.as_posix()}")[:24]
        level_node = f"folder:{digest}"
        self._upsert_node(
            conn,
            level_node,
            "Folder",
            dir_name,
            summary=str(walked),
            metadata={
                "source_id": source_id,
                "path": str(walked),
                "root": False,
            },
        )
        link(containing_node, level_node)
        containing_node = level_node
    return containing_node
|
|
688
|
+
|
|
689
|
+
def _upsert_local_file_index(
    self,
    conn: sqlite3.Connection,
    *,
    source_id: str,
    root: Path,
    file_path: Path,
    stat: Optional[os.stat_result],
    os_type: str,
    drive_id: str,
    status: str,
    parser_type: str,
    sha256: Optional[str] = None,
    graph_node_id: Optional[str] = None,
    error_message: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> str:
    """Insert or refresh the ``local_file_index`` row for one file.

    Rows are keyed by ``(source_id, relative_path)``; on conflict every
    column except the id and the key columns is overwritten with the new
    values. ``last_indexed_at`` is only populated when ``status`` is
    ``"indexed"`` and the ``deleted`` flag is 1 only for status
    ``"deleted"``.

    Returns:
        The deterministic index row id derived from the source id and the
        file's path relative to ``root``.
    """
    try:
        rel = file_path.relative_to(root).as_posix()
    except ValueError:
        # The file is not under the root; fall back to its bare name.
        rel = file_path.name

    index_id = "local-index:" + _sha256_text(f"{source_id}:{rel}")[:24]
    scanned_at = _now()
    if stat:
        size_bytes = stat.st_size
        mtime_iso = _safe_iso_from_stat_mtime(stat.st_mtime)
    else:
        size_bytes = None
        mtime_iso = ""
    indexed_at = scanned_at if status == "indexed" else None
    deleted_flag = 1 if status == "deleted" else 0

    # Order must match the column list in the INSERT statement below.
    row_values = (
        index_id,
        source_id,
        os_type,
        drive_id,
        str(root),
        str(file_path),
        rel,
        file_path.name,
        file_path.suffix.lower(),
        size_bytes,
        mtime_iso,
        sha256,
        scanned_at,
        indexed_at,
        parser_type,
        status,
        error_message,
        graph_node_id,
        deleted_flag,
        _json(metadata),
    )
    conn.execute(
        """
        INSERT INTO local_file_index(
        id, source_id, os_type, drive_id, root_path, file_path, relative_path,
        file_name, extension, size_bytes, modified_at, sha256, last_scanned_at,
        last_indexed_at, parser_type, status, error_message, graph_node_id,
        deleted, metadata_json
        )
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        ON CONFLICT(source_id, relative_path) DO UPDATE SET
        os_type=excluded.os_type,
        drive_id=excluded.drive_id,
        root_path=excluded.root_path,
        file_path=excluded.file_path,
        file_name=excluded.file_name,
        extension=excluded.extension,
        size_bytes=excluded.size_bytes,
        modified_at=excluded.modified_at,
        sha256=excluded.sha256,
        last_scanned_at=excluded.last_scanned_at,
        last_indexed_at=excluded.last_indexed_at,
        parser_type=excluded.parser_type,
        status=excluded.status,
        error_message=excluded.error_message,
        graph_node_id=excluded.graph_node_id,
        deleted=excluded.deleted,
        metadata_json=excluded.metadata_json
        """,
        row_values,
    )
    return index_id
|
|
766
|
+
|
|
767
|
+
def _upsert_local_file_node(
    self,
    conn: sqlite3.Connection,
    *,
    source_id: str,
    root: Path,
    file_path: Path,
    stat: os.stat_result,
    os_type: str,
    drive_id: str,
    sha256: str,
    category: str,
    parser_type: str,
    text: str,
    parser_meta: Dict[str, Any],
) -> str:
    """Create or rebuild the graph representation of one local file.

    The file node is attached to its folder hierarchy, then the previous
    derived content of the file is purged and regenerated from ``text``:
    an OCR node for images, one Chunk node per text chunk, auto-extracted
    concept nodes with "언급함" edges, concept-to-concept relation edges
    and semantic item nodes.

    Args:
        text: Extracted file text; cleaned again here before use.
        parser_meta: Parser metadata stored under the ``"parser"`` key of
            the file node's metadata.
        (remaining arguments identify the file and its source.)

    Returns:
        The deterministic node id of the file.

    Raises:
        ValueError: If the cleaned text is empty.
    """
    text = _clean_text(text)
    if not text:
        raise ValueError("텍스트 추출 결과가 비어 있습니다.")
    try:
        relative_path = file_path.relative_to(root).as_posix()
    except ValueError:
        relative_path = file_path.name
    file_node_id = f"local-file:{_sha256_text(f'{source_id}:{relative_path}')[:24]}"
    parent_folder_id = self._ensure_local_hierarchy(
        conn,
        source_id=source_id,
        root=root,
        file_path=file_path,
        os_type=os_type,
        drive_id=drive_id,
    )

    # Purge everything a previous indexing run derived from this file.
    # Besides the structural child types this includes any linked node
    # stamped with this file as its ``source_node`` (e.g. semantic items),
    # matching the ownership rule used by _delete_local_file_graph.
    # Without this, items that no longer occur in the file would survive
    # as edge-less orphans.
    linked_rows = conn.execute(
        """
        SELECT n.id, n.type, n.metadata_json
        FROM edges e
        JOIN nodes n ON n.id=e.to_node
        WHERE e.from_node=?
        """,
        (file_node_id,),
    ).fetchall()
    child_ids = list(
        dict.fromkeys(
            row["id"]
            for row in linked_rows
            if row["type"] in {"Chunk", "ImageText", "Section"}
            or _safe_loads(row["metadata_json"]).get("source_node") == file_node_id
        )
    )
    conn.execute("DELETE FROM chunks WHERE source_node=?", (file_node_id,))
    if child_ids:
        placeholders = ",".join("?" * len(child_ids))
        conn.execute(f"DELETE FROM nodes WHERE id IN ({placeholders})", child_ids)
        self._v2_delete_nodes(conn, child_ids)
    conn.execute("DELETE FROM edges WHERE from_node=?", (file_node_id,))
    self._v2_delete_edges_from(conn, file_node_id)

    metadata = {
        "source": "local_folder",
        "source_id": source_id,
        "root_path": str(root),
        "file_path": str(file_path),
        "relative_path": relative_path,
        "filename": file_path.name,
        "ext": file_path.suffix.lower(),
        "category": category,
        "parser_type": parser_type,
        "bytes": stat.st_size,
        "modified_at": _safe_iso_from_stat_mtime(stat.st_mtime),
        "sha256": sha256,
        "parser": parser_meta,
    }
    self._upsert_node(
        conn,
        file_node_id,
        _node_type_for_category(category),
        file_path.name,
        summary=text[:700],
        metadata=metadata,
        raw=metadata,
    )
    self._upsert_edge(
        conn,
        parent_folder_id,
        file_node_id,
        "포함함",
        weight=1.0,
        metadata={"source": "local_scan"},
    )

    target_for_concepts = text
    if category == "image" and text:
        # For images the text is the OCR output; expose it as its own node.
        image_text_id = f"imagetext:{_sha256_text(f'{file_node_id}:ocr')[:24]}"
        self._upsert_node(
            conn,
            image_text_id,
            "ImageText",
            f"{file_path.name} OCR",
            summary=_clean_text(text)[:700],
            metadata={
                "source_node": file_node_id,
                "source_id": source_id,
                "chars": len(text),
            },
        )
        self._upsert_edge(
            conn,
            file_node_id,
            image_text_id,
            "포함함",
            weight=0.8,
            metadata={"source": "ocr"},
        )

    for index, chunk in enumerate(_chunks(text)):
        # The chunk id depends on its position and content, so edited
        # chunks get new ids.
        chunk_id = f"chunk:{_sha256_text(f'{file_node_id}:{index}:{chunk}')[:24]}"
        self._upsert_node(
            conn,
            chunk_id,
            "Chunk",
            f"{file_path.name} chunk {index + 1}",
            summary=chunk[:500],
            metadata={
                "index": index,
                "source_node": file_node_id,
                "source_id": source_id,
            },
        )
        self._upsert_chunk(
            conn,
            chunk_id=chunk_id,
            source_node=file_node_id,
            text=chunk,
            metadata={
                "index": index,
                "source_node": file_node_id,
                "source_id": source_id,
            },
        )
        self._upsert_edge(
            conn,
            file_node_id,
            chunk_id,
            "포함함",
            weight=0.7,
            metadata={"source": "local_scan"},
        )

    concepts = _extract_concepts(target_for_concepts, limit=18)
    # Lower-cased concept label -> node id, used to resolve triples below.
    concept_ids: Dict[str, str] = {}
    for concept in concepts:
        node_t = _classify_node_type(concept, target_for_concepts)
        concept_id = f"{node_t.lower()}:{_slug(concept)}"
        concept_ids[concept.lower()] = concept_id
        self._upsert_node(
            conn,
            concept_id,
            node_t,
            concept,
            metadata={
                "auto_extracted": True,
                "source": "local_folder",
                "source_id": source_id,
            },
        )
        self._upsert_edge(
            conn,
            file_node_id,
            concept_id,
            "언급함",
            weight=0.75,
            metadata={"source": "local_scan"},
        )

    for triple in _extract_triples(target_for_concepts, concepts, limit=20):
        subj_id = concept_ids.get(triple["subject"].lower())
        obj_id = concept_ids.get(triple["object"].lower())
        # Only link triples whose both ends resolved to distinct concepts.
        if subj_id and obj_id and subj_id != obj_id:
            self._upsert_edge(
                conn,
                subj_id,
                obj_id,
                triple["relation"],
                weight=0.9,
                metadata={
                    "context": triple.get("context", "")[:240],
                    "source_id": source_id,
                },
            )

    for item in _semantic_items(target_for_concepts):
        sem_type = item["type"]
        sem_title = item["title"]
        sem_id = f"{sem_type.lower()}:{_sha256_text(f'{file_node_id}:{sem_type}:{sem_title}')[:24]}"
        self._upsert_node(
            conn,
            sem_id,
            sem_type,
            sem_title,
            summary=item["summary"],
            metadata={
                "auto_extracted": True,
                "source_node": file_node_id,
                "filename": file_path.name,
            },
            raw=item,
        )
        self._upsert_edge(conn, file_node_id, sem_id, "포함함", weight=0.9)

    return file_node_id
|
|
970
|
+
|
|
971
|
+
def _delete_local_file_graph(
    self, conn: sqlite3.Connection, file_node_id: Optional[str]
) -> None:
    """Remove a file node together with the graph content derived from it.

    Nodes linked from the file are split into two groups: *owned* nodes
    (Chunk / ImageText / Section nodes, or nodes whose metadata names this
    file as ``source_node``), which are always deleted, and auto-extracted
    "local_folder" nodes, which are deleted only when none of their
    remaining edges leaves that candidate group. Afterwards the folder
    hierarchy of the file's source is pruned. A falsy ``file_node_id`` is
    a no-op.
    """
    if not file_node_id:
        return

    node_row = conn.execute(
        "SELECT metadata_json FROM nodes WHERE id=?",
        (file_node_id,),
    ).fetchone()
    source_id = (
        _safe_loads(node_row["metadata_json"]).get("source_id") if node_row else None
    )

    def purge(ids: set) -> None:
        """Delete the given nodes plus their chunks and touching edges."""
        if not ids:
            return
        id_list = list(ids)
        marks = ",".join("?" * len(ids))
        conn.execute(f"DELETE FROM chunks WHERE source_node IN ({marks})", id_list)
        conn.execute(
            f"DELETE FROM edges WHERE from_node IN ({marks}) OR to_node IN ({marks})",
            id_list * 2,
        )
        conn.execute(f"DELETE FROM nodes WHERE id IN ({marks})", id_list)
        self._v2_delete_nodes(conn, id_list)

    # Classify the file's outgoing neighbours before any edge is removed.
    owned: set = set()
    shared_candidates: set = set()
    neighbours = conn.execute(
        """
        SELECT n.id, n.type, n.metadata_json
        FROM edges e
        JOIN nodes n ON n.id=e.to_node
        WHERE e.from_node=?
        """,
        (file_node_id,),
    ).fetchall()
    for neighbour in neighbours:
        info = _safe_loads(neighbour["metadata_json"])
        belongs_to_file = (
            neighbour["type"] in {"Chunk", "ImageText", "Section"}
            or info.get("source_node") == file_node_id
        )
        if belongs_to_file:
            owned.add(neighbour["id"])
        elif info.get("auto_extracted") and info.get("source") == "local_folder":
            shared_candidates.add(neighbour["id"])

    # Remove the file node itself first, then what it owned.
    conn.execute("DELETE FROM chunks WHERE source_node=?", (file_node_id,))
    conn.execute(
        "DELETE FROM edges WHERE from_node=? OR to_node=?",
        (file_node_id, file_node_id),
    )
    conn.execute("DELETE FROM nodes WHERE id=?", (file_node_id,))
    self._v2_delete_nodes(conn, [file_node_id])
    purge(owned)

    # A shared candidate stays alive if any remaining edge connects it to
    # a node outside the candidate group.
    orphaned: set = set()
    for candidate in shared_candidates:
        touching = conn.execute(
            "SELECT from_node, to_node FROM edges WHERE from_node=? OR to_node=?",
            (candidate, candidate),
        ).fetchall()
        still_referenced = any(
            edge["from_node"] not in shared_candidates
            or edge["to_node"] not in shared_candidates
            for edge in touching
        )
        if not still_referenced:
            orphaned.add(candidate)
    purge(orphaned)

    if source_id:
        self._cleanup_local_graph_orphans(conn, str(source_id))
|
|
1051
|
+
|
|
1052
|
+
def _cleanup_local_graph_orphans(
    self, conn: sqlite3.Connection, source_id: str
) -> None:
    """Prune hierarchy nodes that no longer have outgoing edges.

    Folder nodes belonging to ``source_id`` are removed repeatedly until
    none of them is without outgoing edges, so emptied parents are removed
    in later passes. Afterwards Drive nodes and then Computer nodes
    without outgoing edges are removed, regardless of source.
    """

    def is_childless(node_id: str) -> bool:
        found = conn.execute(
            "SELECT 1 FROM edges WHERE from_node=? LIMIT 1",
            (node_id,),
        ).fetchone()
        return found is None

    def purge(ids: list) -> None:
        marks = ",".join("?" * len(ids))
        conn.execute(
            f"DELETE FROM edges WHERE from_node IN ({marks}) OR to_node IN ({marks})",
            ids * 2,
        )
        conn.execute(f"DELETE FROM nodes WHERE id IN ({marks})", ids)
        self._v2_delete_nodes(conn, ids)

    while True:
        folders = conn.execute(
            "SELECT id, metadata_json FROM nodes WHERE type='Folder'"
        ).fetchall()
        empty_folders = [
            folder["id"]
            for folder in folders
            if _safe_loads(folder["metadata_json"]).get("source_id") == source_id
            and is_childless(folder["id"])
        ]
        if not empty_folders:
            break
        purge(empty_folders)

    # Drives are handled before computers so that a computer whose last
    # drive was just removed is also removed.
    for hierarchy_type in ("Drive", "Computer"):
        candidates = conn.execute(
            "SELECT id FROM nodes WHERE type=?", (hierarchy_type,)
        ).fetchall()
        empty_nodes = [
            candidate["id"]
            for candidate in candidates
            if is_childless(candidate["id"])
        ]
        if empty_nodes:
            purge(empty_nodes)
|
|
1102
|
+
|
|
1103
|
+
def _local_file_index_has_extracted_text(self, row: sqlite3.Row) -> bool:
    """Tell whether an index row recorded a positive extracted-text length.

    Reads ``metadata_json`` of the row and checks that
    ``metadata["parser"]["extracted_chars"]`` converts to an integer
    greater than zero. Missing or malformed values count as "no text".
    """
    payload = _safe_loads(row["metadata_json"])
    if not isinstance(payload, dict):
        return False
    parser_info = payload.get("parser")
    if not isinstance(parser_info, dict):
        return False
    try:
        char_count = int(parser_info.get("extracted_chars") or 0)
    except (TypeError, ValueError):
        return False
    return char_count > 0
|
|
1112
|
+
|
|
1113
|
+
def index_local_folder(
    self,
    path: Path,
    *,
    include_ocr: bool = False,
    watch_enabled: bool = False,
    user_email: Optional[str] = None,
    consent: Optional[Dict[str, Any]] = None,
    max_files: int = 5_000,
) -> Dict[str, Any]:
    """Read approved files from a local folder and connect them to Graph RAG.

    Registers (or refreshes) the folder as a knowledge source, walks its
    scan entries and, per file, either records why it was skipped, reuses
    the existing graph node when the file is unchanged, or extracts its
    text and rebuilds its graph node. When the scan was not cut short by
    the file limit, files that were indexed before but were not seen this
    time are marked as deleted and their graph content is removed.

    Args:
        path: Folder to index; expanded and resolved before use.
        include_ocr: Run OCR on image files.
        watch_enabled: Stored on the source record and echoed in the result.
        user_email: Recorded as ``approved_by`` in the consent payload.
        consent: Extra consent fields; they override the defaults.
        max_files: Scan limit, clamped to the range 1..50,000.

    Returns:
        A dict with ``status``, ``source``, per-outcome ``counts``, up to
        100 ``indexed_nodes``, up to 50 ``errors`` and a ``notice``.

    Raises:
        ValueError: If the path does not exist or is not a directory.
    """
    root = Path(path).expanduser().resolve()
    if not root.exists():
        raise ValueError(f"경로가 존재하지 않습니다: {path}")
    if not root.is_dir():
        raise ValueError(f"폴더가 아닙니다: {path}")

    os_type = _current_os_type()
    drive_id = _drive_id_for_path(root)
    source_id = f"source:{_path_fingerprint(root)}"
    now = _now()
    max_files = max(1, min(int(max_files or 5_000), 50_000))
    consent_payload = {
        "approved_at": now,
        "approved_by": user_email,
        "knowledge_source": True,
        "include_ocr": bool(include_ocr),
        "watch_enabled": bool(watch_enabled),
        "sensitive_files_default_excluded": True,
        **(consent or {}),
    }
    counts: Counter = Counter()
    seen_relative_paths: set = set()
    indexed_nodes: List[str] = []
    errors: List[Dict[str, str]] = []
    limit_reached = False

    with self._connect() as conn:
        conn.execute(
            """
            INSERT INTO knowledge_sources(
            id, root_path, os_type, drive_id, label, status, include_ocr,
            watch_enabled, consent_json, created_at, updated_at, last_scanned_at
            )
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(id) DO UPDATE SET
            root_path=excluded.root_path,
            os_type=excluded.os_type,
            drive_id=excluded.drive_id,
            label=excluded.label,
            status=excluded.status,
            include_ocr=excluded.include_ocr,
            watch_enabled=excluded.watch_enabled,
            consent_json=excluded.consent_json,
            updated_at=excluded.updated_at,
            last_scanned_at=excluded.last_scanned_at
            """,
            (
                source_id,
                str(root),
                os_type,
                drive_id,
                root.name or str(root),
                "scanning",
                1 if include_ocr else 0,
                1 if watch_enabled else 0,
                _json(consent_payload),
                now,
                now,
                now,
            ),
        )

        for entry in self._iter_local_scan_entries(root, max_files=max_files):
            kind = entry["kind"]
            file_path = entry["path"]
            if kind == "limit_reached":
                counts["limit_reached"] += 1
                limit_reached = True
                break
            if kind in {"excluded_dir", "excluded"}:
                counts["excluded"] += 1
                continue
            if kind in {"inaccessible_dir", "inaccessible_file"}:
                counts["failed"] += 1
                errors.append(
                    {
                        "path": str(file_path),
                        "error": entry.get("reason", "inaccessible"),
                    }
                )
                continue
            if kind != "file":
                continue

            stat = entry["stat"]
            try:
                relative_path = file_path.relative_to(root).as_posix()
            except ValueError:
                relative_path = file_path.name
            seen_relative_paths.add(relative_path)
            modified_at = _safe_iso_from_stat_mtime(stat.st_mtime)
            existing = conn.execute(
                """
                SELECT size_bytes, modified_at, sha256, graph_node_id, status, metadata_json
                FROM local_file_index
                WHERE source_id=? AND relative_path=?
                """,
                (source_id, relative_path),
            ).fetchone()
            decision = self._local_file_decision(file_path, root, stat)
            parser_type = decision["parser_type"]
            category = decision["category"]

            # Graph node left by a previous run (None when there is none);
            # _delete_local_file_graph ignores a falsy id.
            existing_graph_id = existing["graph_node_id"] if existing else None
            # The previous graph node may be reused only if it exists and
            # the previous run actually extracted text for it.
            reusable = bool(
                existing_graph_id
            ) and self._local_file_index_has_extracted_text(existing)
            # Keyword arguments shared by every index upsert for this file.
            common: Dict[str, Any] = {
                "source_id": source_id,
                "root": root,
                "file_path": file_path,
                "stat": stat,
                "os_type": os_type,
                "drive_id": drive_id,
                "parser_type": parser_type,
            }

            if not decision["indexable"]:
                counts[decision["status"]] += 1
                self._delete_local_file_graph(conn, existing_graph_id)
                self._upsert_local_file_index(
                    conn,
                    status=decision["status"],
                    metadata={
                        "reason": decision["reason"],
                        "category": category,
                    },
                    **common,
                )
                continue

            # Cheap check: same size and mtime as the last indexed run.
            if (
                reusable
                and existing["status"] == "indexed"
                and existing["size_bytes"] == stat.st_size
                and existing["modified_at"] == modified_at
            ):
                counts["skipped_unchanged"] += 1
                self._upsert_local_file_index(
                    conn,
                    status="indexed",
                    sha256=existing["sha256"],
                    graph_node_id=existing_graph_id,
                    metadata={
                        **_safe_loads(existing["metadata_json"]),
                        "category": category,
                        "unchanged": True,
                    },
                    **common,
                )
                continue

            # Per-file isolation: any read failure is recorded and the
            # scan continues with the next file.
            try:
                data = file_path.read_bytes()
                digest = _sha256_bytes(data)
            except Exception as exc:
                counts["failed"] += 1
                errors.append({"path": str(file_path), "error": str(exc)})
                self._delete_local_file_graph(conn, existing_graph_id)
                self._upsert_local_file_index(
                    conn,
                    status="failed",
                    error_message=str(exc),
                    metadata={"category": category},
                    **common,
                )
                continue

            # Content check: metadata changed but the bytes did not.
            if reusable and existing["sha256"] == digest:
                counts["skipped_unchanged"] += 1
                self._upsert_local_file_index(
                    conn,
                    status="indexed",
                    sha256=digest,
                    graph_node_id=existing_graph_id,
                    metadata={
                        **_safe_loads(existing["metadata_json"]),
                        "category": category,
                        "sha256_unchanged": True,
                    },
                    **common,
                )
                continue

            try:
                text, parser_meta = self._extract_local_file_text(
                    file_path,
                    category,
                    include_ocr=include_ocr,
                )
                text = _clean_text(text)
                parser_meta = {**parser_meta, "extracted_chars": len(text)}
                if not text:
                    counts["skipped_empty_text"] += 1
                    self._delete_local_file_graph(conn, existing_graph_id)
                    self._upsert_local_file_index(
                        conn,
                        status="skipped_empty_text",
                        sha256=digest,
                        error_message="텍스트 추출 결과가 비어 있습니다.",
                        metadata={
                            "category": category,
                            "parser": parser_meta,
                        },
                        **common,
                    )
                    continue
                graph_node_id = self._upsert_local_file_node(
                    conn,
                    source_id=source_id,
                    root=root,
                    file_path=file_path,
                    stat=stat,
                    os_type=os_type,
                    drive_id=drive_id,
                    sha256=digest,
                    category=category,
                    parser_type=parser_type,
                    text=text,
                    parser_meta=parser_meta,
                )
                self._upsert_local_file_index(
                    conn,
                    status="indexed",
                    sha256=digest,
                    graph_node_id=graph_node_id,
                    metadata={
                        "category": category,
                        "parser": parser_meta,
                    },
                    **common,
                )
                counts["indexed"] += 1
                indexed_nodes.append(graph_node_id)
            except Exception as exc:
                # Parsing or graph building failed for this file only.
                counts["failed"] += 1
                errors.append({"path": str(file_path), "error": str(exc)})
                self._delete_local_file_graph(conn, existing_graph_id)
                self._upsert_local_file_index(
                    conn,
                    status="failed",
                    sha256=digest,
                    error_message=str(exc),
                    metadata={"category": category},
                    **common,
                )

        # A truncated scan cannot tell "deleted" from "not reached".
        if not limit_reached:
            # Only rows not yet marked deleted are considered; otherwise
            # tombstones from earlier scans would be counted and rewritten
            # again on every rescan.
            existing_rows = {
                row["relative_path"]: row["graph_node_id"]
                for row in conn.execute(
                    "SELECT relative_path, graph_node_id FROM local_file_index "
                    "WHERE source_id=? AND COALESCE(deleted, 0)=0",
                    (source_id,),
                )
            }
            deleted_paths = set(existing_rows) - seen_relative_paths
            for relative_path in deleted_paths:
                self._delete_local_file_graph(
                    conn, existing_rows.get(relative_path)
                )
                conn.execute(
                    """
                    UPDATE local_file_index
                    SET status='deleted', deleted=1, last_scanned_at=?, error_message=NULL, graph_node_id=NULL
                    WHERE source_id=? AND relative_path=?
                    """,
                    (_now(), source_id, relative_path),
                )
            counts["deleted"] = len(deleted_paths)

        finished_at = _now()
        conn.execute(
            """
            UPDATE knowledge_sources
            SET status='active', updated_at=?, last_scanned_at=?
            WHERE id=?
            """,
            (finished_at, finished_at, source_id),
        )

    return {
        "status": "ok",
        "source": {
            "id": source_id,
            "root_path": str(root),
            "os_type": os_type,
            "drive_id": drive_id,
            "include_ocr": bool(include_ocr),
            "watch_enabled": bool(watch_enabled),
        },
        "counts": dict(counts),
        "indexed_nodes": indexed_nodes[:100],
        "errors": errors[:50],
        "notice": "Lattice AI는 사용자가 선택한 폴더만 AI 지식으로 변환합니다.",
    }
|