ltcai 0.1.30 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +233 -184
- package/auto_setup.py +279 -55
- package/docs/CHANGELOG.md +69 -0
- package/knowledge_graph.py +1338 -3
- package/knowledge_graph_api.py +112 -0
- package/latticeai/__init__.py +1 -0
- package/latticeai/__pycache__/__init__.cpython-314.pyc +0 -0
- package/latticeai/api/__init__.py +1 -0
- package/latticeai/api/__pycache__/admin.cpython-314.pyc +0 -0
- package/latticeai/api/__pycache__/auth.cpython-314.pyc +0 -0
- package/latticeai/api/admin.py +187 -0
- package/latticeai/api/auth.py +233 -0
- package/latticeai/core/__init__.py +1 -0
- package/latticeai/core/__pycache__/__init__.cpython-314.pyc +0 -0
- package/latticeai/core/__pycache__/audit.cpython-314.pyc +0 -0
- package/latticeai/core/__pycache__/security.cpython-314.pyc +0 -0
- package/latticeai/core/__pycache__/sessions.cpython-314.pyc +0 -0
- package/latticeai/core/audit.py +245 -0
- package/latticeai/core/security.py +131 -0
- package/latticeai/core/sessions.py +72 -0
- package/llm_router.py +13 -7
- package/local_knowledge_api.py +319 -0
- package/package.json +5 -2
- package/requirements.txt +2 -1
- package/server.py +290 -901
- package/static/graph.html +7 -2
- package/static/lattice-reference.css +220 -0
- package/static/scripts/graph.js +305 -4
package/knowledge_graph.py
CHANGED
|
@@ -10,13 +10,16 @@ import hashlib
|
|
|
10
10
|
import json
|
|
11
11
|
import logging
|
|
12
12
|
import math
|
|
13
|
+
import os
|
|
14
|
+
import platform
|
|
13
15
|
import re
|
|
14
16
|
import shutil
|
|
15
17
|
import sqlite3
|
|
16
18
|
import zipfile
|
|
19
|
+
from collections import Counter
|
|
17
20
|
from datetime import datetime
|
|
18
21
|
from pathlib import Path
|
|
19
|
-
from typing import Any, Dict, List, Optional
|
|
22
|
+
from typing import Any, Dict, Iterable, List, Optional, Tuple
|
|
20
23
|
|
|
21
24
|
try:
|
|
22
25
|
from kg_schema import KGStoreV2
|
|
@@ -26,6 +29,76 @@ except Exception: # pragma: no cover - v2 schema is optional at import time
|
|
|
26
29
|
|
|
27
30
|
GRAPH_SCHEMA_VERSION = 1

# File extensions the local-folder scanner recognises, grouped by the
# category label that _file_category() assigns to each group.
LOCAL_TEXT_EXTENSIONS = {".md", ".txt"}
LOCAL_CODE_EXTENSIONS = {
    # scripting / application code
    ".py", ".js", ".jsx", ".ts", ".tsx", ".sh", ".zsh", ".sql",
    # markup and styling
    ".html", ".css", ".xml",
    # structured configuration / data
    ".json", ".yaml", ".yml", ".toml", ".ini",
}
LOCAL_DOCUMENT_EXTENSIONS = {".docx", ".pdf"}
LOCAL_SPREADSHEET_EXTENSIONS = {".csv", ".xlsx"}
LOCAL_SLIDE_EXTENSIONS = {".pptx"}
LOCAL_IMAGE_EXTENSIONS = {".jpeg", ".jpg", ".png", ".webp"}

# Every extension that belongs to one of the groups above.
LOCAL_SUPPORTED_EXTENSIONS = set().union(
    LOCAL_TEXT_EXTENSIONS,
    LOCAL_CODE_EXTENSIONS,
    LOCAL_DOCUMENT_EXTENSIONS,
    LOCAL_SPREADSHEET_EXTENSIONS,
    LOCAL_SLIDE_EXTENSIONS,
    LOCAL_IMAGE_EXTENSIONS,
)

# Maximum file size, in bytes, per category; compared against st_size by
# KnowledgeGraphStore._local_file_decision().
LOCAL_SIZE_LIMITS = dict(
    text=4_000_000,
    code=4_000_000,
    pdf=50_000_000,
    document=50_000_000,
    spreadsheet=50_000_000,
    slide_deck=50_000_000,
    image=100_000_000,
)

# Directory names (lower-case) that are never descended into.
COMMON_EXCLUDED_DIRS = {
    # version control, dependencies and virtual environments
    ".git", "node_modules", ".venv", "venv", "env",
    # tool caches
    "__pycache__", ".pytest_cache", ".mypy_cache", ".ruff_cache", ".cache",
    # build output
    ".next", ".nuxt", ".turbo", "dist", "build", "target", "out", "coverage",
    # per-user configuration and credential stores
    ".config", ".ssh", ".gnupg", ".docker", ".kube", ".aws", ".azure", ".local",
    # package-manager and toolchain homes
    ".npm", ".pnpm-store", ".yarn", ".bun", ".cargo", ".rustup", ".pyenv", ".conda",
    # editor / assistant state folders
    ".claude", ".codex", ".cursor", ".copilot", ".antigravity", ".antigravity-ide",
}

# Exact file names (lower-case) that are always blocked.
COMMON_EXCLUDED_FILE_NAMES = {
    ".env", ".env.local", ".env.production", ".env.development",
    "id_rsa", "id_ed25519", "authorized_keys", "known_hosts",
    "credentials.json", "service-account.json", "token.json", "secrets.json",
    "cookies", "login data", "history", "web data",
    ".ds_store", "thumbs.db",
}

# File suffixes (lower-case) that are always blocked.
COMMON_EXCLUDED_FILE_SUFFIXES = {
    # key material, vaults and databases
    ".pem", ".key", ".p12", ".pfx", ".kdbx", ".wallet", ".sqlite", ".db",
    # executables and installers
    ".exe", ".dll", ".sys", ".msi", ".dmg", ".pkg", ".app",
    # archives
    ".zip", ".tar", ".gz", ".7z", ".rar",
    # audio / video
    ".mp4", ".mov", ".mp3", ".wav",
    # scratch files
    ".tmp", ".bak", ".lock",
}

# Path tokens that mark a file as sensitive in _sensitive_file_reason().
SENSITIVE_PATH_KEYWORDS = {
    "secret", "secrets", "token", "password", "passwd",
    "credential", "credentials", "private", "key", "wallet",
    "recovery", "seed", "mnemonic", "cookie", "session",
    "auth", "oauth", "certificate", "cert", "api_key", "apikey",
}

# Per-OS locations treated as system folders by _excluded_directory_reason().
MACOS_EXCLUDED_PREFIXES = (
    "/System",
    "/Library",
    "/Applications",
    "/private",
    "/tmp",
    "/var",
)
WINDOWS_EXCLUDED_NAMES = {
    "windows", "program files", "program files (x86)", "programdata",
    "appdata", "$recycle.bin", "system volume information", "recovery",
    "perflogs", "intel", "amd", "nvidia",
}
LINUX_EXCLUDED_PREFIXES = (
    "/bin",
    "/boot",
    "/dev",
    "/etc",
    "/lib",
    "/lib64",
    "/proc",
    "/root",
    "/run",
    "/sbin",
    "/sys",
    "/tmp",
    "/usr",
    "/var",
    "/snap",
    "/lost+found",
)
|
|
101
|
+
|
|
29
102
|
|
|
30
103
|
def _now() -> str:
|
|
31
104
|
return datetime.now().isoformat()
|
|
@@ -80,6 +153,199 @@ def _sha256_text(text: str) -> str:
|
|
|
80
153
|
return hashlib.sha256(text.encode("utf-8", errors="replace")).hexdigest()
|
|
81
154
|
|
|
82
155
|
|
|
156
|
+
def _safe_iso_from_stat_mtime(mtime: float) -> str:
|
|
157
|
+
try:
|
|
158
|
+
return datetime.fromtimestamp(float(mtime)).isoformat()
|
|
159
|
+
except (TypeError, ValueError, OSError):
|
|
160
|
+
return ""
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def _path_fingerprint(path: Path) -> str:
    """Return the first 24 hex characters of the SHA-256 of the resolved path."""
    absolute = path.expanduser().resolve()
    digest = _sha256_text(str(absolute))
    return digest[:24]
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def _is_relative_to(path: Path, base: Path) -> bool:
|
|
168
|
+
try:
|
|
169
|
+
path.relative_to(base)
|
|
170
|
+
return True
|
|
171
|
+
except ValueError:
|
|
172
|
+
return False
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def _path_parts_lower(path: Path) -> List[str]:
|
|
176
|
+
return [part.lower() for part in path.parts if part and part not in {os.sep, path.anchor}]
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def _current_os_type() -> str:
|
|
180
|
+
system = platform.system().lower()
|
|
181
|
+
if system.startswith("darwin"):
|
|
182
|
+
return "macos"
|
|
183
|
+
if system.startswith("windows"):
|
|
184
|
+
return "windows"
|
|
185
|
+
if system.startswith("linux"):
|
|
186
|
+
return "linux"
|
|
187
|
+
return system or "unknown"
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def _drive_id_for_path(path: Path) -> str:
|
|
191
|
+
resolved = path.expanduser().resolve()
|
|
192
|
+
if resolved.drive:
|
|
193
|
+
return resolved.drive.upper()
|
|
194
|
+
parts = resolved.parts
|
|
195
|
+
if len(parts) >= 3 and parts[1] == "Volumes":
|
|
196
|
+
return f"/Volumes/{parts[2]}"
|
|
197
|
+
if len(parts) >= 3 and parts[1] == "media":
|
|
198
|
+
return f"/media/{parts[2]}"
|
|
199
|
+
if len(parts) >= 3 and parts[1] == "mnt":
|
|
200
|
+
return f"/mnt/{parts[2]}"
|
|
201
|
+
return resolved.anchor or "/"
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def _file_category(ext: str) -> str:
    """Classify a file extension into one of the scanner's category labels.

    Returns ``"code"``, ``"text"``, ``"pdf"``, ``"document"``,
    ``"spreadsheet"``, ``"slide_deck"``, ``"image"`` or ``"unsupported"``.
    """
    suffix = (ext or "").lower()
    # Checked in order: ".pdf" is also a member of LOCAL_DOCUMENT_EXTENSIONS,
    # so the dedicated "pdf" entry has to come before "document".
    ordered_groups = (
        (LOCAL_CODE_EXTENSIONS, "code"),
        (LOCAL_TEXT_EXTENSIONS, "text"),
        ({".pdf"}, "pdf"),
        (LOCAL_DOCUMENT_EXTENSIONS, "document"),
        (LOCAL_SPREADSHEET_EXTENSIONS, "spreadsheet"),
        (LOCAL_SLIDE_EXTENSIONS, "slide_deck"),
        (LOCAL_IMAGE_EXTENSIONS, "image"),
    )
    for members, label in ordered_groups:
        if suffix in members:
            return label
    return "unsupported"
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def _node_type_for_category(category: str) -> str:
|
|
224
|
+
return {
|
|
225
|
+
"code": "CodeFile",
|
|
226
|
+
"spreadsheet": "Spreadsheet",
|
|
227
|
+
"slide_deck": "SlideDeck",
|
|
228
|
+
"image": "Image",
|
|
229
|
+
"unsupported": "File",
|
|
230
|
+
}.get(category, "Document")
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
def _parser_type_for_category(category: str, ext: str) -> str:
|
|
234
|
+
if category in {"text", "code"}:
|
|
235
|
+
return "plain_text"
|
|
236
|
+
if category == "spreadsheet" and ext == ".csv":
|
|
237
|
+
return "csv_text"
|
|
238
|
+
if category == "image":
|
|
239
|
+
return "image_ocr"
|
|
240
|
+
return ext.lstrip(".") or category
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def _size_limit_for_category(category: str) -> int:
    """Return the byte limit for *category*, defaulting to the document limit."""
    fallback = LOCAL_SIZE_LIMITS["document"]
    return LOCAL_SIZE_LIMITS.get(category, fallback)
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
def _is_hidden_path(path: Path, root: Optional[Path] = None) -> bool:
|
|
248
|
+
parts: Iterable[str]
|
|
249
|
+
if root is not None:
|
|
250
|
+
try:
|
|
251
|
+
parts = path.relative_to(root).parts
|
|
252
|
+
except ValueError:
|
|
253
|
+
parts = path.parts
|
|
254
|
+
else:
|
|
255
|
+
parts = path.parts
|
|
256
|
+
return any(part.startswith(".") and part not in {".", ".."} for part in parts)
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
def _excluded_directory_reason(path: Path, *, root: Optional[Path] = None, os_type: Optional[str] = None) -> Optional[str]:
    """Explain why a directory must not be scanned, or return ``None``.

    Checks run in this order: common excluded folder names, hidden
    components below *root*, then OS-specific system locations.

    Args:
        path: Directory being considered.
        root: Scan root chosen by the user, if any. A system prefix only
            blocks *path* when *root* itself is not under that prefix.
        os_type: ``"macos"``, ``"windows"`` or ``"linux"``; detected from the
            running platform when omitted.

    Returns:
        ``"excluded_folder"``, ``"hidden_folder"``, ``"system_folder"``,
        ``"user_library"`` or ``None`` when the directory may be scanned.
    """
    platform_name = os_type or _current_os_type()

    if path.name.lower() in COMMON_EXCLUDED_DIRS:
        return "excluded_folder"
    if _is_hidden_path(path, root):
        return "hidden_folder"

    if platform_name == "windows":
        # Any component of the full path matching a system name blocks it.
        if any(segment in WINDOWS_EXCLUDED_NAMES for segment in _path_parts_lower(path)):
            return "system_folder"

    path_text = path.as_posix()
    root_text = root.as_posix() if root else ""

    def _under(text: str, prefix: str) -> bool:
        return text == prefix or text.startswith(f"{prefix}/")

    def _blocked_by(prefixes: Tuple[str, ...]) -> bool:
        for prefix in prefixes:
            root_inside = bool(root_text) and _under(root_text, prefix)
            if _under(path_text, prefix) and not root_inside:
                return True
        return False

    if platform_name == "macos":
        library = Path.home() / "Library"
        try:
            root_in_library = bool(root) and _is_relative_to(
                root.expanduser().resolve(), library.expanduser().resolve()
            )
            path_in_library = _is_relative_to(
                path.expanduser().resolve(), library.expanduser().resolve()
            )
            if path_in_library and not root_in_library:
                return "user_library"
        except OSError:
            # Resolution failed; fall through to the prefix check.
            pass
        if _blocked_by(MACOS_EXCLUDED_PREFIXES):
            return "system_folder"
    elif platform_name == "linux":
        if _blocked_by(LINUX_EXCLUDED_PREFIXES):
            return "system_folder"
    return None
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
def _sensitive_file_reason(path: Path, *, root: Optional[Path] = None) -> Optional[str]:
    """Explain why a file must not be indexed, judging by its name and path only.

    Args:
        path: File being considered. Its contents are never read.
        root: Scan root, if any. When *path* lies under it, only the part of
            the path below *root* is searched for sensitive keywords.

    Returns:
        ``"sensitive_or_excluded_file"`` when the file name or suffix is on a
        block list, ``"sensitive_name"`` when a path token is a sensitive
        keyword, otherwise ``None``.
    """
    name = path.name.lower()
    suffix = path.suffix.lower()
    if name in COMMON_EXCLUDED_FILE_NAMES or suffix in COMMON_EXCLUDED_FILE_SUFFIXES:
        return "sensitive_or_excluded_file"
    try:
        rel_text = path.relative_to(root).as_posix().lower() if root else path.as_posix().lower()
    except ValueError:
        rel_text = path.as_posix().lower()
    for token in re.split(r"[^0-9a-zA-Z_가-힣]+", rel_text):
        if token in SENSITIVE_PATH_KEYWORDS:
            return "sensitive_name"
        # The tokenizer keeps "_" inside tokens so that "api_key" matches as a
        # whole. Also test the underscore-separated pieces, otherwise
        # "db_password.txt" passes while "db-password.txt" is blocked.
        if "_" in token and any(piece in SENSITIVE_PATH_KEYWORDS for piece in token.split("_")):
            return "sensitive_name"
    return None
|
|
311
|
+
|
|
312
|
+
|
|
313
|
+
def _root_warning(path: Path, os_type: str) -> Optional[str]:
|
|
314
|
+
resolved = path.expanduser().resolve()
|
|
315
|
+
home = Path.home().expanduser().resolve()
|
|
316
|
+
if os_type == "macos" and resolved == home:
|
|
317
|
+
return "홈 전체에는 설정/숨김 폴더가 포함될 수 있습니다. 문서, 데스크탑, 다운로드, 프로젝트 폴더부터 추가하는 것을 권장합니다."
|
|
318
|
+
if os_type == "linux" and resolved.as_posix() == "/":
|
|
319
|
+
return "루트 디렉터리에는 시스템 파일이 포함되어 있습니다. 일반 사용자 폴더나 마운트된 데이터 폴더를 권장합니다."
|
|
320
|
+
if os_type == "windows" and str(resolved).rstrip("\\/").upper() in {"C:", "C:\\"}:
|
|
321
|
+
return "C드라이브에는 Windows 시스템 파일과 앱 설정 파일이 포함되어 있습니다. 하위 폴더를 선택하는 것을 권장합니다."
|
|
322
|
+
return None
|
|
323
|
+
|
|
324
|
+
|
|
325
|
+
def _sample_file(path: Path, root: Path, status: str, reason: str = "") -> Dict[str, Any]:
    """Build a metadata-only sample record for *path*.

    The record carries the absolute and root-relative path, name, lower-cased
    extension, the supplied *status* / *reason*, and the size and modification
    time when ``stat`` succeeds. Size is ``None`` for anything that is not a
    regular file or cannot be stat-ed.
    """
    try:
        relative = path.relative_to(root).as_posix()
    except ValueError:
        relative = path.name

    size: Optional[int] = None
    modified_at = ""
    try:
        info = path.stat()
        size = info.st_size if path.is_file() else None
        modified_at = _safe_iso_from_stat_mtime(info.st_mtime)
    except OSError:
        size = None
        modified_at = ""

    record: Dict[str, Any] = {}
    record["path"] = str(path)
    record["relative_path"] = relative
    record["name"] = path.name
    record["extension"] = path.suffix.lower()
    record["status"] = status
    record["reason"] = reason
    record["size_bytes"] = size
    record["modified_at"] = modified_at
    return record
|
|
347
|
+
|
|
348
|
+
|
|
83
349
|
def _clean_text(text: str) -> str:
|
|
84
350
|
return re.sub(r"\s+", " ", str(text or "")).strip()
|
|
85
351
|
|
|
@@ -380,6 +646,22 @@ def _semantic_items(text: str) -> List[Dict[str, str]]:
|
|
|
380
646
|
return items[:8]
|
|
381
647
|
|
|
382
648
|
|
|
649
|
+
def _topic_candidates(text: str, limit: int = 8) -> List[str]:
    """Return compact keyword candidates for fallback graph search.

    Uses the concept extractor's result when it yields anything. Otherwise
    falls back to a regex scan for Latin identifiers and Hangul words,
    de-duplicated case-insensitively while keeping the first spelling seen.
    """
    concepts = _extract_concepts(text, limit=limit)
    if concepts:
        return concepts[:limit]

    first_spelling: Dict[str, str] = {}
    word_pattern = r"[A-Za-z][A-Za-z0-9_.:-]{2,}|[가-힣]{2,12}"
    for match in re.finditer(word_pattern, str(text or "")):
        word = match.group(0)
        folded = word.lower()
        if folded in _CONCEPT_STOP or folded.isdigit():
            continue
        if folded not in first_spelling:
            first_spelling[folded] = word
        if len(first_spelling) >= limit:
            break
    return list(first_spelling.values())[:limit]
|
|
663
|
+
|
|
664
|
+
|
|
383
665
|
class KnowledgeGraphStore:
|
|
384
666
|
def __init__(self, db_path: Path, blob_dir: Path):
|
|
385
667
|
self.db_path = Path(db_path)
|
|
@@ -433,10 +715,52 @@ class KnowledgeGraphStore:
|
|
|
433
715
|
created_at TEXT NOT NULL,
|
|
434
716
|
FOREIGN KEY(source_node) REFERENCES nodes(id) ON DELETE CASCADE
|
|
435
717
|
);
|
|
718
|
+
CREATE TABLE IF NOT EXISTS knowledge_sources (
|
|
719
|
+
id TEXT PRIMARY KEY,
|
|
720
|
+
root_path TEXT NOT NULL UNIQUE,
|
|
721
|
+
os_type TEXT NOT NULL,
|
|
722
|
+
drive_id TEXT,
|
|
723
|
+
label TEXT,
|
|
724
|
+
status TEXT NOT NULL,
|
|
725
|
+
include_ocr INTEGER NOT NULL DEFAULT 0,
|
|
726
|
+
watch_enabled INTEGER NOT NULL DEFAULT 0,
|
|
727
|
+
consent_json TEXT NOT NULL CHECK (json_valid(consent_json)),
|
|
728
|
+
created_at TEXT NOT NULL,
|
|
729
|
+
updated_at TEXT NOT NULL,
|
|
730
|
+
last_scanned_at TEXT
|
|
731
|
+
);
|
|
732
|
+
CREATE TABLE IF NOT EXISTS local_file_index (
|
|
733
|
+
id TEXT PRIMARY KEY,
|
|
734
|
+
source_id TEXT NOT NULL,
|
|
735
|
+
os_type TEXT NOT NULL,
|
|
736
|
+
drive_id TEXT,
|
|
737
|
+
root_path TEXT NOT NULL,
|
|
738
|
+
file_path TEXT NOT NULL,
|
|
739
|
+
relative_path TEXT NOT NULL,
|
|
740
|
+
file_name TEXT NOT NULL,
|
|
741
|
+
extension TEXT NOT NULL,
|
|
742
|
+
size_bytes INTEGER,
|
|
743
|
+
modified_at TEXT,
|
|
744
|
+
sha256 TEXT,
|
|
745
|
+
last_scanned_at TEXT,
|
|
746
|
+
last_indexed_at TEXT,
|
|
747
|
+
parser_type TEXT,
|
|
748
|
+
status TEXT NOT NULL,
|
|
749
|
+
error_message TEXT,
|
|
750
|
+
graph_node_id TEXT,
|
|
751
|
+
deleted INTEGER NOT NULL DEFAULT 0,
|
|
752
|
+
metadata_json TEXT NOT NULL CHECK (json_valid(metadata_json)),
|
|
753
|
+
UNIQUE(source_id, relative_path),
|
|
754
|
+
FOREIGN KEY(source_id) REFERENCES knowledge_sources(id) ON DELETE CASCADE
|
|
755
|
+
);
|
|
436
756
|
CREATE INDEX IF NOT EXISTS idx_nodes_type ON nodes(type);
|
|
437
757
|
CREATE INDEX IF NOT EXISTS idx_edges_from ON edges(from_node);
|
|
438
758
|
CREATE INDEX IF NOT EXISTS idx_edges_to ON edges(to_node);
|
|
439
759
|
CREATE INDEX IF NOT EXISTS idx_chunks_source ON chunks(source_node);
|
|
760
|
+
CREATE INDEX IF NOT EXISTS idx_knowledge_sources_root ON knowledge_sources(root_path);
|
|
761
|
+
CREATE INDEX IF NOT EXISTS idx_local_file_index_source ON local_file_index(source_id);
|
|
762
|
+
CREATE INDEX IF NOT EXISTS idx_local_file_index_status ON local_file_index(status);
|
|
763
|
+
CREATE INDEX IF NOT EXISTS idx_local_file_index_graph_node ON local_file_index(graph_node_id);
|
|
440
764
|
"""
|
|
441
765
|
)
|
|
442
766
|
conn.execute(
|
|
@@ -502,6 +826,988 @@ class KnowledgeGraphStore:
|
|
|
502
826
|
)
|
|
503
827
|
return edge_id
|
|
504
828
|
|
|
829
|
+
# ── Local folder sources → Graph RAG ──────────────────────────────────
|
|
830
|
+
|
|
831
|
+
def discover_local_roots(self) -> Dict[str, Any]:
    """Return safe, cross-platform starting points for structure browsing.

    Candidates are the home directory, a fixed set of home sub-folders and,
    per OS, mounted volumes (macOS ``/Volumes``), drive letters and OneDrive
    folders (Windows) or ``/mnt`` and ``/media`` with their children (Linux).
    Only candidates that exist are reported, and each path appears once.

    Returns:
        A dict with ``os_type``, ``computer``, ``roots`` (one dict per
        candidate with ``id``, ``label``, ``path``, ``kind``, ``recommended``
        and ``warning``) and ``privacy_notice``.
    """
    os_type = _current_os_type()
    home = Path.home().expanduser()
    roots: List[Dict[str, Any]] = []
    seen: set = set()

    def add(label: str, path: Path, kind: str, *, recommended: bool = True, warning: Optional[str] = None) -> None:
        try:
            resolved = path.expanduser().resolve()
        except OSError:
            resolved = path.expanduser()
        key = str(resolved)
        if key in seen:
            return
        try:
            if not resolved.exists():
                return
            entry = {
                "id": f"{kind}:{_path_fingerprint(resolved)}",
                "label": label,
                "path": key,
                "kind": kind,
                "recommended": recommended,
                "warning": warning or _root_warning(resolved, os_type),
            }
        except OSError:
            # A candidate that cannot even be probed (permission denied,
            # stale mount, ...) is skipped, so one bad entry neither aborts
            # the remaining candidates nor escapes from this method.
            return
        seen.add(key)
        roots.append(entry)

    add("홈", home, "home", warning=_root_warning(home, os_type))
    for name, label in (
        ("Documents", "문서"),
        ("Desktop", "데스크탑"),
        ("Downloads", "다운로드"),
        ("Pictures", "사진"),
        ("Projects", "프로젝트"),
    ):
        add(label, home / name, name.lower())

    if os_type == "macos":
        volumes = Path("/Volumes")
        if volumes.exists():
            try:
                for volume in sorted(volumes.iterdir(), key=lambda p: p.name.lower()):
                    add(volume.name, volume, "volume", recommended=False)
            except OSError:
                pass
    elif os_type == "windows":
        for letter in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
            drive = Path(f"{letter}:\\")
            if drive.exists():
                # The system drive is listed but not recommended.
                add(f"{letter}: 드라이브", drive, "drive", recommended=(letter != "C"))
        for env_name, label in (("OneDrive", "OneDrive"), ("OneDriveCommercial", "OneDrive")):
            raw = os.environ.get(env_name)
            if raw:
                add(label, Path(raw), "cloud", recommended=False)
    elif os_type == "linux":
        for base in (Path("/mnt"), Path("/media")):
            add(str(base), base, "mounts", recommended=False)
            try:
                if base.exists():
                    for mounted in sorted(base.iterdir(), key=lambda p: p.name.lower()):
                        add(mounted.name, mounted, "volume", recommended=False)
            except OSError:
                pass

    return {
        "os_type": os_type,
        "computer": platform.node() or "local",
        "roots": roots,
        "privacy_notice": "처음에는 드라이브와 폴더 구조만 확인하며, 파일 내용은 사용자가 동의한 뒤에만 읽습니다.",
    }
|
|
899
|
+
|
|
900
|
+
def preview_local_tree(self, path: Path, *, max_items: int = 200) -> Dict[str, Any]:
    """List one folder level using metadata only; file contents are not read.

    Args:
        path: Folder to list. It is expanded and resolved first.
        max_items: Maximum number of children to report, clamped to 1..1000.

    Returns:
        A dict with ``path``, ``os_type``, ``items``, ``truncated``,
        ``inaccessible``, ``warning`` and ``privacy_notice``. When the folder
        itself cannot be listed the same keys are returned with an empty
        ``items`` list plus an ``error`` message.

    Raises:
        ValueError: If *path* does not exist or is not a directory.
    """
    root = Path(path).expanduser().resolve()
    if not root.exists():
        raise ValueError(f"경로가 존재하지 않습니다: {path}")
    if not root.is_dir():
        raise ValueError(f"폴더가 아닙니다: {path}")

    os_type = _current_os_type()
    max_items = max(1, min(int(max_items or 200), 1000))
    privacy_notice = "현재 단계에서는 파일 내용을 읽지 않고, 폴더와 파일의 이름/크기/수정일만 확인합니다."
    items: List[Dict[str, Any]] = []
    inaccessible = 0
    try:
        # Directories first, then case-insensitive by name.
        children = sorted(root.iterdir(), key=lambda p: (not p.is_dir(), p.name.lower()))
    except OSError as exc:
        # PermissionError keeps its original message. Other OSErrors (I/O
        # error, stale mount, ...) used to escape; they are now reported the
        # same way the recursive scanner reports them.
        if isinstance(exc, PermissionError):
            message = f"접근 권한 없음: {exc}"
        else:
            message = f"폴더를 읽을 수 없습니다: {exc}"
        return {
            "path": str(root),
            "os_type": os_type,
            "items": [],
            "truncated": False,
            "inaccessible": 0,
            "warning": _root_warning(root, os_type),
            "error": message,
            "privacy_notice": privacy_notice,
        }

    for child in children[:max_items]:
        try:
            is_dir = child.is_dir()
            stat = child.stat()
            if is_dir:
                reason = _excluded_directory_reason(child, root=root, os_type=os_type)
            else:
                reason = _sensitive_file_reason(child, root=root)
            items.append({
                "name": child.name,
                "path": str(child),
                "type": "directory" if is_dir else "file",
                "extension": "" if is_dir else child.suffix.lower(),
                "size_bytes": None if is_dir else stat.st_size,
                "modified_at": _safe_iso_from_stat_mtime(stat.st_mtime),
                "hidden": _is_hidden_path(child, root),
                "accessible": True,
                "excluded_reason": reason,
            })
        except OSError as exc:
            inaccessible += 1
            denied = isinstance(exc, PermissionError)
            items.append({
                "name": child.name,
                "path": str(child),
                "type": "unknown",
                "accessible": False,
                "excluded_reason": "permission_denied" if denied else str(exc),
            })

    return {
        "path": str(root),
        "os_type": os_type,
        "items": items,
        "truncated": len(children) > max_items,
        "inaccessible": inaccessible,
        "warning": _root_warning(root, os_type),
        "privacy_notice": privacy_notice,
    }
|
|
966
|
+
|
|
967
|
+
def _iter_local_scan_entries(self, root: Path, *, max_files: int) -> Iterable[Dict[str, Any]]:
    """Walk *root* depth-first and yield one event dict per entry found.

    Every event has ``kind`` and ``path``; all but ``"file"`` also carry
    ``reason``, while ``"file"`` events carry the ``stat`` result instead.
    Kinds are ``"file"``, ``"excluded"`` (symlinks and non-regular files),
    ``"excluded_dir"``, ``"inaccessible_dir"``, ``"inaccessible_file"`` and
    ``"limit_reached"``, which ends the walk once more than *max_files*
    regular files have been seen. Symlinks are never followed.
    """
    os_type = _current_os_type()
    pending = [root]
    regular_files = 0

    def _dirs_first(entry: Path):
        return (not entry.is_dir(), entry.name.lower())

    while pending:
        folder = pending.pop()
        try:
            listing = sorted(folder.iterdir(), key=_dirs_first)
        except OSError as exc:
            detail = f"permission_denied: {exc}" if isinstance(exc, PermissionError) else str(exc)
            yield {"kind": "inaccessible_dir", "path": folder, "reason": detail}
            continue

        for entry in listing:
            if entry.is_symlink():
                yield {"kind": "excluded", "path": entry, "reason": "symlink"}
                continue
            try:
                if entry.is_dir():
                    blocked = _excluded_directory_reason(entry, root=root, os_type=os_type)
                    if not blocked:
                        pending.append(entry)
                        continue
                    yield {"kind": "excluded_dir", "path": entry, "reason": blocked}
                    continue
                if not entry.is_file():
                    yield {"kind": "excluded", "path": entry, "reason": "not_regular_file"}
                    continue
                info = entry.stat()
            except OSError as exc:
                detail = f"permission_denied: {exc}" if isinstance(exc, PermissionError) else str(exc)
                yield {"kind": "inaccessible_file", "path": entry, "reason": detail}
                continue

            regular_files += 1
            if regular_files > max_files:
                yield {"kind": "limit_reached", "path": entry, "reason": "max_files"}
                return
            yield {"kind": "file", "path": entry, "stat": info}
|
|
1010
|
+
|
|
1011
|
+
def _local_file_decision(self, path: Path, root: Path, stat: os.stat_result) -> Dict[str, Any]:
    """Decide from metadata alone whether a file may be indexed.

    Returns a dict with ``status`` (``"sensitive_blocked"``,
    ``"unsupported"``, ``"too_large"`` or ``"pending"``), ``reason``,
    ``category``, ``parser_type`` and ``indexable``. Checks run in that
    order, so a sensitive name wins over an unsupported extension or size.
    """
    ext = path.suffix.lower()
    category = _file_category(ext)
    parser_type = _parser_type_for_category(category, ext)

    def verdict(status: str, reason: str, indexable: bool) -> Dict[str, Any]:
        return {
            "status": status,
            "reason": reason,
            "category": category,
            "parser_type": parser_type,
            "indexable": indexable,
        }

    blocked = _sensitive_file_reason(path, root=root)
    if blocked:
        return verdict("sensitive_blocked", blocked, False)
    if category == "unsupported":
        return verdict("unsupported", "unsupported_extension", False)

    ceiling = _size_limit_for_category(category)
    if stat.st_size > ceiling:
        return verdict("too_large", f"size>{ceiling}", False)
    return verdict("pending", "", True)
|
|
1048
|
+
|
|
1049
|
+
def audit_local_folder(self, path: Path, *, include_ocr: bool = False, max_files: int = 50_000) -> Dict[str, Any]:
    """Safety-check a folder using metadata only; file bodies are not read.

    Walks the folder, classifies every regular file, and returns counts, up
    to 25 sample records each for allowed and excluded entries, a rough time
    estimate and the consents the caller still has to collect.

    Args:
        path: Folder to audit. It is expanded and resolved first.
        include_ocr: Whether image OCR was requested; only affects the
            time estimate and the echoed ``include_ocr_requested`` flag.
        max_files: Maximum regular files to inspect, clamped to 1..200000.

    Raises:
        ValueError: If *path* does not exist or is not a directory.
    """
    root = Path(path).expanduser().resolve()
    if not root.exists():
        raise ValueError(f"경로가 존재하지 않습니다: {path}")
    if not root.is_dir():
        raise ValueError(f"폴더가 아닙니다: {path}")

    os_type = _current_os_type()
    max_files = max(1, min(int(max_files or 50_000), 200_000))
    sample_cap = 25

    status_counts: Counter = Counter()
    category_counts: Counter = Counter()
    extension_counts: Counter = Counter()
    allowed_samples: List[Dict[str, Any]] = []
    excluded_samples: List[Dict[str, Any]] = []
    total_files = readable_files = inaccessible = excluded_dirs = 0
    limit_reached = False

    def note_excluded(target: Path, status: str, reason: str) -> None:
        # Samples are capped, so the extra stat() in _sample_file stays bounded.
        if len(excluded_samples) < sample_cap:
            excluded_samples.append(_sample_file(target, root, status, reason))

    for entry in self._iter_local_scan_entries(root, max_files=max_files):
        kind = entry["kind"]
        target = entry["path"]

        if kind == "limit_reached":
            limit_reached = True
            break
        elif kind == "excluded_dir":
            excluded_dirs += 1
            note_excluded(target, "excluded", entry.get("reason", ""))
        elif kind in {"inaccessible_dir", "inaccessible_file"}:
            inaccessible += 1
            status_counts["failed"] += 1
            note_excluded(target, "failed", entry.get("reason", ""))
        elif kind == "excluded":
            status_counts["excluded"] += 1
            note_excluded(target, "excluded", entry.get("reason", ""))
        elif kind == "file":
            total_files += 1
            decision = self._local_file_decision(target, root, entry["stat"])
            category_counts[decision["category"]] += 1
            extension_counts[target.suffix.lower() or "(none)"] += 1
            if decision["indexable"]:
                readable_files += 1
                status_counts["readable"] += 1
                if len(allowed_samples) < sample_cap:
                    allowed_samples.append(_sample_file(target, root, "readable"))
            else:
                status_counts[decision["status"]] += 1
                note_excluded(target, decision["status"], decision["reason"])

    # Rough per-category cost weights, in seconds, for the time estimate.
    doc_weight = (
        category_counts["pdf"] * 1.4
        + category_counts["document"] * 0.9
        + category_counts["slide_deck"] * 1.0
    )
    sheet_weight = category_counts["spreadsheet"] * 0.6
    ocr_weight = category_counts["image"] * (1.8 if include_ocr else 0.1)
    estimated_seconds = round(readable_files * 0.04 + doc_weight + sheet_weight + ocr_weight, 1)

    excluded_total = int(
        status_counts["excluded"]
        + status_counts["sensitive_blocked"]
        + status_counts["too_large"]
        + status_counts["unsupported"]
    )
    summary = {
        "total_files": total_files,
        "readable_files": readable_files,
        "excluded_files": excluded_total,
        "sensitive_files": int(status_counts["sensitive_blocked"]),
        "too_large_files": int(status_counts["too_large"]),
        "unsupported_files": int(status_counts["unsupported"]),
        "image_ocr_candidates": int(category_counts["image"]),
        "inaccessible_items": inaccessible,
        "excluded_dirs": excluded_dirs,
        "estimated_seconds": estimated_seconds,
        "storage_root": str(self.db_path.parent),
        "limit_reached": limit_reached,
    }
    consent_required = {
        "knowledge_source": True,
        "image_ocr": bool(category_counts["image"]),
        "watch": True,
        "sensitive_files_default_excluded": True,
    }

    return {
        "path": str(root),
        "source_id": f"source:{_path_fingerprint(root)}",
        "os_type": os_type,
        "drive_id": _drive_id_for_path(root),
        "warning": _root_warning(root, os_type),
        "privacy_notice": "현재 단계에서는 파일 내용을 읽지 않고, 폴더와 파일의 이름/크기/수정일만 확인합니다.",
        "include_ocr_requested": bool(include_ocr),
        "summary": summary,
        "by_status": dict(status_counts),
        "by_category": dict(category_counts),
        "by_extension": dict(extension_counts.most_common(40)),
        "allowed_samples": allowed_samples,
        "excluded_samples": excluded_samples,
        "consent_required": consent_required,
    }
|
|
1157
|
+
|
|
1158
|
+
def local_sources(self) -> Dict[str, Any]:
    """List every registered local knowledge source with its file tallies.

    Returns:
        ``{"sources": [...]}``. Each entry mirrors one ``knowledge_sources``
        row (most recently updated first) and carries a ``file_status``
        mapping of ``local_file_index`` status -> row count for that source.
        The mapping is empty when no file row exists for the source.
    """
    source_sql = """
        SELECT id, root_path, os_type, drive_id, label, status, include_ocr,
               watch_enabled, consent_json, created_at, updated_at, last_scanned_at
        FROM knowledge_sources
        ORDER BY updated_at DESC
    """
    tally_sql = (
        "SELECT source_id, status, COUNT(*) AS count "
        "FROM local_file_index GROUP BY source_id, status"
    )
    with self._connect() as conn:
        source_rows = conn.execute(source_sql).fetchall()
        tally_rows = conn.execute(tally_sql).fetchall()

    # Fold the (source_id, status, count) rows into {source_id: {status: count}}.
    per_source: Dict[str, Dict[str, int]] = {}
    for tally in tally_rows:
        bucket = per_source.setdefault(tally["source_id"], {})
        bucket[tally["status"]] = tally["count"]

    listing = []
    for rec in source_rows:
        listing.append(
            {
                "id": rec["id"],
                "root_path": rec["root_path"],
                "os_type": rec["os_type"],
                "drive_id": rec["drive_id"],
                "label": rec["label"],
                "status": rec["status"],
                # Stored as integers; exposed as real booleans.
                "include_ocr": bool(rec["include_ocr"]),
                "watch_enabled": bool(rec["watch_enabled"]),
                "consent": _safe_loads(rec["consent_json"]),
                "created_at": rec["created_at"],
                "updated_at": rec["updated_at"],
                "last_scanned_at": rec["last_scanned_at"],
                "file_status": per_source.get(rec["id"], {}),
            }
        )
    return {"sources": listing}
|
|
1193
|
+
|
|
1194
|
+
def set_local_source_watch(self, source_id: str, enabled: bool) -> Dict[str, Any]:
    """Turn the watch flag of one knowledge source on or off.

    Args:
        source_id: Identifier of a ``knowledge_sources`` row; surrounding
            whitespace is ignored.
        enabled: Desired watch state (any truthy/falsy value).

    Returns:
        ``{"source_id": <normalized id>, "watch_enabled": <bool>}``.

    Raises:
        ValueError: If the id is empty after normalization, or no
            ``knowledge_sources`` row has that id.
    """
    key = str(source_id or "").strip()
    if not key:
        raise ValueError("source_id required")

    with self._connect() as conn:
        found = conn.execute(
            "SELECT id FROM knowledge_sources WHERE id=?",
            (key,),
        ).fetchone()
        if not found:
            raise ValueError(f"knowledge source not found: {key}")
        # The flag is stored as an integer; updated_at is bumped with it.
        flag = 1 if enabled else 0
        conn.execute(
            "UPDATE knowledge_sources SET watch_enabled=?, updated_at=? WHERE id=?",
            (flag, _now(), key),
        )
    return {"source_id": key, "watch_enabled": bool(enabled)}
|
|
1210
|
+
|
|
1211
|
+
def _extract_local_file_text(self, path: Path, category: str, *, include_ocr: bool) -> Tuple[str, Dict[str, Any]]:
    """Extract plain text and parser metadata from one local file.

    The parser is picked from ``category`` and the lower-cased file suffix:
    text/code/CSV files are read as UTF-8 (undecodable bytes replaced),
    ``.pdf`` uses pdfplumber, ``.docx`` python-docx, ``.xlsx`` openpyxl,
    ``.pptx`` python-pptx, and images use Pillow plus pytesseract when OCR
    is requested. Any other combination yields empty text.

    Args:
        path: File to read.
        category: File category decided by the caller.
        include_ocr: Run OCR on images when true.

    Returns:
        ``(text, meta)`` where ``text`` is cut to 200,000 characters and
        ``meta`` always has a ``"parser"`` key plus format-specific counters.

    Raises:
        Exception: Whatever the underlying reader raises (missing optional
            library, unreadable file, ...). Only OCR failures are absorbed;
            they are reported through ``meta["ocr_error"]``.
    """
    max_chars = 200_000  # cap on the text returned for a single file
    ext = path.suffix.lower()
    meta: Dict[str, Any] = {"parser": _parser_type_for_category(category, ext)}
    text = ""
    if category in {"text", "code"} or ext == ".csv":
        text = path.read_text(encoding="utf-8", errors="replace")
    elif ext == ".pdf":
        import pdfplumber

        with pdfplumber.open(str(path)) as pdf:
            meta["pages"] = len(pdf.pages)
            text = "\n\n".join((page.extract_text() or "") for page in pdf.pages)
    elif ext == ".docx":
        from docx import Document

        doc = Document(str(path))
        paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
        meta["paragraphs"] = len(paragraphs)
        meta["tables"] = len(doc.tables)
        text = "\n\n".join(paragraphs)
    elif ext == ".xlsx":
        from openpyxl import load_workbook

        wb = load_workbook(str(path), read_only=True, data_only=True)
        try:
            rows_all = []
            # Running length of "\n".join(rows_all); starting at -1 cancels
            # the separator counted for the first line. Tracking it avoids
            # re-joining the whole list after every row (quadratic).
            joined_len = -1
            for ws in wb.worksheets:
                header = f"[Sheet: {ws.title}]"
                rows_all.append(header)
                joined_len += len(header) + 1
                for row in ws.iter_rows(values_only=True):
                    line = "\t".join(str(cell) if cell is not None else "" for cell in row)
                    rows_all.append(line)
                    joined_len += len(line) + 1
                    if joined_len > max_chars:
                        break
                if joined_len > max_chars:
                    # Anything read from later sheets would be truncated
                    # away below, so stop reading here.
                    break
            meta["sheets"] = len(wb.worksheets)
            text = "\n".join(rows_all)
        finally:
            # Read-only workbooks keep the file open until closed explicitly.
            wb.close()
    elif ext == ".pptx":
        from pptx import Presentation

        prs = Presentation(str(path))
        slides_text = []
        for index, slide in enumerate(prs.slides, 1):
            parts = [
                shape.text_frame.text
                for shape in slide.shapes
                if getattr(shape, "has_text_frame", False)
            ]
            slides_text.append(f"[Slide {index}]\n" + "\n".join(parts))
        meta["slides"] = len(prs.slides)
        text = "\n\n".join(slides_text)
    elif category == "image":
        from PIL import Image

        with Image.open(str(path)) as image:
            meta.update({
                "width": image.width,
                "height": image.height,
                "format": image.format,
                "mode": image.mode,
                "ocr_enabled": bool(include_ocr),
            })
            if include_ocr:
                # Best effort: a broken or missing OCR runtime must not fail
                # the whole file, so the error is reported in the metadata.
                try:
                    import pytesseract

                    text = pytesseract.image_to_string(image)
                    meta["ocr_chars"] = len(text)
                except Exception as exc:  # pragma: no cover - depends on local OCR runtime
                    meta["ocr_error"] = str(exc)
                    text = ""
    return text[:max_chars], meta
|
|
1273
|
+
|
|
1274
|
+
def _ensure_local_hierarchy(
    self,
    conn: sqlite3.Connection,
    *,
    source_id: str,
    root: Path,
    file_path: Path,
    os_type: str,
    drive_id: str,
) -> str:
    """Upsert the Computer -> Drive -> Folder chain that sits above a file.

    One Computer node (named after ``platform.node()``), one Drive node, the
    source's root Folder node and one Folder node per directory level
    between ``root`` and the file are upserted and chained with "포함함"
    edges.

    Args:
        conn: Open connection the upserts are issued on.
        source_id: Knowledge source the folders belong to.
        root: Root folder of the source.
        file_path: File whose parent chain is needed.
        os_type: OS label stored in the Computer/Drive metadata.
        drive_id: Drive identifier used as the Drive node title.

    Returns:
        Id of the Folder node that directly contains ``file_path`` (the root
        folder id when the file is not located under ``root``).
    """

    def link(parent: str, child: str) -> None:
        # Every hierarchy edge carries the same relation and origin tag.
        self._upsert_edge(conn, parent, child, "포함함", metadata={"source": "local_scan"})

    host_name = platform.node() or "내 컴퓨터"
    host_node = f"computer:{_slug(host_name)}"
    drive_node = "drive:" + _sha256_text(f"{os_type}:{drive_id}")[:24]
    root_node = "folder:" + _sha256_text(f"{source_id}:root")[:24]

    self._upsert_node(conn, host_node, "Computer", host_name, metadata={"os_type": os_type})
    self._upsert_node(
        conn,
        drive_node,
        "Drive",
        drive_id,
        metadata={"os_type": os_type, "drive_id": drive_id},
    )
    link(host_node, drive_node)
    self._upsert_node(
        conn,
        root_node,
        "Folder",
        root.name or str(root),
        summary=str(root),
        metadata={"source_id": source_id, "path": str(root), "root": True},
    )
    link(drive_node, root_node)

    try:
        nested = file_path.parent.relative_to(root).parts
    except ValueError:
        # File is outside the root: attach it to the root folder directly.
        nested = ()

    deepest = root_node
    walked = root
    for segment in nested:
        walked = walked / segment
        level_node = "folder:" + _sha256_text(f"{source_id}:{walked.as_posix()}")[:24]
        self._upsert_node(
            conn,
            level_node,
            "Folder",
            segment,
            summary=str(walked),
            metadata={"source_id": source_id, "path": str(walked), "root": False},
        )
        link(deepest, level_node)
        deepest = level_node
    return deepest
|
|
1321
|
+
|
|
1322
|
+
def _upsert_local_file_index(
    self,
    conn: sqlite3.Connection,
    *,
    source_id: str,
    root: Path,
    file_path: Path,
    stat: Optional[os.stat_result],
    os_type: str,
    drive_id: str,
    status: str,
    parser_type: str,
    sha256: Optional[str] = None,
    graph_node_id: Optional[str] = None,
    error_message: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> str:
    """Insert or refresh the ``local_file_index`` row of one scanned file.

    Rows are unique per ``(source_id, relative_path)``. On conflict every
    column is overwritten, except that ``sha256``, ``last_indexed_at`` and
    ``graph_node_id`` keep their stored value when the new value is NULL.

    Args:
        conn: Open connection the statement runs on.
        source_id: Knowledge source the file belongs to.
        root: Root folder of the source; used to derive the relative path.
        file_path: Absolute path of the file.
        stat: Stat result for size/mtime, or ``None`` when unavailable.
        os_type: OS label stored with the row.
        drive_id: Drive identifier stored with the row.
        status: Scan status; ``"indexed"`` stamps ``last_indexed_at`` and
            ``"deleted"`` sets the ``deleted`` flag.
        parser_type: Parser label stored with the row.
        sha256: Content hash, if computed.
        graph_node_id: Graph node created for the file, if any.
        error_message: Failure description, if any.
        metadata: Extra payload serialized into ``metadata_json``.

    Returns:
        The row id derived from ``source_id`` and the relative path.
    """
    try:
        rel = file_path.relative_to(root).as_posix()
    except ValueError:
        # Not under root: fall back to the bare file name.
        rel = file_path.name
    row_id = "local-index:" + _sha256_text(f"{source_id}:{rel}")[:24]
    stamp = _now()

    if stat:
        byte_size = stat.st_size
        mtime_iso = _safe_iso_from_stat_mtime(stat.st_mtime)
    else:
        byte_size = None
        mtime_iso = ""

    indexed_stamp = stamp if status == "indexed" else None
    deleted_flag = int(status == "deleted")

    values = (
        row_id,
        source_id,
        os_type,
        drive_id,
        str(root),
        str(file_path),
        rel,
        file_path.name,
        file_path.suffix.lower(),
        byte_size,
        mtime_iso,
        sha256,
        stamp,
        indexed_stamp,
        parser_type,
        status,
        error_message,
        graph_node_id,
        deleted_flag,
        _json(metadata),
    )
    conn.execute(
        """
        INSERT INTO local_file_index(
            id, source_id, os_type, drive_id, root_path, file_path, relative_path,
            file_name, extension, size_bytes, modified_at, sha256, last_scanned_at,
            last_indexed_at, parser_type, status, error_message, graph_node_id,
            deleted, metadata_json
        )
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        ON CONFLICT(source_id, relative_path) DO UPDATE SET
            os_type=excluded.os_type,
            drive_id=excluded.drive_id,
            root_path=excluded.root_path,
            file_path=excluded.file_path,
            file_name=excluded.file_name,
            extension=excluded.extension,
            size_bytes=excluded.size_bytes,
            modified_at=excluded.modified_at,
            sha256=COALESCE(excluded.sha256, local_file_index.sha256),
            last_scanned_at=excluded.last_scanned_at,
            last_indexed_at=COALESCE(excluded.last_indexed_at, local_file_index.last_indexed_at),
            parser_type=excluded.parser_type,
            status=excluded.status,
            error_message=excluded.error_message,
            graph_node_id=COALESCE(excluded.graph_node_id, local_file_index.graph_node_id),
            deleted=excluded.deleted,
            metadata_json=excluded.metadata_json
        """,
        values,
    )
    return row_id
|
|
1383
|
+
|
|
1384
|
+
def _upsert_local_file_node(
    self,
    conn: sqlite3.Connection,
    *,
    source_id: str,
    root: Path,
    file_path: Path,
    stat: os.stat_result,
    os_type: str,
    drive_id: str,
    sha256: str,
    category: str,
    parser_type: str,
    text: str,
    parser_meta: Dict[str, Any],
) -> str:
    """Write one local file, and everything derived from its text, to the graph.

    Steps, in order: make sure the folder hierarchy exists; drop the chunks,
    Chunk/ImageText/Section child nodes and outgoing edges left from an
    earlier indexing of the same file; upsert the file node and attach it to
    its folder; add an ImageText node for OCR text of images; store text
    chunks; add auto-extracted concept nodes, concept-to-concept relations
    and semantic item nodes.

    Args:
        conn: Open connection all statements run on.
        source_id: Knowledge source the file belongs to.
        root: Root folder of the source.
        file_path: Absolute path of the file.
        stat: Stat result supplying size and modification time.
        os_type: OS label forwarded to the hierarchy nodes.
        drive_id: Drive identifier forwarded to the hierarchy nodes.
        sha256: Content hash recorded in the node metadata.
        category: File category; selects the node type.
        parser_type: Parser label recorded in the node metadata.
        text: Extracted text (may be empty).
        parser_meta: Parser details recorded in the node metadata.

    Returns:
        Id of the file node.
    """
    try:
        rel = file_path.relative_to(root).as_posix()
    except ValueError:
        rel = file_path.name
    node_id = "local-file:" + _sha256_text(f"{source_id}:{rel}")[:24]
    folder_id = self._ensure_local_hierarchy(
        conn,
        source_id=source_id,
        root=root,
        file_path=file_path,
        os_type=os_type,
        drive_id=drive_id,
    )

    # Clear what a previous indexing of this file left behind.
    stale_children = [
        hit["id"]
        for hit in conn.execute(
            """
            SELECT e.to_node AS id
            FROM edges e
            JOIN nodes n ON n.id=e.to_node
            WHERE e.from_node=? AND n.type IN ('Chunk', 'ImageText', 'Section')
            """,
            (node_id,),
        ).fetchall()
    ]
    conn.execute("DELETE FROM chunks WHERE source_node=?", (node_id,))
    if stale_children:
        marks = ",".join("?" for _ in stale_children)
        conn.execute(f"DELETE FROM nodes WHERE id IN ({marks})", stale_children)
    conn.execute("DELETE FROM edges WHERE from_node=?", (node_id,))

    file_meta = {
        "source": "local_folder",
        "source_id": source_id,
        "root_path": str(root),
        "file_path": str(file_path),
        "relative_path": rel,
        "filename": file_path.name,
        "ext": file_path.suffix.lower(),
        "category": category,
        "parser_type": parser_type,
        "bytes": stat.st_size,
        "modified_at": _safe_iso_from_stat_mtime(stat.st_mtime),
        "sha256": sha256,
        "parser": parser_meta,
    }
    self._upsert_node(
        conn,
        node_id,
        _node_type_for_category(category),
        file_path.name,
        # Fall back to the relative path when there is no usable text.
        summary=(_clean_text(text) or rel)[:700],
        metadata=file_meta,
        raw=file_meta,
    )
    self._upsert_edge(conn, folder_id, node_id, "포함함", weight=1.0, metadata={"source": "local_scan"})

    # OCR output of an image gets its own node.
    if category == "image" and text:
        ocr_node = "imagetext:" + _sha256_text(f"{node_id}:ocr")[:24]
        self._upsert_node(
            conn,
            ocr_node,
            "ImageText",
            f"{file_path.name} OCR",
            summary=_clean_text(text)[:700],
            metadata={"source_node": node_id, "source_id": source_id, "chars": len(text)},
        )
        self._upsert_edge(conn, node_id, ocr_node, "포함함", weight=0.8, metadata={"source": "ocr"})

    # Text chunks: one Chunk node plus one chunks-table row each.
    for position, piece in enumerate(_chunks(text)):
        piece_id = "chunk:" + _sha256_text(f"{node_id}:{position}:{piece}")[:24]
        self._upsert_node(
            conn,
            piece_id,
            "Chunk",
            f"{file_path.name} chunk {position + 1}",
            summary=piece[:500],
            metadata={"index": position, "source_node": node_id, "source_id": source_id},
        )
        chunk_row = (
            piece_id,
            node_id,
            piece,
            _json({"index": position, "source_node": node_id, "source_id": source_id}),
            _now(),
        )
        conn.execute(
            "INSERT OR REPLACE INTO chunks(id, source_node, text, metadata_json, created_at) "
            "VALUES (?, ?, ?, ?, ?)",
            chunk_row,
        )
        self._upsert_edge(conn, node_id, piece_id, "포함함", weight=0.7, metadata={"source": "local_scan"})

    # Concepts mentioned by the file (the file name takes part in extraction).
    found_concepts = _extract_concepts(f"{file_path.name}\n{text}", limit=18)
    id_by_concept: Dict[str, str] = {}
    for label in found_concepts:
        kind = _classify_node_type(label, text)
        label_id = f"{kind.lower()}:{_slug(label)}"
        id_by_concept[label.lower()] = label_id
        self._upsert_node(
            conn,
            label_id,
            kind,
            label,
            metadata={"auto_extracted": True, "source": "local_folder", "source_id": source_id},
        )
        self._upsert_edge(conn, node_id, label_id, "언급함", weight=0.75, metadata={"source": "local_scan"})

    # Relations between two distinct concepts that were both extracted above.
    for relation in _extract_triples(text, found_concepts, limit=20):
        left = id_by_concept.get(relation["subject"].lower())
        right = id_by_concept.get(relation["object"].lower())
        if not (left and right) or left == right:
            continue
        self._upsert_edge(
            conn,
            left,
            right,
            relation["relation"],
            weight=0.9,
            metadata={"context": relation.get("context", "")[:240], "source_id": source_id},
        )

    # Semantic items found in the text, scoped to this file by their id.
    for entry in _semantic_items(text):
        entry_type = entry["type"]
        entry_title = entry["title"]
        entry_id = f"{entry_type.lower()}:{_sha256_text(f'{node_id}:{entry_type}:{entry_title}')[:24]}"
        self._upsert_node(
            conn,
            entry_id,
            entry_type,
            entry_title,
            summary=entry["summary"],
            metadata={"auto_extracted": True, "source_node": node_id, "filename": file_path.name},
            raw=entry,
        )
        self._upsert_edge(conn, node_id, entry_id, "포함함", weight=0.9)

    return node_id
|
|
1535
|
+
|
|
1536
|
+
def index_local_folder(
    self,
    path: Path,
    *,
    include_ocr: bool = False,
    watch_enabled: bool = False,
    user_email: Optional[str] = None,
    consent: Optional[Dict[str, Any]] = None,
    max_files: int = 5_000,
) -> Dict[str, Any]:
    """Read approved files from a local folder and connect them to Graph RAG.

    The folder is registered (or refreshed) as a knowledge source, every
    entry produced by ``_iter_local_scan_entries`` is classified, and each
    indexable file that changed since the last scan is parsed and written
    to the graph. Files whose size and mtime, or whose SHA-256, match the
    stored index row are skipped. Unless the file limit was hit, index rows
    whose file was not seen in this scan are marked deleted.

    Args:
        path: Folder to index; ``~`` is expanded and the path resolved.
        include_ocr: Run OCR on image files.
        watch_enabled: Value stored in the source's watch flag.
        user_email: Recorded as ``approved_by`` in the consent payload.
        consent: Extra consent fields; they override the defaults.
        max_files: Scan limit, clamped to the range 1..50,000.

    Returns:
        A dict with ``status``, ``source``, per-outcome ``counts``, up to
        100 ``indexed_nodes``, up to 50 ``errors`` and a ``notice`` string.
        ``counts["deleted"]`` counts files newly found missing in this scan.

    Raises:
        ValueError: If ``path`` does not exist or is not a directory.
    """
    root = Path(path).expanduser().resolve()
    if not root.exists():
        raise ValueError(f"경로가 존재하지 않습니다: {path}")
    if not root.is_dir():
        raise ValueError(f"폴더가 아닙니다: {path}")

    os_type = _current_os_type()
    drive_id = _drive_id_for_path(root)
    source_id = f"source:{_path_fingerprint(root)}"
    now = _now()
    max_files = max(1, min(int(max_files or 5_000), 50_000))
    consent_payload = {
        "approved_at": now,
        "approved_by": user_email,
        "knowledge_source": True,
        "include_ocr": bool(include_ocr),
        "watch_enabled": bool(watch_enabled),
        "sensitive_files_default_excluded": True,
        **(consent or {}),
    }
    counts: Counter = Counter()
    seen_relative_paths: set = set()
    # Relative paths of entries that exist but could not be read. Index rows
    # at or below them are "not visited", which is different from "deleted".
    unreachable_paths: List[str] = []
    indexed_nodes: List[str] = []
    errors: List[Dict[str, str]] = []
    limit_reached = False

    def is_unreachable(relative: str) -> bool:
        return any(
            base == "." or relative == base or relative.startswith(base + "/")
            for base in unreachable_paths
        )

    with self._connect() as conn:

        def record(target: Path, info: Any, status: str, parser: str, **fields: Any) -> None:
            # Fills in the arguments shared by every index-row write below.
            self._upsert_local_file_index(
                conn,
                source_id=source_id,
                root=root,
                file_path=target,
                stat=info,
                os_type=os_type,
                drive_id=drive_id,
                status=status,
                parser_type=parser,
                **fields,
            )

        # The source stays in "scanning" state until the scan has finished.
        conn.execute(
            """
            INSERT INTO knowledge_sources(
                id, root_path, os_type, drive_id, label, status, include_ocr,
                watch_enabled, consent_json, created_at, updated_at, last_scanned_at
            )
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(id) DO UPDATE SET
                root_path=excluded.root_path,
                os_type=excluded.os_type,
                drive_id=excluded.drive_id,
                label=excluded.label,
                status=excluded.status,
                include_ocr=excluded.include_ocr,
                watch_enabled=excluded.watch_enabled,
                consent_json=excluded.consent_json,
                updated_at=excluded.updated_at,
                last_scanned_at=excluded.last_scanned_at
            """,
            (
                source_id, str(root), os_type, drive_id, root.name or str(root), "scanning",
                1 if include_ocr else 0, 1 if watch_enabled else 0, _json(consent_payload),
                now, now, now,
            ),
        )

        for entry in self._iter_local_scan_entries(root, max_files=max_files):
            kind = entry["kind"]
            file_path = entry["path"]
            if kind == "limit_reached":
                counts["limit_reached"] += 1
                limit_reached = True
                break
            if kind in {"excluded_dir", "excluded"}:
                counts["excluded"] += 1
                continue
            if kind in {"inaccessible_dir", "inaccessible_file"}:
                counts["failed"] += 1
                errors.append({"path": str(file_path), "error": entry.get("reason", "inaccessible")})
                try:
                    unreachable_paths.append(Path(file_path).relative_to(root).as_posix())
                except (TypeError, ValueError):
                    # No usable path relative to root; nothing to protect.
                    pass
                continue
            if kind != "file":
                continue

            stat = entry["stat"]
            try:
                relative_path = file_path.relative_to(root).as_posix()
            except ValueError:
                relative_path = file_path.name
            seen_relative_paths.add(relative_path)
            decision = self._local_file_decision(file_path, root, stat)
            parser_type = decision["parser_type"]
            category = decision["category"]
            if not decision["indexable"]:
                counts[decision["status"]] += 1
                record(
                    file_path, stat, decision["status"], parser_type,
                    metadata={"reason": decision["reason"], "category": category},
                )
                continue

            modified_at = _safe_iso_from_stat_mtime(stat.st_mtime)
            existing = conn.execute(
                """
                SELECT size_bytes, modified_at, sha256, graph_node_id, status
                FROM local_file_index
                WHERE source_id=? AND relative_path=?
                """,
                (source_id, relative_path),
            ).fetchone()

            # Cheap check first: same size and mtime as the indexed row.
            if (
                existing
                and existing["status"] == "indexed"
                and existing["graph_node_id"]
                and existing["size_bytes"] == stat.st_size
                and existing["modified_at"] == modified_at
            ):
                counts["skipped_unchanged"] += 1
                record(
                    file_path, stat, "indexed", parser_type,
                    sha256=existing["sha256"],
                    graph_node_id=existing["graph_node_id"],
                    metadata={"category": category, "unchanged": True},
                )
                continue

            try:
                digest = _sha256_bytes(file_path.read_bytes())
            except Exception as exc:
                counts["failed"] += 1
                errors.append({"path": str(file_path), "error": str(exc)})
                record(
                    file_path, stat, "failed", parser_type,
                    error_message=str(exc),
                    metadata={"category": category},
                )
                continue

            # Metadata changed but content did not: keep the existing node.
            if existing and existing["sha256"] == digest and existing["graph_node_id"]:
                counts["skipped_unchanged"] += 1
                record(
                    file_path, stat, "indexed", parser_type,
                    sha256=digest,
                    graph_node_id=existing["graph_node_id"],
                    metadata={"category": category, "sha256_unchanged": True},
                )
                continue

            try:
                text, parser_meta = self._extract_local_file_text(
                    file_path,
                    category,
                    include_ocr=include_ocr,
                )
                graph_node_id = self._upsert_local_file_node(
                    conn,
                    source_id=source_id,
                    root=root,
                    file_path=file_path,
                    stat=stat,
                    os_type=os_type,
                    drive_id=drive_id,
                    sha256=digest,
                    category=category,
                    parser_type=parser_type,
                    text=text,
                    parser_meta=parser_meta,
                )
                record(
                    file_path, stat, "indexed", parser_type,
                    sha256=digest,
                    graph_node_id=graph_node_id,
                    metadata={"category": category, "parser": parser_meta},
                )
                counts["indexed"] += 1
                indexed_nodes.append(graph_node_id)
            except Exception as exc:
                # One unparsable file must not abort the whole scan.
                counts["failed"] += 1
                errors.append({"path": str(file_path), "error": str(exc)})
                record(
                    file_path, stat, "failed", parser_type,
                    sha256=digest,
                    error_message=str(exc),
                    metadata={"category": category},
                )

        # A truncated scan has not seen every file, so it cannot tell which
        # ones are gone.
        if not limit_reached:
            # Rows already flagged deleted are left out so they are not
            # re-reported as deleted on every rescan.
            live_paths = {
                row["relative_path"]
                for row in conn.execute(
                    "SELECT relative_path FROM local_file_index "
                    "WHERE source_id=? AND COALESCE(deleted, 0)=0",
                    (source_id,),
                )
            }
            deleted_paths = {
                candidate
                for candidate in live_paths - seen_relative_paths
                if not is_unreachable(candidate)
            }
            deleted_at = _now()
            for relative_path in deleted_paths:
                conn.execute(
                    """
                    UPDATE local_file_index
                    SET status='deleted', deleted=1, last_scanned_at=?, error_message=NULL
                    WHERE source_id=? AND relative_path=?
                    """,
                    (deleted_at, source_id, relative_path),
                )
            counts["deleted"] = len(deleted_paths)

        finished_at = _now()  # one timestamp for both columns
        conn.execute(
            """
            UPDATE knowledge_sources
            SET status='active', updated_at=?, last_scanned_at=?
            WHERE id=?
            """,
            (finished_at, finished_at, source_id),
        )

    return {
        "status": "ok",
        "source": {
            "id": source_id,
            "root_path": str(root),
            "os_type": os_type,
            "drive_id": drive_id,
            "include_ocr": bool(include_ocr),
            "watch_enabled": bool(watch_enabled),
        },
        "counts": dict(counts),
        "indexed_nodes": indexed_nodes[:100],
        "errors": errors[:50],
        "notice": "Lattice AI는 사용자가 선택한 폴더만 AI 지식으로 변환합니다.",
    }
|
|
1810
|
+
|
|
505
1811
|
def ingest_message(
|
|
506
1812
|
self,
|
|
507
1813
|
role: str,
|
|
@@ -961,8 +2267,17 @@ class KnowledgeGraphStore:
|
|
|
961
2267
|
# ── 그래프에 표시되는 노드 타입 (점 = 명사) ──────────────────────────────
|
|
962
2268
|
# Message / AIResponse / Chunk 는 RAG 검색용으로만 저장, 그래프에서 숨김.
|
|
963
2269
|
_GRAPH_VISIBLE_TYPES = (
|
|
2270
|
+
"Computer", # 내 컴퓨터
|
|
2271
|
+
"Drive", # 드라이브 / 볼륨
|
|
2272
|
+
"Folder", # 폴더
|
|
2273
|
+
"File", # 일반 파일
|
|
964
2274
|
"Chat", # 대화 세션
|
|
965
2275
|
"Document", # 파일 (PDF·PPT·Word·Excel·이미지)
|
|
2276
|
+
"CodeFile", # 코드 파일
|
|
2277
|
+
"Spreadsheet",# 엑셀/CSV
|
|
2278
|
+
"SlideDeck", # 프레젠테이션
|
|
2279
|
+
"Image", # 이미지
|
|
2280
|
+
"ImageText", # OCR 텍스트
|
|
966
2281
|
"Concept", # 개념 / 아이디어 / 기술 용어
|
|
967
2282
|
"Person", # 사람
|
|
968
2283
|
"Error", # 오류 / 버그
|
|
@@ -1133,7 +2448,10 @@ class KnowledgeGraphStore:
|
|
|
1133
2448
|
def score(row: sqlite3.Row) -> tuple:
|
|
1134
2449
|
haystack = f"{row['title']} {row['summary']} {row['metadata_json']}".lower()
|
|
1135
2450
|
hits = sum(1 for term in terms_for_score if term.lower() in haystack)
|
|
1136
|
-
type_boost = 1 if row["type"] in {
|
|
2451
|
+
type_boost = 1 if row["type"] in {
|
|
2452
|
+
"Decision", "Task", "File", "Document", "CodeFile",
|
|
2453
|
+
"Spreadsheet", "SlideDeck", "Image", "ImageText", "Page", "Slide",
|
|
2454
|
+
} else 0
|
|
1137
2455
|
return (hits, type_boost, row["updated_at"] or "")
|
|
1138
2456
|
|
|
1139
2457
|
rows = sorted(rows, key=score, reverse=True)[:limit]
|
|
@@ -1192,7 +2510,13 @@ class KnowledgeGraphStore:
|
|
|
1192
2510
|
lines = []
|
|
1193
2511
|
for match in matches[:limit]:
|
|
1194
2512
|
meta = match.get("metadata") or {}
|
|
1195
|
-
source =
|
|
2513
|
+
source = (
|
|
2514
|
+
meta.get("relative_path")
|
|
2515
|
+
or meta.get("filename")
|
|
2516
|
+
or meta.get("conversation_id")
|
|
2517
|
+
or meta.get("source")
|
|
2518
|
+
or match["id"]
|
|
2519
|
+
)
|
|
1196
2520
|
summary = _clean_text(match.get("summary") or "")[:700]
|
|
1197
2521
|
lines.append(f"- [{match['type']}] {match['title']} | source={source} | {summary}")
|
|
1198
2522
|
return "\n".join(lines)
|
|
@@ -1271,7 +2595,11 @@ class KnowledgeGraphStore:
|
|
|
1271
2595
|
"nodes": conn.execute("SELECT COUNT(*) AS c FROM nodes").fetchone()["c"],
|
|
1272
2596
|
"edges": conn.execute("SELECT COUNT(*) AS c FROM edges").fetchone()["c"],
|
|
1273
2597
|
"chunks": conn.execute("SELECT COUNT(*) AS c FROM chunks").fetchone()["c"],
|
|
2598
|
+
"knowledge_sources": conn.execute("SELECT COUNT(*) AS c FROM knowledge_sources").fetchone()["c"],
|
|
2599
|
+
"local_file_index": conn.execute("SELECT COUNT(*) AS c FROM local_file_index").fetchone()["c"],
|
|
1274
2600
|
}
|
|
2601
|
+
conn.execute("DELETE FROM local_file_index")
|
|
2602
|
+
conn.execute("DELETE FROM knowledge_sources")
|
|
1275
2603
|
conn.execute("DELETE FROM chunks")
|
|
1276
2604
|
conn.execute("DELETE FROM edges")
|
|
1277
2605
|
conn.execute("DELETE FROM nodes")
|
|
@@ -1290,6 +2618,11 @@ class KnowledgeGraphStore:
|
|
|
1290
2618
|
row["type"]: row["count"]
|
|
1291
2619
|
for row in conn.execute("SELECT type, COUNT(*) AS count FROM edges GROUP BY type")
|
|
1292
2620
|
}
|
|
2621
|
+
local_sources = conn.execute("SELECT COUNT(*) AS c FROM knowledge_sources").fetchone()["c"]
|
|
2622
|
+
local_file_status = {
|
|
2623
|
+
row["status"]: row["count"]
|
|
2624
|
+
for row in conn.execute("SELECT status, COUNT(*) AS count FROM local_file_index GROUP BY status")
|
|
2625
|
+
}
|
|
1293
2626
|
v2 = None
|
|
1294
2627
|
if KGStoreV2 is not None:
|
|
1295
2628
|
try:
|
|
@@ -1302,5 +2635,7 @@ class KnowledgeGraphStore:
|
|
|
1302
2635
|
"v2_schema_available": KGStoreV2 is not None,
|
|
1303
2636
|
"nodes": node_counts,
|
|
1304
2637
|
"edges": edge_counts,
|
|
2638
|
+
"local_sources": local_sources,
|
|
2639
|
+
"local_file_status": local_file_status,
|
|
1305
2640
|
"v2": v2,
|
|
1306
2641
|
}
|