ltcai 0.1.30 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -10,13 +10,16 @@ import hashlib
10
10
  import json
11
11
  import logging
12
12
  import math
13
+ import os
14
+ import platform
13
15
  import re
14
16
  import shutil
15
17
  import sqlite3
16
18
  import zipfile
19
+ from collections import Counter
17
20
  from datetime import datetime
18
21
  from pathlib import Path
19
- from typing import Any, Dict, List, Optional
22
+ from typing import Any, Dict, Iterable, List, Optional, Tuple
20
23
 
21
24
  try:
22
25
  from kg_schema import KGStoreV2
@@ -26,6 +29,76 @@ except Exception: # pragma: no cover - v2 schema is optional at import time
26
29
 
27
30
  GRAPH_SCHEMA_VERSION = 1
28
31
 
32
# File extensions recognised by the local-folder scanner, grouped by the
# category that _file_category() assigns to them.
LOCAL_TEXT_EXTENSIONS = {".txt", ".md"}
LOCAL_CODE_EXTENSIONS = {
    ".py", ".js", ".ts", ".tsx", ".jsx", ".html", ".css", ".json",
    ".yaml", ".yml", ".xml", ".sql", ".sh", ".zsh", ".toml", ".ini",
}
# NOTE: ".pdf" is listed here, but _file_category() tests for ".pdf" first and
# reports it as its own "pdf" category.
LOCAL_DOCUMENT_EXTENSIONS = {".pdf", ".docx"}
LOCAL_SPREADSHEET_EXTENSIONS = {".xlsx", ".csv"}
LOCAL_SLIDE_EXTENSIONS = {".pptx"}
LOCAL_IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".webp"}
# Union of every category set above.
LOCAL_SUPPORTED_EXTENSIONS = (
    LOCAL_TEXT_EXTENSIONS
    | LOCAL_CODE_EXTENSIONS
    | LOCAL_DOCUMENT_EXTENSIONS
    | LOCAL_SPREADSHEET_EXTENSIONS
    | LOCAL_SLIDE_EXTENSIONS
    | LOCAL_IMAGE_EXTENSIONS
)

# Per-category size ceiling, compared against stat.st_size (bytes) in
# _local_file_decision(). _size_limit_for_category() falls back to the
# "document" entry for categories that are not listed.
LOCAL_SIZE_LIMITS = {
    "text": 4_000_000,
    "code": 4_000_000,
    "pdf": 50_000_000,
    "document": 50_000_000,
    "spreadsheet": 50_000_000,
    "slide_deck": 50_000_000,
    "image": 100_000_000,
}

# Directory names (matched against the lower-cased final path component) that
# _excluded_directory_reason() reports as "excluded_folder".
COMMON_EXCLUDED_DIRS = {
    ".git", "node_modules", ".venv", "venv", "env", "__pycache__",
    ".pytest_cache", ".mypy_cache", ".ruff_cache", ".next", ".nuxt",
    ".turbo", "dist", "build", "target", "out", "coverage", ".cache",
    ".config", ".ssh", ".gnupg", ".docker", ".kube", ".aws", ".azure",
    ".npm", ".pnpm-store", ".yarn", ".bun", ".cargo", ".rustup", ".pyenv",
    ".conda", ".local", ".claude", ".codex", ".cursor", ".copilot",
    ".antigravity", ".antigravity-ide",
}

# Lower-cased file names and suffixes that _sensitive_file_reason() rejects
# with "sensitive_or_excluded_file". Entries must stay lower-case because the
# lookup lower-cases the candidate first.
COMMON_EXCLUDED_FILE_NAMES = {
    ".env", ".env.local", ".env.production", ".env.development",
    "id_rsa", "id_ed25519", "authorized_keys", "known_hosts",
    "credentials.json", "service-account.json", "token.json", "secrets.json",
    "cookies", "login data", "history", "web data", ".ds_store", "thumbs.db",
}
COMMON_EXCLUDED_FILE_SUFFIXES = {
    ".pem", ".key", ".p12", ".pfx", ".kdbx", ".wallet", ".sqlite", ".db",
    ".exe", ".dll", ".sys", ".msi", ".dmg", ".pkg", ".app", ".zip", ".tar",
    ".gz", ".7z", ".rar", ".mp4", ".mov", ".mp3", ".wav", ".tmp", ".bak",
    ".lock",
}
# Keywords compared against the tokens of the lower-cased (root-relative) path
# in _sensitive_file_reason(); a match yields "sensitive_name".
SENSITIVE_PATH_KEYWORDS = {
    "secret", "secrets", "token", "password", "passwd", "credential",
    "credentials", "private", "key", "wallet", "recovery", "seed",
    "mnemonic", "cookie", "session", "auth", "oauth", "certificate",
    "cert", "api_key", "apikey",
}

# OS-specific exclusions used by _excluded_directory_reason(). The macOS and
# Linux tuples are POSIX path prefixes; a directory under one of them is
# reported as "system_folder" unless the scan root itself lies under the same
# prefix. The Windows set is matched against every lower-cased path component.
MACOS_EXCLUDED_PREFIXES = (
    "/System", "/Library", "/Applications", "/private", "/tmp", "/var",
)
WINDOWS_EXCLUDED_NAMES = {
    "windows", "program files", "program files (x86)", "programdata", "appdata",
    "$recycle.bin", "system volume information", "recovery", "perflogs",
    "intel", "amd", "nvidia",
}
LINUX_EXCLUDED_PREFIXES = (
    "/bin", "/boot", "/dev", "/etc", "/lib", "/lib64", "/proc", "/root",
    "/run", "/sbin", "/sys", "/tmp", "/usr", "/var", "/snap", "/lost+found",
)
101
+
29
102
 
30
103
  def _now() -> str:
31
104
  return datetime.now().isoformat()
@@ -80,6 +153,199 @@ def _sha256_text(text: str) -> str:
80
153
  return hashlib.sha256(text.encode("utf-8", errors="replace")).hexdigest()
81
154
 
82
155
 
156
+ def _safe_iso_from_stat_mtime(mtime: float) -> str:
157
+ try:
158
+ return datetime.fromtimestamp(float(mtime)).isoformat()
159
+ except (TypeError, ValueError, OSError):
160
+ return ""
161
+
162
+
163
def _path_fingerprint(path: Path) -> str:
    """Return the first 24 hex characters of the SHA-256 of the resolved path."""
    absolute = path.expanduser().resolve()
    digest = _sha256_text(str(absolute))
    return digest[:24]
165
+
166
+
167
+ def _is_relative_to(path: Path, base: Path) -> bool:
168
+ try:
169
+ path.relative_to(base)
170
+ return True
171
+ except ValueError:
172
+ return False
173
+
174
+
175
+ def _path_parts_lower(path: Path) -> List[str]:
176
+ return [part.lower() for part in path.parts if part and part not in {os.sep, path.anchor}]
177
+
178
+
179
+ def _current_os_type() -> str:
180
+ system = platform.system().lower()
181
+ if system.startswith("darwin"):
182
+ return "macos"
183
+ if system.startswith("windows"):
184
+ return "windows"
185
+ if system.startswith("linux"):
186
+ return "linux"
187
+ return system or "unknown"
188
+
189
+
190
+ def _drive_id_for_path(path: Path) -> str:
191
+ resolved = path.expanduser().resolve()
192
+ if resolved.drive:
193
+ return resolved.drive.upper()
194
+ parts = resolved.parts
195
+ if len(parts) >= 3 and parts[1] == "Volumes":
196
+ return f"/Volumes/{parts[2]}"
197
+ if len(parts) >= 3 and parts[1] == "media":
198
+ return f"/media/{parts[2]}"
199
+ if len(parts) >= 3 and parts[1] == "mnt":
200
+ return f"/mnt/{parts[2]}"
201
+ return resolved.anchor or "/"
202
+
203
+
204
def _file_category(ext: str) -> str:
    """Classify a file extension into one of the scanner's categories.

    Returns one of ``"code"``, ``"text"``, ``"pdf"``, ``"document"``,
    ``"spreadsheet"``, ``"slide_deck"``, ``"image"`` or ``"unsupported"``.
    """
    suffix = (ext or "").lower()
    # Order matters: ".pdf" is also a member of LOCAL_DOCUMENT_EXTENSIONS, so
    # the dedicated "pdf" bucket has to be tested before "document".
    buckets = (
        ("code", LOCAL_CODE_EXTENSIONS),
        ("text", LOCAL_TEXT_EXTENSIONS),
        ("pdf", {".pdf"}),
        ("document", LOCAL_DOCUMENT_EXTENSIONS),
        ("spreadsheet", LOCAL_SPREADSHEET_EXTENSIONS),
        ("slide_deck", LOCAL_SLIDE_EXTENSIONS),
        ("image", LOCAL_IMAGE_EXTENSIONS),
    )
    for label, members in buckets:
        if suffix in members:
            return label
    return "unsupported"
221
+
222
+
223
+ def _node_type_for_category(category: str) -> str:
224
+ return {
225
+ "code": "CodeFile",
226
+ "spreadsheet": "Spreadsheet",
227
+ "slide_deck": "SlideDeck",
228
+ "image": "Image",
229
+ "unsupported": "File",
230
+ }.get(category, "Document")
231
+
232
+
233
+ def _parser_type_for_category(category: str, ext: str) -> str:
234
+ if category in {"text", "code"}:
235
+ return "plain_text"
236
+ if category == "spreadsheet" and ext == ".csv":
237
+ return "csv_text"
238
+ if category == "image":
239
+ return "image_ocr"
240
+ return ext.lstrip(".") or category
241
+
242
+
243
def _size_limit_for_category(category: str) -> int:
    """Return the size limit for ``category``, defaulting to the document limit."""
    fallback = LOCAL_SIZE_LIMITS["document"]
    return LOCAL_SIZE_LIMITS.get(category, fallback)
245
+
246
+
247
+ def _is_hidden_path(path: Path, root: Optional[Path] = None) -> bool:
248
+ parts: Iterable[str]
249
+ if root is not None:
250
+ try:
251
+ parts = path.relative_to(root).parts
252
+ except ValueError:
253
+ parts = path.parts
254
+ else:
255
+ parts = path.parts
256
+ return any(part.startswith(".") and part not in {".", ".."} for part in parts)
257
+
258
+
259
def _excluded_directory_reason(path: Path, *, root: Optional[Path] = None, os_type: Optional[str] = None) -> Optional[str]:
    """Explain why a directory must not be scanned, or return ``None``.

    Checks run in this order: well-known excluded names, hidden components
    (relative to ``root``), Windows system folder names, the macOS user
    ``Library`` folder, and finally macOS/Linux system path prefixes.

    Args:
        path: Directory being considered.
        root: Scan root; a prefix or ``Library`` rule is waived when the root
            itself lies inside the protected location.
        os_type: ``"macos"``, ``"windows"`` or ``"linux"``; detected when omitted.

    Returns:
        ``"excluded_folder"``, ``"hidden_folder"``, ``"system_folder"``,
        ``"user_library"`` or ``None`` when the directory may be scanned.
    """
    platform_name = os_type or _current_os_type()

    if path.name.lower() in COMMON_EXCLUDED_DIRS:
        return "excluded_folder"
    if _is_hidden_path(path, root):
        return "hidden_folder"

    if platform_name == "windows":
        lowered_parts = _path_parts_lower(path)
        if any(part in WINDOWS_EXCLUDED_NAMES for part in lowered_parts):
            return "system_folder"

    path_text = path.as_posix()
    root_text = root.as_posix() if root else ""

    def _is_under(text: str, prefix: str) -> bool:
        # True when text is the prefix itself or a path below it.
        return text == prefix or text.startswith(f"{prefix}/")

    def _prefix_blocks(prefixes: Tuple[str, ...]) -> bool:
        # A prefix blocks the path unless the scan root is under it as well.
        return any(
            _is_under(path_text, prefix)
            and not (bool(root_text) and _is_under(root_text, prefix))
            for prefix in prefixes
        )

    if platform_name == "macos":
        home_library = Path.home() / "Library"
        try:
            root_in_library = False
            if root:
                root_in_library = _is_relative_to(
                    root.expanduser().resolve(),
                    home_library.expanduser().resolve(),
                )
            path_in_library = _is_relative_to(
                path.expanduser().resolve(),
                home_library.expanduser().resolve(),
            )
            if path_in_library and not root_in_library:
                return "user_library"
        except OSError:
            # Resolution failed; fall through to the prefix rules.
            pass
        if _prefix_blocks(MACOS_EXCLUDED_PREFIXES):
            return "system_folder"
    elif platform_name == "linux":
        if _prefix_blocks(LINUX_EXCLUDED_PREFIXES):
            return "system_folder"
    return None
296
+
297
+
298
def _sensitive_file_reason(path: Path, *, root: Optional[Path] = None) -> Optional[str]:
    """Explain why a file is treated as sensitive, or return ``None``.

    Args:
        path: File being considered.
        root: Scan root. When given and ``path`` lies under it, only the
            root-relative part of the path is searched for keywords.

    Returns:
        ``"sensitive_or_excluded_file"`` when the lower-cased file name or
        suffix is on an exclusion list, ``"sensitive_name"`` when a path token
        matches a sensitive keyword, otherwise ``None``.
    """
    name = path.name.lower()
    suffix = path.suffix.lower()
    if name in COMMON_EXCLUDED_FILE_NAMES or suffix in COMMON_EXCLUDED_FILE_SUFFIXES:
        return "sensitive_or_excluded_file"
    try:
        rel_text = path.relative_to(root).as_posix().lower() if root else path.as_posix().lower()
    except ValueError:
        rel_text = path.as_posix().lower()
    for token in re.split(r"[^0-9a-zA-Z_가-힣]+", rel_text):
        # Whole-token match first so compound keywords such as "api_key" hit.
        if token in SENSITIVE_PATH_KEYWORDS:
            return "sensitive_name"
        # The tokenizer keeps "_" inside tokens, so "db_password" would
        # otherwise slip through while "db-password" is blocked; check the
        # underscore-separated pieces as well.
        if "_" in token and any(piece in SENSITIVE_PATH_KEYWORDS for piece in token.split("_")):
            return "sensitive_name"
    return None
311
+
312
+
313
+ def _root_warning(path: Path, os_type: str) -> Optional[str]:
314
+ resolved = path.expanduser().resolve()
315
+ home = Path.home().expanduser().resolve()
316
+ if os_type == "macos" and resolved == home:
317
+ return "홈 전체에는 설정/숨김 폴더가 포함될 수 있습니다. 문서, 데스크탑, 다운로드, 프로젝트 폴더부터 추가하는 것을 권장합니다."
318
+ if os_type == "linux" and resolved.as_posix() == "/":
319
+ return "루트 디렉터리에는 시스템 파일이 포함되어 있습니다. 일반 사용자 폴더나 마운트된 데이터 폴더를 권장합니다."
320
+ if os_type == "windows" and str(resolved).rstrip("\\/").upper() in {"C:", "C:\\"}:
321
+ return "C드라이브에는 Windows 시스템 파일과 앱 설정 파일이 포함되어 있습니다. 하위 폴더를 선택하는 것을 권장합니다."
322
+ return None
323
+
324
+
325
def _sample_file(path: Path, root: Path, status: str, reason: str = "") -> Dict[str, Any]:
    """Build the metadata record shown for one sampled path in an audit.

    Only ``stat`` metadata is consulted. ``size_bytes`` is ``None`` for
    anything that is not a regular file, and both ``size_bytes`` and
    ``modified_at`` are blanked when the path cannot be stat'ed.
    """
    try:
        relative = path.relative_to(root).as_posix()
    except ValueError:
        relative = path.name

    size_bytes: Optional[int] = None
    modified = ""
    try:
        info = path.stat()
        if path.is_file():
            size_bytes = info.st_size
        modified = _safe_iso_from_stat_mtime(info.st_mtime)
    except OSError:
        size_bytes = None
        modified = ""

    record: Dict[str, Any] = {
        "path": str(path),
        "relative_path": relative,
        "name": path.name,
        "extension": path.suffix.lower(),
        "status": status,
        "reason": reason,
        "size_bytes": size_bytes,
        "modified_at": modified,
    }
    return record
347
+
348
+
83
349
  def _clean_text(text: str) -> str:
84
350
  return re.sub(r"\s+", " ", str(text or "")).strip()
85
351
 
@@ -380,6 +646,22 @@ def _semantic_items(text: str) -> List[Dict[str, str]]:
380
646
  return items[:8]
381
647
 
382
648
 
649
def _topic_candidates(text: str, limit: int = 8) -> List[str]:
    """Pick up to ``limit`` keyword candidates from ``text`` for fallback search.

    Concepts from ``_extract_concepts`` are preferred. When it returns nothing,
    the text is scanned for Latin identifiers (3+ characters) and Hangul runs
    (2-12 characters), skipping stop words and de-duplicating
    case-insensitively while keeping the first spelling seen.
    """
    concepts = _extract_concepts(text, limit=limit)
    if concepts:
        return concepts[:limit]

    unique: Dict[str, str] = {}
    pattern = r"[A-Za-z][A-Za-z0-9_.:-]{2,}|[가-힣]{2,12}"
    for match in re.finditer(pattern, str(text or "")):
        word = match.group(0)
        folded = word.lower()
        if folded in _CONCEPT_STOP or folded.isdigit():
            continue
        if folded not in unique:
            unique[folded] = word
        if len(unique) >= limit:
            break
    return list(unique.values())[:limit]
663
+
664
+
383
665
  class KnowledgeGraphStore:
384
666
  def __init__(self, db_path: Path, blob_dir: Path):
385
667
  self.db_path = Path(db_path)
@@ -433,10 +715,52 @@ class KnowledgeGraphStore:
433
715
  created_at TEXT NOT NULL,
434
716
  FOREIGN KEY(source_node) REFERENCES nodes(id) ON DELETE CASCADE
435
717
  );
718
+ CREATE TABLE IF NOT EXISTS knowledge_sources (
719
+ id TEXT PRIMARY KEY,
720
+ root_path TEXT NOT NULL UNIQUE,
721
+ os_type TEXT NOT NULL,
722
+ drive_id TEXT,
723
+ label TEXT,
724
+ status TEXT NOT NULL,
725
+ include_ocr INTEGER NOT NULL DEFAULT 0,
726
+ watch_enabled INTEGER NOT NULL DEFAULT 0,
727
+ consent_json TEXT NOT NULL CHECK (json_valid(consent_json)),
728
+ created_at TEXT NOT NULL,
729
+ updated_at TEXT NOT NULL,
730
+ last_scanned_at TEXT
731
+ );
732
+ CREATE TABLE IF NOT EXISTS local_file_index (
733
+ id TEXT PRIMARY KEY,
734
+ source_id TEXT NOT NULL,
735
+ os_type TEXT NOT NULL,
736
+ drive_id TEXT,
737
+ root_path TEXT NOT NULL,
738
+ file_path TEXT NOT NULL,
739
+ relative_path TEXT NOT NULL,
740
+ file_name TEXT NOT NULL,
741
+ extension TEXT NOT NULL,
742
+ size_bytes INTEGER,
743
+ modified_at TEXT,
744
+ sha256 TEXT,
745
+ last_scanned_at TEXT,
746
+ last_indexed_at TEXT,
747
+ parser_type TEXT,
748
+ status TEXT NOT NULL,
749
+ error_message TEXT,
750
+ graph_node_id TEXT,
751
+ deleted INTEGER NOT NULL DEFAULT 0,
752
+ metadata_json TEXT NOT NULL CHECK (json_valid(metadata_json)),
753
+ UNIQUE(source_id, relative_path),
754
+ FOREIGN KEY(source_id) REFERENCES knowledge_sources(id) ON DELETE CASCADE
755
+ );
436
756
  CREATE INDEX IF NOT EXISTS idx_nodes_type ON nodes(type);
437
757
  CREATE INDEX IF NOT EXISTS idx_edges_from ON edges(from_node);
438
758
  CREATE INDEX IF NOT EXISTS idx_edges_to ON edges(to_node);
439
759
  CREATE INDEX IF NOT EXISTS idx_chunks_source ON chunks(source_node);
760
+ CREATE INDEX IF NOT EXISTS idx_knowledge_sources_root ON knowledge_sources(root_path);
761
+ CREATE INDEX IF NOT EXISTS idx_local_file_index_source ON local_file_index(source_id);
762
+ CREATE INDEX IF NOT EXISTS idx_local_file_index_status ON local_file_index(status);
763
+ CREATE INDEX IF NOT EXISTS idx_local_file_index_graph_node ON local_file_index(graph_node_id);
440
764
  """
441
765
  )
442
766
  conn.execute(
@@ -502,6 +826,988 @@ class KnowledgeGraphStore:
502
826
  )
503
827
  return edge_id
504
828
 
829
+ # ── Local folder sources → Graph RAG ──────────────────────────────────
830
+
831
def discover_local_roots(self) -> Dict[str, Any]:
    """List existing folders and drives a user could start browsing from.

    Always offers the home directory and a handful of standard sub-folders,
    then adds platform-specific locations: ``/Volumes`` entries on macOS,
    drive letters and OneDrive folders on Windows, and ``/mnt`` / ``/media``
    mounts on Linux. Paths that do not exist or were already listed are
    skipped. Only path existence is checked; nothing is opened.
    """
    os_type = _current_os_type()
    home = Path.home().expanduser()
    roots: List[Dict[str, Any]] = []
    seen: set = set()

    def register(label: str, candidate: Path, kind: str, *, recommended: bool = True, warning: Optional[str] = None) -> None:
        # Resolve when possible so the same location is not listed twice.
        expanded = candidate.expanduser()
        try:
            resolved = expanded.resolve()
        except OSError:
            resolved = expanded
        key = str(resolved)
        if key in seen:
            return
        if not resolved.exists():
            return
        seen.add(key)
        entry = {
            "id": f"{kind}:{_path_fingerprint(resolved)}",
            "label": label,
            "path": key,
            "kind": kind,
            "recommended": recommended,
            "warning": warning or _root_warning(resolved, os_type),
        }
        roots.append(entry)

    register("홈", home, "home", warning=_root_warning(home, os_type))
    standard_folders = (
        ("Documents", "문서"),
        ("Desktop", "데스크탑"),
        ("Downloads", "다운로드"),
        ("Pictures", "사진"),
        ("Projects", "프로젝트"),
    )
    for folder_name, folder_label in standard_folders:
        register(folder_label, home / folder_name, folder_name.lower())

    if os_type == "macos":
        volumes = Path("/Volumes")
        if volumes.exists():
            try:
                mounted_volumes = sorted(volumes.iterdir(), key=lambda p: p.name.lower())
                for volume in mounted_volumes:
                    register(volume.name, volume, "volume", recommended=False)
            except OSError:
                pass
    elif os_type == "windows":
        for letter in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
            drive = Path(f"{letter}:\\")
            if not drive.exists():
                continue
            # The system drive is listed but not recommended as a scan root.
            register(f"{letter}: 드라이브", drive, "drive", recommended=(letter != "C"))
        for env_name, cloud_label in (("OneDrive", "OneDrive"), ("OneDriveCommercial", "OneDrive")):
            raw = os.environ.get(env_name)
            if raw:
                register(cloud_label, Path(raw), "cloud", recommended=False)
    elif os_type == "linux":
        for base in (Path("/mnt"), Path("/media")):
            register(str(base), base, "mounts", recommended=False)
            try:
                if base.exists():
                    for mounted in sorted(base.iterdir(), key=lambda p: p.name.lower()):
                        register(mounted.name, mounted, "volume", recommended=False)
            except OSError:
                pass

    return {
        "os_type": os_type,
        "computer": platform.node() or "local",
        "roots": roots,
        "privacy_notice": "처음에는 드라이브와 폴더 구조만 확인하며, 파일 내용은 사용자가 동의한 뒤에만 읽습니다.",
    }
899
+
900
def preview_local_tree(self, path: Path, *, max_items: int = 200) -> Dict[str, Any]:
    """List one folder level using metadata only; file contents are not read.

    Args:
        path: Folder to list.
        max_items: Maximum number of children to report; clamped to 1..1000.

    Returns:
        A dict with ``path``, ``os_type``, ``items``, ``truncated``,
        ``inaccessible``, ``warning`` and ``privacy_notice``. When the folder
        itself cannot be listed the same keys are returned with an empty
        ``items`` list plus an ``error`` message. Every item carries the same
        keys whether or not it was accessible.

    Raises:
        ValueError: If ``path`` does not exist or is not a directory.
    """
    root = Path(path).expanduser().resolve()
    if not root.exists():
        raise ValueError(f"경로가 존재하지 않습니다: {path}")
    if not root.is_dir():
        raise ValueError(f"폴더가 아닙니다: {path}")

    os_type = _current_os_type()
    max_items = max(1, min(int(max_items or 200), 1000))
    privacy_notice = "현재 단계에서는 파일 내용을 읽지 않고, 폴더와 파일의 이름/크기/수정일만 확인합니다."
    items: List[Dict[str, Any]] = []
    inaccessible = 0

    def blocked_item(child: Path, reason: str) -> Dict[str, Any]:
        # Same keys as an accessible item so consumers need no special case.
        return {
            "name": child.name,
            "path": str(child),
            "type": "unknown",
            "extension": "",
            "size_bytes": None,
            "modified_at": "",
            "hidden": _is_hidden_path(child, root),
            "accessible": False,
            "excluded_reason": reason,
        }

    try:
        # Directories first, then case-insensitive by name.
        children = sorted(root.iterdir(), key=lambda p: (not p.is_dir(), p.name.lower()))
    except PermissionError as exc:
        # Mirror the success payload so callers can read the same keys.
        return {
            "path": str(root),
            "os_type": os_type,
            "items": [],
            "truncated": False,
            "inaccessible": 0,
            "warning": _root_warning(root, os_type),
            "error": f"접근 권한 없음: {exc}",
            "privacy_notice": privacy_notice,
        }

    for child in children[:max_items]:
        try:
            is_dir = child.is_dir()
            stat = child.stat()
            if is_dir:
                reason = _excluded_directory_reason(child, root=root, os_type=os_type)
            else:
                reason = _sensitive_file_reason(child, root=root)
            items.append({
                "name": child.name,
                "path": str(child),
                "type": "directory" if is_dir else "file",
                "extension": "" if is_dir else child.suffix.lower(),
                "size_bytes": None if is_dir else stat.st_size,
                "modified_at": _safe_iso_from_stat_mtime(stat.st_mtime),
                "hidden": _is_hidden_path(child, root),
                "accessible": True,
                "excluded_reason": reason,
            })
        except OSError as exc:
            inaccessible += 1
            failure = "permission_denied" if isinstance(exc, PermissionError) else str(exc)
            items.append(blocked_item(child, failure))

    return {
        "path": str(root),
        "os_type": os_type,
        "items": items,
        "truncated": len(children) > max_items,
        "inaccessible": inaccessible,
        "warning": _root_warning(root, os_type),
        "privacy_notice": privacy_notice,
    }
966
+
967
def _iter_local_scan_entries(self, root: Path, *, max_files: int) -> Iterable[Dict[str, Any]]:
    """Walk ``root`` depth-first and yield one classification dict per entry.

    Each dict has ``kind``, ``path`` and, except for files, ``reason``.
    ``kind`` is one of:

    * ``"file"`` - a regular file; the dict also carries its ``stat``.
    * ``"excluded"`` - a symlink or a non-regular file.
    * ``"excluded_dir"`` - a directory rejected by ``_excluded_directory_reason``.
    * ``"inaccessible_dir"`` / ``"inaccessible_file"`` - an ``OSError`` occurred
      while listing or inspecting the entry.
    * ``"limit_reached"`` - more than ``max_files`` files were seen; iteration
      stops after this entry.

    Symlinks are never followed and excluded directories are not descended
    into.
    """
    os_type = _current_os_type()
    stack = [root]
    files_seen = 0
    while stack:
        current = stack.pop()
        try:
            # Directories first, then case-insensitive by name.
            children = sorted(current.iterdir(), key=lambda p: (not p.is_dir(), p.name.lower()))
        except PermissionError as exc:
            yield {"kind": "inaccessible_dir", "path": current, "reason": f"permission_denied: {exc}"}
            continue
        except OSError as exc:
            yield {"kind": "inaccessible_dir", "path": current, "reason": str(exc)}
            continue

        for child in children:
            entry: Optional[Dict[str, Any]]
            try:
                # is_symlink() performs an lstat() and re-raises errors such as
                # EACCES, so it is guarded too; one unreadable entry must not
                # abort the whole scan.
                if child.is_symlink():
                    entry = {"kind": "excluded", "path": child, "reason": "symlink"}
                elif child.is_dir():
                    reason = _excluded_directory_reason(child, root=root, os_type=os_type)
                    if reason:
                        entry = {"kind": "excluded_dir", "path": child, "reason": reason}
                    else:
                        stack.append(child)
                        entry = None
                elif not child.is_file():
                    entry = {"kind": "excluded", "path": child, "reason": "not_regular_file"}
                else:
                    entry = {"kind": "file", "path": child, "stat": child.stat()}
            except PermissionError as exc:
                entry = {"kind": "inaccessible_file", "path": child, "reason": f"permission_denied: {exc}"}
            except OSError as exc:
                entry = {"kind": "inaccessible_file", "path": child, "reason": str(exc)}

            if entry is None:
                continue
            if entry["kind"] == "file":
                files_seen += 1
                if files_seen > max_files:
                    yield {"kind": "limit_reached", "path": child, "reason": "max_files"}
                    return
            yield entry
1010
+
1011
def _local_file_decision(self, path: Path, root: Path, stat: os.stat_result) -> Dict[str, Any]:
    """Decide from metadata alone whether a file may be indexed.

    Returns a dict with ``status`` (``"sensitive_blocked"``, ``"unsupported"``,
    ``"too_large"`` or ``"pending"``), ``reason``, ``category``,
    ``parser_type`` and ``indexable``. The sensitivity check takes precedence
    over the extension check, which takes precedence over the size check.
    """
    ext = path.suffix.lower()
    category = _file_category(ext)
    parser_type = _parser_type_for_category(category, ext)

    def verdict(status: str, reason: str, indexable: bool) -> Dict[str, Any]:
        # All outcomes share the same shape; only these three fields vary.
        return {
            "status": status,
            "reason": reason,
            "category": category,
            "parser_type": parser_type,
            "indexable": indexable,
        }

    sensitive_reason = _sensitive_file_reason(path, root=root)
    if sensitive_reason:
        return verdict("sensitive_blocked", sensitive_reason, False)
    if category == "unsupported":
        return verdict("unsupported", "unsupported_extension", False)
    limit = _size_limit_for_category(category)
    if stat.st_size > limit:
        return verdict("too_large", f"size>{limit}", False)
    return verdict("pending", "", True)
1048
+
1049
def audit_local_folder(self, path: Path, *, include_ocr: bool = False, max_files: int = 50_000) -> Dict[str, Any]:
    """Dry-run a folder scan from metadata only and summarise the outcome.

    Walks the folder with ``_iter_local_scan_entries``, classifies every file
    with ``_local_file_decision`` and reports counts, up to 25 samples each of
    allowed and excluded paths, and a rough processing-time estimate.

    Args:
        path: Folder to audit.
        include_ocr: Whether image OCR is requested; only affects the time
            estimate and the echoed ``include_ocr_requested`` flag.
        max_files: File cap for the walk; clamped to 1..200_000.

    Raises:
        ValueError: If ``path`` does not exist or is not a directory.
    """
    root = Path(path).expanduser().resolve()
    if not root.exists():
        raise ValueError(f"경로가 존재하지 않습니다: {path}")
    if not root.is_dir():
        raise ValueError(f"폴더가 아닙니다: {path}")

    os_type = _current_os_type()
    max_files = max(1, min(int(max_files or 50_000), 200_000))
    sample_cap = 25

    status_counts: Counter = Counter()
    category_counts: Counter = Counter()
    extension_counts: Counter = Counter()
    allowed_samples: List[Dict[str, Any]] = []
    excluded_samples: List[Dict[str, Any]] = []
    total_files = 0
    readable_files = 0
    inaccessible = 0
    excluded_dirs = 0
    limit_reached = False

    def keep_excluded_sample(target: Path, status: str, reason: str) -> None:
        # _sample_file() stats the path, so only call it while below the cap.
        if len(excluded_samples) < sample_cap:
            excluded_samples.append(_sample_file(target, root, status, reason))

    for entry in self._iter_local_scan_entries(root, max_files=max_files):
        kind = entry["kind"]
        path_obj = entry["path"]
        if kind == "limit_reached":
            limit_reached = True
            break
        if kind == "excluded_dir":
            excluded_dirs += 1
            keep_excluded_sample(path_obj, "excluded", entry.get("reason", ""))
        elif kind in {"inaccessible_dir", "inaccessible_file"}:
            inaccessible += 1
            status_counts["failed"] += 1
            keep_excluded_sample(path_obj, "failed", entry.get("reason", ""))
        elif kind == "excluded":
            status_counts["excluded"] += 1
            keep_excluded_sample(path_obj, "excluded", entry.get("reason", ""))
        elif kind == "file":
            total_files += 1
            decision = self._local_file_decision(path_obj, root, entry["stat"])
            category_counts[decision["category"]] += 1
            extension_counts[path_obj.suffix.lower() or "(none)"] += 1
            if decision["indexable"]:
                readable_files += 1
                status_counts["readable"] += 1
                if len(allowed_samples) < sample_cap:
                    allowed_samples.append(_sample_file(path_obj, root, "readable"))
            else:
                status_counts[decision["status"]] += 1
                keep_excluded_sample(path_obj, decision["status"], decision["reason"])

    # Rough per-file cost weights in seconds; purely an estimate for the UI.
    doc_weight = category_counts["pdf"] * 1.4 + category_counts["document"] * 0.9 + category_counts["slide_deck"] * 1.0
    sheet_weight = category_counts["spreadsheet"] * 0.6
    ocr_weight = category_counts["image"] * (1.8 if include_ocr else 0.1)
    estimated_seconds = round(readable_files * 0.04 + doc_weight + sheet_weight + ocr_weight, 1)

    excluded_files = int(
        status_counts["excluded"]
        + status_counts["sensitive_blocked"]
        + status_counts["too_large"]
        + status_counts["unsupported"]
    )
    summary = {
        "total_files": total_files,
        "readable_files": readable_files,
        "excluded_files": excluded_files,
        "sensitive_files": int(status_counts["sensitive_blocked"]),
        "too_large_files": int(status_counts["too_large"]),
        "unsupported_files": int(status_counts["unsupported"]),
        "image_ocr_candidates": int(category_counts["image"]),
        "inaccessible_items": inaccessible,
        "excluded_dirs": excluded_dirs,
        "estimated_seconds": estimated_seconds,
        "storage_root": str(self.db_path.parent),
        "limit_reached": limit_reached,
    }
    consent_required = {
        "knowledge_source": True,
        "image_ocr": bool(category_counts["image"]),
        "watch": True,
        "sensitive_files_default_excluded": True,
    }

    return {
        "path": str(root),
        "source_id": f"source:{_path_fingerprint(root)}",
        "os_type": os_type,
        "drive_id": _drive_id_for_path(root),
        "warning": _root_warning(root, os_type),
        "privacy_notice": "현재 단계에서는 파일 내용을 읽지 않고, 폴더와 파일의 이름/크기/수정일만 확인합니다.",
        "include_ocr_requested": bool(include_ocr),
        "summary": summary,
        "by_status": dict(status_counts),
        "by_category": dict(category_counts),
        "by_extension": dict(extension_counts.most_common(40)),
        "allowed_samples": allowed_samples,
        "excluded_samples": excluded_samples,
        "consent_required": consent_required,
    }
1157
+
1158
def local_sources(self) -> Dict[str, Any]:
    """Return every registered knowledge source with per-status file counts.

    Sources are ordered by ``updated_at`` descending. Each one gets a
    ``file_status`` mapping of ``local_file_index`` status to row count
    (empty when the source has no indexed files).
    """
    with self._connect() as conn:
        source_rows = conn.execute(
            """
            SELECT id, root_path, os_type, drive_id, label, status, include_ocr,
                   watch_enabled, consent_json, created_at, updated_at, last_scanned_at
            FROM knowledge_sources
            ORDER BY updated_at DESC
            """
        ).fetchall()
        status_rows = conn.execute(
            "SELECT source_id, status, COUNT(*) AS count FROM local_file_index GROUP BY source_id, status"
        ).fetchall()

    sources: List[Dict[str, Any]] = []
    for row in source_rows:
        sources.append({
            "id": row["id"],
            "root_path": row["root_path"],
            "os_type": row["os_type"],
            "drive_id": row["drive_id"],
            "label": row["label"],
            "status": row["status"],
            "include_ocr": bool(row["include_ocr"]),
            "watch_enabled": bool(row["watch_enabled"]),
            "consent": _safe_loads(row["consent_json"]),
            "created_at": row["created_at"],
            "updated_at": row["updated_at"],
            "last_scanned_at": row["last_scanned_at"],
        })

    per_source: Dict[str, Dict[str, int]] = {}
    for row in status_rows:
        per_source.setdefault(row["source_id"], {})[row["status"]] = row["count"]
    for source in sources:
        source["file_status"] = per_source.get(source["id"], {})
    return {"sources": sources}
1193
+
1194
def set_local_source_watch(self, source_id: str, enabled: bool) -> Dict[str, Any]:
    """Turn folder watching on or off for one knowledge source.

    Raises:
        ValueError: If ``source_id`` is blank or no such source exists.
    """
    source_id = str(source_id or "").strip()
    if not source_id:
        raise ValueError("source_id required")

    flag = 1 if enabled else 0
    with self._connect() as conn:
        existing = conn.execute(
            "SELECT id FROM knowledge_sources WHERE id=?",
            (source_id,),
        ).fetchone()
        if not existing:
            raise ValueError(f"knowledge source not found: {source_id}")
        conn.execute(
            "UPDATE knowledge_sources SET watch_enabled=?, updated_at=? WHERE id=?",
            (flag, _now(), source_id),
        )
    return {"source_id": source_id, "watch_enabled": bool(enabled)}
1210
+
1211
def _extract_local_file_text(self, path: Path, category: str, *, include_ocr: bool) -> Tuple[str, Dict[str, Any]]:
    """Read a supported local file and return its text plus parser metadata.

    Text, code and CSV files are decoded as UTF-8 with replacement. PDF, DOCX,
    XLSX and PPTX files use ``pdfplumber``, ``python-docx``, ``openpyxl`` and
    ``python-pptx``, imported lazily so they are only required when such a
    file is processed. Images are opened with Pillow for their dimensions, and
    are only OCR'ed with ``pytesseract`` when ``include_ocr`` is true.

    Args:
        path: File to read.
        category: Category as produced by ``_file_category``.
        include_ocr: Whether to run OCR on images.

    Returns:
        ``(text, meta)``: the text truncated to 200,000 characters, and a dict
        that always contains ``"parser"`` plus format-specific details. An OCR
        failure is recorded in ``meta["ocr_error"]`` instead of being raised.

    Raises:
        ImportError: If the optional parser library for the format is missing.
        Exception: Other read or parse errors propagate to the caller.
    """
    max_chars = 200_000
    ext = path.suffix.lower()
    meta: Dict[str, Any] = {"parser": _parser_type_for_category(category, ext)}
    text = ""
    if category in {"text", "code"} or ext == ".csv":
        text = path.read_text(encoding="utf-8", errors="replace")
    elif ext == ".pdf":
        import pdfplumber
        with pdfplumber.open(str(path)) as pdf:
            meta["pages"] = len(pdf.pages)
            text = "\n\n".join((page.extract_text() or "") for page in pdf.pages)
    elif ext == ".docx":
        from docx import Document
        doc = Document(str(path))
        paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
        meta["paragraphs"] = len(paragraphs)
        meta["tables"] = len(doc.tables)
        text = "\n\n".join(paragraphs)
    elif ext == ".xlsx":
        from openpyxl import load_workbook
        wb = load_workbook(str(path), read_only=True, data_only=True)
        try:
            rows_all: List[str] = []
            # Length of "\n".join(rows_all), maintained incrementally so each
            # row costs O(1) instead of re-joining everything read so far.
            joined_len = 0
            over_limit = False
            for ws in wb.worksheets:
                header = f"[Sheet: {ws.title}]"
                joined_len += len(header) + (1 if rows_all else 0)
                rows_all.append(header)
                for row in ws.iter_rows(values_only=True):
                    line = "\t".join(str(cell) if cell is not None else "" for cell in row)
                    joined_len += len(line) + 1
                    rows_all.append(line)
                    if joined_len > max_chars:
                        over_limit = True
                        break
                if over_limit:
                    # Everything past max_chars is cut off by the final slice,
                    # so reading further sheets cannot change the result.
                    break
            meta["sheets"] = len(wb.worksheets)
            text = "\n".join(rows_all)
        finally:
            # Read-only workbooks keep the file handle open until closed.
            wb.close()
    elif ext == ".pptx":
        from pptx import Presentation
        prs = Presentation(str(path))
        slides_text = []
        for index, slide in enumerate(prs.slides, 1):
            parts = [
                shape.text_frame.text
                for shape in slide.shapes
                if getattr(shape, "has_text_frame", False)
            ]
            slides_text.append(f"[Slide {index}]\n" + "\n".join(parts))
        meta["slides"] = len(prs.slides)
        text = "\n\n".join(slides_text)
    elif category == "image":
        from PIL import Image
        with Image.open(str(path)) as image:
            meta.update({
                "width": image.width,
                "height": image.height,
                "format": image.format,
                "mode": image.mode,
                "ocr_enabled": bool(include_ocr),
            })
            if include_ocr:
                try:
                    import pytesseract
                    text = pytesseract.image_to_string(image)
                    meta["ocr_chars"] = len(text)
                except Exception as exc:  # pragma: no cover - depends on local OCR runtime
                    # Best effort: a missing or broken OCR runtime must not
                    # prevent the image metadata from being indexed.
                    meta["ocr_error"] = str(exc)
                    text = ""
    return text[:max_chars], meta
1273
+
1274
def _ensure_local_hierarchy(
    self,
    conn: sqlite3.Connection,
    *,
    source_id: str,
    root: Path,
    file_path: Path,
    os_type: str,
    drive_id: str,
) -> str:
    """Upsert the Computer -> Drive -> Folder chain leading to a file.

    Creates (or refreshes) the computer node, the drive node, the source's
    root folder node and one folder node per directory between ``root`` and
    the file's parent, linking each level to the next with a "포함함" edge.

    Returns:
        The node id of the folder that directly contains ``file_path``; this
        is the root folder when the file sits in ``root`` or outside of it.
    """
    def link(parent: str, child: str) -> None:
        # A fresh metadata dict is passed on every call.
        self._upsert_edge(conn, parent, child, "포함함", metadata={"source": "local_scan"})

    computer_label = platform.node() or "내 컴퓨터"
    computer_id = f"computer:{_slug(computer_label)}"
    drive_digest = _sha256_text(f"{os_type}:{drive_id}")[:24]
    drive_node_id = f"drive:{drive_digest}"
    root_digest = _sha256_text(f"{source_id}:root")[:24]
    root_folder_id = f"folder:{root_digest}"

    self._upsert_node(conn, computer_id, "Computer", computer_label, metadata={"os_type": os_type})
    self._upsert_node(conn, drive_node_id, "Drive", drive_id, metadata={"os_type": os_type, "drive_id": drive_id})
    link(computer_id, drive_node_id)
    self._upsert_node(
        conn,
        root_folder_id,
        "Folder",
        root.name or str(root),
        summary=str(root),
        metadata={"source_id": source_id, "path": str(root), "root": True},
    )
    link(drive_node_id, root_folder_id)

    try:
        relative_parent = file_path.parent.relative_to(root)
    except ValueError:
        # File is not under root: attach it directly to the root folder.
        relative_parent = Path()

    parent_id = root_folder_id
    current_path = root
    for segment in relative_parent.parts:
        current_path = current_path / segment
        folder_digest = _sha256_text(f"{source_id}:{current_path.as_posix()}")[:24]
        folder_id = f"folder:{folder_digest}"
        self._upsert_node(
            conn,
            folder_id,
            "Folder",
            segment,
            summary=str(current_path),
            metadata={"source_id": source_id, "path": str(current_path), "root": False},
        )
        link(parent_id, folder_id)
        parent_id = folder_id
    return parent_id
1321
+
1322
def _upsert_local_file_index(
    self,
    conn: sqlite3.Connection,
    *,
    source_id: str,
    root: Path,
    file_path: Path,
    stat: Optional[os.stat_result],
    os_type: str,
    drive_id: str,
    status: str,
    parser_type: str,
    sha256: Optional[str] = None,
    graph_node_id: Optional[str] = None,
    error_message: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> str:
    """Insert or update the ``local_file_index`` row for one scanned file.

    Rows are keyed by ``(source_id, relative_path)``. On conflict every column
    is overwritten, except that ``sha256``, ``last_indexed_at`` and
    ``graph_node_id`` keep their stored value when the new value is NULL.

    Args:
        conn: Open SQLite connection.
        source_id: Knowledge-source id owning the file.
        root: Root directory of the knowledge source.
        file_path: Path of the scanned file.
        stat: Stat result of the file, or ``None`` when unavailable; supplies
            the size and modification time columns.
        os_type: OS label stored on the row.
        drive_id: Drive/volume identifier stored on the row.
        status: Scan status; ``"indexed"`` stamps ``last_indexed_at`` and
            ``"deleted"`` sets the ``deleted`` flag.
        parser_type: Parser label stored on the row.
        sha256: Content digest, if one was computed.
        graph_node_id: Id of the graph node built for the file, if any.
        error_message: Failure description, if any.
        metadata: Extra data serialised into ``metadata_json``.

    Returns:
        The index row id derived from ``source_id`` and the relative path.
    """
    try:
        rel = file_path.relative_to(root).as_posix()
    except ValueError:
        # Not under root: fall back to the bare file name as the key.
        rel = file_path.name

    row_id = "local-index:" + _sha256_text(f"{source_id}:{rel}")[:24]
    scanned_at = _now()
    if stat:
        size_bytes = stat.st_size
        mtime_iso = _safe_iso_from_stat_mtime(stat.st_mtime)
    else:
        size_bytes = None
        mtime_iso = ""

    # NULL here lets the COALESCE in the upsert keep the previous timestamp.
    indexed_at = scanned_at if status == "indexed" else None
    deleted_flag = 1 if status == "deleted" else 0

    upsert_sql = """
        INSERT INTO local_file_index(
            id, source_id, os_type, drive_id, root_path, file_path, relative_path,
            file_name, extension, size_bytes, modified_at, sha256, last_scanned_at,
            last_indexed_at, parser_type, status, error_message, graph_node_id,
            deleted, metadata_json
        )
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        ON CONFLICT(source_id, relative_path) DO UPDATE SET
            os_type=excluded.os_type,
            drive_id=excluded.drive_id,
            root_path=excluded.root_path,
            file_path=excluded.file_path,
            file_name=excluded.file_name,
            extension=excluded.extension,
            size_bytes=excluded.size_bytes,
            modified_at=excluded.modified_at,
            sha256=COALESCE(excluded.sha256, local_file_index.sha256),
            last_scanned_at=excluded.last_scanned_at,
            last_indexed_at=COALESCE(excluded.last_indexed_at, local_file_index.last_indexed_at),
            parser_type=excluded.parser_type,
            status=excluded.status,
            error_message=excluded.error_message,
            graph_node_id=COALESCE(excluded.graph_node_id, local_file_index.graph_node_id),
            deleted=excluded.deleted,
            metadata_json=excluded.metadata_json
        """
    values = (
        row_id,
        source_id,
        os_type,
        drive_id,
        str(root),
        str(file_path),
        rel,
        file_path.name,
        file_path.suffix.lower(),
        size_bytes,
        mtime_iso,
        sha256,
        scanned_at,
        indexed_at,
        parser_type,
        status,
        error_message,
        graph_node_id,
        deleted_flag,
        _json(metadata),
    )
    conn.execute(upsert_sql, values)
    return row_id
1383
+
1384
def _upsert_local_file_node(
    self,
    conn: sqlite3.Connection,
    *,
    source_id: str,
    root: Path,
    file_path: Path,
    stat: os.stat_result,
    os_type: str,
    drive_id: str,
    sha256: str,
    category: str,
    parser_type: str,
    text: str,
    parser_meta: Dict[str, Any],
) -> str:
    """Rebuild the graph representation of one local file.

    Steps, in order:

    1. Ensure the Computer/Drive/Folder hierarchy above the file exists.
    2. Remove what a previous indexing run produced for this file: child
       nodes of type Chunk, ImageText or Section, the file's ``chunks`` rows
       and every edge leaving the file node.
    3. Upsert the file node and link it under its folder.
    4. For images with text, upsert an ImageText node.
    5. Upsert one Chunk node and ``chunks`` row per text chunk.
    6. Upsert extracted concept nodes, triple edges between them and
       semantic item nodes.

    Args:
        conn: Open SQLite connection used for all writes.
        source_id: Knowledge-source id owning the file.
        root: Root directory of the knowledge source.
        file_path: Path of the file being indexed.
        stat: Stat result of the file (size and mtime go into node metadata).
        os_type: OS label forwarded to the hierarchy builder.
        drive_id: Drive/volume identifier forwarded to the hierarchy builder.
        sha256: Content digest stored in node metadata.
        category: File category; selects the node type and enables the
            ImageText node when equal to ``"image"``.
        parser_type: Parser label stored in node metadata.
        text: Extracted text of the file (may be empty).
        parser_meta: Parser-specific metadata stored under ``"parser"``.

    Returns:
        The id of the file node.
    """
    try:
        relative_path = file_path.relative_to(root).as_posix()
    except ValueError:
        relative_path = file_path.name
    file_node_id = f"local-file:{_sha256_text(f'{source_id}:{relative_path}')[:24]}"
    parent_folder_id = self._ensure_local_hierarchy(
        conn,
        source_id=source_id,
        root=root,
        file_path=file_path,
        os_type=os_type,
        drive_id=drive_id,
    )

    # Collect the child ids before the edges that identify them are removed.
    child_rows = conn.execute(
        """
        SELECT e.to_node AS id
        FROM edges e
        JOIN nodes n ON n.id=e.to_node
        WHERE e.from_node=? AND n.type IN ('Chunk', 'ImageText', 'Section')
        """,
        (file_node_id,),
    ).fetchall()
    child_ids = [row["id"] for row in child_rows]
    conn.execute("DELETE FROM chunks WHERE source_node=?", (file_node_id,))
    # Delete in bounded batches: a single IN (...) list with one placeholder
    # per child can exceed SQLite's host-parameter limit (999 by default in
    # builds older than 3.32) for files that were split into many chunks.
    delete_batch_size = 500
    for start in range(0, len(child_ids), delete_batch_size):
        batch = child_ids[start:start + delete_batch_size]
        placeholders = ",".join("?" * len(batch))
        conn.execute(f"DELETE FROM nodes WHERE id IN ({placeholders})", batch)
    conn.execute("DELETE FROM edges WHERE from_node=?", (file_node_id,))

    metadata = {
        "source": "local_folder",
        "source_id": source_id,
        "root_path": str(root),
        "file_path": str(file_path),
        "relative_path": relative_path,
        "filename": file_path.name,
        "ext": file_path.suffix.lower(),
        "category": category,
        "parser_type": parser_type,
        "bytes": stat.st_size,
        "modified_at": _safe_iso_from_stat_mtime(stat.st_mtime),
        "sha256": sha256,
        "parser": parser_meta,
    }
    self._upsert_node(
        conn,
        file_node_id,
        _node_type_for_category(category),
        file_path.name,
        # Fall back to the relative path when no text could be extracted.
        summary=(_clean_text(text) or relative_path)[:700],
        metadata=metadata,
        raw=metadata,
    )
    self._upsert_edge(
        conn, parent_folder_id, file_node_id, "포함함", weight=1.0, metadata={"source": "local_scan"}
    )

    if category == "image" and text:
        image_text_id = f"imagetext:{_sha256_text(f'{file_node_id}:ocr')[:24]}"
        self._upsert_node(
            conn,
            image_text_id,
            "ImageText",
            f"{file_path.name} OCR",
            summary=_clean_text(text)[:700],
            metadata={"source_node": file_node_id, "source_id": source_id, "chars": len(text)},
        )
        self._upsert_edge(
            conn, file_node_id, image_text_id, "포함함", weight=0.8, metadata={"source": "ocr"}
        )

    for index, chunk in enumerate(_chunks(text)):
        # The chunk text is part of the id, so edited content yields new ids.
        chunk_id = f"chunk:{_sha256_text(f'{file_node_id}:{index}:{chunk}')[:24]}"
        self._upsert_node(
            conn,
            chunk_id,
            "Chunk",
            f"{file_path.name} chunk {index + 1}",
            summary=chunk[:500],
            metadata={"index": index, "source_node": file_node_id, "source_id": source_id},
        )
        conn.execute(
            "INSERT OR REPLACE INTO chunks(id, source_node, text, metadata_json, created_at) "
            "VALUES (?, ?, ?, ?, ?)",
            (
                chunk_id,
                file_node_id,
                chunk,
                _json({"index": index, "source_node": file_node_id, "source_id": source_id}),
                _now(),
            ),
        )
        self._upsert_edge(
            conn, file_node_id, chunk_id, "포함함", weight=0.7, metadata={"source": "local_scan"}
        )

    # The file name is prepended so it can contribute concepts of its own.
    concepts = _extract_concepts(f"{file_path.name}\n{text}", limit=18)
    concept_ids: Dict[str, str] = {}
    for concept in concepts:
        node_t = _classify_node_type(concept, text)
        concept_id = f"{node_t.lower()}:{_slug(concept)}"
        concept_ids[concept.lower()] = concept_id
        self._upsert_node(
            conn,
            concept_id,
            node_t,
            concept,
            metadata={"auto_extracted": True, "source": "local_folder", "source_id": source_id},
        )
        self._upsert_edge(
            conn, file_node_id, concept_id, "언급함", weight=0.75, metadata={"source": "local_scan"}
        )

    for triple in _extract_triples(text, concepts, limit=20):
        subj_id = concept_ids.get(triple["subject"].lower())
        obj_id = concept_ids.get(triple["object"].lower())
        # Only link concepts that were upserted above, and never to themselves.
        if subj_id and obj_id and subj_id != obj_id:
            self._upsert_edge(
                conn,
                subj_id,
                obj_id,
                triple["relation"],
                weight=0.9,
                # `or ""` also covers a context that is present but None.
                metadata={"context": (triple.get("context") or "")[:240], "source_id": source_id},
            )

    for item in _semantic_items(text):
        sem_type = item["type"]
        sem_title = item["title"]
        sem_id = f"{sem_type.lower()}:{_sha256_text(f'{file_node_id}:{sem_type}:{sem_title}')[:24]}"
        self._upsert_node(
            conn,
            sem_id,
            sem_type,
            sem_title,
            summary=item["summary"],
            metadata={"auto_extracted": True, "source_node": file_node_id, "filename": file_path.name},
            raw=item,
        )
        self._upsert_edge(conn, file_node_id, sem_id, "포함함", weight=0.9)

    return file_node_id
1535
+
1536
def index_local_folder(
    self,
    path: Path,
    *,
    include_ocr: bool = False,
    watch_enabled: bool = False,
    user_email: Optional[str] = None,
    consent: Optional[Dict[str, Any]] = None,
    max_files: int = 5_000,
) -> Dict[str, Any]:
    """Read approved files from a local folder and connect them to Graph RAG.

    The folder is registered (or refreshed) as a knowledge source, then every
    scan entry is processed. Non-indexable files are only recorded in
    ``local_file_index``. Files whose size and mtime, or whose sha256, match
    the previously indexed state are skipped. All other files are parsed and
    written to the graph. Unless the file limit was hit, index rows whose path
    was not seen during this scan are flagged as deleted.

    Args:
        path: Folder to index; ``~`` is expanded and the path is resolved.
        include_ocr: Forwarded to the text extractor and stored on the source.
        watch_enabled: Stored on the source and echoed in the result.
        user_email: Recorded as ``approved_by`` in the consent payload.
        consent: Extra consent fields; they override the default payload keys.
        max_files: Scan limit, clamped to the range 1..50_000 (a falsy value
            means 5_000).

    Returns:
        A dict with ``status``, ``source`` details, per-outcome ``counts``,
        up to 100 ``indexed_nodes``, up to 50 ``errors`` and a ``notice``.

    Raises:
        ValueError: If ``path`` does not exist or is not a directory.
    """
    root = Path(path).expanduser().resolve()
    if not root.exists():
        raise ValueError(f"경로가 존재하지 않습니다: {path}")
    if not root.is_dir():
        raise ValueError(f"폴더가 아닙니다: {path}")

    os_type = _current_os_type()
    drive_id = _drive_id_for_path(root)
    source_id = f"source:{_path_fingerprint(root)}"
    now = _now()
    max_files = max(1, min(int(max_files or 5_000), 50_000))
    consent_payload = {
        "approved_at": now,
        "approved_by": user_email,
        "knowledge_source": True,
        "include_ocr": bool(include_ocr),
        "watch_enabled": bool(watch_enabled),
        "sensitive_files_default_excluded": True,
        **(consent or {}),
    }
    counts: Counter = Counter()
    seen_relative_paths: set = set()
    indexed_nodes: List[str] = []
    errors: List[Dict[str, str]] = []
    limit_reached = False

    with self._connect() as conn:

        def record(file_path: Path, stat: Any, status: str, parser_type: str, **fields: Any) -> str:
            # One place for the arguments shared by every index-row write.
            return self._upsert_local_file_index(
                conn,
                source_id=source_id,
                root=root,
                file_path=file_path,
                stat=stat,
                os_type=os_type,
                drive_id=drive_id,
                status=status,
                parser_type=parser_type,
                **fields,
            )

        conn.execute(
            """
            INSERT INTO knowledge_sources(
                id, root_path, os_type, drive_id, label, status, include_ocr,
                watch_enabled, consent_json, created_at, updated_at, last_scanned_at
            )
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(id) DO UPDATE SET
                root_path=excluded.root_path,
                os_type=excluded.os_type,
                drive_id=excluded.drive_id,
                label=excluded.label,
                status=excluded.status,
                include_ocr=excluded.include_ocr,
                watch_enabled=excluded.watch_enabled,
                consent_json=excluded.consent_json,
                updated_at=excluded.updated_at,
                last_scanned_at=excluded.last_scanned_at
            """,
            (
                source_id, str(root), os_type, drive_id, root.name or str(root), "scanning",
                1 if include_ocr else 0, 1 if watch_enabled else 0, _json(consent_payload),
                now, now, now,
            ),
        )

        for entry in self._iter_local_scan_entries(root, max_files=max_files):
            kind = entry["kind"]
            file_path = entry["path"]
            if kind == "limit_reached":
                counts["limit_reached"] += 1
                limit_reached = True
                break
            if kind in {"excluded_dir", "excluded"}:
                counts["excluded"] += 1
                continue
            if kind in {"inaccessible_dir", "inaccessible_file"}:
                counts["failed"] += 1
                errors.append({"path": str(file_path), "error": entry.get("reason", "inaccessible")})
                continue
            if kind != "file":
                continue

            stat = entry["stat"]
            try:
                relative_path = file_path.relative_to(root).as_posix()
            except ValueError:
                relative_path = file_path.name
            seen_relative_paths.add(relative_path)
            decision = self._local_file_decision(file_path, root, stat)
            parser_type = decision["parser_type"]
            category = decision["category"]
            if not decision["indexable"]:
                counts[decision["status"]] += 1
                record(
                    file_path,
                    stat,
                    decision["status"],
                    parser_type,
                    metadata={"reason": decision["reason"], "category": category},
                )
                continue

            modified_at = _safe_iso_from_stat_mtime(stat.st_mtime)
            existing = conn.execute(
                """
                SELECT size_bytes, modified_at, sha256, graph_node_id, status
                FROM local_file_index
                WHERE source_id=? AND relative_path=?
                """,
                (source_id, relative_path),
            ).fetchone()

            # Cheap shortcut: same size and mtime as the last successful index.
            if (
                existing
                and existing["status"] == "indexed"
                and existing["graph_node_id"]
                and existing["size_bytes"] == stat.st_size
                and existing["modified_at"] == modified_at
            ):
                counts["skipped_unchanged"] += 1
                record(
                    file_path,
                    stat,
                    "indexed",
                    parser_type,
                    sha256=existing["sha256"],
                    graph_node_id=existing["graph_node_id"],
                    metadata={"category": category, "unchanged": True},
                )
                continue

            try:
                data = file_path.read_bytes()
                digest = _sha256_bytes(data)
            except Exception as exc:
                counts["failed"] += 1
                errors.append({"path": str(file_path), "error": str(exc)})
                record(
                    file_path,
                    stat,
                    "failed",
                    parser_type,
                    error_message=str(exc),
                    metadata={"category": category},
                )
                continue

            # Content shortcut: identical bytes already in the graph. A row
            # left in "failed" state must not take this path: the failure
            # handler below stores the new digest while the upsert keeps the
            # graph_node_id of an older successful run, so the digest would
            # match even though the graph was never rebuilt for this content.
            if (
                existing
                and existing["status"] != "failed"
                and existing["sha256"] == digest
                and existing["graph_node_id"]
            ):
                counts["skipped_unchanged"] += 1
                record(
                    file_path,
                    stat,
                    "indexed",
                    parser_type,
                    sha256=digest,
                    graph_node_id=existing["graph_node_id"],
                    metadata={"category": category, "sha256_unchanged": True},
                )
                continue

            try:
                text, parser_meta = self._extract_local_file_text(
                    file_path,
                    category,
                    include_ocr=include_ocr,
                )
                graph_node_id = self._upsert_local_file_node(
                    conn,
                    source_id=source_id,
                    root=root,
                    file_path=file_path,
                    stat=stat,
                    os_type=os_type,
                    drive_id=drive_id,
                    sha256=digest,
                    category=category,
                    parser_type=parser_type,
                    text=text,
                    parser_meta=parser_meta,
                )
                record(
                    file_path,
                    stat,
                    "indexed",
                    parser_type,
                    sha256=digest,
                    graph_node_id=graph_node_id,
                    metadata={"category": category, "parser": parser_meta},
                )
                counts["indexed"] += 1
                indexed_nodes.append(graph_node_id)
            except Exception as exc:
                # Best effort per file: record the failure and keep scanning.
                counts["failed"] += 1
                errors.append({"path": str(file_path), "error": str(exc)})
                record(
                    file_path,
                    stat,
                    "failed",
                    parser_type,
                    sha256=digest,
                    error_message=str(exc),
                    metadata={"category": category},
                )

        # A truncated scan has not seen every file, so absence proves nothing.
        if not limit_reached:
            existing_paths = {
                row["relative_path"]
                for row in conn.execute(
                    "SELECT relative_path FROM local_file_index WHERE source_id=?",
                    (source_id,),
                )
            }
            deleted_paths = existing_paths - seen_relative_paths
            for relative_path in deleted_paths:
                conn.execute(
                    """
                    UPDATE local_file_index
                    SET status='deleted', deleted=1, last_scanned_at=?, error_message=NULL
                    WHERE source_id=? AND relative_path=?
                    """,
                    (_now(), source_id, relative_path),
                )
            counts["deleted"] = len(deleted_paths)
        conn.execute(
            """
            UPDATE knowledge_sources
            SET status='active', updated_at=?, last_scanned_at=?
            WHERE id=?
            """,
            (_now(), _now(), source_id),
        )

    return {
        "status": "ok",
        "source": {
            "id": source_id,
            "root_path": str(root),
            "os_type": os_type,
            "drive_id": drive_id,
            "include_ocr": bool(include_ocr),
            "watch_enabled": bool(watch_enabled),
        },
        "counts": dict(counts),
        "indexed_nodes": indexed_nodes[:100],
        "errors": errors[:50],
        "notice": "Lattice AI는 사용자가 선택한 폴더만 AI 지식으로 변환합니다.",
    }
1810
+
505
1811
  def ingest_message(
506
1812
  self,
507
1813
  role: str,
@@ -961,8 +2267,17 @@ class KnowledgeGraphStore:
961
2267
  # ── 그래프에 표시되는 노드 타입 (점 = 명사) ──────────────────────────────
962
2268
  # Message / AIResponse / Chunk 는 RAG 검색용으로만 저장, 그래프에서 숨김.
963
2269
  _GRAPH_VISIBLE_TYPES = (
2270
+ "Computer", # 내 컴퓨터
2271
+ "Drive", # 드라이브 / 볼륨
2272
+ "Folder", # 폴더
2273
+ "File", # 일반 파일
964
2274
  "Chat", # 대화 세션
965
2275
  "Document", # 파일 (PDF·PPT·Word·Excel·이미지)
2276
+ "CodeFile", # 코드 파일
2277
+ "Spreadsheet",# 엑셀/CSV
2278
+ "SlideDeck", # 프레젠테이션
2279
+ "Image", # 이미지
2280
+ "ImageText", # OCR 텍스트
966
2281
  "Concept", # 개념 / 아이디어 / 기술 용어
967
2282
  "Person", # 사람
968
2283
  "Error", # 오류 / 버그
@@ -1133,7 +2448,10 @@ class KnowledgeGraphStore:
1133
2448
  def score(row: sqlite3.Row) -> tuple:
1134
2449
  haystack = f"{row['title']} {row['summary']} {row['metadata_json']}".lower()
1135
2450
  hits = sum(1 for term in terms_for_score if term.lower() in haystack)
1136
- type_boost = 1 if row["type"] in {"Decision", "Task", "File", "Page", "Slide"} else 0
2451
+ type_boost = 1 if row["type"] in {
2452
+ "Decision", "Task", "File", "Document", "CodeFile",
2453
+ "Spreadsheet", "SlideDeck", "Image", "ImageText", "Page", "Slide",
2454
+ } else 0
1137
2455
  return (hits, type_boost, row["updated_at"] or "")
1138
2456
 
1139
2457
  rows = sorted(rows, key=score, reverse=True)[:limit]
@@ -1192,7 +2510,13 @@ class KnowledgeGraphStore:
1192
2510
  lines = []
1193
2511
  for match in matches[:limit]:
1194
2512
  meta = match.get("metadata") or {}
1195
- source = meta.get("filename") or meta.get("conversation_id") or meta.get("source") or match["id"]
2513
+ source = (
2514
+ meta.get("relative_path")
2515
+ or meta.get("filename")
2516
+ or meta.get("conversation_id")
2517
+ or meta.get("source")
2518
+ or match["id"]
2519
+ )
1196
2520
  summary = _clean_text(match.get("summary") or "")[:700]
1197
2521
  lines.append(f"- [{match['type']}] {match['title']} | source={source} | {summary}")
1198
2522
  return "\n".join(lines)
@@ -1271,7 +2595,11 @@ class KnowledgeGraphStore:
1271
2595
  "nodes": conn.execute("SELECT COUNT(*) AS c FROM nodes").fetchone()["c"],
1272
2596
  "edges": conn.execute("SELECT COUNT(*) AS c FROM edges").fetchone()["c"],
1273
2597
  "chunks": conn.execute("SELECT COUNT(*) AS c FROM chunks").fetchone()["c"],
2598
+ "knowledge_sources": conn.execute("SELECT COUNT(*) AS c FROM knowledge_sources").fetchone()["c"],
2599
+ "local_file_index": conn.execute("SELECT COUNT(*) AS c FROM local_file_index").fetchone()["c"],
1274
2600
  }
2601
+ conn.execute("DELETE FROM local_file_index")
2602
+ conn.execute("DELETE FROM knowledge_sources")
1275
2603
  conn.execute("DELETE FROM chunks")
1276
2604
  conn.execute("DELETE FROM edges")
1277
2605
  conn.execute("DELETE FROM nodes")
@@ -1290,6 +2618,11 @@ class KnowledgeGraphStore:
1290
2618
  row["type"]: row["count"]
1291
2619
  for row in conn.execute("SELECT type, COUNT(*) AS count FROM edges GROUP BY type")
1292
2620
  }
2621
+ local_sources = conn.execute("SELECT COUNT(*) AS c FROM knowledge_sources").fetchone()["c"]
2622
+ local_file_status = {
2623
+ row["status"]: row["count"]
2624
+ for row in conn.execute("SELECT status, COUNT(*) AS count FROM local_file_index GROUP BY status")
2625
+ }
1293
2626
  v2 = None
1294
2627
  if KGStoreV2 is not None:
1295
2628
  try:
@@ -1302,5 +2635,7 @@ class KnowledgeGraphStore:
1302
2635
  "v2_schema_available": KGStoreV2 is not None,
1303
2636
  "nodes": node_counts,
1304
2637
  "edges": edge_counts,
2638
+ "local_sources": local_sources,
2639
+ "local_file_status": local_file_status,
1305
2640
  "v2": v2,
1306
2641
  }