ltcai 4.3.3 → 4.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +21 -16
- package/docs/CHANGELOG.md +37 -0
- package/docs/V4_4_0_EXTRACTION_REPORT.md +239 -0
- package/lattice_brain/__init__.py +38 -23
- package/lattice_brain/_kg_common.py +11 -1
- package/lattice_brain/context.py +212 -2
- package/lattice_brain/conversations.py +234 -1
- package/lattice_brain/discovery.py +11 -1
- package/lattice_brain/documents.py +11 -1
- package/lattice_brain/graph/__init__.py +28 -0
- package/lattice_brain/graph/_kg_common.py +1123 -0
- package/lattice_brain/graph/curator.py +473 -0
- package/lattice_brain/graph/discovery.py +1455 -0
- package/lattice_brain/graph/documents.py +218 -0
- package/lattice_brain/graph/identity.py +175 -0
- package/lattice_brain/graph/ingest.py +644 -0
- package/lattice_brain/graph/network.py +205 -0
- package/lattice_brain/graph/projection.py +571 -0
- package/lattice_brain/graph/provenance.py +401 -0
- package/lattice_brain/graph/retrieval.py +1341 -0
- package/lattice_brain/graph/schema.py +640 -0
- package/lattice_brain/graph/store.py +237 -0
- package/lattice_brain/graph/write_master.py +225 -0
- package/lattice_brain/identity.py +11 -13
- package/lattice_brain/ingest.py +11 -1
- package/lattice_brain/ingestion.py +318 -0
- package/lattice_brain/memory.py +100 -1
- package/lattice_brain/network.py +11 -1
- package/lattice_brain/portability.py +431 -0
- package/lattice_brain/projection.py +11 -1
- package/lattice_brain/provenance.py +11 -1
- package/lattice_brain/retrieval.py +11 -1
- package/lattice_brain/runtime/__init__.py +32 -0
- package/lattice_brain/runtime/agent_runtime.py +569 -0
- package/lattice_brain/runtime/hooks.py +754 -0
- package/lattice_brain/runtime/multi_agent.py +795 -0
- package/lattice_brain/schema.py +11 -1
- package/lattice_brain/store.py +10 -2
- package/lattice_brain/workflow.py +461 -0
- package/lattice_brain/write_master.py +11 -1
- package/latticeai/__init__.py +1 -1
- package/latticeai/api/agents.py +2 -2
- package/latticeai/api/browser.py +1 -1
- package/latticeai/api/chat.py +1 -1
- package/latticeai/api/computer_use.py +1 -1
- package/latticeai/api/hooks.py +2 -2
- package/latticeai/api/mcp.py +1 -1
- package/latticeai/api/tools.py +1 -1
- package/latticeai/api/workflow_designer.py +2 -2
- package/latticeai/app_factory.py +4 -4
- package/latticeai/brain/__init__.py +24 -6
- package/latticeai/brain/_kg_common.py +11 -1117
- package/latticeai/brain/context.py +12 -208
- package/latticeai/brain/conversations.py +12 -231
- package/latticeai/brain/discovery.py +13 -1451
- package/latticeai/brain/documents.py +13 -214
- package/latticeai/brain/identity.py +11 -169
- package/latticeai/brain/ingest.py +13 -640
- package/latticeai/brain/memory.py +12 -97
- package/latticeai/brain/network.py +12 -200
- package/latticeai/brain/projection.py +13 -567
- package/latticeai/brain/provenance.py +13 -397
- package/latticeai/brain/retrieval.py +13 -1337
- package/latticeai/brain/schema.py +12 -635
- package/latticeai/brain/store.py +13 -233
- package/latticeai/brain/write_master.py +13 -221
- package/latticeai/core/agent.py +1 -1
- package/latticeai/core/agent_registry.py +2 -2
- package/latticeai/core/builtin_hooks.py +2 -2
- package/latticeai/core/graph_curator.py +6 -468
- package/latticeai/core/hooks.py +6 -749
- package/latticeai/core/marketplace.py +1 -1
- package/latticeai/core/multi_agent.py +6 -790
- package/latticeai/core/workflow_engine.py +6 -456
- package/latticeai/core/workspace_os.py +1 -1
- package/latticeai/services/agent_runtime.py +6 -564
- package/latticeai/services/ingestion.py +6 -313
- package/latticeai/services/kg_portability.py +6 -426
- package/latticeai/services/platform_runtime.py +3 -3
- package/latticeai/services/run_executor.py +1 -1
- package/latticeai/services/upload_service.py +1 -1
- package/p_reinforce.py +1 -1
- package/package.json +1 -1
- package/scripts/bump_version.py +1 -1
- package/scripts/wheel_smoke.py +7 -0
- package/src-tauri/Cargo.lock +1 -1
- package/src-tauri/Cargo.toml +1 -1
- package/src-tauri/tauri.conf.json +1 -1
- package/static/app/asset-manifest.json +1 -1
|
@@ -1,1123 +1,17 @@
|
|
|
1
|
-
"""
|
|
2
|
-
SQLite knowledge graph for Lattice AI workspace memory.
|
|
1
|
+
"""Deprecated shim: physically moved to lattice_brain.graph._kg_common.
|
|
3
2
|
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
the ingestion contract.
|
|
3
|
+
Kept only for the compatibility window. The module aliases itself to the
|
|
4
|
+
physical module so identity, singletons, and monkeypatching are preserved.
|
|
7
5
|
"""
|
|
8
6
|
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
import asyncio
|
|
12
|
-
import hashlib
|
|
13
|
-
import json
|
|
14
|
-
import logging
|
|
15
|
-
import math
|
|
16
|
-
import os
|
|
17
|
-
import platform
|
|
18
|
-
import re
|
|
19
|
-
import shutil
|
|
20
|
-
import sqlite3
|
|
21
|
-
import time
|
|
22
|
-
import zipfile
|
|
23
|
-
from collections import Counter
|
|
24
|
-
from datetime import datetime
|
|
25
|
-
from pathlib import Path
|
|
26
|
-
from typing import Any, Dict, Iterable, List, Optional, Tuple
|
|
27
|
-
|
|
28
|
-
try:
|
|
29
|
-
from latticeai.brain.schema import KGStoreV2, NodeType, EdgeType, _exec_script
|
|
30
|
-
except Exception: # pragma: no cover - v2 schema is optional at import time
|
|
31
|
-
KGStoreV2 = None # type: ignore[assignment]
|
|
32
|
-
NodeType = None # type: ignore[assignment]
|
|
33
|
-
EdgeType = None # type: ignore[assignment]
|
|
34
|
-
_exec_script = None # type: ignore[assignment]
|
|
35
|
-
|
|
36
|
-
from lattice_brain.embeddings import LocalEmbeddingModel
|
|
37
|
-
|
|
38
|
-
# Default read source for the graph queries: v2 reconstruction views.
|
|
39
|
-
# Override with LATTICEAI_KG_READ_V2=0 to fall back to the legacy tables.
|
|
40
|
-
_READ_FROM_V2_DEFAULT = os.getenv("LATTICEAI_KG_READ_V2", "1") != "0"
|
|
41
|
-
|
|
42
|
-
# Bump when the v2 projection layout changes (columns, normalization rules).
|
|
43
|
-
# On init, a stale projection is dropped and rebuilt from the authoritative
|
|
44
|
-
# legacy tables — safe because nodes_v2/edges_v2 only ever hold a derived view.
|
|
45
|
-
# v4: summary nullable + verbatim (byte-faithful) projection of legacy values.
|
|
46
|
-
_PROJECTION_VERSION = 4
|
|
47
|
-
_KG_DB_FORMAT_VERSION = 4
|
|
48
|
-
_KG_DB_FORMAT_KEY = "db_format_version"
|
|
49
|
-
_V2_WRITE_MASTER_KEY = "v2_write_mastered_at"
|
|
50
|
-
|
|
51
|
-
_llm_router_ref = None
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
def set_llm_router(router_instance):
|
|
55
|
-
global _llm_router_ref
|
|
56
|
-
_llm_router_ref = router_instance
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
GRAPH_SCHEMA_VERSION = 1
|
|
60
|
-
|
|
61
|
-
LOCAL_TEXT_EXTENSIONS = {".txt", ".md"}
|
|
62
|
-
LOCAL_CODE_EXTENSIONS = {
|
|
63
|
-
".py",
|
|
64
|
-
".js",
|
|
65
|
-
".ts",
|
|
66
|
-
".tsx",
|
|
67
|
-
".jsx",
|
|
68
|
-
".html",
|
|
69
|
-
".css",
|
|
70
|
-
".json",
|
|
71
|
-
".yaml",
|
|
72
|
-
".yml",
|
|
73
|
-
".xml",
|
|
74
|
-
".sql",
|
|
75
|
-
".sh",
|
|
76
|
-
".zsh",
|
|
77
|
-
".toml",
|
|
78
|
-
".ini",
|
|
79
|
-
}
|
|
80
|
-
LOCAL_DOCUMENT_EXTENSIONS = {".pdf", ".docx"}
|
|
81
|
-
LOCAL_SPREADSHEET_EXTENSIONS = {".xlsx", ".csv"}
|
|
82
|
-
LOCAL_SLIDE_EXTENSIONS = {".pptx"}
|
|
83
|
-
LOCAL_IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".webp"}
|
|
84
|
-
LOCAL_SUPPORTED_EXTENSIONS = (
|
|
85
|
-
LOCAL_TEXT_EXTENSIONS
|
|
86
|
-
| LOCAL_CODE_EXTENSIONS
|
|
87
|
-
| LOCAL_DOCUMENT_EXTENSIONS
|
|
88
|
-
| LOCAL_SPREADSHEET_EXTENSIONS
|
|
89
|
-
| LOCAL_SLIDE_EXTENSIONS
|
|
90
|
-
| LOCAL_IMAGE_EXTENSIONS
|
|
91
|
-
)
|
|
92
|
-
|
|
93
|
-
LOCAL_SIZE_LIMITS = {
|
|
94
|
-
"text": 4_000_000,
|
|
95
|
-
"code": 4_000_000,
|
|
96
|
-
"pdf": 50_000_000,
|
|
97
|
-
"document": 50_000_000,
|
|
98
|
-
"spreadsheet": 50_000_000,
|
|
99
|
-
"slide_deck": 50_000_000,
|
|
100
|
-
"image": 100_000_000,
|
|
101
|
-
}
|
|
102
|
-
|
|
103
|
-
COMMON_EXCLUDED_DIRS = {
|
|
104
|
-
".git",
|
|
105
|
-
"node_modules",
|
|
106
|
-
".venv",
|
|
107
|
-
"venv",
|
|
108
|
-
"env",
|
|
109
|
-
"__pycache__",
|
|
110
|
-
".pytest_cache",
|
|
111
|
-
".mypy_cache",
|
|
112
|
-
".ruff_cache",
|
|
113
|
-
".next",
|
|
114
|
-
".nuxt",
|
|
115
|
-
".turbo",
|
|
116
|
-
"dist",
|
|
117
|
-
"build",
|
|
118
|
-
"target",
|
|
119
|
-
"out",
|
|
120
|
-
"coverage",
|
|
121
|
-
".cache",
|
|
122
|
-
".config",
|
|
123
|
-
".ssh",
|
|
124
|
-
".gnupg",
|
|
125
|
-
".docker",
|
|
126
|
-
".kube",
|
|
127
|
-
".aws",
|
|
128
|
-
".azure",
|
|
129
|
-
".npm",
|
|
130
|
-
".pnpm-store",
|
|
131
|
-
".yarn",
|
|
132
|
-
".bun",
|
|
133
|
-
".cargo",
|
|
134
|
-
".rustup",
|
|
135
|
-
".pyenv",
|
|
136
|
-
".conda",
|
|
137
|
-
".local",
|
|
138
|
-
".claude",
|
|
139
|
-
".codex",
|
|
140
|
-
".cursor",
|
|
141
|
-
".copilot",
|
|
142
|
-
".antigravity",
|
|
143
|
-
".antigravity-ide",
|
|
144
|
-
}
|
|
145
|
-
|
|
146
|
-
COMMON_EXCLUDED_FILE_NAMES = {
|
|
147
|
-
".env",
|
|
148
|
-
".env.local",
|
|
149
|
-
".env.production",
|
|
150
|
-
".env.development",
|
|
151
|
-
"id_rsa",
|
|
152
|
-
"id_ed25519",
|
|
153
|
-
"authorized_keys",
|
|
154
|
-
"known_hosts",
|
|
155
|
-
"credentials.json",
|
|
156
|
-
"service-account.json",
|
|
157
|
-
"token.json",
|
|
158
|
-
"secrets.json",
|
|
159
|
-
"cookies",
|
|
160
|
-
"login data",
|
|
161
|
-
"history",
|
|
162
|
-
"web data",
|
|
163
|
-
".ds_store",
|
|
164
|
-
"thumbs.db",
|
|
165
|
-
}
|
|
166
|
-
COMMON_EXCLUDED_FILE_SUFFIXES = {
|
|
167
|
-
".pem",
|
|
168
|
-
".key",
|
|
169
|
-
".p12",
|
|
170
|
-
".pfx",
|
|
171
|
-
".kdbx",
|
|
172
|
-
".wallet",
|
|
173
|
-
".sqlite",
|
|
174
|
-
".db",
|
|
175
|
-
".exe",
|
|
176
|
-
".dll",
|
|
177
|
-
".sys",
|
|
178
|
-
".msi",
|
|
179
|
-
".dmg",
|
|
180
|
-
".pkg",
|
|
181
|
-
".app",
|
|
182
|
-
".zip",
|
|
183
|
-
".tar",
|
|
184
|
-
".gz",
|
|
185
|
-
".7z",
|
|
186
|
-
".rar",
|
|
187
|
-
".mp4",
|
|
188
|
-
".mov",
|
|
189
|
-
".mp3",
|
|
190
|
-
".wav",
|
|
191
|
-
".tmp",
|
|
192
|
-
".bak",
|
|
193
|
-
".lock",
|
|
194
|
-
}
|
|
195
|
-
SENSITIVE_PATH_KEYWORDS = {
|
|
196
|
-
"secret",
|
|
197
|
-
"secrets",
|
|
198
|
-
"token",
|
|
199
|
-
"password",
|
|
200
|
-
"passwd",
|
|
201
|
-
"credential",
|
|
202
|
-
"credentials",
|
|
203
|
-
"private",
|
|
204
|
-
"key",
|
|
205
|
-
"wallet",
|
|
206
|
-
"recovery",
|
|
207
|
-
"seed",
|
|
208
|
-
"mnemonic",
|
|
209
|
-
"cookie",
|
|
210
|
-
"session",
|
|
211
|
-
"auth",
|
|
212
|
-
"oauth",
|
|
213
|
-
"certificate",
|
|
214
|
-
"cert",
|
|
215
|
-
"api_key",
|
|
216
|
-
"apikey",
|
|
217
|
-
}
|
|
218
|
-
|
|
219
|
-
MACOS_EXCLUDED_PREFIXES = (
|
|
220
|
-
"/System",
|
|
221
|
-
"/Library",
|
|
222
|
-
"/Applications",
|
|
223
|
-
"/private",
|
|
224
|
-
"/tmp",
|
|
225
|
-
"/var",
|
|
226
|
-
)
|
|
227
|
-
WINDOWS_EXCLUDED_NAMES = {
|
|
228
|
-
"windows",
|
|
229
|
-
"program files",
|
|
230
|
-
"program files (x86)",
|
|
231
|
-
"programdata",
|
|
232
|
-
"appdata",
|
|
233
|
-
"$recycle.bin",
|
|
234
|
-
"system volume information",
|
|
235
|
-
"recovery",
|
|
236
|
-
"perflogs",
|
|
237
|
-
"intel",
|
|
238
|
-
"amd",
|
|
239
|
-
"nvidia",
|
|
240
|
-
}
|
|
241
|
-
LINUX_EXCLUDED_PREFIXES = (
|
|
242
|
-
"/bin",
|
|
243
|
-
"/boot",
|
|
244
|
-
"/dev",
|
|
245
|
-
"/etc",
|
|
246
|
-
"/lib",
|
|
247
|
-
"/lib64",
|
|
248
|
-
"/proc",
|
|
249
|
-
"/root",
|
|
250
|
-
"/run",
|
|
251
|
-
"/sbin",
|
|
252
|
-
"/sys",
|
|
253
|
-
"/tmp",
|
|
254
|
-
"/usr",
|
|
255
|
-
"/var",
|
|
256
|
-
"/snap",
|
|
257
|
-
"/lost+found",
|
|
258
|
-
)
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
def _now() -> str:
|
|
262
|
-
return datetime.now().isoformat()
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
def _parse_iso(raw: Optional[str]) -> Optional[datetime]:
|
|
266
|
-
if not raw:
|
|
267
|
-
return None
|
|
268
|
-
try:
|
|
269
|
-
return datetime.fromisoformat(str(raw))
|
|
270
|
-
except (TypeError, ValueError):
|
|
271
|
-
return None
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
def _recency_score(
|
|
275
|
-
updated_at: Optional[str],
|
|
276
|
-
*,
|
|
277
|
-
now: Optional[datetime] = None,
|
|
278
|
-
half_life_days: float = 14.0,
|
|
279
|
-
) -> float:
|
|
280
|
-
stamp = _parse_iso(updated_at)
|
|
281
|
-
if not stamp:
|
|
282
|
-
return 0.0
|
|
283
|
-
now = now or datetime.now()
|
|
284
|
-
age_days = max(0.0, (now - stamp).total_seconds() / 86400.0)
|
|
285
|
-
decay = math.log(2) / max(0.1, half_life_days)
|
|
286
|
-
return math.exp(-decay * age_days)
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
def _json(data: Optional[Dict[str, Any]]) -> str:
|
|
290
|
-
return json.dumps(data or {}, ensure_ascii=False, sort_keys=True)
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
def _safe_loads(raw: Optional[str]) -> Dict[str, Any]:
|
|
294
|
-
"""Tolerantly parse a metadata_json column — returns {} on corrupt rows."""
|
|
295
|
-
if not raw:
|
|
296
|
-
return {}
|
|
297
|
-
try:
|
|
298
|
-
value = json.loads(raw)
|
|
299
|
-
return value if isinstance(value, dict) else {}
|
|
300
|
-
except (json.JSONDecodeError, TypeError) as e:
|
|
301
|
-
logging.warning(
|
|
302
|
-
"knowledge_graph: corrupt metadata_json (%s) — using empty dict", e
|
|
303
|
-
)
|
|
304
|
-
return {}
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
def _slug(text: str, max_len: int = 96) -> str:
|
|
308
|
-
value = re.sub(r"\s+", " ", str(text or "")).strip().lower()
|
|
309
|
-
value = re.sub(r"[^0-9a-zA-Z가-힣._:@/-]+", "-", value).strip("-")
|
|
310
|
-
return (value or "untitled")[:max_len]
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
def _sha256_bytes(data: bytes) -> str:
|
|
314
|
-
return hashlib.sha256(data).hexdigest()
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
def _sha256_text(text: str) -> str:
|
|
318
|
-
return hashlib.sha256(text.encode("utf-8", errors="replace")).hexdigest()
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
def _safe_iso_from_stat_mtime(mtime: float) -> str:
|
|
322
|
-
try:
|
|
323
|
-
return datetime.fromtimestamp(float(mtime)).isoformat()
|
|
324
|
-
except (TypeError, ValueError, OSError):
|
|
325
|
-
return ""
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
def _path_fingerprint(path: Path) -> str:
|
|
329
|
-
return _sha256_text(str(path.expanduser().resolve()))[:24]
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
def _is_relative_to(path: Path, base: Path) -> bool:
|
|
333
|
-
try:
|
|
334
|
-
path.relative_to(base)
|
|
335
|
-
return True
|
|
336
|
-
except ValueError:
|
|
337
|
-
return False
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
def _path_parts_lower(path: Path) -> List[str]:
|
|
341
|
-
return [
|
|
342
|
-
part.lower()
|
|
343
|
-
for part in path.parts
|
|
344
|
-
if part and part not in {os.sep, path.anchor}
|
|
345
|
-
]
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
def _current_os_type() -> str:
|
|
349
|
-
system = platform.system().lower()
|
|
350
|
-
if system.startswith("darwin"):
|
|
351
|
-
return "macos"
|
|
352
|
-
if system.startswith("windows"):
|
|
353
|
-
return "windows"
|
|
354
|
-
if system.startswith("linux"):
|
|
355
|
-
return "linux"
|
|
356
|
-
return system or "unknown"
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
def _drive_id_for_path(path: Path) -> str:
|
|
360
|
-
resolved = path.expanduser().resolve()
|
|
361
|
-
if resolved.drive:
|
|
362
|
-
return resolved.drive.upper()
|
|
363
|
-
parts = resolved.parts
|
|
364
|
-
if len(parts) >= 3 and parts[1] == "Volumes":
|
|
365
|
-
return f"/Volumes/{parts[2]}"
|
|
366
|
-
if len(parts) >= 3 and parts[1] == "media":
|
|
367
|
-
return f"/media/{parts[2]}"
|
|
368
|
-
if len(parts) >= 3 and parts[1] == "mnt":
|
|
369
|
-
return f"/mnt/{parts[2]}"
|
|
370
|
-
return resolved.anchor or "/"
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
def _file_category(ext: str) -> str:
|
|
374
|
-
ext = (ext or "").lower()
|
|
375
|
-
if ext in LOCAL_CODE_EXTENSIONS:
|
|
376
|
-
return "code"
|
|
377
|
-
if ext in LOCAL_TEXT_EXTENSIONS:
|
|
378
|
-
return "text"
|
|
379
|
-
if ext == ".pdf":
|
|
380
|
-
return "pdf"
|
|
381
|
-
if ext in LOCAL_DOCUMENT_EXTENSIONS:
|
|
382
|
-
return "document"
|
|
383
|
-
if ext in LOCAL_SPREADSHEET_EXTENSIONS:
|
|
384
|
-
return "spreadsheet"
|
|
385
|
-
if ext in LOCAL_SLIDE_EXTENSIONS:
|
|
386
|
-
return "slide_deck"
|
|
387
|
-
if ext in LOCAL_IMAGE_EXTENSIONS:
|
|
388
|
-
return "image"
|
|
389
|
-
return "unsupported"
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
def _node_type_for_category(category: str) -> str:
|
|
393
|
-
return {
|
|
394
|
-
"code": "CodeFile",
|
|
395
|
-
"spreadsheet": "Spreadsheet",
|
|
396
|
-
"slide_deck": "SlideDeck",
|
|
397
|
-
"image": "Image",
|
|
398
|
-
"unsupported": "File",
|
|
399
|
-
}.get(category, "Document")
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
def _parser_type_for_category(category: str, ext: str) -> str:
|
|
403
|
-
if category in {"text", "code"}:
|
|
404
|
-
return "plain_text"
|
|
405
|
-
if category == "spreadsheet" and ext == ".csv":
|
|
406
|
-
return "csv_text"
|
|
407
|
-
if category == "image":
|
|
408
|
-
return "image_ocr"
|
|
409
|
-
return ext.lstrip(".") or category
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
def _size_limit_for_category(category: str) -> int:
|
|
413
|
-
return LOCAL_SIZE_LIMITS.get(category, LOCAL_SIZE_LIMITS["document"])
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
def _is_hidden_path(path: Path, root: Optional[Path] = None) -> bool:
|
|
417
|
-
parts: Iterable[str]
|
|
418
|
-
if root is not None:
|
|
419
|
-
try:
|
|
420
|
-
parts = path.relative_to(root).parts
|
|
421
|
-
except ValueError:
|
|
422
|
-
parts = path.parts
|
|
423
|
-
else:
|
|
424
|
-
parts = path.parts
|
|
425
|
-
return any(part.startswith(".") and part not in {".", ".."} for part in parts)
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
def _excluded_directory_reason(
|
|
429
|
-
path: Path, *, root: Optional[Path] = None, os_type: Optional[str] = None
|
|
430
|
-
) -> Optional[str]:
|
|
431
|
-
os_type = os_type or _current_os_type()
|
|
432
|
-
name = path.name.lower()
|
|
433
|
-
if name in COMMON_EXCLUDED_DIRS:
|
|
434
|
-
return "excluded_folder"
|
|
435
|
-
if _is_hidden_path(path, root):
|
|
436
|
-
return "hidden_folder"
|
|
437
|
-
parts = _path_parts_lower(path)
|
|
438
|
-
if os_type == "windows" and any(part in WINDOWS_EXCLUDED_NAMES for part in parts):
|
|
439
|
-
return "system_folder"
|
|
440
|
-
normalized = path.as_posix()
|
|
441
|
-
root_normalized = root.as_posix() if root else ""
|
|
442
|
-
|
|
443
|
-
def _prefix_blocks(prefixes: Tuple[str, ...]) -> bool:
|
|
444
|
-
for prefix in prefixes:
|
|
445
|
-
path_under_prefix = normalized == prefix or normalized.startswith(
|
|
446
|
-
f"{prefix}/"
|
|
447
|
-
)
|
|
448
|
-
root_under_prefix = bool(root_normalized) and (
|
|
449
|
-
root_normalized == prefix or root_normalized.startswith(f"{prefix}/")
|
|
450
|
-
)
|
|
451
|
-
if path_under_prefix and not root_under_prefix:
|
|
452
|
-
return True
|
|
453
|
-
return False
|
|
454
|
-
|
|
455
|
-
if os_type == "macos":
|
|
456
|
-
home_library = Path.home() / "Library"
|
|
457
|
-
try:
|
|
458
|
-
root_is_library = bool(root) and _is_relative_to(
|
|
459
|
-
root.expanduser().resolve(), home_library.expanduser().resolve()
|
|
460
|
-
)
|
|
461
|
-
if (
|
|
462
|
-
_is_relative_to(
|
|
463
|
-
path.expanduser().resolve(), home_library.expanduser().resolve()
|
|
464
|
-
)
|
|
465
|
-
and not root_is_library
|
|
466
|
-
):
|
|
467
|
-
return "user_library"
|
|
468
|
-
except OSError:
|
|
469
|
-
pass
|
|
470
|
-
if _prefix_blocks(MACOS_EXCLUDED_PREFIXES):
|
|
471
|
-
return "system_folder"
|
|
472
|
-
if os_type == "linux":
|
|
473
|
-
if _prefix_blocks(LINUX_EXCLUDED_PREFIXES):
|
|
474
|
-
return "system_folder"
|
|
475
|
-
return None
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
def _sensitive_file_reason(path: Path, *, root: Optional[Path] = None) -> Optional[str]:
|
|
479
|
-
name = path.name.lower()
|
|
480
|
-
suffix = path.suffix.lower()
|
|
481
|
-
if name in COMMON_EXCLUDED_FILE_NAMES or suffix in COMMON_EXCLUDED_FILE_SUFFIXES:
|
|
482
|
-
return "sensitive_or_excluded_file"
|
|
483
|
-
try:
|
|
484
|
-
rel_text = (
|
|
485
|
-
path.relative_to(root).as_posix().lower()
|
|
486
|
-
if root
|
|
487
|
-
else path.as_posix().lower()
|
|
488
|
-
)
|
|
489
|
-
except ValueError:
|
|
490
|
-
rel_text = path.as_posix().lower()
|
|
491
|
-
tokens = re.split(r"[^0-9a-zA-Z_가-힣]+", rel_text)
|
|
492
|
-
if any(token in SENSITIVE_PATH_KEYWORDS for token in tokens):
|
|
493
|
-
return "sensitive_name"
|
|
494
|
-
return None
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
def _root_warning(path: Path, os_type: str) -> Optional[str]:
|
|
498
|
-
resolved = path.expanduser().resolve()
|
|
499
|
-
home = Path.home().expanduser().resolve()
|
|
500
|
-
if os_type == "macos" and resolved == home:
|
|
501
|
-
return "홈 전체에는 설정/숨김 폴더가 포함될 수 있습니다. 문서, 데스크탑, 다운로드, 프로젝트 폴더부터 추가하는 것을 권장합니다."
|
|
502
|
-
if os_type == "linux" and resolved.as_posix() == "/":
|
|
503
|
-
return "루트 디렉터리에는 시스템 파일이 포함되어 있습니다. 일반 사용자 폴더나 마운트된 데이터 폴더를 권장합니다."
|
|
504
|
-
if os_type == "windows" and str(resolved).rstrip("\\/").upper() in {"C:", "C:\\"}:
|
|
505
|
-
return "C드라이브에는 Windows 시스템 파일과 앱 설정 파일이 포함되어 있습니다. 하위 폴더를 선택하는 것을 권장합니다."
|
|
506
|
-
return None
|
|
507
|
-
|
|
7
|
+
import sys
|
|
8
|
+
import warnings
|
|
508
9
|
|
|
509
|
-
|
|
510
|
-
path: Path, root: Path, status: str, reason: str = ""
|
|
511
|
-
) -> Dict[str, Any]:
|
|
512
|
-
try:
|
|
513
|
-
rel = path.relative_to(root).as_posix()
|
|
514
|
-
except ValueError:
|
|
515
|
-
rel = path.name
|
|
516
|
-
try:
|
|
517
|
-
stat = path.stat()
|
|
518
|
-
size = stat.st_size if path.is_file() else None
|
|
519
|
-
modified_at = _safe_iso_from_stat_mtime(stat.st_mtime)
|
|
520
|
-
except OSError:
|
|
521
|
-
size = None
|
|
522
|
-
modified_at = ""
|
|
523
|
-
return {
|
|
524
|
-
"path": str(path),
|
|
525
|
-
"relative_path": rel,
|
|
526
|
-
"name": path.name,
|
|
527
|
-
"extension": path.suffix.lower(),
|
|
528
|
-
"status": status,
|
|
529
|
-
"reason": reason,
|
|
530
|
-
"size_bytes": size,
|
|
531
|
-
"modified_at": modified_at,
|
|
532
|
-
}
|
|
10
|
+
import lattice_brain.graph._kg_common as _impl
|
|
533
11
|
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
def _chunks(text: str, size: int = 1200, overlap: int = 160) -> List[str]:
|
|
540
|
-
cleaned = str(text or "").strip()
|
|
541
|
-
if not cleaned:
|
|
542
|
-
return []
|
|
543
|
-
chunks: List[str] = []
|
|
544
|
-
start = 0
|
|
545
|
-
while start < len(cleaned):
|
|
546
|
-
end = min(len(cleaned), start + size)
|
|
547
|
-
chunks.append(cleaned[start:end])
|
|
548
|
-
if end >= len(cleaned):
|
|
549
|
-
break
|
|
550
|
-
start = max(0, end - overlap)
|
|
551
|
-
return chunks
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
_LLM_EXTRACT_CONCEPT_PROMPT = """Extract the key concepts from the following text.
|
|
555
|
-
Return ONLY a JSON array of objects, each with "concept" (string) and "importance" (float 0-1).
|
|
556
|
-
Extract up to {limit} concepts. Focus on named entities, technical terms, and domain-specific nouns.
|
|
557
|
-
Do NOT include common words, stop words, or generic terms.
|
|
558
|
-
|
|
559
|
-
Text:
|
|
560
|
-
{text}
|
|
561
|
-
|
|
562
|
-
JSON:"""
|
|
563
|
-
|
|
564
|
-
_LLM_EXTRACT_TRIPLE_PROMPT = """Extract relationship triples from the following text.
|
|
565
|
-
Return ONLY a JSON array of objects, each with:
|
|
566
|
-
- "subject": source concept (string)
|
|
567
|
-
- "relation": relationship verb (string, Korean or English)
|
|
568
|
-
- "object": target concept (string)
|
|
569
|
-
- "evidence": the sentence supporting this triple (string, max 240 chars)
|
|
570
|
-
- "confidence": how confident you are (float 0-1)
|
|
571
|
-
|
|
572
|
-
Extract up to {limit} triples. Focus on meaningful semantic relationships.
|
|
573
|
-
|
|
574
|
-
Text:
|
|
575
|
-
{text}
|
|
576
|
-
|
|
577
|
-
Concepts already identified: {concepts}
|
|
578
|
-
|
|
579
|
-
JSON:"""
|
|
580
|
-
|
|
581
|
-
ENABLE_LLM_EXTRACTION = os.getenv("LATTICEAI_LLM_EXTRACTION", "true").lower() in (
|
|
582
|
-
"1",
|
|
583
|
-
"true",
|
|
584
|
-
"yes",
|
|
12
|
+
warnings.warn(
|
|
13
|
+
"latticeai.brain._kg_common is deprecated; import lattice_brain.graph._kg_common instead",
|
|
14
|
+
DeprecationWarning,
|
|
15
|
+
stacklevel=2,
|
|
585
16
|
)
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
def _llm_extract_concepts(text: str, limit: int = 12) -> Optional[List[str]]:
|
|
589
|
-
if not ENABLE_LLM_EXTRACTION or not _llm_router_ref:
|
|
590
|
-
return None
|
|
591
|
-
if not _llm_router_ref.current_model_id:
|
|
592
|
-
return None
|
|
593
|
-
prompt = _LLM_EXTRACT_CONCEPT_PROMPT.format(text=text[:3000], limit=limit)
|
|
594
|
-
try:
|
|
595
|
-
loop = asyncio.get_event_loop()
|
|
596
|
-
if loop.is_running():
|
|
597
|
-
import concurrent.futures
|
|
598
|
-
|
|
599
|
-
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
|
|
600
|
-
future = pool.submit(
|
|
601
|
-
asyncio.run,
|
|
602
|
-
_llm_router_ref.generate(prompt, max_tokens=1024, temperature=0.1),
|
|
603
|
-
)
|
|
604
|
-
raw = future.result(timeout=30)
|
|
605
|
-
else:
|
|
606
|
-
raw = asyncio.run(
|
|
607
|
-
_llm_router_ref.generate(prompt, max_tokens=1024, temperature=0.1)
|
|
608
|
-
)
|
|
609
|
-
raw = raw.strip()
|
|
610
|
-
if raw.startswith("```"):
|
|
611
|
-
raw = re.sub(r"^```(?:json)?\s*", "", raw)
|
|
612
|
-
raw = re.sub(r"\s*```$", "", raw)
|
|
613
|
-
parsed = json.loads(raw)
|
|
614
|
-
if isinstance(parsed, list):
|
|
615
|
-
concepts = []
|
|
616
|
-
for item in parsed[:limit]:
|
|
617
|
-
if isinstance(item, dict) and "concept" in item:
|
|
618
|
-
concepts.append(item["concept"])
|
|
619
|
-
elif isinstance(item, str):
|
|
620
|
-
concepts.append(item)
|
|
621
|
-
return concepts if concepts else None
|
|
622
|
-
except Exception as e:
|
|
623
|
-
logging.debug("LLM concept extraction failed (falling back to rules): %s", e)
|
|
624
|
-
return None
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
def _llm_extract_triples(
|
|
628
|
-
text: str, concepts: List[str], limit: int = 20
|
|
629
|
-
) -> Optional[List[Dict[str, str]]]:
|
|
630
|
-
if not ENABLE_LLM_EXTRACTION or not _llm_router_ref:
|
|
631
|
-
return None
|
|
632
|
-
if not _llm_router_ref.current_model_id:
|
|
633
|
-
return None
|
|
634
|
-
prompt = _LLM_EXTRACT_TRIPLE_PROMPT.format(
|
|
635
|
-
text=text[:3000],
|
|
636
|
-
limit=limit,
|
|
637
|
-
concepts=", ".join(concepts[:15]),
|
|
638
|
-
)
|
|
639
|
-
try:
|
|
640
|
-
loop = asyncio.get_event_loop()
|
|
641
|
-
if loop.is_running():
|
|
642
|
-
import concurrent.futures
|
|
643
|
-
|
|
644
|
-
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
|
|
645
|
-
future = pool.submit(
|
|
646
|
-
asyncio.run,
|
|
647
|
-
_llm_router_ref.generate(prompt, max_tokens=2048, temperature=0.1),
|
|
648
|
-
)
|
|
649
|
-
raw = future.result(timeout=30)
|
|
650
|
-
else:
|
|
651
|
-
raw = asyncio.run(
|
|
652
|
-
_llm_router_ref.generate(prompt, max_tokens=2048, temperature=0.1)
|
|
653
|
-
)
|
|
654
|
-
raw = raw.strip()
|
|
655
|
-
if raw.startswith("```"):
|
|
656
|
-
raw = re.sub(r"^```(?:json)?\s*", "", raw)
|
|
657
|
-
raw = re.sub(r"\s*```$", "", raw)
|
|
658
|
-
parsed = json.loads(raw)
|
|
659
|
-
if isinstance(parsed, list):
|
|
660
|
-
triples = []
|
|
661
|
-
for item in parsed[:limit]:
|
|
662
|
-
if isinstance(item, dict) and "subject" in item and "object" in item:
|
|
663
|
-
triples.append(
|
|
664
|
-
{
|
|
665
|
-
"subject": str(item["subject"]),
|
|
666
|
-
"relation": str(item.get("relation", "관련됨")),
|
|
667
|
-
"object": str(item["object"]),
|
|
668
|
-
"context": str(item.get("evidence", ""))[:240],
|
|
669
|
-
"confidence": float(item.get("confidence", 0.8)),
|
|
670
|
-
}
|
|
671
|
-
)
|
|
672
|
-
return triples if triples else None
|
|
673
|
-
except Exception as e:
|
|
674
|
-
logging.debug("LLM triple extraction failed (falling back to rules): %s", e)
|
|
675
|
-
return None
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
_CONCEPT_STOP: set = {
|
|
679
|
-
# English stop words
|
|
680
|
-
"the",
|
|
681
|
-
"and",
|
|
682
|
-
"for",
|
|
683
|
-
"with",
|
|
684
|
-
"this",
|
|
685
|
-
"that",
|
|
686
|
-
"from",
|
|
687
|
-
"into",
|
|
688
|
-
"which",
|
|
689
|
-
"are",
|
|
690
|
-
"was",
|
|
691
|
-
"were",
|
|
692
|
-
"has",
|
|
693
|
-
"have",
|
|
694
|
-
"had",
|
|
695
|
-
"can",
|
|
696
|
-
"will",
|
|
697
|
-
"would",
|
|
698
|
-
"could",
|
|
699
|
-
"should",
|
|
700
|
-
"may",
|
|
701
|
-
"might",
|
|
702
|
-
"must",
|
|
703
|
-
"shall",
|
|
704
|
-
"being",
|
|
705
|
-
"been",
|
|
706
|
-
"also",
|
|
707
|
-
"just",
|
|
708
|
-
"then",
|
|
709
|
-
"than",
|
|
710
|
-
"when",
|
|
711
|
-
"where",
|
|
712
|
-
"what",
|
|
713
|
-
"how",
|
|
714
|
-
"why",
|
|
715
|
-
"its",
|
|
716
|
-
"their",
|
|
717
|
-
"your",
|
|
718
|
-
"our",
|
|
719
|
-
"you",
|
|
720
|
-
"they",
|
|
721
|
-
"them",
|
|
722
|
-
"these",
|
|
723
|
-
"those",
|
|
724
|
-
"use",
|
|
725
|
-
"used",
|
|
726
|
-
"using",
|
|
727
|
-
"based",
|
|
728
|
-
"like",
|
|
729
|
-
"such",
|
|
730
|
-
"via",
|
|
731
|
-
"per",
|
|
732
|
-
"let",
|
|
733
|
-
"yes",
|
|
734
|
-
"not",
|
|
735
|
-
"but",
|
|
736
|
-
"are",
|
|
737
|
-
"all",
|
|
738
|
-
"any",
|
|
739
|
-
"out",
|
|
740
|
-
"new",
|
|
741
|
-
"get",
|
|
742
|
-
"set",
|
|
743
|
-
# Korean stop words
|
|
744
|
-
"사용자",
|
|
745
|
-
"내용",
|
|
746
|
-
"파일",
|
|
747
|
-
"채팅",
|
|
748
|
-
"답변",
|
|
749
|
-
"입니다",
|
|
750
|
-
"그리고",
|
|
751
|
-
"처럼",
|
|
752
|
-
"있어",
|
|
753
|
-
"없어",
|
|
754
|
-
"이야",
|
|
755
|
-
"이다",
|
|
756
|
-
"한다",
|
|
757
|
-
"하다",
|
|
758
|
-
"되다",
|
|
759
|
-
"됩니다",
|
|
760
|
-
"경우",
|
|
761
|
-
"방법",
|
|
762
|
-
"부분",
|
|
763
|
-
"상태",
|
|
764
|
-
"정도",
|
|
765
|
-
"결과",
|
|
766
|
-
"이후",
|
|
767
|
-
"이전",
|
|
768
|
-
"그것",
|
|
769
|
-
"이것",
|
|
770
|
-
"저것",
|
|
771
|
-
"여기",
|
|
772
|
-
"거기",
|
|
773
|
-
"저기",
|
|
774
|
-
"우리",
|
|
775
|
-
"저희",
|
|
776
|
-
"기능",
|
|
777
|
-
"서버",
|
|
778
|
-
"모델",
|
|
779
|
-
"설정",
|
|
780
|
-
"설명",
|
|
781
|
-
"버전",
|
|
782
|
-
"지원",
|
|
783
|
-
"사용",
|
|
784
|
-
"실행",
|
|
785
|
-
"todo",
|
|
786
|
-
"fixme",
|
|
787
|
-
"note",
|
|
788
|
-
"참고",
|
|
789
|
-
"주의",
|
|
790
|
-
"warning",
|
|
791
|
-
}
|
|
792
|
-
|
|
793
|
-
|
|
794
|
-
def _extract_concepts(text: str, limit: int = 12) -> List[str]:
|
|
795
|
-
"""LLM-first concept extraction with rule-based fallback."""
|
|
796
|
-
llm_result = _llm_extract_concepts(text, limit)
|
|
797
|
-
if llm_result:
|
|
798
|
-
return llm_result
|
|
799
|
-
return _extract_concepts_rules(text, limit)
|
|
800
|
-
|
|
801
|
-
|
|
802
|
-
def _extract_concepts_rules(text: str, limit: int = 12) -> List[str]:
|
|
803
|
-
"""Extract meaningful named concepts from text (rule-based).
|
|
804
|
-
|
|
805
|
-
Priority order:
|
|
806
|
-
1. Backtick / quoted terms (explicitly technical)
|
|
807
|
-
2. Multi-word proper nouns (Lattice AI, GPT-4o, Claude Sonnet)
|
|
808
|
-
3. Single capitalized proper nouns not at sentence start (Claude, Python, FastAPI)
|
|
809
|
-
4. Korean compound technical terms (멀티모달, 에이전트, 그래프RAG)
|
|
810
|
-
5. Hyphenated / versioned identifiers (gpt-4o, mlx-vlm, gemma-4)
|
|
811
|
-
"""
|
|
812
|
-
text = str(text or "")
|
|
813
|
-
seen: dict = {} # concept_lower → original form
|
|
814
|
-
|
|
815
|
-
def _add(term: str) -> None:
|
|
816
|
-
key = term.strip().lower()
|
|
817
|
-
if key and key not in _CONCEPT_STOP and not key.isdigit() and len(key) >= 2:
|
|
818
|
-
seen.setdefault(key, term.strip())
|
|
819
|
-
|
|
820
|
-
# 1. Backtick-quoted code/term (highest confidence)
|
|
821
|
-
for m in re.findall(r"`([^`]{2,40})`", text):
|
|
822
|
-
if not re.search(r"[\(\)\[\]{}]", m): # skip code expressions
|
|
823
|
-
_add(m)
|
|
824
|
-
|
|
825
|
-
# 2. Double/single quoted terms
|
|
826
|
-
for m in re.findall(r'"([^"]{2,40})"', text):
|
|
827
|
-
_add(m)
|
|
828
|
-
|
|
829
|
-
# 3. Multi-word English proper nouns (Title Case or ALL-CAPS first word, 2–4 words).
|
|
830
|
-
# Pattern A: Mixed-case first word — "Lattice AI", "Tool Use", "Graph RAG"
|
|
831
|
-
for m in re.findall(
|
|
832
|
-
r"([A-Z][a-z]{1,20}(?:\s+(?:[A-Z]{2,10}|[A-Z][a-z0-9]{1,20}|\d[\w.]{0,6})){1,3})",
|
|
833
|
-
text,
|
|
834
|
-
):
|
|
835
|
-
_add(m)
|
|
836
|
-
# Pattern B: ALL-CAPS first word — "VS Code", "MCP Server", "GPT-4o Mini"
|
|
837
|
-
for m in re.findall(
|
|
838
|
-
r"([A-Z]{2,6}(?:\s+(?:[A-Z]{2,10}|[A-Z][a-z0-9]{1,20})){1,2})",
|
|
839
|
-
text,
|
|
840
|
-
):
|
|
841
|
-
_add(m)
|
|
842
|
-
|
|
843
|
-
# 4. Single capitalized proper noun.
|
|
844
|
-
# Use ASCII-boundary lookaround instead of \b so Korean particles
|
|
845
|
-
# (와, 의, 는 …) after an English word don't block the match.
|
|
846
|
-
all_caps_words = re.findall(
|
|
847
|
-
r"(?<![A-Za-z0-9])([A-Z][A-Za-z0-9]{2,24})(?![A-Za-z0-9])", text
|
|
848
|
-
)
|
|
849
|
-
freq: Dict[str, int] = {}
|
|
850
|
-
for w in all_caps_words:
|
|
851
|
-
freq[w] = freq.get(w, 0) + 1
|
|
852
|
-
sentence_starts = set(re.findall(r"(?:^|(?<=[.!?])\s+)([A-Z][a-z]+)", text))
|
|
853
|
-
for m, cnt in freq.items():
|
|
854
|
-
if m.lower() in _CONCEPT_STOP:
|
|
855
|
-
continue
|
|
856
|
-
if cnt >= 2 or m not in sentence_starts:
|
|
857
|
-
_add(m)
|
|
858
|
-
|
|
859
|
-
# 5. Korean technical compound nouns (3–12 chars, no common particles)
|
|
860
|
-
for m in re.findall(
|
|
861
|
-
r"[가-힣]{2,12}(?:AI|LLM|API|UI|RAG|bot|Bot|기능|모델|서버|에이전트|파이프라인|워크플로)",
|
|
862
|
-
text,
|
|
863
|
-
):
|
|
864
|
-
_add(m)
|
|
865
|
-
# Korean standalone terms that appear after topic markers (은/는/이/가 앞)
|
|
866
|
-
for m in re.findall(
|
|
867
|
-
r"([가-힣]{2,12})(?:은|는|이|가|을|를|의|에서|으로|와|과)", text
|
|
868
|
-
):
|
|
869
|
-
if m.lower() not in _CONCEPT_STOP and len(m) >= 2:
|
|
870
|
-
# Only add if it's non-trivial (has 3+ chars or appears multiple times)
|
|
871
|
-
cnt = text.count(m)
|
|
872
|
-
if len(m) >= 3 or cnt >= 2:
|
|
873
|
-
_add(m)
|
|
874
|
-
|
|
875
|
-
# 6. Hyphenated / versioned identifiers (gpt-4o, gemma-4, mlx-vlm)
|
|
876
|
-
for m in re.findall(r"\b([a-zA-Z][a-zA-Z0-9]*(?:-[a-zA-Z0-9.]+)+)\b", text):
|
|
877
|
-
if len(m) >= 4:
|
|
878
|
-
_add(m)
|
|
879
|
-
|
|
880
|
-
# De-duplicate: remove shorter if ALL its occurrences in the source text
|
|
881
|
-
# are followed immediately by the suffix that forms the longer concept.
|
|
882
|
-
# "Lattice" → dropped when every occurrence is "Lattice AI"
|
|
883
|
-
# "Claude" → kept because it appears as just "Claude" too.
|
|
884
|
-
values = list(seen.values())
|
|
885
|
-
values_lower = [v.lower() for v in values]
|
|
886
|
-
keep = set(range(len(values)))
|
|
887
|
-
for i, v in enumerate(values):
|
|
888
|
-
vl = v.lower()
|
|
889
|
-
for j, wl in enumerate(values_lower):
|
|
890
|
-
if i == j or j not in keep:
|
|
891
|
-
continue
|
|
892
|
-
# Check if vl is a word-prefix of wl
|
|
893
|
-
suffix = wl[len(vl) :]
|
|
894
|
-
if not (wl.startswith(vl) and re.match(r"^[\s\-]", suffix)):
|
|
895
|
-
continue
|
|
896
|
-
# Count occurrences of v NOT followed by the suffix
|
|
897
|
-
suffix_stripped = suffix.lstrip(" -")
|
|
898
|
-
# Escape for regex
|
|
899
|
-
pattern_with_suffix = re.escape(v) + r"[\s\-]+" + re.escape(suffix_stripped)
|
|
900
|
-
pattern_alone = (
|
|
901
|
-
re.escape(v) + r"(?![\s\-]*" + re.escape(suffix_stripped) + r")"
|
|
902
|
-
)
|
|
903
|
-
alone_count = len(re.findall(pattern_alone, text, re.IGNORECASE))
|
|
904
|
-
if alone_count == 0:
|
|
905
|
-
# Shorter term never appears alone → safe to remove
|
|
906
|
-
keep.discard(i)
|
|
907
|
-
break
|
|
908
|
-
|
|
909
|
-
final = [values[i] for i in range(len(values)) if i in keep]
|
|
910
|
-
return final[:limit]
|
|
911
|
-
|
|
912
|
-
|
|
913
|
-
# ──────────────────────────────────────────────────────────────────────────────
|
|
914
|
-
# Node type taxonomy (점 = 명사)
|
|
915
|
-
# ──────────────────────────────────────────────────────────────────────────────
|
|
916
|
-
# Chat — 대화 세션
|
|
917
|
-
# Document — 파일 (PDF·PPT·Word·Excel·이미지 등)
|
|
918
|
-
# Concept — 개념·아이디어·기술 용어
|
|
919
|
-
# Person — 사람 (사용자, 언급된 인물)
|
|
920
|
-
# Error — 오류·버그·예외
|
|
921
|
-
# Code — 코드 스니펫·함수·클래스
|
|
922
|
-
# Feature — 소프트웨어 기능
|
|
923
|
-
# Task — 할 일·액션 아이템
|
|
924
|
-
# Decision — 결정 사항
|
|
925
|
-
|
|
926
|
-
# Edge type vocabulary (edge = verb; labels are Korean past-tense predicates).
# Maps each edge label to a regex alternation of Korean/English trigger
# keywords. _infer_edge() searches these patterns against the lowercased
# sentence and returns the label of the first one that matches, so entries
# earlier in this dict take precedence when several patterns match.
# NOTE(review): the patterns are unanchored, so short alternatives such as
# "use", "fix" or "란" can also match inside longer words - confirm intended.
EDGE_VERB = {
    "언급함": r"언급|mention|refer|cited",
    "포함함": r"포함|include|consist|구성|탑재|contains",
    "해결함": r"해결|resolv|fix|수정|고쳤|closed",
    "의존함": r"의존|depend|require|필요|based on",
    "설명함": r"설명|explain|describe|정의|란|이란|means",
    "비교함": r"비교|versus|vs\.?|차이|다르|compare",
    "사용함": r"사용|use|활용|이용|apply",
    "연결함": r"연결|connect|통합|integrate|연동|link",
    "확장함": r"확장|extend|플러그인|plugin|addon",
    "생성함": r"생성|만들|create|generate|build|produced",
    "대체함": r"대체|replace|instead|alternative",
    "지원함": r"지원|support|제공|provide|offer",
    "발생함": r"발생|occur|throw|raise|triggered",
    "관련됨": r"관련|related|associated|연관",
}
|
|
943
|
-
|
|
944
|
-
|
|
945
|
-
def _infer_edge(sentence: str) -> str:
    """Pick the edge label whose keyword pattern first matches the sentence.

    Patterns are tried in EDGE_VERB order against the lowercased sentence;
    the generic "관련됨" label is returned when none of them match.
    """
    lowered = sentence.lower()
    matches = (
        label for label, pattern in EDGE_VERB.items() if re.search(pattern, lowered)
    )
    return next(matches, "관련됨")
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
# Technical words that cannot be person names.
# All entries are lowercase. _classify_node_type() checks the lowercased
# words of a two-word Title Case concept against this set and refuses the
# "Person" label if any word is listed here.
_NOT_PERSON_WORDS: set = {
    "use",
    "api",
    "rag",
    "sdk",
    "ide",
    "cli",
    "llm",
    "mcp",
    "ui",
    "ux",
    "new",
    "old",
    "get",
    "set",
    "run",
    "add",
    "fix",
    "tool",
    "code",
    "base",
    "core",
    "data",
    "file",
    "test",
    "type",
    "mode",
    "view",
}
|
|
984
|
-
|
|
985
|
-
|
|
986
|
-
def _classify_node_type(concept: str, text: str) -> str:
|
|
987
|
-
"""Classify a concept into the node taxonomy.
|
|
988
|
-
|
|
989
|
-
Term-level signals take priority; then a tight ±60-char window is used
|
|
990
|
-
so distant keywords don't cause mis-classification.
|
|
991
|
-
"""
|
|
992
|
-
term = concept.lower()
|
|
993
|
-
|
|
994
|
-
# ── Term-level signals (highest confidence) ───────────────────────────
|
|
995
|
-
if re.search(r"(?:error|exception|traceback|오류|에러|버그)$", term, re.I):
|
|
996
|
-
return "Error"
|
|
997
|
-
if re.search(r"error|exception|err\b", term, re.I) and len(concept) < 30:
|
|
998
|
-
return "Error"
|
|
999
|
-
if re.search(r"\(\)|\.py$|\.js$|\.ts$|\.go$|::\w", term):
|
|
1000
|
-
return "Code"
|
|
1001
|
-
|
|
1002
|
-
# Person: "First Last" pattern, neither word is a known technical term
|
|
1003
|
-
if re.match(r"^[A-Z][a-z]{1,15} [A-Z][a-z]{1,15}$", concept):
|
|
1004
|
-
words = term.split()
|
|
1005
|
-
if not any(w in _NOT_PERSON_WORDS for w in words):
|
|
1006
|
-
return "Person"
|
|
1007
|
-
|
|
1008
|
-
# ── Windowed context (±60 chars) — NOT used for Error to avoid false positives
|
|
1009
|
-
idx = text.lower().find(term)
|
|
1010
|
-
if idx >= 0:
|
|
1011
|
-
win = text[max(0, idx - 60) : idx + len(concept) + 60].lower()
|
|
1012
|
-
if re.search(r"def |class |function|함수|클래스|메서드|import", win):
|
|
1013
|
-
return "Code"
|
|
1014
|
-
# Feature: concept appears DIRECTLY adjacent to 기능/feature keyword
|
|
1015
|
-
if len(concept) <= 12 and re.search(
|
|
1016
|
-
rf"{re.escape(term)}.{{0,8}}(?:기능|feature)|(?:기능|feature).{{0,8}}{re.escape(term)}",
|
|
1017
|
-
win,
|
|
1018
|
-
):
|
|
1019
|
-
return "Feature"
|
|
1020
|
-
|
|
1021
|
-
return "Concept"
|
|
1022
|
-
|
|
1023
|
-
|
|
1024
|
-
def _extract_triples(
    text: str,
    concepts: List[str],
    limit: int = 20,
) -> List[Dict[str, str]]:
    """Extract triples, trying the LLM extractor before the rule-based one.

    The rule-based extractor is consulted only when the LLM extractor
    returns a falsy result (for example an empty list).
    """
    return _llm_extract_triples(text, concepts, limit) or _extract_triples_rules(
        text, concepts, limit
    )
|
|
1034
|
-
|
|
1035
|
-
|
|
1036
|
-
def _extract_triples_rules(
|
|
1037
|
-
text: str,
|
|
1038
|
-
concepts: List[str],
|
|
1039
|
-
limit: int = 20,
|
|
1040
|
-
) -> List[Dict[str, str]]:
|
|
1041
|
-
"""Extract (subject, verb-edge, object, context) triples from text (rule-based).
|
|
1042
|
-
|
|
1043
|
-
For each sentence containing ≥2 concepts, infer the verb-form edge label
|
|
1044
|
-
from surrounding context and create a directed triple.
|
|
1045
|
-
"""
|
|
1046
|
-
if len(concepts) < 2:
|
|
1047
|
-
return []
|
|
1048
|
-
|
|
1049
|
-
concept_lower = {c.lower(): c for c in concepts}
|
|
1050
|
-
triples: List[Dict[str, str]] = []
|
|
1051
|
-
seen_pairs: set = set()
|
|
1052
|
-
|
|
1053
|
-
# Split on sentence boundaries
|
|
1054
|
-
sentences = re.split(r"(?<=[.!?\n])\s+|\n{2,}", text)
|
|
1055
|
-
for sent in sentences:
|
|
1056
|
-
sent = sent.strip()
|
|
1057
|
-
if len(sent) < 8:
|
|
1058
|
-
continue
|
|
1059
|
-
sent_lower = sent.lower()
|
|
1060
|
-
|
|
1061
|
-
present = [concept_lower[k] for k in concept_lower if k in sent_lower]
|
|
1062
|
-
if len(present) < 2:
|
|
1063
|
-
continue
|
|
1064
|
-
|
|
1065
|
-
edge = _infer_edge(sent)
|
|
1066
|
-
|
|
1067
|
-
for i in range(len(present) - 1):
|
|
1068
|
-
subj, obj = present[i], present[i + 1]
|
|
1069
|
-
# Deduplicate by (subj, obj) regardless of direction for same edge
|
|
1070
|
-
pair_key = tuple(sorted([subj.lower(), obj.lower()])) + (edge,)
|
|
1071
|
-
if pair_key in seen_pairs:
|
|
1072
|
-
continue
|
|
1073
|
-
seen_pairs.add(pair_key)
|
|
1074
|
-
triples.append(
|
|
1075
|
-
{
|
|
1076
|
-
"subject": subj,
|
|
1077
|
-
"relation": edge, # verb form (동사)
|
|
1078
|
-
"object": obj,
|
|
1079
|
-
"context": sent[:240],
|
|
1080
|
-
}
|
|
1081
|
-
)
|
|
1082
|
-
if len(triples) >= limit:
|
|
1083
|
-
return triples
|
|
1084
|
-
|
|
1085
|
-
return triples
|
|
1086
|
-
|
|
1087
|
-
|
|
1088
|
-
def _semantic_items(text: str) -> List[Dict[str, str]]:
|
|
1089
|
-
"""Extract explicit decision / task items from text."""
|
|
1090
|
-
items: List[Dict[str, str]] = []
|
|
1091
|
-
for raw_line in str(text or "").splitlines():
|
|
1092
|
-
line = _clean_text(raw_line)
|
|
1093
|
-
if len(line) < 6:
|
|
1094
|
-
continue
|
|
1095
|
-
lowered = line.lower()
|
|
1096
|
-
if re.search(r"(결정|확정|하기로|decided|decision)", lowered):
|
|
1097
|
-
items.append(
|
|
1098
|
-
{"type": "Decision", "title": line[:120], "summary": line[:500]}
|
|
1099
|
-
)
|
|
1100
|
-
if re.search(r"(todo|해야|하자|진행|구현|수정|확인|next|task|\[ \])", lowered):
|
|
1101
|
-
items.append({"type": "Task", "title": line[:120], "summary": line[:500]})
|
|
1102
|
-
return items[:8]
|
|
1103
|
-
|
|
1104
|
-
|
|
1105
|
-
def _topic_candidates(text: str, limit: int = 8) -> List[str]:
    """Produce compact keyword candidates for fallback graph search.

    Concept extraction is tried first. Only when it yields nothing is the
    text tokenised directly into ASCII identifiers and Korean words, with
    stop words and pure digits removed and duplicates merged
    case-insensitively.
    """
    extracted = _extract_concepts(text, limit=limit)
    if extracted:
        return extracted[:limit]

    token_pattern = r"[A-Za-z][A-Za-z0-9_.:-]{2,}|[가-힣]{2,12}"
    unique: Dict[str, str] = {}
    for token in re.findall(token_pattern, str(text or "")):
        lowered = token.lower()
        if lowered in _CONCEPT_STOP or lowered.isdigit():
            continue
        if lowered not in unique:
            unique[lowered] = token  # keep the first spelling seen
        if len(unique) >= limit:
            break
    return list(unique.values())[:limit]
|
|
1121
|
-
|
|
1122
|
-
|
|
1123
|
-
# Export every module-level name that does not start with a double
# underscore. This deliberately includes single-underscore helpers (and any
# imported names present in globals()), so `from <module> import *` picks
# them up as well.
__all__ = [name for name in globals() if not name.startswith("__")]
|
|
17
|
+
# Rebind this module's entry in sys.modules to _impl, so later lookups of
# this module name resolve to _impl instead of this module object.
# NOTE(review): _impl is defined outside the visible chunk - presumably the
# relocated implementation module; confirm.
sys.modules[__name__] = _impl
|