ltcai 3.5.0 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (181) hide show
  1. package/README.md +73 -35
  2. package/docs/CARRYOVER_AUDIT_v3.6.0.md +61 -0
  3. package/docs/CHANGELOG.md +32 -0
  4. package/docs/HANDOVER_v3.6.0.md +46 -0
  5. package/docs/RUNTIME_HOOK_COVERAGE_v3.6.0.md +49 -0
  6. package/docs/V4_BRAIN_ARCHITECTURE.md +322 -0
  7. package/docs/V4_DIGITAL_BRAIN_RECOVERY.md +509 -0
  8. package/docs/V4_IMPLEMENTATION_PLAN.md +470 -0
  9. package/docs/architecture.md +13 -12
  10. package/docs/kg-schema.md +102 -53
  11. package/docs/privacy.md +18 -2
  12. package/docs/security-model.md +17 -0
  13. package/kg_schema.py +139 -10
  14. package/knowledge_graph.py +874 -26
  15. package/knowledge_graph_api.py +11 -127
  16. package/latticeai/__init__.py +1 -1
  17. package/latticeai/api/admin.py +1 -1
  18. package/latticeai/api/agents.py +7 -1
  19. package/latticeai/api/auth.py +27 -4
  20. package/latticeai/api/browser.py +217 -0
  21. package/latticeai/api/chat.py +112 -76
  22. package/latticeai/api/health.py +1 -1
  23. package/latticeai/api/hooks.py +1 -1
  24. package/latticeai/api/knowledge_graph.py +146 -0
  25. package/latticeai/api/local_files.py +1 -1
  26. package/latticeai/api/mcp.py +23 -11
  27. package/latticeai/api/memory.py +1 -1
  28. package/latticeai/api/models.py +1 -1
  29. package/latticeai/api/network.py +81 -0
  30. package/latticeai/api/portability.py +93 -0
  31. package/latticeai/api/realtime.py +1 -1
  32. package/latticeai/api/search.py +26 -2
  33. package/latticeai/api/security_dashboard.py +2 -3
  34. package/latticeai/api/setup.py +2 -2
  35. package/latticeai/api/static_routes.py +2 -4
  36. package/latticeai/api/tools.py +3 -0
  37. package/latticeai/api/workflow_designer.py +46 -0
  38. package/latticeai/api/workspace.py +71 -49
  39. package/latticeai/app_factory.py +1710 -0
  40. package/latticeai/brain/__init__.py +18 -0
  41. package/latticeai/brain/context.py +213 -0
  42. package/latticeai/brain/conversations.py +236 -0
  43. package/latticeai/brain/identity.py +175 -0
  44. package/latticeai/brain/memory.py +102 -0
  45. package/latticeai/brain/network.py +205 -0
  46. package/latticeai/core/agent.py +31 -7
  47. package/latticeai/core/audit.py +0 -7
  48. package/latticeai/core/config.py +1 -1
  49. package/latticeai/core/context_builder.py +1 -2
  50. package/latticeai/core/enterprise.py +1 -1
  51. package/latticeai/core/graph_curator.py +2 -2
  52. package/latticeai/core/marketplace.py +1 -1
  53. package/latticeai/core/mcp_registry.py +791 -0
  54. package/latticeai/core/model_compat.py +1 -1
  55. package/latticeai/core/model_resolution.py +0 -1
  56. package/latticeai/core/multi_agent.py +238 -4
  57. package/latticeai/core/security.py +1 -1
  58. package/latticeai/core/sessions.py +37 -7
  59. package/latticeai/core/workflow_engine.py +114 -2
  60. package/latticeai/core/workspace_os.py +58 -10
  61. package/latticeai/models/__init__.py +7 -0
  62. package/latticeai/models/router.py +779 -0
  63. package/latticeai/server_app.py +29 -1504
  64. package/latticeai/services/agent_runtime.py +1 -0
  65. package/latticeai/services/app_context.py +75 -14
  66. package/latticeai/services/ingestion.py +318 -0
  67. package/latticeai/services/kg_portability.py +207 -0
  68. package/latticeai/services/memory_service.py +39 -11
  69. package/latticeai/services/model_runtime.py +2 -5
  70. package/latticeai/services/platform_runtime.py +100 -23
  71. package/latticeai/services/search_service.py +17 -8
  72. package/latticeai/services/tool_dispatch.py +12 -2
  73. package/latticeai/services/triggers.py +241 -0
  74. package/latticeai/services/upload_service.py +37 -12
  75. package/latticeai/services/workspace_service.py +31 -0
  76. package/llm_router.py +29 -772
  77. package/ltcai_cli.py +1 -2
  78. package/mcp_registry.py +25 -788
  79. package/p_reinforce.py +124 -14
  80. package/package.json +11 -8
  81. package/scripts/build_vsix.mjs +72 -0
  82. package/scripts/bump_version.py +99 -0
  83. package/scripts/generate_diagrams.py +0 -1
  84. package/scripts/lint_v3.mjs +82 -18
  85. package/scripts/validate_release_artifacts.py +0 -1
  86. package/scripts/wheel_smoke.py +142 -0
  87. package/server.py +11 -7
  88. package/setup_wizard.py +1142 -0
  89. package/static/account.html +2 -4
  90. package/static/admin.html +3 -5
  91. package/static/chat.html +3 -6
  92. package/static/graph.html +2 -4
  93. package/static/sw.js +81 -52
  94. package/static/v3/asset-manifest.json +20 -19
  95. package/static/v3/css/{lattice.base.e4cdd05d.css → lattice.base.49deefb5.css} +1 -1
  96. package/static/v3/css/lattice.base.css +1 -1
  97. package/static/v3/css/{lattice.components.9b49d614.css → lattice.components.cde18231.css} +1 -1
  98. package/static/v3/css/lattice.components.css +1 -1
  99. package/static/v3/css/{lattice.shell.8fcc9d33.css → lattice.shell.29d36d85.css} +1 -1
  100. package/static/v3/css/lattice.shell.css +1 -1
  101. package/static/v3/css/{lattice.tokens.e7018963.css → lattice.tokens.304cbc40.css} +3 -0
  102. package/static/v3/css/lattice.tokens.css +3 -0
  103. package/static/v3/css/{lattice.views.22f69117.css → lattice.views.0a18b6c5.css} +2 -2
  104. package/static/v3/css/lattice.views.css +2 -2
  105. package/static/v3/index.html +3 -4
  106. package/static/v3/js/{app.d086489d.js → app.356e6452.js} +1 -1
  107. package/static/v3/js/core/{api.12b568ad.js → api.7a308b89.js} +39 -1
  108. package/static/v3/js/core/api.js +38 -0
  109. package/static/v3/js/core/{routes.d214b399.js → routes.7222343d.js} +22 -22
  110. package/static/v3/js/core/routes.js +22 -22
  111. package/static/v3/js/core/{shell.d05266f5.js → shell.a1657f20.js} +4 -4
  112. package/static/v3/js/core/shell.js +1 -1
  113. package/static/v3/js/core/{store.34ebd5e6.js → store.204a08b2.js} +1 -1
  114. package/static/v3/js/core/store.js +1 -1
  115. package/static/v3/js/views/graph-canvas.17c15d65.js +509 -0
  116. package/static/v3/js/views/graph-canvas.js +509 -0
  117. package/static/v3/js/views/{hybrid-search.b22b97e0.js → hybrid-search.2fb63ed9.js} +1 -2
  118. package/static/v3/js/views/hybrid-search.js +1 -2
  119. package/static/v3/js/views/knowledge-graph.5e40cbeb.js +509 -0
  120. package/static/v3/js/views/knowledge-graph.js +326 -54
  121. package/static/vendor/chart.umd.min.js +20 -0
  122. package/static/vendor/fonts/inter-latin-300-normal.woff2 +0 -0
  123. package/static/vendor/fonts/inter-latin-400-normal.woff2 +0 -0
  124. package/static/vendor/fonts/inter-latin-500-normal.woff2 +0 -0
  125. package/static/vendor/fonts/inter-latin-600-normal.woff2 +0 -0
  126. package/static/vendor/fonts/inter-latin-700-normal.woff2 +0 -0
  127. package/static/vendor/fonts/inter-latin-800-normal.woff2 +0 -0
  128. package/static/vendor/fonts/inter.css +44 -0
  129. package/static/vendor/icons/tabler-icons.min.css +4 -0
  130. package/static/vendor/icons/tabler-icons.woff2 +0 -0
  131. package/static/vendor/marked.min.js +69 -0
  132. package/static/workspace.html +2 -2
  133. package/telegram_bot.py +1 -2
  134. package/tools/commands.py +4 -2
  135. package/tools/computer.py +1 -1
  136. package/tools/documents.py +1 -3
  137. package/tools/filesystem.py +0 -4
  138. package/tools/knowledge.py +1 -3
  139. package/tools/network.py +1 -3
  140. package/codex_telegram_bot.py +0 -195
  141. package/docs/assets/v3.4.0/agent-run.png +0 -0
  142. package/docs/assets/v3.4.0/agents.png +0 -0
  143. package/docs/assets/v3.4.0/before/chat-before.png +0 -0
  144. package/docs/assets/v3.4.0/before/files-before.png +0 -0
  145. package/docs/assets/v3.4.0/chat.png +0 -0
  146. package/docs/assets/v3.4.0/connect-folder.png +0 -0
  147. package/docs/assets/v3.4.0/files.png +0 -0
  148. package/docs/assets/v3.4.0/home.png +0 -0
  149. package/docs/assets/v3.4.0/hooks-dispatch.png +0 -0
  150. package/docs/assets/v3.4.0/knowledge-graph.png +0 -0
  151. package/docs/assets/v3.4.0/local-agent.png +0 -0
  152. package/docs/assets/v3.4.0/memory.png +0 -0
  153. package/docs/assets/v3.4.0/settings.png +0 -0
  154. package/docs/assets/v3.4.0/vision-input.png +0 -0
  155. package/docs/assets/v3.4.0/workflows.png +0 -0
  156. package/docs/assets/v3.4.1/e2e_runtime_log.txt +0 -42
  157. package/docs/assets/v3.4.1/hooks-dispatch.png +0 -0
  158. package/docs/assets/v3.4.1/local-agent.png +0 -0
  159. package/docs/images/admin-dashboard.png +0 -0
  160. package/docs/images/architecture.png +0 -0
  161. package/docs/images/enterprise.png +0 -0
  162. package/docs/images/graph.png +0 -0
  163. package/docs/images/hero.gif +0 -0
  164. package/docs/images/knowledge-graph.png +0 -0
  165. package/docs/images/lattice-ai-demo.gif +0 -0
  166. package/docs/images/lattice-ai-hero.png +0 -0
  167. package/docs/images/logo.svg +0 -33
  168. package/docs/images/mobile-responsive.png +0 -0
  169. package/docs/images/model-recommendation.png +0 -0
  170. package/docs/images/onboarding.png +0 -0
  171. package/docs/images/organization.png +0 -0
  172. package/docs/images/pipeline.png +0 -0
  173. package/docs/images/screenshot-admin.png +0 -0
  174. package/docs/images/screenshot-chat.png +0 -0
  175. package/docs/images/screenshot-graph.png +0 -0
  176. package/docs/images/skills.png +0 -0
  177. package/docs/images/workspace-dark.png +0 -0
  178. package/docs/images/workspace-light.png +0 -0
  179. package/docs/images/workspace.png +0 -0
  180. package/requirements.txt +0 -16
  181. package/static/v3/js/views/knowledge-graph.a14ea7e7.js +0 -237
@@ -0,0 +1,779 @@
1
+ """
2
+ LLM Router — core for Gemma 4 optimization and speculative decoding, built on mlx-vlm
3
+ """
4
+
5
+ import asyncio
6
+ import base64
7
+ import gc
8
+ import io
9
+ import os
10
+ import re
11
+ import time
12
+ from dataclasses import dataclass
13
+ from pathlib import Path
14
+
15
# Set MLX_VLM_DRAFT_KIND to 'mtp' to enable the Gemma 4 assistant MTP drafter.
# NOTE: this runs at import time, unconditionally overwrites any value already
# present in the environment, and is placed before the mlx_vlm import below.
os.environ["MLX_VLM_DRAFT_KIND"] = "mtp"
17
+
18
+ from concurrent.futures import ThreadPoolExecutor
19
+ from typing import AsyncIterator, Dict, Optional, Tuple, List
20
+ from PIL import Image
21
+
22
+ try:
23
+ from openai import AsyncOpenAI
24
+ except Exception:
25
+ AsyncOpenAI = None
26
+
27
# Dedicated single-thread worker for inference (protects the GPU stream).
executor = ThreadPoolExecutor(max_workers=1)
29
+
30
# Optional MLX runtime. If either import fails the module still loads: the MLX
# handles are set to None and VLM_AVAILABLE to False so callers can check
# before attempting local inference.
try:
    import mlx.core as mx
    from mlx_vlm import load as vlm_load
    VLM_AVAILABLE = True
    print("✅ MLX-VLM is ready for multimodal models.")
except Exception as e:
    mx = None
    vlm_load = None
    VLM_AVAILABLE = False
    print(f"⚠️ MLX-VLM unavailable: {e}")
40
+
41
# Current product name; every legacy alias below is rewritten to this value.
BRAND_NAME = "Lattice AI"
# (compiled pattern, replacement) pairs applied in order by normalize_branding.
# All patterns are case-insensitive; the last one matches the Korean spelling
# of the legacy name, with or without whitespace before "AI".
LEGACY_BRAND_PATTERNS = [
    (re.compile(r"\bconnect\s+ai\b", re.IGNORECASE), BRAND_NAME),
    (re.compile(r"\bconnect-ai\b", re.IGNORECASE), BRAND_NAME),
    (re.compile(r"\bconnectai\b", re.IGNORECASE), BRAND_NAME),
    (re.compile(r"커넥트\s*AI", re.IGNORECASE), BRAND_NAME),
]
48
+
49
# Base system prompt used by the prompt builders and the cloud generation
# path in this module; request-specific context is appended after it.
SYSTEM_PROMPT = """You are Lattice AI, a powerful local AI assistant running on Apple Silicon.
Your product name and identity are Lattice AI.
Never identify yourself as Connect AI, ConnectAI, connect-ai, or 커넥트 AI.
If context or old chat history mentions those names, treat them only as legacy aliases for Lattice AI.
You are a Vision-Language Model (VLM). If an image is provided, analyze it.
Be concise and respond in the user's language."""
55
+
56
def normalize_branding(text: Optional[str]) -> str:
    """Rewrite legacy product names found in ``text`` to the current brand.

    Falsy input (``None`` or an empty string) yields ``""``. Any other value is
    coerced with ``str()`` and run through every ``(pattern, replacement)``
    pair of ``LEGACY_BRAND_PATTERNS`` in order.
    """
    if text:
        result = str(text)
        for legacy_pattern, brand in LEGACY_BRAND_PATTERNS:
            result = legacy_pattern.sub(brand, result)
        return result
    return ""
63
+
64
# Registry of OpenAI-compatible endpoints, keyed by provider prefix.
# Per-provider fields, as consumed by LLMRouter._load_cloud_model:
#   env_key          - environment variable holding the API key
#   base_url_env     - (optional) environment variable that overrides the base URL
#   base_url         - (optional) default base URL when no override is set
#   default_model    - model listed when the provider has no catalog entry
#   api_key_fallback - (optional) placeholder key used when env_key is unset
OPENAI_COMPATIBLE_PROVIDERS = {
    "openai": {
        "env_key": "OPENAI_API_KEY",
        "base_url_env": "OPENAI_BASE_URL",
        "default_model": "gpt-4o-mini",
    },
    "openrouter": {
        "env_key": "OPENROUTER_API_KEY",
        "base_url": "https://openrouter.ai/api/v1",
        "default_model": "openai/gpt-4o-mini",
    },
    "groq": {
        "env_key": "GROQ_API_KEY",
        "base_url": "https://api.groq.com/openai/v1",
        "default_model": "meta-llama/llama-4-scout-17b-16e-instruct",
    },
    "together": {
        "env_key": "TOGETHER_API_KEY",
        "base_url": "https://api.together.xyz/v1",
        "default_model": "Qwen/Qwen3-VL-32B-Instruct",
    },
    "xai": {
        "env_key": "XAI_API_KEY",
        "base_url": "https://api.x.ai/v1",
        "default_model": "grok-beta",
    },
    "ollama": {
        "env_key": "OLLAMA_API_KEY",
        "base_url_env": "OLLAMA_BASE_URL",
        "base_url": "http://localhost:11434/v1",
        "default_model": "hf.co/ggml-org/gemma-4-12B-it-GGUF:Q4_K_M",
        "api_key_fallback": "ollama",
    },
    "vllm": {
        "env_key": "VLLM_API_KEY",
        "base_url_env": "VLLM_BASE_URL",
        "base_url": "http://localhost:8000/v1",
        "default_model": "Qwen/Qwen3-VL-8B-Instruct",
        "api_key_fallback": "vllm",
    },
    "lmstudio": {
        "env_key": "LMSTUDIO_API_KEY",
        "base_url_env": "LMSTUDIO_BASE_URL",
        "base_url": "http://localhost:1234/v1",
        "default_model": "local-model",
        "api_key_fallback": "lmstudio",
    },
    "llamacpp": {
        "env_key": "LLAMACPP_API_KEY",
        "base_url_env": "LLAMACPP_BASE_URL",
        "base_url": "http://localhost:8080/v1",
        "default_model": "llama.cpp-model",
        "api_key_fallback": "llamacpp",
    },
}
119
+
120
# Static per-provider model listing used by LLMRouter.detected_cloud_models.
# Each entry carries the provider-side model "id", a display "name", and a
# "family" label that is looked up in MODEL_SOURCE_BY_FAMILY. Providers with
# no entry here are listed with their "default_model" only.
PROVIDER_MODEL_CATALOG = {
    "openai": [
        {"id": "gpt-5.5", "name": "GPT-5.5", "family": "GPT"},
        {"id": "gpt-5.4", "name": "GPT-5.4", "family": "GPT"},
        {"id": "gpt-5.4-mini", "name": "GPT-5.4 Mini", "family": "GPT"},
        {"id": "gpt-5.4-nano", "name": "GPT-5.4 Nano", "family": "GPT"},
        {"id": "gpt-4o-mini", "name": "GPT-4o Mini", "family": "GPT"},
        {"id": "gpt-4o", "name": "GPT-4o", "family": "GPT"},
        {"id": "gpt-4.1-mini", "name": "GPT-4.1 Mini", "family": "GPT"},
        {"id": "gpt-4.1", "name": "GPT-4.1", "family": "GPT"},
    ],
    "openrouter": [
        {"id": "openai/gpt-5.5", "name": "GPT-5.5 via OpenRouter", "family": "GPT"},
        {"id": "openai/gpt-4o-mini", "name": "GPT-4o Mini via OpenRouter", "family": "GPT"},
        {"id": "anthropic/claude-opus-4.7", "name": "Claude Opus 4.7 via OpenRouter", "family": "Claude"},
        {"id": "anthropic/claude-sonnet-4.6", "name": "Claude Sonnet 4.6 via OpenRouter", "family": "Claude"},
        {"id": "anthropic/claude-haiku-4.5", "name": "Claude Haiku 4.5 via OpenRouter", "family": "Claude"},
        {"id": "qwen/qwen3-vl-235b-a22b-instruct", "name": "Qwen3-VL 235B A22B via OpenRouter", "family": "Qwen"},
        {"id": "google/gemma-4-12b-it", "name": "Gemma 4 12B via OpenRouter", "family": "Gemma"},
        {"id": "x-ai/grok-2", "name": "Grok 2 via OpenRouter", "family": "Grok"},
        {"id": "meta-llama/llama-4-scout-17b-16e-instruct", "name": "Llama 4 Scout via OpenRouter", "family": "Llama"},
        {"id": "google/gemini-2.5-flash", "name": "Gemini 2.5 Flash via OpenRouter", "family": "Gemini"},
    ],
    "groq": [
        {"id": "meta-llama/llama-4-scout-17b-16e-instruct", "name": "Llama 4 Scout", "family": "Llama"},
    ],
    "together": [
        {"id": "Qwen/Qwen3-VL-32B-Instruct", "name": "Qwen3-VL 32B", "family": "Qwen"},
        {"id": "google/gemma-4-12b-it", "name": "Gemma 4 12B", "family": "Gemma"},
        {"id": "meta-llama/Llama-4-Scout-17B-16E-Instruct", "name": "Llama 4 Scout", "family": "Llama"},
    ],
    "xai": [
        {"id": "grok-beta", "name": "Grok Beta", "family": "Grok"},
        {"id": "grok-vision-beta", "name": "Grok Vision Beta", "family": "Grok"},
    ],
}
156
+
157
# Maps a catalog "family" label to a (country of origin, vendor) pair.
# The country values are Korean display labels ("미국" = United States,
# "중국" = China). Families missing from this table fall back to an "unknown"
# country and the provider name in source_metadata_for_model.
MODEL_SOURCE_BY_FAMILY = {
    "GPT": ("미국", "OpenAI"),
    "Claude": ("미국", "Anthropic"),
    "Qwen": ("중국", "Alibaba"),
    "Llama": ("미국", "Meta"),
    "Gemini": ("미국", "Google"),
    # The model catalog lists "Gemma" entries; without this row they were
    # reported with an unknown country and the hosting provider as vendor.
    "Gemma": ("미국", "Google"),
    "Grok": ("미국", "xAI"),
}
165
+
166
+
167
def source_metadata_for_model(provider: str, model: Dict[str, str], *, local_server: bool) -> Dict[str, str]:
    """Describe where a model comes from and how it is executed.

    Args:
        provider: Provider prefix; its title-cased form is the vendor fallback
            when the model's family is not in ``MODEL_SOURCE_BY_FAMILY``.
        model: Catalog entry; the ``family``, ``name`` and ``id`` keys are read.
        local_server: Selects the "runs on this computer" wording when true,
            otherwise the "requires internet" wording.

    Returns:
        A dict with the source country/company, execution method, internet
        requirement and model name, plus ``source_display_order`` listing
        those five keys in display order.
    """
    family = str(model.get("family") or "")
    fallback_source = ("미상", provider.title())
    country, company = MODEL_SOURCE_BY_FAMILY.get(family, fallback_source)

    execution_method, internet_requirement = (
        ("내 컴퓨터에서만 실행", "모델을 다운로드할 때만 인터넷 필요; 실행 중에는 필요 없음")
        if local_server
        else ("인터넷 연결 후 사용", "내 파일이 인터넷으로 전송될 수 있음")
    )

    metadata = {
        "source_country": country,
        "source_company": company,
        "execution_method": execution_method,
        "internet_requirement": internet_requirement,
        "model_name": model.get("name") or model.get("id") or "",
    }
    # The display order is exactly the insertion order of the fields above.
    metadata["source_display_order"] = list(metadata)
    return metadata
190
+
191
@dataclass
class CloudModel:
    """Cache entry for a model reached through an OpenAI-compatible client."""

    provider: str  # provider prefix, a key of OPENAI_COMPATIBLE_PROVIDERS
    model: str  # model name sent as ``model=`` in chat completion requests
    client: object  # client object; LLMRouter stores an AsyncOpenAI instance here
    cache_key: str  # key under which LLMRouter caches this entry
197
+
198
def parse_model_ref(model_id: str) -> tuple[str, str]:
    """Return (provider, model). Unprefixed refs stay local MLX.

    Recognised forms:
        ``cloud:<provider>:<model>``  -> ``(provider, model)``
        ``<provider>:<model>``        -> ``(provider, model)`` when the prefix
                                         is a known OpenAI-compatible provider
        ``local_mlx:<model>`` / ``mlx:<model>`` -> ``("local_mlx", model)``
        anything else                 -> ``("local_mlx", model_id)``; this
                                         includes ids containing ``:`` with an
                                         unknown prefix.

    Raises:
        ValueError: If a ``cloud:`` reference lacks the provider or model part.
    """
    if model_id.startswith("cloud:"):
        parts = model_id.split(":", 2)
        if len(parts) != 3:
            # Previously surfaced as an opaque tuple-unpacking ValueError.
            raise ValueError(
                f"Malformed cloud model reference {model_id!r}; expected 'cloud:<provider>:<model>'"
            )
        return parts[1], parts[2]

    prefix, separator, remainder = model_id.partition(":")
    if separator:
        if prefix in {"local_mlx", "mlx"}:
            return "local_mlx", remainder
        if prefix in OPENAI_COMPATIBLE_PROVIDERS:
            return prefix, remainder
    return "local_mlx", model_id
212
+
213
# Root folder under the user's home directory that holds per-repo model folders.
HF_MODELS_ROOT = Path.home().joinpath(".ltcai", "hf-models")


def hf_model_dir(repo_id: str) -> Path:
    """Return the folder under ``HF_MODELS_ROOT`` that corresponds to ``repo_id``.

    Every ``/`` in the repo id is replaced by ``__`` so the id maps to a single
    path component.
    """
    flattened = "__".join(repo_id.split("/"))
    return HF_MODELS_ROOT / flattened
217
+
218
+ def _looks_like_hf_model_dir(path: Path) -> bool:
219
+ if not path.exists() or not path.is_dir():
220
+ return False
221
+ has_config = (path / "config.json").exists()
222
+ has_weights = any(path.glob("*.safetensors")) or any(path.glob("*.bin"))
223
+ has_tokenizer = (
224
+ (path / "tokenizer.json").exists()
225
+ or (path / "tokenizer.model").exists()
226
+ or (path / "tokenizer_config.json").exists()
227
+ )
228
+ return has_config and has_weights and has_tokenizer
229
+
230
+ def _resolve_local_hf_model(model_id: str) -> str:
231
+ explicit_path = Path(model_id).expanduser()
232
+ if explicit_path.exists():
233
+ return str(explicit_path)
234
+ local_dir = hf_model_dir(model_id)
235
+ if _looks_like_hf_model_dir(local_dir):
236
+ return str(local_dir)
237
+ return model_id
238
+
239
def ensure_mlx_runtime() -> None:
    """Make sure the module-level MLX handles (``mx``, ``vlm_load``) are set.

    Returns immediately when both handles are already populated. Otherwise the
    imports are retried, the results are published to the module globals,
    ``VLM_AVAILABLE`` is set to ``True`` and the GPU is selected as the default
    device.

    Raises:
        RuntimeError: If an import or the device selection fails; the original
            exception is chained as the cause.
    """
    global mx, vlm_load, VLM_AVAILABLE
    if mx is not None and vlm_load is not None:
        return
    try:
        # Local aliases keep the globals untouched until both imports succeed.
        import mlx.core as mlx_core
        from mlx_vlm import load as mlx_vlm_load

        mx = mlx_core
        vlm_load = mlx_vlm_load
        VLM_AVAILABLE = True
        # NOTE: if this call raises, the globals above have already been set.
        mx.set_default_device(mx.gpu)
    except Exception as e:
        raise RuntimeError(f"MLX-VLM runtime is not available after install: {e}") from e
253
+
254
+ def _mlx_sampler(temperature: float):
255
+ """Build an MLX sampler callable for the given temperature.
256
+
257
+ Lattice v2.2 keeps local execution on MLX-VLM only. Returning ``None`` lets
258
+ MLX-VLM use its bundled default sampler without pulling another generation
259
+ package into the runtime contract.
260
+ """
261
+ _ = temperature
262
+ return None
263
+
264
+ class LLMRouter:
265
+ def __init__(self):
266
+ self._cache: Dict[str, Tuple] = {}
267
+ self._current: Optional[str] = None
268
+ self._last_used: Dict[str, float] = {}
269
+ self._max_local_models = max(1, int(os.getenv("LATTICEAI_MAX_LOCAL_MODELS", "1")))
270
+
271
@property
def current_model_id(self) -> Optional[str]:
    """Cache key of the currently selected model, or ``None`` if none is selected."""
    return self._current
274
+
275
+ @property
276
+ def loaded_model_ids(self) -> List[str]:
277
+ return list(self._cache.keys())
278
+
279
+ def switch_model(self, model_id: str) -> None:
280
+ if model_id not in self._cache:
281
+ raise KeyError(model_id)
282
+ self._current = model_id
283
+ self._touch(model_id)
284
+
285
def unload_model(self, model_id: str) -> None:
    """Drop ``model_id`` from the router and release memory.

    Unknown ids are ignored. If the dropped model was the current one, the
    first remaining cache key (or ``None``) becomes current.
    """
    for registry in (self._cache, self._last_used):
        registry.pop(model_id, None)
    if model_id == self._current:
        remaining = iter(self._cache)
        self._current = next(remaining, None)
    self._release_memory()
291
+
292
def unload_all(self) -> None:
    """Forget every loaded model, clear the selection and release memory."""
    self._current = None
    for registry in (self._cache, self._last_used):
        registry.clear()
    # Collect only after all references have been dropped.
    self._release_memory()
297
+
298
+ def unload_idle_models(self, idle_seconds: int) -> List[str]:
299
+ if idle_seconds <= 0:
300
+ return []
301
+ now = time.monotonic()
302
+ unloaded = []
303
+ for model_id, last_used in list(self._last_used.items()):
304
+ if now - last_used >= idle_seconds:
305
+ self.unload_model(model_id)
306
+ unloaded.append(model_id)
307
+ return unloaded
308
+
309
+ def model_memory_policy(self) -> Dict[str, object]:
310
+ return {
311
+ "max_local_models": self._max_local_models,
312
+ "loaded_count": len(self._cache),
313
+ "last_used": dict(self._last_used),
314
+ }
315
+
316
+ def _touch(self, model_id: Optional[str] = None) -> None:
317
+ model_id = model_id or self._current
318
+ if model_id:
319
+ self._last_used[model_id] = time.monotonic()
320
+
321
+ def _is_local_model(self, model_id: str) -> bool:
322
+ cached = self._cache.get(model_id)
323
+ return cached is not None and not isinstance(cached, CloudModel)
324
+
325
+ def _enforce_local_model_limit(self, incoming_key: str) -> None:
326
+ local_ids = [model_id for model_id in self._cache if self._is_local_model(model_id)]
327
+ while len(local_ids) >= self._max_local_models:
328
+ victim = min(local_ids, key=lambda model_id: self._last_used.get(model_id, 0))
329
+ if victim == incoming_key:
330
+ break
331
+ print(f"🧹 Unloading local model to stay within memory policy: {victim}")
332
+ self.unload_model(victim)
333
+ local_ids = [model_id for model_id in self._cache if self._is_local_model(model_id)]
334
+
335
def _release_memory(self) -> None:
    """Run a garbage-collection pass and, when MLX exposes it, clear its cache.

    A failure while clearing the MLX cache is reported and otherwise ignored.
    """
    gc.collect()
    if mx is None or not hasattr(mx, "clear_cache"):
        return
    try:
        mx.clear_cache()
    except Exception as err:
        print(f"⚠️ MLX cache clear skipped: {err}")
342
+
343
async def load_model(
    self,
    model_id: str,
    adapter_path: Optional[str] = None,
    draft_model_id: Optional[str] = None,
    api_key_override: Optional[str] = None,
    owner: Optional[str] = None,
) -> str:
    """Load a model (local MLX-VLM or cloud) and make it the current one.

    Args:
        model_id: Model reference understood by ``parse_model_ref``.
        adapter_path: Accepted for interface compatibility; this method does
            not use it.
        draft_model_id: Optional assistant/draft model loaded alongside a
            local target model. It becomes part of the cache key.
        api_key_override: Forwarded to ``_load_cloud_model`` for cloud refs.
        owner: Forwarded to ``_load_cloud_model`` for cloud refs.

    Returns:
        A status string: the cloud loader's message, ``"Cached: <key>"`` when
        the local stack was already loaded, or ``"Success: <key>"``.

    Raises:
        RuntimeError: If the MLX runtime cannot be made available.
        Exception: Any error raised while loading is logged and re-raised.
    """
    provider, provider_model = parse_model_ref(model_id)
    if provider != "local_mlx":
        return self._load_cloud_model(provider, provider_model, api_key_override=api_key_override, owner=owner)

    ensure_mlx_runtime()
    if mx is None or vlm_load is None:
        raise RuntimeError("MLX-VLM is not available in this process. Run on Apple Silicon with Metal access.")

    cache_key = f"{model_id}_{draft_model_id}" if draft_model_id else model_id
    if cache_key in self._cache:
        self._current = cache_key
        self._touch(cache_key)
        return f"Cached: {cache_key}"

    self._enforce_local_model_limit(cache_key)
    print(f"⏳ Loading local model stack: {cache_key}...")
    # get_running_loop() is the supported way to obtain the loop inside a
    # coroutine; get_event_loop() is deprecated for this use.
    loop = asyncio.get_running_loop()
    target_model_id = _resolve_local_hf_model(model_id)
    target_draft_model_id = _resolve_local_hf_model(draft_model_id) if draft_model_id else None

    def _load():
        mx.set_default_device(mx.gpu)
        print(f"🔄 Loading Target (VLM Mode): {target_model_id}...")
        model, tokenizer = vlm_load(target_model_id)

        draft_model = None
        if target_draft_model_id:
            print(f"🔄 Loading Assistant (VLM Mode): {target_draft_model_id}...")
            draft_model, _ = vlm_load(target_draft_model_id)
            print("✅ Assistant Ready.")

        return model, tokenizer, draft_model

    try:
        # Use the dedicated single-thread executor to ensure MLX GPU streams match during inference
        model, tokenizer, draft_model = await loop.run_in_executor(executor, _load)
    except Exception as e:
        print(f"❌ Load Error: {e}")
        raise  # bare raise keeps the original traceback intact

    self._cache[cache_key] = (model, tokenizer, draft_model)
    self._current = cache_key
    self._touch(cache_key)
    print(f"✅ Fully Loaded: {cache_key}")
    return f"Success: {cache_key}"
395
+
396
def _load_cloud_model(self, provider: str, model: str, api_key_override: Optional[str] = None, owner: Optional[str] = None) -> str:
    """Create a client for an OpenAI-compatible provider and select it.

    The API key is taken from ``api_key_override``, then the provider's
    environment variable, then its ``api_key_fallback``. The base URL comes
    from the provider's override environment variable when set, otherwise
    from its configured ``base_url`` (if any). The entry is cached under
    ``"<provider>:<model>::<owner or 'global'>"``, replacing any existing one.

    Raises:
        RuntimeError: If the ``openai`` package is missing, the provider is
            unknown, or no API key can be found.
    """
    if AsyncOpenAI is None:
        raise RuntimeError("openai package is not installed. Add it to requirements.txt and install dependencies.")

    config = OPENAI_COMPATIBLE_PROVIDERS.get(provider)
    if not config:
        raise RuntimeError(f"Unsupported cloud provider: {provider}")

    env_api_key = os.getenv(config["env_key"])
    api_key = api_key_override or env_api_key or config.get("api_key_fallback")
    if not api_key:
        raise RuntimeError(f"Missing API key env var: {config['env_key']}")

    base_url_env = config.get("base_url_env")
    env_base_url = os.getenv(base_url_env) if base_url_env else None
    base_url = env_base_url or config.get("base_url")

    client_kwargs = {"api_key": api_key}
    if base_url:
        client_kwargs["base_url"] = base_url

    cache_owner = owner or "global"
    cache_key = f"{provider}:{model}::{cache_owner}"
    entry = CloudModel(
        provider=provider,
        model=model,
        client=AsyncOpenAI(**client_kwargs),
        cache_key=cache_key,
    )
    self._cache[cache_key] = entry
    self._current = cache_key
    self._touch(cache_key)
    return f"Cloud provider ready: {cache_key}"
419
+
420
def detected_cloud_models(self) -> List[Dict[str, str]]:
    """List selectable cloud and local-server models with availability info.

    Two sources are merged, in this order:
        1. Every provider in ``OPENAI_COMPATIBLE_PROVIDERS``, using its
           ``PROVIDER_MODEL_CATALOG`` entries or, when it has none, a single
           entry for its ``default_model``.
        2. Comma-separated references from ``LATTICEAI_CLOUD_MODELS`` whose
           provider is known. Malformed references are reported and skipped.

    Each item carries ``id``, ``name``, ``provider``, ``family``, ``tag``
    (``"local-server"`` or ``"cloud"``), ``available``, ``requires`` (the
    missing API-key variable, or ``None``) and the fields produced by
    ``source_metadata_for_model``.
    """
    local_server_providers = {"ollama", "vllm", "lmstudio", "llamacpp"}

    def _has_credentials(provider_config) -> bool:
        # A provider is usable with either a real key or a placeholder fallback.
        return bool(os.getenv(provider_config["env_key"]) or provider_config.get("api_key_fallback"))

    items = []
    for provider, config in OPENAI_COMPATIBLE_PROVIDERS.items():
        has_key = _has_credentials(config)
        local_server = provider in local_server_providers
        provider_models = PROVIDER_MODEL_CATALOG.get(provider) or [{
            "id": config["default_model"],
            "name": f"{provider.title()} · {config['default_model']}",
            "family": provider.title(),
        }]
        for model in provider_models:
            model_id = model["id"]
            items.append({
                "id": f"{provider}:{model_id}",
                "name": model.get("name") or f"{provider.title()} · {model_id}",
                "provider": provider,
                "family": model.get("family"),
                "tag": "local-server" if local_server else "cloud",
                "available": has_key,
                "requires": config["env_key"] if not has_key else None,
                **source_metadata_for_model(provider, model, local_server=local_server),
            })

    custom = os.getenv("LATTICEAI_CLOUD_MODELS") or ""
    for raw in (item.strip() for item in custom.split(",")):
        if not raw:
            continue
        try:
            provider, model = parse_model_ref(raw)
        except ValueError as e:
            # One bad entry in the environment must not hide every other model.
            print(f"⚠️ Skipping invalid LATTICEAI_CLOUD_MODELS entry {raw!r}: {e}")
            continue
        if provider not in OPENAI_COMPATIBLE_PROVIDERS:
            continue
        config = OPENAI_COMPATIBLE_PROVIDERS[provider]
        has_key = _has_credentials(config)
        local_server = provider in local_server_providers
        display_name = f"{provider.title()} · {model}"
        family = provider.title()
        # Same shape and rules as catalog entries above: tag follows the
        # provider kind and "requires" names the missing key variable.
        items.append({
            "id": f"{provider}:{model}",
            "name": display_name,
            "provider": provider,
            "family": family,
            "tag": "local-server" if local_server else "cloud",
            "available": has_key,
            "requires": config["env_key"] if not has_key else None,
            **source_metadata_for_model(
                provider,
                {"id": model, "name": display_name, "family": family},
                local_server=local_server,
            ),
        })
    return items
462
+
463
+ def _is_cloud_current(self) -> bool:
464
+ return bool(self._current and isinstance(self._cache.get(self._current), CloudModel))
465
+
466
def _local_server_error_hint(self, cloud: CloudModel, error: Exception) -> str:
    """Turn a request failure into a user-facing message.

    For the ``lmstudio`` provider the raw error text is followed by
    troubleshooting hints that include the base URL in effect
    (``LMSTUDIO_BASE_URL`` or the configured default). Every other provider
    gets the raw error text unchanged.
    """
    raw = str(error)
    if cloud.provider != "lmstudio":
        return raw

    configured_default = OPENAI_COMPATIBLE_PROVIDERS["lmstudio"]["base_url"]
    base_url = os.getenv("LMSTUDIO_BASE_URL") or configured_default
    hint = (
        f"LM Studio 연결 실패: {raw}\n\n"
        f"- LM Studio의 Developer/Local Server를 켜고 모델을 로드했는지 확인하세요.\n"
        f"- Lattice가 보는 주소는 {base_url} 입니다. 포트가 다르면 LMSTUDIO_BASE_URL을 맞춰주세요.\n"
        f"- 모델 선택창에는 LM Studio /v1/models에서 감지된 모델만 표시됩니다."
    )
    return hint
477
+
478
def _build_prompt(self, message: str, context: Optional[str], tokenizer) -> str:
    """Build a text-only chat prompt for ``message``.

    The system prompt is ``SYSTEM_PROMPT`` plus, when present, the
    brand-normalized ``context``. The tokenizer's own chat template is used
    when it offers ``apply_chat_template``; if that is missing or fails, a
    generic ``<|im_start|>`` / ``<|im_end|>`` template is returned instead.
    """
    system = SYSTEM_PROMPT
    context = normalize_branding(context)
    if context:
        system += f"\n\nContext:\n{context}"
    if hasattr(tokenizer, "apply_chat_template"):
        try:
            msgs = [{"role": "system", "content": system}, {"role": "user", "content": message}]
            return tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
        except Exception as e:
            # Still best-effort, but report the failure (as the VLM prompt
            # builder does) instead of hiding it, then use the fallback below.
            print(f"⚠️ Chat template fallback: {e}")
    return f"<|im_start|>system\n{system}<|im_end|>\n<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"
490
+
491
def _build_vlm_prompt(self, model, processor, message: str, context: Optional[str], num_images: int) -> str:
    """Build a prompt through MLX-VLM's chat template.

    The system prompt is ``SYSTEM_PROMPT`` plus, when present, the
    brand-normalized ``context``. If importing or applying the MLX-VLM
    template fails, the failure is printed and the text-only
    ``_build_prompt`` is used with ``processor`` as the tokenizer.
    """
    context = normalize_branding(context)
    system = SYSTEM_PROMPT
    if context:
        system += f"\n\nContext:\n{context}"

    try:
        from mlx_vlm import apply_chat_template

        conversation = [
            {"role": "system", "content": system},
            {"role": "user", "content": message},
        ]
        return apply_chat_template(
            processor,
            model.config,
            conversation,
            add_generation_prompt=True,
            num_images=num_images,
        )
    except Exception as err:
        print(f"⚠️ VLM chat template fallback: {err}")
        return self._build_prompt(message, context, processor)
512
+
513
+ async def generate_as(self, model_id: str | None, message: str, context: Optional[str] = None, max_tokens: int = 4096, temperature: float = 0.2) -> str:
514
+ """Generate using a specific model, temporarily switching if needed. Falls back to current model if model_id is None or not loaded."""
515
+ if not model_id or model_id == self._current:
516
+ return await self.generate(message, context, max_tokens, temperature)
517
+ if model_id not in self._cache:
518
+ raise ValueError(f"Model '{model_id}' is not loaded. Load it first via /models/load.")
519
+ prev = self._current
520
+ self._current = model_id
521
+ try:
522
+ return await self.generate(message, context, max_tokens, temperature)
523
+ finally:
524
+ self._current = prev
525
+
526
+ async def generate(self, message: str, context: Optional[str] = None, max_tokens: int = 4096, temperature: float = 0.2, image_data: Optional[str] = None) -> str:
527
+ if not self._current:
528
+ return "No model."
529
+ self._touch()
530
+ cached = self._cache[self._current]
531
+ if isinstance(cached, CloudModel):
532
+ return await self._cloud_generate(cached, message, context, max_tokens, temperature)
533
+
534
+ model, tokenizer, draft_model = self._cache[self._current]
535
+ prompt = self._build_vlm_prompt(model, tokenizer, message, context, 1 if image_data else 0)
536
+
537
+ loop = asyncio.get_event_loop()
538
+
539
+ def _gen():
540
+ import mlx.core as mx
541
+ mx.set_default_device(mx.gpu)
542
+ from mlx_vlm import generate as vlm_gen
543
+ return vlm_gen(model, tokenizer, prompt=prompt, image=self._prep_image(image_data) if image_data else None, max_tokens=max_tokens, sampler=_mlx_sampler(temperature), draft_model=draft_model, draft_kind="mtp")
544
+ result = await loop.run_in_executor(executor, _gen)
545
+ # mlx-vlm might return a GenerationResult object; extract the text
546
+ if hasattr(result, "text"):
547
+ return normalize_branding(result.text)
548
+ return normalize_branding(str(result))
549
+
550
async def _cloud_generate(self, cloud: CloudModel, message: str, context: Optional[str], max_tokens: int, temperature: float) -> str:
    """Request a single non-streamed chat completion from a cloud entry.

    The system prompt is ``SYSTEM_PROMPT`` plus, when present, the
    brand-normalized ``context``. The first choice's content (or ``""`` when
    it is empty) is returned brand-normalized.

    Raises:
        RuntimeError: If the request fails; the message comes from
            ``_local_server_error_hint`` and the original error is chained.
    """
    context = normalize_branding(context)
    system = SYSTEM_PROMPT
    if context:
        system += f"\n\nContext:\n{context}"

    conversation = [
        {"role": "system", "content": system},
        {"role": "user", "content": message},
    ]
    try:
        response = await cloud.client.chat.completions.create(
            model=cloud.model,
            messages=conversation,
            max_tokens=max_tokens,
            temperature=temperature,
        )
    except Exception as err:
        raise RuntimeError(self._local_server_error_hint(cloud, err)) from err

    reply = response.choices[0].message.content or ""
    return normalize_branding(reply)
568
+
569
async def stream_generate(self, message: str, context: Optional[str] = None, max_tokens: int = 4096, temperature: float = 0.2, image_data: Optional[str] = None) -> AsyncIterator[str]:
    """Stream a reply from the currently selected model chunk by chunk.

    A cached ``CloudModel`` entry is delegated to ``_cloud_stream_generate``.
    Otherwise ``mlx_vlm.stream_generate`` runs on the shared ``executor`` and
    hands each text chunk back to the event loop through an
    ``asyncio.Queue``; ``None`` on the queue marks the end of the stream.

    Args:
        message: User message to answer.
        context: Optional context forwarded to the prompt builder.
        max_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature handed to ``_mlx_sampler``.
        image_data: Optional base64 image payload, decoded by
            ``_prep_image``.  It is only used on the local path.

    Yields:
        Branding-normalized text chunks.  ``"No model."`` is yielded when no
        model is selected, and a worker failure is reported as a single
        ``"⚠️ Error: ..."`` chunk before the stream ends.
    """
    if not self._current:
        yield "No model."
        return
    self._touch()
    cached = self._cache[self._current]
    if isinstance(cached, CloudModel):
        async for chunk in self._cloud_stream_generate(cached, message, context, max_tokens, temperature):
            yield chunk
        return

    # Reuse the entry fetched above rather than hitting the cache twice.
    model, tokenizer, draft_model = cached
    prompt = self._build_vlm_prompt(model, tokenizer, message, context, 1 if image_data else 0)
    # get_running_loop() is the recommended call inside a coroutine.
    loop = asyncio.get_running_loop()
    queue = asyncio.Queue()

    def _stream():
        # Everything, including the MLX import and device selection, lives
        # inside the try block: if that setup raised outside it, the finally
        # clause would never post the None sentinel and the consumer loop
        # below would wait on queue.get() forever.
        try:
            import mlx.core as mx
            mx.set_default_device(mx.gpu)
            from mlx_vlm import stream_generate as vlm_stream
            gen = vlm_stream(model, tokenizer, prompt=prompt, image=self._prep_image(image_data) if image_data else None, max_tokens=max_tokens, sampler=_mlx_sampler(temperature), draft_model=draft_model, draft_kind="mtp")

            for chunk in gen:
                # Chunks may be result objects, tuples, or plain values.
                text = chunk.text if hasattr(chunk, "text") else (chunk[0] if isinstance(chunk, tuple) else str(chunk))
                loop.call_soon_threadsafe(queue.put_nowait, text)
        except Exception as e:
            loop.call_soon_threadsafe(queue.put_nowait, f"⚠️ Error: {e}")
        finally:
            # End-of-stream sentinel; always delivered.
            loop.call_soon_threadsafe(queue.put_nowait, None)

    loop.run_in_executor(executor, _stream)
    while True:
        chunk = await queue.get()
        if chunk is None:
            break
        yield normalize_branding(chunk)
606
+
607
async def _cloud_stream_generate(self, cloud: CloudModel, message: str, context: Optional[str], max_tokens: int, temperature: float) -> AsyncIterator[str]:
    """Stream a reply from a cloud/OpenAI-style chat endpoint.

    The system message is ``SYSTEM_PROMPT``, extended with a ``Context:``
    section when the branding-normalized context is non-empty.

    Args:
        cloud: Entry providing ``client`` and ``model`` for the request.
        message: User message to send.
        context: Optional context appended to the system prompt.
        max_tokens: Token limit forwarded to the endpoint.
        temperature: Sampling temperature forwarded to the endpoint.

    Yields:
        Branding-normalized content deltas.  If opening the stream fails, a
        single ``"⚠️ ..."`` hint from ``_local_server_error_hint`` is yielded
        instead.  Errors raised while iterating the stream are not caught.
    """
    system_text = SYSTEM_PROMPT
    branded_context = normalize_branding(context)
    if branded_context:
        system_text = system_text + f"\n\nContext:\n{branded_context}"

    conversation = [
        {"role": "system", "content": system_text},
        {"role": "user", "content": message},
    ]
    try:
        stream = await cloud.client.chat.completions.create(
            model=cloud.model,
            messages=conversation,
            max_tokens=max_tokens,
            temperature=temperature,
            stream=True,
        )
    except Exception as exc:
        yield f"⚠️ {self._local_server_error_hint(cloud, exc)}"
        return

    async for event in stream:
        # Events without choices carry no text and are skipped.
        if event.choices:
            piece = event.choices[0].delta.content
            if piece:
                yield normalize_branding(piece)
632
+
633
+ def _prep_image(self, image_data: Optional[str]) -> Optional[Image.Image]:
634
+ if not image_data:
635
+ return None
636
+ try:
637
+ image = Image.open(io.BytesIO(base64.b64decode(image_data))).convert("RGB")
638
+ print(f"🖼️ VLM image decoded: {image.width}x{image.height}")
639
+ return image
640
+ except Exception as e:
641
+ print(f"⚠️ VLM image decode failed: {e}")
642
+ return None
643
+
644
+ # ── Document Generation Pipeline ──────────────────────────────────────
645
+
646
async def generate_document(
    self,
    message: str,
    system_prompt: str,
    *,
    max_tokens: int = 8192,
    temperature: float = 0.3,
) -> str:
    """Generate a document using a specialized system prompt.

    A cached ``CloudModel`` entry is delegated to
    ``_cloud_generate_document``.  Otherwise the prompt is rendered with the
    tokenizer's chat template when available (falling back to a hand-written
    ``<|im_start|>`` template) and run through ``mlx_vlm.generate`` on the
    shared ``executor`` with no image.

    Args:
        message: User request describing the document.
        system_prompt: System prompt used in place of the default one.
        max_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature handed to ``_mlx_sampler``.

    Returns:
        The generated text after ``normalize_branding``, or
        ``"No model loaded."`` when no model is selected.
    """
    if not self._current:
        return "No model loaded."
    self._touch()
    cached = self._cache[self._current]

    if isinstance(cached, CloudModel):
        return await self._cloud_generate_document(cached, message, system_prompt, max_tokens, temperature)

    model, tokenizer, draft_model = cached

    def _fallback_prompt() -> str:
        # Single definition of the manual template used when the tokenizer
        # has no chat template or applying it fails.
        return f"<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"

    if hasattr(tokenizer, "apply_chat_template"):
        try:
            msgs = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": message},
            ]
            prompt = tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
        except Exception:
            # Deliberate best-effort: any template failure uses the fallback.
            prompt = _fallback_prompt()
    else:
        prompt = _fallback_prompt()

    def _gen():
        # Imported lazily inside the worker, as in the original code.
        import mlx.core as mx
        mx.set_default_device(mx.gpu)
        from mlx_vlm import generate as vlm_gen
        return vlm_gen(model, tokenizer, prompt=prompt, image=None, max_tokens=max_tokens, sampler=_mlx_sampler(temperature), draft_model=draft_model, draft_kind="mtp")

    # get_running_loop() is the recommended call inside a coroutine.
    loop = asyncio.get_running_loop()
    result = await loop.run_in_executor(executor, _gen)
    # The result may be an object exposing .text or a plain value.
    if hasattr(result, "text"):
        return normalize_branding(result.text)
    return normalize_branding(str(result))
686
+
687
+ async def _cloud_generate_document(self, cloud: CloudModel, message: str, system_prompt: str, max_tokens: int, temperature: float) -> str:
688
+ try:
689
+ response = await cloud.client.chat.completions.create(
690
+ model=cloud.model,
691
+ messages=[
692
+ {"role": "system", "content": system_prompt},
693
+ {"role": "user", "content": message},
694
+ ],
695
+ max_tokens=max_tokens,
696
+ temperature=temperature,
697
+ )
698
+ except Exception as e:
699
+ raise RuntimeError(self._local_server_error_hint(cloud, e)) from e
700
+ return normalize_branding(response.choices[0].message.content or "")
701
+
702
async def stream_generate_document(
    self,
    message: str,
    system_prompt: str,
    *,
    max_tokens: int = 8192,
    temperature: float = 0.3,
) -> AsyncIterator[str]:
    """Stream document generation with a specialized system prompt.

    A cached ``CloudModel`` entry is delegated to ``_cloud_stream_document``.
    Otherwise the prompt is rendered with the tokenizer's chat template when
    available (falling back to a hand-written ``<|im_start|>`` template) and
    ``mlx_vlm.stream_generate`` runs on the shared ``executor``, handing each
    text chunk back through an ``asyncio.Queue``; ``None`` on the queue marks
    the end of the stream.

    Args:
        message: User request describing the document.
        system_prompt: System prompt used in place of the default one.
        max_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature handed to ``_mlx_sampler``.

    Yields:
        Branding-normalized text chunks.  ``"No model loaded."`` is yielded
        when no model is selected, and a worker failure is reported as a
        single ``"⚠️ Error: ..."`` chunk before the stream ends.
    """
    if not self._current:
        yield "No model loaded."
        return
    self._touch()
    cached = self._cache[self._current]

    if isinstance(cached, CloudModel):
        async for chunk in self._cloud_stream_document(cached, message, system_prompt, max_tokens, temperature):
            yield chunk
        return

    model, tokenizer, draft_model = cached

    def _fallback_prompt() -> str:
        # Single definition of the manual template used when the tokenizer
        # has no chat template or applying it fails.
        return f"<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"

    if hasattr(tokenizer, "apply_chat_template"):
        try:
            msgs = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": message},
            ]
            prompt = tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
        except Exception:
            # Deliberate best-effort: any template failure uses the fallback.
            prompt = _fallback_prompt()
    else:
        prompt = _fallback_prompt()

    # get_running_loop() is the recommended call inside a coroutine.
    loop = asyncio.get_running_loop()
    queue = asyncio.Queue()

    def _stream():
        # Everything, including the MLX import and device selection, lives
        # inside the try block: if that setup raised outside it, the finally
        # clause would never post the None sentinel and the consumer loop
        # below would wait on queue.get() forever.
        try:
            import mlx.core as mx
            mx.set_default_device(mx.gpu)
            from mlx_vlm import stream_generate as vlm_stream
            gen = vlm_stream(model, tokenizer, prompt=prompt, image=None, max_tokens=max_tokens, sampler=_mlx_sampler(temperature), draft_model=draft_model, draft_kind="mtp")
            for chunk in gen:
                # Chunks may be result objects, tuples, or plain values.
                text = chunk.text if hasattr(chunk, "text") else (chunk[0] if isinstance(chunk, tuple) else str(chunk))
                loop.call_soon_threadsafe(queue.put_nowait, text)
        except Exception as e:
            loop.call_soon_threadsafe(queue.put_nowait, f"⚠️ Error: {e}")
        finally:
            # End-of-stream sentinel; always delivered.
            loop.call_soon_threadsafe(queue.put_nowait, None)

    loop.run_in_executor(executor, _stream)
    while True:
        chunk = await queue.get()
        if chunk is None:
            break
        yield normalize_branding(chunk)
758
+
759
+ async def _cloud_stream_document(self, cloud: CloudModel, message: str, system_prompt: str, max_tokens: int, temperature: float) -> AsyncIterator[str]:
760
+ try:
761
+ stream = await cloud.client.chat.completions.create(
762
+ model=cloud.model,
763
+ messages=[
764
+ {"role": "system", "content": system_prompt},
765
+ {"role": "user", "content": message},
766
+ ],
767
+ max_tokens=max_tokens,
768
+ temperature=temperature,
769
+ stream=True,
770
+ )
771
+ except Exception as e:
772
+ yield f"⚠️ {self._local_server_error_hint(cloud, e)}"
773
+ return
774
+ async for event in stream:
775
+ if not event.choices:
776
+ continue
777
+ delta = event.choices[0].delta.content
778
+ if delta:
779
+ yield normalize_branding(delta)