ltcai 1.3.0 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. package/README.md +105 -79
  2. package/docs/CHANGELOG.md +109 -0
  3. package/docs/images/architecture.png +0 -0
  4. package/docs/images/graph.png +0 -0
  5. package/docs/images/hero.gif +0 -0
  6. package/docs/images/model-recommendation.png +0 -0
  7. package/docs/images/onboarding.png +0 -0
  8. package/docs/images/organization.png +0 -0
  9. package/docs/images/skills.png +0 -0
  10. package/docs/images/tmp_frames/frame_00.png +0 -0
  11. package/docs/images/tmp_frames/frame_01.png +0 -0
  12. package/docs/images/tmp_frames/frame_02.png +0 -0
  13. package/docs/images/tmp_frames/frame_03.png +0 -0
  14. package/docs/images/workspace.png +0 -0
  15. package/latticeai/__init__.py +1 -1
  16. package/latticeai/api/admin.py +17 -0
  17. package/latticeai/api/chat.py +786 -0
  18. package/latticeai/api/computer_use.py +294 -0
  19. package/latticeai/api/deps.py +15 -0
  20. package/latticeai/api/garden.py +34 -0
  21. package/latticeai/api/local_files.py +125 -0
  22. package/latticeai/api/models.py +16 -0
  23. package/latticeai/api/permissions.py +331 -0
  24. package/latticeai/api/setup.py +158 -0
  25. package/latticeai/api/static_routes.py +166 -0
  26. package/latticeai/api/tools.py +579 -0
  27. package/latticeai/api/workspace.py +11 -0
  28. package/latticeai/core/enterprise_admin.py +158 -0
  29. package/latticeai/core/workspace_os.py +1 -1
  30. package/latticeai/server_app.py +223 -4301
  31. package/latticeai/services/app_context.py +27 -0
  32. package/latticeai/services/model_catalog.py +289 -0
  33. package/latticeai/services/model_recommendation.py +183 -0
  34. package/latticeai/services/model_runtime.py +1721 -0
  35. package/latticeai/services/tool_dispatch.py +135 -0
  36. package/latticeai/services/upload_service.py +99 -0
  37. package/package.json +3 -3
  38. package/skills/SKILL_TEMPLATE.md +1 -1
  39. package/skills/code_review/SKILL.md +1 -1
  40. package/skills/data_analysis/SKILL.md +1 -1
  41. package/skills/file_edit/SKILL.md +1 -1
  42. package/skills/summarize_document/SKILL.md +1 -1
  43. package/skills/web_search/SKILL.md +1 -1
  44. package/static/scripts/chat.js +45 -0
@@ -0,0 +1,1721 @@
1
+ """Model runtime and provider helpers for Lattice AI.
2
+
3
+ This module owns local/cloud model preparation, engine detection, model download,
4
+ provider-specific server startup, smoke tests, and runtime feature payloads. It is
5
+ configured by ``server_app`` with app-level state but has no FastAPI app import.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import asyncio
11
+ import importlib.util
12
+ import json
13
+ import logging
14
+ import os
15
+ import platform
16
+ import queue
17
+ import re
18
+ import shutil
19
+ import subprocess
20
+ import sys
21
+ import tempfile
22
+ import threading
23
+ import time
24
+ import urllib.error
25
+ import urllib.request
26
+ from pathlib import Path
27
+ from typing import AsyncIterator, Dict, List, Optional
28
+
29
+ import httpx
30
+ from fastapi import HTTPException, Request
31
+
32
+ from llm_router import (
33
+ AsyncOpenAI,
34
+ HF_MODELS_ROOT,
35
+ OPENAI_COMPATIBLE_PROVIDERS,
36
+ ensure_mlx_runtime,
37
+ hf_model_dir,
38
+ normalize_branding,
39
+ parse_model_ref,
40
+ )
41
+ from latticeai.core.model_compat import (
42
+ SMOKE_PROMPT as _SMOKE_PROMPT,
43
+ classify_smoke_response as _classify_smoke_response,
44
+ ensure_profile as _ensure_compat_profile,
45
+ fast_postprocess as _compat_fast_postprocess,
46
+ record_smoke_result as _record_smoke_result,
47
+ )
48
+ from latticeai.core.model_resolution import ModelResolution as _ModelResolution
49
+
50
# Configured by server_app.configure_model_runtime during app assembly.
# Every name below is a placeholder default: ``configure_model_runtime``
# replaces any of them whose name is passed to it as a keyword argument.
router = None
APP_MODE = "local"
DEFAULT_HOST = "127.0.0.1"
DEFAULT_PORT = 4825
DATA_DIR = Path.home() / ".latticeai"
BASE_DIR = Path.cwd()
ENABLE_TELEGRAM = False
ENABLE_GRAPH = True
AUTOLOAD_MODELS = False
MODEL_IDLE_UNLOAD_SECONDS = 0
ALLOW_LOCAL_MODELS = True
REQUIRE_AUTH = False
INVITE_GATE_ENABLED = False
ALLOW_PLAINTEXT_API_KEYS = False
CORS_ALLOW_NETWORK = False
PUBLIC_MODEL = "openai:gpt-4o-mini"
LOCAL_MODEL = "mlx-community/SmolLM-1.7B-Instruct-4bit"
IS_PUBLIC_MODE = False
keyring = None
70
+
71
+
72
def _missing_current_user(_request: Request) -> Optional[str]:
    """Placeholder user resolver that always reports "no user".

    The request argument is accepted only to match the hook signature and is
    never inspected.
    """
    return None
74
+
75
+
76
+ def _missing_user_api_key(_email: Optional[str], _provider: str) -> Optional[str]:
77
+ return None
78
+
79
+
80
# Auth hooks start out bound to the no-op stubs; ``configure_model_runtime``
# rebinds them when it is called with keywords of the same names.
get_current_user = _missing_current_user
get_user_api_key = _missing_user_api_key
82
+
83
+
84
def configure_model_runtime(**deps) -> None:
    """Wire app-owned runtime dependencies without importing server_app.

    Each keyword whose name matches an existing module-level global replaces
    that global. Keywords with no matching global are ignored, so this call
    can never introduce new module attributes.
    """
    module_scope = globals()
    for name, value in deps.items():
        if name in module_scope:
            module_scope[name] = value
87
+
88
+
89
+ # Catalog data + version-dedup helpers live in ``model_catalog``; re-exported
90
+ # here so existing ``from ...model_runtime import ENGINE_MODEL_CATALOG`` imports
91
+ # keep working.
92
+ from latticeai.services.model_catalog import ( # noqa: F401 (re-export)
93
+ ENGINE_INSTALLERS,
94
+ ENGINE_MODEL_CATALOG,
95
+ MODEL_ENGINE_ALIASES,
96
+ _VERSIONED_MODEL_PATTERNS,
97
+ _model_family_version,
98
+ _version_tuple,
99
+ filter_lower_family_versions,
100
+ )
101
+
102
+ def _update_env_file(env_file: Path, key: str, value: str) -> None:
103
+ lines = []
104
+ found = False
105
+ if env_file.exists():
106
+ for line in env_file.read_text(encoding="utf-8").splitlines():
107
+ if line.startswith(f"{key}="):
108
+ lines.append(f"{key}={value}")
109
+ found = True
110
+ else:
111
+ lines.append(line)
112
+ if not found:
113
+ lines.append(f"{key}={value}")
114
+ env_file.write_text("\n".join(lines) + "\n", encoding="utf-8")
115
+
116
+
117
# Handles of server subprocesses launched by this module, keyed by provider name.
LOCAL_SERVER_PROCESSES: Dict[str, subprocess.Popen] = {}
# Paths inside a ``.venv-vllm-metal`` virtualenv under the user's home directory.
VLLM_METAL_ENV = Path.home() / ".venv-vllm-metal"
VLLM_METAL_BIN = VLLM_METAL_ENV / "bin" / "vllm"
VLLM_METAL_PYTHON = VLLM_METAL_ENV / "bin" / "python"
# ``lms`` CLI inside the LM Studio application bundle; fallback when it is not on PATH.
LMSTUDIO_BUNDLED_CLI = Path("/Applications/LM Studio.app/Contents/Resources/app/.webpack/lms")
122
+
123
def windows_binary_candidates(binary: str) -> List[Path]:
    """Return well-known Windows install paths to try for ``binary``.

    Known binaries are ``ollama``, ``lms`` and ``nvidia-smi``; any other name
    yields an empty list. Per-user locations are included only when the
    ``LOCALAPPDATA`` environment variable is set. The paths are not checked
    for existence here.
    """
    local_appdata = os.environ.get("LOCALAPPDATA", "")
    program_files = Path(os.environ.get("ProgramFiles", r"C:\Program Files"))
    program_files_x86 = Path(os.environ.get("ProgramFiles(x86)", r"C:\Program Files (x86)"))
    user_programs = Path(local_appdata) / "Programs" if local_appdata else None

    if binary == "ollama":
        relative = Path("Ollama", "ollama.exe")
        roots = [user_programs, program_files]
    elif binary == "lms":
        relative = Path("LM Studio", "resources", "app", ".webpack", "lms.exe")
        roots = [user_programs, program_files]
    elif binary == "nvidia-smi":
        relative = Path("NVIDIA Corporation", "NVSMI", "nvidia-smi.exe")
        roots = [program_files, program_files_x86]
    else:
        return []
    return [root / relative for root in roots if root is not None]
142
+
143
+
144
def local_binary(binary: str) -> Optional[str]:
    """Locate an executable, first on PATH and then in Windows install dirs.

    Returns the resolved path as a string, or ``None`` when nothing is found.
    The install-directory fallback is consulted only when
    ``platform.system()`` reports Windows.
    """
    on_path = shutil.which(binary)
    if on_path:
        return on_path
    if platform.system() != "Windows":
        return None
    existing = (str(path) for path in windows_binary_candidates(binary) if path.exists())
    return next(existing, None)
153
+
154
+
155
def find_lmstudio_cli() -> Optional[str]:
    """Return the path of the LM Studio ``lms`` CLI, or ``None`` if absent.

    The regular binary lookup wins; the copy bundled inside the LM Studio
    application is used only as a fallback.
    """
    discovered = local_binary("lms")
    if not discovered and LMSTUDIO_BUNDLED_CLI.exists():
        discovered = str(LMSTUDIO_BUNDLED_CLI)
    return discovered or None
162
+
163
+
164
def vllm_executable() -> Optional[str]:
    """Return the ``vllm`` launcher to use, or ``None`` if none is installed.

    A ``vllm`` found on PATH takes precedence over the one inside the
    dedicated vLLM Metal virtualenv.
    """
    on_path = shutil.which("vllm")
    if on_path:
        return on_path
    return str(VLLM_METAL_BIN) if VLLM_METAL_BIN.exists() else None
171
+
172
+
173
def vllm_metal_python() -> Optional[str]:
    """Return the vLLM Metal virtualenv's Python path if it exists, else ``None``."""
    return str(VLLM_METAL_PYTHON) if VLLM_METAL_PYTHON.exists() else None
177
+
178
+
179
+ def _json_request(
180
+ url: str,
181
+ *,
182
+ method: str = "GET",
183
+ payload: Optional[Dict[str, object]] = None,
184
+ headers: Optional[Dict[str, str]] = None,
185
+ timeout: float = 10.0,
186
+ ) -> Dict[str, object]:
187
+ data = None
188
+ req_headers = dict(headers or {})
189
+ if payload is not None:
190
+ data = json.dumps(payload).encode("utf-8")
191
+ req_headers.setdefault("Content-Type", "application/json")
192
+ req = urllib.request.Request(url, data=data, headers=req_headers, method=method)
193
+ with urllib.request.urlopen(req, timeout=timeout) as res:
194
+ raw = res.read().decode("utf-8", errors="replace")
195
+ if not raw.strip():
196
+ return {}
197
+ return json.loads(raw)
198
+
199
+
200
def lmstudio_api_base() -> str:
    """Return the LM Studio OpenAI-compatible base URL without trailing slashes.

    ``LMSTUDIO_BASE_URL`` wins when set to a non-empty value; otherwise the
    default from ``OPENAI_COMPATIBLE_PROVIDERS`` is used.
    """
    configured = os.getenv("LMSTUDIO_BASE_URL")
    if not configured:
        configured = OPENAI_COMPATIBLE_PROVIDERS["lmstudio"]["base_url"]
    return configured.rstrip("/")
202
+
203
+
204
def lmstudio_native_api_base() -> str:
    """Return the LM Studio base URL with a trailing ``/v1`` segment removed.

    The module's native LM Studio calls append ``/api/v1/...`` themselves, so
    the OpenAI-compatible ``/v1`` suffix has to be dropped first.
    """
    openai_base = lmstudio_api_base()
    if openai_base.endswith("/v1"):
        return openai_base[: -len("/v1")]
    return openai_base
207
+
208
+
209
def ensure_lmstudio_server() -> None:
    """Make sure the LM Studio local server answers, starting it if necessary.

    The server is probed via its native ``/api/v1/models`` endpoint. If the
    probe fails, ``lms server start`` is launched detached and the endpoint is
    polled once per second for up to 45 seconds.

    Raises:
        HTTPException: 400 when the LM Studio CLI cannot be found; 500 when
            the CLI cannot be launched or the server never becomes reachable.
    """
    base_url = lmstudio_native_api_base()
    # Probe with the same credential every other LM Studio call in this module
    # sends. A hard-coded token makes a key-protected but running server look
    # down, which would trigger a pointless start attempt and a 45 s wait.
    probe_headers = {"Authorization": f"Bearer {os.getenv('LMSTUDIO_API_KEY') or 'lmstudio'}"}

    def server_responds() -> bool:
        try:
            _json_request(f"{base_url}/api/v1/models", headers=probe_headers, timeout=2.5)
        except Exception:
            # Any failure (refused connection, timeout, HTTP error) counts as "not ready".
            return False
        return True

    if server_responds():
        return

    cli = find_lmstudio_cli()
    if not cli:
        raise HTTPException(status_code=400, detail="LM Studio CLI를 찾지 못했습니다. LM Studio를 설치한 뒤 다시 시도하세요.")

    try:
        subprocess.Popen(
            [cli, "server", "start"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            start_new_session=True,
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"LM Studio 서버 시작 실패: {e}") from e

    # Monotonic clock: the deadline must not move if the wall clock is adjusted.
    deadline = time.monotonic() + 45
    while time.monotonic() < deadline:
        if server_responds():
            return
        time.sleep(1)
    raise HTTPException(status_code=500, detail="LM Studio Local Server를 자동으로 시작하지 못했습니다.")
239
+
240
+
241
_LMSTUDIO_MODELS_CACHE: List[Dict[str, object]] = []
# "Never fetched" marker. ``time.monotonic()`` has an undefined reference
# point, so 0.0 is not guaranteed to be older than the TTL; -inf always is.
_LMSTUDIO_MODELS_CACHE_TS: float = float("-inf")
_LMSTUDIO_MODELS_CACHE_TTL: float = 10.0


def get_lmstudio_models(*, force: bool = False) -> List[Dict[str, object]]:
    """Return LM Studio's model list, cached for a short time.

    Args:
        force: Skip the freshness check and query the server again.

    Returns:
        The ``models`` list from LM Studio's native ``/api/v1/models``
        response (an empty list when that field is not a list). If the
        request fails, the most recently cached list is returned instead and
        the cache timestamp is left untouched, so the next call retries.
    """
    global _LMSTUDIO_MODELS_CACHE, _LMSTUDIO_MODELS_CACHE_TS
    cache_age = time.monotonic() - _LMSTUDIO_MODELS_CACHE_TS
    if not force and cache_age < _LMSTUDIO_MODELS_CACHE_TTL:
        return _LMSTUDIO_MODELS_CACHE
    try:
        payload = _json_request(
            f"{lmstudio_native_api_base()}/api/v1/models",
            headers={"Authorization": f"Bearer {os.getenv('LMSTUDIO_API_KEY') or 'lmstudio'}"},
            timeout=2.5,
        )
    except Exception:
        # Best effort: an unreachable server must not break callers.
        return _LMSTUDIO_MODELS_CACHE
    models = payload.get("models")
    _LMSTUDIO_MODELS_CACHE = models if isinstance(models, list) else []
    _LMSTUDIO_MODELS_CACHE_TS = time.monotonic()
    return _LMSTUDIO_MODELS_CACHE
262
+
263
+
264
+ def _lmstudio_candidate_keys(model_name: str) -> List[str]:
265
+ raw = model_name.strip()
266
+ if not raw:
267
+ return []
268
+ slug = raw.split("/")[-1].lower()
269
+ slug = slug.replace("-gguf", "").replace("-awq", "")
270
+ parts = [p for p in slug.split("-") if p]
271
+ candidates = [raw.lower(), slug]
272
+ if parts:
273
+ candidates.append("-".join(parts[: min(4, len(parts))]))
274
+ return list(dict.fromkeys(candidates))
275
+
276
+
277
+ def _find_lmstudio_model_key(model_name: str, models: List[Dict[str, object]]) -> Optional[str]:
278
+ if not models:
279
+ return None
280
+ candidate_keys = _lmstudio_candidate_keys(model_name)
281
+ exact = []
282
+ fuzzy = []
283
+ for item in models:
284
+ if not isinstance(item, dict):
285
+ continue
286
+ key = str(item.get("key") or "").strip()
287
+ display_name = str(item.get("display_name") or "").strip()
288
+ haystacks = [key.lower(), display_name.lower()]
289
+ if any(raw == key.lower() for raw in candidate_keys):
290
+ exact.append(key)
291
+ continue
292
+ if any(token and token in hay for token in candidate_keys for hay in haystacks):
293
+ fuzzy.append(key)
294
+ return (exact or fuzzy or [None])[0]
295
+
296
+
297
def ensure_lmstudio_model(model_name: str) -> Dict[str, object]:
    """Make sure ``model_name`` is downloaded and loaded in LM Studio.

    Steps: ensure the server is up, resolve the name against the known model
    list, request a download (polling the job for up to an hour) when the
    model is unknown, then load it with a 4096-token context unless an
    instance is already loaded.

    Returns:
        A dict with ``provider``, ``model``, ``resolved_model``,
        ``server_ready`` and ``cached``; ``instance_id`` is included when a
        load was performed.

    Raises:
        HTTPException: 500 for any download/load failure reported by or while
            talking to LM Studio, 408 when the download does not finish in
            time, plus whatever ``ensure_lmstudio_server`` raises.
    """
    ensure_lmstudio_server()
    api_base = lmstudio_native_api_base()
    auth_header = {"Authorization": f"Bearer {os.getenv('LMSTUDIO_API_KEY') or 'lmstudio'}"}
    models = get_lmstudio_models()
    found_key = _find_lmstudio_model_key(model_name, models)
    model_key = found_key or model_name

    if not found_key:
        try:
            job = _json_request(
                f"{api_base}/api/v1/models/download",
                method="POST",
                payload={"model": model_name},
                headers=auth_header,
                timeout=30,
            )
        except urllib.error.HTTPError as e:
            detail = e.read().decode("utf-8", errors="replace")[-2000:]
            raise HTTPException(status_code=500, detail=f"LM Studio 모델 다운로드 실패: {detail or e.reason}") from e
        except Exception as e:
            raise HTTPException(status_code=500, detail=f"LM Studio 모델 다운로드 실패: {e}") from e

        status = str(job.get("status") or "")
        job_id = str(job.get("job_id") or "")
        if status not in {"completed", "already_downloaded"} and job_id:
            # Monotonic clock so wall-clock adjustments cannot stretch or cut the hour.
            deadline = time.monotonic() + 3600
            while time.monotonic() < deadline:
                try:
                    polled = _json_request(
                        f"{api_base}/api/v1/models/download/status/{job_id}",
                        headers=auth_header,
                        timeout=30,
                    )
                except Exception as e:
                    # Report poll failures the same way as the other LM Studio
                    # calls here instead of leaking a raw urllib exception.
                    raise HTTPException(status_code=500, detail=f"LM Studio 모델 다운로드 실패: {e}") from e
                polled_status = str(polled.get("status") or "")
                if polled_status == "completed":
                    break
                if polled_status == "failed":
                    raise HTTPException(status_code=500, detail=f"LM Studio 모델 다운로드 실패: {polled}")
                time.sleep(2)
            else:
                # Loop ended without ``break``: the deadline passed.
                raise HTTPException(status_code=408, detail="LM Studio 모델 다운로드 시간이 초과되었습니다.")

        # The download changed the catalogue, so bypass the cache.
        models = get_lmstudio_models(force=True)
        model_key = _find_lmstudio_model_key(model_name, models) or model_name

    target = next((item for item in models if isinstance(item, dict) and item.get("key") == model_key), None)
    loaded_instances = target.get("loaded_instances") if isinstance(target, dict) else None
    if loaded_instances:
        return {"provider": "lmstudio", "model": model_name, "resolved_model": model_key, "server_ready": True, "cached": True}

    try:
        loaded = _json_request(
            f"{api_base}/api/v1/models/load",
            method="POST",
            payload={"model": model_key, "context_length": 4096},
            headers=auth_header,
            timeout=120,
        )
    except urllib.error.HTTPError as e:
        detail = e.read().decode("utf-8", errors="replace")[-2000:]
        raise HTTPException(status_code=500, detail=f"LM Studio 모델 로드 실패: {detail or e.reason}") from e
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"LM Studio 모델 로드 실패: {e}") from e

    if str(loaded.get("status") or "") != "loaded":
        raise HTTPException(status_code=500, detail=f"LM Studio 모델 로드 실패: {loaded}")

    return {
        "provider": "lmstudio",
        "model": model_name,
        "resolved_model": model_key,
        "instance_id": loaded.get("instance_id"),
        "server_ready": True,
        "cached": False,
    }
371
+
372
def engine_support_status(engine: str) -> Dict[str, object]:
    """Report whether automatic installation of ``engine`` is supported here.

    Only ``vllm`` has platform restrictions; every other engine is reported
    as supported. For vLLM the checks are, in order: Windows (unsupported),
    Intel macOS (unsupported), Python >= 3.13 on Apple Silicon (supported via
    the dedicated Metal runtime), Python >= 3.13 elsewhere (unsupported).

    Returns:
        ``{"supported": bool, "reason": Optional[str]}`` where ``reason`` is a
        user-facing explanation or ``None``.
    """
    if engine != "vllm":
        return {"supported": True, "reason": None}
    is_apple_silicon = sys.platform == "darwin" and platform.machine() == "arm64"
    if sys.platform.startswith("win"):
        return {"supported": False, "reason": "vLLM은 Windows native 자동 설치보다 WSL2/Linux 환경을 권장합니다."}
    if sys.platform == "darwin" and not is_apple_silicon:
        return {"supported": False, "reason": "vLLM Metal 자동 설치는 Apple Silicon macOS에서만 지원됩니다."}
    if sys.version_info >= (3, 13) and is_apple_silicon:
        return {"supported": True, "reason": "현재 환경에서는 vLLM Metal 전용 런타임으로 설치합니다."}
    if sys.version_info >= (3, 13):
        # This branch rejects 3.13 itself, so the message must name 3.12 as
        # the highest accepted version.
        return {"supported": False, "reason": "vLLM 설치는 현재 Python 3.12 이하 또는 별도 전용 런타임이 필요합니다."}
    return {"supported": True, "reason": None}
385
+
386
def hf_model_ready(repo_id: str, provider: str = "local_mlx") -> bool:
    """Tell whether a Hugging Face model is already available on disk.

    * No local model directory: only ``vllm`` can still be ready, namely when
      the standard Hugging Face hub cache holds at least one snapshot.
    * ``llamacpp``: ready when any ``*.gguf`` exists anywhere under the
      model directory.
    * Everything else: requires ``config.json``, a top-level
      ``*.safetensors``/``*.bin`` weight file, and one of the tokenizer files.
    """
    model_dir = hf_model_dir(repo_id)
    directory_present = model_dir.exists() and model_dir.is_dir()

    if not directory_present:
        if provider != "vllm":
            return False
        hub_repo = Path.home() / ".cache" / "huggingface" / "hub" / f"models--{repo_id.replace('/', '--')}"
        return hub_repo.exists() and any(hub_repo.glob("snapshots/*"))

    if provider == "llamacpp":
        return any(model_dir.rglob("*.gguf"))

    if not (model_dir / "config.json").exists():
        return False
    if not (any(model_dir.glob("*.safetensors")) or any(model_dir.glob("*.bin"))):
        return False
    tokenizer_names = ("tokenizer.json", "tokenizer.model", "tokenizer_config.json")
    return any((model_dir / name).exists() for name in tokenizer_names)
405
+
406
+
407
def model_download_progress_payload(
    stage: str,
    message: str,
    *,
    percent: Optional[float] = None,
    detail: Optional[str] = None,
    downloaded_bytes: Optional[int] = None,
    total_bytes: Optional[int] = None,
    eta_seconds: Optional[float] = None,
    file: Optional[str] = None,
    indeterminate: bool = False,
) -> Dict[str, object]:
    """Build one progress event for a model download.

    ``stage``, ``message``, ``indeterminate`` and a wall-clock ``ts`` are
    always present. Optional fields are added only when supplied (``detail``
    and ``file`` only when non-empty) and are normalised: ``percent`` is
    rounded to one decimal and clamped to 0..100, byte counts are floored at
    zero, and ``eta_seconds`` is rounded to a non-negative whole number.
    """
    payload: Dict[str, object] = {
        "stage": stage,
        "message": message,
        "indeterminate": indeterminate,
        "ts": time.time(),
    }
    optional_entries = [
        ("percent", None if percent is None else max(0, min(100, round(float(percent), 1)))),
        ("detail", detail or None),
        ("downloaded_bytes", None if downloaded_bytes is None else max(0, int(downloaded_bytes))),
        ("total_bytes", None if total_bytes is None else max(0, int(total_bytes))),
        ("eta_seconds", None if eta_seconds is None else max(0, round(float(eta_seconds)))),
        ("file", file or None),
    ]
    for field, value in optional_entries:
        if value is not None:
            payload[field] = value
    return payload
438
+
439
+
440
def estimate_eta_seconds(started_at: float, percent: Optional[float]) -> Optional[float]:
    """Extrapolate the remaining seconds from elapsed time and percent done.

    ``started_at`` is a ``time.time()`` timestamp. Returns ``None`` when the
    percentage is missing or outside the open interval (0, 100), because no
    meaningful linear estimate exists there.
    """
    if percent is None:
        return None
    if percent <= 0 or percent >= 100:
        return None
    elapsed = max(0.0, time.time() - started_at)
    return elapsed * (100.0 - percent) / percent
445
+
446
+
447
def hf_repo_files_with_sizes(repo_id: str) -> List[Dict[str, object]]:
    """List a Hugging Face repo's files as ``{"name", "size"}`` dicts.

    Sizes come from ``model_info(..., files_metadata=True)``. When that call
    raises, or yields no files, the plain file listing is used instead and
    every size is reported as 0. A ``TypeError`` from ``model_info`` is
    swallowed silently; other errors are logged as warnings.
    """
    from huggingface_hub import HfApi

    client = HfApi()
    try:
        info = client.model_info(repo_id, files_metadata=True)
        named_siblings = (
            (str(getattr(sibling, "rfilename", "") or "").strip(), sibling)
            for sibling in (getattr(info, "siblings", []) or [])
        )
        with_sizes = [
            {"name": name, "size": int(getattr(sibling, "size", 0) or 0)}
            for name, sibling in named_siblings
            if name and not name.endswith("/")
        ]
        if with_sizes:
            return with_sizes
    except TypeError:
        pass
    except Exception as e:
        logging.warning("huggingface model_info failed for %s: %s", repo_id, e)

    fallback = []
    for name in client.list_repo_files(repo_id):
        if str(name).strip():
            fallback.append({"name": str(name), "size": 0})
    return fallback
467
+
468
+
469
def _select_hf_download_files(all_files: List[Dict[str, object]], provider: str) -> List[Dict[str, object]]:
    """Choose which repo files to fetch: one preferred GGUF for llama.cpp, else all files."""
    if provider != "llamacpp":
        return all_files
    ggufs = sorted(
        (item for item in all_files if str(item["name"]).lower().endswith(".gguf")),
        key=lambda item: str(item["name"]),
    )
    if not ggufs:
        raise RuntimeError("GGUF 파일을 찾지 못했습니다.")
    # Quantisations in order of preference; fall back to the first GGUF by name.
    preference = ("q4_k_m", "q4_0", "q4_k_s", "q3_k_m", "q2_k")
    chosen = next(
        (item for pref in preference for item in ggufs if pref in str(item["name"]).lower()),
        ggufs[0],
    )
    return [chosen]


def _hf_progress_tqdm_class(
    progress_emit,
    *,
    filename: str,
    size: int,
    index: int,
    total_files: int,
    downloaded_before: int,
    total_bytes: Optional[int],
    started_at: float,
):
    """Build a tqdm subclass that forwards one file's byte progress to ``progress_emit``.

    Returns ``None`` when tqdm cannot be imported, in which case the download
    simply runs without fine-grained progress events.
    """
    try:
        from tqdm.auto import tqdm as base_tqdm
    except Exception:
        return None

    last_emit = {"at": 0.0, "percent": -1.0}

    def emit_byte_progress(done_bytes: float) -> None:
        done = max(0, int(done_bytes or 0))
        if total_bytes:
            aggregate = min(total_bytes, downloaded_before + done)
            percent = (aggregate / total_bytes) * 100
        else:
            # Total size unknown: weight every file equally.
            file_total = size or done
            file_ratio = min(1.0, done / file_total) if file_total else 0.0
            aggregate = downloaded_before + done
            percent = ((index - 1) + file_ratio) / total_files * 100
        now = time.time()
        # Throttle: skip updates that are both recent and barely different.
        if percent < 100 and now - last_emit["at"] < 0.5 and percent - last_emit["percent"] < 0.3:
            return
        last_emit["at"] = now
        last_emit["percent"] = percent
        progress_emit(model_download_progress_payload(
            "download",
            "모델 다운로드 중입니다.",
            percent=percent,
            detail=filename,
            downloaded_bytes=aggregate,
            total_bytes=total_bytes,
            eta_seconds=estimate_eta_seconds(started_at, percent),
            file=filename,
            indeterminate=total_bytes is None and total_files <= 1,
        ))

    class ProgressTqdm(base_tqdm):
        def update(self, n=1):
            result = super().update(n)
            emit_byte_progress(float(getattr(self, "n", 0) or 0))
            return result

    return ProgressTqdm


def download_hf_model(
    repo_id: str,
    provider: str = "local_mlx",
    progress_emit=None,
) -> Dict[str, object]:
    """Download a Hugging Face model into the local model directory.

    Args:
        repo_id: Hugging Face repository id.
        provider: Target engine. ``llamacpp`` downloads a single preferred
            GGUF file; every other provider downloads all repo files.
        progress_emit: Optional callable receiving progress payload dicts.

    Returns:
        ``{"model", "path", "cached"}``; ``cached`` is True when the model was
        already present and nothing was downloaded.

    Raises:
        HTTPException: 400 when ``huggingface_hub`` is not installed; 500 when
            the download fails or the files are still incomplete afterwards.
    """
    if importlib.util.find_spec("huggingface_hub") is None:
        raise HTTPException(status_code=400, detail="huggingface_hub가 없습니다. 먼저 MLX runtime 설치를 진행해 주세요.")

    target_dir = hf_model_dir(repo_id)
    if hf_model_ready(repo_id, provider):
        if progress_emit:
            progress_emit(model_download_progress_payload(
                "download",
                "이미 다운로드된 모델을 확인했습니다.",
                percent=100,
                downloaded_bytes=0,
                total_bytes=0,
                eta_seconds=0,
            ))
        return {"model": repo_id, "path": str(target_dir), "cached": True}

    target_dir.mkdir(parents=True, exist_ok=True)
    try:
        from huggingface_hub import hf_hub_download

        started_at = time.time()
        selected_files = _select_hf_download_files(hf_repo_files_with_sizes(repo_id), provider)

        # ``None`` means "size unknown": progress then falls back to per-file counting.
        total_bytes = sum(int(item.get("size") or 0) for item in selected_files) or None
        downloaded_bytes = 0
        total_files = max(1, len(selected_files))
        if progress_emit:
            progress_emit(model_download_progress_payload(
                "download",
                "모델 파일 정보를 확인했습니다.",
                percent=0,
                downloaded_bytes=0,
                total_bytes=total_bytes,
                indeterminate=total_bytes is None,
            ))

        for index, item in enumerate(selected_files, start=1):
            filename = str(item["name"])
            size = int(item.get("size") or 0)
            tqdm_class = None
            if progress_emit:
                current_percent = (
                    (downloaded_bytes / total_bytes) * 100 if total_bytes else ((index - 1) / total_files) * 100
                )
                progress_emit(model_download_progress_payload(
                    "download",
                    "모델 다운로드 중입니다.",
                    percent=current_percent,
                    detail=filename,
                    downloaded_bytes=downloaded_bytes,
                    total_bytes=total_bytes,
                    eta_seconds=estimate_eta_seconds(started_at, current_percent),
                    file=filename,
                    indeterminate=total_bytes is None and total_files <= 1,
                ))
                # The byte-level hook calls ``progress_emit``, so it is only
                # installed when a callback was actually supplied.
                tqdm_class = _hf_progress_tqdm_class(
                    progress_emit,
                    filename=filename,
                    size=size,
                    index=index,
                    total_files=total_files,
                    downloaded_before=downloaded_bytes,
                    total_bytes=total_bytes,
                    started_at=started_at,
                )
            local_path = hf_hub_download(
                repo_id=repo_id,
                filename=filename,
                local_dir=str(target_dir),
                tqdm_class=tqdm_class,
            )
            if size <= 0:
                # No size in the repo metadata: measure what landed on disk.
                try:
                    size = Path(local_path).stat().st_size
                except OSError:
                    size = 0
            downloaded_bytes += size
            if progress_emit:
                current_percent = (
                    (downloaded_bytes / total_bytes) * 100 if total_bytes else (index / total_files) * 100
                )
                progress_emit(model_download_progress_payload(
                    "download",
                    "모델 다운로드 중입니다.",
                    percent=current_percent,
                    detail=filename,
                    downloaded_bytes=downloaded_bytes,
                    total_bytes=total_bytes,
                    eta_seconds=estimate_eta_seconds(started_at, current_percent),
                    file=filename,
                    indeterminate=False,
                ))

        if progress_emit:
            progress_emit(model_download_progress_payload(
                "download",
                "모델 다운로드가 완료되었습니다.",
                percent=100,
                downloaded_bytes=downloaded_bytes,
                total_bytes=total_bytes or downloaded_bytes,
                eta_seconds=0,
            ))
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"{repo_id} 다운로드 실패: {str(e)[-2000:]}") from e

    if not hf_model_ready(repo_id, provider):
        raise HTTPException(status_code=500, detail=f"{repo_id} 다운로드가 완료되지 않았습니다. 모델 파일을 찾지 못했습니다.")

    return {"model": repo_id, "path": str(target_dir), "cached": False}
631
+
632
+
633
def pull_ollama_model_with_progress(model_name: str, progress_emit=None) -> Dict[str, object]:
    """Run ``ollama pull`` and translate its console output into progress events.

    Args:
        model_name: Model tag passed to ``ollama pull``.
        progress_emit: Optional callable receiving progress payload dicts.

    Returns:
        ``{"provider": "ollama", "model": model_name, "returncode": 0}``.

    Raises:
        HTTPException: 400 when the ollama binary is missing; 500 (carrying
            the last lines of output) when the pull exits non-zero.
    """
    from collections import deque

    ollama = local_binary("ollama")
    if not ollama:
        raise HTTPException(status_code=400, detail="Ollama가 설치되지 않았습니다.")
    started_at = time.time()
    if progress_emit:
        progress_emit(model_download_progress_payload(
            "download",
            "Ollama 모델 다운로드를 시작합니다.",
            percent=0,
            detail=model_name,
            indeterminate=True,
        ))
    process = subprocess.Popen(
        [ollama, "pull", model_name],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        # Decode explicitly: the locale default codec can fail on the
        # non-ASCII progress-bar characters ollama prints.
        encoding="utf-8",
        errors="replace",
        bufsize=1,
    )
    last_percent: Optional[float] = None
    # Only the tail is ever reported, so do not retain the whole (potentially
    # very chatty) progress stream.
    recent_lines = deque(maxlen=12)
    try:
        if process.stdout is None:
            raise RuntimeError("ollama pull did not expose an output stream")
        for raw_line in process.stdout:
            for part in re.split(r"[\r\n]+", raw_line):
                line = part.strip()
                if not line:
                    continue
                recent_lines.append(line)
                match = re.search(r"(\d{1,3}(?:\.\d+)?)\s*%", line)
                if match:
                    last_percent = min(100.0, float(match.group(1)))
                if progress_emit:
                    # Lines without a percentage repeat the last known value.
                    progress_emit(model_download_progress_payload(
                        "download",
                        "Ollama 모델 다운로드 중입니다.",
                        percent=last_percent,
                        detail=line[-180:],
                        eta_seconds=estimate_eta_seconds(started_at, last_percent),
                        indeterminate=last_percent is None,
                    ))
        returncode = process.wait()
    except Exception:
        process.kill()
        # Reap the child so it does not linger as a zombie.
        process.wait()
        raise
    finally:
        if process.stdout is not None:
            process.stdout.close()

    if returncode != 0:
        tail = "\n".join(recent_lines)
        raise HTTPException(status_code=500, detail=tail[-2000:] or "Ollama 모델 다운로드 실패")

    if progress_emit:
        progress_emit(model_download_progress_payload(
            "download",
            "Ollama 모델 다운로드가 완료되었습니다.",
            percent=100,
            detail=model_name,
            eta_seconds=0,
            indeterminate=False,
        ))
    return {"provider": "ollama", "model": model_name, "returncode": returncode}
703
+
704
+
705
def get_ollama_pulled_models() -> set:
    """Return the names of models listed by ``ollama list``.

    The first output line is skipped as the table header and the first
    whitespace-separated column of each remaining line is taken as the model
    name. Any failure (missing binary, timeout, unexpected error) yields an
    empty set.
    """
    ollama = local_binary("ollama")
    if not ollama:
        return set()
    try:
        listing = subprocess.run([ollama, "list"], capture_output=True, text=True, timeout=5, check=False)
        rows = (row.split() for row in listing.stdout.splitlines()[1:])
        return {columns[0] for columns in rows if columns}
    except Exception:
        return set()
719
+
720
+
721
def get_openai_compatible_server_models(provider: str) -> List[str]:
    """List the model ids currently served by a local OpenAI-compatible server.

    For ``lmstudio`` only loaded models are reported: the ids of their loaded
    instances, or the model key when no instance carries an id. For every
    other provider the server's ``/models`` endpoint is queried. This is a
    best-effort probe: an unreachable server, a protocol error, or an
    unexpected response shape all yield an empty list rather than an
    exception, so readiness polling can keep retrying.
    """
    import http.client

    if provider == "lmstudio":
        models = []
        for item in get_lmstudio_models():
            if not isinstance(item, dict):
                continue
            loaded_instances = item.get("loaded_instances") or []
            if not loaded_instances:
                continue
            key = str(item.get("key") or "").strip()
            instance_ids = [
                str(instance.get("id") or "").strip()
                for instance in loaded_instances
                if isinstance(instance, dict) and instance.get("id")
            ]
            models.extend(instance_ids or ([key] if key else []))
        return list(dict.fromkeys(model for model in models if model))

    config = OPENAI_COMPATIBLE_PROVIDERS.get(provider) or {}
    base_url = os.getenv(config.get("base_url_env", "")) if config.get("base_url_env") else None
    base_url = (base_url or config.get("base_url") or "").rstrip("/")
    if not base_url:
        return []

    api_key = os.getenv(config.get("env_key", "")) or config.get("api_key_fallback") or provider
    req = urllib.request.Request(
        f"{base_url}/models",
        headers={"Authorization": f"Bearer {api_key}"},
        method="GET",
    )
    try:
        with urllib.request.urlopen(req, timeout=2.5) as res:
            payload = json.loads(res.read().decode("utf-8", errors="replace"))
    except (OSError, ValueError, http.client.HTTPException):
        # OSError covers URLError/timeouts, ValueError covers bad JSON, and
        # HTTPException covers malformed replies from a half-started server.
        return []

    entries = payload.get("data") if isinstance(payload, dict) else None
    if not isinstance(entries, list):
        return []
    models = []
    for item in entries:
        model_id = item.get("id") if isinstance(item, dict) else None
        if model_id:
            models.append(str(model_id))
    return models
762
+
763
+
764
def ensure_ollama_server() -> None:
    """Make sure the Ollama daemon answers, launching ``ollama serve`` if not.

    Readiness is judged by ``ollama list`` exiting with status 0. After a
    launch the check is repeated every half second for up to 20 seconds.

    Raises:
        HTTPException: 400 when the ollama binary is missing; 500 when the
            daemon does not become ready in time.
    """
    ollama = local_binary("ollama")
    if not ollama:
        raise HTTPException(status_code=400, detail="Ollama가 설치되지 않았습니다.")

    def list_succeeds() -> bool:
        try:
            outcome = subprocess.run([ollama, "list"], capture_output=True, text=True, timeout=3, check=False)
        except Exception:
            return False
        return outcome.returncode == 0

    if list_succeeds():
        return

    subprocess.Popen(
        [ollama, "serve"],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        start_new_session=True,
    )
    deadline = time.time() + 20
    while time.time() < deadline:
        if list_succeeds():
            return
        time.sleep(0.5)
    raise HTTPException(status_code=500, detail="Ollama 서버를 자동으로 시작하지 못했습니다.")
790
+
791
+
792
def wait_for_openai_compatible_server(provider: str, model_name: Optional[str] = None, timeout: int = 45) -> bool:
    """Poll a provider's server once per second until it serves models.

    Succeeds as soon as the server lists at least one model and, when
    ``model_name`` is given, that model is among them. Returns ``False`` if
    this has not happened within ``timeout`` seconds.
    """
    give_up_at = time.time() + timeout
    while time.time() < give_up_at:
        available = get_openai_compatible_server_models(provider)
        if available:
            if not model_name or model_name in available:
                return True
        time.sleep(1)
    return False
800
+
801
+
802
def ensure_vllm_server(model_name: str) -> None:
    """Make sure a vLLM server on 127.0.0.1:8000 is serving ``model_name``.

    Nothing happens when the model is already served. Otherwise a vLLM server
    previously started by this module is stopped, the model is downloaded if
    required (not needed for the vLLM Metal runtime, which receives the repo
    id), a new server is launched detached, and the function waits up to 90
    seconds for the model to appear.

    Raises:
        HTTPException: 400 when no vLLM runtime is installed; 409 when a vLLM
            server not started by this module is already serving other
            models; 500 when the new server does not load the model in time.
    """
    served_models = get_openai_compatible_server_models("vllm")
    if model_name in served_models:
        return
    vllm_bin = vllm_executable()
    vllm_metal_py = vllm_metal_python()
    if not vllm_bin and not vllm_metal_py and importlib.util.find_spec("vllm") is None:
        raise HTTPException(status_code=400, detail="vLLM runtime이 설치되지 않았습니다.")

    local_dir = hf_model_dir(model_name)
    if not vllm_metal_py and not hf_model_ready(model_name, "vllm"):
        download_hf_model(model_name, "vllm")

    running = LOCAL_SERVER_PROCESSES.get("vllm")
    if running and running.poll() is None:
        running.terminate()
        try:
            running.wait(timeout=10)
        except subprocess.TimeoutExpired:
            running.kill()
            # Reap the killed process. Without this, a liveness check right
            # after ``kill()`` can still see it as running and the function
            # would return without ever starting the replacement server.
            running.wait()
    elif served_models:
        raise HTTPException(status_code=409, detail="다른 vLLM 서버가 이미 실행 중입니다. 현재 서버를 종료한 뒤 다시 시도하세요.")

    # ``hf_model_ready`` also accepts a model that exists only in the Hugging
    # Face hub cache; in that case there is no local directory to point at,
    # so hand vLLM the repo id instead of a nonexistent path.
    model_source = str(local_dir) if local_dir.is_dir() else model_name

    _host_args = ["--host", "127.0.0.1", "--port", "8000"]
    if vllm_metal_py:
        command = [vllm_metal_py, "-m", "vllm_metal.server", "--model", model_name, *_host_args]
    elif vllm_bin:
        command = [vllm_bin, "serve", model_source, "--served-model-name", model_name, *_host_args]
    else:
        command = [sys.executable, "-m", "vllm.entrypoints.openai.api_server", "--model", model_source, "--served-model-name", model_name, *_host_args]
    LOCAL_SERVER_PROCESSES["vllm"] = subprocess.Popen(
        command,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        start_new_session=True,
    )
    if not wait_for_openai_compatible_server("vllm", model_name, timeout=90):
        raise HTTPException(status_code=500, detail="vLLM 서버가 모델을 자동 로드하지 못했습니다.")
844
+
845
+
846
def ensure_llamacpp_server(model_name: str) -> None:
    """Ensure a llama.cpp server is serving ``model_name`` on 127.0.0.1:8080.

    Returns immediately when the llama.cpp endpoint already lists the model.
    Otherwise a server previously spawned by this process is stopped, the
    model is downloaded when needed, a GGUF file is picked (preferring a
    ``q4_k_m`` file name, else the first in sorted order) and a new
    ``llama-server`` process is launched and recorded in
    ``LOCAL_SERVER_PROCESSES``.

    Args:
        model_name: Model identifier; also used as the server ``--alias``.

    Raises:
        HTTPException: 409 when models are being served but no live process is
            tracked in ``LOCAL_SERVER_PROCESSES``; 400 when ``llama-server``
            is not on PATH; 500 when no GGUF file is found or the server does
            not list the model within 45 seconds.
    """
    served_models = get_openai_compatible_server_models("llamacpp")
    if model_name in served_models:
        return
    running = LOCAL_SERVER_PROCESSES.get("llamacpp")
    if running and running.poll() is None:
        running.terminate()
        try:
            running.wait(timeout=10)
        except subprocess.TimeoutExpired:
            running.kill()
            # Reap the killed process so it does not linger as a zombie and
            # has released port 8080 before the replacement server starts.
            running.wait()
    elif served_models:
        raise HTTPException(status_code=409, detail="다른 llama.cpp 서버가 이미 실행 중입니다. 현재 서버를 종료한 뒤 다시 시도하세요.")
    if not shutil.which("llama-server"):
        raise HTTPException(status_code=400, detail="llama.cpp가 설치되지 않았습니다.")
    if not hf_model_ready(model_name, "llamacpp"):
        download_hf_model(model_name, "llamacpp")

    gguf_files = sorted(hf_model_dir(model_name).rglob("*.gguf"))
    if not gguf_files:
        raise HTTPException(status_code=500, detail="다운로드된 GGUF 파일을 찾지 못했습니다.")

    # Prefer a q4_k_m file when one exists; otherwise take the first file.
    preferred = next((p for p in gguf_files if "q4_k_m" in p.name.lower()), None)
    model_file = preferred or gguf_files[0]
    LOCAL_SERVER_PROCESSES["llamacpp"] = subprocess.Popen(
        [
            "llama-server",
            "-m",
            str(model_file),
            "--alias",
            model_name,
            "--host",
            "127.0.0.1",
            "--port",
            "8080",
        ],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        # Start in its own session so the server is detached from this process group.
        start_new_session=True,
    )
    if not wait_for_openai_compatible_server("llamacpp", model_name, timeout=45):
        raise HTTPException(status_code=500, detail="llama.cpp 서버가 모델을 자동 로드하지 못했습니다.")
888
+
889
+
890
def engine_installed(engine: str) -> bool:
    """Report whether the runtime behind ``engine`` can be found on this machine.

    Local engines are detected through importable modules, binaries or
    application bundles; the cloud providers only need the ``AsyncOpenAI``
    client to be importable. Unknown engine names yield False.
    """
    find_spec = importlib.util.find_spec

    if engine in {"openai", "openrouter", "groq", "together", "xai"}:
        return AsyncOpenAI is not None
    if engine == "llamacpp":
        return shutil.which("llama-server") is not None
    if engine == "lmstudio":
        if find_lmstudio_cli() is not None:
            return True
        return Path("/Applications/LM Studio.app").exists()
    if engine == "vllm":
        # Any one of the three vLLM flavours is enough; probes run lazily in order.
        probes = (vllm_metal_python, vllm_executable, lambda: find_spec("vllm"))
        return any(probe() is not None for probe in probes)
    if engine == "ollama":
        return local_binary("ollama") is not None
    if engine == "local_mlx":
        # Both the core package and the LM helpers must be importable.
        return find_spec("mlx") is not None and find_spec("mlx_lm") is not None
    return False
904
+
905
def engine_status() -> List[Dict]:
    """Build the list of engine descriptors (local, local-server and cloud).

    Each descriptor carries the engine id/name/kind, whether it is installed
    and installable, and its model list. Catalog models are annotated with a
    ``pulled`` flag; cloud models are annotated with the cached verification
    result from ``CLOUD_VERIFY_CACHE`` when one exists.
    """
    cloud_by_provider = {}
    for cloud_entry in router.detected_cloud_models():
        cloud_by_provider.setdefault(cloud_entry["provider"], []).append(cloud_entry)

    def hf_catalog_models(engine_key: str, id_prefix: str) -> list:
        # Catalog entries for a Hugging Face backed engine, flagged with
        # whether the model is already available locally.
        return filter_lower_family_versions([
            {**entry, "pulled": hf_model_ready(entry["id"].removeprefix(id_prefix), engine_key)}
            for entry in ENGINE_MODEL_CATALOG.get(engine_key, [])
        ])

    ollama_installed = engine_installed("ollama")
    pulled = get_ollama_pulled_models() if ollama_installed else set()
    ollama_models = filter_lower_family_versions([
        {**entry, "pulled": entry["id"].removeprefix("ollama:") in pulled}
        for entry in ENGINE_MODEL_CATALOG["ollama"]
    ])

    HF_MODELS_ROOT.mkdir(parents=True, exist_ok=True)
    # MLX catalog ids are used as-is (no provider prefix to strip).
    mlx_models = hf_catalog_models("local_mlx", "")
    vllm_models = hf_catalog_models("vllm", "vllm:")

    # LM Studio: start from what is actually downloaded, then add catalog
    # entries that are not downloaded yet.
    downloaded_lmstudio = get_lmstudio_models()
    downloaded_by_key = {}
    lmstudio_models = []
    for item in downloaded_lmstudio:
        if not isinstance(item, dict):
            continue
        key = str(item.get("key") or "").strip()
        if not key:
            continue
        downloaded_by_key[key] = item
        is_loaded = bool(item.get("loaded_instances") or [])
        lmstudio_models.append({
            "id": f"lmstudio:{key}",
            "name": item.get("display_name") or f"LM Studio · {key}",
            "family": item.get("architecture") or item.get("publisher") or "LM Studio",
            "tag": "loaded-server-model" if is_loaded else "downloaded",
            "size": item.get("params_string") or item.get("format") or "LM Studio",
            "pullable": True,
            "pulled": True,
        })

    lmstudio_catalog = ENGINE_MODEL_CATALOG.get("lmstudio", [])
    if lmstudio_models:
        known_ids = {item["id"] for item in lmstudio_models}
        not_downloaded = []
        for entry in lmstudio_catalog:
            repo_id = entry["id"].removeprefix("lmstudio:")
            if f"lmstudio:{repo_id}" not in known_ids and repo_id not in downloaded_by_key:
                not_downloaded.append(entry)
    else:
        not_downloaded = list(lmstudio_catalog)
    lmstudio_models.extend({**entry, "pulled": False} for entry in not_downloaded)
    lmstudio_models = filter_lower_family_versions(lmstudio_models)

    llamacpp_models = hf_catalog_models("llamacpp", "llamacpp:")

    if downloaded_lmstudio:
        lmstudio_note = "다운로드된 모델은 자동 감지하고, 선택 시 필요하면 다운로드 후 바로 로드합니다."
    else:
        lmstudio_note = "LM Studio 설치 후 모델을 선택하면 Local Server 시작, 다운로드, 로드를 자동으로 진행합니다."

    local_server_specs = [
        {
            "id": "vllm",
            "name": "vLLM",
            "description": "vLLM OpenAI 호환 서버(예: http://localhost:8000/v1)에 연결합니다.",
            "requires": "VLLM_BASE_URL",
            "note": engine_support_status("vllm").get("reason"),
        },
        {
            "id": "lmstudio",
            "name": "LM Studio",
            "description": "LM Studio 로컬 OpenAI 호환 서버에 연결합니다.",
            "requires": "LMSTUDIO_BASE_URL",
            "note": lmstudio_note,
            "server_ready": bool(downloaded_lmstudio),
        },
        {
            "id": "llamacpp",
            "name": "llama.cpp",
            "description": "llama.cpp 서버(OpenAI 호환 /v1)에 연결합니다.",
            "requires": "LLAMACPP_BASE_URL",
        },
    ]

    engines = [
        {
            "id": "local_mlx",
            "name": "MLX",
            "kind": "local",
            "description": "Apple Silicon GPU에서 MLX/MLX-VLM 모델을 직접 실행합니다.",
            "installed": engine_installed("local_mlx"),
            "installable": True,
            "install_label": ENGINE_INSTALLERS["local_mlx"]["label"],
            "models": mlx_models,
        },
        {
            "id": "ollama",
            "name": "Ollama",
            "kind": "local-server",
            "description": "Ollama 로컬 서버를 OpenAI 호환 엔진처럼 사용합니다.",
            "installed": ollama_installed,
            "installable": True,
            "install_label": ENGINE_INSTALLERS["ollama"]["label"],
            "models": ollama_models,
        },
    ]

    models_by_engine = {
        "vllm": vllm_models,
        "lmstudio": lmstudio_models,
        "llamacpp": llamacpp_models,
    }
    for spec in local_server_specs:
        spec_id = spec["id"]
        support = engine_support_status(spec_id)
        installed = engine_installed(spec_id)
        if spec_id in models_by_engine:
            spec_models = models_by_engine[spec_id]
        else:
            spec_models = ENGINE_MODEL_CATALOG.get(spec_id, [])
        engines.append({
            "id": spec_id,
            "name": spec["name"],
            "kind": "local-server",
            "description": spec["description"],
            "installed": installed,
            "supported": support["supported"],
            "support_reason": support["reason"],
            "installable": support["supported"] and spec_id in ENGINE_INSTALLERS,
            "install_label": ENGINE_INSTALLERS.get(spec_id, {}).get("label"),
            "requires": spec["requires"],
            "models": spec_models,
            "note": spec.get("note") or support["reason"] or f"{spec['requires']} 설정 시 활성화됩니다.",
            "server_ready": spec.get("server_ready"),
        })

    def with_verification(model: Dict) -> Dict:
        # Attach the cached verification outcome, or None when never verified.
        cache = CLOUD_VERIFY_CACHE.get(model.get("id"))
        return {
            **model,
            "verified": cache.get("ok") if cache else None,
            "verify_reason": cache.get("reason") if cache else None,
        }

    for provider in ["openai", "openrouter", "groq", "together", "xai"]:
        detected = cloud_by_provider.get(provider, [])
        # First non-empty "requires" value among this provider's models.
        env_key = next((entry.get("requires") for entry in detected if entry.get("requires")), None)
        provider_models = [with_verification(entry) for entry in detected]
        engines.append({
            "id": provider,
            "name": provider.title(),
            "kind": "cloud",
            "description": "OpenAI 호환 Chat Completions API로 cloud LLM을 실행합니다.",
            "installed": engine_installed(provider),
            "installable": True,
            "install_label": ENGINE_INSTALLERS[provider]["label"],
            "requires": env_key,
            "models": provider_models,
        })
    return engines
1064
+
1065
def runtime_features() -> Dict:
    """Return a snapshot of the runtime configuration flags as a plain dict.

    The payload groups general settings at the top level and nests the
    security flags, local-only features and public features in sub-dicts.
    """
    not_public = not IS_PUBLIC_MODE

    security = {
        "host": DEFAULT_HOST,
        "require_auth": REQUIRE_AUTH,
        "invite_gate_enabled": INVITE_GATE_ENABLED,
        "keyring_available": keyring is not None,
        "plaintext_api_keys_allowed": ALLOW_PLAINTEXT_API_KEYS,
        "cors_allow_network": CORS_ALLOW_NETWORK,
    }
    local_only_features = {
        "mlx": ALLOW_LOCAL_MODELS and not_public,
        "telegram_bridge": ENABLE_TELEGRAM,
        "desktop_chrome_bridge": not_public,
        "computer_use_bridge": not_public,
    }
    public_features = {
        "web_ui": True,
        "openai_compatible_models": True,
        "persistent_data_dir": str(DATA_DIR),
    }

    if IS_PUBLIC_MODE:
        default_model = PUBLIC_MODEL
    else:
        default_model = LOCAL_MODEL

    return {
        "mode": APP_MODE,
        "public": IS_PUBLIC_MODE,
        "host": DEFAULT_HOST,
        "port": DEFAULT_PORT,
        "data_dir": str(DATA_DIR),
        "telegram_enabled": ENABLE_TELEGRAM,
        "graph_enabled": ENABLE_GRAPH,
        "autoload_models": AUTOLOAD_MODELS,
        "model_idle_unload_seconds": MODEL_IDLE_UNLOAD_SECONDS,
        "model_memory_policy": router.model_memory_policy(),
        "allow_local_models": ALLOW_LOCAL_MODELS,
        "security": security,
        "default_model": default_model,
        "local_only_features": local_only_features,
        "public_features": public_features,
    }
1099
+
1100
def install_engine(engine: str) -> Dict:
    """Run the registered installer command for ``engine`` and report the outcome.

    On Apple Silicon macOS the vLLM installer is replaced with a shell script
    that creates a ``~/.venv-vllm-metal`` virtualenv and installs
    ``vllm-metal``. After a successful Ollama install, ``ollama serve`` is
    spawned unless ``ollama list`` already succeeds.

    Returns:
        Dict with the command line, return code, trailing stdout/stderr,
        the post-install ``installed`` flag and, for Ollama, ``daemon_started``.

    Raises:
        HTTPException: 400 for an unknown engine or a missing prerequisite
            binary; 408 when the installer exceeds its 900 second timeout.
    """
    if engine not in ENGINE_INSTALLERS:
        raise HTTPException(status_code=400, detail="지원하지 않는 엔진입니다.")
    installer = ENGINE_INSTALLERS[engine]

    required_binary = installer.get("requires_binary")
    if required_binary and shutil.which(required_binary) is None:
        raise HTTPException(status_code=400, detail=f"{required_binary}가 설치되어 있지 않아 자동 설치할 수 없습니다.")

    on_apple_silicon = sys.platform == "darwin" and platform.machine() == "arm64"
    if engine == "vllm" and on_apple_silicon:
        command = [
            "/bin/bash",
            "-lc",
            "set -euo pipefail; "
            "if [ ! -x /opt/homebrew/bin/python3.12 ]; then brew install python@3.12; fi; "
            "/opt/homebrew/bin/python3.12 -m venv ~/.venv-vllm-metal; "
            "~/.venv-vllm-metal/bin/pip install -U pip setuptools wheel; "
            "~/.venv-vllm-metal/bin/pip install vllm-metal",
        ]
    else:
        command = installer["command"]

    try:
        completed = subprocess.run(
            command,
            cwd=str(BASE_DIR),
            capture_output=True,
            text=True,
            timeout=900,
            check=False,
        )
    except subprocess.TimeoutExpired:
        raise HTTPException(status_code=408, detail="엔진 설치 시간이 초과되었습니다.")

    result = {
        "engine": engine,
        "command": " ".join(command),
        "returncode": completed.returncode,
        # Only the tail of the output is kept.
        "stdout": completed.stdout[-12000:],
        "stderr": completed.stderr[-12000:],
        "installed": engine_installed(engine),
    }

    ollama = local_binary("ollama")
    if engine != "ollama" or completed.returncode != 0 or not ollama:
        return result

    # Skip if already running to avoid orphan daemons.
    try:
        probe = subprocess.run([ollama, "list"], capture_output=True, timeout=2, check=False)
    except Exception:
        already_up = False
    else:
        already_up = probe.returncode == 0

    if already_up:
        result["daemon_started"] = "already_running"
        return result

    try:
        # Detach so the daemon survives this request but doesn't become our zombie.
        subprocess.Popen(
            [ollama, "serve"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            start_new_session=True,
        )
    except Exception as e:
        logging.warning("ollama serve spawn failed: %s", e)
        result["daemon_started"] = False
    else:
        result["daemon_started"] = True
    return result
1163
+
1164
+
1165
def _resolve_model_alias(model_id: str, engine: Optional[str] = None) -> str:
    """Map a model alias to the engine-specific model id, if one is registered.

    The provider comes from a recognised ``provider:`` prefix on ``model_id``
    when present, otherwise from the ``engine`` hint (defaulting to
    ``local_mlx``). When ``MODEL_ENGINE_ALIASES`` has no mapping for the
    model/provider pair, the stripped input is returned unchanged.
    """
    raw = model_id.strip()
    hint = (engine or "").strip().lower()

    provider: Optional[str] = None
    bare_name = raw
    head, separator, tail = raw.partition(":")
    if separator:
        head = head.strip().lower()
        if head in {"local_mlx", "mlx"}:
            provider, bare_name = "local_mlx", tail.strip()
        elif head in {"ollama", "vllm", "lmstudio", "llamacpp"}:
            provider, bare_name = head, tail.strip()

    if not provider:
        # No usable prefix: fall back to the engine hint; empty hint means MLX.
        provider = "local_mlx" if hint in {"", "local_mlx", "mlx"} else hint

    per_engine = MODEL_ENGINE_ALIASES.get(bare_name.lower())
    if not per_engine:
        return raw
    target = per_engine.get(provider)
    if not target:
        return raw
    if provider == "local_mlx":
        # MLX ids are returned without a provider prefix.
        return target
    return f"{provider}:{target}"
1184
+
1185
+
1186
def normalize_local_model_request(model_id: str, engine: Optional[str] = None) -> str:
    """Resolve aliases and normalise the provider prefix of a model request.

    For the MLX engine a leading ``local_mlx:``/``mlx:`` prefix is removed;
    for any other engine hint an unprefixed id gets ``<engine>:`` prepended.
    """
    resolved = _resolve_model_alias(model_id, engine)
    hint = (engine or "").strip().lower()

    if hint in {"local_mlx", "mlx"}:
        if resolved.startswith(("local_mlx:", "mlx:")):
            return resolved.split(":", 1)[1].strip()
        return resolved
    if hint and ":" not in resolved:
        return f"{hint}:{resolved}"
    return resolved
1194
+
1195
+
1196
def ensure_engine_ready(engine: str) -> Dict[str, object]:
    """Make sure ``engine`` is supported and installed, installing it if needed.

    Returns:
        Dict with ``engine``, ``installed`` and ``installed_now``; when an
        install was performed the installer result is included as ``install``.

    Raises:
        HTTPException: 400 for an unknown or unsupported engine, or when no
            installer is registered; 500 when the installation fails.
    """
    if engine == "mlx":
        engine = "local_mlx"

    is_known = engine in ENGINE_INSTALLERS or engine in OPENAI_COMPATIBLE_PROVIDERS
    if not is_known:
        raise HTTPException(status_code=400, detail=f"지원하지 않는 엔진입니다: {engine}")

    support = engine_support_status(engine)
    if not support["supported"]:
        raise HTTPException(status_code=400, detail=str(support["reason"]))

    needs_mlx_runtime = engine == "local_mlx"

    if engine_installed(engine):
        if needs_mlx_runtime:
            ensure_mlx_runtime()
        return {"engine": engine, "installed": True, "installed_now": False}

    if engine not in ENGINE_INSTALLERS:
        raise HTTPException(status_code=400, detail=f"{engine} 엔진 설치 방법이 등록되어 있지 않습니다.")

    result = install_engine(engine)
    install_failed = result.get("returncode") not in (0, None) or not engine_installed(engine)
    if install_failed:
        detail = result.get("stderr") or result.get("stdout") or f"{engine} 설치에 실패했습니다."
        # Only the tail of the installer output is surfaced to the caller.
        raise HTTPException(status_code=500, detail=str(detail)[-2000:])

    if needs_mlx_runtime:
        ensure_mlx_runtime()
    return {"engine": engine, "installed": True, "installed_now": True, "install": result}
1220
+
1221
+
1222
def build_model_resolution(
    input_id: str,
    engine: Optional[str],
    *,
    user_email: Optional[str] = None,
    display_name: Optional[str] = None,
) -> _ModelResolution:
    """Shared ModelResolution builder for feedback items #1/#2.

    Takes the ``input_id`` the user clicked plus the engine hint and builds
    the canonical identity that every stage shares. The display name falls
    back to ``input_id`` when none is supplied.
    """
    label = display_name or input_id
    canonical_id = normalize_local_model_request(input_id, engine)
    return _ModelResolution.from_request(
        canonical_id,
        engine=engine,
        user_email=user_email,
        display_name=label,
        engine_aliases=MODEL_ENGINE_ALIASES,
    )
1242
+
1243
+
1244
+ _LOCAL_SMOKE_ENGINES = {"local_mlx", "ollama", "vllm", "lmstudio", "llamacpp"}
1245
+
1246
+
1247
async def _smoke_test_loaded_model(
    resolution: _ModelResolution,
    *,
    api_key_override: Optional[str] = None,
) -> Dict[str, object]:
    """Run a short chat test right after loading to decide ``ready_to_chat``.

    Cloud models (OpenAI/Anthropic/OpenRouter, etc.) are skipped because the
    test could cost the user money. A generation failure is reported in the
    returned dict instead of being raised. Outcomes are recorded through
    ``_record_smoke_result``.
    """
    load_id = resolution.load_id
    engine_name = resolution.engine

    is_local = (engine_name or "").lower() in _LOCAL_SMOKE_ENGINES
    if not is_local:
        skipped_profile = _ensure_compat_profile(load_id, engine_name)
        return {
            "ok": True,
            "reason": "skipped (cloud model — smoke test would incur cost)",
            "answer": None,
            "profile": skipped_profile.to_dict(),
            "skipped": True,
        }

    try:
        generation = router.generate(
            _SMOKE_PROMPT,
            context=None,
            max_tokens=128,
            temperature=0.1,
        )
        text = await asyncio.wait_for(generation, timeout=30)
    except Exception as exc:  # pragma: no cover - generator may not exist on all engines
        failure_reason = str(exc)[:200] or "generation_failed"
        failed_profile = _record_smoke_result(
            load_id, engine_name, False, failure_reason, status="failed"
        )
        return {
            "ok": False,
            "status": "failed",
            "reason": failure_reason,
            "answer": None,
            "profile": failed_profile.to_dict(),
        }

    compat_profile = _ensure_compat_profile(load_id, engine_name)
    cleaned = _compat_fast_postprocess(str(text or ""), compat_profile.to_dict())
    # item 3-3: three-way ok / degraded / failed classification; a degraded
    # model can still chat.
    status, reason = _classify_smoke_response(cleaned)
    ok = status != "failed"
    recorded_profile = _record_smoke_result(load_id, engine_name, ok, reason, status=status)
    return {
        "ok": ok,
        "status": status,
        "reason": reason,
        "answer": cleaned,
        "profile": recorded_profile.to_dict(),
    }
1304
+
1305
+
1306
def _prepare_local_engine_blocking(model_id: str, parsed_provider: str, parsed_model: str) -> tuple:
    """Install the engine and fetch/serve the model for a local provider.

    Everything here blocks (subprocess calls, downloads, polling with
    sleeps), so it is meant to run in a worker thread.

    Returns:
        ``(model_id, install_result, download_result)``; ``model_id`` is
        rewritten for LM Studio to the resolved instance/model name.
    """
    download_result: Optional[Dict[str, object]] = None
    install_result: Dict[str, object] = ensure_engine_ready(parsed_provider)

    if parsed_provider == "local_mlx":
        explicit_path = Path(parsed_model).expanduser()
        # A path that exists on disk is used as-is; otherwise download when missing.
        if not explicit_path.exists() and not hf_model_ready(parsed_model, "local_mlx"):
            download_result = download_hf_model(parsed_model, "local_mlx")
    elif parsed_provider == "ollama":
        ensure_ollama_server()
        ollama = local_binary("ollama")
        if not ollama:
            raise HTTPException(status_code=400, detail="Ollama가 설치되지 않았습니다.")
        if parsed_model not in get_ollama_pulled_models():
            completed = subprocess.run(
                [ollama, "pull", parsed_model],
                capture_output=True,
                text=True,
                timeout=900,
                check=False,
            )
            if completed.returncode != 0:
                raise HTTPException(status_code=500, detail=completed.stderr[-2000:] or "Ollama 모델 다운로드 실패")
            download_result = {"provider": "ollama", "model": parsed_model, "returncode": completed.returncode}
    elif parsed_provider == "vllm":
        ensure_vllm_server(parsed_model)
        download_result = {"provider": "vllm", "model": parsed_model, "server_ready": True}
    elif parsed_provider == "llamacpp":
        ensure_llamacpp_server(parsed_model)
        download_result = {"provider": "llamacpp", "model": parsed_model, "server_ready": True}
    elif parsed_provider == "lmstudio":
        ensured = ensure_lmstudio_model(parsed_model)
        resolved_model = str(
            ensured.get("instance_id")
            or ensured.get("resolved_model")
            or parsed_model
        ).strip()
        model_id = f"lmstudio:{resolved_model}"
        download_result = ensured

    return model_id, install_result, download_result


async def prepare_and_load_model(
    model_id: str,
    request: Request,
    engine: Optional[str] = None,
    user_email: Optional[str] = None,
    adapter_path: Optional[str] = None,
    draft_model_id: Optional[str] = None,
) -> Dict[str, object]:
    """Prepare the engine and model, load it through the router and smoke-test it.

    For local providers the engine is installed if needed and the model is
    downloaded or its server started. That work blocks, so it runs in a
    worker thread instead of on the event loop. The model is then loaded via
    ``router.load_model`` and a smoke test decides ``ready_to_chat``.

    Args:
        model_id: Requested model id or alias.
        request: Incoming request, used to look up the current user.
        engine: Optional engine hint used for alias and prefix normalisation.
        user_email: Optional explicit owner; falls back to the request user.
        adapter_path: Passed through to ``router.load_model``.
        draft_model_id: Passed through to ``router.load_model``.

    Returns:
        Dict describing the load: message, model ids, engine, install and
        download details, resolution, and smoke-test outcome.

    Raises:
        HTTPException: 400 when the normalised model id is empty, plus
            whatever the engine/model preparation raises.
    """
    model_id = normalize_local_model_request(model_id, engine)
    if not model_id:
        raise HTTPException(status_code=400, detail="모델 식별자가 비어 있습니다.")

    # Feedback #1: every stage shares the same ModelResolution.
    resolution = _ModelResolution.from_request(
        model_id,
        engine=engine,
        user_email=user_email or get_current_user(request),
        engine_aliases=MODEL_ENGINE_ALIASES,
    )

    parsed_provider, parsed_model = parse_model_ref(model_id)
    if parsed_provider == "mlx":
        parsed_provider = "local_mlx"

    local_engines = {"local_mlx", "ollama", "vllm", "lmstudio", "llamacpp"}
    install_result: Dict[str, object] = {}
    download_result: Optional[Dict[str, object]] = None

    if parsed_provider in local_engines:
        # Installs, pulls and server start-up can block for minutes; keep
        # them off the event loop so other requests stay responsive.
        model_id, install_result, download_result = await asyncio.to_thread(
            _prepare_local_engine_blocking, model_id, parsed_provider, parsed_model
        )

    effective_email = (user_email or get_current_user(request) or "").strip()
    user_api_key = get_user_api_key(effective_email, parsed_provider) if parsed_provider != "local_mlx" else None
    msg = await router.load_model(
        model_id,
        adapter_path,
        draft_model_id=draft_model_id,
        api_key_override=user_api_key,
        owner=effective_email or None,
    )
    # Feedback #1/#2: right after loading, sync the ModelResolution with the
    # actual current model and run the smoke test.
    resolution.update_after_load(actual_current=router.current_model_id)
    smoke_result: Dict[str, object] = {}
    ready_to_chat = True
    compat_status = "ok"
    try:
        smoke_result = await _smoke_test_loaded_model(resolution, api_key_override=user_api_key)
        ready_to_chat = bool(smoke_result.get("ok"))
        # item 3-3: expose the smoke test's three-way status (ok/degraded/failed) as-is.
        compat_status = str(smoke_result.get("status") or ("ok" if ready_to_chat else "degraded"))
    except Exception as exc:  # never break load on smoke test failures
        logging.warning("smoke test failed for %s: %s", resolution.load_id, exc)
        compat_status = "unknown"
    return {
        "status": "ok",
        "message": msg,
        "model": model_id,
        "current": router.current_model_id,
        "engine": parsed_provider,
        "installed_now": bool(install_result.get("installed_now")),
        "download": download_result,
        "resolution": resolution.to_dict(),
        "downloaded": True,
        "loaded": True,
        "ready_to_chat": ready_to_chat,
        "compatibility_status": compat_status,
        "smoke_test": smoke_result,
    }
1411
+
1412
+
1413
def sse_event(event: str, data: Dict[str, object]) -> str:
    """Format one Server-Sent Events frame with a JSON ``data`` line.

    Non-ASCII characters in ``data`` are emitted as-is rather than escaped.
    """
    payload = json.dumps(data, ensure_ascii=False)
    return "event: {}\ndata: {}\n\n".format(event, payload)
1415
+
1416
+
1417
async def prepare_and_load_model_stream(
    model_id: str,
    request: Request,
    engine: Optional[str] = None,
    user_email: Optional[str] = None,
) -> AsyncIterator[str]:
    """Prepare and load a model while streaming progress as SSE frames.

    The blocking preparation (engine install, download, server start) runs
    in a daemon thread that reports through a queue; this generator relays
    those reports as ``progress`` events, then loads the model, runs the
    smoke test and finishes with a ``done`` event carrying the result.

    Raises:
        HTTPException: 400 when the normalised model id is empty; the status
            and detail reported by the worker when preparation fails.
    """
    model_id = normalize_local_model_request(model_id, engine)
    if not model_id:
        raise HTTPException(status_code=400, detail="모델 식별자가 비어 있습니다.")

    parsed_provider, parsed_model = parse_model_ref(model_id)
    if parsed_provider == "mlx":
        parsed_provider = "local_mlx"

    events: "queue.Queue[Dict[str, object]]" = queue.Queue()
    outcome: Dict[str, object] = {}

    def emit_progress(payload: Dict[str, object]) -> None:
        events.put({"kind": "progress", "data": payload})

    def report(stage: str, message: str, **fields) -> None:
        # Shorthand for building a progress payload and queueing it.
        emit_progress(model_download_progress_payload(stage, message, **fields))

    def run_preparation() -> None:
        try:
            local_engines = {"local_mlx", "ollama", "vllm", "lmstudio", "llamacpp"}
            install_info: Dict[str, object] = {}
            download_info: Optional[Dict[str, object]] = None
            final_model_id = model_id
            final_model_name = parsed_model

            if parsed_provider in local_engines:
                report("engine", "실행 엔진을 확인하는 중입니다.", percent=2, indeterminate=True)
                install_info = ensure_engine_ready(parsed_provider)
                report("engine", "실행 엔진 준비가 완료되었습니다.", percent=10, indeterminate=False)

            if parsed_provider == "local_mlx":
                explicit_path = Path(parsed_model).expanduser()
                if explicit_path.exists():
                    download_info = {"model": parsed_model, "path": str(explicit_path), "cached": True}
                    report(
                        "download",
                        "로컬 모델 경로를 확인했습니다.",
                        percent=100,
                        detail=str(explicit_path),
                        eta_seconds=0,
                    )
                elif not hf_model_ready(parsed_model, "local_mlx"):
                    download_info = download_hf_model(parsed_model, "local_mlx", progress_emit=emit_progress)
                else:
                    download_info = {"model": parsed_model, "path": str(hf_model_dir(parsed_model)), "cached": True}
                    report("download", "이미 다운로드된 모델을 확인했습니다.", percent=100, eta_seconds=0)
            elif parsed_provider == "ollama":
                report("engine", "Ollama 서버를 확인하는 중입니다.", percent=12, indeterminate=True)
                ensure_ollama_server()
                if parsed_model not in get_ollama_pulled_models():
                    download_info = pull_ollama_model_with_progress(parsed_model, progress_emit=emit_progress)
                else:
                    download_info = {"provider": "ollama", "model": parsed_model, "cached": True}
                    report(
                        "download",
                        "이미 다운로드된 Ollama 모델을 확인했습니다.",
                        percent=100,
                        detail=parsed_model,
                        eta_seconds=0,
                    )
            elif parsed_provider in ("vllm", "llamacpp"):
                # Both server-backed engines follow the same steps: download
                # (or confirm the cache), then start the server.
                if parsed_provider == "vllm":
                    cached_message = "이미 다운로드된 모델을 확인했습니다."
                    starting_message = "vLLM 서버를 시작하는 중입니다."
                    ensure_server = ensure_vllm_server
                else:
                    cached_message = "이미 다운로드된 GGUF 모델을 확인했습니다."
                    starting_message = "llama.cpp 서버를 시작하는 중입니다."
                    ensure_server = ensure_llamacpp_server

                if not hf_model_ready(parsed_model, parsed_provider):
                    download_info = download_hf_model(parsed_model, parsed_provider, progress_emit=emit_progress)
                else:
                    download_info = {"provider": parsed_provider, "model": parsed_model, "cached": True}
                    report("download", cached_message, percent=100, detail=parsed_model, eta_seconds=0)
                report("server", starting_message, percent=92, indeterminate=True)
                ensure_server(parsed_model)
                download_info = {
                    **(download_info or {}),
                    "provider": parsed_provider,
                    "model": parsed_model,
                    "server_ready": True,
                }
            elif parsed_provider == "lmstudio":
                report("download", "LM Studio 모델을 확인하는 중입니다.", percent=35, indeterminate=True)
                ensured = ensure_lmstudio_model(parsed_model)
                resolved_model = str(
                    ensured.get("instance_id")
                    or ensured.get("resolved_model")
                    or parsed_model
                ).strip()
                final_model_name = resolved_model
                final_model_id = f"lmstudio:{resolved_model}"
                download_info = ensured
            else:
                report("engine", "모델 연결을 준비하는 중입니다.", percent=30, indeterminate=True)

            outcome.update({
                "model_id": final_model_id,
                "parsed_provider": parsed_provider,
                "parsed_model": final_model_name,
                "install_result": install_info,
                "download_result": download_info,
            })
            events.put({"kind": "done"})
        except HTTPException as exc:
            events.put({"kind": "error", "status_code": exc.status_code, "detail": exc.detail})
        except Exception as exc:
            logging.exception("model prepare stream worker failed")
            events.put({"kind": "error", "status_code": 500, "detail": str(exc)[-2000:]})

    threading.Thread(target=run_preparation, daemon=True).start()

    # Relay worker messages until it reports completion or failure.
    while True:
        message = await asyncio.to_thread(events.get)
        kind = message.get("kind")
        if kind == "done":
            break
        if kind == "error":
            raise HTTPException(
                status_code=int(message.get("status_code") or 500),
                detail=message.get("detail") or "모델 준비에 실패했습니다.",
            )
        if kind == "progress":
            yield sse_event("progress", message["data"])

    prepared_model_id = str(outcome.get("model_id") or model_id)
    prepared_provider = str(outcome.get("parsed_provider") or parsed_provider)
    install_result = outcome.get("install_result") or {}
    download_result = outcome.get("download_result")

    yield sse_event("progress", model_download_progress_payload(
        "load",
        "모델을 메모리에 로드하는 중입니다.",
        percent=96,
        indeterminate=True,
    ))

    effective_email = (user_email or get_current_user(request) or "").strip()
    if prepared_provider != "local_mlx":
        user_api_key = get_user_api_key(effective_email, prepared_provider)
    else:
        user_api_key = None
    msg = await router.load_model(
        prepared_model_id,
        None,
        draft_model_id=None,
        api_key_override=user_api_key,
        owner=effective_email or None,
    )
    # Feedback #1/#2: the SSE result also carries the ModelResolution and
    # the smoke test outcome.
    resolution_stream = _ModelResolution.from_request(
        prepared_model_id,
        engine=prepared_provider,
        user_email=effective_email or None,
        engine_aliases=MODEL_ENGINE_ALIASES,
    )
    resolution_stream.update_after_load(actual_current=router.current_model_id)
    yield sse_event("progress", model_download_progress_payload(
        "smoke_test",
        "채팅 호환성 테스트 중입니다.",
        percent=98,
        indeterminate=True,
    ))

    smoke_result: Dict[str, object] = {}
    ready_to_chat = True
    compat_status = "ok"
    try:
        smoke_result = await _smoke_test_loaded_model(resolution_stream, api_key_override=user_api_key)
        ready_to_chat = bool(smoke_result.get("ok"))
        # item 3-3: expose the smoke test's three-way status (ok/degraded/failed) as-is.
        compat_status = str(smoke_result.get("status") or ("ok" if ready_to_chat else "degraded"))
    except Exception as exc:
        logging.warning("smoke test (stream) failed for %s: %s", resolution_stream.load_id, exc)
        compat_status = "unknown"

    installed_now = bool(isinstance(install_result, dict) and install_result.get("installed_now"))
    result = {
        "status": "ok",
        "message": msg,
        "model": prepared_model_id,
        "current": router.current_model_id,
        "engine": prepared_provider,
        "installed_now": installed_now,
        "download": download_result,
        "resolution": resolution_stream.to_dict(),
        "downloaded": True,
        "loaded": True,
        "ready_to_chat": ready_to_chat,
        "compatibility_status": compat_status,
        "smoke_test": smoke_result,
    }
    yield sse_event("progress", model_download_progress_payload(
        "done",
        "모델 준비가 완료되었습니다.",
        percent=100,
        eta_seconds=0,
    ))
    yield sse_event("done", result)
1662
+
1663
+
1664
# Process-wide cache of cloud model verification results, keyed by model
# reference (the "id" of a detected cloud model). verify_cloud_models() stores
# records shaped like {"ok": bool, "reason": str, "ts": float}, where "ts" is
# the time.time() value captured at the start of the verification run.
CLOUD_VERIFY_CACHE: Dict[str, Dict] = {}
# Maximum age, in seconds, of a cached record before verify_cloud_models()
# re-checks the model (unless a refresh is forced).
CLOUD_VERIFY_TTL_SECONDS = 600
1666
+
1667
async def _probe_cloud_model(model_ref: str) -> Dict[str, object]:
    """Check that a cloud model answers a minimal chat completion request.

    The reference is split into provider and model name with
    ``parse_model_ref``; the provider must be present in
    ``OPENAI_COMPATIBLE_PROVIDERS``. A one-token "ping" completion is then
    sent with a 15 second overall timeout.

    Args:
        model_ref: Model reference understood by ``parse_model_ref``.

    Returns:
        ``{"ok": True, "reason": "ok"}`` when the request succeeds, otherwise
        ``{"ok": False, "reason": <text>}``. Errors raised while creating the
        client or performing the request are reported in ``reason`` (truncated
        to 220 characters) rather than propagated, and ``reason`` is never
        empty for a failure.
    """
    provider, model_name = parse_model_ref(model_ref)
    config = OPENAI_COMPATIBLE_PROVIDERS.get(provider)
    if not config:
        return {"ok": False, "reason": f"Unsupported provider: {provider}"}

    api_key = os.getenv(config["env_key"]) or config.get("api_key_fallback")
    if not api_key:
        return {"ok": False, "reason": f"Missing API key: {config['env_key']}"}

    # An environment override for the endpoint wins over the configured URL.
    base_url = os.getenv(config.get("base_url_env", "")) if config.get("base_url_env") else None
    base_url = base_url or config.get("base_url")
    client_kwargs = {"api_key": api_key}
    if base_url:
        client_kwargs["base_url"] = base_url

    timeout_seconds = 15
    try:
        # The context manager closes the client's HTTP resources once the probe
        # finishes, instead of leaving one open client behind per probe.
        async with AsyncOpenAI(**client_kwargs) as client:
            await asyncio.wait_for(
                client.chat.completions.create(
                    model=model_name,
                    messages=[{"role": "user", "content": "ping"}],
                    max_tokens=1,
                    temperature=0,
                ),
                timeout=timeout_seconds,
            )
        return {"ok": True, "reason": "ok"}
    except asyncio.TimeoutError:
        # str() of a timeout raised by wait_for is empty, so spell the reason out.
        return {"ok": False, "reason": f"Timed out after {timeout_seconds}s"}
    except Exception as e:
        # Fall back to the exception class name when the message is empty so
        # callers always get a usable failure reason.
        return {"ok": False, "reason": str(e)[:220] or type(e).__name__}
1697
+
1698
+
1699
async def verify_cloud_models(force: bool = False, provider_filter: Optional[str] = None) -> Dict[str, Dict]:
    """Verify detected cloud models, reusing recent results from the cache.

    Args:
        force: When true, ignore cached records and re-check every model.
        provider_filter: When given, only models whose ``provider`` equals
            this value are checked.

    Returns:
        Mapping of model id to a record ``{"ok", "reason", "ts"}``. Records
        produced by this call are also written to ``CLOUD_VERIFY_CACHE``.
    """
    started_at = time.time()

    # Keep only cloud-tagged entries, optionally narrowed to one provider.
    candidates = []
    for entry in router.detected_cloud_models():
        if entry.get("tag") != "cloud":
            continue
        if provider_filter and entry.get("provider") != provider_filter:
            continue
        candidates.append(entry)

    verdicts: Dict[str, Dict] = {}
    for entry in candidates:
        ref = entry["id"]
        hit = CLOUD_VERIFY_CACHE.get(ref)
        needs_refresh = (
            force
            or not hit
            or started_at - hit.get("ts", 0) > CLOUD_VERIFY_TTL_SECONDS
        )
        if needs_refresh:
            if entry.get("available") is False:
                # Explicitly unavailable models are recorded without a probe.
                verdict = {
                    "ok": False,
                    "reason": entry.get("requires") or "API key missing",
                    "ts": started_at,
                }
            else:
                outcome = await _probe_cloud_model(ref)
                verdict = {
                    "ok": bool(outcome.get("ok")),
                    "reason": outcome.get("reason", ""),
                    "ts": started_at,
                }
            CLOUD_VERIFY_CACHE[ref] = verdict
        else:
            # Cached record is still within the TTL; hand it back as-is.
            verdict = hit
        verdicts[ref] = verdict
    return verdicts