ltcai 1.3.0 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +105 -79
- package/docs/CHANGELOG.md +109 -0
- package/docs/images/architecture.png +0 -0
- package/docs/images/graph.png +0 -0
- package/docs/images/hero.gif +0 -0
- package/docs/images/model-recommendation.png +0 -0
- package/docs/images/onboarding.png +0 -0
- package/docs/images/organization.png +0 -0
- package/docs/images/skills.png +0 -0
- package/docs/images/tmp_frames/frame_00.png +0 -0
- package/docs/images/tmp_frames/frame_01.png +0 -0
- package/docs/images/tmp_frames/frame_02.png +0 -0
- package/docs/images/tmp_frames/frame_03.png +0 -0
- package/docs/images/workspace.png +0 -0
- package/latticeai/__init__.py +1 -1
- package/latticeai/api/admin.py +17 -0
- package/latticeai/api/chat.py +786 -0
- package/latticeai/api/computer_use.py +294 -0
- package/latticeai/api/deps.py +15 -0
- package/latticeai/api/garden.py +34 -0
- package/latticeai/api/local_files.py +125 -0
- package/latticeai/api/models.py +16 -0
- package/latticeai/api/permissions.py +331 -0
- package/latticeai/api/setup.py +158 -0
- package/latticeai/api/static_routes.py +166 -0
- package/latticeai/api/tools.py +579 -0
- package/latticeai/api/workspace.py +11 -0
- package/latticeai/core/enterprise_admin.py +158 -0
- package/latticeai/core/workspace_os.py +1 -1
- package/latticeai/server_app.py +223 -4301
- package/latticeai/services/app_context.py +27 -0
- package/latticeai/services/model_catalog.py +289 -0
- package/latticeai/services/model_recommendation.py +183 -0
- package/latticeai/services/model_runtime.py +1721 -0
- package/latticeai/services/tool_dispatch.py +135 -0
- package/latticeai/services/upload_service.py +99 -0
- package/package.json +3 -3
- package/skills/SKILL_TEMPLATE.md +1 -1
- package/skills/code_review/SKILL.md +1 -1
- package/skills/data_analysis/SKILL.md +1 -1
- package/skills/file_edit/SKILL.md +1 -1
- package/skills/summarize_document/SKILL.md +1 -1
- package/skills/web_search/SKILL.md +1 -1
- package/static/scripts/chat.js +45 -0
|
@@ -0,0 +1,1721 @@
|
|
|
1
|
+
"""Model runtime and provider helpers for Lattice AI.
|
|
2
|
+
|
|
3
|
+
This module owns local/cloud model preparation, engine detection, model download,
|
|
4
|
+
provider-specific server startup, smoke tests, and runtime feature payloads. It is
|
|
5
|
+
configured by ``server_app`` with app-level state but has no FastAPI app import.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import asyncio
|
|
11
|
+
import importlib.util
|
|
12
|
+
import json
|
|
13
|
+
import logging
|
|
14
|
+
import os
|
|
15
|
+
import platform
|
|
16
|
+
import queue
|
|
17
|
+
import re
|
|
18
|
+
import shutil
|
|
19
|
+
import subprocess
|
|
20
|
+
import sys
|
|
21
|
+
import tempfile
|
|
22
|
+
import threading
|
|
23
|
+
import time
|
|
24
|
+
import urllib.error
|
|
25
|
+
import urllib.request
|
|
26
|
+
from pathlib import Path
|
|
27
|
+
from typing import AsyncIterator, Dict, List, Optional
|
|
28
|
+
|
|
29
|
+
import httpx
|
|
30
|
+
from fastapi import HTTPException, Request
|
|
31
|
+
|
|
32
|
+
from llm_router import (
|
|
33
|
+
AsyncOpenAI,
|
|
34
|
+
HF_MODELS_ROOT,
|
|
35
|
+
OPENAI_COMPATIBLE_PROVIDERS,
|
|
36
|
+
ensure_mlx_runtime,
|
|
37
|
+
hf_model_dir,
|
|
38
|
+
normalize_branding,
|
|
39
|
+
parse_model_ref,
|
|
40
|
+
)
|
|
41
|
+
from latticeai.core.model_compat import (
|
|
42
|
+
SMOKE_PROMPT as _SMOKE_PROMPT,
|
|
43
|
+
classify_smoke_response as _classify_smoke_response,
|
|
44
|
+
ensure_profile as _ensure_compat_profile,
|
|
45
|
+
fast_postprocess as _compat_fast_postprocess,
|
|
46
|
+
record_smoke_result as _record_smoke_result,
|
|
47
|
+
)
|
|
48
|
+
from latticeai.core.model_resolution import ModelResolution as _ModelResolution
|
|
49
|
+
|
|
50
|
+
# Configured by server_app.configure_model_runtime during app assembly.
|
|
51
|
+
router = None
|
|
52
|
+
APP_MODE = "local"
|
|
53
|
+
DEFAULT_HOST = "127.0.0.1"
|
|
54
|
+
DEFAULT_PORT = 4825
|
|
55
|
+
DATA_DIR = Path.home() / ".latticeai"
|
|
56
|
+
BASE_DIR = Path.cwd()
|
|
57
|
+
ENABLE_TELEGRAM = False
|
|
58
|
+
ENABLE_GRAPH = True
|
|
59
|
+
AUTOLOAD_MODELS = False
|
|
60
|
+
MODEL_IDLE_UNLOAD_SECONDS = 0
|
|
61
|
+
ALLOW_LOCAL_MODELS = True
|
|
62
|
+
REQUIRE_AUTH = False
|
|
63
|
+
INVITE_GATE_ENABLED = False
|
|
64
|
+
ALLOW_PLAINTEXT_API_KEYS = False
|
|
65
|
+
CORS_ALLOW_NETWORK = False
|
|
66
|
+
PUBLIC_MODEL = "openai:gpt-4o-mini"
|
|
67
|
+
LOCAL_MODEL = "mlx-community/SmolLM-1.7B-Instruct-4bit"
|
|
68
|
+
IS_PUBLIC_MODE = False
|
|
69
|
+
keyring = None
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _missing_current_user(_request: Request) -> Optional[str]:
|
|
73
|
+
return None
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _missing_user_api_key(_email: Optional[str], _provider: str) -> Optional[str]:
|
|
77
|
+
return None
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
get_current_user = _missing_current_user
|
|
81
|
+
get_user_api_key = _missing_user_api_key
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def configure_model_runtime(**deps) -> None:
    """Override module-level settings with values supplied by the application.

    Every keyword argument whose name matches a global that already exists in
    this module replaces that global. Names that are not defined here are
    ignored, so this function never creates new module attributes.
    """
    module_namespace = globals()
    for name, value in deps.items():
        if name in module_namespace:
            module_namespace[name] = value
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
# Catalog data + version-dedup helpers live in ``model_catalog``; re-exported
|
|
90
|
+
# here so existing ``from ...model_runtime import ENGINE_MODEL_CATALOG`` imports
|
|
91
|
+
# keep working.
|
|
92
|
+
from latticeai.services.model_catalog import ( # noqa: F401 (re-export)
|
|
93
|
+
ENGINE_INSTALLERS,
|
|
94
|
+
ENGINE_MODEL_CATALOG,
|
|
95
|
+
MODEL_ENGINE_ALIASES,
|
|
96
|
+
_VERSIONED_MODEL_PATTERNS,
|
|
97
|
+
_model_family_version,
|
|
98
|
+
_version_tuple,
|
|
99
|
+
filter_lower_family_versions,
|
|
100
|
+
)
|
|
101
|
+
|
|
102
|
+
def _update_env_file(env_file: Path, key: str, value: str) -> None:
|
|
103
|
+
lines = []
|
|
104
|
+
found = False
|
|
105
|
+
if env_file.exists():
|
|
106
|
+
for line in env_file.read_text(encoding="utf-8").splitlines():
|
|
107
|
+
if line.startswith(f"{key}="):
|
|
108
|
+
lines.append(f"{key}={value}")
|
|
109
|
+
found = True
|
|
110
|
+
else:
|
|
111
|
+
lines.append(line)
|
|
112
|
+
if not found:
|
|
113
|
+
lines.append(f"{key}={value}")
|
|
114
|
+
env_file.write_text("\n".join(lines) + "\n", encoding="utf-8")
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
LOCAL_SERVER_PROCESSES: Dict[str, subprocess.Popen] = {}
|
|
118
|
+
VLLM_METAL_ENV = Path.home() / ".venv-vllm-metal"
|
|
119
|
+
VLLM_METAL_BIN = VLLM_METAL_ENV / "bin" / "vllm"
|
|
120
|
+
VLLM_METAL_PYTHON = VLLM_METAL_ENV / "bin" / "python"
|
|
121
|
+
LMSTUDIO_BUNDLED_CLI = Path("/Applications/LM Studio.app/Contents/Resources/app/.webpack/lms")
|
|
122
|
+
|
|
123
|
+
def windows_binary_candidates(binary: str) -> List[Path]:
    """List the Windows install locations to probe for a known binary.

    Supported names are ``"ollama"``, ``"lms"`` and ``"nvidia-smi"``; any other
    name yields an empty list. Paths are built from the ``LOCALAPPDATA``,
    ``ProgramFiles`` and ``ProgramFiles(x86)`` environment variables. The
    per-user location is omitted when ``LOCALAPPDATA`` is unset or empty.
    No filesystem access happens here; callers check existence themselves.
    """
    local_appdata = os.environ.get("LOCALAPPDATA", "")
    per_user_programs = Path(local_appdata) / "Programs" if local_appdata else None
    program_files = Path(os.environ.get("ProgramFiles", r"C:\Program Files"))
    program_files_x86 = Path(os.environ.get("ProgramFiles(x86)", r"C:\Program Files (x86)"))

    if binary == "ollama":
        relative = Path("Ollama", "ollama.exe")
        roots = [per_user_programs, program_files]
    elif binary == "lms":
        relative = Path("LM Studio", "resources", "app", ".webpack", "lms.exe")
        roots = [per_user_programs, program_files]
    elif binary == "nvidia-smi":
        relative = Path("NVIDIA Corporation", "NVSMI", "nvidia-smi.exe")
        roots = [program_files, program_files_x86]
    else:
        return []

    return [root / relative for root in roots if root is not None]
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def local_binary(binary: str) -> Optional[str]:
    """Locate an executable, returning its path or ``None`` when not found.

    ``PATH`` is searched first. On Windows only, the well-known install
    locations from ``windows_binary_candidates`` are probed as a fallback and
    the first one that exists is returned.
    """
    on_path = shutil.which(binary)
    if on_path:
        return on_path
    if platform.system() != "Windows":
        return None
    return next(
        (str(candidate) for candidate in windows_binary_candidates(binary) if candidate.exists()),
        None,
    )
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def find_lmstudio_cli() -> Optional[str]:
    """Return the path of the LM Studio ``lms`` CLI, or ``None`` if unavailable.

    A copy found through ``local_binary`` takes precedence; otherwise the CLI
    bundled at ``LMSTUDIO_BUNDLED_CLI`` is used when that file exists.
    """
    on_path = local_binary("lms")
    if not on_path and LMSTUDIO_BUNDLED_CLI.exists():
        return str(LMSTUDIO_BUNDLED_CLI)
    return on_path or None
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def vllm_executable() -> Optional[str]:
    """Return the path of a ``vllm`` executable, or ``None`` if there is none.

    ``PATH`` is consulted first; the fallback is ``VLLM_METAL_BIN`` when that
    file exists.
    """
    located = shutil.which("vllm")
    if located:
        return located
    return str(VLLM_METAL_BIN) if VLLM_METAL_BIN.exists() else None
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def vllm_metal_python() -> Optional[str]:
    """Return the interpreter path at ``VLLM_METAL_PYTHON`` if it exists, else ``None``."""
    return str(VLLM_METAL_PYTHON) if VLLM_METAL_PYTHON.exists() else None
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def _json_request(
|
|
180
|
+
url: str,
|
|
181
|
+
*,
|
|
182
|
+
method: str = "GET",
|
|
183
|
+
payload: Optional[Dict[str, object]] = None,
|
|
184
|
+
headers: Optional[Dict[str, str]] = None,
|
|
185
|
+
timeout: float = 10.0,
|
|
186
|
+
) -> Dict[str, object]:
|
|
187
|
+
data = None
|
|
188
|
+
req_headers = dict(headers or {})
|
|
189
|
+
if payload is not None:
|
|
190
|
+
data = json.dumps(payload).encode("utf-8")
|
|
191
|
+
req_headers.setdefault("Content-Type", "application/json")
|
|
192
|
+
req = urllib.request.Request(url, data=data, headers=req_headers, method=method)
|
|
193
|
+
with urllib.request.urlopen(req, timeout=timeout) as res:
|
|
194
|
+
raw = res.read().decode("utf-8", errors="replace")
|
|
195
|
+
if not raw.strip():
|
|
196
|
+
return {}
|
|
197
|
+
return json.loads(raw)
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def lmstudio_api_base() -> str:
    """Return the LM Studio OpenAI-compatible base URL without trailing slashes.

    ``LMSTUDIO_BASE_URL`` wins when it is set and non-empty; otherwise the
    ``base_url`` registered for ``"lmstudio"`` in
    ``OPENAI_COMPATIBLE_PROVIDERS`` is used.
    """
    configured = os.getenv("LMSTUDIO_BASE_URL")
    base = configured if configured else OPENAI_COMPATIBLE_PROVIDERS["lmstudio"]["base_url"]
    return base.rstrip("/")
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def lmstudio_native_api_base() -> str:
    """Return the LM Studio server root used for its native ``/api/v1`` endpoints.

    This is ``lmstudio_api_base()`` with a trailing ``/v1`` segment removed,
    or that URL unchanged when it does not end in ``/v1``.
    """
    openai_base = lmstudio_api_base()
    if not openai_base.endswith("/v1"):
        return openai_base
    return openai_base[: -len("/v1")]
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def ensure_lmstudio_server() -> None:
    """Make sure the LM Studio local server answers, starting it if necessary.

    The native ``/api/v1/models`` endpoint is probed first. If it does not
    answer, ``lms server start`` is launched detached and the endpoint is
    polled once per second for up to 45 seconds.

    Raises:
        HTTPException: 400 when the LM Studio CLI cannot be found; 500 when
            the CLI cannot be launched or the server never becomes reachable.
    """
    models_url = f"{lmstudio_native_api_base()}/api/v1/models"
    # Use the same credential as the other LM Studio calls in this module.
    # A hard-coded token would make a key-protected server look "down" and
    # trigger a pointless restart followed by a 45 second timeout.
    auth_header = {"Authorization": f"Bearer {os.getenv('LMSTUDIO_API_KEY') or 'lmstudio'}"}

    def server_responds() -> bool:
        # Any failure (refused connection, timeout, HTTP error, bad JSON)
        # simply counts as "not ready yet".
        try:
            _json_request(models_url, headers=auth_header, timeout=2.5)
        except Exception:
            return False
        return True

    if server_responds():
        return

    cli = find_lmstudio_cli()
    if not cli:
        raise HTTPException(status_code=400, detail="LM Studio CLI를 찾지 못했습니다. LM Studio를 설치한 뒤 다시 시도하세요.")

    try:
        subprocess.Popen(
            [cli, "server", "start"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            start_new_session=True,
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"LM Studio 서버 시작 실패: {e}") from e

    # A monotonic clock keeps the 45 second budget immune to wall-clock jumps.
    deadline = time.monotonic() + 45
    while time.monotonic() < deadline:
        if server_responds():
            return
        time.sleep(1)
    raise HTTPException(status_code=500, detail="LM Studio Local Server를 자동으로 시작하지 못했습니다.")
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
_LMSTUDIO_MODELS_CACHE: List[Dict[str, object]] = []
# ``-inf`` means "never fetched". ``time.monotonic()`` has an undefined
# reference point, so a literal 0.0 could look fresh on a platform whose
# monotonic clock is still below the TTL.
_LMSTUDIO_MODELS_CACHE_TS: float = float("-inf")
_LMSTUDIO_MODELS_CACHE_TTL: float = 10.0


def get_lmstudio_models(*, force: bool = False) -> List[Dict[str, object]]:
    """Return the model list reported by LM Studio's native API, with caching.

    Results are cached for ``_LMSTUDIO_MODELS_CACHE_TTL`` seconds.

    Args:
        force: Skip the freshness check and query the server now.

    Returns:
        The ``models`` list from ``/api/v1/models``. If the request fails, the
        previously cached list is returned and the cache timestamp is left
        untouched. A response without a list under ``models`` is cached as an
        empty list.
    """
    global _LMSTUDIO_MODELS_CACHE, _LMSTUDIO_MODELS_CACHE_TS
    if not force and time.monotonic() - _LMSTUDIO_MODELS_CACHE_TS < _LMSTUDIO_MODELS_CACHE_TTL:
        return _LMSTUDIO_MODELS_CACHE
    try:
        payload = _json_request(
            f"{lmstudio_native_api_base()}/api/v1/models",
            headers={"Authorization": f"Bearer {os.getenv('LMSTUDIO_API_KEY') or 'lmstudio'}"},
            timeout=2.5,
        )
    except Exception:
        return _LMSTUDIO_MODELS_CACHE
    # The endpoint may answer with any JSON value; calling ``.get`` on a
    # non-dict would raise out of this otherwise best-effort function.
    models = payload.get("models") if isinstance(payload, dict) else None
    _LMSTUDIO_MODELS_CACHE = models if isinstance(models, list) else []
    _LMSTUDIO_MODELS_CACHE_TS = time.monotonic()
    return _LMSTUDIO_MODELS_CACHE
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
def _lmstudio_candidate_keys(model_name: str) -> List[str]:
|
|
265
|
+
raw = model_name.strip()
|
|
266
|
+
if not raw:
|
|
267
|
+
return []
|
|
268
|
+
slug = raw.split("/")[-1].lower()
|
|
269
|
+
slug = slug.replace("-gguf", "").replace("-awq", "")
|
|
270
|
+
parts = [p for p in slug.split("-") if p]
|
|
271
|
+
candidates = [raw.lower(), slug]
|
|
272
|
+
if parts:
|
|
273
|
+
candidates.append("-".join(parts[: min(4, len(parts))]))
|
|
274
|
+
return list(dict.fromkeys(candidates))
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
def _find_lmstudio_model_key(model_name: str, models: List[Dict[str, object]]) -> Optional[str]:
|
|
278
|
+
if not models:
|
|
279
|
+
return None
|
|
280
|
+
candidate_keys = _lmstudio_candidate_keys(model_name)
|
|
281
|
+
exact = []
|
|
282
|
+
fuzzy = []
|
|
283
|
+
for item in models:
|
|
284
|
+
if not isinstance(item, dict):
|
|
285
|
+
continue
|
|
286
|
+
key = str(item.get("key") or "").strip()
|
|
287
|
+
display_name = str(item.get("display_name") or "").strip()
|
|
288
|
+
haystacks = [key.lower(), display_name.lower()]
|
|
289
|
+
if any(raw == key.lower() for raw in candidate_keys):
|
|
290
|
+
exact.append(key)
|
|
291
|
+
continue
|
|
292
|
+
if any(token and token in hay for token in candidate_keys for hay in haystacks):
|
|
293
|
+
fuzzy.append(key)
|
|
294
|
+
return (exact or fuzzy or [None])[0]
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
def ensure_lmstudio_model(model_name: str) -> Dict[str, object]:
    """Ensure ``model_name`` is downloaded and loaded in LM Studio.

    Steps: make sure the server is up, resolve the name against the known
    models, download the model through the native API when it is unknown
    (polling the job for up to one hour), then load it unless an instance is
    already loaded.

    Returns:
        A dict with ``provider``, ``model``, ``resolved_model``,
        ``server_ready`` and ``cached``. ``cached`` is ``True`` when a loaded
        instance already existed; otherwise the dict also carries the new
        ``instance_id``.

    Raises:
        HTTPException: 500 when a download, status poll or load request fails
            or reports failure; 408 when the download does not finish in time.
            Errors from ``ensure_lmstudio_server`` propagate unchanged.
    """
    ensure_lmstudio_server()
    api_base = lmstudio_native_api_base()
    auth_header = {"Authorization": f"Bearer {os.getenv('LMSTUDIO_API_KEY') or 'lmstudio'}"}
    models = get_lmstudio_models()
    found_key = _find_lmstudio_model_key(model_name, models)
    model_key = found_key or model_name

    if not found_key:
        try:
            job = _json_request(
                f"{api_base}/api/v1/models/download",
                method="POST",
                payload={"model": model_name},
                headers=auth_header,
                timeout=30,
            )
        except urllib.error.HTTPError as e:
            detail = e.read().decode("utf-8", errors="replace")[-2000:]
            raise HTTPException(status_code=500, detail=f"LM Studio 모델 다운로드 실패: {detail or e.reason}") from e
        except Exception as e:
            raise HTTPException(status_code=500, detail=f"LM Studio 모델 다운로드 실패: {e}") from e

        status = str(job.get("status") or "")
        job_id = str(job.get("job_id") or "")
        if status not in {"completed", "already_downloaded"} and job_id:
            # A monotonic clock keeps the one-hour budget immune to
            # wall-clock adjustments.
            deadline = time.monotonic() + 3600
            while time.monotonic() < deadline:
                try:
                    polled = _json_request(
                        f"{api_base}/api/v1/models/download/status/{job_id}",
                        headers=auth_header,
                        timeout=30,
                    )
                except Exception as e:
                    # Report poll failures the same way as the download and
                    # load requests instead of leaking a raw urllib error.
                    raise HTTPException(status_code=500, detail=f"LM Studio 모델 다운로드 실패: {e}") from e
                polled_status = str(polled.get("status") or "")
                if polled_status == "completed":
                    break
                if polled_status == "failed":
                    raise HTTPException(status_code=500, detail=f"LM Studio 모델 다운로드 실패: {polled}")
                time.sleep(2)
            else:
                # The loop ran out of time without seeing "completed".
                raise HTTPException(status_code=408, detail="LM Studio 모델 다운로드 시간이 초과되었습니다.")

        # The download changed the server's model list; bypass the cache.
        models = get_lmstudio_models(force=True)
        model_key = _find_lmstudio_model_key(model_name, models) or model_name

    target = next((item for item in models if isinstance(item, dict) and item.get("key") == model_key), None)
    loaded_instances = target.get("loaded_instances") if isinstance(target, dict) else None
    if loaded_instances:
        return {"provider": "lmstudio", "model": model_name, "resolved_model": model_key, "server_ready": True, "cached": True}

    try:
        loaded = _json_request(
            f"{api_base}/api/v1/models/load",
            method="POST",
            payload={"model": model_key, "context_length": 4096},
            headers=auth_header,
            timeout=120,
        )
    except urllib.error.HTTPError as e:
        detail = e.read().decode("utf-8", errors="replace")[-2000:]
        raise HTTPException(status_code=500, detail=f"LM Studio 모델 로드 실패: {detail or e.reason}") from e
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"LM Studio 모델 로드 실패: {e}") from e

    if str(loaded.get("status") or "") != "loaded":
        raise HTTPException(status_code=500, detail=f"LM Studio 모델 로드 실패: {loaded}")

    return {
        "provider": "lmstudio",
        "model": model_name,
        "resolved_model": model_key,
        "instance_id": loaded.get("instance_id"),
        "server_ready": True,
        "cached": False,
    }
|
|
371
|
+
|
|
372
|
+
def engine_support_status(engine: str) -> Dict[str, object]:
    """Report whether automatic installation of ``engine`` is supported here.

    Only ``"vllm"`` has platform restrictions; every other engine is reported
    as supported.

    Returns:
        ``{"supported": bool, "reason": Optional[str]}`` where ``reason`` is a
        user-facing explanation, or ``None`` when there is nothing to say.
    """
    if engine != "vllm":
        return {"supported": True, "reason": None}

    on_macos = sys.platform == "darwin"
    is_apple_silicon = on_macos and platform.machine() == "arm64"
    python_too_new = sys.version_info >= (3, 13)

    if sys.platform.startswith("win"):
        return {"supported": False, "reason": "vLLM은 Windows native 자동 설치보다 WSL2/Linux 환경을 권장합니다."}
    if on_macos and not is_apple_silicon:
        return {"supported": False, "reason": "vLLM Metal 자동 설치는 Apple Silicon macOS에서만 지원됩니다."}
    if python_too_new and is_apple_silicon:
        return {"supported": True, "reason": "현재 환경에서는 vLLM Metal 전용 런타임으로 설치합니다."}
    if python_too_new:
        # This branch rejects Python 3.13 and later, so the message must name
        # 3.12 as the newest accepted version.
        return {"supported": False, "reason": "vLLM 설치는 현재 Python 3.12 이하 또는 별도 전용 런타임이 필요합니다."}
    return {"supported": True, "reason": None}
|
|
385
|
+
|
|
386
|
+
def hf_model_ready(repo_id: str, provider: str = "local_mlx") -> bool:
|
|
387
|
+
model_dir = hf_model_dir(repo_id)
|
|
388
|
+
if provider == "vllm" and (not model_dir.exists() or not model_dir.is_dir()):
|
|
389
|
+
hf_cache_repo = Path.home() / ".cache" / "huggingface" / "hub" / f"models--{repo_id.replace('/', '--')}"
|
|
390
|
+
if hf_cache_repo.exists() and any(hf_cache_repo.glob("snapshots/*")):
|
|
391
|
+
return True
|
|
392
|
+
return False
|
|
393
|
+
if not model_dir.exists() or not model_dir.is_dir():
|
|
394
|
+
return False
|
|
395
|
+
if provider == "llamacpp":
|
|
396
|
+
return any(model_dir.rglob("*.gguf"))
|
|
397
|
+
has_config = (model_dir / "config.json").exists()
|
|
398
|
+
has_weights = any(model_dir.glob("*.safetensors")) or any(model_dir.glob("*.bin"))
|
|
399
|
+
has_tokenizer = (
|
|
400
|
+
(model_dir / "tokenizer.json").exists()
|
|
401
|
+
or (model_dir / "tokenizer.model").exists()
|
|
402
|
+
or (model_dir / "tokenizer_config.json").exists()
|
|
403
|
+
)
|
|
404
|
+
return has_config and has_weights and has_tokenizer
|
|
405
|
+
|
|
406
|
+
|
|
407
|
+
def model_download_progress_payload(
    stage: str,
    message: str,
    *,
    percent: Optional[float] = None,
    detail: Optional[str] = None,
    downloaded_bytes: Optional[int] = None,
    total_bytes: Optional[int] = None,
    eta_seconds: Optional[float] = None,
    file: Optional[str] = None,
    indeterminate: bool = False,
) -> Dict[str, object]:
    """Build one progress event describing a model download.

    ``stage``, ``message``, ``indeterminate`` and a ``ts`` wall-clock
    timestamp are always present. Optional fields appear only when supplied:
    ``percent`` is rounded to one decimal and clamped to 0..100, the byte
    counts are truncated to non-negative ints, ``eta_seconds`` is rounded to a
    non-negative whole number, and ``detail`` / ``file`` are included only
    when non-empty.
    """
    payload: Dict[str, object] = {
        "stage": stage,
        "message": message,
        "indeterminate": indeterminate,
        "ts": time.time(),
    }

    # (field name, include?, lazily computed value), in emission order. The
    # values are computed lazily so that conversions only run for fields that
    # were actually supplied.
    optional_fields = (
        ("percent", percent is not None, lambda: max(0, min(100, round(float(percent), 1)))),
        ("detail", bool(detail), lambda: detail),
        ("downloaded_bytes", downloaded_bytes is not None, lambda: max(0, int(downloaded_bytes))),
        ("total_bytes", total_bytes is not None, lambda: max(0, int(total_bytes))),
        ("eta_seconds", eta_seconds is not None, lambda: max(0, round(float(eta_seconds)))),
        ("file", bool(file), lambda: file),
    )
    for field_name, supplied, compute in optional_fields:
        if supplied:
            payload[field_name] = compute()
    return payload
|
|
438
|
+
|
|
439
|
+
|
|
440
|
+
def estimate_eta_seconds(started_at: float, percent: Optional[float]) -> Optional[float]:
    """Extrapolate the remaining seconds from elapsed time and percent done.

    Args:
        started_at: ``time.time()`` value captured when the work began.
        percent: Completion percentage.

    Returns:
        ``None`` when ``percent`` is missing, not above 0, or already at or
        above 100. Otherwise the elapsed time scaled by the remaining-to-done
        ratio, with elapsed time floored at zero.
    """
    no_estimate_possible = percent is None or percent <= 0 or percent >= 100
    if no_estimate_possible:
        return None
    seconds_so_far = max(0.0, time.time() - started_at)
    return seconds_so_far * (100.0 - percent) / percent
|
|
445
|
+
|
|
446
|
+
|
|
447
|
+
def hf_repo_files_with_sizes(repo_id: str) -> List[Dict[str, object]]:
    """List the files of a Hugging Face model repo as ``{"name", "size"}`` dicts.

    File metadata is requested first so sizes are known. When that yields no
    files or fails, the plain file listing is used and every size is reported
    as 0. Errors from the fallback listing propagate to the caller.
    """
    from huggingface_hub import HfApi

    client = HfApi()
    try:
        repo_info = client.model_info(repo_id, files_metadata=True)
        described: List[Dict[str, object]] = []
        for entry in getattr(repo_info, "siblings", []) or []:
            filename = str(getattr(entry, "rfilename", "") or "").strip()
            # Skip nameless entries and directory placeholders.
            if filename and not filename.endswith("/"):
                described.append({"name": filename, "size": int(getattr(entry, "size", 0) or 0)})
        if described:
            return described
    except TypeError:
        # NOTE(review): presumably raised by huggingface_hub versions that do
        # not accept ``files_metadata`` — confirm. Falls through to the plain
        # listing without logging.
        pass
    except Exception as e:
        logging.warning("huggingface model_info failed for %s: %s", repo_id, e)

    listed_names = (str(name) for name in client.list_repo_files(repo_id))
    return [{"name": name, "size": 0} for name in listed_names if name.strip()]
|
|
467
|
+
|
|
468
|
+
|
|
469
|
+
def download_hf_model(
|
|
470
|
+
repo_id: str,
|
|
471
|
+
provider: str = "local_mlx",
|
|
472
|
+
progress_emit=None,
|
|
473
|
+
) -> Dict[str, object]:
|
|
474
|
+
if importlib.util.find_spec("huggingface_hub") is None:
|
|
475
|
+
raise HTTPException(status_code=400, detail="huggingface_hub가 없습니다. 먼저 MLX runtime 설치를 진행해 주세요.")
|
|
476
|
+
|
|
477
|
+
target_dir = hf_model_dir(repo_id)
|
|
478
|
+
if hf_model_ready(repo_id, provider):
|
|
479
|
+
if progress_emit:
|
|
480
|
+
progress_emit(model_download_progress_payload(
|
|
481
|
+
"download",
|
|
482
|
+
"이미 다운로드된 모델을 확인했습니다.",
|
|
483
|
+
percent=100,
|
|
484
|
+
downloaded_bytes=0,
|
|
485
|
+
total_bytes=0,
|
|
486
|
+
eta_seconds=0,
|
|
487
|
+
))
|
|
488
|
+
return {"model": repo_id, "path": str(target_dir), "cached": True}
|
|
489
|
+
|
|
490
|
+
target_dir.mkdir(parents=True, exist_ok=True)
|
|
491
|
+
try:
|
|
492
|
+
from huggingface_hub import hf_hub_download
|
|
493
|
+
|
|
494
|
+
started_at = time.time()
|
|
495
|
+
all_files = hf_repo_files_with_sizes(repo_id)
|
|
496
|
+
if provider == "llamacpp":
|
|
497
|
+
ggufs = sorted(
|
|
498
|
+
[item for item in all_files if str(item["name"]).lower().endswith(".gguf")],
|
|
499
|
+
key=lambda item: str(item["name"]),
|
|
500
|
+
)
|
|
501
|
+
if not ggufs:
|
|
502
|
+
raise RuntimeError("GGUF 파일을 찾지 못했습니다.")
|
|
503
|
+
preference = ("q4_k_m", "q4_0", "q4_k_s", "q3_k_m", "q2_k")
|
|
504
|
+
selected_files = [
|
|
505
|
+
next(
|
|
506
|
+
(item for pref in preference for item in ggufs if pref in str(item["name"]).lower()),
|
|
507
|
+
ggufs[0],
|
|
508
|
+
)
|
|
509
|
+
]
|
|
510
|
+
else:
|
|
511
|
+
selected_files = all_files
|
|
512
|
+
|
|
513
|
+
total_bytes = sum(int(item.get("size") or 0) for item in selected_files) or None
|
|
514
|
+
downloaded_bytes = 0
|
|
515
|
+
total_files = max(1, len(selected_files))
|
|
516
|
+
if progress_emit:
|
|
517
|
+
progress_emit(model_download_progress_payload(
|
|
518
|
+
"download",
|
|
519
|
+
"모델 파일 정보를 확인했습니다.",
|
|
520
|
+
percent=0,
|
|
521
|
+
downloaded_bytes=0,
|
|
522
|
+
total_bytes=total_bytes,
|
|
523
|
+
indeterminate=total_bytes is None,
|
|
524
|
+
))
|
|
525
|
+
|
|
526
|
+
for index, item in enumerate(selected_files, start=1):
|
|
527
|
+
filename = str(item["name"])
|
|
528
|
+
size = int(item.get("size") or 0)
|
|
529
|
+
tqdm_class = None
|
|
530
|
+
if progress_emit:
|
|
531
|
+
current_percent = (
|
|
532
|
+
(downloaded_bytes / total_bytes) * 100 if total_bytes else ((index - 1) / total_files) * 100
|
|
533
|
+
)
|
|
534
|
+
progress_emit(model_download_progress_payload(
|
|
535
|
+
"download",
|
|
536
|
+
"모델 다운로드 중입니다.",
|
|
537
|
+
percent=current_percent,
|
|
538
|
+
detail=filename,
|
|
539
|
+
downloaded_bytes=downloaded_bytes,
|
|
540
|
+
total_bytes=total_bytes,
|
|
541
|
+
eta_seconds=estimate_eta_seconds(started_at, current_percent),
|
|
542
|
+
file=filename,
|
|
543
|
+
indeterminate=total_bytes is None and total_files <= 1,
|
|
544
|
+
))
|
|
545
|
+
try:
|
|
546
|
+
from tqdm.auto import tqdm as base_tqdm
|
|
547
|
+
|
|
548
|
+
downloaded_before = downloaded_bytes
|
|
549
|
+
last_emit = {"at": 0.0, "percent": -1.0}
|
|
550
|
+
|
|
551
|
+
def emit_byte_progress(done_bytes: float) -> None:
|
|
552
|
+
done = max(0, int(done_bytes or 0))
|
|
553
|
+
if total_bytes:
|
|
554
|
+
aggregate = min(total_bytes, downloaded_before + done)
|
|
555
|
+
percent = (aggregate / total_bytes) * 100
|
|
556
|
+
else:
|
|
557
|
+
file_total = size or done
|
|
558
|
+
file_ratio = min(1.0, done / file_total) if file_total else 0.0
|
|
559
|
+
aggregate = downloaded_before + done
|
|
560
|
+
percent = ((index - 1) + file_ratio) / total_files * 100
|
|
561
|
+
now = time.time()
|
|
562
|
+
if percent < 100 and now - last_emit["at"] < 0.5 and percent - last_emit["percent"] < 0.3:
|
|
563
|
+
return
|
|
564
|
+
last_emit["at"] = now
|
|
565
|
+
last_emit["percent"] = percent
|
|
566
|
+
progress_emit(model_download_progress_payload(
|
|
567
|
+
"download",
|
|
568
|
+
"모델 다운로드 중입니다.",
|
|
569
|
+
percent=percent,
|
|
570
|
+
detail=filename,
|
|
571
|
+
downloaded_bytes=aggregate,
|
|
572
|
+
total_bytes=total_bytes,
|
|
573
|
+
eta_seconds=estimate_eta_seconds(started_at, percent),
|
|
574
|
+
file=filename,
|
|
575
|
+
indeterminate=total_bytes is None and total_files <= 1,
|
|
576
|
+
))
|
|
577
|
+
|
|
578
|
+
class ProgressTqdm(base_tqdm):
|
|
579
|
+
def update(self, n=1):
|
|
580
|
+
result = super().update(n)
|
|
581
|
+
emit_byte_progress(float(getattr(self, "n", 0) or 0))
|
|
582
|
+
return result
|
|
583
|
+
|
|
584
|
+
tqdm_class = ProgressTqdm
|
|
585
|
+
except Exception:
|
|
586
|
+
tqdm_class = None
|
|
587
|
+
local_path = hf_hub_download(
|
|
588
|
+
repo_id=repo_id,
|
|
589
|
+
filename=filename,
|
|
590
|
+
local_dir=str(target_dir),
|
|
591
|
+
tqdm_class=tqdm_class,
|
|
592
|
+
)
|
|
593
|
+
if size <= 0:
|
|
594
|
+
try:
|
|
595
|
+
size = Path(local_path).stat().st_size
|
|
596
|
+
except OSError:
|
|
597
|
+
size = 0
|
|
598
|
+
downloaded_bytes += size
|
|
599
|
+
if progress_emit:
|
|
600
|
+
current_percent = (
|
|
601
|
+
(downloaded_bytes / total_bytes) * 100 if total_bytes else (index / total_files) * 100
|
|
602
|
+
)
|
|
603
|
+
progress_emit(model_download_progress_payload(
|
|
604
|
+
"download",
|
|
605
|
+
"모델 다운로드 중입니다.",
|
|
606
|
+
percent=current_percent,
|
|
607
|
+
detail=filename,
|
|
608
|
+
downloaded_bytes=downloaded_bytes,
|
|
609
|
+
total_bytes=total_bytes,
|
|
610
|
+
eta_seconds=estimate_eta_seconds(started_at, current_percent),
|
|
611
|
+
file=filename,
|
|
612
|
+
indeterminate=False,
|
|
613
|
+
))
|
|
614
|
+
|
|
615
|
+
if progress_emit:
|
|
616
|
+
progress_emit(model_download_progress_payload(
|
|
617
|
+
"download",
|
|
618
|
+
"모델 다운로드가 완료되었습니다.",
|
|
619
|
+
percent=100,
|
|
620
|
+
downloaded_bytes=downloaded_bytes,
|
|
621
|
+
total_bytes=total_bytes or downloaded_bytes,
|
|
622
|
+
eta_seconds=0,
|
|
623
|
+
))
|
|
624
|
+
except Exception as e:
|
|
625
|
+
raise HTTPException(status_code=500, detail=f"{repo_id} 다운로드 실패: {str(e)[-2000:]}")
|
|
626
|
+
|
|
627
|
+
if not hf_model_ready(repo_id, provider):
|
|
628
|
+
raise HTTPException(status_code=500, detail=f"{repo_id} 다운로드가 완료되지 않았습니다. 모델 파일을 찾지 못했습니다.")
|
|
629
|
+
|
|
630
|
+
return {"model": repo_id, "path": str(target_dir), "cached": False}
|
|
631
|
+
|
|
632
|
+
|
|
633
|
+
def pull_ollama_model_with_progress(model_name: str, progress_emit=None) -> Dict[str, object]:
    """Run ``ollama pull`` and relay its console output as progress events.

    Args:
        model_name: Model reference handed to ``ollama pull``.
        progress_emit: Optional callable that receives progress payloads built
            by ``model_download_progress_payload``.

    Returns:
        ``{"provider": "ollama", "model": model_name, "returncode": 0}``.

    Raises:
        HTTPException: 400 when Ollama is not installed; 500 when the pull
            exits non-zero. The detail then carries the last output lines.
    """
    ollama = local_binary("ollama")
    if not ollama:
        raise HTTPException(status_code=400, detail="Ollama가 설치되지 않았습니다.")
    started_at = time.time()
    if progress_emit:
        progress_emit(model_download_progress_payload(
            "download",
            "Ollama 모델 다운로드를 시작합니다.",
            percent=0,
            detail=model_name,
            indeterminate=True,
        ))

    percent_pattern = re.compile(r"(\d{1,3}(?:\.\d+)?)\s*%")
    last_percent: Optional[float] = None
    # Only the tail is used for error reporting, so keep the buffer bounded.
    tail_limit = 12
    tail_lines: List[str] = []

    # Using Popen as a context manager closes the stdout pipe and reaps the
    # child on every exit path, including after a kill.
    with subprocess.Popen(
        [ollama, "pull", model_name],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        bufsize=1,
    ) as process:
        try:
            for raw_line in process.stdout or ():
                # Progress redraws can arrive separated by carriage returns.
                for part in re.split(r"[\r\n]+", raw_line):
                    line = part.strip()
                    if not line:
                        continue
                    tail_lines.append(line)
                    del tail_lines[:-tail_limit]
                    match = percent_pattern.search(line)
                    if match:
                        last_percent = min(100.0, float(match.group(1)))
                    if progress_emit:
                        # Lines without a percentage repeat the last known
                        # value. Until one has been seen the bar is
                        # indeterminate.
                        progress_emit(model_download_progress_payload(
                            "download",
                            "Ollama 모델 다운로드 중입니다.",
                            percent=last_percent,
                            detail=line[-180:],
                            eta_seconds=estimate_eta_seconds(started_at, last_percent),
                            indeterminate=last_percent is None,
                        ))
            returncode = process.wait()
        except BaseException:
            # Never leave a pull running behind a failed or cancelled request.
            # The context manager waits for the killed child afterwards.
            process.kill()
            raise

    if returncode != 0:
        tail = "\n".join(tail_lines)
        raise HTTPException(status_code=500, detail=tail[-2000:] or "Ollama 모델 다운로드 실패")

    if progress_emit:
        progress_emit(model_download_progress_payload(
            "download",
            "Ollama 모델 다운로드가 완료되었습니다.",
            percent=100,
            detail=model_name,
            eta_seconds=0,
            indeterminate=False,
        ))
    return {"provider": "ollama", "model": model_name, "returncode": returncode}
|
|
703
|
+
|
|
704
|
+
|
|
705
|
+
def get_ollama_pulled_models() -> set:
    """Return the model names reported by ``ollama list``.

    The first output line is treated as a header and skipped; the first
    whitespace-separated token of each remaining non-blank line is taken as
    the model name. An empty set is returned when Ollama is not installed or
    the command fails in any way, including exceeding the 5 second timeout.
    """
    ollama = local_binary("ollama")
    if not ollama:
        return set()
    try:
        listing = subprocess.run([ollama, "list"], capture_output=True, text=True, timeout=5, check=False)
        rows = (row.split() for row in listing.stdout.splitlines()[1:])
        return {columns[0] for columns in rows if columns}
    except Exception:
        return set()
|
|
719
|
+
|
|
720
|
+
|
|
721
|
+
def get_openai_compatible_server_models(provider: str) -> List[str]:
    """List the model ids currently exposed by a local OpenAI-compatible server.

    For ``"lmstudio"`` the list is derived from ``get_lmstudio_models()``:
    only entries with loaded instances contribute, using their instance ids
    (or the model key when no instance carries an id), de-duplicated in
    order. For every other provider the server's ``GET {base_url}/models``
    endpoint is queried with a 2.5 second timeout.

    Args:
        provider: Provider key looked up in ``OPENAI_COMPATIBLE_PROVIDERS``.

    Returns:
        Model ids as strings. An empty list is returned when no base URL is
        configured, the server cannot be reached, or the response is not the
        expected ``{"data": [{"id": ...}, ...]}`` shape.
    """
    import http.client  # local import: only needed for the exception type

    if provider == "lmstudio":
        models = []
        for item in get_lmstudio_models():
            if not isinstance(item, dict):
                continue
            key = str(item.get("key") or "").strip()
            loaded_instances = item.get("loaded_instances") or []
            if loaded_instances:
                instance_ids = [
                    str(instance.get("id") or "").strip()
                    for instance in loaded_instances
                    if isinstance(instance, dict) and instance.get("id")
                ]
                models.extend(instance_ids or ([key] if key else []))
        # dict.fromkeys de-duplicates while keeping first-seen order.
        return list(dict.fromkeys([model for model in models if model]))

    config = OPENAI_COMPATIBLE_PROVIDERS.get(provider) or {}
    base_url_env = config.get("base_url_env")
    base_url = os.getenv(base_url_env) if base_url_env else None
    base_url = (base_url or config.get("base_url") or "").rstrip("/")
    if not base_url:
        return []

    # Guard the env lookup: os.getenv(None) raises TypeError.
    env_key = config.get("env_key")
    api_key = (os.getenv(env_key) if env_key else None) or config.get("api_key_fallback") or provider

    try:
        # Request() itself raises ValueError for a URL without a scheme, so it
        # has to live inside the try block as well.
        req = urllib.request.Request(
            f"{base_url}/models",
            headers={"Authorization": f"Bearer {api_key}"},
            method="GET",
        )
        with urllib.request.urlopen(req, timeout=2.5) as res:
            payload = json.loads(res.read().decode("utf-8", errors="replace"))
    except (OSError, ValueError, http.client.HTTPException):
        # OSError covers URLError/TimeoutError; ValueError covers
        # JSONDecodeError and malformed URLs.
        return []

    if not isinstance(payload, dict):
        return []
    entries = payload.get("data")
    if not isinstance(entries, list):
        return []
    return [
        str(item["id"])
        for item in entries
        if isinstance(item, dict) and item.get("id")
    ]
|
|
762
|
+
|
|
763
|
+
|
|
764
|
+
def ensure_ollama_server() -> None:
    """Make sure an Ollama server answers, starting ``ollama serve`` if needed.

    A successful ``ollama list`` (exit code 0) is used as the liveness probe.
    When the first probe fails, ``ollama serve`` is spawned in a new session
    with its output discarded, and the probe is retried every 0.5 seconds for
    up to 20 seconds.

    Raises:
        HTTPException: 400 when the ``ollama`` binary cannot be located,
            500 when the server does not answer before the deadline.
    """
    binary = local_binary("ollama")
    if not binary:
        raise HTTPException(status_code=400, detail="Ollama가 설치되지 않았습니다.")

    def server_responds() -> bool:
        # Any failure to run the probe counts as "not up yet".
        try:
            probe = subprocess.run(
                [binary, "list"],
                capture_output=True,
                text=True,
                timeout=3,
                check=False,
            )
        except Exception:
            return False
        return probe.returncode == 0

    if server_responds():
        return

    subprocess.Popen(
        [binary, "serve"],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        start_new_session=True,
    )
    give_up_at = time.time() + 20
    while time.time() < give_up_at:
        if server_responds():
            return
        time.sleep(0.5)
    raise HTTPException(status_code=500, detail="Ollama 서버를 자동으로 시작하지 못했습니다.")
|
|
790
|
+
|
|
791
|
+
|
|
792
|
+
def wait_for_openai_compatible_server(provider: str, model_name: Optional[str] = None, timeout: int = 45) -> bool:
    """Poll a local OpenAI-compatible server until it lists models.

    Args:
        provider: Provider key passed to ``get_openai_compatible_server_models``.
        model_name: When given, the server must list exactly this model id;
            otherwise any non-empty model list counts as ready.
        timeout: Maximum number of seconds to keep polling.

    Returns:
        True as soon as the readiness condition holds, False once the
        timeout elapses without it.
    """
    # Monotonic clock: a wall-clock adjustment must not shorten or extend the wait.
    deadline = time.monotonic() + timeout
    while True:
        # Poll first so that even a zero/negative timeout checks the server once.
        models = get_openai_compatible_server_models(provider)
        if models and (not model_name or model_name in models):
            return True
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            return False
        # Never sleep past the deadline.
        time.sleep(min(1.0, remaining))
|
|
800
|
+
|
|
801
|
+
|
|
802
|
+
def ensure_vllm_server(model_name: str) -> None:
    """Ensure a vLLM server is serving ``model_name``, (re)starting it if needed.

    Steps:
      1. Return immediately when the server already lists ``model_name``.
      2. Require some vLLM runtime (vllm-metal interpreter, ``vllm``
         executable, or importable ``vllm`` package).
      3. Unless the vllm-metal interpreter is used, download the model when
         ``hf_model_ready`` reports it missing.
      4. Stop the server process this module started earlier, or refuse with
         409 when a server this module does not track is serving models.
      5. Spawn the server on 127.0.0.1:8000 and wait up to 90 seconds for the
         model to be listed.

    Args:
        model_name: Model identifier to serve (also used as the served name).

    Raises:
        HTTPException: 400 when no vLLM runtime is installed, 409 when an
            untracked server is already serving models, 500 when the model
            is not listed within the wait window.
    """
    served_models = get_openai_compatible_server_models("vllm")
    if model_name in served_models:
        return
    vllm_bin = vllm_executable()
    vllm_metal_py = vllm_metal_python()
    if not vllm_bin and not vllm_metal_py and importlib.util.find_spec("vllm") is None:
        raise HTTPException(status_code=400, detail="vLLM runtime이 설치되지 않았습니다.")

    local_dir = hf_model_dir(model_name)
    if not vllm_metal_py and not hf_model_ready(model_name, "vllm"):
        download_hf_model(model_name, "vllm")

    running = LOCAL_SERVER_PROCESSES.get("vllm")
    if running and running.poll() is None:
        running.terminate()
        try:
            running.wait(timeout=10)
        except subprocess.TimeoutExpired:
            running.kill()
            # Reap the killed process. Without this, poll() can still report
            # "running" and the replacement server would never be started.
            running.wait()
    elif served_models:
        raise HTTPException(status_code=409, detail="다른 vLLM 서버가 이미 실행 중입니다. 현재 서버를 종료한 뒤 다시 시도하세요.")

    # NOTE(review): the port is fixed here while the readiness probe reads its
    # base URL from provider config — presumably both point at :8000; confirm.
    _host_args = ["--host", "127.0.0.1", "--port", "8000"]
    if vllm_metal_py:
        command = [vllm_metal_py, "-m", "vllm_metal.server", "--model", model_name, *_host_args]
    elif vllm_bin:
        command = [vllm_bin, "serve", str(local_dir), "--served-model-name", model_name, *_host_args]
    else:
        command = [sys.executable, "-m", "vllm.entrypoints.openai.api_server", "--model", str(local_dir), "--served-model-name", model_name, *_host_args]
    LOCAL_SERVER_PROCESSES["vllm"] = subprocess.Popen(
        command,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        start_new_session=True,
    )
    if not wait_for_openai_compatible_server("vllm", model_name, timeout=90):
        raise HTTPException(status_code=500, detail="vLLM 서버가 모델을 자동 로드하지 못했습니다.")
|
|
844
|
+
|
|
845
|
+
|
|
846
|
+
def ensure_llamacpp_server(model_name: str) -> None:
    """Ensure a llama.cpp server is serving ``model_name``, (re)starting it if needed.

    The server process previously started by this module is stopped first;
    if models are being served by a process this module does not track, the
    call is refused. The model is downloaded when ``hf_model_ready`` reports
    it missing, a ``*.gguf`` file is picked (preferring a name containing
    ``q4_k_m``, otherwise the first in sorted order), and ``llama-server`` is
    spawned on 127.0.0.1:8080 with ``model_name`` as its alias.

    Args:
        model_name: Model identifier to serve.

    Raises:
        HTTPException: 409 when an untracked server is already serving
            models, 400 when ``llama-server`` is not on PATH, 500 when no
            GGUF file is found or the model is not listed within 45 seconds.
    """
    served_models = get_openai_compatible_server_models("llamacpp")
    if model_name in served_models:
        return
    running = LOCAL_SERVER_PROCESSES.get("llamacpp")
    if running and running.poll() is None:
        running.terminate()
        try:
            running.wait(timeout=10)
        except subprocess.TimeoutExpired:
            running.kill()
            # Reap the killed process so the port is released before the
            # replacement server is spawned below.
            running.wait()
    elif served_models:
        raise HTTPException(status_code=409, detail="다른 llama.cpp 서버가 이미 실행 중입니다. 현재 서버를 종료한 뒤 다시 시도하세요.")
    if not shutil.which("llama-server"):
        raise HTTPException(status_code=400, detail="llama.cpp가 설치되지 않았습니다.")
    if not hf_model_ready(model_name, "llamacpp"):
        download_hf_model(model_name, "llamacpp")

    gguf_files = sorted(hf_model_dir(model_name).rglob("*.gguf"))
    if not gguf_files:
        raise HTTPException(status_code=500, detail="다운로드된 GGUF 파일을 찾지 못했습니다.")

    preferred = next((p for p in gguf_files if "q4_k_m" in p.name.lower()), None)
    model_file = preferred or gguf_files[0]
    LOCAL_SERVER_PROCESSES["llamacpp"] = subprocess.Popen(
        [
            "llama-server",
            "-m",
            str(model_file),
            "--alias",
            model_name,
            "--host",
            "127.0.0.1",
            "--port",
            "8080",
        ],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        start_new_session=True,
    )
    if not wait_for_openai_compatible_server("llamacpp", model_name, timeout=45):
        raise HTTPException(status_code=500, detail="llama.cpp 서버가 모델을 자동 로드하지 못했습니다.")
|
|
888
|
+
|
|
889
|
+
|
|
890
|
+
def engine_installed(engine: str) -> bool:
    """Report whether the runtime for ``engine`` appears to be available.

    Each known engine has its own probe (importable package, binary lookup,
    or application path). Cloud providers are considered installed when the
    ``AsyncOpenAI`` client name is not ``None``. Unknown engines yield False.
    """

    def importable(module_name: str) -> bool:
        return importlib.util.find_spec(module_name) is not None

    # Probes are wrapped in lambdas so only the selected one is evaluated.
    probes = {
        "local_mlx": lambda: importable("mlx") and importable("mlx_lm"),
        "ollama": lambda: local_binary("ollama") is not None,
        "vllm": lambda: (
            vllm_metal_python() is not None
            or vllm_executable() is not None
            or importable("vllm")
        ),
        "lmstudio": lambda: (
            find_lmstudio_cli() is not None
            or Path("/Applications/LM Studio.app").exists()
        ),
        "llamacpp": lambda: shutil.which("llama-server") is not None,
    }
    probe = probes.get(engine)
    if probe is not None:
        return probe()
    if engine in {"openai", "openrouter", "groq", "together", "xai"}:
        return AsyncOpenAI is not None
    return False
|
|
904
|
+
|
|
905
|
+
def _catalog_models_with_pull_state(engine: str, id_prefix: str = "") -> List[Dict]:
    """Copy ``engine``'s catalog entries, adding a ``pulled`` flag from ``hf_model_ready``."""
    entries = [
        {**m, "pulled": hf_model_ready(m["id"].removeprefix(id_prefix), engine)}
        for m in ENGINE_MODEL_CATALOG.get(engine, [])
    ]
    return filter_lower_family_versions(entries)


def engine_status() -> List[Dict]:
    """Build the engine overview: one dict per engine with its model list.

    The result contains, in order: the MLX engine, Ollama, the local-server
    engines (vLLM, LM Studio, llama.cpp) and the cloud providers. Catalog
    models carry a ``pulled`` flag; cloud models carry the cached
    verification result from ``CLOUD_VERIFY_CACHE`` when one exists.

    Side effect: creates ``HF_MODELS_ROOT`` if it does not exist.

    Returns:
        A list of engine description dicts.
    """
    cloud_by_provider: Dict[str, List[Dict]] = {}
    for model in router.detected_cloud_models():
        cloud_by_provider.setdefault(model["provider"], []).append(model)

    ollama_installed = engine_installed("ollama")
    pulled = get_ollama_pulled_models() if ollama_installed else set()
    # .get keeps this consistent with the other catalog lookups below.
    ollama_models = filter_lower_family_versions([
        {**m, "pulled": m["id"].removeprefix("ollama:") in pulled}
        for m in ENGINE_MODEL_CATALOG.get("ollama", [])
    ])

    HF_MODELS_ROOT.mkdir(parents=True, exist_ok=True)
    mlx_models = _catalog_models_with_pull_state("local_mlx")
    vllm_models = _catalog_models_with_pull_state("vllm", "vllm:")

    # LM Studio: models reported by LM Studio come first, then catalog entries
    # that are not among them.
    downloaded_lmstudio = get_lmstudio_models()
    lmstudio_models = []
    downloaded_keys = set()
    for item in downloaded_lmstudio:
        if not isinstance(item, dict):
            continue
        key = str(item.get("key") or "").strip()
        if not key:
            continue
        downloaded_keys.add(key)
        loaded_instances = item.get("loaded_instances") or []
        lmstudio_models.append({
            "id": f"lmstudio:{key}",
            "name": item.get("display_name") or f"LM Studio · {key}",
            "family": item.get("architecture") or item.get("publisher") or "LM Studio",
            "tag": "loaded-server-model" if loaded_instances else "downloaded",
            "size": item.get("params_string") or item.get("format") or "LM Studio",
            "pullable": True,
            "pulled": True,
        })
    # With no downloaded models this appends the whole catalog, so a separate
    # "nothing downloaded" branch is unnecessary.
    for m in ENGINE_MODEL_CATALOG.get("lmstudio", []):
        if m["id"].removeprefix("lmstudio:") not in downloaded_keys:
            lmstudio_models.append({**m, "pulled": False})
    lmstudio_models = filter_lower_family_versions(lmstudio_models)

    llamacpp_models = _catalog_models_with_pull_state("llamacpp", "llamacpp:")

    # A spec without "note" falls back to the support reason in the loop
    # below, so vLLM does not need its own engine_support_status() call here.
    local_server_specs = [
        {
            "id": "vllm",
            "name": "vLLM",
            "description": "vLLM OpenAI 호환 서버(예: http://localhost:8000/v1)에 연결합니다.",
            "requires": "VLLM_BASE_URL",
        },
        {
            "id": "lmstudio",
            "name": "LM Studio",
            "description": "LM Studio 로컬 OpenAI 호환 서버에 연결합니다.",
            "requires": "LMSTUDIO_BASE_URL",
            "note": (
                "다운로드된 모델은 자동 감지하고, 선택 시 필요하면 다운로드 후 바로 로드합니다."
                if downloaded_lmstudio else
                "LM Studio 설치 후 모델을 선택하면 Local Server 시작, 다운로드, 로드를 자동으로 진행합니다."
            ),
            "server_ready": bool(downloaded_lmstudio),
        },
        {
            "id": "llamacpp",
            "name": "llama.cpp",
            "description": "llama.cpp 서버(OpenAI 호환 /v1)에 연결합니다.",
            "requires": "LLAMACPP_BASE_URL",
        },
    ]
    local_server_models = {
        "vllm": vllm_models,
        "lmstudio": lmstudio_models,
        "llamacpp": llamacpp_models,
    }

    engines = [
        {
            "id": "local_mlx",
            "name": "MLX",
            "kind": "local",
            "description": "Apple Silicon GPU에서 MLX/MLX-VLM 모델을 직접 실행합니다.",
            "installed": engine_installed("local_mlx"),
            "installable": True,
            "install_label": ENGINE_INSTALLERS.get("local_mlx", {}).get("label"),
            "models": mlx_models,
        },
        {
            "id": "ollama",
            "name": "Ollama",
            "kind": "local-server",
            "description": "Ollama 로컬 서버를 OpenAI 호환 엔진처럼 사용합니다.",
            "installed": ollama_installed,
            "installable": True,
            "install_label": ENGINE_INSTALLERS.get("ollama", {}).get("label"),
            "models": ollama_models,
        },
    ]
    for spec in local_server_specs:
        engine_id = spec["id"]
        support = engine_support_status(engine_id)
        engines.append({
            "id": engine_id,
            "name": spec["name"],
            "kind": "local-server",
            "description": spec["description"],
            "installed": engine_installed(engine_id),
            "supported": support["supported"],
            "support_reason": support["reason"],
            "installable": support["supported"] and engine_id in ENGINE_INSTALLERS,
            "install_label": ENGINE_INSTALLERS.get(engine_id, {}).get("label"),
            "requires": spec["requires"],
            "models": local_server_models.get(engine_id, ENGINE_MODEL_CATALOG.get(engine_id, [])),
            "note": spec.get("note") or support["reason"] or f"{spec['requires']} 설정 시 활성화됩니다.",
            "server_ready": spec.get("server_ready"),
        })
    for provider in ["openai", "openrouter", "groq", "together", "xai"]:
        provider_entries = cloud_by_provider.get(provider, [])
        # First non-empty "requires" among the provider's models.
        env_key = next((item.get("requires") for item in provider_entries if item.get("requires")), None)
        provider_models = []
        for model in provider_entries:
            cache = CLOUD_VERIFY_CACHE.get(model.get("id"))
            provider_models.append({
                **model,
                "verified": cache.get("ok") if cache else None,
                "verify_reason": cache.get("reason") if cache else None,
            })
        engines.append({
            "id": provider,
            "name": provider.title(),
            "kind": "cloud",
            "description": "OpenAI 호환 Chat Completions API로 cloud LLM을 실행합니다.",
            "installed": engine_installed(provider),
            "installable": True,
            "install_label": ENGINE_INSTALLERS.get(provider, {}).get("label"),
            "requires": env_key,
            "models": provider_models,
        })
    return engines
|
|
1064
|
+
|
|
1065
|
+
def runtime_features() -> Dict:
    """Return a snapshot of the runtime configuration flags as a plain dict.

    The payload groups module-level settings into top-level values plus the
    ``security``, ``local_only_features`` and ``public_features`` sections.
    """
    data_dir = str(DATA_DIR)
    memory_policy = router.model_memory_policy()

    security = {
        "host": DEFAULT_HOST,
        "require_auth": REQUIRE_AUTH,
        "invite_gate_enabled": INVITE_GATE_ENABLED,
        "keyring_available": keyring is not None,
        "plaintext_api_keys_allowed": ALLOW_PLAINTEXT_API_KEYS,
        "cors_allow_network": CORS_ALLOW_NETWORK,
    }
    local_only_features = {
        "mlx": ALLOW_LOCAL_MODELS and not IS_PUBLIC_MODE,
        "telegram_bridge": ENABLE_TELEGRAM,
        "desktop_chrome_bridge": not IS_PUBLIC_MODE,
        "computer_use_bridge": not IS_PUBLIC_MODE,
    }
    public_features = {
        "web_ui": True,
        "openai_compatible_models": True,
        "persistent_data_dir": data_dir,
    }

    if IS_PUBLIC_MODE:
        default_model = PUBLIC_MODEL
    else:
        default_model = LOCAL_MODEL

    return {
        "mode": APP_MODE,
        "public": IS_PUBLIC_MODE,
        "host": DEFAULT_HOST,
        "port": DEFAULT_PORT,
        "data_dir": data_dir,
        "telegram_enabled": ENABLE_TELEGRAM,
        "graph_enabled": ENABLE_GRAPH,
        "autoload_models": AUTOLOAD_MODELS,
        "model_idle_unload_seconds": MODEL_IDLE_UNLOAD_SECONDS,
        "model_memory_policy": memory_policy,
        "allow_local_models": ALLOW_LOCAL_MODELS,
        "security": security,
        "default_model": default_model,
        "local_only_features": local_only_features,
        "public_features": public_features,
    }
|
|
1099
|
+
|
|
1100
|
+
def install_engine(engine: str) -> Dict:
    """Run the registered installer command for ``engine`` and report the outcome.

    On Apple Silicon macOS the vLLM installer is replaced by a shell script
    that creates ``~/.venv-vllm-metal`` and installs ``vllm-metal`` into it.
    After a successful Ollama install, ``ollama serve`` is started unless a
    server already answers ``ollama list``.

    Args:
        engine: Key into ``ENGINE_INSTALLERS``.

    Returns:
        A dict with ``engine``, ``command``, ``returncode``, the last 12000
        characters of ``stdout``/``stderr``, ``installed`` and, for Ollama,
        ``daemon_started`` (``True``, ``False`` or ``"already_running"``).

    Raises:
        HTTPException: 400 for an unknown engine or a missing prerequisite
            binary, 408 when the installer exceeds 900 seconds, 500 when the
            installer command cannot be executed at all.
    """
    if engine not in ENGINE_INSTALLERS:
        raise HTTPException(status_code=400, detail="지원하지 않는 엔진입니다.")
    installer = ENGINE_INSTALLERS[engine]
    required_binary = installer.get("requires_binary")
    if required_binary and shutil.which(required_binary) is None:
        raise HTTPException(status_code=400, detail=f"{required_binary}가 설치되어 있지 않아 자동 설치할 수 없습니다.")
    command = installer["command"]

    if engine == "vllm" and sys.platform == "darwin" and platform.machine() == "arm64":
        command = [
            "/bin/bash",
            "-lc",
            "set -euo pipefail; "
            "if [ ! -x /opt/homebrew/bin/python3.12 ]; then brew install python@3.12; fi; "
            "/opt/homebrew/bin/python3.12 -m venv ~/.venv-vllm-metal; "
            "~/.venv-vllm-metal/bin/pip install -U pip setuptools wheel; "
            "~/.venv-vllm-metal/bin/pip install vllm-metal",
        ]
    try:
        completed = subprocess.run(
            command,
            cwd=str(BASE_DIR),
            capture_output=True,
            text=True,
            timeout=900,
            check=False,
        )
    except subprocess.TimeoutExpired as exc:
        raise HTTPException(status_code=408, detail="엔진 설치 시간이 초과되었습니다.") from exc
    except OSError as exc:
        # e.g. the installer executable itself is missing or not executable.
        raise HTTPException(status_code=500, detail=f"엔진 설치 명령을 실행하지 못했습니다: {exc}") from exc

    result = {
        "engine": engine,
        "command": " ".join(command),
        "returncode": completed.returncode,
        "stdout": completed.stdout[-12000:],
        "stderr": completed.stderr[-12000:],
        "installed": engine_installed(engine),
    }
    if engine != "ollama" or completed.returncode != 0:
        return result

    ollama = local_binary("ollama")
    if not ollama:
        return result

    # Skip if already running to avoid orphan daemons.
    try:
        probe = subprocess.run([ollama, "list"], capture_output=True, timeout=2, check=False)
        already_up = probe.returncode == 0
    except Exception:
        already_up = False

    if already_up:
        result["daemon_started"] = "already_running"
        return result

    try:
        # Detach so the daemon survives this request but doesn't become our zombie.
        subprocess.Popen(
            [ollama, "serve"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            start_new_session=True,
        )
        result["daemon_started"] = True
    except Exception as e:
        logging.warning("ollama serve spawn failed: %s", e)
        result["daemon_started"] = False
    return result
|
|
1163
|
+
|
|
1164
|
+
|
|
1165
|
+
def _resolve_model_alias(model_id: str, engine: Optional[str] = None) -> str:
    """Translate a model alias into the engine-specific model reference.

    The target engine is taken from a recognised ``<engine>:`` prefix on
    ``model_id`` when present, otherwise from ``engine`` (empty or ``mlx``
    meaning ``local_mlx``). The bare name is looked up case-insensitively in
    ``MODEL_ENGINE_ALIASES``.

    Returns:
        The mapped reference (``<engine>:<name>``, or just the name for
        ``local_mlx``), or the stripped input unchanged when no alias applies.
    """
    stripped = model_id.strip()
    hint = (engine or "").strip().lower()

    head, colon, tail = stripped.partition(":")
    head = head.strip().lower()
    if colon and head in {"ollama", "vllm", "lmstudio", "llamacpp", "local_mlx", "mlx"}:
        # An explicit engine prefix wins over the engine hint.
        backend = "local_mlx" if head in {"local_mlx", "mlx"} else head
        bare_name = tail.strip()
    else:
        backend = "local_mlx" if hint in {"", "local_mlx", "mlx"} else hint
        bare_name = stripped

    per_engine = MODEL_ENGINE_ALIASES.get(bare_name.lower())
    target = per_engine.get(backend) if per_engine else None
    if not target:
        return stripped
    if backend == "local_mlx":
        return target
    return f"{backend}:{target}"
|
|
1184
|
+
|
|
1185
|
+
|
|
1186
|
+
def normalize_local_model_request(model_id: str, engine: Optional[str] = None) -> str:
    """Resolve aliases and make the engine prefix on ``model_id`` consistent.

    For the MLX engine a leading ``local_mlx:``/``mlx:`` prefix is removed.
    For any other engine hint, an id that contains no ``:`` gets the
    ``<engine>:`` prefix added. Everything else is returned as resolved.
    """
    resolved = _resolve_model_alias(model_id, engine)
    hint = (engine or "").strip().lower()

    if hint in {"local_mlx", "mlx"}:
        if resolved.startswith(("local_mlx:", "mlx:")):
            return resolved.split(":", 1)[1].strip()
        return resolved

    # NOTE(review): an id that already contains ":" for another reason (for
    # example a tag suffix) is left without an engine prefix — confirm that
    # downstream parsing handles that case as intended.
    if hint and ":" not in resolved:
        return f"{hint}:{resolved}"
    return resolved
|
|
1194
|
+
|
|
1195
|
+
|
|
1196
|
+
def ensure_engine_ready(engine: str) -> Dict[str, object]:
    """Make sure ``engine`` is usable, installing it when possible.

    Returns:
        ``{"engine", "installed": True, "installed_now"}``; when an install
        was performed the installer result is included under ``"install"``.

    Raises:
        HTTPException: 400 when the engine is unknown, unsupported on this
            system, or missing with no registered installer; 500 when the
            installer fails or the engine is still not detected afterwards.
    """
    if engine == "mlx":
        engine = "local_mlx"

    has_installer = engine in ENGINE_INSTALLERS
    if not has_installer and engine not in OPENAI_COMPATIBLE_PROVIDERS:
        raise HTTPException(status_code=400, detail=f"지원하지 않는 엔진입니다: {engine}")

    support = engine_support_status(engine)
    if not support["supported"]:
        raise HTTPException(status_code=400, detail=str(support["reason"]))

    is_mlx = engine == "local_mlx"

    if engine_installed(engine):
        if is_mlx:
            ensure_mlx_runtime()
        return {"engine": engine, "installed": True, "installed_now": False}

    if not has_installer:
        raise HTTPException(status_code=400, detail=f"{engine} 엔진 설치 방법이 등록되어 있지 않습니다.")

    outcome = install_engine(engine)
    install_failed = outcome.get("returncode") not in (0, None)
    if install_failed or not engine_installed(engine):
        # Prefer the installer's own output as the error detail.
        detail = outcome.get("stderr") or outcome.get("stdout") or f"{engine} 설치에 실패했습니다."
        raise HTTPException(status_code=500, detail=str(detail)[-2000:])

    if is_mlx:
        ensure_mlx_runtime()
    return {"engine": engine, "installed": True, "installed_now": True, "install": outcome}
|
|
1220
|
+
|
|
1221
|
+
|
|
1222
|
+
def build_model_resolution(
    input_id: str,
    engine: Optional[str],
    *,
    user_email: Optional[str] = None,
    display_name: Optional[str] = None,
) -> _ModelResolution:
    """Shared ModelResolution factory (feedback items #1/#2).

    Takes the ``input_id`` the user clicked plus the engine hint and builds
    the canonical identity that every later stage shares. The display name
    falls back to ``input_id`` when none is given.
    """
    canonical_id = normalize_local_model_request(input_id, engine)
    label = display_name if display_name else input_id
    return _ModelResolution.from_request(
        canonical_id,
        engine=engine,
        user_email=user_email,
        display_name=label,
        engine_aliases=MODEL_ENGINE_ALIASES,
    )
|
|
1242
|
+
|
|
1243
|
+
|
|
1244
|
+
_LOCAL_SMOKE_ENGINES = {"local_mlx", "ollama", "vllm", "lmstudio", "llamacpp"}
|
|
1245
|
+
|
|
1246
|
+
|
|
1247
|
+
async def _smoke_test_loaded_model(
    resolution: _ModelResolution,
    *,
    api_key_override: Optional[str] = None,
) -> Dict[str, object]:
    """Run a short chat test right after loading to decide ``ready_to_chat``.

    Cloud models (OpenAI/Anthropic/OpenRouter etc.) are skipped because the
    test could cost the user money. Failures never raise; the outcome is also
    recorded in the compat cache.

    Returns:
        A dict with ``ok``, ``reason``, ``answer`` and ``profile``; it also
        carries ``skipped`` for skipped engines and ``status`` otherwise.
    """
    load_id = resolution.load_id
    engine = resolution.engine

    if (engine or "").lower() not in _LOCAL_SMOKE_ENGINES:
        skipped_profile = _ensure_compat_profile(load_id, engine)
        return {
            "ok": True,
            "reason": "skipped (cloud model — smoke test would incur cost)",
            "answer": None,
            "profile": skipped_profile.to_dict(),
            "skipped": True,
        }

    try:
        generation = router.generate(
            _SMOKE_PROMPT,
            context=None,
            max_tokens=128,
            temperature=0.1,
        )
        text = await asyncio.wait_for(generation, timeout=30)
    except Exception as exc:  # pragma: no cover - generator may not exist on all engines
        # An exception with an empty message falls back to a generic reason.
        failure_reason = str(exc)[:200] or "generation_failed"
        failed_profile = _record_smoke_result(
            load_id, engine, False, failure_reason, status="failed"
        )
        return {
            "ok": False,
            "status": "failed",
            "reason": failure_reason,
            "answer": None,
            "profile": failed_profile.to_dict(),
        }

    compat_profile = _ensure_compat_profile(load_id, engine)
    answer = _compat_fast_postprocess(str(text or ""), compat_profile.to_dict())
    # item 3-3: three-way classification ok / degraded / failed; a degraded
    # model can still chat.
    status, reason = _classify_smoke_response(answer)
    usable = status != "failed"
    recorded_profile = _record_smoke_result(
        load_id, engine, usable, reason, status=status
    )
    return {
        "ok": usable,
        "status": status,
        "reason": reason,
        "answer": answer,
        "profile": recorded_profile.to_dict(),
    }
|
|
1304
|
+
|
|
1305
|
+
|
|
1306
|
+
def _prepare_local_backend(parsed_provider: str, parsed_model: str, model_id: str) -> tuple:
    """Blocking preparation of a local engine: install, download, start server.

    Returns:
        ``(model_id, install_result, download_result)``. ``model_id`` is
        rewritten for LM Studio to the id reported by ``ensure_lmstudio_model``.
    """
    install_result: Dict[str, object] = ensure_engine_ready(parsed_provider)
    download_result: Optional[Dict[str, object]] = None

    if parsed_provider == "local_mlx":
        explicit_path = Path(parsed_model).expanduser()
        # An existing local path is used as-is; otherwise fetch when missing.
        if not explicit_path.exists() and not hf_model_ready(parsed_model, "local_mlx"):
            download_result = download_hf_model(parsed_model, "local_mlx")
    elif parsed_provider == "ollama":
        ensure_ollama_server()
        ollama = local_binary("ollama")
        if not ollama:
            raise HTTPException(status_code=400, detail="Ollama가 설치되지 않았습니다.")
        if parsed_model not in get_ollama_pulled_models():
            try:
                completed = subprocess.run(
                    [ollama, "pull", parsed_model],
                    capture_output=True,
                    text=True,
                    timeout=900,
                    check=False,
                )
            except subprocess.TimeoutExpired as exc:
                raise HTTPException(status_code=408, detail="Ollama 모델 다운로드 시간이 초과되었습니다.") from exc
            if completed.returncode != 0:
                raise HTTPException(status_code=500, detail=completed.stderr[-2000:] or "Ollama 모델 다운로드 실패")
            download_result = {"provider": "ollama", "model": parsed_model, "returncode": completed.returncode}
    elif parsed_provider == "vllm":
        ensure_vllm_server(parsed_model)
        download_result = {"provider": "vllm", "model": parsed_model, "server_ready": True}
    elif parsed_provider == "llamacpp":
        ensure_llamacpp_server(parsed_model)
        download_result = {"provider": "llamacpp", "model": parsed_model, "server_ready": True}
    elif parsed_provider == "lmstudio":
        ensured = ensure_lmstudio_model(parsed_model)
        resolved_model = str(
            ensured.get("instance_id")
            or ensured.get("resolved_model")
            or parsed_model
        ).strip()
        model_id = f"lmstudio:{resolved_model}"
        download_result = ensured

    return model_id, install_result, download_result


async def prepare_and_load_model(
    model_id: str,
    request: Request,
    engine: Optional[str] = None,
    user_email: Optional[str] = None,
    adapter_path: Optional[str] = None,
    draft_model_id: Optional[str] = None,
) -> Dict[str, object]:
    """Prepare the engine for ``model_id``, load the model and smoke-test it.

    Local engines are installed/started and the model downloaded as needed;
    that blocking work runs in a worker thread so the event loop stays
    responsive. The model is then loaded through ``router.load_model`` and a
    smoke test decides ``ready_to_chat``.

    Args:
        model_id: Requested model id or alias.
        request: Incoming request, used to resolve the current user.
        engine: Optional engine hint used during normalisation.
        user_email: Overrides the user resolved from ``request``.
        adapter_path: Forwarded to ``router.load_model``.
        draft_model_id: Forwarded to ``router.load_model``.

    Returns:
        A dict describing the load: message, model ids, engine, install and
        download results, the resolution, and smoke-test outcome fields.

    Raises:
        HTTPException: 400 for an empty model id, plus whatever the engine
            preparation steps raise.
    """
    model_id = normalize_local_model_request(model_id, engine)
    if not model_id:
        raise HTTPException(status_code=400, detail="모델 식별자가 비어 있습니다.")

    requester = user_email or get_current_user(request)

    # Feedback #1: every stage shares the same ModelResolution.
    resolution = _ModelResolution.from_request(
        model_id,
        engine=engine,
        user_email=requester,
        engine_aliases=MODEL_ENGINE_ALIASES,
    )

    parsed_provider, parsed_model = parse_model_ref(model_id)
    if parsed_provider == "mlx":
        parsed_provider = "local_mlx"

    local_engines = {"local_mlx", "ollama", "vllm", "lmstudio", "llamacpp"}
    install_result: Dict[str, object] = {}
    download_result: Optional[Dict[str, object]] = None

    if parsed_provider in local_engines:
        # Installs, pulls and server start-up block for minutes; keep them off
        # the event loop.
        model_id, install_result, download_result = await asyncio.to_thread(
            _prepare_local_backend, parsed_provider, parsed_model, model_id
        )

    effective_email = (requester or "").strip()
    user_api_key = get_user_api_key(effective_email, parsed_provider) if parsed_provider != "local_mlx" else None
    msg = await router.load_model(
        model_id,
        adapter_path,
        draft_model_id=draft_model_id,
        api_key_override=user_api_key,
        owner=effective_email or None,
    )
    # Feedback #1/#2: sync the resolution with the actually loaded model right
    # after loading, then run the smoke test.
    resolution.update_after_load(actual_current=router.current_model_id)
    smoke_result: Dict[str, object] = {}
    ready_to_chat = True
    compat_status = "ok"
    try:
        smoke_result = await _smoke_test_loaded_model(resolution, api_key_override=user_api_key)
        ready_to_chat = bool(smoke_result.get("ok"))
        # item 3-3: expose the smoke test's ok/degraded/failed status as-is.
        compat_status = str(smoke_result.get("status") or ("ok" if ready_to_chat else "degraded"))
    except Exception as exc:  # never break load on smoke test failures
        logging.warning("smoke test failed for %s: %s", resolution.load_id, exc)
        compat_status = "unknown"
    return {
        "status": "ok",
        "message": msg,
        "model": model_id,
        "current": router.current_model_id,
        "engine": parsed_provider,
        "installed_now": bool(install_result.get("installed_now")),
        "download": download_result,
        "resolution": resolution.to_dict(),
        "downloaded": True,
        "loaded": True,
        "ready_to_chat": ready_to_chat,
        "compatibility_status": compat_status,
        "smoke_test": smoke_result,
    }
|
|
1411
|
+
|
|
1412
|
+
|
|
1413
|
+
def sse_event(event: str, data: Dict[str, object]) -> str:
    """Format one Server-Sent Events frame.

    The frame has an ``event:`` line, a ``data:`` line holding ``data`` as
    JSON (non-ASCII characters kept as-is), and a terminating blank line.
    """
    payload = json.dumps(data, ensure_ascii=False)
    return "event: {}\ndata: {}\n\n".format(event, payload)
|
|
1415
|
+
|
|
1416
|
+
|
|
1417
|
+
async def prepare_and_load_model_stream(
    model_id: str,
    request: Request,
    engine: Optional[str] = None,
    user_email: Optional[str] = None,
) -> AsyncIterator[str]:
    """Prepare and load a model, streaming progress as Server-Sent Events.

    The blocking preparation steps (engine check, model download, local server
    start-up) run in a daemon worker thread that reports through a queue. This
    generator forwards each report as an SSE ``progress`` frame, then loads the
    model through ``router.load_model``, runs a smoke test, and finishes with a
    ``progress`` frame for stage ``"done"`` followed by a ``done`` frame
    carrying the result dict.

    Args:
        model_id: Requested model reference; normalized together with
            ``engine`` by ``normalize_local_model_request``.
        request: Incoming request, used to resolve the current user when
            ``user_email`` is not given.
        engine: Optional engine hint passed to the normalizer.
        user_email: Optional explicit user; takes precedence over the user
            resolved from ``request``.

    Yields:
        SSE frames produced by ``sse_event``.

    Raises:
        HTTPException: 400 when the normalized model id is empty, or the
            status/detail reported by the worker when preparation fails.
            NOTE(review): the latter is raised after frames may already have
            been yielded; confirm the caller turns it into an SSE error.
    """
    model_id = normalize_local_model_request(model_id, engine)
    if not model_id:
        raise HTTPException(status_code=400, detail="모델 식별자가 비어 있습니다.")

    parsed_provider, parsed_model = parse_model_ref(model_id)
    # "mlx" is treated as an alias of the "local_mlx" provider.
    if parsed_provider == "mlx":
        parsed_provider = "local_mlx"

    # Channel from the worker thread to this generator. Items are dicts with a
    # "kind" of "progress", "done" or "error".
    work_queue: "queue.Queue[Dict[str, object]]" = queue.Queue()
    # Filled in by the worker just before it posts "done".
    work_result: Dict[str, object] = {}

    def emit_progress(payload: Dict[str, object]) -> None:
        """Queue a progress payload for the generator to forward."""
        work_queue.put({"kind": "progress", "data": payload})

    def blocking_prepare() -> None:
        """Run the blocking engine/download/server steps in the worker thread.

        Always ends by posting exactly one "done" or "error" item.
        """
        try:
            local_engines = {"local_mlx", "ollama", "vllm", "lmstudio", "llamacpp"}
            install_result: Dict[str, object] = {}
            download_result: Optional[Dict[str, object]] = None
            # May be replaced below (LM Studio resolves its own identifier).
            prepared_model_id = model_id
            prepared_model_name = parsed_model

            # Step 1: for local engines, make sure the engine itself is ready.
            if parsed_provider in local_engines:
                emit_progress(model_download_progress_payload(
                    "engine",
                    "실행 엔진을 확인하는 중입니다.",
                    percent=2,
                    indeterminate=True,
                ))
                install_result = ensure_engine_ready(parsed_provider)
                emit_progress(model_download_progress_payload(
                    "engine",
                    "실행 엔진 준비가 완료되었습니다.",
                    percent=10,
                    indeterminate=False,
                ))

            # Step 2: provider-specific download / server preparation.
            if parsed_provider == "local_mlx":
                # The model name may be a filesystem path; an existing path is
                # used directly without any download.
                explicit_path = Path(parsed_model).expanduser()
                if explicit_path.exists():
                    download_result = {"model": parsed_model, "path": str(explicit_path), "cached": True}
                    emit_progress(model_download_progress_payload(
                        "download",
                        "로컬 모델 경로를 확인했습니다.",
                        percent=100,
                        detail=str(explicit_path),
                        eta_seconds=0,
                    ))
                elif not hf_model_ready(parsed_model, "local_mlx"):
                    download_result = download_hf_model(parsed_model, "local_mlx", progress_emit=emit_progress)
                else:
                    download_result = {"model": parsed_model, "path": str(hf_model_dir(parsed_model)), "cached": True}
                    emit_progress(model_download_progress_payload(
                        "download",
                        "이미 다운로드된 모델을 확인했습니다.",
                        percent=100,
                        eta_seconds=0,
                    ))
            elif parsed_provider == "ollama":
                emit_progress(model_download_progress_payload(
                    "engine",
                    "Ollama 서버를 확인하는 중입니다.",
                    percent=12,
                    indeterminate=True,
                ))
                ensure_ollama_server()
                # Pull only when the model is not already in the pulled list.
                if parsed_model not in get_ollama_pulled_models():
                    download_result = pull_ollama_model_with_progress(parsed_model, progress_emit=emit_progress)
                else:
                    download_result = {"provider": "ollama", "model": parsed_model, "cached": True}
                    emit_progress(model_download_progress_payload(
                        "download",
                        "이미 다운로드된 Ollama 모델을 확인했습니다.",
                        percent=100,
                        detail=parsed_model,
                        eta_seconds=0,
                    ))
            elif parsed_provider == "vllm":
                if not hf_model_ready(parsed_model, "vllm"):
                    download_result = download_hf_model(parsed_model, "vllm", progress_emit=emit_progress)
                else:
                    download_result = {"provider": "vllm", "model": parsed_model, "cached": True}
                    emit_progress(model_download_progress_payload(
                        "download",
                        "이미 다운로드된 모델을 확인했습니다.",
                        percent=100,
                        detail=parsed_model,
                        eta_seconds=0,
                    ))
                emit_progress(model_download_progress_payload(
                    "server",
                    "vLLM 서버를 시작하는 중입니다.",
                    percent=92,
                    indeterminate=True,
                ))
                ensure_vllm_server(parsed_model)
                # Merge over whatever the download step returned (possibly None).
                download_result = {**(download_result or {}), "provider": "vllm", "model": parsed_model, "server_ready": True}
            elif parsed_provider == "llamacpp":
                if not hf_model_ready(parsed_model, "llamacpp"):
                    download_result = download_hf_model(parsed_model, "llamacpp", progress_emit=emit_progress)
                else:
                    download_result = {"provider": "llamacpp", "model": parsed_model, "cached": True}
                    emit_progress(model_download_progress_payload(
                        "download",
                        "이미 다운로드된 GGUF 모델을 확인했습니다.",
                        percent=100,
                        detail=parsed_model,
                        eta_seconds=0,
                    ))
                emit_progress(model_download_progress_payload(
                    "server",
                    "llama.cpp 서버를 시작하는 중입니다.",
                    percent=92,
                    indeterminate=True,
                ))
                ensure_llamacpp_server(parsed_model)
                # Merge over whatever the download step returned (possibly None).
                download_result = {**(download_result or {}), "provider": "llamacpp", "model": parsed_model, "server_ready": True}
            elif parsed_provider == "lmstudio":
                emit_progress(model_download_progress_payload(
                    "download",
                    "LM Studio 모델을 확인하는 중입니다.",
                    percent=35,
                    indeterminate=True,
                ))
                ensured = ensure_lmstudio_model(parsed_model)
                # Prefer the identifier reported back by LM Studio: instance id
                # first, then the resolved model name, else the requested name.
                resolved_model = str(
                    ensured.get("instance_id")
                    or ensured.get("resolved_model")
                    or parsed_model
                ).strip()
                prepared_model_name = resolved_model
                prepared_model_id = f"lmstudio:{resolved_model}"
                download_result = ensured
            else:
                # Any provider outside the local engines: nothing to install or
                # download here, only a progress notice.
                emit_progress(model_download_progress_payload(
                    "engine",
                    "모델 연결을 준비하는 중입니다.",
                    percent=30,
                    indeterminate=True,
                ))

            # Publish the outcome before signalling completion so the generator
            # sees a fully populated work_result once it receives "done".
            work_result.update({
                "model_id": prepared_model_id,
                "parsed_provider": parsed_provider,
                "parsed_model": prepared_model_name,
                "install_result": install_result,
                "download_result": download_result,
            })
            work_queue.put({"kind": "done"})
        except HTTPException as exc:
            # Preserve the status/detail chosen by the helper that raised.
            work_queue.put({"kind": "error", "status_code": exc.status_code, "detail": exc.detail})
        except Exception as exc:
            logging.exception("model prepare stream worker failed")
            # Report as a 500, keeping only the last 2000 characters of the message.
            work_queue.put({"kind": "error", "status_code": 500, "detail": str(exc)[-2000:]})

    worker = threading.Thread(target=blocking_prepare, daemon=True)
    worker.start()

    # Drain the queue until the worker reports "done" or "error". The blocking
    # get() runs via asyncio.to_thread so the event loop is not blocked.
    while True:
        item = await asyncio.to_thread(work_queue.get)
        kind = item.get("kind")
        if kind == "progress":
            yield sse_event("progress", item["data"])
        elif kind == "error":
            raise HTTPException(
                status_code=int(item.get("status_code") or 500),
                detail=item.get("detail") or "모델 준비에 실패했습니다.",
            )
        elif kind == "done":
            break

    # Fall back to the originally parsed values if the worker left a key unset.
    prepared_model_id = str(work_result.get("model_id") or model_id)
    prepared_provider = str(work_result.get("parsed_provider") or parsed_provider)
    install_result = work_result.get("install_result") or {}
    download_result = work_result.get("download_result")

    yield sse_event("progress", model_download_progress_payload(
        "load",
        "모델을 메모리에 로드하는 중입니다.",
        percent=96,
        indeterminate=True,
    ))

    # An explicit user_email wins over the user resolved from the request.
    effective_email = (user_email or get_current_user(request) or "").strip()
    # A per-user API key is looked up for every provider except local_mlx.
    user_api_key = get_user_api_key(effective_email, prepared_provider) if prepared_provider != "local_mlx" else None
    msg = await router.load_model(
        prepared_model_id,
        None,
        draft_model_id=None,
        api_key_override=user_api_key,
        owner=effective_email or None,
    )
    # Feedback #1/#2: also deliver the ModelResolution and smoke test result over SSE.
    resolution_stream = _ModelResolution.from_request(
        prepared_model_id,
        engine=prepared_provider,
        user_email=effective_email or None,
        engine_aliases=MODEL_ENGINE_ALIASES,
    )
    resolution_stream.update_after_load(actual_current=router.current_model_id)
    yield sse_event("progress", model_download_progress_payload(
        "smoke_test",
        "채팅 호환성 테스트 중입니다.",
        percent=98,
        indeterminate=True,
    ))
    # Defaults used when the smoke test cannot be run at all.
    smoke_result: Dict[str, object] = {}
    ready_to_chat = True
    compat_status = "ok"
    try:
        smoke_result = await _smoke_test_loaded_model(resolution_stream, api_key_override=user_api_key)
        ready_to_chat = bool(smoke_result.get("ok"))
        # item 3-3: expose the smoke result's three-way classification (ok/degraded/failed) as-is.
        compat_status = str(smoke_result.get("status") or ("ok" if ready_to_chat else "degraded"))
    except Exception as exc:
        # A failing smoke test must not fail the load; only the status is downgraded.
        # NOTE(review): if the smoke test call itself raises, ready_to_chat keeps
        # its default of True while the status becomes "unknown" - confirm intended.
        logging.warning("smoke test (stream) failed for %s: %s", resolution_stream.load_id, exc)
        compat_status = "unknown"
    result = {
        "status": "ok",
        "message": msg,
        "model": prepared_model_id,
        "current": router.current_model_id,
        "engine": prepared_provider,
        "installed_now": bool(isinstance(install_result, dict) and install_result.get("installed_now")),
        "download": download_result,
        "resolution": resolution_stream.to_dict(),
        "downloaded": True,
        "loaded": True,
        "ready_to_chat": ready_to_chat,
        "compatibility_status": compat_status,
        "smoke_test": smoke_result,
    }
    yield sse_event("progress", model_download_progress_payload(
        "done",
        "모델 준비가 완료되었습니다.",
        percent=100,
        eta_seconds=0,
    ))
    # Final frame: a dedicated "done" event carrying the full result.
    yield sse_event("done", result)
|
|
1662
|
+
|
|
1663
|
+
|
|
1664
|
+
# Verification records ({"ok", "reason", "ts"}) keyed by cloud model reference.
CLOUD_VERIFY_CACHE: Dict[str, Dict] = {}
# Maximum age, in seconds, of a cached record before it is verified again (10 minutes).
CLOUD_VERIFY_TTL_SECONDS = 10 * 60
|
|
1666
|
+
|
|
1667
|
+
async def _probe_cloud_model(model_ref: str) -> Dict[str, object]:
    """Check whether a cloud model answers a minimal chat completion.

    The provider configuration is looked up in ``OPENAI_COMPATIBLE_PROVIDERS``;
    the API key comes from the configured environment variable, falling back
    to the configuration's ``api_key_fallback``. A one-token "ping" completion
    is then attempted with a 15 second timeout.

    Args:
        model_ref: Model reference understood by ``parse_model_ref``.

    Returns:
        ``{"ok": True, "reason": "ok"}`` on success, otherwise
        ``{"ok": False, "reason": <text>}``. The function never raises for
        probe failures; the failure text is truncated to 220 characters.
    """
    provider, model_name = parse_model_ref(model_ref)
    config = OPENAI_COMPATIBLE_PROVIDERS.get(provider)
    if not config:
        return {"ok": False, "reason": f"Unsupported provider: {provider}"}

    api_key = os.getenv(config["env_key"]) or config.get("api_key_fallback")
    if not api_key:
        return {"ok": False, "reason": f"Missing API key: {config['env_key']}"}

    # An environment override for the base URL wins over the configured default.
    base_url_env = config.get("base_url_env")
    base_url = (os.getenv(base_url_env) if base_url_env else None) or config.get("base_url")
    client_kwargs = {"api_key": api_key}
    if base_url:
        client_kwargs["base_url"] = base_url

    try:
        # A fresh client is created per probe, so close it deterministically
        # instead of leaving its HTTP connections to garbage collection.
        async with AsyncOpenAI(**client_kwargs) as client:
            await asyncio.wait_for(
                client.chat.completions.create(
                    model=model_name,
                    messages=[{"role": "user", "content": "ping"}],
                    max_tokens=1,
                    temperature=0,
                ),
                timeout=15,
            )
        return {"ok": True, "reason": "ok"}
    except Exception as e:  # best-effort probe: report the failure, never raise
        # Timeouts (and some other exceptions) have an empty message; fall back
        # to the exception class name so the reason is never blank.
        reason = str(e) or type(e).__name__
        return {"ok": False, "reason": reason[:220]}
|
|
1697
|
+
|
|
1698
|
+
|
|
1699
|
+
async def verify_cloud_models(force: bool = False, provider_filter: Optional[str] = None) -> Dict[str, Dict]:
    """Verify detected cloud models, reusing cached results that are still fresh.

    Args:
        force: When true, ignore cached records and verify every model again.
        provider_filter: When set, only models whose ``provider`` equals this
            value are considered.

    Returns:
        Mapping of model id to its record ``{"ok", "reason", "ts"}``. Every
        newly produced record is also stored in ``CLOUD_VERIFY_CACHE``.
    """
    started_at = time.time()
    candidates = [
        entry
        for entry in router.detected_cloud_models()
        if entry.get("tag") == "cloud"
        and (not provider_filter or entry.get("provider") == provider_filter)
    ]

    verified: Dict[str, Dict] = {}
    for entry in candidates:
        ref = entry["id"]

        # Serve from cache unless a refresh is forced or the record has expired.
        if not force:
            hit = CLOUD_VERIFY_CACHE.get(ref)
            if hit and started_at - hit.get("ts", 0) <= CLOUD_VERIFY_TTL_SECONDS:
                verified[ref] = hit
                continue

        if entry.get("available") is False:
            # Explicitly unavailable models are recorded as failed without probing.
            fresh = {
                "ok": False,
                "reason": entry.get("requires") or "API key missing",
                "ts": started_at,
            }
        else:
            outcome = await _probe_cloud_model(ref)
            fresh = {
                "ok": bool(outcome.get("ok")),
                "reason": outcome.get("reason", ""),
                "ts": started_at,
            }

        CLOUD_VERIFY_CACHE[ref] = fresh
        verified[ref] = fresh

    return verified
|