lemonade-sdk 8.1.4__py3-none-any.whl → 8.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of lemonade-sdk might be problematic.
- lemonade/cache.py +6 -1
- lemonade/cli.py +47 -5
- lemonade/common/inference_engines.py +13 -4
- lemonade/common/status.py +4 -4
- lemonade/common/system_info.py +544 -1
- lemonade/profilers/agt_power.py +437 -0
- lemonade/profilers/hwinfo_power.py +429 -0
- lemonade/tools/accuracy.py +143 -48
- lemonade/tools/adapter.py +6 -1
- lemonade/tools/bench.py +26 -8
- lemonade/tools/flm/__init__.py +1 -0
- lemonade/tools/flm/utils.py +303 -0
- lemonade/tools/huggingface/bench.py +6 -1
- lemonade/tools/llamacpp/bench.py +146 -27
- lemonade/tools/llamacpp/load.py +30 -2
- lemonade/tools/llamacpp/utils.py +393 -33
- lemonade/tools/oga/bench.py +5 -26
- lemonade/tools/oga/load.py +60 -121
- lemonade/tools/oga/migration.py +403 -0
- lemonade/tools/report/table.py +76 -8
- lemonade/tools/server/flm.py +133 -0
- lemonade/tools/server/llamacpp.py +220 -553
- lemonade/tools/server/serve.py +684 -168
- lemonade/tools/server/static/js/chat.js +666 -342
- lemonade/tools/server/static/js/model-settings.js +24 -3
- lemonade/tools/server/static/js/models.js +597 -73
- lemonade/tools/server/static/js/shared.js +79 -14
- lemonade/tools/server/static/logs.html +191 -0
- lemonade/tools/server/static/styles.css +491 -66
- lemonade/tools/server/static/webapp.html +83 -31
- lemonade/tools/server/tray.py +158 -38
- lemonade/tools/server/utils/macos_tray.py +226 -0
- lemonade/tools/server/utils/{system_tray.py → windows_tray.py} +13 -0
- lemonade/tools/server/webapp.py +4 -1
- lemonade/tools/server/wrapped_server.py +559 -0
- lemonade/version.py +1 -1
- lemonade_install/install.py +54 -611
- {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/METADATA +29 -72
- lemonade_sdk-8.2.2.dist-info/RECORD +83 -0
- lemonade_server/cli.py +145 -37
- lemonade_server/model_manager.py +521 -37
- lemonade_server/pydantic_models.py +28 -1
- lemonade_server/server_models.json +246 -92
- lemonade_server/settings.py +39 -39
- lemonade/tools/quark/__init__.py +0 -0
- lemonade/tools/quark/quark_load.py +0 -173
- lemonade/tools/quark/quark_quantize.py +0 -439
- lemonade_sdk-8.1.4.dist-info/RECORD +0 -77
- {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/WHEEL +0 -0
- {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/entry_points.txt +0 -0
- {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/licenses/LICENSE +0 -0
- {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/licenses/NOTICE.md +0 -0
- {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/top_level.txt +0 -0
lemonade/tools/server/serve.py
CHANGED
@@ -7,15 +7,16 @@ import logging
 import platform
 import tempfile
 import traceback
-from typing import Optional, Union
+from typing import Optional, Union, List
 import json
-import subprocess
 from pathlib import Path
-
-
+import os
+import shutil
+from fastapi import FastAPI, HTTPException, status, Request, WebSocket, Form, UploadFile
 from fastapi.responses import StreamingResponse
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.staticfiles import StaticFiles
+from starlette.websockets import WebSocketDisconnect, WebSocketState
 import uvicorn
 from uvicorn.config import Config
 from uvicorn.server import Server as UvicornServer
@@ -47,7 +48,9 @@ from openai.types.responses (
 )

 import lemonade.api as lemonade_api
-
+from lemonade.tools.server.wrapped_server import WrappedServer
+from lemonade.tools.server.llamacpp import LlamaServer
+from lemonade.tools.server.flm import FlmServer
 from lemonade.tools.server.tool_calls import extract_tool_calls, get_tool_call_pattern
 from lemonade.tools.server.webapp import get_webapp_html
 from lemonade.tools.server.utils.port import lifespan
@@ -75,12 +78,83 @@ from lemonade_server.settings import save_setting
 # Tests should use the max_new_tokens argument to set a lower value
 DEFAULT_MAX_NEW_TOKENS = 1500

-
-if platform.system() == "Windows":
+if platform.system() in ["Windows", "Darwin"]:
     # pylint: disable=ungrouped-imports
     from lemonade.tools.server.tray import LemonadeTray, OutputDuplicator


+class ServerLogFilter(logging.Filter):
+    def __init__(self, server):
+        super().__init__()
+        self.server = server
+        self.noisy_paths = {
+            "/api/v1/health",
+            "/api/v0/health",
+            "/api/v1/models",
+            "/api/v0/models",
+        }
+
+    def filter(self, record: logging.LogRecord) -> bool:
+        msg = record.getMessage()
+
+        # Filter out websocket logs
+        if "> TEXT" in msg:
+            return False
+
+        # Filter out noisy HTTP routes if debug logs are OFF
+        if not self.server.debug_logging_enabled:
+            if any(path in msg for path in self.noisy_paths):
+                return False
+
+        # Otherwise, allow the log
+        return True
+
+
+async def log_streamer(websocket: WebSocket, path: str, interval: float = 1.0):
+    logger = logging.getLogger()
+    await websocket.accept()
+    try:
+        with open(path, "r", encoding="utf-8") as f:
+            f.seek(0)  # start at the beginning of the file
+            while True:
+                # Try reading a line
+                line = f.readline()
+                if not line:
+                    await asyncio.sleep(interval)
+                    continue
+
+                # Send defensively: if disconnected, bail out
+                if websocket.application_state != WebSocketState.CONNECTED:
+                    # Server-side state says we're not connected anymore
+                    break
+
+                try:
+                    await websocket.send_text(line)
+                except WebSocketDisconnect:
+                    # Client closed — normal path out
+                    break
+                except RuntimeError as re:
+                    # Starlette will raise this if a close has already been sent
+                    logger.debug("RuntimeError during send: %s", re)
+                    break
+
+    except WebSocketDisconnect:
+        # Client closed the socket; do not try to send or close again
+        pass
+    except Exception as e:  # pylint: disable=broad-except
+        # Log server-side; do not attempt to send error over a possibly closed socket
+        logger.exception("Error in log_streamer: %s", e)
+    finally:
+        # Only close if Starlette still thinks we're connected.
+        # This prevents "Cannot call send once a close message has been sent."
+        try:
+            if websocket.application_state == WebSocketState.CONNECTED:
+                await websocket.close()
+        except Exception:  # pylint: disable=broad-except
+            # If close itself races, swallow — we're shutting down anyway.
+            pass
+
+
 class ServerModel(Model):
     """
     An extension of OpenAI's Model class that adds
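The log_streamer coroutine above backs a new logs websocket (registered later in this diff as {prefix}/logs/ws) that tails the server log file and pushes one line per message. A minimal client sketch follows; the host, port, and the use of the third-party websockets package are assumptions, not part of this diff.

# Hypothetical client for the new logs websocket (host/port and the
# third-party "websockets" package are assumptions).
import asyncio

import websockets


async def tail_logs(url: str = "ws://localhost:8000/api/v1/logs/ws") -> None:
    # The server sends one log line per text message.
    async with websockets.connect(url) as ws:
        while True:
            print(await ws.recv(), end="")


if __name__ == "__main__":
    asyncio.run(tail_logs())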
@@ -133,6 +207,21 @@ class StopOnEvent:
         return self.stop_event.is_set()


+class NoCacheStaticFiles(StaticFiles):
+    """Custom StaticFiles class with no-cache headers"""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def file_response(self, *args, **kwargs) -> Response:
+        response = super().file_response(*args, **kwargs)
+        # Add no-cache headers for all static files
+        response.headers["Cache-Control"] = "no-cache, no-store, must-revalidate"
+        response.headers["Pragma"] = "no-cache"
+        response.headers["Expires"] = "0"
+        return response
+
+
 class Server:
     """
     Open a web server that apps can use to communicate with the LLM.
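NoCacheStaticFiles adds Cache-Control, Pragma, and Expires headers to everything served from the /static mount introduced later in this diff. A quick header check, assuming a server on localhost:8000 and the requests package, could look like this:

# Hypothetical header check against a running server (URL is an assumption).
import requests

resp = requests.get("http://localhost:8000/static/styles.css", timeout=30)
# Expected: "no-cache, no-store, must-revalidate"
print(resp.headers.get("Cache-Control"))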
@@ -149,6 +238,7 @@ class Server:
     - /api/v1/chat/completions: chat completion responses using HTTP chunked transfer encoding.
     - /api/v1/responses: responses API using HTTP chunked transfer encoding.
     - /api/v1/models: list all available models.
+    - /api/v1/models/{model_id}: retrieve a specific model by ID.
     """

     def __init__(
@@ -188,6 +278,12 @@ class Server:
             allow_headers=["*"],  # Allows all headers
         )

+        # Set up debug middleware if debug logging is enabled
+        # This must be done during app initialization, not at runtime
+        self.debug_logging_enabled = log_level == "debug"
+        if self.debug_logging_enabled:
+            self.setup_middleware_timer()
+
         # Set up custom routes
         self.setup_routes(["/api/v0", "/api/v1"])

@@ -198,7 +294,7 @@ class Server:
         # as the Web App
         static_dir = Path(__file__).parent / "static"
         self.app.mount(
-            "/static",
+            "/static", NoCacheStaticFiles(directory=static_dir), name="static_assets"
         )

         # Performance stats that are set during /ws and can be
@@ -232,11 +328,8 @@ class Server:
         # Add lock for load/unload operations
         self._load_lock = asyncio.Lock()

-        # Subprocess handle for llama_server.exe
-        self.
-
-        # Telemetry instance for llama server
-        self.llama_telemetry = llamacpp.LlamaTelemetry()
+        # Subprocess handle for wrapped instance of llama_server.exe, etc.
+        self.wrapped_server: WrappedServer = None

     def setup_routes(self, api_prefixes: list[str]):
         for prefix in api_prefixes:
@@ -252,16 +345,199 @@ class Server:
             self.app.post(f"{prefix}/completions")(self.completions)
             self.app.post(f"{prefix}/responses")(self.responses)
             self.app.post(f"{prefix}/log-level")(self.set_log_level)
+            self.app.websocket(f"{prefix}/logs/ws")(self.logs_ws)
+            self.app.post(f"{prefix}/add-local-model")(self.add_local_model)

             # OpenAI-compatible routes
             self.app.post(f"{prefix}/chat/completions")(self.chat_completions)
             self.app.post(f"{prefix}/embeddings")(self.embeddings)
             self.app.get(f"{prefix}/models")(self.models)
+            self.app.get(f"{prefix}/models/{{model_id}}")(self.retrieve_model)

             # JinaAI routes (jina.ai/reranker/)
             self.app.post(f"{prefix}/reranking")(self.reranking)
             self.app.post(f"{prefix}/rerank")(self.reranking)

+            # Migration routes
+            self.app.get(f"{prefix}/migration/incompatible-models")(
+                self.get_incompatible_models
+            )
+            self.app.post(f"{prefix}/migration/cleanup")(
+                self.cleanup_incompatible_models
+            )
+
+    async def add_local_model(
+        self,
+        model_name: str = Form(...),
+        checkpoint: str = Form(""),
+        recipe: str = Form(...),
+        reasoning: bool = Form(False),
+        vision: bool = Form(False),
+        mmproj: str = Form(None),
+        model_files: List[UploadFile] = None,
+    ):
+        from huggingface_hub.constants import HF_HUB_CACHE
+        from lemonade.tools.llamacpp.utils import parse_checkpoint
+
+        # Upload and register a local model from files.
+        try:
+            if not model_files:
+                raise HTTPException(
+                    status_code=status.HTTP_400_BAD_REQUEST,
+                    detail="No model files provided for upload",
+                )
+
+            if not model_name.startswith("user."):
+                raise HTTPException(
+                    status_code=status.HTTP_400_BAD_REQUEST,
+                    detail="Model name must start with 'user.'",
+                )
+
+            valid_recipes = ["llamacpp", "oga-npu", "oga-hybrid", "oga-cpu"]
+            if recipe not in valid_recipes:
+                raise HTTPException(
+                    status_code=status.HTTP_400_BAD_REQUEST,
+                    detail=f"Invalid recipe. Must be one of: {', '.join(valid_recipes)}",
+                )
+
+            if recipe == "llamacpp" and not any(
+                f.filename.lower().endswith(".gguf") for f in model_files
+            ):
+                raise HTTPException(
+                    status_code=status.HTTP_400_BAD_REQUEST,
+                    detail="At least one .gguf file is required for llamacpp",
+                )
+
+            # Check if model name already exists
+            if model_name in ModelManager().supported_models:
+                raise HTTPException(
+                    status_code=status.HTTP_409_CONFLICT,
+                    detail=(
+                        f"Model name '{model_name}' already exists. "
+                        "Please use a different name."
+                    ),
+                )
+
+            model_name_clean = model_name.replace("user.", "")
+
+            # Files are saved to models--{model_name_clean}
+            # Note: This is based on the user's custom model name, NOT the checkpoint field
+            repo_cache_name = model_name_clean.replace("/", "--")
+            snapshot_path = os.path.join(HF_HUB_CACHE, f"models--{repo_cache_name}")
+            os.makedirs(snapshot_path, exist_ok=True)
+
+            # Extract variant from checkpoint field if provided
+            # checkpoint field format: "folder:variant" or just "folder"
+            variant = None
+            if checkpoint and ":" in checkpoint:
+                _, variant = parse_checkpoint(checkpoint)
+                # variant now contains just the variant[can be with or without the
+                # .gguf extension] filename (e.g., "LFM2-VL-1.6B-F16 or LFM2-VL-1.6B-F16.gguf")
+
+            # Save uploaded files, preserving folder structure
+            for file in model_files:
+                relative_path = file.filename
+                path_parts = relative_path.split("/")
+
+                if len(path_parts) > 1:
+                    internal_path = "/".join(path_parts[1:])
+                    file_path = os.path.join(snapshot_path, internal_path)
+                else:
+                    file_path = os.path.join(snapshot_path, path_parts[0])
+
+                os.makedirs(os.path.dirname(file_path), exist_ok=True)
+                with open(file_path, "wb") as f:
+                    content = await file.read()
+                    f.write(content)
+
+            # Resolve actual file paths after upload (for faster loading later)
+            resolved_checkpoint = None
+            resolved_mmproj = None
+
+            # For OGA models, find genai_config.json
+            if recipe.startswith("oga-"):
+                for root, _, files in os.walk(snapshot_path):
+                    if "genai_config.json" in files:
+                        resolved_checkpoint = root
+                        break
+                if not resolved_checkpoint:
+                    resolved_checkpoint = snapshot_path
+
+            # For llamacpp models, find the GGUF file
+            elif recipe == "llamacpp":
+                gguf_file_found = None
+
+                # If variant is specified, look for that specific file
+                if variant:
+                    search_term = (
+                        variant if variant.endswith(".gguf") else f"{variant}.gguf"
+                    )
+                    for root, _, files in os.walk(snapshot_path):
+                        if search_term in files:
+                            gguf_file_found = os.path.join(root, search_term)
+                            break
+
+                # If no variant or variant not found, search for any .gguf file (excluding mmproj)
+                if not gguf_file_found:
+                    for root, _, files in os.walk(snapshot_path):
+                        gguf_files = [
+                            f
+                            for f in files
+                            if f.endswith(".gguf") and "mmproj" not in f.lower()
+                        ]
+                        if gguf_files:
+                            gguf_file_found = os.path.join(root, gguf_files[0])
+                            break
+
+                resolved_checkpoint = (
+                    gguf_file_found if gguf_file_found else snapshot_path
+                )
+
+            # Search for mmproj file if provided
+            if mmproj:
+                for root, _, files in os.walk(snapshot_path):
+                    if mmproj in files:
+                        resolved_mmproj = os.path.join(root, mmproj)
+                        break
+
+            # Build checkpoint for registration
+            # For llamacpp with resolved path, store the full path relative to HF_HUB_CACHE
+            if resolved_checkpoint:
+                # Store as relative path from HF_HUB_CACHE for portability
+                checkpoint_to_register = os.path.relpath(
+                    resolved_checkpoint, HF_HUB_CACHE
+                )
+            elif variant:
+                checkpoint_to_register = f"models--{repo_cache_name}:{variant}"
+            else:
+                checkpoint_to_register = f"models--{repo_cache_name}"
+
+            # Register the model
+            ModelManager().register_local_model(
+                model_name=model_name,
+                checkpoint=checkpoint_to_register,
+                recipe=recipe,
+                reasoning=reasoning,
+                vision=vision,
+                mmproj=resolved_mmproj if resolved_mmproj else mmproj,
+                snapshot_path=snapshot_path,
+            )
+
+            # Refresh local models
+            self.local_models = ModelManager().downloaded_models_enabled
+
+            return {
+                "status": "success",
+                "message": f"Model {model_name} uploaded and registered successfully",
+            }
+        except Exception as e:
+            if os.path.exists(checkpoint_to_register):
+                shutil.rmtree(checkpoint_to_register)
+            raise HTTPException(
+                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+                detail=f"Failed to upload model: {str(e)}",
+            )
+
     async def set_log_level(self, config: LogLevelConfig):
         """
         Set the logging level of the server.
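add_local_model accepts a multipart form: metadata fields plus one or more model_files uploads, with names required to start with user. and a recipe drawn from the list validated above. A rough client sketch with the requests package is below; the file name, model name, and port are placeholders rather than values taken from this diff.

# Hypothetical upload to the new add-local-model route (all names/paths are placeholders).
import requests

url = "http://localhost:8000/api/v1/add-local-model"
form = {
    "model_name": "user.my-model",  # must start with "user."
    "recipe": "llamacpp",           # llamacpp, oga-npu, oga-hybrid, or oga-cpu
    "reasoning": "false",
    "vision": "false",
}
with open("my-model.gguf", "rb") as gguf:
    files = [("model_files", ("my-model.gguf", gguf, "application/octet-stream"))]
    resp = requests.post(url, data=form, files=files, timeout=600)
print(resp.status_code, resp.json())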
@@ -380,11 +656,13 @@ class Server:
         )
         file_handler.setLevel(logging_level)
         file_handler.setFormatter(uvicorn_formatter)
+        file_handler.addFilter(ServerLogFilter(self))

         # Set up console handler
         console_handler = logging.StreamHandler()
         console_handler.setLevel(logging_level)
         console_handler.setFormatter(uvicorn_formatter)
+        console_handler.addFilter(ServerLogFilter(self))

         # Configure root logger with both handlers
         logging.basicConfig(
@@ -407,10 +685,6 @@ class Server:
             ).run()
             sys.exit(0)

-        if self.debug_logging_enabled:
-            # Print the elapsed time for each request
-            self.setup_middleware_timer()
-
         # Let the app know what port it's running on, so
         # that the lifespan can access it
         self.app.port = self.port
@@ -507,7 +781,9 @@ class Server:

         return lc

-    async def completions(
+    async def completions(
+        self, completion_request: CompletionRequest, request: Request
+    ):
         """
         Stream completion responses using HTTP chunked transfer encoding.
         """
@@ -520,8 +796,8 @@ class Server:
         # Load the model if it's different from the currently loaded one
         await self.load_llm(lc)

-        if self.llm_loaded.recipe == "llamacpp":
-            return
+        if self.llm_loaded.recipe == "llamacpp" or self.llm_loaded.recipe == "flm":
+            return self.wrapped_server.completion(completion_request)

         # Check if the model supports reasoning
         reasoning_first_token = self.llm_loaded.reasoning
@@ -559,29 +835,43 @@ class Server:
             # This is necessary because the variable is modified
             # in the inner function
             nonlocal reasoning_first_token
+            try:
+                async for token in self._generate_tokens(**generation_args):
+                    # Handle client disconnect: stop generation and exit
+                    if await request.is_disconnected():
+                        self.stop_event.set()
+                        break

-
-
-
-
-
-
-
+                    choice = CompletionChoice(
+                        text=(
+                            "<think>" + token if reasoning_first_token else token
+                        ),
+                        index=0,
+                        finish_reason="stop",
+                        logprobs=None,
+                    )

-
-
-
-
-
-
-
+                    completion = Completion(
+                        id="0",
+                        choices=[choice],
+                        model=self.llm_loaded.checkpoint,
+                        object="text_completion",
+                        created=int(time.time()),
+                    )

-
-
+                    # Format as SSE
+                    reasoning_first_token = False
+                    yield f"data: {completion.model_dump_json()}\n\n".encode(
+                        "utf-8"
+                    )

-
-
+                # Send the [DONE] marker only if still connected
+                if not await request.is_disconnected():
+                    yield b"data: [DONE]\n\n"
+            except asyncio.CancelledError:
+                # Propagate cancellation to the generator loop
+                self.stop_event.set()
+                return

         return StreamingResponse(
             generate(),
@@ -639,7 +929,9 @@ class Server:
             created=int(time.time()),
         )

-    async def chat_completions(
+    async def chat_completions(
+        self, chat_completion_request: ChatCompletionRequest, request: Request
+    ):
         """
         Stream chat completion responses using HTTP chunked transfer encoding.
         """
@@ -655,10 +947,25 @@ class Server:
         # Load the model if it's different from the currently loaded one
         await self.load_llm(lc)

-        if self.llm_loaded.recipe == "llamacpp":
-
-            chat_completion_request,
-
+        if self.llm_loaded.recipe == "llamacpp" or self.llm_loaded.recipe == "flm":
+            if (
+                hasattr(chat_completion_request, "enable_thinking")
+                and chat_completion_request.enable_thinking is False
+                and "qwen3" in self.llm_loaded.model_name.lower()
+            ):
+
+                # Modify the last user message to include /no_think
+                if chat_completion_request.messages:
+                    for i in range(len(chat_completion_request.messages) - 1, -1, -1):
+                        if chat_completion_request.messages[i].get("role") == "user":
+                            original_content = chat_completion_request.messages[i][
+                                "content"
+                            ]
+                            chat_completion_request.messages[i][
+                                "content"
+                            ] = f"/no_think\n{original_content}"
+                            break
+            return self.wrapped_server.chat_completion(chat_completion_request)

         # Convert chat messages to text using the model's chat template
         text = self.apply_chat_template(
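For llamacpp and FLM models, chat requests that set enable_thinking to false are rewritten so the last user message starts with /no_think whenever the loaded model name contains "qwen3". A request that would exercise this path might look like the sketch below; the URL and model name are placeholders, and enable_thinking is assumed to be accepted by the server's ChatCompletionRequest model (the hasattr check above suggests it is optional).

# Hypothetical request exercising the /no_think rewrite (model name and URL are placeholders).
import requests

payload = {
    "model": "Qwen3-4B-GGUF",  # any loaded model whose name contains "qwen3"
    "messages": [{"role": "user", "content": "Summarize this file in one line."}],
    "enable_thinking": False,  # assumed request field, checked via hasattr() above
}
resp = requests.post(
    "http://localhost:8000/api/v1/chat/completions", json=payload, timeout=600
)
print(resp.json())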
@@ -720,68 +1027,126 @@ class Server:
             # Keep track of the full response for tool call extraction
             full_response = ""

-
-
-
-            if chat_completion_request.tools:
+            # Track whether we're still in the thinking phase (before </think> tag)
+            in_thinking_phase = self.llm_loaded.reasoning
+            reasoning_buffer = ""  # Accumulate reasoning tokens to detect </think>

-
-
+            try:
+                async for token in self._generate_tokens(**generation_args):
+                    # Handle client disconnect: stop generation and exit
+                    if await request.is_disconnected():
+                        self.stop_event.set()
+                        break

-
-
-
-                )
+                    # Continuously look for tool calls embedded into the generated text
+                    openai_tool_calls = None
+                    if chat_completion_request.tools:

-
-
-
-
-
-
-                        ChoiceDeltaToolCall(
-                            index=0,
-                            id="-",
-                            function=ChoiceDeltaToolCallFunction(
-                                arguments=json.dumps(tool_call["arguments"]),
-                                name=tool_call["name"],
-                            ),
-                            type="function",
-                        )
+                        # Append the token to the full response
+                        full_response += token
+
+                        tool_calls, _ = extract_tool_calls(
+                            full_response,
+                            tool_call_pattern,
                         )

-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                        # If there are tool calls, reset the full response for the next call
+                        if tool_calls:
+                            openai_tool_calls = []
+                            full_response = ""
+                            for tool_call in tool_calls:
+                                openai_tool_calls.append(
+                                    ChoiceDeltaToolCall(
+                                        index=0,
+                                        id="-",
+                                        function=ChoiceDeltaToolCallFunction(
+                                            arguments=json.dumps(
+                                                tool_call["arguments"]
+                                            ),
+                                            name=tool_call["name"],
+                                        ),
+                                        type="function",
+                                    )
+                                )
+
+                    # Create a ChatCompletionChunk with reasoning_content support
+                    # If we're in reasoning mode and haven't seen </think> yet,
+                    # send tokens as reasoning_content instead of content
+                    delta_content = None
+                    delta_reasoning = None
+
+                    if reasoning_first_token:
+                        # First token - include opening tag in reasoning
+                        delta_reasoning = "<think>" + token
+                        reasoning_first_token = False
+                        reasoning_buffer = token
+                    elif in_thinking_phase:
+                        # Still in thinking phase - accumulate and check for </think>
+                        reasoning_buffer += token
+
+                        # Check if we've seen the closing tag
+                        if "</think>" in reasoning_buffer:
+                            # Split at the closing tag
+                            before_close, after_close = reasoning_buffer.split(
+                                "</think>", 1
+                            )
+
+                            # Send everything before + closing tag as reasoning
+                            if before_close or not reasoning_buffer.startswith(
+                                "</think>"
+                            ):
+                                delta_reasoning = before_close + "</think>"
+                            else:
+                                delta_reasoning = "</think>"
+
+                            # Everything after goes to content (will be sent in next iteration)
+                            # For now, mark that we've exited thinking phase
+                            in_thinking_phase = False
+
+                            # If there's content after </think>, we need to send it too
+                            # But we send it in the current chunk as regular content
+                            if after_close:
+                                # We have both reasoning and content in this token
+                                # Send reasoning first, content will accumulate
+                                delta_content = after_close
+                        else:
+                            # Still accumulating thinking, send as reasoning_content
+                            delta_reasoning = token
+                    else:
+                        # Normal content (after thinking phase ended)
+                        delta_content = token
+
+                    chunk = ChatCompletionChunk.model_construct(
+                        id="0",
+                        object="chat.completion.chunk",
+                        created=int(time.time()),
+                        model=self.llm_loaded.checkpoint,
+                        choices=[
+                            Choice.model_construct(
+                                index=0,
+                                delta=ChoiceDelta(
+                                    content=delta_content,
+                                    reasoning_content=delta_reasoning,
+                                    function_call=None,
+                                    role="assistant",
+                                    tool_calls=openai_tool_calls,
+                                    refusal=None,
                                 ),
-
-
-
-
-                            finish_reason=None,
-                            logprobs=None,
-                        )
-                    ],
-                )
+                                finish_reason=None,
+                                logprobs=None,
+                            )
+                        ],
+                    )

-
-
-            yield f"data: {chunk.model_dump_json()}\n\n".encode("utf-8")
+                    # Format as SSE
+                    yield f"data: {chunk.model_dump_json()}\n\n".encode("utf-8")

-
-
+                # Send the [DONE] marker only if still connected
+                if not await request.is_disconnected():
+                    yield b"data: [DONE]\n\n"
+            except asyncio.CancelledError:
+                self.stop_event.set()
+                return

         return StreamingResponse(
             generate(),
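With the change above, streamed chat chunks carry the model's think phase in a reasoning_content delta field and switch to the regular content field once the closing tag is seen. A hand-rolled SSE consumer sketch is below; the URL and model name are placeholders, and the stream is parsed directly rather than through an OpenAI client so the extra field is easy to read.

# Hypothetical consumer separating reasoning tokens from answer tokens (URL/model are placeholders).
import json

import requests

payload = {
    "model": "user.my-model",
    "messages": [{"role": "user", "content": "Why is the sky blue?"}],
    "stream": True,
}
with requests.post(
    "http://localhost:8000/api/v1/chat/completions", json=payload, stream=True, timeout=600
) as resp:
    for raw in resp.iter_lines():
        if not raw or not raw.startswith(b"data: "):
            continue
        data = raw[len(b"data: "):]
        if data == b"[DONE]":
            break
        delta = json.loads(data)["choices"][0]["delta"]
        if delta.get("reasoning_content"):
            print("[thinking]", delta["reasoning_content"])
        elif delta.get("content"):
            print(delta["content"], end="")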
@@ -861,7 +1226,7 @@ class Server:

         if self.llm_loaded.recipe == "llamacpp":
             try:
-                return
+                return self.wrapped_server.embeddings(embeddings_request)
             except Exception as e:  # pylint: disable=broad-exception-caught
                 # Check if model has embeddings label
                 model_info = ModelManager().supported_models.get(
@@ -884,7 +1249,7 @@ class Server:

     async def reranking(self, reranking_request: RerankingRequest):
         """
-        Rerank documents based on their relevance to a query
+        Rerank documents based on their relevance to a query.
         """
         # Initialize load config from reranking request
         lc = LoadConfig(model_name=reranking_request.model)
@@ -894,7 +1259,7 @@ class Server:

         if self.llm_loaded.recipe == "llamacpp":
             try:
-                return
+                return self.wrapped_server.reranking(reranking_request)
             except Exception as e:  # pylint: disable=broad-exception-caught
                 # Check if model has reranking label
                 model_info = ModelManager().supported_models.get(
@@ -940,7 +1305,7 @@ class Server:
             formatted_messages.append(f"{role_marker}\n{content} <|end|>")
         return "\n".join(formatted_messages) + "\n<|assistant|>"

-    async def responses(self, responses_request: ResponsesRequest):
+    async def responses(self, responses_request: ResponsesRequest, request: Request):
         """
         Stream responses using HTTP chunked transfer encoding.
         """
@@ -953,6 +1318,12 @@ class Server:
         # Load the model if it's different from the currently loaded one
         await self.load_llm(lc)

+        if self.llm_loaded.recipe == "llamacpp":
+            raise HTTPException(
+                status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
+                detail=f"Responses API not supported for recipe: {self.llm_loaded.recipe}",
+            )
+
         # Convert chat messages to text using the model's chat template
         if isinstance(responses_request.input, str):
             text = responses_request.input
@@ -1006,56 +1377,72 @@ class Server:

             full_response = "<think>" if reasoning_first_token else ""

-
+            try:
+                async for token in self._generate_tokens(**generation_args):
+                    # Handle client disconnect: stop generation and exit
+                    if await request.is_disconnected():
+                        self.stop_event.set()
+                        break

-
-
-
-
-
-
-
-
-
-
+                    # Create an event
+                    delta_event = ResponseTextDeltaEvent(
+                        content_index=0,
+                        delta=(
+                            "<think>" + token if reasoning_first_token else token
+                        ),
+                        item_id="0 ",
+                        logprobs=[],
+                        output_index=0,
+                        sequence_number=0,
+                        type="response.output_text.delta",
+                    )
+                    full_response += token

-
-
-
+                    # Format as SSE
+                    reasoning_first_token = False
+                    yield f"data: {delta_event.model_dump_json()}\n\n".encode(
+                        "utf-8"
+                    )

-
-
-
-
-
-
-
-
+                # Send the completed event (only if still connected)
+                if not await request.is_disconnected():
+                    response_output_message = ResponseOutputMessage(
+                        id="0",
+                        content=[
+                            ResponseOutputText(
+                                annotations=[],
+                                text=full_response,
+                                type="output_text",
+                            )
+                        ],
+                        role="assistant",
+                        status="completed",
+                        type="message",
+                    )
+                    response = Response(
+                        id="0",
+                        model=self.llm_loaded.checkpoint,
+                        created_at=int(time.time()),
+                        object="response",
+                        output=[response_output_message],
+                        parallel_tool_calls=True,
+                        tool_choice="auto",
+                        tools=[],
+                    )
+                    completed_event = ResponseCompletedEvent(
+                        response=response,
+                        type="response.completed",
+                        sequence_number=0,
+                    )
+                    yield f"data: {completed_event.model_dump_json()}\n\n".encode(
+                        "utf-8"
                     )
-                ],
-                role="assistant",
-                status="completed",
-                type="message",
-            )
-            response = Response(
-                id="0",
-                model=self.llm_loaded.checkpoint,
-                created_at=int(time.time()),
-                object="response",
-                output=[response_output_message],
-                parallel_tool_calls=True,
-                tool_choice="auto",
-                tools=[],
-            )
-            completed_event = ResponseCompletedEvent(
-                response=response,
-                type="response.completed",
-                sequence_number=0,
-            )
-            yield f"data: {completed_event.model_dump_json()}\n\n".encode("utf-8")

-
-
+                    # Send the [DONE] marker
+                    yield b"data: [DONE]\n\n"
+            except asyncio.CancelledError:
+                self.stop_event.set()
+                return

         return StreamingResponse(
             generate(),
@@ -1150,18 +1537,33 @@ class Server:
         )
         self.input_tokens = len(input_ids[0])

-
-
-
-
-
+        max_prompt_length = self.ctx_size  # Default fallback
+        # For OGA models, try to read the actual max prompt length from config
+        if "oga-" in self.llm_loaded.recipe:
+            try:
+                if model.config and model.config.get("max_prompt_length"):
+                    max_prompt_length = model.config["max_prompt_length"]
+                    logging.debug(
+                        f"Using OGA model max_prompt_length: {max_prompt_length}"
+                    )
+            # pylint: disable=broad-exception-caught
+            except Exception as e:
+                logging.debug(f"Could not read OGA model config, using ctx_size: {e}")

+        # Apply truncation if input exceeds the limit
+        if self.input_tokens > max_prompt_length:
+            # Truncate input ids
+            truncate_amount = self.input_tokens - max_prompt_length
+            input_ids = input_ids[:max_prompt_length]
             # Update token count
-            self.
+            if "oga-" in self.llm_loaded.recipe:
+                self.input_tokens = len(input_ids)
+            else:
+                self.input_tokens = len(input_ids[0])

-            #
+            # Log warning message instead of raising exception
             truncation_message = (
-                f"Input exceeded {
+                f"Input exceeded {max_prompt_length} tokens. "
                 f"Truncated {truncate_amount} tokens from the beginning."
             )
             logging.warning(truncation_message)
@@ -1285,9 +1687,11 @@ class Server:
         """
         Send performance statistics to the client.
         """
-        # If using
-        if self.llm_loaded and
-
+        # If using wrapped server, get telemetry from the telemetry instance
+        if self.llm_loaded and (
+            self.llm_loaded.recipe == "llamacpp" or self.llm_loaded.recipe == "flm"
+        ):
+            return self.wrapped_server.telemetry.get_telemetry_data()

         # For built-in server, use the existing telemetry
         return {
@@ -1387,6 +1791,7 @@ class Server:
                 checkpoint=config.checkpoint,
                 recipe=config.recipe,
                 reasoning=config.reasoning,
+                vision=config.vision,
                 mmproj=config.mmproj,
                 # The pull endpoint will download an upgraded model if available, even
                 # if we already have a local copy of the model
@@ -1432,9 +1837,10 @@ class Server:
         Load a registered LLM into system memory. Install the model first, if needed.
         config: the information required to load the model
         """
+        from huggingface_hub.constants import HF_HUB_CACHE
+
         try:
             await self._load_lock.acquire()
-
             # Acquire all generate locks
             for _ in range(self.max_concurrent_generations):
                 await self._generate_semaphore.acquire()
@@ -1459,6 +1865,38 @@ class Server:
             # Get additional properties from the model registry
             config_to_use = LoadConfig(**supported_models[config.model_name])

+            # For locally uploaded models, convert the relative checkpoint path to absolute path
+            model_source = supported_models.get(config.model_name, {}).get(
+                "source", None
+            )
+            if (
+                model_source == "local_upload"
+                and config_to_use.checkpoint
+                and not config_to_use.recipe.startswith("hf-")
+            ):
+                # Check if checkpoint is a relative path (stored during upload)
+                if not os.path.isabs(config_to_use.checkpoint):
+                    # Convert relative path to absolute by joining with HF_HUB_CACHE
+                    absolute_checkpoint = os.path.join(
+                        HF_HUB_CACHE, config_to_use.checkpoint
+                    )
+                    if os.path.exists(absolute_checkpoint):
+                        config_to_use.checkpoint = absolute_checkpoint
+                    else:
+                        logging.warning(
+                            f"Checkpoint path does not exist: {absolute_checkpoint}"
+                        )
+
+                # Also resolve mmproj path if present
+                if config_to_use.mmproj and not os.path.isabs(config_to_use.mmproj):
+                    absolute_mmproj = os.path.join(HF_HUB_CACHE, config_to_use.mmproj)
+                    if os.path.exists(absolute_mmproj):
+                        config_to_use.mmproj = absolute_mmproj
+                    else:
+                        logging.warning(
+                            f"MMProj path does not exist: {absolute_mmproj}"
+                        )
+
             # Caching mechanism: if the checkpoint is already loaded there is nothing else to do
             if (
                 self.llm_loaded
@@ -1466,9 +1904,9 @@ class Server:
             ):
                 if (
                     self.llm_loaded.recipe == "llamacpp"
-
-                ):
-                    #
+                    or self.llm_loaded.recipe == "flm"
+                ) and self.wrapped_server.process.poll():
+                    # wrapped server process has gone away for some reason, so we should
                     # proceed with loading to get it back
                     pass
                 else:
@@ -1484,12 +1922,18 @@ class Server:
             logging.info(f"Loading llm: {config.model_name}")
             try:
                 if config_to_use.recipe == "llamacpp":
-                    self.
+                    self.wrapped_server = LlamaServer(self.llamacpp_backend)
+                    self.wrapped_server.load(
+                        model_config=config_to_use,
+                        ctx_size=self.ctx_size,
+                        do_not_upgrade=True,
+                    )
+
+                elif config_to_use.recipe == "flm":
+                    self.wrapped_server = FlmServer()
+                    self.wrapped_server.load(
                         model_config=config_to_use,
-                        telemetry=self.llama_telemetry,
-                        backend=self.llamacpp_backend,
                         ctx_size=self.ctx_size,
-                        # Models should only upgrade when using the pull endpoint
                         do_not_upgrade=True,
                     )

@@ -1529,8 +1973,8 @@ class Server:
             for _ in range(self.max_concurrent_generations):
                 await self._generate_semaphore.acquire()

-            if self.llm_loaded.recipe == "llamacpp":
-                self.
+            if self.llm_loaded.recipe == "llamacpp" or self.llm_loaded.recipe == "flm":
+                self.wrapped_server.process.terminate()

             self.llm_loaded = None
             self.tokenizer = None
@@ -1567,6 +2011,36 @@ class Server:

         return {"object": "list", "data": models_list}

+    async def retrieve_model(self, model_id: str):
+        """
+        Retrieve a specific model by ID in OpenAI-compatible format.
+        """
+        # Raise an error if the model does not exist
+        if model_id not in self.local_models:
+            # Mimic the error format of the OpenAI API
+            raise HTTPException(
+                status_code=404,
+                detail={
+                    "message": f"model {model_id} not found",
+                    "type": "api_error",
+                    "param": None,
+                    "code": None,
+                },
+            )
+
+        # Return the specific model
+        model_info = self.local_models[model_id]
+        model = ServerModel(
+            id=model_id,
+            owned_by="lemonade",
+            object="model",
+            created=int(time.time()),
+            checkpoint=model_info["checkpoint"],
+            recipe=model_info["recipe"],
+        )
+
+        return model
+
     def setup_middleware_timer(self):
         logging.info("Middleware set up")

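retrieve_model backs the new GET {prefix}/models/{model_id} route and mirrors the OpenAI 404 error shape when the model is not installed locally. A quick lookup sketch, with host, port, and model ID as placeholders:

# Hypothetical single-model lookup via the new route (host/port/model ID are placeholders).
import requests

model_id = "user.my-model"
resp = requests.get(f"http://localhost:8000/api/v1/models/{model_id}", timeout=30)
if resp.status_code == 404:
    print("not installed:", resp.json()["detail"]["message"])
else:
    info = resp.json()
    print(info["id"], info.get("checkpoint"), info.get("recipe"))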
@@ -1602,6 +2076,48 @@ class Server:
             logging.debug(f"Total request time: {request_time:.4f} seconds")
             return response

+    async def logs_ws(self, websocket: WebSocket):
+        if not self.log_file or not os.path.exists(self.log_file):
+            await websocket.close(code=4000)
+            return
+        await log_streamer(websocket, self.log_file)
+
+    async def get_incompatible_models(self):
+        """
+        Get information about incompatible RyzenAI models in the cache.
+        """
+        try:
+            return ModelManager().get_incompatible_ryzenai_models()
+        except Exception as e:
+            raise HTTPException(
+                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+                detail=f"Failed to scan for incompatible models: {str(e)}",
+            )
+
+    async def cleanup_incompatible_models(self, request: Request):
+        """
+        Delete selected incompatible RyzenAI models from the cache.
+        """
+        try:
+            body = await request.json()
+            model_paths = body.get("model_paths", [])
+
+            if not model_paths:
+                raise HTTPException(
+                    status_code=status.HTTP_400_BAD_REQUEST,
+                    detail="No model_paths provided",
+                )
+
+            result = ModelManager().cleanup_incompatible_models(model_paths)
+            return result
+        except HTTPException:
+            raise
+        except Exception as e:
+            raise HTTPException(
+                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+                detail=f"Failed to cleanup models: {str(e)}",
+            )
+

 # This file was originally licensed under Apache 2.0. It has been modified.
 # Modifications Copyright (c) 2025 AMD