lemonade-sdk 8.1.4__py3-none-any.whl → 8.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of lemonade-sdk has been flagged as a potentially problematic release.

Files changed (53)
  1. lemonade/cache.py +6 -1
  2. lemonade/cli.py +47 -5
  3. lemonade/common/inference_engines.py +13 -4
  4. lemonade/common/status.py +4 -4
  5. lemonade/common/system_info.py +544 -1
  6. lemonade/profilers/agt_power.py +437 -0
  7. lemonade/profilers/hwinfo_power.py +429 -0
  8. lemonade/tools/accuracy.py +143 -48
  9. lemonade/tools/adapter.py +6 -1
  10. lemonade/tools/bench.py +26 -8
  11. lemonade/tools/flm/__init__.py +1 -0
  12. lemonade/tools/flm/utils.py +303 -0
  13. lemonade/tools/huggingface/bench.py +6 -1
  14. lemonade/tools/llamacpp/bench.py +146 -27
  15. lemonade/tools/llamacpp/load.py +30 -2
  16. lemonade/tools/llamacpp/utils.py +393 -33
  17. lemonade/tools/oga/bench.py +5 -26
  18. lemonade/tools/oga/load.py +60 -121
  19. lemonade/tools/oga/migration.py +403 -0
  20. lemonade/tools/report/table.py +76 -8
  21. lemonade/tools/server/flm.py +133 -0
  22. lemonade/tools/server/llamacpp.py +220 -553
  23. lemonade/tools/server/serve.py +684 -168
  24. lemonade/tools/server/static/js/chat.js +666 -342
  25. lemonade/tools/server/static/js/model-settings.js +24 -3
  26. lemonade/tools/server/static/js/models.js +597 -73
  27. lemonade/tools/server/static/js/shared.js +79 -14
  28. lemonade/tools/server/static/logs.html +191 -0
  29. lemonade/tools/server/static/styles.css +491 -66
  30. lemonade/tools/server/static/webapp.html +83 -31
  31. lemonade/tools/server/tray.py +158 -38
  32. lemonade/tools/server/utils/macos_tray.py +226 -0
  33. lemonade/tools/server/utils/{system_tray.py → windows_tray.py} +13 -0
  34. lemonade/tools/server/webapp.py +4 -1
  35. lemonade/tools/server/wrapped_server.py +559 -0
  36. lemonade/version.py +1 -1
  37. lemonade_install/install.py +54 -611
  38. {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/METADATA +29 -72
  39. lemonade_sdk-8.2.2.dist-info/RECORD +83 -0
  40. lemonade_server/cli.py +145 -37
  41. lemonade_server/model_manager.py +521 -37
  42. lemonade_server/pydantic_models.py +28 -1
  43. lemonade_server/server_models.json +246 -92
  44. lemonade_server/settings.py +39 -39
  45. lemonade/tools/quark/__init__.py +0 -0
  46. lemonade/tools/quark/quark_load.py +0 -173
  47. lemonade/tools/quark/quark_quantize.py +0 -439
  48. lemonade_sdk-8.1.4.dist-info/RECORD +0 -77
  49. {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/WHEEL +0 -0
  50. {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/entry_points.txt +0 -0
  51. {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/licenses/LICENSE +0 -0
  52. {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/licenses/NOTICE.md +0 -0
  53. {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/top_level.txt +0 -0
@@ -7,15 +7,16 @@ import logging
  import platform
  import tempfile
  import traceback
- from typing import Optional, Union
+ from typing import Optional, Union, List
  import json
- import subprocess
  from pathlib import Path
-
- from fastapi import FastAPI, HTTPException, status, Request
+ import os
+ import shutil
+ from fastapi import FastAPI, HTTPException, status, Request, WebSocket, Form, UploadFile
  from fastapi.responses import StreamingResponse
  from fastapi.middleware.cors import CORSMiddleware
  from fastapi.staticfiles import StaticFiles
+ from starlette.websockets import WebSocketDisconnect, WebSocketState
  import uvicorn
  from uvicorn.config import Config
  from uvicorn.server import Server as UvicornServer
@@ -47,7 +48,9 @@ from openai.types.responses import (
  )

  import lemonade.api as lemonade_api
- import lemonade.tools.server.llamacpp as llamacpp
+ from lemonade.tools.server.wrapped_server import WrappedServer
+ from lemonade.tools.server.llamacpp import LlamaServer
+ from lemonade.tools.server.flm import FlmServer
  from lemonade.tools.server.tool_calls import extract_tool_calls, get_tool_call_pattern
  from lemonade.tools.server.webapp import get_webapp_html
  from lemonade.tools.server.utils.port import lifespan
@@ -75,12 +78,83 @@ from lemonade_server.settings import save_setting
  # Tests should use the max_new_tokens argument to set a lower value
  DEFAULT_MAX_NEW_TOKENS = 1500

- # Only import tray on Windows
- if platform.system() == "Windows":
+ if platform.system() in ["Windows", "Darwin"]:
      # pylint: disable=ungrouped-imports
      from lemonade.tools.server.tray import LemonadeTray, OutputDuplicator


+ class ServerLogFilter(logging.Filter):
+     def __init__(self, server):
+         super().__init__()
+         self.server = server
+         self.noisy_paths = {
+             "/api/v1/health",
+             "/api/v0/health",
+             "/api/v1/models",
+             "/api/v0/models",
+         }
+
+     def filter(self, record: logging.LogRecord) -> bool:
+         msg = record.getMessage()
+
+         # Filter out websocket logs
+         if "> TEXT" in msg:
+             return False
+
+         # Filter out noisy HTTP routes if debug logs are OFF
+         if not self.server.debug_logging_enabled:
+             if any(path in msg for path in self.noisy_paths):
+                 return False
+
+         # Otherwise, allow the log
+         return True
+
+
+ async def log_streamer(websocket: WebSocket, path: str, interval: float = 1.0):
+     logger = logging.getLogger()
+     await websocket.accept()
+     try:
+         with open(path, "r", encoding="utf-8") as f:
+             f.seek(0)  # start at the beginning of the file
+             while True:
+                 # Try reading a line
+                 line = f.readline()
+                 if not line:
+                     await asyncio.sleep(interval)
+                     continue
+
+                 # Send defensively: if disconnected, bail out
+                 if websocket.application_state != WebSocketState.CONNECTED:
+                     # Server-side state says we're not connected anymore
+                     break
+
+                 try:
+                     await websocket.send_text(line)
+                 except WebSocketDisconnect:
+                     # Client closed — normal path out
+                     break
+                 except RuntimeError as re:
+                     # Starlette will raise this if a close has already been sent
+                     logger.debug("RuntimeError during send: %s", re)
+                     break
+
+     except WebSocketDisconnect:
+         # Client closed the socket; do not try to send or close again
+         pass
+     except Exception as e:  # pylint: disable=broad-except
+         # Log server-side; do not attempt to send error over a possibly closed socket
+         logger.exception("Error in log_streamer: %s", e)
+     finally:
+         # Only close if Starlette still thinks we're connected.
+         # This prevents "Cannot call send once a close message has been sent."
+         try:
+             if websocket.application_state == WebSocketState.CONNECTED:
+                 await websocket.close()
+         except Exception:  # pylint: disable=broad-except
+             # If close itself races, swallow — we're shutting down anyway.
+             pass
+
+
  class ServerModel(Model):
      """
      An extension of OpenAI's Model class that adds
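
The log_streamer coroutine above is what backs the new /logs/ws WebSocket route registered later in this diff. As a rough illustration only (not part of the package), a client could tail the stream as sketched below; the localhost:8000 address and the third-party websockets package are assumptions:

import asyncio
import websockets  # assumed third-party client library


async def tail_server_logs():
    # /api/v1/logs/ws is the route wired up in setup_routes() further down;
    # localhost:8000 is an assumed default address
    async with websockets.connect("ws://localhost:8000/api/v1/logs/ws") as ws:
        async for line in ws:  # the server sends one log line per text message
            print(line, end="")


asyncio.run(tail_server_logs())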
@@ -133,6 +207,21 @@ class StopOnEvent:
          return self.stop_event.is_set()


+ class NoCacheStaticFiles(StaticFiles):
+     """Custom StaticFiles class with no-cache headers"""
+
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+
+     def file_response(self, *args, **kwargs) -> Response:
+         response = super().file_response(*args, **kwargs)
+         # Add no-cache headers for all static files
+         response.headers["Cache-Control"] = "no-cache, no-store, must-revalidate"
+         response.headers["Pragma"] = "no-cache"
+         response.headers["Expires"] = "0"
+         return response
+
+
  class Server:
      """
      Open a web server that apps can use to communicate with the LLM.
@@ -149,6 +238,7 @@ class Server:
      - /api/v1/chat/completions: chat completion responses using HTTP chunked transfer encoding.
      - /api/v1/responses: responses API using HTTP chunked transfer encoding.
      - /api/v1/models: list all available models.
+     - /api/v1/models/{model_id}: retrieve a specific model by ID.
      """

      def __init__(
@@ -188,6 +278,12 @@ class Server:
              allow_headers=["*"], # Allows all headers
          )

+         # Set up debug middleware if debug logging is enabled
+         # This must be done during app initialization, not at runtime
+         self.debug_logging_enabled = log_level == "debug"
+         if self.debug_logging_enabled:
+             self.setup_middleware_timer()
+
          # Set up custom routes
          self.setup_routes(["/api/v0", "/api/v1"])

@@ -198,7 +294,7 @@ class Server:
          # as the Web App
          static_dir = Path(__file__).parent / "static"
          self.app.mount(
-             "/static", StaticFiles(directory=static_dir), name="static_assets"
+             "/static", NoCacheStaticFiles(directory=static_dir), name="static_assets"
          )

          # Performance stats that are set during /ws and can be
@@ -232,11 +328,8 @@ class Server:
          # Add lock for load/unload operations
          self._load_lock = asyncio.Lock()

-         # Subprocess handle for llama_server.exe
-         self.llama_server_process: subprocess.Popen = None
-
-         # Telemetry instance for llama server
-         self.llama_telemetry = llamacpp.LlamaTelemetry()
+         # Subprocess handle for wrapped instance of llama_server.exe, etc.
+         self.wrapped_server: WrappedServer = None

      def setup_routes(self, api_prefixes: list[str]):
          for prefix in api_prefixes:
@@ -252,16 +345,199 @@ class Server:
              self.app.post(f"{prefix}/completions")(self.completions)
              self.app.post(f"{prefix}/responses")(self.responses)
              self.app.post(f"{prefix}/log-level")(self.set_log_level)
+             self.app.websocket(f"{prefix}/logs/ws")(self.logs_ws)
+             self.app.post(f"{prefix}/add-local-model")(self.add_local_model)

              # OpenAI-compatible routes
              self.app.post(f"{prefix}/chat/completions")(self.chat_completions)
              self.app.post(f"{prefix}/embeddings")(self.embeddings)
              self.app.get(f"{prefix}/models")(self.models)
+             self.app.get(f"{prefix}/models/{{model_id}}")(self.retrieve_model)

              # JinaAI routes (jina.ai/reranker/)
              self.app.post(f"{prefix}/reranking")(self.reranking)
              self.app.post(f"{prefix}/rerank")(self.reranking)

+             # Migration routes
+             self.app.get(f"{prefix}/migration/incompatible-models")(
+                 self.get_incompatible_models
+             )
+             self.app.post(f"{prefix}/migration/cleanup")(
+                 self.cleanup_incompatible_models
+             )
+
+     async def add_local_model(
+         self,
+         model_name: str = Form(...),
+         checkpoint: str = Form(""),
+         recipe: str = Form(...),
+         reasoning: bool = Form(False),
+         vision: bool = Form(False),
+         mmproj: str = Form(None),
+         model_files: List[UploadFile] = None,
+     ):
+         from huggingface_hub.constants import HF_HUB_CACHE
+         from lemonade.tools.llamacpp.utils import parse_checkpoint
+
+         # Upload and register a local model from files.
+         try:
+             if not model_files:
+                 raise HTTPException(
+                     status_code=status.HTTP_400_BAD_REQUEST,
+                     detail="No model files provided for upload",
+                 )
+
+             if not model_name.startswith("user."):
+                 raise HTTPException(
+                     status_code=status.HTTP_400_BAD_REQUEST,
+                     detail="Model name must start with 'user.'",
+                 )
+
+             valid_recipes = ["llamacpp", "oga-npu", "oga-hybrid", "oga-cpu"]
+             if recipe not in valid_recipes:
+                 raise HTTPException(
+                     status_code=status.HTTP_400_BAD_REQUEST,
+                     detail=f"Invalid recipe. Must be one of: {', '.join(valid_recipes)}",
+                 )
+
+             if recipe == "llamacpp" and not any(
+                 f.filename.lower().endswith(".gguf") for f in model_files
+             ):
+                 raise HTTPException(
+                     status_code=status.HTTP_400_BAD_REQUEST,
+                     detail="At least one .gguf file is required for llamacpp",
+                 )
+
+             # Check if model name already exists
+             if model_name in ModelManager().supported_models:
+                 raise HTTPException(
+                     status_code=status.HTTP_409_CONFLICT,
+                     detail=(
+                         f"Model name '{model_name}' already exists. "
+                         "Please use a different name."
+                     ),
+                 )
+
+             model_name_clean = model_name.replace("user.", "")
+
+             # Files are saved to models--{model_name_clean}
+             # Note: This is based on the user's custom model name, NOT the checkpoint field
+             repo_cache_name = model_name_clean.replace("/", "--")
+             snapshot_path = os.path.join(HF_HUB_CACHE, f"models--{repo_cache_name}")
+             os.makedirs(snapshot_path, exist_ok=True)
+
+             # Extract variant from checkpoint field if provided
+             # checkpoint field format: "folder:variant" or just "folder"
+             variant = None
+             if checkpoint and ":" in checkpoint:
+                 _, variant = parse_checkpoint(checkpoint)
+                 # variant now contains just the variant[can be with or without the
+                 # .gguf extension] filename (e.g., "LFM2-VL-1.6B-F16 or LFM2-VL-1.6B-F16.gguf")
+
+             # Save uploaded files, preserving folder structure
+             for file in model_files:
+                 relative_path = file.filename
+                 path_parts = relative_path.split("/")
+
+                 if len(path_parts) > 1:
+                     internal_path = "/".join(path_parts[1:])
+                     file_path = os.path.join(snapshot_path, internal_path)
+                 else:
+                     file_path = os.path.join(snapshot_path, path_parts[0])
+
+                 os.makedirs(os.path.dirname(file_path), exist_ok=True)
+                 with open(file_path, "wb") as f:
+                     content = await file.read()
+                     f.write(content)
+
+             # Resolve actual file paths after upload (for faster loading later)
+             resolved_checkpoint = None
+             resolved_mmproj = None
+
+             # For OGA models, find genai_config.json
+             if recipe.startswith("oga-"):
+                 for root, _, files in os.walk(snapshot_path):
+                     if "genai_config.json" in files:
+                         resolved_checkpoint = root
+                         break
+                 if not resolved_checkpoint:
+                     resolved_checkpoint = snapshot_path
+
+             # For llamacpp models, find the GGUF file
+             elif recipe == "llamacpp":
+                 gguf_file_found = None
+
+                 # If variant is specified, look for that specific file
+                 if variant:
+                     search_term = (
+                         variant if variant.endswith(".gguf") else f"{variant}.gguf"
+                     )
+                     for root, _, files in os.walk(snapshot_path):
+                         if search_term in files:
+                             gguf_file_found = os.path.join(root, search_term)
+                             break
+
+                 # If no variant or variant not found, search for any .gguf file (excluding mmproj)
+                 if not gguf_file_found:
+                     for root, _, files in os.walk(snapshot_path):
+                         gguf_files = [
+                             f
+                             for f in files
+                             if f.endswith(".gguf") and "mmproj" not in f.lower()
+                         ]
+                         if gguf_files:
+                             gguf_file_found = os.path.join(root, gguf_files[0])
+                             break
+
+                 resolved_checkpoint = (
+                     gguf_file_found if gguf_file_found else snapshot_path
+                 )
+
+             # Search for mmproj file if provided
+             if mmproj:
+                 for root, _, files in os.walk(snapshot_path):
+                     if mmproj in files:
+                         resolved_mmproj = os.path.join(root, mmproj)
+                         break
+
+             # Build checkpoint for registration
+             # For llamacpp with resolved path, store the full path relative to HF_HUB_CACHE
+             if resolved_checkpoint:
+                 # Store as relative path from HF_HUB_CACHE for portability
+                 checkpoint_to_register = os.path.relpath(
+                     resolved_checkpoint, HF_HUB_CACHE
+                 )
+             elif variant:
+                 checkpoint_to_register = f"models--{repo_cache_name}:{variant}"
+             else:
+                 checkpoint_to_register = f"models--{repo_cache_name}"
+
+             # Register the model
+             ModelManager().register_local_model(
+                 model_name=model_name,
+                 checkpoint=checkpoint_to_register,
+                 recipe=recipe,
+                 reasoning=reasoning,
+                 vision=vision,
+                 mmproj=resolved_mmproj if resolved_mmproj else mmproj,
+                 snapshot_path=snapshot_path,
+             )
+
+             # Refresh local models
+             self.local_models = ModelManager().downloaded_models_enabled
+
+             return {
+                 "status": "success",
+                 "message": f"Model {model_name} uploaded and registered successfully",
+             }
+         except Exception as e:
+             if os.path.exists(checkpoint_to_register):
+                 shutil.rmtree(checkpoint_to_register)
+             raise HTTPException(
+                 status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+                 detail=f"Failed to upload model: {str(e)}",
+             )
+
      async def set_log_level(self, config: LogLevelConfig):
          """
          Set the logging level of the server.
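
The new add-local-model route above accepts a multipart form upload. A hedged usage sketch follows (not part of the diff; the file name, model name, and localhost:8000 address are hypothetical, and the requests package is an assumed client):

import requests  # assumed HTTP client

# The server requires custom model names to start with "user."
with open("my-model-q4.gguf", "rb") as fh:  # hypothetical local GGUF file
    resp = requests.post(
        "http://localhost:8000/api/v1/add-local-model",
        data={"model_name": "user.my-model", "recipe": "llamacpp"},
        files=[("model_files", ("my-model-q4.gguf", fh, "application/octet-stream"))],
    )
print(resp.status_code, resp.json())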
@@ -380,11 +656,13 @@ class Server:
          )
          file_handler.setLevel(logging_level)
          file_handler.setFormatter(uvicorn_formatter)
+         file_handler.addFilter(ServerLogFilter(self))

          # Set up console handler
          console_handler = logging.StreamHandler()
          console_handler.setLevel(logging_level)
          console_handler.setFormatter(uvicorn_formatter)
+         console_handler.addFilter(ServerLogFilter(self))

          # Configure root logger with both handlers
          logging.basicConfig(
@@ -407,10 +685,6 @@ class Server:
              ).run()
              sys.exit(0)

-         if self.debug_logging_enabled:
-             # Print the elapsed time for each request
-             self.setup_middleware_timer()
-
          # Let the app know what port it's running on, so
          # that the lifespan can access it
          self.app.port = self.port
@@ -507,7 +781,9 @@ class Server:

          return lc

-     async def completions(self, completion_request: CompletionRequest):
+     async def completions(
+         self, completion_request: CompletionRequest, request: Request
+     ):
          """
          Stream completion responses using HTTP chunked transfer encoding.
          """
@@ -520,8 +796,8 @@ class Server:
          # Load the model if it's different from the currently loaded one
          await self.load_llm(lc)

-         if self.llm_loaded.recipe == "llamacpp":
-             return llamacpp.completion(completion_request, self.llama_telemetry)
+         if self.llm_loaded.recipe == "llamacpp" or self.llm_loaded.recipe == "flm":
+             return self.wrapped_server.completion(completion_request)

          # Check if the model supports reasoning
          reasoning_first_token = self.llm_loaded.reasoning
@@ -559,29 +835,43 @@ class Server:
              # This is necessary because the variable is modified
              # in the inner function
              nonlocal reasoning_first_token
+             try:
+                 async for token in self._generate_tokens(**generation_args):
+                     # Handle client disconnect: stop generation and exit
+                     if await request.is_disconnected():
+                         self.stop_event.set()
+                         break

-             async for token in self._generate_tokens(**generation_args):
-                 choice = CompletionChoice(
-                     text=("<think>" + token if reasoning_first_token else token),
-                     index=0,
-                     finish_reason="stop",
-                     logprobs=None,
-                 )
+                     choice = CompletionChoice(
+                         text=(
+                             "<think>" + token if reasoning_first_token else token
+                         ),
+                         index=0,
+                         finish_reason="stop",
+                         logprobs=None,
+                     )

-                 completion = Completion(
-                     id="0",
-                     choices=[choice],
-                     model=self.llm_loaded.checkpoint,
-                     object="text_completion",
-                     created=int(time.time()),
-                 )
+                     completion = Completion(
+                         id="0",
+                         choices=[choice],
+                         model=self.llm_loaded.checkpoint,
+                         object="text_completion",
+                         created=int(time.time()),
+                     )

-                 # Format as SSE
-                 reasoning_first_token = False
-                 yield f"data: {completion.model_dump_json()}\n\n".encode("utf-8")
+                     # Format as SSE
+                     reasoning_first_token = False
+                     yield f"data: {completion.model_dump_json()}\n\n".encode(
+                         "utf-8"
+                     )

-             # Send the [DONE] marker
-             yield b"data: [DONE]\n\n"
+                 # Send the [DONE] marker only if still connected
+                 if not await request.is_disconnected():
+                     yield b"data: [DONE]\n\n"
+             except asyncio.CancelledError:
+                 # Propagate cancellation to the generator loop
+                 self.stop_event.set()
+                 return

          return StreamingResponse(
              generate(),
@@ -639,7 +929,9 @@ class Server:
              created=int(time.time()),
          )

-     async def chat_completions(self, chat_completion_request: ChatCompletionRequest):
+     async def chat_completions(
+         self, chat_completion_request: ChatCompletionRequest, request: Request
+     ):
          """
          Stream chat completion responses using HTTP chunked transfer encoding.
          """
@@ -655,10 +947,25 @@ class Server:
          # Load the model if it's different from the currently loaded one
          await self.load_llm(lc)

-         if self.llm_loaded.recipe == "llamacpp":
-             return llamacpp.chat_completion(
-                 chat_completion_request, self.llama_telemetry
-             )
+         if self.llm_loaded.recipe == "llamacpp" or self.llm_loaded.recipe == "flm":
+             if (
+                 hasattr(chat_completion_request, "enable_thinking")
+                 and chat_completion_request.enable_thinking is False
+                 and "qwen3" in self.llm_loaded.model_name.lower()
+             ):
+
+                 # Modify the last user message to include /no_think
+                 if chat_completion_request.messages:
+                     for i in range(len(chat_completion_request.messages) - 1, -1, -1):
+                         if chat_completion_request.messages[i].get("role") == "user":
+                             original_content = chat_completion_request.messages[i][
+                                 "content"
+                             ]
+                             chat_completion_request.messages[i][
+                                 "content"
+                             ] = f"/no_think\n{original_content}"
+                             break
+             return self.wrapped_server.chat_completion(chat_completion_request)

          # Convert chat messages to text using the model's chat template
          text = self.apply_chat_template(
@@ -720,68 +1027,126 @@ class Server:
              # Keep track of the full response for tool call extraction
              full_response = ""

-             async for token in self._generate_tokens(**generation_args):
-                 # Continuously look for tool calls embedded into the generated text
-                 openai_tool_calls = None
-                 if chat_completion_request.tools:
+             # Track whether we're still in the thinking phase (before </think> tag)
+             in_thinking_phase = self.llm_loaded.reasoning
+             reasoning_buffer = ""  # Accumulate reasoning tokens to detect </think>

-                     # Append the token to the full response
-                     full_response += token
+             try:
+                 async for token in self._generate_tokens(**generation_args):
+                     # Handle client disconnect: stop generation and exit
+                     if await request.is_disconnected():
+                         self.stop_event.set()
+                         break

-                     tool_calls, _ = extract_tool_calls(
-                         full_response,
-                         tool_call_pattern,
-                     )
+                     # Continuously look for tool calls embedded into the generated text
+                     openai_tool_calls = None
+                     if chat_completion_request.tools:

-                     # If there are tool calls, reset the full response for the next tool call
-                     if tool_calls:
-                         openai_tool_calls = []
-                         full_response = ""
-                         for tool_call in tool_calls:
-                             openai_tool_calls.append(
-                                 ChoiceDeltaToolCall(
-                                     index=0,
-                                     id="-",
-                                     function=ChoiceDeltaToolCallFunction(
-                                         arguments=json.dumps(tool_call["arguments"]),
-                                         name=tool_call["name"],
-                                     ),
-                                     type="function",
-                                 )
+                         # Append the token to the full response
+                         full_response += token
+
+                         tool_calls, _ = extract_tool_calls(
+                             full_response,
+                             tool_call_pattern,
                          )

-                 # Create a ChatCompletionChunk
-                 chunk = ChatCompletionChunk.model_construct(
-                     id="0",
-                     object="chat.completion.chunk",
-                     created=int(time.time()),
-                     model=self.llm_loaded.checkpoint,
-                     choices=[
-                         Choice.model_construct(
-                             index=0,
-                             delta=ChoiceDelta(
-                                 content=(
-                                     "<think>" + token
-                                     if reasoning_first_token
-                                     else token
+                         # If there are tool calls, reset the full response for the next call
+                         if tool_calls:
+                             openai_tool_calls = []
+                             full_response = ""
+                             for tool_call in tool_calls:
+                                 openai_tool_calls.append(
+                                     ChoiceDeltaToolCall(
+                                         index=0,
+                                         id="-",
+                                         function=ChoiceDeltaToolCallFunction(
+                                             arguments=json.dumps(
+                                                 tool_call["arguments"]
+                                             ),
+                                             name=tool_call["name"],
+                                         ),
+                                         type="function",
+                                     )
+                                 )
+
+                     # Create a ChatCompletionChunk with reasoning_content support
+                     # If we're in reasoning mode and haven't seen </think> yet,
+                     # send tokens as reasoning_content instead of content
+                     delta_content = None
+                     delta_reasoning = None
+
+                     if reasoning_first_token:
+                         # First token - include opening tag in reasoning
+                         delta_reasoning = "<think>" + token
+                         reasoning_first_token = False
+                         reasoning_buffer = token
+                     elif in_thinking_phase:
+                         # Still in thinking phase - accumulate and check for </think>
+                         reasoning_buffer += token
+
+                         # Check if we've seen the closing tag
+                         if "</think>" in reasoning_buffer:
+                             # Split at the closing tag
+                             before_close, after_close = reasoning_buffer.split(
+                                 "</think>", 1
+                             )
+
+                             # Send everything before + closing tag as reasoning
+                             if before_close or not reasoning_buffer.startswith(
+                                 "</think>"
+                             ):
+                                 delta_reasoning = before_close + "</think>"
+                             else:
+                                 delta_reasoning = "</think>"
+
+                             # Everything after goes to content (will be sent in next iteration)
+                             # For now, mark that we've exited thinking phase
+                             in_thinking_phase = False
+
+                             # If there's content after </think>, we need to send it too
+                             # But we send it in the current chunk as regular content
+                             if after_close:
+                                 # We have both reasoning and content in this token
+                                 # Send reasoning first, content will accumulate
+                                 delta_content = after_close
+                         else:
+                             # Still accumulating thinking, send as reasoning_content
+                             delta_reasoning = token
+                     else:
+                         # Normal content (after thinking phase ended)
+                         delta_content = token
+
+                     chunk = ChatCompletionChunk.model_construct(
+                         id="0",
+                         object="chat.completion.chunk",
+                         created=int(time.time()),
+                         model=self.llm_loaded.checkpoint,
+                         choices=[
+                             Choice.model_construct(
+                                 index=0,
+                                 delta=ChoiceDelta(
+                                     content=delta_content,
+                                     reasoning_content=delta_reasoning,
+                                     function_call=None,
+                                     role="assistant",
+                                     tool_calls=openai_tool_calls,
+                                     refusal=None,
                                  ),
-                                 function_call=None,
-                                 role="assistant",
-                                 tool_calls=openai_tool_calls,
-                                 refusal=None,
-                             ),
-                             finish_reason=None,
-                             logprobs=None,
-                         )
-                     ],
-                 )
+                                 finish_reason=None,
+                                 logprobs=None,
+                             )
+                         ],
+                     )

-                 # Format as SSE
-                 reasoning_first_token = False
-                 yield f"data: {chunk.model_dump_json()}\n\n".encode("utf-8")
+                     # Format as SSE
+                     yield f"data: {chunk.model_dump_json()}\n\n".encode("utf-8")

-             # Send the [DONE] marker
-             yield b"data: [DONE]\n\n"
+                 # Send the [DONE] marker only if still connected
+                 if not await request.is_disconnected():
+                     yield b"data: [DONE]\n\n"
+             except asyncio.CancelledError:
+                 self.stop_event.set()
+                 return

          return StreamingResponse(
              generate(),
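
With the change above, streaming chat chunks can carry thinking tokens in a reasoning_content field separate from content. A minimal consumer sketch, assuming the requests package, a locally installed reasoning model (the name below is hypothetical), and an assumed localhost:8000 address:

import json
import requests  # assumed HTTP client

payload = {
    "model": "user.my-reasoning-model",  # hypothetical model name
    "messages": [{"role": "user", "content": "Why is the sky blue?"}],
    "stream": True,
}
with requests.post(
    "http://localhost:8000/api/v1/chat/completions", json=payload, stream=True
) as resp:
    for raw in resp.iter_lines():
        if not raw or not raw.startswith(b"data: "):
            continue
        data = raw[len(b"data: "):]
        if data == b"[DONE]":
            break
        delta = json.loads(data)["choices"][0]["delta"]
        # Thinking tokens arrive as reasoning_content; the answer arrives as content
        print(delta.get("reasoning_content") or delta.get("content") or "", end="")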
@@ -861,7 +1226,7 @@ class Server:

          if self.llm_loaded.recipe == "llamacpp":
              try:
-                 return llamacpp.embeddings(embeddings_request, self.llama_telemetry)
+                 return self.wrapped_server.embeddings(embeddings_request)
              except Exception as e: # pylint: disable=broad-exception-caught
                  # Check if model has embeddings label
                  model_info = ModelManager().supported_models.get(
@@ -884,7 +1249,7 @@ class Server:

      async def reranking(self, reranking_request: RerankingRequest):
          """
-         Rerank documents based on their relevance to a query using the llamacpp server.
+         Rerank documents based on their relevance to a query.
          """
          # Initialize load config from reranking request
          lc = LoadConfig(model_name=reranking_request.model)
@@ -894,7 +1259,7 @@ class Server:

          if self.llm_loaded.recipe == "llamacpp":
              try:
-                 return llamacpp.reranking(reranking_request, self.llama_telemetry)
+                 return self.wrapped_server.reranking(reranking_request)
              except Exception as e: # pylint: disable=broad-exception-caught
                  # Check if model has reranking label
                  model_info = ModelManager().supported_models.get(
@@ -940,7 +1305,7 @@ class Server:
              formatted_messages.append(f"{role_marker}\n{content} <|end|>")
          return "\n".join(formatted_messages) + "\n<|assistant|>"

-     async def responses(self, responses_request: ResponsesRequest):
+     async def responses(self, responses_request: ResponsesRequest, request: Request):
          """
          Stream responses using HTTP chunked transfer encoding.
          """
@@ -953,6 +1318,12 @@ class Server:
          # Load the model if it's different from the currently loaded one
          await self.load_llm(lc)

+         if self.llm_loaded.recipe == "llamacpp":
+             raise HTTPException(
+                 status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
+                 detail=f"Responses API not supported for recipe: {self.llm_loaded.recipe}",
+             )
+
          # Convert chat messages to text using the model's chat template
          if isinstance(responses_request.input, str):
              text = responses_request.input
@@ -1006,56 +1377,72 @@ class Server:

              full_response = "<think>" if reasoning_first_token else ""

-             async for token in self._generate_tokens(**generation_args):
+             try:
+                 async for token in self._generate_tokens(**generation_args):
+                     # Handle client disconnect: stop generation and exit
+                     if await request.is_disconnected():
+                         self.stop_event.set()
+                         break

-                 # Create an event
-                 delta_event = ResponseTextDeltaEvent(
-                     content_index=0,
-                     delta=("<think>" + token if reasoning_first_token else token),
-                     item_id="0 ",
-                     output_index=0,
-                     type="response.output_text.delta",
-                     sequence_number=0,
-                 )
-                 full_response += token
+                     # Create an event
+                     delta_event = ResponseTextDeltaEvent(
+                         content_index=0,
+                         delta=(
+                             "<think>" + token if reasoning_first_token else token
+                         ),
+                         item_id="0 ",
+                         logprobs=[],
+                         output_index=0,
+                         sequence_number=0,
+                         type="response.output_text.delta",
+                     )
+                     full_response += token

-                 # Format as SSE
-                 reasoning_first_token = False
-                 yield f"data: {delta_event.model_dump_json()}\n\n".encode("utf-8")
+                     # Format as SSE
+                     reasoning_first_token = False
+                     yield f"data: {delta_event.model_dump_json()}\n\n".encode(
+                         "utf-8"
+                     )

-             # Send the completed event
-             response_output_message = ResponseOutputMessage(
-                 id="0",
-                 content=[
-                     ResponseOutputText(
-                         annotations=[],
-                         text=full_response,
-                         type="output_text",
+                 # Send the completed event (only if still connected)
+                 if not await request.is_disconnected():
+                     response_output_message = ResponseOutputMessage(
+                         id="0",
+                         content=[
+                             ResponseOutputText(
+                                 annotations=[],
+                                 text=full_response,
+                                 type="output_text",
+                             )
+                         ],
+                         role="assistant",
+                         status="completed",
+                         type="message",
+                     )
+                     response = Response(
+                         id="0",
+                         model=self.llm_loaded.checkpoint,
+                         created_at=int(time.time()),
+                         object="response",
+                         output=[response_output_message],
+                         parallel_tool_calls=True,
+                         tool_choice="auto",
+                         tools=[],
+                     )
+                     completed_event = ResponseCompletedEvent(
+                         response=response,
+                         type="response.completed",
+                         sequence_number=0,
+                     )
+                     yield f"data: {completed_event.model_dump_json()}\n\n".encode(
+                         "utf-8"
                      )
-                 ],
-                 role="assistant",
-                 status="completed",
-                 type="message",
-             )
-             response = Response(
-                 id="0",
-                 model=self.llm_loaded.checkpoint,
-                 created_at=int(time.time()),
-                 object="response",
-                 output=[response_output_message],
-                 parallel_tool_calls=True,
-                 tool_choice="auto",
-                 tools=[],
-             )
-             completed_event = ResponseCompletedEvent(
-                 response=response,
-                 type="response.completed",
-                 sequence_number=0,
-             )
-             yield f"data: {completed_event.model_dump_json()}\n\n".encode("utf-8")

-             # Send the [DONE] marker
-             yield b"data: [DONE]\n\n"
+                     # Send the [DONE] marker
+                     yield b"data: [DONE]\n\n"
+             except asyncio.CancelledError:
+                 self.stop_event.set()
+                 return

          return StreamingResponse(
              generate(),
@@ -1150,18 +1537,33 @@ class Server:
          )
          self.input_tokens = len(input_ids[0])

-         # For non-llamacpp recipes, truncate inputs to ctx_size if needed
-         if self.llm_loaded.recipe != "llamacpp" and self.input_tokens > self.ctx_size:
-             # Truncate input ids
-             truncate_amount = self.input_tokens - self.ctx_size
-             input_ids = input_ids[: self.ctx_size]
+         max_prompt_length = self.ctx_size # Default fallback
+         # For OGA models, try to read the actual max prompt length from config
+         if "oga-" in self.llm_loaded.recipe:
+             try:
+                 if model.config and model.config.get("max_prompt_length"):
+                     max_prompt_length = model.config["max_prompt_length"]
+                     logging.debug(
+                         f"Using OGA model max_prompt_length: {max_prompt_length}"
+                     )
+             # pylint: disable=broad-exception-caught
+             except Exception as e:
+                 logging.debug(f"Could not read OGA model config, using ctx_size: {e}")

+         # Apply truncation if input exceeds the limit
+         if self.input_tokens > max_prompt_length:
+             # Truncate input ids
+             truncate_amount = self.input_tokens - max_prompt_length
+             input_ids = input_ids[:max_prompt_length]
              # Update token count
-             self.input_tokens = len(input_ids)
+             if "oga-" in self.llm_loaded.recipe:
+                 self.input_tokens = len(input_ids)
+             else:
+                 self.input_tokens = len(input_ids[0])

-             # Show warning message
+             # Log warning message instead of raising exception
              truncation_message = (
-                 f"Input exceeded {self.ctx_size} tokens. "
+                 f"Input exceeded {max_prompt_length} tokens. "
                  f"Truncated {truncate_amount} tokens from the beginning."
              )
              logging.warning(truncation_message)
@@ -1285,9 +1687,11 @@ class Server:
          """
          Send performance statistics to the client.
          """
-         # If using llama server, get telemetry from the telemetry instance
-         if self.llm_loaded and self.llm_loaded.recipe == "llamacpp":
-             return self.llama_telemetry.get_telemetry_data()
+         # If using wrapped server, get telemetry from the telemetry instance
+         if self.llm_loaded and (
+             self.llm_loaded.recipe == "llamacpp" or self.llm_loaded.recipe == "flm"
+         ):
+             return self.wrapped_server.telemetry.get_telemetry_data()

          # For built-in server, use the existing telemetry
          return {
@@ -1387,6 +1791,7 @@ class Server:
              checkpoint=config.checkpoint,
              recipe=config.recipe,
              reasoning=config.reasoning,
+             vision=config.vision,
              mmproj=config.mmproj,
              # The pull endpoint will download an upgraded model if available, even
              # if we already have a local copy of the model
@@ -1432,9 +1837,10 @@ class Server:
          Load a registered LLM into system memory. Install the model first, if needed.
          config: the information required to load the model
          """
+         from huggingface_hub.constants import HF_HUB_CACHE
+
          try:
              await self._load_lock.acquire()
-
              # Acquire all generate locks
              for _ in range(self.max_concurrent_generations):
                  await self._generate_semaphore.acquire()
@@ -1459,6 +1865,38 @@ class Server:
              # Get additional properties from the model registry
              config_to_use = LoadConfig(**supported_models[config.model_name])

+             # For locally uploaded models, convert the relative checkpoint path to absolute path
+             model_source = supported_models.get(config.model_name, {}).get(
+                 "source", None
+             )
+             if (
+                 model_source == "local_upload"
+                 and config_to_use.checkpoint
+                 and not config_to_use.recipe.startswith("hf-")
+             ):
+                 # Check if checkpoint is a relative path (stored during upload)
+                 if not os.path.isabs(config_to_use.checkpoint):
+                     # Convert relative path to absolute by joining with HF_HUB_CACHE
+                     absolute_checkpoint = os.path.join(
+                         HF_HUB_CACHE, config_to_use.checkpoint
+                     )
+                     if os.path.exists(absolute_checkpoint):
+                         config_to_use.checkpoint = absolute_checkpoint
+                     else:
+                         logging.warning(
+                             f"Checkpoint path does not exist: {absolute_checkpoint}"
+                         )
+
+                 # Also resolve mmproj path if present
+                 if config_to_use.mmproj and not os.path.isabs(config_to_use.mmproj):
+                     absolute_mmproj = os.path.join(HF_HUB_CACHE, config_to_use.mmproj)
+                     if os.path.exists(absolute_mmproj):
+                         config_to_use.mmproj = absolute_mmproj
+                     else:
+                         logging.warning(
+                             f"MMProj path does not exist: {absolute_mmproj}"
+                         )
+
              # Caching mechanism: if the checkpoint is already loaded there is nothing else to do
              if (
                  self.llm_loaded
@@ -1466,9 +1904,9 @@ class Server:
              ):
                  if (
                      self.llm_loaded.recipe == "llamacpp"
-                     and self.llama_server_process.poll()
-                 ):
-                     # llama-server process has gone away for some reason, so we should
+                     or self.llm_loaded.recipe == "flm"
+                 ) and self.wrapped_server.process.poll():
+                     # wrapped server process has gone away for some reason, so we should
                      # proceed with loading to get it back
                      pass
                  else:
@@ -1484,12 +1922,18 @@ class Server:
              logging.info(f"Loading llm: {config.model_name}")
              try:
                  if config_to_use.recipe == "llamacpp":
-                     self.llama_server_process = llamacpp.server_load(
+                     self.wrapped_server = LlamaServer(self.llamacpp_backend)
+                     self.wrapped_server.load(
+                         model_config=config_to_use,
+                         ctx_size=self.ctx_size,
+                         do_not_upgrade=True,
+                     )
+
+                 elif config_to_use.recipe == "flm":
+                     self.wrapped_server = FlmServer()
+                     self.wrapped_server.load(
                          model_config=config_to_use,
-                         telemetry=self.llama_telemetry,
-                         backend=self.llamacpp_backend,
                          ctx_size=self.ctx_size,
-                         # Models should only upgrade when using the pull endpoint
                          do_not_upgrade=True,
                      )

@@ -1529,8 +1973,8 @@ class Server:
          for _ in range(self.max_concurrent_generations):
              await self._generate_semaphore.acquire()

-         if self.llm_loaded.recipe == "llamacpp":
-             self.llama_server_process.terminate()
+         if self.llm_loaded.recipe == "llamacpp" or self.llm_loaded.recipe == "flm":
+             self.wrapped_server.process.terminate()

          self.llm_loaded = None
          self.tokenizer = None
@@ -1567,6 +2011,36 @@ class Server:

          return {"object": "list", "data": models_list}

+     async def retrieve_model(self, model_id: str):
+         """
+         Retrieve a specific model by ID in OpenAI-compatible format.
+         """
+         # Raise an error if the model does not exist
+         if model_id not in self.local_models:
+             # Mimic the error format of the OpenAI API
+             raise HTTPException(
+                 status_code=404,
+                 detail={
+                     "message": f"model {model_id} not found",
+                     "type": "api_error",
+                     "param": None,
+                     "code": None,
+                 },
+             )
+
+         # Return the specific model
+         model_info = self.local_models[model_id]
+         model = ServerModel(
+             id=model_id,
+             owned_by="lemonade",
+             object="model",
+             created=int(time.time()),
+             checkpoint=model_info["checkpoint"],
+             recipe=model_info["recipe"],
+         )
+
+         return model
+
      def setup_middleware_timer(self):
          logging.info("Middleware set up")

@@ -1602,6 +2076,48 @@ class Server:
              logging.debug(f"Total request time: {request_time:.4f} seconds")
              return response

+     async def logs_ws(self, websocket: WebSocket):
+         if not self.log_file or not os.path.exists(self.log_file):
+             await websocket.close(code=4000)
+             return
+         await log_streamer(websocket, self.log_file)
+
+     async def get_incompatible_models(self):
+         """
+         Get information about incompatible RyzenAI models in the cache.
+         """
+         try:
+             return ModelManager().get_incompatible_ryzenai_models()
+         except Exception as e:
+             raise HTTPException(
+                 status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+                 detail=f"Failed to scan for incompatible models: {str(e)}",
+             )
+
+     async def cleanup_incompatible_models(self, request: Request):
+         """
+         Delete selected incompatible RyzenAI models from the cache.
+         """
+         try:
+             body = await request.json()
+             model_paths = body.get("model_paths", [])
+
+             if not model_paths:
+                 raise HTTPException(
+                     status_code=status.HTTP_400_BAD_REQUEST,
+                     detail="No model_paths provided",
+                 )
+
+             result = ModelManager().cleanup_incompatible_models(model_paths)
+             return result
+         except HTTPException:
+             raise
+         except Exception as e:
+             raise HTTPException(
+                 status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+                 detail=f"Failed to cleanup models: {str(e)}",
+             )
+

  # This file was originally licensed under Apache 2.0. It has been modified.
  # Modifications Copyright (c) 2025 AMD