lemonade-sdk 8.1.11__py3-none-any.whl → 8.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32)
  1. lemonade/cache.py +6 -1
  2. lemonade/common/status.py +4 -4
  3. lemonade/common/system_info.py +0 -26
  4. lemonade/tools/bench.py +22 -1
  5. lemonade/tools/flm/utils.py +70 -22
  6. lemonade/tools/llamacpp/bench.py +111 -23
  7. lemonade/tools/llamacpp/load.py +30 -2
  8. lemonade/tools/llamacpp/utils.py +234 -15
  9. lemonade/tools/oga/bench.py +0 -26
  10. lemonade/tools/oga/load.py +38 -142
  11. lemonade/tools/oga/migration.py +403 -0
  12. lemonade/tools/report/table.py +6 -0
  13. lemonade/tools/server/flm.py +2 -6
  14. lemonade/tools/server/llamacpp.py +20 -1
  15. lemonade/tools/server/serve.py +335 -17
  16. lemonade/tools/server/static/js/models.js +416 -18
  17. lemonade/tools/server/static/js/shared.js +44 -6
  18. lemonade/tools/server/static/logs.html +29 -19
  19. lemonade/tools/server/static/styles.css +204 -0
  20. lemonade/tools/server/static/webapp.html +32 -0
  21. lemonade/version.py +1 -1
  22. lemonade_install/install.py +33 -579
  23. {lemonade_sdk-8.1.11.dist-info → lemonade_sdk-8.2.0.dist-info}/METADATA +5 -3
  24. {lemonade_sdk-8.1.11.dist-info → lemonade_sdk-8.2.0.dist-info}/RECORD +32 -31
  25. lemonade_server/cli.py +10 -0
  26. lemonade_server/model_manager.py +172 -11
  27. lemonade_server/server_models.json +102 -66
  28. {lemonade_sdk-8.1.11.dist-info → lemonade_sdk-8.2.0.dist-info}/WHEEL +0 -0
  29. {lemonade_sdk-8.1.11.dist-info → lemonade_sdk-8.2.0.dist-info}/entry_points.txt +0 -0
  30. {lemonade_sdk-8.1.11.dist-info → lemonade_sdk-8.2.0.dist-info}/licenses/LICENSE +0 -0
  31. {lemonade_sdk-8.1.11.dist-info → lemonade_sdk-8.2.0.dist-info}/licenses/NOTICE.md +0 -0
  32. {lemonade_sdk-8.1.11.dist-info → lemonade_sdk-8.2.0.dist-info}/top_level.txt +0 -0
lemonade/tools/server/serve.py

@@ -7,12 +7,12 @@ import logging
  import platform
  import tempfile
  import traceback
- from typing import Optional, Union
+ from typing import Optional, Union, List
  import json
  from pathlib import Path
  import os
-
- from fastapi import FastAPI, HTTPException, status, Request, WebSocket
+ import shutil
+ from fastapi import FastAPI, HTTPException, status, Request, WebSocket, Form, UploadFile
  from fastapi.responses import StreamingResponse
  from fastapi.middleware.cors import CORSMiddleware
  from fastapi.staticfiles import StaticFiles
@@ -83,10 +83,31 @@ if platform.system() in ["Windows", "Darwin"]:
  from lemonade.tools.server.tray import LemonadeTray, OutputDuplicator


- class WebsocketTextFilter(logging.Filter):
+ class ServerLogFilter(logging.Filter):
+     def __init__(self, server):
+         super().__init__()
+         self.server = server
+         self.noisy_paths = {
+             "/api/v1/health",
+             "/api/v0/health",
+             "/api/v1/models",
+             "/api/v0/models",
+         }
+
      def filter(self, record: logging.LogRecord) -> bool:
-         # Only allow logs that don't include "> TEXT"
-         return "> TEXT" not in record.getMessage()
+         msg = record.getMessage()
+
+         # Filter out websocket logs
+         if "> TEXT" in msg:
+             return False
+
+         # Filter out noisy HTTP routes if debug logs are OFF
+         if not self.server.debug_logging_enabled:
+             if any(path in msg for path in self.noisy_paths):
+                 return False
+
+         # Otherwise, allow the log
+         return True


  async def log_streamer(websocket: WebSocket, path: str, interval: float = 1.0):
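As a quick illustration of the new filter's behavior, here is a stand-alone sketch. It assumes the ServerLogFilter class above is in scope; the SimpleNamespace stand-in for the Server instance and the "uvicorn.access" logger name are assumptions for demonstration only, since in serve.py the real Server instance is passed to the filter.

    import logging
    from types import SimpleNamespace

    # Stand-in server object; debug_logging_enabled mirrors the attribute the
    # filter checks before suppressing the noisy health/models routes.
    server = SimpleNamespace(debug_logging_enabled=False)
    log_filter = ServerLogFilter(server)

    def allowed(message):
        # Build a bare LogRecord and ask the filter whether it would be emitted.
        record = logging.LogRecord(
            "uvicorn.access", logging.INFO, __file__, 0, message, None, None
        )
        return log_filter.filter(record)

    print(allowed('GET /api/v1/health HTTP/1.1 200'))         # False: noisy route suppressed
    print(allowed('127.0.0.1 - "WebSocket /logs/ws" > TEXT'))  # False: websocket frame suppressed
    print(allowed('POST /api/v1/chat/completions 200'))        # True: normal request is logged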
@@ -94,7 +115,7 @@ async def log_streamer(websocket: WebSocket, path: str, interval: float = 1.0):
      await websocket.accept()
      try:
          with open(path, "r", encoding="utf-8") as f:
-             f.seek(0, os.SEEK_END)  # start at end
+             f.seek(0)  # start at the beginning of the file
              while True:
                  # Try reading a line
                  line = f.readline()
@@ -325,6 +346,7 @@ class Server:
          self.app.post(f"{prefix}/responses")(self.responses)
          self.app.post(f"{prefix}/log-level")(self.set_log_level)
          self.app.websocket(f"{prefix}/logs/ws")(self.logs_ws)
+         self.app.post(f"{prefix}/add-local-model")(self.add_local_model)

          # OpenAI-compatible routes
          self.app.post(f"{prefix}/chat/completions")(self.chat_completions)
@@ -336,6 +358,186 @@ class Server:
          self.app.post(f"{prefix}/reranking")(self.reranking)
          self.app.post(f"{prefix}/rerank")(self.reranking)

+         # Migration routes
+         self.app.get(f"{prefix}/migration/incompatible-models")(
+             self.get_incompatible_models
+         )
+         self.app.post(f"{prefix}/migration/cleanup")(
+             self.cleanup_incompatible_models
+         )
+
+     async def add_local_model(
+         self,
+         model_name: str = Form(...),
+         checkpoint: str = Form(""),
+         recipe: str = Form(...),
+         reasoning: bool = Form(False),
+         vision: bool = Form(False),
+         mmproj: str = Form(None),
+         model_files: List[UploadFile] = None,
+     ):
+         from huggingface_hub.constants import HF_HUB_CACHE
+         from lemonade.tools.llamacpp.utils import parse_checkpoint
+
+         # Upload and register a local model from files.
+         try:
+             if not model_files:
+                 raise HTTPException(
+                     status_code=status.HTTP_400_BAD_REQUEST,
+                     detail="No model files provided for upload",
+                 )
+
+             if not model_name.startswith("user."):
+                 raise HTTPException(
+                     status_code=status.HTTP_400_BAD_REQUEST,
+                     detail="Model name must start with 'user.'",
+                 )
+
+             valid_recipes = ["llamacpp", "oga-npu", "oga-hybrid", "oga-cpu"]
+             if recipe not in valid_recipes:
+                 raise HTTPException(
+                     status_code=status.HTTP_400_BAD_REQUEST,
+                     detail=f"Invalid recipe. Must be one of: {', '.join(valid_recipes)}",
+                 )
+
+             if recipe == "llamacpp" and not any(
+                 f.filename.lower().endswith(".gguf") for f in model_files
+             ):
+                 raise HTTPException(
+                     status_code=status.HTTP_400_BAD_REQUEST,
+                     detail="At least one .gguf file is required for llamacpp",
+                 )
+
+             # Check if model name already exists
+             if model_name in ModelManager().supported_models:
+                 raise HTTPException(
+                     status_code=status.HTTP_409_CONFLICT,
+                     detail=(
+                         f"Model name '{model_name}' already exists. "
+                         "Please use a different name."
+                     ),
+                 )
+
+             model_name_clean = model_name.replace("user.", "")
+
+             # Files are saved to models--{model_name_clean}
+             # Note: This is based on the user's custom model name, NOT the checkpoint field
+             repo_cache_name = model_name_clean.replace("/", "--")
+             snapshot_path = os.path.join(HF_HUB_CACHE, f"models--{repo_cache_name}")
+             os.makedirs(snapshot_path, exist_ok=True)
+
+             # Extract variant from checkpoint field if provided
+             # checkpoint field format: "folder:variant" or just "folder"
+             variant = None
+             if checkpoint and ":" in checkpoint:
+                 _, variant = parse_checkpoint(checkpoint)
+                 # variant now contains just the variant[can be with or without the
+                 # .gguf extension] filename (e.g., "LFM2-VL-1.6B-F16 or LFM2-VL-1.6B-F16.gguf")
+
+             # Save uploaded files, preserving folder structure
+             for file in model_files:
+                 relative_path = file.filename
+                 path_parts = relative_path.split("/")
+
+                 if len(path_parts) > 1:
+                     internal_path = "/".join(path_parts[1:])
+                     file_path = os.path.join(snapshot_path, internal_path)
+                 else:
+                     file_path = os.path.join(snapshot_path, path_parts[0])
+
+                 os.makedirs(os.path.dirname(file_path), exist_ok=True)
+                 with open(file_path, "wb") as f:
+                     content = await file.read()
+                     f.write(content)
+
+             # Resolve actual file paths after upload (for faster loading later)
+             resolved_checkpoint = None
+             resolved_mmproj = None
+
+             # For OGA models, find genai_config.json
+             if recipe.startswith("oga-"):
+                 for root, _, files in os.walk(snapshot_path):
+                     if "genai_config.json" in files:
+                         resolved_checkpoint = root
+                         break
+                 if not resolved_checkpoint:
+                     resolved_checkpoint = snapshot_path
+
+             # For llamacpp models, find the GGUF file
+             elif recipe == "llamacpp":
+                 gguf_file_found = None
+
+                 # If variant is specified, look for that specific file
+                 if variant:
+                     search_term = (
+                         variant if variant.endswith(".gguf") else f"{variant}.gguf"
+                     )
+                     for root, _, files in os.walk(snapshot_path):
+                         if search_term in files:
+                             gguf_file_found = os.path.join(root, search_term)
+                             break
+
+                 # If no variant or variant not found, search for any .gguf file (excluding mmproj)
+                 if not gguf_file_found:
+                     for root, _, files in os.walk(snapshot_path):
+                         gguf_files = [
+                             f
+                             for f in files
+                             if f.endswith(".gguf") and "mmproj" not in f.lower()
+                         ]
+                         if gguf_files:
+                             gguf_file_found = os.path.join(root, gguf_files[0])
+                             break
+
+                 resolved_checkpoint = (
+                     gguf_file_found if gguf_file_found else snapshot_path
+                 )
+
+             # Search for mmproj file if provided
+             if mmproj:
+                 for root, _, files in os.walk(snapshot_path):
+                     if mmproj in files:
+                         resolved_mmproj = os.path.join(root, mmproj)
+                         break
+
+             # Build checkpoint for registration
+             # For llamacpp with resolved path, store the full path relative to HF_HUB_CACHE
+             if resolved_checkpoint:
+                 # Store as relative path from HF_HUB_CACHE for portability
+                 checkpoint_to_register = os.path.relpath(
+                     resolved_checkpoint, HF_HUB_CACHE
+                 )
+             elif variant:
+                 checkpoint_to_register = f"models--{repo_cache_name}:{variant}"
+             else:
+                 checkpoint_to_register = f"models--{repo_cache_name}"
+
+             # Register the model
+             ModelManager().register_local_model(
+                 model_name=model_name,
+                 checkpoint=checkpoint_to_register,
+                 recipe=recipe,
+                 reasoning=reasoning,
+                 vision=vision,
+                 mmproj=resolved_mmproj if resolved_mmproj else mmproj,
+                 snapshot_path=snapshot_path,
+             )
+
+             # Refresh local models
+             self.local_models = ModelManager().downloaded_models_enabled
+
+             return {
+                 "status": "success",
+                 "message": f"Model {model_name} uploaded and registered successfully",
+             }
+         except Exception as e:
+             if os.path.exists(checkpoint_to_register):
+                 shutil.rmtree(checkpoint_to_register)
+             raise HTTPException(
+                 status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+                 detail=f"Failed to upload model: {str(e)}",
+             )
+
      async def set_log_level(self, config: LogLevelConfig):
          """
          Set the logging level of the server.
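For orientation, a minimal client-side sketch of how the new add-local-model endpoint above might be called. The host, port, and file name are illustrative assumptions; the form fields, the "user." prefix requirement, and the valid recipes come from the handler above.

    import requests

    # Illustrative only: assumes a Lemonade server on localhost:8000 and the
    # /api/v1 prefix; the GGUF filename is a placeholder.
    with open("my-model.Q4_K_M.gguf", "rb") as gguf:
        resp = requests.post(
            "http://localhost:8000/api/v1/add-local-model",
            data={
                "model_name": "user.my-model",  # must start with "user."
                "recipe": "llamacpp",  # or oga-npu / oga-hybrid / oga-cpu
            },
            files=[("model_files", ("my-model.Q4_K_M.gguf", gguf))],
        )
    print(resp.json())  # {"status": "success", "message": "..."} on success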
@@ -454,13 +656,13 @@ class Server:
          )
          file_handler.setLevel(logging_level)
          file_handler.setFormatter(uvicorn_formatter)
-         file_handler.addFilter(WebsocketTextFilter())
+         file_handler.addFilter(ServerLogFilter(self))

          # Set up console handler
          console_handler = logging.StreamHandler()
          console_handler.setLevel(logging_level)
          console_handler.setFormatter(uvicorn_formatter)
-         console_handler.addFilter(WebsocketTextFilter())
+         console_handler.addFilter(ServerLogFilter(self))

          # Configure root logger with both handlers
          logging.basicConfig(
@@ -807,6 +1009,11 @@ class Server:

              # Keep track of the full response for tool call extraction
              full_response = ""
+
+             # Track whether we're still in the thinking phase (before </think> tag)
+             in_thinking_phase = self.llm_loaded.reasoning
+             reasoning_buffer = ""  # Accumulate reasoning tokens to detect </think>
+
              try:
                  async for token in self._generate_tokens(**generation_args):
                      # Handle client disconnect: stop generation and exit
@@ -845,7 +1052,53 @@ class Server:
                            )
                        )

-                     # Create a ChatCompletionChunk
+                     # Create a ChatCompletionChunk with reasoning_content support
+                     # If we're in reasoning mode and haven't seen </think> yet,
+                     # send tokens as reasoning_content instead of content
+                     delta_content = None
+                     delta_reasoning = None
+
+                     if reasoning_first_token:
+                         # First token - include opening tag in reasoning
+                         delta_reasoning = "<think>" + token
+                         reasoning_first_token = False
+                         reasoning_buffer = token
+                     elif in_thinking_phase:
+                         # Still in thinking phase - accumulate and check for </think>
+                         reasoning_buffer += token
+
+                         # Check if we've seen the closing tag
+                         if "</think>" in reasoning_buffer:
+                             # Split at the closing tag
+                             before_close, after_close = reasoning_buffer.split(
+                                 "</think>", 1
+                             )
+
+                             # Send everything before + closing tag as reasoning
+                             if before_close or not reasoning_buffer.startswith(
+                                 "</think>"
+                             ):
+                                 delta_reasoning = before_close + "</think>"
+                             else:
+                                 delta_reasoning = "</think>"
+
+                             # Everything after goes to content (will be sent in next iteration)
+                             # For now, mark that we've exited thinking phase
+                             in_thinking_phase = False
+
+                             # If there's content after </think>, we need to send it too
+                             # But we send it in the current chunk as regular content
+                             if after_close:
+                                 # We have both reasoning and content in this token
+                                 # Send reasoning first, content will accumulate
+                                 delta_content = after_close
+                         else:
+                             # Still accumulating thinking, send as reasoning_content
+                             delta_reasoning = token
+                     else:
+                         # Normal content (after thinking phase ended)
+                         delta_content = token
+
                      chunk = ChatCompletionChunk.model_construct(
                          id="0",
                          object="chat.completion.chunk",
@@ -855,11 +1108,8 @@ class Server:
                              Choice.model_construct(
                                  index=0,
                                  delta=ChoiceDelta(
-                                     content=(
-                                         "<think>" + token
-                                         if reasoning_first_token
-                                         else token
-                                     ),
+                                     content=delta_content,
+                                     reasoning_content=delta_reasoning,
                                      function_call=None,
                                      role="assistant",
                                      tool_calls=openai_tool_calls,
@@ -872,7 +1122,6 @@ class Server:
                      )

                      # Format as SSE
-                     reasoning_first_token = False
                      yield f"data: {chunk.model_dump_json()}\n\n".encode("utf-8")

                  # Send the [DONE] marker only if still connected
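In plain terms, the streaming change above routes tokens to reasoning_content until the closing </think> tag is seen, then switches to content. Below is a simplified stand-alone sketch of that idea; unlike the server code, it assumes the closing tag arrives within a single token and does not buffer across tokens.

    def split_reasoning(tokens):
        # Yield OpenAI-style deltas: reasoning_content before </think>, content after.
        in_thinking = True
        for token in tokens:
            if not in_thinking:
                yield {"content": token}
            elif "</think>" in token:
                before, after = token.split("</think>", 1)
                yield {"reasoning_content": before + "</think>"}
                in_thinking = False
                if after:
                    yield {"content": after}
            else:
                yield {"reasoning_content": token}

    # Example:
    # list(split_reasoning(["Let me check.", "</think>", "The answer is 4."]))
    # -> [{'reasoning_content': 'Let me check.'},
    #     {'reasoning_content': '</think>'},
    #     {'content': 'The answer is 4.'}]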
@@ -1570,9 +1819,10 @@ class Server:
          Load a registered LLM into system memory. Install the model first, if needed.
          config: the information required to load the model
          """
+         from huggingface_hub.constants import HF_HUB_CACHE
+
          try:
              await self._load_lock.acquire()
-
              # Acquire all generate locks
              for _ in range(self.max_concurrent_generations):
                  await self._generate_semaphore.acquire()
@@ -1597,6 +1847,38 @@ class Server:
              # Get additional properties from the model registry
              config_to_use = LoadConfig(**supported_models[config.model_name])

+             # For locally uploaded models, convert the relative checkpoint path to absolute path
+             model_source = supported_models.get(config.model_name, {}).get(
+                 "source", None
+             )
+             if (
+                 model_source == "local_upload"
+                 and config_to_use.checkpoint
+                 and not config_to_use.recipe.startswith("hf-")
+             ):
+                 # Check if checkpoint is a relative path (stored during upload)
+                 if not os.path.isabs(config_to_use.checkpoint):
+                     # Convert relative path to absolute by joining with HF_HUB_CACHE
+                     absolute_checkpoint = os.path.join(
+                         HF_HUB_CACHE, config_to_use.checkpoint
+                     )
+                     if os.path.exists(absolute_checkpoint):
+                         config_to_use.checkpoint = absolute_checkpoint
+                     else:
+                         logging.warning(
+                             f"Checkpoint path does not exist: {absolute_checkpoint}"
+                         )
+
+                 # Also resolve mmproj path if present
+                 if config_to_use.mmproj and not os.path.isabs(config_to_use.mmproj):
+                     absolute_mmproj = os.path.join(HF_HUB_CACHE, config_to_use.mmproj)
+                     if os.path.exists(absolute_mmproj):
+                         config_to_use.mmproj = absolute_mmproj
+                     else:
+                         logging.warning(
+                             f"MMProj path does not exist: {absolute_mmproj}"
+                         )
+
              # Caching mechanism: if the checkpoint is already loaded there is nothing else to do
              if (
                  self.llm_loaded
@@ -1782,6 +2064,42 @@ class Server:
              return
          await log_streamer(websocket, self.log_file)

+     async def get_incompatible_models(self):
+         """
+         Get information about incompatible RyzenAI models in the cache.
+         """
+         try:
+             return ModelManager().get_incompatible_ryzenai_models()
+         except Exception as e:
+             raise HTTPException(
+                 status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+                 detail=f"Failed to scan for incompatible models: {str(e)}",
+             )
+
+     async def cleanup_incompatible_models(self, request: Request):
+         """
+         Delete selected incompatible RyzenAI models from the cache.
+         """
+         try:
+             body = await request.json()
+             model_paths = body.get("model_paths", [])
+
+             if not model_paths:
+                 raise HTTPException(
+                     status_code=status.HTTP_400_BAD_REQUEST,
+                     detail="No model_paths provided",
+                 )
+
+             result = ModelManager().cleanup_incompatible_models(model_paths)
+             return result
+         except HTTPException:
+             raise
+         except Exception as e:
+             raise HTTPException(
+                 status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+                 detail=f"Failed to cleanup models: {str(e)}",
+             )
+
+

  # This file was originally licensed under Apache 2.0. It has been modified.
  # Modifications Copyright (c) 2025 AMD
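To illustrate the two new migration routes, here is a hedged client sketch. The host, port, and the placeholder cache path are assumptions; the model_paths key is required by the handler above, which returns a 400 response when it is missing.

    import requests

    base = "http://localhost:8000/api/v1"  # assumed default host/port and prefix

    # Report incompatible RyzenAI models found in the cache.
    report = requests.get(f"{base}/migration/incompatible-models").json()
    print(report)

    # Delete a selection of them; omitting "model_paths" yields a 400 response.
    requests.post(
        f"{base}/migration/cleanup",
        json={"model_paths": ["/path/to/incompatible/model"]},  # placeholder path
    )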