lemonade-sdk 7.0.1__py3-none-any.whl → 7.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of lemonade-sdk has been flagged as a potentially problematic release.

lemonade/cli.py CHANGED
@@ -19,6 +19,7 @@ import lemonade.cache as cache
  from lemonade.tools.mmlu import AccuracyMMLU
  from lemonade.tools.humaneval import AccuracyHumaneval
  from lemonade.tools.perplexity import AccuracyPerplexity
+ from lemonade.tools.accuracy import LMEvalHarness
  from lemonade.tools.prompt import LLMPrompt
  from lemonade.tools.quark.quark_load import QuarkLoad
  from lemonade.tools.quark.quark_quantize import QuarkQuantize
@@ -36,6 +37,7 @@ def main():
          AccuracyMMLU,
          AccuracyHumaneval,
          AccuracyPerplexity,
+         LMEvalHarness,
          LLMPrompt,
          HuggingfaceBench,
          OgaBench,
lemonade/tools/accuracy.py ADDED
@@ -0,0 +1,335 @@
+ import argparse
+ import json
+ import os
+ import socket
+ import subprocess
+ import sys
+ import time
+ from typing import Optional
+
+ import requests
+
+ from lemonade.state import State
+ from lemonade.tools import Tool
+ import lemonade.common.printing as printing
+ import lemonade.common.build as build
+
+ from lemonade.tools.server.thread_utils import ServerRunner
+
+
+ def is_port_in_use(port, host="localhost"):
+     """
+     Check if a port is in use
+     """
+     with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+         return s.connect_ex((host, port)) == 0
+
+
+ class LMEvalHarness(Tool):
+     """
+     Tool for evaluating LLMs using lm-eval-harness on industry standard benchmarks
+     like MMLU, GSM8k, and more. See docs/lemonade/lm_eval.md for more details.
+     """
+
+     unique_name = "lm-eval-harness"
+
+     def __init__(self):
+         super().__init__(
+             monitor_message="Evaluate model accuracy using ElutherAI's lm-eval-harness"
+         )
+         self.status_stats = []
+         self.server_runner = None
+
+     @staticmethod
+     def parser(add_help: bool = True) -> argparse.ArgumentParser:
+         parser = __class__.helpful_parser(
+             short_description="Evaluate model using lm-eval-harness",
+             add_help=add_help,
+         )
+
+         parser.add_argument(
+             "--task",
+             type=str,
+             required=True,
+             help="Task(s) to evaluate on (e.g., gsm8k, mmlu)",
+         )
+
+         parser.add_argument(
+             "--server-port", type=int, default=8000, help="Port to use for the server"
+         )
+
+         parser.add_argument(
+             "--num-fewshot",
+             type=int,
+             default=0,
+             help="Number of examples in few-shot prompts",
+         )
+
+         parser.add_argument(
+             "--limit",
+             type=int,
+             default=None,
+             help="Limit the number of examples per task",
+         )
+
+         parser.add_argument(
+             "--log-samples",
+             action="store_true",
+             help="Log samples for each task to log file",
+         )
+
+         parser.add_argument(
+             "--output-path",
+             type=str,
+             default=None,
+             help="Path to save evaluation results",
+         )
+
+         return parser
+
+     def _process_results(self, results_dir, state):
+         """Process evaluation results and save to state stats"""
+         if not os.path.exists(results_dir) or not os.path.isdir(results_dir):
+             printing.log_warning(f"Results directory not found at {results_dir}")
+             return
+
+         model_dirs = [
+             d
+             for d in os.listdir(results_dir)
+             if os.path.isdir(os.path.join(results_dir, d))
+         ]
+
+         if not model_dirs:
+             printing.log_warning(f"No model directories found in {results_dir}")
+             return
+
+         model_dir = os.path.join(results_dir, model_dirs[0])
+         printing.log_info(f"Found model directory: {model_dir}")
+
+         # Find the results JSON file with timestamp
+         results_files = [
+             f
+             for f in os.listdir(model_dir)
+             if f.startswith("results_") and f.endswith(".json")
+         ]
+
+         if not results_files:
+             printing.log_warning(f"No results files found in {model_dir}")
+             return
+
+         # Sort by timestamp
+         results_files.sort(reverse=True)
+         results_file_path = os.path.join(model_dir, results_files[0])
+         printing.log_info(f"Processing results from {results_file_path}")
+
+         # Read and process results
+         try:
+             with open(results_file_path, "r", encoding="utf-8") as f:
+                 results = json.load(f)
+
+             # Extract and display metrics
+             if "results" in results:
+                 for task_name, metrics in results["results"].items():
+                     printing.log_info(f"Results for {task_name}:")
+
+                     for metric, value in metrics.items():
+                         if isinstance(value, (int, float)) and not metric.startswith(
+                             "alias"
+                         ):
+                             # Format metric name for stats
+                             clean_metric = metric.replace(",", "_")
+                             stat_name = f"lm_eval_{task_name}_{clean_metric}"
+
+                             # Save to state stats as percentage
+                             state.save_stat(stat_name, float(value) * 100)
+                             state.save_stat(f"{stat_name}_units", "%")
+                             self.status_stats.append(stat_name)
+
+                             printing.log_info(
+                                 f" {metric}: {value:.4f} ({value*100:.2f}%)"
+                             )
+
+             # Save summary metrics if available
+             avg_metrics = {}
+             if "higher_is_better" in results:
+                 for metric_type in results["higher_is_better"].values():
+                     for metric in metric_type.keys():
+                         if metric not in avg_metrics:
+                             avg_metrics[metric] = []
+
+                 for task_metrics in results["results"].values():
+                     for metric, value in task_metrics.items():
+                         if isinstance(value, (int, float)) and not metric.startswith(
+                             "alias"
+                         ):
+                             base_metric = metric.split(",")[0]
+                             if base_metric in avg_metrics:
+                                 avg_metrics[base_metric].append(value)
+
+                 # Calculate and save averages
+                 for metric, values in avg_metrics.items():
+                     if values:
+                         avg_value = sum(values) / len(values)
+                         stat_name = f"lm_eval_average_{metric}"
+                         state.save_stat(stat_name, float(avg_value) * 100)
+                         state.save_stat(f"{stat_name}_units", "%")
+                         self.status_stats.append(stat_name)
+                         printing.log_info(
+                             f"Average {metric}: {avg_value:.4f} ({avg_value*100:.2f}%)"
+                         )
+
+         except (IOError, json.JSONDecodeError) as e:
+             printing.log_error(f"Error processing results: {e}")
+
+     def run(
+         self,
+         state: State,
+         task: str,
+         server_port: int = 8000,
+         server_host: str = "localhost",
+         num_fewshot: int = 0,
+         limit: Optional[int] = None,
+         log_samples: bool = False,
+         output_path: Optional[str] = None,
+     ) -> State:
+
+         model = state.model
+         tokenizer = state.tokenizer
+
+         if model is None or tokenizer is None:
+             raise ValueError(
+                 "Model and tokenizer must be loaded in state before running lm-eval-harness"
+             )
+
+         # Set up output path
+         if output_path is None:
+             output_path = os.path.join(
+                 build.output_dir(state.cache_dir, state.build_name), "lm_eval_results"
+             )
+
+         os.makedirs(output_path, exist_ok=True)
+
+         # Check if port is already in use
+         if is_port_in_use(server_port, server_host):
+             error_msg = (
+                 f"Port {server_port} is already in use. "
+                 "Please close all applications using this port and try again."
+             )
+             printing.log_error(error_msg)
+             raise RuntimeError(error_msg)
+
+         # Retroactively determine recipe based on model type to select correct iterator
+         # The model is already loaded in server, so we only need recipe for iterator selection
+         checkpoint = getattr(state, "checkpoint", "unknown")
+         if "OrtGenaiModel" in str(type(model)):
+             recipe = "oga-"
+         else:
+             recipe = "unknown"
+
+         # Start the server thread
+         self.server_runner = ServerRunner(
+             model=model,
+             tokenizer=tokenizer,
+             checkpoint=checkpoint,
+             recipe=recipe,
+             host=server_host,
+             port=server_port,
+         )
+         self.server_runner.start()
+
+         # Wait for server initialization
+         printing.log_info("Waiting for server initialization...")
+
+         # Wait for server to start and be responsive
+         server_url = f"http://{server_host}:{server_port}"
+         max_retries = 30
+         retry_delay = 1
+
+         printing.log_info(f"Checking if server is available at {server_url}...")
+         for i in range(max_retries):
+             try:
+                 response = requests.get(f"{server_url}/api/v0/health", timeout=2)
+                 if response.status_code == 200:
+                     printing.log_info(f"Server is ready after {i+1} attempts")
+                     break
+             except requests.exceptions.RequestException:
+                 if i < max_retries - 1:
+                     time.sleep(retry_delay)
+                 else:
+                     printing.log_error(
+                         f"Server did not start after {max_retries} attempts"
+                     )
+                     raise RuntimeError("Failed to start the server")
+
+         # Build API URL
+         results_file = os.path.join(output_path, f"{task}_results")
+
+         printing.log_info(f"Running lm-eval-harness on {task}...")
+
+         # Build lm-eval-harness command
+         cmd = [
+             "lm_eval",
+             "--model",
+             "local-completions",
+             "--tasks",
+             task,
+             "--model_args",
+             (
+                 f"model={checkpoint},"
+                 f"base_url={server_url}/api/v0/completions,"
+                 f"num_concurrent=1,"
+                 f"max_retries=5,"
+                 f"retry_timeout=10,"
+                 f"tokenized_requests=False"
+             ),
+             "--num_fewshot",
+             str(num_fewshot),
+             "--output_path",
+             results_file,
+         ]
+
+         if limit is not None:
+             cmd.extend(["--limit", str(limit)])
+
+         if log_samples:
+             cmd.extend(["--log_samples"])
+
+         try:
+             # On Windows, set UTF-8 mode to handle Unicode output
+             env = os.environ.copy()
+             if sys.platform == "win32":
+                 env["PYTHONIOENCODING"] = "utf-8"
+
+             # Execute lm-eval-harness command
+             result = subprocess.run(
+                 cmd, check=True, text=True, capture_output=True, env=env
+             )
+
+             # Log relevant output and skip any parts that might cause encoding issues
+             try:
+                 printing.log_info(result.stdout)
+             except UnicodeEncodeError:
+                 printing.log_info(
+                     "Results obtained successfully but couldn't display due to encoding issues"
+                 )
+
+             # Process results from the correct location
+             results_dir = os.path.join(output_path, f"{task}_results")
+             self._process_results(results_dir, state)
+
+         except subprocess.CalledProcessError as e:
+             printing.log_error(f"Error running lm-eval-harness: {e}")
+             printing.log_error(f"stderr: {e.stderr}")
+         except (IOError, ValueError, requests.RequestException) as e:
+             printing.log_error(f"Error: {e}")
+         finally:
+             # Shut down server
+             if self.server_runner and self.server_runner.is_alive():
+                 printing.log_info("Shutting down server runner...")
+                 self.server_runner.shutdown()
+
+             # Make sure we don't have any lingering references to state's model/tokenizer
+             # that could prevent garbage collection
+             self.server_runner = None
+
+         return state
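
Note: the tool above registers under the unique_name "lm-eval-harness", so it can be chained after a model-loading tool on the lemonade command line. A hypothetical invocation is sketched below; the oga-load stage and its flags are illustrative placeholders, while --task, --num-fewshot, and --limit are taken from the parser shown above:

    lemonade -i amd/Llama-3.2-1B-Instruct-awq-uint4-float16-cpu-onnx \
        oga-load --device cpu \
        lm-eval-harness --task gsm8k --num-fewshot 5 --limit 50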
lemonade/tools/server/llamacpp.py CHANGED
@@ -16,9 +16,9 @@ from openai import OpenAI

  from lemonade_server.model_manager import ModelManager
  from lemonade.tools.server.pydantic_models import ChatCompletionRequest
+ from lemonade.tools.server.port_utils import find_free_port

  LLAMA_VERSION = "b5543"
- LLAMA_SERVER_PORT = "8081"

  LLAMA_SERVER_EXE_DIR = os.path.join(
      os.path.dirname(sys.executable),
@@ -43,6 +43,23 @@ class LlamaTelemetry:
          self.tokens_per_second = None
          self.prompt_eval_time = None
          self.eval_time = None
+         self.port = None
+
+     def choose_port(self):
+         """
+         Users probably don't care what port we start llama-server on, so let's
+         search for an empty port
+         """
+
+         self.port = find_free_port()
+
+         if self.port is None:
+             msg = "Failed to find an empty port to start llama-server on"
+             logging.error(msg)
+             raise HTTPException(
+                 status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+                 detail=msg,
+             )

      def parse_telemetry_line(self, line: str):
          """
@@ -128,10 +145,12 @@ def _log_subprocess_output(
              break


- def _wait_for_load(llama_server_process: subprocess.Popen, fail_message: str):
+ def _wait_for_load(
+     llama_server_process: subprocess.Popen, port: int, fail_message: str
+ ):
      status_code = None
      while not llama_server_process.poll() and status_code != 200:
-         health_url = f"http://localhost:{LLAMA_SERVER_PORT}/health"
+         health_url = f"http://localhost:{port}/health"
          try:
              health_response = requests.get(health_url)
          except requests.exceptions.ConnectionError:
@@ -152,12 +171,17 @@ def _launch_llama_subprocess(
      Launch llama server subprocess with GPU or CPU configuration
      """

+     # Find a port, and save it in the telemetry object for future reference
+     # by other functions
+     telemetry.choose_port()
+
      base_command = [
          LLAMA_SERVER_EXE_PATH,
          "-m",
          model_path,
          "--port",
-         LLAMA_SERVER_PORT,
+         str(telemetry.port),
+         "--jinja",
      ]

      # Configure GPU layers: 99 for GPU, 0 for CPU-only
@@ -227,6 +251,7 @@ def server_load(checkpoint: str, model_reference: str, telemetry: LlamaTelemetry
      # Check the /health endpoint until GPU server is ready
      _wait_for_load(
          llama_server_process,
+         telemetry.port,
          f"Loading {model_reference} on GPU didn't work, re-attempting on CPU",
      )

@@ -239,6 +264,7 @@ def server_load(checkpoint: str, model_reference: str, telemetry: LlamaTelemetry
      # Check the /health endpoint until CPU server is ready
      _wait_for_load(
          llama_server_process,
+         telemetry.port,
          f"Loading {model_reference} on CPU didn't work",
      )

@@ -254,7 +280,7 @@ def server_load(checkpoint: str, model_reference: str, telemetry: LlamaTelemetry
  def chat_completion(
      chat_completion_request: ChatCompletionRequest, telemetry: LlamaTelemetry
  ):
-     base_url = f"http://127.0.0.1:{LLAMA_SERVER_PORT}/v1"
+     base_url = f"http://127.0.0.1:{telemetry.port}/v1"
      client = OpenAI(
          base_url=base_url,
          api_key="lemonade",
lemonade/tools/server/port_utils.py ADDED
@@ -0,0 +1,57 @@
+ import socketserver
+ import sys
+ import logging
+ from contextlib import asynccontextmanager
+ from fastapi import FastAPI
+
+
+ def find_free_port():
+     """
+     Scans for an unoccupied TCP port
+
+     Returns the port number as an int on success
+     Returns None if no port can be found
+     """
+
+     try:
+         with socketserver.TCPServer(("localhost", 0), None) as s:
+             return s.server_address[1]
+     # pylint: disable=broad-exception-caught
+     except Exception:
+         return None
+
+
+ @asynccontextmanager
+ async def lifespan(app: FastAPI):
+     # Code here will run when the application starts up
+     # Check if console can handle Unicode by testing emoji encoding
+
+     try:
+         if sys.stdout.encoding:
+             "🍋".encode(sys.stdout.encoding)
+         use_emojis = True
+     except (UnicodeEncodeError, AttributeError):
+         use_emojis = False
+
+     if use_emojis:
+         logging.info(
+             "\n"
+             "\n"
+             "🍋 Lemonade Server Ready!\n"
+             f"🍋 Open http://localhost:{app.port} in your browser for:\n"
+             "🍋 💬 chat\n"
+             "🍋 💻 model management\n"
+             "🍋 📄 docs\n"
+         )
+     else:
+         logging.info(
+             "\n"
+             "\n"
+             "[Lemonade] Lemonade Server Ready!\n"
+             f"[Lemonade] Open http://localhost:{app.port} in your browser for:\n"
+             "[Lemonade] chat\n"
+             "[Lemonade] model management\n"
+             "[Lemonade] docs\n"
+         )
+
+     yield
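
Note: the lifespan hook above reads app.port when it logs the startup banner. Based on the serve.py changes that follow (which import lifespan from port_utils and set self.app.port before starting uvicorn), the wiring looks roughly like this minimal sketch; the literal port value is illustrative:

    from fastapi import FastAPI
    from lemonade.tools.server.port_utils import lifespan

    app = FastAPI(lifespan=lifespan)
    app.port = 8000  # read by lifespan() when it prints the startup banner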
lemonade/tools/server/serve.py CHANGED
@@ -8,7 +8,6 @@ import traceback
  from typing import Optional, Union
  import json
  import subprocess
- from contextlib import asynccontextmanager
  from pathlib import Path

  from fastapi import FastAPI, HTTPException, status, Request
@@ -16,6 +15,8 @@ from fastapi.responses import StreamingResponse
  from fastapi.middleware.cors import CORSMiddleware
  from fastapi.staticfiles import StaticFiles
  import uvicorn
+ from uvicorn.config import Config
+ from uvicorn.server import Server as UvicornServer
  from transformers import TextIteratorStreamer, StoppingCriteria, StoppingCriteriaList
  from tabulate import tabulate

@@ -57,7 +58,7 @@ from lemonade.tools.server.pydantic_models import (
  )
  from lemonade.tools.server.tool_calls import extract_tool_calls, get_tool_call_pattern
  from lemonade.tools.server.instructions import get_instructions_html
-
+ from lemonade.tools.server.port_utils import lifespan

  DEFAULT_PORT = 8000
  DEFAULT_LOG_LEVEL = "info"
@@ -243,15 +244,22 @@ class Server(ManagementTool):

          return parser

-     def run(
+     def _setup_server_common(
          self,
-         # ManagementTool has a required cache_dir arg, but
-         # we always use the default cache directory
-         _=None,
-         port: int = DEFAULT_PORT,
-         log_level: str = DEFAULT_LOG_LEVEL,
+         port: int,
          truncate_inputs: bool = False,
+         log_level: str = DEFAULT_LOG_LEVEL,
+         threaded_mode: bool = False,
      ):
+         """
+         Common setup logic shared between run() and run_in_thread().
+
+         Args:
+             port: Port number for the server
+             truncate_inputs: Whether to truncate inputs if they exceed max length
+             log_level: Logging level to configure
+             threaded_mode: Whether this is being set up for threaded execution
+         """
          # Store truncation settings
          self.truncate_inputs = truncate_inputs

@@ -265,22 +273,27 @@

          logging.trace = trace

-         # Configure logging to match uvicorn's format
-         logging_level = getattr(logging, log_level.upper())
-         logging.basicConfig(
-             level=logging_level,
-             format="%(levelprefix)s %(message)s",
-             datefmt="%Y-%m-%d %H:%M:%S",
-         )
+         # Configure logging based on mode
+         if threaded_mode:
+             # Configure logging for warning level (to reduce noise in threaded execution)
+             logging.getLogger("uvicorn.error").setLevel(logging.WARNING)
+         else:
+             # Configure logging to match uvicorn's format
+             logging_level = getattr(logging, log_level.upper())
+             logging.basicConfig(
+                 level=logging_level,
+                 format="%(levelprefix)s %(message)s",
+                 datefmt="%Y-%m-%d %H:%M:%S",
+             )

-         # Add uvicorn's log formatter
-         logging.root.handlers[0].formatter = uvicorn.logging.DefaultFormatter(
-             fmt="%(levelprefix)s %(message)s",
-             use_colors=True,
-         )
+             # Add uvicorn's log formatter
+             logging.root.handlers[0].formatter = uvicorn.logging.DefaultFormatter(
+                 fmt="%(levelprefix)s %(message)s",
+                 use_colors=True,
+             )

-         # Ensure the log level is properly set
-         logging.getLogger().setLevel(logging_level)
+             # Ensure the log level is properly set
+             logging.getLogger().setLevel(logging_level)

          # Update debug logging state after setting log level
          self.debug_logging_enabled = logging.getLogger().isEnabledFor(logging.DEBUG)
@@ -293,8 +306,62 @@
          # that the lifespan can access it
          self.app.port = port

+     def run(
+         self,
+         # ManagementTool has a required cache_dir arg, but
+         # we always use the default cache directory
+         _=None,
+         port: int = DEFAULT_PORT,
+         log_level: str = DEFAULT_LOG_LEVEL,
+         truncate_inputs: bool = False,
+     ):
+         # Common setup
+         self._setup_server_common(
+             port=port,
+             truncate_inputs=truncate_inputs,
+             log_level=log_level,
+             threaded_mode=False,
+         )
+
          uvicorn.run(self.app, host="localhost", port=port, log_level=log_level)

+     def run_in_thread(
+         self,
+         port: int = DEFAULT_PORT,
+         host: str = "localhost",
+         log_level: str = "warning",
+         truncate_inputs: bool = False,
+     ):
+         """
+         Set up the server for running in a thread.
+         Returns a uvicorn server instance that can be controlled externally.
+         """
+         # Common setup
+         self._setup_server_common(
+             port=port,
+             truncate_inputs=truncate_inputs,
+             log_level=log_level,
+             threaded_mode=True,
+         )
+
+         class CustomServer(UvicornServer):
+             """Custom Uvicorn server that can be properly shutdown from another thread"""
+
+             def install_signal_handlers(self):
+                 pass
+
+         # Configure the server
+         config = Config(
+             app=self.app,
+             host=host,
+             port=port,
+             log_level=log_level,
+             log_config=None,
+         )
+
+         # Create and return the uvicorn server
+         return CustomServer(config=config)
+
      async def _show_telemetry(self):
          """
          Show telemetry data in debug mode.
@@ -1241,6 +1308,8 @@
                  "status": "success",
                  "message": f"Loaded model: {model_reference}",
              }
+         except HTTPException:
+             raise
          except Exception:  # pylint: disable=broad-exception-caught
              self.model_load_failure(model_reference)

@@ -1339,22 +1408,5 @@
          return response


- @asynccontextmanager
- async def lifespan(app: FastAPI):
-     # Code here will run when the application starts up
-
-     logging.info(
-         "\n"
-         "\n"
-         "🍋 Lemonade Server Ready!\n"
-         f"🍋 Open http://localhost:{app.port} in your browser for:\n"
-         "🍋 💬 chat\n"
-         "🍋 💻 model management\n"
-         "🍋 📄 docs\n"
-     )
-
-     yield
-
-
  # This file was originally licensed under Apache 2.0. It has been modified.
  # Modifications Copyright (c) 2025 AMD
lemonade/tools/server/thread_utils.py ADDED
@@ -0,0 +1,87 @@
+ import threading
+ import logging
+ from lemonade.tools.server.serve import Server
+
+
+ class ServerRunner(threading.Thread):
+     """
+     Thread class for running the Lemonade Server with a loaded model.
+     """
+
+     def __init__(
+         self, model, tokenizer, checkpoint, recipe, host="localhost", port=8000
+     ):
+         threading.Thread.__init__(self)
+         self.model = model
+         self.tokenizer = tokenizer
+         self.checkpoint = checkpoint
+         self.recipe = recipe
+         self.host = host
+         self.port = port
+         self.server = None
+         self.ready_event = threading.Event()
+         self.shutdown_event = threading.Event()
+         self.uvicorn_server = None
+
+     def run(self):
+         try:
+             # Create the server instance
+             self.server = Server()
+
+             # Configure the server with model/tokenizer
+             self.server.model = self.model
+             self.server.tokenizer = self.tokenizer
+             self.server.llm_loaded = type(
+                 "obj",
+                 (object,),
+                 {
+                     "checkpoint": self.checkpoint,
+                     "recipe": self.recipe,
+                     "max_prompt_length": None,
+                     "reasoning": False,
+                     "model_name": "custom",
+                 },
+             )
+
+             # Set up the server for threaded execution
+             self.uvicorn_server = self.server.run_in_thread(
+                 port=self.port, host=self.host, log_level="warning"
+             )
+
+             # Set the ready event
+             self.ready_event.set()
+
+             # Run the server until shutdown is requested
+             logging.info(f"Starting server on http://{self.host}:{self.port}")
+             self.uvicorn_server.run()
+
+         except Exception as e:
+             logging.error(f"Error starting server: {e}")
+             self.ready_event.set()
+             raise
+
+     def shutdown(self):
+         """Shutdown the server"""
+         if hasattr(self, "uvicorn_server") and self.uvicorn_server:
+             logging.info("Shutting down server...")
+             self.uvicorn_server.should_exit = True
+             self.shutdown_event.set()
+
+         # Clean up resources properly to avoid memory leaks
+         if hasattr(self, "server") and self.server:
+             logging.info("Cleaning up model and tokenizer resources...")
+
+             if hasattr(self.server, "model"):
+                 self.server.model = None
+
+             if hasattr(self.server, "tokenizer"):
+                 self.server.tokenizer = None
+
+             if hasattr(self.server, "llm_loaded"):
+                 self.server.llm_loaded = None
+
+         # Clean up local references
+         if hasattr(self, "model"):
+             del self.model
+         if hasattr(self, "tokenizer"):
+             del self.tokenizer
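
Note: ServerRunner is consumed by the new lm-eval-harness tool above, which starts the thread, polls the /health endpoint, and calls shutdown() when evaluation finishes. A minimal standalone usage sketch, assuming a model and tokenizer are already loaded (the checkpoint and recipe strings are illustrative):

    runner = ServerRunner(
        model=model,
        tokenizer=tokenizer,
        checkpoint="example-checkpoint",  # illustrative
        recipe="oga-",                    # illustrative
        port=8000,
    )
    runner.start()
    runner.ready_event.wait()  # set once run_in_thread() has configured uvicorn
    # ... issue OpenAI-compatible requests against http://localhost:8000 ...
    runner.shutdown()
    runner.join()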
lemonade/version.py CHANGED
@@ -1 +1 @@
- __version__ = "7.0.1"
+ __version__ = "7.0.2"
lemonade_sdk-7.0.2.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: lemonade-sdk
- Version: 7.0.1
+ Version: 7.0.2
  Summary: Lemonade SDK: Your LLM Aide for Validation and Deployment
  Author-email: lemonade@amd.com
  Requires-Python: >=3.10, <3.12
lemonade_sdk-7.0.2.dist-info/RECORD CHANGED
@@ -1,10 +1,10 @@
  lemonade/__init__.py,sha256=W1Qk7r0rnQqFhPNHp6BIBT_q-OH3s-8Q_POoVfAmKW0,117
  lemonade/api.py,sha256=9apNWSMS4bYpYl7iqDA4CsHHOOMdjOIuJhNYSqj_jIA,3878
  lemonade/cache.py,sha256=djr2qgyUUAWlQv8FehU9qlNtCwK0IZqo82hcBDyZ3-A,2850
- lemonade/cli.py,sha256=_s-LWpaVIhOmaP0Q1qirXxNiBhdumAZ-5ub5-lRNccs,4351
+ lemonade/cli.py,sha256=ddN2QqsGMsVwydfcR7MSZu1z8_-bUgUP7dhw9lzbHa8,4424
  lemonade/sequence.py,sha256=KSH7BPsiyDKsOsg_ziQKEGsDwMmuO_YbgPRBxkZd0pw,13267
  lemonade/state.py,sha256=sdSezla7Cd7KYL90xY3p9kcNV4ndSyN6UvNLOr3vBMA,5261
- lemonade/version.py,sha256=co6LyaBArt-ahHXYZSdSER8TFZ2vVTb86CNG6X8Pxwc,22
+ lemonade/version.py,sha256=iVyoEZ1fyZz5oicAj7ERV3Eld5fVjLM_p365GVSKBpk,22
  lemonade/common/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  lemonade/common/analyze_model.py,sha256=sYWDznEUEWjx_Qekg7f1hHY4Pfe87IQ77lmsWqePgE0,803
  lemonade/common/build.py,sha256=Pk86mCr6fyBIx2zXDpq0BkdahlCmWRnwSTpShA_gwZw,7849
@@ -23,6 +23,7 @@ lemonade/profilers/__init__.py,sha256=JKVonvJ4XZ9_6sKXPWsiMLQCNyzQOxhQw5BEHR1qOf
  lemonade/profilers/memory_tracker.py,sha256=-SSBmNlrweiX59wyNtLMWiwaMOskBzNO1_cufVwteqs,9357
  lemonade/profilers/profiler.py,sha256=y_iMGr1ToQ6rcwcIcXck4ajapisLXCfHggiV-IpPF98,1666
  lemonade/tools/__init__.py,sha256=_6xRc-FHxmujoLjLjWtpYrWYEXtCSneSy-5ya01kyPk,53
+ lemonade/tools/accuracy.py,sha256=QndammQ1bmlTaF_6YDaaiJp6fpkKZDYGySdQpAgZIp8,11699
  lemonade/tools/adapter.py,sha256=4H6gfbjvqyU6qm1_-b2FE-c3a7N9OzEBeDVnIwqRDvg,3014
  lemonade/tools/bench.py,sha256=aN5LMA_EH6-ZhAH3Gf26JYL7s0eKpUd3j-bReRhzvEY,10016
  lemonade/tools/huggingface_bench.py,sha256=POE5JYzArK2FBktazOkluLNFzlLctM39B19fK5sMx-0,10017
@@ -46,20 +47,22 @@ lemonade/tools/report/llm_report.py,sha256=bVHhwCINA-Ok2EdSwAsLubsc83N3KWOVuwTgu
  lemonade/tools/report/table.py,sha256=a0TXo1X84RxCSu0un_XM3ANOlhLtPDuqtGwR7eomf2s,24853
  lemonade/tools/server/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  lemonade/tools/server/instructions.py,sha256=Lvm-tRZaYgHkyt3zQkmMChkXO6rUiLoIAunudmMr_D8,13388
- lemonade/tools/server/llamacpp.py,sha256=PeHg1DbMGcf68txFgC1CJJN5HRHEnIJ4_4EDhvqAFUI,9255
+ lemonade/tools/server/llamacpp.py,sha256=R86Q2btI9_EPpPj27vvELnF9KmKxpu3sPIIS1xW3PIA,9997
+ lemonade/tools/server/port_utils.py,sha256=24Ryz5cNU0R9L1kuVSapZoyXTZHzhF4y0Yje9MVOrE0,1535
  lemonade/tools/server/pydantic_models.py,sha256=z1RAs9hkAFkOfMiTPtmUiC3CD2P6OMI2N0J2ztNs0d4,2179
- lemonade/tools/server/serve.py,sha256=7meKOKVHaODHBYD_3dDJyaiwoC_m4z_FWniZfsZ9cCI,50655
+ lemonade/tools/server/serve.py,sha256=3JQa42WZdllKAf_DY-cal0Pc8vdBZd4vwsfhZmpheS8,52500
+ lemonade/tools/server/thread_utils.py,sha256=pK9K_6DNWoQ78NArkAX3Ym2WsxLnCs9sKTk6TitlYnI,2804
  lemonade/tools/server/tool_calls.py,sha256=xrAlQwKG-nv2xLlf8f9CDSaUbyMn8ZtHkds9iZLG9K8,5230
  lemonade/tools/server/static/styles.css,sha256=8U1EejQaqRLQ6QTCF5UG_dLPtLjRwT1menUHMDhaq2M,5045
  lemonade_install/__init__.py,sha256=26zohKg2jgr_5y7tObduWMYQg8zCTWMZHL8lfi2zZVQ,40
  lemonade_install/install.py,sha256=61qUO7kWCLcdjK0_IQZ46-rKP_AWkyznh4YpDclPKyM,28036
- lemonade_sdk-7.0.1.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- lemonade_sdk-7.0.1.dist-info/licenses/NOTICE.md,sha256=B8lEqi4QE41J9ljz4Riv2JgHD1v8GCZE6nNBHO3KIA0,2135
+ lemonade_sdk-7.0.2.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ lemonade_sdk-7.0.2.dist-info/licenses/NOTICE.md,sha256=B8lEqi4QE41J9ljz4Riv2JgHD1v8GCZE6nNBHO3KIA0,2135
  lemonade_server/cli.py,sha256=DR6sIt66K1sZZG3ascEw_6HUgz3UhU9KGUyzxf4nO_A,7351
  lemonade_server/model_manager.py,sha256=WDGDxrKjq-u2GkGWLNUsRk0d74J-RG2yCYEnH8WMnDw,4010
- lemonade_server/server_models.json,sha256=ZSg1R555bLVW4U7BPaYX5ZgwaJVNAP3z1C62dzMRqAM,6198
- lemonade_sdk-7.0.1.dist-info/METADATA,sha256=bvg9-Tzg_v8sTKjkAJtLahpDq_GmLDMDKA9PTisaNGw,5443
- lemonade_sdk-7.0.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- lemonade_sdk-7.0.1.dist-info/entry_points.txt,sha256=gJppn0ETtXXR6ceKWEIRdk42kMC7ps59EmU3NCPyPUk,144
- lemonade_sdk-7.0.1.dist-info/top_level.txt,sha256=10ap5GNiPhalO4V50LRoxA1FqRT9g3Xkia6BITu880k,42
- lemonade_sdk-7.0.1.dist-info/RECORD,,
+ lemonade_server/server_models.json,sha256=S_wVpybtBT5xTuM2BLxT83bOsJnPR_yWIl35jy30aJ8,6453
+ lemonade_sdk-7.0.2.dist-info/METADATA,sha256=Pf_-kdMDlXVYw_6CHQJDlO3ac4GbHzxENx0Rg8p4QBo,5443
+ lemonade_sdk-7.0.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ lemonade_sdk-7.0.2.dist-info/entry_points.txt,sha256=gJppn0ETtXXR6ceKWEIRdk42kMC7ps59EmU3NCPyPUk,144
+ lemonade_sdk-7.0.2.dist-info/top_level.txt,sha256=10ap5GNiPhalO4V50LRoxA1FqRT9g3Xkia6BITu880k,42
+ lemonade_sdk-7.0.2.dist-info/RECORD,,
lemonade_server/server_models.json CHANGED
@@ -9,13 +9,13 @@
      "checkpoint": "amd/Llama-3.2-1B-Instruct-awq-uint4-float16-cpu-onnx",
      "recipe": "oga-cpu",
      "reasoning": false,
-     "suggested": true
+     "suggested": false
    },
    "Llama-3.2-3B-Instruct-CPU": {
      "checkpoint": "amd/Llama-3.2-3B-Instruct-awq-uint4-float16-cpu-onnx",
      "recipe": "oga-cpu",
      "reasoning": false,
-     "suggested": true
+     "suggested": false
    },
    "Phi-3-Mini-Instruct-CPU": {
      "checkpoint": "amd/Phi-3-mini-4k-instruct_int4_float16_onnx_cpu",
@@ -103,6 +103,13 @@
      "max_prompt_length": 2000,
      "suggested": true
    },
+   "Llama-xLAM-2-8b-fc-r-Hybrid": {
+     "checkpoint": "amd/Llama-xLAM-2-8b-fc-r-awq-g128-int4-asym-bfp16-onnx-hybrid",
+     "recipe": "oga-hybrid",
+     "reasoning": false,
+     "max_prompt_length": 2000,
+     "suggested": true
+   },
    "Llama-3.2-1B-Instruct-DirectML": {
      "checkpoint": "amd/Llama-3.2-1B-Instruct-dml-int4-awq-block-128-directml",
      "recipe": "oga-igpu",
@@ -158,7 +165,7 @@
      "suggested": true
    },
    "Qwen3-8B-GGUF": {
-     "checkpoint": "unsloth/Qwen3-8B-GGUF:Q4_0",
+     "checkpoint": "unsloth/Qwen3-8B-GGUF:Q4_1",
      "recipe": "llamacpp",
      "reasoning": true,
      "suggested": true
@@ -181,4 +188,4 @@
      "reasoning": true,
      "suggested": true
    }
- }
+ }