lemonade-sdk 7.0.1__py3-none-any.whl → 7.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lemonade/cli.py +2 -0
- lemonade/tools/accuracy.py +335 -0
- lemonade/tools/server/llamacpp.py +31 -5
- lemonade/tools/server/port_utils.py +57 -0
- lemonade/tools/server/serve.py +91 -39
- lemonade/tools/server/thread_utils.py +87 -0
- lemonade/version.py +1 -1
- {lemonade_sdk-7.0.1.dist-info → lemonade_sdk-7.0.2.dist-info}/METADATA +1 -1
- {lemonade_sdk-7.0.1.dist-info → lemonade_sdk-7.0.2.dist-info}/RECORD +15 -12
- lemonade_server/server_models.json +11 -4
- {lemonade_sdk-7.0.1.dist-info → lemonade_sdk-7.0.2.dist-info}/WHEEL +0 -0
- {lemonade_sdk-7.0.1.dist-info → lemonade_sdk-7.0.2.dist-info}/entry_points.txt +0 -0
- {lemonade_sdk-7.0.1.dist-info → lemonade_sdk-7.0.2.dist-info}/licenses/LICENSE +0 -0
- {lemonade_sdk-7.0.1.dist-info → lemonade_sdk-7.0.2.dist-info}/licenses/NOTICE.md +0 -0
- {lemonade_sdk-7.0.1.dist-info → lemonade_sdk-7.0.2.dist-info}/top_level.txt +0 -0
lemonade/cli.py
CHANGED
@@ -19,6 +19,7 @@ import lemonade.cache as cache
 from lemonade.tools.mmlu import AccuracyMMLU
 from lemonade.tools.humaneval import AccuracyHumaneval
 from lemonade.tools.perplexity import AccuracyPerplexity
+from lemonade.tools.accuracy import LMEvalHarness
 from lemonade.tools.prompt import LLMPrompt
 from lemonade.tools.quark.quark_load import QuarkLoad
 from lemonade.tools.quark.quark_quantize import QuarkQuantize
@@ -36,6 +37,7 @@ def main():
         AccuracyMMLU,
         AccuracyHumaneval,
         AccuracyPerplexity,
+        LMEvalHarness,
         LLMPrompt,
         HuggingfaceBench,
         OgaBench,
lemonade/tools/accuracy.py
ADDED
@@ -0,0 +1,335 @@
+import argparse
+import json
+import os
+import socket
+import subprocess
+import sys
+import time
+from typing import Optional
+
+import requests
+
+from lemonade.state import State
+from lemonade.tools import Tool
+import lemonade.common.printing as printing
+import lemonade.common.build as build
+
+from lemonade.tools.server.thread_utils import ServerRunner
+
+
+def is_port_in_use(port, host="localhost"):
+    """
+    Check if a port is in use
+    """
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        return s.connect_ex((host, port)) == 0
+
+
+class LMEvalHarness(Tool):
+    """
+    Tool for evaluating LLMs using lm-eval-harness on industry standard benchmarks
+    like MMLU, GSM8k, and more. See docs/lemonade/lm_eval.md for more details.
+    """
+
+    unique_name = "lm-eval-harness"
+
+    def __init__(self):
+        super().__init__(
+            monitor_message="Evaluate model accuracy using ElutherAI's lm-eval-harness"
+        )
+        self.status_stats = []
+        self.server_runner = None
+
+    @staticmethod
+    def parser(add_help: bool = True) -> argparse.ArgumentParser:
+        parser = __class__.helpful_parser(
+            short_description="Evaluate model using lm-eval-harness",
+            add_help=add_help,
+        )
+
+        parser.add_argument(
+            "--task",
+            type=str,
+            required=True,
+            help="Task(s) to evaluate on (e.g., gsm8k, mmlu)",
+        )
+
+        parser.add_argument(
+            "--server-port", type=int, default=8000, help="Port to use for the server"
+        )
+
+        parser.add_argument(
+            "--num-fewshot",
+            type=int,
+            default=0,
+            help="Number of examples in few-shot prompts",
+        )
+
+        parser.add_argument(
+            "--limit",
+            type=int,
+            default=None,
+            help="Limit the number of examples per task",
+        )
+
+        parser.add_argument(
+            "--log-samples",
+            action="store_true",
+            help="Log samples for each task to log file",
+        )
+
+        parser.add_argument(
+            "--output-path",
+            type=str,
+            default=None,
+            help="Path to save evaluation results",
+        )
+
+        return parser
+
+    def _process_results(self, results_dir, state):
+        """Process evaluation results and save to state stats"""
+        if not os.path.exists(results_dir) or not os.path.isdir(results_dir):
+            printing.log_warning(f"Results directory not found at {results_dir}")
+            return
+
+        model_dirs = [
+            d
+            for d in os.listdir(results_dir)
+            if os.path.isdir(os.path.join(results_dir, d))
+        ]
+
+        if not model_dirs:
+            printing.log_warning(f"No model directories found in {results_dir}")
+            return
+
+        model_dir = os.path.join(results_dir, model_dirs[0])
+        printing.log_info(f"Found model directory: {model_dir}")
+
+        # Find the results JSON file with timestamp
+        results_files = [
+            f
+            for f in os.listdir(model_dir)
+            if f.startswith("results_") and f.endswith(".json")
+        ]
+
+        if not results_files:
+            printing.log_warning(f"No results files found in {model_dir}")
+            return
+
+        # Sort by timestamp
+        results_files.sort(reverse=True)
+        results_file_path = os.path.join(model_dir, results_files[0])
+        printing.log_info(f"Processing results from {results_file_path}")
+
+        # Read and process results
+        try:
+            with open(results_file_path, "r", encoding="utf-8") as f:
+                results = json.load(f)
+
+            # Extract and display metrics
+            if "results" in results:
+                for task_name, metrics in results["results"].items():
+                    printing.log_info(f"Results for {task_name}:")
+
+                    for metric, value in metrics.items():
+                        if isinstance(value, (int, float)) and not metric.startswith(
+                            "alias"
+                        ):
+                            # Format metric name for stats
+                            clean_metric = metric.replace(",", "_")
+                            stat_name = f"lm_eval_{task_name}_{clean_metric}"
+
+                            # Save to state stats as percentage
+                            state.save_stat(stat_name, float(value) * 100)
+                            state.save_stat(f"{stat_name}_units", "%")
+                            self.status_stats.append(stat_name)
+
+                            printing.log_info(
+                                f" {metric}: {value:.4f} ({value*100:.2f}%)"
+                            )
+
+            # Save summary metrics if available
+            avg_metrics = {}
+            if "higher_is_better" in results:
+                for metric_type in results["higher_is_better"].values():
+                    for metric in metric_type.keys():
+                        if metric not in avg_metrics:
+                            avg_metrics[metric] = []
+
+                for task_metrics in results["results"].values():
+                    for metric, value in task_metrics.items():
+                        if isinstance(value, (int, float)) and not metric.startswith(
+                            "alias"
+                        ):
+                            base_metric = metric.split(",")[0]
+                            if base_metric in avg_metrics:
+                                avg_metrics[base_metric].append(value)
+
+                # Calculate and save averages
+                for metric, values in avg_metrics.items():
+                    if values:
+                        avg_value = sum(values) / len(values)
+                        stat_name = f"lm_eval_average_{metric}"
+                        state.save_stat(stat_name, float(avg_value) * 100)
+                        state.save_stat(f"{stat_name}_units", "%")
+                        self.status_stats.append(stat_name)
+                        printing.log_info(
+                            f"Average {metric}: {avg_value:.4f} ({avg_value*100:.2f}%)"
+                        )
+
+        except (IOError, json.JSONDecodeError) as e:
+            printing.log_error(f"Error processing results: {e}")
+
+    def run(
+        self,
+        state: State,
+        task: str,
+        server_port: int = 8000,
+        server_host: str = "localhost",
+        num_fewshot: int = 0,
+        limit: Optional[int] = None,
+        log_samples: bool = False,
+        output_path: Optional[str] = None,
+    ) -> State:
+
+        model = state.model
+        tokenizer = state.tokenizer
+
+        if model is None or tokenizer is None:
+            raise ValueError(
+                "Model and tokenizer must be loaded in state before running lm-eval-harness"
+            )
+
+        # Set up output path
+        if output_path is None:
+            output_path = os.path.join(
+                build.output_dir(state.cache_dir, state.build_name), "lm_eval_results"
+            )
+
+        os.makedirs(output_path, exist_ok=True)
+
+        # Check if port is already in use
+        if is_port_in_use(server_port, server_host):
+            error_msg = (
+                f"Port {server_port} is already in use. "
+                "Please close all applications using this port and try again."
+            )
+            printing.log_error(error_msg)
+            raise RuntimeError(error_msg)
+
+        # Retroactively determine recipe based on model type to select correct iterator
+        # The model is already loaded in server, so we only need recipe for iterator selection
+        checkpoint = getattr(state, "checkpoint", "unknown")
+        if "OrtGenaiModel" in str(type(model)):
+            recipe = "oga-"
+        else:
+            recipe = "unknown"
+
+        # Start the server thread
+        self.server_runner = ServerRunner(
+            model=model,
+            tokenizer=tokenizer,
+            checkpoint=checkpoint,
+            recipe=recipe,
+            host=server_host,
+            port=server_port,
+        )
+        self.server_runner.start()
+
+        # Wait for server initialization
+        printing.log_info("Waiting for server initialization...")
+
+        # Wait for server to start and be responsive
+        server_url = f"http://{server_host}:{server_port}"
+        max_retries = 30
+        retry_delay = 1
+
+        printing.log_info(f"Checking if server is available at {server_url}...")
+        for i in range(max_retries):
+            try:
+                response = requests.get(f"{server_url}/api/v0/health", timeout=2)
+                if response.status_code == 200:
+                    printing.log_info(f"Server is ready after {i+1} attempts")
+                    break
+            except requests.exceptions.RequestException:
+                if i < max_retries - 1:
+                    time.sleep(retry_delay)
+                else:
+                    printing.log_error(
+                        f"Server did not start after {max_retries} attempts"
+                    )
+                    raise RuntimeError("Failed to start the server")
+
+        # Build API URL
+        results_file = os.path.join(output_path, f"{task}_results")
+
+        printing.log_info(f"Running lm-eval-harness on {task}...")
+
+        # Build lm-eval-harness command
+        cmd = [
+            "lm_eval",
+            "--model",
+            "local-completions",
+            "--tasks",
+            task,
+            "--model_args",
+            (
+                f"model={checkpoint},"
+                f"base_url={server_url}/api/v0/completions,"
+                f"num_concurrent=1,"
+                f"max_retries=5,"
+                f"retry_timeout=10,"
+                f"tokenized_requests=False"
+            ),
+            "--num_fewshot",
+            str(num_fewshot),
+            "--output_path",
+            results_file,
+        ]
+
+        if limit is not None:
+            cmd.extend(["--limit", str(limit)])
+
+        if log_samples:
+            cmd.extend(["--log_samples"])
+
+        try:
+            # On Windows, set UTF-8 mode to handle Unicode output
+            env = os.environ.copy()
+            if sys.platform == "win32":
+                env["PYTHONIOENCODING"] = "utf-8"
+
+            # Execute lm-eval-harness command
+            result = subprocess.run(
+                cmd, check=True, text=True, capture_output=True, env=env
+            )
+
+            # Log relevant output and skip any parts that might cause encoding issues
+            try:
+                printing.log_info(result.stdout)
+            except UnicodeEncodeError:
+                printing.log_info(
+                    "Results obtained successfully but couldn't display due to encoding issues"
+                )
+
+            # Process results from the correct location
+            results_dir = os.path.join(output_path, f"{task}_results")
+            self._process_results(results_dir, state)
+
+        except subprocess.CalledProcessError as e:
+            printing.log_error(f"Error running lm-eval-harness: {e}")
+            printing.log_error(f"stderr: {e.stderr}")
+        except (IOError, ValueError, requests.RequestException) as e:
+            printing.log_error(f"Error: {e}")
+        finally:
+            # Shut down server
+            if self.server_runner and self.server_runner.is_alive():
+                printing.log_info("Shutting down server runner...")
+                self.server_runner.shutdown()
+
+            # Make sure we don't have any lingering references to state's model/tokenizer
+            # that could prevent garbage collection
+            self.server_runner = None
+
+        return state
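For orientation only (not part of the package diff): a minimal sketch of driving the new LMEvalHarness tool from Python. It assumes a lemonade State whose model and tokenizer have already been populated by a load tool; the argument names come from the run() signature shown above.

from lemonade.tools.accuracy import LMEvalHarness

# `state` is assumed to be a lemonade State with state.model and
# state.tokenizer already set by a preceding load tool (placeholder here).
harness = LMEvalHarness()
state = harness.run(
    state,
    task="gsm8k",       # any lm-eval task name
    num_fewshot=0,
    limit=10,           # small limit for a quick smoke test
    server_port=8000,   # must be free, otherwise run() raises RuntimeError
)
# Per-task metrics are saved as state stats named lm_eval_<task>_<metric>, in percent.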
lemonade/tools/server/llamacpp.py
CHANGED
@@ -16,9 +16,9 @@ from openai import OpenAI
 
 from lemonade_server.model_manager import ModelManager
 from lemonade.tools.server.pydantic_models import ChatCompletionRequest
+from lemonade.tools.server.port_utils import find_free_port
 
 LLAMA_VERSION = "b5543"
-LLAMA_SERVER_PORT = "8081"
 
 LLAMA_SERVER_EXE_DIR = os.path.join(
     os.path.dirname(sys.executable),
@@ -43,6 +43,23 @@ class LlamaTelemetry:
         self.tokens_per_second = None
         self.prompt_eval_time = None
         self.eval_time = None
+        self.port = None
+
+    def choose_port(self):
+        """
+        Users probably don't care what port we start llama-server on, so let's
+        search for an empty port
+        """
+
+        self.port = find_free_port()
+
+        if self.port is None:
+            msg = "Failed to find an empty port to start llama-server on"
+            logging.error(msg)
+            raise HTTPException(
+                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+                detail=msg,
+            )
 
     def parse_telemetry_line(self, line: str):
         """
@@ -128,10 +145,12 @@ def _log_subprocess_output(
             break
 
 
-def _wait_for_load(
+def _wait_for_load(
+    llama_server_process: subprocess.Popen, port: int, fail_message: str
+):
     status_code = None
     while not llama_server_process.poll() and status_code != 200:
-        health_url = f"http://localhost:{
+        health_url = f"http://localhost:{port}/health"
         try:
             health_response = requests.get(health_url)
         except requests.exceptions.ConnectionError:
@@ -152,12 +171,17 @@ def _launch_llama_subprocess(
    Launch llama server subprocess with GPU or CPU configuration
    """
 
+    # Find a port, and save it in the telemetry object for future reference
+    # by other functions
+    telemetry.choose_port()
+
    base_command = [
        LLAMA_SERVER_EXE_PATH,
        "-m",
        model_path,
        "--port",
-
+        str(telemetry.port),
+        "--jinja",
    ]
 
    # Configure GPU layers: 99 for GPU, 0 for CPU-only
@@ -227,6 +251,7 @@ def server_load(checkpoint: str, model_reference: str, telemetry: LlamaTelemetry
    # Check the /health endpoint until GPU server is ready
    _wait_for_load(
        llama_server_process,
+        telemetry.port,
        f"Loading {model_reference} on GPU didn't work, re-attempting on CPU",
    )
 
@@ -239,6 +264,7 @@ def server_load(checkpoint: str, model_reference: str, telemetry: LlamaTelemetry
    # Check the /health endpoint until CPU server is ready
    _wait_for_load(
        llama_server_process,
+        telemetry.port,
        f"Loading {model_reference} on CPU didn't work",
    )
 
@@ -254,7 +280,7 @@ def server_load(checkpoint: str, model_reference: str, telemetry: LlamaTelemetry
 def chat_completion(
     chat_completion_request: ChatCompletionRequest, telemetry: LlamaTelemetry
 ):
-    base_url = f"http://127.0.0.1:{
+    base_url = f"http://127.0.0.1:{telemetry.port}/v1"
     client = OpenAI(
         base_url=base_url,
         api_key="lemonade",
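Taken together, these changes replace the fixed LLAMA_SERVER_PORT with a per-load dynamic port stored on the telemetry object. A rough sketch of the resulting flow, for illustration only (the names are taken from the diff above; this is not code from the package):

telemetry = LlamaTelemetry()
telemetry.choose_port()  # find_free_port() result stored in telemetry.port

# llama-server is then launched with "--port", str(telemetry.port), "--jinja",
# readiness is polled at:
health_url = f"http://localhost:{telemetry.port}/health"
# and chat completions are proxied through:
base_url = f"http://127.0.0.1:{telemetry.port}/v1"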
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import socketserver
|
|
2
|
+
import sys
|
|
3
|
+
import logging
|
|
4
|
+
from contextlib import asynccontextmanager
|
|
5
|
+
from fastapi import FastAPI
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def find_free_port():
|
|
9
|
+
"""
|
|
10
|
+
Scans for an unoccupied TCP port
|
|
11
|
+
|
|
12
|
+
Returns the port number as an int on success
|
|
13
|
+
Returns None if no port can be found
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
try:
|
|
17
|
+
with socketserver.TCPServer(("localhost", 0), None) as s:
|
|
18
|
+
return s.server_address[1]
|
|
19
|
+
# pylint: disable=broad-exception-caught
|
|
20
|
+
except Exception:
|
|
21
|
+
return None
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@asynccontextmanager
|
|
25
|
+
async def lifespan(app: FastAPI):
|
|
26
|
+
# Code here will run when the application starts up
|
|
27
|
+
# Check if console can handle Unicode by testing emoji encoding
|
|
28
|
+
|
|
29
|
+
try:
|
|
30
|
+
if sys.stdout.encoding:
|
|
31
|
+
"🍋".encode(sys.stdout.encoding)
|
|
32
|
+
use_emojis = True
|
|
33
|
+
except (UnicodeEncodeError, AttributeError):
|
|
34
|
+
use_emojis = False
|
|
35
|
+
|
|
36
|
+
if use_emojis:
|
|
37
|
+
logging.info(
|
|
38
|
+
"\n"
|
|
39
|
+
"\n"
|
|
40
|
+
"🍋 Lemonade Server Ready!\n"
|
|
41
|
+
f"🍋 Open http://localhost:{app.port} in your browser for:\n"
|
|
42
|
+
"🍋 💬 chat\n"
|
|
43
|
+
"🍋 💻 model management\n"
|
|
44
|
+
"🍋 📄 docs\n"
|
|
45
|
+
)
|
|
46
|
+
else:
|
|
47
|
+
logging.info(
|
|
48
|
+
"\n"
|
|
49
|
+
"\n"
|
|
50
|
+
"[Lemonade] Lemonade Server Ready!\n"
|
|
51
|
+
f"[Lemonade] Open http://localhost:{app.port} in your browser for:\n"
|
|
52
|
+
"[Lemonade] chat\n"
|
|
53
|
+
"[Lemonade] model management\n"
|
|
54
|
+
"[Lemonade] docs\n"
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
yield
|
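For reference, a short example of calling the new helper directly; find_free_port() returns the port number as an int, or None when no port could be bound, so callers handle the None case (as LlamaTelemetry.choose_port() does above):

from lemonade.tools.server.port_utils import find_free_port

port = find_free_port()
if port is None:
    raise RuntimeError("No free TCP port available")
print(f"Launching server on port {port}")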
lemonade/tools/server/serve.py
CHANGED
@@ -8,7 +8,6 @@ import traceback
 from typing import Optional, Union
 import json
 import subprocess
-from contextlib import asynccontextmanager
 from pathlib import Path
 
 from fastapi import FastAPI, HTTPException, status, Request
@@ -16,6 +15,8 @@ from fastapi.responses import StreamingResponse
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.staticfiles import StaticFiles
 import uvicorn
+from uvicorn.config import Config
+from uvicorn.server import Server as UvicornServer
 from transformers import TextIteratorStreamer, StoppingCriteria, StoppingCriteriaList
 from tabulate import tabulate
 
@@ -57,7 +58,7 @@ from lemonade.tools.server.pydantic_models import (
 )
 from lemonade.tools.server.tool_calls import extract_tool_calls, get_tool_call_pattern
 from lemonade.tools.server.instructions import get_instructions_html
-
+from lemonade.tools.server.port_utils import lifespan
 
 DEFAULT_PORT = 8000
 DEFAULT_LOG_LEVEL = "info"
@@ -243,15 +244,22 @@ class Server(ManagementTool):
 
         return parser
 
-    def 
+    def _setup_server_common(
         self,
-
-        # we always use the default cache directory
-        _=None,
-        port: int = DEFAULT_PORT,
-        log_level: str = DEFAULT_LOG_LEVEL,
+        port: int,
         truncate_inputs: bool = False,
+        log_level: str = DEFAULT_LOG_LEVEL,
+        threaded_mode: bool = False,
     ):
+        """
+        Common setup logic shared between run() and run_in_thread().
+
+        Args:
+            port: Port number for the server
+            truncate_inputs: Whether to truncate inputs if they exceed max length
+            log_level: Logging level to configure
+            threaded_mode: Whether this is being set up for threaded execution
+        """
        # Store truncation settings
        self.truncate_inputs = truncate_inputs
 
@@ -265,22 +273,27 @@ class Server(ManagementTool):
 
         logging.trace = trace
 
-        # Configure logging
-
-
-
-
-
-
+        # Configure logging based on mode
+        if threaded_mode:
+            # Configure logging for warning level (to reduce noise in threaded execution)
+            logging.getLogger("uvicorn.error").setLevel(logging.WARNING)
+        else:
+            # Configure logging to match uvicorn's format
+            logging_level = getattr(logging, log_level.upper())
+            logging.basicConfig(
+                level=logging_level,
+                format="%(levelprefix)s %(message)s",
+                datefmt="%Y-%m-%d %H:%M:%S",
+            )
 
-
-
-
-
+            # Add uvicorn's log formatter
+            logging.root.handlers[0].formatter = uvicorn.logging.DefaultFormatter(
+                fmt="%(levelprefix)s %(message)s",
+                use_colors=True,
+            )
 
-
-
+            # Ensure the log level is properly set
+            logging.getLogger().setLevel(logging_level)
 
         # Update debug logging state after setting log level
         self.debug_logging_enabled = logging.getLogger().isEnabledFor(logging.DEBUG)
@@ -293,8 +306,62 @@ class Server(ManagementTool):
         # that the lifespan can access it
         self.app.port = port
 
+    def run(
+        self,
+        # ManagementTool has a required cache_dir arg, but
+        # we always use the default cache directory
+        _=None,
+        port: int = DEFAULT_PORT,
+        log_level: str = DEFAULT_LOG_LEVEL,
+        truncate_inputs: bool = False,
+    ):
+        # Common setup
+        self._setup_server_common(
+            port=port,
+            truncate_inputs=truncate_inputs,
+            log_level=log_level,
+            threaded_mode=False,
+        )
+
         uvicorn.run(self.app, host="localhost", port=port, log_level=log_level)
 
+    def run_in_thread(
+        self,
+        port: int = DEFAULT_PORT,
+        host: str = "localhost",
+        log_level: str = "warning",
+        truncate_inputs: bool = False,
+    ):
+        """
+        Set up the server for running in a thread.
+        Returns a uvicorn server instance that can be controlled externally.
+        """
+        # Common setup
+        self._setup_server_common(
+            port=port,
+            truncate_inputs=truncate_inputs,
+            log_level=log_level,
+            threaded_mode=True,
+        )
+
+        class CustomServer(UvicornServer):
+            """Custom Uvicorn server that can be properly shutdown from another thread"""
+
+            def install_signal_handlers(self):
+                pass
+
+        # Configure the server
+        config = Config(
+            app=self.app,
+            host=host,
+            port=port,
+            log_level=log_level,
+            log_config=None,
+        )
+
+        # Create and return the uvicorn server
+        return CustomServer(config=config)
+
     async def _show_telemetry(self):
         """
         Show telemetry data in debug mode.
@@ -1241,6 +1308,8 @@ class Server(ManagementTool):
                 "status": "success",
                 "message": f"Loaded model: {model_reference}",
             }
+        except HTTPException:
+            raise
         except Exception:  # pylint: disable=broad-exception-caught
             self.model_load_failure(model_reference)
 
@@ -1339,22 +1408,5 @@ class Server(ManagementTool):
         return response
 
 
-@asynccontextmanager
-async def lifespan(app: FastAPI):
-    # Code here will run when the application starts up
-
-    logging.info(
-        "\n"
-        "\n"
-        "🍋 Lemonade Server Ready!\n"
-        f"🍋 Open http://localhost:{app.port} in your browser for:\n"
-        "🍋 💬 chat\n"
-        "🍋 💻 model management\n"
-        "🍋 📄 docs\n"
-    )
-
-    yield
-
-
 # This file was originally licensed under Apache 2.0. It has been modified.
 # Modifications Copyright (c) 2025 AMD
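An illustrative sketch (not from the package) of how run_in_thread() is meant to be consumed: it only configures the Server and returns a uvicorn server object, whose run() is then invoked by the caller, typically on a background thread, and which stops when should_exit is set. Attaching a model, tokenizer, and llm_loaded descriptor to the Server instance beforehand is assumed, as thread_utils.ServerRunner does below.

import threading
from lemonade.tools.server.serve import Server

server = Server()
# Assumption: server.model, server.tokenizer, and server.llm_loaded are
# attached by the caller before serving requests.
uvicorn_server = server.run_in_thread(port=8000, host="localhost", log_level="warning")

thread = threading.Thread(target=uvicorn_server.run, daemon=True)
thread.start()
# ... issue requests against http://localhost:8000 ...
uvicorn_server.should_exit = True  # asks the CustomServer to exit its loop
thread.join()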
lemonade/tools/server/thread_utils.py
ADDED
@@ -0,0 +1,87 @@
+import threading
+import logging
+from lemonade.tools.server.serve import Server
+
+
+class ServerRunner(threading.Thread):
+    """
+    Thread class for running the Lemonade Server with a loaded model.
+    """
+
+    def __init__(
+        self, model, tokenizer, checkpoint, recipe, host="localhost", port=8000
+    ):
+        threading.Thread.__init__(self)
+        self.model = model
+        self.tokenizer = tokenizer
+        self.checkpoint = checkpoint
+        self.recipe = recipe
+        self.host = host
+        self.port = port
+        self.server = None
+        self.ready_event = threading.Event()
+        self.shutdown_event = threading.Event()
+        self.uvicorn_server = None
+
+    def run(self):
+        try:
+            # Create the server instance
+            self.server = Server()
+
+            # Configure the server with model/tokenizer
+            self.server.model = self.model
+            self.server.tokenizer = self.tokenizer
+            self.server.llm_loaded = type(
+                "obj",
+                (object,),
+                {
+                    "checkpoint": self.checkpoint,
+                    "recipe": self.recipe,
+                    "max_prompt_length": None,
+                    "reasoning": False,
+                    "model_name": "custom",
+                },
+            )
+
+            # Set up the server for threaded execution
+            self.uvicorn_server = self.server.run_in_thread(
+                port=self.port, host=self.host, log_level="warning"
+            )
+
+            # Set the ready event
+            self.ready_event.set()
+
+            # Run the server until shutdown is requested
+            logging.info(f"Starting server on http://{self.host}:{self.port}")
+            self.uvicorn_server.run()
+
+        except Exception as e:
+            logging.error(f"Error starting server: {e}")
+            self.ready_event.set()
+            raise
+
+    def shutdown(self):
+        """Shutdown the server"""
+        if hasattr(self, "uvicorn_server") and self.uvicorn_server:
+            logging.info("Shutting down server...")
+            self.uvicorn_server.should_exit = True
+        self.shutdown_event.set()
+
+        # Clean up resources properly to avoid memory leaks
+        if hasattr(self, "server") and self.server:
+            logging.info("Cleaning up model and tokenizer resources...")
+
+            if hasattr(self.server, "model"):
+                self.server.model = None
+
+            if hasattr(self.server, "tokenizer"):
+                self.server.tokenizer = None
+
+            if hasattr(self.server, "llm_loaded"):
+                self.server.llm_loaded = None
+
+        # Clean up local references
+        if hasattr(self, "model"):
+            del self.model
+        if hasattr(self, "tokenizer"):
+            del self.tokenizer
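A minimal usage sketch of ServerRunner, mirroring how accuracy.py drives it above; the model, tokenizer, checkpoint, and recipe values are placeholders:

from lemonade.tools.server.thread_utils import ServerRunner

runner = ServerRunner(
    model=model,            # already-loaded model object (placeholder)
    tokenizer=tokenizer,    # matching tokenizer (placeholder)
    checkpoint="my-checkpoint",
    recipe="oga-",
    host="localhost",
    port=8000,
)
runner.start()
runner.ready_event.wait()   # set once run_in_thread() has configured uvicorn
# ... run requests or an evaluation against http://localhost:8000 ...
runner.shutdown()           # sets should_exit and drops model references
runner.join()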
lemonade/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "7.0.1"
+__version__ = "7.0.2"
{lemonade_sdk-7.0.1.dist-info → lemonade_sdk-7.0.2.dist-info}/RECORD
CHANGED
@@ -1,10 +1,10 @@
 lemonade/__init__.py,sha256=W1Qk7r0rnQqFhPNHp6BIBT_q-OH3s-8Q_POoVfAmKW0,117
 lemonade/api.py,sha256=9apNWSMS4bYpYl7iqDA4CsHHOOMdjOIuJhNYSqj_jIA,3878
 lemonade/cache.py,sha256=djr2qgyUUAWlQv8FehU9qlNtCwK0IZqo82hcBDyZ3-A,2850
-lemonade/cli.py,sha256=
+lemonade/cli.py,sha256=ddN2QqsGMsVwydfcR7MSZu1z8_-bUgUP7dhw9lzbHa8,4424
 lemonade/sequence.py,sha256=KSH7BPsiyDKsOsg_ziQKEGsDwMmuO_YbgPRBxkZd0pw,13267
 lemonade/state.py,sha256=sdSezla7Cd7KYL90xY3p9kcNV4ndSyN6UvNLOr3vBMA,5261
-lemonade/version.py,sha256=
+lemonade/version.py,sha256=iVyoEZ1fyZz5oicAj7ERV3Eld5fVjLM_p365GVSKBpk,22
 lemonade/common/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 lemonade/common/analyze_model.py,sha256=sYWDznEUEWjx_Qekg7f1hHY4Pfe87IQ77lmsWqePgE0,803
 lemonade/common/build.py,sha256=Pk86mCr6fyBIx2zXDpq0BkdahlCmWRnwSTpShA_gwZw,7849
@@ -23,6 +23,7 @@ lemonade/profilers/__init__.py,sha256=JKVonvJ4XZ9_6sKXPWsiMLQCNyzQOxhQw5BEHR1qOf
 lemonade/profilers/memory_tracker.py,sha256=-SSBmNlrweiX59wyNtLMWiwaMOskBzNO1_cufVwteqs,9357
 lemonade/profilers/profiler.py,sha256=y_iMGr1ToQ6rcwcIcXck4ajapisLXCfHggiV-IpPF98,1666
 lemonade/tools/__init__.py,sha256=_6xRc-FHxmujoLjLjWtpYrWYEXtCSneSy-5ya01kyPk,53
+lemonade/tools/accuracy.py,sha256=QndammQ1bmlTaF_6YDaaiJp6fpkKZDYGySdQpAgZIp8,11699
 lemonade/tools/adapter.py,sha256=4H6gfbjvqyU6qm1_-b2FE-c3a7N9OzEBeDVnIwqRDvg,3014
 lemonade/tools/bench.py,sha256=aN5LMA_EH6-ZhAH3Gf26JYL7s0eKpUd3j-bReRhzvEY,10016
 lemonade/tools/huggingface_bench.py,sha256=POE5JYzArK2FBktazOkluLNFzlLctM39B19fK5sMx-0,10017
@@ -46,20 +47,22 @@ lemonade/tools/report/llm_report.py,sha256=bVHhwCINA-Ok2EdSwAsLubsc83N3KWOVuwTgu
 lemonade/tools/report/table.py,sha256=a0TXo1X84RxCSu0un_XM3ANOlhLtPDuqtGwR7eomf2s,24853
 lemonade/tools/server/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 lemonade/tools/server/instructions.py,sha256=Lvm-tRZaYgHkyt3zQkmMChkXO6rUiLoIAunudmMr_D8,13388
-lemonade/tools/server/llamacpp.py,sha256=
+lemonade/tools/server/llamacpp.py,sha256=R86Q2btI9_EPpPj27vvELnF9KmKxpu3sPIIS1xW3PIA,9997
+lemonade/tools/server/port_utils.py,sha256=24Ryz5cNU0R9L1kuVSapZoyXTZHzhF4y0Yje9MVOrE0,1535
 lemonade/tools/server/pydantic_models.py,sha256=z1RAs9hkAFkOfMiTPtmUiC3CD2P6OMI2N0J2ztNs0d4,2179
-lemonade/tools/server/serve.py,sha256=
+lemonade/tools/server/serve.py,sha256=3JQa42WZdllKAf_DY-cal0Pc8vdBZd4vwsfhZmpheS8,52500
+lemonade/tools/server/thread_utils.py,sha256=pK9K_6DNWoQ78NArkAX3Ym2WsxLnCs9sKTk6TitlYnI,2804
 lemonade/tools/server/tool_calls.py,sha256=xrAlQwKG-nv2xLlf8f9CDSaUbyMn8ZtHkds9iZLG9K8,5230
 lemonade/tools/server/static/styles.css,sha256=8U1EejQaqRLQ6QTCF5UG_dLPtLjRwT1menUHMDhaq2M,5045
 lemonade_install/__init__.py,sha256=26zohKg2jgr_5y7tObduWMYQg8zCTWMZHL8lfi2zZVQ,40
 lemonade_install/install.py,sha256=61qUO7kWCLcdjK0_IQZ46-rKP_AWkyznh4YpDclPKyM,28036
-lemonade_sdk-7.0.
-lemonade_sdk-7.0.
+lemonade_sdk-7.0.2.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+lemonade_sdk-7.0.2.dist-info/licenses/NOTICE.md,sha256=B8lEqi4QE41J9ljz4Riv2JgHD1v8GCZE6nNBHO3KIA0,2135
 lemonade_server/cli.py,sha256=DR6sIt66K1sZZG3ascEw_6HUgz3UhU9KGUyzxf4nO_A,7351
 lemonade_server/model_manager.py,sha256=WDGDxrKjq-u2GkGWLNUsRk0d74J-RG2yCYEnH8WMnDw,4010
-lemonade_server/server_models.json,sha256=
-lemonade_sdk-7.0.
-lemonade_sdk-7.0.
-lemonade_sdk-7.0.
-lemonade_sdk-7.0.
-lemonade_sdk-7.0.
+lemonade_server/server_models.json,sha256=S_wVpybtBT5xTuM2BLxT83bOsJnPR_yWIl35jy30aJ8,6453
+lemonade_sdk-7.0.2.dist-info/METADATA,sha256=Pf_-kdMDlXVYw_6CHQJDlO3ac4GbHzxENx0Rg8p4QBo,5443
+lemonade_sdk-7.0.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+lemonade_sdk-7.0.2.dist-info/entry_points.txt,sha256=gJppn0ETtXXR6ceKWEIRdk42kMC7ps59EmU3NCPyPUk,144
+lemonade_sdk-7.0.2.dist-info/top_level.txt,sha256=10ap5GNiPhalO4V50LRoxA1FqRT9g3Xkia6BITu880k,42
+lemonade_sdk-7.0.2.dist-info/RECORD,,
lemonade_server/server_models.json
CHANGED
@@ -9,13 +9,13 @@
         "checkpoint": "amd/Llama-3.2-1B-Instruct-awq-uint4-float16-cpu-onnx",
         "recipe": "oga-cpu",
         "reasoning": false,
-        "suggested":
+        "suggested": false
     },
     "Llama-3.2-3B-Instruct-CPU": {
         "checkpoint": "amd/Llama-3.2-3B-Instruct-awq-uint4-float16-cpu-onnx",
         "recipe": "oga-cpu",
         "reasoning": false,
-        "suggested":
+        "suggested": false
     },
     "Phi-3-Mini-Instruct-CPU": {
         "checkpoint": "amd/Phi-3-mini-4k-instruct_int4_float16_onnx_cpu",
@@ -103,6 +103,13 @@
         "max_prompt_length": 2000,
         "suggested": true
     },
+    "Llama-xLAM-2-8b-fc-r-Hybrid": {
+        "checkpoint": "amd/Llama-xLAM-2-8b-fc-r-awq-g128-int4-asym-bfp16-onnx-hybrid",
+        "recipe": "oga-hybrid",
+        "reasoning": false,
+        "max_prompt_length": 2000,
+        "suggested": true
+    },
     "Llama-3.2-1B-Instruct-DirectML": {
         "checkpoint": "amd/Llama-3.2-1B-Instruct-dml-int4-awq-block-128-directml",
         "recipe": "oga-igpu",
@@ -158,7 +165,7 @@
         "suggested": true
     },
     "Qwen3-8B-GGUF": {
-        "checkpoint": "unsloth/Qwen3-8B-GGUF:
+        "checkpoint": "unsloth/Qwen3-8B-GGUF:Q4_1",
         "recipe": "llamacpp",
         "reasoning": true,
         "suggested": true
@@ -181,4 +188,4 @@
         "reasoning": true,
         "suggested": true
     }
-}
+}
The five files listed above with +0 -0 (WHEEL, entry_points.txt, licenses/LICENSE, licenses/NOTICE.md, and top_level.txt) are without changes.