lemonade-sdk 7.0.1__tar.gz → 7.0.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of lemonade-sdk might be problematic.
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/PKG-INFO +1 -1
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/setup.py +1 -1
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/cli.py +2 -0
- lemonade_sdk-7.0.3/src/lemonade/tools/accuracy.py +335 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/huggingface_load.py +6 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/ort_genai/oga.py +6 -4
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/prompt.py +28 -1
- lemonade_sdk-7.0.3/src/lemonade/tools/server/instructions.py +37 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/server/llamacpp.py +45 -19
- lemonade_sdk-7.0.3/src/lemonade/tools/server/port_utils.py +57 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/server/serve.py +96 -44
- lemonade_sdk-7.0.3/src/lemonade/tools/server/static/instructions.html +262 -0
- lemonade_sdk-7.0.3/src/lemonade/tools/server/thread_utils.py +87 -0
- lemonade_sdk-7.0.3/src/lemonade/version.py +1 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade_sdk.egg-info/PKG-INFO +1 -1
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade_sdk.egg-info/SOURCES.txt +5 -1
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade_server/model_manager.py +45 -12
- {lemonade_sdk-7.0.1/src/lemonade/tools/server → lemonade_sdk-7.0.3/src/lemonade_server}/pydantic_models.py +2 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade_server/server_models.json +25 -4
- lemonade_sdk-7.0.1/src/lemonade/tools/server/instructions.py +0 -294
- lemonade_sdk-7.0.1/src/lemonade/version.py +0 -1
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/LICENSE +0 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/NOTICE.md +0 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/README.md +0 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/pyproject.toml +0 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/setup.cfg +0 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/__init__.py +0 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/api.py +0 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/cache.py +0 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/common/__init__.py +0 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/common/analyze_model.py +0 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/common/build.py +0 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/common/cli_helpers.py +0 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/common/exceptions.py +0 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/common/filesystem.py +0 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/common/labels.py +0 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/common/onnx_helpers.py +0 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/common/plugins.py +0 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/common/printing.py +0 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/common/status.py +0 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/common/system_info.py +0 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/common/tensor_helpers.py +0 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/common/test_helpers.py +0 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/profilers/__init__.py +0 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/profilers/memory_tracker.py +0 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/profilers/profiler.py +0 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/sequence.py +0 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/state.py +0 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/__init__.py +0 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/adapter.py +0 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/bench.py +0 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/huggingface_bench.py +0 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/humaneval.py +0 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/llamacpp.py +0 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/llamacpp_bench.py +0 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/management_tools.py +0 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/mmlu.py +0 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/ort_genai/__init__.py +0 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/ort_genai/oga_bench.py +0 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/perplexity.py +0 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/quark/__init__.py +0 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/quark/quark_load.py +0 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/quark/quark_quantize.py +0 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/report/__init__.py +0 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/report/llm_report.py +0 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/report/table.py +0 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/server/__init__.py +0 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/server/static/styles.css +0 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/server/tool_calls.py +0 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/tool.py +0 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade_install/__init__.py +0 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade_install/install.py +0 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade_sdk.egg-info/dependency_links.txt +0 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade_sdk.egg-info/entry_points.txt +0 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade_sdk.egg-info/requires.txt +0 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade_sdk.egg-info/top_level.txt +0 -0
- {lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade_server/cli.py +0 -0
{lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/cli.py

@@ -19,6 +19,7 @@ import lemonade.cache as cache
 from lemonade.tools.mmlu import AccuracyMMLU
 from lemonade.tools.humaneval import AccuracyHumaneval
 from lemonade.tools.perplexity import AccuracyPerplexity
+from lemonade.tools.accuracy import LMEvalHarness
 from lemonade.tools.prompt import LLMPrompt
 from lemonade.tools.quark.quark_load import QuarkLoad
 from lemonade.tools.quark.quark_quantize import QuarkQuantize
@@ -36,6 +37,7 @@ def main():
         AccuracyMMLU,
         AccuracyHumaneval,
         AccuracyPerplexity,
+        LMEvalHarness,
         LLMPrompt,
         HuggingfaceBench,
         OgaBench,
lemonade_sdk-7.0.3/src/lemonade/tools/accuracy.py (new file)

@@ -0,0 +1,335 @@
+import argparse
+import json
+import os
+import socket
+import subprocess
+import sys
+import time
+from typing import Optional
+
+import requests
+
+from lemonade.state import State
+from lemonade.tools import Tool
+import lemonade.common.printing as printing
+import lemonade.common.build as build
+
+from lemonade.tools.server.thread_utils import ServerRunner
+
+
+def is_port_in_use(port, host="localhost"):
+    """
+    Check if a port is in use
+    """
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        return s.connect_ex((host, port)) == 0
+
+
+class LMEvalHarness(Tool):
+    """
+    Tool for evaluating LLMs using lm-eval-harness on industry standard benchmarks
+    like MMLU, GSM8k, and more. See docs/lemonade/lm_eval.md for more details.
+    """
+
+    unique_name = "lm-eval-harness"
+
+    def __init__(self):
+        super().__init__(
+            monitor_message="Evaluate model accuracy using ElutherAI's lm-eval-harness"
+        )
+        self.status_stats = []
+        self.server_runner = None
+
+    @staticmethod
+    def parser(add_help: bool = True) -> argparse.ArgumentParser:
+        parser = __class__.helpful_parser(
+            short_description="Evaluate model using lm-eval-harness",
+            add_help=add_help,
+        )
+
+        parser.add_argument(
+            "--task",
+            type=str,
+            required=True,
+            help="Task(s) to evaluate on (e.g., gsm8k, mmlu)",
+        )
+
+        parser.add_argument(
+            "--server-port", type=int, default=8000, help="Port to use for the server"
+        )
+
+        parser.add_argument(
+            "--num-fewshot",
+            type=int,
+            default=0,
+            help="Number of examples in few-shot prompts",
+        )
+
+        parser.add_argument(
+            "--limit",
+            type=int,
+            default=None,
+            help="Limit the number of examples per task",
+        )
+
+        parser.add_argument(
+            "--log-samples",
+            action="store_true",
+            help="Log samples for each task to log file",
+        )
+
+        parser.add_argument(
+            "--output-path",
+            type=str,
+            default=None,
+            help="Path to save evaluation results",
+        )
+
+        return parser
+
+    def _process_results(self, results_dir, state):
+        """Process evaluation results and save to state stats"""
+        if not os.path.exists(results_dir) or not os.path.isdir(results_dir):
+            printing.log_warning(f"Results directory not found at {results_dir}")
+            return
+
+        model_dirs = [
+            d
+            for d in os.listdir(results_dir)
+            if os.path.isdir(os.path.join(results_dir, d))
+        ]
+
+        if not model_dirs:
+            printing.log_warning(f"No model directories found in {results_dir}")
+            return
+
+        model_dir = os.path.join(results_dir, model_dirs[0])
+        printing.log_info(f"Found model directory: {model_dir}")
+
+        # Find the results JSON file with timestamp
+        results_files = [
+            f
+            for f in os.listdir(model_dir)
+            if f.startswith("results_") and f.endswith(".json")
+        ]
+
+        if not results_files:
+            printing.log_warning(f"No results files found in {model_dir}")
+            return
+
+        # Sort by timestamp
+        results_files.sort(reverse=True)
+        results_file_path = os.path.join(model_dir, results_files[0])
+        printing.log_info(f"Processing results from {results_file_path}")
+
+        # Read and process results
+        try:
+            with open(results_file_path, "r", encoding="utf-8") as f:
+                results = json.load(f)
+
+            # Extract and display metrics
+            if "results" in results:
+                for task_name, metrics in results["results"].items():
+                    printing.log_info(f"Results for {task_name}:")
+
+                    for metric, value in metrics.items():
+                        if isinstance(value, (int, float)) and not metric.startswith(
+                            "alias"
+                        ):
+                            # Format metric name for stats
+                            clean_metric = metric.replace(",", "_")
+                            stat_name = f"lm_eval_{task_name}_{clean_metric}"
+
+                            # Save to state stats as percentage
+                            state.save_stat(stat_name, float(value) * 100)
+                            state.save_stat(f"{stat_name}_units", "%")
+                            self.status_stats.append(stat_name)
+
+                            printing.log_info(
+                                f" {metric}: {value:.4f} ({value*100:.2f}%)"
+                            )
+
+            # Save summary metrics if available
+            avg_metrics = {}
+            if "higher_is_better" in results:
+                for metric_type in results["higher_is_better"].values():
+                    for metric in metric_type.keys():
+                        if metric not in avg_metrics:
+                            avg_metrics[metric] = []
+
+                for task_metrics in results["results"].values():
+                    for metric, value in task_metrics.items():
+                        if isinstance(value, (int, float)) and not metric.startswith(
+                            "alias"
+                        ):
+                            base_metric = metric.split(",")[0]
+                            if base_metric in avg_metrics:
+                                avg_metrics[base_metric].append(value)
+
+                # Calculate and save averages
+                for metric, values in avg_metrics.items():
+                    if values:
+                        avg_value = sum(values) / len(values)
+                        stat_name = f"lm_eval_average_{metric}"
+                        state.save_stat(stat_name, float(avg_value) * 100)
+                        state.save_stat(f"{stat_name}_units", "%")
+                        self.status_stats.append(stat_name)
+                        printing.log_info(
+                            f"Average {metric}: {avg_value:.4f} ({avg_value*100:.2f}%)"
+                        )
+
+        except (IOError, json.JSONDecodeError) as e:
+            printing.log_error(f"Error processing results: {e}")
+
+    def run(
+        self,
+        state: State,
+        task: str,
+        server_port: int = 8000,
+        server_host: str = "localhost",
+        num_fewshot: int = 0,
+        limit: Optional[int] = None,
+        log_samples: bool = False,
+        output_path: Optional[str] = None,
+    ) -> State:
+
+        model = state.model
+        tokenizer = state.tokenizer
+
+        if model is None or tokenizer is None:
+            raise ValueError(
+                "Model and tokenizer must be loaded in state before running lm-eval-harness"
+            )
+
+        # Set up output path
+        if output_path is None:
+            output_path = os.path.join(
+                build.output_dir(state.cache_dir, state.build_name), "lm_eval_results"
+            )
+
+        os.makedirs(output_path, exist_ok=True)
+
+        # Check if port is already in use
+        if is_port_in_use(server_port, server_host):
+            error_msg = (
+                f"Port {server_port} is already in use. "
+                "Please close all applications using this port and try again."
+            )
+            printing.log_error(error_msg)
+            raise RuntimeError(error_msg)
+
+        # Retroactively determine recipe based on model type to select correct iterator
+        # The model is already loaded in server, so we only need recipe for iterator selection
+        checkpoint = getattr(state, "checkpoint", "unknown")
+        if "OrtGenaiModel" in str(type(model)):
+            recipe = "oga-"
+        else:
+            recipe = "unknown"
+
+        # Start the server thread
+        self.server_runner = ServerRunner(
+            model=model,
+            tokenizer=tokenizer,
+            checkpoint=checkpoint,
+            recipe=recipe,
+            host=server_host,
+            port=server_port,
+        )
+        self.server_runner.start()
+
+        # Wait for server initialization
+        printing.log_info("Waiting for server initialization...")
+
+        # Wait for server to start and be responsive
+        server_url = f"http://{server_host}:{server_port}"
+        max_retries = 30
+        retry_delay = 1
+
+        printing.log_info(f"Checking if server is available at {server_url}...")
+        for i in range(max_retries):
+            try:
+                response = requests.get(f"{server_url}/api/v0/health", timeout=2)
+                if response.status_code == 200:
+                    printing.log_info(f"Server is ready after {i+1} attempts")
+                    break
+            except requests.exceptions.RequestException:
+                if i < max_retries - 1:
+                    time.sleep(retry_delay)
+                else:
+                    printing.log_error(
+                        f"Server did not start after {max_retries} attempts"
+                    )
+                    raise RuntimeError("Failed to start the server")
+
+        # Build API URL
+        results_file = os.path.join(output_path, f"{task}_results")
+
+        printing.log_info(f"Running lm-eval-harness on {task}...")
+
+        # Build lm-eval-harness command
+        cmd = [
+            "lm_eval",
+            "--model",
+            "local-completions",
+            "--tasks",
+            task,
+            "--model_args",
+            (
+                f"model={checkpoint},"
+                f"base_url={server_url}/api/v0/completions,"
+                f"num_concurrent=1,"
+                f"max_retries=5,"
+                f"retry_timeout=10,"
+                f"tokenized_requests=False"
+            ),
+            "--num_fewshot",
+            str(num_fewshot),
+            "--output_path",
+            results_file,
+        ]
+
+        if limit is not None:
+            cmd.extend(["--limit", str(limit)])
+
+        if log_samples:
+            cmd.extend(["--log_samples"])
+
+        try:
+            # On Windows, set UTF-8 mode to handle Unicode output
+            env = os.environ.copy()
+            if sys.platform == "win32":
+                env["PYTHONIOENCODING"] = "utf-8"
+
+            # Execute lm-eval-harness command
+            result = subprocess.run(
+                cmd, check=True, text=True, capture_output=True, env=env
+            )
+
+            # Log relevant output and skip any parts that might cause encoding issues
+            try:
+                printing.log_info(result.stdout)
+            except UnicodeEncodeError:
+                printing.log_info(
+                    "Results obtained successfully but couldn't display due to encoding issues"
+                )
+
+            # Process results from the correct location
+            results_dir = os.path.join(output_path, f"{task}_results")
+            self._process_results(results_dir, state)
+
+        except subprocess.CalledProcessError as e:
+            printing.log_error(f"Error running lm-eval-harness: {e}")
+            printing.log_error(f"stderr: {e.stderr}")
+        except (IOError, ValueError, requests.RequestException) as e:
+            printing.log_error(f"Error: {e}")
+        finally:
+            # Shut down server
+            if self.server_runner and self.server_runner.is_alive():
+                printing.log_info("Shutting down server runner...")
+                self.server_runner.shutdown()
+
+            # Make sure we don't have any lingering references to state's model/tokenizer
+            # that could prevent garbage collection
+            self.server_runner = None
+
+        return state
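Note: the cmd list assembled in run() above drives the standard lm_eval CLI against the tool's temporary OpenAI-compatible server. Flattened out, a gsm8k run with --limit 10 corresponds roughly to the sketch below; the checkpoint name, port, and output path are illustrative placeholders, not values taken from this diff.

import subprocess

# Hypothetical values: "org/example-checkpoint", port 8000, and the output path are placeholders.
server_url = "http://localhost:8000"
cmd = [
    "lm_eval",
    "--model", "local-completions",
    "--tasks", "gsm8k",
    "--model_args",
    (
        "model=org/example-checkpoint,"
        f"base_url={server_url}/api/v0/completions,"
        "num_concurrent=1,max_retries=5,retry_timeout=10,tokenized_requests=False"
    ),
    "--num_fewshot", "0",
    "--output_path", "lm_eval_results/gsm8k_results",
    "--limit", "10",
]
subprocess.run(cmd, check=True)  # same invocation LMEvalHarness makes, minus the env/encoding handling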
{lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/huggingface_load.py

@@ -326,6 +326,7 @@ class HuggingfaceAdapter(ModelAdapter):
     def generate(
         self,
         input_ids,
+        random_seed=1,
         **kwargs,
     ):

@@ -346,6 +347,11 @@ class HuggingfaceAdapter(ModelAdapter):
             **kwargs,
         }

+        if random_seed is None:
+            torch.random.seed()
+        else:
+            torch.random.manual_seed(random_seed)
+
         with torch.no_grad(), torch.inference_mode():
             outputs = self.model.generate(input_ids=input_ids, **generation_kwargs)

{lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/ort_genai/oga.py

@@ -139,6 +139,7 @@ class OrtGenaiModel(ModelAdapter):
         pad_token_id=None,
         stopping_criteria=None,
         max_length=None,
+        random_seed=1,
     ):
         params = og.GeneratorParams(self.model)

@@ -179,6 +180,9 @@ class OrtGenaiModel(ModelAdapter):
         if use_oga_pre_6_api:
             params.input_ids = input_ids

+        if random_seed is None:
+            random_seed = -1  # In og.Generator, -1 = seed with random device
+
         if self.config and "search" in self.config:
             search_config = self.config["search"]
             params.set_search_options(
@@ -196,10 +200,7 @@ class OrtGenaiModel(ModelAdapter):
                 past_present_share_buffer=search_config.get(
                     "past_present_share_buffer", True
                 ),
-
-                # by default, random_seed=-1 causes different laptops to give
-                # different results
-                random_seed=1,
+                random_seed=random_seed,
                 # Not currently supported by OGA
                 # diversity_penalty=search_config.get('diversity_penalty', 0.0),
                 # no_repeat_ngram_size=search_config.get('no_repeat_ngram_size', 0),
@@ -212,6 +213,7 @@ class OrtGenaiModel(ModelAdapter):
                 temperature=temperature,
                 max_length=max_length_to_use,
                 min_length=min_length,
+                random_seed=random_seed,
             )
             params.try_graph_capture_with_max_batch_size(1)

{lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/prompt.py

@@ -15,6 +15,7 @@ DEFAULT_GENERATE_PARAMS = {
     "temperature": 0.7,
 }

+DEFAULT_RANDOM_SEED = 1
 DEFAULT_MAX_NEW_TOKENS = 512
 DEFAULT_N_TRIALS = 1

@@ -108,6 +109,19 @@ class LLMPrompt(Tool):
            f"(useful for testing, default is {DEFAULT_N_TRIALS})",
        )

+        parser.add_argument(
+            "--random-seed",
+            "-r",
+            default=str(DEFAULT_RANDOM_SEED),
+            help="Positive integer seed for random number generator used in "
+            "sampling tokens "
+            f"(default is {DEFAULT_RANDOM_SEED}). If the number of trials is "
+            "greater than one, then the seed is incremented by one for each "
+            "trial. Set to `None` for random, non-repeatable results. This "
+            "random seed behavior only applies to models loaded with "
+            "`oga-load` or `huggingface-load`.",
+        )
+
         return parser

     def parse(self, state: State, args, known_only=True) -> argparse.Namespace:
@@ -123,6 +137,11 @@ class LLMPrompt(Tool):
             with open(parsed_args.prompt, "r", encoding="utf-8") as f:
                 parsed_args.prompt = f.read()

+        if parsed_args.random_seed == "None":
+            parsed_args.random_seed = None
+        else:
+            parsed_args.random_seed = int(parsed_args.random_seed)
+
         return parsed_args

     def run(
@@ -132,6 +151,7 @@ class LLMPrompt(Tool):
         max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS,
         n_trials: int = DEFAULT_N_TRIALS,
         template: bool = False,
+        random_seed: int = DEFAULT_RANDOM_SEED,
     ) -> State:

         model: ModelAdapter = state.model
@@ -170,9 +190,16 @@ class LLMPrompt(Tool):

            # Get the response from the LLM, which may include the prompt in it
            response = model.generate(
-                input_ids,
+                input_ids,
+                max_new_tokens=max_new_tokens,
+                random_seed=random_seed,
+                **DEFAULT_GENERATE_PARAMS,
            )

+            # Increment random seed if not none
+            if random_seed is not None:
+                random_seed += 1
+
            # Flatten the input and response
            input_ids_array = (
                input_ids if isinstance(input_ids, (list, str)) else input_ids[0]
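Note: the seed plumbing added above can be summarized with a small standalone sketch (not part of the package). An integer seed gives repeatable sampling and is bumped by one per trial; the literal string "None" on the command line becomes Python None, which asks the backend for non-repeatable output (the Hugging Face path reseeds torch from the OS, and the OGA path maps None to random_seed=-1).

import torch

def resolve_seed(arg: str):
    # Mirrors LLMPrompt.parse() above: the string "None" means non-repeatable.
    return None if arg == "None" else int(arg)

seed = resolve_seed("1")  # DEFAULT_RANDOM_SEED
for trial in range(3):    # n_trials
    if seed is None:
        torch.random.seed()             # reseed from the OS: different every run
    else:
        torch.random.manual_seed(seed)  # repeatable for a given seed
    print(trial, torch.rand(1).item())
    if seed is not None:
        seed += 1                       # each trial repeatable, but distinct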
lemonade_sdk-7.0.3/src/lemonade/tools/server/instructions.py (new file)

@@ -0,0 +1,37 @@
+from pathlib import Path
+import json
+from fastapi.responses import HTMLResponse
+from lemonade_server.model_manager import ModelManager
+
+
+def get_instructions_html(port=8000):
+    """
+    Show instructions on how to use the server.
+    """
+    # Load server models from JSON
+    server_models_path = (
+        Path(__file__).parent.parent.parent.parent
+        / "lemonade_server"
+        / "server_models.json"
+    )
+    with open(server_models_path, "r", encoding="utf-8") as f:
+        server_models = json.load(f)
+
+    # Use shared filter function from model_manager.py
+    filtered_models = ModelManager().filter_models_by_backend(server_models)
+
+    # Pass filtered server_models to JS
+    server_models_js = (
+        f"<script>window.SERVER_MODELS = {json.dumps(filtered_models)};</script>"
+    )
+
+    # Load HTML template
+    template_path = Path(__file__).parent / "static" / "instructions.html"
+    with open(template_path, "r", encoding="utf-8") as f:
+        html_template = f.read()
+
+    # Replace template variables
+    html_content = html_template.replace("{{SERVER_PORT}}", str(port))
+    html_content = html_content.replace("{{SERVER_MODELS_JS}}", server_models_js)
+
+    return HTMLResponse(content=html_content)
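Note: how get_instructions_html() is mounted is not visible in this section (the serve.py hunks are not shown here). A minimal, assumed wiring into a FastAPI app would look like the sketch below; the route path is purely illustrative.

from fastapi import FastAPI
from lemonade.tools.server.instructions import get_instructions_html

app = FastAPI()

@app.get("/")
def instructions():
    # get_instructions_html() already returns an HTMLResponse, so it can be
    # returned directly from the route handler.
    return get_instructions_html(port=8000)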
{lemonade_sdk-7.0.1 → lemonade_sdk-7.0.3}/src/lemonade/tools/server/llamacpp.py

@@ -14,11 +14,11 @@ from fastapi.responses import StreamingResponse

 from openai import OpenAI

+from lemonade_server.pydantic_models import ChatCompletionRequest
 from lemonade_server.model_manager import ModelManager
-from lemonade.tools.server.
+from lemonade.tools.server.port_utils import find_free_port

 LLAMA_VERSION = "b5543"
-LLAMA_SERVER_PORT = "8081"

 LLAMA_SERVER_EXE_DIR = os.path.join(
     os.path.dirname(sys.executable),
@@ -43,6 +43,23 @@ class LlamaTelemetry:
         self.tokens_per_second = None
         self.prompt_eval_time = None
         self.eval_time = None
+        self.port = None
+
+    def choose_port(self):
+        """
+        Users probably don't care what port we start llama-server on, so let's
+        search for an empty port
+        """
+
+        self.port = find_free_port()
+
+        if self.port is None:
+            msg = "Failed to find an empty port to start llama-server on"
+            logging.error(msg)
+            raise HTTPException(
+                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+                detail=msg,
+            )

     def parse_telemetry_line(self, line: str):
         """
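Note: choose_port() relies on find_free_port() from the new src/lemonade/tools/server/port_utils.py (+57 lines), whose contents are not shown in this view. A conventional implementation, offered only as an assumption about what that helper does, binds to port 0 and reads back the OS-assigned port:

import socket
from typing import Optional

def find_free_port() -> Optional[int]:
    """Assumed sketch: ask the OS for an unused TCP port by binding to port 0."""
    try:
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            s.bind(("127.0.0.1", 0))
            return s.getsockname()[1]
    except OSError:
        return None  # matches the None check in LlamaTelemetry.choose_port()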
@@ -128,10 +145,12 @@ def _log_subprocess_output(
         break


-def _wait_for_load(llama_server_process: subprocess.Popen, fail_message: str):
+def _wait_for_load(
+    llama_server_process: subprocess.Popen, port: int, fail_message: str
+):
     status_code = None
     while not llama_server_process.poll() and status_code != 200:
-        health_url = f"http://localhost:{
+        health_url = f"http://localhost:{port}/health"
         try:
             health_response = requests.get(health_url)
         except requests.exceptions.ConnectionError:
@@ -146,19 +165,25 @@ def _wait_for_load(llama_server_process: subprocess.Popen, fail_message: str):


 def _launch_llama_subprocess(
-
+    snapshot_files: dict, use_gpu: bool, telemetry: LlamaTelemetry
 ) -> subprocess.Popen:
     """
     Launch llama server subprocess with GPU or CPU configuration
     """

-
-
-
-
-
-
-
+    # Build the base command
+    base_command = [LLAMA_SERVER_EXE_PATH, "-m", snapshot_files["variant"]]
+    if "mmproj" in snapshot_files:
+        base_command.extend(["--mmproj", snapshot_files["mmproj"]])
+        if not use_gpu:
+            base_command.extend(["--no-mmproj-offload"])
+
+    # Find a port, and save it in the telemetry object for future reference
+    # by other functions
+    telemetry.choose_port()
+
+    # Add port and jinja to enable tool use
+    base_command.extend(["--port", str(telemetry.port), "--jinja"])

     # Configure GPU layers: 99 for GPU, 0 for CPU-only
     ngl_value = "99" if use_gpu else "0"
@@ -180,7 +205,7 @@ def _launch_llama_subprocess(
     return process


-def server_load(
+def server_load(model_config: dict, model_reference: str, telemetry: LlamaTelemetry):
     # Download llama.cpp server if it isn't already available
     if not os.path.exists(LLAMA_SERVER_EXE_DIR):
         # Download llama.cpp server zip
@@ -212,33 +237,34 @@ def server_load(checkpoint: str, model_reference: str, telemetry: LlamaTelemetry
         logging.info("Cleaned up zip file")

     # Download the gguf to the hugging face cache
-
-
-    logging.debug(f"GGUF file path: {model_path}")
+    snapshot_files = ModelManager().download_gguf(model_config)
+    logging.debug(f"GGUF file paths: {snapshot_files}")

     # Start the llama-serve.exe process
     logging.debug(f"Using llama_server for GGUF model: {LLAMA_SERVER_EXE_PATH}")

     # Attempt loading on GPU first
     llama_server_process = _launch_llama_subprocess(
-
+        snapshot_files, use_gpu=True, telemetry=telemetry
     )

     # Check the /health endpoint until GPU server is ready
     _wait_for_load(
         llama_server_process,
+        telemetry.port,
         f"Loading {model_reference} on GPU didn't work, re-attempting on CPU",
     )

     # If loading on GPU failed, try loading on CPU
     if llama_server_process.poll():
         llama_server_process = _launch_llama_subprocess(
-
+            snapshot_files, use_gpu=False, telemetry=telemetry
         )

         # Check the /health endpoint until CPU server is ready
         _wait_for_load(
             llama_server_process,
+            telemetry.port,
             f"Loading {model_reference} on CPU didn't work",
         )

@@ -254,7 +280,7 @@ def server_load(checkpoint: str, model_reference: str, telemetry: LlamaTelemetry
 def chat_completion(
     chat_completion_request: ChatCompletionRequest, telemetry: LlamaTelemetry
 ):
-    base_url = f"http://127.0.0.1:{
+    base_url = f"http://127.0.0.1:{telemetry.port}/v1"
     client = OpenAI(
         base_url=base_url,
         api_key="lemonade",