lemonade-sdk 7.0.0 (lemonade_sdk-7.0.0-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- lemonade/__init__.py +5 -0
- lemonade/api.py +125 -0
- lemonade/cache.py +85 -0
- lemonade/cli.py +135 -0
- lemonade/common/__init__.py +0 -0
- lemonade/common/analyze_model.py +26 -0
- lemonade/common/build.py +223 -0
- lemonade/common/cli_helpers.py +139 -0
- lemonade/common/exceptions.py +98 -0
- lemonade/common/filesystem.py +368 -0
- lemonade/common/labels.py +61 -0
- lemonade/common/onnx_helpers.py +176 -0
- lemonade/common/plugins.py +10 -0
- lemonade/common/printing.py +110 -0
- lemonade/common/status.py +490 -0
- lemonade/common/system_info.py +390 -0
- lemonade/common/tensor_helpers.py +83 -0
- lemonade/common/test_helpers.py +28 -0
- lemonade/profilers/__init__.py +1 -0
- lemonade/profilers/memory_tracker.py +257 -0
- lemonade/profilers/profiler.py +55 -0
- lemonade/sequence.py +363 -0
- lemonade/state.py +159 -0
- lemonade/tools/__init__.py +1 -0
- lemonade/tools/adapter.py +104 -0
- lemonade/tools/bench.py +284 -0
- lemonade/tools/huggingface_bench.py +267 -0
- lemonade/tools/huggingface_load.py +520 -0
- lemonade/tools/humaneval.py +258 -0
- lemonade/tools/llamacpp.py +261 -0
- lemonade/tools/llamacpp_bench.py +154 -0
- lemonade/tools/management_tools.py +273 -0
- lemonade/tools/mmlu.py +327 -0
- lemonade/tools/ort_genai/__init__.py +0 -0
- lemonade/tools/ort_genai/oga.py +1129 -0
- lemonade/tools/ort_genai/oga_bench.py +142 -0
- lemonade/tools/perplexity.py +146 -0
- lemonade/tools/prompt.py +228 -0
- lemonade/tools/quark/__init__.py +0 -0
- lemonade/tools/quark/quark_load.py +172 -0
- lemonade/tools/quark/quark_quantize.py +439 -0
- lemonade/tools/report/__init__.py +0 -0
- lemonade/tools/report/llm_report.py +203 -0
- lemonade/tools/report/table.py +739 -0
- lemonade/tools/server/__init__.py +0 -0
- lemonade/tools/server/serve.py +1354 -0
- lemonade/tools/server/tool_calls.py +146 -0
- lemonade/tools/tool.py +374 -0
- lemonade/version.py +1 -0
- lemonade_install/__init__.py +1 -0
- lemonade_install/install.py +774 -0
- lemonade_sdk-7.0.0.dist-info/METADATA +116 -0
- lemonade_sdk-7.0.0.dist-info/RECORD +61 -0
- lemonade_sdk-7.0.0.dist-info/WHEEL +5 -0
- lemonade_sdk-7.0.0.dist-info/entry_points.txt +4 -0
- lemonade_sdk-7.0.0.dist-info/licenses/LICENSE +201 -0
- lemonade_sdk-7.0.0.dist-info/licenses/NOTICE.md +21 -0
- lemonade_sdk-7.0.0.dist-info/top_level.txt +3 -0
- lemonade_server/cli.py +260 -0
- lemonade_server/model_manager.py +98 -0
- lemonade_server/server_models.json +142 -0

lemonade/tools/humaneval.py
@@ -0,0 +1,258 @@
+import argparse
+import os
+import csv
+from typing import Dict, Optional, Any
+import requests
+from human_eval.data import write_jsonl, read_problems
+from human_eval.evaluation import evaluate_functional_correctness
+
+from lemonade.state import State
+from lemonade.tools import Tool
+import lemonade.common.printing as printing
+import lemonade.common.build as build
+
+
+class AccuracyHumaneval(Tool):
+    """
+    HumanEval accuracy measurement tool.
+
+    This tool evaluates language models on the HumanEval dataset, which consists of
+    Python programming problems. It measures the model's ability to:
+    1. Generate functionally correct code completions
+    2. Pass unit tests for each programming problem
+
+    Metrics:
+    - pass@1: Percentage of problems solved with 1 generation attempt
+    - pass@10: Percentage of problems solved within 10 generation attempts
+    - pass@100: Percentage of problems solved within 100 generation attempts
+
+    See docs/lemonade/humaneval_accuracy.md for more details
+    """
+
+    unique_name = "accuracy-humaneval"
+    DATASET = "https://github.com/openai/human-eval/blob/master/data/HumanEval.jsonl.gz?raw=true"
+    TOTAL_PROBLEMS = 164  # Total number of problems in the HumanEval dataset
+
+    def __init__(self):
+        super().__init__(monitor_message="Measuring accuracy with HumanEval")
+        self.status_stats = []
+        # Enable code evaluation for HumanEval
+        os.environ["HF_ALLOW_CODE_EVAL"] = "1"
+
+    @staticmethod
+    def parser(add_help: bool = True) -> argparse.ArgumentParser:
+        parser = __class__.helpful_parser(
+            short_description="Measure coding accuracy with HumanEval",
+            add_help=add_help,
+        )
+        parser.add_argument(
+            "--k-samples",
+            type=int,
+            default=1,
+            help="Number of completions to generate per prompt for pass@k calculation"
+            " (default: %(default)s)",
+        )
+        parser.add_argument(
+            "--first-n-samples",
+            type=int,
+            default=AccuracyHumaneval.TOTAL_PROBLEMS,
+            help=f"Evaluate only the first N problems from the dataset (default: "
+            f"%(default)s, evaluates all {AccuracyHumaneval.TOTAL_PROBLEMS} problems)",
+        )
+        parser.add_argument(
+            "--timeout",
+            type=float,
+            default=30.0,
+            help="Timeout in seconds for each test case (default: %(default)s)",
+        )
+        parser.add_argument(
+            "--data-dir",
+            type=str,
+            default=None,
+            help="Custom directory for dataset storage (default: %(default)s, "
+            "uses <lemonade_cache_dir>/data/humaneval)",
+        )
+        return parser
+
+    def run(
+        self,
+        state: State,
+        data_dir: Optional[str] = None,
+        k_samples: int = 1,
+        first_n_samples: Optional[int] = TOTAL_PROBLEMS,
+        timeout: float = 30.0,
+    ) -> State:
+        """
+        Run HumanEval evaluation on the model.
+
+        Args:
+            state: Current state containing model and tokenizer
+            data_dir: Optional custom directory for dataset storage
+            k_samples: Number of completions to generate per prompt for pass@k calculation
+            first_n_samples: Number of first N problems to evaluate
+            timeout: Timeout in seconds for each test case
+
+        Returns:
+            Updated state with evaluation results
+        """
+        # Validate required state components
+        if not hasattr(state, "model") or not hasattr(state, "tokenizer"):
+            raise ValueError("State must contain both 'model' and 'tokenizer'")
+
+        # Setup directories
+        data_dir_to_use = data_dir or os.path.join(state.cache_dir, "data", "humaneval")
+        data_path = os.path.join(data_dir_to_use, "HumanEval.jsonl.gz")
+        model_results_dir = os.path.join(
+            build.output_dir(state.cache_dir, state.build_name), "humaneval"
+        )
+        os.makedirs(model_results_dir, exist_ok=True)
+
+        # Download dataset if needed
+        self._download_dataset(data_path)
+
+        # Run evaluation
+        results = self._evaluate_model(
+            state.model,
+            state.tokenizer,
+            data_path,
+            k_samples,
+            timeout,
+            model_results_dir,
+            first_n_samples,
+        )
+
+        # Save metrics
+        self._save_metrics(state, results)
+
+        return state
+
+    def _download_dataset(self, output_path: str) -> None:
+        """Download HumanEval dataset if not already present."""
+        if os.path.exists(output_path):
+            printing.log_info(f"Dataset already exists at: {output_path}")
+            return
+
+        os.makedirs(os.path.dirname(output_path), exist_ok=True)
+        response = requests.get(self.DATASET, stream=True)
+
+        if response.status_code == 200:
+            with open(output_path, "wb") as file:
+                for chunk in response.iter_content(chunk_size=8192):
+                    file.write(chunk)
+            printing.log_info(f"Dataset downloaded successfully to: {output_path}")
+        else:
+            raise RuntimeError(
+                f"Failed to download dataset. Status code: {response.status_code}"
+            )
+
+    def _evaluate_model(
+        self,
+        model: Any,
+        tokenizer: Any,
+        data_path: str,
+        k_samples: int,
+        timeout: float,
+        results_dir: str,
+        first_n_samples: Optional[int] = TOTAL_PROBLEMS,
+    ) -> Dict[str, float]:
+        """
+        Evaluate model on HumanEval dataset.
+
+        Args:
+            model: The language model to evaluate
+            tokenizer: The tokenizer for the model
+            data_path: Path to the HumanEval dataset
+            k_samples: Number of completions per prompt for pass@k calculation
+            timeout: Test case timeout in seconds
+            results_dir: Directory to save results
+            first_n_samples: Number of first N problems to evaluate
+
+        Returns:
+            Dictionary containing evaluation metrics
+        """
+        dataset = read_problems(data_path)
+
+        # Limit to first N problems
+        dataset_keys = list(dataset.keys())[:first_n_samples]
+        ignore_incomplete = True
+
+        samples = []
+
+        # Update Tool progress monitor
+        self.set_percent_progress(0.0)
+        questions_completed = 0
+        number_of_questions = first_n_samples * k_samples
+
+        # Save completions and expected answers
+        csv_path = os.path.join(results_dir, "evaluation_results.csv")
+        with open(
+            csv_path, mode="w", newline="", encoding="utf-8", errors="replace"
+        ) as file:
+            writer = csv.writer(file)
+            writer.writerow(["Prompt", "Completion", "Expected Answer"])
+
+            for task_id in dataset_keys:
+                try:
+                    for _ in range(k_samples):
+                        prompt = dataset[task_id]["prompt"]
+                        expected = dataset[task_id]["canonical_solution"]
+
+                        # Generate completion
+                        input_ids = tokenizer(prompt, return_tensors="pt").input_ids
+                        completion = model.generate(
+                            input_ids,
+                            max_new_tokens=512,
+                            do_sample=False,
+                        )
+                        completion_text = tokenizer.decode(
+                            completion[0], skip_special_tokens=True
+                        )
+
+                        # Save results
+                        samples.append(
+                            {"task_id": task_id, "completion": completion_text}
+                        )
+                        writer.writerow([prompt, completion_text, expected])
+
+                    # Update progress monitor after completing all samples for a question
+                    questions_completed = questions_completed + 1
+                    percent_completed = (
+                        questions_completed / number_of_questions * 100
+                    )
+                    self.set_percent_progress(percent_completed)
+
+                # pylint: disable=W0718
+                except Exception as e:
+                    printing.log_info(f"Error processing task {task_id}: {str(e)}")
+                    continue
+
+        # Save predictions and evaluate
+        pred_path = os.path.join(results_dir, "humaneval_predictions.jsonl")
+        write_jsonl(pred_path, samples)
+        printing.log_info(f"Results saved in: {results_dir}")
+
+        # Run functional correctness evaluation
+        k_values = [k_samples]
+        results = evaluate_functional_correctness(
+            pred_path,
+            k_values,
+            n_workers=1,
+            timeout=timeout,
+            problem_file=data_path,
+            ignore_incomplete=ignore_incomplete,
+        )
+        return results
+
+    def _save_metrics(self, state: State, results: Dict[str, float]) -> None:
+        """Save evaluation metrics to state."""
+        for metric, value in results.items():
+            metric_name = f"humaneval_{metric}"
+            state.save_stat(
+                metric_name, float(value) * 100 if value is not None else None
+            )
+            state.save_stat(f"{metric_name}_units", "%")
+            self.status_stats.append(metric_name)
+
+
+# This file was originally licensed under Apache 2.0. It has been modified.
+# Modifications Copyright (c) 2025 AMD
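
The pass@k values that `_save_metrics` records are produced by `evaluate_functional_correctness` from the `human_eval` package. For readers unfamiliar with the metric, the sketch below reproduces the standard unbiased pass@k estimator from the HumanEval paper; it is illustrative only and not part of this wheel.

```python
import numpy as np


def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k estimate: n completions were generated for a task,
    c of them passed the unit tests, and k is the k in pass@k."""
    if n - c < k:
        return 1.0
    # 1 - C(n-c, k) / C(n, k), evaluated as a numerically stable product
    return 1.0 - float(np.prod(1.0 - k / np.arange(n - c + 1, n + 1)))


# Example: 1 of 10 completions passed -> pass@1 estimate of 0.1
print(pass_at_k(n=10, c=1, k=1))
```

Per-task completions are written to `humaneval_predictions.jsonl`, which `evaluate_functional_correctness` then scores against the dataset's unit tests to produce these metrics.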

lemonade/tools/llamacpp.py
@@ -0,0 +1,261 @@
+import argparse
+import os
+from typing import Optional
+import subprocess
+from lemonade.state import State
+import lemonade.common.status as status
+from lemonade.tools import FirstTool
+from lemonade.tools.adapter import PassthroughTokenizer, ModelAdapter
+from lemonade.cache import Keys
+from lemonade.tools.huggingface_load import get_base_model
+
+
+class LlamaCppAdapter(ModelAdapter):
+    def __init__(self, model, output_tokens, context_size, threads, executable):
+        super().__init__()
+
+        self.model = os.path.normpath(model)
+        self.output_tokens = output_tokens
+        self.context_size = context_size
+        self.threads = threads
+        self.executable = os.path.normpath(executable)
+
+    def generate(
+        self,
+        input_ids: str,
+        max_new_tokens: Optional[int] = None,
+        temperature: float = 0.8,
+        top_p: float = 0.95,
+        top_k: int = 40,
+        return_raw: bool = False,
+        **kwargs,  # pylint: disable=unused-argument
+    ):
+        """
+        Pass a text prompt into the llamacpp inference CLI.
+
+        The input_ids arg here should receive the original text that
+        would normally be encoded by a tokenizer.
+
+        Args:
+            input_ids: The input text prompt
+            max_new_tokens: Maximum number of tokens to generate
+            temperature: Temperature for sampling (0.0 = greedy)
+            top_p: Top-p sampling threshold
+            top_k: Top-k sampling threshold
+            return_raw: If True, returns the complete raw output including timing info
+            **kwargs: Additional arguments (ignored)
+
+        Returns:
+            List containing a single string with the generated text, or raw output if
+            return_raw=True
+        """
+
+        prompt = input_ids
+        n_predict = max_new_tokens if max_new_tokens is not None else self.output_tokens
+
+        cmd = [
+            self.executable,
+            "-m",
+            self.model,
+            "--ctx-size",
+            str(self.context_size),
+            "-n",
+            str(n_predict),
+            "-t",
+            str(self.threads),
+            "-p",
+            prompt,
+            "--temp",
+            str(temperature),
+            "--top-p",
+            str(top_p),
+            "--top-k",
+            str(top_k),
+            "-e",
+            "-no-cnv",
+        ]
+
+        cmd = [str(m) for m in cmd]
+
+        try:
+            process = subprocess.Popen(
+                cmd,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                universal_newlines=True,
+                encoding="utf-8",
+                errors="replace",
+            )
+
+            raw_output, stderr = process.communicate(timeout=600)
+            if process.returncode != 0:
+                error_msg = f"llama.cpp failed with return code {process.returncode}.\n"
+                error_msg += f"Command: {' '.join(cmd)}\n"
+                error_msg += f"Error output:\n{stderr}\n"
+                error_msg += f"Standard output:\n{raw_output}"
+                raise Exception(error_msg)
+
+            if raw_output is None:
+                raise Exception("No output received from llama.cpp process")
+
+            # Parse timing information
+            for line in raw_output.splitlines():
+                if "llama_perf_context_print: eval time =" in line:
+                    parts = line.split("(")[1].strip()
+                    parts = parts.split(",")
+                    ms_per_token = float(parts[0].split("ms per token")[0].strip())
+                    self.tokens_per_second = (
+                        1000 / ms_per_token if ms_per_token > 0 else 0
+                    )
+                if "llama_perf_context_print: prompt eval time =" in line:
+                    parts = line.split("=")[1].split("/")[0]
+                    time_to_first_token_ms = float(parts.split("ms")[0].strip())
+                    self.time_to_first_token = time_to_first_token_ms / 1000
+
+            if return_raw:
+                return [raw_output, stderr]
+
+            # Find where the prompt ends and the generated text begins
+            prompt_found = False
+            output_text = ""
+            prompt_first_line = prompt.split("\n")[0]
+            for line in raw_output.splitlines():
+                if prompt_first_line in line:
+                    prompt_found = True
+                if prompt_found:
+                    line = line.replace("</s> [end of text]", "")
+                    output_text = output_text + line
+
+            if not prompt_found:
+                raise Exception(
+                    f"Could not find prompt '{prompt_first_line}' in llama.cpp output. "
+                    "This usually means the model failed to process the prompt correctly.\n"
+                    f"Raw output:\n{raw_output}\n"
+                    f"Stderr:\n{stderr}"
+                )
+
+            # Return list containing the generated text
+            return [output_text]
+
+        except Exception as e:
+            error_msg = f"Failed to run llama.cpp command: {str(e)}\n"
+            error_msg += f"Command: {' '.join(cmd)}"
+            raise Exception(error_msg)
+
+
+class LoadLlamaCpp(FirstTool):
+    unique_name = "load-llama-cpp"
+
+    def __init__(self):
+        super().__init__(monitor_message="Loading llama.cpp model")
+
+    @staticmethod
+    def parser(add_help: bool = True) -> argparse.ArgumentParser:
+        parser = __class__.helpful_parser(
+            short_description="Wrap llama.cpp models with an API",
+            add_help=add_help,
+        )
+
+        parser.add_argument(
+            "--executable",
+            required=True,
+            type=str,
+            help="Path to the llama.cpp executable (e.g., llama-cli or llama-cli.exe)",
+        )
+
+        default_threads = 1
+        parser.add_argument(
+            "--threads",
+            required=False,
+            type=int,
+            default=default_threads,
+            help=f"Number of threads to use for generation (default: {default_threads})",
+        )
+
+        context_size = 512
+        parser.add_argument(
+            "--context-size",
+            required=False,
+            type=int,
+            default=context_size,
+            help=f"Context size of the prompt (default: {context_size})",
+        )
+
+        output_tokens = 512
+        parser.add_argument(
+            "--output-tokens",
+            required=False,
+            type=int,
+            default=output_tokens,
+            help=f"Maximum number of output tokens the LLM should make (default: {output_tokens})",
+        )
+
+        parser.add_argument(
+            "--model-binary",
+            required=True,
+            type=str,
+            help="Path to a .gguf model file",
+        )
+
+        return parser
+
+    def run(
+        self,
+        state: State,
+        input: str = "",
+        context_size: int = 512,
+        threads: int = 1,
+        output_tokens: int = 512,
+        model_binary: Optional[str] = None,
+        executable: str = None,
+    ) -> State:
+        """
+        Load a llama.cpp model
+        """
+
+        if executable is None:
+            raise Exception(f"{self.__class__.unique_name} requires an executable path")
+
+        # Convert paths to platform-specific format
+        executable = os.path.normpath(executable)
+
+        if model_binary:
+            model_to_use = os.path.normpath(model_binary)
+        else:
+            model_binary = input
+            model_to_use = os.path.normpath(model_binary) if model_binary else None
+
+        if not model_binary:
+            model_to_use = state.get(Keys.MODEL)
+
+        if model_to_use is None:
+            raise Exception(
+                f"{self.__class__.unique_name} requires the preceding tool to pass a "
+                "Llamacpp model, "
+                "or for the user to supply a model with `--model-binary`"
+            )
+
+        state.model = LlamaCppAdapter(
+            model=model_to_use,
+            output_tokens=output_tokens,
+            context_size=context_size,
+            threads=threads,
+            executable=executable,
+        )
+        state.tokenizer = PassthroughTokenizer()
+
+        # Save stats about the model
+        state.save_stat(Keys.CHECKPOINT, model_to_use)
+
+        # Get base model information if this is a converted HF model
+        base_model = get_base_model(input)
+        if base_model is not None:
+            state.save_stat("base_model", base_model)
+
+        status.add_to_state(state=state, name=input, model=model_to_use)
+
+        return state
+
+
+# This file was originally licensed under Apache 2.0. It has been modified.
+# Modifications Copyright (c) 2025 AMD
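
Both `LlamaCppAdapter.generate` above and the benchmark tool that follows recover their performance numbers by scraping llama.cpp's `llama_perf_context_print` summary rather than timing the subprocess themselves. The standalone sketch below walks through that string slicing against illustrative log lines (real llama.cpp output may differ in padding and field wording):

```python
# Illustrative perf lines; not verbatim llama.cpp output.
eval_line = (
    "llama_perf_context_print: eval time = 1200.00 ms / 120 runs "
    "(10.00 ms per token, 100.00 tokens per second)"
)
prompt_line = (
    "llama_perf_context_print: prompt eval time = 150.00 ms / 15 tokens "
    "(10.00 ms per token, 100.00 tokens per second)"
)

# Decode speed: read the "ms per token" figure inside the parentheses.
ms_per_token = float(
    eval_line.split("(")[1].split(",")[0].split("ms per token")[0].strip()
)
tokens_per_second = 1000 / ms_per_token if ms_per_token > 0 else 0  # 100.0

# Time to first token: read the prompt eval time before the "/".
ttft_ms = float(prompt_line.split("=")[1].split("/")[0].split("ms")[0].strip())
time_to_first_token = ttft_ms / 1000  # 0.15 seconds
```

The benchmark tool below applies the same slicing to both stdout and stderr and raises an error if neither stream contains the timing lines.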

lemonade/tools/llamacpp_bench.py
@@ -0,0 +1,154 @@
+import argparse
+import statistics
+from statistics import StatisticsError
+from lemonade.state import State
+from lemonade.cache import Keys
+from lemonade.tools.llamacpp import LlamaCppAdapter
+from lemonade.tools.bench import Bench
+
+
+class LlamaCppBench(Bench):
+
+    unique_name = "llama-cpp-bench"
+
+    def __init__(self):
+        super().__init__()
+
+        # Additional statistics generated by this bench tool
+        self.status_stats += [
+            Keys.STD_DEV_TOKENS_PER_SECOND,
+        ]
+        self.std_dev_token_generation_tokens_per_second_list = []
+
+    @staticmethod
+    def parser(add_help: bool = True) -> argparse.ArgumentParser:
+        parser = __class__.helpful_parser(
+            short_description="Benchmark a llama.cpp model",
+            add_help=add_help,
+        )
+
+        parser = Bench.parser(parser)
+
+        return parser
+
+    def run_prompt(
+        self,
+        state: State,
+        report_progress_fn,
+        prompt: str,
+        iterations: int,
+        warmup_iterations: int,
+        output_tokens: int,
+    ) -> State:
+        """
+        Benchmark llama.cpp model that was loaded by LoadLlamaCpp.
+        """
+
+        if self.first_run_prompt:
+
+            if not hasattr(state, "model") or not isinstance(
+                state.model, LlamaCppAdapter
+            ):
+                raise Exception(
+                    f"{self.__class__.unique_name} requires a LlamaCppAdapter model to be "
+                    "loaded first. Please run load-llama-cpp before this tool."
+                )
+
+        iteration_tokens_per_second = []
+        iteration_time_to_first_token = []
+
+        for iteration in range(iterations + warmup_iterations):
+            try:
+                # Use the adapter's generate method which already has the timeout
+                # and error handling
+                raw_output, stderr = state.model.generate(prompt, return_raw=True)
+
+                # Parse the timing information from the output
+                ms_per_token = None
+                time_to_first_token_ms = None
+                input_tokens = None
+
+                # Look for timing in both stdout and stderr
+                for output in [raw_output, stderr]:
+                    for line in output.splitlines():
+                        if "llama_perf_context_print: eval time =" in line:
+                            parts = line.split("(")[1].strip()
+                            parts = parts.split(",")
+                            ms_per_token = float(
+                                parts[0].split("ms per token")[0].strip()
+                            )
+                        if "llama_perf_context_print: prompt eval time =" in line:
+                            parts = line.split("=")[1].split("/")
+                            time_to_first_token_ms = float(
+                                parts[0].split("ms")[0].strip()
+                            )
+                            input_tokens = int(parts[1].split("tokens")[0].strip())
+
+                if ms_per_token is None or time_to_first_token_ms is None:
+                    error_msg = (
+                        "Could not find timing information in llama.cpp output.\n"
+                    )
+                    error_msg += "Raw output:\n" + raw_output + "\n"
+                    error_msg += "Stderr:\n" + stderr
+                    raise Exception(error_msg)
+
+                # When output_tokens is set to 1 for accuracy tests, ms_per_token tends to 0
+                # and causes a divide-by-zero error. Set tokens_per_second to 0 in such cases
+                # as performance data for generating a few tokens is not relevant.
+                tokens_per_second = 0
+                if output_tokens > 5 and ms_per_token > 0:
+                    tokens_per_second = 1000 / ms_per_token
+                time_to_first_token = time_to_first_token_ms / 1000
+
+                if iteration > warmup_iterations - 1:
+                    iteration_tokens_per_second.append(tokens_per_second)
+                    iteration_time_to_first_token.append(time_to_first_token)
+
+                report_progress_fn((iteration + 1) / (warmup_iterations + iterations))
+
+            except Exception as e:
+                error_msg = f"Failed to run benchmark: {str(e)}"
+                raise Exception(error_msg)
+
+        self.input_ids_len_list.append(input_tokens)
+        mean_time_to_first_token = statistics.mean(iteration_time_to_first_token)
+        self.mean_time_to_first_token_list.append(mean_time_to_first_token)
+        self.prefill_tokens_per_second_list.append(
+            input_tokens / mean_time_to_first_token
+        )
+        self.token_generation_tokens_per_second_list.append(
+            statistics.mean(iteration_tokens_per_second)
+        )
+        try:
+            self.std_dev_time_to_first_token_list.append(
+                statistics.stdev(iteration_time_to_first_token)
+            )
+        except StatisticsError:
+            # Less than 2 measurements
+            self.std_dev_time_to_first_token_list.append(None)
+        try:
+            self.std_dev_token_generation_tokens_per_second_list.append(
+                statistics.stdev(iteration_tokens_per_second)
+            )
+        except StatisticsError:
+            # Less than 2 measurements
+            self.std_dev_token_generation_tokens_per_second_list.append(None)
+
+    def save_stats(self, state):
+        super().save_stats(state)
+
+        # Save additional statistics
+        if not all(
+            element is None
+            for element in self.std_dev_token_generation_tokens_per_second_list
+        ):
+            state.save_stat(
+                Keys.STD_DEV_TOKENS_PER_SECOND,
+                self.get_item_or_list(
+                    self.std_dev_token_generation_tokens_per_second_list
+                ),
+            )
+
+
+# This file was originally licensed under Apache 2.0. It has been modified.
+# Modifications Copyright (c) 2025 AMD
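
After the measurement loop in `run_prompt`, each per-iteration list is reduced to a mean plus a standard deviation, and the standard deviation falls back to `None` whenever fewer than two post-warmup iterations ran. A minimal self-contained sketch of that aggregation pattern (the `summarize` helper is hypothetical, not part of the package):

```python
import statistics
from statistics import StatisticsError
from typing import List, Optional, Tuple


def summarize(samples: List[float]) -> Tuple[float, Optional[float]]:
    """Mean and standard deviation of benchmark samples; statistics.stdev
    needs at least two values, so report None for single-iteration runs."""
    mean = statistics.mean(samples)
    try:
        std_dev = statistics.stdev(samples)
    except StatisticsError:
        std_dev = None  # fewer than 2 measurements
    return mean, std_dev


print(summarize([101.3]))        # (101.3, None)
print(summarize([101.3, 98.7]))  # (100.0, ~1.84)
```

This is also why `save_stats` guards with `all(element is None ...)`: the standard-deviation stat is only recorded when at least one prompt produced a real spread.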