lemonade-sdk 7.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of lemonade-sdk might be problematic.
- lemonade/__init__.py +5 -0
- lemonade/api.py +125 -0
- lemonade/cache.py +85 -0
- lemonade/cli.py +135 -0
- lemonade/common/__init__.py +0 -0
- lemonade/common/analyze_model.py +26 -0
- lemonade/common/build.py +223 -0
- lemonade/common/cli_helpers.py +139 -0
- lemonade/common/exceptions.py +98 -0
- lemonade/common/filesystem.py +368 -0
- lemonade/common/labels.py +61 -0
- lemonade/common/onnx_helpers.py +176 -0
- lemonade/common/plugins.py +10 -0
- lemonade/common/printing.py +110 -0
- lemonade/common/status.py +490 -0
- lemonade/common/system_info.py +390 -0
- lemonade/common/tensor_helpers.py +83 -0
- lemonade/common/test_helpers.py +28 -0
- lemonade/profilers/__init__.py +1 -0
- lemonade/profilers/memory_tracker.py +257 -0
- lemonade/profilers/profiler.py +55 -0
- lemonade/sequence.py +363 -0
- lemonade/state.py +159 -0
- lemonade/tools/__init__.py +1 -0
- lemonade/tools/adapter.py +104 -0
- lemonade/tools/bench.py +284 -0
- lemonade/tools/huggingface_bench.py +267 -0
- lemonade/tools/huggingface_load.py +520 -0
- lemonade/tools/humaneval.py +258 -0
- lemonade/tools/llamacpp.py +261 -0
- lemonade/tools/llamacpp_bench.py +154 -0
- lemonade/tools/management_tools.py +273 -0
- lemonade/tools/mmlu.py +327 -0
- lemonade/tools/ort_genai/__init__.py +0 -0
- lemonade/tools/ort_genai/oga.py +1129 -0
- lemonade/tools/ort_genai/oga_bench.py +142 -0
- lemonade/tools/perplexity.py +146 -0
- lemonade/tools/prompt.py +228 -0
- lemonade/tools/quark/__init__.py +0 -0
- lemonade/tools/quark/quark_load.py +172 -0
- lemonade/tools/quark/quark_quantize.py +439 -0
- lemonade/tools/report/__init__.py +0 -0
- lemonade/tools/report/llm_report.py +203 -0
- lemonade/tools/report/table.py +739 -0
- lemonade/tools/server/__init__.py +0 -0
- lemonade/tools/server/serve.py +1354 -0
- lemonade/tools/server/tool_calls.py +146 -0
- lemonade/tools/tool.py +374 -0
- lemonade/version.py +1 -0
- lemonade_install/__init__.py +1 -0
- lemonade_install/install.py +774 -0
- lemonade_sdk-7.0.0.dist-info/METADATA +116 -0
- lemonade_sdk-7.0.0.dist-info/RECORD +61 -0
- lemonade_sdk-7.0.0.dist-info/WHEEL +5 -0
- lemonade_sdk-7.0.0.dist-info/entry_points.txt +4 -0
- lemonade_sdk-7.0.0.dist-info/licenses/LICENSE +201 -0
- lemonade_sdk-7.0.0.dist-info/licenses/NOTICE.md +21 -0
- lemonade_sdk-7.0.0.dist-info/top_level.txt +3 -0
- lemonade_server/cli.py +260 -0
- lemonade_server/model_manager.py +98 -0
- lemonade_server/server_models.json +142 -0
lemonade/tools/huggingface_bench.py
@@ -0,0 +1,267 @@
import argparse
from typing import List, Tuple
import time
import statistics
from statistics import StatisticsError
from contextlib import nullcontext
import torch
from lemonade.state import State
from lemonade.cache import Keys
from lemonade.tools.bench import Bench

default_beams = 1


def benchmark_huggingface_llm(
    model: torch.nn.Module,
    tokenizer,
    input_ids,
    dtype,
    num_beams: int,
    target_output_tokens: int,
    iterations: int,
    warmup_iterations: int,
    report_progress_fn,
) -> Tuple[List[Tuple[float, int]], List[int]]:

    amp_enabled = True if (dtype == torch.float16 or dtype == torch.bfloat16) else False
    # The "if amp_enabled else nullcontext()" is to get around a bug in PyTorch 2.1
    # where torch.cpu.amp.autocast(enabled=False) does nothing
    with (
        torch.cpu.amp.autocast(enabled=amp_enabled, dtype=dtype)
        if amp_enabled
        else nullcontext()
    ):

        per_iteration_result = []
        tokens_out_len_list = []

        # Early stopping is only a valid parameter with multiple beams
        early_stopping = num_beams > 1

        with torch.no_grad(), torch.inference_mode():
            # Don't capture time for warmup
            for count in range(warmup_iterations):
                outputs = model.generate(
                    input_ids,
                    num_beams=num_beams,
                    max_new_tokens=target_output_tokens,
                    min_new_tokens=target_output_tokens,
                    early_stopping=early_stopping,
                    pad_token_id=tokenizer.eos_token_id,
                )
                tokens_out_len_list.append(outputs.shape[1] - input_ids.shape[1])
                report_progress_fn((count + 1) / (warmup_iterations + iterations))

            for count in range(iterations):
                # CUDA synchronization is required prior to GPU benchmarking
                # This has no negative effect on CPU-only benchmarks, and is more robust than
                # checking `model.device == "cuda"` since it applies to multi-GPU environments
                # Synchronization is done before collecting the start time because this will
                # ensure that the GPU has finished initialization tasks such as loading weights
                if torch.cuda.is_available():
                    torch.cuda.synchronize()
                start_time = time.perf_counter()

                outputs = model.generate(
                    input_ids,
                    num_beams=num_beams,
                    max_new_tokens=target_output_tokens,
                    min_new_tokens=target_output_tokens,
                    early_stopping=early_stopping,
                    pad_token_id=tokenizer.eos_token_id,
                )

                if torch.cuda.is_available():
                    torch.cuda.synchronize()
                end_time = time.perf_counter()

                latency = end_time - start_time

                token_len = outputs.shape[1] - input_ids.shape[1]
                tokens_out_len_list.append(token_len)

                # Only count an iteration if it produced enough tokens
                if token_len >= target_output_tokens:
                    per_iteration_result.append((latency, token_len))

                report_progress_fn(
                    (warmup_iterations + count + 1) / (warmup_iterations + iterations)
                )

        if not per_iteration_result:
            raise Bench.not_enough_tokens(target_output_tokens)

        return per_iteration_result, tokens_out_len_list


class HuggingfaceBench(Bench):
    """
    Benchmarks the performance of the generate() method of an LLM loaded from
    Huggingface Transformers (or any object that supports a
    huggingface-like generate() method).

    Required input state:
        - DTYPE: data type of the model; used to determine if AMP should be
            enabled to convert the input data type to match the model data
            type.
        - MODEL: huggingface-like instance to benchmark.
        - INPUTS: model inputs to pass to generate() during benchmarking.

    Output state produced: None

    """

    unique_name = "huggingface-bench"

    @staticmethod
    def parser(parser: argparse.ArgumentParser = None, add_help: bool = True):
        # Allow inherited classes to initialize and pass in a parser, add parameters to it if so
        if parser is None:
            parser = __class__.helpful_parser(
                short_description="Benchmark a huggingface-style PyTorch LLM",
                add_help=add_help,
            )

        parser = Bench.parser(parser)

        parser.add_argument(
            "--num-beams",
            required=False,
            type=int,
            default=default_beams,
            help=f"Number of beams for the LLM to use (default: {default_beams})",
        )

        return parser

    def get_prompt_str(self, state, token_length):
        """
        Returns a string with the prescribed token length.
        """
        model = state.model
        tokenizer = state.tokenizer
        test_prompt = "word " * (token_length - 2)
        input_ids = (
            tokenizer(test_prompt, return_tensors="pt")
            .to(device=model.device)
            .input_ids
        )
        test_token_length = input_ids.shape[1]
        delta = test_token_length - token_length
        if delta == 0:
            return test_prompt
        return "word " * max(token_length - 2 - delta, 0)

    def run_prompt(
        self,
        state: State,
        report_progress_fn,
        prompt: str,
        iterations: int,
        warmup_iterations: int,
        output_tokens: int,
        num_beams: int = default_beams,
    ) -> State:
        """
        We don't have access to the internal timings of generate(), so time to first
        token (TTFT, aka prefill latency) and token/s are calculated using the following formulae:
            prefill_latency = latency of generate(output_tokens=1)
            execution_latency = latency of generate(output_tokens=output_tokens)
            tokens_per_second = (new_tokens - 1) / (execution_latency - prefill_latency)
        """

        if self.first_run_prompt:
            if vars(state).get(Keys.MODEL) is None:
                raise ValueError(
                    f"{self.__class__.__name__} requires that a model be passed from another tool"
                )
            if (
                vars(state).get("num_beams")
                and vars(state).get("num_beams") != num_beams
            ):
                raise ValueError(
                    f"Number of beams was set to {vars(state).get('num_beams')} "
                    f"in a previous tool, but it is set to {num_beams} in "
                    "this tool. The values must be the same."
                )

        # Save benchmarking parameters
        state.save_stat("num_beams", num_beams)

        model = state.model
        tokenizer = state.tokenizer
        dtype = state.dtype

        # Generate the input_ids outside the benchmarking function to make sure
        # the same input_ids are used everywhere
        input_ids = (
            tokenizer(prompt, return_tensors="pt").to(device=model.device).input_ids
        )
        self.input_ids_len_list.append(input_ids.shape[1])

        prefill_report_progress_fn = lambda x: report_progress_fn(0.5 * x)

        # Benchmark prefill time (time to first token)
        prefill_per_iteration_result, tokens_out_len_list = benchmark_huggingface_llm(
            model=model,
            tokenizer=tokenizer,
            input_ids=input_ids,
            dtype=dtype,
            num_beams=num_beams,
            target_output_tokens=1,
            iterations=iterations,
            warmup_iterations=warmup_iterations,
            report_progress_fn=prefill_report_progress_fn,
        )
        self.tokens_out_len_list += tokens_out_len_list

        time_to_first_token_per_iteration = [
            latency for latency, _ in prefill_per_iteration_result
        ]
        mean_time_to_first_token = statistics.mean(time_to_first_token_per_iteration)
        self.mean_time_to_first_token_list.append(mean_time_to_first_token)
        self.prefill_tokens_per_second_list.append(
            input_ids.shape[1] / mean_time_to_first_token
        )
        try:
            self.std_dev_time_to_first_token_list.append(
                statistics.stdev(time_to_first_token_per_iteration)
            )
        except StatisticsError:
            # Less than 2 measurements
            self.std_dev_time_to_first_token_list.append(None)

        decode_report_progress_fn = lambda x: report_progress_fn(0.5 + 0.5 * x)

        # Benchmark generation of all tokens
        decode_per_iteration_result, tokens_out_len_list = benchmark_huggingface_llm(
            model=model,
            tokenizer=tokenizer,
            input_ids=input_ids,
            dtype=dtype,
            num_beams=num_beams,
            target_output_tokens=output_tokens,
            iterations=iterations,
            warmup_iterations=warmup_iterations,
            report_progress_fn=decode_report_progress_fn,
        )
        self.tokens_out_len_list += tokens_out_len_list

        execution_latency_per_iteration = [
            latency for latency, _ in decode_per_iteration_result
        ]
        token_len_per_iteration = [
            token_len for _, token_len in decode_per_iteration_result
        ]
        mean_execution_latency = statistics.mean(execution_latency_per_iteration)
        mean_decode_latency = mean_execution_latency - mean_time_to_first_token
        mean_token_len = statistics.mean(token_len_per_iteration)
        # Subtract 1 so that we don't count the prefill token
        self.token_generation_tokens_per_second_list.append(
            (mean_token_len - 1) / mean_decode_latency
        )


# This file was originally licensed under Apache 2.0. It has been modified.
# Modifications Copyright (c) 2025 AMD