lemonade_sdk-7.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of lemonade-sdk might be problematic.

Files changed (61)
  1. lemonade/__init__.py +5 -0
  2. lemonade/api.py +125 -0
  3. lemonade/cache.py +85 -0
  4. lemonade/cli.py +135 -0
  5. lemonade/common/__init__.py +0 -0
  6. lemonade/common/analyze_model.py +26 -0
  7. lemonade/common/build.py +223 -0
  8. lemonade/common/cli_helpers.py +139 -0
  9. lemonade/common/exceptions.py +98 -0
  10. lemonade/common/filesystem.py +368 -0
  11. lemonade/common/labels.py +61 -0
  12. lemonade/common/onnx_helpers.py +176 -0
  13. lemonade/common/plugins.py +10 -0
  14. lemonade/common/printing.py +110 -0
  15. lemonade/common/status.py +490 -0
  16. lemonade/common/system_info.py +390 -0
  17. lemonade/common/tensor_helpers.py +83 -0
  18. lemonade/common/test_helpers.py +28 -0
  19. lemonade/profilers/__init__.py +1 -0
  20. lemonade/profilers/memory_tracker.py +257 -0
  21. lemonade/profilers/profiler.py +55 -0
  22. lemonade/sequence.py +363 -0
  23. lemonade/state.py +159 -0
  24. lemonade/tools/__init__.py +1 -0
  25. lemonade/tools/adapter.py +104 -0
  26. lemonade/tools/bench.py +284 -0
  27. lemonade/tools/huggingface_bench.py +267 -0
  28. lemonade/tools/huggingface_load.py +520 -0
  29. lemonade/tools/humaneval.py +258 -0
  30. lemonade/tools/llamacpp.py +261 -0
  31. lemonade/tools/llamacpp_bench.py +154 -0
  32. lemonade/tools/management_tools.py +273 -0
  33. lemonade/tools/mmlu.py +327 -0
  34. lemonade/tools/ort_genai/__init__.py +0 -0
  35. lemonade/tools/ort_genai/oga.py +1129 -0
  36. lemonade/tools/ort_genai/oga_bench.py +142 -0
  37. lemonade/tools/perplexity.py +146 -0
  38. lemonade/tools/prompt.py +228 -0
  39. lemonade/tools/quark/__init__.py +0 -0
  40. lemonade/tools/quark/quark_load.py +172 -0
  41. lemonade/tools/quark/quark_quantize.py +439 -0
  42. lemonade/tools/report/__init__.py +0 -0
  43. lemonade/tools/report/llm_report.py +203 -0
  44. lemonade/tools/report/table.py +739 -0
  45. lemonade/tools/server/__init__.py +0 -0
  46. lemonade/tools/server/serve.py +1354 -0
  47. lemonade/tools/server/tool_calls.py +146 -0
  48. lemonade/tools/tool.py +374 -0
  49. lemonade/version.py +1 -0
  50. lemonade_install/__init__.py +1 -0
  51. lemonade_install/install.py +774 -0
  52. lemonade_sdk-7.0.0.dist-info/METADATA +116 -0
  53. lemonade_sdk-7.0.0.dist-info/RECORD +61 -0
  54. lemonade_sdk-7.0.0.dist-info/WHEEL +5 -0
  55. lemonade_sdk-7.0.0.dist-info/entry_points.txt +4 -0
  56. lemonade_sdk-7.0.0.dist-info/licenses/LICENSE +201 -0
  57. lemonade_sdk-7.0.0.dist-info/licenses/NOTICE.md +21 -0
  58. lemonade_sdk-7.0.0.dist-info/top_level.txt +3 -0
  59. lemonade_server/cli.py +260 -0
  60. lemonade_server/model_manager.py +98 -0
  61. lemonade_server/server_models.json +142 -0
lemonade/tools/huggingface_bench.py
@@ -0,0 +1,267 @@
+ import argparse
+ from typing import List, Tuple
+ import time
+ import statistics
+ from statistics import StatisticsError
+ from contextlib import nullcontext
+ import torch
+ from lemonade.state import State
+ from lemonade.cache import Keys
+ from lemonade.tools.bench import Bench
+
+ default_beams = 1
+
+
+ def benchmark_huggingface_llm(
+     model: torch.nn.Module,
+     tokenizer,
+     input_ids,
+     dtype,
+     num_beams: int,
+     target_output_tokens: int,
+     iterations: int,
+     warmup_iterations: int,
+     report_progress_fn,
+ ) -> List[Tuple[float, int]]:
+
+     amp_enabled = True if (dtype == torch.float16 or dtype == torch.bfloat16) else False
+     # The "if amp_enabled else nullcontext()" is to get around a bug in PyTorch 2.1
+     # where torch.cpu.amp.autocast(enabled=False) does nothing
+     with (
+         torch.cpu.amp.autocast(enabled=amp_enabled, dtype=dtype)
+         if amp_enabled
+         else nullcontext()
+     ):
+
+         per_iteration_result = []
+         tokens_out_len_list = []
+
+         # Early stopping is only a valid parameter with multiple beams
+         early_stopping = num_beams > 1
+
+         with torch.no_grad(), torch.inference_mode():
+             # Don't capture time for warmup
+             for count in range(warmup_iterations):
+                 outputs = model.generate(
+                     input_ids,
+                     num_beams=num_beams,
+                     max_new_tokens=target_output_tokens,
+                     min_new_tokens=target_output_tokens,
+                     early_stopping=early_stopping,
+                     pad_token_id=tokenizer.eos_token_id,
+                 )
+                 tokens_out_len_list.append(outputs.shape[1] - input_ids.shape[1])
+                 report_progress_fn((count + 1) / (warmup_iterations + iterations))
+
+             for count in range(iterations):
+                 # CUDA synchronization is required prior to GPU benchmarking
+                 # This has no negative effect on CPU-only benchmarks, and is more robust than
+                 # checking `model.device == "cuda"` since it applies to multi-GPU environments
+                 # Synchronization is done before collecting the start time because this will
+                 # ensure that the GPU has finished initialization tasks such as loading weights
+                 if torch.cuda.is_available():
+                     torch.cuda.synchronize()
+                 start_time = time.perf_counter()
+
+                 outputs = model.generate(
+                     input_ids,
+                     num_beams=num_beams,
+                     max_new_tokens=target_output_tokens,
+                     min_new_tokens=target_output_tokens,
+                     early_stopping=early_stopping,
+                     pad_token_id=tokenizer.eos_token_id,
+                 )
+
+                 if torch.cuda.is_available():
+                     torch.cuda.synchronize()
+                 end_time = time.perf_counter()
+
+                 latency = end_time - start_time
+
+                 token_len = outputs.shape[1] - input_ids.shape[1]
+                 tokens_out_len_list.append(token_len)
+
+                 # Only count an iteration if it produced enough tokens
+                 if token_len >= target_output_tokens:
+                     per_iteration_result.append((latency, token_len))
+
+                 report_progress_fn(
+                     (warmup_iterations + count + 1) / (warmup_iterations + iterations)
+                 )
+
+         if not per_iteration_result:
+             raise Bench.not_enough_tokens(target_output_tokens)
+
+         return per_iteration_result, tokens_out_len_list
+
+
+ class HuggingfaceBench(Bench):
+     """
+     Benchmarks the performance of the generate() method of an LLM loaded from
+     Huggingface Transformers (or any object that supports a
+     huggingface-like generate() method).
+
+     Required input state:
+         - DTYPE: data type of the model; used to determine if AMP should be
+             enabled to convert the input data type to match the model data
+             type.
+         - MODEL: huggingface-like instance to benchmark.
+         - INPUTS: model inputs to pass to generate() during benchmarking.
+
+     Output state produced: None
+
+     """
+
+     unique_name = "huggingface-bench"
+
+     @staticmethod
+     def parser(parser: argparse.ArgumentParser = None, add_help: bool = True):
+         # Allow inherited classes to initialize and pass in a parser, add parameters to it if so
+         if parser is None:
+             parser = __class__.helpful_parser(
+                 short_description="Benchmark a huggingface-style PyTorch LLM",
+                 add_help=add_help,
+             )
+
+         parser = Bench.parser(parser)
+
+         parser.add_argument(
+             "--num-beams",
+             required=False,
+             type=int,
+             default=default_beams,
+             help=f"Number of beams for the LLM to use (default: {default_beams})",
+         )
+
+         return parser
+
+     def get_prompt_str(self, state, token_length):
+         """
+         Returns a string with the prescribed token length.
+         """
+         model = state.model
+         tokenizer = state.tokenizer
+         test_prompt = "word " * (token_length - 2)
+         input_ids = (
+             tokenizer(test_prompt, return_tensors="pt")
+             .to(device=model.device)
+             .input_ids
+         )
+         test_token_length = input_ids.shape[1]
+         delta = test_token_length - token_length
+         if delta == 0:
+             return test_prompt
+         return "word " * max(token_length - 2 - delta, 0)
+
+     def run_prompt(
+         self,
+         state: State,
+         report_progress_fn,
+         prompt: str,
+         iterations: int,
+         warmup_iterations: int,
+         output_tokens: int,
+         num_beams: int = default_beams,
+     ) -> State:
+         """
+         We don't have access to the internal timings of generate(), so time to first
+         token (TTFT, aka prefill latency) and token/s are calculated using the following formulae:
+             prefill_latency = latency of generate(output_tokens=1)
+             execution_latency = latency of generate(output_tokens=output_tokens)
+             tokens_per_second = (new_tokens - 1) / (execution_latency - prefill_latency)
+         """
+
+         if self.first_run_prompt:
+             if vars(state).get(Keys.MODEL) is None:
+                 raise ValueError(
+                     f"{self.__class__.__name__} requires that a model be passed from another tool"
+                 )
+             if (
+                 vars(state).get("num_beams")
+                 and vars(state).get("num_beams") != num_beams
+             ):
+                 raise ValueError(
+                     f"Number of beams was set to {vars(state).get('num_beams')} "
+                     f"in a previous tool, but it is set to {num_beams} in "
+                     "this tool. The values must be the same."
+                 )
+
+         # Save benchmarking parameters
+         state.save_stat("num_beams", num_beams)
+
+         model = state.model
+         tokenizer = state.tokenizer
+         dtype = state.dtype
+
+         # Generate the input_ids outside the benchmarking function to make sure
+         # the same input_ids are used everywhere
+         input_ids = (
+             tokenizer(prompt, return_tensors="pt").to(device=model.device).input_ids
+         )
+         self.input_ids_len_list.append(input_ids.shape[1])
+
+         prefill_report_progress_fn = lambda x: report_progress_fn(0.5 * x)
+
+         # Benchmark prefill time (time to first token)
+         prefill_per_iteration_result, tokens_out_len_list = benchmark_huggingface_llm(
+             model=model,
+             tokenizer=tokenizer,
+             input_ids=input_ids,
+             dtype=dtype,
+             num_beams=num_beams,
+             target_output_tokens=1,
+             iterations=iterations,
+             warmup_iterations=warmup_iterations,
+             report_progress_fn=prefill_report_progress_fn,
+         )
+         self.tokens_out_len_list += tokens_out_len_list
+
+         time_to_first_token_per_iteration = [
+             latency for latency, _ in prefill_per_iteration_result
+         ]
+         mean_time_to_first_token = statistics.mean(time_to_first_token_per_iteration)
+         self.mean_time_to_first_token_list.append(mean_time_to_first_token)
+         self.prefill_tokens_per_second_list.append(
+             input_ids.shape[1] / mean_time_to_first_token
+         )
+         try:
+             self.std_dev_time_to_first_token_list.append(
+                 statistics.stdev(time_to_first_token_per_iteration)
+             )
+         except StatisticsError:
+             # Less than 2 measurements
+             self.std_dev_time_to_first_token_list.append(None)
+
+         decode_report_progress_fn = lambda x: report_progress_fn(0.5 + 0.5 * x)
+
+         # Benchmark generation of all tokens
+         decode_per_iteration_result, tokens_out_len_list = benchmark_huggingface_llm(
+             model=model,
+             tokenizer=tokenizer,
+             input_ids=input_ids,
+             dtype=dtype,
+             num_beams=num_beams,
+             target_output_tokens=output_tokens,
+             iterations=iterations,
+             warmup_iterations=warmup_iterations,
+             report_progress_fn=decode_report_progress_fn,
+         )
+         self.tokens_out_len_list += tokens_out_len_list
+
+         execution_latency_per_iteration = [
+             latency for latency, _ in decode_per_iteration_result
+         ]
+         token_len_per_iteration = [
+             token_len for _, token_len in decode_per_iteration_result
+         ]
+         mean_execution_latency = statistics.mean(execution_latency_per_iteration)
+         mean_decode_latency = mean_execution_latency - mean_time_to_first_token
+         mean_token_len = statistics.mean(token_len_per_iteration)
+         # Subtract 1 so that we don't count the prefill token
+         self.token_generation_tokens_per_second_list.append(
+             (mean_token_len - 1) / mean_decode_latency
+         )
+
+
+ # This file was originally licensed under Apache 2.0. It has been modified.
+ # Modifications Copyright (c) 2025 AMD
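
Note: the following is a minimal usage sketch, not part of the package diff above. It shows how the standalone benchmark_huggingface_llm() helper could be driven directly with a small Hugging Face model; the model name, prompt, token counts, and iteration counts are illustrative assumptions, and the import path follows the file layout listed above.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from lemonade.tools.huggingface_bench import benchmark_huggingface_llm

# Assumption: any small causal LM with an EOS token works for a quick smoke test
model_name = "facebook/opt-125m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32)

input_ids = tokenizer("Hello, my name is", return_tensors="pt").input_ids

# Returns (per_iteration_result, tokens_out_len_list); each per-iteration entry
# is a (latency_seconds, new_token_count) tuple
results, token_counts = benchmark_huggingface_llm(
    model=model,
    tokenizer=tokenizer,
    input_ids=input_ids,
    dtype=torch.float32,  # float32 keeps AMP disabled inside the helper
    num_beams=1,
    target_output_tokens=32,
    iterations=3,
    warmup_iterations=1,
    report_progress_fn=lambda fraction: None,  # no-op progress callback
)

for latency, new_tokens in results:
    print(f"{new_tokens} new tokens in {latency:.3f} s")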
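
The token/s formula in run_prompt()'s docstring can be sanity-checked with made-up numbers; everything below is illustrative arithmetic, not data from the package.

# Illustrative numbers only
mean_time_to_first_token = 0.20  # s, mean latency of generate() asked for 1 new token
mean_execution_latency = 2.20    # s, mean latency of generate() asked for all new tokens
mean_token_len = 101             # mean number of new tokens actually produced

# Decode-only latency excludes the prefill pass
mean_decode_latency = mean_execution_latency - mean_time_to_first_token  # 2.00 s

# Subtract 1 so the first (prefill) token is not counted as a decoded token
tokens_per_second = (mean_token_len - 1) / mean_decode_latency  # 100 / 2.00

print(tokens_per_second)  # 50.0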