lemonade-sdk 7.0.0 (lemonade_sdk-7.0.0-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of lemonade-sdk might be problematic.

Files changed (61)
  1. lemonade/__init__.py +5 -0
  2. lemonade/api.py +125 -0
  3. lemonade/cache.py +85 -0
  4. lemonade/cli.py +135 -0
  5. lemonade/common/__init__.py +0 -0
  6. lemonade/common/analyze_model.py +26 -0
  7. lemonade/common/build.py +223 -0
  8. lemonade/common/cli_helpers.py +139 -0
  9. lemonade/common/exceptions.py +98 -0
  10. lemonade/common/filesystem.py +368 -0
  11. lemonade/common/labels.py +61 -0
  12. lemonade/common/onnx_helpers.py +176 -0
  13. lemonade/common/plugins.py +10 -0
  14. lemonade/common/printing.py +110 -0
  15. lemonade/common/status.py +490 -0
  16. lemonade/common/system_info.py +390 -0
  17. lemonade/common/tensor_helpers.py +83 -0
  18. lemonade/common/test_helpers.py +28 -0
  19. lemonade/profilers/__init__.py +1 -0
  20. lemonade/profilers/memory_tracker.py +257 -0
  21. lemonade/profilers/profiler.py +55 -0
  22. lemonade/sequence.py +363 -0
  23. lemonade/state.py +159 -0
  24. lemonade/tools/__init__.py +1 -0
  25. lemonade/tools/adapter.py +104 -0
  26. lemonade/tools/bench.py +284 -0
  27. lemonade/tools/huggingface_bench.py +267 -0
  28. lemonade/tools/huggingface_load.py +520 -0
  29. lemonade/tools/humaneval.py +258 -0
  30. lemonade/tools/llamacpp.py +261 -0
  31. lemonade/tools/llamacpp_bench.py +154 -0
  32. lemonade/tools/management_tools.py +273 -0
  33. lemonade/tools/mmlu.py +327 -0
  34. lemonade/tools/ort_genai/__init__.py +0 -0
  35. lemonade/tools/ort_genai/oga.py +1129 -0
  36. lemonade/tools/ort_genai/oga_bench.py +142 -0
  37. lemonade/tools/perplexity.py +146 -0
  38. lemonade/tools/prompt.py +228 -0
  39. lemonade/tools/quark/__init__.py +0 -0
  40. lemonade/tools/quark/quark_load.py +172 -0
  41. lemonade/tools/quark/quark_quantize.py +439 -0
  42. lemonade/tools/report/__init__.py +0 -0
  43. lemonade/tools/report/llm_report.py +203 -0
  44. lemonade/tools/report/table.py +739 -0
  45. lemonade/tools/server/__init__.py +0 -0
  46. lemonade/tools/server/serve.py +1354 -0
  47. lemonade/tools/server/tool_calls.py +146 -0
  48. lemonade/tools/tool.py +374 -0
  49. lemonade/version.py +1 -0
  50. lemonade_install/__init__.py +1 -0
  51. lemonade_install/install.py +774 -0
  52. lemonade_sdk-7.0.0.dist-info/METADATA +116 -0
  53. lemonade_sdk-7.0.0.dist-info/RECORD +61 -0
  54. lemonade_sdk-7.0.0.dist-info/WHEEL +5 -0
  55. lemonade_sdk-7.0.0.dist-info/entry_points.txt +4 -0
  56. lemonade_sdk-7.0.0.dist-info/licenses/LICENSE +201 -0
  57. lemonade_sdk-7.0.0.dist-info/licenses/NOTICE.md +21 -0
  58. lemonade_sdk-7.0.0.dist-info/top_level.txt +3 -0
  59. lemonade_server/cli.py +260 -0
  60. lemonade_server/model_manager.py +98 -0
  61. lemonade_server/server_models.json +142 -0
lemonade/state.py ADDED
@@ -0,0 +1,159 @@
+ import os
+ import sys
+ from typing import Dict, Optional, Any
+ import yaml
+ import lemonade.common.build as build
+ import lemonade.common.filesystem as fs
+ from lemonade.version import __version__ as lemonade_version
+
+
+ def _is_nice_to_write(value):
+     """
+     Checks whether a value is nice to write to YAML.
+     Returns True if the value is a string, int, float, bool, list, dict, or tuple.
+     Returns False otherwise.
+     """
+     if isinstance(value, (str, int, float, bool)):
+         return True
+     elif isinstance(value, (list, tuple)):
+         # Check if all elements in the list are nice to write
+         return all(_is_nice_to_write(item) for item in value)
+     elif isinstance(value, dict):
+         # Check if all values in the dictionary are nice to write
+         return all(_is_nice_to_write(item) for item in value.values())
+     return False
+
+
+ def _sanitize_for_yaml(input_dict: Dict) -> Dict:
+     """
+     Creates a new dictionary containing only nice-to-write values
+     from the original dictionary.
+     """
+     result = {}
+     for key, value in input_dict.items():
+         if _is_nice_to_write(value):
+             result[key] = value
+     return result
+
+
+ class State:
+     """
+     The State class is meant to carry build state, starting with the user's
+     initial arguments, through each build Tool in the Sequence, and finally
+     to the disk, where it is used to assess cache hits.
+
+     State is initialized with the key members that are shared by every build,
+     and reasonable default values are assigned as appropriate.
+
+     Tool developers can also add any members they wish. To get or set a
+     member, reference it as an attribute:
+         1. get: `my_variable = state.attribute_name`
+         2. set: `state.attribute_name = my_variable`
+
+     Build State can be saved and loaded from disk in the form of a state.yaml file
+     via State.save() and load_state(), respectively. Note that while State can
+     contain members of any type, only YAML-safe members (str, int, bool, float,
+     list, dict, tuple) will be saved and loaded.
+     """
+
+     def __init__(
+         self,
+         cache_dir: str,
+         build_name: Optional[str] = None,
+         sequence_info: Dict[str, Dict] = None,
+         **kwargs,
+     ):
+
+         # The default build name is the name of the python file that calls build_model()
+         if build_name is None:
+             build_name = os.path.basename(sys.argv[0])
+
+         # Support "~" in the cache_dir argument
+         parsed_cache_dir = os.path.expanduser(cache_dir)
+
+         # Save settings as State members
+         self.cache_dir = parsed_cache_dir
+         self.build_name = build_name
+         self.sequence_info = sequence_info
+         self.lemonade_version = lemonade_version
+         self.build_status = build.FunctionStatus.NOT_STARTED
+         self.downcast_applied = False
+         self.uid = build.unique_id()
+         self.results = None
+
+         # Store any additional kwargs as members
+         for key, value in kwargs.items():
+             self.__dict__[key] = value
+
+     def __setattr__(self, name: str, value: Any) -> None:
+         """
+         Tool developers can add a new member to State by simply
+         assigning it as an attribute, i.e., `state.new_member = value`.
+         """
+         return super().__setattr__(name, value)
+
+     def save_stat(self, key: str, value):
+         """
+         Save a statistic to a YAML file in the build directory
+         """
+
+         stats = fs.Stats(self.cache_dir, self.build_name)
+         stats.save_stat(key, value)
+
+     def save_sub_stat(self, parent_key: str, key: str, value):
+         """
+         Save a nested statistic to a YAML file in the build directory
+         """
+
+         stats = fs.Stats(self.cache_dir, self.build_name)
+         stats.save_sub_stat(parent_key, key, value)
+
+     def save(self):
+         """
+         Save all YAML-friendly members to disk as a state.yaml file.
+
+         Note that `model` and `inputs` will typically not be saved, since
+         they are typically of non-YAML-friendly types such as `torch.nn.Module`
+         and `torch.Tensor`.
+         """
+
+         state_to_save = _sanitize_for_yaml(vars(self))
+
+         # Create a build directory in the cache
+         fs.make_build_dir(self.cache_dir, self.build_name)
+
+         with open(
+             build.state_file(self.cache_dir, self.build_name),
+             "w",
+             encoding="utf8",
+         ) as outfile:
+             yaml.dump(state_to_save, outfile)
+
+
+ def load_state(
+     cache_dir=None,
+     build_name=None,
+     state_path=None,
+ ) -> State:
+     """
+     Read a state.yaml file corresponding to a specific build in a specific
+     cache, and use its contents to initialize a State instance.
+     """
+
+     if state_path is not None:
+         file_path = state_path
+     elif build_name is not None and cache_dir is not None:
+         file_path = build.state_file(cache_dir, build_name)
+     else:
+         raise ValueError(
+             "This function requires either state_path, or both build_name "
+             "and cache_dir, to be set"
+         )
+
+     state_dict = build.load_yaml(file_path)
+
+     return State(**state_dict)
+
+
+ # This file was originally licensed under Apache 2.0. It has been modified.
+ # Modifications Copyright (c) 2025 AMD
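
Usage sketch (illustrative, not part of the diff): State round-trips YAML-safe build
metadata through a state.yaml file in the build directory via State.save() and
load_state(). A minimal sketch, assuming a writable cache directory; the cache path
and the custom attribute name below are made up for illustration:

    from lemonade.state import State, load_state

    # Create a build and attach a custom member (plain attribute assignment)
    state = State(cache_dir="~/.cache/lemonade", build_name="demo_build")
    state.my_custom_member = [1, 2, 3]  # YAML-safe, so save() will persist it

    # Persist all YAML-safe members to state.yaml in the build directory
    state.save()

    # Later, rehydrate the same build from disk
    restored = load_state(cache_dir="~/.cache/lemonade", build_name="demo_build")
    assert restored.my_custom_member == [1, 2, 3]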
lemonade/tools/__init__.py ADDED
@@ -0,0 +1 @@
+ from .tool import Tool, FirstTool, NiceHelpFormatter
lemonade/tools/adapter.py ADDED
@@ -0,0 +1,104 @@
+ import abc
+ from transformers import AutoTokenizer
+
+
+ class ModelAdapter(abc.ABC):
+     """
+     Base class for adapting an LLM to work with lemonade's standardized tools
+     """
+
+     def __init__(self):
+         """
+         Self-benchmarking ModelAdapters can store their results in the
+         tokens_per_second and time_to_first_token members.
+         """
+         self.tokens_per_second = None
+         self.time_to_first_token = None
+         self.type = "generic"
+
+     @abc.abstractmethod
+     def generate(self, input_ids, max_new_tokens=512):
+         """
+         Generate is the primary method required by lemonade's accuracy tools.
+
+         We try to keep the signature here minimal to allow for maximum compatibility
+         with recipe components, which themselves may not support many arguments.
+         """
+
+
+ class TokenizerAdapter(abc.ABC):
+     """
+     Base class for adapting an LLM's tokenizer to work with lemonade's standard tools
+     """
+
+     def __init__(self, tokenizer: AutoTokenizer = None):
+         self.auto_tokenizer = tokenizer
+
+     @abc.abstractmethod
+     def __call__(self, prompt: str):
+         """
+         Args:
+             prompt: text that should be encoded and passed to the LLM as input_ids
+
+         Returns: input_ids
+         """
+
+     @abc.abstractmethod
+     def decode(self, response) -> str:
+         """
+         Args:
+             response: tokens from the LLM that should be decoded into text
+
+         Returns: text response of the LLM
+         """
+
+     def apply_chat_template(self, *args, **kwargs):
+         """
+         Convert messages into a single tokenizable string
+         """
+         return self.auto_tokenizer.apply_chat_template(*args, **kwargs)
+
+     @property
+     def chat_template(self):
+         return self.auto_tokenizer.chat_template
+
+     @property
+     def eos_token(self):
+         return self.auto_tokenizer.eos_token
+
+
+ class PassthroughTokenizerResult:
+     """
+     Data structure for holding a tokenizer result where the input_ids
+     are packaged in a non-standard way, but we still want to adhere to
+     standard interfaces (e.g., result.input_ids).
+
+     For example: CLI-based tools that have their own internal tokenizer that
+     isn't exposed to the user. In this case we can pass the prompt through as
+     a string.
+     """
+
+     def __init__(self, prompt):
+         self.input_ids = prompt
+
+
+ class PassthroughTokenizer(TokenizerAdapter):
+     """
+     Tokenizer adapter that forwards the prompt to input_ids as text,
+     and then forwards a text LLM response through decode() as text.
+
+     Useful for CLI-based tools that have their own internal tokenizer that
+     isn't exposed to the user.
+     """
+
+     # pylint: disable=unused-argument
+     def __call__(self, prompt: str, **kwargs):
+         return PassthroughTokenizerResult(prompt)
+
+     # pylint: disable=unused-argument
+     def decode(self, response: str, **kwargs):
+         return response
+
+
+ # This file was originally licensed under Apache 2.0. It has been modified.
+ # Modifications Copyright (c) 2025 AMD
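
Usage sketch (illustrative, not part of the diff): a concrete ModelAdapter only needs
to implement generate(), and PassthroughTokenizer lets string-in/string-out backends
satisfy the tokenizer interface. EchoAdapter below is hypothetical, not something
shipped in the package:

    from lemonade.tools.adapter import ModelAdapter, PassthroughTokenizer

    class EchoAdapter(ModelAdapter):
        # Hypothetical adapter that "generates" by echoing the prompt back
        def generate(self, input_ids, max_new_tokens=512):
            # With a passthrough tokenizer, input_ids is simply the prompt string
            return input_ids

    tokenizer = PassthroughTokenizer()
    model = EchoAdapter()

    # The standard flow lemonade's accuracy tools rely on:
    input_ids = tokenizer("Hello, world").input_ids
    response = model.generate(input_ids, max_new_tokens=32)
    print(tokenizer.decode(response))  # "Hello, world"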
lemonade/tools/bench.py ADDED
@@ -0,0 +1,284 @@
+ from abc import ABC, abstractmethod
+ import argparse
+ import os
+ import platform
+ import psutil
+ from lemonade.state import State
+ from lemonade.tools import Tool
+ from lemonade.cache import Keys
+
+ default_iterations = 10
+ default_warmup_runs = 5
+ default_prompt_length = 64
+ default_output_tokens = 32
+ default_prompt = "Hello, I am conscious and"
+
+
+ class Bench(Tool, ABC):
+     """
+     Abstract parent class for tools that benchmark the performance of the generate()
+     method of an LLM.
+     """
+
+     def __init__(self, monitor_message="Benchmarking LLM"):
+         super().__init__(monitor_message)
+
+         # The minimum set of statistics that a benchmark tool will produce
+         # Inherited tools should append any additional statistics they generate to this list
+         self.status_stats = [
+             Keys.SECONDS_TO_FIRST_TOKEN,
+             Keys.STD_DEV_SECONDS_TO_FIRST_TOKEN,
+             Keys.TOKEN_GENERATION_TOKENS_PER_SECOND,
+             Keys.PREFILL_TOKENS_PER_SECOND,
+             Keys.PROMPT_TOKENS,
+             Keys.RESPONSE_TOKENS,
+             Keys.MAX_MEMORY_USED_GBYTE,
+         ]
+
+         # Minimum per-measurement statistics
+         # Inherited tools should add additional lists for other per-prompt statistics
+         self.input_ids_len_list = []
+         self.tokens_out_len_list = []
+         self.mean_time_to_first_token_list = []
+         self.std_dev_time_to_first_token_list = []
+         self.prefill_tokens_per_second_list = []
+         self.token_generation_tokens_per_second_list = []
+         self.max_memory_used_gb_list = []
+
+         # Max memory used can only be measured on Windows systems
+         self.save_max_memory_used = platform.system() == "Windows"
+
+         # This is set to True only for the duration of the first call to run_prompt
+         self.first_run_prompt = None
+
+     @staticmethod
+     def parser(parser: argparse.ArgumentParser = None, add_help: bool = True):
+         # Allow inherited classes to initialize and pass in a parser; add parameters to it if so
+         if parser is None:
+             parser = __class__.helpful_parser(
+                 short_description="Benchmark an LLM", add_help=add_help
+             )
+
+         parser.add_argument(
+             "--iterations",
+             "-i",
+             required=False,
+             type=int,
+             default=default_iterations,
+             help="Number of benchmarking iterations to run (default: "
+             f"{default_iterations})",
+         )
+
+         parser.add_argument(
+             "--warmup-iterations",
+             "-w",
+             required=False,
+             type=int,
+             default=default_warmup_runs,
+             help="Number of benchmarking iterations to use for cache warmup "
+             "(the results of these iterations "
+             f"are not included in the results; default: {default_warmup_runs})",
+         )
+
+         parser.add_argument(
+             "--prompts",
+             "-p",
+             nargs="+",
+             required=False,
+             default=[str(default_prompt_length)],
+             metavar="PROMPT",
+             help="Input one or more prompts to the LLM. Three formats are supported: "
+             "1) integer: use a synthetic prompt with the specified length; "
+             "2) str: use a user-provided prompt string; "
+             "3) path/to/prompt.txt: load the prompt from a text file "
+             f"(default: {default_prompt_length})",
+         )
+
+         parser.add_argument(
+             "--output-tokens",
+             required=False,
+             type=int,
+             default=default_output_tokens,
+             help="Number of new tokens the LLM should generate (default: "
+             f"{default_output_tokens})",
+         )
+
+         return parser
+
+     def get_prompt_str(self, _state, token_length):
+         """
+         Returns a string with approximately the prescribed token length.
+         Note: Actual token length is dependent on the tokenizer.
+         """
+         return "word " * (token_length - 1)
+
+     def parse(self, state: State, args, known_only=True) -> argparse.Namespace:
+         """
+         Helper function to parse CLI arguments into the args expected by run()
+         """
+
+         parsed_args = super().parse(state, args, known_only)
+
+         if parsed_args.prompts is None:
+             parsed_args.prompts = [str(default_prompt_length)]
+
+         # Decode prompt arg into a list of prompt strings
+         prompt_strings = []
+         for prompt_item in parsed_args.prompts:
+             if prompt_item.isdigit():
+                 # Generate a prompt with the requested length
+                 token_length = int(prompt_item)
+                 prompt_strings.append(self.get_prompt_str(state, token_length))
+
+             elif os.path.exists(prompt_item):
+                 with open(prompt_item, "r", encoding="utf-8") as f:
+                     prompt_strings.append(f.read())
+
+             else:
+                 # No change to the prompt
+                 prompt_strings.append(prompt_item)
+         parsed_args.prompts = prompt_strings
+
+         return parsed_args
+
+     def run(
+         self,
+         state: State,
+         prompts: list[str] = None,
+         iterations: int = default_iterations,
+         warmup_iterations: int = default_warmup_runs,
+         output_tokens: int = default_output_tokens,
+         **kwargs,
+     ) -> State:
+ """
154
+ Args:
155
+ - prompts: List of input prompts used as starting points for LLM text generation
156
+ - iterations: number of benchmarking samples to take; results are
157
+ reported as the median and mean of the samples.
158
+ - warmup_iterations: subset of the iterations to treat as warmup,
159
+ and not included in the results.
160
+ - output_tokens: Number of new tokens LLM to create.
161
+ - kwargs: Additional parameters used by bench tools
162
+ """
+
+         if prompts is None:
+             prompts = ["word " * (default_prompt_length - 2)]
+         elif isinstance(prompts, str):
+             prompts = [prompts]
+
+         state.save_stat("prompts", prompts)
+         state.save_stat("iterations", iterations)
+         state.save_stat("warmup_iterations", warmup_iterations)
+         state.save_stat("output_tokens", output_tokens)
+
+         counter = 0
+         report_progress_fn = lambda x: self.set_percent_progress(
+             100 * (counter + x) / len(prompts)
+         )
+         self.first_run_prompt = True
+         for counter, prompt in enumerate(prompts):
+             report_progress_fn(0)
+
+             self.run_prompt(
+                 state,
+                 report_progress_fn,
+                 prompt,
+                 iterations,
+                 warmup_iterations,
+                 output_tokens,
+                 **kwargs,
+             )
+             self.first_run_prompt = False
+
+             if self.save_max_memory_used:
+                 self.max_memory_used_gb_list.append(
+                     psutil.Process().memory_info().peak_wset / 1024**3
+                 )
+
+         self.set_percent_progress(None)
+         self.save_stats(state)
+
+         return state
+
+     @abstractmethod
+     def run_prompt(
+         self,
+         state,
+         report_progress_fn,
+         prompt,
+         iterations,
+         warmup_iterations,
+         output_tokens,
+         **kwargs,
+     ):
+         pass
+
+     @staticmethod
+     def get_item_or_list(lst):
+         """
+         If the list is just a single item then return the item, else return the list
+         """
+         if len(lst) == 1:
+             return lst[0]
+         else:
+             return lst
+
+     def save_stats(self, state):
+         # Save performance data to stats
+         state.save_stat(
+             Keys.PROMPT_TOKENS, self.get_item_or_list(self.input_ids_len_list)
+         )
+         state.save_stat(
+             Keys.RESPONSE_TOKENS, self.get_item_or_list(self.tokens_out_len_list)
+         )
+         state.save_stat(
+             Keys.SECONDS_TO_FIRST_TOKEN,
+             self.get_item_or_list(self.mean_time_to_first_token_list),
+         )
+         if not all(
+             element is None for element in self.std_dev_time_to_first_token_list
+         ):
+             state.save_stat(
+                 Keys.STD_DEV_SECONDS_TO_FIRST_TOKEN,
+                 self.get_item_or_list(self.std_dev_time_to_first_token_list),
+             )
+         state.save_stat(
+             Keys.PREFILL_TOKENS_PER_SECOND,
+             self.get_item_or_list(self.prefill_tokens_per_second_list),
+         )
+         state.save_stat(
+             Keys.TOKEN_GENERATION_TOKENS_PER_SECOND,
+             self.get_item_or_list(self.token_generation_tokens_per_second_list),
+         )
+         if self.save_max_memory_used:
+             state.save_stat(
+                 Keys.MAX_MEMORY_USED_GBYTE,
+                 self.get_item_or_list(self.max_memory_used_gb_list),
+             )
+
+     @staticmethod
+     def not_enough_tokens(output_tokens: int):
+         """
+         Raise an exception that explains why a benchmark did not produce any results
+         """
+
+         raise ValueError(
+             "Your model was benchmarked; however, none of the benchmarking "
+             "iterations produced the requested number of output tokens "
+             f"(currently {output_tokens}), so "
+             "the results have been discarded. You have the following options "
+             "to solve this: \n"
+             "1. Use the -p option to change the prompt to something that will "
+             "produce more output tokens. For example, 'The extremely long "
+             "story of my life, told in excruciating detail, is:' "
+             "is an example of a prompt that will result in a lot of output. \n"
+             "2. Set a lower value for --output-tokens to make it more likely "
+             "that the model will produce enough. \n"
+             "3. Set more verbose hyperparameters. \n"
+             "4. Run more benchmarking iterations, to improve the chance of "
+             "getting at least one with enough output tokens. \n"
+         )
+
+
+ # This file was originally licensed under Apache 2.0. It has been modified.
+ # Modifications Copyright (c) 2025 AMD
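
Usage sketch (illustrative, not part of the diff): a concrete benchmark subclasses
Bench and implements run_prompt(), appending one measurement per prompt to the lists
initialized in Bench.__init__; Bench.run() then drives it once per prompt and
save_stats() flattens the lists into the build stats. The timing numbers below are
fabricated for illustration, and any additional abstract members required by the
Tool base class are ignored here:

    from lemonade.tools.bench import Bench

    class ToyBench(Bench):
        """Hypothetical benchmark that records made-up measurements."""

        def run_prompt(self, state, report_progress_fn, prompt,
                       iterations, warmup_iterations, output_tokens, **kwargs):
            # One entry per prompt, matching the lists set up in Bench.__init__
            self.input_ids_len_list.append(len(prompt.split()))
            self.tokens_out_len_list.append(output_tokens)
            self.mean_time_to_first_token_list.append(0.05)  # seconds (fabricated)
            self.std_dev_time_to_first_token_list.append(None)
            self.prefill_tokens_per_second_list.append(1000.0)  # fabricated
            self.token_generation_tokens_per_second_list.append(50.0)  # fabricated
            report_progress_fn(1.0)  # this prompt is 100% complete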