lemonade_sdk-9.1.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. lemonade/__init__.py +5 -0
  2. lemonade/api.py +180 -0
  3. lemonade/cache.py +92 -0
  4. lemonade/cli.py +173 -0
  5. lemonade/common/__init__.py +0 -0
  6. lemonade/common/build.py +176 -0
  7. lemonade/common/cli_helpers.py +139 -0
  8. lemonade/common/exceptions.py +98 -0
  9. lemonade/common/filesystem.py +368 -0
  10. lemonade/common/inference_engines.py +408 -0
  11. lemonade/common/network.py +93 -0
  12. lemonade/common/printing.py +110 -0
  13. lemonade/common/status.py +471 -0
  14. lemonade/common/system_info.py +1411 -0
  15. lemonade/common/test_helpers.py +28 -0
  16. lemonade/profilers/__init__.py +1 -0
  17. lemonade/profilers/agt_power.py +437 -0
  18. lemonade/profilers/hwinfo_power.py +429 -0
  19. lemonade/profilers/memory_tracker.py +259 -0
  20. lemonade/profilers/profiler.py +58 -0
  21. lemonade/sequence.py +363 -0
  22. lemonade/state.py +159 -0
  23. lemonade/tools/__init__.py +1 -0
  24. lemonade/tools/accuracy.py +432 -0
  25. lemonade/tools/adapter.py +114 -0
  26. lemonade/tools/bench.py +302 -0
  27. lemonade/tools/flm/__init__.py +1 -0
  28. lemonade/tools/flm/utils.py +305 -0
  29. lemonade/tools/huggingface/bench.py +187 -0
  30. lemonade/tools/huggingface/load.py +235 -0
  31. lemonade/tools/huggingface/utils.py +359 -0
  32. lemonade/tools/humaneval.py +264 -0
  33. lemonade/tools/llamacpp/bench.py +255 -0
  34. lemonade/tools/llamacpp/load.py +222 -0
  35. lemonade/tools/llamacpp/utils.py +1260 -0
  36. lemonade/tools/management_tools.py +319 -0
  37. lemonade/tools/mmlu.py +319 -0
  38. lemonade/tools/oga/__init__.py +0 -0
  39. lemonade/tools/oga/bench.py +120 -0
  40. lemonade/tools/oga/load.py +804 -0
  41. lemonade/tools/oga/migration.py +403 -0
  42. lemonade/tools/oga/utils.py +462 -0
  43. lemonade/tools/perplexity.py +147 -0
  44. lemonade/tools/prompt.py +263 -0
  45. lemonade/tools/report/__init__.py +0 -0
  46. lemonade/tools/report/llm_report.py +203 -0
  47. lemonade/tools/report/table.py +899 -0
  48. lemonade/tools/server/__init__.py +0 -0
  49. lemonade/tools/server/flm.py +133 -0
  50. lemonade/tools/server/llamacpp.py +320 -0
  51. lemonade/tools/server/serve.py +2123 -0
  52. lemonade/tools/server/static/favicon.ico +0 -0
  53. lemonade/tools/server/static/index.html +279 -0
  54. lemonade/tools/server/static/js/chat.js +1059 -0
  55. lemonade/tools/server/static/js/model-settings.js +183 -0
  56. lemonade/tools/server/static/js/models.js +1395 -0
  57. lemonade/tools/server/static/js/shared.js +556 -0
  58. lemonade/tools/server/static/logs.html +191 -0
  59. lemonade/tools/server/static/styles.css +2654 -0
  60. lemonade/tools/server/static/webapp.html +321 -0
  61. lemonade/tools/server/tool_calls.py +153 -0
  62. lemonade/tools/server/tray.py +664 -0
  63. lemonade/tools/server/utils/macos_tray.py +226 -0
  64. lemonade/tools/server/utils/port.py +77 -0
  65. lemonade/tools/server/utils/thread.py +85 -0
  66. lemonade/tools/server/utils/windows_tray.py +408 -0
  67. lemonade/tools/server/webapp.py +34 -0
  68. lemonade/tools/server/wrapped_server.py +559 -0
  69. lemonade/tools/tool.py +374 -0
  70. lemonade/version.py +1 -0
  71. lemonade_install/__init__.py +1 -0
  72. lemonade_install/install.py +239 -0
  73. lemonade_sdk-9.1.1.dist-info/METADATA +276 -0
  74. lemonade_sdk-9.1.1.dist-info/RECORD +84 -0
  75. lemonade_sdk-9.1.1.dist-info/WHEEL +5 -0
  76. lemonade_sdk-9.1.1.dist-info/entry_points.txt +5 -0
  77. lemonade_sdk-9.1.1.dist-info/licenses/LICENSE +201 -0
  78. lemonade_sdk-9.1.1.dist-info/licenses/NOTICE.md +47 -0
  79. lemonade_sdk-9.1.1.dist-info/top_level.txt +3 -0
  80. lemonade_server/cli.py +805 -0
  81. lemonade_server/model_manager.py +758 -0
  82. lemonade_server/pydantic_models.py +159 -0
  83. lemonade_server/server_models.json +643 -0
  84. lemonade_server/settings.py +39 -0
lemonade/tools/huggingface/bench.py
@@ -0,0 +1,187 @@
+ import argparse
+ import statistics
+ from statistics import StatisticsError
+ import psutil
+ from lemonade.state import State
+ from lemonade.cache import Keys
+ from lemonade.tools.bench import Bench
+
+ default_beams = 1
+
+
+ class HuggingfaceBench(Bench):
+     """
+     Benchmarks the performance of the generate() method of an LLM loaded from
+     Huggingface Transformers (or any object that supports a
+     huggingface-like generate() method).
+
+     Required input state:
+         - DTYPE: data type of the model; used to determine if AMP should be
+           enabled to convert the input data type to match the model data
+           type.
+         - MODEL: huggingface-like instance to benchmark.
+         - INPUTS: model inputs to pass to generate() during benchmarking.
+
+     Output state produced: None
+
+     """
+
+     unique_name = "huggingface-bench"
+
+     @staticmethod
+     def parser(parser: argparse.ArgumentParser = None, add_help: bool = True):
+         # Allow inherited classes to initialize and pass in a parser, add parameters to it if so
+         if parser is None:
+             parser = __class__.helpful_parser(
+                 short_description="Benchmark a huggingface-style PyTorch LLM",
+                 add_help=add_help,
+             )
+
+         parser = Bench.parser(parser)
+
+         parser.add_argument(
+             "--num-beams",
+             required=False,
+             type=int,
+             default=default_beams,
+             help=f"Number of beams for the LLM to use (default: {default_beams})",
+         )
+
+         return parser
+
+     def get_prompt_str(self, state, token_length):
+         """
+         Returns a string with the prescribed token length.
+         """
+         model = state.model
+         tokenizer = state.tokenizer
+         test_prompt = "word " * (token_length - 2)
+         input_ids = (
+             tokenizer(test_prompt, return_tensors="pt")
+             .to(device=model.device)
+             .input_ids
+         )
+         test_token_length = input_ids.shape[1]
+         delta = test_token_length - token_length
+         if delta == 0:
+             return test_prompt
+         return "word " * max(token_length - 2 - delta, 0)
+
+     def run_prompt(
+         self,
+         state: State,
+         report_progress_fn,
+         prompt: str,
+         iterations: int,
+         warmup_iterations: int,
+         output_tokens: int,
+         num_beams: int = default_beams,
+     ):
+         """
+         We don't have access to the internal timings of generate(), so time to first
+         token (TTFT, aka prefill latency) and token/s are calculated using the following formulae:
+             prefill_latency = latency of generate(output_tokens=1)
+             execution_latency = latency of generate(output_tokens=output_tokens)
+             tokens_per_second = (new_tokens - 1) / (execution_latency - prefill_latency)
+         """
+
+         from lemonade.tools.huggingface.utils import benchmark_huggingface_llm
+
+         if self.first_run_prompt:
+             if vars(state).get(Keys.MODEL) is None:
+                 raise ValueError(
+                     f"{self.__class__.__name__} requires that a model be passed from another tool"
+                 )
+             if (
+                 vars(state).get("num_beams")
+                 and vars(state).get("num_beams") != num_beams
+             ):
+                 raise ValueError(
+                     f"Number of beams was set to {vars(state).get('num_beams')} "
+                     f"in a previous tool, but it is set to {num_beams} in "
+                     "this tool. The values must be the same."
+                 )
+
+             # Save benchmarking parameters
+             state.save_stat("num_beams", num_beams)
+
+         model = state.model
+         tokenizer = state.tokenizer
+         dtype = state.dtype
+
+         # Generate the input_ids outside the benchmarking function to make sure
+         # the same input_ids are used everywhere
+         input_ids = (
+             tokenizer(prompt, return_tensors="pt").to(device=model.device).input_ids
+         )
+         self.input_ids_len_list.append(input_ids.shape[1])
+
+         prefill_report_progress_fn = lambda x: report_progress_fn(0.5 * x)
+
+         # Benchmark prefill time (time to first token)
+         prefill_per_iteration_result, tokens_out_len_list = benchmark_huggingface_llm(
+             model=model,
+             tokenizer=tokenizer,
+             input_ids=input_ids,
+             dtype=dtype,
+             num_beams=num_beams,
+             target_output_tokens=1,
+             iterations=iterations,
+             warmup_iterations=warmup_iterations,
+             report_progress_fn=prefill_report_progress_fn,
+         )
+         self.tokens_out_len_list += tokens_out_len_list
+
+         time_to_first_token_per_iteration = [
+             latency for latency, _ in prefill_per_iteration_result
+         ]
+         mean_time_to_first_token = statistics.mean(time_to_first_token_per_iteration)
+         self.mean_time_to_first_token_list.append(mean_time_to_first_token)
+         self.prefill_tokens_per_second_list.append(
+             input_ids.shape[1] / mean_time_to_first_token
+         )
+         try:
+             self.std_dev_time_to_first_token_list.append(
+                 statistics.stdev(time_to_first_token_per_iteration)
+             )
+         except StatisticsError:
+             # Less than 2 measurements
+             self.std_dev_time_to_first_token_list.append(None)
+
+         decode_report_progress_fn = lambda x: report_progress_fn(0.5 + 0.5 * x)
+
+         # Benchmark generation of all tokens
+         decode_per_iteration_result, tokens_out_len_list = benchmark_huggingface_llm(
+             model=model,
+             tokenizer=tokenizer,
+             input_ids=input_ids,
+             dtype=dtype,
+             num_beams=num_beams,
+             target_output_tokens=output_tokens,
+             iterations=iterations,
+             warmup_iterations=warmup_iterations,
+             report_progress_fn=decode_report_progress_fn,
+         )
+         self.tokens_out_len_list += tokens_out_len_list
+
+         execution_latency_per_iteration = [
+             latency for latency, _ in decode_per_iteration_result
+         ]
+         token_len_per_iteration = [
+             token_len for _, token_len in decode_per_iteration_result
+         ]
+         mean_execution_latency = statistics.mean(execution_latency_per_iteration)
+         mean_decode_latency = mean_execution_latency - mean_time_to_first_token
+         mean_token_len = statistics.mean(token_len_per_iteration)
+         # Subtract 1 so that we don't count the prefill token
+         self.token_generation_tokens_per_second_list.append(
+             (mean_token_len - 1) / mean_decode_latency
+         )
+         if self.save_max_memory_used:
+             self.max_memory_used_gb_list.append(
+                 psutil.Process().memory_info().peak_wset / 1024**3
+             )
+
+
+ # This file was originally licensed under Apache 2.0. It has been modified.
+ # Modifications Copyright (c) 2025 AMD
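
The timing arithmetic in run_prompt's docstring is easy to sanity-check numerically. Below is a minimal, self-contained sketch (hypothetical latency numbers; only the stdlib statistics module) of how mean TTFT and decode tokens/s fall out of the two generate() measurements:

import statistics

# Hypothetical (latency_seconds, output_token_count) pairs, shaped like the
# per-iteration results returned by benchmark_huggingface_llm.
prefill_results = [(0.21, 1), (0.19, 1), (0.20, 1)]    # generate(output_tokens=1)
decode_results = [(2.15, 64), (2.05, 64), (2.10, 64)]  # generate(output_tokens=64)

# TTFT (prefill latency) is simply the latency of producing one token.
mean_ttft = statistics.mean(lat for lat, _ in prefill_results)  # 0.20 s

# Full-generation latency includes prefill, so subtracting mean TTFT
# isolates the decode phase.
mean_execution = statistics.mean(lat for lat, _ in decode_results)  # 2.10 s
mean_decode_latency = mean_execution - mean_ttft  # 1.90 s

# Subtract 1 token so the token emitted during prefill is not counted.
mean_tokens_out = statistics.mean(n for _, n in decode_results)  # 64
tokens_per_second = (mean_tokens_out - 1) / mean_decode_latency  # ~33.2 tok/s

print(f"TTFT: {mean_ttft:.3f} s, decode rate: {tokens_per_second:.1f} tokens/s")

Because the prefill cost is folded into every generate() call, subtracting the mean single-token latency is what isolates the decode phase; this avoids needing access to the model's internal timers.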
lemonade/tools/huggingface/load.py
@@ -0,0 +1,235 @@
+ import argparse
+ from typing import Dict, Optional
+ import json
+ from lemonade.tools import FirstTool
+ from lemonade.state import State
+ import lemonade.common.status as status
+ import lemonade.common.printing as printing
+ from lemonade.cache import Keys
+
+
+ class HuggingfaceLoad(FirstTool):
+     """
+     Load an LLM as a torch.nn.Module using the Hugging Face transformers
+     from_pretrained() API.
+
+     Expected input: a checkpoint to load
+
+     Output state produced:
+         - state.model: instance of torch.nn.Module that implements an LLM.
+         - state.inputs: tokenized example inputs to the model, in the form of a
+           dictionary of kwargs.
+         - state.tokenizer: instance of Hugging Face PretrainedTokenizer.
+         - state.dtype: data type of the model.
+         - state.checkpoint: pretrained checkpoint used to load the model.
+     """
+
+     unique_name = "huggingface-load"
+
+     def _imports(self):
+         pass
+
+     def __init__(self):
+         super().__init__(monitor_message="Loading Huggingface checkpoint")
+
+         self.status_stats = [Keys.DTYPE]
+
+     @staticmethod
+     def parser(add_help: bool = True) -> argparse.ArgumentParser:
+         parser = __class__.helpful_parser(
+             short_description="Load an LLM in PyTorch using huggingface transformers",
+             add_help=add_help,
+         )
+
+         default_dtype = "float32"
+         parser.add_argument(
+             "--dtype",
+             "-d",
+             required=False,
+             default=default_dtype,
+             help=f"Data type to load the model in (default: {default_dtype}).",
+         )
+
+         choices = ["cpu", "cuda"]
+         for cuda in range(15):
+             choices.append(f"cuda:{cuda}")
+         parser.add_argument(
+             "--device",
+             required=False,
+             default=None,
+             choices=choices,
+             help="Move the model and inputs to a device using the .to() method "
+             "(default: don't call the .to() method)",
+         )
+
+         parser.add_argument(
+             "--load-kwargs",
+             required=False,
+             default="{}",
+             type=json.loads,
+             help="Arbitrary kwargs, in json format, that will be passed as "
+             "from_pretrained(**kwargs). "
+             r"Example: --load-kwargs='{\"trust_remote_code\": true}' would result in "
+             "from_pretrained(trust_remote_code=True)",
+         )
+
+         parser.add_argument(
+             "--channels-last",
+             default=True,
+             type=bool,
+             help="Whether to format the model in memory using "
+             "channels-last (default: True)",
+         )
+
+         return parser
+
+     def parse(self, state: State, args, known_only=True) -> argparse.Namespace:
+
+         from lemonade.tools.huggingface.utils import str_to_dtype
+
+         parsed_args = super().parse(state, args, known_only)
+
+         # Save stats about the user's input (do this prior to decoding)
+         state.save_stat(Keys.CHECKPOINT, parsed_args.input)
+         state.save_stat(Keys.DTYPE, parsed_args.dtype)
+
+         # Decode dtype arg into a torch value
+         parsed_args.dtype = str_to_dtype[parsed_args.dtype]
+
+         return parsed_args
+
+     def run(
+         self,
+         state: State,
+         input: str = "",
+         dtype: "torch.dtype" = None,
+         device: Optional[str] = None,
+         load_kwargs: Optional[Dict] = None,
+         channels_last: bool = True,
+     ) -> State:
+         # Import expensive modules at runtime
+         import transformers
+         import torch
+
+         from lemonade.tools.huggingface.utils import (
+             HuggingfaceTokenizerAdapter,
+             HuggingfaceAdapter,
+         )
+         from lemonade.common.network import (
+             is_offline,
+             get_base_model,
+         )
+
+         # Set default dtype
+         if dtype is None:
+             dtype_to_use = torch.float32
+         else:
+             dtype_to_use = dtype
+
+         # Auto-detect offline status
+         offline = is_offline()
+         if offline:
+             printing.log_warning(
+                 "Network connectivity to huggingface.co not detected. Running in offline mode."
+             )
+
+         checkpoint = input
+
+         if load_kwargs is None:
+             load_kwargs_to_use = {}
+         else:
+             load_kwargs_to_use = load_kwargs
+
+         # Add local_files_only to kwargs in offline mode
+         if offline:
+             load_kwargs_to_use["local_files_only"] = True
+
+         if vars(state).get(Keys.MODEL):
+             raise ValueError("HuggingfaceLoad must be the first tool in the sequence")
+
+         try:
+             model = transformers.AutoModelForCausalLM.from_pretrained(
+                 checkpoint,
+                 torch_dtype=dtype_to_use,
+                 low_cpu_mem_usage=True,
+                 **load_kwargs_to_use,
+             )
+         except Exception as e:
+             if offline and "Can't load config for" in str(e):
+                 raise ValueError(
+                     f"Cannot load model {checkpoint} in offline mode. "
+                     f"The model files may not be available locally. Original error: {str(e)}"
+                 )
+             raise
+
+         # Only call the model.to() method if an argument to this function
+         # provides a reason to do so
+         to_args = {}
+         if channels_last:
+             to_args["memory_format"] = torch.channels_last
+         if device:
+             to_args["device"] = device
+         if to_args:
+             model.to(**to_args)
+
+         model = model.eval()
+
+         try:
+             tokenizer_kwargs = {
+                 "use_fast": False,
+                 "model_max_length": 4096,
+                 "padding_side": "left",
+             }
+             if offline:
+                 tokenizer_kwargs["local_files_only"] = True
+
+             tokenizer = transformers.AutoTokenizer.from_pretrained(
+                 checkpoint, **tokenizer_kwargs
+             )
+         except ValueError:
+             # Sometimes those specific tokenizer flags are not supported, in which
+             # case we try to just load a simple tokenizer
+             tokenizer_kwargs = {}
+             if offline:
+                 tokenizer_kwargs["local_files_only"] = True
+
+             try:
+                 tokenizer = transformers.AutoTokenizer.from_pretrained(
+                     checkpoint, **tokenizer_kwargs
+                 )
+             except Exception as e:
+                 if offline and "Can't load tokenizer for" in str(e):
+                     raise ValueError(
+                         f"Cannot load tokenizer for {checkpoint} in offline mode. "
+                         f"The tokenizer files may not be available locally. "
+                         f"Original error: {str(e)}"
+                     )
+                 raise
+
+         # Pass the model and inputs into state
+         state.model = HuggingfaceAdapter(model, dtype_to_use, device, tokenizer)
+
+         state.tokenizer = HuggingfaceTokenizerAdapter(tokenizer, device)
+         state.dtype = dtype_to_use
+         state.checkpoint = checkpoint
+         state.device = device
+
+         # Save stats about the model
+         state.save_stat(Keys.CHECKPOINT, checkpoint)
+         state.save_stat(Keys.DTYPE, str(dtype_to_use).split(".")[1])
+         state.save_stat(Keys.DEVICE, device)
+
+         # Get base model information
+         base_model = get_base_model(checkpoint)
+         if base_model is not None:
+             state.save_stat("base_model", base_model)
+
+         # Create a UniqueInvocationInfo and ModelInfo so that we can display status
+         # at the end of the sequence
+         status.add_to_state(state=state, name=input, model=model)
+
+         return state
+
+
+ # This file was originally licensed under Apache 2.0. It has been modified.
+ # Modifications Copyright (c) 2025 AMD
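
Because --load-kwargs is declared with type=json.loads, argparse hands HuggingfaceLoad a ready-made dict, which run() then splats into from_pretrained(). A minimal standalone sketch of that round trip (toy parser only, not the full lemonade CLI):

import argparse
import json

# Same pattern as HuggingfaceLoad.parser(): argparse parses the JSON for us.
parser = argparse.ArgumentParser()
parser.add_argument("--load-kwargs", default="{}", type=json.loads)

args = parser.parse_args(["--load-kwargs", '{"trust_remote_code": true}'])
print(args.load_kwargs)  # {'trust_remote_code': True}

# run() then forwards these kwargs into the transformers call, roughly:
#   transformers.AutoModelForCausalLM.from_pretrained(
#       checkpoint, torch_dtype=dtype, low_cpu_mem_usage=True, **args.load_kwargs
#   )

Note that argparse also applies the type converter to the string default, so omitting the flag yields an empty dict rather than the literal string "{}".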