lemonade-sdk 7.0.4__py3-none-any.whl → 8.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of lemonade-sdk might be problematic.

Files changed (56)
  1. lemonade/api.py +3 -3
  2. lemonade/cli.py +11 -17
  3. lemonade/common/build.py +0 -47
  4. lemonade/common/network.py +50 -0
  5. lemonade/common/status.py +2 -21
  6. lemonade/common/system_info.py +19 -4
  7. lemonade/profilers/memory_tracker.py +3 -1
  8. lemonade/tools/accuracy.py +3 -4
  9. lemonade/tools/adapter.py +1 -2
  10. lemonade/tools/{huggingface_bench.py → huggingface/bench.py} +2 -87
  11. lemonade/tools/huggingface/load.py +235 -0
  12. lemonade/tools/{huggingface_load.py → huggingface/utils.py} +87 -255
  13. lemonade/tools/humaneval.py +9 -3
  14. lemonade/tools/{llamacpp_bench.py → llamacpp/bench.py} +1 -1
  15. lemonade/tools/{llamacpp.py → llamacpp/load.py} +18 -2
  16. lemonade/tools/mmlu.py +7 -15
  17. lemonade/tools/{ort_genai/oga.py → oga/load.py} +31 -422
  18. lemonade/tools/oga/utils.py +423 -0
  19. lemonade/tools/perplexity.py +4 -3
  20. lemonade/tools/prompt.py +2 -1
  21. lemonade/tools/quark/quark_load.py +2 -1
  22. lemonade/tools/quark/quark_quantize.py +5 -5
  23. lemonade/tools/report/table.py +3 -3
  24. lemonade/tools/server/llamacpp.py +188 -45
  25. lemonade/tools/server/serve.py +184 -146
  26. lemonade/tools/server/static/favicon.ico +0 -0
  27. lemonade/tools/server/static/styles.css +568 -0
  28. lemonade/tools/server/static/webapp.html +439 -0
  29. lemonade/tools/server/tray.py +458 -0
  30. lemonade/tools/server/{port_utils.py → utils/port.py} +22 -3
  31. lemonade/tools/server/utils/system_tray.py +395 -0
  32. lemonade/tools/server/{instructions.py → webapp.py} +4 -10
  33. lemonade/version.py +1 -1
  34. lemonade_install/install.py +46 -28
  35. lemonade_sdk-8.0.1.dist-info/METADATA +179 -0
  36. lemonade_sdk-8.0.1.dist-info/RECORD +70 -0
  37. lemonade_server/cli.py +182 -27
  38. lemonade_server/model_manager.py +192 -20
  39. lemonade_server/pydantic_models.py +9 -4
  40. lemonade_server/server_models.json +5 -3
  41. lemonade/common/analyze_model.py +0 -26
  42. lemonade/common/labels.py +0 -61
  43. lemonade/common/onnx_helpers.py +0 -176
  44. lemonade/common/plugins.py +0 -10
  45. lemonade/common/tensor_helpers.py +0 -83
  46. lemonade/tools/server/static/instructions.html +0 -262
  47. lemonade_sdk-7.0.4.dist-info/METADATA +0 -113
  48. lemonade_sdk-7.0.4.dist-info/RECORD +0 -69
  49. /lemonade/tools/{ort_genai → oga}/__init__.py +0 -0
  50. /lemonade/tools/{ort_genai/oga_bench.py → oga/bench.py} +0 -0
  51. /lemonade/tools/server/{thread_utils.py → utils/thread.py} +0 -0
  52. {lemonade_sdk-7.0.4.dist-info → lemonade_sdk-8.0.1.dist-info}/WHEEL +0 -0
  53. {lemonade_sdk-7.0.4.dist-info → lemonade_sdk-8.0.1.dist-info}/entry_points.txt +0 -0
  54. {lemonade_sdk-7.0.4.dist-info → lemonade_sdk-8.0.1.dist-info}/licenses/LICENSE +0 -0
  55. {lemonade_sdk-7.0.4.dist-info → lemonade_sdk-8.0.1.dist-info}/licenses/NOTICE.md +0 -0
  56. {lemonade_sdk-7.0.4.dist-info → lemonade_sdk-8.0.1.dist-info}/top_level.txt +0 -0
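The headline change in this release is a re-organization of the tool modules into per-backend packages (huggingface/, llamacpp/, oga/). A minimal import-path sketch of how downstream code shifts between the two releases, using only names that appear in the hunks below (old paths are commented out; this is illustrative, not an exhaustive list of the 8.0.1 public API):

# lemonade-sdk 7.0.4
# from lemonade.tools.huggingface_load import HuggingfaceLoad
# from lemonade.tools.llamacpp import LlamaCppAdapter

# lemonade-sdk 8.0.1
from lemonade.tools.huggingface.load import HuggingfaceLoad
from lemonade.tools.llamacpp.load import LlamaCppAdapter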
lemonade/tools/huggingface/load.py (new file)
@@ -0,0 +1,235 @@
+ import argparse
+ from typing import Dict, Optional
+ import json
+ from lemonade.tools import FirstTool
+ from lemonade.state import State
+ import lemonade.common.status as status
+ import lemonade.common.printing as printing
+ from lemonade.cache import Keys
+
+
+ class HuggingfaceLoad(FirstTool):
+     """
+     Load an LLM as a torch.nn.Module using the Hugging Face transformers
+     from_pretrained() API.
+
+     Expected input: a checkpoint to load
+
+     Output state produced:
+         - state.model: instance of torch.nn.Module that implements an LLM.
+         - state.inputs: tokenized example inputs to the model, in the form of a
+           dictionary of kwargs.
+         - state.tokenizer: instance of Hugging Face PretrainedTokenizer.
+         - state.dtype: data type of the model.
+         - state.checkpoint: pretrained checkpoint used to load the model.
+     """
+
+     unique_name = "huggingface-load"
+
+     def _imports(self):
+         pass
+
+     def __init__(self):
+         super().__init__(monitor_message="Loading Huggingface checkpoint")
+
+         self.status_stats = [Keys.DTYPE]
+
+     @staticmethod
+     def parser(add_help: bool = True) -> argparse.ArgumentParser:
+         parser = __class__.helpful_parser(
+             short_description="Load an LLM in PyTorch using huggingface transformers",
+             add_help=add_help,
+         )
+
+         default_dtype = "float32"
+         parser.add_argument(
+             "--dtype",
+             "-d",
+             required=False,
+             default=default_dtype,
+             help=f"Data type to load the model in (default: {default_dtype}).",
+         )
+
+         choices = ["cpu", "cuda"]
+         for cuda in range(15):
+             choices.append(f"cuda:{cuda}")
+         parser.add_argument(
+             "--device",
+             required=False,
+             default=None,
+             choices=choices,
+             help="Move the model and inputs to a device using the .to() method "
+             "(default: don't call the .to() method)",
+         )
+
+         parser.add_argument(
+             "--load-kwargs",
+             required=False,
+             default="{}",
+             type=json.loads,
+             help="Arbitrary kwargs, in json format, that will be passed as "
+             "from_pretrained(**kwargs). "
+             r"Example: --load-kwargs='{\"trust_remote_code\": true} would result in "
+             "from_pretrained(trust_remote_code=True)",
+         )
+
+         parser.add_argument(
+             "--channels-last",
+             default=True,
+             type=bool,
+             help="Whether to format the model in memory using "
+             "channels-last (default: True)",
+         )
+
+         return parser
+
+     def parse(self, state: State, args, known_only=True) -> argparse.Namespace:
+
+         from lemonade.tools.huggingface.utils import str_to_dtype
+
+         parsed_args = super().parse(state, args, known_only)
+
+         # Save stats about the user's input (do this prior to decoding)
+         state.save_stat(Keys.CHECKPOINT, parsed_args.input)
+         state.save_stat(Keys.DTYPE, parsed_args.dtype)
+
+         # Decode dtype arg into a torch value
+         parsed_args.dtype = str_to_dtype[parsed_args.dtype]
+
+         return parsed_args
+
+     def run(
+         self,
+         state: State,
+         input: str = "",
+         dtype: "torch.dtype" = None,
+         device: Optional[str] = None,
+         load_kwargs: Optional[Dict] = None,
+         channels_last: bool = True,
+     ) -> State:
+         # Import expensive modules at runtime
+         import transformers
+         import torch
+
+         from lemonade.tools.huggingface.utils import (
+             HuggingfaceTokenizerAdapter,
+             HuggingfaceAdapter,
+         )
+         from lemonade.common.network import (
+             is_offline,
+             get_base_model,
+         )
+
+         # Set default dtype
+         if dtype is None:
+             dtype_to_use = torch.float32
+         else:
+             dtype_to_use = dtype
+
+         # Auto-detect offline status
+         offline = is_offline()
+         if offline:
+             printing.log_warning(
+                 "Network connectivity to huggingface.co not detected. Running in offline mode."
+             )
+
+         checkpoint = input
+
+         if load_kwargs is None:
+             load_kwargs_to_use = {}
+         else:
+             load_kwargs_to_use = load_kwargs
+
+         # Add local_files_only to kwargs in offline mode
+         if offline:
+             load_kwargs_to_use["local_files_only"] = True
+
+         if vars(state).get(Keys.MODEL):
+             raise ValueError("HuggingfaceLoad must be the first tool in the sequence")
+
+         try:
+             model = transformers.AutoModelForCausalLM.from_pretrained(
+                 checkpoint,
+                 torch_dtype=dtype_to_use,
+                 low_cpu_mem_usage=True,
+                 **load_kwargs_to_use,
+             )
+         except Exception as e:
+             if offline and "Can't load config for" in str(e):
+                 raise ValueError(
+                     f"Cannot load model {checkpoint} in offline mode. "
+                     f"The model files may not be available locally. Original error: {str(e)}"
+                 )
+             raise
+
+         # Only call the model.to() method if an argument to this function
+         # provides a reason to do so
+         to_args = {}
+         if channels_last:
+             to_args["memory_format"] = torch.channels_last
+         if device:
+             to_args["device"] = device
+         if to_args:
+             model.to(**to_args)
+
+         model = model.eval()
+
+         try:
+             tokenizer_kwargs = {
+                 "use_fast": False,
+                 "model_max_length": 4096,
+                 "padding_side": "left",
+             }
+             if offline:
+                 tokenizer_kwargs["local_files_only"] = True
+
+             tokenizer = transformers.AutoTokenizer.from_pretrained(
+                 checkpoint, **tokenizer_kwargs
+             )
+         except ValueError as e:
+             # Sometimes those specific tokenizer flags are not supported, in which
+             # case we try to just load a simple tokenizer
+             tokenizer_kwargs = {}
+             if offline:
+                 tokenizer_kwargs["local_files_only"] = True
+
+             try:
+                 tokenizer = transformers.AutoTokenizer.from_pretrained(
+                     checkpoint, **tokenizer_kwargs
+                 )
+             except Exception as e:
+                 if offline and "Can't load tokenizer for" in str(e):
+                     raise ValueError(
+                         f"Cannot load tokenizer for {checkpoint} in offline mode. "
+                         f"The tokenizer files may not be available locally. "
+                         f"Original error: {str(e)}"
+                     )
+                 raise
+
+         # Pass the model and inputs into state
+         state.model = HuggingfaceAdapter(model, dtype_to_use, device, tokenizer)
+
+         state.tokenizer = HuggingfaceTokenizerAdapter(tokenizer, device)
+         state.dtype = dtype_to_use
+         state.checkpoint = checkpoint
+         state.device = device
+
+         # Save stats about the model
+         state.save_stat(Keys.CHECKPOINT, checkpoint)
+         state.save_stat(Keys.DTYPE, str(dtype_to_use).split(".")[1])
+         state.save_stat(Keys.DEVICE, device)
+
+         # Get base model information
+         base_model = get_base_model(checkpoint)
+         if base_model is not None:
+             state.save_stat("base_model", base_model)
+
+         # Create a UniqueInvocationInfo and ModelInfo so that we can display status
+         # at the end of the sequence
+         status.add_to_state(state=state, name=input, model=model)
+
+         return state
+
+
+ # This file was originally licensed under Apache 2.0. It has been modified.
+ # Modifications Copyright (c) 2025 AMD
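For readers unfamiliar with the --load-kwargs plumbing above: the CLI value is parsed with json.loads and splatted into from_pretrained(). A standalone sketch of the equivalent calls (the checkpoint name below is a placeholder for illustration, not something this package ships):

import json
import transformers

# What --load-kwargs='{"trust_remote_code": true}' becomes inside run()
load_kwargs = json.loads('{"trust_remote_code": true}')
model = transformers.AutoModelForCausalLM.from_pretrained(
    "distilgpt2",  # placeholder checkpoint
    low_cpu_mem_usage=True,
    **load_kwargs,  # -> from_pretrained(trust_remote_code=True)
)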
lemonade/tools/{huggingface_load.py → huggingface/utils.py}
@@ -1,16 +1,12 @@
- import argparse
- from typing import Dict, Optional
- import json
- import socket
+ from typing import Dict, List, Tuple
+ import time
+ from contextlib import nullcontext
  import transformers
  import torch
- from huggingface_hub import model_info
  from lemonade.state import State
- import lemonade.common.status as status
- import lemonade.common.printing as printing
- from lemonade.tools import FirstTool
- from lemonade.tools.adapter import ModelAdapter, TokenizerAdapter
- from lemonade.cache import Keys
+ from lemonade.tools.adapter import TokenizerAdapter
+ from lemonade.tools.adapter import ModelAdapter
+ from lemonade.tools.bench import Bench

  # Command line interfaces for tools will use string inputs for data
  # types, however the internal tool logic will need to know the actual
@@ -62,249 +58,6 @@ class HuggingfaceTokenizerAdapter(TokenizerAdapter):
          return self.tokenizer.save_pretrained(model_dir, **kwargs)


- def is_offline():
-     """
-     Check if the system is offline by attempting to connect to huggingface.co.
-
-     Returns:
-         bool: True if the system is offline (cannot connect to huggingface.co),
-         False otherwise.
-     """
-     try:
-         socket.gethostbyname("huggingface.co")
-         return False
-     except socket.gaierror:
-         return True
-
-
- def get_base_model(checkpoint: str) -> Optional[str]:
-     """
-     Get the base model information for a given checkpoint from the Hugging Face Hub.
-     Will auto-detect if we're offline and skip the network call in that case.
-
-     Args:
-         checkpoint: The model checkpoint to query
-
-     Returns:
-         The base model name if found, or None if not found or error occurs
-     """
-     # Skip network call in offline mode
-     if is_offline():
-         return None
-
-     try:
-         info = model_info(checkpoint)
-         if info.cardData and "base_model" in info.cardData:
-             if info.cardData["base_model"] is not None:
-                 # This is a derived model
-                 return info.cardData["base_model"]
-             else:
-                 # This is itself a base model
-                 return [checkpoint]
-     except Exception:  # pylint: disable=broad-except
-         pass
-     return None
-
-
- class HuggingfaceLoad(FirstTool):
-     """
-     Load an LLM as a torch.nn.Module using the Hugging Face transformers
-     from_pretrained() API.
-
-     Expected input: a checkpoint to load
-
-     Output state produced:
-         - state.model: instance of torch.nn.Module that implements an LLM.
-         - state.inputs: tokenized example inputs to the model, in the form of a
-           dictionary of kwargs.
-         - state.tokenizer: instance of Hugging Face PretrainedTokenizer.
-         - state.dtype: data type of the model.
-         - state.checkpoint: pretrained checkpoint used to load the model.
-     """
-
-     unique_name = "huggingface-load"
-
-     def __init__(self):
-         super().__init__(monitor_message="Loading Huggingface checkpoint")
-
-         self.status_stats = [Keys.DTYPE]
-
-     @staticmethod
-     def parser(add_help: bool = True) -> argparse.ArgumentParser:
-         parser = __class__.helpful_parser(
-             short_description="Load an LLM in PyTorch using huggingface transformers",
-             add_help=add_help,
-         )
-
-         default_dtype = "float32"
-         parser.add_argument(
-             "--dtype",
-             "-d",
-             required=False,
-             default=default_dtype,
-             help=f"Data type to load the model in (default: {default_dtype}).",
-         )
-
-         choices = ["cpu", "cuda"]
-         for cuda in range(15):
-             choices.append(f"cuda:{cuda}")
-         parser.add_argument(
-             "--device",
-             required=False,
-             default=None,
-             choices=choices,
-             help="Move the model and inputs to a device using the .to() method "
-             "(default: don't call the .to() method)",
-         )
-
-         parser.add_argument(
-             "--load-kwargs",
-             required=False,
-             default="{}",
-             type=json.loads,
-             help="Arbitrary kwargs, in json format, that will be passed as "
-             "from_pretrained(**kwargs). "
-             r"Example: --load-kwargs='{\"trust_remote_code\": true} would result in "
-             "from_pretrained(trust_remote_code=True)",
-         )
-
-         parser.add_argument(
-             "--channels-last",
-             default=True,
-             type=bool,
-             help="Whether to format the model in memory using "
-             "channels-last (default: True)",
-         )
-
-         return parser
-
-     def parse(self, state: State, args, known_only=True) -> argparse.Namespace:
-
-         parsed_args = super().parse(state, args, known_only)
-
-         # Save stats about the user's input (do this prior to decoding)
-         state.save_stat(Keys.CHECKPOINT, parsed_args.input)
-         state.save_stat(Keys.DTYPE, parsed_args.dtype)
-
-         # Decode dtype arg into a torch value
-         parsed_args.dtype = str_to_dtype[parsed_args.dtype]
-
-         return parsed_args
-
-     def run(
-         self,
-         state: State,
-         input: str = "",
-         dtype: torch.dtype = torch.float32,
-         device: Optional[str] = None,
-         load_kwargs: Optional[Dict] = None,
-         channels_last: bool = True,
-     ) -> State:
-         # Auto-detect offline status
-         offline = is_offline()
-         if offline:
-             printing.log_warning(
-                 "Network connectivity to huggingface.co not detected. Running in offline mode."
-             )
-
-         checkpoint = input
-
-         if load_kwargs is None:
-             load_kwargs_to_use = {}
-         else:
-             load_kwargs_to_use = load_kwargs
-
-         # Add local_files_only to kwargs in offline mode
-         if offline:
-             load_kwargs_to_use["local_files_only"] = True
-
-         if vars(state).get(Keys.MODEL):
-             raise ValueError("HuggingfaceLoad must be the first tool in the sequence")
-
-         try:
-             model = transformers.AutoModelForCausalLM.from_pretrained(
-                 checkpoint,
-                 torch_dtype=dtype,
-                 low_cpu_mem_usage=True,
-                 **load_kwargs_to_use,
-             )
-         except Exception as e:
-             if offline and "Can't load config for" in str(e):
-                 raise ValueError(
-                     f"Cannot load model {checkpoint} in offline mode. "
-                     f"The model files may not be available locally. Original error: {str(e)}"
-                 )
-             raise
-
-         # Only call the model.to() method if an argument to this function
-         # provides a reason to do so
-         to_args = {}
-         if channels_last:
-             to_args["memory_format"] = torch.channels_last
-         if device:
-             to_args["device"] = device
-         if to_args:
-             model.to(**to_args)
-
-         model = model.eval()
-
-         try:
-             tokenizer_kwargs = {
-                 "use_fast": False,
-                 "model_max_length": 4096,
-                 "padding_side": "left",
-             }
-             if offline:
-                 tokenizer_kwargs["local_files_only"] = True
-
-             tokenizer = transformers.AutoTokenizer.from_pretrained(
-                 checkpoint, **tokenizer_kwargs
-             )
-         except ValueError as e:
-             # Sometimes those specific tokenizer flags are not supported, in which
-             # case we try to just load a simple tokenizer
-             tokenizer_kwargs = {}
-             if offline:
-                 tokenizer_kwargs["local_files_only"] = True
-
-             try:
-                 tokenizer = transformers.AutoTokenizer.from_pretrained(
-                     checkpoint, **tokenizer_kwargs
-                 )
-             except Exception as e:
-                 if offline and "Can't load tokenizer for" in str(e):
-                     raise ValueError(
-                         f"Cannot load tokenizer for {checkpoint} in offline mode. "
-                         f"The tokenizer files may not be available locally. "
-                         f"Original error: {str(e)}"
-                     )
-                 raise
-
-         # Pass the model and inputs into state
-         state.model = HuggingfaceAdapter(model, dtype, device, tokenizer)
-
-         state.tokenizer = HuggingfaceTokenizerAdapter(tokenizer, device)
-         state.dtype = dtype
-         state.checkpoint = checkpoint
-         state.device = device
-
-         # Save stats about the model
-         state.save_stat(Keys.CHECKPOINT, checkpoint)
-         state.save_stat(Keys.DTYPE, str(dtype).split(".")[1])
-         state.save_stat(Keys.DEVICE, device)
-
-         # Get base model information
-         base_model = get_base_model(checkpoint)
-         if base_model is not None:
-             state.save_stat("base_model", base_model)
-
-         # Create a UniqueInvocationInfo and ModelInfo so that we can display status
-         # at the end of the sequence
-         status.add_to_state(state=state, name=input, model=model)
-
-         return state
-
-
  class HuggingfaceAdapter(ModelAdapter):
      """
      Wrapper class for Huggingface LLMs that handle generation arguments
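Note that the is_offline() and get_base_model() helpers removed here are not deleted from the package: the new lemonade/common/network.py (+50 lines in the file list) hosts them, and load.py above imports them from there:

from lemonade.common.network import is_offline, get_base_model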
@@ -522,5 +275,84 @@ class HuggingfaceAdapter(ModelAdapter):
          return text_offset, token_log_probs, token_strings, top_logprobs_list


- # This file was originally licensed under Apache 2.0. It has been modified.
- # Modifications Copyright (c) 2025 AMD
+ def benchmark_huggingface_llm(
+     model: torch.nn.Module,
+     tokenizer,
+     input_ids,
+     dtype,
+     num_beams: int,
+     target_output_tokens: int,
+     iterations: int,
+     warmup_iterations: int,
+     report_progress_fn,
+ ) -> List[Tuple[float, int]]:
+
+     amp_enabled = True if (dtype == torch.float16 or dtype == torch.bfloat16) else False
+     # The "if amp_enabled else nullcontext()" is to get around a bug in PyTorch 2.1
+     # where torch.cpu.amp.autocast(enabled=False) does nothing
+     with (
+         torch.cpu.amp.autocast(enabled=amp_enabled, dtype=dtype)
+         if amp_enabled
+         else nullcontext()
+     ):
+
+         per_iteration_result = []
+         tokens_out_len_list = []
+
+         # Early stopping is only a valid parameter with multiple beams
+         early_stopping = num_beams > 1
+
+         with torch.no_grad(), torch.inference_mode():
+             # Don't capture time for warmup
+             for count in range(warmup_iterations):
+                 outputs = model.generate(
+                     input_ids,
+                     num_beams=num_beams,
+                     max_new_tokens=target_output_tokens,
+                     min_new_tokens=target_output_tokens,
+                     early_stopping=early_stopping,
+                     pad_token_id=tokenizer.eos_token_id,
+                 )
+                 tokens_out_len_list.append(outputs.shape[1] - input_ids.shape[1])
+                 report_progress_fn((count + 1) / (warmup_iterations + iterations))
+
+             for count in range(iterations):
+                 # CUDA synchronization is required prior to GPU benchmarking
+                 # This has no negative effect on CPU-only benchmarks, and is more robust than
+                 # checking `model.device == "cuda"` since it applies to multi-GPU environments
+                 # Synchronization is done before collecting the start time because this will
+                 # ensure that the GPU has finished initialization tasks such as loading weights
+                 if torch.cuda.is_available():
+                     torch.cuda.synchronize()
+                 start_time = time.perf_counter()
+
+                 outputs = model.generate(
+                     input_ids,
+                     num_beams=num_beams,
+                     max_new_tokens=target_output_tokens,
+                     min_new_tokens=target_output_tokens,
+                     early_stopping=early_stopping,
+                     pad_token_id=tokenizer.eos_token_id,
+                 )
+
+                 if torch.cuda.is_available():
+                     torch.cuda.synchronize()
+                 end_time = time.perf_counter()
+
+                 latency = end_time - start_time
+
+                 token_len = outputs.shape[1] - input_ids.shape[1]
+                 tokens_out_len_list.append(token_len)
+
+                 # Only count an iteration if it produced enough tokens
+                 if token_len >= target_output_tokens:
+                     per_iteration_result.append((latency, token_len))
+
+                 report_progress_fn(
+                     (warmup_iterations + count + 1) / (warmup_iterations + iterations)
+                 )
+
+     if not per_iteration_result:
+         raise Bench.not_enough_tokens(target_output_tokens)
+
+     return per_iteration_result, tokens_out_len_list
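benchmark_huggingface_llm() returns raw per-iteration (latency, generated-token-count) pairs rather than finished statistics; the aggregation lives in the bench tooling, which is not part of this diff. A hedged sketch of the obvious reduction to mean latency and mean tokens per second:

from statistics import mean

def summarize(per_iteration_result):
    # per_iteration_result: list of (latency_seconds, tokens_generated) tuples
    mean_latency = mean(lat for lat, _ in per_iteration_result)
    mean_tokens_per_second = mean(tok / lat for lat, tok in per_iteration_result)
    return mean_latency, mean_tokens_per_second

# summarize([(2.0, 64), (1.6, 64)]) -> (1.8, 36.0)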
lemonade/tools/humaneval.py
@@ -2,9 +2,7 @@ import argparse
  import os
  import csv
  from typing import Dict, Optional, Any
- import requests
- from human_eval.data import write_jsonl, read_problems
- from human_eval.evaluation import evaluate_functional_correctness
+

  from lemonade.state import State
  from lemonade.tools import Tool
@@ -95,6 +93,7 @@ class AccuracyHumaneval(Tool):
          Returns:
              Updated state with evaluation results
          """
+
          # Validate required state components
          if not hasattr(state, "model") or not hasattr(state, "tokenizer"):
              raise ValueError("State must contain both 'model' and 'tokenizer'")
@@ -128,6 +127,9 @@ class AccuracyHumaneval(Tool):

      def _download_dataset(self, output_path: str) -> None:
          """Download HumanEval dataset if not already present."""
+
+         import requests
+
          if os.path.exists(output_path):
              printing.log_info(f"Dataset already exists at: {output_path}")
              return
@@ -170,6 +172,10 @@ class AccuracyHumaneval(Tool):
          Returns:
              Dictionary containing evaluation metrics
          """
+
+         from human_eval.data import write_jsonl, read_problems
+         from human_eval.evaluation import evaluate_functional_correctness
+
          dataset = read_problems(data_path)

          # Limit to first N problems
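The humaneval.py changes above follow the same pattern used elsewhere in this release (see the "Import expensive modules at runtime" comment in load.py): heavy third-party imports such as requests and human_eval move from module scope into the methods that need them, so importing the tool module no longer requires those optional dependencies to be installed. A generic sketch of the pattern (names here are illustrative, not lemonade code):

class DatasetTool:
    def download(self, url: str, output_path: str) -> None:
        # Deferred import: the dependency is only needed when this method runs
        import requests

        with open(output_path, "wb") as f:
            f.write(requests.get(url, timeout=60).content)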
lemonade/tools/{llamacpp_bench.py → llamacpp/bench.py}
@@ -3,7 +3,7 @@ import statistics
  from statistics import StatisticsError
  from lemonade.state import State
  from lemonade.cache import Keys
- from lemonade.tools.llamacpp import LlamaCppAdapter
+ from lemonade.tools.llamacpp.load import LlamaCppAdapter
  from lemonade.tools.bench import Bench
