lemonade_sdk-9.1.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. lemonade/__init__.py +5 -0
  2. lemonade/api.py +180 -0
  3. lemonade/cache.py +92 -0
  4. lemonade/cli.py +173 -0
  5. lemonade/common/__init__.py +0 -0
  6. lemonade/common/build.py +176 -0
  7. lemonade/common/cli_helpers.py +139 -0
  8. lemonade/common/exceptions.py +98 -0
  9. lemonade/common/filesystem.py +368 -0
  10. lemonade/common/inference_engines.py +408 -0
  11. lemonade/common/network.py +93 -0
  12. lemonade/common/printing.py +110 -0
  13. lemonade/common/status.py +471 -0
  14. lemonade/common/system_info.py +1411 -0
  15. lemonade/common/test_helpers.py +28 -0
  16. lemonade/profilers/__init__.py +1 -0
  17. lemonade/profilers/agt_power.py +437 -0
  18. lemonade/profilers/hwinfo_power.py +429 -0
  19. lemonade/profilers/memory_tracker.py +259 -0
  20. lemonade/profilers/profiler.py +58 -0
  21. lemonade/sequence.py +363 -0
  22. lemonade/state.py +159 -0
  23. lemonade/tools/__init__.py +1 -0
  24. lemonade/tools/accuracy.py +432 -0
  25. lemonade/tools/adapter.py +114 -0
  26. lemonade/tools/bench.py +302 -0
  27. lemonade/tools/flm/__init__.py +1 -0
  28. lemonade/tools/flm/utils.py +305 -0
  29. lemonade/tools/huggingface/bench.py +187 -0
  30. lemonade/tools/huggingface/load.py +235 -0
  31. lemonade/tools/huggingface/utils.py +359 -0
  32. lemonade/tools/humaneval.py +264 -0
  33. lemonade/tools/llamacpp/bench.py +255 -0
  34. lemonade/tools/llamacpp/load.py +222 -0
  35. lemonade/tools/llamacpp/utils.py +1260 -0
  36. lemonade/tools/management_tools.py +319 -0
  37. lemonade/tools/mmlu.py +319 -0
  38. lemonade/tools/oga/__init__.py +0 -0
  39. lemonade/tools/oga/bench.py +120 -0
  40. lemonade/tools/oga/load.py +804 -0
  41. lemonade/tools/oga/migration.py +403 -0
  42. lemonade/tools/oga/utils.py +462 -0
  43. lemonade/tools/perplexity.py +147 -0
  44. lemonade/tools/prompt.py +263 -0
  45. lemonade/tools/report/__init__.py +0 -0
  46. lemonade/tools/report/llm_report.py +203 -0
  47. lemonade/tools/report/table.py +899 -0
  48. lemonade/tools/server/__init__.py +0 -0
  49. lemonade/tools/server/flm.py +133 -0
  50. lemonade/tools/server/llamacpp.py +320 -0
  51. lemonade/tools/server/serve.py +2123 -0
  52. lemonade/tools/server/static/favicon.ico +0 -0
  53. lemonade/tools/server/static/index.html +279 -0
  54. lemonade/tools/server/static/js/chat.js +1059 -0
  55. lemonade/tools/server/static/js/model-settings.js +183 -0
  56. lemonade/tools/server/static/js/models.js +1395 -0
  57. lemonade/tools/server/static/js/shared.js +556 -0
  58. lemonade/tools/server/static/logs.html +191 -0
  59. lemonade/tools/server/static/styles.css +2654 -0
  60. lemonade/tools/server/static/webapp.html +321 -0
  61. lemonade/tools/server/tool_calls.py +153 -0
  62. lemonade/tools/server/tray.py +664 -0
  63. lemonade/tools/server/utils/macos_tray.py +226 -0
  64. lemonade/tools/server/utils/port.py +77 -0
  65. lemonade/tools/server/utils/thread.py +85 -0
  66. lemonade/tools/server/utils/windows_tray.py +408 -0
  67. lemonade/tools/server/webapp.py +34 -0
  68. lemonade/tools/server/wrapped_server.py +559 -0
  69. lemonade/tools/tool.py +374 -0
  70. lemonade/version.py +1 -0
  71. lemonade_install/__init__.py +1 -0
  72. lemonade_install/install.py +239 -0
  73. lemonade_sdk-9.1.1.dist-info/METADATA +276 -0
  74. lemonade_sdk-9.1.1.dist-info/RECORD +84 -0
  75. lemonade_sdk-9.1.1.dist-info/WHEEL +5 -0
  76. lemonade_sdk-9.1.1.dist-info/entry_points.txt +5 -0
  77. lemonade_sdk-9.1.1.dist-info/licenses/LICENSE +201 -0
  78. lemonade_sdk-9.1.1.dist-info/licenses/NOTICE.md +47 -0
  79. lemonade_sdk-9.1.1.dist-info/top_level.txt +3 -0
  80. lemonade_server/cli.py +805 -0
  81. lemonade_server/model_manager.py +758 -0
  82. lemonade_server/pydantic_models.py +159 -0
  83. lemonade_server/server_models.json +643 -0
  84. lemonade_server/settings.py +39 -0
lemonade/tools/huggingface/utils.py
@@ -0,0 +1,359 @@
+ from typing import Dict, List, Tuple
+ import time
+ from contextlib import nullcontext
+ import transformers
+ import torch
+ from lemonade.state import State
+ from lemonade.tools.adapter import TokenizerAdapter
+ from lemonade.tools.adapter import ModelAdapter
+ from lemonade.tools.bench import Bench
+
+ # Command line interfaces for tools will use string inputs for data
+ # types, however the internal tool logic will need to know the actual
+ # torch type
+ str_to_dtype = {
+     "float32": torch.float32,
+     "float16": torch.float16,
+     "bfloat16": torch.bfloat16,
+     "int8_static": torch.int8,
+     "int8_dynamic": torch.int8,
+ }
+
+
+ def make_example_inputs(state: State) -> Dict:
+     """
+     Create a dictionary of LLM inputs that can be passed as an argument
+     into quantization, ONNX export, etc.
+     """
+
+     tokenizer = state.tokenizer
+     inputs_ids = tokenizer("Hello there", return_tensors="pt").input_ids
+     return {"input_ids": inputs_ids}
+
+
+ class HuggingfaceTokenizerAdapter(TokenizerAdapter):
+     def __init__(self, tokenizer: transformers.AutoTokenizer, device: str):
+         super().__init__(tokenizer)
+         self.tokenizer = tokenizer
+         self.device = device
+
+     def __call__(self, prompt, **kwargs):
+         tokens = self.tokenizer(prompt, **kwargs)
+         if self.device:
+             return tokens.to(self.device)
+         else:
+             return tokens
+
+     def decode(self, response, **kwargs):
+         return self.tokenizer.decode(response, **kwargs)
+
+     def batch_decode(self, tokens, **kwargs):
+         return self.tokenizer.batch_decode(tokens, **kwargs)
+
+     @property
+     def eos_token_id(self):
+         return self.tokenizer.eos_token_id
+
+     def save_pretrained(self, model_dir, **kwargs):
+         return self.tokenizer.save_pretrained(model_dir, **kwargs)
+
+
+ class HuggingfaceAdapter(ModelAdapter):
+     """
+     Wrapper class for Huggingface LLMs that handle generation arguments
+     from callers to match HF specification.
+
+     repetition_penalty: helps the LLM avoid repeating the same short
+         phrase in the response over and over.
+     temperature: helps the LLM stay focused on the prompt.
+     do_sample: apply the temperature.
+     """
+
+     def __init__(self, model, dtype=torch.float32, device="cpu", tokenizer=None):
+         super().__init__()
+         self.model = model
+         self.dtype = dtype
+         self.device = device
+         self.tokenizer = tokenizer
+
+     def generate(
+         self,
+         input_ids,
+         random_seed=1,
+         **kwargs,
+     ):
+
+         # Move input_ids to the same device as the model
+         input_ids = input_ids.to(self.device)
+
+         # Fix temperature handling to avoid errors:
+         # If temperature is 0.0, force do_sample=False (greedy decoding)
+         if kwargs.get("temperature") == 0.0:
+             kwargs["do_sample"] = False
+
+         # If do_sample is False and temperature is 0.0, remove temperature
+         # to avoid the warning from HuggingFace.
+         # Note: This is the same approach taken by LM Eval Harness for handling temperature.
+         generation_kwargs = {
+             "max_new_tokens": kwargs.get("max_new_tokens", 512),
+             "do_sample": kwargs.get("do_sample", True),
+             **kwargs,
+         }
+
+         if random_seed is None:
+             torch.random.seed()
+         else:
+             torch.random.manual_seed(random_seed)
+
+         with torch.no_grad(), torch.inference_mode():
+             outputs = self.model.generate(input_ids=input_ids, **generation_kwargs)
+
+         self.prompt_tokens = input_ids.shape[1]
+         self.response_tokens = len(outputs[0]) - self.prompt_tokens
+         return outputs
+
+     def _model_call(self, input_tensor):
+         """Forward pass through the model to get logits
+
+         This method directly calls the model forward pass rather than using model.generate() for
+         several important reasons:
+         1. Purpose: We need raw logits from a single forward pass, while generate() is for producing
+            multiple tokens through iterative inference
+         2. Efficiency: Direct calls are more efficient for logprob calculations with no sampling
+            overhead
+         3. Precision: Logprob calculations require exact control over input-to-output mapping
+         4. Consistency: Similar approach used in both HF and OGA implementations
+
+         Args:
+             input_tensor: Input token IDs tensor
+
+         Returns:
+             Logits tensor from model forward pass
+         """
+         with torch.no_grad(), torch.inference_mode():
+             outputs = self.model(input_tensor)
+             return outputs.logits
+
+     def _select_cont_toks(self, logits, context_len, cont_toks):
+         """
+         Select logits corresponding to continuation tokens and gather their probabilities
+
+         Args:
+             logits: Model output logits
+             context_len: Length of input context
+             cont_toks: List of continuation token IDs
+
+         Returns:
+             Tensor of log probabilities for continuation tokens
+         """
+         # Get the continuation logits (discard context logits)
+         cont_logits = logits[context_len - 1 : context_len - 1 + len(cont_toks)]
+
+         # Convert cont_toks to tensor if needed
+         if not isinstance(cont_toks, torch.Tensor):
+             cont_toks = torch.tensor(cont_toks, dtype=torch.long, device=logits.device)
+
+         # Gather log probs at the corresponding token indices
+         log_probs = torch.log_softmax(cont_logits, dim=-1)
+         token_log_probs = torch.gather(log_probs, 1, cont_toks.unsqueeze(-1)).squeeze(
+             -1
+         )
+
+         return token_log_probs
+
+     def compute_logprobs(
+         self, text, tokenizer, prompt_length=None, logprobs=None, echo=False
+     ):
+         """
+         Compute log probabilities for all tokens in the given text.
+
+         Args:
+             text: The full text to analyze (e.g., prompt + completion)
+             prompt_length: Number of tokens in the prompt. If provided and echo=False,
+                 only completion tokens after this position will be returned.
+             logprobs: If not None, return log probabilities. Value indicates how many top
+                 alternatives to return. If True but not an integer, defaults to 5 alternatives.
+             echo: If True, include logprobs for prompt tokens. If False, only return logprobs
+                 for completion tokens.
+
+         Returns:
+             - text_offset: Character offsets for each token in the text
+             - token_logprobs: Log probability for each token
+             - tokens: The actual tokens used
+             - top_logprobs: Top alternative log probabilities for each position
+         """
+         if tokenizer is None:
+             raise ValueError("Tokenizer is required for logprob calculation")
+
+         # Encode the full text
+         tokens = tokenizer(text).input_ids
+
+         # Track character offsets for each token
+         text_offset = []
+         start_idx = 0
+
+         token_strings = []
+         for token_id in tokens:
+             token_str = tokenizer.decode([token_id])
+             token_strings.append(token_str)
+
+             # Calculate character offsets for tokens - handles cases where tokens
+             # may not directly match in the original text due to encoding differences,
+             # special characters, or tokenization artifacts
+             try:
+                 pos = text[start_idx:].find(token_str)
+                 if pos != -1:
+                     text_offset.append(start_idx + pos)
+                     start_idx += pos + len(token_str)
+                 else:
+                     text_offset.append(start_idx)
+             except (TypeError, ValueError, UnicodeError):
+                 # Fallback to current position when matching fails due to encoding issues
+                 text_offset.append(start_idx)
+
+         # Convert to tensor and get model output
+         input_tensor = torch.tensor([tokens], dtype=torch.long, device=self.device)
+         logits = self._model_call(input_tensor)[0]
+
+         # Calculate log probabilities for each token
+         all_log_probs = torch.log_softmax(logits, dim=-1)
+
+         # The first token doesn't have a conditional probability
+         # For tokens after the first, get the predicted probability
+         token_log_probs = []
+         top_logprobs_list = []
+
+         # For each position, get the actual token probability and top alternatives
+         for i in range(len(tokens)):
+             # Get previous token position logits
+             if i > 0:  # First token has no preceding context
+                 prev_logits = all_log_probs[i - 1]
+                 curr_token_id = tokens[i]
+                 # Get probability of the actual token that appeared
+                 token_logprob = prev_logits[curr_token_id].item()
+                 token_log_probs.append(token_logprob)
+
+                 # Get top-k alternatives if requested
+                 if logprobs is not None:
+                     num_alternatives = logprobs if isinstance(logprobs, int) else 5
+                     topk_values, topk_indices = torch.topk(
+                         prev_logits, min(num_alternatives, prev_logits.size(-1))
+                     )
+
+                     # Create dictionary of token: logprob
+                     position_logprobs = {}
+                     for val, idx in zip(topk_values.tolist(), topk_indices.tolist()):
+                         token_str = tokenizer.decode([idx])
+                         position_logprobs[token_str] = val
+
+                     top_logprobs_list.append(position_logprobs)
+             else:
+                 # For the first token, we don't have a conditional probability
+                 token_log_probs.append(None)
+                 top_logprobs_list.append({})
+
+         # If we don't want to echo prompt tokens, filter them out
+         if not echo and prompt_length is not None:
+             # Ensure prompt_length is within bounds
+             prompt_length = min(prompt_length, len(tokens))
+
+             # Filter results to only include completion tokens
+             if prompt_length < len(tokens):
+                 filtered_text_offset = text_offset[prompt_length:]
+                 filtered_token_logprobs = token_log_probs[prompt_length:]
+                 filtered_tokens = token_strings[prompt_length:]
+                 filtered_top_logprobs = top_logprobs_list[prompt_length:]
+
+                 return (
+                     filtered_text_offset,
+                     filtered_token_logprobs,
+                     filtered_tokens,
+                     filtered_top_logprobs,
+                 )
+             else:
+                 # No completion tokens
+                 return [], [], [], []
+
+         return text_offset, token_log_probs, token_strings, top_logprobs_list
+
+
+ def benchmark_huggingface_llm(
+     model: torch.nn.Module,
+     tokenizer,
+     input_ids,
+     dtype,
+     num_beams: int,
+     target_output_tokens: int,
+     iterations: int,
+     warmup_iterations: int,
+     report_progress_fn,
+ ) -> List[Tuple[float, int]]:
+
+     amp_enabled = True if (dtype == torch.float16 or dtype == torch.bfloat16) else False
+     # The "if amp_enabled else nullcontext()" is to get around a bug in PyTorch 2.1
+     # where torch.cpu.amp.autocast(enabled=False) does nothing
+     with (
+         torch.cpu.amp.autocast(enabled=amp_enabled, dtype=dtype)
+         if amp_enabled
+         else nullcontext()
+     ):
+
+         per_iteration_result = []
+         tokens_out_len_list = []
+
+         # Early stopping is only a valid parameter with multiple beams
+         early_stopping = num_beams > 1
+
+         with torch.no_grad(), torch.inference_mode():
+             # Don't capture time for warmup
+             for count in range(warmup_iterations):
+                 outputs = model.generate(
+                     input_ids,
+                     num_beams=num_beams,
+                     max_new_tokens=target_output_tokens,
+                     min_new_tokens=target_output_tokens,
+                     early_stopping=early_stopping,
+                     pad_token_id=tokenizer.eos_token_id,
+                 )
+                 tokens_out_len_list.append(outputs.shape[1] - input_ids.shape[1])
+                 report_progress_fn((count + 1) / (warmup_iterations + iterations))
+
+             for count in range(iterations):
+                 # CUDA synchronization is required prior to GPU benchmarking
+                 # This has no negative effect on CPU-only benchmarks, and is more robust than
+                 # checking `model.device == "cuda"` since it applies to multi-GPU environments
+                 # Synchronization is done before collecting the start time because this will
+                 # ensure that the GPU has finished initialization tasks such as loading weights
+                 if torch.cuda.is_available():
+                     torch.cuda.synchronize()
+                 start_time = time.perf_counter()
+
+                 outputs = model.generate(
+                     input_ids,
+                     num_beams=num_beams,
+                     max_new_tokens=target_output_tokens,
+                     min_new_tokens=target_output_tokens,
+                     early_stopping=early_stopping,
+                     pad_token_id=tokenizer.eos_token_id,
+                 )
+
+                 if torch.cuda.is_available():
+                     torch.cuda.synchronize()
+                 end_time = time.perf_counter()
+
+                 latency = end_time - start_time
+
+                 tokens_out_len_list.append(model.response_tokens)
+
+                 # Only count an iteration if it produced enough tokens
+                 if model.response_tokens >= target_output_tokens:
+                     per_iteration_result.append((latency, model.response_tokens))
+
+                 report_progress_fn(
+                     (warmup_iterations + count + 1) / (warmup_iterations + iterations)
+                 )
+
+     if not per_iteration_result:
+         raise Bench.not_enough_tokens(target_output_tokens)
+
+     return per_iteration_result, tokens_out_len_list
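
The adapter pair above mirrors the standard `transformers` generate/decode workflow while adding OpenAI-style per-token logprob reporting. The following is a minimal usage sketch, assuming this hunk is `lemonade/tools/huggingface/utils.py` (per the +359 entry in the file list above); the checkpoint name and prompt are illustrative placeholders, and model loading is normally handled by lemonade's own loading tool (e.g. `lemonade/tools/huggingface/load.py`) rather than directly as shown here.

```python
# Illustrative sketch only: "facebook/opt-125m" and the prompt are placeholders,
# not values taken from the lemonade package.
import torch
import transformers

from lemonade.tools.huggingface.utils import (
    HuggingfaceAdapter,
    HuggingfaceTokenizerAdapter,
)

checkpoint = "facebook/opt-125m"  # hypothetical small model for the example
hf_model = transformers.AutoModelForCausalLM.from_pretrained(checkpoint)
hf_tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint)

# Wrap the raw Hugging Face objects in the adapters defined above
tokenizer = HuggingfaceTokenizerAdapter(hf_tokenizer, device="cpu")
model = HuggingfaceAdapter(
    hf_model, dtype=torch.float32, device="cpu", tokenizer=tokenizer
)

# Greedy generation; prompt_tokens / response_tokens are populated as a side effect
input_ids = tokenizer("def fibonacci(n):", return_tensors="pt").input_ids
outputs = model.generate(input_ids, max_new_tokens=32, do_sample=False)
print(tokenizer.decode(outputs[0]))

# Per-token log probabilities with the top-5 alternatives at each position
text = tokenizer.decode(outputs[0])
offsets, token_logprobs, tokens, top_logprobs = model.compute_logprobs(
    text, hf_tokenizer, logprobs=5, echo=True
)
```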
lemonade/tools/humaneval.py
@@ -0,0 +1,264 @@
+ import argparse
+ import os
+ import csv
+ from typing import Dict, Optional, Any
+
+
+ from lemonade.state import State
+ from lemonade.tools import Tool
+ import lemonade.common.printing as printing
+ import lemonade.common.build as build
+
+
+ class AccuracyHumaneval(Tool):
+     """
+     HumanEval accuracy measurement tool.
+
+     This tool evaluates language models on the HumanEval dataset, which consists of
+     Python programming problems. It measures the model's ability to:
+     1. Generate functionally correct code completions
+     2. Pass unit tests for each programming problem
+
+     Metrics:
+     - pass@1: Percentage of problems solved with 1 generation attempt
+     - pass@10: Percentage of problems solved within 10 generation attempts
+     - pass@100: Percentage of problems solved within 100 generation attempts
+
+     See docs/dev_cli/humaneval_accuracy.md for more details
+     """
+
+     unique_name = "accuracy-humaneval"
+     DATASET = "https://github.com/openai/human-eval/blob/master/data/HumanEval.jsonl.gz?raw=true"
+     TOTAL_PROBLEMS = 164  # Total number of problems in the HumanEval dataset
+
+     def __init__(self):
+         super().__init__(monitor_message="Measuring accuracy with HumanEval")
+         self.status_stats = []
+         # Enable code evaluation for HumanEval
+         os.environ["HF_ALLOW_CODE_EVAL"] = "1"
+
+     @staticmethod
+     def parser(add_help: bool = True) -> argparse.ArgumentParser:
+         parser = __class__.helpful_parser(
+             short_description="Measure coding accuracy with HumanEval",
+             add_help=add_help,
+         )
+         parser.add_argument(
+             "--k-samples",
+             type=int,
+             default=1,
+             help="Number of completions to generate per prompt for pass@k calculation"
+             " (default: %(default)s)",
+         )
+         parser.add_argument(
+             "--first-n-samples",
+             type=int,
+             default=AccuracyHumaneval.TOTAL_PROBLEMS,
+             help=f"Evaluate only the first N problems from the dataset (default: "
+             f"%(default)s, evaluates all {AccuracyHumaneval.TOTAL_PROBLEMS} problems)",
+         )
+         parser.add_argument(
+             "--timeout",
+             type=float,
+             default=30.0,
+             help="Timeout in seconds for each test case (default: %(default)s)",
+         )
+         parser.add_argument(
+             "--data-dir",
+             type=str,
+             default=None,
+             help="Custom directory for dataset storage (default: %(default)s, "
+             "uses <lemonade_cache_dir>/data/humaneval)",
+         )
+         return parser
+
+     def run(
+         self,
+         state: State,
+         data_dir: Optional[str] = None,
+         k_samples: int = 1,
+         first_n_samples: Optional[int] = TOTAL_PROBLEMS,
+         timeout: float = 30.0,
+     ) -> State:
+         """
+         Run HumanEval evaluation on the model.
+
+         Args:
+             state: Current state containing model and tokenizer
+             data_dir: Optional custom directory for dataset storage
+             k_samples: Number of completions to generate per prompt for pass@k calculation
+             first_n_samples: Number of first N problems to evaluate
+             timeout: Timeout in seconds for each test case
+
+         Returns:
+             Updated state with evaluation results
+         """
+
+         # Validate required state components
+         if not hasattr(state, "model") or not hasattr(state, "tokenizer"):
+             raise ValueError("State must contain both 'model' and 'tokenizer'")
+
+         # Setup directories
+         data_dir_to_use = data_dir or os.path.join(state.cache_dir, "data", "humaneval")
+         data_path = os.path.join(data_dir_to_use, "HumanEval.jsonl.gz")
+         model_results_dir = os.path.join(
+             build.output_dir(state.cache_dir, state.build_name), "humaneval"
+         )
+         os.makedirs(model_results_dir, exist_ok=True)
+
+         # Download dataset if needed
+         self._download_dataset(data_path)
+
+         # Run evaluation
+         results = self._evaluate_model(
+             state.model,
+             state.tokenizer,
+             data_path,
+             k_samples,
+             timeout,
+             model_results_dir,
+             first_n_samples,
+         )
+
+         # Save metrics
+         self._save_metrics(state, results)
+
+         return state
+
+     def _download_dataset(self, output_path: str) -> None:
+         """Download HumanEval dataset if not already present."""
+
+         import requests
+
+         if os.path.exists(output_path):
+             printing.log_info(f"Dataset already exists at: {output_path}")
+             return
+
+         os.makedirs(os.path.dirname(output_path), exist_ok=True)
+         response = requests.get(self.DATASET, stream=True)
+
+         if response.status_code == 200:
+             with open(output_path, "wb") as file:
+                 for chunk in response.iter_content(chunk_size=8192):
+                     file.write(chunk)
+             printing.log_info(f"Dataset downloaded successfully to: {output_path}")
+         else:
+             raise RuntimeError(
+                 f"Failed to download dataset. Status code: {response.status_code}"
+             )
+
+     def _evaluate_model(
+         self,
+         model: Any,
+         tokenizer: Any,
+         data_path: str,
+         k_samples: int,
+         timeout: float,
+         results_dir: str,
+         first_n_samples: Optional[int] = TOTAL_PROBLEMS,
+     ) -> Dict[str, float]:
+         """
+         Evaluate model on HumanEval dataset.
+
+         Args:
+             model: The language model to evaluate
+             tokenizer: The tokenizer for the model
+             data_path: Path to the HumanEval dataset
+             k_samples: Number of completions per prompt for pass@k calculation
+             timeout: Test case timeout in seconds
+             results_dir: Directory to save results
+             first_n_samples: Number of first N problems to evaluate
+
+         Returns:
+             Dictionary containing evaluation metrics
+         """
+
+         from human_eval.data import write_jsonl, read_problems
+         from human_eval.evaluation import evaluate_functional_correctness
+
+         dataset = read_problems(data_path)
+
+         # Limit to first N problems
+         dataset_keys = list(dataset.keys())[:first_n_samples]
+         ignore_incomplete = True
+
+         samples = []
+
+         # Update Tool progress monitor
+         self.set_percent_progress(0.0)
+         questions_completed = 0
+         number_of_questions = first_n_samples * k_samples
+
+         # Save completions and expected answers
+         csv_path = os.path.join(results_dir, "evaluation_results.csv")
+         with open(
+             csv_path, mode="w", newline="", encoding="utf-8", errors="replace"
+         ) as file:
+             writer = csv.writer(file)
+             writer.writerow(["Prompt", "Completion", "Expected Answer"])
+
+             for task_id in dataset_keys:
+                 try:
+                     for _ in range(k_samples):
+                         prompt = dataset[task_id]["prompt"]
+                         expected = dataset[task_id]["canonical_solution"]
+
+                         # Generate completion
+                         input_ids = tokenizer(prompt, return_tensors="pt").input_ids
+                         completion = model.generate(
+                             input_ids,
+                             max_new_tokens=512,
+                             do_sample=False,
+                         )
+                         completion_text = tokenizer.decode(
+                             completion[0], skip_special_tokens=True
+                         )
+
+                         # Save results
+                         samples.append(
+                             {"task_id": task_id, "completion": completion_text}
+                         )
+                         writer.writerow([prompt, completion_text, expected])
+
+                         # Update progress monitor after completing all samples for a question
+                         questions_completed = questions_completed + 1
+                         percent_completed = (
+                             questions_completed / number_of_questions * 100
+                         )
+                         self.set_percent_progress(percent_completed)
+
+                 # pylint: disable=W0718
+                 except Exception as e:
+                     printing.log_info(f"Error processing task {task_id}: {str(e)}")
+                     continue
+
+         # Save predictions and evaluate
+         pred_path = os.path.join(results_dir, "humaneval_predictions.jsonl")
+         write_jsonl(pred_path, samples)
+         printing.log_info(f"Results saved in: {results_dir}")
+
+         # Run functional correctness evaluation
+         k_values = [k_samples]
+         results = evaluate_functional_correctness(
+             pred_path,
+             k_values,
+             n_workers=1,
+             timeout=timeout,
+             problem_file=data_path,
+             ignore_incomplete=ignore_incomplete,
+         )
+         return results
+
+     def _save_metrics(self, state: State, results: Dict[str, float]) -> None:
+         """Save evaluation metrics to state."""
+         for metric, value in results.items():
+             metric_name = f"humaneval_{metric}"
+             state.save_stat(
+                 metric_name, float(value) * 100 if value is not None else None
+             )
+             state.save_stat(f"{metric_name}_units", "%")
+             self.status_stats.append(metric_name)
+
+
+ # This file was originally licensed under Apache 2.0. It has been modified.
+ # Modifications Copyright (c) 2025 AMD
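
For orientation, `AccuracyHumaneval` is registered under the tool name `accuracy-humaneval` and is normally driven by lemonade's tool sequence after a model-loading tool has populated `state.model` and `state.tokenizer`. The sketch below shows only the `run()` call pattern under that assumption; the helper function name is hypothetical and this is not a complete pipeline.

```python
# Hedged sketch: assumes `state` was prepared by an upstream loading tool so that
# state.model, state.tokenizer, state.cache_dir, and state.build_name are all set,
# as run() requires. evaluate_first_n is a hypothetical helper for illustration.
from lemonade.tools.humaneval import AccuracyHumaneval


def evaluate_first_n(state, n_problems: int = 10):
    """Run pass@1 HumanEval on the first n_problems of the 164-problem dataset."""
    tool = AccuracyHumaneval()
    # run() downloads HumanEval.jsonl.gz into <cache_dir>/data/humaneval if needed,
    # generates one completion per prompt, scores it with the human-eval package,
    # and saves humaneval_pass@k stats (as percentages) back onto the state.
    return tool.run(
        state,
        first_n_samples=n_problems,  # cap at the first N problems
        k_samples=1,                 # one completion per prompt -> pass@1
        timeout=30.0,                # per-test-case timeout in seconds
    )
```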