lemonade_sdk-9.1.1-py3-none-any.whl
This diff shows the content of publicly available package versions that have been released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in the public registry.
- lemonade/__init__.py +5 -0
- lemonade/api.py +180 -0
- lemonade/cache.py +92 -0
- lemonade/cli.py +173 -0
- lemonade/common/__init__.py +0 -0
- lemonade/common/build.py +176 -0
- lemonade/common/cli_helpers.py +139 -0
- lemonade/common/exceptions.py +98 -0
- lemonade/common/filesystem.py +368 -0
- lemonade/common/inference_engines.py +408 -0
- lemonade/common/network.py +93 -0
- lemonade/common/printing.py +110 -0
- lemonade/common/status.py +471 -0
- lemonade/common/system_info.py +1411 -0
- lemonade/common/test_helpers.py +28 -0
- lemonade/profilers/__init__.py +1 -0
- lemonade/profilers/agt_power.py +437 -0
- lemonade/profilers/hwinfo_power.py +429 -0
- lemonade/profilers/memory_tracker.py +259 -0
- lemonade/profilers/profiler.py +58 -0
- lemonade/sequence.py +363 -0
- lemonade/state.py +159 -0
- lemonade/tools/__init__.py +1 -0
- lemonade/tools/accuracy.py +432 -0
- lemonade/tools/adapter.py +114 -0
- lemonade/tools/bench.py +302 -0
- lemonade/tools/flm/__init__.py +1 -0
- lemonade/tools/flm/utils.py +305 -0
- lemonade/tools/huggingface/bench.py +187 -0
- lemonade/tools/huggingface/load.py +235 -0
- lemonade/tools/huggingface/utils.py +359 -0
- lemonade/tools/humaneval.py +264 -0
- lemonade/tools/llamacpp/bench.py +255 -0
- lemonade/tools/llamacpp/load.py +222 -0
- lemonade/tools/llamacpp/utils.py +1260 -0
- lemonade/tools/management_tools.py +319 -0
- lemonade/tools/mmlu.py +319 -0
- lemonade/tools/oga/__init__.py +0 -0
- lemonade/tools/oga/bench.py +120 -0
- lemonade/tools/oga/load.py +804 -0
- lemonade/tools/oga/migration.py +403 -0
- lemonade/tools/oga/utils.py +462 -0
- lemonade/tools/perplexity.py +147 -0
- lemonade/tools/prompt.py +263 -0
- lemonade/tools/report/__init__.py +0 -0
- lemonade/tools/report/llm_report.py +203 -0
- lemonade/tools/report/table.py +899 -0
- lemonade/tools/server/__init__.py +0 -0
- lemonade/tools/server/flm.py +133 -0
- lemonade/tools/server/llamacpp.py +320 -0
- lemonade/tools/server/serve.py +2123 -0
- lemonade/tools/server/static/favicon.ico +0 -0
- lemonade/tools/server/static/index.html +279 -0
- lemonade/tools/server/static/js/chat.js +1059 -0
- lemonade/tools/server/static/js/model-settings.js +183 -0
- lemonade/tools/server/static/js/models.js +1395 -0
- lemonade/tools/server/static/js/shared.js +556 -0
- lemonade/tools/server/static/logs.html +191 -0
- lemonade/tools/server/static/styles.css +2654 -0
- lemonade/tools/server/static/webapp.html +321 -0
- lemonade/tools/server/tool_calls.py +153 -0
- lemonade/tools/server/tray.py +664 -0
- lemonade/tools/server/utils/macos_tray.py +226 -0
- lemonade/tools/server/utils/port.py +77 -0
- lemonade/tools/server/utils/thread.py +85 -0
- lemonade/tools/server/utils/windows_tray.py +408 -0
- lemonade/tools/server/webapp.py +34 -0
- lemonade/tools/server/wrapped_server.py +559 -0
- lemonade/tools/tool.py +374 -0
- lemonade/version.py +1 -0
- lemonade_install/__init__.py +1 -0
- lemonade_install/install.py +239 -0
- lemonade_sdk-9.1.1.dist-info/METADATA +276 -0
- lemonade_sdk-9.1.1.dist-info/RECORD +84 -0
- lemonade_sdk-9.1.1.dist-info/WHEEL +5 -0
- lemonade_sdk-9.1.1.dist-info/entry_points.txt +5 -0
- lemonade_sdk-9.1.1.dist-info/licenses/LICENSE +201 -0
- lemonade_sdk-9.1.1.dist-info/licenses/NOTICE.md +47 -0
- lemonade_sdk-9.1.1.dist-info/top_level.txt +3 -0
- lemonade_server/cli.py +805 -0
- lemonade_server/model_manager.py +758 -0
- lemonade_server/pydantic_models.py +159 -0
- lemonade_server/server_models.json +643 -0
- lemonade_server/settings.py +39 -0

@@ -0,0 +1,359 @@
+from typing import Dict, List, Tuple
+import time
+from contextlib import nullcontext
+import transformers
+import torch
+from lemonade.state import State
+from lemonade.tools.adapter import TokenizerAdapter
+from lemonade.tools.adapter import ModelAdapter
+from lemonade.tools.bench import Bench
+
+# Command line interfaces for tools will use string inputs for data
+# types, however the internal tool logic will need to know the actual
+# torch type
+str_to_dtype = {
+    "float32": torch.float32,
+    "float16": torch.float16,
+    "bfloat16": torch.bfloat16,
+    "int8_static": torch.int8,
+    "int8_dynamic": torch.int8,
+}
+
+
+def make_example_inputs(state: State) -> Dict:
+    """
+    Create a dictionary of LLM inputs that can be passed as an argument
+    into quantization, ONNX export, etc.
+    """
+
+    tokenizer = state.tokenizer
+    inputs_ids = tokenizer("Hello there", return_tensors="pt").input_ids
+    return {"input_ids": inputs_ids}
+
+
+class HuggingfaceTokenizerAdapter(TokenizerAdapter):
+    def __init__(self, tokenizer: transformers.AutoTokenizer, device: str):
+        super().__init__(tokenizer)
+        self.tokenizer = tokenizer
+        self.device = device
+
+    def __call__(self, prompt, **kwargs):
+        tokens = self.tokenizer(prompt, **kwargs)
+        if self.device:
+            return tokens.to(self.device)
+        else:
+            return tokens
+
+    def decode(self, response, **kwargs):
+        return self.tokenizer.decode(response, **kwargs)
+
+    def batch_decode(self, tokens, **kwargs):
+        return self.tokenizer.batch_decode(tokens, **kwargs)
+
+    @property
+    def eos_token_id(self):
+        return self.tokenizer.eos_token_id
+
+    def save_pretrained(self, model_dir, **kwargs):
+        return self.tokenizer.save_pretrained(model_dir, **kwargs)
+
+
+class HuggingfaceAdapter(ModelAdapter):
+    """
+    Wrapper class for Huggingface LLMs that handle generation arguments
+    from callers to match HF specification.
+
+    repetition_penalty: helps the LLM avoid repeating the same short
+        phrase in the response over and over.
+    temperature: helps the LLM stay focused on the prompt.
+    do_sample: apply the temperature.
+    """
+
+    def __init__(self, model, dtype=torch.float32, device="cpu", tokenizer=None):
+        super().__init__()
+        self.model = model
+        self.dtype = dtype
+        self.device = device
+        self.tokenizer = tokenizer
+
+    def generate(
+        self,
+        input_ids,
+        random_seed=1,
+        **kwargs,
+    ):
+
+        # Move input_ids to the same device as the model
+        input_ids = input_ids.to(self.device)
+
+        # Fix temperature handling to avoid errors:
+        # If temperature is 0.0, force do_sample=False (greedy decoding)
+        if kwargs.get("temperature") == 0.0:
+            kwargs["do_sample"] = False
+
+        # If do_sample is False and temperature is 0.0, remove temperature
+        # to avoid the warning from HuggingFace.
+        # Note: This is the same approach taken by LM Eval Harness for handling temperature.
+        generation_kwargs = {
+            "max_new_tokens": kwargs.get("max_new_tokens", 512),
+            "do_sample": kwargs.get("do_sample", True),
+            **kwargs,
+        }
+
+        if random_seed is None:
+            torch.random.seed()
+        else:
+            torch.random.manual_seed(random_seed)
+
+        with torch.no_grad(), torch.inference_mode():
+            outputs = self.model.generate(input_ids=input_ids, **generation_kwargs)
+
+        self.prompt_tokens = input_ids.shape[1]
+        self.response_tokens = len(outputs[0]) - self.prompt_tokens
+        return outputs
+
+    def _model_call(self, input_tensor):
+        """Forward pass through the model to get logits
+
+        This method directly calls the model forward pass rather than using model.generate() for
+        several important reasons:
+        1. Purpose: We need raw logits from a single forward pass, while generate() is for producing
+           multiple tokens through iterative inference
+        2. Efficiency: Direct calls are more efficient for logprob calculations with no sampling
+           overhead
+        3. Precision: Logprob calculations require exact control over input-to-output mapping
+        4. Consistency: Similar approach used in both HF and OGA implementations
+
+        Args:
+            input_tensor: Input token IDs tensor
+
+        Returns:
+            Logits tensor from model forward pass
+        """
+        with torch.no_grad(), torch.inference_mode():
+            outputs = self.model(input_tensor)
+            return outputs.logits
+
+    def _select_cont_toks(self, logits, context_len, cont_toks):
+        """
+        Select logits corresponding to continuation tokens and gather their probabilities
+
+        Args:
+            logits: Model output logits
+            context_len: Length of input context
+            cont_toks: List of continuation token IDs
+
+        Returns:
+            Tensor of log probabilities for continuation tokens
+        """
+        # Get the continuation logits (discard context logits)
+        cont_logits = logits[context_len - 1 : context_len - 1 + len(cont_toks)]
+
+        # Convert cont_toks to tensor if needed
+        if not isinstance(cont_toks, torch.Tensor):
+            cont_toks = torch.tensor(cont_toks, dtype=torch.long, device=logits.device)
+
+        # Gather log probs at the corresponding token indices
+        log_probs = torch.log_softmax(cont_logits, dim=-1)
+        token_log_probs = torch.gather(log_probs, 1, cont_toks.unsqueeze(-1)).squeeze(
+            -1
+        )
+
+        return token_log_probs
+
+    def compute_logprobs(
+        self, text, tokenizer, prompt_length=None, logprobs=None, echo=False
+    ):
+        """
+        Compute log probabilities for all tokens in the given text.
+
+        Args:
+            text: The full text to analyze (e.g., prompt + completion)
+            prompt_length: Number of tokens in the prompt. If provided and echo=False,
+                only completion tokens after this position will be returned.
+            logprobs: If not None, return log probabilities. Value indicates how many top
+                alternatives to return. If True but not an integer, defaults to 5 alternatives.
+            echo: If True, include logprobs for prompt tokens. If False, only return logprobs
+                for completion tokens.
+
+        Returns:
+            - text_offset: Character offsets for each token in the text
+            - token_logprobs: Log probability for each token
+            - tokens: The actual tokens used
+            - top_logprobs: Top alternative log probabilities for each position
+        """
+        if tokenizer is None:
+            raise ValueError("Tokenizer is required for logprob calculation")
+
+        # Encode the full text
+        tokens = tokenizer(text).input_ids
+
+        # Track character offsets for each token
+        text_offset = []
+        start_idx = 0
+
+        token_strings = []
+        for token_id in tokens:
+            token_str = tokenizer.decode([token_id])
+            token_strings.append(token_str)
+
+            # Calculate character offsets for tokens - handles cases where tokens
+            # may not directly match in the original text due to encoding differences,
+            # special characters, or tokenization artifacts
+            try:
+                pos = text[start_idx:].find(token_str)
+                if pos != -1:
+                    text_offset.append(start_idx + pos)
+                    start_idx += pos + len(token_str)
+                else:
+                    text_offset.append(start_idx)
+            except (TypeError, ValueError, UnicodeError):
+                # Fallback to current position when matching fails due to encoding issues
+                text_offset.append(start_idx)
+
+        # Convert to tensor and get model output
+        input_tensor = torch.tensor([tokens], dtype=torch.long, device=self.device)
+        logits = self._model_call(input_tensor)[0]
+
+        # Calculate log probabilities for each token
+        all_log_probs = torch.log_softmax(logits, dim=-1)
+
+        # The first token doesn't have a conditional probability
+        # For tokens after the first, get the predicted probability
+        token_log_probs = []
+        top_logprobs_list = []
+
+        # For each position, get the actual token probability and top alternatives
+        for i in range(len(tokens)):
+            # Get previous token position logits
+            if i > 0:  # First token has no preceding context
+                prev_logits = all_log_probs[i - 1]
+                curr_token_id = tokens[i]
+                # Get probability of the actual token that appeared
+                token_logprob = prev_logits[curr_token_id].item()
+                token_log_probs.append(token_logprob)
+
+                # Get top-k alternatives if requested
+                if logprobs is not None:
+                    num_alternatives = logprobs if isinstance(logprobs, int) else 5
+                    topk_values, topk_indices = torch.topk(
+                        prev_logits, min(num_alternatives, prev_logits.size(-1))
+                    )
+
+                    # Create dictionary of token: logprob
+                    position_logprobs = {}
+                    for val, idx in zip(topk_values.tolist(), topk_indices.tolist()):
+                        token_str = tokenizer.decode([idx])
+                        position_logprobs[token_str] = val
+
+                    top_logprobs_list.append(position_logprobs)
+            else:
+                # For the first token, we don't have a conditional probability
+                token_log_probs.append(None)
+                top_logprobs_list.append({})
+
+        # If we don't want to echo prompt tokens, filter them out
+        if not echo and prompt_length is not None:
+            # Ensure prompt_length is within bounds
+            prompt_length = min(prompt_length, len(tokens))
+
+            # Filter results to only include completion tokens
+            if prompt_length < len(tokens):
+                filtered_text_offset = text_offset[prompt_length:]
+                filtered_token_logprobs = token_log_probs[prompt_length:]
+                filtered_tokens = token_strings[prompt_length:]
+                filtered_top_logprobs = top_logprobs_list[prompt_length:]
+
+                return (
+                    filtered_text_offset,
+                    filtered_token_logprobs,
+                    filtered_tokens,
+                    filtered_top_logprobs,
+                )
+            else:
+                # No completion tokens
+                return [], [], [], []
+
+        return text_offset, token_log_probs, token_strings, top_logprobs_list
+
+
+def benchmark_huggingface_llm(
+    model: torch.nn.Module,
+    tokenizer,
+    input_ids,
+    dtype,
+    num_beams: int,
+    target_output_tokens: int,
+    iterations: int,
+    warmup_iterations: int,
+    report_progress_fn,
+) -> List[Tuple[float, int]]:
+
+    amp_enabled = True if (dtype == torch.float16 or dtype == torch.bfloat16) else False
+    # The "if amp_enabled else nullcontext()" is to get around a bug in PyTorch 2.1
+    # where torch.cpu.amp.autocast(enabled=False) does nothing
+    with (
+        torch.cpu.amp.autocast(enabled=amp_enabled, dtype=dtype)
+        if amp_enabled
+        else nullcontext()
+    ):
+
+        per_iteration_result = []
+        tokens_out_len_list = []
+
+        # Early stopping is only a valid parameter with multiple beams
+        early_stopping = num_beams > 1
+
+        with torch.no_grad(), torch.inference_mode():
+            # Don't capture time for warmup
+            for count in range(warmup_iterations):
+                outputs = model.generate(
+                    input_ids,
+                    num_beams=num_beams,
+                    max_new_tokens=target_output_tokens,
+                    min_new_tokens=target_output_tokens,
+                    early_stopping=early_stopping,
+                    pad_token_id=tokenizer.eos_token_id,
+                )
+                tokens_out_len_list.append(outputs.shape[1] - input_ids.shape[1])
+                report_progress_fn((count + 1) / (warmup_iterations + iterations))
+
+            for count in range(iterations):
+                # CUDA synchronization is required prior to GPU benchmarking
+                # This has no negative effect on CPU-only benchmarks, and is more robust than
+                # checking `model.device == "cuda"` since it applies to multi-GPU environments
+                # Synchronization is done before collecting the start time because this will
+                # ensure that the GPU has finished initialization tasks such as loading weights
+                if torch.cuda.is_available():
+                    torch.cuda.synchronize()
+                start_time = time.perf_counter()
+
+                outputs = model.generate(
+                    input_ids,
+                    num_beams=num_beams,
+                    max_new_tokens=target_output_tokens,
+                    min_new_tokens=target_output_tokens,
+                    early_stopping=early_stopping,
+                    pad_token_id=tokenizer.eos_token_id,
+                )
+
+                if torch.cuda.is_available():
+                    torch.cuda.synchronize()
+                end_time = time.perf_counter()
+
+                latency = end_time - start_time
+
+                tokens_out_len_list.append(model.response_tokens)
+
+                # Only count an iteration if it produced enough tokens
+                if model.response_tokens >= target_output_tokens:
+                    per_iteration_result.append((latency, model.response_tokens))
+
+                report_progress_fn(
+                    (warmup_iterations + count + 1) / (warmup_iterations + iterations)
+                )
+
+        if not per_iteration_result:
+            raise Bench.not_enough_tokens(target_output_tokens)
+
+        return per_iteration_result, tokens_out_len_list
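
Usage sketch (not part of the diff): the 359-line hunk above matches the +359 entry for lemonade/tools/huggingface/utils.py in the manifest, so the import path below assumes that file. The checkpoint name is illustrative, and the State/Tool plumbing that normally drives these classes in lemonade is bypassed. The sketch shows how HuggingfaceAdapter and HuggingfaceTokenizerAdapter wrap a Hugging Face causal LM for greedy generation, per-token logprobs, and a latency benchmark; treat it as a minimal sketch under those assumptions, not as the package's documented API.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Assumed import path, based on the manifest entry for huggingface/utils.py
from lemonade.tools.huggingface.utils import (
    HuggingfaceAdapter,
    HuggingfaceTokenizerAdapter,
    benchmark_huggingface_llm,
)

checkpoint = "facebook/opt-125m"  # illustrative; any causal LM should work
hf_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
hf_model = AutoModelForCausalLM.from_pretrained(checkpoint)

adapter = HuggingfaceAdapter(hf_model, dtype=torch.float32, device="cpu", tokenizer=hf_tokenizer)
tokenizer = HuggingfaceTokenizerAdapter(hf_tokenizer, device="cpu")

# Greedy generation; prompt_tokens/response_tokens are populated by the adapter
input_ids = tokenizer("Write a haiku about lemons.", return_tensors="pt").input_ids
outputs = adapter.generate(input_ids, do_sample=False, max_new_tokens=32)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
print(adapter.prompt_tokens, adapter.response_tokens)

# Per-token log probabilities with the top 5 alternatives at each position
offsets, token_logprobs, token_strs, top_logprobs = adapter.compute_logprobs(
    "def add(a, b):\n    return a + b", hf_tokenizer, logprobs=5, echo=True
)

# Latency benchmark; the adapter is passed (rather than the raw model) so that
# model.response_tokens exists inside benchmark_huggingface_llm
results, token_counts = benchmark_huggingface_llm(
    model=adapter,
    tokenizer=hf_tokenizer,
    input_ids=input_ids,
    dtype=torch.float32,
    num_beams=1,
    target_output_tokens=64,
    iterations=3,
    warmup_iterations=1,
    report_progress_fn=lambda fraction: None,
)
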
@@ -0,0 +1,264 @@
+import argparse
+import os
+import csv
+from typing import Dict, Optional, Any
+
+
+from lemonade.state import State
+from lemonade.tools import Tool
+import lemonade.common.printing as printing
+import lemonade.common.build as build
+
+
+class AccuracyHumaneval(Tool):
+    """
+    HumanEval accuracy measurement tool.
+
+    This tool evaluates language models on the HumanEval dataset, which consists of
+    Python programming problems. It measures the model's ability to:
+    1. Generate functionally correct code completions
+    2. Pass unit tests for each programming problem
+
+    Metrics:
+    - pass@1: Percentage of problems solved with 1 generation attempt
+    - pass@10: Percentage of problems solved within 10 generation attempts
+    - pass@100: Percentage of problems solved within 100 generation attempts
+
+    See docs/dev_cli/humaneval_accuracy.md for more details
+    """
+
+    unique_name = "accuracy-humaneval"
+    DATASET = "https://github.com/openai/human-eval/blob/master/data/HumanEval.jsonl.gz?raw=true"
+    TOTAL_PROBLEMS = 164  # Total number of problems in the HumanEval dataset
+
+    def __init__(self):
+        super().__init__(monitor_message="Measuring accuracy with HumanEval")
+        self.status_stats = []
+        # Enable code evaluation for HumanEval
+        os.environ["HF_ALLOW_CODE_EVAL"] = "1"
+
+    @staticmethod
+    def parser(add_help: bool = True) -> argparse.ArgumentParser:
+        parser = __class__.helpful_parser(
+            short_description="Measure coding accuracy with HumanEval",
+            add_help=add_help,
+        )
+        parser.add_argument(
+            "--k-samples",
+            type=int,
+            default=1,
+            help="Number of completions to generate per prompt for pass@k calculation"
+            " (default: %(default)s)",
+        )
+        parser.add_argument(
+            "--first-n-samples",
+            type=int,
+            default=AccuracyHumaneval.TOTAL_PROBLEMS,
+            help=f"Evaluate only the first N problems from the dataset (default: "
+            f"%(default)s, evaluates all {AccuracyHumaneval.TOTAL_PROBLEMS} problems)",
+        )
+        parser.add_argument(
+            "--timeout",
+            type=float,
+            default=30.0,
+            help="Timeout in seconds for each test case (default: %(default)s)",
+        )
+        parser.add_argument(
+            "--data-dir",
+            type=str,
+            default=None,
+            help="Custom directory for dataset storage (default: %(default)s, "
+            "uses <lemonade_cache_dir>/data/humaneval)",
+        )
+        return parser
+
+    def run(
+        self,
+        state: State,
+        data_dir: Optional[str] = None,
+        k_samples: int = 1,
+        first_n_samples: Optional[int] = TOTAL_PROBLEMS,
+        timeout: float = 30.0,
+    ) -> State:
+        """
+        Run HumanEval evaluation on the model.
+
+        Args:
+            state: Current state containing model and tokenizer
+            data_dir: Optional custom directory for dataset storage
+            k_samples: Number of completions to generate per prompt for pass@k calculation
+            first_n_samples: Number of first N problems to evaluate
+            timeout: Timeout in seconds for each test case
+
+        Returns:
+            Updated state with evaluation results
+        """
+
+        # Validate required state components
+        if not hasattr(state, "model") or not hasattr(state, "tokenizer"):
+            raise ValueError("State must contain both 'model' and 'tokenizer'")
+
+        # Setup directories
+        data_dir_to_use = data_dir or os.path.join(state.cache_dir, "data", "humaneval")
+        data_path = os.path.join(data_dir_to_use, "HumanEval.jsonl.gz")
+        model_results_dir = os.path.join(
+            build.output_dir(state.cache_dir, state.build_name), "humaneval"
+        )
+        os.makedirs(model_results_dir, exist_ok=True)
+
+        # Download dataset if needed
+        self._download_dataset(data_path)
+
+        # Run evaluation
+        results = self._evaluate_model(
+            state.model,
+            state.tokenizer,
+            data_path,
+            k_samples,
+            timeout,
+            model_results_dir,
+            first_n_samples,
+        )
+
+        # Save metrics
+        self._save_metrics(state, results)
+
+        return state
+
+    def _download_dataset(self, output_path: str) -> None:
+        """Download HumanEval dataset if not already present."""
+
+        import requests
+
+        if os.path.exists(output_path):
+            printing.log_info(f"Dataset already exists at: {output_path}")
+            return
+
+        os.makedirs(os.path.dirname(output_path), exist_ok=True)
+        response = requests.get(self.DATASET, stream=True)
+
+        if response.status_code == 200:
+            with open(output_path, "wb") as file:
+                for chunk in response.iter_content(chunk_size=8192):
+                    file.write(chunk)
+            printing.log_info(f"Dataset downloaded successfully to: {output_path}")
+        else:
+            raise RuntimeError(
+                f"Failed to download dataset. Status code: {response.status_code}"
+            )
+
+    def _evaluate_model(
+        self,
+        model: Any,
+        tokenizer: Any,
+        data_path: str,
+        k_samples: int,
+        timeout: float,
+        results_dir: str,
+        first_n_samples: Optional[int] = TOTAL_PROBLEMS,
+    ) -> Dict[str, float]:
+        """
+        Evaluate model on HumanEval dataset.
+
+        Args:
+            model: The language model to evaluate
+            tokenizer: The tokenizer for the model
+            data_path: Path to the HumanEval dataset
+            k_samples: Number of completions per prompt for pass@k calculation
+            timeout: Test case timeout in seconds
+            results_dir: Directory to save results
+            first_n_samples: Number of first N problems to evaluate
+
+        Returns:
+            Dictionary containing evaluation metrics
+        """
+
+        from human_eval.data import write_jsonl, read_problems
+        from human_eval.evaluation import evaluate_functional_correctness
+
+        dataset = read_problems(data_path)
+
+        # Limit to first N problems
+        dataset_keys = list(dataset.keys())[:first_n_samples]
+        ignore_incomplete = True
+
+        samples = []
+
+        # Update Tool progress monitor
+        self.set_percent_progress(0.0)
+        questions_completed = 0
+        number_of_questions = first_n_samples * k_samples
+
+        # Save completions and expected answers
+        csv_path = os.path.join(results_dir, "evaluation_results.csv")
+        with open(
+            csv_path, mode="w", newline="", encoding="utf-8", errors="replace"
+        ) as file:
+            writer = csv.writer(file)
+            writer.writerow(["Prompt", "Completion", "Expected Answer"])
+
+            for task_id in dataset_keys:
+                try:
+                    for _ in range(k_samples):
+                        prompt = dataset[task_id]["prompt"]
+                        expected = dataset[task_id]["canonical_solution"]
+
+                        # Generate completion
+                        input_ids = tokenizer(prompt, return_tensors="pt").input_ids
+                        completion = model.generate(
+                            input_ids,
+                            max_new_tokens=512,
+                            do_sample=False,
+                        )
+                        completion_text = tokenizer.decode(
+                            completion[0], skip_special_tokens=True
+                        )
+
+                        # Save results
+                        samples.append(
+                            {"task_id": task_id, "completion": completion_text}
+                        )
+                        writer.writerow([prompt, completion_text, expected])
+
+                    # Update progress monitor after completing all samples for a question
+                    questions_completed = questions_completed + 1
+                    percent_completed = (
+                        questions_completed / number_of_questions * 100
+                    )
+                    self.set_percent_progress(percent_completed)
+
+                # pylint: disable=W0718
+                except Exception as e:
+                    printing.log_info(f"Error processing task {task_id}: {str(e)}")
+                    continue
+
+        # Save predictions and evaluate
+        pred_path = os.path.join(results_dir, "humaneval_predictions.jsonl")
+        write_jsonl(pred_path, samples)
+        printing.log_info(f"Results saved in: {results_dir}")
+
+        # Run functional correctness evaluation
+        k_values = [k_samples]
+        results = evaluate_functional_correctness(
+            pred_path,
+            k_values,
+            n_workers=1,
+            timeout=timeout,
+            problem_file=data_path,
+            ignore_incomplete=ignore_incomplete,
+        )
+        return results
+
+    def _save_metrics(self, state: State, results: Dict[str, float]) -> None:
+        """Save evaluation metrics to state."""
+        for metric, value in results.items():
+            metric_name = f"humaneval_{metric}"
+            state.save_stat(
+                metric_name, float(value) * 100 if value is not None else None
+            )
+            state.save_stat(f"{metric_name}_units", "%")
+            self.status_stats.append(metric_name)
+
+
+# This file was originally licensed under Apache 2.0. It has been modified.
+# Modifications Copyright (c) 2025 AMD