nvidia-livecodebench 25.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- core_evals/livecodebench/__init__.py +1 -0
- core_evals/livecodebench/framework.yml +233 -0
- core_evals/livecodebench/framework_entrypoint.py +28 -0
- core_evals/livecodebench/output.py +51 -0
- livecodebench/__init__.py +0 -0
- livecodebench/benchmarks/__init__.py +31 -0
- livecodebench/benchmarks/code_execution.py +85 -0
- livecodebench/benchmarks/code_generation.py +160 -0
- livecodebench/benchmarks/test_output_prediction.py +90 -0
- livecodebench/benchmarks/utils.py +50 -0
- livecodebench/evaluation/__init__.py +24 -0
- livecodebench/evaluation/compute_code_execution_metrics.py +73 -0
- livecodebench/evaluation/compute_code_generation_metrics.py +278 -0
- livecodebench/evaluation/compute_scores.py +172 -0
- livecodebench/evaluation/compute_test_output_prediction_metrics.py +125 -0
- livecodebench/evaluation/metric.py +28 -0
- livecodebench/evaluation/old_results_check.py +91 -0
- livecodebench/evaluation/pass_k_utils.py +84 -0
- livecodebench/evaluation/testing_util.py +574 -0
- livecodebench/evaluation/utils_execute.py +285 -0
- livecodebench/lm_styles.py +581 -0
- livecodebench/prompts/__init__.py +22 -0
- livecodebench/prompts/code_execution.py +201 -0
- livecodebench/prompts/code_generation.py +372 -0
- livecodebench/prompts/few_shot_examples/generation/func.json +12 -0
- livecodebench/prompts/few_shot_examples/generation/stdin.json +10 -0
- livecodebench/prompts/self_repair.py +370 -0
- livecodebench/prompts/test_output_prediction.py +327 -0
- livecodebench/runner/__init__.py +0 -0
- livecodebench/runner/base_runner.py +188 -0
- livecodebench/runner/claude3_runner.py +70 -0
- livecodebench/runner/claude_runner.py +69 -0
- livecodebench/runner/cohere_runner.py +71 -0
- livecodebench/runner/custom_evaluator.py +132 -0
- livecodebench/runner/deepseek_runner.py +87 -0
- livecodebench/runner/gemini_runner.py +111 -0
- livecodebench/runner/generic_oai_server_runner.py +104 -0
- livecodebench/runner/main.py +255 -0
- livecodebench/runner/mistral_runner.py +71 -0
- livecodebench/runner/oai_runner.py +93 -0
- livecodebench/runner/parser.py +174 -0
- livecodebench/runner/runner_utils.py +62 -0
- livecodebench/runner/scenario_router.py +239 -0
- livecodebench/runner/vllm_runner.py +82 -0
- livecodebench/utils/__init__.py +0 -0
- livecodebench/utils/extraction_utils.py +82 -0
- livecodebench/utils/multiprocess.py +250 -0
- livecodebench/utils/path_utils.py +58 -0
- livecodebench/utils/scenarios.py +26 -0
- livecodebench/utils/seed_generator.py +44 -0
- nvidia_livecodebench-25.8.dist-info/METADATA +518 -0
- nvidia_livecodebench-25.8.dist-info/RECORD +55 -0
- nvidia_livecodebench-25.8.dist-info/WHEEL +4 -0
- nvidia_livecodebench-25.8.dist-info/entry_points.txt +4 -0
- nvidia_livecodebench-25.8.dist-info/licenses/LICENSE +21 -0
livecodebench/prompts/test_output_prediction.py
@@ -0,0 +1,327 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Original Copyright 2025 LiveCodeBench
# For the original license and copyright information, see the LICENSE file in this repository.

from anthropic import HUMAN_PROMPT, AI_PROMPT

from livecodebench.lm_styles import LMStyle
from livecodebench.benchmarks import TestOutputPredictionProblem


class PromptConstants:
    SYSTEM_MESSAGE_CHAT_GENERIC = f"You are a helpful programming assistant and an expert Python programmer.\
You are helping a user to write a test case to help to check the correctness of the function.\
The user has written a input for the testcase.\
You will calculate the output of the testcase and\
write the whole assertion statement in the markdown code block with the correct output."

    SYSTEM_MESSAGE_COMPLETION_GENERIC = f"You are a helpful programming assistant and an expert Python programmer.\
You are helping a user to write a test case to help to check the correctness of the function."

    SYSTEM_MESSAGE_INST_CLLAMA = f"You are a helpful programming assistant and an expert Python programmer.\
You are helping a user to write a test case to help to check the correctness of the function.\
The user has written a input for the testcase.\
You will calculate the output of the testcase and \
write out the complete assertion statement between [PYTHON] and [/PYTHON] tags."

    SYSTEM_MESSAGE_WIZARD = "Below is an instruction that describes a task. Write a response that appropriately completes the request."

    SYSTEM_MESSAGE_PHIND = f"""You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program. You must put the entired fixed program within code delimiters only for once., for example:
```python
# YOUR CODE HERE
```"""

    FORMATTING_MESSAGE = "You will use the following starter code to write the solution to the problem and enclose your code within delimiters."

    FORMATTING_WITHOUT_STARTER_MESSAGE = "Read the inputs from stdin solve the problem and write the answer to stdout (do not directly test on the sample inputs). Enclose your code within delimiters as follows."


def truncate_io(io):
    if len(str(io)) > 1000:
        io = str(io)[:1000] + "...."
        print(io)
    return io


def format_testcase_func_name_input(function_name, testcase):
    """
    use the form of "assert func_name(input) == "
    """
    # TODO should there be a space after the == ?
    input_str = ", ".join(testcase.split("\n"))
    return f"assert {function_name}({input_str}) == # TODO"


def parse_function_name_from_starter_code(starter_code):
    """
    starter_code : str
    """
    import ast

    tree = ast.parse(starter_code)
    fn = None
    for node in ast.walk(tree):
        if isinstance(node, ast.FunctionDef):
            assert fn is None
            fn = node.name
    return fn


def get_generic_question_template_test_completion(
    question: TestOutputPredictionProblem, testcase_input: str
):
    prompt = f"Problem:\n{question.question_content}"
    prompt += f"Function:\n```\n{question.starter_code}\n```\n"

    # parse function name from starter_code
    func_name = parse_function_name_from_starter_code(question.starter_code)
    prompt += "Please complete the following test case:\n\n"
    prompt += (
        f"```\n{format_testcase_func_name_input(func_name, testcase_input)}\n```\n"
    )

    return prompt


def get_cllama_question_template_answer(
    question: TestOutputPredictionProblem, testcase_input: str
):
    prompt = f"### Question\n"
    prompt += get_generic_question_template_test_completion(question, testcase_input)
    prompt += f"### Answer\n"
    return prompt


def get_deepseekcode_question_template_answer(
    question: TestOutputPredictionProblem, testcase_input: str
):
    prompt = f"### Instruction: {PromptConstants.SYSTEM_MESSAGE_CHAT_GENERIC}\n\n"
    prompt += get_generic_question_template_test_completion(question, testcase_input)
    prompt += f"### Response:\n\n"
    return prompt


def get_magicoder_question_template_answer(
    question: TestOutputPredictionProblem, testcase_input: str
):
    # prompt = f"You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.\n\n"
    prompt = f"Question:\n"
    prompt += get_generic_question_template_test_completion(question, testcase_input)
    prompt += f"@@ Response \n"
    return prompt


def get_mixtral_question_template_answer(
    question: TestOutputPredictionProblem, testcase_input: str
):
    prompt = get_generic_question_template_test_completion(question, testcase_input)
    return prompt


def get_wizard_question_template_answer(
    question: TestOutputPredictionProblem, testcase_input: str
):
    prompt = f"""### Instruction: {PromptConstants.SYSTEM_MESSAGE_CHAT_GENERIC}\n"""
    prompt += get_generic_question_template_test_completion(question, testcase_input)
    prompt += f"### Response:\n"
    return prompt


def get_phind_question_template_answer(
    question: TestOutputPredictionProblem, testcase_input: str
):
    prompt = get_generic_question_template_test_completion(question, testcase_input)
    prompt += f"\n\n### Assistant"
    return prompt


def get_qwen_question_template_answer(question: TestOutputPredictionProblem, testcase_input: str):
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained(
        "abacusai/Dracarys-72B-Instruct", padding_side="left", use_fast=False
    )

    prompt = f"""### Instruction: {PromptConstants.SYSTEM_MESSAGE_CHAT_GENERIC}\n"""
    prompt += get_generic_question_template_test_completion(question, testcase_input)
    prompt += f"### Response:\n"

    messages = [
        {"role": "user", "content": prompt},
    ]

    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        truncation=False,
        padding=False,
    )
    return prompt


def format_prompt_test_output(
    question: TestOutputPredictionProblem, LanguageModelStyle: LMStyle
) -> str:
    testcase_input = question.test[0].input
    if LanguageModelStyle in (LMStyle.OpenAIChat, LMStyle.GenericOAIServer):
        chat_messages = [
            {
                "role": "system",
                "content": PromptConstants.SYSTEM_MESSAGE_CHAT_GENERIC,
            },
        ]
        chat_messages += [
            {
                "role": "user",
                "content": get_generic_question_template_test_completion(
                    question, testcase_input
                ),
            },
        ]
        return chat_messages
    if LanguageModelStyle == LMStyle.LLaMa3:
        chat_messages = [
            {
                "role": "system",
                "content": PromptConstants.SYSTEM_MESSAGE_CHAT_GENERIC,
            },
        ]
        chat_messages += [
            {
                "role": "user",
                "content": get_generic_question_template_test_completion(
                    question, testcase_input
                ),
            },
        ]
        from transformers import AutoTokenizer

        tokenizer = AutoTokenizer.from_pretrained(
            "meta-llama/Meta-Llama-3-8B-Instruct", padding_side="left", use_fast=False
        )
        return tokenizer.apply_chat_template(
            chat_messages,
            tokenize=False,
            add_generation_prompt=True,
            truncation=False,
            padding=False,
        )
    elif LanguageModelStyle == LMStyle.Claude:
        prompt = f"{HUMAN_PROMPT}\n{PromptConstants.SYSTEM_MESSAGE_CHAT_GENERIC}\n\n"
        prompt += f"{get_generic_question_template_test_completion(question, testcase_input).rstrip()}\n{AI_PROMPT}"
        return prompt
    elif LanguageModelStyle == LMStyle.Claude3:
        system = PromptConstants.SYSTEM_MESSAGE_CHAT_GENERIC
        prompt = [
            {
                "role": "user",
                "content": get_generic_question_template_test_completion(
                    question, testcase_input
                ).rstrip(),
            }
        ]
        return system, prompt
    elif LanguageModelStyle == LMStyle.Gemini:
        prompt = f"{PromptConstants.SYSTEM_MESSAGE_CHAT_GENERIC}\n"
        prompt += (
            f"{get_generic_question_template_test_completion(question, testcase_input)}"
        )
        return prompt

    elif LanguageModelStyle == LMStyle.StarCoderInstruct:
        prompt = f"{PromptConstants.SYSTEM_MESSAGE_CHAT_GENERIC}\n"
        prompt += (
            f"{get_generic_question_template_test_completion(question, testcase_input)}"
        )
        return prompt

    elif LanguageModelStyle == LMStyle.DeepSeekCodeInstruct:
        prompt = (
            f"{get_deepseekcode_question_template_answer(question, testcase_input)}"
        )
        return prompt
    elif LanguageModelStyle == LMStyle.CodeLLaMaInstruct:
        prompt = f"[INST] <<SYS>>\n{PromptConstants.SYSTEM_MESSAGE_INST_CLLAMA}\n<</SYS>>\n\n"
        prompt += (
            f"{get_cllama_question_template_answer(question, testcase_input)}\n[/INST]"
        )
        return prompt
    elif LanguageModelStyle == LMStyle.MagiCoder:
        prompt = f"{PromptConstants.SYSTEM_MESSAGE_CHAT_GENERIC}\n"
        prompt += f"{get_magicoder_question_template_answer(question, testcase_input)}"
        return prompt
    elif LanguageModelStyle == LMStyle.WizardCoder:
        prompt = f"{PromptConstants.SYSTEM_MESSAGE_WIZARD}\n\n{get_wizard_question_template_answer(question, testcase_input)}"
        return prompt
    elif LanguageModelStyle == LMStyle.Phind:
        prompt = f"### System Prompt\n\n{PromptConstants.SYSTEM_MESSAGE_CHAT_GENERIC}\n\n### User Message\n\n{get_phind_question_template_answer(question, testcase_input)}"
        return prompt
    elif LanguageModelStyle == LMStyle.OC:
        prompt = f"{PromptConstants.SYSTEM_MESSAGE_CHAT_GENERIC}\n"
        prompt += (
            f"{get_generic_question_template_test_completion(question, testcase_input)}"
        )
        return prompt
    elif LanguageModelStyle == LMStyle.MistralWeb:
        chat_messages = [
            {
                "role": "system",
                "content": PromptConstants.SYSTEM_MESSAGE_CHAT_GENERIC,
            },
            {
                "role": "user",
                "content": get_generic_question_template_test_completion(
                    question, testcase_input
                ),
            },
        ]
        return chat_messages
    elif (
        LanguageModelStyle == LMStyle.DracarysQwen
    ):
        prompt = f"{get_qwen_question_template_answer(question, testcase_input)}"
        return prompt
    elif LanguageModelStyle == LMStyle.DracarysLlama:
        chat_messages = [
            {
                "role": "system",
                "content": PromptConstants.SYSTEM_MESSAGE_CHAT_GENERIC,
            },
        ]
        chat_messages += [
            {
                "role": "user",
                "content": get_generic_question_template_test_completion(
                    question, testcase_input
                ),
            },
        ]
        from transformers import AutoTokenizer

        tokenizer = AutoTokenizer.from_pretrained(
            "abacusai/Dracarys-Llama-3.1-70B-Instruct", padding_side="right", use_fast=False
        )
        return tokenizer.apply_chat_template(
            chat_messages,
            tokenize=False,
            add_generation_prompt=True,
            truncation=False,
            padding=False,
        )
    else:
        raise NotImplementedError(
            f"LanguageModelStyle {LanguageModelStyle} not implemented"
        )
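
For reference, a minimal usage sketch (not part of the packaged diff; the `problem` object is assumed to be a TestOutputPredictionProblem produced by the livecodebench.benchmarks loaders) of how the prompt formatter above might be driven for an OpenAI-style chat model:

# Illustrative sketch only -- `problem` comes from livecodebench.benchmarks (not shown here).
from livecodebench.lm_styles import LMStyle
from livecodebench.prompts.test_output_prediction import format_prompt_test_output

def build_openai_chat_prompt(problem):
    # For LMStyle.OpenAIChat / LMStyle.GenericOAIServer the formatter returns a
    # list of chat messages (system + user); most other styles return a plain string.
    return format_prompt_test_output(problem, LMStyle.OpenAIChat)
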
File without changes
livecodebench/runner/base_runner.py
@@ -0,0 +1,188 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Original Copyright 2025 LiveCodeBench
# For the original license and copyright information, see the LICENSE file in this repository.

import os
import json
from abc import ABC

from tqdm import tqdm

from livecodebench.lm_styles import LanguageModel
from livecodebench.utils.path_utils import get_cache_path
from livecodebench.utils.multiprocess import run_tasks_in_parallel
from livecodebench.runner.scenario_router import Scenario


class BaseRunner(ABC):
    def __init__(self, args, model: LanguageModel):
        self.args = args
        self.model = model
        self.client_kwargs: dict[str | str] = {}

        if self.args.use_cache:
            self.cache_path = get_cache_path(model.model_repr, args)
            if os.path.exists(self.cache_path):
                with open(self.cache_path) as f:
                    self.cache: dict = json.load(f)
            else:
                self.cache = {}
        else:
            self.cache_path = None
            self.cache = None

    def save_cache(self):
        if self.args.use_cache:
            with open(self.cache_path, "w") as f:
                json.dump(self.cache, f, indent=4)

    # @abstractmethod
    def _run_single(self, prompt: str | list[dict[str, str]]) -> list[str]:
        pass

    @staticmethod
    def run_single(combined_args) -> list[str]:
        """
        Run the model for a single prompt and return the output
        Static method to be used in multiprocessing
        Calls the _run_single method with the combined arguments
        """
        prompt: str | list[dict[str, str]]
        cache: dict[str, str]
        call_method: callable
        prompt, cache, args, call_method, sample_idx = combined_args

        if cache is not None and sample_idx in cache:
            if len(cache[sample_idx]) == args.n:
                return cache[sample_idx]

        result = call_method(prompt, idx=sample_idx)
        assert len(result) == args.n

        return result

    def run_batch(self, prompts: list[str | list[dict[str, str]]], indices: list[int], use_progress_bar: bool = False) -> list[list[str]]:
        outputs = []
        arguments = [
            (
                prompt,
                self.cache, ## pass the cache as argument for cache check
                self.args, ## pass the args as argument for cache check
                self._run_single, ## pass the _run_single method as argument because of multiprocessing
                idx, ## pass the sample index
            )
            for prompt, idx in zip(prompts, indices)
        ]
        if self.args.multiprocess > 1:
            parallel_outputs = run_tasks_in_parallel(
                self.run_single,
                arguments,
                self.args.multiprocess,
                use_progress_bar=use_progress_bar,
            )
            for output in parallel_outputs:
                if output.is_success():
                    outputs.append(output.result)
                else:
                    print("Failed to run the model for some prompts")
                    print(output.status)
                    print(output.exception_tb)
                    raise RuntimeError(f"Model run failed with status='{output.status}'.")
        else:
            outputs = [self.run_single(argument) for argument in tqdm(arguments)]

        if self.args.use_cache:
            for output, idx in zip(outputs, indices):
                self.cache[idx] = output ## save the output to cache using sample_idx as key

        return outputs

    def prompts_to_outputs(
        self, prompts: list[str | list[dict[str, str]]]
    ) -> list[list[str]]:
        if self.args.use_cache:
            outputs = []
            batch_size = self.args.cache_batch_size
            for i in tqdm(range(0, len(prompts), batch_size)):
                batch = prompts[i : i + batch_size]
                batch_indices = list(range(i, i + len(batch)))
                batch_outputs = self.run_batch(batch, batch_indices, use_progress_bar=False)
                outputs.extend(batch_outputs)
                self.save_cache()
        else:
            indices = list(range(len(prompts)))
            outputs = self.run_batch(prompts, indices, use_progress_bar=True)
        return outputs

    def run_main_repair(self, benchmark: list, format_prompt: callable) -> list[list[str]]:
        assert self.args.n == 1
        with open(
            f"output/{self.model.model_repr}/{Scenario.codegeneration}_{self.args.codegen_n}_{self.args.temperature}_eval_all.json"
        ) as f:
            check_metadata_list = json.load(f)

        outputs = [
            [None for _ in range(self.args.codegen_n)]
            for _ in range(len(benchmark))
        ]
        prompts = []
        prompt_index_to_question_idx = {}
        prompt_index_to_code_idx = {}
        count = 0

        for problem_idx, problem in enumerate(benchmark):
            for check_metadata_idx, check_metadata in enumerate(check_metadata_list):
                if problem.question_id == check_metadata['question_id']:
                    count += 1
                    question_content = check_metadata["question_content"]
                    code_list = check_metadata["code_list"]
                    output_list = check_metadata["output_list"]
                    graded_list = check_metadata["graded_list"]
                    metadata = check_metadata["metadata"]
                    for code_idx in range(len(code_list)):
                        prompt = format_prompt(
                            question_content,
                            self.model.model_style,
                            code_list[code_idx],
                            graded_list[code_idx],
                            metadata[code_idx],
                        )
                        if prompt == "":
                            outputs[problem_idx][code_idx] = output_list[code_idx]
                            continue
                        prompts.append(prompt)
                        prompt_index_to_question_idx[len(prompts) - 1] = problem_idx
                        prompt_index_to_code_idx[len(prompts) - 1] = code_idx
        assert len(benchmark)==count, f"{len(benchmark)=}!={count=}"

        prompt_outputs = self.prompts_to_outputs(prompts)
        for prompt_idx, output in enumerate(prompt_outputs):
            question_idx = prompt_index_to_question_idx[prompt_idx]
            code_idx = prompt_index_to_code_idx[prompt_idx]
            outputs[question_idx][code_idx] = output

        return outputs

    def run_main(self, benchmark: list, format_prompt: callable) -> list[list[str]]:
        if self.args.scenario == Scenario.selfrepair:
            return self.run_main_repair(benchmark, format_prompt)

        prompts = [
            format_prompt(problem, self.model.model_style) for problem in benchmark
        ]
        outputs = self.prompts_to_outputs(prompts)
        return outputs
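
To make the subclass contract above concrete, here is a hedged sketch (the EchoRunner name and its echoed output are hypothetical, not part of the package): a runner overrides `_run_single`, which `BaseRunner.run_single` invokes as `call_method(prompt, idx=sample_idx)` and whose return value must contain exactly `args.n` generations per prompt.

# Hypothetical subclass, for illustration only.
from livecodebench.runner.base_runner import BaseRunner

class EchoRunner(BaseRunner):
    def __init__(self, args, model):
        super().__init__(args, model)

    def _run_single(self, prompt, idx=None):
        # BaseRunner.run_single asserts len(result) == args.n, so always
        # return args.n generations for the single prompt it was given.
        return [str(prompt)] * self.args.n
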
livecodebench/runner/claude3_runner.py
@@ -0,0 +1,70 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Original Copyright 2025 LiveCodeBench
# For the original license and copyright information, see the LICENSE file in this repository.

import os
from time import sleep

try:
    from anthropic import Anthropic
except ImportError as e:
    pass

from livecodebench.runner.base_runner import BaseRunner


class Claude3Runner(BaseRunner):
    client = Anthropic(api_key=os.getenv("ANTHROPIC_KEY"))

    def __init__(self, args, model):
        super().__init__(args, model)
        self.client_kwargs: dict[str | str] = {
            "model": args.model,
            "temperature": args.temperature,
            "max_tokens": args.max_tokens,
            "top_p": args.top_p,
        }

    def _run_single(self, prompt: tuple[str, str]) -> list[str]:

        def __run_single(counter):
            try:
                response = self.client.messages.create(
                    system=prompt[0],
                    messages=prompt[1],
                    **self.client_kwargs,
                )
                content = "\n".join([x.text for x in response.content])
                return content
            except Exception as e:
                print("Exception: ", repr(e), "Sleeping for 20 seconds...")
                sleep(20 * (11 - counter))
                counter = counter - 1
                if counter == 0:
                    print(f"Failed to run model for {prompt}!")
                    print("Exception: ", repr(e))
                    raise e
                return __run_single(counter)

        outputs = []
        try:
            for _ in range(self.args.n):
                outputs.append(__run_single(10))
        except Exception as e:
            raise e

        return outputs
livecodebench/runner/claude_runner.py
@@ -0,0 +1,69 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Original Copyright 2025 LiveCodeBench
# For the original license and copyright information, see the LICENSE file in this repository.

import os
from time import sleep

try:
    from anthropic import Anthropic
except ImportError as e:
    pass

from livecodebench.runner.base_runner import BaseRunner


class ClaudeRunner(BaseRunner):
    client = Anthropic(api_key=os.getenv("ANTHROPIC_KEY"))

    def __init__(self, args, model):
        super().__init__(args, model)
        self.client_kwargs: dict[str | str] = {
            "model": args.model,
            "temperature": args.temperature,
            "max_tokens_to_sample": args.max_tokens,
            "top_p": args.top_p,
        }

    def _run_single(self, prompt: str) -> list[str]:

        def __run_single(counter):
            try:
                response = self.client.completions.create(
                    prompt=prompt,
                    **self.client_kwargs,
                )
                content = response.completion
                return content
            except Exception as e:
                print("Exception: ", repr(e), "Sleeping for 20 seconds...")
                sleep(20 * (11 - counter))
                counter = counter - 1
                if counter == 0:
                    print(f"Failed to run model for {prompt}!")
                    print("Exception: ", repr(e))
                    raise e
                return __run_single(counter)

        outputs = []
        try:
            for _ in range(self.args.n):
                outputs.append(__run_single(10))
        except Exception as e:
            raise e

        return outputs
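
Putting the prompt and runner pieces together, a rough sketch (the helper name is hypothetical, and `args`/`model` are assumed to be built by the runner's own parser and lm_styles machinery) of how the `(system, messages)` tuple that `format_prompt_test_output` returns for `LMStyle.Claude3` feeds `Claude3Runner`, which then issues `args.n` Anthropic `messages.create` calls with retry and backoff:

# Sketch only: assumes ANTHROPIC_KEY is set and that args, model, and problem exist.
from livecodebench.lm_styles import LMStyle
from livecodebench.prompts.test_output_prediction import format_prompt_test_output
from livecodebench.runner.claude3_runner import Claude3Runner

def claude3_test_output_generations(args, model, problem):
    system, messages = format_prompt_test_output(problem, LMStyle.Claude3)
    runner = Claude3Runner(args, model)
    # Returns a list of args.n completions for this single problem.
    return runner._run_single((system, messages))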
|