nvidia_livecodebench-25.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. core_evals/livecodebench/__init__.py +1 -0
  2. core_evals/livecodebench/framework.yml +233 -0
  3. core_evals/livecodebench/framework_entrypoint.py +28 -0
  4. core_evals/livecodebench/output.py +51 -0
  5. livecodebench/__init__.py +0 -0
  6. livecodebench/benchmarks/__init__.py +31 -0
  7. livecodebench/benchmarks/code_execution.py +85 -0
  8. livecodebench/benchmarks/code_generation.py +160 -0
  9. livecodebench/benchmarks/test_output_prediction.py +90 -0
  10. livecodebench/benchmarks/utils.py +50 -0
  11. livecodebench/evaluation/__init__.py +24 -0
  12. livecodebench/evaluation/compute_code_execution_metrics.py +73 -0
  13. livecodebench/evaluation/compute_code_generation_metrics.py +278 -0
  14. livecodebench/evaluation/compute_scores.py +172 -0
  15. livecodebench/evaluation/compute_test_output_prediction_metrics.py +125 -0
  16. livecodebench/evaluation/metric.py +28 -0
  17. livecodebench/evaluation/old_results_check.py +91 -0
  18. livecodebench/evaluation/pass_k_utils.py +84 -0
  19. livecodebench/evaluation/testing_util.py +574 -0
  20. livecodebench/evaluation/utils_execute.py +285 -0
  21. livecodebench/lm_styles.py +581 -0
  22. livecodebench/prompts/__init__.py +22 -0
  23. livecodebench/prompts/code_execution.py +201 -0
  24. livecodebench/prompts/code_generation.py +372 -0
  25. livecodebench/prompts/few_shot_examples/generation/func.json +12 -0
  26. livecodebench/prompts/few_shot_examples/generation/stdin.json +10 -0
  27. livecodebench/prompts/self_repair.py +370 -0
  28. livecodebench/prompts/test_output_prediction.py +327 -0
  29. livecodebench/runner/__init__.py +0 -0
  30. livecodebench/runner/base_runner.py +188 -0
  31. livecodebench/runner/claude3_runner.py +70 -0
  32. livecodebench/runner/claude_runner.py +69 -0
  33. livecodebench/runner/cohere_runner.py +71 -0
  34. livecodebench/runner/custom_evaluator.py +132 -0
  35. livecodebench/runner/deepseek_runner.py +87 -0
  36. livecodebench/runner/gemini_runner.py +111 -0
  37. livecodebench/runner/generic_oai_server_runner.py +104 -0
  38. livecodebench/runner/main.py +255 -0
  39. livecodebench/runner/mistral_runner.py +71 -0
  40. livecodebench/runner/oai_runner.py +93 -0
  41. livecodebench/runner/parser.py +174 -0
  42. livecodebench/runner/runner_utils.py +62 -0
  43. livecodebench/runner/scenario_router.py +239 -0
  44. livecodebench/runner/vllm_runner.py +82 -0
  45. livecodebench/utils/__init__.py +0 -0
  46. livecodebench/utils/extraction_utils.py +82 -0
  47. livecodebench/utils/multiprocess.py +250 -0
  48. livecodebench/utils/path_utils.py +58 -0
  49. livecodebench/utils/scenarios.py +26 -0
  50. livecodebench/utils/seed_generator.py +44 -0
  51. nvidia_livecodebench-25.8.dist-info/METADATA +518 -0
  52. nvidia_livecodebench-25.8.dist-info/RECORD +55 -0
  53. nvidia_livecodebench-25.8.dist-info/WHEEL +4 -0
  54. nvidia_livecodebench-25.8.dist-info/entry_points.txt +4 -0
  55. nvidia_livecodebench-25.8.dist-info/licenses/LICENSE +21 -0
livecodebench/prompts/test_output_prediction.py
@@ -0,0 +1,327 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+ # Original Copyright 2025 LiveCodeBench
+ # For the original license and copyright information, see the LICENSE file in this repository.
+
+ from anthropic import HUMAN_PROMPT, AI_PROMPT
+
+ from livecodebench.lm_styles import LMStyle
+ from livecodebench.benchmarks import TestOutputPredictionProblem
+
+
+ class PromptConstants:
+     SYSTEM_MESSAGE_CHAT_GENERIC = (
+         "You are a helpful programming assistant and an expert Python programmer. "
+         "You are helping a user write a test case to help check the correctness of the function. "
+         "The user has written an input for the testcase. "
+         "You will calculate the output of the testcase and "
+         "write the whole assertion statement in the markdown code block with the correct output."
+     )
+
+     SYSTEM_MESSAGE_COMPLETION_GENERIC = (
+         "You are a helpful programming assistant and an expert Python programmer. "
+         "You are helping a user write a test case to help check the correctness of the function."
+     )
+
+     SYSTEM_MESSAGE_INST_CLLAMA = (
+         "You are a helpful programming assistant and an expert Python programmer. "
+         "You are helping a user write a test case to help check the correctness of the function. "
+         "The user has written an input for the testcase. "
+         "You will calculate the output of the testcase and "
+         "write out the complete assertion statement between [PYTHON] and [/PYTHON] tags."
+     )
+
+     SYSTEM_MESSAGE_WIZARD = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
+
+     SYSTEM_MESSAGE_PHIND = """You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program. You must put the entire fixed program within code delimiters only once, for example:
+ ```python
+ # YOUR CODE HERE
+ ```"""
+
+     FORMATTING_MESSAGE = "You will use the following starter code to write the solution to the problem and enclose your code within delimiters."
+
+     FORMATTING_WITHOUT_STARTER_MESSAGE = "Read the inputs from stdin, solve the problem, and write the answer to stdout (do not directly test on the sample inputs). Enclose your code within delimiters as follows."
+
+
+ def truncate_io(io):
+     if len(str(io)) > 1000:
+         io = str(io)[:1000] + "...."
+         print(io)
+     return io
+
+
+ def format_testcase_func_name_input(function_name, testcase):
+     """
+     use the form of "assert func_name(input) == "
+     """
+     # TODO should there be a space after the == ?
+     input_str = ", ".join(testcase.split("\n"))
+     return f"assert {function_name}({input_str}) == # TODO"
+
+
+ def parse_function_name_from_starter_code(starter_code):
+     """
+     starter_code : str
+     """
+     import ast
+
+     tree = ast.parse(starter_code)
+     fn = None
+     for node in ast.walk(tree):
+         if isinstance(node, ast.FunctionDef):
+             assert fn is None, "starter code must define exactly one function"
+             fn = node.name
+     return fn
+
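
For illustration, a minimal usage sketch of the two helpers above (hypothetical snippet, not part of the package; the import path follows the file list at the top of this diff):

    from livecodebench.prompts.test_output_prediction import (
        format_testcase_func_name_input,
        parse_function_name_from_starter_code,
    )

    # The parser asserts that the starter code defines exactly one function.
    print(parse_function_name_from_starter_code("def add(a, b):\n    return a + b"))
    # -> add

    # Newline-separated testcase arguments are joined into a call expression.
    print(format_testcase_func_name_input("add", "1\n2"))
    # -> assert add(1, 2) == # TODO
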
+
+ def get_generic_question_template_test_completion(
+     question: TestOutputPredictionProblem, testcase_input: str
+ ):
+     prompt = f"Problem:\n{question.question_content}"
+     prompt += f"Function:\n```\n{question.starter_code}\n```\n"
+
+     # parse function name from starter_code
+     func_name = parse_function_name_from_starter_code(question.starter_code)
+     prompt += "Please complete the following test case:\n\n"
+     prompt += f"```\n{format_testcase_func_name_input(func_name, testcase_input)}\n```\n"
+
+     return prompt
+
+
+ def get_cllama_question_template_answer(
+     question: TestOutputPredictionProblem, testcase_input: str
+ ):
+     prompt = "### Question\n"
+     prompt += get_generic_question_template_test_completion(question, testcase_input)
+     prompt += "### Answer\n"
+     return prompt
+
+
+ def get_deepseekcode_question_template_answer(
+     question: TestOutputPredictionProblem, testcase_input: str
+ ):
+     prompt = f"### Instruction: {PromptConstants.SYSTEM_MESSAGE_CHAT_GENERIC}\n\n"
+     prompt += get_generic_question_template_test_completion(question, testcase_input)
+     prompt += "### Response:\n\n"
+     return prompt
+
+
+ def get_magicoder_question_template_answer(
+     question: TestOutputPredictionProblem, testcase_input: str
+ ):
+     # prompt = f"You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.\n\n"
+     prompt = "Question:\n"
+     prompt += get_generic_question_template_test_completion(question, testcase_input)
+     prompt += "@@ Response \n"
+     return prompt
+
+
+ def get_mixtral_question_template_answer(
+     question: TestOutputPredictionProblem, testcase_input: str
+ ):
+     prompt = get_generic_question_template_test_completion(question, testcase_input)
+     return prompt
+
+
+ def get_wizard_question_template_answer(
+     question: TestOutputPredictionProblem, testcase_input: str
+ ):
+     prompt = f"### Instruction: {PromptConstants.SYSTEM_MESSAGE_CHAT_GENERIC}\n"
+     prompt += get_generic_question_template_test_completion(question, testcase_input)
+     prompt += "### Response:\n"
+     return prompt
+
+
+ def get_phind_question_template_answer(
+     question: TestOutputPredictionProblem, testcase_input: str
+ ):
+     prompt = get_generic_question_template_test_completion(question, testcase_input)
+     prompt += "\n\n### Assistant"
+     return prompt
+
+
+ def get_qwen_question_template_answer(
+     question: TestOutputPredictionProblem, testcase_input: str
+ ):
+     from transformers import AutoTokenizer
+
+     tokenizer = AutoTokenizer.from_pretrained(
+         "abacusai/Dracarys-72B-Instruct", padding_side="left", use_fast=False
+     )
+
+     prompt = f"### Instruction: {PromptConstants.SYSTEM_MESSAGE_CHAT_GENERIC}\n"
+     prompt += get_generic_question_template_test_completion(question, testcase_input)
+     prompt += "### Response:\n"
+
+     messages = [
+         {"role": "user", "content": prompt},
+     ]
+
+     prompt = tokenizer.apply_chat_template(
+         messages,
+         tokenize=False,
+         add_generation_prompt=True,
+         truncation=False,
+         padding=False,
+     )
+     return prompt
+
+
+ def format_prompt_test_output(
+     question: TestOutputPredictionProblem, LanguageModelStyle: LMStyle
+ ) -> str | list[dict[str, str]] | tuple[str, list[dict[str, str]]]:
+     testcase_input = question.test[0].input
+     if LanguageModelStyle in (LMStyle.OpenAIChat, LMStyle.GenericOAIServer):
+         chat_messages = [
+             {
+                 "role": "system",
+                 "content": PromptConstants.SYSTEM_MESSAGE_CHAT_GENERIC,
+             },
+         ]
+         chat_messages += [
+             {
+                 "role": "user",
+                 "content": get_generic_question_template_test_completion(
+                     question, testcase_input
+                 ),
+             },
+         ]
+         return chat_messages
+     elif LanguageModelStyle == LMStyle.LLaMa3:
+         chat_messages = [
+             {
+                 "role": "system",
+                 "content": PromptConstants.SYSTEM_MESSAGE_CHAT_GENERIC,
+             },
+         ]
+         chat_messages += [
+             {
+                 "role": "user",
+                 "content": get_generic_question_template_test_completion(
+                     question, testcase_input
+                 ),
+             },
+         ]
+         from transformers import AutoTokenizer
+
+         tokenizer = AutoTokenizer.from_pretrained(
+             "meta-llama/Meta-Llama-3-8B-Instruct", padding_side="left", use_fast=False
+         )
+         return tokenizer.apply_chat_template(
+             chat_messages,
+             tokenize=False,
+             add_generation_prompt=True,
+             truncation=False,
+             padding=False,
+         )
+     elif LanguageModelStyle == LMStyle.Claude:
+         prompt = f"{HUMAN_PROMPT}\n{PromptConstants.SYSTEM_MESSAGE_CHAT_GENERIC}\n\n"
+         prompt += f"{get_generic_question_template_test_completion(question, testcase_input).rstrip()}\n{AI_PROMPT}"
+         return prompt
+     elif LanguageModelStyle == LMStyle.Claude3:
+         system = PromptConstants.SYSTEM_MESSAGE_CHAT_GENERIC
+         prompt = [
+             {
+                 "role": "user",
+                 "content": get_generic_question_template_test_completion(
+                     question, testcase_input
+                 ).rstrip(),
+             }
+         ]
+         return system, prompt
+     elif LanguageModelStyle == LMStyle.Gemini:
+         prompt = f"{PromptConstants.SYSTEM_MESSAGE_CHAT_GENERIC}\n"
+         prompt += f"{get_generic_question_template_test_completion(question, testcase_input)}"
+         return prompt
+     elif LanguageModelStyle == LMStyle.StarCoderInstruct:
+         prompt = f"{PromptConstants.SYSTEM_MESSAGE_CHAT_GENERIC}\n"
+         prompt += f"{get_generic_question_template_test_completion(question, testcase_input)}"
+         return prompt
+     elif LanguageModelStyle == LMStyle.DeepSeekCodeInstruct:
+         prompt = get_deepseekcode_question_template_answer(question, testcase_input)
+         return prompt
+     elif LanguageModelStyle == LMStyle.CodeLLaMaInstruct:
+         prompt = f"[INST] <<SYS>>\n{PromptConstants.SYSTEM_MESSAGE_INST_CLLAMA}\n<</SYS>>\n\n"
+         prompt += f"{get_cllama_question_template_answer(question, testcase_input)}\n[/INST]"
+         return prompt
+     elif LanguageModelStyle == LMStyle.MagiCoder:
+         prompt = f"{PromptConstants.SYSTEM_MESSAGE_CHAT_GENERIC}\n"
+         prompt += f"{get_magicoder_question_template_answer(question, testcase_input)}"
+         return prompt
+     elif LanguageModelStyle == LMStyle.WizardCoder:
+         prompt = f"{PromptConstants.SYSTEM_MESSAGE_WIZARD}\n\n{get_wizard_question_template_answer(question, testcase_input)}"
+         return prompt
+     elif LanguageModelStyle == LMStyle.Phind:
+         prompt = f"### System Prompt\n\n{PromptConstants.SYSTEM_MESSAGE_CHAT_GENERIC}\n\n### User Message\n\n{get_phind_question_template_answer(question, testcase_input)}"
+         return prompt
+     elif LanguageModelStyle == LMStyle.OC:
+         prompt = f"{PromptConstants.SYSTEM_MESSAGE_CHAT_GENERIC}\n"
+         prompt += f"{get_generic_question_template_test_completion(question, testcase_input)}"
+         return prompt
+     elif LanguageModelStyle == LMStyle.MistralWeb:
+         chat_messages = [
+             {
+                 "role": "system",
+                 "content": PromptConstants.SYSTEM_MESSAGE_CHAT_GENERIC,
+             },
+             {
+                 "role": "user",
+                 "content": get_generic_question_template_test_completion(
+                     question, testcase_input
+                 ),
+             },
+         ]
+         return chat_messages
+     elif LanguageModelStyle == LMStyle.DracarysQwen:
+         prompt = get_qwen_question_template_answer(question, testcase_input)
+         return prompt
+     elif LanguageModelStyle == LMStyle.DracarysLlama:
+         chat_messages = [
+             {
+                 "role": "system",
+                 "content": PromptConstants.SYSTEM_MESSAGE_CHAT_GENERIC,
+             },
+         ]
+         chat_messages += [
+             {
+                 "role": "user",
+                 "content": get_generic_question_template_test_completion(
+                     question, testcase_input
+                 ),
+             },
+         ]
+         from transformers import AutoTokenizer
+
+         tokenizer = AutoTokenizer.from_pretrained(
+             "abacusai/Dracarys-Llama-3.1-70B-Instruct", padding_side="right", use_fast=False
+         )
+         return tokenizer.apply_chat_template(
+             chat_messages,
+             tokenize=False,
+             add_generation_prompt=True,
+             truncation=False,
+             padding=False,
+         )
+     else:
+         raise NotImplementedError(
+             f"LanguageModelStyle {LanguageModelStyle} not implemented"
+         )
livecodebench/runner/__init__.py (file without changes)

livecodebench/runner/base_runner.py
@@ -0,0 +1,188 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+ # Original Copyright 2025 LiveCodeBench
+ # For the original license and copyright information, see the LICENSE file in this repository.
+
+ import os
+ import json
+ from abc import ABC
+
+ from tqdm import tqdm
+
+ from livecodebench.lm_styles import LanguageModel
+ from livecodebench.utils.path_utils import get_cache_path
+ from livecodebench.utils.multiprocess import run_tasks_in_parallel
+ from livecodebench.runner.scenario_router import Scenario
+
+
+ class BaseRunner(ABC):
+     def __init__(self, args, model: LanguageModel):
+         self.args = args
+         self.model = model
+         self.client_kwargs: dict[str, str] = {}
+
+         if self.args.use_cache:
+             self.cache_path = get_cache_path(model.model_repr, args)
+             if os.path.exists(self.cache_path):
+                 with open(self.cache_path) as f:
+                     self.cache: dict = json.load(f)
+             else:
+                 self.cache = {}
+         else:
+             self.cache_path = None
+             self.cache = None
+
+     def save_cache(self):
+         if self.args.use_cache:
+             with open(self.cache_path, "w") as f:
+                 json.dump(self.cache, f, indent=4)
+
+     # @abstractmethod
+     def _run_single(self, prompt: str | list[dict[str, str]], idx: int | None = None) -> list[str]:
+         # Subclasses override this; run_single passes the sample index as idx.
+         pass
+
+     @staticmethod
+     def run_single(combined_args) -> list[str]:
+         """
+         Run the model for a single prompt and return the output.
+         Static method so it can be used with multiprocessing; unpacks the
+         combined arguments and calls the bound _run_single method.
+         """
+         prompt: str | list[dict[str, str]]
+         cache: dict[str, str]
+         call_method: callable
+         prompt, cache, args, call_method, sample_idx = combined_args
+
+         if cache is not None and sample_idx in cache:
+             if len(cache[sample_idx]) == args.n:
+                 return cache[sample_idx]
+
+         result = call_method(prompt, idx=sample_idx)
+         assert len(result) == args.n
+
+         return result
+
+     def run_batch(
+         self,
+         prompts: list[str | list[dict[str, str]]],
+         indices: list[int],
+         use_progress_bar: bool = False,
+     ) -> list[list[str]]:
+         outputs = []
+         arguments = [
+             (
+                 prompt,
+                 self.cache,  # the cache, for the cache check in run_single
+                 self.args,  # the args, for the cache check in run_single
+                 self._run_single,  # bound method, passed explicitly for multiprocessing
+                 idx,  # the sample index
+             )
+             for prompt, idx in zip(prompts, indices)
+         ]
+         if self.args.multiprocess > 1:
+             parallel_outputs = run_tasks_in_parallel(
+                 self.run_single,
+                 arguments,
+                 self.args.multiprocess,
+                 use_progress_bar=use_progress_bar,
+             )
+             for output in parallel_outputs:
+                 if output.is_success():
+                     outputs.append(output.result)
+                 else:
+                     print("Failed to run the model for some prompts")
+                     print(output.status)
+                     print(output.exception_tb)
+                     raise RuntimeError(f"Model run failed with status='{output.status}'.")
+         else:
+             outputs = [self.run_single(argument) for argument in tqdm(arguments)]
+
+         if self.args.use_cache:
+             for output, idx in zip(outputs, indices):
+                 self.cache[idx] = output  # cache the outputs keyed by sample index
+
+         return outputs
+
+     def prompts_to_outputs(
+         self, prompts: list[str | list[dict[str, str]]]
+     ) -> list[list[str]]:
+         if self.args.use_cache:
+             outputs = []
+             batch_size = self.args.cache_batch_size
+             for i in tqdm(range(0, len(prompts), batch_size)):
+                 batch = prompts[i : i + batch_size]
+                 batch_indices = list(range(i, i + len(batch)))
+                 batch_outputs = self.run_batch(batch, batch_indices, use_progress_bar=False)
+                 outputs.extend(batch_outputs)
+                 self.save_cache()
+         else:
+             indices = list(range(len(prompts)))
+             outputs = self.run_batch(prompts, indices, use_progress_bar=True)
+         return outputs
+
+     def run_main_repair(self, benchmark: list, format_prompt: callable) -> list[list[str]]:
+         assert self.args.n == 1
+         with open(
+             f"output/{self.model.model_repr}/{Scenario.codegeneration}_{self.args.codegen_n}_{self.args.temperature}_eval_all.json"
+         ) as f:
+             check_metadata_list = json.load(f)
+
+         outputs = [
+             [None for _ in range(self.args.codegen_n)]
+             for _ in range(len(benchmark))
+         ]
+         prompts = []
+         prompt_index_to_question_idx = {}
+         prompt_index_to_code_idx = {}
+         count = 0
+
+         for problem_idx, problem in enumerate(benchmark):
+             for check_metadata_idx, check_metadata in enumerate(check_metadata_list):
+                 if problem.question_id == check_metadata["question_id"]:
+                     count += 1
+                     question_content = check_metadata["question_content"]
+                     code_list = check_metadata["code_list"]
+                     output_list = check_metadata["output_list"]
+                     graded_list = check_metadata["graded_list"]
+                     metadata = check_metadata["metadata"]
+                     for code_idx in range(len(code_list)):
+                         prompt = format_prompt(
+                             question_content,
+                             self.model.model_style,
+                             code_list[code_idx],
+                             graded_list[code_idx],
+                             metadata[code_idx],
+                         )
+                         if prompt == "":
+                             outputs[problem_idx][code_idx] = output_list[code_idx]
+                             continue
+                         prompts.append(prompt)
+                         prompt_index_to_question_idx[len(prompts) - 1] = problem_idx
+                         prompt_index_to_code_idx[len(prompts) - 1] = code_idx
+         assert len(benchmark) == count, f"{len(benchmark)=} != {count=}"
+
+         prompt_outputs = self.prompts_to_outputs(prompts)
+         for prompt_idx, output in enumerate(prompt_outputs):
+             question_idx = prompt_index_to_question_idx[prompt_idx]
+             code_idx = prompt_index_to_code_idx[prompt_idx]
+             outputs[question_idx][code_idx] = output
+
+         return outputs
+
+     def run_main(self, benchmark: list, format_prompt: callable) -> list[list[str]]:
+         if self.args.scenario == Scenario.selfrepair:
+             return self.run_main_repair(benchmark, format_prompt)
+
+         prompts = [
+             format_prompt(problem, self.model.model_style) for problem in benchmark
+         ]
+         outputs = self.prompts_to_outputs(prompts)
+         return outputs
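
A minimal sketch of the subclassing contract (hypothetical EchoRunner, not shipped with the package): _run_single receives one formatted prompt plus the sample index and must return exactly args.n outputs; the base class handles caching, batching, and multiprocessing.

    from types import SimpleNamespace

    from livecodebench.runner.base_runner import BaseRunner

    class EchoRunner(BaseRunner):
        def _run_single(self, prompt, idx=None):
            # A real runner would call a model API here.
            return [f"echo: {prompt}"] * self.args.n

    args = SimpleNamespace(n=1, use_cache=False, multiprocess=0, scenario=None)
    model = SimpleNamespace(model_repr="echo", model_style=None)
    print(EchoRunner(args, model).prompts_to_outputs(["hello"]))
    # -> [['echo: hello']]
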
livecodebench/runner/claude3_runner.py
@@ -0,0 +1,70 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+ # Original Copyright 2025 LiveCodeBench
+ # For the original license and copyright information, see the LICENSE file in this repository.
+
+ import os
+ from time import sleep
+
+ try:
+     from anthropic import Anthropic
+ except ImportError:
+     # anthropic is optional; without it this module fails at class creation below.
+     pass
+
+ from livecodebench.runner.base_runner import BaseRunner
+
+
+ class Claude3Runner(BaseRunner):
+     client = Anthropic(api_key=os.getenv("ANTHROPIC_KEY"))
+
+     def __init__(self, args, model):
+         super().__init__(args, model)
+         self.client_kwargs: dict[str, str] = {
+             "model": args.model,
+             "temperature": args.temperature,
+             "max_tokens": args.max_tokens,
+             "top_p": args.top_p,
+         }
+
+     def _run_single(self, prompt: tuple[str, list[dict[str, str]]], idx: int | None = None) -> list[str]:
+         # idx is passed by BaseRunner.run_single and unused here.
+
+         def __run_single(counter):
+             try:
+                 response = self.client.messages.create(
+                     system=prompt[0],
+                     messages=prompt[1],
+                     **self.client_kwargs,
+                 )
+                 content = "\n".join([x.text for x in response.content])
+                 return content
+             except Exception as e:
+                 wait = 20 * (11 - counter)
+                 print("Exception: ", repr(e), f"Sleeping for {wait} seconds...")
+                 sleep(wait)
+                 counter = counter - 1
+                 if counter == 0:
+                     print(f"Failed to run model for {prompt}!")
+                     print("Exception: ", repr(e))
+                     raise e
+                 return __run_single(counter)
+
+         outputs = []
+         for _ in range(self.args.n):
+             outputs.append(__run_single(10))
+
+         return outputs
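
Note on the retry logic above: each failed attempt sleeps 20 * (11 - counter) seconds, so with the initial counter of 10 the waits escalate from 20 s up to 200 s before the exception is re-raised. The same schedule as an iterative sketch (hypothetical helper, not part of the package):

    import time

    def call_with_backoff(fn, retries=10, base=20.0):
        """Retry fn() with escalating waits of base * attempt seconds."""
        for attempt in range(1, retries + 1):
            try:
                return fn()
            except Exception:
                if attempt == retries:
                    raise
                time.sleep(base * attempt)  # 20 s, 40 s, ... like 20 * (11 - counter)
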
livecodebench/runner/claude_runner.py
@@ -0,0 +1,69 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+ # Original Copyright 2025 LiveCodeBench
+ # For the original license and copyright information, see the LICENSE file in this repository.
+
+ import os
+ from time import sleep
+
+ try:
+     from anthropic import Anthropic
+ except ImportError:
+     # anthropic is optional; without it this module fails at class creation below.
+     pass
+
+ from livecodebench.runner.base_runner import BaseRunner
+
+
+ class ClaudeRunner(BaseRunner):
+     client = Anthropic(api_key=os.getenv("ANTHROPIC_KEY"))
+
+     def __init__(self, args, model):
+         super().__init__(args, model)
+         self.client_kwargs: dict[str, str] = {
+             "model": args.model,
+             "temperature": args.temperature,
+             "max_tokens_to_sample": args.max_tokens,
+             "top_p": args.top_p,
+         }
+
+     def _run_single(self, prompt: str, idx: int | None = None) -> list[str]:
+         # idx is passed by BaseRunner.run_single and unused here.
+
+         def __run_single(counter):
+             try:
+                 response = self.client.completions.create(
+                     prompt=prompt,
+                     **self.client_kwargs,
+                 )
+                 content = response.completion
+                 return content
+             except Exception as e:
+                 wait = 20 * (11 - counter)
+                 print("Exception: ", repr(e), f"Sleeping for {wait} seconds...")
+                 sleep(wait)
+                 counter = counter - 1
+                 if counter == 0:
+                     print(f"Failed to run model for {prompt}!")
+                     print("Exception: ", repr(e))
+                     raise e
+                 return __run_single(counter)
+
+         outputs = []
+         for _ in range(self.args.n):
+             outputs.append(__run_single(10))
+
+         return outputs
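
For contrast with Claude3Runner: ClaudeRunner targets Anthropic's legacy text-completions API, which consumes a single HUMAN_PROMPT/AI_PROMPT-delimited string (the shape produced by the LMStyle.Claude branch of format_prompt_test_output). A minimal sketch of that shape, with hypothetical values:

    from anthropic import AI_PROMPT, HUMAN_PROMPT  # "\n\nHuman:" / "\n\nAssistant:"

    system = "You are a helpful programming assistant."
    question = "assert add(1, 2) == # TODO"

    # Single-string prompt for client.completions.create(...); Claude3Runner
    # instead unpacks a (system, messages) tuple for client.messages.create(...).
    prompt = f"{HUMAN_PROMPT}\n{system}\n\n{question}\n{AI_PROMPT}"
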