nvidia-livecodebench 25.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- core_evals/livecodebench/__init__.py +1 -0
- core_evals/livecodebench/framework.yml +233 -0
- core_evals/livecodebench/framework_entrypoint.py +28 -0
- core_evals/livecodebench/output.py +51 -0
- livecodebench/__init__.py +0 -0
- livecodebench/benchmarks/__init__.py +31 -0
- livecodebench/benchmarks/code_execution.py +85 -0
- livecodebench/benchmarks/code_generation.py +160 -0
- livecodebench/benchmarks/test_output_prediction.py +90 -0
- livecodebench/benchmarks/utils.py +50 -0
- livecodebench/evaluation/__init__.py +24 -0
- livecodebench/evaluation/compute_code_execution_metrics.py +73 -0
- livecodebench/evaluation/compute_code_generation_metrics.py +278 -0
- livecodebench/evaluation/compute_scores.py +172 -0
- livecodebench/evaluation/compute_test_output_prediction_metrics.py +125 -0
- livecodebench/evaluation/metric.py +28 -0
- livecodebench/evaluation/old_results_check.py +91 -0
- livecodebench/evaluation/pass_k_utils.py +84 -0
- livecodebench/evaluation/testing_util.py +574 -0
- livecodebench/evaluation/utils_execute.py +285 -0
- livecodebench/lm_styles.py +581 -0
- livecodebench/prompts/__init__.py +22 -0
- livecodebench/prompts/code_execution.py +201 -0
- livecodebench/prompts/code_generation.py +372 -0
- livecodebench/prompts/few_shot_examples/generation/func.json +12 -0
- livecodebench/prompts/few_shot_examples/generation/stdin.json +10 -0
- livecodebench/prompts/self_repair.py +370 -0
- livecodebench/prompts/test_output_prediction.py +327 -0
- livecodebench/runner/__init__.py +0 -0
- livecodebench/runner/base_runner.py +188 -0
- livecodebench/runner/claude3_runner.py +70 -0
- livecodebench/runner/claude_runner.py +69 -0
- livecodebench/runner/cohere_runner.py +71 -0
- livecodebench/runner/custom_evaluator.py +132 -0
- livecodebench/runner/deepseek_runner.py +87 -0
- livecodebench/runner/gemini_runner.py +111 -0
- livecodebench/runner/generic_oai_server_runner.py +104 -0
- livecodebench/runner/main.py +255 -0
- livecodebench/runner/mistral_runner.py +71 -0
- livecodebench/runner/oai_runner.py +93 -0
- livecodebench/runner/parser.py +174 -0
- livecodebench/runner/runner_utils.py +62 -0
- livecodebench/runner/scenario_router.py +239 -0
- livecodebench/runner/vllm_runner.py +82 -0
- livecodebench/utils/__init__.py +0 -0
- livecodebench/utils/extraction_utils.py +82 -0
- livecodebench/utils/multiprocess.py +250 -0
- livecodebench/utils/path_utils.py +58 -0
- livecodebench/utils/scenarios.py +26 -0
- livecodebench/utils/seed_generator.py +44 -0
- nvidia_livecodebench-25.8.dist-info/METADATA +518 -0
- nvidia_livecodebench-25.8.dist-info/RECORD +55 -0
- nvidia_livecodebench-25.8.dist-info/WHEEL +4 -0
- nvidia_livecodebench-25.8.dist-info/entry_points.txt +4 -0
- nvidia_livecodebench-25.8.dist-info/licenses/LICENSE +21 -0
livecodebench/prompts/code_execution.py
@@ -0,0 +1,201 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Original Copyright 2025 LiveCodeBench
+# For the original license and copyright information, see the LICENSE file in this repository.
+
+import json
+
+from livecodebench.lm_styles import LMStyle
+from livecodebench.benchmarks import CodeExecutionProblem
+
+
+def make_cot_output_prompt(s):
+    code, input = s
+    return f"""You are given a Python function and an assertion containing an input to the function. Complete the assertion with a literal (no unsimplified expressions, no function calls) containing the output when executing the provided code on the given input, even if the function is incorrect or incomplete. Do NOT output any extra information. Execute the program step by step before arriving at an answer, and provide the full assertion with the correct output in [ANSWER] and [/ANSWER] tags, following the examples.
+
+[PYTHON]
+def performOperation(s):
+    s = s + s
+    return "b" + s + "a"
+assert performOperation(s = "hi") == ??
+[/PYTHON]
+[THOUGHT]
+Let's execute the code step by step:
+
+1. The function performOperation is defined, which takes a single argument s.
+2. The function is called with the argument "hi", so within the function, s is initially "hi".
+3. Inside the function, s is concatenated with itself, so s becomes "hihi".
+4. The function then returns a new string that starts with "b", followed by the value of s (which is now "hihi"), and ends with "a".
+5. The return value of the function is therefore "bhihia".
+[/THOUGHT]
+[ANSWER]
+assert performOperation(s = "hi") == "bhihia"
+[/ANSWER]
+
+[PYTHON]
+{code}
+assert {input} == ??
+[/PYTHON]
+[THOUGHT]
+"""
+
+
+def make_direct_output_prompt(s):
+    code, input = s
+    return f"""You are given a Python function and an assertion containing an input to the function. Complete the assertion with a literal (no unsimplified expressions, no function calls) containing the output when executing the provided code on the given input, even if the function is incorrect or incomplete. Do NOT output any extra information. Provide the full assertion with the correct output in [ANSWER] and [/ANSWER] tags, following the examples.
+
+[PYTHON]
+def repeatNumber(number : int) -> int:
+    return number
+assert repeatNumber(number = 17) == ??
+[/PYTHON]
+[ANSWER]
+assert repeatNumber(number = 17) == 17
+[/ANSWER]
+
+[PYTHON]
+def addCharacterA(string : str) -> str:
+    return string + "a"
+assert addCharacterA(string = "x9j") == ??
+[/PYTHON]
+[ANSWER]
+assert addCharacterA(string = "x9j") == "x9ja"
+[/ANSWER]
+
+[PYTHON]
+{code}
+assert {input} == ??
+[/PYTHON]
+[ANSWER]
+"""
+
+
+def format_prompt_execution(question, LanguageModelStyle):
+    return format_prompt_execution_base(question, LanguageModelStyle, False)
+
+
+def format_prompt_execution_cot(question, LanguageModelStyle):
+    return format_prompt_execution_base(question, LanguageModelStyle, True)
+
+
+def format_prompt_execution_base(
+    question: CodeExecutionProblem, LanguageModelStyle: LMStyle, cot: bool
+) -> str:
+    code = question.code
+    input = question.input
+    system_message = "You are an expert at Python programming, code execution, test case generation, and fuzzing."
+    if cot:
+        prompt = make_cot_output_prompt((code, input))
+    else:
+        prompt = make_direct_output_prompt((code, input))
+
+    if LanguageModelStyle in (LMStyle.OpenAIChat, LMStyle.GenericOAIServer):
+        chat_messages = [
+            {
+                "role": "system",
+                "content": system_message,
+            },
+        ]
+        chat_messages += [
+            {"role": "user", "content": prompt},
+        ]
+        return chat_messages
+    if LanguageModelStyle == LMStyle.LLaMa3:
+        chat_messages = [
+            {
+                "role": "system",
+                "content": system_message,
+            },
+        ]
+        chat_messages += [
+            {"role": "user", "content": prompt},
+        ]
+        from transformers import AutoTokenizer
+
+        tokenizer = AutoTokenizer.from_pretrained(
+            "meta-llama/Meta-Llama-3-8B-Instruct", padding_side="left", use_fast=False
+        )
+        return tokenizer.apply_chat_template(
+            chat_messages,
+            tokenize=False,
+            add_generation_prompt=True,
+            truncation=False,
+            padding=False,
+        )
+
+    elif LanguageModelStyle == LMStyle.Claude:
+        return prompt
+    elif LanguageModelStyle == LMStyle.Claude3:
+        prompt = [
+            {
+                "role": "user",
+                "content": prompt,
+            }
+        ]
+        return system_message, prompt
+    elif LanguageModelStyle == LMStyle.Gemini:
+        return prompt
+    elif LanguageModelStyle == LMStyle.StarCoderInstruct:
+        return prompt
+    elif LanguageModelStyle == LMStyle.DeepSeekCodeInstruct:
+        return prompt
+    elif LanguageModelStyle == LMStyle.CodeLLaMaInstruct:
+        return prompt
+    elif LanguageModelStyle == LMStyle.MagiCoder:
+        return prompt
+    elif LanguageModelStyle == LMStyle.WizardCoder:
+        return prompt
+    elif LanguageModelStyle == LMStyle.Phind:
+        return prompt
+    elif LanguageModelStyle == LMStyle.OC:
+        return prompt
+    elif LanguageModelStyle == LMStyle.MistralWeb:
+        chat_messages = [
+            {
+                "role": "system",
+                "content": system_message,
+            },
+            {"role": "user", "content": prompt},
+        ]
+        return chat_messages
+    elif LanguageModelStyle == LMStyle.DracarysLlama:
+        chat_messages = [
+            {
+                "role": "system",
+                "content": system_message,
+            },
+        ]
+        chat_messages += [
+            {"role": "user", "content": prompt},
+        ]
+        from transformers import AutoTokenizer
+
+        tokenizer = AutoTokenizer.from_pretrained(
+            "abacusai/Dracarys-Llama-3.1-70B-Instruct", padding_side="right", use_fast=False
+        )
+        return tokenizer.apply_chat_template(
+            chat_messages,
+            tokenize=False,
+            add_generation_prompt=True,
+            truncation=False,
+            padding=False,
+        )
+    elif LanguageModelStyle == LMStyle.DracarysQwen:
+        return prompt
+    else:
+        raise NotImplementedError(
+            f"LanguageModelStyle {LanguageModelStyle} not implemented"
+        )
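For orientation, here is a minimal, hypothetical sketch of how these formatters are meant to be driven (not part of the package; it only assumes a CodeExecutionProblem exposing the .code and .input attributes that format_prompt_execution_base reads):

from livecodebench.lm_styles import LMStyle
from livecodebench.prompts.code_execution import format_prompt_execution_cot

problem = ...  # a CodeExecutionProblem from the benchmark loader (construction omitted)
messages = format_prompt_execution_cot(problem, LMStyle.OpenAIChat)
# For OpenAIChat/GenericOAIServer the result is a system+user message list ready for a
# chat-completions API; most completion-style LMStyles instead return the raw prompt string.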
livecodebench/prompts/code_generation.py
@@ -0,0 +1,372 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Original Copyright 2025 LiveCodeBench
+# For the original license and copyright information, see the LICENSE file in this repository.
+
+import json
+from pathlib import Path
+
+try:
+    from anthropic import HUMAN_PROMPT, AI_PROMPT
+except ImportError:
+    HUMAN_PROMPT = None
+    AI_PROMPT = None
+
+from livecodebench.lm_styles import LMStyle
+from livecodebench.benchmarks.code_generation import CodeGenerationProblem
+
+
+class PromptConstants:
+    SYSTEM_MESSAGE_GENERIC = f"You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests."
+
+    SYSTEM_MESSAGE_GEMINI = f"You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. Do NOT use system calls like `exit` in the generated program. Ensure that the first code block contains the solution."
+
+    SYSTEM_MESSAGE_GEMINITHINK = f"You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests."
+
+    SYSTEM_MESSAGE_DEEPSEEK = f"You are an AI programming assistant, utilizing the DeepSeek Coder model, developed by DeepSeek Company, and you answer questions related to computer science."
+
+    SYSTEM_MESSAGE_CODEQWEN = (
+        f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user"
+    )
+
+    FORMATTING_MESSAGE_WITH_STARTER_CODE = "You will use the following starter code to write the solution to the problem and enclose your code within delimiters."
+
+    FORMATTING_WITHOUT_STARTER_CODE = "Read the inputs from stdin solve the problem and write the answer to stdout (do not directly test on the sample inputs). Enclose your code within delimiters as follows. Ensure that when the python program runs, it reads the inputs, runs the algorithm and writes output to STDOUT."
+
+
+def get_generic_question_template_answer(question: CodeGenerationProblem):
+    prompt = f"### Question:\n{question.question_content}\n\n"
+    if question.starter_code:
+        prompt += (
+            f"### Format: {PromptConstants.FORMATTING_MESSAGE_WITH_STARTER_CODE}\n"
+        )
+        prompt += f"```python\n{question.starter_code}\n```\n\n"
+    else:
+        prompt += f"### Format: {PromptConstants.FORMATTING_WITHOUT_STARTER_CODE}\n"
+        prompt += "```python\n# YOUR CODE HERE\n```\n\n"
+    prompt += f"### Answer: (use the provided format with backticks)\n\n"
+    return prompt
+
+
+def get_oaireason_question_template_answer(question: CodeGenerationProblem):
+    prompt = f"### Question:\n{question.question_content}\n\n"
+    if question.starter_code:
+        prompt += (
+            f"### Format: {PromptConstants.FORMATTING_MESSAGE_WITH_STARTER_CODE}\n"
+        )
+        prompt += f"```python\n{question.starter_code}\n```\n\n"
+    else:
+        prompt += f"### Format: Implement a function called `main()` which orchastrates the solution by reading inputs from stdin and writing the answer to stdout. Feel free to use additional functions as necessary. Next do NOT forget to call `main` function at the end of the program otherwise you will not be awarded any points.\n"
+        prompt += "```python\n# YOUR CODE HERE\n```\n\n"
+    prompt += f"### Answer: (use the provided format with backticks)\n\n"
+    return prompt
+
+
+def get_geminithinking_question_template_answer(question: CodeGenerationProblem):
+    prompt = f"### Question:\n{question.question_content}\n\n"
+    if question.starter_code:
+        prompt += (
+            f"### Format: {PromptConstants.FORMATTING_MESSAGE_WITH_STARTER_CODE}\n"
+        )
+        prompt += f"```python\n{question.starter_code}\n```\n\n"
+    else:
+        prompt += f"### Format: {PromptConstants.FORMATTING_WITHOUT_STARTER_CODE}\n"
+        prompt += "```python\n# YOUR CODE HERE\n```\n\n"
+    prompt += f"### Answer: (use the provided format with backticks)\n\n"
+    return prompt
+
+
+def get_deepseekcode_question_template_answer(question: CodeGenerationProblem):
+    prompt = f"### Instruction: You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.\n\n"
+    prompt += f"Question:\n{question.question_content}\n\n"
+    if question.starter_code:
+        prompt += (
+            f"### Instruction: {PromptConstants.FORMATTING_MESSAGE_WITH_STARTER_CODE}\n"
+        )
+        prompt += f"```python\n{question.starter_code}\n```\n\n"
+    else:
+        prompt += (
+            f"### Instruction: {PromptConstants.FORMATTING_WITHOUT_STARTER_CODE}\n"
+        )
+        prompt += f"```python\n# YOUR CODE HERE\n```\n\n"
+    prompt += f"### Response:\n\n"
+    return prompt
+
+
+def get_qwen_question_template_answer(question: CodeGenerationProblem):
+    from transformers import AutoTokenizer
+
+    tokenizer = AutoTokenizer.from_pretrained(
+        "/abacus/models/Qwen1.5-72B-Chat/", padding_side="left", use_fast=False
+    )
+    prompt = "You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.\n\n"
+    prompt += f"Question:\n{question.question_content}\n\n"
+    if question.starter_code:
+        prompt += f"{PromptConstants.FORMATTING_MESSAGE_WITH_STARTER_CODE}\n"
+        prompt += f"```python\n{question.starter_code}\n```\n\n"
+    else:
+        prompt += f"{PromptConstants.FORMATTING_WITHOUT_STARTER_CODE}\n\n"
+        prompt += f"```python\n# YOUR CODE HERE\n```\n\n"
+
+    messages = [
+        {"role": "system", "content": PromptConstants.SYSTEM_MESSAGE_GENERIC},
+        {"role": "user", "content": prompt},
+    ]
+
+    prompt = tokenizer.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True,
+        truncation=False,
+        padding=False,
+    )
+    return prompt
+
+
+def get_codeqwen_question_template_answer(question: CodeGenerationProblem):
+    prompt = "You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.\n\n"
+    prompt += f"Question: {question.question_content}\n\n"
+    if question.starter_code:
+        prompt += f"{PromptConstants.FORMATTING_MESSAGE_WITH_STARTER_CODE}\n"
+        prompt += f"```python\n{question.starter_code}\n```\n\n<|im_end|>\n"
+    else:
+        prompt += f"{PromptConstants.FORMATTING_WITHOUT_STARTER_CODE}\n"
+        prompt += f"```python\n# YOUR CODE HERE\n```\n\n<|im_end|>\n"
+    prompt += f"<|im_start|>assistant\n"
+    return prompt
+
+
+# Get the absolute path of the current script
+current_dir = Path(__file__).absolute().parent
+
+# Navigate two directories up to reach the project root
+project_root = current_dir.parent.parent
+
+# Construct the path to the JSON files relative to the project root
+func_path = project_root / 'livecodebench' / 'prompts' / 'few_shot_examples' / 'generation' / 'func.json'
+stdin_path = project_root / 'livecodebench' / 'prompts' / 'few_shot_examples' / 'generation' / 'stdin.json'
+
+# Open and load the JSON files
+with open(func_path) as f:
+    func = json.load(f)
+
+with open(stdin_path) as f:
+    stdin = json.load(f)
+
+
+def get_base_model_question_template_answer(question: CodeGenerationProblem):
+    if question.starter_code:
+        examples_json = func
+    else:
+        examples_json = stdin
+
+    def get_example_prompt(example):
+        prompt = ""
+        prompt += "### Question\n"
+        prompt += example["question"]
+        prompt += "\n\n"
+        if question.starter_code:
+            prompt += "### Starter Code\n"
+            prompt += example["sample_code"]
+            prompt += "\n\n"
+        prompt += "### Answer\n\n"
+        prompt += example["answer"]
+        if example["answer"]:
+            prompt += "\n\n"
+        return prompt
+
+    prompt = ""
+    prompt += get_example_prompt(examples_json[0])
+    prompt += get_example_prompt(
+        {
+            "question": question.question_content,
+            "sample_code": question.starter_code,
+            "answer": "",
+        }
+    )
+    return prompt
+
+
+def format_prompt_generation(
+    question: CodeGenerationProblem, LanguageModelStyle: LMStyle
+) -> str:
+    if LanguageModelStyle in [LMStyle.OpenAIChat, LMStyle.DeepSeekAPI]:
+        chat_messages = [
+            {
+                "role": "system",
+                "content": PromptConstants.SYSTEM_MESSAGE_GENERIC,
+            },
+        ]
+        chat_messages += [
+            {
+                "role": "user",
+                "content": get_generic_question_template_answer(question),
+            },
+        ]
+        return chat_messages
+    elif LanguageModelStyle == LMStyle.OpenAIReasonPreview:
+        chat_messages = [
+            {
+                "role": "user",
+                "content": PromptConstants.SYSTEM_MESSAGE_GENERIC
+                + "\n\n"
+                + get_generic_question_template_answer(question),
+            },
+        ]
+        return chat_messages
+    elif LanguageModelStyle == LMStyle.OpenAIReason:
+        chat_messages = [
+            {
+                "role": "user",
+                "content": PromptConstants.SYSTEM_MESSAGE_GENERIC
+                + "\n\n"
+                + get_oaireason_question_template_answer(question),
+            },
+        ]
+        return chat_messages
+
+    if LanguageModelStyle == LMStyle.LLaMa3:
+        chat_messages = [
+            {
+                "role": "system",
+                "content": PromptConstants.SYSTEM_MESSAGE_GENERIC,
+            },
+        ]
+        chat_messages += [
+            {
+                "role": "user",
+                "content": get_generic_question_template_answer(question),
+            },
+        ]
+        from transformers import AutoTokenizer
+
+        tokenizer = AutoTokenizer.from_pretrained(
+            "meta-llama/Meta-Llama-3-8B-Instruct", padding_side="left", use_fast=False
+        )
+        return tokenizer.apply_chat_template(
+            chat_messages,
+            tokenize=False,
+            add_generation_prompt=True,
+            truncation=False,
+            padding=False,
+        )
+
+    if LanguageModelStyle == LMStyle.Claude:
+        prompt = f"{HUMAN_PROMPT}\n"
+        prompt += f"{PromptConstants.SYSTEM_MESSAGE_GENERIC}\n\n"
+        prompt += f"{get_generic_question_template_answer(question).rstrip()}\n"
+        prompt += f"{AI_PROMPT}"
+        return prompt
+
+    if LanguageModelStyle == LMStyle.Claude3:
+        system = PromptConstants.SYSTEM_MESSAGE_GENERIC
+        prompt = [
+            {
+                "role": "user",
+                "content": get_generic_question_template_answer(question).rstrip(),
+            }
+        ]
+        return system, prompt
+
+    if LanguageModelStyle == LMStyle.Gemini:
+        prompt = f"{PromptConstants.SYSTEM_MESSAGE_GEMINI}\n"
+        prompt += f"{get_generic_question_template_answer(question)}"
+        return prompt
+
+    if LanguageModelStyle == LMStyle.GeminiThinking:
+        prompt = f"{PromptConstants.SYSTEM_MESSAGE_GEMINITHINK}\n"
+        prompt += f"{get_geminithinking_question_template_answer(question)}"
+        return prompt
+
+    if LanguageModelStyle == LMStyle.MistralWeb:
+        chat_messages = [
+            {
+                "role": "system",
+                "content": PromptConstants.SYSTEM_MESSAGE_GENERIC,
+            },
+            {
+                "role": "user",
+                "content": get_generic_question_template_answer(question),
+            },
+        ]
+        return chat_messages
+
+    if LanguageModelStyle == LMStyle.DeepSeekCodeInstruct:
+        prompt = f"{PromptConstants.SYSTEM_MESSAGE_DEEPSEEK}\n\n"
+        prompt += f"{get_deepseekcode_question_template_answer(question)}"
+        return prompt
+
+    if LanguageModelStyle == LMStyle.CodeQwenInstruct:
+        prompt = f"{PromptConstants.SYSTEM_MESSAGE_CODEQWEN}\n\n"
+        prompt += f"{get_codeqwen_question_template_answer(question)}"
+        return prompt
+
+    if LanguageModelStyle == LMStyle.GenericBase:
+        prompt = get_base_model_question_template_answer(question)
+        return prompt
+
+    if LanguageModelStyle == LMStyle.GenericOAIServer:
+        prompt = [
+            {
+                "role": "user",
+                "content": get_generic_question_template_answer(question).rstrip(),
+            }
+        ]
+        return prompt
+
+    raise NotImplementedError(
+        f"LanguageModelStyle {LanguageModelStyle} not implemented"
+    )
+
+
+def test():
+    import pathlib
+
+    base_dir = "logs/example_prompts/generation"
+    pathlib.Path(base_dir).mkdir(parents=True, exist_ok=True)
+
+    for lmstyle in LMStyle:
+        generation_problem = CodeGenerationProblem(
+            "title",
+            "question-content",
+            "leetcode",
+            "question_id",
+            "contest_id",
+            "contest_date",
+            "",
+            "easy",
+            "[]",
+            "[]",
+            "{}",
+        )
+        prompt1 = format_prompt_generation(generation_problem, lmstyle)
+        with open(f"{base_dir}/{lmstyle}_1.txt", "w") as f:
+            try:
+                f.write(prompt1)
+            except TypeError:
+                f.write(json.dumps(prompt1))
+
+        generation_problem.starter_code = "starter code"
+        prompt2 = format_prompt_generation(generation_problem, lmstyle)
+        with open(f"{base_dir}/{lmstyle}_2.txt", "w") as f:
+            try:
+                f.write(prompt2)
+            except TypeError:
+                f.write(json.dumps(prompt2))
+
+
+if __name__ == "__main__":
+    test()
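A similar hypothetical sketch for the generation side (not from the package). Note that the declared "-> str" annotation on format_prompt_generation is loose in practice: chat styles return a message list, Claude3 returns a (system, messages) tuple, and completion styles return a plain string, so callers have to branch on the result:

from livecodebench.lm_styles import LMStyle
from livecodebench.prompts.code_generation import format_prompt_generation

problem = ...  # a CodeGenerationProblem from the benchmark loader (construction omitted)
out = format_prompt_generation(problem, LMStyle.OpenAIChat)
if isinstance(out, str):
    prompt_text = out  # completion-style models (Gemini, DeepSeekCodeInstruct, ...)
elif isinstance(out, tuple):
    system, messages = out  # Claude3-style: separate system prompt and user messages
else:
    chat_messages = out  # chat-style models (OpenAIChat, DeepSeekAPI, MistralWeb, ...)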
livecodebench/prompts/few_shot_examples/generation/func.json
@@ -0,0 +1,12 @@
+[
+    {
+        "question": "You are given a 0-indexed array of positive integers nums. Find the number of triplets (i, j, k) that meet the following conditions:\n\n0 <= i < j < k < nums.length\nnums[i], nums[j], and nums[k] are pairwise distinct.\n\t\nIn other words, nums[i] != nums[j], nums[i] != nums[k], and nums[j] != nums[k].\n\n\n\nReturn the number of triplets that meet the conditions.\n \nExample 1:\n\nInput: nums = [4,4,2,4,3]\nOutput: 3\nExplanation: The following triplets meet the conditions:\n- (0, 2, 4) because 4 != 2 != 3\n- (1, 2, 4) because 4 != 2 != 3\n- (2, 3, 4) because 2 != 4 != 3\nSince there are 3 triplets, we return 3.\nNote that (2, 0, 4) is not a valid triplet because 2 > 0.\n\nExample 2:\n\nInput: nums = [1,1,1,1,1]\nOutput: 0\nExplanation: No triplets meet the conditions so we return 0.\n\n \nConstraints:\n\n3 <= nums.length <= 100\n1 <= nums[i] <= 1000\n\n",
+        "sample_code": "class Solution:\n    def unequalTriplets(self, nums: List[int]) -> int:\n        ",
+        "answer": "class Solution:\n    def unequalTriplets(self, a: List[int]) -> int:\n        ans = 0\n        n = len(a)\n        for i in range(n):\n            for j in range(i + 1, n):\n                for k in range(j + 1, n):\n                    ans += len({a[i], a[j], a[k]}) == 3\n        return ans"
+    },
+    {
+        "question": "You are given two strings s and t consisting of only lowercase English letters.\nReturn the minimum number of characters that need to be appended to the end of s so that t becomes a subsequence of s.\nA subsequence is a string that can be derived from another string by deleting some or no characters without changing the order of the remaining characters.\n \nExample 1:\n\nInput: s = \"coaching\", t = \"coding\"\nOutput: 4\nExplanation: Append the characters \"ding\" to the end of s so that s = \"coachingding\".\nNow, t is a subsequence of s (\"coachingding\").\nIt can be shown that appending any 3 characters to the end of s will never make t a subsequence.\n\nExample 2:\n\nInput: s = \"abcde\", t = \"a\"\nOutput: 0\nExplanation: t is already a subsequence of s (\"abcde\").\n\nExample 3:\n\nInput: s = \"z\", t = \"abcde\"\nOutput: 5\nExplanation: Append the characters \"abcde\" to the end of s so that s = \"zabcde\".\nNow, t is a subsequence of s (\"zabcde\").\nIt can be shown that appending any 4 characters to the end of s will never make t a subsequence.\n\n \nConstraints:\n\n1 <= s.length, t.length <= 10^5\ns and t consist only of lowercase English letters.\n\n",
+        "sample_code": "class Solution:\n    def appendCharacters(self, s: str, t: str) -> int:\n        ",
+        "answer": "class Solution:\n    def appendCharacters(self, s: str, t: str) -> int:\n        i = 0\n        for char in s:\n            if i < len(t) and char == t[i]:\n                i += 1\n        return len(t) - i"
+    }
+]
livecodebench/prompts/few_shot_examples/generation/stdin.json
@@ -0,0 +1,10 @@
+[
+    {
+        "question": "You have $n$ gifts and you want to give all of them to children. Of course, you don't want to offend anyone, so all gifts should be equal between each other. The $i$-th gift consists of $a_i$ candies and $b_i$ oranges.\n\nDuring one move, you can choose some gift $1 \\le i \\le n$ and do one of the following operations:\n\n eat exactly one candy from this gift (decrease $a_i$ by one); eat exactly one orange from this gift (decrease $b_i$ by one); eat exactly one candy and exactly one orange from this gift (decrease both $a_i$ and $b_i$ by one). \n\nOf course, you can not eat a candy or orange if it's not present in the gift (so neither $a_i$ nor $b_i$ can become less than zero).\n\nAs said above, all gifts should be equal. This means that after some sequence of moves the following two conditions should be satisfied: $a_1 = a_2 = \\dots = a_n$ and $b_1 = b_2 = \\dots = b_n$ (and $a_i$ equals $b_i$ is not necessary).\n\nYour task is to find the minimum number of moves required to equalize all the given gifts.\n\nYou have to answer $t$ independent test cases.\n\n\n-----Input-----\n\nThe first line of the input contains one integer $t$ ($1 \\le t \\le 1000$) \u2014 the number of test cases. Then $t$ test cases follow.\n\nThe first line of the test case contains one integer $n$ ($1 \\le n \\le 50$) \u2014 the number of gifts. The second line of the test case contains $n$ integers $a_1, a_2, \\dots, a_n$ ($1 \\le a_i \\le 10^9$), where $a_i$ is the number of candies in the $i$-th gift. The third line of the test case contains $n$ integers $b_1, b_2, \\dots, b_n$ ($1 \\le b_i \\le 10^9$), where $b_i$ is the number of oranges in the $i$-th gift.\n\n\n-----Output-----\n\nFor each test case, print one integer: the minimum number of moves required to equalize all the given gifts.\n\n\n-----Example-----\nInput\n5\n3\n3 5 6\n3 2 3\n5\n1 2 3 4 5\n5 4 3 2 1\n3\n1 1 1\n2 2 2\n6\n1 1000000000 1000000000 1000000000 1000000000 1000000000\n1 1 1 1 1 1\n3\n10 12 8\n7 5 4\n\nOutput\n6\n16\n0\n4999999995\n7\n\n\n\n-----Note-----\n\nIn the first test case of the example, we can perform the following sequence of moves:\n\n choose the first gift and eat one orange from it, so $a = [3, 5, 6]$ and $b = [2, 2, 3]$; choose the second gift and eat one candy from it, so $a = [3, 4, 6]$ and $b = [2, 2, 3]$; choose the second gift and eat one candy from it, so $a = [3, 3, 6]$ and $b = [2, 2, 3]$; choose the third gift and eat one candy and one orange from it, so $a = [3, 3, 5]$ and $b = [2, 2, 2]$; choose the third gift and eat one candy from it, so $a = [3, 3, 4]$ and $b = [2, 2, 2]$; choose the third gift and eat one candy from it, so $a = [3, 3, 3]$ and $b = [2, 2, 2]$.",
+        "answer": "def minimum_moves(t, test_cases):\n    for _ in range(t):\n        n = test_cases[_][0]\n        candies = test_cases[_][1]\n        oranges = test_cases[_][2]\n        min_candies = min(candies)\n        min_oranges = min(oranges)\n        ans = 0\n        for i in range(n):\n            ans += max(candies[i] - min_candies, oranges[i] - min_oranges)\n        print(ans)\n\n\ndef main():\n    t = int(input())\n    test_cases = []\n    for _ in range(t):\n        n = int(input())\n        candies = list(map(int, input().split()))\n        oranges = list(map(int, input().split()))\n        test_cases.append((n, candies, oranges))\n    minimum_moves(t, test_cases)\n\n\nmain()\n"
+    },
+    {
+        "question": "Let's call a string a phone number if it has length 11 and fits the pattern \"8xxxxxxxxxx\", where each \"x\" is replaced by a digit.\n\nFor example, \"80123456789\" and \"80000000000\" are phone numbers, while \"8012345678\" and \"79000000000\" are not.\n\nYou have n cards with digits, and you want to use them to make as many phone numbers as possible. Each card must be used in at most one phone number, and you don't have to use all cards. The phone numbers do not necessarily have to be distinct.\n\nInput\n\nThe first line contains an integer n \u2014 the number of cards with digits that you have (1 \u2264 n \u2264 100).\n\nThe second line contains a string of n digits (characters \"0\", \"1\", ..., \"9\") s_1, s_2, \u2026, s_n. The string will not contain any other characters, such as leading or trailing spaces.\n\nOutput\n\nIf at least one phone number can be made from these cards, output the maximum number of phone numbers that can be made. Otherwise, output 0.\n\nExamples\n\nInput\n\n11\n00000000008\n\n\nOutput\n\n1\n\n\nInput\n\n22\n0011223344556677889988\n\n\nOutput\n\n2\n\n\nInput\n\n11\n31415926535\n\n\nOutput\n\n0\n\nNote\n\nIn the first example, one phone number, \"8000000000\", can be made from these cards.\n\nIn the second example, you can make two phone numbers from the cards, for example, \"80123456789\" and \"80123456789\".\n\nIn the third example you can't make any phone number from the given cards.",
+        "answer": "def count_phone_numbers(num_cards, card_digits):\n    count_eights = card_digits.count(\"8\")\n    max_phone_numbers = num_cards // 11\n    max_possible = min(count_eights, max_phone_numbers)\n    return max_possible\n\ndef main():\n    num_cards = int(input())\n    card_digits = input().strip()\n    max_possible = count_phone_numbers(num_cards, card_digits)\n    print(max_possible)\n\nmain()"
+    }
+]
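These two JSON files are the few-shot pools that prompts/code_generation.py loads at import time: func.json backs problems with starter code, stdin.json backs stdin/stdout problems. A sketch of the prompt that get_base_model_question_template_answer assembles from them for a stdin-style problem (question is hypothetical; only its .question_content and .starter_code attributes are read):

from livecodebench.prompts.code_generation import get_base_model_question_template_answer

question = ...  # a CodeGenerationProblem without starter code (construction omitted)
prompt = get_base_model_question_template_answer(question)
# Resulting layout:
#   ### Question
#   <stdin.json[0]["question"]>
#
#   ### Answer
#
#   <stdin.json[0]["answer"]>
#
#   ### Question
#   <question.question_content>
#
#   ### Answer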