nvidia-livecodebench 25.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- core_evals/livecodebench/__init__.py +1 -0
- core_evals/livecodebench/framework.yml +233 -0
- core_evals/livecodebench/framework_entrypoint.py +28 -0
- core_evals/livecodebench/output.py +51 -0
- livecodebench/__init__.py +0 -0
- livecodebench/benchmarks/__init__.py +31 -0
- livecodebench/benchmarks/code_execution.py +85 -0
- livecodebench/benchmarks/code_generation.py +160 -0
- livecodebench/benchmarks/test_output_prediction.py +90 -0
- livecodebench/benchmarks/utils.py +50 -0
- livecodebench/evaluation/__init__.py +24 -0
- livecodebench/evaluation/compute_code_execution_metrics.py +73 -0
- livecodebench/evaluation/compute_code_generation_metrics.py +278 -0
- livecodebench/evaluation/compute_scores.py +172 -0
- livecodebench/evaluation/compute_test_output_prediction_metrics.py +125 -0
- livecodebench/evaluation/metric.py +28 -0
- livecodebench/evaluation/old_results_check.py +91 -0
- livecodebench/evaluation/pass_k_utils.py +84 -0
- livecodebench/evaluation/testing_util.py +574 -0
- livecodebench/evaluation/utils_execute.py +285 -0
- livecodebench/lm_styles.py +581 -0
- livecodebench/prompts/__init__.py +22 -0
- livecodebench/prompts/code_execution.py +201 -0
- livecodebench/prompts/code_generation.py +372 -0
- livecodebench/prompts/few_shot_examples/generation/func.json +12 -0
- livecodebench/prompts/few_shot_examples/generation/stdin.json +10 -0
- livecodebench/prompts/self_repair.py +370 -0
- livecodebench/prompts/test_output_prediction.py +327 -0
- livecodebench/runner/__init__.py +0 -0
- livecodebench/runner/base_runner.py +188 -0
- livecodebench/runner/claude3_runner.py +70 -0
- livecodebench/runner/claude_runner.py +69 -0
- livecodebench/runner/cohere_runner.py +71 -0
- livecodebench/runner/custom_evaluator.py +132 -0
- livecodebench/runner/deepseek_runner.py +87 -0
- livecodebench/runner/gemini_runner.py +111 -0
- livecodebench/runner/generic_oai_server_runner.py +104 -0
- livecodebench/runner/main.py +255 -0
- livecodebench/runner/mistral_runner.py +71 -0
- livecodebench/runner/oai_runner.py +93 -0
- livecodebench/runner/parser.py +174 -0
- livecodebench/runner/runner_utils.py +62 -0
- livecodebench/runner/scenario_router.py +239 -0
- livecodebench/runner/vllm_runner.py +82 -0
- livecodebench/utils/__init__.py +0 -0
- livecodebench/utils/extraction_utils.py +82 -0
- livecodebench/utils/multiprocess.py +250 -0
- livecodebench/utils/path_utils.py +58 -0
- livecodebench/utils/scenarios.py +26 -0
- livecodebench/utils/seed_generator.py +44 -0
- nvidia_livecodebench-25.8.dist-info/METADATA +518 -0
- nvidia_livecodebench-25.8.dist-info/RECORD +55 -0
- nvidia_livecodebench-25.8.dist-info/WHEEL +4 -0
- nvidia_livecodebench-25.8.dist-info/entry_points.txt +4 -0
- nvidia_livecodebench-25.8.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
framework:
|
|
2
|
+
name: livecodebench
|
|
3
|
+
pkg_name: livecodebench
|
|
4
|
+
full_name: LiveCodeBench
|
|
5
|
+
description: Holistic and Contamination Free Evaluation of Large Language Models for Code. Paper https://arxiv.org/pdf/2403.07974
|
|
6
|
+
url: https://github.com/LiveCodeBench/LiveCodeBench
|
|
7
|
+
source: https://gitlab-master.nvidia.com/dl/JoC/competitive_evaluation/LiveCodeBench
|
|
8
|
+
defaults:
|
|
9
|
+
command: >
|
|
10
|
+
{% if target.api_endpoint.api_key is not none %}
|
|
11
|
+
export API_KEY=${{target.api_endpoint.api_key}} &&
|
|
12
|
+
{% endif %}
|
|
13
|
+
livecodebench --model {{target.api_endpoint.model_id}} \
|
|
14
|
+
--scenario {{config.params.task}} \
|
|
15
|
+
--release_version {{config.params.extra.release_version}} \
|
|
16
|
+
--url {{target.api_endpoint.url}} \
|
|
17
|
+
--temperature {{config.params.temperature}} \
|
|
18
|
+
--top_p {{config.params.top_p}} \
|
|
19
|
+
--evaluate \
|
|
20
|
+
--codegen_n {{config.params.extra.n_samples}} \
|
|
21
|
+
--use_cache \
|
|
22
|
+
--cache_batch_size {{config.params.extra.cache_batch_size}} \
|
|
23
|
+
--num_process_evaluate {{config.params.extra.num_process_evaluate}} \
|
|
24
|
+
--n {{config.params.extra.n_samples}} \
|
|
25
|
+
--max_tokens {{config.params.max_new_tokens}} \
|
|
26
|
+
--out_dir {{config.output_dir}} \
|
|
27
|
+
--multiprocess {{config.params.parallelism}} \
|
|
28
|
+
--max_retries {{config.params.max_retries}} \
|
|
29
|
+
--timeout {{config.params.request_timeout}}{% if config.params.extra.start_date is not none %} --start_date {{config.params.extra.start_date}} {% endif %} {% if config.params.extra.end_date is not none %} --end_date {{config.params.extra.end_date}} {% endif %} {% if config.params.extra.support_system_role %} --support_system_role {% endif %} {% if config.params.limit_samples is not none %} --first_n {{config.params.limit_samples}}{% endif %}{% if config.params.extra.cot_code_execution == true %} --cot_code_execution {% endif %}{% if config.params.extra.args is defined %} {{ config.params.extra.args }} {% endif %}
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
config:
|
|
34
|
+
params:
|
|
35
|
+
limit_samples: null
|
|
36
|
+
max_new_tokens: 4096
|
|
37
|
+
temperature: 0.0
|
|
38
|
+
top_p: 0.00001
|
|
39
|
+
parallelism: 10
|
|
40
|
+
max_retries: 5
|
|
41
|
+
request_timeout: 60
|
|
42
|
+
extra:
|
|
43
|
+
n_samples: 10
|
|
44
|
+
num_process_evaluate: 5
|
|
45
|
+
cache_batch_size: 10
|
|
46
|
+
support_system_role: false
|
|
47
|
+
start_date: null # format: YYYY-MM-DD
|
|
48
|
+
end_date: null # format: YYYY-MM-DD
|
|
49
|
+
cot_code_execution: false
|
|
50
|
+
|
|
51
|
+
target:
|
|
52
|
+
api_endpoint: {} # required to add: url, model_id, api_key
|
|
53
|
+
evaluations:
|
|
54
|
+
- name: codegeneration_release_latest
|
|
55
|
+
description: Code generation latest version
|
|
56
|
+
defaults:
|
|
57
|
+
config:
|
|
58
|
+
type: codegeneration_release_latest
|
|
59
|
+
supported_endpoint_types:
|
|
60
|
+
- chat
|
|
61
|
+
params:
|
|
62
|
+
task: codegeneration
|
|
63
|
+
extra:
|
|
64
|
+
release_version: release_latest
|
|
65
|
+
- name: codegeneration_release_v1
|
|
66
|
+
description: The initial release of the dataset with problems released between May 2023 and Mar 2024 containing 400 problems.
|
|
67
|
+
defaults:
|
|
68
|
+
config:
|
|
69
|
+
type: codegeneration_release_v1
|
|
70
|
+
supported_endpoint_types:
|
|
71
|
+
- chat
|
|
72
|
+
params:
|
|
73
|
+
task: codegeneration
|
|
74
|
+
extra:
|
|
75
|
+
release_version: release_v1
|
|
76
|
+
|
|
77
|
+
- name: codegeneration_release_v2
|
|
78
|
+
description: The updated release of the dataset with problems released between May 2023 and May 2024 containing 511 problems.
|
|
79
|
+
defaults:
|
|
80
|
+
config:
|
|
81
|
+
type: codegeneration_release_v2
|
|
82
|
+
supported_endpoint_types:
|
|
83
|
+
- chat
|
|
84
|
+
params:
|
|
85
|
+
task: codegeneration
|
|
86
|
+
extra:
|
|
87
|
+
release_version: release_v2
|
|
88
|
+
|
|
89
|
+
- name: codegeneration_release_v3
|
|
90
|
+
description: The updated release of the dataset with problems released between May 2023 and Jul 2024 containing 612 problems.
|
|
91
|
+
defaults:
|
|
92
|
+
config:
|
|
93
|
+
type: codegeneration_release_v3
|
|
94
|
+
supported_endpoint_types:
|
|
95
|
+
- chat
|
|
96
|
+
params:
|
|
97
|
+
task: codegeneration
|
|
98
|
+
extra:
|
|
99
|
+
release_version: release_v3
|
|
100
|
+
|
|
101
|
+
- name: codegeneration_release_v4
|
|
102
|
+
description: The updated release of the dataset with problems released between May 2023 and Sep 2024 containing 713 problems.
|
|
103
|
+
defaults:
|
|
104
|
+
config:
|
|
105
|
+
type: codegeneration_release_v4
|
|
106
|
+
supported_endpoint_types:
|
|
107
|
+
- chat
|
|
108
|
+
params:
|
|
109
|
+
task: codegeneration
|
|
110
|
+
extra:
|
|
111
|
+
release_version: release_v4
|
|
112
|
+
|
|
113
|
+
- name: codegeneration_release_v5
|
|
114
|
+
description: The updated release of the dataset with problems released between May 2023 and Jan 2025 containing 880 problems.
|
|
115
|
+
defaults:
|
|
116
|
+
config:
|
|
117
|
+
type: codegeneration_release_v5
|
|
118
|
+
supported_endpoint_types:
|
|
119
|
+
- chat
|
|
120
|
+
params:
|
|
121
|
+
task: codegeneration
|
|
122
|
+
extra:
|
|
123
|
+
release_version: release_v5
|
|
124
|
+
- name: codegeneration_release_v6
|
|
125
|
+
description: The updated release of the dataset with problems released between May 2023 and Apr 2025 containing 1055 problems.
|
|
126
|
+
defaults:
|
|
127
|
+
config:
|
|
128
|
+
type: codegeneration_release_v6
|
|
129
|
+
supported_endpoint_types:
|
|
130
|
+
- chat
|
|
131
|
+
params:
|
|
132
|
+
task: codegeneration
|
|
133
|
+
extra:
|
|
134
|
+
release_version: release_v6
|
|
135
|
+
- name: codegeneration_notfast
|
|
136
|
+
description: Not fast version of code generation (v2).
|
|
137
|
+
defaults:
|
|
138
|
+
config:
|
|
139
|
+
type: codegeneration_notfast
|
|
140
|
+
supported_endpoint_types:
|
|
141
|
+
- chat
|
|
142
|
+
params:
|
|
143
|
+
task: codegeneration
|
|
144
|
+
extra:
|
|
145
|
+
args: "--not_fast"
|
|
146
|
+
|
|
147
|
+
- name: testoutputprediction
|
|
148
|
+
description: Solve the natural language task on a specified input, evaluating the ability to generate testing outputs. The model is given the natural language problem description and an input, and the output should be the output for the problem.
|
|
149
|
+
defaults:
|
|
150
|
+
config:
|
|
151
|
+
type: testoutputprediction
|
|
152
|
+
supported_endpoint_types:
|
|
153
|
+
- chat
|
|
154
|
+
params:
|
|
155
|
+
task: testoutputprediction
|
|
156
|
+
extra:
|
|
157
|
+
release_version: release_latest
|
|
158
|
+
|
|
159
|
+
- name: codeexecution_v2
|
|
160
|
+
description: '"Execute" a program on an input, evaluating code comprehension ability. The model is given a program and an input, and the output should be the result.'
|
|
161
|
+
defaults:
|
|
162
|
+
config:
|
|
163
|
+
type: codeexecution_v2
|
|
164
|
+
supported_endpoint_types:
|
|
165
|
+
- chat
|
|
166
|
+
params:
|
|
167
|
+
task: codeexecution
|
|
168
|
+
extra:
|
|
169
|
+
release_version: release_v2
|
|
170
|
+
- name: codeexecution_v2_cot
|
|
171
|
+
description: '"CoT. Execute" a program on an input, evaluating code comprehension ability. The model is given a program and an input, and the output should be the result.'
|
|
172
|
+
defaults:
|
|
173
|
+
config:
|
|
174
|
+
type: codeexecution_v2_cot
|
|
175
|
+
supported_endpoint_types:
|
|
176
|
+
- chat
|
|
177
|
+
params:
|
|
178
|
+
task: codeexecution
|
|
179
|
+
extra:
|
|
180
|
+
release_version: release_v2
|
|
181
|
+
cot_code_execution: true
|
|
182
|
+
# NOTE(dfridman): same as `livecodebench_0724_0125`. Leaving it as legacy.
|
|
183
|
+
- name: AA_code_generation
|
|
184
|
+
description: "AA code generation evaluating code comprehension ability. The model is given a program and an input, and the output should be the result."
|
|
185
|
+
defaults:
|
|
186
|
+
config:
|
|
187
|
+
type: AA_codegeneration
|
|
188
|
+
supported_endpoint_types:
|
|
189
|
+
- chat
|
|
190
|
+
params:
|
|
191
|
+
task: codegeneration
|
|
192
|
+
extra:
|
|
193
|
+
release_version: release_v5
|
|
194
|
+
n_samples: 3
|
|
195
|
+
start_date: 2024-07-01
|
|
196
|
+
end_date: 2025-01-01
|
|
197
|
+
|
|
198
|
+
- name: livecodebench_0724_0125
|
|
199
|
+
description: >-
|
|
200
|
+
- Code generation evaluating code comprehension ability. The model is given a program and an input, and the output should be the result.
|
|
201
|
+
- The data period and sampling parameters used by Artificial Analysis (https://artificialanalysis.ai/methodology/intelligence-benchmarking)
|
|
202
|
+
defaults:
|
|
203
|
+
config:
|
|
204
|
+
type: livecodebench_0724_0125
|
|
205
|
+
supported_endpoint_types:
|
|
206
|
+
- chat
|
|
207
|
+
params:
|
|
208
|
+
task: codegeneration
|
|
209
|
+
temperature: 0.0
|
|
210
|
+
max_new_tokens: 4096
|
|
211
|
+
extra:
|
|
212
|
+
release_version: release_v5
|
|
213
|
+
n_samples: 3
|
|
214
|
+
start_date: 2024-07-01
|
|
215
|
+
end_date: 2025-01-01
|
|
216
|
+
- name: livecodebench_0824_0225
|
|
217
|
+
description: >-
|
|
218
|
+
- Code generation evaluating code comprehension ability. The model is given a program and an input, and the output should be the result.
|
|
219
|
+
- The data period and sampling parameters used by NeMo Alignment team.
|
|
220
|
+
defaults:
|
|
221
|
+
config:
|
|
222
|
+
type: livecodebench_0824_0225
|
|
223
|
+
supported_endpoint_types:
|
|
224
|
+
- chat
|
|
225
|
+
params:
|
|
226
|
+
task: codegeneration
|
|
227
|
+
temperature: 0.0
|
|
228
|
+
max_new_tokens: 4096
|
|
229
|
+
extra:
|
|
230
|
+
release_version: release_v5
|
|
231
|
+
n_samples: 3
|
|
232
|
+
start_date: 2024-08-01
|
|
233
|
+
end_date: 2025-02-01
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
|
|
16
|
+
import warnings
|
|
17
|
+
from nvidia_eval_commons.api.run import run_eval
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def main():
    """Deprecated console-script entry point; delegates to `run_eval`.

    Kept only so the legacy `framework_entrypoint:main` entry point in
    `pyproject.toml` keeps working; new callers should use the
    `eval-factory` command instead.
    """
    warnings.warn(
        "This command is deprecated and will be removed in the next release. Please use the `eval-factory` command instead.",
        DeprecationWarning,
        stacklevel=2,
    )

    # Forward to the shared nvidia_eval_commons runner, which does all the work.
    run_eval()
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
|
|
16
|
+
import json
|
|
17
|
+
import pathlib
|
|
18
|
+
import re
|
|
19
|
+
|
|
20
|
+
from nvidia_eval_commons.api.api_dataclasses import EvaluationResult
|
|
21
|
+
|
|
22
|
+
# This is the only required function
|
|
23
|
+
# This is the only required function
def parse_output(output_dir: str) -> EvaluationResult:
    """Parse the single LiveCodeBench `*eval.json` result file under *output_dir*.

    Args:
        output_dir: Directory searched recursively; must contain exactly one
            `*eval.json` file produced by a single evaluation run.

    Returns:
        EvaluationResult with one task — named after the scenario encoded in
        the result file's path (`Scenario.<name>_...`) — holding one entry per
        metric, each paired with its `<metric>_stderr` statistic.

    Raises:
        FileNotFoundError: if no `*eval.json` file is found.
        ValueError: if multiple result files are found, or the scenario name
            cannot be parsed from the file path.
    """
    result_files = list(pathlib.Path(output_dir).rglob("*eval.json"))
    if not result_files:
        raise FileNotFoundError("Failed to find `*eval.json` with metric.")
    if len(result_files) > 1:
        # Fixed message: previously said "*eval.json.json".
        raise ValueError(
            "More than 1 `*eval.json` files found. `output_dir` must contain a single evaluation."
        )
    # Result file paths embed the scenario as e.g. `Scenario.codegeneration_...`.
    match = re.search(r"Scenario\.([^_]+)_", str(result_files[0]))
    if match is None:
        # Previously this crashed with AttributeError on `None.group(1)`.
        raise ValueError(
            f"Could not determine scenario name from result file path: {result_files[0]}"
        )
    task_name = match.group(1)

    with open(result_files[0]) as fp:
        results = json.load(fp)[0]
    # Drop per-sample details; only aggregate metrics are reported.
    results.pop('detail', None)

    metrics = {}
    for metric_name, value in results.items():
        if metric_name.endswith("_stderr"):
            # stderr entries are attached to their base metric below.
            continue
        metrics[metric_name] = dict(
            scores={metric_name: dict(value=value, stats={"stderr": results[f"{metric_name}_stderr"]})}
        )
    tasks = {task_name: dict(metrics=metrics)}
    return EvaluationResult(groups=tasks, tasks=tasks)
|
|
File without changes
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
#
|
|
16
|
+
# Original Copyright 2025 LiveCodeBench
|
|
17
|
+
# For the original license and copyright information, see the LICENSE file in this repository.
|
|
18
|
+
|
|
19
|
+
from livecodebench.benchmarks.code_generation import (
|
|
20
|
+
CodeGenerationProblem,
|
|
21
|
+
load_code_generation_dataset,
|
|
22
|
+
load_code_generation_dataset_not_fast,
|
|
23
|
+
)
|
|
24
|
+
from livecodebench.benchmarks.test_output_prediction import (
|
|
25
|
+
TestOutputPredictionProblem,
|
|
26
|
+
load_test_prediction_dataset,
|
|
27
|
+
)
|
|
28
|
+
from livecodebench.benchmarks.code_execution import (
|
|
29
|
+
CodeExecutionProblem,
|
|
30
|
+
load_code_execution_dataset,
|
|
31
|
+
)
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
#
|
|
16
|
+
# Original Copyright 2025 LiveCodeBench
|
|
17
|
+
# For the original license and copyright information, see the LICENSE file in this repository.
|
|
18
|
+
|
|
19
|
+
from datetime import datetime
|
|
20
|
+
from dataclasses import dataclass
|
|
21
|
+
|
|
22
|
+
from datasets import load_dataset
|
|
23
|
+
from livecodebench.benchmarks.utils import filter_dataset_by_date
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass
class CodeExecutionProblem:
    """One LiveCodeBench code-execution problem: a program plus a single
    input whose expected output the model must predict."""

    question_id: str
    contest_id: str
    contest_date: datetime
    difficulty: str
    function_name: str
    code: str
    input: str
    output: str
    id: str
    problem_id: str
    numsteps: int

    def __post_init__(self):
        # No field normalization needed for this dataset; hook kept for
        # structural parity with the other benchmark problem classes.
        pass

    def insert_output(self, output_list: list[str], pred_list: list[str]) -> dict:
        """Serialize the problem together with raw model outputs and parsed predictions."""
        record = {
            "question_id": self.question_id,
            "contest_id": self.contest_id,
            "contest_date": self.contest_date.isoformat(),
            "difficulty": self.difficulty,
            "function_name": self.function_name,
            "code": self.code,
            "input": self.input,
            "output": self.output,
            "id": self.id,
            "problem_id": self.problem_id,
            "numsteps": self.numsteps,
        }
        record["output_list"] = output_list
        record["pred_list"] = pred_list
        return record

    def insert_output_evaluation(
        self, output_list: list[str], code_list: list[str], graded_list: list[bool]
    ) -> dict:
        """Like `insert_output`, with per-sample grades and a pass@1 score attached."""
        record = self.insert_output(output_list, code_list)
        record["graded_list"] = graded_list
        record["pass@1"] = graded_list.count(True) / len(graded_list)
        return record

    def get_evaluation_sample(self) -> dict:
        """Minimal dict consumed by the execution-evaluation pipeline."""
        return {"code": self.code, "input": self.input, "output": self.output}
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def load_code_execution_dataset(release_version: str = "release_v1", start_date: str | None = None, end_date: str | None = None) -> list:
    """Load the LiveCodeBench code-execution test split as `CodeExecutionProblem`s.

    Args:
        release_version: Accepted for signature parity with the other loaders
            but currently unused — the HF dataset `livecodebench/execution-v2`
            is always loaded. TODO confirm this is intentional.
        start_date: Optional lower date bound (format YYYY-MM-DD per framework.yml),
            applied by `filter_dataset_by_date`.
        end_date: Optional upper date bound (YYYY-MM-DD).

    Returns:
        List of `CodeExecutionProblem` instances.
    """
    dataset = load_dataset("livecodebench/execution-v2", split="test")
    # Date filtering happens on the raw HF dataset, before dataclass conversion.
    dataset = filter_dataset_by_date(dataset, start_date=start_date, end_date=end_date)
    dataset = [CodeExecutionProblem(**p) for p in dataset]  # type: ignore
    print(f"Loaded {len(dataset)} problems")
    return dataset
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
if __name__ == "__main__":
    # Smoke test: load the dataset with default arguments when run as a script.
    dataset = load_code_execution_dataset()
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
#
|
|
16
|
+
# Original Copyright 2025 LiveCodeBench
|
|
17
|
+
# For the original license and copyright information, see the LICENSE file in this repository.
|
|
18
|
+
|
|
19
|
+
import json
|
|
20
|
+
import zlib
|
|
21
|
+
import pickle
|
|
22
|
+
import base64
|
|
23
|
+
from enum import Enum
|
|
24
|
+
from datetime import datetime
|
|
25
|
+
from dataclasses import dataclass
|
|
26
|
+
|
|
27
|
+
from datasets import load_dataset
|
|
28
|
+
from livecodebench.benchmarks.utils import filter_dataset_by_date
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class Platform(Enum):
    """Competitive-programming site a problem was sourced from."""

    LEETCODE = "leetcode"
    CODEFORCES = "codeforces"
    ATCODER = "atcoder"
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class Difficulty(Enum):
    """Problem difficulty label as provided by the source dataset."""

    EASY = "easy"
    MEDIUM = "medium"
    HARD = "hard"
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class TestType(Enum):
    """How a test case exercises the solution: via stdin/stdout or a function call."""

    STDIN = "stdin"
    FUNCTIONAL = "functional"
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@dataclass
class Test:
    """A single test case; `input` and `output` are kept as raw strings."""

    input: str
    output: str
    testtype: TestType  # coerced from the dataset's raw string in __post_init__

    def __post_init__(self):
        # Dataset rows provide testtype as a plain string; normalize to the enum.
        self.testtype = TestType(self.testtype)
        # Decoding of functional test payloads is disabled upstream — TODO
        # confirm whether functional inputs/outputs should stay JSON strings.
        # if self.testtype == TestType.FUNCTIONAL:
        #     self.input = json.loads(self.input)
        #     self.output = json.loads(self.output)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
@dataclass
class CodeGenerationProblem:
    """One LiveCodeBench code-generation problem.

    HF dataset rows arrive with several fields encoded as strings: platform,
    difficulty and contest date as plain strings; test cases and metadata as
    JSON; and (for large private suites) base64(zlib(pickle(json))).
    `__post_init__` decodes all of them in place.
    """

    question_title: str
    question_content: str
    platform: Platform
    question_id: str
    contest_id: str
    contest_date: datetime
    starter_code: str
    difficulty: Difficulty
    public_test_cases: list[Test]
    private_test_cases: list[Test]
    metadata: dict

    def __post_init__(self):
        """Decode raw dataset fields into their rich in-memory types."""
        self.platform = Platform(self.platform)
        self.difficulty = Difficulty(self.difficulty)
        self.contest_date = datetime.fromisoformat(self.contest_date)

        self.public_test_cases = json.loads(self.public_test_cases)  # type: ignore
        self.public_test_cases = [Test(**t) for t in self.public_test_cases]

        try:
            # Some rows store private tests as plain JSON.
            self.private_test_cases = json.loads(self.private_test_cases)  # type: ignore
        except (json.JSONDecodeError, TypeError):
            # Fix: was a bare `except:`, which also swallowed KeyboardInterrupt
            # and SystemExit. Larger private suites ship compressed as
            # base64(zlib(pickle(json))).
            # NOTE: pickle.loads executes arbitrary code; acceptable only
            # because the data comes from the LiveCodeBench HF dataset.
            self.private_test_cases = json.loads(
                pickle.loads(
                    zlib.decompress(
                        base64.b64decode(self.private_test_cases.encode("utf-8"))  # type: ignore
                    )
                )
            )  # type: ignore
        self.private_test_cases = [Test(**t) for t in self.private_test_cases]

        self.metadata = json.loads(self.metadata)  # type: ignore

    def insert_output(self, output_list: list[str], code_list: list[str]) -> dict:
        """Serialize the problem plus raw model outputs and extracted code.

        Args:
            output_list: Raw model responses, one per sample.
            code_list: Code blocks extracted from each response.
        """
        return {
            "question_title": self.question_title,
            "question_content": self.question_content,
            "platform": self.platform.value,
            "question_id": self.question_id,
            "contest_id": self.contest_id,
            "contest_date": self.contest_date.isoformat(),
            "starter_code": self.starter_code,
            "difficulty": self.difficulty.value,
            "output_list": output_list,
            "code_list": code_list,
        }

    def insert_output_evaluation(
        self,
        output_list: list[str],
        code_list: list[str],
        graded_list: list[bool],
        **kwargs,
    ) -> dict:
        """Serialize the problem with grading results and a pass@1 score.

        Extra keyword arguments are copied into the returned record verbatim.
        """
        output = self.insert_output(output_list, code_list)
        output["graded_list"] = graded_list
        # Guard against an empty grade list instead of raising ZeroDivisionError.
        output["pass@1"] = graded_list.count(True) / len(graded_list) if graded_list else 0.0
        output.update(kwargs)
        return output

    def get_evaluation_sample(self):
        """Return the JSON `input_output` blob consumed by the test harness."""
        all_tests = self.public_test_cases + self.private_test_cases
        return {
            "input_output": json.dumps(
                {
                    "inputs": [t.input for t in all_tests],
                    "outputs": [t.output for t in all_tests],
                    "fn_name": self.metadata.get("func_name", None),
                }
            ),
        }
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def load_code_generation_dataset(release_version: str = "release_v1", start_date: str | None = None, end_date: str | None = None) -> list:
    """Load the LiveCodeBench code-generation (lite) split for a release tag.

    Args:
        release_version: Dataset `version_tag` (e.g. "release_v5", "release_latest").
        start_date: Optional lower date bound (format YYYY-MM-DD per framework.yml),
            applied by `filter_dataset_by_date`.
        end_date: Optional upper date bound (YYYY-MM-DD).

    Returns:
        List of `CodeGenerationProblem` instances.
    """
    # NOTE: trust_remote_code runs the dataset's loading script from the Hub.
    dataset = load_dataset("livecodebench/code_generation_lite", split="test", version_tag=release_version, trust_remote_code=True)
    dataset = filter_dataset_by_date(dataset, start_date=start_date, end_date=end_date)
    dataset = [CodeGenerationProblem(**p) for p in dataset]  # type: ignore
    print(f"Loaded {len(dataset)} problems")
    return dataset
|
|
149
|
+
|
|
150
|
+
def load_code_generation_dataset_not_fast(release_version: str = "release_v1", start_date: str | None = None, end_date: str | None = None) -> list:
    """Load the full (non-lite) LiveCodeBench code-generation dataset.

    Args:
        release_version: Accepted for signature parity with
            `load_code_generation_dataset` but currently unused — TODO confirm
            whether the full dataset should honor it.
        start_date: Optional lower date bound (YYYY-MM-DD).
        end_date: Optional upper date bound (YYYY-MM-DD).

    Returns:
        List of `CodeGenerationProblem` instances.
    """
    dataset = load_dataset("livecodebench/code_generation", split="test")
    dataset = filter_dataset_by_date(dataset, start_date=start_date, end_date=end_date)
    dataset = [CodeGenerationProblem(**p) for p in dataset]  # type: ignore
    print(f"Loaded {len(dataset)} problems")
    return dataset
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
if __name__ == "__main__":
    # Smoke test: load the default (release_v1) split when run directly.
    dataset = load_code_generation_dataset()
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
#
|
|
16
|
+
# Original Copyright 2025 LiveCodeBench
|
|
17
|
+
# For the original license and copyright information, see the LICENSE file in this repository.
|
|
18
|
+
|
|
19
|
+
import json
|
|
20
|
+
from datetime import datetime
|
|
21
|
+
from dataclasses import dataclass
|
|
22
|
+
|
|
23
|
+
from datasets import load_dataset
|
|
24
|
+
|
|
25
|
+
from livecodebench.benchmarks.utils import filter_dataset_by_date
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass
class Test:
    """A raw test case for output prediction.

    Unlike the code-generation variant, `testtype` stays a plain string
    (presumably "stdin"/"functional" — TODO confirm against the dataset).
    """

    input: str
    output: str
    testtype: str
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass
class TestOutputPredictionProblem:
    """One test-output-prediction problem: given the problem statement, the
    model must predict the expected output of the first test case."""

    question_title: str
    question_content: str
    question_id: str
    contest_id: str
    contest_date: datetime
    difficulty: str
    test: list[Test]
    starter_code: str
    function_name: str
    test_id: int

    def __post_init__(self):
        # The HF dataset stores `test` as a JSON-encoded list of test dicts.
        self.test = [Test(**case) for case in json.loads(self.test)]  # type: ignore

    def insert_output(self, output_list: list[str], pred_list: list[str]) -> dict:
        """Serialize the problem together with raw outputs and parsed predictions."""
        record = {
            "question_title": self.question_title,
            "question_content": self.question_content,
            "question_id": self.question_id,
            "contest_id": self.contest_id,
            "contest_date": self.contest_date.isoformat(),
            "difficulty": self.difficulty,
        }
        record["output_list"] = output_list
        record["pred_list"] = pred_list
        record["test_id"] = self.test_id
        record["function_name"] = self.function_name
        record["starter_code"] = self.starter_code
        return record

    def insert_output_evaluation(
        self, output_list: list[str], code_list: list[str], graded_list: list[bool]
    ) -> dict:
        """Like `insert_output`, with per-sample grades and a pass@1 score attached."""
        record = self.insert_output(output_list, code_list)
        record["graded_list"] = graded_list
        record["pass@1"] = graded_list.count(True) / len(graded_list)
        return record

    def get_evaluation_sample(self) -> dict:
        """Pair consumed by metric computation: problem text and the first test's expected output."""
        return {"input": self.question_content, "output": self.test[0].output}
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def load_test_prediction_dataset(release_version: str = "release_v1", start_date: str | None = None, end_date: str | None = None) -> list[TestOutputPredictionProblem]:
    """Load the LiveCodeBench test-output-prediction split.

    Args:
        release_version: Accepted for signature parity with the other loaders
            but currently unused — the HF dataset
            `livecodebench/test_generation` is always loaded. TODO confirm.
        start_date: Optional lower date bound (YYYY-MM-DD), applied by
            `filter_dataset_by_date`.
        end_date: Optional upper date bound (YYYY-MM-DD).

    Returns:
        List of `TestOutputPredictionProblem` instances.
    """
    dataset = load_dataset("livecodebench/test_generation", split="test")  # type: ignore
    dataset = filter_dataset_by_date(dataset, start_date=start_date, end_date=end_date)
    dataset = [TestOutputPredictionProblem(**d) for d in dataset]
    print(f"Loaded {len(dataset)} prediction problems")
    return dataset
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
if __name__ == "__main__":
    # Smoke test: load the dataset with default arguments when run as a script.
    dataset = load_test_prediction_dataset()
|