nvidia-livecodebench 25.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. core_evals/livecodebench/__init__.py +1 -0
  2. core_evals/livecodebench/framework.yml +233 -0
  3. core_evals/livecodebench/framework_entrypoint.py +28 -0
  4. core_evals/livecodebench/output.py +51 -0
  5. livecodebench/__init__.py +0 -0
  6. livecodebench/benchmarks/__init__.py +31 -0
  7. livecodebench/benchmarks/code_execution.py +85 -0
  8. livecodebench/benchmarks/code_generation.py +160 -0
  9. livecodebench/benchmarks/test_output_prediction.py +90 -0
  10. livecodebench/benchmarks/utils.py +50 -0
  11. livecodebench/evaluation/__init__.py +24 -0
  12. livecodebench/evaluation/compute_code_execution_metrics.py +73 -0
  13. livecodebench/evaluation/compute_code_generation_metrics.py +278 -0
  14. livecodebench/evaluation/compute_scores.py +172 -0
  15. livecodebench/evaluation/compute_test_output_prediction_metrics.py +125 -0
  16. livecodebench/evaluation/metric.py +28 -0
  17. livecodebench/evaluation/old_results_check.py +91 -0
  18. livecodebench/evaluation/pass_k_utils.py +84 -0
  19. livecodebench/evaluation/testing_util.py +574 -0
  20. livecodebench/evaluation/utils_execute.py +285 -0
  21. livecodebench/lm_styles.py +581 -0
  22. livecodebench/prompts/__init__.py +22 -0
  23. livecodebench/prompts/code_execution.py +201 -0
  24. livecodebench/prompts/code_generation.py +372 -0
  25. livecodebench/prompts/few_shot_examples/generation/func.json +12 -0
  26. livecodebench/prompts/few_shot_examples/generation/stdin.json +10 -0
  27. livecodebench/prompts/self_repair.py +370 -0
  28. livecodebench/prompts/test_output_prediction.py +327 -0
  29. livecodebench/runner/__init__.py +0 -0
  30. livecodebench/runner/base_runner.py +188 -0
  31. livecodebench/runner/claude3_runner.py +70 -0
  32. livecodebench/runner/claude_runner.py +69 -0
  33. livecodebench/runner/cohere_runner.py +71 -0
  34. livecodebench/runner/custom_evaluator.py +132 -0
  35. livecodebench/runner/deepseek_runner.py +87 -0
  36. livecodebench/runner/gemini_runner.py +111 -0
  37. livecodebench/runner/generic_oai_server_runner.py +104 -0
  38. livecodebench/runner/main.py +255 -0
  39. livecodebench/runner/mistral_runner.py +71 -0
  40. livecodebench/runner/oai_runner.py +93 -0
  41. livecodebench/runner/parser.py +174 -0
  42. livecodebench/runner/runner_utils.py +62 -0
  43. livecodebench/runner/scenario_router.py +239 -0
  44. livecodebench/runner/vllm_runner.py +82 -0
  45. livecodebench/utils/__init__.py +0 -0
  46. livecodebench/utils/extraction_utils.py +82 -0
  47. livecodebench/utils/multiprocess.py +250 -0
  48. livecodebench/utils/path_utils.py +58 -0
  49. livecodebench/utils/scenarios.py +26 -0
  50. livecodebench/utils/seed_generator.py +44 -0
  51. nvidia_livecodebench-25.8.dist-info/METADATA +518 -0
  52. nvidia_livecodebench-25.8.dist-info/RECORD +55 -0
  53. nvidia_livecodebench-25.8.dist-info/WHEEL +4 -0
  54. nvidia_livecodebench-25.8.dist-info/entry_points.txt +4 -0
  55. nvidia_livecodebench-25.8.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1 @@
1
+
@@ -0,0 +1,233 @@
1
+ framework:
2
+ name: livecodebench
3
+ pkg_name: livecodebench
4
+ full_name: LiveCodeBench
5
+ description: Holistic and Contamination Free Evaluation of Large Language Models for Code. Paper https://arxiv.org/pdf/2403.07974
6
+ url: https://github.com/LiveCodeBench/LiveCodeBench
7
+ source: https://gitlab-master.nvidia.com/dl/JoC/competitive_evaluation/LiveCodeBench
8
+ defaults:
9
+ command: >
10
+ {% if target.api_endpoint.api_key is not none %}
11
+ export API_KEY=${{target.api_endpoint.api_key}} &&
12
+ {% endif %}
13
+ livecodebench --model {{target.api_endpoint.model_id}} \
14
+ --scenario {{config.params.task}} \
15
+ --release_version {{config.params.extra.release_version}} \
16
+ --url {{target.api_endpoint.url}} \
17
+ --temperature {{config.params.temperature}} \
18
+ --top_p {{config.params.top_p}} \
19
+ --evaluate \
20
+ --codegen_n {{config.params.extra.n_samples}} \
21
+ --use_cache \
22
+ --cache_batch_size {{config.params.extra.cache_batch_size}} \
23
+ --num_process_evaluate {{config.params.extra.num_process_evaluate}} \
24
+ --n {{config.params.extra.n_samples}} \
25
+ --max_tokens {{config.params.max_new_tokens}} \
26
+ --out_dir {{config.output_dir}} \
27
+ --multiprocess {{config.params.parallelism}} \
28
+ --max_retries {{config.params.max_retries}} \
29
+ --timeout {{config.params.request_timeout}}{% if config.params.extra.start_date is not none %} --start_date {{config.params.extra.start_date}} {% endif %} {% if config.params.extra.end_date is not none %} --end_date {{config.params.extra.end_date}} {% endif %} {% if config.params.extra.support_system_role %} --support_system_role {% endif %} {% if config.params.limit_samples is not none %} --first_n {{config.params.limit_samples}}{% endif %}{% if config.params.extra.cot_code_execution == true %} --cot_code_execution {% endif %}{% if config.params.extra.args is defined %} {{ config.params.extra.args }} {% endif %}
30
+
31
+
32
+
33
+ config:
34
+ params:
35
+ limit_samples: null
36
+ max_new_tokens: 4096
37
+ temperature: 0.0
38
+ top_p: 0.00001
39
+ parallelism: 10
40
+ max_retries: 5
41
+ request_timeout: 60
42
+ extra:
43
+ n_samples: 10
44
+ num_process_evaluate: 5
45
+ cache_batch_size: 10
46
+ support_system_role: false
47
+ start_date: null # format: YYYY-MM-DD
48
+ end_date: null # format: YYYY-MM-DD
49
+ cot_code_execution: false
50
+
51
+ target:
52
+ api_endpoint: {} # required to add: url, model_id, api_key
53
+ evaluations:
54
+ - name: codegeneration_release_latest
55
+ description: Code generation latest version
56
+ defaults:
57
+ config:
58
+ type: codegeneration_release_latest
59
+ supported_endpoint_types:
60
+ - chat
61
+ params:
62
+ task: codegeneration
63
+ extra:
64
+ release_version: release_latest
65
+ - name: codegeneration_release_v1
66
+ description: The initial release of the dataset with problems released between May 2023 and Mar 2024 containing 400 problems.
67
+ defaults:
68
+ config:
69
+ type: codegeneration_release_v1
70
+ supported_endpoint_types:
71
+ - chat
72
+ params:
73
+ task: codegeneration
74
+ extra:
75
+ release_version: release_v1
76
+
77
+ - name: codegeneration_release_v2
78
+ description: The updated release of the dataset with problems released between May 2023 and May 2024 containing 511 problems.
79
+ defaults:
80
+ config:
81
+ type: codegeneration_release_v2
82
+ supported_endpoint_types:
83
+ - chat
84
+ params:
85
+ task: codegeneration
86
+ extra:
87
+ release_version: release_v2
88
+
89
+ - name: codegeneration_release_v3
90
+ description: The updated release of the dataset with problems released between May 2023 and Jul 2024 containing 612 problems.
91
+ defaults:
92
+ config:
93
+ type: codegeneration_release_v3
94
+ supported_endpoint_types:
95
+ - chat
96
+ params:
97
+ task: codegeneration
98
+ extra:
99
+ release_version: release_v3
100
+
101
+ - name: codegeneration_release_v4
102
+ description: The updated release of the dataset with problems released between May 2023 and Sep 2024 containing 713 problems.
103
+ defaults:
104
+ config:
105
+ type: codegeneration_release_v4
106
+ supported_endpoint_types:
107
+ - chat
108
+ params:
109
+ task: codegeneration
110
+ extra:
111
+ release_version: release_v4
112
+
113
+ - name: codegeneration_release_v5
114
+ description: The updated release of the dataset with problems released between May 2023 and Jan 2025 containing 880 problems.
115
+ defaults:
116
+ config:
117
+ type: codegeneration_release_v5
118
+ supported_endpoint_types:
119
+ - chat
120
+ params:
121
+ task: codegeneration
122
+ extra:
123
+ release_version: release_v5
124
+ - name: codegeneration_release_v6
125
+ description: The updated release of the dataset with problems released between May 2023 and Apr 2025 containing 1055 problems.
126
+ defaults:
127
+ config:
128
+ type: codegeneration_release_v6
129
+ supported_endpoint_types:
130
+ - chat
131
+ params:
132
+ task: codegeneration
133
+ extra:
134
+ release_version: release_v6
135
+ - name: codegeneration_notfast
136
+ description: Not fast version of code generation (v2).
137
+ defaults:
138
+ config:
139
+ type: codegeneration_notfast
140
+ supported_endpoint_types:
141
+ - chat
142
+ params:
143
+ task: codegeneration
144
+ extra:
145
+ args: "--not_fast"
146
+
147
+ - name: testoutputprediction
148
+ description: Solve the natural language task on a specified input, evaluating the ability to generate testing outputs. The model is given the natural language problem description and an input, and the output should be the output for the problem.
149
+ defaults:
150
+ config:
151
+ type: testoutputprediction
152
+ supported_endpoint_types:
153
+ - chat
154
+ params:
155
+ task: testoutputprediction
156
+ extra:
157
+ release_version: release_latest
158
+
159
+ - name: codeexecution_v2
160
+ description: “Execute” a program on an input, evaluating code comprehension ability. The model is given a program and an input, and the output should be the result.
161
+ defaults:
162
+ config:
163
+ type: codeexecution_v2
164
+ supported_endpoint_types:
165
+ - chat
166
+ params:
167
+ task: codeexecution
168
+ extra:
169
+ release_version: release_v2
170
+ - name: codeexecution_v2_cot
171
+ description: “CoT. Execute” a program on an input, evaluating code comprehension ability. The model is given a program and an input, and the output should be the result.
172
+ defaults:
173
+ config:
174
+ type: codeexecution_v2_cot
175
+ supported_endpoint_types:
176
+ - chat
177
+ params:
178
+ task: codeexecution
179
+ extra:
180
+ release_version: release_v2
181
+ cot_code_execution: true
182
+ # NOTE(dfridman): same as `livecodebench_0724_0125`. Leaving it as legacy.
183
+ - name: AA_code_generation
184
+ description: "AA code generation evaluating code comprehension ability. The model is given a program and an input, and the output should be the result."
185
+ defaults:
186
+ config:
187
+ type: AA_codegeneration
188
+ supported_endpoint_types:
189
+ - chat
190
+ params:
191
+ task: codegeneration
192
+ extra:
193
+ release_version: release_v5
194
+ n_samples: 3
195
+ start_date: 2024-07-01
196
+ end_date: 2025-01-01
197
+
198
+ - name: livecodebench_0724_0125
199
+ description: >-
200
+ - Code generation evaluating code comprehension ability. The model is given a program and an input, and the output should be the result.
201
+ - The data period and sampling parameters used by Artificial Analysis (https://artificialanalysis.ai/methodology/intelligence-benchmarking)
202
+ defaults:
203
+ config:
204
+ type: livecodebench_0724_0125
205
+ supported_endpoint_types:
206
+ - chat
207
+ params:
208
+ task: codegeneration
209
+ temperature: 0.0
210
+ max_new_tokens: 4096
211
+ extra:
212
+ release_version: release_v5
213
+ n_samples: 3
214
+ start_date: 2024-07-01
215
+ end_date: 2025-01-01
216
+ - name: livecodebench_0824_0225
217
+ description: >-
+ - Code generation evaluating code comprehension ability. The model is given a program and an input, and the output should be the result.
+ - The data period and sampling parameters used by NeMo Alignment team.
220
+ defaults:
221
+ config:
222
+ type: livecodebench_0824_0225
223
+ supported_endpoint_types:
224
+ - chat
225
+ params:
226
+ task: codegeneration
227
+ temperature: 0.0
228
+ max_new_tokens: 4096
229
+ extra:
230
+ release_version: release_v5
231
+ n_samples: 3
232
+ start_date: 2024-08-01
233
+ end_date: 2025-02-01
@@ -0,0 +1,28 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import warnings
17
+ from nvidia_eval_commons.api.run import run_eval
18
+
19
+
20
def main():
    """Deprecated console entrypoint that forwards to `run_eval`."""
    # Warn callers that this command is going away and point at the replacement.
    warnings.warn(
        "This command is deprecated and will be removed in the next release. Please use the `eval-factory` command instead.",
        DeprecationWarning,
        stacklevel=2,
    )
    # Registered as `framework_entrypoint:main` in `pyproject.toml`.
    run_eval()
@@ -0,0 +1,51 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import json
17
+ import pathlib
18
+ import re
19
+
20
+ from nvidia_eval_commons.api.api_dataclasses import EvaluationResult
21
+
22
# This is the only required function
def parse_output(output_dir: str) -> EvaluationResult:
    """Collect metrics from the single `*eval.json` file under `output_dir`.

    The result file's path is expected to contain `Scenario.<task>_...`, from
    which the task name is extracted. Metric keys ending in `_stderr` are
    folded into the `stats` of their base metric rather than reported as
    standalone metrics.

    Raises:
        FileNotFoundError: no `*eval.json` found under `output_dir`.
        ValueError: more than one `*eval.json` found, or the task name
            cannot be inferred from the result file path.
    """
    result_files = list(pathlib.Path(output_dir).rglob("*eval.json"))
    if not result_files:
        raise FileNotFoundError("Failed to find `*eval.json` with metric.")
    if len(result_files) > 1:
        # Fixed message typo (was "`*eval.json.json`").
        raise ValueError(
            "More than 1 `*eval.json` files found. `output_dir` must contain a single evaluation."
        )
    match = re.search(r"Scenario\.([^_]+)_", str(result_files[0]))
    if match is None:
        # Previously an unguarded `match.group(1)` crashed with an opaque
        # AttributeError on `None`; fail with a descriptive error instead.
        raise ValueError(f"Could not infer task name from result file path: {result_files[0]}")
    task_name = match.group(1)

    with open(result_files[0]) as fp:
        # The file holds a list of result dicts; only the first entry is used.
        results = json.load(fp)[0]
    # Drop verbose per-sample details before aggregating.
    results.pop("detail", None)

    metrics = {}
    for metric_name, value in results.items():
        if metric_name.endswith("_stderr"):
            continue
        # Assumes every metric has a matching `<name>_stderr` entry — TODO confirm.
        metrics[metric_name] = dict(
            scores={metric_name: dict(value=value, stats={"stderr": results[f"{metric_name}_stderr"]})}
        )
    tasks = {task_name: dict(metrics=metrics)}
    return EvaluationResult(groups=tasks, tasks=tasks)
File without changes
@@ -0,0 +1,31 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+ # Original Copyright 2025 LiveCodeBench
17
+ # For the original license and copyright information, see the LICENSE file in this repository.
18
+
19
+ from livecodebench.benchmarks.code_generation import (
20
+ CodeGenerationProblem,
21
+ load_code_generation_dataset,
22
+ load_code_generation_dataset_not_fast,
23
+ )
24
+ from livecodebench.benchmarks.test_output_prediction import (
25
+ TestOutputPredictionProblem,
26
+ load_test_prediction_dataset,
27
+ )
28
+ from livecodebench.benchmarks.code_execution import (
29
+ CodeExecutionProblem,
30
+ load_code_execution_dataset,
31
+ )
@@ -0,0 +1,85 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+ # Original Copyright 2025 LiveCodeBench
17
+ # For the original license and copyright information, see the LICENSE file in this repository.
18
+
19
+ from datetime import datetime
20
+ from dataclasses import dataclass
21
+
22
+ from datasets import load_dataset
23
+ from livecodebench.benchmarks.utils import filter_dataset_by_date
24
+
25
+
26
@dataclass
class CodeExecutionProblem:
    """One LiveCodeBench code-execution sample: predict what `code` prints for `input`."""

    question_id: str
    contest_id: str
    contest_date: datetime
    difficulty: str
    function_name: str
    code: str
    input: str
    output: str
    id: str
    problem_id: str
    numsteps: int

    def __post_init__(self):
        # No field normalization required; values are used as loaded.
        pass

    def insert_output(self, output_list: list[str], pred_list: list[str]) -> dict:
        """Serialize the problem together with raw model outputs and extracted predictions."""
        record = dict(
            question_id=self.question_id,
            contest_id=self.contest_id,
            contest_date=self.contest_date.isoformat(),
            difficulty=self.difficulty,
            function_name=self.function_name,
            code=self.code,
            input=self.input,
            output=self.output,
            id=self.id,
            problem_id=self.problem_id,
            numsteps=self.numsteps,
            output_list=output_list,
            pred_list=pred_list,
        )
        return record

    def insert_output_evaluation(
        self, output_list: list[str], code_list: list[str], graded_list: list[bool]
    ) -> dict:
        """Like `insert_output`, plus the per-sample grades and the pass@1 rate."""
        record = self.insert_output(output_list, code_list)
        record["graded_list"] = graded_list
        record["pass@1"] = sum(graded_list) / len(graded_list)
        return record

    def get_evaluation_sample(self) -> dict:
        """Minimal dict consumed by the execution-based grader."""
        return dict(code=self.code, input=self.input, output=self.output)
74
+
75
+
76
def load_code_execution_dataset(release_version="release_v1", start_date: str = None, end_date: str = None) -> list:
    """Load the LiveCodeBench code-execution split as `CodeExecutionProblem`s.

    NOTE(review): `release_version` is currently unused — the HF dataset
    `livecodebench/execution-v2` is always loaded; confirm this is intended.

    Args:
        release_version: accepted for interface parity with the other loaders.
        start_date / end_date: optional YYYY-MM-DD bounds applied by
            `filter_dataset_by_date`.
    """
    dataset = load_dataset("livecodebench/execution-v2", split="test")
    dataset = filter_dataset_by_date(dataset, start_date=start_date, end_date=end_date)
    dataset = [CodeExecutionProblem(**p) for p in dataset]  # type: ignore
    print(f"Loaded {len(dataset)} problems")
    return dataset


if __name__ == "__main__":
    # Smoke-test: load the dataset and report the problem count.
    dataset = load_code_execution_dataset()
@@ -0,0 +1,160 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+ # Original Copyright 2025 LiveCodeBench
17
+ # For the original license and copyright information, see the LICENSE file in this repository.
18
+
19
+ import json
20
+ import zlib
21
+ import pickle
22
+ import base64
23
+ from enum import Enum
24
+ from datetime import datetime
25
+ from dataclasses import dataclass
26
+
27
+ from datasets import load_dataset
28
+ from livecodebench.benchmarks.utils import filter_dataset_by_date
29
+
30
+
31
class Platform(Enum):
    """Competitive-programming site a problem was sourced from."""

    LEETCODE = "leetcode"
    CODEFORCES = "codeforces"
    ATCODER = "atcoder"


class Difficulty(Enum):
    """Problem difficulty bucket."""

    EASY = "easy"
    MEDIUM = "medium"
    HARD = "hard"


class TestType(Enum):
    """How a test case is fed to a solution: via stdin or a function call."""

    STDIN = "stdin"
    FUNCTIONAL = "functional"


@dataclass
class Test:
    """A single test case; `input`/`output` are kept as raw strings."""

    input: str
    output: str
    testtype: TestType

    def __post_init__(self):
        # Normalize the raw dataset string (e.g. "stdin") into a TestType member.
        self.testtype = TestType(self.testtype)


@dataclass
class CodeGenerationProblem:
    """One LiveCodeBench code-generation problem with its public/private tests.

    String-encoded fields (`public_test_cases`, `private_test_cases`,
    `metadata`, `contest_date`) are decoded in `__post_init__`.
    """

    question_title: str
    question_content: str
    platform: Platform
    question_id: str
    contest_id: str
    contest_date: datetime
    starter_code: str
    difficulty: Difficulty
    public_test_cases: list[Test]
    private_test_cases: list[Test]
    metadata: dict

    def __post_init__(self):
        self.platform = Platform(self.platform)
        self.difficulty = Difficulty(self.difficulty)
        self.contest_date = datetime.fromisoformat(self.contest_date)

        self.public_test_cases = json.loads(self.public_test_cases)  # type: ignore
        self.public_test_cases = [Test(**t) for t in self.public_test_cases]

        try:
            self.private_test_cases = json.loads(self.private_test_cases)  # type: ignore
        except Exception:  # was a bare `except:` — don't swallow SystemExit/KeyboardInterrupt
            # Large private test suites ship base64-encoded, zlib-compressed pickles.
            # SECURITY: pickle.loads executes arbitrary code — only safe because the
            # dataset is fetched from a trusted source; do not reuse on untrusted data.
            self.private_test_cases = json.loads(
                pickle.loads(
                    zlib.decompress(
                        base64.b64decode(self.private_test_cases.encode("utf-8"))  # type: ignore
                    )
                )
            )  # type: ignore
        self.private_test_cases = [Test(**t) for t in self.private_test_cases]

        self.metadata = json.loads(self.metadata)  # type: ignore

    def insert_output(self, output_list: list[str], code_list: list[str]) -> dict:
        """Serialize the problem plus raw model outputs and extracted code."""
        return {
            "question_title": self.question_title,
            "question_content": self.question_content,
            "platform": self.platform.value,
            "question_id": self.question_id,
            "contest_id": self.contest_id,
            "contest_date": self.contest_date.isoformat(),
            "starter_code": self.starter_code,
            "difficulty": self.difficulty.value,
            "output_list": output_list,
            "code_list": code_list,
        }

    def insert_output_evaluation(
        self,
        output_list: list[str],
        code_list: list[str],
        graded_list: list[bool],
        **kwargs,
    ) -> dict:
        """Like `insert_output`, plus grades, pass@1, and any extra metadata fields."""
        output = self.insert_output(output_list, code_list)
        output["graded_list"] = graded_list
        output["pass@1"] = graded_list.count(True) / len(graded_list)
        for k, v in kwargs.items():
            output[k] = v
        return output

    def get_evaluation_sample(self):
        """Build the `input_output` JSON spec consumed by the test harness."""
        return {
            "input_output": json.dumps(
                {
                    "inputs": [
                        t.input
                        for t in self.public_test_cases + self.private_test_cases
                    ],
                    "outputs": [
                        t.output
                        for t in self.public_test_cases + self.private_test_cases
                    ],
                    # `fn_name` is present only for FUNCTIONAL-style problems.
                    "fn_name": self.metadata.get("func_name", None),
                }
            ),
        }
141
+
142
+
143
def load_code_generation_dataset(release_version="release_v1", start_date: str = None, end_date: str = None) -> list:
    """Load the LiveCodeBench code-generation dataset ("lite" variant).

    Args:
        release_version: dataset version tag forwarded to the HF loader
            (e.g. "release_v1" .. "release_v6" or "release_latest").
        start_date / end_date: optional YYYY-MM-DD bounds applied by
            `filter_dataset_by_date`.

    Returns:
        list of `CodeGenerationProblem`.
    """
    dataset = load_dataset("livecodebench/code_generation_lite", split="test", version_tag=release_version, trust_remote_code=True)
    dataset = filter_dataset_by_date(dataset, start_date=start_date, end_date=end_date)
    dataset = [CodeGenerationProblem(**p) for p in dataset]  # type: ignore
    print(f"Loaded {len(dataset)} problems")
    return dataset
149
+
150
def load_code_generation_dataset_not_fast(release_version="release_v1", start_date: str = None, end_date: str = None) -> list:
    """Load the full (non-"lite") code-generation dataset.

    NOTE(review): `release_version` is currently unused here — the HF dataset
    `livecodebench/code_generation` is always loaded as-is; confirm intended.
    """
    dataset = load_dataset("livecodebench/code_generation", split="test")
    dataset = filter_dataset_by_date(dataset, start_date=start_date, end_date=end_date)
    dataset = [CodeGenerationProblem(**p) for p in dataset]  # type: ignore
    print(f"Loaded {len(dataset)} problems")
    return dataset


if __name__ == "__main__":
    # Smoke-test: load the default release and report the problem count.
    dataset = load_code_generation_dataset()
@@ -0,0 +1,90 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+ # Original Copyright 2025 LiveCodeBench
17
+ # For the original license and copyright information, see the LICENSE file in this repository.
18
+
19
+ import json
20
+ from datetime import datetime
21
+ from dataclasses import dataclass
22
+
23
+ from datasets import load_dataset
24
+
25
+ from livecodebench.benchmarks.utils import filter_dataset_by_date
26
+
27
+
28
+ @dataclass
29
+ class Test:
30
+ input: str
31
+ output: str
32
+ testtype: str
33
+
34
+
35
+ @dataclass
36
+ class TestOutputPredictionProblem:
37
+ question_title: str
38
+ question_content: str
39
+ question_id: str
40
+ contest_id: str
41
+ contest_date: datetime
42
+ difficulty: str
43
+ test: list[Test]
44
+ starter_code: str
45
+ function_name: str
46
+ test_id: int
47
+
48
+ def __post_init__(self):
49
+ self.test = [Test(**t) for t in json.loads(self.test)] # type: ignore
50
+
51
+ def insert_output(self, output_list: list[str], pred_list: list[str]) -> dict:
52
+ return {
53
+ "question_title": self.question_title,
54
+ "question_content": self.question_content,
55
+ "question_id": self.question_id,
56
+ "contest_id": self.contest_id,
57
+ "contest_date": self.contest_date.isoformat(),
58
+ "difficulty": self.difficulty,
59
+ "output_list": output_list,
60
+ "pred_list": pred_list,
61
+ "test_id": self.test_id,
62
+ "function_name": self.function_name,
63
+ "starter_code": self.starter_code,
64
+ }
65
+
66
+ def insert_output_evaluation(
67
+ self, output_list: list[str], code_list: list[str], graded_list: list[bool]
68
+ ) -> dict:
69
+ output = self.insert_output(output_list, code_list)
70
+ output["graded_list"] = graded_list
71
+ output["pass@1"] = graded_list.count(True) / len(graded_list)
72
+ return output
73
+
74
+ def get_evaluation_sample(self) -> dict:
75
+ return {
76
+ "input": self.question_content,
77
+ "output": self.test[0].output,
78
+ }
79
+
80
+
81
def load_test_prediction_dataset(release_version="release_v1", start_date: str | None = None, end_date: str | None = None) -> list[TestOutputPredictionProblem]:
    """Load the LiveCodeBench test-output-prediction dataset.

    NOTE(review): `release_version` is unused — `livecodebench/test_generation`
    is always loaded; confirm this is intended.

    Args:
        start_date / end_date: optional YYYY-MM-DD bounds applied by
            `filter_dataset_by_date`.
    """
    dataset = load_dataset("livecodebench/test_generation", split="test")  # type: ignore
    dataset = filter_dataset_by_date(dataset, start_date=start_date, end_date=end_date)
    dataset = [TestOutputPredictionProblem(**d) for d in dataset]
    print(f"Loaded {len(dataset)} prediction problems")
    return dataset


if __name__ == "__main__":
    # Smoke-test: load the dataset and report the problem count.
    dataset = load_test_prediction_dataset()