nvidia-livecodebench 25.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- core_evals/livecodebench/__init__.py +1 -0
- core_evals/livecodebench/framework.yml +233 -0
- core_evals/livecodebench/framework_entrypoint.py +28 -0
- core_evals/livecodebench/output.py +51 -0
- livecodebench/__init__.py +0 -0
- livecodebench/benchmarks/__init__.py +31 -0
- livecodebench/benchmarks/code_execution.py +85 -0
- livecodebench/benchmarks/code_generation.py +160 -0
- livecodebench/benchmarks/test_output_prediction.py +90 -0
- livecodebench/benchmarks/utils.py +50 -0
- livecodebench/evaluation/__init__.py +24 -0
- livecodebench/evaluation/compute_code_execution_metrics.py +73 -0
- livecodebench/evaluation/compute_code_generation_metrics.py +278 -0
- livecodebench/evaluation/compute_scores.py +172 -0
- livecodebench/evaluation/compute_test_output_prediction_metrics.py +125 -0
- livecodebench/evaluation/metric.py +28 -0
- livecodebench/evaluation/old_results_check.py +91 -0
- livecodebench/evaluation/pass_k_utils.py +84 -0
- livecodebench/evaluation/testing_util.py +574 -0
- livecodebench/evaluation/utils_execute.py +285 -0
- livecodebench/lm_styles.py +581 -0
- livecodebench/prompts/__init__.py +22 -0
- livecodebench/prompts/code_execution.py +201 -0
- livecodebench/prompts/code_generation.py +372 -0
- livecodebench/prompts/few_shot_examples/generation/func.json +12 -0
- livecodebench/prompts/few_shot_examples/generation/stdin.json +10 -0
- livecodebench/prompts/self_repair.py +370 -0
- livecodebench/prompts/test_output_prediction.py +327 -0
- livecodebench/runner/__init__.py +0 -0
- livecodebench/runner/base_runner.py +188 -0
- livecodebench/runner/claude3_runner.py +70 -0
- livecodebench/runner/claude_runner.py +69 -0
- livecodebench/runner/cohere_runner.py +71 -0
- livecodebench/runner/custom_evaluator.py +132 -0
- livecodebench/runner/deepseek_runner.py +87 -0
- livecodebench/runner/gemini_runner.py +111 -0
- livecodebench/runner/generic_oai_server_runner.py +104 -0
- livecodebench/runner/main.py +255 -0
- livecodebench/runner/mistral_runner.py +71 -0
- livecodebench/runner/oai_runner.py +93 -0
- livecodebench/runner/parser.py +174 -0
- livecodebench/runner/runner_utils.py +62 -0
- livecodebench/runner/scenario_router.py +239 -0
- livecodebench/runner/vllm_runner.py +82 -0
- livecodebench/utils/__init__.py +0 -0
- livecodebench/utils/extraction_utils.py +82 -0
- livecodebench/utils/multiprocess.py +250 -0
- livecodebench/utils/path_utils.py +58 -0
- livecodebench/utils/scenarios.py +26 -0
- livecodebench/utils/seed_generator.py +44 -0
- nvidia_livecodebench-25.8.dist-info/METADATA +518 -0
- nvidia_livecodebench-25.8.dist-info/RECORD +55 -0
- nvidia_livecodebench-25.8.dist-info/WHEEL +4 -0
- nvidia_livecodebench-25.8.dist-info/entry_points.txt +4 -0
- nvidia_livecodebench-25.8.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,285 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
#
|
|
16
|
+
# Original Copyright 2025 LiveCodeBench
|
|
17
|
+
# For the original license and copyright information, see the LICENSE file in this repository.
|
|
18
|
+
|
|
19
|
+
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
|
|
20
|
+
#
|
|
21
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
22
|
+
# you may not use this file except in compliance with the License.
|
|
23
|
+
# You may obtain a copy of the License at
|
|
24
|
+
#
|
|
25
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
26
|
+
#
|
|
27
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
28
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
29
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
30
|
+
# See the License for the specific language governing permissions and
|
|
31
|
+
# limitations under the License.
|
|
32
|
+
|
|
33
|
+
# This code is adapted from OpenAI's release
|
|
34
|
+
# https://github.com/openai/human-eval/blob/master/human_eval/execution.py
|
|
35
|
+
|
|
36
|
+
import contextlib
|
|
37
|
+
import faulthandler
|
|
38
|
+
import io
|
|
39
|
+
import multiprocessing
|
|
40
|
+
import os
|
|
41
|
+
import platform
|
|
42
|
+
import signal
|
|
43
|
+
import tempfile
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
BASE_IMPORTS = """from itertools import accumulate, chain, combinations, count, permutations, product, groupby, islice, repeat
|
|
47
|
+
from copy import deepcopy
|
|
48
|
+
from string import ascii_lowercase
|
|
49
|
+
from math import floor, log2, log10, sqrt, comb, gcd, ceil, inf, isqrt
|
|
50
|
+
from collections import defaultdict, deque, Counter
|
|
51
|
+
from bisect import bisect, bisect_left, bisect_right, insort
|
|
52
|
+
from heapq import heappush, heappop, heapify, merge
|
|
53
|
+
from functools import reduce, cache, lru_cache
|
|
54
|
+
from random import randrange, shuffle
|
|
55
|
+
from operator import itemgetter, sub
|
|
56
|
+
from re import search as re_search # Assuming 're' refers to a regex search
|
|
57
|
+
from os.path import commonprefix
|
|
58
|
+
from typing import List, Tuple, Dict, Set, Optional, Union, Any, Callable, Iterable, Iterator, Generator
|
|
59
|
+
import copy
|
|
60
|
+
import string
|
|
61
|
+
import math
|
|
62
|
+
import collections
|
|
63
|
+
import bisect
|
|
64
|
+
import heapq
|
|
65
|
+
import functools
|
|
66
|
+
import random
|
|
67
|
+
import itertools
|
|
68
|
+
import operator
|
|
69
|
+
import re
|
|
70
|
+
import numpy as np
|
|
71
|
+
import pandas as pd
|
|
72
|
+
from math import log, prod # 'log' and 'prod' are functions in the math module
|
|
73
|
+
from collections import deque, defaultdict, Counter, OrderedDict
|
|
74
|
+
from itertools import accumulate, permutations, combinations, product, groupby, islice, chain, repeat, zip_longest, cycle
|
|
75
|
+
from functools import lru_cache, reduce, partial
|
|
76
|
+
# from sortedcontainers import SortedList, SortedDict, SortedSet
|
|
77
|
+
# import sortedcontainers
|
|
78
|
+
from operator import iand
|
|
79
|
+
import sys
|
|
80
|
+
"""
|
|
81
|
+
|
|
82
|
+
def check_correctness(check_program, timeout=3):
    """
    Run ``check_program`` in a separate process and report whether it passed.

    The program is handed to ``unsafe_execute`` in a child process, which
    records its verdict in a manager-backed list shared with this process.

    :param check_program: the program passed to ``exec`` in the child.
    :param timeout: time limit in seconds forwarded to the child; the parent
        waits one extra second before killing a child that is still alive.
    :return: True only if the first recorded verdict is "passed". A child
        that recorded nothing (for example because it had to be killed) is
        treated as timed out.
    """
    # The context manager shuts the manager's server process down
    # deterministically instead of leaving that to garbage collection.
    with multiprocessing.Manager() as manager:
        result = manager.list()

        p = multiprocessing.Process(target=unsafe_execute, args=(check_program, result, timeout))
        p.start()
        p.join(timeout=timeout + 1)
        if p.is_alive():
            p.kill()
            # Reap the killed child so it does not linger as a zombie.
            p.join()

        # Read the verdict while the manager is still running; the list
        # proxy cannot be queried after shutdown.
        verdict = result[0] if len(result) > 0 else "timed out"

    return verdict == "passed"
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def unsafe_execute(check_program, result, timeout):
    """
    Execute ``check_program`` inside a temporary directory and record a verdict.

    Appends "passed", "timed out" or "failed: <error>" to ``result``.

    :param check_program: the program passed to ``exec``.
    :param result: list-like object with ``append`` that receives the verdict.
    :param timeout: time limit in seconds applied around the ``exec`` call.
    """
    with create_tempdir():
        import shutil

        # reliability_guard() replaces these with None, but removing the
        # temporary directory on exit still needs them, so keep the originals.
        # os.unlink is included because shutil.rmtree uses it to delete any
        # files the program left behind.
        saved_rmtree = shutil.rmtree
        saved_rmdir = os.rmdir
        saved_chdir = os.chdir
        saved_unlink = os.unlink

        # Disable functionalities that can make destructive changes to the test.
        reliability_guard()

        try:
            exec_globals = {}
            with swallow_io():
                with time_limit(timeout):
                    exec(check_program, exec_globals)
            result.append("passed")
        except TimeoutException:
            result.append("timed out")
        except BaseException as e:
            # BaseException on purpose: the program may raise SystemExit or
            # KeyboardInterrupt, which must be reported as failures too.
            result.append(f"failed: {e}")
        finally:
            # Restore even if recording the verdict itself failed, otherwise
            # the temporary directory cannot be cleaned up.
            shutil.rmtree = saved_rmtree
            os.rmdir = saved_rmdir
            os.chdir = saved_chdir
            os.unlink = saved_unlink
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
@contextlib.contextmanager
def time_limit(seconds):
    """
    Raise ``TimeoutException`` in the enclosed block after ``seconds`` seconds.

    Uses the real-time interval timer and SIGALRM. On exit the timer is
    cancelled and the previously installed SIGALRM handler is put back.

    :param seconds: delay before the alarm fires (passed to ``setitimer``).
    """

    def signal_handler(signum, frame):
        raise TimeoutException("Timed out!")

    # Install the handler before arming the timer: with a very short delay
    # the alarm could otherwise fire while the default action (terminate the
    # process) is still in place.
    previous_handler = signal.signal(signal.SIGALRM, signal_handler)
    try:
        signal.setitimer(signal.ITIMER_REAL, seconds)
        yield
    finally:
        signal.setitimer(signal.ITIMER_REAL, 0)
        # signal.signal() returns None when the old handler was not installed
        # from Python; such a handler cannot be re-installed from here.
        if previous_handler is not None:
            signal.signal(signal.SIGALRM, previous_handler)
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
@contextlib.contextmanager
def swallow_io():
    """Point stdout, stderr and stdin at one shared ``WriteOnlyStringIO`` while active."""
    sink = WriteOnlyStringIO()
    with contextlib.redirect_stdout(sink), contextlib.redirect_stderr(sink), redirect_stdin(sink):
        yield
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
@contextlib.contextmanager
def create_tempdir():
    """Create a temporary directory, switch into it via ``chdir``, and yield its path."""
    with tempfile.TemporaryDirectory() as scratch_dir, chdir(scratch_dir):
        yield scratch_dir
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
class TimeoutException(Exception):
    """Raised by the SIGALRM handler installed in ``time_limit``."""
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
class WriteOnlyStringIO(io.StringIO):
    """In-memory text buffer that accepts writes but refuses the read methods."""

    def readable(self, *args, **kwargs):
        """Report that this stream cannot be read."""
        return False

    def read(self, *args, **kwargs):
        """Always fail with OSError."""
        raise OSError()

    def readline(self, *args, **kwargs):
        """Always fail with OSError."""
        raise OSError()

    def readlines(self, *args, **kwargs):
        """Always fail with OSError."""
        raise OSError()
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
class redirect_stdin(contextlib._RedirectStream): # type: ignore
    """Context manager that temporarily replaces ``sys.stdin`` with the given object."""

    # Name of the ``sys`` attribute the base class swaps in and out.
    _stream = "stdin"
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
@contextlib.contextmanager
def chdir(root):
    """
    Temporarily change the working directory to ``root``.

    The previous working directory is restored on exit, including when the
    enclosed block raises. Passing "." is a no-op.

    :param root: directory to switch into.
    """
    if root == ".":
        yield
        return
    cwd = os.getcwd()
    os.chdir(root)
    try:
        yield
    finally:
        # Exceptions propagate on their own; only the restore is needed here.
        os.chdir(cwd)
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def reliability_guard(maximum_memory_bytes=None):
    """
    Disable destructive functions for the rest of the current process.

    Selected attributes of ``builtins``, ``os``, ``shutil`` and ``subprocess``
    are replaced with None, and several modules are blocked from import, so
    that generated code is less likely to interfere with the test (e.g. fork
    bomb, killing other processes, removing filesystem files, etc.). The
    changes are process-wide and are not undone by this function.

    WARNING
    This function is NOT a security sandbox. Untrusted code, including model-
    generated code, should not be blindly executed outside of one. See the
    Codex paper for more information about OpenAI's code sandbox, and proceed
    with caution.

    :param maximum_memory_bytes: if given, used as both soft and hard limit
        for address space and data segment size (and stack size, except on
        Darwin).
    """
    if maximum_memory_bytes is not None:
        import resource

        limits = (maximum_memory_bytes, maximum_memory_bytes)
        resource.setrlimit(resource.RLIMIT_AS, limits)
        resource.setrlimit(resource.RLIMIT_DATA, limits)
        if platform.uname().system != "Darwin":
            resource.setrlimit(resource.RLIMIT_STACK, limits)

    faulthandler.disable()

    import builtins
    import shutil
    import subprocess
    import sys

    # Go through the builtins module rather than ``__builtins__``, which is a
    # dict in imported modules but a module object in ``__main__``.
    for name in ("exit", "quit", "help"):
        setattr(builtins, name, None)

    # Must happen before os.putenv is disabled: assigning to os.environ
    # calls putenv.
    os.environ["OMP_NUM_THREADS"] = "1"

    disabled_os_functions = (
        "kill",
        "system",
        "putenv",
        "remove",
        "removedirs",
        "rmdir",
        "fchdir",
        "setuid",
        "fork",
        "forkpty",
        "killpg",
        "rename",
        "renames",
        "truncate",
        "replace",
        "unlink",
        "fchmod",
        "fchown",
        "chmod",
        "chown",
        "chroot",
        "lchflags",
        "lchmod",
        "lchown",
        "getcwd",
        "chdir",
    )
    for name in disabled_os_functions:
        setattr(os, name, None)

    for name in ("rmtree", "move", "chown"):
        setattr(shutil, name, None)

    subprocess.Popen = None  # type: ignore

    # A None entry in sys.modules makes a later ``import <name>`` fail.
    for name in ("ipdb", "joblib", "resource", "psutil", "tkinter"):
        sys.modules[name] = None
|