nvidia-livecodebench 25.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. core_evals/livecodebench/__init__.py +1 -0
  2. core_evals/livecodebench/framework.yml +233 -0
  3. core_evals/livecodebench/framework_entrypoint.py +28 -0
  4. core_evals/livecodebench/output.py +51 -0
  5. livecodebench/__init__.py +0 -0
  6. livecodebench/benchmarks/__init__.py +31 -0
  7. livecodebench/benchmarks/code_execution.py +85 -0
  8. livecodebench/benchmarks/code_generation.py +160 -0
  9. livecodebench/benchmarks/test_output_prediction.py +90 -0
  10. livecodebench/benchmarks/utils.py +50 -0
  11. livecodebench/evaluation/__init__.py +24 -0
  12. livecodebench/evaluation/compute_code_execution_metrics.py +73 -0
  13. livecodebench/evaluation/compute_code_generation_metrics.py +278 -0
  14. livecodebench/evaluation/compute_scores.py +172 -0
  15. livecodebench/evaluation/compute_test_output_prediction_metrics.py +125 -0
  16. livecodebench/evaluation/metric.py +28 -0
  17. livecodebench/evaluation/old_results_check.py +91 -0
  18. livecodebench/evaluation/pass_k_utils.py +84 -0
  19. livecodebench/evaluation/testing_util.py +574 -0
  20. livecodebench/evaluation/utils_execute.py +285 -0
  21. livecodebench/lm_styles.py +581 -0
  22. livecodebench/prompts/__init__.py +22 -0
  23. livecodebench/prompts/code_execution.py +201 -0
  24. livecodebench/prompts/code_generation.py +372 -0
  25. livecodebench/prompts/few_shot_examples/generation/func.json +12 -0
  26. livecodebench/prompts/few_shot_examples/generation/stdin.json +10 -0
  27. livecodebench/prompts/self_repair.py +370 -0
  28. livecodebench/prompts/test_output_prediction.py +327 -0
  29. livecodebench/runner/__init__.py +0 -0
  30. livecodebench/runner/base_runner.py +188 -0
  31. livecodebench/runner/claude3_runner.py +70 -0
  32. livecodebench/runner/claude_runner.py +69 -0
  33. livecodebench/runner/cohere_runner.py +71 -0
  34. livecodebench/runner/custom_evaluator.py +132 -0
  35. livecodebench/runner/deepseek_runner.py +87 -0
  36. livecodebench/runner/gemini_runner.py +111 -0
  37. livecodebench/runner/generic_oai_server_runner.py +104 -0
  38. livecodebench/runner/main.py +255 -0
  39. livecodebench/runner/mistral_runner.py +71 -0
  40. livecodebench/runner/oai_runner.py +93 -0
  41. livecodebench/runner/parser.py +174 -0
  42. livecodebench/runner/runner_utils.py +62 -0
  43. livecodebench/runner/scenario_router.py +239 -0
  44. livecodebench/runner/vllm_runner.py +82 -0
  45. livecodebench/utils/__init__.py +0 -0
  46. livecodebench/utils/extraction_utils.py +82 -0
  47. livecodebench/utils/multiprocess.py +250 -0
  48. livecodebench/utils/path_utils.py +58 -0
  49. livecodebench/utils/scenarios.py +26 -0
  50. livecodebench/utils/seed_generator.py +44 -0
  51. nvidia_livecodebench-25.8.dist-info/METADATA +518 -0
  52. nvidia_livecodebench-25.8.dist-info/RECORD +55 -0
  53. nvidia_livecodebench-25.8.dist-info/WHEEL +4 -0
  54. nvidia_livecodebench-25.8.dist-info/entry_points.txt +4 -0
  55. nvidia_livecodebench-25.8.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,285 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+ # Original Copyright 2025 LiveCodeBench
17
+ # For the original license and copyright information, see the LICENSE file in this repository.
18
+
19
+ # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
20
+ #
21
+ # Licensed under the Apache License, Version 2.0 (the "License");
22
+ # you may not use this file except in compliance with the License.
23
+ # You may obtain a copy of the License at
24
+ #
25
+ # http://www.apache.org/licenses/LICENSE-2.0
26
+ #
27
+ # Unless required by applicable law or agreed to in writing, software
28
+ # distributed under the License is distributed on an "AS IS" BASIS,
29
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
30
+ # See the License for the specific language governing permissions and
31
+ # limitations under the License.
32
+
33
+ # This code is adapted from OpenAI's release
34
+ # https://github.com/openai/human-eval/blob/master/human_eval/execution.py
35
+
36
+ import contextlib
37
+ import faulthandler
38
+ import io
39
+ import multiprocessing
40
+ import os
41
+ import platform
42
+ import signal
43
+ import tempfile
44
+
45
+
46
# Source text of a block of commonly used imports (stdlib helpers plus
# numpy/pandas), stored as a string rather than executed at import time.
# NOTE(review): presumably prepended to generated programs before they are
# executed -- confirm against the callers, which are not in this file.
# Several names are imported more than once; the repeats are harmless.
BASE_IMPORTS = """from itertools import accumulate, chain, combinations, count, permutations, product, groupby, islice, repeat
from copy import deepcopy
from string import ascii_lowercase
from math import floor, log2, log10, sqrt, comb, gcd, ceil, inf, isqrt
from collections import defaultdict, deque, Counter
from bisect import bisect, bisect_left, bisect_right, insort
from heapq import heappush, heappop, heapify, merge
from functools import reduce, cache, lru_cache
from random import randrange, shuffle
from operator import itemgetter, sub
from re import search as re_search  # Assuming 're' refers to a regex search
from os.path import commonprefix
from typing import List, Tuple, Dict, Set, Optional, Union, Any, Callable, Iterable, Iterator, Generator
import copy
import string
import math
import collections
import bisect
import heapq
import functools
import random
import itertools
import operator
import re
import numpy as np
import pandas as pd
from math import log, prod  # 'log' and 'prod' are functions in the math module
from collections import deque, defaultdict, Counter, OrderedDict
from itertools import accumulate, permutations, combinations, product, groupby, islice, chain, repeat, zip_longest, cycle
from functools import lru_cache, reduce, partial
# from sortedcontainers import SortedList, SortedDict, SortedSet
# import sortedcontainers
from operator import iand
import sys
"""
81
+
82
def check_correctness(check_program, timeout=3):
    """
    Run ``check_program`` in a child process and report whether it passed.

    The program is handed to ``unsafe_execute`` in a separate process, which
    appends its outcome to a manager-backed list. The child is given
    ``timeout + 1`` seconds of wall-clock time before it is killed; the extra
    second leaves room for the in-process ``timeout`` limit to trigger first.

    :param check_program: source code to execute (solution plus its checks).
    :param timeout: time limit in seconds passed on to ``unsafe_execute``.
    :return: ``True`` only when the child reported ``"passed"``; ``False``
        for a failure, a timeout, or a child that never reported anything.
    """
    # The context manager shuts the manager's server process down on exit;
    # without it every call leaks one helper process.
    with multiprocessing.Manager() as manager:
        result = manager.list()

        p = multiprocessing.Process(target=unsafe_execute, args=(check_program, result, timeout))
        p.start()
        p.join(timeout=timeout + 1)
        if p.is_alive():
            p.kill()
            # Reap the killed child so it does not linger as a zombie.
            p.join()

        # Read the outcome while the manager is still alive. An empty list
        # means the child never reported, which is treated as a timeout.
        outcome = result[0] if len(result) > 0 else "timed out"

    return outcome == "passed"
103
+
104
+
105
def unsafe_execute(check_program, result, timeout):
    """
    Execute ``check_program`` inside a temporary working directory.

    Exactly one outcome string is appended to ``result``: ``"passed"``,
    ``"timed out"`` or ``"failed: <error>"``.

    :param check_program: source code to run with ``exec``.
    :param result: list-like object that receives the outcome string.
    :param timeout: time limit in seconds for the executed program.
    """
    with create_tempdir():
        # The temp-dir cleanup needs these calls, and reliability_guard()
        # replaces them with None, so keep references to restore afterwards.
        import os
        import shutil

        saved_rmtree, saved_rmdir, saved_chdir = shutil.rmtree, os.rmdir, os.chdir

        # Disable functionalities that can make destructive changes to the test.
        reliability_guard()

        # Run the program with its I/O swallowed and a time limit in force.
        try:
            namespace = {}
            with swallow_io(), time_limit(timeout):
                exec(check_program, namespace)
            result.append("passed")
        except TimeoutException:
            result.append("timed out")
        except BaseException as e:
            result.append(f"failed: {e}")

        # Put the saved calls back so the temporary directory can be removed.
        shutil.rmtree, os.rmdir, os.chdir = saved_rmtree, saved_rmdir, saved_chdir
136
+
137
+
138
@contextlib.contextmanager
def time_limit(seconds):
    """
    Context manager that raises ``TimeoutException`` if the body runs longer
    than ``seconds`` of real (wall-clock) time.

    It relies on ``SIGALRM`` via ``signal.setitimer``. On exit the timer is
    cancelled and the SIGALRM handler that was active before is reinstated.

    :param seconds: time limit in seconds (may be fractional).
    :raises TimeoutException: inside the body, when the timer expires.
    """

    def signal_handler(signum, frame):
        raise TimeoutException("Timed out!")

    # Install the handler *before* arming the timer. In the opposite order a
    # very short timer could fire while the default action (terminate the
    # process) is still in effect.
    previous_handler = signal.signal(signal.SIGALRM, signal_handler)
    try:
        signal.setitimer(signal.ITIMER_REAL, seconds)
        yield
    finally:
        signal.setitimer(signal.ITIMER_REAL, 0)
        # signal.signal() returns None when the old handler was not installed
        # from Python; such a handler cannot be reinstated through this API.
        if previous_handler is not None:
            signal.signal(signal.SIGALRM, previous_handler)
149
+
150
+
151
@contextlib.contextmanager
def swallow_io():
    """
    Redirect stdout, stderr and stdin to a single ``WriteOnlyStringIO`` for
    the duration of the ``with`` body, so output is captured in memory and
    reads from stdin raise instead of blocking.
    """
    sink = WriteOnlyStringIO()
    with contextlib.redirect_stdout(sink), contextlib.redirect_stderr(sink), redirect_stdin(sink):
        yield
158
+
159
+
160
@contextlib.contextmanager
def create_tempdir():
    """
    Create a temporary directory, make it the current working directory for
    the ``with`` body, and yield its path.
    """
    with tempfile.TemporaryDirectory() as workdir, chdir(workdir):
        yield workdir
165
+
166
+
167
class TimeoutException(Exception):
    """Raised by the ``time_limit`` signal handler when the time budget runs out."""
169
+
170
+
171
class WriteOnlyStringIO(io.StringIO):
    """In-memory text stream that accepts writes but refuses every read."""

    def read(self, *_args, **_kwargs):
        """Always raise ``OSError``; this stream cannot be read."""
        raise OSError()

    def readline(self, *_args, **_kwargs):
        """Always raise ``OSError``; this stream cannot be read."""
        raise OSError()

    def readlines(self, *_args, **_kwargs):
        """Always raise ``OSError``; this stream cannot be read."""
        raise OSError()

    def readable(self, *_args, **_kwargs):
        """Report that the stream is not readable."""
        return False
186
+
187
+
188
class redirect_stdin(contextlib._RedirectStream):  # type: ignore
    """Context manager that temporarily replaces ``sys.stdin`` with another stream."""

    # Name of the ``sys`` attribute that the base class swaps in and out.
    _stream = "stdin"
190
+
191
+
192
@contextlib.contextmanager
def chdir(root):
    """
    Temporarily change the current working directory to ``root``.

    The previous working directory is restored when the ``with`` body exits,
    whether normally or through an exception. ``root == "."`` is a no-op.

    :param root: directory to switch to for the duration of the body.
    """
    if root == ".":
        yield
        return
    previous = os.getcwd()
    os.chdir(root)
    # try/finally already lets exceptions propagate unchanged; an explicit
    # "except ...: raise exc" would only add a frame to the traceback.
    try:
        yield
    finally:
        os.chdir(previous)
205
+
206
+
207
def reliability_guard(maximum_memory_bytes=None):
    """
    This disables various destructive functions and prevents the generated code
    from interfering with the test (e.g. fork bomb, killing other processes,
    removing filesystem files, etc.)

    The changes are process-wide and are not undone here: selected attributes
    of ``builtins``, ``os``, ``shutil`` and ``subprocess`` are set to ``None``,
    several modules are made unimportable, ``faulthandler`` is disabled and
    ``OMP_NUM_THREADS`` is set to ``"1"``.

    WARNING
    This function is NOT a security sandbox. Untrusted code, including model-
    generated code, should not be blindly executed outside of one. See the
    Codex paper for more information about OpenAI's code sandbox, and proceed
    with caution.

    :param maximum_memory_bytes: if not ``None``, used as both the soft and
        hard limit for address space, data segment and (except on Darwin)
        stack size.
    """

    if maximum_memory_bytes is not None:
        import resource

        limits = (maximum_memory_bytes, maximum_memory_bytes)
        resource.setrlimit(resource.RLIMIT_AS, limits)
        resource.setrlimit(resource.RLIMIT_DATA, limits)
        if platform.uname().system != "Darwin":
            resource.setrlimit(resource.RLIMIT_STACK, limits)

    faulthandler.disable()

    # Import everything that is about to be patched up front, before any
    # os-level function has been replaced with None.
    import builtins
    import os
    import shutil
    import subprocess
    import sys

    # Set through the ``builtins`` module: ``__builtins__`` is a dict in an
    # imported module but a module object in ``__main__``, so subscripting
    # it is not portable.
    for name in ("exit", "quit", "help"):
        setattr(builtins, name, None)

    # Must happen before os.putenv is disabled below.
    os.environ["OMP_NUM_THREADS"] = "1"

    disabled_os_functions = (
        "kill",
        "system",
        "putenv",
        "remove",
        "removedirs",
        "rmdir",
        "fchdir",
        "setuid",
        "fork",
        "forkpty",
        "killpg",
        "rename",
        "renames",
        "truncate",
        "replace",
        "unlink",
        "fchmod",
        "fchown",
        "chmod",
        "chown",
        "chroot",
        "lchflags",
        "lchmod",
        "lchown",
        "getcwd",
        "chdir",
    )
    for name in disabled_os_functions:
        setattr(os, name, None)

    for name in ("rmtree", "move", "chown"):
        setattr(shutil, name, None)

    subprocess.Popen = None  # type: ignore

    # A ``None`` entry in sys.modules makes a later import of that module
    # raise ImportError.
    for module_name in ("ipdb", "joblib", "resource", "psutil", "tkinter"):
        sys.modules[module_name] = None