evalscope 0.12.1__py3-none-any.whl → 0.13.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/arguments.py +6 -1
- evalscope/benchmarks/arc/arc_adapter.py +3 -3
- evalscope/benchmarks/benchmark.py +3 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +2 -1
- evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +168 -0
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +2 -1
- evalscope/benchmarks/data_adapter.py +32 -4
- evalscope/benchmarks/general_qa/general_qa_adapter.py +5 -4
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +20 -24
- evalscope/benchmarks/humaneval/humaneval_adapter.py +8 -5
- evalscope/benchmarks/live_code_bench/__init__.py +0 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +193 -0
- evalscope/benchmarks/live_code_bench/execute_utils.py +267 -0
- evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +90 -0
- evalscope/benchmarks/live_code_bench/load_utils.py +71 -0
- evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
- evalscope/benchmarks/live_code_bench/prompts.py +207 -0
- evalscope/benchmarks/live_code_bench/testing_util.py +721 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +3 -2
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +148 -1
- evalscope/benchmarks/super_gpqa/utils.py +0 -5
- evalscope/collections/evaluator.py +4 -4
- evalscope/config.py +11 -3
- evalscope/constants.py +8 -0
- evalscope/evaluator/evaluator.py +56 -17
- evalscope/metrics/llm_judge.py +104 -0
- evalscope/models/custom_adapter.py +1 -1
- evalscope/perf/arguments.py +11 -40
- evalscope/perf/benchmark.py +39 -28
- evalscope/perf/http_client.py +9 -1
- evalscope/perf/main.py +2 -1
- evalscope/perf/plugin/datasets/__init__.py +1 -0
- evalscope/perf/plugin/datasets/openqa.py +6 -11
- evalscope/perf/plugin/datasets/random_dataset.py +51 -0
- evalscope/perf/utils/db_util.py +3 -0
- evalscope/run.py +15 -3
- evalscope/third_party/longbench_write/infer.py +1 -1
- evalscope/version.py +2 -2
- {evalscope-0.12.1.dist-info → evalscope-0.13.1.dist-info}/METADATA +56 -38
- {evalscope-0.12.1.dist-info → evalscope-0.13.1.dist-info}/RECORD +50 -36
- tests/cli/test_all.py +144 -0
- tests/cli/test_collection.py +27 -1
- tests/cli/test_run.py +103 -11
- tests/perf/test_perf.py +23 -0
- {evalscope-0.12.1.dist-info → evalscope-0.13.1.dist-info}/LICENSE +0 -0
- {evalscope-0.12.1.dist-info → evalscope-0.13.1.dist-info}/WHEEL +0 -0
- {evalscope-0.12.1.dist-info → evalscope-0.13.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.12.1.dist-info → evalscope-0.13.1.dist-info}/top_level.txt +0 -0
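The headline change is the new live_code_bench benchmark package (evalscope/benchmarks/live_code_bench/, shown in full below), alongside an LLM-judge metric, a Chinese SimpleQA adapter, and a random perf dataset. As a rough orientation only, a run against the new benchmark might look like the sketch below; the TaskConfig/run_task entry points and the dataset_args -> extra_params wiring are assumptions based on evalscope's usual configuration style, and the model id is a placeholder, not something this diff prescribes.

from evalscope.run import run_task
from evalscope.config import TaskConfig

# Hypothetical configuration: the model id is a placeholder, and the
# extra_params keys mirror the defaults registered by LiveCodeBenchAdapter
# (start_date/end_date filter problems by contest date; timeout and
# num_process_evaluate control the sandboxed test execution).
task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-Coder-7B-Instruct',
    datasets=['live_code_bench'],
    dataset_args={
        'live_code_bench': {
            'extra_params': {
                'start_date': '2024-08-01',
                'end_date': '2025-02-01',
                'num_process_evaluate': 4,
                'timeout': 6,
            },
        },
    },
)

run_task(task_cfg=task_cfg)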
evalscope/benchmarks/live_code_bench/evaluate_utils.py
@@ -0,0 +1,193 @@
+import json
+import multiprocessing
+import numpy as np
+from collections import defaultdict
+from concurrent.futures import ProcessPoolExecutor, as_completed
+
+from evalscope.utils.logger import get_logger
+from .pass_k_utils import compute_metrics_from_results
+
+logger = get_logger()
+
+
+def codegen_check_correctness(sample, generation, timeout, debug=True):
+    """Check correctness of code generation with a global timeout.
+
+    The global timeout is to catch some extreme/rare cases not handled by the
+    timeouts inside `run_test`
+    """
+
+    def _temp_run(sample, generation, debug, result, metadata_list, timeout):
+        from .testing_util import run_test
+        res, metadata = run_test(sample, test=generation, debug=debug, timeout=timeout)
+        result.append(res)
+        metadata_list.append(metadata)
+
+    manager = multiprocessing.Manager()
+    result = manager.list()
+    metadata_list = manager.list()
+    p = multiprocessing.Process(
+        target=_temp_run,
+        args=(sample, generation, debug, result, metadata_list, timeout),
+    )
+    p.start()
+    p.join(timeout=(timeout + 1) * len(json.loads(sample['input_output'])['inputs']) + 5)
+    if p.is_alive():
+        p.kill()
+    if not result:
+        in_outs = json.loads(sample['input_output'])
+        # consider that all tests failed
+        result = [[-1 for i in range(len(in_outs['inputs']))]]
+        if debug:
+            logger.info('global timeout')
+    return result[0], metadata_list[0]
+
+
+def evaluate_generations_by_problem(problem_generations: list, sample: list, debug: bool, timeout: int):
+    """Evaluate each problem.
+
+    Args:
+        problem_generations:
+        sample:
+        debug:
+        timeout
+    """
+    # problem_generations: list[str] = args[0]
+    # sample = args[1]
+    # debug: bool = args[2]
+    # timeout: int = args[3]
+
+    res = []
+    metadata = []
+    for o_idx, o in enumerate(problem_generations):
+        curr_res = [-2]
+        try:
+            curr_res, curr_metadata = codegen_check_correctness(sample, o, timeout=timeout, debug=debug)
+            if debug:
+                logger.info(f'\nSuccessful compilation of task {o_idx}!')
+            fixed = []
+            for e in curr_res:
+                if isinstance(e, np.ndarray):
+                    e = e.item(0)
+                if isinstance(e, np.bool_):
+                    e = bool(e)
+                fixed.append(e)
+            curr_res = fixed
+            if not np.all(curr_res):
+                if debug:
+                    logger.info(f'Results were not True for all test cases'  # noqa: F541, E501
+                                f' {curr_res=}\n')
+        except Exception as e:
+            if debug:
+                logger.info(f'Compilation failed, test framework exception'  # noqa: F541, E501
+                            f' = {repr(e)}{e}\n')
+            # break
+            curr_metadata = {}
+        finally:
+            assert isinstance(curr_res, list)
+            assert isinstance(curr_metadata, dict)
+            res.append(curr_res)
+            metadata.append(curr_metadata)
+    if debug:
+        for i, r in enumerate(problem_generations):
+            logger.info(f'Sample\n{r}\nResult\n{res[i]}')
+            logger.info('*' * 30 + '\n\n')
+    return res, metadata
+
+
+def evaluate_generations(
+    samples_list: list,
+    generations_list: list[list[str]],
+    debug: bool = False,
+    num_process_evaluate: int = 16,
+    timeout=6,
+):
+    """We take the list of code generations and try to compile them and the run
+    their corresponding unit tests which are retrieved from the APPS dataset.
+
+    Args:
+        generations: list of code generations (same order as samples in APPS
+            dataset)
+        level: difficulty level used in the generation, can be "all",
+            "introductory", "interview" or "competition"
+
+    Returns:
+        results: dictionary of results, key is the problem index, value is
+            a list of results for each generation
+            [-2] = compile error, [-1] = runtime error [False] = failed test
+            case [True] = passed test case
+    """
+
+    # generations are code generations in the same order of the dataset
+
+    inputs = [[(generations_list[index], samples_list[index], debug, timeout), index]
+              for index in range(len(generations_list))]
+
+    with ProcessPoolExecutor(max_workers=1 if debug else num_process_evaluate) as executor:
+        futures = {
+            executor.submit(evaluate_generations_by_problem, problem_generations, sample, debug, timeout): index
+            for (problem_generations, sample, debug, timeout), index in inputs
+        }
+
+        results = {}
+        metadata = {}
+        for future in as_completed(futures):
+            index = futures[future]
+            results[index], metadata[index] = future.result()
+
+    assert len(results) == len(inputs), f'results = {len(results)} inputs = {len(inputs)} {results=}'
+    # results = {i: r for r, (_, i) in zip(results, inputs)}
+
+    return results, metadata
+
+
+def codegen_metrics(
+    samples_list,
+    generations_list,
+    k_list=[1, 5, 10, 20, 40, 50, 75, 100, 125, 150, 200, 500, 1000],
+    num_process_evaluate=16,
+    timeout=6,
+    debug=False,
+):
+
+    samples_linear = []
+    generations_linear = []
+    remap_index = []
+    results = defaultdict(list)
+    metadatas = defaultdict(list)
+    for idx, (sample, generation_list) in enumerate(zip(samples_list, generations_list)):
+        assert isinstance(generation_list, list), generations_list[0]
+        for generation in generation_list:
+            assert isinstance(generation, str), generations_list[0]
+            samples_linear.append(sample)
+            generations_linear.append([generation])
+            remap_index.append(idx)
+
+    results_linear, metadatas_linear = evaluate_generations(
+        samples_linear,
+        generations_linear,
+        debug=debug,
+        num_process_evaluate=num_process_evaluate,
+        timeout=timeout,
+    )
+
+    for idx, sub_results in sorted(results_linear.items(), key=lambda x: x[0]):
+        results[remap_index[idx]].append(sub_results[0])
+
+    for idx, sub_metadatas in sorted(metadatas_linear.items(), key=lambda x: x[0]):
+        metadatas[remap_index[idx]].append(sub_metadatas[0])
+
+    metrics = compute_metrics_from_results(results, k_list=k_list)
+
+    final_metadata = []
+    for key in sorted(list(metadatas.keys())):
+        final_metadata.append(metadatas[key])
+    for i in range(len(final_metadata)):
+        if type(final_metadata[i]) is not list:
+            final_metadata[i] = [json.dumps(final_metadata[i])]
+        else:
+            final_metadata[i] = [json.dumps(x) for x in final_metadata[i]]
+
+        assert len(final_metadata[i]) == len(generations_list[0]), f'{len(final_metadata[i])=}'
+
+    return [metrics, results, final_metadata]
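codegen_metrics above is the scoring entry point the adapter calls; as a minimal sketch, it can also be driven directly with a made-up stdin/stdout problem and one candidate program (the sample layout mirrors the evaluation_sample JSON built in load_utils.py):

import json

from evalscope.benchmarks.live_code_bench.evaluate_utils import codegen_metrics

# One hypothetical problem with a single stdin/stdout test case. fn_name is
# None, which (by assumption) selects the stdin/stdout path inside run_test.
reference = {
    'input_output': json.dumps({
        'inputs': ['1 2\n'],   # stdin fed to the candidate program
        'outputs': ['3\n'],    # expected stdout
        'fn_name': None,
    })
}
candidate = 'a, b = map(int, input().split())\nprint(a + b)'

metrics, results, metadata = codegen_metrics(
    [reference],       # samples_list: one entry per problem
    [[candidate]],     # generations_list: a list of generations per problem
    k_list=[1],
    num_process_evaluate=1,
    timeout=6,
)
print(metrics['pass@1'])  # reported on a 0-100 scale; the adapter divides by 100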
evalscope/benchmarks/live_code_bench/execute_utils.py
@@ -0,0 +1,267 @@
+# Copyright 2020 The HuggingFace Datasets Authors and the
+# current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This code is adapted from OpenAI's release
+# https://github.com/openai/human-eval/blob/master/human_eval/execution.py
+
+import contextlib
+import faulthandler
+import io
+import multiprocessing
+import os
+import platform
+import signal
+import tempfile
+
+BASE_IMPORTS = """from itertools import accumulate, chain, combinations, count, permutations, product, groupby, islice, repeat
+from copy import deepcopy
+from string import ascii_lowercase
+from math import floor, log2, log10, sqrt, comb, gcd, ceil, inf, isqrt
+from collections import defaultdict, deque, Counter
+from bisect import bisect, bisect_left, bisect_right, insort
+from heapq import heappush, heappop, heapify, merge
+from functools import reduce, cache, lru_cache
+from random import randrange, shuffle
+from operator import itemgetter, sub
+from re import search as re_search  # Assuming 're' refers to a regex search
+from os.path import commonprefix
+from typing import List, Tuple, Dict, Set, Optional, Union, Any, Callable, Iterable, Iterator, Generator
+import copy
+import string
+import math
+import collections
+import bisect
+import heapq
+import functools
+import random
+import itertools
+import operator
+import re
+import numpy as np
+import pandas as pd
+from math import log, prod  # 'log' and 'prod' are functions in the math module
+from collections import deque, defaultdict, Counter, OrderedDict
+from itertools import accumulate, permutations, combinations, product, groupby, islice, chain, repeat, zip_longest, cycle
+from functools import lru_cache, reduce, partial
+# from sortedcontainers import SortedList, SortedDict, SortedSet
+# import sortedcontainers
+from operator import iand
+import sys
+"""  # noqa: E501
+
+
+def codeexecute_check_correctness(check_program, timeout=3):
+    """Evaluates the functional correctness of a completion by running the test
+    suite provided in the problem.
+
+    :param completion_id: an optional completion ID so we can match
+        the results later even if execution finishes asynchronously.
+    """
+    manager = multiprocessing.Manager()
+    result = manager.list()
+
+    p = multiprocessing.Process(target=unsafe_execute, args=(check_program, result, timeout))
+    p.start()
+    p.join(timeout=timeout + 1)
+    if p.is_alive():
+        p.kill()
+
+    if not result:
+        result.append('timed out')
+
+    return result[0] == 'passed'
+
+
+def unsafe_execute(check_program, result, timeout):
+
+    with create_tempdir():
+
+        # These system calls are needed when cleaning up tempdir.
+        import os
+        import shutil
+
+        rmtree = shutil.rmtree
+        rmdir = os.rmdir
+        chdir = os.chdir
+
+        # Disable functionalities that can make destructive changes
+        # to the test.
+        reliability_guard()
+
+        # Run program.
+        try:
+            exec_globals = {}
+            with swallow_io():
+                with time_limit(timeout):
+                    exec(check_program, exec_globals)
+            result.append('passed')
+        except TimeoutException:
+            result.append('timed out')
+        except BaseException as e:
+            result.append(f'failed: {e}')
+
+        # Needed for cleaning up.
+        shutil.rmtree = rmtree
+        os.rmdir = rmdir
+        os.chdir = chdir
+
+
+@contextlib.contextmanager
+def time_limit(seconds):
+
+    def signal_handler(signum, frame):
+        raise TimeoutException('Timed out!')
+
+    signal.setitimer(signal.ITIMER_REAL, seconds)
+    signal.signal(signal.SIGALRM, signal_handler)
+    try:
+        yield
+    finally:
+        signal.setitimer(signal.ITIMER_REAL, 0)
+
+
+@contextlib.contextmanager
+def swallow_io():
+    stream = WriteOnlyStringIO()
+    with contextlib.redirect_stdout(stream):
+        with contextlib.redirect_stderr(stream):
+            with redirect_stdin(stream):
+                yield
+
+
+@contextlib.contextmanager
+def create_tempdir():
+    with tempfile.TemporaryDirectory() as dirname:
+        with chdir(dirname):
+            yield dirname
+
+
+class TimeoutException(Exception):
+    pass
+
+
+class WriteOnlyStringIO(io.StringIO):
+    """StringIO that throws an exception when it's read from."""
+
+    def read(self, *args, **kwargs):
+        raise OSError
+
+    def readline(self, *args, **kwargs):
+        raise OSError
+
+    def readlines(self, *args, **kwargs):
+        raise OSError
+
+    def readable(self, *args, **kwargs):
+        """Returns True if the IO object can be read."""
+        return False
+
+
+class redirect_stdin(contextlib._RedirectStream):  # type: ignore
+    _stream = 'stdin'
+
+
+@contextlib.contextmanager
+def chdir(root):
+    if root == '.':
+        yield
+        return
+    cwd = os.getcwd()
+    os.chdir(root)
+    try:
+        yield
+    except BaseException as exc:
+        raise exc
+    finally:
+        os.chdir(cwd)
+
+
+def reliability_guard(maximum_memory_bytes=None):
+    """This disables various destructive functions and prevents the generated
+    code from interfering with the test (e.g. fork bomb, killing other
+    processes, removing filesystem files, etc.)
+
+    WARNING This function is NOT a security sandbox. Untrusted code, including,
+    model- generated code, should not be blindly executed outside of one. See
+    the Codex paper for more information about OpenAI's code sandbox, and
+    proceed with caution.
+    """
+
+    if maximum_memory_bytes is not None:
+        import resource
+
+        resource.setrlimit(resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes))
+        resource.setrlimit(resource.RLIMIT_DATA, (maximum_memory_bytes, maximum_memory_bytes))
+        if not platform.uname().system == 'Darwin':
+            resource.setrlimit(resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes))
+
+    faulthandler.disable()
+
+    import builtins
+
+    builtins.exit = None
+    builtins.quit = None
+
+    import os
+
+    os.environ['OMP_NUM_THREADS'] = '1'
+
+    os.kill = None
+    os.system = None
+    os.putenv = None
+    os.remove = None
+    os.removedirs = None
+    os.rmdir = None
+    os.fchdir = None
+    os.setuid = None
+    os.fork = None
+    os.forkpty = None
+    os.killpg = None
+    os.rename = None
+    os.renames = None
+    os.truncate = None
+    os.replace = None
+    os.unlink = None
+    os.fchmod = None
+    os.fchown = None
+    os.chmod = None
+    os.chown = None
+    os.chroot = None
+    os.fchdir = None
+    os.lchflags = None
+    os.lchmod = None
+    os.lchown = None
+    os.getcwd = None
+    os.chdir = None
+
+    import shutil
+
+    shutil.rmtree = None
+    shutil.move = None
+    shutil.chown = None
+
+    import subprocess
+
+    subprocess.Popen = None  # type: ignore
+
+    __builtins__['help'] = None
+
+    import sys
+
+    sys.modules['ipdb'] = None
+    sys.modules['joblib'] = None
+    sys.modules['resource'] = None
+    sys.modules['psutil'] = None
+    sys.modules['tkinter'] = None
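For context, codeexecute_check_correctness can be exercised on its own; a small sketch with a made-up check program follows (the timeout relies on SIGALRM/setitimer, so this assumes a POSIX system):

from evalscope.benchmarks.live_code_bench.execute_utils import BASE_IMPORTS, codeexecute_check_correctness

# Assemble a self-contained program: the permissive BASE_IMPORTS preamble,
# a candidate solution, and an assert acting as the test. It is exec'd in a
# child process with reliability_guard() applied and a wall-clock limit, and
# the call returns True only if that process reports 'passed'.
candidate = 'def add(a, b):\n    return a + b\n'
check_program = BASE_IMPORTS + candidate + 'assert add(2, 3) == 5\n'

print(codeexecute_check_correctness(check_program, timeout=3))  # expected: True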
evalscope/benchmarks/live_code_bench/extract_utils.py
@@ -0,0 +1,70 @@
+# Copyright LiveCodeBench @ 2024,
+
+import re
+
+
+def extract_code_generation(model_output: str, model_type: str = 'chat'):
+    # modified from
+    outputlines = model_output.split('\n')
+    # TODO: handle codellama
+
+    if model_type == 'base':
+        return model_output.strip()
+    elif model_type == 'chat':
+        indexlines = [i for i, line in enumerate(outputlines) if '```' in line]
+    else:
+        raise ValueError(f'Invalid mode type: {model_type}')
+
+    if len(indexlines) < 2:
+        return ''
+    return '\n'.join(outputlines[indexlines[0] + 1:indexlines[1]])
+
+
+def extract_code_execution(model_output: str, cot: bool = False):
+    pattern = r'\[PYTHON\](.*?)\[\/PYTHON\]'
+    matches = re.findall(pattern, model_output, re.DOTALL)
+    if matches:
+        # fetch the last one
+        model_output = matches[-1]
+
+    if '[PYTHON]' in model_output:
+        model_output
+    if cot:
+        if '[ANSWER]' in model_output:
+            model_output = model_output.split('[ANSWER]')[1].strip()
+    if '==' in model_output:
+        model_output = model_output.split('==')[1].strip()
+    if '[/ANSWER]' in model_output:
+        model_output = model_output.split('[/ANSWER]')[0].strip()
+    else:
+        model_output = model_output.split('\n')[0].strip()
+    return model_output.strip()
+
+
+def extract_test_output_code(model_output: str):
+    outputlines = model_output.split('\n')
+    # find the last line startwith assert...
+    indexlines = [i for i, line in enumerate(outputlines) if line.startswith('assert')]
+    if indexlines:
+        return outputlines[indexlines[-1]]
+
+    # TODO: handle codellama format
+    # if lmstyle and lmstyle == LMStyle.CodeLLaMaInstruct:
+    #     indexlines = \
+    #     [i for i, line in enumerate(outputlines) if "PYTHON]" in line]
+    # else:
+
+    # first try to extract ```python if not then try ```
+    indexlines = [i for i, line in enumerate(outputlines) if '```python' in line or '```Python' in line]
+    if indexlines:
+        start_index = indexlines[0]
+    else:
+        start_index = None
+    indexlines = [i for i, line in enumerate(outputlines) if '```' in line]
+    if start_index is not None:
+        indexlines = [i for i in indexlines if i > start_index]
+        indexlines = [start_index] + indexlines
+
+    if len(indexlines) < 2:
+        return ''
+    return '\n'.join(outputlines[indexlines[0] + 1:indexlines[1]])
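extract_code_generation above keeps only the text between the first two fence lines of a chat model's reply; a small illustration with made-up model output:

from evalscope.benchmarks.live_code_bench.extract_utils import extract_code_generation

# Hypothetical chat-model reply: prose wrapped around one fenced code block.
model_output = (
    'Here is my solution:\n'
    '```python\n'
    'print("hello")\n'
    '```\n'
    'Hope this helps!'
)

# 'chat' mode returns the lines between the first two fences; fewer than two
# fences yields '', and 'base' mode returns the whole output stripped.
print(extract_code_generation(model_output, model_type='chat'))  # -> print("hello")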
evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py
@@ -0,0 +1,90 @@
+from tqdm import tqdm
+
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@Benchmark.register(
+    name='live_code_bench',
+    pretty_name='Live Code Bench',
+    dataset_id='AI-ModelScope/code_generation_lite',
+    subset_list=['release_latest'],
+    metric_list=['Pass@1'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test',
+    extra_params={
+        'start_date': None,
+        'end_date': None,
+        'num_process_evaluate': 1,
+        'timeout': 6
+    },
+    system_prompt=
+    'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.',  # noqa: E501
+    prompt_template=
+    '### Question:\n{question_content}\n\n{format_prompt} ### Answer: (use the provided format with backticks)\n\n',  # noqa: E501
+)
+class LiveCodeBenchAdapter(DataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        extra_params = kwargs.get('extra_params', {})
+
+        self.num_process_evaluate = extra_params.get('num_process_evaluate', 1)
+        self.timeout = extra_params.get('timeout', 6)
+        self.start_date = extra_params.get('start_date')
+        self.end_date = extra_params.get('end_date')
+
+    def load(self, **kwargs) -> dict:
+        from .load_utils import filter_date, transform
+
+        # Note: need trust_remote_code=True to load the python script
+        dataset_dict = super().load(trust_remote_code=True, **kwargs)
+        new_dataset_dict = {}
+        for subset_key, dataset in dataset_dict.items():
+            datasets = dataset[self.eval_split]
+            filtered_datasets = filter_date(datasets, start_date=self.start_date, end_date=self.end_date)
+
+            transformed_datasets = [transform(item) for item in tqdm(filtered_datasets, desc='Transforming data')]
+            new_dataset_dict[subset_key] = {self.eval_split: transformed_datasets}
+        return new_dataset_dict
+
+    def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
+        """
+        Generate the prompt for the model input.
+        """
+        format_prompt = input_d['format_prompt']
+        question_content = input_d['question_content']
+        full_prompt = self.prompt_template.format(question_content=question_content, format_prompt=format_prompt)
+
+        return self.gen_prompt_data(full_prompt)
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        # Extract the gold answer from the input dict.
+        return input_d
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+        """
+        Parse the model output to get the answer. Could be the best choice index.
+        """
+        return result
+
+    def match(self, gold: dict, pred: str) -> float:
+        from .evaluate_utils import codegen_metrics
+        from .extract_utils import extract_code_generation
+
+        ext_pred = extract_code_generation(pred)
+
+        references = [{'input_output': gold['evaluation_sample']}]
+        predictions = [[ext_pred]]
+        metrics, eval_results, final_metadata = codegen_metrics(
+            references,
+            predictions,
+            k_list=[1],
+            num_process_evaluate=self.num_process_evaluate,
+            timeout=self.timeout,
+        )
+        return metrics['pass@1'] / 100  # convert to point scale
evalscope/benchmarks/live_code_bench/load_utils.py
@@ -0,0 +1,71 @@
+import base64
+import json
+import pickle
+import zlib
+from datetime import datetime
+
+from evalscope.benchmarks.live_code_bench.prompts import CodeGenerationPromptConstants
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+def transform(item):
+    # Define the dataitem mapping logic
+
+    # starter_code
+    if item['starter_code']:
+        format_prompt = f'### Format: {CodeGenerationPromptConstants.FORMATTING_MESSAGE_WITH_STARTER_CODE}\n'  # noqa: E501
+        format_prompt += f"```python\n{item['starter_code']}\n```\n\n"
+    else:
+        format_prompt = f'### Format: {CodeGenerationPromptConstants.FORMATTING_WITHOUT_STARTER_CODE}\n'  # noqa: E501
+        format_prompt += '```python\n# YOUR CODE HERE\n```\n\n'
+
+    item['format_prompt'] = format_prompt
+
+    # load test cases
+    public_test_cases = item['public_test_cases']
+    public_test_cases = json.loads(item['public_test_cases'])
+
+    private_test_cases = item['private_test_cases']
+    try:
+        private_test_cases = json.loads(item['private_test_cases'])
+    except Exception as e:  # noqa: F841
+        private_test_cases = json.loads(
+            pickle.loads(zlib.decompress(base64.b64decode(private_test_cases.encode('utf-8'))  # type: ignore
+                                         )))  # type: ignore
+
+    # load metadata
+    metadata = json.loads(item['metadata'])
+    evaluation_sample = json.dumps({
+        'inputs': [t['input'] for t in public_test_cases + private_test_cases],
+        'outputs': [t['output'] for t in public_test_cases + private_test_cases],
+        'fn_name': metadata.get('func_name', None),
+    })
+    item['evaluation_sample'] = evaluation_sample
+
+    return item
+
+
+def filter_date(dataset, start_date=None, end_date=None):
+    new_dataset = []
+
+    for item in dataset:
+        contest_date = datetime.fromisoformat(item['contest_date'])
+        if start_date is not None:
+            p_start_date = datetime.strptime(start_date, '%Y-%m-%d')
+            if p_start_date > contest_date:
+                continue
+
+        if end_date is not None:
+            p_end_date = datetime.strptime(end_date, '%Y-%m-%d')
+            if p_end_date < contest_date:
+                continue
+
+        new_dataset.append(item)
+
+    if start_date or end_date:
+        logger.info(
+            f'Filtered dataset with start_date: {start_date}, end_date: {end_date}, remaining items: {len(new_dataset)}'
+        )
+    return new_dataset
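As a usage note, filter_date compares each item's ISO-format contest_date against %Y-%m-%d bounds, which is how the adapter's start_date/end_date extra_params take effect; a small sketch with made-up records:

from evalscope.benchmarks.live_code_bench.load_utils import filter_date

# Hypothetical records carrying only the field filter_date inspects.
dataset = [
    {'contest_date': '2024-07-15T00:00:00'},
    {'contest_date': '2024-12-01T00:00:00'},
]

# Keep problems whose contest date is on or after 2024-10-01.
recent = filter_date(dataset, start_date='2024-10-01')
print(len(recent))  # -> 1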