evalscope 0.12.1__py3-none-any.whl → 0.13.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic.

Files changed (50)
  1. evalscope/arguments.py +6 -1
  2. evalscope/benchmarks/arc/arc_adapter.py +3 -3
  3. evalscope/benchmarks/benchmark.py +3 -2
  4. evalscope/benchmarks/ceval/ceval_adapter.py +2 -1
  5. evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
  6. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +168 -0
  7. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +2 -1
  8. evalscope/benchmarks/data_adapter.py +32 -4
  9. evalscope/benchmarks/general_qa/general_qa_adapter.py +5 -4
  10. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +20 -24
  11. evalscope/benchmarks/humaneval/humaneval_adapter.py +8 -5
  12. evalscope/benchmarks/live_code_bench/__init__.py +0 -0
  13. evalscope/benchmarks/live_code_bench/evaluate_utils.py +193 -0
  14. evalscope/benchmarks/live_code_bench/execute_utils.py +267 -0
  15. evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
  16. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +90 -0
  17. evalscope/benchmarks/live_code_bench/load_utils.py +71 -0
  18. evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
  19. evalscope/benchmarks/live_code_bench/prompts.py +207 -0
  20. evalscope/benchmarks/live_code_bench/testing_util.py +721 -0
  21. evalscope/benchmarks/mmlu/mmlu_adapter.py +3 -2
  22. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +148 -1
  23. evalscope/benchmarks/super_gpqa/utils.py +0 -5
  24. evalscope/collections/evaluator.py +4 -4
  25. evalscope/config.py +11 -3
  26. evalscope/constants.py +8 -0
  27. evalscope/evaluator/evaluator.py +56 -17
  28. evalscope/metrics/llm_judge.py +104 -0
  29. evalscope/models/custom_adapter.py +1 -1
  30. evalscope/perf/arguments.py +11 -40
  31. evalscope/perf/benchmark.py +39 -28
  32. evalscope/perf/http_client.py +9 -1
  33. evalscope/perf/main.py +2 -1
  34. evalscope/perf/plugin/datasets/__init__.py +1 -0
  35. evalscope/perf/plugin/datasets/openqa.py +6 -11
  36. evalscope/perf/plugin/datasets/random_dataset.py +51 -0
  37. evalscope/perf/utils/db_util.py +3 -0
  38. evalscope/run.py +15 -3
  39. evalscope/third_party/longbench_write/infer.py +1 -1
  40. evalscope/version.py +2 -2
  41. {evalscope-0.12.1.dist-info → evalscope-0.13.1.dist-info}/METADATA +56 -38
  42. {evalscope-0.12.1.dist-info → evalscope-0.13.1.dist-info}/RECORD +50 -36
  43. tests/cli/test_all.py +144 -0
  44. tests/cli/test_collection.py +27 -1
  45. tests/cli/test_run.py +103 -11
  46. tests/perf/test_perf.py +23 -0
  47. {evalscope-0.12.1.dist-info → evalscope-0.13.1.dist-info}/LICENSE +0 -0
  48. {evalscope-0.12.1.dist-info → evalscope-0.13.1.dist-info}/WHEEL +0 -0
  49. {evalscope-0.12.1.dist-info → evalscope-0.13.1.dist-info}/entry_points.txt +0 -0
  50. {evalscope-0.12.1.dist-info → evalscope-0.13.1.dist-info}/top_level.txt +0 -0

evalscope/benchmarks/live_code_bench/evaluate_utils.py (new file)
@@ -0,0 +1,193 @@
+ import json
+ import multiprocessing
+ import numpy as np
+ from collections import defaultdict
+ from concurrent.futures import ProcessPoolExecutor, as_completed
+
+ from evalscope.utils.logger import get_logger
+ from .pass_k_utils import compute_metrics_from_results
+
+ logger = get_logger()
+
+
+ def codegen_check_correctness(sample, generation, timeout, debug=True):
+     """Check correctness of code generation with a global timeout.
+
+     The global timeout is to catch some extreme/rare cases not handled by the
+     timeouts inside `run_test`
+     """
+
+     def _temp_run(sample, generation, debug, result, metadata_list, timeout):
+         from .testing_util import run_test
+         res, metadata = run_test(sample, test=generation, debug=debug, timeout=timeout)
+         result.append(res)
+         metadata_list.append(metadata)
+
+     manager = multiprocessing.Manager()
+     result = manager.list()
+     metadata_list = manager.list()
+     p = multiprocessing.Process(
+         target=_temp_run,
+         args=(sample, generation, debug, result, metadata_list, timeout),
+     )
+     p.start()
+     p.join(timeout=(timeout + 1) * len(json.loads(sample['input_output'])['inputs']) + 5)
+     if p.is_alive():
+         p.kill()
+     if not result:
+         in_outs = json.loads(sample['input_output'])
+         # consider that all tests failed
+         result = [[-1 for i in range(len(in_outs['inputs']))]]
+         if debug:
+             logger.info('global timeout')
+     return result[0], metadata_list[0]
+
+
+ def evaluate_generations_by_problem(problem_generations: list, sample: list, debug: bool, timeout: int):
+     """Evaluate each problem.
+
+     Args:
+         problem_generations:
+         sample:
+         debug:
+         timeout
+     """
+     # problem_generations: list[str] = args[0]
+     # sample = args[1]
+     # debug: bool = args[2]
+     # timeout: int = args[3]
+
+     res = []
+     metadata = []
+     for o_idx, o in enumerate(problem_generations):
+         curr_res = [-2]
+         try:
+             curr_res, curr_metadata = codegen_check_correctness(sample, o, timeout=timeout, debug=debug)
+             if debug:
+                 logger.info(f'\nSuccessful compilation of task {o_idx}!')
+             fixed = []
+             for e in curr_res:
+                 if isinstance(e, np.ndarray):
+                     e = e.item(0)
+                 if isinstance(e, np.bool_):
+                     e = bool(e)
+                 fixed.append(e)
+             curr_res = fixed
+             if not np.all(curr_res):
+                 if debug:
+                     logger.info(f'Results were not True for all test cases' # noqa: F541, E501
+                                 f' {curr_res=}\n')
+         except Exception as e:
+             if debug:
+                 logger.info(f'Compilation failed, test framework exception' # noqa: F541, E501
+                             f' = {repr(e)}{e}\n')
+             # break
+             curr_metadata = {}
+         finally:
+             assert isinstance(curr_res, list)
+             assert isinstance(curr_metadata, dict)
+             res.append(curr_res)
+             metadata.append(curr_metadata)
+     if debug:
+         for i, r in enumerate(problem_generations):
+             logger.info(f'Sample\n{r}\nResult\n{res[i]}')
+             logger.info('*' * 30 + '\n\n')
+     return res, metadata
+
+
+ def evaluate_generations(
+     samples_list: list,
+     generations_list: list[list[str]],
+     debug: bool = False,
+     num_process_evaluate: int = 16,
+     timeout=6,
+ ):
+     """We take the list of code generations and try to compile them and the run
+     their corresponding unit tests which are retrieved from the APPS dataset.
+
+     Args:
+         generations: list of code generations (same order as samples in APPS
+             dataset)
+         level: difficulty level used in the generation, can be "all",
+             "introductory", "interview" or "competition"
+
+     Returns:
+         results: dictionary of results, key is the problem index, value is
+             a list of results for each generation
+             [-2] = compile error, [-1] = runtime error [False] = failed test
+             case [True] = passed test case
+     """
+
+     # generations are code generations in the same order of the dataset
+
+     inputs = [[(generations_list[index], samples_list[index], debug, timeout), index]
+               for index in range(len(generations_list))]
+
+     with ProcessPoolExecutor(max_workers=1 if debug else num_process_evaluate) as executor:
+         futures = {
+             executor.submit(evaluate_generations_by_problem, problem_generations, sample, debug, timeout): index
+             for (problem_generations, sample, debug, timeout), index in inputs
+         }
+
+         results = {}
+         metadata = {}
+         for future in as_completed(futures):
+             index = futures[future]
+             results[index], metadata[index] = future.result()
+
+     assert len(results) == len(inputs), f'results = {len(results)} inputs = {len(inputs)} {results=}'
+     # results = {i: r for r, (_, i) in zip(results, inputs)}
+
+     return results, metadata
+
+
+ def codegen_metrics(
+     samples_list,
+     generations_list,
+     k_list=[1, 5, 10, 20, 40, 50, 75, 100, 125, 150, 200, 500, 1000],
+     num_process_evaluate=16,
+     timeout=6,
+     debug=False,
+ ):
+
+     samples_linear = []
+     generations_linear = []
+     remap_index = []
+     results = defaultdict(list)
+     metadatas = defaultdict(list)
+     for idx, (sample, generation_list) in enumerate(zip(samples_list, generations_list)):
+         assert isinstance(generation_list, list), generations_list[0]
+         for generation in generation_list:
+             assert isinstance(generation, str), generations_list[0]
+             samples_linear.append(sample)
+             generations_linear.append([generation])
+             remap_index.append(idx)
+
+     results_linear, metadatas_linear = evaluate_generations(
+         samples_linear,
+         generations_linear,
+         debug=debug,
+         num_process_evaluate=num_process_evaluate,
+         timeout=timeout,
+     )
+
+     for idx, sub_results in sorted(results_linear.items(), key=lambda x: x[0]):
+         results[remap_index[idx]].append(sub_results[0])
+
+     for idx, sub_metadatas in sorted(metadatas_linear.items(), key=lambda x: x[0]):
+         metadatas[remap_index[idx]].append(sub_metadatas[0])
+
+     metrics = compute_metrics_from_results(results, k_list=k_list)
+
+     final_metadata = []
+     for key in sorted(list(metadatas.keys())):
+         final_metadata.append(metadatas[key])
+     for i in range(len(final_metadata)):
+         if type(final_metadata[i]) is not list:
+             final_metadata[i] = [json.dumps(final_metadata[i])]
+         else:
+             final_metadata[i] = [json.dumps(x) for x in final_metadata[i]]
+
+         assert len(final_metadata[i]) == len(generations_list[0]), f'{len(final_metadata[i])=}'
+
+     return [metrics, results, final_metadata]
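
For orientation, a minimal sketch of how codegen_metrics above can be driven (not part of the diff). It assumes the sample dict carries an 'input_output' JSON payload with 'inputs', 'outputs' and 'fn_name' keys, which is the format produced by load_utils.py later in this release; the toy stdin/stdout test case is invented for illustration.

# Hypothetical usage of codegen_metrics; the problem and test case are made up.
import json

from evalscope.benchmarks.live_code_bench.evaluate_utils import codegen_metrics

sample = {'input_output': json.dumps({'inputs': ['1 2\n'], 'outputs': ['3\n'], 'fn_name': None})}
generation = 'import sys\na, b = map(int, sys.stdin.read().split())\nprint(a + b)'

# One problem with one candidate program, scored with pass@1 only.
metrics, results, final_metadata = codegen_metrics(
    samples_list=[sample],
    generations_list=[[generation]],
    k_list=[1],
    num_process_evaluate=1,
    timeout=6,
)
print(metrics['pass@1'], results)
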

evalscope/benchmarks/live_code_bench/execute_utils.py (new file)
@@ -0,0 +1,267 @@
+ # Copyright 2020 The HuggingFace Datasets Authors and the
+ # current dataset script contributor.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ # This code is adapted from OpenAI's release
+ # https://github.com/openai/human-eval/blob/master/human_eval/execution.py
+
+ import contextlib
+ import faulthandler
+ import io
+ import multiprocessing
+ import os
+ import platform
+ import signal
+ import tempfile
+
+ BASE_IMPORTS = """from itertools import accumulate, chain, combinations, count, permutations, product, groupby, islice, repeat
+ from copy import deepcopy
+ from string import ascii_lowercase
+ from math import floor, log2, log10, sqrt, comb, gcd, ceil, inf, isqrt
+ from collections import defaultdict, deque, Counter
+ from bisect import bisect, bisect_left, bisect_right, insort
+ from heapq import heappush, heappop, heapify, merge
+ from functools import reduce, cache, lru_cache
+ from random import randrange, shuffle
+ from operator import itemgetter, sub
+ from re import search as re_search # Assuming 're' refers to a regex search
+ from os.path import commonprefix
+ from typing import List, Tuple, Dict, Set, Optional, Union, Any, Callable, Iterable, Iterator, Generator
+ import copy
+ import string
+ import math
+ import collections
+ import bisect
+ import heapq
+ import functools
+ import random
+ import itertools
+ import operator
+ import re
+ import numpy as np
+ import pandas as pd
+ from math import log, prod # 'log' and 'prod' are functions in the math module
+ from collections import deque, defaultdict, Counter, OrderedDict
+ from itertools import accumulate, permutations, combinations, product, groupby, islice, chain, repeat, zip_longest, cycle
+ from functools import lru_cache, reduce, partial
+ # from sortedcontainers import SortedList, SortedDict, SortedSet
+ # import sortedcontainers
+ from operator import iand
+ import sys
+ """ # noqa: E501
+
+
+ def codeexecute_check_correctness(check_program, timeout=3):
+     """Evaluates the functional correctness of a completion by running the test
+     suite provided in the problem.
+
+     :param completion_id: an optional completion ID so we can match
+         the results later even if execution finishes asynchronously.
+     """
+     manager = multiprocessing.Manager()
+     result = manager.list()
+
+     p = multiprocessing.Process(target=unsafe_execute, args=(check_program, result, timeout))
+     p.start()
+     p.join(timeout=timeout + 1)
+     if p.is_alive():
+         p.kill()
+
+     if not result:
+         result.append('timed out')
+
+     return result[0] == 'passed'
+
+
+ def unsafe_execute(check_program, result, timeout):
+
+     with create_tempdir():
+
+         # These system calls are needed when cleaning up tempdir.
+         import os
+         import shutil
+
+         rmtree = shutil.rmtree
+         rmdir = os.rmdir
+         chdir = os.chdir
+
+         # Disable functionalities that can make destructive changes
+         # to the test.
+         reliability_guard()
+
+         # Run program.
+         try:
+             exec_globals = {}
+             with swallow_io():
+                 with time_limit(timeout):
+                     exec(check_program, exec_globals)
+             result.append('passed')
+         except TimeoutException:
+             result.append('timed out')
+         except BaseException as e:
+             result.append(f'failed: {e}')
+
+         # Needed for cleaning up.
+         shutil.rmtree = rmtree
+         os.rmdir = rmdir
+         os.chdir = chdir
+
+
+ @contextlib.contextmanager
+ def time_limit(seconds):
+
+     def signal_handler(signum, frame):
+         raise TimeoutException('Timed out!')
+
+     signal.setitimer(signal.ITIMER_REAL, seconds)
+     signal.signal(signal.SIGALRM, signal_handler)
+     try:
+         yield
+     finally:
+         signal.setitimer(signal.ITIMER_REAL, 0)
+
+
+ @contextlib.contextmanager
+ def swallow_io():
+     stream = WriteOnlyStringIO()
+     with contextlib.redirect_stdout(stream):
+         with contextlib.redirect_stderr(stream):
+             with redirect_stdin(stream):
+                 yield
+
+
+ @contextlib.contextmanager
+ def create_tempdir():
+     with tempfile.TemporaryDirectory() as dirname:
+         with chdir(dirname):
+             yield dirname
+
+
+ class TimeoutException(Exception):
+     pass
+
+
+ class WriteOnlyStringIO(io.StringIO):
+     """StringIO that throws an exception when it's read from."""
+
+     def read(self, *args, **kwargs):
+         raise OSError
+
+     def readline(self, *args, **kwargs):
+         raise OSError
+
+     def readlines(self, *args, **kwargs):
+         raise OSError
+
+     def readable(self, *args, **kwargs):
+         """Returns True if the IO object can be read."""
+         return False
+
+
+ class redirect_stdin(contextlib._RedirectStream): # type: ignore
+     _stream = 'stdin'
+
+
+ @contextlib.contextmanager
+ def chdir(root):
+     if root == '.':
+         yield
+         return
+     cwd = os.getcwd()
+     os.chdir(root)
+     try:
+         yield
+     except BaseException as exc:
+         raise exc
+     finally:
+         os.chdir(cwd)
+
+
+ def reliability_guard(maximum_memory_bytes=None):
+     """This disables various destructive functions and prevents the generated
+     code from interfering with the test (e.g. fork bomb, killing other
+     processes, removing filesystem files, etc.)
+
+     WARNING This function is NOT a security sandbox. Untrusted code, including,
+     model- generated code, should not be blindly executed outside of one. See
+     the Codex paper for more information about OpenAI's code sandbox, and
+     proceed with caution.
+     """
+
+     if maximum_memory_bytes is not None:
+         import resource
+
+         resource.setrlimit(resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes))
+         resource.setrlimit(resource.RLIMIT_DATA, (maximum_memory_bytes, maximum_memory_bytes))
+         if not platform.uname().system == 'Darwin':
+             resource.setrlimit(resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes))
+
+     faulthandler.disable()
+
+     import builtins
+
+     builtins.exit = None
+     builtins.quit = None
+
+     import os
+
+     os.environ['OMP_NUM_THREADS'] = '1'
+
+     os.kill = None
+     os.system = None
+     os.putenv = None
+     os.remove = None
+     os.removedirs = None
+     os.rmdir = None
+     os.fchdir = None
+     os.setuid = None
+     os.fork = None
+     os.forkpty = None
+     os.killpg = None
+     os.rename = None
+     os.renames = None
+     os.truncate = None
+     os.replace = None
+     os.unlink = None
+     os.fchmod = None
+     os.fchown = None
+     os.chmod = None
+     os.chown = None
+     os.chroot = None
+     os.fchdir = None
+     os.lchflags = None
+     os.lchmod = None
+     os.lchown = None
+     os.getcwd = None
+     os.chdir = None
+
+     import shutil
+
+     shutil.rmtree = None
+     shutil.move = None
+     shutil.chown = None
+
+     import subprocess
+
+     subprocess.Popen = None # type: ignore
+
+     __builtins__['help'] = None
+
+     import sys
+
+     sys.modules['ipdb'] = None
+     sys.modules['joblib'] = None
+     sys.modules['resource'] = None
+     sys.modules['psutil'] = None
+     sys.modules['tkinter'] = None
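
A short sketch of how codeexecute_check_correctness is meant to be used (illustrative only, not part of the diff): a candidate solution and its assert-style tests are concatenated into one script and run in a guarded subprocess. BASE_IMPORTS can optionally be prepended to give the script the common competitive-programming imports, assuming numpy and pandas are installed.

# Hypothetical usage of codeexecute_check_correctness; the check program is made up.
from evalscope.benchmarks.live_code_bench.execute_utils import codeexecute_check_correctness

check_program = (
    'def add(a, b):\n'
    '    return a + b\n'
    '\n'
    'assert add(2, 3) == 5\n'
)
# Executes the script in a separate process with reliability_guard() applied and a 3-second limit.
print(codeexecute_check_correctness(check_program, timeout=3))  # -> True when all asserts pass
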

evalscope/benchmarks/live_code_bench/extract_utils.py (new file)
@@ -0,0 +1,70 @@
+ # Copyright LiveCodeBench @ 2024,
+
+ import re
+
+
+ def extract_code_generation(model_output: str, model_type: str = 'chat'):
+     # modified from
+     outputlines = model_output.split('\n')
+     # TODO: handle codellama
+
+     if model_type == 'base':
+         return model_output.strip()
+     elif model_type == 'chat':
+         indexlines = [i for i, line in enumerate(outputlines) if '```' in line]
+     else:
+         raise ValueError(f'Invalid mode type: {model_type}')
+
+     if len(indexlines) < 2:
+         return ''
+     return '\n'.join(outputlines[indexlines[0] + 1:indexlines[1]])
+
+
+ def extract_code_execution(model_output: str, cot: bool = False):
+     pattern = r'\[PYTHON\](.*?)\[\/PYTHON\]'
+     matches = re.findall(pattern, model_output, re.DOTALL)
+     if matches:
+         # fetch the last one
+         model_output = matches[-1]
+
+     if '[PYTHON]' in model_output:
+         model_output
+     if cot:
+         if '[ANSWER]' in model_output:
+             model_output = model_output.split('[ANSWER]')[1].strip()
+     if '==' in model_output:
+         model_output = model_output.split('==')[1].strip()
+     if '[/ANSWER]' in model_output:
+         model_output = model_output.split('[/ANSWER]')[0].strip()
+     else:
+         model_output = model_output.split('\n')[0].strip()
+     return model_output.strip()
+
+
+ def extract_test_output_code(model_output: str):
+     outputlines = model_output.split('\n')
+     # find the last line startwith assert...
+     indexlines = [i for i, line in enumerate(outputlines) if line.startswith('assert')]
+     if indexlines:
+         return outputlines[indexlines[-1]]
+
+     # TODO: handle codellama format
+     # if lmstyle and lmstyle == LMStyle.CodeLLaMaInstruct:
+     #     indexlines = \
+     #         [i for i, line in enumerate(outputlines) if "PYTHON]" in line]
+     # else:
+
+     # first try to extract ```python if not then try ```
+     indexlines = [i for i, line in enumerate(outputlines) if '```python' in line or '```Python' in line]
+     if indexlines:
+         start_index = indexlines[0]
+     else:
+         start_index = None
+     indexlines = [i for i, line in enumerate(outputlines) if '```' in line]
+     if start_index is not None:
+         indexlines = [i for i in indexlines if i > start_index]
+         indexlines = [start_index] + indexlines
+
+     if len(indexlines) < 2:
+         return ''
+     return '\n'.join(outputlines[indexlines[0] + 1:indexlines[1]])
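
The extractors above operate only on raw model text; a quick sketch (illustrative, not part of the diff) of extract_code_generation pulling the code out of the first fenced block in a chat-style completion:

# Hypothetical usage of extract_code_generation; the model output is made up.
from evalscope.benchmarks.live_code_bench.extract_utils import extract_code_generation

chat_output = 'Here is my solution:\n```python\nprint("hello")\n```\nHope this helps!'
# Returns the text between the first two lines containing ``` (empty string if fewer than two).
print(extract_code_generation(chat_output, model_type='chat'))  # -> 'print("hello")'
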

evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py (new file)
@@ -0,0 +1,90 @@
+ from tqdm import tqdm
+
+ from evalscope.benchmarks import Benchmark, DataAdapter
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+
+ @Benchmark.register(
+     name='live_code_bench',
+     pretty_name='Live Code Bench',
+     dataset_id='AI-ModelScope/code_generation_lite',
+     subset_list=['release_latest'],
+     metric_list=['Pass@1'],
+     few_shot_num=0,
+     train_split=None,
+     eval_split='test',
+     extra_params={
+         'start_date': None,
+         'end_date': None,
+         'num_process_evaluate': 1,
+         'timeout': 6
+     },
+     system_prompt=
+     'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.', # noqa: E501
+     prompt_template=
+     '### Question:\n{question_content}\n\n{format_prompt} ### Answer: (use the provided format with backticks)\n\n', # noqa: E501
+ )
+ class LiveCodeBenchAdapter(DataAdapter):
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+
+         extra_params = kwargs.get('extra_params', {})
+
+         self.num_process_evaluate = extra_params.get('num_process_evaluate', 1)
+         self.timeout = extra_params.get('timeout', 6)
+         self.start_date = extra_params.get('start_date')
+         self.end_date = extra_params.get('end_date')
+
+     def load(self, **kwargs) -> dict:
+         from .load_utils import filter_date, transform
+
+         # Note: need trust_remote_code=True to load the python script
+         dataset_dict = super().load(trust_remote_code=True, **kwargs)
+         new_dataset_dict = {}
+         for subset_key, dataset in dataset_dict.items():
+             datasets = dataset[self.eval_split]
+             filtered_datasets = filter_date(datasets, start_date=self.start_date, end_date=self.end_date)
+
+             transformed_datasets = [transform(item) for item in tqdm(filtered_datasets, desc='Transforming data')]
+             new_dataset_dict[subset_key] = {self.eval_split: transformed_datasets}
+         return new_dataset_dict
+
+     def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
+         """
+         Generate the prompt for the model input.
+         """
+         format_prompt = input_d['format_prompt']
+         question_content = input_d['question_content']
+         full_prompt = self.prompt_template.format(question_content=question_content, format_prompt=format_prompt)
+
+         return self.gen_prompt_data(full_prompt)
+
+     def get_gold_answer(self, input_d: dict) -> str:
+         # Extract the gold answer from the input dict.
+         return input_d
+
+     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+         """
+         Parse the model output to get the answer. Could be the best choice index.
+         """
+         return result
+
+     def match(self, gold: dict, pred: str) -> float:
+         from .evaluate_utils import codegen_metrics
+         from .extract_utils import extract_code_generation
+
+         ext_pred = extract_code_generation(pred)
+
+         references = [{'input_output': gold['evaluation_sample']}]
+         predictions = [[ext_pred]]
+         metrics, eval_results, final_metadata = codegen_metrics(
+             references,
+             predictions,
+             k_list=[1],
+             num_process_evaluate=self.num_process_evaluate,
+             timeout=self.timeout,
+         )
+         return metrics['pass@1'] / 100 # convert to point scale
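
A sketch of how the newly registered benchmark might be selected from a task config (not part of the diff). TaskConfig and run_task come from the evalscope/config.py and evalscope/run.py modules touched in this release; the model name is a placeholder, and the dataset_args/extra_params keys mirror the registration above but are otherwise an assumption about the config plumbing rather than something this diff confirms.

# Hypothetical task configuration; model name and dataset_args keys are assumptions.
from evalscope.config import TaskConfig
from evalscope.run import run_task

task_cfg = TaskConfig(
    model='your-model-id',          # placeholder
    datasets=['live_code_bench'],   # name registered by the adapter above
    dataset_args={
        'live_code_bench': {
            'extra_params': {
                'start_date': '2024-08-01',  # filter contest_date at load time
                'end_date': '2025-02-01',
                'timeout': 6,
                'num_process_evaluate': 1,
            },
        },
    },
)
run_task(task_cfg=task_cfg)
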

evalscope/benchmarks/live_code_bench/load_utils.py (new file)
@@ -0,0 +1,71 @@
+ import base64
+ import json
+ import pickle
+ import zlib
+ from datetime import datetime
+
+ from evalscope.benchmarks.live_code_bench.prompts import CodeGenerationPromptConstants
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+
+ def transform(item):
+     # Define the dataitem mapping logic
+
+     # starter_code
+     if item['starter_code']:
+         format_prompt = f'### Format: {CodeGenerationPromptConstants.FORMATTING_MESSAGE_WITH_STARTER_CODE}\n' # noqa: E501
+         format_prompt += f"```python\n{item['starter_code']}\n```\n\n"
+     else:
+         format_prompt = f'### Format: {CodeGenerationPromptConstants.FORMATTING_WITHOUT_STARTER_CODE}\n' # noqa: E501
+         format_prompt += '```python\n# YOUR CODE HERE\n```\n\n'
+
+     item['format_prompt'] = format_prompt
+
+     # load test cases
+     public_test_cases = item['public_test_cases']
+     public_test_cases = json.loads(item['public_test_cases'])
+
+     private_test_cases = item['private_test_cases']
+     try:
+         private_test_cases = json.loads(item['private_test_cases'])
+     except Exception as e: # noqa: F841
+         private_test_cases = json.loads(
+             pickle.loads(zlib.decompress(base64.b64decode(private_test_cases.encode('utf-8')) # type: ignore
+                                          ))) # type: ignore
+
+     # load metadata
+     metadata = json.loads(item['metadata'])
+     evaluation_sample = json.dumps({
+         'inputs': [t['input'] for t in public_test_cases + private_test_cases],
+         'outputs': [t['output'] for t in public_test_cases + private_test_cases],
+         'fn_name': metadata.get('func_name', None),
+     })
+     item['evaluation_sample'] = evaluation_sample
+
+     return item
+
+
+ def filter_date(dataset, start_date=None, end_date=None):
+     new_dataset = []
+
+     for item in dataset:
+         contest_date = datetime.fromisoformat(item['contest_date'])
+         if start_date is not None:
+             p_start_date = datetime.strptime(start_date, '%Y-%m-%d')
+             if p_start_date > contest_date:
+                 continue
+
+         if end_date is not None:
+             p_end_date = datetime.strptime(end_date, '%Y-%m-%d')
+             if p_end_date < contest_date:
+                 continue
+
+         new_dataset.append(item)
+
+     if start_date or end_date:
+         logger.info(
+             f'Filtered dataset with start_date: {start_date}, end_date: {end_date}, remaining items: {len(new_dataset)}'
+         )
+     return new_dataset
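
Finally, the date filter above works on ISO-formatted contest_date strings; a small sketch with invented values (not part of the diff):

# Hypothetical usage of filter_date; the contest_date values are made up.
from evalscope.benchmarks.live_code_bench.load_utils import filter_date

dataset = [
    {'contest_date': '2024-05-01T00:00:00'},
    {'contest_date': '2024-09-15T00:00:00'},
]
# Keeps items whose contest_date falls inside the inclusive [start_date, end_date] window.
kept = filter_date(dataset, start_date='2024-08-01', end_date='2024-12-31')
print(len(kept))  # -> 1
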