evalscope 0.13.1__py3-none-any.whl → 0.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic. Click here for more details.
- evalscope/arguments.py +1 -1
- evalscope/backend/rag_eval/__init__.py +1 -1
- evalscope/backend/rag_eval/backend_manager.py +21 -5
- evalscope/backend/rag_eval/cmteb/arguments.py +10 -0
- evalscope/backend/rag_eval/ragas/arguments.py +0 -1
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +7 -2
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -5
- evalscope/backend/rag_eval/utils/embedding.py +49 -3
- evalscope/backend/rag_eval/utils/llm.py +8 -9
- evalscope/backend/vlm_eval_kit/backend_manager.py +4 -2
- evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +109 -0
- evalscope/benchmarks/arc/arc_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/__init__.py +0 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +120 -0
- evalscope/benchmarks/arena_hard/utils.py +162 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +2 -5
- evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -1
- evalscope/benchmarks/data_adapter.py +30 -2
- evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +6 -12
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
- evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -5
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +1 -3
- evalscope/benchmarks/live_code_bench/testing_util.py +365 -549
- evalscope/benchmarks/maritime_bench/__init__.py +0 -0
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +79 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +5 -7
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +182 -0
- evalscope/benchmarks/musr/musr_adapter.py +1 -1
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +2 -5
- evalscope/collections/evaluator.py +4 -2
- evalscope/config.py +2 -2
- evalscope/metrics/llm_judge.py +1 -1
- evalscope/models/chat_adapter.py +32 -11
- evalscope/perf/arguments.py +30 -9
- evalscope/perf/benchmark.py +57 -103
- evalscope/perf/http_client.py +2 -3
- evalscope/perf/plugin/api/custom_api.py +1 -1
- evalscope/perf/plugin/api/openai_api.py +4 -2
- evalscope/perf/plugin/datasets/custom.py +4 -1
- evalscope/perf/plugin/datasets/line_by_line.py +4 -1
- evalscope/perf/plugin/datasets/longalpaca.py +4 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -1
- evalscope/perf/plugin/datasets/random_dataset.py +13 -6
- evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
- evalscope/perf/utils/benchmark_util.py +12 -6
- evalscope/perf/utils/db_util.py +3 -3
- evalscope/perf/utils/log_utils.py +41 -0
- evalscope/report/app.py +11 -11
- evalscope/run.py +7 -0
- evalscope/summarizer.py +2 -1
- evalscope/utils/utils.py +36 -25
- evalscope/version.py +2 -2
- {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/METADATA +21 -55
- {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/RECORD +70 -62
- tests/cli/test_all.py +36 -27
- tests/cli/test_collection.py +2 -1
- tests/cli/test_run.py +38 -20
- tests/perf/test_perf.py +1 -2
- tests/rag/test_clip_benchmark.py +0 -1
- tests/rag/test_mteb.py +37 -8
- tests/rag/test_ragas.py +33 -27
- tests/vlm/test_vlmeval.py +37 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
- evalscope/benchmarks/live_code_bench/execute_utils.py +0 -267
- {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/LICENSE +0 -0
- {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/WHEEL +0 -0
- {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/top_level.txt +0 -0
|
@@ -1,267 +0,0 @@
|
|
|
1
|
-
# Copyright 2020 The HuggingFace Datasets Authors and the
|
|
2
|
-
# current dataset script contributor.
|
|
3
|
-
#
|
|
4
|
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
-
# you may not use this file except in compliance with the License.
|
|
6
|
-
# You may obtain a copy of the License at
|
|
7
|
-
#
|
|
8
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
-
#
|
|
10
|
-
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
-
# See the License for the specific language governing permissions and
|
|
14
|
-
# limitations under the License.
|
|
15
|
-
|
|
16
|
-
# This code is adapted from OpenAI's release
|
|
17
|
-
# https://github.com/openai/human-eval/blob/master/human_eval/execution.py
|
|
18
|
-
|
|
19
|
-
import contextlib
|
|
20
|
-
import faulthandler
|
|
21
|
-
import io
|
|
22
|
-
import multiprocessing
|
|
23
|
-
import os
|
|
24
|
-
import platform
|
|
25
|
-
import signal
|
|
26
|
-
import tempfile
|
|
27
|
-
|
|
28
|
-
BASE_IMPORTS = """from itertools import accumulate, chain, combinations, count, permutations, product, groupby, islice, repeat
|
|
29
|
-
from copy import deepcopy
|
|
30
|
-
from string import ascii_lowercase
|
|
31
|
-
from math import floor, log2, log10, sqrt, comb, gcd, ceil, inf, isqrt
|
|
32
|
-
from collections import defaultdict, deque, Counter
|
|
33
|
-
from bisect import bisect, bisect_left, bisect_right, insort
|
|
34
|
-
from heapq import heappush, heappop, heapify, merge
|
|
35
|
-
from functools import reduce, cache, lru_cache
|
|
36
|
-
from random import randrange, shuffle
|
|
37
|
-
from operator import itemgetter, sub
|
|
38
|
-
from re import search as re_search # Assuming 're' refers to a regex search
|
|
39
|
-
from os.path import commonprefix
|
|
40
|
-
from typing import List, Tuple, Dict, Set, Optional, Union, Any, Callable, Iterable, Iterator, Generator
|
|
41
|
-
import copy
|
|
42
|
-
import string
|
|
43
|
-
import math
|
|
44
|
-
import collections
|
|
45
|
-
import bisect
|
|
46
|
-
import heapq
|
|
47
|
-
import functools
|
|
48
|
-
import random
|
|
49
|
-
import itertools
|
|
50
|
-
import operator
|
|
51
|
-
import re
|
|
52
|
-
import numpy as np
|
|
53
|
-
import pandas as pd
|
|
54
|
-
from math import log, prod # 'log' and 'prod' are functions in the math module
|
|
55
|
-
from collections import deque, defaultdict, Counter, OrderedDict
|
|
56
|
-
from itertools import accumulate, permutations, combinations, product, groupby, islice, chain, repeat, zip_longest, cycle
|
|
57
|
-
from functools import lru_cache, reduce, partial
|
|
58
|
-
# from sortedcontainers import SortedList, SortedDict, SortedSet
|
|
59
|
-
# import sortedcontainers
|
|
60
|
-
from operator import iand
|
|
61
|
-
import sys
|
|
62
|
-
""" # noqa: E501
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
def codeexecute_check_correctness(check_program, timeout=3):
|
|
66
|
-
"""Evaluates the functional correctness of a completion by running the test
|
|
67
|
-
suite provided in the problem.
|
|
68
|
-
|
|
69
|
-
:param completion_id: an optional completion ID so we can match
|
|
70
|
-
the results later even if execution finishes asynchronously.
|
|
71
|
-
"""
|
|
72
|
-
manager = multiprocessing.Manager()
|
|
73
|
-
result = manager.list()
|
|
74
|
-
|
|
75
|
-
p = multiprocessing.Process(target=unsafe_execute, args=(check_program, result, timeout))
|
|
76
|
-
p.start()
|
|
77
|
-
p.join(timeout=timeout + 1)
|
|
78
|
-
if p.is_alive():
|
|
79
|
-
p.kill()
|
|
80
|
-
|
|
81
|
-
if not result:
|
|
82
|
-
result.append('timed out')
|
|
83
|
-
|
|
84
|
-
return result[0] == 'passed'
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
def unsafe_execute(check_program, result, timeout):
|
|
88
|
-
|
|
89
|
-
with create_tempdir():
|
|
90
|
-
|
|
91
|
-
# These system calls are needed when cleaning up tempdir.
|
|
92
|
-
import os
|
|
93
|
-
import shutil
|
|
94
|
-
|
|
95
|
-
rmtree = shutil.rmtree
|
|
96
|
-
rmdir = os.rmdir
|
|
97
|
-
chdir = os.chdir
|
|
98
|
-
|
|
99
|
-
# Disable functionalities that can make destructive changes
|
|
100
|
-
# to the test.
|
|
101
|
-
reliability_guard()
|
|
102
|
-
|
|
103
|
-
# Run program.
|
|
104
|
-
try:
|
|
105
|
-
exec_globals = {}
|
|
106
|
-
with swallow_io():
|
|
107
|
-
with time_limit(timeout):
|
|
108
|
-
exec(check_program, exec_globals)
|
|
109
|
-
result.append('passed')
|
|
110
|
-
except TimeoutException:
|
|
111
|
-
result.append('timed out')
|
|
112
|
-
except BaseException as e:
|
|
113
|
-
result.append(f'failed: {e}')
|
|
114
|
-
|
|
115
|
-
# Needed for cleaning up.
|
|
116
|
-
shutil.rmtree = rmtree
|
|
117
|
-
os.rmdir = rmdir
|
|
118
|
-
os.chdir = chdir
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
@contextlib.contextmanager
|
|
122
|
-
def time_limit(seconds):
|
|
123
|
-
|
|
124
|
-
def signal_handler(signum, frame):
|
|
125
|
-
raise TimeoutException('Timed out!')
|
|
126
|
-
|
|
127
|
-
signal.setitimer(signal.ITIMER_REAL, seconds)
|
|
128
|
-
signal.signal(signal.SIGALRM, signal_handler)
|
|
129
|
-
try:
|
|
130
|
-
yield
|
|
131
|
-
finally:
|
|
132
|
-
signal.setitimer(signal.ITIMER_REAL, 0)
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
@contextlib.contextmanager
|
|
136
|
-
def swallow_io():
|
|
137
|
-
stream = WriteOnlyStringIO()
|
|
138
|
-
with contextlib.redirect_stdout(stream):
|
|
139
|
-
with contextlib.redirect_stderr(stream):
|
|
140
|
-
with redirect_stdin(stream):
|
|
141
|
-
yield
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
@contextlib.contextmanager
|
|
145
|
-
def create_tempdir():
|
|
146
|
-
with tempfile.TemporaryDirectory() as dirname:
|
|
147
|
-
with chdir(dirname):
|
|
148
|
-
yield dirname
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
class TimeoutException(Exception):
|
|
152
|
-
pass
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
class WriteOnlyStringIO(io.StringIO):
|
|
156
|
-
"""StringIO that throws an exception when it's read from."""
|
|
157
|
-
|
|
158
|
-
def read(self, *args, **kwargs):
|
|
159
|
-
raise OSError
|
|
160
|
-
|
|
161
|
-
def readline(self, *args, **kwargs):
|
|
162
|
-
raise OSError
|
|
163
|
-
|
|
164
|
-
def readlines(self, *args, **kwargs):
|
|
165
|
-
raise OSError
|
|
166
|
-
|
|
167
|
-
def readable(self, *args, **kwargs):
|
|
168
|
-
"""Returns True if the IO object can be read."""
|
|
169
|
-
return False
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
class redirect_stdin(contextlib._RedirectStream): # type: ignore
|
|
173
|
-
_stream = 'stdin'
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
@contextlib.contextmanager
|
|
177
|
-
def chdir(root):
|
|
178
|
-
if root == '.':
|
|
179
|
-
yield
|
|
180
|
-
return
|
|
181
|
-
cwd = os.getcwd()
|
|
182
|
-
os.chdir(root)
|
|
183
|
-
try:
|
|
184
|
-
yield
|
|
185
|
-
except BaseException as exc:
|
|
186
|
-
raise exc
|
|
187
|
-
finally:
|
|
188
|
-
os.chdir(cwd)
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
def reliability_guard(maximum_memory_bytes=None):
|
|
192
|
-
"""This disables various destructive functions and prevents the generated
|
|
193
|
-
code from interfering with the test (e.g. fork bomb, killing other
|
|
194
|
-
processes, removing filesystem files, etc.)
|
|
195
|
-
|
|
196
|
-
WARNING This function is NOT a security sandbox. Untrusted code, including,
|
|
197
|
-
model- generated code, should not be blindly executed outside of one. See
|
|
198
|
-
the Codex paper for more information about OpenAI's code sandbox, and
|
|
199
|
-
proceed with caution.
|
|
200
|
-
"""
|
|
201
|
-
|
|
202
|
-
if maximum_memory_bytes is not None:
|
|
203
|
-
import resource
|
|
204
|
-
|
|
205
|
-
resource.setrlimit(resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes))
|
|
206
|
-
resource.setrlimit(resource.RLIMIT_DATA, (maximum_memory_bytes, maximum_memory_bytes))
|
|
207
|
-
if not platform.uname().system == 'Darwin':
|
|
208
|
-
resource.setrlimit(resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes))
|
|
209
|
-
|
|
210
|
-
faulthandler.disable()
|
|
211
|
-
|
|
212
|
-
import builtins
|
|
213
|
-
|
|
214
|
-
builtins.exit = None
|
|
215
|
-
builtins.quit = None
|
|
216
|
-
|
|
217
|
-
import os
|
|
218
|
-
|
|
219
|
-
os.environ['OMP_NUM_THREADS'] = '1'
|
|
220
|
-
|
|
221
|
-
os.kill = None
|
|
222
|
-
os.system = None
|
|
223
|
-
os.putenv = None
|
|
224
|
-
os.remove = None
|
|
225
|
-
os.removedirs = None
|
|
226
|
-
os.rmdir = None
|
|
227
|
-
os.fchdir = None
|
|
228
|
-
os.setuid = None
|
|
229
|
-
os.fork = None
|
|
230
|
-
os.forkpty = None
|
|
231
|
-
os.killpg = None
|
|
232
|
-
os.rename = None
|
|
233
|
-
os.renames = None
|
|
234
|
-
os.truncate = None
|
|
235
|
-
os.replace = None
|
|
236
|
-
os.unlink = None
|
|
237
|
-
os.fchmod = None
|
|
238
|
-
os.fchown = None
|
|
239
|
-
os.chmod = None
|
|
240
|
-
os.chown = None
|
|
241
|
-
os.chroot = None
|
|
242
|
-
os.fchdir = None
|
|
243
|
-
os.lchflags = None
|
|
244
|
-
os.lchmod = None
|
|
245
|
-
os.lchown = None
|
|
246
|
-
os.getcwd = None
|
|
247
|
-
os.chdir = None
|
|
248
|
-
|
|
249
|
-
import shutil
|
|
250
|
-
|
|
251
|
-
shutil.rmtree = None
|
|
252
|
-
shutil.move = None
|
|
253
|
-
shutil.chown = None
|
|
254
|
-
|
|
255
|
-
import subprocess
|
|
256
|
-
|
|
257
|
-
subprocess.Popen = None # type: ignore
|
|
258
|
-
|
|
259
|
-
__builtins__['help'] = None
|
|
260
|
-
|
|
261
|
-
import sys
|
|
262
|
-
|
|
263
|
-
sys.modules['ipdb'] = None
|
|
264
|
-
sys.modules['joblib'] = None
|
|
265
|
-
sys.modules['resource'] = None
|
|
266
|
-
sys.modules['psutil'] = None
|
|
267
|
-
sys.modules['tkinter'] = None
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|