evalscope 0.13.1__py3-none-any.whl → 0.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic. Click here for more details.

Files changed (72)
  1. evalscope/arguments.py +1 -1
  2. evalscope/backend/rag_eval/__init__.py +1 -1
  3. evalscope/backend/rag_eval/backend_manager.py +21 -5
  4. evalscope/backend/rag_eval/cmteb/arguments.py +10 -0
  5. evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  6. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +7 -2
  7. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -5
  8. evalscope/backend/rag_eval/utils/embedding.py +49 -3
  9. evalscope/backend/rag_eval/utils/llm.py +8 -9
  10. evalscope/backend/vlm_eval_kit/backend_manager.py +4 -2
  11. evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
  12. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +109 -0
  13. evalscope/benchmarks/arc/arc_adapter.py +1 -1
  14. evalscope/benchmarks/arena_hard/__init__.py +0 -0
  15. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +120 -0
  16. evalscope/benchmarks/arena_hard/utils.py +162 -0
  17. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +2 -5
  18. evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -1
  19. evalscope/benchmarks/data_adapter.py +30 -2
  20. evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
  21. evalscope/benchmarks/general_qa/general_qa_adapter.py +6 -12
  22. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
  23. evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -5
  24. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +1 -3
  25. evalscope/benchmarks/live_code_bench/testing_util.py +365 -549
  26. evalscope/benchmarks/maritime_bench/__init__.py +0 -0
  27. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +79 -0
  28. evalscope/benchmarks/mmlu/mmlu_adapter.py +5 -7
  29. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  30. evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
  31. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +182 -0
  32. evalscope/benchmarks/musr/musr_adapter.py +1 -1
  33. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +2 -5
  34. evalscope/collections/evaluator.py +4 -2
  35. evalscope/config.py +2 -2
  36. evalscope/metrics/llm_judge.py +1 -1
  37. evalscope/models/chat_adapter.py +32 -11
  38. evalscope/perf/arguments.py +30 -9
  39. evalscope/perf/benchmark.py +57 -103
  40. evalscope/perf/http_client.py +2 -3
  41. evalscope/perf/plugin/api/custom_api.py +1 -1
  42. evalscope/perf/plugin/api/openai_api.py +4 -2
  43. evalscope/perf/plugin/datasets/custom.py +4 -1
  44. evalscope/perf/plugin/datasets/line_by_line.py +4 -1
  45. evalscope/perf/plugin/datasets/longalpaca.py +4 -1
  46. evalscope/perf/plugin/datasets/openqa.py +4 -1
  47. evalscope/perf/plugin/datasets/random_dataset.py +13 -6
  48. evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
  49. evalscope/perf/utils/benchmark_util.py +12 -6
  50. evalscope/perf/utils/db_util.py +3 -3
  51. evalscope/perf/utils/log_utils.py +41 -0
  52. evalscope/report/app.py +11 -11
  53. evalscope/run.py +7 -0
  54. evalscope/summarizer.py +2 -1
  55. evalscope/utils/utils.py +36 -25
  56. evalscope/version.py +2 -2
  57. {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/METADATA +21 -55
  58. {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/RECORD +70 -62
  59. tests/cli/test_all.py +36 -27
  60. tests/cli/test_collection.py +2 -1
  61. tests/cli/test_run.py +38 -20
  62. tests/perf/test_perf.py +1 -2
  63. tests/rag/test_clip_benchmark.py +0 -1
  64. tests/rag/test_mteb.py +37 -8
  65. tests/rag/test_ragas.py +33 -27
  66. tests/vlm/test_vlmeval.py +37 -1
  67. evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  68. evalscope/benchmarks/live_code_bench/execute_utils.py +0 -267
  69. {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/LICENSE +0 -0
  70. {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/WHEEL +0 -0
  71. {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/entry_points.txt +0 -0
  72. {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/top_level.txt +0 -0
@@ -1,267 +0,0 @@
1
- # Copyright 2020 The HuggingFace Datasets Authors and the
2
- # current dataset script contributor.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- # This code is adapted from OpenAI's release
17
- # https://github.com/openai/human-eval/blob/master/human_eval/execution.py
18
-
19
- import contextlib
20
- import faulthandler
21
- import io
22
- import multiprocessing
23
- import os
24
- import platform
25
- import signal
26
- import tempfile
27
-
28
- BASE_IMPORTS = """from itertools import accumulate, chain, combinations, count, permutations, product, groupby, islice, repeat
29
- from copy import deepcopy
30
- from string import ascii_lowercase
31
- from math import floor, log2, log10, sqrt, comb, gcd, ceil, inf, isqrt
32
- from collections import defaultdict, deque, Counter
33
- from bisect import bisect, bisect_left, bisect_right, insort
34
- from heapq import heappush, heappop, heapify, merge
35
- from functools import reduce, cache, lru_cache
36
- from random import randrange, shuffle
37
- from operator import itemgetter, sub
38
- from re import search as re_search # Assuming 're' refers to a regex search
39
- from os.path import commonprefix
40
- from typing import List, Tuple, Dict, Set, Optional, Union, Any, Callable, Iterable, Iterator, Generator
41
- import copy
42
- import string
43
- import math
44
- import collections
45
- import bisect
46
- import heapq
47
- import functools
48
- import random
49
- import itertools
50
- import operator
51
- import re
52
- import numpy as np
53
- import pandas as pd
54
- from math import log, prod # 'log' and 'prod' are functions in the math module
55
- from collections import deque, defaultdict, Counter, OrderedDict
56
- from itertools import accumulate, permutations, combinations, product, groupby, islice, chain, repeat, zip_longest, cycle
57
- from functools import lru_cache, reduce, partial
58
- # from sortedcontainers import SortedList, SortedDict, SortedSet
59
- # import sortedcontainers
60
- from operator import iand
61
- import sys
62
- """ # noqa: E501
63
-
64
-
65
- def codeexecute_check_correctness(check_program, timeout=3):
66
- """Evaluates the functional correctness of a completion by running the test
67
- suite provided in the problem.
68
-
69
- :param completion_id: an optional completion ID so we can match
70
- the results later even if execution finishes asynchronously.
71
- """
72
- manager = multiprocessing.Manager()
73
- result = manager.list()
74
-
75
- p = multiprocessing.Process(target=unsafe_execute, args=(check_program, result, timeout))
76
- p.start()
77
- p.join(timeout=timeout + 1)
78
- if p.is_alive():
79
- p.kill()
80
-
81
- if not result:
82
- result.append('timed out')
83
-
84
- return result[0] == 'passed'
85
-
86
-
87
- def unsafe_execute(check_program, result, timeout):
88
-
89
- with create_tempdir():
90
-
91
- # These system calls are needed when cleaning up tempdir.
92
- import os
93
- import shutil
94
-
95
- rmtree = shutil.rmtree
96
- rmdir = os.rmdir
97
- chdir = os.chdir
98
-
99
- # Disable functionalities that can make destructive changes
100
- # to the test.
101
- reliability_guard()
102
-
103
- # Run program.
104
- try:
105
- exec_globals = {}
106
- with swallow_io():
107
- with time_limit(timeout):
108
- exec(check_program, exec_globals)
109
- result.append('passed')
110
- except TimeoutException:
111
- result.append('timed out')
112
- except BaseException as e:
113
- result.append(f'failed: {e}')
114
-
115
- # Needed for cleaning up.
116
- shutil.rmtree = rmtree
117
- os.rmdir = rmdir
118
- os.chdir = chdir
119
-
120
-
121
- @contextlib.contextmanager
122
- def time_limit(seconds):
123
-
124
- def signal_handler(signum, frame):
125
- raise TimeoutException('Timed out!')
126
-
127
- signal.setitimer(signal.ITIMER_REAL, seconds)
128
- signal.signal(signal.SIGALRM, signal_handler)
129
- try:
130
- yield
131
- finally:
132
- signal.setitimer(signal.ITIMER_REAL, 0)
133
-
134
-
135
- @contextlib.contextmanager
136
- def swallow_io():
137
- stream = WriteOnlyStringIO()
138
- with contextlib.redirect_stdout(stream):
139
- with contextlib.redirect_stderr(stream):
140
- with redirect_stdin(stream):
141
- yield
142
-
143
-
144
- @contextlib.contextmanager
145
- def create_tempdir():
146
- with tempfile.TemporaryDirectory() as dirname:
147
- with chdir(dirname):
148
- yield dirname
149
-
150
-
151
- class TimeoutException(Exception):
152
- pass
153
-
154
-
155
- class WriteOnlyStringIO(io.StringIO):
156
- """StringIO that throws an exception when it's read from."""
157
-
158
- def read(self, *args, **kwargs):
159
- raise OSError
160
-
161
- def readline(self, *args, **kwargs):
162
- raise OSError
163
-
164
- def readlines(self, *args, **kwargs):
165
- raise OSError
166
-
167
- def readable(self, *args, **kwargs):
168
- """Returns True if the IO object can be read."""
169
- return False
170
-
171
-
172
- class redirect_stdin(contextlib._RedirectStream): # type: ignore
173
- _stream = 'stdin'
174
-
175
-
176
- @contextlib.contextmanager
177
- def chdir(root):
178
- if root == '.':
179
- yield
180
- return
181
- cwd = os.getcwd()
182
- os.chdir(root)
183
- try:
184
- yield
185
- except BaseException as exc:
186
- raise exc
187
- finally:
188
- os.chdir(cwd)
189
-
190
-
191
- def reliability_guard(maximum_memory_bytes=None):
192
- """This disables various destructive functions and prevents the generated
193
- code from interfering with the test (e.g. fork bomb, killing other
194
- processes, removing filesystem files, etc.)
195
-
196
- WARNING This function is NOT a security sandbox. Untrusted code, including,
197
- model- generated code, should not be blindly executed outside of one. See
198
- the Codex paper for more information about OpenAI's code sandbox, and
199
- proceed with caution.
200
- """
201
-
202
- if maximum_memory_bytes is not None:
203
- import resource
204
-
205
- resource.setrlimit(resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes))
206
- resource.setrlimit(resource.RLIMIT_DATA, (maximum_memory_bytes, maximum_memory_bytes))
207
- if not platform.uname().system == 'Darwin':
208
- resource.setrlimit(resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes))
209
-
210
- faulthandler.disable()
211
-
212
- import builtins
213
-
214
- builtins.exit = None
215
- builtins.quit = None
216
-
217
- import os
218
-
219
- os.environ['OMP_NUM_THREADS'] = '1'
220
-
221
- os.kill = None
222
- os.system = None
223
- os.putenv = None
224
- os.remove = None
225
- os.removedirs = None
226
- os.rmdir = None
227
- os.fchdir = None
228
- os.setuid = None
229
- os.fork = None
230
- os.forkpty = None
231
- os.killpg = None
232
- os.rename = None
233
- os.renames = None
234
- os.truncate = None
235
- os.replace = None
236
- os.unlink = None
237
- os.fchmod = None
238
- os.fchown = None
239
- os.chmod = None
240
- os.chown = None
241
- os.chroot = None
242
- os.fchdir = None
243
- os.lchflags = None
244
- os.lchmod = None
245
- os.lchown = None
246
- os.getcwd = None
247
- os.chdir = None
248
-
249
- import shutil
250
-
251
- shutil.rmtree = None
252
- shutil.move = None
253
- shutil.chown = None
254
-
255
- import subprocess
256
-
257
- subprocess.Popen = None # type: ignore
258
-
259
- __builtins__['help'] = None
260
-
261
- import sys
262
-
263
- sys.modules['ipdb'] = None
264
- sys.modules['joblib'] = None
265
- sys.modules['resource'] = None
266
- sys.modules['psutil'] = None
267
- sys.modules['tkinter'] = None