evalscope 0.15.0__py3-none-any.whl → 0.15.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of evalscope might be problematic.
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +3 -3
- evalscope/benchmarks/general_qa/general_qa_adapter.py +3 -3
- evalscope/evaluator/evaluator.py +7 -1
- evalscope/models/adapters/chat_adapter.py +3 -3
- evalscope/perf/benchmark.py +4 -3
- evalscope/perf/main.py +4 -2
- evalscope/perf/utils/db_util.py +8 -6
- evalscope/version.py +2 -2
- {evalscope-0.15.0.dist-info → evalscope-0.15.1.dist-info}/METADATA +5 -5
- {evalscope-0.15.0.dist-info → evalscope-0.15.1.dist-info}/RECORD +17 -17
- tests/aigc/test_t2i.py +4 -4
- tests/cli/test_run.py +12 -5
- tests/perf/test_perf.py +4 -2
- {evalscope-0.15.0.dist-info → evalscope-0.15.1.dist-info}/LICENSE +0 -0
- {evalscope-0.15.0.dist-info → evalscope-0.15.1.dist-info}/WHEEL +0 -0
- {evalscope-0.15.0.dist-info → evalscope-0.15.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.15.0.dist-info → evalscope-0.15.1.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py
CHANGED

@@ -34,7 +34,7 @@ class GeneralT2IAdapter(T2IBaseAdapter):
         subset_list = subset_list or self.subset_list

         data_file_dict = defaultdict(str)
-
+        data_item_dict = defaultdict(list)

         # get data file path and subset name
         if os.path.isdir(dataset_name_or_path):
@@ -49,10 +49,10 @@ class GeneralT2IAdapter(T2IBaseAdapter):
         # load data from local disk
         try:
             for subset_name, file_path in data_file_dict.items():
-
+                data_item_dict[subset_name] = jsonl_to_list(file_path)
         except Exception as e:
             raise ValueError(f'Failed to load data from {self.dataset_id}, got error: {e}')

-        data_dict = {subset_name: {'test':
+        data_dict = {subset_name: {'test': data_item_dict[subset_name]} for subset_name in data_file_dict.keys()}

         return data_dict
evalscope/benchmarks/general_qa/general_qa_adapter.py
CHANGED

@@ -33,7 +33,7 @@ class GeneralQAAdapter(DataAdapter):
         subset_list = subset_list or self.subset_list

         data_file_dict = defaultdict(str)
-
+        data_item_dict = defaultdict(list)

         # get data file path and subset name
         if os.path.isdir(dataset_name_or_path):
@@ -48,11 +48,11 @@ class GeneralQAAdapter(DataAdapter):
         # load data from local disk
         try:
             for subset_name, file_path in data_file_dict.items():
-
+                data_item_dict[subset_name] = jsonl_to_list(file_path)
         except Exception as e:
             raise ValueError(f'Failed to load data from {self.dataset_id}, got error: {e}')

-        data_dict = {subset_name: {'test':
+        data_dict = {subset_name: {'test': data_item_dict[subset_name]} for subset_name in data_file_dict.keys()}

         return data_dict

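Both adapters now follow the same two-phase pattern: map each subset to its JSONL file, parse every file into a list, then assemble the nested {subset: {'test': items}} dict the rest of the pipeline consumes. A minimal self-contained sketch of that pattern (plain Python; jsonl_to_list here is a stand-in for evalscope's helper of the same name):

import json
from collections import defaultdict


def jsonl_to_list(file_path):
    # Stand-in for evalscope's jsonl_to_list: one JSON object per line.
    with open(file_path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]


def load_local_subsets(data_file_dict):
    # data_file_dict maps subset name -> path of its JSONL file.
    data_item_dict = defaultdict(list)
    try:
        for subset_name, file_path in data_file_dict.items():
            data_item_dict[subset_name] = jsonl_to_list(file_path)
    except Exception as e:
        raise ValueError(f'Failed to load data, got error: {e}')

    # One 'test' split per subset, exactly as in the hunks above.
    return {subset_name: {'test': data_item_dict[subset_name]} for subset_name in data_file_dict.keys()}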
evalscope/evaluator/evaluator.py
CHANGED

@@ -317,6 +317,8 @@ class Evaluator(object):
         """

         review_res_list = []
+        max_choices = max(
+            len(review_d[AnswerKeys.CHOICES]) for review_d in reviews_list if review_d[ReviewKeys.REVIEWED])
         for review_d in reviews_list:
             if not review_d[ReviewKeys.REVIEWED]:
                 logger.warning(f'Review not finished for answer_id: {review_d[AnswerKeys.ANSWER_ID]}')
@@ -325,10 +327,14 @@ class Evaluator(object):
             if len(review_d[AnswerKeys.CHOICES]) == 0:
                 logger.warning(f'No choices found for answer_id: {review_d[AnswerKeys.ANSWER_ID]}')
                 continue
-            elif len(review_d[AnswerKeys.CHOICES]) == 1:
+            elif len(review_d[AnswerKeys.CHOICES]) == 1 and max_choices == 1:
                 review_res = review_d[AnswerKeys.CHOICES][0][ReviewKeys.REVIEW][ReviewKeys.RESULT]
             else:
                 review_res = [choice[ReviewKeys.REVIEW][ReviewKeys.RESULT] for choice in review_d[AnswerKeys.CHOICES]]
+                if len(review_d[AnswerKeys.CHOICES]) < max_choices:
+                    logger.warning(
+                        f'Less choices found for answer_id: {review_d[AnswerKeys.ANSWER_ID]}, '
+                        f'max_choices is {max_choices}, but only {len(review_d[AnswerKeys.CHOICES])} choices found')

             review_res_list.append(review_res)

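The intent of the new max_choices guard: results stay scalar only when every reviewed answer has exactly one choice; once any answer carries multiple choices, every result is emitted as a list so downstream aggregation sees a uniform shape, and answers with fewer choices than max_choices are flagged. A rough sketch of the same logic over plain dicts (the real code goes through the AnswerKeys/ReviewKeys constants):

def flatten_review_results(reviews_list):
    # Each item is assumed shaped like:
    # {'reviewed': bool, 'answer_id': str,
    #  'choices': [{'review': {'result': ...}}, ...]}
    review_res_list = []
    max_choices = max(len(r['choices']) for r in reviews_list if r['reviewed'])
    for r in reviews_list:
        if not r['reviewed'] or len(r['choices']) == 0:
            continue  # skip unfinished or empty reviews
        if len(r['choices']) == 1 and max_choices == 1:
            review_res = r['choices'][0]['review']['result']  # scalar result
        else:
            # list-form result, even when this answer has fewer choices than max_choices
            review_res = [c['review']['result'] for c in r['choices']]
        review_res_list.append(review_res)
    return review_res_list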
evalscope/models/adapters/chat_adapter.py
CHANGED

@@ -100,10 +100,10 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
             if i < len(system_prompts) and system_prompts[i]:
                 messages = [ChatMessage(role='system', content=system_prompts[i])] + messages
             # whether thinking is needed
-
-            if
+            chat_template_kwargs = infer_cfg.get('chat_template_kwargs', None)
+            if chat_template_kwargs is not None:
                 prompts = self.tokenizer.apply_chat_template(
-                    messages, tokenize=False, add_generation_prompt=True,
+                    messages, tokenize=False, add_generation_prompt=True, **chat_template_kwargs)
             else:
                 prompts = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
             formatted_prompts.append(prompts)
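With this change, anything placed under chat_template_kwargs in infer_cfg is expanded into tokenizer.apply_chat_template. A hedged usage sketch with Hugging Face transformers; the model name and the enable_thinking flag are assumptions that only hold for chat templates which support them (e.g. Qwen3's):

from transformers import AutoTokenizer

# Assumption: the model's chat template accepts enable_thinking (true for Qwen3).
tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen3-1.7B')

messages = [{'role': 'user', 'content': 'What is 2 + 2?'}]
chat_template_kwargs = {'enable_thinking': False}  # what evalscope forwards from infer_cfg

prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True, **chat_template_kwargs)
print(prompt)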
evalscope/perf/benchmark.py
CHANGED

@@ -9,7 +9,7 @@ import threading
 import time
 from http import HTTPStatus
 from tqdm import tqdm
-from typing import AsyncGenerator, List
+from typing import AsyncGenerator, Dict, List, Tuple

 from evalscope.perf.arguments import Arguments
 from evalscope.perf.http_client import AioHttpClient, test_connection
@@ -180,7 +180,7 @@ async def connect_test(args: Arguments) -> bool:


 @exception_handler
-async def benchmark(args: Arguments) -> None:
+async def benchmark(args: Arguments) -> Tuple[Dict, Dict]:
     if platform.system() != 'Windows':
         loop = asyncio.get_running_loop()
         add_signal_handlers(loop)
@@ -205,4 +205,5 @@ async def benchmark(args: Arguments) -> None:
     data_process_completed_event.set()

     metrics, result_db_path = await statistic_benchmark_metric_task
-    summary_result(args, metrics, result_db_path)
+    metrics_result, percentile_result = summary_result(args, metrics, result_db_path)
+    return metrics_result, percentile_result
evalscope/perf/main.py
CHANGED

@@ -36,9 +36,11 @@ def run_perf_benchmark(args):
     if platform.system() != 'Windows':
         add_signal_handlers(loop)

-    loop.run_until_complete(benchmark(args))
+    return loop.run_until_complete(benchmark(args))


 if __name__ == '__main__':
     args = Arguments.from_args(parse_args())
-    run_perf_benchmark(args)
+    metrics_result, percentile_result = run_perf_benchmark(args)
+    print(metrics_result)
+    print(percentile_result)
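Since run_perf_benchmark now returns what benchmark returns, perf runs can be consumed programmatically. A sketch mirroring tests/perf/test_perf.py further below; the model name and endpoint URL are placeholders, and the test sets additional Arguments fields (request counts, seed, extra_args) that are omitted here:

from evalscope.perf.arguments import Arguments
from evalscope.perf.main import run_perf_benchmark

task_cfg = Arguments(
    parallel=4,                                  # concurrent requests
    model='Qwen3-1.7B',                          # placeholder model name
    url='http://127.0.0.1:8801/v1/completions',  # placeholder endpoint
    api='openai',
    dataset='random',
)
metrics_result, percentile_result = run_perf_benchmark(task_cfg)
print(metrics_result)      # summary dict, also written to benchmark_summary.json
print(percentile_result)   # per-metric percentile values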
evalscope/perf/utils/db_util.py
CHANGED

@@ -7,7 +7,7 @@ import sqlite3
 import sys
 from datetime import datetime
 from tabulate import tabulate
-from typing import Dict, List
+from typing import Dict, List, Tuple

 from evalscope.perf.arguments import Arguments
 from evalscope.perf.utils.benchmark_util import BenchmarkData, BenchmarkMetrics
@@ -200,16 +200,16 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
     return results


-def summary_result(args: Arguments, metrics: BenchmarkMetrics, result_db_path: str):
+def summary_result(args: Arguments, metrics: BenchmarkMetrics, result_db_path: str) -> Tuple[Dict, Dict]:
     result_path = os.path.dirname(result_db_path)
     write_json_file(args.to_dict(), os.path.join(result_path, 'benchmark_args.json'))

-
-
-    write_json_file(
+    metrics_result = metrics.create_message()
+    metrics_result.update({'Expected number of requests': args.number, 'Result DB path': result_db_path})
+    write_json_file(metrics_result, os.path.join(result_path, 'benchmark_summary.json'))

     # Print summary in a table
-    table = tabulate(list(
+    table = tabulate(list(metrics_result.items()), headers=['Key', 'Value'], tablefmt='grid')
     logger.info('\nBenchmarking summary:\n' + table)

     # Get percentile results
@@ -223,6 +223,8 @@ def summary_result(args: Arguments, metrics: BenchmarkMetrics, result_db_path: s
     if args.dataset.startswith('speed_benchmark'):
         speed_benchmark_result(result_db_path)

+    return metrics_result, percentile_result
+

 def speed_benchmark_result(result_db_path: str):
     query_sql = """
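summary_result still writes benchmark_summary.json and logs the tables, but callers can now post-process the returned dicts themselves. For instance, assuming percentile_result maps column names to equal-length lists (per get_percentile_results' Dict[str, List[float]] return type), it can be re-rendered with tabulate:

from tabulate import tabulate

def render_percentiles(percentile_result):
    # tabulate accepts a dict of columns when headers='keys'.
    return tabulate(percentile_result, headers='keys', tablefmt='grid')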
{evalscope-0.15.0.dist-info → evalscope-0.15.1.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.15.0
+Version: 0.15.1
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
@@ -26,12 +26,11 @@ Requires-Dist: latex2sympy2
 Requires-Dist: matplotlib
 Requires-Dist: modelscope[framework]
 Requires-Dist: nltk>=3.9
-Requires-Dist: omegaconf
 Requires-Dist: openai
 Requires-Dist: pandas
 Requires-Dist: pillow
 Requires-Dist: pyarrow
-Requires-Dist: pyyaml
+Requires-Dist: pyyaml>=5.1
 Requires-Dist: requests
 Requires-Dist: rouge-chinese
 Requires-Dist: rouge-score>=0.1.0
@@ -48,6 +47,7 @@ Requires-Dist: word2number
 Provides-Extra: aigc
 Requires-Dist: diffusers; extra == "aigc"
 Requires-Dist: iopath; extra == "aigc"
+Requires-Dist: omegaconf; extra == "aigc"
 Requires-Dist: open-clip-torch; extra == "aigc"
 Requires-Dist: opencv-python; extra == "aigc"
 Provides-Extra: all
@@ -61,12 +61,11 @@ Requires-Dist: latex2sympy2; extra == "all"
 Requires-Dist: matplotlib; extra == "all"
 Requires-Dist: modelscope[framework]; extra == "all"
 Requires-Dist: nltk>=3.9; extra == "all"
-Requires-Dist: omegaconf; extra == "all"
 Requires-Dist: openai; extra == "all"
 Requires-Dist: pandas; extra == "all"
 Requires-Dist: pillow; extra == "all"
 Requires-Dist: pyarrow; extra == "all"
-Requires-Dist: pyyaml; extra == "all"
+Requires-Dist: pyyaml>=5.1; extra == "all"
 Requires-Dist: requests; extra == "all"
 Requires-Dist: rouge-chinese; extra == "all"
 Requires-Dist: rouge-score>=0.1.0; extra == "all"
@@ -99,6 +98,7 @@ Requires-Dist: gradio==5.4.0; extra == "all"
 Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "all"
 Requires-Dist: diffusers; extra == "all"
 Requires-Dist: iopath; extra == "all"
+Requires-Dist: omegaconf; extra == "all"
 Requires-Dist: open-clip-torch; extra == "all"
 Requires-Dist: opencv-python; extra == "all"
 Provides-Extra: app
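Practical effect of the metadata changes: omegaconf is no longer a base dependency and is only installed through the aigc and all extras (e.g. pip install 'evalscope[aigc]'), and the pyyaml requirement is tightened to pyyaml>=5.1 everywhere it appears.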
{evalscope-0.15.0.dist-info → evalscope-0.15.1.dist-info}/RECORD
CHANGED

@@ -5,7 +5,7 @@ evalscope/constants.py,sha256=PHnsGndB4N5-jvmawPxMK5b9geE2Es5cUe8ZKYSuKgM,4016
 evalscope/run.py,sha256=_DKbxgQGwhweBnQrI7lQhu5eoz4LYPVeNanzD4lHuJA,6476
 evalscope/run_arena.py,sha256=WXPCT0L-b_KvLBQ9KnrVW6y8icdDcqVhaXjTZMpS8k8,8572
 evalscope/summarizer.py,sha256=61kU5ZoSh1dd8HMJPqP3ZvJwcY9szwWFCZdu2lfATJA,5920
-evalscope/version.py,sha256=
+evalscope/version.py,sha256=eFCP5Hfk4dip59uCASefVxaNqxWNtwDQPrqaoRJxO9c,119
 evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/backend/base.py,sha256=qYu8Shokrtrx-N6T_BAJk_6OCpovUBYuN0p3wngt-dw,1030
 evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
@@ -63,7 +63,7 @@ evalscope/benchmarks/aigc/t2i/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5N
 evalscope/benchmarks/aigc/t2i/base.py,sha256=4GFAvceT1Gpt5teDLRCZi62RwvPazuhG3zwft3gN3X4,2102
 evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py,sha256=WV9w3z8TxWNzVzn9A_g0xqeHh76ydnHL5xLwyg63VmU,2992
 evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py,sha256=baDGFRpVcSKpc1CdzNAMBtjeCZDUpyEc5l1KyrPNoEU,1892
-evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py,sha256=
+evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py,sha256=t9h5qlo4KrHOgXIhHo3z6fEAi0HfUqDZvaItQdS7dZ4,2097
 evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py,sha256=U0RKN3apyD3YyZfIvqgO8TNuDO-zctlftHsSfBRyQxU,1825
 evalscope/benchmarks/aigc/t2i/tifa_adapter.py,sha256=vOOiOe26H2dk9VN2WbB_Oi3lzavMIaYDBq6sqeSIiAU,1093
 evalscope/benchmarks/aime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -123,7 +123,7 @@ evalscope/benchmarks/data_collection/data_collection_adapter.py,sha256=QgLgIrjD3
 evalscope/benchmarks/general_mcq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/benchmarks/general_mcq/general_mcq_adapter.py,sha256=fqbt61owPP7t2H4B2zbYVZTs0VBGuXNvWGvkukwhRYc,5039
 evalscope/benchmarks/general_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
-evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=
+evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=I2BanmO4WLrKviyLiIeqmS5mdyjqGg1X7hauv4HBjgk,4653
 evalscope/benchmarks/gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/benchmarks/gpqa/chain_of_thought.txt,sha256=pgoT5a-DMPJaMhoH_M8zfU5s80ibWDTVW6vnonITd8k,5610
 evalscope/benchmarks/gpqa/gpqa_adapter.py,sha256=UB287DtnbkSQXZsbReFJqmQRwbo672DTCeXXilR_-Vc,4790
@@ -199,7 +199,7 @@ evalscope/collections/evaluator.py,sha256=Ll-qLet04aEp1WxoCKAuvZVWEZuy1lS_D-vZIN
 evalscope/collections/sampler.py,sha256=2NwvhJVdi-mrDeK7RWwEGOoE7DdxtpyASRUZU_D6hWw,4855
 evalscope/collections/schema.py,sha256=mjJfNmy_athJ1TmnuJRkrKRlefzefuQXZuTtjn8SHKo,4073
 evalscope/evaluator/__init__.py,sha256=S6MU1O_iiNAaKxNIhO9MEmdW-BSNf_YH2l6NQ9lxVNo,103
-evalscope/evaluator/evaluator.py,sha256=
+evalscope/evaluator/evaluator.py,sha256=oOVYRMMQfT3fqu-l33wmJtKlyeWxwoIUADMCoBNARTM,20271
 evalscope/evaluator/rating_eval.py,sha256=uo0uj9z_TDsxdYlT8WIfNZhFLAfRkW9zn_wlu-F72O0,5575
 evalscope/evaluator/reviewer/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 evalscope/evaluator/reviewer/auto_reviewer.py,sha256=5WRYuXFTDgVmolrOdiTysk-mXrpw6Qg87-iuY-VD1W4,16618
@@ -318,7 +318,7 @@ evalscope/models/model.py,sha256=MxvJAUNkuT7IA3bchnmJDur_YCKj9ShOD2Uq40dBcGc,630
 evalscope/models/register.py,sha256=pNC69YUvw-lodYpOXmByHm26h4m0Lofgd_om-JhOBq4,1882
 evalscope/models/adapters/__init__.py,sha256=mduiDZ6LgmkefNf4CtObZk6heOB93HxxgqTuYvrqWoo,590
 evalscope/models/adapters/base_adapter.py,sha256=f2FY8DLERudkfb4_anxNVFE_D19xCJj9BObiHWspewI,3268
-evalscope/models/adapters/chat_adapter.py,sha256=
+evalscope/models/adapters/chat_adapter.py,sha256=hzFrpvIrakKO5hsnbdXiDTO0cGajAdhcAN9ENoI6XvY,7312
 evalscope/models/adapters/choice_adapter.py,sha256=4fuz3MFEqK8ln4mMs3goMCdRPBwYmmgN70HTdr_sW_U,8005
 evalscope/models/adapters/custom_adapter.py,sha256=w8cD0b3xgcdhSZelcat67CGJnALOfz5IALzURnLjab8,2275
 evalscope/models/adapters/server_adapter.py,sha256=5kH1yDAjETogR7aOdnCEueYE1bREI40OdXdBiJpMdIM,6734
@@ -328,9 +328,9 @@ evalscope/models/custom/custom_model.py,sha256=rBQLAuPEw_OPUtRSCEmxEfpcA8jPj8bAd
 evalscope/models/custom/dummy_model.py,sha256=WRT_aCBZLXnC4yRCgggkuySkhM71C47O2Txx_YNc3UM,1933
 evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/perf/arguments.py,sha256=UZKlkbDE2N408dY8Ji-WB8sl1rcmamywzxLvNXpnY0w,10194
-evalscope/perf/benchmark.py,sha256=
+evalscope/perf/benchmark.py,sha256=C0tLaZzxqMonZK4iLtfjiQIxX3tO3-uFrOjgV-oVsU0,8024
 evalscope/perf/http_client.py,sha256=-c3-N7bxKsj3d5DVsKSaYA3XAHJDzZgoqZBbhuDYIGk,7419
-evalscope/perf/main.py,sha256=
+evalscope/perf/main.py,sha256=C7iNEdb4SEMGmHsF4DHAak4O1zRxrWW1tMRmyhEkVwQ,1376
 evalscope/perf/plugin/__init__.py,sha256=1sl5s-csrwKb_LVTnpF3HqArz06TRD5LYJ0hpqvokUA,85
 evalscope/perf/plugin/registry.py,sha256=w1IAt6GDdluzSYK5i-yrntvx3_EvIIqJamEL0xZv3zA,1323
 evalscope/perf/plugin/api/__init__.py,sha256=Ckzbq4CkSMVQTedQcDHCYlRd6FTwQAElt2mHB-VXJac,195
@@ -350,7 +350,7 @@ evalscope/perf/plugin/datasets/speed_benchmark.py,sha256=J6q7AF_Re5eHLVejXEw9c1j
 evalscope/perf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/perf/utils/analysis_result.py,sha256=ESzaZHGTpr2LoJR3jpOzqMphxSrr79d364ZzD159PmY,1169
 evalscope/perf/utils/benchmark_util.py,sha256=CftjnxYA7d1aeAL_iuyXcJPwCL5A8zWGZSkNtjrMyW8,6309
-evalscope/perf/utils/db_util.py,sha256=
+evalscope/perf/utils/db_util.py,sha256=VsYgz6IsSNPAWGCopOOIxAUhUat3GRbZMlrfdZ6i4kM,9575
 evalscope/perf/utils/handler.py,sha256=HyKIxbzC0XCyQanlbb7UEY7yaeqjJTePNea8kMV3Sdc,1192
 evalscope/perf/utils/local_server.py,sha256=clF8i0UFmaxBBB6gX05KvVCyzSv0xzsAidz0_sLLlAk,4627
 evalscope/perf/utils/log_utils.py,sha256=1jmB31W3ol9ukPAPbQ8xG3yoZ9oi3tjEyMK5M3ERmbw,1471
@@ -429,13 +429,13 @@ evalscope/utils/utils.py,sha256=hP_ntROFsZ-zaNVpJtT2prNo8iX-UAKfRtdxbLtPJng,1110
 tests/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
 tests/aigc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
-tests/aigc/test_t2i.py,sha256=
+tests/aigc/test_t2i.py,sha256=BcdS3OMypWnraXF4Cq3DhDVRpZq0qo9_0Qpyg54B7FY,2627
 tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 tests/cli/test_all.py,sha256=pwup--iNxckUEsR_aFjIAbEQo3UogSu5aIWf9ryLP2o,4022
 tests/cli/test_collection.py,sha256=y8FjoPziPRf5BdJK8DHjcXn26ETKz1OyqjnCpwjt-F4,4096
-tests/cli/test_run.py,sha256=
+tests/cli/test_run.py,sha256=1DHLFlgGvHJizbLVc1ShcGFAHirEPgW8r88H7g8Sbx4,17245
 tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
-tests/perf/test_perf.py,sha256=
+tests/perf/test_perf.py,sha256=diwwEmoWR-6xSVeGF65J6TWHRNj54rkwyvnhHh7PiE0,3919
 tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/rag/test_clip_benchmark.py,sha256=ZCBtgnF8Vuji6WQlb92-_RIvXlUX_Xt-cHZP4AN_DNI,2552
 tests/rag/test_mteb.py,sha256=YJw6X1jwX6SYNB-ryVb-OHJWu3vsE3Y4STATI75rdG0,5619
@@ -446,9 +446,9 @@ tests/swift/test_run_swift_vlm_eval.py,sha256=C8DftjewnZaerQWfERI70bU3sQLWQ-ejZU
 tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=THZEXUOSqm9rWslwJHmZyh-Ytv5c_QKpgRW5J2s_69E,6017
 tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 tests/vlm/test_vlmeval.py,sha256=UqRiBPMU3vRtLIG1Qu4ZVhyUQx-zGYQuLCgobwf-7a4,3176
-evalscope-0.15.
-evalscope-0.15.
-evalscope-0.15.
-evalscope-0.15.
-evalscope-0.15.
-evalscope-0.15.
+evalscope-0.15.1.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
+evalscope-0.15.1.dist-info/METADATA,sha256=JvRF5sI_9ak9Y-FwWdU1Y8BE96iKPLO_hIGC7Z9SWpg,34080
+evalscope-0.15.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+evalscope-0.15.1.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
+evalscope-0.15.1.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
+evalscope-0.15.1.dist-info/RECORD,,
tests/aigc/test_t2i.py
CHANGED

@@ -59,9 +59,9 @@ class TestRun(unittest.TestCase):
             },
             datasets=[
                 'tifa160',
-                'genai_bench',
-                'evalmuse',
-                'hpdv2',
+                # 'genai_bench',
+                # 'evalmuse',
+                # 'hpdv2',
             ],
             dataset_args={
                 'tifa160': {
@@ -81,7 +81,7 @@ class TestRun(unittest.TestCase):
                 'num_inference_steps': 50,
                 'guidance_scale': 7.5
             },
-            use_cache='outputs/20250427_134122',
+            # use_cache='outputs/20250427_134122',
         )

         run_task(task_cfg=task_cfg)
tests/cli/test_run.py
CHANGED

@@ -207,13 +207,13 @@ class TestRun(unittest.TestCase):
         from evalscope.config import TaskConfig

         task_cfg = TaskConfig(
-            model='Qwen/
+            model='Qwen/Qwen3-1.7B',
             datasets=[
                 # 'iquiz',
                 # 'math_500',
-
+                'aime24',
                 # 'competition_math',
-                'mmlu',
+                # 'mmlu',
             ],
             dataset_args={
                 'competition_math': {
@@ -224,8 +224,15 @@ class TestRun(unittest.TestCase):
                     'few_shot_num': 0
                 },
            },
-            limit=
-            eval_batch_size=
+            limit=5,
+            eval_batch_size=5,
+            generation_config={
+                'max_new_tokens': 1000,  # maximum number of generated tokens; use a large value to avoid truncated output
+                'temperature': 0.7,  # sampling temperature (value recommended in the Qwen report)
+                'top_p': 0.8,  # top-p sampling (value recommended in the Qwen report)
+                'top_k': 20,  # top-k sampling (value recommended in the Qwen report)
+                'chat_template_kwargs': {'enable_thinking': False}  # disable thinking mode
+            }
         )

         run_task(task_cfg=task_cfg)
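This test appears to exercise the chat_adapter change above: the chat_template_kwargs entry inside generation_config presumably travels through infer_cfg and is unpacked into tokenizer.apply_chat_template, which is how enable_thinking=False would reach the Qwen3 chat template.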
tests/perf/test_perf.py
CHANGED

@@ -103,7 +103,7 @@ class TestPerf(unittest.TestCase):
         from evalscope.perf.arguments import Arguments
         task_cfg = Arguments(
             parallel=20,
-            model='
+            model='Qwen3-1.7B',
             url='http://127.0.0.1:8801/v1/completions',
             api='openai',
             dataset='random',
@@ -117,7 +117,9 @@ class TestPerf(unittest.TestCase):
             seed=None,
             extra_args={'ignore_eos': True}
         )
-        run_perf_benchmark(task_cfg)
+        metrics_result, percentile_result = run_perf_benchmark(task_cfg)
+        print(metrics_result)
+        print(percentile_result)


 if __name__ == '__main__':
Files without changes: {evalscope-0.15.0.dist-info → evalscope-0.15.1.dist-info}/LICENSE, WHEEL, entry_points.txt, top_level.txt