evalscope 0.15.0__py3-none-any.whl → 0.15.1__py3-none-any.whl

This diff shows the content of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.


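In summary, 0.15.1 makes four functional changes: the general QA and T2I adapters now load data per subset instead of sharing one merged list, the evaluator normalizes review results against the batch-wide maximum number of choices, the chat model adapter forwards arbitrary chat_template_kwargs to the tokenizer, and the perf benchmark entry points now return their metrics and percentile results. Packaging changes move omegaconf into the aigc extra and pin pyyaml>=5.1.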
evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py CHANGED
@@ -34,7 +34,7 @@ class GeneralT2IAdapter(T2IBaseAdapter):
         subset_list = subset_list or self.subset_list

         data_file_dict = defaultdict(str)
-        data_list = []
+        data_item_dict = defaultdict(list)

         # get data file path and subset name
         if os.path.isdir(dataset_name_or_path):
@@ -49,10 +49,10 @@ class GeneralT2IAdapter(T2IBaseAdapter):
         # load data from local disk
         try:
             for subset_name, file_path in data_file_dict.items():
-                data_list.extend(jsonl_to_list(file_path))
+                data_item_dict[subset_name] = jsonl_to_list(file_path)
         except Exception as e:
             raise ValueError(f'Failed to load data from {self.dataset_id}, got error: {e}')

-        data_dict = {subset_name: {'test': data_list} for subset_name in data_file_dict.keys()}
+        data_dict = {subset_name: {'test': data_item_dict[subset_name]} for subset_name in data_file_dict.keys()}

         return data_dict
evalscope/benchmarks/general_qa/general_qa_adapter.py CHANGED
@@ -33,7 +33,7 @@ class GeneralQAAdapter(DataAdapter):
         subset_list = subset_list or self.subset_list

         data_file_dict = defaultdict(str)
-        data_list = []
+        data_item_dict = defaultdict(list)

         # get data file path and subset name
         if os.path.isdir(dataset_name_or_path):
@@ -48,11 +48,11 @@ class GeneralQAAdapter(DataAdapter):
         # load data from local disk
         try:
             for subset_name, file_path in data_file_dict.items():
-                data_list.extend(jsonl_to_list(file_path))
+                data_item_dict[subset_name] = jsonl_to_list(file_path)
         except Exception as e:
             raise ValueError(f'Failed to load data from {self.dataset_id}, got error: {e}')

-        data_dict = {subset_name: {'test': data_list} for subset_name in data_file_dict.keys()}
+        data_dict = {subset_name: {'test': data_item_dict[subset_name]} for subset_name in data_file_dict.keys()}

         return data_dict

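The two adapter changes above fix the same bug: in 0.15.0, both adapters merged every subset's JSONL rows into one shared data_list and then assigned that merged list to every subset, so each subset evaluated the union of all files. The 0.15.1 code keys the loaded rows by subset. A minimal sketch of the behavioral difference, with hypothetical file names and rows standing in for jsonl_to_list:

from collections import defaultdict

data_file_dict = {'subset_a': 'a.jsonl', 'subset_b': 'b.jsonl'}
rows = {'a.jsonl': [{'q': 1}], 'b.jsonl': [{'q': 2}]}  # stand-in for jsonl_to_list

# 0.15.0 behavior: every subset sees the merged list
data_list = []
for subset_name, file_path in data_file_dict.items():
    data_list.extend(rows[file_path])
old = {s: {'test': data_list} for s in data_file_dict}
# old['subset_a']['test'] == [{'q': 1}, {'q': 2}]  (wrong: includes subset_b's rows)

# 0.15.1 behavior: each subset keeps only its own rows
data_item_dict = defaultdict(list)
for subset_name, file_path in data_file_dict.items():
    data_item_dict[subset_name] = rows[file_path]
new = {s: {'test': data_item_dict[s]} for s in data_file_dict}
# new['subset_a']['test'] == [{'q': 1}]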
evalscope/evaluator/evaluator.py CHANGED
@@ -317,6 +317,8 @@ class Evaluator(object):
         """

         review_res_list = []
+        max_choices = max(
+            len(review_d[AnswerKeys.CHOICES]) for review_d in reviews_list if review_d[ReviewKeys.REVIEWED])
         for review_d in reviews_list:
             if not review_d[ReviewKeys.REVIEWED]:
                 logger.warning(f'Review not finished for answer_id: {review_d[AnswerKeys.ANSWER_ID]}')
@@ -325,10 +327,14 @@
             if len(review_d[AnswerKeys.CHOICES]) == 0:
                 logger.warning(f'No choices found for answer_id: {review_d[AnswerKeys.ANSWER_ID]}')
                 continue
-            elif len(review_d[AnswerKeys.CHOICES]) == 1:
+            elif len(review_d[AnswerKeys.CHOICES]) == 1 and max_choices == 1:
                 review_res = review_d[AnswerKeys.CHOICES][0][ReviewKeys.REVIEW][ReviewKeys.RESULT]
             else:
                 review_res = [choice[ReviewKeys.REVIEW][ReviewKeys.RESULT] for choice in review_d[AnswerKeys.CHOICES]]
+                if len(review_d[AnswerKeys.CHOICES]) < max_choices:
+                    logger.warning(
+                        f'Less choices found for answer_id: {review_d[AnswerKeys.ANSWER_ID]}, '
+                        f'max_choices is {max_choices}, but only {len(review_d[AnswerKeys.CHOICES])} choices found')

             review_res_list.append(review_res)

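The 0.15.0 code returned a bare scalar whenever a review had exactly one choice, which produced a ragged review_res_list when other reviews in the same batch carried several choices. The new guard computes the batch-wide max_choices first and keeps the scalar form only when every reviewed item has a single choice, warning when an item has fewer choices than the maximum. A runnable sketch of the rule, using plain-dict stand-ins for the AnswerKeys/ReviewKeys constants:

def collect_review_results(reviews_list):
    # 'reviewed'/'choices'/'review'/'result' stand in for the constants in the diff
    max_choices = max(len(r['choices']) for r in reviews_list if r['reviewed'])
    review_res_list = []
    for r in reviews_list:
        if not r['reviewed'] or len(r['choices']) == 0:
            continue
        if len(r['choices']) == 1 and max_choices == 1:
            # whole batch is single-choice: keep scalar results
            review_res_list.append(r['choices'][0]['review']['result'])
        else:
            # mixed batch: always wrap results in a list, even for one choice
            review_res_list.append([c['review']['result'] for c in r['choices']])
    return review_res_list

reviews = [
    {'reviewed': True, 'choices': [{'review': {'result': 1.0}}]},
    {'reviewed': True, 'choices': [{'review': {'result': 0.0}}, {'review': {'result': 1.0}}]},
]
print(collect_review_results(reviews))  # [[1.0], [0.0, 1.0]]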
evalscope/models/adapters/chat_adapter.py CHANGED
@@ -100,10 +100,10 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
             if i < len(system_prompts) and system_prompts[i]:
                 messages = [ChatMessage(role='system', content=system_prompts[i])] + messages
             # whether thinking is needed
-            enable_thinking = infer_cfg.get('enable_thinking', None)
-            if enable_thinking is not None:
+            chat_template_kwargs = infer_cfg.get('chat_template_kwargs', None)
+            if chat_template_kwargs is not None:
                 prompts = self.tokenizer.apply_chat_template(
-                    messages, tokenize=False, add_generation_prompt=True, enable_thinking=enable_thinking)
+                    messages, tokenize=False, add_generation_prompt=True, **chat_template_kwargs)
             else:
                 prompts = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
             formatted_prompts.append(prompts)
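Rather than special-casing a single enable_thinking flag, the adapter now forwards an arbitrary chat_template_kwargs dict from infer_cfg straight into tokenizer.apply_chat_template, so any keyword the model's chat template understands can be passed through. A sketch of the pass-through, assuming a Qwen3 tokenizer whose template accepts enable_thinking (other templates may take other kwargs):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen3-1.7B')
messages = [{'role': 'user', 'content': 'What is 2 + 2?'}]
chat_template_kwargs = {'enable_thinking': False}  # would come from infer_cfg in the adapter

prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True, **chat_template_kwargs)
print(prompt)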
evalscope/perf/benchmark.py CHANGED
@@ -9,7 +9,7 @@ import threading
 import time
 from http import HTTPStatus
 from tqdm import tqdm
-from typing import AsyncGenerator, List
+from typing import AsyncGenerator, Dict, List, Tuple

 from evalscope.perf.arguments import Arguments
 from evalscope.perf.http_client import AioHttpClient, test_connection
@@ -180,7 +180,7 @@ async def connect_test(args: Arguments) -> bool:


 @exception_handler
-async def benchmark(args: Arguments) -> None:
+async def benchmark(args: Arguments) -> Tuple[Dict, Dict]:
     if platform.system() != 'Windows':
         loop = asyncio.get_running_loop()
         add_signal_handlers(loop)
@@ -205,4 +205,5 @@
     data_process_completed_event.set()

     metrics, result_db_path = await statistic_benchmark_metric_task
-    summary_result(args, metrics, result_db_path)
+    metrics_result, percentile_result = summary_result(args, metrics, result_db_path)
+    return metrics_result, percentile_result
evalscope/perf/main.py CHANGED
@@ -36,9 +36,11 @@ def run_perf_benchmark(args):
     if platform.system() != 'Windows':
        add_signal_handlers(loop)

-    loop.run_until_complete(benchmark(args))
+    return loop.run_until_complete(benchmark(args))


 if __name__ == '__main__':
     args = Arguments.from_args(parse_args())
-    run_perf_benchmark(args)
+    metrics_result, percentile_result = run_perf_benchmark(args)
+    print(metrics_result)
+    print(percentile_result)
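With these two changes, run_perf_benchmark no longer runs purely for its side effects: it returns the (metrics_result, percentile_result) pair produced by summary_result (see the db_util.py hunk below), so callers can consume the numbers programmatically instead of scraping logs. A usage sketch; the argument values are illustrative, mirroring the test changes at the end of this diff:

from evalscope.perf.arguments import Arguments
from evalscope.perf.main import run_perf_benchmark

args = Arguments(
    model='Qwen3-1.7B',
    url='http://127.0.0.1:8801/v1/completions',
    api='openai',
    dataset='random',
    parallel=20,
)
metrics_result, percentile_result = run_perf_benchmark(args)
print(metrics_result['Expected number of requests'])  # key written by summary_result
print(percentile_result)  # percentile table, a Dict[str, List[float]]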
evalscope/perf/utils/db_util.py CHANGED
@@ -7,7 +7,7 @@ import sqlite3
 import sys
 from datetime import datetime
 from tabulate import tabulate
-from typing import Dict, List
+from typing import Dict, List, Tuple

 from evalscope.perf.arguments import Arguments
 from evalscope.perf.utils.benchmark_util import BenchmarkData, BenchmarkMetrics
@@ -200,16 +200,16 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
     return results


-def summary_result(args: Arguments, metrics: BenchmarkMetrics, result_db_path: str):
+def summary_result(args: Arguments, metrics: BenchmarkMetrics, result_db_path: str) -> Tuple[Dict, Dict]:
     result_path = os.path.dirname(result_db_path)
     write_json_file(args.to_dict(), os.path.join(result_path, 'benchmark_args.json'))

-    data = metrics.create_message()
-    data.update({'Expected number of requests': args.number, 'Result DB path': result_db_path})
-    write_json_file(data, os.path.join(result_path, 'benchmark_summary.json'))
+    metrics_result = metrics.create_message()
+    metrics_result.update({'Expected number of requests': args.number, 'Result DB path': result_db_path})
+    write_json_file(metrics_result, os.path.join(result_path, 'benchmark_summary.json'))

     # Print summary in a table
-    table = tabulate(list(data.items()), headers=['Key', 'Value'], tablefmt='grid')
+    table = tabulate(list(metrics_result.items()), headers=['Key', 'Value'], tablefmt='grid')
     logger.info('\nBenchmarking summary:\n' + table)

     # Get percentile results
@@ -223,6 +223,8 @@ def summary_result(args: Arguments, metrics: BenchmarkMetrics, result_db_path: str):
     if args.dataset.startswith('speed_benchmark'):
         speed_benchmark_result(result_db_path)

+    return metrics_result, percentile_result
+

 def speed_benchmark_result(result_db_path: str):
     query_sql = """
evalscope/version.py CHANGED
@@ -1,4 +1,4 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.

-__version__ = '0.15.0'
-__release_datetime__ = '2025-04-29 00:00:00'
+__version__ = '0.15.1'
+__release_datetime__ = '2025-04-30 12:00:00'
evalscope-0.15.1.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.15.0
+Version: 0.15.1
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
@@ -26,12 +26,11 @@ Requires-Dist: latex2sympy2
 Requires-Dist: matplotlib
 Requires-Dist: modelscope[framework]
 Requires-Dist: nltk>=3.9
-Requires-Dist: omegaconf
 Requires-Dist: openai
 Requires-Dist: pandas
 Requires-Dist: pillow
 Requires-Dist: pyarrow
-Requires-Dist: pyyaml
+Requires-Dist: pyyaml>=5.1
 Requires-Dist: requests
 Requires-Dist: rouge-chinese
 Requires-Dist: rouge-score>=0.1.0
@@ -48,6 +47,7 @@ Requires-Dist: word2number
 Provides-Extra: aigc
 Requires-Dist: diffusers; extra == "aigc"
 Requires-Dist: iopath; extra == "aigc"
+Requires-Dist: omegaconf; extra == "aigc"
 Requires-Dist: open-clip-torch; extra == "aigc"
 Requires-Dist: opencv-python; extra == "aigc"
 Provides-Extra: all
@@ -61,12 +61,11 @@ Requires-Dist: latex2sympy2; extra == "all"
 Requires-Dist: matplotlib; extra == "all"
 Requires-Dist: modelscope[framework]; extra == "all"
 Requires-Dist: nltk>=3.9; extra == "all"
-Requires-Dist: omegaconf; extra == "all"
 Requires-Dist: openai; extra == "all"
 Requires-Dist: pandas; extra == "all"
 Requires-Dist: pillow; extra == "all"
 Requires-Dist: pyarrow; extra == "all"
-Requires-Dist: pyyaml; extra == "all"
+Requires-Dist: pyyaml>=5.1; extra == "all"
 Requires-Dist: requests; extra == "all"
 Requires-Dist: rouge-chinese; extra == "all"
 Requires-Dist: rouge-score>=0.1.0; extra == "all"
@@ -99,6 +98,7 @@ Requires-Dist: gradio==5.4.0; extra == "all"
 Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "all"
 Requires-Dist: diffusers; extra == "all"
 Requires-Dist: iopath; extra == "all"
+Requires-Dist: omegaconf; extra == "all"
 Requires-Dist: open-clip-torch; extra == "all"
 Requires-Dist: opencv-python; extra == "all"
 Provides-Extra: app
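Net effect of the dependency changes: omegaconf is no longer a core requirement, so a plain pip install evalscope stops pulling it in; it is now provided only by the aigc (and all) extras, and pyyaml is pinned to >=5.1 in both the core and all requirement sets.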
evalscope-0.15.1.dist-info/RECORD CHANGED
@@ -5,7 +5,7 @@ evalscope/constants.py,sha256=PHnsGndB4N5-jvmawPxMK5b9geE2Es5cUe8ZKYSuKgM,4016
 evalscope/run.py,sha256=_DKbxgQGwhweBnQrI7lQhu5eoz4LYPVeNanzD4lHuJA,6476
 evalscope/run_arena.py,sha256=WXPCT0L-b_KvLBQ9KnrVW6y8icdDcqVhaXjTZMpS8k8,8572
 evalscope/summarizer.py,sha256=61kU5ZoSh1dd8HMJPqP3ZvJwcY9szwWFCZdu2lfATJA,5920
-evalscope/version.py,sha256=X2BkdAHDhsMo9BTAegfd5uYheDVI8rh_UG5YqMwwXUE,119
+evalscope/version.py,sha256=eFCP5Hfk4dip59uCASefVxaNqxWNtwDQPrqaoRJxO9c,119
 evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/backend/base.py,sha256=qYu8Shokrtrx-N6T_BAJk_6OCpovUBYuN0p3wngt-dw,1030
 evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
@@ -63,7 +63,7 @@ evalscope/benchmarks/aigc/t2i/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5N
 evalscope/benchmarks/aigc/t2i/base.py,sha256=4GFAvceT1Gpt5teDLRCZi62RwvPazuhG3zwft3gN3X4,2102
 evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py,sha256=WV9w3z8TxWNzVzn9A_g0xqeHh76ydnHL5xLwyg63VmU,2992
 evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py,sha256=baDGFRpVcSKpc1CdzNAMBtjeCZDUpyEc5l1KyrPNoEU,1892
-evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py,sha256=But2hcQU3X3v58poF8Qg2agrxTAP6gnjZYJs8Tr0g_4,2047
+evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py,sha256=t9h5qlo4KrHOgXIhHo3z6fEAi0HfUqDZvaItQdS7dZ4,2097
 evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py,sha256=U0RKN3apyD3YyZfIvqgO8TNuDO-zctlftHsSfBRyQxU,1825
 evalscope/benchmarks/aigc/t2i/tifa_adapter.py,sha256=vOOiOe26H2dk9VN2WbB_Oi3lzavMIaYDBq6sqeSIiAU,1093
 evalscope/benchmarks/aime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -123,7 +123,7 @@ evalscope/benchmarks/data_collection/data_collection_adapter.py,sha256=QgLgIrjD3
 evalscope/benchmarks/general_mcq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/benchmarks/general_mcq/general_mcq_adapter.py,sha256=fqbt61owPP7t2H4B2zbYVZTs0VBGuXNvWGvkukwhRYc,5039
 evalscope/benchmarks/general_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
-evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=8d5znAcQmFSmvyKV-JuMQzbY5k6xDNQQdrWZ7zgPTK4,4603
+evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=I2BanmO4WLrKviyLiIeqmS5mdyjqGg1X7hauv4HBjgk,4653
 evalscope/benchmarks/gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/benchmarks/gpqa/chain_of_thought.txt,sha256=pgoT5a-DMPJaMhoH_M8zfU5s80ibWDTVW6vnonITd8k,5610
 evalscope/benchmarks/gpqa/gpqa_adapter.py,sha256=UB287DtnbkSQXZsbReFJqmQRwbo672DTCeXXilR_-Vc,4790
@@ -199,7 +199,7 @@ evalscope/collections/evaluator.py,sha256=Ll-qLet04aEp1WxoCKAuvZVWEZuy1lS_D-vZIN
 evalscope/collections/sampler.py,sha256=2NwvhJVdi-mrDeK7RWwEGOoE7DdxtpyASRUZU_D6hWw,4855
 evalscope/collections/schema.py,sha256=mjJfNmy_athJ1TmnuJRkrKRlefzefuQXZuTtjn8SHKo,4073
 evalscope/evaluator/__init__.py,sha256=S6MU1O_iiNAaKxNIhO9MEmdW-BSNf_YH2l6NQ9lxVNo,103
-evalscope/evaluator/evaluator.py,sha256=M1JrsoZZ5OvcZfzgLrNSMtbbz5gvvCd0GwJArJQV0lk,19797
+evalscope/evaluator/evaluator.py,sha256=oOVYRMMQfT3fqu-l33wmJtKlyeWxwoIUADMCoBNARTM,20271
 evalscope/evaluator/rating_eval.py,sha256=uo0uj9z_TDsxdYlT8WIfNZhFLAfRkW9zn_wlu-F72O0,5575
 evalscope/evaluator/reviewer/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 evalscope/evaluator/reviewer/auto_reviewer.py,sha256=5WRYuXFTDgVmolrOdiTysk-mXrpw6Qg87-iuY-VD1W4,16618
@@ -318,7 +318,7 @@ evalscope/models/model.py,sha256=MxvJAUNkuT7IA3bchnmJDur_YCKj9ShOD2Uq40dBcGc,630
 evalscope/models/register.py,sha256=pNC69YUvw-lodYpOXmByHm26h4m0Lofgd_om-JhOBq4,1882
 evalscope/models/adapters/__init__.py,sha256=mduiDZ6LgmkefNf4CtObZk6heOB93HxxgqTuYvrqWoo,590
 evalscope/models/adapters/base_adapter.py,sha256=f2FY8DLERudkfb4_anxNVFE_D19xCJj9BObiHWspewI,3268
-evalscope/models/adapters/chat_adapter.py,sha256=HD1jAKlAv5KRjzB0s21E4rTEIhryZhZHMpSctF9xrN8,7306
+evalscope/models/adapters/chat_adapter.py,sha256=hzFrpvIrakKO5hsnbdXiDTO0cGajAdhcAN9ENoI6XvY,7312
 evalscope/models/adapters/choice_adapter.py,sha256=4fuz3MFEqK8ln4mMs3goMCdRPBwYmmgN70HTdr_sW_U,8005
 evalscope/models/adapters/custom_adapter.py,sha256=w8cD0b3xgcdhSZelcat67CGJnALOfz5IALzURnLjab8,2275
 evalscope/models/adapters/server_adapter.py,sha256=5kH1yDAjETogR7aOdnCEueYE1bREI40OdXdBiJpMdIM,6734
@@ -328,9 +328,9 @@ evalscope/models/custom/custom_model.py,sha256=rBQLAuPEw_OPUtRSCEmxEfpcA8jPj8bAd
 evalscope/models/custom/dummy_model.py,sha256=WRT_aCBZLXnC4yRCgggkuySkhM71C47O2Txx_YNc3UM,1933
 evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/perf/arguments.py,sha256=UZKlkbDE2N408dY8Ji-WB8sl1rcmamywzxLvNXpnY0w,10194
-evalscope/perf/benchmark.py,sha256=nv7gtCkeKnLKQQiKM4G0MYO2ambcuwsbx67OgEQG0nM,7917
+evalscope/perf/benchmark.py,sha256=C0tLaZzxqMonZK4iLtfjiQIxX3tO3-uFrOjgV-oVsU0,8024
 evalscope/perf/http_client.py,sha256=-c3-N7bxKsj3d5DVsKSaYA3XAHJDzZgoqZBbhuDYIGk,7419
-evalscope/perf/main.py,sha256=w-yDbl0osaTAMgC-JNPpqIq2LQ7U4c-Ht7Amj8Nbjc8,1278
+evalscope/perf/main.py,sha256=C7iNEdb4SEMGmHsF4DHAak4O1zRxrWW1tMRmyhEkVwQ,1376
 evalscope/perf/plugin/__init__.py,sha256=1sl5s-csrwKb_LVTnpF3HqArz06TRD5LYJ0hpqvokUA,85
 evalscope/perf/plugin/registry.py,sha256=w1IAt6GDdluzSYK5i-yrntvx3_EvIIqJamEL0xZv3zA,1323
 evalscope/perf/plugin/api/__init__.py,sha256=Ckzbq4CkSMVQTedQcDHCYlRd6FTwQAElt2mHB-VXJac,195
@@ -350,7 +350,7 @@ evalscope/perf/plugin/datasets/speed_benchmark.py,sha256=J6q7AF_Re5eHLVejXEw9c1j
 evalscope/perf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/perf/utils/analysis_result.py,sha256=ESzaZHGTpr2LoJR3jpOzqMphxSrr79d364ZzD159PmY,1169
 evalscope/perf/utils/benchmark_util.py,sha256=CftjnxYA7d1aeAL_iuyXcJPwCL5A8zWGZSkNtjrMyW8,6309
-evalscope/perf/utils/db_util.py,sha256=efz6qQtMIYAIpG0sAEjLwuzTHBUiuzAV1n7_DCGrN5o,9461
+evalscope/perf/utils/db_util.py,sha256=VsYgz6IsSNPAWGCopOOIxAUhUat3GRbZMlrfdZ6i4kM,9575
 evalscope/perf/utils/handler.py,sha256=HyKIxbzC0XCyQanlbb7UEY7yaeqjJTePNea8kMV3Sdc,1192
 evalscope/perf/utils/local_server.py,sha256=clF8i0UFmaxBBB6gX05KvVCyzSv0xzsAidz0_sLLlAk,4627
 evalscope/perf/utils/log_utils.py,sha256=1jmB31W3ol9ukPAPbQ8xG3yoZ9oi3tjEyMK5M3ERmbw,1471
@@ -429,13 +429,13 @@ evalscope/utils/utils.py,sha256=hP_ntROFsZ-zaNVpJtT2prNo8iX-UAKfRtdxbLtPJng,1110
 tests/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
 tests/aigc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
-tests/aigc/test_t2i.py,sha256=_M3WxY5ruBM4RD7rYHhgizcIhH-ny5XD9M16Ayl3UPk,2619
+tests/aigc/test_t2i.py,sha256=BcdS3OMypWnraXF4Cq3DhDVRpZq0qo9_0Qpyg54B7FY,2627
 tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 tests/cli/test_all.py,sha256=pwup--iNxckUEsR_aFjIAbEQo3UogSu5aIWf9ryLP2o,4022
 tests/cli/test_collection.py,sha256=y8FjoPziPRf5BdJK8DHjcXn26ETKz1OyqjnCpwjt-F4,4096
-tests/cli/test_run.py,sha256=4B-6sOyotK3omirZWWyg7-CcnUSeZjiaU3aXHr0hH_Y,16804
+tests/cli/test_run.py,sha256=1DHLFlgGvHJizbLVc1ShcGFAHirEPgW8r88H7g8Sbx4,17245
 tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
-tests/perf/test_perf.py,sha256=8K5tGlWwOpYWnJ0GaCpqSw9zPOiM8fEKJaDil2mpTSQ,3831
+tests/perf/test_perf.py,sha256=diwwEmoWR-6xSVeGF65J6TWHRNj54rkwyvnhHh7PiE0,3919
 tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/rag/test_clip_benchmark.py,sha256=ZCBtgnF8Vuji6WQlb92-_RIvXlUX_Xt-cHZP4AN_DNI,2552
 tests/rag/test_mteb.py,sha256=YJw6X1jwX6SYNB-ryVb-OHJWu3vsE3Y4STATI75rdG0,5619
@@ -446,9 +446,9 @@ tests/swift/test_run_swift_vlm_eval.py,sha256=C8DftjewnZaerQWfERI70bU3sQLWQ-ejZU
 tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=THZEXUOSqm9rWslwJHmZyh-Ytv5c_QKpgRW5J2s_69E,6017
 tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 tests/vlm/test_vlmeval.py,sha256=UqRiBPMU3vRtLIG1Qu4ZVhyUQx-zGYQuLCgobwf-7a4,3176
-evalscope-0.15.0.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
-evalscope-0.15.0.dist-info/METADATA,sha256=MLn0s_L7s0oeQPWL1XuhihDAFJnzLdVTvdrep-9Bgag,34053
-evalscope-0.15.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
-evalscope-0.15.0.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
-evalscope-0.15.0.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
-evalscope-0.15.0.dist-info/RECORD,,
+evalscope-0.15.1.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
+evalscope-0.15.1.dist-info/METADATA,sha256=JvRF5sI_9ak9Y-FwWdU1Y8BE96iKPLO_hIGC7Z9SWpg,34080
+evalscope-0.15.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+evalscope-0.15.1.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
+evalscope-0.15.1.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
+evalscope-0.15.1.dist-info/RECORD,,
tests/aigc/test_t2i.py CHANGED
@@ -59,9 +59,9 @@ class TestRun(unittest.TestCase):
             },
             datasets=[
                 'tifa160',
-                'genai_bench',
-                'evalmuse',
-                'hpdv2',
+                # 'genai_bench',
+                # 'evalmuse',
+                # 'hpdv2',
             ],
             dataset_args={
                 'tifa160': {
@@ -81,7 +81,7 @@ class TestRun(unittest.TestCase):
                 'num_inference_steps': 50,
                 'guidance_scale': 7.5
             },
-            use_cache='outputs/20250427_134122',
+            # use_cache='outputs/20250427_134122',
         )

         run_task(task_cfg=task_cfg)
tests/cli/test_run.py CHANGED
@@ -207,13 +207,13 @@ class TestRun(unittest.TestCase):
         from evalscope.config import TaskConfig

         task_cfg = TaskConfig(
-            model='Qwen/Qwen2.5-0.5B-Instruct',
+            model='Qwen/Qwen3-1.7B',
             datasets=[
                 # 'iquiz',
                 # 'math_500',
-                # 'aime24',
+                'aime24',
                 # 'competition_math',
-                'mmlu',
+                # 'mmlu',
             ],
             dataset_args={
                 'competition_math': {
@@ -224,8 +224,15 @@ class TestRun(unittest.TestCase):
                     'few_shot_num': 0
                 },
             },
-            limit=10,
-            eval_batch_size=10,
+            limit=5,
+            eval_batch_size=5,
+            generation_config={
+                'max_new_tokens': 1000,  # maximum number of new tokens; set it high to avoid truncated output
+                'temperature': 0.7,  # sampling temperature (recommended value from the Qwen report)
+                'top_p': 0.8,  # top-p sampling (recommended value from the Qwen report)
+                'top_k': 20,  # top-k sampling (recommended value from the Qwen report)
+                'chat_template_kwargs': {'enable_thinking': False}  # disable thinking mode
+            }
         )

         run_task(task_cfg=task_cfg)
tests/perf/test_perf.py CHANGED
@@ -103,7 +103,7 @@ class TestPerf(unittest.TestCase):
         from evalscope.perf.arguments import Arguments
         task_cfg = Arguments(
             parallel=20,
-            model='Qwen2.5-0.5B-Instruct',
+            model='Qwen3-1.7B',
             url='http://127.0.0.1:8801/v1/completions',
             api='openai',
             dataset='random',
@@ -117,7 +117,9 @@ class TestPerf(unittest.TestCase):
             seed=None,
             extra_args={'ignore_eos': True}
         )
-        run_perf_benchmark(task_cfg)
+        metrics_result, percentile_result = run_perf_benchmark(task_cfg)
+        print(metrics_result)
+        print(percentile_result)


 if __name__ == '__main__':