evalscope 0.15.0__py3-none-any.whl → 0.15.1__py3-none-any.whl

This diff shows the content of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.


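In summary, 0.15.1 makes four functional changes: the general QA and T2I adapters now load data per subset instead of sharing one merged list, the evaluator normalizes review results against the batch-wide maximum number of choices, the chat model adapter forwards arbitrary chat_template_kwargs to the tokenizer, and the perf benchmark entry points now return their metrics and percentile results. Packaging changes move omegaconf into the aigc extra and pin pyyaml>=5.1.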
evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py CHANGED
@@ -34,7 +34,7 @@ class GeneralT2IAdapter(T2IBaseAdapter):
         subset_list = subset_list or self.subset_list

         data_file_dict = defaultdict(str)
-        data_list = []
+        data_item_dict = defaultdict(list)

         # get data file path and subset name
         if os.path.isdir(dataset_name_or_path):
@@ -49,10 +49,10 @@ class GeneralT2IAdapter(T2IBaseAdapter):
         # load data from local disk
         try:
             for subset_name, file_path in data_file_dict.items():
-                data_list.extend(jsonl_to_list(file_path))
+                data_item_dict[subset_name] = jsonl_to_list(file_path)
         except Exception as e:
             raise ValueError(f'Failed to load data from {self.dataset_id}, got error: {e}')

-        data_dict = {subset_name: {'test': data_list} for subset_name in data_file_dict.keys()}
+        data_dict = {subset_name: {'test': data_item_dict[subset_name]} for subset_name in data_file_dict.keys()}

         return data_dict
evalscope/benchmarks/general_qa/general_qa_adapter.py CHANGED
@@ -33,7 +33,7 @@ class GeneralQAAdapter(DataAdapter):
         subset_list = subset_list or self.subset_list

         data_file_dict = defaultdict(str)
-        data_list = []
+        data_item_dict = defaultdict(list)

         # get data file path and subset name
         if os.path.isdir(dataset_name_or_path):
@@ -48,11 +48,11 @@ class GeneralQAAdapter(DataAdapter):
         # load data from local disk
         try:
             for subset_name, file_path in data_file_dict.items():
-                data_list.extend(jsonl_to_list(file_path))
+                data_item_dict[subset_name] = jsonl_to_list(file_path)
         except Exception as e:
             raise ValueError(f'Failed to load data from {self.dataset_id}, got error: {e}')

-        data_dict = {subset_name: {'test': data_list} for subset_name in data_file_dict.keys()}
+        data_dict = {subset_name: {'test': data_item_dict[subset_name]} for subset_name in data_file_dict.keys()}

         return data_dict

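The two adapter changes above fix the same bug: in 0.15.0, both adapters merged every subset's JSONL rows into one shared data_list and then assigned that merged list to every subset, so each subset evaluated the union of all files. The 0.15.1 code keys the loaded rows by subset. A minimal sketch of the behavioral difference, with hypothetical file names and rows standing in for jsonl_to_list:

from collections import defaultdict

data_file_dict = {'subset_a': 'a.jsonl', 'subset_b': 'b.jsonl'}
rows = {'a.jsonl': [{'q': 1}], 'b.jsonl': [{'q': 2}]}  # stand-in for jsonl_to_list

# 0.15.0 behavior: every subset sees the merged list
data_list = []
for subset_name, file_path in data_file_dict.items():
    data_list.extend(rows[file_path])
old = {s: {'test': data_list} for s in data_file_dict}
# old['subset_a']['test'] == [{'q': 1}, {'q': 2}]  (wrong: includes subset_b's rows)

# 0.15.1 behavior: each subset keeps only its own rows
data_item_dict = defaultdict(list)
for subset_name, file_path in data_file_dict.items():
    data_item_dict[subset_name] = rows[file_path]
new = {s: {'test': data_item_dict[s]} for s in data_file_dict}
# new['subset_a']['test'] == [{'q': 1}]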
evalscope/evaluator/evaluator.py CHANGED
@@ -317,6 +317,8 @@ class Evaluator(object):
         """

         review_res_list = []
+        max_choices = max(
+            len(review_d[AnswerKeys.CHOICES]) for review_d in reviews_list if review_d[ReviewKeys.REVIEWED])
         for review_d in reviews_list:
             if not review_d[ReviewKeys.REVIEWED]:
                 logger.warning(f'Review not finished for answer_id: {review_d[AnswerKeys.ANSWER_ID]}')
@@ -325,10 +327,14 @@
             if len(review_d[AnswerKeys.CHOICES]) == 0:
                 logger.warning(f'No choices found for answer_id: {review_d[AnswerKeys.ANSWER_ID]}')
                 continue
-            elif len(review_d[AnswerKeys.CHOICES]) == 1:
+            elif len(review_d[AnswerKeys.CHOICES]) == 1 and max_choices == 1:
                 review_res = review_d[AnswerKeys.CHOICES][0][ReviewKeys.REVIEW][ReviewKeys.RESULT]
             else:
                 review_res = [choice[ReviewKeys.REVIEW][ReviewKeys.RESULT] for choice in review_d[AnswerKeys.CHOICES]]
+                if len(review_d[AnswerKeys.CHOICES]) < max_choices:
+                    logger.warning(
+                        f'Less choices found for answer_id: {review_d[AnswerKeys.ANSWER_ID]}, '
+                        f'max_choices is {max_choices}, but only {len(review_d[AnswerKeys.CHOICES])} choices found')

             review_res_list.append(review_res)

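The 0.15.0 code returned a bare scalar whenever a review had exactly one choice, which produced a ragged review_res_list when other reviews in the same batch carried several choices. The new guard computes the batch-wide max_choices first and keeps the scalar form only when every reviewed item has a single choice, warning when an item has fewer choices than the maximum. A runnable sketch of the rule, using plain-dict stand-ins for the AnswerKeys/ReviewKeys constants:

def collect_review_results(reviews_list):
    # 'reviewed'/'choices'/'review'/'result' stand in for the constants in the diff
    max_choices = max(len(r['choices']) for r in reviews_list if r['reviewed'])
    review_res_list = []
    for r in reviews_list:
        if not r['reviewed'] or len(r['choices']) == 0:
            continue
        if len(r['choices']) == 1 and max_choices == 1:
            # whole batch is single-choice: keep scalar results
            review_res_list.append(r['choices'][0]['review']['result'])
        else:
            # mixed batch: always wrap results in a list, even for one choice
            review_res_list.append([c['review']['result'] for c in r['choices']])
    return review_res_list

reviews = [
    {'reviewed': True, 'choices': [{'review': {'result': 1.0}}]},
    {'reviewed': True, 'choices': [{'review': {'result': 0.0}}, {'review': {'result': 1.0}}]},
]
print(collect_review_results(reviews))  # [[1.0], [0.0, 1.0]]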
evalscope/models/adapters/chat_adapter.py CHANGED
@@ -100,10 +100,10 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
             if i < len(system_prompts) and system_prompts[i]:
                 messages = [ChatMessage(role='system', content=system_prompts[i])] + messages
             # whether thinking is needed
-            enable_thinking = infer_cfg.get('enable_thinking', None)
-            if enable_thinking is not None:
+            chat_template_kwargs = infer_cfg.get('chat_template_kwargs', None)
+            if chat_template_kwargs is not None:
                 prompts = self.tokenizer.apply_chat_template(
-                    messages, tokenize=False, add_generation_prompt=True, enable_thinking=enable_thinking)
+                    messages, tokenize=False, add_generation_prompt=True, **chat_template_kwargs)
             else:
                 prompts = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
             formatted_prompts.append(prompts)
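Rather than special-casing a single enable_thinking flag, the adapter now forwards an arbitrary chat_template_kwargs dict from infer_cfg straight into tokenizer.apply_chat_template, so any keyword the model's chat template understands can be passed through. A sketch of the pass-through, assuming a Qwen3 tokenizer whose template accepts enable_thinking (other templates may take other kwargs):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen3-1.7B')
messages = [{'role': 'user', 'content': 'What is 2 + 2?'}]
chat_template_kwargs = {'enable_thinking': False}  # would come from infer_cfg in the adapter

prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True, **chat_template_kwargs)
print(prompt)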
evalscope/perf/benchmark.py CHANGED
@@ -9,7 +9,7 @@ import threading
 import time
 from http import HTTPStatus
 from tqdm import tqdm
-from typing import AsyncGenerator, List
+from typing import AsyncGenerator, Dict, List, Tuple

 from evalscope.perf.arguments import Arguments
 from evalscope.perf.http_client import AioHttpClient, test_connection
@@ -180,7 +180,7 @@ async def connect_test(args: Arguments) -> bool:


 @exception_handler
-async def benchmark(args: Arguments) -> None:
+async def benchmark(args: Arguments) -> Tuple[Dict, Dict]:
     if platform.system() != 'Windows':
         loop = asyncio.get_running_loop()
         add_signal_handlers(loop)
@@ -205,4 +205,5 @@
     data_process_completed_event.set()

     metrics, result_db_path = await statistic_benchmark_metric_task
-    summary_result(args, metrics, result_db_path)
+    metrics_result, percentile_result = summary_result(args, metrics, result_db_path)
+    return metrics_result, percentile_result
evalscope/perf/main.py CHANGED
@@ -36,9 +36,11 @@ def run_perf_benchmark(args):
     if platform.system() != 'Windows':
        add_signal_handlers(loop)

-    loop.run_until_complete(benchmark(args))
+    return loop.run_until_complete(benchmark(args))


 if __name__ == '__main__':
     args = Arguments.from_args(parse_args())
-    run_perf_benchmark(args)
+    metrics_result, percentile_result = run_perf_benchmark(args)
+    print(metrics_result)
+    print(percentile_result)
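With these two changes, run_perf_benchmark no longer runs purely for its side effects: it returns the (metrics_result, percentile_result) pair produced by summary_result (see the db_util.py hunk below), so callers can consume the numbers programmatically instead of scraping logs. A usage sketch; the argument values are illustrative, mirroring the test changes at the end of this diff:

from evalscope.perf.arguments import Arguments
from evalscope.perf.main import run_perf_benchmark

args = Arguments(
    model='Qwen3-1.7B',
    url='http://127.0.0.1:8801/v1/completions',
    api='openai',
    dataset='random',
    parallel=20,
)
metrics_result, percentile_result = run_perf_benchmark(args)
print(metrics_result['Expected number of requests'])  # key written by summary_result
print(percentile_result)  # percentile table, a Dict[str, List[float]]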
evalscope/perf/utils/db_util.py CHANGED
@@ -7,7 +7,7 @@ import sqlite3
 import sys
 from datetime import datetime
 from tabulate import tabulate
-from typing import Dict, List
+from typing import Dict, List, Tuple

 from evalscope.perf.arguments import Arguments
 from evalscope.perf.utils.benchmark_util import BenchmarkData, BenchmarkMetrics
@@ -200,16 +200,16 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
     return results


-def summary_result(args: Arguments, metrics: BenchmarkMetrics, result_db_path: str):
+def summary_result(args: Arguments, metrics: BenchmarkMetrics, result_db_path: str) -> Tuple[Dict, Dict]:
     result_path = os.path.dirname(result_db_path)
     write_json_file(args.to_dict(), os.path.join(result_path, 'benchmark_args.json'))

-    data = metrics.create_message()
-    data.update({'Expected number of requests': args.number, 'Result DB path': result_db_path})
-    write_json_file(data, os.path.join(result_path, 'benchmark_summary.json'))
+    metrics_result = metrics.create_message()
+    metrics_result.update({'Expected number of requests': args.number, 'Result DB path': result_db_path})
+    write_json_file(metrics_result, os.path.join(result_path, 'benchmark_summary.json'))

     # Print summary in a table
-    table = tabulate(list(data.items()), headers=['Key', 'Value'], tablefmt='grid')
+    table = tabulate(list(metrics_result.items()), headers=['Key', 'Value'], tablefmt='grid')
     logger.info('\nBenchmarking summary:\n' + table)

     # Get percentile results
@@ -223,6 +223,8 @@ def summary_result(args: Arguments, metrics: BenchmarkMetrics, result_db_path: str):
     if args.dataset.startswith('speed_benchmark'):
         speed_benchmark_result(result_db_path)

+    return metrics_result, percentile_result
+

 def speed_benchmark_result(result_db_path: str):
     query_sql = """
evalscope/version.py CHANGED
@@ -1,4 +1,4 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.

-__version__ = '0.15.0'
-__release_datetime__ = '2025-04-29 00:00:00'
+__version__ = '0.15.1'
+__release_datetime__ = '2025-04-30 12:00:00'
evalscope-0.15.1.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.15.0
+Version: 0.15.1
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
@@ -26,12 +26,11 @@ Requires-Dist: latex2sympy2
 Requires-Dist: matplotlib
 Requires-Dist: modelscope[framework]
 Requires-Dist: nltk>=3.9
-Requires-Dist: omegaconf
 Requires-Dist: openai
 Requires-Dist: pandas
 Requires-Dist: pillow
 Requires-Dist: pyarrow
-Requires-Dist: pyyaml
+Requires-Dist: pyyaml>=5.1
 Requires-Dist: requests
 Requires-Dist: rouge-chinese
 Requires-Dist: rouge-score>=0.1.0
@@ -48,6 +47,7 @@ Requires-Dist: word2number
 Provides-Extra: aigc
 Requires-Dist: diffusers; extra == "aigc"
 Requires-Dist: iopath; extra == "aigc"
+Requires-Dist: omegaconf; extra == "aigc"
 Requires-Dist: open-clip-torch; extra == "aigc"
 Requires-Dist: opencv-python; extra == "aigc"
 Provides-Extra: all
@@ -61,12 +61,11 @@ Requires-Dist: latex2sympy2; extra == "all"
 Requires-Dist: matplotlib; extra == "all"
 Requires-Dist: modelscope[framework]; extra == "all"
 Requires-Dist: nltk>=3.9; extra == "all"
-Requires-Dist: omegaconf; extra == "all"
 Requires-Dist: openai; extra == "all"
 Requires-Dist: pandas; extra == "all"
 Requires-Dist: pillow; extra == "all"
 Requires-Dist: pyarrow; extra == "all"
-Requires-Dist: pyyaml; extra == "all"
+Requires-Dist: pyyaml>=5.1; extra == "all"
 Requires-Dist: requests; extra == "all"
 Requires-Dist: rouge-chinese; extra == "all"
 Requires-Dist: rouge-score>=0.1.0; extra == "all"
@@ -99,6 +98,7 @@ Requires-Dist: gradio==5.4.0; extra == "all"
 Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "all"
 Requires-Dist: diffusers; extra == "all"
 Requires-Dist: iopath; extra == "all"
+Requires-Dist: omegaconf; extra == "all"
 Requires-Dist: open-clip-torch; extra == "all"
 Requires-Dist: opencv-python; extra == "all"
 Provides-Extra: app
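Net effect of the dependency changes: omegaconf is no longer a core requirement, so a plain pip install evalscope stops pulling it in; it is now provided only by the aigc (and all) extras, and pyyaml is pinned to >=5.1 in both the core and all requirement sets.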
evalscope-0.15.1.dist-info/RECORD CHANGED
@@ -5,7 +5,7 @@ evalscope/constants.py,sha256=PHnsGndB4N5-jvmawPxMK5b9geE2Es5cUe8ZKYSuKgM,4016
 evalscope/run.py,sha256=_DKbxgQGwhweBnQrI7lQhu5eoz4LYPVeNanzD4lHuJA,6476
 evalscope/run_arena.py,sha256=WXPCT0L-b_KvLBQ9KnrVW6y8icdDcqVhaXjTZMpS8k8,8572
 evalscope/summarizer.py,sha256=61kU5ZoSh1dd8HMJPqP3ZvJwcY9szwWFCZdu2lfATJA,5920
-evalscope/version.py,sha256=X2BkdAHDhsMo9BTAegfd5uYheDVI8rh_UG5YqMwwXUE,119
+evalscope/version.py,sha256=eFCP5Hfk4dip59uCASefVxaNqxWNtwDQPrqaoRJxO9c,119
 evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/backend/base.py,sha256=qYu8Shokrtrx-N6T_BAJk_6OCpovUBYuN0p3wngt-dw,1030
 evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
@@ -63,7 +63,7 @@ evalscope/benchmarks/aigc/t2i/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5N
 evalscope/benchmarks/aigc/t2i/base.py,sha256=4GFAvceT1Gpt5teDLRCZi62RwvPazuhG3zwft3gN3X4,2102
 evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py,sha256=WV9w3z8TxWNzVzn9A_g0xqeHh76ydnHL5xLwyg63VmU,2992
 evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py,sha256=baDGFRpVcSKpc1CdzNAMBtjeCZDUpyEc5l1KyrPNoEU,1892
-evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py,sha256=But2hcQU3X3v58poF8Qg2agrxTAP6gnjZYJs8Tr0g_4,2047
+evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py,sha256=t9h5qlo4KrHOgXIhHo3z6fEAi0HfUqDZvaItQdS7dZ4,2097
 evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py,sha256=U0RKN3apyD3YyZfIvqgO8TNuDO-zctlftHsSfBRyQxU,1825
 evalscope/benchmarks/aigc/t2i/tifa_adapter.py,sha256=vOOiOe26H2dk9VN2WbB_Oi3lzavMIaYDBq6sqeSIiAU,1093
 evalscope/benchmarks/aime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -123,7 +123,7 @@ evalscope/benchmarks/data_collection/data_collection_adapter.py,sha256=QgLgIrjD3
 evalscope/benchmarks/general_mcq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/benchmarks/general_mcq/general_mcq_adapter.py,sha256=fqbt61owPP7t2H4B2zbYVZTs0VBGuXNvWGvkukwhRYc,5039
 evalscope/benchmarks/general_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
-evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=8d5znAcQmFSmvyKV-JuMQzbY5k6xDNQQdrWZ7zgPTK4,4603
+evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=I2BanmO4WLrKviyLiIeqmS5mdyjqGg1X7hauv4HBjgk,4653
 evalscope/benchmarks/gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/benchmarks/gpqa/chain_of_thought.txt,sha256=pgoT5a-DMPJaMhoH_M8zfU5s80ibWDTVW6vnonITd8k,5610
 evalscope/benchmarks/gpqa/gpqa_adapter.py,sha256=UB287DtnbkSQXZsbReFJqmQRwbo672DTCeXXilR_-Vc,4790
@@ -199,7 +199,7 @@ evalscope/collections/evaluator.py,sha256=Ll-qLet04aEp1WxoCKAuvZVWEZuy1lS_D-vZIN
 evalscope/collections/sampler.py,sha256=2NwvhJVdi-mrDeK7RWwEGOoE7DdxtpyASRUZU_D6hWw,4855
 evalscope/collections/schema.py,sha256=mjJfNmy_athJ1TmnuJRkrKRlefzefuQXZuTtjn8SHKo,4073
 evalscope/evaluator/__init__.py,sha256=S6MU1O_iiNAaKxNIhO9MEmdW-BSNf_YH2l6NQ9lxVNo,103
-evalscope/evaluator/evaluator.py,sha256=M1JrsoZZ5OvcZfzgLrNSMtbbz5gvvCd0GwJArJQV0lk,19797
+evalscope/evaluator/evaluator.py,sha256=oOVYRMMQfT3fqu-l33wmJtKlyeWxwoIUADMCoBNARTM,20271
 evalscope/evaluator/rating_eval.py,sha256=uo0uj9z_TDsxdYlT8WIfNZhFLAfRkW9zn_wlu-F72O0,5575
 evalscope/evaluator/reviewer/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 evalscope/evaluator/reviewer/auto_reviewer.py,sha256=5WRYuXFTDgVmolrOdiTysk-mXrpw6Qg87-iuY-VD1W4,16618
@@ -318,7 +318,7 @@ evalscope/models/model.py,sha256=MxvJAUNkuT7IA3bchnmJDur_YCKj9ShOD2Uq40dBcGc,630
 evalscope/models/register.py,sha256=pNC69YUvw-lodYpOXmByHm26h4m0Lofgd_om-JhOBq4,1882
 evalscope/models/adapters/__init__.py,sha256=mduiDZ6LgmkefNf4CtObZk6heOB93HxxgqTuYvrqWoo,590
 evalscope/models/adapters/base_adapter.py,sha256=f2FY8DLERudkfb4_anxNVFE_D19xCJj9BObiHWspewI,3268
-evalscope/models/adapters/chat_adapter.py,sha256=HD1jAKlAv5KRjzB0s21E4rTEIhryZhZHMpSctF9xrN8,7306
+evalscope/models/adapters/chat_adapter.py,sha256=hzFrpvIrakKO5hsnbdXiDTO0cGajAdhcAN9ENoI6XvY,7312
 evalscope/models/adapters/choice_adapter.py,sha256=4fuz3MFEqK8ln4mMs3goMCdRPBwYmmgN70HTdr_sW_U,8005
 evalscope/models/adapters/custom_adapter.py,sha256=w8cD0b3xgcdhSZelcat67CGJnALOfz5IALzURnLjab8,2275
 evalscope/models/adapters/server_adapter.py,sha256=5kH1yDAjETogR7aOdnCEueYE1bREI40OdXdBiJpMdIM,6734
@@ -328,9 +328,9 @@ evalscope/models/custom/custom_model.py,sha256=rBQLAuPEw_OPUtRSCEmxEfpcA8jPj8bAd
 evalscope/models/custom/dummy_model.py,sha256=WRT_aCBZLXnC4yRCgggkuySkhM71C47O2Txx_YNc3UM,1933
 evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/perf/arguments.py,sha256=UZKlkbDE2N408dY8Ji-WB8sl1rcmamywzxLvNXpnY0w,10194
-evalscope/perf/benchmark.py,sha256=nv7gtCkeKnLKQQiKM4G0MYO2ambcuwsbx67OgEQG0nM,7917
+evalscope/perf/benchmark.py,sha256=C0tLaZzxqMonZK4iLtfjiQIxX3tO3-uFrOjgV-oVsU0,8024
 evalscope/perf/http_client.py,sha256=-c3-N7bxKsj3d5DVsKSaYA3XAHJDzZgoqZBbhuDYIGk,7419
-evalscope/perf/main.py,sha256=w-yDbl0osaTAMgC-JNPpqIq2LQ7U4c-Ht7Amj8Nbjc8,1278
+evalscope/perf/main.py,sha256=C7iNEdb4SEMGmHsF4DHAak4O1zRxrWW1tMRmyhEkVwQ,1376
 evalscope/perf/plugin/__init__.py,sha256=1sl5s-csrwKb_LVTnpF3HqArz06TRD5LYJ0hpqvokUA,85
 evalscope/perf/plugin/registry.py,sha256=w1IAt6GDdluzSYK5i-yrntvx3_EvIIqJamEL0xZv3zA,1323
 evalscope/perf/plugin/api/__init__.py,sha256=Ckzbq4CkSMVQTedQcDHCYlRd6FTwQAElt2mHB-VXJac,195
@@ -350,7 +350,7 @@ evalscope/perf/plugin/datasets/speed_benchmark.py,sha256=J6q7AF_Re5eHLVejXEw9c1j
 evalscope/perf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/perf/utils/analysis_result.py,sha256=ESzaZHGTpr2LoJR3jpOzqMphxSrr79d364ZzD159PmY,1169
 evalscope/perf/utils/benchmark_util.py,sha256=CftjnxYA7d1aeAL_iuyXcJPwCL5A8zWGZSkNtjrMyW8,6309
-evalscope/perf/utils/db_util.py,sha256=efz6qQtMIYAIpG0sAEjLwuzTHBUiuzAV1n7_DCGrN5o,9461
+evalscope/perf/utils/db_util.py,sha256=VsYgz6IsSNPAWGCopOOIxAUhUat3GRbZMlrfdZ6i4kM,9575
 evalscope/perf/utils/handler.py,sha256=HyKIxbzC0XCyQanlbb7UEY7yaeqjJTePNea8kMV3Sdc,1192
 evalscope/perf/utils/local_server.py,sha256=clF8i0UFmaxBBB6gX05KvVCyzSv0xzsAidz0_sLLlAk,4627
 evalscope/perf/utils/log_utils.py,sha256=1jmB31W3ol9ukPAPbQ8xG3yoZ9oi3tjEyMK5M3ERmbw,1471
@@ -429,13 +429,13 @@ evalscope/utils/utils.py,sha256=hP_ntROFsZ-zaNVpJtT2prNo8iX-UAKfRtdxbLtPJng,1110
 tests/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
 tests/aigc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
-tests/aigc/test_t2i.py,sha256=_M3WxY5ruBM4RD7rYHhgizcIhH-ny5XD9M16Ayl3UPk,2619
+tests/aigc/test_t2i.py,sha256=BcdS3OMypWnraXF4Cq3DhDVRpZq0qo9_0Qpyg54B7FY,2627
 tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 tests/cli/test_all.py,sha256=pwup--iNxckUEsR_aFjIAbEQo3UogSu5aIWf9ryLP2o,4022
 tests/cli/test_collection.py,sha256=y8FjoPziPRf5BdJK8DHjcXn26ETKz1OyqjnCpwjt-F4,4096
-tests/cli/test_run.py,sha256=4B-6sOyotK3omirZWWyg7-CcnUSeZjiaU3aXHr0hH_Y,16804
+tests/cli/test_run.py,sha256=1DHLFlgGvHJizbLVc1ShcGFAHirEPgW8r88H7g8Sbx4,17245
 tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
-tests/perf/test_perf.py,sha256=8K5tGlWwOpYWnJ0GaCpqSw9zPOiM8fEKJaDil2mpTSQ,3831
+tests/perf/test_perf.py,sha256=diwwEmoWR-6xSVeGF65J6TWHRNj54rkwyvnhHh7PiE0,3919
 tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/rag/test_clip_benchmark.py,sha256=ZCBtgnF8Vuji6WQlb92-_RIvXlUX_Xt-cHZP4AN_DNI,2552
 tests/rag/test_mteb.py,sha256=YJw6X1jwX6SYNB-ryVb-OHJWu3vsE3Y4STATI75rdG0,5619
@@ -446,9 +446,9 @@ tests/swift/test_run_swift_vlm_eval.py,sha256=C8DftjewnZaerQWfERI70bU3sQLWQ-ejZU
 tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=THZEXUOSqm9rWslwJHmZyh-Ytv5c_QKpgRW5J2s_69E,6017
 tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 tests/vlm/test_vlmeval.py,sha256=UqRiBPMU3vRtLIG1Qu4ZVhyUQx-zGYQuLCgobwf-7a4,3176
-evalscope-0.15.0.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
-evalscope-0.15.0.dist-info/METADATA,sha256=MLn0s_L7s0oeQPWL1XuhihDAFJnzLdVTvdrep-9Bgag,34053
-evalscope-0.15.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
-evalscope-0.15.0.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
-evalscope-0.15.0.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
-evalscope-0.15.0.dist-info/RECORD,,
+evalscope-0.15.1.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
+evalscope-0.15.1.dist-info/METADATA,sha256=JvRF5sI_9ak9Y-FwWdU1Y8BE96iKPLO_hIGC7Z9SWpg,34080
+evalscope-0.15.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+evalscope-0.15.1.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
+evalscope-0.15.1.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
+evalscope-0.15.1.dist-info/RECORD,,
tests/aigc/test_t2i.py CHANGED
@@ -59,9 +59,9 @@ class TestRun(unittest.TestCase):
             },
             datasets=[
                 'tifa160',
-                'genai_bench',
-                'evalmuse',
-                'hpdv2',
+                # 'genai_bench',
+                # 'evalmuse',
+                # 'hpdv2',
             ],
             dataset_args={
                 'tifa160': {
@@ -81,7 +81,7 @@ class TestRun(unittest.TestCase):
                 'num_inference_steps': 50,
                 'guidance_scale': 7.5
             },
-            use_cache='outputs/20250427_134122',
+            # use_cache='outputs/20250427_134122',
         )

         run_task(task_cfg=task_cfg)
tests/cli/test_run.py CHANGED
@@ -207,13 +207,13 @@ class TestRun(unittest.TestCase):
         from evalscope.config import TaskConfig

         task_cfg = TaskConfig(
-            model='Qwen/Qwen2.5-0.5B-Instruct',
+            model='Qwen/Qwen3-1.7B',
             datasets=[
                 # 'iquiz',
                 # 'math_500',
-                # 'aime24',
+                'aime24',
                 # 'competition_math',
-                'mmlu',
+                # 'mmlu',
             ],
             dataset_args={
                 'competition_math': {
@@ -224,8 +224,15 @@ class TestRun(unittest.TestCase):
                     'few_shot_num': 0
                 },
             },
-            limit=10,
-            eval_batch_size=10,
+            limit=5,
+            eval_batch_size=5,
+            generation_config={
+                'max_new_tokens': 1000,  # maximum number of new tokens; set it high to avoid truncated output
+                'temperature': 0.7,  # sampling temperature (recommended value from the Qwen report)
+                'top_p': 0.8,  # top-p sampling (recommended value from the Qwen report)
+                'top_k': 20,  # top-k sampling (recommended value from the Qwen report)
+                'chat_template_kwargs': {'enable_thinking': False}  # disable thinking mode
+            }
         )

         run_task(task_cfg=task_cfg)
tests/perf/test_perf.py CHANGED
@@ -103,7 +103,7 @@ class TestPerf(unittest.TestCase):
         from evalscope.perf.arguments import Arguments
         task_cfg = Arguments(
             parallel=20,
-            model='Qwen2.5-0.5B-Instruct',
+            model='Qwen3-1.7B',
             url='http://127.0.0.1:8801/v1/completions',
             api='openai',
             dataset='random',
@@ -117,7 +117,9 @@ class TestPerf(unittest.TestCase):
             seed=None,
             extra_args={'ignore_eos': True}
         )
-        run_perf_benchmark(task_cfg)
+        metrics_result, percentile_result = run_perf_benchmark(task_cfg)
+        print(metrics_result)
+        print(percentile_result)


 if __name__ == '__main__':