evalscope 0.15.0__tar.gz → 0.15.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {evalscope-0.15.0/evalscope.egg-info → evalscope-0.15.1}/PKG-INFO +5 -5
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +3 -3
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/general_qa/general_qa_adapter.py +3 -3
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/evaluator/evaluator.py +7 -1
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/models/adapters/chat_adapter.py +3 -3
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/benchmark.py +4 -3
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/main.py +4 -2
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/utils/db_util.py +8 -6
- evalscope-0.15.1/evalscope/version.py +4 -0
- {evalscope-0.15.0 → evalscope-0.15.1/evalscope.egg-info}/PKG-INFO +5 -5
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope.egg-info/requires.txt +4 -4
- {evalscope-0.15.0 → evalscope-0.15.1}/requirements/aigc.txt +1 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/requirements/framework.txt +1 -2
- {evalscope-0.15.0 → evalscope-0.15.1}/tests/aigc/test_t2i.py +4 -4
- {evalscope-0.15.0 → evalscope-0.15.1}/tests/cli/test_run.py +12 -5
- {evalscope-0.15.0 → evalscope-0.15.1}/tests/perf/test_perf.py +4 -2
- evalscope-0.15.0/evalscope/version.py +0 -4
- {evalscope-0.15.0 → evalscope-0.15.1}/LICENSE +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/MANIFEST.in +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/README.md +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/arguments.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/base.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/opencompass/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/opencompass/api_meta_template.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/opencompass/backend_manager.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/opencompass/tasks/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/opencompass/tasks/eval_api.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/opencompass/tasks/eval_datasets.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/backend_manager.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/clip_benchmark/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/clip_benchmark/arguments.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/clip_benchmark/task_template.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/cmteb/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/cmteb/arguments.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/cmteb/base.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/cmteb/task_template.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/cmteb/tasks/Classification.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/cmteb/tasks/STS.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/cmteb/tasks/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/ragas/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/ragas/arguments.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/ragas/task_template.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/ragas/tasks/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/ragas/tasks/build_transform.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/utils/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/utils/clip.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/utils/embedding.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/utils/llm.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/utils/tools.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/vlm_eval_kit/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/vlm_eval_kit/backend_manager.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/aigc/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/aigc/t2i/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/aigc/t2i/base.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/aime/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/aime/aime24_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/aime/aime25_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/arc/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/arc/ai2_arc.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/arc/arc_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/arena_hard/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/arena_hard/arena_hard_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/arena_hard/utils.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/bbh_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/navigate.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/snarks.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/benchmark.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/ceval/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/ceval/ceval_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/ceval/ceval_exam.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/cmmlu/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/cmmlu/cmmlu.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/cmmlu/cmmlu_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/cmmlu/samples.jsonl +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/competition_math/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/competition_math/competition_math.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/data_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/data_collection/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/general_mcq/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/general_mcq/general_mcq_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/general_qa/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/gpqa/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/gpqa/chain_of_thought.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/gpqa/gpqa_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/gsm8k/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/gsm8k/gsm8k.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/hellaswag/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/hellaswag/hellaswag.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/hellaswag/hellaswag_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/humaneval/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/humaneval/humaneval.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/ifeval/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/ifeval/ifeval_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/ifeval/instructions.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/ifeval/instructions_registry.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/ifeval/instructions_util.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/ifeval/utils.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/iquiz/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/iquiz/iquiz_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/live_code_bench/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/live_code_bench/evaluate_utils.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/live_code_bench/extract_utils.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/live_code_bench/load_utils.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/live_code_bench/pass_k_utils.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/live_code_bench/prompts.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/live_code_bench/testing_util.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/maritime_bench/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/math_500/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/math_500/math_500_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/mmlu/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/mmlu/mmlu.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/mmlu/mmlu_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/mmlu/samples.jsonl +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/musr/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/musr/musr_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/process_bench/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/process_bench/critique_template.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/process_bench/process_bench_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/race/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/race/race.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/race/race_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/race/samples.jsonl +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/simple_qa/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/simple_qa/simple_qa_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/super_gpqa/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/super_gpqa/utils.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/trivia_qa/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/trivia_qa/samples.jsonl +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/truthful_qa/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/utils.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/cli/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/cli/base.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/cli/cli.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/cli/start_app.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/cli/start_eval.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/cli/start_perf.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/cli/start_server.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/collections/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/collections/evaluator.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/collections/sampler.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/collections/schema.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/config.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/constants.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/evaluator/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/evaluator/rating_eval.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/evaluator/reviewer/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/evaluator/reviewer/auto_reviewer.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/bundled_rouge_score/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/bundled_rouge_score/rouge_scorer.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/llm_judge.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/math_parser.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/metrics.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/named_metrics.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/rouge_metric.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/clipscore.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/constants.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/itmscore.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/model.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/utils.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/score.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/vqascore.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/models/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/models/adapters/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/models/adapters/base_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/models/adapters/choice_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/models/adapters/custom_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/models/adapters/server_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/models/adapters/t2i_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/models/custom/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/models/custom/custom_model.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/models/custom/dummy_model.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/models/local_model.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/models/model.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/models/register.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/arguments.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/http_client.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/plugin/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/plugin/api/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/plugin/api/base.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/plugin/api/custom_api.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/plugin/api/dashscope_api.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/plugin/api/openai_api.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/plugin/datasets/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/plugin/datasets/base.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/plugin/datasets/custom.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/plugin/datasets/flickr8k.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/plugin/datasets/line_by_line.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/plugin/datasets/longalpaca.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/plugin/datasets/openqa.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/plugin/datasets/random_dataset.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/plugin/datasets/speed_benchmark.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/plugin/registry.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/utils/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/utils/analysis_result.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/utils/benchmark_util.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/utils/handler.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/utils/local_server.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/utils/log_utils.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/registry/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/registry/config/cfg_arena.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/registry/config/cfg_arena_zhihu.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/registry/config/cfg_single.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/registry/data/qa_browser/battle.jsonl +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/registry/data/qa_browser/category_mapping.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/registry/data/question.jsonl +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/registry/tasks/arc.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/registry/tasks/bbh.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/registry/tasks/bbh_mini.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/registry/tasks/ceval.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/registry/tasks/ceval_mini.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/registry/tasks/cmmlu.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/registry/tasks/general_qa.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/registry/tasks/gsm8k.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/registry/tasks/mmlu.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/registry/tasks/mmlu_mini.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/report/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/report/app.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/report/app_arguments.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/report/combinator.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/report/generator.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/report/utils.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/run.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/run_arena.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/summarizer.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/longbench_write/README.md +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/longbench_write/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/longbench_write/default_task.json +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/longbench_write/default_task.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/longbench_write/eval.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/longbench_write/infer.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/longbench_write/longbench_write.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/longbench_write/resources/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/longbench_write/resources/judge.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/longbench_write/resources/longbench_write.jsonl +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/longbench_write/tools/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/longbench_write/tools/data_etl.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/longbench_write/tools/openai_api.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/longbench_write/utils.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/thinkbench/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/thinkbench/eval.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/thinkbench/infer.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/thinkbench/resources/critique_template.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/thinkbench/resources/reformat_template.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/thinkbench/tools/llm.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/thinkbench/tools/utils.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/toolbench_static/README.md +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/toolbench_static/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/toolbench_static/config_default.json +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/toolbench_static/config_default.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/toolbench_static/eval.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/toolbench_static/infer.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/toolbench_static/llm/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/toolbench_static/requirements.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/toolbench_static/toolbench_static.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/utils/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/utils/arena_utils.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/utils/chat_service.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/utils/completion_parsers.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/utils/filters.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/utils/import_utils.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/utils/io_utils.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/utils/logger.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/utils/model_utils.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/utils/utils.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope.egg-info/SOURCES.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope.egg-info/dependency_links.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope.egg-info/entry_points.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope.egg-info/not-zip-safe +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope.egg-info/top_level.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/requirements/app.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/requirements/docs.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/requirements/opencompass.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/requirements/perf.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/requirements/rag.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/requirements/vlmeval.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/requirements.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/setup.cfg +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/setup.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/tests/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/tests/aigc/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/tests/cli/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/tests/cli/test_all.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/tests/cli/test_collection.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/tests/perf/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/tests/rag/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/tests/rag/test_clip_benchmark.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/tests/rag/test_mteb.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/tests/rag/test_ragas.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/tests/swift/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/tests/swift/test_run_swift_eval.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/tests/swift/test_run_swift_vlm_eval.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/tests/swift/test_run_swift_vlm_jugde_eval.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/tests/test_run_all.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/tests/vlm/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/tests/vlm/test_vlmeval.py +0 -0
{evalscope-0.15.0/evalscope.egg-info → evalscope-0.15.1}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.15.0
+Version: 0.15.1
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
@@ -26,12 +26,11 @@ Requires-Dist: latex2sympy2
 Requires-Dist: matplotlib
 Requires-Dist: modelscope[framework]
 Requires-Dist: nltk>=3.9
-Requires-Dist: omegaconf
 Requires-Dist: openai
 Requires-Dist: pandas
 Requires-Dist: pillow
 Requires-Dist: pyarrow
-Requires-Dist: pyyaml
+Requires-Dist: pyyaml>=5.1
 Requires-Dist: requests
 Requires-Dist: rouge-chinese
 Requires-Dist: rouge-score>=0.1.0
@@ -70,6 +69,7 @@ Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "app"
 Provides-Extra: aigc
 Requires-Dist: diffusers; extra == "aigc"
 Requires-Dist: iopath; extra == "aigc"
+Requires-Dist: omegaconf; extra == "aigc"
 Requires-Dist: open_clip_torch; extra == "aigc"
 Requires-Dist: opencv-python; extra == "aigc"
 Provides-Extra: all
@@ -83,12 +83,11 @@ Requires-Dist: latex2sympy2; extra == "all"
 Requires-Dist: matplotlib; extra == "all"
 Requires-Dist: modelscope[framework]; extra == "all"
 Requires-Dist: nltk>=3.9; extra == "all"
-Requires-Dist: omegaconf; extra == "all"
 Requires-Dist: openai; extra == "all"
 Requires-Dist: pandas; extra == "all"
 Requires-Dist: pillow; extra == "all"
 Requires-Dist: pyarrow; extra == "all"
-Requires-Dist: pyyaml; extra == "all"
+Requires-Dist: pyyaml>=5.1; extra == "all"
 Requires-Dist: requests; extra == "all"
 Requires-Dist: rouge-chinese; extra == "all"
 Requires-Dist: rouge-score>=0.1.0; extra == "all"
@@ -121,6 +120,7 @@ Requires-Dist: gradio==5.4.0; extra == "all"
 Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "all"
 Requires-Dist: diffusers; extra == "all"
 Requires-Dist: iopath; extra == "all"
+Requires-Dist: omegaconf; extra == "all"
 Requires-Dist: open_clip_torch; extra == "all"
 Requires-Dist: opencv-python; extra == "all"
 
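Note: the net effect of the metadata changes above is that omegaconf moves from the core dependencies into the aigc extra (and, through it, the all extra), and pyyaml gains a >=5.1 floor. Presumably the AIGC/text-to-image tooling is what still needs omegaconf, since it now sits alongside diffusers and open_clip_torch; if your workflow relies on it, installing with the extra, e.g. pip install 'evalscope[aigc]' (standard pip extras syntax), keeps it available.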
{evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py

@@ -34,7 +34,7 @@ class GeneralT2IAdapter(T2IBaseAdapter):
         subset_list = subset_list or self.subset_list
 
         data_file_dict = defaultdict(str)
-        …
+        data_item_dict = defaultdict(list)
 
         # get data file path and subset name
         if os.path.isdir(dataset_name_or_path):
@@ -49,10 +49,10 @@ class GeneralT2IAdapter(T2IBaseAdapter):
         # load data from local disk
         try:
             for subset_name, file_path in data_file_dict.items():
-                …
+                data_item_dict[subset_name] = jsonl_to_list(file_path)
         except Exception as e:
             raise ValueError(f'Failed to load data from {self.dataset_id}, got error: {e}')
 
-        data_dict = {subset_name: {'test': …
+        data_dict = {subset_name: {'test': data_item_dict[subset_name]} for subset_name in data_file_dict.keys()}
 
         return data_dict
{evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/general_qa/general_qa_adapter.py

@@ -33,7 +33,7 @@ class GeneralQAAdapter(DataAdapter):
         subset_list = subset_list or self.subset_list
 
         data_file_dict = defaultdict(str)
-        …
+        data_item_dict = defaultdict(list)
 
         # get data file path and subset name
         if os.path.isdir(dataset_name_or_path):
@@ -48,11 +48,11 @@ class GeneralQAAdapter(DataAdapter):
         # load data from local disk
         try:
             for subset_name, file_path in data_file_dict.items():
-                …
+                data_item_dict[subset_name] = jsonl_to_list(file_path)
         except Exception as e:
             raise ValueError(f'Failed to load data from {self.dataset_id}, got error: {e}')
 
-        data_dict = {subset_name: {'test': …
+        data_dict = {subset_name: {'test': data_item_dict[subset_name]} for subset_name in data_file_dict.keys()}
 
         return data_dict
 
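Both general adapters (text-to-image above, general QA here) receive the same fix: loaded rows are accumulated per subset in a data_item_dict, and the final mapping is built from it, so each subset's 'test' split carries its own items. A minimal sketch of the resulting load pattern — the file path is illustrative and the import location of jsonl_to_list is an assumption:

    from collections import defaultdict
    from evalscope.utils.io_utils import jsonl_to_list  # assumed location of the JSONL helper

    data_file_dict = {'default': 'data/default.jsonl'}  # hypothetical subset -> file map
    data_item_dict = defaultdict(list)
    for subset_name, file_path in data_file_dict.items():
        data_item_dict[subset_name] = jsonl_to_list(file_path)  # rows of one subset
    data_dict = {s: {'test': data_item_dict[s]} for s in data_file_dict}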
{evalscope-0.15.0 → evalscope-0.15.1}/evalscope/evaluator/evaluator.py

@@ -317,6 +317,8 @@ class Evaluator(object):
         """
 
         review_res_list = []
+        max_choices = max(
+            len(review_d[AnswerKeys.CHOICES]) for review_d in reviews_list if review_d[ReviewKeys.REVIEWED])
         for review_d in reviews_list:
             if not review_d[ReviewKeys.REVIEWED]:
                 logger.warning(f'Review not finished for answer_id: {review_d[AnswerKeys.ANSWER_ID]}')
@@ -325,10 +327,14 @@ class Evaluator(object):
             if len(review_d[AnswerKeys.CHOICES]) == 0:
                 logger.warning(f'No choices found for answer_id: {review_d[AnswerKeys.ANSWER_ID]}')
                 continue
-            elif len(review_d[AnswerKeys.CHOICES]) == 1:
+            elif len(review_d[AnswerKeys.CHOICES]) == 1 and max_choices == 1:
                 review_res = review_d[AnswerKeys.CHOICES][0][ReviewKeys.REVIEW][ReviewKeys.RESULT]
             else:
                 review_res = [choice[ReviewKeys.REVIEW][ReviewKeys.RESULT] for choice in review_d[AnswerKeys.CHOICES]]
+                if len(review_d[AnswerKeys.CHOICES]) < max_choices:
+                    logger.warning(
+                        f'Less choices found for answer_id: {review_d[AnswerKeys.ANSWER_ID]}, '
+                        f'max_choices is {max_choices}, but only {len(review_d[AnswerKeys.CHOICES])} choices found')
 
             review_res_list.append(review_res)
 
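The evaluator change makes aggregation consistent when answers carry different numbers of sampled choices: a result is unwrapped to a scalar only when every reviewed answer has exactly one choice; otherwise all results stay as lists and a warning flags answers with fewer choices than the maximum. A runnable illustration with simplified keys (the real code uses the AnswerKeys/ReviewKeys constants):

    # hypothetical review results: one answer with 1 choice, one with 2
    reviews = [{'choices': [True]}, {'choices': [True, False]}]
    max_choices = max(len(r['choices']) for r in reviews)
    review_res_list = [
        r['choices'][0] if len(r['choices']) == 1 and max_choices == 1 else r['choices']
        for r in reviews
    ]
    print(review_res_list)  # [[True], [True, False]]: lists kept, since max_choices == 2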
{evalscope-0.15.0 → evalscope-0.15.1}/evalscope/models/adapters/chat_adapter.py

@@ -100,10 +100,10 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
             if i < len(system_prompts) and system_prompts[i]:
                 messages = [ChatMessage(role='system', content=system_prompts[i])] + messages
             # whether thinking is needed
-            …
-            if …
+            chat_template_kwargs = infer_cfg.get('chat_template_kwargs', None)
+            if chat_template_kwargs is not None:
                 prompts = self.tokenizer.apply_chat_template(
-                    messages, tokenize=False, add_generation_prompt=True, …
+                    messages, tokenize=False, add_generation_prompt=True, **chat_template_kwargs)
             else:
                 prompts = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
             formatted_prompts.append(prompts)
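The new chat_template_kwargs hook forwards arbitrary key/value pairs from the inference config straight into tokenizer.apply_chat_template. A hedged sketch of a config that uses it — both keys below are illustrative; enable_thinking is just an example of a kwarg that some chat templates (e.g. Qwen3's) understand:

    infer_cfg = {
        'max_new_tokens': 512,                               # hypothetical generation setting
        'chat_template_kwargs': {'enable_thinking': False},  # forwarded verbatim as **kwargs
    }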
{evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/benchmark.py

@@ -9,7 +9,7 @@ import threading
 import time
 from http import HTTPStatus
 from tqdm import tqdm
-from typing import AsyncGenerator, List
+from typing import AsyncGenerator, Dict, List, Tuple
 
 from evalscope.perf.arguments import Arguments
 from evalscope.perf.http_client import AioHttpClient, test_connection
@@ -180,7 +180,7 @@ async def connect_test(args: Arguments) -> bool:
 
 
 @exception_handler
-async def benchmark(args: Arguments) -> None:
+async def benchmark(args: Arguments) -> Tuple[Dict, Dict]:
     if platform.system() != 'Windows':
         loop = asyncio.get_running_loop()
         add_signal_handlers(loop)
@@ -205,4 +205,5 @@ async def benchmark(args: Arguments) -> None:
     data_process_completed_event.set()
 
     metrics, result_db_path = await statistic_benchmark_metric_task
-    summary_result(args, metrics, result_db_path)
+    metrics_result, percentile_result = summary_result(args, metrics, result_db_path)
+    return metrics_result, percentile_result
{evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/main.py

@@ -36,9 +36,11 @@ def run_perf_benchmark(args):
     if platform.system() != 'Windows':
         add_signal_handlers(loop)
 
-    loop.run_until_complete(benchmark(args))
+    return loop.run_until_complete(benchmark(args))
 
 
 if __name__ == '__main__':
    args = Arguments.from_args(parse_args())
-    run_perf_benchmark(args)
+    metrics_result, percentile_result = run_perf_benchmark(args)
+    print(metrics_result)
+    print(percentile_result)
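Taken together, the perf changes make benchmark results consumable from Python: run_perf_benchmark (via the async benchmark above) now returns a (metrics_result, percentile_result) tuple instead of only logging and writing them. A hedged usage sketch — the Arguments fields shown are assumptions based on common perf options; see evalscope/perf/arguments.py for the real set:

    from evalscope.perf.arguments import Arguments
    from evalscope.perf.main import run_perf_benchmark

    args = Arguments(
        url='http://127.0.0.1:8000/v1/chat/completions',  # hypothetical endpoint
        model='my-model',                                 # hypothetical model name
        dataset='openqa',                                 # a dataset plugin shipped with evalscope
        number=20,                                        # args.number, used by summary_result below
    )
    metrics_result, percentile_result = run_perf_benchmark(args)
    print(metrics_result['Expected number of requests'])  # key added by summary_result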
@@ -7,7 +7,7 @@ import sqlite3
|
|
|
7
7
|
import sys
|
|
8
8
|
from datetime import datetime
|
|
9
9
|
from tabulate import tabulate
|
|
10
|
-
from typing import Dict, List
|
|
10
|
+
from typing import Dict, List, Tuple
|
|
11
11
|
|
|
12
12
|
from evalscope.perf.arguments import Arguments
|
|
13
13
|
from evalscope.perf.utils.benchmark_util import BenchmarkData, BenchmarkMetrics
|
|
@@ -200,16 +200,16 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
|
|
|
200
200
|
return results
|
|
201
201
|
|
|
202
202
|
|
|
203
|
-
def summary_result(args: Arguments, metrics: BenchmarkMetrics, result_db_path: str):
|
|
203
|
+
def summary_result(args: Arguments, metrics: BenchmarkMetrics, result_db_path: str) -> Tuple[Dict, Dict]:
|
|
204
204
|
result_path = os.path.dirname(result_db_path)
|
|
205
205
|
write_json_file(args.to_dict(), os.path.join(result_path, 'benchmark_args.json'))
|
|
206
206
|
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
write_json_file(
|
|
207
|
+
metrics_result = metrics.create_message()
|
|
208
|
+
metrics_result.update({'Expected number of requests': args.number, 'Result DB path': result_db_path})
|
|
209
|
+
write_json_file(metrics_result, os.path.join(result_path, 'benchmark_summary.json'))
|
|
210
210
|
|
|
211
211
|
# Print summary in a table
|
|
212
|
-
table = tabulate(list(
|
|
212
|
+
table = tabulate(list(metrics_result.items()), headers=['Key', 'Value'], tablefmt='grid')
|
|
213
213
|
logger.info('\nBenchmarking summary:\n' + table)
|
|
214
214
|
|
|
215
215
|
# Get percentile results
|
|
@@ -223,6 +223,8 @@ def summary_result(args: Arguments, metrics: BenchmarkMetrics, result_db_path: str):
     if args.dataset.startswith('speed_benchmark'):
         speed_benchmark_result(result_db_path)
 
+    return metrics_result, percentile_result
+
 
 def speed_benchmark_result(result_db_path: str):
     query_sql = """
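Besides returning the results, summary_result now persists the metrics message as benchmark_summary.json next to the result DB, so the same numbers can be re-read after the run. A small sketch; the output directory below is a placeholder (evalscope writes timestamped per-run directories):

import json
import os

result_path = 'outputs/my_perf_run'  # placeholder for the directory containing the result DB

with open(os.path.join(result_path, 'benchmark_summary.json')) as f:
    summary = json.load(f)

# 'Expected number of requests' and 'Result DB path' are the keys summary_result
# adds on top of whatever BenchmarkMetrics.create_message produces.
print(summary['Expected number of requests'])
print(summary['Result DB path'])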
{evalscope-0.15.0 → evalscope-0.15.1/evalscope.egg-info}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.15.0
+Version: 0.15.1
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
@@ -26,12 +26,11 @@ Requires-Dist: latex2sympy2
 Requires-Dist: matplotlib
 Requires-Dist: modelscope[framework]
 Requires-Dist: nltk>=3.9
-Requires-Dist: omegaconf
 Requires-Dist: openai
 Requires-Dist: pandas
 Requires-Dist: pillow
 Requires-Dist: pyarrow
-Requires-Dist: pyyaml
+Requires-Dist: pyyaml>=5.1
 Requires-Dist: requests
 Requires-Dist: rouge-chinese
 Requires-Dist: rouge-score>=0.1.0
@@ -70,6 +69,7 @@ Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "app"
 Provides-Extra: aigc
 Requires-Dist: diffusers; extra == "aigc"
 Requires-Dist: iopath; extra == "aigc"
+Requires-Dist: omegaconf; extra == "aigc"
 Requires-Dist: open_clip_torch; extra == "aigc"
 Requires-Dist: opencv-python; extra == "aigc"
 Provides-Extra: all
@@ -83,12 +83,11 @@ Requires-Dist: latex2sympy2; extra == "all"
 Requires-Dist: matplotlib; extra == "all"
 Requires-Dist: modelscope[framework]; extra == "all"
 Requires-Dist: nltk>=3.9; extra == "all"
-Requires-Dist: omegaconf; extra == "all"
 Requires-Dist: openai; extra == "all"
 Requires-Dist: pandas; extra == "all"
 Requires-Dist: pillow; extra == "all"
 Requires-Dist: pyarrow; extra == "all"
-Requires-Dist: pyyaml; extra == "all"
+Requires-Dist: pyyaml>=5.1; extra == "all"
 Requires-Dist: requests; extra == "all"
 Requires-Dist: rouge-chinese; extra == "all"
 Requires-Dist: rouge-score>=0.1.0; extra == "all"
@@ -121,6 +120,7 @@ Requires-Dist: gradio==5.4.0; extra == "all"
 Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "all"
 Requires-Dist: diffusers; extra == "all"
 Requires-Dist: iopath; extra == "all"
+Requires-Dist: omegaconf; extra == "all"
 Requires-Dist: open_clip_torch; extra == "all"
 Requires-Dist: opencv-python; extra == "all"
 
{evalscope-0.15.0 → evalscope-0.15.1}/evalscope.egg-info/requires.txt
@@ -8,12 +8,11 @@ latex2sympy2
 matplotlib
 modelscope[framework]
 nltk>=3.9
-omegaconf
 openai
 pandas
 pillow
 pyarrow
-pyyaml
+pyyaml>=5.1
 requests
 rouge-chinese
 rouge-score>=0.1.0
@@ -31,6 +30,7 @@ word2number
 [aigc]
 diffusers
 iopath
+omegaconf
 open_clip_torch
 opencv-python
 
@@ -45,12 +45,11 @@ latex2sympy2
 matplotlib
 modelscope[framework]
 nltk>=3.9
-omegaconf
 openai
 pandas
 pillow
 pyarrow
-pyyaml
+pyyaml>=5.1
 requests
 rouge-chinese
 rouge-score>=0.1.0
@@ -83,6 +82,7 @@ gradio==5.4.0
 plotly<6.0.0,>=5.23.0
 diffusers
 iopath
+omegaconf
 open_clip_torch
 opencv-python
 
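The packaging changes drop omegaconf from the core requirements (it is now pulled in only by the aigc and all extras) and raise the pyyaml floor to 5.1. A quick sanity check of an installed environment, sketched with importlib.metadata and packaging; packaging is assumed to be available, as it usually ships alongside pip/setuptools:

from importlib.metadata import PackageNotFoundError, version
from packaging.version import Version

# evalscope 0.15.1 pins pyyaml>=5.1 in the core requirements.
assert Version(version('PyYAML')) >= Version('5.1')

try:
    version('omegaconf')
except PackageNotFoundError:
    # Expected for a plain `pip install evalscope`; omegaconf now comes
    # only with the aigc (or all) extra.
    print('omegaconf not installed; use the aigc extra if you need it')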
{evalscope-0.15.0 → evalscope-0.15.1}/tests/aigc/test_t2i.py
@@ -59,9 +59,9 @@ class TestRun(unittest.TestCase):
             },
             datasets=[
                 'tifa160',
-                'genai_bench',
-                'evalmuse',
-                'hpdv2',
+                # 'genai_bench',
+                # 'evalmuse',
+                # 'hpdv2',
             ],
             dataset_args={
                 'tifa160': {
@@ -81,7 +81,7 @@ class TestRun(unittest.TestCase):
                 'num_inference_steps': 50,
                 'guidance_scale': 7.5
             },
-            use_cache='outputs/20250427_134122',
+            # use_cache='outputs/20250427_134122',
         )
 
         run_task(task_cfg=task_cfg)
{evalscope-0.15.0 → evalscope-0.15.1}/tests/cli/test_run.py
@@ -207,13 +207,13 @@ class TestRun(unittest.TestCase):
         from evalscope.config import TaskConfig
 
         task_cfg = TaskConfig(
-            model='Qwen/
+            model='Qwen/Qwen3-1.7B',
             datasets=[
                 # 'iquiz',
                 # 'math_500',
-
+                'aime24',
                 # 'competition_math',
-                'mmlu',
+                # 'mmlu',
             ],
             dataset_args={
                 'competition_math': {
@@ -224,8 +224,15 @@ class TestRun(unittest.TestCase):
                     'few_shot_num': 0
                 },
             },
-            limit=
-            eval_batch_size=
+            limit=5,
+            eval_batch_size=5,
+            generation_config={
+                'max_new_tokens': 1000,  # maximum number of generated tokens; a large value is recommended to avoid truncated output
+                'temperature': 0.7,  # sampling temperature (recommended in the Qwen report)
+                'top_p': 0.8,  # top-p sampling (recommended in the Qwen report)
+                'top_k': 20,  # top-k sampling (recommended in the Qwen report)
+                'chat_template_kwargs': {'enable_thinking': False}  # disable thinking mode
+            }
         )
 
         run_task(task_cfg=task_cfg)
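The new generation_config mirrors the sampling settings recommended for Qwen3 (temperature 0.7, top_p 0.8, top_k 20), and chat_template_kwargs carries the template-level enable_thinking switch. How that switch typically takes effect at the tokenizer level, sketched with transformers rather than this repo's internals (an illustration only):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen3-1.7B')
messages = [{'role': 'user', 'content': 'What is 2 + 2?'}]

# Extra keyword arguments to apply_chat_template are exposed to the Jinja chat
# template; Qwen3's template reads enable_thinking to skip the <think> block.
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=False,
)
print(prompt)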
{evalscope-0.15.0 → evalscope-0.15.1}/tests/perf/test_perf.py
@@ -103,7 +103,7 @@ class TestPerf(unittest.TestCase):
         from evalscope.perf.arguments import Arguments
         task_cfg = Arguments(
             parallel=20,
-            model='
+            model='Qwen3-1.7B',
             url='http://127.0.0.1:8801/v1/completions',
             api='openai',
             dataset='random',
@@ -117,7 +117,9 @@ class TestPerf(unittest.TestCase):
             seed=None,
             extra_args={'ignore_eos': True}
         )
-        run_perf_benchmark(task_cfg)
+        metrics_result, percentile_result = run_perf_benchmark(task_cfg)
+        print(metrics_result)
+        print(percentile_result)
 
 
 if __name__ == '__main__':
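Since the perf entry point now returns its results, the test could assert on them instead of only printing. A sketch; only the two keys set explicitly in summary_result are guaranteed by this diff, and everything else in metrics_result depends on BenchmarkMetrics.create_message:

metrics_result, percentile_result = run_perf_benchmark(task_cfg)

# Keys added by summary_result in db_util.py (see the hunk above).
assert metrics_result['Expected number of requests'] == task_cfg.number
assert 'Result DB path' in metrics_result
assert isinstance(percentile_result, dict)  # Dict per the new Tuple[Dict, Dict] annotation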
RENAMED, file without changes:
{evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/clip_benchmark/arguments.py
{evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py
{evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/clip_benchmark/task_template.py
{evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py
{evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/cmteb/tasks/Classification.py
{evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py
{evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py
{evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/ragas/tasks/build_distribution.py
{evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/ragas/tasks/build_transform.py
{evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/ragas/tasks/testset_generation.py
{evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py
{evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py
{evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt
{evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt
{evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt
{evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt
{evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt