evalscope 1.0.1__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- evalscope/api/benchmark/adapters/default_data_adapter.py +18 -4
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
- evalscope/api/benchmark/adapters/text2image_adapter.py +5 -4
- evalscope/api/benchmark/adapters/vision_language_adapter.py +3 -1
- evalscope/api/benchmark/benchmark.py +27 -2
- evalscope/api/benchmark/meta.py +3 -0
- evalscope/api/evaluator/evaluator.py +5 -0
- evalscope/api/evaluator/state.py +5 -0
- evalscope/api/messages/chat_message.py +6 -1
- evalscope/api/mixin/__init__.py +1 -0
- evalscope/api/mixin/llm_judge_mixin.py +2 -0
- evalscope/api/mixin/sandbox_mixin.py +204 -0
- evalscope/api/model/generate_config.py +0 -3
- evalscope/api/model/model.py +1 -1
- evalscope/api/tool/tool_info.py +1 -1
- evalscope/app/ui/multi_model.py +6 -1
- evalscope/app/ui/single_model.py +8 -2
- evalscope/app/utils/data_utils.py +3 -2
- evalscope/app/utils/visualization.py +2 -2
- evalscope/arguments.py +6 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/amc/__init__.py +0 -0
- evalscope/benchmarks/amc/amc_adapter.py +46 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
- evalscope/benchmarks/bfcl/bfcl_adapter.py +106 -2
- evalscope/benchmarks/bfcl/generation.py +7 -7
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drop/drop_adapter.py +1 -1
- evalscope/benchmarks/general_arena/utils.py +2 -1
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hle/hle_adapter.py +3 -2
- evalscope/benchmarks/humaneval/humaneval_adapter.py +19 -35
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +60 -37
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +0 -1
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +48 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +1 -1
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +4 -9
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +1 -4
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench_v2/utils.py +432 -0
- evalscope/benchmarks/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +6 -1
- evalscope/config.py +24 -1
- evalscope/constants.py +3 -0
- evalscope/evaluator/evaluator.py +25 -7
- evalscope/metrics/metric.py +78 -2
- evalscope/metrics/metrics.py +16 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
- evalscope/models/model_apis.py +10 -8
- evalscope/models/utils/openai.py +1 -2
- evalscope/perf/arguments.py +2 -0
- evalscope/perf/plugin/api/base.py +2 -2
- evalscope/perf/plugin/api/default_api.py +7 -7
- evalscope/perf/plugin/api/openai_api.py +83 -19
- evalscope/perf/plugin/datasets/flickr8k.py +2 -2
- evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
- evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
- evalscope/perf/utils/benchmark_util.py +1 -2
- evalscope/report/__init__.py +9 -1
- evalscope/report/combinator.py +45 -20
- evalscope/report/report.py +8 -4
- evalscope/run.py +1 -1
- evalscope/utils/function_utils.py +41 -0
- evalscope/utils/import_utils.py +63 -13
- evalscope/utils/io_utils.py +19 -11
- evalscope/utils/json_schema.py +25 -2
- evalscope/utils/logger.py +19 -0
- evalscope/utils/model_utils.py +1 -1
- evalscope/utils/multi_choices.py +16 -1
- evalscope/version.py +2 -2
- {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/METADATA +10 -40
- {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/RECORD +120 -95
- {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/top_level.txt +0 -1
- tests/__init__.py +0 -1
- tests/benchmark/__init__.py +0 -1
- tests/benchmark/test_eval.py +0 -385
- tests/benchmark/test_image_edit.py +0 -65
- tests/benchmark/test_t2i.py +0 -142
- tests/benchmark/test_vlm.py +0 -80
- tests/cli/__init__.py +0 -1
- tests/cli/test_all.py +0 -269
- tests/cli/test_collection.py +0 -99
- tests/cli/test_custom.py +0 -268
- tests/cli/test_reasoning.py +0 -81
- tests/common.py +0 -73
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -178
- tests/rag/test_clip_benchmark.py +0 -87
- tests/rag/test_mteb.py +0 -213
- tests/rag/test_ragas.py +0 -128
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -146
- tests/swift/test_run_swift_vlm_eval.py +0 -128
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
- tests/test_run_all.py +0 -12
- tests/utils.py +0 -13
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -102
- {tests/rag → evalscope/benchmarks/ai2d}/__init__.py +0 -0
- {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/LICENSE +0 -0
- {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/WHEEL +0 -0
- {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/entry_points.txt +0 -0
evalscope/perf/plugin/api/openai_api.py CHANGED

@@ -1,10 +1,13 @@
 import json
+import math
 import os
+from collections import defaultdict
 from typing import Any, Dict, List, Tuple, Union
 
 from evalscope.perf.arguments import Arguments
 from evalscope.perf.plugin.api.default_api import DefaultApiPlugin
 from evalscope.perf.plugin.registry import register_api
+from evalscope.utils.io_utils import base64_to_PIL
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -113,7 +116,7 @@ class OpenaiPlugin(DefaultApiPlugin):
             return input_tokens, output_tokens
 
         # no usage information in the response, parse the response to get the tokens
-        delta_contents =
+        delta_contents = defaultdict(list)
        for response in responses:
            if 'object' in response:
                self.__process_response_object(response, delta_contents)
@@ -123,41 +126,46 @@ class OpenaiPlugin(DefaultApiPlugin):
        input_tokens, output_tokens = self.__calculate_tokens_from_content(request, delta_contents)
        return input_tokens, output_tokens
 
-    def __process_response_object(self,
-        if
-
+    def __process_response_object(self, response, delta_contents):
+        if not response.get('choices'):
+            return
+        if response['object'] == 'chat.completion':
+            for choice in response['choices']:
                delta_contents[choice['index']] = [choice['message']['content']]
-        elif
-            for choice in
-
-
-
+        elif response['object'] == 'text_completion':
+            for choice in response['choices']:
+                if 'text' in choice and 'index' in choice:
+                    delta_contents[choice['index']].append(choice['text'])
+        elif response['object'] == 'chat.completion.chunk':
+            for choice in response['choices']:
                if 'delta' in choice and 'index' in choice:
                    delta = choice['delta']
                    idx = choice['index']
                    if 'content' in delta:
-
-                        delta_contents.setdefault(idx, []).append(delta_content)
+                        delta_contents[idx].append(delta['content'])
 
-    def __process_no_object(self,
+    def __process_no_object(self, response, delta_contents):
        # assume the response is a single choice
-
+        if not response.get('choices'):
+            return
+        for choice in response['choices']:
            if 'delta' in choice:
                delta = choice['delta']
                idx = choice['index']
                if 'content' in delta:
-
-                    delta_contents.setdefault(idx, []).append(delta_content)
+                    delta_contents[idx].append(delta['content'])
            else:
                delta_contents[choice['index']] = [choice['message']['content']]
 
-    def __calculate_tokens_from_content(self, request,
+    def __calculate_tokens_from_content(self, request, content):
        input_tokens = output_tokens = 0
        if self.tokenizer is not None:
-
+            # Calculate input tokens
+            input_tokens += self._count_input_tokens(request)
+            for idx, choice_contents in content.items():
                full_response_content = ''.join(choice_contents)
-
-                output_tokens +=
+                # Calculate output tokens
+                output_tokens += self._count_output_tokens(full_response_content)
        else:
            raise ValueError(
                'Error: Unable to retrieve usage information\n\n'
@@ -171,3 +179,59 @@ class OpenaiPlugin(DefaultApiPlugin):
                'please open an issue on our GitHub repository https://github.com/modelscope/evalscope .'
            )
        return input_tokens, output_tokens
+
+    def _count_input_tokens(self, request: Dict) -> int:
+        """Count the number of input tokens in the request.
+
+        This method handles different types of requests and calculates tokens for:
+        - Text content in messages or prompts
+        - Images in multimodal messages (converted to patch tokens)
+
+        Args:
+            request (Dict): The request dictionary containing either 'messages' for chat
+                completion or 'prompt' for text completion.
+
+        Returns:
+            int: The total number of input tokens including text and image tokens.
+        """
+        input_tokens = 0
+        if 'messages' in request:
+            input_content = self.tokenizer.apply_chat_template(
+                request['messages'], tokenize=True, add_generation_prompt=True
+            )
+            input_tokens += len(input_content)
+            # handle image tokens if any
+            for message in request['messages']:
+                content = message.get('content', '')
+                if isinstance(content, str):
+                    continue
+                for cont in content:
+                    if cont['type'] == 'image_url':
+                        try:
+                            # assuming image_url is base64 string
+                            image_base64 = cont['image_url']['url']
+                            image = base64_to_PIL(image_base64)
+                            # Use math.ceil for more accurate token count when image dimensions
+                            # aren't perfectly divisible by patch size
+                            n_patches = (
+                                math.ceil(image.height / self.param.image_patch_size)
+                                * math.ceil(image.width / self.param.image_patch_size)
+                            )
+                            input_tokens += n_patches
+                        except Exception as e:
+                            logger.warning(f'Failed to process image for token counting: {e}')
+                            # Continue processing other content without failing
+        elif 'prompt' in request:
+            input_tokens += len(self.tokenizer.encode(request['prompt'], add_special_tokens=False))
+        return input_tokens
+
+    def _count_output_tokens(self, response: str) -> int:
+        """Count the number of output tokens in the response. Only string response is supported.
+
+        Args:
+            response (str): The API response text.
+
+        Returns:
+            int: The number of output tokens.
+        """
+        return len(self.tokenizer.encode(response, add_special_tokens=False))
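The image branch of _count_input_tokens above prices each image as a grid of patches. A worked example of that arithmetic, not part of the diff; the image dimensions and patch size are made up (in evalscope the divisor comes from the perf argument image_patch_size):

import math

height, width, patch_size = 1024, 768, 28  # hypothetical image and patch size
n_patches = math.ceil(height / patch_size) * math.ceil(width / patch_size)
print(n_patches)  # 37 * 28 = 1036 input tokens attributed to this image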
evalscope/perf/plugin/datasets/flickr8k.py CHANGED

@@ -22,7 +22,7 @@ class FlickrDatasetPlugin(DatasetPluginBase):
        for item in dataset:
            pil_image = item['jpg']
            text = item['txt']
-            base64_image = PIL_to_base64(pil_image)
+            base64_image = PIL_to_base64(pil_image, add_header=True)
 
-            message = self.create_message(text=text, image_urls=
+            message = self.create_message(text=text, image_urls=base64_image)
            yield [message]
evalscope/perf/plugin/datasets/kontext_bench.py CHANGED

@@ -22,7 +22,7 @@ class KontextDatasetPlugin(DatasetPluginBase):
        for item in dataset:
            pil_image = item['image']
            text = item['instruction']
-            base64_image = PIL_to_base64(pil_image)
+            base64_image = PIL_to_base64(pil_image, add_header=True)
 
-            message = self.create_message(text=text, image_urls=
+            message = self.create_message(text=text, image_urls=base64_image)
            yield [message]
evalscope/perf/plugin/datasets/random_vl_dataset.py CHANGED

@@ -31,7 +31,7 @@ class RandomVLDatasetPlugin(RandomDatasetPlugin):
        # Generate random images based on image_num
        images_b64 = []
        for _ in range(self.image_num):
-            images_b64.append(
+            images_b64.append(self._generate_random_image_b64())
 
        message = self.create_message(text=prompt, image_urls=images_b64)
        yield [message]
@@ -77,4 +77,4 @@ class RandomVLDatasetPlugin(RandomDatasetPlugin):
            draw.line(coords, fill=shape_color, width=random.randint(1, 5))
 
        # Convert to base64
-        return PIL_to_base64(image, format='PNG')
+        return PIL_to_base64(image, format='PNG', add_header=True)
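All three perf dataset plugins above now pass add_header=True when encoding images. A small sketch of what that flag is expected to change, not part of the diff; it assumes the header follows the same data:<content_type>/<format>;base64, pattern used by the new bytes_to_base64 in io_utils, since the exact string produced by PIL_to_base64 is not shown in these hunks:

from PIL import Image
from evalscope.utils.io_utils import PIL_to_base64

img = Image.new('RGB', (64, 64), color='white')          # stand-in image
raw = PIL_to_base64(img, format='PNG')                    # bare base64 payload
url = PIL_to_base64(img, format='PNG', add_header=True)   # expected to start with a 'data:image/...;base64,' header
print(raw.startswith('data:'), url.startswith('data:'))   # expect: False True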
evalscope/perf/utils/benchmark_util.py CHANGED

@@ -44,8 +44,7 @@ class BenchmarkData:
        api_plugin.parse_responses(self.response_messages, request=self.request)
 
    def update_gpu_usage(self):
-        if check_import('torch'):
-
+        if check_import('torch', raise_warning=False):
            import torch
            total_memory = 0
            for i in range(torch.cuda.device_count()):
evalscope/report/__init__.py CHANGED

@@ -4,7 +4,13 @@ from typing import TYPE_CHECKING
 from evalscope.utils.import_utils import _LazyModule
 
 if TYPE_CHECKING:
-    from .combinator import
+    from .combinator import (
+        gen_table,
+        get_data_frame,
+        get_report_list,
+        unweighted_average_from_subsets,
+        weighted_average_from_subsets,
+    )
     from .generator import ReportGenerator
     from .report import Category, Report, ReportKey, Subset
 
@@ -14,6 +20,8 @@ else:
        'gen_table',
        'get_data_frame',
        'get_report_list',
+        'weighted_average_from_subsets',
+        'unweighted_average_from_subsets',
    ],
    'generator': [
        'ReportGenerator',
evalscope/report/combinator.py CHANGED

@@ -4,9 +4,9 @@ import glob
 import os
 import pandas as pd
 from tabulate import tabulate
-from typing import List, Tuple
+from typing import Dict, List, Tuple, Union
 
-from evalscope.report.report import Report
+from evalscope.report.report import Report, Subset
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -88,26 +88,51 @@ def gen_table(
    return tabulate(table, headers=table.columns, tablefmt='grid', showindex=False)
 
 
-
-
-
+def weighted_average_from_subsets(
+    subset_names: List[str], subset_dict: Dict[str, Subset], new_name: str = ''
+) -> Subset:
+    """Calculate weighted average for given subsets.
 
-
-
+    Args:
+        subset_names (List[str]): List of subset names to include in the average.
+        subset_dict (Dict[str, Subset]): Dictionary mapping subset names to Subset objects.
+        new_name (str): Name for the resulting Subset object.
+
+    Returns:
+        Subset: A new Subset object with weighted average score
+    """
+    total_score = 0
+    total_count = 0
+    for name in subset_names:
+        if name in subset_dict:
+            subset = subset_dict[name]
+            total_score += subset.score * subset.num
+            total_count += subset.num
 
+    weighted_avg = total_score / total_count if total_count > 0 else 0
+    return Subset(name=new_name, score=weighted_avg, num=total_count)
 
-if __name__ == '__main__':
-    report_dir_1 = './outputs/20250117_151926'
-    # report_dir_2 = './outputs/20250107_204445/reports'
 
-
-
+def unweighted_average_from_subsets(
+    subset_names: List[str], subset_dict: Dict[str, Subset], new_name: str = ''
+) -> Subset:
+    """Calculate unweighted average for given subsets.
 
-
-
-
-
-
-
-
-
+    Args:
+        subset_names (List[str]): List of subset names to include in the average.
+        subset_dict (Dict[str, Subset]): Dictionary mapping subset names to Subset objects.
+        new_name (str): Name for the resulting Subset object.
+
+    Returns:
+        Subset: A new Subset object with unweighted average score
+    """
+    scores = []
+    total_count = 0
+    for name in subset_names:
+        if name in subset_dict:
+            subset = subset_dict[name]
+            scores.append(subset.score)
+            total_count += subset.num
+
+    unweighted_avg = sum(scores) / len(scores) if scores else 0
+    return Subset(name=new_name, score=unweighted_avg, num=total_count)
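A usage sketch for the two new helpers, not part of the diff; the Subset fields and import paths come from the hunks above, the scores and counts are made up:

from evalscope.report import unweighted_average_from_subsets, weighted_average_from_subsets
from evalscope.report.report import Subset

subsets = {
    'easy': Subset(name='easy', score=0.9, num=100),
    'hard': Subset(name='hard', score=0.5, num=300),
}
overall_w = weighted_average_from_subsets(['easy', 'hard'], subsets, new_name='OVERALL')
overall_u = unweighted_average_from_subsets(['easy', 'hard'], subsets, new_name='OVERALL')
print(overall_w.score, overall_w.num)  # 0.6 400 -> (0.9*100 + 0.5*300) / 400
print(overall_u.score, overall_u.num)  # 0.7 400 -> plain mean of the two subset scores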
evalscope/report/report.py CHANGED

@@ -22,7 +22,7 @@ ANALYSIS_PROMPT = """根据给出的json格式的模型评测结果,输出分
 """
 
 
-def normalize_score(score: Union[float, dict], keep_num: int = 4) -> Union[float, dict]:
+def normalize_score(score: Union[float, dict, int], keep_num: int = 4) -> Union[float, dict]:
    """
    Normalize score.
 
@@ -37,9 +37,10 @@ def normalize_score(score: Union[float, dict], keep_num: int = 4) -> Union[float
        score = round(score, keep_num)
    elif isinstance(score, dict):
        score = {k: round(v, keep_num) for k, v in score.items()}
+    elif isinstance(score, int):
+        score = float(score)
    else:
        logger.warning(f'Unknown score type: {type(score)}')
-
    return score
 
 
@@ -103,6 +104,7 @@ class ReportKey:
    subset_name = 'Subset'
    num = 'Num'
    score = 'Score'
+    overall_score = 'OVERALL'
 
 
 @dataclass
@@ -181,12 +183,14 @@ class Report:
                table[ReportKey.num].append(subset.num)
                table[ReportKey.score].append(subset.score)
            # add overall metric when there are multiple subsets
-            if metric_count > 1 and add_overall_metric
+            if metric_count > 1 and add_overall_metric and (
+                ReportKey.overall_score not in table[ReportKey.subset_name]
+            ):
                table[ReportKey.model_name].append(self.model_name)
                table[ReportKey.dataset_name].append(self.dataset_name)
                table[ReportKey.metric_name].append(metric.name)
                table[ReportKey.category_name].append(('-', ))
-                table[ReportKey.subset_name].append(
+                table[ReportKey.subset_name].append(ReportKey.overall_score)
                table[ReportKey.num].append(metric.num)
                table[ReportKey.score].append(metric.score)
        # NOTE: only flatten metrics if needed, use the first metric by default
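A quick sketch of the widened normalize_score, not part of the diff; it assumes the first branch (visible above only as score = round(score, keep_num)) handles floats:

from evalscope.report.report import normalize_score

print(normalize_score(3))                 # 3.0, ints are now cast to float instead of hitting the warning branch
print(normalize_score(0.123456))          # 0.1235, rounded to keep_num=4 decimal places
print(normalize_score({'acc': 0.98763}))  # {'acc': 0.9876}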
evalscope/run.py CHANGED

@@ -159,7 +159,7 @@ def evaluate_model(task_config: TaskConfig, outputs: OutputsStructure) -> dict:
    gc.collect()
 
    from evalscope.utils.import_utils import check_import
-    if check_import('torch'):
+    if check_import('torch', raise_warning=False):
        import torch
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
evalscope/utils/function_utils.py CHANGED

@@ -1,4 +1,6 @@
 import threading
+import time
+from contextlib import contextmanager
 from functools import wraps
 
 
@@ -27,3 +29,42 @@ def thread_safe(func):
        return func(*args, **kwargs)
 
    return wrapper
+
+
+def retry_func(retries=3, sleep_interval=0):
+    """A decorator that retries a function call up to `retries` times if an exception occurs."""
+
+    def decorator(func):
+
+        @wraps(func)
+        def wrapper(*args, **kwargs):
+            last_exception = None
+            for attempt in range(retries):
+                try:
+                    return func(*args, **kwargs)
+                except Exception as e:
+                    last_exception = e
+                    if sleep_interval > 0:
+                        time.sleep(sleep_interval)
+            raise last_exception
+
+        return wrapper
+
+    return decorator
+
+
+@contextmanager
+def retry_context(retries=3, sleep_interval=0):
+    """A context manager that retries the code block up to `retries` times if an exception occurs."""
+    last_exception = None
+    for attempt in range(retries):
+        try:
+            yield
+            return  # If no exception, exit successfully
+        except Exception as e:
+            last_exception = e
+            if sleep_interval > 0:
+                time.sleep(sleep_interval)
+            if attempt == retries - 1:  # Last attempt
+                break
+    raise last_exception
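A usage sketch for the new retry decorator, not part of the diff; the flaky function is hypothetical:

import random
from evalscope.utils.function_utils import retry_func

@retry_func(retries=3, sleep_interval=1)
def flaky_request():
    # Hypothetical call that fails roughly half of the time.
    if random.random() < 0.5:
        raise ConnectionError('transient failure')
    return 'ok'

print(flaky_request())  # retried up to 3 times, 1s apart; the last exception is re-raised if all attempts fail

retry_context exposes the same retry loop as a context manager, for code that is awkward to factor into a function.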
evalscope/utils/import_utils.py CHANGED

@@ -7,32 +7,82 @@ from itertools import chain
 from types import ModuleType
 from typing import Any, Optional, Union
 
+from evalscope.constants import IS_BUILD_DOC
 from .logger import get_logger
 
 logger = get_logger()  # pylint: disable=invalid-name
 
 
-def check_import(
-
+def check_import(
+    module_name: Union[str, list[str]],
+    package: Optional[Union[str, list[str]]] = None,
+    raise_warning: bool = True,
+    raise_error: bool = False,
+    feature_name: Optional[str] = 'this feature',
+) -> bool:
+    """Check if a module or list of modules can be imported.
 
    Args:
-        module_name (str): The name of the module to check.
-        package (str, optional): The package to install if the module
-
+        module_name (Union[str, list[str]]): The name(s) of the module(s) to check.
+        package (Union[str, list[str]], optional): The package(s) to install if the module(s) are not found.
+            Defaults to None.
+        raise_error (bool, optional): Whether to raise an error if any module is not found. Defaults to False.
+        raise_warning (bool, optional): Whether to log a warning if any module is not found. Defaults to True.
+        feature_name (str, optional): The feature name that requires the module(s). Used in the warning/error message.
+            Defaults to 'this feature'.
+
+    Returns:
+        bool: True if all modules can be imported, False otherwise.
    """
-
-
-
-
-
-
-
+    # Convert single strings to lists for uniform processing
+    if isinstance(module_name, str):
+        module_names = [module_name]
+    else:
+        module_names = module_name
+
+    if package is None:
+        packages = [None] * len(module_names)
+    elif isinstance(package, str):
+        packages = [package] * len(module_names)
+    else:
+        packages = package
+        # Ensure packages list has same length as module_names
+        if len(packages) < len(module_names):
+            packages.extend([None] * (len(module_names) - len(packages)))
+
+    missing_modules = []
+    missing_packages = []
+
+    for i, mod_name in enumerate(module_names):
+        try:
+            importlib.import_module(mod_name)
+        except ImportError:
+            missing_modules.append(mod_name)
+            if i < len(packages) and packages[i]:
+                missing_packages.append(packages[i])
+
+    if missing_modules:
+        if len(missing_modules) == 1:
+            error_msg = f'`{missing_modules[0]}` not found.'
+        else:
+            error_msg = f'The following modules are not found: {", ".join(f"`{mod}`" for mod in missing_modules)}.'
+
+        if missing_packages:
+            if len(missing_packages) == 1:
+                error_msg += f' Please run `pip install {missing_packages[0]}` to use {feature_name}.'
+            else:
+                unique_packages = list(dict.fromkeys(missing_packages))  # Remove duplicates while preserving order
+                error_msg += f' Please run `pip install {" ".join(unique_packages)}` to use {feature_name}.'
+
+        if raise_warning:
            logger.warning(error_msg)
 
-        if raise_error:
+        if not IS_BUILD_DOC and raise_error:
            raise ImportError(error_msg)
        return False
 
+    return True
+
 
 class _LazyModule(ModuleType):
    """
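A usage sketch for the extended check_import, not part of the diff; the module, package, and feature names below are only examples:

from evalscope.utils.import_utils import check_import

# Single module, warn-only, as run.py and benchmark_util.py now call it for torch:
has_torch = check_import('torch', raise_warning=False)

# Several modules at once, with the pip packages to suggest and the feature that needs them:
check_import(
    ['cv2', 'decord'],
    package=['opencv-python', 'decord'],
    raise_error=True,
    feature_name='video benchmarks',
)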
evalscope/utils/io_utils.py CHANGED

@@ -9,6 +9,7 @@ import re
 import string
 import unicodedata
 import yaml
+from datetime import datetime
 from io import BytesIO
 from PIL import Image
 
@@ -123,6 +124,9 @@ def dump_jsonl_data(data_list, jsonl_file, dump_mode=DumpMode.OVERWRITE):
    if not isinstance(data_list, list):
        data_list = [data_list]
 
+    # Convert non-serializable types to serializable ones
+    data_list = convert_normal_types(data_list)
+
    if dump_mode == DumpMode.OVERWRITE:
        dump_mode = 'w'
    elif dump_mode == DumpMode.APPEND:
@@ -304,20 +308,22 @@ def PIL_to_base64(image: Image.Image, format: str = 'JPEG', add_header: bool = False) -> str:
    return img_str
 
 
-def bytes_to_base64(bytes_data: bytes, format: str = 'png', add_header: bool = False) -> str:
-    """Convert
+def bytes_to_base64(bytes_data: bytes, *, format: str = 'png', add_header: bool = False, content_type='image') -> str:
+    """Convert bytes to a base64 encoded string.
 
    Args:
        bytes_data (bytes): The bytes to convert.
+        format (str): The format of the image. Default is 'png'.
        add_header (bool): Whether to add the base64 header. Default is False.
+        content_type (str): The type of the data, 'image' or 'audio'. Default is 'image'.
 
    Returns:
        str: Base64 encoded string of the bytes.
    """
-
+    base64_str = base64.b64encode(bytes_data).decode('utf-8')
    if add_header:
-
-        return
+        base64_str = f'data:{content_type}/{format};base64,{base64_str}'
+    return base64_str
 
 
 def base64_to_PIL(base64_str):
@@ -392,11 +398,13 @@ def safe_filename(s: str, max_length: int = 255) -> str:
    return s
 
 
-def
-    """Recursively convert numpy types to native Python types for JSON serialization."""
+def convert_normal_types(obj):
+    """Recursively convert numpy types and datetime objects to native Python types for JSON serialization."""
    import numpy as np
 
-    if isinstance(obj,
+    if isinstance(obj, datetime):
+        return obj.isoformat()
+    elif isinstance(obj, np.bool_):
        return bool(obj)
    elif isinstance(obj, np.integer):
        return int(obj)
@@ -405,10 +413,10 @@ def convert_numpy_types(obj):
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    elif isinstance(obj, dict):
-        return {key:
+        return {key: convert_normal_types(value) for key, value in obj.items()}
    elif isinstance(obj, list):
-        return [
+        return [convert_normal_types(item) for item in obj]
    elif isinstance(obj, tuple):
-        return tuple(
+        return tuple(convert_normal_types(item) for item in obj)
    else:
        return obj
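Two quick sketches of the io_utils changes, not part of the diff; values are illustrative:

from datetime import datetime
import numpy as np
from evalscope.utils.io_utils import bytes_to_base64, convert_normal_types

# Audio bytes can now carry an audio data-URL header instead of an image one.
url = bytes_to_base64(b'\x00\x01', format='wav', add_header=True, content_type='audio')
print(url[:22])  # data:audio/wav;base64,

# dump_jsonl_data now routes records through convert_normal_types first,
# so numpy scalars and datetimes become JSON-friendly values.
record = {'count': np.int64(3), 'ts': datetime(2025, 1, 1)}
print(convert_normal_types(record))  # {'count': 3, 'ts': '2025-01-01T00:00:00'}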
evalscope/utils/json_schema.py CHANGED

@@ -4,7 +4,7 @@ from copy import deepcopy
 from dataclasses import is_dataclass
 from datetime import date, datetime, time
 from enum import EnumMeta
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, field_validator, model_validator
 from typing import (
    Any,
    Dict,
@@ -59,6 +59,28 @@ class JSONSchema(BaseModel):
    required: Optional[List[str]] = Field(default=None)
    """Required fields for object parameters."""
 
+    @model_validator(mode='before')
+    def convert_type_before_validation(cls, values):
+        values = deepcopy(values)
+
+        def recursive_convert_type(obj):
+            if isinstance(obj, dict):
+                # Convert 'type' field if it's a string
+                if 'type' in obj and isinstance(obj['type'], str):
+                    try:
+                        obj['type'] = python_type_to_json_type(obj['type'])
+                    except ValueError:
+                        # If conversion fails, leave it as is
+                        pass
+                # Recursively process nested structures
+                for k, v in obj.items():
+                    obj[k] = recursive_convert_type(v)
+            elif isinstance(obj, list):
+                return [recursive_convert_type(item) for item in obj]
+            return obj
+
+        return recursive_convert_type(values)
+
 
 def json_schema(t: Type[Any]) -> JSONSchema:
    """Provide a JSON Schema for the specified type.
@@ -152,6 +174,8 @@ def cls_json_schema(cls: Type[Any]) -> JSONSchema:
 
 
 def python_type_to_json_type(python_type: Optional[str]) -> JSONType:
+    if python_type is not None and python_type in get_args(JSONType):
+        return python_type
    if python_type == 'str':
        return 'string'
    elif python_type == 'int':
@@ -205,4 +229,3 @@ def resolve_schema_references(schema: Dict[str, Any]) -> Dict[str, Any]:
        return obj
 
    return cast(Dict[str, Any], _resolve_refs(schema))
-    return cast(Dict[str, Any], _resolve_refs(schema))
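A small sketch of what the added passthrough in python_type_to_json_type changes, not part of the diff: names that are already valid JSON Schema types are now returned unchanged, while Python type names keep being translated. This is also what lets the new JSONSchema model validator call the function on values that may already be JSON types:

from evalscope.utils.json_schema import python_type_to_json_type

print(python_type_to_json_type('str'))     # 'string' (translated, as before)
print(python_type_to_json_type('string'))  # 'string' (new: already a JSON type, returned as-is)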
evalscope/utils/logger.py CHANGED

@@ -28,6 +28,25 @@ logging.getLogger('datasets').setLevel(logging.WARNING)
 logging.getLogger('httpx').setLevel(logging.WARNING)
 logging.getLogger('modelscope').setLevel(logging.ERROR)
 
+info_set = set()
+warning_set = set()
+
+
+def info_once(self, msg, *args, **kwargs):
+    hash_id = kwargs.get('hash_id') or msg
+    if hash_id in info_set:
+        return
+    info_set.add(hash_id)
+    self.info(msg)
+
+
+def warning_once(self, msg, *args, **kwargs):
+    hash_id = kwargs.get('hash_id') or msg
+    if hash_id in warning_set:
+        return
+    warning_set.add(hash_id)
+    self.warning(msg)
+
 
 def get_logger(
    log_file: Optional[str] = None,