evalscope 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/api/benchmark/__init__.py +1 -1
- evalscope/api/benchmark/adapters/__init__.py +2 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +7 -4
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
- evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +62 -2
- evalscope/api/benchmark/meta.py +9 -0
- evalscope/api/dataset/dataset.py +6 -6
- evalscope/api/dataset/loader.py +2 -1
- evalscope/api/evaluator/cache.py +24 -1
- evalscope/api/evaluator/evaluator.py +5 -0
- evalscope/api/evaluator/state.py +17 -1
- evalscope/api/messages/__init__.py +1 -0
- evalscope/api/messages/chat_message.py +52 -2
- evalscope/api/metric/scorer.py +15 -7
- evalscope/api/mixin/__init__.py +1 -1
- evalscope/api/mixin/llm_judge_mixin.py +2 -0
- evalscope/api/mixin/sandbox_mixin.py +204 -0
- evalscope/api/model/generate_config.py +1 -6
- evalscope/api/model/model.py +5 -2
- evalscope/api/tool/tool_info.py +1 -1
- evalscope/app/app.py +3 -0
- evalscope/app/ui/single_model.py +3 -3
- evalscope/app/utils/data_utils.py +7 -7
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -12
- evalscope/arguments.py +8 -4
- evalscope/backend/opencompass/backend_manager.py +0 -2
- evalscope/backend/rag_eval/utils/embedding.py +9 -1
- evalscope/benchmarks/ai2d/ai2d_adapter.py +53 -0
- evalscope/benchmarks/amc/amc_adapter.py +46 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
- evalscope/benchmarks/bfcl/bfcl_adapter.py +142 -7
- evalscope/benchmarks/bfcl/generation.py +9 -9
- evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
- evalscope/benchmarks/data_collection/data_collection_adapter.py +23 -19
- evalscope/benchmarks/drop/drop_adapter.py +1 -1
- evalscope/benchmarks/frames/frames_adapter.py +2 -1
- evalscope/benchmarks/general_arena/general_arena_adapter.py +5 -1
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +19 -35
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +60 -37
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +0 -1
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +48 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +6 -5
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/tau_bench/generation.py +1 -1
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +20 -19
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/config.py +96 -14
- evalscope/constants.py +11 -0
- evalscope/evaluator/evaluator.py +30 -10
- evalscope/metrics/llm_judge.py +19 -7
- evalscope/metrics/metric.py +27 -2
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/model_apis.py +22 -0
- evalscope/models/openai_compatible.py +3 -0
- evalscope/models/text2image_model.py +2 -2
- evalscope/models/utils/openai.py +8 -6
- evalscope/perf/arguments.py +2 -0
- evalscope/perf/benchmark.py +2 -0
- evalscope/perf/plugin/api/base.py +2 -2
- evalscope/perf/plugin/api/default_api.py +7 -7
- evalscope/perf/plugin/api/openai_api.py +83 -19
- evalscope/perf/plugin/datasets/flickr8k.py +2 -2
- evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
- evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
- evalscope/perf/utils/benchmark_util.py +7 -5
- evalscope/perf/utils/local_server.py +3 -0
- evalscope/report/__init__.py +0 -1
- evalscope/report/combinator.py +0 -25
- evalscope/report/generator.py +8 -87
- evalscope/report/report.py +8 -4
- evalscope/run.py +9 -5
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/chat_service.py +1 -1
- evalscope/utils/function_utils.py +41 -0
- evalscope/utils/import_utils.py +73 -1
- evalscope/utils/io_utils.py +56 -7
- evalscope/utils/json_schema.py +23 -2
- evalscope/utils/logger.py +19 -0
- evalscope/utils/model_utils.py +4 -3
- evalscope/utils/multi_choices.py +23 -6
- evalscope/version.py +2 -2
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/METADATA +17 -24
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/RECORD +145 -103
- tests/benchmark/test_eval.py +80 -37
- tests/benchmark/test_image_edit.py +65 -0
- tests/benchmark/test_sandbox.py +81 -0
- tests/benchmark/test_vlm.py +137 -0
- tests/cli/test_all.py +83 -43
- tests/cli/test_collection.py +8 -5
- tests/cli/test_reasoning.py +81 -0
- tests/common.py +73 -0
- tests/perf/test_perf.py +44 -14
- tests/rag/test_clip_benchmark.py +0 -3
- evalscope/api/mixin/dataset_mixin.py +0 -105
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
- tests/aigc/__init__.py +0 -1
- /evalscope/benchmarks/{aigc → ai2d}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/i2i → amc}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → healthbench}/__init__.py +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/LICENSE +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/WHEEL +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/entry_points.txt +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/top_level.txt +0 -0
- /tests/{aigc → benchmark}/test_t2i.py +0 -0
evalscope/perf/utils/benchmark_util.py
CHANGED
@@ -1,8 +1,8 @@
 import time
-import torch
 from dataclasses import dataclass, field
 from typing import Any, List, Optional, Tuple
 
+from evalscope.utils.import_utils import check_import
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -44,10 +44,12 @@ class BenchmarkData:
         api_plugin.parse_responses(self.response_messages, request=self.request)
 
     def update_gpu_usage(self):
-        total_memory = 0
-        for i in range(torch.cuda.device_count()):
-            total_memory += (torch.cuda.max_memory_allocated(i) / 2**30)  # GB
-        self.max_gpu_memory_cost = max(self.max_gpu_memory_cost, total_memory)
+        if check_import('torch', raise_warning=False):
+            import torch
+            total_memory = 0
+            for i in range(torch.cuda.device_count()):
+                total_memory += (torch.cuda.max_memory_allocated(i) / 2**30)  # GB
+            self.max_gpu_memory_cost = max(self.max_gpu_memory_cost, total_memory)
 
 
 class Metrics:
evalscope/perf/utils/local_server.py
CHANGED
@@ -9,6 +9,7 @@ from sse_starlette.sse import EventSourceResponse
 
 from evalscope.perf.arguments import Arguments
 from evalscope.utils.chat_service import ChatCompletionRequest, ChatService, ModelList, TextCompletionRequest
+from evalscope.utils.import_utils import check_import
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -101,6 +102,8 @@ def create_app(model, attn_implementation=None) -> FastAPI:
 def start_app(args: Arguments):
     logger.info('Starting local server, please wait...')
     if args.api == 'local':
+        check_import('torch', 'torch', raise_error=True)
+
         app = create_app(args.model, args.attn_implementation)
         uvicorn.run(app, host='0.0.0.0', port=args.port, workers=1)
 
evalscope/report/__init__.py
CHANGED
evalscope/report/combinator.py
CHANGED
|
@@ -86,28 +86,3 @@ def gen_table(
|
|
|
86
86
|
add_overall_metric=add_overall_metric
|
|
87
87
|
)
|
|
88
88
|
return tabulate(table, headers=table.columns, tablefmt='grid', showindex=False)
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
class ReportsRecorder:
|
|
92
|
-
COMMON_DATASET_PATH = []
|
|
93
|
-
CUSTOM_DATASET_PATH = []
|
|
94
|
-
|
|
95
|
-
def __init__(self, oss_url: str = '', endpoint: str = ''):
|
|
96
|
-
pass
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
if __name__ == '__main__':
|
|
100
|
-
report_dir_1 = './outputs/20250117_151926'
|
|
101
|
-
# report_dir_2 = './outputs/20250107_204445/reports'
|
|
102
|
-
|
|
103
|
-
report_table = gen_table(reports_path_list=[report_dir_1])
|
|
104
|
-
print(report_table)
|
|
105
|
-
|
|
106
|
-
# ALL VALUES ONLY FOR EXAMPLE
|
|
107
|
-
# +--------------------------+-------------------+-------------+
|
|
108
|
-
# | Model | CompetitionMath | GSM8K |
|
|
109
|
-
# +==========================+===================+=============+
|
|
110
|
-
# | ZhipuAI_chatglm2-6b-base | 25.0 (acc) | 30.50 (acc) |
|
|
111
|
-
# +--------------------------+-------------------+-------------+
|
|
112
|
-
# | ZhipuAI_chatglm2-6b | 30.5 (acc) | 40.50 (acc) |
|
|
113
|
-
# +--------------------------+-------------------+-------------+
|
evalscope/report/generator.py
CHANGED
@@ -8,105 +8,26 @@ from evalscope.report.report import *
 if TYPE_CHECKING:
     from evalscope.api.benchmark import DataAdapter
     from evalscope.api.metric import AggScore
-    from evalscope.benchmarks import DataAdapter as OldDataAdapter
 
 
 class ReportGenerator:
 
     @staticmethod
-    def
-        """
-        Generate a report for a specific dataset based on provided subset scores.
-
-        Args:
-            subset_score_map (dict): A mapping from subset names to a list of score dictionaries.
-                {
-                    'subset_name': [
-                        {'metric_name': 'AverageAccuracy', 'score': 0.3389, 'num': 100},
-                        {'metric_name': 'WeightedAverageAccuracy', 'score': 0.3389, 'num': 100}
-                    ],
-                    ...
-                }
-            report_name (str): The name of the report to generate.
-            data_adapter (DataAdapter): An adapter object for data handling.
-
-        Returns:
-            Report: A structured report object containing metrics, categories, and subsets.
-
-        >>> report = gen_report(subset_score_map, "My Report", data_adapter, dataset_name="Dataset", model_name="Model")
-        """  # noqa: E501
-
-        dataset_name = data_adapter.name
-        category_map = data_adapter.category_map
-        report_name = f'{model_name}@{dataset_name}'
-
-        def flatten_subset() -> DataFrame:
-            """
-            Flatten subset score map to a DataFrame.
-
-            Example:
-                        name  score  num categories      metric_name
-            0       ARC-Easy    0.5    2  [default]  AverageAccuracy
-            1  ARC-Challenge    0.5    2  [default]  AverageAccuracy
-            """
-            subsets = []
-            for subset_name, scores in subset_score_map.items():
-                for score_item in scores:
-                    categories = category_map.get(subset_name, ['default'])
-                    if isinstance(categories, str):
-                        categories = [categories]
-                    subsets.append(
-                        dict(
-                            name=subset_name,
-                            score=score_item['score'],
-                            num=score_item['num'],
-                            metric_name=score_item['metric_name'],
-                            categories=tuple(categories)
-                        )
-                    )
-            df = pd.DataFrame(subsets)
-            return df
-
-        df = flatten_subset()
-
+    def gen_collection_report(df: DataFrame, all_dataset_name: str, model_name: str) -> Report:
         metrics_list = []
-        for metric_name, group_metric in df.groupby('
+        for metric_name, group_metric in df.groupby('metric', sort=False):
             categories = []
             for category_name, group_category in group_metric.groupby('categories'):
                 subsets = []
-                for
-
-
+                for (dataset_name, subset_name), group_subset in group_category.groupby(['dataset_name',
+                                                                                         'subset_name']):
+                    avg_score = group_subset['score'].mean()
+                    num = group_subset['score'].count()
+                    subsets.append(Subset(name=f'{dataset_name}/{subset_name}', score=float(avg_score), num=int(num)))
                 categories.append(Category(name=category_name, subsets=subsets))
-
             metrics_list.append(Metric(name=metric_name, categories=categories))
-
-        report = Report(
-            name=report_name,
-            metrics=metrics_list,
-            dataset_name=dataset_name,
-            model_name=model_name,
-            dataset_description=data_adapter.description,
-            dataset_pretty_name=data_adapter.pretty_name
-        )
-        return report
-
-    @staticmethod
-    def gen_collection_report(df: DataFrame, all_dataset_name: str, model_name: str) -> Report:
-        categories = []
-        for category_name, group_category in df.groupby('categories'):
-            subsets = []
-            for (dataset_name, subset_name), group_subset in group_category.groupby(['dataset_name', 'subset_name']):
-                avg_score = group_subset['score'].mean()
-                num = group_subset['score'].count()
-                subsets.append(Subset(name=f'{dataset_name}/{subset_name}', score=float(avg_score), num=int(num)))
-
-            categories.append(Category(name=category_name, subsets=subsets))
         return Report(
-            name=DataCollection.NAME,
-            metrics=[Metric(name='Average', categories=categories)],
-            dataset_name=all_dataset_name,
-            model_name=model_name
+            name=DataCollection.NAME, metrics=metrics_list, dataset_name=all_dataset_name, model_name=model_name
         )
 
     @staticmethod
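For orientation, a minimal sketch of how the reworked `gen_collection_report` consumes its input; the rows and the model/collection names below are illustrative, and only the column names ('metric', 'categories', 'dataset_name', 'subset_name', 'score') are taken from the groupby calls above.

import pandas as pd

from evalscope.report.generator import ReportGenerator

# Two samples for the same subset collapse into one Subset entry with the mean score.
df = pd.DataFrame([
    {'metric': 'AverageAccuracy', 'categories': ('default', ), 'dataset_name': 'gsm8k', 'subset_name': 'main', 'score': 1.0},
    {'metric': 'AverageAccuracy', 'categories': ('default', ), 'dataset_name': 'gsm8k', 'subset_name': 'main', 'score': 0.0},
])
report = ReportGenerator.gen_collection_report(df, all_dataset_name='my_collection', model_name='my_model')
# -> one 'AverageAccuracy' metric, one ('default', ) category, subset 'gsm8k/main' with score 0.5 over num=2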
evalscope/report/report.py
CHANGED
@@ -22,7 +22,7 @@ ANALYSIS_PROMPT = """根据给出的json格式的模型评测结果,输出分
 """
 
 
-def normalize_score(score: Union[float, dict], keep_num: int = 4) -> Union[float, dict]:
+def normalize_score(score: Union[float, dict, int], keep_num: int = 4) -> Union[float, dict]:
     """
     Normalize score.
 
@@ -37,9 +37,10 @@ def normalize_score(score: Union[float, dict], keep_num: int = 4) -> Union[float
         score = round(score, keep_num)
     elif isinstance(score, dict):
         score = {k: round(v, keep_num) for k, v in score.items()}
+    elif isinstance(score, int):
+        score = float(score)
     else:
         logger.warning(f'Unknown score type: {type(score)}')
-
     return score
 
 
@@ -103,6 +104,7 @@ class ReportKey:
     subset_name = 'Subset'
     num = 'Num'
     score = 'Score'
+    overall_score = 'OVERALL'
 
 
 @dataclass
@@ -181,12 +183,14 @@ class Report:
                     table[ReportKey.num].append(subset.num)
                     table[ReportKey.score].append(subset.score)
             # add overall metric when there are multiple subsets
-            if metric_count > 1 and add_overall_metric
+            if metric_count > 1 and add_overall_metric and (
+                ReportKey.overall_score not in table[ReportKey.subset_name]
+            ):
                 table[ReportKey.model_name].append(self.model_name)
                 table[ReportKey.dataset_name].append(self.dataset_name)
                 table[ReportKey.metric_name].append(metric.name)
                 table[ReportKey.category_name].append(('-', ))
-                table[ReportKey.subset_name].append(
+                table[ReportKey.subset_name].append(ReportKey.overall_score)
                 table[ReportKey.num].append(metric.num)
                 table[ReportKey.score].append(metric.score)
         # NOTE: only flatten metrics if needed, use the first metric by default
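A quick sketch of the widened `normalize_score` behavior (the input values are illustrative):

from evalscope.report.report import normalize_score

normalize_score(0.123456)           # -> 0.1235, rounded to 4 decimals as before
normalize_score({'acc': 0.123456})  # -> {'acc': 0.1235}
normalize_score(3)                  # -> 3.0, ints are now coerced to float instead of hitting the warning branch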
evalscope/run.py
CHANGED
@@ -131,8 +131,9 @@ def evaluate_model(task_config: TaskConfig, outputs: OutputsStructure) -> dict:
         )
         evaluators.append(evaluator)
 
-        # Update task_config.dataset_args with benchmark metadata
-        task_config.dataset_args[dataset_name] = benchmark.to_dict()
+        # Update task_config.dataset_args with benchmark metadata, except for DataCollection
+        if dataset_name != DataCollection.NAME:
+            task_config.dataset_args[dataset_name] = benchmark.to_dict()
 
     # dump task_cfg to outputs.configs_dir after creating evaluators
     task_config.dump_yaml(outputs.configs_dir)
@@ -149,17 +150,20 @@ def evaluate_model(task_config: TaskConfig, outputs: OutputsStructure) -> dict:
         logger.info(f'Overall report table: \n{report_table} \n')
     except Exception:
         logger.error('Failed to generate report table.')
-
     # Clean up
     if model is not None:
         import gc
-        import torch
 
         del model
         del evaluators
-        torch.cuda.empty_cache()
         gc.collect()
 
+        from evalscope.utils.import_utils import check_import
+        if check_import('torch', raise_warning=False):
+            import torch
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+
     return eval_results
 
 
evalscope/utils/chat_service.py
CHANGED
@@ -1,6 +1,5 @@
 import os
 import time
-import torch
 from contextlib import contextmanager
 from functools import partial
 from pydantic import BaseModel, Field
@@ -95,6 +94,7 @@ class TextCompletionResponse(BaseModel):
 class ChatService:
 
     def __init__(self, model_path, attn_implementation):
+        import torch
         from modelscope import AutoModelForCausalLM, AutoTokenizer
         from transformers import TextIteratorStreamer
 
evalscope/utils/function_utils.py
CHANGED
@@ -1,4 +1,6 @@
 import threading
+import time
+from contextlib import contextmanager
 from functools import wraps
 
 
@@ -27,3 +29,42 @@ def thread_safe(func):
         return func(*args, **kwargs)
 
     return wrapper
+
+
+def retry_func(retries=3, sleep_interval=0):
+    """A decorator that retries a function call up to `retries` times if an exception occurs."""
+
+    def decorator(func):
+
+        @wraps(func)
+        def wrapper(*args, **kwargs):
+            last_exception = None
+            for attempt in range(retries):
+                try:
+                    return func(*args, **kwargs)
+                except Exception as e:
+                    last_exception = e
+                    if sleep_interval > 0:
+                        time.sleep(sleep_interval)
+            raise last_exception
+
+        return wrapper
+
+    return decorator
+
+
+@contextmanager
+def retry_context(retries=3, sleep_interval=0):
+    """A context manager that retries the code block up to `retries` times if an exception occurs."""
+    last_exception = None
+    for attempt in range(retries):
+        try:
+            yield
+            return  # If no exception, exit successfully
+        except Exception as e:
+            last_exception = e
+            if sleep_interval > 0:
+                time.sleep(sleep_interval)
+            if attempt == retries - 1:  # Last attempt
+                break
+    raise last_exception
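A usage sketch for the new `retry_func` decorator; `flaky_request` and the attempt counter are placeholders, not part of the package.

from evalscope.utils.function_utils import retry_func

attempts = {'n': 0}


@retry_func(retries=3, sleep_interval=0)
def flaky_request():
    # Fails twice, then succeeds; the last exception is re-raised only if every attempt fails.
    attempts['n'] += 1
    if attempts['n'] < 3:
        raise ConnectionError('transient failure')
    return 'ok'


assert flaky_request() == 'ok' and attempts['n'] == 3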
evalscope/utils/import_utils.py
CHANGED
@@ -5,13 +5,85 @@ import importlib
 import os
 from itertools import chain
 from types import ModuleType
-from typing import Any
+from typing import Any, Optional, Union
 
+from evalscope.constants import IS_BUILD_DOC
 from .logger import get_logger
 
 logger = get_logger()  # pylint: disable=invalid-name
 
 
+def check_import(
+    module_name: Union[str, list[str]],
+    package: Optional[Union[str, list[str]]] = None,
+    raise_warning: bool = True,
+    raise_error: bool = False,
+    feature_name: Optional[str] = 'this feature',
+) -> bool:
+    """Check if a module or list of modules can be imported.
+
+    Args:
+        module_name (Union[str, list[str]]): The name(s) of the module(s) to check.
+        package (Union[str, list[str]], optional): The package(s) to install if the module(s) are not found.
+            Defaults to None.
+        raise_error (bool, optional): Whether to raise an error if any module is not found. Defaults to False.
+        raise_warning (bool, optional): Whether to log a warning if any module is not found. Defaults to True.
+        feature_name (str, optional): The feature name that requires the module(s). Used in the warning/error message.
+            Defaults to 'this feature'.
+
+    Returns:
+        bool: True if all modules can be imported, False otherwise.
+    """
+    # Convert single strings to lists for uniform processing
+    if isinstance(module_name, str):
+        module_names = [module_name]
+    else:
+        module_names = module_name
+
+    if package is None:
+        packages = [None] * len(module_names)
+    elif isinstance(package, str):
+        packages = [package] * len(module_names)
+    else:
+        packages = package
+    # Ensure packages list has same length as module_names
+    if len(packages) < len(module_names):
+        packages.extend([None] * (len(module_names) - len(packages)))
+
+    missing_modules = []
+    missing_packages = []
+
+    for i, mod_name in enumerate(module_names):
+        try:
+            importlib.import_module(mod_name)
+        except ImportError:
+            missing_modules.append(mod_name)
+            if i < len(packages) and packages[i]:
+                missing_packages.append(packages[i])
+
+    if missing_modules:
+        if len(missing_modules) == 1:
+            error_msg = f'`{missing_modules[0]}` not found.'
+        else:
+            error_msg = f'The following modules are not found: {", ".join(f"`{mod}`" for mod in missing_modules)}.'
+
+        if missing_packages:
+            if len(missing_packages) == 1:
+                error_msg += f' Please run `pip install {missing_packages[0]}` to use {feature_name}.'
+            else:
+                unique_packages = list(dict.fromkeys(missing_packages))  # Remove duplicates while preserving order
+                error_msg += f' Please run `pip install {" ".join(unique_packages)}` to use {feature_name}.'
+
+        if raise_warning:
+            logger.warning(error_msg)
+
+        if not IS_BUILD_DOC and raise_error:
+            raise ImportError(error_msg)
+        return False
+
+    return True
+
+
 class _LazyModule(ModuleType):
     """
     Module class that surfaces all objects but only performs associated imports when the objects are requested.
evalscope/utils/io_utils.py
CHANGED
@@ -1,6 +1,7 @@
 import base64
 import csv
 import hashlib
+import io
 import json
 import jsonlines as jsonl
 import os
@@ -8,6 +9,7 @@ import re
 import string
 import unicodedata
 import yaml
+from datetime import datetime
 from io import BytesIO
 from PIL import Image
 
@@ -122,6 +124,9 @@ def dump_jsonl_data(data_list, jsonl_file, dump_mode=DumpMode.OVERWRITE):
     if not isinstance(data_list, list):
         data_list = [data_list]
 
+    # Convert non-serializable types to serializable ones
+    data_list = convert_normal_types(data_list)
+
     if dump_mode == DumpMode.OVERWRITE:
         dump_mode = 'w'
     elif dump_mode == DumpMode.APPEND:
@@ -283,22 +288,64 @@ def get_valid_list(input_list, candidate_list):
         [i for i in input_list if i not in candidate_list]
 
 
-def PIL_to_base64(image: Image.Image, format: str = 'JPEG') -> str:
+def PIL_to_base64(image: Image.Image, format: str = 'JPEG', add_header: bool = False) -> str:
     """
     Convert a PIL Image to a base64 encoded string.
 
     Args:
         image (Image.Image): The PIL Image to convert.
         format (str): The format to save the image in. Default is 'JPEG'.
+        add_header (bool): Whether to add the base64 header. Default is False.
+
     Returns:
         str: Base64 encoded string of the image.
     """
     buffered = BytesIO()
     image.save(buffered, format=format)
     img_str = base64.b64encode(buffered.getvalue()).decode('utf-8')
+    if add_header:
+        img_str = f'data:image/{format.lower()};base64,{img_str}'
     return img_str
 
 
+def bytes_to_base64(bytes_data: bytes, *, format: str = 'png', add_header: bool = False, content_type='image') -> str:
+    """Convert bytes to a base64 encoded string.
+
+    Args:
+        bytes_data (bytes): The bytes to convert.
+        format (str): The format of the image. Default is 'png'.
+        add_header (bool): Whether to add the base64 header. Default is False.
+        content_type (str): The type of the data, 'image' or 'audio'. Default is 'image'.
+
+    Returns:
+        str: Base64 encoded string of the bytes.
+    """
+    base64_str = base64.b64encode(bytes_data).decode('utf-8')
+    if add_header:
+        base64_str = f'data:{content_type}/{format};base64,{base64_str}'
+    return base64_str
+
+
+def base64_to_PIL(base64_str):
+    """Convert a base64 encoded string to a PIL Image.
+
+    Args:
+        base64_str (str): The base64 encoded string.
+
+    Returns:
+        Image.Image: The decoded PIL Image.
+    """
+    # remove header
+    if ',' in base64_str:
+        base64_str = base64_str.split(',', 1)[1]
+
+    # decode
+    img_data = base64.b64decode(base64_str)
+    img_file = io.BytesIO(img_data)
+    img = Image.open(img_file)
+    return img
+
+
 def safe_filename(s: str, max_length: int = 255) -> str:
     """
     Convert a string into a safe filename by removing or replacing unsafe characters.
@@ -351,11 +398,13 @@ def safe_filename(s: str, max_length: int = 255) -> str:
     return s
 
 
-def convert_numpy_types(obj):
-    """Recursively convert numpy types to native Python types for JSON serialization."""
+def convert_normal_types(obj):
+    """Recursively convert numpy types and datetime objects to native Python types for JSON serialization."""
     import numpy as np
 
-    if isinstance(obj, np.bool_):
+    if isinstance(obj, datetime):
+        return obj.isoformat()
+    elif isinstance(obj, np.bool_):
         return bool(obj)
     elif isinstance(obj, np.integer):
         return int(obj)
@@ -364,10 +413,10 @@ def convert_numpy_types(obj):
     elif isinstance(obj, np.ndarray):
         return obj.tolist()
     elif isinstance(obj, dict):
-        return {key: convert_numpy_types(value) for key, value in obj.items()}
+        return {key: convert_normal_types(value) for key, value in obj.items()}
    elif isinstance(obj, list):
-        return [convert_numpy_types(item) for item in obj]
+        return [convert_normal_types(item) for item in obj]
     elif isinstance(obj, tuple):
-        return tuple(convert_numpy_types(item) for item in obj)
+        return tuple(convert_normal_types(item) for item in obj)
     else:
         return obj
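A round-trip sketch for the new image helpers; the tiny in-memory image is synthetic.

import base64

from PIL import Image

from evalscope.utils.io_utils import PIL_to_base64, base64_to_PIL, bytes_to_base64

img = Image.new('RGB', (8, 8), color='red')
data_url = PIL_to_base64(img, format='PNG', add_header=True)   # 'data:image/png;base64,...'
restored = base64_to_PIL(data_url)                             # header is stripped before decoding
assert restored.size == (8, 8)

# bytes_to_base64 does the same for raw bytes, e.g. re-encoding the decoded payload:
png_bytes = base64.b64decode(data_url.split(',', 1)[1])
assert bytes_to_base64(png_bytes, format='png', add_header=True).startswith('data:image/png;base64,')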
evalscope/utils/json_schema.py
CHANGED
@@ -4,7 +4,7 @@ from copy import deepcopy
 from dataclasses import is_dataclass
 from datetime import date, datetime, time
 from enum import EnumMeta
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, field_validator, model_validator
 from typing import (
     Any,
     Dict,
@@ -59,6 +59,26 @@ class JSONSchema(BaseModel):
     required: Optional[List[str]] = Field(default=None)
     """Required fields for object parameters."""
 
+    @field_validator('type')
+    def validate_type(cls, v: Optional[str]) -> Optional[JSONType]:
+        return python_type_to_json_type(v)
+
+    @model_validator(mode='before')
+    def convert_type_before_validation(cls, values):
+        values = deepcopy(values)
+
+        def recursive_convert_type(obj):
+            if isinstance(obj, dict):
+                if 'type' in obj:
+                    obj['type'] = python_type_to_json_type(obj['type'])
+                for k, v in obj.items():
+                    obj[k] = recursive_convert_type(v)
+            elif isinstance(obj, list):
+                return [recursive_convert_type(item) for item in obj]
+            return obj
+
+        return recursive_convert_type(values)
+
 
 def json_schema(t: Type[Any]) -> JSONSchema:
     """Provide a JSON Schema for the specified type.
@@ -152,6 +172,8 @@ def cls_json_schema(cls: Type[Any]) -> JSONSchema:
 
 
 def python_type_to_json_type(python_type: Optional[str]) -> JSONType:
+    if python_type is not None and python_type in get_args(JSONType):
+        return python_type
     if python_type == 'str':
         return 'string'
     elif python_type == 'int':
@@ -205,4 +227,3 @@ def resolve_schema_references(schema: Dict[str, Any]) -> Dict[str, Any]:
         return obj
 
     return cast(Dict[str, Any], _resolve_refs(schema))
-    return cast(Dict[str, Any], _resolve_refs(schema))
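A small sketch of what the new validators accept, assuming `JSONType` covers the standard JSON Schema type names:

from evalscope.utils.json_schema import JSONSchema, python_type_to_json_type

python_type_to_json_type('string')  # -> 'string': already a JSON type, returned unchanged by the new early return
python_type_to_json_type('str')     # -> 'string': Python type name mapped as before

schema = JSONSchema(type='int')     # the validators coerce the Python name during model construction
assert schema.type == 'integer'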
evalscope/utils/logger.py
CHANGED
@@ -28,6 +28,25 @@ logging.getLogger('datasets').setLevel(logging.WARNING)
 logging.getLogger('httpx').setLevel(logging.WARNING)
 logging.getLogger('modelscope').setLevel(logging.ERROR)
 
+info_set = set()
+warning_set = set()
+
+
+def info_once(self, msg, *args, **kwargs):
+    hash_id = kwargs.get('hash_id') or msg
+    if hash_id in info_set:
+        return
+    info_set.add(hash_id)
+    self.info(msg)
+
+
+def warning_once(self, msg, *args, **kwargs):
+    hash_id = kwargs.get('hash_id') or msg
+    if hash_id in warning_set:
+        return
+    warning_set.add(hash_id)
+    self.warning(msg)
+
 
 def get_logger(
     log_file: Optional[str] = None,
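The `*_once` helpers deduplicate by message text (or an explicit `hash_id`). The hunk does not show how they are bound to the logger class, so this sketch calls them as the module-level functions they are defined as:

from evalscope.utils import logger as log_utils

log = log_utils.get_logger()
log_utils.warning_once(log, 'torch not found, GPU memory stats disabled')  # emitted
log_utils.warning_once(log, 'torch not found, GPU memory stats disabled')  # suppressed: same message already seen
log_utils.info_once(log, 'result cache enabled', hash_id='cache-banner')   # deduplicated by hash_id instead of text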
evalscope/utils/model_utils.py
CHANGED
@@ -3,6 +3,8 @@ import random
 from enum import Enum
 from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union
 
+from evalscope.utils.import_utils import check_import
+
 if TYPE_CHECKING:
     from transformers import GenerationConfig
 
@@ -67,7 +69,8 @@ def seed_everything(seed: int):
     """
     random.seed(seed)
     np.random.seed(seed)
-    try:
+
+    if check_import('torch', raise_warning=False):
         import torch
 
         torch.manual_seed(seed)
@@ -75,5 +78,3 @@ def seed_everything(seed: int):
         torch.cuda.manual_seed_all(seed)
         torch.backends.cudnn.deterministic = True
         torch.backends.cudnn.benchmark = False
-    except ImportError:
-        pass