evalscope 1.0.2__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (176)
  1. evalscope/api/benchmark/__init__.py +8 -1
  2. evalscope/api/benchmark/adapters/__init__.py +1 -0
  3. evalscope/api/benchmark/adapters/default_data_adapter.py +12 -0
  4. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  5. evalscope/api/benchmark/benchmark.py +14 -0
  6. evalscope/api/dataset/dataset.py +21 -0
  7. evalscope/api/dataset/loader.py +6 -2
  8. evalscope/api/mixin/sandbox_mixin.py +32 -54
  9. evalscope/api/model/generate_config.py +6 -0
  10. evalscope/app/ui/multi_model.py +6 -1
  11. evalscope/app/ui/single_model.py +8 -2
  12. evalscope/app/utils/data_utils.py +3 -2
  13. evalscope/app/utils/visualization.py +2 -2
  14. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  15. evalscope/benchmarks/ai2d/ai2d_adapter.py +3 -2
  16. evalscope/benchmarks/bfcl/bfcl_adapter.py +11 -46
  17. evalscope/benchmarks/blink/__init__.py +0 -0
  18. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  19. evalscope/benchmarks/chartqa/__init__.py +0 -0
  20. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  21. evalscope/benchmarks/chartqa/utils.py +38 -0
  22. evalscope/benchmarks/data_collection/data_collection_adapter.py +2 -1
  23. evalscope/benchmarks/docvqa/__init__.py +0 -0
  24. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  25. evalscope/benchmarks/general_arena/general_arena_adapter.py +1 -1
  26. evalscope/benchmarks/general_arena/utils.py +2 -1
  27. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
  28. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  29. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +23 -4
  30. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  31. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +158 -0
  32. evalscope/benchmarks/hle/hle_adapter.py +3 -2
  33. evalscope/benchmarks/humaneval/humaneval_adapter.py +2 -1
  34. evalscope/benchmarks/infovqa/__init__.py +0 -0
  35. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  36. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +3 -1
  37. evalscope/benchmarks/math_verse/__init__.py +0 -0
  38. evalscope/benchmarks/math_verse/math_verse_adapter.py +100 -0
  39. evalscope/benchmarks/math_vision/__init__.py +0 -0
  40. evalscope/benchmarks/math_vision/math_vision_adapter.py +111 -0
  41. evalscope/benchmarks/math_vista/math_vista_adapter.py +6 -26
  42. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +2 -2
  43. evalscope/benchmarks/mmmu/mmmu_adapter.py +1 -1
  44. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +1 -1
  45. evalscope/benchmarks/ner/__init__.py +0 -0
  46. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  47. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  48. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  49. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  50. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  51. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  52. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  53. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  54. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  55. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  56. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  57. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  58. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  59. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  60. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  61. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  62. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  63. evalscope/benchmarks/ocr_bench/ocr_bench_adapter.py +101 -0
  64. evalscope/benchmarks/ocr_bench_v2/IoUscore_metric.py +87 -0
  65. evalscope/benchmarks/ocr_bench_v2/TEDS_metric.py +963 -0
  66. evalscope/benchmarks/ocr_bench_v2/__init__.py +0 -0
  67. evalscope/benchmarks/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  68. evalscope/benchmarks/ocr_bench_v2/page_ocr_metric.py +50 -0
  69. evalscope/benchmarks/ocr_bench_v2/parallel.py +46 -0
  70. evalscope/benchmarks/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  71. evalscope/benchmarks/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  72. evalscope/benchmarks/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  73. evalscope/benchmarks/ocr_bench_v2/spotting_eval/script.py +481 -0
  74. evalscope/benchmarks/ocr_bench_v2/spotting_metric.py +179 -0
  75. evalscope/benchmarks/ocr_bench_v2/utils.py +433 -0
  76. evalscope/benchmarks/ocr_bench_v2/vqa_metric.py +254 -0
  77. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  78. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  79. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  80. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  81. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  82. evalscope/benchmarks/poly_math/__init__.py +0 -0
  83. evalscope/benchmarks/poly_math/poly_math_adapter.py +127 -0
  84. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  85. evalscope/benchmarks/pope/__init__.py +0 -0
  86. evalscope/benchmarks/pope/pope_adapter.py +111 -0
  87. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  88. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  89. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  90. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  91. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +1 -1
  92. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +1 -1
  93. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  94. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  95. evalscope/benchmarks/zerobench/__init__.py +0 -0
  96. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  97. evalscope/constants.py +4 -0
  98. evalscope/evaluator/evaluator.py +72 -79
  99. evalscope/metrics/math_parser.py +14 -0
  100. evalscope/metrics/metric.py +52 -1
  101. evalscope/metrics/metrics.py +16 -0
  102. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
  103. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
  104. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
  105. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
  106. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
  107. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
  108. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
  109. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
  110. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
  111. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
  112. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
  113. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
  114. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
  115. evalscope/models/utils/openai.py +4 -0
  116. evalscope/perf/arguments.py +24 -4
  117. evalscope/perf/benchmark.py +74 -89
  118. evalscope/perf/http_client.py +31 -16
  119. evalscope/perf/main.py +15 -2
  120. evalscope/perf/plugin/api/base.py +9 -7
  121. evalscope/perf/plugin/api/custom_api.py +13 -58
  122. evalscope/perf/plugin/api/default_api.py +179 -79
  123. evalscope/perf/plugin/api/openai_api.py +4 -3
  124. evalscope/perf/plugin/datasets/base.py +21 -0
  125. evalscope/perf/plugin/datasets/custom.py +2 -3
  126. evalscope/perf/plugin/datasets/line_by_line.py +2 -3
  127. evalscope/perf/plugin/datasets/longalpaca.py +2 -3
  128. evalscope/perf/plugin/datasets/openqa.py +2 -4
  129. evalscope/perf/plugin/datasets/random_dataset.py +1 -3
  130. evalscope/perf/utils/benchmark_util.py +36 -22
  131. evalscope/perf/utils/db_util.py +14 -19
  132. evalscope/perf/utils/local_server.py +0 -44
  133. evalscope/perf/utils/log_utils.py +21 -6
  134. evalscope/report/__init__.py +11 -2
  135. evalscope/report/combinator.py +52 -2
  136. evalscope/run.py +4 -0
  137. evalscope/utils/function_utils.py +195 -12
  138. evalscope/utils/io_utils.py +74 -0
  139. evalscope/utils/json_schema.py +8 -6
  140. evalscope/utils/logger.py +49 -17
  141. evalscope/utils/multi_choices.py +16 -1
  142. evalscope/utils/ner.py +377 -0
  143. evalscope/version.py +2 -2
  144. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/METADATA +239 -393
  145. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/RECORD +140 -98
  146. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/WHEEL +1 -1
  147. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/top_level.txt +0 -1
  148. tests/__init__.py +0 -1
  149. tests/benchmark/__init__.py +0 -1
  150. tests/benchmark/test_eval.py +0 -429
  151. tests/benchmark/test_image_edit.py +0 -65
  152. tests/benchmark/test_sandbox.py +0 -81
  153. tests/benchmark/test_t2i.py +0 -142
  154. tests/benchmark/test_vlm.py +0 -137
  155. tests/cli/__init__.py +0 -1
  156. tests/cli/test_all.py +0 -269
  157. tests/cli/test_collection.py +0 -99
  158. tests/cli/test_custom.py +0 -268
  159. tests/cli/test_reasoning.py +0 -81
  160. tests/common.py +0 -73
  161. tests/perf/__init__.py +0 -1
  162. tests/perf/test_perf.py +0 -206
  163. tests/rag/test_clip_benchmark.py +0 -87
  164. tests/rag/test_mteb.py +0 -213
  165. tests/rag/test_ragas.py +0 -128
  166. tests/swift/__init__.py +0 -1
  167. tests/swift/test_run_swift_eval.py +0 -146
  168. tests/swift/test_run_swift_vlm_eval.py +0 -128
  169. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
  170. tests/test_run_all.py +0 -12
  171. tests/utils.py +0 -13
  172. tests/vlm/__init__.py +0 -1
  173. tests/vlm/test_vlmeval.py +0 -102
  174. {tests/rag → evalscope/benchmarks/aa_lcr}/__init__.py +0 -0
  175. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/entry_points.txt +0 -0
  176. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info/licenses}/LICENSE +0 -0

evalscope/perf/utils/local_server.py CHANGED
@@ -2,7 +2,6 @@ import os
  import subprocess
  import uvicorn
  from contextlib import asynccontextmanager
- from dataclasses import dataclass
  from fastapi import FastAPI
  from fastapi.middleware.cors import CORSMiddleware
  from sse_starlette.sse import EventSourceResponse
@@ -15,49 +14,6 @@ from evalscope.utils.logger import get_logger
  logger = get_logger()


- @dataclass
- class ServerSentEvent(object):
-
-     def __init__(self, data='', event=None, id=None, retry=None):
-         self.data = data
-         self.event = event
-         self.id = id
-         self.retry = retry
-
-     @classmethod
-     def decode(cls, line):
-         """Decode line to ServerSentEvent
-
-
-         Args:
-             line (str): The line.
-
-         Return:
-             ServerSentEvent (obj:`ServerSentEvent`): The ServerSentEvent object.
-
-         """
-         if not line:
-             return None
-         sse_msg = cls()
-         # format data:xxx
-         field_type, _, field_value = line.partition(':')
-         if field_value.startswith(' '):  # compatible with openai api
-             field_value = field_value[1:]
-         if field_type == 'event':
-             sse_msg.event = field_value
-         elif field_type == 'data':
-             field_value = field_value.rstrip()
-             sse_msg.data = field_value
-         elif field_type == 'id':
-             sse_msg.id = field_value
-         elif field_type == 'retry':
-             sse_msg.retry = field_value
-         else:
-             pass
-
-         return sse_msg
-
-
  @asynccontextmanager
  async def lifespan(app: FastAPI):
      yield

evalscope/perf/utils/log_utils.py CHANGED
@@ -15,29 +15,42 @@ def init_wandb(args: Arguments) -> None:
          raise RuntimeError('Cannot import wandb. Please install it with command: \n pip install wandb')
      os.environ['WANDB_SILENT'] = 'true'
      os.environ['WANDB_DIR'] = args.outputs_dir
-
-     wandb.login(key=args.wandb_api_key)
      current_time = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
      name = args.name if args.name else f'{args.model_id}_{current_time}'
-     wandb.init(project='perf_benchmark', name=name, config=args.to_dict())
+
+     # Remove sensitive information from logging config
+     logging_config = args.to_dict()
+     logging_config.pop('api_key', None)
+     logging_config.pop('wandb_api_key', None)
+
+     if args.wandb_api_key is not None:
+         wandb.login(key=args.wandb_api_key)
+     wandb.init(project='perf_benchmark', name=name, config=logging_config)


  def init_swanlab(args: Arguments) -> None:
+     """
+     Initialize SwanLab for logging.
+     """
      import datetime
      try:
          import swanlab
      except ImportError:
          raise RuntimeError('Cannot import swanlab. Please install it with command: \n pip install swanlab')
      os.environ['SWANLAB_LOG_DIR'] = args.outputs_dir
-     if not args.swanlab_api_key == 'local':
-         swanlab.login(api_key=args.swanlab_api_key)
      current_time = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
      name = args.name if args.name else f'{args.model_id}_{current_time}'
      swanlab.config.update({'framework': '📏evalscope'})
+
+     # Remove sensitive information from logging config
+     logging_config = args.to_dict()
+     logging_config.pop('api_key', None)
+     logging_config.pop('swanlab_api_key', None)
+
      init_kwargs = {
          'project': os.getenv('SWANLAB_PROJ_NAME', 'perf_benchmark'),
          'name': name,
-         'config': args.to_dict(),
+         'config': logging_config,
          'mode': 'local' if args.swanlab_api_key == 'local' else None
      }

@@ -45,4 +58,6 @@ def init_swanlab(args: Arguments) -> None:
      if workspace:
          init_kwargs['workspace'] = workspace

+     if isinstance(args.swanlab_api_key, str) and not args.swanlab_api_key == 'local':
+         swanlab.login(api_key=args.swanlab_api_key)
      swanlab.init(**init_kwargs)

evalscope/report/__init__.py CHANGED
@@ -4,9 +4,15 @@ from typing import TYPE_CHECKING
  from evalscope.utils.import_utils import _LazyModule

  if TYPE_CHECKING:
-     from .combinator import gen_table, get_data_frame, get_report_list
+     from .combinator import (
+         gen_table,
+         get_data_frame,
+         get_report_list,
+         unweighted_average_from_subsets,
+         weighted_average_from_subsets,
+     )
      from .generator import ReportGenerator
-     from .report import Category, Report, ReportKey, Subset
+     from .report import Category, Metric, Report, ReportKey, Subset

  else:
      _import_structure = {
@@ -14,6 +20,8 @@ else:
              'gen_table',
              'get_data_frame',
              'get_report_list',
+             'weighted_average_from_subsets',
+             'unweighted_average_from_subsets',
          ],
          'generator': [
              'ReportGenerator',
@@ -23,6 +31,7 @@ else:
              'Report',
              'ReportKey',
              'Subset',
+             'Metric',
          ],
      }


evalscope/report/combinator.py CHANGED
@@ -4,9 +4,9 @@ import glob
  import os
  import pandas as pd
  from tabulate import tabulate
- from typing import List, Tuple
+ from typing import Dict, List, Tuple, Union

- from evalscope.report.report import Report
+ from evalscope.report.report import Report, Subset
  from evalscope.utils.logger import get_logger

  logger = get_logger()
@@ -86,3 +86,53 @@ def gen_table(
          add_overall_metric=add_overall_metric
      )
      return tabulate(table, headers=table.columns, tablefmt='grid', showindex=False)
+
+
+ def weighted_average_from_subsets(
+     subset_names: List[str], subset_dict: Dict[str, Subset], new_name: str = ''
+ ) -> Subset:
+     """Calculate weighted average for given subsets.
+
+     Args:
+         subset_names (List[str]): List of subset names to include in the average.
+         subset_dict (Dict[str, Subset]): Dictionary mapping subset names to Subset objects.
+         new_name (str): Name for the resulting Subset object.
+
+     Returns:
+         Subset: A new Subset object with weighted average score
+     """
+     total_score = 0
+     total_count = 0
+     for name in subset_names:
+         if name in subset_dict:
+             subset = subset_dict[name]
+             total_score += subset.score * subset.num
+             total_count += subset.num
+
+     weighted_avg = total_score / total_count if total_count > 0 else 0
+     return Subset(name=new_name, score=weighted_avg, num=total_count)
+
+
+ def unweighted_average_from_subsets(
+     subset_names: List[str], subset_dict: Dict[str, Subset], new_name: str = ''
+ ) -> Subset:
+     """Calculate unweighted average for given subsets.
+
+     Args:
+         subset_names (List[str]): List of subset names to include in the average.
+         subset_dict (Dict[str, Subset]): Dictionary mapping subset names to Subset objects.
+         new_name (str): Name for the resulting Subset object.
+
+     Returns:
+         Subset: A new Subset object with unweighted average score
+     """
+     scores = []
+     total_count = 0
+     for name in subset_names:
+         if name in subset_dict:
+             subset = subset_dict[name]
+             scores.append(subset.score)
+             total_count += subset.num
+
+     unweighted_avg = sum(scores) / len(scores) if scores else 0
+     return Subset(name=new_name, score=unweighted_avg, num=total_count)
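
A minimal usage sketch for the two aggregation helpers added above, assuming only what the hunks show (Subset takes name, score and num, and both helpers are re-exported from evalscope.report per the __init__.py change); the subset names and numbers here are hypothetical:

from evalscope.report import Subset, unweighted_average_from_subsets, weighted_average_from_subsets

# Hypothetical per-subset results, keyed by subset name.
subset_dict = {
    'easy': Subset(name='easy', score=0.90, num=100),
    'hard': Subset(name='hard', score=0.50, num=300),
}

# Weighted by sample count: (0.90 * 100 + 0.50 * 300) / 400 = 0.60
weighted = weighted_average_from_subsets(['easy', 'hard'], subset_dict, new_name='overall_weighted')

# Plain mean of subset scores: (0.90 + 0.50) / 2 = 0.70
unweighted = unweighted_average_from_subsets(['easy', 'hard'], subset_dict, new_name='overall_mean')

print(weighted.score, weighted.num)      # 0.6 400
print(unweighted.score, unweighted.num)  # 0.7 400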

evalscope/run.py CHANGED
@@ -38,6 +38,7 @@ def run_single_task(task_cfg: TaskConfig, run_time: str) -> dict:
      if task_cfg.eval_backend != EvalBackend.NATIVE:
          result = run_non_native_backend(task_cfg, outputs)
      else:
+         logger.info('Running with native backend')
          result = evaluate_model(task_cfg, outputs)

      logger.info(f'Finished evaluation for {task_cfg.model_id} on {task_cfg.datasets}')
@@ -94,12 +95,15 @@ def run_non_native_backend(task_cfg: TaskConfig, outputs: OutputsStructure) -> d
  def get_backend_manager_class(eval_backend: EvalBackend):
      """Get the backend manager class based on the evaluation backend."""
      if eval_backend == EvalBackend.OPEN_COMPASS:
+         logger.info('Using OpenCompassBackendManager')
          from evalscope.backend.opencompass import OpenCompassBackendManager
          return OpenCompassBackendManager
      elif eval_backend == EvalBackend.VLM_EVAL_KIT:
+         logger.info('Using VLMEvalKitBackendManager')
          from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager
          return VLMEvalKitBackendManager
      elif eval_backend == EvalBackend.RAG_EVAL:
+         logger.info('Using RAGEvalBackendManager')
          from evalscope.backend.rag_eval import RAGEvalBackendManager
          return RAGEvalBackendManager
      elif eval_backend == EvalBackend.THIRD_PARTY:

evalscope/utils/function_utils.py CHANGED
@@ -1,7 +1,50 @@
+ import asyncio
  import threading
  import time
+ from concurrent.futures import ThreadPoolExecutor, wait
  from contextlib import contextmanager
  from functools import wraps
+ from tqdm import tqdm
+ from typing import Any, Awaitable, Callable, List, Optional, Sequence, TypeVar, Union
+
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+ T = TypeVar('T')
+ R = TypeVar('R')
+
+ # Global lock to safely create per-instance locks in decorators
+ _THREAD_SAFE_GLOBAL_LOCK = threading.RLock()
+
+
+ def thread_safe(func: Callable[..., T]) -> Callable[..., T]:
+     """Thread-safe decorator.
+     - If decorating a bound method, uses a per-instance, per-method lock.
+     - If decorating a function, uses a function-scoped lock.
+     """
+     func_lock = threading.RLock()
+     lock_attr_name = f'__lock_{func.__name__}'
+
+     @wraps(func)
+     def wrapper(*args, **kwargs):
+         # Prefer per-instance lock if the first arg looks like 'self'
+         if args and hasattr(args[0], '__dict__'):
+             self_obj = args[0]
+             lock = getattr(self_obj, lock_attr_name, None)
+             if lock is None:
+                 with _THREAD_SAFE_GLOBAL_LOCK:
+                     lock = getattr(self_obj, lock_attr_name, None)
+                     if lock is None:
+                         lock = threading.RLock()
+                         setattr(self_obj, lock_attr_name, lock)
+         else:
+             lock = func_lock
+
+         with lock:
+             return func(*args, **kwargs)
+
+     return wrapper


  def run_once(func):
@@ -19,18 +62,6 @@ def run_once(func):
      return wrapper


- def thread_safe(func):
-     """Thread-safe decorator for functions that need to be executed in a thread-safe manner."""
-     lock = threading.RLock()
-
-     @wraps(func)
-     def wrapper(*args, **kwargs):
-         with lock:
-             return func(*args, **kwargs)
-
-     return wrapper
-
-
  def retry_func(retries=3, sleep_interval=0):
      """A decorator that retries a function call up to `retries` times if an exception occurs."""

@@ -68,3 +99,155 @@ def retry_context(retries=3, sleep_interval=0):
              if attempt == retries - 1:  # Last attempt
                  break
      raise last_exception
+
+
+ class AsyncioLoopRunner:
+     """Singleton background asyncio loop runner for sync→async bridging."""
+     _instance: Optional['AsyncioLoopRunner'] = None
+     _inst_lock = threading.Lock()
+
+     def __init__(self) -> None:
+         self._loop: Optional[asyncio.AbstractEventLoop] = None
+         self._thread: Optional[threading.Thread] = None
+         self._start_loop()
+
+     def _start_loop(self) -> None:
+         loop = asyncio.new_event_loop()
+         self._loop = loop
+
+         def run_loop() -> None:
+             asyncio.set_event_loop(loop)
+             loop.run_forever()
+
+         self._thread = threading.Thread(target=run_loop, daemon=True, name='AsyncioLoopRunner')
+         self._thread.start()
+
+     @classmethod
+     def instance(cls) -> 'AsyncioLoopRunner':
+         if cls._instance is not None:
+             return cls._instance
+         with cls._inst_lock:
+             if cls._instance is None:
+                 cls._instance = AsyncioLoopRunner()
+             return cls._instance
+
+     @classmethod
+     def run(cls, coro: Awaitable[T], timeout: Optional[float] = None) -> T:
+         """Submit a coroutine to the background loop and wait for result."""
+         inst = cls.instance()
+         fut = asyncio.run_coroutine_threadsafe(coro, inst._loop)
+         return fut.result(timeout=timeout)
+
+     @property
+     def loop(self) -> Optional[asyncio.AbstractEventLoop]:
+         """Access the underlying event loop (read-only use)."""
+         return self._loop
+
+     def stop(self, join_timeout: float = 5.0) -> None:
+         """Optional shutdown of the background loop (generally not needed)."""
+         if not self._loop:
+             return
+         self._loop.call_soon_threadsafe(self._loop.stop)
+         if self._thread:
+             self._thread.join(timeout=join_timeout)
+
+
+ def run_in_threads_with_progress(
+     items: Sequence[T],
+     worker: Callable[[T], R],
+     *,
+     desc: str,
+     max_workers: int,
+     heartbeat_sec: int,
+     on_result: Optional[Callable[[T, R], None]] = None,
+     on_error: Optional[Callable[[T, Exception], None]] = None,
+ ) -> List[R]:
+     """
+     Execute a collection of tasks concurrently with a ThreadPoolExecutor while
+     displaying a tqdm progress bar and emitting periodic heartbeat logs.
+
+     Key behaviors:
+     - Concurrency: Uses up to `min(len(items), max_workers)` threads.
+     - Progress: A tqdm bar advances when each task finishes (success or failure).
+     - Heartbeat: If no tasks finish within `heartbeat_sec`, a status line is logged.
+     - Ordering: Results are appended in completion order (not the original order).
+     - Error handling:
+       * If `on_error` is provided, it is called for each failed item; execution continues
+         unless `on_error` itself raises.
+       * If `on_error` is None, the first exception is raised immediately and stops processing.
+     - Callbacks:
+       * `on_result(item, result)` is called after a successful result is obtained.
+       * Both callbacks run in the main thread (not worker threads).
+
+     Args:
+         items: A sequence of items (inputs) to process. Converted to a list internally.
+         worker: A callable executed in threads to process a single item and return a result.
+         desc: A short text shown as the tqdm progress bar description.
+         max_workers: Upper bound on the number of concurrent threads.
+         heartbeat_sec: Interval (in seconds) to wait before emitting a heartbeat log if
+             no tasks complete in that window.
+         on_result: Optional callback invoked as on_result(item, result) after success.
+         on_error: Optional callback invoked as on_error(item, exception) on failure. If omitted,
+             the exception is propagated and the function terminates early.
+
+     Returns:
+         A list of results collected as tasks complete (completion order).
+         If some tasks fail and `on_error` is provided (and does not re-raise), those failures
+         are skipped and not included in the returned results.
+
+     Raises:
+         Exception: Propagates the first task exception if `on_error` is not provided, or if
+             `on_error` re-raises.
+
+     Notes:
+         - The function is blocking until all tasks complete or an exception is propagated.
+         - Use `on_error` to implement "best-effort" processing where failures are logged
+           and the rest continue.
+     """
+     # Defensive copy to avoid consuming a generator multiple times and to compute pool size.
+     pending_items: List[T] = list(items)
+     if not pending_items:
+         return []
+
+     results: List[R] = []
+
+     # Bound the pool by actual workload size for efficiency.
+     with ThreadPoolExecutor(max_workers=min(len(pending_items), max_workers)) as executor:
+         # Submit all tasks up-front and map futures back to their originating item.
+         future_to_item = {executor.submit(worker, item): item for item in pending_items}
+
+         # Progress bar reflects total number of submitted tasks; updated per finished future.
+         with tqdm(total=len(pending_items), desc=desc, mininterval=1, dynamic_ncols=True) as pbar:
+             # Track unfinished futures and poll with a timeout to enable heartbeat logs.
+             pending = set(future_to_item.keys())
+             while pending:
+                 # Wait with timeout to detect stalls and emit heartbeats proactively.
+                 done, not_done = wait(pending, timeout=heartbeat_sec)
+                 if not done:
+                     # Heartbeat when nothing has completed within the window.
+                     logger.info(f'{desc} still processing... pending={len(not_done)}')
+                     continue
+
+                 # Consume completed futures.
+                 for future in done:
+                     item = future_to_item[future]
+                     try:
+                         res = future.result()
+                         results.append(res)
+                         # Invoke success callback in caller thread (not in worker).
+                         if on_result is not None:
+                             on_result(item, res)
+                     except Exception as exc:
+                         # Delegate failure handling to on_error if provided; otherwise bubble up.
+                         if on_error is not None:
+                             on_error(item, exc)
+                         else:
+                             raise
+                     finally:
+                         # Always advance progress for completed futures (success or failure).
+                         pbar.update(1)
+
+                 # Continue polling remaining futures.
+                 pending = not_done
+
+     return results
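
A minimal sketch of how the two new concurrency helpers might be called, based only on the signatures and docstrings in the hunk above; the worker function and the numbers are hypothetical:

import asyncio
from evalscope.utils.function_utils import AsyncioLoopRunner, run_in_threads_with_progress

def score_item(item: int) -> int:
    # Hypothetical worker; any per-item, I/O-bound call fits here.
    return item * item

failures = []
results = run_in_threads_with_progress(
    items=list(range(10)),
    worker=score_item,
    desc='scoring',
    max_workers=4,
    heartbeat_sec=30,
    on_error=lambda item, exc: failures.append((item, exc)),  # best-effort: record and continue
)
print(sorted(results))  # results arrive in completion order, so sort if order matters

async def fetch_answer() -> str:
    await asyncio.sleep(0.1)
    return 'ok'

# Sync-to-async bridging: run a coroutine on the shared background loop.
print(AsyncioLoopRunner.run(fetch_answer(), timeout=5))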

evalscope/utils/io_utils.py CHANGED
@@ -12,6 +12,7 @@ import yaml
  from datetime import datetime
  from io import BytesIO
  from PIL import Image
+ from typing import Tuple

  from evalscope.constants import DumpMode
  from evalscope.utils.logger import get_logger
@@ -173,6 +174,24 @@ def csv_to_list(csv_file) -> list:
      return res_list


+ def tsv_to_list(tsv_file) -> list:
+     """
+     Read tsv file to list.
+
+     Args:
+         tsv_file: tsv file path.
+
+     Returns:
+         list: list of lines. Each line is a dict.
+     """
+     res_list = []
+     with open(tsv_file, 'r', encoding='utf-8') as f:
+         reader = csv.DictReader(f, delimiter='\t')
+         for row in reader:
+             res_list.append(row)
+     return res_list
+
+
  def csv_to_jsonl(csv_file, jsonl_file):
      """
      Convert csv file to jsonl file.
@@ -420,3 +439,58 @@ def convert_normal_types(obj):
          return tuple(convert_normal_types(item) for item in obj)
      else:
          return obj
+
+
+ def compress_image_to_limit(image_bytes: bytes, max_bytes: int = 10_000_000) -> Tuple[bytes, str]:
+     """
+     Ensure image bytes are under max_bytes by re-encoding to JPEG with quality reduction
+     and optional downscaling. Returns (processed_bytes, format_str).
+     If the original bytes are already below the limit, returns them as PNG.
+     """
+     if len(image_bytes) <= max_bytes:
+         return image_bytes, 'png'
+
+     try:
+         img = Image.open(BytesIO(image_bytes))
+     except Exception as exc:
+         logger.warning(f'Failed to open image bytes with PIL, sending original image; may exceed API limit: {exc}')
+         return image_bytes, 'png'
+
+     # Convert to RGB for JPEG if needed
+     if img.mode not in ('RGB', 'L'):
+         img = img.convert('RGB')
+
+     def encode_jpeg(source: Image.Image, quality: int) -> bytes:
+         buf = BytesIO()
+         source.save(buf, format='JPEG', quality=quality, optimize=True, progressive=True)
+         return buf.getvalue()
+
+     # Start with moderate quality and reduce
+     quality: int = 85
+     out: bytes = encode_jpeg(img, quality)
+     quality_floor: int = 40
+
+     while len(out) > max_bytes and quality > quality_floor:
+         quality -= 10
+         out = encode_jpeg(img, quality)
+
+     # If still too large, progressively downscale
+     min_side_floor: int = 256
+     scale: float = 0.9
+     while len(out) > max_bytes and min(img.size) > min_side_floor:
+         new_w = max(min_side_floor, int(img.width * scale))
+         new_h = max(min_side_floor, int(img.height * scale))
+         if (new_w, new_h) == img.size:
+             break
+         img = img.resize((new_w, new_h), Image.LANCZOS)
+         out = encode_jpeg(img, quality)
+
+     if len(out) > max_bytes:
+         logger.warning(f'Image remains above limit after compression: size={len(out)} bytes (limit={max_bytes}).')
+     else:
+         logger.info(
+             f'Compressed image from {len(image_bytes)} to {len(out)} bytes; '
+             f'quality={quality}, size={img.width}x{img.height}.'
+         )
+
+     return out, 'jpeg'
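
A minimal sketch of calling the new compress_image_to_limit helper before sending an image to an API with a payload cap, based on the signature above; the file name and the base64/data-URL packaging are hypothetical, not how evalscope itself wires it up:

import base64
from evalscope.utils.io_utils import compress_image_to_limit

with open('sample.png', 'rb') as f:  # hypothetical local image
    raw = f.read()

# Returns the original bytes tagged 'png' if already small enough,
# otherwise JPEG-re-encoded (and possibly downscaled) bytes tagged 'jpeg'.
data, fmt = compress_image_to_limit(raw, max_bytes=10_000_000)
data_url = f'data:image/{fmt};base64,{base64.b64encode(data).decode()}'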

evalscope/utils/json_schema.py CHANGED
@@ -59,18 +59,20 @@ class JSONSchema(BaseModel):
      required: Optional[List[str]] = Field(default=None)
      """Required fields for object parameters."""

-     @field_validator('type')
-     def validate_type(cls, v: Optional[str]) -> Optional[JSONType]:
-         return python_type_to_json_type(v)
-
      @model_validator(mode='before')
      def convert_type_before_validation(cls, values):
          values = deepcopy(values)

          def recursive_convert_type(obj):
              if isinstance(obj, dict):
-                 if 'type' in obj:
-                     obj['type'] = python_type_to_json_type(obj['type'])
+                 # Convert 'type' field if it's a string
+                 if 'type' in obj and isinstance(obj['type'], str):
+                     try:
+                         obj['type'] = python_type_to_json_type(obj['type'])
+                     except ValueError:
+                         # If conversion fails, leave it as is
+                         pass
+                 # Recursively process nested structures
                  for k, v in obj.items():
                      obj[k] = recursive_convert_type(v)
              elif isinstance(obj, list):

evalscope/utils/logger.py CHANGED
@@ -53,16 +53,16 @@ def get_logger(
      name: Optional[str] = None,
      log_level: int = DEFAULT_LEVEL,
      file_mode: str = 'w',
-     force=False
+     force: bool = False,
  ):
      """Get logging logger

      Args:
-         log_file: Log filename, if specified, file handler will be added to
-             logger
-         log_level: Logging level.
-         file_mode: Specifies the mode to open the file, if filename is
-             specified (if filemode is unspecified, it defaults to 'w').
+         log_file: Log filename. If specified, a file handler will be added to the logger.
+         name: Logical component name. Used to derive the logger name.
+         log_level: Logging level to set.
+         file_mode: Mode to open the file when log_file is provided (default 'w').
+         force: If True, reconfigure the existing logger (levels, formatters, handlers).
      """

      if name:
@@ -77,7 +77,7 @@ def get_logger(
      logger.setLevel(log_level)
      for handler in logger.handlers:
          handler.setLevel(log_level)
-         # 区分不同类型的 handler,使用相应的格式化器
+         # Select formatter by handler type
          if isinstance(handler, logging.FileHandler):
              handler.setFormatter(
                  plain_detailed_formatter if log_level == logging.DEBUG else plain_simple_formatter
@@ -86,6 +86,7 @@ def get_logger(
              handler.setFormatter(
                  color_detailed_formatter if log_level == logging.DEBUG else color_simple_formatter
              )
+     # Ensure file handler points to current log_file (replace if needed)
      add_file_handler_if_needed(logger, log_file, file_mode, log_level)
      return logger

@@ -137,23 +138,54 @@ def configure_logging(debug: bool, log_file: Optional[str] = None):
          get_logger(log_level=logging.DEBUG, force=True)


- def add_file_handler_if_needed(logger, log_file, file_mode, log_level):
-     for handler in logger.handlers:
-         if isinstance(handler, logging.FileHandler):
-             return
+ def add_file_handler_if_needed(
+     logger: logging.Logger,
+     log_file: Optional[str],
+     file_mode: str,
+     log_level: int,
+ ) -> None:
+     """Ensure logger has a FileHandler targeting log_file.
+     - If no FileHandler exists, add one.
+     - If a FileHandler exists but points to a different file, replace it.
+     """
+     if log_file is None:
+         return

+     # Only worker-0 writes files
      if iutil.find_spec('torch') is not None:
          from modelscope.utils.torch_utils import is_master
-
          is_worker0 = is_master()
      else:
          is_worker0 = True

-     if is_worker0 and log_file is not None:
-         file_handler = logging.FileHandler(log_file, file_mode)
-         file_handler.setFormatter(plain_detailed_formatter if log_level == logging.DEBUG else plain_simple_formatter)
-         file_handler.setLevel(log_level)
-         logger.addHandler(file_handler)
+     if not is_worker0:
+         return
+
+     target_path = os.path.abspath(log_file)
+     existing_file_handlers = [h for h in logger.handlers if isinstance(h, logging.FileHandler)]
+
+     # If there is a FileHandler already pointing to the target file, nothing to do.
+     for fh in existing_file_handlers:
+         try:
+             if os.path.abspath(getattr(fh, 'baseFilename', '')) == target_path:
+                 return
+         except Exception:
+             # If any issue retrieving baseFilename, fall through to replacement
+             pass
+
+     # Replace all existing FileHandlers with the new one
+     for fh in existing_file_handlers:
+         try:
+             logger.removeHandler(fh)
+             fh.flush()
+             fh.close()
+         except Exception:
+             pass
+
+     file_handler = logging.FileHandler(target_path, file_mode)
+     file_handler.setFormatter(plain_detailed_formatter if log_level == logging.DEBUG else plain_simple_formatter)
+     file_handler.setLevel(log_level)
+     logger.addHandler(file_handler)


  def warn_once(logger: Logger, message: str) -> None:
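
A minimal sketch of the reconfiguration path these logger changes enable, using keyword arguments taken from the updated signature and docstring; the logger name and log path are hypothetical:

import logging
from evalscope.utils.logger import get_logger

logger = get_logger(name='demo')  # console handlers only
logger.info('hello')

# force=True re-applies levels/formatters; with log_file set, the reworked
# add_file_handler_if_needed swaps in a FileHandler pointing at that file.
logger = get_logger(name='demo', log_file='run.log', log_level=logging.DEBUG, force=True)
logger.debug('also written to run.log')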