evalscope 1.0.1__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of evalscope has been flagged by the registry as possibly problematic.
- evalscope/api/benchmark/adapters/default_data_adapter.py +18 -4
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
- evalscope/api/benchmark/adapters/text2image_adapter.py +5 -4
- evalscope/api/benchmark/adapters/vision_language_adapter.py +3 -1
- evalscope/api/benchmark/benchmark.py +27 -2
- evalscope/api/benchmark/meta.py +3 -0
- evalscope/api/evaluator/evaluator.py +5 -0
- evalscope/api/evaluator/state.py +5 -0
- evalscope/api/messages/chat_message.py +6 -1
- evalscope/api/mixin/__init__.py +1 -0
- evalscope/api/mixin/llm_judge_mixin.py +2 -0
- evalscope/api/mixin/sandbox_mixin.py +204 -0
- evalscope/api/model/generate_config.py +0 -3
- evalscope/api/model/model.py +1 -1
- evalscope/api/tool/tool_info.py +1 -1
- evalscope/app/ui/multi_model.py +6 -1
- evalscope/app/ui/single_model.py +8 -2
- evalscope/app/utils/data_utils.py +3 -2
- evalscope/app/utils/visualization.py +2 -2
- evalscope/arguments.py +6 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/amc/__init__.py +0 -0
- evalscope/benchmarks/amc/amc_adapter.py +46 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
- evalscope/benchmarks/bfcl/bfcl_adapter.py +106 -2
- evalscope/benchmarks/bfcl/generation.py +7 -7
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drop/drop_adapter.py +1 -1
- evalscope/benchmarks/general_arena/utils.py +2 -1
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hle/hle_adapter.py +3 -2
- evalscope/benchmarks/humaneval/humaneval_adapter.py +19 -35
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +60 -37
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +0 -1
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +48 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +1 -1
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +4 -9
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +1 -4
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench_v2/utils.py +432 -0
- evalscope/benchmarks/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +6 -1
- evalscope/config.py +24 -1
- evalscope/constants.py +3 -0
- evalscope/evaluator/evaluator.py +25 -7
- evalscope/metrics/metric.py +78 -2
- evalscope/metrics/metrics.py +16 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
- evalscope/models/model_apis.py +10 -8
- evalscope/models/utils/openai.py +1 -2
- evalscope/perf/arguments.py +2 -0
- evalscope/perf/plugin/api/base.py +2 -2
- evalscope/perf/plugin/api/default_api.py +7 -7
- evalscope/perf/plugin/api/openai_api.py +83 -19
- evalscope/perf/plugin/datasets/flickr8k.py +2 -2
- evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
- evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
- evalscope/perf/utils/benchmark_util.py +1 -2
- evalscope/report/__init__.py +9 -1
- evalscope/report/combinator.py +45 -20
- evalscope/report/report.py +8 -4
- evalscope/run.py +1 -1
- evalscope/utils/function_utils.py +41 -0
- evalscope/utils/import_utils.py +63 -13
- evalscope/utils/io_utils.py +19 -11
- evalscope/utils/json_schema.py +25 -2
- evalscope/utils/logger.py +19 -0
- evalscope/utils/model_utils.py +1 -1
- evalscope/utils/multi_choices.py +16 -1
- evalscope/version.py +2 -2
- {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/METADATA +10 -40
- {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/RECORD +120 -95
- {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/top_level.txt +0 -1
- tests/__init__.py +0 -1
- tests/benchmark/__init__.py +0 -1
- tests/benchmark/test_eval.py +0 -385
- tests/benchmark/test_image_edit.py +0 -65
- tests/benchmark/test_t2i.py +0 -142
- tests/benchmark/test_vlm.py +0 -80
- tests/cli/__init__.py +0 -1
- tests/cli/test_all.py +0 -269
- tests/cli/test_collection.py +0 -99
- tests/cli/test_custom.py +0 -268
- tests/cli/test_reasoning.py +0 -81
- tests/common.py +0 -73
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -178
- tests/rag/test_clip_benchmark.py +0 -87
- tests/rag/test_mteb.py +0 -213
- tests/rag/test_ragas.py +0 -128
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -146
- tests/swift/test_run_swift_vlm_eval.py +0 -128
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
- tests/test_run_all.py +0 -12
- tests/utils.py +0 -13
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -102
- {tests/rag → evalscope/benchmarks/ai2d}/__init__.py +0 -0
- {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/LICENSE +0 -0
- {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/WHEEL +0 -0
- {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/entry_points.txt +0 -0

evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py
ADDED
@@ -0,0 +1,64 @@
+import re
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+SUBSET_LIST = ['default']
+
+OPEN_PROMPT = (
+    'Read the picture and solve the following problem step by step.'
+    'The last line of your response should be of the form'
+    ' "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the problem.\n\n'
+    '{question}\n\n'
+    'Remember to put your answer on its own line at the end in the form'
+    ' "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the problem,'
+    ' and you do not need to use a \\boxed command.'
+)
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='real_world_qa',
+        pretty_name='RealWorldQA',
+        tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.QA],
+        description=
+        'RealWorldQA is a benchmark designed to evaluate the real-world spatial understanding capabilities of multimodal AI models, contributed by XAI. It assesses how well these models comprehend physical environments. The benchmark consists of 700+ images, each accompanied by a question and a verifiable answer. These images are drawn from real-world scenarios, including those captured from vehicles. The goal is to advance AI models\' understanding of our physical world.',  # noqa: E501
+        dataset_id='lmms-lab/RealWorldQA',
+        subset_list=SUBSET_LIST,
+        metric_list=['acc'],
+        eval_split='test',
+        prompt_template=OPEN_PROMPT,
+    )
+)
+class RealWorldQAAdapter(VisionLanguageAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        content_list: list[Content] = [ContentText(text=OPEN_PROMPT.format(question=record['question']))]
+        image = record.get('image')
+        if image:
+            image_base64 = bytes_to_base64(image['bytes'], format='webp', add_header=True)
+            content_list.append(ContentImage(image=image_base64))
+        return Sample(
+            input=[ChatMessageUser(content=content_list)],
+            target=record['answer'],
+            metadata={'image_path': record['image_path']}
+        )
+
+    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+        pattern = r'ANSWER:\s*(.*)'
+        match = re.search(pattern, prediction)
+        if match:
+            return match.group(1).strip()
+        return ''

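The adapter grades against the "ANSWER: ..." convention that OPEN_PROMPT enforces. A quick standalone check of that extraction rule (plain Python, mirroring extract_answer above):

import re

# Mirrors the pattern used by RealWorldQAAdapter.extract_answer.
prediction = 'The sign limits the speed to 35 mph.\nANSWER: 35'
match = re.search(r'ANSWER:\s*(.*)', prediction)
print(match.group(1).strip() if match else '')  # -> 35
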
evalscope/benchmarks/tau_bench/tau_bench_adapter.py
CHANGED
@@ -47,7 +47,12 @@ class TauBenchAdapter(DefaultDataAdapter):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
-        check_import(
+        check_import(
+            'tau_bench',
+            package='git+https://github.com/sierra-research/tau-bench',
+            raise_error=True,
+            feature_name=self.pretty_name
+        )
 
         # setup user model args
         self.user_model = self.extra_params.get('user_model', 'qwen-plus')

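The expanded check_import call now names both an installable source and the feature that requires it. evalscope's real helper lives in evalscope.utils.import_utils; the sketch below is only a hypothetical illustration of what a guard with this signature typically does, not the actual implementation:

import importlib.util
from typing import List, Optional, Union

def check_import(module: Union[str, List[str]], package: Optional[str] = None,
                 raise_error: bool = False, feature_name: Optional[str] = None) -> bool:
    """Hypothetical sketch: verify that `module` is importable and, if not,
    point at the pip package and the feature that needs it."""
    modules = [module] if isinstance(module, str) else list(module)
    missing = [m for m in modules if importlib.util.find_spec(m) is None]
    if not missing:
        return True
    hint = f"pip install {package or ' '.join(missing)}"
    message = f"{feature_name or 'This feature'} requires {missing}; try `{hint}`."
    if raise_error:
        raise ImportError(message)
    print(message)
    return False
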
evalscope/config.py
CHANGED
@@ -18,6 +18,7 @@ from evalscope.constants import (
 )
 from evalscope.utils.argument_utils import BaseArgument, parse_int_or_float
 from evalscope.utils.deprecation_utils import deprecated_warning
+from evalscope.utils.import_utils import check_import
 from evalscope.utils.io_utils import dict_to_yaml, gen_hash, safe_filename
 from evalscope.utils.logger import get_logger
 
@@ -124,6 +125,19 @@ class TaskConfig(BaseArgument):
     analysis_report: bool = False
     """Whether to generate detailed analysis reports after evaluation."""
 
+    # Sandbox configuration arguments
+    use_sandbox: bool = False
+    """Whether to execute code in a sandboxed environment."""
+
+    sandbox_type: Optional[str] = 'docker'
+    """Type of sandbox environment for code execution (e.g., docker). Default is 'docker'."""
+
+    sandbox_manager_config: Optional[Dict] = field(default_factory=dict)
+    """Configuration for the sandbox manager. Default is local manager. If url is provided, it will use remote manager."""
+
+    sandbox_config: Optional[Dict] = field(default_factory=dict)
+    """Configuration for sandboxed code execution environments."""
+
     def __post_init__(self):
         self.__init_model_and_id()
 
@@ -132,6 +146,7 @@ class TaskConfig(BaseArgument):
         # Set default generation_config and model_args
         self.__init_default_generation_config()
         self.__init_default_model_args()
+        self.__init_default_sandbox_config()
 
     def __init_model_and_id(self):
         # Set model to DummyCustomModel if not provided
@@ -223,6 +238,14 @@ class TaskConfig(BaseArgument):
             'precision': 'torch.float16',
         }
 
+    def __init_default_sandbox_config(self):
+        if not self.use_sandbox:
+            return
+        check_import('ms_enclave', 'ms_enclave[docker]', raise_error=True)
+
+        if not self.sandbox_type:
+            self.sandbox_type = 'docker'
+
     def update(self, other: Union['TaskConfig', dict]):
         if isinstance(other, TaskConfig):
             other = other.to_dict()
@@ -238,7 +261,7 @@ class TaskConfig(BaseArgument):
             logger.warning(f'Failed to dump overall task config: {e}')
 
     def to_dict(self):
-        result = copy.
+        result = copy.copy(self.__dict__)
         del result['api_key']  # Do not expose api_key in the config
 
         if isinstance(self.model, (Model, ModelAPI)):

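A minimal sketch of switching the new sandbox options on, assuming the usual top-level TaskConfig/run_task entry points; the model name is a placeholder:

from evalscope import TaskConfig, run_task

# Sketch: run a code benchmark with sandboxed execution enabled
# ('my-model' is a placeholder, not a real model id).
task = TaskConfig(
    model='my-model',
    datasets=['live_code_bench'],
    use_sandbox=True,            # execute generated code in a sandbox
    sandbox_type='docker',       # default backend; requires ms_enclave[docker]
    sandbox_manager_config={},   # empty -> local manager; a 'url' key selects a remote manager
)
run_task(task_cfg=task)
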
evalscope/constants.py
CHANGED
@@ -15,6 +15,7 @@ DEFAULT_ROOT_CACHE_DIR = DEFAULT_DATASET_CACHE_DIR # compatible with old versio
 DEFAULT_EVALSCOPE_CACHE_DIR = os.path.expanduser(
     os.getenv('EVALSCOPE_CACHE', '~/.cache/evalscope')
 )  # ~/.cache/evalscope
+IS_BUILD_DOC = os.getenv('BUILD_DOC', '0') == '1'  # To avoid some heavy dependencies when building doc
 
 
 class HubType:
@@ -130,6 +131,8 @@ class Tags:
     TEXT_TO_IMAGE = 'TextToImage'
     IMAGE_EDITING = 'ImageEditing'
     MULTI_MODAL = 'MultiModal'
+    MULTI_LINGUAL = 'MultiLingual'
+    MULTI_TURN = 'MultiTurn'
 
 
 class FileConstants:

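The new IS_BUILD_DOC flag lets modules skip heavy optional imports while documentation is being built (BUILD_DOC=1). A hypothetical guard using it:

from evalscope.constants import IS_BUILD_DOC

# Hypothetical pattern: only pull in a heavy dependency when not building docs.
if not IS_BUILD_DOC:
    import torch  # heavy import only needed at evaluation time
else:
    torch = None  # docs build gets a lightweight stub
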
evalscope/evaluator/evaluator.py
CHANGED
@@ -8,8 +8,9 @@ and report generation.
 """
 
 import os
+import traceback
 from collections import defaultdict
-from concurrent.futures import ThreadPoolExecutor, as_completed
+from concurrent.futures import ThreadPoolExecutor, TimeoutError, as_completed
 from tqdm import tqdm
 from typing import TYPE_CHECKING, Dict, List, Tuple, Union
 
@@ -17,6 +18,7 @@ from evalscope.api.dataset import Dataset, DatasetDict, Sample
 from evalscope.api.evaluator import CacheManager, Evaluator, TaskState
 from evalscope.api.metric import AggScore, SampleScore
 from evalscope.report import Report, gen_table
+from evalscope.utils.logger import get_logger
 
 if TYPE_CHECKING:
     from evalscope.api.benchmark import DataAdapter
@@ -24,8 +26,6 @@ if TYPE_CHECKING:
     from evalscope.config import TaskConfig
     from evalscope.utils.io_utils import OutputsStructure
 
-from evalscope.utils.logger import get_logger
-
 logger = get_logger()
 
 
@@ -104,6 +104,9 @@ class DefaultEvaluator(Evaluator):
 
         # Generate the report based on aggregated scores
         report = self.get_report(agg_score_dict)
+
+        # Finalize the evaluation process
+        self.finalize()
         return report
 
     def evaluate_subset(self, subset: str, dataset: Dataset) -> List[AggScore]:
@@ -186,7 +189,10 @@ class DefaultEvaluator(Evaluator):
             logger.debug(f'Model result: \n{model_result.pretty_print()}')
 
         except Exception as exc:
-
+            tb_str = traceback.format_exc()
+            logger.error(
+                f'{sample.model_dump_json(indent=2)} prediction failed: due to {exc}\nTraceback:\n{tb_str}'
+            )
             if self.task_config.ignore_errors:
                 logger.warning('Error ignored, continuing with next sample.')
             else:
@@ -253,7 +259,13 @@ class DefaultEvaluator(Evaluator):
         for future in as_completed(future_to_task_state):
             task_state = future_to_task_state[future]
             try:
-
+                try:
+                    sample_score = future.result()
+                except TimeoutError:
+                    logger.warning(
+                        f'Timeout when reviewing sample {task_state.sample_id}, setting score to zero.'
+                    )
+                    sample_score = SampleScore(sample_id=task_state.sample_id, scores={})
                 sample_score_list.append(sample_score)
 
                 # Save the review result to cache for future use
@@ -266,7 +278,10 @@ class DefaultEvaluator(Evaluator):
             logger.debug(f'Review result: \n{review_result.pretty_print()}')
 
         except Exception as exc:
-
+            tb_str = traceback.format_exc()
+            logger.error(
+                f'Error when review sample {task_state.sample_id}: due to {exc}\nTraceback:\n{tb_str}'
+            )
             if self.task_config.ignore_errors:
                 logger.warning('Error ignored, continuing with next sample.')
             else:
@@ -319,7 +334,7 @@ class DefaultEvaluator(Evaluator):
 
         # Generate and display a summary table of results
         try:
-            report_table = gen_table(report_list=[report], add_overall_metric=
+            report_table = gen_table(report_list=[report], add_overall_metric=self.benchmark.add_overall_metric)
             logger.info(f'\n{self.benchmark_name} report table:'
                         f'\n{report_table} \n')
         except Exception:
@@ -337,3 +352,6 @@ class DefaultEvaluator(Evaluator):
         report.to_json(report_file)
         logger.info(f'Dump report to: {report_file} \n')
         return report
+
+    def finalize(self, *args, **kwargs):
+        self.benchmark.finalize(*args, **kwargs)

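The review loop now turns a per-future timeout into a zero score instead of an error. The underlying concurrent.futures pattern, shown here independently of evalscope's classes:

import time
from concurrent.futures import ThreadPoolExecutor, TimeoutError

def review(sample_id: int) -> float:
    # Stand-in for a potentially slow review/judge call.
    time.sleep(0.1 * sample_id)
    return 1.0

samples = [1, 2, 30]
scores = {}
with ThreadPoolExecutor(max_workers=4) as pool:
    futures = {pool.submit(review, s): s for s in samples}
    for future, sample_id in futures.items():
        try:
            # result(timeout=...) raises concurrent.futures.TimeoutError if the
            # worker is not done in time; treat that as a zero score.
            scores[sample_id] = future.result(timeout=1.0)
        except TimeoutError:
            scores[sample_id] = 0.0
print(scores)  # -> {1: 1.0, 2: 1.0, 30: 0.0}
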
evalscope/metrics/metric.py
CHANGED
@@ -1,3 +1,4 @@
+import json
 from collections import defaultdict
 from typing import List
 
@@ -6,11 +7,19 @@ from evalscope.api.registry import register_aggregation, register_metric
 from .metrics import mean
 
 
+def normalize_text(text: str) -> str:
+    """Normalize text by lowering case and stripping whitespace."""
+    return text.strip().lower()
+
+
 @register_metric(name='exact_match')
 class ExactMatch(Metric):
 
     def apply(self, predictions, references):
-        return [
+        return [
+            float(normalize_text(prediction) == normalize_text(reference))
+            for prediction, reference in zip(predictions, references)
+        ]
 
 
 @register_metric(name='acc')
@@ -92,6 +101,56 @@ class MultiChoiceAcc(Metric):
         return res
 
 
+@register_metric(name='anls')
+class ANLS(Metric):
+
+    def __init__(self, thresh_hold=0.5):
+        self.thresh_hold = thresh_hold
+
+    def apply(self, predictions, references):
+        """
+        Calculate ANLS (Average Normalized Levenshtein Similarity) for a list of predictions and references.
+        This implementation is adapted from
+        https://github.com/QwenLM/Qwen-VL/blob/master/eval_mm/infographicsvqa_eval.py
+
+        Args:
+            references (List[str]): List of correct answers. Each answer can be a string of json.
+            predictions (List[str]): List of predicted answers.
+        """
+        from .metrics import levenshtein_distance
+
+        res = []
+        # Unwrap predictions if it's a nested list
+        for prediction, reference in zip(predictions, references):
+            # Parse the reference which is a json string
+            try:
+                answer = json.loads(reference)
+            except json.JSONDecodeError:
+                answer = reference
+            if isinstance(answer, str):
+                answer = [answer]
+            assert isinstance(answer, list), 'The reference answer should be a list of answers.'
+
+            # Calculate ANLS for each reference answer
+            values = []
+            for ans in answer:
+                # preprocess both the answers - gt and prediction
+                gt_answer = ' '.join(ans.strip().lower().split())
+                det_answer = ' '.join(prediction.strip().lower().split())
+
+                dist = levenshtein_distance(gt_answer, det_answer)
+                length = max(len(ans.upper()), len(prediction.upper()))
+                values.append(0.0 if length == 0 else float(dist) / float(length))
+
+            question_result = 0.0
+            if values:
+                question_result = 1 - min(values)
+                if question_result < self.thresh_hold:
+                    question_result = 0.0
+            res.append(question_result)
+        return res
+
+
 # ##################
 # T2I Metrics ######
 ####################
@@ -202,6 +261,9 @@ class Mean(Aggregator):
 
     name = 'mean'
 
+    def agg_func(self, values: List[float]) -> float:
+        return mean(values)
+
     def __call__(self, scores: List[SampleScore]) -> List[AggScore]:
         """Aggregate scores by computing the mean for each metric.
 
@@ -230,7 +292,7 @@ class Mean(Aggregator):
         if values:  # Only process non-empty value lists
             aggregated_scores.append(
                 AggScore(
-                    score=
+                    score=self.agg_func(values),
                     metric_name=metric_name,
                     aggregation_name=self.name,
                     num=len(values),
@@ -241,6 +303,20 @@ class Mean(Aggregator):
         return aggregated_scores
 
 
+@register_aggregation(name='clipped_mean')
+class ClippedMean(Mean):
+
+    name = 'clipped_mean'
+
+    def __init__(self, clip_min: float = 0.0, clip_max: float = 1.0):
+        self.clip_min = clip_min
+        self.clip_max = clip_max
+
+    def agg_func(self, values: List[float]) -> float:
+        clipped_values = min(max(mean(values), self.clip_min), self.clip_max)
+        return clipped_values
+
+
 @register_aggregation(name='pass_at_k')
 class PassAtK(Aggregator):

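To make the new ANLS metric concrete, a small worked example (assuming ANLS can be imported and instantiated directly from evalscope.metrics.metric as defined above):

from evalscope.metrics.metric import ANLS

# References may be JSON lists of acceptable answers; per-sample scores
# below the 0.5 threshold are clipped to 0.
anls = ANLS(thresh_hold=0.5)
print(anls.apply(['Paris', '48 %'], ['["paris"]', '48%']))
# -> [1.0, 0.75]: exact case-insensitive match, then one edit over max length 4 (1 - 0.25).
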
evalscope/metrics/metrics.py
CHANGED
@@ -467,3 +467,19 @@ def calculate_pass_at_k(
         num_samples_it = iter(num_samples)
 
     return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)])
+
+
+def levenshtein_distance(s1, s2):
+    if len(s1) > len(s2):
+        s1, s2 = s2, s1
+
+    distances = range(len(s1) + 1)
+    for i2, c2 in enumerate(s2):
+        distances_ = [i2 + 1]
+        for i1, c1 in enumerate(s1):
+            if c1 == c2:
+                distances_.append(distances[i1])
+            else:
+                distances_.append(1 + min((distances[i1], distances[i1 + 1], distances_[-1])))
+        distances = distances_
+    return distances[-1]

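A quick sanity check of the new helper on the classic textbook pair:

from evalscope.metrics.metrics import levenshtein_distance

# 'kitten' -> 'sitting' needs 3 edits (k->s, e->i, insert g).
print(levenshtein_distance('kitten', 'sitting'))  # 3
print(levenshtein_distance('', 'abc'))            # 3 (three insertions)
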
File without changes (×10)

evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py
CHANGED
@@ -30,13 +30,9 @@ from transformers.modeling_outputs import (
     SequenceClassifierOutput,
     TokenClassifierOutput,
 )
-from transformers.modeling_utils import (
-    PreTrainedModel,
-    apply_chunking_to_forward,
-    find_pruneable_heads_and_indices,
-    prune_linear_layer,
-)
+from transformers.modeling_utils import PreTrainedModel
 from transformers.models.bert.configuration_bert import BertConfig
+from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
 from transformers.utils import logging
 from typing import Any, Dict, Optional, Tuple

evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py
CHANGED
@@ -14,13 +14,9 @@ from transformers.modeling_outputs import (
     BaseModelOutputWithPastAndCrossAttentions,
     BaseModelOutputWithPoolingAndCrossAttentions,
 )
-from transformers.modeling_utils import (
-    PreTrainedModel,
-    apply_chunking_to_forward,
-    find_pruneable_heads_and_indices,
-    prune_linear_layer,
-)
+from transformers.modeling_utils import PreTrainedModel
 from transformers.models.bert.configuration_bert import BertConfig
+from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
 from transformers.utils import logging
 from typing import Tuple
 

evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py
CHANGED
@@ -31,13 +31,9 @@ from transformers.modeling_outputs import (
     SequenceClassifierOutput,
     TokenClassifierOutput,
 )
-from transformers.modeling_utils import (
-    PreTrainedModel,
-    apply_chunking_to_forward,
-    find_pruneable_heads_and_indices,
-    prune_linear_layer,
-)
+from transformers.modeling_utils import PreTrainedModel
 from transformers.models.bert.configuration_bert import BertConfig
+from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
 from transformers.utils import logging
 from typing import Optional, Tuple
 

evalscope/models/model_apis.py
CHANGED
@@ -28,7 +28,7 @@ def server() -> type[ModelAPI]:
 
 @register_model_api(name='llm_ckpt')
 def llm_ckpt() -> type[ModelAPI]:
-    check_import('torch', package='torch', raise_error=True)
+    check_import('torch', package='torch', raise_error=True, feature_name='llm_ckpt')
 
     from .modelscope import ModelScopeAPI
 
@@ -38,7 +38,7 @@ def llm_ckpt() -> type[ModelAPI]:
 @register_model_api(name='checkpoint')
 @deprecated(since='1.0.0', remove_in='1.1.0', alternative='llm_ckpt')
 def checkpoint() -> type[ModelAPI]:
-    check_import('torch', package='torch', raise_error=True)
+    check_import('torch', package='torch', raise_error=True, feature_name='llm_ckpt')
 
     from .modelscope import ModelScopeAPI
 
@@ -47,9 +47,10 @@ def checkpoint() -> type[ModelAPI]:
 
 @register_model_api(name='text2image')
 def text2image() -> type[ModelAPI]:
-    check_import('torch',
-
-
+    check_import(['torch', 'torchvision', 'diffusers'],
+                 package='evalscope[aigc]',
+                 raise_error=True,
+                 feature_name='text2image')
 
     from .text2image_model import Text2ImageAPI
 
@@ -58,9 +59,10 @@ def text2image() -> type[ModelAPI]:
 
 @register_model_api(name='image_editing')
 def image_editing() -> type[ModelAPI]:
-    check_import('torch',
-
-
+    check_import(['torch', 'torchvision', 'diffusers'],
+                 package='evalscope[aigc]',
+                 raise_error=True,
+                 feature_name='image_editing')
 
     from .image_edit_model import ImageEditAPI
 

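Each factory above defers its heavy backend import until the model API is actually requested, after the dependency check has passed. The general shape of that lazy-registration pattern, with simplified stand-in names rather than evalscope's real registry:

import importlib.util

# Stand-in registry; names here are illustrative, not evalscope's API.
_MODEL_APIS = {}

def register_api(name):
    def decorator(factory):
        _MODEL_APIS[name] = factory   # store the factory, not a resolved class
        return factory
    return decorator

@register_api('text2image')
def text2image():
    # Dependency check first; the heavy import only runs if it passes.
    missing = [m for m in ('torch', 'torchvision', 'diffusers')
               if importlib.util.find_spec(m) is None]
    if missing:
        raise ImportError(f'text2image needs {missing}; try `pip install evalscope[aigc]`')
    from diffusers import DiffusionPipeline  # deferred heavy import
    return DiffusionPipeline

def get_api(name):
    return _MODEL_APIS[name]()  # factories resolve lazily, at use time
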
evalscope/models/utils/openai.py
CHANGED
@@ -104,10 +104,9 @@ def openai_chat_completion_part(content: Content) -> ChatCompletionContentPartPa
         )
     elif content.type == 'audio':
         audio_data_uri = file_as_data_uri(content.audio)
-        audio_data = audio_data_uri.split('base64,')[1]
 
         return ChatCompletionContentPartInputAudioParam(
-            type='input_audio', input_audio=dict(data=
+            type='input_audio', input_audio=dict(data=audio_data_uri, format=content.format)
         )
 
     else:

evalscope/perf/arguments.py
CHANGED
@@ -55,6 +55,7 @@ class Arguments(BaseArgument):
     image_height: int = 224  # Height of the image for random VL dataset
     image_format: str = 'RGB'  # Image format for random VL dataset
     image_num: int = 1  # Number of images for random VL dataset
+    image_patch_size: int = 28  # Patch size for image tokenizer, only for local image token calculation
 
     # Dataset settings
     dataset: str = 'openqa'  # Dataset type (default: 'line_by_line')
@@ -171,6 +172,7 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--image-height', type=int, default=224, help='Height of the image for random VL dataset')
     parser.add_argument('--image-format', type=str, default='RGB', help='Image format for random VL dataset')
     parser.add_argument('--image-num', type=int, default=1, help='Number of images for random VL dataset')
+    parser.add_argument('--image-patch-size', type=int, default=28, help='Patch size for image tokenizer, only for local image token calculation')  # noqa: E501
 
     # Output settings
     parser.add_argument('--outputs-dir', help='Outputs dir.', default='outputs')

evalscope/perf/plugin/api/base.py
CHANGED
@@ -43,7 +43,7 @@ class ApiPluginBase:
 
     @abstractmethod
     async def process_request(self, client_session: aiohttp.ClientSession, url: str, headers: Dict,
-                              body: Dict) -> AsyncGenerator[Tuple[bool, int,
+                              body: Dict) -> AsyncGenerator[Tuple[bool, int, Any], None]:
         """Process the HTTP request and handle the response.
 
         Args:
@@ -53,7 +53,7 @@ class ApiPluginBase:
             body: The request body
 
         Yields:
-            Tuple[bool, int,
+            Tuple[bool, int, Any]: (is_error, status_code, response_data)
         """
         raise NotImplementedError
 

evalscope/perf/plugin/api/default_api.py
CHANGED
@@ -18,7 +18,7 @@ class DefaultApiPlugin(ApiPluginBase):
         super().__init__(param)
 
     async def process_request(self, client_session: aiohttp.ClientSession, url: str, headers: Dict,
-                              body: Dict) -> AsyncGenerator[Tuple[bool, int,
+                              body: Dict) -> AsyncGenerator[Tuple[bool, int, Any], None]:
         """Process the HTTP request and handle the response.
 
         Args:
@@ -28,7 +28,7 @@ class DefaultApiPlugin(ApiPluginBase):
             body: The request body
 
         Yields:
-            Tuple[bool, int,
+            Tuple[bool, int, Any]: (is_error, status_code, response_data)
         """
         try:
             headers = {'Content-Type': 'application/json', **headers}
@@ -40,7 +40,7 @@ class DefaultApiPlugin(ApiPluginBase):
             logger.error(f'Error in process_request: {e}')
             yield (True, None, str(e))
 
-    async def _handle_stream(self, response: aiohttp.ClientResponse) -> AsyncGenerator[Tuple[bool, int,
+    async def _handle_stream(self, response: aiohttp.ClientResponse) -> AsyncGenerator[Tuple[bool, int, Any], None]:
         """Handle streaming response from server-sent events.
 
         Args:
@@ -71,14 +71,14 @@ class DefaultApiPlugin(ApiPluginBase):
             logger.error(f'Error in _handle_stream: {e}')
             yield True, response.status, str(e)
 
-    async def _handle_response(self, response: aiohttp.ClientResponse) -> AsyncGenerator[Tuple[bool, int,
+    async def _handle_response(self, response: aiohttp.ClientResponse) -> AsyncGenerator[Tuple[bool, int, Any], None]:
        """Handle the HTTP response based on content type and status.
 
         Args:
             response: The aiohttp response object
 
         Yields:
-            Tuple[bool, int,
+            Tuple[bool, int, Any]: (is_error, status_code, response_data)
         """
         response_status = response.status
         response_content_type = response.content_type
@@ -94,7 +94,7 @@ class DefaultApiPlugin(ApiPluginBase):
         # Handle successful response with 'application/json' content type
         elif content_type_json in response_content_type:
             content = await response.json()
-            yield (False, response_status,
+            yield (False, response_status, content)
         # Handle other successful responses
         else:
             content = await response.read()
@@ -102,4 +102,4 @@ class DefaultApiPlugin(ApiPluginBase):
         else:
             # error is always in JSON format
             error = await response.json()
-            yield (True, response_status,
+            yield (True, response_status, error)

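The new --image-patch-size option feeds the local estimate of how many vision tokens an image contributes. A common patch-grid approximation is shown below; the exact formula evalscope uses is not quoted here, and this sketch assumes 28-pixel-patch tokenizers in the style of Qwen-VL:

import math

def estimate_image_tokens(width: int, height: int, patch_size: int = 28) -> int:
    # Rough estimate: one token per patch_size x patch_size tile.
    return math.ceil(width / patch_size) * math.ceil(height / patch_size)

# The default 224x224 random VL image with 28-pixel patches:
print(estimate_image_tokens(224, 224))  # 8 * 8 = 64 tokens
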