evalscope 0.16.3__py3-none-any.whl → 0.17.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of evalscope has been flagged as potentially problematic by the registry.
- evalscope/app/app.py +9 -762
- evalscope/app/constants.py +1 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +52 -0
- evalscope/app/ui/multi_model.py +323 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +202 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +178 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +91 -0
- evalscope/backend/opencompass/backend_manager.py +2 -1
- evalscope/backend/rag_eval/backend_manager.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +1 -1
- evalscope/backend/vlm_eval_kit/backend_manager.py +4 -1
- evalscope/benchmarks/__init__.py +15 -1
- evalscope/benchmarks/aime/aime24_adapter.py +2 -1
- evalscope/benchmarks/aime/aime25_adapter.py +2 -1
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -1
- evalscope/benchmarks/arc/arc_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/utils.py +0 -12
- evalscope/benchmarks/bfcl/bfcl_adapter.py +1 -1
- evalscope/benchmarks/ceval/ceval_adapter.py +5 -16
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -21
- evalscope/benchmarks/competition_math/competition_math_adapter.py +2 -1
- evalscope/benchmarks/data_adapter.py +29 -9
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +411 -0
- evalscope/benchmarks/general_arena/utils.py +226 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +3 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +44 -30
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +118 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -21
- evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -4
- evalscope/benchmarks/iquiz/iquiz_adapter.py +1 -1
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -6
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +1 -1
- evalscope/benchmarks/math_500/math_500_adapter.py +2 -1
- evalscope/benchmarks/mmlu/mmlu_adapter.py +2 -2
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
- evalscope/benchmarks/musr/musr_adapter.py +1 -1
- evalscope/benchmarks/race/race_adapter.py +1 -1
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +110 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +7 -1
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +9 -4
- evalscope/benchmarks/utils.py +2 -2
- evalscope/benchmarks/winogrande/winogrande_adapter.py +1 -1
- evalscope/config.py +8 -123
- evalscope/constants.py +5 -21
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +20 -15
- evalscope/metrics/__init__.py +9 -1
- evalscope/{utils/utils.py → metrics/completion_parsers.py} +71 -176
- evalscope/metrics/llm_judge.py +106 -20
- evalscope/metrics/metrics.py +20 -8
- evalscope/models/__init__.py +4 -8
- evalscope/models/adapters/__init__.py +4 -9
- evalscope/models/adapters/base_adapter.py +4 -0
- evalscope/models/adapters/bfcl_adapter.py +2 -0
- evalscope/models/adapters/chat_adapter.py +3 -0
- evalscope/models/adapters/choice_adapter.py +4 -0
- evalscope/models/adapters/custom_adapter.py +7 -3
- evalscope/models/adapters/server_adapter.py +4 -2
- evalscope/models/adapters/t2i_adapter.py +3 -0
- evalscope/models/adapters/tau_bench_adapter.py +189 -0
- evalscope/models/custom/dummy_model.py +3 -3
- evalscope/models/register.py +0 -14
- evalscope/perf/arguments.py +15 -16
- evalscope/perf/benchmark.py +38 -39
- evalscope/perf/http_client.py +30 -86
- evalscope/perf/main.py +3 -3
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +22 -4
- evalscope/perf/plugin/api/custom_api.py +212 -55
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +105 -0
- evalscope/perf/plugin/api/openai_api.py +17 -19
- evalscope/perf/plugin/datasets/__init__.py +10 -7
- evalscope/perf/plugin/datasets/base.py +22 -1
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +4 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +2 -1
- evalscope/perf/plugin/datasets/random_dataset.py +15 -4
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +14 -20
- evalscope/perf/utils/db_util.py +79 -61
- evalscope/report/__init__.py +1 -1
- evalscope/report/utils.py +34 -15
- evalscope/run.py +1 -1
- evalscope/summarizer.py +1 -2
- evalscope/utils/__init__.py +63 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/import_utils.py +16 -0
- evalscope/utils/io_utils.py +55 -4
- evalscope/utils/model_utils.py +37 -1
- evalscope/version.py +2 -2
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/METADATA +100 -51
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/RECORD +129 -133
- tests/aigc/test_t2i.py +1 -1
- tests/cli/test_all.py +68 -4
- tests/cli/test_collection.py +1 -1
- tests/cli/test_custom.py +261 -0
- tests/cli/test_run.py +34 -70
- tests/perf/test_perf.py +31 -4
- tests/rag/test_clip_benchmark.py +2 -1
- tests/rag/test_mteb.py +3 -1
- tests/rag/test_ragas.py +3 -1
- tests/swift/test_run_swift_eval.py +2 -1
- tests/swift/test_run_swift_vlm_eval.py +2 -1
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -1
- tests/utils.py +13 -0
- tests/vlm/test_vlmeval.py +8 -2
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/models/model.py +0 -189
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- /evalscope/{utils → benchmarks}/filters.py +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/LICENSE +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/WHEEL +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/top_level.txt +0 -0
evalscope/config.py
CHANGED
@@ -1,7 +1,6 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
 import copy
-import json
 import os
 from argparse import Namespace
 from dataclasses import dataclass, field
@@ -10,18 +9,15 @@ from typing import Dict, List, Optional, Union
 from evalscope.constants import (DEFAULT_DATASET_CACHE_DIR, DEFAULT_WORK_DIR, EvalBackend, EvalStage, EvalType, HubType,
                                  JudgeStrategy, ModelTask, OutputType)
 from evalscope.models import CustomModel, DummyCustomModel
-from evalscope.utils import
-from evalscope.utils.io_utils import dict_to_yaml,
+from evalscope.utils.argument_utils import BaseArgument, parse_int_or_float
+from evalscope.utils.io_utils import dict_to_yaml, gen_hash
 from evalscope.utils.logger import get_logger
-from evalscope.utils.utils import parse_int_or_float
 
 logger = get_logger()
 
-cur_path = os.path.dirname(os.path.abspath(__file__))
-
 
 @dataclass
-class TaskConfig:
+class TaskConfig(BaseArgument):
     # Model-related arguments
     model: Union[str, 'CustomModel', None] = None
     model_id: Optional[str] = None
@@ -132,15 +128,6 @@ class TaskConfig:
         'precision': 'torch.float16',
     }
 
-    def to_dict(self):
-        result = self.__dict__.copy()
-        if isinstance(self.model, CustomModel):
-            result['model'] = self.model.__class__.__name__
-        return result
-
-    def __str__(self):
-        return json.dumps(self.to_dict(), indent=4, default=str, ensure_ascii=False)
-
     def update(self, other: Union['TaskConfig', dict]):
         if isinstance(other, TaskConfig):
            other = other.to_dict()
@@ -155,91 +142,11 @@ class TaskConfig:
         except Exception as e:
             logger.warning(f'Failed to dump overall task config: {e}')
 
-
-
-
-
-    def from_yaml(yaml_file: str):
-        return TaskConfig.from_dict(yaml_to_dict(yaml_file))
-
-    @staticmethod
-    def from_dict(d: dict):
-        return TaskConfig(**d)
-
-    @staticmethod
-    def from_json(json_file: str):
-        return TaskConfig.from_dict(json_to_dict(json_file))
-
-    @staticmethod
-    def from_args(args: Namespace):
-        # Convert Namespace to a dictionary and filter out None values
-        args_dict = {k: v for k, v in vars(args).items() if v is not None}
-
-        if 'func' in args_dict:
-            del args_dict['func']  # Note: compat CLI arguments
-
-        return TaskConfig.from_dict(args_dict)
-
-    @staticmethod
-    def load(custom_model: CustomModel, tasks: List[str]) -> List['TaskConfig']:
-        res_list = []
-        for task_name in tasks:
-            task = registry_tasks.get(task_name, None)
-            if task is None:
-                logger.error(f'No task found in tasks: {list(registry_tasks.keys())}, got task_name: {task_name}')
-                continue
-
-            task.model = custom_model
-            task.model_args = custom_model.config
-            task.model_id = type(custom_model).__name__
-            res_list.append(task)
-
-        return res_list
-
-    @staticmethod
-    def registry(name: str, data_pattern: str, dataset_dir: str = None, subset_list: list = None) -> None:
-        """
-        Register a new task (dataset) for evaluation.
-
-        Args:
-            name: str, the dataset name.
-            data_pattern: str, the data pattern for the task.
-                    e.g. `mmlu`, `ceval`, `gsm8k`, ...
-                    refer to task_config.list() for all available datasets.
-            dataset_dir: str, the directory to store multiple datasets files. e.g. /path/to/data,
-                    then your specific custom dataset directory will be /path/to/data/{name}
-            subset_list: list, the subset list for the dataset.
-                    e.g. ['middle_school_politics', 'operating_system']
-                    refer to the mmlu for example. https://github.com/hendrycks/test/blob/master/categories.py
-        """
-        available_datasets = list(registry_tasks.keys())
-        if data_pattern not in available_datasets:
-            logger.error(
-                f'No dataset found in available datasets: {available_datasets}, got data_pattern: {data_pattern}')
-            return
-
-        # Reuse the existing task config and update the datasets
-        pattern_config = registry_tasks[data_pattern]
-
-        custom_config = copy.deepcopy(pattern_config)
-        custom_config.datasets = [data_pattern]
-        custom_config.dataset_args = {data_pattern: {}}
-        custom_config.eval_type = EvalType.CHECKPOINT
-
-        if dataset_dir is not None:
-            custom_config.dataset_args[data_pattern].update({'local_path': dataset_dir})
-
-        if subset_list is not None:
-            custom_config.dataset_args[data_pattern].update({'subset_list': subset_list})
-
-        registry_tasks.update({name: custom_config})
-        logger.info(f'** Registered task: {name} with data pattern: {data_pattern}')
-
-
-tasks = ['arc', 'gsm8k', 'mmlu', 'cmmlu', 'ceval', 'bbh', 'general_qa']
-
-registry_tasks = {task: TaskConfig.from_yaml(os.path.join(cur_path, f'registry/tasks/{task}.yaml')) for task in tasks}
+    def to_dict(self):
+        result = self.__dict__.copy()
+        if isinstance(self.model, CustomModel):
+            result['model'] = self.model.__class__.__name__
+        return result
 
 
 def parse_task_config(task_cfg) -> TaskConfig:
@@ -264,25 +171,3 @@ def parse_task_config(task_cfg) -> TaskConfig:
     else:
         raise ValueError('Args: Please provide a valid task config.')
     return task_cfg
-
-
-class TempModel(CustomModel):
-
-    def __init__(self, config: dict):
-        super().__init__(config=config)
-
-    def predict(self, prompts: str, **kwargs):
-        return [item + ': response' for item in prompts]
-
-
-if __name__ == '__main__':
-    model = TempModel(config={'model_id': 'test-swift-dummy-model'})
-    task_config = TaskConfig()
-
-    # Register a new task
-    TaskConfig.registry(name='arc_swift', data_pattern='arc', dataset_dir='/path/to/swift_custom_work')
-
-    swift_eval_task: List[TaskConfig] = TaskConfig.load(custom_model=model, tasks=['gsm8k', 'arc', 'arc_swift'])
-    for item in swift_eval_task:
-        print(item)
-        print()
evalscope/constants.py
CHANGED
@@ -41,27 +41,6 @@ class MetricsConstant:
     ]
 
 
-class MetricMembers:
-
-    # Math accuracy metric
-    MATH_ACCURACY = 'math_accuracy'
-
-    # Code pass@k metric
-    CODE_PASS_K = 'code_pass_k'
-
-    # Code rouge metric
-    ROUGE = 'rouge'
-
-    # ELO rating system for pairwise comparison
-    ELO = 'elo'
-
-    # Pairwise comparison win/lose and tie(optional)
-    PAIRWISE = 'pairwise'
-
-    # Rating score for single model
-    SCORE = 'score'
-
-
 class ArenaWinner:
 
     MODEL_A = 'model_a'
@@ -172,6 +151,11 @@ class JudgeStrategy:
     LLM_RECALL = 'llm_recall'
 
 
+class JudgeScoreType:
+    NUMERIC = 'numeric'  # numeric score
+    PATTERN = 'pattern'  # pattern matching score
+
+
 class ModelTask:
     TEXT_GENERATION = 'text_generation'
     IMAGE_GENERATION = 'image_generation'
evalscope/evaluator/__init__.py
CHANGED
evalscope/evaluator/evaluator.py
CHANGED
@@ -7,16 +7,18 @@ from collections import OrderedDict, defaultdict
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from copy import deepcopy
 from tqdm import tqdm
-from typing import Any, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
 
 from evalscope.benchmarks import DataAdapter
 from evalscope.config import TaskConfig
 from evalscope.constants import AnswerKeys, DumpMode, EvalStage, EvalType, JudgeStrategy, ReviewKeys
-from evalscope.models import BaseModelAdapter
 from evalscope.report import Report, gen_table
-from evalscope.utils import
-from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list
+from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, gen_hash, jsonl_to_list
 from evalscope.utils.logger import get_logger
+from evalscope.utils.model_utils import dict_torch_dtype_to_str
+
+if TYPE_CHECKING:
+    from evalscope.models import BaseModelAdapter
 
 logger = get_logger()
 
@@ -38,7 +40,7 @@ class Evaluator(object):
 
     def __init__(self,
                  data_adapter: DataAdapter,
-                 model_adapter: BaseModelAdapter,
+                 model_adapter: 'BaseModelAdapter',
                  outputs: OutputsStructure = None,
                  task_cfg: TaskConfig = None,
                  **kwargs):
@@ -237,9 +239,10 @@ class Evaluator(object):
         if use_llm:
             # Use LLM as judge
             assert self.judge is not None, f'Judge model is required for LLM judging {self.data_adapter.name}'
+            pred_content = self.data_adapter.llm_parse_pred_result(
+                result=answer_content, raw_input_d=raw_input_d, eval_type=self.eval_type)
             review_result = self.data_adapter.llm_match(
-                gold_content,
-                pred = answer_content
+                gold_content, pred_content, self.judge, raw_input=raw_input_d)
         else:
             # Use rule-based judging
             pred_content = self.data_adapter.parse_pred_result(
@@ -250,15 +253,14 @@ class Evaluator(object):
         if (self.task_cfg.judge_strategy == JudgeStrategy.LLM_RECALL
                 and isinstance(review_result, (bool, int, float)) and not bool(review_result)):
             assert self.judge is not None, f'Judge model is required for LLM_RECALL strategy {self.data_adapter.name}'  # noqa: E501
+            pred_content = self.data_adapter.llm_parse_pred_result(
+                result=answer_content, raw_input_d=raw_input_d, eval_type=self.eval_type)
             review_result = self.data_adapter.llm_match(
-                gold_content,
-                pred = answer_content
-        else:
-            pred = pred_content
+                gold_content, pred_content, self.judge, raw_input=raw_input_d)
 
         choice[ReviewKeys.REVIEW] = {
             ReviewKeys.GOLD: gold_content if gold_content != raw_input_d else '*Same as Input*',
-            ReviewKeys.PRED:
+            ReviewKeys.PRED: pred_content,
             ReviewKeys.RESULT: review_result
         }
         rev_choices.append(choice)
@@ -394,9 +396,6 @@ class Evaluator(object):
         report_map: Report = self.data_adapter.gen_report(
             subset_score_map=reviews_score_all, model_name=self.model_name)
 
-        # Post process report
-        self.data_adapter.post_process_report(report_map, report_path=report_path)
-
         # Make table
         try:
             report_table = gen_table(report_list=[report_map], add_overall_metric=True)
@@ -418,6 +417,12 @@ class Evaluator(object):
         report_map.to_json(report_file)
         logger.info(f'Dump report to: {report_file} \n')
 
+        # Post process report
+        try:
+            self.data_adapter.post_process_report(report_map, report_path=report_path)
+        except Exception as e:
+            logger.error(f'Failed to post process report: {e}')
+
         return report_map
 
     def eval(self, **kwargs) -> dict:
evalscope/metrics/__init__.py
CHANGED
@@ -4,7 +4,8 @@ from typing import TYPE_CHECKING
 from evalscope.utils.import_utils import _LazyModule
 
 if TYPE_CHECKING:
-    from .
+    from .completion_parsers import ResponseParser, lmsys_parser, ranking_parser
+    from .llm_judge import DEFAULT_NUMERIC_SCORE_TEMPLATE, DEFAULT_PROMPT_TEMPLATE, LLMJudge
     from .math_parser import extract_answer, math_equal, strip_answer_string
     from .metrics import (bleu_ngram_one_sample, exact_match, macro_mean, mean, micro_mean, simple_f1_score,
                           weighted_mean)
@@ -33,12 +34,19 @@ else:
        ],
        'llm_judge': [
            'LLMJudge',
+           'DEFAULT_PROMPT_TEMPLATE',
+           'DEFAULT_NUMERIC_SCORE_TEMPLATE',
        ],
        'math_parser': [
            'extract_answer',
            'math_equal',
            'strip_answer_string',
        ],
+       'completion_parsers': [
+           'ResponseParser',
+           'lmsys_parser',
+           'ranking_parser',
+       ],
    }
 
    import sys
evalscope/{utils/utils.py → metrics/completion_parsers.py}
RENAMED
@@ -1,77 +1,85 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-#
+# flake8: noqa
 
-import
-import hashlib
-import importlib
-import importlib.util
-import numpy as np
-import os
-import random
+import ast
 import re
-import torch
-from inspect import signature
-from typing import Any, Dict, List, Tuple, Union
 
+# from . import utils as ann_utils
+from evalscope.constants import ArenaWinner
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
 
-
+one_score_pattern = re.compile('\[\[(\d+\.?\d*)\]\]')
+one_score_pattern_backup = re.compile('\[(\d+\.?\d*)\]')
 
-# Example: export TEST_LEVEL_LIST=0,1
-TEST_LEVEL_LIST_STR = 'TEST_LEVEL_LIST'
 
+# modified from: https://github.com/lm-sys/FastChat/blob/main/fastchat/eval/eval_gpt_review.py#L47
+# does not work with batched completions
+def lmsys_parser(completion, output_format):
+    if output_format == '[[rating]]':
+        match = re.search(one_score_pattern, completion)
+        if not match:
+            match = re.search(one_score_pattern_backup, completion)
 
-
-
-
-
-
-
-
-
-
-
+        if match:
+            rating = ast.literal_eval(match.groups()[0])
+        else:
+            logger.error(f'Content: {completion}\n'
+                         'You must manually fix the score.')
+            rating = -1
+
+        return rating
+    if output_format == '[[rating_a,rating_b]]':
+        try:
+            score_pair = completion.split('\n')[0]
+            score_pair = score_pair.replace(',', ' ')
+            sp = score_pair.split(' ')
+            if len(sp) == 2:
+                score_1 = float(sp[0])
+                score_2 = float(sp[1])
+                if score_1 > score_2:
+                    winner = ArenaWinner.MODEL_A
+                elif score_1 < score_2:
+                    winner = ArenaWinner.MODEL_B
+                else:
+                    if score_1 == score_1 == -1:
+                        winner = ArenaWinner.UNKNOWN
+                    winner = ArenaWinner.TIE
+                return winner, [score_1, score_2]
+            else:
+                raise Exception('Invalid score pair.')
+        except Exception as e:
+            logger.error(f'{e}\nContent: {completion}\nYou must manually fix the score pair.')
+            return ArenaWinner.UNKNOWN, [-1, -1]
+    elif output_format == '[[A]]':
+        if '[[A]]' in completion:
+            winner = ArenaWinner.MODEL_A
+        elif '[[B]]' in completion:
+            winner = ArenaWinner.MODEL_B
+        elif '[[C]]' in completion:
+            winner = ArenaWinner.TIE
+        else:
+            logger.error(f'\nContent: {completion}\nYou must manually fix the score.')
+            winner = ArenaWinner.UNKNOWN
+        return winner
+
+
+def ranking_parser(completion, **kwargs):
     try:
-
-
-
-
-
-        if spliter:
-            for attr in cls_name.split('.'):
-                obj_cls = getattr(obj_cls, attr)
+        if isinstance(completion, str):
+            ordered_completions = ast.literal_eval(completion)
+        else:
+            ordered_completions = completion
 
-
+        rank = [c for c in ordered_completions if c['model'] == 'model_a'][0]['rank']
+        assert rank in [1, 2]
 
-
-
-
-
-
-
-def gen_hash(name: str, bits: int = 32):
-    return hashlib.md5(name.encode(encoding='UTF-8')).hexdigest()[:bits]
-
-
-def dict_torch_dtype_to_str(d: Dict[str, Any]) -> dict:
-    """
-    Checks whether the passed dictionary and its nested dicts have a *torch_dtype* key and if it's not None,
-    converts torch.dtype to a string of just the type. For example, `torch.float32` get converted into *"float32"*
-    string, which can then be stored in the json format.
-
-    Refer to: https://github.com/huggingface/transformers/pull/16065/files for details.
-    """
-    if d.get('torch_dtype', None) is not None and not isinstance(d['torch_dtype'], str):
-        d['torch_dtype'] = str(d['torch_dtype']).split('.')[1]
-
-    for value in d.values():
-        if isinstance(value, dict):
-            dict_torch_dtype_to_str(value)
-
-    return d
+        return ArenaWinner.MODEL_A if rank == 1 else ArenaWinner.MODEL_B
+    except Exception as e:
+        logger.error(f'{e}\nContent: {completion}\n'
+                     'You must manually fix the score pair.')
+        return ArenaWinner.UNKNOWN
 
 
 class ResponseParser:
@@ -194,7 +202,6 @@ class ResponseParser:
             return last_capital
         return 'No valid option found'
 
-
     @staticmethod
     def parse_bracketed_answer(text: str, options: list[str]) -> str:
         options = ResponseParser.process_options(options)
@@ -212,121 +219,9 @@ class ResponseParser:
         options_pattern = '|'.join(escaped_options)
         return options_pattern
 
-def normalize_score(score: Union[float, dict], keep_num: int = 4) -> Union[float, dict]:
-    """
-    Normalize score.
-
-    Args:
-        score: input score, could be float or dict. e.g. 0.12345678 or {'acc': 0.12345678, 'f1': 0.12345678}
-        keep_num: number of digits to keep.
-
-    Returns:
-        Union[float, dict]: normalized score. e.g. 0.1234 or {'acc': 0.1234, 'f1': 0.1234}
-    """
-    if isinstance(score, float):
-        score = round(score, keep_num)
-    elif isinstance(score, dict):
-        score = {k: round(v, keep_num) for k, v in score.items()}
-    else:
-        logger.warning(f'Unknown score type: {type(score)}')
-
-    return score
-
-
-def is_module_installed(module_name):
-    try:
-        importlib.import_module(module_name)
-        return True
-    except ImportError:
-        return False
-
-
-def get_module_path(module_name):
-    spec = importlib.util.find_spec(module_name)
-    if spec and spec.origin:
-        return os.path.abspath(spec.origin)
-    else:
-        raise ValueError(f'Cannot find module: {module_name}')
-
-
-def get_valid_list(input_list, candidate_list):
-    """
-    Get the valid and invalid list from input_list based on candidate_list.
-    Args:
-        input_list: The input list.
-        candidate_list: The candidate list.
-
-    Returns:
-        valid_list: The valid list.
-        invalid_list: The invalid list.
-    """
-    return [i for i in input_list if i in candidate_list], \
-        [i for i in input_list if i not in candidate_list]
-
-
-def get_latest_folder_path(work_dir):
-    from datetime import datetime
-
-    # Get all subdirectories in the work_dir
-    folders = [f for f in os.listdir(work_dir) if os.path.isdir(os.path.join(work_dir, f))]
-
-    # Get the timestamp(YYYYMMDD_HHMMSS)
-    timestamp_pattern = re.compile(r'^\d{8}_\d{6}$')
-
-    # Filter out the folders
-    timestamped_folders = [f for f in folders if timestamp_pattern.match(f)]
-
-    if not timestamped_folders:
-        print(f'>> No timestamped folders found in {work_dir}!')
-        return None
-
-    # timestamp parser
-    def parse_timestamp(folder_name):
-        return datetime.strptime(folder_name, '%Y%m%d_%H%M%S')
-
-    # Find the latest folder
-    latest_folder = max(timestamped_folders, key=parse_timestamp)
-
-    return os.path.join(work_dir, latest_folder)
-
-
-def csv_to_list(file_path: str) -> List[dict]:
-    import csv
-
-    with open(file_path, mode='r', newline='', encoding='utf-8') as csv_file:
-        csv_reader = csv.DictReader(csv_file)
-        result = [row for row in csv_reader]
-
-    return result
-
-
-def seed_everything(seed: int):
-    """Set all random seeds to a fixed value for reproducibility.
-
-    Args:
-        seed (int): The seed value.
-    """
-    random.seed(seed)
-    np.random.seed(seed)
-    torch.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed_all(seed)
-        torch.backends.cudnn.deterministic = True
-        torch.backends.cudnn.benchmark = False
-
-def get_supported_params(func):
-    """Get the supported parameters of a function."""
-    sig = signature(func)
-    return list(sig.parameters.keys())
-
-def parse_int_or_float(num):
-    number = float(num)
-    if number.is_integer():
-        return int(number)
-    return number
 
 if __name__ == '__main__':
+    result = '**Answer: A **Answer: C**'
     options = ['A', 'B', 'C', 'D']
-
-
-    print(ResponseParser.parse_first_option(answer, options))
+    parsed_result = ResponseParser.parse_first_option(result, options)
+    print(f'Parsed result: {parsed_result}')  # Should print 'C'