evalscope 0.7.1__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +1 -1
- evalscope/arguments.py +73 -0
- evalscope/backend/base.py +5 -1
- evalscope/backend/opencompass/api_meta_template.py +8 -14
- evalscope/backend/opencompass/backend_manager.py +24 -15
- evalscope/backend/opencompass/tasks/eval_api.py +1 -6
- evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
- evalscope/backend/rag_eval/__init__.py +3 -3
- evalscope/backend/rag_eval/backend_manager.py +21 -25
- evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
- evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
- evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
- evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
- evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
- evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
- evalscope/backend/rag_eval/cmteb/base.py +22 -23
- evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
- evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
- evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
- evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
- evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
- evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
- evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
- evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
- evalscope/backend/rag_eval/ragas/__init__.py +2 -2
- evalscope/backend/rag_eval/ragas/arguments.py +3 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +10 -15
- evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
- evalscope/backend/rag_eval/utils/clip.py +46 -50
- evalscope/backend/rag_eval/utils/embedding.py +12 -11
- evalscope/backend/rag_eval/utils/llm.py +8 -6
- evalscope/backend/rag_eval/utils/tools.py +12 -11
- evalscope/backend/vlm_eval_kit/__init__.py +1 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
- evalscope/benchmarks/arc/__init__.py +3 -2
- evalscope/benchmarks/arc/ai2_arc.py +19 -16
- evalscope/benchmarks/arc/arc_adapter.py +32 -24
- evalscope/benchmarks/bbh/__init__.py +1 -2
- evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
- evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
- evalscope/benchmarks/benchmark.py +16 -16
- evalscope/benchmarks/ceval/__init__.py +3 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
- evalscope/benchmarks/ceval/ceval_exam.py +18 -31
- evalscope/benchmarks/cmmlu/__init__.py +3 -2
- evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
- evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
- evalscope/benchmarks/competition_math/__init__.py +3 -2
- evalscope/benchmarks/competition_math/competition_math.py +7 -16
- evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
- evalscope/benchmarks/data_adapter.py +24 -24
- evalscope/benchmarks/general_qa/__init__.py +3 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +34 -38
- evalscope/benchmarks/gsm8k/__init__.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +26 -24
- evalscope/benchmarks/hellaswag/__init__.py +3 -2
- evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +27 -23
- evalscope/benchmarks/humaneval/__init__.py +1 -1
- evalscope/benchmarks/humaneval/humaneval.py +15 -18
- evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -1
- evalscope/benchmarks/mmlu/__init__.py +3 -2
- evalscope/benchmarks/mmlu/mmlu.py +15 -29
- evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
- evalscope/benchmarks/race/__init__.py +3 -2
- evalscope/benchmarks/race/race.py +21 -35
- evalscope/benchmarks/race/race_adapter.py +32 -29
- evalscope/benchmarks/race/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/__init__.py +3 -2
- evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
- evalscope/benchmarks/truthful_qa/__init__.py +3 -2
- evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
- evalscope/cli/cli.py +6 -5
- evalscope/cli/start_eval.py +31 -0
- evalscope/cli/start_perf.py +0 -3
- evalscope/cli/start_server.py +27 -41
- evalscope/config.py +119 -95
- evalscope/constants.py +61 -29
- evalscope/evaluator/__init__.py +1 -0
- evalscope/evaluator/evaluator.py +96 -377
- evalscope/evaluator/humaneval_evaluator.py +158 -0
- evalscope/evaluator/rating_eval.py +12 -33
- evalscope/evaluator/reviewer/auto_reviewer.py +47 -76
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
- evalscope/metrics/code_metric.py +3 -9
- evalscope/metrics/math_accuracy.py +3 -6
- evalscope/metrics/metrics.py +21 -21
- evalscope/metrics/rouge_metric.py +11 -25
- evalscope/models/__init__.py +1 -2
- evalscope/models/api/openai_api.py +40 -29
- evalscope/models/custom/__init__.py +0 -1
- evalscope/models/custom/custom_model.py +3 -3
- evalscope/models/dummy_chat_model.py +7 -8
- evalscope/models/model_adapter.py +89 -156
- evalscope/models/openai_model.py +20 -20
- evalscope/perf/arguments.py +15 -3
- evalscope/perf/benchmark.py +7 -9
- evalscope/perf/http_client.py +3 -8
- evalscope/perf/main.py +10 -0
- evalscope/perf/plugin/api/custom_api.py +1 -2
- evalscope/perf/plugin/api/dashscope_api.py +1 -2
- evalscope/perf/plugin/api/openai_api.py +3 -4
- evalscope/perf/plugin/datasets/base.py +1 -2
- evalscope/perf/plugin/datasets/flickr8k.py +1 -2
- evalscope/perf/plugin/datasets/longalpaca.py +1 -2
- evalscope/perf/plugin/datasets/openqa.py +1 -2
- evalscope/perf/utils/analysis_result.py +1 -2
- evalscope/perf/utils/benchmark_util.py +1 -2
- evalscope/perf/utils/db_util.py +11 -8
- evalscope/perf/utils/local_server.py +19 -13
- evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
- evalscope/registry/tasks/arc.yaml +2 -3
- evalscope/registry/tasks/bbh.yaml +3 -4
- evalscope/registry/tasks/bbh_mini.yaml +3 -4
- evalscope/registry/tasks/ceval.yaml +3 -3
- evalscope/registry/tasks/ceval_mini.yaml +3 -4
- evalscope/registry/tasks/cmmlu.yaml +3 -3
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
- evalscope/registry/tasks/general_qa.yaml +1 -1
- evalscope/registry/tasks/gsm8k.yaml +2 -2
- evalscope/registry/tasks/mmlu.yaml +3 -3
- evalscope/registry/tasks/mmlu_mini.yaml +3 -3
- evalscope/run.py +184 -375
- evalscope/run_arena.py +20 -25
- evalscope/summarizer.py +16 -17
- evalscope/third_party/longbench_write/README.md +99 -42
- evalscope/third_party/longbench_write/default_task.json +1 -1
- evalscope/third_party/longbench_write/default_task.yaml +8 -7
- evalscope/third_party/longbench_write/eval.py +29 -28
- evalscope/third_party/longbench_write/infer.py +16 -104
- evalscope/third_party/longbench_write/longbench_write.py +5 -5
- evalscope/third_party/longbench_write/resources/judge.txt +1 -1
- evalscope/third_party/longbench_write/tools/data_etl.py +4 -5
- evalscope/third_party/longbench_write/utils.py +0 -1
- evalscope/third_party/toolbench_static/eval.py +14 -15
- evalscope/third_party/toolbench_static/infer.py +48 -69
- evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
- evalscope/third_party/toolbench_static/requirements.txt +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +3 -3
- evalscope/tools/combine_reports.py +25 -30
- evalscope/tools/rewrite_eval_results.py +14 -46
- evalscope/utils/__init__.py +0 -1
- evalscope/utils/arena_utils.py +18 -48
- evalscope/{perf/utils → utils}/chat_service.py +3 -4
- evalscope/utils/completion_parsers.py +3 -8
- evalscope/utils/logger.py +9 -7
- evalscope/utils/model_utils.py +11 -0
- evalscope/utils/utils.py +12 -138
- evalscope/version.py +2 -2
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/METADATA +125 -120
- evalscope-0.8.0.dist-info/RECORD +285 -0
- tests/cli/test_run.py +54 -15
- tests/perf/test_perf.py +4 -0
- tests/rag/test_clip_benchmark.py +38 -38
- tests/rag/test_mteb.py +3 -2
- tests/rag/test_ragas.py +5 -5
- tests/swift/test_run_swift_eval.py +2 -3
- tests/swift/test_run_swift_vlm_eval.py +2 -3
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
- evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
- evalscope/cache.py +0 -98
- evalscope/models/template.py +0 -1446
- evalscope/run_ms.py +0 -140
- evalscope/utils/task_cfg_parser.py +0 -10
- evalscope/utils/task_utils.py +0 -22
- evalscope-0.7.1.dist-info/RECORD +0 -286
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/LICENSE +0 -0
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/WHEEL +0 -0
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/top_level.txt +0 -0
evalscope/__init__.py
CHANGED
evalscope/arguments.py
ADDED
@@ -0,0 +1,73 @@
+import argparse
+import json
+
+
+class ParseStrArgsAction(argparse.Action):
+
+    def __call__(self, parser, namespace, values, option_string=None):
+        assert isinstance(values, str), 'args should be a string.'
+
+        arg_dict = {}
+        for arg in values.strip().split(','):
+            key, value = map(str.strip, arg.split('=', 1))  # Use maxsplit=1 to handle multiple '='
+            try:
+                # Safely evaluate the value using eval
+                arg_dict[key] = eval(value)
+            except Exception:
+                # If eval fails, check if it's a boolean value
+                value_lower = value.lower()
+                if value_lower == 'true':
+                    arg_dict[key] = True
+                elif value_lower == 'false':
+                    arg_dict[key] = False
+                else:
+                    # If not a boolean, keep the original string
+                    arg_dict[key] = value
+
+        setattr(namespace, self.dest, arg_dict)
+
+
+def add_argument(parser: argparse.ArgumentParser):
+    # yapf: disable
+    # Model-related arguments
+    parser.add_argument('--model', type=str, required=False, help='The model id on modelscope, or local model dir.')
+    parser.add_argument('--model-args', type=str, action=ParseStrArgsAction, help='The model args, should be a string.')
+
+    # Template-related arguments
+    parser.add_argument('--template-type', type=str, required=False, help='Deprecated, will be removed in v1.0.0.')
+    parser.add_argument('--chat-template', type=str, required=False, help='The custom jinja template for chat generation.')  # noqa: E501
+
+    # Dataset-related arguments
+    parser.add_argument('--datasets', type=str, nargs='+', required=False, help='Dataset id list, align to the module name in evalscope.benchmarks')  # noqa: E501
+    parser.add_argument('--dataset-args', type=json.loads, default='{}', help='The dataset args, should be a json string.')  # noqa: E501
+    parser.add_argument('--dataset-dir', help='The datasets dir.')
+    parser.add_argument('--dataset-hub', help='The datasets hub.')
+
+    # Generation configuration arguments
+    parser.add_argument('--generation-config', type=str, action=ParseStrArgsAction, help='The generation config, should be a string.')  # noqa: E501
+
+    # Evaluation-related arguments
+    parser.add_argument('--eval-type', type=str, help='The type for evaluating.')
+    parser.add_argument('--eval-backend', type=str, help='The evaluation backend to use.')
+    parser.add_argument('--eval-config', type=str, required=False, help='The eval task config file path for evaluation backend.')  # noqa: E501
+    parser.add_argument('--stage', type=str, default='all', help='The stage of evaluation pipeline.')
+    parser.add_argument('--limit', type=int, default=None, help='Max evaluation samples num for each subset.')
+
+    # Cache and working directory arguments
+    parser.add_argument('--mem-cache', action='store_true', default=False, help='Deprecated, will be removed in v1.0.0.')  # noqa: E501
+    parser.add_argument('--use-cache', type=str, help='Path to reuse the cached results.')
+    parser.add_argument('--work-dir', type=str, help='The root cache dir.')
+
+    # Debug and runtime mode arguments
+    parser.add_argument('--debug', action='store_true', default=False, help='Debug mode, will print information for debugging.')  # noqa: E501
+    parser.add_argument('--dry-run', action='store_true', default=False, help='Dry run in single processing mode.')
+    parser.add_argument('--seed', type=int, default=42, help='Random seed for reproducibility.')
+    # yapf: enable
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Run evaluation on benchmarks for LLMs.')
+    add_argument(parser)
+
+    args = parser.parse_args()
+    return args
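A quick sketch of what the new `ParseStrArgsAction` does with a `--model-args` string (the flag and parsing rules are from the file above; the key/value pairs are illustrative):

```python
import argparse

from evalscope.arguments import ParseStrArgsAction

parser = argparse.ArgumentParser()
parser.add_argument('--model-args', type=str, action=ParseStrArgsAction)

# Pairs are split on ',' and each value goes through eval();
# on failure, 'true'/'false' become booleans and anything else stays a string.
ns = parser.parse_args(['--model-args', 'revision=master,precision=16,trust_remote_code=true'])
print(ns.model_args)  # {'revision': 'master', 'precision': 16, 'trust_remote_code': True}
```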
evalscope/backend/base.py
CHANGED
@@ -1,11 +1,13 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 from typing import Union

+from evalscope.config import TaskConfig
 from evalscope.utils import yaml_to_dict


 class BackendManager:
-    def __init__(self, config: Union[str, dict], **kwargs):
+
+    def __init__(self, config: Union[str, dict, TaskConfig], **kwargs):
         """
         BackendManager is the base class for the evaluation backend manager.
         It provides the basic configuration parsing, command generation, task submission, and result fetching.
@@ -15,6 +17,8 @@ class BackendManager:
         """
         if isinstance(config, str):
             self.config_d = yaml_to_dict(config)
+        elif isinstance(config, TaskConfig):
+            self.config_d = config.eval_config
         else:
             self.config_d = config

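In practice the widened signature means a backend manager can now be constructed from any of the three forms; a minimal sketch (file name and keys are illustrative):

```python
from evalscope.backend.base import BackendManager

BackendManager(config='backend_task.yaml')  # str: parsed via yaml_to_dict
BackendManager(config={'tool': 'mteb'})     # dict: stored on self.config_d as-is
# TaskConfig: only its eval_config dict is used (the new branch above)
# BackendManager(config=TaskConfig(...))
```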
evalscope/backend/opencompass/api_meta_template.py
CHANGED
@@ -1,6 +1,6 @@
+# isort: skip_file
 # Copyright (c) Alibaba, Inc. and its affiliates.
-from typing import
-
+from typing import Any, Dict, List
 """
 The API meta template for OpenCompass.

@@ -26,18 +26,16 @@ class MetaTemplateType:
 TEMPLATE_MAPPING: Dict[str, Dict[str, Any]] = {}


-def register_template(name: str,
-                      template: Dict[str, Any],
-                      exists_ok: bool = False):
+def register_template(name: str, template: Dict[str, Any], exists_ok: bool = False):
     if not exists_ok and name in TEMPLATE_MAPPING:
-        raise ValueError(f
+        raise ValueError(f'The `{name}` has already been registered in the TEMPLATE_MAPPING.')

     TEMPLATE_MAPPING[name] = template


 def get_template(name: str) -> Dict[str, Any]:
     if name not in TEMPLATE_MAPPING:
-        raise ValueError(f
+        raise ValueError(f'The `{name}` has not been registered in the TEMPLATE_MAPPING.')

     return TEMPLATE_MAPPING[name]

@@ -46,16 +44,12 @@ def get_template(name: str) -> Dict[str, Any]:
 register_template(
     name=MetaTemplateType.default_api_meta_template_oc,
     template=dict(
-        round=[
-
-            dict(role='BOT', api_role='BOT', generate=True)
-        ],
+        round=[dict(role='HUMAN', api_role='HUMAN'),
+               dict(role='BOT', api_role='BOT', generate=True)],
         reserved_roles=[
            dict(role='SYSTEM', api_role='SYSTEM'),
         ],
-    )
-)
-
+    ))

 if __name__ == '__main__':
     res = MetaTemplateType.get_template_name_list()
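For orientation, the registry round-trip these helpers implement, sketched with a made-up template name:

```python
from evalscope.backend.opencompass.api_meta_template import get_template, register_template

tpl = dict(
    round=[dict(role='HUMAN', api_role='HUMAN'),
           dict(role='BOT', api_role='BOT', generate=True)],
    reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
)

register_template('my-api-meta-template', tpl)  # hypothetical name
assert get_template('my-api-meta-template') is tpl
# Registering the same name again raises ValueError unless exists_ok=True:
register_template('my-api-meta-template', tpl, exists_ok=True)
```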
evalscope/backend/opencompass/backend_manager.py
CHANGED
@@ -1,13 +1,13 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-from enum import Enum
-from typing import Optional, Union
 import subprocess
-from dataclasses import asdict
 import tempfile
+from dataclasses import asdict
+from enum import Enum
+from typing import Optional, Union

-from evalscope.utils import is_module_installed, get_module_path, get_valid_list
 from evalscope.backend.base import BackendManager
 from evalscope.backend.opencompass.api_meta_template import get_template
+from evalscope.utils import get_module_path, get_valid_list, is_module_installed
 from evalscope.utils.logger import get_logger

 logger = get_logger()
@@ -107,8 +107,8 @@ class OpenCompassBackendManager(BackendManager):

     @staticmethod
     def list_datasets(return_details: bool = False):
-        from opencompass.utils.run import get_config_from_arg
         from dataclasses import dataclass
+        from opencompass.utils.run import get_config_from_arg

         @dataclass
         class TempArgs:
@@ -160,18 +160,18 @@ class OpenCompassBackendManager(BackendManager):
             None
         """
         if run_mode == RunMode.FUNCTION:
-            from opencompass.cli.main import run_task
             from opencompass.cli.arguments import ApiModelConfig
+            from opencompass.cli.main import run_task

             assert isinstance(self.args.models, list) and len(self.args.models) > 0, 'The models are required.'

             tmp_model_d: dict = self.args.models[0]
             assert 'path' in tmp_model_d and 'openai_api_base' in tmp_model_d, \
-                f
+                f'Got invalid model config: {tmp_model_d}. \nTo get valid format: ' \
                 "{'path': 'qwen-7b-chat', 'openai_api_base': 'http://127.0.0.1:8000/v1/chat/completions'}"

             # Get valid datasets
-            dataset_names = self.args.datasets
+            dataset_names = self.args.datasets  # e.g. ['mmlu', 'ceval']
             dataset_names_all, real_dataset_all = self.list_datasets(return_details=True)

             if not dataset_names:
@@ -185,7 +185,9 @@ class OpenCompassBackendManager(BackendManager):
             assert len(valid_dataset_names) > 0, f'No valid datasets. ' \
                 f'To get the valid datasets, please refer to {dataset_names_all}'

-            valid_datasets = [
+            valid_datasets = [
+                _dataset for _dataset in real_dataset_all if _dataset['dataset_name'] in valid_dataset_names
+            ]
             for _dataset in valid_datasets:
                 _dataset.pop('dataset_name')
                 _dataset['reader_cfg']['test_range'] = self.args.limit
@@ -232,16 +234,23 @@ class OpenCompassBackendManager(BackendManager):
 if __name__ == '__main__':

     # OpenCompassBackendManager.list_datasets()
-    # ['mmlu', 'WSC', 'DRCD', 'chid', 'gsm8k', 'AX_g', 'BoolQ', 'cmnli', 'ARC_e', 'ocnli_fc', 'summedits', 'MultiRC',
+    # ['mmlu', 'WSC', 'DRCD', 'chid', 'gsm8k', 'AX_g', 'BoolQ', 'cmnli', 'ARC_e', 'ocnli_fc', 'summedits', 'MultiRC',
+    # 'GaokaoBench', 'obqa', 'math', 'agieval', 'hellaswag', 'RTE', 'race', 'flores', 'ocnli', 'strategyqa',
+    # 'triviaqa', 'WiC', 'COPA', 'commonsenseqa', 'piqa', 'nq', 'mbpp', 'csl', 'Xsum', 'CB', 'tnews', 'ARC_c',
+    # 'afqmc', 'eprstmt', 'ReCoRD', 'bbh', 'TheoremQA', 'CMRC', 'AX_b', 'siqa', 'storycloze', 'humaneval',
+    # 'cluewsc', 'winogrande', 'lambada', 'ceval', 'bustm', 'C3', 'lcsts']

     # 'meta_template': 'default-api-meta-template-oc',
     # models: llama3-8b-instruct, qwen-7b-chat
     oc_backend_manager = OpenCompassBackendManager(
-        config={
-
-
-
-
+        config={
+            'datasets': ['mmlu', 'ceval', 'ARC_c', 'gsm8k'],
+            'models': [{
+                'path': 'llama3-8b-instruct',
+                'openai_api_base': 'http://127.0.0.1:8000/v1/chat/completions'
+            }],
+            'limit': 5
+        })
     all_datasets = OpenCompassBackendManager.list_datasets()
     print(f'all_datasets: {all_datasets}')
     oc_backend_manager.run()
evalscope/backend/opencompass/tasks/eval_api.py
CHANGED
@@ -4,7 +4,6 @@ from opencompass.partitioners import NaivePartitioner
 from opencompass.runners import LocalRunner
 from opencompass.tasks import OpenICLInferTask

-
 with read_base():
     # from opencompass.configs.summarizers.medium import summarizer
     # from opencompass.configs.summarizers.PMMEval import summarizer
@@ -17,7 +16,6 @@ for _dataset in datasets:
         from opencompass.datasets.humaneval import humaneval_gpt_postprocess
         _dataset['eval_cfg']['pred_postprocessor']['type'] = humaneval_gpt_postprocess

-
 # 2. Get models, only for placeholder, you should fill in the real model information from command line
 # See more templates in `opencompass.cli.arguments.ApiModelConfig`
 models = []
@@ -25,8 +23,5 @@ models = []
 # 3. Get infer config
 infer = dict(
     partitioner=dict(type=NaivePartitioner),
-    runner=dict(
-        type=LocalRunner,
-        max_num_workers=4,
-        task=dict(type=OpenICLInferTask)),
+    runner=dict(type=LocalRunner, max_num_workers=4, task=dict(type=OpenICLInferTask)),
 )
evalscope/backend/opencompass/tasks/eval_datasets.py
CHANGED
@@ -2,18 +2,18 @@
 from mmengine.config import read_base

 with read_base():
-    from opencompass.configs.datasets.mmlu.mmlu_gen_4d595a import mmlu_datasets
-    from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
     from opencompass.configs.datasets.agieval.agieval_gen_64afd3 import agieval_datasets
-    from opencompass.configs.datasets.
-    from opencompass.configs.datasets.
-    from opencompass.configs.datasets.
+    from opencompass.configs.datasets.ARC_c.ARC_c_gen_1e0de5 import ARC_c_datasets
+    from opencompass.configs.datasets.ARC_e.ARC_e_gen_1e0de5 import ARC_e_datasets
+    from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
+    from opencompass.configs.datasets.CLUE_afqmc.CLUE_afqmc_gen_901306 import afqmc_datasets
     from opencompass.configs.datasets.CLUE_C3.CLUE_C3_gen_8c358f import C3_datasets
+    from opencompass.configs.datasets.CLUE_cmnli.CLUE_cmnli_gen_1abf97 import cmnli_datasets
     from opencompass.configs.datasets.CLUE_CMRC.CLUE_CMRC_gen_1bd3c8 import CMRC_datasets
     from opencompass.configs.datasets.CLUE_DRCD.CLUE_DRCD_gen_1bd3c8 import DRCD_datasets
-    from opencompass.configs.datasets.CLUE_afqmc.CLUE_afqmc_gen_901306 import afqmc_datasets
-    from opencompass.configs.datasets.CLUE_cmnli.CLUE_cmnli_gen_1abf97 import cmnli_datasets
     from opencompass.configs.datasets.CLUE_ocnli.CLUE_ocnli_gen_c4cb6c import ocnli_datasets
+    from opencompass.configs.datasets.cmb.cmb_gen_dfb5c4 import cmb_datasets
+    from opencompass.configs.datasets.cmmlu.cmmlu_gen_c13365 import cmmlu_datasets
     from opencompass.configs.datasets.FewCLUE_bustm.FewCLUE_bustm_gen_634f41 import bustm_datasets
     from opencompass.configs.datasets.FewCLUE_chid.FewCLUE_chid_gen_0a29a2 import chid_datasets
     from opencompass.configs.datasets.FewCLUE_cluewsc.FewCLUE_cluewsc_gen_c68933 import cluewsc_datasets
@@ -21,37 +21,37 @@ with read_base():
     from opencompass.configs.datasets.FewCLUE_eprstmt.FewCLUE_eprstmt_gen_740ea0 import eprstmt_datasets
     from opencompass.configs.datasets.FewCLUE_ocnli_fc.FewCLUE_ocnli_fc_gen_f97a97 import ocnli_fc_datasets
     from opencompass.configs.datasets.FewCLUE_tnews.FewCLUE_tnews_gen_b90e4a import tnews_datasets
-    from opencompass.configs.datasets.
+    from opencompass.configs.datasets.GaokaoBench.GaokaoBench_gen_5cfe9e import GaokaoBench_datasets
+    from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
+    from opencompass.configs.datasets.hellaswag.hellaswag_gen_6faab5 import hellaswag_datasets
+    from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
     from opencompass.configs.datasets.lambada.lambada_gen_217e11 import lambada_datasets
+    from opencompass.configs.datasets.lcsts.lcsts_gen_8ee1fe import lcsts_datasets
+    from opencompass.configs.datasets.math.math_gen_265cce import math_datasets
+    from opencompass.configs.datasets.mbpp.mbpp_gen_830460 import mbpp_datasets
+    from opencompass.configs.datasets.mmlu.mmlu_gen_4d595a import mmlu_datasets
+    from opencompass.configs.datasets.nq.nq_gen_c788f6 import nq_datasets
+    from opencompass.configs.datasets.obqa.obqa_gen_9069e4 import obqa_datasets
+    from opencompass.configs.datasets.piqa.piqa_gen_1194eb import piqa_datasets
+    from opencompass.configs.datasets.PMMEval.pmmeval_gen import PMMEval_datasets
+    from opencompass.configs.datasets.race.race_gen_69ee4f import race_datasets
+    from opencompass.configs.datasets.siqa.siqa_gen_e78df3 import siqa_datasets
     from opencompass.configs.datasets.storycloze.storycloze_gen_7f656a import storycloze_datasets
+    from opencompass.configs.datasets.strategyqa.strategyqa_gen_1180a7 import strategyqa_datasets
+    from opencompass.configs.datasets.summedits.summedits_gen_315438 import summedits_datasets
     from opencompass.configs.datasets.SuperGLUE_AX_b.SuperGLUE_AX_b_gen_4dfefa import AX_b_datasets
     from opencompass.configs.datasets.SuperGLUE_AX_g.SuperGLUE_AX_g_gen_68aac7 import AX_g_datasets
     from opencompass.configs.datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_gen_883d50 import BoolQ_datasets
     from opencompass.configs.datasets.SuperGLUE_CB.SuperGLUE_CB_gen_854c6c import CB_datasets
     from opencompass.configs.datasets.SuperGLUE_COPA.SuperGLUE_COPA_gen_91ca53 import COPA_datasets
     from opencompass.configs.datasets.SuperGLUE_MultiRC.SuperGLUE_MultiRC_gen_27071f import MultiRC_datasets
-    from opencompass.configs.datasets.SuperGLUE_RTE.SuperGLUE_RTE_gen_68aac7 import RTE_datasets
     from opencompass.configs.datasets.SuperGLUE_ReCoRD.SuperGLUE_ReCoRD_gen_30dea0 import ReCoRD_datasets
+    from opencompass.configs.datasets.SuperGLUE_RTE.SuperGLUE_RTE_gen_68aac7 import RTE_datasets
     from opencompass.configs.datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets
     from opencompass.configs.datasets.SuperGLUE_WSC.SuperGLUE_WSC_gen_7902a7 import WSC_datasets
-    from opencompass.configs.datasets.race.race_gen_69ee4f import race_datasets
-    from opencompass.configs.datasets.Xsum.Xsum_gen_31397e import Xsum_datasets
-    from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
-    from opencompass.configs.datasets.summedits.summedits_gen_315438 import summedits_datasets
-    from opencompass.configs.datasets.math.math_gen_265cce import math_datasets
-    from opencompass.configs.datasets.hellaswag.hellaswag_gen_6faab5 import hellaswag_datasets
-    from opencompass.configs.datasets.ARC_e.ARC_e_gen_1e0de5 import ARC_e_datasets
-    from opencompass.configs.datasets.ARC_c.ARC_c_gen_1e0de5 import ARC_c_datasets
-    from opencompass.configs.datasets.piqa.piqa_gen_1194eb import piqa_datasets
-    from opencompass.configs.datasets.siqa.siqa_gen_e78df3 import siqa_datasets
-    from opencompass.configs.datasets.strategyqa.strategyqa_gen_1180a7 import strategyqa_datasets
-    from opencompass.configs.datasets.winogrande.winogrande_gen_458220 import winogrande_datasets
-    from opencompass.configs.datasets.obqa.obqa_gen_9069e4 import obqa_datasets
-    from opencompass.configs.datasets.nq.nq_gen_c788f6 import nq_datasets
     from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
-    from opencompass.configs.datasets.
-    from opencompass.configs.datasets.
-    from opencompass.configs.datasets.PMMEval.pmmeval_gen import PMMEval_datasets
+    from opencompass.configs.datasets.winogrande.winogrande_gen_458220 import winogrande_datasets
+    from opencompass.configs.datasets.Xsum.Xsum_gen_31397e import Xsum_datasets

 # Note: to be supported
 # from opencompass.configs.datasets.flores.flores_gen_806ede import flores_datasets
@@ -59,7 +59,6 @@ with read_base():
 # from opencompass.configs.datasets.commonsenseqa.commonsenseqa_gen_c946f2 import commonsenseqa_datasets
 # from opencompass.configs.datasets.bbh.bbh_gen_5b92b0 import bbh_datasets

-
 datasets = []
 _locals = {k: v for k, v in locals().items() if k.endswith('_datasets')}

@@ -68,7 +67,6 @@ for k, v in _locals.items():
         _dataset['dataset_name'] = k.replace('_datasets', '')
         datasets.append(_dataset)

-
 if __name__ == '__main__':
     for _dataset in datasets:
         print(_dataset)
evalscope/backend/rag_eval/__init__.py
CHANGED
@@ -1,4 +1,4 @@
-from evalscope.backend.rag_eval.utils.embedding import EmbeddingModel
-from evalscope.backend.rag_eval.utils.llm import LLM, LocalLLM, ChatOpenAI
-from evalscope.backend.rag_eval.utils.clip import VisionModel
 from evalscope.backend.rag_eval.backend_manager import RAGEvalBackendManager
+from evalscope.backend.rag_eval.utils.clip import VisionModel
+from evalscope.backend.rag_eval.utils.embedding import EmbeddingModel
+from evalscope.backend.rag_eval.utils.llm import LLM, ChatOpenAI, LocalLLM
evalscope/backend/rag_eval/backend_manager.py
CHANGED
@@ -1,14 +1,15 @@
 import os
 from typing import Optional, Union
-
+
 from evalscope.backend.base import BackendManager
+from evalscope.utils import get_valid_list, is_module_installed
 from evalscope.utils.logger import get_logger

-
 logger = get_logger()


 class RAGEvalBackendManager(BackendManager):
+
     def __init__(self, config: Union[str, dict], **kwargs):
         """BackendManager for VLM Evaluation Kit

@@ -20,17 +21,16 @@ class RAGEvalBackendManager(BackendManager):
     @staticmethod
     def _check_env(module_name: str):
         if is_module_installed(module_name):
-            logger.info(f
+            logger.info(f'Check `{module_name}` Installed')
         else:
-            logger.error(f
+            logger.error(f'Please install `{module_name}` first')

     @staticmethod
     def run_mteb(model_args, eval_args):
-        from evalscope.backend.rag_eval.cmteb import ModelArguments,
-        from evalscope.backend.rag_eval.cmteb import one_stage_eval, two_stage_eval
+        from evalscope.backend.rag_eval.cmteb import EvalArguments, ModelArguments, one_stage_eval, two_stage_eval

         if len(model_args) > 2:
-            raise ValueError(
+            raise ValueError('Not support multiple models yet')

         # Convert arguments to dictionary
         model_args_list = [ModelArguments(**args).to_dict() for args in model_args]
@@ -43,12 +43,8 @@ class RAGEvalBackendManager(BackendManager):

     @staticmethod
     def run_ragas(testset_args, eval_args):
-        from evalscope.backend.rag_eval.ragas import rag_eval
+        from evalscope.backend.rag_eval.ragas import EvaluationArguments, TestsetGenerationArguments, rag_eval
         from evalscope.backend.rag_eval.ragas.tasks import generate_testset
-        from evalscope.backend.rag_eval.ragas import (
-            TestsetGenerationArguments,
-            EvaluationArguments,
-        )

         if testset_args is not None:
             generate_testset(TestsetGenerationArguments(**testset_args))
@@ -62,19 +58,19 @@ class RAGEvalBackendManager(BackendManager):
         evaluate(Arguments(**args))

     def run(self, *args, **kwargs):
-        tool = self.config_d.pop(
-        if tool.lower() ==
-            self._check_env(
-            model_args = self.config_d[
-            eval_args = self.config_d[
+        tool = self.config_d.pop('tool')
+        if tool.lower() == 'mteb':
+            self._check_env('mteb')
+            model_args = self.config_d['model']
+            eval_args = self.config_d['eval']
             self.run_mteb(model_args, eval_args)
-        elif tool.lower() ==
-            self._check_env(
-            testset_args = self.config_d.get(
-            eval_args = self.config_d.get(
+        elif tool.lower() == 'ragas':
+            self._check_env('ragas')
+            testset_args = self.config_d.get('testset_generation', None)
+            eval_args = self.config_d.get('eval', None)
             self.run_ragas(testset_args, eval_args)
-        elif tool.lower() ==
-            self._check_env(
-            self.run_clip_benchmark(self.config_d[
+        elif tool.lower() == 'clip_benchmark':
+            self._check_env('webdataset')
+            self.run_clip_benchmark(self.config_d['eval'])
         else:
-            raise ValueError(f
+            raise ValueError(f'Unknown tool: {tool}')
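The rewritten `run()` makes the dispatch contract explicit: `config_d['tool']` selects the runner and the sibling keys carry its arguments. A sketch of the expected shape (top-level keys are from the code above; the nested values are illustrative placeholders):

```python
from evalscope.backend.rag_eval import RAGEvalBackendManager

config = {
    'tool': 'mteb',  # or 'ragas' / 'clip_benchmark'
    'model': [{'model_name_or_path': 'path/to/embedding-model'}],  # illustrative fields
    'eval': {'tasks': ['SomeTask']},                               # illustrative fields
}
# For 'ragas', pass 'testset_generation' and 'eval' sections instead;
# for 'clip_benchmark', only 'eval' is read (and webdataset must be installed).
# RAGEvalBackendManager(config=config).run()
```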
evalscope/backend/rag_eval/clip_benchmark/arguments.py
CHANGED
@@ -1,5 +1,5 @@
 from dataclasses import dataclass, field
-from typing import
+from typing import Dict, List


 @dataclass
@@ -13,7 +13,7 @@ class Arguments:
     model_name: str
     revision: str = "master"
     hub: str = "modelscope"
-
+
     For API VLM model support, you can use the following fields, (image caption only):
     model_name="gpt-4o-mini"
     api_base: str = "",
@@ -23,12 +23,12 @@ class Arguments:
     models: List[Dict] = field(default_factory=dict)  # List of paths to the pre-trained models or model identifiers
     dataset_name: List[str] = field(default_factory=list)  # List of dataset names to be used
     data_dir: str = None  # Root directory where the datasets are stored
-    split: str =
+    split: str = 'test'  # Split of the dataset to be used (e.g., 'train', 'validation', 'test')
     task: str = None
     batch_size: int = 128  # Batch size for data loading
     num_workers: int = 1  # Number of workers for data loading
     verbose: bool = True  # Flag to enable verbose logging
-    output_dir: str =
-    cache_dir: str =
+    output_dir: str = 'outputs'  # Directory where the outputs (e.g., predictions, logs) will be saved
+    cache_dir: str = 'cache'  # Directory where the dataset cache will be stored
     skip_existing: bool = False  # Flag to skip processing if outputs already exist
-    limit: int = None
+    limit: int = None  # Limit the number of samples to be processed