evalscope 0.7.2__py3-none-any.whl → 0.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of evalscope has been flagged as possibly problematic.
- evalscope/__init__.py +1 -1
- evalscope/arguments.py +73 -0
- evalscope/backend/base.py +6 -2
- evalscope/backend/opencompass/api_meta_template.py +8 -14
- evalscope/backend/opencompass/backend_manager.py +24 -15
- evalscope/backend/opencompass/tasks/eval_api.py +1 -6
- evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
- evalscope/backend/rag_eval/__init__.py +3 -3
- evalscope/backend/rag_eval/backend_manager.py +21 -25
- evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
- evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
- evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
- evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
- evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
- evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
- evalscope/backend/rag_eval/cmteb/base.py +22 -23
- evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
- evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
- evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
- evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
- evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
- evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
- evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
- evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
- evalscope/backend/rag_eval/ragas/__init__.py +2 -2
- evalscope/backend/rag_eval/ragas/arguments.py +3 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +10 -15
- evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
- evalscope/backend/rag_eval/utils/clip.py +47 -51
- evalscope/backend/rag_eval/utils/embedding.py +13 -12
- evalscope/backend/rag_eval/utils/llm.py +8 -6
- evalscope/backend/rag_eval/utils/tools.py +12 -11
- evalscope/backend/vlm_eval_kit/__init__.py +1 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
- evalscope/benchmarks/arc/__init__.py +3 -2
- evalscope/benchmarks/arc/ai2_arc.py +19 -16
- evalscope/benchmarks/arc/arc_adapter.py +32 -24
- evalscope/benchmarks/bbh/__init__.py +1 -2
- evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
- evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
- evalscope/benchmarks/benchmark.py +16 -16
- evalscope/benchmarks/ceval/__init__.py +3 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
- evalscope/benchmarks/ceval/ceval_exam.py +18 -31
- evalscope/benchmarks/cmmlu/__init__.py +3 -2
- evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
- evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
- evalscope/benchmarks/competition_math/__init__.py +3 -2
- evalscope/benchmarks/competition_math/competition_math.py +7 -16
- evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
- evalscope/benchmarks/data_adapter.py +24 -24
- evalscope/benchmarks/general_qa/__init__.py +3 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +35 -39
- evalscope/benchmarks/gsm8k/__init__.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +27 -24
- evalscope/benchmarks/hellaswag/__init__.py +3 -2
- evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +28 -23
- evalscope/benchmarks/humaneval/__init__.py +1 -1
- evalscope/benchmarks/humaneval/humaneval.py +15 -18
- evalscope/benchmarks/humaneval/humaneval_adapter.py +192 -7
- evalscope/benchmarks/mmlu/__init__.py +3 -2
- evalscope/benchmarks/mmlu/mmlu.py +15 -29
- evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
- evalscope/benchmarks/race/__init__.py +3 -2
- evalscope/benchmarks/race/race.py +21 -35
- evalscope/benchmarks/race/race_adapter.py +33 -29
- evalscope/benchmarks/race/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/__init__.py +3 -2
- evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
- evalscope/benchmarks/truthful_qa/__init__.py +3 -2
- evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
- evalscope/cli/cli.py +6 -5
- evalscope/cli/start_eval.py +31 -0
- evalscope/cli/start_perf.py +0 -3
- evalscope/cli/start_server.py +27 -41
- evalscope/config.py +154 -96
- evalscope/constants.py +50 -32
- evalscope/evaluator/evaluator.py +97 -377
- evalscope/evaluator/rating_eval.py +12 -33
- evalscope/evaluator/reviewer/auto_reviewer.py +48 -76
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
- evalscope/metrics/code_metric.py +3 -9
- evalscope/metrics/math_accuracy.py +3 -6
- evalscope/metrics/metrics.py +21 -21
- evalscope/metrics/rouge_metric.py +11 -25
- evalscope/models/__init__.py +1 -2
- evalscope/models/api/openai_api.py +40 -29
- evalscope/models/custom/__init__.py +0 -1
- evalscope/models/custom/custom_model.py +3 -3
- evalscope/models/dummy_chat_model.py +7 -8
- evalscope/models/model_adapter.py +89 -156
- evalscope/models/openai_model.py +20 -20
- evalscope/perf/arguments.py +16 -3
- evalscope/perf/benchmark.py +9 -11
- evalscope/perf/http_client.py +3 -8
- evalscope/perf/main.py +8 -1
- evalscope/perf/plugin/api/custom_api.py +1 -2
- evalscope/perf/plugin/api/dashscope_api.py +1 -2
- evalscope/perf/plugin/api/openai_api.py +3 -4
- evalscope/perf/plugin/datasets/base.py +1 -2
- evalscope/perf/plugin/datasets/flickr8k.py +1 -2
- evalscope/perf/plugin/datasets/longalpaca.py +1 -2
- evalscope/perf/plugin/datasets/openqa.py +1 -2
- evalscope/perf/plugin/registry.py +3 -3
- evalscope/perf/utils/analysis_result.py +1 -2
- evalscope/perf/utils/benchmark_util.py +5 -6
- evalscope/perf/utils/db_util.py +77 -30
- evalscope/perf/utils/local_server.py +21 -13
- evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
- evalscope/registry/tasks/arc.yaml +2 -3
- evalscope/registry/tasks/bbh.yaml +3 -4
- evalscope/registry/tasks/bbh_mini.yaml +3 -4
- evalscope/registry/tasks/ceval.yaml +3 -3
- evalscope/registry/tasks/ceval_mini.yaml +3 -4
- evalscope/registry/tasks/cmmlu.yaml +3 -3
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
- evalscope/registry/tasks/general_qa.yaml +1 -1
- evalscope/registry/tasks/gsm8k.yaml +2 -2
- evalscope/registry/tasks/mmlu.yaml +3 -3
- evalscope/registry/tasks/mmlu_mini.yaml +3 -3
- evalscope/run.py +153 -381
- evalscope/run_arena.py +21 -25
- evalscope/summarizer.py +27 -40
- evalscope/third_party/longbench_write/README.md +99 -42
- evalscope/third_party/longbench_write/default_task.json +1 -1
- evalscope/third_party/longbench_write/default_task.yaml +8 -7
- evalscope/third_party/longbench_write/eval.py +29 -27
- evalscope/third_party/longbench_write/infer.py +16 -104
- evalscope/third_party/longbench_write/longbench_write.py +5 -4
- evalscope/third_party/longbench_write/resources/judge.txt +1 -1
- evalscope/third_party/longbench_write/tools/data_etl.py +5 -6
- evalscope/third_party/longbench_write/utils.py +0 -1
- evalscope/third_party/toolbench_static/eval.py +14 -15
- evalscope/third_party/toolbench_static/infer.py +48 -69
- evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
- evalscope/third_party/toolbench_static/requirements.txt +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +4 -3
- evalscope/tools/combine_reports.py +27 -34
- evalscope/tools/rewrite_eval_results.py +15 -47
- evalscope/utils/__init__.py +1 -1
- evalscope/utils/arena_utils.py +18 -48
- evalscope/{perf/utils → utils}/chat_service.py +4 -5
- evalscope/utils/completion_parsers.py +3 -8
- evalscope/utils/io_utils.py +162 -0
- evalscope/utils/logger.py +17 -7
- evalscope/utils/model_utils.py +11 -0
- evalscope/utils/utils.py +5 -306
- evalscope/version.py +2 -2
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/METADATA +123 -118
- evalscope-0.8.1.dist-info/RECORD +285 -0
- tests/cli/test_run.py +53 -15
- tests/perf/test_perf.py +6 -1
- tests/rag/test_clip_benchmark.py +38 -38
- tests/rag/test_mteb.py +3 -2
- tests/rag/test_ragas.py +5 -5
- tests/swift/test_run_swift_eval.py +2 -3
- tests/swift/test_run_swift_vlm_eval.py +2 -3
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
- tests/vlm/test_vlmeval.py +3 -2
- evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
- evalscope/cache.py +0 -98
- evalscope/models/template.py +0 -1446
- evalscope/run_ms.py +0 -140
- evalscope/utils/task_cfg_parser.py +0 -10
- evalscope/utils/task_utils.py +0 -22
- evalscope-0.7.2.dist-info/RECORD +0 -286
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/LICENSE +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/WHEEL +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/arc/__init__.py

@@ -1,5 +1,6 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-from evalscope.benchmarks.arc.arc_adapter import
+from evalscope.benchmarks.arc.arc_adapter import DATASET_ID, SUBSET_LIST
+from evalscope.benchmarks.arc.arc_adapter import ARCAdapter
 from evalscope.benchmarks.arc.arc_adapter import ARCAdapter as DataAdapterClass
-from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass
+from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass  # noqa
evalscope/benchmarks/arc/ai2_arc.py

@@ -1,18 +1,15 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 # Copyright (c) Allen Institute, and its affiliates.
 # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
-
 """AI2 ARC (Abstraction and Reasoning Corpus) for General Artificial Intelligence Benchmark."""
-
 """AUTO GENERATED, DO NOT EDIT"""
 
+import datasets
 import json
 import os
-import datasets
 
 # flake8: noqa
 
-
 _CITATION = """\
 @article{allenai:arc,
     author = {Peter Clark and Isaac Cowhey and Oren Etzioni and Tushar Khot and
@@ -85,17 +82,20 @@ class Ai2Arc(datasets.GeneratorBasedBuilder):
             # This is the description that will appear on the datasets page.
             description=_DESCRIPTION,
             # datasets.features.FeatureConnectors
-            features=datasets.Features(
-                …
-            }
+            features=datasets.Features({
+                'id':
+                datasets.Value('string'),
+                'question':
+                datasets.Value('string'),
+                'choices':
+                datasets.features.Sequence({
+                    'text': datasets.Value('string'),
+                    'label': datasets.Value('string')
+                }),
+                'answerKey':
+                datasets.Value('string')
+                # These are the features of your dataset like images, labels ...
+            }),
             # If there's a common (input, target) tuple from the features,
             # specify them here. They'll be used if as_supervised=True in
             # builder.as_dataset.
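The new schema nests `choices` as a `Sequence` of parallel `text`/`label` lists. For orientation, one record shaped to this schema would look like the sketch below (the values are made up, not drawn from the dataset):

```python
# Shape of one ARC record under the new features schema (values illustrative):
example = {
    'id': 'Mercury_SC_401234',  # hypothetical question id
    'question': 'Which unit is best for measuring the mass of an apple?',
    'choices': {
        'text': ['grams', 'liters', 'meters', 'seconds'],
        'label': ['A', 'B', 'C', 'D'],
    },
    'answerKey': 'A',
}
```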
evalscope/benchmarks/arc/ai2_arc.py

@@ -144,5 +144,8 @@ class Ai2Arc(datasets.GeneratorBasedBuilder):
                     'id': id_,
                     'answerKey': answerkey,
                     'question': question,
-                    'choices': {
+                    'choices': {
+                        'text': text_choices,
+                        'label': label_choices
+                    },
                 }
evalscope/benchmarks/arc/arc_adapter.py

@@ -1,10 +1,11 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-import os
 import json
+import os
+
 from evalscope.benchmarks.data_adapter import DataAdapter
 from evalscope.metrics.metrics import exact_match, weighted_mean
-from evalscope.utils import
+from evalscope.utils import ResponseParser, normalize_score
 from evalscope.utils.logger import get_logger
 
 # flake8: noqa
@@ -44,13 +45,14 @@ class ARCAdapter(DataAdapter):
         if few_shot_num != 0:
             logger.warning(f'few_shot_num is recommended to set 0 for ARC, got {few_shot_num}.')
 
-        super().__init__(
-            …
+        super().__init__(
+            subset_list=subset_list,
+            metric_list=metric_list,
+            few_shot_num=few_shot_num,
+            train_split=train_split,
+            eval_split=eval_split,
+            prompt_template=prompt_template,
+            **kwargs)
 
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         """
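The change to `super().__init__` is purely layout, but it spells out the keyword arguments the adapter expects. A minimal sketch of constructing the adapter against this signature, with illustrative values (the metric entry and split names are assumptions, not source defaults):

```python
# Hypothetical instantiation of ARCAdapter under the new explicit signature.
# Subset/split names and the metric entry are illustrative assumptions.
from evalscope.benchmarks.arc.arc_adapter import ARCAdapter
from evalscope.metrics.metrics import weighted_mean

adapter = ARCAdapter(
    subset_list=['ARC-Easy', 'ARC-Challenge'],
    metric_list=[{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}],
    few_shot_num=0,      # the adapter itself recommends 0-shot for ARC
    train_split='train',
    eval_split='test',
    prompt_template='',
)
```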
evalscope/benchmarks/arc/arc_adapter.py

@@ -78,8 +80,10 @@ class ARCAdapter(DataAdapter):
                 rows.append({
                     'id': item['id'],
                     'question': item['question']['stem'],
-                    'choices': {
-                    …
+                    'choices': {
+                        'text': [d['text'] for d in raw_choices],
+                        'label': [d['label'] for d in raw_choices]
+                    },
                     'answerKey': item['answerKey'],
                 })
 
@@ -143,9 +147,11 @@ class ARCAdapter(DataAdapter):
         if eval_type == 'checkpoint':
             return result
         elif eval_type == 'service':
-            return ResponseParser.parse_first_option_with_choices(
+            return ResponseParser.parse_first_option_with_choices(
+                text=result, options=self.choices)  # TODO: to be checked !
         elif eval_type == 'custom':
-            return ResponseParser.parse_first_option_with_choices(
+            return ResponseParser.parse_first_option_with_choices(
+                text=result, options=self.choices)  # TODO: to be checked !
         else:
             raise ValueError(f'Invalid eval_type: {eval_type}')
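Both the `service` and `custom` branches now route through `ResponseParser.parse_first_option_with_choices`. The parser's implementation is outside this diff; as a rough mental model, assuming it simply returns the first option label found in the response text, it behaves like this sketch (not the actual evalscope code):

```python
import re

def parse_first_option_sketch(text: str, options: list) -> str:
    """Illustrative stand-in for ResponseParser.parse_first_option_with_choices;
    not the actual evalscope implementation."""
    # Match the first standalone occurrence of any option label, e.g. 'A'-'D'.
    pattern = r'\b(' + '|'.join(re.escape(opt) for opt in options) + r')\b'
    match = re.search(pattern, text)
    return match.group(1) if match else ''

print(parse_first_option_sketch('The correct answer is (C).', ['A', 'B', 'C', 'D']))  # -> C
```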
evalscope/benchmarks/arc/arc_adapter.py

@@ -200,17 +206,19 @@ class ARCAdapter(DataAdapter):
         total_num: int = sum([num for _, num in subset_score_map.values()])
         weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
         weighted_avg_acc = normalize_score(score=weighted_avg_acc)
-        cate_avg_list = [{
-            …
+        cate_avg_list = [{
+            'name': subset_name,
+            'score': normalize_score(score=score)
+        } for subset_name, (score, _) in subset_score_map.items()]
+
+        category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
+
+        res_map = dict(
+            name=report_name or 'arc',
+            metric=self.metric_list[0]['name'],
+            score=weighted_avg_acc,
+            category=[category_d],
+            total_num=total_num)
 
         return res_map
 
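The rewritten `gen_report` groups per-subset scores under a single DEFAULT category. For instance, if ARC-Easy scored 0.80 over 100 samples and ARC-Challenge 0.60 over 50, the weighted average is (0.80 × 100 + 0.60 × 50) / 150 ≈ 0.7333, and the resulting map would look roughly like this (the metric name is an assumption; it is read from `metric_list[0]['name']`):

```python
res_map = {
    'name': 'arc',
    'metric': 'WeightedAverageAccuracy',  # assumed; taken from metric_list[0]['name']
    'score': 0.7333,
    'category': [{
        'name': 'DEFAULT',
        'score': 0.7333,
        'subset': [
            {'name': 'ARC-Easy', 'score': 0.8},
            {'name': 'ARC-Challenge', 'score': 0.6},
        ],
    }],
    'total_num': 150,
}
```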
evalscope/benchmarks/bbh/__init__.py

@@ -1,6 +1,5 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-
 from evalscope.benchmarks.bbh.bbh_adapter import DATASET_ID, SUBSET_LIST
 from evalscope.benchmarks.bbh.bbh_adapter import BBHAdapter as DataAdapterClass
-from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass
+from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass  # noqa
evalscope/benchmarks/bbh/bbh_adapter.py

@@ -1,22 +1,22 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
+import json
 import os
-import re
 import random
-import
+import re
 
 from evalscope.benchmarks.data_adapter import DataAdapter
 from evalscope.constants import AnswerKeys
 from evalscope.metrics.metrics import exact_match, weighted_mean
-from evalscope.utils import
+from evalscope.utils import ResponseParser, normalize_score
 from evalscope.utils.logger import get_logger
+
 # flake8: noqa
 
 logger = get_logger()
 
 DATASET_ID = 'modelscope/bbh'
 
-
 # BBH multiple choice subset list
 MULTIPLE_CHOICE = 'multiple_choice'
 MULTIPLE_CHOICE_LIST = [
@@ -87,12 +87,13 @@ class BBHAdapter(DataAdapter):
                            f'Use 3-shot by default.')
             few_shot_num = 3
 
-        super().__init__(
-            …
+        super().__init__(
+            subset_list=subset_list,
+            metric_list=metric_list,
+            few_shot_num=few_shot_num,
+            train_split=train_split,
+            eval_split=eval_split,
+            **kwargs)
 
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
@@ -151,10 +152,10 @@ class BBHAdapter(DataAdapter):
         if self.few_shot_num < 0:
             raise ValueError(f'Invalid shot_num: {self.few_shot_num} for few-shot evaluation.')
 
-        logger.info(f'
-                    f'>few_shot_num: {self.few_shot_num}, '
-                    f'>few_shot_split: {self.train_split}, '
-                    f'>target_eval_split: {self.eval_split}')
+        logger.info(f'Use default settings: '
+                    f'> few_shot_num: {self.few_shot_num}, '
+                    f'> few_shot_split: {self.train_split}, '
+                    f'> target_eval_split: {self.eval_split}')
 
         for sub_name, sub_data_dict in data_dict.items():
             few_shot_data = []
@@ -260,17 +261,19 @@ class BBHAdapter(DataAdapter):
         total_num: int = sum([num for _, num in subset_score_map.values()])
         weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
         weighted_avg_acc = normalize_score(score=weighted_avg_acc)
-        cate_avg_list = [{
-            …
+        cate_avg_list = [{
+            'name': subset_name,
+            'score': normalize_score(score=score)
+        } for subset_name, (score, _) in subset_score_map.items()]
+
+        category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
+
+        res_map = dict(
+            name=report_name or 'bbh',
+            metric=self.metric_list[0]['name'],
+            score=weighted_avg_acc,
+            category=[category_d],
+            total_num=total_num)
 
         return res_map
 
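BBH's `gen_report` follows the same pattern as the ARC one above. The aggregate is a sample-weighted mean, `sum(score_i * num_i) / sum(num_i)`, which the snippet below checks with made-up subset results:

```python
# Made-up BBH subset results: (score, num_samples) per subset.
subset_score_map = {'navigate': (0.72, 250), 'snarks': (0.55, 178)}

total_num = sum(num for _, num in subset_score_map.values())  # 428
weighted = sum(score * num for score, num in subset_score_map.values()) / total_num
print(round(weighted, 4))  # 0.6493
```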
evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt

@@ -20,4 +20,4 @@ A: Let's think step by step.
 Remember that (i) expressions inside brackets are always evaluated first and that (ii) the order of operations from highest priority to lowest priority is "not", "and", "or", respectively.
 We first simplify this expression "Z" as follows: "Z = not not ( not ( False ) ) = not not ( A )" where "A = not ( False )".
 Let's evaluate A: A = not ( False ) = not False = True.
-Plugging in A, we get: Z = not not ( A ) = not not (True) = not not False = True. So the answer is True.
+Plugging in A, we get: Z = not not ( A ) = not not (True) = not not False = True. So the answer is True.

evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt

@@ -22,4 +22,4 @@ Options:
 - Yes
 - No
 A: Let's think step by step.
-Here in this question, we are told that "He aims the dart at the low point region." A typical person might therefore think George did intentionally hit the low point region, because he wanted to lift up the spirit of his sister Lena. So the answer is Yes.
+Here in this question, we are told that "He aims the dart at the low point region." A typical person might therefore think George did intentionally hit the low point region, because he wanted to lift up the spirit of his sister Lena. So the answer is Yes.

evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt

@@ -30,4 +30,4 @@ Options:
 (E) 01/03/1982
 (F) 12/03/1960
 A: Let's think step by step.
-If Jane and John married on Jan 2, 1958, then and if it is their 5-year anniversary today, then today's date is Jan 2, 1963. The date tomorrow is Jan 3, 1963, that is 01/03/1963. So the answer is (B).
+If Jane and John married on Jan 2, 1958, then and if it is their 5-year anniversary today, then today's date is Jan 2, 1963. The date tomorrow is Jan 3, 1963, that is 01/03/1963. So the answer is (B).

evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt

@@ -34,4 +34,4 @@ A: Let's think step by step.
 Here we need to determine who the pronoun "his" might be referring to. There are two possible referents for "his", namely Bailey's and the director's. The verb phrase "plan to meet" might be able to help us determine which one is more likely (if either). Let X be Bailey and Y the director. The sentence is then of the form "X will plan to meet Y at (X or Y)'s office."
 Let's consider Y first: "X will plan to meet Y at Y's office." This case makes sense, because X might want to meet up with Y at Y's office.
 Now, consider X: "X will plan to meet Y at X's office." This case also makes sense, because X might want to meet up with Y at X's own office.
-Because both X and Y are possible at the same time, we conclude that the antecedent of the pronoun is ambiguous. So the answer is (C).
+Because both X and Y are possible at the same time, we conclude that the antecedent of the pronoun is ambiguous. So the answer is (C).

evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt

@@ -69,4 +69,4 @@ We should process each input one by one and keep track of the stack configuratio
 29: } ; stack: < [ {
 Now, we have reached the end. The final stack is "< [ {".
 We will need to pop out "{", "[", "<" one by one in that order.
-So, we need "}", "]", ">". So the answer is } ] >.
+So, we need "}", "]", ">". So the answer is } ] >.

evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt

@@ -41,4 +41,4 @@ Let’s see whether the Hypothesis can be deduced from the arguments (1) and (2)
 By (1), we have if X = infrequent-user(Paul Mitchell), then X = rare-consumer(Nioxin) OR X = loyal-buyer(Caress). We need to consider both cases separately:
 The case X = rare-consumer(Nioxin) does not appear in (2).
 The case X = loyal-buyer(Caress) does not appear in (2), either.
-So, from (1) and (2), we cannot necessarily deduce the Hypothesis. So the answer is invalid.
+So, from (1) and (2), we cannot necessarily deduce the Hypothesis. So the answer is invalid.

evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt

@@ -75,4 +75,4 @@ Length of side A: |A| = sqrt((41.00-37.00)^2 + (43.00-34.00)^2) = sqrt((4)^2 + (
 Length of side B: |B| = sqrt((37.00-41.00)^2 + (34.00-33.00)^2)) = sqrt((4)^2 + (1)^2) = sqrt(16 + 1) = sqrt(17).
 Length of side C: |C| = sqrt((41.00-45.00)^2 + (33.00-34.00)^2)) = sqrt((-4)^2 + (-1)^2) = sqrt(16 + 1) = sqrt(17).
 Length of side D: |D| = sqrt((45.00-41.00)^2 + (34.00-43.00)^2)) = sqrt((4)^2 + (-9)^2) = sqrt(16 + 81) = sqrt(97).
-Note that |A| = |D| and |B| = |C|. Furthermore, A and D are adjacent and B and C are adjacent. Thus, this polygon has two pairs of equal-length adjacent sides and is "kite". So the answer is (D).
+Note that |A| = |D| and |B| = |C|. Furthermore, A and D are adjacent and B and C are adjacent. Thus, this polygon has two pairs of equal-length adjacent sides and is "kite". So the answer is (D).

evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt

@@ -25,4 +25,4 @@ Options:
 A: Let's think step by step.
 When there is more than one adjective before a noun, the adjectives need to respect the following order before a noun: "[1. opinion] [2. size] [3. age] [4. shape] [5. color] [6. origin] [7. material] [8. purpose] noun".
 Option (A): "blue gold wonderful square shoe". (1) "blue" falls into the color category. (2) "gold" falls into the material category. (3) "wonderful" falls into the opinion category. (4) "square" falls into the shape category. The adjective order that Option (A) has is [5. color] [7. material] [1. opinion] [4. shape] (or, in numeric terms, 5 7 1 4). Because 5 < 7 < 1 < 4 is not correct, (A) does not have the correct ordering.
-Option (B): "wonderful square blue gold shoe". Option (B) has the following adjective order: [1. opinion] [4. shape] [5. color] [7. material] (or, in numeric terms, 1 4 5 7 ). Because 1 < 4 < 5 < 7 is correct, (B) has the correct ordering. So the answer is (B).
+Option (B): "wonderful square blue gold shoe". Option (B) has the following adjective order: [1. opinion] [4. shape] [5. color] [7. material] (or, in numeric terms, 1 4 5 7 ). Because 1 < 4 < 5 < 7 is correct, (B) has the correct ordering. So the answer is (B).

evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt

@@ -34,4 +34,4 @@ A: Let's think step by step.
 (2) The red book is the second from the left: "(left) ? white red gray ? (right)".
 (3) Combining (1) and (2) we get the following ordering: "(left) white red gray (right)".
 According to this ordering, the leftmost book is the white book.
-The white book is the leftmost. So the answer is (C).
+The white book is the leftmost. So the answer is (C).

evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt

@@ -34,4 +34,4 @@ A: Let's think step by step.
 (2) The red book is the second from the left: "(left) ? white red gray ? (right)".
 (3) Combining (1) and (2) we get the following ordering: "(left) white red gray (right)".
 According to this ordering, the leftmost book is the white book.
-The white book is the leftmost. So the answer is (C).
+The white book is the leftmost. So the answer is (C).

evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt

@@ -34,4 +34,4 @@ A: Let's think step by step.
 (2) The red book is the second from the left: "(left) ? white red gray ? (right)".
 (3) Combining (1) and (2) we get the following ordering: "(left) white red gray (right)".
 According to this ordering, the leftmost book is the white book.
-The white book is the leftmost. So the answer is (C).
+The white book is the leftmost. So the answer is (C).

evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt

@@ -39,4 +39,4 @@ A: Let's think step by step.
 - Total Recall (action, adventure, science-fiction; 2012)
 - Inside Out (animation, family, comedy; 2015)
 - Forrest Gump (comedy, drama, romance; 1994)
-These are all famous movies produced in the past few decades.Amongst all the options, the only movie similar to these ones seems to be Edge of Tomorrow (action, adventure, crime, mystery; 2014), as it is also a science-fiction movie and features Tom Cruise. So the answer is (D).
+These are all famous movies produced in the past few decades.Amongst all the options, the only movie similar to these ones seems to be Edge of Tomorrow (action, adventure, crime, mystery; 2014), as it is also a science-fiction movie and features Tom Cruise. So the answer is (D).

evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt

@@ -22,4 +22,4 @@ Let’s recall that the order of operations in mathematics is as follows: (1) Pa
 This equation can be written as "A - B", where A = (-3 + 5 * 8 * -4) and B = (9 - 8 * -7 + -9).
 Let's calculate A = (-3 + 5 * 8 * -4) = (-3 + (5 * 8) * -4) = (-3 + (40) * -4) = (-3 + (40 * -4)) = (-3 + -160) = -163.
 Let's calculate B = (9 - 8 * -7 + -9) = (9 - (8 * -7) + -9) = (9 - (-56) + -9) = ((9 - (-56)) + -9) = ((65) + -9)= (65 - 9) = 56.
-Then, the final equation is A - B = -163 - 56 = -219. So the answer is -219.
+Then, the final equation is A - B = -163 - 56 = -219. So the answer is -219.

evalscope/benchmarks/bbh/cot_prompts/navigate.txt

@@ -40,4 +40,4 @@ We start at the origin (0, 0), facing the positive y-axis.
 (2) Take 2 steps right: (0, 2), facing the positive y-axis.
 (3) Take 9 steps left: (0, -7), facing the positive y-axis.
 (4) Take 7 steps right: (0, 7), facing the positive y-axis.
-Since (0, 0) is (0, 0), we are indeed where we started. So the answer is Yes.
+Since (0, 0) is (0, 0), we are indeed where we started. So the answer is Yes.

evalscope/benchmarks/bbh/cot_prompts/object_counting.txt

@@ -34,4 +34,4 @@ We first identify the vegetables on the list and include their quantity in paren
 - carrots (2)
 - garlic (1)
 - yam (1)
-Now, let's add the numbers in parentheses: 1 + 1 + 1 + 1 + 2 + 1 + 1 = 8. So the answer is 8.
+Now, let's add the numbers in parentheses: 1 + 1 + 1 + 1 + 2 + 1 + 1 = 8. So the answer is 8.

evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt

@@ -38,4 +38,4 @@ A: Let's think step by step.
 This question focuses on the name. We know the following: The names of the penguin in the table are Louis, Bernard, Vincent, and Gwen.
 When we sort their names alphabetically, we get Bernard, Gwen, Louis, Vincent.
 The name of the second penguin sorted by alphabetical order is Gwen.
-The name of the second penguin sorted by alphabetic order is Gwen. So the answer is (D).
+The name of the second penguin sorted by alphabetic order is Gwen. So the answer is (D).

evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt

@@ -60,4 +60,4 @@ Options:
 A: Let's think step by step.
 According to this question, the objects are arranged in a row, from left to right, as follows: (1) a teal plate, (2) a burgundy keychain, (3) a yellow scrunchiephone charger, (4) an orange mug, (5) a pink notebook, (6) a grey cup.
 The teal plate is the first item, namely (1). There is no item to the left of the teal item.
-The number of non-orange items to the left of the teal item is zero. So the answer is (A).
+The number of non-orange items to the left of the teal item is zero. So the answer is (A).

evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt

@@ -41,4 +41,4 @@ The original name is "counting crows". This is the name of an American rock band
 (B) "counting cows": Here the word "crows" is changed to "cows", and this is indeed a playful and meaningful edit that ruins the original name of the band.
 (C) "courting crows": Here the word "counting" is changed to "courting", and "courting" is an actual word; however, "courting crows" does not sound as humorous as "counting cows".
 (D) "coutnting crows": Here the word "counting" is changed to "coutnting", but the word "coutnting" is not an actual word.
-Above the above, the only humorous edit is (B). So the answer is (B).
+Above the above, the only humorous edit is (B). So the answer is (B).

evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt

@@ -37,4 +37,4 @@ Options:
 (E) Dropped Content
 (F) Facts
 A: Let's think step by step.
-We solve this question by first translating the source sentence to English and then by comparing our translation with the provided translation. According to Google Translate, the correct translation of the source sentence from German to English is "Łeba is a small town and seaside resort in the Powiat Lęborski of the Polish Pomeranian Voivodeship." On the other hand, the provided translation is "Łeba is not a small town and seaside resort in the Powiat Léborski county of the Pomeranian Voivodeship of Poland." Note that the provided sentence says, "Łeba is not a small town ..." However, the translation should have been "Łeba is a small town ..." Because a negation is introduced at the beginning of the sentence and has fundamentally changed the meaning of the original source, the translation contains an error pertaining to Negation or Antonyms. So the answer is (C).
+We solve this question by first translating the source sentence to English and then by comparing our translation with the provided translation. According to Google Translate, the correct translation of the source sentence from German to English is "Łeba is a small town and seaside resort in the Powiat Lęborski of the Polish Pomeranian Voivodeship." On the other hand, the provided translation is "Łeba is not a small town and seaside resort in the Powiat Léborski county of the Pomeranian Voivodeship of Poland." Note that the provided sentence says, "Łeba is not a small town ..." However, the translation should have been "Łeba is a small town ..." Because a negation is introduced at the beginning of the sentence and has fundamentally changed the meaning of the original source, the translation contains an error pertaining to Negation or Antonyms. So the answer is (C).

evalscope/benchmarks/bbh/cot_prompts/snarks.txt

@@ -27,4 +27,4 @@ Options:
 A: Let's think step by step.
 If we look at (A), it likens the consistency in the league's punishments with that in politics. Because politics or political affairs are often not considered to be consistent or dependable, this sentence appears to be satirical.
 If we look at (B), it likens the consistency in the league's punishments with that in morality. Discussing the consistency of the league's punishments in the context of morality, ethics, or law makes sense and does not appear to make a satirical point about anything.
-Above the above, the sarcastic option is (A). So the answer is (A).
+Above the above, the sarcastic option is (A). So the answer is (A).

evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt

@@ -7,4 +7,4 @@ Q: Is the following sentence plausible? "Santi Cazorla scored a touchdown."
 A: Let's think step by step. Santi Cazorla is a soccer player. Touchdown is part of American football and rugby. So the answer is no.
 
 Q: Is the following sentence plausible? "DeMar DeRozan was called for the goal tend."
-A: Let's think step by step. DeMar DeRozan is an American basketball player. Goal tending is part of basketball. So the answer is yes.
+A: Let's think step by step. DeMar DeRozan is an American basketball player. Goal tending is part of basketball. So the answer is yes.

evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt

@@ -74,4 +74,4 @@ Wake-up time: 5am.
 2pm-3pm: playing tennis at the tennis court.
 3pm-4pm: free.
 The beach closure time: 4pm.
-The only time when Tiffany could have gone to the beach was 3pm to 4pm. So the answer is (D).
+The only time when Tiffany could have gone to the beach was 3pm to 4pm. So the answer is (D).

evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt

@@ -37,4 +37,4 @@ A: Let's think step by step.
 (1) Alice and Bob switch partners: Alice: Rodrigo, Bob: Lola, Claire: Patrick.
 (2) Claire and Bob switch partners: Alice: Rodrigo, Bob: Patrick, Claire: Lola.
 (3) Bob and Alice switch partners: Alice: Patrick, Bob: Rodrigo, Claire: Lola.
-At the end of the dance, Alice is dancing with Patrick. So the answer is (C).
+At the end of the dance, Alice is dancing with Patrick. So the answer is (C).

evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt

@@ -37,4 +37,4 @@ A: Let's think step by step.
 (1) Alice and Bob switch partners: Alice: Rodrigo, Bob: Lola, Claire: Patrick.
 (2) Claire and Bob switch partners: Alice: Rodrigo, Bob: Patrick, Claire: Lola.
 (3) Bob and Alice switch partners: Alice: Patrick, Bob: Rodrigo, Claire: Lola.
-At the end of the dance, Alice is dancing with Patrick. So the answer is (C).
+At the end of the dance, Alice is dancing with Patrick. So the answer is (C).

evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt

@@ -37,4 +37,4 @@ A: Let's think step by step.
 (1) Alice and Bob switch partners: Alice: Rodrigo, Bob: Lola, Claire: Patrick.
 (2) Claire and Bob switch partners: Alice: Rodrigo, Bob: Patrick, Claire: Lola.
 (3) Bob and Alice switch partners: Alice: Patrick, Bob: Rodrigo, Claire: Lola.
-At the end of the dance, Alice is dancing with Patrick. So the answer is (C).
+At the end of the dance, Alice is dancing with Patrick. So the answer is (C).

evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt

@@ -25,4 +25,4 @@ A: Let's think step by step.
 (3) Raymond says Michaela tells the truth. Since we know from (2) that Michaela lies, if Raymond says Michaela tells the truth, then Raymond lies.
 (4) Osvaldo says Raymond tells the truth. Since we know from (3) that Raymond lies, if Osvaldo says Raymond tells the truth, then Osvaldo lies.
 (5) Jamey says Osvaldo tells the truth. Since we know from (4) that Osvaldo lies, if Jamey says Osvaldo tells the truth, then Jamey lies.
-Now, the question asks: Does Jamey tell the truth? We know from (5) that Jamey lies. So the answer is No.
+Now, the question asks: Does Jamey tell the truth? We know from (5) that Jamey lies. So the answer is No.

evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt

@@ -14,4 +14,4 @@ The second letter: "ponderosa": "o" (15). "phone": "h" (8). We now have: (8) "ph
 Q: Sort the following words alphabetically: List: newt arson parthia seismography mugho aspect census
 A: Let's think step by step.
 The first letter: "newt": "n" (14). "arson": "a" (1). "parthia": "p" (16). "seismography": "s" (19). "mugho": "m" (13). "aspect": "a" (1). "census": "c" (3). We now have: (1) ["arson" ? "aspect"] < (3) "census" < (13) "mugho" < (14) "newt" < (16) "parthia" < (19) "seismography". Now let's sort this subpart ["arson" ? "aspect"] by looking at their second letters.
-The second letter: "arson": "r" (18). "aspect": "s" (19). We now have: (18) "arson" < (19) "aspect". Hence, we have ["arson" < "aspect"] < "census" < "mugho" < "newt" < "parthia" < "seismography". So the answer is arson aspect census mugho newt parthia seismography.
+The second letter: "arson": "r" (18). "aspect": "s" (19). We now have: (18) "arson" < (19) "aspect". Hence, we have ["arson" < "aspect"] < "census" < "mugho" < "newt" < "parthia" < "seismography". So the answer is arson aspect census mugho newt parthia seismography.
evalscope/benchmarks/benchmark.py

@@ -1,9 +1,10 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
 import os.path
+from modelscope.msdatasets import MsDataset
 from typing import Optional
 
-from evalscope.constants import
+from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, HubType
 
 
 class Benchmark(object):
|
|
|
20
21
|
split: str = None,
|
|
21
22
|
token: str = None,
|
|
22
23
|
hub: str = 'ModelScope',
|
|
23
|
-
work_dir: Optional[str] =
|
|
24
|
+
work_dir: Optional[str] = DEFAULT_DATASET_CACHE_DIR,
|
|
24
25
|
**kwargs):
|
|
25
26
|
"""
|
|
26
27
|
Load a dataset from ModelScope or HuggingFace.
|
|
@@ -38,21 +39,20 @@ class Benchmark(object):
|
|
|
38
39
|
Returns:
|
|
39
40
|
A dict.
|
|
40
41
|
"""
|
|
41
|
-
work_dir = os.path.join(work_dir, 'benchmarks', dataset_name.replace('/', '_'))
|
|
42
|
-
if hub == 'ModelScope':
|
|
43
|
-
from modelscope.msdatasets import MsDataset
|
|
44
|
-
dataset = MsDataset.load(dataset_name=dataset_name, subset_name=subset, split=split, token=token,
|
|
45
|
-
cache_dir=work_dir, **kwargs)
|
|
46
42
|
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
43
|
+
dataset = MsDataset.load(
|
|
44
|
+
dataset_name=dataset_name,
|
|
45
|
+
subset_name=subset,
|
|
46
|
+
split=split,
|
|
47
|
+
token=token,
|
|
48
|
+
cache_dir=work_dir,
|
|
49
|
+
hub=hub,
|
|
50
|
+
**kwargs)
|
|
51
|
+
|
|
52
|
+
dataset.dataset_name = dataset_name.split('/')[-1]
|
|
53
|
+
dataset.subset_name = subset
|
|
54
|
+
# dataset.split = split
|
|
55
|
+
return dataset
|
|
56
56
|
|
|
57
57
|
|
|
58
58
|
if __name__ == '__main__':
|
|
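After this refactor `Benchmark.load` no longer branches on the hub; it passes `hub` straight into a single `MsDataset.load` call and decorates the returned dataset with `dataset_name` and `subset_name`. A hypothetical call against the new signature (the dataset and subset ids are illustrative):

```python
from evalscope.benchmarks.benchmark import Benchmark

dataset = Benchmark.load(
    dataset_name='modelscope/ai2_arc',  # illustrative dataset id
    subset='ARC-Challenge',             # illustrative subset
    split='test',
    hub='ModelScope',                   # now forwarded to MsDataset.load(..., hub=hub)
)
print(dataset.dataset_name, dataset.subset_name)  # -> ai2_arc ARC-Challenge
```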
evalscope/benchmarks/ceval/__init__.py

@@ -1,5 +1,6 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-from evalscope.benchmarks.ceval.ceval_adapter import
+from evalscope.benchmarks.ceval.ceval_adapter import DATASET_ID, SUBJECT_MAPPING, SUBSET_LIST
+from evalscope.benchmarks.ceval.ceval_adapter import CEVALAdapter
 from evalscope.benchmarks.ceval.ceval_adapter import CEVALAdapter as DataAdapterClass
-from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass
+from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass  # noqa