evalscope 0.7.1__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of evalscope has been flagged as potentially problematic.
- evalscope/__init__.py +1 -1
- evalscope/arguments.py +73 -0
- evalscope/backend/base.py +5 -1
- evalscope/backend/opencompass/api_meta_template.py +8 -14
- evalscope/backend/opencompass/backend_manager.py +24 -15
- evalscope/backend/opencompass/tasks/eval_api.py +1 -6
- evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
- evalscope/backend/rag_eval/__init__.py +3 -3
- evalscope/backend/rag_eval/backend_manager.py +21 -25
- evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
- evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
- evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
- evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
- evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
- evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
- evalscope/backend/rag_eval/cmteb/base.py +22 -23
- evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
- evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
- evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
- evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
- evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
- evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
- evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
- evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
- evalscope/backend/rag_eval/ragas/__init__.py +2 -2
- evalscope/backend/rag_eval/ragas/arguments.py +3 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +10 -15
- evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
- evalscope/backend/rag_eval/utils/clip.py +46 -50
- evalscope/backend/rag_eval/utils/embedding.py +12 -11
- evalscope/backend/rag_eval/utils/llm.py +8 -6
- evalscope/backend/rag_eval/utils/tools.py +12 -11
- evalscope/backend/vlm_eval_kit/__init__.py +1 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
- evalscope/benchmarks/arc/__init__.py +3 -2
- evalscope/benchmarks/arc/ai2_arc.py +19 -16
- evalscope/benchmarks/arc/arc_adapter.py +32 -24
- evalscope/benchmarks/bbh/__init__.py +1 -2
- evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
- evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
- evalscope/benchmarks/benchmark.py +16 -16
- evalscope/benchmarks/ceval/__init__.py +3 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
- evalscope/benchmarks/ceval/ceval_exam.py +18 -31
- evalscope/benchmarks/cmmlu/__init__.py +3 -2
- evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
- evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
- evalscope/benchmarks/competition_math/__init__.py +3 -2
- evalscope/benchmarks/competition_math/competition_math.py +7 -16
- evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
- evalscope/benchmarks/data_adapter.py +24 -24
- evalscope/benchmarks/general_qa/__init__.py +3 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +34 -38
- evalscope/benchmarks/gsm8k/__init__.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +26 -24
- evalscope/benchmarks/hellaswag/__init__.py +3 -2
- evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +27 -23
- evalscope/benchmarks/humaneval/__init__.py +1 -1
- evalscope/benchmarks/humaneval/humaneval.py +15 -18
- evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -1
- evalscope/benchmarks/mmlu/__init__.py +3 -2
- evalscope/benchmarks/mmlu/mmlu.py +15 -29
- evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
- evalscope/benchmarks/race/__init__.py +3 -2
- evalscope/benchmarks/race/race.py +21 -35
- evalscope/benchmarks/race/race_adapter.py +32 -29
- evalscope/benchmarks/race/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/__init__.py +3 -2
- evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
- evalscope/benchmarks/truthful_qa/__init__.py +3 -2
- evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
- evalscope/cli/cli.py +6 -5
- evalscope/cli/start_eval.py +31 -0
- evalscope/cli/start_perf.py +0 -3
- evalscope/cli/start_server.py +27 -41
- evalscope/config.py +119 -95
- evalscope/constants.py +61 -29
- evalscope/evaluator/__init__.py +1 -0
- evalscope/evaluator/evaluator.py +96 -377
- evalscope/evaluator/humaneval_evaluator.py +158 -0
- evalscope/evaluator/rating_eval.py +12 -33
- evalscope/evaluator/reviewer/auto_reviewer.py +47 -76
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
- evalscope/metrics/code_metric.py +3 -9
- evalscope/metrics/math_accuracy.py +3 -6
- evalscope/metrics/metrics.py +21 -21
- evalscope/metrics/rouge_metric.py +11 -25
- evalscope/models/__init__.py +1 -2
- evalscope/models/api/openai_api.py +40 -29
- evalscope/models/custom/__init__.py +0 -1
- evalscope/models/custom/custom_model.py +3 -3
- evalscope/models/dummy_chat_model.py +7 -8
- evalscope/models/model_adapter.py +89 -156
- evalscope/models/openai_model.py +20 -20
- evalscope/perf/arguments.py +15 -3
- evalscope/perf/benchmark.py +7 -9
- evalscope/perf/http_client.py +3 -8
- evalscope/perf/main.py +10 -0
- evalscope/perf/plugin/api/custom_api.py +1 -2
- evalscope/perf/plugin/api/dashscope_api.py +1 -2
- evalscope/perf/plugin/api/openai_api.py +3 -4
- evalscope/perf/plugin/datasets/base.py +1 -2
- evalscope/perf/plugin/datasets/flickr8k.py +1 -2
- evalscope/perf/plugin/datasets/longalpaca.py +1 -2
- evalscope/perf/plugin/datasets/openqa.py +1 -2
- evalscope/perf/utils/analysis_result.py +1 -2
- evalscope/perf/utils/benchmark_util.py +1 -2
- evalscope/perf/utils/db_util.py +11 -8
- evalscope/perf/utils/local_server.py +19 -13
- evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
- evalscope/registry/tasks/arc.yaml +2 -3
- evalscope/registry/tasks/bbh.yaml +3 -4
- evalscope/registry/tasks/bbh_mini.yaml +3 -4
- evalscope/registry/tasks/ceval.yaml +3 -3
- evalscope/registry/tasks/ceval_mini.yaml +3 -4
- evalscope/registry/tasks/cmmlu.yaml +3 -3
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
- evalscope/registry/tasks/general_qa.yaml +1 -1
- evalscope/registry/tasks/gsm8k.yaml +2 -2
- evalscope/registry/tasks/mmlu.yaml +3 -3
- evalscope/registry/tasks/mmlu_mini.yaml +3 -3
- evalscope/run.py +184 -375
- evalscope/run_arena.py +20 -25
- evalscope/summarizer.py +16 -17
- evalscope/third_party/longbench_write/README.md +99 -42
- evalscope/third_party/longbench_write/default_task.json +1 -1
- evalscope/third_party/longbench_write/default_task.yaml +8 -7
- evalscope/third_party/longbench_write/eval.py +29 -28
- evalscope/third_party/longbench_write/infer.py +16 -104
- evalscope/third_party/longbench_write/longbench_write.py +5 -5
- evalscope/third_party/longbench_write/resources/judge.txt +1 -1
- evalscope/third_party/longbench_write/tools/data_etl.py +4 -5
- evalscope/third_party/longbench_write/utils.py +0 -1
- evalscope/third_party/toolbench_static/eval.py +14 -15
- evalscope/third_party/toolbench_static/infer.py +48 -69
- evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
- evalscope/third_party/toolbench_static/requirements.txt +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +3 -3
- evalscope/tools/combine_reports.py +25 -30
- evalscope/tools/rewrite_eval_results.py +14 -46
- evalscope/utils/__init__.py +0 -1
- evalscope/utils/arena_utils.py +18 -48
- evalscope/{perf/utils → utils}/chat_service.py +3 -4
- evalscope/utils/completion_parsers.py +3 -8
- evalscope/utils/logger.py +9 -7
- evalscope/utils/model_utils.py +11 -0
- evalscope/utils/utils.py +12 -138
- evalscope/version.py +2 -2
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/METADATA +125 -120
- evalscope-0.8.0.dist-info/RECORD +285 -0
- tests/cli/test_run.py +54 -15
- tests/perf/test_perf.py +4 -0
- tests/rag/test_clip_benchmark.py +38 -38
- tests/rag/test_mteb.py +3 -2
- tests/rag/test_ragas.py +5 -5
- tests/swift/test_run_swift_eval.py +2 -3
- tests/swift/test_run_swift_vlm_eval.py +2 -3
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
- evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
- evalscope/cache.py +0 -98
- evalscope/models/template.py +0 -1446
- evalscope/run_ms.py +0 -140
- evalscope/utils/task_cfg_parser.py +0 -10
- evalscope/utils/task_utils.py +0 -22
- evalscope-0.7.1.dist-info/RECORD +0 -286
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/LICENSE +0 -0
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/WHEEL +0 -0
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/top_level.txt +0 -0
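
A minimal sketch (not part of evalscope) of how a wheel-to-wheel diff like the one below can be reproduced locally; the download directories and the use of git diff --no-index are illustrative choices, not something this page prescribes:

    # Download and unpack both published wheels, then diff the unpacked trees.
    import pathlib
    import subprocess
    import zipfile

    for version in ('0.7.1', '0.8.0'):
        subprocess.run(['pip', 'download', f'evalscope=={version}', '--no-deps',
                        '-d', f'wheels/{version}'], check=True)
        wheel = next(pathlib.Path(f'wheels/{version}').glob('evalscope-*.whl'))
        zipfile.ZipFile(wheel).extractall(f'unpacked/{version}')

    # Exit code 1 only means the trees differ, so no check=True here.
    subprocess.run(['git', 'diff', '--no-index', 'unpacked/0.7.1', 'unpacked/0.8.0'])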

evalscope/benchmarks/data_adapter.py

@@ -1,11 +1,11 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import os.path
+import random
 from abc import ABC, abstractmethod
 from typing import Any, Optional
-import random
 
 from evalscope.benchmarks import Benchmark
-from evalscope.constants import
+from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, AnswerKeys, HubType
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -29,7 +29,8 @@ class DataAdapter(ABC):
             train_split: str, usually for few-shot examples. e.g. 'train'
             eval_split: str, the target eval split name. e.g. 'test'
             prompt_template: str, the prompt template for the benchmark,
-                e.g. for ARC, it is `The following are multiple choice questions, please output correct answer in
+                e.g. for ARC, it is `The following are multiple choice questions, please output correct answer in
+                the form of A or B or C or D, do not output explanation:`
         """
         self.subset_list = subset_list
         self.metric_list = metric_list
@@ -42,8 +43,8 @@ class DataAdapter(ABC):
     def load(self,
              dataset_name_or_path: str,
             subset_list: list = None,
-             work_dir: Optional[str] =
-             datasets_hub: str =
+             work_dir: Optional[str] = DEFAULT_DATASET_CACHE_DIR,
+             datasets_hub: str = HubType.MODELSCOPE,
             **kwargs) -> dict:
         """
         Load the dataset. Remote and local datasets are supported.
@@ -54,12 +55,11 @@ class DataAdapter(ABC):
 
         """
         dataset_name_or_path = os.path.expanduser(dataset_name_or_path)
-        if datasets_hub == 'Local':
-            # Try to load dataset from local disk
-            if not os.path.exists(dataset_name_or_path):
-                raise FileNotFoundError(f'Dataset path not found: {dataset_name_or_path}')
 
-
+        # Try to load dataset from local disk
+        if os.path.exists(dataset_name_or_path):
+            logger.info(
+                f'Loading dataset from local disk: > dataset_name: {dataset_name_or_path} > work_dir: {work_dir}')
            data_dict = self.load_from_disk(dataset_name_or_path, subset_list, work_dir, **kwargs)
            if len(data_dict) == 0 or len(next(iter(data_dict.values()))) == 0:
                raise ValueError(f'Local dataset is empty: {dataset_name_or_path}')
@@ -76,12 +76,13 @@
             data_dict[sub_name] = {}
             # e.g. train: few-shot, test: target dataset to evaluate
             for split in split_list:
-                dataset = Benchmark.load(
-
-
-
-
-
+                dataset = Benchmark.load(
+                    dataset_name=dataset_name_or_path,
+                    subset=sub_name,
+                    split=split,
+                    hub=datasets_hub,
+                    work_dir=work_dir,
+                    **kwargs)
 
                 data_dict[sub_name].update({split: dataset})
 
@@ -112,19 +113,18 @@ class DataAdapter(ABC):
         if self.few_shot_num and self.few_shot_num < 0:
             raise ValueError(f'Invalid shot_num: {self.few_shot_num} for few-shot evaluation.')
 
-        logger.info(f'
-                    f'>few_shot_num: {self.few_shot_num}, '
-                    f'>few_shot_split: {self.train_split}, '
-                    f'>target_eval_split: {self.eval_split}')
+        logger.info(f'Use default settings: '
+                    f'> few_shot_num: {self.few_shot_num}, '
+                    f'> few_shot_split: {self.train_split}, '
+                    f'> target_eval_split: {self.eval_split}')
 
         for sub_name, sub_data_dict in data_dict.items():
             few_shot_data = []
             if self.few_shot_num and self.few_shot_num > 0:
                 few_shot_random: bool = self.config_kwargs.get('few_shot_random', True)
-                few_shot_data = self.get_fewshot_examples(
-
-
-                    few_shot_random=few_shot_random)
+                few_shot_data = self.get_fewshot_examples([item for item in sub_data_dict[self.train_split]],
+                                                          self.few_shot_num,
+                                                          few_shot_random=few_shot_random)
 
             res_dict[sub_name] = []
             for sample_d in sub_data_dict[self.eval_split]:
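
A hedged sketch of the reworked dataset-loading path above: the keyword names (dataset_name, subset, split, hub, work_dir) and the constants come straight from the hunks, while the concrete dataset id and subset name are illustrative assumptions.

    from evalscope.benchmarks import Benchmark
    from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, HubType

    # Resolve one split of one subset, mirroring the new Benchmark.load() call above.
    dataset = Benchmark.load(
        dataset_name='modelscope/gsm8k',     # assumed dataset id, not taken from this diff
        subset='main',                       # assumed subset name
        split='test',
        hub=HubType.MODELSCOPE,              # new default hub constant
        work_dir=DEFAULT_DATASET_CACHE_DIR)  # new default cache directory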

evalscope/benchmarks/general_qa/__init__.py

@@ -1,5 +1,6 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-from evalscope.benchmarks.general_qa.general_qa_adapter import DATASET_ID, SUBSET_LIST
+from evalscope.benchmarks.general_qa.general_qa_adapter import DATASET_ID, SUBSET_LIST
+from evalscope.benchmarks.general_qa.general_qa_adapter import GeneralQAAdapter
 from evalscope.benchmarks.general_qa.general_qa_adapter import GeneralQAAdapter as DataAdapterClass
-from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass
+from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass

evalscope/benchmarks/general_qa/general_qa_adapter.py

@@ -1,15 +1,15 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import glob
+import json
 import os.path
+from collections import defaultdict
+from typing import Any, Optional
 
 from evalscope.benchmarks.data_adapter import DataAdapter
 from evalscope.metrics.metrics import bleu_ngram_one_sample, weighted_mean
 from evalscope.metrics.rouge_metric import compute_rouge_score_one_sample_zh
 from evalscope.utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
-from typing import Any, Optional
-from collections import defaultdict
-import json
 
 logger = get_logger()
 
@@ -31,17 +31,11 @@ class GeneralQAAdapter(DataAdapter):
 
         if metric_list is None:
             metric_list = [{'name': 'WeightedAverageBLEU', 'object': weighted_mean}]
-
-        super().__init__(
-
-
-
-            **kwargs)
-
-    def load(self,
-             dataset_name_or_path: str,
-             subset_list: list = None,
-             **kwargs) -> dict:
+
+        super().__init__(
+            subset_list=subset_list, metric_list=metric_list, train_split=train_split, eval_split=eval_split, **kwargs)
+
+    def load(self, dataset_name_or_path: str, subset_list: list = None, **kwargs) -> dict:
 
         data_file_list = glob.glob(os.path.join(dataset_name_or_path, '*.jsonl'))
         data_list = []
@@ -50,12 +44,12 @@ class GeneralQAAdapter(DataAdapter):
             for file_path in data_file_list:
                 data_list.extend(jsonl_to_list(file_path))
         except Exception as e:
-            raise ValueError(f
+            raise ValueError(f'Failed to load data from {dataset_name_or_path}, got error: {e}')
 
         data_dict = {'default': {'test': data_list}}
 
         return data_dict
-
+
     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
         """
         Args:
@@ -68,16 +62,17 @@ class GeneralQAAdapter(DataAdapter):
 
         """
         # prompt = f"'<|im_start|>user\n{input_d['input']}<|im_end|>\n<|im_start|>assistant\n'"
-        history = input_d.get('history', [])
+        history = input_d.get('history', [])  # history: [['q1', 'a1'], ['q2', 'a2'], ...]
         if len(history) > 0:
-            logger.warning(
+            logger.warning('The history is not included in the prompt for GeneralQA. \
+                To be supported in the future.')
 
         prompt = input_d.get('question', '') or input_d.get('query', '')
 
         # if len(history) > 0:
         #     prompt = '\n'.join(history) + '\n' + prompt
         return {'data': [prompt]}
-
+
     def get_gold_answer(self, input_d: dict) -> str:
         """
         Args:
@@ -88,7 +83,7 @@ class GeneralQAAdapter(DataAdapter):
 
         """
         return input_d.get('answer', '') or input_d.get('response', '')
-
+
     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
         """
         Args:
@@ -99,7 +94,7 @@ class GeneralQAAdapter(DataAdapter):
 
         """
         return result
-
+
     def match(self, gold: str, pred: str) -> float:
         """
         Args:
@@ -110,7 +105,6 @@ class GeneralQAAdapter(DataAdapter):
             bleu_score: float
 
         """
-        item = [(gold, pred)]
         res = dict()
         rouge_dict = compute_rouge_score_one_sample_zh([pred], [gold])
         bleu_dict = bleu_ngram_one_sample(pred, gold)
@@ -118,7 +112,7 @@ class GeneralQAAdapter(DataAdapter):
         res.update(bleu_dict)
         # return bleu(item)
         return res
-
+
     def compute_metric(self, review_res_list: list) -> float:
         """
         compute weighted mean of the bleu score of all samples
@@ -132,13 +126,13 @@ class GeneralQAAdapter(DataAdapter):
         """
         items = defaultdict(list)
         for scores in review_res_list:
-            for k,v in scores.items():
+            for k, v in scores.items():
                 items[k].append((v, 1.0))
         # items = [(score, 1.0) for score in review_res_list]
-        res = {k: weighted_mean(v) for k,v in items.items()}
+        res = {k: weighted_mean(v) for k, v in items.items()}
         # return weighted_mean(items)
         return res
-
+
     def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
         """
         Args:
@@ -167,20 +161,22 @@ class GeneralQAAdapter(DataAdapter):
         """
         total_num: int = sum([num for _, num in subset_score_map.values()])
         # weighted_avg_bleu: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-        cate_avg_list = [{
+        cate_avg_list = [{
+            'name': subset_name,
+            'score': score_dict
+        } for subset_name, (score_dict, _) in subset_score_map.items()]
         total_avg_list = defaultdict(float)
         for score_dict, num in subset_score_map.values():
             for metric, score in score_dict.items():
                 total_avg_list[metric] += score * num / total_num
 
-        category_d = dict(name=
-
-
-
-
-
-
-
-
-        return res_map
+        category_d = dict(name='DEFAULT', score=total_avg_list, subset=cate_avg_list)
+
+        res_map = dict(
+            name=report_name or 'general_qa',
+            metric=self.metric_list[0]['name'],
+            score=total_avg_list,
+            category=[category_d],
+            total_num=total_num)
+
+        return res_map
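
For reference, an illustrative input record for the GeneralQAAdapter above, inferred from the keys it reads (question/query, answer/response, optional history); the field values are made up.

    sample = {
        'history': [['q1', 'a1'], ['q2', 'a2']],  # read but not yet included in the prompt
        'question': 'What is the capital of France?',
        'answer': 'Paris',
    }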

evalscope/benchmarks/gsm8k/__init__.py

@@ -2,4 +2,4 @@
 
 from evalscope.benchmarks.gsm8k.gsm8k_adapter import DATASET_ID, SUBSET_LIST
 from evalscope.benchmarks.gsm8k.gsm8k_adapter import GSM8KAdapter as DataAdapterClass
-from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass
+from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass  # noqa

evalscope/benchmarks/gsm8k/gsm8k.py

@@ -13,15 +13,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # flake8: noqa
-
 """Grade School Math 8k dataset."""
 
+import datasets
 import json
 import textwrap
 
-import datasets
-
-
 _CITATION = """\
 @misc{cobbe2021training,
       title={Training Verifiers to Solve Math Word Problems},
@@ -76,8 +73,7 @@ class Gsm8k(datasets.GeneratorBasedBuilder):
                 using basic arithmetic operations (+ - / *) to reach the final
                 answer. A bright middle school student should be able to solve
                 every problem.
-                """,
-            ),
+                """, ),
             urls={
                 'train': TRAIN_URL,
                 'test': TEST_URL,
@@ -86,12 +82,10 @@ class Gsm8k(datasets.GeneratorBasedBuilder):
     ]
 
     def _info(self):
-        features = datasets.Features(
-
-
-
-            }
-        )
+        features = datasets.Features({
+            'question': datasets.Value('string'),
+            'answer': datasets.Value('string'),
+        })
         return datasets.DatasetInfo(
             description=_DESCRIPTION,
             features=features,

evalscope/benchmarks/gsm8k/gsm8k_adapter.py

@@ -1,12 +1,14 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 # Copyright (c) EleutherAI, Inc. and its affiliates.
+import math
 import os
 import re
-
+
 from evalscope.benchmarks import DataAdapter
 from evalscope.metrics.metrics import exact_match, weighted_mean
-from evalscope.utils import
+from evalscope.utils import jsonl_to_list, normalize_score
 from evalscope.utils.logger import get_logger
+
 # flake8: noqa
 
 logger = get_logger()
@@ -54,13 +56,14 @@ class GSM8KAdapter(DataAdapter):
                            f'Use 4-shot by default.')
             few_shot_num = 4
 
-        super().__init__(
-
-
-
-
-
-
+        super().__init__(
+            subset_list=subset_list,
+            metric_list=metric_list,
+            few_shot_num=few_shot_num,
+            train_split=train_split,
+            eval_split=eval_split,
+            prompt_template=prompt_template,
+            **kwargs)
 
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
@@ -182,17 +185,19 @@ class GSM8KAdapter(DataAdapter):
         total_num: int = sum([num for _, num in subset_score_map.values()])
         weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
         weighted_avg_acc = normalize_score(score=weighted_avg_acc)
-        cate_avg_list = [{
+        cate_avg_list = [{
+            'name': subset_name,
+            'score': normalize_score(score=score)
+        } for subset_name, (score, _) in subset_score_map.items()]
 
-        category_d = dict(name='DEFAULT',
-                          score=weighted_avg_acc,
-                          subset=cate_avg_list)
+        category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
 
-        res_map = dict(
-
-
-
-
+        res_map = dict(
+            name=report_name or 'gsm8k',
+            metric=self.metric_list[0]['name'],
+            score=weighted_avg_acc,
+            category=[category_d],
+            total_num=total_num)
 
         return res_map
 
@@ -209,8 +214,7 @@ class GSM8KAdapter(DataAdapter):
             "When Bella buys 2/5 times more marbles, she'll have increased the number of marbles by 2/5*60 = 24\nThe total number of marbles she'll have is 60+24 = 84\nIf Bella currently has 60 marbles, and she has two times as many marbles as frisbees, she has 60/2 = 30 frisbees.\nIf Bella buys 2/5 times more frisbees, she'll have 2/5*30 = 12 more frisbees.\nThe total number of frisbees she'll have will increase to 30+12 = 42\nBella also has 20 more frisbees than deck cards, meaning she has 30-20 = 10 deck cards\nIf she buys 2/5 times more deck cards, she'll have 2/5*10 = 4 more deck cards.\nThe total number of deck cards she'll have is 10+4 = 14\nTogether, Bella will have a total of 14+42+84 = 140 items\nThe answer is 140\n\n"
             "Question: A group of 4 fruit baskets contains 9 apples, 15 oranges, and 14 bananas in the first three baskets and 2 less of each fruit in the fourth basket. How many fruits are there?\nLet's think step by step\n"
             'For the first three baskets, the number of apples and oranges in one basket is 9+15=24\nIn total, together with bananas, the number of fruits in one basket is 24+14=38 for the first three baskets.\nSince there are three baskets each having 38 fruits, there are 3*38=114 fruits in the first three baskets.\nThe number of apples in the fourth basket is 9-2=7\nThere are also 15-2=13 oranges in the fourth basket\nThe combined number of oranges and apples in the fourth basket is 13+7=20\nThe fourth basket also contains 14-2=12 bananas.\nIn total, the fourth basket has 20+12=32 fruits.\nThe four baskets together have 32+114=146 fruits.\nThe answer is 146\n\n'
-            f"Question: {input_d['question']}\nLet's think step by step\nAnswer:"
-        )
+            f"Question: {input_d['question']}\nLet's think step by step\nAnswer:")
         # context = input_d['question']
         # fewshot_prompts = ['Question: ' + item_d['question'] + '\nAnswer: ' + item_d['answer'] for item_d in few_shot_list]
         # fewshot_prompts = fewshot_prompts + ['Question: ' + context + '\nAnswer:']
@@ -222,9 +226,7 @@ class GSM8KAdapter(DataAdapter):
 
     @staticmethod
     def extract_answer(s: str) -> str:
-        _PAT_LAST_DIGIT = re.compile(
-            r'([+-])?(?=([0-9]|\.[0-9]))(0|([1-9](\d{0,2}(,\d{3})*)|\d*))?(\.\d*)?(?=\D|$)'
-        )
+        _PAT_LAST_DIGIT = re.compile(r'([+-])?(?=([0-9]|\.[0-9]))(0|([1-9](\d{0,2}(,\d{3})*)|\d*))?(\.\d*)?(?=\D|$)')
         match = list(_PAT_LAST_DIGIT.finditer(s))
         if match:
             last_digit = match[-1].group().replace(',', '').replace('+', '').strip().strip('.')
@@ -233,4 +235,4 @@ class GSM8KAdapter(DataAdapter):
             last_digit = None
             print(f'No digits found in {s!r}', flush=True)
 
-        return last_digit
+        return last_digit
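
A quick, illustrative check of the extract_answer() helper shown above; the completion string is made up, and the expected output follows from the last-number regex.

    from evalscope.benchmarks.gsm8k.gsm8k_adapter import GSM8KAdapter

    completion = 'Together, Bella will have a total of 14+42+84 = 140 items\nThe answer is 140'
    print(GSM8KAdapter.extract_answer(completion))  # expected: '140'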

evalscope/benchmarks/hellaswag/__init__.py

@@ -1,5 +1,6 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-from evalscope.benchmarks.hellaswag.hellaswag_adapter import
+from evalscope.benchmarks.hellaswag.hellaswag_adapter import DATASET_ID, SUBSET_LIST
+from evalscope.benchmarks.hellaswag.hellaswag_adapter import HellaSwagAdapter
 from evalscope.benchmarks.hellaswag.hellaswag_adapter import HellaSwagAdapter as DataAdapterClass
-from evalscope.models.model_adapter import ContinuationLogitsModelAdapter as ModelAdapterClass
+from evalscope.models.model_adapter import ContinuationLogitsModelAdapter as ModelAdapterClass  # noqa

evalscope/benchmarks/hellaswag/hellaswag.py

@@ -1,20 +1,18 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
-
 """HellaSwag: Can a Machine Really Finish Your Sentence? is a new dataset for commonsense NLI.
 A paper was published at ACL2019.
 """
-
 """DO NOT EDIT."""
 
-import json
 import datasets
+import json
+
 # flake8: noqa
 
 # HomePage: https://rowanzellers.com/hellaswag/
 # GitHub: https://github.com/rowanz/hellaswag
 
-
 _CITATION = """\
 @inproceedings{zellers2019hellaswag,
     title={HellaSwag: Can a Machine Really Finish Your Sentence?},
@@ -47,21 +45,19 @@ class Hellaswag(datasets.GeneratorBasedBuilder):
             # This is the description that will appear on the datasets page.
             description=_DESCRIPTION,
             # datasets.features.FeatureConnectors
-            features=datasets.Features(
-
-
-
-
-
-
-
-
-
-
-
-                }
-            ),
+            features=datasets.Features({
+                # These are the features of your dataset like images, labels ...
+                'ind': datasets.Value('int32'),
+                'activity_label': datasets.Value('string'),
+                'ctx_a': datasets.Value('string'),
+                'ctx_b': datasets.Value('string'),
+                'ctx': datasets.Value('string'),
+                'endings': datasets.features.Sequence(datasets.Value('string')),
+                'source_id': datasets.Value('string'),
+                'split': datasets.Value('string'),
+                'split_type': datasets.Value('string'),
+                'label': datasets.Value('string'),
+            }),
             # If there's a common (input, target) tuple from the features,
             # specify them here. They'll be used if as_supervised=True in
             # builder.as_dataset.

evalscope/benchmarks/hellaswag/hellaswag_adapter.py

@@ -1,18 +1,17 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+import numpy as np
 import os
 import re
-import numpy as np
 
 from evalscope.benchmarks.data_adapter import DataAdapter
 from evalscope.metrics.metrics import exact_match, weighted_mean
-from evalscope.utils import
+from evalscope.utils import jsonl_to_list, normalize_score
 from evalscope.utils.logger import get_logger
 
 # flake8: noqa
 
 logger = get_logger()
 
-
 DATASET_ID = 'modelscope/hellaswag'
 SUBSET_LIST = ['default']
 
@@ -44,12 +43,13 @@ class HellaSwagAdapter(DataAdapter):
             logger.warning(f'few_shot_num should be 0 for HellaSwag, but got {few_shot_num}. Use 0-shot by default.')
             few_shot_num = 0
 
-        super().__init__(
-
-
-
-
-
+        super().__init__(
+            subset_list=subset_list,
+            metric_list=metric_list,
+            few_shot_num=few_shot_num,
+            train_split=train_split,
+            eval_split=eval_split,
+            **kwargs)
 
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
@@ -91,7 +91,9 @@ class HellaSwagAdapter(DataAdapter):
 
         endings: list = [self._preprocess(ending) for ending in input_d['endings']]
 
-        few_shot_prompts = [
+        few_shot_prompts = [
+            self._generate_prompt(input_d=sample, endings=endings, include_answer=True) for sample in few_shot_list
+        ]
         context: str = '\n'.join(few_shot_prompts) + '\n'
         context += self._generate_prompt(input_d=input_d, endings=endings, include_answer=False)
 
@@ -124,9 +126,9 @@ class HellaSwagAdapter(DataAdapter):
 
             return str(best_choice_idx)
         elif eval_type == 'service':
-            return result
+            return result  # TODO: to be supported !
         elif eval_type == 'custom':
-            return result
+            return result  # TODO: to be supported !
         else:
             raise ValueError(f'Invalid eval_type: {eval_type}')
 
@@ -177,17 +179,19 @@ class HellaSwagAdapter(DataAdapter):
         total_num: int = sum([num for _, num in subset_score_map.values()])
         weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
         weighted_avg_acc = normalize_score(score=weighted_avg_acc)
-        cate_avg_list = [{
-
-
-
-
-
-
-
-
-
-
+        cate_avg_list = [{
+            'name': subset_name,
+            'score': normalize_score(score=score)
+        } for subset_name, (score, _) in subset_score_map.items()]
+
+        category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
+
+        res_map = dict(
+            name=report_name or 'hellaswag',
+            metric=self.metric_list[0]['name'],
+            score=weighted_avg_acc,
+            category=[category_d],
+            total_num=total_num)
 
         return res_map
 
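
The gen_report() structure built above, with illustrative values; the metric name and the counts are assumptions, while the name/metric/score/category/total_num keys and the 'DEFAULT' category come from the hunk.

    report = {
        'name': 'hellaswag',
        'metric': 'WeightedAverageAccuracy',  # assumed metric name
        'score': 0.7512,
        'category': [{
            'name': 'DEFAULT',
            'score': 0.7512,
            'subset': [{'name': 'default', 'score': 0.7512}],
        }],
        'total_num': 10042,                   # assumed sample count
    }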

evalscope/benchmarks/humaneval/__init__.py

@@ -2,4 +2,4 @@
 
 from evalscope.benchmarks.humaneval.humaneval_adapter import DATASET_ID, SUBSET_LIST
 from evalscope.benchmarks.humaneval.humaneval_adapter import HumanevalAdapter as DataAdapterClass
-from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass
+from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass  # noqa

evalscope/benchmarks/humaneval/humaneval.py

@@ -1,6 +1,7 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import json
 import datasets
+import json
+
 # flake8: noqa
 
 # NOTE: AUTOGENERATED, DO NOT CHANGE.
@@ -41,15 +42,13 @@ class OpenaiHumaneval(datasets.GeneratorBasedBuilder):
     ]
 
     def _info(self):
-        features = datasets.Features(
-
-
-
-
-
-
-            }
-        )
+        features = datasets.Features({
+            'task_id': datasets.Value('string'),
+            'prompt': datasets.Value('string'),
+            'canonical_solution': datasets.Value('string'),
+            'test': datasets.Value('string'),
+            'entry_point': datasets.Value('string'),
+        })
 
         return datasets.DatasetInfo(
             description=_DESCRIPTION,
@@ -63,14 +62,12 @@ class OpenaiHumaneval(datasets.GeneratorBasedBuilder):
     def _split_generators(self, dl_manager):
         """Returns SplitGenerators."""
         data_dir = dl_manager.download_and_extract(_URL)
-        return [
-            datasets.
-
-
-
-
-            )
-        ]
+        return [datasets.SplitGenerator(
+            name=datasets.Split.TEST,
+            gen_kwargs={
+                'filepath': data_dir,
+            },
+        )]
 
     def _generate_examples(self, filepath):
         """Yields examples."""