evalscope 0.7.1__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of evalscope has been flagged as potentially problematic.
- evalscope/__init__.py +1 -1
- evalscope/arguments.py +73 -0
- evalscope/backend/base.py +5 -1
- evalscope/backend/opencompass/api_meta_template.py +8 -14
- evalscope/backend/opencompass/backend_manager.py +24 -15
- evalscope/backend/opencompass/tasks/eval_api.py +1 -6
- evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
- evalscope/backend/rag_eval/__init__.py +3 -3
- evalscope/backend/rag_eval/backend_manager.py +21 -25
- evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
- evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
- evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
- evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
- evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
- evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
- evalscope/backend/rag_eval/cmteb/base.py +22 -23
- evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
- evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
- evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
- evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
- evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
- evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
- evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
- evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
- evalscope/backend/rag_eval/ragas/__init__.py +2 -2
- evalscope/backend/rag_eval/ragas/arguments.py +3 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +10 -15
- evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
- evalscope/backend/rag_eval/utils/clip.py +46 -50
- evalscope/backend/rag_eval/utils/embedding.py +12 -11
- evalscope/backend/rag_eval/utils/llm.py +8 -6
- evalscope/backend/rag_eval/utils/tools.py +12 -11
- evalscope/backend/vlm_eval_kit/__init__.py +1 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
- evalscope/benchmarks/arc/__init__.py +3 -2
- evalscope/benchmarks/arc/ai2_arc.py +19 -16
- evalscope/benchmarks/arc/arc_adapter.py +32 -24
- evalscope/benchmarks/bbh/__init__.py +1 -2
- evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
- evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
- evalscope/benchmarks/benchmark.py +16 -16
- evalscope/benchmarks/ceval/__init__.py +3 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
- evalscope/benchmarks/ceval/ceval_exam.py +18 -31
- evalscope/benchmarks/cmmlu/__init__.py +3 -2
- evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
- evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
- evalscope/benchmarks/competition_math/__init__.py +3 -2
- evalscope/benchmarks/competition_math/competition_math.py +7 -16
- evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
- evalscope/benchmarks/data_adapter.py +24 -24
- evalscope/benchmarks/general_qa/__init__.py +3 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +34 -38
- evalscope/benchmarks/gsm8k/__init__.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +26 -24
- evalscope/benchmarks/hellaswag/__init__.py +3 -2
- evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +27 -23
- evalscope/benchmarks/humaneval/__init__.py +1 -1
- evalscope/benchmarks/humaneval/humaneval.py +15 -18
- evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -1
- evalscope/benchmarks/mmlu/__init__.py +3 -2
- evalscope/benchmarks/mmlu/mmlu.py +15 -29
- evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
- evalscope/benchmarks/race/__init__.py +3 -2
- evalscope/benchmarks/race/race.py +21 -35
- evalscope/benchmarks/race/race_adapter.py +32 -29
- evalscope/benchmarks/race/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/__init__.py +3 -2
- evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
- evalscope/benchmarks/truthful_qa/__init__.py +3 -2
- evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
- evalscope/cli/cli.py +6 -5
- evalscope/cli/start_eval.py +31 -0
- evalscope/cli/start_perf.py +0 -3
- evalscope/cli/start_server.py +27 -41
- evalscope/config.py +119 -95
- evalscope/constants.py +61 -29
- evalscope/evaluator/__init__.py +1 -0
- evalscope/evaluator/evaluator.py +96 -377
- evalscope/evaluator/humaneval_evaluator.py +158 -0
- evalscope/evaluator/rating_eval.py +12 -33
- evalscope/evaluator/reviewer/auto_reviewer.py +47 -76
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
- evalscope/metrics/code_metric.py +3 -9
- evalscope/metrics/math_accuracy.py +3 -6
- evalscope/metrics/metrics.py +21 -21
- evalscope/metrics/rouge_metric.py +11 -25
- evalscope/models/__init__.py +1 -2
- evalscope/models/api/openai_api.py +40 -29
- evalscope/models/custom/__init__.py +0 -1
- evalscope/models/custom/custom_model.py +3 -3
- evalscope/models/dummy_chat_model.py +7 -8
- evalscope/models/model_adapter.py +89 -156
- evalscope/models/openai_model.py +20 -20
- evalscope/perf/arguments.py +15 -3
- evalscope/perf/benchmark.py +7 -9
- evalscope/perf/http_client.py +3 -8
- evalscope/perf/main.py +10 -0
- evalscope/perf/plugin/api/custom_api.py +1 -2
- evalscope/perf/plugin/api/dashscope_api.py +1 -2
- evalscope/perf/plugin/api/openai_api.py +3 -4
- evalscope/perf/plugin/datasets/base.py +1 -2
- evalscope/perf/plugin/datasets/flickr8k.py +1 -2
- evalscope/perf/plugin/datasets/longalpaca.py +1 -2
- evalscope/perf/plugin/datasets/openqa.py +1 -2
- evalscope/perf/utils/analysis_result.py +1 -2
- evalscope/perf/utils/benchmark_util.py +1 -2
- evalscope/perf/utils/db_util.py +11 -8
- evalscope/perf/utils/local_server.py +19 -13
- evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
- evalscope/registry/tasks/arc.yaml +2 -3
- evalscope/registry/tasks/bbh.yaml +3 -4
- evalscope/registry/tasks/bbh_mini.yaml +3 -4
- evalscope/registry/tasks/ceval.yaml +3 -3
- evalscope/registry/tasks/ceval_mini.yaml +3 -4
- evalscope/registry/tasks/cmmlu.yaml +3 -3
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
- evalscope/registry/tasks/general_qa.yaml +1 -1
- evalscope/registry/tasks/gsm8k.yaml +2 -2
- evalscope/registry/tasks/mmlu.yaml +3 -3
- evalscope/registry/tasks/mmlu_mini.yaml +3 -3
- evalscope/run.py +184 -375
- evalscope/run_arena.py +20 -25
- evalscope/summarizer.py +16 -17
- evalscope/third_party/longbench_write/README.md +99 -42
- evalscope/third_party/longbench_write/default_task.json +1 -1
- evalscope/third_party/longbench_write/default_task.yaml +8 -7
- evalscope/third_party/longbench_write/eval.py +29 -28
- evalscope/third_party/longbench_write/infer.py +16 -104
- evalscope/third_party/longbench_write/longbench_write.py +5 -5
- evalscope/third_party/longbench_write/resources/judge.txt +1 -1
- evalscope/third_party/longbench_write/tools/data_etl.py +4 -5
- evalscope/third_party/longbench_write/utils.py +0 -1
- evalscope/third_party/toolbench_static/eval.py +14 -15
- evalscope/third_party/toolbench_static/infer.py +48 -69
- evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
- evalscope/third_party/toolbench_static/requirements.txt +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +3 -3
- evalscope/tools/combine_reports.py +25 -30
- evalscope/tools/rewrite_eval_results.py +14 -46
- evalscope/utils/__init__.py +0 -1
- evalscope/utils/arena_utils.py +18 -48
- evalscope/{perf/utils → utils}/chat_service.py +3 -4
- evalscope/utils/completion_parsers.py +3 -8
- evalscope/utils/logger.py +9 -7
- evalscope/utils/model_utils.py +11 -0
- evalscope/utils/utils.py +12 -138
- evalscope/version.py +2 -2
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/METADATA +125 -120
- evalscope-0.8.0.dist-info/RECORD +285 -0
- tests/cli/test_run.py +54 -15
- tests/perf/test_perf.py +4 -0
- tests/rag/test_clip_benchmark.py +38 -38
- tests/rag/test_mteb.py +3 -2
- tests/rag/test_ragas.py +5 -5
- tests/swift/test_run_swift_eval.py +2 -3
- tests/swift/test_run_swift_vlm_eval.py +2 -3
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
- evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
- evalscope/cache.py +0 -98
- evalscope/models/template.py +0 -1446
- evalscope/run_ms.py +0 -140
- evalscope/utils/task_cfg_parser.py +0 -10
- evalscope/utils/task_utils.py +0 -22
- evalscope-0.7.1.dist-info/RECORD +0 -286
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/LICENSE +0 -0
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/WHEEL +0 -0
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/top_level.txt +0 -0
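
A minimal sketch (not part of evalscope) of how a wheel-to-wheel diff like the one below can be reproduced locally; the download directories and the use of git diff --no-index are illustrative choices, not something this page prescribes:

    # Download and unpack both published wheels, then diff the unpacked trees.
    import pathlib
    import subprocess
    import zipfile

    for version in ('0.7.1', '0.8.0'):
        subprocess.run(['pip', 'download', f'evalscope=={version}', '--no-deps',
                        '-d', f'wheels/{version}'], check=True)
        wheel = next(pathlib.Path(f'wheels/{version}').glob('evalscope-*.whl'))
        zipfile.ZipFile(wheel).extractall(f'unpacked/{version}')

    # Exit code 1 only means the trees differ, so no check=True here.
    subprocess.run(['git', 'diff', '--no-index', 'unpacked/0.7.1', 'unpacked/0.8.0'])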

evalscope/benchmarks/data_adapter.py

@@ -1,11 +1,11 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import os.path
+import random
 from abc import ABC, abstractmethod
 from typing import Any, Optional
-import random
 
 from evalscope.benchmarks import Benchmark
-from evalscope.constants import
+from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, AnswerKeys, HubType
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -29,7 +29,8 @@ class DataAdapter(ABC):
             train_split: str, usually for few-shot examples. e.g. 'train'
             eval_split: str, the target eval split name. e.g. 'test'
             prompt_template: str, the prompt template for the benchmark,
-                e.g. for ARC, it is `The following are multiple choice questions, please output correct answer in
+                e.g. for ARC, it is `The following are multiple choice questions, please output correct answer in
+                the form of A or B or C or D, do not output explanation:`
         """
         self.subset_list = subset_list
         self.metric_list = metric_list
@@ -42,8 +43,8 @@ class DataAdapter(ABC):
     def load(self,
              dataset_name_or_path: str,
             subset_list: list = None,
-             work_dir: Optional[str] =
-             datasets_hub: str =
+             work_dir: Optional[str] = DEFAULT_DATASET_CACHE_DIR,
+             datasets_hub: str = HubType.MODELSCOPE,
             **kwargs) -> dict:
         """
         Load the dataset. Remote and local datasets are supported.
@@ -54,12 +55,11 @@ class DataAdapter(ABC):
 
         """
         dataset_name_or_path = os.path.expanduser(dataset_name_or_path)
-        if datasets_hub == 'Local':
-            # Try to load dataset from local disk
-            if not os.path.exists(dataset_name_or_path):
-                raise FileNotFoundError(f'Dataset path not found: {dataset_name_or_path}')
 
-
+        # Try to load dataset from local disk
+        if os.path.exists(dataset_name_or_path):
+            logger.info(
+                f'Loading dataset from local disk: > dataset_name: {dataset_name_or_path} > work_dir: {work_dir}')
            data_dict = self.load_from_disk(dataset_name_or_path, subset_list, work_dir, **kwargs)
            if len(data_dict) == 0 or len(next(iter(data_dict.values()))) == 0:
                raise ValueError(f'Local dataset is empty: {dataset_name_or_path}')
@@ -76,12 +76,13 @@
             data_dict[sub_name] = {}
             # e.g. train: few-shot, test: target dataset to evaluate
             for split in split_list:
-                dataset = Benchmark.load(
-
-
-
-
-
+                dataset = Benchmark.load(
+                    dataset_name=dataset_name_or_path,
+                    subset=sub_name,
+                    split=split,
+                    hub=datasets_hub,
+                    work_dir=work_dir,
+                    **kwargs)
 
                 data_dict[sub_name].update({split: dataset})
 
@@ -112,19 +113,18 @@ class DataAdapter(ABC):
         if self.few_shot_num and self.few_shot_num < 0:
             raise ValueError(f'Invalid shot_num: {self.few_shot_num} for few-shot evaluation.')
 
-        logger.info(f'
-                    f'>few_shot_num: {self.few_shot_num}, '
-                    f'>few_shot_split: {self.train_split}, '
-                    f'>target_eval_split: {self.eval_split}')
+        logger.info(f'Use default settings: '
+                    f'> few_shot_num: {self.few_shot_num}, '
+                    f'> few_shot_split: {self.train_split}, '
+                    f'> target_eval_split: {self.eval_split}')
 
         for sub_name, sub_data_dict in data_dict.items():
             few_shot_data = []
             if self.few_shot_num and self.few_shot_num > 0:
                 few_shot_random: bool = self.config_kwargs.get('few_shot_random', True)
-                few_shot_data = self.get_fewshot_examples(
-
-
-                    few_shot_random=few_shot_random)
+                few_shot_data = self.get_fewshot_examples([item for item in sub_data_dict[self.train_split]],
+                                                          self.few_shot_num,
+                                                          few_shot_random=few_shot_random)
 
             res_dict[sub_name] = []
             for sample_d in sub_data_dict[self.eval_split]:
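
A hedged sketch of the reworked dataset-loading path above: the keyword names (dataset_name, subset, split, hub, work_dir) and the constants come straight from the hunks, while the concrete dataset id and subset name are illustrative assumptions.

    from evalscope.benchmarks import Benchmark
    from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, HubType

    # Resolve one split of one subset, mirroring the new Benchmark.load() call above.
    dataset = Benchmark.load(
        dataset_name='modelscope/gsm8k',     # assumed dataset id, not taken from this diff
        subset='main',                       # assumed subset name
        split='test',
        hub=HubType.MODELSCOPE,              # new default hub constant
        work_dir=DEFAULT_DATASET_CACHE_DIR)  # new default cache directory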

evalscope/benchmarks/general_qa/__init__.py

@@ -1,5 +1,6 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-from evalscope.benchmarks.general_qa.general_qa_adapter import DATASET_ID, SUBSET_LIST
+from evalscope.benchmarks.general_qa.general_qa_adapter import DATASET_ID, SUBSET_LIST
+from evalscope.benchmarks.general_qa.general_qa_adapter import GeneralQAAdapter
 from evalscope.benchmarks.general_qa.general_qa_adapter import GeneralQAAdapter as DataAdapterClass
-from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass
+from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass

evalscope/benchmarks/general_qa/general_qa_adapter.py

@@ -1,15 +1,15 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import glob
+import json
 import os.path
+from collections import defaultdict
+from typing import Any, Optional
 
 from evalscope.benchmarks.data_adapter import DataAdapter
 from evalscope.metrics.metrics import bleu_ngram_one_sample, weighted_mean
 from evalscope.metrics.rouge_metric import compute_rouge_score_one_sample_zh
 from evalscope.utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
-from typing import Any, Optional
-from collections import defaultdict
-import json
 
 logger = get_logger()
 
@@ -31,17 +31,11 @@ class GeneralQAAdapter(DataAdapter):
 
         if metric_list is None:
             metric_list = [{'name': 'WeightedAverageBLEU', 'object': weighted_mean}]
-
-        super().__init__(
-
-
-
-            **kwargs)
-
-    def load(self,
-             dataset_name_or_path: str,
-             subset_list: list = None,
-             **kwargs) -> dict:
+
+        super().__init__(
+            subset_list=subset_list, metric_list=metric_list, train_split=train_split, eval_split=eval_split, **kwargs)
+
+    def load(self, dataset_name_or_path: str, subset_list: list = None, **kwargs) -> dict:
 
         data_file_list = glob.glob(os.path.join(dataset_name_or_path, '*.jsonl'))
         data_list = []
@@ -50,12 +44,12 @@ class GeneralQAAdapter(DataAdapter):
             for file_path in data_file_list:
                 data_list.extend(jsonl_to_list(file_path))
         except Exception as e:
-            raise ValueError(f
+            raise ValueError(f'Failed to load data from {dataset_name_or_path}, got error: {e}')
 
         data_dict = {'default': {'test': data_list}}
 
         return data_dict
-
+
     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
         """
         Args:
@@ -68,16 +62,17 @@ class GeneralQAAdapter(DataAdapter):
 
         """
         # prompt = f"'<|im_start|>user\n{input_d['input']}<|im_end|>\n<|im_start|>assistant\n'"
-        history = input_d.get('history', [])
+        history = input_d.get('history', [])  # history: [['q1', 'a1'], ['q2', 'a2'], ...]
         if len(history) > 0:
-            logger.warning(
+            logger.warning('The history is not included in the prompt for GeneralQA. \
+                To be supported in the future.')
 
         prompt = input_d.get('question', '') or input_d.get('query', '')
 
         # if len(history) > 0:
         #     prompt = '\n'.join(history) + '\n' + prompt
         return {'data': [prompt]}
-
+
     def get_gold_answer(self, input_d: dict) -> str:
         """
         Args:
@@ -88,7 +83,7 @@ class GeneralQAAdapter(DataAdapter):
 
         """
         return input_d.get('answer', '') or input_d.get('response', '')
-
+
     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
         """
         Args:
@@ -99,7 +94,7 @@ class GeneralQAAdapter(DataAdapter):
 
         """
         return result
-
+
     def match(self, gold: str, pred: str) -> float:
         """
         Args:
@@ -110,7 +105,6 @@ class GeneralQAAdapter(DataAdapter):
             bleu_score: float
 
         """
-        item = [(gold, pred)]
         res = dict()
         rouge_dict = compute_rouge_score_one_sample_zh([pred], [gold])
         bleu_dict = bleu_ngram_one_sample(pred, gold)
@@ -118,7 +112,7 @@ class GeneralQAAdapter(DataAdapter):
         res.update(bleu_dict)
         # return bleu(item)
         return res
-
+
     def compute_metric(self, review_res_list: list) -> float:
         """
         compute weighted mean of the bleu score of all samples
@@ -132,13 +126,13 @@ class GeneralQAAdapter(DataAdapter):
         """
         items = defaultdict(list)
         for scores in review_res_list:
-            for k,v in scores.items():
+            for k, v in scores.items():
                 items[k].append((v, 1.0))
         # items = [(score, 1.0) for score in review_res_list]
-        res = {k: weighted_mean(v) for k,v in items.items()}
+        res = {k: weighted_mean(v) for k, v in items.items()}
         # return weighted_mean(items)
         return res
-
+
     def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
         """
         Args:
@@ -167,20 +161,22 @@ class GeneralQAAdapter(DataAdapter):
         """
         total_num: int = sum([num for _, num in subset_score_map.values()])
         # weighted_avg_bleu: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-        cate_avg_list = [{
+        cate_avg_list = [{
+            'name': subset_name,
+            'score': score_dict
+        } for subset_name, (score_dict, _) in subset_score_map.items()]
         total_avg_list = defaultdict(float)
         for score_dict, num in subset_score_map.values():
             for metric, score in score_dict.items():
                 total_avg_list[metric] += score * num / total_num
 
-        category_d = dict(name=
-
-
-
-
-
-
-
-
-        return res_map
+        category_d = dict(name='DEFAULT', score=total_avg_list, subset=cate_avg_list)
+
+        res_map = dict(
+            name=report_name or 'general_qa',
+            metric=self.metric_list[0]['name'],
+            score=total_avg_list,
+            category=[category_d],
+            total_num=total_num)
+
+        return res_map
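
For reference, an illustrative input record for the GeneralQAAdapter above, inferred from the keys it reads (question/query, answer/response, optional history); the field values are made up.

    sample = {
        'history': [['q1', 'a1'], ['q2', 'a2']],  # read but not yet included in the prompt
        'question': 'What is the capital of France?',
        'answer': 'Paris',
    }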

evalscope/benchmarks/gsm8k/__init__.py

@@ -2,4 +2,4 @@
 
 from evalscope.benchmarks.gsm8k.gsm8k_adapter import DATASET_ID, SUBSET_LIST
 from evalscope.benchmarks.gsm8k.gsm8k_adapter import GSM8KAdapter as DataAdapterClass
-from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass
+from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass  # noqa

evalscope/benchmarks/gsm8k/gsm8k.py

@@ -13,15 +13,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # flake8: noqa
-
 """Grade School Math 8k dataset."""
 
+import datasets
 import json
 import textwrap
 
-import datasets
-
-
 _CITATION = """\
 @misc{cobbe2021training,
       title={Training Verifiers to Solve Math Word Problems},
@@ -76,8 +73,7 @@ class Gsm8k(datasets.GeneratorBasedBuilder):
                 using basic arithmetic operations (+ - / *) to reach the final
                 answer. A bright middle school student should be able to solve
                 every problem.
-                """,
-            ),
+                """, ),
             urls={
                 'train': TRAIN_URL,
                 'test': TEST_URL,
@@ -86,12 +82,10 @@ class Gsm8k(datasets.GeneratorBasedBuilder):
     ]
 
     def _info(self):
-        features = datasets.Features(
-
-
-
-            }
-        )
+        features = datasets.Features({
+            'question': datasets.Value('string'),
+            'answer': datasets.Value('string'),
+        })
         return datasets.DatasetInfo(
             description=_DESCRIPTION,
             features=features,

evalscope/benchmarks/gsm8k/gsm8k_adapter.py

@@ -1,12 +1,14 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 # Copyright (c) EleutherAI, Inc. and its affiliates.
+import math
 import os
 import re
-
+
 from evalscope.benchmarks import DataAdapter
 from evalscope.metrics.metrics import exact_match, weighted_mean
-from evalscope.utils import
+from evalscope.utils import jsonl_to_list, normalize_score
 from evalscope.utils.logger import get_logger
+
 # flake8: noqa
 
 logger = get_logger()
@@ -54,13 +56,14 @@ class GSM8KAdapter(DataAdapter):
                            f'Use 4-shot by default.')
             few_shot_num = 4
 
-        super().__init__(
-
-
-
-
-
-
+        super().__init__(
+            subset_list=subset_list,
+            metric_list=metric_list,
+            few_shot_num=few_shot_num,
+            train_split=train_split,
+            eval_split=eval_split,
+            prompt_template=prompt_template,
+            **kwargs)
 
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
@@ -182,17 +185,19 @@ class GSM8KAdapter(DataAdapter):
         total_num: int = sum([num for _, num in subset_score_map.values()])
         weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
         weighted_avg_acc = normalize_score(score=weighted_avg_acc)
-        cate_avg_list = [{
+        cate_avg_list = [{
+            'name': subset_name,
+            'score': normalize_score(score=score)
+        } for subset_name, (score, _) in subset_score_map.items()]
 
-        category_d = dict(name='DEFAULT',
-                          score=weighted_avg_acc,
-                          subset=cate_avg_list)
+        category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
 
-        res_map = dict(
-
-
-
-
+        res_map = dict(
+            name=report_name or 'gsm8k',
+            metric=self.metric_list[0]['name'],
+            score=weighted_avg_acc,
+            category=[category_d],
+            total_num=total_num)
 
         return res_map
 
@@ -209,8 +214,7 @@ class GSM8KAdapter(DataAdapter):
             "When Bella buys 2/5 times more marbles, she'll have increased the number of marbles by 2/5*60 = 24\nThe total number of marbles she'll have is 60+24 = 84\nIf Bella currently has 60 marbles, and she has two times as many marbles as frisbees, she has 60/2 = 30 frisbees.\nIf Bella buys 2/5 times more frisbees, she'll have 2/5*30 = 12 more frisbees.\nThe total number of frisbees she'll have will increase to 30+12 = 42\nBella also has 20 more frisbees than deck cards, meaning she has 30-20 = 10 deck cards\nIf she buys 2/5 times more deck cards, she'll have 2/5*10 = 4 more deck cards.\nThe total number of deck cards she'll have is 10+4 = 14\nTogether, Bella will have a total of 14+42+84 = 140 items\nThe answer is 140\n\n"
             "Question: A group of 4 fruit baskets contains 9 apples, 15 oranges, and 14 bananas in the first three baskets and 2 less of each fruit in the fourth basket. How many fruits are there?\nLet's think step by step\n"
             'For the first three baskets, the number of apples and oranges in one basket is 9+15=24\nIn total, together with bananas, the number of fruits in one basket is 24+14=38 for the first three baskets.\nSince there are three baskets each having 38 fruits, there are 3*38=114 fruits in the first three baskets.\nThe number of apples in the fourth basket is 9-2=7\nThere are also 15-2=13 oranges in the fourth basket\nThe combined number of oranges and apples in the fourth basket is 13+7=20\nThe fourth basket also contains 14-2=12 bananas.\nIn total, the fourth basket has 20+12=32 fruits.\nThe four baskets together have 32+114=146 fruits.\nThe answer is 146\n\n'
-            f"Question: {input_d['question']}\nLet's think step by step\nAnswer:"
-        )
+            f"Question: {input_d['question']}\nLet's think step by step\nAnswer:")
         # context = input_d['question']
         # fewshot_prompts = ['Question: ' + item_d['question'] + '\nAnswer: ' + item_d['answer'] for item_d in few_shot_list]
         # fewshot_prompts = fewshot_prompts + ['Question: ' + context + '\nAnswer:']
@@ -222,9 +226,7 @@ class GSM8KAdapter(DataAdapter):
 
     @staticmethod
     def extract_answer(s: str) -> str:
-        _PAT_LAST_DIGIT = re.compile(
-            r'([+-])?(?=([0-9]|\.[0-9]))(0|([1-9](\d{0,2}(,\d{3})*)|\d*))?(\.\d*)?(?=\D|$)'
-        )
+        _PAT_LAST_DIGIT = re.compile(r'([+-])?(?=([0-9]|\.[0-9]))(0|([1-9](\d{0,2}(,\d{3})*)|\d*))?(\.\d*)?(?=\D|$)')
         match = list(_PAT_LAST_DIGIT.finditer(s))
         if match:
             last_digit = match[-1].group().replace(',', '').replace('+', '').strip().strip('.')
@@ -233,4 +235,4 @@ class GSM8KAdapter(DataAdapter):
             last_digit = None
             print(f'No digits found in {s!r}', flush=True)
 
-        return last_digit
+        return last_digit
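
A quick, illustrative check of the extract_answer() helper shown above; the completion string is made up, and the expected output follows from the last-number regex.

    from evalscope.benchmarks.gsm8k.gsm8k_adapter import GSM8KAdapter

    completion = 'Together, Bella will have a total of 14+42+84 = 140 items\nThe answer is 140'
    print(GSM8KAdapter.extract_answer(completion))  # expected: '140'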

evalscope/benchmarks/hellaswag/__init__.py

@@ -1,5 +1,6 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-from evalscope.benchmarks.hellaswag.hellaswag_adapter import
+from evalscope.benchmarks.hellaswag.hellaswag_adapter import DATASET_ID, SUBSET_LIST
+from evalscope.benchmarks.hellaswag.hellaswag_adapter import HellaSwagAdapter
 from evalscope.benchmarks.hellaswag.hellaswag_adapter import HellaSwagAdapter as DataAdapterClass
-from evalscope.models.model_adapter import ContinuationLogitsModelAdapter as ModelAdapterClass
+from evalscope.models.model_adapter import ContinuationLogitsModelAdapter as ModelAdapterClass  # noqa

evalscope/benchmarks/hellaswag/hellaswag.py

@@ -1,20 +1,18 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
-
 """HellaSwag: Can a Machine Really Finish Your Sentence? is a new dataset for commonsense NLI.
 A paper was published at ACL2019.
 """
-
 """DO NOT EDIT."""
 
-import json
 import datasets
+import json
+
 # flake8: noqa
 
 # HomePage: https://rowanzellers.com/hellaswag/
 # GitHub: https://github.com/rowanz/hellaswag
 
-
 _CITATION = """\
 @inproceedings{zellers2019hellaswag,
     title={HellaSwag: Can a Machine Really Finish Your Sentence?},
@@ -47,21 +45,19 @@ class Hellaswag(datasets.GeneratorBasedBuilder):
             # This is the description that will appear on the datasets page.
             description=_DESCRIPTION,
             # datasets.features.FeatureConnectors
-            features=datasets.Features(
-
-
-
-
-
-
-
-
-
-
-
-                }
-            ),
+            features=datasets.Features({
+                # These are the features of your dataset like images, labels ...
+                'ind': datasets.Value('int32'),
+                'activity_label': datasets.Value('string'),
+                'ctx_a': datasets.Value('string'),
+                'ctx_b': datasets.Value('string'),
+                'ctx': datasets.Value('string'),
+                'endings': datasets.features.Sequence(datasets.Value('string')),
+                'source_id': datasets.Value('string'),
+                'split': datasets.Value('string'),
+                'split_type': datasets.Value('string'),
+                'label': datasets.Value('string'),
+            }),
             # If there's a common (input, target) tuple from the features,
             # specify them here. They'll be used if as_supervised=True in
             # builder.as_dataset.

evalscope/benchmarks/hellaswag/hellaswag_adapter.py

@@ -1,18 +1,17 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+import numpy as np
 import os
 import re
-import numpy as np
 
 from evalscope.benchmarks.data_adapter import DataAdapter
 from evalscope.metrics.metrics import exact_match, weighted_mean
-from evalscope.utils import
+from evalscope.utils import jsonl_to_list, normalize_score
 from evalscope.utils.logger import get_logger
 
 # flake8: noqa
 
 logger = get_logger()
 
-
 DATASET_ID = 'modelscope/hellaswag'
 SUBSET_LIST = ['default']
 
@@ -44,12 +43,13 @@ class HellaSwagAdapter(DataAdapter):
             logger.warning(f'few_shot_num should be 0 for HellaSwag, but got {few_shot_num}. Use 0-shot by default.')
             few_shot_num = 0
 
-        super().__init__(
-
-
-
-
-
+        super().__init__(
+            subset_list=subset_list,
+            metric_list=metric_list,
+            few_shot_num=few_shot_num,
+            train_split=train_split,
+            eval_split=eval_split,
+            **kwargs)
 
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
@@ -91,7 +91,9 @@ class HellaSwagAdapter(DataAdapter):
 
         endings: list = [self._preprocess(ending) for ending in input_d['endings']]
 
-        few_shot_prompts = [
+        few_shot_prompts = [
+            self._generate_prompt(input_d=sample, endings=endings, include_answer=True) for sample in few_shot_list
+        ]
         context: str = '\n'.join(few_shot_prompts) + '\n'
         context += self._generate_prompt(input_d=input_d, endings=endings, include_answer=False)
 
@@ -124,9 +126,9 @@ class HellaSwagAdapter(DataAdapter):
 
             return str(best_choice_idx)
         elif eval_type == 'service':
-            return result
+            return result  # TODO: to be supported !
         elif eval_type == 'custom':
-            return result
+            return result  # TODO: to be supported !
         else:
             raise ValueError(f'Invalid eval_type: {eval_type}')
 
@@ -177,17 +179,19 @@ class HellaSwagAdapter(DataAdapter):
         total_num: int = sum([num for _, num in subset_score_map.values()])
         weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
         weighted_avg_acc = normalize_score(score=weighted_avg_acc)
-        cate_avg_list = [{
-
-
-
-
-
-
-
-
-
-
+        cate_avg_list = [{
+            'name': subset_name,
+            'score': normalize_score(score=score)
+        } for subset_name, (score, _) in subset_score_map.items()]
+
+        category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
+
+        res_map = dict(
+            name=report_name or 'hellaswag',
+            metric=self.metric_list[0]['name'],
+            score=weighted_avg_acc,
+            category=[category_d],
+            total_num=total_num)
 
         return res_map
 
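
The gen_report() structure built above, with illustrative values; the metric name and the counts are assumptions, while the name/metric/score/category/total_num keys and the 'DEFAULT' category come from the hunk.

    report = {
        'name': 'hellaswag',
        'metric': 'WeightedAverageAccuracy',  # assumed metric name
        'score': 0.7512,
        'category': [{
            'name': 'DEFAULT',
            'score': 0.7512,
            'subset': [{'name': 'default', 'score': 0.7512}],
        }],
        'total_num': 10042,                   # assumed sample count
    }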

evalscope/benchmarks/humaneval/__init__.py

@@ -2,4 +2,4 @@
 
 from evalscope.benchmarks.humaneval.humaneval_adapter import DATASET_ID, SUBSET_LIST
 from evalscope.benchmarks.humaneval.humaneval_adapter import HumanevalAdapter as DataAdapterClass
-from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass
+from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass  # noqa

evalscope/benchmarks/humaneval/humaneval.py

@@ -1,6 +1,7 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import json
 import datasets
+import json
+
 # flake8: noqa
 
 # NOTE: AUTOGENERATED, DO NOT CHANGE.
@@ -41,15 +42,13 @@ class OpenaiHumaneval(datasets.GeneratorBasedBuilder):
     ]
 
     def _info(self):
-        features = datasets.Features(
-
-
-
-
-
-
-            }
-        )
+        features = datasets.Features({
+            'task_id': datasets.Value('string'),
+            'prompt': datasets.Value('string'),
+            'canonical_solution': datasets.Value('string'),
+            'test': datasets.Value('string'),
+            'entry_point': datasets.Value('string'),
+        })
 
         return datasets.DatasetInfo(
             description=_DESCRIPTION,
@@ -63,14 +62,12 @@ class OpenaiHumaneval(datasets.GeneratorBasedBuilder):
     def _split_generators(self, dl_manager):
         """Returns SplitGenerators."""
         data_dir = dl_manager.download_and_extract(_URL)
-        return [
-            datasets.
-
-
-
-
-            )
-        ]
+        return [datasets.SplitGenerator(
+            name=datasets.Split.TEST,
+            gen_kwargs={
+                'filepath': data_dir,
+            },
+        )]
 
     def _generate_examples(self, filepath):
         """Yields examples."""