evalscope 0.7.2__py3-none-any.whl → 0.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of evalscope has been flagged as possibly problematic.
- evalscope/__init__.py +1 -1
- evalscope/arguments.py +73 -0
- evalscope/backend/base.py +6 -2
- evalscope/backend/opencompass/api_meta_template.py +8 -14
- evalscope/backend/opencompass/backend_manager.py +24 -15
- evalscope/backend/opencompass/tasks/eval_api.py +1 -6
- evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
- evalscope/backend/rag_eval/__init__.py +3 -3
- evalscope/backend/rag_eval/backend_manager.py +21 -25
- evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
- evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
- evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
- evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
- evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
- evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
- evalscope/backend/rag_eval/cmteb/base.py +22 -23
- evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
- evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
- evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
- evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
- evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
- evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
- evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
- evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
- evalscope/backend/rag_eval/ragas/__init__.py +2 -2
- evalscope/backend/rag_eval/ragas/arguments.py +3 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +10 -15
- evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
- evalscope/backend/rag_eval/utils/clip.py +47 -51
- evalscope/backend/rag_eval/utils/embedding.py +13 -12
- evalscope/backend/rag_eval/utils/llm.py +8 -6
- evalscope/backend/rag_eval/utils/tools.py +12 -11
- evalscope/backend/vlm_eval_kit/__init__.py +1 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
- evalscope/benchmarks/arc/__init__.py +3 -2
- evalscope/benchmarks/arc/ai2_arc.py +19 -16
- evalscope/benchmarks/arc/arc_adapter.py +32 -24
- evalscope/benchmarks/bbh/__init__.py +1 -2
- evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
- evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
- evalscope/benchmarks/benchmark.py +16 -16
- evalscope/benchmarks/ceval/__init__.py +3 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
- evalscope/benchmarks/ceval/ceval_exam.py +18 -31
- evalscope/benchmarks/cmmlu/__init__.py +3 -2
- evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
- evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
- evalscope/benchmarks/competition_math/__init__.py +3 -2
- evalscope/benchmarks/competition_math/competition_math.py +7 -16
- evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
- evalscope/benchmarks/data_adapter.py +24 -24
- evalscope/benchmarks/general_qa/__init__.py +3 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +35 -39
- evalscope/benchmarks/gsm8k/__init__.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +27 -24
- evalscope/benchmarks/hellaswag/__init__.py +3 -2
- evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +28 -23
- evalscope/benchmarks/humaneval/__init__.py +1 -1
- evalscope/benchmarks/humaneval/humaneval.py +15 -18
- evalscope/benchmarks/humaneval/humaneval_adapter.py +192 -7
- evalscope/benchmarks/mmlu/__init__.py +3 -2
- evalscope/benchmarks/mmlu/mmlu.py +15 -29
- evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
- evalscope/benchmarks/race/__init__.py +3 -2
- evalscope/benchmarks/race/race.py +21 -35
- evalscope/benchmarks/race/race_adapter.py +33 -29
- evalscope/benchmarks/race/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/__init__.py +3 -2
- evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
- evalscope/benchmarks/truthful_qa/__init__.py +3 -2
- evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
- evalscope/cli/cli.py +6 -5
- evalscope/cli/start_eval.py +31 -0
- evalscope/cli/start_perf.py +0 -3
- evalscope/cli/start_server.py +27 -41
- evalscope/config.py +154 -96
- evalscope/constants.py +50 -32
- evalscope/evaluator/evaluator.py +97 -377
- evalscope/evaluator/rating_eval.py +12 -33
- evalscope/evaluator/reviewer/auto_reviewer.py +48 -76
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
- evalscope/metrics/code_metric.py +3 -9
- evalscope/metrics/math_accuracy.py +3 -6
- evalscope/metrics/metrics.py +21 -21
- evalscope/metrics/rouge_metric.py +11 -25
- evalscope/models/__init__.py +1 -2
- evalscope/models/api/openai_api.py +40 -29
- evalscope/models/custom/__init__.py +0 -1
- evalscope/models/custom/custom_model.py +3 -3
- evalscope/models/dummy_chat_model.py +7 -8
- evalscope/models/model_adapter.py +89 -156
- evalscope/models/openai_model.py +20 -20
- evalscope/perf/arguments.py +16 -3
- evalscope/perf/benchmark.py +9 -11
- evalscope/perf/http_client.py +3 -8
- evalscope/perf/main.py +8 -1
- evalscope/perf/plugin/api/custom_api.py +1 -2
- evalscope/perf/plugin/api/dashscope_api.py +1 -2
- evalscope/perf/plugin/api/openai_api.py +3 -4
- evalscope/perf/plugin/datasets/base.py +1 -2
- evalscope/perf/plugin/datasets/flickr8k.py +1 -2
- evalscope/perf/plugin/datasets/longalpaca.py +1 -2
- evalscope/perf/plugin/datasets/openqa.py +1 -2
- evalscope/perf/plugin/registry.py +3 -3
- evalscope/perf/utils/analysis_result.py +1 -2
- evalscope/perf/utils/benchmark_util.py +5 -6
- evalscope/perf/utils/db_util.py +77 -30
- evalscope/perf/utils/local_server.py +21 -13
- evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
- evalscope/registry/tasks/arc.yaml +2 -3
- evalscope/registry/tasks/bbh.yaml +3 -4
- evalscope/registry/tasks/bbh_mini.yaml +3 -4
- evalscope/registry/tasks/ceval.yaml +3 -3
- evalscope/registry/tasks/ceval_mini.yaml +3 -4
- evalscope/registry/tasks/cmmlu.yaml +3 -3
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
- evalscope/registry/tasks/general_qa.yaml +1 -1
- evalscope/registry/tasks/gsm8k.yaml +2 -2
- evalscope/registry/tasks/mmlu.yaml +3 -3
- evalscope/registry/tasks/mmlu_mini.yaml +3 -3
- evalscope/run.py +153 -381
- evalscope/run_arena.py +21 -25
- evalscope/summarizer.py +27 -40
- evalscope/third_party/longbench_write/README.md +99 -42
- evalscope/third_party/longbench_write/default_task.json +1 -1
- evalscope/third_party/longbench_write/default_task.yaml +8 -7
- evalscope/third_party/longbench_write/eval.py +29 -27
- evalscope/third_party/longbench_write/infer.py +16 -104
- evalscope/third_party/longbench_write/longbench_write.py +5 -4
- evalscope/third_party/longbench_write/resources/judge.txt +1 -1
- evalscope/third_party/longbench_write/tools/data_etl.py +5 -6
- evalscope/third_party/longbench_write/utils.py +0 -1
- evalscope/third_party/toolbench_static/eval.py +14 -15
- evalscope/third_party/toolbench_static/infer.py +48 -69
- evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
- evalscope/third_party/toolbench_static/requirements.txt +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +4 -3
- evalscope/tools/combine_reports.py +27 -34
- evalscope/tools/rewrite_eval_results.py +15 -47
- evalscope/utils/__init__.py +1 -1
- evalscope/utils/arena_utils.py +18 -48
- evalscope/{perf/utils → utils}/chat_service.py +4 -5
- evalscope/utils/completion_parsers.py +3 -8
- evalscope/utils/io_utils.py +162 -0
- evalscope/utils/logger.py +17 -7
- evalscope/utils/model_utils.py +11 -0
- evalscope/utils/utils.py +5 -306
- evalscope/version.py +2 -2
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/METADATA +123 -118
- evalscope-0.8.1.dist-info/RECORD +285 -0
- tests/cli/test_run.py +53 -15
- tests/perf/test_perf.py +6 -1
- tests/rag/test_clip_benchmark.py +38 -38
- tests/rag/test_mteb.py +3 -2
- tests/rag/test_ragas.py +5 -5
- tests/swift/test_run_swift_eval.py +2 -3
- tests/swift/test_run_swift_vlm_eval.py +2 -3
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
- tests/vlm/test_vlmeval.py +3 -2
- evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
- evalscope/cache.py +0 -98
- evalscope/models/template.py +0 -1446
- evalscope/run_ms.py +0 -140
- evalscope/utils/task_cfg_parser.py +0 -10
- evalscope/utils/task_utils.py +0 -22
- evalscope-0.7.2.dist-info/RECORD +0 -286
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/LICENSE +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/WHEEL +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/arc/__init__.py

@@ -1,5 +1,6 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-from evalscope.benchmarks.arc.arc_adapter import
+from evalscope.benchmarks.arc.arc_adapter import DATASET_ID, SUBSET_LIST
+from evalscope.benchmarks.arc.arc_adapter import ARCAdapter
 from evalscope.benchmarks.arc.arc_adapter import ARCAdapter as DataAdapterClass
-from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass
+from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass  # noqa
evalscope/benchmarks/arc/ai2_arc.py

@@ -1,18 +1,15 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 # Copyright (c) Allen Institute, and its affiliates.
 # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
-
 """AI2 ARC (Abstraction and Reasoning Corpus) for General Artificial Intelligence Benchmark."""
-
 """AUTO GENERATED, DO NOT EDIT"""
 
+import datasets
 import json
 import os
-import datasets
 
 # flake8: noqa
 
-
 _CITATION = """\
 @article{allenai:arc,
     author = {Peter Clark and Isaac Cowhey and Oren Etzioni and Tushar Khot and
@@ -85,17 +82,20 @@ class Ai2Arc(datasets.GeneratorBasedBuilder):
             # This is the description that will appear on the datasets page.
             description=_DESCRIPTION,
             # datasets.features.FeatureConnectors
-            features=datasets.Features(
-                …
-            }
+            features=datasets.Features({
+                'id':
+                datasets.Value('string'),
+                'question':
+                datasets.Value('string'),
+                'choices':
+                datasets.features.Sequence({
+                    'text': datasets.Value('string'),
+                    'label': datasets.Value('string')
+                }),
+                'answerKey':
+                datasets.Value('string')
+                # These are the features of your dataset like images, labels ...
+            }),
             # If there's a common (input, target) tuple from the features,
             # specify them here. They'll be used if as_supervised=True in
             # builder.as_dataset.
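The new schema nests `choices` as a `Sequence` of parallel `text`/`label` lists. For orientation, one record shaped to this schema would look like the sketch below (the values are made up, not drawn from the dataset):

```python
# Shape of one ARC record under the new features schema (values illustrative):
example = {
    'id': 'Mercury_SC_401234',  # hypothetical question id
    'question': 'Which unit is best for measuring the mass of an apple?',
    'choices': {
        'text': ['grams', 'liters', 'meters', 'seconds'],
        'label': ['A', 'B', 'C', 'D'],
    },
    'answerKey': 'A',
}
```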
evalscope/benchmarks/arc/ai2_arc.py

@@ -144,5 +144,8 @@ class Ai2Arc(datasets.GeneratorBasedBuilder):
                     'id': id_,
                     'answerKey': answerkey,
                     'question': question,
-                    'choices': {
+                    'choices': {
+                        'text': text_choices,
+                        'label': label_choices
+                    },
                 }
evalscope/benchmarks/arc/arc_adapter.py

@@ -1,10 +1,11 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-import os
 import json
+import os
+
 from evalscope.benchmarks.data_adapter import DataAdapter
 from evalscope.metrics.metrics import exact_match, weighted_mean
-from evalscope.utils import
+from evalscope.utils import ResponseParser, normalize_score
 from evalscope.utils.logger import get_logger
 
 # flake8: noqa
@@ -44,13 +45,14 @@ class ARCAdapter(DataAdapter):
         if few_shot_num != 0:
             logger.warning(f'few_shot_num is recommended to set 0 for ARC, got {few_shot_num}.')
 
-        super().__init__(
-            …
+        super().__init__(
+            subset_list=subset_list,
+            metric_list=metric_list,
+            few_shot_num=few_shot_num,
+            train_split=train_split,
+            eval_split=eval_split,
+            prompt_template=prompt_template,
+            **kwargs)
 
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         """
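The change to `super().__init__` is purely layout, but it spells out the keyword arguments the adapter expects. A minimal sketch of constructing the adapter against this signature, with illustrative values (the metric entry and split names are assumptions, not source defaults):

```python
# Hypothetical instantiation of ARCAdapter under the new explicit signature.
# Subset/split names and the metric entry are illustrative assumptions.
from evalscope.benchmarks.arc.arc_adapter import ARCAdapter
from evalscope.metrics.metrics import weighted_mean

adapter = ARCAdapter(
    subset_list=['ARC-Easy', 'ARC-Challenge'],
    metric_list=[{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}],
    few_shot_num=0,      # the adapter itself recommends 0-shot for ARC
    train_split='train',
    eval_split='test',
    prompt_template='',
)
```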
evalscope/benchmarks/arc/arc_adapter.py

@@ -78,8 +80,10 @@ class ARCAdapter(DataAdapter):
                 rows.append({
                     'id': item['id'],
                     'question': item['question']['stem'],
-                    'choices': {
-                    …
+                    'choices': {
+                        'text': [d['text'] for d in raw_choices],
+                        'label': [d['label'] for d in raw_choices]
+                    },
                     'answerKey': item['answerKey'],
                 })
 
@@ -143,9 +147,11 @@ class ARCAdapter(DataAdapter):
         if eval_type == 'checkpoint':
             return result
         elif eval_type == 'service':
-            return ResponseParser.parse_first_option_with_choices(
+            return ResponseParser.parse_first_option_with_choices(
+                text=result, options=self.choices)  # TODO: to be checked !
         elif eval_type == 'custom':
-            return ResponseParser.parse_first_option_with_choices(
+            return ResponseParser.parse_first_option_with_choices(
+                text=result, options=self.choices)  # TODO: to be checked !
         else:
             raise ValueError(f'Invalid eval_type: {eval_type}')
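Both the `service` and `custom` branches now route through `ResponseParser.parse_first_option_with_choices`. The parser's implementation is outside this diff; as a rough mental model, assuming it simply returns the first option label found in the response text, it behaves like this sketch (not the actual evalscope code):

```python
import re

def parse_first_option_sketch(text: str, options: list) -> str:
    """Illustrative stand-in for ResponseParser.parse_first_option_with_choices;
    not the actual evalscope implementation."""
    # Match the first standalone occurrence of any option label, e.g. 'A'-'D'.
    pattern = r'\b(' + '|'.join(re.escape(opt) for opt in options) + r')\b'
    match = re.search(pattern, text)
    return match.group(1) if match else ''

print(parse_first_option_sketch('The correct answer is (C).', ['A', 'B', 'C', 'D']))  # -> C
```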
evalscope/benchmarks/arc/arc_adapter.py

@@ -200,17 +206,19 @@ class ARCAdapter(DataAdapter):
         total_num: int = sum([num for _, num in subset_score_map.values()])
         weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
         weighted_avg_acc = normalize_score(score=weighted_avg_acc)
-        cate_avg_list = [{
-            …
+        cate_avg_list = [{
+            'name': subset_name,
+            'score': normalize_score(score=score)
+        } for subset_name, (score, _) in subset_score_map.items()]
+
+        category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
+
+        res_map = dict(
+            name=report_name or 'arc',
+            metric=self.metric_list[0]['name'],
+            score=weighted_avg_acc,
+            category=[category_d],
+            total_num=total_num)
 
         return res_map
 
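The rewritten `gen_report` groups per-subset scores under a single DEFAULT category. For instance, if ARC-Easy scored 0.80 over 100 samples and ARC-Challenge 0.60 over 50, the weighted average is (0.80 × 100 + 0.60 × 50) / 150 ≈ 0.7333, and the resulting map would look roughly like this (the metric name is an assumption; it is read from `metric_list[0]['name']`):

```python
res_map = {
    'name': 'arc',
    'metric': 'WeightedAverageAccuracy',  # assumed; taken from metric_list[0]['name']
    'score': 0.7333,
    'category': [{
        'name': 'DEFAULT',
        'score': 0.7333,
        'subset': [
            {'name': 'ARC-Easy', 'score': 0.8},
            {'name': 'ARC-Challenge', 'score': 0.6},
        ],
    }],
    'total_num': 150,
}
```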
evalscope/benchmarks/bbh/__init__.py

@@ -1,6 +1,5 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-
 from evalscope.benchmarks.bbh.bbh_adapter import DATASET_ID, SUBSET_LIST
 from evalscope.benchmarks.bbh.bbh_adapter import BBHAdapter as DataAdapterClass
-from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass
+from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass  # noqa
evalscope/benchmarks/bbh/bbh_adapter.py

@@ -1,22 +1,22 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
+import json
 import os
-import re
 import random
-import
+import re
 
 from evalscope.benchmarks.data_adapter import DataAdapter
 from evalscope.constants import AnswerKeys
 from evalscope.metrics.metrics import exact_match, weighted_mean
-from evalscope.utils import
+from evalscope.utils import ResponseParser, normalize_score
 from evalscope.utils.logger import get_logger
+
 # flake8: noqa
 
 logger = get_logger()
 
 DATASET_ID = 'modelscope/bbh'
 
-
 # BBH multiple choice subset list
 MULTIPLE_CHOICE = 'multiple_choice'
 MULTIPLE_CHOICE_LIST = [
@@ -87,12 +87,13 @@ class BBHAdapter(DataAdapter):
                            f'Use 3-shot by default.')
             few_shot_num = 3
 
-        super().__init__(
-            …
+        super().__init__(
+            subset_list=subset_list,
+            metric_list=metric_list,
+            few_shot_num=few_shot_num,
+            train_split=train_split,
+            eval_split=eval_split,
+            **kwargs)
 
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
@@ -151,10 +152,10 @@ class BBHAdapter(DataAdapter):
         if self.few_shot_num < 0:
             raise ValueError(f'Invalid shot_num: {self.few_shot_num} for few-shot evaluation.')
 
-        logger.info(f'
-                    f'>few_shot_num: {self.few_shot_num}, '
-                    f'>few_shot_split: {self.train_split}, '
-                    f'>target_eval_split: {self.eval_split}')
+        logger.info(f'Use default settings: '
+                    f'> few_shot_num: {self.few_shot_num}, '
+                    f'> few_shot_split: {self.train_split}, '
+                    f'> target_eval_split: {self.eval_split}')
 
         for sub_name, sub_data_dict in data_dict.items():
             few_shot_data = []
@@ -260,17 +261,19 @@ class BBHAdapter(DataAdapter):
         total_num: int = sum([num for _, num in subset_score_map.values()])
         weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
         weighted_avg_acc = normalize_score(score=weighted_avg_acc)
-        cate_avg_list = [{
-            …
+        cate_avg_list = [{
+            'name': subset_name,
+            'score': normalize_score(score=score)
+        } for subset_name, (score, _) in subset_score_map.items()]
+
+        category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
+
+        res_map = dict(
+            name=report_name or 'bbh',
+            metric=self.metric_list[0]['name'],
+            score=weighted_avg_acc,
+            category=[category_d],
+            total_num=total_num)
 
         return res_map
 
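BBH's `gen_report` follows the same pattern as the ARC one above. The aggregate is a sample-weighted mean, `sum(score_i * num_i) / sum(num_i)`, which the snippet below checks with made-up subset results:

```python
# Made-up BBH subset results: (score, num_samples) per subset.
subset_score_map = {'navigate': (0.72, 250), 'snarks': (0.55, 178)}

total_num = sum(num for _, num in subset_score_map.values())  # 428
weighted = sum(score * num for score, num in subset_score_map.values()) / total_num
print(round(weighted, 4))  # 0.6493
```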
evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt

@@ -20,4 +20,4 @@ A: Let's think step by step.
 Remember that (i) expressions inside brackets are always evaluated first and that (ii) the order of operations from highest priority to lowest priority is "not", "and", "or", respectively.
 We first simplify this expression "Z" as follows: "Z = not not ( not ( False ) ) = not not ( A )" where "A = not ( False )".
 Let's evaluate A: A = not ( False ) = not False = True.
-Plugging in A, we get: Z = not not ( A ) = not not (True) = not not False = True. So the answer is True.
+Plugging in A, we get: Z = not not ( A ) = not not (True) = not not False = True. So the answer is True.

evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt

@@ -22,4 +22,4 @@ Options:
 - Yes
 - No
 A: Let's think step by step.
-Here in this question, we are told that "He aims the dart at the low point region." A typical person might therefore think George did intentionally hit the low point region, because he wanted to lift up the spirit of his sister Lena. So the answer is Yes.
+Here in this question, we are told that "He aims the dart at the low point region." A typical person might therefore think George did intentionally hit the low point region, because he wanted to lift up the spirit of his sister Lena. So the answer is Yes.

evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt

@@ -30,4 +30,4 @@ Options:
 (E) 01/03/1982
 (F) 12/03/1960
 A: Let's think step by step.
-If Jane and John married on Jan 2, 1958, then and if it is their 5-year anniversary today, then today's date is Jan 2, 1963. The date tomorrow is Jan 3, 1963, that is 01/03/1963. So the answer is (B).
+If Jane and John married on Jan 2, 1958, then and if it is their 5-year anniversary today, then today's date is Jan 2, 1963. The date tomorrow is Jan 3, 1963, that is 01/03/1963. So the answer is (B).

evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt

@@ -34,4 +34,4 @@ A: Let's think step by step.
 Here we need to determine who the pronoun "his" might be referring to. There are two possible referents for "his", namely Bailey's and the director's. The verb phrase "plan to meet" might be able to help us determine which one is more likely (if either). Let X be Bailey and Y the director. The sentence is then of the form "X will plan to meet Y at (X or Y)'s office."
 Let's consider Y first: "X will plan to meet Y at Y's office." This case makes sense, because X might want to meet up with Y at Y's office.
 Now, consider X: "X will plan to meet Y at X's office." This case also makes sense, because X might want to meet up with Y at X's own office.
-Because both X and Y are possible at the same time, we conclude that the antecedent of the pronoun is ambiguous. So the answer is (C).
+Because both X and Y are possible at the same time, we conclude that the antecedent of the pronoun is ambiguous. So the answer is (C).

evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt

@@ -69,4 +69,4 @@ We should process each input one by one and keep track of the stack configuratio
 29: } ; stack: < [ {
 Now, we have reached the end. The final stack is "< [ {".
 We will need to pop out "{", "[", "<" one by one in that order.
-So, we need "}", "]", ">". So the answer is } ] >.
+So, we need "}", "]", ">". So the answer is } ] >.

evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt

@@ -41,4 +41,4 @@ Let’s see whether the Hypothesis can be deduced from the arguments (1) and (2)
 By (1), we have if X = infrequent-user(Paul Mitchell), then X = rare-consumer(Nioxin) OR X = loyal-buyer(Caress). We need to consider both cases separately:
 The case X = rare-consumer(Nioxin) does not appear in (2).
 The case X = loyal-buyer(Caress) does not appear in (2), either.
-So, from (1) and (2), we cannot necessarily deduce the Hypothesis. So the answer is invalid.
+So, from (1) and (2), we cannot necessarily deduce the Hypothesis. So the answer is invalid.

evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt

@@ -75,4 +75,4 @@ Length of side A: |A| = sqrt((41.00-37.00)^2 + (43.00-34.00)^2) = sqrt((4)^2 + (
 Length of side B: |B| = sqrt((37.00-41.00)^2 + (34.00-33.00)^2)) = sqrt((4)^2 + (1)^2) = sqrt(16 + 1) = sqrt(17).
 Length of side C: |C| = sqrt((41.00-45.00)^2 + (33.00-34.00)^2)) = sqrt((-4)^2 + (-1)^2) = sqrt(16 + 1) = sqrt(17).
 Length of side D: |D| = sqrt((45.00-41.00)^2 + (34.00-43.00)^2)) = sqrt((4)^2 + (-9)^2) = sqrt(16 + 81) = sqrt(97).
-Note that |A| = |D| and |B| = |C|. Furthermore, A and D are adjacent and B and C are adjacent. Thus, this polygon has two pairs of equal-length adjacent sides and is "kite". So the answer is (D).
+Note that |A| = |D| and |B| = |C|. Furthermore, A and D are adjacent and B and C are adjacent. Thus, this polygon has two pairs of equal-length adjacent sides and is "kite". So the answer is (D).

evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt

@@ -25,4 +25,4 @@ Options:
 A: Let's think step by step.
 When there is more than one adjective before a noun, the adjectives need to respect the following order before a noun: "[1. opinion] [2. size] [3. age] [4. shape] [5. color] [6. origin] [7. material] [8. purpose] noun".
 Option (A): "blue gold wonderful square shoe". (1) "blue" falls into the color category. (2) "gold" falls into the material category. (3) "wonderful" falls into the opinion category. (4) "square" falls into the shape category. The adjective order that Option (A) has is [5. color] [7. material] [1. opinion] [4. shape] (or, in numeric terms, 5 7 1 4). Because 5 < 7 < 1 < 4 is not correct, (A) does not have the correct ordering.
-Option (B): "wonderful square blue gold shoe". Option (B) has the following adjective order: [1. opinion] [4. shape] [5. color] [7. material] (or, in numeric terms, 1 4 5 7 ). Because 1 < 4 < 5 < 7 is correct, (B) has the correct ordering. So the answer is (B).
+Option (B): "wonderful square blue gold shoe". Option (B) has the following adjective order: [1. opinion] [4. shape] [5. color] [7. material] (or, in numeric terms, 1 4 5 7 ). Because 1 < 4 < 5 < 7 is correct, (B) has the correct ordering. So the answer is (B).

evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt

@@ -34,4 +34,4 @@ A: Let's think step by step.
 (2) The red book is the second from the left: "(left) ? white red gray ? (right)".
 (3) Combining (1) and (2) we get the following ordering: "(left) white red gray (right)".
 According to this ordering, the leftmost book is the white book.
-The white book is the leftmost. So the answer is (C).
+The white book is the leftmost. So the answer is (C).

evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt

@@ -34,4 +34,4 @@ A: Let's think step by step.
 (2) The red book is the second from the left: "(left) ? white red gray ? (right)".
 (3) Combining (1) and (2) we get the following ordering: "(left) white red gray (right)".
 According to this ordering, the leftmost book is the white book.
-The white book is the leftmost. So the answer is (C).
+The white book is the leftmost. So the answer is (C).

evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt

@@ -34,4 +34,4 @@ A: Let's think step by step.
 (2) The red book is the second from the left: "(left) ? white red gray ? (right)".
 (3) Combining (1) and (2) we get the following ordering: "(left) white red gray (right)".
 According to this ordering, the leftmost book is the white book.
-The white book is the leftmost. So the answer is (C).
+The white book is the leftmost. So the answer is (C).

evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt

@@ -39,4 +39,4 @@ A: Let's think step by step.
 - Total Recall (action, adventure, science-fiction; 2012)
 - Inside Out (animation, family, comedy; 2015)
 - Forrest Gump (comedy, drama, romance; 1994)
-These are all famous movies produced in the past few decades.Amongst all the options, the only movie similar to these ones seems to be Edge of Tomorrow (action, adventure, crime, mystery; 2014), as it is also a science-fiction movie and features Tom Cruise. So the answer is (D).
+These are all famous movies produced in the past few decades.Amongst all the options, the only movie similar to these ones seems to be Edge of Tomorrow (action, adventure, crime, mystery; 2014), as it is also a science-fiction movie and features Tom Cruise. So the answer is (D).

evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt

@@ -22,4 +22,4 @@ Let’s recall that the order of operations in mathematics is as follows: (1) Pa
 This equation can be written as "A - B", where A = (-3 + 5 * 8 * -4) and B = (9 - 8 * -7 + -9).
 Let's calculate A = (-3 + 5 * 8 * -4) = (-3 + (5 * 8) * -4) = (-3 + (40) * -4) = (-3 + (40 * -4)) = (-3 + -160) = -163.
 Let's calculate B = (9 - 8 * -7 + -9) = (9 - (8 * -7) + -9) = (9 - (-56) + -9) = ((9 - (-56)) + -9) = ((65) + -9)= (65 - 9) = 56.
-Then, the final equation is A - B = -163 - 56 = -219. So the answer is -219.
+Then, the final equation is A - B = -163 - 56 = -219. So the answer is -219.

evalscope/benchmarks/bbh/cot_prompts/navigate.txt

@@ -40,4 +40,4 @@ We start at the origin (0, 0), facing the positive y-axis.
 (2) Take 2 steps right: (0, 2), facing the positive y-axis.
 (3) Take 9 steps left: (0, -7), facing the positive y-axis.
 (4) Take 7 steps right: (0, 7), facing the positive y-axis.
-Since (0, 0) is (0, 0), we are indeed where we started. So the answer is Yes.
+Since (0, 0) is (0, 0), we are indeed where we started. So the answer is Yes.

evalscope/benchmarks/bbh/cot_prompts/object_counting.txt

@@ -34,4 +34,4 @@ We first identify the vegetables on the list and include their quantity in paren
 - carrots (2)
 - garlic (1)
 - yam (1)
-Now, let's add the numbers in parentheses: 1 + 1 + 1 + 1 + 2 + 1 + 1 = 8. So the answer is 8.
+Now, let's add the numbers in parentheses: 1 + 1 + 1 + 1 + 2 + 1 + 1 = 8. So the answer is 8.

evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt

@@ -38,4 +38,4 @@ A: Let's think step by step.
 This question focuses on the name. We know the following: The names of the penguin in the table are Louis, Bernard, Vincent, and Gwen.
 When we sort their names alphabetically, we get Bernard, Gwen, Louis, Vincent.
 The name of the second penguin sorted by alphabetical order is Gwen.
-The name of the second penguin sorted by alphabetic order is Gwen. So the answer is (D).
+The name of the second penguin sorted by alphabetic order is Gwen. So the answer is (D).

evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt

@@ -60,4 +60,4 @@ Options:
 A: Let's think step by step.
 According to this question, the objects are arranged in a row, from left to right, as follows: (1) a teal plate, (2) a burgundy keychain, (3) a yellow scrunchiephone charger, (4) an orange mug, (5) a pink notebook, (6) a grey cup.
 The teal plate is the first item, namely (1). There is no item to the left of the teal item.
-The number of non-orange items to the left of the teal item is zero. So the answer is (A).
+The number of non-orange items to the left of the teal item is zero. So the answer is (A).

evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt

@@ -41,4 +41,4 @@ The original name is "counting crows". This is the name of an American rock band
 (B) "counting cows": Here the word "crows" is changed to "cows", and this is indeed a playful and meaningful edit that ruins the original name of the band.
 (C) "courting crows": Here the word "counting" is changed to "courting", and "courting" is an actual word; however, "courting crows" does not sound as humorous as "counting cows".
 (D) "coutnting crows": Here the word "counting" is changed to "coutnting", but the word "coutnting" is not an actual word.
-Above the above, the only humorous edit is (B). So the answer is (B).
+Above the above, the only humorous edit is (B). So the answer is (B).

evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt

@@ -37,4 +37,4 @@ Options:
 (E) Dropped Content
 (F) Facts
 A: Let's think step by step.
-We solve this question by first translating the source sentence to English and then by comparing our translation with the provided translation. According to Google Translate, the correct translation of the source sentence from German to English is "Łeba is a small town and seaside resort in the Powiat Lęborski of the Polish Pomeranian Voivodeship." On the other hand, the provided translation is "Łeba is not a small town and seaside resort in the Powiat Léborski county of the Pomeranian Voivodeship of Poland." Note that the provided sentence says, "Łeba is not a small town ..." However, the translation should have been "Łeba is a small town ..." Because a negation is introduced at the beginning of the sentence and has fundamentally changed the meaning of the original source, the translation contains an error pertaining to Negation or Antonyms. So the answer is (C).
+We solve this question by first translating the source sentence to English and then by comparing our translation with the provided translation. According to Google Translate, the correct translation of the source sentence from German to English is "Łeba is a small town and seaside resort in the Powiat Lęborski of the Polish Pomeranian Voivodeship." On the other hand, the provided translation is "Łeba is not a small town and seaside resort in the Powiat Léborski county of the Pomeranian Voivodeship of Poland." Note that the provided sentence says, "Łeba is not a small town ..." However, the translation should have been "Łeba is a small town ..." Because a negation is introduced at the beginning of the sentence and has fundamentally changed the meaning of the original source, the translation contains an error pertaining to Negation or Antonyms. So the answer is (C).

evalscope/benchmarks/bbh/cot_prompts/snarks.txt

@@ -27,4 +27,4 @@ Options:
 A: Let's think step by step.
 If we look at (A), it likens the consistency in the league's punishments with that in politics. Because politics or political affairs are often not considered to be consistent or dependable, this sentence appears to be satirical.
 If we look at (B), it likens the consistency in the league's punishments with that in morality. Discussing the consistency of the league's punishments in the context of morality, ethics, or law makes sense and does not appear to make a satirical point about anything.
-Above the above, the sarcastic option is (A). So the answer is (A).
+Above the above, the sarcastic option is (A). So the answer is (A).

evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt

@@ -7,4 +7,4 @@ Q: Is the following sentence plausible? "Santi Cazorla scored a touchdown."
 A: Let's think step by step. Santi Cazorla is a soccer player. Touchdown is part of American football and rugby. So the answer is no.
 
 Q: Is the following sentence plausible? "DeMar DeRozan was called for the goal tend."
-A: Let's think step by step. DeMar DeRozan is an American basketball player. Goal tending is part of basketball. So the answer is yes.
+A: Let's think step by step. DeMar DeRozan is an American basketball player. Goal tending is part of basketball. So the answer is yes.

evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt

@@ -74,4 +74,4 @@ Wake-up time: 5am.
 2pm-3pm: playing tennis at the tennis court.
 3pm-4pm: free.
 The beach closure time: 4pm.
-The only time when Tiffany could have gone to the beach was 3pm to 4pm. So the answer is (D).
+The only time when Tiffany could have gone to the beach was 3pm to 4pm. So the answer is (D).

evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt

@@ -37,4 +37,4 @@ A: Let's think step by step.
 (1) Alice and Bob switch partners: Alice: Rodrigo, Bob: Lola, Claire: Patrick.
 (2) Claire and Bob switch partners: Alice: Rodrigo, Bob: Patrick, Claire: Lola.
 (3) Bob and Alice switch partners: Alice: Patrick, Bob: Rodrigo, Claire: Lola.
-At the end of the dance, Alice is dancing with Patrick. So the answer is (C).
+At the end of the dance, Alice is dancing with Patrick. So the answer is (C).

evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt

@@ -37,4 +37,4 @@ A: Let's think step by step.
 (1) Alice and Bob switch partners: Alice: Rodrigo, Bob: Lola, Claire: Patrick.
 (2) Claire and Bob switch partners: Alice: Rodrigo, Bob: Patrick, Claire: Lola.
 (3) Bob and Alice switch partners: Alice: Patrick, Bob: Rodrigo, Claire: Lola.
-At the end of the dance, Alice is dancing with Patrick. So the answer is (C).
+At the end of the dance, Alice is dancing with Patrick. So the answer is (C).

evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt

@@ -37,4 +37,4 @@ A: Let's think step by step.
 (1) Alice and Bob switch partners: Alice: Rodrigo, Bob: Lola, Claire: Patrick.
 (2) Claire and Bob switch partners: Alice: Rodrigo, Bob: Patrick, Claire: Lola.
 (3) Bob and Alice switch partners: Alice: Patrick, Bob: Rodrigo, Claire: Lola.
-At the end of the dance, Alice is dancing with Patrick. So the answer is (C).
+At the end of the dance, Alice is dancing with Patrick. So the answer is (C).

evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt

@@ -25,4 +25,4 @@ A: Let's think step by step.
 (3) Raymond says Michaela tells the truth. Since we know from (2) that Michaela lies, if Raymond says Michaela tells the truth, then Raymond lies.
 (4) Osvaldo says Raymond tells the truth. Since we know from (3) that Raymond lies, if Osvaldo says Raymond tells the truth, then Osvaldo lies.
 (5) Jamey says Osvaldo tells the truth. Since we know from (4) that Osvaldo lies, if Jamey says Osvaldo tells the truth, then Jamey lies.
-Now, the question asks: Does Jamey tell the truth? We know from (5) that Jamey lies. So the answer is No.
+Now, the question asks: Does Jamey tell the truth? We know from (5) that Jamey lies. So the answer is No.

evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt

@@ -14,4 +14,4 @@ The second letter: "ponderosa": "o" (15). "phone": "h" (8). We now have: (8) "ph
 Q: Sort the following words alphabetically: List: newt arson parthia seismography mugho aspect census
 A: Let's think step by step.
 The first letter: "newt": "n" (14). "arson": "a" (1). "parthia": "p" (16). "seismography": "s" (19). "mugho": "m" (13). "aspect": "a" (1). "census": "c" (3). We now have: (1) ["arson" ? "aspect"] < (3) "census" < (13) "mugho" < (14) "newt" < (16) "parthia" < (19) "seismography". Now let's sort this subpart ["arson" ? "aspect"] by looking at their second letters.
-The second letter: "arson": "r" (18). "aspect": "s" (19). We now have: (18) "arson" < (19) "aspect". Hence, we have ["arson" < "aspect"] < "census" < "mugho" < "newt" < "parthia" < "seismography". So the answer is arson aspect census mugho newt parthia seismography.
+The second letter: "arson": "r" (18). "aspect": "s" (19). We now have: (18) "arson" < (19) "aspect". Hence, we have ["arson" < "aspect"] < "census" < "mugho" < "newt" < "parthia" < "seismography". So the answer is arson aspect census mugho newt parthia seismography.
evalscope/benchmarks/benchmark.py

@@ -1,9 +1,10 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
 import os.path
+from modelscope.msdatasets import MsDataset
 from typing import Optional
 
-from evalscope.constants import
+from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, HubType
 
 
 class Benchmark(object):
|
|
|
20
21
|
split: str = None,
|
|
21
22
|
token: str = None,
|
|
22
23
|
hub: str = 'ModelScope',
|
|
23
|
-
work_dir: Optional[str] =
|
|
24
|
+
work_dir: Optional[str] = DEFAULT_DATASET_CACHE_DIR,
|
|
24
25
|
**kwargs):
|
|
25
26
|
"""
|
|
26
27
|
Load a dataset from ModelScope or HuggingFace.
|
|
@@ -38,21 +39,20 @@ class Benchmark(object):
|
|
|
38
39
|
Returns:
|
|
39
40
|
A dict.
|
|
40
41
|
"""
|
|
41
|
-
work_dir = os.path.join(work_dir, 'benchmarks', dataset_name.replace('/', '_'))
|
|
42
|
-
if hub == 'ModelScope':
|
|
43
|
-
from modelscope.msdatasets import MsDataset
|
|
44
|
-
dataset = MsDataset.load(dataset_name=dataset_name, subset_name=subset, split=split, token=token,
|
|
45
|
-
cache_dir=work_dir, **kwargs)
|
|
46
42
|
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
43
|
+
dataset = MsDataset.load(
|
|
44
|
+
dataset_name=dataset_name,
|
|
45
|
+
subset_name=subset,
|
|
46
|
+
split=split,
|
|
47
|
+
token=token,
|
|
48
|
+
cache_dir=work_dir,
|
|
49
|
+
hub=hub,
|
|
50
|
+
**kwargs)
|
|
51
|
+
|
|
52
|
+
dataset.dataset_name = dataset_name.split('/')[-1]
|
|
53
|
+
dataset.subset_name = subset
|
|
54
|
+
# dataset.split = split
|
|
55
|
+
return dataset
|
|
56
56
|
|
|
57
57
|
|
|
58
58
|
if __name__ == '__main__':
|
|
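After this refactor `Benchmark.load` no longer branches on the hub; it passes `hub` straight into a single `MsDataset.load` call and decorates the returned dataset with `dataset_name` and `subset_name`. A hypothetical call against the new signature (the dataset and subset ids are illustrative):

```python
from evalscope.benchmarks.benchmark import Benchmark

dataset = Benchmark.load(
    dataset_name='modelscope/ai2_arc',  # illustrative dataset id
    subset='ARC-Challenge',             # illustrative subset
    split='test',
    hub='ModelScope',                   # now forwarded to MsDataset.load(..., hub=hub)
)
print(dataset.dataset_name, dataset.subset_name)  # -> ai2_arc ARC-Challenge
```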
evalscope/benchmarks/ceval/__init__.py

@@ -1,5 +1,6 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-from evalscope.benchmarks.ceval.ceval_adapter import
+from evalscope.benchmarks.ceval.ceval_adapter import DATASET_ID, SUBJECT_MAPPING, SUBSET_LIST
+from evalscope.benchmarks.ceval.ceval_adapter import CEVALAdapter
 from evalscope.benchmarks.ceval.ceval_adapter import CEVALAdapter as DataAdapterClass
-from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass
+from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass  # noqa