evalscope 0.7.2__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as published to a supported public registry. It is provided for informational purposes only.
- evalscope/__init__.py +1 -1
- evalscope/arguments.py +73 -0
- evalscope/backend/base.py +5 -1
- evalscope/backend/opencompass/api_meta_template.py +8 -14
- evalscope/backend/opencompass/backend_manager.py +24 -15
- evalscope/backend/opencompass/tasks/eval_api.py +1 -6
- evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
- evalscope/backend/rag_eval/__init__.py +3 -3
- evalscope/backend/rag_eval/backend_manager.py +21 -25
- evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
- evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
- evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
- evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
- evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
- evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
- evalscope/backend/rag_eval/cmteb/base.py +22 -23
- evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
- evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
- evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
- evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
- evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
- evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
- evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
- evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
- evalscope/backend/rag_eval/ragas/__init__.py +2 -2
- evalscope/backend/rag_eval/ragas/arguments.py +3 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +10 -15
- evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
- evalscope/backend/rag_eval/utils/clip.py +46 -50
- evalscope/backend/rag_eval/utils/embedding.py +12 -11
- evalscope/backend/rag_eval/utils/llm.py +8 -6
- evalscope/backend/rag_eval/utils/tools.py +12 -11
- evalscope/backend/vlm_eval_kit/__init__.py +1 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
- evalscope/benchmarks/arc/__init__.py +3 -2
- evalscope/benchmarks/arc/ai2_arc.py +19 -16
- evalscope/benchmarks/arc/arc_adapter.py +32 -24
- evalscope/benchmarks/bbh/__init__.py +1 -2
- evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
- evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
- evalscope/benchmarks/benchmark.py +16 -16
- evalscope/benchmarks/ceval/__init__.py +3 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
- evalscope/benchmarks/ceval/ceval_exam.py +18 -31
- evalscope/benchmarks/cmmlu/__init__.py +3 -2
- evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
- evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
- evalscope/benchmarks/competition_math/__init__.py +3 -2
- evalscope/benchmarks/competition_math/competition_math.py +7 -16
- evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
- evalscope/benchmarks/data_adapter.py +24 -24
- evalscope/benchmarks/general_qa/__init__.py +3 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +34 -38
- evalscope/benchmarks/gsm8k/__init__.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +26 -24
- evalscope/benchmarks/hellaswag/__init__.py +3 -2
- evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +27 -23
- evalscope/benchmarks/humaneval/__init__.py +1 -1
- evalscope/benchmarks/humaneval/humaneval.py +15 -18
- evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -1
- evalscope/benchmarks/mmlu/__init__.py +3 -2
- evalscope/benchmarks/mmlu/mmlu.py +15 -29
- evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
- evalscope/benchmarks/race/__init__.py +3 -2
- evalscope/benchmarks/race/race.py +21 -35
- evalscope/benchmarks/race/race_adapter.py +32 -29
- evalscope/benchmarks/race/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/__init__.py +3 -2
- evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
- evalscope/benchmarks/truthful_qa/__init__.py +3 -2
- evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
- evalscope/cli/cli.py +6 -5
- evalscope/cli/start_eval.py +31 -0
- evalscope/cli/start_perf.py +0 -3
- evalscope/cli/start_server.py +27 -41
- evalscope/config.py +119 -95
- evalscope/constants.py +61 -29
- evalscope/evaluator/__init__.py +1 -0
- evalscope/evaluator/evaluator.py +96 -377
- evalscope/evaluator/humaneval_evaluator.py +158 -0
- evalscope/evaluator/rating_eval.py +12 -33
- evalscope/evaluator/reviewer/auto_reviewer.py +47 -76
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
- evalscope/metrics/code_metric.py +3 -9
- evalscope/metrics/math_accuracy.py +3 -6
- evalscope/metrics/metrics.py +21 -21
- evalscope/metrics/rouge_metric.py +11 -25
- evalscope/models/__init__.py +1 -2
- evalscope/models/api/openai_api.py +40 -29
- evalscope/models/custom/__init__.py +0 -1
- evalscope/models/custom/custom_model.py +3 -3
- evalscope/models/dummy_chat_model.py +7 -8
- evalscope/models/model_adapter.py +89 -156
- evalscope/models/openai_model.py +20 -20
- evalscope/perf/arguments.py +15 -3
- evalscope/perf/benchmark.py +7 -9
- evalscope/perf/http_client.py +3 -8
- evalscope/perf/main.py +10 -0
- evalscope/perf/plugin/api/custom_api.py +1 -2
- evalscope/perf/plugin/api/dashscope_api.py +1 -2
- evalscope/perf/plugin/api/openai_api.py +2 -3
- evalscope/perf/plugin/datasets/base.py +1 -2
- evalscope/perf/plugin/datasets/flickr8k.py +1 -2
- evalscope/perf/plugin/datasets/longalpaca.py +1 -2
- evalscope/perf/plugin/datasets/openqa.py +1 -2
- evalscope/perf/utils/analysis_result.py +1 -2
- evalscope/perf/utils/benchmark_util.py +1 -2
- evalscope/perf/utils/db_util.py +11 -8
- evalscope/perf/utils/local_server.py +19 -13
- evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
- evalscope/registry/tasks/arc.yaml +2 -3
- evalscope/registry/tasks/bbh.yaml +3 -4
- evalscope/registry/tasks/bbh_mini.yaml +3 -4
- evalscope/registry/tasks/ceval.yaml +3 -3
- evalscope/registry/tasks/ceval_mini.yaml +3 -4
- evalscope/registry/tasks/cmmlu.yaml +3 -3
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
- evalscope/registry/tasks/general_qa.yaml +1 -1
- evalscope/registry/tasks/gsm8k.yaml +2 -2
- evalscope/registry/tasks/mmlu.yaml +3 -3
- evalscope/registry/tasks/mmlu_mini.yaml +3 -3
- evalscope/run.py +184 -375
- evalscope/run_arena.py +20 -25
- evalscope/summarizer.py +16 -17
- evalscope/third_party/longbench_write/README.md +99 -42
- evalscope/third_party/longbench_write/default_task.json +1 -1
- evalscope/third_party/longbench_write/default_task.yaml +8 -7
- evalscope/third_party/longbench_write/eval.py +29 -28
- evalscope/third_party/longbench_write/infer.py +16 -104
- evalscope/third_party/longbench_write/longbench_write.py +5 -5
- evalscope/third_party/longbench_write/resources/judge.txt +1 -1
- evalscope/third_party/longbench_write/tools/data_etl.py +4 -5
- evalscope/third_party/longbench_write/utils.py +0 -1
- evalscope/third_party/toolbench_static/eval.py +14 -15
- evalscope/third_party/toolbench_static/infer.py +48 -69
- evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
- evalscope/third_party/toolbench_static/requirements.txt +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +3 -3
- evalscope/tools/combine_reports.py +25 -30
- evalscope/tools/rewrite_eval_results.py +14 -46
- evalscope/utils/__init__.py +0 -1
- evalscope/utils/arena_utils.py +18 -48
- evalscope/{perf/utils → utils}/chat_service.py +3 -4
- evalscope/utils/completion_parsers.py +3 -8
- evalscope/utils/logger.py +9 -7
- evalscope/utils/model_utils.py +11 -0
- evalscope/utils/utils.py +12 -138
- evalscope/version.py +2 -2
- {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/METADATA +123 -118
- evalscope-0.8.0.dist-info/RECORD +285 -0
- tests/cli/test_run.py +54 -15
- tests/perf/test_perf.py +4 -0
- tests/rag/test_clip_benchmark.py +38 -38
- tests/rag/test_mteb.py +3 -2
- tests/rag/test_ragas.py +5 -5
- tests/swift/test_run_swift_eval.py +2 -3
- tests/swift/test_run_swift_vlm_eval.py +2 -3
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
- evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
- evalscope/cache.py +0 -98
- evalscope/models/template.py +0 -1446
- evalscope/run_ms.py +0 -140
- evalscope/utils/task_cfg_parser.py +0 -10
- evalscope/utils/task_utils.py +0 -22
- evalscope-0.7.2.dist-info/RECORD +0 -286
- {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/LICENSE +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/WHEEL +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/mmlu/__init__.py
@@ -1,5 +1,6 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-from evalscope.benchmarks.mmlu.mmlu_adapter import DATASET_ID, SUBJECT_MAPPING, SUBSET_LIST
+from evalscope.benchmarks.mmlu.mmlu_adapter import DATASET_ID, SUBJECT_MAPPING, SUBSET_LIST
+from evalscope.benchmarks.mmlu.mmlu_adapter import MMLUAdapter
 from evalscope.benchmarks.mmlu.mmlu_adapter import MMLUAdapter as DataAdapterClass
-from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass
+from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass  # noqa
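These per-benchmark aliases are what the evaluator resolves to pick a data adapter and a model adapter. A minimal sketch, assuming an importlib-style lookup (the lookup itself is not part of this diff):

    # Hedged sketch: resolving the aliases exported above by package name.
    # The importlib-based lookup is an assumption about how the runner works.
    import importlib

    pkg = importlib.import_module('evalscope.benchmarks.mmlu')
    data_adapter_cls = pkg.DataAdapterClass      # -> MMLUAdapter
    model_adapter_cls = pkg.ModelAdapterClass    # -> MultiChoiceModelAdapter
    print(data_adapter_cls.__name__, model_adapter_cls.__name__)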
evalscope/benchmarks/mmlu/mmlu.py
@@ -1,3 +1,4 @@
+# isort: skip_file
 # Copyright (c) Alibaba, Inc. and its affiliates.
 # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
 #
@@ -14,14 +15,11 @@
 # limitations under the License.
 # flake8: noqa
 
-import os
-
 import datasets
+import os
 import pandas as pd
-
 """The MMLU dataset on ModelScope hub. READ ONLY, DO NOT MODIFY."""
 
-
 _CITATION = """\
 @article{hendryckstest2021,
   title={Measuring Massive Multitask Language Understanding},
@@ -105,29 +103,23 @@ task_list = [
 
 
 class MMLUConfig(datasets.BuilderConfig):
+
     def __init__(self, **kwargs):
         super().__init__(version=datasets.Version('1.0.0'), **kwargs)
 
 
 class MMLU(datasets.GeneratorBasedBuilder):
-    BUILDER_CONFIGS = [
-        MMLUConfig(
-            name=task_name,
-        )
-        for task_name in task_list
-    ]
+    BUILDER_CONFIGS = [MMLUConfig(name=task_name, ) for task_name in task_list]
 
     def _info(self):
-        features = datasets.Features(
-            …
-            }
-        )
+        features = datasets.Features({
+            'input': datasets.Value('string'),
+            'A': datasets.Value('string'),
+            'B': datasets.Value('string'),
+            'C': datasets.Value('string'),
+            'D': datasets.Value('string'),
+            'target': datasets.Value('string'),
+        })
         return datasets.DatasetInfo(
             description=_DESCRIPTION,
             features=features,
@@ -143,25 +135,19 @@ class MMLU(datasets.GeneratorBasedBuilder):
             datasets.SplitGenerator(
                 name=datasets.Split.TEST,
                 gen_kwargs={
-                    'filepath': os.path.join(
-                        data_dir, 'data', 'test', f'{task_name}_test.csv'
-                    ),
+                    'filepath': os.path.join(data_dir, 'data', 'test', f'{task_name}_test.csv'),
                 },
             ),
             datasets.SplitGenerator(
                 name=datasets.Split.VALIDATION,
                 gen_kwargs={
-                    'filepath': os.path.join(
-                        data_dir, 'data', 'val', f'{task_name}_val.csv'
-                    ),
+                    'filepath': os.path.join(data_dir, 'data', 'val', f'{task_name}_val.csv'),
                 },
             ),
             datasets.SplitGenerator(
                 name=datasets.Split.TRAIN,
                 gen_kwargs={
-                    'filepath': os.path.join(
-                        data_dir, 'data', 'dev', f'{task_name}_dev.csv'
-                    ),
+                    'filepath': os.path.join(data_dir, 'data', 'dev', f'{task_name}_dev.csv'),
                 },
            ),
         ]
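The builder above registers one config per task_list subject and exposes input/A/B/C/D/target columns. A usage sketch, assuming the script and its CSV data are available locally (the path and subject name here are illustrative, not from the diff):

    # Hedged sketch: loading one MMLU subject through the builder above.
    # 'path/to/mmlu.py' and 'abstract_algebra' are illustrative values.
    import datasets

    ds = datasets.load_dataset('path/to/mmlu.py', 'abstract_algebra', split='test')
    row = ds[0]
    print(row['input'], row['A'], row['target'])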
evalscope/benchmarks/mmlu/mmlu_adapter.py
@@ -4,8 +4,9 @@ import os
 
 from evalscope.benchmarks.data_adapter import DataAdapter
 from evalscope.metrics.metrics import exact_match, weighted_mean
-from evalscope.utils import …
+from evalscope.utils import ResponseParser, normalize_score
 from evalscope.utils.logger import get_logger
+
 # flake8: noqa
 
 logger = get_logger()
@@ -72,65 +73,65 @@ SUBSET_LIST = [
     'college_biology',
 ]
 
-…
+SUBJECT_MAPPING = {
+    'abstract_algebra': ['Abstract Algebra', 'math', 'STEM'],
+    'anatomy': ['Anatomy', 'health', 'Other'],
+    'astronomy': ['Astronomy', 'physics', 'STEM'],
+    'business_ethics': ['Business Ethics', 'business', 'Other'],
+    'clinical_knowledge': ['Clinical Knowledge', 'health', 'Other'],
+    'college_biology': ['College Biology', 'biology', 'STEM'],
+    'college_chemistry': ['College Chemistry', 'chemistry', 'STEM'],
+    'college_computer_science': ['College Computer Science', 'computer science', 'STEM'],
+    'college_mathematics': ['College Mathematics', 'math', 'STEM'],
+    'college_medicine': ['College Medicine', 'health', 'Other'],
+    'college_physics': ['College Physics', 'physics', 'STEM'],
+    'computer_security': ['Computer Security', 'computer science', 'STEM'],
+    'conceptual_physics': ['Conceptual Physics', 'physics', 'STEM'],
+    'econometrics': ['Econometrics', 'economics', 'Social Science'],
+    'electrical_engineering': ['Electrical Engineering', 'engineering', 'STEM'],
+    'elementary_mathematics': ['Elementary Mathematics', 'math', 'STEM'],
+    'formal_logic': ['Formal Logic', 'philosophy', 'Humanities'],
+    'global_facts': ['Global Facts', 'other', 'Other'],
+    'high_school_biology': ['High School Biology', 'biology', 'STEM'],
+    'high_school_chemistry': ['High School Chemistry', 'chemistry', 'STEM'],
+    'high_school_computer_science': ['High School Computer Science', 'computer science', 'STEM'],
+    'high_school_european_history': ['High School European History', 'history', 'Humanities'],
+    'high_school_geography': ['High School Geography', 'geography', 'Social Science'],
+    'high_school_government_and_politics': ['High School Government And Politics', 'politics', 'Social Science'],
+    'high_school_macroeconomics': ['High School Macroeconomics', 'economics', 'Social Science'],
+    'high_school_mathematics': ['High School Mathematics', 'math', 'STEM'],
+    'high_school_microeconomics': ['High School Microeconomics', 'economics', 'Social Science'],
+    'high_school_physics': ['High School Physics', 'physics', 'STEM'],
+    'high_school_psychology': ['High School Psychology', 'psychology', 'Social Science'],
+    'high_school_statistics': ['High School Statistics', 'math', 'STEM'],
+    'high_school_us_history': ['High School Us History', 'history', 'Humanities'],
+    'high_school_world_history': ['High School World History', 'history', 'Humanities'],
+    'human_aging': ['Human Aging', 'health', 'Other'],
+    'human_sexuality': ['Human Sexuality', 'culture', 'Social Science'],
+    'international_law': ['International Law', 'law', 'Humanities'],
+    'jurisprudence': ['Jurisprudence', 'law', 'Humanities'],
+    'logical_fallacies': ['Logical Fallacies', 'philosophy', 'Humanities'],
+    'machine_learning': ['Machine Learning', 'computer science', 'STEM'],
+    'management': ['Management', 'business', 'Other'],
+    'marketing': ['Marketing', 'business', 'Other'],
+    'medical_genetics': ['Medical Genetics', 'health', 'Other'],
+    'miscellaneous': ['Miscellaneous', 'other', 'Other'],
+    'moral_disputes': ['Moral Disputes', 'philosophy', 'Humanities'],
+    'moral_scenarios': ['Moral Scenarios', 'philosophy', 'Humanities'],
+    'nutrition': ['Nutrition', 'health', 'Other'],
+    'philosophy': ['Philosophy', 'philosophy', 'Humanities'],
+    'prehistory': ['Prehistory', 'history', 'Humanities'],
+    'professional_accounting': ['Professional Accounting', 'other', 'Other'],
+    'professional_law': ['Professional Law', 'law', 'Humanities'],
+    'professional_medicine': ['Professional Medicine', 'health', 'Other'],
+    'professional_psychology': ['Professional Psychology', 'psychology', 'Social Science'],
+    'public_relations': ['Public Relations', 'politics', 'Social Science'],
+    'security_studies': ['Security Studies', 'politics', 'Social Science'],
+    'sociology': ['Sociology', 'culture', 'Social Science'],
+    'us_foreign_policy': ['Us Foreign Policy', 'politics', 'Social Science'],
+    'virology': ['Virology', 'health', 'Other'],
+    'world_religions': ['World Religions', 'philosophy', 'Humanities'],
+}
 
 
 class MMLUAdapter(DataAdapter):
@@ -160,12 +161,13 @@ class MMLUAdapter(DataAdapter):
             logger.warning(f'few_shot_num <= 5 for MMLU, but got {few_shot_num}. Use 5-shot by default.')
             few_shot_num = 5
 
-        super().__init__(
-            …
+        super().__init__(
+            subset_list=subset_list,
+            metric_list=metric_list,
+            few_shot_num=few_shot_num,
+            train_split=train_split,
+            eval_split=eval_split,
+            **kwargs)
 
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
@@ -227,8 +229,7 @@ class MMLUAdapter(DataAdapter):
 
         """
         prompt = 'The following are multiple choice questions (with answers) about {}.\n\n'.format(
-            self._format_subject(subset_name)
-        )
+            self._format_subject(subset_name))
         few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
 
         context: str = '\n'.join(few_shot_prompts) + '\n'
@@ -335,19 +336,26 @@
             domain_weighted_avg_acc = sum([score * num for _, score, num in domain_res_list]) / \
                 sum([num for _, _, num in domain_res_list])
             domain_weighted_avg_acc = normalize_score(score=domain_weighted_avg_acc)
-            category_list.append({
-                …
+            category_list.append({
+                'name':
+                domain_name,
+                'score':
+                domain_weighted_avg_acc,
+                'subset': [{
+                    'name': subset_name,
+                    'score': normalize_score(score=subset_score)
+                } for subset_name, subset_score, _ in domain_res_list]
+            })
 
         category_list = sorted(category_list, key=lambda x: x['name'])
 
         # Get final dict of report
-        res_map = dict(
-            …
+        res_map = dict(
+            name=report_name or 'mmlu',
+            metric=self.metric_list[0]['name'],
+            score=weighted_avg_acc,
+            category=category_list,
+            total_num=total_num)
 
         return res_map
 
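The report hunk above micro-averages subset accuracies into a per-domain score, weighting each subset by its example count. A worked example with made-up numbers:

    # Illustrative numbers only: the micro-averaged domain score computed above.
    domain_res_list = [('high_school_physics', 0.42, 151), ('college_physics', 0.30, 102)]
    acc = (sum(score * num for _, score, num in domain_res_list)
           / sum(num for _, _, num in domain_res_list))
    print(round(acc, 4))  # 0.3716, weighted toward the larger subset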
evalscope/benchmarks/race/__init__.py
@@ -1,5 +1,6 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-from evalscope.benchmarks.race.race_adapter import DATASET_ID, SUBJECT_MAPPING, SUBSET_LIST
+from evalscope.benchmarks.race.race_adapter import DATASET_ID, SUBJECT_MAPPING, SUBSET_LIST
+from evalscope.benchmarks.race.race_adapter import RACEAdapter
 from evalscope.benchmarks.race.race_adapter import RACEAdapter as DataAdapterClass
-from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass
+from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass  # noqa
evalscope/benchmarks/race/race.py
@@ -11,12 +11,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import os
-
 import datasets
+import os
 import pandas as pd
 
-
 _CITATION = """\
 @inproceedings{lai-etal-2017-race,
     title = "{RACE}: Large-scale {R}e{A}ding Comprehension Dataset From Examinations",
@@ -40,39 +38,33 @@ _DESCRIPTION = """\
 RACE is a large-scale reading comprehension dataset with more than 28,000 passages and nearly 100,000 questions.
 """
 
-_HOMEPAGE = …
+_HOMEPAGE = 'https://modelscope.cn/datasets/modelscope/race/summary'
 
-_URL = …
+_URL = 'https://modelscope.cn/api/v1/datasets/modelscope/race/repo?Revision=master&FilePath=race.zip'
 
 task_list = [
-    …
+    'high',
+    'middle',
 ]
 
 
 class RACEConfig(datasets.BuilderConfig):
+
     def __init__(self, **kwargs):
-        super().__init__(version=datasets.Version(…
+        super().__init__(version=datasets.Version('1.0.0'), **kwargs)
 
 
 class RACE(datasets.GeneratorBasedBuilder):
-    BUILDER_CONFIGS = [
-        RACEConfig(
-            name=task_name,
-        )
-        for task_name in task_list
-    ]
+    BUILDER_CONFIGS = [RACEConfig(name=task_name, ) for task_name in task_list]
 
     def _info(self):
-        features = datasets.Features(
-            …
-            }
-        )
+        features = datasets.Features({
+            'example_id': datasets.Value('string'),
+            'article': datasets.Value('string'),
+            'answer': datasets.Value('string'),
+            'question': datasets.Value('string'),
+            'options': [datasets.Value('string')],
+        })
         return datasets.DatasetInfo(
             description=_DESCRIPTION,
             features=features,
@@ -87,32 +79,26 @@ class RACE(datasets.GeneratorBasedBuilder):
             datasets.SplitGenerator(
                 name=datasets.Split.TEST,
                 gen_kwargs={
-                    …
-                        data_dir, f"race/test/{task_name}-00000-of-00001.parquet"
-                    ),
+                    'filepath': os.path.join(data_dir, f'race/test/{task_name}-00000-of-00001.parquet'),
                 },
             ),
             datasets.SplitGenerator(
                 name=datasets.Split.VALIDATION,
                 gen_kwargs={
-                    …
-                        data_dir, f"race/val/{task_name}-00000-of-00001.parquet"
-                    ),
+                    'filepath': os.path.join(data_dir, f'race/val/{task_name}-00000-of-00001.parquet'),
                 },
             ),
             datasets.SplitGenerator(
                 name=datasets.Split.TRAIN,
                 gen_kwargs={
-                    …
-                        data_dir, f"race/train/{task_name}-00000-of-00001.parquet"
-                    ),
+                    'filepath': os.path.join(data_dir, f'race/train/{task_name}-00000-of-00001.parquet'),
                 },
             ),
         ]
 
     def _generate_examples(self, filepath):
         df = pd.read_parquet(filepath)
-        df.columns = […
+        df.columns = ['example_id', 'article', 'answer', 'question', 'options']
 
-        for i, instance in enumerate(df.to_dict(orient=…
-            yield i, instance
+        for i, instance in enumerate(df.to_dict(orient='records')):
+            yield i, instance
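The _generate_examples change above reads a parquet split, renames the columns, and yields one dict per row. A sketch of the same pattern outside the builder (the file path is illustrative, not from the diff):

    # Hedged sketch of the row-to-example pattern used above; path is illustrative.
    import pandas as pd

    df = pd.read_parquet('race/test/high-00000-of-00001.parquet')
    df.columns = ['example_id', 'article', 'answer', 'question', 'options']
    for i, instance in enumerate(df.to_dict(orient='records')):
        print(i, instance['example_id'], instance['answer'])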
evalscope/benchmarks/race/race_adapter.py
@@ -1,26 +1,22 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-import os
 import json
+import os
+
 from evalscope.benchmarks.data_adapter import DataAdapter
 from evalscope.metrics.metrics import exact_match, weighted_mean
-from evalscope.utils import …
+from evalscope.utils import jsonl_to_list, normalize_score
 from evalscope.utils.logger import get_logger
+
 # flake8: noqa
 
 logger = get_logger()
 
 DATASET_ID = 'modelscope/race'
 
-SUBSET_LIST = [
-    "high",
-    "middle"
-]
-
+SUBSET_LIST = ['high', 'middle']
 
-SUBJECT_MAPPING = {
-    "middle": "Middle"
-}
+SUBJECT_MAPPING = {'high': 'High', 'middle': 'Middle'}
 
 
 class RACEAdapter(DataAdapter):
@@ -49,12 +45,13 @@ class RACEAdapter(DataAdapter):
             logger.warning(f'few_shot_num <= 3 for RACE, but got {few_shot_num}. Use 3-shot by default.')
             few_shot_num = 3
 
-        super().__init__(
-            …
+        super().__init__(
+            subset_list=subset_list,
+            metric_list=metric_list,
+            few_shot_num=few_shot_num,
+            train_split=train_split,
+            eval_split=eval_split,
+            **kwargs)
 
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
@@ -92,8 +89,7 @@ class RACEAdapter(DataAdapter):
 
         """
         prompt = 'The following are multiple choice reading comprehension questions (with answers).\n\n'.format(
-            self._format_subject(subset_name)
-        )
+            self._format_subject(subset_name))
         few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
 
         context: str = '\n'.join(few_shot_prompts) + '\n'
@@ -122,9 +118,9 @@ class RACEAdapter(DataAdapter):
         """
         if eval_type == 'checkpoint':
             return result
-        elif eval_type == 'service':
+        elif eval_type == 'service':  # TODO: to be implemented
             return result
-        elif eval_type == 'custom':
+        elif eval_type == 'custom':  # TODO: to be implemented
             return result
         else:
             raise ValueError(f'Unknown eval_type: {eval_type}')
@@ -191,17 +187,24 @@ class RACEAdapter(DataAdapter):
             domain_weighted_avg_acc = sum([score * num for _, score, num in domain_res_list]) / \
                 sum([num for _, _, num in domain_res_list])
             domain_weighted_avg_acc = normalize_score(score=domain_weighted_avg_acc)
-            category_list.append({
-                …
+            category_list.append({
+                'name':
+                domain_name,
+                'score':
+                normalize_score(score=domain_weighted_avg_acc),
+                'subset': [{
+                    'name': subset_name,
+                    'score': subset_score
+                } for subset_name, subset_score, _ in domain_res_list]
+            })
 
         # Get final dict of report
-        res_map = dict(
-            …
+        res_map = dict(
+            name=report_name or 'race',
+            metric=self.metric_list[0]['name'],
+            score=weighted_avg_acc,
+            category=category_list,
+            total_num=total_num)
 
         return res_map
 
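After the report hunk above runs, res_map nests each subset's score under its category. A shape sketch with invented values (the metric name is an assumption; the diff only shows it read from self.metric_list):

    # Shape sketch only; all numbers are invented, metric name is assumed.
    res_map = dict(
        name='race',
        metric='WeightedAverageAccuracy',  # assumption, not shown in this diff
        score=0.6543,
        category=[{
            'name': 'High',
            'score': 0.6789,
            'subset': [{'name': 'high', 'score': 0.6789}],
        }],
        total_num=200)
    print(res_map['category'][0]['subset'])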
evalscope/benchmarks/race/samples.jsonl
@@ -2,4 +2,4 @@
 {'example_id': 'middle3329.txt', 'article': 'Do you know why diff...ng at all.', 'answer': 'B', 'question': 'Those pests with dif...of danger.', 'options': ['change their colours', 'hide in the day time...r at night', 'move quietly', 'hide at night and ap...e day time']}
 {'example_id': 'middle3614.txt', 'article': 'The seahorse is a ve...o the sea.', 'answer': 'B', 'question': 'A seahorse eats _ .', 'options': ['sea weed', 'small fish', 'water', 'nothing']}
 {'example_id': 'middle6632.txt', 'article': 'Kids have unbelievab...h at her."', 'answer': 'D', 'question': 'Which is NOT mention...e passage?', 'options': ['Robots keep secrets.', 'Robots give suggestions.', 'Robots do chores.', 'Robots make movies.']}
-{'example_id': 'middle3503.txt', 'article': 'Have you ever heard ...eir lives.', 'answer': 'B', 'question': 'Which of the followi...lue moon"?', 'options': ['Simon often tells jo...blue moon.', 'Tom rarely remembers...blue moon.', 'Mary likes to go sho...blue moon.', 'Cindy hates to stay ...blue moon.']}
+{'example_id': 'middle3503.txt', 'article': 'Have you ever heard ...eir lives.', 'answer': 'B', 'question': 'Which of the followi...lue moon"?', 'options': ['Simon often tells jo...blue moon.', 'Tom rarely remembers...blue moon.', 'Mary likes to go sho...blue moon.', 'Cindy hates to stay ...blue moon.']}
evalscope/benchmarks/trivia_qa/__init__.py
@@ -1,5 +1,6 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-from evalscope.benchmarks.trivia_qa.trivia_qa_adapter import …
+from evalscope.benchmarks.trivia_qa.trivia_qa_adapter import DATASET_ID, SUBSET_LIST
+from evalscope.benchmarks.trivia_qa.trivia_qa_adapter import TriviaQaAdapter
 from evalscope.benchmarks.trivia_qa.trivia_qa_adapter import TriviaQaAdapter as DataAdapterClass
-from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass
+from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass  # noqa
evalscope/benchmarks/trivia_qa/samples.jsonl
@@ -2,4 +2,4 @@
 {"input": [{"role": "system", "content": "Follow the given examples and answer the question."}, {"role": "user", "content": "Which Lloyd Webber musical premiered in the US on 10th December 1993?"}], "ideal": ["Sunset Blvd", "West Sunset Boulevard", "Sunset Boulevard", "Sunset Bulevard", "Sunset Blvd.", "sunset boulevard", "sunset bulevard", "west sunset boulevard", "sunset blvd"]}
 {"input": [{"role": "system", "content": "Follow the given examples and answer the question."}, {"role": "user", "content": "Who was the next British Prime Minister after Arthur Balfour?"}], "ideal": ["Sir Henry Campbell-Bannerman", "Campbell-Bannerman", "Campbell Bannerman", "Sir Henry Campbell Bannerman", "Henry Campbell Bannerman", "Henry Campbell-Bannerman", "henry campbell bannerman", "sir henry campbell bannerman", "campbell bannerman"]}
 {"input": [{"role": "system", "content": "Follow the given examples and answer the question."}, {"role": "user", "content": "Who had a 70s No 1 hit with Kiss You All Over?"}], "ideal": ["Internal exile", "Exiles", "Transported for life", "Exile (politics and government)", "Voluntary exile", "Sent into exile", "Exile and Banishment", "Self-exile", "Forced exile", "Exile", "Exile in Greek tragedy", "Banish", "Banishment", "exiles", "voluntary exile", "forced exile", "banish", "self exile", "exile politics and government", "exile in greek tragedy", "sent into exile", "banishment", "transported for life", "exile", "internal exile", "exile and banishment"]}
-{"input": [{"role": "system", "content": "Follow the given examples and answer the question."}, {"role": "user", "content": "What claimed the life of singer Kathleen Ferrier?"}], "ideal": ["Cancer pathology", "Deaths by cancer", "Anti-cancer", "Cancer (disease)", "Cancerophobia", "Malignant lesion", "Cancer medication", "Malignant tumors", "Cancer signs", "Malignant neoplasm", "Invasive (cancer)", "Malignant Neoplasms", "Malignant growth", "Sporadic cancer", "Malignant cancer", "Tumour virus", "Cancer en cuirasse", "Microtumor", "Malignant neoplasms", "Malignant tumour", "Carcinophobia", "Malignacy", "Cancer patient", "Epithelial cancers", "Solid cancer", "Cancers", "Tumor medication", "Malignant neoplastic disease", "AIDS-related cancer", "Invasive cancer", "Cancer therapy", "Cancerous tumor", "Cancer", "Financial toxicity", "Cancer diagnosis", "Cancer (medicine)", "Malignant tumor", "Cancerous", "Borderline (cancer)", "Signs of cancer", "Malignancies", "Cancer aromatase", "aids related cancer", "sporadic cancer", "cancer disease", "malignant tumors", "cancers", "carcinophobia", "cancer", "cancer diagnosis", "malignant neoplastic disease", "malignant neoplasm", "tumour virus", "cancer medicine", "deaths by cancer", "malignant tumour", "epithelial cancers", "solid cancer", "cancerous", "borderline cancer", "invasive cancer", "anti cancer", "cancer pathology", "cancer signs", "cancer aromatase", "cancer therapy", "financial toxicity", "cancerophobia", "cancer en cuirasse", "cancer patient", "cancerous tumor", "malignant cancer", "malignant neoplasms", "tumor medication", "signs of cancer", "malignacy", "malignant tumor", "cancer medication", "microtumor", "malignancies", "malignant lesion", "malignant growth"]}
+{"input": [{"role": "system", "content": "Follow the given examples and answer the question."}, {"role": "user", "content": "What claimed the life of singer Kathleen Ferrier?"}], "ideal": ["Cancer pathology", "Deaths by cancer", "Anti-cancer", "Cancer (disease)", "Cancerophobia", "Malignant lesion", "Cancer medication", "Malignant tumors", "Cancer signs", "Malignant neoplasm", "Invasive (cancer)", "Malignant Neoplasms", "Malignant growth", "Sporadic cancer", "Malignant cancer", "Tumour virus", "Cancer en cuirasse", "Microtumor", "Malignant neoplasms", "Malignant tumour", "Carcinophobia", "Malignacy", "Cancer patient", "Epithelial cancers", "Solid cancer", "Cancers", "Tumor medication", "Malignant neoplastic disease", "AIDS-related cancer", "Invasive cancer", "Cancer therapy", "Cancerous tumor", "Cancer", "Financial toxicity", "Cancer diagnosis", "Cancer (medicine)", "Malignant tumor", "Cancerous", "Borderline (cancer)", "Signs of cancer", "Malignancies", "Cancer aromatase", "aids related cancer", "sporadic cancer", "cancer disease", "malignant tumors", "cancers", "carcinophobia", "cancer", "cancer diagnosis", "malignant neoplastic disease", "malignant neoplasm", "tumour virus", "cancer medicine", "deaths by cancer", "malignant tumour", "epithelial cancers", "solid cancer", "cancerous", "borderline cancer", "invasive cancer", "anti cancer", "cancer pathology", "cancer signs", "cancer aromatase", "cancer therapy", "financial toxicity", "cancerophobia", "cancer en cuirasse", "cancer patient", "cancerous tumor", "malignant cancer", "malignant neoplasms", "tumor medication", "signs of cancer", "malignacy", "malignant tumor", "cancer medication", "microtumor", "malignancies", "malignant lesion", "malignant growth"]}