crfm-helm 0.5.8__py3-none-any.whl → 0.5.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic.
- {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/METADATA +3 -1
- {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/RECORD +117 -115
- helm/benchmark/adaptation/adapter_spec.py +5 -0
- helm/benchmark/metrics/bbq_metrics.py +12 -0
- helm/benchmark/metrics/evaluate_reference_metrics.py +12 -0
- helm/benchmark/metrics/safety_metrics.py +13 -1
- helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
- helm/benchmark/presentation/run_display.py +13 -3
- helm/benchmark/presentation/run_entry.py +2 -2
- helm/benchmark/run.py +1 -1
- helm/benchmark/run_specs/arabic_run_specs.py +6 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +2 -2
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +6 -2
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +24 -54
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +19 -48
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -61
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -29
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -60
- helm/benchmark/scenarios/banking77_scenario.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
- helm/benchmark/scenarios/commonsense_scenario.py +7 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
- helm/benchmark/scenarios/financebench_scenario.py +21 -0
- helm/benchmark/scenarios/gsm_scenario.py +9 -3
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
- helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +21 -0
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
- helm/benchmark/scenarios/legalbench_scenario.py +6 -7
- helm/benchmark/scenarios/math_scenario.py +11 -4
- helm/benchmark/scenarios/med_qa_scenario.py +7 -1
- helm/benchmark/scenarios/medi_qa_scenario.py +2 -2
- helm/benchmark/scenarios/mmlu_scenario.py +8 -2
- helm/benchmark/scenarios/narrativeqa_scenario.py +3 -4
- helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
- helm/benchmark/scenarios/spider_scenario.py +18 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +9 -2
- helm/benchmark/static/schema_long_context.yaml +12 -31
- helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
- helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
- helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
- helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
- helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
- helm/benchmark/static_build/index.html +5 -6
- helm/clients/ai21_client.py +2 -0
- helm/clients/aleph_alpha_client.py +2 -0
- helm/clients/anthropic_client.py +7 -1
- helm/clients/audio_language/diva_llama_client.py +2 -0
- helm/clients/audio_language/llama_omni_client.py +2 -1
- helm/clients/audio_language/qwen2_5_omni_client.py +2 -1
- helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
- helm/clients/audio_language/qwen_audiolm_client.py +2 -1
- helm/clients/bedrock_client.py +2 -0
- helm/clients/cohere_client.py +3 -0
- helm/clients/google_client.py +2 -0
- helm/clients/http_model_client.py +2 -0
- helm/clients/huggingface_client.py +2 -1
- helm/clients/ibm_client.py +3 -1
- helm/clients/image_generation/adobe_vision_client.py +2 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
- helm/clients/image_generation/cogview2_client.py +2 -1
- helm/clients/image_generation/dalle2_client.py +2 -0
- helm/clients/image_generation/dalle_mini_client.py +2 -1
- helm/clients/image_generation/deep_floyd_client.py +2 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
- helm/clients/image_generation/lexica_client.py +2 -0
- helm/clients/image_generation/mindalle_client.py +2 -1
- helm/clients/image_generation/together_image_generation_client.py +2 -0
- helm/clients/megatron_client.py +2 -0
- helm/clients/mistral_client.py +2 -0
- helm/clients/moderation_api_client.py +2 -0
- helm/clients/openai_client.py +5 -1
- helm/clients/palmyra_client.py +2 -1
- helm/clients/reka_client.py +2 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
- helm/clients/stanfordhealthcare_http_model_client.py +2 -0
- helm/clients/together_client.py +4 -0
- helm/clients/vertexai_client.py +4 -0
- helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
- helm/clients/vision_language/huggingface_vlm_client.py +2 -0
- helm/clients/vision_language/idefics_client.py +2 -1
- helm/clients/vision_language/open_flamingo_client.py +2 -1
- helm/clients/vision_language/paligemma_client.py +2 -1
- helm/clients/vision_language/palmyra_vision_client.py +2 -0
- helm/clients/vision_language/qwen2_vlm_client.py +2 -1
- helm/clients/vision_language/qwen_vlm_client.py +2 -1
- helm/clients/writer_client.py +2 -0
- helm/common/hierarchical_logger.py +20 -0
- helm/common/optional_dependencies.py +1 -1
- helm/common/test_general.py +4 -0
- helm/config/model_deployments.yaml +225 -0
- helm/config/model_metadata.yaml +232 -7
- helm/config/tokenizer_configs.yaml +74 -4
- helm/benchmark/static_build/assets/index-671a5e06.js +0 -10
- helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
- helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
- helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
- {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/top_level.txt +0 -0
- /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
- /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
- /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
- /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
- /helm/benchmark/static_build/assets/{index-9352595e.css → index-oIeiQW2g.css} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
- /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
- /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
helm/benchmark/scenarios/mmlu_scenario.py

@@ -134,8 +134,14 @@ class MMLUScenario(Scenario):
             short_display_name="MMLU",
             description="The Massive Multitask Language Understanding (MMLU) benchmark for "
             "knowledge-intensive question answering across 57 domains [(Hendrycks et al., "
-            "2021)](https://
-            taxonomy=TaxonomyInfo(
+            "2021)](https://arxiv.org/pdf/2009.03300.pdf).",
+            taxonomy=TaxonomyInfo(
+                task="multiple-choice question answering",
+                what="math, science, history, etc.",
+                when="before 2021",
+                who="various online sources",
+                language="English",
+            ),
             main_metric="exact_match",
             main_split="test",
         )
helm/benchmark/scenarios/narrativeqa_scenario.py

@@ -162,14 +162,13 @@ class NarrativeQAScenario(Scenario):
         return ScenarioMetadata(
             name="narrative_qa",
             display_name="NarrativeQA",
-            short_display_name=None,
             description="The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský "
             "et al., 2017)](https://aclanthology.org/Q18-1023/).",
             taxonomy=TaxonomyInfo(
-                task="question answering",
+                task="short-answer question answering",
                 what="passages are books and movie scripts, questions are unknown",
-                when="
-                who="
+                when="2018",
+                who="annotators from summaries",
                 language="English",
             ),
             main_metric="f1_score",
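Taken together, the MMLU and NarrativeQA hunks show the pattern this release applies across scenarios: get_metadata() now returns a ScenarioMetadata whose taxonomy field is a TaxonomyInfo. A minimal sketch of the same pattern for a hypothetical new scenario, assuming the usual HELM Scenario interface (class-level name, description, tags and a get_instances method); the class name and every field value below are illustrative, not part of this release:

from typing import List

from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
from helm.benchmark.scenarios.scenario import Instance, Scenario, ScenarioMetadata


class MyQAScenario(Scenario):
    """Hypothetical scenario, used only to illustrate the 0.5.9 metadata pattern."""

    name = "my_qa"
    description = "Illustrative question answering scenario."
    tags = ["question_answering"]

    def get_instances(self, output_path: str) -> List[Instance]:
        # Instance construction is out of scope for this sketch.
        return []

    def get_metadata(self) -> ScenarioMetadata:
        # Field names mirror the ones added throughout this diff:
        # name, display_name, description, taxonomy, main_metric, main_split.
        return ScenarioMetadata(
            name="my_qa",
            display_name="My QA",
            description="Illustrative question answering scenario.",
            taxonomy=TaxonomyInfo(
                task="question answering",
                what="example passages",
                when="2024",
                who="example annotators",
                language="English",
            ),
            main_metric="exact_match",
            main_split="test",
        )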
helm/benchmark/scenarios/openai_mrcr_scenario.py

@@ -6,6 +6,7 @@ from typing import List, Optional
 import datasets
 import tiktoken
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Output,
@@ -14,6 +15,7 @@ from helm.benchmark.scenarios.scenario import (
     Instance,
     TEST_SPLIT,
     Input,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_directory_exists
 
@@ -77,3 +79,16 @@ class OpenAIMRCRScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="openai_mrcr",
+            display_name="OpenAI MRCR",
+            description="OpenAI MRCR (Multi-round co-reference resolution) is a long context dataset "
+            "for benchmarking an LLM's ability to distinguish between multiple needles "
+            "hidden in context. This eval is inspired by the MRCR eval first introduced by "
+            "[Vodrahalli et al., 2024](https://arxiv.org/pdf/2409.12640v2).",
+            taxonomy=TaxonomyInfo(task="MRCR", what="Synthetic data", when="2025", who="None", language="English"),
+            main_metric="openai_mrcr_accuracy",
+            main_split="test",
+        )
helm/benchmark/scenarios/ruler_qa_scenarios.py

@@ -1,6 +1,7 @@
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_directory_exists, ensure_file_downloaded
 from helm.benchmark.scenarios.ruler_qa_scenario_helper import generate_samples  # type: ignore
 from helm.benchmark.scenarios.scenario import (
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -78,6 +80,25 @@ class RULERHotpotQAScenario(_RULERQAScenario):
     def __init__(self, max_num_words: int):
         super().__init__("hotpotqa", max_num_words)
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="ruler_hotpotqa",
+            display_name="RULER HotPotQA",
+            description="RULER HotPotQA is an augmented version of HotPotQA ([Yang et al., "
+            "2018](https://arxiv.org/abs/1809.09600)) introduced by [Hsieh et al., "
+            "2024](https://arxiv.org/abs/2404.06654) to simulate a multi-hop question "
+            "answering as a long-context scenario.",
+            taxonomy=TaxonomyInfo(
+                task="question answering with retrieval-augmented generation",
+                what="Wikipedia articles",
+                when="Before 2018",
+                who="Wikipedia authors",
+                language="English",
+            ),
+            main_metric="ruler_string_match_part",
+            main_split="valid",
+        )
+
 
 class RULERSQuADScenario(_RULERQAScenario):
     name = "ruler_squad"
@@ -86,3 +107,22 @@ class RULERSQuADScenario(_RULERQAScenario):
 
     def __init__(self, max_num_words: int):
         super().__init__("squad", max_num_words)
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="ruler_squad",
+            display_name="RULER SQuAD",
+            description="RULER SQuAD is an augmented version of SQuAD ([Rajpurkar et al., "
+            "2018](https://arxiv.org/abs/1806.03822)) introduced by [Hsieh et al., "
+            "2024](https://arxiv.org/abs/2404.06654) to simulate a single-hop question "
+            "answering as a long-context scenario.",
+            taxonomy=TaxonomyInfo(
+                task="question answering",
+                what="Wikipedia articles",
+                when="Before 2018",
+                who="Wikipedia authors and crowdworkers",
+                language="English",
+            ),
+            main_metric="ruler_string_match_part",
+            main_split="valid",
+        )
helm/benchmark/scenarios/simple_safety_tests_scenario.py

@@ -1,7 +1,8 @@
 from typing import List
 from datasets import load_dataset
 
-from helm.benchmark.
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT, Reference, Output, ScenarioMetadata
 
 
 class SimpleSafetyTestsScenario(Scenario):
@@ -31,3 +32,13 @@ class SimpleSafetyTestsScenario(Scenario):
             instance = Instance(input=input, references=references, split=TEST_SPLIT)
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="simple_safety_tests",
+            display_name="SimpleSafetyTests",
+            description="SimpleSafetyTests",
+            taxonomy=TaxonomyInfo(task="question answering", what="n/a", when="n/a", who="n/a", language="English"),
+            main_metric="safety_score",
+            main_split="test",
+        )
helm/benchmark/scenarios/spider_scenario.py

@@ -4,6 +4,7 @@ from typing import Dict, List
 
 from filelock import FileLock
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_directory_exists, ensure_file_downloaded, shell
 from helm.common.hierarchical_logger import hlog
 from helm.benchmark.scenarios.bird_sql_scenario_helper import (  # type: ignore
@@ -17,6 +18,7 @@ from helm.benchmark.scenarios.scenario import (
     VALID_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -89,3 +91,19 @@ INSERT_YOUR_SQL_QUERY_HERE
             )
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="spider",
+            display_name="Spider 1.0 (Test)",
+            description="Spider 1.0 (Test)",
+            taxonomy=TaxonomyInfo(
+                task="text-to-SQL",
+                what="databases from various domains",
+                when="?",
+                who="expert data scientists",
+                language="English",
+            ),
+            main_metric="execution_accuracy",
+            main_split="valid",
+        )
helm/benchmark/scenarios/thai_exam_scenario.py

@@ -2,6 +2,7 @@ import os
 from typing import Dict, List
 import json
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.common.hierarchical_logger import hlog
 from helm.benchmark.scenarios.scenario import (
@@ -13,6 +14,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -142,3 +144,96 @@ class ThaiExamScenario(Scenario):
             instances.extend(self.process_jsonl(jsonl_path, splits[split]))
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        if self.exam == "onet":
+            return ScenarioMetadata(
+                name="thai_exam_onet",
+                display_name="ONET",
+                description="The Ordinary National Educational Test (ONET) is an examination for students "
+                "in Thailand. We select the grade-12 ONET exam, which comprises 5 subjects and "
+                "each question has 5 choices. These subjects are Thai, English, Mathematics, "
+                "Social Studies, and Science. Amounting to a total of 170 questions and "
+                "options.\n",
+                taxonomy=TaxonomyInfo(
+                    task="question answering",
+                    what="high school / medical school academic knowledge",
+                    when="?",
+                    who="n/a",
+                    language="Thai and English",
+                ),
+                main_metric="exact_match",
+                main_split="test",
+            )
+        elif self.exam == "ic":
+            return ScenarioMetadata(
+                name="thai_exam_ic",
+                display_name="IC",
+                description="The Investment Consultant (IC) examination, a licensing test for investment "
+                "professionals in Thailand. Developed by the Stock Exchange of Thailand (SET), "
+                "features 4 choices per question. We extracted questions for levels 1, 2, and 3 "
+                "resulting in a total of 95 questions and options.\n",
+                taxonomy=TaxonomyInfo(
+                    task="question answering",
+                    what="licensing for investment professionals",
+                    when="?",
+                    who="n/a",
+                    language="Thai",
+                ),
+                main_metric="exact_match",
+                main_split="test",
+            )
+        elif self.exam == "tgat":
+            return ScenarioMetadata(
+                name="thai_exam_tgat",
+                display_name="TGAT",
+                description="The Thai General Aptitude Test (TGAT), a national high school examination in "
+                "Thailand. Focuses on critical and logical thinking skills. We collected a "
+                "total of 90 questions and answers. The TGAT consists of four choices per "
+                "question.\n",
+                taxonomy=TaxonomyInfo(
+                    task="question answering",
+                    what="high school level questions on reasoning",
+                    when="?",
+                    who="n/a",
+                    language="English",
+                ),
+                main_metric="exact_match",
+                main_split="test",
+            )
+        elif self.exam == "tpat1":
+            return ScenarioMetadata(
+                name="thai_exam_tpat1",
+                display_name="TPAT-1",
+                description="TBD",
+                taxonomy=TaxonomyInfo(
+                    task="question answering",
+                    what="high school / medical school academic knowledge",
+                    when="?",
+                    who="n/a",
+                    language="Thai",
+                ),
+                main_metric="exact_match",
+                main_split="test",
+            )
+        elif self.exam == "a_level":
+            return ScenarioMetadata(
+                name="thai_exam_a_level",
+                display_name="A-Level",
+                description="An academic knowledge assessment examination (Applied Knowledge Level) that "
+                "covers general foundational subjects taught in schools. The content assessed "
+                "in this examination aligns with the curriculum guidelines and emphasizes the "
+                "practical application of knowledge in daily life. We collected a total of 175 "
+                "questions and answers.\n",
+                taxonomy=TaxonomyInfo(
+                    task="question answering",
+                    what="high school academic knowledge",
+                    when="?",
+                    who="n/a",
+                    language="Thai and English",
+                ),
+                main_metric="exact_match",
+                main_split="test",
+            )
+        else:
+            raise ValueError(f"Unknown exam: {self.exam}")
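The ThaiExam hunk follows the same metadata pattern but branches on self.exam so each exam subset reports its own ScenarioMetadata, with a ValueError fallback for unknown keys. A hedged sketch of the same dispatch written as a lookup table rather than an if/elif chain; the helper name metadata_for_exam and the table are illustrative, not part of the release, and only the onet entry is spelled out:

from typing import Dict

from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
from helm.benchmark.scenarios.scenario import ScenarioMetadata

# Illustrative only: the per-exam metadata from the hunk above, expressed as a lookup table.
_EXAM_METADATA: Dict[str, ScenarioMetadata] = {
    "onet": ScenarioMetadata(
        name="thai_exam_onet",
        display_name="ONET",
        description="Grade-12 Thai national exam covering Thai, English, Mathematics, "
        "Social Studies, and Science.",
        taxonomy=TaxonomyInfo(
            task="question answering",
            what="high school academic knowledge",
            when="?",
            who="n/a",
            language="Thai and English",
        ),
        main_metric="exact_match",
        main_split="test",
    ),
    # "ic", "tgat", "tpat1", and "a_level" entries omitted for brevity.
}


def metadata_for_exam(exam: str) -> ScenarioMetadata:
    # Mirrors the fallback in the diff: unknown exam keys raise ValueError.
    try:
        return _EXAM_METADATA[exam]
    except KeyError:
        raise ValueError(f"Unknown exam: {exam}") from None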
helm/benchmark/scenarios/wmt_14_scenario.py

@@ -113,8 +113,15 @@ class WMT14Scenario(Scenario):
         return ScenarioMetadata(
             name="wmt_14",
             display_name="WMT 2014",
-            description="WMT 2014 is a collection of machine translation datasets
-
+            description="WMT 2014 is a collection of machine translation datasets "
+            "[(website)](https://www.statmt.org/wmt14/index.html).",
+            taxonomy=TaxonomyInfo(
+                task="machine translation",
+                what="multilingual sentences",
+                when="before 2014",
+                who="Europarl, news, Common Crawl, etc.",
+                language="English, French, Czech, etc.",
+            ),
             main_metric="bleu_4",
             main_split="test",
         )
helm/benchmark/static/schema_long_context.yaml

@@ -191,16 +191,15 @@ run_groups:
     description: Scenarios for evaluating long context capabilities
     category: All scenarios
     subgroups:
-      - ruler_hotpotqa
       - ruler_squad
-      -
-      # - infinite_bench_en_qa
+      - ruler_hotpotqa
       - infinite_bench_en_mc
+      - infinite_bench_en_sum
       - openai_mrcr
 
-  - name:
-    display_name: RULER
-    description: RULER
+  - name: ruler_squad
+    display_name: RULER SQuAD
+    description: RULER SQuAD is an augmented version of SQuAD ([Rajpurkar et al., 2018](https://arxiv.org/abs/1806.03822)) introduced by [Hsieh et al., 2024](https://arxiv.org/abs/2404.06654) to simulate a single-hop question answering as a long-context scenario.
     metric_groups:
       - accuracy
       - general_information
@@ -209,16 +208,15 @@ run_groups:
       main_name: ruler_string_match_part
       main_split: valid
     taxonomy:
-      task: question answering
+      task: question answering
       what: Wikipedia articles
-      who: Wikipedia authors
+      who: Wikipedia authors and crowdworkers
       when: Before 2018
       language: English
 
-
-
-
-    description: RULER SQuAD is an augmented version of SQuAD ([Rajpurkar et al., 2018](https://arxiv.org/abs/1806.03822)) introduced by [Hsieh et al., 2024](https://arxiv.org/abs/2404.06654) to simulate a single-hop question answering as a long-context scenario.
+  - name: ruler_hotpotqa
+    display_name: RULER HotPotQA
+    description: RULER HotPotQA is an augmented version of HotPotQA ([Yang et al., 2018](https://arxiv.org/abs/1809.09600)) introduced by [Hsieh et al., 2024](https://arxiv.org/abs/2404.06654) to simulate a multi-hop question answering as a long-context scenario.
     metric_groups:
       - accuracy
       - general_information
@@ -227,29 +225,12 @@ run_groups:
       main_name: ruler_string_match_part
       main_split: valid
     taxonomy:
-      task: question answering
+      task: question answering with retrieval-augmented generation
       what: Wikipedia articles
-      who: Wikipedia authors
+      who: Wikipedia authors
       when: Before 2018
       language: English
 
-  # - name: infinite_bench_en_qa
-  #   display_name: ∞Bench En.QA
-  #   description: ∞Bench En.QA is a open-ended question answering task that requires locating and processing information within a novel, performing reasoning through aggregation or filtering to derive answers. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))
-  #   metric_groups:
-  #     - accuracy
-  #     - general_information
-  #     - annotation_metrics
-  #   environment:
-  #     main_name: f1_score
-  #     main_split: test
-  #   taxonomy:
-  #     task: question answering
-  #     what: Novels
-  #     who: Novel authors
-  #     when: Before 2024
-  #     language: English
-
   - name: infinite_bench_en_mc
     display_name: ∞Bench En.MC
     description: ∞Bench En.MC is a multiple-choice question answering task that requires locating and processing information within a novel, performing reasoning through aggregation or filtering to derive answers. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))
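The schema hunks above reorder the long-context subgroups and fill in the previously empty RULER run-group entries. A small sketch for inspecting the result, assuming PyYAML is available; the path is the one shown in the file list (adjust it to your checkout or install location), and only the keys visible in the hunk are assumed:

import yaml  # PyYAML

# Path as it appears in the file list above; adjust to your install location.
SCHEMA_PATH = "helm/benchmark/static/schema_long_context.yaml"

with open(SCHEMA_PATH) as f:
    schema = yaml.safe_load(f)

# Leaf run groups in the hunk carry environment/taxonomy; container groups may
# only list subgroups, hence the .get defaults below.
for group in schema.get("run_groups", []):
    environment = group.get("environment", {})
    taxonomy = group.get("taxonomy", {})
    print(
        group.get("name"),
        environment.get("main_name"),
        environment.get("main_split"),
        taxonomy.get("task"),
    )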
helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png

Binary file