crfm-helm 0.5.8__py3-none-any.whl → 0.5.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (121)
  1. {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/METADATA +3 -1
  2. {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/RECORD +117 -115
  3. helm/benchmark/adaptation/adapter_spec.py +5 -0
  4. helm/benchmark/metrics/bbq_metrics.py +12 -0
  5. helm/benchmark/metrics/evaluate_reference_metrics.py +12 -0
  6. helm/benchmark/metrics/safety_metrics.py +13 -1
  7. helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
  8. helm/benchmark/presentation/run_display.py +13 -3
  9. helm/benchmark/presentation/run_entry.py +2 -2
  10. helm/benchmark/run.py +1 -1
  11. helm/benchmark/run_specs/arabic_run_specs.py +6 -0
  12. helm/benchmark/run_specs/medhelm_run_specs.py +2 -2
  13. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +6 -2
  14. helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
  15. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +24 -54
  16. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +19 -48
  17. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -61
  18. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -29
  19. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -60
  20. helm/benchmark/scenarios/banking77_scenario.py +21 -0
  21. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  22. helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
  23. helm/benchmark/scenarios/commonsense_scenario.py +7 -1
  24. helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
  25. helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
  26. helm/benchmark/scenarios/financebench_scenario.py +21 -0
  27. helm/benchmark/scenarios/gsm_scenario.py +9 -3
  28. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
  29. helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
  30. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +21 -0
  31. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
  32. helm/benchmark/scenarios/legalbench_scenario.py +6 -7
  33. helm/benchmark/scenarios/math_scenario.py +11 -4
  34. helm/benchmark/scenarios/med_qa_scenario.py +7 -1
  35. helm/benchmark/scenarios/medi_qa_scenario.py +2 -2
  36. helm/benchmark/scenarios/mmlu_scenario.py +8 -2
  37. helm/benchmark/scenarios/narrativeqa_scenario.py +3 -4
  38. helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
  39. helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
  40. helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
  41. helm/benchmark/scenarios/spider_scenario.py +18 -0
  42. helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
  43. helm/benchmark/scenarios/wmt_14_scenario.py +9 -2
  44. helm/benchmark/static/schema_long_context.yaml +12 -31
  45. helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
  46. helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
  47. helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
  48. helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
  49. helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
  50. helm/benchmark/static_build/index.html +5 -6
  51. helm/clients/ai21_client.py +2 -0
  52. helm/clients/aleph_alpha_client.py +2 -0
  53. helm/clients/anthropic_client.py +7 -1
  54. helm/clients/audio_language/diva_llama_client.py +2 -0
  55. helm/clients/audio_language/llama_omni_client.py +2 -1
  56. helm/clients/audio_language/qwen2_5_omni_client.py +2 -1
  57. helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
  58. helm/clients/audio_language/qwen_audiolm_client.py +2 -1
  59. helm/clients/bedrock_client.py +2 -0
  60. helm/clients/cohere_client.py +3 -0
  61. helm/clients/google_client.py +2 -0
  62. helm/clients/http_model_client.py +2 -0
  63. helm/clients/huggingface_client.py +2 -1
  64. helm/clients/ibm_client.py +3 -1
  65. helm/clients/image_generation/adobe_vision_client.py +2 -0
  66. helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
  67. helm/clients/image_generation/cogview2_client.py +2 -1
  68. helm/clients/image_generation/dalle2_client.py +2 -0
  69. helm/clients/image_generation/dalle_mini_client.py +2 -1
  70. helm/clients/image_generation/deep_floyd_client.py +2 -0
  71. helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
  72. helm/clients/image_generation/lexica_client.py +2 -0
  73. helm/clients/image_generation/mindalle_client.py +2 -1
  74. helm/clients/image_generation/together_image_generation_client.py +2 -0
  75. helm/clients/megatron_client.py +2 -0
  76. helm/clients/mistral_client.py +2 -0
  77. helm/clients/moderation_api_client.py +2 -0
  78. helm/clients/openai_client.py +5 -1
  79. helm/clients/palmyra_client.py +2 -1
  80. helm/clients/reka_client.py +2 -1
  81. helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
  82. helm/clients/stanfordhealthcare_http_model_client.py +2 -0
  83. helm/clients/together_client.py +4 -0
  84. helm/clients/vertexai_client.py +4 -0
  85. helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
  86. helm/clients/vision_language/huggingface_vlm_client.py +2 -0
  87. helm/clients/vision_language/idefics_client.py +2 -1
  88. helm/clients/vision_language/open_flamingo_client.py +2 -1
  89. helm/clients/vision_language/paligemma_client.py +2 -1
  90. helm/clients/vision_language/palmyra_vision_client.py +2 -0
  91. helm/clients/vision_language/qwen2_vlm_client.py +2 -1
  92. helm/clients/vision_language/qwen_vlm_client.py +2 -1
  93. helm/clients/writer_client.py +2 -0
  94. helm/common/hierarchical_logger.py +20 -0
  95. helm/common/optional_dependencies.py +1 -1
  96. helm/common/test_general.py +4 -0
  97. helm/config/model_deployments.yaml +225 -0
  98. helm/config/model_metadata.yaml +232 -7
  99. helm/config/tokenizer_configs.yaml +74 -4
  100. helm/benchmark/static_build/assets/index-671a5e06.js +0 -10
  101. helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
  102. helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
  103. helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
  104. {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/WHEEL +0 -0
  105. {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/entry_points.txt +0 -0
  106. {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/licenses/LICENSE +0 -0
  107. {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/top_level.txt +0 -0
  108. /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
  109. /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
  110. /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
  111. /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
  112. /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
  113. /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
  114. /helm/benchmark/static_build/assets/{index-9352595e.css → index-oIeiQW2g.css} +0 -0
  115. /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
  116. /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
  117. /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
  118. /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
  119. /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
  120. /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
  121. /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
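
A recurring change across the scenario files listed above is a new get_metadata() method on each scenario class, returning a ScenarioMetadata populated with a TaxonomyInfo (both imports appear in the hunks below). The following is a minimal, hedged sketch of that pattern: the class name and all field values are hypothetical and only illustrate the shape of the call, assuming crfm-helm 0.5.9 is installed.

# Sketch of the get_metadata() pattern added to many scenarios in this release.
# "MyToyScenario" and every field value below are hypothetical; the import paths
# and keyword argument names are taken from the hunks in this diff.
from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
from helm.benchmark.scenarios.scenario import ScenarioMetadata


class MyToyScenario:
    """Hypothetical scenario class; the real ones subclass helm's Scenario."""

    def get_metadata(self) -> ScenarioMetadata:
        # Field names mirror the calls added in this diff
        # (name, display_name, description, taxonomy, main_metric, main_split).
        return ScenarioMetadata(
            name="my_toy_scenario",
            display_name="My Toy Scenario",
            description="Hypothetical scenario used only to illustrate the pattern.",
            taxonomy=TaxonomyInfo(
                task="question answering",
                what="toy documents",
                when="2024",
                who="example authors",
                language="English",
            ),
            main_metric="exact_match",
            main_split="test",
        )


if __name__ == "__main__":
    print(MyToyScenario().get_metadata())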
@@ -134,8 +134,14 @@ class MMLUScenario(Scenario):
             short_display_name="MMLU",
             description="The Massive Multitask Language Understanding (MMLU) benchmark for "
             "knowledge-intensive question answering across 57 domains [(Hendrycks et al., "
-            "2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).",
-            taxonomy=TaxonomyInfo(task="question answering", what="?", when="?", who="?", language="English"),
+            "2021)](https://arxiv.org/pdf/2009.03300.pdf).",
+            taxonomy=TaxonomyInfo(
+                task="multiple-choice question answering",
+                what="math, science, history, etc.",
+                when="before 2021",
+                who="various online sources",
+                language="English",
+            ),
             main_metric="exact_match",
             main_split="test",
         )
@@ -162,14 +162,13 @@ class NarrativeQAScenario(Scenario):
         return ScenarioMetadata(
             name="narrative_qa",
             display_name="NarrativeQA",
-            short_display_name=None,
             description="The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský "
             "et al., 2017)](https://aclanthology.org/Q18-1023/).",
             taxonomy=TaxonomyInfo(
-                task="question answering",
+                task="short-answer question answering",
                 what="passages are books and movie scripts, questions are unknown",
-                when="?",
-                who="?",
+                when="2018",
+                who="annotators from summaries",
                 language="English",
             ),
             main_metric="f1_score",
@@ -6,6 +6,7 @@ from typing import List, Optional
 import datasets
 import tiktoken
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Output,
@@ -14,6 +15,7 @@ from helm.benchmark.scenarios.scenario import (
     Instance,
     TEST_SPLIT,
     Input,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_directory_exists
 
@@ -77,3 +79,16 @@ class OpenAIMRCRScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="openai_mrcr",
+            display_name="OpenAI MRCR",
+            description="OpenAI MRCR (Multi-round co-reference resolution) is a long context dataset "
+            "for benchmarking an LLM's ability to distinguish between multiple needles "
+            "hidden in context. This eval is inspired by the MRCR eval first introduced by "
+            "[Vodrahalli et al., 2024](https://arxiv.org/pdf/2409.12640v2).",
+            taxonomy=TaxonomyInfo(task="MRCR", what="Synthetic data", when="2025", who="None", language="English"),
+            main_metric="openai_mrcr_accuracy",
+            main_split="test",
+        )
@@ -1,6 +1,7 @@
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_directory_exists, ensure_file_downloaded
 from helm.benchmark.scenarios.ruler_qa_scenario_helper import generate_samples # type: ignore
 from helm.benchmark.scenarios.scenario import (
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -78,6 +80,25 @@ class RULERHotpotQAScenario(_RULERQAScenario):
     def __init__(self, max_num_words: int):
         super().__init__("hotpotqa", max_num_words)
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="ruler_hotpotqa",
+            display_name="RULER HotPotQA",
+            description="RULER HotPotQA is an augmented version of HotPotQA ([Yang et al., "
+            "2018](https://arxiv.org/abs/1809.09600)) introduced by [Hsieh et al., "
+            "2024](https://arxiv.org/abs/2404.06654) to simulate a multi-hop question "
+            "answering as a long-context scenario.",
+            taxonomy=TaxonomyInfo(
+                task="question answering with retrieval-augmented generation",
+                what="Wikipedia articles",
+                when="Before 2018",
+                who="Wikipedia authors",
+                language="English",
+            ),
+            main_metric="ruler_string_match_part",
+            main_split="valid",
+        )
+
 
 class RULERSQuADScenario(_RULERQAScenario):
     name = "ruler_squad"
@@ -86,3 +107,22 @@ class RULERSQuADScenario(_RULERQAScenario):
 
     def __init__(self, max_num_words: int):
         super().__init__("squad", max_num_words)
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="ruler_squad",
+            display_name="RULER SQuAD",
+            description="RULER SQuAD is an augmented version of SQuAD ([Rajpurkar et al., "
+            "2018](https://arxiv.org/abs/1806.03822)) introduced by [Hsieh et al., "
+            "2024](https://arxiv.org/abs/2404.06654) to simulate a single-hop question "
+            "answering as a long-context scenario.",
+            taxonomy=TaxonomyInfo(
+                task="question answering",
+                what="Wikipedia articles",
+                when="Before 2018",
+                who="Wikipedia authors and crowdworkers",
+                language="English",
+            ),
+            main_metric="ruler_string_match_part",
+            main_split="valid",
+        )
@@ -1,7 +1,8 @@
 from typing import List
 from datasets import load_dataset
 
-from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT, Reference, Output
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT, Reference, Output, ScenarioMetadata
 
 
 class SimpleSafetyTestsScenario(Scenario):
@@ -31,3 +32,13 @@ class SimpleSafetyTestsScenario(Scenario):
             instance = Instance(input=input, references=references, split=TEST_SPLIT)
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="simple_safety_tests",
+            display_name="SimpleSafetyTests",
+            description="SimpleSafetyTests",
+            taxonomy=TaxonomyInfo(task="question answering", what="n/a", when="n/a", who="n/a", language="English"),
+            main_metric="safety_score",
+            main_split="test",
+        )
@@ -4,6 +4,7 @@ from typing import Dict, List
 
 from filelock import FileLock
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_directory_exists, ensure_file_downloaded, shell
 from helm.common.hierarchical_logger import hlog
 from helm.benchmark.scenarios.bird_sql_scenario_helper import ( # type: ignore
@@ -17,6 +18,7 @@ from helm.benchmark.scenarios.scenario import (
     VALID_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -89,3 +91,19 @@ INSERT_YOUR_SQL_QUERY_HERE
             )
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="spider",
+            display_name="Spider 1.0 (Test)",
+            description="Spider 1.0 (Test)",
+            taxonomy=TaxonomyInfo(
+                task="text-to-SQL",
+                what="databases from various domains",
+                when="?",
+                who="expert data scientists",
+                language="English",
+            ),
+            main_metric="execution_accuracy",
+            main_split="valid",
+        )
@@ -2,6 +2,7 @@ import os
 from typing import Dict, List
 import json
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.common.hierarchical_logger import hlog
 from helm.benchmark.scenarios.scenario import (
@@ -13,6 +14,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -142,3 +144,96 @@ class ThaiExamScenario(Scenario):
             instances.extend(self.process_jsonl(jsonl_path, splits[split]))
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        if self.exam == "onet":
+            return ScenarioMetadata(
+                name="thai_exam_onet",
+                display_name="ONET",
+                description="The Ordinary National Educational Test (ONET) is an examination for students "
+                "in Thailand. We select the grade-12 ONET exam, which comprises 5 subjects and "
+                "each question has 5 choices. These subjects are Thai, English, Mathematics, "
+                "Social Studies, and Science. Amounting to a total of 170 questions and "
+                "options.\n",
+                taxonomy=TaxonomyInfo(
+                    task="question answering",
+                    what="high school / medical school academic knowledge",
+                    when="?",
+                    who="n/a",
+                    language="Thai and English",
+                ),
+                main_metric="exact_match",
+                main_split="test",
+            )
+        elif self.exam == "ic":
+            return ScenarioMetadata(
+                name="thai_exam_ic",
+                display_name="IC",
+                description="The Investment Consultant (IC) examination, a licensing test for investment "
+                "professionals in Thailand. Developed by the Stock Exchange of Thailand (SET), "
+                "features 4 choices per question. We extracted questions for levels 1, 2, and 3 "
+                "resulting in a total of 95 questions and options.\n",
+                taxonomy=TaxonomyInfo(
+                    task="question answering",
+                    what="licensing for investment professionals",
+                    when="?",
+                    who="n/a",
+                    language="Thai",
+                ),
+                main_metric="exact_match",
+                main_split="test",
+            )
+        elif self.exam == "tgat":
+            return ScenarioMetadata(
+                name="thai_exam_tgat",
+                display_name="TGAT",
+                description="The Thai General Aptitude Test (TGAT), a national high school examination in "
+                "Thailand. Focuses on critical and logical thinking skills. We collected a "
+                "total of 90 questions and answers. The TGAT consists of four choices per "
+                "question.\n",
+                taxonomy=TaxonomyInfo(
+                    task="question answering",
+                    what="high school level questions on reasoning",
+                    when="?",
+                    who="n/a",
+                    language="English",
+                ),
+                main_metric="exact_match",
+                main_split="test",
+            )
+        elif self.exam == "tpat1":
+            return ScenarioMetadata(
+                name="thai_exam_tpat1",
+                display_name="TPAT-1",
+                description="TBD",
+                taxonomy=TaxonomyInfo(
+                    task="question answering",
+                    what="high school / medical school academic knowledge",
+                    when="?",
+                    who="n/a",
+                    language="Thai",
+                ),
+                main_metric="exact_match",
+                main_split="test",
+            )
+        elif self.exam == "a_level":
+            return ScenarioMetadata(
+                name="thai_exam_a_level",
+                display_name="A-Level",
+                description="An academic knowledge assessment examination (Applied Knowledge Level) that "
+                "covers general foundational subjects taught in schools. The content assessed "
+                "in this examination aligns with the curriculum guidelines and emphasizes the "
+                "practical application of knowledge in daily life. We collected a total of 175 "
+                "questions and answers.\n",
+                taxonomy=TaxonomyInfo(
+                    task="question answering",
+                    what="high school academic knowledge",
+                    when="?",
+                    who="n/a",
+                    language="Thai and English",
+                ),
+                main_metric="exact_match",
+                main_split="test",
+            )
+        else:
+            raise ValueError(f"Unknown exam: {self.exam}")
@@ -113,8 +113,15 @@ class WMT14Scenario(Scenario):
         return ScenarioMetadata(
             name="wmt_14",
             display_name="WMT 2014",
-            description="WMT 2014 is a collection of machine translation datasets.",
-            taxonomy=TaxonomyInfo(task="machine translation", what="n/a", when="n/a", who="n/a", language="English"),
+            description="WMT 2014 is a collection of machine translation datasets "
+            "[(website)](https://www.statmt.org/wmt14/index.html).",
+            taxonomy=TaxonomyInfo(
+                task="machine translation",
+                what="multilingual sentences",
+                when="before 2014",
+                who="Europarl, news, Common Crawl, etc.",
+                language="English, French, Czech, etc.",
+            ),
             main_metric="bleu_4",
             main_split="test",
         )
@@ -191,16 +191,15 @@ run_groups:
     description: Scenarios for evaluating long context capabilities
     category: All scenarios
    subgroups:
-      - ruler_hotpotqa
       - ruler_squad
-      - infinite_bench_en_sum
-      # - infinite_bench_en_qa
+      - ruler_hotpotqa
       - infinite_bench_en_mc
+      - infinite_bench_en_sum
       - openai_mrcr
 
-  - name: ruler_hotpotqa
-    display_name: RULER HotPotQA
-    description: RULER HotPotQA is an augmented version of HotPotQA ([Yang et al., 2018](https://arxiv.org/abs/1809.09600)) introduced by [Hsieh et al., 2024](https://arxiv.org/abs/2404.06654) to simulate a multi-hop question answering as a long-context scenario.
+  - name: ruler_squad
+    display_name: RULER SQuAD
+    description: RULER SQuAD is an augmented version of SQuAD ([Rajpurkar et al., 2018](https://arxiv.org/abs/1806.03822)) introduced by [Hsieh et al., 2024](https://arxiv.org/abs/2404.06654) to simulate a single-hop question answering as a long-context scenario.
     metric_groups:
       - accuracy
       - general_information
@@ -209,16 +208,15 @@ run_groups:
       main_name: ruler_string_match_part
       main_split: valid
     taxonomy:
-      task: question answering with retrieval-augmented generation
+      task: question answering
       what: Wikipedia articles
-      who: Wikipedia authors
+      who: Wikipedia authors and crowdworkers
       when: Before 2018
       language: English
 
-
-  - name: ruler_squad
-    display_name: RULER SQuAD
-    description: RULER SQuAD is an augmented version of SQuAD ([Rajpurkar et al., 2018](https://arxiv.org/abs/1806.03822)) introduced by [Hsieh et al., 2024](https://arxiv.org/abs/2404.06654) to simulate a single-hop question answering as a long-context scenario.
+  - name: ruler_hotpotqa
+    display_name: RULER HotPotQA
+    description: RULER HotPotQA is an augmented version of HotPotQA ([Yang et al., 2018](https://arxiv.org/abs/1809.09600)) introduced by [Hsieh et al., 2024](https://arxiv.org/abs/2404.06654) to simulate a multi-hop question answering as a long-context scenario.
     metric_groups:
       - accuracy
       - general_information
@@ -227,29 +225,12 @@ run_groups:
       main_name: ruler_string_match_part
       main_split: valid
     taxonomy:
-      task: question answering
+      task: question answering with retrieval-augmented generation
       what: Wikipedia articles
-      who: Wikipedia authors and crowdworkers
+      who: Wikipedia authors
       when: Before 2018
       language: English
 
-  # - name: infinite_bench_en_qa
-  #   display_name: ∞Bench En.QA
-  #   description: ∞Bench En.QA is a open-ended question answering task that requires locating and processing information within a novel, performing reasoning through aggregation or filtering to derive answers. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))
-  #   metric_groups:
-  #     - accuracy
-  #     - general_information
-  #     - annotation_metrics
-  #   environment:
-  #     main_name: f1_score
-  #     main_split: test
-  #   taxonomy:
-  #     task: question answering
-  #     what: Novels
-  #     who: Novel authors
-  #     when: Before 2024
-  #     language: English
-
 
   - name: infinite_bench_en_mc
     display_name: ∞Bench En.MC
     description: ∞Bench En.MC is a multiple-choice question answering task that requires locating and processing information within a novel, performing reasoning through aggregation or filtering to derive answers. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))