crfm-helm 0.5.2__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (209)
  1. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/METADATA +81 -112
  2. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/RECORD +165 -155
  3. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
  5. helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
  6. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
  7. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
  8. helm/benchmark/adaptation/common_adapter_specs.py +2 -0
  9. helm/benchmark/annotation/anthropic_red_team_annotator.py +57 -0
  10. helm/benchmark/annotation/call_center_annotator.py +258 -0
  11. helm/benchmark/annotation/financebench_annotator.py +79 -0
  12. helm/benchmark/annotation/harm_bench_annotator.py +55 -0
  13. helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
  14. helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
  15. helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
  16. helm/benchmark/annotation/live_qa_annotator.py +37 -45
  17. helm/benchmark/annotation/medication_qa_annotator.py +36 -44
  18. helm/benchmark/annotation/model_as_judge.py +96 -0
  19. helm/benchmark/annotation/simple_safety_tests_annotator.py +50 -0
  20. helm/benchmark/annotation/xstest_annotator.py +100 -0
  21. helm/benchmark/metrics/annotation_metrics.py +108 -0
  22. helm/benchmark/metrics/bhasa_metrics.py +188 -0
  23. helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
  24. helm/benchmark/metrics/code_metrics_helper.py +11 -1
  25. helm/benchmark/metrics/safety_metrics.py +79 -0
  26. helm/benchmark/metrics/summac/model_summac.py +3 -3
  27. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
  28. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
  29. helm/benchmark/metrics/unitxt_metrics.py +17 -3
  30. helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
  31. helm/benchmark/metrics/vision_language/image_utils.py +1 -1
  32. helm/benchmark/model_metadata_registry.py +3 -3
  33. helm/benchmark/presentation/create_plots.py +1 -1
  34. helm/benchmark/presentation/schema.py +3 -0
  35. helm/benchmark/presentation/summarize.py +106 -256
  36. helm/benchmark/presentation/test_run_entry.py +1 -0
  37. helm/benchmark/presentation/test_summarize.py +145 -3
  38. helm/benchmark/run.py +15 -0
  39. helm/benchmark/run_expander.py +83 -30
  40. helm/benchmark/run_specs/bhasa_run_specs.py +652 -0
  41. helm/benchmark/run_specs/call_center_run_specs.py +152 -0
  42. helm/benchmark/run_specs/decodingtrust_run_specs.py +8 -8
  43. helm/benchmark/run_specs/experimental_run_specs.py +52 -0
  44. helm/benchmark/run_specs/finance_run_specs.py +82 -1
  45. helm/benchmark/run_specs/safety_run_specs.py +154 -0
  46. helm/benchmark/run_specs/vlm_run_specs.py +100 -24
  47. helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
  48. helm/benchmark/scenarios/banking77_scenario.py +51 -0
  49. helm/benchmark/scenarios/bhasa_scenario.py +1942 -0
  50. helm/benchmark/scenarios/call_center_scenario.py +84 -0
  51. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
  52. helm/benchmark/scenarios/ewok_scenario.py +116 -0
  53. helm/benchmark/scenarios/fin_qa_scenario.py +2 -0
  54. helm/benchmark/scenarios/financebench_scenario.py +53 -0
  55. helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
  56. helm/benchmark/scenarios/raft_scenario.py +1 -1
  57. helm/benchmark/scenarios/scenario.py +1 -1
  58. helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
  59. helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
  60. helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
  61. helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
  62. helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
  63. helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
  64. helm/benchmark/scenarios/test_math_scenario.py +2 -8
  65. helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
  66. helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
  67. helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
  68. helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
  69. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
  70. helm/benchmark/scenarios/vision_language/bingo_scenario.py +2 -2
  71. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
  72. helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
  73. helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
  74. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
  75. helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
  76. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
  77. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
  78. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -3
  79. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -1
  80. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
  81. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
  82. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
  83. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +41 -12
  84. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
  85. helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
  86. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
  87. helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
  88. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
  89. helm/benchmark/scenarios/vision_language/pairs_scenario.py +1 -1
  90. helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
  91. helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
  92. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
  93. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +2 -2
  94. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +6 -3
  95. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
  96. helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
  97. helm/benchmark/scenarios/xstest_scenario.py +35 -0
  98. helm/benchmark/server.py +1 -6
  99. helm/benchmark/static/schema_air_bench.yaml +750 -750
  100. helm/benchmark/static/schema_bhasa.yaml +709 -0
  101. helm/benchmark/static/schema_call_center.yaml +232 -0
  102. helm/benchmark/static/schema_cleva.yaml +768 -0
  103. helm/benchmark/static/schema_decodingtrust.yaml +444 -0
  104. helm/benchmark/static/schema_ewok.yaml +367 -0
  105. helm/benchmark/static/schema_finance.yaml +55 -9
  106. helm/benchmark/static/{schema_image2structure.yaml → schema_image2struct.yaml} +231 -90
  107. helm/benchmark/static/schema_legal.yaml +566 -0
  108. helm/benchmark/static/schema_safety.yaml +266 -0
  109. helm/benchmark/static/schema_tables.yaml +149 -8
  110. helm/benchmark/static/schema_thai.yaml +21 -0
  111. helm/benchmark/static/schema_vhelm.yaml +137 -101
  112. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  113. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  114. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  115. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  116. helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
  117. helm/benchmark/static_build/assets/index-3ee38b3d.js +10 -0
  118. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  119. helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
  120. helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
  121. helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
  122. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  123. helm/benchmark/static_build/index.html +2 -2
  124. helm/benchmark/window_services/test_openai_window_service.py +8 -8
  125. helm/benchmark/window_services/tokenizer_service.py +0 -5
  126. helm/clients/ai21_client.py +71 -1
  127. helm/clients/anthropic_client.py +7 -19
  128. helm/clients/huggingface_client.py +38 -37
  129. helm/clients/nvidia_nim_client.py +35 -0
  130. helm/clients/openai_client.py +18 -4
  131. helm/clients/palmyra_client.py +24 -0
  132. helm/clients/perspective_api_client.py +11 -6
  133. helm/clients/test_client.py +4 -6
  134. helm/clients/together_client.py +22 -0
  135. helm/clients/vision_language/open_flamingo_client.py +1 -2
  136. helm/clients/vision_language/palmyra_vision_client.py +28 -13
  137. helm/common/cache.py +8 -30
  138. helm/common/images_utils.py +6 -0
  139. helm/common/key_value_store.py +9 -9
  140. helm/common/mongo_key_value_store.py +5 -4
  141. helm/common/request.py +16 -0
  142. helm/common/test_cache.py +1 -48
  143. helm/common/tokenization_request.py +0 -9
  144. helm/config/model_deployments.yaml +444 -329
  145. helm/config/model_metadata.yaml +513 -111
  146. helm/config/tokenizer_configs.yaml +140 -11
  147. helm/proxy/example_queries.py +14 -21
  148. helm/proxy/server.py +0 -9
  149. helm/proxy/services/remote_service.py +0 -6
  150. helm/proxy/services/server_service.py +6 -20
  151. helm/proxy/services/service.py +0 -6
  152. helm/proxy/token_counters/test_auto_token_counter.py +2 -2
  153. helm/tokenizers/ai21_tokenizer.py +51 -59
  154. helm/tokenizers/cohere_tokenizer.py +0 -75
  155. helm/tokenizers/huggingface_tokenizer.py +0 -1
  156. helm/tokenizers/test_ai21_tokenizer.py +48 -0
  157. helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
  158. helm/benchmark/data_overlap/export_scenario_text.py +0 -119
  159. helm/benchmark/data_overlap/light_scenario.py +0 -60
  160. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  161. helm/benchmark/static/benchmarking.css +0 -156
  162. helm/benchmark/static/benchmarking.js +0 -1705
  163. helm/benchmark/static/config.js +0 -3
  164. helm/benchmark/static/general.js +0 -122
  165. helm/benchmark/static/images/crfm-logo.png +0 -0
  166. helm/benchmark/static/images/helm-logo-simple.png +0 -0
  167. helm/benchmark/static/images/helm-logo.png +0 -0
  168. helm/benchmark/static/images/language-model-helm.png +0 -0
  169. helm/benchmark/static/images/organizations/ai21.png +0 -0
  170. helm/benchmark/static/images/organizations/anthropic.png +0 -0
  171. helm/benchmark/static/images/organizations/bigscience.png +0 -0
  172. helm/benchmark/static/images/organizations/cohere.png +0 -0
  173. helm/benchmark/static/images/organizations/eleutherai.png +0 -0
  174. helm/benchmark/static/images/organizations/google.png +0 -0
  175. helm/benchmark/static/images/organizations/meta.png +0 -0
  176. helm/benchmark/static/images/organizations/microsoft.png +0 -0
  177. helm/benchmark/static/images/organizations/nvidia.png +0 -0
  178. helm/benchmark/static/images/organizations/openai.png +0 -0
  179. helm/benchmark/static/images/organizations/together.png +0 -0
  180. helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
  181. helm/benchmark/static/images/organizations/yandex.png +0 -0
  182. helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
  183. helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
  184. helm/benchmark/static/index.html +0 -68
  185. helm/benchmark/static/info-icon.png +0 -0
  186. helm/benchmark/static/json-urls.js +0 -69
  187. helm/benchmark/static/plot-captions.js +0 -27
  188. helm/benchmark/static/utils.js +0 -285
  189. helm/benchmark/static_build/assets/index-30dbceba.js +0 -10
  190. helm/benchmark/static_build/assets/index-66b02d40.css +0 -1
  191. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  192. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  193. helm/benchmark/window_services/ai21_window_service.py +0 -247
  194. helm/benchmark/window_services/cohere_window_service.py +0 -101
  195. helm/benchmark/window_services/test_ai21_window_service.py +0 -163
  196. helm/benchmark/window_services/test_cohere_window_service.py +0 -75
  197. helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
  198. helm/benchmark/window_services/test_ice_window_service.py +0 -327
  199. helm/tokenizers/ice_tokenizer.py +0 -30
  200. helm/tokenizers/test_ice_tokenizer.py +0 -57
  201. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/LICENSE +0 -0
  202. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/entry_points.txt +0 -0
  203. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/top_level.txt +0 -0
  204. /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
  205. /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
  206. /helm/benchmark/{data_overlap → scenarios/vision_language/image2struct}/__init__.py +0 -0
  207. /helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +0 -0
  208. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct/webpage}/__init__.py +0 -0
  209. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0
helm/benchmark/scenarios/bhasa_scenario.py
@@ -0,0 +1,1942 @@
+import datasets
+import os
+import random
+from typing import List, Dict
+
+import pandas as pd
+
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Instance,
+    Output,
+    PassageQuestionInput,
+    Reference,
+    Scenario,
+    CORRECT_TAG,
+    TEST_SPLIT,
+    TRAIN_SPLIT,
+)
+from helm.common.general import ensure_file_downloaded
+from helm.common.hierarchical_logger import hlog
+
+# BHASA Scenarios
+# A. Natural Language Understanding
+# B. Natural Language Generation
+# C. Natural Language Reasoning
+# D. Linguistic Diagnostics
+
+# A. Natural Language Understanding
+# 1. Question Answering
+# 2. Sentiment Analysis
+# 3. Toxicity Detection/Classification
+
+
+# 1. Question Answering
+# 1.1 Indonesian: TyDiQA
+class TyDiQAScenario(Scenario):
+    """
+    TyDiQA is an open-book question answering scenario for 11 typologically-diverse languages.
+    The questions are written by people who want to know the answer, but do not know the answer yet,
+    and the data is collected directly in each language without the use of translation.
+
+    This scenario only uses the Indonesian subset of the data, and uses the Gold Passage (GoldP) task,
+    which requires the tested system to extract a span from the given passage to answer a given question.
+    There are no unanswerable questions.
+
+    The models are prompted using the following format:
+
+        Anda akan diberikan sebuah paragraf dan sebuah pertanyaan. Jawablah pertanyaannya dengan mengekstrak jawaban
+        dari paragraf tersebut.
+
+        Paragraf: <text>
+        Pertanyaan: <question>
+        Jawaban: <answer>
+
+        ...
+
+        Paragraf: <text>
+        Pertanyaan: <question>
+        Jawaban:
+
+    Target completion:
+        <answer>
+
+    @article{clark-etal-2020-tydi,
+        title = "{T}y{D}i {QA}: A Benchmark for Information-Seeking Question Answering in Typologically
+            Diverse Languages",
+        author = "Clark, Jonathan H. and
+            Choi, Eunsol and
+            Collins, Michael and
+            Garrette, Dan and
+            Kwiatkowski, Tom and
+            Nikolaev, Vitaly and
+            Palomaki, Jennimaria",
+        editor = "Johnson, Mark and
+            Roark, Brian and
+            Nenkova, Ani",
+        journal = "Transactions of the Association for Computational Linguistics",
+        volume = "8",
+        year = "2020",
+        address = "Cambridge, MA",
+        publisher = "MIT Press",
+        url = "https://aclanthology.org/2020.tacl-1.30",
+        doi = "10.1162/tacl_a_00317",
+        pages = "454--470",
+    }
+    """
+
+    name = "tydiqa"
+    description = "Indonesian Open-book Question Answering task"
+    tags = ["question_answering"]
+
+    def __init__(self):
+        super().__init__()
+        self.splits = {"train": TRAIN_SPLIT, "validation": TEST_SPLIT}
+
+    def get_instances(self, output_path) -> List[Instance]:
+        dataset = datasets.load_dataset("khalidalt/tydiqa-goldp", "indonesian")
+
+        outputs = []
+        for split in self.splits.keys():
+            df = dataset[split].to_pandas()
+
+            if split == "train":
+                # Select only bottom 20th percentile by length for in-context examples as examples are very long
+                data = df[df["passage_text"].apply(len) < df["passage_text"].apply(len).quantile(0.2)]
+            else:
+                data = df
+
+            for _, row in data.iterrows():
+                passage = row["passage_text"].strip()
+                question = row["question_text"].strip()
+                input = PassageQuestionInput(
+                    passage=passage,
+                    question=question,
+                    passage_prefix="Paragraf: ",
+                    question_prefix="Pertanyaan: ",
+                )
+                references = []
+                for answer in row["answers"]["text"]:
+                    output = Output(text=answer.strip())
+                    references.append(Reference(output, tags=[CORRECT_TAG]))
+                instance = Instance(input=input, references=references, split=self.splits[split])
+                outputs.append(instance)
+        return outputs
+
+
+# 1.2 Vietnamese & Thai: XQuAD
+class XQuADScenario(Scenario):
+    """
+    XQuAD is an open-book question answering scenario that is parallel across 10 languages.
+    The dataset consists of a subset of 240 paragraphs and 1190 question-answer pairs from the
+    development set of SQuAD v1.1 (Rajpurkar et al., 2016) together with their professional translations.
+
+    This scenario only uses the Vietnamese and Thai subsets of the data, and there are no
+    unanswerable questions.
+
+    The models are prompted using the following general format:
+
+        You will be given a paragraph and a question. Answer the question by extracting the answer from the paragraph.
+
+        Paragraph: <text>
+        Question: <question>
+        Answer: <answer>
+
+        ...
+
+        Paragraph: <text>
+        Question: <question>
+        Answer:
+
+    Target completion:
+        <answer>
+
+    @article{Artetxe:etal:2019,
+        author = {Mikel Artetxe and Sebastian Ruder and Dani Yogatama},
+        title = {On the cross-lingual transferability of monolingual representations},
+        journal = {CoRR},
+        volume = {abs/1910.11856},
+        year = {2019},
+        archivePrefix = {arXiv},
+        eprint = {1910.11856}
+    }
+    """
+
+    name = "xquad"
+    description = "Vietnamese and Thai Open-book Question Answering task"
+    tags = ["question_answering"]
+
+    def __init__(self, language: str):
+        super().__init__()
+        self.language = language
+        self.splits = {"train": TRAIN_SPLIT, "test": TEST_SPLIT}
+        self.language_to_prompt_components = {
+            "th": {
+                "passage_prefix": "ข้อความ: ",
+                "question_prefix": "คำถาม: ",
+                "random_state": 4520,
+            },
+            "vi": {
+                "passage_prefix": "Đoạn văn: ",
+                "question_prefix": "Câu hỏi: ",
+                "random_state": 4502,
+            },
+        }
+        if self.language not in self.language_to_prompt_components.keys():
+            raise Exception(
+                f"{self.language} not supported. Supported languages are {self.language_to_prompt_components.keys()}."
+            )
+        else:
+            self.prompt_components = self.language_to_prompt_components[self.language]
+
+    def get_instances(self, output_path) -> List[Instance]:
+        dataset = datasets.load_dataset("xquad", f"xquad.{self.language}", split="validation")
+        df = dataset.to_pandas()
+
+        # Sample 1000 examples for test
+        df_test = df.sample(n=1000, random_state=self.prompt_components["random_state"])
+
+        # In-context examples to be drawn from remaining examples (since there is no train data)
+        df_train = df[~df.index.isin(df_test.index)]
+
+        # Select only bottom 20th percentile by length for in-context examples as examples are very long
+        df_train = df_train[df_train["context"].apply(len) < df_train["context"].apply(len).quantile(0.2)]
+        dataset = {
+            "train": df_train,
+            "test": df_test,
+        }
+
+        outputs = []
+        for split in self.splits.keys():
+            data = dataset[split]
+            for _, row in data.iterrows():
+                passage = row["context"].strip()
+                question = row["question"].strip()
+                input = PassageQuestionInput(
+                    passage=passage,
+                    question=question,
+                    passage_prefix=str(self.prompt_components["passage_prefix"]),
+                    question_prefix=str(self.prompt_components["question_prefix"]),
+                )
+                references = []
+                for answer in row["answers"]["text"]:
+                    output = Output(text=answer.strip())
+                    references.append(Reference(output, tags=[CORRECT_TAG]))
+                instance = Instance(input=input, references=references, split=self.splits[split])
+                outputs.append(instance)
+        return outputs
+
+
+# 1.3 Tamil: IndicQA
+class IndicQAScenario(Scenario):
+    """
+    IndicQA is an open-book question answering scenario for 11 Indic languages.
+    Answers to questions are to be extracted from the text provided. The data is taken from
+    Wikipedia articles across various domains, and questions and answers were manually created
+    by native speakers.
+
+    This scenario only uses the Tamil subset of the data, and unanswerable questions
+    are removed from the dataset in order to be consistent with the question answering
+    scenarios for Indonesian, Vietnamese and Thai.
+
+    The models are prompted using the following format:
+
+        உங்களுக்கு ஒரு பத்தியும் ஒரு கேள்வியும் தரப்படும். தரப்பட்ட பத்தியிலிருந்து கேள்விக்கான பதிலைக் கண்டறியவும்.
+
+        பத்தி: <text>
+        கேள்வி: <question>
+        பதில்: <answer>
+
+        ...
+
+        பத்தி: <text>
+        கேள்வி: <question>
+        பதில்:
+
+    Target completion:
+        <answer>
+
+    @inproceedings{doddapaneni-etal-2023-towards,
+        title = "Towards Leaving No {I}ndic Language Behind: Building Monolingual Corpora, Benchmark and Models for
+            {I}ndic Languages",
+        author = "Doddapaneni, Sumanth and
+            Aralikatte, Rahul and
+            Ramesh, Gowtham and
+            Goyal, Shreya and
+            Khapra, Mitesh M. and
+            Kunchukuttan, Anoop and
+            Kumar, Pratyush",
+        editor = "Rogers, Anna and
+            Boyd-Graber, Jordan and
+            Okazaki, Naoaki",
+        booktitle = "Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1:
+            Long Papers)",
+        month = jul,
+        year = "2023",
+        address = "Toronto, Canada",
+        publisher = "Association for Computational Linguistics",
+        url = "https://aclanthology.org/2023.acl-long.693",
+        doi = "10.18653/v1/2023.acl-long.693",
+        pages = "12402--12426",
+    }
+    """
+
+    name = "indicqa"
+    description = "Tamil Open-book Question Answering task"
+    tags = ["question_answering"]
+
+    def __init__(self):
+        super().__init__()
+        self.splits = {"train": TRAIN_SPLIT, "test": TEST_SPLIT}
+
+    def get_instances(self, output_path) -> List[Instance]:
+        dataset = datasets.load_dataset(
+            "ai4bharat/IndicQA",
+            "indicqa.ta",
+            split="test",
+            revision="78ee8d58e880c72f324e176c989dfefa55427af4",
+            trust_remote_code=True,
+        )
+        df = dataset.to_pandas()
+
+        # Remove unanswerable questions (answer is an empty string)
+        df = df[df["answers"].apply(lambda x: len(x["text"][0].strip()) > 0)]
+
+        # Sample 1000 examples for test
+        df_test = df.sample(n=1000, random_state=7900)
+
+        # In-context examples to be drawn from remaining examples (since there is no train/dev data)
+        df_train = df[~df.index.isin(df_test.index)]
+
+        # Select only bottom 20th percentile by length for in-context examples as examples are very long
+        df_train = df_train[df_train["context"].apply(len) < df_train["context"].apply(len).quantile(0.2)]
+        dataset = {
+            "train": df_train,
+            "test": df_test,
+        }
+
+        outputs = []
+        for split in self.splits.keys():
+            data = dataset[split]
+            for _, row in data.iterrows():
+                passage = row["context"].strip()
+                question = row["question"].strip()
+                input = PassageQuestionInput(
+                    passage=passage,
+                    question=question,
+                    passage_prefix="பத்தி: ",
+                    question_prefix="கேள்வி: ",
+                )
+                references = []
+                for answer in row["answers"]["text"]:
+                    output = Output(text=answer.strip())
+                    references.append(Reference(output, tags=[CORRECT_TAG]))
+                instance = Instance(input=input, references=references, split=self.splits[split])
+                outputs.append(instance)
+        return outputs
+
+
+# 2. Sentiment Analysis
+# 2.1 Indonesian: NusaX Sentiment
+class NusaXScenario(Scenario):
+    """
+    NusaX is a sentiment analysis scenario for 11 Indonesian languages.
+    The data is derived from a subset of SmSA (Purwarianti and Crisdayanti, 2019) and manually translated
+    from Indonesian to 10 other local languages, such as Acehnese and Toba Batak.
+    It consists of comments and reviews from various online platforms.
+
+    Only the Indonesian subset of the data is used for this scenario, and the labels are
+    positive, negative or neutral.
+
+    The models are prompted using the following format:
+
+        Apa sentimen dari kalimat berikut ini?
+        Jawablah dengan satu kata saja:
+        - Positif
+        - Negatif
+        - Netral
+
+        Kalimat: <text>
+        Jawaban: <sentiment>
+
+        ...
+
+        Kalimat: <text>
+        Jawaban:
+
+    Target completion:
+        <sentiment>
+
+    @inproceedings{winata-etal-2023-nusax,
+        title = "{N}usa{X}: Multilingual Parallel Sentiment Dataset for 10 {I}ndonesian Local Languages",
+        author = "Winata, Genta Indra and
+            Aji, Alham Fikri and
+            Cahyawijaya, Samuel and
+            Mahendra, Rahmad and
+            Koto, Fajri and
+            Romadhony, Ade and
+            Kurniawan, Kemal and
+            Moeljadi, David and
+            Prasojo, Radityo Eko and
+            Fung, Pascale and
+            Baldwin, Timothy and
+            Lau, Jey Han and
+            Sennrich, Rico and
+            Ruder, Sebastian",
+        editor = "Vlachos, Andreas and
+            Augenstein, Isabelle",
+        booktitle = "Proceedings of the 17th Conference of the European Chapter of the Association for
+            Computational Linguistics",
+        month = may,
+        year = "2023",
+        address = "Dubrovnik, Croatia",
+        publisher = "Association for Computational Linguistics",
+        url = "https://aclanthology.org/2023.eacl-main.57",
+        doi = "10.18653/v1/2023.eacl-main.57",
+        pages = "815--834",
+    }
+    """
+
+    name = "nusax"
+    description = "Indonesian NusaX-Senti Sentiment Analysis dataset"
+    tags = ["sentiment_analysis"]
+
+    def __init__(self):
+        super().__init__()
+        self.splits = {"train": TRAIN_SPLIT, "test": TEST_SPLIT}
+        self.sentiment2label = {
+            "positive": "Positif",
+            "negative": "Negatif",
+            "neutral": "Netral",
+        }
+
+    def download_dataset(self, output_path: str):
+        URLS = {
+            "test": "https://raw.githubusercontent.com/IndoNLP/nusax/main/datasets/sentiment/indonesian/test.csv",
+            "train": "https://raw.githubusercontent.com/IndoNLP/nusax/main/datasets/sentiment/indonesian/train.csv",
+        }
+
+        dataset: Dict[str, pd.DataFrame] = {}
+        for split in self.splits.keys():
+            target_path_file = os.path.join(output_path, split)
+            ensure_file_downloaded(source_url=URLS[split], target_path=target_path_file)
+            data = pd.read_csv(target_path_file)
+            dataset[split] = data
+        return dataset
+
+    def get_instances(self, output_path) -> List[Instance]:
+        dataset = self.download_dataset(output_path)
+        outputs = []
+        for split in self.splits.keys():
+            data = dataset[split]
+            for _, row in data.iterrows():
+                input = Input(row["text"].strip())
+                output = Output(text=self.sentiment2label[row["label"]])
+                references = [
+                    Reference(output, tags=[CORRECT_TAG]),
+                ]
+                instance = Instance(input=input, references=references, split=self.splits[split])
+                outputs.append(instance)
+        return outputs
+
+
+# 2.2 Vietnamese: UIT-VSFC
+class UITVSFCScenario(Scenario):
+    """
+    UIT-VSFC is a Vietnamese sentiment analysis scenario. The data consists of student feedback obtained from
+    end-of-semester surveys at a Vietnamese university. Feedback is labeled as one of three sentiment
+    polarities: positive, negative or neutral.
+
+    The models are prompted using the following format:
+
+        Sắc thái của câu sau đây là gì?
+        Trả lời với một từ duy nhất:
+        - Tích cực
+        - Tiêu cực
+        - Trung lập
+
+        Câu văn: <text>
+        Câu trả lời: <sentiment>
+
+        ...
+
+        Câu văn: <text>
+        Câu trả lời:
+
+    Target completion:
+        <sentiment>
+
+    @inproceedings{van2018uit,
+        title={UIT-VSFC: Vietnamese students’ feedback corpus for sentiment analysis},
+        author={Van Nguyen, Kiet and Nguyen, Vu Duc and Nguyen, Phu XV and Truong, Tham TH and Nguyen, Ngan Luu-Thuy},
+        booktitle={2018 10th international conference on knowledge and systems engineering (KSE)},
+        pages={19--24},
+        year={2018},
+        organization={IEEE},
+        url={https://ieeexplore.ieee.org/document/8573337},
+    }
+    """
+
+    name = "uitvsfc"
+    description = "Vietnamese Students' Feedback Corpus sentiment analysis task"
+    tags = ["sentiment_analysis"]
+
+    def __init__(self):
+        super().__init__()
+        self.splits = {"train": TRAIN_SPLIT, "test": TEST_SPLIT}
+        self.id2label = {
+            0: "Tiêu cực",
+            1: "Trung lập",
+            2: "Tích cực",
+        }
+
+    def download_dataset(self, output_path: str):
+        URLS = {
+            "train": {
+                "sentences": "https://drive.google.com/uc?id=1nzak5OkrheRV1ltOGCXkT671bmjODLhP&export=download",
+                "sentiments": "https://drive.google.com/uc?id=1ye-gOZIBqXdKOoi_YxvpT6FeRNmViPPv&export=download",
+            },
+            "test": {
+                "sentences": "https://drive.google.com/uc?id=1aNMOeZZbNwSRkjyCWAGtNCMa3YrshR-n&export=download",
+                "sentiments": "https://drive.google.com/uc?id=1vkQS5gI0is4ACU58-AbWusnemw7KZNfO&export=download",
+            },
+        }
+
+        dataset: Dict[str, pd.DataFrame] = {}
+        for split in list(URLS.keys()):
+            file_lines: Dict[str, List[str]] = {}
+            for file in list(URLS[split].keys()):
+                file_lines[file] = []
+                target_path_file = os.path.join(output_path, split, file)
+                ensure_file_downloaded(source_url=URLS[split][file], target_path=target_path_file)
+                with open(target_path_file, "r") as f:
+                    lines = f.readlines()
+                    for line in lines:
+                        file_lines[file].append(str(line).strip())
+            df = pd.DataFrame({"text": file_lines["sentences"], "label": file_lines["sentiments"]})
+            if split == "test":
+                dataset[split] = df.groupby("label", group_keys=False).apply(
+                    lambda x: x.sample(frac=1000 / len(df), random_state=4156)
+                )
+            else:
+                dataset[split] = df
+        return dataset
+
+    def get_instances(self, output_path) -> List[Instance]:
+        dataset = self.download_dataset(output_path)
+        outputs = []
+        for split in self.splits.keys():
+            data = dataset[split]
+            for _, row in data.iterrows():
+                input = Input(row["text"])
+                output = Output(text=self.id2label[int(row["label"])])
+                references = [
+                    Reference(output, tags=[CORRECT_TAG]),
+                ]
+                instance = Instance(input=input, references=references, split=self.splits[split])
+                outputs.append(instance)
+        return outputs
+
+
+# 2.3 Thai: Wisesight Sentiment
+class WisesightScenario(Scenario):
+    """
+    Wisesight Sentiment is a Thai sentiment analysis scenario. The data consists of social media messages
+    regarding consumer products and services.
+
+    The dataset originally included the label "question" for instances that were questions. These instances
+    made up only a small subset of the data and were dropped in order to make the task more consistent
+    with those of other languages. Labels are therefore only positive, negative or neutral.
+
+    The models are prompted using the following format:
+
+        อารมณ์ความรู้สึกของข้อความต่อไปนี้เป็นอย่างไร?
+        กรุณาตอบโดยใช้คำเดียวเท่านั้น:
+        - แง่บวก
+        - แง่ลบ
+        - เฉยๆ
+
+        ข้อความ: <text>
+        คำตอบ: <sentiment>
+
+        ...
+
+        ข้อความ: <text>
+        คำตอบ:
+
+    Target completion:
+        <sentiment>
+
+    @software{bact_2019_3457447,
+        author = {Suriyawongkul, Arthit and
+            Chuangsuwanich, Ekapol and
+            Chormai, Pattarawat and
+            Polpanumas, Charin},
+        title = {PyThaiNLP/wisesight-sentiment: First release},
+        month = sep,
+        year = 2019,
+        publisher = {Zenodo},
+        version = {v1.0},
+        doi = {10.5281/zenodo.3457447},
+        url = {https://doi.org/10.5281/zenodo.3457447}
+    }
+    """
+
+    name = "wisesight"
+    description = "Wisesight Sentiment Thai sentiment analysis task"
+    tags = ["sentiment_analysis"]
+
+    def __init__(self):
+        super().__init__()
+        self.splits = {"train": TRAIN_SPLIT, "test": TEST_SPLIT}
+        self.sentiment2label = {
+            "pos": "แง่บวก",
+            "neg": "แง่ลบ",
+            "neu": "เฉยๆ",
+        }
+
+    def download_dataset(self, output_path: str):
+        URL = "https://github.com/PyThaiNLP/wisesight-sentiment/raw/master/huggingface/data.zip"
+        data_path = os.path.join(output_path, "data")
+        ensure_file_downloaded(source_url=URL, target_path=data_path, unpack=True)
+
+        dataset: Dict[str, pd.DataFrame] = {}
+        for split in self.splits.keys():
+            target_path_file = os.path.join(data_path, "data", f"{split}.jsonl")
+            df = pd.read_json(target_path_file, lines=True)
+            df = df[df["category"] != "q"]  # Drop instances with the "question" label
+            if split == "test":
+                dataset[split] = df.groupby("category", group_keys=False).apply(
+                    lambda x: x.sample(frac=1000 / len(df), random_state=4183)
+                )
+            else:
+                dataset[split] = df
+        return dataset
+
+    def get_instances(self, output_path) -> List[Instance]:
+        dataset = self.download_dataset(output_path)
+        outputs = []
+        for split in self.splits.keys():
+            data = dataset[split]
+            for _, row in data.iterrows():
+                input = Input(row["texts"].strip())
+                output = Output(text=self.sentiment2label[row["category"]])
+                references = [
+                    Reference(output, tags=[CORRECT_TAG]),
+                ]
+                instance = Instance(input=input, references=references, split=self.splits[split])
+                outputs.append(instance)
+        return outputs
+
+
+# 2.4 Tamil: IndicSentiment
+class IndicSentimentScenario(Scenario):
+    """
+    IndicSentiment is a sentiment analysis scenario for 10 Indic languages. The data consists of
+    product reviews written in English that were then translated by native speakers of the
+    respective languages, resulting in a parallel dataset across the 10 languages.
+
+    Only the Tamil subset of the dataset is used for this scenario. Labels are positive or negative.
+
+    The models are prompted using the following format:
+
+        பின்வரும் வாக்கியத்தில் வெளிப்படுத்தப்படும் உணர்வு எது?
+        ஒரு சொல்லில் மட்டும் பதிலளிக்கவும்:
+        - நேர்மறை
+        - எதிர்மறை
+
+        வாக்கியம்: <text>
+        பதில்: <answer>
+
+        ...
+
+        வாக்கியம்: <text>
+        பதில்:
+
+    Target completion:
+        <sentiment> (positive or negative)
+
+    @inproceedings{doddapaneni-etal-2023-towards,
+        title = "Towards Leaving No {I}ndic Language Behind: Building Monolingual Corpora, Benchmark and Models for
+            {I}ndic Languages",
+        author = "Doddapaneni, Sumanth and
+            Aralikatte, Rahul and
+            Ramesh, Gowtham and
+            Goyal, Shreya and
+            Khapra, Mitesh M. and
+            Kunchukuttan, Anoop and
+            Kumar, Pratyush",
+        editor = "Rogers, Anna and
+            Boyd-Graber, Jordan and
+            Okazaki, Naoaki",
+        booktitle = "Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1:
+            Long Papers)",
+        month = jul,
+        year = "2023",
+        address = "Toronto, Canada",
+        publisher = "Association for Computational Linguistics",
+        url = "https://aclanthology.org/2023.acl-long.693",
+        doi = "10.18653/v1/2023.acl-long.693",
+        pages = "12402--12426",
+    }
+    """
+
+    name = "indicsentiment"
+    description = "IndicSentiment Tamil sentiment analysis task"
+    tags = ["sentiment_analysis"]
+
+    def __init__(self):
+        super().__init__()
+        self.splits = {"validation": TRAIN_SPLIT, "test": TEST_SPLIT}
+        self.sentiment2label = {
+            "Positive": "நேர்மறை",
+            "Negative": "எதிர்மறை",
+        }
+
+    def get_instances(self, output_path) -> List[Instance]:
+        dataset = datasets.load_dataset(
+            "ai4bharat/IndicSentiment",
+            "translation-ta",
+            revision="dc8f3f66886531c6897fedffca1e938a68fc5013",
+            trust_remote_code=True,
+        )
+
+        outputs = []
+        for split in self.splits.keys():
+            data = dataset[split].to_pandas()
+            # Current version on HuggingFace datasets has 2 instances without labels across all languages.
+            # Confirmed with first author that the labels for these instances should be Positive.
+            data["LABEL"] = data["LABEL"].fillna("Positive")
+            for _, row in data.iterrows():
+                input = Input(row["INDIC REVIEW"].strip())
+                output = Output(text=self.sentiment2label[row["LABEL"]])
+                references = [
+                    Reference(output, tags=[CORRECT_TAG]),
+                ]
+                instance = Instance(input=input, references=references, split=self.splits[split])
+                outputs.append(instance)
+        return outputs
+
+
+# 3. Toxicity Detection/Classification
+# 3.1 Indonesian: Multi-Label Hate Speech Detection
+class MLHSDScenario(Scenario):
+    """
+    Multi-Label Hate Speech and Abusive Language Detection (MLHSD) is an Indonesian toxicity
+    classification scenario. The data is obtained from Twitter, and PII has been anonymized to
+    USER and URL.
+
+    The original dataset was used for a multi-label classification task, but it has been repurposed
+    as a multi-class classification task to be more aligned with the task for other languages.
+    The mapping is done as follows:
+    - Clean: No abusive language or hate speech labels
+    - Abusive: Only abusive language label but no hate speech labels
+    - Hate: As long as one hate speech label is present
+
+    The models are prompted using the following format:
+
+        Anda adalah pendeteksi ujaran kebencian. Definisi dari labelnya adalah sebagai berikut:
+        Bersih: Tidak ada ujaran kebencian.
+        Kasar: Ada ujaran kebencian dan kata-kata kasar, namun tidak menyerang pihak tertentu.
+        Benci: Ada ujaran kebencian atau serangan langsung terhadap pihak tertentu.
+        Berdasarkan definisi labelnya, klasifikasikan kalimat berikut ini dengan satu kata saja:
+        - Bersih
+        - Kasar
+        - Benci
+
+        Kalimat: <text>
+        Jawaban: <answer>
+
+        ...
+
+        Kalimat: <text>
+        Jawaban:
+
+    Target completion:
+        <answer>
+
+    @inproceedings{ibrohim-budi-2019-multi,
+        title = "Multi-label Hate Speech and Abusive Language Detection in {I}ndonesian {T}witter",
+        author = "Ibrohim, Muhammad Okky and
+            Budi, Indra",
+        editor = "Roberts, Sarah T. and
+            Tetreault, Joel and
+            Prabhakaran, Vinodkumar and
+            Waseem, Zeerak",
+        booktitle = "Proceedings of the Third Workshop on Abusive Language Online",
+        month = aug,
+        year = "2019",
+        address = "Florence, Italy",
+        publisher = "Association for Computational Linguistics",
+        url = "https://aclanthology.org/W19-3506",
+        doi = "10.18653/v1/W19-3506",
+        pages = "46--57",
+    }
+    """
+
+    name = "mlhsd"
+    description = (
+        "Multi-Label Hate Speech and Abusive Language Detection (MLHSD) Indonesian toxicity classification task"
+    )
+    tags = ["toxicity_detection"]
+
+    def __init__(self):
+        super().__init__()
+        self.splits = {"train": TRAIN_SPLIT, "test": TEST_SPLIT}
+
+    def download_dataset(self, output_path: str):
+        BASE_URL = "https://raw.githubusercontent.com/okkyibrohim/"
+        URL = f"{BASE_URL}id-multi-label-hate-speech-and-abusive-language-detection/master/re_dataset.csv"
+        target_path_file = os.path.join(output_path, "mlhsd")
+        ensure_file_downloaded(source_url=URL, target_path=target_path_file)
+        df = pd.read_csv(target_path_file, encoding="ISO-8859-1")
+
+        # Map multi-label task to multi-class task
+        df["label"] = df.apply(lambda x: self.get_label(x), axis=1)
+
+        df_test = df.groupby("label", group_keys=False).apply(
+            lambda x: x.sample(frac=1000 / len(df), random_state=7123)
+        )
+
+        # In-context examples to be drawn from remaining examples (since there is no train/dev data)
+        df_train = df[~df.index.isin(df_test.index)]
+        dataset = {
+            "train": df_train,
+            "test": df_test,
+        }
+        return dataset
+
+    def get_label(self, row) -> str:
+        if int(row["HS"]) == 1:
+            return "Benci"
+        elif int(row["Abusive"]) == 1:
+            return "Kasar"
+        else:
+            return "Bersih"
+
+    def get_instances(self, output_path) -> List[Instance]:
+        dataset = self.download_dataset(output_path)
+        outputs = []
+        for split in self.splits.keys():
+            data = dataset[split]
+            for _, row in data.iterrows():
+                input = Input(row["Tweet"].strip())
+                output = Output(text=row["label"])
+                references = [
+                    Reference(output, tags=[CORRECT_TAG]),
+                ]
+                instance = Instance(input=input, references=references, split=self.splits[split])
+                outputs.append(instance)
+        return outputs
+
+
+# 3.2 Vietnamese: ViHSD
+class ViHSDScenario(Scenario):
+    """
+    ViHSD is a Vietnamese toxicity classification scenario. The data is obtained from social media.
+    The labels are Clean, Offensive and Hate.
+
+    The models are prompted using the following format:
+
+        Bạn là máy phát hiện phát ngôn thù ghét. Các nhãn được định nghĩa như sau:
+        Sạch: Không quấy rối.
+        Công kích: Bao gồm quấy rối và thậm chí chửi thề, nhưng không tấn công bất kì đối tượng cụ thể nào.
+        Thù ghét: Trực tiếp quấy rối hay lăng mạ một đối tượng cụ thể.
+        Với các định nghĩa của nhãn, hãy phân loại câu dưới đây với một từ duy nhất:
+        - Sạch
+        - Công kích
+        - Thù ghét
+
+        Câu văn: <text>
+        Câu trả lời: <toxicity>
+
+        ...
+
+        Câu văn: <text>
+        Câu trả lời:
+
+    Target completion:
+        <toxicity>
+
+    @InProceedings{10.1007/978-3-030-79457-6_35,
+        author="Luu, Son T.
+            and Nguyen, Kiet Van
+            and Nguyen, Ngan Luu-Thuy",
+        editor="Fujita, Hamido
+            and Selamat, Ali
+            and Lin, Jerry Chun-Wei
+            and Ali, Moonis",
+        title="A Large-Scale Dataset for Hate Speech Detection on Vietnamese Social Media Texts",
+        booktitle="Advances and Trends in Artificial Intelligence. Artificial Intelligence Practices",
+        year="2021",
+        publisher="Springer International Publishing",
+        address="Cham",
+        pages="415--426",
+        isbn="978-3-030-79457-6",
+        url="https://link.springer.com/chapter/10.1007/978-3-030-79457-6_35",
+    }
+    """
+
+    name = "vihsd"
+    description = "ViHSD Vietnamese toxicity classification task"
+    tags = ["toxicity_detection"]
+
+    def __init__(self):
+        super().__init__()
+        self.splits = {"train": TRAIN_SPLIT, "test": TEST_SPLIT}
+        self.id2label = {
+            0: "Sạch",
+            1: "Công kích",
+            2: "Thù ghét",
+        }
+
+    def download_dataset(self, output_path: str):
+        URL = "https://raw.githubusercontent.com/sonlam1102/vihsd/main/data/vihsd.zip"
+        data_path = os.path.join(output_path, "data")
+        ensure_file_downloaded(source_url=URL, target_path=data_path, unpack=True)
+
+        dataset: Dict[str, pd.DataFrame] = {}
+        for split in self.splits.keys():
+            target_path_file = os.path.join(data_path, "vihsd", f"{split}.csv")
+            df = pd.read_csv(target_path_file)
+            data = df.groupby("label_id", group_keys=False).apply(
+                lambda x: x.sample(frac=1000 / len(df), random_state=4878)
+            )
+            dataset[split] = data
+        return dataset
+
+    def get_instances(self, output_path) -> List[Instance]:
+        dataset = self.download_dataset(output_path)
+        outputs = []
+        for split in self.splits.keys():
+            data = dataset[split]
+            for _, row in data.iterrows():
+                input = Input(str(row["free_text"]).strip())
+                output = Output(text=self.id2label[int(row["label_id"])])
+                references = [
+                    Reference(output, tags=[CORRECT_TAG]),
+                ]
+                instance = Instance(input=input, references=references, split=self.splits[split])
+                outputs.append(instance)
+        return outputs
+
+
+# 3.3 Thai: Thai Toxicity Tweets
+class ThaiToxicityTweetsScenario(Scenario):
+    """
+    Thai Toxicity Tweets is a Thai toxicity detection scenario. The data is obtained from Twitter.
+    Instances that had no labels or had "TWEET_NOT_FOUND" as the text were dropped from the dataset.
+    The labels are either Y (the text is toxic) or N (the text is clean).
+
+    The models are prompted using the following format:
+
+        คุณเป็นเครื่องตรวจจับข้อความที่เป็นพิษ
+        ข้อความเป็นพิษถ้ามันทำร้าย สร้างความเสียหาย หรือสื่อเจตนาเชิงลบถึงบุคคลหรือกลุ่มบุคคล
+        และคุณควรคำนึงถึงการประชดประชันด้วย
+        เมื่อได้รับข้อความ ให้ตอบด้วยตัวอักษร Y หากเป็นพิษ และ N หากไม่เป็นพิษ
+
+        ข้อความ: <text>
+        คำตอบ: <toxicity>
+
+        ...
+
+        ข้อความ: <text>
+        คำตอบ:
+
+    Target completion:
+        <toxicity>
+
+    @inproceedings{sirihattasak2018annotation,
+        title={Annotation and classification of toxicity for Thai Twitter},
+        author={Sirihattasak, Sugan and Komachi, Mamoru and Ishikawa, Hiroshi},
+        booktitle={TA-COS 2018: 2nd Workshop on Text Analytics for Cybersecurity and Online Safety},
+        pages={1},
+        year={2018},
+        url={http://www.lrec-conf.org/workshops/lrec2018/W32/pdf/1_W32.pdf},
+    }
+    """
+
+    name = "thaitoxicitytweets"
+    description = "Thai Toxicity Tweets toxicity detection task"
+    tags = ["toxicity_detection"]
+
+    def __init__(self):
+        super().__init__()
+        self.splits = {"train": TRAIN_SPLIT, "test": TEST_SPLIT}
+        self.id2label = {
+            0: "N",
+            1: "Y",
+        }
+
+    def get_instances(self, output_path) -> List[Instance]:
+        dataset = datasets.load_dataset(
+            "tmu-nlp/thai_toxicity_tweet",
+            split="train",
+            revision="aa021e41d0ee6dbee2975fbed620ec8c586bdaf6",
+            trust_remote_code=True,
+        )
+        df = dataset.to_pandas()
+
+        # Drop instances where there are no labels or text is "TWEET_NOT_FOUND"
+        df = df[df["tweet_text"].str.len() > 0]
+        df = df[df["tweet_text"] != "TWEET_NOT_FOUND"]
+
+        df_test = df.groupby("is_toxic", group_keys=False).apply(
+            lambda x: x.sample(frac=1000 / len(df), random_state=4156)
+        )
+
+        # In-context examples to be drawn from remaining examples (since there is no train/dev data)
+        df_train = df[~df.index.isin(df_test.index)]
+
+        dataset = {
+            "train": df_train,
+            "test": df_test,
+        }
+
+        outputs = []
+        for split in self.splits.keys():
+            data = dataset[split]
+            for _, row in data.iterrows():
+                input = Input(row["tweet_text"].strip())
+                output = Output(text=self.id2label[int(row["is_toxic"])])
+                references = [
+                    Reference(output, tags=[CORRECT_TAG]),
+                ]
+                instance = Instance(input=input, references=references, split=self.splits[split])
+                outputs.append(instance)
+        return outputs
+
+
+# B. Natural Language Generation
+# 1. Machine Translation
+
+
+# 1. Machine Translation: FLoRes-200
+class FloresScenario(Scenario):
+    """
+    FLoRes-200 is a machine translation scenario for 200+ languages. The data is obtained from English Wikimedia
+    projects (Wikivoyage, Wikijunior and Wikinews), and professionally translated across 200+ languages to obtain a
+    parallel dataset.
+
+    Only the English, Indonesian, Vietnamese, Thai and Tamil subsets are used in this scenario. Both directions
+    (in and out of English) for each Southeast Asian language are included in the scenario.
+
+    The models are prompted using the following general format:
+
+        Translate the following text into <language> language.
+
+        Text: <text>
+        Translation: <translation>
+
+        ...
+
+        Text: <text>
+        Translation:
+
+    Target completion:
+        <translation>
+
+    @article{nllb2022,
+        author = {NLLB Team, Marta R. Costa-jussà, James Cross, Onur Çelebi, Maha Elbayad, Kenneth Heafield,
+            Kevin Heffernan, Elahe Kalbassi, Janice Lam, Daniel Licht, Jean Maillard, Anna Sun, Skyler Wang,
+            Guillaume Wenzek, Al Youngblood, Bapi Akula, Loic Barrault, Gabriel Mejia Gonzalez, Prangthip Hansanti,
+            John Hoffman, Semarley Jarrett, Kaushik Ram Sadagopan, Dirk Rowe, Shannon Spruit, Chau Tran,
+            Pierre Andrews, Necip Fazil Ayan, Shruti Bhosale, Sergey Edunov, Angela Fan, Cynthia Gao,
+            Vedanuj Goswami, Francisco Guzmán, Philipp Koehn, Alexandre Mourachko, Christophe Ropers,
+            Safiyyah Saleem, Holger Schwenk, Jeff Wang
+        },
+        title = {No Language Left Behind: Scaling Human-Centered Machine Translation},
+        year = {2022},
+        url = {https://research.facebook.com/publications/no-language-left-behind/},
+    }
+    """
+
+    name = "flores"
+    description = "FLoRes-200 machine translation task"
+    tags = ["machine_translation"]
+
+    def __init__(self, pair: str):
+        super().__init__()
+        self.pair = pair
+        self.source = pair.split("_")[0]
+        self.target = pair.split("_")[1]
+
+        self.splits = {"dev": TRAIN_SPLIT, "devtest": TEST_SPLIT}
+
+        self.languages = {
+            "en": "eng_Latn",
+            "id": "ind_Latn",
+            "vi": "vie_Latn",
+            "th": "tha_Thai",
+            "ta": "tam_Taml",
+        }
+
+        if self.source not in self.languages.keys() or self.target not in self.languages.keys():
+            raise Exception(f"Unsupported language/s - supported languages are {self.languages.keys()}")
+
+    def get_instances(self, output_path) -> List[Instance]:
+        source_dataset = datasets.load_dataset(
+            "facebook/flores",
+            self.languages[self.source],
+            revision="2db78afdeaccaedc3b33a95442a4e55766887e17",
+            trust_remote_code=True,
+        )
+        target_dataset = datasets.load_dataset(
+            "facebook/flores",
+            self.languages[self.target],
+            revision="2db78afdeaccaedc3b33a95442a4e55766887e17",
+            trust_remote_code=True,
+        )
+
+        outputs = []
+        for split in self.splits.keys():
+            source_df = source_dataset[split].to_pandas()
+            target_df = target_dataset[split].to_pandas()
+            data = source_df.join(target_df, lsuffix="_source", rsuffix="_target")
+            for _, row in data.iterrows():
+                input = Input(row["sentence_source"].strip())
+                output = Output(row["sentence_target"].strip())
+                references = [
+                    Reference(output, tags=[CORRECT_TAG]),
+                ]
+                instance = Instance(input=input, references=references, split=self.splits[split])
+                outputs.append(instance)
+        return outputs
+
+
+ # C. Natural Language Reasoning
1111
+ # 1. Natural Language Inference
1112
+ # 2. Causal Reasoning
1113
+
1114
+
1115
+ # 1. Natural Language Inference
1116
+ # 1.1 Indonesian: IndoNLI
1117
+ class IndoNLIScenario(Scenario):
1118
+ """
1119
+ IndoNLI is an Indonesian Natural Language Inference (NLI) scenario. The data is sourced from Wikipedia, news,
1120
+ and web articles. Native speakers use premise text from these sources and write hypothesis sentences for each
1121
+ NLI label. The labels are entailment, contradiction, or neutral.
1122
+
1123
+ The models are prompted using the following format:
1124
+
1125
+ Anda akan diberikan dua kalimat, X dan Y.
1126
+ Tentukan mana dari pernyataan berikut ini yang paling sesuai untuk kalimat X dan Y.
1127
+ A: Kalau X benar, maka Y juga harus benar.
1128
+ B: X bertentangan dengan Y.
1129
+ C: Ketika X benar, Y mungkin benar atau mungkin tidak benar.
1130
+ Jawablah dengan satu huruf saja, A, B atau C.
1131
+
1132
+ X: <sentence1>
1133
+ Y: <sentence2>
1134
+ Jawaban: <entailment>
1135
+
1136
+ ...
1137
+
1138
+ X: <sentence1>
1139
+ Y: <sentence2>
1140
+ Jawaban:
1141
+
1142
+ Target completion:
1143
+ <entailment>
1144
+
1145
+ @inproceedings{mahendra-etal-2021-indonli,
1146
+ title = "{I}ndo{NLI}: A Natural Language Inference Dataset for {I}ndonesian",
1147
+ author = "Mahendra, Rahmad and Aji, Alham Fikri and Louvan, Samuel and Rahman, Fahrurrozi and Vania, Clara",
1148
+ booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing",
1149
+ month = nov,
1150
+ year = "2021",
1151
+ address = "Online and Punta Cana, Dominican Republic",
1152
+ publisher = "Association for Computational Linguistics",
1153
+ url = "https://aclanthology.org/2021.emnlp-main.821",
1154
+ pages = "10511--10527",
1155
+ }
1156
+ """
1157
+
1158
+ name = "indonli"
1159
+ description = "IndoNLI Indonesian Natural Language Inference task"
1160
+ tags = ["natural_language_inference"]
1161
+
1162
+ def __init__(self):
1163
+ super().__init__()
1164
+ self.splits = {
1165
+ "train": TRAIN_SPLIT,
1166
+ "test": TEST_SPLIT,
1167
+ }
1168
+ self.id2label = {"e": "A", "c": "B", "n": "C"}
1169
+
1170
+ def download_dataset(self, output_path: str):
1171
+ URLS = {
1172
+ "train": "https://raw.githubusercontent.com/ir-nlp-csui/indonli/main/data/indonli/train.jsonl",
1173
+ "test": "https://raw.githubusercontent.com/ir-nlp-csui/indonli/main/data/indonli/test_lay.jsonl",
1174
+ }
1175
+
1176
+ dataset: Dict[str, pd.DataFrame] = {}
1177
+ for split in self.splits.keys():
1178
+ target_path_file = os.path.join(output_path, split)
1179
+ ensure_file_downloaded(source_url=URLS[split], target_path=target_path_file)
1180
+ df = pd.read_json(target_path_file, lines=True)
1181
+ if split == "test":
1182
+ dataset[split] = df.groupby("label", group_keys=False).apply(
1183
+ lambda x: x.sample(frac=1000 / len(df), random_state=4685)
1184
+ )
1185
+ else:
1186
+ dataset[split] = df
1187
+ return dataset
1188
+
1189
+ def get_instances(self, output_path) -> List[Instance]:
1190
+ dataset = self.download_dataset(output_path)
1191
+ outputs = []
1192
+ for split in self.splits.keys():
1193
+ data = dataset[split]
1194
+ for _, row in data.iterrows():
1195
+ passage = "X: " + row["premise"].strip() + "\nY: " + row["hypothesis"].strip()
1196
+ input = Input(passage)
1197
+ output = Output(self.id2label[row["label"]])
1198
+ references = [
1199
+ Reference(output, tags=[CORRECT_TAG]),
1200
+ ]
1201
+ instance = Instance(input=input, references=references, split=self.splits[split])
1202
+ outputs.append(instance)
1203
+ return outputs
1204
+
1205
+
+# 1.2 Vietnamese & Thai: XNLI
+class XNLIScenario(Scenario):
+    """
+    XNLI is a Natural Language Inference scenario for 15 languages. The data was constructed by following the
+    MultiNLI crowdsourcing procedure to obtain English data, which was then professionally translated into
+    14 other languages. The labels are entailment, neutral, or contradiction.
+
+    The models are prompted using the following general format:
+
+    You will be given two sentences, X and Y.
+    Determine which of the following statements applies to sentences X and Y the best.
+    A: If X is true, Y must be true.
+    B: X contradicts Y.
+    C: When X is true, Y may or may not be true.
+    Answer strictly with a single letter A, B or C.
+
+    X: <sentence1>
+    Y: <sentence2>
+    Answer: <entailment>
+
+    ...
+
+    X: <sentence1>
+    Y: <sentence2>
+    Answer:
+
+    Target completion:
+        <entailment>
+
+    @inproceedings{conneau-etal-2018-xnli,
+        title = "{XNLI}: Evaluating Cross-lingual Sentence Representations",
+        author = "Conneau, Alexis and
+            Rinott, Ruty and
+            Lample, Guillaume and
+            Williams, Adina and
+            Bowman, Samuel and
+            Schwenk, Holger and
+            Stoyanov, Veselin",
+        editor = "Riloff, Ellen and
+            Chiang, David and
+            Hockenmaier, Julia and
+            Tsujii, Jun{'}ichi",
+        booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing",
+        month = oct # "-" # nov,
+        year = "2018",
+        address = "Brussels, Belgium",
+        publisher = "Association for Computational Linguistics",
+        url = "https://aclanthology.org/D18-1269",
+        doi = "10.18653/v1/D18-1269",
+        pages = "2475--2485",
+    }
+    """
+
+    name = "xnli"
+    description = "XNLI Natural Language Inference task"
+    tags = ["natural_language_inference"]
+
+    def __init__(self, language: str):
+        super().__init__()
+        self.language = language
+        self.splits = {
+            "validation": TRAIN_SPLIT,
+            "test": TEST_SPLIT,
+        }
+        self.id2label = {0: "A", 2: "B", 1: "C"}
+        self.supported_languages = ["th", "vi"]
+        if self.language not in self.supported_languages:
+            raise Exception(f"{self.language} not supported. Supported languages are {self.supported_languages}.")
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        dataset = datasets.load_dataset("xnli", self.language)
+        outputs = []
+        for split in self.splits.keys():
+            df = dataset[split].to_pandas()
+            if split == "validation":
+                data = df
+            else:
+                # This produces 999 instances
+                data = df.groupby("label", group_keys=False).apply(
+                    lambda x: x.sample(frac=1000 / len(df), random_state=4156)
+                )
+
+                # Add 1 neutral instance (label 1) from the remaining instances to make 1000 in total
+                remainder = df[~df.index.isin(data.index)]
+                neutral_instance = remainder[remainder["label"] == 1].iloc[0].to_frame().transpose()
+                data = pd.concat([data, neutral_instance], axis=0, ignore_index=True)
+            for _, row in data.iterrows():
+                passage = "X: " + row["premise"].strip() + "\nY: " + row["hypothesis"].strip()
+                input = Input(passage)
+                output = Output(self.id2label[int(row["label"])])
+                references = [
+                    Reference(output, tags=[CORRECT_TAG]),
+                ]
+                instance = Instance(input=input, references=references, split=self.splits[split])
+                outputs.append(instance)
+        return outputs
+
+
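Because the per-group rounding above can undershoot the target (999 rather than 1,000 instances), the scenario tops the test split up with one leftover instance. A small sketch of that step, again on hypothetical toy data:

    import pandas as pd

    df = pd.DataFrame({"label": [0, 1, 2] * 3333})  # 9,999 rows, XNLI-style integer labels
    data = df.groupby("label", group_keys=False).apply(
        lambda x: x.sample(frac=1000 / len(df), random_state=4156)
    )  # 999 rows: each group of 3,333 samples 333

    # Take one not-yet-sampled neutral row (label 1) and append it.
    remainder = df[~df.index.isin(data.index)]
    extra = remainder[remainder["label"] == 1].iloc[0].to_frame().transpose()
    data = pd.concat([data, extra], axis=0, ignore_index=True)
    print(len(data))  # 1000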
+# 1.3 Tamil: IndicXNLI
+class IndicXNLIScenario(Scenario):
+    """
+    IndicXNLI is a Natural Language Inference scenario for 11 Indic languages. The data was
+    automatically translated from the English XNLI dataset into 11 Indic languages using
+    IndicTrans (Ramesh et al., 2021).
+
+    Only the Tamil subset of the data is used in this scenario. The labels are
+    entailment, contradiction, or neutral.
+
+    The models are prompted using the following format:
+
+    உங்களுக்கு இரண்டு வாக்கியங்கள், X மற்றும் Y, தரப்படும்.
+    பின்வரும் கூற்றுகளில் எது X மற்றும் Y வாக்கியங்களுடன் மிகப் பொருந்துகிறது எனக் கண்டறியவும்.
+    A: X உண்மை என்றால் Y உம் உண்மையாக இருக்க வேண்டும்.
+    B: X உம் Y உம் முரண்படுகின்றன.
+    C: X உண்மையாக இருக்கும்போது Y உண்மையாக இருக்கலாம் அல்லது இல்லாமல் இருக்கலாம்.
+    A அல்லது B அல்லது C என்ற ஒறே எழுத்தில் மட்டும் பதிலளிக்கவும்.
+
+    X: <premise>
+    Y: <hypothesis>
+    பதில்: <entailment>
+
+    ...
+
+    X: <premise>
+    Y: <hypothesis>
+    பதில்:
+
+    Target completion:
+        <entailment>
+
+    @inproceedings{aggarwal-etal-2022-indicxnli,
+        title = "{I}ndic{XNLI}: Evaluating Multilingual Inference for {I}ndian Languages",
+        author = "Aggarwal, Divyanshu and
+            Gupta, Vivek and
+            Kunchukuttan, Anoop",
+        editor = "Goldberg, Yoav and
+            Kozareva, Zornitsa and
+            Zhang, Yue",
+        booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing",
+        month = dec,
+        year = "2022",
+        address = "Abu Dhabi, United Arab Emirates",
+        publisher = "Association for Computational Linguistics",
+        url = "https://aclanthology.org/2022.emnlp-main.755",
+        doi = "10.18653/v1/2022.emnlp-main.755",
+        pages = "10994--11006",
+    }
+    """
+
+    name = "indicxnli"
+    description = "IndicXNLI Natural Language Inference task"
+    tags = ["natural_language_inference"]
+
+    def __init__(self):
+        super().__init__()
+        self.splits = {
+            "validation": TRAIN_SPLIT,
+            "test": TEST_SPLIT,
+        }
+        self.id2label = {0: "A", 2: "B", 1: "C"}
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        dataset = datasets.load_dataset("Divyanshu/indicxnli", "ta")
+
+        outputs = []
+        for split in self.splits.keys():
+            df = dataset[split].to_pandas()
+            if split == "validation":
+                data = df
+            else:
+                # This produces 999 instances
+                data = df.groupby("label", group_keys=False).apply(
+                    lambda x: x.sample(frac=1000 / len(df), random_state=4156)
+                )
+
+                # Add 1 contradiction instance (label 2) from the remaining instances to make 1000 in total
+                remainder = df[~df.index.isin(data.index)]
+                extra_instance = remainder[remainder["label"] == 2].iloc[0].to_frame().transpose()
+                data = pd.concat([data, extra_instance], axis=0, ignore_index=True)
+            for _, row in data.iterrows():
+                passage = "X: " + row["premise"].strip() + "\nY: " + row["hypothesis"].strip()
+                input = Input(passage)
+                output = Output(text=self.id2label[int(row["label"])])
+                references = [
+                    Reference(output, tags=[CORRECT_TAG]),
+                ]
+                instance = Instance(input=input, references=references, split=self.splits[split])
+                outputs.append(instance)
+        return outputs
+
+
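Both of the XNLI-style scenarios rely on the Hugging Face datasets-to-pandas bridge before any sampling happens. A minimal sketch, assuming the `datasets` library is installed and network access is available; the column names in the comment are what the downstream code expects, not something this sketch guarantees:

    import datasets

    dataset = datasets.load_dataset("Divyanshu/indicxnli", "ta")
    df = dataset["validation"].to_pandas()
    print(df.columns.tolist())  # expected: ['premise', 'hypothesis', 'label']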
+# 2. Causal Reasoning: XCOPA
+class XCOPAScenario(Scenario):
+    """
+    XCOPA is a commonsense causal reasoning scenario covering 11 languages. The data is sourced from the English
+    COPA dataset and professionally translated into 11 languages to create a parallel dataset.
+
+    Only the Indonesian, Vietnamese, Thai and Tamil subsets are used in this scenario. Each instance consists of
+    a premise and two sentences. The system under test needs to determine which of the two sentences is more likely
+    to be the cause/effect of the premise. Whether the cause or the effect is asked for differs from instance to
+    instance. Although there should be an equal number of instances asking for the cause and for the effect, the
+    BHASA paper (Leong et al., 2023) found that this was not the case for Indonesian and Thai. This scenario fixes
+    the cause/effect labels by harmonizing them across the four languages, using the Tamil subset as the reference.
+
+    The models are prompted using the following general format:
+
+    Based on the following situation, which of the following choices is most likely to be its {cause/effect}?
+    Answer only with a single letter A or B.
+
+    Situation: <premise>
+    A: <choice1>
+    B: <choice2>
+    Answer: <answer>
+
+    ...
+
+    Situation: <premise>
+    A: <choice1>
+    B: <choice2>
+    Answer:
+
+    Target completion:
+        <answer>
+
+    @article{ponti2020xcopa,
+        title={{XCOPA: A} Multilingual Dataset for Causal Commonsense Reasoning},
+        author={Edoardo M. Ponti, Goran Glava{\v{s}}, Olga Majewska, Qianchu Liu, Ivan Vuli{\'c} and Anna Korhonen},
+        journal={arXiv preprint},
+        year={2020},
+        url={https://ducdauge.github.io/files/xcopa.pdf}
+    }
+
+    @inproceedings{roemmele2011choice,
+        title={Choice of plausible alternatives: An evaluation of commonsense causal reasoning},
+        author={Roemmele, Melissa and Bejan, Cosmin Adrian and Gordon, Andrew S},
+        booktitle={2011 AAAI Spring Symposium Series},
+        year={2011},
+        url={https://people.ict.usc.edu/~gordon/publications/AAAI-SPRING11A.PDF},
+    }
+    """
+
+    name = "xcopa"
+    description = "XCOPA causal reasoning task"
+    tags = ["causal_reasoning"]
+
+    def __init__(self, language: str):
+        super().__init__()
+        self.language = language
+        self.splits = {
+            "validation": TRAIN_SPLIT,
+            "test": TEST_SPLIT,
+        }
+        self.id2label = {
+            0: "A",
+            1: "B",
+        }
+        self.language_to_prompt_components = {
+            "id": {
+                "cause": "sebab",
+                "effect": "akibat",
+                "instruction1": "Berdasarkan situasi di atas, mana dari pilihan-pilihan berikut ini yang lebih "
+                "mungkin menjadi {}?",
+                "instruction2": "Jawablah dengan satu huruf saja, A atau B.",
+            },
+            "ta": {
+                "cause": "காரணமாக",
+                "effect": "விளைவாக",
+                "instruction1": "பின்வரும் வாக்கியங்களில் பெரும்பாலும் எது தரப்பட்ட சூழ்நிலைக்குரிய {} இருக்கும்?",
+                "instruction2": "A அல்லது B என்ற ஒறே எழுத்தில் மட்டும் பதிலளிக்கவும்.",
+            },
+            "th": {
+                "cause": "สาเหตุ",
+                "effect": "ผล",
+                "instruction1": "เมื่อพิจารณาจากสถานการณ์นี้ ตัวเลือกใดต่อไปนี้น่าจะเป็น{}มากกว่ากัน?",
+                "instruction2": "กรุณาตอบด้วยตัวอักษร A หรือ B ตัวเดียวเท่านั้น",
+            },
+            "vi": {
+                "cause": "nguyên nhân",
+                "effect": "kết quả",
+                "instruction1": "Với tình huống trên, lựa chọn nào dưới đây có khả năng cao là {} của nó hơn?",
+                "instruction2": "Trả lời với một chữ cái duy nhất A hoặc B.",
+            },
+        }
+        if self.language not in self.language_to_prompt_components.keys():
+            raise Exception(
+                f"{self.language} not supported. Supported languages are {self.language_to_prompt_components.keys()}."
+            )
+        else:
+            self.prompt_components = self.language_to_prompt_components[self.language]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        language_dataset = datasets.load_dataset("xcopa", self.language)
+        tamil_dataset = datasets.load_dataset("xcopa", "ta")
+
+        outputs = []
+        for split in self.splits.keys():
+            language_df = language_dataset[split].to_pandas()
+            tamil_df = tamil_dataset[split].to_pandas()
+            data = pd.merge(
+                language_df, tamil_df[["question", "idx"]], on="idx"
+            )  # Use the Tamil split's question column
+            for _, row in data.iterrows():
+                instruction1 = self.prompt_components["instruction1"].format(self.prompt_components[row["question_y"]])
+                passage = "{premise}\n{instruction1}\nA: {choice1}\nB: {choice2}\n{instruction2}".format(
+                    premise=row["premise"].strip(),
+                    instruction1=instruction1,
+                    choice1=row["choice1"].strip(),
+                    choice2=row["choice2"].strip(),
+                    instruction2=self.prompt_components["instruction2"],
+                )
+                input = Input(passage)
+                output = Output(self.id2label[int(row["label"])])
+                references = [
+                    Reference(output, tags=[CORRECT_TAG]),
+                ]
+                instance = Instance(input=input, references=references, split=self.splits[split])
+                outputs.append(instance)
+        return outputs
+
+
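The label harmonization in get_instances above hinges on pandas' merge suffixing: both frames carry a question column, so after the merge the target language's copy becomes question_x and the Tamil reference copy becomes question_y. A toy sketch with hypothetical rows:

    import pandas as pd

    language_df = pd.DataFrame(
        {"idx": [0, 1], "question": ["cause", "cause"], "premise": ["p0", "p1"]}
    )
    tamil_df = pd.DataFrame({"idx": [0, 1], "question": ["cause", "effect"]})

    data = pd.merge(language_df, tamil_df[["question", "idx"]], on="idx")
    print(data[["question_x", "question_y"]])
    # question_y carries the Tamil-reference cause/effect label used to build the prompt.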
+# 1. Syntax: LINDSEA Minimal Pairs
+class LINDSEASyntaxMinimalPairsScenario(Scenario):
+    """
+    The LINDSEA Minimal Pairs dataset is a linguistic diagnostic scenario targeting syntactic phenomena.
+    The data is manually handcrafted by linguists and native speakers and verified through multiple rounds
+    of quality control. The high-level categories tested include morphology, argument structure,
+    filler-gap dependencies, and negative polarity items and negation.
+
+    Each test item is a minimal pair: two sentences that differ minimally from each other and exemplify a
+    specific syntactic phenomenon. The system under test needs to determine which sentence of the pair is
+    more acceptable.
+
+    The models are prompted using the following general format:
+
+    Which sentence is more acceptable?
+    Answer only with a single letter A or B.
+    <sentence>
+
+    Target completion:
+        <sentence>
+
+    With the mcq method, the two sentences of a pair are presented as options A and B; with the logprob
+    method, each sentence is scored separately and the log probabilities are compared.
+
+    @misc{leong2023bhasa,
+        title={BHASA: A Holistic Southeast Asian Linguistic and Cultural Evaluation Suite for Large Language Models},
+        author={Wei Qi Leong
+            and Jian Gang Ngui
+            and Yosephine Susanto
+            and Hamsawardhini Rengarajan
+            and Kengatharaiyer Sarveswaran
+            and William Chandra Tjhi
+        },
+        year={2023},
+        eprint={2309.06085},
+        archivePrefix={arXiv},
+        primaryClass={cs.CL},
+        url={https://arxiv.org/abs/2309.06085},
+    }
+    """
+
+    name = "lindsea_minimal_pairs"
+    description = "LINDSEA minimal pairs task"
+    tags = ["linguistic_diagnostic", "syntax", "minimal_pairs"]
+
+    def __init__(self, method: str, language: str):
+        super().__init__()
+        self.method = method
+        self.language = language
+        self.language_to_prompt_components = {
+            "id": {
+                "instructions": "Kalimat mana yang lebih mungkin?",
+                "output_prefix": "Jawablah dengan satu huruf saja, A atau B.",
+            }
+        }
+        if self.language not in self.language_to_prompt_components.keys():
+            raise Exception(
+                f"{self.language} not supported. Supported languages are {self.language_to_prompt_components.keys()}."
+            )
+        else:
+            self.prompt_components = self.language_to_prompt_components[self.language]
+
+    def download_dataset(self, output_path: str):
+        BASE_URL = "https://raw.githubusercontent.com/aisingapore/BHASA/main/lindsea/"
+        URLS = {
+            "npis_and_negation": f"{BASE_URL}{self.language}/syntax/NPIs_and_negation.jsonl",
+            "argument_structure": f"{BASE_URL}{self.language}/syntax/argument_structure.jsonl",
+            "filler_gap_dependencies": f"{BASE_URL}{self.language}/syntax/filler-gap_dependencies.jsonl",
+            "morphology": f"{BASE_URL}{self.language}/syntax/morphology.jsonl",
+        }
+
+        data_files = {}
+        for file in URLS:
+            target_path_file = os.path.join(output_path, file)
+            ensure_file_downloaded(source_url=URLS[file], target_path=target_path_file)
+            data_files[file] = pd.read_json(target_path_file, lines=True)
+        dataset = pd.concat(data_files)
+
+        return dataset
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data = self.download_dataset(output_path)
+
+        outputs = []
+        if self.method == "mcq":
+            category_list = data["category"].value_counts().keys()
+
+            hlog("MCQ method for LINDSEA Minimal Pairs chosen. Shuffling options...")
+            for category in category_list:
+                # Fix shuffling within each category
+                random.seed(1)
+                for _, row in data[data["category"] == category].iterrows():
+                    options = [(row["correct"], 1), (row["wrong"], 2)]
+                    random.shuffle(options)
+                    options_reversed = options[0][1] == 2
+                    instructions = self.prompt_components["instructions"]
+                    output_prefix = self.prompt_components["output_prefix"]
+                    prompt = f"{instructions}\nA: {options[0][0]}\nB: {options[1][0]}\n{output_prefix}"
+                    input = Input(text=prompt)
+                    # Determine correct option based on whether shuffling reversed the options
+                    references = [
+                        Reference(Output(text="A"), tags=[] if options_reversed else [CORRECT_TAG]),
+                        Reference(Output(text="B"), tags=[CORRECT_TAG] if options_reversed else []),
+                    ]
+                    instance = Instance(input=input, references=references, split=TEST_SPLIT)
+                    outputs.append(instance)
+
+        else:
+            for _, row in data.iterrows():
+                # No need to shuffle since we are comparing logprobs of the options separately
+                input = Input(text="")
+                references = [
+                    Reference(Output(text=row["correct"].strip()), tags=[CORRECT_TAG]),
+                    Reference(Output(text=row["wrong"].strip()), tags=[]),
+                ]
+                instance = Instance(
+                    input=input,
+                    references=references,
+                    split=TEST_SPLIT,
+                )
+                outputs.append(instance)
+        return outputs
+
+
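The mcq branch above tracks the correct answer through the shuffle by pairing each sentence with a marker (1 = correct, 2 = wrong) before shuffling; whichever marker lands in slot A determines which letter receives CORRECT_TAG. A standalone sketch of that mechanic:

    import random

    random.seed(1)  # the scenario reseeds per category so the shuffling is reproducible
    options = [("correct sentence", 1), ("wrong sentence", 2)]
    random.shuffle(options)

    options_reversed = options[0][1] == 2  # True when the wrong sentence landed in slot A
    answer = "B" if options_reversed else "A"
    print(f"A: {options[0][0]}\nB: {options[1][0]}\nanswer: {answer}")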
+# 2.1 Pragmatics: LINDSEA Presuppositions
+class LINDSEAPragmaticsPresuppositionsScenario(Scenario):
+    """
+    The LINDSEA Presuppositions dataset is a linguistic diagnostic scenario targeting pragmatics.
+    The data is manually handcrafted by linguists and native speakers and verified through multiple rounds
+    of quality control.
+
+    The presuppositions dataset involves two formats: single sentences and sentence pairs.
+    For single sentence questions, the system under test needs to determine if the sentence is true or false.
+    For pair sentence questions, the system under test needs to determine whether a conclusion can be drawn
+    from another sentence.
+
+    For the single format, the models are prompted using the following general format:
+
+    Is the following statement true or false?
+    Statement: <sentence>
+    Answer only with True or False.
+
+    For the pair format, the models are prompted using the following general format:
+
+    Situation: <premise>
+    Given this situation, is the following statement true or false?
+    Statement: <hypothesis>
+    Answer only with True or False.
+
+    Target completion:
+        <answer>
+
+    @misc{leong2023bhasa,
+        title={BHASA: A Holistic Southeast Asian Linguistic and Cultural Evaluation Suite for Large Language Models},
+        author={Wei Qi Leong
+            and Jian Gang Ngui
+            and Yosephine Susanto
+            and Hamsawardhini Rengarajan
+            and Kengatharaiyer Sarveswaran
+            and William Chandra Tjhi
+        },
+        year={2023},
+        eprint={2309.06085},
+        archivePrefix={arXiv},
+        primaryClass={cs.CL}
+    }
+    """
+
+    name = "lindsea_pragmatics_presuppositions"
+    description = "LINDSEA presuppositions task"
+    tags = ["linguistic_diagnostic", "pragmatics", "presuppositions"]
+
+    def __init__(self, language: str, subset: str):
+        super().__init__()
+        self.language = language
+        self.subsets = [subset] if subset != "all" else ["single", "pair"]
+        self.language_to_prompt_components = {
+            "id": {
+                "text_noun": "Pernyataan",
+                "premise_noun": "Situasi",
+                "conclusion_noun": "Pernyataan",
+                "single_question": "Apakah pernyataan berikut ini {}?",
+                "single_instruction": "Jawablah dengan {} saja.",
+                "pair_question": "Berdasarkan situasi ini, apakah pernyataan berikut ini benar atau salah?",
+                "pair_instruction": "Jawablah dengan Benar atau Salah saja.",
+                "True": "Benar",
+                "False": "Salah",
+            },
+        }
+        if self.language not in self.language_to_prompt_components.keys():
+            raise Exception(
+                f"{self.language} not supported. Supported languages are {self.language_to_prompt_components.keys()}."
+            )
+        else:
+            self.prompt_components = self.language_to_prompt_components[self.language]
+
+    def download_dataset(self, output_path: str):
+        BASE_URL = "https://raw.githubusercontent.com/aisingapore/BHASA/main/lindsea/"
+        dataframes = []
+        for subset in self.subsets:
+            URL = f"{BASE_URL}{self.language}/pragmatics/pragmatic_reasoning_{subset}.jsonl"
+            file = f"pragmatic_reasoning_{subset}.jsonl"
+            target_path_file = os.path.join(output_path, file)
+            ensure_file_downloaded(source_url=URL, target_path=target_path_file)
+            data = pd.read_json(target_path_file, lines=True)
+            data["subset"] = subset
+            data = data[data["linguistic_phenomenon"] == "presuppositions"]
+            dataframes.append(data)
+        dataset = pd.concat(dataframes)
+        return dataset
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data = self.download_dataset(output_path)
+        outputs = []
+        for _, row in data.iterrows():
+            passage = None
+            references = []
+
+            if row["subset"] == "single":
+                question = self.prompt_components["single_question"]
+                text_noun = self.prompt_components["text_noun"]
+                instruction = self.prompt_components["single_instruction"]
+
+                passage = "{question}\n{text_noun}: {text}\n{instruction}".format(
+                    question=question.format(row["question_translated"]),
+                    text_noun=text_noun,
+                    text=row["text"],
+                    instruction=instruction.format(row["choices_translated"]),
+                )
+                # Split "True or False" into ["True", "or", "False"]
+                choices = row["choices"].split()
+                choices_translated = row["choices_translated"].split()
+                label2choice = {
+                    choices[0]: choices_translated[0],
+                    choices[2]: choices_translated[2],
+                }
+                references.append(
+                    Reference(Output(text=label2choice[row["label"].strip()]), tags=[CORRECT_TAG]),
+                )
+
+            elif row["subset"] == "pair":
+                premise_noun = self.prompt_components["premise_noun"]
+                question = self.prompt_components["pair_question"]
+                conclusion_noun = self.prompt_components["conclusion_noun"]
+                instruction = self.prompt_components["pair_instruction"]
+                label = self.prompt_components[str(row["label"])]
+
+                passage = (
+                    "{premise_noun}: {premise}\n{question}\n{conclusion_noun}: {conclusion}\n{instruction}".format(
+                        premise_noun=premise_noun,
+                        premise=row["text"],
+                        question=question,
+                        conclusion_noun=conclusion_noun,
+                        conclusion=row["conclusion"],
+                        instruction=instruction,
+                    )
+                )
+
+                references.append(
+                    Reference(Output(text=label), tags=[CORRECT_TAG]),
+                )
+
+            input = Input(text=str(passage))
+            instance = Instance(
+                input=input,
+                references=references,
+                split=TEST_SPLIT,
+            )
+            outputs.append(instance)
+        return outputs
+
+
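The single-sentence branch above maps English labels to their translated answer strings by splitting the two parallel choice strings and pairing positions 0 and 2, skipping the connective at position 1. A sketch using the Indonesian strings from the prompt components above:

    choices = "True or False".split()                # ['True', 'or', 'False']
    choices_translated = "Benar atau Salah".split()  # ['Benar', 'atau', 'Salah']

    label2choice = {
        choices[0]: choices_translated[0],  # 'True'  -> 'Benar'
        choices[2]: choices_translated[2],  # 'False' -> 'Salah'
    }
    print(label2choice["True"])  # Benar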
+# 2.2 Pragmatics: LINDSEA Scalar Implicatures
+class LINDSEAPragmaticsScalarImplicaturesScenario(Scenario):
+    """
+    The LINDSEA Scalar Implicatures dataset is a linguistic diagnostic scenario targeting pragmatics.
+    The data is manually handcrafted by linguists and native speakers and verified through multiple rounds
+    of quality control.
+
+    The scalar implicatures dataset involves two formats: single sentences and sentence pairs.
+    For single sentence questions, the system under test needs to determine if the sentence is true or false.
+    For pair sentence questions, the system under test needs to determine whether a conclusion can be drawn
+    from another sentence.
+
+    For the single format, the models are prompted using the following general format:
+
+    Is the following statement true or false?
+    Statement: <sentence>
+    Answer only with True or False.
+
+    For the pair format, the models are prompted using the following general format:
+
+    Situation: <premise>
+    Given this situation, is the following statement true or false?
+    Statement: <hypothesis>
+    Answer only with True or False.
+
+    Target completion:
+        <answer>
+
+    @misc{leong2023bhasa,
+        title={BHASA: A Holistic Southeast Asian Linguistic and Cultural Evaluation Suite for Large Language Models},
+        author={Wei Qi Leong
+            and Jian Gang Ngui
+            and Yosephine Susanto
+            and Hamsawardhini Rengarajan
+            and Kengatharaiyer Sarveswaran
+            and William Chandra Tjhi
+        },
+        year={2023},
+        eprint={2309.06085},
+        archivePrefix={arXiv},
+        primaryClass={cs.CL}
+    }
+    """
+
+    name = "lindsea_pragmatics_scalar_implicatures"
+    description = "LINDSEA scalar implicatures task"
+    tags = ["linguistic_diagnostic", "pragmatics", "scalar_implicatures"]
+
+    def __init__(self, language: str, subset: str):
+        super().__init__()
+        self.language = language
+        self.subsets = [subset] if subset != "all" else ["single", "pair"]
+        self.language_to_prompt_components = {
+            "id": {
+                "text_noun": "Pernyataan",
+                "premise_noun": "Situasi",
+                "conclusion_noun": "Pernyataan",
+                "single_question": "Apakah pernyataan berikut ini {}?",
+                "single_instruction": "Jawablah dengan {} saja.",
+                "pair_question": "Berdasarkan situasi ini, apakah pernyataan berikut ini benar atau salah?",
+                "pair_instruction": "Jawablah dengan Benar atau Salah saja.",
+                "True": "Benar",
+                "False": "Salah",
+            },
+        }
+        if self.language not in self.language_to_prompt_components.keys():
+            raise Exception(
+                f"{self.language} not supported. Supported languages are {self.language_to_prompt_components.keys()}."
+            )
+        else:
+            self.prompt_components = self.language_to_prompt_components[self.language]
+
+    def download_dataset(self, output_path: str):
+        BASE_URL = "https://raw.githubusercontent.com/aisingapore/BHASA/main/lindsea/"
+        dataframes = []
+        for subset in self.subsets:
+            URL = f"{BASE_URL}{self.language}/pragmatics/pragmatic_reasoning_{subset}.jsonl"
+            file = f"pragmatic_reasoning_{subset}.jsonl"
+            target_path_file = os.path.join(output_path, file)
+            ensure_file_downloaded(source_url=URL, target_path=target_path_file)
+            data = pd.read_json(target_path_file, lines=True)
+            data["subset"] = subset
+            data = data[data["linguistic_phenomenon"] == "scalar_implicatures"]
+            dataframes.append(data)
+        dataset = pd.concat(dataframes)
+        return dataset
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data = self.download_dataset(output_path)
+        outputs = []
+        for _, row in data.iterrows():
+            passage = None
+            references = []
+
+            if row["subset"] == "single":
+                question = self.prompt_components["single_question"]
+                text_noun = self.prompt_components["text_noun"]
+                instruction = self.prompt_components["single_instruction"]
+
+                passage = "{question}\n{text_noun}: {text}\n{instruction}".format(
+                    question=question.format(row["question_translated"]),
+                    text_noun=text_noun,
+                    text=row["text"],
+                    instruction=instruction.format(row["choices_translated"]),
+                )
+                # Split "True or False" into ["True", "or", "False"]
+                choices = row["choices"].split()
+                choices_translated = row["choices_translated"].split()
+                label2choice = {
+                    choices[0]: choices_translated[0],
+                    choices[2]: choices_translated[2],
+                }
+                references.append(
+                    Reference(Output(text=label2choice[row["label"].strip()]), tags=[CORRECT_TAG]),
+                )
+
+            elif row["subset"] == "pair":
+                premise_noun = self.prompt_components["premise_noun"]
+                question = self.prompt_components["pair_question"]
+                conclusion_noun = self.prompt_components["conclusion_noun"]
+                instruction = self.prompt_components["pair_instruction"]
+                label = self.prompt_components[str(row["label"])]
+
+                passage = (
+                    "{premise_noun}: {premise}\n{question}\n{conclusion_noun}: {conclusion}\n{instruction}".format(
+                        premise_noun=premise_noun,
+                        premise=row["text"],
+                        question=question,
+                        conclusion_noun=conclusion_noun,
+                        conclusion=row["conclusion"],
+                        instruction=instruction,
+                    )
+                )
+
+                references.append(
+                    Reference(Output(text=label), tags=[CORRECT_TAG]),
+                )
+
+            input = Input(text=str(passage))
+            instance = Instance(
+                input=input,
+                references=references,
+                split=TEST_SPLIT,
+            )
+            outputs.append(instance)
+        return outputs