crfm-helm 0.5.2__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (184)
  1. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/METADATA +29 -55
  2. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/RECORD +146 -134
  3. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
  5. helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
  6. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
  7. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
  8. helm/benchmark/adaptation/common_adapter_specs.py +2 -0
  9. helm/benchmark/annotation/anthropic_red_team_annotator.py +70 -0
  10. helm/benchmark/annotation/call_center_annotator.py +247 -0
  11. helm/benchmark/annotation/financebench_annotator.py +79 -0
  12. helm/benchmark/annotation/harm_bench_annotator.py +68 -0
  13. helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
  14. helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
  15. helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
  16. helm/benchmark/annotation/live_qa_annotator.py +32 -45
  17. helm/benchmark/annotation/medication_qa_annotator.py +31 -44
  18. helm/benchmark/annotation/model_as_judge.py +45 -0
  19. helm/benchmark/annotation/simple_safety_tests_annotator.py +64 -0
  20. helm/benchmark/annotation/xstest_annotator.py +110 -0
  21. helm/benchmark/metrics/annotation_metrics.py +108 -0
  22. helm/benchmark/metrics/bhasa_metrics.py +188 -0
  23. helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
  24. helm/benchmark/metrics/code_metrics_helper.py +11 -1
  25. helm/benchmark/metrics/safety_metrics.py +57 -0
  26. helm/benchmark/metrics/summac/model_summac.py +3 -3
  27. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
  28. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
  29. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  30. helm/benchmark/metrics/vision_language/image_utils.py +1 -1
  31. helm/benchmark/model_metadata_registry.py +3 -3
  32. helm/benchmark/presentation/test_run_entry.py +1 -0
  33. helm/benchmark/run.py +15 -0
  34. helm/benchmark/run_expander.py +56 -30
  35. helm/benchmark/run_specs/bhasa_run_specs.py +638 -0
  36. helm/benchmark/run_specs/call_center_run_specs.py +152 -0
  37. helm/benchmark/run_specs/decodingtrust_run_specs.py +8 -8
  38. helm/benchmark/run_specs/experimental_run_specs.py +52 -0
  39. helm/benchmark/run_specs/finance_run_specs.py +78 -1
  40. helm/benchmark/run_specs/safety_run_specs.py +154 -0
  41. helm/benchmark/run_specs/vlm_run_specs.py +92 -21
  42. helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
  43. helm/benchmark/scenarios/banking77_scenario.py +51 -0
  44. helm/benchmark/scenarios/bhasa_scenario.py +1798 -0
  45. helm/benchmark/scenarios/call_center_scenario.py +84 -0
  46. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
  47. helm/benchmark/scenarios/ewok_scenario.py +116 -0
  48. helm/benchmark/scenarios/fin_qa_scenario.py +2 -0
  49. helm/benchmark/scenarios/financebench_scenario.py +53 -0
  50. helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
  51. helm/benchmark/scenarios/scenario.py +1 -1
  52. helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
  53. helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
  54. helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
  55. helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
  56. helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
  57. helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
  58. helm/benchmark/scenarios/test_math_scenario.py +2 -8
  59. helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
  60. helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
  61. helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
  62. helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
  63. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
  64. helm/benchmark/scenarios/vision_language/bingo_scenario.py +2 -2
  65. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
  66. helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
  67. helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
  68. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
  69. helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
  70. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
  71. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
  72. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -3
  73. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -1
  74. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
  75. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
  76. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
  77. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +41 -12
  78. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
  79. helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
  80. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
  81. helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
  82. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
  83. helm/benchmark/scenarios/vision_language/pairs_scenario.py +1 -1
  84. helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
  85. helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
  86. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
  87. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +2 -2
  88. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +6 -3
  89. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
  90. helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
  91. helm/benchmark/scenarios/xstest_scenario.py +35 -0
  92. helm/benchmark/server.py +1 -6
  93. helm/benchmark/static/schema_air_bench.yaml +750 -750
  94. helm/benchmark/static/schema_bhasa.yaml +709 -0
  95. helm/benchmark/static/schema_call_center.yaml +232 -0
  96. helm/benchmark/static/schema_cleva.yaml +768 -0
  97. helm/benchmark/static/schema_decodingtrust.yaml +444 -0
  98. helm/benchmark/static/schema_ewok.yaml +367 -0
  99. helm/benchmark/static/schema_finance.yaml +55 -9
  100. helm/benchmark/static/{schema_image2structure.yaml → schema_image2struct.yaml} +231 -90
  101. helm/benchmark/static/schema_safety.yaml +247 -0
  102. helm/benchmark/static/schema_tables.yaml +124 -7
  103. helm/benchmark/static/schema_thai.yaml +21 -0
  104. helm/benchmark/static/schema_vhelm.yaml +96 -91
  105. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  106. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  107. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  108. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  109. helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
  110. helm/benchmark/static_build/assets/index-58f97dcd.js +10 -0
  111. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  112. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  113. helm/benchmark/static_build/index.html +2 -2
  114. helm/benchmark/window_services/test_openai_window_service.py +8 -8
  115. helm/clients/ai21_client.py +71 -1
  116. helm/clients/anthropic_client.py +7 -19
  117. helm/clients/huggingface_client.py +38 -37
  118. helm/clients/nvidia_nim_client.py +35 -0
  119. helm/clients/openai_client.py +2 -3
  120. helm/clients/palmyra_client.py +25 -0
  121. helm/clients/perspective_api_client.py +11 -6
  122. helm/clients/test_client.py +4 -6
  123. helm/clients/vision_language/open_flamingo_client.py +1 -2
  124. helm/clients/vision_language/palmyra_vision_client.py +28 -13
  125. helm/common/images_utils.py +6 -0
  126. helm/common/mongo_key_value_store.py +2 -1
  127. helm/common/request.py +16 -0
  128. helm/config/model_deployments.yaml +315 -332
  129. helm/config/model_metadata.yaml +384 -110
  130. helm/config/tokenizer_configs.yaml +116 -11
  131. helm/proxy/example_queries.py +14 -21
  132. helm/proxy/services/server_service.py +1 -2
  133. helm/proxy/token_counters/test_auto_token_counter.py +2 -2
  134. helm/tokenizers/ai21_tokenizer.py +51 -59
  135. helm/tokenizers/cohere_tokenizer.py +0 -75
  136. helm/tokenizers/huggingface_tokenizer.py +0 -1
  137. helm/tokenizers/test_ai21_tokenizer.py +48 -0
  138. helm/benchmark/static/benchmarking.css +0 -156
  139. helm/benchmark/static/benchmarking.js +0 -1705
  140. helm/benchmark/static/config.js +0 -3
  141. helm/benchmark/static/general.js +0 -122
  142. helm/benchmark/static/images/crfm-logo.png +0 -0
  143. helm/benchmark/static/images/helm-logo-simple.png +0 -0
  144. helm/benchmark/static/images/helm-logo.png +0 -0
  145. helm/benchmark/static/images/language-model-helm.png +0 -0
  146. helm/benchmark/static/images/organizations/ai21.png +0 -0
  147. helm/benchmark/static/images/organizations/anthropic.png +0 -0
  148. helm/benchmark/static/images/organizations/bigscience.png +0 -0
  149. helm/benchmark/static/images/organizations/cohere.png +0 -0
  150. helm/benchmark/static/images/organizations/eleutherai.png +0 -0
  151. helm/benchmark/static/images/organizations/google.png +0 -0
  152. helm/benchmark/static/images/organizations/meta.png +0 -0
  153. helm/benchmark/static/images/organizations/microsoft.png +0 -0
  154. helm/benchmark/static/images/organizations/nvidia.png +0 -0
  155. helm/benchmark/static/images/organizations/openai.png +0 -0
  156. helm/benchmark/static/images/organizations/together.png +0 -0
  157. helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
  158. helm/benchmark/static/images/organizations/yandex.png +0 -0
  159. helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
  160. helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
  161. helm/benchmark/static/index.html +0 -68
  162. helm/benchmark/static/info-icon.png +0 -0
  163. helm/benchmark/static/json-urls.js +0 -69
  164. helm/benchmark/static/plot-captions.js +0 -27
  165. helm/benchmark/static/utils.js +0 -285
  166. helm/benchmark/static_build/assets/index-30dbceba.js +0 -10
  167. helm/benchmark/static_build/assets/index-66b02d40.css +0 -1
  168. helm/benchmark/window_services/ai21_window_service.py +0 -247
  169. helm/benchmark/window_services/cohere_window_service.py +0 -101
  170. helm/benchmark/window_services/test_ai21_window_service.py +0 -163
  171. helm/benchmark/window_services/test_cohere_window_service.py +0 -75
  172. helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
  173. helm/benchmark/window_services/test_ice_window_service.py +0 -327
  174. helm/tokenizers/ice_tokenizer.py +0 -30
  175. helm/tokenizers/test_ice_tokenizer.py +0 -57
  176. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/LICENSE +0 -0
  177. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/entry_points.txt +0 -0
  178. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/top_level.txt +0 -0
  179. /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
  180. /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
  181. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/__init__.py +0 -0
  182. /helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +0 -0
  183. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/__init__.py +0 -0
  184. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0
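The largest single addition is the new BHASA scenario suite, helm/benchmark/scenarios/bhasa_scenario.py (+1798 lines), whose full diff follows below. Each scenario class downloads a public dataset and converts it into HELM Instance objects via get_instances. As a minimal sketch (not part of the diff) of how one of the new classes might be exercised directly, assuming crfm-helm 0.5.3 is installed together with its datasets and pandas dependencies:

    # Minimal sketch (not from this diff): drive one of the new BHASA scenarios directly.
    # Assumes crfm-helm 0.5.3 with its `datasets` and `pandas` dependencies installed;
    # "./bhasa_tydiqa" is an arbitrary scratch directory for downloaded data.
    from helm.benchmark.scenarios.bhasa_scenario import TyDiQAScenario

    scenario = TyDiQAScenario()
    instances = scenario.get_instances(output_path="./bhasa_tydiqa")
    print(len(instances), instances[0].split, len(instances[0].references))

In normal use these scenarios would be invoked through the new run specs in helm/benchmark/run_specs/bhasa_run_specs.py (entry 35 above) rather than called directly.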
@@ -0,0 +1,1798 @@
+ import datasets
+ import os
+ import random
+ from typing import List, Dict
+
+ import pandas as pd
+
+ from helm.benchmark.scenarios.scenario import (
+     Input,
+     Instance,
+     Output,
+     PassageQuestionInput,
+     Reference,
+     Scenario,
+     CORRECT_TAG,
+     TEST_SPLIT,
+     TRAIN_SPLIT,
+ )
+ from helm.common.general import ensure_file_downloaded
+ from helm.common.hierarchical_logger import hlog
+
+ # BHASA Scenarios
+ # A. Natural Language Understanding
+ # B. Natural Language Generation
+ # C. Natural Language Reasoning
+ # D. Linguistic Diagnostics
+
+ # A. Natural Language Understanding
+ # 1. Question Answering
+ # 2. Sentiment Analysis
+ # 3. Toxicity Detection/Classification
+
+
+ # 1. Question Answering
+ # 1.1 Indonesian: TyDiQA
+ class TyDiQAScenario(Scenario):
+     """
+     TyDiQA is an open-book question answering scenario for 11 typologically-diverse languages.
+     The questions are written by people who want to know the answer, but do not know the answer yet,
+     and the data is collected directly in each language without the use of translation.
+
+     This scenario only uses the Indonesian subset of the data, and uses the Gold Passage (GoldP) task,
+     which requires the tested system to extract a span from the given passage to answer a given question.
+     There are no unanswerable questions.
+
+     The models are prompted using the following format:
+
+         Anda akan diberikan sebuah paragraf dan sebuah pertanyaan. Jawablah pertanyaannya dengan mengekstrak jawaban
+         dari paragraf tersebut.
+
+         Paragraf: <text>
+         Pertanyaan: <question>
+         Jawaban: <answer>
+
+         ...
+
+         Paragraf: <text>
+         Pertanyaan: <question>
+         Jawaban:
+
+
+     Target completion:
+         <answer>
+
+     @article{clark-etal-2020-tydi,
+         title = "{T}y{D}i {QA}: A Benchmark for Information-Seeking Question Answering in Typologically
+                  Diverse Languages",
+         author = "Clark, Jonathan H. and
+                   Choi, Eunsol and
+                   Collins, Michael and
+                   Garrette, Dan and
+                   Kwiatkowski, Tom and
+                   Nikolaev, Vitaly and
+                   Palomaki, Jennimaria",
+         editor = "Johnson, Mark and
+                   Roark, Brian and
+                   Nenkova, Ani",
+         journal = "Transactions of the Association for Computational Linguistics",
+         volume = "8",
+         year = "2020",
+         address = "Cambridge, MA",
+         publisher = "MIT Press",
+         url = "https://aclanthology.org/2020.tacl-1.30",
+         doi = "10.1162/tacl_a_00317",
+         pages = "454--470",
+     }
+     """
+
+     name = "tydiqa"
+     description = "Indonesian Open-book Question Answering task"
+     tags = ["question_answering"]
+
+     def __init__(self):
+         super().__init__()
+         self.splits = {"train": TRAIN_SPLIT, "validation": TEST_SPLIT}
+
+     def get_instances(self, output_path) -> List[Instance]:
+         dataset = datasets.load_dataset("khalidalt/tydiqa-goldp", "indonesian")
+
+         outputs = []
+         for split in self.splits.keys():
+             df = dataset[split].to_pandas()
+
+             if split == "train":
+                 # Select only bottom 20th percentile by length for in-context examples as examples are very long
+                 data = df[df["passage_text"].apply(len) < df["passage_text"].apply(len).quantile(0.2)]
+             else:
+                 data = df
+
+             for _, row in data.iterrows():
+                 passage = row["passage_text"].strip()
+                 question = row["question_text"].strip()
+                 input = PassageQuestionInput(
+                     passage=passage,
+                     question=question,
+                     passage_prefix="Paragraf: ",
+                     question_prefix="Pertanyaan: ",
+                 )
+                 references = []
+                 for answer in row["answers"]["text"]:
+                     output = Output(text=answer.strip())
+                     references.append(Reference(output, tags=[CORRECT_TAG]))
+                 instance = Instance(input=input, references=references, split=self.splits[split])
+                 outputs.append(instance)
+         return outputs
+
+
+ # 1.2 Vietnamese & Thai: XQuAD
+ class XQuADScenario(Scenario):
+     """
+     XQuAD is an open-book question answering scenario that is parallel across 10 languages.
+     The dataset consists of a subset of 240 paragraphs and 1190 question-answer pairs from the
+     development set of SQuAD v1.1 (Rajpurkar et al., 2016) together with their professional translations.
+
+     This scenario only uses the Vietnamese and Thai subsets of the data and there are no
+     unanswerable questions.
+
+     The models are prompted using the following general format:
+
+         You will be given a paragraph and a question. Answer the question by extracting the answer from the paragraph.
+
+         Paragraph: <text>
+         Question: <question>
+         Answer: <answer>
+
+         ...
+
+         Paragraph: <text>
+         Question: <question>
+         Answer:
+
+     Target completion:
+         <answer>
+
+     @article{Artetxe:etal:2019,
+         author = {Mikel Artetxe and Sebastian Ruder and Dani Yogatama},
+         title = {On the cross-lingual transferability of monolingual representations},
+         journal = {CoRR},
+         volume = {abs/1910.11856},
+         year = {2019},
+         archivePrefix = {arXiv},
+         eprint = {1910.11856}
+     }
+     """
+
+     name = "xquad"
+     description = "Vietnamese and Thai Open-book Question Answering task"
+     tags = ["question_answering"]
+
+     def __init__(self, language: str):
+         super().__init__()
+         self.language = language
+         self.splits = {"train": TRAIN_SPLIT, "test": TEST_SPLIT}
+         self.map = {
+             "th": {
+                 "passage_prefix": "ข้อความ: ",
+                 "question_prefix": "คำถาม: ",
+                 "random_state": 4520,
+             },
+             "vi": {
+                 "passage_prefix": "Đoạn văn: ",
+                 "question_prefix": "Câu hỏi: ",
+                 "random_state": 4502,
+             },
+         }
+
+     def get_instances(self, output_path) -> List[Instance]:
+         dataset = datasets.load_dataset("xquad", f"xquad.{self.language}", split="validation")
+         df = dataset.to_pandas()
+
+         # Sample 1000 examples for test
+         df_test = df.sample(n=1000, random_state=self.map[self.language]["random_state"])
+
+         # In-context examples to be drawn from remaining examples (since there is no train data)
+         df_train = df[~df.index.isin(df_test.index)]
+
+         # Select only bottom 20th percentile by length for in-context examples as examples are very long
+         df_train = df_train[df_train["context"].apply(len) < df_train["context"].apply(len).quantile(0.2)]
+         dataset = {
+             "train": df_train,
+             "test": df_test,
+         }
+
+         outputs = []
+         for split in self.splits.keys():
+             data = dataset[split]
+             for _, row in data.iterrows():
+                 passage = row["context"].strip()
+                 question = row["question"].strip()
+                 input = PassageQuestionInput(
+                     passage=passage,
+                     question=question,
+                     passage_prefix=str(self.map[self.language]["passage_prefix"]),
+                     question_prefix=str(self.map[self.language]["question_prefix"]),
+                 )
+                 references = []
+                 for answer in row["answers"]["text"]:
+                     output = Output(text=answer.strip())
+                     references.append(Reference(output, tags=[CORRECT_TAG]))
+                 instance = Instance(input=input, references=references, split=self.splits[split])
+                 outputs.append(instance)
+         return outputs
+
+
+ # 1.3 Tamil: IndicQA
+ class IndicQAScenario(Scenario):
+     """
+     IndicQA is an open-book question answering scenario for 11 Indic languages.
+     Answers to questions are to be extracted from the text provided. The data is taken from
+     Wikipedia articles across various domains and questions and answers were manually created
+     by native speakers.
+
+     This scenario only uses the Tamil subset of the data and unanswerable questions
+     are removed from the dataset in order to be consistent with the question answering
+     scenarios for Indonesian, Vietnamese and Thai.
+
+     The models are prompted using the following format:
+
+         உங்களுக்கு ஒரு பத்தியும் ஒரு கேள்வியும் தரப்படும். தரப்பட்ட பத்தியிலிருந்து கேள்விக்கான பதிலைக் கண்டறியவும்.
+
+         பத்தி: <text>
+         கேள்வி: <question>
+         பதில்: <answer>
+
+         ...
+
+         பத்தி: <text>
+         கேள்வி: <question>
+         பதில்:
+
+     Target completion:
+         <answer>
+
+     @inproceedings{doddapaneni-etal-2023-towards,
+         title = "Towards Leaving No {I}ndic Language Behind: Building Monolingual Corpora, Benchmark and Models for
+                  {I}ndic Languages",
+         author = "Doddapaneni, Sumanth and
+                   Aralikatte, Rahul and
+                   Ramesh, Gowtham and
+                   Goyal, Shreya and
+                   Khapra, Mitesh M. and
+                   Kunchukuttan, Anoop and
+                   Kumar, Pratyush",
+         editor = "Rogers, Anna and
+                   Boyd-Graber, Jordan and
+                   Okazaki, Naoaki",
+         booktitle = "Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1:
+                      Long Papers)",
+         month = jul,
+         year = "2023",
+         address = "Toronto, Canada",
+         publisher = "Association for Computational Linguistics",
+         url = "https://aclanthology.org/2023.acl-long.693",
+         doi = "10.18653/v1/2023.acl-long.693",
+         pages = "12402--12426",
+     }
+     """
+
+     name = "indicqa"
+     description = "Tamil Open-book Question Answering task"
+     tags = ["question_answering"]
+
+     def __init__(self):
+         super().__init__()
+         self.splits = {"train": TRAIN_SPLIT, "test": TEST_SPLIT}
+
+     def get_instances(self, output_path) -> List[Instance]:
+         dataset = datasets.load_dataset(
+             "ai4bharat/IndicQA",
+             "indicqa.ta",
+             split="test",
+             revision="78ee8d58e880c72f324e176c989dfefa55427af4",
+             trust_remote_code=True,
+         )
+         df = dataset.to_pandas()
+
+         # Remove unanswerable questions (answer is an empty string)
+         df = df[df["answers"].apply(lambda x: len(x["text"][0].strip()) > 0)]
+
+         # Sample 1000 examples for test
+         df_test = df.sample(n=1000, random_state=7900)
+
+         # In-context examples to be drawn from remaining examples (since there is no train/dev data)
+         df_train = df[~df.index.isin(df_test.index)]
+
+         # Select only bottom 20th percentile by length for in-context examples as examples are very long
+         df_train = df_train[df_train["context"].apply(len) < df_train["context"].apply(len).quantile(0.2)]
+         dataset = {
+             "train": df_train,
+             "test": df_test,
+         }
+
+         outputs = []
+         for split in self.splits.keys():
+             data = dataset[split]
+             for _, row in data.iterrows():
+                 passage = row["context"].strip()
+                 question = row["question"].strip()
+                 input = PassageQuestionInput(
+                     passage=passage,
+                     question=question,
+                     passage_prefix="பத்தி: ",
+                     question_prefix="கேள்வி: ",
+                 )
+                 references = []
+                 for answer in row["answers"]["text"]:
+                     output = Output(text=answer.strip())
+                     references.append(Reference(output, tags=[CORRECT_TAG]))
+                 instance = Instance(input=input, references=references, split=self.splits[split])
+                 outputs.append(instance)
+         return outputs
+
+
+ # 2. Sentiment Analysis
+ # 2.1 Indonesian: NusaX Sentiment
+ class NusaXScenario(Scenario):
+     """
+     NusaX is a sentiment analysis scenario for 11 Indonesian languages.
+     The data is derived from a subset of SmSA (Purwarianti and Crisdayanti, 2019) and manually translated
+     from Indonesian to 10 other local languages, such as Acehnese and Toba Batak.
+     It consists of comments and reviews from various online platforms.
+
+     Only the Indonesian subset of the data is used for this scenario, and the labels are
+     positive, negative or neutral.
+
+     The models are prompted using the following format:
+
+         Apa sentimen dari kalimat berikut ini?
+         Jawablah dengan satu kata saja:
+         - Positif
+         - Negatif
+         - Netral
+
+         Kalimat: <text>
+         Jawaban: <sentiment>
+
+         ...
+
+         Kalimat: <text>
+         Jawaban:
+
+     Target completion:
+         <sentiment>
+
+     @inproceedings{winata-etal-2023-nusax,
+         title = "{N}usa{X}: Multilingual Parallel Sentiment Dataset for 10 {I}ndonesian Local Languages",
+         author = "Winata, Genta Indra and
+                   Aji, Alham Fikri and
+                   Cahyawijaya, Samuel and
+                   Mahendra, Rahmad and
+                   Koto, Fajri and
+                   Romadhony, Ade and
+                   Kurniawan, Kemal and
+                   Moeljadi, David and
+                   Prasojo, Radityo Eko and
+                   Fung, Pascale and
+                   Baldwin, Timothy and
+                   Lau, Jey Han and
+                   Sennrich, Rico and
+                   Ruder, Sebastian",
+         editor = "Vlachos, Andreas and
+                   Augenstein, Isabelle",
+         booktitle = "Proceedings of the 17th Conference of the European Chapter of the Association for
+                      Computational Linguistics",
+         month = may,
+         year = "2023",
+         address = "Dubrovnik, Croatia",
+         publisher = "Association for Computational Linguistics",
+         url = "https://aclanthology.org/2023.eacl-main.57",
+         doi = "10.18653/v1/2023.eacl-main.57",
+         pages = "815--834",
+     }
+     """
+
+     name = "nusax"
+     description = "Indonesian NusaX-Senti Sentiment Analysis dataset"
+     tags = ["sentiment_analysis"]
+
+     def __init__(self):
+         super().__init__()
+         self.splits = {"train": TRAIN_SPLIT, "test": TEST_SPLIT}
+         self.sentiment2label = {
+             "positive": "Positif",
+             "negative": "Negatif",
+             "neutral": "Netral",
+         }
+
+     def download_dataset(self, output_path: str):
+         URLS = {
+             "test": "https://raw.githubusercontent.com/IndoNLP/nusax/main/datasets/sentiment/indonesian/test.csv",
+             "train": "https://raw.githubusercontent.com/IndoNLP/nusax/main/datasets/sentiment/indonesian/train.csv",
+         }
+
+         dataset: Dict[str, pd.DataFrame] = {}
+         for split in self.splits.keys():
+             target_path_file = os.path.join(output_path, split)
+             ensure_file_downloaded(source_url=URLS[split], target_path=target_path_file)
+             data = pd.read_csv(target_path_file)
+             dataset[split] = data
+         return dataset
+
+     def get_instances(self, output_path) -> List[Instance]:
+         dataset = self.download_dataset(output_path)
+         outputs = []
+         for split in self.splits.keys():
+             data = dataset[split]
+             for _, row in data.iterrows():
+                 input = Input(row["text"].strip())
+                 output = Output(text=self.sentiment2label[row["label"]])
+                 references = [
+                     Reference(output, tags=[CORRECT_TAG]),
+                 ]
+                 instance = Instance(input=input, references=references, split=self.splits[split])
+                 outputs.append(instance)
+         return outputs
+
+
+ # 2.2 Vietnamese: UIT-VSFC
+ class UITVSFCScenario(Scenario):
+     """
+     UIT-VSFC is a Vietnamese sentiment analysis scenario. The data consists of student feedback obtained from
+     end-of-semester surveys at a Vietnamese university. Feedback is labeled as one of three sentiment
+     polarities: positive, negative or neutral.
+
+     The models are prompted using the following format:
+
+         Sắc thái của câu sau đây là gì?
+         Trả lời với một từ duy nhất:
+         - Tích cực
+         - Tiêu cực
+         - Trung lập
+
+         Câu văn: <text>
+         Câu trả lời: <sentiment>
+
+         ...
+
+         Câu văn: <text>
+         Câu trả lời:
+
+     Target completion:
+         <sentiment>
+
+     @inproceedings{van2018uit,
+         title={UIT-VSFC: Vietnamese students’ feedback corpus for sentiment analysis},
+         author={Van Nguyen, Kiet and Nguyen, Vu Duc and Nguyen, Phu XV and Truong, Tham TH and Nguyen, Ngan Luu-Thuy},
+         booktitle={2018 10th international conference on knowledge and systems engineering (KSE)},
+         pages={19--24},
+         year={2018},
+         organization={IEEE},
+         url={https://ieeexplore.ieee.org/document/8573337},
+     }
+     """
+
+     name = "uitvsfc"
+     description = "Vietnamese Students' Feedback Corpus sentiment analysis task"
+     tags = ["sentiment_analysis"]
+
+     def __init__(self):
+         super().__init__()
+         self.splits = {"train": TRAIN_SPLIT, "test": TEST_SPLIT}
+         self.id2label = {
+             0: "Tiêu cực",
+             1: "Trung lập",
+             2: "Tích cực",
+         }
+
+     def download_dataset(self, output_path: str):
+         URLS = {
+             "train": {
+                 "sentences": "https://drive.google.com/uc?id=1nzak5OkrheRV1ltOGCXkT671bmjODLhP&export=download",
+                 "sentiments": "https://drive.google.com/uc?id=1ye-gOZIBqXdKOoi_YxvpT6FeRNmViPPv&export=download",
+             },
+             "test": {
+                 "sentences": "https://drive.google.com/uc?id=1aNMOeZZbNwSRkjyCWAGtNCMa3YrshR-n&export=download",
+                 "sentiments": "https://drive.google.com/uc?id=1vkQS5gI0is4ACU58-AbWusnemw7KZNfO&export=download",
+             },
+         }
+
+         dataset: Dict[str, pd.DataFrame] = {}
+         for split in list(URLS.keys()):
+             file_lines: Dict[str, List[str]] = {}
+             for file in list(URLS[split].keys()):
+                 file_lines[file] = []
+                 target_path_file = os.path.join(output_path, split, file)
+                 ensure_file_downloaded(source_url=URLS[split][file], target_path=target_path_file)
+                 with open(target_path_file, "r") as f:
+                     lines = f.readlines()
+                     for line in lines:
+                         file_lines[file].append(str(line).strip())
+             df = pd.DataFrame({"text": file_lines["sentences"], "label": file_lines["sentiments"]})
+             if split == "test":
+                 dataset[split] = df.groupby("label", group_keys=False).apply(
+                     lambda x: x.sample(frac=1000 / len(df), random_state=4156)
+                 )
+             else:
+                 dataset[split] = df
+         return dataset
+
+     def get_instances(self, output_path) -> List[Instance]:
+         dataset = self.download_dataset(output_path)
+         outputs = []
+         for split in self.splits.keys():
+             data = dataset[split]
+             for _, row in data.iterrows():
+                 input = Input(row["text"])
+                 output = Output(text=self.id2label[int(row["label"])])
+                 references = [
+                     Reference(output, tags=[CORRECT_TAG]),
+                 ]
+                 instance = Instance(input=input, references=references, split=self.splits[split])
+                 outputs.append(instance)
+         return outputs
+
+
+ # 2.3 Thai: Wisesight Sentiment
+ class WisesightScenario(Scenario):
+     """
+     Wisesight Sentiment is a Thai sentiment analysis scenario. The data consists of social media messages
+     regarding consumer products and services.
+
+     The dataset originally included the label "question" for instances that were questions. These instances
+     made up only a small subset of the data and were dropped in order to make the task more consistent
+     with those of other languages. Labels are therefore only positive, negative or neutral.
+
+     The models are prompted using the following format:
+
+         อารมณ์ความรู้สึกของข้อความต่อไปนี้เป็นอย่างไร?
+         กรุณาตอบโดยใช้คำเดียวเท่านั้น:
+         - แง่บวก
+         - แง่ลบ
+         - เฉยๆ
+
+         ข้อความ: <text>
+         คำตอบ: <sentiment>
+
+         ...
+
+         ข้อความ: <text>
+         คำตอบ:
+
+     Target completion:
+         <sentiment>
+
+     @software{bact_2019_3457447,
+         author = {Suriyawongkul, Arthit and
+                   Chuangsuwanich, Ekapol and
+                   Chormai, Pattarawat and
+                   Polpanumas, Charin},
+         title = {PyThaiNLP/wisesight-sentiment: First release},
+         month = sep,
+         year = 2019,
+         publisher = {Zenodo},
+         version = {v1.0},
+         doi = {10.5281/zenodo.3457447},
+         url = {https://doi.org/10.5281/zenodo.3457447}
+     }
+     """
+
+     name = "wisesight"
+     description = "Wisesight Sentiment Thai sentiment analysis task"
+     tags = ["sentiment_analysis"]
+
+     def __init__(self):
+         super().__init__()
+         self.splits = {"train": TRAIN_SPLIT, "test": TEST_SPLIT}
+         self.sentiment2label = {
+             "pos": "แง่บวก",
+             "neg": "แง่ลบ",
+             "neu": "เฉยๆ",
+         }
+
+     def download_dataset(self, output_path: str):
+         URL = "https://github.com/PyThaiNLP/wisesight-sentiment/raw/master/huggingface/data.zip"
+         data_path = os.path.join(output_path, "data")
+         ensure_file_downloaded(source_url=URL, target_path=data_path, unpack=True)
+
+         dataset: Dict[str, pd.DataFrame] = {}
+         for split in self.splits.keys():
+             target_path_file = os.path.join(data_path, "data", f"{split}.jsonl")
+             df = pd.read_json(target_path_file, lines=True)
+             df = df[df["category"] != "q"]  # Drop instances with the "question" label
+             if split == "test":
+                 dataset[split] = df.groupby("category", group_keys=False).apply(
+                     lambda x: x.sample(frac=1000 / len(df), random_state=4183)
+                 )
+             else:
+                 dataset[split] = df
+         return dataset
+
+     def get_instances(self, output_path) -> List[Instance]:
+         dataset = self.download_dataset(output_path)
+         outputs = []
+         for split in self.splits.keys():
+             data = dataset[split]
+             for _, row in data.iterrows():
+                 input = Input(row["texts"].strip())
+                 output = Output(text=self.sentiment2label[row["category"]])
+                 references = [
+                     Reference(output, tags=[CORRECT_TAG]),
+                 ]
+                 instance = Instance(input=input, references=references, split=self.splits[split])
+                 outputs.append(instance)
+         return outputs
625
+
626
+
627
+ # 2.4 Tamil: IndicSentiment
628
+ class IndicSentimentScenario(Scenario):
629
+ """
630
+ IndicSentiment is a sentiment analysis scenario for 10 Indic languages. The data consists of
631
+ product reviews written in English that were then translated by native speakers of the
632
+ respective languages, resulting in a parallel dataset across the 10 languages.
633
+
634
+ Only the Tamil subset of the dataset is used for this scenario. Labels are positive or negative.
635
+
636
+ The models are prompted using the following format:
637
+
638
+ பின்வரும் வாக்கியத்தில் வெளிப்படுத்தப்படும் உணர்வு எது?
639
+ ஒரு சொல்லில் மட்டும் பதிலளிக்கவும்:
640
+ - நேர்மறை
641
+ - எதிர்மறை
642
+
643
+ வாக்கியம்: <text>
644
+ பதில்:
645
+
646
+ ...
647
+
648
+ வாக்கியம்: <text>
649
+ பதில்: <answer>
650
+
651
+ Target completion:
652
+ <sentiment> (<sentiment>:positive or negative)
653
+
654
+ @inproceedings{doddapaneni-etal-2023-towards,
655
+ title = "Towards Leaving No {I}ndic Language Behind: Building Monolingual Corpora, Benchmark and Models for
656
+ {I}ndic Languages",
657
+ author = "Doddapaneni, Sumanth and
658
+ Aralikatte, Rahul and
659
+ Ramesh, Gowtham and
660
+ Goyal, Shreya and
661
+ Khapra, Mitesh M. and
662
+ Kunchukuttan, Anoop and
663
+ Kumar, Pratyush",
664
+ editor = "Rogers, Anna and
665
+ Boyd-Graber, Jordan and
666
+ Okazaki, Naoaki",
667
+ booktitle = "Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1:
668
+ Long Papers)",
669
+ month = jul,
670
+ year = "2023",
671
+ address = "Toronto, Canada",
672
+ publisher = "Association for Computational Linguistics",
673
+ url = "https://aclanthology.org/2023.acl-long.693",
674
+ doi = "10.18653/v1/2023.acl-long.693",
675
+ pages = "12402--12426",
676
+ }
677
+ """
678
+
679
+ name = "indicsentiment"
680
+ description = "IndicSentiment Tamil sentiment analysis task"
681
+ tags = ["sentiment_analysis"]
682
+
683
+ def __init__(self):
684
+ super().__init__()
685
+ self.splits = {"validation": TRAIN_SPLIT, "test": TEST_SPLIT}
686
+ self.sentiment2label = {
687
+ "Positive": "நேர்மறை",
688
+ "Negative": "எதிர்மறை",
689
+ }
690
+
691
+ def get_instances(self, output_path) -> List[Instance]:
692
+ dataset = datasets.load_dataset(
693
+ "ai4bharat/IndicSentiment",
694
+ "translation-ta",
695
+ revision="dc8f3f66886531c6897fedffca1e938a68fc5013",
696
+ trust_remote_code=True,
697
+ )
698
+
699
+ outputs = []
700
+ for split in self.splits.keys():
701
+ data = dataset[split].to_pandas()
702
+ # Current version on HuggingFace datasets has 2 instances without labels across all languages.
703
+ # Confirmed with first author that the labels for these instances should be Positive.
704
+ data["LABEL"] = data["LABEL"].fillna("Positive")
705
+ for _, row in data.iterrows():
706
+ input = Input(row["INDIC REVIEW"].strip())
707
+ output = Output(text=self.sentiment2label[row["LABEL"]])
708
+ references = [
709
+ Reference(output, tags=[CORRECT_TAG]),
710
+ ]
711
+ instance = Instance(input=input, references=references, split=self.splits[split])
712
+ outputs.append(instance)
713
+ return outputs
+
+
+ # 3. Toxicity Detection/Classification
+ # 3.1 Indonesian: Multi-Label Hate Speech Detection
+ class MLHSDScenario(Scenario):
+     """
+     Multi-Label Hate Speech and Abusive Language Detection (MLHSD) is an Indonesian toxicity
+     classification scenario. The data is obtained from Twitter and PII has been anonymized to
+     USER and URL.
+
+     The original dataset was used for a multi-label classification task, but it has been repurposed
+     as a multi-class classification task to be more aligned with the task for other languages.
+     The mapping is done as follows:
+     - Clean: No abusive language or hate speech labels
+     - Abusive: Only abusive language label but no hate speech labels
+     - Hate: As long as one hate speech label is present
+
+     The models are prompted using the following format:
+
+         Anda adalah pendeteksi ujaran kebencian. Definisi dari labelnya adalah sebagai berikut:
+         Bersih: Tidak ada ujaran kebencian.
+         Kasar: Ada ujaran kebencian dan kata-kata kasar, namun tidak menyerang pihak tertentu.
+         Benci: Ada ujaran kebencian atau serangan langsung terhadap pihak tertentu.
+         Berdasarkan definisi labelnya, klasifikasikan kalimat berikut ini dengan satu kata saja:
+         - Bersih
+         - Kasar
+         - Benci
+
+         Kalimat: <text>
+         Jawaban: <answer>
+
+         ...
+
+         Kalimat: <text>
+         Jawaban:
+
+     Target completion:
+         <answer>
+
+     @inproceedings{ibrohim-budi-2019-multi,
+         title = "Multi-label Hate Speech and Abusive Language Detection in {I}ndonesian {T}witter",
+         author = "Ibrohim, Muhammad Okky and
+                   Budi, Indra",
+         editor = "Roberts, Sarah T. and
+                   Tetreault, Joel and
+                   Prabhakaran, Vinodkumar and
+                   Waseem, Zeerak",
+         booktitle = "Proceedings of the Third Workshop on Abusive Language Online",
+         month = aug,
+         year = "2019",
+         address = "Florence, Italy",
+         publisher = "Association for Computational Linguistics",
+         url = "https://aclanthology.org/W19-3506",
+         doi = "10.18653/v1/W19-3506",
+         pages = "46--57",
+     }
+     """
+
+     name = "mlhsd"
+     description = (
+         "Multi-Label Hate Speech and Abusive Language Detection (MLHSD) Indonesian toxicity classification task"
+     )
+     tags = ["toxicity_detection"]
+
+     def __init__(self):
+         super().__init__()
+         self.splits = {"train": TRAIN_SPLIT, "test": TEST_SPLIT}
+
+     def download_dataset(self, output_path: str):
+         BASE_URL = "https://raw.githubusercontent.com/okkyibrohim/"
+         URL = f"{BASE_URL}id-multi-label-hate-speech-and-abusive-language-detection/master/re_dataset.csv"
+         target_path_file = os.path.join(output_path, "mlhsd")
+         ensure_file_downloaded(source_url=URL, target_path=target_path_file)
+         df = pd.read_csv(target_path_file, encoding="ISO-8859-1")
+
+         # Map multi-label task to multi-class task
+         df["label"] = df.apply(lambda x: self.get_label(x), axis=1)
+
+         df_test = df.groupby("label", group_keys=False).apply(
+             lambda x: x.sample(frac=1000 / len(df), random_state=7123)
+         )
+
+         # In-context examples to be drawn from remaining examples (since there is no train/dev data)
+         df_train = df[~df.index.isin(df_test.index)]
+         dataset = {
+             "train": df_train,
+             "test": df_test,
+         }
+         return dataset
+
+     def get_label(self, row) -> str:
+         if int(row["HS"]) == 1:
+             return "Benci"
+         elif int(row["Abusive"]) == 1:
+             return "Kasar"
+         else:
+             return "Bersih"
+
+     def get_instances(self, output_path) -> List[Instance]:
+         dataset = self.download_dataset(output_path)
+         outputs = []
+         for split in self.splits.keys():
+             data = dataset[split]
+             for _, row in data.iterrows():
+                 input = Input(row["Tweet"].strip())
+                 output = Output(text=row["label"])
+                 references = [
+                     Reference(output, tags=[CORRECT_TAG]),
+                 ]
+                 instance = Instance(input=input, references=references, split=self.splits[split])
+                 outputs.append(instance)
+         return outputs
+
+
+ # 3.2 Vietnamese: ViHSD
+ class ViHSDScenario(Scenario):
+     """
+     ViHSD is a Vietnamese toxicity classification scenario. The data is obtained from social media.
+     The labels are Clean, Offensive and Hate.
+
+     The models are prompted using the following format:
+
+         Bạn là máy phát hiện phát ngôn thù ghét. Các nhãn được định nghĩa như sau:
+         Sạch: Không quấy rối.
+         Công kích: Bao gồm quấy rối và thậm chí chửi thề, nhưng không tấn công bất kì đối tượng cụ thể nào.
+         Thù ghét: Trực tiếp quấy rối hay lăng mạ một đối tượng cụ thể.
+         Với các định nghĩa của nhãn, hãy phân loại câu dưới đây với một từ duy nhất:
+         - Sạch
+         - Công kích
+         - Thù ghét
+
+
+         Câu văn: <text>
+         Câu trả lời: <toxicity>
+
+         ...
+
+         Câu văn: <text>
+         Câu trả lời:
+
+     Target completion:
+         <toxicity>
+
+     @InProceedings{10.1007/978-3-030-79457-6_35,
+         author="Luu, Son T.
+                 and Nguyen, Kiet Van
+                 and Nguyen, Ngan Luu-Thuy",
+         editor="Fujita, Hamido
+                 and Selamat, Ali
+                 and Lin, Jerry Chun-Wei
+                 and Ali, Moonis",
+         title="A Large-Scale Dataset for Hate Speech Detection on Vietnamese Social Media Texts",
+         booktitle="Advances and Trends in Artificial Intelligence. Artificial Intelligence Practices",
+         year="2021",
+         publisher="Springer International Publishing",
+         address="Cham",
+         pages="415--426",
+         isbn="978-3-030-79457-6",
+         url="https://link.springer.com/chapter/10.1007/978-3-030-79457-6_35",
+     }
+     """
+
+     name = "vihsd"
+     description = "ViHSD Vietnamese toxicity classification task"
+     tags = ["toxicity_detection"]
+
+     def __init__(self):
+         super().__init__()
+         self.splits = {"train": TRAIN_SPLIT, "test": TEST_SPLIT}
+         self.id2label = {
+             0: "Sạch",
+             1: "Công kích",
+             2: "Thù ghét",
+         }
+
+     def download_dataset(self, output_path: str):
+         URL = "https://raw.githubusercontent.com/sonlam1102/vihsd/main/data/vihsd.zip"
+         data_path = os.path.join(output_path, "data")
+         ensure_file_downloaded(source_url=URL, target_path=data_path, unpack=True)
+
+         dataset: Dict[str, pd.DataFrame] = {}
+         for split in self.splits.keys():
+             target_path_file = os.path.join(data_path, "vihsd", f"{split}.csv")
+             df = pd.read_csv(target_path_file)
+             data = df.groupby("label_id", group_keys=False).apply(
+                 lambda x: x.sample(frac=1000 / len(df), random_state=4878)
+             )
+             dataset[split] = data
+         return dataset
+
+     def get_instances(self, output_path) -> List[Instance]:
+         dataset = self.download_dataset(output_path)
+         outputs = []
+         for split in self.splits.keys():
+             data = dataset[split]
+             for _, row in data.iterrows():
+                 input = Input(str(row["free_text"]).strip())
+                 output = Output(text=self.id2label[int(row["label_id"])])
+                 references = [
+                     Reference(output, tags=[CORRECT_TAG]),
+                 ]
+                 instance = Instance(input=input, references=references, split=self.splits[split])
+                 outputs.append(instance)
+         return outputs
+
+
+ # 3.3 Thai: Thai Toxicity Tweets
+ class ThaiToxicityTweetsScenario(Scenario):
+     """
+     Thai Toxicity Tweets is a Thai toxicity detection scenario. The data is obtained from Twitter.
+     Instances that had no labels or had "TWEET_NOT_FOUND" as the text were dropped from the dataset.
+     The labels are either Y (the text is toxic) or N (the text is clean).
+
+     The models are prompted using the following format:
+
+         คุณเป็นเครื่องตรวจจับข้อความที่เป็นพิษ
+         ข้อความเป็นพิษถ้ามันทำร้าย สร้างความเสียหาย หรือสื่อเจตนาเชิงลบถึงบุคคลหรือกลุ่มบุคคล
+         และคุณควรคำนึงถึงการประชดประชันด้วย
+         เมื่อได้รับข้อความ ให้ตอบด้วยตัวอักษร Y หากเป็นพิษ และ N หากไม่เป็นพิษ
+
+         ข้อความ: <text>
+         คำตอบ: <toxicity>
+
+         ...
+
+         ข้อความ: <text>
+         คำตอบ:
+
+     Target completion:
+         <toxicity>
+
+     @inproceedings{sirihattasak2018annotation,
+         title={Annotation and classification of toxicity for Thai Twitter},
+         author={Sirihattasak, Sugan and Komachi, Mamoru and Ishikawa, Hiroshi},
+         booktitle={TA-COS 2018: 2nd Workshop on Text Analytics for Cybersecurity and Online Safety},
+         pages={1},
+         year={2018},
+         url={http://www.lrec-conf.org/workshops/lrec2018/W32/pdf/1_W32.pdf},
+     }
+     """
+
+     name = "thaitoxicitytweets"
+     description = "Thai Toxicity Tweets toxicity detection task"
+     tags = ["toxicity_detection"]
+
+     def __init__(self):
+         super().__init__()
+         self.splits = {"train": TRAIN_SPLIT, "test": TEST_SPLIT}
+         self.id2label = {
+             0: "N",
+             1: "Y",
+         }
+
+     def get_instances(self, output_path) -> List[Instance]:
+         dataset = datasets.load_dataset(
+             "tmu-nlp/thai_toxicity_tweet",
+             split="train",
+             revision="aa021e41d0ee6dbee2975fbed620ec8c586bdaf6",
+             trust_remote_code=True,
+         )
+         df = dataset.to_pandas()
+
+         # Drop instances where there are no labels or text is "TWEET_NOT_FOUND"
+         df = df[df["tweet_text"].str.len() > 0]
+         df = df[df["tweet_text"] != "TWEET_NOT_FOUND"]
+
+         df_test = df.groupby("is_toxic", group_keys=False).apply(
+             lambda x: x.sample(frac=1000 / len(df), random_state=4156)
+         )
+
+         # In-context examples to be drawn from remaining examples (since there is no train/dev data)
+         df_train = df[~df.index.isin(df_test.index)]
+
+         dataset = {
+             "train": df_train,
+             "test": df_test,
+         }
+
+         outputs = []
+         for split in self.splits.keys():
+             data = dataset[split]
+             for _, row in data.iterrows():
+                 input = Input(row["tweet_text"].strip())
+                 output = Output(text=self.id2label[int(row["is_toxic"])])
+                 references = [
+                     Reference(output, tags=[CORRECT_TAG]),
+                 ]
+                 instance = Instance(input=input, references=references, split=self.splits[split])
+                 outputs.append(instance)
+         return outputs
+
+
+ # B. Natural Language Generation
+ # 1. Machine Translation
+
+
+ # 1. Machine Translation: FLoRes-200
+ class FloresScenario(Scenario):
+     """
+     FLoRes-200 is a machine translation scenario for 200+ languages. The data is obtained from English Wikimedia
+     projects (Wikivoyage, Wikijunior and Wikinews), and professionally translated across 200+ languages to obtain a
+     parallel dataset.
+
+     Only the English, Indonesian, Vietnamese, Thai and Tamil subsets are used in this scenario. Both directions
+     (in and out of English) for each Southeast Asian language are included in the scenario.
+
+     The models are prompted using the following general format:
+
+         Translate the following text into <language> language.
+
+         Text: <text>
+         Translation: <translation>
+
+         ...
+
+         Text: <text>
+         Translation:
+
+     Target completion:
+         <translation>
+
+     @article{nllb2022,
+         author = {NLLB Team, Marta R. Costa-jussà, James Cross, Onur Çelebi, Maha Elbayad, Kenneth Heafield,
+                   Kevin Heffernan, Elahe Kalbassi, Janice Lam, Daniel Licht, Jean Maillard, Anna Sun, Skyler Wang,
+                   Guillaume Wenzek, Al Youngblood, Bapi Akula, Loic Barrault, Gabriel Mejia Gonzalez, Prangthip Hansanti,
+                   John Hoffman, Semarley Jarrett, Kaushik Ram Sadagopan, Dirk Rowe, Shannon Spruit, Chau Tran,
+                   Pierre Andrews, Necip Fazil Ayan, Shruti Bhosale, Sergey Edunov, Angela Fan, Cynthia Gao,
+                   Vedanuj Goswami, Francisco Guzmán, Philipp Koehn, Alexandre Mourachko, Christophe Ropers,
+                   Safiyyah Saleem, Holger Schwenk, Jeff Wang
+         },
+         title = {No Language Left Behind: Scaling Human-Centered Machine Translation},
+         year = {2022},
+         url = {https://research.facebook.com/publications/no-language-left-behind/},
+     }
+
+     """
+
+     name = "flores"
+     description = "FLoRes-200 machine translation task"
+     tags = ["machine_translation"]
+
+     def __init__(self, pair: str):
+         super().__init__()
+         self.pair = pair
+         self.source = pair.split("_")[0]
+         self.target = pair.split("_")[1]
+
+         self.splits = {"dev": TRAIN_SPLIT, "devtest": TEST_SPLIT}
+
+         self.languages = {
+             "en": "eng_Latn",
+             "id": "ind_Latn",
+             "vi": "vie_Latn",
+             "th": "tha_Thai",
+             "ta": "tam_Taml",
+         }
+
+     def get_instances(self, output_path) -> List[Instance]:
+         source_dataset = datasets.load_dataset(
+             "facebook/flores",
+             self.languages[self.source],
+             revision="2db78afdeaccaedc3b33a95442a4e55766887e17",
+             trust_remote_code=True,
+         )
+         target_dataset = datasets.load_dataset(
+             "facebook/flores",
+             self.languages[self.target],
+             revision="2db78afdeaccaedc3b33a95442a4e55766887e17",
+             trust_remote_code=True,
+         )
+
+         outputs = []
+         for split in self.splits.keys():
+             source_df = source_dataset[split].to_pandas()
+             target_df = target_dataset[split].to_pandas()
+             data = source_df.join(target_df, lsuffix="_source", rsuffix="_target")
+             for _, row in data.iterrows():
+                 input = Input(row["sentence_source"].strip())
+                 output = Output(row["sentence_target"].strip())
+                 references = [
+                     Reference(output, tags=[CORRECT_TAG]),
+                 ]
+                 instance = Instance(input=input, references=references, split=self.splits[split])
+                 outputs.append(instance)
+         return outputs
1099
+
1100
+
1101
+ # C. Natural Language Reasoning
1102
+ # 1. Natural Language Inference
1103
+ # 2. Causal Reasoning
1104
+
1105
+
1106
+ # 1. Natural Language Inference
1107
+ # 1.1 Indonesian: IndoNLI
1108
+ class IndoNLIScenario(Scenario):
1109
+ """
1110
+ IndoNLI is an Indonesian Natural Language Inference (NLI) scenario. The data is sourced from Wikipedia, news,
1111
+ and web articles. Native speakers use premise text from these sources and write hypothesis sentences for each
1112
+ NLI label. The labels are entailment, contradiction, or neutral.
1113
+
1114
+ The models are prompted using the following format:
1115
+
1116
+ Anda akan diberikan dua kalimat, X dan Y.
1117
+ Tentukan mana dari pernyataan berikut ini yang paling sesuai untuk kalimat X dan Y.
1118
+ A: Kalau X benar, maka Y juga harus benar.
1119
+ B: X bertentangan dengan Y.
1120
+ C: Ketika X benar, Y mungkin benar atau mungkin tidak benar.
1121
+ Jawablah dengan satu huruf saja, A, B atau C.
1122
+
1123
+ X: <sentence1>
1124
+ Y: <sentence2>
1125
+ Jawaban: <entailment>
1126
+
1127
+ ...
1128
+
1129
+ X: <sentence1>
1130
+ Y: <sentence2>
1131
+ Jawaban:
1132
+
1133
+ Target completion:
1134
+ <entailment>
1135
+
1136
+ @inproceedings{mahendra-etal-2021-indonli,
1137
+ title = "{I}ndo{NLI}: A Natural Language Inference Dataset for {I}ndonesian",
1138
+ author = "Mahendra, Rahmad and Aji, Alham Fikri and Louvan, Samuel and Rahman, Fahrurrozi and Vania, Clara",
1139
+ booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing",
1140
+ month = nov,
1141
+ year = "2021",
1142
+ address = "Online and Punta Cana, Dominican Republic",
1143
+ publisher = "Association for Computational Linguistics",
1144
+ url = "https://aclanthology.org/2021.emnlp-main.821",
1145
+ pages = "10511--10527",
1146
+ }
1147
+ """
1148
+
1149
+ name = "indonli"
1150
+ description = "IndoNLI Indonesian Natural Language Inference task"
1151
+ tags = ["natural_language_inference"]
1152
+
1153
+ def __init__(self):
1154
+ super().__init__()
1155
+ self.splits = {
1156
+ "train": TRAIN_SPLIT,
1157
+ "test": TEST_SPLIT,
1158
+ }
1159
+ self.id2label = {"e": "A", "c": "B", "n": "C"}
1160
+
1161
+ def download_dataset(self, output_path: str):
1162
+ URLS = {
1163
+ "train": "https://raw.githubusercontent.com/ir-nlp-csui/indonli/main/data/indonli/train.jsonl",
1164
+ "test": "https://raw.githubusercontent.com/ir-nlp-csui/indonli/main/data/indonli/test_lay.jsonl",
1165
+ }
1166
+
1167
+ dataset: Dict[str, pd.DataFrame] = {}
1168
+ for split in self.splits.keys():
1169
+ target_path_file = os.path.join(output_path, split)
1170
+ ensure_file_downloaded(source_url=URLS[split], target_path=target_path_file)
1171
+ df = pd.read_json(target_path_file, lines=True)
1172
+ if split == "test":
1173
+ dataset[split] = df.groupby("label", group_keys=False).apply(
1174
+ lambda x: x.sample(frac=1000 / len(df), random_state=4685)
1175
+ )
1176
+ else:
1177
+ dataset[split] = df
1178
+ return dataset
1179
+
1180
+ def get_instances(self, output_path) -> List[Instance]:
1181
+ dataset = self.download_dataset(output_path)
1182
+ outputs = []
1183
+ for split in self.splits.keys():
1184
+ data = dataset[split]
1185
+ for _, row in data.iterrows():
1186
+ passage = "X: " + row["premise"].strip() + "\nY: " + row["hypothesis"].strip()
1187
+ input = Input(passage)
1188
+ output = Output(self.id2label[row["label"]])
1189
+ references = [
1190
+ Reference(output, tags=[CORRECT_TAG]),
1191
+ ]
1192
+ instance = Instance(input=input, references=references, split=self.splits[split])
1193
+ outputs.append(instance)
1194
+ return outputs
+
+
+ # 1.2 Vietnamese & Thai: XNLI
+ class XNLIScenario(Scenario):
+     """
+     XNLI is a Natural Language Inference scenario for 15 languages. The data was constructed following the
+     MultiNLI crowdsourcing procedure to obtain English data, which was then professionally translated across
+     14 other languages. Labels are entailment, neutral, or contradiction.
+
+     The models are prompted using the following general format:
+
+         You will be given two sentences, X and Y.
+         Determine which of the following statements applies to sentences X and Y the best.
+         A: If X is true, Y must be true.
+         B: X contradicts Y.
+         C: When X is true, Y may or may not be true.
+         Answer strictly with a single letter A, B or C.
+
+         X: <sentence1>
+         Y: <sentence2>
+         Answer: <entailment>
+
+         ...
+
+         X: <sentence1>
+         Y: <sentence2>
+         Answer:
+
+     Target completion:
+         <entailment>
+
+     @inproceedings{conneau-etal-2018-xnli,
+         title = "{XNLI}: Evaluating Cross-lingual Sentence Representations",
+         author = "Conneau, Alexis and
+                   Rinott, Ruty and
+                   Lample, Guillaume and
+                   Williams, Adina and
+                   Bowman, Samuel and
+                   Schwenk, Holger and
+                   Stoyanov, Veselin",
+         editor = "Riloff, Ellen and
+                   Chiang, David and
+                   Hockenmaier, Julia and
+                   Tsujii, Jun{'}ichi",
+         booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing",
+         month = oct # "-" # nov,
+         year = "2018",
+         address = "Brussels, Belgium",
+         publisher = "Association for Computational Linguistics",
+         url = "https://aclanthology.org/D18-1269",
+         doi = "10.18653/v1/D18-1269",
+         pages = "2475--2485",
+     }
+     """
+
+     name = "xnli"
+     description = "XNLI Natural Language Inference task"
+     tags = ["natural_language_inference"]
+
+     def __init__(self, language: str):
+         super().__init__()
+         self.language = language
+         self.splits = {
+             "validation": TRAIN_SPLIT,
+             "test": TEST_SPLIT,
+         }
+         self.id2label = {0: "A", 2: "B", 1: "C"}
+
+     def get_instances(self, output_path) -> List[Instance]:
+         dataset = datasets.load_dataset("xnli", self.language)
+         outputs = []
+         for split in self.splits.keys():
+             df = dataset[split].to_pandas()
+             if split == "validation":
+                 data = df
+             else:
+                 # This produces 999 instances
+                 data = df.groupby("label", group_keys=False).apply(
+                     lambda x: x.sample(frac=1000 / len(df), random_state=4156)
+                 )
+
+                 # Add 1 neutral instance from remaining instances to the test data to make 1000 in total
+                 remainder = df[~df.index.isin(data.index)]
+                 neutral_instance = remainder[remainder["label"] == 1].iloc[0].to_frame().transpose()
+                 data = pd.concat([data, neutral_instance], axis=0, ignore_index=True)
+             for _, row in data.iterrows():
+                 passage = "X: " + row["premise"].strip() + "\nY: " + row["hypothesis"].strip()
+                 input = Input(passage)
+                 output = Output(self.id2label[int(row["label"])])
+                 references = [
+                     Reference(output, tags=[CORRECT_TAG]),
+                 ]
+                 instance = Instance(input=input, references=references, split=self.splits[split])
+                 outputs.append(instance)
+         return outputs
1290
+
1291
+
1292
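Two details above are easy to miss. First, id2label deliberately maps XNLI's integer labels (0 = entailment, 1 = neutral, 2 = contradiction) onto the letters as the prompt defines them, so 2 maps to B and 1 to C. Second, the fractional sample comes out at 999 because the XNLI test split has 5,010 examples, 1,670 per label, and pandas rounds each group's sample size. A quick arithmetic check:

    # Per-label sample size pandas computes for sample(frac=1000/5010) on 1670-row groups:
    n_total, n_per_label = 5010, 1670
    per_group = round(n_per_label * 1000 / n_total)  # 333
    print(per_group * 3)  # 999 -> hence one extra neutral row is appended to reach 1000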
+ # 1.3 Tamil: IndicXNLI
+ class IndicXNLIScenario(Scenario):
+     """
+     IndicXNLI is a Natural Language Inference scenario for 11 Indic languages. The data was
+     automatically translated from the English XNLI dataset into 11 Indic languages using
+     IndicTrans (Ramesh et al., 2021).
+
+     Only the Tamil subset of the data is used in this scenario. The labels are
+     entailment, contradiction and neutral.
+
+     The models are prompted using the following format:
+
+     உங்களுக்கு இரண்டு வாக்கியங்கள், X மற்றும் Y, தரப்படும்.
+     பின்வரும் கூற்றுகளில் எது X மற்றும் Y வாக்கியங்களுடன் மிகப் பொருந்துகிறது எனக் கண்டறியவும்.
+     A: X உண்மை என்றால் Y உம் உண்மையாக இருக்க வேண்டும்.
+     B: X உம் Y உம் முரண்படுகின்றன.
+     C: X உண்மையாக இருக்கும்போது Y உண்மையாக இருக்கலாம் அல்லது இல்லாமல் இருக்கலாம்.
+     A அல்லது B அல்லது C என்ற ஒரே எழுத்தில் மட்டும் பதிலளிக்கவும்.
+
+     X: <premise>
+     Y: <hypothesis>
+     பதில்: <entailment>
+
+     ...
+
+     X: <premise>
+     Y: <hypothesis>
+     பதில்:
+
+     Target completion:
+         <entailment>
+
+     @inproceedings{aggarwal-etal-2022-indicxnli,
+         title = "{I}ndic{XNLI}: Evaluating Multilingual Inference for {I}ndian Languages",
+         author = "Aggarwal, Divyanshu and
+           Gupta, Vivek and
+           Kunchukuttan, Anoop",
+         editor = "Goldberg, Yoav and
+           Kozareva, Zornitsa and
+           Zhang, Yue",
+         booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing",
+         month = dec,
+         year = "2022",
+         address = "Abu Dhabi, United Arab Emirates",
+         publisher = "Association for Computational Linguistics",
+         url = "https://aclanthology.org/2022.emnlp-main.755",
+         doi = "10.18653/v1/2022.emnlp-main.755",
+         pages = "10994--11006",
+     }
+     """
+
+     name = "indicxnli"
+     description = "IndicXNLI Natural Language Inference task"
+     tags = ["natural_language_inference"]
+
+     def __init__(self):
+         super().__init__()
+         self.splits = {
+             "validation": TRAIN_SPLIT,
+             "test": TEST_SPLIT,
+         }
+         self.id2label = {0: "A", 2: "B", 1: "C"}
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         dataset = datasets.load_dataset("Divyanshu/indicxnli", "ta")
+
+         outputs = []
+         for split in self.splits.keys():
+             df = dataset[split].to_pandas()
+             if split == "validation":
+                 data = df
+             else:
+                 # This produces 999 instances
+                 data = df.groupby("label", group_keys=False).apply(
+                     lambda x: x.sample(frac=1000 / len(df), random_state=4156)
+                 )
+
+                 # Add 1 neutral instance (label 1) from the remaining instances to make 1000 in total
+                 remainder = df[~df.index.isin(data.index)]
+                 neutral_instance = remainder[remainder["label"] == 1].iloc[0].to_frame().transpose()
+                 data = pd.concat([data, neutral_instance], axis=0, ignore_index=True)
+             for _, row in data.iterrows():
+                 passage = "X: " + row["premise"].strip() + "\nY: " + row["hypothesis"].strip()
+                 input = Input(passage)
+                 output = Output(text=self.id2label[int(row["label"])])
+                 references = [
+                     Reference(output, tags=[CORRECT_TAG]),
+                 ]
+                 instance = Instance(input=input, references=references, split=self.splits[split])
+                 outputs.append(instance)
+         return outputs
+
+
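The one-row top-up in both NLI scenarios uses a small pandas pattern: `.iloc[0]` yields a Series, and `.to_frame().transpose()` turns it back into a one-row DataFrame so `pd.concat` can append it. A toy sketch:

    import pandas as pd

    df = pd.DataFrame({"label": [0, 1, 2], "premise": ["p0", "p1", "p2"]})
    row = df[df["label"] == 1].iloc[0]    # a Series: the row's columns become its index
    one_row = row.to_frame().transpose()  # back to a 1-row DataFrame
    combined = pd.concat([df, one_row], axis=0, ignore_index=True)
    print(len(combined))  # 4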
+ # 2. Causal Reasoning: XCOPA
+ class XCOPAScenario(Scenario):
+     """
+     XCOPA is a commonsense causal reasoning scenario for 11 languages. The data is sourced from the English
+     COPA dataset and professionally translated into 11 languages to create a parallel dataset.
+
+     Only the Indonesian, Vietnamese, Thai and Tamil subsets are used in this scenario. Each instance consists of
+     a premise and two sentences. The system under test needs to determine which of the two sentences is more likely
+     to be the cause/effect of the premise. Whether the cause or the effect is asked for differs from instance to
+     instance. Although there should be an equal number of instances asking for the cause and for the effect, the
+     BHASA paper (Leong et al., 2023) found that this was not the case for Indonesian and Thai. This scenario
+     therefore fixes the cause/effect labels by harmonizing them across the four languages, using the Tamil subset
+     as the reference.
+
+     The models are prompted using the following general format:
+
+     Based on the following situation, which of the following choices is most likely to be its {cause/effect}?
+     Answer only with a single letter A or B.
+
+     Situation: <premise>
+     A: <choice1>
+     B: <choice2>
+     Answer: <answer>
+
+     ...
+
+     Situation: <premise>
+     A: <choice1>
+     B: <choice2>
+     Answer:
+
+     Target completion:
+         <answer>
+
+     @article{ponti2020xcopa,
+         title = {{XCOPA}: A Multilingual Dataset for Causal Commonsense Reasoning},
+         author = {Ponti, Edoardo M. and Glava{\v s}, Goran and Majewska, Olga and Liu, Qianchu and Vuli{\'c}, Ivan and Korhonen, Anna},
+         journal = {arXiv preprint},
+         year = {2020},
+         url = {https://ducdauge.github.io/files/xcopa.pdf}
+     }
+
+     @inproceedings{roemmele2011choice,
+         title = {Choice of plausible alternatives: An evaluation of commonsense causal reasoning},
+         author = {Roemmele, Melissa and Bejan, Cosmin Adrian and Gordon, Andrew S},
+         booktitle = {2011 AAAI Spring Symposium Series},
+         year = {2011},
+         url = {https://people.ict.usc.edu/~gordon/publications/AAAI-SPRING11A.PDF},
+     }
+     """
+
+     name = "xcopa"
+     description = "XCOPA causal reasoning task"
+     tags = ["causal_reasoning"]
+
+     def __init__(self, language: str):
+         super().__init__()
+         self.language = language
+         self.splits = {
+             "validation": TRAIN_SPLIT,
+             "test": TEST_SPLIT,
+         }
+         self.id2label = {
+             0: "A",
+             1: "B",
+         }
+         self.prompt = {
+             "id": {
+                 "cause": "sebab",
+                 "effect": "akibat",
+                 "instruction1": "Berdasarkan situasi di atas, mana dari pilihan-pilihan berikut ini yang lebih "
+                 "mungkin menjadi {}?",
+                 "instruction2": "Jawablah dengan satu huruf saja, A atau B.",
+             },
+             "ta": {
+                 "cause": "காரணமாக",
+                 "effect": "விளைவாக",
+                 "instruction1": "பின்வரும் வாக்கியங்களில் பெரும்பாலும் எது தரப்பட்ட சூழ்நிலைக்குரிய {} இருக்கும்?",
+                 "instruction2": "A அல்லது B என்ற ஒரே எழுத்தில் மட்டும் பதிலளிக்கவும்.",
+             },
+             "th": {
+                 "cause": "สาเหตุ",
+                 "effect": "ผล",
+                 "instruction1": "เมื่อพิจารณาจากสถานการณ์นี้ ตัวเลือกใดต่อไปนี้น่าจะเป็น{}มากกว่ากัน?",
+                 "instruction2": "กรุณาตอบด้วยตัวอักษร A หรือ B ตัวเดียวเท่านั้น",
+             },
+             "vi": {
+                 "cause": "nguyên nhân",
+                 "effect": "kết quả",
+                 "instruction1": "Với tình huống trên, lựa chọn nào dưới đây có khả năng cao là {} của nó hơn?",
+                 "instruction2": "Trả lời với một chữ cái duy nhất A hoặc B.",
+             },
+         }
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         language_dataset = datasets.load_dataset("xcopa", self.language)
+         tamil_dataset = datasets.load_dataset("xcopa", "ta")
+
+         outputs = []
+         for split in self.splits.keys():
+             language_df = language_dataset[split].to_pandas()
+             tamil_df = tamil_dataset[split].to_pandas()
+             data = pd.merge(
+                 language_df, tamil_df[["question", "idx"]], on="idx"
+             )  # Use the Tamil split's question column
+             for _, row in data.iterrows():
+                 instruction1 = self.prompt[self.language]["instruction1"].format(
+                     self.prompt[self.language][row["question_y"]]
+                 )
+                 passage = "{premise}\n{instruction1}\nA: {choice1}\nB: {choice2}\n{instruction2}".format(
+                     premise=row["premise"].strip(),
+                     instruction1=instruction1,
+                     choice1=row["choice1"].strip(),
+                     choice2=row["choice2"].strip(),
+                     instruction2=self.prompt[self.language]["instruction2"],
+                 )
+                 input = Input(passage)
+                 output = Output(self.id2label[int(row["label"])])
+                 references = [
+                     Reference(output, tags=[CORRECT_TAG]),
+                 ]
+                 instance = Instance(input=input, references=references, split=self.splits[split])
+                 outputs.append(instance)
+         return outputs
+
+
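The harmonization step is just a merge on the shared idx column: both frames carry a question column, so pandas suffixes the clash, and question_y is the Tamil (reference) value used to pick the cause/effect wording. A toy sketch of that suffixing behavior (the values are illustrative):

    import pandas as pd

    lang = pd.DataFrame({"idx": [1, 2], "question": ["effect", "cause"], "premise": ["a", "b"]})
    tamil = pd.DataFrame({"idx": [1, 2], "question": ["cause", "effect"]})

    merged = pd.merge(lang, tamil[["question", "idx"]], on="idx")
    # Overlapping column names get _x/_y suffixes: question_x is the language's own
    # label, question_y is the Tamil reference used to format instruction1.
    print(merged[["idx", "question_x", "question_y"]])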
+ # 1. Syntax: LINDSEA Minimal Pairs
+ class LINDSEASyntaxMinimalPairsScenario(Scenario):
+     """
+     The LINDSEA Minimal Pairs dataset is a linguistic diagnostic scenario targeting syntactic phenomena.
+     The data is manually handcrafted by linguists and native speakers and verified through multiple rounds
+     of quality control. The high-level categories tested include morphology, argument structure,
+     filler-gap dependencies, as well as negative polarity items and negation.
+
+     Each test item is a minimal pair: two sentences that differ minimally from each other and exemplify a
+     specific syntactic phenomenon. The system under test needs to determine which sentence of the pair is
+     more acceptable.
+
+     The models are prompted using the following general format:
+
+     Which sentence is more acceptable?
+     Answer only with a single letter A or B.
+     <sentence>
+
+     Target completion:
+         <sentence>
+
+     @misc{leong2023bhasa,
+         title={BHASA: A Holistic Southeast Asian Linguistic and Cultural Evaluation Suite for Large Language Models},
+         author={Wei Qi Leong and Jian Gang Ngui and Yosephine Susanto and Hamsawardhini Rengarajan
+             and Kengatharaiyer Sarveswaran and William Chandra Tjhi},
+         year={2023},
+         eprint={2309.06085},
+         archivePrefix={arXiv},
+         primaryClass={cs.CL},
+         url={https://arxiv.org/abs/2309.06085},
+     }
+     """
+
+     name = "lindsea_minimal_pairs"
+     description = "LINDSEA minimal pairs task"
+     tags = ["minimal_pairs", "linguistic_diagnostic", "syntax"]
+
+     def __init__(self, method: str, language: str):
+         super().__init__()
+         self.method = method
+         self.language = language
+         self.prompts = {
+             "id": {
+                 "instructions": "Kalimat mana yang lebih mungkin?",
+                 "output_prefix": "Jawablah dengan satu huruf saja, A atau B.",
+             }
+         }
+
+     def download_dataset(self, output_path: str) -> pd.DataFrame:
+         BASE_URL = "https://raw.githubusercontent.com/aisingapore/BHASA/main/lindsea/"
+         URLS = {
+             "npis_and_negation": f"{BASE_URL}{self.language}/syntax/NPIs_and_negation.jsonl",
+             "argument_structure": f"{BASE_URL}{self.language}/syntax/argument_structure.jsonl",
+             "filler_gap_dependencies": f"{BASE_URL}{self.language}/syntax/filler-gap_dependencies.jsonl",
+             "morphology": f"{BASE_URL}{self.language}/syntax/morphology.jsonl",
+         }
+
+         data_files = {}
+         for file in URLS.keys():
+             target_path_file = os.path.join(output_path, file)
+             ensure_file_downloaded(source_url=URLS[file], target_path=target_path_file)
+             data_files[file] = pd.read_json(target_path_file, lines=True)
+         dataset = pd.concat(data_files)
+
+         return dataset
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         data = self.download_dataset(output_path)
+
+         outputs = []
+         if self.method == "mcq":
+             category_list = data["category"].value_counts().keys()
+             hlog("MCQ method for LINDSEA Minimal Pairs chosen. Shuffling options...")
+             for category in category_list:
+                 # Fix the shuffling within each category
+                 random.seed(1)
+                 for _, row in data[data["category"] == category].iterrows():
+                     options = [(row["correct"], 1), (row["wrong"], 2)]
+                     random.shuffle(options)
+                     options_reversed = options[0][1] == 2
+
+                     prompt_components = self.prompts[self.language]
+                     instructions = prompt_components["instructions"]
+                     output_prefix = prompt_components["output_prefix"]
+                     prompt = f"{instructions}\nA: {options[0][0]}\nB: {options[1][0]}\n{output_prefix}"
+                     input = Input(text=prompt)
+                     # Determine the correct option based on whether shuffling reversed the options
+                     references = [
+                         Reference(Output(text="A"), tags=[] if options_reversed else [CORRECT_TAG]),
+                         Reference(Output(text="B"), tags=[CORRECT_TAG] if options_reversed else []),
+                     ]
+                     instance = Instance(input=input, references=references, split=TEST_SPLIT)
+                     outputs.append(instance)
+
+         else:
+             for _, row in data.iterrows():
+                 # No need to shuffle since we compare the logprobs of the options separately
+                 input = Input(text="")
+                 references = [
+                     Reference(Output(text=row["correct"].strip()), tags=[CORRECT_TAG]),
+                     Reference(Output(text=row["wrong"].strip()), tags=[]),
+                 ]
+                 instance = Instance(
+                     input=input,
+                     references=references,
+                     split=TEST_SPLIT,
+                 )
+                 outputs.append(instance)
+         return outputs
+
+
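Note how the mcq branch keeps the shuffle reproducible and recovers the gold letter afterwards: each option carries a marker (1 = correct, 2 = wrong) through random.shuffle, and reseeding before each category pins the order across runs. A stripped-down sketch of that bookkeeping (the sentence strings are made up):

    import random

    random.seed(1)  # a fixed seed makes the option order identical on every run
    options = [("the acceptable sentence", 1), ("the unacceptable sentence", 2)]
    random.shuffle(options)

    options_reversed = options[0][1] == 2  # True when the wrong sentence landed in slot A
    correct_letter = "B" if options_reversed else "A"
    print(correct_letter)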
+ # 2. Pragmatics
+ # 2.1 LINDSEA Pragmatic Reasoning (single sentence)
+ class LINDSEAPragmaticsPragmaticReasoningSingleScenario(Scenario):
+     """
+     The LINDSEA Pragmatic Reasoning dataset is a linguistic diagnostic scenario targeting pragmatics.
+     The data is manually handcrafted by linguists and native speakers and verified through multiple rounds
+     of quality control. The high-level categories tested include scalar implicatures and presuppositions.
+
+     The single-sentence pragmatic reasoning dataset involves questions targeting the truth value of a single
+     sentence. The system under test needs to determine whether the sentence is true/false, or whether the
+     proposition is possible/impossible.
+
+     The models are prompted using the following general format:
+
+     Is the following statement true or false?
+     Statement: <sentence>
+     Answer only with True or False.
+
+     Target completion:
+         <answer>
+
+     @misc{leong2023bhasa,
+         title={BHASA: A Holistic Southeast Asian Linguistic and Cultural Evaluation Suite for Large Language Models},
+         author={Wei Qi Leong and Jian Gang Ngui and Yosephine Susanto and Hamsawardhini Rengarajan
+             and Kengatharaiyer Sarveswaran and William Chandra Tjhi},
+         year={2023},
+         eprint={2309.06085},
+         archivePrefix={arXiv},
+         primaryClass={cs.CL}
+     }
+     """
+
+     name = "lindsea_pragmatic_reasoning_single"
+     description = "LINDSEA pragmatic reasoning single sentence task"
+     tags = ["pragmatic_reasoning", "linguistic_diagnostic", "pragmatics"]
+
+     def __init__(self, language: str):
+         super().__init__()
+         self.language = language
+         self.prompt = {
+             "id": {
+                 "question": "Apakah pernyataan berikut ini {}?",
+                 "instruction": "Jawablah dengan {} saja.",
+             },
+         }
+
+     def download_dataset(self, output_path: str) -> pd.DataFrame:
+         BASE_URL = "https://raw.githubusercontent.com/aisingapore/BHASA/main/lindsea/"
+         URL = f"{BASE_URL}{self.language}/pragmatics/pragmatic_reasoning_single.jsonl"
+         file = "pragmatic_reasoning_single"
+         target_path_file = os.path.join(output_path, file)
+         ensure_file_downloaded(source_url=URL, target_path=target_path_file)
+         dataset = pd.read_json(target_path_file, lines=True)
+         return dataset
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         data = self.download_dataset(output_path)
+         outputs = []
+         for _, row in data.iterrows():
+             passage = "{question}\nPernyataan: {text}\n{instruction}".format(
+                 question=self.prompt[self.language]["question"].format(row["question_translated"]),
+                 text=row["text"],
+                 instruction=self.prompt[self.language]["instruction"].format(row["choices_translated"]),
+             )
+             input = Input(text=passage)
+
+             # Split "True or False" into ["True", "or", "False"]
+             choices = row["choices"].split()
+             choices_translated = row["choices_translated"].split()
+             label2choice = {
+                 choices[0]: choices_translated[0],
+                 choices[2]: choices_translated[2],
+             }
+             references = [
+                 Reference(Output(text=label2choice[row["label"].strip()]), tags=[CORRECT_TAG]),
+             ]
+             instance = Instance(
+                 input=input,
+                 references=references,
+                 split=TEST_SPLIT,
+             )
+             outputs.append(instance)
+         return outputs
+
+
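The label2choice mapping works because choices and choices_translated are parallel three-token strings ("True or False" and its translation), so whitespace-splitting aligns indices 0 and 2 across the two languages. A sketch (the Indonesian string is illustrative, standing in for the dataset's choices_translated field):

    choices = "True or False".split()                # ["True", "or", "False"]
    choices_translated = "Benar atau Salah".split()  # illustrative translated choices
    label2choice = {
        choices[0]: choices_translated[0],  # "True"  -> "Benar"
        choices[2]: choices_translated[2],  # "False" -> "Salah"
    }
    print(label2choice["True"])  # Benar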
+ # 2.2 Pragmatics: LINDSEA Pragmatic Reasoning (sentence pair)
+ class LINDSEAPragmaticsPragmaticReasoningPairScenario(Scenario):
+     """
+     The LINDSEA Pragmatic Reasoning dataset is a linguistic diagnostic scenario targeting pragmatics.
+     The data is manually handcrafted by linguists and native speakers and verified through multiple rounds
+     of quality control. The high-level categories tested include scalar implicatures and presuppositions.
+
+     The sentence-pair pragmatic reasoning dataset involves questions targeting whether a conclusion can be
+     drawn from another sentence.
+
+     The models are prompted using the following general format:
+
+     Situation: <premise>
+     Given this situation, is the following statement true or false?
+     Statement: <hypothesis>
+     Answer only with True or False.
+
+     Target completion:
+         <answer>
+
+     @misc{leong2023bhasa,
+         title={BHASA: A Holistic Southeast Asian Linguistic and Cultural Evaluation Suite for Large Language Models},
+         author={Wei Qi Leong and Jian Gang Ngui and Yosephine Susanto and Hamsawardhini Rengarajan
+             and Kengatharaiyer Sarveswaran and William Chandra Tjhi},
+         year={2023},
+         eprint={2309.06085},
+         archivePrefix={arXiv},
+         primaryClass={cs.CL}
+     }
+     """
+
+     name = "lindsea_pragmatic_reasoning_pair"
+     description = "LINDSEA pragmatic reasoning sentence pair task"
+     tags = ["pragmatic_reasoning", "linguistic_diagnostic", "pragmatics"]
+
+     def __init__(self, language: str):
+         super().__init__()
+         self.language = language
+         self.prompt = {
+             "id": {
+                 "question": "Berdasarkan situasi ini, apakah pernyataan berikut ini benar atau salah?",
+                 "instruction": "Jawablah dengan Benar atau Salah saja.",
+                 True: "Benar",
+                 False: "Salah",
+             },
+         }
+
+     def download_dataset(self, output_path: str) -> pd.DataFrame:
+         BASE_URL = "https://raw.githubusercontent.com/aisingapore/BHASA/main/lindsea/"
+         URL = f"{BASE_URL}{self.language}/pragmatics/pragmatic_reasoning_pair.jsonl"
+         file = "pragmatic_reasoning_pair"
+         target_path_file = os.path.join(output_path, file)
+         ensure_file_downloaded(source_url=URL, target_path=target_path_file)
+         dataset = pd.read_json(target_path_file, lines=True)
+         return dataset
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         data = self.download_dataset(output_path)
+         outputs = []
+         for _, row in data.iterrows():
+             passage = "Situasi: {premise}\n{question}\nPernyataan: {conclusion}\n{instruction}".format(
+                 premise=row["text"],
+                 question=self.prompt[self.language]["question"],
+                 conclusion=row["conclusion"],
+                 instruction=self.prompt[self.language]["instruction"],
+             )
+             input = Input(text=passage)
+             references = [
+                 Reference(Output(text=self.prompt[self.language][row["label"]]), tags=[CORRECT_TAG]),
+             ]
+             instance = Instance(
+                 input=input,
+                 references=references,
+                 split=TEST_SPLIT,
+             )
+             outputs.append(instance)
+         return outputs
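A final detail: the prompt dict above is keyed by the Python booleans True and False, which works because pd.read_json(..., lines=True) parses JSON true/false into a boolean column, and a numpy bool hashes and compares like the Python bool. A self-contained sketch:

    import io
    import pandas as pd

    prompt = {True: "Benar", False: "Salah"}  # the translated verbalizations, as in the scenario

    jsonl = '{"text": "p", "conclusion": "c", "label": true}\n{"text": "q", "conclusion": "d", "label": false}'
    df = pd.read_json(io.StringIO(jsonl), lines=True)

    for _, row in df.iterrows():
        print(prompt[row["label"]])  # numpy.bool_ hashes like the Python bool, so the lookup succeeds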