crfm-helm 0.5.2__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (184)
  1. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/METADATA +29 -55
  2. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/RECORD +146 -134
  3. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
  5. helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
  6. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
  7. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
  8. helm/benchmark/adaptation/common_adapter_specs.py +2 -0
  9. helm/benchmark/annotation/anthropic_red_team_annotator.py +70 -0
  10. helm/benchmark/annotation/call_center_annotator.py +247 -0
  11. helm/benchmark/annotation/financebench_annotator.py +79 -0
  12. helm/benchmark/annotation/harm_bench_annotator.py +68 -0
  13. helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
  14. helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
  15. helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
  16. helm/benchmark/annotation/live_qa_annotator.py +32 -45
  17. helm/benchmark/annotation/medication_qa_annotator.py +31 -44
  18. helm/benchmark/annotation/model_as_judge.py +45 -0
  19. helm/benchmark/annotation/simple_safety_tests_annotator.py +64 -0
  20. helm/benchmark/annotation/xstest_annotator.py +110 -0
  21. helm/benchmark/metrics/annotation_metrics.py +108 -0
  22. helm/benchmark/metrics/bhasa_metrics.py +188 -0
  23. helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
  24. helm/benchmark/metrics/code_metrics_helper.py +11 -1
  25. helm/benchmark/metrics/safety_metrics.py +57 -0
  26. helm/benchmark/metrics/summac/model_summac.py +3 -3
  27. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
  28. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
  29. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  30. helm/benchmark/metrics/vision_language/image_utils.py +1 -1
  31. helm/benchmark/model_metadata_registry.py +3 -3
  32. helm/benchmark/presentation/test_run_entry.py +1 -0
  33. helm/benchmark/run.py +15 -0
  34. helm/benchmark/run_expander.py +56 -30
  35. helm/benchmark/run_specs/bhasa_run_specs.py +638 -0
  36. helm/benchmark/run_specs/call_center_run_specs.py +152 -0
  37. helm/benchmark/run_specs/decodingtrust_run_specs.py +8 -8
  38. helm/benchmark/run_specs/experimental_run_specs.py +52 -0
  39. helm/benchmark/run_specs/finance_run_specs.py +78 -1
  40. helm/benchmark/run_specs/safety_run_specs.py +154 -0
  41. helm/benchmark/run_specs/vlm_run_specs.py +92 -21
  42. helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
  43. helm/benchmark/scenarios/banking77_scenario.py +51 -0
  44. helm/benchmark/scenarios/bhasa_scenario.py +1798 -0
  45. helm/benchmark/scenarios/call_center_scenario.py +84 -0
  46. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
  47. helm/benchmark/scenarios/ewok_scenario.py +116 -0
  48. helm/benchmark/scenarios/fin_qa_scenario.py +2 -0
  49. helm/benchmark/scenarios/financebench_scenario.py +53 -0
  50. helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
  51. helm/benchmark/scenarios/scenario.py +1 -1
  52. helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
  53. helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
  54. helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
  55. helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
  56. helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
  57. helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
  58. helm/benchmark/scenarios/test_math_scenario.py +2 -8
  59. helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
  60. helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
  61. helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
  62. helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
  63. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
  64. helm/benchmark/scenarios/vision_language/bingo_scenario.py +2 -2
  65. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
  66. helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
  67. helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
  68. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
  69. helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
  70. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
  71. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
  72. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -3
  73. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -1
  74. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
  75. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
  76. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
  77. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +41 -12
  78. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
  79. helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
  80. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
  81. helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
  82. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
  83. helm/benchmark/scenarios/vision_language/pairs_scenario.py +1 -1
  84. helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
  85. helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
  86. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
  87. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +2 -2
  88. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +6 -3
  89. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
  90. helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
  91. helm/benchmark/scenarios/xstest_scenario.py +35 -0
  92. helm/benchmark/server.py +1 -6
  93. helm/benchmark/static/schema_air_bench.yaml +750 -750
  94. helm/benchmark/static/schema_bhasa.yaml +709 -0
  95. helm/benchmark/static/schema_call_center.yaml +232 -0
  96. helm/benchmark/static/schema_cleva.yaml +768 -0
  97. helm/benchmark/static/schema_decodingtrust.yaml +444 -0
  98. helm/benchmark/static/schema_ewok.yaml +367 -0
  99. helm/benchmark/static/schema_finance.yaml +55 -9
  100. helm/benchmark/static/{schema_image2structure.yaml → schema_image2struct.yaml} +231 -90
  101. helm/benchmark/static/schema_safety.yaml +247 -0
  102. helm/benchmark/static/schema_tables.yaml +124 -7
  103. helm/benchmark/static/schema_thai.yaml +21 -0
  104. helm/benchmark/static/schema_vhelm.yaml +96 -91
  105. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  106. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  107. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  108. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  109. helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
  110. helm/benchmark/static_build/assets/index-58f97dcd.js +10 -0
  111. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  112. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  113. helm/benchmark/static_build/index.html +2 -2
  114. helm/benchmark/window_services/test_openai_window_service.py +8 -8
  115. helm/clients/ai21_client.py +71 -1
  116. helm/clients/anthropic_client.py +7 -19
  117. helm/clients/huggingface_client.py +38 -37
  118. helm/clients/nvidia_nim_client.py +35 -0
  119. helm/clients/openai_client.py +2 -3
  120. helm/clients/palmyra_client.py +25 -0
  121. helm/clients/perspective_api_client.py +11 -6
  122. helm/clients/test_client.py +4 -6
  123. helm/clients/vision_language/open_flamingo_client.py +1 -2
  124. helm/clients/vision_language/palmyra_vision_client.py +28 -13
  125. helm/common/images_utils.py +6 -0
  126. helm/common/mongo_key_value_store.py +2 -1
  127. helm/common/request.py +16 -0
  128. helm/config/model_deployments.yaml +315 -332
  129. helm/config/model_metadata.yaml +384 -110
  130. helm/config/tokenizer_configs.yaml +116 -11
  131. helm/proxy/example_queries.py +14 -21
  132. helm/proxy/services/server_service.py +1 -2
  133. helm/proxy/token_counters/test_auto_token_counter.py +2 -2
  134. helm/tokenizers/ai21_tokenizer.py +51 -59
  135. helm/tokenizers/cohere_tokenizer.py +0 -75
  136. helm/tokenizers/huggingface_tokenizer.py +0 -1
  137. helm/tokenizers/test_ai21_tokenizer.py +48 -0
  138. helm/benchmark/static/benchmarking.css +0 -156
  139. helm/benchmark/static/benchmarking.js +0 -1705
  140. helm/benchmark/static/config.js +0 -3
  141. helm/benchmark/static/general.js +0 -122
  142. helm/benchmark/static/images/crfm-logo.png +0 -0
  143. helm/benchmark/static/images/helm-logo-simple.png +0 -0
  144. helm/benchmark/static/images/helm-logo.png +0 -0
  145. helm/benchmark/static/images/language-model-helm.png +0 -0
  146. helm/benchmark/static/images/organizations/ai21.png +0 -0
  147. helm/benchmark/static/images/organizations/anthropic.png +0 -0
  148. helm/benchmark/static/images/organizations/bigscience.png +0 -0
  149. helm/benchmark/static/images/organizations/cohere.png +0 -0
  150. helm/benchmark/static/images/organizations/eleutherai.png +0 -0
  151. helm/benchmark/static/images/organizations/google.png +0 -0
  152. helm/benchmark/static/images/organizations/meta.png +0 -0
  153. helm/benchmark/static/images/organizations/microsoft.png +0 -0
  154. helm/benchmark/static/images/organizations/nvidia.png +0 -0
  155. helm/benchmark/static/images/organizations/openai.png +0 -0
  156. helm/benchmark/static/images/organizations/together.png +0 -0
  157. helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
  158. helm/benchmark/static/images/organizations/yandex.png +0 -0
  159. helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
  160. helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
  161. helm/benchmark/static/index.html +0 -68
  162. helm/benchmark/static/info-icon.png +0 -0
  163. helm/benchmark/static/json-urls.js +0 -69
  164. helm/benchmark/static/plot-captions.js +0 -27
  165. helm/benchmark/static/utils.js +0 -285
  166. helm/benchmark/static_build/assets/index-30dbceba.js +0 -10
  167. helm/benchmark/static_build/assets/index-66b02d40.css +0 -1
  168. helm/benchmark/window_services/ai21_window_service.py +0 -247
  169. helm/benchmark/window_services/cohere_window_service.py +0 -101
  170. helm/benchmark/window_services/test_ai21_window_service.py +0 -163
  171. helm/benchmark/window_services/test_cohere_window_service.py +0 -75
  172. helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
  173. helm/benchmark/window_services/test_ice_window_service.py +0 -327
  174. helm/tokenizers/ice_tokenizer.py +0 -30
  175. helm/tokenizers/test_ice_tokenizer.py +0 -57
  176. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/LICENSE +0 -0
  177. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/entry_points.txt +0 -0
  178. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/top_level.txt +0 -0
  179. /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
  180. /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
  181. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/__init__.py +0 -0
  182. /helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +0 -0
  183. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/__init__.py +0 -0
  184. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0
helm/benchmark/run_specs/bhasa_run_specs.py (new file)
@@ -0,0 +1,638 @@
+from helm.benchmark.adaptation.adapter_spec import (
+    ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
+)
+from helm.benchmark.adaptation.common_adapter_specs import (
+    get_generation_adapter_spec,
+    get_multiple_choice_separate_adapter_spec,
+)
+from helm.benchmark.metrics.bhasa_metrics_specs import (
+    get_bhasa_machine_translation_metric_specs,
+    get_bhasa_qa_metric_specs,
+)
+from helm.benchmark.metrics.common_metric_specs import (
+    get_basic_metric_specs,
+    get_exact_match_metric_specs,
+    get_classification_metric_specs,
+)
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+# BHASA Run Specs
+# A. Natural Language Understanding
+# B. Natural Language Generation
+# C. Natural Language Reasoning
+# D. Linguistic Diagnostics
+
+# A. Natural Language Understanding
+# 1. Question Answering
+# 2. Sentiment Analysis
+# 3. Toxicity Detection/Classification
+
+
+# 1. Question Answering
+# 1.1 Indonesian: TyDiQA
+@run_spec_function("tydiqa")
+def get_tydiqa_spec() -> RunSpec:
+    name = "tydiqa"
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions="Anda akan diberikan sebuah paragraf dan sebuah pertanyaan. Jawablah pertanyaannya dengan "
+        "mengekstrak jawaban dari paragraf tersebut.",
+        output_noun="Jawaban",
+        stop_sequences=["\n"],
+        max_tokens=256,
+    )
+
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bhasa_scenario.TyDiQAScenario")
+
+    return RunSpec(
+        name=name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_bhasa_qa_metric_specs(
+            args={
+                "language": "id",
+            }
+        ),
+        groups=["bhasa_nlu", "tydiqa"],
+    )
+
+
+# 1.2 Vietnamese & Thai: XQuAD
+XQUAD_PROMPTS = {
+    "th": {
+        "instructions": "คุณจะได้รับข้อความและคำถาม กรุณาตอบคำถามโดยแยกคำตอบจากข้อความ",
+        "output_noun": "คำตอบ",
+    },
+    "vi": {
+        "instructions": "Bạn sẽ được cho một đoạn văn và một câu hỏi. Trả lời câu hỏi bằng cách trích xuất câu "
+        "trả lời từ đoạn văn.",
+        "output_noun": "Câu trả lời",
+    },
+}
+
+
+@run_spec_function("xquad")
+def get_xquad_spec(language="th") -> RunSpec:
+    name = f"xquad_{language}"
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions=XQUAD_PROMPTS[language]["instructions"],
+        output_noun=XQUAD_PROMPTS[language]["output_noun"],
+        stop_sequences=["\n"],
+        max_tokens=256,
+    )
+
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.bhasa_scenario.XQuADScenario",
+        args={
+            "language": language,
+        },
+    )
+
+    return RunSpec(
+        name=name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_bhasa_qa_metric_specs(
+            args={
+                "language": language,
+            }
+        ),
+        groups=["bhasa_nlu", f"xquad_{language}"],
+    )
+
+
+# 1.3 Tamil: IndicQA
+@run_spec_function("indicqa")
+def get_indicqa_spec() -> RunSpec:
+    name = "indicqa"
+    i = "உங்களுக்கு ஒரு பத்தியும் ஒரு கேள்வியும் தரப்படும். தரப்பட்ட பத்தியிலிருந்து கேள்விக்கான பதிலைக் கண்டறியவும்."
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions=i,
+        output_noun="பதில்",
+        stop_sequences=["\n"],
+        max_tokens=256,
+    )
+
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bhasa_scenario.IndicQAScenario")
+
+    return RunSpec(
+        name=name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_bhasa_qa_metric_specs(
+            args={
+                "language": "ta",
+            }
+        ),
+        groups=["bhasa_nlu", "indicqa"],
+    )
+
+
+# 2. Sentiment Analysis
+# 2.1 Indonesian: NusaX Sentiment
+@run_spec_function("nusax")
+def get_nusax_spec() -> RunSpec:
+    name = "nusax"
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions="Apa sentimen dari kalimat berikut ini?\nJawablah dengan satu kata saja:"
+        "\n- Positif\n- Negatif\n- Netral",
+        input_noun="Kalimat",
+        output_noun="Jawaban",
+        stop_sequences=["\n"],
+        max_tokens=16,
+    )
+
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bhasa_scenario.NusaXScenario")
+
+    return RunSpec(
+        name=name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
+        groups=["bhasa_nlu", "nusax"],
+    )
+
+
+# 2.2 Vietnamese: UIT-VSFC
+@run_spec_function("uitvsfc")
+def get_uitvsfc_spec() -> RunSpec:
+    name = "uitvsfc"
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions="Sắc thái của câu sau đây là gì?\nTrả lời với một từ duy nhất:"
+        "\n- Tích cực\n- Tiêu cực\n- Trung lập",
+        input_noun="Câu văn",
+        output_noun="Câu trả lời",
+        stop_sequences=["\n"],
+        max_tokens=16,
+    )
+
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bhasa_scenario.UITVSFCScenario")
+
+    return RunSpec(
+        name=name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
+        groups=["bhasa_nlu", "uitvsfc"],
+    )
+
+
+# 2.3 Thai: Wisesight Sentiment
+@run_spec_function("wisesight")
+def get_wisesight_spec() -> RunSpec:
+    name = "wisesight"
+    i = "อารมณ์ความรู้สึกของข้อความต่อไปนี้เป็นอย่างไร?\nกรุณาตอบโดยใช้คำเดียวเท่านั้น:\n- แง่บวก\n- แง่ลบ\n- เฉยๆ"
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions=i,
+        input_noun="ข้อความ",
+        output_noun="คำตอบ",
+        stop_sequences=["\n"],
+        max_tokens=16,
+    )
+
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bhasa_scenario.WisesightScenario")
+
+    return RunSpec(
+        name=name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
+        groups=["bhasa_nlu", "wisesight"],
+    )
+
+
+# 2.4 Tamil: IndicSentiment
+@run_spec_function("indicsentiment")
+def get_indicsentiment_spec() -> RunSpec:
+    name = "indicsentiment"
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions="பின்வரும் வாக்கியத்தில் வெளிப்படுத்தப்படும் உணர்வு எது?\nஒரு சொல்லில் மட்டும் பதிலளிக்கவும்:"
+        "\n- நேர்மறை\n- எதிர்மறை",
+        input_noun="வாக்கியம்",
+        output_noun="பதில்",
+        stop_sequences=["\n"],
+        max_tokens=16,
+    )
+
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bhasa_scenario.IndicSentimentScenario")
+
+    return RunSpec(
+        name=name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_classification_metric_specs() + get_basic_metric_specs([]),
+        groups=["bhasa_nlu", "indicsentiment"],
+    )
+
+
+# 3. Toxicity Detection/Classification
+# 3.1 Indonesian: Multi-Label Hate Speech Detection
+@run_spec_function("mlhsd")
+def get_mlhsd_spec() -> RunSpec:
+    name = "mlhsd"
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions="Anda adalah pendeteksi ujaran kebencian. Definisi dari labelnya adalah sebagai berikut:"
+        "\nBersih: Tidak ada ujaran kebencian.\nKasar: Ada ujaran kebencian dan kata-kata kasar, namun "
+        "tidak menyerang pihak tertentu.\nBenci: Ada ujaran kebencian atau serangan langsung terhadap pihak "
+        "tertentu.\nBerdasarkan definisi labelnya, klasifikasikan kalimat berikut ini dengan satu kata saja:"
+        "\n- Bersih\n- Kasar\n- Benci",
+        input_noun="Kalimat",
+        output_noun="Jawaban",
+        stop_sequences=["\n"],
+        max_tokens=16,
+    )
+
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bhasa_scenario.MLHSDScenario")
+
+    return RunSpec(
+        name=name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
+        groups=["bhasa_nlu", "mlhsd"],
+    )
+
+
+# 3.2 Vietnamese: ViHSD
+@run_spec_function("vihsd")
+def get_vihsd_spec() -> RunSpec:
+    name = "vihsd"
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions="Bạn là máy phát hiện phát ngôn thù ghét. Các nhãn được định nghĩa như sau:"
+        "\nSạch: Không quấy rối.\nCông kích: Bao gồm quấy rối và thậm chí chửi thề, nhưng không "
+        "tấn công bất kì đối tượng cụ thể nào.\nThù ghét: Trực tiếp quấy rối hay lăng mạ một đối "
+        "tượng cụ thể.\nVới các định nghĩa của nhãn, hãy phân loại câu dưới đây với một từ duy nhất:"
+        "\n- Sạch\n- Công kích\n- Thù ghét",
+        input_noun="Câu văn",
+        output_noun="Câu trả lời",
+        stop_sequences=["\n"],
+        max_tokens=16,
+    )
+
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bhasa_scenario.ViHSDScenario")
+
+    return RunSpec(
+        name=name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
+        groups=["bhasa_nlu", "vihsd"],
+    )
+
+
+# 3.3 Thai: Thai Toxicity Tweets
+@run_spec_function("thaitoxicitytweets")
+def get_thaitoxicitytweets_spec() -> RunSpec:
+    name = "thaitoxicitytweets"
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions="คุณเป็นเครื่องตรวจจับข้อความที่เป็นพิษ\nข้อความเป็นพิษถ้ามันทำร้าย สร้างความเสียหาย "
+        "หรือสื่อเจตนาเชิงลบถึงบุคคลหรือกลุ่มบุคคล และคุณควรคำนึงถึงการประชดประชันด้วย\nเมื่อได้รับข้อความ "
+        "ให้ตอบด้วยตัวอักษร Y หากเป็นพิษ และ N หากไม่เป็นพิษ",
+        input_noun="ข้อความ",
+        output_noun="คำตอบ",
+        stop_sequences=["\n"],
+        max_tokens=16,
+    )
+
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bhasa_scenario.ThaiToxicityTweetsScenario")
+
+    return RunSpec(
+        name=name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
+        groups=["bhasa_nlu", "thaitoxicitytweets"],
+    )
+
+
+# B. Natural Language Generation
+# 1. Machine Translation
+
+# 1. Machine Translation: FLoRes-200
+TRANSLATION_PROMPTS = {
+    "en_id": {
+        "instructions": "Terjemahkan teks berikut ini ke dalam Bahasa Indonesia.",
+        "input_noun": "Teks",
+        "output_noun": "Terjemahan",
+    },
+    "en_ta": {
+        "instructions": "பின்வரும் உரையைத் தமிழ் மொழிக்கு மொழிபெயர்க்கவும்.",
+        "input_noun": "உரை",
+        "output_noun": "மொழிபெயர்ப்பு",
+    },
+    "en_th": {
+        "instructions": "กรุณาแปลข้อความต่อไปนี้เป็นภาษาไทย",
+        "input_noun": "ข้อความ",
+        "output_noun": "คำแปล",
+    },
+    "en_vi": {
+        "instructions": "Dịch văn bản dưới đây sang Tiếng Việt.",
+        "input_noun": "Văn bản",
+        "output_noun": "Bản dịch",
+    },
+    "id_en": {
+        "instructions": "Terjemahkan teks berikut ini ke dalam Bahasa Inggris.",
+        "input_noun": "Teks",
+        "output_noun": "Terjemahan",
+    },
+    "ta_en": {
+        "instructions": "பின்வரும் உரையை ஆங்கில மொழிக்கு மொழிபெயர்க்கவும்.",
+        "input_noun": "உரை",
+        "output_noun": "மொழிபெயர்ப்பு",
+    },
+    "th_en": {
+        "instructions": "กรุณาแปลข้อความต่อไปนี้เป็นภาษาอังกฤษ",
+        "input_noun": "ข้อความ",
+        "output_noun": "คำแปล",
+    },
+    "vi_en": {
+        "instructions": "Dịch văn bản dưới đây sang Tiếng Anh.",
+        "input_noun": "Văn bản",
+        "output_noun": "Bản dịch",
+    },
+}
+
+
+@run_spec_function("flores")
+def get_flores_spec(source="en", target="id") -> RunSpec:
+    pair = f"{source}_{target}"
+    name = f"flores_{pair}"
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions=TRANSLATION_PROMPTS[pair]["instructions"],
+        input_noun=TRANSLATION_PROMPTS[pair]["input_noun"],
+        output_noun=TRANSLATION_PROMPTS[pair]["output_noun"],
+        stop_sequences=["\n"],
+        max_tokens=256,
+        sample_train=False,
+    )
+
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.bhasa_scenario.FloresScenario",
+        args={
+            "pair": pair,
+        },
+    )
+
+    return RunSpec(
+        name=name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_bhasa_machine_translation_metric_specs(),
+        groups=["bhasa_nlg", f"flores_{pair}"],
+    )
+
+
+# C. Natural Language Reasoning
+# 1. Natural Language Inference
+# 2. Causal Reasoning
+
+
+# 1. Natural Language Inference
+# 1.1 Indonesian: IndoNLI
+@run_spec_function("indonli")
+def get_indonli_spec() -> RunSpec:
+    name = "indonli"
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions="Anda akan diberikan dua kalimat, X dan Y.\nTentukan mana dari pernyataan berikut "
+        "ini yang paling sesuai untuk kalimat X dan Y.\nA: Kalau X benar, maka Y juga harus benar."
+        "\nB: X bertentangan dengan Y.\nC: Ketika X benar, Y mungkin benar atau mungkin tidak benar."
+        "\nJawablah dengan satu huruf saja, A, B atau C.",
+        output_noun="Jawaban",
+        stop_sequences=["\n"],
+        max_tokens=2,
+    )
+
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bhasa_scenario.IndoNLIScenario")
+
+    return RunSpec(
+        name=name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
+        groups=["bhasa_nlr", "indonli"],
+    )
+
+
+# 1.2 Vietnamese & Thai: XNLI
+XNLI_PROMPTS = {
+    "th": {
+        "instructions": "คุณจะได้รับสองข้อความ X และ Y",
+        "input_suffix": "กรุณาพิจารณาว่า ข้อความใดต่อไปนี้ใช้กับข้อความ X และ Y ได้ดีที่สุด"
+        "\nA: ถ้า X เป็นจริง Y จะต้องเป็นจริง\nB: X ขัดแย้งกับ Y\nC: เมื่อ X เป็นจริง Y อาจเป็นจริงหรือไม่ก็ได้"
+        "\nกรุณาตอบด้วยตัวอักษร A, B หรือ C ตัวเดียวเท่านั้น",
+        "output_noun": "คำตอบ",
+    },
+    "vi": {
+        "instructions": "Bạn sẽ được cho hai câu, X và Y.",
+        "input_suffix": "Xác định câu nào sau đây là câu phù hợp nhất cho câu X và Y."
+        "\nA: Nếu X đúng thì Y phải đúng.\nB: X mâu thuẫn với Y."
+        "\nC: Khi X đúng, Y có thể đúng hoặc không đúng.\nTrả lời với một chữ cái duy nhất A, B, hoặc C.",
+        "output_noun": "Câu trả lời",
+    },
+}
+
+
+@run_spec_function("xnli")
+def get_xnli_spec(language="vi") -> RunSpec:
+    name = f"xnli_{language}"
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions=XNLI_PROMPTS[language]["instructions"] + "\n" + XNLI_PROMPTS[language]["input_suffix"],
+        output_noun=XNLI_PROMPTS[language]["output_noun"],
+        stop_sequences=["\n"],
+        max_tokens=2,
+    )
+
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.bhasa_scenario.XNLIScenario",
+        args={
+            "language": language,
+        },
+    )
+
+    return RunSpec(
+        name=name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
+        groups=["bhasa_nlr", f"xnli_{language}"],
+    )
+
+
+# 1.3 Tamil: IndicXNLI
+@run_spec_function("indicxnli")
+def get_indicxnli_spec() -> RunSpec:
+    name = "indicxnli"
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions="உங்களுக்கு இரண்டு வாக்கியங்கள், X மற்றும் Y, தரப்படும்."
+        "\nபின்வரும் கூற்றுகளில் எது X மற்றும் Y வாக்கியங்களுடன் மிகப் பொருந்துகிறது எனக் கண்டறியவும்."
+        "\nA: X உண்மை என்றால் Y உம் உண்மையாக இருக்க வேண்டும்.\nB: X உம் Y உம் முரண்படுகின்றன."
+        "\nC: X உண்மையாக இருக்கும்போது Y உண்மையாக இருக்கலாம் அல்லது இல்லாமல் இருக்கலாம்."
+        "\nA அல்லது B அல்லது C என்ற ஒறே எழுத்தில் மட்டும் பதிலளிக்கவும்.",
+        output_noun="பதில்",
+        stop_sequences=["\n"],
+        max_tokens=2,
+    )
+
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bhasa_scenario.IndicXNLIScenario")
+
+    return RunSpec(
+        name=name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
+        groups=["bhasa_nlr", "indicxnli"],
+    )
+
+
+# 2. Causal Reasoning: XCOPA
+XCOPA_PROMPTS = {
+    "id": {
+        "input_noun": "Situasi",
+        "output_noun": "Jawaban",
+    },
+    "ta": {
+        "input_noun": "சூழ்நிலை",
+        "output_noun": "பதில்",
+    },
+    "th": {
+        "input_noun": "สถานการณ์",
+        "output_noun": "คำตอบ",
+    },
+    "vi": {
+        "input_noun": "Tình huống",
+        "output_noun": "Câu trả lời",
+    },
+}
+
+
+@run_spec_function("xcopa")
+def get_xcopa_spec(language="id") -> RunSpec:
+    name = f"xcopa_{language}"
+
+    adapter_spec = get_generation_adapter_spec(
+        input_noun=XCOPA_PROMPTS[language]["input_noun"],
+        output_noun=XCOPA_PROMPTS[language]["output_noun"],
+        stop_sequences=["\n"],
+        max_tokens=2,
+    )
+
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.bhasa_scenario.XCOPAScenario",
+        args={
+            "language": language,
+        },
+    )
+
+    return RunSpec(
+        name=name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
+        groups=["bhasa_nlr", f"xcopa_{language}"],
+    )
+
+
+# D. Linguistic Diagnostics (LINDSEA)
+# 1. Syntax
+# 2. Pragmatics
+
+# 1. Syntax: LINDSEA Minimal Pairs
+LINDSEA_OUTPUT_NOUNS = {"id": "Jawaban"}
+
+
+@run_spec_function("lindsea_syntax_minimal_pairs")
+def get_lindsea_syntax_minimal_pairs_spec(language: str = "id", method: str = "mcq") -> RunSpec:
+    name = f"lindsea_syntax_minimal_pairs_{language}"
+    if method == "mcq":
+        adapter_spec = get_generation_adapter_spec(output_noun=LINDSEA_OUTPUT_NOUNS[language], max_tokens=2)
+    else:
+        adapter_spec = get_multiple_choice_separate_adapter_spec(
+            method=ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
+            empty_input=True,
+        )
+
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.bhasa_scenario.LINDSEASyntaxMinimalPairsScenario",
+        args={
+            "method": method,
+            "language": language,
+        },
+    )
+
+    return RunSpec(
+        name=name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["bhasa_linguistic", f"lindsea_syntax_minimal_pairs_{language}"],
+    )
+
+
+# 2.1. Pragmatics: LINDSEA Pragmatic Reasoning (single sentence)
+@run_spec_function("lindsea_pragmatics_pragmatic_reasoning_single")
+def get_lindsea_pragmatics_pragmatic_reasoning_single_spec(language="id") -> RunSpec:
+    name = f"lindsea_pragmatics_pragmatic_reasoning_single_{language}"
+
+    adapter_spec = get_generation_adapter_spec(
+        output_noun=LINDSEA_OUTPUT_NOUNS[language],
+        stop_sequences=["\n"],
+        max_train_instances=0,
+        max_tokens=8,
+    )
+
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.bhasa_scenario.LINDSEAPragmaticsPragmaticReasoningSingleScenario",
+        args={
+            "language": language,
+        },
+    )
+
+    return RunSpec(
+        name=name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["bhasa_linguistic", f"lindsea_pragmatics_pragmatic_reasoning_single_{language}"],
+    )
+
+
+# 2.2. Pragmatics: LINDSEA Pragmatic Reasoning (sentence pair)
+@run_spec_function("lindsea_pragmatics_pragmatic_reasoning_pair")
+def get_lindsea_pragmatics_pragmatic_reasoning_pair_spec(language="id") -> RunSpec:
+    name = f"lindsea_pragmatics_pragmatic_reasoning_pair_{language}"
+
+    adapter_spec = get_generation_adapter_spec(
+        output_noun=LINDSEA_OUTPUT_NOUNS[language],
+        stop_sequences=["\n"],
+        max_train_instances=0,
+        max_tokens=8,
+    )
+
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.bhasa_scenario.LINDSEAPragmaticsPragmaticReasoningPairScenario",
+        args={
+            "language": language,
+        },
+    )
+
+    return RunSpec(
+        name=name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["bhasa_linguistic", f"lindsea_pragmatics_pragmatic_reasoning_pair_{language}"],
+    )
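For orientation (not part of the diff): each @run_spec_function("...") decorator above registers its function in HELM's run-spec registry under the quoted name, and keyword arguments such as language, source, and target are filled in from the run entry. Below is a minimal sketch of building one of the new BHASA run specs directly, assuming crfm-helm 0.5.3 is installed; the expected values follow from the code above, while the run-entry string in the comment assumes HELM's usual entry syntax.

from helm.benchmark.run_specs.bhasa_run_specs import get_flores_spec

# FLoRes-200 English -> Indonesian, registered above under the name "flores".
# A run entry would look roughly like "flores:source=en,target=id,model=...".
run_spec = get_flores_spec(source="en", target="id")

print(run_spec.name)    # flores_en_id
print(run_spec.groups)  # ['bhasa_nlg', 'flores_en_id']
print(run_spec.scenario_spec.class_name)  # helm.benchmark.scenarios.bhasa_scenario.FloresScenario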