crfm-helm 0.5.2__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (209)
  1. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/METADATA +81 -112
  2. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/RECORD +165 -155
  3. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
  5. helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
  6. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
  7. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
  8. helm/benchmark/adaptation/common_adapter_specs.py +2 -0
  9. helm/benchmark/annotation/anthropic_red_team_annotator.py +57 -0
  10. helm/benchmark/annotation/call_center_annotator.py +258 -0
  11. helm/benchmark/annotation/financebench_annotator.py +79 -0
  12. helm/benchmark/annotation/harm_bench_annotator.py +55 -0
  13. helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
  14. helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
  15. helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
  16. helm/benchmark/annotation/live_qa_annotator.py +37 -45
  17. helm/benchmark/annotation/medication_qa_annotator.py +36 -44
  18. helm/benchmark/annotation/model_as_judge.py +96 -0
  19. helm/benchmark/annotation/simple_safety_tests_annotator.py +50 -0
  20. helm/benchmark/annotation/xstest_annotator.py +100 -0
  21. helm/benchmark/metrics/annotation_metrics.py +108 -0
  22. helm/benchmark/metrics/bhasa_metrics.py +188 -0
  23. helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
  24. helm/benchmark/metrics/code_metrics_helper.py +11 -1
  25. helm/benchmark/metrics/safety_metrics.py +79 -0
  26. helm/benchmark/metrics/summac/model_summac.py +3 -3
  27. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
  28. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
  29. helm/benchmark/metrics/unitxt_metrics.py +17 -3
  30. helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
  31. helm/benchmark/metrics/vision_language/image_utils.py +1 -1
  32. helm/benchmark/model_metadata_registry.py +3 -3
  33. helm/benchmark/presentation/create_plots.py +1 -1
  34. helm/benchmark/presentation/schema.py +3 -0
  35. helm/benchmark/presentation/summarize.py +106 -256
  36. helm/benchmark/presentation/test_run_entry.py +1 -0
  37. helm/benchmark/presentation/test_summarize.py +145 -3
  38. helm/benchmark/run.py +15 -0
  39. helm/benchmark/run_expander.py +83 -30
  40. helm/benchmark/run_specs/bhasa_run_specs.py +652 -0
  41. helm/benchmark/run_specs/call_center_run_specs.py +152 -0
  42. helm/benchmark/run_specs/decodingtrust_run_specs.py +8 -8
  43. helm/benchmark/run_specs/experimental_run_specs.py +52 -0
  44. helm/benchmark/run_specs/finance_run_specs.py +82 -1
  45. helm/benchmark/run_specs/safety_run_specs.py +154 -0
  46. helm/benchmark/run_specs/vlm_run_specs.py +100 -24
  47. helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
  48. helm/benchmark/scenarios/banking77_scenario.py +51 -0
  49. helm/benchmark/scenarios/bhasa_scenario.py +1942 -0
  50. helm/benchmark/scenarios/call_center_scenario.py +84 -0
  51. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
  52. helm/benchmark/scenarios/ewok_scenario.py +116 -0
  53. helm/benchmark/scenarios/fin_qa_scenario.py +2 -0
  54. helm/benchmark/scenarios/financebench_scenario.py +53 -0
  55. helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
  56. helm/benchmark/scenarios/raft_scenario.py +1 -1
  57. helm/benchmark/scenarios/scenario.py +1 -1
  58. helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
  59. helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
  60. helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
  61. helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
  62. helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
  63. helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
  64. helm/benchmark/scenarios/test_math_scenario.py +2 -8
  65. helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
  66. helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
  67. helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
  68. helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
  69. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
  70. helm/benchmark/scenarios/vision_language/bingo_scenario.py +2 -2
  71. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
  72. helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
  73. helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
  74. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
  75. helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
  76. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
  77. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
  78. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -3
  79. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -1
  80. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
  81. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
  82. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
  83. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +41 -12
  84. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
  85. helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
  86. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
  87. helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
  88. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
  89. helm/benchmark/scenarios/vision_language/pairs_scenario.py +1 -1
  90. helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
  91. helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
  92. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
  93. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +2 -2
  94. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +6 -3
  95. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
  96. helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
  97. helm/benchmark/scenarios/xstest_scenario.py +35 -0
  98. helm/benchmark/server.py +1 -6
  99. helm/benchmark/static/schema_air_bench.yaml +750 -750
  100. helm/benchmark/static/schema_bhasa.yaml +709 -0
  101. helm/benchmark/static/schema_call_center.yaml +232 -0
  102. helm/benchmark/static/schema_cleva.yaml +768 -0
  103. helm/benchmark/static/schema_decodingtrust.yaml +444 -0
  104. helm/benchmark/static/schema_ewok.yaml +367 -0
  105. helm/benchmark/static/schema_finance.yaml +55 -9
  106. helm/benchmark/static/{schema_image2structure.yaml → schema_image2struct.yaml} +231 -90
  107. helm/benchmark/static/schema_legal.yaml +566 -0
  108. helm/benchmark/static/schema_safety.yaml +266 -0
  109. helm/benchmark/static/schema_tables.yaml +149 -8
  110. helm/benchmark/static/schema_thai.yaml +21 -0
  111. helm/benchmark/static/schema_vhelm.yaml +137 -101
  112. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  113. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  114. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  115. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  116. helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
  117. helm/benchmark/static_build/assets/index-3ee38b3d.js +10 -0
  118. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  119. helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
  120. helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
  121. helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
  122. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  123. helm/benchmark/static_build/index.html +2 -2
  124. helm/benchmark/window_services/test_openai_window_service.py +8 -8
  125. helm/benchmark/window_services/tokenizer_service.py +0 -5
  126. helm/clients/ai21_client.py +71 -1
  127. helm/clients/anthropic_client.py +7 -19
  128. helm/clients/huggingface_client.py +38 -37
  129. helm/clients/nvidia_nim_client.py +35 -0
  130. helm/clients/openai_client.py +18 -4
  131. helm/clients/palmyra_client.py +24 -0
  132. helm/clients/perspective_api_client.py +11 -6
  133. helm/clients/test_client.py +4 -6
  134. helm/clients/together_client.py +22 -0
  135. helm/clients/vision_language/open_flamingo_client.py +1 -2
  136. helm/clients/vision_language/palmyra_vision_client.py +28 -13
  137. helm/common/cache.py +8 -30
  138. helm/common/images_utils.py +6 -0
  139. helm/common/key_value_store.py +9 -9
  140. helm/common/mongo_key_value_store.py +5 -4
  141. helm/common/request.py +16 -0
  142. helm/common/test_cache.py +1 -48
  143. helm/common/tokenization_request.py +0 -9
  144. helm/config/model_deployments.yaml +444 -329
  145. helm/config/model_metadata.yaml +513 -111
  146. helm/config/tokenizer_configs.yaml +140 -11
  147. helm/proxy/example_queries.py +14 -21
  148. helm/proxy/server.py +0 -9
  149. helm/proxy/services/remote_service.py +0 -6
  150. helm/proxy/services/server_service.py +6 -20
  151. helm/proxy/services/service.py +0 -6
  152. helm/proxy/token_counters/test_auto_token_counter.py +2 -2
  153. helm/tokenizers/ai21_tokenizer.py +51 -59
  154. helm/tokenizers/cohere_tokenizer.py +0 -75
  155. helm/tokenizers/huggingface_tokenizer.py +0 -1
  156. helm/tokenizers/test_ai21_tokenizer.py +48 -0
  157. helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
  158. helm/benchmark/data_overlap/export_scenario_text.py +0 -119
  159. helm/benchmark/data_overlap/light_scenario.py +0 -60
  160. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  161. helm/benchmark/static/benchmarking.css +0 -156
  162. helm/benchmark/static/benchmarking.js +0 -1705
  163. helm/benchmark/static/config.js +0 -3
  164. helm/benchmark/static/general.js +0 -122
  165. helm/benchmark/static/images/crfm-logo.png +0 -0
  166. helm/benchmark/static/images/helm-logo-simple.png +0 -0
  167. helm/benchmark/static/images/helm-logo.png +0 -0
  168. helm/benchmark/static/images/language-model-helm.png +0 -0
  169. helm/benchmark/static/images/organizations/ai21.png +0 -0
  170. helm/benchmark/static/images/organizations/anthropic.png +0 -0
  171. helm/benchmark/static/images/organizations/bigscience.png +0 -0
  172. helm/benchmark/static/images/organizations/cohere.png +0 -0
  173. helm/benchmark/static/images/organizations/eleutherai.png +0 -0
  174. helm/benchmark/static/images/organizations/google.png +0 -0
  175. helm/benchmark/static/images/organizations/meta.png +0 -0
  176. helm/benchmark/static/images/organizations/microsoft.png +0 -0
  177. helm/benchmark/static/images/organizations/nvidia.png +0 -0
  178. helm/benchmark/static/images/organizations/openai.png +0 -0
  179. helm/benchmark/static/images/organizations/together.png +0 -0
  180. helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
  181. helm/benchmark/static/images/organizations/yandex.png +0 -0
  182. helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
  183. helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
  184. helm/benchmark/static/index.html +0 -68
  185. helm/benchmark/static/info-icon.png +0 -0
  186. helm/benchmark/static/json-urls.js +0 -69
  187. helm/benchmark/static/plot-captions.js +0 -27
  188. helm/benchmark/static/utils.js +0 -285
  189. helm/benchmark/static_build/assets/index-30dbceba.js +0 -10
  190. helm/benchmark/static_build/assets/index-66b02d40.css +0 -1
  191. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  192. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  193. helm/benchmark/window_services/ai21_window_service.py +0 -247
  194. helm/benchmark/window_services/cohere_window_service.py +0 -101
  195. helm/benchmark/window_services/test_ai21_window_service.py +0 -163
  196. helm/benchmark/window_services/test_cohere_window_service.py +0 -75
  197. helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
  198. helm/benchmark/window_services/test_ice_window_service.py +0 -327
  199. helm/tokenizers/ice_tokenizer.py +0 -30
  200. helm/tokenizers/test_ice_tokenizer.py +0 -57
  201. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/LICENSE +0 -0
  202. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/entry_points.txt +0 -0
  203. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/top_level.txt +0 -0
  204. /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
  205. /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
  206. /helm/benchmark/{data_overlap → scenarios/vision_language/image2struct}/__init__.py +0 -0
  207. /helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +0 -0
  208. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct/webpage}/__init__.py +0 -0
  209. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0
helm/benchmark/static/schema_bhasa.yaml (new file)

@@ -0,0 +1,709 @@
+ ---
+ ############################################################
+ metrics:
+   - name: num_perplexity_tokens
+     display_name: '# tokens'
+     description: Average number of tokens in the predicted output (for language modeling, the input too).
+   - name: num_bytes
+     display_name: '# bytes'
+     description: Average number of bytes in the predicted output (for language modeling, the input too).
+   - name: num_references
+     display_name: '# ref'
+     description: Number of references.
+   - name: num_train_trials
+     display_name: '# trials'
+     description: Number of trials, where in each trial we choose an independent, random set of training instances.
+   - name: estimated_num_tokens_cost
+     display_name: 'cost'
+     description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
+   - name: num_prompt_tokens
+     display_name: '# prompt tokens'
+     description: Number of tokens in the prompt.
+   - name: num_prompt_characters
+     display_name: '# prompt chars'
+     description: Number of characters in the prompt.
+   - name: num_completion_tokens
+     display_name: '# completion tokens'
+     description: Actual number of completion tokens (over all completions).
+   - name: num_output_tokens
+     display_name: '# output tokens'
+     description: Actual number of output tokens.
+   - name: max_num_output_tokens
+     display_name: 'Max output tokens'
+     description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
+   - name: num_requests
+     display_name: '# requests'
+     description: Number of distinct API requests.
+   - name: num_instances
+     display_name: '# eval'
+     description: Number of evaluation instances.
+   - name: num_train_instances
+     display_name: '# train'
+     description: Number of training instances (e.g., in-context examples).
+   - name: prompt_truncated
+     display_name: truncated
+     description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
+   - name: finish_reason_length
+     display_name: finish b/c length
+     description: Fraction of instances where the output was terminated because of the max tokens limit.
+   - name: finish_reason_stop
+     display_name: finish b/c stop
+     description: Fraction of instances where the output was terminated because of the stop sequences.
+   - name: finish_reason_endoftext
+     display_name: finish b/c endoftext
+     description: Fraction of instances where the output was terminated because the end of text token was generated.
+   - name: finish_reason_unknown
+     display_name: finish b/c unknown
+     description: Fraction of instances where the output was terminated for unknown reasons.
+   - name: num_completions
+     display_name: '# completions'
+     description: Number of completions.
+   - name: predicted_index
+     display_name: Predicted index
+     description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
+   - name: exact_match
+     display_name: Exact match
+     short_display_name: EM
+     description: Fraction of instances that the predicted output matches a correct reference exactly.
+     lower_is_better: false
+   - name: chr_f_plus_plus
+     display_name: ChrF++
+     description: Character n-gram F-score with word n-gram order (ChrF++) [(Popovic, 2015)](https://aclanthology.org/W15-3049/). Code can be found [here](https://github.com/mjpost/sacrebleu).
+   - name: squad_exact_match_score
+     display_name: SQuAD exact match
+     description: SQuAD exact match score [(Rajpurkar, 2016)](https://aclanthology.org/D16-1264). Code can be found [here](https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py)
+   - name: squad_f1_score
+     display_name: SQuAD macro-averaged F1 score
+     description: SQuAD macro-averaged F1 score [(Rajpurkar, 2016)](https://aclanthology.org/D16-1264). Code can be found [here](https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py)
+   - name: classification_macro_f1
+     display_name: Macro F1 score
+     description: Macro F1 score
+
+ ############################################################
+ perturbations: []
+
+ ############################################################
+ metric_groups:
+   - name: accuracy
+     display_name: Accuracy
+     metrics:
+       - name: ${main_name}
+         split: ${main_split}
+
+   - name: efficiency
+     display_name: Efficiency
+     metrics:
+       - name: inference_runtime
+         split: ${main_split}
+
+   - name: general_information
+     display_name: General information
+     metrics:
+       - name: num_instances
+         split: ${main_split}
+       - name: num_train_instances
+         split: ${main_split}
+       - name: prompt_truncated
+         split: ${main_split}
+       - name: num_prompt_tokens
+         split: ${main_split}
+       - name: num_output_tokens
+         split: ${main_split}
+
+ ############################################################
+
+ run_groups:
+   - name: bhasa_nlu
+     display_name: BHASA natural language understanding (NLU)
+     description: BHASA natural language understanding (NLU) scenarios
+     category: BHASA scenarios
+     subgroups:
+       - tydiqa
+       - xquad_vi
+       - xquad_th
+       - indicqa
+       - nusax
+       - uitvsfc
+       - wisesight
+       - indicsentiment
+       - mlhsd
+       - vihsd
+       - thaitoxicitytweets
+
+   - name: bhasa_nlg
+     display_name: BHASA natural language generation (NLG)
+     description: BHASA natural language generation (NLG) scenarios
+     category: BHASA scenarios
+     subgroups:
+       - flores_id_en
+       - flores_vi_en
+       - flores_th_en
+       - flores_ta_en
+       - flores_en_id
+       - flores_en_vi
+       - flores_en_th
+       - flores_en_ta
+
+   - name: bhasa_nlr
+     display_name: BHASA natural language reasoning (NLR)
+     description: BHASA natural language reasoning (NLR) scenarios
+     category: BHASA scenarios
+     subgroups:
+       - indonli
+       - xnli_vi
+       - xnli_th
+       - indicxnli
+       - xcopa_id
+       - xcopa_vi
+       - xcopa_th
+       - xcopa_ta
+
+   - name: bhasa_lindsea
+     display_name: BHASA Linguistic Diagnostics for Southeast Asian Languages (LINDSEA)
+     description: BHASA Linguistic Diagnostics for Southeast Asian Languages (LINDSEA) scenarios
+     category: BHASA scenarios
+     subgroups:
+       - lindsea_syntax_minimal_pairs_id
+       - lindsea_pragmatics_presuppositions_id
+       - lindsea_pragmatics_scalar_implicatures_id
+
+   - name: tydiqa
+     display_name: TyDiQA
+     description: >
+       TyDiQA [(Clark, 2020)](https://aclanthology.org/2020.tacl-1.30) is an open-book question answering dataset for 11 typologically-diverse languages. The questions are written by people who want to know the answer, but do not know the answer yet,
+       and the data is collected directly in each language without the use of translation.
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: squad_f1_score
+       main_split: test
+     taxonomy:
+       task: question answering
+       what: questions by human annotators about Wikipedia articles
+       who: "human annotators"
+       when: "?"
+       language: Indonesian
+
+   - name: xquad_vi
+     display_name: XQuAD (Vietnamese)
+     description: >
+       XQuAD [(Artetxe, 2019)](https://arxiv.org/abs/1910.11856) is an open-book question answering dataset that is parallel across 10 languages. The dataset consists of a subset of 240 paragraphs and 1190 question-answer pairs from the development set of SQuAD v1.1 (Rajpurkar et al., 2016) together with their professional translations.
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: squad_f1_score
+       main_split: test
+     taxonomy:
+       task: question answering
+       what: questions by crowdworkers about Wikipedia articles translated from English to Thai and Vietnamese
+       who: "?"
+       when: "?"
+       language: Vietnamese
+
+   - name: xquad_th
+     display_name: XQuAD (Thai)
+     description: >
+       XQuAD [(Artetxe, 2019)](https://arxiv.org/abs/1910.11856) is an open-book question answering dataset that is parallel across 10 languages. The dataset consists of a subset of 240 paragraphs and 1190 question-answer pairs from the development set of SQuAD v1.1 (Rajpurkar et al., 2016) together with their professional translations.
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: squad_f1_score
+       main_split: test
+     taxonomy:
+       task: question answering
+       what: questions by crowdworkers about Wikipedia articles translated from English to Thai and Vietnamese
+       who: "?"
+       when: "?"
+       language: Thai
+
+   - name: indicqa
+     display_name: IndicQA
+     description: >
+       IndicQA [(Doddapaneni, 2023)](https://aclanthology.org/2023.acl-long.693) is an open-book question answering dataset for 11 Indic languages. Answers to questions are to be extracted from the text provided. The data is taken from Wikipedia articles across various domains and questions and answers were manually created by native speakers.
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: squad_f1_score
+       main_split: test
+     taxonomy:
+       task: question answering
+       what: questions about Wikipedia articles translated by native speakers from English to Tamil
+       who: "?"
+       when: "?"
+       language: Tamil
+
+   - name: nusax
+     display_name: NusaX
+     description: >
+       NusaX [(Winata, 2023)](https://aclanthology.org/2023.eacl-main.57) is an Indonesian sentiment analysis dataset. The data consists of comments and reviews from various online platforms.
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: classification_macro_f1
+       main_split: test
+     taxonomy:
+       task: sentiment analysis
+       what: online comments and reviews
+       who: "internet users"
+       when: "?"
+       language: Indonesian
+
+   - name: uitvsfc
+     display_name: UIT-VSFC
+     description: >
+       UIT-VSFC [(Nguyen, 2018)](https://ieeexplore.ieee.org/document/8573337) is a Vietnamese sentiment analysis dataset. The data consists of student feedback obtained from end-of-semester surveys at a Vietnamese university.
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: classification_macro_f1
+       main_split: test
+     taxonomy:
+       task: sentiment analysis
+       what: university student end-of-semester survey responses
+       who: "university students"
+       when: "?"
+       language: Vietnamese
+
+   - name: wisesight
+     display_name: Wisesight
+     description: >
+       Wisesight [(Suriyawongkul, 2019)](https://doi.org/10.5281/zenodo.3457447) is a Thai sentiment analysis scenario. The data consists of social media messages regarding consumer products and services.
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: classification_macro_f1
+       main_split: test
+     taxonomy:
+       task: sentiment analysis
+       what: social media messages regarding consumer products and services
+       who: "social media users"
+       when: "?"
+       language: Thai
+
+   - name: indicsentiment
+     display_name: IndicSentiment
+     description: >
+       IndicSentiment is a Tamil sentiment analysis dataset that comes from IndicXTREME [(Doddapaneni, 2022)](https://aclanthology.org/2023.acl-long.693/), and consists of product reviews that were written by annotators. Labels are positive or negative.
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: classification_macro_f1
+       main_split: test
+     taxonomy:
+       task: sentiment analysis
+       what: product reviews
+       who: "human annotators"
+       when: "?"
+       language: Tamil
+
+   - name: mlhsd
+     display_name: MLHSD
+     description: >
+       MLHSD [(Ibrohim, 2019)](https://aclanthology.org/W19-3506) is an Indonesian toxicity detection dataset obtained from tweets on Twitter.
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: classification_macro_f1
+       main_split: test
+     taxonomy:
+       task: toxicity detection/classification
+       what: tweets
+       who: "Twitter users"
+       when: "?"
+       language: Indonesian
+
+   - name: vihsd
+     display_name: ViHSD
+     description: >
+       ViHSD [(Luu, 2021)](https://link.springer.com/chapter/10.1007/978-3-030-79457-6_35) is a Vietnamese toxicity detection dataset obtained from comments on Facebook, YouTube, Instagram, and TikTok.
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: classification_macro_f1
+       main_split: test
+     taxonomy:
+       task: toxicity detection/classification
+       what: social media comments
+       who: "Social media users"
+       when: "?"
+       language: Vietnamese
+
+   - name: thaitoxicitytweets
+     display_name: Thai Toxicity Tweets
+     description: >
+       Thai Toxicity Tweets [(Sirihattasak, 2018)](http://www.lrec-conf.org/workshops/lrec2018/W32/pdf/1_W32.pdf) is a Thai toxicity detection dataset obtained from tweets on Twitter.
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: classification_macro_f1
+       main_split: test
+     taxonomy:
+       task: toxicity detection/classification
+       what: tweets
+       who: "Twitter users"
+       when: ""
+       language: Thai
+
+   - name: flores_en_id
+     display_name: Flores
+     description: >
+       Flores [(NLLB Team, 2022)](https://research.facebook.com/publications/no-language-left-behind/) was created with professional human translators who translate the FLORES source dataset into the target languages and a separate group of independent translation reviewers who perform quality assessments of the human translations and provide translation feedback to the translators.
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: chr_f_plus_plus
+       main_split: test
+     taxonomy:
+       task: machine translation
+       what: translations from professional human translators
+       who: "professional human translators"
+       when: "?"
+       language: English, Indonesian, Tamil, Thai, Vietnamese
+
+   - name: flores_en_vi
+     display_name: Flores
+     description: >
+       Flores [(NLLB Team, 2022)](https://research.facebook.com/publications/no-language-left-behind/) was created with professional human translators who translate the FLORES source dataset into the target languages and a separate group of independent translation reviewers who perform quality assessments of the human translations and provide translation feedback to the translators.
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: chr_f_plus_plus
+       main_split: test
+     taxonomy:
+       task: machine translation
+       what: translations from professional human translators
+       who: "professional human translators"
+       when: "?"
+       language: English, Indonesian, Tamil, Thai, Vietnamese
+
+   - name: flores_en_th
+     display_name: Flores
+     description: >
+       Flores [(NLLB Team, 2022)](https://research.facebook.com/publications/no-language-left-behind/) was created with professional human translators who translate the FLORES source dataset into the target languages and a separate group of independent translation reviewers who perform quality assessments of the human translations and provide translation feedback to the translators.
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: chr_f_plus_plus
+       main_split: test
+     taxonomy:
+       task: machine translation
+       what: translations from professional human translators
+       who: "professional human translators"
+       when: "?"
+       language: English, Indonesian, Tamil, Thai, Vietnamese
+
+   - name: flores_en_ta
+     display_name: Flores
+     description: >
+       Flores [(NLLB Team, 2022)](https://research.facebook.com/publications/no-language-left-behind/) was created with professional human translators who translate the FLORES source dataset into the target languages and a separate group of independent translation reviewers who perform quality assessments of the human translations and provide translation feedback to the translators.
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: chr_f_plus_plus
+       main_split: test
+     taxonomy:
+       task: machine translation
+       what: translations from professional human translators
+       who: "professional human translators"
+       when: "?"
+       language: English, Indonesian, Tamil, Thai, Vietnamese
+
+   - name: flores_id_en
+     display_name: Flores
+     description: >
+       Flores [(NLLB Team, 2022)](https://research.facebook.com/publications/no-language-left-behind/) was created with professional human translators who translate the FLORES source dataset into the target languages and a separate group of independent translation reviewers who perform quality assessments of the human translations and provide translation feedback to the translators.
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: chr_f_plus_plus
+       main_split: test
+     taxonomy:
+       task: machine translation
+       what: translations from professional human translators
+       who: "professional human translators"
+       when: "?"
+       language: English, Indonesian, Tamil, Thai, Vietnamese
+
+   - name: flores_vi_en
+     display_name: Flores
+     description: >
+       Flores [(NLLB Team, 2022)](https://research.facebook.com/publications/no-language-left-behind/) was created with professional human translators who translate the FLORES source dataset into the target languages and a separate group of independent translation reviewers who perform quality assessments of the human translations and provide translation feedback to the translators.
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: chr_f_plus_plus
+       main_split: test
+     taxonomy:
+       task: machine translation
+       what: translations from professional human translators
+       who: "professional human translators"
+       when: "?"
+       language: English, Indonesian, Tamil, Thai, Vietnamese
+
+   - name: flores_th_en
+     display_name: Flores
+     description: >
+       Flores [(NLLB Team, 2022)](https://research.facebook.com/publications/no-language-left-behind/) was created with professional human translators who translate the FLORES source dataset into the target languages and a separate group of independent translation reviewers who perform quality assessments of the human translations and provide translation feedback to the translators.
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: chr_f_plus_plus
+       main_split: test
+     taxonomy:
+       task: machine translation
+       what: translations from professional human translators
+       who: "professional human translators"
+       when: "?"
+       language: English, Indonesian, Tamil, Thai, Vietnamese
+
+   - name: flores_ta_en
+     display_name: Flores
+     description: >
+       Flores [(NLLB Team, 2022)](https://research.facebook.com/publications/no-language-left-behind/) was created with professional human translators who translate the FLORES source dataset into the target languages and a separate group of independent translation reviewers who perform quality assessments of the human translations and provide translation feedback to the translators.
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: chr_f_plus_plus
+       main_split: test
+     taxonomy:
+       task: machine translation
+       what: translations from professional human translators
+       who: "professional human translators"
+       when: "?"
+       language: English, Indonesian, Tamil, Thai, Vietnamese
+
+   - name: indonli
+     display_name: IndoNLI
+     description: >
+       IndoNLI [(Mahendra, 2021)](https://aclanthology.org/2021.emnlp-main.821) is a natural language inference dataset obtained from Wikipedia, news, and web articles that incorporates various linguistic phenomena such as numerical reasoning, structural changes, idioms, or temporal and spatial reasoning.
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: exact_match
+       main_split: test
+     taxonomy:
+       task: natural language inference
+       what: Wikipedia, news, and web articles
+       who: "?"
+       when: "?"
+       language: Indonesian
+
+   - name: xnli_vi
+     display_name: XNLI (Vietnamese)
+     description: >
+       XNLI [(Conneau, 2018)](https://aclanthology.org/D18-1269) is a natural language inference dataset obtained from crowdsourced NLI data that was then professionally translated into 14 other languages.
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: exact_match
+       main_split: test
+     taxonomy:
+       task: natural language inference
+       what: crowdsourced NLI data professionally translated
+       who: "?"
+       when: "?"
+       language: Vietnamese
+
+   - name: xnli_th
+     display_name: XNLI (Thai)
+     description: >
+       XNLI [(Conneau, 2018)](https://aclanthology.org/D18-1269) is a natural language inference dataset obtained from crowdsourced NLI data that was then professionally translated into 14 other languages.
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: exact_match
+       main_split: test
+     taxonomy:
+       task: natural language inference
+       what: crowdsourced NLI data professionally translated
+       who: "?"
+       when: "?"
+       language: Thai
+
+   - name: indicxnli
+     display_name: IndicXNLI
+     description: >
+       IndicXNLI is a Tamil natural language inference dataset that comes from IndicXTREME [(Doddapaneni, 2022)](https://aclanthology.org/2023.acl-long.693/), which was automatically translated from XNLI into 11 Indic languages.
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: exact_match
+       main_split: test
+     taxonomy:
+       task: natural language inference
+       what: crowdsourced NLI data automatically translated into Tamil
+       who: "?"
+       when: "?"
+       language: Tamil
+
+   - name: xcopa_id
+     display_name: XCOPA (Indonesian)
+     description: >
+       XCOPA [(Ponti, 2020)](https://ducdauge.github.io/files/xcopa.pdf) is a causal reasoning dataset created by translating and reannotating the English COPA, which contains questions that directly assess commonsense causal reasoning.
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: exact_match
+       main_split: test
+     taxonomy:
+       task: causal reasoning
+       what: commonsense causal reasoning questions translated into Indonesian
+       who: "?"
+       when: "?"
+       language: Indonesian
+
+   - name: xcopa_vi
+     display_name: XCOPA (Vietnamese)
+     description: >
+       XCOPA [(Ponti, 2020)](https://ducdauge.github.io/files/xcopa.pdf) is a causal reasoning dataset created by translating and reannotating the English COPA, which contains questions that directly assess commonsense causal reasoning.
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: exact_match
+       main_split: test
+     taxonomy:
+       task: causal reasoning
+       what: commonsense causal reasoning questions translated into Vietnamese
+       who: "?"
+       when: "?"
+       language: Vietnamese
+
+   - name: xcopa_th
+     display_name: XCOPA (Thai)
+     description: >
+       XCOPA [(Ponti, 2020)](https://ducdauge.github.io/files/xcopa.pdf) is a causal reasoning dataset created by translating and reannotating the English COPA, which contains questions that directly assess commonsense causal reasoning.
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: exact_match
+       main_split: test
+     taxonomy:
+       task: causal reasoning
+       what: commonsense causal reasoning questions translated into Thai
+       who: "?"
+       when: "?"
+       language: Thai
+
+   - name: xcopa_ta
+     display_name: XCOPA (Tamil)
+     description: >
+       XCOPA [(Ponti, 2020)](https://ducdauge.github.io/files/xcopa.pdf) is a causal reasoning dataset created by translating and reannotating the English COPA, which contains questions that directly assess commonsense causal reasoning.
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: exact_match
+       main_split: test
+     taxonomy:
+       task: causal reasoning
+       what: commonsense causal reasoning questions translated into Tamil
+       who: "?"
+       when: "?"
+       language: Tamil
+
+   - name: lindsea_syntax_minimal_pairs_id
+     display_name: LINDSEA Syntax Minimal Pairs
+     description: >
+       LINDSEA minimal pairs is a linguistic diagnostics dataset for syntax from BHASA [(Leong, 2023)](https://arxiv.org/abs/2309.06085), involving pairs of sentences that differ minimally from each other and contrast in grammatical acceptability.
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: exact_match
+       main_split: test
+     taxonomy:
+       task: minimal pairs
+       what: sentence pairs with minimal differences and contrasting grammatical acceptability
+       who: "?"
+       when: "?"
+       language: Indonesian
+
+   - name: lindsea_pragmatics_presuppositions_id
+     display_name: LINDSEA Pragmatics Presuppositions
+     description: >
+       LINDSEA Pragmatics Presuppositions is a linguistic diagnostics dataset for pragmatics from BHASA [(Leong, 2023)](https://arxiv.org/abs/2309.06085), involving two formats: single sentences and sentence pairs. For single-sentence questions, the system under test needs to determine whether the sentence is true or false. For sentence-pair questions, the system under test needs to determine whether a conclusion can be drawn from the other sentence.
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: exact_match
+       main_split: test
+     taxonomy:
+       task: pragmatic reasoning
+       what: presuppositions
+       who: "?"
+       when: "?"
+       language: Indonesian
+
+   - name: lindsea_pragmatics_scalar_implicatures_id
+     display_name: LINDSEA Pragmatics Scalar Implicatures
+     description: >
+       LINDSEA Pragmatics Scalar Implicatures is a linguistic diagnostics dataset for pragmatics from BHASA [(Leong, 2023)](https://arxiv.org/abs/2309.06085), involving two formats: single sentences and sentence pairs. For single-sentence questions, the system under test needs to determine whether the sentence is true or false. For sentence-pair questions, the system under test needs to determine whether a conclusion can be drawn from the other sentence.
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: exact_match
+       main_split: test
+     taxonomy:
+       task: pragmatic reasoning
+       what: scalar implicatures
+       who: "?"
+       when: "?"
+       language: Indonesian
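
Note on the schema above: the metric_groups entries parameterize their metrics with ${main_name} and ${main_split}, and each run group fills these in through its environment block (for example, tydiqa sets main_name: squad_f1_score and main_split: test). The snippet below is a minimal sketch of that substitution using PyYAML and Python's string.Template; the schema path is taken from this diff, and the helper itself is an illustration of how the placeholders resolve, not HELM's own summarization code.

# Minimal sketch: resolve the ${main_name}/${main_split} placeholders in the new schema.
# Illustrative only; this is not HELM's own summarization code.
from string import Template

import yaml

with open("helm/benchmark/static/schema_bhasa.yaml") as f:  # path from this diff
    schema = yaml.safe_load(f)

metric_groups = {g["name"]: g for g in schema["metric_groups"]}
run_groups = {g["name"]: g for g in schema["run_groups"]}

def resolve_metrics(run_group_name: str):
    """Return the concrete (metric name, split) pairs for one run group."""
    run_group = run_groups[run_group_name]
    # e.g. {"main_name": "squad_f1_score", "main_split": "test"} for tydiqa
    env = run_group.get("environment", {})
    resolved = []
    for group_name in run_group.get("metric_groups", []):
        for metric in metric_groups[group_name]["metrics"]:
            name = Template(metric["name"]).safe_substitute(env)
            split = Template(metric.get("split", "")).safe_substitute(env)
            resolved.append((name, split))
    return resolved

print(resolve_metrics("tydiqa"))

Under the schema above, the tydiqa group's "accuracy" column resolves to squad_f1_score on the test split and its "efficiency" column to inference_runtime on the test split, with the general_information metrics left unchanged.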