crfm-helm 0.5.2__py3-none-any.whl → 0.5.3__py3-none-any.whl
This diff compares the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic.
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/METADATA +29 -55
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/RECORD +146 -134
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
- helm/benchmark/adaptation/common_adapter_specs.py +2 -0
- helm/benchmark/annotation/anthropic_red_team_annotator.py +70 -0
- helm/benchmark/annotation/call_center_annotator.py +247 -0
- helm/benchmark/annotation/financebench_annotator.py +79 -0
- helm/benchmark/annotation/harm_bench_annotator.py +68 -0
- helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
- helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
- helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
- helm/benchmark/annotation/live_qa_annotator.py +32 -45
- helm/benchmark/annotation/medication_qa_annotator.py +31 -44
- helm/benchmark/annotation/model_as_judge.py +45 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +64 -0
- helm/benchmark/annotation/xstest_annotator.py +110 -0
- helm/benchmark/metrics/annotation_metrics.py +108 -0
- helm/benchmark/metrics/bhasa_metrics.py +188 -0
- helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
- helm/benchmark/metrics/code_metrics_helper.py +11 -1
- helm/benchmark/metrics/safety_metrics.py +57 -0
- helm/benchmark/metrics/summac/model_summac.py +3 -3
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
- helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
- helm/benchmark/metrics/vision_language/image_utils.py +1 -1
- helm/benchmark/model_metadata_registry.py +3 -3
- helm/benchmark/presentation/test_run_entry.py +1 -0
- helm/benchmark/run.py +15 -0
- helm/benchmark/run_expander.py +56 -30
- helm/benchmark/run_specs/bhasa_run_specs.py +638 -0
- helm/benchmark/run_specs/call_center_run_specs.py +152 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +8 -8
- helm/benchmark/run_specs/experimental_run_specs.py +52 -0
- helm/benchmark/run_specs/finance_run_specs.py +78 -1
- helm/benchmark/run_specs/safety_run_specs.py +154 -0
- helm/benchmark/run_specs/vlm_run_specs.py +92 -21
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
- helm/benchmark/scenarios/banking77_scenario.py +51 -0
- helm/benchmark/scenarios/bhasa_scenario.py +1798 -0
- helm/benchmark/scenarios/call_center_scenario.py +84 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
- helm/benchmark/scenarios/ewok_scenario.py +116 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +2 -0
- helm/benchmark/scenarios/financebench_scenario.py +53 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
- helm/benchmark/scenarios/scenario.py +1 -1
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
- helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
- helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
- helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
- helm/benchmark/scenarios/test_math_scenario.py +2 -8
- helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
- helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
- helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +41 -12
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +6 -3
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
- helm/benchmark/scenarios/xstest_scenario.py +35 -0
- helm/benchmark/server.py +1 -6
- helm/benchmark/static/schema_air_bench.yaml +750 -750
- helm/benchmark/static/schema_bhasa.yaml +709 -0
- helm/benchmark/static/schema_call_center.yaml +232 -0
- helm/benchmark/static/schema_cleva.yaml +768 -0
- helm/benchmark/static/schema_decodingtrust.yaml +444 -0
- helm/benchmark/static/schema_ewok.yaml +367 -0
- helm/benchmark/static/schema_finance.yaml +55 -9
- helm/benchmark/static/{schema_image2structure.yaml → schema_image2struct.yaml} +231 -90
- helm/benchmark/static/schema_safety.yaml +247 -0
- helm/benchmark/static/schema_tables.yaml +124 -7
- helm/benchmark/static/schema_thai.yaml +21 -0
- helm/benchmark/static/schema_vhelm.yaml +96 -91
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
- helm/benchmark/static_build/assets/index-58f97dcd.js +10 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/benchmark/window_services/test_openai_window_service.py +8 -8
- helm/clients/ai21_client.py +71 -1
- helm/clients/anthropic_client.py +7 -19
- helm/clients/huggingface_client.py +38 -37
- helm/clients/nvidia_nim_client.py +35 -0
- helm/clients/openai_client.py +2 -3
- helm/clients/palmyra_client.py +25 -0
- helm/clients/perspective_api_client.py +11 -6
- helm/clients/test_client.py +4 -6
- helm/clients/vision_language/open_flamingo_client.py +1 -2
- helm/clients/vision_language/palmyra_vision_client.py +28 -13
- helm/common/images_utils.py +6 -0
- helm/common/mongo_key_value_store.py +2 -1
- helm/common/request.py +16 -0
- helm/config/model_deployments.yaml +315 -332
- helm/config/model_metadata.yaml +384 -110
- helm/config/tokenizer_configs.yaml +116 -11
- helm/proxy/example_queries.py +14 -21
- helm/proxy/services/server_service.py +1 -2
- helm/proxy/token_counters/test_auto_token_counter.py +2 -2
- helm/tokenizers/ai21_tokenizer.py +51 -59
- helm/tokenizers/cohere_tokenizer.py +0 -75
- helm/tokenizers/huggingface_tokenizer.py +0 -1
- helm/tokenizers/test_ai21_tokenizer.py +48 -0
- helm/benchmark/static/benchmarking.css +0 -156
- helm/benchmark/static/benchmarking.js +0 -1705
- helm/benchmark/static/config.js +0 -3
- helm/benchmark/static/general.js +0 -122
- helm/benchmark/static/images/crfm-logo.png +0 -0
- helm/benchmark/static/images/helm-logo-simple.png +0 -0
- helm/benchmark/static/images/helm-logo.png +0 -0
- helm/benchmark/static/images/language-model-helm.png +0 -0
- helm/benchmark/static/images/organizations/ai21.png +0 -0
- helm/benchmark/static/images/organizations/anthropic.png +0 -0
- helm/benchmark/static/images/organizations/bigscience.png +0 -0
- helm/benchmark/static/images/organizations/cohere.png +0 -0
- helm/benchmark/static/images/organizations/eleutherai.png +0 -0
- helm/benchmark/static/images/organizations/google.png +0 -0
- helm/benchmark/static/images/organizations/meta.png +0 -0
- helm/benchmark/static/images/organizations/microsoft.png +0 -0
- helm/benchmark/static/images/organizations/nvidia.png +0 -0
- helm/benchmark/static/images/organizations/openai.png +0 -0
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
- helm/benchmark/static/images/organizations/yandex.png +0 -0
- helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
- helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
- helm/benchmark/static/index.html +0 -68
- helm/benchmark/static/info-icon.png +0 -0
- helm/benchmark/static/json-urls.js +0 -69
- helm/benchmark/static/plot-captions.js +0 -27
- helm/benchmark/static/utils.js +0 -285
- helm/benchmark/static_build/assets/index-30dbceba.js +0 -10
- helm/benchmark/static_build/assets/index-66b02d40.css +0 -1
- helm/benchmark/window_services/ai21_window_service.py +0 -247
- helm/benchmark/window_services/cohere_window_service.py +0 -101
- helm/benchmark/window_services/test_ai21_window_service.py +0 -163
- helm/benchmark/window_services/test_cohere_window_service.py +0 -75
- helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
- helm/benchmark/window_services/test_ice_window_service.py +0 -327
- helm/tokenizers/ice_tokenizer.py +0 -30
- helm/tokenizers/test_ice_tokenizer.py +0 -57
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/LICENSE +0 -0
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/top_level.txt +0 -0
- /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
- /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/__init__.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/__init__.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0
--- a/helm/benchmark/static/schema_tables.yaml
+++ b/helm/benchmark/static/schema_tables.yaml
@@ -99,47 +99,94 @@ metrics:
     display_name: METEOR
     short_display_name: METEOR
     description: METEOR
+    lower_is_better: false
   - name: f1
-    display_name: F1
-    short_display_name: F1
-    description: F1
+    display_name: BERTScore F1
+    short_display_name: BERTScore F1
+    description: BERTScore F1
+    lower_is_better: false
   - name: precision
     display_name: Precision
     short_display_name: Precision
     description: Precision
+    lower_is_better: false
   - name: recall
     display_name: Recall
     short_display_name: Recall
     description: Recall
+    lower_is_better: false
   - name: rouge1
     display_name: ROUGE-1
     short_display_name: ROUGE-1
     description: ROUGE-1
+    lower_is_better: false
   - name: rouge2
     display_name: ROUGE-2
     short_display_name: ROUGE-2
     description: ROUGE-2
+    lower_is_better: false
   - name: rougeL
     display_name: ROUGE-L
     short_display_name: ROUGE-L
     description: ROUGE-L
+    lower_is_better: false
   - name: rougeLsum
     display_name: ROUGE-Lsum
     short_display_name: ROUGE-Lsum
     description: ROUGE-Lsum
+    lower_is_better: false
   - name: bleu
     display_name: BLEU
     short_display_name: BLEU
     description: BLEU
+    lower_is_better: false
+  - name: accuracy
+    display_name: Accuracy
+    short_display_name: Accuracy
+    description: Accuracy
+    lower_is_better: false
+  - name: f1_macro
+    display_name: Macro F1
+    short_display_name: Macro F1
+    description: Macro F1
+    lower_is_better: false
+  - name: f1_micro
+    display_name: Micro F1
+    short_display_name: Micro F1
+    description: Micro F1
+    lower_is_better: false
+  - name: unsorted_list_exact_match
+    display_name: Unsorted List Exact Match
+    short_display_name: Exact Match
+    description: Unsorted List Exact Match
+    lower_is_better: false
+
+  # FinQA Accuracy
+  - name: program_accuracy
+    display_name: Program Accuracy
+    short_display_name: Program Accuracy
+    description: Program Accuracy
+    lower_is_better: false
+  - name: execution_accuracy
+    display_name: Execution Accuracy
+    short_display_name: Execution Accuracy
+    description: Execution Accuracy
+    lower_is_better: false

 perturbations: []

 metric_groups:
-  - name:
-    display_name:
+  - name: main_metrics
+    display_name: Main Metrics
+    metrics:
+      - name: ${main_name}
+        split: __all__
+
+  - name: generation_metrics
+    display_name: Other Generation Metrics
     hide_win_rates: true
     metrics:
-      - name:
+      - name: f1
         split: __all__
       - name: rouge1
         split: __all__
@@ -152,6 +199,17 @@ metric_groups:
       - name: bleu
         split: __all__

+  - name: classification_metrics
+    display_name: Classification Metrics
+    hide_win_rates: true
+    metrics:
+      - name: accuracy
+        split: __all__
+      - name: f1_macro
+        split: __all__
+      - name: f1_micro
+        split: __all__
+
   - name: efficiency
     display_name: Efficiency
     metrics:
@@ -180,13 +238,17 @@ run_groups:
     category: All Scenarios
     subgroups:
       - unitxt_cards.numeric_nlg
+      - unitxt_cards.tab_fact
+      - unitxt_cards.wikitq
+      - unitxt_cards.fin_qa

   - name: unitxt_cards.numeric_nlg
     display_name: NumericNLG
     short_display_name: NumericNLG
     description: "NumericNLG is a dataset for numerical table-to-text generation using pairs of a table and a paragraph of a table description with richer inference from scientific papers."
     metric_groups:
-      -
+      - main_metrics
+      - generation_metrics
       - efficiency
       - general_information
     environment:
@@ -198,3 +260,58 @@ run_groups:
       who: "?"
       when: "?"
       language: English
+
+  - name: unitxt_cards.tab_fact
+    display_name: TabFact
+    short_display_name: TabFact
+    description: "tab_fact is a large-scale dataset for the task of fact-checking on tables."
+    metric_groups:
+      - main_metrics
+      - classification_metrics
+      - efficiency
+      - general_information
+    environment:
+      main_name: accuracy
+      main_split: test
+    taxonomy:
+      task: "?"
+      what: "?"
+      who: "?"
+      when: "?"
+      language: English
+
+  - name: unitxt_cards.wikitq
+    display_name: WikiTableQuestions
+    short_display_name: WikiTableQuestions
+    description: "This WikiTableQuestions dataset is a large-scale dataset for the task of question answering on semi-structured tables."
+    metric_groups:
+      - main_metrics
+      - classification_metrics
+      - efficiency
+      - general_information
+    environment:
+      main_name: unsorted_list_exact_match
+      main_split: test
+    taxonomy:
+      task: "?"
+      what: "?"
+      who: "?"
+      when: "?"
+      language: English
+
+  - name: unitxt_cards.fin_qa
+    display_name: FinQA
+    description: The FinQA benchmark for numeric reasoning over financial data, with question answering pairs written by financial experts over financial reports [(Chen et al., 2021)](https://arxiv.org/abs/2109.00122/).
+    metric_groups:
+      - main_metrics
+      - efficiency
+      - general_information
+    environment:
+      main_name: program_accuracy
+      main_split: test
+    taxonomy:
+      task: question answering with numeric reasoning
+      what: financial reports
+      who: financial experts
+      when: 1999 to 2019
+      language: English
|
|
|
78
78
|
metric_groups:
|
|
79
79
|
- name: accuracy
|
|
80
80
|
display_name: Accuracy
|
|
81
|
+
hide_win_rates: true
|
|
81
82
|
metrics:
|
|
82
83
|
- name: ${main_name}
|
|
83
84
|
split: ${main_split}
|
|
@@ -111,12 +112,32 @@ run_groups:
|
|
|
111
112
|
description: Thai-language scenarios
|
|
112
113
|
category: All scenarios
|
|
113
114
|
subgroups:
|
|
115
|
+
- thai_exam
|
|
114
116
|
- thai_exam_onet
|
|
115
117
|
- thai_exam_ic
|
|
116
118
|
- thai_exam_tgat
|
|
117
119
|
- thai_exam_tpat1
|
|
118
120
|
- thai_exam_a_level
|
|
119
121
|
|
|
122
|
+
|
|
123
|
+
- name: thai_exam
|
|
124
|
+
display_name: ThaiExam
|
|
125
|
+
description: >
|
|
126
|
+
Macro-averaged accuracy on all ThaiExam examinations.
|
|
127
|
+
metric_groups:
|
|
128
|
+
- accuracy
|
|
129
|
+
- efficiency
|
|
130
|
+
- general_information
|
|
131
|
+
environment:
|
|
132
|
+
main_name: exact_match
|
|
133
|
+
main_split: test
|
|
134
|
+
taxonomy:
|
|
135
|
+
task: question answering
|
|
136
|
+
what: n/a
|
|
137
|
+
who: n/a
|
|
138
|
+
when: "?"
|
|
139
|
+
language: Thai and English
|
|
140
|
+
|
|
120
141
|
- name: thai_exam_onet
|
|
121
142
|
display_name: ONET
|
|
122
143
|
description: >
|
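The ThaiExam schema uses the same placeholder mechanism: the new top-level thai_exam group (macro-averaged accuracy over all the examinations, per its description) reuses the existing accuracy metric group, whose ${main_name} entry resolves to exact_match. A condensed sketch assembled from the hunks above (an illustration, not the literal file contents):

metric_groups:
  - name: accuracy
    display_name: Accuracy
    hide_win_rates: true
    metrics:
      - name: ${main_name}    # resolves to exact_match for thai_exam
        split: ${main_split}  # resolves to test

run_groups:
  - name: thai_exam           # aggregate over the ONET/IC/TGAT/TPAT1/A-Level subgroups
    metric_groups:
      - accuracy
    environment:
      main_name: exact_match
      main_split: test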