crfm-helm 0.5.2__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of crfm-helm was flagged as a potentially problematic release.

Files changed (209)
  1. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/METADATA +81 -112
  2. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/RECORD +165 -155
  3. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
  5. helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
  6. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
  7. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
  8. helm/benchmark/adaptation/common_adapter_specs.py +2 -0
  9. helm/benchmark/annotation/anthropic_red_team_annotator.py +57 -0
  10. helm/benchmark/annotation/call_center_annotator.py +258 -0
  11. helm/benchmark/annotation/financebench_annotator.py +79 -0
  12. helm/benchmark/annotation/harm_bench_annotator.py +55 -0
  13. helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
  14. helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
  15. helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
  16. helm/benchmark/annotation/live_qa_annotator.py +37 -45
  17. helm/benchmark/annotation/medication_qa_annotator.py +36 -44
  18. helm/benchmark/annotation/model_as_judge.py +96 -0
  19. helm/benchmark/annotation/simple_safety_tests_annotator.py +50 -0
  20. helm/benchmark/annotation/xstest_annotator.py +100 -0
  21. helm/benchmark/metrics/annotation_metrics.py +108 -0
  22. helm/benchmark/metrics/bhasa_metrics.py +188 -0
  23. helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
  24. helm/benchmark/metrics/code_metrics_helper.py +11 -1
  25. helm/benchmark/metrics/safety_metrics.py +79 -0
  26. helm/benchmark/metrics/summac/model_summac.py +3 -3
  27. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
  28. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
  29. helm/benchmark/metrics/unitxt_metrics.py +17 -3
  30. helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
  31. helm/benchmark/metrics/vision_language/image_utils.py +1 -1
  32. helm/benchmark/model_metadata_registry.py +3 -3
  33. helm/benchmark/presentation/create_plots.py +1 -1
  34. helm/benchmark/presentation/schema.py +3 -0
  35. helm/benchmark/presentation/summarize.py +106 -256
  36. helm/benchmark/presentation/test_run_entry.py +1 -0
  37. helm/benchmark/presentation/test_summarize.py +145 -3
  38. helm/benchmark/run.py +15 -0
  39. helm/benchmark/run_expander.py +83 -30
  40. helm/benchmark/run_specs/bhasa_run_specs.py +652 -0
  41. helm/benchmark/run_specs/call_center_run_specs.py +152 -0
  42. helm/benchmark/run_specs/decodingtrust_run_specs.py +8 -8
  43. helm/benchmark/run_specs/experimental_run_specs.py +52 -0
  44. helm/benchmark/run_specs/finance_run_specs.py +82 -1
  45. helm/benchmark/run_specs/safety_run_specs.py +154 -0
  46. helm/benchmark/run_specs/vlm_run_specs.py +100 -24
  47. helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
  48. helm/benchmark/scenarios/banking77_scenario.py +51 -0
  49. helm/benchmark/scenarios/bhasa_scenario.py +1942 -0
  50. helm/benchmark/scenarios/call_center_scenario.py +84 -0
  51. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
  52. helm/benchmark/scenarios/ewok_scenario.py +116 -0
  53. helm/benchmark/scenarios/fin_qa_scenario.py +2 -0
  54. helm/benchmark/scenarios/financebench_scenario.py +53 -0
  55. helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
  56. helm/benchmark/scenarios/raft_scenario.py +1 -1
  57. helm/benchmark/scenarios/scenario.py +1 -1
  58. helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
  59. helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
  60. helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
  61. helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
  62. helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
  63. helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
  64. helm/benchmark/scenarios/test_math_scenario.py +2 -8
  65. helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
  66. helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
  67. helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
  68. helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
  69. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
  70. helm/benchmark/scenarios/vision_language/bingo_scenario.py +2 -2
  71. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
  72. helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
  73. helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
  74. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
  75. helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
  76. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
  77. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
  78. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -3
  79. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -1
  80. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
  81. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
  82. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
  83. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +41 -12
  84. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
  85. helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
  86. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
  87. helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
  88. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
  89. helm/benchmark/scenarios/vision_language/pairs_scenario.py +1 -1
  90. helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
  91. helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
  92. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
  93. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +2 -2
  94. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +6 -3
  95. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
  96. helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
  97. helm/benchmark/scenarios/xstest_scenario.py +35 -0
  98. helm/benchmark/server.py +1 -6
  99. helm/benchmark/static/schema_air_bench.yaml +750 -750
  100. helm/benchmark/static/schema_bhasa.yaml +709 -0
  101. helm/benchmark/static/schema_call_center.yaml +232 -0
  102. helm/benchmark/static/schema_cleva.yaml +768 -0
  103. helm/benchmark/static/schema_decodingtrust.yaml +444 -0
  104. helm/benchmark/static/schema_ewok.yaml +367 -0
  105. helm/benchmark/static/schema_finance.yaml +55 -9
  106. helm/benchmark/static/{schema_image2structure.yaml → schema_image2struct.yaml} +231 -90
  107. helm/benchmark/static/schema_legal.yaml +566 -0
  108. helm/benchmark/static/schema_safety.yaml +266 -0
  109. helm/benchmark/static/schema_tables.yaml +149 -8
  110. helm/benchmark/static/schema_thai.yaml +21 -0
  111. helm/benchmark/static/schema_vhelm.yaml +137 -101
  112. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  113. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  114. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  115. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  116. helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
  117. helm/benchmark/static_build/assets/index-3ee38b3d.js +10 -0
  118. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  119. helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
  120. helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
  121. helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
  122. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  123. helm/benchmark/static_build/index.html +2 -2
  124. helm/benchmark/window_services/test_openai_window_service.py +8 -8
  125. helm/benchmark/window_services/tokenizer_service.py +0 -5
  126. helm/clients/ai21_client.py +71 -1
  127. helm/clients/anthropic_client.py +7 -19
  128. helm/clients/huggingface_client.py +38 -37
  129. helm/clients/nvidia_nim_client.py +35 -0
  130. helm/clients/openai_client.py +18 -4
  131. helm/clients/palmyra_client.py +24 -0
  132. helm/clients/perspective_api_client.py +11 -6
  133. helm/clients/test_client.py +4 -6
  134. helm/clients/together_client.py +22 -0
  135. helm/clients/vision_language/open_flamingo_client.py +1 -2
  136. helm/clients/vision_language/palmyra_vision_client.py +28 -13
  137. helm/common/cache.py +8 -30
  138. helm/common/images_utils.py +6 -0
  139. helm/common/key_value_store.py +9 -9
  140. helm/common/mongo_key_value_store.py +5 -4
  141. helm/common/request.py +16 -0
  142. helm/common/test_cache.py +1 -48
  143. helm/common/tokenization_request.py +0 -9
  144. helm/config/model_deployments.yaml +444 -329
  145. helm/config/model_metadata.yaml +513 -111
  146. helm/config/tokenizer_configs.yaml +140 -11
  147. helm/proxy/example_queries.py +14 -21
  148. helm/proxy/server.py +0 -9
  149. helm/proxy/services/remote_service.py +0 -6
  150. helm/proxy/services/server_service.py +6 -20
  151. helm/proxy/services/service.py +0 -6
  152. helm/proxy/token_counters/test_auto_token_counter.py +2 -2
  153. helm/tokenizers/ai21_tokenizer.py +51 -59
  154. helm/tokenizers/cohere_tokenizer.py +0 -75
  155. helm/tokenizers/huggingface_tokenizer.py +0 -1
  156. helm/tokenizers/test_ai21_tokenizer.py +48 -0
  157. helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
  158. helm/benchmark/data_overlap/export_scenario_text.py +0 -119
  159. helm/benchmark/data_overlap/light_scenario.py +0 -60
  160. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  161. helm/benchmark/static/benchmarking.css +0 -156
  162. helm/benchmark/static/benchmarking.js +0 -1705
  163. helm/benchmark/static/config.js +0 -3
  164. helm/benchmark/static/general.js +0 -122
  165. helm/benchmark/static/images/crfm-logo.png +0 -0
  166. helm/benchmark/static/images/helm-logo-simple.png +0 -0
  167. helm/benchmark/static/images/helm-logo.png +0 -0
  168. helm/benchmark/static/images/language-model-helm.png +0 -0
  169. helm/benchmark/static/images/organizations/ai21.png +0 -0
  170. helm/benchmark/static/images/organizations/anthropic.png +0 -0
  171. helm/benchmark/static/images/organizations/bigscience.png +0 -0
  172. helm/benchmark/static/images/organizations/cohere.png +0 -0
  173. helm/benchmark/static/images/organizations/eleutherai.png +0 -0
  174. helm/benchmark/static/images/organizations/google.png +0 -0
  175. helm/benchmark/static/images/organizations/meta.png +0 -0
  176. helm/benchmark/static/images/organizations/microsoft.png +0 -0
  177. helm/benchmark/static/images/organizations/nvidia.png +0 -0
  178. helm/benchmark/static/images/organizations/openai.png +0 -0
  179. helm/benchmark/static/images/organizations/together.png +0 -0
  180. helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
  181. helm/benchmark/static/images/organizations/yandex.png +0 -0
  182. helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
  183. helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
  184. helm/benchmark/static/index.html +0 -68
  185. helm/benchmark/static/info-icon.png +0 -0
  186. helm/benchmark/static/json-urls.js +0 -69
  187. helm/benchmark/static/plot-captions.js +0 -27
  188. helm/benchmark/static/utils.js +0 -285
  189. helm/benchmark/static_build/assets/index-30dbceba.js +0 -10
  190. helm/benchmark/static_build/assets/index-66b02d40.css +0 -1
  191. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  192. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  193. helm/benchmark/window_services/ai21_window_service.py +0 -247
  194. helm/benchmark/window_services/cohere_window_service.py +0 -101
  195. helm/benchmark/window_services/test_ai21_window_service.py +0 -163
  196. helm/benchmark/window_services/test_cohere_window_service.py +0 -75
  197. helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
  198. helm/benchmark/window_services/test_ice_window_service.py +0 -327
  199. helm/tokenizers/ice_tokenizer.py +0 -30
  200. helm/tokenizers/test_ice_tokenizer.py +0 -57
  201. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/LICENSE +0 -0
  202. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/entry_points.txt +0 -0
  203. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/top_level.txt +0 -0
  204. /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
  205. /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
  206. /helm/benchmark/{data_overlap → scenarios/vision_language/image2struct}/__init__.py +0 -0
  207. /helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +0 -0
  208. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct/webpage}/__init__.py +0 -0
  209. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0

helm/benchmark/static/schema_safety.yaml (new file)
@@ -0,0 +1,266 @@
+ ---
+ ############################################################
+ metrics:
+   # Infrastructure metrics:
+   - name: num_perplexity_tokens
+     display_name: '# tokens'
+     description: Average number of tokens in the predicted output (for language modeling, the input too).
+   - name: num_bytes
+     display_name: '# bytes'
+     description: Average number of bytes in the predicted output (for language modeling, the input too).
+
+   - name: num_references
+     display_name: '# ref'
+     description: Number of references.
+   - name: num_train_trials
+     display_name: '# trials'
+     description: Number of trials, where in each trial we choose an independent, random set of training instances.
+   - name: estimated_num_tokens_cost
+     display_name: 'cost'
+     description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
+   - name: num_prompt_tokens
+     display_name: '# prompt tokens'
+     description: Number of tokens in the prompt.
+   - name: num_prompt_characters
+     display_name: '# prompt chars'
+     description: Number of characters in the prompt.
+   - name: num_completion_tokens
+     display_name: '# completion tokens'
+     description: Actual number of completion tokens (over all completions).
+   - name: num_output_tokens
+     display_name: '# output tokens'
+     description: Actual number of output tokens.
+   - name: max_num_output_tokens
+     display_name: 'Max output tokens'
+     description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
+   - name: num_requests
+     display_name: '# requests'
+     description: Number of distinct API requests.
+   - name: num_instances
+     display_name: '# eval'
+     description: Number of evaluation instances.
+   - name: num_train_instances
+     display_name: '# train'
+     description: Number of training instances (e.g., in-context examples).
+   - name: prompt_truncated
+     display_name: truncated
+     description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
+   - name: finish_reason_length
+     display_name: finish b/c length
+     description: Fraction of instances where the the output was terminated because of the max tokens limit.
+   - name: finish_reason_stop
+     display_name: finish b/c stop
+     description: Fraction of instances where the the output was terminated because of the stop sequences.
+   - name: finish_reason_endoftext
+     display_name: finish b/c endoftext
+     description: Fraction of instances where the the output was terminated because the end of text token was generated.
+   - name: finish_reason_unknown
+     display_name: finish b/c unknown
+     description: Fraction of instances where the the output was terminated for unknown reasons.
+   - name: num_completions
+     display_name: '# completions'
+     description: Number of completions.
+   - name: predicted_index
+     display_name: Predicted index
+     description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
+
+   # Accuracy metrics:
+   - name: exact_match
+     display_name: Exact match
+     short_display_name: EM
+     description: Fraction of instances that the predicted output matches a correct reference exactly.
+     lower_is_better: false
+   - name: f1_score
+     display_name: F1
+     description: Average F1 score in terms of word overlap between the model output and correct reference.
+     lower_is_better: false
+   - name: safety_score
+     display_name: LM Evaluated Safety score
+     description: LM Evaluated Safety score (mean of Llama3.1 and GPT4o ratings) representing whether the output constitutes a successful refusal of unsafe actions (0 is worst, 1 is best).
+     lower_is_better: false
+   - name: bbq_accuracy
+     display_name: BBQ accuracy
+     description: BBQ accuracy
+     lower_is_better: false
+
+   # Toxicity metrics
+   - name: expected_max_toxicity
+     display_name: Average max toxicity
+     short_display_name: Avg. max toxicity
+     lower_is_better: true
+     description: Average max toxicity score across all completions (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
+   - name: max_toxicity_probability
+     display_name: max toxicity
+     lower_is_better: true
+     description: Fraction of instances where most toxic completion is above 0.5 toxicity (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
+   - name: toxic_frac
+     display_name: Toxic fraction
+     lower_is_better: true
+     description: Fraction of model outputs that are toxic (based on the PerspectiveAPI toxicity classifier).
+
+   # Annotation metrics
+   - name: safety_llama_annotator_success
+     display_name: Llama Annotator Success Rate
+     description: Fraction of annotator requests to Llama that succeeded.
+   - name: safety_gpt_annotator_success
+     display_name: GPT Annotator Success Rate
+     description: Fraction of annotator requests to GPT that succeeded.
+
+ ############################################################
+ perturbations: []
+
+ ############################################################
+ metric_groups:
+   - name: accuracy
+     display_name: Accuracy
+     aggregation_strategies:
+       - win_rate
+       - mean
+     metrics:
+       - name: ${main_name}
+         split: ${main_split}
+
+   - name: efficiency
+     display_name: Efficiency
+     metrics:
+       - name: inference_runtime
+         split: ${main_split}
+
+   - name: general_information
+     display_name: General information
+     hide_win_rates: true
+     metrics:
+       - name: num_instances
+         split: ${main_split}
+       - name: num_train_instances
+         split: ${main_split}
+       - name: prompt_truncated
+         split: ${main_split}
+       - name: num_prompt_tokens
+         split: ${main_split}
+       - name: num_output_tokens
+         split: ${main_split}
+
+   - name: annotation_metrics
+     display_name: Annotation
+     hide_win_rates: true
+     metrics:
+       - name: safety_llama_annotator_success
+         split: ${main_split}
+       - name: safety_gpt_annotator_success
+         split: ${main_split}
+
+   - name: toxicity
+     display_name: Toxicity
+     metrics:
+       - name: toxic_frac
+         split: ${main_split}
+
+   - name: bbq_metrics
+     display_name: BBQ metrics
+     description: Metrics used for the BBQ bias benchmark.
+     metrics:
+       - name: bbq_metric_ambiguous_bias
+         split: ${main_split}
+       - name: bbq_metric_unambiguous_bias
+         split: ${main_split}
+
+ ############################################################
+ run_groups:
+   - name: safety_scenarios
+     display_name: Safety Scenarios
+     description: Scenarios for the model safety
+     category: All scenarios
+     subgroups:
+       - harm_bench
+       - simple_safety_tests
+       - bbq
+       - anthropic_red_team
+       - xstest
+
+   - name: harm_bench
+     display_name: HarmBench
+     description: HarmBench
+     metric_groups:
+       - accuracy
+       - general_information
+       - annotation_metrics
+     environment:
+       main_name: safety_score
+       main_split: test
+     taxonomy:
+       task: question answering
+       what: n/a
+       who: n/a
+       when: n/a
+       language: English
+
+   - name: simple_safety_tests
+     display_name: SimpleSafetyTests
+     description: SimpleSafetyTests
+     metric_groups:
+       - accuracy
+       - general_information
+       - annotation_metrics
+     environment:
+       main_name: safety_score
+       main_split: test
+     taxonomy:
+       task: question answering
+       what: n/a
+       who: n/a
+       when: n/a
+       language: English
+
+   - name: xstest
+     display_name: XSTest
+     description: XSTest
+     metric_groups:
+       - accuracy
+       - general_information
+       - annotation_metrics
+     environment:
+       main_name: safety_score
+       main_split: test
+     taxonomy:
+       task: question answering
+       what: n/a
+       who: n/a
+       when: n/a
+       language: English
+
+   - name: bbq
+     display_name: BBQ
+     description: The Bias Benchmark for Question Answering (BBQ) for measuring social bias in question answering in ambiguous and unambigous context [(Parrish et al., 2022)](https://aclanthology.org/2022.findings-acl.165/).
+     metric_groups:
+       - accuracy
+       - general_information
+       - bbq_metrics
+       - annotation_metrics
+     environment:
+       main_name: bbq_accuracy
+       main_split: test
+     taxonomy:
+       task: question answering
+       what: n/a
+       who: n/a
+       when: n/a
+       language: English
+
+   - name: anthropic_red_team
+     display_name: Anthropic Red Team
+     short_display_name: Anthropic Red Team
+     description: Anthropic Red Team
+     metric_groups:
+       - accuracy
+       - general_information
+       - annotation_metrics
+     environment:
+       main_name: safety_score
+       main_split: test
+     taxonomy:
+       task: question answering
+       what: "?"
+       who: "?"
+       when: "?"
+       language: English
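
The metric_groups in schema_safety.yaml declare their metrics through ${main_name} and ${main_split} placeholders, and each run group supplies the concrete values in its environment block (harm_bench, for example, maps main_name to safety_score and main_split to test). Below is a minimal sketch of that substitution step, assuming PyYAML and plain string.Template expansion over an abridged copy of the schema above; it is illustrative only, not HELM's own loader (see helm/benchmark/presentation/schema.py in the file list).

from string import Template

import yaml

# Abridged from the schema_safety.yaml hunk above; the real file declares many
# more metrics, metric groups, and run groups.
SCHEMA_YAML = """
metric_groups:
  - name: accuracy
    display_name: Accuracy
    metrics:
      - name: ${main_name}
        split: ${main_split}

run_groups:
  - name: harm_bench
    display_name: HarmBench
    metric_groups:
      - accuracy
    environment:
      main_name: safety_score
      main_split: test
"""

schema = yaml.safe_load(SCHEMA_YAML)
metric_groups_by_name = {g["name"]: g for g in schema["metric_groups"]}

for run_group in schema["run_groups"]:
    env = run_group.get("environment", {})
    for group_name in run_group["metric_groups"]:
        for metric in metric_groups_by_name[group_name]["metrics"]:
            # The ${main_name} / ${main_split} placeholders resolve against the
            # run group's environment block.
            name = Template(metric["name"]).safe_substitute(env)
            split = Template(metric["split"]).safe_substitute(env)
            print(f"{run_group['name']}: {name} on split {split}")

# Prints: harm_bench: safety_score on split test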

helm/benchmark/static/schema_tables.yaml
@@ -99,47 +99,101 @@ metrics:
      display_name: METEOR
      short_display_name: METEOR
      description: METEOR
+     lower_is_better: false
    - name: f1
-     display_name: F1
-     short_display_name: F1
-     description: F1
+     display_name: BERTScore F1
+     short_display_name: BERTScore F1
+     description: BERTScore F1
+     lower_is_better: false
    - name: precision
      display_name: Precision
      short_display_name: Precision
      description: Precision
+     lower_is_better: false
    - name: recall
      display_name: Recall
      short_display_name: Recall
      description: Recall
+     lower_is_better: false
    - name: rouge1
      display_name: ROUGE-1
      short_display_name: ROUGE-1
      description: ROUGE-1
+     lower_is_better: false
    - name: rouge2
      display_name: ROUGE-2
      short_display_name: ROUGE-2
      description: ROUGE-2
+     lower_is_better: false
    - name: rougeL
      display_name: ROUGE-L
      short_display_name: ROUGE-L
      description: ROUGE-L
+     lower_is_better: false
    - name: rougeLsum
      display_name: ROUGE-Lsum
      short_display_name: ROUGE-Lsum
      description: ROUGE-Lsum
+     lower_is_better: false
    - name: bleu
      display_name: BLEU
      short_display_name: BLEU
      description: BLEU
+     lower_is_better: false
+   - name: accuracy
+     display_name: Accuracy
+     short_display_name: Accuracy
+     description: Accuracy
+     lower_is_better: false
+   - name: f1_macro
+     display_name: Macro F1
+     short_display_name: Macro F1
+     description: Macro F1
+     lower_is_better: false
+   - name: f1_micro
+     display_name: Micro F1
+     short_display_name: Micro F1
+     description: Micro F1
+     lower_is_better: false
+   - name: unsorted_list_exact_match
+     display_name: Unsorted List Exact Match
+     short_display_name: Exact Match
+     description: Unsorted List Exact Match
+     lower_is_better: false
+
+   # FinQA Accuracy
+   - name: program_accuracy
+     display_name: Program Accuracy
+     short_display_name: Program Accuracy
+     description: Program Accuracy
+     lower_is_better: false
+   - name: execution_accuracy
+     display_name: Execution Accuracy
+     short_display_name: Execution Accuracy
+     description: Execution Accuracy
+     lower_is_better: false
+
+   # SciGen Accuracy
+   - name: llama_3_8b_chat_hf_together_ai_template_table2text_single_turn_with_reference
+     display_name: Rating
+     short_display_name: Rating
+     description: Rating by Llama 3 (8B) LLM as judge
+     lower_is_better: false

  perturbations: []

  metric_groups:
-   - name: accuracy
-     display_name: Accuracy
+   - name: main_metrics
+     display_name: Main Metrics
+     metrics:
+       - name: ${main_name}
+         split: __all__
+
+   - name: generation_metrics
+     display_name: Other Generation Metrics
      hide_win_rates: true
      metrics:
-       - name: meteor
+       - name: f1
          split: __all__
        - name: rouge1
          split: __all__
@@ -152,6 +206,17 @@ metric_groups:
        - name: bleu
          split: __all__

+   - name: classification_metrics
+     display_name: Classification Metrics
+     hide_win_rates: true
+     metrics:
+       - name: accuracy
+         split: __all__
+       - name: f1_macro
+         split: __all__
+       - name: f1_micro
+         split: __all__
+
    - name: efficiency
      display_name: Efficiency
      metrics:
@@ -175,18 +240,22 @@ metric_groups:

  run_groups:
    - name: table_scenarios
-     display_name: Table Scenarios
+     display_name: Table Scenarios
      description: Table Scenarios
      category: All Scenarios
      subgroups:
        - unitxt_cards.numeric_nlg
+       - unitxt_cards.tab_fact
+       - unitxt_cards.wikitq
+       - unitxt_cards.scigen

    - name: unitxt_cards.numeric_nlg
      display_name: NumericNLG
      short_display_name: NumericNLG
      description: "NumericNLG is a dataset for numerical table-to-text generation using pairs of a table and a paragraph of a table description with richer inference from scientific papers."
      metric_groups:
-       - accuracy
+       - main_metrics
+       - generation_metrics
        - efficiency
        - general_information
      environment:
@@ -198,3 +267,75 @@ run_groups:
        who: "?"
        when: "?"
        language: English
+
+   - name: unitxt_cards.tab_fact
+     display_name: TabFact
+     short_display_name: TabFact
+     description: "tab_fact is a large-scale dataset for the task of fact-checking on tables."
+     metric_groups:
+       - main_metrics
+       - classification_metrics
+       - efficiency
+       - general_information
+     environment:
+       main_name: accuracy
+       main_split: test
+     taxonomy:
+       task: "?"
+       what: "?"
+       who: "?"
+       when: "?"
+       language: English
+
+   - name: unitxt_cards.wikitq
+     display_name: WikiTableQuestions
+     short_display_name: WikiTableQuestions
+     description: "This WikiTableQuestions dataset is a large-scale dataset for the task of question answering on semi-structured tables."
+     metric_groups:
+       - main_metrics
+       - classification_metrics
+       - efficiency
+       - general_information
+     environment:
+       main_name: unsorted_list_exact_match
+       main_split: test
+     taxonomy:
+       task: "?"
+       what: "?"
+       who: "?"
+       when: "?"
+       language: English
+
+   - name: unitxt_cards.fin_qa
+     display_name: FinQA
+     description: The FinQA benchmark for numeric reasoning over financial data, with question answering pairs written by financial experts over financial reports [(Chen et al., 2021)](https://arxiv.org/abs/2109.00122/).
+     metric_groups:
+       - main_metrics
+       - efficiency
+       - general_information
+     environment:
+       main_name: program_accuracy
+       main_split: test
+     taxonomy:
+       task: question answering with numeric reasoning
+       what: financial reports
+       who: financial experts
+       when: 1999 to 2019
+       language: English
+
+   - name: unitxt_cards.scigen
+     display_name: SciGen
+     description: SciGen
+     metric_groups:
+       - main_metrics
+       - efficiency
+       - general_information
+     environment:
+       main_name: llama_3_8b_chat_hf_together_ai_template_table2text_single_turn_with_reference
+       main_split: test
+     taxonomy:
+       task: "?"
+       what: "?"
+       who: "?"
+       when: "?"
+       language: English

helm/benchmark/static/schema_thai.yaml
@@ -78,6 +78,7 @@ perturbations: []
  metric_groups:
    - name: accuracy
      display_name: Accuracy
+     hide_win_rates: true
      metrics:
        - name: ${main_name}
          split: ${main_split}
@@ -111,12 +112,32 @@ run_groups:
      description: Thai-language scenarios
      category: All scenarios
      subgroups:
+       - thai_exam
        - thai_exam_onet
        - thai_exam_ic
        - thai_exam_tgat
        - thai_exam_tpat1
        - thai_exam_a_level

+
+   - name: thai_exam
+     display_name: ThaiExam
+     description: >
+       Macro-averaged accuracy on all ThaiExam examinations.
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: exact_match
+       main_split: test
+     taxonomy:
+       task: question answering
+       what: n/a
+       who: n/a
+       when: "?"
+       language: Thai and English
+
    - name: thai_exam_onet
      display_name: ONET
      description: >
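
Several of the run groups added across these schema files point their environment.main_name at metrics declared elsewhere in the same file (safety_score, program_accuracy, and unsorted_list_exact_match all appear in the metrics sections above). The hypothetical lint below, assuming PyYAML and the metrics / metric_groups / run_groups layout shown in these hunks, cross-checks those references. It is only a sketch, not a check that HELM ships, and a hit only means the name is not declared in that particular file.

import sys

import yaml

def lint_schema(path: str) -> list:
    """Report metric references that the schema file does not declare itself."""
    with open(path) as f:
        schema = yaml.safe_load(f)

    declared = {m["name"] for m in schema.get("metrics", [])}
    problems = []

    for group in schema.get("metric_groups", []) or []:
        for metric in group.get("metrics", []) or []:
            name = metric["name"]
            # Placeholders like ${main_name} are resolved per run group, so skip them.
            if not name.startswith("${") and name not in declared:
                problems.append(f"metric_group {group['name']}: undeclared metric {name}")

    for run_group in schema.get("run_groups", []) or []:
        main_name = run_group.get("environment", {}).get("main_name")
        if main_name and main_name not in declared:
            problems.append(f"run_group {run_group['name']}: undeclared main_name {main_name}")

    return problems

if __name__ == "__main__":
    # Usage: python lint_schema.py helm/benchmark/static/schema_safety.yaml
    for issue in lint_schema(sys.argv[1]):
        print(issue)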