crfm-helm 0.5.2__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (209) hide show
  1. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/METADATA +81 -112
  2. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/RECORD +165 -155
  3. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
  5. helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
  6. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
  7. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
  8. helm/benchmark/adaptation/common_adapter_specs.py +2 -0
  9. helm/benchmark/annotation/anthropic_red_team_annotator.py +57 -0
  10. helm/benchmark/annotation/call_center_annotator.py +258 -0
  11. helm/benchmark/annotation/financebench_annotator.py +79 -0
  12. helm/benchmark/annotation/harm_bench_annotator.py +55 -0
  13. helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
  14. helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
  15. helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
  16. helm/benchmark/annotation/live_qa_annotator.py +37 -45
  17. helm/benchmark/annotation/medication_qa_annotator.py +36 -44
  18. helm/benchmark/annotation/model_as_judge.py +96 -0
  19. helm/benchmark/annotation/simple_safety_tests_annotator.py +50 -0
  20. helm/benchmark/annotation/xstest_annotator.py +100 -0
  21. helm/benchmark/metrics/annotation_metrics.py +108 -0
  22. helm/benchmark/metrics/bhasa_metrics.py +188 -0
  23. helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
  24. helm/benchmark/metrics/code_metrics_helper.py +11 -1
  25. helm/benchmark/metrics/safety_metrics.py +79 -0
  26. helm/benchmark/metrics/summac/model_summac.py +3 -3
  27. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
  28. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
  29. helm/benchmark/metrics/unitxt_metrics.py +17 -3
  30. helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
  31. helm/benchmark/metrics/vision_language/image_utils.py +1 -1
  32. helm/benchmark/model_metadata_registry.py +3 -3
  33. helm/benchmark/presentation/create_plots.py +1 -1
  34. helm/benchmark/presentation/schema.py +3 -0
  35. helm/benchmark/presentation/summarize.py +106 -256
  36. helm/benchmark/presentation/test_run_entry.py +1 -0
  37. helm/benchmark/presentation/test_summarize.py +145 -3
  38. helm/benchmark/run.py +15 -0
  39. helm/benchmark/run_expander.py +83 -30
  40. helm/benchmark/run_specs/bhasa_run_specs.py +652 -0
  41. helm/benchmark/run_specs/call_center_run_specs.py +152 -0
  42. helm/benchmark/run_specs/decodingtrust_run_specs.py +8 -8
  43. helm/benchmark/run_specs/experimental_run_specs.py +52 -0
  44. helm/benchmark/run_specs/finance_run_specs.py +82 -1
  45. helm/benchmark/run_specs/safety_run_specs.py +154 -0
  46. helm/benchmark/run_specs/vlm_run_specs.py +100 -24
  47. helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
  48. helm/benchmark/scenarios/banking77_scenario.py +51 -0
  49. helm/benchmark/scenarios/bhasa_scenario.py +1942 -0
  50. helm/benchmark/scenarios/call_center_scenario.py +84 -0
  51. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
  52. helm/benchmark/scenarios/ewok_scenario.py +116 -0
  53. helm/benchmark/scenarios/fin_qa_scenario.py +2 -0
  54. helm/benchmark/scenarios/financebench_scenario.py +53 -0
  55. helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
  56. helm/benchmark/scenarios/raft_scenario.py +1 -1
  57. helm/benchmark/scenarios/scenario.py +1 -1
  58. helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
  59. helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
  60. helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
  61. helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
  62. helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
  63. helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
  64. helm/benchmark/scenarios/test_math_scenario.py +2 -8
  65. helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
  66. helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
  67. helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
  68. helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
  69. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
  70. helm/benchmark/scenarios/vision_language/bingo_scenario.py +2 -2
  71. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
  72. helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
  73. helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
  74. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
  75. helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
  76. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
  77. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
  78. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -3
  79. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -1
  80. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
  81. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
  82. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
  83. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +41 -12
  84. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
  85. helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
  86. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
  87. helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
  88. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
  89. helm/benchmark/scenarios/vision_language/pairs_scenario.py +1 -1
  90. helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
  91. helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
  92. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
  93. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +2 -2
  94. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +6 -3
  95. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
  96. helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
  97. helm/benchmark/scenarios/xstest_scenario.py +35 -0
  98. helm/benchmark/server.py +1 -6
  99. helm/benchmark/static/schema_air_bench.yaml +750 -750
  100. helm/benchmark/static/schema_bhasa.yaml +709 -0
  101. helm/benchmark/static/schema_call_center.yaml +232 -0
  102. helm/benchmark/static/schema_cleva.yaml +768 -0
  103. helm/benchmark/static/schema_decodingtrust.yaml +444 -0
  104. helm/benchmark/static/schema_ewok.yaml +367 -0
  105. helm/benchmark/static/schema_finance.yaml +55 -9
  106. helm/benchmark/static/{schema_image2structure.yaml → schema_image2struct.yaml} +231 -90
  107. helm/benchmark/static/schema_legal.yaml +566 -0
  108. helm/benchmark/static/schema_safety.yaml +266 -0
  109. helm/benchmark/static/schema_tables.yaml +149 -8
  110. helm/benchmark/static/schema_thai.yaml +21 -0
  111. helm/benchmark/static/schema_vhelm.yaml +137 -101
  112. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  113. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  114. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  115. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  116. helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
  117. helm/benchmark/static_build/assets/index-3ee38b3d.js +10 -0
  118. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  119. helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
  120. helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
  121. helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
  122. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  123. helm/benchmark/static_build/index.html +2 -2
  124. helm/benchmark/window_services/test_openai_window_service.py +8 -8
  125. helm/benchmark/window_services/tokenizer_service.py +0 -5
  126. helm/clients/ai21_client.py +71 -1
  127. helm/clients/anthropic_client.py +7 -19
  128. helm/clients/huggingface_client.py +38 -37
  129. helm/clients/nvidia_nim_client.py +35 -0
  130. helm/clients/openai_client.py +18 -4
  131. helm/clients/palmyra_client.py +24 -0
  132. helm/clients/perspective_api_client.py +11 -6
  133. helm/clients/test_client.py +4 -6
  134. helm/clients/together_client.py +22 -0
  135. helm/clients/vision_language/open_flamingo_client.py +1 -2
  136. helm/clients/vision_language/palmyra_vision_client.py +28 -13
  137. helm/common/cache.py +8 -30
  138. helm/common/images_utils.py +6 -0
  139. helm/common/key_value_store.py +9 -9
  140. helm/common/mongo_key_value_store.py +5 -4
  141. helm/common/request.py +16 -0
  142. helm/common/test_cache.py +1 -48
  143. helm/common/tokenization_request.py +0 -9
  144. helm/config/model_deployments.yaml +444 -329
  145. helm/config/model_metadata.yaml +513 -111
  146. helm/config/tokenizer_configs.yaml +140 -11
  147. helm/proxy/example_queries.py +14 -21
  148. helm/proxy/server.py +0 -9
  149. helm/proxy/services/remote_service.py +0 -6
  150. helm/proxy/services/server_service.py +6 -20
  151. helm/proxy/services/service.py +0 -6
  152. helm/proxy/token_counters/test_auto_token_counter.py +2 -2
  153. helm/tokenizers/ai21_tokenizer.py +51 -59
  154. helm/tokenizers/cohere_tokenizer.py +0 -75
  155. helm/tokenizers/huggingface_tokenizer.py +0 -1
  156. helm/tokenizers/test_ai21_tokenizer.py +48 -0
  157. helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
  158. helm/benchmark/data_overlap/export_scenario_text.py +0 -119
  159. helm/benchmark/data_overlap/light_scenario.py +0 -60
  160. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  161. helm/benchmark/static/benchmarking.css +0 -156
  162. helm/benchmark/static/benchmarking.js +0 -1705
  163. helm/benchmark/static/config.js +0 -3
  164. helm/benchmark/static/general.js +0 -122
  165. helm/benchmark/static/images/crfm-logo.png +0 -0
  166. helm/benchmark/static/images/helm-logo-simple.png +0 -0
  167. helm/benchmark/static/images/helm-logo.png +0 -0
  168. helm/benchmark/static/images/language-model-helm.png +0 -0
  169. helm/benchmark/static/images/organizations/ai21.png +0 -0
  170. helm/benchmark/static/images/organizations/anthropic.png +0 -0
  171. helm/benchmark/static/images/organizations/bigscience.png +0 -0
  172. helm/benchmark/static/images/organizations/cohere.png +0 -0
  173. helm/benchmark/static/images/organizations/eleutherai.png +0 -0
  174. helm/benchmark/static/images/organizations/google.png +0 -0
  175. helm/benchmark/static/images/organizations/meta.png +0 -0
  176. helm/benchmark/static/images/organizations/microsoft.png +0 -0
  177. helm/benchmark/static/images/organizations/nvidia.png +0 -0
  178. helm/benchmark/static/images/organizations/openai.png +0 -0
  179. helm/benchmark/static/images/organizations/together.png +0 -0
  180. helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
  181. helm/benchmark/static/images/organizations/yandex.png +0 -0
  182. helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
  183. helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
  184. helm/benchmark/static/index.html +0 -68
  185. helm/benchmark/static/info-icon.png +0 -0
  186. helm/benchmark/static/json-urls.js +0 -69
  187. helm/benchmark/static/plot-captions.js +0 -27
  188. helm/benchmark/static/utils.js +0 -285
  189. helm/benchmark/static_build/assets/index-30dbceba.js +0 -10
  190. helm/benchmark/static_build/assets/index-66b02d40.css +0 -1
  191. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  192. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  193. helm/benchmark/window_services/ai21_window_service.py +0 -247
  194. helm/benchmark/window_services/cohere_window_service.py +0 -101
  195. helm/benchmark/window_services/test_ai21_window_service.py +0 -163
  196. helm/benchmark/window_services/test_cohere_window_service.py +0 -75
  197. helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
  198. helm/benchmark/window_services/test_ice_window_service.py +0 -327
  199. helm/tokenizers/ice_tokenizer.py +0 -30
  200. helm/tokenizers/test_ice_tokenizer.py +0 -57
  201. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/LICENSE +0 -0
  202. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/entry_points.txt +0 -0
  203. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/top_level.txt +0 -0
  204. /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
  205. /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
  206. /helm/benchmark/{data_overlap → scenarios/vision_language/image2struct}/__init__.py +0 -0
  207. /helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +0 -0
  208. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct/webpage}/__init__.py +0 -0
  209. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0
@@ -0,0 +1,367 @@
1
+ ---
2
+ ############################################################
3
+ metrics:
4
+ # Infrastructure metrics:
5
+ - name: num_perplexity_tokens
6
+ display_name: '# tokens'
7
+ description: Average number of tokens in the predicted output (for language modeling, the input too).
8
+ - name: num_bytes
9
+ display_name: '# bytes'
10
+ description: Average number of bytes in the predicted output (for language modeling, the input too).
11
+
12
+ - name: num_references
13
+ display_name: '# ref'
14
+ description: Number of references.
15
+ - name: num_train_trials
16
+ display_name: '# trials'
17
+ description: Number of trials, where in each trial we choose an independent, random set of training instances.
18
+ - name: estimated_num_tokens_cost
19
+ display_name: 'cost'
20
+ description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
21
+ - name: num_prompt_tokens
22
+ display_name: '# prompt tokens'
23
+ description: Number of tokens in the prompt.
24
+ - name: num_prompt_characters
25
+ display_name: '# prompt chars'
26
+ description: Number of characters in the prompt.
27
+ - name: num_completion_tokens
28
+ display_name: '# completion tokens'
29
+ description: Actual number of completion tokens (over all completions).
30
+ - name: num_output_tokens
31
+ display_name: '# output tokens'
32
+ description: Actual number of output tokens.
33
+ - name: max_num_output_tokens
34
+ display_name: 'Max output tokens'
35
+ description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
36
+ - name: num_requests
37
+ display_name: '# requests'
38
+ description: Number of distinct API requests.
39
+ - name: num_instances
40
+ display_name: '# eval'
41
+ description: Number of evaluation instances.
42
+ - name: num_train_instances
43
+ display_name: '# train'
44
+ description: Number of training instances (e.g., in-context examples).
45
+ - name: prompt_truncated
46
+ display_name: truncated
47
+ description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
48
+ - name: finish_reason_length
49
+ display_name: finish b/c length
50
+ description: Fraction of instances where the output was terminated because of the max tokens limit.
51
+ - name: finish_reason_stop
52
+ display_name: finish b/c stop
53
+ description: Fraction of instances where the output was terminated because of the stop sequences.
54
+ - name: finish_reason_endoftext
55
+ display_name: finish b/c endoftext
56
+ description: Fraction of instances where the output was terminated because the end of text token was generated.
57
+ - name: finish_reason_unknown
58
+ display_name: finish b/c unknown
59
+ description: Fraction of instances where the output was terminated for unknown reasons.
60
+ - name: num_completions
61
+ display_name: '# completions'
62
+ description: Number of completions.
63
+ - name: predicted_index
64
+ display_name: Predicted index
65
+ description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
66
+
67
+ # Accuracy metrics:
68
+ - name: exact_match
69
+ display_name: Exact match
70
+ short_display_name: EM
71
+ description: Fraction of instances that the predicted output matches a correct reference exactly.
72
+ lower_is_better: false
73
+ - name: quasi_exact_match
74
+ display_name: Quasi-exact match
75
+ short_display_name: EM
76
+ description: Fraction of instances that the predicted output matches a correct reference up to light processing.
77
+ lower_is_better: false
78
+ - name: prefix_exact_match
79
+ display_name: Prefix exact match
80
+ short_display_name: PEM
81
+ description: Fraction of instances that the predicted output matches the prefix of a correct reference exactly.
82
+ lower_is_better: false
83
+ - name: quasi_prefix_exact_match
84
+ # TODO: should call this prefix_quasi_exact_match
85
+ display_name: Prefix quasi-exact match
86
+ short_display_name: PEM
87
+ description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing.
88
+ lower_is_better: false
89
+
90
+ - name: exact_match@5
91
+ display_name: Exact match @5
92
+ short_display_name: EM@5
93
+ description: Fraction of instances where at least one predicted output among the top 5 matches a correct reference exactly.
94
+ lower_is_better: false
95
+ - name: quasi_exact_match@5
96
+ display_name: Quasi-exact match @5
97
+ short_display_name: EM@5
98
+ description: Fraction of instances where at least one predicted output among the top 5 matches a correct reference up to light processing.
99
+ lower_is_better: false
100
+ - name: prefix_exact_match@5
101
+ display_name: Prefix exact match @5
102
+ short_display_name: PEM@5
103
+ description: Fraction of instances where at least one predicted output among the top 5 matches the prefix of a correct reference exactly.
104
+ lower_is_better: false
105
+ - name: quasi_prefix_exact_match@5
106
+ display_name: Prefix quasi-exact match @5
107
+ short_display_name: PEM@5
108
+ description: Fraction of instances where at least one predicted output among the top 5 matches the prefix of a correct reference up to light processing.
109
+ lower_is_better: false
110
+
111
+
112
+ ############################################################
113
+ perturbations: []
114
+
115
+ ############################################################
116
+ metric_groups:
117
+ - name: accuracy
118
+ display_name: Accuracy
119
+ hide_win_rates: true
120
+ metrics:
121
+ - name: ${main_name}
122
+ split: ${main_split}
123
+
124
+ - name: efficiency
125
+ display_name: Efficiency
126
+ metrics:
127
+ - name: inference_runtime
128
+ split: ${main_split}
129
+
130
+ - name: general_information
131
+ display_name: General information
132
+ hide_win_rates: true
133
+ metrics:
134
+ - name: num_instances
135
+ split: ${main_split}
136
+ - name: num_train_instances
137
+ split: ${main_split}
138
+ - name: prompt_truncated
139
+ split: ${main_split}
140
+ - name: num_prompt_tokens
141
+ split: ${main_split}
142
+ - name: num_output_tokens
143
+ split: ${main_split}
144
+
145
+ ############################################################
146
+ run_groups:
147
+ - name: world_knowledge_scenarios
148
+ display_name: World Knowledge Scenarios
149
+ description: Scenarios for world knowledge
150
+ category: All scenarios
151
+ subgroups:
152
+ - ewok
153
+ - ewok_agent_properties
154
+ - ewok_material_dynamics
155
+ - ewok_material_properties
156
+ - ewok_physical_dynamics
157
+ - ewok_physical_interactions
158
+ - ewok_physical_relations
159
+ - ewok_quantitative_properties
160
+ - ewok_social_interactions
161
+ - ewok_social_properties
162
+ - ewok_social_relations
163
+ - ewok_spatial_relations
164
+
165
+ - name: ewok
166
+ display_name: EWoK
167
+ description: Elements of World Knowledge (EWoK) is a benchmark for evaluating world modeling in language models by testing their ability to use knowledge of a concept to match a target text with a plausible/implausible context.
168
+ metric_groups:
169
+ - accuracy
170
+ - efficiency
171
+ - general_information
172
+ environment:
173
+ main_name: exact_match
174
+ main_split: test
175
+ taxonomy:
176
+ task: multiple choice question answering
177
+ what: world knowledge
178
+ who: n/a
179
+ when: n/a
180
+ language: English
181
+
182
+ - name: ewok_agent_properties
183
+ display_name: Agent Properties
184
+ description: The Agent Properties domain from Elements of World Knowledge (EWoK).
185
+ metric_groups:
186
+ - accuracy
187
+ - efficiency
188
+ - general_information
189
+ environment:
190
+ main_name: exact_match
191
+ main_split: test
192
+ taxonomy:
193
+ task: multiple choice question answering
194
+ what: world knowledge
195
+ who: n/a
196
+ when: n/a
197
+ language: English
198
+
199
+ - name: ewok_material_dynamics
200
+ display_name: Material Dynamics
201
+ description: The Material Dynamics domain from Elements of World Knowledge (EWoK).
202
+ metric_groups:
203
+ - accuracy
204
+ - efficiency
205
+ - general_information
206
+ environment:
207
+ main_name: exact_match
208
+ main_split: test
209
+ taxonomy:
210
+ task: multiple choice question answering
211
+ what: world knowledge
212
+ who: n/a
213
+ when: n/a
214
+ language: English
215
+
216
+ - name: ewok_material_properties
217
+ display_name: Material Properties
218
+ description: The Material Properties domain from Elements of World Knowledge (EWoK).
219
+ metric_groups:
220
+ - accuracy
221
+ - efficiency
222
+ - general_information
223
+ environment:
224
+ main_name: exact_match
225
+ main_split: test
226
+ taxonomy:
227
+ task: multiple choice question answering
228
+ what: world knowledge
229
+ who: n/a
230
+ when: n/a
231
+ language: English
232
+
233
+ - name: ewok_physical_dynamics
234
+ display_name: Physical Dynamics
235
+ description: The Physical Dynamics domain from Elements of World Knowledge (EWoK).
236
+ metric_groups:
237
+ - accuracy
238
+ - efficiency
239
+ - general_information
240
+ environment:
241
+ main_name: exact_match
242
+ main_split: test
243
+ taxonomy:
244
+ task: multiple choice question answering
245
+ what: world knowledge
246
+ who: n/a
247
+ when: n/a
248
+ language: English
249
+
250
+ - name: ewok_physical_interactions
251
+ display_name: Physical Interactions
252
+ description: The Physical Interactions domain from Elements of World Knowledge (EWoK).
253
+ metric_groups:
254
+ - accuracy
255
+ - efficiency
256
+ - general_information
257
+ environment:
258
+ main_name: exact_match
259
+ main_split: test
260
+ taxonomy:
261
+ task: multiple choice question answering
262
+ what: world knowledge
263
+ who: n/a
264
+ when: n/a
265
+ language: English
266
+
267
+ - name: ewok_physical_relations
268
+ display_name: Physical Relations
269
+ description: The Physical Relations domain from Elements of World Knowledge (EWoK).
270
+ metric_groups:
271
+ - accuracy
272
+ - efficiency
273
+ - general_information
274
+ environment:
275
+ main_name: exact_match
276
+ main_split: test
277
+ taxonomy:
278
+ task: multiple choice question answering
279
+ what: world knowledge
280
+ who: n/a
281
+ when: n/a
282
+ language: English
283
+
284
+ - name: ewok_quantitative_properties
285
+ display_name: Quantitative Properties
286
+ description: The Quantitative Properties domain from Elements of World Knowledge (EWoK).
287
+ metric_groups:
288
+ - accuracy
289
+ - efficiency
290
+ - general_information
291
+ environment:
292
+ main_name: exact_match
293
+ main_split: test
294
+ taxonomy:
295
+ task: multiple choice question answering
296
+ what: world knowledge
297
+ who: n/a
298
+ when: n/a
299
+ language: English
300
+
301
+ - name: ewok_social_interactions
302
+ display_name: Social Interactions
303
+ description: The Social Interactions domain from Elements of World Knowledge (EWoK).
304
+ metric_groups:
305
+ - accuracy
306
+ - efficiency
307
+ - general_information
308
+ environment:
309
+ main_name: exact_match
310
+ main_split: test
311
+ taxonomy:
312
+ task: multiple choice question answering
313
+ what: world knowledge
314
+ who: n/a
315
+ when: n/a
316
+ language: English
317
+
318
+ - name: ewok_social_properties
319
+ display_name: Social Properties
320
+ description: The Social Properties domain from Elements of World Knowledge (EWoK).
321
+ metric_groups:
322
+ - accuracy
323
+ - efficiency
324
+ - general_information
325
+ environment:
326
+ main_name: exact_match
327
+ main_split: test
328
+ taxonomy:
329
+ task: multiple choice question answering
330
+ what: world knowledge
331
+ who: n/a
332
+ when: n/a
333
+ language: English
334
+
335
+ - name: ewok_social_relations
336
+ display_name: Social Relations
337
+ description: The Social Relations domain from Elements of World Knowledge (EWoK).
338
+ metric_groups:
339
+ - accuracy
340
+ - efficiency
341
+ - general_information
342
+ environment:
343
+ main_name: exact_match
344
+ main_split: test
345
+ taxonomy:
346
+ task: multiple choice question answering
347
+ what: world knowledge
348
+ who: n/a
349
+ when: n/a
350
+ language: English
351
+
352
+ - name: ewok_spatial_relations
353
+ display_name: Spatial Relations
354
+ description: The Spatial Relations domain from Elements of World Knowledge (EWoK).
355
+ metric_groups:
356
+ - accuracy
357
+ - efficiency
358
+ - general_information
359
+ environment:
360
+ main_name: exact_match
361
+ main_split: test
362
+ taxonomy:
363
+ task: multiple choice question answering
364
+ what: world knowledge
365
+ who: n/a
366
+ when: n/a
367
+ language: English
@@ -73,6 +73,15 @@ metrics:
73
73
  display_name: Execution Accuracy
74
74
  description: Accuracy of the final result of the generated program
75
75
  lower_is_better: false
76
+ - name: annotation_financebench_label_correct_answer
77
+ display_name: Correct Answer
78
+ description: Whether the final result was correct, as judged by GPT-4o
79
+ lower_is_better: false
80
+ - name: quasi_exact_match
81
+ display_name: Quasi-exact match
82
+ short_display_name: EM
83
+ description: Fraction of instances that the predicted output matches a correct reference up to light processing.
84
+ lower_is_better: false
76
85
 
77
86
  ############################################################
78
87
  perturbations: []
@@ -114,6 +123,8 @@ run_groups:
114
123
  category: All scenarios
115
124
  subgroups:
116
125
  - fin_qa
126
+ - financebench
127
+ - banking77
117
128
 
118
129
  - name: fin_qa
119
130
  display_name: FinQA
@@ -132,12 +143,47 @@ run_groups:
132
143
  when: 1999 to 2019
133
144
  language: English
134
145
 
135
- - name: financial_scenarios_ablations
136
- display_name: Financial Scenarios Ablations
137
- description: Scenarios for the financial domain with ablations
138
- category: All scenarios
139
- subgroups:
140
- - fin_qa
141
- adapter_keys_shown:
142
- - model
143
- - max_train_instances
146
+ - name: financebench
147
+ display_name: FinanceBench
148
+ description: FinanceBench is a benchmark for open book financial question answering. It comprises 10,231 questions about publicly traded companies, with corresponding answers and evidence strings.
149
+ metric_groups:
150
+ - accuracy
151
+ - efficiency
152
+ - general_information
153
+ environment:
154
+ main_name: annotation_financebench_label_correct_answer
155
+ main_split: test
156
+ taxonomy:
157
+ task: question answering with numeric reasoning
158
+ what: financial reports
159
+ who: financial experts
160
+ when: 2015 to 2023
161
+ language: English
162
+
163
+ - name: banking77
164
+ display_name: BANKING77
165
+ short_display_name: BANKING77
166
+ description: BANKING77 is a benchmark for intent classification of customer service queries in the banking domain [(Casanueva et al., 2020)](https://aclanthology.org/2020.nlp4convai-1.5/).
167
+ metric_groups:
168
+ - accuracy
169
+ - efficiency
170
+ - general_information
171
+ environment:
172
+ main_name: quasi_exact_match
173
+ main_split: test
174
+ taxonomy:
175
+ task: text classification
176
+ what: customer service queries in the banking domain
177
+ who: banking customers
178
+ when: During or before 2020
179
+ language: English
180
+
181
+ # - name: financial_scenarios_ablations
182
+ # display_name: Financial Scenarios Ablations
183
+ # description: Scenarios for the financial domain with ablations
184
+ # category: All scenarios
185
+ # subgroups:
186
+ # - fin_qa
187
+ # adapter_keys_shown:
188
+ # - model
189
+ # - max_train_instances