crfm-helm 0.5.2__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in the public registry.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (184)
  1. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/METADATA +29 -55
  2. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/RECORD +146 -134
  3. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
  5. helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
  6. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
  7. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
  8. helm/benchmark/adaptation/common_adapter_specs.py +2 -0
  9. helm/benchmark/annotation/anthropic_red_team_annotator.py +70 -0
  10. helm/benchmark/annotation/call_center_annotator.py +247 -0
  11. helm/benchmark/annotation/financebench_annotator.py +79 -0
  12. helm/benchmark/annotation/harm_bench_annotator.py +68 -0
  13. helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2 (rename; see the import note after this list)
  14. helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
  15. helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
  16. helm/benchmark/annotation/live_qa_annotator.py +32 -45
  17. helm/benchmark/annotation/medication_qa_annotator.py +31 -44
  18. helm/benchmark/annotation/model_as_judge.py +45 -0
  19. helm/benchmark/annotation/simple_safety_tests_annotator.py +64 -0
  20. helm/benchmark/annotation/xstest_annotator.py +110 -0
  21. helm/benchmark/metrics/annotation_metrics.py +108 -0
  22. helm/benchmark/metrics/bhasa_metrics.py +188 -0
  23. helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
  24. helm/benchmark/metrics/code_metrics_helper.py +11 -1
  25. helm/benchmark/metrics/safety_metrics.py +57 -0
  26. helm/benchmark/metrics/summac/model_summac.py +3 -3
  27. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
  28. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
  29. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  30. helm/benchmark/metrics/vision_language/image_utils.py +1 -1
  31. helm/benchmark/model_metadata_registry.py +3 -3
  32. helm/benchmark/presentation/test_run_entry.py +1 -0
  33. helm/benchmark/run.py +15 -0
  34. helm/benchmark/run_expander.py +56 -30
  35. helm/benchmark/run_specs/bhasa_run_specs.py +638 -0
  36. helm/benchmark/run_specs/call_center_run_specs.py +152 -0
  37. helm/benchmark/run_specs/decodingtrust_run_specs.py +8 -8
  38. helm/benchmark/run_specs/experimental_run_specs.py +52 -0
  39. helm/benchmark/run_specs/finance_run_specs.py +78 -1
  40. helm/benchmark/run_specs/safety_run_specs.py +154 -0
  41. helm/benchmark/run_specs/vlm_run_specs.py +92 -21
  42. helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
  43. helm/benchmark/scenarios/banking77_scenario.py +51 -0
  44. helm/benchmark/scenarios/bhasa_scenario.py +1798 -0
  45. helm/benchmark/scenarios/call_center_scenario.py +84 -0
  46. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
  47. helm/benchmark/scenarios/ewok_scenario.py +116 -0
  48. helm/benchmark/scenarios/fin_qa_scenario.py +2 -0
  49. helm/benchmark/scenarios/financebench_scenario.py +53 -0
  50. helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
  51. helm/benchmark/scenarios/scenario.py +1 -1
  52. helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
  53. helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
  54. helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
  55. helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
  56. helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
  57. helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
  58. helm/benchmark/scenarios/test_math_scenario.py +2 -8
  59. helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
  60. helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
  61. helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
  62. helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
  63. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
  64. helm/benchmark/scenarios/vision_language/bingo_scenario.py +2 -2
  65. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
  66. helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
  67. helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
  68. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
  69. helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
  70. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
  71. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
  72. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -3
  73. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -1
  74. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
  75. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
  76. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
  77. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +41 -12
  78. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
  79. helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
  80. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
  81. helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
  82. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
  83. helm/benchmark/scenarios/vision_language/pairs_scenario.py +1 -1
  84. helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
  85. helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
  86. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
  87. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +2 -2
  88. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +6 -3
  89. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
  90. helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
  91. helm/benchmark/scenarios/xstest_scenario.py +35 -0
  92. helm/benchmark/server.py +1 -6
  93. helm/benchmark/static/schema_air_bench.yaml +750 -750
  94. helm/benchmark/static/schema_bhasa.yaml +709 -0
  95. helm/benchmark/static/schema_call_center.yaml +232 -0
  96. helm/benchmark/static/schema_cleva.yaml +768 -0
  97. helm/benchmark/static/schema_decodingtrust.yaml +444 -0
  98. helm/benchmark/static/schema_ewok.yaml +367 -0
  99. helm/benchmark/static/schema_finance.yaml +55 -9
  100. helm/benchmark/static/{schema_image2structure.yaml → schema_image2struct.yaml} +231 -90
  101. helm/benchmark/static/schema_safety.yaml +247 -0
  102. helm/benchmark/static/schema_tables.yaml +124 -7
  103. helm/benchmark/static/schema_thai.yaml +21 -0
  104. helm/benchmark/static/schema_vhelm.yaml +96 -91
  105. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  106. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  107. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  108. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  109. helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
  110. helm/benchmark/static_build/assets/index-58f97dcd.js +10 -0
  111. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  112. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  113. helm/benchmark/static_build/index.html +2 -2
  114. helm/benchmark/window_services/test_openai_window_service.py +8 -8
  115. helm/clients/ai21_client.py +71 -1
  116. helm/clients/anthropic_client.py +7 -19
  117. helm/clients/huggingface_client.py +38 -37
  118. helm/clients/nvidia_nim_client.py +35 -0
  119. helm/clients/openai_client.py +2 -3
  120. helm/clients/palmyra_client.py +25 -0
  121. helm/clients/perspective_api_client.py +11 -6
  122. helm/clients/test_client.py +4 -6
  123. helm/clients/vision_language/open_flamingo_client.py +1 -2
  124. helm/clients/vision_language/palmyra_vision_client.py +28 -13
  125. helm/common/images_utils.py +6 -0
  126. helm/common/mongo_key_value_store.py +2 -1
  127. helm/common/request.py +16 -0
  128. helm/config/model_deployments.yaml +315 -332
  129. helm/config/model_metadata.yaml +384 -110
  130. helm/config/tokenizer_configs.yaml +116 -11
  131. helm/proxy/example_queries.py +14 -21
  132. helm/proxy/services/server_service.py +1 -2
  133. helm/proxy/token_counters/test_auto_token_counter.py +2 -2
  134. helm/tokenizers/ai21_tokenizer.py +51 -59
  135. helm/tokenizers/cohere_tokenizer.py +0 -75
  136. helm/tokenizers/huggingface_tokenizer.py +0 -1
  137. helm/tokenizers/test_ai21_tokenizer.py +48 -0
  138. helm/benchmark/static/benchmarking.css +0 -156
  139. helm/benchmark/static/benchmarking.js +0 -1705
  140. helm/benchmark/static/config.js +0 -3
  141. helm/benchmark/static/general.js +0 -122
  142. helm/benchmark/static/images/crfm-logo.png +0 -0
  143. helm/benchmark/static/images/helm-logo-simple.png +0 -0
  144. helm/benchmark/static/images/helm-logo.png +0 -0
  145. helm/benchmark/static/images/language-model-helm.png +0 -0
  146. helm/benchmark/static/images/organizations/ai21.png +0 -0
  147. helm/benchmark/static/images/organizations/anthropic.png +0 -0
  148. helm/benchmark/static/images/organizations/bigscience.png +0 -0
  149. helm/benchmark/static/images/organizations/cohere.png +0 -0
  150. helm/benchmark/static/images/organizations/eleutherai.png +0 -0
  151. helm/benchmark/static/images/organizations/google.png +0 -0
  152. helm/benchmark/static/images/organizations/meta.png +0 -0
  153. helm/benchmark/static/images/organizations/microsoft.png +0 -0
  154. helm/benchmark/static/images/organizations/nvidia.png +0 -0
  155. helm/benchmark/static/images/organizations/openai.png +0 -0
  156. helm/benchmark/static/images/organizations/together.png +0 -0
  157. helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
  158. helm/benchmark/static/images/organizations/yandex.png +0 -0
  159. helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
  160. helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
  161. helm/benchmark/static/index.html +0 -68
  162. helm/benchmark/static/info-icon.png +0 -0
  163. helm/benchmark/static/json-urls.js +0 -69
  164. helm/benchmark/static/plot-captions.js +0 -27
  165. helm/benchmark/static/utils.js +0 -285
  166. helm/benchmark/static_build/assets/index-30dbceba.js +0 -10
  167. helm/benchmark/static_build/assets/index-66b02d40.css +0 -1
  168. helm/benchmark/window_services/ai21_window_service.py +0 -247
  169. helm/benchmark/window_services/cohere_window_service.py +0 -101
  170. helm/benchmark/window_services/test_ai21_window_service.py +0 -163
  171. helm/benchmark/window_services/test_cohere_window_service.py +0 -75
  172. helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
  173. helm/benchmark/window_services/test_ice_window_service.py +0 -327
  174. helm/tokenizers/ice_tokenizer.py +0 -30
  175. helm/tokenizers/test_ice_tokenizer.py +0 -57
  176. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/LICENSE +0 -0
  177. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/entry_points.txt +0 -0
  178. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/top_level.txt +0 -0
  179. /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
  180. /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
  181. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/__init__.py +0 -0
  182. /helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +0 -0
  183. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/__init__.py +0 -0
  184. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0
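
Note: the image2structure package is renamed to image2struct in this release (entries 13-15, 71-77, 100, and 179-184 above). Downstream code importing the old paths needs updating; a minimal sketch (the LatexScenario class name is illustrative, inferred from the renamed file name):

    # Before (crfm-helm 0.5.2): old package path.
    # from helm.benchmark.scenarios.vision_language.image2structure.latex_scenario import LatexScenario

    # After (crfm-helm 0.5.3): same module under the renamed package.
    from helm.benchmark.scenarios.vision_language.image2struct.latex_scenario import LatexScenario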
helm/benchmark/static/schema_tables.yaml
@@ -99,47 +99,94 @@ metrics:
     display_name: METEOR
     short_display_name: METEOR
     description: METEOR
+    lower_is_better: false
   - name: f1
-    display_name: F1
-    short_display_name: F1
-    description: F1
+    display_name: BERTScore F1
+    short_display_name: BERTScore F1
+    description: BERTScore F1
+    lower_is_better: false
   - name: precision
     display_name: Precision
     short_display_name: Precision
     description: Precision
+    lower_is_better: false
   - name: recall
     display_name: Recall
     short_display_name: Recall
     description: Recall
+    lower_is_better: false
   - name: rouge1
     display_name: ROUGE-1
     short_display_name: ROUGE-1
     description: ROUGE-1
+    lower_is_better: false
   - name: rouge2
     display_name: ROUGE-2
     short_display_name: ROUGE-2
     description: ROUGE-2
+    lower_is_better: false
   - name: rougeL
     display_name: ROUGE-L
     short_display_name: ROUGE-L
     description: ROUGE-L
+    lower_is_better: false
   - name: rougeLsum
     display_name: ROUGE-Lsum
     short_display_name: ROUGE-Lsum
     description: ROUGE-Lsum
+    lower_is_better: false
   - name: bleu
     display_name: BLEU
     short_display_name: BLEU
     description: BLEU
+    lower_is_better: false
+  - name: accuracy
+    display_name: Accuracy
+    short_display_name: Accuracy
+    description: Accuracy
+    lower_is_better: false
+  - name: f1_macro
+    display_name: Macro F1
+    short_display_name: Macro F1
+    description: Macro F1
+    lower_is_better: false
+  - name: f1_micro
+    display_name: Micro F1
+    short_display_name: Micro F1
+    description: Micro F1
+    lower_is_better: false
+  - name: unsorted_list_exact_match
+    display_name: Unsorted List Exact Match
+    short_display_name: Exact Match
+    description: Unsorted List Exact Match
+    lower_is_better: false
+
+  # FinQA Accuracy
+  - name: program_accuracy
+    display_name: Program Accuracy
+    short_display_name: Program Accuracy
+    description: Program Accuracy
+    lower_is_better: false
+  - name: execution_accuracy
+    display_name: Execution Accuracy
+    short_display_name: Execution Accuracy
+    description: Execution Accuracy
+    lower_is_better: false
 
 perturbations: []
 
 metric_groups:
-  - name: accuracy
-    display_name: Accuracy
+  - name: main_metrics
+    display_name: Main Metrics
+    metrics:
+      - name: ${main_name}
+        split: __all__
+
+  - name: generation_metrics
+    display_name: Other Generation Metrics
     hide_win_rates: true
     metrics:
-      - name: meteor
+      - name: f1
         split: __all__
       - name: rouge1
         split: __all__
@@ -152,6 +199,17 @@ metric_groups:
       - name: bleu
         split: __all__
 
+  - name: classification_metrics
+    display_name: Classification Metrics
+    hide_win_rates: true
+    metrics:
+      - name: accuracy
+        split: __all__
+      - name: f1_macro
+        split: __all__
+      - name: f1_micro
+        split: __all__
+
   - name: efficiency
     display_name: Efficiency
     metrics:
@@ -180,13 +238,17 @@ run_groups:
     category: All Scenarios
     subgroups:
       - unitxt_cards.numeric_nlg
+      - unitxt_cards.tab_fact
+      - unitxt_cards.wikitq
+      - unitxt_cards.fin_qa
 
   - name: unitxt_cards.numeric_nlg
     display_name: NumericNLG
     short_display_name: NumericNLG
     description: "NumericNLG is a dataset for numerical table-to-text generation using pairs of a table and a paragraph of a table description with richer inference from scientific papers."
     metric_groups:
-      - accuracy
+      - main_metrics
+      - generation_metrics
       - efficiency
       - general_information
     environment:
@@ -198,3 +260,58 @@ run_groups:
       who: "?"
       when: "?"
       language: English
+
+  - name: unitxt_cards.tab_fact
+    display_name: TabFact
+    short_display_name: TabFact
+    description: "tab_fact is a large-scale dataset for the task of fact-checking on tables."
+    metric_groups:
+      - main_metrics
+      - classification_metrics
+      - efficiency
+      - general_information
+    environment:
+      main_name: accuracy
+      main_split: test
+    taxonomy:
+      task: "?"
+      what: "?"
+      who: "?"
+      when: "?"
+      language: English
+
+  - name: unitxt_cards.wikitq
+    display_name: WikiTableQuestions
+    short_display_name: WikiTableQuestions
+    description: "This WikiTableQuestions dataset is a large-scale dataset for the task of question answering on semi-structured tables."
+    metric_groups:
+      - main_metrics
+      - classification_metrics
+      - efficiency
+      - general_information
+    environment:
+      main_name: unsorted_list_exact_match
+      main_split: test
+    taxonomy:
+      task: "?"
+      what: "?"
+      who: "?"
+      when: "?"
+      language: English
+
+  - name: unitxt_cards.fin_qa
+    display_name: FinQA
+    description: The FinQA benchmark for numeric reasoning over financial data, with question answering pairs written by financial experts over financial reports [(Chen et al., 2021)](https://arxiv.org/abs/2109.00122/).
+    metric_groups:
+      - main_metrics
+      - efficiency
+      - general_information
+    environment:
+      main_name: program_accuracy
+      main_split: test
+    taxonomy:
+      task: question answering with numeric reasoning
+      what: financial reports
+      who: financial experts
+      when: 1999 to 2019
+      language: English
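
The new main_metrics group above resolves the ${main_name} placeholder from each run group's environment block (e.g. main_name: program_accuracy for unitxt_cards.fin_qa). A rough sketch of that substitution, hand-rolled for illustration rather than HELM's actual summarizer code (the schema path assumes a source checkout):

    # Resolve ${main_name} placeholders in metric groups using each run
    # group's environment block. Illustrative only.
    from string import Template

    import yaml

    with open("helm/benchmark/static/schema_tables.yaml") as f:
        schema = yaml.safe_load(f)

    metric_groups = {g["name"]: g for g in schema["metric_groups"]}

    for run_group in schema["run_groups"]:
        env = run_group.get("environment", {})
        for group_name in run_group.get("metric_groups", []):
            for metric in metric_groups.get(group_name, {}).get("metrics", []):
                # e.g. ${main_name} -> program_accuracy for unitxt_cards.fin_qa
                resolved = Template(metric["name"]).safe_substitute(env)
                print(run_group["name"], group_name, resolved)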
helm/benchmark/static/schema_thai.yaml
@@ -78,6 +78,7 @@ perturbations: []
 metric_groups:
   - name: accuracy
     display_name: Accuracy
+    hide_win_rates: true
     metrics:
       - name: ${main_name}
         split: ${main_split}
@@ -111,12 +112,32 @@ run_groups:
     description: Thai-language scenarios
     category: All scenarios
     subgroups:
+      - thai_exam
       - thai_exam_onet
       - thai_exam_ic
       - thai_exam_tgat
       - thai_exam_tpat1
       - thai_exam_a_level
 
+
+  - name: thai_exam
+    display_name: ThaiExam
+    description: >
+      Macro-averaged accuracy on all ThaiExam examinations.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: question answering
+      what: n/a
+      who: n/a
+      when: "?"
+      language: Thai and English
+
   - name: thai_exam_onet
     display_name: ONET
     description: >
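
The new thai_exam aggregate reports macro-averaged accuracy: each of the five exams contributes equally, regardless of how many questions it contains. A toy illustration (the scores are made up):

    # Macro average: unweighted mean over exams, not over questions.
    exam_accuracy = {
        "onet": 0.61,  # made-up score, as are the rest
        "ic": 0.55,
        "tgat": 0.48,
        "tpat1": 0.52,
        "a_level": 0.44,
    }
    macro_accuracy = sum(exam_accuracy.values()) / len(exam_accuracy)
    print(f"ThaiExam macro-averaged accuracy: {macro_accuracy:.3f}")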