crfm-helm 0.5.2__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (184)
  1. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/METADATA +29 -55
  2. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/RECORD +146 -134
  3. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
  5. helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
  6. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
  7. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
  8. helm/benchmark/adaptation/common_adapter_specs.py +2 -0
  9. helm/benchmark/annotation/anthropic_red_team_annotator.py +70 -0
  10. helm/benchmark/annotation/call_center_annotator.py +247 -0
  11. helm/benchmark/annotation/financebench_annotator.py +79 -0
  12. helm/benchmark/annotation/harm_bench_annotator.py +68 -0
  13. helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
  14. helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
  15. helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
  16. helm/benchmark/annotation/live_qa_annotator.py +32 -45
  17. helm/benchmark/annotation/medication_qa_annotator.py +31 -44
  18. helm/benchmark/annotation/model_as_judge.py +45 -0
  19. helm/benchmark/annotation/simple_safety_tests_annotator.py +64 -0
  20. helm/benchmark/annotation/xstest_annotator.py +110 -0
  21. helm/benchmark/metrics/annotation_metrics.py +108 -0
  22. helm/benchmark/metrics/bhasa_metrics.py +188 -0
  23. helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
  24. helm/benchmark/metrics/code_metrics_helper.py +11 -1
  25. helm/benchmark/metrics/safety_metrics.py +57 -0
  26. helm/benchmark/metrics/summac/model_summac.py +3 -3
  27. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
  28. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
  29. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  30. helm/benchmark/metrics/vision_language/image_utils.py +1 -1
  31. helm/benchmark/model_metadata_registry.py +3 -3
  32. helm/benchmark/presentation/test_run_entry.py +1 -0
  33. helm/benchmark/run.py +15 -0
  34. helm/benchmark/run_expander.py +56 -30
  35. helm/benchmark/run_specs/bhasa_run_specs.py +638 -0
  36. helm/benchmark/run_specs/call_center_run_specs.py +152 -0
  37. helm/benchmark/run_specs/decodingtrust_run_specs.py +8 -8
  38. helm/benchmark/run_specs/experimental_run_specs.py +52 -0
  39. helm/benchmark/run_specs/finance_run_specs.py +78 -1
  40. helm/benchmark/run_specs/safety_run_specs.py +154 -0
  41. helm/benchmark/run_specs/vlm_run_specs.py +92 -21
  42. helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
  43. helm/benchmark/scenarios/banking77_scenario.py +51 -0
  44. helm/benchmark/scenarios/bhasa_scenario.py +1798 -0
  45. helm/benchmark/scenarios/call_center_scenario.py +84 -0
  46. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
  47. helm/benchmark/scenarios/ewok_scenario.py +116 -0
  48. helm/benchmark/scenarios/fin_qa_scenario.py +2 -0
  49. helm/benchmark/scenarios/financebench_scenario.py +53 -0
  50. helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
  51. helm/benchmark/scenarios/scenario.py +1 -1
  52. helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
  53. helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
  54. helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
  55. helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
  56. helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
  57. helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
  58. helm/benchmark/scenarios/test_math_scenario.py +2 -8
  59. helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
  60. helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
  61. helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
  62. helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
  63. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
  64. helm/benchmark/scenarios/vision_language/bingo_scenario.py +2 -2
  65. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
  66. helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
  67. helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
  68. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
  69. helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
  70. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
  71. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
  72. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -3
  73. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -1
  74. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
  75. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
  76. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
  77. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +41 -12
  78. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
  79. helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
  80. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
  81. helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
  82. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
  83. helm/benchmark/scenarios/vision_language/pairs_scenario.py +1 -1
  84. helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
  85. helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
  86. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
  87. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +2 -2
  88. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +6 -3
  89. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
  90. helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
  91. helm/benchmark/scenarios/xstest_scenario.py +35 -0
  92. helm/benchmark/server.py +1 -6
  93. helm/benchmark/static/schema_air_bench.yaml +750 -750
  94. helm/benchmark/static/schema_bhasa.yaml +709 -0
  95. helm/benchmark/static/schema_call_center.yaml +232 -0
  96. helm/benchmark/static/schema_cleva.yaml +768 -0
  97. helm/benchmark/static/schema_decodingtrust.yaml +444 -0
  98. helm/benchmark/static/schema_ewok.yaml +367 -0
  99. helm/benchmark/static/schema_finance.yaml +55 -9
  100. helm/benchmark/static/{schema_image2structure.yaml → schema_image2struct.yaml} +231 -90
  101. helm/benchmark/static/schema_safety.yaml +247 -0
  102. helm/benchmark/static/schema_tables.yaml +124 -7
  103. helm/benchmark/static/schema_thai.yaml +21 -0
  104. helm/benchmark/static/schema_vhelm.yaml +96 -91
  105. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  106. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  107. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  108. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  109. helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
  110. helm/benchmark/static_build/assets/index-58f97dcd.js +10 -0
  111. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  112. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  113. helm/benchmark/static_build/index.html +2 -2
  114. helm/benchmark/window_services/test_openai_window_service.py +8 -8
  115. helm/clients/ai21_client.py +71 -1
  116. helm/clients/anthropic_client.py +7 -19
  117. helm/clients/huggingface_client.py +38 -37
  118. helm/clients/nvidia_nim_client.py +35 -0
  119. helm/clients/openai_client.py +2 -3
  120. helm/clients/palmyra_client.py +25 -0
  121. helm/clients/perspective_api_client.py +11 -6
  122. helm/clients/test_client.py +4 -6
  123. helm/clients/vision_language/open_flamingo_client.py +1 -2
  124. helm/clients/vision_language/palmyra_vision_client.py +28 -13
  125. helm/common/images_utils.py +6 -0
  126. helm/common/mongo_key_value_store.py +2 -1
  127. helm/common/request.py +16 -0
  128. helm/config/model_deployments.yaml +315 -332
  129. helm/config/model_metadata.yaml +384 -110
  130. helm/config/tokenizer_configs.yaml +116 -11
  131. helm/proxy/example_queries.py +14 -21
  132. helm/proxy/services/server_service.py +1 -2
  133. helm/proxy/token_counters/test_auto_token_counter.py +2 -2
  134. helm/tokenizers/ai21_tokenizer.py +51 -59
  135. helm/tokenizers/cohere_tokenizer.py +0 -75
  136. helm/tokenizers/huggingface_tokenizer.py +0 -1
  137. helm/tokenizers/test_ai21_tokenizer.py +48 -0
  138. helm/benchmark/static/benchmarking.css +0 -156
  139. helm/benchmark/static/benchmarking.js +0 -1705
  140. helm/benchmark/static/config.js +0 -3
  141. helm/benchmark/static/general.js +0 -122
  142. helm/benchmark/static/images/crfm-logo.png +0 -0
  143. helm/benchmark/static/images/helm-logo-simple.png +0 -0
  144. helm/benchmark/static/images/helm-logo.png +0 -0
  145. helm/benchmark/static/images/language-model-helm.png +0 -0
  146. helm/benchmark/static/images/organizations/ai21.png +0 -0
  147. helm/benchmark/static/images/organizations/anthropic.png +0 -0
  148. helm/benchmark/static/images/organizations/bigscience.png +0 -0
  149. helm/benchmark/static/images/organizations/cohere.png +0 -0
  150. helm/benchmark/static/images/organizations/eleutherai.png +0 -0
  151. helm/benchmark/static/images/organizations/google.png +0 -0
  152. helm/benchmark/static/images/organizations/meta.png +0 -0
  153. helm/benchmark/static/images/organizations/microsoft.png +0 -0
  154. helm/benchmark/static/images/organizations/nvidia.png +0 -0
  155. helm/benchmark/static/images/organizations/openai.png +0 -0
  156. helm/benchmark/static/images/organizations/together.png +0 -0
  157. helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
  158. helm/benchmark/static/images/organizations/yandex.png +0 -0
  159. helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
  160. helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
  161. helm/benchmark/static/index.html +0 -68
  162. helm/benchmark/static/info-icon.png +0 -0
  163. helm/benchmark/static/json-urls.js +0 -69
  164. helm/benchmark/static/plot-captions.js +0 -27
  165. helm/benchmark/static/utils.js +0 -285
  166. helm/benchmark/static_build/assets/index-30dbceba.js +0 -10
  167. helm/benchmark/static_build/assets/index-66b02d40.css +0 -1
  168. helm/benchmark/window_services/ai21_window_service.py +0 -247
  169. helm/benchmark/window_services/cohere_window_service.py +0 -101
  170. helm/benchmark/window_services/test_ai21_window_service.py +0 -163
  171. helm/benchmark/window_services/test_cohere_window_service.py +0 -75
  172. helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
  173. helm/benchmark/window_services/test_ice_window_service.py +0 -327
  174. helm/tokenizers/ice_tokenizer.py +0 -30
  175. helm/tokenizers/test_ice_tokenizer.py +0 -57
  176. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/LICENSE +0 -0
  177. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/entry_points.txt +0 -0
  178. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/top_level.txt +0 -0
  179. /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
  180. /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
  181. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/__init__.py +0 -0
  182. /helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +0 -0
  183. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/__init__.py +0 -0
  184. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0
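Several packages are renamed from image2structure to image2struct in this release (e.g., entries 13-15, 71-77, 100, and 179-184 above), so downstream code that imports from the old paths will break against 0.5.3. The following is a minimal compatibility sketch for a hypothetical consumer; the try/except fallback and the imported class name are illustrative assumptions, not part of the release itself.

    # Hypothetical downstream shim: prefer the 0.5.3 module path, fall back to the
    # pre-0.5.3 path so the same consumer code runs against either wheel.
    try:
        # crfm-helm >= 0.5.3: packages renamed image2structure -> image2struct
        from helm.benchmark.annotation.image2struct.latex_compiler_annotator import (
            LatexCompilerAnnotator,  # assumed class name, for illustration only
        )
    except ModuleNotFoundError:
        # crfm-helm <= 0.5.2: original package name
        from helm.benchmark.annotation.image2structure.latex_compiler_annotator import (
            LatexCompilerAnnotator,
        )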
helm/benchmark/static/{schema_image2structure.yaml → schema_image2struct.yaml}
@@ -84,9 +84,19 @@ metrics:
   description: Fraction of instances where the generated code compiles successfully.
   lower_is_better: false
  - name: fid_similarity
- display_name: FID similarity
- short_display_name: FID
- description: FID similarity (Fréchet Inception Distance) [(Heusel et al., 2017)](https://arxiv.org/abs/1706.08500) between an image generated by the model and the target image.
+ display_name: CIS
+ short_display_name: CIS
+ description: The cosine similarity between the Inception feature vectors.
+ lower_is_better: false
+ - name: lpips_similarity
+ display_name: LPIPS
+ short_display_name: LPIPS
+ description: The LPIPS distance between the generated image and the target image.
+ lower_is_better: false
+ - name: ssim_similarity
+ display_name: SSIM
+ short_display_name: SSIM
+ description: The SSIM similarity between the generated image and the target image.
   lower_is_better: false

   # Accuracy metrics:
@@ -165,6 +175,10 @@ metric_groups:
   split: ${main_split}
   - name: earth_mover_similarity
   split: ${main_split}
+ - name: lpips_similarity
+ split: ${main_split}
+ - name: ssim_similarity
+ split: ${main_split}

   - name: generation_text
   display_name: Generation (text)
@@ -175,7 +189,7 @@ metric_groups:
   ############################################################
   run_groups:
   - name: core_scenarios
- display_name: Image2Structure
+ display_name: Image2Struct
   description: Scenarios for evaluating the ability of Vision-Language models to generate structured outputs from images.
   category: All scenarios
   subgroups:
@@ -183,13 +197,13 @@ run_groups:
   - image2webpage
   - image2musicsheet

- - name: image2structure_real
- display_name: Image2Structure (Wild)
+ - name: image2struct_wild
+ display_name: Image2Struct (Wild)
   description: Scenarios for evaluating the ability of Vision-Language models to generate structured outputs from images. These scenarios contain images that do not have a ground truth.
   category: All scenarios
   subgroups:
- - image2latex_real
- - image2webpage_real
+ - image2latex_wild
+ - image2webpage_wild

   - name: image2latex
   display_name: Image2LaTeX
@@ -209,9 +223,9 @@ run_groups:
   when: "2024"
   language: English

- - name: image2latex_easy
- display_name: I2LaTeX (Easy)
- description: The 1/3 easiest examples of the Image2LaTeX benchmark according to a simple heuristic counting the number of characters in the ground truth LaTeX code.
+ - name: image2latex_equation
+ display_name: I2LaTeX (Equation)
+ description: The Image2LaTeX benchmark subset for converting images of mathematical equations to LaTeX.
   metric_groups:
   - accuracy_simple
   - compilation
@@ -223,14 +237,14 @@ run_groups:
   main_split: valid
   taxonomy:
   task: image-to-text
- what: mathematical equations, tables, algorithms, tikz
+ what: mathematical equations
   who: dataset authors
   when: "2024"
   language: English

- - name: image2latex_medium
- display_name: I2LaTeX (Medium)
- description: The 1/3 examples with medium diffulty of the Image2LaTeX benchmark according to a simple heuristic counting the number of characters in the ground truth LaTeX code.
+ - name: image2latex_table
+ display_name: I2LaTeX (Table)
+ description: The Image2LaTeX benchmark subset for converting images of tables to LaTeX.
   metric_groups:
   - accuracy_simple
   - compilation
@@ -242,14 +256,13 @@ run_groups:
   main_split: valid
   taxonomy:
   task: image-to-text
- what: mathematical equations, tables, algorithms, tikz
+ what: tables
   who: dataset authors
   when: "2024"
- language: English

- - name: image2latex_hard
- display_name: I2LaTeX (Hard)
- description: The 1/3 hardest examples of the Image2LaTeX benchmark according to a simple heuristic counting the number of characters in the ground truth LaTeX code.
+ - name: image2latex_algorithm
+ display_name: I2LaTeX (Algorithm)
+ description: The Image2LaTeX benchmark subset for converting images of algorithms to LaTeX.
   metric_groups:
   - accuracy_simple
   - compilation
@@ -261,12 +274,86 @@ run_groups:
   main_split: valid
   taxonomy:
   task: image-to-text
- what: mathematical equations, tables, algorithms, tikz
+ what: algorithms
   who: dataset authors
   when: "2024"
- language: English

- - name: image2latex_real
+ - name: image2latex_plot
+ display_name: I2LaTeX (Tikz)
+ description: The Image2LaTeX benchmark subset for converting images of tikz to LaTeX.
+ metric_groups:
+ - accuracy_simple
+ - compilation
+ - generation_image
+ - generation_text
+ - general_information
+ environment:
+ main_name: earth_mover_similarity
+ main_split: valid
+ taxonomy:
+ task: image-to-text
+ what: tikz (plots)
+ who: dataset authors
+ when: "2024"
+
+ # - name: image2latex_easy
+ # display_name: I2LaTeX (Easy)
+ # description: The 1/3 easiest examples of the Image2LaTeX benchmark according to a simple heuristic counting the number of characters in the ground truth LaTeX code.
+ # metric_groups:
+ # - accuracy_simple
+ # - compilation
+ # - generation_image
+ # - generation_text
+ # - general_information
+ # environment:
+ # main_name: earth_mover_similarity
+ # main_split: valid
+ # taxonomy:
+ # task: image-to-text
+ # what: mathematical equations, tables, algorithms, tikz
+ # who: dataset authors
+ # when: "2024"
+ # language: English
+
+ # - name: image2latex_medium
+ # display_name: I2LaTeX (Medium)
+ # description: The 1/3 examples with medium diffulty of the Image2LaTeX benchmark according to a simple heuristic counting the number of characters in the ground truth LaTeX code.
+ # metric_groups:
+ # - accuracy_simple
+ # - compilation
+ # - generation_image
+ # - generation_text
+ # - general_information
+ # environment:
+ # main_name: earth_mover_similarity
+ # main_split: valid
+ # taxonomy:
+ # task: image-to-text
+ # what: mathematical equations, tables, algorithms, tikz
+ # who: dataset authors
+ # when: "2024"
+ # language: English
+
+ # - name: image2latex_hard
+ # display_name: I2LaTeX (Hard)
+ # description: The 1/3 hardest examples of the Image2LaTeX benchmark according to a simple heuristic counting the number of characters in the ground truth LaTeX code.
+ # metric_groups:
+ # - accuracy_simple
+ # - compilation
+ # - generation_image
+ # - generation_text
+ # - general_information
+ # environment:
+ # main_name: earth_mover_similarity
+ # main_split: valid
+ # taxonomy:
+ # task: image-to-text
+ # what: mathematical equations, tables, algorithms, tikz
+ # who: dataset authors
+ # when: "2024"
+ # language: English
+
+ - name: image2latex_wild
   display_name: Image2LaTeX (Wild)
   description: Images of mathematical equations gathered from Wikipedia that do not have a LaTeX ground truth.
   metric_groups:
@@ -301,9 +388,9 @@ run_groups:
   when: "2024"
   language: English

- - name: image2webpage_easy
- display_name: I2webpage (Easy)
- description: The 1/3 easiest examples of the Image2webpage benchmark according to a simple heuristic counting the number of characters in the ground truth HTML/CSS/Javascript code.
+ - name: image2webpage_css
+ display_name: I2webpage (CSS)
+ description: The Image2webpage benchmark subset for converting images of webpages to code repo containing mostly CSS.
   metric_groups:
   - accuracy_simple
   - compilation
@@ -315,14 +402,13 @@ run_groups:
   main_split: valid
   taxonomy:
   task: image-to-text
- what: css, html, javascript
+ what: code (mostly CSS)
   who: dataset authors
   when: "2024"
- language: English

- - name: image2webpage_medium
- display_name: I2webpage (Medium)
- description: The 1/3 examples with medium diffulty of the Image2webpage benchmark according to a simple heuristic counting the number of characters in the ground truth HTML/CSS/Javascript code.
+ - name: image2webpage_html
+ display_name: I2webpage (HTML)
+ description: The Image2webpage benchmark subset for converting images of webpages to code repo containing mostly HTML.
   metric_groups:
   - accuracy_simple
   - compilation
@@ -334,14 +420,13 @@ run_groups:
   main_split: valid
   taxonomy:
   task: image-to-text
- what: css, html, javascript
+ what: code (mostly HTML)
   who: dataset authors
   when: "2024"
- language: English

- - name: image2webpage_hard
- display_name: I2webpage (Hard)
- description: The 1/3 hardest examples of the Image2webpage benchmark according to a simple heuristic counting the number of characters in the ground truth HTML/CSS/Javascript code.
+ - name: image2webpage_javascript
+ display_name: I2webpage (Javascript)
+ description: The Image2webpage benchmark subset for converting images of webpages to code repo containing mostly Javascript.
   metric_groups:
   - accuracy_simple
   - compilation
@@ -353,12 +438,68 @@ run_groups:
   main_split: valid
   taxonomy:
   task: image-to-text
- what: css, html, javascript
+ what: code (mostly Javascript)
   who: dataset authors
   when: "2024"
- language: English

- - name: image2webpage_real
+ # - name: image2webpage_easy
+ # display_name: I2webpage (Easy)
+ # description: The 1/3 easiest examples of the Image2webpage benchmark according to a simple heuristic counting the number of characters in the ground truth HTML/CSS/Javascript code.
+ # metric_groups:
+ # - accuracy_simple
+ # - compilation
+ # - generation_image
+ # - generation_text
+ # - general_information
+ # environment:
+ # main_name: earth_mover_similarity
+ # main_split: valid
+ # taxonomy:
+ # task: image-to-text
+ # what: css, html, javascript
+ # who: dataset authors
+ # when: "2024"
+ # language: English
+
+ # - name: image2webpage_medium
+ # display_name: I2webpage (Medium)
+ # description: The 1/3 examples with medium diffulty of the Image2webpage benchmark according to a simple heuristic counting the number of characters in the ground truth HTML/CSS/Javascript code.
+ # metric_groups:
+ # - accuracy_simple
+ # - compilation
+ # - generation_image
+ # - generation_text
+ # - general_information
+ # environment:
+ # main_name: earth_mover_similarity
+ # main_split: valid
+ # taxonomy:
+ # task: image-to-text
+ # what: css, html, javascript
+ # who: dataset authors
+ # when: "2024"
+ # language: English
+
+ # - name: image2webpage_hard
+ # display_name: I2webpage (Hard)
+ # description: The 1/3 hardest examples of the Image2webpage benchmark according to a simple heuristic counting the number of characters in the ground truth HTML/CSS/Javascript code.
+ # metric_groups:
+ # - accuracy_simple
+ # - compilation
+ # - generation_image
+ # - generation_text
+ # - general_information
+ # environment:
+ # main_name: earth_mover_similarity
+ # main_split: valid
+ # taxonomy:
+ # task: image-to-text
+ # what: css, html, javascript
+ # who: dataset authors
+ # when: "2024"
+ # language: English
+
+ - name: image2webpage_wild
   display_name: Image2webpage (Wild)
   description: Images of webpages gathered from the internet by taking sceenshots and so on that do not have a HTML/CSS/Javascript ground truth.
   metric_groups:
@@ -392,56 +533,56 @@ run_groups:
   when: "2024"
   language: English

- - name: image2musicsheet_easy
- display_name: I2musicsheet (Easy)
- description: The 1/3 easiest examples of the Image2musicsheet benchmark according to a simple heuristic counting the number of black pixels in the target image.
- metric_groups:
- - accuracy_simple
- - compilation
- - generation_image
- - general_information
- environment:
- main_name: earth_mover_similarity
- main_split: valid
- taxonomy:
- task: image-to-text
- what: music sheets
- who: dataset authors
- when: "2024"
- language: English
+ # - name: image2musicsheet_easy
+ # display_name: I2musicsheet (Easy)
+ # description: The 1/3 easiest examples of the Image2musicsheet benchmark according to a simple heuristic counting the number of black pixels in the target image.
+ # metric_groups:
+ # - accuracy_simple
+ # - compilation
+ # - generation_image
+ # - general_information
+ # environment:
+ # main_name: earth_mover_similarity
+ # main_split: valid
+ # taxonomy:
+ # task: image-to-text
+ # what: music sheets
+ # who: dataset authors
+ # when: "2024"
+ # language: English

- - name: image2musicsheet_medium
- display_name: I2musicsheet (Medium)
- description: The 1/3 examples with medium diffulty of the Image2musicsheet benchmark according to a simple heuristic counting the number of black pixels in the target image.
- metric_groups:
- - accuracy_simple
- - compilation
- - generation_image
- - general_information
- environment:
- main_name: earth_mover_similarity
- main_split: valid
- taxonomy:
- task: image-to-text
- what: music sheets
- who: dataset authors
- when: "2024"
- language: English
+ # - name: image2musicsheet_medium
+ # display_name: I2musicsheet (Medium)
+ # description: The 1/3 examples with medium diffulty of the Image2musicsheet benchmark according to a simple heuristic counting the number of black pixels in the target image.
+ # metric_groups:
+ # - accuracy_simple
+ # - compilation
+ # - generation_image
+ # - general_information
+ # environment:
+ # main_name: earth_mover_similarity
+ # main_split: valid
+ # taxonomy:
+ # task: image-to-text
+ # what: music sheets
+ # who: dataset authors
+ # when: "2024"
+ # language: English

- - name: image2musicsheet_hard
- display_name: I2musicsheet (Hard)
- description: The 1/3 hardest examples of the Image2musicsheet benchmark according to a simple heuristic counting the number of black pixels in the target image.
- metric_groups:
- - accuracy_simple
- - compilation
- - generation_image
- - general_information
- environment:
- main_name: earth_mover_similarity
- main_split: valid
- taxonomy:
- task: image-to-text
- what: music sheets
- who: dataset authors
- when: "2024"
- language: English
+ # - name: image2musicsheet_hard
+ # display_name: I2musicsheet (Hard)
+ # description: The 1/3 hardest examples of the Image2musicsheet benchmark according to a simple heuristic counting the number of black pixels in the target image.
+ # metric_groups:
+ # - accuracy_simple
+ # - compilation
+ # - generation_image
+ # - general_information
+ # environment:
+ # main_name: earth_mover_similarity
+ # main_split: valid
+ # taxonomy:
+ # task: image-to-text
+ # what: music sheets
+ # who: dataset authors
+ # when: "2024"
+ # language: English
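In this schema the FID-based metric is replaced by three image similarities: CIS (cosine similarity between Inception feature vectors), LPIPS, and SSIM. The actual metric code presumably lives in helm/benchmark/metrics/vision_language/image_metrics.py (entry 29 above). Below is a rough sketch of what two of these similarities compute, using scikit-image and NumPy rather than HELM's own code; the function names are illustrative, and LPIPS is omitted because it requires a learned perceptual network (e.g., the lpips package).

    import numpy as np
    from skimage.metrics import structural_similarity

    def ssim_similarity(generated: np.ndarray, target: np.ndarray) -> float:
        # SSIM between two RGB images with values in [0, 1]; higher is better.
        # Illustrative only, not the HELM implementation.
        return structural_similarity(generated, target, channel_axis=-1, data_range=1.0)

    def cosine_inception_similarity(feat_generated: np.ndarray, feat_target: np.ndarray) -> float:
        # Cosine similarity between two Inception feature vectors (the "CIS" idea above).
        num = float(np.dot(feat_generated, feat_target))
        denom = float(np.linalg.norm(feat_generated) * np.linalg.norm(feat_target)) or 1.0
        return num / denom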
helm/benchmark/static/schema_safety.yaml (new file)
@@ -0,0 +1,247 @@
+ ---
+ ############################################################
+ metrics:
+ # Infrastructure metrics:
+ - name: num_perplexity_tokens
+ display_name: '# tokens'
+ description: Average number of tokens in the predicted output (for language modeling, the input too).
+ - name: num_bytes
+ display_name: '# bytes'
+ description: Average number of bytes in the predicted output (for language modeling, the input too).
+
+ - name: num_references
+ display_name: '# ref'
+ description: Number of references.
+ - name: num_train_trials
+ display_name: '# trials'
+ description: Number of trials, where in each trial we choose an independent, random set of training instances.
+ - name: estimated_num_tokens_cost
+ display_name: 'cost'
+ description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
+ - name: num_prompt_tokens
+ display_name: '# prompt tokens'
+ description: Number of tokens in the prompt.
+ - name: num_prompt_characters
+ display_name: '# prompt chars'
+ description: Number of characters in the prompt.
+ - name: num_completion_tokens
+ display_name: '# completion tokens'
+ description: Actual number of completion tokens (over all completions).
+ - name: num_output_tokens
+ display_name: '# output tokens'
+ description: Actual number of output tokens.
+ - name: max_num_output_tokens
+ display_name: 'Max output tokens'
+ description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
+ - name: num_requests
+ display_name: '# requests'
+ description: Number of distinct API requests.
+ - name: num_instances
+ display_name: '# eval'
+ description: Number of evaluation instances.
+ - name: num_train_instances
+ display_name: '# train'
+ description: Number of training instances (e.g., in-context examples).
+ - name: prompt_truncated
+ display_name: truncated
+ description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
+ - name: finish_reason_length
+ display_name: finish b/c length
+ description: Fraction of instances where the the output was terminated because of the max tokens limit.
+ - name: finish_reason_stop
+ display_name: finish b/c stop
+ description: Fraction of instances where the the output was terminated because of the stop sequences.
+ - name: finish_reason_endoftext
+ display_name: finish b/c endoftext
+ description: Fraction of instances where the the output was terminated because the end of text token was generated.
+ - name: finish_reason_unknown
+ display_name: finish b/c unknown
+ description: Fraction of instances where the the output was terminated for unknown reasons.
+ - name: num_completions
+ display_name: '# completions'
+ description: Number of completions.
+ - name: predicted_index
+ display_name: Predicted index
+ description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
+
+ # Accuracy metrics:
+ - name: exact_match
+ display_name: Exact match
+ short_display_name: EM
+ description: Fraction of instances that the predicted output matches a correct reference exactly.
+ lower_is_better: false
+ - name: f1_score
+ display_name: F1
+ description: Average F1 score in terms of word overlap between the model output and correct reference.
+ lower_is_better: false
+ - name: safety_score
+ display_name: LM Evaluated Safety score
+ description: LM Evaluated Safety score (mean of Claude Sonnet3.5 and GPT4o ratings)
+ lower_is_better: false
+ - name: bbq_accuracy
+ display_name: BBQ accuracy
+ description: BBQ accuracy
+ lower_is_better: false
+
+ # Toxicity metrics
+ - name: expected_max_toxicity
+ display_name: Average max toxicity
+ short_display_name: Avg. max toxicity
+ lower_is_better: true
+ description: Average max toxicity score across all completions (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
+ - name: max_toxicity_probability
+ display_name: max toxicity
+ lower_is_better: true
+ description: Fraction of instances where most toxic completion is above 0.5 toxicity (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
+ - name: toxic_frac
+ display_name: Toxic fraction
+ lower_is_better: true
+ description: Fraction of model outputs that are toxic (based on the PerspectiveAPI toxicity classifier).
+
+
+ ############################################################
+ perturbations: []
+
+ ############################################################
+ metric_groups:
+ - name: accuracy
+ display_name: Accuracy
+ metrics:
+ - name: ${main_name}
+ split: ${main_split}
+
+ - name: efficiency
+ display_name: Efficiency
+ metrics:
+ - name: inference_runtime
+ split: ${main_split}
+
+ - name: general_information
+ display_name: General information
+ hide_win_rates: true
+ metrics:
+ - name: num_instances
+ split: ${main_split}
+ - name: num_train_instances
+ split: ${main_split}
+ - name: prompt_truncated
+ split: ${main_split}
+ - name: num_prompt_tokens
+ split: ${main_split}
+ - name: num_output_tokens
+ split: ${main_split}
+
+ - name: toxicity
+ display_name: Toxicity
+ metrics:
+ - name: toxic_frac
+ split: ${main_split}
+
+ - name: bbq_metrics
+ display_name: BBQ metrics
+ description: Metrics used for the BBQ bias benchmark.
+ metrics:
+ - name: bbq_metric_ambiguous_bias
+ split: ${main_split}
+ - name: bbq_metric_unambiguous_bias
+ split: ${main_split}
+
+ ############################################################
+ run_groups:
+ - name: safety_scenarios
+ display_name: Safety Scenarios
+ description: Scenarios for the model safety
+ category: All scenarios
+ subgroups:
+ - harm_bench
+ - simple_safety_tests
+ - bbq
+ - anthropic_red_team
+ - xstest
+
+ - name: harm_bench
+ display_name: HarmBench
+ description: HarmBench
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: safety_score
+ main_split: test
+ taxonomy:
+ task: question answering
+ what: n/a
+ who: n/a
+ when: n/a
+ language: English
+
+ - name: simple_safety_tests
+ display_name: SimpleSafetyTests
+ description: SimpleSafetyTests
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: safety_score
+ main_split: test
+ taxonomy:
+ task: question answering
+ what: n/a
+ who: n/a
+ when: n/a
+ language: English
+
+ - name: xstest
+ display_name: XSTest
+ description: XSTest
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: safety_score
+ main_split: test
+ taxonomy:
+ task: question answering
+ what: n/a
+ who: n/a
+ when: n/a
+ language: English
+
+ - name: bbq
+ display_name: BBQ
+ description: The Bias Benchmark for Question Answering (BBQ) for measuring social bias in question answering in ambiguous and unambigous context [(Parrish et al., 2022)](https://aclanthology.org/2022.findings-acl.165/).
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ - bbq_metrics
+ environment:
+ main_name: bbq_accuracy
+ main_split: test
+ taxonomy:
+ task: question answering
+ what: n/a
+ who: n/a
+ when: n/a
+ language: English
+
+ - name: anthropic_red_team
+ display_name: Anthropic Red Team
+ short_display_name: Anthropic Red Team
+ description: Anthropic Red Team
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: safety_score
+ main_split: test
+ taxonomy:
+ task: question answering
+ what: "?"
+ who: "?"
+ when: "?"
+ language: English
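The safety_score metric defined above is described as the mean of two LLM-judge ratings (Claude 3.5 Sonnet and GPT-4o), produced by the new annotators added in this release (entries 9, 12, 19, 20) and aggregated in helm/benchmark/metrics/safety_metrics.py (entry 25). A minimal sketch of that aggregation step follows; it assumes each judge emits one numeric rating per instance, and the dictionary layout and function name are illustrative rather than taken from the release.

    from statistics import mean
    from typing import Mapping, Sequence

    def mean_judge_safety_score(judge_ratings: Mapping[str, Sequence[float]]) -> float:
        # Average each judge's ratings, then average across judges.
        # Illustrative only; the real logic lives in helm/benchmark/metrics/safety_metrics.py.
        per_judge = [mean(ratings) for ratings in judge_ratings.values() if ratings]
        return mean(per_judge) if per_judge else 0.0

    # Example with two hypothetical judges rating three instances each.
    print(mean_judge_safety_score({"claude-3.5-sonnet": [1.0, 0.5, 1.0], "gpt-4o": [1.0, 1.0, 0.5]}))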