crfm-helm 0.5.2__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (209) hide show
  1. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/METADATA +81 -112
  2. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/RECORD +165 -155
  3. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
  5. helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
  6. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
  7. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
  8. helm/benchmark/adaptation/common_adapter_specs.py +2 -0
  9. helm/benchmark/annotation/anthropic_red_team_annotator.py +57 -0
  10. helm/benchmark/annotation/call_center_annotator.py +258 -0
  11. helm/benchmark/annotation/financebench_annotator.py +79 -0
  12. helm/benchmark/annotation/harm_bench_annotator.py +55 -0
  13. helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
  14. helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
  15. helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
  16. helm/benchmark/annotation/live_qa_annotator.py +37 -45
  17. helm/benchmark/annotation/medication_qa_annotator.py +36 -44
  18. helm/benchmark/annotation/model_as_judge.py +96 -0
  19. helm/benchmark/annotation/simple_safety_tests_annotator.py +50 -0
  20. helm/benchmark/annotation/xstest_annotator.py +100 -0
  21. helm/benchmark/metrics/annotation_metrics.py +108 -0
  22. helm/benchmark/metrics/bhasa_metrics.py +188 -0
  23. helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
  24. helm/benchmark/metrics/code_metrics_helper.py +11 -1
  25. helm/benchmark/metrics/safety_metrics.py +79 -0
  26. helm/benchmark/metrics/summac/model_summac.py +3 -3
  27. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
  28. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
  29. helm/benchmark/metrics/unitxt_metrics.py +17 -3
  30. helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
  31. helm/benchmark/metrics/vision_language/image_utils.py +1 -1
  32. helm/benchmark/model_metadata_registry.py +3 -3
  33. helm/benchmark/presentation/create_plots.py +1 -1
  34. helm/benchmark/presentation/schema.py +3 -0
  35. helm/benchmark/presentation/summarize.py +106 -256
  36. helm/benchmark/presentation/test_run_entry.py +1 -0
  37. helm/benchmark/presentation/test_summarize.py +145 -3
  38. helm/benchmark/run.py +15 -0
  39. helm/benchmark/run_expander.py +83 -30
  40. helm/benchmark/run_specs/bhasa_run_specs.py +652 -0
  41. helm/benchmark/run_specs/call_center_run_specs.py +152 -0
  42. helm/benchmark/run_specs/decodingtrust_run_specs.py +8 -8
  43. helm/benchmark/run_specs/experimental_run_specs.py +52 -0
  44. helm/benchmark/run_specs/finance_run_specs.py +82 -1
  45. helm/benchmark/run_specs/safety_run_specs.py +154 -0
  46. helm/benchmark/run_specs/vlm_run_specs.py +100 -24
  47. helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
  48. helm/benchmark/scenarios/banking77_scenario.py +51 -0
  49. helm/benchmark/scenarios/bhasa_scenario.py +1942 -0
  50. helm/benchmark/scenarios/call_center_scenario.py +84 -0
  51. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
  52. helm/benchmark/scenarios/ewok_scenario.py +116 -0
  53. helm/benchmark/scenarios/fin_qa_scenario.py +2 -0
  54. helm/benchmark/scenarios/financebench_scenario.py +53 -0
  55. helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
  56. helm/benchmark/scenarios/raft_scenario.py +1 -1
  57. helm/benchmark/scenarios/scenario.py +1 -1
  58. helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
  59. helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
  60. helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
  61. helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
  62. helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
  63. helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
  64. helm/benchmark/scenarios/test_math_scenario.py +2 -8
  65. helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
  66. helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
  67. helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
  68. helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
  69. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
  70. helm/benchmark/scenarios/vision_language/bingo_scenario.py +2 -2
  71. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
  72. helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
  73. helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
  74. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
  75. helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
  76. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
  77. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
  78. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -3
  79. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -1
  80. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
  81. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
  82. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
  83. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +41 -12
  84. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
  85. helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
  86. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
  87. helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
  88. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
  89. helm/benchmark/scenarios/vision_language/pairs_scenario.py +1 -1
  90. helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
  91. helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
  92. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
  93. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +2 -2
  94. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +6 -3
  95. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
  96. helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
  97. helm/benchmark/scenarios/xstest_scenario.py +35 -0
  98. helm/benchmark/server.py +1 -6
  99. helm/benchmark/static/schema_air_bench.yaml +750 -750
  100. helm/benchmark/static/schema_bhasa.yaml +709 -0
  101. helm/benchmark/static/schema_call_center.yaml +232 -0
  102. helm/benchmark/static/schema_cleva.yaml +768 -0
  103. helm/benchmark/static/schema_decodingtrust.yaml +444 -0
  104. helm/benchmark/static/schema_ewok.yaml +367 -0
  105. helm/benchmark/static/schema_finance.yaml +55 -9
  106. helm/benchmark/static/{schema_image2structure.yaml → schema_image2struct.yaml} +231 -90
  107. helm/benchmark/static/schema_legal.yaml +566 -0
  108. helm/benchmark/static/schema_safety.yaml +266 -0
  109. helm/benchmark/static/schema_tables.yaml +149 -8
  110. helm/benchmark/static/schema_thai.yaml +21 -0
  111. helm/benchmark/static/schema_vhelm.yaml +137 -101
  112. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  113. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  114. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  115. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  116. helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
  117. helm/benchmark/static_build/assets/index-3ee38b3d.js +10 -0
  118. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  119. helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
  120. helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
  121. helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
  122. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  123. helm/benchmark/static_build/index.html +2 -2
  124. helm/benchmark/window_services/test_openai_window_service.py +8 -8
  125. helm/benchmark/window_services/tokenizer_service.py +0 -5
  126. helm/clients/ai21_client.py +71 -1
  127. helm/clients/anthropic_client.py +7 -19
  128. helm/clients/huggingface_client.py +38 -37
  129. helm/clients/nvidia_nim_client.py +35 -0
  130. helm/clients/openai_client.py +18 -4
  131. helm/clients/palmyra_client.py +24 -0
  132. helm/clients/perspective_api_client.py +11 -6
  133. helm/clients/test_client.py +4 -6
  134. helm/clients/together_client.py +22 -0
  135. helm/clients/vision_language/open_flamingo_client.py +1 -2
  136. helm/clients/vision_language/palmyra_vision_client.py +28 -13
  137. helm/common/cache.py +8 -30
  138. helm/common/images_utils.py +6 -0
  139. helm/common/key_value_store.py +9 -9
  140. helm/common/mongo_key_value_store.py +5 -4
  141. helm/common/request.py +16 -0
  142. helm/common/test_cache.py +1 -48
  143. helm/common/tokenization_request.py +0 -9
  144. helm/config/model_deployments.yaml +444 -329
  145. helm/config/model_metadata.yaml +513 -111
  146. helm/config/tokenizer_configs.yaml +140 -11
  147. helm/proxy/example_queries.py +14 -21
  148. helm/proxy/server.py +0 -9
  149. helm/proxy/services/remote_service.py +0 -6
  150. helm/proxy/services/server_service.py +6 -20
  151. helm/proxy/services/service.py +0 -6
  152. helm/proxy/token_counters/test_auto_token_counter.py +2 -2
  153. helm/tokenizers/ai21_tokenizer.py +51 -59
  154. helm/tokenizers/cohere_tokenizer.py +0 -75
  155. helm/tokenizers/huggingface_tokenizer.py +0 -1
  156. helm/tokenizers/test_ai21_tokenizer.py +48 -0
  157. helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
  158. helm/benchmark/data_overlap/export_scenario_text.py +0 -119
  159. helm/benchmark/data_overlap/light_scenario.py +0 -60
  160. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  161. helm/benchmark/static/benchmarking.css +0 -156
  162. helm/benchmark/static/benchmarking.js +0 -1705
  163. helm/benchmark/static/config.js +0 -3
  164. helm/benchmark/static/general.js +0 -122
  165. helm/benchmark/static/images/crfm-logo.png +0 -0
  166. helm/benchmark/static/images/helm-logo-simple.png +0 -0
  167. helm/benchmark/static/images/helm-logo.png +0 -0
  168. helm/benchmark/static/images/language-model-helm.png +0 -0
  169. helm/benchmark/static/images/organizations/ai21.png +0 -0
  170. helm/benchmark/static/images/organizations/anthropic.png +0 -0
  171. helm/benchmark/static/images/organizations/bigscience.png +0 -0
  172. helm/benchmark/static/images/organizations/cohere.png +0 -0
  173. helm/benchmark/static/images/organizations/eleutherai.png +0 -0
  174. helm/benchmark/static/images/organizations/google.png +0 -0
  175. helm/benchmark/static/images/organizations/meta.png +0 -0
  176. helm/benchmark/static/images/organizations/microsoft.png +0 -0
  177. helm/benchmark/static/images/organizations/nvidia.png +0 -0
  178. helm/benchmark/static/images/organizations/openai.png +0 -0
  179. helm/benchmark/static/images/organizations/together.png +0 -0
  180. helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
  181. helm/benchmark/static/images/organizations/yandex.png +0 -0
  182. helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
  183. helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
  184. helm/benchmark/static/index.html +0 -68
  185. helm/benchmark/static/info-icon.png +0 -0
  186. helm/benchmark/static/json-urls.js +0 -69
  187. helm/benchmark/static/plot-captions.js +0 -27
  188. helm/benchmark/static/utils.js +0 -285
  189. helm/benchmark/static_build/assets/index-30dbceba.js +0 -10
  190. helm/benchmark/static_build/assets/index-66b02d40.css +0 -1
  191. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  192. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  193. helm/benchmark/window_services/ai21_window_service.py +0 -247
  194. helm/benchmark/window_services/cohere_window_service.py +0 -101
  195. helm/benchmark/window_services/test_ai21_window_service.py +0 -163
  196. helm/benchmark/window_services/test_cohere_window_service.py +0 -75
  197. helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
  198. helm/benchmark/window_services/test_ice_window_service.py +0 -327
  199. helm/tokenizers/ice_tokenizer.py +0 -30
  200. helm/tokenizers/test_ice_tokenizer.py +0 -57
  201. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/LICENSE +0 -0
  202. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/entry_points.txt +0 -0
  203. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/top_level.txt +0 -0
  204. /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
  205. /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
  206. /helm/benchmark/{data_overlap → scenarios/vision_language/image2struct}/__init__.py +0 -0
  207. /helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +0 -0
  208. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct/webpage}/__init__.py +0 -0
  209. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0
@@ -84,9 +84,19 @@ metrics:
84
84
  description: Fraction of instances where the generated code compiles successfully.
85
85
  lower_is_better: false
86
86
  - name: fid_similarity
87
- display_name: FID similarity
88
- short_display_name: FID
89
- description: FID similarity (Fréchet Inception Distance) [(Heusel et al., 2017)](https://arxiv.org/abs/1706.08500) between an image generated by the model and the target image.
87
+ display_name: CIS
88
+ short_display_name: CIS
89
+ description: The cosine similarity between the Inception feature vectors.
90
+ lower_is_better: false
91
+ - name: lpips_similarity
92
+ display_name: LPIPS
93
+ short_display_name: LPIPS
94
+ description: The LPIPS distance between the generated image and the target image.
95
+ lower_is_better: false
96
+ - name: ssim_similarity
97
+ display_name: SSIM
98
+ short_display_name: SSIM
99
+ description: The SSIM similarity between the generated image and the target image.
90
100
  lower_is_better: false
91
101
 
92
102
  # Accuracy metrics:
@@ -165,6 +175,10 @@ metric_groups:
165
175
  split: ${main_split}
166
176
  - name: earth_mover_similarity
167
177
  split: ${main_split}
178
+ - name: lpips_similarity
179
+ split: ${main_split}
180
+ - name: ssim_similarity
181
+ split: ${main_split}
168
182
 
169
183
  - name: generation_text
170
184
  display_name: Generation (text)
@@ -175,7 +189,7 @@ metric_groups:
175
189
  ############################################################
176
190
  run_groups:
177
191
  - name: core_scenarios
178
- display_name: Image2Structure
192
+ display_name: Image2Struct
179
193
  description: Scenarios for evaluating the ability of Vision-Language models to generate structured outputs from images.
180
194
  category: All scenarios
181
195
  subgroups:
@@ -183,13 +197,13 @@ run_groups:
183
197
  - image2webpage
184
198
  - image2musicsheet
185
199
 
186
- - name: image2structure_real
187
- display_name: Image2Structure (Wild)
200
+ - name: image2struct_wild
201
+ display_name: Image2Struct (Wild)
188
202
  description: Scenarios for evaluating the ability of Vision-Language models to generate structured outputs from images. These scenarios contain images that do not have a ground truth.
189
203
  category: All scenarios
190
204
  subgroups:
191
- - image2latex_real
192
- - image2webpage_real
205
+ - image2latex_wild
206
+ - image2webpage_wild
193
207
 
194
208
  - name: image2latex
195
209
  display_name: Image2LaTeX
@@ -209,9 +223,9 @@ run_groups:
209
223
  when: "2024"
210
224
  language: English
211
225
 
212
- - name: image2latex_easy
213
- display_name: I2LaTeX (Easy)
214
- description: The 1/3 easiest examples of the Image2LaTeX benchmark according to a simple heuristic counting the number of characters in the ground truth LaTeX code.
226
+ - name: image2latex_equation
227
+ display_name: I2LaTeX (Equation)
228
+ description: The Image2LaTeX benchmark subset for converting images of mathematical equations to LaTeX.
215
229
  metric_groups:
216
230
  - accuracy_simple
217
231
  - compilation
@@ -223,14 +237,14 @@ run_groups:
223
237
  main_split: valid
224
238
  taxonomy:
225
239
  task: image-to-text
226
- what: mathematical equations, tables, algorithms, tikz
240
+ what: mathematical equations
227
241
  who: dataset authors
228
242
  when: "2024"
229
243
  language: English
230
244
 
231
- - name: image2latex_medium
232
- display_name: I2LaTeX (Medium)
233
- description: The 1/3 examples with medium difficulty of the Image2LaTeX benchmark according to a simple heuristic counting the number of characters in the ground truth LaTeX code.
245
+ - name: image2latex_table
246
+ display_name: I2LaTeX (Table)
247
+ description: The Image2LaTeX benchmark subset for converting images of tables to LaTeX.
234
248
  metric_groups:
235
249
  - accuracy_simple
236
250
  - compilation
@@ -242,14 +256,13 @@ run_groups:
242
256
  main_split: valid
243
257
  taxonomy:
244
258
  task: image-to-text
245
- what: mathematical equations, tables, algorithms, tikz
259
+ what: tables
246
260
  who: dataset authors
247
261
  when: "2024"
248
- language: English
249
262
 
250
- - name: image2latex_hard
251
- display_name: I2LaTeX (Hard)
252
- description: The 1/3 hardest examples of the Image2LaTeX benchmark according to a simple heuristic counting the number of characters in the ground truth LaTeX code.
263
+ - name: image2latex_algorithm
264
+ display_name: I2LaTeX (Algorithm)
265
+ description: The Image2LaTeX benchmark subset for converting images of algorithms to LaTeX.
253
266
  metric_groups:
254
267
  - accuracy_simple
255
268
  - compilation
@@ -261,12 +274,86 @@ run_groups:
261
274
  main_split: valid
262
275
  taxonomy:
263
276
  task: image-to-text
264
- what: mathematical equations, tables, algorithms, tikz
277
+ what: algorithms
265
278
  who: dataset authors
266
279
  when: "2024"
267
- language: English
268
280
 
269
- - name: image2latex_real
281
+ - name: image2latex_plot
282
+ display_name: I2LaTeX (Tikz)
283
+ description: The Image2LaTeX benchmark subset for converting images of TikZ plots to LaTeX.
284
+ metric_groups:
285
+ - accuracy_simple
286
+ - compilation
287
+ - generation_image
288
+ - generation_text
289
+ - general_information
290
+ environment:
291
+ main_name: earth_mover_similarity
292
+ main_split: valid
293
+ taxonomy:
294
+ task: image-to-text
295
+ what: tikz (plots)
296
+ who: dataset authors
297
+ when: "2024"
298
+
299
+ # - name: image2latex_easy
300
+ # display_name: I2LaTeX (Easy)
301
+ # description: The 1/3 easiest examples of the Image2LaTeX benchmark according to a simple heuristic counting the number of characters in the ground truth LaTeX code.
302
+ # metric_groups:
303
+ # - accuracy_simple
304
+ # - compilation
305
+ # - generation_image
306
+ # - generation_text
307
+ # - general_information
308
+ # environment:
309
+ # main_name: earth_mover_similarity
310
+ # main_split: valid
311
+ # taxonomy:
312
+ # task: image-to-text
313
+ # what: mathematical equations, tables, algorithms, tikz
314
+ # who: dataset authors
315
+ # when: "2024"
316
+ # language: English
317
+
318
+ # - name: image2latex_medium
319
+ # display_name: I2LaTeX (Medium)
320
+ # description: The 1/3 examples with medium difficulty of the Image2LaTeX benchmark according to a simple heuristic counting the number of characters in the ground truth LaTeX code.
321
+ # metric_groups:
322
+ # - accuracy_simple
323
+ # - compilation
324
+ # - generation_image
325
+ # - generation_text
326
+ # - general_information
327
+ # environment:
328
+ # main_name: earth_mover_similarity
329
+ # main_split: valid
330
+ # taxonomy:
331
+ # task: image-to-text
332
+ # what: mathematical equations, tables, algorithms, tikz
333
+ # who: dataset authors
334
+ # when: "2024"
335
+ # language: English
336
+
337
+ # - name: image2latex_hard
338
+ # display_name: I2LaTeX (Hard)
339
+ # description: The 1/3 hardest examples of the Image2LaTeX benchmark according to a simple heuristic counting the number of characters in the ground truth LaTeX code.
340
+ # metric_groups:
341
+ # - accuracy_simple
342
+ # - compilation
343
+ # - generation_image
344
+ # - generation_text
345
+ # - general_information
346
+ # environment:
347
+ # main_name: earth_mover_similarity
348
+ # main_split: valid
349
+ # taxonomy:
350
+ # task: image-to-text
351
+ # what: mathematical equations, tables, algorithms, tikz
352
+ # who: dataset authors
353
+ # when: "2024"
354
+ # language: English
355
+
356
+ - name: image2latex_wild
270
357
  display_name: Image2LaTeX (Wild)
271
358
  description: Images of mathematical equations gathered from Wikipedia that do not have a LaTeX ground truth.
272
359
  metric_groups:
@@ -301,9 +388,9 @@ run_groups:
301
388
  when: "2024"
302
389
  language: English
303
390
 
304
- - name: image2webpage_easy
305
- display_name: I2webpage (Easy)
306
- description: The 1/3 easiest examples of the Image2webpage benchmark according to a simple heuristic counting the number of characters in the ground truth HTML/CSS/Javascript code.
391
+ - name: image2webpage_css
392
+ display_name: I2webpage (CSS)
393
+ description: The Image2webpage benchmark subset for converting images of webpages to code repo containing mostly CSS.
307
394
  metric_groups:
308
395
  - accuracy_simple
309
396
  - compilation
@@ -315,14 +402,13 @@ run_groups:
315
402
  main_split: valid
316
403
  taxonomy:
317
404
  task: image-to-text
318
- what: css, html, javascript
405
+ what: code (mostly CSS)
319
406
  who: dataset authors
320
407
  when: "2024"
321
- language: English
322
408
 
323
- - name: image2webpage_medium
324
- display_name: I2webpage (Medium)
325
- description: The 1/3 examples with medium difficulty of the Image2webpage benchmark according to a simple heuristic counting the number of characters in the ground truth HTML/CSS/Javascript code.
409
+ - name: image2webpage_html
410
+ display_name: I2webpage (HTML)
411
+ description: The Image2webpage benchmark subset for converting images of webpages to code repo containing mostly HTML.
326
412
  metric_groups:
327
413
  - accuracy_simple
328
414
  - compilation
@@ -334,14 +420,13 @@ run_groups:
334
420
  main_split: valid
335
421
  taxonomy:
336
422
  task: image-to-text
337
- what: css, html, javascript
423
+ what: code (mostly HTML)
338
424
  who: dataset authors
339
425
  when: "2024"
340
- language: English
341
426
 
342
- - name: image2webpage_hard
343
- display_name: I2webpage (Hard)
344
- description: The 1/3 hardest examples of the Image2webpage benchmark according to a simple heuristic counting the number of characters in the ground truth HTML/CSS/Javascript code.
427
+ - name: image2webpage_javascript
428
+ display_name: I2webpage (Javascript)
429
+ description: The Image2webpage benchmark subset for converting images of webpages to code repo containing mostly Javascript.
345
430
  metric_groups:
346
431
  - accuracy_simple
347
432
  - compilation
@@ -353,12 +438,68 @@ run_groups:
353
438
  main_split: valid
354
439
  taxonomy:
355
440
  task: image-to-text
356
- what: css, html, javascript
441
+ what: code (mostly Javascript)
357
442
  who: dataset authors
358
443
  when: "2024"
359
- language: English
360
444
 
361
- - name: image2webpage_real
445
+ # - name: image2webpage_easy
446
+ # display_name: I2webpage (Easy)
447
+ # description: The 1/3 easiest examples of the Image2webpage benchmark according to a simple heuristic counting the number of characters in the ground truth HTML/CSS/Javascript code.
448
+ # metric_groups:
449
+ # - accuracy_simple
450
+ # - compilation
451
+ # - generation_image
452
+ # - generation_text
453
+ # - general_information
454
+ # environment:
455
+ # main_name: earth_mover_similarity
456
+ # main_split: valid
457
+ # taxonomy:
458
+ # task: image-to-text
459
+ # what: css, html, javascript
460
+ # who: dataset authors
461
+ # when: "2024"
462
+ # language: English
463
+
464
+ # - name: image2webpage_medium
465
+ # display_name: I2webpage (Medium)
466
+ # description: The 1/3 examples with medium difficulty of the Image2webpage benchmark according to a simple heuristic counting the number of characters in the ground truth HTML/CSS/Javascript code.
467
+ # metric_groups:
468
+ # - accuracy_simple
469
+ # - compilation
470
+ # - generation_image
471
+ # - generation_text
472
+ # - general_information
473
+ # environment:
474
+ # main_name: earth_mover_similarity
475
+ # main_split: valid
476
+ # taxonomy:
477
+ # task: image-to-text
478
+ # what: css, html, javascript
479
+ # who: dataset authors
480
+ # when: "2024"
481
+ # language: English
482
+
483
+ # - name: image2webpage_hard
484
+ # display_name: I2webpage (Hard)
485
+ # description: The 1/3 hardest examples of the Image2webpage benchmark according to a simple heuristic counting the number of characters in the ground truth HTML/CSS/Javascript code.
486
+ # metric_groups:
487
+ # - accuracy_simple
488
+ # - compilation
489
+ # - generation_image
490
+ # - generation_text
491
+ # - general_information
492
+ # environment:
493
+ # main_name: earth_mover_similarity
494
+ # main_split: valid
495
+ # taxonomy:
496
+ # task: image-to-text
497
+ # what: css, html, javascript
498
+ # who: dataset authors
499
+ # when: "2024"
500
+ # language: English
501
+
502
+ - name: image2webpage_wild
362
503
  display_name: Image2webpage (Wild)
363
504
  description: Images of webpages gathered from the internet by taking screenshots and so on that do not have a HTML/CSS/Javascript ground truth.
364
505
  metric_groups:
@@ -392,56 +533,56 @@ run_groups:
392
533
  when: "2024"
393
534
  language: English
394
535
 
395
- - name: image2musicsheet_easy
396
- display_name: I2musicsheet (Easy)
397
- description: The 1/3 easiest examples of the Image2musicsheet benchmark according to a simple heuristic counting the number of black pixels in the target image.
398
- metric_groups:
399
- - accuracy_simple
400
- - compilation
401
- - generation_image
402
- - general_information
403
- environment:
404
- main_name: earth_mover_similarity
405
- main_split: valid
406
- taxonomy:
407
- task: image-to-text
408
- what: music sheets
409
- who: dataset authors
410
- when: "2024"
411
- language: English
536
+ # - name: image2musicsheet_easy
537
+ # display_name: I2musicsheet (Easy)
538
+ # description: The 1/3 easiest examples of the Image2musicsheet benchmark according to a simple heuristic counting the number of black pixels in the target image.
539
+ # metric_groups:
540
+ # - accuracy_simple
541
+ # - compilation
542
+ # - generation_image
543
+ # - general_information
544
+ # environment:
545
+ # main_name: earth_mover_similarity
546
+ # main_split: valid
547
+ # taxonomy:
548
+ # task: image-to-text
549
+ # what: music sheets
550
+ # who: dataset authors
551
+ # when: "2024"
552
+ # language: English
412
553
 
413
- - name: image2musicsheet_medium
414
- display_name: I2musicsheet (Medium)
415
- description: The 1/3 examples with medium difficulty of the Image2musicsheet benchmark according to a simple heuristic counting the number of black pixels in the target image.
416
- metric_groups:
417
- - accuracy_simple
418
- - compilation
419
- - generation_image
420
- - general_information
421
- environment:
422
- main_name: earth_mover_similarity
423
- main_split: valid
424
- taxonomy:
425
- task: image-to-text
426
- what: music sheets
427
- who: dataset authors
428
- when: "2024"
429
- language: English
554
+ # - name: image2musicsheet_medium
555
+ # display_name: I2musicsheet (Medium)
556
+ # description: The 1/3 examples with medium difficulty of the Image2musicsheet benchmark according to a simple heuristic counting the number of black pixels in the target image.
557
+ # metric_groups:
558
+ # - accuracy_simple
559
+ # - compilation
560
+ # - generation_image
561
+ # - general_information
562
+ # environment:
563
+ # main_name: earth_mover_similarity
564
+ # main_split: valid
565
+ # taxonomy:
566
+ # task: image-to-text
567
+ # what: music sheets
568
+ # who: dataset authors
569
+ # when: "2024"
570
+ # language: English
430
571
 
431
- - name: image2musicsheet_hard
432
- display_name: I2musicsheet (Hard)
433
- description: The 1/3 hardest examples of the Image2musicsheet benchmark according to a simple heuristic counting the number of black pixels in the target image.
434
- metric_groups:
435
- - accuracy_simple
436
- - compilation
437
- - generation_image
438
- - general_information
439
- environment:
440
- main_name: earth_mover_similarity
441
- main_split: valid
442
- taxonomy:
443
- task: image-to-text
444
- what: music sheets
445
- who: dataset authors
446
- when: "2024"
447
- language: English
572
+ # - name: image2musicsheet_hard
573
+ # display_name: I2musicsheet (Hard)
574
+ # description: The 1/3 hardest examples of the Image2musicsheet benchmark according to a simple heuristic counting the number of black pixels in the target image.
575
+ # metric_groups:
576
+ # - accuracy_simple
577
+ # - compilation
578
+ # - generation_image
579
+ # - general_information
580
+ # environment:
581
+ # main_name: earth_mover_similarity
582
+ # main_split: valid
583
+ # taxonomy:
584
+ # task: image-to-text
585
+ # what: music sheets
586
+ # who: dataset authors
587
+ # when: "2024"
588
+ # language: English