crfm-helm 0.5.1__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (98) hide show
  1. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/METADATA +13 -3
  2. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/RECORD +96 -63
  3. helm/benchmark/adaptation/adapter_spec.py +32 -31
  4. helm/benchmark/annotation/air_bench_annotator.py +64 -0
  5. helm/benchmark/annotation/annotator_factory.py +6 -0
  6. helm/benchmark/annotation/live_qa_annotator.py +84 -0
  7. helm/benchmark/annotation/medication_qa_annotator.py +81 -0
  8. helm/benchmark/augmentations/translate_perturbation.py +1 -0
  9. helm/benchmark/huggingface_registration.py +16 -6
  10. helm/benchmark/metrics/air_bench_metrics.py +56 -0
  11. helm/benchmark/metrics/fin_qa_metrics.py +60 -0
  12. helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
  13. helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
  14. helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
  15. helm/benchmark/metrics/live_qa_metrics.py +23 -0
  16. helm/benchmark/metrics/medication_qa_metrics.py +23 -0
  17. helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
  18. helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
  19. helm/benchmark/metrics/unitxt_metrics.py +20 -10
  20. helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
  21. helm/benchmark/metrics/vision_language/image_metrics.py +29 -71
  22. helm/benchmark/presentation/schema.py +54 -4
  23. helm/benchmark/presentation/test_schema.py +11 -0
  24. helm/benchmark/run.py +16 -2
  25. helm/benchmark/run_expander.py +77 -0
  26. helm/benchmark/run_spec_factory.py +4 -0
  27. helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
  28. helm/benchmark/run_specs/classic_run_specs.py +15 -11
  29. helm/benchmark/run_specs/decodingtrust_run_specs.py +3 -1
  30. helm/benchmark/run_specs/experimental_run_specs.py +33 -0
  31. helm/benchmark/run_specs/finance_run_specs.py +33 -0
  32. helm/benchmark/run_specs/vlm_run_specs.py +168 -45
  33. helm/benchmark/scenarios/air_bench_scenario.py +50 -0
  34. helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
  35. helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
  36. helm/benchmark/scenarios/fin_qa_scenario.py +117 -0
  37. helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
  38. helm/benchmark/scenarios/vision_language/bingo_scenario.py +3 -3
  39. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +13 -2
  40. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +1 -5
  41. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +0 -4
  42. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +4 -2
  43. helm/benchmark/scenarios/vision_language/pairs_scenario.py +6 -5
  44. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +3 -3
  45. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +95 -0
  46. helm/benchmark/static/schema_air_bench.yaml +3149 -0
  47. helm/benchmark/static/schema_classic.yaml +3 -59
  48. helm/benchmark/static/schema_finance.yaml +143 -0
  49. helm/benchmark/static/schema_image2structure.yaml +254 -111
  50. helm/benchmark/static/schema_instruction_following.yaml +3 -52
  51. helm/benchmark/static/schema_lite.yaml +3 -61
  52. helm/benchmark/static/schema_medical.yaml +255 -0
  53. helm/benchmark/static/schema_mmlu.yaml +3 -61
  54. helm/benchmark/static/schema_tables.yaml +200 -0
  55. helm/benchmark/static/schema_thai.yaml +223 -0
  56. helm/benchmark/static/schema_unitxt.yaml +3 -61
  57. helm/benchmark/static/{schema_vlm.yaml → schema_vhelm.yaml} +294 -293
  58. helm/benchmark/static/schema_vhelm_lite.yaml +4 -59
  59. helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
  60. helm/benchmark/static_build/assets/index-30dbceba.js +10 -0
  61. helm/benchmark/static_build/assets/index-66b02d40.css +1 -0
  62. helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
  63. helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
  64. helm/benchmark/static_build/index.html +2 -2
  65. helm/clients/anthropic_client.py +43 -9
  66. helm/clients/auto_client.py +11 -0
  67. helm/clients/client.py +24 -7
  68. helm/clients/cohere_client.py +98 -3
  69. helm/clients/huggingface_client.py +71 -12
  70. helm/clients/openai_client.py +9 -2
  71. helm/clients/reka_client.py +189 -0
  72. helm/clients/test_client.py +3 -3
  73. helm/clients/test_huggingface_client.py +19 -3
  74. helm/clients/test_together_client.py +72 -2
  75. helm/clients/together_client.py +129 -23
  76. helm/clients/vertexai_client.py +62 -18
  77. helm/clients/vision_language/huggingface_vlm_client.py +1 -0
  78. helm/clients/vision_language/paligemma_client.py +146 -0
  79. helm/clients/vision_language/palmyra_vision_client.py +84 -0
  80. helm/clients/yi_client.py +31 -0
  81. helm/common/critique_request.py +10 -1
  82. helm/common/images_utils.py +19 -0
  83. helm/config/model_deployments.yaml +412 -18
  84. helm/config/model_metadata.yaml +447 -25
  85. helm/config/tokenizer_configs.yaml +93 -1
  86. helm/proxy/critique/model_critique_client.py +32 -4
  87. helm/proxy/services/server_service.py +1 -1
  88. helm/tokenizers/auto_tokenizer.py +1 -1
  89. helm/tokenizers/cohere_tokenizer.py +44 -2
  90. helm/tokenizers/huggingface_tokenizer.py +36 -13
  91. helm/tokenizers/test_cohere_tokenizer.py +39 -0
  92. helm/tokenizers/test_huggingface_tokenizer.py +5 -1
  93. helm/benchmark/static_build/assets/index-737eef9e.js +0 -10
  94. helm/benchmark/static_build/assets/index-878a1094.css +0 -1
  95. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/LICENSE +0 -0
  96. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/WHEEL +0 -0
  97. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/entry_points.txt +0 -0
  98. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/top_level.txt +0 -0
@@ -1,64 +1,8 @@
1
1
  ---
2
2
  ############################################################
3
- adapter:
4
- - name: method
5
- description: The high-level strategy for converting instances into a prompt for the language model.
6
- values:
7
- - name: generation
8
- description: Given the input, the model generates the output free-form.
9
- - name: multiple_choice_joint
10
- description: Given the input, the model selects from multiple-choice options (A., B., C., D., E.).
11
- - name: multiple_choice_separate_original
12
- description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability.
13
- - name: multiple_choice_separate_calibrated
14
- description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability when calibrated by answer choice probability.
15
- - name: language_modeling
16
- description: Given the input, the model assigns the sequence a probability.
17
- - name: instructions
18
- description: The description of the task that is included at the very beginning of the prompt.
19
- - name: global_prefix
20
- description: The string that is prepended to the prompt.
21
- - name: instance_prefix
22
- description: The string that is included before each instance (e.g., '\n\n').
23
- - name: input_prefix
24
- description: The string that is included before each input (e.g., 'Question:').
25
- - name: input_suffix
26
- description: The string that is included after each input (e.g., '\n').
27
- - name: reference_prefix
28
- description: The string that is included before each reference (for multiple-choice questions).
29
- - name: reference_suffix
30
- description: The string that is included after each reference (for multiple-choice questions).
31
- - name: output_prefix
32
- description: The string that is included before the correct answer/predicted output (e.g., 'Answer:').
33
- - name: output_suffix
34
- description: The string that is included after the correct answer/predicted output (e.g., '\n').
35
- - name: substitutions
36
- description: A list of regular expression substitutions (e.g., replacing '\n' with ';\n') to perform at the very end on the prompt.
37
- - name: max_train_instances
38
- description: Maximum number of training instances to include in the prompt (currently by randomly sampling).
39
- - name: max_eval_instances
40
- description: Maximum number of instances to evaluate on (over all splits - test, valid, etc.).
41
- - name: num_outputs
42
- description: Maximum number of possible outputs to generate by sampling multiple outputs.
43
- - name: num_train_trials
44
- description: Number of trials, where in each trial we choose an independent, random set of training instances. Used to compute variance.
45
- - name: sample_train
46
- description: If true, randomly sample N training examples; if false, select N consecutive training examples
47
- - name: model
48
- description: Name of the language model (<creator_organization>/<model name>) to send requests to.
49
- - name: model_deployment
50
- description: Name of the language model deployment (<host_organization>/<model name>) to send requests to.
51
- - name: temperature
52
- description: Temperature parameter used in generation.
53
- - name: max_tokens
54
- description: Maximum number of tokens to generate.
55
- - name: stop_sequences
56
- description: List of sequences, where we stop generation if we encounter any of them.
57
- - name: random
58
- description: Random seed (string), which guarantees reproducibility.
59
- - name: multi_label
60
- description: If true, for instances with multiple correct reference, the gold answer should be considered to be all of the correct references rather than any of the correct references.
61
-
3
+ # For backwards compatibility with older versions of HELM.
4
+ # TODO: Remove this after 2024-09-01.
5
+ adapter: []
62
6
  ############################################################
63
7
  metrics:
64
8
  # Infrastructure metrics:
@@ -0,0 +1,143 @@
1
+ ---
2
+ ############################################################
3
+ metrics:
4
+ # Infrastructure metrics:
5
+ - name: num_perplexity_tokens
6
+ display_name: '# tokens'
7
+ description: Average number of tokens in the predicted output (for language modeling, the input too).
8
+ - name: num_bytes
9
+ display_name: '# bytes'
10
+ description: Average number of bytes in the predicted output (for language modeling, the input too).
11
+
12
+ - name: num_references
13
+ display_name: '# ref'
14
+ description: Number of references.
15
+ - name: num_train_trials
16
+ display_name: '# trials'
17
+ description: Number of trials, where in each trial we choose an independent, random set of training instances.
18
+ - name: estimated_num_tokens_cost
19
+ display_name: 'cost'
20
+ description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
21
+ - name: num_prompt_tokens
22
+ display_name: '# prompt tokens'
23
+ description: Number of tokens in the prompt.
24
+ - name: num_prompt_characters
25
+ display_name: '# prompt chars'
26
+ description: Number of characters in the prompt.
27
+ - name: num_completion_tokens
28
+ display_name: '# completion tokens'
29
+ description: Actual number of completion tokens (over all completions).
30
+ - name: num_output_tokens
31
+ display_name: '# output tokens'
32
+ description: Actual number of output tokens.
33
+ - name: max_num_output_tokens
34
+ display_name: 'Max output tokens'
35
+ description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
36
+ - name: num_requests
37
+ display_name: '# requests'
38
+ description: Number of distinct API requests.
39
+ - name: num_instances
40
+ display_name: '# eval'
41
+ description: Number of evaluation instances.
42
+ - name: num_train_instances
43
+ display_name: '# train'
44
+ description: Number of training instances (e.g., in-context examples).
45
+ - name: prompt_truncated
46
+ display_name: truncated
47
+ description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
48
+ - name: finish_reason_length
49
+ display_name: finish b/c length
50
+ description: Fraction of instances where the output was terminated because of the max tokens limit.
51
+ - name: finish_reason_stop
52
+ display_name: finish b/c stop
53
+ description: Fraction of instances where the output was terminated because of the stop sequences.
54
+ - name: finish_reason_endoftext
55
+ display_name: finish b/c endoftext
56
+ description: Fraction of instances where the output was terminated because the end of text token was generated.
57
+ - name: finish_reason_unknown
58
+ display_name: finish b/c unknown
59
+ description: Fraction of instances where the output was terminated for unknown reasons.
60
+ - name: num_completions
61
+ display_name: '# completions'
62
+ description: Number of completions.
63
+ - name: predicted_index
64
+ display_name: Predicted index
65
+ description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
66
+
67
+ # Accuracy metrics:
68
+ - name: program_accuracy
69
+ display_name: Program Accuracy
70
+ description: Accuracy of the generated programs
71
+ lower_is_better: false
72
+ - name: execution_accuracy
73
+ display_name: Execution Accuracy
74
+ description: Accuracy of the final result of the generated program
75
+ lower_is_better: false
76
+
77
+ ############################################################
78
+ perturbations: []
79
+
80
+ ############################################################
81
+ metric_groups:
82
+ - name: accuracy
83
+ display_name: Accuracy
84
+ metrics:
85
+ - name: ${main_name}
86
+ split: ${main_split}
87
+
88
+ - name: efficiency
89
+ display_name: Efficiency
90
+ metrics:
91
+ - name: inference_runtime
92
+ split: ${main_split}
93
+
94
+ - name: general_information
95
+ display_name: General information
96
+ hide_win_rates: true
97
+ metrics:
98
+ - name: num_instances
99
+ split: ${main_split}
100
+ - name: num_train_instances
101
+ split: ${main_split}
102
+ - name: prompt_truncated
103
+ split: ${main_split}
104
+ - name: num_prompt_tokens
105
+ split: ${main_split}
106
+ - name: num_output_tokens
107
+ split: ${main_split}
108
+
109
+ ############################################################
110
+ run_groups:
111
+ - name: financial_scenarios
112
+ display_name: Financial Scenarios
113
+ description: Scenarios for the financial domain
114
+ category: All scenarios
115
+ subgroups:
116
+ - fin_qa
117
+
118
+ - name: fin_qa
119
+ display_name: FinQA
120
+ description: The FinQA benchmark for numeric reasoning over financial data, with question answering pairs written by financial experts over financial reports [(Chen et al., 2021)](https://arxiv.org/abs/2109.00122/).
121
+ metric_groups:
122
+ - accuracy
123
+ - efficiency
124
+ - general_information
125
+ environment:
126
+ main_name: program_accuracy
127
+ main_split: test
128
+ taxonomy:
129
+ task: question answering with numeric reasoning
130
+ what: financial reports
131
+ who: financial experts
132
+ when: 1999 to 2019
133
+ language: English
134
+
135
+ - name: financial_scenarios_ablations
136
+ display_name: Financial Scenarios Ablations
137
+ description: Scenarios for the financial domain with ablations
138
+ category: All scenarios
139
+ subgroups:
140
+ - fin_qa
141
+ adapter_keys_shown:
142
+ - model
143
+ - max_train_instances
@@ -1,78 +1,11 @@
1
1
  ---
2
2
  ############################################################
3
- adapter:
4
- - name: method
5
- description: The high-level strategy for converting instances into a prompt for the language model.
6
- values:
7
- - name: generation
8
- description: Given the input, the model generates the output free-form.
9
- - name: generation_multimodal
10
- description: Given the multimodal input, the model generates the output free-form.
11
- - name: multiple_choice_joint
12
- description: Given the input, the model selects from multiple-choice options (A., B., C., D., E.).
13
- - name: multiple_choice_separate_original
14
- description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability.
15
- - name: multiple_choice_separate_calibrated
16
- description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability when calibrated by answer choice probability.
17
- - name: language_modeling
18
- description: Given the input, the model assigns the sequence a probability.
19
- - name: instructions
20
- description: The description of the task that is included at the very beginning of the prompt.
21
- - name: global_prefix
22
- description: The string that is prepended to the prompt.
23
- - name: global_suffix
24
- description: The string that is appended to the prompt.
25
- - name: instance_prefix
26
- description: The string that is included before each instance (e.g., '\n\n').
27
- - name: input_prefix
28
- description: The string that is included before each input (e.g., 'Question:').
29
- - name: input_suffix
30
- description: The string that is included after each input (e.g., '\n').
31
- - name: reference_prefix
32
- description: The string that is included before each reference (for multiple-choice questions).
33
- - name: reference_suffix
34
- description: The string that is included after each reference (for multiple-choice questions).
35
- - name: output_prefix
36
- description: The string that is included before the correct answer/predicted output (e.g., 'Answer:').
37
- - name: output_suffix
38
- description: The string that is included after the correct answer/predicted output (e.g., '\n').
39
- - name: substitutions
40
- description: A list of regular expression substitutions (e.g., replacing '\n' with ';\n') to perform at the very end on the prompt.
41
- - name: max_train_instances
42
- description: Maximum number of training instances to include in the prompt (currently by randomly sampling).
43
- - name: max_eval_instances
44
- description: Maximum number of instances to evaluate on (over all splits - test, valid, etc.).
45
- - name: num_outputs
46
- description: Maximum number of possible outputs to generate by sampling multiple outputs.
47
- - name: num_train_trials
48
- description: Number of trials, where in each trial we choose an independent, random set of training instances. Used to compute variance.
49
- - name: sample_train
50
- description: If true, randomly sample N training examples; if false, select N consecutive training examples
51
- - name: model
52
- description: Name of the language model (<creator_organization>/<model name>) to send requests to.
53
- - name: model_deployment
54
- description: Name of the language model deployment (<host_organization>/<model name>) to send requests to.
55
- - name: temperature
56
- description: Temperature parameter used in generation.
57
- - name: max_tokens
58
- description: Maximum number of tokens to generate.
59
- - name: stop_sequences
60
- description: List of sequences, where we stop generation if we encounter any of them.
61
- - name: random
62
- description: Random seed (string), which guarantees reproducibility.
63
- - name: multi_label
64
- description: If true, for instances with multiple correct reference, the gold answer should be considered to be all of the correct references rather than any of the correct references.
65
-
3
+ # For backwards compatibility with older versions of HELM.
4
+ # TODO: Remove this after 2024-09-01.
5
+ adapter: []
66
6
  ############################################################
67
7
  metrics:
68
8
  # Infrastructure metrics:
69
- - name: num_perplexity_tokens
70
- display_name: '# tokens'
71
- description: Average number of tokens in the predicted output (for language modeling, the input too).
72
- - name: num_bytes
73
- display_name: '# bytes'
74
- description: Average number of bytes in the predicted output (for language modeling, the input too).
75
-
76
9
  - name: num_references
77
10
  display_name: '# ref'
78
11
  description: Number of references.
@@ -136,50 +69,25 @@ metrics:
136
69
  description: Average Levenshtein edit similarity (1 - distance normalized by length of longer sequence) between model generation and reference.
137
70
 
138
71
  # Vision Language metrics [image]:
139
- - name: block_emd_similarity
140
- display_name: Block Earth Mover Similarity
141
- short_display_name: Block EMS
142
- description: Block Earth Mover Similarity
143
- lower_is_better: false
144
- - name: block_emd_similarity_white
145
- display_name: Block Earth Mover Similarity (white)
146
- short_display_name: Block EMS (white)
147
- description: Block Earth Mover Similarity (white)
148
- lower_is_better: false
149
- - name: block_emd_similarity_median_color
150
- display_name: Block Earth Mover Similarity (median)
151
- short_display_name: Block EMS (median)
152
- description: Block Earth Mover Similarity (median)
72
+ - name: earth_mover_similarity
73
+ display_name: Earth Mover Similarity
74
+ short_display_name: EMS
75
+ description: Earth Mover Similarity (EMD adapted to speed up calculations)
153
76
  lower_is_better: false
154
77
  - name: pixel_similarity
155
78
  display_name: Pixel Similarity
156
79
  short_display_name: PS
157
80
  description: Pixel Similarity between an image generated by the model and the target image.
158
81
  lower_is_better: false
159
- - name: sift_similarity
160
- display_name: SIFT Similarity
161
- short_display_name: SIFT
162
- description: SIFT Similarity (Scale-Invariant Feature Transform) [(Lowe, 1999)](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=790410) between an image generated by the model and the target image.
163
- lower_is_better: false
164
82
  - name: compilation_success
165
83
  display_name: Compilation success
166
84
  description: Fraction of instances where the generated code compiles successfully.
167
85
  lower_is_better: false
168
- - name: lpips_similarity
169
- display_name: LPIPS similarity
170
- short_display_name: LPIPS
171
- description: LPIPS similarity (Learned Perceptual Image Patch Similarity) [(Zhang et al., 2018)](https://arxiv.org/abs/1801.03924) between an image generated by the model and the target image.
172
- lower_is_better: false
173
86
  - name: fid_similarity
174
87
  display_name: FID similarity
175
88
  short_display_name: FID
176
89
  description: FID similarity (Fréchet Inception Distance) [(Heusel et al., 2017)](https://arxiv.org/abs/1706.08500) between an image generated by the model and the target image.
177
90
  lower_is_better: false
178
- - name: ssim_similarity
179
- display_name: SSIM
180
- short_display_name: SSIM
181
- description: SSIM similarity (Structural Similarity Index) [(Wang et al., 2004)](https://www.cns.nyu.edu/pub/eero/wang03-reprint.pdf) between an image generated by the model and the target image.
182
- lower_is_better: false
183
91
 
184
92
  # Accuracy metrics:
185
93
  - name: exact_match
@@ -215,11 +123,37 @@ metric_groups:
215
123
  - name: accuracy
216
124
  display_name: Compilation Rate and Earth Mover Similarity
217
125
  metrics:
218
- - name: ${main_name}
126
+ - name: earth_mover_similarity
219
127
  split: ${main_split}
220
128
  - name: compilation_success
221
129
  split: ${main_split}
222
130
 
131
+ - name: general_information
132
+ display_name: General information
133
+ metrics:
134
+ - name: num_instances
135
+ split: ${main_split}
136
+ - name: num_train_instances
137
+ split: ${main_split}
138
+ - name: prompt_truncated
139
+ split: ${main_split}
140
+ - name: num_prompt_tokens
141
+ split: ${main_split}
142
+ - name: num_output_tokens
143
+ split: ${main_split}
144
+
145
+ - name: accuracy_simple
146
+ display_name: Earth Mover Similarity
147
+ metrics:
148
+ - name: earth_mover_similarity
149
+ split: ${main_split}
150
+
151
+ - name: compilation
152
+ display_name: Compilation Rate
153
+ metrics:
154
+ - name: compilation_success
155
+ split: ${main_split}
156
+
223
157
  - name: generation_image
224
158
  display_name: Generation (image)
225
159
  metrics:
@@ -229,11 +163,7 @@ metric_groups:
229
163
  split: ${main_split}
230
164
  - name: fid_similarity
231
165
  split: ${main_split}
232
- - name: block_emd_similarity
233
- split: ${main_split}
234
- - name: block_emd_similarity_white
235
- split: ${main_split}
236
- - name: block_emd_similarity_median_color
166
+ - name: earth_mover_similarity
237
167
  split: ${main_split}
238
168
 
239
169
  - name: generation_text
@@ -253,6 +183,14 @@ run_groups:
253
183
  - image2webpage
254
184
  - image2musicsheet
255
185
 
186
+ - name: image2structure_real
187
+ display_name: Image2Structure (Wild)
188
+ description: Scenarios for evaluating the ability of Vision-Language models to generate structured outputs from images. These scenarios contain images that do not have a ground truth.
189
+ category: All scenarios
190
+ subgroups:
191
+ - image2latex_real
192
+ - image2webpage_real
193
+
256
194
  - name: image2latex
257
195
  display_name: Image2LaTeX
258
196
  description: The Image2LaTeX benchmark for converting images of mathematical equations, tables, algorithms and tikz to LaTeX.
@@ -260,13 +198,88 @@ run_groups:
260
198
  - accuracy
261
199
  - generation_image
262
200
  - generation_text
201
+ - general_information
263
202
  environment:
264
- main_name: block_emd_similarity
203
+ main_name: earth_mover_similarity
265
204
  main_split: valid
266
205
  taxonomy:
267
206
  task: image-to-text
268
207
  what: mathematical equations, tables, algorithms, tikz
269
- who: n/a
208
+ who: dataset authors
209
+ when: "2024"
210
+ language: English
211
+
212
+ - name: image2latex_easy
213
+ display_name: I2LaTeX (Easy)
214
+ description: The 1/3 easiest examples of the Image2LaTeX benchmark according to a simple heuristic counting the number of characters in the ground truth LaTeX code.
215
+ metric_groups:
216
+ - accuracy_simple
217
+ - compilation
218
+ - generation_image
219
+ - generation_text
220
+ - general_information
221
+ environment:
222
+ main_name: earth_mover_similarity
223
+ main_split: valid
224
+ taxonomy:
225
+ task: image-to-text
226
+ what: mathematical equations, tables, algorithms, tikz
227
+ who: dataset authors
228
+ when: "2024"
229
+ language: English
230
+
231
+ - name: image2latex_medium
232
+ display_name: I2LaTeX (Medium)
233
+ description: The 1/3 examples with medium difficulty of the Image2LaTeX benchmark according to a simple heuristic counting the number of characters in the ground truth LaTeX code.
234
+ metric_groups:
235
+ - accuracy_simple
236
+ - compilation
237
+ - generation_image
238
+ - generation_text
239
+ - general_information
240
+ environment:
241
+ main_name: earth_mover_similarity
242
+ main_split: valid
243
+ taxonomy:
244
+ task: image-to-text
245
+ what: mathematical equations, tables, algorithms, tikz
246
+ who: dataset authors
247
+ when: "2024"
248
+ language: English
249
+
250
+ - name: image2latex_hard
251
+ display_name: I2LaTeX (Hard)
252
+ description: The 1/3 hardest examples of the Image2LaTeX benchmark according to a simple heuristic counting the number of characters in the ground truth LaTeX code.
253
+ metric_groups:
254
+ - accuracy_simple
255
+ - compilation
256
+ - generation_image
257
+ - generation_text
258
+ - general_information
259
+ environment:
260
+ main_name: earth_mover_similarity
261
+ main_split: valid
262
+ taxonomy:
263
+ task: image-to-text
264
+ what: mathematical equations, tables, algorithms, tikz
265
+ who: dataset authors
266
+ when: "2024"
267
+ language: English
268
+
269
+ - name: image2latex_real
270
+ display_name: Image2LaTeX (Wild)
271
+ description: Images of mathematical equations gathered from Wikipedia that do not have a LaTeX ground truth.
272
+ metric_groups:
273
+ - accuracy
274
+ - generation_image
275
+ - general_information
276
+ environment:
277
+ main_name: earth_mover_similarity
278
+ main_split: valid
279
+ taxonomy:
280
+ task: image-to-text
281
+ what: mathematical equations, tables, algorithms, tikz
282
+ who: dataset authors
270
283
  when: "2024"
271
284
  language: English
272
285
 
@@ -277,13 +290,88 @@ run_groups:
277
290
  - accuracy
278
291
  - generation_image
279
292
  - generation_text
293
+ - general_information
280
294
  environment:
281
- main_name: block_emd_similarity
295
+ main_name: earth_mover_similarity
282
296
  main_split: valid
283
297
  taxonomy:
284
298
  task: image-to-text
285
299
  what: css, html, javascript
286
- who: n/a
300
+ who: dataset authors
301
+ when: "2024"
302
+ language: English
303
+
304
+ - name: image2webpage_easy
305
+ display_name: I2webpage (Easy)
306
+ description: The 1/3 easiest examples of the Image2webpage benchmark according to a simple heuristic counting the number of characters in the ground truth HTML/CSS/Javascript code.
307
+ metric_groups:
308
+ - accuracy_simple
309
+ - compilation
310
+ - generation_image
311
+ - generation_text
312
+ - general_information
313
+ environment:
314
+ main_name: earth_mover_similarity
315
+ main_split: valid
316
+ taxonomy:
317
+ task: image-to-text
318
+ what: css, html, javascript
319
+ who: dataset authors
320
+ when: "2024"
321
+ language: English
322
+
323
+ - name: image2webpage_medium
324
+ display_name: I2webpage (Medium)
325
+ description: The 1/3 examples with medium difficulty of the Image2webpage benchmark according to a simple heuristic counting the number of characters in the ground truth HTML/CSS/Javascript code.
326
+ metric_groups:
327
+ - accuracy_simple
328
+ - compilation
329
+ - generation_image
330
+ - generation_text
331
+ - general_information
332
+ environment:
333
+ main_name: earth_mover_similarity
334
+ main_split: valid
335
+ taxonomy:
336
+ task: image-to-text
337
+ what: css, html, javascript
338
+ who: dataset authors
339
+ when: "2024"
340
+ language: English
341
+
342
+ - name: image2webpage_hard
343
+ display_name: I2webpage (Hard)
344
+ description: The 1/3 hardest examples of the Image2webpage benchmark according to a simple heuristic counting the number of characters in the ground truth HTML/CSS/Javascript code.
345
+ metric_groups:
346
+ - accuracy_simple
347
+ - compilation
348
+ - generation_image
349
+ - generation_text
350
+ - general_information
351
+ environment:
352
+ main_name: earth_mover_similarity
353
+ main_split: valid
354
+ taxonomy:
355
+ task: image-to-text
356
+ what: css, html, javascript
357
+ who: dataset authors
358
+ when: "2024"
359
+ language: English
360
+
361
+ - name: image2webpage_real
362
+ display_name: Image2webpage (Wild)
363
+ description: Images of webpages gathered from the internet by taking screenshots and so on that do not have a HTML/CSS/Javascript ground truth.
364
+ metric_groups:
365
+ - accuracy
366
+ - generation_image
367
+ - general_information
368
+ environment:
369
+ main_name: earth_mover_similarity
370
+ main_split: valid
371
+ taxonomy:
372
+ task: image-to-text
373
+ what: css, html, javascript
374
+ who: dataset authors
287
375
  when: "2024"
288
376
  language: English
289
377
 
@@ -293,12 +381,67 @@ run_groups:
293
381
  metric_groups:
294
382
  - accuracy
295
383
  - generation_image
384
+ - general_information
385
+ environment:
386
+ main_name: earth_mover_similarity
387
+ main_split: valid
388
+ taxonomy:
389
+ task: image-to-text
390
+ what: music sheets
391
+ who: dataset authors
392
+ when: "2024"
393
+ language: English
394
+
395
+ - name: image2musicsheet_easy
396
+ display_name: I2musicsheet (Easy)
397
+ description: The 1/3 easiest examples of the Image2musicsheet benchmark according to a simple heuristic counting the number of black pixels in the target image.
398
+ metric_groups:
399
+ - accuracy_simple
400
+ - compilation
401
+ - generation_image
402
+ - general_information
403
+ environment:
404
+ main_name: earth_mover_similarity
405
+ main_split: valid
406
+ taxonomy:
407
+ task: image-to-text
408
+ what: music sheets
409
+ who: dataset authors
410
+ when: "2024"
411
+ language: English
412
+
413
+ - name: image2musicsheet_medium
414
+ display_name: I2musicsheet (Medium)
415
+ description: The 1/3 examples with medium difficulty of the Image2musicsheet benchmark according to a simple heuristic counting the number of black pixels in the target image.
416
+ metric_groups:
417
+ - accuracy_simple
418
+ - compilation
419
+ - generation_image
420
+ - general_information
421
+ environment:
422
+ main_name: earth_mover_similarity
423
+ main_split: valid
424
+ taxonomy:
425
+ task: image-to-text
426
+ what: music sheets
427
+ who: dataset authors
428
+ when: "2024"
429
+ language: English
430
+
431
+ - name: image2musicsheet_hard
432
+ display_name: I2musicsheet (Hard)
433
+ description: The 1/3 hardest examples of the Image2musicsheet benchmark according to a simple heuristic counting the number of black pixels in the target image.
434
+ metric_groups:
435
+ - accuracy_simple
436
+ - compilation
437
+ - generation_image
438
+ - general_information
296
439
  environment:
297
- main_name: block_emd_similarity
440
+ main_name: earth_mover_similarity
298
441
  main_split: valid
299
442
  taxonomy:
300
443
  task: image-to-text
301
444
  what: music sheets
302
- who: n/a
445
+ who: dataset authors
303
446
  when: "2024"
304
447
  language: English