crfm-helm 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (125) hide show
  1. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/METADATA +19 -5
  2. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/RECORD +121 -76
  3. helm/benchmark/adaptation/adapter_spec.py +32 -31
  4. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  5. helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
  6. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
  7. helm/benchmark/annotation/air_bench_annotator.py +64 -0
  8. helm/benchmark/annotation/annotator_factory.py +6 -0
  9. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +1 -1
  10. helm/benchmark/annotation/live_qa_annotator.py +84 -0
  11. helm/benchmark/annotation/medication_qa_annotator.py +81 -0
  12. helm/benchmark/augmentations/perturbation.py +17 -1
  13. helm/benchmark/augmentations/test_perturbation.py +30 -0
  14. helm/benchmark/augmentations/translate_perturbation.py +1 -0
  15. helm/benchmark/huggingface_registration.py +16 -6
  16. helm/benchmark/metrics/air_bench_metrics.py +56 -0
  17. helm/benchmark/metrics/efficiency_metrics.py +9 -2
  18. helm/benchmark/metrics/evaluate_reference_metrics.py +16 -0
  19. helm/benchmark/metrics/fin_qa_metrics.py +60 -0
  20. helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
  21. helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
  22. helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
  23. helm/benchmark/metrics/live_qa_metrics.py +23 -0
  24. helm/benchmark/metrics/medication_qa_metrics.py +23 -0
  25. helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
  26. helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
  27. helm/benchmark/metrics/unitxt_metrics.py +20 -10
  28. helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
  29. helm/benchmark/metrics/vision_language/image_metrics.py +104 -21
  30. helm/benchmark/model_metadata_registry.py +5 -1
  31. helm/benchmark/presentation/schema.py +54 -4
  32. helm/benchmark/presentation/test_schema.py +11 -0
  33. helm/benchmark/run.py +16 -2
  34. helm/benchmark/run_expander.py +112 -63
  35. helm/benchmark/run_spec_factory.py +15 -10
  36. helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
  37. helm/benchmark/run_specs/classic_run_specs.py +15 -11
  38. helm/benchmark/run_specs/decodingtrust_run_specs.py +3 -1
  39. helm/benchmark/run_specs/experimental_run_specs.py +33 -0
  40. helm/benchmark/run_specs/finance_run_specs.py +33 -0
  41. helm/benchmark/run_specs/vlm_run_specs.py +444 -65
  42. helm/benchmark/scenarios/air_bench_scenario.py +50 -0
  43. helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
  44. helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
  45. helm/benchmark/scenarios/fin_qa_scenario.py +117 -0
  46. helm/benchmark/scenarios/legalbench_scenario.py +6 -2
  47. helm/benchmark/scenarios/math_scenario.py +1 -1
  48. helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
  49. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
  50. helm/benchmark/scenarios/vision_language/bingo_scenario.py +3 -3
  51. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
  52. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
  53. helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
  54. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +4 -2
  55. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +13 -2
  56. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +1 -5
  57. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +1 -5
  58. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +5 -3
  59. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
  60. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
  61. helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
  62. helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
  63. helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
  64. helm/benchmark/scenarios/vision_language/pairs_scenario.py +247 -0
  65. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +3 -3
  66. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +95 -0
  67. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +2 -2
  68. helm/benchmark/scenarios/vision_language/vqa_scenario.py +4 -2
  69. helm/benchmark/static/schema_air_bench.yaml +3149 -0
  70. helm/benchmark/static/schema_classic.yaml +3 -59
  71. helm/benchmark/static/schema_finance.yaml +143 -0
  72. helm/benchmark/static/schema_image2structure.yaml +447 -0
  73. helm/benchmark/static/schema_instruction_following.yaml +3 -52
  74. helm/benchmark/static/schema_lite.yaml +3 -61
  75. helm/benchmark/static/schema_medical.yaml +255 -0
  76. helm/benchmark/static/schema_mmlu.yaml +3 -61
  77. helm/benchmark/static/schema_tables.yaml +200 -0
  78. helm/benchmark/static/schema_thai.yaml +223 -0
  79. helm/benchmark/static/schema_unitxt.yaml +3 -61
  80. helm/benchmark/static/schema_vhelm.yaml +824 -0
  81. helm/benchmark/static/schema_vhelm_lite.yaml +109 -0
  82. helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
  83. helm/benchmark/static_build/assets/index-30dbceba.js +10 -0
  84. helm/benchmark/static_build/assets/index-66b02d40.css +1 -0
  85. helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
  86. helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
  87. helm/benchmark/static_build/index.html +2 -2
  88. helm/clients/anthropic_client.py +78 -14
  89. helm/clients/auto_client.py +11 -0
  90. helm/clients/client.py +24 -7
  91. helm/clients/cohere_client.py +98 -3
  92. helm/clients/huggingface_client.py +71 -12
  93. helm/clients/openai_client.py +11 -5
  94. helm/clients/reka_client.py +189 -0
  95. helm/clients/test_client.py +3 -3
  96. helm/clients/test_huggingface_client.py +19 -3
  97. helm/clients/test_together_client.py +72 -2
  98. helm/clients/together_client.py +199 -2
  99. helm/clients/vertexai_client.py +117 -64
  100. helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
  101. helm/clients/vision_language/huggingface_vlm_client.py +12 -4
  102. helm/clients/vision_language/idefics_client.py +2 -2
  103. helm/clients/vision_language/paligemma_client.py +146 -0
  104. helm/clients/vision_language/palmyra_vision_client.py +84 -0
  105. helm/clients/yi_client.py +31 -0
  106. helm/common/critique_request.py +10 -1
  107. helm/common/images_utils.py +29 -3
  108. helm/config/model_deployments.yaml +504 -12
  109. helm/config/model_metadata.yaml +579 -52
  110. helm/config/tokenizer_configs.yaml +100 -1
  111. helm/proxy/critique/model_critique_client.py +32 -4
  112. helm/proxy/services/server_service.py +1 -1
  113. helm/tokenizers/auto_tokenizer.py +1 -1
  114. helm/tokenizers/cohere_tokenizer.py +44 -2
  115. helm/tokenizers/huggingface_tokenizer.py +36 -13
  116. helm/tokenizers/test_cohere_tokenizer.py +39 -0
  117. helm/tokenizers/test_huggingface_tokenizer.py +5 -1
  118. helm/benchmark/static/schema_vlm.yaml +0 -576
  119. helm/benchmark/static_build/assets/index-5088afcb.css +0 -1
  120. helm/benchmark/static_build/assets/index-d839df55.js +0 -9
  121. helm/benchmark/test_model_deployment_definition.py +0 -90
  122. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/LICENSE +0 -0
  123. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/WHEEL +0 -0
  124. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/entry_points.txt +0 -0
  125. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/top_level.txt +0 -0
@@ -1,64 +1,8 @@
1
1
  ---
2
2
  ############################################################
3
- adapter:
4
- - name: method
5
- description: The high-level strategy for converting instances into a prompt for the language model.
6
- values:
7
- - name: generation
8
- description: Given the input, the model generates the output free-form.
9
- - name: multiple_choice_joint
10
- description: Given the input, the model selects from multiple-choice options (A., B., C., D., E.).
11
- - name: multiple_choice_separate_original
12
- description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability.
13
- - name: multiple_choice_separate_calibrated
14
- description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability when calibrated by answer choice probability.
15
- - name: language_modeling
16
- description: Given the input, the model assigns the sequence a probability.
17
- - name: instructions
18
- description: The description of the task that is included at the very beginning of the prompt.
19
- - name: global_prefix
20
- description: The string that is prepended to the prompt.
21
- - name: instance_prefix
22
- description: The string that is included before each instance (e.g., '\n\n').
23
- - name: input_prefix
24
- description: The string that is included before each input (e.g., 'Question:').
25
- - name: input_suffix
26
- description: The string that is included after each input (e.g., '\n').
27
- - name: reference_prefix
28
- description: The string that is included before each reference (for multiple-choice questions).
29
- - name: reference_suffix
30
- description: The string that is included after each reference (for multiple-choice questions).
31
- - name: output_prefix
32
- description: The string that is included before the correct answer/predicted output (e.g., 'Answer:').
33
- - name: output_suffix
34
- description: The string that is included after the correct answer/predicted output (e.g., '\n').
35
- - name: substitutions
36
- description: A list of regular expression substitutions (e.g., replacing '\n' with ';\n') to perform at the very end on the prompt.
37
- - name: max_train_instances
38
- description: Maximum number of training instances to include in the prompt (currently by randomly sampling).
39
- - name: max_eval_instances
40
- description: Maximum number of instances to evaluate on (over all splits - test, valid, etc.).
41
- - name: num_outputs
42
- description: Maximum number of possible outputs to generate by sampling multiple outputs.
43
- - name: num_train_trials
44
- description: Number of trials, where in each trial we choose an independent, random set of training instances. Used to compute variance.
45
- - name: sample_train
46
- description: If true, randomly sample N training examples; if false, select N consecutive training examples
47
- - name: model
48
- description: Name of the language model (<creator_organization>/<model name>) to send requests to.
49
- - name: model_deployment
50
- description: Name of the language model deployment (<host_organization>/<model name>) to send requests to.
51
- - name: temperature
52
- description: Temperature parameter used in generation.
53
- - name: max_tokens
54
- description: Maximum number of tokens to generate.
55
- - name: stop_sequences
56
- description: List of sequences, where we stop generation if we encounter any of them.
57
- - name: random
58
- description: Random seed (string), which guarantees reproducibility.
59
- - name: multi_label
60
- description: If true, for instances with multiple correct reference, the gold answer should be considered to be all of the correct references rather than any of the correct references.
61
-
3
+ # For backwards compatibility with older versions of HELM.
4
+ # TODO: Remove this after 2024-09-01.
5
+ adapter: []
62
6
  ############################################################
63
7
  metrics:
64
8
  # Infrastructure metrics:
@@ -0,0 +1,143 @@
1
+ ---
2
+ ############################################################
3
+ metrics:
4
+ # Infrastructure metrics:
5
+ - name: num_perplexity_tokens
6
+ display_name: '# tokens'
7
+ description: Average number of tokens in the predicted output (for language modeling, the input too).
8
+ - name: num_bytes
9
+ display_name: '# bytes'
10
+ description: Average number of bytes in the predicted output (for language modeling, the input too).
11
+
12
+ - name: num_references
13
+ display_name: '# ref'
14
+ description: Number of references.
15
+ - name: num_train_trials
16
+ display_name: '# trials'
17
+ description: Number of trials, where in each trial we choose an independent, random set of training instances.
18
+ - name: estimated_num_tokens_cost
19
+ display_name: 'cost'
20
+ description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
21
+ - name: num_prompt_tokens
22
+ display_name: '# prompt tokens'
23
+ description: Number of tokens in the prompt.
24
+ - name: num_prompt_characters
25
+ display_name: '# prompt chars'
26
+ description: Number of characters in the prompt.
27
+ - name: num_completion_tokens
28
+ display_name: '# completion tokens'
29
+ description: Actual number of completion tokens (over all completions).
30
+ - name: num_output_tokens
31
+ display_name: '# output tokens'
32
+ description: Actual number of output tokens.
33
+ - name: max_num_output_tokens
34
+ display_name: 'Max output tokens'
35
+ description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
36
+ - name: num_requests
37
+ display_name: '# requests'
38
+ description: Number of distinct API requests.
39
+ - name: num_instances
40
+ display_name: '# eval'
41
+ description: Number of evaluation instances.
42
+ - name: num_train_instances
43
+ display_name: '# train'
44
+ description: Number of training instances (e.g., in-context examples).
45
+ - name: prompt_truncated
46
+ display_name: truncated
47
+ description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
48
+ - name: finish_reason_length
49
+ display_name: finish b/c length
50
  description: Fraction of instances where the output was terminated because of the max tokens limit.
51
+ - name: finish_reason_stop
52
+ display_name: finish b/c stop
53
  description: Fraction of instances where the output was terminated because of the stop sequences.
54
+ - name: finish_reason_endoftext
55
+ display_name: finish b/c endoftext
56
  description: Fraction of instances where the output was terminated because the end of text token was generated.
57
+ - name: finish_reason_unknown
58
+ display_name: finish b/c unknown
59
  description: Fraction of instances where the output was terminated for unknown reasons.
60
+ - name: num_completions
61
+ display_name: '# completions'
62
+ description: Number of completions.
63
+ - name: predicted_index
64
+ display_name: Predicted index
65
+ description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
66
+
67
+ # Accuracy metrics:
68
+ - name: program_accuracy
69
+ display_name: Program Accuracy
70
+ description: Accuracy of the generated programs
71
+ lower_is_better: false
72
+ - name: execution_accuracy
73
+ display_name: Execution Accuracy
74
+ description: Accuracy of the final result of the generated program
75
+ lower_is_better: false
76
+
77
+ ############################################################
78
+ perturbations: []
79
+
80
+ ############################################################
81
+ metric_groups:
82
+ - name: accuracy
83
+ display_name: Accuracy
84
+ metrics:
85
+ - name: ${main_name}
86
+ split: ${main_split}
87
+
88
+ - name: efficiency
89
+ display_name: Efficiency
90
+ metrics:
91
+ - name: inference_runtime
92
+ split: ${main_split}
93
+
94
+ - name: general_information
95
+ display_name: General information
96
+ hide_win_rates: true
97
+ metrics:
98
+ - name: num_instances
99
+ split: ${main_split}
100
+ - name: num_train_instances
101
+ split: ${main_split}
102
+ - name: prompt_truncated
103
+ split: ${main_split}
104
+ - name: num_prompt_tokens
105
+ split: ${main_split}
106
+ - name: num_output_tokens
107
+ split: ${main_split}
108
+
109
+ ############################################################
110
+ run_groups:
111
+ - name: financial_scenarios
112
+ display_name: Financial Scenarios
113
+ description: Scenarios for the financial domain
114
+ category: All scenarios
115
+ subgroups:
116
+ - fin_qa
117
+
118
+ - name: fin_qa
119
+ display_name: FinQA
120
+ description: The FinQA benchmark for numeric reasoning over financial data, with question answering pairs written by financial experts over financial reports [(Chen et al., 2021)](https://arxiv.org/abs/2109.00122/).
121
+ metric_groups:
122
+ - accuracy
123
+ - efficiency
124
+ - general_information
125
+ environment:
126
+ main_name: program_accuracy
127
+ main_split: test
128
+ taxonomy:
129
+ task: question answering with numeric reasoning
130
+ what: financial reports
131
+ who: financial experts
132
+ when: 1999 to 2019
133
+ language: English
134
+
135
+ - name: financial_scenarios_ablations
136
+ display_name: Financial Scenarios Ablations
137
+ description: Scenarios for the financial domain with ablations
138
+ category: All scenarios
139
+ subgroups:
140
+ - fin_qa
141
+ adapter_keys_shown:
142
+ - model
143
+ - max_train_instances
@@ -0,0 +1,447 @@
1
+ ---
2
+ ############################################################
3
+ # For backwards compatibility with older versions of HELM.
4
+ # TODO: Remove this after 2024-09-01.
5
+ adapter: []
6
+ ############################################################
7
+ metrics:
8
+ # Infrastructure metrics:
9
+ - name: num_references
10
+ display_name: '# ref'
11
+ description: Number of references.
12
+ - name: num_train_trials
13
+ display_name: '# trials'
14
+ description: Number of trials, where in each trial we choose an independent, random set of training instances.
15
+ - name: estimated_num_tokens_cost
16
+ display_name: 'cost'
17
+ description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
18
+ - name: num_prompt_tokens
19
+ display_name: '# prompt tokens'
20
+ description: Number of tokens in the prompt.
21
+ - name: num_prompt_characters
22
+ display_name: '# prompt chars'
23
+ description: Number of characters in the prompt.
24
+ - name: num_completion_tokens
25
+ display_name: '# completion tokens'
26
+ description: Actual number of completion tokens (over all completions).
27
+ - name: num_output_tokens
28
+ display_name: '# output tokens'
29
+ description: Actual number of output tokens.
30
+ - name: max_num_output_tokens
31
+ display_name: 'Max output tokens'
32
+ description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
33
+ - name: num_requests
34
+ display_name: '# requests'
35
+ description: Number of distinct API requests.
36
+ - name: num_instances
37
+ display_name: '# eval'
38
+ description: Number of evaluation instances.
39
+ - name: num_train_instances
40
+ display_name: '# train'
41
+ description: Number of training instances (e.g., in-context examples).
42
+ - name: prompt_truncated
43
+ display_name: truncated
44
+ description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
45
+ - name: finish_reason_length
46
+ display_name: finish b/c length
47
  description: Fraction of instances where the output was terminated because of the max tokens limit.
48
+ - name: finish_reason_stop
49
+ display_name: finish b/c stop
50
  description: Fraction of instances where the output was terminated because of the stop sequences.
51
+ - name: finish_reason_endoftext
52
+ display_name: finish b/c endoftext
53
  description: Fraction of instances where the output was terminated because the end of text token was generated.
54
+ - name: finish_reason_unknown
55
+ display_name: finish b/c unknown
56
  description: Fraction of instances where the output was terminated for unknown reasons.
57
+ - name: num_completions
58
+ display_name: '# completions'
59
+ description: Number of completions.
60
+ - name: predicted_index
61
+ display_name: Predicted index
62
+ description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
63
+
64
+ # Vision Language metrics [text]:
65
+ - name: edit_similarity
66
+ display_name: Edit similarity (Levenshtein)
67
+ short_display_name: Edit sim.
68
+ lower_is_better: false
69
+ description: Average Levenshtein edit similarity (1 - distance normalized by length of longer sequence) between model generation and reference.
70
+
71
+ # Vision Language metrics [image]:
72
+ - name: earth_mover_similarity
73
+ display_name: Earth Mover Similarity
74
+ short_display_name: EMS
75
+ description: Earth Mover Similarity (EMD adapted to speed up calculations)
76
+ lower_is_better: false
77
+ - name: pixel_similarity
78
+ display_name: Pixel Similarity
79
+ short_display_name: PS
80
+ description: Pixel Similarity between an image generated by the model and the target image.
81
+ lower_is_better: false
82
+ - name: compilation_success
83
+ display_name: Compilation success
84
+ description: Fraction of instances where the generated code compiles successfully.
85
+ lower_is_better: false
86
+ - name: fid_similarity
87
+ display_name: FID similarity
88
+ short_display_name: FID
89
+ description: FID similarity (Fréchet Inception Distance) [(Heusel et al., 2017)](https://arxiv.org/abs/1706.08500) between an image generated by the model and the target image.
90
+ lower_is_better: false
91
+
92
+ # Accuracy metrics:
93
+ - name: exact_match
94
+ display_name: Exact match
95
+ short_display_name: EM
96
+ description: Fraction of instances that the predicted output matches a correct reference exactly.
97
+ lower_is_better: false
98
+ - name: quasi_exact_match
99
+ display_name: Quasi-exact match
100
+ short_display_name: EM
101
+ description: Fraction of instances that the predicted output matches a correct reference up to light processing.
102
+ lower_is_better: false
103
+ - name: prefix_exact_match
104
+ display_name: Prefix exact match
105
+ short_display_name: PEM
106
+ description: Fraction of instances that the predicted output matches the prefix of a correct reference exactly.
107
+ lower_is_better: false
108
+ - name: quasi_prefix_exact_match
109
+ # TODO: should call this prefix_quasi_exact_match
110
+ display_name: Prefix quasi-exact match
111
+ short_display_name: PEM
112
+ description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing.
113
+ lower_is_better: false
114
+
115
+ ############################################################
116
+ perturbations:
117
+ - name: robustness
118
+ display_name: Robustness
119
+ description: Computes worst case over different robustness perturbations (misspellings, formatting, contrast sets).
120
+
121
+ ############################################################
122
+ metric_groups:
123
+ - name: accuracy
124
+ display_name: Compilation Rate and Earth Mover Similarity
125
+ metrics:
126
+ - name: earth_mover_similarity
127
+ split: ${main_split}
128
+ - name: compilation_success
129
+ split: ${main_split}
130
+
131
+ - name: general_information
132
+ display_name: General information
133
+ metrics:
134
+ - name: num_instances
135
+ split: ${main_split}
136
+ - name: num_train_instances
137
+ split: ${main_split}
138
+ - name: prompt_truncated
139
+ split: ${main_split}
140
+ - name: num_prompt_tokens
141
+ split: ${main_split}
142
+ - name: num_output_tokens
143
+ split: ${main_split}
144
+
145
+ - name: accuracy_simple
146
+ display_name: Earth Mover Similarity
147
+ metrics:
148
+ - name: earth_mover_similarity
149
+ split: ${main_split}
150
+
151
+ - name: compilation
152
+ display_name: Compilation Rate
153
+ metrics:
154
+ - name: compilation_success
155
+ split: ${main_split}
156
+
157
+ - name: generation_image
158
+ display_name: Generation (image)
159
+ metrics:
160
+ - name: pixel_similarity
161
+ split: ${main_split}
162
+ - name: compilation_success
163
+ split: ${main_split}
164
+ - name: fid_similarity
165
+ split: ${main_split}
166
+ - name: earth_mover_similarity
167
+ split: ${main_split}
168
+
169
+ - name: generation_text
170
+ display_name: Generation (text)
171
+ metrics:
172
+ - name: edit_similarity
173
+ split: ${main_split}
174
+
175
+ ############################################################
176
+ run_groups:
177
+ - name: core_scenarios
178
+ display_name: Image2Structure
179
+ description: Scenarios for evaluating the ability of Vision-Language models to generate structured outputs from images.
180
+ category: All scenarios
181
+ subgroups:
182
+ - image2latex
183
+ - image2webpage
184
+ - image2musicsheet
185
+
186
+ - name: image2structure_real
187
+ display_name: Image2Structure (Wild)
188
+ description: Scenarios for evaluating the ability of Vision-Language models to generate structured outputs from images. These scenarios contain images that do not have a ground truth.
189
+ category: All scenarios
190
+ subgroups:
191
+ - image2latex_real
192
+ - image2webpage_real
193
+
194
+ - name: image2latex
195
+ display_name: Image2LaTeX
196
  description: The Image2LaTeX benchmark for converting images of mathematical equations, tables, algorithms and tikz to LaTeX.
197
+ metric_groups:
198
+ - accuracy
199
+ - generation_image
200
+ - generation_text
201
+ - general_information
202
+ environment:
203
+ main_name: earth_mover_similarity
204
+ main_split: valid
205
+ taxonomy:
206
+ task: image-to-text
207
+ what: mathematical equations, tables, algorithms, tikz
208
+ who: dataset authors
209
+ when: "2024"
210
+ language: English
211
+
212
+ - name: image2latex_easy
213
+ display_name: I2LaTeX (Easy)
214
+ description: The 1/3 easiest examples of the Image2LaTeX benchmark according to a simple heuristic counting the number of characters in the ground truth LaTeX code.
215
+ metric_groups:
216
+ - accuracy_simple
217
+ - compilation
218
+ - generation_image
219
+ - generation_text
220
+ - general_information
221
+ environment:
222
+ main_name: earth_mover_similarity
223
+ main_split: valid
224
+ taxonomy:
225
+ task: image-to-text
226
+ what: mathematical equations, tables, algorithms, tikz
227
+ who: dataset authors
228
+ when: "2024"
229
+ language: English
230
+
231
+ - name: image2latex_medium
232
+ display_name: I2LaTeX (Medium)
233
  description: The 1/3 examples with medium difficulty of the Image2LaTeX benchmark according to a simple heuristic counting the number of characters in the ground truth LaTeX code.
234
+ metric_groups:
235
+ - accuracy_simple
236
+ - compilation
237
+ - generation_image
238
+ - generation_text
239
+ - general_information
240
+ environment:
241
+ main_name: earth_mover_similarity
242
+ main_split: valid
243
+ taxonomy:
244
+ task: image-to-text
245
+ what: mathematical equations, tables, algorithms, tikz
246
+ who: dataset authors
247
+ when: "2024"
248
+ language: English
249
+
250
+ - name: image2latex_hard
251
+ display_name: I2LaTeX (Hard)
252
+ description: The 1/3 hardest examples of the Image2LaTeX benchmark according to a simple heuristic counting the number of characters in the ground truth LaTeX code.
253
+ metric_groups:
254
+ - accuracy_simple
255
+ - compilation
256
+ - generation_image
257
+ - generation_text
258
+ - general_information
259
+ environment:
260
+ main_name: earth_mover_similarity
261
+ main_split: valid
262
+ taxonomy:
263
+ task: image-to-text
264
+ what: mathematical equations, tables, algorithms, tikz
265
+ who: dataset authors
266
+ when: "2024"
267
+ language: English
268
+
269
+ - name: image2latex_real
270
+ display_name: Image2LaTeX (Wild)
271
+ description: Images of mathematical equations gathered from Wikipedia that do not have a LaTeX ground truth.
272
+ metric_groups:
273
+ - accuracy
274
+ - generation_image
275
+ - general_information
276
+ environment:
277
+ main_name: earth_mover_similarity
278
+ main_split: valid
279
+ taxonomy:
280
+ task: image-to-text
281
+ what: mathematical equations, tables, algorithms, tikz
282
+ who: dataset authors
283
+ when: "2024"
284
+ language: English
285
+
286
+ - name: image2webpage
287
+ display_name: Image2webpage
288
+ description: The Image2webpage benchmark for converting images of webpages to HTML/CSS/Javascript.
289
+ metric_groups:
290
+ - accuracy
291
+ - generation_image
292
+ - generation_text
293
+ - general_information
294
+ environment:
295
+ main_name: earth_mover_similarity
296
+ main_split: valid
297
+ taxonomy:
298
+ task: image-to-text
299
+ what: css, html, javascript
300
+ who: dataset authors
301
+ when: "2024"
302
+ language: English
303
+
304
+ - name: image2webpage_easy
305
+ display_name: I2webpage (Easy)
306
+ description: The 1/3 easiest examples of the Image2webpage benchmark according to a simple heuristic counting the number of characters in the ground truth HTML/CSS/Javascript code.
307
+ metric_groups:
308
+ - accuracy_simple
309
+ - compilation
310
+ - generation_image
311
+ - generation_text
312
+ - general_information
313
+ environment:
314
+ main_name: earth_mover_similarity
315
+ main_split: valid
316
+ taxonomy:
317
+ task: image-to-text
318
+ what: css, html, javascript
319
+ who: dataset authors
320
+ when: "2024"
321
+ language: English
322
+
323
+ - name: image2webpage_medium
324
+ display_name: I2webpage (Medium)
325
  description: The 1/3 examples with medium difficulty of the Image2webpage benchmark according to a simple heuristic counting the number of characters in the ground truth HTML/CSS/Javascript code.
326
+ metric_groups:
327
+ - accuracy_simple
328
+ - compilation
329
+ - generation_image
330
+ - generation_text
331
+ - general_information
332
+ environment:
333
+ main_name: earth_mover_similarity
334
+ main_split: valid
335
+ taxonomy:
336
+ task: image-to-text
337
+ what: css, html, javascript
338
+ who: dataset authors
339
+ when: "2024"
340
+ language: English
341
+
342
+ - name: image2webpage_hard
343
+ display_name: I2webpage (Hard)
344
+ description: The 1/3 hardest examples of the Image2webpage benchmark according to a simple heuristic counting the number of characters in the ground truth HTML/CSS/Javascript code.
345
+ metric_groups:
346
+ - accuracy_simple
347
+ - compilation
348
+ - generation_image
349
+ - generation_text
350
+ - general_information
351
+ environment:
352
+ main_name: earth_mover_similarity
353
+ main_split: valid
354
+ taxonomy:
355
+ task: image-to-text
356
+ what: css, html, javascript
357
+ who: dataset authors
358
+ when: "2024"
359
+ language: English
360
+
361
+ - name: image2webpage_real
362
+ display_name: Image2webpage (Wild)
363
  description: Images of webpages gathered from the internet by taking screenshots and so on that do not have an HTML/CSS/Javascript ground truth.
364
+ metric_groups:
365
+ - accuracy
366
+ - generation_image
367
+ - general_information
368
+ environment:
369
+ main_name: earth_mover_similarity
370
+ main_split: valid
371
+ taxonomy:
372
+ task: image-to-text
373
+ what: css, html, javascript
374
+ who: dataset authors
375
+ when: "2024"
376
+ language: English
377
+
378
+ - name: image2musicsheet
379
+ display_name: Image2musicsheet
380
+ description: The Image2musicsheet benchmark for converting images of music sheets to LilyPond.
381
+ metric_groups:
382
+ - accuracy
383
+ - generation_image
384
+ - general_information
385
+ environment:
386
+ main_name: earth_mover_similarity
387
+ main_split: valid
388
+ taxonomy:
389
+ task: image-to-text
390
+ what: music sheets
391
+ who: dataset authors
392
+ when: "2024"
393
+ language: English
394
+
395
+ - name: image2musicsheet_easy
396
+ display_name: I2musicsheet (Easy)
397
+ description: The 1/3 easiest examples of the Image2musicsheet benchmark according to a simple heuristic counting the number of black pixels in the target image.
398
+ metric_groups:
399
+ - accuracy_simple
400
+ - compilation
401
+ - generation_image
402
+ - general_information
403
+ environment:
404
+ main_name: earth_mover_similarity
405
+ main_split: valid
406
+ taxonomy:
407
+ task: image-to-text
408
+ what: music sheets
409
+ who: dataset authors
410
+ when: "2024"
411
+ language: English
412
+
413
+ - name: image2musicsheet_medium
414
+ display_name: I2musicsheet (Medium)
415
  description: The 1/3 examples with medium difficulty of the Image2musicsheet benchmark according to a simple heuristic counting the number of black pixels in the target image.
416
+ metric_groups:
417
+ - accuracy_simple
418
+ - compilation
419
+ - generation_image
420
+ - general_information
421
+ environment:
422
+ main_name: earth_mover_similarity
423
+ main_split: valid
424
+ taxonomy:
425
+ task: image-to-text
426
+ what: music sheets
427
+ who: dataset authors
428
+ when: "2024"
429
+ language: English
430
+
431
+ - name: image2musicsheet_hard
432
+ display_name: I2musicsheet (Hard)
433
+ description: The 1/3 hardest examples of the Image2musicsheet benchmark according to a simple heuristic counting the number of black pixels in the target image.
434
+ metric_groups:
435
+ - accuracy_simple
436
+ - compilation
437
+ - generation_image
438
+ - general_information
439
+ environment:
440
+ main_name: earth_mover_similarity
441
+ main_split: valid
442
+ taxonomy:
443
+ task: image-to-text
444
+ what: music sheets
445
+ who: dataset authors
446
+ when: "2024"
447
+ language: English