crfm-helm 0.5.1__py3-none-any.whl → 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic. Click here for more details.
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/METADATA +13 -3
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/RECORD +96 -63
- helm/benchmark/adaptation/adapter_spec.py +32 -31
- helm/benchmark/annotation/air_bench_annotator.py +64 -0
- helm/benchmark/annotation/annotator_factory.py +6 -0
- helm/benchmark/annotation/live_qa_annotator.py +84 -0
- helm/benchmark/annotation/medication_qa_annotator.py +81 -0
- helm/benchmark/augmentations/translate_perturbation.py +1 -0
- helm/benchmark/huggingface_registration.py +16 -6
- helm/benchmark/metrics/air_bench_metrics.py +56 -0
- helm/benchmark/metrics/fin_qa_metrics.py +60 -0
- helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
- helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
- helm/benchmark/metrics/live_qa_metrics.py +23 -0
- helm/benchmark/metrics/medication_qa_metrics.py +23 -0
- helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
- helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
- helm/benchmark/metrics/unitxt_metrics.py +20 -10
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +29 -71
- helm/benchmark/presentation/schema.py +54 -4
- helm/benchmark/presentation/test_schema.py +11 -0
- helm/benchmark/run.py +16 -2
- helm/benchmark/run_expander.py +77 -0
- helm/benchmark/run_spec_factory.py +4 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +15 -11
- helm/benchmark/run_specs/decodingtrust_run_specs.py +3 -1
- helm/benchmark/run_specs/experimental_run_specs.py +33 -0
- helm/benchmark/run_specs/finance_run_specs.py +33 -0
- helm/benchmark/run_specs/vlm_run_specs.py +168 -45
- helm/benchmark/scenarios/air_bench_scenario.py +50 -0
- helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
- helm/benchmark/scenarios/fin_qa_scenario.py +117 -0
- helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +13 -2
- helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +1 -5
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +0 -4
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +4 -2
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +6 -5
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +95 -0
- helm/benchmark/static/schema_air_bench.yaml +3149 -0
- helm/benchmark/static/schema_classic.yaml +3 -59
- helm/benchmark/static/schema_finance.yaml +143 -0
- helm/benchmark/static/schema_image2structure.yaml +254 -111
- helm/benchmark/static/schema_instruction_following.yaml +3 -52
- helm/benchmark/static/schema_lite.yaml +3 -61
- helm/benchmark/static/schema_medical.yaml +255 -0
- helm/benchmark/static/schema_mmlu.yaml +3 -61
- helm/benchmark/static/schema_tables.yaml +200 -0
- helm/benchmark/static/schema_thai.yaml +223 -0
- helm/benchmark/static/schema_unitxt.yaml +3 -61
- helm/benchmark/static/{schema_vlm.yaml → schema_vhelm.yaml} +294 -293
- helm/benchmark/static/schema_vhelm_lite.yaml +4 -59
- helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
- helm/benchmark/static_build/assets/index-30dbceba.js +10 -0
- helm/benchmark/static_build/assets/index-66b02d40.css +1 -0
- helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
- helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/clients/anthropic_client.py +43 -9
- helm/clients/auto_client.py +11 -0
- helm/clients/client.py +24 -7
- helm/clients/cohere_client.py +98 -3
- helm/clients/huggingface_client.py +71 -12
- helm/clients/openai_client.py +9 -2
- helm/clients/reka_client.py +189 -0
- helm/clients/test_client.py +3 -3
- helm/clients/test_huggingface_client.py +19 -3
- helm/clients/test_together_client.py +72 -2
- helm/clients/together_client.py +129 -23
- helm/clients/vertexai_client.py +62 -18
- helm/clients/vision_language/huggingface_vlm_client.py +1 -0
- helm/clients/vision_language/paligemma_client.py +146 -0
- helm/clients/vision_language/palmyra_vision_client.py +84 -0
- helm/clients/yi_client.py +31 -0
- helm/common/critique_request.py +10 -1
- helm/common/images_utils.py +19 -0
- helm/config/model_deployments.yaml +412 -18
- helm/config/model_metadata.yaml +447 -25
- helm/config/tokenizer_configs.yaml +93 -1
- helm/proxy/critique/model_critique_client.py +32 -4
- helm/proxy/services/server_service.py +1 -1
- helm/tokenizers/auto_tokenizer.py +1 -1
- helm/tokenizers/cohere_tokenizer.py +44 -2
- helm/tokenizers/huggingface_tokenizer.py +36 -13
- helm/tokenizers/test_cohere_tokenizer.py +39 -0
- helm/tokenizers/test_huggingface_tokenizer.py +5 -1
- helm/benchmark/static_build/assets/index-737eef9e.js +0 -10
- helm/benchmark/static_build/assets/index-878a1094.css +0 -1
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/LICENSE +0 -0
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/top_level.txt +0 -0
|
@@ -1,64 +1,8 @@
|
|
|
1
1
|
---
|
|
2
2
|
############################################################
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
values:
|
|
7
|
-
- name: generation
|
|
8
|
-
description: Given the input, the model generates the output free-form.
|
|
9
|
-
- name: multiple_choice_joint
|
|
10
|
-
description: Given the input, the model selects from multiple-choice options (A., B., C., D., E.).
|
|
11
|
-
- name: multiple_choice_separate_original
|
|
12
|
-
description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability.
|
|
13
|
-
- name: multiple_choice_separate_calibrated
|
|
14
|
-
description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability when calibrated by answer choice probability.
|
|
15
|
-
- name: language_modeling
|
|
16
|
-
description: Given the input, the model assigns the sequence a probability.
|
|
17
|
-
- name: instructions
|
|
18
|
-
description: The description of the task that is included at the very beginning of the prompt.
|
|
19
|
-
- name: global_prefix
|
|
20
|
-
description: The string that is prepended to the prompt.
|
|
21
|
-
- name: instance_prefix
|
|
22
|
-
description: The string that is included before each instance (e.g., '\n\n').
|
|
23
|
-
- name: input_prefix
|
|
24
|
-
description: The string that is included before each input (e.g., 'Question:').
|
|
25
|
-
- name: input_suffix
|
|
26
|
-
description: The string that is included after each input (e.g., '\n').
|
|
27
|
-
- name: reference_prefix
|
|
28
|
-
description: The string that is included before each reference (for multiple-choice questions).
|
|
29
|
-
- name: reference_suffix
|
|
30
|
-
description: The string that is included after each reference (for multiple-choice questions).
|
|
31
|
-
- name: output_prefix
|
|
32
|
-
description: The string that is included before the correct answer/predicted output (e.g., 'Answer:').
|
|
33
|
-
- name: output_suffix
|
|
34
|
-
description: The string that is included after the correct answer/predicted output (e.g., '\n').
|
|
35
|
-
- name: substitutions
|
|
36
|
-
description: A list of regular expression substitutions (e.g., replacing '\n' with ';\n') to perform at the very end on the prompt.
|
|
37
|
-
- name: max_train_instances
|
|
38
|
-
description: Maximum number of training instances to include in the prompt (currently by randomly sampling).
|
|
39
|
-
- name: max_eval_instances
|
|
40
|
-
description: Maximum number of instances to evaluate on (over all splits - test, valid, etc.).
|
|
41
|
-
- name: num_outputs
|
|
42
|
-
description: Maximum number of possible outputs to generate by sampling multiple outputs.
|
|
43
|
-
- name: num_train_trials
|
|
44
|
-
description: Number of trials, where in each trial we choose an independent, random set of training instances. Used to compute variance.
|
|
45
|
-
- name: sample_train
|
|
46
|
-
description: If true, randomly sample N training examples; if false, select N consecutive training examples
|
|
47
|
-
- name: model
|
|
48
|
-
description: Name of the language model (<creator_organization>/<model name>) to send requests to.
|
|
49
|
-
- name: model_deployment
|
|
50
|
-
description: Name of the language model deployment (<host_organization>/<model name>) to send requests to.
|
|
51
|
-
- name: temperature
|
|
52
|
-
description: Temperature parameter used in generation.
|
|
53
|
-
- name: max_tokens
|
|
54
|
-
description: Maximum number of tokens to generate.
|
|
55
|
-
- name: stop_sequences
|
|
56
|
-
description: List of sequences, where we stop generation if we encounter any of them.
|
|
57
|
-
- name: random
|
|
58
|
-
description: Random seed (string), which guarantees reproducibility.
|
|
59
|
-
- name: multi_label
|
|
60
|
-
description: If true, for instances with multiple correct reference, the gold answer should be considered to be all of the correct references rather than any of the correct references.
|
|
61
|
-
|
|
3
|
+
# For backwards compatibility with older versions of HELM.
|
|
4
|
+
# TODO: Remove this after 2024-09-01.
|
|
5
|
+
adapter: []
|
|
62
6
|
############################################################
|
|
63
7
|
metrics:
|
|
64
8
|
# Infrastructure metrics:
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
---
|
|
2
|
+
############################################################
|
|
3
|
+
metrics:
|
|
4
|
+
# Infrastructure metrics:
|
|
5
|
+
- name: num_perplexity_tokens
|
|
6
|
+
display_name: '# tokens'
|
|
7
|
+
description: Average number of tokens in the predicted output (for language modeling, the input too).
|
|
8
|
+
- name: num_bytes
|
|
9
|
+
display_name: '# bytes'
|
|
10
|
+
description: Average number of bytes in the predicted output (for language modeling, the input too).
|
|
11
|
+
|
|
12
|
+
- name: num_references
|
|
13
|
+
display_name: '# ref'
|
|
14
|
+
description: Number of references.
|
|
15
|
+
- name: num_train_trials
|
|
16
|
+
display_name: '# trials'
|
|
17
|
+
description: Number of trials, where in each trial we choose an independent, random set of training instances.
|
|
18
|
+
- name: estimated_num_tokens_cost
|
|
19
|
+
display_name: 'cost'
|
|
20
|
+
description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
|
|
21
|
+
- name: num_prompt_tokens
|
|
22
|
+
display_name: '# prompt tokens'
|
|
23
|
+
description: Number of tokens in the prompt.
|
|
24
|
+
- name: num_prompt_characters
|
|
25
|
+
display_name: '# prompt chars'
|
|
26
|
+
description: Number of characters in the prompt.
|
|
27
|
+
- name: num_completion_tokens
|
|
28
|
+
display_name: '# completion tokens'
|
|
29
|
+
description: Actual number of completion tokens (over all completions).
|
|
30
|
+
- name: num_output_tokens
|
|
31
|
+
display_name: '# output tokens'
|
|
32
|
+
description: Actual number of output tokens.
|
|
33
|
+
- name: max_num_output_tokens
|
|
34
|
+
display_name: 'Max output tokens'
|
|
35
|
+
description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
|
|
36
|
+
- name: num_requests
|
|
37
|
+
display_name: '# requests'
|
|
38
|
+
description: Number of distinct API requests.
|
|
39
|
+
- name: num_instances
|
|
40
|
+
display_name: '# eval'
|
|
41
|
+
description: Number of evaluation instances.
|
|
42
|
+
- name: num_train_instances
|
|
43
|
+
display_name: '# train'
|
|
44
|
+
description: Number of training instances (e.g., in-context examples).
|
|
45
|
+
- name: prompt_truncated
|
|
46
|
+
display_name: truncated
|
|
47
|
+
description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
|
|
48
|
+
- name: finish_reason_length
|
|
49
|
+
display_name: finish b/c length
|
|
50
|
+
description: Fraction of instances where the the output was terminated because of the max tokens limit.
|
|
51
|
+
- name: finish_reason_stop
|
|
52
|
+
display_name: finish b/c stop
|
|
53
|
+
description: Fraction of instances where the the output was terminated because of the stop sequences.
|
|
54
|
+
- name: finish_reason_endoftext
|
|
55
|
+
display_name: finish b/c endoftext
|
|
56
|
+
description: Fraction of instances where the the output was terminated because the end of text token was generated.
|
|
57
|
+
- name: finish_reason_unknown
|
|
58
|
+
display_name: finish b/c unknown
|
|
59
|
+
description: Fraction of instances where the the output was terminated for unknown reasons.
|
|
60
|
+
- name: num_completions
|
|
61
|
+
display_name: '# completions'
|
|
62
|
+
description: Number of completions.
|
|
63
|
+
- name: predicted_index
|
|
64
|
+
display_name: Predicted index
|
|
65
|
+
description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
|
|
66
|
+
|
|
67
|
+
# Accuracy metrics:
|
|
68
|
+
- name: program_accuracy
|
|
69
|
+
display_name: Program Accuracy
|
|
70
|
+
description: Accuracy of the generated programs
|
|
71
|
+
lower_is_better: false
|
|
72
|
+
- name: execution_accuracy
|
|
73
|
+
display_name: Execution Accuracy
|
|
74
|
+
description: Accuracy of the final result of the generated program
|
|
75
|
+
lower_is_better: false
|
|
76
|
+
|
|
77
|
+
############################################################
|
|
78
|
+
perturbations: []
|
|
79
|
+
|
|
80
|
+
############################################################
|
|
81
|
+
metric_groups:
|
|
82
|
+
- name: accuracy
|
|
83
|
+
display_name: Accuracy
|
|
84
|
+
metrics:
|
|
85
|
+
- name: ${main_name}
|
|
86
|
+
split: ${main_split}
|
|
87
|
+
|
|
88
|
+
- name: efficiency
|
|
89
|
+
display_name: Efficiency
|
|
90
|
+
metrics:
|
|
91
|
+
- name: inference_runtime
|
|
92
|
+
split: ${main_split}
|
|
93
|
+
|
|
94
|
+
- name: general_information
|
|
95
|
+
display_name: General information
|
|
96
|
+
hide_win_rates: true
|
|
97
|
+
metrics:
|
|
98
|
+
- name: num_instances
|
|
99
|
+
split: ${main_split}
|
|
100
|
+
- name: num_train_instances
|
|
101
|
+
split: ${main_split}
|
|
102
|
+
- name: prompt_truncated
|
|
103
|
+
split: ${main_split}
|
|
104
|
+
- name: num_prompt_tokens
|
|
105
|
+
split: ${main_split}
|
|
106
|
+
- name: num_output_tokens
|
|
107
|
+
split: ${main_split}
|
|
108
|
+
|
|
109
|
+
############################################################
|
|
110
|
+
run_groups:
|
|
111
|
+
- name: financial_scenarios
|
|
112
|
+
display_name: Financial Scenarios
|
|
113
|
+
description: Scenarios for the financial domain
|
|
114
|
+
category: All scenarios
|
|
115
|
+
subgroups:
|
|
116
|
+
- fin_qa
|
|
117
|
+
|
|
118
|
+
- name: fin_qa
|
|
119
|
+
display_name: FinQA
|
|
120
|
+
description: The FinQA benchmark for numeric reasoning over financial data, with question answering pairs written by financial experts over financial reports [(Chen et al., 2021)](https://arxiv.org/abs/2109.00122/).
|
|
121
|
+
metric_groups:
|
|
122
|
+
- accuracy
|
|
123
|
+
- efficiency
|
|
124
|
+
- general_information
|
|
125
|
+
environment:
|
|
126
|
+
main_name: program_accuracy
|
|
127
|
+
main_split: test
|
|
128
|
+
taxonomy:
|
|
129
|
+
task: question answering with numeric reasoning
|
|
130
|
+
what: financial reports
|
|
131
|
+
who: financial experts
|
|
132
|
+
when: 1999 to 2019
|
|
133
|
+
language: English
|
|
134
|
+
|
|
135
|
+
- name: financial_scenarios_ablations
|
|
136
|
+
display_name: Financial Scenarios Ablations
|
|
137
|
+
description: Scenarios for the financial domain with ablations
|
|
138
|
+
category: All scenarios
|
|
139
|
+
subgroups:
|
|
140
|
+
- fin_qa
|
|
141
|
+
adapter_keys_shown:
|
|
142
|
+
- model
|
|
143
|
+
- max_train_instances
|
|
@@ -1,78 +1,11 @@
|
|
|
1
1
|
---
|
|
2
2
|
############################################################
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
values:
|
|
7
|
-
- name: generation
|
|
8
|
-
description: Given the input, the model generates the output free-form.
|
|
9
|
-
- name: generation_multimodal
|
|
10
|
-
description: Given the multimodal input, the model generates the output free-form.
|
|
11
|
-
- name: multiple_choice_joint
|
|
12
|
-
description: Given the input, the model selects from multiple-choice options (A., B., C., D., E.).
|
|
13
|
-
- name: multiple_choice_separate_original
|
|
14
|
-
description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability.
|
|
15
|
-
- name: multiple_choice_separate_calibrated
|
|
16
|
-
description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability when calibrated by answer choice probability.
|
|
17
|
-
- name: language_modeling
|
|
18
|
-
description: Given the input, the model assigns the sequence a probability.
|
|
19
|
-
- name: instructions
|
|
20
|
-
description: The description of the task that is included at the very beginning of the prompt.
|
|
21
|
-
- name: global_prefix
|
|
22
|
-
description: The string that is prepended to the prompt.
|
|
23
|
-
- name: global_suffix
|
|
24
|
-
description: The string that is appended to the prompt.
|
|
25
|
-
- name: instance_prefix
|
|
26
|
-
description: The string that is included before each instance (e.g., '\n\n').
|
|
27
|
-
- name: input_prefix
|
|
28
|
-
description: The string that is included before each input (e.g., 'Question:').
|
|
29
|
-
- name: input_suffix
|
|
30
|
-
description: The string that is included after each input (e.g., '\n').
|
|
31
|
-
- name: reference_prefix
|
|
32
|
-
description: The string that is included before each reference (for multiple-choice questions).
|
|
33
|
-
- name: reference_suffix
|
|
34
|
-
description: The string that is included after each reference (for multiple-choice questions).
|
|
35
|
-
- name: output_prefix
|
|
36
|
-
description: The string that is included before the correct answer/predicted output (e.g., 'Answer:').
|
|
37
|
-
- name: output_suffix
|
|
38
|
-
description: The string that is included after the correct answer/predicted output (e.g., '\n').
|
|
39
|
-
- name: substitutions
|
|
40
|
-
description: A list of regular expression substitutions (e.g., replacing '\n' with ';\n') to perform at the very end on the prompt.
|
|
41
|
-
- name: max_train_instances
|
|
42
|
-
description: Maximum number of training instances to include in the prompt (currently by randomly sampling).
|
|
43
|
-
- name: max_eval_instances
|
|
44
|
-
description: Maximum number of instances to evaluate on (over all splits - test, valid, etc.).
|
|
45
|
-
- name: num_outputs
|
|
46
|
-
description: Maximum number of possible outputs to generate by sampling multiple outputs.
|
|
47
|
-
- name: num_train_trials
|
|
48
|
-
description: Number of trials, where in each trial we choose an independent, random set of training instances. Used to compute variance.
|
|
49
|
-
- name: sample_train
|
|
50
|
-
description: If true, randomly sample N training examples; if false, select N consecutive training examples
|
|
51
|
-
- name: model
|
|
52
|
-
description: Name of the language model (<creator_organization>/<model name>) to send requests to.
|
|
53
|
-
- name: model_deployment
|
|
54
|
-
description: Name of the language model deployment (<host_organization>/<model name>) to send requests to.
|
|
55
|
-
- name: temperature
|
|
56
|
-
description: Temperature parameter used in generation.
|
|
57
|
-
- name: max_tokens
|
|
58
|
-
description: Maximum number of tokens to generate.
|
|
59
|
-
- name: stop_sequences
|
|
60
|
-
description: List of sequences, where we stop generation if we encounter any of them.
|
|
61
|
-
- name: random
|
|
62
|
-
description: Random seed (string), which guarantees reproducibility.
|
|
63
|
-
- name: multi_label
|
|
64
|
-
description: If true, for instances with multiple correct reference, the gold answer should be considered to be all of the correct references rather than any of the correct references.
|
|
65
|
-
|
|
3
|
+
# For backwards compatibility with older versions of HELM.
|
|
4
|
+
# TODO: Remove this after 2024-09-01.
|
|
5
|
+
adapter: []
|
|
66
6
|
############################################################
|
|
67
7
|
metrics:
|
|
68
8
|
# Infrastructure metrics:
|
|
69
|
-
- name: num_perplexity_tokens
|
|
70
|
-
display_name: '# tokens'
|
|
71
|
-
description: Average number of tokens in the predicted output (for language modeling, the input too).
|
|
72
|
-
- name: num_bytes
|
|
73
|
-
display_name: '# bytes'
|
|
74
|
-
description: Average number of bytes in the predicted output (for language modeling, the input too).
|
|
75
|
-
|
|
76
9
|
- name: num_references
|
|
77
10
|
display_name: '# ref'
|
|
78
11
|
description: Number of references.
|
|
@@ -136,50 +69,25 @@ metrics:
|
|
|
136
69
|
description: Average Levenshtein edit similarity (1 - distance normalized by length of longer sequence) between model generation and reference.
|
|
137
70
|
|
|
138
71
|
# Vision Language metrics [image]:
|
|
139
|
-
- name:
|
|
140
|
-
display_name:
|
|
141
|
-
short_display_name:
|
|
142
|
-
description:
|
|
143
|
-
lower_is_better: false
|
|
144
|
-
- name: block_emd_similarity_white
|
|
145
|
-
display_name: Block Earth Mover Similarity (white)
|
|
146
|
-
short_display_name: Block EMS (white)
|
|
147
|
-
description: Block Earth Mover Similarity (white)
|
|
148
|
-
lower_is_better: false
|
|
149
|
-
- name: block_emd_similarity_median_color
|
|
150
|
-
display_name: Block Earth Mover Similarity (median)
|
|
151
|
-
short_display_name: Block EMS (median)
|
|
152
|
-
description: Block Earth Mover Similarity (median)
|
|
72
|
+
- name: earth_mover_similarity
|
|
73
|
+
display_name: Earth Mover Similarity
|
|
74
|
+
short_display_name: EMS
|
|
75
|
+
description: Earth Mover Similarity (EMD adapted to speed up calculations)
|
|
153
76
|
lower_is_better: false
|
|
154
77
|
- name: pixel_similarity
|
|
155
78
|
display_name: Pixel Similarity
|
|
156
79
|
short_display_name: PS
|
|
157
80
|
description: Pixel Similarity between an image generated by the model and the target image.
|
|
158
81
|
lower_is_better: false
|
|
159
|
-
- name: sift_similarity
|
|
160
|
-
display_name: SIFT Similarity
|
|
161
|
-
short_display_name: SIFT
|
|
162
|
-
description: SIFT Similarity (Scale-Invariant Feature Transform) [(Lowe, 1999)](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=790410) between an image generated by the model and the target image.
|
|
163
|
-
lower_is_better: false
|
|
164
82
|
- name: compilation_success
|
|
165
83
|
display_name: Compilation success
|
|
166
84
|
description: Fraction of instances where the generated code compiles successfully.
|
|
167
85
|
lower_is_better: false
|
|
168
|
-
- name: lpips_similarity
|
|
169
|
-
display_name: LPIPS similarity
|
|
170
|
-
short_display_name: LPIPS
|
|
171
|
-
description: LPIPS similarity (Learned Perceptual Image Patch Similarity) [(Zhang et al., 2018)](https://arxiv.org/abs/1801.03924) between an image generated by the model and the target image.
|
|
172
|
-
lower_is_better: false
|
|
173
86
|
- name: fid_similarity
|
|
174
87
|
display_name: FID similarity
|
|
175
88
|
short_display_name: FID
|
|
176
89
|
description: FID similarity (Fréchet Inception Distance) [(Heusel et al., 2017)](https://arxiv.org/abs/1706.08500) between an image generated by the model and the target image.
|
|
177
90
|
lower_is_better: false
|
|
178
|
-
- name: ssim_similarity
|
|
179
|
-
display_name: SSIM
|
|
180
|
-
short_display_name: SSIM
|
|
181
|
-
description: SSIM similarity (Structural Similarity Index) [(Wang et al., 2004)](https://www.cns.nyu.edu/pub/eero/wang03-reprint.pdf) between an image generated by the model and the target image.
|
|
182
|
-
lower_is_better: false
|
|
183
91
|
|
|
184
92
|
# Accuracy metrics:
|
|
185
93
|
- name: exact_match
|
|
@@ -215,11 +123,37 @@ metric_groups:
|
|
|
215
123
|
- name: accuracy
|
|
216
124
|
display_name: Compilation Rate and Earth Mover Similarity
|
|
217
125
|
metrics:
|
|
218
|
-
- name:
|
|
126
|
+
- name: earth_mover_similarity
|
|
219
127
|
split: ${main_split}
|
|
220
128
|
- name: compilation_success
|
|
221
129
|
split: ${main_split}
|
|
222
130
|
|
|
131
|
+
- name: general_information
|
|
132
|
+
display_name: General information
|
|
133
|
+
metrics:
|
|
134
|
+
- name: num_instances
|
|
135
|
+
split: ${main_split}
|
|
136
|
+
- name: num_train_instances
|
|
137
|
+
split: ${main_split}
|
|
138
|
+
- name: prompt_truncated
|
|
139
|
+
split: ${main_split}
|
|
140
|
+
- name: num_prompt_tokens
|
|
141
|
+
split: ${main_split}
|
|
142
|
+
- name: num_output_tokens
|
|
143
|
+
split: ${main_split}
|
|
144
|
+
|
|
145
|
+
- name: accuracy_simple
|
|
146
|
+
display_name: Earth Mover Similarity
|
|
147
|
+
metrics:
|
|
148
|
+
- name: earth_mover_similarity
|
|
149
|
+
split: ${main_split}
|
|
150
|
+
|
|
151
|
+
- name: compilation
|
|
152
|
+
display_name: Compilation Rate
|
|
153
|
+
metrics:
|
|
154
|
+
- name: compilation_success
|
|
155
|
+
split: ${main_split}
|
|
156
|
+
|
|
223
157
|
- name: generation_image
|
|
224
158
|
display_name: Generation (image)
|
|
225
159
|
metrics:
|
|
@@ -229,11 +163,7 @@ metric_groups:
|
|
|
229
163
|
split: ${main_split}
|
|
230
164
|
- name: fid_similarity
|
|
231
165
|
split: ${main_split}
|
|
232
|
-
- name:
|
|
233
|
-
split: ${main_split}
|
|
234
|
-
- name: block_emd_similarity_white
|
|
235
|
-
split: ${main_split}
|
|
236
|
-
- name: block_emd_similarity_median_color
|
|
166
|
+
- name: earth_mover_similarity
|
|
237
167
|
split: ${main_split}
|
|
238
168
|
|
|
239
169
|
- name: generation_text
|
|
@@ -253,6 +183,14 @@ run_groups:
|
|
|
253
183
|
- image2webpage
|
|
254
184
|
- image2musicsheet
|
|
255
185
|
|
|
186
|
+
- name: image2structure_real
|
|
187
|
+
display_name: Image2Structure (Wild)
|
|
188
|
+
description: Scenarios for evaluating the ability of Vision-Language models to generate structured outputs from images. These scenarios contain images that do not have a ground truth.
|
|
189
|
+
category: All scenarios
|
|
190
|
+
subgroups:
|
|
191
|
+
- image2latex_real
|
|
192
|
+
- image2webpage_real
|
|
193
|
+
|
|
256
194
|
- name: image2latex
|
|
257
195
|
display_name: Image2LaTeX
|
|
258
196
|
description: The Image2LaTeX benchmark for converting images of mathematical equations, tables. algorithms and tikz to LaTeX.
|
|
@@ -260,13 +198,88 @@ run_groups:
|
|
|
260
198
|
- accuracy
|
|
261
199
|
- generation_image
|
|
262
200
|
- generation_text
|
|
201
|
+
- general_information
|
|
263
202
|
environment:
|
|
264
|
-
main_name:
|
|
203
|
+
main_name: earth_mover_similarity
|
|
265
204
|
main_split: valid
|
|
266
205
|
taxonomy:
|
|
267
206
|
task: image-to-text
|
|
268
207
|
what: mathematical equations, tables, algorithms, tikz
|
|
269
|
-
who:
|
|
208
|
+
who: dataset authors
|
|
209
|
+
when: "2024"
|
|
210
|
+
language: English
|
|
211
|
+
|
|
212
|
+
- name: image2latex_easy
|
|
213
|
+
display_name: I2LaTeX (Easy)
|
|
214
|
+
description: The 1/3 easiest examples of the Image2LaTeX benchmark according to a simple heuristic counting the number of characters in the ground truth LaTeX code.
|
|
215
|
+
metric_groups:
|
|
216
|
+
- accuracy_simple
|
|
217
|
+
- compilation
|
|
218
|
+
- generation_image
|
|
219
|
+
- generation_text
|
|
220
|
+
- general_information
|
|
221
|
+
environment:
|
|
222
|
+
main_name: earth_mover_similarity
|
|
223
|
+
main_split: valid
|
|
224
|
+
taxonomy:
|
|
225
|
+
task: image-to-text
|
|
226
|
+
what: mathematical equations, tables, algorithms, tikz
|
|
227
|
+
who: dataset authors
|
|
228
|
+
when: "2024"
|
|
229
|
+
language: English
|
|
230
|
+
|
|
231
|
+
- name: image2latex_medium
|
|
232
|
+
display_name: I2LaTeX (Medium)
|
|
233
|
+
description: The 1/3 examples with medium diffulty of the Image2LaTeX benchmark according to a simple heuristic counting the number of characters in the ground truth LaTeX code.
|
|
234
|
+
metric_groups:
|
|
235
|
+
- accuracy_simple
|
|
236
|
+
- compilation
|
|
237
|
+
- generation_image
|
|
238
|
+
- generation_text
|
|
239
|
+
- general_information
|
|
240
|
+
environment:
|
|
241
|
+
main_name: earth_mover_similarity
|
|
242
|
+
main_split: valid
|
|
243
|
+
taxonomy:
|
|
244
|
+
task: image-to-text
|
|
245
|
+
what: mathematical equations, tables, algorithms, tikz
|
|
246
|
+
who: dataset authors
|
|
247
|
+
when: "2024"
|
|
248
|
+
language: English
|
|
249
|
+
|
|
250
|
+
- name: image2latex_hard
|
|
251
|
+
display_name: I2LaTeX (Hard)
|
|
252
|
+
description: The 1/3 hardest examples of the Image2LaTeX benchmark according to a simple heuristic counting the number of characters in the ground truth LaTeX code.
|
|
253
|
+
metric_groups:
|
|
254
|
+
- accuracy_simple
|
|
255
|
+
- compilation
|
|
256
|
+
- generation_image
|
|
257
|
+
- generation_text
|
|
258
|
+
- general_information
|
|
259
|
+
environment:
|
|
260
|
+
main_name: earth_mover_similarity
|
|
261
|
+
main_split: valid
|
|
262
|
+
taxonomy:
|
|
263
|
+
task: image-to-text
|
|
264
|
+
what: mathematical equations, tables, algorithms, tikz
|
|
265
|
+
who: dataset authors
|
|
266
|
+
when: "2024"
|
|
267
|
+
language: English
|
|
268
|
+
|
|
269
|
+
- name: image2latex_real
|
|
270
|
+
display_name: Image2LaTeX (Wild)
|
|
271
|
+
description: Images of mathematical equations gathered from Wikipedia that do not have a LaTeX ground truth.
|
|
272
|
+
metric_groups:
|
|
273
|
+
- accuracy
|
|
274
|
+
- generation_image
|
|
275
|
+
- general_information
|
|
276
|
+
environment:
|
|
277
|
+
main_name: earth_mover_similarity
|
|
278
|
+
main_split: valid
|
|
279
|
+
taxonomy:
|
|
280
|
+
task: image-to-text
|
|
281
|
+
what: mathematical equations, tables, algorithms, tikz
|
|
282
|
+
who: dataset authors
|
|
270
283
|
when: "2024"
|
|
271
284
|
language: English
|
|
272
285
|
|
|
@@ -277,13 +290,88 @@ run_groups:
|
|
|
277
290
|
- accuracy
|
|
278
291
|
- generation_image
|
|
279
292
|
- generation_text
|
|
293
|
+
- general_information
|
|
280
294
|
environment:
|
|
281
|
-
main_name:
|
|
295
|
+
main_name: earth_mover_similarity
|
|
282
296
|
main_split: valid
|
|
283
297
|
taxonomy:
|
|
284
298
|
task: image-to-text
|
|
285
299
|
what: css, html, javascript
|
|
286
|
-
who:
|
|
300
|
+
who: dataset authors
|
|
301
|
+
when: "2024"
|
|
302
|
+
language: English
|
|
303
|
+
|
|
304
|
+
- name: image2webpage_easy
|
|
305
|
+
display_name: I2webpage (Easy)
|
|
306
|
+
description: The 1/3 easiest examples of the Image2webpage benchmark according to a simple heuristic counting the number of characters in the ground truth HTML/CSS/Javascript code.
|
|
307
|
+
metric_groups:
|
|
308
|
+
- accuracy_simple
|
|
309
|
+
- compilation
|
|
310
|
+
- generation_image
|
|
311
|
+
- generation_text
|
|
312
|
+
- general_information
|
|
313
|
+
environment:
|
|
314
|
+
main_name: earth_mover_similarity
|
|
315
|
+
main_split: valid
|
|
316
|
+
taxonomy:
|
|
317
|
+
task: image-to-text
|
|
318
|
+
what: css, html, javascript
|
|
319
|
+
who: dataset authors
|
|
320
|
+
when: "2024"
|
|
321
|
+
language: English
|
|
322
|
+
|
|
323
|
+
- name: image2webpage_medium
|
|
324
|
+
display_name: I2webpage (Medium)
|
|
325
|
+
description: The 1/3 examples with medium difficulty of the Image2webpage benchmark according to a simple heuristic counting the number of characters in the ground truth HTML/CSS/Javascript code.
|
|
326
|
+
metric_groups:
|
|
327
|
+
- accuracy_simple
|
|
328
|
+
- compilation
|
|
329
|
+
- generation_image
|
|
330
|
+
- generation_text
|
|
331
|
+
- general_information
|
|
332
|
+
environment:
|
|
333
|
+
main_name: earth_mover_similarity
|
|
334
|
+
main_split: valid
|
|
335
|
+
taxonomy:
|
|
336
|
+
task: image-to-text
|
|
337
|
+
what: css, html, javascript
|
|
338
|
+
who: dataset authors
|
|
339
|
+
when: "2024"
|
|
340
|
+
language: English
|
|
341
|
+
|
|
342
|
+
- name: image2webpage_hard
|
|
343
|
+
display_name: I2webpage (Hard)
|
|
344
|
+
description: The 1/3 hardest examples of the Image2webpage benchmark according to a simple heuristic counting the number of characters in the ground truth HTML/CSS/Javascript code.
|
|
345
|
+
metric_groups:
|
|
346
|
+
- accuracy_simple
|
|
347
|
+
- compilation
|
|
348
|
+
- generation_image
|
|
349
|
+
- generation_text
|
|
350
|
+
- general_information
|
|
351
|
+
environment:
|
|
352
|
+
main_name: earth_mover_similarity
|
|
353
|
+
main_split: valid
|
|
354
|
+
taxonomy:
|
|
355
|
+
task: image-to-text
|
|
356
|
+
what: css, html, javascript
|
|
357
|
+
who: dataset authors
|
|
358
|
+
when: "2024"
|
|
359
|
+
language: English
|
|
360
|
+
|
|
361
|
+
- name: image2webpage_real
|
|
362
|
+
display_name: Image2webpage (Wild)
|
|
363
|
+
description: Images of webpages gathered from the internet by taking screenshots and so on that do not have an HTML/CSS/Javascript ground truth.
|
|
364
|
+
metric_groups:
|
|
365
|
+
- accuracy
|
|
366
|
+
- generation_image
|
|
367
|
+
- general_information
|
|
368
|
+
environment:
|
|
369
|
+
main_name: earth_mover_similarity
|
|
370
|
+
main_split: valid
|
|
371
|
+
taxonomy:
|
|
372
|
+
task: image-to-text
|
|
373
|
+
what: css, html, javascript
|
|
374
|
+
who: dataset authors
|
|
287
375
|
when: "2024"
|
|
288
376
|
language: English
|
|
289
377
|
|
|
@@ -293,12 +381,67 @@ run_groups:
|
|
|
293
381
|
metric_groups:
|
|
294
382
|
- accuracy
|
|
295
383
|
- generation_image
|
|
384
|
+
- general_information
|
|
385
|
+
environment:
|
|
386
|
+
main_name: earth_mover_similarity
|
|
387
|
+
main_split: valid
|
|
388
|
+
taxonomy:
|
|
389
|
+
task: image-to-text
|
|
390
|
+
what: music sheets
|
|
391
|
+
who: dataset authors
|
|
392
|
+
when: "2024"
|
|
393
|
+
language: English
|
|
394
|
+
|
|
395
|
+
- name: image2musicsheet_easy
|
|
396
|
+
display_name: I2musicsheet (Easy)
|
|
397
|
+
description: The 1/3 easiest examples of the Image2musicsheet benchmark according to a simple heuristic counting the number of black pixels in the target image.
|
|
398
|
+
metric_groups:
|
|
399
|
+
- accuracy_simple
|
|
400
|
+
- compilation
|
|
401
|
+
- generation_image
|
|
402
|
+
- general_information
|
|
403
|
+
environment:
|
|
404
|
+
main_name: earth_mover_similarity
|
|
405
|
+
main_split: valid
|
|
406
|
+
taxonomy:
|
|
407
|
+
task: image-to-text
|
|
408
|
+
what: music sheets
|
|
409
|
+
who: dataset authors
|
|
410
|
+
when: "2024"
|
|
411
|
+
language: English
|
|
412
|
+
|
|
413
|
+
- name: image2musicsheet_medium
|
|
414
|
+
display_name: I2musicsheet (Medium)
|
|
415
|
+
description: The 1/3 examples with medium difficulty of the Image2musicsheet benchmark according to a simple heuristic counting the number of black pixels in the target image.
|
|
416
|
+
metric_groups:
|
|
417
|
+
- accuracy_simple
|
|
418
|
+
- compilation
|
|
419
|
+
- generation_image
|
|
420
|
+
- general_information
|
|
421
|
+
environment:
|
|
422
|
+
main_name: earth_mover_similarity
|
|
423
|
+
main_split: valid
|
|
424
|
+
taxonomy:
|
|
425
|
+
task: image-to-text
|
|
426
|
+
what: music sheets
|
|
427
|
+
who: dataset authors
|
|
428
|
+
when: "2024"
|
|
429
|
+
language: English
|
|
430
|
+
|
|
431
|
+
- name: image2musicsheet_hard
|
|
432
|
+
display_name: I2musicsheet (Hard)
|
|
433
|
+
description: The 1/3 hardest examples of the Image2musicsheet benchmark according to a simple heuristic counting the number of black pixels in the target image.
|
|
434
|
+
metric_groups:
|
|
435
|
+
- accuracy_simple
|
|
436
|
+
- compilation
|
|
437
|
+
- generation_image
|
|
438
|
+
- general_information
|
|
296
439
|
environment:
|
|
297
|
-
main_name:
|
|
440
|
+
main_name: earth_mover_similarity
|
|
298
441
|
main_split: valid
|
|
299
442
|
taxonomy:
|
|
300
443
|
task: image-to-text
|
|
301
444
|
what: music sheets
|
|
302
|
-
who:
|
|
445
|
+
who: dataset authors
|
|
303
446
|
when: "2024"
|
|
304
447
|
language: English
|