crfm-helm 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/METADATA +19 -5
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/RECORD +121 -76
- helm/benchmark/adaptation/adapter_spec.py +32 -31
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
- helm/benchmark/annotation/air_bench_annotator.py +64 -0
- helm/benchmark/annotation/annotator_factory.py +6 -0
- helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +1 -1
- helm/benchmark/annotation/live_qa_annotator.py +84 -0
- helm/benchmark/annotation/medication_qa_annotator.py +81 -0
- helm/benchmark/augmentations/perturbation.py +17 -1
- helm/benchmark/augmentations/test_perturbation.py +30 -0
- helm/benchmark/augmentations/translate_perturbation.py +1 -0
- helm/benchmark/huggingface_registration.py +16 -6
- helm/benchmark/metrics/air_bench_metrics.py +56 -0
- helm/benchmark/metrics/efficiency_metrics.py +9 -2
- helm/benchmark/metrics/evaluate_reference_metrics.py +16 -0
- helm/benchmark/metrics/fin_qa_metrics.py +60 -0
- helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
- helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
- helm/benchmark/metrics/live_qa_metrics.py +23 -0
- helm/benchmark/metrics/medication_qa_metrics.py +23 -0
- helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
- helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
- helm/benchmark/metrics/unitxt_metrics.py +20 -10
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +104 -21
- helm/benchmark/model_metadata_registry.py +5 -1
- helm/benchmark/presentation/schema.py +54 -4
- helm/benchmark/presentation/test_schema.py +11 -0
- helm/benchmark/run.py +16 -2
- helm/benchmark/run_expander.py +112 -63
- helm/benchmark/run_spec_factory.py +15 -10
- helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +15 -11
- helm/benchmark/run_specs/decodingtrust_run_specs.py +3 -1
- helm/benchmark/run_specs/experimental_run_specs.py +33 -0
- helm/benchmark/run_specs/finance_run_specs.py +33 -0
- helm/benchmark/run_specs/vlm_run_specs.py +444 -65
- helm/benchmark/scenarios/air_bench_scenario.py +50 -0
- helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
- helm/benchmark/scenarios/fin_qa_scenario.py +117 -0
- helm/benchmark/scenarios/legalbench_scenario.py +6 -2
- helm/benchmark/scenarios/math_scenario.py +1 -1
- helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +4 -2
- helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +13 -2
- helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +1 -5
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +1 -5
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +5 -3
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +247 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +4 -2
- helm/benchmark/static/schema_air_bench.yaml +3149 -0
- helm/benchmark/static/schema_classic.yaml +3 -59
- helm/benchmark/static/schema_finance.yaml +143 -0
- helm/benchmark/static/schema_image2structure.yaml +447 -0
- helm/benchmark/static/schema_instruction_following.yaml +3 -52
- helm/benchmark/static/schema_lite.yaml +3 -61
- helm/benchmark/static/schema_medical.yaml +255 -0
- helm/benchmark/static/schema_mmlu.yaml +3 -61
- helm/benchmark/static/schema_tables.yaml +200 -0
- helm/benchmark/static/schema_thai.yaml +223 -0
- helm/benchmark/static/schema_unitxt.yaml +3 -61
- helm/benchmark/static/schema_vhelm.yaml +824 -0
- helm/benchmark/static/schema_vhelm_lite.yaml +109 -0
- helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
- helm/benchmark/static_build/assets/index-30dbceba.js +10 -0
- helm/benchmark/static_build/assets/index-66b02d40.css +1 -0
- helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
- helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/clients/anthropic_client.py +78 -14
- helm/clients/auto_client.py +11 -0
- helm/clients/client.py +24 -7
- helm/clients/cohere_client.py +98 -3
- helm/clients/huggingface_client.py +71 -12
- helm/clients/openai_client.py +11 -5
- helm/clients/reka_client.py +189 -0
- helm/clients/test_client.py +3 -3
- helm/clients/test_huggingface_client.py +19 -3
- helm/clients/test_together_client.py +72 -2
- helm/clients/together_client.py +199 -2
- helm/clients/vertexai_client.py +117 -64
- helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
- helm/clients/vision_language/huggingface_vlm_client.py +12 -4
- helm/clients/vision_language/idefics_client.py +2 -2
- helm/clients/vision_language/paligemma_client.py +146 -0
- helm/clients/vision_language/palmyra_vision_client.py +84 -0
- helm/clients/yi_client.py +31 -0
- helm/common/critique_request.py +10 -1
- helm/common/images_utils.py +29 -3
- helm/config/model_deployments.yaml +504 -12
- helm/config/model_metadata.yaml +579 -52
- helm/config/tokenizer_configs.yaml +100 -1
- helm/proxy/critique/model_critique_client.py +32 -4
- helm/proxy/services/server_service.py +1 -1
- helm/tokenizers/auto_tokenizer.py +1 -1
- helm/tokenizers/cohere_tokenizer.py +44 -2
- helm/tokenizers/huggingface_tokenizer.py +36 -13
- helm/tokenizers/test_cohere_tokenizer.py +39 -0
- helm/tokenizers/test_huggingface_tokenizer.py +5 -1
- helm/benchmark/static/schema_vlm.yaml +0 -576
- helm/benchmark/static_build/assets/index-5088afcb.css +0 -1
- helm/benchmark/static_build/assets/index-d839df55.js +0 -9
- helm/benchmark/test_model_deployment_definition.py +0 -90
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/LICENSE +0 -0
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/top_level.txt +0 -0
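
A listing like the one above can be approximated locally by unpacking both wheels and diffing their text members. Below is a minimal sketch, assuming the two wheel files have already been fetched (e.g. with `pip download crfm-helm==0.5.0 --no-deps`, and likewise for 0.5.2); binary members such as the PNG assets are simply skipped:

```python
import difflib
import zipfile

OLD_WHEEL = "crfm_helm-0.5.0-py3-none-any.whl"
NEW_WHEEL = "crfm_helm-0.5.2-py3-none-any.whl"

def wheel_texts(path: str) -> dict[str, list[str]]:
    """Map each text member of the wheel to its lines; skip binary members."""
    members = {}
    with zipfile.ZipFile(path) as zf:
        for name in zf.namelist():
            try:
                members[name] = zf.read(name).decode("utf-8").splitlines(keepends=True)
            except UnicodeDecodeError:
                pass  # binary file (e.g. a .png asset)
    return members

old, new = wheel_texts(OLD_WHEEL), wheel_texts(NEW_WHEEL)
for name in sorted(set(old) | set(new)):
    a, b = old.get(name, []), new.get(name, [])
    if a == b:
        continue
    diff = list(difflib.unified_diff(a, b, fromfile=name, tofile=name))
    added = sum(1 for line in diff if line.startswith("+") and not line.startswith("+++"))
    removed = sum(1 for line in diff if line.startswith("-") and not line.startswith("---"))
    print(f"{name} +{added} -{removed}")
```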
helm/benchmark/static/schema_instruction_following.yaml
@@ -1,57 +1,8 @@
 ---
 ############################################################
-
-
-
-    description: The high-level strategy for converting instances into a prompt for the language model.
-    values:
-      - name: generation
-        description: Given the input, the model generates the output free-form.
-  - name: instructions
-    description: The description of the task that is included at the very beginning of the prompt.
-  - name: global_prefix
-    description: The string that is prepended to the prompt.
-  - name: instance_prefix
-    description: The string that is included before each instance (e.g., '\n\n').
-  - name: input_prefix
-    description: The string that is included before each input (e.g., 'Question:').
-  - name: input_suffix
-    description: The string that is included after each input (e.g., '\n').
-  - name: reference_prefix
-    description: The string that is included before each reference (for multiple-choice questions).
-  - name: reference_suffix
-    description: The string that is included after each reference (for multiple-choice questions).
-  - name: output_prefix
-    description: The string that is included before the correct answer/predicted output (e.g., 'Answer:').
-  - name: output_suffix
-    description: The string that is included after the correct answer/predicted output (e.g., '\n').
-  - name: substitutions
-    description: A list of regular expression substitutions (e.g., replacing '\n' with ';\n') to perform at the very end on the prompt.
-  - name: max_train_instances
-    description: Maximum number of training instances to include in the prompt (currently by randomly sampling).
-  - name: max_eval_instances
-    description: Maximum number of instances to evaluate on (over all splits - test, valid, etc.).
-  - name: num_outputs
-    description: Maximum number of possible outputs to generate by sampling multiple outputs.
-  - name: num_train_trials
-    description: Number of trials, where in each trial we choose an independent, random set of training instances. Used to compute variance.
-  - name: sample_train
-    description: If true, randomly sample N training examples; if false, select N consecutive training examples
-  - name: model
-    description: Name of the language model (<creator_organization>/<model name>) to send requests to.
-  - name: model_deployment
-    description: Name of the language model deployment (<host_organization>/<model name>) to send requests to.
-  - name: temperature
-    description: Temperature parameter used in generation.
-  - name: max_tokens
-    description: Maximum number of tokens to generate.
-  - name: stop_sequences
-    description: List of sequences, where we stop generation if we encounter any of them.
-  - name: random
-    description: Random seed (string), which guarantees reproducibility.
-  - name: multi_label
-    description: If true, for instances with multiple correct reference, the gold answer should be considered to be all of the correct references rather than any of the correct references.
-
+# For backwards compatibility with older versions of HELM.
+# TODO: Remove this after 2024-09-01.
+adapter: []
 ############################################################
 metrics:
   - name: Helpfulness
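
The block removed above documented the fields of HELM's `AdapterSpec` (the dataclass in `helm/benchmark/adaptation/adapter_spec.py`, which this release also touches), so the schema YAML no longer repeats it. As a rough illustration of how those fields are used in code — the values below are made up, and the exact constructor signature may differ between versions:

```python
from helm.benchmark.adaptation.adapter_spec import AdapterSpec

# Illustrative values only; field names follow the descriptions removed above.
spec = AdapterSpec(
    method="generation",        # free-form generation (see `values` above)
    instructions="Answer the question concisely.",
    input_prefix="Question: ",
    input_suffix="\n",
    output_prefix="Answer: ",
    output_suffix="\n",
    max_train_instances=5,      # in-context examples, randomly sampled when sample_train is true
    max_eval_instances=1000,
    num_outputs=1,
    temperature=0.0,
    max_tokens=100,
    stop_sequences=["\n"],
)
```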
@@ -1,66 +1,8 @@
 ---
 ############################################################
-
-
-
-    values:
-      - name: generation
-        description: Given the input, the model generates the output free-form.
-      - name: multiple_choice_joint
-        description: Given the input, the model selects from multiple-choice options (A., B., C., D., E.).
-      - name: multiple_choice_separate_original
-        description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability.
-      - name: multiple_choice_separate_calibrated
-        description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability when calibrated by answer choice probability.
-      - name: language_modeling
-        description: Given the input, the model assigns the sequence a probability.
-  - name: instructions
-    description: The description of the task that is included at the very beginning of the prompt.
-  - name: global_prefix
-    description: The string that is prepended to the prompt.
-  - name: global_suffix
-    description: The string that is appended to the prompt.
-  - name: instance_prefix
-    description: The string that is included before each instance (e.g., '\n\n').
-  - name: input_prefix
-    description: The string that is included before each input (e.g., 'Question:').
-  - name: input_suffix
-    description: The string that is included after each input (e.g., '\n').
-  - name: reference_prefix
-    description: The string that is included before each reference (for multiple-choice questions).
-  - name: reference_suffix
-    description: The string that is included after each reference (for multiple-choice questions).
-  - name: output_prefix
-    description: The string that is included before the correct answer/predicted output (e.g., 'Answer:').
-  - name: output_suffix
-    description: The string that is included after the correct answer/predicted output (e.g., '\n').
-  - name: substitutions
-    description: A list of regular expression substitutions (e.g., replacing '\n' with ';\n') to perform at the very end on the prompt.
-  - name: max_train_instances
-    description: Maximum number of training instances to include in the prompt (currently by randomly sampling).
-  - name: max_eval_instances
-    description: Maximum number of instances to evaluate on (over all splits - test, valid, etc.).
-  - name: num_outputs
-    description: Maximum number of possible outputs to generate by sampling multiple outputs.
-  - name: num_train_trials
-    description: Number of trials, where in each trial we choose an independent, random set of training instances. Used to compute variance.
-  - name: sample_train
-    description: If true, randomly sample N training examples; if false, select N consecutive training examples
-  - name: model
-    description: Name of the language model (<creator_organization>/<model name>) to send requests to.
-  - name: model_deployment
-    description: Name of the language model deployment (<host_organization>/<model name>) to send requests to.
-  - name: temperature
-    description: Temperature parameter used in generation.
-  - name: max_tokens
-    description: Maximum number of tokens to generate.
-  - name: stop_sequences
-    description: List of sequences, where we stop generation if we encounter any of them.
-  - name: random
-    description: Random seed (string), which guarantees reproducibility.
-  - name: multi_label
-    description: If true, for instances with multiple correct reference, the gold answer should be considered to be all of the correct references rather than any of the correct references.
-
+# For backwards compatibility with older versions of HELM.
+# TODO: Remove this in the future.
+adapter: []
 ############################################################
 metrics:
   # Infrastructure metrics:
helm/benchmark/static/schema_medical.yaml
@@ -0,0 +1,255 @@
+---
+############################################################
+metrics:
+  # Infrastructure metrics:
+  - name: num_perplexity_tokens
+    display_name: '# tokens'
+    description: Average number of tokens in the predicted output (for language modeling, the input too).
+  - name: num_bytes
+    display_name: '# bytes'
+    description: Average number of bytes in the predicted output (for language modeling, the input too).
+
+  - name: num_references
+    display_name: '# ref'
+    description: Number of references.
+  - name: num_train_trials
+    display_name: '# trials'
+    description: Number of trials, where in each trial we choose an independent, random set of training instances.
+  - name: estimated_num_tokens_cost
+    display_name: 'cost'
+    description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
+  - name: num_prompt_tokens
+    display_name: '# prompt tokens'
+    description: Number of tokens in the prompt.
+  - name: num_prompt_characters
+    display_name: '# prompt chars'
+    description: Number of characters in the prompt.
+  - name: num_completion_tokens
+    display_name: '# completion tokens'
+    description: Actual number of completion tokens (over all completions).
+  - name: num_output_tokens
+    display_name: '# output tokens'
+    description: Actual number of output tokens.
+  - name: max_num_output_tokens
+    display_name: 'Max output tokens'
+    description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
+  - name: num_requests
+    display_name: '# requests'
+    description: Number of distinct API requests.
+  - name: num_instances
+    display_name: '# eval'
+    description: Number of evaluation instances.
+  - name: num_train_instances
+    display_name: '# train'
+    description: Number of training instances (e.g., in-context examples).
+  - name: prompt_truncated
+    display_name: truncated
+    description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
+  - name: finish_reason_length
+    display_name: finish b/c length
+    description: Fraction of instances where the the output was terminated because of the max tokens limit.
+  - name: finish_reason_stop
+    display_name: finish b/c stop
+    description: Fraction of instances where the the output was terminated because of the stop sequences.
+  - name: finish_reason_endoftext
+    display_name: finish b/c endoftext
+    description: Fraction of instances where the the output was terminated because the end of text token was generated.
+  - name: finish_reason_unknown
+    display_name: finish b/c unknown
+    description: Fraction of instances where the the output was terminated for unknown reasons.
+  - name: num_completions
+    display_name: '# completions'
+    description: Number of completions.
+  - name: predicted_index
+    display_name: Predicted index
+    description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
+
+  # Accuracy metrics:
+  - name: exact_match
+    display_name: Exact match
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference exactly.
+    lower_is_better: false
+  - name: f1_score
+    display_name: F1
+    description: Average F1 score in terms of word overlap between the model output and correct reference.
+    lower_is_better: false
+  - name: live_qa_score
+    display_name: Judge Score
+    description: LLM-as-judge score
+    lower_is_better: false
+  - name: medication_qa_score
+    display_name: Judge Score
+    description: LLM-as-judge score
+    lower_is_better: false
+
+  # Toxicity metrics
+  - name: expected_max_toxicity
+    display_name: Average max toxicity
+    short_display_name: Avg. max toxicity
+    lower_is_better: true
+    description: Average max toxicity score across all completions (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
+  - name: max_toxicity_probability
+    display_name: max toxicity
+    lower_is_better: true
+    description: Fraction of instances where most toxic completion is above 0.5 toxicity (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
+  - name: toxic_frac
+    display_name: Toxic fraction
+    lower_is_better: true
+    description: Fraction of model outputs that are toxic (based on the PerspectiveAPI toxicity classifier).
+
+
+############################################################
+perturbations: []
+
+############################################################
+metric_groups:
+  - name: accuracy
+    display_name: Accuracy
+    metrics:
+      - name: ${main_name}
+        split: ${main_split}
+
+  - name: efficiency
+    display_name: Efficiency
+    metrics:
+      - name: inference_runtime
+        split: ${main_split}
+
+  - name: general_information
+    display_name: General information
+    hide_win_rates: true
+    metrics:
+      - name: num_instances
+        split: ${main_split}
+      - name: num_train_instances
+        split: ${main_split}
+      - name: prompt_truncated
+        split: ${main_split}
+      - name: num_prompt_tokens
+        split: ${main_split}
+      - name: num_output_tokens
+        split: ${main_split}
+
+  - name: toxicity
+    display_name: Toxicity
+    metrics:
+      - name: toxic_frac
+        split: ${main_split}
+
+############################################################
+run_groups:
+  - name: medical_scenarios
+    display_name: Medical Scenarios
+    description: Scenarios for the medical domain
+    category: All scenarios
+    subgroups:
+      - med_qa
+      - med_mcqa
+      - pubmed_qa
+      - mmlu
+      - live_qa
+      - medication_qa
+
+  - name: med_qa
+    display_name: MedQA
+    description: MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: question answering
+      what: n/a
+      who: n/a
+      when: n/a
+      language: English
+
+  - name: med_mcqa
+    display_name: MedMCQA
+    description: AIIMS/NEET QA multiple choice questions with 4 choices.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: valid
+    taxonomy:
+      task: question answering
+      what: n/a
+      who: n/a
+      when: n/a
+      language: English
+
+  - name: pubmed_qa
+    display_name: PubMedQA
+    description: biomedical literature Q + Context + A yes/no/maybe + long answer questions
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: question answering
+      what: n/a
+      who: n/a
+      when: n/a
+      language: English
+
+  - name: mmlu
+    display_name: MMLU (Massive Multitask Language Understanding)
+    short_display_name: MMLU
+    description: The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: question answering
+      what: "?"
+      who: "?"
+      when: "?"
+      language: English
+
+  - name: live_qa
+    display_name: LiveQA
+    description: Consumer health questions with librarian-generated reference answers.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: live_qa_score
+      main_split: test
+    taxonomy:
+      task: question answering
+      what: n/a
+      who: n/a
+      when: n/a
+      language: English
+
+  - name: medication_qa
+    display_name: MedicationQA
+    description: Consumer medication questions with reference answers.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: medication_qa_score
+      main_split: test
+    taxonomy:
+      task: question answering
+      what: n/a
+      who: n/a
+      when: n/a
+      language: English
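
In the `metric_groups` section above, `${main_name}` and `${main_split}` are placeholders that each run group fills in from its `environment` block, so the `accuracy` group resolves to `exact_match` on `test` for `med_qa` but to `live_qa_score` for `live_qa`. A minimal sketch of that substitution — illustrative only, not HELM's actual resolution code:

```python
from string import Template

metric_group = {
    "name": "accuracy",
    "metrics": [{"name": "${main_name}", "split": "${main_split}"}],
}
environment = {"main_name": "live_qa_score", "main_split": "test"}  # from the live_qa run group

# Substitute ${...} placeholders in every field of every metric reference.
resolved = [
    {key: Template(value).substitute(environment) for key, value in metric.items()}
    for metric in metric_group["metrics"]
]
print(resolved)  # [{'name': 'live_qa_score', 'split': 'test'}]
```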
@@ -1,66 +1,8 @@
 ---
 ############################################################
-
-
-
-    values:
-      - name: generation
-        description: Given the input, the model generates the output free-form.
-      - name: multiple_choice_joint
-        description: Given the input, the model selects from multiple-choice options (A., B., C., D., E.).
-      - name: multiple_choice_separate_original
-        description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability.
-      - name: multiple_choice_separate_calibrated
-        description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability when calibrated by answer choice probability.
-      - name: language_modeling
-        description: Given the input, the model assigns the sequence a probability.
-  - name: instructions
-    description: The description of the task that is included at the very beginning of the prompt.
-  - name: global_prefix
-    description: The string that is prepended to the prompt.
-  - name: global_suffix
-    description: The string that is appended to the prompt.
-  - name: instance_prefix
-    description: The string that is included before each instance (e.g., '\n\n').
-  - name: input_prefix
-    description: The string that is included before each input (e.g., 'Question:').
-  - name: input_suffix
-    description: The string that is included after each input (e.g., '\n').
-  - name: reference_prefix
-    description: The string that is included before each reference (for multiple-choice questions).
-  - name: reference_suffix
-    description: The string that is included after each reference (for multiple-choice questions).
-  - name: output_prefix
-    description: The string that is included before the correct answer/predicted output (e.g., 'Answer:').
-  - name: output_suffix
-    description: The string that is included after the correct answer/predicted output (e.g., '\n').
-  - name: substitutions
-    description: A list of regular expression substitutions (e.g., replacing '\n' with ';\n') to perform at the very end on the prompt.
-  - name: max_train_instances
-    description: Maximum number of training instances to include in the prompt (currently by randomly sampling).
-  - name: max_eval_instances
-    description: Maximum number of instances to evaluate on (over all splits - test, valid, etc.).
-  - name: num_outputs
-    description: Maximum number of possible outputs to generate by sampling multiple outputs.
-  - name: num_train_trials
-    description: Number of trials, where in each trial we choose an independent, random set of training instances. Used to compute variance.
-  - name: sample_train
-    description: If true, randomly sample N training examples; if false, select N consecutive training examples
-  - name: model
-    description: Name of the language model (<creator_organization>/<model name>) to send requests to.
-  - name: model_deployment
-    description: Name of the language model deployment (<host_organization>/<model name>) to send requests to.
-  - name: temperature
-    description: Temperature parameter used in generation.
-  - name: max_tokens
-    description: Maximum number of tokens to generate.
-  - name: stop_sequences
-    description: List of sequences, where we stop generation if we encounter any of them.
-  - name: random
-    description: Random seed (string), which guarantees reproducibility.
-  - name: multi_label
-    description: If true, for instances with multiple correct reference, the gold answer should be considered to be all of the correct references rather than any of the correct references.
-
+# For backwards compatibility with older versions of HELM.
+# TODO: Remove this after 2024-09-01.
+adapter: []
 ############################################################
 metrics:
   # Infrastructure metrics:
helm/benchmark/static/schema_tables.yaml
@@ -0,0 +1,200 @@
+---
+
+############################################################
+metrics:
+  # Infrastructure metrics:
+  - name: num_perplexity_tokens
+    display_name: '# tokens'
+    description: Average number of tokens in the predicted output (for language modeling, the input too).
+  - name: num_bytes
+    display_name: '# bytes'
+    description: Average number of bytes in the predicted output (for language modeling, the input too).
+  - name: num_references
+    display_name: '# ref'
+    description: Number of references.
+  - name: num_train_trials
+    display_name: '# trials'
+    description: Number of trials, where in each trial we choose an independent, random set of training instances.
+  - name: estimated_num_tokens_cost
+    display_name: 'cost'
+    description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
+  - name: num_prompt_tokens
+    display_name: '# prompt tokens'
+    description: Number of tokens in the prompt.
+  - name: num_prompt_characters
+    display_name: '# prompt chars'
+    description: Number of characters in the prompt.
+  - name: num_completion_tokens
+    display_name: '# completion tokens'
+    description: Actual number of completion tokens (over all completions).
+  - name: num_output_tokens
+    display_name: '# output tokens'
+    description: Actual number of output tokens.
+  - name: max_num_output_tokens
+    display_name: 'Max output tokens'
+    description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
+  - name: num_requests
+    display_name: '# requests'
+    description: Number of distinct API requests.
+  - name: num_instances
+    display_name: '# eval'
+    description: Number of evaluation instances.
+  - name: num_train_instances
+    display_name: '# train'
+    description: Number of training instances (e.g., in-context examples).
+  - name: prompt_truncated
+    display_name: truncated
+    description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
+  - name: finish_reason_length
+    display_name: finish b/c length
+    description: Fraction of instances where the the output was terminated because of the max tokens limit.
+  - name: finish_reason_stop
+    display_name: finish b/c stop
+    description: Fraction of instances where the the output was terminated because of the stop sequences.
+  - name: finish_reason_endoftext
+    display_name: finish b/c endoftext
+    description: Fraction of instances where the the output was terminated because the end of text token was generated.
+  - name: finish_reason_unknown
+    display_name: finish b/c unknown
+    description: Fraction of instances where the the output was terminated for unknown reasons.
+  - name: num_completions
+    display_name: '# completions'
+    description: Number of completions.
+  - name: predicted_index
+    display_name: Predicted index
+    description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
+
+  # Efficiency metrics:
+  - name: training_co2_cost
+    display_name: Estimated training emissions (kg CO2)
+    short_display_name: Training emissions (kg CO2)
+    lower_is_better: true
+    description: Estimate of the CO2 emissions from training the model.
+  - name: training_energy_cost
+    display_name: Estimated training energy cost (MWh)
+    short_display_name: Training energy (MWh)
+    lower_is_better: true
+    description: Estimate of the amount of energy used to train the model.
+  - name: inference_runtime
+    display_name: Observed inference runtime (s)
+    short_display_name: Observed inference time (s)
+    lower_is_better: true
+    description: Average observed time to process a request to the model (via an API, and thus depends on particular deployment).
+  - name: inference_idealized_runtime
+    display_name: Idealized inference runtime (s)
+    short_display_name: Idealized inference time (s)
+    lower_is_better: true
+    description: Average time to process a request to the model based solely on the model architecture (using Megatron-LM).
+  - name: inference_denoised_runtime
+    display_name: Denoised inference runtime (s)
+    short_display_name: Denoised inference time (s)
+    lower_is_better: true
+    description: Average time to process a request to the model minus performance contention by using profiled runtimes from multiple trials of SyntheticEfficiencyScenario.
+  - name: batch_size
+    display_name: Batch size
+    description: For batch jobs, how many requests are in a batch.
+
+  # Unitxt Metrics
+  - name: meteor
+    display_name: METEOR
+    short_display_name: METEOR
+    description: METEOR
+  - name: f1
+    display_name: F1
+    short_display_name: F1
+    description: F1
+  - name: precision
+    display_name: Precision
+    short_display_name: Precision
+    description: Precision
+  - name: recall
+    display_name: Recall
+    short_display_name: Recall
+    description: Recall
+  - name: rouge1
+    display_name: ROUGE-1
+    short_display_name: ROUGE-1
+    description: ROUGE-1
+  - name: rouge2
+    display_name: ROUGE-2
+    short_display_name: ROUGE-2
+    description: ROUGE-2
+  - name: rougeL
+    display_name: ROUGE-L
+    short_display_name: ROUGE-L
+    description: ROUGE-L
+  - name: rougeLsum
+    display_name: ROUGE-Lsum
+    short_display_name: ROUGE-Lsum
+    description: ROUGE-Lsum
+  - name: bleu
+    display_name: BLEU
+    short_display_name: BLEU
+    description: BLEU
+
+perturbations: []
+
+metric_groups:
+  - name: accuracy
+    display_name: Accuracy
+    hide_win_rates: true
+    metrics:
+      - name: meteor
+        split: __all__
+      - name: rouge1
+        split: __all__
+      - name: rouge2
+        split: __all__
+      - name: rougeL
+        split: __all__
+      - name: rougeLsum
+        split: __all__
+      - name: bleu
+        split: __all__
+
+  - name: efficiency
+    display_name: Efficiency
+    metrics:
+      - name: inference_runtime
+        split: ${main_split}
+
+  - name: general_information
+    display_name: General information
+    hide_win_rates: true
+    metrics:
+      - name: num_instances
+        split: ${main_split}
+      - name: num_train_instances
+        split: ${main_split}
+      - name: prompt_truncated
+        split: ${main_split}
+      - name: num_prompt_tokens
+        split: ${main_split}
+      - name: num_output_tokens
+        split: ${main_split}
+
+run_groups:
+  - name: table_scenarios
+    display_name: Table Scenarios
+    description: Table Scenarios
+    category: All Scenarios
+    subgroups:
+      - unitxt_cards.numeric_nlg
+
+  - name: unitxt_cards.numeric_nlg
+    display_name: NumericNLG
+    short_display_name: NumericNLG
+    description: "NumericNLG is a dataset for numerical table-to-text generation using pairs of a table and a paragraph of a table description with richer inference from scientific papers."
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: meteor
+      main_split: test
+    taxonomy:
+      task: "?"
+      what: "?"
+      who: "?"
+      when: "?"
+      language: English
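
Since several of these schema files are new in this release, a quick consistency check can catch dangling references before a run. A sketch using PyYAML — the relative path assumes an unpacked 0.5.2 wheel or a repository checkout, and this is not HELM's own validation logic (see `helm/benchmark/presentation/schema.py` above for that):

```python
import yaml  # pip install pyyaml

with open("helm/benchmark/static/schema_tables.yaml") as f:
    schema = yaml.safe_load(f)

# Every metric referenced by a metric group should be declared under `metrics`.
declared_metrics = {m["name"] for m in schema["metrics"]}
for group in schema["metric_groups"]:
    for metric in group["metrics"]:
        name = metric["name"]
        # Placeholders like ${main_name} are resolved per run group, so skip them here.
        if not name.startswith("${") and name not in declared_metrics:
            print(f"metric_group {group['name']!r} references undeclared metric {name!r}")

# Every subgroup listed by a run group should itself be a run group.
group_names = {g["name"] for g in schema["run_groups"]}
for group in schema["run_groups"]:
    for subgroup in group.get("subgroups", []):
        if subgroup not in group_names:
            print(f"run_group {group['name']!r} lists unknown subgroup {subgroup!r}")
```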