crfm-helm 0.5.2__py3-none-any.whl → 0.5.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crfm-helm might be problematic.
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/METADATA +81 -112
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/RECORD +165 -155
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
- helm/benchmark/adaptation/common_adapter_specs.py +2 -0
- helm/benchmark/annotation/anthropic_red_team_annotator.py +57 -0
- helm/benchmark/annotation/call_center_annotator.py +258 -0
- helm/benchmark/annotation/financebench_annotator.py +79 -0
- helm/benchmark/annotation/harm_bench_annotator.py +55 -0
- helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
- helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
- helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
- helm/benchmark/annotation/live_qa_annotator.py +37 -45
- helm/benchmark/annotation/medication_qa_annotator.py +36 -44
- helm/benchmark/annotation/model_as_judge.py +96 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +50 -0
- helm/benchmark/annotation/xstest_annotator.py +100 -0
- helm/benchmark/metrics/annotation_metrics.py +108 -0
- helm/benchmark/metrics/bhasa_metrics.py +188 -0
- helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
- helm/benchmark/metrics/code_metrics_helper.py +11 -1
- helm/benchmark/metrics/safety_metrics.py +79 -0
- helm/benchmark/metrics/summac/model_summac.py +3 -3
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +17 -3
- helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
- helm/benchmark/metrics/vision_language/image_utils.py +1 -1
- helm/benchmark/model_metadata_registry.py +3 -3
- helm/benchmark/presentation/create_plots.py +1 -1
- helm/benchmark/presentation/schema.py +3 -0
- helm/benchmark/presentation/summarize.py +106 -256
- helm/benchmark/presentation/test_run_entry.py +1 -0
- helm/benchmark/presentation/test_summarize.py +145 -3
- helm/benchmark/run.py +15 -0
- helm/benchmark/run_expander.py +83 -30
- helm/benchmark/run_specs/bhasa_run_specs.py +652 -0
- helm/benchmark/run_specs/call_center_run_specs.py +152 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +8 -8
- helm/benchmark/run_specs/experimental_run_specs.py +52 -0
- helm/benchmark/run_specs/finance_run_specs.py +82 -1
- helm/benchmark/run_specs/safety_run_specs.py +154 -0
- helm/benchmark/run_specs/vlm_run_specs.py +100 -24
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
- helm/benchmark/scenarios/banking77_scenario.py +51 -0
- helm/benchmark/scenarios/bhasa_scenario.py +1942 -0
- helm/benchmark/scenarios/call_center_scenario.py +84 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
- helm/benchmark/scenarios/ewok_scenario.py +116 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +2 -0
- helm/benchmark/scenarios/financebench_scenario.py +53 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
- helm/benchmark/scenarios/raft_scenario.py +1 -1
- helm/benchmark/scenarios/scenario.py +1 -1
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
- helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
- helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
- helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
- helm/benchmark/scenarios/test_math_scenario.py +2 -8
- helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
- helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
- helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +41 -12
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +6 -3
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
- helm/benchmark/scenarios/xstest_scenario.py +35 -0
- helm/benchmark/server.py +1 -6
- helm/benchmark/static/schema_air_bench.yaml +750 -750
- helm/benchmark/static/schema_bhasa.yaml +709 -0
- helm/benchmark/static/schema_call_center.yaml +232 -0
- helm/benchmark/static/schema_cleva.yaml +768 -0
- helm/benchmark/static/schema_decodingtrust.yaml +444 -0
- helm/benchmark/static/schema_ewok.yaml +367 -0
- helm/benchmark/static/schema_finance.yaml +55 -9
- helm/benchmark/static/{schema_image2structure.yaml → schema_image2struct.yaml} +231 -90
- helm/benchmark/static/schema_legal.yaml +566 -0
- helm/benchmark/static/schema_safety.yaml +266 -0
- helm/benchmark/static/schema_tables.yaml +149 -8
- helm/benchmark/static/schema_thai.yaml +21 -0
- helm/benchmark/static/schema_vhelm.yaml +137 -101
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
- helm/benchmark/static_build/assets/index-3ee38b3d.js +10 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/benchmark/window_services/test_openai_window_service.py +8 -8
- helm/benchmark/window_services/tokenizer_service.py +0 -5
- helm/clients/ai21_client.py +71 -1
- helm/clients/anthropic_client.py +7 -19
- helm/clients/huggingface_client.py +38 -37
- helm/clients/nvidia_nim_client.py +35 -0
- helm/clients/openai_client.py +18 -4
- helm/clients/palmyra_client.py +24 -0
- helm/clients/perspective_api_client.py +11 -6
- helm/clients/test_client.py +4 -6
- helm/clients/together_client.py +22 -0
- helm/clients/vision_language/open_flamingo_client.py +1 -2
- helm/clients/vision_language/palmyra_vision_client.py +28 -13
- helm/common/cache.py +8 -30
- helm/common/images_utils.py +6 -0
- helm/common/key_value_store.py +9 -9
- helm/common/mongo_key_value_store.py +5 -4
- helm/common/request.py +16 -0
- helm/common/test_cache.py +1 -48
- helm/common/tokenization_request.py +0 -9
- helm/config/model_deployments.yaml +444 -329
- helm/config/model_metadata.yaml +513 -111
- helm/config/tokenizer_configs.yaml +140 -11
- helm/proxy/example_queries.py +14 -21
- helm/proxy/server.py +0 -9
- helm/proxy/services/remote_service.py +0 -6
- helm/proxy/services/server_service.py +6 -20
- helm/proxy/services/service.py +0 -6
- helm/proxy/token_counters/test_auto_token_counter.py +2 -2
- helm/tokenizers/ai21_tokenizer.py +51 -59
- helm/tokenizers/cohere_tokenizer.py +0 -75
- helm/tokenizers/huggingface_tokenizer.py +0 -1
- helm/tokenizers/test_ai21_tokenizer.py +48 -0
- helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
- helm/benchmark/data_overlap/export_scenario_text.py +0 -119
- helm/benchmark/data_overlap/light_scenario.py +0 -60
- helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
- helm/benchmark/static/benchmarking.css +0 -156
- helm/benchmark/static/benchmarking.js +0 -1705
- helm/benchmark/static/config.js +0 -3
- helm/benchmark/static/general.js +0 -122
- helm/benchmark/static/images/crfm-logo.png +0 -0
- helm/benchmark/static/images/helm-logo-simple.png +0 -0
- helm/benchmark/static/images/helm-logo.png +0 -0
- helm/benchmark/static/images/language-model-helm.png +0 -0
- helm/benchmark/static/images/organizations/ai21.png +0 -0
- helm/benchmark/static/images/organizations/anthropic.png +0 -0
- helm/benchmark/static/images/organizations/bigscience.png +0 -0
- helm/benchmark/static/images/organizations/cohere.png +0 -0
- helm/benchmark/static/images/organizations/eleutherai.png +0 -0
- helm/benchmark/static/images/organizations/google.png +0 -0
- helm/benchmark/static/images/organizations/meta.png +0 -0
- helm/benchmark/static/images/organizations/microsoft.png +0 -0
- helm/benchmark/static/images/organizations/nvidia.png +0 -0
- helm/benchmark/static/images/organizations/openai.png +0 -0
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
- helm/benchmark/static/images/organizations/yandex.png +0 -0
- helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
- helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
- helm/benchmark/static/index.html +0 -68
- helm/benchmark/static/info-icon.png +0 -0
- helm/benchmark/static/json-urls.js +0 -69
- helm/benchmark/static/plot-captions.js +0 -27
- helm/benchmark/static/utils.js +0 -285
- helm/benchmark/static_build/assets/index-30dbceba.js +0 -10
- helm/benchmark/static_build/assets/index-66b02d40.css +0 -1
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/window_services/ai21_window_service.py +0 -247
- helm/benchmark/window_services/cohere_window_service.py +0 -101
- helm/benchmark/window_services/test_ai21_window_service.py +0 -163
- helm/benchmark/window_services/test_cohere_window_service.py +0 -75
- helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
- helm/benchmark/window_services/test_ice_window_service.py +0 -327
- helm/tokenizers/ice_tokenizer.py +0 -30
- helm/tokenizers/test_ice_tokenizer.py +0 -57
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/LICENSE +0 -0
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/top_level.txt +0 -0
- /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
- /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
- /helm/benchmark/{data_overlap → scenarios/vision_language/image2struct}/__init__.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct/webpage}/__init__.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0
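A per-file summary like the one above can be reproduced locally. The following is a minimal sketch, not the registry's actual tooling; it assumes both .whl files (which are zip archives) have already been downloaded into the working directory, and it skips binary members rather than comparing them.

import difflib
import zipfile

OLD = "crfm_helm-0.5.2-py3-none-any.whl"
NEW = "crfm_helm-0.5.4-py3-none-any.whl"

def read_lines(path):
    # Map each archive member to its text lines; binary members map to [] and
    # are therefore not compared line by line.
    members = {}
    with zipfile.ZipFile(path) as zf:
        for name in zf.namelist():
            data = zf.read(name)
            try:
                members[name] = data.decode("utf-8").splitlines()
            except UnicodeDecodeError:
                members[name] = []
    return members

old, new = read_lines(OLD), read_lines(NEW)
for name in sorted(set(old) | set(new)):
    added = removed = 0
    for line in difflib.unified_diff(old.get(name, []), new.get(name, []), lineterm=""):
        if line.startswith("+") and not line.startswith("+++"):
            added += 1
        elif line.startswith("-") and not line.startswith("---"):
            removed += 1
    # Print changed, added, and removed members; unchanged members are skipped.
    if name not in old or name not in new or added or removed:
        print(f"{name} +{added} -{removed}")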
helm/benchmark/static/schema_decodingtrust.yaml (new file)
@@ -0,0 +1,444 @@
---
############################################################
metrics:
  # Infrastructure metrics:
  - name: num_perplexity_tokens
    display_name: '# tokens'
    description: Average number of tokens in the predicted output (for language modeling, the input too).
  - name: num_bytes
    display_name: '# bytes'
    description: Average number of bytes in the predicted output (for language modeling, the input too).

  - name: num_references
    display_name: '# ref'
    description: Number of references.
  - name: num_train_trials
    display_name: '# trials'
    description: Number of trials, where in each trial we choose an independent, random set of training instances.
  - name: estimated_num_tokens_cost
    display_name: 'cost'
    description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
  - name: num_prompt_tokens
    display_name: '# prompt tokens'
    description: Number of tokens in the prompt.
  - name: num_prompt_characters
    display_name: '# prompt chars'
    description: Number of characters in the prompt.
  - name: num_completion_tokens
    display_name: '# completion tokens'
    description: Actual number of completion tokens (over all completions).
  - name: num_output_tokens
    display_name: '# output tokens'
    description: Actual number of output tokens.
  - name: max_num_output_tokens
    display_name: 'Max output tokens'
    description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
  - name: num_requests
    display_name: '# requests'
    description: Number of distinct API requests.
  - name: num_instances
    display_name: '# eval'
    description: Number of evaluation instances.
  - name: num_train_instances
    display_name: '# train'
    description: Number of training instances (e.g., in-context examples).
  - name: prompt_truncated
    display_name: truncated
    description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
  - name: finish_reason_length
    display_name: finish b/c length
    description: Fraction of instances where the output was terminated because of the max tokens limit.
  - name: finish_reason_stop
    display_name: finish b/c stop
    description: Fraction of instances where the output was terminated because of the stop sequences.
  - name: finish_reason_endoftext
    display_name: finish b/c endoftext
    description: Fraction of instances where the output was terminated because the end of text token was generated.
  - name: finish_reason_unknown
    display_name: finish b/c unknown
    description: Fraction of instances where the output was terminated for unknown reasons.
  - name: num_completions
    display_name: '# completions'
    description: Number of completions.
  - name: predicted_index
    display_name: Predicted index
    description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).

  # Accuracy metrics:
  - name: exact_match
    display_name: Exact match
    short_display_name: EM
    description: Fraction of instances that the predicted output matches a correct reference exactly.
    lower_is_better: false
  - name: quasi_exact_match
    display_name: Quasi-exact match
    short_display_name: EM
    description: Fraction of instances that the predicted output matches a correct reference up to light processing.
    lower_is_better: false
  - name: prefix_exact_match
    display_name: Prefix exact match
    short_display_name: PEM
    description: Fraction of instances that the predicted output matches the prefix of a correct reference exactly.
    lower_is_better: false
  - name: quasi_prefix_exact_match
    # TODO: should call this prefix_quasi_exact_match
    display_name: Prefix quasi-exact match
    short_display_name: PEM
    description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing.
    lower_is_better: false

  - name: rouge_1
    display_name: ROUGE-1
    description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 1-gram overlap.
    lower_is_better: false
  - name: rouge_2
    display_name: ROUGE-2
    description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.
    lower_is_better: false
  - name: rouge_l
    display_name: ROUGE-L
    description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on longest common subsequence overlap.
    lower_is_better: false
  - name: bleu_1
    display_name: BLEU-1
    description: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 1-gram overlap.
    lower_is_better: false
  - name: bleu_4
    display_name: BLEU-4
    description: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.
    lower_is_better: false
  - name: f1_set_match
    display_name: F1 (set match)
    short_display_name: F1
    description: Average F1 score in terms of set overlap between the model predicted set and correct reference set.
    lower_is_better: false
  - name: f1_score
    display_name: F1
    description: Average F1 score in terms of word overlap between the model output and correct reference.
    lower_is_better: false

  # Toxicity metrics
  - name: expected_max_toxicity
    display_name: Average max toxicity
    short_display_name: Avg. max toxicity
    lower_is_better: true
    description: Average max toxicity score across all completions (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
  - name: max_toxicity_probability
    display_name: max toxicity
    lower_is_better: true
    description: Fraction of instances where the most toxic completion is above 0.5 toxicity (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
  - name: toxic_frac
    display_name: Toxic fraction
    lower_is_better: true
    description: Fraction of model outputs that are toxic (based on the PerspectiveAPI toxicity classifier).

  # Efficiency metrics:
  - name: training_co2_cost
    display_name: Estimated training emissions (kg CO2)
    short_display_name: Training emissions (kg CO2)
    lower_is_better: true
    description: Estimate of the CO2 emissions from training the model.
  - name: training_energy_cost
    display_name: Estimated training energy cost (MWh)
    short_display_name: Training energy (MWh)
    lower_is_better: true
    description: Estimate of the amount of energy used to train the model.
  - name: inference_runtime
    display_name: Observed inference runtime (s)
    short_display_name: Observed inference time (s)
    lower_is_better: true
    description: Average observed time to process a request to the model (via an API, and thus dependent on the particular deployment).
  - name: inference_idealized_runtime
    display_name: Idealized inference runtime (s)
    short_display_name: Idealized inference time (s)
    lower_is_better: true
    description: Average time to process a request to the model based solely on the model architecture (using Megatron-LM).
  - name: inference_denoised_runtime
    display_name: Denoised inference runtime (s)
    short_display_name: Denoised inference time (s)
    lower_is_better: true
    description: Average time to process a request to the model, with performance contention removed by using profiled runtimes from multiple trials of SyntheticEfficiencyScenario.
  - name: batch_size
    display_name: Batch size
    description: For batch jobs, how many requests are in a batch.


  # DecodingTrust Fairness Metrics
  - name: equalized_odds_difference
    display_name: Equalized Odds Difference
    short_display_name: Equalized Odds
    description: The maximum of the absolute difference in true positive rates and the absolute difference in false positive rates between groups defined by the sensitive attribute.
    lower_is_better: true

  - name: demographic_parity_difference
    display_name: Demographic Parity Difference
    short_display_name: demographic parity
    description: The demographic parity difference measures the difference between the probability of positive predictions conditioned on sensitive attribute A = 1 and that conditioned on A = 0.
    lower_is_better: true


  # DecodingTrust OoD Metrics
  - name: ood_knowledge_rr
    display_name: OOD Knowledge Refusal Rate
    short_display_name: OoD-RR
    description: The rate at which the model refuses to answer an OoD question.
    lower_is_better: false

  - name: ood_knowledge_macc
    display_name: OOD Meaningful Answer Accuracy (Exclude Refusal)
    short_display_name: OoD-MACC
    description: MACC is the percentage of correct answers among the instances where the model generates a meaningful response, i.e., excluding refusals.
    lower_is_better: false


  # DecodingTrust Stereotype Bias Metrics
  - name: decodingtrust_stereotype_bias
    display_name: DecodingTrust Bias Score
    short_display_name: Bias Score
    description: 1 - agreementIndex (Section 4.2 of DecodingTrust)
    lower_is_better: false

  - name: decodingtrust_stereotype_bias_rejection_rate
    display_name: Stereotype Bias Rejection Rate
    short_display_name: Bias Rejection Rate
    description: Rejection rate of stereotype prompts
    lower_is_better: false


  # DecodingTrust Privacy Metrics
  - name: decodingtrust_privacy_leakage_rate
    display_name: Privacy Leakage Rate
    short_display_name: Privacy Leakage Rate
    description: TBD
    lower_is_better: true

############################################################
perturbations: []

############################################################
metric_groups:
  - name: accuracy
    display_name: Accuracy
    metrics:
      - name: ${main_name}
        split: ${main_split}

  - name: toxicity
    display_name: Toxicity
    metrics:
      - name: toxic_frac
        split: ${main_split}

  - name: efficiency
    display_name: Efficiency
    metrics:
      - name: inference_denoised_runtime
        split: ${main_split}

  - name: general_information
    display_name: General information
    metrics:
      - name: num_instances
        split: ${main_split}
      - name: num_train_instances
        split: ${main_split}
      - name: prompt_truncated
        split: ${main_split}
      - name: num_prompt_tokens
        split: ${main_split}
      - name: num_output_tokens
        split: ${main_split}
      - name: num_train_trials
        split: ${main_split}

  - name: decodingtrust_fairness_metrics
    display_name: Fairness
    metrics:
      - name: equalized_odds_difference
        split: ${main_split}
      - name: demographic_parity_difference
        split: ${main_split}

  - name: decodingtrust_ood_metrics
    display_name: OOD Accuracy
    metrics:
      - name: ood_knowledge_rr
        split: ${main_split}
      - name: ood_knowledge_macc
        split: ${main_split}

  - name: decodingtrust_stereotype_bias_metrics
    display_name: Stereotype Bias
    metrics:
      - name: decodingtrust_stereotype_bias
        split: ${main_split}
      - name: decodingtrust_stereotype_bias_rejection_rate
        split: ${main_split}

  - name: decodingtrust_privacy_metrics
    display_name: Privacy
    metrics:
      - name: decodingtrust_privacy_leakage_rate
        split: ${main_split}

############################################################
run_groups:

  - name: decodingtrust
    display_name: DecodingTrust
    description: A comprehensive benchmark of the trustworthiness of large language models [(Wang et al., 2023)](https://decodingtrust.github.io/)
    category: Trustworthiness
    subgroups:
      - adv_robustness
      - adv_demonstration
      - ood_robustness
      - fairness
      - privacy
      - machine_ethics
      - toxicity_prompts
      - stereotype_bias

  - name: adv_robustness
    display_name: DecodingTrust - AdvGLUE++
    short_display_name: AdvGLUE++
    description: Adversarial perturbations of the GLUE dataset generated against open-source LLMs including Alpaca, Vicuna, and Stable-Vicuna
    metric_groups:
      - accuracy
      - efficiency
      - general_information
    environment:
      main_name: quasi_exact_match
      main_split: valid
    taxonomy:
      task: "?"
      what: "?"
      who: "?"
      when: "?"
      language: English
    todo: true

  - name: adv_demonstration
    display_name: DecodingTrust - Adversarial Demonstrations
    short_display_name: AdvDemo
    description: Robustness analysis of LM generations when facing adversarial demonstrations
    metric_groups:
      - accuracy
      - efficiency
      - general_information
    environment:
      main_name: quasi_exact_match
      main_split: valid
    taxonomy:
      task: "?"
      what: "?"
      who: "?"
      when: "?"
      language: English

  - name: ood_robustness
    display_name: DecodingTrust - OoD Robustness
    short_display_name: OoD
    description: Style perturbations of GLUE datasets (OoD styles) and out-of-scope OoD knowledge evaluations
    metric_groups:
      - accuracy
      - efficiency
      - general_information
      - decodingtrust_ood_metrics
    environment:
      main_name: quasi_exact_match
      main_split: valid
    taxonomy:
      task: "?"
      what: "?"
      who: "?"
      when: "?"
      language: English

  - name: fairness
    display_name: DecodingTrust - Fairness
    short_display_name: Fairness
    description: Fairness analysis of LLMs
    metric_groups:
      - accuracy
      - decodingtrust_fairness_metrics
      - efficiency
      - general_information
    environment:
      main_name: quasi_exact_match
      main_split: valid
    taxonomy:
      task: "?"
      what: "?"
      who: "?"
      when: "?"
      language: English

  - name: privacy
    display_name: DecodingTrust - Privacy
    short_display_name: Privacy
    description: Evaluation of the privacy understanding and privacy-preserving properties of LLMs
    metric_groups:
      - decodingtrust_privacy_metrics
      - efficiency
      - general_information
    environment:
      main_name: quasi_exact_match
      main_split: test
    taxonomy:
      task: "?"
      what: "?"
      who: "?"
      when: "?"
      language: English

  - name: machine_ethics
    display_name: DecodingTrust - Ethics
    short_display_name: Ethics
    description: Evaluation of the understanding of ethical behaviors of LLMs
    metric_groups:
      - accuracy
      - efficiency
      - general_information
    environment:
      main_name: quasi_exact_match
      main_split: test
    taxonomy:
      task: "?"
      what: "?"
      who: "?"
      when: "?"
      language: English

  - name: toxicity_prompts
    display_name: DecodingTrust - Toxicity
    short_display_name: Toxicity
    description: Evaluation of the toxicity of LLM generations
    metric_groups:
      - toxicity
      - efficiency
      - general_information
    environment:
      main_split: valid
    taxonomy:
      task: "?"
      what: "?"
      who: "?"
      when: "?"
      language: English

  - name: stereotype_bias
    display_name: DecodingTrust - Stereotype Bias
    short_display_name: Stereotype
    description: Manually crafted stereotype user prompts from DecodingTrust
    metric_groups:
      - decodingtrust_stereotype_bias_metrics
      - efficiency
      - general_information
    environment:
      main_split: valid
    taxonomy:
      task: "?"
      what: "?"
      who: "?"
      when: "?"
      language: English