crfm-helm 0.5.2__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (184)
  1. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/METADATA +29 -55
  2. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/RECORD +146 -134
  3. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
  5. helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
  6. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
  7. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
  8. helm/benchmark/adaptation/common_adapter_specs.py +2 -0
  9. helm/benchmark/annotation/anthropic_red_team_annotator.py +70 -0
  10. helm/benchmark/annotation/call_center_annotator.py +247 -0
  11. helm/benchmark/annotation/financebench_annotator.py +79 -0
  12. helm/benchmark/annotation/harm_bench_annotator.py +68 -0
  13. helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
  14. helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
  15. helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
  16. helm/benchmark/annotation/live_qa_annotator.py +32 -45
  17. helm/benchmark/annotation/medication_qa_annotator.py +31 -44
  18. helm/benchmark/annotation/model_as_judge.py +45 -0
  19. helm/benchmark/annotation/simple_safety_tests_annotator.py +64 -0
  20. helm/benchmark/annotation/xstest_annotator.py +110 -0
  21. helm/benchmark/metrics/annotation_metrics.py +108 -0
  22. helm/benchmark/metrics/bhasa_metrics.py +188 -0
  23. helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
  24. helm/benchmark/metrics/code_metrics_helper.py +11 -1
  25. helm/benchmark/metrics/safety_metrics.py +57 -0
  26. helm/benchmark/metrics/summac/model_summac.py +3 -3
  27. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
  28. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
  29. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  30. helm/benchmark/metrics/vision_language/image_utils.py +1 -1
  31. helm/benchmark/model_metadata_registry.py +3 -3
  32. helm/benchmark/presentation/test_run_entry.py +1 -0
  33. helm/benchmark/run.py +15 -0
  34. helm/benchmark/run_expander.py +56 -30
  35. helm/benchmark/run_specs/bhasa_run_specs.py +638 -0
  36. helm/benchmark/run_specs/call_center_run_specs.py +152 -0
  37. helm/benchmark/run_specs/decodingtrust_run_specs.py +8 -8
  38. helm/benchmark/run_specs/experimental_run_specs.py +52 -0
  39. helm/benchmark/run_specs/finance_run_specs.py +78 -1
  40. helm/benchmark/run_specs/safety_run_specs.py +154 -0
  41. helm/benchmark/run_specs/vlm_run_specs.py +92 -21
  42. helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
  43. helm/benchmark/scenarios/banking77_scenario.py +51 -0
  44. helm/benchmark/scenarios/bhasa_scenario.py +1798 -0
  45. helm/benchmark/scenarios/call_center_scenario.py +84 -0
  46. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
  47. helm/benchmark/scenarios/ewok_scenario.py +116 -0
  48. helm/benchmark/scenarios/fin_qa_scenario.py +2 -0
  49. helm/benchmark/scenarios/financebench_scenario.py +53 -0
  50. helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
  51. helm/benchmark/scenarios/scenario.py +1 -1
  52. helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
  53. helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
  54. helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
  55. helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
  56. helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
  57. helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
  58. helm/benchmark/scenarios/test_math_scenario.py +2 -8
  59. helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
  60. helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
  61. helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
  62. helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
  63. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
  64. helm/benchmark/scenarios/vision_language/bingo_scenario.py +2 -2
  65. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
  66. helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
  67. helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
  68. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
  69. helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
  70. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
  71. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
  72. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -3
  73. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -1
  74. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
  75. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
  76. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
  77. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +41 -12
  78. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
  79. helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
  80. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
  81. helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
  82. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
  83. helm/benchmark/scenarios/vision_language/pairs_scenario.py +1 -1
  84. helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
  85. helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
  86. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
  87. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +2 -2
  88. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +6 -3
  89. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
  90. helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
  91. helm/benchmark/scenarios/xstest_scenario.py +35 -0
  92. helm/benchmark/server.py +1 -6
  93. helm/benchmark/static/schema_air_bench.yaml +750 -750
  94. helm/benchmark/static/schema_bhasa.yaml +709 -0
  95. helm/benchmark/static/schema_call_center.yaml +232 -0
  96. helm/benchmark/static/schema_cleva.yaml +768 -0
  97. helm/benchmark/static/schema_decodingtrust.yaml +444 -0
  98. helm/benchmark/static/schema_ewok.yaml +367 -0
  99. helm/benchmark/static/schema_finance.yaml +55 -9
  100. helm/benchmark/static/{schema_image2structure.yaml → schema_image2struct.yaml} +231 -90
  101. helm/benchmark/static/schema_safety.yaml +247 -0
  102. helm/benchmark/static/schema_tables.yaml +124 -7
  103. helm/benchmark/static/schema_thai.yaml +21 -0
  104. helm/benchmark/static/schema_vhelm.yaml +96 -91
  105. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  106. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  107. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  108. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  109. helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
  110. helm/benchmark/static_build/assets/index-58f97dcd.js +10 -0
  111. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  112. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  113. helm/benchmark/static_build/index.html +2 -2
  114. helm/benchmark/window_services/test_openai_window_service.py +8 -8
  115. helm/clients/ai21_client.py +71 -1
  116. helm/clients/anthropic_client.py +7 -19
  117. helm/clients/huggingface_client.py +38 -37
  118. helm/clients/nvidia_nim_client.py +35 -0
  119. helm/clients/openai_client.py +2 -3
  120. helm/clients/palmyra_client.py +25 -0
  121. helm/clients/perspective_api_client.py +11 -6
  122. helm/clients/test_client.py +4 -6
  123. helm/clients/vision_language/open_flamingo_client.py +1 -2
  124. helm/clients/vision_language/palmyra_vision_client.py +28 -13
  125. helm/common/images_utils.py +6 -0
  126. helm/common/mongo_key_value_store.py +2 -1
  127. helm/common/request.py +16 -0
  128. helm/config/model_deployments.yaml +315 -332
  129. helm/config/model_metadata.yaml +384 -110
  130. helm/config/tokenizer_configs.yaml +116 -11
  131. helm/proxy/example_queries.py +14 -21
  132. helm/proxy/services/server_service.py +1 -2
  133. helm/proxy/token_counters/test_auto_token_counter.py +2 -2
  134. helm/tokenizers/ai21_tokenizer.py +51 -59
  135. helm/tokenizers/cohere_tokenizer.py +0 -75
  136. helm/tokenizers/huggingface_tokenizer.py +0 -1
  137. helm/tokenizers/test_ai21_tokenizer.py +48 -0
  138. helm/benchmark/static/benchmarking.css +0 -156
  139. helm/benchmark/static/benchmarking.js +0 -1705
  140. helm/benchmark/static/config.js +0 -3
  141. helm/benchmark/static/general.js +0 -122
  142. helm/benchmark/static/images/crfm-logo.png +0 -0
  143. helm/benchmark/static/images/helm-logo-simple.png +0 -0
  144. helm/benchmark/static/images/helm-logo.png +0 -0
  145. helm/benchmark/static/images/language-model-helm.png +0 -0
  146. helm/benchmark/static/images/organizations/ai21.png +0 -0
  147. helm/benchmark/static/images/organizations/anthropic.png +0 -0
  148. helm/benchmark/static/images/organizations/bigscience.png +0 -0
  149. helm/benchmark/static/images/organizations/cohere.png +0 -0
  150. helm/benchmark/static/images/organizations/eleutherai.png +0 -0
  151. helm/benchmark/static/images/organizations/google.png +0 -0
  152. helm/benchmark/static/images/organizations/meta.png +0 -0
  153. helm/benchmark/static/images/organizations/microsoft.png +0 -0
  154. helm/benchmark/static/images/organizations/nvidia.png +0 -0
  155. helm/benchmark/static/images/organizations/openai.png +0 -0
  156. helm/benchmark/static/images/organizations/together.png +0 -0
  157. helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
  158. helm/benchmark/static/images/organizations/yandex.png +0 -0
  159. helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
  160. helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
  161. helm/benchmark/static/index.html +0 -68
  162. helm/benchmark/static/info-icon.png +0 -0
  163. helm/benchmark/static/json-urls.js +0 -69
  164. helm/benchmark/static/plot-captions.js +0 -27
  165. helm/benchmark/static/utils.js +0 -285
  166. helm/benchmark/static_build/assets/index-30dbceba.js +0 -10
  167. helm/benchmark/static_build/assets/index-66b02d40.css +0 -1
  168. helm/benchmark/window_services/ai21_window_service.py +0 -247
  169. helm/benchmark/window_services/cohere_window_service.py +0 -101
  170. helm/benchmark/window_services/test_ai21_window_service.py +0 -163
  171. helm/benchmark/window_services/test_cohere_window_service.py +0 -75
  172. helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
  173. helm/benchmark/window_services/test_ice_window_service.py +0 -327
  174. helm/tokenizers/ice_tokenizer.py +0 -30
  175. helm/tokenizers/test_ice_tokenizer.py +0 -57
  176. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/LICENSE +0 -0
  177. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/entry_points.txt +0 -0
  178. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/top_level.txt +0 -0
  179. /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
  180. /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
  181. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/__init__.py +0 -0
  182. /helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +0 -0
  183. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/__init__.py +0 -0
  184. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0
@@ -0,0 +1,444 @@
+ ---
+ ############################################################
+ metrics:
+   # Infrastructure metrics:
+   - name: num_perplexity_tokens
+     display_name: '# tokens'
+     description: Average number of tokens in the predicted output (for language modeling, the input too).
+   - name: num_bytes
+     display_name: '# bytes'
+     description: Average number of bytes in the predicted output (for language modeling, the input too).
+
+   - name: num_references
+     display_name: '# ref'
+     description: Number of references.
+   - name: num_train_trials
+     display_name: '# trials'
+     description: Number of trials, where in each trial we choose an independent, random set of training instances.
+   - name: estimated_num_tokens_cost
+     display_name: 'cost'
+     description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
+   - name: num_prompt_tokens
+     display_name: '# prompt tokens'
+     description: Number of tokens in the prompt.
+   - name: num_prompt_characters
+     display_name: '# prompt chars'
+     description: Number of characters in the prompt.
+   - name: num_completion_tokens
+     display_name: '# completion tokens'
+     description: Actual number of completion tokens (over all completions).
+   - name: num_output_tokens
+     display_name: '# output tokens'
+     description: Actual number of output tokens.
+   - name: max_num_output_tokens
+     display_name: 'Max output tokens'
+     description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
+   - name: num_requests
+     display_name: '# requests'
+     description: Number of distinct API requests.
+   - name: num_instances
+     display_name: '# eval'
+     description: Number of evaluation instances.
+   - name: num_train_instances
+     display_name: '# train'
+     description: Number of training instances (e.g., in-context examples).
+   - name: prompt_truncated
+     display_name: truncated
+     description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
+   - name: finish_reason_length
+     display_name: finish b/c length
+     description: Fraction of instances where the output was terminated because of the max tokens limit.
+   - name: finish_reason_stop
+     display_name: finish b/c stop
+     description: Fraction of instances where the output was terminated because of the stop sequences.
+   - name: finish_reason_endoftext
+     display_name: finish b/c endoftext
+     description: Fraction of instances where the output was terminated because the end of text token was generated.
+   - name: finish_reason_unknown
+     display_name: finish b/c unknown
+     description: Fraction of instances where the output was terminated for unknown reasons.
+   - name: num_completions
+     display_name: '# completions'
+     description: Number of completions.
+   - name: predicted_index
+     display_name: Predicted index
+     description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
+
+   # Accuracy metrics:
+   - name: exact_match
+     display_name: Exact match
+     short_display_name: EM
+     description: Fraction of instances that the predicted output matches a correct reference exactly.
+     lower_is_better: false
+   - name: quasi_exact_match
+     display_name: Quasi-exact match
+     short_display_name: EM
+     description: Fraction of instances that the predicted output matches a correct reference up to light processing.
+     lower_is_better: false
+   - name: prefix_exact_match
+     display_name: Prefix exact match
+     short_display_name: PEM
+     description: Fraction of instances that the predicted output matches the prefix of a correct reference exactly.
+     lower_is_better: false
+   - name: quasi_prefix_exact_match
+     # TODO: should call this prefix_quasi_exact_match
+     display_name: Prefix quasi-exact match
+     short_display_name: PEM
+     description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing.
+     lower_is_better: false
+
+   - name: rouge_1
+     display_name: ROUGE-1
+     description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 1-gram overlap.
+     lower_is_better: false
+   - name: rouge_2
+     display_name: ROUGE-2
+     description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.
+     lower_is_better: false
+   - name: rouge_l
+     display_name: ROUGE-L
+     description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on longest common subsequence overlap.
+     lower_is_better: false
+   - name: bleu_1
+     display_name: BLEU-1
+     description: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 1-gram overlap.
+     lower_is_better: false
+   - name: bleu_4
+     display_name: BLEU-4
+     description: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.
+     lower_is_better: false
+   - name: f1_set_match
+     display_name: F1 (set match)
+     short_display_name: F1
+     description: Average F1 score in terms of set overlap between the model predicted set and the correct reference set.
+     lower_is_better: false
+   - name: f1_score
+     display_name: F1
+     description: Average F1 score in terms of word overlap between the model output and the correct reference.
+     lower_is_better: false
+
+   # Toxicity metrics
+   - name: expected_max_toxicity
+     display_name: Average max toxicity
+     short_display_name: Avg. max toxicity
+     lower_is_better: true
+     description: Average max toxicity score across all completions (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
+   - name: max_toxicity_probability
+     display_name: max toxicity
+     lower_is_better: true
+     description: Fraction of instances where the most toxic completion is above 0.5 toxicity (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
+   - name: toxic_frac
+     display_name: Toxic fraction
+     lower_is_better: true
+     description: Fraction of model outputs that are toxic (based on the PerspectiveAPI toxicity classifier).
+
+   # Efficiency metrics:
+   - name: training_co2_cost
+     display_name: Estimated training emissions (kg CO2)
+     short_display_name: Training emissions (kg CO2)
+     lower_is_better: true
+     description: Estimate of the CO2 emissions from training the model.
+   - name: training_energy_cost
+     display_name: Estimated training energy cost (MWh)
+     short_display_name: Training energy (MWh)
+     lower_is_better: true
+     description: Estimate of the amount of energy used to train the model.
+   - name: inference_runtime
+     display_name: Observed inference runtime (s)
+     short_display_name: Observed inference time (s)
+     lower_is_better: true
+     description: Average observed time to process a request to the model (via an API, and thus depends on the particular deployment).
+   - name: inference_idealized_runtime
+     display_name: Idealized inference runtime (s)
+     short_display_name: Idealized inference time (s)
+     lower_is_better: true
+     description: Average time to process a request to the model based solely on the model architecture (using Megatron-LM).
+   - name: inference_denoised_runtime
+     display_name: Denoised inference runtime (s)
+     short_display_name: Denoised inference time (s)
+     lower_is_better: true
+     description: Average time to process a request to the model, with performance contention removed by using profiled runtimes from multiple trials of SyntheticEfficiencyScenario.
+   - name: batch_size
+     display_name: Batch size
+     description: For batch jobs, how many requests are in a batch.
+
+
+   # DecodingTrust Fairness Metrics
+   - name: equalized_odds_difference
+     display_name: Equalized Odds Difference
+     short_display_name: Equalized Odds
+     description: The maximum of the difference in true positive rate and the difference in false positive rate between groups defined by the sensitive attribute.
+     lower_is_better: true
+
+   - name: demographic_parity_difference
+     display_name: Demographic Parity Difference
+     short_display_name: Demographic parity
+     description: The difference between the probability of positive predictions conditioned on sensitive attribute A = 1 and that conditioned on A = 0.
+     lower_is_better: true
+
+
+   # DecodingTrust OoD Metrics
+   - name: ood_knowledge_rr
+     display_name: OOD Knowledge Refusal Rate
+     short_display_name: OoD-RR
+     description: The rate at which the model refuses to answer an OoD question.
+     lower_is_better: false
+
+   - name: ood_knowledge_macc
+     display_name: OOD Meaningful Answer Accuracy (Excluding Refusals)
+     short_display_name: OoD-MACC
+     description: The percentage of correct answers among the meaningful (non-refusal) responses generated by the model.
+     lower_is_better: false
+
+
+   # DecodingTrust Stereotype Bias Metrics
+   - name: decodingtrust_stereotype_bias
+     display_name: DecodingTrust Bias Score
+     short_display_name: Bias Score
+     description: 1 - agreementIndex (Section 4.2 of DecodingTrust)
+     lower_is_better: false
+
+   - name: decodingtrust_stereotype_bias_rejection_rate
+     display_name: Stereotype Bias Rejection Rate
+     short_display_name: Bias Rejection Rate
+     description: Rejection rate of stereotype prompts
+     lower_is_better: false
+
+
+   # DecodingTrust Privacy Metrics
+   - name: decodingtrust_privacy_leakage_rate
+     display_name: Privacy Leakage Rate
+     short_display_name: Privacy Leakage Rate
+     description: TBD
+     lower_is_better: true
+
+ ############################################################
+ perturbations: []
+
+ ############################################################
+ metric_groups:
+   - name: accuracy
+     display_name: Accuracy
+     metrics:
+       - name: ${main_name}
+         split: ${main_split}
+
+   - name: toxicity
+     display_name: Toxicity
+     metrics:
+       - name: toxic_frac
+         split: ${main_split}
+
+   - name: efficiency
+     display_name: Efficiency
+     metrics:
+       - name: inference_denoised_runtime
+         split: ${main_split}
+
+   - name: general_information
+     display_name: General information
+     metrics:
+       - name: num_instances
+         split: ${main_split}
+       - name: num_train_instances
+         split: ${main_split}
+       - name: prompt_truncated
+         split: ${main_split}
+       - name: num_prompt_tokens
+         split: ${main_split}
+       - name: num_output_tokens
+         split: ${main_split}
+       - name: num_train_trials
+         split: ${main_split}
+
+   - name: decodingtrust_fairness_metrics
+     display_name: Fairness
+     metrics:
+       - name: equalized_odds_difference
+         split: ${main_split}
+       - name: demographic_parity_difference
+         split: ${main_split}
+
+   - name: decodingtrust_ood_metrics
+     display_name: OOD Accuracy
+     metrics:
+       - name: ood_knowledge_rr
+         split: ${main_split}
+       - name: ood_knowledge_macc
+         split: ${main_split}
+
+   - name: decodingtrust_stereotype_bias_metrics
+     display_name: Stereotype Bias
+     metrics:
+       - name: decodingtrust_stereotype_bias
+         split: ${main_split}
+       - name: decodingtrust_stereotype_bias_rejection_rate
+         split: ${main_split}
+
+   - name: decodingtrust_privacy_metrics
+     display_name: Privacy
+     metrics:
+       - name: decodingtrust_privacy_leakage_rate
+         split: ${main_split}
+
+ ############################################################
+ run_groups:
+
+   - name: decodingtrust
+     display_name: DecodingTrust
+     description: A comprehensive benchmark of the trustworthiness of large language models [(Wang et al., 2023)](https://decodingtrust.github.io/)
+     category: Trustworthiness
+     subgroups:
+       - adv_robustness
+       - adv_demonstration
+       - ood_robustness
+       - fairness
+       - privacy
+       - machine_ethics
+       - toxicity_prompts
+       - stereotype_bias
+
+   - name: adv_robustness
+     display_name: DecodingTrust - AdvGLUE++
+     short_display_name: AdvGLUE++
+     description: Adversarial perturbations of the GLUE dataset generated against open-source LLMs including Alpaca, Vicuna, and Stable-Vicuna
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: quasi_exact_match
+       main_split: valid
+     taxonomy:
+       task: "?"
+       what: "?"
+       who: "?"
+       when: "?"
+       language: English
+     todo: true
+
+   - name: adv_demonstration
+     display_name: DecodingTrust - Adversarial Demonstrations
+     short_display_name: AdvDemo
+     description: Robustness analysis of LM generations when facing adversarial demonstrations
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: quasi_exact_match
+       main_split: valid
+     taxonomy:
+       task: "?"
+       what: "?"
+       who: "?"
+       when: "?"
+       language: English
+
+   - name: ood_robustness
+     display_name: DecodingTrust - OoD Robustness
+     short_display_name: OoD
+     description: Style perturbations of GLUE datasets (OoD styles) and out-of-scope OoD knowledge evaluations
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+       - decodingtrust_ood_metrics
+     environment:
+       main_name: quasi_exact_match
+       main_split: valid
+     taxonomy:
+       task: "?"
+       what: "?"
+       who: "?"
+       when: "?"
+       language: English
+
+   - name: fairness
+     display_name: DecodingTrust - Fairness
+     short_display_name: Fairness
+     description: Fairness analysis of LLMs
+     metric_groups:
+       - accuracy
+       - decodingtrust_fairness_metrics
+       - efficiency
+       - general_information
+     environment:
+       main_name: quasi_exact_match
+       main_split: valid
+     taxonomy:
+       task: "?"
+       what: "?"
+       who: "?"
+       when: "?"
+       language: English
+
+   - name: privacy
+     display_name: DecodingTrust - Privacy
+     short_display_name: Privacy
+     description: Evaluation of the privacy understanding and privacy-preserving properties of LLMs
+     metric_groups:
+       - decodingtrust_privacy_metrics
+       - efficiency
+       - general_information
+     environment:
+       main_name: quasi_exact_match
+       main_split: test
+     taxonomy:
+       task: "?"
+       what: "?"
+       who: "?"
+       when: "?"
+       language: English
+
+   - name: machine_ethics
+     display_name: DecodingTrust - Ethics
+     short_display_name: Ethics
+     description: Evaluation of the understanding of ethical behaviors of LLMs
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: quasi_exact_match
+       main_split: test
+     taxonomy:
+       task: "?"
+       what: "?"
+       who: "?"
+       when: "?"
+       language: English
+
+   - name: toxicity_prompts
+     display_name: DecodingTrust - Toxicity
+     short_display_name: Toxicity
+     description: Evaluation of the toxicity of LM generations under challenging user prompts
+     metric_groups:
+       - toxicity
+       - efficiency
+       - general_information
+     environment:
+       main_split: valid
+     taxonomy:
+       task: "?"
+       what: "?"
+       who: "?"
+       when: "?"
+       language: English
+
+   - name: stereotype_bias
+     display_name: DecodingTrust - Stereotype Bias
+     short_display_name: Stereotype
+     description: Manually crafted stereotype user prompts from DecodingTrust
+     metric_groups:
+       - decodingtrust_stereotype_bias_metrics
+       - efficiency
+       - general_information
+     environment:
+       main_split: valid
+     taxonomy:
+       task: "?"
+       what: "?"
+       who: "?"
+       when: "?"
+       language: English
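
The metric_groups section of this schema is a template: each group's ${main_name} and ${main_split} placeholders are filled in per run group from that group's environment block, so adv_robustness reports quasi_exact_match on the valid split while privacy uses the test split. Below is a minimal sketch of that substitution, assuming simple string templating (the schema excerpt is inlined rather than loaded from the YAML file, and resolve is an illustrative helper, not HELM's API):

# Minimal sketch (assumed behavior, not HELM's actual implementation) of how
# the ${main_name}/${main_split} placeholders in metric_groups are resolved
# against each run group's environment block.
from string import Template

metric_groups = {
    # Excerpt of the "accuracy" group from the schema above.
    "accuracy": [{"name": "${main_name}", "split": "${main_split}"}],
}

run_group_environments = {
    # Excerpted from the run_groups section above.
    "adv_robustness": {"main_name": "quasi_exact_match", "main_split": "valid"},
    "privacy": {"main_name": "quasi_exact_match", "main_split": "test"},
}

def resolve(metric_group: str, run_group: str) -> list:
    """Substitute a run group's environment into a metric group template."""
    env = run_group_environments[run_group]
    return [
        {key: Template(value).substitute(env) for key, value in metric.items()}
        for metric in metric_groups[metric_group]
    ]

print(resolve("accuracy", "adv_robustness"))
# [{'name': 'quasi_exact_match', 'split': 'valid'}]
print(resolve("accuracy", "privacy"))
# [{'name': 'quasi_exact_match', 'split': 'test'}]

Note that toxicity_prompts and stereotype_bias set only main_split in their environments, which is why their metric groups (toxicity, decodingtrust_stereotype_bias_metrics) reference concrete metric names rather than ${main_name}.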