crfm-helm 0.5.3__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (60)
  1. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.4.dist-info}/METADATA +57 -62
  2. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.4.dist-info}/RECORD +53 -55
  3. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.4.dist-info}/WHEEL +1 -1
  4. helm/benchmark/annotation/anthropic_red_team_annotator.py +11 -24
  5. helm/benchmark/annotation/call_center_annotator.py +22 -11
  6. helm/benchmark/annotation/harm_bench_annotator.py +11 -24
  7. helm/benchmark/annotation/live_qa_annotator.py +9 -4
  8. helm/benchmark/annotation/medication_qa_annotator.py +9 -4
  9. helm/benchmark/annotation/model_as_judge.py +70 -19
  10. helm/benchmark/annotation/simple_safety_tests_annotator.py +11 -25
  11. helm/benchmark/annotation/xstest_annotator.py +20 -30
  12. helm/benchmark/metrics/safety_metrics.py +39 -17
  13. helm/benchmark/metrics/unitxt_metrics.py +17 -3
  14. helm/benchmark/metrics/vision_language/image_metrics.py +6 -2
  15. helm/benchmark/presentation/create_plots.py +1 -1
  16. helm/benchmark/presentation/schema.py +3 -0
  17. helm/benchmark/presentation/summarize.py +106 -256
  18. helm/benchmark/presentation/test_summarize.py +145 -3
  19. helm/benchmark/run_expander.py +27 -0
  20. helm/benchmark/run_specs/bhasa_run_specs.py +27 -13
  21. helm/benchmark/run_specs/finance_run_specs.py +6 -2
  22. helm/benchmark/run_specs/vlm_run_specs.py +8 -3
  23. helm/benchmark/scenarios/bhasa_scenario.py +226 -82
  24. helm/benchmark/scenarios/raft_scenario.py +1 -1
  25. helm/benchmark/static/schema_bhasa.yaml +10 -10
  26. helm/benchmark/static/schema_legal.yaml +566 -0
  27. helm/benchmark/static/schema_safety.yaml +25 -6
  28. helm/benchmark/static/schema_tables.yaml +26 -2
  29. helm/benchmark/static/schema_vhelm.yaml +42 -11
  30. helm/benchmark/static_build/assets/index-3ee38b3d.js +10 -0
  31. helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
  32. helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
  33. helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
  34. helm/benchmark/static_build/index.html +1 -1
  35. helm/benchmark/window_services/tokenizer_service.py +0 -5
  36. helm/clients/openai_client.py +16 -1
  37. helm/clients/palmyra_client.py +1 -2
  38. helm/clients/together_client.py +22 -0
  39. helm/common/cache.py +8 -30
  40. helm/common/key_value_store.py +9 -9
  41. helm/common/mongo_key_value_store.py +3 -3
  42. helm/common/test_cache.py +1 -48
  43. helm/common/tokenization_request.py +0 -9
  44. helm/config/model_deployments.yaml +135 -3
  45. helm/config/model_metadata.yaml +134 -6
  46. helm/config/tokenizer_configs.yaml +24 -0
  47. helm/proxy/server.py +0 -9
  48. helm/proxy/services/remote_service.py +0 -6
  49. helm/proxy/services/server_service.py +5 -18
  50. helm/proxy/services/service.py +0 -6
  51. helm/benchmark/data_overlap/__init__.py +0 -0
  52. helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
  53. helm/benchmark/data_overlap/export_scenario_text.py +0 -119
  54. helm/benchmark/data_overlap/light_scenario.py +0 -60
  55. helm/benchmark/static_build/assets/index-58f97dcd.js +0 -10
  56. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  57. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  58. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.4.dist-info}/LICENSE +0 -0
  59. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.4.dist-info}/entry_points.txt +0 -0
  60. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.4.dist-info}/top_level.txt +0 -0
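
The wheel-to-wheel comparison summarized above can be reproduced locally from the published wheels. Below is a minimal sketch using only the Python standard library; the wheel filenames and the example path are illustrative and assume both wheels have already been downloaded (for example with `pip download crfm-helm==0.5.3 --no-deps`).

```python
# Sketch of reproducing this wheel-to-wheel diff with the standard library.
# Assumes both wheels were already downloaded into the current directory;
# the filenames and the example path below are illustrative.
import difflib
import zipfile

OLD = "crfm_helm-0.5.3-py3-none-any.whl"
NEW = "crfm_helm-0.5.4-py3-none-any.whl"

with zipfile.ZipFile(OLD) as old_whl, zipfile.ZipFile(NEW) as new_whl:
    old_names, new_names = set(old_whl.namelist()), set(new_whl.namelist())
    print("Added files:", sorted(new_names - old_names))
    print("Removed files:", sorted(old_names - new_names))

    # Unified diff for one file that is present in both wheels.
    path = "helm/benchmark/static/schema_safety.yaml"
    old_lines = old_whl.read(path).decode("utf-8").splitlines(keepends=True)
    new_lines = new_whl.read(path).decode("utf-8").splitlines(keepends=True)
    print("".join(difflib.unified_diff(old_lines, new_lines, "0.5.3/" + path, "0.5.4/" + path)))
```
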
helm/benchmark/static/schema_bhasa.yaml
@@ -164,8 +164,8 @@ run_groups:
  category: BHASA scenarios
  subgroups:
  - lindsea_syntax_minimal_pairs_id
- - lindsea_pragmatics_pragmatic_reasoning_single_id
- - lindsea_pragmatics_pragmatic_reasoning_pair_id
+ - lindsea_pragmatics_presuppositions_id
+ - lindsea_pragmatics_scalar_implicatures_id

  - name: tydiqa
  display_name: TyDiQA
@@ -672,10 +672,10 @@ run_groups:
  when: "?"
  language: Indonesian

- - name: lindsea_pragmatics_pragmatic_reasoning_single_id
- display_name: LINDSEA Pragmatics Pragmatic Reasoning (single sentence)
+ - name: lindsea_pragmatics_presuppositions_id
+ display_name: LINDSEA Pragmatics Presuppositions
  description: >
- LINDSEA pragmatic reasoning (single sentence) is a linguistic diagnostic for pragmatics dataset from BHASA [(Leong, 2023)](https://arxiv.org/abs/2309.06085), involving scalar implicatures and presuppositions.
+ LINDSEA Pragmatics Presuppositions is a linguistic diagnostic for pragmatics dataset from BHASA [(Leong, 2023)](https://arxiv.org/abs/2309.06085), involving two formats: single and pair sentences. For single sentence questions, the system under test needs to determine if the sentence is true/false. For pair sentence questions, the system under test needs to determine whether a conclusion can be drawn from another sentence.
  metric_groups:
  - accuracy
  - efficiency
@@ -685,15 +685,15 @@ run_groups:
  main_split: test
  taxonomy:
  task: pragmatic reasoning
- what: scalar implicatures and presuppositions
+ what: presuppositions
  who: "?"
  when: "?"
  language: Indonesian

- - name: lindsea_pragmatics_pragmatic_reasoning_pair_id
- display_name: LINDSEA Pragmatics Pragmatic Reasoning (sentence pair)
+ - name: lindsea_pragmatics_scalar_implicatures_id
+ display_name: LINDSEA Pragmatics Scalar Implicatures
  description: >
- LINDSEA pragmatic reasoning (sentence pair) is a linguistic diagnostic for pragmatics dataset from BHASA [(Leong, 2023)](https://arxiv.org/abs/2309.06085), involving scalar implicatures and presuppositions.
+ LINDSEA Pragmatics Scalar Implicatures is a linguistic diagnostic for pragmatics dataset from BHASA [(Leong, 2023)](https://arxiv.org/abs/2309.06085), involving two formats: single and pair sentences. For single sentence questions, the system under test needs to determine if the sentence is true/false. For pair sentence questions, the system under test needs to determine whether a conclusion can be drawn from another sentence.
  metric_groups:
  - accuracy
  - efficiency
@@ -703,7 +703,7 @@ run_groups:
  main_split: test
  taxonomy:
  task: pragmatic reasoning
- what: scalar implicatures and presuppositions
+ what: scalar implicatures
  who: "?"
  when: "?"
  language: Indonesian
helm/benchmark/static/schema_legal.yaml
@@ -0,0 +1,566 @@
+ ---
+ ############################################################
+ # For backwards compatibility with older versions of HELM.
+ # TODO: Remove this in the future.
+ adapter: [ ]
+ ############################################################
+ metrics:
+ # Infrastructure metrics:
+ - name: num_perplexity_tokens
+ display_name: '# tokens'
+ description: Average number of tokens in the predicted output (for language modeling, the input too).
+ - name: num_bytes
+ display_name: '# bytes'
+ description: Average number of bytes in the predicted output (for language modeling, the input too).
+
+ - name: num_references
+ display_name: '# ref'
+ description: Number of references.
+ - name: num_train_trials
+ display_name: '# trials'
+ description: Number of trials, where in each trial we choose an independent, random set of training instances.
+ - name: estimated_num_tokens_cost
+ display_name: 'cost'
+ description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
+ - name: num_prompt_tokens
+ display_name: '# prompt tokens'
+ description: Number of tokens in the prompt.
+ - name: num_prompt_characters
+ display_name: '# prompt chars'
+ description: Number of characters in the prompt.
+ - name: num_completion_tokens
+ display_name: '# completion tokens'
+ description: Actual number of completion tokens (over all completions).
+ - name: num_output_tokens
+ display_name: '# output tokens'
+ description: Actual number of output tokens.
+ - name: max_num_output_tokens
+ display_name: 'Max output tokens'
+ description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
+ - name: num_requests
+ display_name: '# requests'
+ description: Number of distinct API requests.
+ - name: num_instances
+ display_name: '# eval'
+ description: Number of evaluation instances.
+ - name: num_train_instances
+ display_name: '# train'
+ description: Number of training instances (e.g., in-context examples).
+ - name: prompt_truncated
+ display_name: truncated
+ description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
+ - name: finish_reason_length
+ display_name: finish b/c length
+ description: Fraction of instances where the output was terminated because of the max tokens limit.
+ - name: finish_reason_stop
+ display_name: finish b/c stop
+ description: Fraction of instances where the output was terminated because of the stop sequences.
+ - name: finish_reason_endoftext
+ display_name: finish b/c endoftext
+ description: Fraction of instances where the output was terminated because the end of text token was generated.
+ - name: finish_reason_unknown
+ display_name: finish b/c unknown
+ description: Fraction of instances where the output was terminated for unknown reasons.
+ - name: num_completions
+ display_name: '# completions'
+ description: Number of completions.
+ - name: predicted_index
+ display_name: Predicted index
+ description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
+
+ # Accuracy metrics:
+ - name: exact_match
+ display_name: Exact match
+ short_display_name: EM
+ description: Fraction of instances that the predicted output matches a correct reference exactly.
+ lower_is_better: false
+ - name: quasi_exact_match
+ display_name: Quasi-exact match
+ short_display_name: EM
+ description: Fraction of instances that the predicted output matches a correct reference up to light processing.
+ lower_is_better: false
+ - name: prefix_exact_match
+ display_name: Prefix exact match
+ short_display_name: PEM
+ description: Fraction of instances that the predicted output matches the prefix of a correct reference exactly.
+ lower_is_better: false
+ - name: quasi_prefix_exact_match
+ # TODO: should call this prefix_quasi_exact_match
+ display_name: Prefix quasi-exact match
+ short_display_name: PEM
+ description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing.
+ lower_is_better: false
+
+ - name: exact_match@5
+ display_name: Exact match @5
+ short_display_name: EM@5
+ description: Fraction of instances where at least one predicted output among the top 5 matches a correct reference exactly.
+ lower_is_better: false
+ - name: quasi_exact_match@5
+ display_name: Quasi-exact match @5
+ short_display_name: EM@5
+ description: Fraction of instances where at least one predicted output among the top 5 matches a correct reference up to light processing.
+ lower_is_better: false
+ - name: prefix_exact_match@5
+ display_name: Prefix exact match @5
+ short_display_name: PEM@5
+ description: Fraction of instances that the predicted output among the top 5 matches the prefix of a correct reference exactly.
+ lower_is_better: false
+ - name: quasi_prefix_exact_match@5
+ display_name: Prefix quasi-exact match @5
+ short_display_name: PEM@5
+ description: Fraction of instances that the predicted output among the top 5 matches the prefix of a correct reference up to light processing.
+ lower_is_better: false
+
+ - name: logprob
+ display_name: Log probability
+ short_display_name: Logprob
+ description: Predicted output's average log probability (input's log prob for language modeling).
+ lower_is_better: false
+ - name: logprob_per_byte
+ display_name: Log probability / byte
+ short_display_name: Logprob/byte
+ description: Predicted output's average log probability normalized by the number of bytes.
+ lower_is_better: false
+ - name: bits_per_byte
+ display_name: Bits/byte
+ short_display_name: BPB
+ lower_is_better: true
+ description: Average number of bits per byte according to model probabilities.
+ - name: perplexity
+ display_name: Perplexity
+ short_display_name: PPL
+ lower_is_better: true
+ description: Perplexity of the output completion (effective branching factor per output token).
+ - name: rouge_1
+ display_name: ROUGE-1
+ description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 1-gram overlap.
+ lower_is_better: false
+ - name: rouge_2
+ display_name: ROUGE-2
+ description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.
+ lower_is_better: false
+ - name: rouge_l
+ display_name: ROUGE-L
+ description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on longest common subsequence overlap.
+ lower_is_better: false
+ - name: bleu_1
+ display_name: BLEU-1
+ description: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 1-gram overlap.
+ lower_is_better: false
+ - name: bleu_4
+ display_name: BLEU-4
+ description: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.
+ lower_is_better: false
+ - name: f1_set_match
+ display_name: F1 (set match)
+ short_display_name: F1
+ description: Average F1 score in terms of set overlap between the model predicted set and correct reference set.
+ lower_is_better: false
+ - name: f1_score
+ display_name: F1
+ description: Average F1 score in terms of word overlap between the model output and correct reference.
+ lower_is_better: false
+ - name: classification_macro_f1
+ display_name: Macro-F1
+ description: Population-level macro-averaged F1 score.
+ lower_is_better: false
+ - name: classification_micro_f1
+ display_name: Micro-F1
+ description: Population-level micro-averaged F1 score.
+ lower_is_better: false
+ - name: absolute_value_difference
+ display_name: Absolute difference
+ short_display_name: Diff.
+ lower_is_better: true
+ description: Average absolute difference between the model output (converted to a number) and the correct reference.
+ - name: distance
+ display_name: Geometric distance
+ short_display_name: Dist.
+ lower_is_better: true
+ description: Average geometric distance between the model output (as a point) and the correct reference (as a curve).
+ - name: percent_valid
+ display_name: Valid fraction
+ short_display_name: Valid
+ description: Fraction of valid model outputs (as a number).
+ lower_is_better: false
+ - name: NDCG@10
+ display_name: NDCG@10
+ description: Normalized discounted cumulative gain at 10 in information retrieval.
+ lower_is_better: false
+ - name: RR@10
+ display_name: RR@10
+ description: Mean reciprocal rank at 10 in information retrieval.
+ lower_is_better: false
+ - name: NDCG@20
+ display_name: NDCG@20
+ description: Normalized discounted cumulative gain at 20 in information retrieval.
+ lower_is_better: false
+ - name: RR@20
+ display_name: RR@20
+ description: Mean reciprocal rank at 20 in information retrieval.
+ lower_is_better: false
+ - name: math_equiv
+ display_name: Equivalent
+ description: Fraction of model outputs that are mathematically equivalent to the correct reference.
+ lower_is_better: false
+ - name: math_equiv_chain_of_thought
+ display_name: Equivalent (CoT)
+ description: Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.
+ lower_is_better: false
+ - name: exact_match_indicator
+ display_name: Exact match (final)
+ short_display_name: EM
+ description: Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator (e.g., space).
+ lower_is_better: false
+ - name: final_number_exact_match
+ display_name: Exact match (final number)
+ short_display_name: EM
+ description: Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.
+ lower_is_better: false
+ - name: exact_set_match
+ display_name: Exact match (at sets)
+ short_display_name: EM
+ description: Fraction of instances that the predicted output matches a correct reference exactly as sets.
+ lower_is_better: false
+ - name: iou_set_match
+ display_name: Intersection over union (as sets)
+ short_display_name: IoU
+ description: Intersection over union in terms of set overlap between the model predicted set and correct reference set.
+ lower_is_better: false
+
+ # Summarization metrics
+ - name: summac
+ display_name: SummaC
+ description: Faithfulness scores based on the SummaC method of [Laban et al. (2022)](https://aclanthology.org/2022.tacl-1.10/).
+ lower_is_better: false
+ - name: QAFactEval
+ display_name: QAFactEval
+ description: Faithfulness scores based on the SummaC method of [Laban et al. (2022)](https://aclanthology.org/2022.tacl-1.10/).
+ lower_is_better: false
+ - name: summarization_coverage
+ display_name: Coverage
+ description: Extent to which the model-generated summaries are extractive fragments from the source document [(Grusky et al., 2018)](https://aclanthology.org/N18-1065/).
+ - name: summarization_density
+ display_name: Density
+ description: Extent to which the model-generated summaries are extractive summaries based on the source document [(Grusky et al., 2018)](https://aclanthology.org/N18-1065/).
+ - name: summarization_compression
+ display_name: Compression
+ description: Extent to which the model-generated summaries are compressed relative to the source document [(Grusky et al., 2018)](https://aclanthology.org/N18-1065/).
+ - name: BERTScore-P
+ display_name: BERTScore (P)
+ description: Average BERTScore precision [(Zhang et al., 2020)](https://openreview.net/pdf?id=SkeHuCVFDr) between model generation and reference summary.
+ lower_is_better: false
+ - name: BERTScore-R
+ display_name: BERTScore (R)
+ description: Average BERTScore recall [(Zhang et al., 2020)](https://openreview.net/pdf?id=SkeHuCVFDr) between model generation and reference summary.
+ lower_is_better: false
+ - name: BERTScore-F
+ display_name: BERTScore (F1)
+ description: Average BERTScore F1 [(Zhang et al., 2020)](https://openreview.net/pdf?id=SkeHuCVFDr) between model generation and reference summary.
+ lower_is_better: false
+ - name: HumanEval-faithfulness
+ display_name: HumanEval-faithfulness
+ description: Human evaluation score for faithfulness.
+ lower_is_better: false
+ - name: HumanEval-relevance
+ display_name: HumanEval-relevance
+ description: Human evaluation score for relevance.
+ lower_is_better: false
+ - name: HumanEval-coherence
+ display_name: HumanEval-coherence
+ description: Human evaluation score for coherence.
+ lower_is_better: false
+
+ # Disinformation metrics (measure diversity):
+ - name: self_bleu
+ display_name: Self-BLEU
+ description: Self-BLEU metric of [Zhu et al. (2018)](https://arxiv.org/pdf/1802.01886.pdf) as a measure of diversity in model generations.
+ - name: monte_carlo_entropy
+ display_name: Entropy (Monte Carlo)
+ short_display_name: Entropy
+ description: Monte Carlo estimate of the entropy as a measure of diversity in model generations.
+ # Disinformation wedging metrics (measure quality of disinformation):
+ - name: wedging_eval_address_intended_audience
+ display_name: Address intended audience (human)
+ short_display_name: Intended audience (human)
+ description: Human judgment of fraction of model generations that target the intended audience.
+ - name: wedging_eval_include_intended_goal
+ display_name: Include intended goal (human)
+ short_display_name: Intended goal (human)
+ description: Human judgment of fraction of model generations that support the intended goal.
+ - name: wedging_eval_style
+ display_name: Correct style (human)
+ short_display_name: Style (human)
+ description: Average human judgment of whether model generations are of the specified style.
+ - name: wedging_eval_divisiveness
+ display_name: Divisiveness (human)
+ description: Average human judgment of the extent to which model generations are divisive.
+ - name: wedging_eval_toxicity
+ display_name: Toxicity (human)
+ description: Average human judgment of the extent to which model generations are toxic.
+ # Disinformation reiteration metrics (measure quality of disinformation):
+ - name: reiteration_eval_support_thesis
+ display_name: Support thesis (human)
+ description: Average human judgment of the extent to which model generations support the stated thesis.
+ - name: reiteration_eval_style
+ display_name: Style (human)
+ description: Average human judgment of whether model generations are of the specified style.
+
+ # Copyright metrics (measure copying/overlap):
+ - name: longest_common_prefix_length
+ display_name: Longest common prefix length
+ short_display_name: LCS
+ lower_is_better: true
+ description: Average length of longest common prefix between model generation and reference.
+ - name: edit_distance
+ display_name: Edit distance (Levenshtein)
+ short_display_name: Edit dist.
+ description: Average Levenshtein edit distance between model generation and reference.
+ lower_is_better: false
+ - name: edit_similarity
+ display_name: Edit similarity (Levenshtein)
+ short_display_name: Edit sim.
+ lower_is_better: true
+ description: Average Levenshtein edit similarity (1 - distance normalized by length of longer sequence) between model generation and reference.
+
+ # Bias metrics:
+ - name: bias_metric:mode=associations,demographic_category=race,target_category=profession
+ display_name: Stereotypical associations (race, profession)
+ short_display_name: Stereotypes (race)
+ lower_is_better: true
+ description: Measures uneven association of racial groups (Asian, Hispanic, White) with target professions. This measurement is based on co-occurrence statistics between the racially-associated names (word list from [Garg et al., 2018](https://www.pnas.org/doi/10.1073/pnas.1720347115); race associations based on US Census statistics) and the target professions (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
+ - name: bias_metric:mode=associations,demographic_category=race,target_category=adjective
+ display_name: Stereotypical associations (race, adjectives)
+ short_display_name: Stereotypes (race)
+ lower_is_better: true
+ description: Measures uneven association of racial groups (Asian, Hispanic, White) with target adjectives. This measurement is based on co-occurrence statistics between the racially-associated names (word list from [Garg et al., 2018](https://www.pnas.org/doi/10.1073/pnas.1720347115); race associations based on US Census statistics) and the target adjectives (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
+
+ - name: bias_metric:mode=associations,demographic_category=gender,target_category=profession
+ display_name: Stereotypical associations (gender, profession)
+ short_display_name: Stereotypes (gender)
+ lower_is_better: true
+ description: Measures uneven association of gender groups (male, female) with target professions. This measurement is based on co-occurrence statistics between the gender terms (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)) and the target professions (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
+ - name: bias_metric:mode=associations,demographic_category=gender,target_category=adjective
+ display_name: Stereotypical associations (gender, adjectives)
+ short_display_name: Stereotypes (gender)
+ lower_is_better: true
+ description: Measures uneven association of gender groups (male, female) with target adjectives. This measurement is based on co-occurrence statistics between the gender terms (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)) and the target adjectives (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
+
+ - name: bias_metric:mode=representation,demographic_category=race
+ display_name: Demographic representation (race)
+ short_display_name: Representation (race)
+ lower_is_better: true
+ description: Measures uneven representation of racial groups (Asian, Hispanic, White). This measurement is based on disparities in the frequency statistics across racially-associated names (word list from [Garg et al., 2018](https://www.pnas.org/doi/10.1073/pnas.1720347115); race associations based on US Census statistics).
+ - name: bias_metric:mode=representation,demographic_category=gender
+ display_name: Demographic representation (gender)
+ short_display_name: Representation (gender)
+ lower_is_better: true
+ description: Measures uneven representation of gender groups (male, female). This measurement is based on disparities in the frequency statistics across gender terms (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
+
+ - name: bbq_metric_ambiguous_bias
+ display_name: BBQ (ambiguous)
+ lower_is_better: true
+ description: Metric of [Parrish et al. (2022)](https://aclanthology.org/2022.findings-acl.165/) for BBQ on ambiguous examples.
+ - name: bbq_metric_unambiguous_bias
+ display_name: BBQ (unambiguous)
+ lower_is_better: true
+ description: Metric of [Parrish et al. (2022)](https://aclanthology.org/2022.findings-acl.165/) for BBQ on unambiguous examples.
+
+ # Toxicity metrics
+ - name: expected_max_toxicity
+ display_name: Average max toxicity
+ short_display_name: Avg. max toxicity
+ lower_is_better: true
+ description: Average max toxicity score across all completions (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
+ - name: max_toxicity_probability
+ display_name: max toxicity
+ lower_is_better: true
+ description: Fraction of instances where most toxic completion is above 0.5 toxicity (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
+ - name: toxic_frac
+ display_name: Toxic fraction
+ lower_is_better: true
+ description: Fraction of model outputs that are toxic (based on the PerspectiveAPI toxicity classifier).
+
+ # Efficiency metrics:
+ - name: training_co2_cost
+ display_name: Estimated training emissions (kg CO2)
+ short_display_name: Training emissions (kg CO2)
+ lower_is_better: true
+ description: Estimate of the CO2 emissions from training the model.
+ - name: training_energy_cost
+ display_name: Estimated training energy cost (MWh)
+ short_display_name: Training energy (MWh)
+ lower_is_better: true
+ description: Estimate of the amount of energy used to train the model.
+ - name: inference_runtime
+ display_name: Observed inference runtime (s)
+ short_display_name: Observed inference time (s)
+ lower_is_better: true
+ description: Average observed time to process a request to the model (via an API, and thus depends on particular deployment).
+ - name: inference_idealized_runtime
+ display_name: Idealized inference runtime (s)
+ short_display_name: Idealized inference time (s)
+ lower_is_better: true
+ description: Average time to process a request to the model based solely on the model architecture (using Megatron-LM).
+ - name: inference_denoised_runtime
+ display_name: Denoised inference runtime (s)
+ short_display_name: Denoised inference time (s)
+ lower_is_better: true
+ description: Average time to process a request to the model minus performance contention by using profiled runtimes from multiple trials of SyntheticEfficiencyScenario.
+ - name: batch_size
+ display_name: Batch size
+ description: For batch jobs, how many requests are in a batch.
+
+ # Calibration metrics:
+ - name: ece_1_bin
+ display_name: 1-bin expected calibration error
+ short_display_name: ECE (1-bin)
+ lower_is_better: true
+ description: The (absolute value) difference between the model's average confidence and accuracy (only computed for classification tasks).
+ - name: max_prob
+ display_name: Max prob
+ description: Model's average confidence in its prediction (only computed for classification tasks)
+ lower_is_better: false
+ - name: ece_10_bin
+ display_name: 10-bin expected calibration error
+ short_display_name: ECE (10-bin)
+ lower_is_better: true
+ description: The average difference between the model's confidence and accuracy, averaged across 10 bins where each bin contains an equal number of points (only computed for classification tasks). Warning - not reliable for small datasets (e.g., with < 300 examples) because each bin will have very few examples.
+ - name: platt_ece_1_bin
+ display_name: 1-bin expected calibration error (after Platt scaling)
+ short_display_name: Platt-scaled ECE (1-bin)
+ lower_is_better: true
+ description: 1-bin ECE computed after applying Platt scaling to recalibrate the model's predicted probabilities.
+ - name: platt_ece_10_bin
+ display_name: 10-bin Expected Calibration Error (after Platt scaling)
+ short_display_name: Platt-scaled ECE (10-bin)
+ lower_is_better: true
+ description: 10-bin ECE computed after applying Platt scaling to recalibrate the model's predicted probabilities.
+ - name: platt_coef
+ display_name: Platt Scaling Coefficient
+ short_display_name: Platt Coef
+ description: Coefficient of the Platt scaling classifier (can compare this across tasks).
+ lower_is_better: false
+ - name: platt_intercept
+ display_name: Platt Scaling Intercept
+ short_display_name: Platt Intercept
+ description: Intercept of the Platt scaling classifier (can compare this across tasks).
+ lower_is_better: false
+ - name: selective_cov_acc_area
+ display_name: Selective coverage-accuracy area
+ short_display_name: Selective Acc
+ description: The area under the coverage-accuracy curve, a standard selective classification metric (only computed for classification tasks).
+ lower_is_better: false
+ - name: selective_acc@10
+ display_name: Accuracy at 10% coverage
+ short_display_name: Acc@10%
+ description: The accuracy for the 10% of predictions that the model is most confident on (only computed for classification tasks).
+ lower_is_better: false
+
+ ############################################################
+ perturbations:
+ - name: robustness
+ display_name: Robustness
+ description: Computes worst case over different robustness perturbations (misspellings, formatting, contrast sets).
+ - name: fairness
+ display_name: Fairness
+ description: Computes worst case over different fairness perturbations (changing dialect, race of names, gender).
+ - name: typos
+ display_name: Typos
+ description: >
+ Randomly adds typos to each token in the input with probability 0.05 and computes the per-instance worst-case
+ performance between perturbed and unperturbed versions.
+ - name: synonym
+ display_name: Synonyms
+ description: >
+ Randomly substitutes words in the input with WordNet synonyms with probability 0.5 and computes the per-instance
+ worst-case performance between perturbed and unperturbed versions.
+ - name: dialect
+ display_name: SAE -> AAE
+ short_display_name: Dialect
+ description: >
+ Deterministically substitutes SAE words in input with AAE counterparts using validated dictionary of [Ziems et al. (2022)](https://aclanthology.org/2022.acl-long.258/) and computes the per-instance worst-case performance between perturbed and unperturbed versions.
+ - name: race
+ display_name: First names by race (White -> Black)
+ short_display_name: Race
+ description: >
+ Deterministically substitutes White first names with Black first names sampled from the lists of [Caliskan et al. (2017)](https://www.science.org/doi/10.1126/science.aal4230) and computes the per-instance worst-case performance between perturbed and unperturbed versions.
+ - name: gender
+ display_name: Pronouns by gender (Male -> Female)
+ short_display_name: Gender
+ description: >
+ Deterministically substitutes male pronouns with female pronouns and computes the per-instance worst-case
+ performance between perturbed and unperturbed versions.
+
+ ############################################################
+ metric_groups:
+ - name: accuracy
+ display_name: Accuracy
+ metrics:
+ - name: ${main_name}
+ split: ${main_split}
+
+ - name: efficiency
+ display_name: Efficiency
+ metrics:
+ - name: inference_runtime
+ split: ${main_split}
+
+ - name: general_information
+ display_name: General information
+ metrics:
+ - name: num_instances
+ split: ${main_split}
+ - name: num_train_instances
+ split: ${main_split}
+ - name: prompt_truncated
+ split: ${main_split}
+ - name: num_prompt_tokens
+ split: ${main_split}
+ - name: num_output_tokens
+ split: ${main_split}
+
+ ############################################################
+ run_groups:
+ - name: core_scenarios
+ display_name: Core scenarios
+ description: The scenarios where we evaluate all the models.
+ category: All scenarios
+ subgroups:
+ - lsat_qa
+ - legalbench
+
+ - name: legalbench
+ display_name: LegalBench
+ description: LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al., 2023)](https://arxiv.org/pdf/2308.11462.pdf).
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: quasi_exact_match
+ main_split: test
+ taxonomy:
+ task: multiple-choice question answering
+ what: public legal and administrative documents, manually constructed questions
+ who: lawyers
+ when: before 2023
+ language: English
+
+ - name: lsat_qa
+ display_name: LSAT
+ description: The LSAT benchmark for measuring analytical reasoning on the Law School Admission Test (LSAT; [Zhong et al., 2021](https://arxiv.org/pdf/2104.06598.pdf)).
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: quasi_exact_match
+ main_split: test
+ taxonomy:
+ task: "?"
+ what: n/a
+ who: n/a
+ when: n/a
+ language: synthetic
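
A note on how the metric_groups in the new schema are wired to its run groups: the `${main_name}` and `${main_split}` placeholders are resolved per run group from its `environment` block, so for `legalbench` the accuracy group reports `quasi_exact_match` on the `test` split. A minimal sketch of that substitution (illustrative only, not HELM's actual resolution code):

```python
# Illustrative only: resolve the ${main_name}/${main_split} placeholders of a
# metric_groups entry against a run group's `environment` block, mirroring the
# declarations in schema_legal.yaml above. Not HELM's own implementation.
from string import Template

environment = {"main_name": "quasi_exact_match", "main_split": "test"}  # legalbench
accuracy_metrics = [{"name": "${main_name}", "split": "${main_split}"}]

resolved = [
    {key: Template(value).substitute(environment) for key, value in metric.items()}
    for metric in accuracy_metrics
]
print(resolved)  # -> [{'name': 'quasi_exact_match', 'split': 'test'}]
```
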