crfm-helm 0.5.6__py3-none-any.whl → 0.5.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crfm-helm might be problematic.
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +56 -49
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +99 -66
- helm/benchmark/annotation/air_bench_annotator.py +1 -1
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +1 -1
- helm/benchmark/metrics/copyright_metrics.py +1 -1
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +1 -1
- helm/benchmark/model_deployment_registry.py +11 -19
- helm/benchmark/presentation/create_plots.py +11 -2
- helm/benchmark/presentation/schema.py +5 -0
- helm/benchmark/presentation/summarize.py +9 -3
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/run.py +7 -1
- helm/benchmark/run_specs/arabic_run_specs.py +73 -0
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +0 -53
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +3 -1
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +48 -1
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +5 -11
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
- helm/benchmark/scenarios/aratrust_scenario.py +76 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/{ultra_suite_asr_classification.py → ultra_suite_asr_classification_scenario.py} +9 -8
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +13 -5
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +13 -5
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +13 -5
- helm/benchmark/scenarios/bluex_scenario.py +66 -0
- helm/benchmark/scenarios/cleva_scenario.py +1 -1
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/math_scenario.py +21 -20
- helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
- helm/benchmark/scenarios/melt_scenarios.py +2 -2
- helm/benchmark/scenarios/mimic_bhc_scenario.py +1 -1
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/seahelm_scenario.py +2 -2
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +8 -1
- helm/benchmark/static/schema_arabic.yaml +228 -0
- helm/benchmark/static/schema_classic.yaml +0 -17
- helm/benchmark/static/schema_long_context.yaml +19 -1
- helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
- helm/benchmark/static_build/index.html +1 -1
- helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
- helm/clients/audio_language/qwen2_5_omni_client.py +19 -7
- helm/clients/huggingface_client.py +2 -2
- helm/clients/openai_client.py +2 -1
- helm/clients/openai_responses_client.py +6 -4
- helm/clients/test_huggingface_client.py +3 -3
- helm/clients/together_client.py +0 -2
- helm/clients/vertexai_client.py +11 -9
- helm/clients/vllm_client.py +43 -7
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/common/critique_request.py +0 -1
- helm/common/hierarchical_logger.py +83 -34
- helm/common/object_spec.py +23 -8
- helm/common/test_logging.py +94 -0
- helm/config/model_deployments.yaml +454 -175
- helm/config/model_metadata.yaml +117 -10
- helm/config/tokenizer_configs.yaml +81 -1
- helm/proxy/cli.py +1 -1
- helm/proxy/retry.py +5 -0
- helm/tokenizers/grok_tokenizer.py +2 -0
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/scenarios/numeracy_scenario.py +0 -794
- helm/benchmark/static_build/assets/index-94295e78.js +0 -10
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
helm/benchmark/static/schema_arabic.yaml
@@ -0,0 +1,228 @@
+---
+# Schema for Arabic scenarios
+############################################################
+metrics:
+  # Infrastructure metrics:
+  - name: num_perplexity_tokens
+    display_name: '# tokens'
+    description: Average number of tokens in the predicted output (for language modeling, the input too).
+  - name: num_bytes
+    display_name: '# bytes'
+    description: Average number of bytes in the predicted output (for language modeling, the input too).
+
+  - name: num_references
+    display_name: '# ref'
+    description: Number of references.
+  - name: num_train_trials
+    display_name: '# trials'
+    description: Number of trials, where in each trial we choose an independent, random set of training instances.
+  - name: estimated_num_tokens_cost
+    display_name: 'cost'
+    description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
+  - name: num_prompt_tokens
+    display_name: '# prompt tokens'
+    description: Number of tokens in the prompt.
+  - name: num_prompt_characters
+    display_name: '# prompt chars'
+    description: Number of characters in the prompt.
+  - name: num_completion_tokens
+    display_name: '# completion tokens'
+    description: Actual number of completion tokens (over all completions).
+  - name: num_output_tokens
+    display_name: '# output tokens'
+    description: Actual number of output tokens.
+  - name: max_num_output_tokens
+    display_name: 'Max output tokens'
+    description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
+  - name: num_requests
+    display_name: '# requests'
+    description: Number of distinct API requests.
+  - name: num_instances
+    display_name: '# eval'
+    description: Number of evaluation instances.
+  - name: num_train_instances
+    display_name: '# train'
+    description: Number of training instances (e.g., in-context examples).
+  - name: prompt_truncated
+    display_name: truncated
+    description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
+  - name: finish_reason_length
+    display_name: finish b/c length
+    description: Fraction of instances where the the output was terminated because of the max tokens limit.
+  - name: finish_reason_stop
+    display_name: finish b/c stop
+    description: Fraction of instances where the the output was terminated because of the stop sequences.
+  - name: finish_reason_endoftext
+    display_name: finish b/c endoftext
+    description: Fraction of instances where the the output was terminated because the end of text token was generated.
+  - name: finish_reason_unknown
+    display_name: finish b/c unknown
+    description: Fraction of instances where the the output was terminated for unknown reasons.
+  - name: num_completions
+    display_name: '# completions'
+    description: Number of completions.
+  - name: predicted_index
+    display_name: Predicted index
+    description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
+  - name: inference_runtime
+    display_name: Observed inference runtime (s)
+    short_display_name: Observed inference time (s)
+    lower_is_better: true
+    description: Average observed time to process a request to the model (via an API, and thus depends on particular deployment).
+
+  # Accuracy metrics:
+  - name: exact_match
+    display_name: Exact match
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference exactly.
+    lower_is_better: false
+  - name: quasi_exact_match
+    display_name: Quasi-exact match
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference up to light processing.
+    lower_is_better: false
+  - name: prefix_exact_match
+    display_name: Prefix exact match
+    short_display_name: PEM
+    description: Fraction of instances that the predicted output matches the prefix of a correct reference exactly.
+    lower_is_better: false
+  - name: quasi_prefix_exact_match
+    # TODO: should call this prefix_quasi_exact_match
+    display_name: Prefix quasi-exact match
+    short_display_name: PEM
+    description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing.
+    lower_is_better: false
+
+############################################################
+perturbations: []
+
+############################################################
+metric_groups:
+  - name: accuracy
+    display_name: Accuracy
+    aggregation_strategies:
+      - mean
+    metrics:
+      - name: ${main_name}
+        split: ${main_split}
+
+  - name: efficiency
+    display_name: Efficiency
+    aggregation_strategies:
+      - mean
+    metrics:
+      - name: inference_runtime
+        split: ${main_split}
+
+  - name: general_information
+    display_name: General information
+    hide_win_rates: true
+    metrics:
+      - name: num_instances
+        split: ${main_split}
+      - name: num_train_instances
+        split: ${main_split}
+      - name: prompt_truncated
+        split: ${main_split}
+      - name: num_prompt_tokens
+        split: ${main_split}
+      - name: num_output_tokens
+        split: ${main_split}
+
+############################################################
+run_groups:
+  - name: arabic_scenarios
+    display_name: Arabic Scenarios
+    description: Arabic Scenarios
+    category: All scenarios
+    subgroups:
+      - mmmlu
+      - arabic_mmlu
+      - alghafa
+      - exams_multilingual
+      - aratrust
+
+  - name: mmmlu
+    display_name: Multilingual MMLU (Arabic)
+    description: Multilingual MMLU (Arabic)
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: math, science, history, etc.
+      who: various online sources
+      when: before 2021
+      language: Arabic
+
+  - name: arabic_mmlu
+    display_name: Arabic MMLU
+    description: Arabic MMLU
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: "question answering"
+      what: "academic questions across various disciplines"
+      who: "academic exams writers and takers"
+      when: "before 2024"
+      language: Arabic
+
+  - name: alghafa
+    display_name: AlGhafa
+    description: AlGhafa
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: "multiple choice question answering"
+      what: Various
+      who: Various
+      when: "before 2023"
+      language: Arabic
+
+  - name: exams_multilingual
+    display_name: EXAMS (Arabic)
+    description: EXAMS (Arabic)
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: "multiple choice question answering"
+      what: High school examinations
+      who: High school examinations writers and test-takers
+      when: before 2020
+      language: Arabic
+
+  - name: aratrust
+    display_name: AraTrust
+    description: AraTrust
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: "question answering"
+      what: "academic questions across various disciplines"
+      who: "academic exams writers and takers"
+      when: "before 2024"
+      language: Arabic
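In HELM schemas such as the one above, the `${main_name}` and `${main_split}` placeholders inside a metric group are filled in per run group from that group's `environment` block when results are summarized. As a sketch only (this resolved form is not text from the diff), the `accuracy` metric group evaluated for the `mmmlu` run group effectively reads:

  # Illustrative resolved view, not part of schema_arabic.yaml:
  # ${main_name} and ${main_split} are substituted from mmmlu's environment block.
  - name: accuracy
    display_name: Accuracy
    aggregation_strategies:
      - mean
    metrics:
      - name: exact_match   # from environment.main_name
        split: test         # from environment.main_split

This is why every Arabic run group above only needs to name the `accuracy` metric group and set `main_name`/`main_split`, rather than repeating the metric list.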
helm/benchmark/static/schema_classic.yaml
@@ -1683,23 +1683,6 @@ run_groups:
       when: n/a
       language: synthetic
 
-  - name: numeracy
-    display_name: Numerical reasoning
-    description: Scenario introduced in this work to test numerical reasoning via symbolic regression.
-    metric_groups:
-      - accuracy
-      - efficiency
-      - general_information
-    environment:
-      main_name: absolute_value_difference
-      main_split: test
-    taxonomy:
-      task: next-word prediction
-      what: Dyck formal language
-      who: n/a
-      when: n/a
-      language: synthetic
-
   - name: synthetic_reasoning
     display_name: Synthetic reasoning (abstract symbols)
     description: Synthetic reasoning tasks defined using abstract symbols based on LIME [(Wu et al., 2021)](https://proceedings.mlr.press/v139/wu21c.html).
helm/benchmark/static/schema_long_context.yaml
@@ -195,6 +195,7 @@ run_groups:
       - ruler_squad
       - infinite_bench_en_sum
       - infinite_bench_en_qa
+      - infinite_bench_en_mc
       - openai_mrcr
 
   - name: ruler_hotpotqa
@@ -234,7 +235,7 @@ run_groups:
 
   - name: infinite_bench_en_qa
     display_name: ∞Bench En.QA
-    description: ∞Bench En.QA is a question answering task that requires locating and processing information within a novel, performing reasoning through aggregation or filtering to derive answers. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))
+    description: ∞Bench En.QA is a open-ended question answering task that requires locating and processing information within a novel, performing reasoning through aggregation or filtering to derive answers. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))
     metric_groups:
       - accuracy
       - general_information
@@ -249,6 +250,23 @@ run_groups:
       when: Before 2024
       language: English
 
+  - name: infinite_bench_en_mc
+    display_name: ∞Bench En.MC
+    description: ∞Bench En.MC is a multiple-choice question answering task that requires locating and processing information within a novel, performing reasoning through aggregation or filtering to derive answers. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))
+    metric_groups:
+      - accuracy
+      - general_information
+      - annotation_metrics
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: Novels
+      who: Novel authors
+      when: Before 2024
+      language: English
+
   - name: infinite_bench_en_sum
     display_name: ∞Bench En.Sum
     description: ∞Bench En.Sum is a summarization task that requires generating a concise summary of a novel. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))