crfm-helm 0.5.6-py3-none-any.whl → 0.5.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of crfm-helm has been flagged as potentially problematic.
Files changed (103)
  1. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +56 -49
  2. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +99 -66
  3. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  4. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  5. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  6. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  7. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  8. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  9. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  10. helm/benchmark/metrics/comet_metric.py +1 -1
  11. helm/benchmark/metrics/copyright_metrics.py +1 -1
  12. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  13. helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
  14. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  15. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  16. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  17. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  18. helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
  19. helm/benchmark/metrics/summac/model_summac.py +1 -1
  20. helm/benchmark/model_deployment_registry.py +11 -19
  21. helm/benchmark/presentation/create_plots.py +11 -2
  22. helm/benchmark/presentation/schema.py +5 -0
  23. helm/benchmark/presentation/summarize.py +9 -3
  24. helm/benchmark/presentation/test_create_plots.py +4 -1
  25. helm/benchmark/run.py +7 -1
  26. helm/benchmark/run_specs/arabic_run_specs.py +73 -0
  27. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  28. helm/benchmark/run_specs/classic_run_specs.py +0 -53
  29. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  30. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  31. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  32. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  33. helm/benchmark/run_specs/long_context_run_specs.py +48 -1
  34. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  35. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +5 -11
  36. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  37. helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
  38. helm/benchmark/scenarios/aratrust_scenario.py +76 -0
  39. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  40. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  41. helm/benchmark/scenarios/audio_language/{ultra_suite_asr_classification.py → ultra_suite_asr_classification_scenario.py} +9 -8
  42. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
  43. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +13 -5
  44. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +13 -5
  45. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +13 -5
  46. helm/benchmark/scenarios/bluex_scenario.py +66 -0
  47. helm/benchmark/scenarios/cleva_scenario.py +1 -1
  48. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  49. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  50. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  51. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  52. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  53. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  54. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  55. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
  56. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
  57. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  58. helm/benchmark/scenarios/math_scenario.py +21 -20
  59. helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
  60. helm/benchmark/scenarios/melt_scenarios.py +2 -2
  61. helm/benchmark/scenarios/mimic_bhc_scenario.py +1 -1
  62. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  63. helm/benchmark/scenarios/seahelm_scenario.py +2 -2
  64. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  65. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  66. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  67. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  68. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  69. helm/benchmark/slurm_jobs.py +1 -2
  70. helm/benchmark/slurm_runner.py +8 -1
  71. helm/benchmark/static/schema_arabic.yaml +228 -0
  72. helm/benchmark/static/schema_classic.yaml +0 -17
  73. helm/benchmark/static/schema_long_context.yaml +19 -1
  74. helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
  75. helm/benchmark/static_build/index.html +1 -1
  76. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  77. helm/clients/audio_language/qwen2_5_omni_client.py +19 -7
  78. helm/clients/huggingface_client.py +2 -2
  79. helm/clients/openai_client.py +2 -1
  80. helm/clients/openai_responses_client.py +6 -4
  81. helm/clients/test_huggingface_client.py +3 -3
  82. helm/clients/together_client.py +0 -2
  83. helm/clients/vertexai_client.py +11 -9
  84. helm/clients/vllm_client.py +43 -7
  85. helm/clients/vllm_granite_thinking_client.py +56 -0
  86. helm/common/critique_request.py +0 -1
  87. helm/common/hierarchical_logger.py +83 -34
  88. helm/common/object_spec.py +23 -8
  89. helm/common/test_logging.py +94 -0
  90. helm/config/model_deployments.yaml +454 -175
  91. helm/config/model_metadata.yaml +117 -10
  92. helm/config/tokenizer_configs.yaml +81 -1
  93. helm/proxy/cli.py +1 -1
  94. helm/proxy/retry.py +5 -0
  95. helm/tokenizers/grok_tokenizer.py +2 -0
  96. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  97. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  98. helm/benchmark/scenarios/numeracy_scenario.py +0 -794
  99. helm/benchmark/static_build/assets/index-94295e78.js +0 -10
  100. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +0 -0
  101. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
  102. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
  103. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
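
The file-level summary above can be reproduced from the two wheels themselves by comparing their *.dist-info/RECORD metadata. Below is a minimal sketch of that comparison; it assumes both wheel files have already been downloaded locally, and the local filenames are illustrative.

# Illustrative only: rebuild a file-level change list like the one above by
# comparing the RECORD metadata of the two wheels. Assumes both wheels have
# been downloaded to the current directory; filenames are illustrative.
import zipfile


def record_entries(wheel_path: str, dist_info: str) -> dict[str, str]:
    """Map each packaged file to its recorded hash from *.dist-info/RECORD."""
    with zipfile.ZipFile(wheel_path) as wheel:
        record = wheel.read(f"{dist_info}/RECORD").decode("utf-8")
    entries = {}
    for line in record.splitlines():
        if line.strip():
            # RECORD lines have the form: path,hash,size
            path, file_hash, _size = line.rsplit(",", 2)
            entries[path] = file_hash
    return entries


old = record_entries("crfm_helm-0.5.6-py3-none-any.whl", "crfm_helm-0.5.6.dist-info")
new = record_entries("crfm_helm-0.5.7-py3-none-any.whl", "crfm_helm-0.5.7.dist-info")

added = sorted(set(new) - set(old))
removed = sorted(set(old) - set(new))
changed = sorted(path for path in set(old) & set(new) if old[path] != new[path])
print(f"added: {len(added)}, removed: {len(removed)}, changed: {len(changed)}")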
helm/benchmark/static/schema_arabic.yaml
@@ -0,0 +1,228 @@
+ ---
+ # Schema for Arabic scenarios
+ ############################################################
+ metrics:
+ # Infrastructure metrics:
+ - name: num_perplexity_tokens
+ display_name: '# tokens'
+ description: Average number of tokens in the predicted output (for language modeling, the input too).
+ - name: num_bytes
+ display_name: '# bytes'
+ description: Average number of bytes in the predicted output (for language modeling, the input too).
+
+ - name: num_references
+ display_name: '# ref'
+ description: Number of references.
+ - name: num_train_trials
+ display_name: '# trials'
+ description: Number of trials, where in each trial we choose an independent, random set of training instances.
+ - name: estimated_num_tokens_cost
+ display_name: 'cost'
+ description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
+ - name: num_prompt_tokens
+ display_name: '# prompt tokens'
+ description: Number of tokens in the prompt.
+ - name: num_prompt_characters
+ display_name: '# prompt chars'
+ description: Number of characters in the prompt.
+ - name: num_completion_tokens
+ display_name: '# completion tokens'
+ description: Actual number of completion tokens (over all completions).
+ - name: num_output_tokens
+ display_name: '# output tokens'
+ description: Actual number of output tokens.
+ - name: max_num_output_tokens
+ display_name: 'Max output tokens'
+ description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
+ - name: num_requests
+ display_name: '# requests'
+ description: Number of distinct API requests.
+ - name: num_instances
+ display_name: '# eval'
+ description: Number of evaluation instances.
+ - name: num_train_instances
+ display_name: '# train'
+ description: Number of training instances (e.g., in-context examples).
+ - name: prompt_truncated
+ display_name: truncated
+ description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
+ - name: finish_reason_length
+ display_name: finish b/c length
+ description: Fraction of instances where the the output was terminated because of the max tokens limit.
+ - name: finish_reason_stop
+ display_name: finish b/c stop
+ description: Fraction of instances where the the output was terminated because of the stop sequences.
+ - name: finish_reason_endoftext
+ display_name: finish b/c endoftext
+ description: Fraction of instances where the the output was terminated because the end of text token was generated.
+ - name: finish_reason_unknown
+ display_name: finish b/c unknown
+ description: Fraction of instances where the the output was terminated for unknown reasons.
+ - name: num_completions
+ display_name: '# completions'
+ description: Number of completions.
+ - name: predicted_index
+ display_name: Predicted index
+ description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
+ - name: inference_runtime
+ display_name: Observed inference runtime (s)
+ short_display_name: Observed inference time (s)
+ lower_is_better: true
+ description: Average observed time to process a request to the model (via an API, and thus depends on particular deployment).
+
+ # Accuracy metrics:
+ - name: exact_match
+ display_name: Exact match
+ short_display_name: EM
+ description: Fraction of instances that the predicted output matches a correct reference exactly.
+ lower_is_better: false
+ - name: quasi_exact_match
+ display_name: Quasi-exact match
+ short_display_name: EM
+ description: Fraction of instances that the predicted output matches a correct reference up to light processing.
+ lower_is_better: false
+ - name: prefix_exact_match
+ display_name: Prefix exact match
+ short_display_name: PEM
+ description: Fraction of instances that the predicted output matches the prefix of a correct reference exactly.
+ lower_is_better: false
+ - name: quasi_prefix_exact_match
+ # TODO: should call this prefix_quasi_exact_match
+ display_name: Prefix quasi-exact match
+ short_display_name: PEM
+ description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing.
+ lower_is_better: false
+
+ ############################################################
+ perturbations: []
+
+ ############################################################
+ metric_groups:
+ - name: accuracy
+ display_name: Accuracy
+ aggregation_strategies:
+ - mean
+ metrics:
+ - name: ${main_name}
+ split: ${main_split}
+
+ - name: efficiency
+ display_name: Efficiency
+ aggregation_strategies:
+ - mean
+ metrics:
+ - name: inference_runtime
+ split: ${main_split}
+
+ - name: general_information
+ display_name: General information
+ hide_win_rates: true
+ metrics:
+ - name: num_instances
+ split: ${main_split}
+ - name: num_train_instances
+ split: ${main_split}
+ - name: prompt_truncated
+ split: ${main_split}
+ - name: num_prompt_tokens
+ split: ${main_split}
+ - name: num_output_tokens
+ split: ${main_split}
+
+ ############################################################
+ run_groups:
+ - name: arabic_scenarios
+ display_name: Arabic Scenarios
+ description: Arabic Scenarios
+ category: All scenarios
+ subgroups:
+ - mmmlu
+ - arabic_mmlu
+ - alghafa
+ - exams_multilingual
+ - aratrust
+
+ - name: mmmlu
+ display_name: Multilingual MMLU (Arabic)
+ description: Multilingual MMLU (Arabic)
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: exact_match
+ main_split: test
+ taxonomy:
+ task: multiple-choice question answering
+ what: math, science, history, etc.
+ who: various online sources
+ when: before 2021
+ language: Arabic
+
+ - name: arabic_mmlu
+ display_name: Arabic MMLU
+ description: Arabic MMLU
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: exact_match
+ main_split: test
+ taxonomy:
+ task: "question answering"
+ what: "academic questions across various disciplines"
+ who: "academic exams writers and takers"
+ when: "before 2024"
+ language: Arabic
+
+ - name: alghafa
+ display_name: AlGhafa
+ description: AlGhafa
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: exact_match
+ main_split: test
+ taxonomy:
+ task: "multiple choice question answering"
+ what: Various
+ who: Various
+ when: "before 2023"
+ language: Arabic
+
+ - name: exams_multilingual
+ display_name: EXAMS (Arabic)
+ description: EXAMS (Arabic)
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: exact_match
+ main_split: test
+ taxonomy:
+ task: "multiple choice question answering"
+ what: High school examinations
+ who: High school examinations writers and test-takers
+ when: before 2020
+ language: Arabic
+
+ - name: aratrust
+ display_name: AraTrust
+ description: AraTrust
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: exact_match
+ main_split: test
+ taxonomy:
+ task: "question answering"
+ what: "academic questions across various disciplines"
+ who: "academic exams writers and takers"
+ when: "before 2024"
+ language: Arabic
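
The new schema_arabic.yaml follows the same layout as HELM's other static schema files: a flat metrics list, an (empty) perturbations list, metric_groups that reference metrics by name, and run_groups that bind each scenario to metric groups through an environment block. The sketch below walks that structure with PyYAML purely for illustration; HELM's own loader lives in helm/benchmark/presentation/schema.py, and the sketch assumes PyYAML is installed and the path is relative to the package root.

# Illustrative only: inspect the run groups declared in the new Arabic schema.
# HELM has its own schema loader (helm/benchmark/presentation/schema.py); this
# sketch just uses PyYAML to walk the file added in 0.5.7.
import yaml

with open("helm/benchmark/static/schema_arabic.yaml") as f:
    schema = yaml.safe_load(f)

# Top-level sections, as seen in the diff: metrics, perturbations, metric_groups, run_groups.
for group in schema["run_groups"]:
    if "subgroups" in group:
        # The parent group "arabic_scenarios" only lists its subgroups.
        print(f"{group['name']}: subgroups = {', '.join(group['subgroups'])}")
    else:
        # Leaf groups bind metric groups and declare main_name/main_split in their environment.
        env = group.get("environment", {})
        print(f"{group['name']}: main metric = {env.get('main_name')}, split = {env.get('main_split')}")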
helm/benchmark/static/schema_classic.yaml
@@ -1683,23 +1683,6 @@ run_groups:
  when: n/a
  language: synthetic
 
- - name: numeracy
- display_name: Numerical reasoning
- description: Scenario introduced in this work to test numerical reasoning via symbolic regression.
- metric_groups:
- - accuracy
- - efficiency
- - general_information
- environment:
- main_name: absolute_value_difference
- main_split: test
- taxonomy:
- task: next-word prediction
- what: Dyck formal language
- who: n/a
- when: n/a
- language: synthetic
-
  - name: synthetic_reasoning
  display_name: Synthetic reasoning (abstract symbols)
  description: Synthetic reasoning tasks defined using abstract symbols based on LIME [(Wu et al., 2021)](https://proceedings.mlr.press/v139/wu21c.html).
helm/benchmark/static/schema_long_context.yaml
@@ -195,6 +195,7 @@ run_groups:
  - ruler_squad
  - infinite_bench_en_sum
  - infinite_bench_en_qa
+ - infinite_bench_en_mc
  - openai_mrcr
 
  - name: ruler_hotpotqa
@@ -234,7 +235,7 @@ run_groups:
 
  - name: infinite_bench_en_qa
  display_name: ∞Bench En.QA
- description: ∞Bench En.QA is a question answering task that requires locating and processing information within a novel, performing reasoning through aggregation or filtering to derive answers. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))
+ description: ∞Bench En.QA is a open-ended question answering task that requires locating and processing information within a novel, performing reasoning through aggregation or filtering to derive answers. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))
  metric_groups:
  - accuracy
  - general_information
@@ -249,6 +250,23 @@ run_groups:
  when: Before 2024
  language: English
 
+ - name: infinite_bench_en_mc
+ display_name: ∞Bench En.MC
+ description: ∞Bench En.MC is a multiple-choice question answering task that requires locating and processing information within a novel, performing reasoning through aggregation or filtering to derive answers. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))
+ metric_groups:
+ - accuracy
+ - general_information
+ - annotation_metrics
+ environment:
+ main_name: exact_match
+ main_split: test
+ taxonomy:
+ task: multiple-choice question answering
+ what: Novels
+ who: Novel authors
+ when: Before 2024
+ language: English
+
  - name: infinite_bench_en_sum
  display_name: ∞Bench En.Sum
  description: ∞Bench En.Sum is a summarization task that requires generating a concise summary of a novel. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))
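
Both the Arabic and long-context schemas use ${main_name} and ${main_split} placeholders inside metric_groups; each run group's environment block supplies the concrete values (for the new infinite_bench_en_mc group, exact_match and test). The sketch below illustrates that substitution with Python's string.Template; this is an assumed stand-in for the mechanism, not how HELM's summarizer (helm/benchmark/presentation/summarize.py) actually implements it.

# Illustrative only: how the ${main_name}/${main_split} placeholders in a metric
# group resolve against a run group's environment block. string.Template is used
# here only to demonstrate the placeholder convention seen in the schema YAML.
from string import Template

# Environment of the new infinite_bench_en_mc run group, taken from the diff above.
environment = {"main_name": "exact_match", "main_split": "test"}

# The "accuracy" metric group references the placeholders rather than a concrete metric.
accuracy_metrics = [{"name": "${main_name}", "split": "${main_split}"}]

resolved = [
    {key: Template(value).substitute(environment) for key, value in metric.items()}
    for metric in accuracy_metrics
]
print(resolved)  # [{'name': 'exact_match', 'split': 'test'}]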