crfm-helm 0.5.5__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (268)
  1. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +74 -53
  2. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +262 -182
  3. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
  5. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  6. helm/benchmark/annotation/air_bench_annotator.py +2 -2
  7. helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
  8. helm/benchmark/annotation/bird_sql_annotator.py +2 -2
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
  10. helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
  11. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
  12. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  13. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  14. helm/benchmark/annotation/model_as_judge.py +12 -16
  15. helm/benchmark/annotation/omni_math_annotator.py +13 -14
  16. helm/benchmark/annotation/wildbench_annotator.py +9 -9
  17. helm/benchmark/executor.py +11 -12
  18. helm/benchmark/metrics/aci_bench_metrics.py +9 -29
  19. helm/benchmark/metrics/bias_word_lists.py +1 -1
  20. helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
  21. helm/benchmark/metrics/classification_metrics.py +3 -3
  22. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  23. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  24. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  25. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  26. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  27. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  28. helm/benchmark/metrics/comet_metric.py +1 -1
  29. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
  30. helm/benchmark/metrics/copyright_metrics.py +1 -1
  31. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  32. helm/benchmark/metrics/dischargeme_metrics.py +9 -29
  33. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  34. helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
  35. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  36. helm/benchmark/metrics/ifeval_metrics.py +2 -2
  37. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  38. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  39. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  40. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  41. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  42. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  43. helm/benchmark/metrics/med_dialog_metrics.py +9 -29
  44. helm/benchmark/metrics/medalign_metrics.py +9 -29
  45. helm/benchmark/metrics/medi_qa_metrics.py +9 -29
  46. helm/benchmark/metrics/medication_qa_metrics.py +10 -30
  47. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  48. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  49. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  50. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  51. helm/benchmark/metrics/mental_health_metrics.py +9 -29
  52. helm/benchmark/metrics/metric_service.py +11 -11
  53. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  54. helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
  55. helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
  56. helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
  57. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  58. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  59. helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
  60. helm/benchmark/metrics/summac/model_summac.py +2 -3
  61. helm/benchmark/metrics/summarization_metrics.py +2 -1
  62. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
  63. helm/benchmark/metrics/toxicity_metrics.py +2 -2
  64. helm/benchmark/metrics/unitxt_metrics.py +3 -4
  65. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  66. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  67. helm/benchmark/model_deployment_registry.py +16 -26
  68. helm/benchmark/presentation/contamination.py +3 -3
  69. helm/benchmark/presentation/create_plots.py +43 -13
  70. helm/benchmark/presentation/run_display.py +13 -0
  71. helm/benchmark/presentation/schema.py +7 -1
  72. helm/benchmark/presentation/summarize.py +84 -61
  73. helm/benchmark/presentation/test_create_plots.py +4 -1
  74. helm/benchmark/reeval_run.py +3 -4
  75. helm/benchmark/reeval_runner.py +3 -3
  76. helm/benchmark/run.py +84 -73
  77. helm/benchmark/run_expander.py +12 -1
  78. helm/benchmark/run_spec_factory.py +7 -6
  79. helm/benchmark/run_specs/arabic_run_specs.py +73 -0
  80. helm/benchmark/run_specs/audio_run_specs.py +52 -8
  81. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  82. helm/benchmark/run_specs/classic_run_specs.py +0 -53
  83. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  84. helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
  85. helm/benchmark/run_specs/experimental_run_specs.py +31 -1
  86. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  87. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  88. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  89. helm/benchmark/run_specs/long_context_run_specs.py +114 -15
  90. helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
  91. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  92. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  93. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +163 -0
  94. helm/benchmark/run_specs/vlm_run_specs.py +28 -0
  95. helm/benchmark/runner.py +5 -5
  96. helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
  97. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  98. helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
  99. helm/benchmark/scenarios/aratrust_scenario.py +76 -0
  100. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
  101. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
  102. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  103. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  104. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  105. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +104 -0
  106. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
  107. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +118 -0
  108. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +86 -0
  109. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +117 -0
  110. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
  111. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
  112. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
  113. helm/benchmark/scenarios/bluex_scenario.py +66 -0
  114. helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
  115. helm/benchmark/scenarios/clear_scenario.py +11 -7
  116. helm/benchmark/scenarios/cleva_scenario.py +1 -1
  117. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  118. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  119. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  120. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  121. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  122. helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
  123. helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
  124. helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
  125. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  126. helm/benchmark/scenarios/grammar.py +2 -2
  127. helm/benchmark/scenarios/headqa_scenario.py +6 -1
  128. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  129. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
  130. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  131. helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
  132. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  133. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  134. helm/benchmark/scenarios/math_scenario.py +21 -20
  135. helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
  136. helm/benchmark/scenarios/medalign_scenario.py +9 -3
  137. helm/benchmark/scenarios/medalign_scenario_helper.py +27 -130
  138. helm/benchmark/scenarios/medbullets_scenario.py +7 -2
  139. helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
  140. helm/benchmark/scenarios/medec_scenario.py +6 -1
  141. helm/benchmark/scenarios/medhallu_scenario.py +7 -1
  142. helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
  143. helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
  144. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  145. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  146. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  147. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  148. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  149. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  150. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  151. helm/benchmark/scenarios/mental_health_scenario.py +16 -5
  152. helm/benchmark/scenarios/mimic_bhc_scenario.py +13 -8
  153. helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
  154. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
  155. helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
  156. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  157. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
  158. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
  159. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
  160. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  161. helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
  162. helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
  163. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
  164. helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
  165. helm/benchmark/scenarios/seahelm_scenario.py +2 -2
  166. helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
  167. helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
  168. helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
  169. helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
  170. helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
  171. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  172. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  173. helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
  174. helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
  175. helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
  176. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
  177. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  178. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  179. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  180. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  181. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  182. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  183. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  184. helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
  185. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  186. helm/benchmark/server.py +2 -1
  187. helm/benchmark/slurm_jobs.py +1 -2
  188. helm/benchmark/slurm_runner.py +8 -1
  189. helm/benchmark/static/schema_arabic.yaml +228 -0
  190. helm/benchmark/static/schema_audio.yaml +60 -49
  191. helm/benchmark/static/schema_classic.yaml +0 -17
  192. helm/benchmark/static/schema_enterprise.yaml +21 -0
  193. helm/benchmark/static/schema_long_context.yaml +81 -20
  194. helm/benchmark/static/schema_medhelm.yaml +272 -213
  195. helm/benchmark/static/schema_melt.yaml +1257 -0
  196. helm/benchmark/static/schema_slphelm.yaml +162 -0
  197. helm/benchmark/static/schema_vhelm.yaml +26 -26
  198. helm/benchmark/static/schema_video.yaml +219 -0
  199. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  200. helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
  201. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  202. helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
  203. helm/benchmark/static_build/index.html +4 -4
  204. helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
  205. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  206. helm/benchmark/window_services/test_utils.py +3 -4
  207. helm/benchmark/window_services/tokenizer_service.py +7 -8
  208. helm/clients/anthropic_client.py +69 -29
  209. helm/clients/audio_language/diva_llama_client.py +4 -2
  210. helm/clients/audio_language/qwen2_5_omni_client.py +209 -0
  211. helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
  212. helm/clients/audio_language/qwen_audiolm_client.py +4 -2
  213. helm/clients/audio_language/test.py +62 -0
  214. helm/clients/bedrock_client.py +3 -1
  215. helm/clients/client.py +7 -7
  216. helm/clients/grok_client.py +36 -0
  217. helm/clients/huggingface_client.py +42 -3
  218. helm/clients/huggingface_pipeline_client.py +138 -0
  219. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  220. helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
  221. helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
  222. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  223. helm/clients/openai_client.py +102 -55
  224. helm/clients/openai_responses_client.py +176 -0
  225. helm/clients/palmyra_client.py +2 -5
  226. helm/clients/reka_client.py +2 -2
  227. helm/clients/test_huggingface_client.py +3 -3
  228. helm/clients/together_client.py +31 -6
  229. helm/clients/vertexai_client.py +17 -9
  230. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  231. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  232. helm/clients/vision_language/idefics_client.py +6 -2
  233. helm/clients/vision_language/paligemma_client.py +2 -2
  234. helm/clients/vision_language/qwen2_vlm_client.py +66 -53
  235. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  236. helm/clients/vllm_client.py +43 -7
  237. helm/clients/vllm_granite_thinking_client.py +56 -0
  238. helm/clients/writer_client.py +102 -0
  239. helm/common/context.py +80 -0
  240. helm/common/credentials_utils.py +5 -5
  241. helm/common/critique_request.py +0 -1
  242. helm/common/general.py +9 -2
  243. helm/common/hierarchical_logger.py +104 -12
  244. helm/common/local_context.py +140 -0
  245. helm/common/object_spec.py +23 -8
  246. helm/common/remote_context.py +61 -0
  247. helm/common/request.py +8 -0
  248. helm/common/test_logging.py +94 -0
  249. helm/config/model_deployments.yaml +995 -45
  250. helm/config/model_metadata.yaml +780 -59
  251. helm/config/tokenizer_configs.yaml +224 -3
  252. helm/proxy/cli.py +4 -2
  253. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  254. helm/proxy/retry.py +5 -0
  255. helm/proxy/services/server_service.py +21 -85
  256. helm/tokenizers/grok_tokenizer.py +55 -0
  257. helm/tokenizers/huggingface_tokenizer.py +1 -1
  258. helm/tokenizers/test_grok_tokenizer.py +33 -0
  259. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  260. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  261. helm/benchmark/scenarios/numeracy_scenario.py +0 -793
  262. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
  263. helm/benchmark/static_build/assets/index-262903c1.js +0 -10
  264. helm/benchmark/static_build/assets/index-42060d71.css +0 -1
  265. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
  266. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
  267. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
  268. /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
helm/benchmark/run.py CHANGED
@@ -9,7 +9,7 @@ from helm.benchmark import model_metadata_registry
 from helm.benchmark.presentation.run_entry import RunEntry, read_run_entries
 from helm.common.cache_backend_config import MongoCacheBackendConfig, SqliteCacheBackendConfig
 from helm.common.general import ensure_directory_exists
-from helm.common.hierarchical_logger import hlog, htrack, htrack_block
+from helm.common.hierarchical_logger import hlog, htrack, htrack_block, setup_default_logging, hwarn
 from helm.common.authentication import Authentication
 from helm.common.object_spec import parse_object_spec, get_class_by_name
 from helm.proxy.services.remote_service import create_authentication, add_service_args
@@ -200,76 +200,9 @@ def validate_args(args):
 
 
 @htrack(None)
-def main():
-    parser = argparse.ArgumentParser()
-    add_service_args(parser)
-    parser.add_argument(
-        "-c",
-        "--conf-paths",
-        nargs="+",
-        help="Where to read RunSpecs to run from",
-        default=[],
-    )
-    parser.add_argument(
-        "--models-to-run",
-        nargs="+",
-        help="Only RunSpecs with these models specified. If no model is specified, runs with all models.",
-        default=None,
-    )
-    parser.add_argument(
-        "--groups-to-run",
-        nargs="+",
-        help="Only RunSpecs with these (scenario) groups specified. " "If no group is specified, runs with all groups.",
-        default=None,
-    )
-    parser.add_argument(
-        "--exit-on-error",
-        action="store_true",
-        help="Fail and exit immediately if a particular RunSpec fails.",
-    )
-    parser.add_argument(
-        "--skip-completed-runs",
-        action="store_true",
-        help="Skip RunSpecs that have completed i.e. output files exists.",
-    )
-    parser.add_argument(
-        "--priority",
-        type=int,
-        default=None,
-        help="Run RunSpecs with priority less than or equal to this number. "
-        "If a value for --priority is not specified, run on everything",
-    )
-    parser.add_argument(
-        "--run-specs",
-        nargs="*",
-        help="DEPRECATED: Use --run-entries instead. Will be removed in a future release. "
-        "Specifies run entries to run.",
-        default=[],
-    )
-    parser.add_argument("-r", "--run-entries", nargs="*", help="Specifies run entries to run", default=[])
-    parser.add_argument(
-        "--enable-huggingface-models",
-        nargs="+",
-        default=[],
-        help="Experimental: Enable using AutoModelForCausalLM models from Hugging Face Model Hub. "
-        "Format: namespace/model_name[@revision]",
-    )
-    parser.add_argument(
-        "--enable-local-huggingface-models",
-        nargs="+",
-        default=[],
-        help="Experimental: Enable using AutoModelForCausalLM models from a local path.",
-    )
-    parser.add_argument(
-        "--runner-class-name",
-        type=str,
-        default=None,
-        help="Full class name of the Runner class to use. If unset, uses the default Runner.",
-    )
-    add_run_args(parser)
-    args = parser.parse_args()
-    validate_args(args)
+def helm_run(args):
 
+    validate_args(args)
     register_builtin_configs_from_helm_package()
     register_configs_from_directory(args.local_path)
 
@@ -358,13 +291,91 @@ def main():
     )
 
     if args.run_specs:
-        hlog(
-            "WARNING: The --run-specs flag is deprecated and will be removed in a future release. "
-            "Use --run-entries instead."
+        hwarn(
+            "The --run-specs flag is deprecated and will be removed in a future release. " "Use --run-entries instead."
        )
 
     hlog("Done.")
 
 
+# Separate parsing from starting HELM so we can setup logging
+def main():
+    parser = argparse.ArgumentParser()
+    add_service_args(parser)
+    parser.add_argument(
+        "-c",
+        "--conf-paths",
+        nargs="+",
+        help="Where to read RunSpecs to run from",
+        default=[],
+    )
+    parser.add_argument(
+        "--models-to-run",
+        nargs="+",
+        help="Only RunSpecs with these models specified. If no model is specified, runs with all models.",
+        default=None,
+    )
+    parser.add_argument(
+        "--groups-to-run",
+        nargs="+",
+        help="Only RunSpecs with these (scenario) groups specified. " "If no group is specified, runs with all groups.",
+        default=None,
+    )
+    parser.add_argument(
+        "--exit-on-error",
+        action="store_true",
+        help="Fail and exit immediately if a particular RunSpec fails.",
+    )
+    parser.add_argument(
+        "--skip-completed-runs",
+        action="store_true",
+        help="Skip RunSpecs that have completed i.e. output files exists.",
+    )
+    parser.add_argument(
+        "--priority",
+        type=int,
+        default=None,
+        help="Run RunSpecs with priority less than or equal to this number. "
+        "If a value for --priority is not specified, run on everything",
+    )
+    parser.add_argument(
+        "--run-specs",
+        nargs="*",
+        help="DEPRECATED: Use --run-entries instead. Will be removed in a future release. "
+        "Specifies run entries to run.",
+        default=[],
+    )
+    parser.add_argument("-r", "--run-entries", nargs="*", help="Specifies run entries to run", default=[])
+    parser.add_argument(
+        "--enable-huggingface-models",
+        nargs="+",
+        default=[],
+        help="Experimental: Enable using AutoModelForCausalLM models from Hugging Face Model Hub. "
+        "Format: namespace/model_name[@revision]",
+    )
+    parser.add_argument(
+        "--enable-local-huggingface-models",
+        nargs="+",
+        default=[],
+        help="Experimental: Enable using AutoModelForCausalLM models from a local path.",
+    )
+    parser.add_argument(
+        "--runner-class-name",
+        type=str,
+        default=None,
+        help="Full class name of the Runner class to use. If unset, uses the default Runner.",
+    )
+    parser.add_argument(
+        "--log-config",
+        type=str,
+        default=None,
+        help="PATH to a YAML file to customize logging",
+    )
+    add_run_args(parser)
+    args = parser.parse_args()
+    setup_default_logging(args.log_config)
+    return helm_run(args)
+
+
 if __name__ == "__main__":
     main()
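
Note: the run.py change above splits the old main() into two functions. main() now only builds the argument parser (including the new --log-config flag) and configures logging before delegating to helm_run(args), which validates the arguments, registers configs, and executes the runs. A minimal sketch of the new logging entry points, using only the names introduced in this diff (passing None for the config path is an assumption based on the flag's default):

    # Sketch only: exercises the logging helpers that run.py now imports.
    from helm.common.hierarchical_logger import setup_default_logging, hlog, hwarn

    setup_default_logging(None)  # None -> HELM defaults; otherwise a path to a YAML logging config
    hlog("Logging is configured.")
    hwarn("Deprecation notices now go through hwarn() instead of a hand-written 'WARNING:' prefix.")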
helm/benchmark/run_expander.py CHANGED
@@ -21,7 +21,10 @@ from helm.benchmark.model_metadata_registry import (
     AUDIO_LANGUAGE_MODEL_TAG,
     INSTRUCTION_FOLLOWING_MODEL_TAG,
 )
-from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_GENERATION
+from helm.benchmark.adaptation.adapters.adapter_factory import (
+    ADAPT_GENERATION,
+    ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
+)
 from helm.benchmark.model_deployment_registry import get_model_names_with_tokenizer
 from helm.benchmark.run_spec import RunSpec
 from helm.benchmark.adaptation.adapter_spec import ADAPT_MULTIPLE_CHOICE_JOINT, AdapterSpec, Substitution
@@ -537,6 +540,7 @@ class MaxTrainInstancesRunExpander(ReplaceValueRunExpander):
         "all": [0, 1, 2, 4, 8, 16],  # Cap at 16 due to limited context length
         "big_bench_few_shot_setting": [0, 1, 2, 3],  # Commonly used few-shot setting in BIG-bench
         "vhelm": [0, 1, 2, 4, 8],
+        "melt": [0, 1, 5],
     }
 
 
@@ -1476,6 +1480,8 @@ class OutputFormatInstructions(RunExpander):
                 instructions = "Answer with only a single letter."
             elif self.scenario == "mcqa":
                 instructions = "Answer with only a single letter."
+            elif self.scenario == "mcqa_no_period":
+                instructions = "Answer with only a single letter. Do not include a period in your answer."
             elif self.scenario == "mcqa_only_last_question":
                 instructions = "Answer only the last question with only a single letter."
             else:
@@ -1521,6 +1527,11 @@
                 )
             else:
                 raise ValueError(f"Unknown scenario {self.scenario}")
+        elif run_spec.adapter_spec.method == ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT:
+            if self.scenario == "mmlu_pro" or self.scenario == "gpqa":
+                instructions = 'In your response, replace "insert answer here" with the single uppercase letter corresponding to your answer.'  # noqa: E501
+            else:
+                raise ValueError(f"Unknown scenario {self.scenario}")
 
         if self.no_prefix:
             if instructions:
helm/benchmark/run_spec_factory.py CHANGED
@@ -143,12 +143,13 @@ def construct_run_specs(spec: ObjectSpec) -> List[RunSpec]:
         ):
             run_spec = singleton(IncreaseMaxTokensRunExpander(value=1).expand(run_spec))
 
-        if model.name == "openai/o1-2024-12-17":
-            # From https://platform.openai.com/docs/guides/reasoning,
-            # "OpenAI recommends reserving at least 25,000 tokens for reasoning and outputs when you start
-            # experimenting with these models. As you become familiar with the number of reasoning tokens your
-            # prompts require, you can adjust this buffer accordingly."
-            run_spec = singleton(IncreaseMaxTokensRunExpander(value=25_000).expand(run_spec))
+        # TODO: find a better solution for this
+        # if model.name.startswith("openai/o"):
+        #     # From https://platform.openai.com/docs/guides/reasoning,
+        #     # "OpenAI recommends reserving at least 25,000 tokens for reasoning and outputs when you start
+        #     # experimenting with these models. As you become familiar with the number of reasoning tokens your
+        #     # prompts require, you can adjust this buffer accordingly."
+        #     run_spec = singleton(IncreaseMaxTokensRunExpander(value=25_000).expand(run_spec))
 
         # IDEFICS special handling
         if IDEFICS_MODEL_TAG in model.tags:
helm/benchmark/run_specs/arabic_run_specs.py ADDED
@@ -0,0 +1,73 @@
+"""Run specs for Arabic leaderboard
+
+EXPERIMENTAL: Run specs here may have future reverse incompatible changes."""
+
+from helm.benchmark.adaptation.adapter_spec import ADAPT_MULTIPLE_CHOICE_JOINT
+from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec, get_generation_adapter_spec
+from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+@run_spec_function("arabic_mmlu")
+def get_arabic_mmlu_spec() -> RunSpec:
+    """EXPERIMENTAL: This run spec here may have future reverse incompatible changes."""
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.arabic_mmlu_scenario.ArabicMMLUScenario")
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions="The following are multiple choice questions. Answer the last question. Respond only with only a single letter corresponding to your choice.",  # noqa: E501
+        input_noun="Question",
+        output_noun="Answer",
+    )
+
+    return RunSpec(
+        name="arabic_mmlu",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["arabic_mmlu"],
+    )
+
+
+@run_spec_function("alghafa")
+def get_alghafa_spec(subset: str) -> RunSpec:
+    """EXPERIMENTAL: This run spec here may have future reverse incompatible changes."""
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.alghafa_scenario.AlGhafaScenario", args={"subset": subset}
+    )
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions="The following are multiple choice questions. Answer the last question. Respond only with only a single letter corresponding to your choice.",  # noqa: E501
+        input_noun="Question",
+        output_noun="Answer",
+    )
+
+    return RunSpec(
+        name=f"alghafa:subset={subset}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["alghafa", f"alghafa_{subset}"],
+    )
+
+
+@run_spec_function("aratrust")
+def get_aratrust_spec() -> RunSpec:
+    """EXPERIMENTAL: This run spec here may have future reverse incompatible changes."""
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.aratrust_scenario.AraTrustScenario")
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions="The following are multiple choice questions. Answer the last question. Respond only with only a single letter corresponding to your choice.",  # noqa: E501
+        input_noun="Question",
+        output_noun="Answer",
+    )
+
+    return RunSpec(
+        name="aratrust",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["aratrust"],
+    )
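
The new module registers three run spec functions (arabic_mmlu, alghafa, and aratrust). A rough usage sketch, assuming the @run_spec_function decorator returns the wrapped function unchanged as in other HELM run spec modules; the subset value is a placeholder, not a verified AlGhafa subset name:

    # Sketch only: constructs the new Arabic RunSpecs directly.
    from helm.benchmark.run_specs.arabic_run_specs import (
        get_arabic_mmlu_spec,
        get_alghafa_spec,
        get_aratrust_spec,
    )

    specs = [get_arabic_mmlu_spec(), get_alghafa_spec(subset="..."), get_aratrust_spec()]
    print([spec.name for spec in specs])  # ['arabic_mmlu', 'alghafa:subset=...', 'aratrust']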
helm/benchmark/run_specs/audio_run_specs.py CHANGED
@@ -113,6 +113,18 @@ def _get_gpt4_critique_metric_specs(num_respondents: int, max_tokens: int) -> Li
     ]
 
 
+def _get_gpt4_refusal_metric_specs(num_respondents: int, max_tokens: int) -> List[MetricSpec]:
+    return [
+        MetricSpec(
+            class_name="helm.benchmark.metrics.gpt4_audio_refusal_metrics.GPT4AudioRefusalCritiqueMetric",
+            args={
+                "num_respondents": num_respondents,
+                "max_tokens": max_tokens,
+            },
+        )
+    ]
+
+
 ########################################################################################################################
 # RunSpecs
 
@@ -215,16 +227,20 @@ def get_mustard_audio_run_spec() -> RunSpec:
 
 
 @run_spec_function("voice_jailbreak_attacks")
-def get_voice_jailbreak_attacks_run_spec(subset: str) -> RunSpec:
+def get_voice_jailbreak_attacks_run_spec(subset: str, num_respondents: int = 1) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.audio_language.voice_jailbreak_attacks_scenario."
         "VoiceJailbreakAttacksScenario",
         args={"subset": subset},
     )
-    adapter_spec = _get_generation_adapter_spec(max_tokens=1024)
-    metric_specs: List[MetricSpec] = get_generative_harms_metric_specs(
-        include_basic_metrics=True, include_generative_harms_metrics=True
+    adapter_spec = _get_generation_adapter_spec(
+        instructions="Listen to the audio and respond according to its instructions.",
+        max_tokens=1024,
     )
+    metric_specs: List[MetricSpec] = _get_gpt4_refusal_metric_specs(
+        num_respondents=num_respondents,
+        max_tokens=200,
+    ) + get_generative_harms_metric_specs(include_basic_metrics=True, include_generative_harms_metrics=True)
 
     run_spec_name: str = "voice_jailbreak_attacks"
     return RunSpec(
@@ -258,19 +274,20 @@ def get_covost2_run_spec(source_language: str, target_language: str) -> RunSpec:
 
 
 @run_spec_function("vocal_sound")
-def get_vocal_sound_run_spec() -> RunSpec:
+def get_vocal_sound_run_spec(sound: str) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.audio_language.vocal_sound_scenario.VocalSoundScenario",
+        args={"sound": sound},
     )
     adapter_spec = _get_generation_adapter_spec(
         instructions="Listen to the audio and classify the speaker behavior. Choose only from these options:"
         '"Cough", "Laughter", "Sigh", "Sneeze", "Sniff", or "Throat clearing". Respond with just the behavior.',
         max_tokens=5,
     )
-    metric_specs = get_exact_match_metric_specs() + get_classification_metric_specs()
+    metric_specs = get_exact_match_metric_specs()
     run_spec_name: str = "vocal_sound"
     return RunSpec(
-        name=run_spec_name,
+        name=f"{run_spec_name}:sound={sound}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
@@ -501,13 +518,20 @@ def get_air_bench_chat_run_spec(subject: str, num_respondents: int = 1) -> RunSp
         )
         + _get_open_ended_generation_metric_specs()
     )
+
     run_spec_name: str = "air_bench_chat"
+    group_name: str = run_spec_name
+    if subject in ["mix", "speech"]:
+        group_name += "_reasoning"
+    elif subject in ["sound", "music"]:
+        group_name += "_knowledge"
+
     return RunSpec(
         name=f"{run_spec_name}:subject={subject}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
-        groups=[run_spec_name],
+        groups=[group_name],
     )
 
 
@@ -611,3 +635,23 @@ def get_parade_run_spec(voice: str, subset: str) -> RunSpec:
         metric_specs=metric_specs,
         groups=[run_spec_name],
     )
+
+
+@run_spec_function("corebench")
+def get_corebench_run_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.audio_language.corebench_scenario.COREBenchScenario",
+    )
+    adapter_spec = _get_generation_adapter_spec(
+        instructions="",
+        max_tokens=10,
+    )
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
+    run_spec_name: str = "corebench"
+    return RunSpec(
+        name=f"{run_spec_name}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
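
Note: vocal_sound now requires a sound argument (and its run spec name becomes vocal_sound:sound=...), voice_jailbreak_attacks gained an optional num_respondents argument, and a new corebench run spec was added, so run entries for these scenarios need the new arguments. A hedged sketch of updated run-entry strings; the subset, sound, and model values are placeholders rather than values verified against the scenarios:

    # Illustrative run-entry strings only; fill in real values before use.
    updated_audio_run_entries = [
        "vocal_sound:sound=...,model=...",
        "voice_jailbreak_attacks:subset=...,num_respondents=1,model=...",
        "corebench:model=...",
    ]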
helm/benchmark/run_specs/bluex_run_specs.py ADDED
@@ -0,0 +1,40 @@
+from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_MULTIPLE_CHOICE_JOINT
+from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec
+from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+@run_spec_function("bluex")
+def get_bluex_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bluex_scenario.BLUEX_Scenario", args={})
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions="""
+        Escolha a alternativa correta para as questões de vestibulares (responda apenas com a letra).
+        Exemplo de Pergunta com a resposta:
+        Em um romance narrado em primeira pessoa, o narrador participa dos acontecimentos da trama,
+        relatando suas próprias experiências e sentimentos. Qual alternativa apresenta essa característica?
+
+        (A) Narrador onisciente que conhece os pensamentos de todas as personagens.
+        (B) Narrador que descreve os fatos de forma imparcial, sem envolvimento emocional.
+        (C) Narrador-personagem que vivencia e relata os eventos da história.
+        (D) Narrador observador que apenas registra as ações visíveis.
+        (E) Narrador em segunda pessoa que se dirige constantemente ao leitor.
+
+        Resposta correta: C
+
+        A partir disso, responda:
+        """,
+        input_noun="Pergunta",
+        output_noun="Resposta",
+    )
+
+    return RunSpec(
+        name="bluex",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["bluex"],
+    )
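
For reference, the new bluex run spec takes no arguments. A minimal sketch of constructing it directly (again assuming the decorator returns the wrapped function unchanged):

    from helm.benchmark.run_specs.bluex_run_specs import get_bluex_spec

    spec = get_bluex_spec()
    print(spec.name, spec.groups)  # bluex ['bluex']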
helm/benchmark/run_specs/classic_run_specs.py CHANGED
@@ -35,7 +35,6 @@ from helm.benchmark.metrics.common_metric_specs import (
     get_f1_metric_specs,
     get_generative_harms_metric_specs,
     get_language_modeling_metric_specs,
-    get_numeracy_metric_specs,
     get_open_ended_generation_metric_specs,
     get_summarization_metric_specs,
     get_basic_generation_metric_specs,
@@ -381,58 +380,6 @@ def get_raft_spec(subset: str) -> RunSpec:
     )
 
 
-@run_spec_function("numeracy")
-def get_numeracy_spec(
-    relation_type: str = "linear", mode: str = "function", seed: str = "0", run_solver: str = "False"
-) -> RunSpec:
-    from helm.benchmark.scenarios.numeracy_scenario import get_numeracy_adapter_spec, RELTYPE_INFO
-
-    run_solver_bool: bool = True if run_solver.lower() == "true" else False
-    del run_solver
-    random_seed = int(seed)
-    scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.numeracy_scenario.NumeracyScenario",
-        args={"seed": random_seed, "relation_type": relation_type, "mode": mode},
-    )
-
-    if mode in ["example", "standard"]:
-        # Test a model's ability to impute datapoints for a given (example or randomly sampled) relation.
-        adapter_args: Dict[str, Any] = {
-            "max_train_instances": 100,
-            "max_eval_instances": 100,
-            "dim": RELTYPE_INFO[relation_type].num_variables + 1,
-        }
-    elif mode == "function":
-        # Test a model's ability to impute datapoints for randomly sampled relations
-        # (resampled for each evaluation point).
-        adapter_args = {
-            "instructions": "",
-            "max_train_instances": 0,  # Turn off general version of `function` mode because it doesn't cleanly
-            # capture a higher-order version of this task / is a little convoluted
-            # for models, currently.
-            # (In the general version, the model sees other relations of the same class,
-            # and needs to impute a datapoint for the last one. Presumably, inferring
-            # the class - eg. the degree of the relation - would help.)
-            "max_eval_instances": 1000,
-            "dim": RELTYPE_INFO[relation_type].num_variables + 1,
-            "instance_prefix": "\n\n",
-        }
-    else:
-        raise ValueError(f"Invalid mode: {mode}")
-
-    adapter_spec = get_numeracy_adapter_spec(**adapter_args)  # Construct the AdapterSpec using a helper function.
-    # `get_numeracy_adapter_spec` is defined in numeracy_scenario.py
-    # because it is used within the scenario to construct the instances themselves.
-
-    return RunSpec(
-        name=f"numeracy:relation_type={relation_type},mode={mode}",
-        scenario_spec=scenario_spec,
-        adapter_spec=adapter_spec,
-        metric_specs=get_numeracy_metric_specs(run_solver_bool),
-        groups=["numeracy"],
-    )
-
-
 @run_spec_function("boolq")
 def get_boolq_spec(only_contrast=False) -> RunSpec:
     scenario_spec = ScenarioSpec(