crfm-helm 0.5.6 → 0.5.10 (py3-none-any.whl)

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of crfm-helm might be problematic.
Files changed (394)
  1. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/METADATA +72 -130
  2. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/RECORD +372 -305
  3. helm/benchmark/adaptation/adapter_spec.py +10 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  8. helm/benchmark/annotation/alrage_annotator.py +90 -0
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  10. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  11. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  12. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  13. helm/benchmark/annotation/medalign_annotator.py +11 -22
  14. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  15. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  16. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  17. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  18. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  19. helm/benchmark/annotation/model_as_judge.py +23 -18
  20. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  21. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  22. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  23. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  24. helm/benchmark/metrics/alrage_metric.py +35 -0
  25. helm/benchmark/metrics/basic_metrics.py +267 -2
  26. helm/benchmark/metrics/bbq_metrics.py +12 -0
  27. helm/benchmark/metrics/classification_metrics.py +19 -1
  28. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  29. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  30. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  31. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  32. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  33. helm/benchmark/metrics/comet_metric.py +1 -1
  34. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  35. helm/benchmark/metrics/copyright_metrics.py +1 -1
  36. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  37. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  38. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  39. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  40. helm/benchmark/metrics/evaluate_reference_metrics.py +312 -1
  41. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  42. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  43. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  44. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  45. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  46. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  47. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  48. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  49. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  50. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  51. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  52. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  53. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  54. helm/benchmark/metrics/medec_metrics.py +25 -2
  55. helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
  56. helm/benchmark/metrics/metric.py +25 -0
  57. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  58. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  59. helm/benchmark/metrics/safety_metrics.py +13 -1
  60. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  61. helm/benchmark/metrics/summac/model_summac.py +3 -3
  62. helm/benchmark/metrics/summarization_metrics.py +129 -1
  63. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  64. helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
  65. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  66. helm/benchmark/model_deployment_registry.py +11 -19
  67. helm/benchmark/presentation/create_plots.py +11 -2
  68. helm/benchmark/presentation/run_display.py +13 -3
  69. helm/benchmark/presentation/run_entry.py +2 -2
  70. helm/benchmark/presentation/schema.py +10 -22
  71. helm/benchmark/presentation/summarize.py +189 -14
  72. helm/benchmark/presentation/taxonomy_info.py +20 -0
  73. helm/benchmark/presentation/test_create_plots.py +4 -1
  74. helm/benchmark/run.py +15 -4
  75. helm/benchmark/run_expander.py +4 -0
  76. helm/benchmark/run_specs/arabic_run_specs.py +197 -0
  77. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  78. helm/benchmark/run_specs/classic_run_specs.py +2 -55
  79. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  80. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  81. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  82. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  83. helm/benchmark/run_specs/long_context_run_specs.py +48 -1
  84. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  85. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  86. helm/benchmark/run_specs/medhelm_run_specs.py +363 -53
  87. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  88. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +11 -13
  89. helm/benchmark/runner.py +7 -0
  90. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  91. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  92. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  93. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  94. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  95. helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
  96. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  97. helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
  98. helm/benchmark/scenarios/aratrust_scenario.py +95 -0
  99. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  100. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  101. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
  102. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
  103. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -53
  104. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -21
  105. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -52
  106. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  107. helm/benchmark/scenarios/banking77_scenario.py +21 -0
  108. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  109. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  110. helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
  111. helm/benchmark/scenarios/bluex_scenario.py +70 -0
  112. helm/benchmark/scenarios/bold_scenario.py +15 -0
  113. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  114. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  115. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  116. helm/benchmark/scenarios/clear_scenario.py +23 -0
  117. helm/benchmark/scenarios/cleva_scenario.py +480 -1
  118. helm/benchmark/scenarios/code_scenario.py +28 -0
  119. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  120. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  121. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  122. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  123. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  124. helm/benchmark/scenarios/commonsense_scenario.py +32 -0
  125. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  126. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  127. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  128. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  129. helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
  130. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  131. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  132. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  133. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  134. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  135. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  136. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  137. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  138. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  139. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  140. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  141. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  142. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  143. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  144. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  145. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  146. helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
  147. helm/benchmark/scenarios/financebench_scenario.py +21 -0
  148. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  149. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  150. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  151. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  152. helm/benchmark/scenarios/gsm_scenario.py +21 -0
  153. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
  154. helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
  155. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  156. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  157. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  158. helm/benchmark/scenarios/ice_scenario.py +21 -1
  159. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  160. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  161. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
  162. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
  163. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
  164. helm/benchmark/scenarios/koala_scenario.py +21 -1
  165. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  166. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  167. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  168. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  169. helm/benchmark/scenarios/legalbench_scenario.py +19 -0
  170. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  171. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  172. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  173. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  174. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  175. helm/benchmark/scenarios/math_scenario.py +54 -20
  176. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  177. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  178. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  179. helm/benchmark/scenarios/med_qa_scenario.py +20 -0
  180. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  181. helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
  182. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  183. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  184. helm/benchmark/scenarios/medec_scenario.py +23 -0
  185. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  186. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  187. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  188. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  189. helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
  190. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  191. helm/benchmark/scenarios/melt_scenarios.py +2 -2
  192. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  193. helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
  194. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  195. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  196. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  197. helm/benchmark/scenarios/mmlu_scenario.py +21 -0
  198. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  199. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  200. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  201. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  202. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  203. helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
  204. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  205. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  206. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  207. helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
  208. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  209. helm/benchmark/scenarios/quac_scenario.py +14 -0
  210. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  211. helm/benchmark/scenarios/raft_scenario.py +15 -0
  212. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  213. helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
  214. helm/benchmark/scenarios/scenario.py +31 -0
  215. helm/benchmark/scenarios/seahelm_scenario.py +350 -2
  216. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  217. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  218. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  219. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  220. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  221. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  222. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  223. helm/benchmark/scenarios/shc_proxy_scenario.py +23 -1
  224. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  225. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  226. helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
  227. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  228. helm/benchmark/scenarios/spider_scenario.py +18 -0
  229. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  230. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  231. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  232. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  233. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  234. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  235. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  236. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  237. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  238. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  239. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  240. helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
  241. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  242. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  243. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  244. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  245. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  246. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  247. helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
  248. helm/benchmark/slurm_jobs.py +1 -2
  249. helm/benchmark/slurm_runner.py +8 -1
  250. helm/benchmark/static/schema_arabic.yaml +271 -0
  251. helm/benchmark/static/schema_classic.yaml +0 -17
  252. helm/benchmark/static/schema_long_context.yaml +17 -18
  253. helm/benchmark/static/schema_medhelm.yaml +36 -0
  254. helm/benchmark/static/schema_slp.yaml +219 -0
  255. helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
  256. helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
  257. helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
  258. helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
  259. helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
  260. helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
  261. helm/benchmark/static_build/index.html +5 -6
  262. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  263. helm/clients/ai21_client.py +2 -0
  264. helm/clients/aleph_alpha_client.py +2 -0
  265. helm/clients/anthropic_client.py +7 -1
  266. helm/clients/audio_language/diva_llama_client.py +2 -0
  267. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  268. helm/clients/audio_language/llama_omni/constants.py +9 -0
  269. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  270. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  271. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  272. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  273. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  274. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  275. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  276. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  277. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  278. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  279. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  280. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  281. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  282. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  283. helm/clients/audio_language/llama_omni/utils.py +202 -0
  284. helm/clients/audio_language/llama_omni_client.py +2 -1
  285. helm/clients/audio_language/qwen2_5_omni_client.py +21 -8
  286. helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
  287. helm/clients/audio_language/qwen_audiolm_client.py +2 -1
  288. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  289. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  290. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  291. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  292. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  293. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  294. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  295. helm/clients/bedrock_client.py +63 -6
  296. helm/clients/cohere_client.py +3 -0
  297. helm/clients/dspy_client.py +135 -0
  298. helm/clients/google_client.py +2 -0
  299. helm/clients/http_model_client.py +2 -0
  300. helm/clients/huggingface_client.py +4 -3
  301. helm/clients/ibm_client.py +3 -1
  302. helm/clients/image_generation/adobe_vision_client.py +2 -0
  303. helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
  304. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  305. helm/clients/image_generation/cogview2_client.py +2 -1
  306. helm/clients/image_generation/dalle2_client.py +2 -0
  307. helm/clients/image_generation/dalle_mini_client.py +2 -1
  308. helm/clients/image_generation/deep_floyd_client.py +2 -0
  309. helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
  310. helm/clients/image_generation/lexica_client.py +2 -0
  311. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  312. helm/clients/image_generation/mindalle_client.py +2 -1
  313. helm/clients/image_generation/together_image_generation_client.py +2 -0
  314. helm/clients/megatron_client.py +2 -0
  315. helm/clients/mistral_client.py +2 -0
  316. helm/clients/moderation_api_client.py +2 -0
  317. helm/clients/openai_client.py +38 -21
  318. helm/clients/openai_responses_client.py +34 -8
  319. helm/clients/openrouter_client.py +31 -0
  320. helm/clients/palmyra_client.py +2 -1
  321. helm/clients/reka_client.py +2 -1
  322. helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
  323. helm/clients/stanfordhealthcare_http_model_client.py +2 -0
  324. helm/clients/test_huggingface_client.py +3 -3
  325. helm/clients/test_openrouter_client.py +69 -0
  326. helm/clients/together_client.py +52 -13
  327. helm/clients/vertexai_client.py +23 -11
  328. helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
  329. helm/clients/vision_language/huggingface_vlm_client.py +2 -0
  330. helm/clients/vision_language/idefics_client.py +2 -1
  331. helm/clients/vision_language/open_flamingo_client.py +2 -1
  332. helm/clients/vision_language/paligemma_client.py +2 -1
  333. helm/clients/vision_language/palmyra_vision_client.py +2 -0
  334. helm/clients/vision_language/qwen2_vlm_client.py +2 -1
  335. helm/clients/vision_language/qwen_vlm_client.py +2 -1
  336. helm/clients/vllm_client.py +43 -7
  337. helm/clients/vllm_granite_thinking_client.py +56 -0
  338. helm/clients/writer_client.py +5 -2
  339. helm/common/critique_request.py +0 -1
  340. helm/common/hierarchical_logger.py +103 -34
  341. helm/common/object_spec.py +23 -8
  342. helm/common/optional_dependencies.py +1 -1
  343. helm/common/test_general.py +4 -0
  344. helm/common/test_logging.py +94 -0
  345. helm/config/model_deployments.yaml +1001 -187
  346. helm/config/model_metadata.yaml +602 -18
  347. helm/config/tokenizer_configs.yaml +202 -5
  348. helm/proxy/cli.py +1 -1
  349. helm/proxy/example_queries.py +8 -8
  350. helm/proxy/retry.py +5 -0
  351. helm/proxy/server.py +2 -1
  352. helm/proxy/static/index.css +4 -0
  353. helm/proxy/static/index.js +7 -1
  354. helm/tokenizers/auto_tokenizer.py +2 -2
  355. helm/tokenizers/grok_tokenizer.py +2 -0
  356. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  357. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  358. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  359. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  360. helm/benchmark/metrics/medalign_metrics.py +0 -14
  361. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  362. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  363. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  364. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  365. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  366. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  367. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  368. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  369. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  370. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  371. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +0 -103
  372. helm/benchmark/scenarios/numeracy_scenario.py +0 -794
  373. helm/benchmark/static_build/assets/index-94295e78.js +0 -10
  374. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  375. helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
  376. helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
  377. helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
  378. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +0 -0
  379. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
  380. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/licenses/LICENSE +0 -0
  381. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
  382. /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
  383. /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
  384. /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
  385. /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
  386. /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
  387. /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
  388. /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
  389. /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
  390. /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
  391. /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
  392. /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
  393. /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
  394. /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
helm/benchmark/run_specs/arabic_run_specs.py
@@ -0,0 +1,197 @@
+"""Run specs for Arabic leaderboard
+
+EXPERIMENTAL: Run specs here may have future reverse incompatible changes."""
+
+from helm.benchmark.adaptation.adapter_spec import ADAPT_MULTIPLE_CHOICE_JOINT
+from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec, get_generation_adapter_spec
+from helm.benchmark.annotation.annotator import AnnotatorSpec
+from helm.benchmark.metrics.common_metric_specs import get_basic_metric_specs, get_exact_match_metric_specs
+from helm.benchmark.metrics.metric import MetricSpec
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+_ARABIC_REFERENCE_PREFIX_CHARACTERS = ["أ", "ب", "ج", "د", "هـ"]
+_ARABIC_OUTPUT_MAPPING_PATTERN = "(أ|ب|ج|د|هـ)"
+
+
+@run_spec_function("arabic_mmlu")
+def get_arabic_mmlu_spec(subset: str) -> RunSpec:
+    """EXPERIMENTAL: This run spec here may have future reverse incompatible changes."""
+
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.arabic_mmlu_scenario.ArabicMMLUScenario", args={"subset": subset}
+    )
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions="السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة",  # noqa: E501
+        input_noun="السؤال",
+        output_noun="الإجابة",
+        max_tokens=100,
+        reference_prefix_characters=_ARABIC_REFERENCE_PREFIX_CHARACTERS,
+        output_mapping_pattern=_ARABIC_OUTPUT_MAPPING_PATTERN,
+    )
+
+    return RunSpec(
+        name=f"arabic_mmlu:subset={subset}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["arabic_mmlu"],
+    )
+
+
+@run_spec_function("alghafa")
+def get_alghafa_spec(subset: str) -> RunSpec:
+    """EXPERIMENTAL: This run spec here may have future reverse incompatible changes."""
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.alghafa_scenario.AlGhafaScenario", args={"subset": subset}
+    )
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions="الأسئلة التالية هي أسئلة متعددة الإختيارات مع الجواب الصحيح",  # noqa: E501
+        input_noun="السؤال",
+        output_noun="الإجابة",
+        max_tokens=100,
+        reference_prefix_characters=_ARABIC_REFERENCE_PREFIX_CHARACTERS,
+        output_mapping_pattern=_ARABIC_OUTPUT_MAPPING_PATTERN,
+    )
+
+    return RunSpec(
+        name=f"alghafa:subset={subset}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["alghafa", f"alghafa_{subset}"],
+    )
+
+
+@run_spec_function("aratrust")
+def get_aratrust_spec(category: str) -> RunSpec:
+    """EXPERIMENTAL: This run spec here may have future reverse incompatible changes."""
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.aratrust_scenario.AraTrustScenario",
+        args={"category": category},
+    )
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions="السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة: أ، ب أو ج",  # noqa: E501
+        input_noun="السؤال",
+        output_noun="الإجابة",
+        max_tokens=100,
+    )
+
+    return RunSpec(
+        name=f"aratrust:category={category}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["aratrust"],
+    )
+
+
+@run_spec_function("alrage")
+def get_alrage_spec() -> RunSpec:
+    """EXPERIMENTAL: This run spec here may have future reverse incompatible changes."""
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.alrage_scenario.ALRAGEScenario")
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions="بناءً على السياقات المقترحة التالية، اجب عن السؤال التالي",  # noqa: E501
+        input_noun="السؤال",
+        output_noun="الإجابة",
+        max_tokens=100,
+    )
+
+    annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.alrage_annotator.ALRAGEAnnotator")]
+
+    metric_specs = [
+        MetricSpec(class_name="helm.benchmark.metrics.alrage_metric.ALRAGEMetric")
+    ] + get_basic_metric_specs([])
+
+    return RunSpec(
+        name="alrage",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        annotators=annotator_specs,
+        metric_specs=metric_specs,
+        groups=["alrage"],
+    )
+
+
+@run_spec_function("madinah_qa")
+def get_madinah_qa_spec(subset: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.madinah_qa_scenario.MadinahQAScenario", args={"subset": subset}
+    )
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions="السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة",  # noqa: E501
+        input_noun="السؤال",
+        output_noun="الإجابة",
+        max_tokens=100,
+        reference_prefix_characters=_ARABIC_REFERENCE_PREFIX_CHARACTERS,
+        output_mapping_pattern=_ARABIC_OUTPUT_MAPPING_PATTERN,
+    )
+
+    return RunSpec(
+        name=f"madinah_qa:subset={subset}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["madinah_qa"],
+    )
+
+
+@run_spec_function("mbzuai_human_translated_arabic_mmlu")
+def get_arabic_mmmlu_spec(subject: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.mbzuai_human_translated_arabic_mmlu.MBZUAIHumanTranslatedArabicMMLUScenario",
+        args={"subject": subject},
+    )
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions="السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة",  # noqa: E501
+        input_noun="السؤال",
+        output_noun="الإجابة",
+        max_tokens=100,
+        reference_prefix_characters=_ARABIC_REFERENCE_PREFIX_CHARACTERS,
+        output_mapping_pattern=_ARABIC_OUTPUT_MAPPING_PATTERN,
+    )
+
+    return RunSpec(
+        name=f"mbzuai_human_translated_arabic_mmlu:subject={subject}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["mbzuai_human_translated_arabic_mmlu"],
+    )
+
+
+@run_spec_function("arabic_exams")
+def get_arabic_exams_spec(subject: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.arabic_exams_scenario.ArabicEXAMSScenario",
+        args={"subject": subject},
+    )
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions="السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة",  # noqa: E501
+        input_noun="السؤال",
+        output_noun="الإجابة",
+        max_tokens=100,
+        reference_prefix_characters=_ARABIC_REFERENCE_PREFIX_CHARACTERS,
+        output_mapping_pattern=_ARABIC_OUTPUT_MAPPING_PATTERN,
+    )
+
+    return RunSpec(
+        name=f"arabic_exams:subject={subject}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["arabic_exams"],
+    )
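
For orientation (illustrative, not part of the diff): functions registered with @run_spec_function are resolved by name from run entries, so the new Arabic specs can also be constructed directly in Python. A minimal sketch, assuming crfm-helm 0.5.10 is installed; the subset name "Law" is a placeholder, not a verified ArabicMMLU subset:

    from helm.benchmark.run_specs.arabic_run_specs import get_arabic_mmlu_spec

    # "Law" is a hypothetical subset; real subset names come from the ArabicMMLU dataset.
    spec = get_arabic_mmlu_spec(subset="Law")
    print(spec.name)    # arabic_mmlu:subset=Law
    print(spec.groups)  # ['arabic_mmlu']

Through the CLI, the same spec would typically be requested with a run entry such as arabic_mmlu:subset=Law,model=... passed to helm-run.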
helm/benchmark/run_specs/bluex_run_specs.py
@@ -0,0 +1,40 @@
+from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_MULTIPLE_CHOICE_JOINT
+from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec
+from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+@run_spec_function("bluex")
+def get_bluex_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bluex_scenario.BLUEXScenario", args={})
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions="""
+Escolha a alternativa correta para as questões de vestibulares (responda apenas com a letra).
+Exemplo de Pergunta com a resposta:
+Em um romance narrado em primeira pessoa, o narrador participa dos acontecimentos da trama,
+relatando suas próprias experiências e sentimentos. Qual alternativa apresenta essa característica?
+
+(A) Narrador onisciente que conhece os pensamentos de todas as personagens.
+(B) Narrador que descreve os fatos de forma imparcial, sem envolvimento emocional.
+(C) Narrador-personagem que vivencia e relata os eventos da história.
+(D) Narrador observador que apenas registra as ações visíveis.
+(E) Narrador em segunda pessoa que se dirige constantemente ao leitor.
+
+Resposta correta: C
+
+A partir disso, responda:
+""",
+        input_noun="Pergunta",
+        output_noun="Resposta",
+    )
+
+    return RunSpec(
+        name="bluex",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["bluex"],
+    )
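
Since get_bluex_spec takes no arguments, inspecting the resulting spec is straightforward; a sketch, again assuming the package is installed:

    from helm.benchmark.run_specs.bluex_run_specs import get_bluex_spec

    spec = get_bluex_spec()
    print(spec.name)                 # bluex
    print(spec.adapter_spec.method)  # multiple_choice_joint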
helm/benchmark/run_specs/classic_run_specs.py
@@ -35,7 +35,6 @@ from helm.benchmark.metrics.common_metric_specs import (
     get_f1_metric_specs,
     get_generative_harms_metric_specs,
     get_language_modeling_metric_specs,
-    get_numeracy_metric_specs,
     get_open_ended_generation_metric_specs,
     get_summarization_metric_specs,
     get_basic_generation_metric_specs,
@@ -381,58 +380,6 @@ def get_raft_spec(subset: str) -> RunSpec:
     )
 
 
-@run_spec_function("numeracy")
-def get_numeracy_spec(
-    relation_type: str = "linear", mode: str = "function", seed: str = "0", run_solver: str = "False"
-) -> RunSpec:
-    from helm.benchmark.scenarios.numeracy_scenario import get_numeracy_adapter_spec, RELTYPE_INFO
-
-    run_solver_bool: bool = True if run_solver.lower() == "true" else False
-    del run_solver
-    random_seed = int(seed)
-    scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.numeracy_scenario.NumeracyScenario",
-        args={"seed": random_seed, "relation_type": relation_type, "mode": mode},
-    )
-
-    if mode in ["example", "standard"]:
-        # Test a model's ability to impute datapoints for a given (example or randomly sampled) relation.
-        adapter_args: Dict[str, Any] = {
-            "max_train_instances": 100,
-            "max_eval_instances": 100,
-            "dim": RELTYPE_INFO[relation_type].num_variables + 1,
-        }
-    elif mode == "function":
-        # Test a model's ability to impute datapoints for randomly sampled relations
-        # (resampled for each evaluation point).
-        adapter_args = {
-            "instructions": "",
-            "max_train_instances": 0,  # Turn off general version of `function` mode because it doesn't cleanly
-            # capture a higher-order version of this task / is a little convoluted
-            # for models, currently.
-            # (In the general version, the model sees other relations of the same class,
-            # and needs to impute a datapoint for the last one. Presumably, inferring
-            # the class - eg. the degree of the relation - would help.)
-            "max_eval_instances": 1000,
-            "dim": RELTYPE_INFO[relation_type].num_variables + 1,
-            "instance_prefix": "\n\n",
-        }
-    else:
-        raise ValueError(f"Invalid mode: {mode}")
-
-    adapter_spec = get_numeracy_adapter_spec(**adapter_args)  # Construct the AdapterSpec using a helper function.
-    # `get_numeracy_adapter_spec` is defined in numeracy_scenario.py
-    # because it is used within the scenario to construct the instances themselves.
-
-    return RunSpec(
-        name=f"numeracy:relation_type={relation_type},mode={mode}",
-        scenario_spec=scenario_spec,
-        adapter_spec=adapter_spec,
-        metric_specs=get_numeracy_metric_specs(run_solver_bool),
-        groups=["numeracy"],
-    )
-
-
 @run_spec_function("boolq")
 def get_boolq_spec(only_contrast=False) -> RunSpec:
     scenario_spec = ScenarioSpec(
@@ -806,12 +753,12 @@ def get_xsum_sampled_summarization_spec(temperature: float = 0.3, device: str =
     )
 
     return RunSpec(
-        name=f"summarization_xsum:temperature={temperature},device={device}",
+        name=f"summarization_xsum_sampled:temperature={temperature},device={device}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_summarization_metric_specs({"task": "summarization_xsum_sampled", "device": device})
         + get_generative_harms_metric_specs(),
-        groups=["summarization_xsum"],
+        groups=["summarization_xsum_sampled"],
     )
 
 
helm/benchmark/run_specs/codeinsights_run_specs.py
@@ -0,0 +1,192 @@
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+from helm.benchmark.adaptation.common_adapter_specs import get_generation_adapter_spec
+from helm.benchmark.metrics.common_metric_specs import get_basic_metric_specs
+from helm.benchmark.metrics.codeinsights_metric_specs import (
+    get_functional_correctness_metric_specs,
+    get_comprehensive_code_evaluation_metric_specs,
+    get_edge_case_metric_specs,
+    get_code_efficiency_metric_specs,
+)
+
+
+@run_spec_function("codeinsights_correct_code")
+def get_codeinsights_correct_code_run_spec(tpr: float = 0.0, num_testcases: int = 1) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.codeinsights_correct_code_scenario.CodeInsightsCorrectCodeScenario",
+        args={"num_testcases": num_testcases},
+    )
+
+    instruction = (
+        "You are a skilled C++ programmer working on a foundational programming course assignment. "
+        "Your task is to write correct, efficient C++ code that solves the given problem. "
+        "Write clean, well-structured code that follows good programming practices. "
+        "Provide ONLY your C++ implementation following the given template, where the answer will replace the {{ STUDENT_ANSWER }} block in the template."
+        "DO NOT reproduce the template part as the generated code would be inserted to the template,"
+        "and make sure the code is compatible with the Unit Test Input"
+        "Ensure your code is correct, efficient, includes any class definition when needed, and handles all edge cases properly."
+    )
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions=instruction,
+        output_noun="Your code",
+        stop_sequences=[],
+        max_tokens=4000,
+        temperature=tpr,
+    )
+
+    return RunSpec(
+        name=f"codeinsights_correct_code:temperature={adapter_spec.temperature},num_testcases={num_testcases}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_functional_correctness_metric_specs() + get_basic_metric_specs([]),
+        groups=["codeinsights", "codeinsights_correct_code"],
+    )
+
+
+@run_spec_function("codeinsights_student_coding")
+def get_codeinsights_student_coding_run_spec(tpr: float = 0.0, num_testcases: int = 1) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.codeinsights_student_coding_scenario.CodeInsightsStudentCodingScenario",
+        args={"num_testcases": num_testcases},
+    )
+
+    instruction = (
+        "You are the same student who wrote the three examples below in your foundational C++ course. "
+        "Mimic exactly your personal coding style, conventions, and level of proficiency—"
+        "do not over‐optimize or introduce unfamiliar patterns. "
+        "Include the same sort of formatting, variable names, and minor imperfections you demonstrated. "
+        "Provide ONLY your C++ implementation following the given template, where the answer will replace the {{ STUDENT_ANSWER }} block in the template."
+        "DO NOT reproduce the template part as the generated code would be inserted to the template,"
+        "and make sure the code is compatible with the Unit Test Input"
+        "Ensure your code includes any class definition when needed."
+    )
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions=instruction,
+        output_noun="Your code",
+        stop_sequences=[],
+        max_tokens=4000,
+        temperature=tpr,
+    )
+
+    return RunSpec(
+        name=f"codeinsights_student_coding:temperature={adapter_spec.temperature},num_testcases={num_testcases}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_comprehensive_code_evaluation_metric_specs() + get_basic_metric_specs([]),
+        groups=["codeinsights", "codeinsights_student_coding"],
+    )
+
+
+@run_spec_function("codeinsights_student_mistake")
+def get_codeinsights_student_mistake_run_spec(tpr: float = 0.0, num_testcases: int = 1) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.codeinsights_student_mistake_scenario.CodeInsightsStudentMistakeScenario",
+        args={"num_testcases": num_testcases},
+    )
+
+    instruction = (
+        "You are a C++ student with a consistent personal style, conventions, and proficiency level.\n"
+        "Your task is to attempt the target problem **but introduce realistic mistake** you would typically make—"
+        "Provide ONLY your C++ implementation following the given template, where the answer will replace the {{ STUDENT_ANSWER }} block in the template."
+        "DO NOT reproduce the template part as the generated code would be inserted to the template,"
+        "and make sure the code is compatible with the Unit Test Input"
+        "Ensure your code is includes any class definition when needed."
+    )
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions=instruction,
+        output_noun="Your code",
+        stop_sequences=[],
+        max_tokens=4000,
+        temperature=tpr,
+    )
+
+    return RunSpec(
+        name=f"codeinsights_student_mistake:temperature={adapter_spec.temperature},num_testcases={num_testcases}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_comprehensive_code_evaluation_metric_specs() + get_basic_metric_specs([]),
+        groups=["codeinsights", "codeinsights_student_mistake"],
+    )
+
+
+@run_spec_function("codeinsights_code_efficiency")
+def get_codeinsights_code_efficiency_run_spec(tpr: float = 0.0, num_testcases: int = 1) -> RunSpec:
+    """
+    Run specification for code efficiency evaluation scenario.
+
+    This scenario evaluates whether LLM-generated code has similar runtime efficiency
+    as the original student code. It focuses on problems where both solutions are
+    functionally correct and measures runtime performance alignment.
+
+    Requires C++ compiler (g++) to be available for actual compilation and execution.
+    """
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.codeinsights_code_efficiency_scenario.CodeInsightsCodeEfficiencyScenario",
+        args={"num_testcases": num_testcases},
+    )
+
+    instruction = (
+        "You are the same student who wrote the three examples below in your foundational C++ course. "
+        "Mimic exactly your personal coding style, conventions, and make sure to generate a correct code. "
+        "Do not over-optimize or introduce unfamiliar patterns. If the code is correct but inefficient, "
+        "imitate the inefficiency. "
+        "If the student writes efficiently, write efficiently too. "
+        "Include the same sort of formatting, variable names, and minor imperfections you demonstrated. "
+        "Provide ONLY your C++ implementation following the given template, where the answer will replace the {{ STUDENT_ANSWER }} block in the template."
+        "DO NOT reproduce the template part as the generated code would be inserted to the template,"
+        "and make sure the code is compatible with the Unit Test Input"
+        "Ensure your code is correct, includes any class definition when needed, and handles all edge cases properly."
+    )
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions=instruction,
+        output_noun="Your code",
+        stop_sequences=[],
+        max_tokens=4000,
+        temperature=tpr,
+    )
+
+    return RunSpec(
+        name=f"codeinsights_code_efficiency:temperature={adapter_spec.temperature},num_testcases={num_testcases}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_code_efficiency_metric_specs(
+            num_runtime_runs=5,  # Run each solution 5 times for averaging
+            timeout_seconds=10,  # 10 second timeout per execution
+        )
+        + get_basic_metric_specs([]),
+        groups=["codeinsights", "codeinsights_code_efficiency"],
+    )
+
+
+@run_spec_function("codeinsights_edge_case")
+def get_codeinsights_edge_case_run_spec(tpr: float = 0.0, num_testcases: int = 1) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.codeinsights_edge_case_scenario.CodeInsightsEdgeCaseScenario",
+        args={"num_testcases": num_testcases},
+    )
+
+    instruction = (
+        "You are a student studying C++ with a consistent personal style, conventions, and proficiency level.\n"
+        "Your task is to identify which test case you would likely to fail for a given question with unit tests.\n"
+        "Respond only with integer of the unittest number\n\n"
+    )
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions=instruction,
+        output_noun="Your code",
+        stop_sequences=[],
+        max_tokens=4000,
+        temperature=tpr,
+    )
+
+    return RunSpec(
+        name=f"codeinsights_edge_case:temperature={adapter_spec.temperature},num_testcases={num_testcases}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_edge_case_metric_specs() + get_basic_metric_specs([]),
+        groups=["codeinsights", "codeinsights_edge_case"],
+    )
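
Note that these run spec functions expose the sampling temperature through a parameter named tpr, while the RunSpec name reports it as temperature=. A minimal sketch of constructing one spec directly (values are illustrative; assumes the package is installed):

    from helm.benchmark.run_specs.codeinsights_run_specs import get_codeinsights_correct_code_run_spec

    spec = get_codeinsights_correct_code_run_spec(tpr=0.2, num_testcases=5)
    print(spec.name)                      # codeinsights_correct_code:temperature=0.2,num_testcases=5
    print(spec.adapter_spec.temperature)  # 0.2
    print(spec.adapter_spec.max_tokens)   # 4000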
helm/benchmark/run_specs/healthqa_br_run_specs.py
@@ -0,0 +1,40 @@
+from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_MULTIPLE_CHOICE_JOINT
+from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec
+from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+@run_spec_function("healthqa_br")
+def get_healthqa_br_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.healthqa_br_scenario.HEALTHQA_BR_Scenario", args={}
+    )
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions="""
+Escolha a alternativa correta para as questões de medicina (responda apenas com a letra).
+Exemplo de Pergunta com a resposta:
+Qual dos seguintes órgãos é responsável pela produção da insulina no corpo humano?
+A) Fígado
+B) Rins
+C) Pâncreas
+D) Baço
+E) Coração
+
+Resposta correta: C
+
+A partir disso, responda:
+""",
+        input_noun="Pergunta",
+        output_noun="Resposta",
+    )
+
+    return RunSpec(
+        name="healthqa_br",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["healthqa_br"],
+    )
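
A sketch of checking which metric specs the new run spec attaches (assuming the package is installed; the class names come from whatever get_exact_match_metric_specs returns):

    from helm.benchmark.run_specs.healthqa_br_run_specs import get_healthqa_br_spec

    spec = get_healthqa_br_spec()
    for metric_spec in spec.metric_specs:
        print(metric_spec.class_name)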
helm/benchmark/run_specs/heim_run_specs.py
@@ -60,7 +60,9 @@ def get_core_heim_metric_specs() -> List[MetricSpec]:
             class_name="helm.benchmark.metrics.image_generation.fractal_dimension_metric.FractalDimensionMetric",
             args={},
         ),
-        MetricSpec(class_name="helm.benchmark.metrics.image_generation.nsfw_metrics.NSFWMetric", args={}),
+        # Disabled due to keras issue.
+        # See: https://github.com/stanford-crfm/helm/issues/3741#issuecomment-3109478877
+        # MetricSpec(class_name="helm.benchmark.metrics.image_generation.nsfw_metrics.NSFWMetric", args={}),
         MetricSpec(class_name="helm.benchmark.metrics.image_generation.nudity_metrics.NudityMetric", args={}),
         MetricSpec(class_name="helm.benchmark.metrics.image_generation.watermark_metrics.WatermarkMetric", args={}),
     ] + get_basic_metric_specs(names=[])