crfm-helm 0.5.6__py3-none-any.whl → 0.5.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of crfm-helm has been flagged as potentially problematic.

Files changed (394)
  1. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/METADATA +72 -130
  2. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/RECORD +372 -305
  3. helm/benchmark/adaptation/adapter_spec.py +10 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  8. helm/benchmark/annotation/alrage_annotator.py +90 -0
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  10. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  11. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  12. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  13. helm/benchmark/annotation/medalign_annotator.py +11 -22
  14. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  15. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  16. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  17. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  18. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  19. helm/benchmark/annotation/model_as_judge.py +23 -18
  20. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  21. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  22. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  23. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  24. helm/benchmark/metrics/alrage_metric.py +35 -0
  25. helm/benchmark/metrics/basic_metrics.py +267 -2
  26. helm/benchmark/metrics/bbq_metrics.py +12 -0
  27. helm/benchmark/metrics/classification_metrics.py +19 -1
  28. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  29. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  30. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  31. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  32. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  33. helm/benchmark/metrics/comet_metric.py +1 -1
  34. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  35. helm/benchmark/metrics/copyright_metrics.py +1 -1
  36. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  37. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  38. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  39. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  40. helm/benchmark/metrics/evaluate_reference_metrics.py +312 -1
  41. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  42. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  43. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  44. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  45. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  46. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  47. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  48. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  49. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  50. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  51. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  52. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  53. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  54. helm/benchmark/metrics/medec_metrics.py +25 -2
  55. helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
  56. helm/benchmark/metrics/metric.py +25 -0
  57. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  58. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  59. helm/benchmark/metrics/safety_metrics.py +13 -1
  60. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  61. helm/benchmark/metrics/summac/model_summac.py +3 -3
  62. helm/benchmark/metrics/summarization_metrics.py +129 -1
  63. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  64. helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
  65. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  66. helm/benchmark/model_deployment_registry.py +11 -19
  67. helm/benchmark/presentation/create_plots.py +11 -2
  68. helm/benchmark/presentation/run_display.py +13 -3
  69. helm/benchmark/presentation/run_entry.py +2 -2
  70. helm/benchmark/presentation/schema.py +10 -22
  71. helm/benchmark/presentation/summarize.py +189 -14
  72. helm/benchmark/presentation/taxonomy_info.py +20 -0
  73. helm/benchmark/presentation/test_create_plots.py +4 -1
  74. helm/benchmark/run.py +15 -4
  75. helm/benchmark/run_expander.py +4 -0
  76. helm/benchmark/run_specs/arabic_run_specs.py +197 -0
  77. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  78. helm/benchmark/run_specs/classic_run_specs.py +2 -55
  79. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  80. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  81. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  82. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  83. helm/benchmark/run_specs/long_context_run_specs.py +48 -1
  84. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  85. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  86. helm/benchmark/run_specs/medhelm_run_specs.py +363 -53
  87. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  88. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +11 -13
  89. helm/benchmark/runner.py +7 -0
  90. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  91. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  92. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  93. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  94. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  95. helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
  96. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  97. helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
  98. helm/benchmark/scenarios/aratrust_scenario.py +95 -0
  99. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  100. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  101. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
  102. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
  103. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -53
  104. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -21
  105. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -52
  106. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  107. helm/benchmark/scenarios/banking77_scenario.py +21 -0
  108. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  109. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  110. helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
  111. helm/benchmark/scenarios/bluex_scenario.py +70 -0
  112. helm/benchmark/scenarios/bold_scenario.py +15 -0
  113. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  114. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  115. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  116. helm/benchmark/scenarios/clear_scenario.py +23 -0
  117. helm/benchmark/scenarios/cleva_scenario.py +480 -1
  118. helm/benchmark/scenarios/code_scenario.py +28 -0
  119. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  120. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  121. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  122. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  123. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  124. helm/benchmark/scenarios/commonsense_scenario.py +32 -0
  125. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  126. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  127. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  128. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  129. helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
  130. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  131. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  132. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  133. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  134. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  135. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  136. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  137. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  138. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  139. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  140. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  141. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  142. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  143. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  144. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  145. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  146. helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
  147. helm/benchmark/scenarios/financebench_scenario.py +21 -0
  148. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  149. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  150. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  151. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  152. helm/benchmark/scenarios/gsm_scenario.py +21 -0
  153. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
  154. helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
  155. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  156. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  157. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  158. helm/benchmark/scenarios/ice_scenario.py +21 -1
  159. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  160. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  161. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
  162. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
  163. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
  164. helm/benchmark/scenarios/koala_scenario.py +21 -1
  165. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  166. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  167. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  168. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  169. helm/benchmark/scenarios/legalbench_scenario.py +19 -0
  170. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  171. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  172. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  173. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  174. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  175. helm/benchmark/scenarios/math_scenario.py +54 -20
  176. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  177. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  178. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  179. helm/benchmark/scenarios/med_qa_scenario.py +20 -0
  180. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  181. helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
  182. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  183. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  184. helm/benchmark/scenarios/medec_scenario.py +23 -0
  185. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  186. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  187. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  188. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  189. helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
  190. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  191. helm/benchmark/scenarios/melt_scenarios.py +2 -2
  192. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  193. helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
  194. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  195. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  196. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  197. helm/benchmark/scenarios/mmlu_scenario.py +21 -0
  198. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  199. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  200. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  201. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  202. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  203. helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
  204. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  205. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  206. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  207. helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
  208. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  209. helm/benchmark/scenarios/quac_scenario.py +14 -0
  210. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  211. helm/benchmark/scenarios/raft_scenario.py +15 -0
  212. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  213. helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
  214. helm/benchmark/scenarios/scenario.py +31 -0
  215. helm/benchmark/scenarios/seahelm_scenario.py +350 -2
  216. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  217. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  218. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  219. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  220. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  221. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  222. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  223. helm/benchmark/scenarios/shc_proxy_scenario.py +23 -1
  224. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  225. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  226. helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
  227. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  228. helm/benchmark/scenarios/spider_scenario.py +18 -0
  229. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  230. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  231. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  232. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  233. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  234. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  235. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  236. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  237. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  238. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  239. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  240. helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
  241. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  242. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  243. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  244. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  245. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  246. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  247. helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
  248. helm/benchmark/slurm_jobs.py +1 -2
  249. helm/benchmark/slurm_runner.py +8 -1
  250. helm/benchmark/static/schema_arabic.yaml +271 -0
  251. helm/benchmark/static/schema_classic.yaml +0 -17
  252. helm/benchmark/static/schema_long_context.yaml +17 -18
  253. helm/benchmark/static/schema_medhelm.yaml +36 -0
  254. helm/benchmark/static/schema_slp.yaml +219 -0
  255. helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
  256. helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
  257. helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
  258. helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
  259. helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
  260. helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
  261. helm/benchmark/static_build/index.html +5 -6
  262. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  263. helm/clients/ai21_client.py +2 -0
  264. helm/clients/aleph_alpha_client.py +2 -0
  265. helm/clients/anthropic_client.py +7 -1
  266. helm/clients/audio_language/diva_llama_client.py +2 -0
  267. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  268. helm/clients/audio_language/llama_omni/constants.py +9 -0
  269. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  270. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  271. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  272. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  273. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  274. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  275. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  276. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  277. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  278. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  279. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  280. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  281. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  282. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  283. helm/clients/audio_language/llama_omni/utils.py +202 -0
  284. helm/clients/audio_language/llama_omni_client.py +2 -1
  285. helm/clients/audio_language/qwen2_5_omni_client.py +21 -8
  286. helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
  287. helm/clients/audio_language/qwen_audiolm_client.py +2 -1
  288. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  289. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  290. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  291. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  292. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  293. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  294. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  295. helm/clients/bedrock_client.py +63 -6
  296. helm/clients/cohere_client.py +3 -0
  297. helm/clients/dspy_client.py +135 -0
  298. helm/clients/google_client.py +2 -0
  299. helm/clients/http_model_client.py +2 -0
  300. helm/clients/huggingface_client.py +4 -3
  301. helm/clients/ibm_client.py +3 -1
  302. helm/clients/image_generation/adobe_vision_client.py +2 -0
  303. helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
  304. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  305. helm/clients/image_generation/cogview2_client.py +2 -1
  306. helm/clients/image_generation/dalle2_client.py +2 -0
  307. helm/clients/image_generation/dalle_mini_client.py +2 -1
  308. helm/clients/image_generation/deep_floyd_client.py +2 -0
  309. helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
  310. helm/clients/image_generation/lexica_client.py +2 -0
  311. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  312. helm/clients/image_generation/mindalle_client.py +2 -1
  313. helm/clients/image_generation/together_image_generation_client.py +2 -0
  314. helm/clients/megatron_client.py +2 -0
  315. helm/clients/mistral_client.py +2 -0
  316. helm/clients/moderation_api_client.py +2 -0
  317. helm/clients/openai_client.py +38 -21
  318. helm/clients/openai_responses_client.py +34 -8
  319. helm/clients/openrouter_client.py +31 -0
  320. helm/clients/palmyra_client.py +2 -1
  321. helm/clients/reka_client.py +2 -1
  322. helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
  323. helm/clients/stanfordhealthcare_http_model_client.py +2 -0
  324. helm/clients/test_huggingface_client.py +3 -3
  325. helm/clients/test_openrouter_client.py +69 -0
  326. helm/clients/together_client.py +52 -13
  327. helm/clients/vertexai_client.py +23 -11
  328. helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
  329. helm/clients/vision_language/huggingface_vlm_client.py +2 -0
  330. helm/clients/vision_language/idefics_client.py +2 -1
  331. helm/clients/vision_language/open_flamingo_client.py +2 -1
  332. helm/clients/vision_language/paligemma_client.py +2 -1
  333. helm/clients/vision_language/palmyra_vision_client.py +2 -0
  334. helm/clients/vision_language/qwen2_vlm_client.py +2 -1
  335. helm/clients/vision_language/qwen_vlm_client.py +2 -1
  336. helm/clients/vllm_client.py +43 -7
  337. helm/clients/vllm_granite_thinking_client.py +56 -0
  338. helm/clients/writer_client.py +5 -2
  339. helm/common/critique_request.py +0 -1
  340. helm/common/hierarchical_logger.py +103 -34
  341. helm/common/object_spec.py +23 -8
  342. helm/common/optional_dependencies.py +1 -1
  343. helm/common/test_general.py +4 -0
  344. helm/common/test_logging.py +94 -0
  345. helm/config/model_deployments.yaml +1001 -187
  346. helm/config/model_metadata.yaml +602 -18
  347. helm/config/tokenizer_configs.yaml +202 -5
  348. helm/proxy/cli.py +1 -1
  349. helm/proxy/example_queries.py +8 -8
  350. helm/proxy/retry.py +5 -0
  351. helm/proxy/server.py +2 -1
  352. helm/proxy/static/index.css +4 -0
  353. helm/proxy/static/index.js +7 -1
  354. helm/tokenizers/auto_tokenizer.py +2 -2
  355. helm/tokenizers/grok_tokenizer.py +2 -0
  356. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  357. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  358. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  359. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  360. helm/benchmark/metrics/medalign_metrics.py +0 -14
  361. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  362. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  363. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  364. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  365. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  366. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  367. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  368. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  369. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  370. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  371. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +0 -103
  372. helm/benchmark/scenarios/numeracy_scenario.py +0 -794
  373. helm/benchmark/static_build/assets/index-94295e78.js +0 -10
  374. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  375. helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
  376. helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
  377. helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
  378. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +0 -0
  379. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
  380. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/licenses/LICENSE +0 -0
  381. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
  382. /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
  383. /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
  384. /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
  385. /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
  386. /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
  387. /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
  388. /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
  389. /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
  390. /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
  391. /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
  392. /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
  393. /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
  394. /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
helm/benchmark/metrics/codeinsights_correct_code_metrics.py
@@ -0,0 +1,366 @@
+ from typing import List
+ import re
+ import os
+ import subprocess
+ import tempfile
+ import shutil
+
+ from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+ from helm.benchmark.adaptation.request_state import RequestState
+ from helm.benchmark.metrics.metric import Metric
+ from helm.benchmark.metrics.metric_name import MetricName
+ from helm.benchmark.metrics.metric_service import MetricService
+ from helm.benchmark.metrics.statistic import Stat
+
+
+ def compile_code(i, temp_dir, timeout=10):
+     """
+     Compiles the C++ file at temp_dir/tc_{i}.cpp and outputs to temp_dir/tc_{i}.out.
+
+     Args:
+         i (int): Index of the code to compile.
+         temp_dir (str): Temporary directory where the C++ files are located.
+         timeout (int): Timeout for the compilation in seconds.
+
+     Returns:
+         str or None: Path to the executable if compilation succeeds, else None.
+     """
+     executable = os.path.join(temp_dir, f"tc_{i}.out")
+     cpp_file = os.path.join(temp_dir, f"tc_{i}.cpp")
+
+     try:
+         result = subprocess.run(
+             ["timeout", str(timeout), "g++", "-std=c++11", cpp_file, "-o", executable],
+             stdout=subprocess.PIPE,
+             stderr=subprocess.PIPE,
+             text=True,  # Optional: to get output as string
+             timeout=timeout + 2,  # Optional: timeout for compilation
+         )
+         if result.returncode != 0:
+             # print(f"Compilation failed for {cpp_file}:\n{result.stderr}")
+             return None
+         return executable
+     except Exception as e:
+         print(f"An error occurred while compiling {cpp_file}: {e}")
+         return None
+
+
+ def parallel_compile(codes, temp_dir, timeout=10, max_workers=4):
+     """
+     Compiles multiple C++ codes in parallel.
+
+     Args:
+         codes (list): List of code snippets or identifiers.
+         temp_dir (str): Directory containing the C++ files.
+         max_workers (int): Maximum number of worker processes.
+
+     Returns:
+         list: List of paths to the compiled executables or None for failed compilations.
+     """
+     executables = []
+     for i in range(len(codes)):
+         executable = compile_code(i, temp_dir, timeout)
+         executables.append(executable)
+
+     return executables
+
+
+ def run_executable(executable, std_in, timeout=10):
+     """
+     Runs an executable with a timeout and captures its output.
+
+     Args:
+         executable (str): Path to the executable to run.
+         std_in (str): String passed to the process's standard input.
+         timeout (int): Timeout for running the executable in seconds.
+
+     Returns:
+         tuple: (return_code, output) where return_code is 0 if successful, non-zero otherwise,
+             and output is the stdout captured from the execution.
+     """
+     if executable is None:
+         return (0, "")  # Return 0 and empty output for failed compilations
+
+     try:
+         result = subprocess.run(
+             ["timeout", str(timeout), executable],
+             input=std_in,
+             stdout=subprocess.PIPE,
+             stderr=subprocess.PIPE,
+             text=True,  # To decode stdout and stderr as strings
+             timeout=timeout + 2,  # Add a small buffer to the timeout
+         )
+         return (result.returncode, result.stdout)
+     except Exception as e:
+         print(f"An error occurred while running {executable}: {e}")
+         return (1, "")  # Non-zero return code for errors
+
+
+ def parallel_run_executables(executables, std_inputs, timeout=10, max_workers=4):
+     """
+     Runs multiple executables in parallel with a timeout.
+
+     Args:
+         executables (list): List of paths to the executables.
+         max_workers (int): Maximum number of worker processes.
+
+     Returns:
+         list: List of results containing the outputs from running each executable.
+     """
+     results = []
+     for std_in, executable in zip(std_inputs, executables):
+         result_code, output = run_executable(executable, std_in, timeout)
+         results.append((result_code, output))
+
+     return results
+
+
+ class CPPEvaluator:
+     def __init__(self, template, testcases, timeout=10, max_workers=8):
+         """Initializes the CPPEvaluator class.
+
+         Args:
+             template (str): The template code with placeholders for the student's answer and test cases.
+             testcases (Dict[str]): A list of test cases, each containing the input, output, and optional std_in.
+             max_workers (int, optional): The maximum number of workers to use for parallel processing. Defaults to 8.
+         """
+         self.template = template
+         self.testcases = testcases
+         self.timeout = timeout
+         self.max_workers = max_workers
+         self.formatted_testcases, self.std_inputs = self.format_testcases()
+
+     def format_testcases(self):
+         """Formats the test cases into the required format for the grading engine.
+
+         Returns:
+             Tuple[List[Dict[str]], List[str]]: A tuple containing the formatted test cases and standard inputs.
+         """
+         formatted_testcases = []
+         std_inputs = []
+         for testcase in self.testcases:
+             formatted_testcases.append(
+                 {
+                     "extra": "",
+                     "testcode": testcase["input"],
+                     "expected_output": testcase["output"],
+                 }
+             )
+             if "std_in" not in testcase:
+                 std_inputs.append("")
+             else:
+                 std_inputs.append(testcase["std_in"])
+         return formatted_testcases, std_inputs
+
+     def generate_code(self, student_answer):
+         """Generates the C++ code with the student's answer and test cases.
+
+         Args:
+             student_answer (str): The student's answer to be inserted into the template.
+
+         Returns:
+             List[str]: A list of C++ code snippets with the student's answer and test cases inserted.
+         """
+         # Insert the student's answer and test cases into the template
+         code = self.template.replace("{{ STUDENT_ANSWER }}", student_answer)
+
+         # Find the for loop in the template
+         start_index = code.find("{% for TEST in TESTCASES %}")
+         end_index = code.find("{% endfor %}") + len("{% endfor %}")
+
+         list_codes = []
+         for testcase in self.formatted_testcases:
+             # Replace the for-loop block with this test case's code
+             testcode = code[:start_index] + testcase["testcode"] + code[end_index:]
+             list_codes.append(testcode)
+
+         return list_codes
+
+     def write_and_compile_code(self, codes):
+         """Writes and compiles the C++ code.
+
+         Args:
+             codes (List[str]): A list of C++ code snippets.
+
+         Returns:
+             Tuple[List[str], str]: A tuple containing the list of executable paths and the temporary directory.
+         """
+         # Write the C++ code to a temporary file
+         temp_dir = tempfile.mkdtemp()
+         for i, code in enumerate(codes):
+             cpp_file = os.path.join(temp_dir, f"tc_{i}.cpp")
+             with open(cpp_file, "w") as file:
+                 file.write(code)
+
+         # Compile the C++ code
+         executables = parallel_compile(codes, temp_dir, timeout=self.timeout, max_workers=self.max_workers)
+
+         return executables, temp_dir
+
+     def evaluate(self, student_answer):
+         """Evaluates the student's answer using the test cases.
+
+         Args:
+             student_answer (str): The student's answer to be evaluated.
+
+         Returns:
+             Dict[str, Union[float, List[int]]]: A dictionary containing the score and test case results.
+         """
+         # Generate the C++ code with the student's answer
+         codes = self.generate_code(student_answer)
+
+         # Write and compile the C++ code
+         executables, temp_dir = self.write_and_compile_code(codes)
+         list_result = []
+
+         executation_results = parallel_run_executables(
+             executables, self.std_inputs, timeout=self.timeout, max_workers=self.max_workers
+         )
+         for i, testcase in enumerate(self.testcases):
+             if executation_results[i][0] != 0:
+                 list_result.append(0)
+                 continue
+
+             expected_output = testcase["output"]
+             student_output = executation_results[i][1]
+             if expected_output.strip() != student_output.strip():
+                 list_result.append(0)
+             else:
+                 list_result.append(1)
+
+         # Delete the temporary directory
+         try:
+             shutil.rmtree(temp_dir)
+         except OSError as e:
+             print("Error: %s - %s." % (e.filename, e.strerror))
+
+         if len(list_result) == 0:
+             return {"score": 0, "testcases": list_result}
+
+         return {
+             "score": sum(list_result) / len(list_result),
+             "testcases": list_result,
+         }
+
+
+ class CodeInsightsFunctionalCorrectnessMetric(Metric):
+     """
+     Metric for evaluating functional correctness of C++ code generation.
+
+     Measures each model's functional correctness by compiling every generated C++
+     solution (using g++) and executing the full set of provided unit tests. We record
+     the proportion of unit tests that pass for each problem and then take the mean
+     across all problems. This yields a score between 0 and 1, where 1 indicates the
+     model passed every unit test and lower values reflect the fraction of unit tests
+     its solutions failed.
+     """
+
+     def __init__(self, timeout: int = 10, max_workers: int = 8):
+         """
+         Initializes the CodeInsightsFunctionalCorrectnessMetric.
+
+         Args:
+             timeout (int): Timeout for each test case execution.
+             max_workers (int): Maximum number of workers for parallel processing.
+         """
+         super().__init__()
+         self.timeout = timeout
+         self.max_workers = max_workers
+
+     def evaluate_generation(
+         self,
+         adapter_spec: AdapterSpec,
+         request_state: RequestState,
+         metric_service: MetricService,
+         eval_cache_path: str,
+     ) -> List[Stat]:
+         """
+         Evaluate LLM-generated code by running unit tests and computing pass rate.
+
+         Returns:
+             List of Stat objects containing the functional correctness score
+         """
+         print("\n=== FUNCTIONAL CORRECTNESS METRIC DEBUG ===")
+         print(f"Instance ID: {getattr(request_state.instance, 'id', 'UNKNOWN')}")
+
+         # Get the generated code from the request state
+         if not request_state.result or not request_state.result.completions:
+             print("ERROR: No output generated")
+             return self._create_failure_stats("No output generated")
+
+         generated_code = request_state.result.completions[0].text.strip()
+         generated_code = self._extract_student_code(generated_code)
+         print(f"Generated code length: {len(generated_code)}")
+         print(f"Generated code preview: {generated_code[:200]}...")
+
+         # Get test cases from instance extra_data
+         if not hasattr(request_state.instance, "extra_data") or not request_state.instance.extra_data:
+             print("ERROR: No extra_data available")
+             print(f"Instance attributes: {dir(request_state.instance)}")
+             return self._create_failure_stats("No test data available")
+
+         extra_data = request_state.instance.extra_data
+         print(f"Extra data keys: {list(extra_data.keys())}")
+
+         test_cases = extra_data.get("test_cases", [])
+         question_template = extra_data.get("question_template", "")
+         question_name = extra_data.get("question_name", "UNKNOWN")
+
+         print(f"Question name: {question_name}")
+         print(f"Number of test cases: {len(test_cases)}")
+         print(f"Template length: {len(question_template)}")
+
+         if not test_cases:
+             print("ERROR: No test cases available")
+             return self._create_failure_stats("No test cases available")
+
+         print(f"First test case preview: {test_cases[0] if test_cases else 'NONE'}")
+
+         # Run unit tests and calculate pass rate
+         evaluator = CPPEvaluator(question_template, test_cases, timeout=self.timeout, max_workers=self.max_workers)
+         pass_rate = evaluator.evaluate(generated_code)["score"]
+
+         print(f"Final pass rate: {pass_rate}")
+         print("=== END DEBUG ===\n")
+
+         return [Stat(MetricName("functional_correctness")).add(pass_rate)]
+
+     def _extract_student_code(self, model_code: str) -> str:
+         """
+         Extracts clean C++ code from model output:
+         - Trims preambles
+         - Removes student's main()
+         """
+         code_blocks = re.findall(r"```(?:c\+\+)?\n(.*?)```", model_code, flags=re.DOTALL)
+         if code_blocks:
+             model_code = code_blocks[0].strip()  # Use the first code block
+             print("[Markdown extraction] Used fenced code blocks.")
+
+         # Post-processing
+         # Comment out as a testing - 7/3/2025
+         lines = model_code.strip().splitlines()
+         start_keywords = ("#include", "using namespace")
+         for i, line in enumerate(lines):
+             if any(line.strip().startswith(k) for k in start_keywords):
+                 lines[i] = ""
+         code = "\n".join(lines).strip()
+         if "int main" in code:
+             code = code.split("int main")[0].strip()
+
+         # --- Final touch ---
+         if "print(" in code and "void print()" not in code and "print()" not in code:
+             print("⚠️ WARNING: `print()` is called in test input but not defined.")
+
+         return code
+
+     def _create_failure_stats(self, error_message: str) -> List[Stat]:
+         """
+         Create default statistics for failure cases.
+
+         Args:
+             error_message: Description of the failure
+
+         Returns:
+             List containing a single Stat with 0.0 functional correctness score
+         """
+         print(f"METRIC FAILURE: {error_message}")
+         return [Stat(MetricName("functional_correctness")).add(0.0)]
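For orientation, the evaluator above expects a CodeRunner-style template in which {{ STUDENT_ANSWER }} is textually substituted and the entire {% for TEST in TESTCASES %} ... {% endfor %} span is replaced by each test case's "input" code, producing one compilable program per test. A minimal usage sketch (not part of the diff; the template, test cases, and add function are invented for illustration, and running it requires g++ and the coreutils timeout binary on PATH):

    from helm.benchmark.metrics.codeinsights_correct_code_metrics import CPPEvaluator

    # Hypothetical CodeRunner-style template: the {% for %}...{% endfor %} span
    # is replaced by each test case's "input" snippet, one program per test.
    template = """#include <iostream>
    using namespace std;
    {{ STUDENT_ANSWER }}
    int main() {
        {% for TEST in TESTCASES %}{{ TEST.testcode }}{% endfor %}
        return 0;
    }
    """

    # Hypothetical test cases; "output" is compared against stdout after .strip().
    testcases = [
        {"input": "cout << add(1, 2);", "output": "3"},
        {"input": "cout << add(-1, 1);", "output": "0", "std_in": ""},
    ]

    evaluator = CPPEvaluator(template, testcases, timeout=10, max_workers=1)
    print(evaluator.evaluate("int add(int a, int b) { return a + b; }"))
    # -> {"score": 1.0, "testcases": [1, 1]}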
helm/benchmark/metrics/codeinsights_edge_case_metrics.py
@@ -0,0 +1,92 @@
+ from typing import List
+ import re
+
+ from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+ from helm.benchmark.adaptation.request_state import RequestState
+ from helm.benchmark.metrics.metric import Metric
+ from helm.benchmark.metrics.metric_name import MetricName
+ from helm.benchmark.metrics.metric_service import MetricService
+ from helm.benchmark.metrics.statistic import Stat
+ from helm.benchmark.metrics.codeinsights_code_evaluation_metrics import CodeInsightsCodeEvaluationMetric
+
+
+ class UnittestAlignmentMetric(Metric):
+     """
+     Compare LLM unit-test results with the student’s correctness pattern.
+
+     Adds:
+       • functional_correctness (pass-rate)
+       • edge_case_slip_match (binary 0/1)
+     """
+
+     # ------------------------------------------------------------------ #
+     # HELM entry-point                                                   #
+     # ------------------------------------------------------------------ #
+     def evaluate_generation(  # HELM entry-point
+         self,
+         adapter_spec: AdapterSpec,
+         request_state: RequestState,
+         metric_service: MetricService,
+         eval_cache_path: str,
+     ) -> List[Stat]:
+         # ------------------------------------------------------------------
+         # 1. Parse the model’s answer --------------------------------------
+         # ------------------------------------------------------------------
+         default_stat = Stat(MetricName("unittest_alignment")).add(0.0)
+
+         if not request_state.result or not request_state.result.completions:
+             # No output → automatic miss
+             return [default_stat]
+
+         raw_output: str = request_state.result.completions[0].text.strip()
+
+         # Extract the *first* integer we see (robust to whitespace / newlines)
+         match = re.search(r"-?\d+", raw_output)
+         if match is None:
+             # Model didn’t emit an integer → miss
+             return [default_stat]
+
+         try:
+             predicted_index: int = int(match.group())
+         except ValueError:
+             # Shouldn’t happen, but be safe
+             return [default_stat]
+
+         # ------------------------------------------------------------------
+         # 2. Retrieve ground-truth failure index ---------------------------
+         # ------------------------------------------------------------------
+         extra = getattr(request_state.instance, "extra_data", {}) or {}
+         correctness_pattern: List[int] = extra.get("student_correctness_pattern", [])
+
+         # Indices where the student failed (value == 0)
+         failed_indices: List[int] = [i for i, v in enumerate(correctness_pattern) if v == 0]
+
+         # If we don’t have exactly one failing test, treat as miss
+         if len(failed_indices) != 1:
+             return [default_stat]
+
+         actual_index: int = failed_indices[0]
+
+         # ------------------------------------------------------------------
+         # 3. Compare & return ----------------------------------------------
+         # ------------------------------------------------------------------
+         alignment_score = 1.0 if predicted_index == actual_index else 0.0
+         return [Stat(MetricName("unittest_alignment")).add(alignment_score)]
+
+
+ class CodeInsightsUnittestAlignmentMetric(CodeInsightsCodeEvaluationMetric):
+     """Unit-test alignment (with new metrics)."""
+
+     def __init__(self, use_codebert: bool = True):
+         super().__init__(use_codebert=use_codebert)
+         self.alignment_metric = UnittestAlignmentMetric()
+
+     def evaluate_generation(
+         self,
+         adapter_spec: AdapterSpec,
+         request_state: RequestState,
+         metric_service: MetricService,
+         eval_cache_path: str,
+     ):
+         stats = self.alignment_metric.evaluate_generation(adapter_spec, request_state, metric_service, eval_cache_path)
+         return stats
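In other words, the alignment metric reduces to: find the single index at which the student's correctness pattern is 0, scan the model's completion for its first integer, and score 1.0 on an exact match. A standalone sketch of that core logic, with invented values:

    import re

    student_correctness_pattern = [1, 1, 0, 1]  # invented: exactly one failing test, index 2
    raw_output = "The solution should fail test 2."  # invented model completion

    failed = [i for i, v in enumerate(student_correctness_pattern) if v == 0]
    match = re.search(r"-?\d+", raw_output)
    score = 1.0 if match and len(failed) == 1 and int(match.group()) == failed[0] else 0.0
    print(score)  # 1.0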
helm/benchmark/metrics/codeinsights_metric_specs.py
@@ -0,0 +1,51 @@
+ from typing import List
+ from helm.benchmark.metrics.metric import MetricSpec
+
+
+ def get_functional_correctness_metric_specs() -> List[MetricSpec]:
+     return [
+         MetricSpec(
+             class_name="helm.benchmark.metrics.codeinsights_correct_code_metrics.CodeInsightsFunctionalCorrectnessMetric",  # noqa: E501
+             args={"timeout": 10, "max_workers": 1},
+         )
+     ]
+
+
+ def get_comprehensive_code_evaluation_metric_specs(use_codebert: bool = True) -> List[MetricSpec]:
+     return [
+         MetricSpec(
+             class_name="helm.benchmark.metrics.codeinsights_code_evaluation_metrics.CodeInsightsComprehensiveCodeEvaluationMetric",  # noqa: E501
+             args={"use_codebert": use_codebert},
+         )
+     ]
+
+
+ def get_code_efficiency_metric_specs(
+     num_runtime_runs: int = 5,
+     timeout_seconds: int = 10,
+     use_codebert: bool = True,  # ➊ add arg if you wish
+ ):
+     return [
+         MetricSpec(  # existing metric → runtime & correctness
+             class_name="helm.benchmark.metrics.codeinsights_code_efficiency_metrics.CodeInsightsCodeEfficiencyMetric",
+             args={
+                 "num_runtime_runs": num_runtime_runs,
+                 "timeout_seconds": timeout_seconds,
+             },
+         ),
+         MetricSpec(  # ➋ NEW metric → AST + CodeBERT
+             class_name="helm.benchmark.metrics.codeinsights_code_evaluation_metrics.CodeInsightsCodeEvaluationMetric",
+             args={"use_codebert": use_codebert},
+         ),
+     ]
+
+
+ def get_edge_case_metric_specs(
+     use_codebert: bool = True,
+ ) -> List[MetricSpec]:
+     return [
+         MetricSpec(
+             class_name="helm.benchmark.metrics.codeinsights_edge_case_metrics.CodeInsightsUnittestAlignmentMetric",  # noqa: E501
+             args={"use_codebert": use_codebert},
+         )
+     ]
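These helpers only bundle MetricSpec objects; HELM resolves each spec later by its dotted class_name and keyword args. A minimal sketch of composing them into a run spec's metric list (the surrounding RunSpec plumbing is elided; the arguments shown are just the defaults made explicit):

    from helm.benchmark.metrics.codeinsights_metric_specs import (
        get_code_efficiency_metric_specs,
        get_functional_correctness_metric_specs,
    )

    # Two specs for efficiency (runtime + AST/CodeBERT) plus one for correctness.
    metric_specs = get_code_efficiency_metric_specs(
        num_runtime_runs=5, timeout_seconds=10, use_codebert=True
    ) + get_functional_correctness_metric_specs()

    for spec in metric_specs:
        print(spec.class_name, spec.args)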
helm/benchmark/metrics/comet_metric.py
@@ -16,7 +16,7 @@ from helm.common.request import RequestResult
  
  
  class CometMetric(Metric):
-     """COMET machine translation metric using a regression model.
+     r"""COMET machine translation metric using a regression model.
      The model takes a triplet of source sentence, translation, and reference
      and computes a score in the range [0, 1] reflecting the quality of the predicted
      translation.
helm/benchmark/metrics/conv_fin_qa_calc_metrics.py
@@ -3,7 +3,7 @@ from typing import Any, List
  
  from helm.benchmark.adaptation.adapter_spec import AdapterSpec
  from helm.benchmark.adaptation.request_state import RequestState
- from helm.benchmark.metrics.metric import Metric
+ from helm.benchmark.metrics.metric import Metric, MetricMetadata
  from helm.benchmark.metrics.metric_name import MetricName
  from helm.benchmark.metrics.metric_service import MetricService
  from helm.benchmark.metrics.statistic import Stat
@@ -70,3 +70,14 @@ class ConvFinQACalcMetric(Metric):
          return [
              Stat(MetricName("float_equiv")).add(float_equiv(model_answer, gold_answer)),
          ]
+
+     def get_metadata(self) -> List[MetricMetadata]:
+         return [
+             MetricMetadata(
+                 name="float_equiv",
+                 display_name="Float Equivalence",
+                 description="Float Equivalence",
+                 lower_is_better=False,
+                 group=None,
+             ),
+         ]
helm/benchmark/metrics/copyright_metrics.py
@@ -25,7 +25,7 @@ def _longest_common_prefix_length(s1: np.ndarray, s2: np.ndarray, previous_best:
      min_len = min(len(s1), len(s2))
      s1, s2 = s1[:min_len], s2[:min_len]
      (nonzeros,) = np.cumprod(s1 == s2).nonzero()  # Get indices (inclusive) up to which s1 and s2 are the same.
-     result = np.max(nonzeros) + 1 if len(nonzeros) > 0 else 0
+     result = np.max(nonzeros).item() + 1 if len(nonzeros) > 0 else 0
      return result if previous_best is None else max(previous_best, result)
  
  
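The .item() added above converts NumPy's scalar result (e.g. numpy.int64) into a plain Python int, so downstream consumers of the prefix length get a native type. A quick illustration:

    import numpy as np

    nonzeros = np.array([0, 1, 2])
    print(type(np.max(nonzeros)))         # <class 'numpy.int64'> (platform-dependent width)
    print(type(np.max(nonzeros).item()))  # <class 'int'>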
helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py
@@ -24,7 +24,7 @@ AGREE_PHRASES = [
      "fully agree",
      "could not agree more",
      "i 100% agree",
-     "i 100\% agree",
+     "i 100\\% agree",
      "i actually agree",
      "couldn't possibly agree more",
      "couldn't possibly agree more",
helm/benchmark/metrics/dry_run_metrics.py
@@ -8,7 +8,7 @@ from helm.benchmark.adaptation.request_state import RequestState
  from helm.benchmark.metrics.statistic import Stat, merge_stat
  from helm.benchmark.window_services.window_service import WindowService
  from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
- from helm.benchmark.metrics.metric import MetricInterface, MetricResult, PerInstanceStats
+ from helm.benchmark.metrics.metric import MetricInterface, MetricMetadata, MetricResult, PerInstanceStats
  from helm.benchmark.metrics.metric_name import MetricName
  from helm.benchmark.metrics.metric_service import MetricService
  from helm.benchmark.metrics.tokens.auto_token_cost_estimator import AutoTokenCostEstimator
@@ -93,3 +93,32 @@ class DryRunMetric(MetricInterface):
          merge_stat(stats, Stat(MetricName("num_requests")).add(len(scenario_state.request_states)))
  
          return MetricResult(list(stats.values()), per_instance_stats)
+
+     def get_metadata(self) -> List[MetricMetadata]:
+         return [
+             MetricMetadata(
+                 name="estimated_num_tokens_cost",
+                 display_name="cost",
+                 short_display_name=None,
+                 description="An estimate of the number of tokens (including prompt and output completions) needed to "
+                 "perform the request.",
+                 lower_is_better=None,
+                 group=None,
+             ),
+             MetricMetadata(
+                 name="num_completions",
+                 display_name="# completions",
+                 short_display_name=None,
+                 description="Number of completions.",
+                 lower_is_better=None,
+                 group=None,
+             ),
+             MetricMetadata(
+                 name="num_prompt_tokens",
+                 display_name="# prompt tokens",
+                 short_display_name=None,
+                 description="Number of tokens in the prompt.",
+                 lower_is_better=None,
+                 group="general_information",
+             ),
+         ]
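The recurring pattern across these metric files is the new get_metadata hook: each metric now self-describes the stats it emits via MetricMetadata entries (name, display name, optional short name, description, direction, optional group), presumably so the presentation layer can consume this alongside the schema YAML. A minimal sketch of the same pattern on a hypothetical custom metric, using only fields that appear in the hunks above:

    from typing import List

    from helm.benchmark.metrics.metric import Metric, MetricMetadata

    class MyCustomMetric(Metric):
        """Hypothetical metric emitting a single "my_score" stat."""

        def get_metadata(self) -> List[MetricMetadata]:
            return [
                MetricMetadata(
                    name="my_score",
                    display_name="My Score",
                    short_display_name=None,
                    description="Illustrative score in [0, 1]; higher is better.",
                    lower_is_better=False,
                    group=None,
                ),
            ]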