crfm-helm 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crfm-helm might be problematic; see the registry's advisory page for details.

Files changed (652):
  1. crfm_helm-0.5.6.dist-info/METADATA +427 -0
  2. crfm_helm-0.5.6.dist-info/RECORD +941 -0
  3. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +4 -4
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  22. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  23. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  24. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  25. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  26. helm/benchmark/adaptation/prompt.py +1 -1
  27. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  28. helm/benchmark/annotation/air_bench_annotator.py +21 -6
  29. helm/benchmark/annotation/annotator.py +5 -0
  30. helm/benchmark/annotation/annotator_factory.py +3 -20
  31. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  32. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  33. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  34. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  35. helm/benchmark/annotation/chw_care_plan_annotator.py +93 -0
  36. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  37. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  38. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  39. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  40. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  41. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  42. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  43. helm/benchmark/annotation/medalign_annotator.py +100 -0
  44. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  45. helm/benchmark/annotation/medication_qa_annotator.py +87 -63
  46. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  47. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  48. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  49. helm/benchmark/annotation/model_as_judge.py +214 -6
  50. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  51. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  52. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  53. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  54. helm/benchmark/annotation/omni_math_annotator.py +131 -0
  55. helm/benchmark/annotation/spider_annotator.py +18 -0
  56. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  57. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  58. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  59. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  60. helm/benchmark/annotation_executor.py +35 -15
  61. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  62. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  63. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  64. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  65. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  66. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  67. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  68. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  69. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  70. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  71. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  72. helm/benchmark/augmentations/perturbation.py +1 -1
  73. helm/benchmark/augmentations/space_perturbation.py +2 -2
  74. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  75. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  76. helm/benchmark/augmentations/test_perturbation.py +16 -13
  77. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  78. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  79. helm/benchmark/data_preprocessor.py +2 -2
  80. helm/benchmark/executor.py +11 -12
  81. helm/benchmark/huggingface_registration.py +2 -7
  82. helm/benchmark/metrics/aci_bench_metrics.py +14 -0
  83. helm/benchmark/metrics/basic_metrics.py +6 -6
  84. helm/benchmark/metrics/bbq_metrics.py +2 -2
  85. helm/benchmark/metrics/bias_metrics.py +12 -3
  86. helm/benchmark/metrics/bias_word_lists.py +1 -1
  87. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  88. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  89. helm/benchmark/metrics/chw_care_plan_metrics.py +14 -0
  90. helm/benchmark/metrics/classification_metrics.py +76 -12
  91. helm/benchmark/metrics/cleva_harms_metrics.py +10 -9
  92. helm/benchmark/metrics/code_metrics.py +5 -5
  93. helm/benchmark/metrics/comet_metric.py +125 -0
  94. helm/benchmark/metrics/common_metric_specs.py +9 -2
  95. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  96. helm/benchmark/metrics/copyright_metrics.py +4 -4
  97. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  98. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  99. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  100. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  101. helm/benchmark/metrics/dischargeme_metrics.py +14 -0
  102. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  103. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  104. helm/benchmark/metrics/efficiency_metrics.py +6 -6
  105. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  106. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  107. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  108. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  109. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  110. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  111. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  112. helm/benchmark/metrics/ifeval/__init__.py +0 -0
  113. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  114. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  115. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  116. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  117. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  118. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  120. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  121. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  122. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  123. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  124. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  125. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  126. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  127. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  128. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  129. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  130. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  131. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  132. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  133. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  134. helm/benchmark/metrics/med_dialog_metrics.py +14 -0
  135. helm/benchmark/metrics/medalign_metrics.py +14 -0
  136. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  137. helm/benchmark/metrics/medec_metrics.py +101 -0
  138. helm/benchmark/metrics/medi_qa_metrics.py +14 -0
  139. helm/benchmark/metrics/medication_qa_metrics.py +10 -19
  140. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  141. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  142. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  143. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  144. helm/benchmark/metrics/mental_health_metrics.py +14 -0
  145. helm/benchmark/metrics/metric.py +3 -3
  146. helm/benchmark/metrics/metric_service.py +11 -11
  147. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  148. helm/benchmark/metrics/mimic_rrs_metrics.py +14 -0
  149. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  150. helm/benchmark/metrics/mtsamples_procedures_metrics.py +14 -0
  151. helm/benchmark/metrics/mtsamples_replicate_metrics.py +14 -0
  152. helm/benchmark/metrics/nltk_helper.py +32 -0
  153. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  154. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  155. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  156. helm/benchmark/metrics/output_processing_metric.py +60 -0
  157. helm/benchmark/metrics/output_processors.py +15 -0
  158. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  159. helm/benchmark/metrics/ranking_metrics.py +3 -3
  160. helm/benchmark/metrics/reference_metric.py +3 -3
  161. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  162. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  163. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  164. helm/benchmark/metrics/spider_metrics.py +7 -0
  165. helm/benchmark/metrics/starr_patient_instructions_metrics.py +14 -0
  166. helm/benchmark/metrics/statistic.py +1 -1
  167. helm/benchmark/metrics/summac/model_summac.py +2 -3
  168. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  169. helm/benchmark/metrics/summarization_metrics.py +20 -9
  170. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  171. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  172. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  173. helm/benchmark/metrics/test_metric.py +1 -1
  174. helm/benchmark/metrics/test_statistic.py +2 -2
  175. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  176. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  177. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  178. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  179. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  180. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  181. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  182. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +3 -3
  183. helm/benchmark/metrics/toxicity_metrics.py +6 -6
  184. helm/benchmark/metrics/unitxt_metrics.py +7 -5
  185. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  186. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  187. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  188. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  189. helm/benchmark/model_deployment_registry.py +6 -8
  190. helm/benchmark/model_metadata_registry.py +16 -0
  191. helm/benchmark/presentation/contamination.py +3 -3
  192. helm/benchmark/presentation/create_plots.py +33 -12
  193. helm/benchmark/presentation/run_display.py +13 -0
  194. helm/benchmark/presentation/schema.py +2 -1
  195. helm/benchmark/presentation/summarize.py +97 -67
  196. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  197. helm/benchmark/reeval_run.py +202 -0
  198. helm/benchmark/reeval_runner.py +355 -0
  199. helm/benchmark/run.py +86 -90
  200. helm/benchmark/run_expander.py +90 -9
  201. helm/benchmark/run_spec_factory.py +13 -0
  202. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  203. helm/benchmark/run_specs/audio_run_specs.py +657 -0
  204. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  205. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  206. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  207. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  208. helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
  209. helm/benchmark/run_specs/experimental_run_specs.py +142 -3
  210. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  211. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  212. helm/benchmark/run_specs/long_context_run_specs.py +141 -0
  213. helm/benchmark/run_specs/medhelm_run_specs.py +1260 -0
  214. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  215. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  216. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  217. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  218. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
  219. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
  220. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  221. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  222. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  223. helm/benchmark/run_specs/vlm_run_specs.py +103 -2
  224. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  225. helm/benchmark/runner.py +5 -5
  226. helm/benchmark/scenarios/aci_bench_scenario.py +126 -0
  227. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  228. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  229. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  230. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  231. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
  232. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  233. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  234. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  235. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  236. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  237. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  238. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  239. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  240. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  241. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  242. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  243. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  244. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  245. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  246. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  247. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  248. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  249. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  250. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  251. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  252. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
  253. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
  254. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  255. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
  256. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
  257. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  258. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
  259. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  260. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  261. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  262. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  263. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  264. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  265. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  266. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  267. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  268. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  269. helm/benchmark/scenarios/bold_scenario.py +1 -1
  270. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  271. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  272. helm/benchmark/scenarios/chw_care_plan_scenario.py +106 -0
  273. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  274. helm/benchmark/scenarios/clear_scenario.py +157 -0
  275. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  276. helm/benchmark/scenarios/code_scenario.py +17 -4
  277. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  278. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  279. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  280. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  281. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  282. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  283. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  284. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  285. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  286. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  287. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  288. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  289. helm/benchmark/scenarios/dischargeme_scenario.py +172 -0
  290. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  291. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  292. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  293. helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
  294. helm/benchmark/scenarios/ehrshot_scenario.py +1519 -0
  295. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  296. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  297. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  298. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  299. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  300. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  301. helm/benchmark/scenarios/grammar.py +2 -2
  302. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  303. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  304. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  305. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  306. helm/benchmark/scenarios/headqa_scenario.py +136 -0
  307. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  308. helm/benchmark/scenarios/ice_scenario.py +8 -4
  309. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  310. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  311. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  312. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  313. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +79 -0
  314. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  315. helm/benchmark/scenarios/koala_scenario.py +1 -1
  316. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  317. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  318. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  319. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  320. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  321. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  322. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  323. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  324. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  325. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  326. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  327. helm/benchmark/scenarios/math_scenario.py +9 -1
  328. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  329. helm/benchmark/scenarios/med_dialog_scenario.py +25 -22
  330. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  331. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  332. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  333. helm/benchmark/scenarios/medalign_scenario.py +94 -0
  334. helm/benchmark/scenarios/medalign_scenario_helper.py +432 -0
  335. helm/benchmark/scenarios/medbullets_scenario.py +145 -0
  336. helm/benchmark/scenarios/medcalc_bench_scenario.py +127 -0
  337. helm/benchmark/scenarios/medec_scenario.py +125 -0
  338. helm/benchmark/scenarios/medhallu_scenario.py +72 -0
  339. helm/benchmark/scenarios/medi_qa_scenario.py +111 -0
  340. helm/benchmark/scenarios/medication_qa_scenario.py +8 -2
  341. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  342. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  343. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  344. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  345. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  346. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  347. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  348. helm/benchmark/scenarios/mental_health_scenario.py +123 -0
  349. helm/benchmark/scenarios/mimic_bhc_scenario.py +103 -0
  350. helm/benchmark/scenarios/mimic_rrs_scenario.py +98 -0
  351. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +77 -0
  352. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  353. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  354. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  355. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  356. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +144 -0
  357. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +142 -0
  358. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +277 -0
  359. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  360. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  361. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  362. helm/benchmark/scenarios/numeracy_scenario.py +12 -2
  363. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  364. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  365. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  366. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  367. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  368. helm/benchmark/scenarios/pubmed_qa_scenario.py +59 -43
  369. helm/benchmark/scenarios/quac_scenario.py +10 -1
  370. helm/benchmark/scenarios/race_based_med_scenario.py +152 -0
  371. helm/benchmark/scenarios/raft_scenario.py +17 -2
  372. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  373. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  374. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  375. helm/benchmark/scenarios/scenario.py +9 -1
  376. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
  377. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  378. helm/benchmark/scenarios/shc_bmt_scenario.py +75 -0
  379. helm/benchmark/scenarios/shc_cdi_scenario.py +75 -0
  380. helm/benchmark/scenarios/shc_conf_scenario.py +76 -0
  381. helm/benchmark/scenarios/shc_ent_scenario.py +77 -0
  382. helm/benchmark/scenarios/shc_gip_scenario.py +74 -0
  383. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  384. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  385. helm/benchmark/scenarios/shc_ptbm_scenario.py +81 -0
  386. helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
  387. helm/benchmark/scenarios/shc_sequoia_scenario.py +77 -0
  388. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  389. helm/benchmark/scenarios/spider_scenario.py +91 -0
  390. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +97 -0
  391. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  392. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  393. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  394. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  395. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  396. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  397. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  398. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  399. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  400. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  401. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  402. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  403. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  404. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  405. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  406. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  407. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  408. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  409. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  410. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  411. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  412. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  413. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  414. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  415. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  416. helm/benchmark/scenarios/truthful_qa_scenario.py +12 -2
  417. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  418. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  419. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  420. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  421. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  422. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  423. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  424. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  425. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  426. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  427. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  428. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  429. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  430. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  431. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  432. helm/benchmark/server.py +13 -1
  433. helm/benchmark/slurm_runner.py +1 -1
  434. helm/benchmark/static/schema_audio.yaml +763 -0
  435. helm/benchmark/static/schema_autobencher.yaml +150 -0
  436. helm/benchmark/static/schema_call_center.yaml +97 -60
  437. helm/benchmark/static/{schema_medical.yaml → schema_capabilities.yaml} +100 -101
  438. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  439. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  440. helm/benchmark/static/schema_enterprise.yaml +319 -0
  441. helm/benchmark/static/schema_finance.yaml +14 -12
  442. helm/benchmark/static/schema_heim.yaml +1389 -0
  443. helm/benchmark/static/schema_long_context.yaml +283 -0
  444. helm/benchmark/static/schema_medhelm.yaml +1140 -0
  445. helm/benchmark/static/schema_melt.yaml +1257 -0
  446. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  447. helm/benchmark/static/schema_safety.yaml +18 -1
  448. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
  449. helm/benchmark/static/schema_slphelm.yaml +162 -0
  450. helm/benchmark/static/schema_social_audio.yaml +224 -0
  451. helm/benchmark/static/schema_sql.yaml +171 -0
  452. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
  453. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  454. helm/benchmark/static/schema_vhelm.yaml +129 -56
  455. helm/benchmark/static/schema_video.yaml +219 -0
  456. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  457. helm/benchmark/static_build/assets/index-94295e78.js +10 -0
  458. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  459. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  460. helm/benchmark/static_build/assets/medhelm-v1-overview-3ddfcd65.png +0 -0
  461. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  462. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  463. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-38a10867.js} +2 -2
  464. helm/benchmark/static_build/config.js +1 -1
  465. helm/benchmark/static_build/index.html +6 -6
  466. helm/benchmark/window_services/default_window_service.py +1 -1
  467. helm/benchmark/window_services/encoder_decoder_window_service.py +4 -4
  468. helm/benchmark/window_services/ice_window_service.py +1 -1
  469. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  470. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  471. helm/benchmark/window_services/local_window_service.py +2 -2
  472. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  473. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  474. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  475. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  476. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  477. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  478. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  479. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  480. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  481. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  482. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  483. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  484. helm/benchmark/window_services/test_utils.py +4 -5
  485. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  486. helm/benchmark/window_services/tokenizer_service.py +7 -8
  487. helm/benchmark/window_services/yalm_window_service.py +1 -1
  488. helm/clients/ai21_client.py +3 -3
  489. helm/clients/aleph_alpha_client.py +1 -1
  490. helm/clients/anthropic_client.py +69 -29
  491. helm/clients/audio_language/__init__.py +0 -0
  492. helm/clients/audio_language/diva_llama_client.py +120 -0
  493. helm/clients/audio_language/llama_omni_client.py +198 -0
  494. helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
  495. helm/clients/audio_language/qwen2_audiolm_client.py +190 -0
  496. helm/clients/audio_language/qwen_audiolm_client.py +152 -0
  497. helm/clients/audio_language/test.py +62 -0
  498. helm/clients/auto_client.py +4 -2
  499. helm/clients/azure_openai_client.py +55 -0
  500. helm/clients/bedrock_client.py +203 -7
  501. helm/clients/bedrock_utils.py +33 -0
  502. helm/clients/client.py +7 -7
  503. helm/clients/clip_scorers/clip_scorer.py +1 -1
  504. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  505. helm/clients/cohere_client.py +3 -3
  506. helm/clients/google_client.py +1 -1
  507. helm/clients/grok_client.py +36 -0
  508. helm/clients/http_model_client.py +1 -1
  509. helm/clients/huggingface_client.py +52 -21
  510. helm/clients/huggingface_pipeline_client.py +138 -0
  511. helm/clients/ibm_client.py +267 -0
  512. helm/clients/image_generation/adobe_vision_client.py +1 -1
  513. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  514. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  515. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  516. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  517. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  518. helm/clients/image_generation/cogview2_client.py +1 -1
  519. helm/clients/image_generation/dalle2_client.py +1 -1
  520. helm/clients/image_generation/dalle3_client.py +2 -2
  521. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  522. helm/clients/image_generation/dalle_mini/data.py +1 -1
  523. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  524. helm/clients/image_generation/dalle_mini/model/configuration.py +2 -2
  525. helm/clients/image_generation/dalle_mini/model/modeling.py +3 -3
  526. helm/clients/image_generation/dalle_mini/model/processor.py +5 -5
  527. helm/clients/image_generation/dalle_mini/model/tokenizer.py +2 -2
  528. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  529. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  530. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  531. helm/clients/image_generation/dalle_mini_client.py +1 -1
  532. helm/clients/image_generation/deep_floyd_client.py +1 -1
  533. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  534. helm/clients/image_generation/lexica_client.py +1 -1
  535. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  536. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  537. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  538. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  539. helm/clients/image_generation/mindalle_client.py +1 -1
  540. helm/clients/image_generation/together_image_generation_client.py +1 -1
  541. helm/clients/lit_gpt_client.py +2 -2
  542. helm/clients/mistral_client.py +62 -18
  543. helm/clients/nvidia_nim_client.py +0 -3
  544. helm/clients/openai_client.py +308 -43
  545. helm/clients/openai_responses_client.py +174 -0
  546. helm/clients/palmyra_client.py +3 -9
  547. helm/clients/reka_client.py +3 -3
  548. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  549. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  550. helm/clients/stanfordhealthcare_google_client.py +43 -0
  551. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  552. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  553. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  554. helm/clients/test_client.py +1 -1
  555. helm/clients/test_together_client.py +6 -1
  556. helm/clients/together_client.py +76 -9
  557. helm/clients/upstage_client.py +23 -0
  558. helm/clients/vertexai_client.py +45 -13
  559. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  560. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  561. helm/clients/vision_language/idefics_client.py +6 -2
  562. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  563. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  564. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  565. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  566. helm/clients/vision_language/paligemma_client.py +2 -2
  567. helm/clients/vision_language/qwen2_vlm_client.py +188 -0
  568. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  569. helm/clients/vllm_client.py +4 -6
  570. helm/clients/writer_client.py +102 -0
  571. helm/clients/yi_client.py +0 -3
  572. helm/common/audio_utils.py +111 -0
  573. helm/common/context.py +80 -0
  574. helm/common/credentials_utils.py +5 -5
  575. helm/common/file_caches/local_file_cache.py +1 -1
  576. helm/common/file_caches/test_local_file_cache.py +1 -1
  577. helm/common/general.py +9 -2
  578. helm/common/hierarchical_logger.py +46 -3
  579. helm/common/images_utils.py +2 -2
  580. helm/common/local_context.py +140 -0
  581. helm/common/media_object.py +2 -2
  582. helm/common/multimodal_request_utils.py +26 -0
  583. helm/common/reeval_parameters.py +12 -0
  584. helm/common/remote_context.py +61 -0
  585. helm/common/request.py +14 -2
  586. helm/common/response_format.py +18 -0
  587. helm/common/test_media_object.py +1 -1
  588. helm/config/model_deployments.yaml +1792 -28
  589. helm/config/model_metadata.yaml +1606 -51
  590. helm/config/tokenizer_configs.yaml +521 -4
  591. helm/proxy/cli.py +5 -3
  592. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  593. helm/proxy/example_queries.py +1 -1
  594. helm/proxy/server.py +11 -4
  595. helm/proxy/services/remote_service.py +1 -1
  596. helm/proxy/services/server_service.py +22 -86
  597. helm/proxy/services/test_remote_service.py +2 -2
  598. helm/proxy/services/test_service.py +1 -1
  599. helm/proxy/static/general.js +122 -0
  600. helm/proxy/static/help.html +99 -0
  601. helm/proxy/static/index.css +57 -0
  602. helm/proxy/static/index.html +40 -0
  603. helm/proxy/static/index.js +456 -0
  604. helm/proxy/static/info-icon.png +0 -0
  605. helm/proxy/test_retry.py +1 -1
  606. helm/proxy/token_counters/auto_token_counter.py +1 -1
  607. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  608. helm/tokenizers/caching_tokenizer.py +2 -30
  609. helm/tokenizers/grok_tokenizer.py +53 -0
  610. helm/tokenizers/http_model_tokenizer.py +1 -1
  611. helm/tokenizers/huggingface_tokenizer.py +3 -3
  612. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  613. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  614. helm/tokenizers/test_grok_tokenizer.py +33 -0
  615. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  616. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  617. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  618. helm/tokenizers/tokenizer.py +3 -1
  619. helm/tokenizers/yalm_tokenizer.py +3 -3
  620. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  621. crfm_helm-0.5.4.dist-info/METADATA +0 -350
  622. crfm_helm-0.5.4.dist-info/RECORD +0 -697
  623. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  624. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  625. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  626. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  627. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  628. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  629. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  630. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  631. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  632. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  633. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  634. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  635. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  636. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  637. helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
  638. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  639. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  640. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  641. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  642. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  643. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  644. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  645. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  646. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  647. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  648. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  649. helm/tokenizers/anthropic_tokenizer.py +0 -52
  650. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
  651. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info/licenses}/LICENSE +0 -0
  652. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
helm/benchmark/scenarios/shc_sei_scenario.py
@@ -0,0 +1,94 @@
+ import sys
+ import csv
+ from typing import Dict, List
+
+ from helm.benchmark.scenarios.scenario import (
+     Input,
+     Scenario,
+     Instance,
+     TEST_SPLIT,
+     CORRECT_TAG,
+     Reference,
+     Output,
+ )
+ from helm.common.general import check_file_exists
+
+ csv.field_size_limit(sys.maxsize)
+
+
+ class SHCSEIMedScenario(Scenario):
+     """
+     This dataset contains clinical notes from primary care visit encounters
+     (in-person/telehealth and telephone) of children ages 6-11 years old with ADHD
+     seen at Stanford's community-based primary care network, Packard Children's Health Alliance,
+     between 2015-2022. All children in this dataset were prescribed at least once an ADHD
+     medication (stimulants or non-stimulants) by a primary care clinician. In this
+     classification task, the LLM is tasked with classifying whether the note contains
+     documentation of side effect monitoring (recording of absence or presence of
+     medication side effects), as recommended in clinical practice guidelines.
+     From publication: https://doi.org/10.1542/peds.2024-067223
+     """
+
+     name = "shc_sei_med"
+     description = (
+         "ADHD-MedEffects is a benchmark designed to evaluate whether clinical notes for"
+         "pediatric ADHD visits document medication side effect monitoring, which is a key recommendation"
+         "in clinical practice guidelines. The dataset supports binary classification"
+         "to detect presence or absence of side effect inquiries (SEI) within notes."
+     )
+     tags = ["knowledge", "reasoning", "biomedical"]
+
+     POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
+
+     def __init__(self, data_path: str):
+         super().__init__()
+         self.data_path = data_path
+
+     def create_benchmark(self, csv_path) -> Dict[str, str]:
+         data = {}
+         with open(csv_path, "r") as file:
+             reader = csv.DictReader(file)
+             for row in reader:
+                 question = row["prompt"]
+                 context = row["context"]
+                 answer = row["label"]
+                 prompt = (
+                     "You are reviewing a clinical note from health records of children "
+                     "with attention deficit hyperactivity disorder (ADHD). Given the following "
+                     "definitions: side Effects Inquiry (SEI): Explicit documentation by the clinician "
+                     "asking about current side effects related to ADHD medications that the child is "
+                     "taking or documentation of specific ADHD medication side effects experienced "
+                     "by the patient. SEI does *not* include future side effects monitoring, "
+                     "such as documentation of potential ADHD medication side effects, including "
+                     "planning to follow patients to monitor side effects, explaining about "
+                     "potential side effects of an ADHD medication. These documentations are not "
+                     "categorized as SEI because they consist of a plan or an explanation about "
+                     "side effects without actual side effect monitoring taking place, and "
+                     "No Side Effects Inquiry (NSEI): No evidence of side effects monitoring. "
+                     f"Provide an answer to the following question: {question} with the following context: {context} "
+                     ", Answer the question with a 'A' for yes or 'B' for no. Do not provide any additional "
+                     "details or response, just a simple A or B response."
+                 )
+                 data[prompt] = answer
+         return data
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         check_file_exists(self.data_path, msg=f"[SHCSEIMedScenario] Required data file not found: '{self.data_path}'")
+         instances: List[Instance] = []
+         benchmark_data = self.create_benchmark(self.data_path)
+
+         for prompt, answer in benchmark_data.items():
+             assert answer in SHCSEIMedScenario.POSSIBLE_ANSWER_CHOICES
+             references: List[Reference] = [
+                 Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == answer else [])
+                 for pred_answer in SHCSEIMedScenario.POSSIBLE_ANSWER_CHOICES
+             ]
+             instances.append(
+                 Instance(
+                     input=Input(text=prompt),
+                     references=references,  # [Reference(Output(text=answer), tags=[CORRECT_TAG])],
+                     split=TEST_SPLIT,
+                 )
+             )
+
+         return instances
helm/benchmark/scenarios/shc_sequoia_scenario.py
@@ -0,0 +1,77 @@
+ import sys
+ import csv
+ from typing import Dict, List
+
+ from helm.benchmark.scenarios.scenario import (
+     Input,
+     Scenario,
+     Instance,
+     TEST_SPLIT,
+     CORRECT_TAG,
+     Reference,
+     Output,
+ )
+ from helm.common.general import check_file_exists
+
+ csv.field_size_limit(sys.maxsize)
+
+
+ class SHCSequoiaMedScenario(Scenario):
+     """
+     Benchmark derived from manually curated answers to several questions for Sequoia clinic referrals
+     """
+
+     name = "shc_sequoia_med"
+     description = (
+         "ClinicReferral is a benchmark that determines patient eligibility for referral to the"
+         "Sequoia Clinic based on information from palliative care notes. The dataset provides"
+         "curated decisions on referral appropriateness to assist in automating clinic workflows."
+     )
+     tags = ["knowledge", "reasoning", "biomedical"]
+
+     POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
+
+     def __init__(self, data_path: str):
+         super().__init__()
+         self.data_path = data_path
+
+     def create_benchmark(self, csv_path) -> Dict[str, str]:
+         data = {}
+         counter = 1
+         with open(csv_path, "r") as file:
+             reader = csv.DictReader(file)  # , quoting=csv.QUOTE_MINIMAL
+             for row in reader:
+                 question = row["question"]
+                 context = row["context"]
+                 answer = row["label"]
+                 prompt = (
+                     f" {counter} Provide an answer to the following question: {question} with the following context:"
+                     f" {context} , Answer the question with a 'A' for yes or 'B' for no. Do not provide any "
+                     "additional details or response, just a simple A or B response."
+                 )
+                 data[prompt] = answer
+                 counter += 1
+         return data
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         check_file_exists(
+             self.data_path, msg=f"[SHCSequoiaMedScenario] Required data file not found: '{self.data_path}'"
+         )
+         instances: List[Instance] = []
+         benchmark_data = self.create_benchmark(self.data_path)
+
+         for prompt, answer in benchmark_data.items():
+             assert answer in SHCSequoiaMedScenario.POSSIBLE_ANSWER_CHOICES
+             references: List[Reference] = [
+                 Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == answer else [])
+                 for pred_answer in SHCSequoiaMedScenario.POSSIBLE_ANSWER_CHOICES
+             ]
+             instances.append(
+                 Instance(
+                     input=Input(text=prompt),
+                     references=references,  # [Reference(Output(text=answer), tags=[CORRECT_TAG])],
+                     split=TEST_SPLIT,
+                 )
+             )
+
+         return instances
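Both SHC scenarios above build their binary-classification references the same way: one Reference per answer choice, with CORRECT_TAG attached only to the gold label, so exactly one reference is scored as correct. Here is a minimal standalone sketch of that pattern; the gold label below is hypothetical, not read from the real access-controlled CSVs:

from helm.benchmark.scenarios.scenario import CORRECT_TAG, Output, Reference

POSSIBLE_ANSWER_CHOICES = ["A", "B"]
answer = "A"  # hypothetical gold label; the real scenarios read this from the "label" CSV column

# One Reference per choice; only the gold choice carries CORRECT_TAG.
references = [
    Reference(Output(text=choice), tags=[CORRECT_TAG] if choice == answer else [])
    for choice in POSSIBLE_ANSWER_CHOICES
]
assert sum(CORRECT_TAG in ref.tags for ref in references) == 1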
helm/benchmark/scenarios/simple_safety_tests_scenario.py
@@ -1,7 +1,7 @@
  from typing import List
  from datasets import load_dataset
 
- from .scenario import Scenario, Instance, Input, TEST_SPLIT, Reference, Output
+ from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT, Reference, Output
 
 
  class SimpleSafetyTestsScenario(Scenario):
helm/benchmark/scenarios/spider_scenario.py
@@ -0,0 +1,91 @@
+ import json
+ import os
+ from typing import Dict, List
+
+ from filelock import FileLock
+
+ from helm.common.general import ensure_directory_exists, ensure_file_downloaded, shell
+ from helm.common.hierarchical_logger import hlog
+ from helm.benchmark.scenarios.bird_sql_scenario_helper import (  # type: ignore
+     generate_schema_prompt,
+ )
+ from helm.benchmark.scenarios.scenario import (
+     CORRECT_TAG,
+     Scenario,
+     Instance,
+     Reference,
+     VALID_SPLIT,
+     Input,
+     Output,
+ )
+
+
+ def _ensure_file_unzipped(source_path: str, target_path: str):
+     with FileLock(f"{target_path}.lock"):
+         if os.path.exists(target_path):
+             hlog(f"Not decompressing {source_path} because {target_path} already exists")
+             return
+         tmp_path = target_path + ".tmp"
+         ensure_directory_exists(tmp_path)
+         shell(["unzip", source_path, "-d", tmp_path])
+         shell(["mv", tmp_path, target_path])
+
+
+ class SpiderScenario(Scenario):
+     """Spider 1.0"""
+
+     name = "spider"
+     description = "spider"
+     tags = ["sql"]
+
+     INSTRUCTIONS_PROMPT = """-- Using valid SQLite, answer the following questions for the tables provided above.
+ """
+     COT_PROMPT = """
+ Think step by step, then generate a single SQL query in valid SQLite syntax. Respond with only your reasoning and SQL query in the following tag-delimited format:
+
+ <reasoning>
+ INSERT_YOUR_REASONING_HERE
+ </reasoning>
+ <sql>
+ INSERT_YOUR_SQL_QUERY_HERE
+ </sql>"""  # noqa: E501
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         data_parent_path = os.path.join(output_path, "data")
+         ensure_file_downloaded(
+             "https://drive.google.com/uc?id=1403EGqzIDoHMdQF4c9Bkyl7dZLZ5Wt6J&export=download&confirm=t",
+             data_parent_path,
+             unpack=True,
+             unpack_type="unzip",
+         )
+         data_root_path = os.path.join(data_parent_path, "spider_data")
+         databases_root_path = os.path.join(data_root_path, "test_database")
+
+         database_schema_prompts: Dict[str, str] = {}
+         for database_name in os.listdir(databases_root_path):
+             database_path = os.path.join(databases_root_path, database_name, f"{database_name}.sqlite")
+             if not os.path.exists(database_path):
+                 # Ignore stray ".DS_Store" directory
+                 continue
+
+             database_schema_prompt = generate_schema_prompt(database_path, num_rows=None)
+             database_schema_prompts[database_name] = database_schema_prompt
+
+         instances: List[Instance] = []
+         dataset_path = os.path.join(data_root_path, "test.json")
+         dataset = json.load(open(dataset_path, "r"))
+         for row in dataset:
+             database_id: str = row["db_id"]
+             question: str = row["question"]
+             gold_sql: str = row["query"]
+
+             schema_prompt = database_schema_prompts[database_id]
+             combined_prompt = schema_prompt + "\n\n" + self.INSTRUCTIONS_PROMPT + question + self.COT_PROMPT
+             instance = Instance(
+                 input=Input(text=combined_prompt),
+                 references=[Reference(output=Output(text=gold_sql), tags=[CORRECT_TAG])],
+                 extra_data={"db_id": row["db_id"]},
+                 split=VALID_SPLIT,
+             )
+             instances.append(instance)
+         return instances
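The COT_PROMPT above instructs the model to answer in a tag-delimited format, but the post-processing that recovers the query is not shown in this diff. A hypothetical helper for pulling the SQL back out might look like the following; extract_sql is an illustrative name, not a function in the package:

import re

def extract_sql(model_output: str) -> str:
    # Take the contents of the <sql>...</sql> block requested by COT_PROMPT;
    # fall back to the raw output if the model ignored the format.
    match = re.search(r"<sql>(.*?)</sql>", model_output, re.DOTALL)
    return match.group(1).strip() if match else model_output.strip()

completion = "<reasoning>Count the rows in singer.</reasoning>\n<sql>\nSELECT COUNT(*) FROM singer;\n</sql>"
print(extract_sql(completion))  # SELECT COUNT(*) FROM singer;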
helm/benchmark/scenarios/starr_patient_instructions_scenario.py
@@ -0,0 +1,97 @@
+ import csv
+ from typing import List
+
+ from helm.common.general import check_file_exists
+ from helm.benchmark.scenarios.scenario import (
+     Input,
+     Scenario,
+     Instance,
+     TEST_SPLIT,
+     CORRECT_TAG,
+     Reference,
+     Output,
+ )
+
+
+ class StarrPatientInstructionsScenario(Scenario):
+     """
+     Starr Patient Instructions is a dataset created from STARR-OMOP data, containing after-visit instructions
+     for outpatient surgeries/procedures. Each example corresponds to one surgery or procedure case (only including
+     outpatient or observation/overnight cases with discharge within 24 hours) and includes the following fields:
+
+     - Diagnosis: Why the patient needs the surgery/procedure.
+     - ActualProcedure: The surgery/procedure name.
+     - HistoryPhysicalNoteText: The History & Physical note written by the surgeon.
+     - OperativeNoteText: The report describing what was done during the surgery/procedure.
+     - DischargeInstructionNoteText: The specific after-surgery care instructions given to the patient.
+
+     The task is to generate personalized post-procedure patient instructions based on the provided case details.
+
+     Sample Synthetic Prompt:
+     Given the following case details, generate personalized after-surgery care instructions.
+
+     Diagnosis: [diagnosis text]
+     Procedure: [actual procedure text]
+     History & Physical: [H&P note text]
+     Operative Report: [operative note text]
+
+     Patient Instructions:
+     """
+
+     name = "starr_patient_instructions"
+     description = (
+         "PatientInstruct is a benchmark designed to evaluate models on generating personalized"
+         "post-procedure instructions for patients. It includes real-world patient History & Physical"
+         "Note (H&P) and operative report, from which models must produce clear, actionable instructions"
+         "appropriate for patients recovering from medical interventions."
+     )
+     tags = ["patient_communication", "healthcare", "instruction_generation", "surgery"]
+
+     def __init__(self, data_path: str):
+         super().__init__()
+         self.data_path = data_path
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         check_file_exists(
+             self.data_path, msg=f"[StarrPatientInstructiosScenario] Required data file not found: '{self.data_path}'"
+         )
+         instances: List[Instance] = []
+         # For now, we assign all instances to the test split (zero-shot setting).
+         split = TEST_SPLIT
+
+         with open(self.data_path, "r", encoding="utf-8") as csvfile:
+             reader = csv.DictReader(csvfile)
+             for row in reader:
+                 # Retrieve and strip the relevant fields.
+                 qc_value = row.get("QC", "").strip().upper()
+                 if qc_value != "TRUE":
+                     continue
+                 diagnosis = row.get("Diagnosis", "").strip()
+                 actual_procedure = row.get("ActualProcedure", "").strip()
+                 history_physical = row.get("HistoryPhysicalNoteText", "").strip()
+                 operative_note = row.get("OperativeNoteText", "").strip()
+                 discharge_instruction = row.get("DischargeInstructionNoteText", "").strip()
+
+                 # Skip the instance if any required field is missing.
+                 if not (
+                     diagnosis and actual_procedure and history_physical and operative_note and discharge_instruction
+                 ):
+                     continue
+
+                 # Construct the input prompt by concatenating the fields.
+                 input_text = (
+                     f"Diagnosis: {diagnosis}\n"
+                     f"Procedure: {actual_procedure}\n"
+                     f"History & Physical: {history_physical}\n"
+                     f"Operative Report: {operative_note}\n\n"
+                 )
+
+                 instances.append(
+                     Instance(
+                         input=Input(text=input_text),
+                         references=[Reference(Output(text=discharge_instruction), tags=[CORRECT_TAG])],
+                         split=split,
+                     )
+                 )
+
+         return instances
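A minimal usage sketch for the scenario above, assuming a local copy of the access-controlled STARR CSV at a hypothetical path:

from helm.benchmark.scenarios.starr_patient_instructions_scenario import StarrPatientInstructionsScenario

# Hypothetical paths; the real CSV comes from STARR-OMOP and is not publicly downloadable.
scenario = StarrPatientInstructionsScenario(data_path="/data/starr_patient_instructions.csv")
# output_path is part of the Scenario API but unused by this scenario.
instances = scenario.get_instances(output_path="/tmp/starr")
print(len(instances))
print(instances[0].input.text[:200])  # Diagnosis / Procedure / H&P / Operative Report prompt
print(instances[0].references[0].output.text[:200])  # gold discharge instructions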
helm/benchmark/scenarios/summarization_scenario.py
@@ -3,7 +3,17 @@ import pickle
 
  from typing import List, Optional
  from helm.common.general import ensure_file_downloaded, ensure_directory_exists
- from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, VALID_SPLIT, TEST_SPLIT, CORRECT_TAG, Input, Output
+ from helm.benchmark.scenarios.scenario import (
+     Scenario,
+     Instance,
+     Reference,
+     TRAIN_SPLIT,
+     VALID_SPLIT,
+     TEST_SPLIT,
+     CORRECT_TAG,
+     Input,
+     Output,
+ )
 
 
  class SummarizationScenario(Scenario):
@@ -0,0 +1,157 @@
+ import os
+ import re
+ from typing import Dict, List, Optional
+
+ import pandas as pd
+
+ from helm.common.general import ensure_file_downloaded, ensure_directory_exists
+ from helm.common.optional_dependencies import handle_module_not_found_error
+ from helm.benchmark.scenarios.scenario import (
+     Scenario,
+     Instance,
+     Reference,
+     TRAIN_SPLIT,
+     TEST_SPLIT,
+     CORRECT_TAG,
+     Input,
+     Output,
+ )
+
+ try:
+     # Needed for pandas.read_excel
+     import openpyxl  # noqa
+ except ModuleNotFoundError as e:
+     handle_module_not_found_error(e, ["ibm-enterprise-scenarios"])
+
+
+ class SUMOSumScenario(Scenario):
+     """SUMO Web Claims Summarization
+
+     SUMO Web Claims Summarization is a summarization task over the climate subset from the SUMO dataset.
+     The task is to write a title based on the article contents.
+
+     Citation:
+     @inproceedings{mishra-etal-2020-generating,
+         title = "Generating Fact Checking Summaries for Web Claims",
+         author = "Mishra, Rahul and
+             Gupta, Dhruv and
+             Leippold, Markus",
+         editor = "Xu, Wei and
+             Ritter, Alan and
+             Baldwin, Tim and
+             Rahimi, Afshin",
+         booktitle = "Proceedings of the Sixth Workshop on Noisy User-generated Text (W-NUT 2020)",
+         month = nov,
+         year = "2020",
+         address = "Online",
+         publisher = "Association for Computational Linguistics",
+         url = "https://aclanthology.org/2020.wnut-1.12",
+         doi = "10.18653/v1/2020.wnut-1.12",
+         pages = "81--90",
+         abstract = "We present SUMO, a neural attention-based approach that learns to establish correctness of textual claims based on evidence in the form of text documents (e.g., news articles or web documents). SUMO further generates an extractive summary by presenting a diversified set of sentences from the documents that explain its decision on the correctness of the textual claim. Prior approaches to address the problem of fact checking and evidence extraction have relied on simple concatenation of claim and document word embeddings as an input to claim driven attention weight computation. This is done so as to extract salient words and sentences from the documents that help establish the correctness of the claim. However this design of claim-driven attention fails to capture the contextual information in documents properly. We improve on the prior art by using improved claim and title guided hierarchical attention to model effective contextual cues. We show the efficacy of our approach on political, healthcare, and environmental datasets.",
+     }
+     """  # noqa: E501
+
+     name = "sumosum"
+     description = "Text summarization with climate corpus"
+     tags = ["summarization", "climate"]
+
+     TRAIN_RATIO = 0.2
+     TITLE_KEY = "Title"
+     DOCUMENT_KEY = "Doc_text"
+
+     def __init__(
+         self,
+         train_filter_min_length: Optional[int] = None,
+         train_filter_max_length: Optional[int] = None,
+         test_filter_min_length: Optional[int] = None,
+         test_filter_max_length: Optional[int] = None,
+         truncate_length: Optional[int] = None,
+     ):
+         """
+         Initializes the scenario.
+         Args:
+             train_filter_min_length: Int indicating minimum length for training
+                 documents. Train examples smaller than
+                 train_filter_min_length tokens will be filtered out.
+             train_filter_max_length: Int indicating maximum length for training
+                 documents. Train examples larger than
+                 train_filter_max_length tokens will be filtered out.
+             test_filter_min_length: Int indicating minimum length for test
+                 documents. Test examples smaller than
+                 test_filter_min_length tokens will be filtered out.
+             test_filter_max_length: Int indicating maximum length for test
+                 documents. Test examples larger than
+                 test_filter_max_length tokens will be filtered out.
+             truncate_length: Int indicating the maximum length in tokens to
+                 truncate documents. Documents in all splits will be
+                 truncated to truncate_length tokens.
+         NOTE: Whitespace tokenization is used to compute tokens.
+         """
+         super().__init__()
+         self.train_filter_min_length = train_filter_min_length
+         self.train_filter_max_length = train_filter_max_length
+         self.test_filter_min_length = test_filter_min_length
+         self.test_filter_max_length = test_filter_max_length
+         self.truncate_length = truncate_length
+
+     @staticmethod
+     def _clean_and_truncate(text: str, max_length: Optional[int] = None) -> str:
+         text = re.sub(r"\s+", " ", text)
+         return " ".join(text.split()[:max_length])
+
+     def _load_dataset(self, output_path: str) -> Dict[str, pd.DataFrame]:
+         data_dir = os.path.join(output_path, "data")
+         ensure_directory_exists(data_dir)
+
+         source_url = "https://github.com/rahulOmishra/SUMO/raw/main/climate_claims_raw.xlsx"
+         source_file = os.path.basename(source_url)
+         target_path = os.path.join(data_dir, source_file)
+         ensure_file_downloaded(
+             source_url=source_url,
+             target_path=target_path,
+         )
+
+         # Column headers: Claim_id(int),Claim,Title,Doc_text,Label(bool)
+         target_df = pd.read_excel(target_path, skiprows=1)
+         target_df = target_df.dropna(subset=[SUMOSumScenario.TITLE_KEY, SUMOSumScenario.DOCUMENT_KEY])
+         # Remove carriage return _x000D_ in Excel string
+         target_df = target_df.replace({r"_x000D_": ""}, regex=True)
+         # target_df = target_df.replace({r"_x([0-9a-fA-F]{4})_": ""}, regex=True)
+         # Split randomly (works better than split by order)
+         train_df = target_df.sample(frac=SUMOSumScenario.TRAIN_RATIO, random_state=0)
+         test_df = target_df.drop(train_df.index).sample(frac=1, random_state=0)
+         return {TRAIN_SPLIT: train_df, TEST_SPLIT: test_df}
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         dataset_dict = self._load_dataset(output_path)
+
+         instances: List[Instance] = []
+
+         for split, split_data in dataset_dict.items():
+             for example in split_data.itertuples():
+                 document = getattr(example, SUMOSumScenario.DOCUMENT_KEY)
+                 title = getattr(example, SUMOSumScenario.TITLE_KEY)
+                 art_len = len(document.split())
+                 if split == TEST_SPLIT:
+                     if self.test_filter_max_length and art_len > self.test_filter_max_length:
+                         continue
+                     if self.test_filter_min_length and art_len < self.test_filter_min_length:
+                         continue
+                 if split == TRAIN_SPLIT:
+                     if self.train_filter_max_length and art_len > self.train_filter_max_length:
+                         continue
+                     if self.train_filter_min_length and art_len < self.train_filter_min_length:
+                         continue
+
+                 document = SUMOSumScenario._clean_and_truncate(document, self.truncate_length)
+                 title = SUMOSumScenario._clean_and_truncate(title)
+
+                 instance = Instance(
+                     input=Input(text=document),
+                     references=[Reference(output=Output(text=title), tags=[CORRECT_TAG])],
+                     split=split,
+                 )
+                 instances.append(instance)
+
+         return instances
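
A usage sketch for the new scenario (hedged: the module path and parameter values below are illustrative assumptions; the filtering and truncation behavior is taken from the code above, which counts whitespace-separated tokens):

    # Sketch only; assumes the class lives in helm/benchmark/scenarios/sumosum_scenario.py.
    from helm.benchmark.scenarios.sumosum_scenario import SUMOSumScenario

    scenario = SUMOSumScenario(
        test_filter_min_length=100,   # drop test documents shorter than 100 whitespace tokens
        test_filter_max_length=1500,  # drop test documents longer than 1500 whitespace tokens
        truncate_length=1000,         # truncate surviving documents to their first 1000 tokens
    )
    instances = scenario.get_instances(output_path="/tmp/sumosum")  # downloads climate_claims_raw.xlsx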
@@ -2,7 +2,7 @@ import os
 from typing import List
 
 from helm.common.general import ensure_file_downloaded
- from .scenario import Scenario, Instance, Reference, TEST_SPLIT, CORRECT_TAG, Input, Output
+ from helm.benchmark.scenarios.scenario import Scenario, Instance, Reference, TEST_SPLIT, CORRECT_TAG, Input, Output
 
 NUM_INPUT_TOKENS: List[int] = [
     1,
@@ -67,7 +67,17 @@ from copy import copy
 from typing import List, Dict, Literal, Tuple
 from dataclasses import dataclass
 
- from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, VALID_SPLIT, TEST_SPLIT, CORRECT_TAG, Input, Output
+ from helm.benchmark.scenarios.scenario import (
+     Scenario,
+     Instance,
+     Reference,
+     TRAIN_SPLIT,
+     VALID_SPLIT,
+     TEST_SPLIT,
+     CORRECT_TAG,
+     Input,
+     Output,
+ )
 
 
 @dataclass(frozen=True)
@@ -25,7 +25,17 @@ The model hence is asked to do the following three tasks:
 import numpy as np
 from typing import List, Dict, Tuple
 
- from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, VALID_SPLIT, TEST_SPLIT, CORRECT_TAG, Input, Output
+ from helm.benchmark.scenarios.scenario import (
+     Scenario,
+     Instance,
+     Reference,
+     TRAIN_SPLIT,
+     VALID_SPLIT,
+     TEST_SPLIT,
+     CORRECT_TAG,
+     Input,
+     Output,
+ )
 
 ANIMALS = ["zebra", "cobra", "stork", "penguin", "shark", "lion", "buffalo", "whale", "seal", "eagle", "horse", "rat"]
 FRUITS = ["apple", "peach", "watermelon", "banana", "grape", "kiwi", "pear", "strawberry", "blueberry", "blackberry"]
@@ -0,0 +1,26 @@
+ import pytest
+ from tempfile import TemporaryDirectory
+
+ from helm.benchmark.scenarios.bigcodebench_scenario import BigCodeBenchScenario
+ from helm.benchmark.scenarios.scenario import Input, TEST_SPLIT
+
+
+ @pytest.mark.scenarios
+ def test_bigcodebench_scenario_get_instances():
+     bigcodebench_scenario = BigCodeBenchScenario("v0.1.2")
+     with TemporaryDirectory() as tmpdir:
+         instances = bigcodebench_scenario.get_instances(tmpdir)
+     assert len(instances) == 1140
+     assert instances[0].id == "BigCodeBench/0"
+     assert instances[0].input == Input(
+         text=(
+             "Calculates the average of the sums of absolute differences between each pair "
+             "of consecutive numbers for all permutations of a given list. Each permutation "
+             "is shuffled before calculating the differences. Args: - numbers (list): A list "
+             "of numbers. Default is numbers from 1 to 10.\nThe function should output with:\n"
+             " float: The average of the sums of absolute differences for each shuffled permutation "
+             "of the list.\nYou should write self-contained code starting with:\n```\nimport itertools\n"
+             "from random import shuffle\ndef task_func(numbers=list(range(1, 3))):\n```"
+         )
+     )
+     assert instances[0].split == TEST_SPLIT
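
This test downloads the real BigCodeBench v0.1.2 dataset, so it is gated behind the "scenarios" pytest marker; assuming the marker is registered in the project's pytest configuration, it can be run selectively with "pytest -m scenarios" (optionally pointing pytest at the specific test file).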
@@ -0,0 +1,18 @@
+ import pytest
+ from tempfile import TemporaryDirectory
+
+ from helm.benchmark.scenarios.czech_bank_qa_scenario import CzechBankQAScenario
+ from helm.benchmark.scenarios.scenario import CORRECT_TAG, Input
+
+
+ @pytest.mark.scenarios
+ def test_czech_bank_qa_scenario_get_instances():
+     scenario = CzechBankQAScenario(config_name="default")
+     with TemporaryDirectory() as tmpdir:
+         actual_instances = scenario.get_instances(tmpdir)
+     assert len(actual_instances) == 30
+     assert actual_instances[0].input == Input(text="Get the total number of accounts in the system")
+     assert len(actual_instances[0].references) == 1
+     assert actual_instances[0].references[0].tags == [CORRECT_TAG]
+     assert actual_instances[0].references[0].output.text == "SELECT COUNT(*) FROM ACCOUNT"
+     assert actual_instances[0].split == "test"