crfm-helm 0.5.4__py3-none-any.whl → 0.5.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (580)
  1. crfm_helm-0.5.5.dist-info/METADATA +413 -0
  2. crfm_helm-0.5.5.dist-info/RECORD +894 -0
  3. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  22. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  23. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  24. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  25. helm/benchmark/adaptation/prompt.py +1 -1
  26. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  27. helm/benchmark/annotation/air_bench_annotator.py +20 -5
  28. helm/benchmark/annotation/annotator.py +5 -0
  29. helm/benchmark/annotation/annotator_factory.py +3 -20
  30. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  31. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  32. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  33. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  34. helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
  35. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  36. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  37. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  38. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  39. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  40. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  41. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  42. helm/benchmark/annotation/medalign_annotator.py +100 -0
  43. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  44. helm/benchmark/annotation/medication_qa_annotator.py +87 -63
  45. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  46. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  47. helm/benchmark/annotation/model_as_judge.py +218 -6
  48. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  49. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  50. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  51. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  52. helm/benchmark/annotation/omni_math_annotator.py +132 -0
  53. helm/benchmark/annotation/spider_annotator.py +18 -0
  54. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  55. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  56. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  57. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  58. helm/benchmark/annotation_executor.py +35 -15
  59. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  60. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  61. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  62. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  63. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  64. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  65. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  66. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  67. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  68. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  69. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  70. helm/benchmark/augmentations/perturbation.py +1 -1
  71. helm/benchmark/augmentations/space_perturbation.py +2 -2
  72. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  73. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  74. helm/benchmark/augmentations/test_perturbation.py +16 -13
  75. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  76. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  77. helm/benchmark/data_preprocessor.py +2 -2
  78. helm/benchmark/huggingface_registration.py +2 -7
  79. helm/benchmark/metrics/aci_bench_metrics.py +34 -0
  80. helm/benchmark/metrics/basic_metrics.py +6 -6
  81. helm/benchmark/metrics/bbq_metrics.py +2 -2
  82. helm/benchmark/metrics/bias_metrics.py +12 -3
  83. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  84. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  85. helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
  86. helm/benchmark/metrics/classification_metrics.py +76 -12
  87. helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
  88. helm/benchmark/metrics/code_metrics.py +5 -5
  89. helm/benchmark/metrics/comet_metric.py +125 -0
  90. helm/benchmark/metrics/common_metric_specs.py +9 -2
  91. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  92. helm/benchmark/metrics/copyright_metrics.py +4 -4
  93. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  94. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  95. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  96. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  97. helm/benchmark/metrics/dischargeme_metrics.py +34 -0
  98. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  99. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  100. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  101. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  102. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  103. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  104. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  105. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  106. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  107. helm/benchmark/metrics/ifeval/__init__.py +0 -0
  108. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  109. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  110. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  111. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  112. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  113. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  114. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  115. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  116. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  117. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  118. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  120. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  121. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  122. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  123. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  124. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  125. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  126. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  127. helm/benchmark/metrics/med_dialog_metrics.py +34 -0
  128. helm/benchmark/metrics/medalign_metrics.py +34 -0
  129. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  130. helm/benchmark/metrics/medec_metrics.py +101 -0
  131. helm/benchmark/metrics/medi_qa_metrics.py +34 -0
  132. helm/benchmark/metrics/medication_qa_metrics.py +15 -4
  133. helm/benchmark/metrics/mental_health_metrics.py +34 -0
  134. helm/benchmark/metrics/metric.py +3 -3
  135. helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
  136. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  137. helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
  138. helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
  139. helm/benchmark/metrics/nltk_helper.py +32 -0
  140. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  141. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  142. helm/benchmark/metrics/output_processing_metric.py +60 -0
  143. helm/benchmark/metrics/output_processors.py +15 -0
  144. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  145. helm/benchmark/metrics/ranking_metrics.py +3 -3
  146. helm/benchmark/metrics/reference_metric.py +3 -3
  147. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  148. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  149. helm/benchmark/metrics/spider_metrics.py +7 -0
  150. helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
  151. helm/benchmark/metrics/statistic.py +1 -1
  152. helm/benchmark/metrics/summac/model_summac.py +1 -1
  153. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  154. helm/benchmark/metrics/summarization_metrics.py +19 -9
  155. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  156. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  157. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  158. helm/benchmark/metrics/test_metric.py +1 -1
  159. helm/benchmark/metrics/test_statistic.py +2 -2
  160. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  161. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  162. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  163. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  164. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  165. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  166. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  167. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
  168. helm/benchmark/metrics/toxicity_metrics.py +4 -4
  169. helm/benchmark/metrics/unitxt_metrics.py +4 -1
  170. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  171. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  172. helm/benchmark/model_metadata_registry.py +16 -0
  173. helm/benchmark/presentation/summarize.py +23 -10
  174. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  175. helm/benchmark/reeval_run.py +203 -0
  176. helm/benchmark/reeval_runner.py +355 -0
  177. helm/benchmark/run.py +8 -17
  178. helm/benchmark/run_expander.py +78 -8
  179. helm/benchmark/run_spec_factory.py +12 -0
  180. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  181. helm/benchmark/run_specs/audio_run_specs.py +613 -0
  182. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  183. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  184. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  185. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  186. helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
  187. helm/benchmark/run_specs/experimental_run_specs.py +112 -3
  188. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  189. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  190. helm/benchmark/run_specs/long_context_run_specs.py +89 -0
  191. helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
  192. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  193. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  194. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  195. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
  196. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  197. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  198. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  199. helm/benchmark/run_specs/vlm_run_specs.py +75 -2
  200. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  201. helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
  202. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  203. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  204. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  205. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  206. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
  207. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  208. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  209. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  210. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  211. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  212. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  213. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  214. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  215. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  216. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  217. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  218. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  219. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  220. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  221. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  222. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  223. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  224. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  225. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  226. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
  227. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  228. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
  229. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  230. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  231. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  232. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  233. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  234. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  235. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  236. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  237. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  238. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  239. helm/benchmark/scenarios/bold_scenario.py +1 -1
  240. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  241. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  242. helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
  243. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  244. helm/benchmark/scenarios/clear_scenario.py +153 -0
  245. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  246. helm/benchmark/scenarios/code_scenario.py +17 -4
  247. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  248. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  249. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  250. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  251. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  252. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  253. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  254. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  255. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  256. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  257. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  258. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  259. helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
  260. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  261. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  262. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  263. helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
  264. helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
  265. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  266. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  267. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  268. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  269. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  270. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  271. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  272. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  273. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  274. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  275. helm/benchmark/scenarios/headqa_scenario.py +131 -0
  276. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  277. helm/benchmark/scenarios/ice_scenario.py +8 -4
  278. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  279. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  280. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  281. helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
  282. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  283. helm/benchmark/scenarios/koala_scenario.py +1 -1
  284. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  285. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  286. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  287. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  288. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  289. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  290. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  291. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  292. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  293. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  294. helm/benchmark/scenarios/math_scenario.py +9 -1
  295. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  296. helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
  297. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  298. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  299. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  300. helm/benchmark/scenarios/medalign_scenario.py +88 -0
  301. helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
  302. helm/benchmark/scenarios/medbullets_scenario.py +140 -0
  303. helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
  304. helm/benchmark/scenarios/medec_scenario.py +120 -0
  305. helm/benchmark/scenarios/medhallu_scenario.py +66 -0
  306. helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
  307. helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
  308. helm/benchmark/scenarios/mental_health_scenario.py +112 -0
  309. helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
  310. helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
  311. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
  312. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  313. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  314. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  315. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  316. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
  317. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
  318. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
  319. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  320. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  321. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  322. helm/benchmark/scenarios/numeracy_scenario.py +10 -1
  323. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  324. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  325. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  326. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  327. helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
  328. helm/benchmark/scenarios/quac_scenario.py +10 -1
  329. helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
  330. helm/benchmark/scenarios/raft_scenario.py +17 -2
  331. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  332. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  333. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  334. helm/benchmark/scenarios/scenario.py +9 -1
  335. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
  336. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  337. helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
  338. helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
  339. helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
  340. helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
  341. helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
  342. helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
  343. helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
  344. helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
  345. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  346. helm/benchmark/scenarios/spider_scenario.py +91 -0
  347. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
  348. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  349. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  350. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  351. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  352. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  353. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  354. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  355. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  356. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  357. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  358. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  359. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  360. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  361. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
  362. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  363. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  364. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  365. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  366. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  367. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  368. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  369. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  370. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  371. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  372. helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
  373. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  374. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  375. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  376. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  377. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  378. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  379. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  380. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  381. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  382. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  383. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  384. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  385. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  386. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  387. helm/benchmark/server.py +11 -0
  388. helm/benchmark/slurm_runner.py +1 -1
  389. helm/benchmark/static/schema_audio.yaml +752 -0
  390. helm/benchmark/static/schema_autobencher.yaml +150 -0
  391. helm/benchmark/static/schema_call_center.yaml +97 -60
  392. helm/benchmark/static/schema_capabilities.yaml +254 -0
  393. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  394. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  395. helm/benchmark/static/schema_enterprise.yaml +298 -0
  396. helm/benchmark/static/schema_finance.yaml +14 -12
  397. helm/benchmark/static/schema_heim.yaml +1389 -0
  398. helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
  399. helm/benchmark/static/schema_medhelm.yaml +1081 -0
  400. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  401. helm/benchmark/static/schema_safety.yaml +18 -1
  402. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
  403. helm/benchmark/static/schema_social_audio.yaml +224 -0
  404. helm/benchmark/static/schema_sql.yaml +171 -0
  405. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
  406. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  407. helm/benchmark/static/schema_vhelm.yaml +109 -36
  408. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  409. helm/benchmark/static_build/assets/index-262903c1.js +10 -0
  410. helm/benchmark/static_build/assets/index-42060d71.css +1 -0
  411. helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
  412. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  413. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  414. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
  415. helm/benchmark/static_build/config.js +1 -1
  416. helm/benchmark/static_build/index.html +5 -5
  417. helm/benchmark/window_services/default_window_service.py +1 -1
  418. helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
  419. helm/benchmark/window_services/ice_window_service.py +1 -1
  420. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  421. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  422. helm/benchmark/window_services/local_window_service.py +2 -2
  423. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  424. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  425. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  426. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  427. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  428. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  429. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  430. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  431. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  432. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  433. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  434. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  435. helm/benchmark/window_services/test_utils.py +1 -1
  436. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  437. helm/benchmark/window_services/yalm_window_service.py +1 -1
  438. helm/clients/ai21_client.py +3 -3
  439. helm/clients/aleph_alpha_client.py +1 -1
  440. helm/clients/audio_language/__init__.py +0 -0
  441. helm/clients/audio_language/diva_llama_client.py +118 -0
  442. helm/clients/audio_language/llama_omni_client.py +198 -0
  443. helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
  444. helm/clients/audio_language/qwen_audiolm_client.py +150 -0
  445. helm/clients/auto_client.py +4 -2
  446. helm/clients/azure_openai_client.py +55 -0
  447. helm/clients/bedrock_client.py +201 -7
  448. helm/clients/bedrock_utils.py +33 -0
  449. helm/clients/clip_scorers/clip_scorer.py +1 -1
  450. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  451. helm/clients/cohere_client.py +3 -3
  452. helm/clients/google_client.py +1 -1
  453. helm/clients/http_model_client.py +1 -1
  454. helm/clients/huggingface_client.py +10 -18
  455. helm/clients/ibm_client.py +267 -0
  456. helm/clients/image_generation/adobe_vision_client.py +1 -1
  457. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  458. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  459. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  460. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  461. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  462. helm/clients/image_generation/cogview2_client.py +1 -1
  463. helm/clients/image_generation/dalle2_client.py +1 -1
  464. helm/clients/image_generation/dalle3_client.py +2 -2
  465. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  466. helm/clients/image_generation/dalle_mini/data.py +1 -1
  467. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  468. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  469. helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
  470. helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
  471. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  472. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  473. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  474. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  475. helm/clients/image_generation/dalle_mini_client.py +1 -1
  476. helm/clients/image_generation/deep_floyd_client.py +1 -1
  477. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  478. helm/clients/image_generation/lexica_client.py +1 -1
  479. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  480. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  481. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  482. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  483. helm/clients/image_generation/mindalle_client.py +1 -1
  484. helm/clients/image_generation/together_image_generation_client.py +1 -1
  485. helm/clients/lit_gpt_client.py +2 -2
  486. helm/clients/mistral_client.py +62 -18
  487. helm/clients/nvidia_nim_client.py +0 -3
  488. helm/clients/openai_client.py +241 -22
  489. helm/clients/palmyra_client.py +1 -4
  490. helm/clients/reka_client.py +1 -1
  491. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  492. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  493. helm/clients/stanfordhealthcare_google_client.py +43 -0
  494. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  495. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  496. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  497. helm/clients/test_client.py +1 -1
  498. helm/clients/test_together_client.py +6 -1
  499. helm/clients/together_client.py +47 -7
  500. helm/clients/upstage_client.py +23 -0
  501. helm/clients/vertexai_client.py +39 -13
  502. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  503. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  504. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  505. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  506. helm/clients/vision_language/qwen2_vlm_client.py +175 -0
  507. helm/clients/vllm_client.py +4 -6
  508. helm/clients/yi_client.py +0 -3
  509. helm/common/audio_utils.py +111 -0
  510. helm/common/file_caches/local_file_cache.py +1 -1
  511. helm/common/file_caches/test_local_file_cache.py +1 -1
  512. helm/common/images_utils.py +2 -2
  513. helm/common/media_object.py +2 -2
  514. helm/common/multimodal_request_utils.py +26 -0
  515. helm/common/reeval_parameters.py +12 -0
  516. helm/common/request.py +6 -2
  517. helm/common/response_format.py +18 -0
  518. helm/common/test_media_object.py +1 -1
  519. helm/config/model_deployments.yaml +1112 -19
  520. helm/config/model_metadata.yaml +985 -44
  521. helm/config/tokenizer_configs.yaml +379 -3
  522. helm/proxy/cli.py +2 -2
  523. helm/proxy/example_queries.py +1 -1
  524. helm/proxy/server.py +11 -4
  525. helm/proxy/services/remote_service.py +1 -1
  526. helm/proxy/services/server_service.py +1 -1
  527. helm/proxy/services/test_remote_service.py +2 -2
  528. helm/proxy/services/test_service.py +1 -1
  529. helm/proxy/static/general.js +122 -0
  530. helm/proxy/static/help.html +99 -0
  531. helm/proxy/static/index.css +57 -0
  532. helm/proxy/static/index.html +40 -0
  533. helm/proxy/static/index.js +456 -0
  534. helm/proxy/static/info-icon.png +0 -0
  535. helm/proxy/test_retry.py +1 -1
  536. helm/proxy/token_counters/auto_token_counter.py +1 -1
  537. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  538. helm/tokenizers/caching_tokenizer.py +2 -30
  539. helm/tokenizers/http_model_tokenizer.py +1 -1
  540. helm/tokenizers/huggingface_tokenizer.py +2 -2
  541. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  542. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  543. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  544. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  545. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  546. helm/tokenizers/tokenizer.py +3 -1
  547. helm/tokenizers/yalm_tokenizer.py +3 -3
  548. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  549. crfm_helm-0.5.4.dist-info/METADATA +0 -350
  550. crfm_helm-0.5.4.dist-info/RECORD +0 -697
  551. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  552. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  553. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  554. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  555. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  556. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  557. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  558. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  559. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  560. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  561. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  562. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  563. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  564. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  565. helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
  566. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  567. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  568. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  569. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  570. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  571. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  572. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  573. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  574. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  575. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  576. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  577. helm/tokenizers/anthropic_tokenizer.py +0 -52
  578. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
  579. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
  580. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
helm/benchmark/scenarios/spider_scenario.py
@@ -0,0 +1,91 @@
+import json
+import os
+from typing import Dict, List
+
+from filelock import FileLock
+
+from helm.common.general import ensure_directory_exists, ensure_file_downloaded, shell
+from helm.common.hierarchical_logger import hlog
+from helm.benchmark.scenarios.bird_sql_scenario_helper import (  # type: ignore
+    generate_schema_prompt,
+)
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    Scenario,
+    Instance,
+    Reference,
+    VALID_SPLIT,
+    Input,
+    Output,
+)
+
+
+def _ensure_file_unzipped(source_path: str, target_path: str):
+    with FileLock(f"{target_path}.lock"):
+        if os.path.exists(target_path):
+            hlog(f"Not decompressing {source_path} because {target_path} already exists")
+            return
+        tmp_path = target_path + ".tmp"
+        ensure_directory_exists(tmp_path)
+        shell(["unzip", source_path, "-d", tmp_path])
+        shell(["mv", tmp_path, target_path])
+
+
+class SpiderScenario(Scenario):
+    """Spider 1.0"""
+
+    name = "spider"
+    description = "spider"
+    tags = ["sql"]
+
+    INSTRUCTIONS_PROMPT = """-- Using valid SQLite, answer the following questions for the tables provided above.
+"""
+    COT_PROMPT = """
+Think step by step, then generate a single SQL query in valid SQLite syntax. Respond with only your reasoning and SQL query in the following tag-delimited format:
+
+<reasoning>
+INSERT_YOUR_REASONING_HERE
+</reasoning>
+<sql>
+INSERT_YOUR_SQL_QUERY_HERE
+</sql>"""  # noqa: E501
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_parent_path = os.path.join(output_path, "data")
+        ensure_file_downloaded(
+            "https://drive.google.com/uc?id=1403EGqzIDoHMdQF4c9Bkyl7dZLZ5Wt6J&export=download&confirm=t",
+            data_parent_path,
+            unpack=True,
+            unpack_type="unzip",
+        )
+        data_root_path = os.path.join(data_parent_path, "spider_data")
+        databases_root_path = os.path.join(data_root_path, "test_database")
+
+        database_schema_prompts: Dict[str, str] = {}
+        for database_name in os.listdir(databases_root_path):
+            database_path = os.path.join(databases_root_path, database_name, f"{database_name}.sqlite")
+            if not os.path.exists(database_path):
+                # Ignore stray ".DS_Store" directory
+                continue
+
+            database_schema_prompt = generate_schema_prompt(database_path, num_rows=None)
+            database_schema_prompts[database_name] = database_schema_prompt
+
+        instances: List[Instance] = []
+        dataset_path = os.path.join(data_root_path, "test.json")
+        dataset = json.load(open(dataset_path, "r"))
+        for row in dataset:
+            database_id: str = row["db_id"]
+            question: str = row["question"]
+            gold_sql: str = row["query"]
+
+            schema_prompt = database_schema_prompts[database_id]
+            combined_prompt = schema_prompt + "\n\n" + self.INSTRUCTIONS_PROMPT + question + self.COT_PROMPT
+            instance = Instance(
+                input=Input(text=combined_prompt),
+                references=[Reference(output=Output(text=gold_sql), tags=[CORRECT_TAG])],
+                extra_data={"db_id": row["db_id"]},
+                split=VALID_SPLIT,
+            )
+            instances.append(instance)
+        return instances
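Note: COT_PROMPT above pins the completion to a tag-delimited format, so downstream scoring only needs the contents of the <sql> block. A minimal extraction sketch, assuming a regex-based parser (extract_sql is a hypothetical helper for illustration; the release's actual parsing lives in the spider metrics/annotator files listed above, which this hunk does not show):

import re
from typing import Optional

def extract_sql(completion: str) -> Optional[str]:
    # Return the SQL between <sql> and </sql>, or None if the tags are absent.
    # Hypothetical helper; not part of the diffed package code.
    match = re.search(r"<sql>(.*?)</sql>", completion, flags=re.DOTALL)
    return match.group(1).strip() if match else None

# extract_sql("<reasoning>steps</reasoning><sql>SELECT COUNT(*) FROM ACCOUNT</sql>")
# -> "SELECT COUNT(*) FROM ACCOUNT"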
helm/benchmark/scenarios/starr_patient_instructions_scenario.py
@@ -0,0 +1,90 @@
+import os
+import csv
+from typing import List
+
+from helm.common.general import ensure_directory_exists
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+
+
+class StarrPatientInstructionsScenario(Scenario):
+    """
+    Starr Patient Instructions is a dataset created from STARR-OMOP data, containing after-visit instructions
+    for outpatient surgeries/procedures. Each example corresponds to one surgery or procedure case (only including
+    outpatient or observation/overnight cases with discharge within 24 hours) and includes the following fields:
+
+    - Diagnosis: Why the patient needs the surgery/procedure.
+    - ActualProcedure: The surgery/procedure name.
+    - HistoryPhysicalNoteText: The History & Physical note written by the surgeon.
+    - OperativeNoteText: The report describing what was done during the surgery/procedure.
+    - DischargeInstructionNoteText: The specific after-surgery care instructions given to the patient.
+
+    The task is to generate personalized post-procedure patient instructions based on the provided case details.
+
+    Sample Synthetic Prompt:
+    Given the following case details, generate personalized after-surgery care instructions.
+
+    Diagnosis: [diagnosis text]
+    Procedure: [actual procedure text]
+    History & Physical: [H&P note text]
+    Operative Report: [operative note text]
+
+    Patient Instructions:
+    """
+
+    name = "starr_patient_instructions"
+    description = "A dataset containing case details used to generate customized post-procedure patient instructions."
+    tags = ["patient_communication", "healthcare", "instruction_generation", "surgery"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        csv_path = "/share/pi/nigam/suhana/medhelm/data/starr-omop-personalized-care-instr/dataset_cases_qc.csv"
+        # Ensure the directory for the CSV file exists.
+        ensure_directory_exists(os.path.dirname(csv_path))
+
+        instances: List[Instance] = []
+        # For now, we assign all instances to the test split (zero-shot setting).
+        split = TEST_SPLIT
+
+        with open(csv_path, "r", encoding="utf-8") as csvfile:
+            reader = csv.DictReader(csvfile)
+            for row in reader:
+                # Retrieve and strip the relevant fields.
+                qc_value = row.get("QC", "").strip().upper()
+                if qc_value != "TRUE":
+                    continue
+                diagnosis = row.get("Diagnosis", "").strip()
+                actual_procedure = row.get("ActualProcedure", "").strip()
+                history_physical = row.get("HistoryPhysicalNoteText", "").strip()
+                operative_note = row.get("OperativeNoteText", "").strip()
+                discharge_instruction = row.get("DischargeInstructionNoteText", "").strip()
+
+                # Skip the instance if any required field is missing.
+                if not (
+                    diagnosis and actual_procedure and history_physical and operative_note and discharge_instruction
+                ):
+                    continue
+
+                # Construct the input prompt by concatenating the fields.
+                input_text = (
+                    f"Diagnosis: {diagnosis}\n"
+                    f"Procedure: {actual_procedure}\n"
+                    f"History & Physical: {history_physical}\n"
+                    f"Operative Report: {operative_note}\n\n"
+                )
+
+                instances.append(
+                    Instance(
+                        input=Input(text=input_text),
+                        references=[Reference(Output(text=discharge_instruction), tags=[CORRECT_TAG])],
+                        split=split,
+                    )
+                )
+
+        return instances
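For reference, the loader above requires a QC column equal to TRUE plus five non-empty text fields, and skips the row otherwise. A minimal sketch of a CSV header and one synthetic row that would pass those checks (the column names come from the loader; the row values are invented placeholders, since the real STARR-OMOP data is access-restricted):

# Columns expected by StarrPatientInstructionsScenario.get_instances; row content is made up.
EXAMPLE_CSV = (
    "QC,Diagnosis,ActualProcedure,HistoryPhysicalNoteText,OperativeNoteText,DischargeInstructionNoteText\n"
    "TRUE,Inguinal hernia,Laparoscopic hernia repair,H&P note,Operative report,Keep the incision clean and dry.\n"
)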
helm/benchmark/scenarios/summarization_scenario.py
@@ -3,7 +3,17 @@ import pickle
 
 from typing import List, Optional
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
-from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, VALID_SPLIT, TEST_SPLIT, CORRECT_TAG, Input, Output
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
 
 
 class SummarizationScenario(Scenario):
helm/benchmark/scenarios/sumosum_scenario.py
@@ -0,0 +1,157 @@
+import os
+import re
+from typing import Dict, List, Optional
+
+import pandas as pd
+
+from helm.common.general import ensure_file_downloaded, ensure_directory_exists
+from helm.common.optional_dependencies import handle_module_not_found_error
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+try:
+    # Needed for pandas.read_excel
+    import openpyxl  # noqa
+except ModuleNotFoundError as e:
+    handle_module_not_found_error(e, ["ibm-enterprise-scenarios"])
+
+
+class SUMOSumScenario(Scenario):
+    """SUMO Web Claims Summarization
+
+    SUMO Web Claims Summarization is a summarization task over the climate subset from the SUMO dataset.
+    The task is to write a title based on the article contents.
+
+    Citation:
+    @inproceedings{mishra-etal-2020-generating,
+        title = "Generating Fact Checking Summaries for Web Claims",
+        author = "Mishra, Rahul and
+          Gupta, Dhruv and
+          Leippold, Markus",
+        editor = "Xu, Wei and
+          Ritter, Alan and
+          Baldwin, Tim and
+          Rahimi, Afshin",
+        booktitle = "Proceedings of the Sixth Workshop on Noisy User-generated Text (W-NUT 2020)",
+        month = nov,
+        year = "2020",
+        address = "Online",
+        publisher = "Association for Computational Linguistics",
+        url = "https://aclanthology.org/2020.wnut-1.12",
+        doi = "10.18653/v1/2020.wnut-1.12",
+        pages = "81--90",
+        abstract = "We present SUMO, a neural attention-based approach that learns to establish correctness of textual claims based on evidence in the form of text documents (e.g., news articles or web documents). SUMO further generates an extractive summary by presenting a diversified set of sentences from the documents that explain its decision on the correctness of the textual claim. Prior approaches to address the problem of fact checking and evidence extraction have relied on simple concatenation of claim and document word embeddings as an input to claim driven attention weight computation. This is done so as to extract salient words and sentences from the documents that help establish the correctness of the claim. However this design of claim-driven attention fails to capture the contextual information in documents properly. We improve on the prior art by using improved claim and title guided hierarchical attention to model effective contextual cues. We show the efficacy of our approach on political, healthcare, and environmental datasets.",
+    }
+    """  # noqa: E501
+
+    name = "sumosum"
+    description = "Text summarization with climate corpus"
+    tags = ["summarization", "climate"]
+
+    TRAIN_RATIO = 0.2
+    TITLE_KEY = "Title"
+    DOCUMENT_KEY = "Doc_text"
+
+    def __init__(
+        self,
+        train_filter_min_length: Optional[int] = None,
+        train_filter_max_length: Optional[int] = None,
+        test_filter_min_length: Optional[int] = None,
+        test_filter_max_length: Optional[int] = None,
+        truncate_length: Optional[int] = None,
+    ):
+        """
+        Initializes the scenario.
+        Args:
+            train_filter_min_length: Int indicating minimum length for training
+                documents. Train examples smaller than
+                train_filter_min_length tokens will be filtered out.
+            train_filter_max_length: Int indicating maximum length for training
+                documents. Train examples larger than
+                train_filter_max_length tokens will be filtered out.
+            test_filter_min_length: Int indicating minimum length for test
+                documents. Test examples smaller than
+                test_filter_min_length tokens will be filtered out.
+            test_filter_max_length: Int indicating maximum length for test
+                documents. Test examples larger than
+                test_filter_max_length tokens will be filtered out.
+            truncate_length: Int indicating the maximum length in tokens to
+                truncate documents. Documents in all splits will be
+                truncated to truncate_length tokens.
+                NOTE: Whitespace tokenization is used to compute tokens.
+        """
+        super().__init__()
+        self.train_filter_min_length = train_filter_min_length
+        self.train_filter_max_length = train_filter_max_length
+        self.test_filter_min_length = test_filter_min_length
+        self.test_filter_max_length = test_filter_max_length
+        self.truncate_length = truncate_length
+
+    @staticmethod
+    def _clean_and_truncate(text: str, max_length: Optional[int] = None) -> str:
+        text = re.sub(r"\s+", " ", text)
+        return " ".join(text.split()[:max_length])
+
+    def _load_dataset(self, output_path: str) -> Dict[str, pd.DataFrame]:
+        data_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(data_dir)
+
+        source_url = "https://github.com/rahulOmishra/SUMO/raw/main/climate_claims_raw.xlsx"
+        source_file = os.path.basename(source_url)
+        target_path = os.path.join(data_dir, source_file)
+        ensure_file_downloaded(
+            source_url=source_url,
+            target_path=target_path,
+        )
+
+        # Column headers: Claim_id(int),Claim,Title,Doc_text,Label(bool)
+        target_df = pd.read_excel(target_path, skiprows=1)
+        target_df = target_df.dropna(subset=[SUMOSumScenario.TITLE_KEY, SUMOSumScenario.DOCUMENT_KEY])
+        # Remove carriage return _x000D_ in Excel string
+        target_df = target_df.replace({r"_x000D_": ""}, regex=True)
+        # target_df = target_df.replace({r"_x([0-9a-fA-F]{4})_": ""}, regex=True)
+        # Split randomly (works better than split by order)
+        train_df = target_df.sample(frac=SUMOSumScenario.TRAIN_RATIO, random_state=0)
+        test_df = target_df.drop(train_df.index).sample(frac=1, random_state=0)
+        return {TRAIN_SPLIT: train_df, TEST_SPLIT: test_df}
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        dataset_dict = self._load_dataset(output_path)
+
+        instances: List[Instance] = []
+
+        for split, split_data in dataset_dict.items():
+            for example in split_data.itertuples():
+                document = getattr(example, SUMOSumScenario.DOCUMENT_KEY)
+                title = getattr(example, SUMOSumScenario.TITLE_KEY)
+                art_len = len(document.split())
+                if split == TEST_SPLIT:
+                    if self.test_filter_max_length and art_len > self.test_filter_max_length:
+                        continue
+                    if self.test_filter_min_length and art_len < self.test_filter_min_length:
+                        continue
+                if split == TRAIN_SPLIT:
+                    if self.train_filter_max_length and art_len > self.train_filter_max_length:
+                        continue
+                    if self.train_filter_min_length and art_len < self.train_filter_min_length:
+                        continue
+
+                document = SUMOSumScenario._clean_and_truncate(document, self.truncate_length)
+                title = SUMOSumScenario._clean_and_truncate(title)
+
+                instance = Instance(
+                    input=Input(text=document),
+                    references=[Reference(output=Output(text=title), tags=[CORRECT_TAG])],
+                    split=split,
+                )
+                instances.append(instance)
+
+        return instances
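All of the length filters and truncate_length above count whitespace-separated tokens, per the NOTE in the docstring. A usage sketch, assuming the scenario is driven directly with a local output directory (in practice HELM constructs it from a run spec; the path below is hypothetical):

# Keep only test articles of at most 2000 whitespace tokens, truncating inputs to 1000.
scenario = SUMOSumScenario(test_filter_max_length=2000, truncate_length=1000)
instances = scenario.get_instances("benchmark_output/scenarios/sumosum")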
helm/benchmark/scenarios/synthetic_efficiency_scenario.py
@@ -2,7 +2,7 @@ import os
 from typing import List
 
 from helm.common.general import ensure_file_downloaded
-from .scenario import Scenario, Instance, Reference, TEST_SPLIT, CORRECT_TAG, Input, Output
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Reference, TEST_SPLIT, CORRECT_TAG, Input, Output
 
 NUM_INPUT_TOKENS: List[int] = [
     1,
helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py
@@ -67,7 +67,17 @@ from copy import copy
 from typing import List, Dict, Literal, Tuple
 from dataclasses import dataclass
 
-from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, VALID_SPLIT, TEST_SPLIT, CORRECT_TAG, Input, Output
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
 
 
 @dataclass(frozen=True)
helm/benchmark/scenarios/synthetic_reasoning_scenario.py
@@ -25,7 +25,17 @@ The model hence is asked to do the following three tasks:
 import numpy as np
 from typing import List, Dict, Tuple
 
-from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, VALID_SPLIT, TEST_SPLIT, CORRECT_TAG, Input, Output
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
 
 ANIMALS = ["zebra", "cobra", "stork", "penguin", "shark", "lion", "buffalo", "whale", "seal", "eagle", "horse", "rat"]
 FRUITS = ["apple", "peach", "watermelon", "banana", "grape", "kiwi", "pear", "strawberry", "blueberry", "blackberry"]
helm/benchmark/scenarios/test_bigcodebench_scenario.py
@@ -0,0 +1,26 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.bigcodebench_scenario import BigCodeBenchScenario
+from helm.benchmark.scenarios.scenario import Input, TEST_SPLIT
+
+
+@pytest.mark.scenarios
+def test_bigcodebench_scenario_get_instances():
+    bigcodebench_scenario = BigCodeBenchScenario("v0.1.2")
+    with TemporaryDirectory() as tmpdir:
+        instances = bigcodebench_scenario.get_instances(tmpdir)
+        assert len(instances) == 1140
+        assert instances[0].id == "BigCodeBench/0"
+        assert instances[0].input == Input(
+            text=(
+                "Calculates the average of the sums of absolute differences between each pair "
+                "of consecutive numbers for all permutations of a given list. Each permutation "
+                "is shuffled before calculating the differences. Args: - numbers (list): A list "
+                "of numbers. Default is numbers from 1 to 10.\nThe function should output with:\n"
+                "    float: The average of the sums of absolute differences for each shuffled permutation "
+                "of the list.\nYou should write self-contained code starting with:\n```\nimport itertools\n"
+                "from random import shuffle\ndef task_func(numbers=list(range(1, 3))):\n```"
+            )
+        )
+        assert instances[0].split == TEST_SPLIT
helm/benchmark/scenarios/test_czech_bank_qa_scenario.py
@@ -0,0 +1,18 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.czech_bank_qa_scenario import CzechBankQAScenario
+from helm.benchmark.scenarios.scenario import CORRECT_TAG, Input
+
+
+@pytest.mark.scenarios
+def test_czech_bank_qa_scenario_get_instances():
+    scenario = CzechBankQAScenario(config_name="default")
+    with TemporaryDirectory() as tmpdir:
+        actual_instances = scenario.get_instances(tmpdir)
+        assert len(actual_instances) == 30
+        assert actual_instances[0].input == Input(text="Get the total number of accounts in the system")
+        assert len(actual_instances[0].references) == 1
+        assert actual_instances[0].references[0].tags == [CORRECT_TAG]
+        assert actual_instances[0].references[0].output.text == "SELECT COUNT(*) FROM ACCOUNT"
+        assert actual_instances[0].split == "test"
@@ -0,0 +1,53 @@
+ import pytest
+ from tempfile import TemporaryDirectory
+
+ from helm.benchmark.scenarios.enem_challenge_scenario import ENEMChallengeScenario
+ from helm.benchmark.scenarios.scenario import TEST_SPLIT, CORRECT_TAG, Output, Reference
+
+
+ @pytest.mark.scenarios
+ def test_enem_challenge_scenario():
+     enem_scenario = ENEMChallengeScenario()
+     with TemporaryDirectory() as tmpdir:
+         instances = enem_scenario.get_instances(tmpdir)
+         assert len(instances) == 1431
+         assert instances[0].split == TEST_SPLIT
+
+         assert instances[0].input.text.startswith(
+             "A atmosfera terrestre é composta pelos gases nitrogênio (N2) e oxigênio (O2)"
+         )
+         assert len(instances[0].input.text) == 1163
+
+         assert instances[0].references == [
+             Reference(
+                 output=Output(
+                     text="reduzir o calor irradiado pela Terra mediante a substituição da produção primária pela industrialização refrigerada. "  # noqa: E501
+                 ),
+                 tags=[],
+             ),
+             Reference(
+                 output=Output(
+                     text="promover a queima da biomassa vegetal, responsável pelo aumento do efeito estufa devido à produção de CH4. "  # noqa: E501
+                 ),
+                 tags=[],
+             ),
+             Reference(
+                 output=Output(
+                     text="reduzir o desmatamento, mantendo-se, assim, o potencial da vegetação em absorver o CO2 da atmosfera. "  # noqa: E501
+                 ),
+                 tags=[CORRECT_TAG],
+             ),
+             Reference(
+                 output=Output(
+                     text="aumentar a concentração atmosférica de H2O, molécula capaz de absorver grande quantidade de calor. "  # noqa: E501
+                 ),
+                 tags=[],
+             ),
+             Reference(
+                 output=Output(
+                     text="remover moléculas orgânicas polares da atmosfera, diminuindo a capacidade delas de reter calor. "  # noqa: E501
+                 ),
+                 tags=[],
+             ),
+         ]
+         assert instances[0].references[2].is_correct
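The final assertion relies on `Reference.is_correct`, which appears to be a convenience equivalent to checking `CORRECT_TAG in reference.tags`; that is why only the third reference above satisfies it. A hedged sketch of that relationship, reusing the test's `instances`:

    # Hedged sketch: is_correct should hold exactly for references
    # that carry CORRECT_TAG, and for no others.
    for reference in instances[0].references:
        assert reference.is_correct == (CORRECT_TAG in reference.tags)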
@@ -13,9 +13,13 @@ def test_ewok_scenario():
      with TemporaryDirectory() as tmpdir:
          try:
              instances = scenario.get_instances(tmpdir)
-         except DatasetNotFoundError:
+         except (DatasetNotFoundError, Exception):
+             # Catch generic Exception here because the Hugging Face Hub client can raise
+             # an authentication issue as the following exception:
+             # huggingface_hub.utils._errors.GatedRepoError: 401 Client Error.
+             # but GatedRepoError is a type in a private package, so we avoid referencing it
              pytest.skip("Unable to access gated dataset on Hugging Face Hub; skipping test")
-         assert len(instances) == 8748
+         assert len(instances) == 8750
          assert "believes" in instances[0].input.text
          assert len(instances[0].references) == 2
          assert "inside" in instances[0].references[0].output.text
@@ -0,0 +1,18 @@
+ import pytest
+ from tempfile import TemporaryDirectory
+
+ from helm.benchmark.scenarios.gold_commodity_news_scenario import GoldCommodityNewsScenario
+ from helm.benchmark.scenarios.scenario import CORRECT_TAG, Input, Output, Reference
+
+
+ @pytest.mark.scenarios
+ def test_legalbench_scenario():
+     scenario = GoldCommodityNewsScenario(category="price_or_not")
+     with TemporaryDirectory() as tmpdir:
+         instances = scenario.get_instances(tmpdir)
+         assert len(instances) == 11412
+         assert instances[0].input == Input(text="april gold down 20 cents to settle at $1,116.10/oz")
+         assert instances[0].references == [
+             Reference(output=Output(text="Yes"), tags=[CORRECT_TAG]),
+         ]
+         assert instances[0].split == "test"
@@ -0,0 +1,44 @@
+ import pytest
+ from tempfile import TemporaryDirectory
+ from helm.benchmark.scenarios.gpqa_scenario import GPQAScenario
+ from helm.benchmark.scenarios.scenario import CORRECT_TAG
+
+
+ @pytest.mark.scenarios
+ def test_gpqa_scenario():
+     with TemporaryDirectory() as tmpdir:
+         scenario = GPQAScenario(subset="gpqa_main")
+         instances = scenario.get_instances(tmpdir)
+         assert len(instances) == 448
+         assert instances[0].split == "test"
+         assert len(instances[0].input.text) == 689
+         references = instances[0].references
+         assert len(references[0].output.text) == 10
+         assert len(references[1].output.text) == 6
+         assert len(references[2].output.text) == 9
+         assert len(references[3].output.text) == 7
+         assert references[3].tags == [CORRECT_TAG]
+
+         scenario = GPQAScenario(subset="gpqa_diamond")
+         instances = scenario.get_instances(tmpdir)
+         assert len(instances) == 198
+         assert instances[0].split == "test"
+         assert len(instances[0].input.text) == 262
+         references = instances[0].references
+         assert len(references[0].output.text) == 8
+         assert len(references[1].output.text) == 9
+         assert len(references[2].output.text) == 8
+         assert len(references[3].output.text) == 8
+         assert references[3].tags == [CORRECT_TAG]
+
+         scenario = GPQAScenario(subset="gpqa_extended")
+         instances = scenario.get_instances(tmpdir)
+         assert len(instances) == 546
+         assert instances[0].split == "test"
+         assert len(instances[0].input.text) == 689
+         references = instances[0].references
+         assert len(references[0].output.text) == 10
+         assert len(references[1].output.text) == 6
+         assert len(references[2].output.text) == 9
+         assert len(references[3].output.text) == 7
+         assert references[3].tags == [CORRECT_TAG]
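The identical lengths asserted for `gpqa_main` and `gpqa_extended` are consistent with the extended split being a superset that shares the same first record. A hedged sketch of loading the three subsets directly (assuming the dataset downloads succeed):

    from tempfile import TemporaryDirectory

    from helm.benchmark.scenarios.gpqa_scenario import GPQAScenario

    with TemporaryDirectory() as tmpdir:
        for subset in ("gpqa_main", "gpqa_diamond", "gpqa_extended"):
            instances = GPQAScenario(subset=subset).get_instances(tmpdir)
            # Expected sizes per the test above: 448, 198, and 546.
            print(subset, len(instances))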
@@ -0,0 +1,36 @@
+ import pytest
+ from tempfile import TemporaryDirectory
+
+ from helm.benchmark.scenarios.ifeval_scenario import IFEvalScenario
+ from helm.benchmark.scenarios.scenario import Input, TEST_SPLIT
+
+
+ @pytest.mark.scenarios
+ def test_ifeval_scenario_get_instances():
+     ifeval_scenario = IFEvalScenario()
+     with TemporaryDirectory() as tmpdir:
+         instances = ifeval_scenario.get_instances(tmpdir)
+         assert len(instances) == 541
+         assert instances[0].input == Input(
+             text=(
+                 "Write a 300+ word summary of the wikipedia page "
+                 '"https://en.wikipedia.org/wiki/Raymond_III,_Count_of_Tripoli". '
+                 "Do not use any commas and highlight at least 3 sections that has titles in markdown format, "
+                 "for example *highlighted section part 1*, *highlighted section part 2*, "
+                 "*highlighted section part 3*."
+             )
+         )
+         assert instances[0].split == TEST_SPLIT
+         assert instances[0].extra_data
+         assert instances[0].extra_data["instruction_ids"] == [
+             "punctuation:no_comma",
+             "detectable_format:number_highlighted_sections",
+             "length_constraints:number_words",
+         ]
+         kwargs_groups = instances[0].extra_data["instruction_kwargs"]
+         assert all(_ is None for _ in kwargs_groups[0].values())
+         assert kwargs_groups[1]["num_highlights"] == 3
+         assert all(kwargs_groups[1][key] is None for key in kwargs_groups[1] if key != "num_highlights")
+         assert kwargs_groups[2]["relation"] == "at least"
+         assert kwargs_groups[2]["num_words"] == 300
+         assert all(kwargs_groups[2][key] is None for key in kwargs_groups[2] if key not in {"relation", "num_words"})
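The two `extra_data` lists are parallel: `instruction_ids[i]` names the i-th verifiable instruction and `instruction_kwargs[i]` holds its parameters, with unused keys set to `None`. A hedged sketch of reading them together, continuing from the test's `instances`:

    # Pair each instruction id with its non-None parameters.
    instance = instances[0]
    for instruction_id, kwargs in zip(
        instance.extra_data["instruction_ids"],
        instance.extra_data["instruction_kwargs"],
    ):
        params = {key: value for key, value in kwargs.items() if value is not None}
        print(instruction_id, params)
    # Expected, per the assertions above:
    #   punctuation:no_comma {}
    #   detectable_format:number_highlighted_sections {'num_highlights': 3}
    #   length_constraints:number_words {'relation': 'at least', 'num_words': 300}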
@@ -0,0 +1,27 @@
+ import pytest
+ from tempfile import TemporaryDirectory
+
+ from helm.benchmark.scenarios.imdb_ptbr_scenario import IMDB_PTBRScenario
+ from helm.benchmark.scenarios.scenario import TRAIN_SPLIT, CORRECT_TAG, Output, Reference
+
+
+ @pytest.mark.scenarios
+ def test_imdb_ptbr_scenario():
+     imdb_ptbr = IMDB_PTBRScenario()
+     with TemporaryDirectory() as tmpdir:
+         instances = imdb_ptbr.get_instances(tmpdir)
+         assert len(instances) == 30000
+         assert instances[0].split == TRAIN_SPLIT
+
+         assert instances[10].input.text.startswith(
+             "Foi ótimo ver algumas das minhas estrelas favoritas de 30 anos atrás, "
+             "incluindo John Ritter, Ben Gazarra e Audrey Hepburn."
+         )
+         assert len(instances[10].input.text) == 1549
+
+         assert instances[10].references == [
+             Reference(
+                 output=Output(text="negativo"),
+                 tags=[CORRECT_TAG],
+             )
+         ]