crfm-helm 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (606)
  1. crfm_helm-0.5.5.dist-info/METADATA +413 -0
  2. crfm_helm-0.5.5.dist-info/RECORD +894 -0
  3. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  22. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  23. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  24. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  25. helm/benchmark/adaptation/prompt.py +1 -1
  26. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  27. helm/benchmark/annotation/air_bench_annotator.py +20 -5
  28. helm/benchmark/annotation/annotator.py +5 -0
  29. helm/benchmark/annotation/annotator_factory.py +3 -20
  30. helm/benchmark/annotation/anthropic_red_team_annotator.py +11 -24
  31. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  32. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  33. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  34. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  35. helm/benchmark/annotation/call_center_annotator.py +22 -11
  36. helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
  37. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  38. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  39. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  40. helm/benchmark/annotation/harm_bench_annotator.py +11 -24
  41. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  42. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  43. helm/benchmark/annotation/live_qa_annotator.py +10 -5
  44. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  45. helm/benchmark/annotation/medalign_annotator.py +100 -0
  46. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  47. helm/benchmark/annotation/medication_qa_annotator.py +90 -61
  48. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  49. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  50. helm/benchmark/annotation/model_as_judge.py +281 -18
  51. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  52. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  53. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  54. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  55. helm/benchmark/annotation/omni_math_annotator.py +132 -0
  56. helm/benchmark/annotation/simple_safety_tests_annotator.py +11 -25
  57. helm/benchmark/annotation/spider_annotator.py +18 -0
  58. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  59. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  60. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  61. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  62. helm/benchmark/annotation/xstest_annotator.py +20 -30
  63. helm/benchmark/annotation_executor.py +35 -15
  64. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  65. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  66. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  67. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  68. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  69. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  70. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  71. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  72. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  73. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  74. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  75. helm/benchmark/augmentations/perturbation.py +1 -1
  76. helm/benchmark/augmentations/space_perturbation.py +2 -2
  77. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  78. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  79. helm/benchmark/augmentations/test_perturbation.py +16 -13
  80. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  81. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  82. helm/benchmark/data_preprocessor.py +2 -2
  83. helm/benchmark/huggingface_registration.py +2 -7
  84. helm/benchmark/metrics/aci_bench_metrics.py +34 -0
  85. helm/benchmark/metrics/basic_metrics.py +6 -6
  86. helm/benchmark/metrics/bbq_metrics.py +2 -2
  87. helm/benchmark/metrics/bias_metrics.py +12 -3
  88. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  89. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  90. helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
  91. helm/benchmark/metrics/classification_metrics.py +76 -12
  92. helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
  93. helm/benchmark/metrics/code_metrics.py +5 -5
  94. helm/benchmark/metrics/comet_metric.py +125 -0
  95. helm/benchmark/metrics/common_metric_specs.py +9 -2
  96. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  97. helm/benchmark/metrics/copyright_metrics.py +4 -4
  98. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  99. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  100. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  101. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  102. helm/benchmark/metrics/dischargeme_metrics.py +34 -0
  103. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  104. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  105. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  106. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  107. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  108. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  109. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  110. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  111. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  112. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  113. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  114. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  115. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  116. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  117. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  118. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  120. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  121. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  122. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  123. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  124. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  125. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  126. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  127. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  128. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  129. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  130. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  131. helm/benchmark/metrics/med_dialog_metrics.py +34 -0
  132. helm/benchmark/metrics/medalign_metrics.py +34 -0
  133. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  134. helm/benchmark/metrics/medec_metrics.py +101 -0
  135. helm/benchmark/metrics/medi_qa_metrics.py +34 -0
  136. helm/benchmark/metrics/medication_qa_metrics.py +15 -4
  137. helm/benchmark/metrics/mental_health_metrics.py +34 -0
  138. helm/benchmark/metrics/metric.py +3 -3
  139. helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
  140. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  141. helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
  142. helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
  143. helm/benchmark/metrics/nltk_helper.py +32 -0
  144. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  145. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  146. helm/benchmark/metrics/output_processing_metric.py +60 -0
  147. helm/benchmark/metrics/output_processors.py +15 -0
  148. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  149. helm/benchmark/metrics/ranking_metrics.py +3 -3
  150. helm/benchmark/metrics/reference_metric.py +3 -3
  151. helm/benchmark/metrics/safety_metrics.py +39 -17
  152. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  153. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  154. helm/benchmark/metrics/spider_metrics.py +7 -0
  155. helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
  156. helm/benchmark/metrics/statistic.py +1 -1
  157. helm/benchmark/metrics/summac/model_summac.py +1 -1
  158. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  159. helm/benchmark/metrics/summarization_metrics.py +19 -9
  160. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  161. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  162. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  163. helm/benchmark/metrics/test_metric.py +1 -1
  164. helm/benchmark/metrics/test_statistic.py +2 -2
  165. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  166. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  167. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  168. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  169. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  170. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  171. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  172. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
  173. helm/benchmark/metrics/toxicity_metrics.py +4 -4
  174. helm/benchmark/metrics/unitxt_metrics.py +21 -4
  175. helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
  176. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  177. helm/benchmark/model_metadata_registry.py +16 -0
  178. helm/benchmark/presentation/create_plots.py +1 -1
  179. helm/benchmark/presentation/schema.py +3 -0
  180. helm/benchmark/presentation/summarize.py +119 -256
  181. helm/benchmark/presentation/test_summarize.py +145 -3
  182. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  183. helm/benchmark/reeval_run.py +203 -0
  184. helm/benchmark/reeval_runner.py +355 -0
  185. helm/benchmark/run.py +8 -17
  186. helm/benchmark/run_expander.py +105 -8
  187. helm/benchmark/run_spec_factory.py +12 -0
  188. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  189. helm/benchmark/run_specs/audio_run_specs.py +613 -0
  190. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  191. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  192. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  193. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  194. helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
  195. helm/benchmark/run_specs/experimental_run_specs.py +112 -3
  196. helm/benchmark/run_specs/finance_run_specs.py +6 -2
  197. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  198. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  199. helm/benchmark/run_specs/long_context_run_specs.py +89 -0
  200. helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
  201. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  202. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  203. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  204. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +66 -52
  205. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  206. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  207. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  208. helm/benchmark/run_specs/vlm_run_specs.py +83 -5
  209. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  210. helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
  211. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  212. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  213. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  214. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  215. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
  216. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  217. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  218. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  219. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  220. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  221. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  222. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  223. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  224. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  225. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  226. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  227. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  228. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  229. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  230. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  231. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  232. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  233. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  234. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  235. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
  236. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  237. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
  238. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  239. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  240. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  241. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  242. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  243. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  244. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  245. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  246. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  247. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  248. helm/benchmark/scenarios/bold_scenario.py +1 -1
  249. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  250. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  251. helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
  252. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  253. helm/benchmark/scenarios/clear_scenario.py +153 -0
  254. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  255. helm/benchmark/scenarios/code_scenario.py +17 -4
  256. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  257. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  258. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  259. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  260. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  261. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  262. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  263. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  264. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  265. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  266. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  267. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  268. helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
  269. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  270. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  271. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  272. helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
  273. helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
  274. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  275. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  276. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  277. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  278. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  279. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  280. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  281. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  282. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  283. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  284. helm/benchmark/scenarios/headqa_scenario.py +131 -0
  285. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  286. helm/benchmark/scenarios/ice_scenario.py +8 -4
  287. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  288. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  289. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  290. helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
  291. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  292. helm/benchmark/scenarios/koala_scenario.py +1 -1
  293. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  294. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  295. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  296. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  297. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  298. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  299. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  300. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  301. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  302. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  303. helm/benchmark/scenarios/math_scenario.py +9 -1
  304. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  305. helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
  306. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  307. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  308. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  309. helm/benchmark/scenarios/medalign_scenario.py +88 -0
  310. helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
  311. helm/benchmark/scenarios/medbullets_scenario.py +140 -0
  312. helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
  313. helm/benchmark/scenarios/medec_scenario.py +120 -0
  314. helm/benchmark/scenarios/medhallu_scenario.py +66 -0
  315. helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
  316. helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
  317. helm/benchmark/scenarios/mental_health_scenario.py +112 -0
  318. helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
  319. helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
  320. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
  321. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  322. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  323. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  324. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  325. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
  326. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
  327. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
  328. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  329. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  330. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  331. helm/benchmark/scenarios/numeracy_scenario.py +10 -1
  332. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  333. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  334. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  335. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  336. helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
  337. helm/benchmark/scenarios/quac_scenario.py +10 -1
  338. helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
  339. helm/benchmark/scenarios/raft_scenario.py +18 -3
  340. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  341. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  342. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  343. helm/benchmark/scenarios/scenario.py +9 -1
  344. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +233 -84
  345. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  346. helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
  347. helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
  348. helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
  349. helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
  350. helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
  351. helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
  352. helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
  353. helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
  354. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  355. helm/benchmark/scenarios/spider_scenario.py +91 -0
  356. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
  357. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  358. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  359. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  360. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  361. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  362. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  363. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  364. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  365. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  366. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  367. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  368. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  369. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  370. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
  371. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  372. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  373. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  374. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  375. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  376. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  377. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  378. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  379. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  380. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  381. helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
  382. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  383. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  384. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  385. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  386. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  387. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  388. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  389. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  390. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  391. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  392. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  393. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  394. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  395. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  396. helm/benchmark/server.py +11 -0
  397. helm/benchmark/slurm_runner.py +1 -1
  398. helm/benchmark/static/schema_audio.yaml +752 -0
  399. helm/benchmark/static/schema_autobencher.yaml +150 -0
  400. helm/benchmark/static/schema_call_center.yaml +97 -60
  401. helm/benchmark/static/schema_capabilities.yaml +254 -0
  402. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  403. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  404. helm/benchmark/static/schema_enterprise.yaml +298 -0
  405. helm/benchmark/static/schema_finance.yaml +14 -12
  406. helm/benchmark/static/schema_heim.yaml +1389 -0
  407. helm/benchmark/static/schema_legal.yaml +566 -0
  408. helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
  409. helm/benchmark/static/schema_medhelm.yaml +1081 -0
  410. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  411. helm/benchmark/static/schema_safety.yaml +42 -6
  412. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +40 -26
  413. helm/benchmark/static/schema_social_audio.yaml +224 -0
  414. helm/benchmark/static/schema_sql.yaml +171 -0
  415. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +187 -30
  416. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  417. helm/benchmark/static/schema_vhelm.yaml +151 -47
  418. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  419. helm/benchmark/static_build/assets/index-262903c1.js +10 -0
  420. helm/benchmark/static_build/assets/index-42060d71.css +1 -0
  421. helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
  422. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  423. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  424. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
  425. helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
  426. helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
  427. helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
  428. helm/benchmark/static_build/config.js +1 -1
  429. helm/benchmark/static_build/index.html +5 -5
  430. helm/benchmark/window_services/default_window_service.py +1 -1
  431. helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
  432. helm/benchmark/window_services/ice_window_service.py +1 -1
  433. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  434. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  435. helm/benchmark/window_services/local_window_service.py +2 -2
  436. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  437. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  438. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  439. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  440. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  441. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  442. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  443. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  444. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  445. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  446. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  447. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  448. helm/benchmark/window_services/test_utils.py +1 -1
  449. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  450. helm/benchmark/window_services/tokenizer_service.py +0 -5
  451. helm/benchmark/window_services/yalm_window_service.py +1 -1
  452. helm/clients/ai21_client.py +3 -3
  453. helm/clients/aleph_alpha_client.py +1 -1
  454. helm/clients/audio_language/__init__.py +0 -0
  455. helm/clients/audio_language/diva_llama_client.py +118 -0
  456. helm/clients/audio_language/llama_omni_client.py +198 -0
  457. helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
  458. helm/clients/audio_language/qwen_audiolm_client.py +150 -0
  459. helm/clients/auto_client.py +4 -2
  460. helm/clients/azure_openai_client.py +55 -0
  461. helm/clients/bedrock_client.py +201 -7
  462. helm/clients/bedrock_utils.py +33 -0
  463. helm/clients/clip_scorers/clip_scorer.py +1 -1
  464. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  465. helm/clients/cohere_client.py +3 -3
  466. helm/clients/google_client.py +1 -1
  467. helm/clients/http_model_client.py +1 -1
  468. helm/clients/huggingface_client.py +10 -18
  469. helm/clients/ibm_client.py +267 -0
  470. helm/clients/image_generation/adobe_vision_client.py +1 -1
  471. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  472. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  473. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  474. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  475. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  476. helm/clients/image_generation/cogview2_client.py +1 -1
  477. helm/clients/image_generation/dalle2_client.py +1 -1
  478. helm/clients/image_generation/dalle3_client.py +2 -2
  479. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  480. helm/clients/image_generation/dalle_mini/data.py +1 -1
  481. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  482. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  483. helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
  484. helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
  485. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  486. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  487. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  488. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  489. helm/clients/image_generation/dalle_mini_client.py +1 -1
  490. helm/clients/image_generation/deep_floyd_client.py +1 -1
  491. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  492. helm/clients/image_generation/lexica_client.py +1 -1
  493. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  494. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  495. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  496. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  497. helm/clients/image_generation/mindalle_client.py +1 -1
  498. helm/clients/image_generation/together_image_generation_client.py +1 -1
  499. helm/clients/lit_gpt_client.py +2 -2
  500. helm/clients/mistral_client.py +62 -18
  501. helm/clients/nvidia_nim_client.py +0 -3
  502. helm/clients/openai_client.py +255 -21
  503. helm/clients/palmyra_client.py +2 -6
  504. helm/clients/reka_client.py +1 -1
  505. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  506. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  507. helm/clients/stanfordhealthcare_google_client.py +43 -0
  508. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  509. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  510. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  511. helm/clients/test_client.py +1 -1
  512. helm/clients/test_together_client.py +6 -1
  513. helm/clients/together_client.py +69 -7
  514. helm/clients/upstage_client.py +23 -0
  515. helm/clients/vertexai_client.py +39 -13
  516. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  517. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  518. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  519. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  520. helm/clients/vision_language/qwen2_vlm_client.py +175 -0
  521. helm/clients/vllm_client.py +4 -6
  522. helm/clients/yi_client.py +0 -3
  523. helm/common/audio_utils.py +111 -0
  524. helm/common/cache.py +8 -30
  525. helm/common/file_caches/local_file_cache.py +1 -1
  526. helm/common/file_caches/test_local_file_cache.py +1 -1
  527. helm/common/images_utils.py +2 -2
  528. helm/common/key_value_store.py +9 -9
  529. helm/common/media_object.py +2 -2
  530. helm/common/mongo_key_value_store.py +3 -3
  531. helm/common/multimodal_request_utils.py +26 -0
  532. helm/common/reeval_parameters.py +12 -0
  533. helm/common/request.py +6 -2
  534. helm/common/response_format.py +18 -0
  535. helm/common/test_cache.py +1 -48
  536. helm/common/test_media_object.py +1 -1
  537. helm/common/tokenization_request.py +0 -9
  538. helm/config/model_deployments.yaml +1258 -33
  539. helm/config/model_metadata.yaml +1110 -41
  540. helm/config/tokenizer_configs.yaml +403 -3
  541. helm/proxy/cli.py +2 -2
  542. helm/proxy/example_queries.py +1 -1
  543. helm/proxy/server.py +11 -13
  544. helm/proxy/services/remote_service.py +1 -7
  545. helm/proxy/services/server_service.py +6 -19
  546. helm/proxy/services/service.py +0 -6
  547. helm/proxy/services/test_remote_service.py +2 -2
  548. helm/proxy/services/test_service.py +1 -1
  549. helm/proxy/static/general.js +122 -0
  550. helm/proxy/static/help.html +99 -0
  551. helm/proxy/static/index.css +57 -0
  552. helm/proxy/static/index.html +40 -0
  553. helm/proxy/static/index.js +456 -0
  554. helm/proxy/static/info-icon.png +0 -0
  555. helm/proxy/test_retry.py +1 -1
  556. helm/proxy/token_counters/auto_token_counter.py +1 -1
  557. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  558. helm/tokenizers/caching_tokenizer.py +2 -30
  559. helm/tokenizers/http_model_tokenizer.py +1 -1
  560. helm/tokenizers/huggingface_tokenizer.py +2 -2
  561. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  562. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  563. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  564. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  565. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  566. helm/tokenizers/tokenizer.py +3 -1
  567. helm/tokenizers/yalm_tokenizer.py +3 -3
  568. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  569. crfm_helm-0.5.3.dist-info/METADATA +0 -355
  570. crfm_helm-0.5.3.dist-info/RECORD +0 -699
  571. helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
  572. helm/benchmark/data_overlap/export_scenario_text.py +0 -119
  573. helm/benchmark/data_overlap/light_scenario.py +0 -60
  574. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  575. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  576. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  577. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  578. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  579. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  580. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  581. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  582. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  583. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  584. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  585. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  586. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  587. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  588. helm/benchmark/static_build/assets/index-58f97dcd.js +0 -10
  589. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  590. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  591. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  592. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  593. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  594. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  595. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  596. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  597. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  598. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  599. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  600. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  601. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  602. helm/tokenizers/anthropic_tokenizer.py +0 -52
  603. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
  604. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
  605. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
  606. /helm/benchmark/{data_overlap → metrics/ifeval}/__init__.py +0 -0
helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py
@@ -0,0 +1,106 @@
+ from typing import List
+ import os
+
+ from helm.benchmark.scenarios.scenario import (
+     Scenario,
+     Instance,
+     Reference,
+     TEST_SPLIT,
+     CORRECT_TAG,
+     Input,
+     Output,
+ )
+ from tqdm import tqdm
+ from helm.common.media_object import MediaObject, MultimediaObject
+ from helm.common.general import ensure_file_downloaded, ensure_directory_exists
+ from helm.common.audio_utils import use_ffmpeg_to_convert_audio_file
+ import pandas as pd
+
+
+ class VoxCeleb2Scenario(Scenario):
+     """VoxCeleb2
+
+     VoxCeleb2 is an audio-visual dataset consisting of short clips of human speech, extracted from
+     interview videos uploaded to YouTube. This dataset contains over a million utterances from over
+     6,000 speakers.
+
+     Paper: https://www.robots.ox.ac.uk/~vgg/publications/2018/Chung18a/chung18a.pdf
+
+     Citation:
+     @inproceedings{Chung18b,
+         author = "Chung, J.~S. and Nagrani, A. and Zisserman, A.",
+         title = "VoxCeleb2: Deep Speaker Recognition",
+         booktitle = "INTERSPEECH",
+         year = "2018",
+     }
+     """
+
+     DOWNLOADING_URL = "https://huggingface.co/datasets/ProgramComputer/voxceleb/resolve/main/vox2/vox2_test_aac.zip"
+     REFERENCE_URL = (
+         "https://huggingface.co/datasets/LAOS-Y/VoxCeleb2-AudioIdentity/resolve/main/voxceleb2_audioidentity.csv"
+     )
+     IDENTITY_INSTRUCTION = (
+         "Listen to the audio and take your best guess to determine if the two speakers are the same person. "
+         "Give just the letter of your answer and nothing else."
+     )
+
+     name = "voxceleb2"
+     description = (
+         "A large-scale dataset of over a million utterances from over 6,000 speakers with their "
+         "gender, race, identity information "
+         "([Chung et al, 2018](https://www.robots.ox.ac.uk/~vgg/publications/2018/Chung18a/chung18a.pdf))."
+     )
+     tags: List[str] = ["audio", "identification"]
+     options: List[str] = ["Yes", "No"]
+
+     def _convert_answer_to_label(self, answer: bool) -> str:
+         if answer:
+             return "A"
+         else:
+             return "B"
+
+     def _reformat_and_convert_audio_file(
+         self, ori_file_path: str, tgt_audio_data_path: str, audio_data_path: str
+     ) -> str:
+         tgt_audio_path = os.path.join(tgt_audio_data_path, ori_file_path.split(".m4a")[0] + ".wav")
+         ensure_directory_exists(os.path.dirname(tgt_audio_path))
+         use_ffmpeg_to_convert_audio_file(os.path.join(audio_data_path, ori_file_path), tgt_audio_path)
+         return tgt_audio_path
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         instances: List[Instance] = []
+         audio_data_path = os.path.join(output_path, "audio_files")
+         tgt_audio_data_path = os.path.join(output_path, "tgt_audio_files")
+         ensure_file_downloaded(source_url=VoxCeleb2Scenario.DOWNLOADING_URL, target_path=audio_data_path, unpack=True)
+         annotations = pd.read_csv(VoxCeleb2Scenario.REFERENCE_URL, sep=",")
+         for _, row in tqdm(annotations.iterrows(), total=len(annotations)):
+             tgt_first_audio_path = self._reformat_and_convert_audio_file(
+                 row["first"], tgt_audio_data_path, audio_data_path
+             )
+             tgt_second_audio_path = self._reformat_and_convert_audio_file(
+                 row["second"], tgt_audio_data_path, audio_data_path
+             )
+
+             answer = self._convert_answer_to_label(row["same"])
+             # The given correct answer is a letter, but we need an index
+             correct_answer_index: int = ord(answer) - ord("A")
+             references: List[Reference] = []
+             for i, option in enumerate(self.options):
+                 reference: Reference
+                 is_correct: bool = i == correct_answer_index
+                 reference = Reference(Output(text=option), tags=[CORRECT_TAG] if is_correct else [])
+                 references.append(reference)
+
+             input = Input(
+                 multimedia_content=MultimediaObject(
+                     [
+                         MediaObject(content_type="audio/wav", location=tgt_first_audio_path),
+                         MediaObject(content_type="audio/wav", location=tgt_second_audio_path),
+                         MediaObject(content_type="text/plain", text=self.IDENTITY_INSTRUCTION),
+                     ]
+                 )
+             )
+             instances.append(Instance(input=input, references=references, split=TEST_SPLIT))
+
+         return instances
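
Note: use_ffmpeg_to_convert_audio_file comes from the new helm/common/audio_utils.py (+111 lines, not shown in this excerpt). A minimal sketch of the m4a-to-wav conversion it presumably wraps, assuming a standard ffmpeg subprocess call:

import subprocess

def convert_to_wav(src_path: str, tgt_path: str) -> None:
    # Hypothetical stand-in for the helper; -y overwrites any existing target,
    # and ffmpeg infers input/output formats from the file extensions.
    subprocess.run(["ffmpeg", "-y", "-i", src_path, tgt_path], check=True)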
helm/benchmark/scenarios/autobencher_capabilities_scenario.py
@@ -0,0 +1,68 @@
+ import datasets
+ import os
+ from typing import List
+
+ from helm.benchmark.scenarios.scenario import (
+     CORRECT_TAG,
+     Scenario,
+     Instance,
+     Reference,
+     TEST_SPLIT,
+     Input,
+     Output,
+ )
+ from helm.common.general import ensure_directory_exists
+ from helm.common.hierarchical_logger import hlog
+
+
+ class AutoBencherCapabilitiesScenario(Scenario):
+     """AutoBencher Capabilities
+
+     AutoBencher uses a language model to automatically search
+     for datasets. AutoBencher Capabilities consists of question
+     answering datasets for math, multilingual, and knowledge-intensive
+     question answering created by AutoBencher.
+
+     Paper: https://arxiv.org/abs/2407.08351"""
+
+     name = "autobencher_capabilities"
+     description = (
+         "AutoBencher Capabilities consists of question answering datasets "
+         "for math, multilingual, and knowledge-intensive "
+         "question answering created by AutoBencher. "
+         "([paper](https://arxiv.org/abs/2407.08351))"
+     )
+     tags = ["question answering"]
+
+     SUBJECTS = ["math", "mt", "econ", "science", "history"]
+
+     def __init__(self, subject: str):
+         super().__init__()
+         if subject not in self.SUBJECTS:
+             raise ValueError(f"Unexpected subject {subject}, available subjects are {self.SUBJECTS}")
+         self.subject: str = subject
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         cache_dir = os.path.join(output_path, "data")
+         ensure_directory_exists(cache_dir)
+
+         # TODO: Switch this to the production dataset when available.
+         dataset = datasets.load_dataset(
+             "xlisali1/AutoBencher-capability.json",
+             split="train",  # Use train split as test, so only zero-shot is supported
+             cache_dir=cache_dir,
+             revision="efe58dd72b6423e3f5c967f16cbea8cce3a51933",
+         )
+         instances: List[Instance] = []
+         for row in dataset:
+             # Keep only the rows for the requested subject.
+             if row["subject"] != self.subject:
+                 continue
+             if row["gold_answer"] is None:
+                 hlog(f"WARNING: Row had no gold_answer: {row}")
+                 continue
+             input = Input(text=row["question"])
+             references = [Reference(output=Output(text=row["gold_answer"]), tags=[CORRECT_TAG])]
+             instance = Instance(input=input, references=references, split=TEST_SPLIT)
+             instances.append(instance)
+         return instances
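
A brief usage sketch of the new scenario (hypothetical output path; in practice HELM would construct it through a run spec, likely in the new capabilities_run_specs.py):

from helm.benchmark.scenarios.autobencher_capabilities_scenario import AutoBencherCapabilitiesScenario

# subject must be one of SUBJECTS: "math", "mt", "econ", "science", "history"
scenario = AutoBencherCapabilitiesScenario(subject="math")
instances = scenario.get_instances(output_path="/tmp/autobencher_capabilities")
print(f"{len(instances)} zero-shot test instances")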
helm/benchmark/scenarios/autobencher_safety_scenario.py
@@ -0,0 +1,51 @@
+ import json
+ import os
+ import pandas as pd
+ from typing import List
+
+ from helm.common.general import ensure_file_downloaded
+
+ from .scenario import CORRECT_TAG, Scenario, Instance, Input, TEST_SPLIT, Reference, Output
+
+
+ class AutobencherSafetyScenario(Scenario):
+     """
+     AutoBencher Safety
+
+     AutoBencher uses a language model to automatically search
+     for datasets. AutoBencher Safety consists of safety-focused
+     question answering datasets created by AutoBencher.
+
+     Paper: https://arxiv.org/abs/2407.08351
+     """
+
+     name = "autobencher_safety"
+     description = "AutoBencher Safety consists of safety-focused question answering datasets created by AutoBencher."
+     tags = ["safety"]
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         data_path = os.path.join(output_path, "data")
+         os.makedirs(data_path, exist_ok=True)
+         url = "https://raw.githubusercontent.com/farzaank/AutoBencher/refs/heads/main/safety_processing/process%20full%20dataset%20for%20mTurk/full_dataset.json"  # noqa: E501
+         outf_path = os.path.join(data_path, "full_dataset.json")
+         ensure_file_downloaded(
+             source_url=url,
+             target_path=outf_path,
+             unpack=False,
+         )
+
+         # Load the downloaded JSON file from disk.
+         with open(outf_path, "r") as f:
+             json_data = json.load(f)
+         df = pd.DataFrame(json_data)
+
+         # Read all the instances
+         instances: List[Instance] = []
+
+         for _, row in df.iterrows():
+             references = [Reference(output=Output(text=row["gold_answer"]), tags=[CORRECT_TAG])]
+             input_text = row["question"]
+             input = Input(text=input_text)
+             id = str(row["category"]) + str(row["id"])
+             instance = Instance(input=input, split=TEST_SPLIT, references=references, id=id)
+             instances.append(instance)
+         return instances
helm/benchmark/scenarios/babi_qa_scenario.py
@@ -2,7 +2,7 @@ import os
  from typing import List

  from helm.common.general import ensure_file_downloaded
- from .scenario import (
+ from helm.benchmark.scenarios.scenario import (
      Scenario,
      Instance,
      Reference,
helm/benchmark/scenarios/banking77_scenario.py
@@ -37,7 +37,12 @@ class Banking77Scenario(Scenario):
          ensure_directory_exists(cache_dir)

          # TODO: Switch this to the production dataset when available.
-         dataset = datasets.load_dataset("PolyAI/banking77", cache_dir=cache_dir)
+         dataset = datasets.load_dataset(
+             "PolyAI/banking77",
+             cache_dir=cache_dir,
+             revision="90d4e2ee5521c04fc1488f065b8b083658768c57",
+             trust_remote_code=True,
+         )

          instances: List[Instance] = []
          for split_name in [TRAIN_SPLIT, TEST_SPLIT]:
helm/benchmark/scenarios/bbq_scenario.py
@@ -4,7 +4,7 @@ import random
  from typing import List, Dict, Tuple

  from helm.common.general import ensure_file_downloaded
- from .scenario import (
+ from helm.benchmark.scenarios.scenario import (
      Scenario,
      Instance,
      Reference,
helm/benchmark/scenarios/big_bench_scenario.py
@@ -5,7 +5,17 @@ from typing import List, Dict
  from urllib.parse import urljoin

  from helm.common.general import ensure_directory_exists, ensure_file_downloaded
- from .scenario import Scenario, Instance, Reference, Input, CORRECT_TAG, TRAIN_SPLIT, VALID_SPLIT, TEST_SPLIT, Output
+ from helm.benchmark.scenarios.scenario import (
+     Scenario,
+     Instance,
+     Reference,
+     Input,
+     CORRECT_TAG,
+     TRAIN_SPLIT,
+     VALID_SPLIT,
+     TEST_SPLIT,
+     Output,
+ )


  class BIGBenchScenario(Scenario):
helm/benchmark/scenarios/bigcodebench_scenario.py
@@ -0,0 +1,58 @@
+ import datasets
+ import os
+ from typing import List
+ from helm.benchmark.scenarios.scenario import (
+     Scenario,
+     Instance,
+     TEST_SPLIT,
+     Input,
+ )
+ from helm.common.general import ensure_directory_exists
+
+
+ VERSIONS = ["v0.1.0_hf", "v0.1.1", "v0.1.2", "v0.1.3"]
+
+
+ class BigCodeBenchScenario(Scenario):
+     """BigCodeBench: Benchmarking Code Generation with Diverse Function Calls and Complex Instructions
+
+     BigCodeBench is an easy-to-use benchmark for solving practical and challenging tasks via code.
+     It aims to evaluate the true programming capabilities of large language models (LLMs) in a more realistic setting.
+     The benchmark is designed for HumanEval-like function-level code generation tasks,
+     but with much more complex instructions and diverse function calls."""
+
+     name = "bigcodebench"
+     description = "Benchmarking Code Generation with Diverse Function Calls and Complex Instructions"
+     tags = ["coding"]
+
+     def __init__(self, version: str):
+         super().__init__()
+         assert version in VERSIONS, "Unknown version: {}".format(version)
+         self.version = version
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         # Get BigCodeBench from HuggingFace
+         cache_dir = os.path.join(output_path, "data")
+         ensure_directory_exists(cache_dir)
+         dataset = datasets.load_dataset(
+             "bigcode/bigcodebench",
+             revision="057dd1a30dd73d4ed59cfbaaae049870491fa4d6",
+             cache_dir=cache_dir,
+             split=self.version,
+         )
+         assert isinstance(dataset, datasets.Dataset)
+
+         # Read all instances
+         instances: List[Instance] = []
+         for row in dataset:
+             id = row["task_id"]
+             input = Input(text=row["instruct_prompt"])
+             instance = Instance(
+                 id=id,
+                 input=input,
+                 references=[],
+                 split=TEST_SPLIT,
+             )
+             instances.append(instance)
+
+         return instances
helm/benchmark/scenarios/bird_sql_scenario.py
@@ -0,0 +1,94 @@
+ import json
+ import os
+ from typing import Dict, List
+
+ from filelock import FileLock
+
+ from helm.common.general import ensure_directory_exists, ensure_file_downloaded, shell
+ from helm.common.hierarchical_logger import hlog
+ from helm.benchmark.scenarios.bird_sql_scenario_helper import (  # type: ignore
+     generate_comment_prompt,
+     generate_schema_prompt,
+ )
+ from helm.benchmark.scenarios.scenario import (
+     CORRECT_TAG,
+     Scenario,
+     Instance,
+     Reference,
+     VALID_SPLIT,
+     Input,
+     Output,
+ )
+
+
+ def _ensure_file_unzipped(source_path: str, target_path: str):
+     with FileLock(f"{target_path}.lock"):
+         if os.path.exists(target_path):
+             hlog(f"Not decompressing {source_path} because {target_path} already exists")
+             return
+         tmp_path = target_path + ".tmp"
+         ensure_directory_exists(tmp_path)
+         shell(["unzip", source_path, "-d", tmp_path])
+         shell(["mv", tmp_path, target_path])
+
+
+ class BIRDSQLScenario(Scenario):
+     """BIRD-SQL (Dev)"""
+
+     name = "bird_sql"
+     description = "bird_sql"
+     tags = ["sql"]
+
+     COT_PROMPT = """
+ Think step by step, then generate a single SQL query in valid SQLite syntax. Respond with only your reasoning and SQL query in the following tag-delimited format:
+
+ <reasoning>
+ INSERT_YOUR_REASONING_HERE
+ </reasoning>
+ <sql>
+ INSERT_YOUR_SQL_QUERY_HERE
+ </sql>"""  # noqa: E501
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         data_root_path = os.path.join(output_path, "dev")
+         ensure_file_downloaded(
+             "https://bird-bench.oss-cn-beijing.aliyuncs.com/dev.zip", data_root_path, unpack=True, unpack_type="unzip"
+         )
+         databases_unzip_target = os.path.join(data_root_path, "unzipped_dev_databases")
+         _ensure_file_unzipped(os.path.join(data_root_path, "dev_databases.zip"), databases_unzip_target)
+         # Note: Zip file contains .DS_Store file at the root, which makes dev_databases unzip into a nested directory
+         databases_root_path = os.path.join(databases_unzip_target, "dev_databases")
+
+         database_schema_prompts: Dict[str, str] = {}
+         for database_name in os.listdir(databases_root_path):
+             database_path = os.path.join(databases_root_path, database_name, f"{database_name}.sqlite")
+             if not os.path.exists(database_path):
+                 # Ignore stray ".DS_Store" directory
+                 continue
+
+             database_schema_prompt = generate_schema_prompt(database_path, num_rows=None)
+             database_schema_prompts[database_name] = database_schema_prompt
+
+         instances: List[Instance] = []
+         dataset_path = os.path.join(data_root_path, "dev.json")
+         dataset = json.load(open(dataset_path, "r"))
+         for row in dataset:
+             question_id: int = row["question_id"]
+             database_id: str = row["db_id"]
+             question: str = row["question"]
+             knowledge: str = row["evidence"]
+             gold_sql: str = row["SQL"]
+
+             schema_prompt = database_schema_prompts[database_id]
+             comment_prompt = generate_comment_prompt(question, knowledge)
+             combined_prompt = schema_prompt + "\n\n" + comment_prompt + self.COT_PROMPT
+             instance = Instance(
+                 id=f"id{question_id}",
+                 input=Input(text=combined_prompt),
+                 references=[Reference(output=Output(text=gold_sql), tags=[CORRECT_TAG])],
+                 extra_data={"db_id": row["db_id"]},
+                 split=VALID_SPLIT,
+             )
+             instances.append(instance)
+         return instances
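
The <sql> block of the tag-delimited response requested by COT_PROMPT is what downstream scoring needs; the new bird_sql_annotator.py (+58 lines, not shown in this excerpt) presumably extracts it along these lines. A hedged sketch, not the annotator's actual code:

import re

def extract_sql(completion: str) -> str:
    # Take the contents of the <sql>...</sql> block; fall back to the raw completion.
    match = re.search(r"<sql>(.*?)</sql>", completion, re.DOTALL)
    return match.group(1).strip() if match else completion.strip()

print(extract_sql("<reasoning>Count the rows.</reasoning>\n<sql>\nSELECT COUNT(*) FROM users;\n</sql>"))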
helm/benchmark/scenarios/bird_sql_scenario_helper.py
@@ -0,0 +1,118 @@
+ # flake8: noqa
+ # type: ignore
+ # fmt: off
+
+ from typing import Dict
+ import os
+ import sqlite3
+
+
+ # The following code is copied verbatim from:
+ # https://github.com/AlibabaResearch/DAMO-ConvAI/blob/90a76ef9ef1e2960c9bdfa38c63cc81b841e813e/bird/llm/src/gpt_request.py
+ # under the following license:
+ #
+ # MIT License
+ #
+ # Copyright (c) 2022 Alibaba Research
+ #
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
+ # of this software and associated documentation files (the "Software"), to deal
+ # in the Software without restriction, including without limitation the rights
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ # copies of the Software, and to permit persons to whom the Software is
+ # furnished to do so, subject to the following conditions:
+ #
+ # The above copyright notice and this permission notice shall be included in all
+ # copies or substantial portions of the Software.
+
+
+ def get_db_schemas(bench_root: str, db_name: str) -> Dict[str, str]:
+     """
+     Read an sqlite file, and return the CREATE commands for each of the tables in the database.
+     """
+     asdf = 'database' if bench_root == 'spider' else 'databases'
+     with sqlite3.connect(f'file:{bench_root}/{asdf}/{db_name}/{db_name}.sqlite?mode=ro', uri=True) as conn:
+         # conn.text_factory = bytes
+         cursor = conn.cursor()
+         cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
+         tables = cursor.fetchall()
+         schemas = {}
+         for table in tables:
+             cursor.execute("SELECT sql FROM sqlite_master WHERE type='table' AND name='{}';".format(table[0]))
+             schemas[table[0]] = cursor.fetchone()[0]
+
+         return schemas
+
+ def nice_look_table(column_names: list, values: list):
+     rows = []
+     # Determine the maximum width of each column
+     widths = [max(len(str(value[i])) for value in values + [column_names]) for i in range(len(column_names))]
+
+     # Print the column names
+     header = ''.join(f'{column.rjust(width)} ' for column, width in zip(column_names, widths))
+     # print(header)
+     # Print the values
+     for value in values:
+         row = ''.join(f'{str(v).rjust(width)} ' for v, width in zip(value, widths))
+         rows.append(row)
+     rows = "\n".join(rows)
+     final_output = header + '\n' + rows
+     return final_output
+
+ def generate_schema_prompt(db_path, num_rows=None):
+     # extract create ddls
+     '''
+     :param root_place:
+     :param db_name:
+     :return:
+     '''
+     full_schema_prompt_list = []
+     conn = sqlite3.connect(db_path)
+     # Create a cursor object
+     cursor = conn.cursor()
+     cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
+     tables = cursor.fetchall()
+     schemas = {}
+     for table in tables:
+         if table == 'sqlite_sequence':
+             continue
+         cursor.execute("SELECT sql FROM sqlite_master WHERE type='table' AND name='{}';".format(table[0]))
+         create_prompt = cursor.fetchone()[0]
+         schemas[table[0]] = create_prompt
+         if num_rows:
+             cur_table = table[0]
+             if cur_table in ['order', 'by', 'group']:
+                 cur_table = "`{}`".format(cur_table)
+
+             cursor.execute("SELECT * FROM {} LIMIT {}".format(cur_table, num_rows))
+             column_names = [description[0] for description in cursor.description]
+             values = cursor.fetchall()
+             rows_prompt = nice_look_table(column_names=column_names, values=values)
+             verbose_prompt = "/* \n {} example rows: \n SELECT * FROM {} LIMIT {}; \n {} \n */".format(num_rows, cur_table, num_rows, rows_prompt)
+             schemas[table[0]] = "{} \n {}".format(create_prompt, verbose_prompt)
+
+     for k, v in schemas.items():
+         full_schema_prompt_list.append(v)
+
+     schema_prompt = "\n\n".join(full_schema_prompt_list)
+
+     return schema_prompt
+
+ def generate_comment_prompt(question, knowledge=None):
+     pattern_prompt_no_kg = "-- Using valid SQLite, answer the following questions for the tables provided above."
+     pattern_prompt_kg = "-- Using valid SQLite and understading External Knowledge, answer the following questions for the tables provided above."
+     # question_prompt = "-- {}".format(question) + '\n SELECT '
+     question_prompt = "-- {}".format(question)
+     knowledge_prompt = "-- External Knowledge: {}".format(knowledge)
+
+     if not knowledge_prompt:
+         result_prompt = pattern_prompt_no_kg + '\n' + question_prompt
+     else:
+         result_prompt = knowledge_prompt + '\n' + pattern_prompt_kg + '\n' + question_prompt
+
+     return result_prompt
+
+ def cot_wizard():
+     cot = "\nGenerate the SQL after thinking step by step: "
+
+     return cot
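
A quick usage sketch of the helper (toy.sqlite is a hypothetical local file, not part of this release). Note that generate_comment_prompt's `if not knowledge_prompt` branch is dead code preserved from the verbatim upstream copy: knowledge_prompt is always a non-empty string, so the external-knowledge wording is used even when knowledge is None.

import sqlite3
from helm.benchmark.scenarios.bird_sql_scenario_helper import (
    generate_comment_prompt,
    generate_schema_prompt,
)

# Build a toy database to run the helpers against.
conn = sqlite3.connect("toy.sqlite")
conn.execute("CREATE TABLE IF NOT EXISTS users (id INTEGER PRIMARY KEY, name TEXT)")
conn.commit()
conn.close()

schema = generate_schema_prompt("toy.sqlite")  # one CREATE TABLE statement per table
prompt = schema + "\n\n" + generate_comment_prompt("How many users are there?")
print(prompt)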
helm/benchmark/scenarios/blimp_scenario.py
@@ -4,7 +4,7 @@ from typing import List

  from helm.common.general import ensure_file_downloaded
  from helm.common.hierarchical_logger import hlog
- from .scenario import CORRECT_TAG, Reference, Scenario, Instance, Input, TEST_SPLIT, Output
+ from helm.benchmark.scenarios.scenario import CORRECT_TAG, Reference, Scenario, Instance, Input, TEST_SPLIT, Output


  class BLiMPScenario(Scenario):
helm/benchmark/scenarios/bold_scenario.py
@@ -4,7 +4,7 @@ import random
  from typing import List, Dict, Tuple

  from helm.common.general import ensure_file_downloaded
- from .scenario import (
+ from helm.benchmark.scenarios.scenario import (
      Scenario,
      Instance,
      TRAIN_SPLIT,
helm/benchmark/scenarios/boolq_scenario.py
@@ -3,7 +3,7 @@ import os
  from typing import List, Dict, Optional

  from helm.common.general import ensure_file_downloaded, ensure_directory_exists
- from .scenario import (
+ from helm.benchmark.scenarios.scenario import (
      Scenario,
      Instance,
      Reference,
helm/benchmark/scenarios/casehold_scenario.py
@@ -0,0 +1,79 @@
+ from typing import List
+ import os
+ import os.path
+
+ from datasets import load_dataset, DatasetDict
+
+ from helm.common.general import ensure_directory_exists
+ from helm.benchmark.scenarios.scenario import (
+     Input,
+     Scenario,
+     Instance,
+     Reference,
+     TRAIN_SPLIT,
+     TEST_SPLIT,
+     CORRECT_TAG,
+     Output,
+ )
+
+
+ class CaseHOLDScenario(Scenario):
+     """
+     CaseHOLD QA
+     CaseHOLD is a multiple choice question answering task derived from legal citations in judicial rulings.
+     CaseHOLD consists of ~53,000 questions, mined from the Harvard Law Library case law corpus.
+
+     Dataset repository:
+     https://huggingface.co/datasets/casehold/casehold
+
+     Publication:
+     "When Does Pretraining Help? Assessing Self-Supervised Learning for Law and the CaseHOLD Dataset"
+     ICAIL, 2021
+     https://reglab.stanford.edu/data/casehold-benchmark/
+     https://arxiv.org/abs/2104.08671
+
+     Data content:
+     The citing context from the judicial decision serves as the prompt for the question.
+     The answer choices are holding statements derived from citations following text in a legal decision.
+     There are five answer choices for each citing text.
+     The correct answer is the holding statement that corresponds to the citing text.
+     The four incorrect answers are other holding statements.
+     """  # noqa: E501
+
+     name = "casehold"
+     description = "CaseHOLD (Case Holdings On Legal Decisions) is a multiple choice question answering scenario where the task is to identify the relevant holding of a cited case [(Zheng et al, 2021)](https://arxiv.org/pdf/2104.08671.pdf)."  # noqa: E501
+     tags = ["question_answering", "legal"]
+
+     # Note: Skip the validation split since we don't need it
+     HELM_SPLIT_NAME_TO_DATASETS_SPLIT_NAME = {TRAIN_SPLIT: "train", TEST_SPLIT: "test"}
+     NUM_REFERENCES = 5
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         data_path: str = os.path.join(output_path, "data")
+         ensure_directory_exists(data_path)
+         dataset: DatasetDict = load_dataset(
+             "casehold/casehold",
+             "all",
+             cache_dir=data_path,
+         )
+
+         instances: List[Instance] = []
+         for helm_split_name, datasets_split_name in self.HELM_SPLIT_NAME_TO_DATASETS_SPLIT_NAME.items():
+             split_data = dataset[datasets_split_name]
+             for example in split_data:
+                 example_id = example["example_id"]
+                 citing_prompt = example["citing_prompt"]
+                 holdings = [example[f"holding_{i}"] for i in range(self.NUM_REFERENCES)]
+                 correct_label: str = example["label"]
+                 references = [
+                     Reference(Output(text=holdings[i]), tags=([CORRECT_TAG] if correct_label == str(i) else []))
+                     for i in range(self.NUM_REFERENCES)
+                 ]
+                 instance = Instance(
+                     input=Input(text=citing_prompt),
+                     references=references,
+                     split=helm_split_name,
+                     id=f"id{example_id}",
+                 )
+                 instances.append(instance)
+         return instances
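
Each CaseHOLD instance carries five references, exactly one tagged CORRECT_TAG. Under a joint multiple-choice adaptation these would typically be rendered as lettered options; a rough sketch of that rendering (HELM's actual adapter formatting may differ):

from typing import List

def render_mc_prompt(citing_prompt: str, holdings: List[str]) -> str:
    # Letter the five holding statements A-E and ask for a single letter in response.
    letters = "ABCDE"
    option_lines = [f"{letters[i]}. {holding}" for i, holding in enumerate(holdings)]
    return "\n".join([citing_prompt, ""] + option_lines + ["Answer:"])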