crfm-helm 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic; see the package registry's advisory page for more details.

Files changed (652)
  1. crfm_helm-0.5.6.dist-info/METADATA +427 -0
  2. crfm_helm-0.5.6.dist-info/RECORD +941 -0
  3. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +4 -4
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  22. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  23. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  24. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  25. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  26. helm/benchmark/adaptation/prompt.py +1 -1
  27. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  28. helm/benchmark/annotation/air_bench_annotator.py +21 -6
  29. helm/benchmark/annotation/annotator.py +5 -0
  30. helm/benchmark/annotation/annotator_factory.py +3 -20
  31. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  32. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  33. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  34. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  35. helm/benchmark/annotation/chw_care_plan_annotator.py +93 -0
  36. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  37. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  38. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  39. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  40. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  41. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  42. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  43. helm/benchmark/annotation/medalign_annotator.py +100 -0
  44. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  45. helm/benchmark/annotation/medication_qa_annotator.py +87 -63
  46. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  47. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  48. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  49. helm/benchmark/annotation/model_as_judge.py +214 -6
  50. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  51. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  52. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  53. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  54. helm/benchmark/annotation/omni_math_annotator.py +131 -0
  55. helm/benchmark/annotation/spider_annotator.py +18 -0
  56. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  57. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  58. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  59. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  60. helm/benchmark/annotation_executor.py +35 -15
  61. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  62. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  63. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  64. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  65. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  66. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  67. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  68. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  69. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  70. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  71. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  72. helm/benchmark/augmentations/perturbation.py +1 -1
  73. helm/benchmark/augmentations/space_perturbation.py +2 -2
  74. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  75. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  76. helm/benchmark/augmentations/test_perturbation.py +16 -13
  77. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  78. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  79. helm/benchmark/data_preprocessor.py +2 -2
  80. helm/benchmark/executor.py +11 -12
  81. helm/benchmark/huggingface_registration.py +2 -7
  82. helm/benchmark/metrics/aci_bench_metrics.py +14 -0
  83. helm/benchmark/metrics/basic_metrics.py +6 -6
  84. helm/benchmark/metrics/bbq_metrics.py +2 -2
  85. helm/benchmark/metrics/bias_metrics.py +12 -3
  86. helm/benchmark/metrics/bias_word_lists.py +1 -1
  87. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  88. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  89. helm/benchmark/metrics/chw_care_plan_metrics.py +14 -0
  90. helm/benchmark/metrics/classification_metrics.py +76 -12
  91. helm/benchmark/metrics/cleva_harms_metrics.py +10 -9
  92. helm/benchmark/metrics/code_metrics.py +5 -5
  93. helm/benchmark/metrics/comet_metric.py +125 -0
  94. helm/benchmark/metrics/common_metric_specs.py +9 -2
  95. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  96. helm/benchmark/metrics/copyright_metrics.py +4 -4
  97. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  98. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  99. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  100. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  101. helm/benchmark/metrics/dischargeme_metrics.py +14 -0
  102. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  103. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  104. helm/benchmark/metrics/efficiency_metrics.py +6 -6
  105. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  106. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  107. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  108. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  109. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  110. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  111. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  112. helm/benchmark/metrics/ifeval/__init__.py +0 -0
  113. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  114. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  115. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  116. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  117. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  118. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  120. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  121. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  122. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  123. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  124. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  125. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  126. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  127. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  128. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  129. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  130. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  131. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  132. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  133. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  134. helm/benchmark/metrics/med_dialog_metrics.py +14 -0
  135. helm/benchmark/metrics/medalign_metrics.py +14 -0
  136. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  137. helm/benchmark/metrics/medec_metrics.py +101 -0
  138. helm/benchmark/metrics/medi_qa_metrics.py +14 -0
  139. helm/benchmark/metrics/medication_qa_metrics.py +10 -19
  140. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  141. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  142. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  143. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  144. helm/benchmark/metrics/mental_health_metrics.py +14 -0
  145. helm/benchmark/metrics/metric.py +3 -3
  146. helm/benchmark/metrics/metric_service.py +11 -11
  147. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  148. helm/benchmark/metrics/mimic_rrs_metrics.py +14 -0
  149. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  150. helm/benchmark/metrics/mtsamples_procedures_metrics.py +14 -0
  151. helm/benchmark/metrics/mtsamples_replicate_metrics.py +14 -0
  152. helm/benchmark/metrics/nltk_helper.py +32 -0
  153. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  154. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  155. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  156. helm/benchmark/metrics/output_processing_metric.py +60 -0
  157. helm/benchmark/metrics/output_processors.py +15 -0
  158. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  159. helm/benchmark/metrics/ranking_metrics.py +3 -3
  160. helm/benchmark/metrics/reference_metric.py +3 -3
  161. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  162. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  163. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  164. helm/benchmark/metrics/spider_metrics.py +7 -0
  165. helm/benchmark/metrics/starr_patient_instructions_metrics.py +14 -0
  166. helm/benchmark/metrics/statistic.py +1 -1
  167. helm/benchmark/metrics/summac/model_summac.py +2 -3
  168. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  169. helm/benchmark/metrics/summarization_metrics.py +20 -9
  170. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  171. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  172. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  173. helm/benchmark/metrics/test_metric.py +1 -1
  174. helm/benchmark/metrics/test_statistic.py +2 -2
  175. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  176. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  177. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  178. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  179. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  180. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  181. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  182. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +3 -3
  183. helm/benchmark/metrics/toxicity_metrics.py +6 -6
  184. helm/benchmark/metrics/unitxt_metrics.py +7 -5
  185. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  186. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  187. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  188. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  189. helm/benchmark/model_deployment_registry.py +6 -8
  190. helm/benchmark/model_metadata_registry.py +16 -0
  191. helm/benchmark/presentation/contamination.py +3 -3
  192. helm/benchmark/presentation/create_plots.py +33 -12
  193. helm/benchmark/presentation/run_display.py +13 -0
  194. helm/benchmark/presentation/schema.py +2 -1
  195. helm/benchmark/presentation/summarize.py +97 -67
  196. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  197. helm/benchmark/reeval_run.py +202 -0
  198. helm/benchmark/reeval_runner.py +355 -0
  199. helm/benchmark/run.py +86 -90
  200. helm/benchmark/run_expander.py +90 -9
  201. helm/benchmark/run_spec_factory.py +13 -0
  202. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  203. helm/benchmark/run_specs/audio_run_specs.py +657 -0
  204. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  205. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  206. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  207. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  208. helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
  209. helm/benchmark/run_specs/experimental_run_specs.py +142 -3
  210. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  211. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  212. helm/benchmark/run_specs/long_context_run_specs.py +141 -0
  213. helm/benchmark/run_specs/medhelm_run_specs.py +1260 -0
  214. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  215. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  216. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  217. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  218. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
  219. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
  220. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  221. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  222. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  223. helm/benchmark/run_specs/vlm_run_specs.py +103 -2
  224. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  225. helm/benchmark/runner.py +5 -5
  226. helm/benchmark/scenarios/aci_bench_scenario.py +126 -0
  227. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  228. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  229. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  230. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  231. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
  232. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  233. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  234. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  235. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  236. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  237. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  238. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  239. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  240. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  241. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  242. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  243. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  244. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  245. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  246. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  247. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  248. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  249. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  250. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  251. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  252. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
  253. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
  254. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  255. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
  256. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
  257. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  258. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
  259. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  260. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  261. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  262. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  263. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  264. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  265. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  266. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  267. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  268. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  269. helm/benchmark/scenarios/bold_scenario.py +1 -1
  270. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  271. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  272. helm/benchmark/scenarios/chw_care_plan_scenario.py +106 -0
  273. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  274. helm/benchmark/scenarios/clear_scenario.py +157 -0
  275. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  276. helm/benchmark/scenarios/code_scenario.py +17 -4
  277. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  278. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  279. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  280. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  281. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  282. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  283. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  284. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  285. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  286. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  287. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  288. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  289. helm/benchmark/scenarios/dischargeme_scenario.py +172 -0
  290. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  291. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  292. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  293. helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
  294. helm/benchmark/scenarios/ehrshot_scenario.py +1519 -0
  295. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  296. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  297. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  298. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  299. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  300. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  301. helm/benchmark/scenarios/grammar.py +2 -2
  302. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  303. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  304. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  305. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  306. helm/benchmark/scenarios/headqa_scenario.py +136 -0
  307. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  308. helm/benchmark/scenarios/ice_scenario.py +8 -4
  309. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  310. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  311. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  312. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  313. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +79 -0
  314. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  315. helm/benchmark/scenarios/koala_scenario.py +1 -1
  316. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  317. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  318. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  319. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  320. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  321. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  322. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  323. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  324. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  325. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  326. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  327. helm/benchmark/scenarios/math_scenario.py +9 -1
  328. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  329. helm/benchmark/scenarios/med_dialog_scenario.py +25 -22
  330. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  331. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  332. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  333. helm/benchmark/scenarios/medalign_scenario.py +94 -0
  334. helm/benchmark/scenarios/medalign_scenario_helper.py +432 -0
  335. helm/benchmark/scenarios/medbullets_scenario.py +145 -0
  336. helm/benchmark/scenarios/medcalc_bench_scenario.py +127 -0
  337. helm/benchmark/scenarios/medec_scenario.py +125 -0
  338. helm/benchmark/scenarios/medhallu_scenario.py +72 -0
  339. helm/benchmark/scenarios/medi_qa_scenario.py +111 -0
  340. helm/benchmark/scenarios/medication_qa_scenario.py +8 -2
  341. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  342. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  343. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  344. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  345. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  346. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  347. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  348. helm/benchmark/scenarios/mental_health_scenario.py +123 -0
  349. helm/benchmark/scenarios/mimic_bhc_scenario.py +103 -0
  350. helm/benchmark/scenarios/mimic_rrs_scenario.py +98 -0
  351. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +77 -0
  352. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  353. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  354. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  355. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  356. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +144 -0
  357. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +142 -0
  358. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +277 -0
  359. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  360. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  361. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  362. helm/benchmark/scenarios/numeracy_scenario.py +12 -2
  363. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  364. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  365. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  366. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  367. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  368. helm/benchmark/scenarios/pubmed_qa_scenario.py +59 -43
  369. helm/benchmark/scenarios/quac_scenario.py +10 -1
  370. helm/benchmark/scenarios/race_based_med_scenario.py +152 -0
  371. helm/benchmark/scenarios/raft_scenario.py +17 -2
  372. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  373. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  374. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  375. helm/benchmark/scenarios/scenario.py +9 -1
  376. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
  377. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  378. helm/benchmark/scenarios/shc_bmt_scenario.py +75 -0
  379. helm/benchmark/scenarios/shc_cdi_scenario.py +75 -0
  380. helm/benchmark/scenarios/shc_conf_scenario.py +76 -0
  381. helm/benchmark/scenarios/shc_ent_scenario.py +77 -0
  382. helm/benchmark/scenarios/shc_gip_scenario.py +74 -0
  383. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  384. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  385. helm/benchmark/scenarios/shc_ptbm_scenario.py +81 -0
  386. helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
  387. helm/benchmark/scenarios/shc_sequoia_scenario.py +77 -0
  388. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  389. helm/benchmark/scenarios/spider_scenario.py +91 -0
  390. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +97 -0
  391. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  392. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  393. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  394. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  395. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  396. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  397. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  398. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  399. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  400. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  401. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  402. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  403. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  404. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  405. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  406. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  407. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  408. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  409. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  410. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  411. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  412. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  413. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  414. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  415. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  416. helm/benchmark/scenarios/truthful_qa_scenario.py +12 -2
  417. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  418. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  419. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  420. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  421. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  422. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  423. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  424. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  425. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  426. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  427. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  428. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  429. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  430. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  431. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  432. helm/benchmark/server.py +13 -1
  433. helm/benchmark/slurm_runner.py +1 -1
  434. helm/benchmark/static/schema_audio.yaml +763 -0
  435. helm/benchmark/static/schema_autobencher.yaml +150 -0
  436. helm/benchmark/static/schema_call_center.yaml +97 -60
  437. helm/benchmark/static/{schema_medical.yaml → schema_capabilities.yaml} +100 -101
  438. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  439. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  440. helm/benchmark/static/schema_enterprise.yaml +319 -0
  441. helm/benchmark/static/schema_finance.yaml +14 -12
  442. helm/benchmark/static/schema_heim.yaml +1389 -0
  443. helm/benchmark/static/schema_long_context.yaml +283 -0
  444. helm/benchmark/static/schema_medhelm.yaml +1140 -0
  445. helm/benchmark/static/schema_melt.yaml +1257 -0
  446. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  447. helm/benchmark/static/schema_safety.yaml +18 -1
  448. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
  449. helm/benchmark/static/schema_slphelm.yaml +162 -0
  450. helm/benchmark/static/schema_social_audio.yaml +224 -0
  451. helm/benchmark/static/schema_sql.yaml +171 -0
  452. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
  453. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  454. helm/benchmark/static/schema_vhelm.yaml +129 -56
  455. helm/benchmark/static/schema_video.yaml +219 -0
  456. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  457. helm/benchmark/static_build/assets/index-94295e78.js +10 -0
  458. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  459. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  460. helm/benchmark/static_build/assets/medhelm-v1-overview-3ddfcd65.png +0 -0
  461. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  462. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  463. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-38a10867.js} +2 -2
  464. helm/benchmark/static_build/config.js +1 -1
  465. helm/benchmark/static_build/index.html +6 -6
  466. helm/benchmark/window_services/default_window_service.py +1 -1
  467. helm/benchmark/window_services/encoder_decoder_window_service.py +4 -4
  468. helm/benchmark/window_services/ice_window_service.py +1 -1
  469. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  470. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  471. helm/benchmark/window_services/local_window_service.py +2 -2
  472. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  473. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  474. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  475. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  476. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  477. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  478. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  479. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  480. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  481. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  482. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  483. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  484. helm/benchmark/window_services/test_utils.py +4 -5
  485. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  486. helm/benchmark/window_services/tokenizer_service.py +7 -8
  487. helm/benchmark/window_services/yalm_window_service.py +1 -1
  488. helm/clients/ai21_client.py +3 -3
  489. helm/clients/aleph_alpha_client.py +1 -1
  490. helm/clients/anthropic_client.py +69 -29
  491. helm/clients/audio_language/__init__.py +0 -0
  492. helm/clients/audio_language/diva_llama_client.py +120 -0
  493. helm/clients/audio_language/llama_omni_client.py +198 -0
  494. helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
  495. helm/clients/audio_language/qwen2_audiolm_client.py +190 -0
  496. helm/clients/audio_language/qwen_audiolm_client.py +152 -0
  497. helm/clients/audio_language/test.py +62 -0
  498. helm/clients/auto_client.py +4 -2
  499. helm/clients/azure_openai_client.py +55 -0
  500. helm/clients/bedrock_client.py +203 -7
  501. helm/clients/bedrock_utils.py +33 -0
  502. helm/clients/client.py +7 -7
  503. helm/clients/clip_scorers/clip_scorer.py +1 -1
  504. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  505. helm/clients/cohere_client.py +3 -3
  506. helm/clients/google_client.py +1 -1
  507. helm/clients/grok_client.py +36 -0
  508. helm/clients/http_model_client.py +1 -1
  509. helm/clients/huggingface_client.py +52 -21
  510. helm/clients/huggingface_pipeline_client.py +138 -0
  511. helm/clients/ibm_client.py +267 -0
  512. helm/clients/image_generation/adobe_vision_client.py +1 -1
  513. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  514. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  515. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  516. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  517. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  518. helm/clients/image_generation/cogview2_client.py +1 -1
  519. helm/clients/image_generation/dalle2_client.py +1 -1
  520. helm/clients/image_generation/dalle3_client.py +2 -2
  521. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  522. helm/clients/image_generation/dalle_mini/data.py +1 -1
  523. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  524. helm/clients/image_generation/dalle_mini/model/configuration.py +2 -2
  525. helm/clients/image_generation/dalle_mini/model/modeling.py +3 -3
  526. helm/clients/image_generation/dalle_mini/model/processor.py +5 -5
  527. helm/clients/image_generation/dalle_mini/model/tokenizer.py +2 -2
  528. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  529. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  530. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  531. helm/clients/image_generation/dalle_mini_client.py +1 -1
  532. helm/clients/image_generation/deep_floyd_client.py +1 -1
  533. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  534. helm/clients/image_generation/lexica_client.py +1 -1
  535. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  536. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  537. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  538. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  539. helm/clients/image_generation/mindalle_client.py +1 -1
  540. helm/clients/image_generation/together_image_generation_client.py +1 -1
  541. helm/clients/lit_gpt_client.py +2 -2
  542. helm/clients/mistral_client.py +62 -18
  543. helm/clients/nvidia_nim_client.py +0 -3
  544. helm/clients/openai_client.py +308 -43
  545. helm/clients/openai_responses_client.py +174 -0
  546. helm/clients/palmyra_client.py +3 -9
  547. helm/clients/reka_client.py +3 -3
  548. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  549. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  550. helm/clients/stanfordhealthcare_google_client.py +43 -0
  551. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  552. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  553. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  554. helm/clients/test_client.py +1 -1
  555. helm/clients/test_together_client.py +6 -1
  556. helm/clients/together_client.py +76 -9
  557. helm/clients/upstage_client.py +23 -0
  558. helm/clients/vertexai_client.py +45 -13
  559. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  560. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  561. helm/clients/vision_language/idefics_client.py +6 -2
  562. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  563. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  564. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  565. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  566. helm/clients/vision_language/paligemma_client.py +2 -2
  567. helm/clients/vision_language/qwen2_vlm_client.py +188 -0
  568. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  569. helm/clients/vllm_client.py +4 -6
  570. helm/clients/writer_client.py +102 -0
  571. helm/clients/yi_client.py +0 -3
  572. helm/common/audio_utils.py +111 -0
  573. helm/common/context.py +80 -0
  574. helm/common/credentials_utils.py +5 -5
  575. helm/common/file_caches/local_file_cache.py +1 -1
  576. helm/common/file_caches/test_local_file_cache.py +1 -1
  577. helm/common/general.py +9 -2
  578. helm/common/hierarchical_logger.py +46 -3
  579. helm/common/images_utils.py +2 -2
  580. helm/common/local_context.py +140 -0
  581. helm/common/media_object.py +2 -2
  582. helm/common/multimodal_request_utils.py +26 -0
  583. helm/common/reeval_parameters.py +12 -0
  584. helm/common/remote_context.py +61 -0
  585. helm/common/request.py +14 -2
  586. helm/common/response_format.py +18 -0
  587. helm/common/test_media_object.py +1 -1
  588. helm/config/model_deployments.yaml +1792 -28
  589. helm/config/model_metadata.yaml +1606 -51
  590. helm/config/tokenizer_configs.yaml +521 -4
  591. helm/proxy/cli.py +5 -3
  592. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  593. helm/proxy/example_queries.py +1 -1
  594. helm/proxy/server.py +11 -4
  595. helm/proxy/services/remote_service.py +1 -1
  596. helm/proxy/services/server_service.py +22 -86
  597. helm/proxy/services/test_remote_service.py +2 -2
  598. helm/proxy/services/test_service.py +1 -1
  599. helm/proxy/static/general.js +122 -0
  600. helm/proxy/static/help.html +99 -0
  601. helm/proxy/static/index.css +57 -0
  602. helm/proxy/static/index.html +40 -0
  603. helm/proxy/static/index.js +456 -0
  604. helm/proxy/static/info-icon.png +0 -0
  605. helm/proxy/test_retry.py +1 -1
  606. helm/proxy/token_counters/auto_token_counter.py +1 -1
  607. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  608. helm/tokenizers/caching_tokenizer.py +2 -30
  609. helm/tokenizers/grok_tokenizer.py +53 -0
  610. helm/tokenizers/http_model_tokenizer.py +1 -1
  611. helm/tokenizers/huggingface_tokenizer.py +3 -3
  612. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  613. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  614. helm/tokenizers/test_grok_tokenizer.py +33 -0
  615. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  616. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  617. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  618. helm/tokenizers/tokenizer.py +3 -1
  619. helm/tokenizers/yalm_tokenizer.py +3 -3
  620. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  621. crfm_helm-0.5.4.dist-info/METADATA +0 -350
  622. crfm_helm-0.5.4.dist-info/RECORD +0 -697
  623. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  624. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  625. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  626. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  627. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  628. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  629. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  630. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  631. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  632. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  633. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  634. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  635. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  636. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  637. helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
  638. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  639. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  640. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  641. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  642. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  643. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  644. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  645. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  646. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  647. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  648. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  649. helm/tokenizers/anthropic_tokenizer.py +0 -52
  650. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
  651. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info/licenses}/LICENSE +0 -0
  652. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
import csv
import os
from typing import List

from helm.benchmark.scenarios.scenario import (
    Scenario,
    Instance,
    TEST_SPLIT,
    Input,
)


# Fixed typo: "DIRRECTORY" -> "DIRECTORY". Path is hard-coded because the data is
# restricted and must be placed here manually (it cannot be downloaded).
_DATA_DIRECTORY_PATH = "restricted/helpdesk_call_summarization/HELM Sample Transcripts_20241221_0045"


class HelpdeskCallSummarizationScenario(Scenario):
    """Helpdesk call summarization."""

    name = "helpdesk_call_summarization"
    description = "Helpdesk call summarization."
    tags = ["helpdesk_call_center"]

    def get_instances(self, output_path: str) -> List[Instance]:
        """Build one TEST instance per "Call1-*.csv" transcript in the data directory.

        Each CSV row is assumed to hold the speaker in column 0 and the utterance in
        column 4 -- TODO confirm against the transcript format; rows are joined into
        a single newline-separated prompt. Files not matching "Call1-*.csv" are skipped.
        """
        instances: List[Instance] = []
        for file_name in os.listdir(_DATA_DIRECTORY_PATH):
            if not file_name.endswith(".csv") or not file_name.startswith("Call1-"):
                continue
            file_path = os.path.join(_DATA_DIRECTORY_PATH, file_name)
            with open(file_path) as f:
                csv_reader = csv.reader(f)
                prompt_lines = [f"{row[0]}: {row[4]}" for row in csv_reader]
            prompt = "\n".join(prompt_lines)
            # e.g. "Call1-XYZ.csv" -> instance id "XYZ"
            instance_id = file_name.removeprefix("Call1-").removesuffix(".csv")
            instance = Instance(id=instance_id, input=Input(text=prompt), references=[], split=TEST_SPLIT)
            instances.append(instance)
        return instances
@@ -5,8 +5,8 @@ from enum import Enum
5
5
  import pandas as pd
6
6
 
7
7
  from helm.common.optional_dependencies import handle_module_not_found_error
8
- from .ice_scenario_pinned_file_order import listdir_with_pinned_file_order
9
- from .scenario import Scenario, Instance, TEST_SPLIT, Input
8
+ from helm.benchmark.scenarios.ice_scenario_pinned_file_order import listdir_with_pinned_file_order
9
+ from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input
10
10
 
11
11
  try:
12
12
  # pd.read_excel() uses xlrd
@@ -114,8 +114,12 @@ class ICEScenario(Scenario):
114
114
  """
115
115
  The International Corpus of English (ICE).
116
116
 
117
- NOTE: This text cannot be downloaded
118
- automatically. You must extract each subset zip file into /benchmark_output/scenarios/ice.
117
+ NOTE: This text cannot be downloaded automatically.
118
+ You must extract each subset zip file into args.output_path + '/scenarios/ice',
119
+ which is by default '/benchmark_output/scenarios/ice',
120
+ where args.output_path is parsed from the command line argument.
121
+ See helm.benchmark.runner for more details about args.output_path.
122
+
119
123
  The archives should extract into folders named according to the dictionary SUBSET_TO_DIRECTORY
120
124
  below.
121
125
 
import datasets
import os
from typing import List
from helm.benchmark.scenarios.scenario import (
    Scenario,
    Instance,
    Input,
    TEST_SPLIT,
)
from helm.common.general import ensure_directory_exists


class IFEvalScenario(Scenario):
    """IFEval

    IFEval contains around 500 "verifiable instructions" such as "write in more than 400 words"
    and "mention the keyword of AI at least 3 times" which can be verified by heuristics."""

    name = "ifeval"
    description = "Instruction-Following Evaluation for Large Language Models"
    tags = ["instruction following"]

    def __init__(self):
        super().__init__()

    def get_instances(self, output_path: str) -> List[Instance]:
        """Download IFEval from HuggingFace (pinned revision) and convert each row to an Instance."""
        cache_dir = os.path.join(output_path, "data")
        ensure_directory_exists(cache_dir)
        dataset = datasets.load_dataset(
            "google/IFEval",
            trust_remote_code=True,
            cache_dir=cache_dir,
            split="train",
            revision="966cd89545d6b6acfd7638bc708b98261ca58e84",
        )
        assert isinstance(dataset, datasets.Dataset)

        # Read all instances. (The previous `for _, row in enumerate(dataset)` discarded
        # the index; iterate directly. `key` avoids shadowing the builtin `id`.)
        instances: List[Instance] = []
        for row in dataset:
            key = row["key"]
            instance = Instance(
                id=f"id{key}",
                input=Input(text=row["prompt"].strip()),
                references=[],
                split=TEST_SPLIT,
                # The IFEval metrics verify these instruction ids with their kwargs.
                extra_data={"instruction_ids": row["instruction_id_list"], "instruction_kwargs": row["kwargs"]},
            )
            instances.append(instance)

        return instances
from typing import Any, List, Dict
from pathlib import Path
from datasets import load_dataset
from helm.common.hierarchical_logger import hlog
from helm.benchmark.scenarios.scenario import (
    Scenario,
    Instance,
    Reference,
    TRAIN_SPLIT,
    TEST_SPLIT,
    CORRECT_TAG,
    Input,
    Output,
)


class IMDB_PTBRScenario(Scenario):
    """
    The IMDB dataset is a widely-used benchmark dataset for natural language processing (NLP)
    particularly for text classification and sentiment analysis.
    This is a translated version that is meant to evaluate PT-BR models.
    It consists of movie reviews from the Internet Movie Database (IMDB) and
    includes both positive and negative sentiments labeled for supervised learning.
    """

    # NOTE(review): "simple_classification" looks like a leftover from a template;
    # confirm whether this should be something like "imdb_ptbr". Left unchanged
    # because run specs may reference it.
    name = "simple_classification"
    description = "Classify movie reviews between positive or negative."
    tags = ["classification"]

    def process_dataset(self, dataset: Any, split: str) -> List[Instance]:
        """Convert one split of the HuggingFace dataset into HELM instances.

        `split` is the HELM split name, which is also used to index `dataset`.
        Labels are mapped to Portuguese class names (0 -> "negativo", 1 -> "positivo").
        """
        instances: List[Instance] = []
        label_names = {0: "negativo", 1: "positivo"}
        for example in dataset[split]:
            input = Input(text=example["text"])
            # NOTE: For classification scenarios, the reference outputs should be the same
            # for all instances, and should include both correct and incorrect classes.
            # HELM only supports single-label classification. Exactly one reference
            # should have the CORRECT_TAG tag.
            references = [
                Reference(Output(text=label_names[example["label"]]), tags=[CORRECT_TAG]),
            ]
            instance = Instance(input=input, references=references, split=split)
            instances.append(instance)
        return instances

    def get_instances(self, output_path: str) -> List[Instance]:
        """Download maritaca-ai/imdb_pt and return train + test instances."""
        instances: List[Instance] = []
        cache_dir = str(Path(output_path) / "data")
        dataset = load_dataset("maritaca-ai/imdb_pt", cache_dir=cache_dir)
        splits: Dict[str, str] = {
            "train": TRAIN_SPLIT,
            "test": TEST_SPLIT,
        }
        for split in splits:
            # Bug fix: this previously checked `split not in splits.keys()`, which is
            # always False (every key of a dict is in its own keys), so missing dataset
            # splits would raise a KeyError instead of being skipped. Check the
            # downloaded dataset instead, as the log message intends.
            if split not in dataset:
                hlog(f"{split} split doesn't exist, skipping")
                continue
            instances.extend(self.process_dataset(dataset, splits[split]))

        return instances
@@ -2,8 +2,17 @@ import os
2
2
  from typing import List, Dict, Optional
3
3
 
4
4
  from helm.common.general import ensure_file_downloaded
5
- from .scenario import Scenario, Instance, Reference, CORRECT_TAG, TRAIN_SPLIT, VALID_SPLIT, Input, Output
6
- from .imdb_scenario_pinned_file_order import listdir_with_pinned_file_order
5
+ from helm.benchmark.scenarios.scenario import (
6
+ Scenario,
7
+ Instance,
8
+ Reference,
9
+ CORRECT_TAG,
10
+ TRAIN_SPLIT,
11
+ VALID_SPLIT,
12
+ Input,
13
+ Output,
14
+ )
15
+ from helm.benchmark.scenarios.imdb_scenario_pinned_file_order import listdir_with_pinned_file_order
7
16
 
8
17
 
9
18
  class IMDBScenario(Scenario):
import os
import re
from typing import List

from datasets import load_dataset, Features, Value, Sequence, Dataset

from helm.benchmark.scenarios.scenario import (
    Scenario,
    Instance,
    Input,
    Reference,
    Output,
    CORRECT_TAG,
    TEST_SPLIT,
)
from helm.common.general import ensure_directory_exists


class InfiniteBenchEnQAScenario(Scenario):
    """InfiniteBench En.QA

    InfiniteBench is a benchmark tailored for evaluating the capabilities of language models to process,
    understand, and reason over long contexts (100k+ tokens). InfiniteBench En.QA is a subset of
    InfiniteBench that requires models to perform open-form question answering on questions that necessitate
    long-range dependency and reasoning, beyond simple short passage retrieval.
    """

    name = "infinite_bench_en_qa"
    # Fixed copy-paste error: the description previously described the En.Sum summarization task.
    description = "∞Bench En.QA is an open-form question answering task that requires long-range dependency and reasoning over a novel. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))"  # noqa: E501
    tags = ["question_answering"]

    def __init__(self, max_num_words: int):
        # Examples whose total word count (context + question + options) exceeds
        # this limit are filtered out.
        self.max_num_words = max_num_words
        super().__init__()

    def get_instances(self, output_path: str) -> List[Instance]:
        """Load the pinned En.QA split from HuggingFace and convert rows to instances."""
        cache_dir = os.path.join(output_path, "data")
        ensure_directory_exists(cache_dir)

        # Explicit schema because the hosted dataset does not declare features.
        ft = Features(
            {
                "id": Value("int64"),
                "context": Value("string"),
                "input": Value("string"),
                "answer": Sequence(Value("string")),
                "options": Sequence(Value("string")),
            }
        )

        # Load the dataset with the specified features. Fix: `cache_dir` was created
        # but never passed through, so downloads ignored it (IFEvalScenario in this
        # file passes it; made consistent).
        dataset = load_dataset(
            "xinrongzhang2022/InfiniteBench",
            split="longbook_qa_eng",
            features=ft,
            cache_dir=cache_dir,
            revision="90f0394333616266d9fe85824ceaf505093cbaa5",
        )

        assert isinstance(dataset, Dataset)

        def count_words(text: str) -> int:
            return len(re.split(r"\s+", text.strip()))

        # Drop examples that would exceed the word budget.
        dataset = dataset.filter(
            lambda example: count_words(example["context"])
            + count_words(example["input"])
            + sum(count_words(option) for option in example["options"])
            <= self.max_num_words
        )

        # Read all instances; the first answer string is the single correct reference.
        instances: List[Instance] = []
        for row in dataset:
            instance = Instance(
                id=row["id"],
                input=Input(text=row["context"] + "\n\n" + row["input"]),
                references=[Reference(Output(text=row["answer"][0]), tags=[CORRECT_TAG])],
                split=TEST_SPLIT,
            )
            instances.append(instance)

        return instances
import os
import re
from typing import List
from datasets import load_dataset, Features, Value, Sequence, Dataset
from helm.benchmark.scenarios.scenario import (
    Scenario,
    Instance,
    Input,
    Reference,
    Output,
    CORRECT_TAG,
    TEST_SPLIT,
)
from helm.common.general import ensure_directory_exists


class InfiniteBenchEnSumScenario(Scenario):
    """InfiniteBench En.Sum

    InfiniteBench is a benchmark tailored for evaluating the capabilities of language models to process,
    understand, and reason over super long contexts (100k+ tokens). InfiniteBench En.Sum is a subset of
    InfiniteBench that requires models to generate a concise summary of the novel.
    """

    name = "infinite_bench_en_sum"
    description = "∞Bench En.Sum is a summarization task that requires generating a concise summary of a novel. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))"  # noqa: E501
    tags = ["summarization"]

    def __init__(self, max_num_words: int):
        # Examples longer than this bound (in whitespace-separated words) are dropped.
        self.max_num_words = max_num_words
        super().__init__()

    def get_instances(self, output_path: str) -> List[Instance]:
        """Load the pinned En.Sum split from HuggingFace and convert rows to test instances."""
        data_cache = os.path.join(output_path, "data")
        ensure_directory_exists(data_cache)

        # The hosted dataset carries no feature declaration, so spell out the schema.
        schema = Features(
            {
                "id": Value("int64"),
                "context": Value("string"),
                "input": Value("string"),
                "answer": Sequence(Value("string")),
                "options": Sequence(Value("string")),
            }
        )

        dataset = load_dataset(
            "xinrongzhang2022/InfiniteBench",
            split="longbook_sum_eng",
            features=schema,
            revision="90f0394333616266d9fe85824ceaf505093cbaa5",
        )

        assert isinstance(dataset, Dataset)

        def num_words(text: str) -> int:
            # Whitespace-delimited token count (same rule as elsewhere in this module).
            return len(re.split(r"\s+", text.strip()))

        def within_budget(example) -> bool:
            return num_words(example["context"]) + num_words(example["input"]) <= self.max_num_words

        dataset = dataset.filter(within_budget)

        # One test instance per row; the first answer string is the reference summary.
        return [
            Instance(
                id=row["id"],
                input=Input(text=row["context"] + "\n\n" + row["input"]),
                references=[Reference(Output(text=row["answer"][0]), tags=[CORRECT_TAG])],
                split=TEST_SPLIT,
            )
            for row in dataset
        ]
@@ -2,8 +2,8 @@ import os
2
2
  from typing import List
3
3
 
4
4
  from helm.common.general import ensure_file_downloaded
5
- from .scenario import Instance, TRAIN_SPLIT, TEST_SPLIT
6
- from .mmlu_scenario import MMLUScenario
5
+ from helm.benchmark.scenarios.scenario import Instance, TRAIN_SPLIT, TEST_SPLIT
6
+ from helm.benchmark.scenarios.mmlu_scenario import MMLUScenario
7
7
 
8
8
 
9
9
  class InteractiveQAMMLUScenario(MMLUScenario):
@@ -3,7 +3,7 @@ import os
3
3
  from typing import List
4
4
 
5
5
  from helm.common.general import ensure_file_downloaded
6
- from .scenario import Scenario, Instance, Input, TEST_SPLIT
6
+ from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT
7
7
 
8
8
 
9
9
  class KoalaScenario(Scenario):
import os
from typing import List, Dict
import json
import re

from helm.common.general import ensure_file_downloaded, ensure_directory_exists
from helm.benchmark.scenarios.scenario import (
    Scenario,
    Instance,
    Reference,
    TRAIN_SPLIT,
    VALID_SPLIT,
    TEST_SPLIT,
    CORRECT_TAG,
    Input,
    Output,
)


class KPIEDGARScenario(Scenario):
    """A financial named entity recognition (NER) scenario based on KPI-EDGAR (T. Deußer et al., 2022).

    This scenario has been modified from the paper. The original paper has 12 entity types and requires the model
    to extract pairs of related entities. This scenario only use four named entity types (kpi, cy, py, py1) and only
    requires the model to extract individual entities.

    Paper:
    T. Deußer et al.,
    “KPI-EDGAR: A Novel Dataset and Accompanying Metric for Relation Extraction from Financial Documents.” 2022.
    https://arxiv.org/abs/2210.09163

    Prompt format:

    ```
    Context: {Sentence}
    Task: Extract key performance indicators (KPIs) and values from the above text. Also, specify one of the following categories to each of the extracted KPIs and values in brackets.
    kpi: Key Performance Indicators expressible in numerical and monetary value, cy: Current Year monetary value, py: Prior Year monetary value, py1: Two Year Past Value.
    Answer:
    ```

    Example reference:
    ```
    284 [cy], total unrecognized compensation cost [kpi]
    ```"""  # noqa: E501

    name = "kpi_edgar"
    description = "Named Entity Recognition from financial documents."
    tags = ["named_entity_recognition", "finance"]

    # The four entity types kept from the original 12-type annotation scheme.
    TAG_DICT = {
        "kpi": "Key Performance Indicators expressible in numerical and monetary value",
        "cy": "Current Year monetary value",
        "py": "Prior Year monetary value",
        "py1": "Two Year Past Value",
    }
    TAG_DESCRIPTIONS = ", ".join(f"{key}: {desc}" for key, desc in TAG_DICT.items()) + "."
    # Regex patterns for the tag delimiters, their literal forms, and the
    # round-bracket replacements used to escape them inside text.
    TAG_PAREN_RE = (r"\[", r"\]")
    TAG_PAREN = tuple(pattern.strip("\\") for pattern in TAG_PAREN_RE)
    TAG_PAREN_ESC = ("(", ")")
    DATASET_SPLIT_TO_HELM_SPLIT = {"train": TRAIN_SPLIT, "valid": VALID_SPLIT, "test": TEST_SPLIT}
    JSON_URL = "https://raw.githubusercontent.com/tobideusser/kpi-edgar/2ec7084dcd55b4979bbe288d4aa1e962c685c9ab/data/kpi_edgar.json"  # noqa: E501
    JSON_FILENAME = "kpi_edgar.json"

    @staticmethod
    def get_sentences(dataset: List[Dict]) -> List[Dict]:
        """Flatten documents -> segments -> sentences into a single list."""
        sentences: List[Dict] = []
        for document in dataset:
            for segment in document["segments"]:
                sentences.extend(segment["sentences"] or [])
        return sentences

    @staticmethod
    def escape_parenthesis(text: str) -> str:
        """Replace square brackets with round ones so they cannot be mistaken for tag markers."""
        result = text
        for pattern, replacement in zip(KPIEDGARScenario.TAG_PAREN_RE, KPIEDGARScenario.TAG_PAREN_ESC):
            result = re.sub(pattern, replacement, result)
        return result

    @staticmethod
    def get_output_text(
        words: List[str],
        annotations: List[Dict],
    ) -> str:
        """Render annotated spans as "phrase [tag]" fragments joined by ", "."""
        open_paren, close_paren = KPIEDGARScenario.TAG_PAREN
        fragments: List[str] = []
        for annotation in annotations:
            tag = annotation["type_"]
            if tag not in KPIEDGARScenario.TAG_DICT:
                continue  # ignore entity types outside the four-tag subset
            span_words = words[annotation["start"] : annotation["end"]]
            phrase = KPIEDGARScenario.escape_parenthesis(" ".join(span_words))
            fragments.append(f"{phrase} {open_paren}{tag}{close_paren}")

        return ", ".join(fragments)

    @staticmethod
    def sentences_to_instances(sentences: List[Dict]) -> List[Instance]:
        """Turn annotated sentences into Instances, keyed to their dataset split."""
        instances: List[Instance] = []
        for sentence in sentences:
            dataset_split = sentence["split_type"]
            if dataset_split is None:
                continue  # sentence was not assigned to any split
            helm_split = KPIEDGARScenario.DATASET_SPLIT_TO_HELM_SPLIT[dataset_split]

            words: List[str] = [word_dict["value"] for word_dict in sentence["words"]]
            passage = KPIEDGARScenario.escape_parenthesis(" ".join(words))
            input_text = (
                f"Context: {passage}\n"
                "Task: Extract key performance indicators (KPIs) and values from the above text. Also, specify one of the following categories to each of the extracted KPIs and values in brackets.\n"  # noqa: E501
                f"{KPIEDGARScenario.TAG_DESCRIPTIONS}"
            )

            output_text = KPIEDGARScenario.get_output_text(words, sentence["entities_anno"])
            if not output_text:
                continue  # skip sentences with no in-scope entities

            instances.append(
                Instance(
                    input=Input(text=input_text),
                    references=[Reference(Output(text=output_text), tags=[CORRECT_TAG])],
                    split=helm_split,
                )
            )
        return instances

    def get_instances(self, output_path: str) -> List[Instance]:
        """Download the KPI-EDGAR JSON (pinned commit) and convert it to Instances."""
        data_dir = os.path.join(output_path, "data")
        ensure_directory_exists(data_dir)

        json_path = os.path.join(data_dir, self.JSON_FILENAME)
        ensure_file_downloaded(source_url=self.JSON_URL, target_path=json_path)

        with open(json_path, "r") as f:
            raw_dataset = json.load(f)
        return KPIEDGARScenario.sentences_to_instances(KPIEDGARScenario.get_sentences(raw_dataset))
@@ -0,0 +1,129 @@
1
+ import os
2
+ import pandas as pd
3
+ import json
4
+ import re
5
+
6
+ from typing import List
7
+ from helm.common.general import ensure_file_downloaded, ensure_directory_exists
8
+ from helm.benchmark.scenarios.scenario import (
9
+ Input,
10
+ Scenario,
11
+ Instance,
12
+ Reference,
13
+ TRAIN_SPLIT,
14
+ TEST_SPLIT,
15
+ CORRECT_TAG,
16
+ Output,
17
+ )
18
+
19
+
20
class LegalContractSummarizationScenario(Scenario):
    """Legal Contract Summarization

    A legal contract summarization benchmark based on the paper
    Plain English Summarization of Contracts (Manor & Li, NAACL 2019),
    which presented a dataset of legal text snippets paired with summaries
    written in plain English.

    @inproceedings{manor-li-2019-plain,
        title = "Plain {E}nglish Summarization of Contracts",
        author = "Manor, Laura and
            Li, Junyi Jessy",
        editor = "Aletras, Nikolaos and
            Ash, Elliott and
            Barrett, Leslie and
            Chen, Daniel and
            Meyers, Adam and
            Preotiuc-Pietro, Daniel and
            Rosenberg, David and
            Stent, Amanda",
        booktitle = "Proceedings of the Natural Legal Language Processing Workshop 2019",
        month = jun,
        year = "2019",
        address = "Minneapolis, Minnesota",
        publisher = "Association for Computational Linguistics",
        url = "https://aclanthology.org/W19-2201",
        doi = "10.18653/v1/W19-2201",
        pages = "1--11",
        abstract = "Unilateral legal contracts, such as terms of service, play a substantial role in modern digital life. However, few read these documents before accepting the terms within, as they are too long and the language too complicated. We propose the task of summarizing such legal documents in plain English, which would enable users to have a better understanding of the terms they are accepting. We propose an initial dataset of legal text snippets paired with summaries written in plain English. We verify the quality of these summaries manually, and show that they involve heavy abstraction, compression, and simplification. Initial experiments show that unsupervised extractive summarization methods do not perform well on this task due to the level of abstraction and style differences. We conclude with a call for resource and technique development for simplification and style transfer for legal language.",
    }
    """  # noqa: E501

    # Fraction of the dataset sampled as the train split; the remainder is test.
    TRAIN_RATIO: float = 0.2
    # Field names of the records in the source JSON file.
    ARTICLE_COLUMN_NAME = "original_text"
    SUMMARY_COLUMN_NAME = "reference_summary"
    ID_COLUMN_NAME = "uid"

    name = "legal_contract_summarization"
    description = (
        "Plain English Summarization of Contracts [(Manor et al., 2019)](https://aclanthology.org/W19-2201.pdf)."
    )
    tags = ["summarization", "legal"]

    def __init__(self):
        """
        Initializes the scenario.

        """
        super().__init__()

    @staticmethod
    def _clean(text: str) -> str:
        """Collapse every run of whitespace (including newlines) into a single space."""
        return re.sub(r"\s+", " ", text)

    def _load_dataset(self, output_path: str):
        """Download the raw JSON dataset and return per-split DataFrames.

        Returns a dict mapping TRAIN_SPLIT / TEST_SPLIT to a pandas DataFrame
        of records; rows missing the article, summary, or id column are dropped.
        """
        data_dir = os.path.join(output_path, "data")
        ensure_directory_exists(data_dir)

        source_url = "https://raw.githubusercontent.com/lauramanor/legal_summarization/master/all_v1.json"
        source_file = os.path.basename(source_url)
        target_path = os.path.join(data_dir, source_file)
        ensure_file_downloaded(
            source_url=source_url,
            target_path=target_path,
        )

        # Explicit encoding: the upstream JSON is UTF-8, and the default
        # locale encoding is not guaranteed to be UTF-8 on every platform.
        with open(target_path, encoding="utf-8") as f:
            json_data = json.load(f)
        target_df = pd.DataFrame.from_records(list(json_data.values()))
        target_df = target_df.dropna(
            subset=[
                LegalContractSummarizationScenario.ARTICLE_COLUMN_NAME,
                LegalContractSummarizationScenario.SUMMARY_COLUMN_NAME,
                LegalContractSummarizationScenario.ID_COLUMN_NAME,
            ]
        )
        # Split randomly (works better than split by order); fixed random_state
        # keeps the train/test partition reproducible across runs.
        train_df = target_df.sample(frac=LegalContractSummarizationScenario.TRAIN_RATIO, random_state=0)
        test_df = target_df.drop(train_df.index).sample(frac=1, random_state=0)

        return {TRAIN_SPLIT: train_df, TEST_SPLIT: test_df}

    def get_instances(self, output_path: str) -> List[Instance]:
        """Build one Instance per record, with the article as input and the
        plain-English summary as the single correct reference."""
        dataset = self._load_dataset(output_path)

        instances: List[Instance] = []

        for split, split_data in dataset.items():
            for example in split_data.itertuples():
                # Avoid shadowing the builtin `id`.
                instance_id = getattr(example, LegalContractSummarizationScenario.ID_COLUMN_NAME)
                article = LegalContractSummarizationScenario._clean(
                    getattr(example, LegalContractSummarizationScenario.ARTICLE_COLUMN_NAME)
                )
                summary = LegalContractSummarizationScenario._clean(
                    getattr(example, LegalContractSummarizationScenario.SUMMARY_COLUMN_NAME)
                )
                input = Input(
                    text=article,
                )
                output = Output(text=summary)
                instance = Instance(
                    id=instance_id,
                    input=input,
                    references=[Reference(output=output, tags=[CORRECT_TAG])],
                    split=split,
                )
                instances.append(instance)

        return instances