crfm-helm 0.5.4__py3-none-any.whl → 0.5.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of crfm-helm might be problematic.

Files changed (580)
  1. crfm_helm-0.5.5.dist-info/METADATA +413 -0
  2. crfm_helm-0.5.5.dist-info/RECORD +894 -0
  3. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  22. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  23. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  24. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  25. helm/benchmark/adaptation/prompt.py +1 -1
  26. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  27. helm/benchmark/annotation/air_bench_annotator.py +20 -5
  28. helm/benchmark/annotation/annotator.py +5 -0
  29. helm/benchmark/annotation/annotator_factory.py +3 -20
  30. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  31. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  32. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  33. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  34. helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
  35. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  36. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  37. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  38. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  39. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  40. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  41. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  42. helm/benchmark/annotation/medalign_annotator.py +100 -0
  43. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  44. helm/benchmark/annotation/medication_qa_annotator.py +87 -63
  45. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  46. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  47. helm/benchmark/annotation/model_as_judge.py +218 -6
  48. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  49. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  50. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  51. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  52. helm/benchmark/annotation/omni_math_annotator.py +132 -0
  53. helm/benchmark/annotation/spider_annotator.py +18 -0
  54. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  55. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  56. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  57. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  58. helm/benchmark/annotation_executor.py +35 -15
  59. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  60. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  61. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  62. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  63. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  64. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  65. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  66. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  67. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  68. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  69. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  70. helm/benchmark/augmentations/perturbation.py +1 -1
  71. helm/benchmark/augmentations/space_perturbation.py +2 -2
  72. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  73. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  74. helm/benchmark/augmentations/test_perturbation.py +16 -13
  75. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  76. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  77. helm/benchmark/data_preprocessor.py +2 -2
  78. helm/benchmark/huggingface_registration.py +2 -7
  79. helm/benchmark/metrics/aci_bench_metrics.py +34 -0
  80. helm/benchmark/metrics/basic_metrics.py +6 -6
  81. helm/benchmark/metrics/bbq_metrics.py +2 -2
  82. helm/benchmark/metrics/bias_metrics.py +12 -3
  83. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  84. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  85. helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
  86. helm/benchmark/metrics/classification_metrics.py +76 -12
  87. helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
  88. helm/benchmark/metrics/code_metrics.py +5 -5
  89. helm/benchmark/metrics/comet_metric.py +125 -0
  90. helm/benchmark/metrics/common_metric_specs.py +9 -2
  91. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  92. helm/benchmark/metrics/copyright_metrics.py +4 -4
  93. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  94. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  95. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  96. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  97. helm/benchmark/metrics/dischargeme_metrics.py +34 -0
  98. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  99. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  100. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  101. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  102. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  103. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  104. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  105. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  106. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  107. helm/benchmark/metrics/ifeval/__init__.py +0 -0
  108. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  109. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  110. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  111. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  112. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  113. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  114. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  115. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  116. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  117. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  118. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  120. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  121. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  122. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  123. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  124. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  125. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  126. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  127. helm/benchmark/metrics/med_dialog_metrics.py +34 -0
  128. helm/benchmark/metrics/medalign_metrics.py +34 -0
  129. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  130. helm/benchmark/metrics/medec_metrics.py +101 -0
  131. helm/benchmark/metrics/medi_qa_metrics.py +34 -0
  132. helm/benchmark/metrics/medication_qa_metrics.py +15 -4
  133. helm/benchmark/metrics/mental_health_metrics.py +34 -0
  134. helm/benchmark/metrics/metric.py +3 -3
  135. helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
  136. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  137. helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
  138. helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
  139. helm/benchmark/metrics/nltk_helper.py +32 -0
  140. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  141. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  142. helm/benchmark/metrics/output_processing_metric.py +60 -0
  143. helm/benchmark/metrics/output_processors.py +15 -0
  144. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  145. helm/benchmark/metrics/ranking_metrics.py +3 -3
  146. helm/benchmark/metrics/reference_metric.py +3 -3
  147. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  148. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  149. helm/benchmark/metrics/spider_metrics.py +7 -0
  150. helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
  151. helm/benchmark/metrics/statistic.py +1 -1
  152. helm/benchmark/metrics/summac/model_summac.py +1 -1
  153. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  154. helm/benchmark/metrics/summarization_metrics.py +19 -9
  155. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  156. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  157. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  158. helm/benchmark/metrics/test_metric.py +1 -1
  159. helm/benchmark/metrics/test_statistic.py +2 -2
  160. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  161. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  162. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  163. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  164. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  165. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  166. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  167. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
  168. helm/benchmark/metrics/toxicity_metrics.py +4 -4
  169. helm/benchmark/metrics/unitxt_metrics.py +4 -1
  170. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  171. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  172. helm/benchmark/model_metadata_registry.py +16 -0
  173. helm/benchmark/presentation/summarize.py +23 -10
  174. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  175. helm/benchmark/reeval_run.py +203 -0
  176. helm/benchmark/reeval_runner.py +355 -0
  177. helm/benchmark/run.py +8 -17
  178. helm/benchmark/run_expander.py +78 -8
  179. helm/benchmark/run_spec_factory.py +12 -0
  180. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  181. helm/benchmark/run_specs/audio_run_specs.py +613 -0
  182. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  183. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  184. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  185. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  186. helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
  187. helm/benchmark/run_specs/experimental_run_specs.py +112 -3
  188. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  189. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  190. helm/benchmark/run_specs/long_context_run_specs.py +89 -0
  191. helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
  192. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  193. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  194. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  195. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
  196. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  197. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  198. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  199. helm/benchmark/run_specs/vlm_run_specs.py +75 -2
  200. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  201. helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
  202. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  203. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  204. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  205. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  206. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
  207. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  208. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  209. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  210. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  211. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  212. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  213. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  214. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  215. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  216. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  217. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  218. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  219. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  220. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  221. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  222. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  223. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  224. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  225. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  226. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
  227. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  228. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
  229. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  230. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  231. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  232. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  233. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  234. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  235. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  236. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  237. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  238. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  239. helm/benchmark/scenarios/bold_scenario.py +1 -1
  240. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  241. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  242. helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
  243. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  244. helm/benchmark/scenarios/clear_scenario.py +153 -0
  245. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  246. helm/benchmark/scenarios/code_scenario.py +17 -4
  247. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  248. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  249. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  250. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  251. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  252. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  253. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  254. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  255. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  256. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  257. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  258. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  259. helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
  260. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  261. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  262. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  263. helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
  264. helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
  265. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  266. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  267. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  268. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  269. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  270. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  271. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  272. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  273. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  274. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  275. helm/benchmark/scenarios/headqa_scenario.py +131 -0
  276. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  277. helm/benchmark/scenarios/ice_scenario.py +8 -4
  278. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  279. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  280. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  281. helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
  282. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  283. helm/benchmark/scenarios/koala_scenario.py +1 -1
  284. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  285. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  286. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  287. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  288. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  289. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  290. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  291. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  292. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  293. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  294. helm/benchmark/scenarios/math_scenario.py +9 -1
  295. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  296. helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
  297. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  298. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  299. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  300. helm/benchmark/scenarios/medalign_scenario.py +88 -0
  301. helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
  302. helm/benchmark/scenarios/medbullets_scenario.py +140 -0
  303. helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
  304. helm/benchmark/scenarios/medec_scenario.py +120 -0
  305. helm/benchmark/scenarios/medhallu_scenario.py +66 -0
  306. helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
  307. helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
  308. helm/benchmark/scenarios/mental_health_scenario.py +112 -0
  309. helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
  310. helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
  311. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
  312. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  313. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  314. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  315. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  316. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
  317. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
  318. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
  319. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  320. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  321. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  322. helm/benchmark/scenarios/numeracy_scenario.py +10 -1
  323. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  324. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  325. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  326. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  327. helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
  328. helm/benchmark/scenarios/quac_scenario.py +10 -1
  329. helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
  330. helm/benchmark/scenarios/raft_scenario.py +17 -2
  331. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  332. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  333. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  334. helm/benchmark/scenarios/scenario.py +9 -1
  335. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
  336. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  337. helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
  338. helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
  339. helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
  340. helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
  341. helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
  342. helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
  343. helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
  344. helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
  345. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  346. helm/benchmark/scenarios/spider_scenario.py +91 -0
  347. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
  348. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  349. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  350. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  351. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  352. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  353. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  354. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  355. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  356. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  357. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  358. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  359. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  360. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  361. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
  362. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  363. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  364. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  365. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  366. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  367. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  368. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  369. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  370. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  371. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  372. helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
  373. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  374. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  375. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  376. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  377. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  378. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  379. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  380. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  381. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  382. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  383. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  384. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  385. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  386. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  387. helm/benchmark/server.py +11 -0
  388. helm/benchmark/slurm_runner.py +1 -1
  389. helm/benchmark/static/schema_audio.yaml +752 -0
  390. helm/benchmark/static/schema_autobencher.yaml +150 -0
  391. helm/benchmark/static/schema_call_center.yaml +97 -60
  392. helm/benchmark/static/schema_capabilities.yaml +254 -0
  393. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  394. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  395. helm/benchmark/static/schema_enterprise.yaml +298 -0
  396. helm/benchmark/static/schema_finance.yaml +14 -12
  397. helm/benchmark/static/schema_heim.yaml +1389 -0
  398. helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
  399. helm/benchmark/static/schema_medhelm.yaml +1081 -0
  400. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  401. helm/benchmark/static/schema_safety.yaml +18 -1
  402. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
  403. helm/benchmark/static/schema_social_audio.yaml +224 -0
  404. helm/benchmark/static/schema_sql.yaml +171 -0
  405. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
  406. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  407. helm/benchmark/static/schema_vhelm.yaml +109 -36
  408. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  409. helm/benchmark/static_build/assets/index-262903c1.js +10 -0
  410. helm/benchmark/static_build/assets/index-42060d71.css +1 -0
  411. helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
  412. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  413. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  414. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
  415. helm/benchmark/static_build/config.js +1 -1
  416. helm/benchmark/static_build/index.html +5 -5
  417. helm/benchmark/window_services/default_window_service.py +1 -1
  418. helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
  419. helm/benchmark/window_services/ice_window_service.py +1 -1
  420. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  421. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  422. helm/benchmark/window_services/local_window_service.py +2 -2
  423. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  424. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  425. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  426. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  427. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  428. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  429. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  430. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  431. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  432. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  433. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  434. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  435. helm/benchmark/window_services/test_utils.py +1 -1
  436. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  437. helm/benchmark/window_services/yalm_window_service.py +1 -1
  438. helm/clients/ai21_client.py +3 -3
  439. helm/clients/aleph_alpha_client.py +1 -1
  440. helm/clients/audio_language/__init__.py +0 -0
  441. helm/clients/audio_language/diva_llama_client.py +118 -0
  442. helm/clients/audio_language/llama_omni_client.py +198 -0
  443. helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
  444. helm/clients/audio_language/qwen_audiolm_client.py +150 -0
  445. helm/clients/auto_client.py +4 -2
  446. helm/clients/azure_openai_client.py +55 -0
  447. helm/clients/bedrock_client.py +201 -7
  448. helm/clients/bedrock_utils.py +33 -0
  449. helm/clients/clip_scorers/clip_scorer.py +1 -1
  450. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  451. helm/clients/cohere_client.py +3 -3
  452. helm/clients/google_client.py +1 -1
  453. helm/clients/http_model_client.py +1 -1
  454. helm/clients/huggingface_client.py +10 -18
  455. helm/clients/ibm_client.py +267 -0
  456. helm/clients/image_generation/adobe_vision_client.py +1 -1
  457. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  458. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  459. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  460. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  461. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  462. helm/clients/image_generation/cogview2_client.py +1 -1
  463. helm/clients/image_generation/dalle2_client.py +1 -1
  464. helm/clients/image_generation/dalle3_client.py +2 -2
  465. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  466. helm/clients/image_generation/dalle_mini/data.py +1 -1
  467. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  468. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  469. helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
  470. helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
  471. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  472. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  473. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  474. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  475. helm/clients/image_generation/dalle_mini_client.py +1 -1
  476. helm/clients/image_generation/deep_floyd_client.py +1 -1
  477. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  478. helm/clients/image_generation/lexica_client.py +1 -1
  479. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  480. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  481. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  482. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  483. helm/clients/image_generation/mindalle_client.py +1 -1
  484. helm/clients/image_generation/together_image_generation_client.py +1 -1
  485. helm/clients/lit_gpt_client.py +2 -2
  486. helm/clients/mistral_client.py +62 -18
  487. helm/clients/nvidia_nim_client.py +0 -3
  488. helm/clients/openai_client.py +241 -22
  489. helm/clients/palmyra_client.py +1 -4
  490. helm/clients/reka_client.py +1 -1
  491. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  492. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  493. helm/clients/stanfordhealthcare_google_client.py +43 -0
  494. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  495. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  496. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  497. helm/clients/test_client.py +1 -1
  498. helm/clients/test_together_client.py +6 -1
  499. helm/clients/together_client.py +47 -7
  500. helm/clients/upstage_client.py +23 -0
  501. helm/clients/vertexai_client.py +39 -13
  502. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  503. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  504. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  505. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  506. helm/clients/vision_language/qwen2_vlm_client.py +175 -0
  507. helm/clients/vllm_client.py +4 -6
  508. helm/clients/yi_client.py +0 -3
  509. helm/common/audio_utils.py +111 -0
  510. helm/common/file_caches/local_file_cache.py +1 -1
  511. helm/common/file_caches/test_local_file_cache.py +1 -1
  512. helm/common/images_utils.py +2 -2
  513. helm/common/media_object.py +2 -2
  514. helm/common/multimodal_request_utils.py +26 -0
  515. helm/common/reeval_parameters.py +12 -0
  516. helm/common/request.py +6 -2
  517. helm/common/response_format.py +18 -0
  518. helm/common/test_media_object.py +1 -1
  519. helm/config/model_deployments.yaml +1112 -19
  520. helm/config/model_metadata.yaml +985 -44
  521. helm/config/tokenizer_configs.yaml +379 -3
  522. helm/proxy/cli.py +2 -2
  523. helm/proxy/example_queries.py +1 -1
  524. helm/proxy/server.py +11 -4
  525. helm/proxy/services/remote_service.py +1 -1
  526. helm/proxy/services/server_service.py +1 -1
  527. helm/proxy/services/test_remote_service.py +2 -2
  528. helm/proxy/services/test_service.py +1 -1
  529. helm/proxy/static/general.js +122 -0
  530. helm/proxy/static/help.html +99 -0
  531. helm/proxy/static/index.css +57 -0
  532. helm/proxy/static/index.html +40 -0
  533. helm/proxy/static/index.js +456 -0
  534. helm/proxy/static/info-icon.png +0 -0
  535. helm/proxy/test_retry.py +1 -1
  536. helm/proxy/token_counters/auto_token_counter.py +1 -1
  537. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  538. helm/tokenizers/caching_tokenizer.py +2 -30
  539. helm/tokenizers/http_model_tokenizer.py +1 -1
  540. helm/tokenizers/huggingface_tokenizer.py +2 -2
  541. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  542. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  543. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  544. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  545. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  546. helm/tokenizers/tokenizer.py +3 -1
  547. helm/tokenizers/yalm_tokenizer.py +3 -3
  548. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  549. crfm_helm-0.5.4.dist-info/METADATA +0 -350
  550. crfm_helm-0.5.4.dist-info/RECORD +0 -697
  551. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  552. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  553. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  554. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  555. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  556. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  557. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  558. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  559. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  560. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  561. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  562. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  563. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  564. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  565. helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
  566. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  567. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  568. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  569. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  570. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  571. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  572. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  573. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  574. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  575. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  576. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  577. helm/tokenizers/anthropic_tokenizer.py +0 -52
  578. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
  579. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
  580. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
helm/benchmark/scenarios/enem_challenge_scenario.py
@@ -0,0 +1,58 @@
+from typing import List, Any
+from pathlib import Path
+from datasets import load_dataset
+
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    CORRECT_TAG,
+    TEST_SPLIT,
+    Input,
+    Output,
+)
+
+
+class ENEMChallengeScenario(Scenario):
+    """
+    The Exame Nacional do Ensino Médio (ENEM) is an advanced High-School level exam widely applied
+    every year by the Brazilian government to students that wish to undertake a University degree.
+
+    The questions are about all types of intelectual fields and they are divided into four groups
+    that are named as: Humanities, Languages, Sciences and Mathematics.
+
+    This scenario is based on the exams that were applied throughout the years of 2009 and 2023.
+
+    The dataset can be found in this link: https://huggingface.co/datasets/eduagarcia/enem_challenge
+    """
+
+    name = "enem_challenge"
+    description = "ENEM Challenge dataset"
+    tags = ["knowledge", "multiple_choice", "pt-br"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Download the raw data and read all the dialogues
+        dataset: Any
+        # Read all the instances
+        instances: List[Instance] = []
+        cache_dir = str(Path(output_path) / "data")
+
+        dataset = load_dataset("eduagarcia/enem_challenge", cache_dir=cache_dir)
+        for example in dataset["train"]:
+            question = example["question"]
+            choices = example["choices"]
+            answer = example["answerKey"]
+            # Skipping every canceled question!
+            if answer == "ANULADO":
+                continue
+            answers_dict = dict(zip(choices["label"], choices["text"]))
+            correct_answer = answers_dict[answer]
+
+            def answer_to_reference(answer: str) -> Reference:
+                return Reference(Output(text=answer), tags=[CORRECT_TAG] if answer == correct_answer else [])
+
+            instance = Instance(
+                input=Input(text=question), split=TEST_SPLIT, references=list(map(answer_to_reference, choices["text"]))
+            )
+            instances.append(instance)
+        return instances
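
Note: a minimal smoke-test sketch for this new scenario, not part of the package; it assumes crfm-helm 0.5.5 is installed and the HuggingFace dataset is reachable, and the output path name is made up here.

    # Sketch: build the ENEM instances locally; "enem_output" is a placeholder path.
    from helm.benchmark.scenarios.enem_challenge_scenario import ENEMChallengeScenario

    scenario = ENEMChallengeScenario()
    # get_instances caches the dataset under <output_path>/data and skips
    # questions whose answerKey is "ANULADO" (canceled questions).
    instances = scenario.get_instances(output_path="enem_output")
    print(len(instances), instances[0].input.text[:80])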
helm/benchmark/scenarios/entity_data_imputation_scenario.py
@@ -5,7 +5,17 @@ from typing import List, Tuple
 
 from helm.common.hierarchical_logger import hlog
 from helm.common.general import ensure_file_downloaded
-from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, VALID_SPLIT, TEST_SPLIT, CORRECT_TAG, Input, Output
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
 
 
 class EntityDataImputationScenario(Scenario):
helm/benchmark/scenarios/entity_matching_scenario.py
@@ -4,8 +4,18 @@ from typing import Dict, List, Tuple
 
 from helm.common.hierarchical_logger import hlog
 from helm.common.general import ensure_file_downloaded
-from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, VALID_SPLIT, TEST_SPLIT, CORRECT_TAG, Input, Output
-from .entity_matching_scenario_fixed_random_state import set_fixed_random_state_for_dataset
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+from helm.benchmark.scenarios.entity_matching_scenario_fixed_random_state import set_fixed_random_state_for_dataset
 
 
 class EntityMatchingScenario(Scenario):
helm/benchmark/scenarios/financial_phrasebank_scenario.py
@@ -0,0 +1,94 @@
+import os
+import random
+from typing import List
+
+from helm.common.general import ensure_file_downloaded
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    CORRECT_TAG,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    Input,
+    Output,
+)
+
+
+class FinancialPhrasebankScenario(Scenario):
+    """
+    A sentiment classification benchmark based on the dataset from Good Debt or Bad Debt - Detecting Semantic Orientations in Economic Texts [(Malo et al., 2013)](https://arxiv.org/abs/1307.5336).
+
+    Context:
+    Polar sentiment dataset of sentences from financial news. The dataset consists of 4840 sentences from English
+    language financial news categorized by sentiment. The dataset is divided by agreement rate of 5-8 annotators.
+
+    This release of the financial phrase bank covers a collection of 4840 sentences. The selected collection of
+    phrases was annotated by 16 people with adequate background knowledge on financial markets.
+
+    Given the large number of overlapping annotations (5 to 8 annotations per sentence), there are several ways
+    to define a majority vote based gold standard. To provide an objective comparison, the paper authors have formed 4 alternative
+    reference datasets based on the strength of majority agreement: 100%, 75%, 66% and 50%.
+
+    Data source:
+    https://huggingface.co/datasets/takala/financial_phrasebank
+
+    Reference:
+    P. Malo, A. Sinha, P. Korhonen, J. Wallenius, and P. Takala, “Good debt or bad debt: Detecting semantic orientations in economic texts,” Journal of the Association for Information Science and Technology, vol. 65, 2014.
+    https://arxiv.org/pdf/1307.5336
+
+    """  # noqa: E501
+
+    name = "financial_phrasebank"
+    description = "The dataset consists of 4840 sentences from English \
+        language financial news categorized by sentiment."
+    tags = ["finance", "sentiment analysis", "classification"]
+
+    INSTRUCTIONS = """The dataset consists of sentences from English language financial news categorized by sentiment.
+    Classify the sentences into one of the 3 sentiment categories.
+    Possible labels:\n1. positive\n2. neutral\n3. negative"""  # noqa: E501
+    DATASET_URL = "https://huggingface.co/datasets/takala/financial_phrasebank/resolve/598b6aad98f7c8d67be161b12a4b5f2497e07edd/data/FinancialPhraseBank-v1.0.zip"  # noqa: E501
+    AGREEMENT_VALUES = [50, 66, 75, 100]
+    TRAIN_SPLIT_SIZE = 0.7
+
+    def __init__(self, agreement: int, random_seed: int = 121):
+        """The initialization of an instance.
+
+        Args:
+            subset: str: This argument is used to specify the ratio of annotators who agreed on the ground truth label.
+                The value must be one of the strings defined in
+                SUBSETS = ["sentences_allagree", "sentences_75agree", "sentences_66agree", "sentences_50agree"].
+            random_seed: int = 121: The random seed for sampling the train/test splits.
+        """
+        super().__init__()
+        if agreement not in self.AGREEMENT_VALUES:
+            raise Exception(
+                f"Unknown `agreement` value: {agreement}, allowed values are {self.AGREEMENT_VALUES}".format(agreement)
+            )
+        self.agreement = agreement
+        self.random_seed = random_seed
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_parent_path = os.path.join(output_path, "data")
+        ensure_file_downloaded(
+            self.DATASET_URL,
+            data_parent_path,
+            unpack=True,
+            unpack_type="unzip",
+        )
+        file_name = "Sentences_AllAgree.txt" if self.agreement == 100 else f"Sentences_{self.agreement}Agree.txt"
+        data_file_path = os.path.join(data_parent_path, "FinancialPhraseBank-v1.0", file_name)
+        with open(data_file_path, mode="r", encoding="iso-8859-1") as f:
+            lines = list(f.readlines())
+        random.Random(self.random_seed).shuffle(lines)
+        train_split_index = int(len(lines) * self.TRAIN_SPLIT_SIZE)
+        instances: List[Instance] = []
+        for index, line in enumerate(lines):
+            sentence, label = line.strip().rsplit("@", 1)
+            instance = Instance(
+                input=Input(text=sentence),
+                references=[Reference(Output(text=label), tags=[CORRECT_TAG])],
+                split=TRAIN_SPLIT if index < train_split_index else TEST_SPLIT,
+            )
+            instances.append(instance)
+        return instances
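
Note: a hedged usage sketch of the `agreement` parameter; 75 selects the "Sentences_75Agree.txt" subset, and the output path below is a placeholder.

    from helm.benchmark.scenarios.financial_phrasebank_scenario import FinancialPhrasebankScenario

    # agreement must be one of [50, 66, 75, 100]; anything else raises.
    scenario = FinancialPhrasebankScenario(agreement=75)
    instances = scenario.get_instances(output_path="fpb_output")  # placeholder path
    # Roughly 70% of the shuffled sentences land in TRAIN_SPLIT, the rest in TEST_SPLIT.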
helm/benchmark/scenarios/gold_commodity_news_scenario.py
@@ -0,0 +1,124 @@
+import dataclasses
+import math
+import os
+import random
+from typing import List
+
+import pandas as pd
+
+from helm.benchmark.runner import TRAIN_SPLIT
+from helm.common.general import ensure_directory_exists, ensure_file_downloaded
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    TEST_SPLIT,
+    Input,
+    Instance,
+    Reference,
+    Scenario,
+    Output,
+)
+
+
+class GoldCommodityNewsScenario(Scenario):
+    """Gold commodity news headline classification
+
+    This dataset contains gold commodity news headlines annotated by humans labeled by humans with regards to
+    whether the news headline discusses past movements and expected directionality in prices, asset comparison
+    and other general information. The task is to classify the news headlines using these labels.
+
+    Paper: https://arxiv.org/abs/2009.04202
+    Dataset: https://www.kaggle.com/datasets/daittan/gold-commodity-news-and-dimensions
+
+    Citation:
+    Ankur Sinha, Tanmay Khandait
+    "Impact of News on the Commodity Market: Dataset and Results." arXiv preprint arXiv:2009.04202 (2020)"""
+
+    name = "gold_commodity_news"
+    description = "The dataset is a collection of news items related to the gold commodities from various sources."
+
+    tags = ["news headline", "classification"]
+
+    CATEGORY_COLUMN_NAMES = {
+        "price_or_not": "Price or Not",
+        "direction_up": "Direction Up",
+        "direction_constant": "Direction Constant",
+        "direction_down": "Direction Down",
+        "past_price": "PastPrice",
+        "future_price": "FuturePrice",
+        "past_news": "PastNews",
+        "future_news": "FutureNews",
+        "assert_comparison": "Asset Comparision",
+    }
+
+    CATEGORY_INSTRUCTIONS = {
+        "price_or_not": "the gold price",
+        "direction_up": "the gold price heading up",
+        "direction_constant": "the price remaining constant or stable",
+        "direction_down": "the gold price heading down",
+        "past_price": "any past information about gold prices",
+        "future_price": "any future information about gold prices",
+        "past_news": "any past information other than the gold prices",
+        "future_news": "any future information other than the gold prices",
+        "assert_comparison": "a comparison purely in the context of the gold commodity with another asset",
+    }
+
+    @classmethod
+    def get_instructions(cls, category: str):
+        if category not in GoldCommodityNewsScenario.CATEGORY_INSTRUCTIONS:
+            raise ValueError(
+                f"Invalid category: '{category}' "
+                f"Valid categories are: {list(GoldCommodityNewsScenario.CATEGORY_INSTRUCTIONS.keys())}"
+            )
+
+        return (
+            "The following are news headlines about the gold commodity. "
+            "Classify whether the news headline discusses "
+            f'{GoldCommodityNewsScenario.CATEGORY_INSTRUCTIONS[category]}. Answer only "Yes" or "No".'
+        )
+
+    def __init__(self, category: str):
+        super().__init__()
+        if category not in GoldCommodityNewsScenario.CATEGORY_INSTRUCTIONS:
+            raise ValueError(
+                f"Invalid category: '{category}' "
+                f"Valid categories are: {list(GoldCommodityNewsScenario.CATEGORY_INSTRUCTIONS.keys())}"
+            )
+        self.column_name = GoldCommodityNewsScenario.CATEGORY_COLUMN_NAMES[category]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        ensure_directory_exists(output_path)
+        data_path = os.path.join(output_path, "finalDataset_0208.csv")
+
+        ensure_file_downloaded(
+            source_url=(
+                "https://www.kaggle.com/api/v1/datasets/download/daittan/"
+                "gold-commodity-news-and-dimensions?dataset_version_number=1"
+            ),
+            target_path=data_path,
+            unpack=True,
+            unpack_type="unzip",
+        )
+
+        df = pd.read_csv(data_path)
+
+        instances: List[Instance] = []
+        for _, row in df.iterrows():
+            expected_output: str
+            if row[self.column_name] == 1:
+                expected_output = "Yes"
+            else:
+                expected_output = "No"
+
+            instance = Instance(
+                input=Input(text=str(row["News"])),
+                references=[Reference(Output(text=expected_output), tags=[CORRECT_TAG])],
+                split=str(TEST_SPLIT),
+            )
+            instances.append(instance)
+
+        # no explicit train/test split, so randomly assign 10% of rows to train
+        random.seed(0)
+        train_indexes = random.sample(list(range(len(instances))), k=math.floor(len(instances) / 10))
+        for train_index in train_indexes:
+            instances[train_index] = dataclasses.replace(instances[train_index], split=TRAIN_SPLIT)
+        return instances
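
Note: a sketch of how the `category` argument maps to prompt instructions and labels, under the assumption that the Kaggle download works from this environment (credentials may in practice be required); the output path is a placeholder.

    from helm.benchmark.scenarios.gold_commodity_news_scenario import GoldCommodityNewsScenario

    # Per-category instructions are generated for use in an adapter spec.
    print(GoldCommodityNewsScenario.get_instructions("direction_up"))

    scenario = GoldCommodityNewsScenario(category="direction_up")
    instances = scenario.get_instances(output_path="gold_news_output")  # placeholder path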
helm/benchmark/scenarios/gpqa_scenario.py
@@ -0,0 +1,80 @@
+import datasets
+import os
+import random
+from typing import List
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+from helm.common.general import ensure_directory_exists
+
+
+SUBSETS = ["gpqa_main", "gpqa_diamond", "gpqa_extended"]
+
+# Train example indices below are found by indexing examples given in the original paper repo
+TRAIN_EXAMPLE_INDICES = {
+    "gpqa_main": [339, 105],
+    "gpqa_diamond": [124, 39],
+    "gpqa_extended": [146, 330, 436],
+}
+
+
+class GPQAScenario(Scenario):
+    """GPQA
+
+    GPQA is a multiple-choice, Q&A dataset of very hard questions written and validated by experts in biology, physics,
+    and chemistry. When attempting questions out of their own domain (e.g., a physicist answers a chemistry question),
+    these experts get only 34% accuracy, despite spending >30m with full access to Google."""
+
+    name = "gpqa"
+    description = "A Graduate-Level Google-Proof Q&A Benchmark"
+    tags = ["question answering"]
+
+    def __init__(self, subset: str, random_seed=42):
+        super().__init__()
+        assert subset in SUBSETS, "Unknown subset: {}".format(subset)
+        self.subset = subset
+        self.random_seed = random_seed
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Get GPQA from HuggingFace
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+        dataset = datasets.load_dataset(
+            "Idavidrein/gpqa",
+            self.subset,
+            trust_remote_code=True,
+            cache_dir=cache_dir,
+            split="train",
+            revision="90b8e5be2b1d3d2dbfe016cdab47981150600c4a",
+        )
+        assert isinstance(dataset, datasets.Dataset)
+
+        # Read all instances
+        random.seed(self.random_seed)
+        instances: List[Instance] = []
+        for idx, row in enumerate(dataset):
+            input = Input(text=row["Question"].strip())
+            references = [
+                Reference(Output(text=row["Correct Answer"].strip()), tags=[CORRECT_TAG]),
+                Reference(Output(text=row["Incorrect Answer 1"].strip()), tags=[]),
+                Reference(Output(text=row["Incorrect Answer 2"].strip()), tags=[]),
+                Reference(Output(text=row["Incorrect Answer 3"].strip()), tags=[]),
+            ]
+            random.shuffle(references)
+            if idx in TRAIN_EXAMPLE_INDICES[self.subset]:
+                extra_data = {
+                    "chain_of_thought": row["Explanation"],
+                }
+                instance = Instance(input=input, references=references, split=TRAIN_SPLIT, extra_data=extra_data)
+            else:
+                instance = Instance(input=input, references=references, split=TEST_SPLIT)
+            instances.append(instance)
+
+        return instances
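
Note: a hedged sketch of pulling the diamond subset and separating the pinned chain-of-thought examples; Idavidrein/gpqa is a gated HuggingFace dataset, so authentication may be required, and the output path is a placeholder.

    from helm.benchmark.scenarios.gpqa_scenario import GPQAScenario
    from helm.benchmark.scenarios.scenario import TRAIN_SPLIT

    scenario = GPQAScenario(subset="gpqa_diamond")
    instances = scenario.get_instances(output_path="gpqa_output")  # placeholder path
    # The two indices pinned in TRAIN_EXAMPLE_INDICES become TRAIN_SPLIT instances,
    # each carrying extra_data["chain_of_thought"] for few-shot CoT prompting.
    train = [inst for inst in instances if inst.split == TRAIN_SPLIT]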
helm/benchmark/scenarios/grammar_scenario.py
@@ -1,7 +1,7 @@
 from typing import List
 
-from .scenario import Scenario, Instance, Input, TEST_SPLIT
-from .grammar import read_grammar, generate_derivations, Derivation, get_values, get_tags
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT
+from helm.benchmark.scenarios.grammar import read_grammar, generate_derivations, Derivation, get_values, get_tags
 
 
 class GrammarScenario(Scenario):
helm/benchmark/scenarios/gsm_scenario.py
@@ -3,7 +3,16 @@ import os
 from typing import Dict, List
 
 from helm.common.general import ensure_file_downloaded
-from .scenario import Scenario, Instance, Reference, CORRECT_TAG, TRAIN_SPLIT, TEST_SPLIT, Input, Output
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    CORRECT_TAG,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    Input,
+    Output,
+)
 
 
 class GSM8KScenario(Scenario):
helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py
@@ -0,0 +1,50 @@
+import os
+import pandas as pd
+from typing import List
+
+from helm.common.general import ensure_file_downloaded
+
+from .scenario import Scenario, Instance, Input, TEST_SPLIT, Reference, Output
+
+
+class HarmBenchGCGTransferScenario(Scenario):
+    """
+    HarmBenchGCG-T is a standardized evaluation framework for automated red teaming.
+    HarmBench identifies key considerations previously unaccounted for in red teaming
+    evaluations and systematically designed prompts that meet these criteria.
+
+    https://arxiv.org/abs/2402.04249
+    """
+
+    name = "harm_bench_gcg_transfer"
+    description = "HarmBench GCG-T dataset - standard and contextual attacks"
+    tags = ["safety"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_path = os.path.join(output_path, "data")
+        os.makedirs(data_path, exist_ok=True)
+        url = "https://raw.githubusercontent.com/farzaank/harmbench-gcg-ensembled/refs/heads/main/output.csv"  # noqa: E501
+        outf_path = os.path.join(data_path, "harmbench_gcg.csv")
+        ensure_file_downloaded(
+            source_url=url,
+            target_path=outf_path,
+            unpack=False,
+        )
+
+        df = pd.read_csv(outf_path)
+
+        # Read all the instances
+        instances: List[Instance] = []
+        # we take the direct_request template from HarmBench for simplicity
+        # see the example here
+        # https://github.com/centerforaisafety/HarmBench/blob/main/baselines/direct_request/direct_request.py
+        #
+        # in the future we can use other types of templates including various red team attacks
+        for i, row in df.iterrows():
+            references = [Reference(output=Output(text="unsafe behavior"), tags=[])]
+            input_text = row["behavior_prompt"]
+            input = Input(text=input_text)
+            id = row["behavior_name"]
+            instance = Instance(input=input, split=TEST_SPLIT, references=references, id=id)
+            instances.append(instance)
+        return instances
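
Note: a sketch of what this safety scenario yields; each instance is a GCG-T transfer attack prompt whose single reference ("unsafe behavior") carries no CORRECT_TAG, which suggests scoring is handled elsewhere (e.g., by an annotator) rather than by reference matching. The output path below is a placeholder.

    from helm.benchmark.scenarios.harm_bench_gcg_transfer_scenario import HarmBenchGCGTransferScenario

    scenario = HarmBenchGCGTransferScenario()
    instances = scenario.get_instances(output_path="harmbench_output")  # placeholder path
    print(instances[0].id, instances[0].input.text[:60])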
helm/benchmark/scenarios/harm_bench_scenario.py
@@ -4,7 +4,7 @@ from typing import List
 
 from helm.common.general import ensure_file_downloaded
 
-from .scenario import Scenario, Instance, Input, TEST_SPLIT, Reference, Output
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT, Reference, Output
 
 
 class HarmBenchScenario(Scenario):
helm/benchmark/scenarios/headqa_scenario.py
@@ -0,0 +1,131 @@
+import os
+from typing import List, Optional
+
+from datasets import DatasetDict, load_dataset
+
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    TEST_SPLIT,
+    Input,
+    Instance,
+    Output,
+    Reference,
+    Scenario,
+)
+from helm.common.general import ensure_directory_exists
+
+
+class HeadQAScenario(Scenario):
+    """
+    From "HEAD-QA: A Healthcare Dataset for Complex Reasoning" (Vilares et al.), HEAD-QA is a multi-choice
+    question-answering dataset designed to evaluate reasoning on challenging healthcare-related questions.
+    The questions are sourced from Spanish healthcare exams for specialized positions, covering various topics
+    such as Medicine, Nursing, Psychology, Chemistry, Pharmacology, and Biology.
+
+    Example from the dataset:
+
+    Question:
+    The excitatory postsynaptic potentials:
+
+    A) They are all or nothing.
+    B) They are hyperpolarizing.
+    C) They can be added.
+    D) They spread long distances.
+
+    Answer:
+    The answer is C. Explanation: None provided in this dataset.
+
+    @InProceedings{HEAD-QA,
+    author = {David Vilares and Manuel Vilares and Carlos Gómez-Rodríguez},
+    title = {HEAD-QA: A Healthcare Dataset for Complex Reasoning},
+    year = {2019},
+    abstract = {We present HEAD-QA, a multi-choice question answering testbed to encourage research on complex
+    reasoning. The questions come from exams to access a specialized position in the Spanish healthcare system,
+    and are challenging even for highly specialized humans. We then consider monolingual (Spanish) and
+    cross-lingual (to English) experiments with information retrieval and neural techniques. We show that:
+    (i) HEAD-QA challenges current methods, and (ii) the results lag well behind human performance,
+    demonstrating its usefulness as a benchmark for future work.}}
+
+
+    Task:
+    Given a question and its multiple-choice answers, models must identify the correct answer, corresponding to the
+    `ra` field in the dataset. The dataset spans six healthcare domains and is challenging even for experts.
+    """
+
+    HUGGING_FACE_DATASET_PATH: str = "dvilares/head_qa"
+    SKIP_VQA: bool = True
+    SKIP_TEXTQA: bool = False
+
+    name = "head_qa"
+    description = "A collection of biomedical multiple-choice questions for testing medical knowledge."
+    tags = ["question_answering", "biomedical", "medicine"]
+
+    def __init__(self, language: str = "en", category: Optional[str] = None):
+        """Initialize the HEAD-QA scenario.
+
+        Args:
+            language (str, optional): Language of the dataset. Defaults to "en".
+            category (str, optional): Category of the dataset. If None, all categories are used.
+        """
+        super().__init__()
+        self.language: str = language
+        self.category: Optional[str] = category
+        assert (
+            self.SKIP_VQA or self.SKIP_TEXTQA
+        ), "Failed to initialize HeadQAScenario, one of `SKIP_VQA` or `SKIP_TEXTQA` must be True."
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_path: str = os.path.join(output_path, "data")
+        ensure_directory_exists(data_path)
+        dataset: DatasetDict = load_dataset(self.HUGGING_FACE_DATASET_PATH, self.language)
+
+        # XXX: Should we consider validation as test too?
+        # splits = {TRAIN_SPLIT: ["train", "validation"], TEST_SPLIT: ["test"]}
+        # Limit to zero shot setting
+        splits = {TEST_SPLIT: ["test"]}
+        instances: List[Instance] = []
+        for (
+            helm_split_name,
+            dataset_splits_name,
+        ) in splits.items():  # Iterate over the splits
+            for dataset_split_name in dataset_splits_name:
+                split_data = dataset[dataset_split_name]
+
+                for example in split_data:
+                    # Whether to process Visual Question Answering (VQA) examples
+                    if self.SKIP_VQA and example["image"] is not None:
+                        continue
+
+                    # Whether to process Text Question Answering (TextQA) examples
+                    if self.SKIP_TEXTQA and example["image"] is None:
+                        continue
+
+                    # If specified, filter by category
+                    if self.category is not None:
+                        if example["category"] != self.category:
+                            continue
+
+                    question = example["qtext"]
+
+                    # Format the final answer with explanation
+                    instances.append(
+                        Instance(
+                            input=Input(text=question),
+                            references=[
+                                Reference(
+                                    Output(text=option["atext"]),
+                                    tags=[CORRECT_TAG] if option["aid"] == example["ra"] else [],
+                                )
+                                for option in example["answers"]
+                            ],
+                            split=helm_split_name,
+                            extra_data={
+                                "id": example["qid"],
+                                "name": example["name"],
+                                "category": example["category"],
+                                "year": example["year"],
+                            },
+                        )
+                    )
+
+        return instances
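
Note: a hedged construction sketch; the category value below ("medicine") is assumed to follow the HuggingFace head_qa category names, and the output path is a placeholder.

    from helm.benchmark.scenarios.headqa_scenario import HeadQAScenario

    # English text-only questions; image (VQA) items are skipped via SKIP_VQA.
    scenario = HeadQAScenario(language="en", category="medicine")
    instances = scenario.get_instances(output_path="headqa_output")  # placeholder path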
helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py
@@ -0,0 +1,37 @@
+import csv
+import os
+from typing import List
+
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    Input,
+)
+
+
+_DATA_DIRRECTORY_PATH = "restricted/helpdesk_call_summarization/HELM Sample Transcripts_20241221_0045"
+
+
+class HelpdeskCallSummarizationScenario(Scenario):
+    """Helpdesk call summarization."""
+
+    name = "helpdesk_call_summarization"
+    description = "Helpdesk call summarization."
+    tags = ["helpdesk_call_center"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        instances: List[Instance] = []
+        for file_name in os.listdir(_DATA_DIRRECTORY_PATH):
+            if not file_name.endswith(".csv") or not file_name.startswith("Call1-"):
+                continue
+            file_path = os.path.join(_DATA_DIRRECTORY_PATH, file_name)
+            with open(file_path) as f:
+                csv_reader = csv.reader(f)
+                prompt_lines = [f"{row[0]}: {row[4]}" for row in csv_reader]
+            prompt = "\n".join(prompt_lines)
+            instance_id = file_name.removeprefix("Call1-").removesuffix(".csv")
+            input = Input(text=prompt)
+            instance = Instance(id=instance_id, input=input, references=[], split=TEST_SPLIT)
+            instances.append(instance)
+        return instances
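
Note: unlike the other new scenarios, this one reads restricted local transcripts rather than downloading data, so a sketch like the following only runs on a machine where the hard-coded restricted/helpdesk_call_summarization directory exists; output_path is effectively unused.

    from helm.benchmark.scenarios.helpdesk_call_summarization_scenario import (
        HelpdeskCallSummarizationScenario,
    )

    scenario = HelpdeskCallSummarizationScenario()
    instances = scenario.get_instances(output_path="unused")  # path is ignored by this scenario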