crfm-helm 0.5.4__py3-none-any.whl → 0.5.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (580)
  1. crfm_helm-0.5.5.dist-info/METADATA +413 -0
  2. crfm_helm-0.5.5.dist-info/RECORD +894 -0
  3. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  22. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  23. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  24. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  25. helm/benchmark/adaptation/prompt.py +1 -1
  26. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  27. helm/benchmark/annotation/air_bench_annotator.py +20 -5
  28. helm/benchmark/annotation/annotator.py +5 -0
  29. helm/benchmark/annotation/annotator_factory.py +3 -20
  30. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  31. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  32. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  33. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  34. helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
  35. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  36. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  37. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  38. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  39. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  40. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  41. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  42. helm/benchmark/annotation/medalign_annotator.py +100 -0
  43. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  44. helm/benchmark/annotation/medication_qa_annotator.py +87 -63
  45. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  46. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  47. helm/benchmark/annotation/model_as_judge.py +218 -6
  48. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  49. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  50. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  51. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  52. helm/benchmark/annotation/omni_math_annotator.py +132 -0
  53. helm/benchmark/annotation/spider_annotator.py +18 -0
  54. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  55. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  56. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  57. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  58. helm/benchmark/annotation_executor.py +35 -15
  59. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  60. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  61. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  62. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  63. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  64. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  65. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  66. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  67. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  68. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  69. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  70. helm/benchmark/augmentations/perturbation.py +1 -1
  71. helm/benchmark/augmentations/space_perturbation.py +2 -2
  72. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  73. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  74. helm/benchmark/augmentations/test_perturbation.py +16 -13
  75. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  76. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  77. helm/benchmark/data_preprocessor.py +2 -2
  78. helm/benchmark/huggingface_registration.py +2 -7
  79. helm/benchmark/metrics/aci_bench_metrics.py +34 -0
  80. helm/benchmark/metrics/basic_metrics.py +6 -6
  81. helm/benchmark/metrics/bbq_metrics.py +2 -2
  82. helm/benchmark/metrics/bias_metrics.py +12 -3
  83. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  84. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  85. helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
  86. helm/benchmark/metrics/classification_metrics.py +76 -12
  87. helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
  88. helm/benchmark/metrics/code_metrics.py +5 -5
  89. helm/benchmark/metrics/comet_metric.py +125 -0
  90. helm/benchmark/metrics/common_metric_specs.py +9 -2
  91. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  92. helm/benchmark/metrics/copyright_metrics.py +4 -4
  93. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  94. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  95. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  96. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  97. helm/benchmark/metrics/dischargeme_metrics.py +34 -0
  98. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  99. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  100. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  101. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  102. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  103. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  104. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  105. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  106. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  107. helm/benchmark/metrics/ifeval/__init__.py +0 -0
  108. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  109. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  110. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  111. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  112. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  113. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  114. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  115. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  116. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  117. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  118. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  120. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  121. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  122. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  123. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  124. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  125. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  126. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  127. helm/benchmark/metrics/med_dialog_metrics.py +34 -0
  128. helm/benchmark/metrics/medalign_metrics.py +34 -0
  129. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  130. helm/benchmark/metrics/medec_metrics.py +101 -0
  131. helm/benchmark/metrics/medi_qa_metrics.py +34 -0
  132. helm/benchmark/metrics/medication_qa_metrics.py +15 -4
  133. helm/benchmark/metrics/mental_health_metrics.py +34 -0
  134. helm/benchmark/metrics/metric.py +3 -3
  135. helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
  136. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  137. helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
  138. helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
  139. helm/benchmark/metrics/nltk_helper.py +32 -0
  140. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  141. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  142. helm/benchmark/metrics/output_processing_metric.py +60 -0
  143. helm/benchmark/metrics/output_processors.py +15 -0
  144. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  145. helm/benchmark/metrics/ranking_metrics.py +3 -3
  146. helm/benchmark/metrics/reference_metric.py +3 -3
  147. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  148. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  149. helm/benchmark/metrics/spider_metrics.py +7 -0
  150. helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
  151. helm/benchmark/metrics/statistic.py +1 -1
  152. helm/benchmark/metrics/summac/model_summac.py +1 -1
  153. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  154. helm/benchmark/metrics/summarization_metrics.py +19 -9
  155. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  156. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  157. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  158. helm/benchmark/metrics/test_metric.py +1 -1
  159. helm/benchmark/metrics/test_statistic.py +2 -2
  160. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  161. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  162. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  163. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  164. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  165. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  166. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  167. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
  168. helm/benchmark/metrics/toxicity_metrics.py +4 -4
  169. helm/benchmark/metrics/unitxt_metrics.py +4 -1
  170. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  171. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  172. helm/benchmark/model_metadata_registry.py +16 -0
  173. helm/benchmark/presentation/summarize.py +23 -10
  174. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  175. helm/benchmark/reeval_run.py +203 -0
  176. helm/benchmark/reeval_runner.py +355 -0
  177. helm/benchmark/run.py +8 -17
  178. helm/benchmark/run_expander.py +78 -8
  179. helm/benchmark/run_spec_factory.py +12 -0
  180. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  181. helm/benchmark/run_specs/audio_run_specs.py +613 -0
  182. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  183. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  184. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  185. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  186. helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
  187. helm/benchmark/run_specs/experimental_run_specs.py +112 -3
  188. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  189. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  190. helm/benchmark/run_specs/long_context_run_specs.py +89 -0
  191. helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
  192. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  193. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  194. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  195. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
  196. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  197. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  198. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  199. helm/benchmark/run_specs/vlm_run_specs.py +75 -2
  200. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  201. helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
  202. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  203. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  204. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  205. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  206. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
  207. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  208. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  209. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  210. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  211. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  212. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  213. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  214. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  215. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  216. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  217. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  218. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  219. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  220. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  221. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  222. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  223. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  224. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  225. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  226. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
  227. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  228. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
  229. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  230. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  231. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  232. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  233. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  234. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  235. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  236. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  237. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  238. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  239. helm/benchmark/scenarios/bold_scenario.py +1 -1
  240. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  241. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  242. helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
  243. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  244. helm/benchmark/scenarios/clear_scenario.py +153 -0
  245. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  246. helm/benchmark/scenarios/code_scenario.py +17 -4
  247. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  248. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  249. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  250. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  251. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  252. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  253. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  254. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  255. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  256. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  257. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  258. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  259. helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
  260. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  261. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  262. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  263. helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
  264. helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
  265. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  266. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  267. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  268. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  269. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  270. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  271. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  272. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  273. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  274. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  275. helm/benchmark/scenarios/headqa_scenario.py +131 -0
  276. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  277. helm/benchmark/scenarios/ice_scenario.py +8 -4
  278. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  279. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  280. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  281. helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
  282. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  283. helm/benchmark/scenarios/koala_scenario.py +1 -1
  284. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  285. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  286. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  287. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  288. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  289. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  290. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  291. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  292. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  293. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  294. helm/benchmark/scenarios/math_scenario.py +9 -1
  295. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  296. helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
  297. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  298. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  299. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  300. helm/benchmark/scenarios/medalign_scenario.py +88 -0
  301. helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
  302. helm/benchmark/scenarios/medbullets_scenario.py +140 -0
  303. helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
  304. helm/benchmark/scenarios/medec_scenario.py +120 -0
  305. helm/benchmark/scenarios/medhallu_scenario.py +66 -0
  306. helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
  307. helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
  308. helm/benchmark/scenarios/mental_health_scenario.py +112 -0
  309. helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
  310. helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
  311. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
  312. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  313. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  314. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  315. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  316. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
  317. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
  318. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
  319. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  320. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  321. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  322. helm/benchmark/scenarios/numeracy_scenario.py +10 -1
  323. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  324. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  325. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  326. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  327. helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
  328. helm/benchmark/scenarios/quac_scenario.py +10 -1
  329. helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
  330. helm/benchmark/scenarios/raft_scenario.py +17 -2
  331. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  332. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  333. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  334. helm/benchmark/scenarios/scenario.py +9 -1
  335. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
  336. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  337. helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
  338. helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
  339. helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
  340. helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
  341. helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
  342. helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
  343. helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
  344. helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
  345. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  346. helm/benchmark/scenarios/spider_scenario.py +91 -0
  347. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
  348. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  349. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  350. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  351. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  352. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  353. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  354. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  355. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  356. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  357. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  358. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  359. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  360. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  361. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
  362. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  363. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  364. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  365. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  366. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  367. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  368. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  369. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  370. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  371. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  372. helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
  373. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  374. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  375. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  376. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  377. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  378. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  379. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  380. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  381. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  382. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  383. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  384. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  385. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  386. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  387. helm/benchmark/server.py +11 -0
  388. helm/benchmark/slurm_runner.py +1 -1
  389. helm/benchmark/static/schema_audio.yaml +752 -0
  390. helm/benchmark/static/schema_autobencher.yaml +150 -0
  391. helm/benchmark/static/schema_call_center.yaml +97 -60
  392. helm/benchmark/static/schema_capabilities.yaml +254 -0
  393. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  394. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  395. helm/benchmark/static/schema_enterprise.yaml +298 -0
  396. helm/benchmark/static/schema_finance.yaml +14 -12
  397. helm/benchmark/static/schema_heim.yaml +1389 -0
  398. helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
  399. helm/benchmark/static/schema_medhelm.yaml +1081 -0
  400. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  401. helm/benchmark/static/schema_safety.yaml +18 -1
  402. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
  403. helm/benchmark/static/schema_social_audio.yaml +224 -0
  404. helm/benchmark/static/schema_sql.yaml +171 -0
  405. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
  406. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  407. helm/benchmark/static/schema_vhelm.yaml +109 -36
  408. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  409. helm/benchmark/static_build/assets/index-262903c1.js +10 -0
  410. helm/benchmark/static_build/assets/index-42060d71.css +1 -0
  411. helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
  412. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  413. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  414. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
  415. helm/benchmark/static_build/config.js +1 -1
  416. helm/benchmark/static_build/index.html +5 -5
  417. helm/benchmark/window_services/default_window_service.py +1 -1
  418. helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
  419. helm/benchmark/window_services/ice_window_service.py +1 -1
  420. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  421. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  422. helm/benchmark/window_services/local_window_service.py +2 -2
  423. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  424. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  425. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  426. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  427. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  428. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  429. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  430. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  431. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  432. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  433. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  434. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  435. helm/benchmark/window_services/test_utils.py +1 -1
  436. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  437. helm/benchmark/window_services/yalm_window_service.py +1 -1
  438. helm/clients/ai21_client.py +3 -3
  439. helm/clients/aleph_alpha_client.py +1 -1
  440. helm/clients/audio_language/__init__.py +0 -0
  441. helm/clients/audio_language/diva_llama_client.py +118 -0
  442. helm/clients/audio_language/llama_omni_client.py +198 -0
  443. helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
  444. helm/clients/audio_language/qwen_audiolm_client.py +150 -0
  445. helm/clients/auto_client.py +4 -2
  446. helm/clients/azure_openai_client.py +55 -0
  447. helm/clients/bedrock_client.py +201 -7
  448. helm/clients/bedrock_utils.py +33 -0
  449. helm/clients/clip_scorers/clip_scorer.py +1 -1
  450. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  451. helm/clients/cohere_client.py +3 -3
  452. helm/clients/google_client.py +1 -1
  453. helm/clients/http_model_client.py +1 -1
  454. helm/clients/huggingface_client.py +10 -18
  455. helm/clients/ibm_client.py +267 -0
  456. helm/clients/image_generation/adobe_vision_client.py +1 -1
  457. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  458. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  459. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  460. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  461. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  462. helm/clients/image_generation/cogview2_client.py +1 -1
  463. helm/clients/image_generation/dalle2_client.py +1 -1
  464. helm/clients/image_generation/dalle3_client.py +2 -2
  465. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  466. helm/clients/image_generation/dalle_mini/data.py +1 -1
  467. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  468. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  469. helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
  470. helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
  471. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  472. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  473. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  474. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  475. helm/clients/image_generation/dalle_mini_client.py +1 -1
  476. helm/clients/image_generation/deep_floyd_client.py +1 -1
  477. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  478. helm/clients/image_generation/lexica_client.py +1 -1
  479. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  480. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  481. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  482. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  483. helm/clients/image_generation/mindalle_client.py +1 -1
  484. helm/clients/image_generation/together_image_generation_client.py +1 -1
  485. helm/clients/lit_gpt_client.py +2 -2
  486. helm/clients/mistral_client.py +62 -18
  487. helm/clients/nvidia_nim_client.py +0 -3
  488. helm/clients/openai_client.py +241 -22
  489. helm/clients/palmyra_client.py +1 -4
  490. helm/clients/reka_client.py +1 -1
  491. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  492. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  493. helm/clients/stanfordhealthcare_google_client.py +43 -0
  494. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  495. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  496. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  497. helm/clients/test_client.py +1 -1
  498. helm/clients/test_together_client.py +6 -1
  499. helm/clients/together_client.py +47 -7
  500. helm/clients/upstage_client.py +23 -0
  501. helm/clients/vertexai_client.py +39 -13
  502. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  503. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  504. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  505. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  506. helm/clients/vision_language/qwen2_vlm_client.py +175 -0
  507. helm/clients/vllm_client.py +4 -6
  508. helm/clients/yi_client.py +0 -3
  509. helm/common/audio_utils.py +111 -0
  510. helm/common/file_caches/local_file_cache.py +1 -1
  511. helm/common/file_caches/test_local_file_cache.py +1 -1
  512. helm/common/images_utils.py +2 -2
  513. helm/common/media_object.py +2 -2
  514. helm/common/multimodal_request_utils.py +26 -0
  515. helm/common/reeval_parameters.py +12 -0
  516. helm/common/request.py +6 -2
  517. helm/common/response_format.py +18 -0
  518. helm/common/test_media_object.py +1 -1
  519. helm/config/model_deployments.yaml +1112 -19
  520. helm/config/model_metadata.yaml +985 -44
  521. helm/config/tokenizer_configs.yaml +379 -3
  522. helm/proxy/cli.py +2 -2
  523. helm/proxy/example_queries.py +1 -1
  524. helm/proxy/server.py +11 -4
  525. helm/proxy/services/remote_service.py +1 -1
  526. helm/proxy/services/server_service.py +1 -1
  527. helm/proxy/services/test_remote_service.py +2 -2
  528. helm/proxy/services/test_service.py +1 -1
  529. helm/proxy/static/general.js +122 -0
  530. helm/proxy/static/help.html +99 -0
  531. helm/proxy/static/index.css +57 -0
  532. helm/proxy/static/index.html +40 -0
  533. helm/proxy/static/index.js +456 -0
  534. helm/proxy/static/info-icon.png +0 -0
  535. helm/proxy/test_retry.py +1 -1
  536. helm/proxy/token_counters/auto_token_counter.py +1 -1
  537. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  538. helm/tokenizers/caching_tokenizer.py +2 -30
  539. helm/tokenizers/http_model_tokenizer.py +1 -1
  540. helm/tokenizers/huggingface_tokenizer.py +2 -2
  541. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  542. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  543. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  544. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  545. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  546. helm/tokenizers/tokenizer.py +3 -1
  547. helm/tokenizers/yalm_tokenizer.py +3 -3
  548. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  549. crfm_helm-0.5.4.dist-info/METADATA +0 -350
  550. crfm_helm-0.5.4.dist-info/RECORD +0 -697
  551. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  552. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  553. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  554. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  555. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  556. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  557. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  558. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  559. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  560. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  561. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  562. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  563. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  564. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  565. helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
  566. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  567. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  568. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  569. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  570. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  571. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  572. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  573. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  574. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  575. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  576. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  577. helm/tokenizers/anthropic_tokenizer.py +0 -52
  578. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
  579. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
  580. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
helm/benchmark/annotation/omni_math_annotator.py
@@ -0,0 +1,132 @@
+ from typing import Any, Dict, Optional, Union
+ from importlib.resources import files
+
+ from helm.benchmark.adaptation.request_state import RequestState
+ from helm.benchmark.annotation.annotator import Annotator
+ from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo
+ from helm.clients.auto_client import AutoClient
+ from helm.common.hierarchical_logger import hlog
+ from helm.common.request import Request
+
+
+ # Following https://github.com/KbsdJames/Omni-MATH/blob/23be225c8e268df51990f6c5c1448f34d3b56911/GPT_eval/get_result.py
+ def _parse_report(report):
+     parts = report.split("## ")
+     data = {}
+     for part in parts[1:]:
+         lines = part.strip().split("\n")
+         title = lines[0].strip()
+         content = "\n".join(lines[1:]).strip()
+         if title == "Justification":
+             data[title] = content
+         else:
+             data[title] = lines[1].strip() if len(lines) > 1 else ""
+     return data
+
+
+ class OmniMATHAnnotator(Annotator):
+     """The Omni-MATH autograder."""
+
+     name = "omni_math"
+
+     def __init__(self, auto_client: AutoClient, template_name: Optional[str] = None):
+         self._auto_client = auto_client
+         self._template_name = template_name or "gpt_evaluation_zero_shot_template"
+         template_path = files("helm.benchmark.annotation.omni_math").joinpath(f"{self._template_name}.txt")
+         with template_path.open("r") as file:
+             self._score_template = file.read()
+
+     def annotate(self, request_state: RequestState) -> Any:
+         assert request_state.result
+         assert len(request_state.result.completions) == 1
+         prompt_template = self._score_template
+         model_output_text = request_state.result.completions[0].text
+         annotator_prompt = (
+             prompt_template.replace("{{Problem}}", request_state.instance.input.text)
+             .replace("{{Reference Answer}}", request_state.instance.references[0].output.text)
+             .replace("{{Solution}}", model_output_text)
+         )
+         if not model_output_text.strip():
+             hlog(
+                 "WARNING: OmniMATHAnnotator skipped sending requests to annotator models "
+                 "because the model response was empty"
+             )
+             return {
+                 "prompt_text": None,
+                 "empty_output_equivalence_judgement": False,
+             }
+
+         SHORT_NAME_TO_MODEL_INFO: Dict[str, AnnotatorModelInfo] = {
+             "gpt": AnnotatorModelInfo(
+                 model_name="openai/gpt-4o-2024-05-13",
+                 model_deployment="openai/gpt-4o-2024-05-13",
+             ),
+             "llama": AnnotatorModelInfo(
+                 model_name="meta/llama-3.1-405b-instruct-turbo",
+                 model_deployment="together/llama-3.1-405b-instruct-turbo",
+             ),
+             "claude": AnnotatorModelInfo(
+                 model_name="anthropic/claude-3-5-sonnet-20241022",
+                 model_deployment="anthropic/claude-3-5-sonnet-20241022",
+             ),
+         }
+         annotations: Dict[str, Union[Optional[str], Optional[bool]]] = {"prompt_text": annotator_prompt}
+
+         for annotator_name, annotator_model_info in SHORT_NAME_TO_MODEL_INFO.items():
+             student_final_answer: Optional[str] = None
+             equivalence_judgement: Optional[bool] = None
+             justification: Optional[str] = None
+             annotator_request = Request(
+                 model=annotator_model_info.model_name,
+                 model_deployment=annotator_model_info.model_deployment,
+                 prompt=annotator_prompt,
+                 temperature=0.0,
+                 max_tokens=4096,
+             )
+             annotator_response = self._auto_client.make_request(annotator_request)
+             if not annotator_response.success:
+                 hlog(
+                     "WARNING: OmniMATHAnnotator got an error response from "
+                     f"{annotator_model_info.model_name}: {annotator_response.error}"
+                 )
+             else:
+                 assert len(annotator_response.completions) == 1
+                 annotator_response_text = annotator_response.completions[0].text
+                 report_parts: Dict[str, str] = _parse_report(annotator_response_text)
+                 try:
+                     student_final_answer = report_parts["Student Final Answer"]
+                 except KeyError:
+                     hlog(
+                         "WARNING: OmniMATHAnnotator could not get Student Final Answer from annotation from "
+                         f"{annotator_model_info.model_name}: {annotator_response_text}"
+                     )
+
+                 try:
+                     justification = report_parts["Justification"].strip().removesuffix("=== report over ===").strip()
+                 except KeyError:
+                     hlog(
+                         "WARNING: OmniMATHAnnotator could not get Justification from annotation from "
+                         f"{annotator_model_info.model_name}: {annotator_response_text}"
+                     )
+
+                 try:
+                     equivalence_judgement_str = report_parts["Equivalence Judgement"].strip().upper()
+                     if equivalence_judgement_str == "TRUE":
+                         equivalence_judgement = True
+                     elif equivalence_judgement_str == "FALSE":
+                         equivalence_judgement = False
+                     else:
+                         hlog(
+                             "WARNING: OmniMATHAnnotator got a non-boolean Equivalence Judgement from annotation from "
+                             f"{annotator_model_info.model_name}: {equivalence_judgement_str}"
+                         )
+                 except KeyError:
+                     hlog(
+                         "WARNING: OmniMATHAnnotator could not get Equivalence Judgement from annotation from "
+                         f"{annotator_model_info.model_name}: {annotator_response_text}"
+                     )
+
+             annotations[f"{annotator_name}_student_final_answer"] = student_final_answer
+             annotations[f"{annotator_name}_justification"] = justification
+             annotations[f"{annotator_name}_equivalence_judgement"] = equivalence_judgement
+         return annotations
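
For reference, `_parse_report` above assumes the judge replies in the Omni-MATH report format, i.e. `## `-prefixed sections named "Student Final Answer", "Equivalence Judgement", and "Justification". A minimal sketch of its behavior on a made-up report (the report text is hypothetical; only the section names come from the grading template):

```
from helm.benchmark.annotation.omni_math_annotator import _parse_report

# Hypothetical judge output in the Omni-MATH report format.
report = """## Student Final Answer
42

## Equivalence Judgement
TRUE

## Justification
Both answers simplify to 42.
=== report over ==="""

parsed = _parse_report(report)
# Only the "Justification" section keeps its full multi-line body; the other
# sections keep just the first line after their heading.
assert parsed["Student Final Answer"] == "42"
assert parsed["Equivalence Judgement"] == "TRUE"
assert parsed["Justification"].startswith("Both answers simplify to 42.")
```

The annotator then strips the trailing "=== report over ===" marker from the justification and maps "TRUE"/"FALSE" to a boolean per judge.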
helm/benchmark/annotation/spider_annotator.py
@@ -0,0 +1,18 @@
+ import os
+
+ from helm.benchmark.annotation.bird_sql_annotator import BirdSQLAnnotator
+ from helm.benchmark.runner import get_benchmark_output_path
+
+
+ class SpiderAnnotator(BirdSQLAnnotator):
+     """The Spider evaluator that computes execution accuracy.
+
+     Based on the Bird-SQL annotator."""
+
+     name = "spider"
+
+     def get_database_path(self, database_name: str) -> str:
+         databases_root_path = os.path.join(
+             get_benchmark_output_path(), "scenarios", "spider", "data", "spider_data", "test_database"
+         )
+         return os.path.join(databases_root_path, database_name, f"{database_name}.sqlite")
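
As a quick illustration of the path logic above, with a made-up database name and assuming `get_benchmark_output_path()` resolves to a local `benchmark_output` directory:

```
import os

# Hypothetical values for illustration only.
benchmark_output = "benchmark_output"  # stand-in for get_benchmark_output_path()
database_name = "concert_singer"

databases_root_path = os.path.join(
    benchmark_output, "scenarios", "spider", "data", "spider_data", "test_database"
)
print(os.path.join(databases_root_path, database_name, f"{database_name}.sqlite"))
# benchmark_output/scenarios/spider/data/spider_data/test_database/concert_singer/concert_singer.sqlite
```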
helm/benchmark/annotation/starr_patient_instructions_annotator.py
@@ -0,0 +1,98 @@
+ from typing import Dict, Optional, Set
+
+ from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo, LLMAsJuryAnnotator
+ from helm.clients.auto_client import AutoClient
+
+
+ PROMPT_TEMPLATE = """You are a medical expert tasked with evaluating the quality of after-surgery care instructions.
+
+ Your goal is to assess how well the care instructions capture the next steps the patient should follow after their
+ procedure, and how it compares to the gold response in terms of accuracy, completeness, and clarity.
+
+ The case will be provided in these tags:
+ <case_details>
+ {{QUESTION}}
+ </case_details>
+
+ The response will be provided in these tags:
+ <response>
+ {{RESPONSE}}
+ </response>
+
+ The reference response will be provided in these tags:
+ <gold_response>
+ {{GOLD_RESPONSE}}
+ </gold_response>
+
+ Carefully review the <response> and compare it to the <gold_response> when needed.
+
+ Please evaluate the instructions on a scale of 1-5 (1 = poor, 5 = excellent) for each of these three key dimensions:
+
+ Evaluation Criteria:
+ Accuracy (1-5)
+ - Are the instructions medically correct for the patient's specific case?
+
+ Completeness (1-5)
+ - Do the instructions cover all important post-surgical care details?
+
+ Clarity (1-5)
+ - Are the instructions easy for the patient to understand?
+
+ Output Format:
+ Output your evaluation as a single valid JSON object matching the following structure:
+ {
+     "accuracy": {
+         "score": 0,
+         "explanation": "Brief explanation of why this score was given."
+     },
+     "completeness": {
+         "score": 0,
+         "explanation": "Brief explanation of why this score was given."
+     },
+     "clarity": {
+         "score": 0,
+         "explanation": "Brief explanation of why this score was given."
+     }
+ }
+
+ Ensure the output is valid JSON:
+ - Use **double quotes** (") for all keys and string values.
+ - When quoting text or sections inside the explanations, use escaped double quotes (\") to
+   maintain valid JSON formatting.
+ - Do not include any additional information in the output.
+ """
+
+ ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
+     "accuracy": {"score", "explanation"},
+     "completeness": {"score", "explanation"},
+     "clarity": {"score", "explanation"},
+ }
+
+ ANNOTATOR_MODELS: Dict[str, AnnotatorModelInfo] = {
+     "gpt": AnnotatorModelInfo(
+         model_name="openai/gpt-4o-2024-05-13",
+         model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
+     ),
+     "llama": AnnotatorModelInfo(
+         model_name="meta/llama-3.3-70b-instruct",
+         model_deployment="stanfordhealthcare/llama-3.3-70b-instruct",
+     ),
+     "claude": AnnotatorModelInfo(
+         model_name="anthropic/claude-3-7-sonnet-20250219",
+         model_deployment="stanfordhealthcare/claude-3-7-sonnet-20250219",
+     ),
+ }
+
+
+ class StarrPatientInstructionsAnnotator(LLMAsJuryAnnotator):
+     """The StarrPatientInstructions autograder."""
+
+     name = "starr_patient_instructions"
+
+     def __init__(self, auto_client: AutoClient, template_name: Optional[str] = None):
+         super().__init__(
+             auto_client=auto_client,
+             prompt_template=PROMPT_TEMPLATE,
+             annotation_criteria=ANNOTATION_CRITERIA,
+             annotator_models=ANNOTATOR_MODELS,
+         )
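
The jury annotator fills the `{{QUESTION}}`, `{{RESPONSE}}`, and `{{GOLD_RESPONSE}}` placeholders before sending the prompt to each model in `ANNOTATOR_MODELS`. A minimal sketch of that substitution, assuming plain string replacement (the exact mechanism lives in `LLMAsJuryAnnotator` in `model_as_judge.py`, which is not shown in this diff; the case text below is invented):

```
from helm.benchmark.annotation.starr_patient_instructions_annotator import PROMPT_TEMPLATE

# Hypothetical inputs for illustration only.
question = "Laparoscopic appendectomy performed today; patient discharged the same evening."
response = "Keep the incision sites dry for 48 hours and avoid heavy lifting for two weeks."
gold_response = "Keep incisions clean and dry; no lifting over 10 lbs for 2 weeks; call if fever develops."

filled_prompt = (
    PROMPT_TEMPLATE.replace("{{QUESTION}}", question)
    .replace("{{RESPONSE}}", response)
    .replace("{{GOLD_RESPONSE}}", gold_response)
)
# filled_prompt is what each jury model ("gpt", "llama", "claude") is asked to score,
# returning the accuracy/completeness/clarity JSON described in the template.
```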
helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md
@@ -0,0 +1,75 @@
+ # Instruction
+
+ You are an expert evaluator. Your task is to evaluate the quality of the responses generated by two AI models.
+ We will provide you with the user query and a pair of AI-generated responses (Response A and Response B).
+ You should first read the user query and the conversation history carefully for analyzing the task, and then evaluate the quality of the responses based on and rules provided below.
+
+ # Conversation between User and AI
+
+ ## History
+ <|begin_of_history|>
+
+ {$history}
+
+ <|end_of_history|>
+
+ ## Current User Query
+ <|begin_of_query|>
+
+ {$user_query}
+
+ <|end_of_query|>
+
+ ## Response A
+ <|begin_of_response_A|>
+
+ {$candidate_A}
+
+ <|end_of_response_A|>
+
+ ## Response B
+ <|begin_of_response_B|>
+
+ {$candidate_B}
+
+ <|end_of_response_B|>
+
+ # Evaluation
+
+ ## Checklist
+
+ <|begin_of_checklist|>
+
+ {$checklist}
+
+ <|end_of_checklist|>
+
+ Please use this checklist to guide your evaluation, but do not limit your assessment to the checklist.
+
+ ## Rules
+
+ You should compare the above two responses based on your analysis of the user queries and the conversation history.
+ You should first write down your analysis and the checklist that you used for the evaluation, and then provide your assessment according to the checklist.
+ There are five choices to give your final assessment: ["A++", "A+", "A=B", "B+", "B++"], which correspond to the following meanings:
+
+ - `A++`: Response A is much better than Response B.
+ - `A+`: Response A is only slightly better than Response B.
+ - `A=B`: Response A and B are of the same quality. Please use this choice sparingly.
+ - `B+`: Response B is only slightly better than Response A.
+ - `B++`: Response B is much better than Response A.
+
+
+ ## Output Format
+ First, please output your analysis for each model response, and then summarize your assessment to three aspects: "reason A=B", "reason A>B", and "reason B>A", and finally make your choice for the final assessment.
+
+ Please provide your evaluation results in the following json format by filling in the placeholders in []:
+ ```
+ {
+     "analysis of A": "[analysis of Response A]",
+     "analysis of B": "[analysis of Response B]",
+     "reason of A=B": "[where Response A and B perform equally well]",
+     "reason of A>B": "[where Response A is better than Response B]",
+     "reason of B>A": "[where Response B is better than Response A]",
+     "choice": "[A++ or A+ or A=B or B+ or B++]",
+ }
+ ```
helm/benchmark/annotation/wildbench/eval_template.score.v2.md
@@ -0,0 +1,66 @@
+ # Instruction
+
+ You are an expert evaluator. Your task is to evaluate the quality of the responses generated by AI models.
+ We will provide you with the user query and an AI-generated responses.
+ You should first read the user query and the conversation history carefully for analyzing the task, and then evaluate the quality of the responses based on and rules provided below.
+
+ # Conversation between User and AI
+
+ ## History
+ <|begin_of_history|>
+
+ {$history}
+
+ <|end_of_history|>
+
+ ## Current User Query
+ <|begin_of_query|>
+
+ {$user_query}
+
+ <|end_of_query|>
+
+ ## AI Response
+ <|begin_of_response|>
+
+ {$model_output}
+
+ <|end_of_response|>
+
+
+ # Evaluation
+
+ ## Checklist
+
+ <|begin_of_checklist|>
+
+ {$checklist}
+
+ <|end_of_checklist|>
+
+ Please use this checklist to guide your evaluation, but do not limit your assessment to the checklist.
+
+ ## Rules
+
+ You should compare the above response based on your analysis of the user queries and the conversation history.
+ You should first write down your analysis and the checklist that you used for the evaluation, and then provide your assessment according to the checklist.
+ The scores are in the range of 1~10, where 1 means the response is very poor and 10 means the response is perfect.
+ Here are more detailed criteria for the scores:
+
+ - Score 1~2: The response is very poor and does not make sense at all.
+ - Score 3~4: The response is poor and does help user solve the problem in a meaningful way.
+ - Score 5~6: The response is fair but has some issues (e.g., factual errors, hallucinations, missing key information).
+ - Score 7~8: The response is good enough but could be improved in some ways.
+ - Score 9~10: The response is perfect and provides helpful information that can help user solve the problem.
+
+ ## Output Format
+ First, please output your analysis for the model response, and then summarize your assessment to two aspects: "strengths" and "weaknesses"; Finally, please write down your rating for the assessment.
+
+ Please provide your evaluation results in the following json format by filling in the placeholders in []:
+ ```
+ {
+     "strengths": "[analysis for the strengths of the response]",
+     "weaknesses": "[analysis for the weaknesses of the response]",
+     "score": "[1~10]"
+ }
+ ```
@@ -0,0 +1,119 @@
1
+ import re
2
+ from typing import Any, Optional, Union
3
+ from importlib.resources import files
4
+ from typing import Dict
5
+
6
+ from helm.benchmark.adaptation.request_state import RequestState
7
+ from helm.benchmark.annotation.annotator import Annotator
8
+ from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo
9
+ from helm.clients.auto_client import AutoClient
10
+ from helm.common.hierarchical_logger import hlog
11
+ from helm.common.request import Request
12
+
13
+
14
+ class WildBenchAnnotator(Annotator):
15
+ """The WildBench autograder."""
16
+
17
+ name = "wildbench"
18
+
19
+ def __init__(self, auto_client: AutoClient):
20
+ self._auto_client = auto_client
21
+        template_path = files("helm.benchmark.annotation.wildbench").joinpath("eval_template.score.v2.md")
+        with template_path.open("r") as f:
+            self._score_template = f.read()
+        self._pattern = re.compile(
+            r'"strengths"\s*:\s*"(.*?)"\s*,\s*"weaknesses"\s*:\s*"(.*?)"\s*,\s*"score"\s*:\s*(".*?"|\d+)', re.DOTALL
+        )
+
+    def annotate(self, request_state: RequestState) -> Any:
+        assert request_state.result
+        assert len(request_state.result.completions) == 1
+        assert request_state.instance.extra_data
+        model_output_text = request_state.result.completions[0].text
+        if not model_output_text.strip():
+            # Following https://github.com/allenai/WildBench/blob/d6b8dcaf377d173d031980f97c16e1a82618c03d/src/eval.py
+            hlog(
+                "WARNING: WildBenchAnnotator skipped sending requests to annotator models "
+                "because the model response was empty"
+            )
+            return {
+                "prompt_text": None,
+                "empty_output_score": 1.0,
+            }
+
+        input_messages = request_state.instance.input.messages
+        assert input_messages is not None
+        history = []
+        for round in input_messages[:-1]:
+            noun = "USER: " if round["role"] == "user" else "ASSISTANT: "
+            history.append(noun + round["content"])
+        history_text = "\n\n".join(history)
+        user_query_text = input_messages[-1]["content"]
+        checklist_text = "\n".join(
+            [f"- {checklist_item}" for checklist_item in request_state.instance.extra_data["checklist"]]
+        )
+        annotator_prompt = (
+            self._score_template.replace("{$history}", history_text)
+            .replace("{$user_query}", user_query_text)
+            .replace("{$model_output}", model_output_text)
+            .replace("{$checklist}", checklist_text)
+        )
+
+        SHORT_NAME_TO_MODEL_INFO: Dict[str, AnnotatorModelInfo] = {
+            "gpt": AnnotatorModelInfo(
+                model_name="openai/gpt-4o-2024-05-13",
+                model_deployment="openai/gpt-4o-2024-05-13",
+            ),
+            "llama": AnnotatorModelInfo(
+                model_name="meta/llama-3.1-405b-instruct-turbo",
+                model_deployment="together/llama-3.1-405b-instruct-turbo",
+            ),
+            "claude": AnnotatorModelInfo(
+                model_name="anthropic/claude-3-5-sonnet-20241022",
+                model_deployment="anthropic/claude-3-5-sonnet-20241022",
+            ),
+        }
+        annotations: Dict[str, Union[Optional[str], Optional[float]]] = {"prompt_text": annotator_prompt}
+        for annotator_name, annotator_model_info in SHORT_NAME_TO_MODEL_INFO.items():
+            annotator_request = Request(
+                model=annotator_model_info.model_name,
+                model_deployment=annotator_model_info.model_deployment,
+                prompt=annotator_prompt,
+                temperature=0.0,
+                max_tokens=2000,
+            )
+            strengths: Optional[str] = None
+            weaknesses: Optional[str] = None
+            score: Optional[float] = None
+            annotator_response = self._auto_client.make_request(annotator_request)
+            if not annotator_response.success:
+                hlog(
+                    "WARNING: WildBenchAnnotator got an error response from "
+                    f"{annotator_model_info.model_name}: {annotator_response.error}"
+                )
+            else:
+                assert len(annotator_response.completions) == 1
+                annotator_response_text = annotator_response.completions[0].text
+                annotator_response_parts = self._pattern.search(annotator_response_text)
+                if not annotator_response_parts:
+                    hlog(
+                        "WARNING: WildBenchAnnotator got a malformed annotation from "
+                        f"{annotator_model_info.model_name}: {annotator_response_text}"
+                    )
+                else:
+                    strengths = annotator_response_parts[1].strip()
+                    weaknesses = annotator_response_parts[2].strip()
+                    score_text = annotator_response_parts[3].strip().strip('"')
+                    try:
+                        score = float(score_text)
+                    except ValueError:
+                        hlog(
+                            "WARNING: WildBenchAnnotator could not parse the score from the annotation from "
+                            f"{annotator_model_info.model_name}: {annotator_response_text}"
+                        )
+
+            annotations[f"{annotator_name}_strengths"] = strengths
+            annotations[f"{annotator_name}_weaknesses"] = weaknesses
+            annotations[f"{annotator_name}_score"] = score
+
+        return annotations
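
For reference, the pattern compiled above assumes the judge model returns a JSON-like fragment with "strengths", "weaknesses", and "score" fields, where the score may be quoted or bare. A minimal standalone sketch of that parsing step, using a made-up judge response (the sample text is illustrative, not actual annotator output):

    import re

    # Same pattern the annotator compiles; the sample response below is hypothetical.
    pattern = re.compile(
        r'"strengths"\s*:\s*"(.*?)"\s*,\s*"weaknesses"\s*:\s*"(.*?)"\s*,\s*"score"\s*:\s*(".*?"|\d+)',
        re.DOTALL,
    )
    sample_response = '{"strengths": "Follows the checklist.", "weaknesses": "Slightly verbose.", "score": "7"}'
    match = pattern.search(sample_response)
    if match:
        strengths = match[1].strip()
        weaknesses = match[2].strip()
        score = float(match[3].strip().strip('"'))  # accepts "7" or 7
        print(strengths, weaknesses, score)  # Follows the checklist. Slightly verbose. 7.0
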
@@ -13,7 +13,7 @@ from helm.common.general import ensure_directory_exists, parallel_map, get_crede
 from helm.common.hierarchical_logger import htrack, hlog
 from helm.benchmark.adaptation.scenario_state import ScenarioState
 from helm.benchmark.adaptation.request_state import RequestState
-from helm.benchmark.annotation.annotator import AnnotatorSpec, Annotator
+from helm.benchmark.annotation.annotator import Annotator
 from helm.benchmark.annotation.annotator_factory import AnnotatorFactory
 from helm.proxy.services.service import CACHE_DIR
 
@@ -88,22 +88,33 @@ class AnnotationExecutor:
             hlog("Skipped annotation.")
             return scenario_state
 
-        if scenario_state.annotator_specs is None or len(scenario_state.annotator_specs) == 0:
+        if not scenario_state.annotator_specs:
             hlog("No annotators to run.")
             return scenario_state
 
-        # Do it!
-        def do_it(request_state: RequestState) -> RequestState:
-            assert scenario_state.annotator_specs is not None
-            return self.process(scenario_state.annotator_specs, request_state)
+        try:
+            annotators: List[Annotator] = [
+                self.factory.get_annotator(annotator_spec) for annotator_spec in scenario_state.annotator_specs
+            ]
+        except Exception as e:
+            raise AnnotationExecutorError(f"Could not initialize annotator for spec: {str(e)} ") from e
 
-        self.annotator_specs = scenario_state.annotator_specs
+        if all(getattr(annotator, "use_global_metric", False) for annotator in annotators):
+            # Do it!
+            request_states = self.process_all(
+                annotators, scenario_state.request_states  # processing all requests together
+            )
 
-        request_states = parallel_map(
-            do_it,
-            scenario_state.request_states,
-            parallelism=self.execution_spec.parallelism,
-        )
+        else:
+            # Do it!
+            def do_it(request_state: RequestState) -> RequestState:
+                return self.process(annotators, request_state)
+
+            request_states = parallel_map(
+                do_it,
+                scenario_state.request_states,
+                parallelism=self.execution_spec.parallelism,
+            )
 
         hlog(f"Annotated {len(request_states)} requests")
         return ScenarioState(
@@ -112,13 +123,22 @@ class AnnotationExecutor:
             annotator_specs=scenario_state.annotator_specs,
         )
 
-    def process(self, annotator_specs: List[AnnotatorSpec], state: RequestState) -> RequestState:
+    def process(self, annotators: List[Annotator], state: RequestState) -> RequestState:
         annotations: Dict[str, Any] = {}
         try:
-            for annotator_spec in annotator_specs:
-                annotator: Annotator = self.factory.get_annotator(annotator_spec)
+            for annotator in annotators:
                 new_annotations = annotator.annotate(state)
                 annotations[annotator.name] = new_annotations
         except Exception as e:
            raise AnnotationExecutorError(f"{str(e)} Request: {state.request}") from e
         return replace(state, annotations=annotations)
+
+    def process_all(self, annotators: List[Annotator], states: List[RequestState]) -> List[RequestState]:
+        annotations: Dict[str, Any] = {}
+        try:
+            for annotator in annotators:
+                new_annotations = annotator.annotate_all(states)
+                annotations[annotator.name] = new_annotations
+        except Exception as e:
+            raise AnnotationExecutorError(f"{str(e)} Request: {[state.request for state in states]}") from e
+        return [replace(state, annotations=new_annotations[idx]) for idx, state in enumerate(states)]
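
The executor now instantiates the Annotator objects up front and branches on a use_global_metric flag: when every annotator sets it, all request states are handed to annotate_all in a single call; otherwise each request state is annotated independently through parallel_map. A minimal sketch of that dispatch decision, using hypothetical annotator classes rather than real HELM ones:

    from typing import Any, Dict, List

    class BatchedAnnotator:
        # Hypothetical annotator that scores all outputs in one pass.
        name = "batched"
        use_global_metric = True

        def annotate_all(self, states: List[Any]) -> List[Dict[str, Any]]:
            # One annotation dict per request state, in the same order.
            return [{"score": 1.0} for _ in states]

    class PerRequestAnnotator:
        # Hypothetical annotator that scores one output at a time.
        name = "per_request"

        def annotate(self, state: Any) -> Dict[str, Any]:
            return {"score": 0.5}

    annotators = [BatchedAnnotator(), PerRequestAnnotator()]
    # Mirrors the executor's check: batch only when *all* annotators support it.
    print(all(getattr(a, "use_global_metric", False) for a in annotators))  # False
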
@@ -9,8 +9,9 @@ from typing import Dict, List, Tuple, Set, Optional
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.benchmark.scenarios.scenario import Input, Instance, Reference, Output
-from .perturbation_description import PerturbationDescription
-from .perturbation import Perturbation, TextPerturbation
+from helm.benchmark.augmentations.perturbation_description import PerturbationDescription
+from helm.benchmark.augmentations.perturbation import Perturbation, TextPerturbation
+from helm.benchmark.runner import get_benchmark_output_path
 
 
 ############################################################
@@ -69,7 +70,7 @@ class ChineseTyposPerturbation(TextPerturbation):
         self.word_level_perturb: bool = word_level_perturb  # Whether we perturb text on the character or word level
 
         # Ensure all necessary data are downloaded
-        output_dir = os.path.join("benchmark_output", "perturbations", self.name)
+        output_dir = os.path.join(get_benchmark_output_path(), "perturbations", self.name)
         ensure_directory_exists(output_dir)
         for filename in self.FILE_NAMES:
             target_path = os.path.join(output_dir, filename)
@@ -303,7 +304,7 @@ class ChineseSynonymPerturbation(TextPerturbation):
         self.prob: float = prob
         self.trial_num: int = trial_num  # Number of trial to get a 100% perturbed text
 
-        target_dir = os.path.join("benchmark_output", "perturbations", self.name, "synonyms.json")
+        target_dir = os.path.join(get_benchmark_output_path(), "perturbations", self.name, "synonyms.json")
         ensure_directory_exists(os.path.dirname(target_dir))
         ensure_file_downloaded(source_url=self.SOURCE_URL, target_path=target_dir)
         with open(os.path.join(target_dir)) as f:
@@ -433,7 +434,7 @@ class ChineseGenderPerturbation(TextPerturbation):
         if self.mode == self.GENDER_TERM:
             self.term_dict: Dict[Tuple[str, str], Dict[str, str]] = defaultdict(dict)
 
-            target_path = os.path.join("benchmark_output", "perturbations", self.name, "gender_term.txt")
+            target_path = os.path.join(get_benchmark_output_path(), "perturbations", self.name, "gender_term.txt")
             ensure_directory_exists(os.path.dirname(target_path))
             ensure_file_downloaded(source_url=self.SOURCE_URL, target_path=target_path)
             with open(target_path) as fin:
@@ -492,7 +493,7 @@ class ChinesePersonNamePerturbation(Perturbation):
 
     """ Resources """
     SOURCE_URL: str = "http://39.108.215.175/assets/chinese_name_gender.json"
-    OUTPUT_PATH = os.path.join("benchmark_output", "perturbations", name)
+    OUTPUT_PATH = os.path.join(get_benchmark_output_path(), "perturbations", name)
 
     """ Gender categories """
     GENDER_CATEGORY = "gender"
@@ -554,7 +555,7 @@ class ChinesePersonNamePerturbation(Perturbation):
 
         self.preserve_gender: bool = preserve_gender
 
-        target_path = os.path.join("benchmark_output", "perturbations", self.name, "chinese_name_gender.json")
+        target_path = os.path.join(get_benchmark_output_path(), "perturbations", self.name, "chinese_name_gender.json")
         ensure_directory_exists(os.path.dirname(target_path))
         ensure_file_downloaded(source_url=self.SOURCE_URL, target_path=target_path)
         with open(os.path.join(target_path), "r", encoding="utf-8") as f:
@@ -735,7 +736,7 @@ class MandarinToCantonesePerturbation(TextPerturbation):
             handle_module_not_found_error(e, ["cleva"])
         self.s2t_converter = opencc.OpenCC("s2t.json")
 
-        target_path = os.path.join("benchmark_output", "perturbations", self.name, "conversion.json")
+        target_path = os.path.join(get_benchmark_output_path(), "perturbations", self.name, "conversion.json")
         ensure_directory_exists(os.path.dirname(target_path))
         ensure_file_downloaded(source_url=self.SOURCE_URL, target_path=target_path)
         with open(target_path) as fin:
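
The recurring change in this file replaces the hardcoded "benchmark_output" directory with get_benchmark_output_path(), so downloaded perturbation assets follow the runner's configured output location instead of a fixed relative path. A rough sketch of the resulting path construction; the helper body below is a stand-in for helm.benchmark.runner.get_benchmark_output_path, and the perturbation name is illustrative:

    import os

    def get_benchmark_output_path() -> str:
        # Stand-in: in HELM this returns the configured benchmark output
        # directory rather than reading an environment variable.
        return os.environ.get("HELM_OUTPUT_PATH", "benchmark_output")

    # Same shape as the download paths used by the perturbations above.
    target_path = os.path.join(get_benchmark_output_path(), "perturbations", "chinese_synonym", "synonyms.json")
    print(target_path)  # e.g. benchmark_output/perturbations/chinese_synonym/synonyms.json
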
@@ -5,8 +5,8 @@ import re
 from random import Random
 
 from helm.common.general import match_case
-from .perturbation import TextPerturbation
-from .perturbation_description import PerturbationDescription
+from helm.benchmark.augmentations.perturbation import TextPerturbation
+from helm.benchmark.augmentations.perturbation_description import PerturbationDescription
 
 
 CONTRACTION_MAP: Dict[str, str] = {