crfm-helm 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (652)
  1. crfm_helm-0.5.6.dist-info/METADATA +427 -0
  2. crfm_helm-0.5.6.dist-info/RECORD +941 -0
  3. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +4 -4
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  22. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  23. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  24. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  25. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  26. helm/benchmark/adaptation/prompt.py +1 -1
  27. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  28. helm/benchmark/annotation/air_bench_annotator.py +21 -6
  29. helm/benchmark/annotation/annotator.py +5 -0
  30. helm/benchmark/annotation/annotator_factory.py +3 -20
  31. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  32. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  33. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  34. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  35. helm/benchmark/annotation/chw_care_plan_annotator.py +93 -0
  36. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  37. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  38. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  39. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  40. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  41. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  42. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  43. helm/benchmark/annotation/medalign_annotator.py +100 -0
  44. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  45. helm/benchmark/annotation/medication_qa_annotator.py +87 -63
  46. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  47. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  48. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  49. helm/benchmark/annotation/model_as_judge.py +214 -6
  50. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  51. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  52. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  53. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  54. helm/benchmark/annotation/omni_math_annotator.py +131 -0
  55. helm/benchmark/annotation/spider_annotator.py +18 -0
  56. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  57. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  58. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  59. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  60. helm/benchmark/annotation_executor.py +35 -15
  61. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  62. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  63. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  64. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  65. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  66. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  67. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  68. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  69. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  70. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  71. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  72. helm/benchmark/augmentations/perturbation.py +1 -1
  73. helm/benchmark/augmentations/space_perturbation.py +2 -2
  74. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  75. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  76. helm/benchmark/augmentations/test_perturbation.py +16 -13
  77. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  78. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  79. helm/benchmark/data_preprocessor.py +2 -2
  80. helm/benchmark/executor.py +11 -12
  81. helm/benchmark/huggingface_registration.py +2 -7
  82. helm/benchmark/metrics/aci_bench_metrics.py +14 -0
  83. helm/benchmark/metrics/basic_metrics.py +6 -6
  84. helm/benchmark/metrics/bbq_metrics.py +2 -2
  85. helm/benchmark/metrics/bias_metrics.py +12 -3
  86. helm/benchmark/metrics/bias_word_lists.py +1 -1
  87. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  88. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  89. helm/benchmark/metrics/chw_care_plan_metrics.py +14 -0
  90. helm/benchmark/metrics/classification_metrics.py +76 -12
  91. helm/benchmark/metrics/cleva_harms_metrics.py +10 -9
  92. helm/benchmark/metrics/code_metrics.py +5 -5
  93. helm/benchmark/metrics/comet_metric.py +125 -0
  94. helm/benchmark/metrics/common_metric_specs.py +9 -2
  95. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  96. helm/benchmark/metrics/copyright_metrics.py +4 -4
  97. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  98. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  99. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  100. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  101. helm/benchmark/metrics/dischargeme_metrics.py +14 -0
  102. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  103. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  104. helm/benchmark/metrics/efficiency_metrics.py +6 -6
  105. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  106. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  107. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  108. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  109. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  110. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  111. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  112. helm/benchmark/metrics/ifeval/__init__.py +0 -0
  113. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  114. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  115. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  116. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  117. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  118. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  120. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  121. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  122. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  123. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  124. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  125. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  126. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  127. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  128. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  129. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  130. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  131. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  132. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  133. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  134. helm/benchmark/metrics/med_dialog_metrics.py +14 -0
  135. helm/benchmark/metrics/medalign_metrics.py +14 -0
  136. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  137. helm/benchmark/metrics/medec_metrics.py +101 -0
  138. helm/benchmark/metrics/medi_qa_metrics.py +14 -0
  139. helm/benchmark/metrics/medication_qa_metrics.py +10 -19
  140. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  141. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  142. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  143. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  144. helm/benchmark/metrics/mental_health_metrics.py +14 -0
  145. helm/benchmark/metrics/metric.py +3 -3
  146. helm/benchmark/metrics/metric_service.py +11 -11
  147. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  148. helm/benchmark/metrics/mimic_rrs_metrics.py +14 -0
  149. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  150. helm/benchmark/metrics/mtsamples_procedures_metrics.py +14 -0
  151. helm/benchmark/metrics/mtsamples_replicate_metrics.py +14 -0
  152. helm/benchmark/metrics/nltk_helper.py +32 -0
  153. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  154. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  155. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  156. helm/benchmark/metrics/output_processing_metric.py +60 -0
  157. helm/benchmark/metrics/output_processors.py +15 -0
  158. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  159. helm/benchmark/metrics/ranking_metrics.py +3 -3
  160. helm/benchmark/metrics/reference_metric.py +3 -3
  161. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  162. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  163. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  164. helm/benchmark/metrics/spider_metrics.py +7 -0
  165. helm/benchmark/metrics/starr_patient_instructions_metrics.py +14 -0
  166. helm/benchmark/metrics/statistic.py +1 -1
  167. helm/benchmark/metrics/summac/model_summac.py +2 -3
  168. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  169. helm/benchmark/metrics/summarization_metrics.py +20 -9
  170. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  171. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  172. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  173. helm/benchmark/metrics/test_metric.py +1 -1
  174. helm/benchmark/metrics/test_statistic.py +2 -2
  175. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  176. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  177. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  178. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  179. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  180. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  181. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  182. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +3 -3
  183. helm/benchmark/metrics/toxicity_metrics.py +6 -6
  184. helm/benchmark/metrics/unitxt_metrics.py +7 -5
  185. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  186. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  187. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  188. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  189. helm/benchmark/model_deployment_registry.py +6 -8
  190. helm/benchmark/model_metadata_registry.py +16 -0
  191. helm/benchmark/presentation/contamination.py +3 -3
  192. helm/benchmark/presentation/create_plots.py +33 -12
  193. helm/benchmark/presentation/run_display.py +13 -0
  194. helm/benchmark/presentation/schema.py +2 -1
  195. helm/benchmark/presentation/summarize.py +97 -67
  196. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  197. helm/benchmark/reeval_run.py +202 -0
  198. helm/benchmark/reeval_runner.py +355 -0
  199. helm/benchmark/run.py +86 -90
  200. helm/benchmark/run_expander.py +90 -9
  201. helm/benchmark/run_spec_factory.py +13 -0
  202. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  203. helm/benchmark/run_specs/audio_run_specs.py +657 -0
  204. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  205. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  206. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  207. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  208. helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
  209. helm/benchmark/run_specs/experimental_run_specs.py +142 -3
  210. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  211. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  212. helm/benchmark/run_specs/long_context_run_specs.py +141 -0
  213. helm/benchmark/run_specs/medhelm_run_specs.py +1260 -0
  214. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  215. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  216. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  217. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  218. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
  219. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
  220. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  221. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  222. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  223. helm/benchmark/run_specs/vlm_run_specs.py +103 -2
  224. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  225. helm/benchmark/runner.py +5 -5
  226. helm/benchmark/scenarios/aci_bench_scenario.py +126 -0
  227. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  228. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  229. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  230. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  231. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
  232. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  233. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  234. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  235. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  236. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  237. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  238. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  239. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  240. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  241. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  242. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  243. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  244. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  245. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  246. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  247. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  248. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  249. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  250. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  251. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  252. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
  253. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
  254. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  255. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
  256. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
  257. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  258. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
  259. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  260. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  261. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  262. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  263. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  264. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  265. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  266. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  267. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  268. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  269. helm/benchmark/scenarios/bold_scenario.py +1 -1
  270. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  271. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  272. helm/benchmark/scenarios/chw_care_plan_scenario.py +106 -0
  273. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  274. helm/benchmark/scenarios/clear_scenario.py +157 -0
  275. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  276. helm/benchmark/scenarios/code_scenario.py +17 -4
  277. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  278. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  279. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  280. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  281. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  282. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  283. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  284. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  285. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  286. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  287. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  288. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  289. helm/benchmark/scenarios/dischargeme_scenario.py +172 -0
  290. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  291. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  292. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  293. helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
  294. helm/benchmark/scenarios/ehrshot_scenario.py +1519 -0
  295. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  296. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  297. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  298. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  299. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  300. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  301. helm/benchmark/scenarios/grammar.py +2 -2
  302. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  303. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  304. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  305. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  306. helm/benchmark/scenarios/headqa_scenario.py +136 -0
  307. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  308. helm/benchmark/scenarios/ice_scenario.py +8 -4
  309. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  310. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  311. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  312. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  313. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +79 -0
  314. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  315. helm/benchmark/scenarios/koala_scenario.py +1 -1
  316. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  317. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  318. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  319. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  320. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  321. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  322. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  323. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  324. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  325. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  326. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  327. helm/benchmark/scenarios/math_scenario.py +9 -1
  328. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  329. helm/benchmark/scenarios/med_dialog_scenario.py +25 -22
  330. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  331. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  332. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  333. helm/benchmark/scenarios/medalign_scenario.py +94 -0
  334. helm/benchmark/scenarios/medalign_scenario_helper.py +432 -0
  335. helm/benchmark/scenarios/medbullets_scenario.py +145 -0
  336. helm/benchmark/scenarios/medcalc_bench_scenario.py +127 -0
  337. helm/benchmark/scenarios/medec_scenario.py +125 -0
  338. helm/benchmark/scenarios/medhallu_scenario.py +72 -0
  339. helm/benchmark/scenarios/medi_qa_scenario.py +111 -0
  340. helm/benchmark/scenarios/medication_qa_scenario.py +8 -2
  341. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  342. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  343. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  344. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  345. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  346. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  347. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  348. helm/benchmark/scenarios/mental_health_scenario.py +123 -0
  349. helm/benchmark/scenarios/mimic_bhc_scenario.py +103 -0
  350. helm/benchmark/scenarios/mimic_rrs_scenario.py +98 -0
  351. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +77 -0
  352. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  353. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  354. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  355. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  356. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +144 -0
  357. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +142 -0
  358. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +277 -0
  359. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  360. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  361. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  362. helm/benchmark/scenarios/numeracy_scenario.py +12 -2
  363. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  364. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  365. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  366. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  367. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  368. helm/benchmark/scenarios/pubmed_qa_scenario.py +59 -43
  369. helm/benchmark/scenarios/quac_scenario.py +10 -1
  370. helm/benchmark/scenarios/race_based_med_scenario.py +152 -0
  371. helm/benchmark/scenarios/raft_scenario.py +17 -2
  372. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  373. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  374. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  375. helm/benchmark/scenarios/scenario.py +9 -1
  376. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
  377. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  378. helm/benchmark/scenarios/shc_bmt_scenario.py +75 -0
  379. helm/benchmark/scenarios/shc_cdi_scenario.py +75 -0
  380. helm/benchmark/scenarios/shc_conf_scenario.py +76 -0
  381. helm/benchmark/scenarios/shc_ent_scenario.py +77 -0
  382. helm/benchmark/scenarios/shc_gip_scenario.py +74 -0
  383. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  384. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  385. helm/benchmark/scenarios/shc_ptbm_scenario.py +81 -0
  386. helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
  387. helm/benchmark/scenarios/shc_sequoia_scenario.py +77 -0
  388. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  389. helm/benchmark/scenarios/spider_scenario.py +91 -0
  390. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +97 -0
  391. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  392. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  393. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  394. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  395. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  396. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  397. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  398. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  399. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  400. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  401. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  402. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  403. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  404. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  405. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  406. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  407. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  408. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  409. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  410. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  411. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  412. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  413. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  414. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  415. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  416. helm/benchmark/scenarios/truthful_qa_scenario.py +12 -2
  417. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  418. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  419. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  420. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  421. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  422. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  423. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  424. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  425. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  426. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  427. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  428. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  429. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  430. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  431. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  432. helm/benchmark/server.py +13 -1
  433. helm/benchmark/slurm_runner.py +1 -1
  434. helm/benchmark/static/schema_audio.yaml +763 -0
  435. helm/benchmark/static/schema_autobencher.yaml +150 -0
  436. helm/benchmark/static/schema_call_center.yaml +97 -60
  437. helm/benchmark/static/{schema_medical.yaml → schema_capabilities.yaml} +100 -101
  438. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  439. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  440. helm/benchmark/static/schema_enterprise.yaml +319 -0
  441. helm/benchmark/static/schema_finance.yaml +14 -12
  442. helm/benchmark/static/schema_heim.yaml +1389 -0
  443. helm/benchmark/static/schema_long_context.yaml +283 -0
  444. helm/benchmark/static/schema_medhelm.yaml +1140 -0
  445. helm/benchmark/static/schema_melt.yaml +1257 -0
  446. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  447. helm/benchmark/static/schema_safety.yaml +18 -1
  448. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
  449. helm/benchmark/static/schema_slphelm.yaml +162 -0
  450. helm/benchmark/static/schema_social_audio.yaml +224 -0
  451. helm/benchmark/static/schema_sql.yaml +171 -0
  452. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
  453. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  454. helm/benchmark/static/schema_vhelm.yaml +129 -56
  455. helm/benchmark/static/schema_video.yaml +219 -0
  456. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  457. helm/benchmark/static_build/assets/index-94295e78.js +10 -0
  458. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  459. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  460. helm/benchmark/static_build/assets/medhelm-v1-overview-3ddfcd65.png +0 -0
  461. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  462. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  463. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-38a10867.js} +2 -2
  464. helm/benchmark/static_build/config.js +1 -1
  465. helm/benchmark/static_build/index.html +6 -6
  466. helm/benchmark/window_services/default_window_service.py +1 -1
  467. helm/benchmark/window_services/encoder_decoder_window_service.py +4 -4
  468. helm/benchmark/window_services/ice_window_service.py +1 -1
  469. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  470. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  471. helm/benchmark/window_services/local_window_service.py +2 -2
  472. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  473. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  474. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  475. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  476. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  477. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  478. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  479. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  480. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  481. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  482. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  483. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  484. helm/benchmark/window_services/test_utils.py +4 -5
  485. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  486. helm/benchmark/window_services/tokenizer_service.py +7 -8
  487. helm/benchmark/window_services/yalm_window_service.py +1 -1
  488. helm/clients/ai21_client.py +3 -3
  489. helm/clients/aleph_alpha_client.py +1 -1
  490. helm/clients/anthropic_client.py +69 -29
  491. helm/clients/audio_language/__init__.py +0 -0
  492. helm/clients/audio_language/diva_llama_client.py +120 -0
  493. helm/clients/audio_language/llama_omni_client.py +198 -0
  494. helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
  495. helm/clients/audio_language/qwen2_audiolm_client.py +190 -0
  496. helm/clients/audio_language/qwen_audiolm_client.py +152 -0
  497. helm/clients/audio_language/test.py +62 -0
  498. helm/clients/auto_client.py +4 -2
  499. helm/clients/azure_openai_client.py +55 -0
  500. helm/clients/bedrock_client.py +203 -7
  501. helm/clients/bedrock_utils.py +33 -0
  502. helm/clients/client.py +7 -7
  503. helm/clients/clip_scorers/clip_scorer.py +1 -1
  504. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  505. helm/clients/cohere_client.py +3 -3
  506. helm/clients/google_client.py +1 -1
  507. helm/clients/grok_client.py +36 -0
  508. helm/clients/http_model_client.py +1 -1
  509. helm/clients/huggingface_client.py +52 -21
  510. helm/clients/huggingface_pipeline_client.py +138 -0
  511. helm/clients/ibm_client.py +267 -0
  512. helm/clients/image_generation/adobe_vision_client.py +1 -1
  513. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  514. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  515. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  516. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  517. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  518. helm/clients/image_generation/cogview2_client.py +1 -1
  519. helm/clients/image_generation/dalle2_client.py +1 -1
  520. helm/clients/image_generation/dalle3_client.py +2 -2
  521. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  522. helm/clients/image_generation/dalle_mini/data.py +1 -1
  523. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  524. helm/clients/image_generation/dalle_mini/model/configuration.py +2 -2
  525. helm/clients/image_generation/dalle_mini/model/modeling.py +3 -3
  526. helm/clients/image_generation/dalle_mini/model/processor.py +5 -5
  527. helm/clients/image_generation/dalle_mini/model/tokenizer.py +2 -2
  528. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  529. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  530. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  531. helm/clients/image_generation/dalle_mini_client.py +1 -1
  532. helm/clients/image_generation/deep_floyd_client.py +1 -1
  533. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  534. helm/clients/image_generation/lexica_client.py +1 -1
  535. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  536. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  537. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  538. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  539. helm/clients/image_generation/mindalle_client.py +1 -1
  540. helm/clients/image_generation/together_image_generation_client.py +1 -1
  541. helm/clients/lit_gpt_client.py +2 -2
  542. helm/clients/mistral_client.py +62 -18
  543. helm/clients/nvidia_nim_client.py +0 -3
  544. helm/clients/openai_client.py +308 -43
  545. helm/clients/openai_responses_client.py +174 -0
  546. helm/clients/palmyra_client.py +3 -9
  547. helm/clients/reka_client.py +3 -3
  548. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  549. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  550. helm/clients/stanfordhealthcare_google_client.py +43 -0
  551. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  552. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  553. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  554. helm/clients/test_client.py +1 -1
  555. helm/clients/test_together_client.py +6 -1
  556. helm/clients/together_client.py +76 -9
  557. helm/clients/upstage_client.py +23 -0
  558. helm/clients/vertexai_client.py +45 -13
  559. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  560. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  561. helm/clients/vision_language/idefics_client.py +6 -2
  562. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  563. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  564. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  565. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  566. helm/clients/vision_language/paligemma_client.py +2 -2
  567. helm/clients/vision_language/qwen2_vlm_client.py +188 -0
  568. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  569. helm/clients/vllm_client.py +4 -6
  570. helm/clients/writer_client.py +102 -0
  571. helm/clients/yi_client.py +0 -3
  572. helm/common/audio_utils.py +111 -0
  573. helm/common/context.py +80 -0
  574. helm/common/credentials_utils.py +5 -5
  575. helm/common/file_caches/local_file_cache.py +1 -1
  576. helm/common/file_caches/test_local_file_cache.py +1 -1
  577. helm/common/general.py +9 -2
  578. helm/common/hierarchical_logger.py +46 -3
  579. helm/common/images_utils.py +2 -2
  580. helm/common/local_context.py +140 -0
  581. helm/common/media_object.py +2 -2
  582. helm/common/multimodal_request_utils.py +26 -0
  583. helm/common/reeval_parameters.py +12 -0
  584. helm/common/remote_context.py +61 -0
  585. helm/common/request.py +14 -2
  586. helm/common/response_format.py +18 -0
  587. helm/common/test_media_object.py +1 -1
  588. helm/config/model_deployments.yaml +1792 -28
  589. helm/config/model_metadata.yaml +1606 -51
  590. helm/config/tokenizer_configs.yaml +521 -4
  591. helm/proxy/cli.py +5 -3
  592. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  593. helm/proxy/example_queries.py +1 -1
  594. helm/proxy/server.py +11 -4
  595. helm/proxy/services/remote_service.py +1 -1
  596. helm/proxy/services/server_service.py +22 -86
  597. helm/proxy/services/test_remote_service.py +2 -2
  598. helm/proxy/services/test_service.py +1 -1
  599. helm/proxy/static/general.js +122 -0
  600. helm/proxy/static/help.html +99 -0
  601. helm/proxy/static/index.css +57 -0
  602. helm/proxy/static/index.html +40 -0
  603. helm/proxy/static/index.js +456 -0
  604. helm/proxy/static/info-icon.png +0 -0
  605. helm/proxy/test_retry.py +1 -1
  606. helm/proxy/token_counters/auto_token_counter.py +1 -1
  607. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  608. helm/tokenizers/caching_tokenizer.py +2 -30
  609. helm/tokenizers/grok_tokenizer.py +53 -0
  610. helm/tokenizers/http_model_tokenizer.py +1 -1
  611. helm/tokenizers/huggingface_tokenizer.py +3 -3
  612. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  613. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  614. helm/tokenizers/test_grok_tokenizer.py +33 -0
  615. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  616. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  617. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  618. helm/tokenizers/tokenizer.py +3 -1
  619. helm/tokenizers/yalm_tokenizer.py +3 -3
  620. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  621. crfm_helm-0.5.4.dist-info/METADATA +0 -350
  622. crfm_helm-0.5.4.dist-info/RECORD +0 -697
  623. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  624. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  625. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  626. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  627. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  628. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  629. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  630. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  631. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  632. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  633. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  634. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  635. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  636. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  637. helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
  638. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  639. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  640. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  641. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  642. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  643. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  644. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  645. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  646. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  647. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  648. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  649. helm/tokenizers/anthropic_tokenizer.py +0 -52
  650. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
  651. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info/licenses}/LICENSE +0 -0
  652. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,87 @@
+ import os
+ from typing import List
+
+ from helm.benchmark.scenarios.scenario import (
+     TEST_SPLIT,
+     Scenario,
+     Instance,
+     Input,
+ )
+ from helm.common.audio_utils import is_invalid_audio_file
+ from helm.common.media_object import MediaObject, MultimediaObject
+
+
+ class VoiceJailbreakAttacksScenario(Scenario):
+     """
+     Voice Jailbreak Attacks Against GPT-4o
+
+     Paper: https://arxiv.org/abs/2405.19103
+
+     The examples used in this scenario were generated following the instructions in the GitHub repository:
+     https://github.com/TrustAIRLab/VoiceJailbreakAttack
+     Note: The more advanced jailbreaking subset addresses the AI as "ChatGPT".
+
+     We ran:
+     python tts/prompt2audio.py --dataset baseline --voice fable
+     python tts/prompt2audio.py --dataset textjailbreak --voice fable
+
+     then placed the generated folders at benchmark_output/scenarios/voice_jailbreak_attacks:
+
+     voice_jailbreak_attacks/
+         baseline_fable/
+             *.wav
+         textjailbreak_fable/
+             *.wav
+
+     Base prompts (30 total):
+     https://github.com/TrustAIRLab/VoiceJailbreakAttack/blob/main/data/question_set/questions_tiny.csv
+     Jailbreak templates:
+     https://github.com/TrustAIRLab/VoiceJailbreakAttack/blob/main/data/jailbreak_prompts/text_jailbreak_prompts.csv
+
+     The authors used Attack Success Rate (ASR) as the evaluation metric.
+     Specifically, two authors manually label each response to determine if it answers the forbidden
+     question. If there is any uncertainty, the authors discuss to reach a conclusion.
+
+     Citation:
+
+     @article{SWBZ24,
+         author = {Xinyue Shen and Yixin Wu and Michael Backes and Yang Zhang},
+         title = {{Voice Jailbreak Attacks Against GPT-4o}},
+         journal = {{CoRR abs/2405.19103}},
+         year = {2024}
+     }
+     """
+
+     VALID_SUBSETS = ["baseline", "textjailbreak"]
+
+     name = "voice_jailbreak_attacks"
+     description = "Voice jailbreak attacks against GPT-4o ([Shen et al, 2024](https://arxiv.org/abs/2405.19103))."
+     tags = ["audio", "safety"]
+
+     def __init__(self, subset: str = "baseline") -> None:
+         super().__init__()
+         assert subset in self.VALID_SUBSETS, f"Invalid subset: {subset}"
+         self._subset = subset
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         audio_directory_path: str = os.path.join(output_path, f"{self._subset}_fable")
+         assert os.path.exists(audio_directory_path), f"Audio directory does not exist: {audio_directory_path}"
+
+         instances: List[Instance] = []
+         for file in os.listdir(audio_directory_path):
+             if not file.endswith("wav"):
+                 continue
+
+             audio_path: str = os.path.join(audio_directory_path, file)
+             assert not is_invalid_audio_file(audio_path), f"Invalid audio file: {audio_path}"
+
+             input = Input(
+                 multimedia_content=MultimediaObject(
+                     media_objects=[
+                         MediaObject(location=audio_path, content_type="audio/wav"),
+                     ]
+                 )
+             )
+             instances.append(Instance(input=input, references=[], split=TEST_SPLIT))
+
+         return instances
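
For orientation (not part of the package diff), a minimal sketch of driving the new scenario directly, assuming the converted audio folders described in the docstring already exist under a local output directory; the path below is hypothetical:

    from helm.benchmark.scenarios.audio_language.voice_jailbreak_attacks_scenario import (
        VoiceJailbreakAttacksScenario,
    )

    # The scenario expects <output_path>/<subset>_fable/*.wav to already be present.
    scenario = VoiceJailbreakAttacksScenario(subset="textjailbreak")
    instances = scenario.get_instances(output_path="benchmark_output/scenarios/voice_jailbreak_attacks")
    print(len(instances), instances[0].input.multimedia_content.media_objects[0].location)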
@@ -0,0 +1,105 @@
+ from typing import List
+ import os
+
+ from helm.benchmark.scenarios.scenario import (
+     Scenario,
+     Instance,
+     Reference,
+     TEST_SPLIT,
+     CORRECT_TAG,
+     Input,
+     Output,
+ )
+ from tqdm import tqdm
+ from helm.common.media_object import MediaObject, MultimediaObject
+ from helm.common.general import ensure_file_downloaded, ensure_directory_exists
+ from helm.common.audio_utils import use_ffmpeg_to_convert_audio_file
+ import pandas as pd
+
+
+ class VoxCeleb2Scenario(Scenario):
+     """VoxCeleb2
+
+     VoxCeleb2 is an audio-visual dataset consisting of short clips of human speech, extracted from
+     interview videos uploaded to YouTube. This dataset contains over a million utterances from over
+     6,000 speakers.
+
+     Paper: https://www.robots.ox.ac.uk/~vgg/publications/2018/Chung18a/chung18a.pdf
+
+     Citation:
+     @inproceedings{Chung18b,
+         author = "Chung, J.~S. and Nagrani, A. and Zisserman, A.",
+         title = "VoxCeleb2: Deep Speaker Recognition",
+         booktitle = "INTERSPEECH",
+         year = "2018",
+     }
+     """
+
+     DOWNLOADING_URL = "https://huggingface.co/datasets/ProgramComputer/voxceleb/resolve/main/vox2/vox2_test_aac.zip"
+     REFERENCE_URL = (
+         "https://huggingface.co/datasets/LAOS-Y/VoxCeleb2-AudioIdentity/resolve/main/voxceleb2_audioidentity.csv"
+     )
+     IDENTITY_INSTRUCTION = (
+         "Listen to the audio and take your best guess to determine if the two speakers are the same person."
+     )
+
+     name = "voxceleb2"
+     description = (
+         "A large-scale dataset of over a million utterances from over 6,000 speakers with their "
+         "gender, race, identity information "
+         "([Chung et al, 2018](https://www.robots.ox.ac.uk/~vgg/publications/2018/Chung18a/chung18a.pdf))."
+     )
+     tags: List[str] = ["audio", "identification"]
+     options: List[str] = ["Yes", "No"]
+
+     def _convert_answer_to_label(self, answer: bool) -> str:
+         if answer:
+             return "A"
+         else:
+             return "B"
+
+     def _reformat_and_convert_audio_file(
+         self, ori_file_path: str, tgt_audio_data_path: str, audio_data_path: str
+     ) -> str:
+         tgt_audio_path = os.path.join(tgt_audio_data_path, ori_file_path.split(".m4a")[0] + ".wav")
+         ensure_directory_exists(os.path.dirname(tgt_audio_path))
+         use_ffmpeg_to_convert_audio_file(os.path.join(audio_data_path, ori_file_path), tgt_audio_path)
+         return tgt_audio_path
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         instances: List[Instance] = []
+         audio_data_path = os.path.join(output_path, "audio_files")
+         tgt_audio_data_path = os.path.join(output_path, "tgt_audio_files")
+         ensure_file_downloaded(source_url=VoxCeleb2Scenario.DOWNLOADING_URL, target_path=audio_data_path, unpack=True)
+         annotations = pd.read_csv(VoxCeleb2Scenario.REFERENCE_URL, sep=",")
+         instances = []
+         for _, row in tqdm(annotations.iterrows(), total=len(annotations)):
+             tgt_first_audio_path = self._reformat_and_convert_audio_file(
+                 row["first"], tgt_audio_data_path, audio_data_path
+             )
+             tgt_second_audio_path = self._reformat_and_convert_audio_file(
+                 row["second"], tgt_audio_data_path, audio_data_path
+             )
+
+             answer = self._convert_answer_to_label(row["same"])
+             # The given correct answer is a letter, but we need an index
+             correct_answer_index: int = ord(answer) - ord("A")
+             references: List[Reference] = []
+             for i, option in enumerate(self.options):
+                 reference: Reference
+                 is_correct: bool = i == correct_answer_index
+                 reference = Reference(Output(text=option), tags=[CORRECT_TAG] if is_correct else [])
+                 references.append(reference)
+
+             input = Input(
+                 multimedia_content=MultimediaObject(
+                     [
+                         MediaObject(content_type="audio/wav", location=tgt_first_audio_path),
+                         MediaObject(content_type="audio/wav", location=tgt_second_audio_path),
+                         MediaObject(content_type="text/plain", text=self.IDENTITY_INSTRUCTION),
+                     ]
+                 )
+             )
+             instances.append(Instance(input=input, references=references, split=TEST_SPLIT))
+
+         return instances
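
The path handling in `_reformat_and_convert_audio_file` above mirrors each `.m4a` file from the downloaded archive into the target directory as a `.wav` path before converting it with ffmpeg. A small sketch of just that mapping, with hypothetical paths:

    import os

    ori_file_path = "aac/id00017/01dfn2spqyE/00001.m4a"  # hypothetical relative path inside the VoxCeleb2 test zip
    tgt_audio_data_path = "benchmark_output/scenarios/voxceleb2/tgt_audio_files"

    # Same expression as in the scenario: swap the .m4a suffix for .wav and keep the directory layout.
    tgt_audio_path = os.path.join(tgt_audio_data_path, ori_file_path.split(".m4a")[0] + ".wav")
    print(tgt_audio_path)
    # benchmark_output/scenarios/voxceleb2/tgt_audio_files/aac/id00017/01dfn2spqyE/00001.wav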
@@ -0,0 +1,68 @@
+ import datasets
+ import os
+ from typing import List
+
+ from helm.benchmark.scenarios.scenario import (
+     CORRECT_TAG,
+     Scenario,
+     Instance,
+     Reference,
+     TEST_SPLIT,
+     Input,
+     Output,
+ )
+ from helm.common.general import ensure_directory_exists
+ from helm.common.hierarchical_logger import hwarn
+
+
+ class AutoBencherCapabilitiesScenario(Scenario):
+     """AutoBencher Capabilities
+
+     AutoBencher uses a language model to automatically search
+     for datasets. AutoBencher Capabilities consists of question
+     answering datasets for math, multilingual, and knowledge-intensive
+     question answering created by AutoBencher.
+
+     Paper: https://arxiv.org/abs/2407.08351"""
+
+     name = "autobencher_capabilities"
+     description = (
+         "AutoBencher Capabilities consists of question answering datasets "
+         "for math, multilingual, and knowledge-intensive "
+         "question answering created by AutoBencher. "
+         "([paper](https://arxiv.org/abs/2407.08351))"
+     )
+     tags = ["question answering"]
+
+     SUBJECTS = ["math", "mt", "econ", "science", "history"]
+
+     def __init__(self, subject: str):
+         super().__init__()
+         if subject not in self.SUBJECTS:
+             raise ValueError(f"Unexpected subject {subject}, available subjects are {self.SUBJECTS}")
+         self.subject: str = subject
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         cache_dir = os.path.join(output_path, "data")
+         ensure_directory_exists(cache_dir)
+
+         # TODO: Switch this to the production dataset when available.
+         dataset = datasets.load_dataset(
+             "xlisali1/AutoBencher-capability.json",
+             split="train",  # Use train split as test, so only zero-shot is supported
+             cache_dir=cache_dir,
+             revision="efe58dd72b6423e3f5c967f16cbea8cce3a51933",
+         )
+         instances: List[Instance] = []
+         for row in dataset:
+             if row["subject"] != self.subject:
+                 continue
+             input = Input(text=row["question"])
+             # References are category ID, followed by level 2, 3 and 4 category names.
+             references = [Reference(output=Output(text=row["gold_answer"]), tags=[CORRECT_TAG])]
+             if row["gold_answer"] is None:
+                 hwarn(f"Row had no gold_answer: {row}")
+                 continue
+             instance = Instance(input=input, references=references, split=TEST_SPLIT)
+             instances.append(instance)
+         return instances
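
A minimal usage sketch for the scenario above (the output path is hypothetical; `get_instances` downloads the pinned Hugging Face dataset on first use and keeps only rows whose subject matches):

    from helm.benchmark.scenarios.autobencher_capabilities_scenario import AutoBencherCapabilitiesScenario

    scenario = AutoBencherCapabilitiesScenario(subject="math")  # must be one of SUBJECTS
    instances = scenario.get_instances(output_path="benchmark_output/scenarios/autobencher_capabilities")
    for instance in instances[:3]:
        print(instance.input.text, "->", instance.references[0].output.text)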
@@ -0,0 +1,51 @@
+ import json
+ import os
+ import pandas as pd
+ from typing import List
+
+ from helm.common.general import ensure_file_downloaded
+
+ from .scenario import CORRECT_TAG, Scenario, Instance, Input, TEST_SPLIT, Reference, Output
+
+
+ class AutobencherSafetyScenario(Scenario):
+     """
+     AutoBencher safety scenario
+
+     AutoBencher uses a language model to automatically search
+     for datasets. AutoBencher Safety consists of safety-focused
+     question answering datasets created by AutoBencher.
+
+     Paper: https://arxiv.org/abs/2407.08351
+     """
+
+     name = "autobencher_safety"
+     description = "AutoBencher Safety consists of safety-focused question answering datasets created by AutoBencher"
+     tags = ["safety"]
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         data_path = os.path.join(output_path, "data")
+         os.makedirs(data_path, exist_ok=True)
+         url = "https://raw.githubusercontent.com/farzaank/AutoBencher/refs/heads/main/safety_processing/process%20full%20dataset%20for%20mTurk/full_dataset.json"  # noqa: E501
+         outf_path = os.path.join(data_path, "full_dataset.json")
+         ensure_file_downloaded(
+             source_url=url,
+             target_path=outf_path,
+             unpack=False,
+         )
+
+         json_data = json.load(open(outf_path, "r"))
+         df = pd.DataFrame(json_data)
+
+         # Read all the instances
+         instances: List[Instance] = []
+
+         for i, row in df.iterrows():
+             references = [Reference(output=Output(text=row["gold_answer"]), tags=[CORRECT_TAG])]
+             input_text = row["question"]
+             input = Input(text=input_text)
+             id = str(row["category"]) + str(row["id"])
+             instance = Instance(input=input, split=TEST_SPLIT, references=references, id=id)
+             instances.append(instance)
+         return instances
@@ -2,7 +2,7 @@ import os
  from typing import List

  from helm.common.general import ensure_file_downloaded, ensure_directory_exists
- from .scenario import (
+ from helm.benchmark.scenarios.scenario import (
      Scenario,
      Instance,
      Reference,
@@ -37,7 +37,12 @@ class Banking77Scenario(Scenario):
          ensure_directory_exists(cache_dir)

          # TODO: Switch this to the production dataset when available.
-         dataset = datasets.load_dataset("PolyAI/banking77", cache_dir=cache_dir)
+         dataset = datasets.load_dataset(
+             "PolyAI/banking77",
+             cache_dir=cache_dir,
+             revision="90d4e2ee5521c04fc1488f065b8b083658768c57",
+             trust_remote_code=True,
+         )

          instances: List[Instance] = []
          for split_name in [TRAIN_SPLIT, TEST_SPLIT]:
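
The change above pins `PolyAI/banking77` to a fixed git revision and opts into `trust_remote_code`, which keeps the evaluation data stable even if the upstream Hugging Face dataset is later modified. A standalone sketch of the same call outside HELM (the cache directory is hypothetical):

    import datasets

    dataset = datasets.load_dataset(
        "PolyAI/banking77",
        cache_dir="/tmp/banking77_cache",  # hypothetical cache location
        revision="90d4e2ee5521c04fc1488f065b8b083658768c57",
        trust_remote_code=True,
    )
    print(dataset["train"][0])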
@@ -4,7 +4,7 @@ import random
  from typing import List, Dict, Tuple

  from helm.common.general import ensure_file_downloaded
- from .scenario import (
+ from helm.benchmark.scenarios.scenario import (
      Scenario,
      Instance,
      Reference,
@@ -5,7 +5,17 @@ from typing import List, Dict
  from urllib.parse import urljoin

  from helm.common.general import ensure_directory_exists, ensure_file_downloaded
- from .scenario import Scenario, Instance, Reference, Input, CORRECT_TAG, TRAIN_SPLIT, VALID_SPLIT, TEST_SPLIT, Output
+ from helm.benchmark.scenarios.scenario import (
+     Scenario,
+     Instance,
+     Reference,
+     Input,
+     CORRECT_TAG,
+     TRAIN_SPLIT,
+     VALID_SPLIT,
+     TEST_SPLIT,
+     Output,
+ )


  class BIGBenchScenario(Scenario):
@@ -0,0 +1,58 @@
+ import datasets
+ import os
+ from typing import List
+ from helm.benchmark.scenarios.scenario import (
+     Scenario,
+     Instance,
+     TEST_SPLIT,
+     Input,
+ )
+ from helm.common.general import ensure_directory_exists
+
+
+ VERSIONS = ["v0.1.0_hf", "v0.1.1", "v0.1.2", "v0.1.3"]
+
+
+ class BigCodeBenchScenario(Scenario):
+     """BigCodeBench: Benchmarking Code Generation with Diverse Function Calls and Complex Instructions
+
+     BigCodeBench is an easy-to-use benchmark for solving practical and challenging tasks via code.
+     It aims to evaluate the true programming capabilities of large language models (LLMs) in a more realistic setting.
+     The benchmark is designed for HumanEval-like function-level code generation tasks,
+     but with much more complex instructions and diverse function calls."""
+
+     name = "bigcodebench"
+     description = "Benchmarking Code Generation with Diverse Function Calls and Complex Instructions"
+     tags = ["coding"]
+
+     def __init__(self, version: str):
+         super().__init__()
+         assert version in VERSIONS, "Unknown version: {}".format(version)
+         self.version = version
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         # Get BigCodeBench from HuggingFace
+         cache_dir = os.path.join(output_path, "data")
+         ensure_directory_exists(cache_dir)
+         dataset = datasets.load_dataset(
+             "bigcode/bigcodebench",
+             revision="057dd1a30dd73d4ed59cfbaaae049870491fa4d6",
+             cache_dir=cache_dir,
+             split=self.version,
+         )
+         assert isinstance(dataset, datasets.Dataset)
+
+         # Read all instances
+         instances: List[Instance] = []
+         for row in dataset:
+             id = row["task_id"]
+             input = Input(text=row["instruct_prompt"])
+             instance = Instance(
+                 id=id,
+                 input=input,
+                 references=[],
+                 split=TEST_SPLIT,
+             )
+             instances.append(instance)
+
+         return instances
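For orientation, a hedged sketch of driving this scenario by hand, outside the HELM runner; the import path below is assumed from HELM's scenario layout and is not stated in this diff:

    # Sketch only: instantiate the scenario and inspect its instances.
    # The module path is an assumption, not something shown in this diff.
    from helm.benchmark.scenarios.bigcodebench_scenario import BigCodeBenchScenario

    scenario = BigCodeBenchScenario(version="v0.1.2")  # must be one of VERSIONS
    instances = scenario.get_instances(output_path="bigcodebench_scratch")
    print(len(instances))
    print(instances[0].id, instances[0].input.text[:100])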
@@ -0,0 +1,94 @@
+ import json
+ import os
+ from typing import Dict, List
+
+ from filelock import FileLock
+
+ from helm.common.general import ensure_directory_exists, ensure_file_downloaded, shell
+ from helm.common.hierarchical_logger import hlog
+ from helm.benchmark.scenarios.bird_sql_scenario_helper import (  # type: ignore
+     generate_comment_prompt,
+     generate_schema_prompt,
+ )
+ from helm.benchmark.scenarios.scenario import (
+     CORRECT_TAG,
+     Scenario,
+     Instance,
+     Reference,
+     VALID_SPLIT,
+     Input,
+     Output,
+ )
+
+
+ def _ensure_file_unzipped(source_path: str, target_path: str):
+     with FileLock(f"{target_path}.lock"):
+         if os.path.exists(target_path):
+             hlog(f"Not decompressing {source_path} because {target_path} already exists")
+             return
+         tmp_path = target_path + ".tmp"
+         ensure_directory_exists(tmp_path)
+         shell(["unzip", source_path, "-d", tmp_path])
+         shell(["mv", tmp_path, target_path])
+
+
+ class BIRDSQLScenario(Scenario):
+     """BIRD-SQL (Dev)"""
+
+     name = "bird_sql"
+     description = "bird_sql"
+     tags = ["sql"]
+
+     COT_PROMPT = """
+ Think step by step, then generate a single SQL query in valid SQLite syntax. Respond with only your reasoning and SQL query in the following tag-delimited format:
+
+ <reasoning>
+ INSERT_YOUR_REASONING_HERE
+ </reasoning>
+ <sql>
+ INSERT_YOUR_SQL_QUERY_HERE
+ </sql>"""  # noqa: E501
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         data_root_path = os.path.join(output_path, "dev")
+         ensure_file_downloaded(
+             "https://bird-bench.oss-cn-beijing.aliyuncs.com/dev.zip", data_root_path, unpack=True, unpack_type="unzip"
+         )
+         databases_unzip_target = os.path.join(data_root_path, "unzipped_dev_databases")
+         _ensure_file_unzipped(os.path.join(data_root_path, "dev_databases.zip"), databases_unzip_target)
+         # Note: Zip file contains .DS_Store file at the root, which makes dev_databases unzip into a nested directory
+         databases_root_path = os.path.join(databases_unzip_target, "dev_databases")
+
+         database_schema_prompts: Dict[str, str] = {}
+         for database_name in os.listdir(databases_root_path):
+             database_path = os.path.join(databases_root_path, database_name, f"{database_name}.sqlite")
+             print(database_path)
+             if not os.path.exists(database_path):
+                 # Ignore stray ".DS_Store" directory
+                 continue
+
+             database_schema_prompt = generate_schema_prompt(database_path, num_rows=None)
+             database_schema_prompts[database_name] = database_schema_prompt
+
+         instances: List[Instance] = []
+         dataset_path = os.path.join(data_root_path, "dev.json")
+         dataset = json.load(open(dataset_path, "r"))
+         for row in dataset:
+             question_id: int = row["question_id"]
+             database_id: str = row["db_id"]
+             question: str = row["question"]
+             knowledge: str = row["evidence"]
+             gold_sql: str = row["SQL"]
+
+             schema_prompt = database_schema_prompts[database_id]
+             comment_prompt = generate_comment_prompt(question, knowledge)
+             combined_prompt = schema_prompt + "\n\n" + comment_prompt + self.COT_PROMPT
+             instance = Instance(
+                 id=f"id{question_id}",
+                 input=Input(text=combined_prompt),
+                 references=[Reference(output=Output(text=gold_sql), tags=[CORRECT_TAG])],
+                 extra_data={"db_id": row["db_id"]},
+                 split=VALID_SPLIT,
+             )
+             instances.append(instance)
+         return instances
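`COT_PROMPT` asks the model for a tag-delimited `<reasoning>`/`<sql>` response, so downstream code has to pull the query back out of the completion. A minimal sketch of that extraction step (illustrative only; the shipped `bird_sql_annotator` may parse completions differently):

    # Sketch of recovering the SQL from the tag-delimited format requested by COT_PROMPT.
    import re
    from typing import Optional


    def extract_sql(completion: str) -> Optional[str]:
        """Return the text inside the first <sql>...</sql> block, if any."""
        match = re.search(r"<sql>(.*?)</sql>", completion, flags=re.DOTALL)
        return match.group(1).strip() if match else None


    completion = (
        "<reasoning>\nCount the rows in accounts.\n</reasoning>\n"
        "<sql>\nSELECT COUNT(*) FROM accounts;\n</sql>"
    )
    print(extract_sql(completion))  # SELECT COUNT(*) FROM accounts;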
@@ -0,0 +1,118 @@
+ # flake8: noqa
+ # type: ignore
+ # fmt: off
+
+ from typing import Dict
+ import os
+ import sqlite3
+
+
+ # The following code is copied verbatim from:
+ # https://github.com/AlibabaResearch/DAMO-ConvAI/blob/90a76ef9ef1e2960c9bdfa38c63cc81b841e813e/bird/llm/src/gpt_request.py
+ # under the following license:
+ #
+ # MIT License
+ #
+ # Copyright (c) 2022 Alibaba Research
+ #
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
+ # of this software and associated documentation files (the "Software"), to deal
+ # in the Software without restriction, including without limitation the rights
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ # copies of the Software, and to permit persons to whom the Software is
+ # furnished to do so, subject to the following conditions:
+ #
+ # The above copyright notice and this permission notice shall be included in all
+ # copies or substantial portions of the Software.
+
+
+ def get_db_schemas(bench_root: str, db_name: str) -> Dict[str, str]:
+     """
+     Read an sqlite file, and return the CREATE commands for each of the tables in the database.
+     """
+     asdf = 'database' if bench_root == 'spider' else 'databases'
+     with sqlite3.connect(f'file:{bench_root}/{asdf}/{db_name}/{db_name}.sqlite?mode=ro', uri=True) as conn:
+         # conn.text_factory = bytes
+         cursor = conn.cursor()
+         cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
+         tables = cursor.fetchall()
+         schemas = {}
+         for table in tables:
+             cursor.execute("SELECT sql FROM sqlite_master WHERE type='table' AND name='{}';".format(table[0]))
+             schemas[table[0]] = cursor.fetchone()[0]
+
+         return schemas
+
+ def nice_look_table(column_names: list, values: list):
+     rows = []
+     # Determine the maximum width of each column
+     widths = [max(len(str(value[i])) for value in values + [column_names]) for i in range(len(column_names))]
+
+     # Print the column names
+     header = ''.join(f'{column.rjust(width)} ' for column, width in zip(column_names, widths))
+     # print(header)
+     # Print the values
+     for value in values:
+         row = ''.join(f'{str(v).rjust(width)} ' for v, width in zip(value, widths))
+         rows.append(row)
+     rows = "\n".join(rows)
+     final_output = header + '\n' + rows
+     return final_output
+
+ def generate_schema_prompt(db_path, num_rows=None):
+     # extract create ddls
+     '''
+     :param root_place:
+     :param db_name:
+     :return:
+     '''
+     full_schema_prompt_list = []
+     conn = sqlite3.connect(db_path)
+     # Create a cursor object
+     cursor = conn.cursor()
+     cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
+     tables = cursor.fetchall()
+     schemas = {}
+     for table in tables:
+         if table == 'sqlite_sequence':
+             continue
+         cursor.execute("SELECT sql FROM sqlite_master WHERE type='table' AND name='{}';".format(table[0]))
+         create_prompt = cursor.fetchone()[0]
+         schemas[table[0]] = create_prompt
+         if num_rows:
+             cur_table = table[0]
+             if cur_table in ['order', 'by', 'group']:
+                 cur_table = "`{}`".format(cur_table)
+
+             cursor.execute("SELECT * FROM {} LIMIT {}".format(cur_table, num_rows))
+             column_names = [description[0] for description in cursor.description]
+             values = cursor.fetchall()
+             rows_prompt = nice_look_table(column_names=column_names, values=values)
+             verbose_prompt = "/* \n {} example rows: \n SELECT * FROM {} LIMIT {}; \n {} \n */".format(num_rows, cur_table, num_rows, rows_prompt)
+             schemas[table[0]] = "{} \n {}".format(create_prompt, verbose_prompt)
+
+     for k, v in schemas.items():
+         full_schema_prompt_list.append(v)
+
+     schema_prompt = "\n\n".join(full_schema_prompt_list)
+
+     return schema_prompt
+
+ def generate_comment_prompt(question, knowledge=None):
+     pattern_prompt_no_kg = "-- Using valid SQLite, answer the following questions for the tables provided above."
+     pattern_prompt_kg = "-- Using valid SQLite and understading External Knowledge, answer the following questions for the tables provided above."
+     # question_prompt = "-- {}".format(question) + '\n SELECT '
+     question_prompt = "-- {}".format(question)
+     knowledge_prompt = "-- External Knowledge: {}".format(knowledge)
+
+     if not knowledge_prompt:
+         result_prompt = pattern_prompt_no_kg + '\n' + question_prompt
+     else:
+         result_prompt = knowledge_prompt + '\n' + pattern_prompt_kg + '\n' + question_prompt
+
+     return result_prompt
+
+ def cot_wizard():
+     cot = "\nGenerate the SQL after thinking step by step: "
+
+     return cot
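To see what these helpers produce, the sketch below builds a BIRD-style prompt for a throwaway SQLite database; the table and question are invented for illustration, and the import path is assumed from this diff's file layout:

    # Illustrative only: exercise generate_schema_prompt and generate_comment_prompt on a toy database.
    import os
    import sqlite3
    import tempfile

    from helm.benchmark.scenarios.bird_sql_scenario_helper import (
        generate_comment_prompt,
        generate_schema_prompt,
    )

    db_path = os.path.join(tempfile.mkdtemp(), "toy.sqlite")
    conn = sqlite3.connect(db_path)
    conn.execute("CREATE TABLE accounts (id INTEGER PRIMARY KEY, balance REAL)")
    conn.execute("INSERT INTO accounts VALUES (1, 99.5)")
    conn.commit()
    conn.close()

    schema_prompt = generate_schema_prompt(db_path, num_rows=1)  # CREATE TABLE plus one example row per table
    comment_prompt = generate_comment_prompt("What is the total balance?", knowledge="Balances are in USD.")
    print(schema_prompt + "\n\n" + comment_prompt)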
@@ -4,7 +4,7 @@ from typing import List
 
  from helm.common.general import ensure_file_downloaded
  from helm.common.hierarchical_logger import hlog
- from .scenario import CORRECT_TAG, Reference, Scenario, Instance, Input, TEST_SPLIT, Output
+ from helm.benchmark.scenarios.scenario import CORRECT_TAG, Reference, Scenario, Instance, Input, TEST_SPLIT, Output
 
 
  class BLiMPScenario(Scenario):
@@ -4,7 +4,7 @@ import random
  from typing import List, Dict, Tuple
 
  from helm.common.general import ensure_file_downloaded
- from .scenario import (
+ from helm.benchmark.scenarios.scenario import (
      Scenario,
      Instance,
      TRAIN_SPLIT,
@@ -3,7 +3,7 @@ import os
  from typing import List, Dict, Optional
 
  from helm.common.general import ensure_file_downloaded, ensure_directory_exists
- from .scenario import (
+ from helm.benchmark.scenarios.scenario import (
      Scenario,
      Instance,
      Reference,