crfm-helm 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (606)
  1. crfm_helm-0.5.5.dist-info/METADATA +413 -0
  2. crfm_helm-0.5.5.dist-info/RECORD +894 -0
  3. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  22. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  23. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  24. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  25. helm/benchmark/adaptation/prompt.py +1 -1
  26. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  27. helm/benchmark/annotation/air_bench_annotator.py +20 -5
  28. helm/benchmark/annotation/annotator.py +5 -0
  29. helm/benchmark/annotation/annotator_factory.py +3 -20
  30. helm/benchmark/annotation/anthropic_red_team_annotator.py +11 -24
  31. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  32. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  33. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  34. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  35. helm/benchmark/annotation/call_center_annotator.py +22 -11
  36. helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
  37. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  38. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  39. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  40. helm/benchmark/annotation/harm_bench_annotator.py +11 -24
  41. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  42. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  43. helm/benchmark/annotation/live_qa_annotator.py +10 -5
  44. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  45. helm/benchmark/annotation/medalign_annotator.py +100 -0
  46. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  47. helm/benchmark/annotation/medication_qa_annotator.py +90 -61
  48. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  49. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  50. helm/benchmark/annotation/model_as_judge.py +281 -18
  51. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  52. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  53. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  54. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  55. helm/benchmark/annotation/omni_math_annotator.py +132 -0
  56. helm/benchmark/annotation/simple_safety_tests_annotator.py +11 -25
  57. helm/benchmark/annotation/spider_annotator.py +18 -0
  58. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  59. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  60. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  61. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  62. helm/benchmark/annotation/xstest_annotator.py +20 -30
  63. helm/benchmark/annotation_executor.py +35 -15
  64. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  65. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  66. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  67. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  68. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  69. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  70. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  71. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  72. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  73. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  74. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  75. helm/benchmark/augmentations/perturbation.py +1 -1
  76. helm/benchmark/augmentations/space_perturbation.py +2 -2
  77. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  78. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  79. helm/benchmark/augmentations/test_perturbation.py +16 -13
  80. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  81. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  82. helm/benchmark/data_preprocessor.py +2 -2
  83. helm/benchmark/huggingface_registration.py +2 -7
  84. helm/benchmark/metrics/aci_bench_metrics.py +34 -0
  85. helm/benchmark/metrics/basic_metrics.py +6 -6
  86. helm/benchmark/metrics/bbq_metrics.py +2 -2
  87. helm/benchmark/metrics/bias_metrics.py +12 -3
  88. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  89. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  90. helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
  91. helm/benchmark/metrics/classification_metrics.py +76 -12
  92. helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
  93. helm/benchmark/metrics/code_metrics.py +5 -5
  94. helm/benchmark/metrics/comet_metric.py +125 -0
  95. helm/benchmark/metrics/common_metric_specs.py +9 -2
  96. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  97. helm/benchmark/metrics/copyright_metrics.py +4 -4
  98. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  99. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  100. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  101. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  102. helm/benchmark/metrics/dischargeme_metrics.py +34 -0
  103. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  104. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  105. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  106. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  107. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  108. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  109. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  110. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  111. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  112. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  113. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  114. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  115. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  116. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  117. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  118. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  120. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  121. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  122. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  123. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  124. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  125. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  126. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  127. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  128. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  129. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  130. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  131. helm/benchmark/metrics/med_dialog_metrics.py +34 -0
  132. helm/benchmark/metrics/medalign_metrics.py +34 -0
  133. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  134. helm/benchmark/metrics/medec_metrics.py +101 -0
  135. helm/benchmark/metrics/medi_qa_metrics.py +34 -0
  136. helm/benchmark/metrics/medication_qa_metrics.py +15 -4
  137. helm/benchmark/metrics/mental_health_metrics.py +34 -0
  138. helm/benchmark/metrics/metric.py +3 -3
  139. helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
  140. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  141. helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
  142. helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
  143. helm/benchmark/metrics/nltk_helper.py +32 -0
  144. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  145. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  146. helm/benchmark/metrics/output_processing_metric.py +60 -0
  147. helm/benchmark/metrics/output_processors.py +15 -0
  148. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  149. helm/benchmark/metrics/ranking_metrics.py +3 -3
  150. helm/benchmark/metrics/reference_metric.py +3 -3
  151. helm/benchmark/metrics/safety_metrics.py +39 -17
  152. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  153. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  154. helm/benchmark/metrics/spider_metrics.py +7 -0
  155. helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
  156. helm/benchmark/metrics/statistic.py +1 -1
  157. helm/benchmark/metrics/summac/model_summac.py +1 -1
  158. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  159. helm/benchmark/metrics/summarization_metrics.py +19 -9
  160. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  161. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  162. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  163. helm/benchmark/metrics/test_metric.py +1 -1
  164. helm/benchmark/metrics/test_statistic.py +2 -2
  165. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  166. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  167. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  168. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  169. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  170. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  171. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  172. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
  173. helm/benchmark/metrics/toxicity_metrics.py +4 -4
  174. helm/benchmark/metrics/unitxt_metrics.py +21 -4
  175. helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
  176. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  177. helm/benchmark/model_metadata_registry.py +16 -0
  178. helm/benchmark/presentation/create_plots.py +1 -1
  179. helm/benchmark/presentation/schema.py +3 -0
  180. helm/benchmark/presentation/summarize.py +119 -256
  181. helm/benchmark/presentation/test_summarize.py +145 -3
  182. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  183. helm/benchmark/reeval_run.py +203 -0
  184. helm/benchmark/reeval_runner.py +355 -0
  185. helm/benchmark/run.py +8 -17
  186. helm/benchmark/run_expander.py +105 -8
  187. helm/benchmark/run_spec_factory.py +12 -0
  188. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  189. helm/benchmark/run_specs/audio_run_specs.py +613 -0
  190. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  191. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  192. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  193. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  194. helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
  195. helm/benchmark/run_specs/experimental_run_specs.py +112 -3
  196. helm/benchmark/run_specs/finance_run_specs.py +6 -2
  197. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  198. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  199. helm/benchmark/run_specs/long_context_run_specs.py +89 -0
  200. helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
  201. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  202. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  203. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  204. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +66 -52
  205. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  206. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  207. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  208. helm/benchmark/run_specs/vlm_run_specs.py +83 -5
  209. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  210. helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
  211. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  212. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  213. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  214. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  215. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
  216. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  217. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  218. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  219. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  220. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  221. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  222. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  223. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  224. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  225. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  226. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  227. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  228. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  229. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  230. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  231. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  232. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  233. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  234. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  235. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
  236. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  237. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
  238. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  239. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  240. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  241. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  242. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  243. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  244. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  245. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  246. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  247. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  248. helm/benchmark/scenarios/bold_scenario.py +1 -1
  249. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  250. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  251. helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
  252. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  253. helm/benchmark/scenarios/clear_scenario.py +153 -0
  254. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  255. helm/benchmark/scenarios/code_scenario.py +17 -4
  256. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  257. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  258. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  259. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  260. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  261. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  262. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  263. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  264. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  265. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  266. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  267. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  268. helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
  269. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  270. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  271. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  272. helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
  273. helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
  274. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  275. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  276. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  277. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  278. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  279. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  280. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  281. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  282. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  283. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  284. helm/benchmark/scenarios/headqa_scenario.py +131 -0
  285. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  286. helm/benchmark/scenarios/ice_scenario.py +8 -4
  287. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  288. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  289. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  290. helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
  291. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  292. helm/benchmark/scenarios/koala_scenario.py +1 -1
  293. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  294. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  295. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  296. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  297. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  298. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  299. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  300. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  301. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  302. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  303. helm/benchmark/scenarios/math_scenario.py +9 -1
  304. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  305. helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
  306. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  307. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  308. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  309. helm/benchmark/scenarios/medalign_scenario.py +88 -0
  310. helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
  311. helm/benchmark/scenarios/medbullets_scenario.py +140 -0
  312. helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
  313. helm/benchmark/scenarios/medec_scenario.py +120 -0
  314. helm/benchmark/scenarios/medhallu_scenario.py +66 -0
  315. helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
  316. helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
  317. helm/benchmark/scenarios/mental_health_scenario.py +112 -0
  318. helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
  319. helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
  320. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
  321. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  322. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  323. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  324. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  325. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
  326. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
  327. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
  328. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  329. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  330. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  331. helm/benchmark/scenarios/numeracy_scenario.py +10 -1
  332. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  333. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  334. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  335. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  336. helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
  337. helm/benchmark/scenarios/quac_scenario.py +10 -1
  338. helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
  339. helm/benchmark/scenarios/raft_scenario.py +18 -3
  340. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  341. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  342. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  343. helm/benchmark/scenarios/scenario.py +9 -1
  344. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +233 -84
  345. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  346. helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
  347. helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
  348. helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
  349. helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
  350. helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
  351. helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
  352. helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
  353. helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
  354. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  355. helm/benchmark/scenarios/spider_scenario.py +91 -0
  356. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
  357. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  358. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  359. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  360. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  361. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  362. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  363. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  364. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  365. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  366. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  367. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  368. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  369. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  370. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
  371. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  372. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  373. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  374. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  375. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  376. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  377. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  378. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  379. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  380. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  381. helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
  382. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  383. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  384. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  385. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  386. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  387. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  388. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  389. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  390. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  391. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  392. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  393. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  394. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  395. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  396. helm/benchmark/server.py +11 -0
  397. helm/benchmark/slurm_runner.py +1 -1
  398. helm/benchmark/static/schema_audio.yaml +752 -0
  399. helm/benchmark/static/schema_autobencher.yaml +150 -0
  400. helm/benchmark/static/schema_call_center.yaml +97 -60
  401. helm/benchmark/static/schema_capabilities.yaml +254 -0
  402. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  403. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  404. helm/benchmark/static/schema_enterprise.yaml +298 -0
  405. helm/benchmark/static/schema_finance.yaml +14 -12
  406. helm/benchmark/static/schema_heim.yaml +1389 -0
  407. helm/benchmark/static/schema_legal.yaml +566 -0
  408. helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
  409. helm/benchmark/static/schema_medhelm.yaml +1081 -0
  410. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  411. helm/benchmark/static/schema_safety.yaml +42 -6
  412. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +40 -26
  413. helm/benchmark/static/schema_social_audio.yaml +224 -0
  414. helm/benchmark/static/schema_sql.yaml +171 -0
  415. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +187 -30
  416. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  417. helm/benchmark/static/schema_vhelm.yaml +151 -47
  418. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  419. helm/benchmark/static_build/assets/index-262903c1.js +10 -0
  420. helm/benchmark/static_build/assets/index-42060d71.css +1 -0
  421. helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
  422. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  423. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  424. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
  425. helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
  426. helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
  427. helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
  428. helm/benchmark/static_build/config.js +1 -1
  429. helm/benchmark/static_build/index.html +5 -5
  430. helm/benchmark/window_services/default_window_service.py +1 -1
  431. helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
  432. helm/benchmark/window_services/ice_window_service.py +1 -1
  433. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  434. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  435. helm/benchmark/window_services/local_window_service.py +2 -2
  436. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  437. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  438. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  439. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  440. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  441. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  442. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  443. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  444. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  445. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  446. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  447. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  448. helm/benchmark/window_services/test_utils.py +1 -1
  449. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  450. helm/benchmark/window_services/tokenizer_service.py +0 -5
  451. helm/benchmark/window_services/yalm_window_service.py +1 -1
  452. helm/clients/ai21_client.py +3 -3
  453. helm/clients/aleph_alpha_client.py +1 -1
  454. helm/clients/audio_language/__init__.py +0 -0
  455. helm/clients/audio_language/diva_llama_client.py +118 -0
  456. helm/clients/audio_language/llama_omni_client.py +198 -0
  457. helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
  458. helm/clients/audio_language/qwen_audiolm_client.py +150 -0
  459. helm/clients/auto_client.py +4 -2
  460. helm/clients/azure_openai_client.py +55 -0
  461. helm/clients/bedrock_client.py +201 -7
  462. helm/clients/bedrock_utils.py +33 -0
  463. helm/clients/clip_scorers/clip_scorer.py +1 -1
  464. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  465. helm/clients/cohere_client.py +3 -3
  466. helm/clients/google_client.py +1 -1
  467. helm/clients/http_model_client.py +1 -1
  468. helm/clients/huggingface_client.py +10 -18
  469. helm/clients/ibm_client.py +267 -0
  470. helm/clients/image_generation/adobe_vision_client.py +1 -1
  471. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  472. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  473. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  474. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  475. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  476. helm/clients/image_generation/cogview2_client.py +1 -1
  477. helm/clients/image_generation/dalle2_client.py +1 -1
  478. helm/clients/image_generation/dalle3_client.py +2 -2
  479. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  480. helm/clients/image_generation/dalle_mini/data.py +1 -1
  481. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  482. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  483. helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
  484. helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
  485. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  486. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  487. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  488. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  489. helm/clients/image_generation/dalle_mini_client.py +1 -1
  490. helm/clients/image_generation/deep_floyd_client.py +1 -1
  491. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  492. helm/clients/image_generation/lexica_client.py +1 -1
  493. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  494. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  495. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  496. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  497. helm/clients/image_generation/mindalle_client.py +1 -1
  498. helm/clients/image_generation/together_image_generation_client.py +1 -1
  499. helm/clients/lit_gpt_client.py +2 -2
  500. helm/clients/mistral_client.py +62 -18
  501. helm/clients/nvidia_nim_client.py +0 -3
  502. helm/clients/openai_client.py +255 -21
  503. helm/clients/palmyra_client.py +2 -6
  504. helm/clients/reka_client.py +1 -1
  505. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  506. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  507. helm/clients/stanfordhealthcare_google_client.py +43 -0
  508. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  509. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  510. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  511. helm/clients/test_client.py +1 -1
  512. helm/clients/test_together_client.py +6 -1
  513. helm/clients/together_client.py +69 -7
  514. helm/clients/upstage_client.py +23 -0
  515. helm/clients/vertexai_client.py +39 -13
  516. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  517. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  518. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  519. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  520. helm/clients/vision_language/qwen2_vlm_client.py +175 -0
  521. helm/clients/vllm_client.py +4 -6
  522. helm/clients/yi_client.py +0 -3
  523. helm/common/audio_utils.py +111 -0
  524. helm/common/cache.py +8 -30
  525. helm/common/file_caches/local_file_cache.py +1 -1
  526. helm/common/file_caches/test_local_file_cache.py +1 -1
  527. helm/common/images_utils.py +2 -2
  528. helm/common/key_value_store.py +9 -9
  529. helm/common/media_object.py +2 -2
  530. helm/common/mongo_key_value_store.py +3 -3
  531. helm/common/multimodal_request_utils.py +26 -0
  532. helm/common/reeval_parameters.py +12 -0
  533. helm/common/request.py +6 -2
  534. helm/common/response_format.py +18 -0
  535. helm/common/test_cache.py +1 -48
  536. helm/common/test_media_object.py +1 -1
  537. helm/common/tokenization_request.py +0 -9
  538. helm/config/model_deployments.yaml +1258 -33
  539. helm/config/model_metadata.yaml +1110 -41
  540. helm/config/tokenizer_configs.yaml +403 -3
  541. helm/proxy/cli.py +2 -2
  542. helm/proxy/example_queries.py +1 -1
  543. helm/proxy/server.py +11 -13
  544. helm/proxy/services/remote_service.py +1 -7
  545. helm/proxy/services/server_service.py +6 -19
  546. helm/proxy/services/service.py +0 -6
  547. helm/proxy/services/test_remote_service.py +2 -2
  548. helm/proxy/services/test_service.py +1 -1
  549. helm/proxy/static/general.js +122 -0
  550. helm/proxy/static/help.html +99 -0
  551. helm/proxy/static/index.css +57 -0
  552. helm/proxy/static/index.html +40 -0
  553. helm/proxy/static/index.js +456 -0
  554. helm/proxy/static/info-icon.png +0 -0
  555. helm/proxy/test_retry.py +1 -1
  556. helm/proxy/token_counters/auto_token_counter.py +1 -1
  557. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  558. helm/tokenizers/caching_tokenizer.py +2 -30
  559. helm/tokenizers/http_model_tokenizer.py +1 -1
  560. helm/tokenizers/huggingface_tokenizer.py +2 -2
  561. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  562. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  563. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  564. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  565. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  566. helm/tokenizers/tokenizer.py +3 -1
  567. helm/tokenizers/yalm_tokenizer.py +3 -3
  568. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  569. crfm_helm-0.5.3.dist-info/METADATA +0 -355
  570. crfm_helm-0.5.3.dist-info/RECORD +0 -699
  571. helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
  572. helm/benchmark/data_overlap/export_scenario_text.py +0 -119
  573. helm/benchmark/data_overlap/light_scenario.py +0 -60
  574. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  575. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  576. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  577. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  578. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  579. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  580. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  581. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  582. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  583. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  584. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  585. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  586. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  587. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  588. helm/benchmark/static_build/assets/index-58f97dcd.js +0 -10
  589. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  590. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  591. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  592. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  593. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  594. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  595. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  596. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  597. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  598. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  599. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  600. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  601. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  602. helm/tokenizers/anthropic_tokenizer.py +0 -52
  603. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
  604. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
  605. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
  606. /helm/benchmark/{data_overlap → metrics/ifeval}/__init__.py +0 -0
@@ -0,0 +1,49 @@
+"""Run spec functions for three clinical sections of MMLU human-translated into 11 African languages
+
+Available subjects: "clinical_knowledge", "college_medicine", "virology"
+Available langs: "af", "zu", "xh", "am", "bm", "ig", "nso", "sn", "st", "tn", "ts" (see lang_map below for language code mapping to language name, or here for ISO code reference: https://huggingface.co/languages)
+"""  # noqa: E501
+
+from helm.benchmark.adaptation.adapter_spec import ADAPT_MULTIPLE_CHOICE_JOINT
+from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec
+from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+@run_spec_function("mmlu_clinical_afr")
+def get_mmlu_clinical_afr_spec(subject: str, lang: str, method: str = ADAPT_MULTIPLE_CHOICE_JOINT) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.mmlu_clinical_afr_scenario.MMLU_Clinical_Afr_Scenario",
+        args={"subject": subject, "lang": lang},
+    )
+
+    lang_map = {
+        "af": "Afrikaans",
+        "zu": "Zulu",
+        "xh": "Xhosa",
+        "am": "Amharic",
+        "bm": "Bambara",
+        "ig": "Igbo",
+        "nso": "Sepedi",
+        "sn": "Shona",
+        "st": "Sesotho",
+        "tn": "Setswana",
+        "ts": "Tsonga",
+    }
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=method,
+        instructions=f"The following are multiple choice questions (with answers) about {subject.replace('_', ' ')} "
+        f"in {lang_map[lang]}.",
+        input_noun="Question",
+        output_noun="Answer",
+    )
+
+    return RunSpec(
+        name=f"mmlu_clinical_afr:subject={subject},lang={lang},method={method}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=[f"mmlu_clinical_afr_{subject}", f"mmlu_clinical_afr_{subject}_{lang}"],
+    )
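For context, the new mmlu_clinical_afr run spec above can be exercised directly from Python. The following is an illustrative sketch, not part of the diff: it assumes crfm-helm 0.5.5 is installed, and the subject and language values are arbitrary examples.

# Illustrative sketch (not part of the diff): build the new run spec and inspect it.
# Assumes crfm-helm 0.5.5 is installed; "college_medicine" and "zu" are example values.
from helm.benchmark.run_specs.mmlu_clinical_afr_run_specs import get_mmlu_clinical_afr_spec

run_spec = get_mmlu_clinical_afr_spec(subject="college_medicine", lang="zu")
print(run_spec.name)    # e.g. mmlu_clinical_afr:subject=college_medicine,lang=zu,method=multiple_choice_joint
print(run_spec.groups)  # ['mmlu_clinical_afr_college_medicine', 'mmlu_clinical_afr_college_medicine_zu']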
@@ -0,0 +1,32 @@
+from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_MULTIPLE_CHOICE_JOINT
+from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec
+from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+@run_spec_function("oab_exams")
+def get_enem_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.oab_exams_scenario.OABExamsScenario", args={})
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions="Dê uma resposta selecionando uma letra entre as opções fornecidas. "
+        "Se as opções forem A, B, C e D,"
+        "sua resposta deve consistir em uma única letra que corresponde a resposta correta.\n"
+        "Exemplo: Ao conselho da subseção compete\nA. representar a OAB no Conselho de Segurança do MERCOSUL."
+        "\nB. fiscalizar as funções e atribuições do conselho seccional.\nC. instaurar e instruir processos "
+        "disciplinares, para julgamento pelo Conselho Federal.\nD. receber pedido de inscrição nos quadros de "
+        "advogado e estagiário, instruindo e emitindo parecer prévio, para decisão do conselho seccional.\n"
+        "Resposta: D",
+        input_noun="Pergunta",
+        output_noun="Resposta",
+    )
+
+    return RunSpec(
+        name="oab_exams",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["oab_exams"],
+    )
@@ -41,6 +41,43 @@ def get_harm_bench_spec() -> RunSpec:
     )


+@run_spec_function("harm_bench_gcg_transfer")
+def get_harm_bench_gcg_transfer_spec() -> RunSpec:
+    adapter_spec = AdapterSpec(
+        method=ADAPT_GENERATION,
+        global_prefix="",
+        global_suffix="",
+        instructions="",
+        input_prefix="",
+        input_suffix="",
+        output_prefix="",
+        output_suffix="",
+        instance_prefix="",
+        max_train_instances=0,
+        num_outputs=1,
+        max_tokens=512,
+        temperature=0.0,
+        stop_sequences=[],
+    )
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.harm_bench_gcg_transfer_scenario.HarmBenchGCGTransferScenario"
+    )
+    annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.harm_bench_annotator.HarmBenchAnnotator")]
+    metric_specs = [
+        MetricSpec(class_name="helm.benchmark.metrics.safety_metrics.SafetyScoreMetric"),
+        MetricSpec(class_name="helm.benchmark.metrics.safety_metrics.SafetyBasicGenerationMetric"),
+        MetricSpec(class_name="helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric"),
+    ]
+    return RunSpec(
+        name="harm_bench_gcg_transfer",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        annotators=annotator_specs,
+        groups=["harm_bench_gcg_transfer"],
+    )
+
+
 @run_spec_function("simple_safety_tests")
 def get_simple_safety_tests_spec() -> RunSpec:
     adapter_spec = AdapterSpec(
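The harm_bench_gcg_transfer hunk above appears to belong to helm/benchmark/run_specs/safety_run_specs.py (the +37/-0 entry in the file list), although the diff viewer does not repeat the file path here. As a hedged illustration only, the new run spec takes no arguments, so it can be built and inspected directly; the import path below is inferred from that file list entry.

# Hedged illustration (not part of the diff): construct the new zero-argument run spec.
# The module path is inferred from the safety_run_specs.py entry in the file list above.
from helm.benchmark.run_specs.safety_run_specs import get_harm_bench_gcg_transfer_spec

spec = get_harm_bench_gcg_transfer_spec()
print(spec.name)                                 # harm_bench_gcg_transfer
print([a.class_name for a in spec.annotators])   # the HarmBenchAnnotator used for model-as-judge scoring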
@@ -5,9 +5,9 @@ from helm.benchmark.adaptation.common_adapter_specs import (
     get_generation_adapter_spec,
     get_multiple_choice_separate_adapter_spec,
 )
-from helm.benchmark.metrics.bhasa_metrics_specs import (
-    get_bhasa_machine_translation_metric_specs,
-    get_bhasa_qa_metric_specs,
+from helm.benchmark.metrics.seahelm_metrics_specs import (
+    get_seahelm_machine_translation_metric_specs,
+    get_seahelm_qa_metric_specs,
 )
 from helm.benchmark.metrics.common_metric_specs import (
     get_basic_metric_specs,
@@ -17,7 +17,7 @@ from helm.benchmark.metrics.common_metric_specs import (
 from helm.benchmark.run_spec import RunSpec, run_spec_function
 from helm.benchmark.scenarios.scenario import ScenarioSpec

-# BHASA Run Specs
+# SEA-HELM Run Specs
 # A. Natural Language Understanding
 # B. Natural Language Generation
 # C. Natural Language Reasoning
@@ -43,18 +43,18 @@ def get_tydiqa_spec() -> RunSpec:
         max_tokens=256,
     )

-    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bhasa_scenario.TyDiQAScenario")
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.seahelm_scenario.TyDiQAScenario")

     return RunSpec(
         name=name,
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=get_bhasa_qa_metric_specs(
+        metric_specs=get_seahelm_qa_metric_specs(
             args={
                 "language": "id",
             }
         ),
-        groups=["bhasa_nlu", "tydiqa"],
+        groups=["seahelm_nlu", "tydiqa"],
     )


@@ -84,7 +84,7 @@ def get_xquad_spec(language="th") -> RunSpec:
     )

     scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.bhasa_scenario.XQuADScenario",
+        class_name="helm.benchmark.scenarios.seahelm_scenario.XQuADScenario",
         args={
             "language": language,
         },
@@ -94,12 +94,12 @@ def get_xquad_spec(language="th") -> RunSpec:
         name=name,
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=get_bhasa_qa_metric_specs(
+        metric_specs=get_seahelm_qa_metric_specs(
             args={
                 "language": language,
             }
         ),
-        groups=["bhasa_nlu", f"xquad_{language}"],
+        groups=["seahelm_nlu", f"xquad_{language}"],
     )


@@ -116,18 +116,18 @@ def get_indicqa_spec() -> RunSpec:
         max_tokens=256,
     )

-    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bhasa_scenario.IndicQAScenario")
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.seahelm_scenario.IndicQAScenario")

     return RunSpec(
         name=name,
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=get_bhasa_qa_metric_specs(
+        metric_specs=get_seahelm_qa_metric_specs(
             args={
                 "language": "ta",
             }
         ),
-        groups=["bhasa_nlu", "indicqa"],
+        groups=["seahelm_nlu", "indicqa"],
     )


@@ -146,14 +146,14 @@ def get_nusax_spec() -> RunSpec:
         max_tokens=16,
     )

-    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bhasa_scenario.NusaXScenario")
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.seahelm_scenario.NusaXScenario")

     return RunSpec(
         name=name,
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
-        groups=["bhasa_nlu", "nusax"],
+        groups=["seahelm_nlu", "nusax"],
     )


@@ -171,14 +171,14 @@ def get_uitvsfc_spec() -> RunSpec:
         max_tokens=16,
     )

-    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bhasa_scenario.UITVSFCScenario")
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.seahelm_scenario.UITVSFCScenario")

     return RunSpec(
         name=name,
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
-        groups=["bhasa_nlu", "uitvsfc"],
+        groups=["seahelm_nlu", "uitvsfc"],
     )


@@ -196,14 +196,14 @@ def get_wisesight_spec() -> RunSpec:
         max_tokens=16,
     )

-    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bhasa_scenario.WisesightScenario")
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.seahelm_scenario.WisesightScenario")

     return RunSpec(
         name=name,
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
-        groups=["bhasa_nlu", "wisesight"],
+        groups=["seahelm_nlu", "wisesight"],
     )


@@ -221,14 +221,14 @@ def get_indicsentiment_spec() -> RunSpec:
         max_tokens=16,
     )

-    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bhasa_scenario.IndicSentimentScenario")
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.seahelm_scenario.IndicSentimentScenario")

     return RunSpec(
         name=name,
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_classification_metric_specs() + get_basic_metric_specs([]),
-        groups=["bhasa_nlu", "indicsentiment"],
+        groups=["seahelm_nlu", "indicsentiment"],
     )


@@ -250,14 +250,14 @@ def get_mlhsd_spec() -> RunSpec:
         max_tokens=16,
     )

-    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bhasa_scenario.MLHSDScenario")
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.seahelm_scenario.MLHSDScenario")

     return RunSpec(
         name=name,
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
-        groups=["bhasa_nlu", "mlhsd"],
+        groups=["seahelm_nlu", "mlhsd"],
     )


@@ -278,14 +278,14 @@ def get_vihsd_spec() -> RunSpec:
         max_tokens=16,
     )

-    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bhasa_scenario.ViHSDScenario")
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.seahelm_scenario.ViHSDScenario")

     return RunSpec(
         name=name,
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
-        groups=["bhasa_nlu", "vihsd"],
+        groups=["seahelm_nlu", "vihsd"],
     )


@@ -304,14 +304,14 @@ def get_thaitoxicitytweets_spec() -> RunSpec:
         max_tokens=16,
     )

-    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bhasa_scenario.ThaiToxicityTweetsScenario")
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.seahelm_scenario.ThaiToxicityTweetsScenario")

     return RunSpec(
         name=name,
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
-        groups=["bhasa_nlu", "thaitoxicitytweets"],
+        groups=["seahelm_nlu", "thaitoxicitytweets"],
     )


@@ -378,7 +378,7 @@ def get_flores_spec(source="en", target="id") -> RunSpec:
     )

     scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.bhasa_scenario.FloresScenario",
+        class_name="helm.benchmark.scenarios.seahelm_scenario.FloresScenario",
         args={
             "pair": pair,
         },
@@ -388,8 +388,8 @@ def get_flores_spec(source="en", target="id") -> RunSpec:
         name=name,
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=get_bhasa_machine_translation_metric_specs(),
-        groups=["bhasa_nlg", f"flores_{pair}"],
+        metric_specs=get_seahelm_machine_translation_metric_specs(),
+        groups=["seahelm_nlg", f"flores_{pair}"],
     )


@@ -414,14 +414,14 @@ def get_indonli_spec() -> RunSpec:
         max_tokens=2,
     )

-    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bhasa_scenario.IndoNLIScenario")
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.seahelm_scenario.IndoNLIScenario")

     return RunSpec(
         name=name,
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
-        groups=["bhasa_nlr", "indonli"],
+        groups=["seahelm_nlr", "indonli"],
     )


@@ -456,7 +456,7 @@ def get_xnli_spec(language="vi") -> RunSpec:
     )

     scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.bhasa_scenario.XNLIScenario",
+        class_name="helm.benchmark.scenarios.seahelm_scenario.XNLIScenario",
         args={
             "language": language,
         },
@@ -467,7 +467,7 @@ def get_xnli_spec(language="vi") -> RunSpec:
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
-        groups=["bhasa_nlr", f"xnli_{language}"],
+        groups=["seahelm_nlr", f"xnli_{language}"],
     )


@@ -487,14 +487,14 @@ def get_indicxnli_spec() -> RunSpec:
         max_tokens=2,
     )

-    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bhasa_scenario.IndicXNLIScenario")
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.seahelm_scenario.IndicXNLIScenario")

     return RunSpec(
         name=name,
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
-        groups=["bhasa_nlr", "indicxnli"],
+        groups=["seahelm_nlr", "indicxnli"],
     )


@@ -531,7 +531,7 @@ def get_xcopa_spec(language="id") -> RunSpec:
     )

     scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.bhasa_scenario.XCOPAScenario",
+        class_name="helm.benchmark.scenarios.seahelm_scenario.XCOPAScenario",
         args={
             "language": language,
         },
@@ -542,7 +542,7 @@ def get_xcopa_spec(language="id") -> RunSpec:
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
-        groups=["bhasa_nlr", f"xcopa_{language}"],
+        groups=["seahelm_nlr", f"xcopa_{language}"],
     )


@@ -566,7 +566,7 @@ def get_lindsea_syntax_minimal_pairs_spec(language: str = "id", method: str = "m
     )

     scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.bhasa_scenario.LINDSEASyntaxMinimalPairsScenario",
+        class_name="helm.benchmark.scenarios.seahelm_scenario.LINDSEASyntaxMinimalPairsScenario",
         args={
             "method": method,
             "language": language,
@@ -578,14 +578,18 @@ def get_lindsea_syntax_minimal_pairs_spec(language: str = "id", method: str = "m
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs(),
-        groups=["bhasa_linguistic", f"lindsea_syntax_minimal_pairs_{language}"],
+        groups=[
+            "seahelm_linguistic",
+            f"lindsea_syntax_minimal_pairs_{language}",
+            f"lindsea_syntax_minimal_pairs_{method}_{language}",
+        ],
     )


-# 2.1. Pragmatics: LINDSEA Pragmatic Reasoning (single sentence)
-@run_spec_function("lindsea_pragmatics_pragmatic_reasoning_single")
-def get_lindsea_pragmatics_pragmatic_reasoning_single_spec(language="id") -> RunSpec:
-    name = f"lindsea_pragmatics_pragmatic_reasoning_single_{language}"
+# 2.1. Pragmatics: LINDSEA Presuppositions
+@run_spec_function("lindsea_pragmatics_presuppositions")
+def get_lindsea_pragmatics_presuppositions_spec(language: str = "id", subset: str = "all") -> RunSpec:
+    name = f"lindsea_pragmatics_presuppositions_{subset}_{language}"

     adapter_spec = get_generation_adapter_spec(
         output_noun=LINDSEA_OUTPUT_NOUNS[language],
@@ -595,9 +599,10 @@ def get_lindsea_pragmatics_pragmatic_reasoning_single_spec(language="id") -> Run
     )

     scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.bhasa_scenario.LINDSEAPragmaticsPragmaticReasoningSingleScenario",
+        class_name="helm.benchmark.scenarios.seahelm_scenario.LINDSEAPragmaticsPresuppositionsScenario",
         args={
             "language": language,
+            "subset": subset,
         },
     )

@@ -606,14 +611,18 @@ def get_lindsea_pragmatics_pragmatic_reasoning_single_spec(language="id") -> Run
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs(),
-        groups=["bhasa_linguistic", f"lindsea_pragmatics_pragmatic_reasoning_single_{language}"],
+        groups=[
+            "seahelm_linguistic",
+            f"lindsea_pragmatics_presuppositions_{language}",
+            f"lindsea_pragmatics_presuppositions_{subset}_{language}",
+        ],
     )


-# 2.2. Pragmatics: LINDSEA Pragmatic Reasoning (sentence pair)
-@run_spec_function("lindsea_pragmatics_pragmatic_reasoning_pair")
-def get_lindsea_pragmatics_pragmatic_reasoning_pair_spec(language="id") -> RunSpec:
-    name = f"lindsea_pragmatics_pragmatic_reasoning_pair_{language}"
+# 2.2. Pragmatics: LINDSEA Scalar Implicatures
+@run_spec_function("lindsea_pragmatics_scalar_implicatures")
+def get_lindsea_pragmatics_scalar_implicatures_spec(language: str = "id", subset: str = "all") -> RunSpec:
+    name = f"lindsea_pragmatics_scalar_implicatures_{subset}_{language}"

     adapter_spec = get_generation_adapter_spec(
         output_noun=LINDSEA_OUTPUT_NOUNS[language],
@@ -623,9 +632,10 @@ def get_lindsea_pragmatics_pragmatic_reasoning_pair_spec(language="id") -> RunSp
     )

     scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.bhasa_scenario.LINDSEAPragmaticsPragmaticReasoningPairScenario",
+        class_name="helm.benchmark.scenarios.seahelm_scenario.LINDSEAPragmaticsScalarImplicaturesScenario",
         args={
             "language": language,
+            "subset": subset,
         },
     )

@@ -634,5 +644,9 @@ def get_lindsea_pragmatics_pragmatic_reasoning_pair_spec(language="id") -> RunSp
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs(),
-        groups=["bhasa_linguistic", f"lindsea_pragmatics_pragmatic_reasoning_pair_{language}"],
+        groups=[
+            "seahelm_linguistic",
+            f"lindsea_pragmatics_scalar_implicatures_{language}",
+            f"lindsea_pragmatics_scalar_implicatures_{subset}_{language}",
+        ],
     )
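For illustration, a minimal sketch of calling one of the renamed LINDSEA pragmatics run-spec functions with the new subset argument; the import path below is an assumption, since this hunk does not show the file name.

# Sketch only: construct the presuppositions spec and inspect the subset-aware name and groups.
from helm.benchmark.run_specs.seahelm_run_specs import (  # assumed module path
    get_lindsea_pragmatics_presuppositions_spec,
)

spec = get_lindsea_pragmatics_presuppositions_spec(language="id", subset="all")
print(spec.name)    # lindsea_pragmatics_presuppositions_all_id
print(spec.groups)  # includes "seahelm_linguistic" plus the language- and subset-specific groups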
@@ -0,0 +1,54 @@
+from helm.benchmark.adaptation.common_adapter_specs import get_generation_adapter_spec
+from helm.benchmark.annotation.annotator import AnnotatorSpec
+from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs
+from helm.benchmark.metrics.metric import MetricSpec
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+@run_spec_function("bird_sql")
+def get_bird_sql_dev_run_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bird_sql_scenario.BIRDSQLScenario")
+
+    adapter_spec = get_generation_adapter_spec(
+        input_noun=None,
+        output_noun=None,
+        max_tokens=1024,
+        stop_sequences=[],
+    )
+
+    annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.bird_sql_annotator.BirdSQLAnnotator")]
+
+    return RunSpec(
+        name="bird_sql",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        annotators=annotator_specs,
+        metric_specs=get_exact_match_metric_specs()
+        + [MetricSpec(class_name="helm.benchmark.metrics.bird_sql_metrics.BirdSQLMetric")],
+        groups=["bird_sql"],
+    )
+
+
+@run_spec_function("spider")
+def get_spider_run_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.spider_scenario.SpiderScenario")
+
+    adapter_spec = get_generation_adapter_spec(
+        input_noun=None,
+        output_noun=None,
+        max_tokens=1024,
+        stop_sequences=[],
+    )
+
+    annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.spider_annotator.SpiderAnnotator")]
+
+    return RunSpec(
+        name="spider",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        annotators=annotator_specs,
+        metric_specs=get_exact_match_metric_specs()
+        + [MetricSpec(class_name="helm.benchmark.metrics.spider_metrics.SpiderMetric")],
+        groups=["spider"],
+    )
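For illustration, a minimal sketch of building the two new text-to-SQL run specs directly and listing the annotators they attach; the import path below is an assumption, since the diff does not show the new file's name.

# Sketch only: construct both specs and print their names, groups, and annotator classes.
from helm.benchmark.run_specs.sql_run_specs import (  # assumed module path
    get_bird_sql_dev_run_spec,
    get_spider_run_spec,
)

for spec in (get_bird_sql_dev_run_spec(), get_spider_run_spec()):
    annotators = [a.class_name for a in (spec.annotators or [])]
    print(spec.name, spec.groups, annotators)
# bird_sql ['bird_sql'] ['helm.benchmark.annotation.bird_sql_annotator.BirdSQLAnnotator']
# spider ['spider'] ['helm.benchmark.annotation.spider_annotator.SpiderAnnotator']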
@@ -0,0 +1,32 @@
+from helm.benchmark.adaptation.common_adapter_specs import get_generation_adapter_spec
+from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs, get_classification_metric_specs
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+@run_spec_function("tweetsentbr")
+def get_tweetsentbr_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.tweetsentbr_scenario.TweetSentBRScenario", args={}
+    )
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions="""Classifique o tweet como "Positivo", "Neutro" ou "Negativo".
+
+Tweet: vocês viram a novela hoje?
+Classe: Neutro
+
+Tweet: que vontade de comer pizza
+Classe: Neutro
+""",
+        input_noun="Tweet",
+        output_noun="Classe",
+    )
+
+    return RunSpec(
+        name="tweetsentbr",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
+        groups=["tweetsentbr"],
+    )
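For illustration, a rough sketch of the few-shot prompt shape that the instructions, input_noun, and output_noun above would produce for a single instance; this is an approximation, not output captured from HELM.

# Sketch only: approximate prompt layout for one evaluation instance.
prompt = (
    'Classifique o tweet como "Positivo", "Neutro" ou "Negativo".\n'
    "\n"
    "Tweet: vocês viram a novela hoje?\n"
    "Classe: Neutro\n"
    "\n"
    "Tweet: que vontade de comer pizza\n"
    "Classe: Neutro\n"
    "\n"
    "Tweet: <tweet under evaluation>\n"  # placeholder filled in by the adapter
    "Classe:"
)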
@@ -1,3 +1,5 @@
+import os
+
 from helm.benchmark.adaptation.adapter_spec import ADAPT_GENERATION, AdapterSpec
 from helm.benchmark.metrics.metric import MetricSpec
 from helm.benchmark.metrics.common_metric_specs import get_basic_metric_specs
@@ -8,9 +10,15 @@ from helm.benchmark.scenarios.scenario import ScenarioSpec
 @run_spec_function("unitxt")
 def get_unitxt_spec(**kwargs) -> RunSpec:
     card = kwargs.get("card")
-    if not card:
-        raise Exception("Unitxt card must be specified")
-    name_suffix = ",".join([f"{key}={value}" for key, value in kwargs.items()])
+    recipe = kwargs.get("recipe")
+    if not card and not recipe:
+        raise Exception("Unitxt card or recipe must be specified")
+    if os.environ.get("HELM_UNITXT_SHORTEN_RUN_SPEC_NAMES", "").lower() == "true":
+        name_suffix = ",".join(
+            [f"{key}={value}" for key, value in kwargs.items() if key not in ["template_card_index", "loader_limit"]]
+        )
+    else:
+        name_suffix = ",".join([f"{key}={value}" for key, value in kwargs.items()])
     name = f"unitxt:{name_suffix}"
     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.unitxt_scenario.UnitxtScenario", args=kwargs)
     adapter_spec = AdapterSpec(
@@ -28,7 +36,8 @@ def get_unitxt_spec(**kwargs) -> RunSpec:
         max_train_instances=0,
         num_outputs=1,
         temperature=0.0,
-        stop_sequences=["\n\n"],
+        max_tokens=512,
+        stop_sequences=[],
     )
     return RunSpec(
         name=name,
@@ -38,5 +47,5 @@ def get_unitxt_spec(**kwargs) -> RunSpec:
             MetricSpec(class_name="helm.benchmark.metrics.unitxt_metrics.UnitxtMetric", args=kwargs),
         ]
         + get_basic_metric_specs([]),
-        groups=[f"unitxt_{card}"],
+        groups=[f"unitxt_{card or recipe}"],
     )
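For illustration, the name-shortening toggle reproduced standalone so its effect on the generated run spec name is easy to see; the card value "cards.example" is a placeholder, not taken from the diff.

# Sketch only: how HELM_UNITXT_SHORTEN_RUN_SPEC_NAMES changes the run spec name.
import os

kwargs = {"card": "cards.example", "template_card_index": 0, "loader_limit": 100}

if os.environ.get("HELM_UNITXT_SHORTEN_RUN_SPEC_NAMES", "").lower() == "true":
    name_suffix = ",".join(
        f"{key}={value}" for key, value in kwargs.items() if key not in ["template_card_index", "loader_limit"]
    )
else:
    name_suffix = ",".join(f"{key}={value}" for key, value in kwargs.items())

print(f"unitxt:{name_suffix}")
# env var set to "true":  unitxt:card=cards.example
# otherwise:              unitxt:card=cards.example,template_card_index=0,loader_limit=100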