crfm-helm 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (606) hide show
  1. crfm_helm-0.5.5.dist-info/METADATA +413 -0
  2. crfm_helm-0.5.5.dist-info/RECORD +894 -0
  3. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  22. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  23. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  24. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  25. helm/benchmark/adaptation/prompt.py +1 -1
  26. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  27. helm/benchmark/annotation/air_bench_annotator.py +20 -5
  28. helm/benchmark/annotation/annotator.py +5 -0
  29. helm/benchmark/annotation/annotator_factory.py +3 -20
  30. helm/benchmark/annotation/anthropic_red_team_annotator.py +11 -24
  31. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  32. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  33. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  34. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  35. helm/benchmark/annotation/call_center_annotator.py +22 -11
  36. helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
  37. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  38. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  39. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  40. helm/benchmark/annotation/harm_bench_annotator.py +11 -24
  41. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  42. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  43. helm/benchmark/annotation/live_qa_annotator.py +10 -5
  44. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  45. helm/benchmark/annotation/medalign_annotator.py +100 -0
  46. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  47. helm/benchmark/annotation/medication_qa_annotator.py +90 -61
  48. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  49. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  50. helm/benchmark/annotation/model_as_judge.py +281 -18
  51. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  52. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  53. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  54. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  55. helm/benchmark/annotation/omni_math_annotator.py +132 -0
  56. helm/benchmark/annotation/simple_safety_tests_annotator.py +11 -25
  57. helm/benchmark/annotation/spider_annotator.py +18 -0
  58. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  59. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  60. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  61. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  62. helm/benchmark/annotation/xstest_annotator.py +20 -30
  63. helm/benchmark/annotation_executor.py +35 -15
  64. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  65. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  66. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  67. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  68. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  69. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  70. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  71. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  72. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  73. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  74. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  75. helm/benchmark/augmentations/perturbation.py +1 -1
  76. helm/benchmark/augmentations/space_perturbation.py +2 -2
  77. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  78. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  79. helm/benchmark/augmentations/test_perturbation.py +16 -13
  80. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  81. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  82. helm/benchmark/data_preprocessor.py +2 -2
  83. helm/benchmark/huggingface_registration.py +2 -7
  84. helm/benchmark/metrics/aci_bench_metrics.py +34 -0
  85. helm/benchmark/metrics/basic_metrics.py +6 -6
  86. helm/benchmark/metrics/bbq_metrics.py +2 -2
  87. helm/benchmark/metrics/bias_metrics.py +12 -3
  88. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  89. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  90. helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
  91. helm/benchmark/metrics/classification_metrics.py +76 -12
  92. helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
  93. helm/benchmark/metrics/code_metrics.py +5 -5
  94. helm/benchmark/metrics/comet_metric.py +125 -0
  95. helm/benchmark/metrics/common_metric_specs.py +9 -2
  96. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  97. helm/benchmark/metrics/copyright_metrics.py +4 -4
  98. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  99. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  100. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  101. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  102. helm/benchmark/metrics/dischargeme_metrics.py +34 -0
  103. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  104. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  105. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  106. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  107. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  108. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  109. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  110. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  111. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  112. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  113. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  114. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  115. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  116. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  117. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  118. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  120. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  121. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  122. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  123. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  124. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  125. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  126. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  127. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  128. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  129. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  130. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  131. helm/benchmark/metrics/med_dialog_metrics.py +34 -0
  132. helm/benchmark/metrics/medalign_metrics.py +34 -0
  133. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  134. helm/benchmark/metrics/medec_metrics.py +101 -0
  135. helm/benchmark/metrics/medi_qa_metrics.py +34 -0
  136. helm/benchmark/metrics/medication_qa_metrics.py +15 -4
  137. helm/benchmark/metrics/mental_health_metrics.py +34 -0
  138. helm/benchmark/metrics/metric.py +3 -3
  139. helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
  140. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  141. helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
  142. helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
  143. helm/benchmark/metrics/nltk_helper.py +32 -0
  144. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  145. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  146. helm/benchmark/metrics/output_processing_metric.py +60 -0
  147. helm/benchmark/metrics/output_processors.py +15 -0
  148. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  149. helm/benchmark/metrics/ranking_metrics.py +3 -3
  150. helm/benchmark/metrics/reference_metric.py +3 -3
  151. helm/benchmark/metrics/safety_metrics.py +39 -17
  152. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  153. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  154. helm/benchmark/metrics/spider_metrics.py +7 -0
  155. helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
  156. helm/benchmark/metrics/statistic.py +1 -1
  157. helm/benchmark/metrics/summac/model_summac.py +1 -1
  158. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  159. helm/benchmark/metrics/summarization_metrics.py +19 -9
  160. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  161. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  162. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  163. helm/benchmark/metrics/test_metric.py +1 -1
  164. helm/benchmark/metrics/test_statistic.py +2 -2
  165. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  166. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  167. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  168. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  169. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  170. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  171. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  172. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
  173. helm/benchmark/metrics/toxicity_metrics.py +4 -4
  174. helm/benchmark/metrics/unitxt_metrics.py +21 -4
  175. helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
  176. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  177. helm/benchmark/model_metadata_registry.py +16 -0
  178. helm/benchmark/presentation/create_plots.py +1 -1
  179. helm/benchmark/presentation/schema.py +3 -0
  180. helm/benchmark/presentation/summarize.py +119 -256
  181. helm/benchmark/presentation/test_summarize.py +145 -3
  182. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  183. helm/benchmark/reeval_run.py +203 -0
  184. helm/benchmark/reeval_runner.py +355 -0
  185. helm/benchmark/run.py +8 -17
  186. helm/benchmark/run_expander.py +105 -8
  187. helm/benchmark/run_spec_factory.py +12 -0
  188. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  189. helm/benchmark/run_specs/audio_run_specs.py +613 -0
  190. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  191. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  192. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  193. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  194. helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
  195. helm/benchmark/run_specs/experimental_run_specs.py +112 -3
  196. helm/benchmark/run_specs/finance_run_specs.py +6 -2
  197. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  198. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  199. helm/benchmark/run_specs/long_context_run_specs.py +89 -0
  200. helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
  201. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  202. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  203. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  204. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +66 -52
  205. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  206. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  207. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  208. helm/benchmark/run_specs/vlm_run_specs.py +83 -5
  209. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  210. helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
  211. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  212. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  213. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  214. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  215. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
  216. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  217. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  218. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  219. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  220. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  221. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  222. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  223. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  224. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  225. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  226. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  227. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  228. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  229. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  230. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  231. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  232. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  233. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  234. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  235. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
  236. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  237. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
  238. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  239. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  240. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  241. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  242. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  243. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  244. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  245. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  246. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  247. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  248. helm/benchmark/scenarios/bold_scenario.py +1 -1
  249. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  250. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  251. helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
  252. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  253. helm/benchmark/scenarios/clear_scenario.py +153 -0
  254. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  255. helm/benchmark/scenarios/code_scenario.py +17 -4
  256. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  257. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  258. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  259. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  260. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  261. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  262. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  263. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  264. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  265. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  266. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  267. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  268. helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
  269. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  270. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  271. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  272. helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
  273. helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
  274. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  275. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  276. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  277. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  278. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  279. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  280. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  281. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  282. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  283. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  284. helm/benchmark/scenarios/headqa_scenario.py +131 -0
  285. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  286. helm/benchmark/scenarios/ice_scenario.py +8 -4
  287. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  288. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  289. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  290. helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
  291. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  292. helm/benchmark/scenarios/koala_scenario.py +1 -1
  293. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  294. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  295. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  296. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  297. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  298. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  299. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  300. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  301. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  302. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  303. helm/benchmark/scenarios/math_scenario.py +9 -1
  304. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  305. helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
  306. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  307. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  308. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  309. helm/benchmark/scenarios/medalign_scenario.py +88 -0
  310. helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
  311. helm/benchmark/scenarios/medbullets_scenario.py +140 -0
  312. helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
  313. helm/benchmark/scenarios/medec_scenario.py +120 -0
  314. helm/benchmark/scenarios/medhallu_scenario.py +66 -0
  315. helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
  316. helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
  317. helm/benchmark/scenarios/mental_health_scenario.py +112 -0
  318. helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
  319. helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
  320. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
  321. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  322. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  323. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  324. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  325. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
  326. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
  327. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
  328. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  329. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  330. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  331. helm/benchmark/scenarios/numeracy_scenario.py +10 -1
  332. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  333. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  334. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  335. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  336. helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
  337. helm/benchmark/scenarios/quac_scenario.py +10 -1
  338. helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
  339. helm/benchmark/scenarios/raft_scenario.py +18 -3
  340. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  341. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  342. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  343. helm/benchmark/scenarios/scenario.py +9 -1
  344. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +233 -84
  345. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  346. helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
  347. helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
  348. helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
  349. helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
  350. helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
  351. helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
  352. helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
  353. helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
  354. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  355. helm/benchmark/scenarios/spider_scenario.py +91 -0
  356. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
  357. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  358. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  359. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  360. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  361. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  362. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  363. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  364. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  365. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  366. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  367. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  368. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  369. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  370. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
  371. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  372. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  373. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  374. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  375. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  376. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  377. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  378. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  379. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  380. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  381. helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
  382. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  383. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  384. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  385. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  386. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  387. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  388. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  389. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  390. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  391. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  392. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  393. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  394. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  395. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  396. helm/benchmark/server.py +11 -0
  397. helm/benchmark/slurm_runner.py +1 -1
  398. helm/benchmark/static/schema_audio.yaml +752 -0
  399. helm/benchmark/static/schema_autobencher.yaml +150 -0
  400. helm/benchmark/static/schema_call_center.yaml +97 -60
  401. helm/benchmark/static/schema_capabilities.yaml +254 -0
  402. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  403. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  404. helm/benchmark/static/schema_enterprise.yaml +298 -0
  405. helm/benchmark/static/schema_finance.yaml +14 -12
  406. helm/benchmark/static/schema_heim.yaml +1389 -0
  407. helm/benchmark/static/schema_legal.yaml +566 -0
  408. helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
  409. helm/benchmark/static/schema_medhelm.yaml +1081 -0
  410. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  411. helm/benchmark/static/schema_safety.yaml +42 -6
  412. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +40 -26
  413. helm/benchmark/static/schema_social_audio.yaml +224 -0
  414. helm/benchmark/static/schema_sql.yaml +171 -0
  415. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +187 -30
  416. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  417. helm/benchmark/static/schema_vhelm.yaml +151 -47
  418. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  419. helm/benchmark/static_build/assets/index-262903c1.js +10 -0
  420. helm/benchmark/static_build/assets/index-42060d71.css +1 -0
  421. helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
  422. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  423. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  424. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
  425. helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
  426. helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
  427. helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
  428. helm/benchmark/static_build/config.js +1 -1
  429. helm/benchmark/static_build/index.html +5 -5
  430. helm/benchmark/window_services/default_window_service.py +1 -1
  431. helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
  432. helm/benchmark/window_services/ice_window_service.py +1 -1
  433. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  434. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  435. helm/benchmark/window_services/local_window_service.py +2 -2
  436. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  437. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  438. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  439. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  440. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  441. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  442. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  443. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  444. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  445. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  446. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  447. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  448. helm/benchmark/window_services/test_utils.py +1 -1
  449. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  450. helm/benchmark/window_services/tokenizer_service.py +0 -5
  451. helm/benchmark/window_services/yalm_window_service.py +1 -1
  452. helm/clients/ai21_client.py +3 -3
  453. helm/clients/aleph_alpha_client.py +1 -1
  454. helm/clients/audio_language/__init__.py +0 -0
  455. helm/clients/audio_language/diva_llama_client.py +118 -0
  456. helm/clients/audio_language/llama_omni_client.py +198 -0
  457. helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
  458. helm/clients/audio_language/qwen_audiolm_client.py +150 -0
  459. helm/clients/auto_client.py +4 -2
  460. helm/clients/azure_openai_client.py +55 -0
  461. helm/clients/bedrock_client.py +201 -7
  462. helm/clients/bedrock_utils.py +33 -0
  463. helm/clients/clip_scorers/clip_scorer.py +1 -1
  464. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  465. helm/clients/cohere_client.py +3 -3
  466. helm/clients/google_client.py +1 -1
  467. helm/clients/http_model_client.py +1 -1
  468. helm/clients/huggingface_client.py +10 -18
  469. helm/clients/ibm_client.py +267 -0
  470. helm/clients/image_generation/adobe_vision_client.py +1 -1
  471. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  472. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  473. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  474. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  475. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  476. helm/clients/image_generation/cogview2_client.py +1 -1
  477. helm/clients/image_generation/dalle2_client.py +1 -1
  478. helm/clients/image_generation/dalle3_client.py +2 -2
  479. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  480. helm/clients/image_generation/dalle_mini/data.py +1 -1
  481. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  482. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  483. helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
  484. helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
  485. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  486. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  487. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  488. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  489. helm/clients/image_generation/dalle_mini_client.py +1 -1
  490. helm/clients/image_generation/deep_floyd_client.py +1 -1
  491. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  492. helm/clients/image_generation/lexica_client.py +1 -1
  493. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  494. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  495. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  496. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  497. helm/clients/image_generation/mindalle_client.py +1 -1
  498. helm/clients/image_generation/together_image_generation_client.py +1 -1
  499. helm/clients/lit_gpt_client.py +2 -2
  500. helm/clients/mistral_client.py +62 -18
  501. helm/clients/nvidia_nim_client.py +0 -3
  502. helm/clients/openai_client.py +255 -21
  503. helm/clients/palmyra_client.py +2 -6
  504. helm/clients/reka_client.py +1 -1
  505. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  506. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  507. helm/clients/stanfordhealthcare_google_client.py +43 -0
  508. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  509. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  510. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  511. helm/clients/test_client.py +1 -1
  512. helm/clients/test_together_client.py +6 -1
  513. helm/clients/together_client.py +69 -7
  514. helm/clients/upstage_client.py +23 -0
  515. helm/clients/vertexai_client.py +39 -13
  516. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  517. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  518. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  519. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  520. helm/clients/vision_language/qwen2_vlm_client.py +175 -0
  521. helm/clients/vllm_client.py +4 -6
  522. helm/clients/yi_client.py +0 -3
  523. helm/common/audio_utils.py +111 -0
  524. helm/common/cache.py +8 -30
  525. helm/common/file_caches/local_file_cache.py +1 -1
  526. helm/common/file_caches/test_local_file_cache.py +1 -1
  527. helm/common/images_utils.py +2 -2
  528. helm/common/key_value_store.py +9 -9
  529. helm/common/media_object.py +2 -2
  530. helm/common/mongo_key_value_store.py +3 -3
  531. helm/common/multimodal_request_utils.py +26 -0
  532. helm/common/reeval_parameters.py +12 -0
  533. helm/common/request.py +6 -2
  534. helm/common/response_format.py +18 -0
  535. helm/common/test_cache.py +1 -48
  536. helm/common/test_media_object.py +1 -1
  537. helm/common/tokenization_request.py +0 -9
  538. helm/config/model_deployments.yaml +1258 -33
  539. helm/config/model_metadata.yaml +1110 -41
  540. helm/config/tokenizer_configs.yaml +403 -3
  541. helm/proxy/cli.py +2 -2
  542. helm/proxy/example_queries.py +1 -1
  543. helm/proxy/server.py +11 -13
  544. helm/proxy/services/remote_service.py +1 -7
  545. helm/proxy/services/server_service.py +6 -19
  546. helm/proxy/services/service.py +0 -6
  547. helm/proxy/services/test_remote_service.py +2 -2
  548. helm/proxy/services/test_service.py +1 -1
  549. helm/proxy/static/general.js +122 -0
  550. helm/proxy/static/help.html +99 -0
  551. helm/proxy/static/index.css +57 -0
  552. helm/proxy/static/index.html +40 -0
  553. helm/proxy/static/index.js +456 -0
  554. helm/proxy/static/info-icon.png +0 -0
  555. helm/proxy/test_retry.py +1 -1
  556. helm/proxy/token_counters/auto_token_counter.py +1 -1
  557. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  558. helm/tokenizers/caching_tokenizer.py +2 -30
  559. helm/tokenizers/http_model_tokenizer.py +1 -1
  560. helm/tokenizers/huggingface_tokenizer.py +2 -2
  561. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  562. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  563. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  564. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  565. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  566. helm/tokenizers/tokenizer.py +3 -1
  567. helm/tokenizers/yalm_tokenizer.py +3 -3
  568. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  569. crfm_helm-0.5.3.dist-info/METADATA +0 -355
  570. crfm_helm-0.5.3.dist-info/RECORD +0 -699
  571. helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
  572. helm/benchmark/data_overlap/export_scenario_text.py +0 -119
  573. helm/benchmark/data_overlap/light_scenario.py +0 -60
  574. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  575. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  576. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  577. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  578. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  579. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  580. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  581. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  582. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  583. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  584. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  585. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  586. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  587. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  588. helm/benchmark/static_build/assets/index-58f97dcd.js +0 -10
  589. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  590. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  591. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  592. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  593. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  594. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  595. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  596. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  597. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  598. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  599. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  600. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  601. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  602. helm/tokenizers/anthropic_tokenizer.py +0 -52
  603. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
  604. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
  605. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
  606. /helm/benchmark/{data_overlap → metrics/ifeval}/__init__.py +0 -0
@@ -0,0 +1,53 @@
1
+ import pytest
2
+ from tempfile import TemporaryDirectory
3
+
4
+ from helm.benchmark.scenarios.mmlu_pro_scenario import MMLUProScenario
5
+ from helm.benchmark.scenarios.scenario import CORRECT_TAG, Input, Output, Reference
6
+
7
+
8
+ @pytest.mark.scenarios
9
+ def test_mmlu_pro_scenario():
10
+ with TemporaryDirectory() as tmpdir:
11
+ # Test for the "abstract_algebra" subject
12
+ scenario = MMLUProScenario(subject="math")
13
+ instances = scenario.get_instances(tmpdir)
14
+ assert instances[1].input == Input(
15
+ text="Let V be the set of all real polynomials p(x). Let transformations T, S be defined on V by T:p(x) -> xp(x) and S:p(x) -> p'(x) = d/dx p(x), and interpret (ST)(p(x)) as S(T(p(x))). Which of the following is true?" # noqa: E501
16
+ )
17
+
18
+ # Ensure it handles up to 10 answer options
19
+ assert instances[1].references == [
20
+ Reference(output=Output(text="ST + TS is the identity map of V onto itself."), tags=[]),
21
+ Reference(output=Output(text="TS = 0"), tags=[]),
22
+ Reference(output=Output(text="ST = 1"), tags=[]),
23
+ Reference(output=Output(text="ST - TS = 0"), tags=[]),
24
+ Reference(output=Output(text="ST = T"), tags=[]),
25
+ Reference(output=Output(text="ST = 0"), tags=[]),
26
+ Reference(output=Output(text="ST = TS"), tags=[]),
27
+ Reference(output=Output(text="ST - TS is the identity map of V onto itself."), tags=[CORRECT_TAG]),
28
+ Reference(output=Output(text="TS = T"), tags=[]),
29
+ Reference(output=Output(text="ST = S"), tags=[]),
30
+ ]
31
+ assert instances[1].split == "train"
32
+
33
+ scenario = MMLUProScenario(subject="health")
34
+ instances = scenario.get_instances(tmpdir)
35
+
36
+ assert instances[0].input == Input(
37
+ text="Which of the following is the body cavity that contains the pituitary gland?"
38
+ )
39
+
40
+ # Check references with more answer choices and correct tagging
41
+ assert instances[0].references == [
42
+ Reference(output=Output(text="Ventral"), tags=[]),
43
+ Reference(output=Output(text="Dorsal"), tags=[]),
44
+ Reference(output=Output(text="Buccal"), tags=[]),
45
+ Reference(output=Output(text="Thoracic"), tags=[]),
46
+ Reference(output=Output(text="Pericardial"), tags=[]),
47
+ Reference(output=Output(text="Abdominal"), tags=[]),
48
+ Reference(output=Output(text="Spinal"), tags=[]),
49
+ Reference(output=Output(text="Pelvic"), tags=[]),
50
+ Reference(output=Output(text="Pleural"), tags=[]),
51
+ Reference(output=Output(text="Cranial"), tags=[CORRECT_TAG]),
52
+ ]
53
+ assert instances[0].split == "train"
@@ -0,0 +1,51 @@
1
+ import pytest
2
+ from tempfile import TemporaryDirectory
3
+
4
+ from helm.benchmark.scenarios.oab_exams_scenario import OABExamsScenario
5
+ from helm.benchmark.scenarios.scenario import TEST_SPLIT, CORRECT_TAG, Output, Reference
6
+
7
+
8
+ @pytest.mark.scenarios
9
+ def test_oab_exams_scenario():
10
+ oab_scenario = OABExamsScenario()
11
+ with TemporaryDirectory() as tmpdir:
12
+ instances = oab_scenario.get_instances(tmpdir)
13
+ assert len(instances) == 2210
14
+ assert instances[100].split == TEST_SPLIT
15
+
16
+ assert instances[100].input.text.startswith(
17
+ "O Congresso Nacional e suas respectivas Casas se reúnem anualmente para a atividade legislativa."
18
+ )
19
+ assert len(instances[100].input.text) == 178
20
+
21
+ assert instances[100].references == [
22
+ Reference(
23
+ output=Output(
24
+ text="Legislatura: o período compreendido entre 2 de fevereiro a 17 de julho e 1º de agosto a 22 de dezembro." # noqa: E501
25
+ ),
26
+ tags=[],
27
+ ),
28
+ Reference(
29
+ output=Output(
30
+ text="Sessão legislativa: os quatro anos equivalentes ao mandato dos parlamentares." # noqa: E501
31
+ ),
32
+ tags=[],
33
+ ),
34
+ Reference(
35
+ output=Output(
36
+ text="Sessão conjunta: a reunião da Câmara dos Deputados e do Senado Federal destinada, por exemplo, "
37
+ "a conhecer do veto presidencial e sobre ele deliberar." # noqa: E501
38
+ ),
39
+ tags=[CORRECT_TAG],
40
+ ),
41
+ Reference(
42
+ output=Output(
43
+ text="Sessão extraordinária: a que ocorre por convocação ou do Presidente do Senado Federal ou do "
44
+ "Presidente da Câmara dos Deputados ou do Presidente da República e mesmo por requerimento da maioria "
45
+ "dos membros de ambas as Casas para, excepcionalmente, inaugurar a "
46
+ "sessão legislativa e eleger as respectivas mesas diretoras." # noqa: E501
47
+ ),
48
+ tags=[],
49
+ ),
50
+ ]
51
+ assert instances[100].references[2].is_correct
@@ -0,0 +1,27 @@
1
+ import pytest
2
+ from tempfile import TemporaryDirectory
3
+
4
+ from helm.benchmark.scenarios.omni_math_scenario import OmniMATHScenario
5
+ from helm.benchmark.scenarios.scenario import Input, TEST_SPLIT
6
+
7
+
8
+ @pytest.mark.scenarios
9
+ def test_omni_math_scenario_get_instances():
10
+ omni_math_scenario = OmniMATHScenario()
11
+ with TemporaryDirectory() as tmpdir:
12
+ instances = omni_math_scenario.get_instances(tmpdir)
13
+ assert len(instances) == 4428
14
+ assert instances[0].input == Input(
15
+ text=(
16
+ "Let $ n(\\ge2) $ be a positive integer. Find the minimum $ m $, "
17
+ "so that there exists $x_{ij}(1\\le i ,j\\le n)$ satisfying:\n(1)For every "
18
+ "$1\\le i ,j\\le n, x_{ij}=max\\{x_{i1},x_{i2},...,x_{ij}\\} $ or $ x_{ij}="
19
+ "max\\{x_{1j},x_{2j},...,x_{ij}\\}.$\n(2)For every $1\\le i \\le n$, there "
20
+ "are at most $m$ indices $k$ with $x_{ik}=max\\{x_{i1},x_{i2},...,x_{ik}\\}."
21
+ "$\n(3)For every $1\\le j \\le n$, there are at most $m$ indices $k$ with "
22
+ "$x_{kj}=max\\{x_{1j},x_{2j},...,x_{kj}\\}.$"
23
+ )
24
+ )
25
+ assert instances[0].split == TEST_SPLIT
26
+ assert instances[0].references
27
+ assert instances[0].references[0].output.text == "1 + \\left\\lceil \\frac{n}{2} \\right\\rceil"
@@ -0,0 +1,24 @@
1
+ import pytest
2
+ from tempfile import TemporaryDirectory
3
+
4
+ from helm.benchmark.scenarios.tweetsentbr_scenario import TweetSentBRScenario
5
+ from helm.benchmark.scenarios.scenario import TRAIN_SPLIT, CORRECT_TAG, Output, Reference
6
+
7
+
8
+ @pytest.mark.scenarios
9
+ def test_tweetsentbr_scenario():
10
+ tweetsentbr = TweetSentBRScenario()
11
+ with TemporaryDirectory() as tmpdir:
12
+ instances = tweetsentbr.get_instances(tmpdir)
13
+ assert len(instances) == 2085
14
+ assert instances[0].split == TRAIN_SPLIT
15
+
16
+ assert instances[0].input.text.startswith("joca tá com a corda toda 😂 😂 😂 😂")
17
+ assert len(instances[0].input.text) == 32
18
+
19
+ assert instances[0].references == [
20
+ Reference(
21
+ output=Output(text="Positivo"),
22
+ tags=[CORRECT_TAG],
23
+ )
24
+ ]
@@ -0,0 +1,15 @@
1
+ import pytest
2
+ from tempfile import TemporaryDirectory
3
+
4
+ from helm.benchmark.scenarios.wildbench_scenario import WildBenchScenario
5
+ from helm.benchmark.scenarios.scenario import TEST_SPLIT
6
+
7
+
8
+ @pytest.mark.scenarios
9
+ def test_wildbench_scenario_get_instances():
10
+ wildbench_scenario = WildBenchScenario(subset="v2")
11
+ with TemporaryDirectory() as tmpdir:
12
+ instances = wildbench_scenario.get_instances(tmpdir)
13
+ assert len(instances) == 1024
14
+ assert instances[0].split == TEST_SPLIT
15
+ assert instances[0].extra_data
@@ -0,0 +1,19 @@
1
+ import pytest
2
+ from tempfile import TemporaryDirectory
3
+
4
+ from helm.benchmark.scenarios.winogrande_afr_scenario import Winogrande_Afr_Scenario
5
+ from helm.benchmark.scenarios.scenario import CORRECT_TAG, Input, Output, Reference
6
+
7
+
8
+ @pytest.mark.scenarios
9
+ def test_winogrande_afr_scenario():
10
+ with TemporaryDirectory() as tmpdir:
11
+ scenario = Winogrande_Afr_Scenario(lang="am")
12
+ instances = scenario.get_instances(tmpdir)
13
+ assert len(instances) == 3674
14
+ assert instances[0].input == Input(text="ሳራ ከማሪያ በጣም የተሻለች የቀዶ ጥገና ሐኪም ስለነበረች ሁልጊዜ _ ቀላል ህመሞችን ታክማለች.")
15
+ assert instances[0].references == [
16
+ Reference(output=Output(text="ሳራ"), tags=[]),
17
+ Reference(output=Output(text="ማሪያ"), tags=[CORRECT_TAG]),
18
+ ]
19
+ assert instances[0].split == "train"
@@ -4,7 +4,16 @@ import json
4
4
 
5
5
  from helm.common.general import ensure_file_downloaded
6
6
  from helm.common.hierarchical_logger import hlog
7
- from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, TEST_SPLIT, CORRECT_TAG, Input, Output
7
+ from helm.benchmark.scenarios.scenario import (
8
+ Scenario,
9
+ Instance,
10
+ Reference,
11
+ TRAIN_SPLIT,
12
+ TEST_SPLIT,
13
+ CORRECT_TAG,
14
+ Input,
15
+ Output,
16
+ )
8
17
 
9
18
 
10
19
  class ThaiExamScenario(Scenario):
@@ -7,7 +7,7 @@ from typing import Dict, List
7
7
 
8
8
  from helm.common.general import ensure_file_downloaded
9
9
  from helm.common.hierarchical_logger import hlog, htrack, htrack_block
10
- from .scenario import Scenario, Instance, TEST_SPLIT, Input
10
+ from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input
11
11
 
12
12
 
13
13
  class ThePileScenario(Scenario):
@@ -3,7 +3,16 @@ import os
3
3
  from typing import List, Dict, Any
4
4
 
5
5
  from helm.common.general import ensure_file_downloaded, ensure_directory_exists
6
- from .scenario import Scenario, Instance, Reference, VALID_SPLIT, TRAIN_SPLIT, CORRECT_TAG, Input, Output
6
+ from helm.benchmark.scenarios.scenario import (
7
+ Scenario,
8
+ Instance,
9
+ Reference,
10
+ VALID_SPLIT,
11
+ TRAIN_SPLIT,
12
+ CORRECT_TAG,
13
+ Input,
14
+ Output,
15
+ )
7
16
 
8
17
 
9
18
  class TruthfulQAScenario(Scenario):
@@ -0,0 +1,66 @@
1
+ from typing import Any, List, Dict
2
+ from pathlib import Path
3
+ from datasets import load_dataset
4
+ from helm.common.hierarchical_logger import hlog
5
+ from helm.benchmark.scenarios.scenario import (
6
+ Scenario,
7
+ Instance,
8
+ Reference,
9
+ TRAIN_SPLIT,
10
+ TEST_SPLIT,
11
+ CORRECT_TAG,
12
+ Input,
13
+ Output,
14
+ )
15
+
16
+
17
+ class TweetSentBRScenario(Scenario):
18
+ """
19
+ TweetSentBR is a corpus of Tweets in Brazilian Portuguese. It was labeled by several
20
+ annotators following steps stablished on the literature for improving reliability on
21
+ the task of Sentiment Analysis. Each Tweet was annotated in one of the three following classes:
22
+
23
+ Positive - tweets where a user meant a positive reaction or evaluation about the main topic on the post;
24
+ Negative - tweets where a user meant a negative reaction or evaluation about the main topic on the post;
25
+ Neutral - tweets not belonging to any of the last classes, usually not making a point, out of topic,
26
+ irrelevant, confusing or containing only objective data.
27
+
28
+ This dataset is a subset of the tweetSentBR, it contains only 75 samples from the training set
29
+ and all 2.000+ instances of the test set. This is meant for evaluating language models in a few-shot setting.
30
+ """
31
+
32
+ name = "simple_classification"
33
+ description = "Classify tweets into Positive, Negative or Neutral."
34
+ tags = ["classification"]
35
+
36
+ def process_dataset(self, dataset: Any, split: str) -> List[Instance]:
37
+ instances: List[Instance] = []
38
+ label_names = {"Positive": "Positivo", "Negative": "Negativo", "Neutral": "Neutro"}
39
+ for example in dataset[split]:
40
+ input = Input(text=example["sentence"])
41
+ # NOTE: For classification scenarios, the reference outputs should be the same
42
+ # for all instances, and should include both correct and incorrect classes.
43
+ # HELM only supports single-label classification. Exactly one reference
44
+ # should have the CORRECT_TAG tag.
45
+ references = [
46
+ Reference(Output(text=label_names[example["label"]]), tags=[CORRECT_TAG]),
47
+ ]
48
+ instance = Instance(input=input, references=references, split=split)
49
+ instances.append(instance)
50
+ return instances
51
+
52
+ def get_instances(self, output_path: str) -> List[Instance]:
53
+ instances: List[Instance] = []
54
+ cache_dir = str(Path(output_path) / "data")
55
+ dataset = load_dataset("eduagarcia/tweetsentbr_fewshot", cache_dir=cache_dir)
56
+ splits: Dict[str, str] = {
57
+ "train": TRAIN_SPLIT,
58
+ "test": TEST_SPLIT,
59
+ }
60
+ for split in splits:
61
+ if split not in splits.keys():
62
+ hlog(f"{split} split doesn't exist, skipping")
63
+ continue
64
+ instances.extend(self.process_dataset(dataset, splits[split]))
65
+
66
+ return instances
@@ -4,7 +4,7 @@ from typing import List
4
4
 
5
5
  from helm.common.general import ensure_file_downloaded
6
6
  from helm.common.hierarchical_logger import hlog
7
- from .scenario import Scenario, Instance, TEST_SPLIT, Input
7
+ from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input
8
8
 
9
9
  CODALAB_URI_TEMPLATE: str = (
10
10
  "https://worksheets.codalab.org/rest/bundles/0x31485f8c37ad481fb9f4e9bf7ccff6e5/contents/blob/"
@@ -32,13 +32,19 @@ class UnitxtScenario(Scenario):
32
32
  self.kwargs = kwargs
33
33
 
34
34
  def get_instances(self, output_path: str) -> List[Instance]:
35
- dataset_name = ",".join(f"{key}={value}" for key, value in self.kwargs.items())
35
+ if len(self.kwargs) == 1 and "recipe" in self.kwargs:
36
+ dataset_name = self.kwargs["recipe"]
37
+ else:
38
+ dataset_name = ",".join(f"{key}={value}" for key, value in self.kwargs.items())
36
39
  dataset = load_dataset("unitxt/data", dataset_name, trust_remote_code=True)
37
40
 
38
41
  instances: List[Instance] = []
39
42
 
40
43
  for unitxt_split_name, helm_split_name in UnitxtScenario.UNITXT_SPLIT_NAME_TO_HELM_SPLIT_NAME.items():
41
- for index, row in enumerate(dataset[unitxt_split_name]):
44
+ dataset_split = dataset.get(unitxt_split_name)
45
+ if dataset_split is None:
46
+ continue
47
+ for index, row in enumerate(dataset_split):
42
48
  references = [
43
49
  Reference(
44
50
  output=Output(text=reference_text),
@@ -4,7 +4,7 @@ import json
4
4
  from typing import Dict, List, Tuple
5
5
 
6
6
  from helm.common.general import ensure_file_downloaded
7
- from .scenario import (
7
+ from helm.benchmark.scenarios.scenario import (
8
8
  Scenario,
9
9
  Instance,
10
10
  Reference,
@@ -3,7 +3,7 @@ import os
3
3
  from typing import List
4
4
 
5
5
  from helm.common.general import ensure_file_downloaded
6
- from .scenario import Scenario, Instance, Input, TEST_SPLIT
6
+ from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT
7
7
 
8
8
 
9
9
  class VicunaScenario(Scenario):
@@ -0,0 +1,140 @@
1
+ from typing import List
2
+ import os
3
+
4
+ from datasets import load_dataset
5
+ from tqdm import tqdm
6
+
7
+ from helm.benchmark.scenarios.scenario import (
8
+ CORRECT_TAG,
9
+ VALID_SPLIT,
10
+ Instance,
11
+ Input,
12
+ Output,
13
+ Reference,
14
+ Scenario,
15
+ )
16
+ from helm.common.media_object import MediaObject, MultimediaObject
17
+ from helm.common.images_utils import generate_hash
18
+
19
+
20
+ class BlinkScenario(Scenario):
21
+ """
22
+ BLINK is a benchmark containing 14 visual perception tasks that can be solved by humans “within a blink”,
23
+ but pose significant challenges for VLMs.
24
+
25
+ Website: https://zeyofu.github.io/blink/
26
+
27
+ @article{fu2024blink,
28
+ title={BLINK: Multimodal Large Language Models Can See but Not Perceive},
29
+ author={Fu, Xingyu and Hu, Yushi and Li, Bangzheng and Feng, Yu and Wang, Haoyu and Lin, Xudong and Roth,
30
+ Dan and Smith, Noah A and Ma, Wei-Chiu and Krishna, Ranjay},
31
+ journal={arXiv preprint arXiv:2404.12390},
32
+ year={2024}
33
+ }
34
+ """
35
+
36
+ HUGGINGFACE_DATASET_NAME: str = "BLINK-Benchmark/BLINK"
37
+
38
+ VALID_CATEGORIES: List[str] = [
39
+ "Art_Style",
40
+ "Counting",
41
+ "Forensic_Detection",
42
+ "Functional_Correspondence",
43
+ "IQ_Test",
44
+ "Jigsaw",
45
+ "Multi-view_Reasoning",
46
+ "Object_Localization",
47
+ "Relative_Depth",
48
+ "Relative_Reflectance",
49
+ "Semantic_Correspondence",
50
+ "Spatial_Relation",
51
+ "Visual_Correspondence",
52
+ "Visual_Similarity",
53
+ ]
54
+
55
+ name = "blink"
56
+ description = (
57
+ "BLINK is a benchmark containing 14 visual perception tasks that can be solved by humans within a blink, "
58
+ "but pose significant challenges for VLMs. ([Fu, 2024](https://arxiv.org/abs/2404.12390))."
59
+ )
60
+ tags = ["vision-language", "knowledge", "reasoning"]
61
+
62
+ def __init__(self, category: str):
63
+ super().__init__()
64
+
65
+ if category not in self.VALID_CATEGORIES:
66
+ raise ValueError(f"Invalid category: {category}. Valid categories are: {self.VALID_CATEGORIES}")
67
+ self._category: str = category
68
+
69
+ def get_instances(self, output_path: str) -> List[Instance]:
70
+ def save_image(image) -> str:
71
+ image_file_name: str = generate_hash(image) + ".jpg"
72
+ local_image_path: str = os.path.join(output_path, image_file_name)
73
+ if not os.path.exists(local_image_path):
74
+ image.save(local_image_path)
75
+ return local_image_path
76
+
77
+ def get_image_header(image_index: int) -> str:
78
+ if image_index == 1:
79
+ return "First image:"
80
+ elif image_index == 2:
81
+ return "Second image:"
82
+ elif image_index == 3:
83
+ return "Third image:"
84
+ elif image_index == 4:
85
+ return "Fourth image:"
86
+ else:
87
+ raise ValueError(f"Invalid image index: {image_index}")
88
+
89
+ instances: List[Instance] = []
90
+ for row in tqdm(
91
+ load_dataset(self.HUGGINGFACE_DATASET_NAME, self._category, split="val", cache_dir=output_path)
92
+ ):
93
+ # Save the image(s) to disk
94
+ has_multiple_images: bool = row["image_2"] is not None
95
+ content: List[MediaObject] = []
96
+
97
+ if has_multiple_images:
98
+ # An example can have up to 4 images
99
+ for i in range(1, 5):
100
+ image_i = row[f"image_{i}"]
101
+ if image_i is None:
102
+ break
103
+
104
+ # Before each image, include a header text that indicates which number image it is.
105
+ # Some prompts refer to specific image numbers within the question, e.g.,
106
+ # "Given three similar but different images, take the first image as reference.
107
+ # Can you tell which one of the latter two images is most similar to the first one?
108
+ # Select from the following choices. (A) the second image (B) the third image"
109
+ image_path: str = save_image(image_i)
110
+ content.extend(
111
+ [
112
+ MediaObject(text=get_image_header(i), content_type="text/plain"),
113
+ MediaObject(location=image_path, content_type="image/jpeg"),
114
+ ]
115
+ )
116
+ else:
117
+ image1 = row["image_1"]
118
+ image1_path: str = save_image(image1)
119
+ content.append(MediaObject(location=image1_path, content_type="image/jpeg"))
120
+
121
+ # Add the prompt that has both the question and the answer choices
122
+ prompt: str = row["prompt"]
123
+ # Replace (A), (B), (C), (D) with \nA. \nB. \nC. \nD. since we are just expecting the letter answer
124
+ prompt = prompt.replace("(A)", "\nA.").replace("(B)", "\nB.").replace("(C)", "\nC.").replace("(D)", "\nD.")
125
+ content.append(MediaObject(text=prompt, content_type="text/plain"))
126
+
127
+ # The answer has the correct letter choices surrounded by parentheses
128
+ paren_letter_answer: str = row["answer"]
129
+ assert (
130
+ paren_letter_answer[0] == "(" and paren_letter_answer[-1] == ")"
131
+ ), f"Unexpected answer format: {paren_letter_answer}"
132
+ letter_answer: str = paren_letter_answer[1]
133
+ references: List[Reference] = [
134
+ Reference(output=Output(text=letter_answer), tags=[CORRECT_TAG]),
135
+ ]
136
+ instances.append(
137
+ Instance(Input(multimedia_content=MultimediaObject(content)), references=references, split=VALID_SPLIT)
138
+ )
139
+
140
+ return instances
@@ -0,0 +1,95 @@
1
+ from typing import List
2
+ import os
3
+
4
+ from datasets import load_dataset
5
+ from tqdm import tqdm
6
+
7
+ from helm.benchmark.scenarios.scenario import (
8
+ CORRECT_TAG,
9
+ VALID_SPLIT,
10
+ Instance,
11
+ Input,
12
+ Output,
13
+ Reference,
14
+ Scenario,
15
+ )
16
+ from helm.common.media_object import MediaObject, MultimediaObject
17
+ from helm.common.images_utils import generate_hash
18
+
19
+
20
+ class MMStarScenario(Scenario):
21
+ """
22
+ MM-STAR is an elite vision-indispensable multi-modal benchmark comprising 1,500 challenge samples meticulously
23
+ selected by humans. MMStar is designed to benchmark 6 core capabilities and 18 detailed axes, aiming to evaluate
24
+ the multi-modal capacities of LVLMs with a carefully balanced and purified selection of samples. The samples
25
+ are first roughly selected from current benchmarks with an automated pipeline, strict human review is then
26
+ involved to ensure each selected sample exhibits visual dependency, minimal data leakage, and requires advanced
27
+ multi-modal capabilities for the solution.
28
+
29
+ Website: https://mmstar-benchmark.github.io/
30
+
31
+ @article{chen2024we,
32
+ title={Are We on the Right Way for Evaluating Large Vision-Language Models?},
33
+ author={Chen, Lin and Li, Jinsong and Dong, Xiaoyi and Zhang, Pan and Zang, Yuhang and Chen, Zehui and Duan,
34
+ Haodong and Wang, Jiaqi and Qiao, Yu and Lin, Dahua and others},
35
+ journal={arXiv preprint arXiv:2403.20330},
36
+ year={2024}
37
+ }
38
+ """
39
+
40
+ HUGGINGFACE_DATASET_NAME: str = "Lin-Chen/MMStar"
41
+
42
+ VALID_CATEGORIES: List[str] = [
43
+ "coarse perception",
44
+ "fine-grained perception",
45
+ "instance reasoning",
46
+ "logical reasoning",
47
+ "math",
48
+ "science technology",
49
+ ]
50
+
51
+ name = "mm_star"
52
+ description = (
53
+ "MM-STAR is an elite vision-indispensable multi-modal benchmark comprising 1,500 challenge samples "
54
+ "meticulously selected by humans."
55
+ "([Chen, 2024](https://arxiv.org/abs/2403.20330))."
56
+ )
57
+ tags = ["vision-language", "knowledge", "reasoning"]
58
+
59
+ def __init__(self, category: str):
60
+ super().__init__()
61
+
62
+ category = category.replace("_", " ")
63
+ if category not in self.VALID_CATEGORIES:
64
+ raise ValueError(f"Invalid category: {category}. Valid categories are: {self.VALID_CATEGORIES}")
65
+ if category == "science technology":
66
+ category = "science & technology"
67
+
68
+ self._category: str = category
69
+
70
+ def get_instances(self, output_path: str) -> List[Instance]:
71
+ instances: List[Instance] = []
72
+
73
+ for row in tqdm(load_dataset(self.HUGGINGFACE_DATASET_NAME, split="val", cache_dir=output_path)):
74
+ # Filter by category
75
+ category: str = row["category"]
76
+ if category != self._category:
77
+ continue
78
+
79
+ # Save the image to disk
80
+ image = row["image"]
81
+ image_file_name: str = generate_hash(image) + ".jpg"
82
+ local_image_path: str = os.path.join(output_path, image_file_name)
83
+ if not os.path.exists(local_image_path):
84
+ image.save(local_image_path)
85
+
86
+ content: List[MediaObject] = [
87
+ MediaObject(location=local_image_path, content_type="image/jpeg"),
88
+ MediaObject(text=row["question"], content_type="text/plain"),
89
+ ]
90
+ references: List[Reference] = [Reference(output=Output(text=row["answer"]), tags=[CORRECT_TAG])]
91
+ instances.append(
92
+ Instance(Input(multimedia_content=MultimediaObject(content)), references=references, split=VALID_SPLIT)
93
+ )
94
+
95
+ return instances