crfm-helm 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +144 -36
  2. crfm_helm-0.5.0.dist-info/RECORD +642 -0
  3. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +37 -2
  5. helm/benchmark/adaptation/adapters/adapter.py +4 -42
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/generation_adapter.py +2 -0
  9. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +21 -4
  10. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +12 -5
  11. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +1 -0
  12. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  13. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  14. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +5 -1
  15. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +1 -0
  16. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -0
  17. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  18. helm/benchmark/adaptation/adapters/test_generation_adapter.py +59 -14
  19. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +40 -5
  20. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +78 -10
  21. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  22. helm/benchmark/adaptation/prompt.py +7 -1
  23. helm/benchmark/adaptation/request_state.py +6 -1
  24. helm/benchmark/adaptation/scenario_state.py +6 -2
  25. helm/benchmark/annotation/annotator.py +43 -0
  26. helm/benchmark/annotation/annotator_factory.py +61 -0
  27. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  28. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  29. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  30. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  31. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  32. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  33. helm/benchmark/annotation_executor.py +124 -0
  34. helm/benchmark/augmentations/cleva_perturbation.py +7 -14
  35. helm/benchmark/augmentations/contraction_expansion_perturbation.py +3 -3
  36. helm/benchmark/augmentations/contrast_sets_perturbation.py +0 -3
  37. helm/benchmark/augmentations/data_augmenter.py +0 -2
  38. helm/benchmark/augmentations/dialect_perturbation.py +2 -2
  39. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  40. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  41. helm/benchmark/augmentations/gender_perturbation.py +3 -3
  42. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  43. helm/benchmark/augmentations/mild_mix_perturbation.py +2 -2
  44. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  45. helm/benchmark/augmentations/person_name_perturbation.py +0 -7
  46. helm/benchmark/augmentations/perturbation.py +20 -7
  47. helm/benchmark/augmentations/perturbation_description.py +1 -1
  48. helm/benchmark/augmentations/space_perturbation.py +2 -2
  49. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  50. helm/benchmark/augmentations/synonym_perturbation.py +2 -2
  51. helm/benchmark/augmentations/test_perturbation.py +11 -7
  52. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  53. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  54. helm/benchmark/config_registry.py +38 -0
  55. helm/benchmark/executor.py +46 -16
  56. helm/benchmark/huggingface_registration.py +37 -7
  57. helm/benchmark/metrics/basic_metrics.py +172 -641
  58. helm/benchmark/metrics/bbq_metrics.py +3 -4
  59. helm/benchmark/metrics/bias_metrics.py +6 -6
  60. helm/benchmark/metrics/classification_metrics.py +11 -8
  61. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  62. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  63. helm/benchmark/metrics/code_metrics.py +4 -3
  64. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  65. helm/benchmark/metrics/common_metric_specs.py +167 -0
  66. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  67. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  68. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  69. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  70. helm/benchmark/metrics/disinformation_metrics.py +6 -112
  71. helm/benchmark/metrics/dry_run_metrics.py +5 -3
  72. helm/benchmark/metrics/efficiency_metrics.py +206 -0
  73. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  74. helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
  75. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  76. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  77. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  78. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  79. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  80. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  81. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  82. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  83. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  84. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  85. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  86. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  87. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  88. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  89. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  90. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  91. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  92. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  93. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  94. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  95. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  96. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  97. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  98. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  99. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  100. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  101. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  102. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  103. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  104. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  105. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  106. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  107. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  108. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  109. helm/benchmark/metrics/machine_translation_metrics.py +5 -5
  110. helm/benchmark/metrics/metric.py +93 -172
  111. helm/benchmark/metrics/metric_name.py +0 -1
  112. helm/benchmark/metrics/metric_service.py +16 -0
  113. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  114. helm/benchmark/metrics/ranking_metrics.py +6 -7
  115. helm/benchmark/metrics/reference_metric.py +148 -0
  116. helm/benchmark/metrics/summac/model_summac.py +0 -2
  117. helm/benchmark/metrics/summarization_metrics.py +8 -8
  118. helm/benchmark/metrics/test_classification_metrics.py +9 -6
  119. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  120. helm/benchmark/metrics/test_evaluate_reference_metrics.py +30 -0
  121. helm/benchmark/metrics/test_metric.py +2 -2
  122. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +1 -1
  123. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +13 -3
  124. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  125. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -0
  126. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +9 -2
  127. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  128. helm/benchmark/metrics/toxicity_utils.py +23 -0
  129. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  130. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  131. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  132. helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
  133. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  134. helm/benchmark/model_deployment_registry.py +164 -41
  135. helm/benchmark/model_metadata_registry.py +181 -35
  136. helm/benchmark/multi_gpu_runner.py +133 -0
  137. helm/benchmark/presentation/contamination.py +3 -3
  138. helm/benchmark/presentation/create_plots.py +8 -7
  139. helm/benchmark/presentation/run_display.py +50 -17
  140. helm/benchmark/presentation/schema.py +28 -46
  141. helm/benchmark/presentation/summarize.py +213 -96
  142. helm/benchmark/presentation/table.py +8 -8
  143. helm/benchmark/presentation/test_contamination.py +2 -2
  144. helm/benchmark/presentation/test_run_entry.py +14 -9
  145. helm/benchmark/presentation/test_summarize.py +5 -0
  146. helm/benchmark/run.py +66 -54
  147. helm/benchmark/run_expander.py +342 -31
  148. helm/benchmark/run_spec.py +93 -0
  149. helm/benchmark/run_spec_factory.py +162 -0
  150. helm/benchmark/run_specs/__init__.py +0 -0
  151. helm/benchmark/{run_specs.py → run_specs/classic_run_specs.py} +217 -1330
  152. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  153. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  154. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  155. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  156. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  157. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  158. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  159. helm/benchmark/run_specs/vlm_run_specs.py +501 -0
  160. helm/benchmark/runner.py +116 -69
  161. helm/benchmark/runner_config_registry.py +21 -0
  162. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  163. helm/benchmark/scenarios/bold_scenario.py +2 -2
  164. helm/benchmark/scenarios/cleva_scenario.py +43 -46
  165. helm/benchmark/scenarios/code_scenario.py +3 -2
  166. helm/benchmark/scenarios/commonsense_scenario.py +171 -191
  167. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  168. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  169. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  170. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  171. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  172. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  173. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  174. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  175. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  176. helm/benchmark/scenarios/entity_matching_scenario.py +1 -1
  177. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  178. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  179. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  180. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  181. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  182. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  183. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  184. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  185. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  186. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  187. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  188. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  189. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  190. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  191. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  192. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  193. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  194. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  195. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  196. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  197. helm/benchmark/scenarios/legalbench_scenario.py +123 -0
  198. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  199. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  200. helm/benchmark/scenarios/lsat_qa_scenario.py +4 -2
  201. helm/benchmark/scenarios/math_scenario.py +19 -2
  202. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  203. helm/benchmark/scenarios/numeracy_scenario.py +3 -3
  204. helm/benchmark/scenarios/opinions_qa_scenario.py +6 -10
  205. helm/benchmark/scenarios/raft_scenario.py +2 -6
  206. helm/benchmark/scenarios/scenario.py +14 -2
  207. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  208. helm/benchmark/scenarios/test_math_scenario.py +22 -0
  209. helm/benchmark/scenarios/test_scenario.py +6 -3
  210. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  211. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  212. helm/benchmark/scenarios/the_pile_scenario.py +6 -7
  213. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  214. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  215. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  216. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  217. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
  218. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  219. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  220. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  221. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  222. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  223. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  224. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  225. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  226. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  227. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  228. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  229. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  230. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  231. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  232. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  233. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  234. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  235. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  236. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  237. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +107 -0
  238. helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
  239. helm/benchmark/scenarios/wmt_14_scenario.py +18 -18
  240. helm/benchmark/server.py +59 -2
  241. helm/benchmark/slurm_jobs.py +12 -0
  242. helm/benchmark/slurm_runner.py +79 -51
  243. helm/benchmark/static/benchmarking.js +3 -4
  244. helm/benchmark/static/contamination.yaml +1 -1
  245. helm/benchmark/static/images/organizations/together.png +0 -0
  246. helm/benchmark/static/json-urls.js +4 -0
  247. helm/benchmark/static/{schema.yaml → schema_classic.yaml} +346 -930
  248. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  249. helm/benchmark/static/schema_lite.yaml +824 -0
  250. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  251. helm/benchmark/static/schema_unitxt.yaml +428 -0
  252. helm/benchmark/static/schema_vlm.yaml +576 -0
  253. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  254. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  255. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  256. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  257. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  258. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  259. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  260. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  261. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  262. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  263. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  264. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  265. helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
  266. helm/benchmark/static_build/assets/index-d839df55.js +9 -0
  267. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  268. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  269. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  270. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  271. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  272. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  273. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  274. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  275. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  276. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  277. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  278. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  279. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  280. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  281. helm/benchmark/static_build/config.js +4 -0
  282. helm/benchmark/static_build/index.html +20 -0
  283. helm/benchmark/test_data_preprocessor.py +3 -3
  284. helm/benchmark/test_model_deployment_definition.py +90 -0
  285. helm/benchmark/test_run_expander.py +1 -1
  286. helm/benchmark/tokenizer_config_registry.py +10 -14
  287. helm/benchmark/window_services/ai21_window_service.py +22 -33
  288. helm/benchmark/window_services/cohere_window_service.py +1 -63
  289. helm/benchmark/window_services/default_window_service.py +2 -35
  290. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  291. helm/benchmark/window_services/ice_window_service.py +0 -34
  292. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  293. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  294. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  295. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  296. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  297. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  298. helm/benchmark/window_services/local_window_service.py +21 -4
  299. helm/benchmark/window_services/no_decoding_window_service.py +32 -0
  300. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  301. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  302. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  303. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  304. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  305. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  306. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  307. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  308. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  309. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  310. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  311. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  312. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  313. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  314. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  315. helm/benchmark/window_services/test_utils.py +3 -2
  316. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  317. helm/benchmark/window_services/window_service.py +42 -0
  318. helm/benchmark/window_services/window_service_factory.py +24 -269
  319. helm/benchmark/window_services/yalm_window_service.py +0 -27
  320. helm/clients/__init__.py +0 -0
  321. helm/{proxy/clients → clients}/ai21_client.py +5 -12
  322. helm/clients/aleph_alpha_client.py +112 -0
  323. helm/{proxy/clients → clients}/anthropic_client.py +213 -24
  324. helm/clients/auto_client.py +215 -0
  325. helm/clients/bedrock_client.py +128 -0
  326. helm/clients/bedrock_utils.py +72 -0
  327. helm/{proxy/clients → clients}/client.py +67 -55
  328. helm/clients/clip_score_client.py +49 -0
  329. helm/clients/clip_scorers/__init__.py +0 -0
  330. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  331. helm/clients/clip_scorers/clip_scorer.py +50 -0
  332. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  333. helm/{proxy/clients → clients}/cohere_client.py +6 -17
  334. helm/clients/gcs_client.py +82 -0
  335. helm/{proxy/clients → clients}/google_client.py +7 -8
  336. helm/clients/google_translate_client.py +35 -0
  337. helm/{proxy/clients → clients}/http_model_client.py +6 -10
  338. helm/{proxy/clients → clients}/huggingface_client.py +134 -92
  339. helm/clients/image_generation/__init__.py +0 -0
  340. helm/clients/image_generation/adobe_vision_client.py +78 -0
  341. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  342. helm/clients/image_generation/cogview2/__init__.py +0 -0
  343. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  344. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  345. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  346. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  347. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  348. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  349. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  350. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  351. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  352. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  353. helm/clients/image_generation/cogview2_client.py +191 -0
  354. helm/clients/image_generation/dalle2_client.py +192 -0
  355. helm/clients/image_generation/dalle3_client.py +108 -0
  356. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  357. helm/clients/image_generation/dalle_mini/data.py +442 -0
  358. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  359. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  360. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  361. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  362. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  363. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  364. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  365. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  366. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  367. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  368. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  369. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  370. helm/clients/image_generation/dalle_mini_client.py +190 -0
  371. helm/clients/image_generation/deep_floyd_client.py +78 -0
  372. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  373. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  374. helm/clients/image_generation/lexica_client.py +86 -0
  375. helm/clients/image_generation/mindalle/__init__.py +0 -0
  376. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  377. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  378. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  379. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  380. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  381. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  382. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  383. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  384. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  385. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  386. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  387. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  388. helm/clients/image_generation/mindalle_client.py +115 -0
  389. helm/clients/image_generation/nudity_check_client.py +64 -0
  390. helm/clients/image_generation/together_image_generation_client.py +111 -0
  391. helm/{proxy/clients → clients}/lit_gpt_client.py +7 -5
  392. helm/{proxy/clients → clients}/megatron_client.py +13 -7
  393. helm/clients/mistral_client.py +134 -0
  394. helm/clients/moderation_api_client.py +109 -0
  395. helm/clients/open_lm_client.py +43 -0
  396. helm/clients/openai_client.py +302 -0
  397. helm/{proxy/clients → clients}/palmyra_client.py +15 -12
  398. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  399. helm/clients/simple_client.py +64 -0
  400. helm/{proxy/clients → clients}/test_auto_client.py +15 -15
  401. helm/clients/test_client.py +100 -0
  402. helm/clients/test_huggingface_client.py +70 -0
  403. helm/clients/test_simple_client.py +19 -0
  404. helm/{proxy/clients → clients}/test_together_client.py +23 -12
  405. helm/{proxy/clients → clients}/together_client.py +18 -71
  406. helm/clients/vertexai_client.py +391 -0
  407. helm/clients/vision_language/__init__.py +0 -0
  408. helm/clients/vision_language/huggingface_vlm_client.py +104 -0
  409. helm/{proxy/clients → clients}/vision_language/idefics_client.py +59 -52
  410. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  411. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  412. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  413. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  414. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  415. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  416. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  417. helm/clients/vision_language/open_flamingo_client.py +155 -0
  418. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  419. helm/clients/vllm_client.py +46 -0
  420. helm/common/cache.py +24 -179
  421. helm/common/cache_backend_config.py +47 -0
  422. helm/common/clip_score_request.py +41 -0
  423. helm/common/concurrency.py +32 -0
  424. helm/common/credentials_utils.py +28 -0
  425. helm/common/file_caches/__init__.py +0 -0
  426. helm/common/file_caches/file_cache.py +16 -0
  427. helm/common/file_caches/local_file_cache.py +61 -0
  428. helm/common/file_caches/test_local_file_cache.py +25 -0
  429. helm/common/file_upload_request.py +27 -0
  430. helm/common/general.py +29 -10
  431. helm/common/image_generation_parameters.py +25 -0
  432. helm/common/images_utils.py +24 -1
  433. helm/common/key_value_store.py +113 -0
  434. helm/common/media_object.py +13 -0
  435. helm/common/moderations_api_request.py +71 -0
  436. helm/common/mongo_key_value_store.py +88 -0
  437. helm/common/multimodal_request_utils.py +31 -0
  438. helm/common/nudity_check_request.py +29 -0
  439. helm/common/object_spec.py +2 -2
  440. helm/common/request.py +36 -27
  441. helm/common/test_general.py +6 -0
  442. helm/common/tokenization_request.py +6 -3
  443. helm/config/__init__.py +0 -0
  444. helm/config/model_deployments.yaml +1942 -0
  445. helm/config/model_metadata.yaml +2201 -0
  446. helm/config/tokenizer_configs.yaml +362 -0
  447. helm/proxy/accounts.py +31 -4
  448. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  449. helm/proxy/critique/model_critique_client.py +13 -5
  450. helm/proxy/example_queries.py +29 -17
  451. helm/proxy/retry.py +8 -2
  452. helm/proxy/server.py +77 -5
  453. helm/proxy/services/remote_service.py +31 -0
  454. helm/proxy/services/server_service.py +103 -20
  455. helm/proxy/services/service.py +34 -2
  456. helm/proxy/services/test_remote_service.py +7 -6
  457. helm/proxy/services/test_service.py +27 -18
  458. helm/proxy/test_accounts.py +32 -0
  459. helm/proxy/token_counters/auto_token_counter.py +37 -37
  460. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  461. helm/proxy/token_counters/token_counter.py +3 -5
  462. helm/py.typed +0 -0
  463. helm/tokenizers/__init__.py +0 -0
  464. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  465. helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +3 -1
  466. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +17 -11
  467. helm/tokenizers/auto_tokenizer.py +93 -0
  468. helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +8 -2
  469. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  470. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  471. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +56 -60
  472. helm/tokenizers/simple_tokenizer.py +33 -0
  473. helm/tokenizers/test_anthropic_tokenizer.py +82 -0
  474. helm/tokenizers/test_huggingface_tokenizer.py +136 -0
  475. helm/tokenizers/test_simple_tokenizer.py +33 -0
  476. helm/tokenizers/vertexai_tokenizer.py +97 -0
  477. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  478. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  479. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  480. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  481. crfm_helm-0.3.0.dist-info/RECORD +0 -396
  482. helm/benchmark/vlm_run_specs.py +0 -71
  483. helm/benchmark/window_services/anthropic_window_service.py +0 -68
  484. helm/benchmark/window_services/bloom_window_service.py +0 -35
  485. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  486. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  487. helm/benchmark/window_services/gptj_window_service.py +0 -38
  488. helm/benchmark/window_services/gptneox_window_service.py +0 -41
  489. helm/benchmark/window_services/http_model_window_service.py +0 -28
  490. helm/benchmark/window_services/huggingface_window_service.py +0 -59
  491. helm/benchmark/window_services/lit_gpt_window_service.py +0 -27
  492. helm/benchmark/window_services/llama_window_service.py +0 -28
  493. helm/benchmark/window_services/luminous_window_service.py +0 -67
  494. helm/benchmark/window_services/megatron_window_service.py +0 -10
  495. helm/benchmark/window_services/mt_nlg_window_service.py +0 -27
  496. helm/benchmark/window_services/openai_window_service.py +0 -13
  497. helm/benchmark/window_services/opt_window_service.py +0 -35
  498. helm/benchmark/window_services/palmyra_window_service.py +0 -45
  499. helm/benchmark/window_services/remote_window_service.py +0 -48
  500. helm/benchmark/window_services/santacoder_window_service.py +0 -27
  501. helm/benchmark/window_services/starcoder_window_service.py +0 -27
  502. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  503. helm/benchmark/window_services/t511b_window_service.py +0 -30
  504. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  505. helm/benchmark/window_services/ul2_window_service.py +0 -30
  506. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  507. helm/benchmark/window_services/wider_openai_window_service.py +0 -52
  508. helm/proxy/clients/aleph_alpha_client.py +0 -99
  509. helm/proxy/clients/auto_client.py +0 -461
  510. helm/proxy/clients/goose_ai_client.py +0 -100
  511. helm/proxy/clients/microsoft_client.py +0 -182
  512. helm/proxy/clients/openai_client.py +0 -206
  513. helm/proxy/clients/remote_model_registry.py +0 -28
  514. helm/proxy/clients/simple_client.py +0 -61
  515. helm/proxy/clients/test_anthropic_client.py +0 -63
  516. helm/proxy/clients/test_client.py +0 -31
  517. helm/proxy/clients/test_huggingface_client.py +0 -87
  518. helm/proxy/models.py +0 -963
  519. helm/proxy/test_models.py +0 -27
  520. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  521. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  522. helm/proxy/token_counters/free_token_counter.py +0 -12
  523. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  524. helm/proxy/token_counters/openai_token_counter.py +0 -22
  525. helm/proxy/token_counters/test_ai21_token_counter.py +0 -86
  526. helm/proxy/token_counters/test_openai_token_counter.py +0 -79
  527. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  528. helm/proxy/tokenizers/test_huggingface_tokenizer.py +0 -56
  529. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
  530. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
  531. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
  532. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  533. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  534. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  535. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  536. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  537. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  538. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  539. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  540. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  541. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  542. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  543. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  544. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  545. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  546. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
@@ -0,0 +1,50 @@
1
import pytest

from tempfile import TemporaryDirectory

from helm.benchmark.scenarios.simple_scenarios import (
    SimpleMCQAScenario,
    SimpleShortAnswerQAScenario,
    SimpleClassificationScenario,
)
from helm.benchmark.scenarios.scenario import CORRECT_TAG, Input, Output, Reference


@pytest.mark.scenarios
def test_simple_mcqa_scenario():
    """The MCQA scenario yields 90 instances; spot-check the first one."""
    with TemporaryDirectory() as tmpdir:
        instances = SimpleMCQAScenario().get_instances(tmpdir)
    assert len(instances) == 90
    first = instances[0]
    assert first.input == Input(text="Is 10 even or odd?")
    expected_references = [
        Reference(output=Output(text="Even"), tags=[CORRECT_TAG]),
        Reference(output=Output(text="Odd"), tags=[]),
    ]
    assert first.references == expected_references
    assert first.split == "train"


@pytest.mark.scenarios
def test_simple_short_answer_qa_scenario():
    """The short-answer QA scenario only carries the correct reference."""
    with TemporaryDirectory() as tmpdir:
        instances = SimpleShortAnswerQAScenario().get_instances(tmpdir)
    assert len(instances) == 90
    first = instances[0]
    assert first.input == Input(text="Is 10 even or odd?")
    expected_references = [
        Reference(output=Output(text="Even"), tags=[CORRECT_TAG]),
    ]
    assert first.references == expected_references
    assert first.split == "train"


@pytest.mark.scenarios
def test_simple_classification_scenario():
    """The classification scenario uses the bare number as input."""
    with TemporaryDirectory() as tmpdir:
        instances = SimpleClassificationScenario().get_instances(tmpdir)
    assert len(instances) == 90
    first = instances[0]
    assert first.input == Input(text="10")
    expected_references = [
        Reference(output=Output(text="Even"), tags=[CORRECT_TAG]),
        Reference(output=Output(text="Odd"), tags=[]),
    ]
    assert first.references == expected_references
    assert first.split == "train"
@@ -0,0 +1,135 @@
1
import os
from typing import Dict, List
import json

from helm.common.general import ensure_file_downloaded
from helm.common.hierarchical_logger import hlog
from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, TEST_SPLIT, CORRECT_TAG, Input, Output


class ThaiExamScenario(Scenario):
    """
    ThaiExam, a benchmark comprising Thai multiple-choice examinations as follows:

    ∙ ONET: The Ordinary National Educational Test (ONET) is an examination for students in Thailand.
    We select the grade-12 ONET exam, which comprises 5 subjects and each question has 5 choices.
    These subjects are Thai, English, Mathematics, Social Studies, and Science.
    Amounting to a total of 170 questions and options.

    ∙ IC: The Investment Consultant (IC) examination, a licensing test for investment professionals in Thailand.
    Developed by the Stock Exchange of Thailand (SET), features 4 choices per question.
    We extracted questions for levels 1, 2, and 3 resulting in a total of 95 questions and options.

    ∙ TGAT: The Thai General Aptitude Test (TGAT), a national high school examination in Thailand.
    Focuses on critical and logical thinking skills.
    We collected a total of 90 questions and answers. The TGAT consists of four choices per question.

    ∙ TPAT-1: The Thai Professional Aptitude Test 1 (TPAT-1) is a national high school examination in Thailand.
    The Exam assesses students’ professional skills requirement in medical schools.
    This subset contains reasoning and medical ethics. We collected a total of 116 questions and answers.
    The TPAT-1 consists of 5 choices per question.

    ∙ A-Level: An academic knowledge assessment examination (Applied Knowledge Level)
    that covers general foundational subjects taught in schools.
    The content assessed in this examination aligns with the curriculum guidelines
    and emphasizes the practical application of knowledge in daily life.
    We collected a total of 175 questions and answers.

    We created and used these exams to evaluate the performance of the Typhoon models(https://arxiv.org/abs/2312.13951).

    Prompt models using the following format

    <input> # train
    A. <reference>
    B. <reference>
    C. <reference>
    D. <reference>
    E. <reference>
    Answer: <A/B/C/D/E>

    x N (N-shot)

    <input> # test
    A. <reference1>
    B. <reference2>
    C. <reference3>
    D. <reference4>
    E. <reference5>
    Answer:

    For example:

    ในระบบย่อยอาหารของมนุษย์ การดูดซึมสารอาหารส่วนใหญ่เกิดขึ้นที่อวัยวะใด?
    A. ลำไส้เล็ก
    B. ตับอ่อน
    C. ลำไส้ใหญ่
    D. กระเพาะอาหาร
    E. หัวใจ
    Answer: A

    ข้อใดอธิบายเกี่ยวกับแรงไฟฟ้าได้ถูกต้อง?
    A. เกิดได้โดยที่วัตถุไม่ต้องสัมผัสกัน
    B. เป็นได้เฉพาะแรงผลักเท่านั้น
    C. เป็นได้เฉพาะแรงดูดเท่านั้น
    D. เป็นแรงต้านระหว่างวัตถุเท่านั้น
    E. ถูกทุกข้อ
    Answer:

    Target: A
    """

    name = "thai_exam"
    description = "ThaiExam benchmark comprising Thai multiple-choice examinations."
    tags = ["knowledge", "multiple_choice"]

    def __init__(self, exam: str):
        super().__init__()
        # Name of the exam subset; used to locate "{exam}/{exam}_{split}.jsonl"
        # inside the downloaded archive.
        self.exam = exam

    def download_thai_exam(self, path: str):
        """Download and unpack the ThaiExam archive into `path` (cached if already present)."""
        ensure_file_downloaded(
            "https://storage.googleapis.com/thai_dataset/thai_exam.tar.gz",
            target_path=path,
            unpack=True,
        )

    def process_jsonl(self, jsonl_path: str, split: str) -> List[Instance]:
        """Parse one JSONL exam file into `Instance`s for the given HELM split.

        Each line is expected to contain a "question", an "answer" letter, and
        choice keys "a".."e" (some subjects omit choices or leave them empty).
        """
        instances: List[Instance] = []
        hlog(f"Reading {jsonl_path}")
        # Read as UTF-8 explicitly: the questions contain Thai text, and the
        # platform default encoding (e.g. cp1252 on Windows) would mangle or
        # fail to decode it.
        with open(jsonl_path, "r", encoding="utf-8") as f:
            for line in f:
                data = json.loads(line)
                # Handle missing keys: some subjects do not provide all 5 choices,
                # and unused choices may be present as empty strings.
                answers = [data[key] for key in ["a", "b", "c", "d", "e"] if key in data and data[key] != ""]
                answers_dict = dict(zip(["A", "B", "C", "D", "E"], answers))

                question, correct_answer = data["question"], answers_dict[data["answer"].upper()]

                def answer_to_reference(answer: str) -> Reference:
                    return Reference(Output(text=answer), tags=[CORRECT_TAG] if answer == correct_answer else [])

                instance = Instance(
                    input=Input(text=question),
                    references=list(map(answer_to_reference, answers)),
                    split=split,
                )
                instances.append(instance)
        return instances

    def get_instances(self, output_path: str) -> List[Instance]:
        """Download the dataset and return instances for the train and test splits."""
        data_path: str = os.path.join(output_path, "data")
        self.download_thai_exam(data_path)

        instances: List[Instance] = []
        splits: Dict[str, str] = {
            "train": TRAIN_SPLIT,
            "test": TEST_SPLIT,
        }
        for split in splits:
            jsonl_path: str = os.path.join(data_path, self.exam, f"{self.exam}_{split}.jsonl")
            # Not every exam has both splits; skip missing files instead of failing.
            if not os.path.exists(jsonl_path):
                hlog(f"{jsonl_path} doesn't exist, skipping")
                continue
            instances.extend(self.process_jsonl(jsonl_path, splits[split]))

        return instances
@@ -50,14 +50,13 @@ class ThePileScenario(Scenario):
50
50
  self.subset = subset
51
51
 
52
52
  @htrack(None)
53
- def load_and_cache_all_subsets(self, output_path):
54
- data_path = os.path.join(output_path, "data")
53
+ def load_and_cache_all_subsets(self, data_jsonl, output_path):
55
54
  subsets: Dict[str, List] = {subset: [] for subset in self.pile_subsets}
56
55
 
57
56
  # Load all data into memory
58
57
  with htrack_block("Loading"):
59
- hlog(f"Loading all data from {data_path}")
60
- with open(data_path) as f:
58
+ hlog(f"Loading all data from {data_jsonl}")
59
+ with open(data_jsonl) as f:
61
60
  data = [json.loads(line) for line in f]
62
61
 
63
62
  # Classify the documents by subset
@@ -76,10 +75,10 @@ class ThePileScenario(Scenario):
76
75
 
77
76
  def get_instances(self, output_path: str) -> List[Instance]:
78
77
  # Download the raw data
79
- data_path = os.path.join(output_path, "data")
78
+ data_jsonl = os.path.join(output_path, "data")
80
79
  ensure_file_downloaded(
81
80
  source_url="https://the-eye.eu/public/AI/pile/test.jsonl.zst",
82
- target_path=data_path,
81
+ target_path=data_jsonl,
83
82
  unpack=True,
84
83
  )
85
84
 
@@ -87,7 +86,7 @@ class ThePileScenario(Scenario):
87
86
 
88
87
  # If the target subset does not exist, load and cache all subsets to the directory
89
88
  if not os.path.exists(subset_path):
90
- self.load_and_cache_all_subsets(output_path)
89
+ self.load_and_cache_all_subsets(data_jsonl, output_path)
91
90
 
92
91
  # Read all the instances
93
92
  instances = []
@@ -0,0 +1,56 @@
1
from typing import List
from datasets import load_dataset

from helm.benchmark.scenarios.scenario import (
    Output,
    Reference,
    Scenario,
    Instance,
    Input,
    CORRECT_TAG,
    TRAIN_SPLIT,
    TEST_SPLIT,
    VALID_SPLIT,
)


class UnitxtScenario(Scenario):
    """Integration with Unitxt: https://unitxt.rtfd.io/"""

    name = "unitxt"
    description = "Unitxt Scenarios"
    tags = ["unitxt"]

    UNITXT_SPLIT_NAME_TO_HELM_SPLIT_NAME = {
        "train": TRAIN_SPLIT,
        "test": TEST_SPLIT,
        "validation": VALID_SPLIT,
    }

    def __init__(self, **kwargs):
        super().__init__()
        # The keyword arguments are forwarded to Unitxt verbatim as "key=value" pairs.
        self.kwargs = kwargs

    def get_instances(self, output_path: str) -> List[Instance]:
        """Load the Unitxt recipe described by the constructor kwargs and convert every row."""
        recipe_name = ",".join(f"{key}={value}" for key, value in self.kwargs.items())
        dataset = load_dataset("unitxt/data", recipe_name, trust_remote_code=True)

        instances: List[Instance] = []
        for unitxt_split, helm_split in UnitxtScenario.UNITXT_SPLIT_NAME_TO_HELM_SPLIT_NAME.items():
            for index, row in enumerate(dataset[unitxt_split]):
                # Unitxt rows carry a rendered prompt ("source") and gold answers ("references").
                references = [
                    Reference(
                        output=Output(text=reference_text),
                        tags=[CORRECT_TAG],
                    )
                    for reference_text in row["references"]
                ]
                instances.append(
                    Instance(
                        id=f"{unitxt_split}{index}",
                        input=Input(text=row["source"]),
                        references=references,
                        split=helm_split,
                    )
                )
        return instances
@@ -137,7 +137,9 @@ class VerifiabilityJudgementScenario(Scenario):
137
137
  for _, filesplit in split_to_filesplit.items():
138
138
  target_name = f"verifiability_judgments_{filesplit}.jsonl"
139
139
  target_path: str = os.path.join(data_path, target_name)
140
- url: str = f"https://github.com/nelson-liu/evaluating-verifiability-in-generative-search-engines/raw/40bf37e3a4eca7d82515df2c800ec9605458d637/verifiability_judgments/{target_name}.gz" # noqa: E501
140
+ url: str = (
141
+ f"https://github.com/nelson-liu/evaluating-verifiability-in-generative-search-engines/raw/40bf37e3a4eca7d82515df2c800ec9605458d637/verifiability_judgments/{target_name}.gz" # noqa: E501
142
+ )
141
143
  ensure_file_downloaded(source_url=url, target_path=target_path)
142
144
  assert os.path.exists(target_path)
143
145
 
@@ -26,7 +26,7 @@ class VicunaScenario(Scenario):
26
26
  return self.category == "all" or raw["category"] == self.category
27
27
 
28
28
  # Download the raw data
29
- source_url = "https://raw.githubusercontent.com/lm-sys/FastChat/main/fastchat/eval/table/question.jsonl"
29
+ source_url = "https://raw.githubusercontent.com/lm-sys/FastChat/v0.2.5/fastchat/eval/table/question.jsonl"
30
30
  data_path: str = os.path.join(output_path, "vicuna_questions.jsonl")
31
31
 
32
32
  ensure_file_downloaded(
@@ -0,0 +1,103 @@
1
import os.path
from typing import List

from datasets import load_dataset
from tqdm import tqdm

from helm.benchmark.scenarios.scenario import (
    CORRECT_TAG,
    TEST_SPLIT,
    Instance,
    Input,
    Output,
    Reference,
    Scenario,
)
from helm.common.media_object import MediaObject, MultimediaObject
from helm.common.general import ensure_directory_exists, ensure_file_downloaded


class BingoScenario(Scenario):
    """
    Holistic Analysis of Hallucination in GPT-4V(ision): Bias and Interference Challenges

    We introduce a new benchmark, namely, the Bias and Interference Challenges in Visual Language Models (Bingo).
    This benchmark is designed to evaluate and shed light on the two common types of hallucinations in visual
    language models: bias and interference. Here, bias refers to the model's tendency to hallucinate certain types
    of responses, possibly due to imbalance in its training data. Interference pertains to scenarios where the
    judgment of GPT-4V(ision) can be disrupted due to how the text prompt is phrased or how the input image is
    presented. The benchmark consists of open-ended question-answer pairs, and we employ open-ended generation
    metrics for evaluation. In the experiment, we identify a notable regional bias, whereby GPT-4V(ision) is
    better at interpreting Western images or images with English writing compared to images from other countries
    or containing text in other languages.


    @article{cui2023holistic,
    title={Holistic analysis of hallucination in gpt-4v (ision): Bias and interference challenges},
    author={Cui, Chenhang and Zhou, Yiyang and Yang, Xinyu and Wu, Shirley and Zhang, Linjun and
    Zou, James and Yao, Huaxiu},
    journal={arXiv preprint arXiv:2311.03287},
    year={2023}
    }

    Paper: https://arxiv.org/abs/2311.03287
    """

    BINGO_HUGGINGFACE_DATASET_NAME: str = "PahaII/Bingo"

    # Template for downloading individual images from the dataset repository.
    IMAGE_URL: str = "https://huggingface.co/datasets/PahaII/Bingo/resolve/main/images/{image_path}?download=true"

    SUBJECTS: List[str] = ["T2I", "I2I", "OCR", "Factual", "Region"]

    name = "bingo"
    description = (
        "Evaluate multimodal models on biased and inference-challenging scenarios with five subjects"
        " ([paper](https://arxiv.org/abs/2311.03287))."
    )
    tags = ["vision-language"]

    def __init__(self, subject: str):
        super().__init__()
        assert subject in self.SUBJECTS, f"Invalid subject: {subject}"
        self._subject: str = subject

    def get_instances(self, output_path: str) -> List[Instance]:
        """Download the questions and images for the configured subject and build test instances."""
        images_path: str = os.path.join(output_path, "images")
        ensure_directory_exists(images_path)

        # The Bingo benchmark only publishes a test split (no train/validation data).
        # (Previous comment referred to the Unicorn benchmark by mistake.)
        instances: List[Instance] = []
        question_data_files = {TEST_SPLIT: f"{self._subject}.json"}

        # Process the test set
        for row in tqdm(
            load_dataset(
                self.BINGO_HUGGINGFACE_DATASET_NAME,
                data_files=question_data_files,
                split=TEST_SPLIT,
                cache_dir=output_path,
            )
        ):
            # Download the image
            image_path: str = row["image_path"]
            local_image_path: str = os.path.join(output_path, image_path)
            ensure_file_downloaded(
                source_url=self.IMAGE_URL.format(image_path=image_path),
                target_path=local_image_path,
                unpack=False,
            )

            # Each instance pairs the image with its open-ended question.
            content: List[MediaObject] = [
                MediaObject(location=local_image_path, content_type="image/png"),
                MediaObject(text=row["question"], content_type="text/plain"),
            ]
            answer: str = row["answer"]
            instances.append(
                Instance(
                    Input(multimedia_content=MultimediaObject(content)),
                    references=[Reference(Output(text=answer), tags=[CORRECT_TAG])],
                    split=TEST_SPLIT,
                )
            )

        return instances
@@ -0,0 +1,92 @@
1
import os.path
from typing import List

from datasets import load_dataset
from tqdm import tqdm

from helm.benchmark.scenarios.scenario import (
    ALL_SPLITS,
    CORRECT_TAG,
    VALID_SPLIT,
    Instance,
    Input,
    Output,
    Reference,
    Scenario,
)
from helm.common.media_object import MediaObject, MultimediaObject
from helm.common.general import ensure_directory_exists, ensure_file_downloaded


class HatefulMemesScenario(Scenario):
    """
    The Hateful Memes Challenge and Dataset is a competition and open source dataset designed to
    measure progress in multimodal vision-and-language classification.

    In order for AI to become a more effective tool for detecting hate speech, it must be able to understand
    content the way people do: holistically. When viewing a meme, for example, we don’t think about the words
    and photo independently of each other; we understand the combined meaning. This is extremely challenging
    for machines, however, because it means they can’t analyze the text and the image separately. They must
    combine these different modalities and understand how the meaning changes when they are presented together.

    @article{kiela2020hateful,
    title={The Hateful Memes Challenge: Detecting Hate Speech in Multimodal Memes},
    author={Kiela, Douwe and Firooz, Hamed and Mohan, Aravind and Goswami, Vedanuj and Singh,
    Amanpreet and Ringshia, Pratik and Testuggine, Davide},
    journal={arXiv preprint arXiv:2005.04790},
    year={2020}
    }

    Paper: https://arxiv.org/abs/2005.04790
    Website: https://ai.meta.com/tools/hatefulmemes/
    """

    # Template for fetching individual meme images from the dataset repository.
    IMAGE_URL: str = (
        "https://huggingface.co/datasets/neuralcatcher/hateful_memes/resolve/main/{image_path}?download=true"
    )

    # This question is from the example on the website
    QUESTION: str = "Is this meme mean?"

    name = "hateful_memes"
    description = "Identify if a meme is hateful or not ([paper](https://arxiv.org/abs/2005.04790))."
    tags = ["vision-language"]

    def get_instances(self, output_path: str) -> List[Instance]:
        """Build yes/no meme-classification instances for every HELM split."""
        ensure_directory_exists(os.path.join(output_path, "img"))

        instances: List[Instance] = []
        for split in ALL_SPLITS:
            # The Hugging Face dataset spells the validation split "validation".
            dataset_split: str = "validation" if split == VALID_SPLIT else split
            dataset = load_dataset(
                "neuralcatcher/hateful_memes",
                split=dataset_split,
                cache_dir=output_path,
            )
            for example in tqdm(dataset):
                # Download the meme
                image_path: str = example["img"]
                local_image_path: str = os.path.join(output_path, image_path)
                ensure_file_downloaded(
                    source_url=self.IMAGE_URL.format(image_path=image_path),
                    target_path=local_image_path,
                    unpack=False,
                )
                # Some examples are missing images. Skip those for now
                if not os.path.exists(local_image_path) or os.path.getsize(local_image_path) == 0:
                    continue

                label_text: str = "Yes" if example["label"] == 1 else "No"
                multimedia = MultimediaObject(
                    [
                        MediaObject(location=local_image_path, content_type="image/jpeg"),
                        MediaObject(text=self.QUESTION, content_type="text/plain"),
                    ]
                )
                instances.append(
                    Instance(
                        Input(multimedia_content=multimedia),
                        references=[Reference(Output(text=label_text), tags=[CORRECT_TAG])],
                        split=split,
                    )
                )

        return instances
@@ -0,0 +1,113 @@
1
+ from collections import Counter
2
+ from dataclasses import dataclass
3
+ from typing import Dict, List, Set
4
+ import json
5
+ import os
6
+
7
+ from helm.benchmark.scenarios.scenario import (
8
+ CORRECT_TAG,
9
+ ALL_SPLITS,
10
+ Instance,
11
+ Input,
12
+ Output,
13
+ Reference,
14
+ Scenario,
15
+ )
16
+ from helm.common.media_object import MediaObject, MultimediaObject
17
+ from helm.common.general import ensure_file_downloaded
18
+
19
+
20
@dataclass(frozen=True)
class HEIMHumanEvalReference(Reference):
    """A `Reference` that also records how many human annotators chose this answer."""

    # The number of human annotators who gave this reference or answer.
    num_human_answered: int = 0
24
+
25
+
26
class HEIMHumanEvalScenario(Scenario):
    """
    In [Holistic Evaluation of Text-To-Image Models (HEIM)](https://arxiv.org/abs/2311.04287), we evaluated 26
    state-of-the-art text-to-image models using across 12 different aspects (e.g., toxicity mitigation, unbiasedness,
    originality, etc.). We used human annotators through AWS Mechanical Turk to evaluate the models for some of
    these aspects (see image below).
    This scenario contains the AI-generated images and human annotations for the following question types:

    1. Alignment
    2. Aesthetics
    3. Clear subject
    4. Originality
    5. Photorealism

    Citations:
    - HEIM: https://arxiv.org/abs/2311.04287
    - MS COCO: https://arxiv.org/abs/1405.0312
    """

    DATASET_DOWNLOAD_URL: str = (
        "https://worksheets.codalab.org/rest/bundles/0x502d646c366c4f1d8c4a2ccf163b958f/contents/blob/"
    )
    VALID_QUESTION_TYPES: Set[str] = {"alignment", "aesthetics", "clear_subject", "originality", "photorealism"}

    name = "heim_human_eval"
    description = (
        "Images generated by text-to-image models and human annotations for HEIM "
        "([paper](https://arxiv.org/abs/2311.04287))."
    )
    tags = ["vision-language", "visual question answering", "image evaluation"]

    def __init__(self, question_type: str):
        super().__init__()
        assert question_type in self.VALID_QUESTION_TYPES, f"Invalid question type: {question_type}"
        self._question_type: str = question_type

    def get_instances(self, output_path: str) -> List[Instance]:
        """Download the HEIM human-annotation bundle and build multiple-choice instances.

        The reference(s) marked correct are the most common human answer(s) (the mode);
        every reference carries its raw annotator count via `HEIMHumanEvalReference`.
        """
        # Download the dataset
        output_path = os.path.join(output_path, "dataset")
        ensure_file_downloaded(
            source_url=self.DATASET_DOWNLOAD_URL, target_path=output_path, unpack=True, unpack_type="untar"
        )

        # Load the multiple-choice questions. Read as UTF-8 explicitly so parsing
        # does not depend on the platform's default encoding.
        with open(os.path.join(output_path, "questions.json"), encoding="utf-8") as questions_file:
            question_info: Dict = json.load(questions_file)[self._question_type]

        instances: List[Instance] = []
        for split in ALL_SPLITS:
            annotations_split_path: str = os.path.join(output_path, f"{self._question_type}_{split}.jsonl")
            with open(annotations_split_path, encoding="utf-8") as f:
                # Read each line/example as a JSON object. Iterate the file lazily
                # instead of materializing every line with readlines().
                for line in f:
                    image_annotation: Dict = json.loads(line)
                    image_path: str = os.path.join(output_path, image_annotation["image_path"])
                    assert os.path.exists(image_path), f"Image {image_path} does not exist"

                    # Get the most common human answer(s) for the question
                    human_answers: List[str] = [str(answer) for answer in image_annotation["human_annotations"]]
                    human_answers_to_counts = Counter(human_answers)
                    max_count: int = max(human_answers_to_counts.values())
                    modes: List[str] = [value for value, count in human_answers_to_counts.items() if count == max_count]

                    content: List[MediaObject] = [MediaObject(location=image_path, content_type="image/png")]
                    if "prompt" in image_annotation:
                        # Include the prompt in the content if it exists
                        prompt: str = image_annotation["prompt"]
                        content.append(MediaObject(text=f"Description: {prompt}", content_type="text/plain"))
                    content.append(MediaObject(text=question_info["question"], content_type="text/plain"))

                    references: List[Reference] = [
                        HEIMHumanEvalReference(
                            Output(text=answer),
                            # The mode is the most common human answer and the reference we mark as correct
                            tags=[CORRECT_TAG] if value in modes else [],
                            num_human_answered=human_answers_to_counts[value],
                        )
                        for value, answer in question_info["choices"].items()
                    ]
                    instances.append(
                        Instance(
                            Input(multimedia_content=MultimediaObject(content)),
                            references=references,
                            split=split,
                        )
                    )

        return instances
@@ -0,0 +1,55 @@
1
from typing import List
import os

from helm.benchmark.scenarios.scenario import (
    CORRECT_TAG,
    TEST_SPLIT,
    Instance,
    Input,
    Output,
    Reference,
    Scenario,
)
from helm.common.media_object import MediaObject, MultimediaObject


class Chart2CSVScenario(Scenario):
    """
    Chart to CSV: given a chart image, produce the underlying data as CSV.
    Expects a local dataset with a "charts" directory of PNGs and a matching
    "groundtruth" directory of pipe-separated (.psv) files.
    """

    name = "chart2csv"
    description = "Convert a chart to CSV."
    tags = ["vision-language", "image2structure"]

    def get_instances(self, output_path: str) -> List[Instance]:
        assert os.path.exists(output_path), f"Dataset does not exist at {output_path}"

        charts_path: str = os.path.join(output_path, "charts")
        ground_truths_path: str = os.path.join(output_path, "groundtruth")

        instances: List[Instance] = []
        for file_name in os.listdir(charts_path):
            # Only chart images are relevant; ignore any other files.
            if not file_name.endswith(".png"):
                continue

            chart_path: str = os.path.join(charts_path, file_name)
            ground_truth_path: str = os.path.join(ground_truths_path, file_name.replace(".png", ".psv"))
            assert os.path.exists(ground_truth_path), f"Ground truth does not exist at {ground_truth_path}"

            # The ground truth is pipe-separated; turn it into comma-separated text.
            with open(ground_truth_path, "r") as file:
                csv_text: str = file.read().replace("|", ",")

            media = MultimediaObject([MediaObject(location=chart_path, content_type="image/png")])
            instances.append(
                Instance(
                    Input(multimedia_content=media),
                    references=[Reference(Output(text=csv_text), tags=[CORRECT_TAG])],
                    split=TEST_SPLIT,
                )
            )

        return instances