crfm-helm: crfm_helm-0.4.0-py3-none-any.whl → crfm_helm-0.5.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (499)
  1. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/METADATA +138 -31
  2. crfm_helm-0.5.1.dist-info/RECORD +654 -0
  3. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +31 -3
  5. helm/benchmark/adaptation/adapters/adapter.py +2 -2
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -4
  9. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +2 -3
  10. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  11. helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
  12. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  13. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +2 -1
  14. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
  15. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  16. helm/benchmark/adaptation/adapters/test_generation_adapter.py +32 -8
  17. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +7 -19
  18. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +60 -6
  19. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  20. helm/benchmark/adaptation/request_state.py +6 -1
  21. helm/benchmark/adaptation/scenario_state.py +6 -2
  22. helm/benchmark/annotation/annotator.py +43 -0
  23. helm/benchmark/annotation/annotator_factory.py +61 -0
  24. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  25. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  26. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  27. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  28. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  29. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  30. helm/benchmark/annotation_executor.py +124 -0
  31. helm/benchmark/augmentations/data_augmenter.py +0 -2
  32. helm/benchmark/augmentations/gender_perturbation.py +1 -1
  33. helm/benchmark/augmentations/perturbation.py +25 -3
  34. helm/benchmark/augmentations/perturbation_description.py +1 -1
  35. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  36. helm/benchmark/augmentations/test_perturbation.py +41 -7
  37. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  38. helm/benchmark/config_registry.py +7 -1
  39. helm/benchmark/executor.py +46 -16
  40. helm/benchmark/huggingface_registration.py +20 -7
  41. helm/benchmark/metrics/basic_metrics.py +169 -664
  42. helm/benchmark/metrics/bbq_metrics.py +3 -4
  43. helm/benchmark/metrics/bias_metrics.py +6 -6
  44. helm/benchmark/metrics/classification_metrics.py +11 -8
  45. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  46. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  47. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  48. helm/benchmark/metrics/common_metric_specs.py +167 -0
  49. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  50. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  51. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  52. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  53. helm/benchmark/metrics/disinformation_metrics.py +4 -110
  54. helm/benchmark/metrics/dry_run_metrics.py +2 -2
  55. helm/benchmark/metrics/efficiency_metrics.py +213 -0
  56. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  57. helm/benchmark/metrics/evaluate_reference_metrics.py +392 -0
  58. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  59. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  60. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  61. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  62. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  63. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  64. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  65. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  66. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  67. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  68. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  69. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  70. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  71. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  72. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  73. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  74. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  75. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  76. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  77. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  78. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  79. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  80. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  81. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  82. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  83. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  84. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  85. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  86. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  87. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  88. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  89. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  90. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  91. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  92. helm/benchmark/metrics/machine_translation_metrics.py +89 -0
  93. helm/benchmark/metrics/metric.py +93 -172
  94. helm/benchmark/metrics/metric_name.py +0 -1
  95. helm/benchmark/metrics/metric_service.py +16 -0
  96. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  97. helm/benchmark/metrics/ranking_metrics.py +2 -2
  98. helm/benchmark/metrics/reference_metric.py +148 -0
  99. helm/benchmark/metrics/summac/model_summac.py +0 -2
  100. helm/benchmark/metrics/summarization_metrics.py +2 -2
  101. helm/benchmark/metrics/test_classification_metrics.py +8 -5
  102. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  103. helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +5 -1
  104. helm/benchmark/metrics/test_metric.py +2 -2
  105. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +10 -2
  106. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  107. helm/benchmark/metrics/toxicity_utils.py +23 -0
  108. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  109. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  110. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  111. helm/benchmark/metrics/vision_language/image_metrics.py +575 -0
  112. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  113. helm/benchmark/model_deployment_registry.py +74 -0
  114. helm/benchmark/model_metadata_registry.py +41 -1
  115. helm/benchmark/multi_gpu_runner.py +133 -0
  116. helm/benchmark/presentation/create_plots.py +8 -7
  117. helm/benchmark/presentation/run_display.py +26 -10
  118. helm/benchmark/presentation/schema.py +15 -40
  119. helm/benchmark/presentation/summarize.py +119 -79
  120. helm/benchmark/presentation/table.py +8 -8
  121. helm/benchmark/presentation/test_contamination.py +2 -2
  122. helm/benchmark/presentation/test_run_entry.py +1 -2
  123. helm/benchmark/presentation/test_summarize.py +3 -3
  124. helm/benchmark/run.py +54 -26
  125. helm/benchmark/run_expander.py +205 -35
  126. helm/benchmark/run_spec.py +93 -0
  127. helm/benchmark/run_spec_factory.py +163 -0
  128. helm/benchmark/run_specs/__init__.py +0 -0
  129. helm/benchmark/run_specs/classic_run_specs.py +1510 -0
  130. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  131. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  132. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  133. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  134. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  135. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  136. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  137. helm/benchmark/run_specs/vlm_run_specs.py +757 -0
  138. helm/benchmark/runner.py +51 -57
  139. helm/benchmark/runner_config_registry.py +21 -0
  140. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  141. helm/benchmark/scenarios/bold_scenario.py +2 -2
  142. helm/benchmark/scenarios/code_scenario.py +1 -0
  143. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  144. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  145. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  146. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  147. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  148. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  149. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  150. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  151. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  152. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  153. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  154. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  155. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  156. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  157. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  158. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  159. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  160. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  161. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  162. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  163. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  164. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  165. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  166. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  167. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  168. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  169. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  170. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  171. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  172. helm/benchmark/scenarios/legalbench_scenario.py +6 -2
  173. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  174. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  175. helm/benchmark/scenarios/math_scenario.py +19 -2
  176. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  177. helm/benchmark/scenarios/numeracy_scenario.py +1 -1
  178. helm/benchmark/scenarios/opinions_qa_scenario.py +0 -4
  179. helm/benchmark/scenarios/scenario.py +4 -0
  180. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  181. helm/benchmark/scenarios/test_math_scenario.py +6 -0
  182. helm/benchmark/scenarios/test_scenario.py +6 -3
  183. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  184. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  185. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  186. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  187. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  188. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
  189. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  190. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
  191. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
  192. helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
  193. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +94 -0
  194. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  195. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  196. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  197. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  198. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  199. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  200. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  201. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  202. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  203. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  204. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  205. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  206. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
  207. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  208. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
  209. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  210. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  211. helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
  212. helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
  213. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  214. helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
  215. helm/benchmark/scenarios/vision_language/pairs_scenario.py +246 -0
  216. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  217. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  218. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  219. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +3 -4
  220. helm/benchmark/scenarios/vision_language/vqa_scenario.py +5 -3
  221. helm/benchmark/scenarios/wmt_14_scenario.py +1 -1
  222. helm/benchmark/server.py +24 -1
  223. helm/benchmark/slurm_runner.py +70 -49
  224. helm/benchmark/static/benchmarking.js +1 -1
  225. helm/benchmark/static/schema_classic.yaml +258 -1066
  226. helm/benchmark/static/schema_image2structure.yaml +304 -0
  227. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  228. helm/benchmark/static/schema_lite.yaml +2 -227
  229. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  230. helm/benchmark/static/schema_unitxt.yaml +428 -0
  231. helm/benchmark/static/schema_vhelm_lite.yaml +164 -0
  232. helm/benchmark/static/schema_vlm.yaml +823 -0
  233. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  234. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  235. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  236. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  237. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  238. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  239. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  240. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  241. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  242. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  243. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  244. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  245. helm/benchmark/static_build/assets/index-737eef9e.js +10 -0
  246. helm/benchmark/static_build/assets/index-878a1094.css +1 -0
  247. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  248. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  249. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  250. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  251. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  252. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  253. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  254. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  255. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  256. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  257. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  258. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  259. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  260. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  261. helm/benchmark/static_build/config.js +4 -0
  262. helm/benchmark/static_build/index.html +20 -0
  263. helm/benchmark/test_data_preprocessor.py +3 -3
  264. helm/benchmark/test_run_expander.py +1 -1
  265. helm/benchmark/window_services/ai21_window_service.py +22 -33
  266. helm/benchmark/window_services/cohere_window_service.py +1 -63
  267. helm/benchmark/window_services/default_window_service.py +2 -44
  268. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  269. helm/benchmark/window_services/ice_window_service.py +0 -34
  270. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  271. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  272. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  273. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  274. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  275. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  276. helm/benchmark/window_services/local_window_service.py +21 -4
  277. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  278. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  279. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  280. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  281. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  282. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  283. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  284. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  285. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  286. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  287. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  288. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  289. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  290. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  291. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  292. helm/benchmark/window_services/test_utils.py +3 -2
  293. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  294. helm/benchmark/window_services/window_service.py +42 -0
  295. helm/benchmark/window_services/window_service_factory.py +4 -1
  296. helm/benchmark/window_services/yalm_window_service.py +0 -27
  297. helm/clients/__init__.py +0 -0
  298. helm/{proxy/clients → clients}/ai21_client.py +3 -9
  299. helm/clients/aleph_alpha_client.py +112 -0
  300. helm/{proxy/clients → clients}/anthropic_client.py +233 -18
  301. helm/{proxy/clients → clients}/auto_client.py +59 -31
  302. helm/clients/bedrock_client.py +128 -0
  303. helm/clients/bedrock_utils.py +72 -0
  304. helm/{proxy/clients → clients}/client.py +65 -7
  305. helm/clients/clip_score_client.py +49 -0
  306. helm/clients/clip_scorers/__init__.py +0 -0
  307. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  308. helm/clients/clip_scorers/clip_scorer.py +50 -0
  309. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  310. helm/{proxy/clients → clients}/cohere_client.py +4 -11
  311. helm/clients/gcs_client.py +82 -0
  312. helm/{proxy/clients → clients}/google_client.py +5 -5
  313. helm/clients/google_translate_client.py +35 -0
  314. helm/{proxy/clients → clients}/http_model_client.py +5 -7
  315. helm/{proxy/clients → clients}/huggingface_client.py +43 -64
  316. helm/clients/image_generation/__init__.py +0 -0
  317. helm/clients/image_generation/adobe_vision_client.py +78 -0
  318. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  319. helm/clients/image_generation/cogview2/__init__.py +0 -0
  320. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  321. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  322. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  323. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  324. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  325. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  326. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  327. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  328. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  329. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  330. helm/clients/image_generation/cogview2_client.py +191 -0
  331. helm/clients/image_generation/dalle2_client.py +192 -0
  332. helm/clients/image_generation/dalle3_client.py +108 -0
  333. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  334. helm/clients/image_generation/dalle_mini/data.py +442 -0
  335. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  336. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  337. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  338. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  339. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  340. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  341. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  342. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  343. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  344. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  345. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  346. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  347. helm/clients/image_generation/dalle_mini_client.py +190 -0
  348. helm/clients/image_generation/deep_floyd_client.py +78 -0
  349. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  350. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  351. helm/clients/image_generation/lexica_client.py +86 -0
  352. helm/clients/image_generation/mindalle/__init__.py +0 -0
  353. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  354. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  355. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  356. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  357. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  358. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  359. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  360. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  361. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  362. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  363. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  364. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  365. helm/clients/image_generation/mindalle_client.py +115 -0
  366. helm/clients/image_generation/nudity_check_client.py +64 -0
  367. helm/clients/image_generation/together_image_generation_client.py +111 -0
  368. helm/{proxy/clients → clients}/lit_gpt_client.py +4 -4
  369. helm/{proxy/clients → clients}/megatron_client.py +5 -5
  370. helm/clients/mistral_client.py +134 -0
  371. helm/clients/moderation_api_client.py +109 -0
  372. helm/clients/open_lm_client.py +43 -0
  373. helm/clients/openai_client.py +301 -0
  374. helm/{proxy/clients → clients}/palmyra_client.py +6 -8
  375. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  376. helm/clients/simple_client.py +64 -0
  377. helm/{proxy/clients → clients}/test_auto_client.py +13 -15
  378. helm/clients/test_client.py +100 -0
  379. helm/{proxy/clients → clients}/test_huggingface_client.py +15 -16
  380. helm/clients/test_simple_client.py +19 -0
  381. helm/{proxy/clients → clients}/test_together_client.py +20 -8
  382. helm/{proxy/clients → clients}/together_client.py +104 -73
  383. helm/clients/vertexai_client.py +400 -0
  384. helm/clients/vision_language/__init__.py +0 -0
  385. helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
  386. helm/clients/vision_language/huggingface_vlm_client.py +111 -0
  387. helm/{proxy/clients → clients}/vision_language/idefics_client.py +54 -49
  388. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  389. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  390. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  391. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  392. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  393. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  394. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  395. helm/clients/vision_language/open_flamingo_client.py +155 -0
  396. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  397. helm/clients/vllm_client.py +46 -0
  398. helm/common/cache.py +16 -4
  399. helm/common/cache_backend_config.py +47 -0
  400. helm/common/clip_score_request.py +41 -0
  401. helm/common/file_caches/__init__.py +0 -0
  402. helm/common/file_caches/file_cache.py +16 -0
  403. helm/common/file_caches/local_file_cache.py +61 -0
  404. helm/common/file_caches/test_local_file_cache.py +25 -0
  405. helm/common/file_upload_request.py +27 -0
  406. helm/common/general.py +1 -1
  407. helm/common/image_generation_parameters.py +25 -0
  408. helm/common/images_utils.py +33 -3
  409. helm/common/key_value_store.py +35 -4
  410. helm/common/media_object.py +13 -0
  411. helm/common/moderations_api_request.py +71 -0
  412. helm/common/mongo_key_value_store.py +3 -3
  413. helm/common/multimodal_request_utils.py +31 -0
  414. helm/common/nudity_check_request.py +29 -0
  415. helm/common/request.py +15 -17
  416. helm/common/test_general.py +6 -0
  417. helm/common/tokenization_request.py +1 -1
  418. helm/config/model_deployments.yaml +1159 -538
  419. helm/config/model_metadata.yaml +868 -41
  420. helm/config/tokenizer_configs.yaml +149 -43
  421. helm/proxy/accounts.py +31 -4
  422. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  423. helm/proxy/critique/model_critique_client.py +8 -6
  424. helm/proxy/example_queries.py +29 -17
  425. helm/proxy/server.py +70 -5
  426. helm/proxy/services/remote_service.py +31 -0
  427. helm/proxy/services/server_service.py +96 -16
  428. helm/proxy/services/service.py +30 -0
  429. helm/proxy/services/test_remote_service.py +4 -3
  430. helm/proxy/services/test_service.py +0 -12
  431. helm/proxy/test_accounts.py +32 -0
  432. helm/proxy/token_counters/auto_token_counter.py +37 -37
  433. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  434. helm/proxy/token_counters/token_counter.py +3 -5
  435. helm/tokenizers/__init__.py +0 -0
  436. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  437. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +1 -1
  438. helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +6 -9
  439. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  440. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  441. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +7 -26
  442. helm/tokenizers/simple_tokenizer.py +33 -0
  443. helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +1 -1
  444. helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +3 -0
  445. helm/tokenizers/test_simple_tokenizer.py +33 -0
  446. helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
  447. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  448. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  449. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  450. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  451. crfm_helm-0.4.0.dist-info/RECORD +0 -397
  452. helm/benchmark/run_specs.py +0 -2762
  453. helm/benchmark/test_model_deployment_definition.py +0 -92
  454. helm/benchmark/test_model_properties.py +0 -1570
  455. helm/benchmark/vlm_run_specs.py +0 -97
  456. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  457. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  458. helm/benchmark/window_services/huggingface_window_service.py +0 -60
  459. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  460. helm/benchmark/window_services/t511b_window_service.py +0 -30
  461. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  462. helm/benchmark/window_services/ul2_window_service.py +0 -30
  463. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  464. helm/common/cache_utils.py +0 -14
  465. helm/proxy/clients/aleph_alpha_client.py +0 -95
  466. helm/proxy/clients/goose_ai_client.py +0 -99
  467. helm/proxy/clients/microsoft_client.py +0 -180
  468. helm/proxy/clients/openai_client.py +0 -206
  469. helm/proxy/clients/simple_client.py +0 -60
  470. helm/proxy/clients/test_client.py +0 -49
  471. helm/proxy/clients/vertexai_client.py +0 -115
  472. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  473. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  474. helm/proxy/token_counters/free_token_counter.py +0 -12
  475. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  476. helm/proxy/token_counters/openai_token_counter.py +0 -22
  477. helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
  478. helm/proxy/token_counters/test_openai_token_counter.py +0 -81
  479. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  480. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/LICENSE +0 -0
  481. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/entry_points.txt +0 -0
  482. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/top_level.txt +0 -0
  483. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  484. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  485. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  486. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  487. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  488. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  489. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  490. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  491. /helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +0 -0
  492. /helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +0 -0
  493. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  494. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  495. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  496. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  497. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  498. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  499. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
@@ -0,0 +1,135 @@
1
+ import os
2
+ from typing import Dict, List
3
+ import json
4
+
5
+ from helm.common.general import ensure_file_downloaded
6
+ from helm.common.hierarchical_logger import hlog
7
+ from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, TEST_SPLIT, CORRECT_TAG, Input, Output
8
+
9
+
10
class ThaiExamScenario(Scenario):
    """
    ThaiExam, a benchmark comprising Thai multiple-choice examinations as follows:

    ∙ ONET: The Ordinary National Educational Test (ONET) is an examination for students in Thailand.
    We select the grade-12 ONET exam, which comprises 5 subjects and each question has 5 choices.
    These subjects are Thai, English, Mathematics, Social Studies, and Science.
    Amounting to a total of 170 questions and options.

    ∙ IC: The Investment Consultant (IC) examination, a licensing test for investment professionals in Thailand.
    Developed by the Stock Exchange of Thailand (SET), features 4 choices per question.
    We extracted questions for levels 1, 2, and 3 resulting in a total of 95 questions and options.

    ∙ TGAT: The Thai General Aptitude Test (TGAT), a national high school examination in Thailand.
    Focuses on critical and logical thinking skills.
    We collected a total of 90 questions and answers. The TGAT consists of four choices per question.

    ∙ TPAT-1: The Thai Professional Aptitude Test 1 (TPAT-1) is a national high school examination in Thailand.
    The Exam assesses students’ professional skills requirement in medical schools.
    This subset contains reasoning and medical ethics. We collected a total of 116 questions and answers.
    The TPAT-1 consists of 5 choices per question.

    ∙ A-Level: An academic knowledge assessment examination (Applied Knowledge Level)
    that covers general foundational subjects taught in schools.
    The content assessed in this examination aligns with the curriculum guidelines
    and emphasizes the practical application of knowledge in daily life.
    We collected a total of 175 questions and answers.

    We created and used these exams to evaluate the performance of the Typhoon models
    (https://arxiv.org/abs/2312.13951).

    Prompt models using the following format

        <input>                  # train
        A. <reference>
        B. <reference>
        C. <reference>
        D. <reference>
        E. <reference>
        Answer: <A/B/C/D/E>

        x N (N-shot)

        <input>                  # test
        A. <reference1>
        B. <reference2>
        C. <reference3>
        D. <reference4>
        E. <reference5>
        Answer:

    For example:

        ในระบบย่อยอาหารของมนุษย์ การดูดซึมสารอาหารส่วนใหญ่เกิดขึ้นที่อวัยวะใด?
        A. ลำไส้เล็ก
        B. ตับอ่อน
        C. ลำไส้ใหญ่
        D. กระเพาะอาหาร
        E. หัวใจ
        Answer: A

        ข้อใดอธิบายเกี่ยวกับแรงไฟฟ้าได้ถูกต้อง?
        A. เกิดได้โดยที่วัตถุไม่ต้องสัมผัสกัน
        B. เป็นได้เฉพาะแรงผลักเท่านั้น
        C. เป็นได้เฉพาะแรงดูดเท่านั้น
        D. เป็นแรงต้านระหว่างวัตถุเท่านั้น
        E. ถูกทุกข้อ
        Answer:

    Target: A
    """

    name = "thai_exam"
    description = "ThaiExam benchmark comprising Thai multiple-choice examinations."
    tags = ["knowledge", "multiple_choice"]

    def __init__(self, exam: str):
        """Store the exam name; it selects the sub-directory and file prefix read by `get_instances`."""
        super().__init__()
        self.exam = exam

    def download_thai_exam(self, path: str):
        """Download the ThaiExam archive and unpack it at `path`."""
        ensure_file_downloaded(
            "https://storage.googleapis.com/thai_dataset/thai_exam.tar.gz",
            target_path=path,
            unpack=True,
        )

    def process_jsonl(self, jsonl_path: str, split: str) -> List[Instance]:
        """
        Turn every JSON line of `jsonl_path` into an `Instance` belonging to `split`.

        Each record provides a "question", an "answer" letter and up to five choices under
        the keys "a".."e". Missing or empty choices are dropped.

        Raises:
            KeyError: if the answer letter does not refer to a non-empty choice.
        """
        instances: List[Instance] = []
        hlog(f"Reading {jsonl_path}")
        # The exams are Thai text: decode explicitly instead of relying on the platform default.
        with open(jsonl_path, "r", encoding="utf-8") as f:
            for line in f:
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline at the end of the file).
                    continue
                data = json.loads(line)

                # Some subjects do not have all 5 choices. Key each remaining choice by its OWN
                # letter so that a gap in the middle cannot shift the letters of later choices.
                choices: Dict[str, str] = {
                    key.upper(): data[key] for key in ("a", "b", "c", "d", "e") if key in data and data[key] != ""
                }

                correct_letter: str = data["answer"].upper()
                if correct_letter not in choices:
                    raise KeyError(f"Answer {correct_letter!r} has no matching choice in {jsonl_path}")

                # Tag by letter rather than by text so that two choices with identical text
                # cannot both be marked correct.
                references: List[Reference] = [
                    Reference(Output(text=text), tags=[CORRECT_TAG] if letter == correct_letter else [])
                    for letter, text in choices.items()
                ]

                instances.append(
                    Instance(
                        input=Input(text=data["question"]),
                        references=references,
                        split=split,
                    )
                )
        return instances

    def get_instances(self, output_path: str) -> List[Instance]:
        """Download the data below `output_path` and load the train and test files of `self.exam`."""
        data_path: str = os.path.join(output_path, "data")
        self.download_thai_exam(data_path)

        instances: List[Instance] = []
        splits: Dict[str, str] = {
            "train": TRAIN_SPLIT,
            "test": TEST_SPLIT,
        }
        for file_split, helm_split in splits.items():
            jsonl_path: str = os.path.join(data_path, self.exam, f"{self.exam}_{file_split}.jsonl")
            if not os.path.exists(jsonl_path):
                # Not every exam ships both files; a missing one is skipped rather than fatal.
                hlog(f"{jsonl_path} doesn't exist, skipping")
                continue
            instances.extend(self.process_jsonl(jsonl_path, helm_split))

        return instances
@@ -0,0 +1,56 @@
1
+ from typing import List
2
+ from datasets import load_dataset
3
+
4
+ from helm.benchmark.scenarios.scenario import (
5
+ Output,
6
+ Reference,
7
+ Scenario,
8
+ Instance,
9
+ Input,
10
+ CORRECT_TAG,
11
+ TRAIN_SPLIT,
12
+ TEST_SPLIT,
13
+ VALID_SPLIT,
14
+ )
15
+
16
+
17
class UnitxtScenario(Scenario):
    """
    Integration with Unitxt: https://unitxt.rtfd.io/

    The keyword arguments given to the constructor are joined as comma-separated
    `key=value` pairs and used as the configuration name of the "unitxt/data" dataset.
    """

    name = "unitxt"
    description = "Unitxt Scenarios"
    tags = ["unitxt"]

    # Split names used by the loaded dataset -> HELM split constants.
    UNITXT_SPLIT_NAME_TO_HELM_SPLIT_NAME = {
        "train": TRAIN_SPLIT,
        "test": TEST_SPLIT,
        "validation": VALID_SPLIT,
    }

    def __init__(self, **kwargs):
        """Keep the raw keyword arguments; they are serialized into the dataset name later."""
        super().__init__()
        self.kwargs = kwargs

    def get_instances(self, output_path: str) -> List[Instance]:
        """
        Load the configured dataset and convert every row into an `Instance`.

        Each row's "source" becomes the input text and every entry of its "references"
        becomes a reference tagged as correct. Splits that the loaded dataset does not
        provide are skipped.
        """
        dataset_name = ",".join(f"{key}={value}" for key, value in self.kwargs.items())
        # NOTE: trust_remote_code=True lets the dataset repository's loading script run locally.
        dataset = load_dataset("unitxt/data", dataset_name, trust_remote_code=True)

        instances: List[Instance] = []

        for unitxt_split_name, helm_split_name in UnitxtScenario.UNITXT_SPLIT_NAME_TO_HELM_SPLIT_NAME.items():
            if unitxt_split_name not in dataset:
                # Not every recipe yields train, test AND validation; indexing a missing
                # split would raise KeyError.
                continue
            for index, row in enumerate(dataset[unitxt_split_name]):
                references = [
                    Reference(
                        output=Output(text=reference_text),
                        tags=[CORRECT_TAG],
                    )
                    for reference_text in row["references"]
                ]
                instance = Instance(
                    id=f"{unitxt_split_name}{index}",
                    input=Input(text=row["source"]),
                    references=references,
                    split=helm_split_name,
                )
                instances.append(instance)
        return instances
@@ -137,7 +137,9 @@ class VerifiabilityJudgementScenario(Scenario):
137
137
  for _, filesplit in split_to_filesplit.items():
138
138
  target_name = f"verifiability_judgments_{filesplit}.jsonl"
139
139
  target_path: str = os.path.join(data_path, target_name)
140
- url: str = f"https://github.com/nelson-liu/evaluating-verifiability-in-generative-search-engines/raw/40bf37e3a4eca7d82515df2c800ec9605458d637/verifiability_judgments/{target_name}.gz" # noqa: E501
140
+ url: str = (
141
+ f"https://github.com/nelson-liu/evaluating-verifiability-in-generative-search-engines/raw/40bf37e3a4eca7d82515df2c800ec9605458d637/verifiability_judgments/{target_name}.gz" # noqa: E501
142
+ )
141
143
  ensure_file_downloaded(source_url=url, target_path=target_path)
142
144
  assert os.path.exists(target_path)
143
145
 
@@ -26,7 +26,7 @@ class VicunaScenario(Scenario):
26
26
  return self.category == "all" or raw["category"] == self.category
27
27
 
28
28
  # Download the raw data
29
- source_url = "https://raw.githubusercontent.com/lm-sys/FastChat/main/fastchat/eval/table/question.jsonl"
29
+ source_url = "https://raw.githubusercontent.com/lm-sys/FastChat/v0.2.5/fastchat/eval/table/question.jsonl"
30
30
  data_path: str = os.path.join(output_path, "vicuna_questions.jsonl")
31
31
 
32
32
  ensure_file_downloaded(
@@ -0,0 +1,83 @@
1
+ import os
2
+ from typing import List
3
+
4
+ from datasets import load_dataset
5
+ from tqdm import tqdm
6
+
7
+ from helm.benchmark.scenarios.scenario import (
8
+ CORRECT_TAG,
9
+ ALL_SPLITS,
10
+ VALID_SPLIT,
11
+ TEST_SPLIT,
12
+ Instance,
13
+ Input,
14
+ Output,
15
+ Reference,
16
+ Scenario,
17
+ )
18
+ from helm.common.media_object import MediaObject, MultimediaObject
19
+ from helm.common.general import ensure_directory_exists
20
+
21
+
22
class AOKVQAScenario(Scenario):
    """
    A-OKVQA: a crowdsourced visual question answering dataset of about 25K questions that
    need a broad base of commonsense and world knowledge to answer.

    @misc{schwenk2022aokvqa,
        title={A-OKVQA: A Benchmark for Visual Question Answering using World Knowledge},
        author={Dustin Schwenk and Apoorv Khandelwal and Christopher Clark and Kenneth Marino and Roozbeh Mottaghi},
        year={2022},
        eprint={2206.01718},
        archivePrefix={arXiv},
        primaryClass={cs.CV}
    }

    Paper: https://arxiv.org/abs/2206.01718
    Website: https://huggingface.co/datasets/HuggingFaceM4/A-OKVQA
    """

    HF_DATASET_NAME: str = "HuggingFaceM4/A-OKVQA"

    name = "a_okvqa"
    description = (
        "A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of "
        "commonsense and world knowledge to answer ([paper](https://arxiv.org/abs/2206.01718))."
    )
    tags = ["vision-language", "knowledge", "reasoning"]

    def get_instances(self, output_path: str) -> List[Instance]:
        """
        Build one multiple-choice instance per row of every split except the test split.

        Row images are written once to `<output_path>/images/<question_id>.jpg`; the input
        is that image followed by the question text.
        """
        image_dir: str = os.path.join(output_path, "images")
        ensure_directory_exists(image_dir)

        collected: List[Instance] = []
        for helm_split in ALL_SPLITS:
            # The examples of the test split do not have answers, so only the others are used.
            if helm_split != TEST_SPLIT:
                if helm_split == VALID_SPLIT:
                    hf_split = "validation"
                else:
                    hf_split = helm_split

                rows = load_dataset(self.HF_DATASET_NAME, cache_dir=output_path, split=hf_split)
                for row in tqdm(rows):
                    saved_image_path: str = os.path.join(image_dir, f"{row['question_id']}.jpg")
                    picture = row["image"]
                    # Write the image only the first time it is seen.
                    if not os.path.exists(saved_image_path):
                        picture.save(saved_image_path)

                    image_part = MediaObject(location=saved_image_path, content_type="image/jpeg")
                    question_part = MediaObject(text=row["question"], content_type="text/plain")
                    media: List[MediaObject] = [image_part, question_part]

                    options: List[Reference] = []
                    for position, option_text in enumerate(row["choices"]):
                        if position == row["correct_choice_idx"]:
                            option_tags = [CORRECT_TAG]
                        else:
                            option_tags = []
                        options.append(Reference(Output(text=option_text), tags=option_tags))

                    collected.append(
                        Instance(
                            Input(multimedia_content=MultimediaObject(media)),
                            references=options,
                            split=helm_split,
                        )
                    )

        return collected
@@ -0,0 +1,103 @@
1
+ import os.path
2
+ from typing import List
3
+
4
+ from datasets import load_dataset
5
+ from tqdm import tqdm
6
+
7
+ from helm.benchmark.scenarios.scenario import (
8
+ CORRECT_TAG,
9
+ TEST_SPLIT,
10
+ Instance,
11
+ Input,
12
+ Output,
13
+ Reference,
14
+ Scenario,
15
+ )
16
+ from helm.common.media_object import MediaObject, MultimediaObject
17
+ from helm.common.general import ensure_directory_exists, ensure_file_downloaded
18
+
19
+
20
class BingoScenario(Scenario):
    """
    Holistic Analysis of Hallucination in GPT-4V(ision): Bias and Interference Challenges

    We introduce a new benchmark, namely, the Bias and Interference Challenges in Visual Language Models (Bingo).
    This benchmark is designed to evaluate and shed light on the two common types of hallucinations in visual
    language models: bias and interference. Here, bias refers to the model's tendency to hallucinate certain types
    of responses, possibly due to imbalance in its training data. Interference pertains to scenarios where the
    judgment of GPT-4V(ision) can be disrupted due to how the text prompt is phrased or how the input image is
    presented. The benchmark consists of open-ended question-answer pairs, and we employ open-ended generation
    metrics for evaluation. In the experiment, we identify a notable regional bias, whereby GPT-4V(ision) is
    better at interpreting Western images or images with English writing compared to images from other countries
    or containing text in other languages.

    @article{cui2023holistic,
        title={Holistic analysis of hallucination in gpt-4v (ision): Bias and interference challenges},
        author={Cui, Chenhang and Zhou, Yiyang and Yang, Xinyu and Wu, Shirley and Zhang, Linjun and
                Zou, James and Yao, Huaxiu},
        journal={arXiv preprint arXiv:2311.03287},
        year={2023}
    }

    Paper: https://arxiv.org/abs/2311.03287
    """

    BINGO_HUGGINGFACE_DATASET_NAME: str = "PahaII/Bingo"

    # Template of the per-image download URL; `image_path` comes from each dataset row.
    IMAGE_URL: str = "https://huggingface.co/datasets/PahaII/Bingo/resolve/main/images/{image_path}?download=true"

    SUBJECTS: List[str] = ["T2I", "I2I", "OCR", "Factual", "Region"]

    name = "bingo"
    description = (
        "Evaluate multimodal models on biased and inference-challenging scenarios with five subjects"
        " ([paper](https://arxiv.org/abs/2311.03287))."
    )
    tags = ["vision-language"]

    def __init__(self, subject: str):
        """Remember which subject to load; `subject` must be one of `SUBJECTS`."""
        super().__init__()
        assert subject in self.SUBJECTS, f"Invalid subject: {subject}"
        self._subject: str = subject

    def get_instances(self, output_path: str) -> List[Instance]:
        """
        Create one open-ended instance per question of the selected subject.

        The subject's `<subject>.json` file is loaded as the test split. Every row's image
        is downloaded to `<output_path>/<image_path>`, and the row's answer becomes the
        single correct reference.
        """
        ensure_directory_exists(os.path.join(output_path, "images"))

        # The Bingo benchmark only has a test split.
        rows = load_dataset(
            self.BINGO_HUGGINGFACE_DATASET_NAME,
            data_files={TEST_SPLIT: f"{self._subject}.json"},
            split=TEST_SPLIT,
            cache_dir=output_path,
        )

        collected: List[Instance] = []
        for row in tqdm(rows):
            # Fetch the image that belongs to this question.
            relative_image_path: str = row["image_path"]
            downloaded_image_path: str = os.path.join(output_path, relative_image_path)
            ensure_file_downloaded(
                source_url=self.IMAGE_URL.format(image_path=relative_image_path),
                target_path=downloaded_image_path,
                unpack=False,
            )

            image_part = MediaObject(location=downloaded_image_path, content_type="image/png")
            question_part = MediaObject(text=row["question"], content_type="text/plain")
            gold = Reference(Output(text=row["answer"]), tags=[CORRECT_TAG])

            collected.append(
                Instance(
                    Input(multimedia_content=MultimediaObject([image_part, question_part])),
                    references=[gold],
                    split=TEST_SPLIT,
                )
            )

        return collected
@@ -0,0 +1,134 @@
1
+ import json
2
+ import os
3
+ from typing import Dict, List
4
+
5
+ from helm.benchmark.scenarios.scenario import (
6
+ CORRECT_TAG,
7
+ TEST_SPLIT,
8
+ Instance,
9
+ Input,
10
+ Output,
11
+ Reference,
12
+ Scenario,
13
+ )
14
+ from helm.common.media_object import MediaObject, MultimediaObject
15
+ from helm.common.general import ensure_file_downloaded
16
+
17
+
18
class Crossmodal3600Scenario(Scenario):
    """
    Crossmodal-3600 dataset (XM3600 in short), a geographically-diverse set of 3600 images annotated
    with human-generated reference captions in 36 languages.

    @inproceedings{ThapliyalCrossmodal2022,
        author = {Ashish Thapliyal and Jordi Pont-Tuset and Xi Chen and Radu Soricut},
        title = {{Crossmodal-3600: A Massively Multilingual Multimodal Evaluation Dataset}},
        booktitle = {EMNLP},
        year = {2022}
    }

    Paper: https://arxiv.org/abs/2205.12522
    Website: https://google.github.io/crossmodal-3600/
    """

    # Human-readable language name -> identifier used in the captions file.
    # NOTE(review): this table has 35 entries while the description speaks of 36 languages;
    # confirm whether one language is missing.
    LANGUAGE_TO_ID: Dict[str, str] = {
        "arabic": "ar",
        "bengali": "bn",
        "chinese": "zh",
        "croatian": "hr",
        "cusco_quechua": "quz",
        "czech": "cs",
        "danish": "da",
        "dutch": "nl",
        "english": "en",
        "persian": "fa",
        "finnish": "fi",
        "french": "fr",
        "german": "de",
        "greek": "el",
        "hebrew": "he",
        "hindi": "hi",
        "hungarian": "hu",
        "indonesian": "id",
        "italian": "it",
        "japanese": "ja",
        "korean": "ko",
        "maori": "mi",
        "norwegian": "no",
        "polish": "pl",
        "portuguese": "pt",
        "romanian": "ro",
        "russian": "ru",
        "spanish": "es",
        "swahili": "sw",
        "swedish": "sv",
        "telugu": "te",
        "thai": "th",
        "turkish": "tr",
        "ukrainian": "uk",
        "vietnamese": "vi",
    }

    IMAGES_URL: str = "https://open-images-dataset.s3.amazonaws.com/crossmodal-3600/images.tgz"
    CAPTIONS_URL: str = "https://google.github.io/crossmodal-3600/web-data/captions.zip"

    name = "crossmodal_3600"
    description = (
        "Crossmodal-3600 dataset (XM3600 in short), a geographically-diverse set of 3600 images annotated "
        "with human-generated reference captions in 36 languages. ([paper](https://arxiv.org/abs/2205.12522))."
    )
    tags = ["vision-language", "multilinguality"]

    def __init__(self, location: str, language: str):
        """
        Select which images and which caption language to use.

        Both arguments are looked up in `LANGUAGE_TO_ID`: `location` selects the image locale
        to keep and `language` selects the captions used as references. An unknown name
        raises KeyError.
        """
        super().__init__()
        self._locale_id: str = self.LANGUAGE_TO_ID[location]
        self._language_id: str = self.LANGUAGE_TO_ID[language]
        self._instruction: str = f"Generate a short caption for the following image in {language}."

    def get_instances(self, output_path: str) -> List[Instance]:
        """
        Download images and captions, then build one captioning instance per matching image.

        Only examples whose "image/locale" equals the configured locale are kept. The input
        is the instruction followed by the image, and every caption in the configured
        language becomes a correct reference. All instances are in the test split.
        """
        images_path: str = os.path.join(output_path, "images")
        ensure_file_downloaded(
            source_url=self.IMAGES_URL,
            target_path=images_path,
            unpack=True,
            unpack_type="untar",
        )

        captions_path: str = os.path.join(output_path, "captions.jsonl")
        ensure_file_downloaded(
            source_url=self.CAPTIONS_URL,
            target_path=captions_path,
            unpack=True,
            unpack_type="unzip",
        )

        instances: List[Instance] = []
        # The captions span many scripts: decode explicitly as UTF-8 instead of relying on
        # the platform's default encoding.
        with open(captions_path, "r", encoding="utf-8") as captions_file:
            for line in captions_file:
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline at the end of the file).
                    continue
                example: Dict = json.loads(line)

                locale_id: str = example["image/locale"]
                if locale_id != self._locale_id:
                    continue

                key: str = example["image/key"]
                image_path: str = os.path.join(images_path, f"{key}.jpg")
                assert os.path.exists(image_path), f"Image {image_path} does not exist"

                assert self._language_id in example, f"Language {self._language_id} not found in example"
                all_captions: Dict = example[self._language_id]
                captions: List[str] = all_captions["caption"]

                content: List[MediaObject] = [
                    MediaObject(text=self._instruction, content_type="text/plain"),
                    MediaObject(location=image_path, content_type="image/jpeg"),
                ]
                instances.append(
                    Instance(
                        Input(multimedia_content=MultimediaObject(content)),
                        references=[Reference(Output(text=caption), tags=[CORRECT_TAG]) for caption in captions],
                        split=TEST_SPLIT,
                    )
                )

        return instances
@@ -0,0 +1,74 @@
1
+ import os
2
+ from typing import List
3
+
4
+ from datasets import load_dataset
5
+ from tqdm import tqdm
6
+
7
+ from helm.benchmark.scenarios.scenario import (
8
+ CORRECT_TAG,
9
+ VALID_SPLIT,
10
+ Instance,
11
+ Input,
12
+ Output,
13
+ Reference,
14
+ Scenario,
15
+ )
16
+ from helm.common.media_object import MediaObject, MultimediaObject
17
+ from helm.common.general import ensure_directory_exists
18
+
19
+
20
class Flickr30KScenario(Scenario):
    """
    Flickr30K: an image caption corpus of 158,915 crowdsourced captions describing 31,783 Flickr images.

    @article{young2014image,
        title={From image descriptions to visual denotations: New similarity metrics for semantic
               inference over event descriptions},
        author={Young, Peter and Lai, Alice and Hodosh, Micah and Hockenmaier, Julia},
        journal={Transactions of the Association for Computational Linguistics},
        volume={2},
        pages={67--78},
        year={2014},
        publisher={MIT Press}
    }

    Paper: https://shannon.cs.illinois.edu/DenotationGraph/TACLDenotationGraph.pdf
    Website: https://shannon.cs.illinois.edu/DenotationGraph/
    """

    HF_DATASET_NAME: str = "nlphuji/flickr30k"

    name = "flickr30k"
    description = (
        "An image caption corpus consisting of 158,915 crowd-sourced captions describing 31,783 Flickr "
        "images ([paper](https://shannon.cs.illinois.edu/DenotationGraph/TACLDenotationGraph.pdf))."
    )
    tags = ["vision-language"]

    def get_instances(self, output_path: str) -> List[Instance]:
        """
        Build one captioning instance per row of the dataset's "test" split.

        Each row's own "split" column decides the HELM split ("val" is mapped to the
        validation split, other values are used as-is). The row image is written once to
        `<output_path>/images/<filename>` and every caption becomes a correct reference.
        """
        image_dir: str = os.path.join(output_path, "images")
        ensure_directory_exists(image_dir)

        rows = load_dataset(self.HF_DATASET_NAME, cache_dir=output_path, split="test")

        collected: List[Instance] = []
        for row in tqdm(rows):
            row_split: str = row["split"]
            if row_split == "val":
                target_split: str = VALID_SPLIT
            else:
                target_split = row_split

            saved_image_path: str = os.path.join(image_dir, row["filename"])
            picture = row["image"]
            # Write the image only the first time it is seen.
            if not os.path.exists(saved_image_path):
                picture.save(saved_image_path)

            media: List[MediaObject] = [MediaObject(location=saved_image_path, content_type="image/jpeg")]

            gold_captions: List[Reference] = []
            for caption in row["caption"]:
                gold_captions.append(Reference(Output(text=caption), tags=[CORRECT_TAG]))

            collected.append(
                Instance(
                    Input(multimedia_content=MultimediaObject(media)),
                    references=gold_captions,
                    split=target_split,
                )
            )

        return collected