crfm-helm 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +144 -36
  2. crfm_helm-0.5.0.dist-info/RECORD +642 -0
  3. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +37 -2
  5. helm/benchmark/adaptation/adapters/adapter.py +4 -42
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/generation_adapter.py +2 -0
  9. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +21 -4
  10. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +12 -5
  11. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +1 -0
  12. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  13. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  14. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +5 -1
  15. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +1 -0
  16. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -0
  17. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  18. helm/benchmark/adaptation/adapters/test_generation_adapter.py +59 -14
  19. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +40 -5
  20. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +78 -10
  21. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  22. helm/benchmark/adaptation/prompt.py +7 -1
  23. helm/benchmark/adaptation/request_state.py +6 -1
  24. helm/benchmark/adaptation/scenario_state.py +6 -2
  25. helm/benchmark/annotation/annotator.py +43 -0
  26. helm/benchmark/annotation/annotator_factory.py +61 -0
  27. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  28. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  29. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  30. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  31. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  32. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  33. helm/benchmark/annotation_executor.py +124 -0
  34. helm/benchmark/augmentations/cleva_perturbation.py +7 -14
  35. helm/benchmark/augmentations/contraction_expansion_perturbation.py +3 -3
  36. helm/benchmark/augmentations/contrast_sets_perturbation.py +0 -3
  37. helm/benchmark/augmentations/data_augmenter.py +0 -2
  38. helm/benchmark/augmentations/dialect_perturbation.py +2 -2
  39. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  40. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  41. helm/benchmark/augmentations/gender_perturbation.py +3 -3
  42. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  43. helm/benchmark/augmentations/mild_mix_perturbation.py +2 -2
  44. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  45. helm/benchmark/augmentations/person_name_perturbation.py +0 -7
  46. helm/benchmark/augmentations/perturbation.py +20 -7
  47. helm/benchmark/augmentations/perturbation_description.py +1 -1
  48. helm/benchmark/augmentations/space_perturbation.py +2 -2
  49. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  50. helm/benchmark/augmentations/synonym_perturbation.py +2 -2
  51. helm/benchmark/augmentations/test_perturbation.py +11 -7
  52. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  53. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  54. helm/benchmark/config_registry.py +38 -0
  55. helm/benchmark/executor.py +46 -16
  56. helm/benchmark/huggingface_registration.py +37 -7
  57. helm/benchmark/metrics/basic_metrics.py +172 -641
  58. helm/benchmark/metrics/bbq_metrics.py +3 -4
  59. helm/benchmark/metrics/bias_metrics.py +6 -6
  60. helm/benchmark/metrics/classification_metrics.py +11 -8
  61. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  62. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  63. helm/benchmark/metrics/code_metrics.py +4 -3
  64. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  65. helm/benchmark/metrics/common_metric_specs.py +167 -0
  66. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  67. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  68. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  69. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  70. helm/benchmark/metrics/disinformation_metrics.py +6 -112
  71. helm/benchmark/metrics/dry_run_metrics.py +5 -3
  72. helm/benchmark/metrics/efficiency_metrics.py +206 -0
  73. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  74. helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
  75. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  76. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  77. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  78. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  79. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  80. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  81. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  82. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  83. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  84. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  85. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  86. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  87. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  88. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  89. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  90. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  91. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  92. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  93. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  94. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  95. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  96. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  97. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  98. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  99. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  100. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  101. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  102. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  103. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  104. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  105. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  106. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  107. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  108. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  109. helm/benchmark/metrics/machine_translation_metrics.py +5 -5
  110. helm/benchmark/metrics/metric.py +93 -172
  111. helm/benchmark/metrics/metric_name.py +0 -1
  112. helm/benchmark/metrics/metric_service.py +16 -0
  113. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  114. helm/benchmark/metrics/ranking_metrics.py +6 -7
  115. helm/benchmark/metrics/reference_metric.py +148 -0
  116. helm/benchmark/metrics/summac/model_summac.py +0 -2
  117. helm/benchmark/metrics/summarization_metrics.py +8 -8
  118. helm/benchmark/metrics/test_classification_metrics.py +9 -6
  119. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  120. helm/benchmark/metrics/test_evaluate_reference_metrics.py +30 -0
  121. helm/benchmark/metrics/test_metric.py +2 -2
  122. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +1 -1
  123. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +13 -3
  124. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  125. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -0
  126. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +9 -2
  127. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  128. helm/benchmark/metrics/toxicity_utils.py +23 -0
  129. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  130. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  131. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  132. helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
  133. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  134. helm/benchmark/model_deployment_registry.py +164 -41
  135. helm/benchmark/model_metadata_registry.py +181 -35
  136. helm/benchmark/multi_gpu_runner.py +133 -0
  137. helm/benchmark/presentation/contamination.py +3 -3
  138. helm/benchmark/presentation/create_plots.py +8 -7
  139. helm/benchmark/presentation/run_display.py +50 -17
  140. helm/benchmark/presentation/schema.py +28 -46
  141. helm/benchmark/presentation/summarize.py +213 -96
  142. helm/benchmark/presentation/table.py +8 -8
  143. helm/benchmark/presentation/test_contamination.py +2 -2
  144. helm/benchmark/presentation/test_run_entry.py +14 -9
  145. helm/benchmark/presentation/test_summarize.py +5 -0
  146. helm/benchmark/run.py +66 -54
  147. helm/benchmark/run_expander.py +342 -31
  148. helm/benchmark/run_spec.py +93 -0
  149. helm/benchmark/run_spec_factory.py +162 -0
  150. helm/benchmark/run_specs/__init__.py +0 -0
  151. helm/benchmark/{run_specs.py → run_specs/classic_run_specs.py} +217 -1330
  152. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  153. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  154. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  155. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  156. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  157. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  158. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  159. helm/benchmark/run_specs/vlm_run_specs.py +501 -0
  160. helm/benchmark/runner.py +116 -69
  161. helm/benchmark/runner_config_registry.py +21 -0
  162. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  163. helm/benchmark/scenarios/bold_scenario.py +2 -2
  164. helm/benchmark/scenarios/cleva_scenario.py +43 -46
  165. helm/benchmark/scenarios/code_scenario.py +3 -2
  166. helm/benchmark/scenarios/commonsense_scenario.py +171 -191
  167. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  168. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  169. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  170. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  171. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  172. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  173. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  174. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  175. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  176. helm/benchmark/scenarios/entity_matching_scenario.py +1 -1
  177. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  178. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  179. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  180. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  181. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  182. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  183. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  184. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  185. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  186. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  187. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  188. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  189. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  190. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  191. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  192. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  193. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  194. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  195. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  196. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  197. helm/benchmark/scenarios/legalbench_scenario.py +123 -0
  198. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  199. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  200. helm/benchmark/scenarios/lsat_qa_scenario.py +4 -2
  201. helm/benchmark/scenarios/math_scenario.py +19 -2
  202. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  203. helm/benchmark/scenarios/numeracy_scenario.py +3 -3
  204. helm/benchmark/scenarios/opinions_qa_scenario.py +6 -10
  205. helm/benchmark/scenarios/raft_scenario.py +2 -6
  206. helm/benchmark/scenarios/scenario.py +14 -2
  207. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  208. helm/benchmark/scenarios/test_math_scenario.py +22 -0
  209. helm/benchmark/scenarios/test_scenario.py +6 -3
  210. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  211. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  212. helm/benchmark/scenarios/the_pile_scenario.py +6 -7
  213. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  214. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  215. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  216. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  217. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
  218. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  219. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  220. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  221. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  222. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  223. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  224. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  225. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  226. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  227. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  228. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  229. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  230. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  231. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  232. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  233. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  234. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  235. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  236. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  237. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +107 -0
  238. helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
  239. helm/benchmark/scenarios/wmt_14_scenario.py +18 -18
  240. helm/benchmark/server.py +59 -2
  241. helm/benchmark/slurm_jobs.py +12 -0
  242. helm/benchmark/slurm_runner.py +79 -51
  243. helm/benchmark/static/benchmarking.js +3 -4
  244. helm/benchmark/static/contamination.yaml +1 -1
  245. helm/benchmark/static/images/organizations/together.png +0 -0
  246. helm/benchmark/static/json-urls.js +4 -0
  247. helm/benchmark/static/{schema.yaml → schema_classic.yaml} +346 -930
  248. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  249. helm/benchmark/static/schema_lite.yaml +824 -0
  250. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  251. helm/benchmark/static/schema_unitxt.yaml +428 -0
  252. helm/benchmark/static/schema_vlm.yaml +576 -0
  253. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  254. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  255. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  256. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  257. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  258. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  259. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  260. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  261. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  262. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  263. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  264. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  265. helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
  266. helm/benchmark/static_build/assets/index-d839df55.js +9 -0
  267. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  268. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  269. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  270. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  271. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  272. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  273. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  274. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  275. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  276. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  277. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  278. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  279. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  280. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  281. helm/benchmark/static_build/config.js +4 -0
  282. helm/benchmark/static_build/index.html +20 -0
  283. helm/benchmark/test_data_preprocessor.py +3 -3
  284. helm/benchmark/test_model_deployment_definition.py +90 -0
  285. helm/benchmark/test_run_expander.py +1 -1
  286. helm/benchmark/tokenizer_config_registry.py +10 -14
  287. helm/benchmark/window_services/ai21_window_service.py +22 -33
  288. helm/benchmark/window_services/cohere_window_service.py +1 -63
  289. helm/benchmark/window_services/default_window_service.py +2 -35
  290. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  291. helm/benchmark/window_services/ice_window_service.py +0 -34
  292. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  293. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  294. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  295. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  296. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  297. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  298. helm/benchmark/window_services/local_window_service.py +21 -4
  299. helm/benchmark/window_services/no_decoding_window_service.py +32 -0
  300. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  301. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  302. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  303. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  304. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  305. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  306. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  307. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  308. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  309. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  310. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  311. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  312. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  313. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  314. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  315. helm/benchmark/window_services/test_utils.py +3 -2
  316. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  317. helm/benchmark/window_services/window_service.py +42 -0
  318. helm/benchmark/window_services/window_service_factory.py +24 -269
  319. helm/benchmark/window_services/yalm_window_service.py +0 -27
  320. helm/clients/__init__.py +0 -0
  321. helm/{proxy/clients → clients}/ai21_client.py +5 -12
  322. helm/clients/aleph_alpha_client.py +112 -0
  323. helm/{proxy/clients → clients}/anthropic_client.py +213 -24
  324. helm/clients/auto_client.py +215 -0
  325. helm/clients/bedrock_client.py +128 -0
  326. helm/clients/bedrock_utils.py +72 -0
  327. helm/{proxy/clients → clients}/client.py +67 -55
  328. helm/clients/clip_score_client.py +49 -0
  329. helm/clients/clip_scorers/__init__.py +0 -0
  330. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  331. helm/clients/clip_scorers/clip_scorer.py +50 -0
  332. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  333. helm/{proxy/clients → clients}/cohere_client.py +6 -17
  334. helm/clients/gcs_client.py +82 -0
  335. helm/{proxy/clients → clients}/google_client.py +7 -8
  336. helm/clients/google_translate_client.py +35 -0
  337. helm/{proxy/clients → clients}/http_model_client.py +6 -10
  338. helm/{proxy/clients → clients}/huggingface_client.py +134 -92
  339. helm/clients/image_generation/__init__.py +0 -0
  340. helm/clients/image_generation/adobe_vision_client.py +78 -0
  341. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  342. helm/clients/image_generation/cogview2/__init__.py +0 -0
  343. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  344. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  345. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  346. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  347. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  348. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  349. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  350. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  351. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  352. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  353. helm/clients/image_generation/cogview2_client.py +191 -0
  354. helm/clients/image_generation/dalle2_client.py +192 -0
  355. helm/clients/image_generation/dalle3_client.py +108 -0
  356. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  357. helm/clients/image_generation/dalle_mini/data.py +442 -0
  358. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  359. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  360. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  361. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  362. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  363. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  364. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  365. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  366. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  367. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  368. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  369. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  370. helm/clients/image_generation/dalle_mini_client.py +190 -0
  371. helm/clients/image_generation/deep_floyd_client.py +78 -0
  372. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  373. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  374. helm/clients/image_generation/lexica_client.py +86 -0
  375. helm/clients/image_generation/mindalle/__init__.py +0 -0
  376. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  377. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  378. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  379. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  380. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  381. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  382. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  383. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  384. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  385. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  386. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  387. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  388. helm/clients/image_generation/mindalle_client.py +115 -0
  389. helm/clients/image_generation/nudity_check_client.py +64 -0
  390. helm/clients/image_generation/together_image_generation_client.py +111 -0
  391. helm/{proxy/clients → clients}/lit_gpt_client.py +7 -5
  392. helm/{proxy/clients → clients}/megatron_client.py +13 -7
  393. helm/clients/mistral_client.py +134 -0
  394. helm/clients/moderation_api_client.py +109 -0
  395. helm/clients/open_lm_client.py +43 -0
  396. helm/clients/openai_client.py +302 -0
  397. helm/{proxy/clients → clients}/palmyra_client.py +15 -12
  398. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  399. helm/clients/simple_client.py +64 -0
  400. helm/{proxy/clients → clients}/test_auto_client.py +15 -15
  401. helm/clients/test_client.py +100 -0
  402. helm/clients/test_huggingface_client.py +70 -0
  403. helm/clients/test_simple_client.py +19 -0
  404. helm/{proxy/clients → clients}/test_together_client.py +23 -12
  405. helm/{proxy/clients → clients}/together_client.py +18 -71
  406. helm/clients/vertexai_client.py +391 -0
  407. helm/clients/vision_language/__init__.py +0 -0
  408. helm/clients/vision_language/huggingface_vlm_client.py +104 -0
  409. helm/{proxy/clients → clients}/vision_language/idefics_client.py +59 -52
  410. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  411. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  412. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  413. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  414. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  415. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  416. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  417. helm/clients/vision_language/open_flamingo_client.py +155 -0
  418. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  419. helm/clients/vllm_client.py +46 -0
  420. helm/common/cache.py +24 -179
  421. helm/common/cache_backend_config.py +47 -0
  422. helm/common/clip_score_request.py +41 -0
  423. helm/common/concurrency.py +32 -0
  424. helm/common/credentials_utils.py +28 -0
  425. helm/common/file_caches/__init__.py +0 -0
  426. helm/common/file_caches/file_cache.py +16 -0
  427. helm/common/file_caches/local_file_cache.py +61 -0
  428. helm/common/file_caches/test_local_file_cache.py +25 -0
  429. helm/common/file_upload_request.py +27 -0
  430. helm/common/general.py +29 -10
  431. helm/common/image_generation_parameters.py +25 -0
  432. helm/common/images_utils.py +24 -1
  433. helm/common/key_value_store.py +113 -0
  434. helm/common/media_object.py +13 -0
  435. helm/common/moderations_api_request.py +71 -0
  436. helm/common/mongo_key_value_store.py +88 -0
  437. helm/common/multimodal_request_utils.py +31 -0
  438. helm/common/nudity_check_request.py +29 -0
  439. helm/common/object_spec.py +2 -2
  440. helm/common/request.py +36 -27
  441. helm/common/test_general.py +6 -0
  442. helm/common/tokenization_request.py +6 -3
  443. helm/config/__init__.py +0 -0
  444. helm/config/model_deployments.yaml +1942 -0
  445. helm/config/model_metadata.yaml +2201 -0
  446. helm/config/tokenizer_configs.yaml +362 -0
  447. helm/proxy/accounts.py +31 -4
  448. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  449. helm/proxy/critique/model_critique_client.py +13 -5
  450. helm/proxy/example_queries.py +29 -17
  451. helm/proxy/retry.py +8 -2
  452. helm/proxy/server.py +77 -5
  453. helm/proxy/services/remote_service.py +31 -0
  454. helm/proxy/services/server_service.py +103 -20
  455. helm/proxy/services/service.py +34 -2
  456. helm/proxy/services/test_remote_service.py +7 -6
  457. helm/proxy/services/test_service.py +27 -18
  458. helm/proxy/test_accounts.py +32 -0
  459. helm/proxy/token_counters/auto_token_counter.py +37 -37
  460. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  461. helm/proxy/token_counters/token_counter.py +3 -5
  462. helm/py.typed +0 -0
  463. helm/tokenizers/__init__.py +0 -0
  464. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  465. helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +3 -1
  466. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +17 -11
  467. helm/tokenizers/auto_tokenizer.py +93 -0
  468. helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +8 -2
  469. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  470. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  471. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +56 -60
  472. helm/tokenizers/simple_tokenizer.py +33 -0
  473. helm/tokenizers/test_anthropic_tokenizer.py +82 -0
  474. helm/tokenizers/test_huggingface_tokenizer.py +136 -0
  475. helm/tokenizers/test_simple_tokenizer.py +33 -0
  476. helm/tokenizers/vertexai_tokenizer.py +97 -0
  477. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  478. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  479. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  480. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  481. crfm_helm-0.3.0.dist-info/RECORD +0 -396
  482. helm/benchmark/vlm_run_specs.py +0 -71
  483. helm/benchmark/window_services/anthropic_window_service.py +0 -68
  484. helm/benchmark/window_services/bloom_window_service.py +0 -35
  485. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  486. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  487. helm/benchmark/window_services/gptj_window_service.py +0 -38
  488. helm/benchmark/window_services/gptneox_window_service.py +0 -41
  489. helm/benchmark/window_services/http_model_window_service.py +0 -28
  490. helm/benchmark/window_services/huggingface_window_service.py +0 -59
  491. helm/benchmark/window_services/lit_gpt_window_service.py +0 -27
  492. helm/benchmark/window_services/llama_window_service.py +0 -28
  493. helm/benchmark/window_services/luminous_window_service.py +0 -67
  494. helm/benchmark/window_services/megatron_window_service.py +0 -10
  495. helm/benchmark/window_services/mt_nlg_window_service.py +0 -27
  496. helm/benchmark/window_services/openai_window_service.py +0 -13
  497. helm/benchmark/window_services/opt_window_service.py +0 -35
  498. helm/benchmark/window_services/palmyra_window_service.py +0 -45
  499. helm/benchmark/window_services/remote_window_service.py +0 -48
  500. helm/benchmark/window_services/santacoder_window_service.py +0 -27
  501. helm/benchmark/window_services/starcoder_window_service.py +0 -27
  502. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  503. helm/benchmark/window_services/t511b_window_service.py +0 -30
  504. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  505. helm/benchmark/window_services/ul2_window_service.py +0 -30
  506. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  507. helm/benchmark/window_services/wider_openai_window_service.py +0 -52
  508. helm/proxy/clients/aleph_alpha_client.py +0 -99
  509. helm/proxy/clients/auto_client.py +0 -461
  510. helm/proxy/clients/goose_ai_client.py +0 -100
  511. helm/proxy/clients/microsoft_client.py +0 -182
  512. helm/proxy/clients/openai_client.py +0 -206
  513. helm/proxy/clients/remote_model_registry.py +0 -28
  514. helm/proxy/clients/simple_client.py +0 -61
  515. helm/proxy/clients/test_anthropic_client.py +0 -63
  516. helm/proxy/clients/test_client.py +0 -31
  517. helm/proxy/clients/test_huggingface_client.py +0 -87
  518. helm/proxy/models.py +0 -963
  519. helm/proxy/test_models.py +0 -27
  520. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  521. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  522. helm/proxy/token_counters/free_token_counter.py +0 -12
  523. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  524. helm/proxy/token_counters/openai_token_counter.py +0 -22
  525. helm/proxy/token_counters/test_ai21_token_counter.py +0 -86
  526. helm/proxy/token_counters/test_openai_token_counter.py +0 -79
  527. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  528. helm/proxy/tokenizers/test_huggingface_tokenizer.py +0 -56
  529. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
  530. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
  531. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
  532. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  533. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  534. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  535. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  536. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  537. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  538. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  539. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  540. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  541. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  542. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  543. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  544. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  545. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  546. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
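The listing above reflects a broad package restructuring: client modules moved from helm/proxy/clients to helm/clients, and tokenizer modules moved from helm/proxy/tokenizers to helm/tokenizers. As a minimal sketch of what this means for downstream imports (module names are taken directly from the renamed paths above; assumes crfm-helm 0.5.0 is installed):

    # crfm-helm 0.3.0: clients and tokenizers lived under helm.proxy
    # from helm.proxy.clients import together_client
    # from helm.proxy.tokenizers import huggingface_tokenizer

    # crfm-helm 0.5.0: the same modules are top-level packages, matching the renames above
    from helm.clients import together_client
    from helm.tokenizers import huggingface_tokenizer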
helm/benchmark/scenarios/image_generation/paint_skills_scenario.py
@@ -0,0 +1,72 @@
+ import json
+ import os
+ from typing import Dict, List, Set
+
+ from helm.common.media_object import MediaObject, MultimediaObject
+ from helm.common.general import ensure_file_downloaded
+ from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, Output, Reference, CORRECT_TAG, VALID_SPLIT
+
+
+ class PaintSkillsScenario(Scenario):
+     """
+     PaintSkills is a compositional diagnostic dataset and evaluation toolkit that measures three
+     fundamental visual reasoning capabilities:
+
+     - object recognition => object
+     - object counting => count
+     - spatial relation understanding => spatial
+
+     Paper: https://arxiv.org/abs/2202.04053
+     Website: https://github.com/j-min/DallEval/tree/main/paintskills
+     """
+
+     METADATA_DOWNLOAD_URL: str = "https://drive.google.com/uc?export=download&id=12jsHDzEcBr-Et3FhLq-HckI5cmLB_rxC"
+     SKILL_TO_DOWNLOAD_URL: Dict[str, str] = {
+         "object": "https://drive.google.com/uc?export=download&id=1lpvSpBNfEg5EJt16prumXiuEO99byjzw&confirm=t",
+         "count": "https://drive.google.com/uc?export=download&id=1koA-5xiZbAUDh65jpYaylG3IOA-mZTH2&confirm=t",
+         "spatial": "https://drive.google.com/uc?export=download&id=1g-L0dVQjBTWp1uRwJLYXIj2xYIlQ2knu&confirm=t",
+     }
+
+     name = "paint_skills"
+     description = (
+         "A compositional diagnostic dataset and evaluation toolkit that measures visual reasoning skills "
+         "([paper](https://arxiv.org/abs/2202.04053))."
+     )
+     tags = ["text-to-image", "image-to-text"]
+
+     def __init__(self, skill: str):
+         super().__init__()
+         assert skill in self.SKILL_TO_DOWNLOAD_URL, f"Invalid skill: {skill}"
+         self.skill: str = skill
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         skills_data_path: str = os.path.join(output_path, self.skill)
+         ensure_file_downloaded(
+             source_url=self.SKILL_TO_DOWNLOAD_URL[self.skill],
+             target_path=skills_data_path,
+             unpack=True,
+             unpack_type="unzip",
+         )
+
+         images_path: str = os.path.join(skills_data_path, "images")
+         with open(os.path.join(skills_data_path, "scenes", f"{self.skill}_val.json"), "r") as f:
+             examples: Dict = json.load(f)
+
+         instances: List[Instance] = []
+         seen_captions: Set[str] = set()
+         for example in examples["data"]:
+             caption: str = example["text"]
+             if caption in seen_captions:
+                 continue
+
+             seen_captions.add(caption)
+             image_path: str = os.path.join(images_path, f"image_{example['id']}.png")
+             content: MultimediaObject = MultimediaObject([MediaObject(content_type="image/png", location=image_path)])
+             instance = Instance(
+                 Input(text=caption),
+                 references=[Reference(Output(multimedia_content=content), tags=[CORRECT_TAG])],
+                 split=VALID_SPLIT,
+             )
+             instances.append(instance)
+
+         return instances
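A minimal usage sketch for the PaintSkills scenario above (assumes crfm-helm 0.5.0 is installed and the Google Drive download succeeds; the output directory name is an arbitrary example):

    from helm.benchmark.scenarios.image_generation.paint_skills_scenario import PaintSkillsScenario

    # skill must be "object", "count", or "spatial"; anything else fails the constructor's assertion
    scenario = PaintSkillsScenario(skill="count")

    # Downloads and unzips the skill's validation split, then builds one Instance per unique caption,
    # each with a single CORRECT_TAG reference pointing at the ground-truth image
    instances = scenario.get_instances(output_path="benchmark_output/scenarios/paint_skills")
    print(len(instances), instances[0].input.text)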
helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py
@@ -0,0 +1,94 @@
+ import csv
+ import os
+ from typing import List
+
+ from helm.common.general import ensure_file_downloaded
+ from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT
+
+
+ class PartiPromptsScenario(Scenario):
+     """
+     PartiPrompts (P2) is a set of 1600 diverse English prompts that allow us to more comprehensively
+     evaluate and test the limits of text-to-image synthesis models.
+
+     Each prompt in the P2 benchmark is associated with two labels:
+     1. Category: indicating a broad group that a prompt belongs to
+     2. Challenge: highlighting an aspect which makes a prompt difficult
+
+     Categories:
+     - Abstract: Descriptions that represent abstract concepts, including single words and simple numbers.
+     - World Knowledge: Descriptions focused on objects and places that exist in the real world.
+     - People: Descriptions where the primary participants are human beings (but not specific individuals,
+       living or dead).
+     - Animals: Descriptions in which the primary participants are animals.
+     - Illustrations: Descriptions of images that involve specific types of graphical representations,
+       including geometrical objects, diagrams, and symbols.
+     - Artifacts: Descriptions that represent abstract concepts, including single words and simple numbers.
+     - Food & Beverage: Descriptions of things animals, especially human beings, eat or drink.
+     - Vehicles: Descriptions where the focus is on man-made devices for transportation.
+     - Arts: Descriptions of existing paintings or intended to produce novel images in the format of a painting.
+     - Indoor Scenes: Descriptions about objects and participants that occur indoors.
+     - Outdoor Scenes: Descriptions about objects and participants that occur outdoors.
+     - Produce & Plants: Descriptions focused on plants or their products (fruits, vegetables, seeds, etc).
+
+     Challenges:
+     - Simple Detail: Descriptions that include only simple or high-level details.
+     - Fine-grained Detail: Descriptions that include very detailed specifications of attributes or
+       actions of entities or objects in a scene.
+     - Complex: Descriptions that include many fine-grained, interacting details or relationships between multiple
+       participants.
+     - Quantity: Descriptions that specify particular counts of occurrences of subjects in a scene.
+     - Style & Format: Descriptions that specifically focus on the visual manner in which a subject or scene
+       must be depicted.
+     - Properties & Positioning: Descriptions that target precise assignment of properties to entities or
+       objects (often in the context of multiple entities or objects), and/or the
+       relative spatial arrangement of entities and objects with respect to one
+       another or landmarks in the scene.
+     - Linguistic Structures: Long and/or abstract words or complex syntactic structures or semantic
+       ambiguities.
+     - Writing & Symbols: Descriptions that require words or symbols to be accurately represented
+       in the context of the visual scene.
+     - Imagination: Descriptions that include participants or interactions that are not, or are generally unlikely
+       to be, found in the modern day world.
+     - Basic: Descriptions about a single subject or concept with little to no detail or embellishment.
+     - Perspective: Descriptions that specify particular viewpoints or positioning of the subjects in a scene.
+
+     Paper: https://arxiv.org/abs/2206.10789
+     Website: https://parti.research.google/
+     """
+
+     DATASET_DOWNLOAD_URL: str = "https://raw.githubusercontent.com/google-research/parti/main/PartiPrompts.tsv"
+     ALL_CATEGORY: str = "all"
+
+     name = "parti_prompts"
+     description = (
+         "PartiPrompts (P2) is a set of 1600 diverse English prompts that allow us to more comprehensively "
+         "evaluate and test the limits of text-to-image synthesis models ([paper](https://arxiv.org/abs/2206.10789))."
+     )
+     tags = ["text-to-image"]
+
+     def __init__(self, category: str):
+         super().__init__()
+         self.category: str = category
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         prompts_path: str = os.path.join(output_path, "prompts.tsv")
+         ensure_file_downloaded(source_url=self.DATASET_DOWNLOAD_URL, target_path=prompts_path)
+
+         instances: List[Instance] = []
+         with open(prompts_path) as f:
+             tsv_reader = csv.reader(f, delimiter="\t")
+             for i, row in enumerate(tsv_reader):
+                 if i == 0:
+                     # Skip the header
+                     continue
+
+                 prompt: str = row[0]
+                 category: str = row[1]
+
+                 # P2 does not have reference images
+                 instance = Instance(Input(text=prompt), references=[], split=TEST_SPLIT)
+                 if category.startswith(self.category) or self.category == self.ALL_CATEGORY:
+                     instances.append(instance)
+
+         return instances
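Because get_instances filters with category.startswith, the category argument acts as a prefix match against the Category column of PartiPrompts.tsv, and ALL_CATEGORY keeps everything. A short sketch (assumes crfm-helm 0.5.0 is installed; the category label follows the docstring above, and the output directory is an arbitrary example):

    from helm.benchmark.scenarios.image_generation.parti_prompts_scenario import PartiPromptsScenario

    # "all" keeps every prompt
    all_prompts = PartiPromptsScenario(category=PartiPromptsScenario.ALL_CATEGORY).get_instances(
        output_path="benchmark_output/scenarios/parti_prompts"
    )
    # Prefix match: "Art" would match both "Arts" and "Artifacts", so pass the full label
    animal_prompts = PartiPromptsScenario(category="Animals").get_instances(
        output_path="benchmark_output/scenarios/parti_prompts"
    )
    print(len(all_prompts), len(animal_prompts))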
helm/benchmark/scenarios/image_generation/radiology_scenario.py
@@ -0,0 +1,42 @@
+ from typing import List
+
+ from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT
+
+
+ class RadiologyScenario(Scenario):
+     """
+     From "What Does DALL-E 2 Know About Radiology?", DALL-E 2 fails to generate realistic
+     X-ray, CT, MRI and ultrasound images. For example, generated images of X-rays have
+     smaller bones missing, extra joints added, etc.
+
+     Prompts are in the following format:
+
+     “An x-ray of {head, chest, shoulder, abdomen, pelvis, hand, knee, ankle}” => 8 prompts
+     “An MRI of the {heart, liver, kidney}” => 3 prompts
+     “A CT of the {heart, liver, kidney}” => 3 prompts
+     “Ultrasound of the {heart, liver, kidney}” => 3 prompts
+
+     Paper: https://arxiv.org/abs/2209.13696
+     """
+
+     X_RAY_ANATOMICAL_AREA: List[str] = ["head", "chest", "shoulder", "abdomen", "pelvis", "hand", "knee", "ankle"]
+     ORGANS: List[str] = ["heart", "liver", "kidney"]
+
+     name = "radiology"
+     description = "Prompts to generate radiological images ([paper](https://arxiv.org/abs/2209.13696))."
+     tags = ["text-to-image", "knowledge"]
+
+     def get_instances(self, _) -> List[Instance]:
+         prompts: List[str] = []
+
+         for anatomical_area in self.X_RAY_ANATOMICAL_AREA:
+             prompts.append(f"An x-ray of {anatomical_area}")
+
+         for organ in self.ORGANS:
+             prompts.append(f"An MRI of the {organ}")
+             prompts.append(f"A CT of the {organ}")
+             prompts.append(f"Ultrasound of the {organ}")
+
+         assert len(prompts) == 17, "Invalid number of prompts"
+         # There are no reference images
+         return [Instance(Input(text=prompt), references=[], split=TEST_SPLIT) for prompt in prompts]
helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py
@@ -0,0 +1,52 @@
+ from typing import List, Set
+ import csv
+ import os
+
+ from helm.common.general import ensure_file_downloaded
+ from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT
+
+
+ class RelationalUnderstandingScenario(Scenario):
+     """
+     From Testing Relational Understanding in Text-Guided Image Generation, based on existing cognitive,
+     linguistic, and developmental literature, the authors created a set of 15 relations (8 physical,
+     7 agentic) and a set of 12 entities (6 objects, 6 agents). The physical relations were: in, on,
+     under, covering, near, occluded by, hanging over, and tied to. The agentic relations were: pushing,
+     pulling, touching, hitting, kicking, helping, and hindering. The objects were: box, cylinder,
+     blanket, bowl, teacup, and knife. The agents were: man, woman, child, robot, monkey, and iguana.
+
+     The authors created 5 different prompts for each relation, by randomly sampling two entities five
+     times, resulting in 75 distinct basic relation prompts (e.g., a monkey touching an iguana). With
+     these prompts, the authors showed that DALL-E 2 suffers from a significant lack of commonsense
+     reasoning in the form of relational understanding.
+
+     Paper: https://arxiv.org/abs/2208.00005
+     Website: https://osf.io/sm68h
+     """
+
+     name = "relational_understanding"
+     description = (
+         "Consists of 75 basic relation prompts that test commonsense reasoning "
+         "([paper](https://arxiv.org/abs/2208.00005))."
+     )
+     tags = ["text-to-image", "reasoning"]
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         data_path: str = os.path.join(output_path, "choice_data.csv")
+         ensure_file_downloaded(source_url="https://osf.io/download/tb3a4", target_path=data_path)
+
+         instances: List[Instance] = []
+         seen_prompts: Set[str] = set()
+         with open(data_path) as csv_file:
+             csv_reader = csv.reader(csv_file, delimiter=",")
+             for i, row in enumerate(csv_reader):
+                 if i == 0:
+                     # Skip the header
+                     continue
+
+                 prompt: str = row[1]
+                 if prompt not in seen_prompts:
+                     instances.append(Instance(Input(text=prompt), references=[], split=TEST_SPLIT))
+                     seen_prompts.add(prompt)
+
+         return instances
helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py
@@ -0,0 +1,124 @@
+ from typing import List
+
+ from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT
+
+
+ class TIMEMostSignificantHistoricalFigures(Scenario):
+     """
+     People from TIME's "The 100 Most Significant Figures in History" list.
+
+     https://ideas.time.com/2013/12/10/whos-biggest-the-100-most-significant-figures-in-history/
+     """
+
+     HISTORICAL_FIGURES: List[str] = [
+         "Jesus",
+         "Napoleon Bonaparte",
+         "Muhammad",
+         "William Shakespeare",
+         "Abraham Lincoln",
+         "George Washington",
+         "Adolf Hitler",
+         "Aristotle",
+         "Alexander the Great",
+         "Thomas Jefferson",
+         "Henry VIII of England",
+         "Charles Darwin",
+         "Elizabeth I of England",
+         "Karl Marx",
+         "Julius Caesar",
+         "Queen Victoria",
+         "Martin Luther",
+         "Joseph Stalin",
+         "Albert Einstein",
+         "Christopher Columbus",
+         "Isaac Newton",
+         "Charlemagne",
+         "Theodore Roosevelt",
+         "Wolfgang Amadeus Mozart",
+         "Plato",
+         "Louis XIV of France",
+         "Ludwig van Beethoven",
+         "Ulysses S.Grant",
+         "Leonardo da Vinci",
+         "Augustus",
+         "Carl Linnaeus",
+         "Ronald Reagan",
+         "Charles Dickens",
+         "Paul the Apostle",
+         "Benjamin Franklin",
+         # "George W.Bush",
+         "Winston Churchill",
+         "Genghis Khan",
+         "Charles I of England",
+         "Thomas Edison",
+         "James I of England",
+         "Friedrich Nietzsche",
+         "Franklin D.Roosevelt",
+         "Sigmund Freud",
+         "Alexander Hamilton",
+         "Mohandas Karamchand Gandhi",
+         "Woodrow Wilson",
+         "Johann Sebastian Bach",
+         "Galileo Galilei",
+         "Oliver Cromwell",
+         "James Madison",
+         "Gautama Buddha",
+         "Mark Twain",
+         "Edgar Allan Poe",
+         "Joseph Smith, Jr.",
+         "Adam Smith",
+         "David, King of Israel",
+         "George III of the United Kingdom",
+         "Immanuel Kant",
+         "James Cook",
+         "John Adams",
+         "Richard Wagner",
+         "Pyotr Ilyich Tchaikovsky",
+         "Voltaire",
+         "Saint Peter",
+         "Andrew Jackson",
+         "Constantine the Great",
+         "Socrates",
+         "Elvis Presley",
+         "William the Conqueror",
+         "John F.Kennedy",
+         "Augustine of Hippo",
+         "Vincent van Gogh",
+         "Nicolaus Copernicus",
+         "Vladimir Lenin",
+         "Robert E.Lee",
+         "Oscar Wilde",
+         "Charles II of England",
+         "Cicero",
+         "Jean-Jacques Rousseau",
+         "Francis Bacon",
+         "Richard Nixon",
+         "Louis XVI of France",
+         "Charles V, Holy Roman Emperor",
+         "King Arthur",
+         "Michelangelo",
+         "Philip II of Spain",
+         "Johann Wolfgang von Goethe",
+         "Ali, founder of Sufism",
+         "Thomas Aquinas",
+         "Pope John Paul II",
+         "René Descartes",
+         "Nikola Tesla",
+         "Harry S.Truman",
+         "Joan of Arc",
+         "Dante Alighieri",
+         "Otto von Bismarck",
+         "Grover Cleveland",
+         "John Calvin",
+         "John Locke",
+     ]
+
+     name = "time_most_significant_historical_figures"
+     description = 'People from TIME\'s "The 100 Most Significant Figures in History" list.'
+     tags = ["text-to-image", "knowledge"]
+
+     def get_instances(self, _) -> List[Instance]:
+         return [
+             Instance(Input(text=historical_figure), references=[], split=TEST_SPLIT)
+             for historical_figure in self.HISTORICAL_FIGURES
+         ]
helm/benchmark/scenarios/image_generation/winoground_scenario.py
@@ -0,0 +1,62 @@
+ from typing import List
+ import os
+
+ from datasets import load_dataset
+
+ from helm.common.general import get_file_name
+ from helm.common.images_utils import copy_image
+ from helm.common.media_object import MediaObject, MultimediaObject
+ from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, Output, Reference, CORRECT_TAG, TEST_SPLIT
+
+
+ class WinogroundScenario(Scenario):
+     """
+     Winoground is a novel task and dataset for evaluating the ability of vision and language models
+     to conduct visio-linguistic compositional reasoning. Given two images and two captions, the
+     goal is to match them correctly—but crucially, both captions contain a completely identical set
+     of words/morphemes, only in a different order. The dataset was carefully hand-curated by
+     expert annotators and is labeled with a rich set of fine-grained tags to assist in analyzing
+     model performance.
+
+     Users must agree to share their contact information before downloading the dataset from
+     Hugging Face. Either agree to the terms and set HUGGING_FACE_ACCESS_TOKEN to an access token
+     of a valid Hugging Face account or have the dataset pre-downloaded at the Hugging Face cache
+     (default path: ~/.cache/huggingface/datasets).
+
+     Paper: https://arxiv.org/abs/2204.03162
+     Website: https://huggingface.co/datasets/facebook/winoground
+     """
+
+     name = "winoground"
+     description = (
+         "Winoground is a novel task and dataset for evaluating the ability of vision and language models "
+         "to conduct visio-linguistic compositional reasoning "
+         "([paper](https://arxiv.org/abs/2204.03162))."
+     )
+     tags = ["text-to-image", "image-to-text", "visual_reasoning"]
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         auth_token: str = os.environ.get("HUGGING_FACE_ACCESS_TOKEN", "")
+
+         instances: List[Instance] = []
+         for row in load_dataset("facebook/winoground", split="test", use_auth_token=auth_token):
+             # Use the first example of the pair for now (index 0)
+             caption: str = row["caption_0"]
+             image_path: str = row["image_0"].filename
+
+             # Create a copy of the image in the benchmark output folder for metrics computation
+             image_copy_path: str = os.path.join(output_path, get_file_name(image_path))
+             if not os.path.exists(image_copy_path):
+                 copy_image(image_path, image_copy_path)
+             content: MultimediaObject = MultimediaObject(
+                 [MediaObject(content_type="image/png", location=image_copy_path)]
+             )
+
+             instances.append(
+                 Instance(
+                     input=Input(text=caption),
+                     references=[Reference(Output(multimedia_content=content), tags=[CORRECT_TAG])],
+                     split=TEST_SPLIT,
+                 )
+             )
+         return instances
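The Winoground scenario reads its Hugging Face credentials from the environment rather than taking them as an argument, so a run looks roughly like the following sketch (assumes the dataset terms have been accepted on Hugging Face, or the dataset is already in the local cache; the output directory is an arbitrary example):

    import os

    from helm.benchmark.scenarios.image_generation.winoground_scenario import WinogroundScenario

    # get_instances reads HUGGING_FACE_ACCESS_TOKEN itself; export it beforehand,
    # or pre-download the dataset into ~/.cache/huggingface/datasets
    assert os.environ.get("HUGGING_FACE_ACCESS_TOKEN"), "set HUGGING_FACE_ACCESS_TOKEN first"

    instances = WinogroundScenario().get_instances(output_path="benchmark_output/scenarios/winoground")
    print(instances[0].input.text)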
helm/benchmark/scenarios/imdb_scenario.py
@@ -107,7 +107,6 @@ class IMDBScenario(Scenario):
           contrast_map = {}
 
           for orig_line, contrast_line in zip(orig_and_contrast_inputs[0], orig_and_contrast_inputs[1]):
-
               orig_label_name, orig_context = orig_line.strip().split("\t")
               orig_label = label_name_to_id[orig_label_name]
 
helm/benchmark/scenarios/legalbench_scenario.py
@@ -0,0 +1,123 @@
+ import random
+ import os
+ import json
+ import datasets
+ from pathlib import Path
+ from typing import List, Dict
+
+ from helm.common.general import ensure_file_downloaded, ensure_directory_exists
+ from .scenario import Scenario, Instance, Reference, CORRECT_TAG, TRAIN_SPLIT, TEST_SPLIT, Input, Output
+
+ PROMPT_SETTINGS_URL = "https://raw.githubusercontent.com/HazyResearch/legalbench/main/helm_prompt_settings.jsonl"
+
+ SUBSETS = [
+     "abercrombie",
+     "corporate_lobbying",
+     "international_citizenship_questions",
+     "function_of_decision_section",
+     "proa",
+ ]
+
+
+ def get_legalbench_prompt_settings(subset: str, cache_dir: str):
+     """
+     Loads prompt construction settings for all subsets.
+     """
+     assert subset in SUBSETS, "Unknown subset: {}".format(subset)
+
+     prompt_construction_settings_path = os.path.join(cache_dir, "prompt_construction_settings.json")
+     ensure_directory_exists(cache_dir)
+     ensure_file_downloaded(
+         source_url=PROMPT_SETTINGS_URL,
+         target_path=prompt_construction_settings_path,
+     )
+     with open(prompt_construction_settings_path, "r") as f:
+         field_ordering, instructions, label_keys, output_nouns, _ = map(json.loads, f.read().strip().split("\n"))
+     return (
+         field_ordering[subset],
+         instructions[subset],
+         label_keys[subset],
+         output_nouns[subset],
+     )
+
+
+ def get_legalbench_instructions(subset: str, cache_dir: str):
+     return get_legalbench_prompt_settings(subset, cache_dir)[1]
+
+
+ def get_legalbench_output_nouns(subset: str, cache_dir: str):
+     return get_legalbench_prompt_settings(subset, cache_dir)[3]
+
+
+ class LegalBenchScenario(Scenario):
+     """
+     LegalBench is a benchmark containing different legal reasoning tasks. We use a subset of the tasks, selected
+     to represent different legal reasoning patterns.
+
+     LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models
+     https://arxiv.org/abs/2308.11462
+
+     Official website for LegalBench:
+     http://hazyresearch.stanford.edu/legalbench/
+
+     Dataset summary:
+     https://huggingface.co/datasets/nguha/legalbench
+
+     Prompts are adapted from:
+     https://github.com/HazyResearch/legalbench/
+
+     Subsets:
+
+     - abercrombie
+     - corporate_lobbying
+     - international_citizenship_questions
+     - function_of_decision_section
+     - proa
+     """
+
+     name = "legalbench"
+     description = "LegalBench"
+     tags = ["text_classification", "robustness"]
+
+     def __init__(self, subset: str, random_seed=42):
+         super().__init__()
+         assert subset in SUBSETS, "Unknown subset: {}".format(subset)
+         self.subset = subset
+         self.random_seed = random_seed
+
+     def load_prompt_construction_settings(self, output_path: str):
+         # Load from prompt construction settings
+         cache_dir = str(Path(output_path) / "data")
+         return get_legalbench_prompt_settings(self.subset, cache_dir)
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         fields, _, label_key, _ = self.load_prompt_construction_settings(output_path)
+         cache_dir = str(Path(output_path) / "data")
+
+         # Download data from Huggingface. LegalBench provides splits for samples to
+         # be used for prompt construction and for testing.
+         train_dataset = datasets.load_dataset("nguha/legalbench", self.subset, cache_dir=cache_dir, split="train")
+         test_dataset = datasets.load_dataset("nguha/legalbench", self.subset, cache_dir=cache_dir, split="test")
+         assert isinstance(train_dataset, datasets.Dataset)
+         assert isinstance(test_dataset, datasets.Dataset)
+
+         dataset_splits: Dict[str, datasets.Dataset] = {
+             TRAIN_SPLIT: train_dataset,
+             TEST_SPLIT: test_dataset,
+         }
+
+         # Read all instances
+         random.seed(self.random_seed)
+         instances: List[Instance] = []
+         for split, subset in dataset_splits.items():
+             for x in subset:
+                 assert fields is not None, "Field ordering not loaded"
+                 prompt: str = "\n".join([f"{field[0]}: {x[field[1]]}" for field in fields])
+                 instance = Instance(
+                     input=Input(text=prompt),
+                     references=[Reference(Output(text=x[label_key]), tags=[CORRECT_TAG])],
+                     split=split,
+                 )
+                 instances.append(instance)
+
+         return instances
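Prompt construction in LegalBenchScenario.get_instances is driven entirely by the field ordering downloaded from helm_prompt_settings.jsonl: each (label, column) pair becomes one "label: value" line. A toy illustration of that join follows; the field labels, column names, and row contents are invented for the example, not taken from the real settings file:

    # Hypothetical field ordering: (label shown in the prompt, dataset column name) pairs
    fields = [("Description", "text"), ("Question", "question")]

    # Hypothetical dataset row
    x = {
        "text": "The mark 'Ivory' for soap.",
        "question": "Is the mark generic, descriptive, suggestive, arbitrary, or fanciful?",
    }

    # Same construction as in get_instances above
    prompt = "\n".join([f"{field[0]}: {x[field[1]]}" for field in fields])
    print(prompt)
    # Description: The mark 'Ivory' for soap.
    # Question: Is the mark generic, descriptive, suggestive, arbitrary, or fanciful?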
helm/benchmark/scenarios/live_qa_scenario.py
@@ -0,0 +1,94 @@
+ import os
+ from typing import List
+ from xml.etree.ElementTree import Element
+ import xml.etree.ElementTree as ET
+
+ from helm.common.general import ensure_file_downloaded
+ from .scenario import CORRECT_TAG, TEST_SPLIT, Input, Instance, Output, Reference, Scenario
+
+
+ class LiveQAScenario(Scenario):
+     """
+     TREC-2017 LiveQA: Medical Question Answering Task
+
+     The LiveQA'17 medical task focuses on consumer health question answering.
+     Please refer to the original paper for more information about the constructed datasets and the LiveQA Track:
+     https://trec.nist.gov/pubs/trec26/papers/Overview-QA.pdf
+
+     Paper citation:
+
+         @inproceedings{LiveMedQA2017,
+             author = {Asma {Ben Abacha} and Eugene Agichtein and Yuval Pinter and Dina Demner{-}Fushman},
+             title = {Overview of the Medical Question Answering Task at TREC 2017 LiveQA},
+             booktitle = {TREC 2017},
+             year = {2017}
+         }
+     """
+
+     SOURCE_REPO_URL = "https://raw.githubusercontent.com/abachaa/LiveQA_MedicalTask_TREC2017/master/TestDataset/"
+     FILENAME = "TREC-2017-LiveQA-Medical-Test-Questions-w-summaries.xml"
+
+     name = "live_qa"
+     description = "TREC-2017 LiveQA: Medical Question Answering Task"
+     tags = ["knowledge", "generation", "question_answering", "biomedical"]
+
+     def download_liveqa(self, path: str):
+         """Download the XML file containing the questions & reference answers"""
+         ensure_file_downloaded(
+             source_url=os.path.join(self.SOURCE_REPO_URL, self.FILENAME),
+             target_path=os.path.join(path, self.FILENAME),
+             unpack=False,
+         )
+
+     @staticmethod
+     def remove_whitespace(s: str) -> str:
+         """Collapse all runs of whitespace in a string into single spaces"""
+         return " ".join(s.strip().split())
+
+     @staticmethod
+     def _extract_question_id(element: Element):
+         return element.attrib["qid"]
+
+     @classmethod
+     def _extract_question(cls, element: Element) -> str:
+         """Given an XML Element representing a question, extract just the question as text"""
+         return cls.remove_whitespace(element.find("NLM-Summary").text)  # type: ignore
+
+     @classmethod
+     def _extract_answers(cls, element: Element) -> List[str]:
+         """Given an XML Element representing a question, extract the reference answers"""
+         answers = []
+         for answer in element.iter("ANSWER"):
+             answers.append(cls.remove_whitespace(answer.text))  # type: ignore
+
+         return answers
+
+     def process_xml(self, base_path: str) -> List[Instance]:
+         """Parse the XML into question-answer(s) pairs"""
+         xml_path = os.path.join(base_path, self.FILENAME)
+         tree = ET.parse(xml_path)
+         root = tree.getroot()
+
+         instances = []
+         for question_root in root:
+             # Get the actual question and question ID
+             id = self._extract_question_id(question_root)
+             question = Input(self._extract_question(question_root))
+
+             # Parse out the reference answers
+             answers = self._extract_answers(question_root)
+             references = [Reference(Output(answer), tags=[CORRECT_TAG]) for answer in answers]
+
+             # Stitch it all together
+             instances.append(Instance(question, references, split=TEST_SPLIT, id=id))
+
+         return instances
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         """Entrypoint for creating this scenario's instances"""
+         # Get the dataset
+         self.download_liveqa(output_path)
+
+         # Get the instances by parsing the XML
+         instances = self.process_xml(output_path)
+         return instances
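A small self-contained check of the XML layout that process_xml assumes. The qid attribute, NLM-Summary child, and ANSWER elements come from the extraction helpers above; the surrounding tag names and the question text are invented for the example and may differ from the real TREC file:

    import xml.etree.ElementTree as ET

    # Minimal document shaped the way _extract_question_id, _extract_question,
    # and _extract_answers read it
    toy_xml = """<Questions>
      <NLM-QUESTION qid="Q1">
        <NLM-Summary>  What are the side effects of ibuprofen?  </NLM-Summary>
        <ANSWER>Possible side effects include stomach upset.</ANSWER>
        <ANSWER>It may also cause dizziness.</ANSWER>
      </NLM-QUESTION>
    </Questions>"""

    question = ET.fromstring(toy_xml)[0]
    print(question.attrib["qid"])                               # Q1
    print(" ".join(question.find("NLM-Summary").text.split()))  # whitespace collapsed, like remove_whitespace
    print([answer.text for answer in question.iter("ANSWER")])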