crfm-helm 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (482) hide show
  1. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +134 -31
  2. crfm_helm-0.5.0.dist-info/RECORD +642 -0
  3. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +31 -3
  5. helm/benchmark/adaptation/adapters/adapter.py +2 -2
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -4
  9. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +2 -3
  10. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  11. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +2 -1
  12. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  13. helm/benchmark/adaptation/adapters/test_generation_adapter.py +32 -8
  14. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +7 -19
  15. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +60 -6
  16. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  17. helm/benchmark/adaptation/request_state.py +6 -1
  18. helm/benchmark/adaptation/scenario_state.py +6 -2
  19. helm/benchmark/annotation/annotator.py +43 -0
  20. helm/benchmark/annotation/annotator_factory.py +61 -0
  21. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  22. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  23. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  24. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  25. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  26. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  27. helm/benchmark/annotation_executor.py +124 -0
  28. helm/benchmark/augmentations/data_augmenter.py +0 -2
  29. helm/benchmark/augmentations/gender_perturbation.py +1 -1
  30. helm/benchmark/augmentations/perturbation.py +8 -2
  31. helm/benchmark/augmentations/perturbation_description.py +1 -1
  32. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  33. helm/benchmark/augmentations/test_perturbation.py +11 -7
  34. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  35. helm/benchmark/config_registry.py +7 -1
  36. helm/benchmark/executor.py +46 -16
  37. helm/benchmark/huggingface_registration.py +20 -7
  38. helm/benchmark/metrics/basic_metrics.py +169 -664
  39. helm/benchmark/metrics/bbq_metrics.py +3 -4
  40. helm/benchmark/metrics/bias_metrics.py +6 -6
  41. helm/benchmark/metrics/classification_metrics.py +11 -8
  42. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  43. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  44. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  45. helm/benchmark/metrics/common_metric_specs.py +167 -0
  46. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  47. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  48. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  49. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  50. helm/benchmark/metrics/disinformation_metrics.py +4 -110
  51. helm/benchmark/metrics/dry_run_metrics.py +2 -2
  52. helm/benchmark/metrics/efficiency_metrics.py +206 -0
  53. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  54. helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
  55. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  56. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  57. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  58. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  59. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  60. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  61. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  62. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  63. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  64. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  65. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  66. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  67. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  68. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  69. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  70. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  71. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  72. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  73. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  74. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  75. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  76. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  77. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  78. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  79. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  80. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  81. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  82. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  83. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  84. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  85. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  86. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  87. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  88. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  89. helm/benchmark/metrics/machine_translation_metrics.py +89 -0
  90. helm/benchmark/metrics/metric.py +93 -172
  91. helm/benchmark/metrics/metric_name.py +0 -1
  92. helm/benchmark/metrics/metric_service.py +16 -0
  93. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  94. helm/benchmark/metrics/ranking_metrics.py +2 -2
  95. helm/benchmark/metrics/reference_metric.py +148 -0
  96. helm/benchmark/metrics/summac/model_summac.py +0 -2
  97. helm/benchmark/metrics/summarization_metrics.py +2 -2
  98. helm/benchmark/metrics/test_classification_metrics.py +8 -5
  99. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  100. helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +5 -1
  101. helm/benchmark/metrics/test_metric.py +2 -2
  102. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +10 -2
  103. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  104. helm/benchmark/metrics/toxicity_utils.py +23 -0
  105. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  106. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  107. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  108. helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
  109. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  110. helm/benchmark/model_deployment_registry.py +74 -0
  111. helm/benchmark/model_metadata_registry.py +36 -0
  112. helm/benchmark/multi_gpu_runner.py +133 -0
  113. helm/benchmark/presentation/create_plots.py +8 -7
  114. helm/benchmark/presentation/run_display.py +26 -10
  115. helm/benchmark/presentation/schema.py +15 -40
  116. helm/benchmark/presentation/summarize.py +119 -79
  117. helm/benchmark/presentation/table.py +8 -8
  118. helm/benchmark/presentation/test_contamination.py +2 -2
  119. helm/benchmark/presentation/test_run_entry.py +1 -2
  120. helm/benchmark/presentation/test_summarize.py +3 -3
  121. helm/benchmark/run.py +54 -26
  122. helm/benchmark/run_expander.py +214 -16
  123. helm/benchmark/run_spec.py +93 -0
  124. helm/benchmark/run_spec_factory.py +162 -0
  125. helm/benchmark/run_specs/__init__.py +0 -0
  126. helm/benchmark/run_specs/classic_run_specs.py +1510 -0
  127. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  128. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  129. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  130. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  131. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  132. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  133. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  134. helm/benchmark/run_specs/vlm_run_specs.py +501 -0
  135. helm/benchmark/runner.py +51 -57
  136. helm/benchmark/runner_config_registry.py +21 -0
  137. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  138. helm/benchmark/scenarios/bold_scenario.py +2 -2
  139. helm/benchmark/scenarios/code_scenario.py +1 -0
  140. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  141. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  142. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  143. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  144. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  145. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  146. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  147. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  148. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  149. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  150. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  151. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  152. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  153. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  154. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  155. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  156. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  157. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  158. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  159. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  160. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  161. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  162. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  163. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  164. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  165. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  166. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  167. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  168. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  169. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  170. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  171. helm/benchmark/scenarios/math_scenario.py +19 -2
  172. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  173. helm/benchmark/scenarios/numeracy_scenario.py +1 -1
  174. helm/benchmark/scenarios/opinions_qa_scenario.py +0 -4
  175. helm/benchmark/scenarios/scenario.py +4 -0
  176. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  177. helm/benchmark/scenarios/test_math_scenario.py +6 -0
  178. helm/benchmark/scenarios/test_scenario.py +6 -3
  179. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  180. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  181. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  182. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  183. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  184. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  185. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
  186. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  187. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  188. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  189. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  190. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  191. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  192. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  193. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  194. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  195. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  196. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  197. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  198. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  199. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  200. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  201. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  202. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  203. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  204. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  205. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -2
  206. helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
  207. helm/benchmark/scenarios/wmt_14_scenario.py +1 -1
  208. helm/benchmark/server.py +24 -1
  209. helm/benchmark/slurm_runner.py +70 -49
  210. helm/benchmark/static/benchmarking.js +1 -1
  211. helm/benchmark/static/schema_classic.yaml +258 -1066
  212. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  213. helm/benchmark/static/schema_lite.yaml +2 -227
  214. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  215. helm/benchmark/static/schema_unitxt.yaml +428 -0
  216. helm/benchmark/static/schema_vlm.yaml +576 -0
  217. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  218. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  219. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  220. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  221. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  222. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  223. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  224. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  225. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  226. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  227. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  228. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  229. helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
  230. helm/benchmark/static_build/assets/index-d839df55.js +9 -0
  231. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  232. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  233. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  234. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  235. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  236. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  237. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  238. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  239. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  240. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  241. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  242. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  243. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  244. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  245. helm/benchmark/static_build/config.js +4 -0
  246. helm/benchmark/static_build/index.html +20 -0
  247. helm/benchmark/test_data_preprocessor.py +3 -3
  248. helm/benchmark/test_model_deployment_definition.py +14 -16
  249. helm/benchmark/test_run_expander.py +1 -1
  250. helm/benchmark/window_services/ai21_window_service.py +22 -33
  251. helm/benchmark/window_services/cohere_window_service.py +1 -63
  252. helm/benchmark/window_services/default_window_service.py +2 -44
  253. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  254. helm/benchmark/window_services/ice_window_service.py +0 -34
  255. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  256. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  257. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  258. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  259. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  260. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  261. helm/benchmark/window_services/local_window_service.py +21 -4
  262. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  263. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  264. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  265. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  266. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  267. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  268. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  269. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  270. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  271. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  272. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  273. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  274. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  275. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  276. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  277. helm/benchmark/window_services/test_utils.py +3 -2
  278. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  279. helm/benchmark/window_services/window_service.py +42 -0
  280. helm/benchmark/window_services/window_service_factory.py +4 -1
  281. helm/benchmark/window_services/yalm_window_service.py +0 -27
  282. helm/clients/__init__.py +0 -0
  283. helm/{proxy/clients → clients}/ai21_client.py +3 -9
  284. helm/clients/aleph_alpha_client.py +112 -0
  285. helm/{proxy/clients → clients}/anthropic_client.py +203 -18
  286. helm/{proxy/clients → clients}/auto_client.py +59 -31
  287. helm/clients/bedrock_client.py +128 -0
  288. helm/clients/bedrock_utils.py +72 -0
  289. helm/{proxy/clients → clients}/client.py +65 -7
  290. helm/clients/clip_score_client.py +49 -0
  291. helm/clients/clip_scorers/__init__.py +0 -0
  292. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  293. helm/clients/clip_scorers/clip_scorer.py +50 -0
  294. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  295. helm/{proxy/clients → clients}/cohere_client.py +4 -11
  296. helm/clients/gcs_client.py +82 -0
  297. helm/{proxy/clients → clients}/google_client.py +5 -5
  298. helm/clients/google_translate_client.py +35 -0
  299. helm/{proxy/clients → clients}/http_model_client.py +5 -7
  300. helm/{proxy/clients → clients}/huggingface_client.py +43 -64
  301. helm/clients/image_generation/__init__.py +0 -0
  302. helm/clients/image_generation/adobe_vision_client.py +78 -0
  303. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  304. helm/clients/image_generation/cogview2/__init__.py +0 -0
  305. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  306. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  307. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  308. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  309. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  310. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  311. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  312. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  313. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  314. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  315. helm/clients/image_generation/cogview2_client.py +191 -0
  316. helm/clients/image_generation/dalle2_client.py +192 -0
  317. helm/clients/image_generation/dalle3_client.py +108 -0
  318. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  319. helm/clients/image_generation/dalle_mini/data.py +442 -0
  320. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  321. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  322. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  323. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  324. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  325. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  326. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  327. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  328. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  329. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  330. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  331. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  332. helm/clients/image_generation/dalle_mini_client.py +190 -0
  333. helm/clients/image_generation/deep_floyd_client.py +78 -0
  334. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  335. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  336. helm/clients/image_generation/lexica_client.py +86 -0
  337. helm/clients/image_generation/mindalle/__init__.py +0 -0
  338. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  339. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  340. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  341. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  342. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  343. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  344. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  345. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  346. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  347. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  348. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  349. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  350. helm/clients/image_generation/mindalle_client.py +115 -0
  351. helm/clients/image_generation/nudity_check_client.py +64 -0
  352. helm/clients/image_generation/together_image_generation_client.py +111 -0
  353. helm/{proxy/clients → clients}/lit_gpt_client.py +4 -4
  354. helm/{proxy/clients → clients}/megatron_client.py +5 -5
  355. helm/clients/mistral_client.py +134 -0
  356. helm/clients/moderation_api_client.py +109 -0
  357. helm/clients/open_lm_client.py +43 -0
  358. helm/clients/openai_client.py +302 -0
  359. helm/{proxy/clients → clients}/palmyra_client.py +6 -8
  360. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  361. helm/clients/simple_client.py +64 -0
  362. helm/{proxy/clients → clients}/test_auto_client.py +13 -15
  363. helm/clients/test_client.py +100 -0
  364. helm/{proxy/clients → clients}/test_huggingface_client.py +15 -16
  365. helm/clients/test_simple_client.py +19 -0
  366. helm/{proxy/clients → clients}/test_together_client.py +20 -8
  367. helm/{proxy/clients → clients}/together_client.py +12 -72
  368. helm/clients/vertexai_client.py +391 -0
  369. helm/clients/vision_language/__init__.py +0 -0
  370. helm/clients/vision_language/huggingface_vlm_client.py +104 -0
  371. helm/{proxy/clients → clients}/vision_language/idefics_client.py +53 -48
  372. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  373. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  374. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  375. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  376. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  377. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  378. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  379. helm/clients/vision_language/open_flamingo_client.py +155 -0
  380. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  381. helm/clients/vllm_client.py +46 -0
  382. helm/common/cache.py +16 -4
  383. helm/common/cache_backend_config.py +47 -0
  384. helm/common/clip_score_request.py +41 -0
  385. helm/common/file_caches/__init__.py +0 -0
  386. helm/common/file_caches/file_cache.py +16 -0
  387. helm/common/file_caches/local_file_cache.py +61 -0
  388. helm/common/file_caches/test_local_file_cache.py +25 -0
  389. helm/common/file_upload_request.py +27 -0
  390. helm/common/general.py +1 -1
  391. helm/common/image_generation_parameters.py +25 -0
  392. helm/common/images_utils.py +24 -1
  393. helm/common/key_value_store.py +35 -4
  394. helm/common/media_object.py +13 -0
  395. helm/common/moderations_api_request.py +71 -0
  396. helm/common/mongo_key_value_store.py +3 -3
  397. helm/common/multimodal_request_utils.py +31 -0
  398. helm/common/nudity_check_request.py +29 -0
  399. helm/common/request.py +15 -17
  400. helm/common/test_general.py +6 -0
  401. helm/common/tokenization_request.py +1 -1
  402. helm/config/model_deployments.yaml +1069 -546
  403. helm/config/model_metadata.yaml +753 -31
  404. helm/config/tokenizer_configs.yaml +142 -43
  405. helm/proxy/accounts.py +31 -4
  406. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  407. helm/proxy/critique/model_critique_client.py +8 -6
  408. helm/proxy/example_queries.py +29 -17
  409. helm/proxy/server.py +70 -5
  410. helm/proxy/services/remote_service.py +31 -0
  411. helm/proxy/services/server_service.py +96 -16
  412. helm/proxy/services/service.py +30 -0
  413. helm/proxy/services/test_remote_service.py +4 -3
  414. helm/proxy/services/test_service.py +0 -12
  415. helm/proxy/test_accounts.py +32 -0
  416. helm/proxy/token_counters/auto_token_counter.py +37 -37
  417. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  418. helm/proxy/token_counters/token_counter.py +3 -5
  419. helm/tokenizers/__init__.py +0 -0
  420. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  421. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +1 -1
  422. helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +6 -9
  423. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  424. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  425. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +7 -26
  426. helm/tokenizers/simple_tokenizer.py +33 -0
  427. helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +1 -1
  428. helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +3 -0
  429. helm/tokenizers/test_simple_tokenizer.py +33 -0
  430. helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
  431. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  432. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  433. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  434. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  435. crfm_helm-0.4.0.dist-info/RECORD +0 -397
  436. helm/benchmark/run_specs.py +0 -2762
  437. helm/benchmark/test_model_properties.py +0 -1570
  438. helm/benchmark/vlm_run_specs.py +0 -97
  439. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  440. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  441. helm/benchmark/window_services/huggingface_window_service.py +0 -60
  442. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  443. helm/benchmark/window_services/t511b_window_service.py +0 -30
  444. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  445. helm/benchmark/window_services/ul2_window_service.py +0 -30
  446. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  447. helm/common/cache_utils.py +0 -14
  448. helm/proxy/clients/aleph_alpha_client.py +0 -95
  449. helm/proxy/clients/goose_ai_client.py +0 -99
  450. helm/proxy/clients/microsoft_client.py +0 -180
  451. helm/proxy/clients/openai_client.py +0 -206
  452. helm/proxy/clients/simple_client.py +0 -60
  453. helm/proxy/clients/test_client.py +0 -49
  454. helm/proxy/clients/vertexai_client.py +0 -115
  455. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  456. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  457. helm/proxy/token_counters/free_token_counter.py +0 -12
  458. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  459. helm/proxy/token_counters/openai_token_counter.py +0 -22
  460. helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
  461. helm/proxy/token_counters/test_openai_token_counter.py +0 -81
  462. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  463. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
  464. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
  465. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
  466. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  467. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  468. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  469. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  470. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  471. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  472. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  473. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  474. /helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +0 -0
  475. /helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +0 -0
  476. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  477. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  478. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  479. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  480. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  481. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  482. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
@@ -22,11 +22,17 @@ CHATML_MODEL_TAG: str = "CHATML_MODEL_TAG"
22
22
  # OpenAI Chat format
23
23
  OPENAI_CHATGPT_MODEL_TAG: str = "OPENAI_CHATGPT_MODEL_TAG"
24
24
 
25
+ # Mistral instruction-following format
26
+ MISTRAL_MODEL_TAG: str = "MISTRAL_MODEL_TAG"
27
+
25
28
  # For Anthropic models
26
29
  ANTHROPIC_CLAUDE_1_MODEL_TAG: str = "ANTHROPIC_CLAUDE_1_MODEL_TAG"
27
30
  ANTHROPIC_CLAUDE_2_MODEL_TAG: str = "ANTHROPIC_CLAUDE_2_MODEL_TAG"
31
+ ANTHROPIC_CLAUDE_3_MODEL_TAG: str = "ANTHROPIC_CLAUDE_3_MODEL_TAG"
28
32
 
29
33
  GOOGLE_PALM_2_MODEL_TAG: str = "GOOGLE_PALM_2_MODEL_TAG"
34
+ GOOGLE_GEMINI_MODEL_TAG: str = "GOOGLE_GEMINI_MODEL_TAG"
35
+ GOOGLE_GEMMA_INSTRUCT_MODEL_TAG: str = "GOOGLE_GEMMA_INSTRUCT_MODEL_TAG"
30
36
 
31
37
  # Models which emit garbage tokens when temperature=0.
32
38
  BUGGY_TEMP_0_TAG: str = "BUGGY_TEMP_0_TAG"
@@ -46,12 +52,27 @@ NLG_PREFIX_TAG: str = "NLG_PREFIX_TAG"
46
52
  # Some models can follow instructions.
47
53
  INSTRUCTION_FOLLOWING_MODEL_TAG: str = "INSTRUCTION_FOLLOWING_MODEL_TAG"
48
54
 
55
+ # For text-to-image models
56
+ TEXT_TO_IMAGE_MODEL_TAG: str = "TEXT_TO_IMAGE_MODEL_TAG"
57
+
49
58
  # For Vision-language models (VLMs)
50
59
  VISION_LANGUAGE_MODEL_TAG: str = "VISION_LANGUAGE_MODEL_TAG"
60
+ # IDEFICS models require a special prompt format (see `IDEFICSInstructRunExpander`)
61
+ IDEFICS_INSTRUCT_MODEL_TAG: str = "IDEFICS_INSTRUCT_MODEL_TAG"
62
+ IDEFICS_MODEL_TAG: str = "IDEFICS_MODEL_TAG"
63
+ # Llava should use a special prompt format (see `LlavaRunExpander`)
64
+ LLAVA_MODEL_TAG: str = "LLAVA_MODEL_TAG"
65
+ # OpenFlamingo has a special prompt format (see `OpenFlamingoRunExpander`)
66
+ OPEN_FLAMINGO_MODEL_TAG: str = "OPEN_FLAMINGO_MODEL_TAG"
67
+ # Some VLMs do not support multiple images in the prompt
68
+ LIMITED_FUNCTIONALITY_VLM_TAG: str = "LIMITED_FUNCTIONALITY_VLM_TAG"
69
+ FULL_FUNCTIONALITY_VLM_TAG: str = "FULL_FUNCTIONALITY_VLM_TAG"
51
70
 
52
71
 
53
72
  # Frozen is set to false as the model_deployment_registry.py file
54
73
  # might populate the deployment_names field.
74
+
75
+
55
76
  @dataclass(frozen=False)
56
77
  class ModelMetadata:
57
78
  name: str
@@ -153,6 +174,11 @@ def get_model_names_with_tag(tag: str) -> List[str]:
153
174
  return [model.name for model in ALL_MODELS_METADATA if tag in model.tags]
154
175
 
155
176
 
177
def model_has_tag(model_name: str, tag: str) -> bool:
    """Report whether `tag` is among the tags in the metadata of `model_name`."""
    model_tags = get_model_metadata(model_name).tags
    return tag in model_tags
180
+
181
+
156
182
  def get_all_text_models() -> List[str]:
157
183
  """Return all model names of text models."""
158
184
  return get_model_names_with_tag(TEXT_MODEL_TAG)
@@ -168,6 +194,16 @@ def get_all_instruction_following_models() -> List[str]:
168
194
  return get_model_names_with_tag(INSTRUCTION_FOLLOWING_MODEL_TAG)
169
195
 
170
196
 
197
def is_text_to_image_model(model_name: str) -> bool:
    """Report whether `model_name` carries `TEXT_TO_IMAGE_MODEL_TAG`."""
    has_tag: bool = model_has_tag(model_name, TEXT_TO_IMAGE_MODEL_TAG)
    return has_tag
200
+
201
+
202
def is_vlm(model_name: str) -> bool:
    """Report whether `model_name` carries `VISION_LANGUAGE_MODEL_TAG`, i.e. is a vision-language model."""
    has_tag: bool = model_has_tag(model_name, VISION_LANGUAGE_MODEL_TAG)
    return has_tag
205
+
206
+
171
207
  def get_unknown_model_metadata(helm_model_name: str) -> ModelMetadata:
172
208
  """Return placeholder ModelMetadata for an unknown model."""
173
209
  return ModelMetadata(
@@ -0,0 +1,133 @@
1
+ import signal
2
+ import threading
3
+ import traceback
4
+ from typing import List
5
+ import os
6
+ import time
7
+ import torch
8
+ import torch.multiprocessing as multiprocessing
9
+ from concurrent.futures import ProcessPoolExecutor as Pool
10
+ from tqdm import tqdm
11
+
12
+ from helm.benchmark.config_registry import (
13
+ register_configs_from_directory,
14
+ register_builtin_configs_from_helm_package,
15
+ )
16
+ from helm.benchmark.executor import ExecutionSpec
17
+ from helm.benchmark.runner import Runner, RunSpec, RunnerError
18
+ from helm.common.hierarchical_logger import hlog, htrack_block
19
+ from helm.benchmark.runner_config_registry import RUNNER_CONFIG
20
+
21
+ _MAX_CONCURRENT_WORKERS_ENV_NAME = "HELM_MAX_CONCURRENT_WORKERS"
22
+
23
+
24
+ # From
25
+ # https://stackoverflow.com/questions/71300294/how-to-terminate-pythons-processpoolexecutor-when-parent-process-dies
26
def start_thread_to_terminate_when_parent_process_dies(ppid):
    """Start a daemon thread that sends SIGTERM to this process once `ppid` can no longer be signalled.

    The thread polls once per second for as long as the process lives.
    """
    own_pid = os.getpid()

    def watch_parent():
        while True:
            try:
                # Signal 0 delivers nothing; the call only fails when the
                # target process cannot be signalled.
                os.kill(ppid, 0)
            except OSError:
                os.kill(own_pid, signal.SIGTERM)
            time.sleep(1)

    threading.Thread(target=watch_parent, daemon=True).start()
39
+
40
+
41
def initialize_worker(gpu_id: int):
    """Pin the calling worker process to GPU `gpu_id` and force its start method to "fork"."""
    hlog(f"Worker {gpu_id} initializing")

    # Pause for 0.1 seconds so that all workers end up initialized with
    # different CUDA_VISIBLE_DEVICES values.
    time.sleep(0.1)

    # Make only the assigned GPU visible to this process.
    visible_device = str(gpu_id)
    os.environ["CUDA_VISIBLE_DEVICES"] = visible_device

    # code_metrics in humaneval needs the "fork" start method to work properly.
    multiprocessing.set_start_method("fork", force=True)
52
+
53
+
54
class MultiGPURunner(Runner):
    """Runner that runs the entire benchmark on multiple GPUs.

    This is a thin wrapper around `Runner` that runs the entire benchmark on
    multiple GPUs using `multiprocessing`.

    Note that this runner will load multiple models into memory at the same
    time if your running configuration specifies that, similar to the `Runner`
    class. `SlurmRunner` on the other hand will load at most one model on a
    GPU.

    The number of worker processes is taken, in order of precedence, from the
    `HELM_MAX_CONCURRENT_WORKERS` environment variable, from
    `RUNNER_CONFIG.helm_max_concurrent_workers` (when positive), or from
    `torch.cuda.device_count()`.
    """

    def __init__(
        self,
        execution_spec: ExecutionSpec,
        output_path: str,
        suite: str,
        skip_instances: bool,
        cache_instances: bool,
        cache_instances_only: bool,
        skip_completed_runs: bool,
        exit_on_error: bool,
    ):
        """Forward all arguments to `Runner` and resolve the worker count."""
        super().__init__(
            execution_spec=execution_spec,
            output_path=output_path,
            suite=suite,
            skip_instances=skip_instances,
            cache_instances=cache_instances,
            cache_instances_only=cache_instances_only,
            skip_completed_runs=skip_completed_runs,
            exit_on_error=exit_on_error,
        )
        # Configure max concurrent worker jobs from the environment variable,
        # falling back to the runner config and then to the number of GPUs.
        env_max_concurrent_workers = os.getenv(_MAX_CONCURRENT_WORKERS_ENV_NAME)
        if env_max_concurrent_workers:
            self.max_concurrent_workers = int(env_max_concurrent_workers)
        elif RUNNER_CONFIG.helm_max_concurrent_workers > 0:
            self.max_concurrent_workers = RUNNER_CONFIG.helm_max_concurrent_workers
        else:
            self.max_concurrent_workers = torch.cuda.device_count()

    def safe_run_one(self, run_spec: RunSpec):
        """Run a single `RunSpec` without letting its exception propagate.

        Returns:
            `None` if the run succeeded, otherwise the exception that was caught
            (its traceback is logged via `hlog`).
        """
        # Configs are registered on every call; presumably because worker
        # processes do not share the parent's registries - TODO confirm.
        register_builtin_configs_from_helm_package()
        if self.executor.execution_spec.local_path is not None:
            register_configs_from_directory(self.executor.execution_spec.local_path)

        try:
            with htrack_block(f"Running {run_spec.name}"):
                self.run_one(run_spec)
        except Exception as e:
            hlog(f"Error when running {run_spec.name}:\n{traceback.format_exc()}")
            return e
        return None

    def run_all(self, run_specs: List[RunSpec]):
        """Run the entire benchmark on multiple GPUs.

        Raises:
            RunnerError: if at least one run failed; the message lists the
                names of the failed runs.
        """

        # Set the start method to forkserver to avoid issues with CUDA.
        # `force=True` avoids a RuntimeError when a start method has already
        # been set (e.g. when `run_all` is called more than once).
        multiprocessing.set_start_method("forkserver", force=True)

        with Pool(
            max_workers=self.max_concurrent_workers,
            initializer=start_thread_to_terminate_when_parent_process_dies,
            initargs=(os.getpid(),),
        ) as pool:
            # Pin GPUs to each worker process.
            # NOTE(review): the result of this map is never consumed, so an
            # exception raised in `initialize_worker` is not surfaced here.
            pool.map(initialize_worker, range(self.max_concurrent_workers))

            # Run all queued tasks; each entry is None or the caught exception.
            errors = list(tqdm(pool.map(self.safe_run_one, run_specs), total=len(run_specs), disable=None))

        # Raise exception for failed runs, if any.
        failed_run_names = [run_spec.name for error, run_spec in zip(errors, run_specs) if error is not None]
        if failed_run_names:
            failed_runs_str = ", ".join(f'"{run_name}"' for run_name in failed_run_names)
            raise RunnerError(f"Failed runs: [{failed_runs_str}]")
@@ -10,9 +10,10 @@ from typing import List, Dict, Optional, Any, Callable, Union, Mapping, Tuple, S
10
10
  import numpy as np
11
11
  from scipy.stats import pearsonr
12
12
 
13
+ from helm.benchmark.config_registry import register_builtin_configs_from_helm_package
13
14
  from helm.common.hierarchical_logger import hlog
14
15
  from helm.common.optional_dependencies import handle_module_not_found_error
15
- from helm.benchmark.presentation.schema import read_schema, SCHEMA_CLASSIC_YAML_FILENAME
16
+ from helm.benchmark.model_metadata_registry import MODEL_NAME_TO_MODEL_METADATA
16
17
  from helm.benchmark.presentation.summarize import AGGREGATE_WIN_RATE_COLUMN
17
18
 
18
19
  try:
@@ -133,9 +134,6 @@ class Plotter:
133
134
  self.plot_format = plot_format
134
135
  self._tables_cache: Dict[str, Dict[str, Table]] = {}
135
136
 
136
- schema = read_schema(SCHEMA_CLASSIC_YAML_FILENAME)
137
- self.model_metadata = {model_field.display_name: model_field for model_field in schema.models}
138
-
139
137
  def get_group_tables(self, group_name: str) -> Dict[str, Table]:
140
138
  """Reads and parses group tables. Uses _tables_cache to avoid reprocessing the same table multiple times."""
141
139
  if group_name in self._tables_cache:
@@ -338,14 +336,14 @@ class Plotter:
338
336
 
339
337
  def get_model_release_date(model_name: str) -> Optional[date]:
340
338
  """Maps a model name to the month of model release."""
341
- release_date = self.model_metadata[model_name].release_date
339
+ release_date = MODEL_NAME_TO_MODEL_METADATA[model_name].release_date
342
340
  if release_date is None:
343
341
  return None
344
342
  return release_date.replace(day=1)
345
343
 
346
344
  def get_model_size(model_name: str) -> Optional[int]:
347
345
  """Maps a model name to the number of parameters, rounding to the nearest leading digit."""
348
- size = self.model_metadata[model_name].num_parameters
346
+ size = MODEL_NAME_TO_MODEL_METADATA[model_name].num_parameters
349
347
  if size is None:
350
348
  return None
351
349
  grain = 10 ** (len(str(size)) - 1)
@@ -401,7 +399,9 @@ class Plotter:
401
399
 
402
400
  for i, access_level in enumerate(access_levels):
403
401
  model_indices: List[int] = [
404
- idx for idx, model in enumerate(table.adapters) if self.model_metadata[model].access == access_level
402
+ idx
403
+ for idx, model in enumerate(table.adapters)
404
+ if MODEL_NAME_TO_MODEL_METADATA[model].access == access_level
405
405
  ]
406
406
  best_model_index = model_indices[table.mean_win_rates[model_indices].argmax()]
407
407
 
@@ -611,6 +611,7 @@ def main():
611
611
  parser.add_argument("--suite", type=str, help="Name of the suite that we are plotting", required=True)
612
612
  parser.add_argument("--plot-format", help="Format for saving plots", default="png", choices=["png", "pdf"])
613
613
  args = parser.parse_args()
614
+ register_builtin_configs_from_helm_package()
614
615
  base_path = os.path.join(args.output_path, "runs", args.suite)
615
616
  if not os.path.exists(os.path.join(base_path, "groups")):
616
617
  hlog(f"ERROR: Could not find `groups` directory under {base_path}. Did you run `summarize.py` first?")
@@ -1,9 +1,9 @@
1
1
  from collections import OrderedDict, defaultdict
2
2
  from dataclasses import dataclass
3
3
  import os
4
- from typing import Dict, Iterable, List, Optional, Set, Tuple
4
+ from typing import Dict, Iterable, List, Optional, Set, Tuple, Any
5
5
 
6
- from helm.benchmark.adaptation.adapters.adapter_factory import (
6
+ from helm.benchmark.adaptation.adapter_spec import (
7
7
  ADAPT_MULTIPLE_CHOICE_SEPARATE_METHODS,
8
8
  ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
9
9
  )
@@ -12,11 +12,13 @@ from helm.benchmark.adaptation.request_state import RequestState
12
12
  from helm.benchmark.adaptation.scenario_state import ScenarioState
13
13
  from helm.benchmark.augmentations.perturbation_description import PerturbationDescription
14
14
  from helm.benchmark.metrics.metric import PerInstanceStats
15
+ from helm.common.multimodal_request_utils import gather_generated_image_locations
15
16
  from helm.benchmark.presentation.schema import Schema
16
- from helm.benchmark.runner import RunSpec
17
+ from helm.benchmark.run_spec import RunSpec
17
18
  from helm.benchmark.scenarios.scenario import Instance
18
19
  from helm.common.general import write
19
20
  from helm.common.hierarchical_logger import hlog, htrack
21
+ from helm.common.images_utils import encode_base64
20
22
  from helm.common.request import Request
21
23
  from helm.common.codec import from_json, to_json
22
24
 
@@ -43,6 +45,9 @@ class DisplayPrediction:
43
45
  truncated_predicted_text: Optional[str]
44
46
  """The truncated prediction text, if truncation is required by the Adapter method."""
45
47
 
48
+ base64_images: Optional[List[str]]
49
+ """Images in base64."""
50
+
46
51
  mapped_output: Optional[str]
47
52
  """The mapped output, if an output mapping exists and the prediction can be mapped"""
48
53
 
@@ -52,6 +57,8 @@ class DisplayPrediction:
52
57
  stats: Dict[str, float]
53
58
  """Statistics computed from the predicted output"""
54
59
 
60
+ annotations: Optional[Dict[str, Any]]
61
+
55
62
 
56
63
  @dataclass(frozen=True)
57
64
  class DisplayRequest:
@@ -73,7 +80,7 @@ class DisplayRequest:
73
80
  """The actual Request to display in the web frontend.
74
81
 
75
82
  There can be multiple requests per trial. The displayed request should be the
76
- most relevant request e.g. the request for the chosen cohice for multiple choice questions."""
83
+ most relevant request e.g. the request for the chosen choice for multiple choice questions."""
77
84
 
78
85
 
79
86
  def _read_scenario_state(scenario_state_path: str) -> ScenarioState:
@@ -102,8 +109,7 @@ def _truncate_predicted_text(
102
109
  tokens = request_state.result.completions[0].tokens
103
110
  if tokens:
104
111
  first_token = tokens[0]
105
- if not first_token.top_logprobs:
106
- prefix = first_token.text
112
+ prefix = first_token.text
107
113
  if prefix:
108
114
  predicted_text = predicted_text
109
115
  prefix = prefix
@@ -126,7 +132,7 @@ def _get_metric_names_for_group(run_group_name: str, schema: Schema) -> Set[str]
126
132
  if metric_group is None:
127
133
  continue
128
134
  for metric_name_matcher in metric_group.metrics:
129
- if metric_name_matcher.perturbation_name:
135
+ if metric_name_matcher.perturbation_name and metric_name_matcher.perturbation_name != "__all__":
130
136
  continue
131
137
  result.add(metric_name_matcher.substitute(run_group.environment).name)
132
138
  return result
@@ -256,9 +262,17 @@ def write_run_display_json(run_path: str, run_spec: RunSpec, schema: Schema, ski
256
262
  mapped_output = (
257
263
  request_state.output_mapping.get(predicted_text.strip()) if request_state.output_mapping else None
258
264
  )
259
- instance_id_to_instance[
260
- (request_state.instance.id, request_state.instance.perturbation)
261
- ] = request_state.instance
265
+ instance_id_to_instance[(request_state.instance.id, request_state.instance.perturbation)] = (
266
+ request_state.instance
267
+ )
268
+
269
+ # Process images and include if they exist
270
+ images: List[str] = [
271
+ encode_base64(image_location)
272
+ for image_location in gather_generated_image_locations(request_state.result)
273
+ if os.path.exists(image_location)
274
+ ]
275
+
262
276
  predictions.append(
263
277
  DisplayPrediction(
264
278
  instance_id=request_state.instance.id,
@@ -266,9 +280,11 @@ def write_run_display_json(run_path: str, run_spec: RunSpec, schema: Schema, ski
266
280
  train_trial_index=request_state.train_trial_index,
267
281
  predicted_text=predicted_text,
268
282
  truncated_predicted_text=_truncate_predicted_text(predicted_text, request_state, run_spec.adapter_spec),
283
+ base64_images=images,
269
284
  mapped_output=mapped_output,
270
285
  reference_index=request_state.reference_index,
271
286
  stats=trial_stats,
287
+ annotations=request_state.annotations,
272
288
  )
273
289
  )
274
290
  requests.append(
@@ -1,5 +1,4 @@
1
1
  from dataclasses import dataclass, field
2
- from datetime import date
3
2
  from typing import List, Optional, Dict
4
3
  import dacite
5
4
  import mako.template
@@ -46,34 +45,6 @@ class Field:
46
45
  return name
47
46
 
48
47
 
49
- # Note: also see Model from `models.py`.
50
- @dataclass(frozen=True)
51
- class ModelField(Field):
52
- # Organization that originally created the model (e.g. "EleutherAI")
53
- # Note that this may be different from group or the prefix of the model `name`
54
- # ("together" in "together/gpt-j-6b") as the hosting organization
55
- # may be different from the creator organization. We also capitalize
56
- # this field properly to later display in the UI.
57
- # TODO: in the future, we want to cleanup the naming in the following ways:
58
- # - make the creator_organization an identifier with a separate display name
59
- # - have a convention like <hosting_organization><creator_organization>/<model_name>
60
- creator_organization: Optional[str] = None
61
-
62
- # How this model is available (e.g., limited)
63
- access: Optional[str] = None
64
-
65
- # Whether we have yet to evaluate this model
66
- todo: bool = False
67
-
68
- # When was the model released
69
- release_date: Optional[date] = None
70
-
71
- # The number of parameters
72
- # This should be a string as the number of parameters is usually a round number (175B),
73
- # but we set it as an int for plotting purposes.
74
- num_parameters: Optional[int] = None
75
-
76
-
77
48
  @dataclass(frozen=True)
78
49
  class MetricNameMatcher:
79
50
  """
@@ -100,7 +71,7 @@ class MetricNameMatcher:
100
71
  if self.name != metric_name.name:
101
72
  return False
102
73
 
103
- if self.split != metric_name.split:
74
+ if self.split != "__all__" and self.split != metric_name.split:
104
75
  return False
105
76
 
106
77
  # Optional
@@ -121,9 +92,11 @@ class MetricNameMatcher:
121
92
  return MetricNameMatcher(
122
93
  name=mako.template.Template(self.name).render(**environment),
123
94
  split=mako.template.Template(self.split).render(**environment),
124
- perturbation_name=mako.template.Template(self.perturbation_name).render(**environment)
125
- if self.perturbation_name is not None
126
- else None,
95
+ perturbation_name=(
96
+ mako.template.Template(self.perturbation_name).render(**environment)
97
+ if self.perturbation_name is not None
98
+ else None
99
+ ),
127
100
  )
128
101
 
129
102
 
@@ -135,6 +108,9 @@ class MetricGroup(Field):
135
108
 
136
109
  metrics: List[MetricNameMatcher] = field(default_factory=list)
137
110
 
111
+ hide_win_rates: Optional[bool] = None
112
+ """If set to true, do not compute win rates."""
113
+
138
114
 
139
115
  BY_METRIC = "by_metric"
140
116
  BY_GROUP = "by_group"
@@ -222,9 +198,6 @@ class RunGroup(Field):
222
198
  class Schema:
223
199
  """Specifies information about what to display on the frontend."""
224
200
 
225
- # Models
226
- models: List[ModelField]
227
-
228
201
  # Adapter fields (e.g., temperature)
229
202
  adapter: List[Field]
230
203
 
@@ -241,17 +214,19 @@ class Schema:
241
214
  run_groups: List[RunGroup]
242
215
 
243
216
  def __post_init__(self):
244
- self.name_to_model = {model.name: model for model in self.models}
245
217
  self.name_to_metric = {metric.name: metric for metric in self.metrics}
246
218
  self.name_to_perturbation = {perturbation.name: perturbation for perturbation in self.perturbations}
247
219
  self.name_to_metric_group = {metric_group.name: metric_group for metric_group in self.metric_groups}
248
220
  self.name_to_run_group = {run_group.name: run_group for run_group in self.run_groups}
249
221
 
250
222
 
251
- def read_schema(filename: str) -> Schema:
223
def get_default_schema_path() -> str:
    """Return the path of the classic schema YAML file bundled in `SCHEMA_YAML_PACKAGE`.

    `resources.files(...).joinpath(...)` yields a path-like resource object rather
    than a string, so it is converted with `str()` to match the declared return type.
    """
    return str(resources.files(SCHEMA_YAML_PACKAGE).joinpath(SCHEMA_CLASSIC_YAML_FILENAME))
225
+
226
+
227
+ def read_schema(schema_path: str) -> Schema:
252
228
  # TODO: merge in model metadata from `model_metadata.yaml`
253
- schema_path = resources.files(SCHEMA_YAML_PACKAGE).joinpath(filename)
254
229
  hlog(f"Reading schema file {schema_path}...")
255
- with schema_path.open("r") as f:
230
+ with open(schema_path, "r") as f:
256
231
  raw = yaml.safe_load(f)
257
232
  return dacite.from_dict(Schema, raw)