crfm-helm 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (482)
  1. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +134 -31
  2. crfm_helm-0.5.0.dist-info/RECORD +642 -0
  3. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +31 -3
  5. helm/benchmark/adaptation/adapters/adapter.py +2 -2
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -4
  9. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +2 -3
  10. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  11. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +2 -1
  12. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  13. helm/benchmark/adaptation/adapters/test_generation_adapter.py +32 -8
  14. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +7 -19
  15. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +60 -6
  16. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  17. helm/benchmark/adaptation/request_state.py +6 -1
  18. helm/benchmark/adaptation/scenario_state.py +6 -2
  19. helm/benchmark/annotation/annotator.py +43 -0
  20. helm/benchmark/annotation/annotator_factory.py +61 -0
  21. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  22. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  23. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  24. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  25. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  26. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  27. helm/benchmark/annotation_executor.py +124 -0
  28. helm/benchmark/augmentations/data_augmenter.py +0 -2
  29. helm/benchmark/augmentations/gender_perturbation.py +1 -1
  30. helm/benchmark/augmentations/perturbation.py +8 -2
  31. helm/benchmark/augmentations/perturbation_description.py +1 -1
  32. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  33. helm/benchmark/augmentations/test_perturbation.py +11 -7
  34. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  35. helm/benchmark/config_registry.py +7 -1
  36. helm/benchmark/executor.py +46 -16
  37. helm/benchmark/huggingface_registration.py +20 -7
  38. helm/benchmark/metrics/basic_metrics.py +169 -664
  39. helm/benchmark/metrics/bbq_metrics.py +3 -4
  40. helm/benchmark/metrics/bias_metrics.py +6 -6
  41. helm/benchmark/metrics/classification_metrics.py +11 -8
  42. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  43. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  44. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  45. helm/benchmark/metrics/common_metric_specs.py +167 -0
  46. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  47. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  48. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  49. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  50. helm/benchmark/metrics/disinformation_metrics.py +4 -110
  51. helm/benchmark/metrics/dry_run_metrics.py +2 -2
  52. helm/benchmark/metrics/efficiency_metrics.py +206 -0
  53. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  54. helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
  55. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  56. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  57. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  58. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  59. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  60. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  61. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  62. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  63. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  64. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  65. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  66. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  67. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  68. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  69. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  70. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  71. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  72. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  73. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  74. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  75. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  76. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  77. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  78. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  79. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  80. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  81. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  82. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  83. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  84. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  85. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  86. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  87. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  88. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  89. helm/benchmark/metrics/machine_translation_metrics.py +89 -0
  90. helm/benchmark/metrics/metric.py +93 -172
  91. helm/benchmark/metrics/metric_name.py +0 -1
  92. helm/benchmark/metrics/metric_service.py +16 -0
  93. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  94. helm/benchmark/metrics/ranking_metrics.py +2 -2
  95. helm/benchmark/metrics/reference_metric.py +148 -0
  96. helm/benchmark/metrics/summac/model_summac.py +0 -2
  97. helm/benchmark/metrics/summarization_metrics.py +2 -2
  98. helm/benchmark/metrics/test_classification_metrics.py +8 -5
  99. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  100. helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +5 -1
  101. helm/benchmark/metrics/test_metric.py +2 -2
  102. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +10 -2
  103. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  104. helm/benchmark/metrics/toxicity_utils.py +23 -0
  105. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  106. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  107. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  108. helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
  109. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  110. helm/benchmark/model_deployment_registry.py +74 -0
  111. helm/benchmark/model_metadata_registry.py +36 -0
  112. helm/benchmark/multi_gpu_runner.py +133 -0
  113. helm/benchmark/presentation/create_plots.py +8 -7
  114. helm/benchmark/presentation/run_display.py +26 -10
  115. helm/benchmark/presentation/schema.py +15 -40
  116. helm/benchmark/presentation/summarize.py +119 -79
  117. helm/benchmark/presentation/table.py +8 -8
  118. helm/benchmark/presentation/test_contamination.py +2 -2
  119. helm/benchmark/presentation/test_run_entry.py +1 -2
  120. helm/benchmark/presentation/test_summarize.py +3 -3
  121. helm/benchmark/run.py +54 -26
  122. helm/benchmark/run_expander.py +214 -16
  123. helm/benchmark/run_spec.py +93 -0
  124. helm/benchmark/run_spec_factory.py +162 -0
  125. helm/benchmark/run_specs/__init__.py +0 -0
  126. helm/benchmark/run_specs/classic_run_specs.py +1510 -0
  127. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  128. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  129. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  130. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  131. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  132. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  133. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  134. helm/benchmark/run_specs/vlm_run_specs.py +501 -0
  135. helm/benchmark/runner.py +51 -57
  136. helm/benchmark/runner_config_registry.py +21 -0
  137. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  138. helm/benchmark/scenarios/bold_scenario.py +2 -2
  139. helm/benchmark/scenarios/code_scenario.py +1 -0
  140. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  141. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  142. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  143. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  144. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  145. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  146. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  147. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  148. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  149. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  150. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  151. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  152. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  153. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  154. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  155. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  156. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  157. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  158. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  159. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  160. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  161. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  162. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  163. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  164. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  165. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  166. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  167. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  168. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  169. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  170. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  171. helm/benchmark/scenarios/math_scenario.py +19 -2
  172. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  173. helm/benchmark/scenarios/numeracy_scenario.py +1 -1
  174. helm/benchmark/scenarios/opinions_qa_scenario.py +0 -4
  175. helm/benchmark/scenarios/scenario.py +4 -0
  176. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  177. helm/benchmark/scenarios/test_math_scenario.py +6 -0
  178. helm/benchmark/scenarios/test_scenario.py +6 -3
  179. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  180. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  181. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  182. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  183. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  184. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  185. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
  186. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  187. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  188. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  189. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  190. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  191. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  192. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  193. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  194. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  195. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  196. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  197. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  198. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  199. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  200. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  201. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  202. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  203. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  204. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  205. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -2
  206. helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
  207. helm/benchmark/scenarios/wmt_14_scenario.py +1 -1
  208. helm/benchmark/server.py +24 -1
  209. helm/benchmark/slurm_runner.py +70 -49
  210. helm/benchmark/static/benchmarking.js +1 -1
  211. helm/benchmark/static/schema_classic.yaml +258 -1066
  212. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  213. helm/benchmark/static/schema_lite.yaml +2 -227
  214. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  215. helm/benchmark/static/schema_unitxt.yaml +428 -0
  216. helm/benchmark/static/schema_vlm.yaml +576 -0
  217. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  218. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  219. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  220. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  221. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  222. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  223. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  224. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  225. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  226. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  227. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  228. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  229. helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
  230. helm/benchmark/static_build/assets/index-d839df55.js +9 -0
  231. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  232. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  233. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  234. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  235. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  236. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  237. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  238. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  239. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  240. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  241. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  242. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  243. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  244. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  245. helm/benchmark/static_build/config.js +4 -0
  246. helm/benchmark/static_build/index.html +20 -0
  247. helm/benchmark/test_data_preprocessor.py +3 -3
  248. helm/benchmark/test_model_deployment_definition.py +14 -16
  249. helm/benchmark/test_run_expander.py +1 -1
  250. helm/benchmark/window_services/ai21_window_service.py +22 -33
  251. helm/benchmark/window_services/cohere_window_service.py +1 -63
  252. helm/benchmark/window_services/default_window_service.py +2 -44
  253. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  254. helm/benchmark/window_services/ice_window_service.py +0 -34
  255. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  256. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  257. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  258. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  259. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  260. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  261. helm/benchmark/window_services/local_window_service.py +21 -4
  262. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  263. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  264. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  265. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  266. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  267. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  268. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  269. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  270. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  271. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  272. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  273. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  274. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  275. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  276. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  277. helm/benchmark/window_services/test_utils.py +3 -2
  278. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  279. helm/benchmark/window_services/window_service.py +42 -0
  280. helm/benchmark/window_services/window_service_factory.py +4 -1
  281. helm/benchmark/window_services/yalm_window_service.py +0 -27
  282. helm/clients/__init__.py +0 -0
  283. helm/{proxy/clients → clients}/ai21_client.py +3 -9
  284. helm/clients/aleph_alpha_client.py +112 -0
  285. helm/{proxy/clients → clients}/anthropic_client.py +203 -18
  286. helm/{proxy/clients → clients}/auto_client.py +59 -31
  287. helm/clients/bedrock_client.py +128 -0
  288. helm/clients/bedrock_utils.py +72 -0
  289. helm/{proxy/clients → clients}/client.py +65 -7
  290. helm/clients/clip_score_client.py +49 -0
  291. helm/clients/clip_scorers/__init__.py +0 -0
  292. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  293. helm/clients/clip_scorers/clip_scorer.py +50 -0
  294. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  295. helm/{proxy/clients → clients}/cohere_client.py +4 -11
  296. helm/clients/gcs_client.py +82 -0
  297. helm/{proxy/clients → clients}/google_client.py +5 -5
  298. helm/clients/google_translate_client.py +35 -0
  299. helm/{proxy/clients → clients}/http_model_client.py +5 -7
  300. helm/{proxy/clients → clients}/huggingface_client.py +43 -64
  301. helm/clients/image_generation/__init__.py +0 -0
  302. helm/clients/image_generation/adobe_vision_client.py +78 -0
  303. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  304. helm/clients/image_generation/cogview2/__init__.py +0 -0
  305. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  306. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  307. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  308. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  309. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  310. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  311. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  312. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  313. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  314. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  315. helm/clients/image_generation/cogview2_client.py +191 -0
  316. helm/clients/image_generation/dalle2_client.py +192 -0
  317. helm/clients/image_generation/dalle3_client.py +108 -0
  318. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  319. helm/clients/image_generation/dalle_mini/data.py +442 -0
  320. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  321. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  322. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  323. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  324. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  325. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  326. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  327. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  328. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  329. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  330. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  331. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  332. helm/clients/image_generation/dalle_mini_client.py +190 -0
  333. helm/clients/image_generation/deep_floyd_client.py +78 -0
  334. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  335. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  336. helm/clients/image_generation/lexica_client.py +86 -0
  337. helm/clients/image_generation/mindalle/__init__.py +0 -0
  338. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  339. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  340. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  341. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  342. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  343. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  344. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  345. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  346. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  347. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  348. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  349. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  350. helm/clients/image_generation/mindalle_client.py +115 -0
  351. helm/clients/image_generation/nudity_check_client.py +64 -0
  352. helm/clients/image_generation/together_image_generation_client.py +111 -0
  353. helm/{proxy/clients → clients}/lit_gpt_client.py +4 -4
  354. helm/{proxy/clients → clients}/megatron_client.py +5 -5
  355. helm/clients/mistral_client.py +134 -0
  356. helm/clients/moderation_api_client.py +109 -0
  357. helm/clients/open_lm_client.py +43 -0
  358. helm/clients/openai_client.py +302 -0
  359. helm/{proxy/clients → clients}/palmyra_client.py +6 -8
  360. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  361. helm/clients/simple_client.py +64 -0
  362. helm/{proxy/clients → clients}/test_auto_client.py +13 -15
  363. helm/clients/test_client.py +100 -0
  364. helm/{proxy/clients → clients}/test_huggingface_client.py +15 -16
  365. helm/clients/test_simple_client.py +19 -0
  366. helm/{proxy/clients → clients}/test_together_client.py +20 -8
  367. helm/{proxy/clients → clients}/together_client.py +12 -72
  368. helm/clients/vertexai_client.py +391 -0
  369. helm/clients/vision_language/__init__.py +0 -0
  370. helm/clients/vision_language/huggingface_vlm_client.py +104 -0
  371. helm/{proxy/clients → clients}/vision_language/idefics_client.py +53 -48
  372. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  373. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  374. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  375. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  376. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  377. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  378. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  379. helm/clients/vision_language/open_flamingo_client.py +155 -0
  380. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  381. helm/clients/vllm_client.py +46 -0
  382. helm/common/cache.py +16 -4
  383. helm/common/cache_backend_config.py +47 -0
  384. helm/common/clip_score_request.py +41 -0
  385. helm/common/file_caches/__init__.py +0 -0
  386. helm/common/file_caches/file_cache.py +16 -0
  387. helm/common/file_caches/local_file_cache.py +61 -0
  388. helm/common/file_caches/test_local_file_cache.py +25 -0
  389. helm/common/file_upload_request.py +27 -0
  390. helm/common/general.py +1 -1
  391. helm/common/image_generation_parameters.py +25 -0
  392. helm/common/images_utils.py +24 -1
  393. helm/common/key_value_store.py +35 -4
  394. helm/common/media_object.py +13 -0
  395. helm/common/moderations_api_request.py +71 -0
  396. helm/common/mongo_key_value_store.py +3 -3
  397. helm/common/multimodal_request_utils.py +31 -0
  398. helm/common/nudity_check_request.py +29 -0
  399. helm/common/request.py +15 -17
  400. helm/common/test_general.py +6 -0
  401. helm/common/tokenization_request.py +1 -1
  402. helm/config/model_deployments.yaml +1069 -546
  403. helm/config/model_metadata.yaml +753 -31
  404. helm/config/tokenizer_configs.yaml +142 -43
  405. helm/proxy/accounts.py +31 -4
  406. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  407. helm/proxy/critique/model_critique_client.py +8 -6
  408. helm/proxy/example_queries.py +29 -17
  409. helm/proxy/server.py +70 -5
  410. helm/proxy/services/remote_service.py +31 -0
  411. helm/proxy/services/server_service.py +96 -16
  412. helm/proxy/services/service.py +30 -0
  413. helm/proxy/services/test_remote_service.py +4 -3
  414. helm/proxy/services/test_service.py +0 -12
  415. helm/proxy/test_accounts.py +32 -0
  416. helm/proxy/token_counters/auto_token_counter.py +37 -37
  417. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  418. helm/proxy/token_counters/token_counter.py +3 -5
  419. helm/tokenizers/__init__.py +0 -0
  420. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  421. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +1 -1
  422. helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +6 -9
  423. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  424. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  425. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +7 -26
  426. helm/tokenizers/simple_tokenizer.py +33 -0
  427. helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +1 -1
  428. helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +3 -0
  429. helm/tokenizers/test_simple_tokenizer.py +33 -0
  430. helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
  431. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  432. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  433. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  434. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  435. crfm_helm-0.4.0.dist-info/RECORD +0 -397
  436. helm/benchmark/run_specs.py +0 -2762
  437. helm/benchmark/test_model_properties.py +0 -1570
  438. helm/benchmark/vlm_run_specs.py +0 -97
  439. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  440. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  441. helm/benchmark/window_services/huggingface_window_service.py +0 -60
  442. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  443. helm/benchmark/window_services/t511b_window_service.py +0 -30
  444. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  445. helm/benchmark/window_services/ul2_window_service.py +0 -30
  446. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  447. helm/common/cache_utils.py +0 -14
  448. helm/proxy/clients/aleph_alpha_client.py +0 -95
  449. helm/proxy/clients/goose_ai_client.py +0 -99
  450. helm/proxy/clients/microsoft_client.py +0 -180
  451. helm/proxy/clients/openai_client.py +0 -206
  452. helm/proxy/clients/simple_client.py +0 -60
  453. helm/proxy/clients/test_client.py +0 -49
  454. helm/proxy/clients/vertexai_client.py +0 -115
  455. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  456. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  457. helm/proxy/token_counters/free_token_counter.py +0 -12
  458. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  459. helm/proxy/token_counters/openai_token_counter.py +0 -22
  460. helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
  461. helm/proxy/token_counters/test_openai_token_counter.py +0 -81
  462. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  463. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
  464. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
  465. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
  466. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  467. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  468. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  469. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  470. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  471. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  472. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  473. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  474. /helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +0 -0
  475. /helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +0 -0
  476. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  477. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  478. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  479. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  480. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  481. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  482. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
helm/common/cache.py CHANGED
@@ -1,13 +1,13 @@
1
1
  from collections import defaultdict
2
2
  from dataclasses import dataclass
3
- from typing import Dict, Callable, Generator, Optional, Tuple
3
+ from typing import Dict, Callable, Generator, Mapping, Optional, Tuple
4
4
  import json
5
5
  import threading
6
6
 
7
7
  import sqlite3
8
8
 
9
9
  from helm.common.general import hlog, htrack
10
- from helm.common.key_value_store import KeyValueStore, SqliteKeyValueStore
10
+ from helm.common.key_value_store import BlackHoleKeyValueStore, KeyValueStore, SqliteKeyValueStore
11
11
  from helm.proxy.retry import get_retry_decorator
12
12
 
13
13
  try:
@@ -51,6 +51,16 @@ class SqliteCacheConfig(KeyValueStoreCacheConfig):
51
51
  return self.path
52
52
 
53
53
 
54
+ @dataclass(frozen=True)
55
+ class BlackHoleCacheConfig(KeyValueStoreCacheConfig):
56
+ """Configuration for a cache that does not save any data."""
57
+
58
+ @property
59
+ def cache_stats_key(self) -> str:
60
+ """The string key used by CacheStats to identify this cache."""
61
+ return "disabled_cache"
62
+
63
+
54
64
  @dataclass(frozen=True)
55
65
  class MongoCacheConfig(KeyValueStoreCacheConfig):
56
66
  """Configuration for a cache backed by a MongoDB collection."""
@@ -113,12 +123,14 @@ def create_key_value_store(config: KeyValueStoreCacheConfig) -> KeyValueStore:
113
123
  return MongoKeyValueStore(config.uri, config.collection_name)
114
124
  elif isinstance(config, SqliteCacheConfig):
115
125
  return SqliteKeyValueStore(config.path)
126
+ elif isinstance(config, BlackHoleCacheConfig):
127
+ return BlackHoleKeyValueStore()
116
128
  else:
117
129
  raise ValueError(f"KeyValueStoreCacheConfig with unknown type: {config}")
118
130
 
119
131
 
120
132
  @retry
121
- def write_to_key_value_store(key_value_store: KeyValueStore, key: Dict, response: Dict) -> bool:
133
+ def write_to_key_value_store(key_value_store: KeyValueStore, key: Mapping, response: Dict) -> bool:
122
134
  """
123
135
  Write to the key value store with retry. Returns boolean indicating whether the write was successful or not.
124
136
  """
@@ -188,7 +200,7 @@ class Cache(object):
188
200
  else:
189
201
  raise ValueError(f"CacheConfig with unknown type: {config}")
190
202
 
191
- def get(self, request: Dict, compute: Callable[[], Dict]) -> Tuple[Dict, bool]:
203
+ def get(self, request: Mapping, compute: Callable[[], Dict]) -> Tuple[Dict, bool]:
192
204
  """Get the result of `request` (by calling `compute` as needed)."""
193
205
  cache_stats.increment_query(self.config.cache_stats_key)
194
206
 
@@ -0,0 +1,47 @@
1
+ from abc import ABC, abstractmethod
2
+ from dataclasses import dataclass
3
+ import os
4
+
5
+ from helm.common.cache import CacheConfig, MongoCacheConfig, BlackHoleCacheConfig, SqliteCacheConfig
6
+
7
+
8
+ class CacheBackendConfig(ABC):
9
+ """Config for a cache backend."""
10
+
11
+ @abstractmethod
12
+ def get_cache_config(self, shard_name: str) -> CacheConfig:
13
+ """Get a CacheConfig for the given shard."""
14
+ pass
15
+
16
+
17
+ @dataclass(frozen=True)
18
+ class MongoCacheBackendConfig(CacheBackendConfig):
19
+ """Config for a MongoDB cache backend."""
20
+
21
+ uri: str
22
+ """URL for the MongoDB database that contains the collection.
23
+
24
+ Example format: mongodb://[username:password@]host1[:port1]/[dbname]
25
+ For full format, see: https://www.mongodb.com/docs/manual/reference/connection-string/"""
26
+
27
+ def get_cache_config(self, shard_name: str) -> CacheConfig:
28
+ return MongoCacheConfig(uri=self.uri, collection_name=shard_name)
29
+
30
+
31
+ @dataclass(frozen=True)
32
+ class BlackHoleCacheBackendConfig(CacheBackendConfig):
33
+ """Config for a cache backend that does not save any data."""
34
+
35
+ def get_cache_config(self, shard_name: str) -> CacheConfig:
36
+ return BlackHoleCacheConfig()
37
+
38
+
39
+ @dataclass(frozen=True)
40
+ class SqliteCacheBackendConfig(CacheBackendConfig):
41
+ """Config for a Sqlite cache backend."""
42
+
43
+ path: str
44
+ """Path for the directory that will contain Sqlite files for caches."""
45
+
46
+ def get_cache_config(self, shard_name: str) -> CacheConfig:
47
+ return SqliteCacheConfig(path=os.path.join(self.path, f"{shard_name}.sqlite"))
@@ -0,0 +1,41 @@
1
+ from dataclasses import dataclass
2
+ from typing import Optional
3
+
4
+
5
+ DEFAULT_CLIP_SCORE_MODEL = "openai/clip-vit-large-patch14"
6
+
7
+
8
+ @dataclass(frozen=True)
9
+ class CLIPScoreRequest:
10
+ """
11
+ Computes a CLIPScore for a given caption and image.
12
+ """
13
+
14
+ # Caption to compute CLIPScore for
15
+ caption: str
16
+
17
+ # Location of the image
18
+ image_location: str
19
+
20
+ # Which CLIP model to use
21
+ model: str = DEFAULT_CLIP_SCORE_MODEL
22
+
23
+ # Compute multilingual CLIPScore
24
+ multilingual: bool = False
25
+
26
+
27
+ @dataclass(frozen=True)
28
+ class CLIPScoreResult:
29
+ """Result after sending a `CLIPScoreRequest`."""
30
+
31
+ # Whether the request was successful
32
+ success: bool
33
+
34
+ # Whether the request was cached
35
+ cached: bool
36
+
37
+ # The CLIPScore
38
+ score: float = 0.0
39
+
40
+ # If `success` is false, what was the error?
41
+ error: Optional[str] = None
File without changes
@@ -0,0 +1,16 @@
1
+ from abc import ABC, abstractmethod
2
+ from typing import Callable
3
+
4
+
5
+ class FileCache(ABC):
6
+ """
7
+ Cache to store files.
8
+ """
9
+
10
+ @abstractmethod
11
+ def store(self, compute: Callable[[], bytes]) -> str:
12
+ """
13
+ Stores the output of `compute` as a file at a unique location.
14
+ Returns the location of the file.
15
+ """
16
+ pass
@@ -0,0 +1,61 @@
1
+ import os
2
+ from typing import Callable
3
+
4
+ from helm.common.general import ensure_directory_exists, generate_unique_id
5
+ from .file_cache import FileCache
6
+
7
+ from helm.common.optional_dependencies import handle_module_not_found_error
8
+
9
+ try:
10
+ from PIL import Image
11
+ except ModuleNotFoundError as e:
12
+ handle_module_not_found_error(e, ["images"])
13
+
14
+
15
+ class LocalFileCache(FileCache):
16
+ def __init__(self, base_path: str, file_extension: str):
17
+ ensure_directory_exists(base_path)
18
+ self._location: str = base_path
19
+ self._file_extension: str = file_extension
20
+
21
+ def store(self, compute: Callable[[], bytes]) -> str:
22
+ """
23
+ Stores the output of `compute` as a file at a unique path.
24
+ Returns the file path.
25
+ """
26
+ file_path: str = self.generate_unique_new_file_path()
27
+ with open(file_path, "wb") as f:
28
+ f.write(compute())
29
+
30
+ return file_path
31
+
32
+ def generate_unique_new_file_path(self) -> str:
33
+ """Generate an unique file name at `base_path`"""
34
+
35
+ def generate_one() -> str:
36
+ file_name: str = f"{generate_unique_id()}.{self._file_extension}"
37
+ return os.path.join(self._location, file_name)
38
+
39
+ file_path: str
40
+ while True:
41
+ file_path = generate_one()
42
+ if not os.path.exists(file_path):
43
+ break
44
+ return file_path
45
+
46
+
47
+ class LocalPILFileCache(LocalFileCache):
48
+ def __init__(self, base_path: str):
49
+ super().__init__(base_path, "png")
50
+
51
+ def store_image(self, compute: Callable[[], Image.Image]) -> str:
52
+ """
53
+ Stores the output of `compute` as a file at a unique path.
54
+ Returns the file path.
55
+ """
56
+ file_path: str = self.generate_unique_new_file_path()
57
+ compute().save(file_path)
58
+ return file_path
59
+
60
+ def load_image(self, file_path: str) -> Image.Image:
61
+ return Image.open(file_path).convert("RGB")
@@ -0,0 +1,25 @@
1
+ import os
2
+ import shutil
3
+ import tempfile
4
+ import unittest
5
+
6
+ from .local_file_cache import LocalFileCache
7
+
8
+
9
+ class TestLocalFileCache(unittest.TestCase):
10
+ def setup_method(self, _):
11
+ self.path: str = tempfile.mkdtemp()
12
+
13
+ def teardown_method(self, _):
14
+ shutil.rmtree(self.path)
15
+
16
+ def test_get(self):
17
+ cache = LocalFileCache(self.path, file_extension="txt")
18
+ file_path1: str = cache.store(lambda: "hello.".encode())
19
+
20
+ # Verify the contents of the file
21
+ with open(file_path1, "r") as f:
22
+ assert f.read() == "hello."
23
+
24
+ cache.store(lambda: "bye.".encode())
25
+ assert len(os.listdir(self.path)) == 2
@@ -0,0 +1,27 @@
1
+ from dataclasses import dataclass
2
+ from typing import Optional
3
+
4
+
5
+ @dataclass(frozen=True)
6
+ class FileUploadRequest:
7
+ """Uploads a file at `path`."""
8
+
9
+ # Path of the file to upload
10
+ path: str
11
+
12
+
13
+ @dataclass(frozen=True)
14
+ class FileUploadResult:
15
+ """Result after sending a `FileUploadRequest`."""
16
+
17
+ # Whether the request was successful
18
+ success: bool
19
+
20
+ # Whether the request was cached
21
+ cached: bool
22
+
23
+ # URL of the uploaded file
24
+ url: str
25
+
26
+ # If `success` is false, what was the error?
27
+ error: Optional[str] = None
helm/common/general.py CHANGED
@@ -63,7 +63,7 @@ def shell(args: List[str]):
63
63
  hlog(f"Executing: {cmd}")
64
64
  exit_code = subprocess.call(args)
65
65
  if exit_code != 0:
66
- hlog(f"Failed with exit code {exit_code}: {cmd}")
66
+ raise Exception(f"Failed with exit code {exit_code}: {cmd}")
67
67
 
68
68
 
69
69
  @htrack(None)
@@ -0,0 +1,25 @@
1
+ from dataclasses import dataclass
2
+ from typing import Optional
3
+
4
+
5
+ @dataclass(frozen=True)
6
+ class ImageGenerationParameters:
7
+ """
8
+ Parameters for image generation.
9
+ """
10
+
11
+ output_image_width: Optional[int] = None
12
+ """Width of the generated image. The model will generate images with the model's
13
+ default dimensions when unspecified."""
14
+
15
+ output_image_height: Optional[int] = None
16
+ """Height of the generated image. The model will generate images with the model's
17
+ default dimensions when unspecified."""
18
+
19
+ guidance_scale: Optional[float] = None
20
+ """A non-negative number determining how much importance is given to the prompt
21
+ when generating images. Higher values will generate images that follow more
22
+ closely to the prompt. Currently only for diffusion models."""
23
+
24
+ diffusion_denoising_steps: Optional[int] = None
25
+ """The number of denoising steps for diffusion models."""
@@ -2,7 +2,10 @@ import base64
2
2
  import io
3
3
  import requests
4
4
  import shutil
5
- from typing import Optional
5
+ from typing import List, Optional
6
+ from urllib.request import urlopen
7
+
8
+ import numpy as np
6
9
 
7
10
  from .general import is_url
8
11
  from helm.common.optional_dependencies import handle_module_not_found_error
@@ -45,3 +48,23 @@ def copy_image(src: str, dest: str, width: Optional[int] = None, height: Optiona
45
48
  image.save(dest)
46
49
  else:
47
50
  shutil.copy(src, dest)
51
+
52
+
53
+ def is_blacked_out_image(image_location: str) -> bool:
54
+ """Returns True if the image is all black. False otherwise."""
55
+ try:
56
+ import cv2
57
+ except ModuleNotFoundError as e:
58
+ handle_module_not_found_error(e, ["heim"])
59
+
60
+ if is_url(image_location):
61
+ arr = np.asarray(bytearray(urlopen(image_location).read()), dtype=np.uint8)
62
+ image = cv2.imdecode(arr, -1)
63
+ else:
64
+ image = cv2.imread(image_location, 0)
65
+ return cv2.countNonZero(image) == 0
66
+
67
+
68
+ def filter_blacked_out_images(image_locations: List[str]) -> List[str]:
69
+ """Returns a list of image locations that are not blacked out."""
70
+ return [image_location for image_location in image_locations if not is_blacked_out_image(image_location)]
@@ -1,12 +1,12 @@
1
1
  from abc import abstractmethod
2
2
  import contextlib
3
3
  import json
4
- from typing import Dict, Generator, Iterable, Optional, Tuple
4
+ from typing import Dict, Generator, Iterable, Mapping, Optional, Tuple
5
5
 
6
6
  from sqlitedict import SqliteDict
7
7
 
8
8
 
9
- def request_to_key(request: Dict) -> str:
9
+ def request_to_key(request: Mapping) -> str:
10
10
  """Normalize a `request` into a `key` so that we can hash using it."""
11
11
  return json.dumps(request, sort_keys=True)
12
12
 
@@ -27,7 +27,7 @@ class KeyValueStore(contextlib.AbstractContextManager):
27
27
  pass
28
28
 
29
29
  @abstractmethod
30
- def put(self, key: Dict, value: Dict) -> None:
30
+ def put(self, key: Mapping, value: Dict) -> None:
31
31
  pass
32
32
 
33
33
  @abstractmethod
@@ -68,7 +68,7 @@ class SqliteKeyValueStore(KeyValueStore):
68
68
  for key, value in self._sqlite_dict.items():
69
69
  yield (key, value)
70
70
 
71
- def put(self, key: Dict, value: Dict) -> None:
71
+ def put(self, key: Mapping, value: Dict) -> None:
72
72
  key_string = request_to_key(key)
73
73
  self._sqlite_dict[key_string] = value
74
74
  self._sqlite_dict.commit()
@@ -80,3 +80,34 @@ class SqliteKeyValueStore(KeyValueStore):
80
80
  def remove(self, key: Dict) -> None:
81
81
  del self._sqlite_dict[key]
82
82
  self._sqlite_dict.commit()
83
+
84
+
85
+ class BlackHoleKeyValueStore(KeyValueStore):
86
+ """Key value store that discards all data."""
87
+
88
+ def __enter__(self) -> "BlackHoleKeyValueStore":
89
+ return self
90
+
91
+ def __exit__(self, exc_type, exc_value, traceback) -> None:
92
+ pass
93
+
94
+ def contains(self, key: Dict) -> bool:
95
+ return False
96
+
97
+ def get(self, key: Dict) -> Optional[Dict]:
98
+ return None
99
+
100
+ def get_all(self) -> Generator[Tuple[Dict, Dict], None, None]:
101
+ # Return an empty generator.
102
+ # See: https://stackoverflow.com/a/13243870
103
+ return
104
+ yield
105
+
106
+ def put(self, key: Mapping, value: Dict) -> None:
107
+ return None
108
+
109
+ def multi_put(self, pairs: Iterable[Tuple[Dict, Dict]]) -> None:
110
+ return None
111
+
112
+ def remove(self, key: Dict) -> None:
113
+ return None
@@ -5,6 +5,7 @@ from dataclasses import dataclass, field, replace
5
5
  from typing import List, Optional
6
6
 
7
7
 
8
+ IMAGE_TYPE = "image"
8
9
  TEXT_TYPE = "text"
9
10
 
10
11
 
@@ -26,6 +27,10 @@ class MediaObject:
26
27
  location: Optional[str] = None
27
28
  """When the media object is a file, specify the location of the media object, which can be a local path or URL."""
28
29
 
30
+ def to_dict(self) -> dict:
31
+ """Converts the media object to a dictionary."""
32
+ return {key: value for key, value in self.__dict__.items() if value is not None}
33
+
29
34
  @property
30
35
  def type(self) -> str:
31
36
  """The MIME type of the media object."""
@@ -113,6 +118,14 @@ class MultimediaObject:
113
118
  """
114
119
  return MultimediaObject(media_objects=self.media_objects + other.media_objects)
115
120
 
121
+ @property
122
+ def size(self) -> int:
123
+ """
124
+ Get the number of `MediaObject`s in this multimodal content.
125
+ :return: The number of `MediaObject`s .
126
+ """
127
+ return len(self.media_objects)
128
+
116
129
  @property
117
130
  def text(self) -> str:
118
131
  """
@@ -0,0 +1,71 @@
1
+ from dataclasses import dataclass
2
+ from typing import Optional
3
+
4
+
5
+ @dataclass(frozen=True)
6
+ class ModerationAPIRequest:
7
+ # Text to check against OpenAI's content policy
8
+ text: str
9
+
10
+ # From https://beta.openai.com/docs/api-reference/moderations/create,
11
+ # "the default is text-moderation-latest which will be automatically upgraded over time.
12
+ # This ensures you are always using our most accurate model. If you use text-moderation-stable,
13
+ # we will provide advanced notice before updating the model. Accuracy of text-moderation-stable
14
+ # may be slightly lower than for text-moderation-latest."
15
+ use_latest_model: bool = False
16
+
17
+
18
+ @dataclass(frozen=True)
19
+ class ModerationCategoryFlaggedResults:
20
+ """
21
+ Contains per-category binary content violation flags.
22
+ For descriptions of the categories, see https://beta.openai.com/docs/guides/moderation/overview.
23
+ """
24
+
25
+ hate_flagged: bool
26
+ hate_threatening_flagged: bool
27
+ self_harm_flagged: bool
28
+ sexual_flagged: bool
29
+ sexual_minors_flagged: bool
30
+ violence_flagged: bool
31
+ violence_graphic_flagged: bool
32
+
33
+
34
+ @dataclass(frozen=True)
35
+ class ModerationCategoryScores:
36
+ """
37
+ Contains per-category scores. Values are between 0 and 1, where higher values denote higher
38
+ confidence. The scores should not be interpreted as probabilities.
39
+ For descriptions of the categories, see https://beta.openai.com/docs/guides/moderation/overview.
40
+ """
41
+
42
+ hate_score: float
43
+ hate_threatening_score: float
44
+ self_harm_score: float
45
+ sexual_score: float
46
+ sexual_minors_score: float
47
+ violence_score: float
48
+ violence_graphic_score: float
49
+
50
+
51
+ @dataclass(frozen=True)
52
+ class ModerationAPIRequestResult:
53
+ """Result after sending a `ModerationAPIRequest`."""
54
+
55
+ # Whether the request was successful
56
+ success: bool
57
+
58
+ # Whether the request was cached
59
+ cached: bool
60
+
61
+ # True if the model classifies the content as violating OpenAI's content policy, False otherwise
62
+ flagged: Optional[bool]
63
+
64
+ # Flagged results
65
+ flagged_results: Optional[ModerationCategoryFlaggedResults]
66
+
67
+ # Score results
68
+ scores: Optional[ModerationCategoryScores]
69
+
70
+ # If `success` is false, what was the error?
71
+ error: Optional[str] = None
@@ -1,5 +1,5 @@
1
1
  import json
2
- from typing import Dict, Generator, Iterable, Optional, Tuple
2
+ from typing import Dict, Generator, Iterable, Mapping, Optional, Tuple
3
3
 
4
4
  from helm.common.key_value_store import KeyValueStore
5
5
  from helm.common.optional_dependencies import handle_module_not_found_error
@@ -35,7 +35,7 @@ class MongoKeyValueStore(KeyValueStore):
35
35
  def __exit__(self, exc_type, exc_value, traceback) -> None:
36
36
  return
37
37
 
38
- def _canonicalize_key(self, key: Dict) -> SON:
38
+ def _canonicalize_key(self, key: Mapping) -> SON:
39
39
  serialized = json.dumps(key, sort_keys=True)
40
40
  return json.loads(serialized, object_pairs_hook=SON)
41
41
 
@@ -63,7 +63,7 @@ class MongoKeyValueStore(KeyValueStore):
63
63
  else:
64
64
  yield (request, response)
65
65
 
66
- def put(self, key: Dict, value: Dict) -> None:
66
+ def put(self, key: Mapping, value: Dict) -> None:
67
67
  request = self._canonicalize_key(key)
68
68
  document = SON([(self._REQUEST_KEY, request), (self._RESPONSE_KEY, value)])
69
69
  # The MongoDB collection should have a unique indexed on "request"
@@ -0,0 +1,31 @@
1
+ from typing import List, Optional
2
+
3
+ from helm.benchmark.adaptation.request_state import RequestState
4
+ from helm.benchmark.scenarios.scenario import Reference
5
+ from helm.common.request import RequestResult
6
+
7
+
8
+ def gather_generated_image_locations(request_result: RequestResult) -> List[str]:
9
+ """Gathers the locations (file paths or URLs) of the generated images."""
10
+ image_locations: List[str] = []
11
+ for image in request_result.completions:
12
+ # Models like DALL-E 2 can skip generating images for prompts that violate their content policy
13
+ if image.multimodal_content is None or image.multimodal_content.size == 0:
14
+ return []
15
+
16
+ location: Optional[str] = image.multimodal_content.media_objects[0].location
17
+ if location is not None:
18
+ image_locations.append(location)
19
+ return image_locations
20
+
21
+
22
+ def get_gold_image_location(request_state: RequestState) -> str:
23
+ """Returns the first gold image location."""
24
+ references: List[Reference] = request_state.instance.references
25
+ assert (
26
+ len(references) > 0
27
+ and references[0].output.multimedia_content is not None
28
+ and references[0].output.multimedia_content.size > 0
29
+ and references[0].output.multimedia_content.media_objects[0].location is not None
30
+ ), "Expected at least one gold image"
31
+ return references[0].output.multimedia_content.media_objects[0].location
@@ -0,0 +1,29 @@
1
+ from dataclasses import dataclass, field
2
+ from typing import List, Optional, Dict
3
+
4
+
5
+ @dataclass(frozen=True)
6
+ class NudityCheckRequest:
7
+ """
8
+ Checks for nudity for a given set of images.
9
+ """
10
+
11
+ # Batch of images
12
+ image_locations: List[str] = field(default_factory=list)
13
+
14
+
15
+ @dataclass(frozen=True)
16
+ class NudityCheckResult:
17
+ """Result after sending a `NudityCheckRequest`."""
18
+
19
+ # Whether the request was successful
20
+ success: bool
21
+
22
+ # Whether the request was cached
23
+ cached: bool
24
+
25
+ # Nudity results. True indicates the particular image contains nudity.
26
+ image_to_nudity: Dict[str, bool] = field(default_factory=dict)
27
+
28
+ # If `success` is false, what was the error?
29
+ error: Optional[str] = None