crfm-helm 0.4.0__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (499)
  1. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/METADATA +138 -31
  2. crfm_helm-0.5.1.dist-info/RECORD +654 -0
  3. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +31 -3
  5. helm/benchmark/adaptation/adapters/adapter.py +2 -2
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -4
  9. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +2 -3
  10. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  11. helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
  12. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  13. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +2 -1
  14. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
  15. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  16. helm/benchmark/adaptation/adapters/test_generation_adapter.py +32 -8
  17. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +7 -19
  18. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +60 -6
  19. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  20. helm/benchmark/adaptation/request_state.py +6 -1
  21. helm/benchmark/adaptation/scenario_state.py +6 -2
  22. helm/benchmark/annotation/annotator.py +43 -0
  23. helm/benchmark/annotation/annotator_factory.py +61 -0
  24. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  25. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  26. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  27. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  28. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  29. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  30. helm/benchmark/annotation_executor.py +124 -0
  31. helm/benchmark/augmentations/data_augmenter.py +0 -2
  32. helm/benchmark/augmentations/gender_perturbation.py +1 -1
  33. helm/benchmark/augmentations/perturbation.py +25 -3
  34. helm/benchmark/augmentations/perturbation_description.py +1 -1
  35. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  36. helm/benchmark/augmentations/test_perturbation.py +41 -7
  37. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  38. helm/benchmark/config_registry.py +7 -1
  39. helm/benchmark/executor.py +46 -16
  40. helm/benchmark/huggingface_registration.py +20 -7
  41. helm/benchmark/metrics/basic_metrics.py +169 -664
  42. helm/benchmark/metrics/bbq_metrics.py +3 -4
  43. helm/benchmark/metrics/bias_metrics.py +6 -6
  44. helm/benchmark/metrics/classification_metrics.py +11 -8
  45. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  46. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  47. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  48. helm/benchmark/metrics/common_metric_specs.py +167 -0
  49. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  50. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  51. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  52. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  53. helm/benchmark/metrics/disinformation_metrics.py +4 -110
  54. helm/benchmark/metrics/dry_run_metrics.py +2 -2
  55. helm/benchmark/metrics/efficiency_metrics.py +213 -0
  56. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  57. helm/benchmark/metrics/evaluate_reference_metrics.py +392 -0
  58. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  59. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  60. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  61. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  62. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  63. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  64. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  65. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  66. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  67. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  68. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  69. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  70. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  71. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  72. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  73. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  74. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  75. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  76. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  77. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  78. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  79. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  80. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  81. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  82. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  83. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  84. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  85. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  86. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  87. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  88. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  89. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  90. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  91. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  92. helm/benchmark/metrics/machine_translation_metrics.py +89 -0
  93. helm/benchmark/metrics/metric.py +93 -172
  94. helm/benchmark/metrics/metric_name.py +0 -1
  95. helm/benchmark/metrics/metric_service.py +16 -0
  96. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  97. helm/benchmark/metrics/ranking_metrics.py +2 -2
  98. helm/benchmark/metrics/reference_metric.py +148 -0
  99. helm/benchmark/metrics/summac/model_summac.py +0 -2
  100. helm/benchmark/metrics/summarization_metrics.py +2 -2
  101. helm/benchmark/metrics/test_classification_metrics.py +8 -5
  102. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  103. helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +5 -1
  104. helm/benchmark/metrics/test_metric.py +2 -2
  105. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +10 -2
  106. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  107. helm/benchmark/metrics/toxicity_utils.py +23 -0
  108. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  109. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  110. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  111. helm/benchmark/metrics/vision_language/image_metrics.py +575 -0
  112. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  113. helm/benchmark/model_deployment_registry.py +74 -0
  114. helm/benchmark/model_metadata_registry.py +41 -1
  115. helm/benchmark/multi_gpu_runner.py +133 -0
  116. helm/benchmark/presentation/create_plots.py +8 -7
  117. helm/benchmark/presentation/run_display.py +26 -10
  118. helm/benchmark/presentation/schema.py +15 -40
  119. helm/benchmark/presentation/summarize.py +119 -79
  120. helm/benchmark/presentation/table.py +8 -8
  121. helm/benchmark/presentation/test_contamination.py +2 -2
  122. helm/benchmark/presentation/test_run_entry.py +1 -2
  123. helm/benchmark/presentation/test_summarize.py +3 -3
  124. helm/benchmark/run.py +54 -26
  125. helm/benchmark/run_expander.py +205 -35
  126. helm/benchmark/run_spec.py +93 -0
  127. helm/benchmark/run_spec_factory.py +163 -0
  128. helm/benchmark/run_specs/__init__.py +0 -0
  129. helm/benchmark/run_specs/classic_run_specs.py +1510 -0
  130. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  131. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  132. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  133. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  134. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  135. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  136. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  137. helm/benchmark/run_specs/vlm_run_specs.py +757 -0
  138. helm/benchmark/runner.py +51 -57
  139. helm/benchmark/runner_config_registry.py +21 -0
  140. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  141. helm/benchmark/scenarios/bold_scenario.py +2 -2
  142. helm/benchmark/scenarios/code_scenario.py +1 -0
  143. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  144. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  145. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  146. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  147. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  148. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  149. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  150. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  151. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  152. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  153. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  154. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  155. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  156. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  157. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  158. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  159. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  160. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  161. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  162. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  163. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  164. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  165. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  166. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  167. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  168. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  169. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  170. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  171. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  172. helm/benchmark/scenarios/legalbench_scenario.py +6 -2
  173. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  174. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  175. helm/benchmark/scenarios/math_scenario.py +19 -2
  176. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  177. helm/benchmark/scenarios/numeracy_scenario.py +1 -1
  178. helm/benchmark/scenarios/opinions_qa_scenario.py +0 -4
  179. helm/benchmark/scenarios/scenario.py +4 -0
  180. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  181. helm/benchmark/scenarios/test_math_scenario.py +6 -0
  182. helm/benchmark/scenarios/test_scenario.py +6 -3
  183. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  184. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  185. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  186. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  187. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  188. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
  189. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  190. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
  191. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
  192. helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
  193. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +94 -0
  194. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  195. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  196. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  197. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  198. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  199. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  200. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  201. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  202. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  203. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  204. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  205. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  206. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
  207. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  208. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
  209. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  210. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  211. helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
  212. helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
  213. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  214. helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
  215. helm/benchmark/scenarios/vision_language/pairs_scenario.py +246 -0
  216. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  217. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  218. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  219. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +3 -4
  220. helm/benchmark/scenarios/vision_language/vqa_scenario.py +5 -3
  221. helm/benchmark/scenarios/wmt_14_scenario.py +1 -1
  222. helm/benchmark/server.py +24 -1
  223. helm/benchmark/slurm_runner.py +70 -49
  224. helm/benchmark/static/benchmarking.js +1 -1
  225. helm/benchmark/static/schema_classic.yaml +258 -1066
  226. helm/benchmark/static/schema_image2structure.yaml +304 -0
  227. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  228. helm/benchmark/static/schema_lite.yaml +2 -227
  229. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  230. helm/benchmark/static/schema_unitxt.yaml +428 -0
  231. helm/benchmark/static/schema_vhelm_lite.yaml +164 -0
  232. helm/benchmark/static/schema_vlm.yaml +823 -0
  233. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  234. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  235. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  236. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  237. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  238. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  239. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  240. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  241. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  242. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  243. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  244. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  245. helm/benchmark/static_build/assets/index-737eef9e.js +10 -0
  246. helm/benchmark/static_build/assets/index-878a1094.css +1 -0
  247. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  248. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  249. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  250. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  251. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  252. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  253. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  254. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  255. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  256. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  257. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  258. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  259. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  260. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  261. helm/benchmark/static_build/config.js +4 -0
  262. helm/benchmark/static_build/index.html +20 -0
  263. helm/benchmark/test_data_preprocessor.py +3 -3
  264. helm/benchmark/test_run_expander.py +1 -1
  265. helm/benchmark/window_services/ai21_window_service.py +22 -33
  266. helm/benchmark/window_services/cohere_window_service.py +1 -63
  267. helm/benchmark/window_services/default_window_service.py +2 -44
  268. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  269. helm/benchmark/window_services/ice_window_service.py +0 -34
  270. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  271. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  272. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  273. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  274. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  275. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  276. helm/benchmark/window_services/local_window_service.py +21 -4
  277. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  278. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  279. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  280. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  281. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  282. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  283. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  284. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  285. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  286. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  287. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  288. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  289. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  290. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  291. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  292. helm/benchmark/window_services/test_utils.py +3 -2
  293. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  294. helm/benchmark/window_services/window_service.py +42 -0
  295. helm/benchmark/window_services/window_service_factory.py +4 -1
  296. helm/benchmark/window_services/yalm_window_service.py +0 -27
  297. helm/clients/__init__.py +0 -0
  298. helm/{proxy/clients → clients}/ai21_client.py +3 -9
  299. helm/clients/aleph_alpha_client.py +112 -0
  300. helm/{proxy/clients → clients}/anthropic_client.py +233 -18
  301. helm/{proxy/clients → clients}/auto_client.py +59 -31
  302. helm/clients/bedrock_client.py +128 -0
  303. helm/clients/bedrock_utils.py +72 -0
  304. helm/{proxy/clients → clients}/client.py +65 -7
  305. helm/clients/clip_score_client.py +49 -0
  306. helm/clients/clip_scorers/__init__.py +0 -0
  307. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  308. helm/clients/clip_scorers/clip_scorer.py +50 -0
  309. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  310. helm/{proxy/clients → clients}/cohere_client.py +4 -11
  311. helm/clients/gcs_client.py +82 -0
  312. helm/{proxy/clients → clients}/google_client.py +5 -5
  313. helm/clients/google_translate_client.py +35 -0
  314. helm/{proxy/clients → clients}/http_model_client.py +5 -7
  315. helm/{proxy/clients → clients}/huggingface_client.py +43 -64
  316. helm/clients/image_generation/__init__.py +0 -0
  317. helm/clients/image_generation/adobe_vision_client.py +78 -0
  318. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  319. helm/clients/image_generation/cogview2/__init__.py +0 -0
  320. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  321. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  322. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  323. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  324. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  325. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  326. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  327. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  328. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  329. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  330. helm/clients/image_generation/cogview2_client.py +191 -0
  331. helm/clients/image_generation/dalle2_client.py +192 -0
  332. helm/clients/image_generation/dalle3_client.py +108 -0
  333. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  334. helm/clients/image_generation/dalle_mini/data.py +442 -0
  335. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  336. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  337. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  338. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  339. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  340. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  341. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  342. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  343. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  344. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  345. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  346. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  347. helm/clients/image_generation/dalle_mini_client.py +190 -0
  348. helm/clients/image_generation/deep_floyd_client.py +78 -0
  349. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  350. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  351. helm/clients/image_generation/lexica_client.py +86 -0
  352. helm/clients/image_generation/mindalle/__init__.py +0 -0
  353. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  354. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  355. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  356. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  357. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  358. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  359. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  360. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  361. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  362. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  363. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  364. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  365. helm/clients/image_generation/mindalle_client.py +115 -0
  366. helm/clients/image_generation/nudity_check_client.py +64 -0
  367. helm/clients/image_generation/together_image_generation_client.py +111 -0
  368. helm/{proxy/clients → clients}/lit_gpt_client.py +4 -4
  369. helm/{proxy/clients → clients}/megatron_client.py +5 -5
  370. helm/clients/mistral_client.py +134 -0
  371. helm/clients/moderation_api_client.py +109 -0
  372. helm/clients/open_lm_client.py +43 -0
  373. helm/clients/openai_client.py +301 -0
  374. helm/{proxy/clients → clients}/palmyra_client.py +6 -8
  375. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  376. helm/clients/simple_client.py +64 -0
  377. helm/{proxy/clients → clients}/test_auto_client.py +13 -15
  378. helm/clients/test_client.py +100 -0
  379. helm/{proxy/clients → clients}/test_huggingface_client.py +15 -16
  380. helm/clients/test_simple_client.py +19 -0
  381. helm/{proxy/clients → clients}/test_together_client.py +20 -8
  382. helm/{proxy/clients → clients}/together_client.py +104 -73
  383. helm/clients/vertexai_client.py +400 -0
  384. helm/clients/vision_language/__init__.py +0 -0
  385. helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
  386. helm/clients/vision_language/huggingface_vlm_client.py +111 -0
  387. helm/{proxy/clients → clients}/vision_language/idefics_client.py +54 -49
  388. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  389. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  390. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  391. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  392. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  393. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  394. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  395. helm/clients/vision_language/open_flamingo_client.py +155 -0
  396. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  397. helm/clients/vllm_client.py +46 -0
  398. helm/common/cache.py +16 -4
  399. helm/common/cache_backend_config.py +47 -0
  400. helm/common/clip_score_request.py +41 -0
  401. helm/common/file_caches/__init__.py +0 -0
  402. helm/common/file_caches/file_cache.py +16 -0
  403. helm/common/file_caches/local_file_cache.py +61 -0
  404. helm/common/file_caches/test_local_file_cache.py +25 -0
  405. helm/common/file_upload_request.py +27 -0
  406. helm/common/general.py +1 -1
  407. helm/common/image_generation_parameters.py +25 -0
  408. helm/common/images_utils.py +33 -3
  409. helm/common/key_value_store.py +35 -4
  410. helm/common/media_object.py +13 -0
  411. helm/common/moderations_api_request.py +71 -0
  412. helm/common/mongo_key_value_store.py +3 -3
  413. helm/common/multimodal_request_utils.py +31 -0
  414. helm/common/nudity_check_request.py +29 -0
  415. helm/common/request.py +15 -17
  416. helm/common/test_general.py +6 -0
  417. helm/common/tokenization_request.py +1 -1
  418. helm/config/model_deployments.yaml +1159 -538
  419. helm/config/model_metadata.yaml +868 -41
  420. helm/config/tokenizer_configs.yaml +149 -43
  421. helm/proxy/accounts.py +31 -4
  422. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  423. helm/proxy/critique/model_critique_client.py +8 -6
  424. helm/proxy/example_queries.py +29 -17
  425. helm/proxy/server.py +70 -5
  426. helm/proxy/services/remote_service.py +31 -0
  427. helm/proxy/services/server_service.py +96 -16
  428. helm/proxy/services/service.py +30 -0
  429. helm/proxy/services/test_remote_service.py +4 -3
  430. helm/proxy/services/test_service.py +0 -12
  431. helm/proxy/test_accounts.py +32 -0
  432. helm/proxy/token_counters/auto_token_counter.py +37 -37
  433. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  434. helm/proxy/token_counters/token_counter.py +3 -5
  435. helm/tokenizers/__init__.py +0 -0
  436. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  437. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +1 -1
  438. helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +6 -9
  439. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  440. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  441. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +7 -26
  442. helm/tokenizers/simple_tokenizer.py +33 -0
  443. helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +1 -1
  444. helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +3 -0
  445. helm/tokenizers/test_simple_tokenizer.py +33 -0
  446. helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
  447. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  448. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  449. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  450. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  451. crfm_helm-0.4.0.dist-info/RECORD +0 -397
  452. helm/benchmark/run_specs.py +0 -2762
  453. helm/benchmark/test_model_deployment_definition.py +0 -92
  454. helm/benchmark/test_model_properties.py +0 -1570
  455. helm/benchmark/vlm_run_specs.py +0 -97
  456. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  457. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  458. helm/benchmark/window_services/huggingface_window_service.py +0 -60
  459. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  460. helm/benchmark/window_services/t511b_window_service.py +0 -30
  461. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  462. helm/benchmark/window_services/ul2_window_service.py +0 -30
  463. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  464. helm/common/cache_utils.py +0 -14
  465. helm/proxy/clients/aleph_alpha_client.py +0 -95
  466. helm/proxy/clients/goose_ai_client.py +0 -99
  467. helm/proxy/clients/microsoft_client.py +0 -180
  468. helm/proxy/clients/openai_client.py +0 -206
  469. helm/proxy/clients/simple_client.py +0 -60
  470. helm/proxy/clients/test_client.py +0 -49
  471. helm/proxy/clients/vertexai_client.py +0 -115
  472. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  473. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  474. helm/proxy/token_counters/free_token_counter.py +0 -12
  475. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  476. helm/proxy/token_counters/openai_token_counter.py +0 -22
  477. helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
  478. helm/proxy/token_counters/test_openai_token_counter.py +0 -81
  479. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  480. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/LICENSE +0 -0
  481. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/entry_points.txt +0 -0
  482. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/top_level.txt +0 -0
  483. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  484. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  485. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  486. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  487. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  488. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  489. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  490. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  491. /helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +0 -0
  492. /helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +0 -0
  493. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  494. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  495. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  496. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  497. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  498. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  499. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
@@ -0,0 +1,72 @@
1
+ """Helper utilities for working with Amazon Bedrock."""
2
+
3
+ import os
4
+ from typing import Optional
5
+
6
+ from helm.common.hierarchical_logger import hlog
7
+ from helm.common.optional_dependencies import handle_module_not_found_error
8
+
9
+ try:
10
+ import boto3
11
+ from botocore.config import Config
12
+ except ModuleNotFoundError as e:
13
+ handle_module_not_found_error(e, ["aws"])
14
+
15
+
16
# From https://github.com/aws-samples/amazon-bedrock-workshop/blob/main/01_Generation/00_generate_w_bedrock.ipynb
# MIT-0 Licensed
def get_bedrock_client(
    assumed_role: Optional[str] = None,
    region: Optional[str] = None,
    runtime: Optional[bool] = True,
):
    """Build a boto3 client for Amazon Bedrock, with optional configuration overrides.

    Parameters
    ----------
    assumed_role :
        Optional ARN of an AWS IAM role to assume before calling the Bedrock service.
        When omitted, the currently active credentials are used.
    region :
        Optional AWS Region name (e.g. "us-east-1"). When omitted, the AWS_REGION or
        AWS_DEFAULT_REGION environment variable is consulted instead.
    runtime :
        When truthy (the default), a "bedrock-runtime" client is returned for inference;
        otherwise a control-plane "bedrock" client is returned.
    """
    # Resolve the target region: explicit argument wins over environment variables.
    if region is None:
        target_region = os.environ.get("AWS_REGION", os.environ.get("AWS_DEFAULT_REGION"))
    else:
        target_region = region

    session_kwargs = {"region_name": target_region}
    client_kwargs = dict(session_kwargs)

    # A named profile only affects the session, not the client itself.
    profile_name = os.environ.get("AWS_PROFILE")
    if profile_name:
        session_kwargs["profile_name"] = profile_name

    retry_config = Config(
        region_name=target_region,
        retries={"max_attempts": 10, "mode": "standard"},
    )
    session = boto3.Session(**session_kwargs)

    if assumed_role:
        # Exchange the current credentials for temporary ones tied to the given role.
        sts = session.client("sts")
        credentials = sts.assume_role(RoleArn=str(assumed_role), RoleSessionName="crfm-helm")["Credentials"]
        client_kwargs["aws_access_key_id"] = credentials["AccessKeyId"]
        client_kwargs["aws_secret_access_key"] = credentials["SecretAccessKey"]
        client_kwargs["aws_session_token"] = credentials["SessionToken"]

    service_name = "bedrock-runtime" if runtime else "bedrock"

    bedrock_client = session.client(service_name=service_name, config=retry_config, **client_kwargs)

    hlog(f"Amazon Bedrock client successfully created with endpoint {bedrock_client._endpoint}")
    return bedrock_client
@@ -1,11 +1,13 @@
1
1
  import json
2
2
  from abc import ABC, abstractmethod
3
- from typing import Dict, List, Optional
3
+ from typing import List, Mapping, Optional, cast
4
4
 
5
5
  from helm.common.hierarchical_logger import hlog
6
6
  from helm.common.media_object import MultimediaObject, TEXT_TYPE
7
- from helm.common.request import Request, RequestResult, Sequence, Token
7
+ from helm.common.request import Request, RequestResult, GeneratedOutput, Token
8
8
  from helm.common.cache import Cache, CacheConfig
9
+ from helm.common.tokenization_request import DecodeRequest, TokenizationRequest
10
+ from helm.tokenizers.tokenizer import Tokenizer
9
11
 
10
12
 
11
13
  class Client(ABC):
@@ -30,24 +32,28 @@ class CachingClient(Client):
30
32
  self.cache = Cache(cache_config) if cache_config is not None else None
31
33
 
32
34
  @staticmethod
33
- def make_cache_key(raw_request: Dict, request: Request) -> Dict:
35
+ def make_cache_key(raw_request: Mapping, request: Request) -> Mapping:
34
36
  """
35
37
  Construct the key for the cache using the raw request.
36
38
  Add `request.random` to the key, if defined.
37
39
  """
38
40
  if request.random is not None:
39
41
  assert "random" not in raw_request
40
- cache_key = {**raw_request, "random": request.random}
42
+ cache_key: Mapping = {**raw_request, "random": request.random}
41
43
  else:
42
44
  cache_key = raw_request
43
45
  return cache_key
44
46
 
45
47
 
46
- def truncate_sequence(sequence: Sequence, request: Request, print_warning: bool = True) -> Sequence:
48
+ def truncate_sequence(sequence: GeneratedOutput, request: Request, print_warning: bool = True) -> GeneratedOutput:
47
49
  """
48
50
  Certain providers have bugs where they aren't respecting max_tokens,
49
51
  stop_sequences and the end of text token, so as a hack, we have to manually
50
52
  truncate the suffix of `sequence` and `tokens` as a post-hoc process.
53
+
54
+ This method is unsafe and may produce warnings or incorrect results.
55
+ Prefer using the safer truncate_and_tokenize_response_text() method instead
56
+ if your use case satisfies its requirements.
51
57
  """
52
58
  # TODO: if echo_prompt, then we should only ignore the prompt, but we don't
53
59
  # know how many tokens the prompt takes up.
@@ -87,7 +93,7 @@ def truncate_sequence(sequence: Sequence, request: Request, print_warning: bool
87
93
  if print_warning:
88
94
  hlog(f"WARNING: truncate_sequence needs to strip {json.dumps(stop)}")
89
95
 
90
- sequence = Sequence(text=new_text, logprob=new_logprob, tokens=new_tokens)
96
+ sequence = GeneratedOutput(text=new_text, logprob=new_logprob, tokens=new_tokens)
91
97
 
92
98
  # Truncate based on the max number of tokens.
93
99
  if len(sequence.tokens) > request.max_tokens:
@@ -104,11 +110,63 @@ def truncate_sequence(sequence: Sequence, request: Request, print_warning: bool
104
110
 
105
111
  new_logprob = sum(token.logprob for token in new_tokens)
106
112
 
107
- sequence = Sequence(text=new_text, logprob=new_logprob, tokens=new_tokens)
113
+ sequence = GeneratedOutput(text=new_text, logprob=new_logprob, tokens=new_tokens)
108
114
 
109
115
  return sequence
110
116
 
111
117
 
118
def truncate_and_tokenize_response_text(
    text: str, request: Request, tokenizer: Tokenizer, tokenizer_name: str, original_finish_reason: str = "endoftext"
) -> GeneratedOutput:
    """Truncate a string-only response to respect stop_sequences and max_tokens.

    This can only be used if all of the following conditions are true:

    - You have access to the tokenizer.
    - The request has echo_prompt = False.
    - The tokenizer supports encoding and decoding.
    - The tokenizer's tokenize() method supports truncation.
    - The model's response is text-only.
    - The model's response does not already provide the tokenized text.
    - The model's response does not provide logprobs.

    This method is safer than truncate_sequence() and should be preferred if the above conditions are met.
    Unlike truncate_sequence(), this method will not produce warnings or incorrect results.
    This is because the tokens are re-derived from the truncated text using the tokenizer,
    so the text and the tokens in the result are guaranteed to match."""
    # Finish reason strings are taken from basic_metrics._compute_finish_reason_metrics()
    finish_reason: str = original_finish_reason
    if request.echo_prompt:
        raise Exception("truncate_and_tokenize_response_text() does not support requests with echo_prompt = True")

    # Cut the text at each stop sequence that actually occurs; repeated passes
    # are equivalent to cutting at the earliest occurrence over all stops.
    for stop_sequence in request.stop_sequences:
        cut_at = text.find(stop_sequence)
        if cut_at >= 0:
            text = text[:cut_at]
            finish_reason = "stop"

    token_strings = cast(
        List[str], tokenizer.tokenize(TokenizationRequest(text=text, tokenizer=tokenizer_name)).raw_tokens
    )
    if len(token_strings) > request.max_tokens:
        # Too long: re-encode with truncation, decode back to text, then re-tokenize
        # so the returned token strings match the truncated text exactly.
        encoded_ints = cast(
            List[int],
            tokenizer.tokenize(
                TokenizationRequest(
                    text=text, tokenizer=tokenizer_name, encode=True, truncation=True, max_length=request.max_tokens
                )
            ).raw_tokens,
        )
        text = tokenizer.decode(DecodeRequest(encoded_ints, tokenizer_name)).text
        token_strings = cast(
            List[str], tokenizer.tokenize(TokenizationRequest(text=text, tokenizer=tokenizer_name)).raw_tokens
        )
        finish_reason = "length"
    tokens = [Token(text=token_string, logprob=0.0) for token_string in token_strings]
    return GeneratedOutput(text=text, logprob=0.0, tokens=tokens, finish_reason={"reason": finish_reason})
168
+
169
+
112
170
  def cleanup_str(token: str, tokenizer_name: Optional[str] = None) -> str:
113
171
  """
114
172
  Certain tokenizers introduce special characters to represent spaces, such as
@@ -0,0 +1,49 @@
1
+ from typing import Dict, Optional
2
+ from dataclasses import asdict
3
+
4
+ from helm.common.cache import Cache, CacheConfig
5
+ from helm.common.clip_score_request import DEFAULT_CLIP_SCORE_MODEL, CLIPScoreRequest, CLIPScoreResult
6
+ from helm.clients.clip_scorers.base_clip_scorer import BaseCLIPScorer
7
+
8
+
9
class CLIPScoreClientError(Exception):
    """Raised when computing a CLIPScore fails for any reason."""

    pass
11
+
12
+
13
class CLIPScoreClient:
    """Computes CLIPScores for caption/image pairs, caching results on disk."""

    def __init__(self, cache_config: CacheConfig):
        self.cache = Cache(cache_config)
        # The scorer is loaded lazily so a cache hit never pays the model start-up cost.
        self._clip_scorer: Optional[BaseCLIPScorer] = None

    def compute_score(self, request: CLIPScoreRequest) -> CLIPScoreResult:
        """
        Compute a CLIPScore for a given caption and image.
        """
        # TODO: support multilingual CLIPScore and other CLIP models.
        assert request.model == DEFAULT_CLIP_SCORE_MODEL, f"Unsupported model: {request.model}"
        assert not request.multilingual

        try:

            def do_it():
                if self._clip_scorer is None:
                    from helm.clients.clip_scorers.clip_scorer import CLIPScorer

                    self._clip_scorer = CLIPScorer()

                return {
                    "score": self._clip_scorer.compute_score(
                        caption=request.caption, image_location=request.image_location
                    )
                }

            cache_key: Dict = asdict(request)
            results, cached = self.cache.get(cache_key, do_it)

        except Exception as e:
            # Wrap any failure (model load, scoring, caching) in a client-specific error.
            raise CLIPScoreClientError(e)

        return CLIPScoreResult(success=True, cached=cached, score=results["score"])
File without changes
@@ -0,0 +1,18 @@
1
+ from abc import abstractmethod, ABC
2
+ from typing import List
3
+
4
+
5
class BaseCLIPScorer(ABC):
    """Interface for scorers that rate how well a caption describes an image."""

    @abstractmethod
    def compute_score(self, caption: str, image_location: str) -> float:
        pass

    def select_best_image(self, caption: str, image_locations: List[str]) -> str:
        """Selects the image from a list of images with the highest CLIPScore given the caption."""
        assert len(image_locations) > 0, "Need at least one image"

        # Nothing to compare with a single candidate.
        if len(image_locations) == 1:
            return image_locations[0]

        # max() returns the first candidate achieving the highest score (stable on ties).
        return max(image_locations, key=lambda location: self.compute_score(caption, location))
@@ -0,0 +1,50 @@
1
+ from typing import Literal
2
+
3
+ from torchvision import transforms
4
+ import torch
5
+
6
+ from helm.common.gpu_utils import get_torch_device
7
+ from helm.common.images_utils import open_image
8
+ from helm.common.optional_dependencies import handle_module_not_found_error
9
+ from .base_clip_scorer import BaseCLIPScorer
10
+
11
+
12
+ _ = torch.manual_seed(42)
13
+
14
+
15
class CLIPScorer(BaseCLIPScorer):
    """
    CLIPScore is a reference free metric that can be used to evaluate the correlation between an image
    caption and the content of the image. It has been found to be highly correlated with human judgement.
    Paper: https://arxiv.org/abs/2104.08718

    We use the TorchMetrics implementation:
    https://torchmetrics.readthedocs.io/en/stable/multimodal/clip_score.html.
    The score is bound between 0 and 100, where a score closer to 100 is better.

    Verified implementation against the scores of image-caption pairs from
    https://wandb.ai/dalle-mini/dalle-mini/reports/OpenAI-CLIP-Score-exploration--VmlldzoxNjMwODM1.
    """

    def __init__(
        self,
        model_name: Literal[
            "openai/clip-vit-base-patch16",
            "openai/clip-vit-base-patch32",
            "openai/clip-vit-large-patch14-336",
            "openai/clip-vit-large-patch14",
        ] = "openai/clip-vit-large-patch14",
    ):
        # torchmetrics is only available with the optional "heim" extras.
        try:
            from torchmetrics.multimodal import CLIPScore
        except ModuleNotFoundError as e:
            handle_module_not_found_error(e, ["heim"])

        self._device: torch.device = get_torch_device()
        self._metric = CLIPScore(model_name_or_path=model_name).to(self._device)

    def compute_score(self, caption: str, image_location: str) -> float:
        # Load the image and convert it into a tensor on the metric's device.
        loaded_image = open_image(image_location)
        image_tensor: torch.Tensor = transforms.ToTensor()(loaded_image).to(self._device)
        result: float = self._metric(image_tensor, caption).detach().item()
        return result
@@ -0,0 +1,50 @@
1
+ import torch
2
+ import transformers
3
+
4
+ from helm.common.gpu_utils import get_torch_device, get_torch_device_name
5
+ from helm.common.images_utils import open_image
6
+ from helm.common.optional_dependencies import handle_module_not_found_error
7
+ from .base_clip_scorer import BaseCLIPScorer
8
+
9
+ _ = torch.manual_seed(42)
10
+
11
+
12
class MultilingualCLIPScorer(BaseCLIPScorer):
    """
    Multilingual-CLIP extends OpenAI's English text encoders to multiple other languages.
    Adapted from https://huggingface.co/M-CLIP/XLM-Roberta-Large-Vit-L-14
    """

    TEXT_MODEL_NAME: str = "M-CLIP/XLM-Roberta-Large-Vit-L-14"
    IMAGE_MODEL_NAME: str = "ViT-L/14"

    def __init__(self):
        # clip and multilingual_clip ship with the optional "heim" extras.
        try:
            import clip
            from multilingual_clip import pt_multilingual_clip
        except ModuleNotFoundError as e:
            handle_module_not_found_error(e, ["heim"])

        super().__init__()
        self._device: torch.device = get_torch_device()
        self._text_model = pt_multilingual_clip.MultilingualCLIP.from_pretrained(self.TEXT_MODEL_NAME)
        self._tokenizer = transformers.AutoTokenizer.from_pretrained(self.TEXT_MODEL_NAME)
        self._model, self._preprocess = clip.load(self.IMAGE_MODEL_NAME, device=get_torch_device_name())

    def compute_score(self, caption: str, image_location: str) -> float:
        # Embed the caption, L2-normalize, then move onto the target device.
        text_features = self._text_model.forward(caption, self._tokenizer)
        text_features = text_features / text_features.norm(p=2, dim=-1, keepdim=True)
        text_features = text_features.to(self._device)

        image_input = self._preprocess(open_image(image_location)).unsqueeze(0).to(self._device)

        # Embed the image without tracking gradients and L2-normalize as well.
        with torch.no_grad():
            image_features = self._model.encode_image(image_input)
            image_features = image_features / image_features.norm(p=2, dim=-1, keepdim=True)

        # Cosine similarity of the normalized embeddings, scaled to [0, 100].
        similarity = 100 * (image_features * text_features).sum(axis=-1)
        return similarity.detach().item()
@@ -8,7 +8,7 @@ from helm.common.request import (
8
8
  EMBEDDING_UNAVAILABLE_REQUEST_RESULT,
9
9
  Request,
10
10
  RequestResult,
11
- Sequence,
11
+ GeneratedOutput,
12
12
  Token,
13
13
  )
14
14
  from .client import CachingClient, truncate_sequence
@@ -120,7 +120,7 @@ class CohereClient(CachingClient):
120
120
  error: str = f"CohereClient error: {e}"
121
121
  return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
122
122
 
123
- completions: List[Sequence] = []
123
+ completions: List[GeneratedOutput] = []
124
124
  for generation in response["generations"]:
125
125
  # From https://docs.cohere.ai/generate-reference, "the likelihood refers to the average log-likelihood
126
126
  # of the entire specified string..." What we want is the sum of the log probabilities of all tokens.
@@ -132,14 +132,7 @@ class CohereClient(CachingClient):
132
132
  logprob: float = token_likelihood.get("likelihood", 0)
133
133
  sequence_logprob += logprob
134
134
 
135
- tokens.append(
136
- Token(
137
- text=token_likelihood["token"],
138
- logprob=logprob,
139
- # Cohere does not include the top log probs in the response
140
- top_logprobs={},
141
- )
142
- )
135
+ tokens.append(Token(text=token_likelihood["token"], logprob=logprob))
143
136
 
144
137
  sequence_text: str = generation["text"]
145
138
  if request.echo_prompt and request.max_tokens > 0:
@@ -147,7 +140,7 @@ class CohereClient(CachingClient):
147
140
  # `return_likelihoods` is "ALL" and `max_tokens` is greater than 0.
148
141
  sequence_text = request.prompt + sequence_text
149
142
 
150
- completion: Sequence = Sequence(text=sequence_text, logprob=sequence_logprob, tokens=tokens)
143
+ completion: GeneratedOutput = GeneratedOutput(text=sequence_text, logprob=sequence_logprob, tokens=tokens)
151
144
  completion = truncate_sequence(completion, request)
152
145
  completions.append(completion)
153
146
 
@@ -0,0 +1,82 @@
1
+ from dataclasses import asdict
2
+ from typing import Dict, Optional
3
+ import requests
4
+
5
+ from helm.common.cache import Cache, CacheConfig
6
+ from helm.common.optional_dependencies import handle_module_not_found_error
7
+ from helm.common.hierarchical_logger import hlog
8
+ from helm.common.file_upload_request import FileUploadRequest, FileUploadResult
9
+
10
+
11
class GCSClientError(Exception):
    """Raised when uploading a file to GCS fails."""

    pass
13
+
14
+
15
class GCSClient:
    """
    Uploads files to GCS. Ensure the GOOGLE_APPLICATION_CREDENTIALS=/path/to/credentials.json
    environment variable is set.
    """

    # Number of HEAD requests made to verify the uploaded object is reachable.
    MAX_CHECK_ATTEMPTS: int = 10

    def __init__(self, bucket_name: str, cache_config: CacheConfig):
        try:
            from google.cloud import storage  # type: ignore
        except ModuleNotFoundError as e:
            handle_module_not_found_error(e, ["heim"])

        self._bucket_name: str = bucket_name
        self._cache = Cache(cache_config)
        # Created lazily on the first upload so construction stays cheap.
        self._storage_client: Optional[storage.Client] = None

    def upload(self, request: FileUploadRequest) -> FileUploadResult:
        """Uploads a file to GCS.

        Raises:
            GCSClientError: if the upload or the post-upload availability check fails.
        """
        try:
            from google.cloud import storage  # type: ignore
        except ModuleNotFoundError as e:
            handle_module_not_found_error(e, ["heim"])

        try:

            def do_it():
                if self._storage_client is None:
                    self._storage_client = storage.Client()

                bucket = self._storage_client.bucket(self._bucket_name)
                file_path: str = request.path
                blob = bucket.blob(file_path)

                # Optional: set a generation-match precondition to avoid potential race conditions
                # and data corruptions. The request to upload is aborted if the object's
                # generation number does not match your precondition. For a destination
                # object that does not yet exist, set the if_generation_match precondition to 0.
                # If the destination object already exists in your bucket, set instead a
                # generation-match precondition using its generation number.
                generation_match_precondition: int = 0

                blob.upload_from_filename(file_path, if_generation_match=generation_match_precondition)
                url: str = self._get_url(file_path)

                # Ensure the file was uploaded successfully. Use an explicit raise rather
                # than `assert`, which would be silently stripped under `python -O` and
                # skip this verification entirely. The outer handler wraps this in a
                # GCSClientError, matching the previous AssertionError behavior.
                uploaded: bool = False
                for _ in range(0, self.MAX_CHECK_ATTEMPTS):
                    check_response = requests.head(url)
                    if check_response.status_code == 200:
                        uploaded = True
                        break
                if not uploaded:
                    raise RuntimeError(f"File {file_path} was not uploaded successfully.")

                hlog(f"File {file_path} uploaded and is available at {url}.")
                return {"url": url}

            cache_key: Dict = asdict(request)
            result, cached = self._cache.get(cache_key, do_it)

        except Exception as e:
            raise GCSClientError(e)

        return FileUploadResult(success=True, cached=cached, url=result["url"])

    def _get_url(self, path: str) -> str:
        return f"https://storage.googleapis.com/{self._bucket_name}/{path}"
@@ -1,7 +1,7 @@
1
1
  from typing import List, Dict
2
2
 
3
3
  from helm.common.cache import CacheConfig
4
- from helm.common.request import Request, RequestResult, Sequence, Token
4
+ from helm.common.request import Request, RequestResult, GeneratedOutput, Token
5
5
  from .client import CachingClient, truncate_sequence
6
6
 
7
7
 
@@ -32,7 +32,7 @@ class GoogleClient(CachingClient):
32
32
 
33
33
  def make_request(self, request: Request) -> RequestResult:
34
34
  raw_request = GoogleClient.convert_to_raw_request(request)
35
- cache_key: Dict = CachingClient.make_cache_key(raw_request, request)
35
+ cache_key = CachingClient.make_cache_key(raw_request, request)
36
36
 
37
37
  try:
38
38
 
@@ -48,17 +48,17 @@ class GoogleClient(CachingClient):
48
48
  return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
49
49
 
50
50
  # Expect the result to be structured the same way as a response from OpenAI API.
51
- completions: List[Sequence] = []
51
+ completions: List[GeneratedOutput] = []
52
52
  for raw_completion in response["choices"]:
53
53
  sequence_logprob = 0
54
54
  tokens: List[Token] = []
55
55
 
56
56
  raw_data = raw_completion["logprobs"]
57
57
  for text, logprob in zip(raw_data["tokens"], raw_data["token_logprobs"]):
58
- tokens.append(Token(text=text, logprob=logprob or 0, top_logprobs=dict()))
58
+ tokens.append(Token(text=text, logprob=logprob or 0))
59
59
  sequence_logprob += logprob or 0
60
60
 
61
- completion = Sequence(
61
+ completion = GeneratedOutput(
62
62
  text=raw_completion["text"],
63
63
  logprob=sequence_logprob,
64
64
  tokens=tokens,
@@ -0,0 +1,35 @@
1
+ from typing import Optional
2
+
3
+ from helm.common.cache import Cache, SqliteCacheConfig
4
+ from helm.common.optional_dependencies import handle_module_not_found_error
5
+
6
+ try:
7
+ from google.cloud import translate_v2 as translate # type: ignore
8
+ except ModuleNotFoundError as e:
9
+ handle_module_not_found_error(e, ["heim"])
10
+
11
+
12
class GoogleTranslateClient:
    """
    Client for Google Translate.
    Follow the instructions at https://cloud.google.com/translate/docs/setup to use this client.

    # TODO: add this as a central service
    """

    def __init__(self, cache_path: str = "prod_env/cache/google_translate.sqlite"):
        self.cache = Cache(SqliteCacheConfig(cache_path))
        # The underlying API client is created lazily on the first translate() call.
        self.translate_client: Optional[translate.Client] = None

    def translate(self, text: str, target_language: str) -> str:
        def do_it():
            if self.translate_client is None:
                self.translate_client = translate.Client()

            result = self.translate_client.translate(text, target_language=target_language)
            # Drop the echoed input before caching to keep entries small.
            del result["input"]
            assert "translatedText" in result, f"Invalid response: {result}"
            return result

        cache_key = {"text": text, "target_language": target_language}
        response, _ = self.cache.get(cache_key, do_it)
        return response["translatedText"]
@@ -1,12 +1,13 @@
1
1
  import os
2
2
  from dataclasses import asdict
3
+ from typing import Any, Dict
3
4
 
4
5
  from helm.common.cache import CacheConfig
5
6
  from helm.common.request import (
6
7
  wrap_request_time,
7
8
  Request,
8
9
  RequestResult,
9
- Sequence,
10
+ GeneratedOutput,
10
11
  Token,
11
12
  EMBEDDING_UNAVAILABLE_REQUEST_RESULT,
12
13
  )
@@ -51,7 +52,7 @@ class HTTPModelClient(CachingClient):
51
52
 
52
53
  try:
53
54
 
54
- def do_it():
55
+ def do_it() -> Dict[str, Any]:
55
56
  url = f"{self.base_url}/process"
56
57
  response = requests.post(url, json=raw_request, timeout=self.timeout)
57
58
  response.raise_for_status()
@@ -63,11 +64,8 @@ class HTTPModelClient(CachingClient):
63
64
  else:
64
65
  response, cached = do_it(), False
65
66
 
66
- tokens = [
67
- Token(text=token["text"], logprob=token["logprob"], top_logprobs=token["top_logprob"])
68
- for token in response["tokens"]
69
- ]
70
- completions = [Sequence(text=response["text"], logprob=response["logprob"], tokens=tokens)]
67
+ tokens = [Token(text=token["text"], logprob=token["logprob"]) for token in response["tokens"]]
68
+ completions = [GeneratedOutput(text=response["text"], logprob=response["logprob"], tokens=tokens)]
71
69
 
72
70
  return RequestResult(
73
71
  success=True,