crfm-helm 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +144 -36
  2. crfm_helm-0.5.0.dist-info/RECORD +642 -0
  3. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +37 -2
  5. helm/benchmark/adaptation/adapters/adapter.py +4 -42
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/generation_adapter.py +2 -0
  9. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +21 -4
  10. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +12 -5
  11. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +1 -0
  12. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  13. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  14. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +5 -1
  15. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +1 -0
  16. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -0
  17. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  18. helm/benchmark/adaptation/adapters/test_generation_adapter.py +59 -14
  19. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +40 -5
  20. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +78 -10
  21. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  22. helm/benchmark/adaptation/prompt.py +7 -1
  23. helm/benchmark/adaptation/request_state.py +6 -1
  24. helm/benchmark/adaptation/scenario_state.py +6 -2
  25. helm/benchmark/annotation/annotator.py +43 -0
  26. helm/benchmark/annotation/annotator_factory.py +61 -0
  27. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  28. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  29. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  30. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  31. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  32. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  33. helm/benchmark/annotation_executor.py +124 -0
  34. helm/benchmark/augmentations/cleva_perturbation.py +7 -14
  35. helm/benchmark/augmentations/contraction_expansion_perturbation.py +3 -3
  36. helm/benchmark/augmentations/contrast_sets_perturbation.py +0 -3
  37. helm/benchmark/augmentations/data_augmenter.py +0 -2
  38. helm/benchmark/augmentations/dialect_perturbation.py +2 -2
  39. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  40. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  41. helm/benchmark/augmentations/gender_perturbation.py +3 -3
  42. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  43. helm/benchmark/augmentations/mild_mix_perturbation.py +2 -2
  44. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  45. helm/benchmark/augmentations/person_name_perturbation.py +0 -7
  46. helm/benchmark/augmentations/perturbation.py +20 -7
  47. helm/benchmark/augmentations/perturbation_description.py +1 -1
  48. helm/benchmark/augmentations/space_perturbation.py +2 -2
  49. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  50. helm/benchmark/augmentations/synonym_perturbation.py +2 -2
  51. helm/benchmark/augmentations/test_perturbation.py +11 -7
  52. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  53. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  54. helm/benchmark/config_registry.py +38 -0
  55. helm/benchmark/executor.py +46 -16
  56. helm/benchmark/huggingface_registration.py +37 -7
  57. helm/benchmark/metrics/basic_metrics.py +172 -641
  58. helm/benchmark/metrics/bbq_metrics.py +3 -4
  59. helm/benchmark/metrics/bias_metrics.py +6 -6
  60. helm/benchmark/metrics/classification_metrics.py +11 -8
  61. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  62. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  63. helm/benchmark/metrics/code_metrics.py +4 -3
  64. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  65. helm/benchmark/metrics/common_metric_specs.py +167 -0
  66. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  67. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  68. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  69. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  70. helm/benchmark/metrics/disinformation_metrics.py +6 -112
  71. helm/benchmark/metrics/dry_run_metrics.py +5 -3
  72. helm/benchmark/metrics/efficiency_metrics.py +206 -0
  73. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  74. helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
  75. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  76. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  77. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  78. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  79. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  80. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  81. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  82. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  83. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  84. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  85. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  86. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  87. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  88. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  89. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  90. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  91. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  92. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  93. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  94. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  95. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  96. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  97. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  98. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  99. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  100. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  101. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  102. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  103. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  104. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  105. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  106. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  107. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  108. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  109. helm/benchmark/metrics/machine_translation_metrics.py +5 -5
  110. helm/benchmark/metrics/metric.py +93 -172
  111. helm/benchmark/metrics/metric_name.py +0 -1
  112. helm/benchmark/metrics/metric_service.py +16 -0
  113. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  114. helm/benchmark/metrics/ranking_metrics.py +6 -7
  115. helm/benchmark/metrics/reference_metric.py +148 -0
  116. helm/benchmark/metrics/summac/model_summac.py +0 -2
  117. helm/benchmark/metrics/summarization_metrics.py +8 -8
  118. helm/benchmark/metrics/test_classification_metrics.py +9 -6
  119. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  120. helm/benchmark/metrics/test_evaluate_reference_metrics.py +30 -0
  121. helm/benchmark/metrics/test_metric.py +2 -2
  122. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +1 -1
  123. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +13 -3
  124. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  125. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -0
  126. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +9 -2
  127. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  128. helm/benchmark/metrics/toxicity_utils.py +23 -0
  129. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  130. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  131. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  132. helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
  133. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  134. helm/benchmark/model_deployment_registry.py +164 -41
  135. helm/benchmark/model_metadata_registry.py +181 -35
  136. helm/benchmark/multi_gpu_runner.py +133 -0
  137. helm/benchmark/presentation/contamination.py +3 -3
  138. helm/benchmark/presentation/create_plots.py +8 -7
  139. helm/benchmark/presentation/run_display.py +50 -17
  140. helm/benchmark/presentation/schema.py +28 -46
  141. helm/benchmark/presentation/summarize.py +213 -96
  142. helm/benchmark/presentation/table.py +8 -8
  143. helm/benchmark/presentation/test_contamination.py +2 -2
  144. helm/benchmark/presentation/test_run_entry.py +14 -9
  145. helm/benchmark/presentation/test_summarize.py +5 -0
  146. helm/benchmark/run.py +66 -54
  147. helm/benchmark/run_expander.py +342 -31
  148. helm/benchmark/run_spec.py +93 -0
  149. helm/benchmark/run_spec_factory.py +162 -0
  150. helm/benchmark/run_specs/__init__.py +0 -0
  151. helm/benchmark/{run_specs.py → run_specs/classic_run_specs.py} +217 -1330
  152. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  153. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  154. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  155. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  156. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  157. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  158. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  159. helm/benchmark/run_specs/vlm_run_specs.py +501 -0
  160. helm/benchmark/runner.py +116 -69
  161. helm/benchmark/runner_config_registry.py +21 -0
  162. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  163. helm/benchmark/scenarios/bold_scenario.py +2 -2
  164. helm/benchmark/scenarios/cleva_scenario.py +43 -46
  165. helm/benchmark/scenarios/code_scenario.py +3 -2
  166. helm/benchmark/scenarios/commonsense_scenario.py +171 -191
  167. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  168. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  169. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  170. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  171. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  172. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  173. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  174. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  175. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  176. helm/benchmark/scenarios/entity_matching_scenario.py +1 -1
  177. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  178. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  179. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  180. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  181. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  182. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  183. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  184. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  185. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  186. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  187. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  188. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  189. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  190. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  191. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  192. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  193. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  194. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  195. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  196. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  197. helm/benchmark/scenarios/legalbench_scenario.py +123 -0
  198. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  199. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  200. helm/benchmark/scenarios/lsat_qa_scenario.py +4 -2
  201. helm/benchmark/scenarios/math_scenario.py +19 -2
  202. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  203. helm/benchmark/scenarios/numeracy_scenario.py +3 -3
  204. helm/benchmark/scenarios/opinions_qa_scenario.py +6 -10
  205. helm/benchmark/scenarios/raft_scenario.py +2 -6
  206. helm/benchmark/scenarios/scenario.py +14 -2
  207. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  208. helm/benchmark/scenarios/test_math_scenario.py +22 -0
  209. helm/benchmark/scenarios/test_scenario.py +6 -3
  210. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  211. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  212. helm/benchmark/scenarios/the_pile_scenario.py +6 -7
  213. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  214. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  215. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  216. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  217. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
  218. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  219. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  220. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  221. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  222. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  223. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  224. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  225. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  226. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  227. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  228. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  229. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  230. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  231. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  232. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  233. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  234. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  235. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  236. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  237. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +107 -0
  238. helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
  239. helm/benchmark/scenarios/wmt_14_scenario.py +18 -18
  240. helm/benchmark/server.py +59 -2
  241. helm/benchmark/slurm_jobs.py +12 -0
  242. helm/benchmark/slurm_runner.py +79 -51
  243. helm/benchmark/static/benchmarking.js +3 -4
  244. helm/benchmark/static/contamination.yaml +1 -1
  245. helm/benchmark/static/images/organizations/together.png +0 -0
  246. helm/benchmark/static/json-urls.js +4 -0
  247. helm/benchmark/static/{schema.yaml → schema_classic.yaml} +346 -930
  248. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  249. helm/benchmark/static/schema_lite.yaml +824 -0
  250. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  251. helm/benchmark/static/schema_unitxt.yaml +428 -0
  252. helm/benchmark/static/schema_vlm.yaml +576 -0
  253. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  254. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  255. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  256. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  257. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  258. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  259. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  260. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  261. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  262. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  263. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  264. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  265. helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
  266. helm/benchmark/static_build/assets/index-d839df55.js +9 -0
  267. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  268. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  269. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  270. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  271. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  272. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  273. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  274. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  275. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  276. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  277. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  278. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  279. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  280. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  281. helm/benchmark/static_build/config.js +4 -0
  282. helm/benchmark/static_build/index.html +20 -0
  283. helm/benchmark/test_data_preprocessor.py +3 -3
  284. helm/benchmark/test_model_deployment_definition.py +90 -0
  285. helm/benchmark/test_run_expander.py +1 -1
  286. helm/benchmark/tokenizer_config_registry.py +10 -14
  287. helm/benchmark/window_services/ai21_window_service.py +22 -33
  288. helm/benchmark/window_services/cohere_window_service.py +1 -63
  289. helm/benchmark/window_services/default_window_service.py +2 -35
  290. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  291. helm/benchmark/window_services/ice_window_service.py +0 -34
  292. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  293. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  294. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  295. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  296. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  297. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  298. helm/benchmark/window_services/local_window_service.py +21 -4
  299. helm/benchmark/window_services/no_decoding_window_service.py +32 -0
  300. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  301. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  302. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  303. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  304. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  305. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  306. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  307. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  308. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  309. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  310. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  311. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  312. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  313. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  314. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  315. helm/benchmark/window_services/test_utils.py +3 -2
  316. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  317. helm/benchmark/window_services/window_service.py +42 -0
  318. helm/benchmark/window_services/window_service_factory.py +24 -269
  319. helm/benchmark/window_services/yalm_window_service.py +0 -27
  320. helm/clients/__init__.py +0 -0
  321. helm/{proxy/clients → clients}/ai21_client.py +5 -12
  322. helm/clients/aleph_alpha_client.py +112 -0
  323. helm/{proxy/clients → clients}/anthropic_client.py +213 -24
  324. helm/clients/auto_client.py +215 -0
  325. helm/clients/bedrock_client.py +128 -0
  326. helm/clients/bedrock_utils.py +72 -0
  327. helm/{proxy/clients → clients}/client.py +67 -55
  328. helm/clients/clip_score_client.py +49 -0
  329. helm/clients/clip_scorers/__init__.py +0 -0
  330. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  331. helm/clients/clip_scorers/clip_scorer.py +50 -0
  332. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  333. helm/{proxy/clients → clients}/cohere_client.py +6 -17
  334. helm/clients/gcs_client.py +82 -0
  335. helm/{proxy/clients → clients}/google_client.py +7 -8
  336. helm/clients/google_translate_client.py +35 -0
  337. helm/{proxy/clients → clients}/http_model_client.py +6 -10
  338. helm/{proxy/clients → clients}/huggingface_client.py +134 -92
  339. helm/clients/image_generation/__init__.py +0 -0
  340. helm/clients/image_generation/adobe_vision_client.py +78 -0
  341. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  342. helm/clients/image_generation/cogview2/__init__.py +0 -0
  343. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  344. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  345. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  346. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  347. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  348. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  349. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  350. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  351. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  352. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  353. helm/clients/image_generation/cogview2_client.py +191 -0
  354. helm/clients/image_generation/dalle2_client.py +192 -0
  355. helm/clients/image_generation/dalle3_client.py +108 -0
  356. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  357. helm/clients/image_generation/dalle_mini/data.py +442 -0
  358. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  359. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  360. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  361. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  362. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  363. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  364. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  365. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  366. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  367. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  368. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  369. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  370. helm/clients/image_generation/dalle_mini_client.py +190 -0
  371. helm/clients/image_generation/deep_floyd_client.py +78 -0
  372. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  373. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  374. helm/clients/image_generation/lexica_client.py +86 -0
  375. helm/clients/image_generation/mindalle/__init__.py +0 -0
  376. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  377. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  378. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  379. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  380. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  381. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  382. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  383. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  384. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  385. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  386. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  387. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  388. helm/clients/image_generation/mindalle_client.py +115 -0
  389. helm/clients/image_generation/nudity_check_client.py +64 -0
  390. helm/clients/image_generation/together_image_generation_client.py +111 -0
  391. helm/{proxy/clients → clients}/lit_gpt_client.py +7 -5
  392. helm/{proxy/clients → clients}/megatron_client.py +13 -7
  393. helm/clients/mistral_client.py +134 -0
  394. helm/clients/moderation_api_client.py +109 -0
  395. helm/clients/open_lm_client.py +43 -0
  396. helm/clients/openai_client.py +302 -0
  397. helm/{proxy/clients → clients}/palmyra_client.py +15 -12
  398. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  399. helm/clients/simple_client.py +64 -0
  400. helm/{proxy/clients → clients}/test_auto_client.py +15 -15
  401. helm/clients/test_client.py +100 -0
  402. helm/clients/test_huggingface_client.py +70 -0
  403. helm/clients/test_simple_client.py +19 -0
  404. helm/{proxy/clients → clients}/test_together_client.py +23 -12
  405. helm/{proxy/clients → clients}/together_client.py +18 -71
  406. helm/clients/vertexai_client.py +391 -0
  407. helm/clients/vision_language/__init__.py +0 -0
  408. helm/clients/vision_language/huggingface_vlm_client.py +104 -0
  409. helm/{proxy/clients → clients}/vision_language/idefics_client.py +59 -52
  410. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  411. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  412. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  413. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  414. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  415. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  416. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  417. helm/clients/vision_language/open_flamingo_client.py +155 -0
  418. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  419. helm/clients/vllm_client.py +46 -0
  420. helm/common/cache.py +24 -179
  421. helm/common/cache_backend_config.py +47 -0
  422. helm/common/clip_score_request.py +41 -0
  423. helm/common/concurrency.py +32 -0
  424. helm/common/credentials_utils.py +28 -0
  425. helm/common/file_caches/__init__.py +0 -0
  426. helm/common/file_caches/file_cache.py +16 -0
  427. helm/common/file_caches/local_file_cache.py +61 -0
  428. helm/common/file_caches/test_local_file_cache.py +25 -0
  429. helm/common/file_upload_request.py +27 -0
  430. helm/common/general.py +29 -10
  431. helm/common/image_generation_parameters.py +25 -0
  432. helm/common/images_utils.py +24 -1
  433. helm/common/key_value_store.py +113 -0
  434. helm/common/media_object.py +13 -0
  435. helm/common/moderations_api_request.py +71 -0
  436. helm/common/mongo_key_value_store.py +88 -0
  437. helm/common/multimodal_request_utils.py +31 -0
  438. helm/common/nudity_check_request.py +29 -0
  439. helm/common/object_spec.py +2 -2
  440. helm/common/request.py +36 -27
  441. helm/common/test_general.py +6 -0
  442. helm/common/tokenization_request.py +6 -3
  443. helm/config/__init__.py +0 -0
  444. helm/config/model_deployments.yaml +1942 -0
  445. helm/config/model_metadata.yaml +2201 -0
  446. helm/config/tokenizer_configs.yaml +362 -0
  447. helm/proxy/accounts.py +31 -4
  448. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  449. helm/proxy/critique/model_critique_client.py +13 -5
  450. helm/proxy/example_queries.py +29 -17
  451. helm/proxy/retry.py +8 -2
  452. helm/proxy/server.py +77 -5
  453. helm/proxy/services/remote_service.py +31 -0
  454. helm/proxy/services/server_service.py +103 -20
  455. helm/proxy/services/service.py +34 -2
  456. helm/proxy/services/test_remote_service.py +7 -6
  457. helm/proxy/services/test_service.py +27 -18
  458. helm/proxy/test_accounts.py +32 -0
  459. helm/proxy/token_counters/auto_token_counter.py +37 -37
  460. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  461. helm/proxy/token_counters/token_counter.py +3 -5
  462. helm/py.typed +0 -0
  463. helm/tokenizers/__init__.py +0 -0
  464. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  465. helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +3 -1
  466. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +17 -11
  467. helm/tokenizers/auto_tokenizer.py +93 -0
  468. helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +8 -2
  469. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  470. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  471. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +56 -60
  472. helm/tokenizers/simple_tokenizer.py +33 -0
  473. helm/tokenizers/test_anthropic_tokenizer.py +82 -0
  474. helm/tokenizers/test_huggingface_tokenizer.py +136 -0
  475. helm/tokenizers/test_simple_tokenizer.py +33 -0
  476. helm/tokenizers/vertexai_tokenizer.py +97 -0
  477. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  478. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  479. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  480. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  481. crfm_helm-0.3.0.dist-info/RECORD +0 -396
  482. helm/benchmark/vlm_run_specs.py +0 -71
  483. helm/benchmark/window_services/anthropic_window_service.py +0 -68
  484. helm/benchmark/window_services/bloom_window_service.py +0 -35
  485. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  486. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  487. helm/benchmark/window_services/gptj_window_service.py +0 -38
  488. helm/benchmark/window_services/gptneox_window_service.py +0 -41
  489. helm/benchmark/window_services/http_model_window_service.py +0 -28
  490. helm/benchmark/window_services/huggingface_window_service.py +0 -59
  491. helm/benchmark/window_services/lit_gpt_window_service.py +0 -27
  492. helm/benchmark/window_services/llama_window_service.py +0 -28
  493. helm/benchmark/window_services/luminous_window_service.py +0 -67
  494. helm/benchmark/window_services/megatron_window_service.py +0 -10
  495. helm/benchmark/window_services/mt_nlg_window_service.py +0 -27
  496. helm/benchmark/window_services/openai_window_service.py +0 -13
  497. helm/benchmark/window_services/opt_window_service.py +0 -35
  498. helm/benchmark/window_services/palmyra_window_service.py +0 -45
  499. helm/benchmark/window_services/remote_window_service.py +0 -48
  500. helm/benchmark/window_services/santacoder_window_service.py +0 -27
  501. helm/benchmark/window_services/starcoder_window_service.py +0 -27
  502. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  503. helm/benchmark/window_services/t511b_window_service.py +0 -30
  504. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  505. helm/benchmark/window_services/ul2_window_service.py +0 -30
  506. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  507. helm/benchmark/window_services/wider_openai_window_service.py +0 -52
  508. helm/proxy/clients/aleph_alpha_client.py +0 -99
  509. helm/proxy/clients/auto_client.py +0 -461
  510. helm/proxy/clients/goose_ai_client.py +0 -100
  511. helm/proxy/clients/microsoft_client.py +0 -182
  512. helm/proxy/clients/openai_client.py +0 -206
  513. helm/proxy/clients/remote_model_registry.py +0 -28
  514. helm/proxy/clients/simple_client.py +0 -61
  515. helm/proxy/clients/test_anthropic_client.py +0 -63
  516. helm/proxy/clients/test_client.py +0 -31
  517. helm/proxy/clients/test_huggingface_client.py +0 -87
  518. helm/proxy/models.py +0 -963
  519. helm/proxy/test_models.py +0 -27
  520. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  521. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  522. helm/proxy/token_counters/free_token_counter.py +0 -12
  523. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  524. helm/proxy/token_counters/openai_token_counter.py +0 -22
  525. helm/proxy/token_counters/test_ai21_token_counter.py +0 -86
  526. helm/proxy/token_counters/test_openai_token_counter.py +0 -79
  527. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  528. helm/proxy/tokenizers/test_huggingface_tokenizer.py +0 -56
  529. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
  530. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
  531. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
  532. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  533. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  534. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  535. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  536. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  537. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  538. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  539. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  540. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  541. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  542. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  543. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  544. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  545. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  546. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
@@ -1,4 +1,4 @@
1
- from typing import Any, Dict, List, Optional
1
+ from typing import Any, Dict, List, Optional, TypedDict, Union, cast
2
2
  import json
3
3
  import requests
4
4
  import time
@@ -6,13 +6,14 @@ import urllib.parse
6
6
 
7
7
  from helm.common.cache import CacheConfig
8
8
  from helm.common.hierarchical_logger import htrack_block, hlog
9
+ from helm.common.media_object import IMAGE_TYPE, TEXT_TYPE
9
10
  from helm.common.optional_dependencies import handle_module_not_found_error
10
11
  from helm.common.request import (
11
12
  wrap_request_time,
12
13
  EMBEDDING_UNAVAILABLE_REQUEST_RESULT,
13
14
  Request,
14
15
  RequestResult,
15
- Sequence,
16
+ GeneratedOutput,
16
17
  Token,
17
18
  ErrorFlags,
18
19
  )
@@ -20,16 +21,30 @@ from helm.common.tokenization_request import (
20
21
  TokenizationRequest,
21
22
  TokenizationRequestResult,
22
23
  )
23
- from helm.proxy.tokenizers.tokenizer import Tokenizer
24
- from .client import CachingClient, truncate_sequence
24
+ from helm.proxy.retry import NonRetriableException
25
+ from helm.tokenizers.tokenizer import Tokenizer
26
+ from helm.clients.client import CachingClient, truncate_sequence, truncate_and_tokenize_response_text
25
27
 
26
28
  try:
27
- import anthropic
29
+ from anthropic import Anthropic, BadRequestError
30
+ from anthropic.types import MessageParam
31
+ from anthropic.types.image_block_param import ImageBlockParam
32
+ from anthropic.types.text_block_param import TextBlockParam
28
33
  import websocket
29
34
  except ModuleNotFoundError as e:
30
35
  handle_module_not_found_error(e, ["anthropic"])
31
36
 
32
37
 
38
+ class AnthropicCompletionRequest(TypedDict):
39
+ prompt: str
40
+ stop_sequences: List[str]
41
+ model: str
42
+ max_tokens_to_sample: int
43
+ temperature: float
44
+ top_p: float
45
+ top_k: int
46
+
47
+
33
48
  class AnthropicClient(CachingClient):
34
49
  """
35
50
  Client for the Anthropic models (https://arxiv.org/abs/2204.05862).
@@ -56,15 +71,19 @@ class AnthropicClient(CachingClient):
56
71
  ADDITIONAL_TOKENS: int = 5
57
72
  PROMPT_ANSWER_START: str = "The answer is "
58
73
 
59
- def __init__(self, tokenizer: Tokenizer, cache_config: CacheConfig, api_key: Optional[str] = None):
60
- super().__init__(cache_config=cache_config, tokenizer=tokenizer)
74
+ def __init__(
75
+ self, tokenizer: Tokenizer, tokenizer_name: str, cache_config: CacheConfig, api_key: Optional[str] = None
76
+ ):
77
+ super().__init__(cache_config=cache_config)
78
+ self.tokenizer = tokenizer
79
+ self.tokenizer_name = tokenizer_name
61
80
  self.api_key: Optional[str] = api_key
62
- self._client = anthropic.Client(api_key) if api_key else None
81
+ self.client = Anthropic(api_key=api_key)
63
82
 
64
- def _send_request(self, raw_request: Dict[str, Any]) -> Dict[str, Any]:
83
+ def _send_request(self, raw_request: AnthropicCompletionRequest) -> Dict[str, Any]:
65
84
  if self.api_key is None:
66
85
  raise Exception("API key is not set. Please set it in the HELM config file.")
67
- result = self._client.completion(**raw_request)
86
+ result = self.client.completions.create(**raw_request).model_dump()
68
87
  assert "error" not in result, f"Request failed with error: {result['error']}"
69
88
  return result
70
89
 
@@ -99,7 +118,7 @@ class AnthropicClient(CachingClient):
99
118
  if request.max_tokens == 0 and not request.echo_prompt:
100
119
  raise ValueError("echo_prompt must be True when max_tokens=0.")
101
120
 
102
- raw_request = {
121
+ raw_request: AnthropicCompletionRequest = {
103
122
  "prompt": request.prompt,
104
123
  "stop_sequences": request.stop_sequences,
105
124
  "model": request.model_engine,
@@ -109,7 +128,7 @@ class AnthropicClient(CachingClient):
109
128
  "top_k": request.top_k_per_token,
110
129
  }
111
130
 
112
- completions: List[Sequence] = []
131
+ completions: List[GeneratedOutput] = []
113
132
 
114
133
  # `num_completions` is not supported, so instead make `num_completions` separate requests.
115
134
  for completion_index in range(request.num_completions):
@@ -164,15 +183,13 @@ class AnthropicClient(CachingClient):
164
183
  # The Anthropic API doesn't return us tokens or logprobs, so we tokenize ourselves.
165
184
  tokenization_result: TokenizationRequestResult = self.tokenizer.tokenize(
166
185
  # Anthropic uses their own tokenizer
167
- TokenizationRequest(text, tokenizer=request.model_engine)
186
+ TokenizationRequest(text, tokenizer=self.tokenizer_name)
168
187
  )
169
188
 
170
189
  # Log probs are currently not supported by the Anthropic API, so set to 0 for now.
171
- tokens: List[Token] = [
172
- Token(text=str(text), logprob=0, top_logprobs={}) for text in tokenization_result.raw_tokens
173
- ]
190
+ tokens: List[Token] = [Token(text=str(text), logprob=0) for text in tokenization_result.raw_tokens]
174
191
 
175
- completion = Sequence(text=response["completion"], logprob=0, tokens=tokens)
192
+ completion = GeneratedOutput(text=response["completion"], logprob=0, tokens=tokens)
176
193
  # See NOTE() in _filter_completion() to understand why warnings are printed for truncation.
177
194
  # TODO(#1512): Fix this with post-processing.
178
195
  sequence = truncate_sequence(completion, request, print_warning=True)
@@ -188,6 +205,179 @@ class AnthropicClient(CachingClient):
188
205
  )
189
206
 
190
207
 
208
+ def _is_content_moderation_failure(response: Dict) -> bool:
209
+ """Return whether a response failed because of the content moderation filter."""
210
+ if (
211
+ "error" in response
212
+ and "message" in response["error"]
213
+ and response["error"]["message"] == "Output blocked by content filtering policy"
214
+ ):
215
+ hlog(f"Anthropic - output blocked by content filtering policy: {response}")
216
+ return True
217
+ return False
218
+
219
+
220
+ class AnthropicMessagesRequest(TypedDict, total=False):
221
+ messages: List[MessageParam]
222
+ model: str
223
+ stop_sequences: List[str]
224
+ system: str
225
+ max_tokens: int
226
+ temperature: float
227
+ top_k: int
228
+ top_p: float
229
+
230
+
231
+ class AnthropicMessagesRequestError(NonRetriableException):
232
+ pass
233
+
234
+
235
+ class AnthropicMessagesResponseError(Exception):
236
+ pass
237
+
238
+
239
+ class AnthropicMessagesClient(CachingClient):
240
+ # Source: https://docs.anthropic.com/claude/docs/models-overview
241
+ MAX_OUTPUT_TOKENS = 4096
242
+
243
+ def __init__(
244
+ self, tokenizer: Tokenizer, tokenizer_name: str, cache_config: CacheConfig, api_key: Optional[str] = None
245
+ ):
246
+ super().__init__(cache_config=cache_config)
247
+ self.tokenizer = tokenizer
248
+ self.tokenizer_name = tokenizer_name
249
+ self.client = Anthropic(api_key=api_key)
250
+ self.api_key: Optional[str] = api_key
251
+
252
+ def make_request(self, request: Request) -> RequestResult:
253
+ if request.max_tokens > AnthropicMessagesClient.MAX_OUTPUT_TOKENS:
254
+ raise AnthropicMessagesRequestError(
255
+ f"Request.max_tokens must be <= {AnthropicMessagesClient.MAX_OUTPUT_TOKENS}"
256
+ )
257
+
258
+ messages: List[MessageParam] = []
259
+ system_message: Optional[MessageParam] = None
260
+
261
+ if request.messages is not None:
262
+ # TODO(#2439): Refactor out Request validation
263
+ if request.multimodal_prompt is not None or request.prompt:
264
+ raise AnthropicMessagesRequestError(
265
+ "Exactly one of Request.messages, Request.prompt or Request.multimodel_prompt should be set"
266
+ )
267
+ messages = cast(List[MessageParam], request.messages)
268
+ if messages[0]["role"] == "system":
269
+ system_message = messages[0]
270
+ messages = messages[1:]
271
+
272
+ elif request.multimodal_prompt is not None:
273
+ # TODO(#2439): Refactor out Request validation
274
+ if request.messages is not None or request.prompt:
275
+ raise AnthropicMessagesRequestError(
276
+ "Exactly one of Request.messages, Request.prompt or Request.multimodel_prompt should be set"
277
+ )
278
+ blocks: List[Union[TextBlockParam, ImageBlockParam]] = []
279
+ for media_object in request.multimodal_prompt.media_objects:
280
+ if media_object.is_type(IMAGE_TYPE):
281
+ # TODO(#2439): Refactor out Request validation
282
+ if not media_object.location:
283
+ raise Exception("MediaObject of image type has missing location field value")
284
+
285
+ from helm.common.images_utils import encode_base64
286
+
287
+ base64_image: str = encode_base64(media_object.location, format="JPEG")
288
+ image_block: ImageBlockParam = {
289
+ "type": "image",
290
+ "source": {
291
+ "type": "base64",
292
+ "media_type": "image/jpeg",
293
+ "data": base64_image,
294
+ },
295
+ }
296
+ blocks.append(image_block)
297
+ if media_object.is_type(TEXT_TYPE):
298
+ # TODO(#2439): Refactor out Request validation
299
+ if media_object.text is None:
300
+ raise ValueError("MediaObject of text type has missing text field value")
301
+ text_block: TextBlockParam = {
302
+ "type": "text",
303
+ "text": media_object.text,
304
+ }
305
+ blocks.append(text_block)
306
+ messages = [{"role": "user", "content": blocks}]
307
+
308
+ else:
309
+ messages = [{"role": "user", "content": request.prompt}]
310
+
311
+ raw_request: AnthropicMessagesRequest = {
312
+ "messages": messages,
313
+ "model": request.model_engine,
314
+ "stop_sequences": request.stop_sequences,
315
+ "max_tokens": request.max_tokens,
316
+ "temperature": request.temperature,
317
+ "top_p": request.top_p,
318
+ "top_k": request.top_k_per_token,
319
+ }
320
+ if system_message is not None:
321
+ raw_request["system"] = cast(str, system_message["content"])
322
+ completions: List[GeneratedOutput] = []
323
+
324
+ # `num_completions` is not supported, so instead make `num_completions` separate requests.
325
+ for completion_index in range(request.num_completions):
326
+
327
+ def do_it() -> Dict[str, Any]:
328
+ try:
329
+ result = self.client.messages.create(**raw_request).model_dump()
330
+ if "content" not in result or not result["content"]:
331
+ raise AnthropicMessagesResponseError(f"Anthropic response has empty content: {result}")
332
+ elif "text" not in result["content"][0]:
333
+ raise AnthropicMessagesResponseError(f"Anthropic response has non-text content: {result}")
334
+ return result
335
+ except BadRequestError as e:
336
+ response = e.response.json()
337
+ if _is_content_moderation_failure(response):
338
+ return response
339
+ raise
340
+
341
+ cache_key = CachingClient.make_cache_key(
342
+ {
343
+ "completion_index": completion_index,
344
+ **raw_request,
345
+ },
346
+ request,
347
+ )
348
+ response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
349
+
350
+ if _is_content_moderation_failure(response):
351
+ hlog(
352
+ f"WARNING: Returning empty request for {request.model_deployment} "
353
+ "due to content moderation filter"
354
+ )
355
+ return RequestResult(
356
+ success=False,
357
+ cached=cached,
358
+ error=response["error"]["message"],
359
+ completions=[],
360
+ embedding=[],
361
+ error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
362
+ request_time=response["request_time"],
363
+ request_datetime=response["request_datetime"],
364
+ )
365
+
366
+ completion = truncate_and_tokenize_response_text(
367
+ response["content"][0]["text"], request, self.tokenizer, self.tokenizer_name, original_finish_reason=""
368
+ )
369
+ completions.append(completion)
370
+
371
+ return RequestResult(
372
+ success=True,
373
+ cached=cached,
374
+ request_time=response["request_time"],
375
+ request_datetime=response["request_datetime"],
376
+ completions=completions,
377
+ embedding=[],
378
+ )
379
+
380
+
191
381
  class AnthropicRequestError(Exception):
192
382
  pass
193
383
 
@@ -239,9 +429,9 @@ class AnthropicLegacyClient(CachingClient):
239
429
  hlog(f"Invalid logprobs response: {raw_response}")
240
430
  return False
241
431
 
242
- def __init__(self, api_key: str, tokenizer: Tokenizer, cache_config: CacheConfig):
432
+ def __init__(self, api_key: str, cache_config: CacheConfig):
243
433
  hlog("This client is deprecated. Please use AnthropicClient instead.")
244
- super().__init__(cache_config=cache_config, tokenizer=tokenizer)
434
+ super().__init__(cache_config=cache_config)
245
435
  self.api_key = api_key
246
436
 
247
437
  def make_request(self, request: Request) -> RequestResult:
@@ -249,7 +439,7 @@ class AnthropicLegacyClient(CachingClient):
249
439
  if request.embedding:
250
440
  return EMBEDDING_UNAVAILABLE_REQUEST_RESULT
251
441
  # Validate the fields of `Request`
252
- if request.model != "anthropic/stanford-online-all-v4-s3":
442
+ if request.model_engine != "stanford-online-all-v4-s3":
253
443
  raise ValueError(f"Invalid model: {request.model}")
254
444
  if request.max_tokens > AnthropicLegacyClient.MAX_COMPLETION_LENGTH:
255
445
  raise ValueError(
@@ -390,7 +580,7 @@ class AnthropicLegacyClient(CachingClient):
390
580
 
391
581
  # Since Anthropic doesn't support multiple completions, we have to manually call it multiple times,
392
582
  # and aggregate the results into `completions` and `request_time`.
393
- completions: List[Sequence] = []
583
+ completions: List[GeneratedOutput] = []
394
584
  all_cached = True
395
585
  request_time = 0
396
586
  request_datetime: Optional[int] = None
@@ -423,8 +613,7 @@ class AnthropicLegacyClient(CachingClient):
423
613
  for text, token_logprob, all_logprobs, all_tokens in zip(
424
614
  log_probs["tokens"], log_probs["logprobs"], log_probs["topk_logprobs"], log_probs["topk_tokens"]
425
615
  ):
426
- top_logprobs: Dict[str, float] = {text: logprob for text, logprob in zip(all_tokens, all_logprobs)}
427
- tokens.append(Token(text=text, logprob=token_logprob, top_logprobs=top_logprobs))
616
+ tokens.append(Token(text=text, logprob=token_logprob))
428
617
  sequence_logprob += token_logprob
429
618
 
430
619
  finish_reason: str = response["stop_reason"]
@@ -432,7 +621,7 @@ class AnthropicLegacyClient(CachingClient):
432
621
  if finish_reason == AnthropicLegacyClient.STOP_SEQUENCE_STOP_REASON:
433
622
  finish_reason = "stop"
434
623
 
435
- completion = Sequence(
624
+ completion = GeneratedOutput(
436
625
  text=response["text"],
437
626
  logprob=sequence_logprob,
438
627
  tokens=tokens,
@@ -0,0 +1,215 @@
1
+ from dataclasses import replace
2
+ import os
3
+ from typing import Any, Dict, Mapping, Optional
4
+
5
+ from retrying import Attempt, RetryError
6
+
7
+ from helm.benchmark.model_deployment_registry import ModelDeployment, get_model_deployment
8
+ from helm.common.file_caches.file_cache import FileCache
9
+ from helm.common.file_caches.local_file_cache import LocalFileCache
10
+ from helm.common.credentials_utils import provide_api_key
11
+ from helm.common.cache_backend_config import CacheBackendConfig, CacheConfig
12
+ from helm.common.hierarchical_logger import hlog
13
+ from helm.common.object_spec import create_object, inject_object_spec_args
14
+ from helm.common.request import Request, RequestResult
15
+ from helm.clients.client import Client
16
+ from helm.clients.moderation_api_client import ModerationAPIClient
17
+ from helm.proxy.critique.critique_client import CritiqueClient
18
+ from helm.clients.toxicity_classifier_client import ToxicityClassifierClient
19
+ from helm.proxy.retry import NonRetriableException, retry_request
20
+ from helm.tokenizers.auto_tokenizer import AutoTokenizer
21
+
22
+
23
+ class AuthenticationError(NonRetriableException):
24
+ pass
25
+
26
+
27
+ class AutoClient(Client):
28
+ """Automatically dispatch to the proper `Client` based on the model deployment name."""
29
+
30
+ def __init__(
31
+ self, credentials: Mapping[str, Any], file_storage_path: str, cache_backend_config: CacheBackendConfig
32
+ ):
33
+ self._auto_tokenizer = AutoTokenizer(credentials, cache_backend_config)
34
+ self.credentials = credentials
35
+ self.file_storage_path = file_storage_path
36
+ self.cache_backend_config = cache_backend_config
37
+ self.clients: Dict[str, Client] = {}
38
+ self._critique_client: Optional[CritiqueClient] = None
39
+ hlog(f"AutoClient: file_storage_path = {file_storage_path}")
40
+ hlog(f"AutoClient: cache_backend_config = {cache_backend_config}")
41
+
42
+ def _get_client(self, model_deployment_name: str) -> Client:
43
+ """Return a client based on the model, creating it if necessary."""
44
+ # First try to find the client in the cache
45
+ client: Optional[Client] = self.clients.get(model_deployment_name)
46
+ if client is not None:
47
+ return client
48
+
49
+ # Otherwise, create the client
50
+ model_deployment: ModelDeployment = get_model_deployment(model_deployment_name)
51
+ if model_deployment:
52
+ # Perform dependency injection to fill in remaining arguments.
53
+ # Dependency injection is needed here for these reasons:
54
+ #
55
+ # 1. Different clients have different parameters. Dependency injection provides arguments
56
+ # that match the parameters of the client.
57
+ # 2. Some arguments, such as the tokenizer, are not static data objects that can be
58
+ # in the users configuration file. Instead, they have to be constructed dynamically at
59
+ # runtime.
60
+ # 3. The providers must be lazily-evaluated, because eager evaluation can result in an
61
+ # exception. For instance, some clients do not require an API key, so trying to fetch
62
+ # the API key from configuration eagerly will result in an exception because the user
63
+ # will not have configured an API key.
64
+
65
+ # Prepare a cache
66
+ host_organization: str = model_deployment.host_organization
67
+ cache_config: CacheConfig = self.cache_backend_config.get_cache_config(host_organization)
68
+
69
+ client_spec = inject_object_spec_args(
70
+ model_deployment.client_spec,
71
+ constant_bindings={
72
+ "cache_config": cache_config,
73
+ "tokenizer_name": model_deployment.tokenizer_name,
74
+ },
75
+ provider_bindings={
76
+ "api_key": lambda: provide_api_key(self.credentials, host_organization, model_deployment_name),
77
+ "tokenizer": lambda: self._auto_tokenizer._get_tokenizer(
78
+ tokenizer_name=model_deployment.tokenizer_name or model_deployment.name
79
+ ),
80
+ "org_id": lambda: self.credentials.get(
81
+ host_organization + "OrgId", None
82
+ ), # OpenAI, GooseAI, Microsoft
83
+ "moderation_api_client": lambda: self.get_moderation_api_client(), # OpenAI DALL-E
84
+ "lock_file_path": lambda: os.path.join(
85
+ self.file_storage_path, f"{host_organization}.lock"
86
+ ), # Microsoft
87
+ "project_id": lambda: self.credentials.get(host_organization + "ProjectId", None), # VertexAI
88
+ "location": lambda: self.credentials.get(host_organization + "Location", None), # VertexAI
89
+ "hf_auth_token": lambda: self.credentials.get("huggingfaceAuthToken", None), # HuggingFace
90
+ "file_cache": lambda: self._get_file_cache(host_organization), # Text-to-image models
91
+ },
92
+ )
93
+ client = create_object(client_spec)
94
+ else:
95
+ raise ValueError(f"Could not find client for model deployment: {model_deployment_name}")
96
+
97
+ # Cache the client
98
+ self.clients[model_deployment_name] = client
99
+
100
+ return client
101
+
102
+ def make_request(self, request: Request) -> RequestResult:
103
+ """
104
+ Dispatch based on the name of the model (e.g., openai/davinci).
105
+ Retries if request fails.
106
+ """
107
+
108
+ # TODO: need to revisit this because this swallows up any exceptions that are raised.
109
+ @retry_request
110
+ def make_request_with_retry(client: Client, request: Request) -> RequestResult:
111
+ return client.make_request(request)
112
+
113
+ client: Client = self._get_client(request.model_deployment)
114
+
115
+ try:
116
+ return make_request_with_retry(client=client, request=request)
117
+ except RetryError as e:
118
+ last_attempt: Attempt = e.last_attempt
119
+ retry_error: str = (
120
+ f"Failed to make request to {request.model_deployment} after retrying "
121
+ f"{last_attempt.attempt_number} times"
122
+ )
123
+ hlog(retry_error)
124
+
125
+ # Notify our user that we failed to make the request even after retrying.
126
+ return replace(last_attempt.value, error=f"{retry_error}. Error: {last_attempt.value.error}")
127
+
128
+ def get_gcs_client(self):
129
+ from .gcs_client import GCSClient
130
+
131
+ bucket_name: str = self.credentials["gcsBucketName"]
132
+ cache_config: CacheConfig = self.cache_backend_config.get_cache_config("gcs")
133
+ return GCSClient(bucket_name, cache_config)
134
+
135
+ def get_nudity_check_client(self):
136
+ from helm.clients.image_generation.nudity_check_client import NudityCheckClient
137
+
138
+ cache_config: CacheConfig = self.cache_backend_config.get_cache_config("nudity")
139
+ return NudityCheckClient(cache_config)
140
+
141
+ def get_clip_score_client(self):
142
+ from .clip_score_client import CLIPScoreClient
143
+
144
+ cache_config: CacheConfig = self.cache_backend_config.get_cache_config("clip_score")
145
+ return CLIPScoreClient(cache_config)
146
+
147
+ def get_toxicity_classifier_client(self) -> ToxicityClassifierClient:
148
+ """Get the toxicity classifier client. We currently only support Perspective API."""
149
+ from helm.clients.perspective_api_client import PerspectiveAPIClient
150
+
151
+ cache_config: CacheConfig = self.cache_backend_config.get_cache_config("perspectiveapi")
152
+ return PerspectiveAPIClient(self.credentials.get("perspectiveApiKey", ""), cache_config)
153
+
154
+ def get_moderation_api_client(self) -> ModerationAPIClient:
155
+ """Get the ModerationAPI client."""
156
+ cache_config: CacheConfig = self.cache_backend_config.get_cache_config("ModerationAPI")
157
+ return ModerationAPIClient(self.credentials.get("openaiApiKey", ""), cache_config)
158
+
159
+ def get_critique_client(self) -> CritiqueClient:
160
+ """Get the critique client."""
161
+ if self._critique_client:
162
+ return self._critique_client
163
+ critique_type = self.credentials.get("critiqueType")
164
+ if critique_type == "random":
165
+ from helm.proxy.critique.critique_client import RandomCritiqueClient
166
+
167
+ self._critique_client = RandomCritiqueClient()
168
+ elif critique_type == "mturk":
169
+ from helm.proxy.critique.mechanical_turk_critique_client import (
170
+ MechanicalTurkCritiqueClient,
171
+ )
172
+
173
+ self._critique_client = MechanicalTurkCritiqueClient()
174
+ elif critique_type == "surgeai":
175
+ from helm.proxy.critique.surge_ai_critique_client import (
176
+ SurgeAICritiqueClient,
177
+ )
178
+
179
+ surgeai_credentials = self.credentials.get("surgeaiApiKey")
180
+ if not surgeai_credentials:
181
+ raise ValueError("surgeaiApiKey credentials are required for SurgeAICritiqueClient")
182
+ self._critique_client = SurgeAICritiqueClient(
183
+ surgeai_credentials, self.cache_backend_config.get_cache_config("surgeai")
184
+ )
185
+ elif critique_type == "model":
186
+ from helm.proxy.critique.model_critique_client import ModelCritiqueClient
187
+
188
+ model_name: Optional[str] = self.credentials.get("critiqueModelName")
189
+ if model_name is None:
190
+ raise ValueError("critiqueModelName is required for ModelCritiqueClient")
191
+ client: Client = self._get_client(model_name)
192
+ self._critique_client = ModelCritiqueClient(client, model_name)
193
+ elif critique_type == "scale":
194
+ from helm.proxy.critique.scale_critique_client import ScaleCritiqueClient
195
+
196
+ scale_credentials = self.credentials.get("scaleApiKey")
197
+ scale_project = self.credentials.get("scaleProject", None)
198
+ if not scale_project:
199
+ raise ValueError("scaleProject is required for ScaleCritiqueClient.")
200
+ if not scale_credentials:
201
+ raise ValueError("scaleApiKey is required for ScaleCritiqueClient")
202
+ self._critique_client = ScaleCritiqueClient(
203
+ scale_credentials, self.cache_backend_config.get_cache_config("scale"), scale_project
204
+ )
205
+ else:
206
+ raise ValueError(
207
+ "CritiqueClient is not configured; set critiqueType to 'mturk',"
208
+ "'mturk-sandbox', 'surgeai', 'scale' or 'random'"
209
+ )
210
+ return self._critique_client
211
+
212
+ def _get_file_cache(self, host_organization: str) -> FileCache:
213
+ # Initialize `FileCache` for text-to-image model APIs
214
+ local_file_cache_path: str = os.path.join(self.file_storage_path, "output", host_organization)
215
+ return LocalFileCache(local_file_cache_path, file_extension="png")