crfm-helm 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (482)
  1. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +134 -31
  2. crfm_helm-0.5.0.dist-info/RECORD +642 -0
  3. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +31 -3
  5. helm/benchmark/adaptation/adapters/adapter.py +2 -2
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -4
  9. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +2 -3
  10. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  11. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +2 -1
  12. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  13. helm/benchmark/adaptation/adapters/test_generation_adapter.py +32 -8
  14. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +7 -19
  15. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +60 -6
  16. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  17. helm/benchmark/adaptation/request_state.py +6 -1
  18. helm/benchmark/adaptation/scenario_state.py +6 -2
  19. helm/benchmark/annotation/annotator.py +43 -0
  20. helm/benchmark/annotation/annotator_factory.py +61 -0
  21. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  22. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  23. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  24. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  25. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  26. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  27. helm/benchmark/annotation_executor.py +124 -0
  28. helm/benchmark/augmentations/data_augmenter.py +0 -2
  29. helm/benchmark/augmentations/gender_perturbation.py +1 -1
  30. helm/benchmark/augmentations/perturbation.py +8 -2
  31. helm/benchmark/augmentations/perturbation_description.py +1 -1
  32. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  33. helm/benchmark/augmentations/test_perturbation.py +11 -7
  34. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  35. helm/benchmark/config_registry.py +7 -1
  36. helm/benchmark/executor.py +46 -16
  37. helm/benchmark/huggingface_registration.py +20 -7
  38. helm/benchmark/metrics/basic_metrics.py +169 -664
  39. helm/benchmark/metrics/bbq_metrics.py +3 -4
  40. helm/benchmark/metrics/bias_metrics.py +6 -6
  41. helm/benchmark/metrics/classification_metrics.py +11 -8
  42. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  43. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  44. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  45. helm/benchmark/metrics/common_metric_specs.py +167 -0
  46. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  47. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  48. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  49. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  50. helm/benchmark/metrics/disinformation_metrics.py +4 -110
  51. helm/benchmark/metrics/dry_run_metrics.py +2 -2
  52. helm/benchmark/metrics/efficiency_metrics.py +206 -0
  53. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  54. helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
  55. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  56. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  57. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  58. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  59. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  60. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  61. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  62. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  63. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  64. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  65. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  66. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  67. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  68. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  69. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  70. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  71. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  72. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  73. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  74. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  75. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  76. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  77. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  78. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  79. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  80. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  81. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  82. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  83. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  84. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  85. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  86. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  87. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  88. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  89. helm/benchmark/metrics/machine_translation_metrics.py +89 -0
  90. helm/benchmark/metrics/metric.py +93 -172
  91. helm/benchmark/metrics/metric_name.py +0 -1
  92. helm/benchmark/metrics/metric_service.py +16 -0
  93. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  94. helm/benchmark/metrics/ranking_metrics.py +2 -2
  95. helm/benchmark/metrics/reference_metric.py +148 -0
  96. helm/benchmark/metrics/summac/model_summac.py +0 -2
  97. helm/benchmark/metrics/summarization_metrics.py +2 -2
  98. helm/benchmark/metrics/test_classification_metrics.py +8 -5
  99. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  100. helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +5 -1
  101. helm/benchmark/metrics/test_metric.py +2 -2
  102. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +10 -2
  103. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  104. helm/benchmark/metrics/toxicity_utils.py +23 -0
  105. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  106. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  107. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  108. helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
  109. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  110. helm/benchmark/model_deployment_registry.py +74 -0
  111. helm/benchmark/model_metadata_registry.py +36 -0
  112. helm/benchmark/multi_gpu_runner.py +133 -0
  113. helm/benchmark/presentation/create_plots.py +8 -7
  114. helm/benchmark/presentation/run_display.py +26 -10
  115. helm/benchmark/presentation/schema.py +15 -40
  116. helm/benchmark/presentation/summarize.py +119 -79
  117. helm/benchmark/presentation/table.py +8 -8
  118. helm/benchmark/presentation/test_contamination.py +2 -2
  119. helm/benchmark/presentation/test_run_entry.py +1 -2
  120. helm/benchmark/presentation/test_summarize.py +3 -3
  121. helm/benchmark/run.py +54 -26
  122. helm/benchmark/run_expander.py +214 -16
  123. helm/benchmark/run_spec.py +93 -0
  124. helm/benchmark/run_spec_factory.py +162 -0
  125. helm/benchmark/run_specs/__init__.py +0 -0
  126. helm/benchmark/run_specs/classic_run_specs.py +1510 -0
  127. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  128. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  129. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  130. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  131. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  132. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  133. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  134. helm/benchmark/run_specs/vlm_run_specs.py +501 -0
  135. helm/benchmark/runner.py +51 -57
  136. helm/benchmark/runner_config_registry.py +21 -0
  137. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  138. helm/benchmark/scenarios/bold_scenario.py +2 -2
  139. helm/benchmark/scenarios/code_scenario.py +1 -0
  140. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  141. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  142. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  143. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  144. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  145. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  146. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  147. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  148. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  149. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  150. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  151. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  152. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  153. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  154. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  155. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  156. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  157. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  158. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  159. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  160. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  161. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  162. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  163. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  164. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  165. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  166. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  167. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  168. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  169. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  170. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  171. helm/benchmark/scenarios/math_scenario.py +19 -2
  172. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  173. helm/benchmark/scenarios/numeracy_scenario.py +1 -1
  174. helm/benchmark/scenarios/opinions_qa_scenario.py +0 -4
  175. helm/benchmark/scenarios/scenario.py +4 -0
  176. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  177. helm/benchmark/scenarios/test_math_scenario.py +6 -0
  178. helm/benchmark/scenarios/test_scenario.py +6 -3
  179. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  180. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  181. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  182. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  183. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  184. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  185. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
  186. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  187. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  188. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  189. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  190. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  191. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  192. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  193. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  194. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  195. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  196. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  197. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  198. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  199. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  200. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  201. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  202. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  203. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  204. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  205. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -2
  206. helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
  207. helm/benchmark/scenarios/wmt_14_scenario.py +1 -1
  208. helm/benchmark/server.py +24 -1
  209. helm/benchmark/slurm_runner.py +70 -49
  210. helm/benchmark/static/benchmarking.js +1 -1
  211. helm/benchmark/static/schema_classic.yaml +258 -1066
  212. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  213. helm/benchmark/static/schema_lite.yaml +2 -227
  214. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  215. helm/benchmark/static/schema_unitxt.yaml +428 -0
  216. helm/benchmark/static/schema_vlm.yaml +576 -0
  217. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  218. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  219. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  220. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  221. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  222. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  223. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  224. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  225. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  226. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  227. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  228. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  229. helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
  230. helm/benchmark/static_build/assets/index-d839df55.js +9 -0
  231. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  232. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  233. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  234. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  235. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  236. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  237. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  238. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  239. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  240. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  241. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  242. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  243. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  244. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  245. helm/benchmark/static_build/config.js +4 -0
  246. helm/benchmark/static_build/index.html +20 -0
  247. helm/benchmark/test_data_preprocessor.py +3 -3
  248. helm/benchmark/test_model_deployment_definition.py +14 -16
  249. helm/benchmark/test_run_expander.py +1 -1
  250. helm/benchmark/window_services/ai21_window_service.py +22 -33
  251. helm/benchmark/window_services/cohere_window_service.py +1 -63
  252. helm/benchmark/window_services/default_window_service.py +2 -44
  253. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  254. helm/benchmark/window_services/ice_window_service.py +0 -34
  255. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  256. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  257. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  258. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  259. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  260. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  261. helm/benchmark/window_services/local_window_service.py +21 -4
  262. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  263. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  264. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  265. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  266. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  267. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  268. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  269. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  270. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  271. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  272. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  273. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  274. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  275. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  276. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  277. helm/benchmark/window_services/test_utils.py +3 -2
  278. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  279. helm/benchmark/window_services/window_service.py +42 -0
  280. helm/benchmark/window_services/window_service_factory.py +4 -1
  281. helm/benchmark/window_services/yalm_window_service.py +0 -27
  282. helm/clients/__init__.py +0 -0
  283. helm/{proxy/clients → clients}/ai21_client.py +3 -9
  284. helm/clients/aleph_alpha_client.py +112 -0
  285. helm/{proxy/clients → clients}/anthropic_client.py +203 -18
  286. helm/{proxy/clients → clients}/auto_client.py +59 -31
  287. helm/clients/bedrock_client.py +128 -0
  288. helm/clients/bedrock_utils.py +72 -0
  289. helm/{proxy/clients → clients}/client.py +65 -7
  290. helm/clients/clip_score_client.py +49 -0
  291. helm/clients/clip_scorers/__init__.py +0 -0
  292. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  293. helm/clients/clip_scorers/clip_scorer.py +50 -0
  294. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  295. helm/{proxy/clients → clients}/cohere_client.py +4 -11
  296. helm/clients/gcs_client.py +82 -0
  297. helm/{proxy/clients → clients}/google_client.py +5 -5
  298. helm/clients/google_translate_client.py +35 -0
  299. helm/{proxy/clients → clients}/http_model_client.py +5 -7
  300. helm/{proxy/clients → clients}/huggingface_client.py +43 -64
  301. helm/clients/image_generation/__init__.py +0 -0
  302. helm/clients/image_generation/adobe_vision_client.py +78 -0
  303. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  304. helm/clients/image_generation/cogview2/__init__.py +0 -0
  305. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  306. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  307. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  308. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  309. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  310. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  311. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  312. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  313. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  314. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  315. helm/clients/image_generation/cogview2_client.py +191 -0
  316. helm/clients/image_generation/dalle2_client.py +192 -0
  317. helm/clients/image_generation/dalle3_client.py +108 -0
  318. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  319. helm/clients/image_generation/dalle_mini/data.py +442 -0
  320. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  321. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  322. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  323. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  324. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  325. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  326. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  327. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  328. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  329. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  330. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  331. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  332. helm/clients/image_generation/dalle_mini_client.py +190 -0
  333. helm/clients/image_generation/deep_floyd_client.py +78 -0
  334. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  335. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  336. helm/clients/image_generation/lexica_client.py +86 -0
  337. helm/clients/image_generation/mindalle/__init__.py +0 -0
  338. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  339. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  340. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  341. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  342. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  343. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  344. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  345. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  346. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  347. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  348. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  349. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  350. helm/clients/image_generation/mindalle_client.py +115 -0
  351. helm/clients/image_generation/nudity_check_client.py +64 -0
  352. helm/clients/image_generation/together_image_generation_client.py +111 -0
  353. helm/{proxy/clients → clients}/lit_gpt_client.py +4 -4
  354. helm/{proxy/clients → clients}/megatron_client.py +5 -5
  355. helm/clients/mistral_client.py +134 -0
  356. helm/clients/moderation_api_client.py +109 -0
  357. helm/clients/open_lm_client.py +43 -0
  358. helm/clients/openai_client.py +302 -0
  359. helm/{proxy/clients → clients}/palmyra_client.py +6 -8
  360. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  361. helm/clients/simple_client.py +64 -0
  362. helm/{proxy/clients → clients}/test_auto_client.py +13 -15
  363. helm/clients/test_client.py +100 -0
  364. helm/{proxy/clients → clients}/test_huggingface_client.py +15 -16
  365. helm/clients/test_simple_client.py +19 -0
  366. helm/{proxy/clients → clients}/test_together_client.py +20 -8
  367. helm/{proxy/clients → clients}/together_client.py +12 -72
  368. helm/clients/vertexai_client.py +391 -0
  369. helm/clients/vision_language/__init__.py +0 -0
  370. helm/clients/vision_language/huggingface_vlm_client.py +104 -0
  371. helm/{proxy/clients → clients}/vision_language/idefics_client.py +53 -48
  372. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  373. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  374. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  375. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  376. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  377. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  378. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  379. helm/clients/vision_language/open_flamingo_client.py +155 -0
  380. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  381. helm/clients/vllm_client.py +46 -0
  382. helm/common/cache.py +16 -4
  383. helm/common/cache_backend_config.py +47 -0
  384. helm/common/clip_score_request.py +41 -0
  385. helm/common/file_caches/__init__.py +0 -0
  386. helm/common/file_caches/file_cache.py +16 -0
  387. helm/common/file_caches/local_file_cache.py +61 -0
  388. helm/common/file_caches/test_local_file_cache.py +25 -0
  389. helm/common/file_upload_request.py +27 -0
  390. helm/common/general.py +1 -1
  391. helm/common/image_generation_parameters.py +25 -0
  392. helm/common/images_utils.py +24 -1
  393. helm/common/key_value_store.py +35 -4
  394. helm/common/media_object.py +13 -0
  395. helm/common/moderations_api_request.py +71 -0
  396. helm/common/mongo_key_value_store.py +3 -3
  397. helm/common/multimodal_request_utils.py +31 -0
  398. helm/common/nudity_check_request.py +29 -0
  399. helm/common/request.py +15 -17
  400. helm/common/test_general.py +6 -0
  401. helm/common/tokenization_request.py +1 -1
  402. helm/config/model_deployments.yaml +1069 -546
  403. helm/config/model_metadata.yaml +753 -31
  404. helm/config/tokenizer_configs.yaml +142 -43
  405. helm/proxy/accounts.py +31 -4
  406. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  407. helm/proxy/critique/model_critique_client.py +8 -6
  408. helm/proxy/example_queries.py +29 -17
  409. helm/proxy/server.py +70 -5
  410. helm/proxy/services/remote_service.py +31 -0
  411. helm/proxy/services/server_service.py +96 -16
  412. helm/proxy/services/service.py +30 -0
  413. helm/proxy/services/test_remote_service.py +4 -3
  414. helm/proxy/services/test_service.py +0 -12
  415. helm/proxy/test_accounts.py +32 -0
  416. helm/proxy/token_counters/auto_token_counter.py +37 -37
  417. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  418. helm/proxy/token_counters/token_counter.py +3 -5
  419. helm/tokenizers/__init__.py +0 -0
  420. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  421. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +1 -1
  422. helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +6 -9
  423. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  424. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  425. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +7 -26
  426. helm/tokenizers/simple_tokenizer.py +33 -0
  427. helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +1 -1
  428. helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +3 -0
  429. helm/tokenizers/test_simple_tokenizer.py +33 -0
  430. helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
  431. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  432. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  433. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  434. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  435. crfm_helm-0.4.0.dist-info/RECORD +0 -397
  436. helm/benchmark/run_specs.py +0 -2762
  437. helm/benchmark/test_model_properties.py +0 -1570
  438. helm/benchmark/vlm_run_specs.py +0 -97
  439. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  440. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  441. helm/benchmark/window_services/huggingface_window_service.py +0 -60
  442. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  443. helm/benchmark/window_services/t511b_window_service.py +0 -30
  444. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  445. helm/benchmark/window_services/ul2_window_service.py +0 -30
  446. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  447. helm/common/cache_utils.py +0 -14
  448. helm/proxy/clients/aleph_alpha_client.py +0 -95
  449. helm/proxy/clients/goose_ai_client.py +0 -99
  450. helm/proxy/clients/microsoft_client.py +0 -180
  451. helm/proxy/clients/openai_client.py +0 -206
  452. helm/proxy/clients/simple_client.py +0 -60
  453. helm/proxy/clients/test_client.py +0 -49
  454. helm/proxy/clients/vertexai_client.py +0 -115
  455. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  456. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  457. helm/proxy/token_counters/free_token_counter.py +0 -12
  458. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  459. helm/proxy/token_counters/openai_token_counter.py +0 -22
  460. helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
  461. helm/proxy/token_counters/test_openai_token_counter.py +0 -81
  462. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  463. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
  464. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
  465. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
  466. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  467. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  468. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  469. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  470. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  471. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  472. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  473. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  474. /helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +0 -0
  475. /helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +0 -0
  476. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  477. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  478. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  479. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  480. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  481. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  482. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
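
The dominant structural change in this release, visible in the rename entries above, is a package reorganization: model clients move from `helm.proxy.clients` to the new top-level `helm.clients` package, and tokenizers move from `helm.proxy.tokenizers` to `helm.tokenizers`. A minimal compatibility sketch for downstream code, assuming class names are unchanged by the move (the rename entries suggest the modules moved as-is):

# Hypothetical downstream shim; AutoClient is assumed to keep its name
# across the move from helm.proxy.clients to helm.clients.
try:
    from helm.clients.auto_client import AutoClient  # crfm-helm >= 0.5.0
except ModuleNotFoundError:
    from helm.proxy.clients.auto_client import AutoClient  # crfm-helm 0.4.x

The deleted-file hunks below follow the same story: several clients under `helm/proxy/clients/` were removed, either replaced by rewritten modules under `helm/clients/` (e.g. `openai_client.py`, `simple_client.py`, `vertexai_client.py`) or dropped outright (e.g. `microsoft_client.py`).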
--- a/helm/proxy/clients/microsoft_client.py
+++ /dev/null
@@ -1,180 +0,0 @@
-from typing import List, Optional, Dict
-
-from filelock import FileLock
-from openai.api_resources.abstract import engine_api_resource
-import openai as turing
-
-from helm.common.cache import CacheConfig
-from helm.common.request import (
-    wrap_request_time,
-    EMBEDDING_UNAVAILABLE_REQUEST_RESULT,
-    Request,
-    RequestResult,
-    Sequence,
-    Token,
-)
-from .client import CachingClient, truncate_sequence
-from .openai_client import ORIGINAL_COMPLETION_ATTRIBUTES
-
-
-class MicrosoftClient(CachingClient):
-    """
-    Client for the Microsoft's Megatron-Turing NLG models (https://arxiv.org/abs/2201.11990).
-
-    According to the internal documentation: https://github.com/microsoft/turing-academic-TNLG,
-    "the model will generate roughly 3 tokens per second. The response will be returned once
-    all tokens have been generated."
-    """
-
-    @staticmethod
-    def convert_to_raw_request(request: Request) -> Dict:
-        return {
-            "engine": request.model_engine,
-            "prompt": request.prompt,
-            "temperature": request.temperature,
-            "max_tokens": request.max_tokens,
-            "best_of": request.top_k_per_token,
-            "logprobs": request.top_k_per_token,
-            # Despite what was stated here: https://github.com/microsoft/turing-academic-TNLG#api-parameters,
-            # their API supports at most one stop sequence. Pass in the first one for now and handle the rest
-            # of the stop sequences during post processing (see `truncate_sequence` below).
-            "stop": None if len(request.stop_sequences) == 0 else request.stop_sequences[0],
-            "top_p": request.top_p,
-            "echo": request.echo_prompt,
-        }
-
-    def __init__(
-        self,
-        lock_file_path: str,
-        cache_config: CacheConfig,
-        api_key: Optional[str] = None,
-        org_id: Optional[str] = None,
-    ):
-        super().__init__(cache_config=cache_config)
-
-        # Adapted from their documentation: https://github.com/microsoft/turing-academic-TNLG
-        class EngineAPIResource(engine_api_resource.EngineAPIResource):
-            @classmethod
-            def class_url(
-                cls, engine: Optional[str] = None, api_type: Optional[str] = None, api_version: Optional[str] = None
-            ) -> str:
-                return f"/{engine}/inference"
-
-        self.org_id: Optional[str] = org_id
-        self.api_key: Optional[str] = api_key
-        self.api_base: str = "https://turingnlg-turingnlg-mstap-v2.turingase.p.azurewebsites.net"
-        self.completion_attributes = (EngineAPIResource,) + ORIGINAL_COMPLETION_ATTRIBUTES[1:]
-
-        # The Microsoft Turing server only allows a single request at a time, so acquire a
-        # process-safe lock before making a request.
-        # https://github.com/microsoft/turing-academic-TNLG#rate-limitations
-        #
-        # Since the model will generate roughly three tokens per second and the max context window
-        # is 2048 tokens, we expect the maximum time for a request to be fulfilled to be 700 seconds.
-        self._lock = FileLock(lock_file_path, timeout=700)
-
-    def make_request(self, request: Request) -> RequestResult:
-        """
-        Make a request for the Microsoft MT-NLG models.
-
-        They mimicked the OpenAI completions API, but not all the parameters are supported.
-
-        Supported parameters:
-            engine
-            prompt
-            temperature
-            max_tokens
-            best_of
-            logprobs
-            stop ("Only a single "stop" value (str) is currently supported.")
-            top_p
-            echo
-            n (Not originally supported, but we simulate n by making multiple requests)
-
-        Not supported parameters:
-            presence_penalty
-            frequency_penalty
-        """
-        # Embedding not supported for this model
-        if request.embedding:
-            return EMBEDDING_UNAVAILABLE_REQUEST_RESULT
-
-        raw_request = MicrosoftClient.convert_to_raw_request(request)
-        completions: List[Sequence] = []
-        request_time = 0
-        request_datetime: Optional[int] = None
-        all_cached = True
-
-        # API currently only supports 1 completion at a time, so we have to hit it multiple times.
-        for completion_index in range(request.num_completions):
-            try:
-
-                def do_it():
-                    with self._lock:
-                        # Following https://beta.openai.com/docs/api-reference/authentication
-                        # `organization` can be set to None.
-                        turing.organization = self.org_id
-                        turing.api_key = self.api_key
-                        turing.api_base = self.api_base
-                        turing.api_resources.completion.Completion.__bases__ = self.completion_attributes
-
-                        response: Dict = turing.Completion.create(**raw_request)
-                        # Validate the responses, so we don't cache malformed responses with null `logprobs` and `text`
-                        if (
-                            "choices" not in response
-                            or len(response["choices"]) == 0
-                            or response["choices"][0].get("text") is None
-                            or response["choices"][0].get("logprobs") is None
-                        ):
-                            raise turing.error.OpenAIError(
-                                f"For request: {raw_request}, invalid response from the MT-NLG server: {response}."
-                            )
-
-                        return response
-
-                def fail():
-                    raise RuntimeError(
-                        f"The result has not been uploaded to the cache for the following request: {cache_key}"
-                    )
-
-                # We want to make `request.num_completions` fresh requests,
-                # cache key should contain the completion_index.
-                cache_key = CachingClient.make_cache_key({"completion_index": completion_index, **raw_request}, request)
-                response, cached = self.cache.get(cache_key, wrap_request_time(do_it if self.api_key else fail))
-            except turing.error.OpenAIError as e:
-                error: str = f"OpenAI (Turing API) error: {e}"
-                return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
-
-            for raw_completion in response["choices"]:
-                sequence_logprob = 0
-                tokens: List[Token] = []
-
-                raw_data = raw_completion["logprobs"]
-                for text, logprob, top_logprobs in zip(
-                    raw_data["tokens"], raw_data["token_logprobs"], raw_data["top_logprobs"]
-                ):
-                    tokens.append(Token(text=text, logprob=logprob or 0, top_logprobs=dict(top_logprobs or {})))
-                    sequence_logprob += logprob or 0
-
-                completion = Sequence(
-                    text=raw_completion["text"],
-                    logprob=sequence_logprob,
-                    tokens=tokens,
-                    finish_reason={"reason": raw_completion["finish_reason"]},
-                )
-                completion = truncate_sequence(completion, request)
-                completions.append(completion)
-
-            request_time += response["request_time"]
-            # Use the datetime from the first completion because that's when the request was fired
-            request_datetime = request_datetime or response.get("request_datetime")
-            all_cached = all_cached and cached
-
-        return RequestResult(
-            success=True,
-            cached=all_cached,
-            request_time=request_time,
-            request_datetime=request_datetime,
-            completions=completions,
-            embedding=[],
-        )
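
The deleted client worked around two API limitations called out in its own comments: only one stop sequence per request, and one completion per request. Its per-completion caching trick is worth spelling out; a minimal self-contained sketch (dict keys and values are illustrative, not taken from the package):

# Sketch of the per-completion cache-key pattern above: mixing
# completion_index into the key makes each of the num_completions
# requests a distinct cache entry, rather than one entry returned n times.
raw_request = {"engine": "mtnlg", "prompt": "Hello", "max_tokens": 5}  # illustrative
num_completions = 3
cache_keys = [{"completion_index": i, **raw_request} for i in range(num_completions)]
assert len({str(sorted(k.items())) for k in cache_keys}) == num_completions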
--- a/helm/proxy/clients/openai_client.py
+++ /dev/null
@@ -1,206 +0,0 @@
-# mypy: check_untyped_defs = False
-from dataclasses import replace
-from typing import Any, Dict, List, Optional, cast
-
-from helm.common.cache import CacheConfig
-from helm.common.request import wrap_request_time, Request, RequestResult, Sequence, Token
-from helm.common.hierarchical_logger import hlog
-from helm.common.optional_dependencies import handle_module_not_found_error
-from helm.common.tokenization_request import (
-    TokenizationRequest,
-    TokenizationRequestResult,
-)
-from helm.proxy.tokenizers.tokenizer import Tokenizer
-from .client import CachingClient, truncate_sequence
-
-try:
-    import openai
-except ModuleNotFoundError as e:
-    handle_module_not_found_error(e, ["openai"])
-
-
-ORIGINAL_COMPLETION_ATTRIBUTES = openai.api_resources.completion.Completion.__bases__
-
-
-class OpenAIClient(CachingClient):
-    END_OF_TEXT: str = "<|endoftext|>"
-
-    def __init__(
-        self,
-        tokenizer: Tokenizer,
-        tokenizer_name: str,
-        cache_config: CacheConfig,
-        api_key: Optional[str] = None,
-        org_id: Optional[str] = None,
-    ):
-        super().__init__(cache_config=cache_config)
-        self.tokenizer = tokenizer
-        self.tokenizer_name = tokenizer_name
-        self.org_id: Optional[str] = org_id
-        self.api_key: Optional[str] = api_key
-        self.api_base: str = "https://api.openai.com/v1"
-
-    def _is_chat_model_engine(self, model_engine: str):
-        return model_engine.startswith("gpt-3.5") or model_engine.startswith("gpt-4")
-
-    def make_request(self, request: Request) -> RequestResult:
-        if self.api_key is None:
-            raise ValueError("OpenAI API key is required")
-
-        raw_request: Dict[str, Any]
-        if request.embedding:
-            raw_request = {
-                "input": request.prompt,
-                "engine": request.model_engine,
-            }
-        elif self._is_chat_model_engine(request.model_engine):
-            messages: Optional[List[Dict[str, str]]] = request.messages
-            if request.messages and len(request.messages) > 1:
-                # Checks that all messages have a role and some content
-                for message in request.messages:
-                    if not message.get("role") or not message.get("content"):
-                        raise ValueError("All messages must have a role and content")
-                # Checks that the last role is "user"
-                if request.messages[-1]["role"] != "user":
-                    raise ValueError("Last message must have role 'user'")
-                if request.prompt != "":
-                    hlog("WARNING: Since message is set, prompt will be ignored")
-            else:
-                # Convert prompt into a single message
-                # For now, put the whole prompt in a single user message, and expect the response
-                # to be returned in a single assistant message.
-                # TODO: Support ChatML for creating multiple messages with different roles.
-                # See: https://github.com/openai/openai-python/blob/main/chatml.md
-                messages = [{"role": "user", "content": request.prompt}]
-            raw_request = {
-                "model": request.model_engine,
-                "messages": messages,
-                "temperature": request.temperature,
-                "top_p": request.top_p,
-                "n": request.num_completions,
-                "stop": request.stop_sequences or None,  # API doesn't like empty list
-                # Note: Chat models may require adding an extra token to max_tokens
-                # for the internal special role token.
-                "max_tokens": request.max_tokens,
-                "presence_penalty": request.presence_penalty,
-                "frequency_penalty": request.frequency_penalty,
-            }
-        else:
-            raw_request = {
-                "engine": request.model_engine,
-                "prompt": request.prompt,
-                "temperature": request.temperature,
-                "n": request.num_completions,
-                "max_tokens": request.max_tokens,
-                "best_of": request.top_k_per_token,
-                "logprobs": request.top_k_per_token,
-                "stop": request.stop_sequences or None,  # API doesn't like empty list
-                "top_p": request.top_p,
-                "presence_penalty": request.presence_penalty,
-                "frequency_penalty": request.frequency_penalty,
-                "echo": request.echo_prompt,
-            }
-
-            # OpenAI doesn't let you ask for more completions than the number of
-            # per-token candidates.
-            raw_request["best_of"] = max(raw_request["best_of"], raw_request["n"])
-            raw_request["logprobs"] = max(raw_request["logprobs"], raw_request["n"])
-
-        try:
-            if request.embedding:
-
-                def do_it():
-                    openai.organization = self.org_id
-                    openai.api_key = self.api_key
-                    openai.api_base = self.api_base
-                    return openai.Embedding.create(**raw_request)
-
-            elif self._is_chat_model_engine(request.model_engine):
-
-                def do_it():
-                    openai.organization = self.org_id
-                    openai.api_key = self.api_key
-                    openai.api_base = self.api_base
-                    return openai.ChatCompletion.create(**raw_request)
-
-            else:
-
-                def do_it():
-                    # Following https://beta.openai.com/docs/api-reference/authentication
-                    # `organization` can be set to None.
-                    openai.organization = self.org_id
-                    openai.api_key = self.api_key
-                    openai.api_base = self.api_base
-                    openai.api_resources.completion.Completion.__bases__ = ORIGINAL_COMPLETION_ATTRIBUTES
-                    return openai.Completion.create(**raw_request)
-
-            cache_key = CachingClient.make_cache_key(raw_request, request)
-            response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
-        except openai.error.OpenAIError as e:
-            error: str = f"OpenAI error: {e}"
-            return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
-
-        # If the user is requesting completions instead of an embedding, then `completions`
-        # needs to be populated, and `embedding` should be an empty list and vice-versa.
-        embedding: List[float] = []
-        completions: List[Sequence] = []
-        tokens: List[Token]
-        if request.embedding:
-            # If the user is requesting an embedding instead of completion
-            # then completions would be left as an empty list. The embedding needs to be set.
-            embedding = response["data"][0]["embedding"]
-        elif self._is_chat_model_engine(request.model_engine):
-            for raw_completion in response["choices"]:
-                # The OpenAI chat completion API doesn't support echo.
-                # If `echo_prompt` is true, combine the prompt and completion.
-                raw_completion_content = raw_completion["message"]["content"]
-                text: str = request.prompt + raw_completion_content if request.echo_prompt else raw_completion_content
-                # The OpenAI chat completion API doesn't return us tokens or logprobs, so we tokenize ourselves.
-                tokenization_result: TokenizationRequestResult = self.tokenizer.tokenize(
-                    TokenizationRequest(text, tokenizer=self.tokenizer_name)
-                )
-                # Log probs are not currently not supported by the OpenAI chat completion API, so set to 0 for now.
-                tokens = [
-                    Token(text=cast(str, raw_token), logprob=0, top_logprobs={})
-                    for raw_token in tokenization_result.raw_tokens
-                ]
-                completion = Sequence(
-                    text=text,
-                    logprob=0,  # OpenAI does not provide logprobs
-                    tokens=tokens,
-                    finish_reason={"reason": raw_completion["finish_reason"]},
-                )
-                completions.append(truncate_sequence(completion, request))  # Truncate the text by stop sequences
-        else:
-            for raw_completion in response["choices"]:
-                sequence_logprob = 0
-                tokens = []
-
-                raw_data = raw_completion["logprobs"]
-                for text, logprob, top_logprobs in zip(
-                    raw_data["tokens"], raw_data["token_logprobs"], raw_data["top_logprobs"]
-                ):
-                    tokens.append(Token(text=text, logprob=logprob or 0, top_logprobs=dict(top_logprobs or {})))
-                    sequence_logprob += logprob or 0
-                completion = Sequence(
-                    text=raw_completion["text"],
-                    logprob=sequence_logprob,
-                    tokens=tokens,
-                    finish_reason={"reason": raw_completion["finish_reason"]},
-                )
-                # OpenAI sends us back tokens past the end of text token,
-                # so we need to manually truncate the list of tokens.
-                # TODO: filed an issue with their support to check what the expected behavior here is.
-                completion = truncate_sequence(
-                    completion, replace(request, stop_sequences=request.stop_sequences + [OpenAIClient.END_OF_TEXT])
-                )
-                completions.append(completion)
-
-        return RequestResult(
-            success=True,
-            cached=cached,
-            request_time=response["request_time"],
-            request_datetime=response.get("request_datetime"),
-            completions=completions,
-            embedding=embedding,
-        )
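
One subtle step in the deleted completions path is the clamp on `best_of` and `logprobs`: as the comment notes, the OpenAI completions API will not return more completions than per-token candidates, so both values are raised to at least `n`. A worked example of just that arithmetic (the values are illustrative):

# Illustrative values: the caller wants n=5 completions but top_k_per_token=1.
raw_request = {"best_of": 1, "logprobs": 1, "n": 5}
raw_request["best_of"] = max(raw_request["best_of"], raw_request["n"])
raw_request["logprobs"] = max(raw_request["logprobs"], raw_request["n"])
assert raw_request == {"best_of": 5, "logprobs": 5, "n": 5}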
--- a/helm/proxy/clients/simple_client.py
+++ /dev/null
@@ -1,60 +0,0 @@
-from typing import List, Dict
-
-from helm.common.cache import CacheConfig
-from helm.common.request import wrap_request_time, Request, RequestResult, Sequence, Token
-from helm.proxy.tokenizers.simple_tokenizer import SimpleTokenizer
-from .client import CachingClient
-
-
-class SimpleClient(CachingClient):
-    """Implements some "models" that just generate silly things quickly just to debug the infrastructure."""
-
-    def __init__(self, cache_config: CacheConfig):
-        super().__init__(cache_config=cache_config)
-
-    def make_request(self, request: Request) -> RequestResult:
-        raw_request = {
-            "engine": request.model_engine,
-            "prompt": request.prompt,
-            "n": request.num_completions,
-        }
-
-        if request.model_engine == "model1":
-
-            def do_it():
-                return self.invoke_model1(raw_request)
-
-            cache_key = CachingClient.make_cache_key(raw_request, request)
-            response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
-            completions = [
-                Sequence(
-                    text=text,
-                    logprob=logprob,
-                    tokens=[Token(text=text, logprob=logprob, top_logprobs=response["completions"])],
-                )
-                for text, logprob in response["completions"].items()
-            ]
-        else:
-            raise ValueError(f"Invalid model: {request.model}")
-
-        return RequestResult(
-            success=True,
-            cached=False,
-            request_time=0,
-            request_datetime=response.get("request_datetime"),
-            completions=completions,
-            embedding=[],
-        )
-
-    def invoke_model1(self, raw_request: Dict) -> Dict:
-        """
-        Example: 7 2 4 6
-        Completions (num_completions = 3):
-        - 6
-        - 4
-        - 2
-        """
-        prompt_tokens: List[str] = SimpleTokenizer.tokenize_by_space(raw_request["prompt"])
-        choices = reversed(prompt_tokens[-raw_request["n"] :])
-        response = {"completions": dict((text, -i) for i, text in enumerate(choices))}
-        return response
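
The `invoke_model1` docstring above compresses the behavior into a single example; a standalone restatement (assuming whitespace tokenization, which the name `SimpleTokenizer.tokenize_by_space` suggests) makes it checkable:

# Self-contained restatement of invoke_model1: return the last n prompt
# tokens, reversed, with logprobs 0, -1, -2, ...
def invoke_model1(prompt: str, n: int) -> dict:
    prompt_tokens = prompt.split(" ")
    choices = reversed(prompt_tokens[-n:])
    return {"completions": {text: -i for i, text in enumerate(choices)}}

# Matches the docstring example: prompt "7 2 4 6" with num_completions = 3.
assert invoke_model1("7 2 4 6", 3) == {"completions": {"6": 0, "4": -1, "2": -2}}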
--- a/helm/proxy/clients/test_client.py
+++ /dev/null
@@ -1,49 +0,0 @@
-from .client import truncate_sequence
-from typing import List
-from helm.common.request import Request, Sequence, Token
-
-
-def truncate_sequence_helper(tokens: List[str], request: Request, expected_tokens: List[str]):
-    sequence = Sequence(
-        text="".join(tokens),
-        tokens=[Token(text=text, logprob=-1, top_logprobs={}) for text in tokens],
-        logprob=-len(tokens),
-    )
-
-    output_sequence = truncate_sequence(sequence, request)
-
-    assert expected_tokens == [token.text for token in output_sequence.tokens]
-    assert "".join(expected_tokens) == output_sequence.text
-    assert output_sequence.logprob == sum(token.logprob for token in output_sequence.tokens)
-
-
-def test_truncate_sequence():
-    # echo_prompt = True, nothing gets truncated
-    truncate_sequence_helper(
-        ["a", "b", "c"],
-        Request(
-            model="openai/text-davinci-002", model_deployment="openai/text-davinci-002", prompt="abc", echo_prompt=True
-        ),
-        ["a", "b", "c"],
-    )
-
-    # Nothing gets truncated
-    truncate_sequence_helper(
-        ["hello", " world"],
-        Request(model="openai/text-davinci-002", model_deployment="openai/text-davinci-002", stop_sequences=["#"]),
-        ["hello", " world"],
-    )
-
-    # Truncate using stop sequences
-    truncate_sequence_helper(
-        ["hello", " world", "\n", "what"],
-        Request(model="openai/text-davinci-002", model_deployment="openai/text-davinci-002", stop_sequences=["\n"]),
-        ["hello", " world"],
-    )
-
-    # Truncate using max tokens
-    truncate_sequence_helper(
-        ["a", "b", "c"],
-        Request(model="openai/text-davinci-002", model_deployment="openai/text-davinci-002", max_tokens=2),
-        ["a", "b"],
-    )
--- a/helm/proxy/clients/vertexai_client.py
+++ /dev/null
@@ -1,115 +0,0 @@
-import requests
-from typing import List
-
-from helm.common.cache import CacheConfig
-from helm.common.optional_dependencies import handle_module_not_found_error
-from helm.common.request import wrap_request_time, Request, RequestResult, Sequence, Token
-from helm.common.tokenization_request import (
-    TokenizationRequest,
-    TokenizationRequestResult,
-)
-from helm.proxy.tokenizers.tokenizer import Tokenizer
-from .client import CachingClient, truncate_sequence
-
-try:
-    import vertexai
-    from vertexai.language_models import TextGenerationModel, TextGenerationResponse
-except ModuleNotFoundError as e:
-    handle_module_not_found_error(e, ["google"])
-
-
-class VertexAIClient(CachingClient):
-    def __init__(
-        self, tokenizer: Tokenizer, tokenizer_name: str, cache_config: CacheConfig, project_id: str, location: str
-    ) -> None:
-        super().__init__(cache_config=cache_config)
-        self.project_id = project_id
-        self.location = location
-        self.tokenizer = tokenizer
-        self.tokenizer_name = tokenizer_name
-
-        vertexai.init(project=self.project_id, location=self.location)
-
-    def make_request(self, request: Request) -> RequestResult:
-        """Make a request"""
-        parameters = {
-            "temperature": request.temperature,
-            "max_output_tokens": request.max_tokens,
-            "top_k": request.top_k_per_token,
-            "top_p": request.top_p,
-            "stop_sequences": request.stop_sequences,
-            "candidate_count": request.num_completions,
-            # TODO #2084: Add support for these parameters.
-            # The parameters "echo", "frequency_penalty", and "presence_penalty" are supposed to be supported
-            # in an HTTP request (See https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/text),
-            # but they are not supported in the Python SDK:
-            # https://github.com/googleapis/python-aiplatform/blob/beae48f63e40ea171c3f1625164569e7311b8e5a/vertexai/language_models/_language_models.py#L968C1-L980C1
-            # "frequency_penalty": request.frequency_penalty,
-            # "presence_penalty": request.presence_penalty,
-            # "echo": request.echo_prompt,
-        }
-
-        completions: List[Sequence] = []
-        model_name: str = request.model_engine
-
-        try:
-
-            def do_it():
-                model = TextGenerationModel.from_pretrained(model_name)
-                response = model.predict(request.prompt, **parameters)
-                candidates: List[TextGenerationResponse] = response.candidates
-                response_dict = {
-                    "predictions": [{"text": completion.text for completion in candidates}],
-                }  # TODO: Extract more information from the response
-                return response_dict
-
-            # We need to include the engine's name to differentiate among requests made for different model
-            # engines since the engine name is not included in the request itself.
-            # Same for the prompt.
-            cache_key = CachingClient.make_cache_key(
-                {
-                    "engine": request.model_engine,
-                    "prompt": request.prompt,
-                    **parameters,
-                },
-                request,
-            )
-
-            response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
-        except (requests.exceptions.RequestException, AssertionError) as e:
-            error: str = f"VertexAIClient error: {e}"
-            return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
-
-        for prediction in response["predictions"]:
-            response_text = prediction["text"]
-
-            # The Python SDK does not support echo
-            # TODO #2084: Add support for echo.
-            text: str = request.prompt + response_text if request.echo_prompt else response_text
-
-            tokenization_result: TokenizationRequestResult = self.tokenizer.tokenize(
-                TokenizationRequest(text, tokenizer=self.tokenizer_name)
-            )
-
-            # TODO #2085: Add support for log probs.
-            # Once again, log probs seem to be supported by the API but not by the Python SDK.
-            # HTTP Response body reference:
-            # https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/text#response_body
-            # Python SDK reference:
-            # https://github.com/googleapis/python-aiplatform/blob/beae48f63e40ea171c3f1625164569e7311b8e5a/vertexai/language_models/_language_models.py#L868
-            tokens: List[Token] = [
-                Token(text=str(text), logprob=0, top_logprobs={}) for text in tokenization_result.raw_tokens
-            ]
-
-            completion = Sequence(text=response_text, logprob=0, tokens=tokens)
-            sequence = truncate_sequence(completion, request, print_warning=True)
-            completions.append(sequence)
-
-        return RequestResult(
-            success=True,
-            cached=cached,
-            request_time=response["request_time"],
-            request_datetime=response["request_datetime"],
-            completions=completions,
-            embedding=[],
-        )
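
Note the cache-key construction in the deleted client: the Vertex AI SDK receives the model name and prompt as call arguments rather than as entries in `parameters`, so both are merged back in for cache keys to distinguish requests. A hedged sketch of why that matters (engine name and values are illustrative):

# Without "engine" and "prompt", these two requests would collide in the cache.
parameters = {"temperature": 0.0, "max_output_tokens": 16}  # illustrative
key_a = {"engine": "some-text-model", "prompt": "Hi", **parameters}
key_b = {"engine": "some-text-model", "prompt": "Bye", **parameters}
assert key_a != key_b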
--- a/helm/proxy/token_counters/ai21_token_counter.py
+++ /dev/null
@@ -1,20 +0,0 @@
-from typing import List
-
-from helm.common.request import Request, Sequence
-from .token_counter import TokenCounter
-
-
-class AI21TokenCounter(TokenCounter):
-    def count_tokens(self, request: Request, completions: List[Sequence]) -> int:
-        """
-        Counts the number of generated tokens and NOT the number of tokens in the prompt
-        (https://studio.ai21.com/docs/calculating-usage).
-
-        The AI21 documentation (https://studio.ai21.com/docs/calculating-usage/) defines
-        generated tokens as:
-        "the total number of all completion tokens you generate. For example, assume you post
-        a complete request for J1-Jumbo with a prompt consisting of 10 tokens and requiring 3
-        completions, i.e. numResults = 3, and the model generates completions with 5, 15, and
-        20 tokens. In total this request will consume 5+15+20=40 generated tokens."
-        """
-        return sum(len(sequence.tokens) for sequence in completions)
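
The accounting rule in the docstring is easy to verify: only completion tokens count, summed across all `numResults` completions, and the prompt is excluded. A toy check using plain lists in place of helm's `Sequence`/`Token` objects:

# Three completions of 5, 15, and 20 tokens; the 10-token prompt is not billed.
completions = [["tok"] * 5, ["tok"] * 15, ["tok"] * 20]
assert sum(len(tokens) for tokens in completions) == 5 + 15 + 20 == 40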
--- a/helm/proxy/token_counters/cohere_token_counter.py
+++ /dev/null
@@ -1,13 +0,0 @@
-from typing import List
-
-from helm.common.request import Request, Sequence
-from .token_counter import TokenCounter
-
-
-class CohereTokenCounter(TokenCounter):
-    def count_tokens(self, request: Request, completions: List[Sequence]) -> int:
-        """
-        Counts the number of generated tokens.
-        TODO: Cohere simply counts the number of generations, but we currently only support counting tokens.
-        """
-        return sum(len(sequence.tokens) for sequence in completions)
--- a/helm/proxy/token_counters/free_token_counter.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from typing import List
-
-from helm.common.request import Request, Sequence
-from .token_counter import TokenCounter
-
-
-class FreeTokenCounter(TokenCounter):
-    """For when we don't care about keeping track of the number of tokens."""
-
-    def count_tokens(self, request: Request, completions: List[Sequence]) -> int:
-        """No need to count tokens, since it's free. Return 0."""
-        return 0