crfm-helm 0.4.0__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (499)
  1. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/METADATA +138 -31
  2. crfm_helm-0.5.1.dist-info/RECORD +654 -0
  3. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +31 -3
  5. helm/benchmark/adaptation/adapters/adapter.py +2 -2
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -4
  9. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +2 -3
  10. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  11. helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
  12. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  13. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +2 -1
  14. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
  15. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  16. helm/benchmark/adaptation/adapters/test_generation_adapter.py +32 -8
  17. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +7 -19
  18. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +60 -6
  19. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  20. helm/benchmark/adaptation/request_state.py +6 -1
  21. helm/benchmark/adaptation/scenario_state.py +6 -2
  22. helm/benchmark/annotation/annotator.py +43 -0
  23. helm/benchmark/annotation/annotator_factory.py +61 -0
  24. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  25. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  26. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  27. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  28. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  29. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  30. helm/benchmark/annotation_executor.py +124 -0
  31. helm/benchmark/augmentations/data_augmenter.py +0 -2
  32. helm/benchmark/augmentations/gender_perturbation.py +1 -1
  33. helm/benchmark/augmentations/perturbation.py +25 -3
  34. helm/benchmark/augmentations/perturbation_description.py +1 -1
  35. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  36. helm/benchmark/augmentations/test_perturbation.py +41 -7
  37. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  38. helm/benchmark/config_registry.py +7 -1
  39. helm/benchmark/executor.py +46 -16
  40. helm/benchmark/huggingface_registration.py +20 -7
  41. helm/benchmark/metrics/basic_metrics.py +169 -664
  42. helm/benchmark/metrics/bbq_metrics.py +3 -4
  43. helm/benchmark/metrics/bias_metrics.py +6 -6
  44. helm/benchmark/metrics/classification_metrics.py +11 -8
  45. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  46. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  47. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  48. helm/benchmark/metrics/common_metric_specs.py +167 -0
  49. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  50. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  51. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  52. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  53. helm/benchmark/metrics/disinformation_metrics.py +4 -110
  54. helm/benchmark/metrics/dry_run_metrics.py +2 -2
  55. helm/benchmark/metrics/efficiency_metrics.py +213 -0
  56. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  57. helm/benchmark/metrics/evaluate_reference_metrics.py +392 -0
  58. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  59. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  60. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  61. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  62. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  63. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  64. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  65. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  66. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  67. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  68. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  69. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  70. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  71. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  72. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  73. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  74. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  75. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  76. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  77. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  78. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  79. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  80. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  81. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  82. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  83. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  84. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  85. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  86. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  87. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  88. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  89. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  90. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  91. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  92. helm/benchmark/metrics/machine_translation_metrics.py +89 -0
  93. helm/benchmark/metrics/metric.py +93 -172
  94. helm/benchmark/metrics/metric_name.py +0 -1
  95. helm/benchmark/metrics/metric_service.py +16 -0
  96. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  97. helm/benchmark/metrics/ranking_metrics.py +2 -2
  98. helm/benchmark/metrics/reference_metric.py +148 -0
  99. helm/benchmark/metrics/summac/model_summac.py +0 -2
  100. helm/benchmark/metrics/summarization_metrics.py +2 -2
  101. helm/benchmark/metrics/test_classification_metrics.py +8 -5
  102. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  103. helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +5 -1
  104. helm/benchmark/metrics/test_metric.py +2 -2
  105. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +10 -2
  106. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  107. helm/benchmark/metrics/toxicity_utils.py +23 -0
  108. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  109. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  110. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  111. helm/benchmark/metrics/vision_language/image_metrics.py +575 -0
  112. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  113. helm/benchmark/model_deployment_registry.py +74 -0
  114. helm/benchmark/model_metadata_registry.py +41 -1
  115. helm/benchmark/multi_gpu_runner.py +133 -0
  116. helm/benchmark/presentation/create_plots.py +8 -7
  117. helm/benchmark/presentation/run_display.py +26 -10
  118. helm/benchmark/presentation/schema.py +15 -40
  119. helm/benchmark/presentation/summarize.py +119 -79
  120. helm/benchmark/presentation/table.py +8 -8
  121. helm/benchmark/presentation/test_contamination.py +2 -2
  122. helm/benchmark/presentation/test_run_entry.py +1 -2
  123. helm/benchmark/presentation/test_summarize.py +3 -3
  124. helm/benchmark/run.py +54 -26
  125. helm/benchmark/run_expander.py +205 -35
  126. helm/benchmark/run_spec.py +93 -0
  127. helm/benchmark/run_spec_factory.py +163 -0
  128. helm/benchmark/run_specs/__init__.py +0 -0
  129. helm/benchmark/run_specs/classic_run_specs.py +1510 -0
  130. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  131. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  132. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  133. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  134. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  135. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  136. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  137. helm/benchmark/run_specs/vlm_run_specs.py +757 -0
  138. helm/benchmark/runner.py +51 -57
  139. helm/benchmark/runner_config_registry.py +21 -0
  140. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  141. helm/benchmark/scenarios/bold_scenario.py +2 -2
  142. helm/benchmark/scenarios/code_scenario.py +1 -0
  143. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  144. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  145. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  146. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  147. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  148. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  149. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  150. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  151. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  152. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  153. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  154. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  155. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  156. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  157. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  158. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  159. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  160. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  161. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  162. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  163. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  164. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  165. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  166. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  167. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  168. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  169. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  170. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  171. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  172. helm/benchmark/scenarios/legalbench_scenario.py +6 -2
  173. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  174. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  175. helm/benchmark/scenarios/math_scenario.py +19 -2
  176. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  177. helm/benchmark/scenarios/numeracy_scenario.py +1 -1
  178. helm/benchmark/scenarios/opinions_qa_scenario.py +0 -4
  179. helm/benchmark/scenarios/scenario.py +4 -0
  180. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  181. helm/benchmark/scenarios/test_math_scenario.py +6 -0
  182. helm/benchmark/scenarios/test_scenario.py +6 -3
  183. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  184. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  185. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  186. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  187. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  188. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
  189. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  190. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
  191. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
  192. helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
  193. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +94 -0
  194. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  195. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  196. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  197. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  198. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  199. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  200. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  201. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  202. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  203. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  204. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  205. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  206. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
  207. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  208. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
  209. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  210. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  211. helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
  212. helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
  213. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  214. helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
  215. helm/benchmark/scenarios/vision_language/pairs_scenario.py +246 -0
  216. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  217. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  218. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  219. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +3 -4
  220. helm/benchmark/scenarios/vision_language/vqa_scenario.py +5 -3
  221. helm/benchmark/scenarios/wmt_14_scenario.py +1 -1
  222. helm/benchmark/server.py +24 -1
  223. helm/benchmark/slurm_runner.py +70 -49
  224. helm/benchmark/static/benchmarking.js +1 -1
  225. helm/benchmark/static/schema_classic.yaml +258 -1066
  226. helm/benchmark/static/schema_image2structure.yaml +304 -0
  227. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  228. helm/benchmark/static/schema_lite.yaml +2 -227
  229. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  230. helm/benchmark/static/schema_unitxt.yaml +428 -0
  231. helm/benchmark/static/schema_vhelm_lite.yaml +164 -0
  232. helm/benchmark/static/schema_vlm.yaml +823 -0
  233. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  234. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  235. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  236. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  237. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  238. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  239. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  240. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  241. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  242. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  243. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  244. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  245. helm/benchmark/static_build/assets/index-737eef9e.js +10 -0
  246. helm/benchmark/static_build/assets/index-878a1094.css +1 -0
  247. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  248. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  249. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  250. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  251. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  252. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  253. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  254. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  255. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  256. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  257. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  258. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  259. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  260. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  261. helm/benchmark/static_build/config.js +4 -0
  262. helm/benchmark/static_build/index.html +20 -0
  263. helm/benchmark/test_data_preprocessor.py +3 -3
  264. helm/benchmark/test_run_expander.py +1 -1
  265. helm/benchmark/window_services/ai21_window_service.py +22 -33
  266. helm/benchmark/window_services/cohere_window_service.py +1 -63
  267. helm/benchmark/window_services/default_window_service.py +2 -44
  268. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  269. helm/benchmark/window_services/ice_window_service.py +0 -34
  270. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  271. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  272. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  273. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  274. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  275. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  276. helm/benchmark/window_services/local_window_service.py +21 -4
  277. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  278. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  279. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  280. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  281. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  282. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  283. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  284. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  285. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  286. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  287. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  288. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  289. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  290. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  291. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  292. helm/benchmark/window_services/test_utils.py +3 -2
  293. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  294. helm/benchmark/window_services/window_service.py +42 -0
  295. helm/benchmark/window_services/window_service_factory.py +4 -1
  296. helm/benchmark/window_services/yalm_window_service.py +0 -27
  297. helm/clients/__init__.py +0 -0
  298. helm/{proxy/clients → clients}/ai21_client.py +3 -9
  299. helm/clients/aleph_alpha_client.py +112 -0
  300. helm/{proxy/clients → clients}/anthropic_client.py +233 -18
  301. helm/{proxy/clients → clients}/auto_client.py +59 -31
  302. helm/clients/bedrock_client.py +128 -0
  303. helm/clients/bedrock_utils.py +72 -0
  304. helm/{proxy/clients → clients}/client.py +65 -7
  305. helm/clients/clip_score_client.py +49 -0
  306. helm/clients/clip_scorers/__init__.py +0 -0
  307. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  308. helm/clients/clip_scorers/clip_scorer.py +50 -0
  309. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  310. helm/{proxy/clients → clients}/cohere_client.py +4 -11
  311. helm/clients/gcs_client.py +82 -0
  312. helm/{proxy/clients → clients}/google_client.py +5 -5
  313. helm/clients/google_translate_client.py +35 -0
  314. helm/{proxy/clients → clients}/http_model_client.py +5 -7
  315. helm/{proxy/clients → clients}/huggingface_client.py +43 -64
  316. helm/clients/image_generation/__init__.py +0 -0
  317. helm/clients/image_generation/adobe_vision_client.py +78 -0
  318. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  319. helm/clients/image_generation/cogview2/__init__.py +0 -0
  320. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  321. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  322. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  323. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  324. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  325. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  326. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  327. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  328. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  329. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  330. helm/clients/image_generation/cogview2_client.py +191 -0
  331. helm/clients/image_generation/dalle2_client.py +192 -0
  332. helm/clients/image_generation/dalle3_client.py +108 -0
  333. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  334. helm/clients/image_generation/dalle_mini/data.py +442 -0
  335. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  336. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  337. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  338. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  339. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  340. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  341. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  342. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  343. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  344. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  345. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  346. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  347. helm/clients/image_generation/dalle_mini_client.py +190 -0
  348. helm/clients/image_generation/deep_floyd_client.py +78 -0
  349. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  350. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  351. helm/clients/image_generation/lexica_client.py +86 -0
  352. helm/clients/image_generation/mindalle/__init__.py +0 -0
  353. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  354. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  355. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  356. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  357. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  358. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  359. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  360. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  361. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  362. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  363. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  364. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  365. helm/clients/image_generation/mindalle_client.py +115 -0
  366. helm/clients/image_generation/nudity_check_client.py +64 -0
  367. helm/clients/image_generation/together_image_generation_client.py +111 -0
  368. helm/{proxy/clients → clients}/lit_gpt_client.py +4 -4
  369. helm/{proxy/clients → clients}/megatron_client.py +5 -5
  370. helm/clients/mistral_client.py +134 -0
  371. helm/clients/moderation_api_client.py +109 -0
  372. helm/clients/open_lm_client.py +43 -0
  373. helm/clients/openai_client.py +301 -0
  374. helm/{proxy/clients → clients}/palmyra_client.py +6 -8
  375. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  376. helm/clients/simple_client.py +64 -0
  377. helm/{proxy/clients → clients}/test_auto_client.py +13 -15
  378. helm/clients/test_client.py +100 -0
  379. helm/{proxy/clients → clients}/test_huggingface_client.py +15 -16
  380. helm/clients/test_simple_client.py +19 -0
  381. helm/{proxy/clients → clients}/test_together_client.py +20 -8
  382. helm/{proxy/clients → clients}/together_client.py +104 -73
  383. helm/clients/vertexai_client.py +400 -0
  384. helm/clients/vision_language/__init__.py +0 -0
  385. helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
  386. helm/clients/vision_language/huggingface_vlm_client.py +111 -0
  387. helm/{proxy/clients → clients}/vision_language/idefics_client.py +54 -49
  388. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  389. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  390. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  391. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  392. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  393. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  394. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  395. helm/clients/vision_language/open_flamingo_client.py +155 -0
  396. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  397. helm/clients/vllm_client.py +46 -0
  398. helm/common/cache.py +16 -4
  399. helm/common/cache_backend_config.py +47 -0
  400. helm/common/clip_score_request.py +41 -0
  401. helm/common/file_caches/__init__.py +0 -0
  402. helm/common/file_caches/file_cache.py +16 -0
  403. helm/common/file_caches/local_file_cache.py +61 -0
  404. helm/common/file_caches/test_local_file_cache.py +25 -0
  405. helm/common/file_upload_request.py +27 -0
  406. helm/common/general.py +1 -1
  407. helm/common/image_generation_parameters.py +25 -0
  408. helm/common/images_utils.py +33 -3
  409. helm/common/key_value_store.py +35 -4
  410. helm/common/media_object.py +13 -0
  411. helm/common/moderations_api_request.py +71 -0
  412. helm/common/mongo_key_value_store.py +3 -3
  413. helm/common/multimodal_request_utils.py +31 -0
  414. helm/common/nudity_check_request.py +29 -0
  415. helm/common/request.py +15 -17
  416. helm/common/test_general.py +6 -0
  417. helm/common/tokenization_request.py +1 -1
  418. helm/config/model_deployments.yaml +1159 -538
  419. helm/config/model_metadata.yaml +868 -41
  420. helm/config/tokenizer_configs.yaml +149 -43
  421. helm/proxy/accounts.py +31 -4
  422. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  423. helm/proxy/critique/model_critique_client.py +8 -6
  424. helm/proxy/example_queries.py +29 -17
  425. helm/proxy/server.py +70 -5
  426. helm/proxy/services/remote_service.py +31 -0
  427. helm/proxy/services/server_service.py +96 -16
  428. helm/proxy/services/service.py +30 -0
  429. helm/proxy/services/test_remote_service.py +4 -3
  430. helm/proxy/services/test_service.py +0 -12
  431. helm/proxy/test_accounts.py +32 -0
  432. helm/proxy/token_counters/auto_token_counter.py +37 -37
  433. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  434. helm/proxy/token_counters/token_counter.py +3 -5
  435. helm/tokenizers/__init__.py +0 -0
  436. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  437. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +1 -1
  438. helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +6 -9
  439. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  440. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  441. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +7 -26
  442. helm/tokenizers/simple_tokenizer.py +33 -0
  443. helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +1 -1
  444. helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +3 -0
  445. helm/tokenizers/test_simple_tokenizer.py +33 -0
  446. helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
  447. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  448. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  449. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  450. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  451. crfm_helm-0.4.0.dist-info/RECORD +0 -397
  452. helm/benchmark/run_specs.py +0 -2762
  453. helm/benchmark/test_model_deployment_definition.py +0 -92
  454. helm/benchmark/test_model_properties.py +0 -1570
  455. helm/benchmark/vlm_run_specs.py +0 -97
  456. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  457. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  458. helm/benchmark/window_services/huggingface_window_service.py +0 -60
  459. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  460. helm/benchmark/window_services/t511b_window_service.py +0 -30
  461. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  462. helm/benchmark/window_services/ul2_window_service.py +0 -30
  463. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  464. helm/common/cache_utils.py +0 -14
  465. helm/proxy/clients/aleph_alpha_client.py +0 -95
  466. helm/proxy/clients/goose_ai_client.py +0 -99
  467. helm/proxy/clients/microsoft_client.py +0 -180
  468. helm/proxy/clients/openai_client.py +0 -206
  469. helm/proxy/clients/simple_client.py +0 -60
  470. helm/proxy/clients/test_client.py +0 -49
  471. helm/proxy/clients/vertexai_client.py +0 -115
  472. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  473. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  474. helm/proxy/token_counters/free_token_counter.py +0 -12
  475. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  476. helm/proxy/token_counters/openai_token_counter.py +0 -22
  477. helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
  478. helm/proxy/token_counters/test_openai_token_counter.py +0 -81
  479. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  480. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/LICENSE +0 -0
  481. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/entry_points.txt +0 -0
  482. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/top_level.txt +0 -0
  483. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  484. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  485. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  486. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  487. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  488. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  489. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  490. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  491. /helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +0 -0
  492. /helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +0 -0
  493. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  494. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  495. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  496. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  497. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  498. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  499. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
helm/clients/mistral_client.py
@@ -0,0 +1,134 @@
+ import requests
+ from typing import Any, Dict, List, Optional, TypedDict
+
+ from helm.proxy.retry import NonRetriableException
+ from helm.common.cache import CacheConfig
+ from helm.common.optional_dependencies import handle_module_not_found_error
+ from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput
+ from helm.tokenizers.tokenizer import Tokenizer
+ from .client import CachingClient, truncate_and_tokenize_response_text
+
+ try:
+     from mistralai.client import MistralClient
+     from mistralai.models.chat_completion import ChatMessage, ChatCompletionResponse
+ except ModuleNotFoundError as e:
+     handle_module_not_found_error(e, ["mistral"])
+
+
+ class MistralAIRequest(TypedDict):
+     """Data passed between make_request and _send_request. Used as the cache key."""
+
+     model: str
+     prompt: str
+     max_tokens: int
+     temperature: float
+     top_p: float
+     random_seed: Optional[int]
+
+
+ class MistralAIClient(CachingClient):
+     """
+     Client for Mistral API.
+     """
+
+     def __init__(
+         self,
+         tokenizer: Tokenizer,
+         tokenizer_name: str,
+         cache_config: CacheConfig,
+         api_key: str,
+         mistral_model: Optional[str] = None,
+     ):
+         super().__init__(cache_config=cache_config)
+         self.api_key: str = api_key
+         self.tokenizer = tokenizer
+         self.tokenizer_name = tokenizer_name
+         self._client = MistralClient(api_key=self.api_key)
+         self.mistral_model = mistral_model
+
+     def _send_request(self, raw_request: MistralAIRequest) -> Dict[str, Any]:
+         messages = [ChatMessage(role="user", content=raw_request["prompt"])]
+
+         chat_response: ChatCompletionResponse = self._client.chat(
+             model=raw_request["model"],
+             messages=messages,
+             temperature=raw_request["temperature"],
+             max_tokens=raw_request["max_tokens"],
+             top_p=raw_request["top_p"],
+             random_seed=raw_request["random_seed"],
+             safe_prompt=False,  # Disable safe_prompt
+         )
+         # Documentation: "If mode is 'json', the output will only contain JSON serializable types."
+         # Source: https://docs.pydantic.dev/latest/api/base_model/#pydantic.BaseModel.model_dump
+         #
+         # We need to ensure that the output only contains JSON serializable types because the output
+         # will be serialized for storage in the cache.
+         return chat_response.model_dump(mode="json")
+
+     def _get_random_seed(self, request: Request, completion_index: int) -> Optional[int]:
+         if request.random is None and completion_index == 0:
+             return None
+
+         # Treat the user's request.random as an integer for the random seed.
+         try:
+             request_random_seed = int(request.random) if request.random is not None else 0
+         except ValueError:
+             raise NonRetriableException("MistralAIClient only supports integer values for request.random")
+
+         # A large prime is used so that the resulting values are unlikely to collide
+         # with request.random values chosen by the user.
+         fixed_large_prime = 1911011
+         completion_index_random_seed = completion_index * fixed_large_prime
+
+         return request_random_seed + completion_index_random_seed
+
+     def make_request(self, request: Request) -> RequestResult:
+         """Make a request"""
+         completions: List[GeneratedOutput] = []
+
+         # `num_completions` is not supported, so instead make `num_completions` separate requests.
+         for completion_index in range(request.num_completions):
+             try:
+                 raw_request: MistralAIRequest = {
+                     "model": self.mistral_model or request.model_engine,
+                     "prompt": request.prompt,
+                     "max_tokens": request.max_tokens,
+                     "temperature": request.temperature,
+                     "top_p": request.top_p,
+                     "random_seed": self._get_random_seed(request, completion_index),
+                 }
+
+                 def do_it() -> Dict[str, Any]:
+                     result: Dict[str, Any] = self._send_request(raw_request)
+                     return result
+
+                 # We need to include the engine's name to differentiate among requests made for different model
+                 # engines since the engine name is not included in the request itself.
+                 # In addition, we want to make `request.num_completions` fresh
+                 # requests, so the cache key should contain the completion_index.
+                 # Echoing the original prompt is not officially supported by Mistral. We instead prepend the
+                 # completion with the prompt when `echo_prompt` is true, so keep track of it in the cache key.
+                 cache_key = CachingClient.make_cache_key(raw_request, request)
+
+                 response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
+             except (requests.exceptions.RequestException, AssertionError) as e:
+                 error: str = f"MistralClient error: {e}"
+                 return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
+
+             response_message: Dict[str, Any] = response["choices"][0]["message"]
+             assert response_message["role"] == "assistant"
+             response_text: str = response_message["content"]
+
+             # The Mistral API doesn't support echo. If `echo_prompt` is true, combine the prompt and completion.
+             text: str = request.prompt + response_text if request.echo_prompt else response_text
+             sequence = truncate_and_tokenize_response_text(text, request, self.tokenizer, self.tokenizer_name)
+             completions.append(sequence)
+
+         return RequestResult(
+             success=True,
+             cached=cached,
+             request_time=response["request_time"],
+             request_datetime=response["request_datetime"],
+             completions=completions,
+             embedding=[],
+         )
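
A note on the seeding scheme above: `_get_random_seed` derives a distinct seed per completion so repeated requests are fresh but reproducible. The standalone sketch below (not part of the diff; names are illustrative) restates that logic.

# Illustrative sketch of MistralAIClient._get_random_seed: completion_index
# is mixed in via a large prime so derived seeds are unlikely to collide
# with user-chosen request.random values.
FIXED_LARGE_PRIME = 1911011

def derive_seed(user_seed, completion_index):
    if user_seed is None and completion_index == 0:
        return None  # no user seed and first completion: let the API choose
    base = int(user_seed) if user_seed is not None else 0
    return base + completion_index * FIXED_LARGE_PRIME

print([derive_seed("42", i) for i in range(3)])  # [42, 1911053, 3822064]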
helm/clients/moderation_api_client.py
@@ -0,0 +1,109 @@
+ from typing import Any, Dict
+
+ from helm.common.request import wrap_request_time
+ from helm.common.cache import Cache, CacheConfig
+ from helm.common.moderations_api_request import (
+     ModerationCategoryScores,
+     ModerationCategoryFlaggedResults,
+     ModerationAPIRequest,
+     ModerationAPIRequestResult,
+ )
+ from helm.common.optional_dependencies import handle_module_not_found_error
+
+
+ class ModerationAPIClient:
+     """
+     From https://beta.openai.com/docs/guides/moderation/overview, the moderation endpoint is a tool
+     to check whether content complies with OpenAI's content policy. Developers can thus identify content
+     that OpenAI's content policy prohibits and take action, for instance by filtering it.
+     """
+
+     # For descriptions of the models, see https://beta.openai.com/docs/api-reference/moderations/create
+     LATEST_MODEL: str = "text-moderation-latest"
+     STABLE_MODEL: str = "text-moderation-stable"
+
+     # List of categories (https://beta.openai.com/docs/guides/moderation/overview)
+     HATE: str = "hate"
+     HATE_THREATENING: str = "hate/threatening"
+     SELF_HARM: str = "self-harm"
+     SEXUAL: str = "sexual"
+     SEXUAL_MINORS: str = "sexual/minors"
+     VIOLENCE: str = "violence"
+     VIOLENCE_GRAPHIC: str = "violence/graphic"
+
+     def __init__(self, api_key: str, cache_config: CacheConfig):
+         self.cache = Cache(cache_config)
+         try:
+             from openai import OpenAI
+         except ModuleNotFoundError as e:
+             handle_module_not_found_error(e, ["openai"])
+         # TODO: Add OpenAI organization.
+         self.client = OpenAI(api_key=api_key)
+
+     def get_moderation_results(self, request: ModerationAPIRequest) -> ModerationAPIRequestResult:
+         """
+         Sends a request to OpenAI's moderation endpoint.
+         https://beta.openai.com/docs/api-reference/moderations/create
+         """
+         try:
+             import openai
+         except ModuleNotFoundError as e:
+             handle_module_not_found_error(e, ["openai"])
+
+         raw_request: Dict[str, str] = {
+             "input": request.text,
+             "model": self.LATEST_MODEL if request.use_latest_model else self.STABLE_MODEL,
+         }
+
+         try:
+
+             def do_it() -> Dict[str, Any]:
+                 result = self.client.moderations.create(input=request.text).model_dump(mode="json")
+                 assert "results" in result and len(result["results"]) > 0, f"Invalid response: {result}"
+                 return result
+
+             response, cached = self.cache.get(raw_request, wrap_request_time(do_it))
+         except openai.OpenAIError as e:
+             error: str = f"Moderation API error: {e}"
+             return ModerationAPIRequestResult(
+                 success=False, cached=False, error=error, flagged=None, flagged_results=None, scores=None
+             )
+
+         moderation_result = response["results"][0]
+         category_results: Dict[str, bool] = moderation_result["categories"]
+         score_results: Dict[str, float] = moderation_result["category_scores"]
+
+         flagged_results = ModerationCategoryFlaggedResults(
+             hate_flagged=category_results[self.HATE],
+             hate_threatening_flagged=category_results[self.HATE_THREATENING],
+             self_harm_flagged=category_results[self.SELF_HARM],
+             sexual_flagged=category_results[self.SEXUAL],
+             sexual_minors_flagged=category_results[self.SEXUAL_MINORS],
+             violence_flagged=category_results[self.VIOLENCE],
+             violence_graphic_flagged=category_results[self.VIOLENCE_GRAPHIC],
+         )
+         scores = ModerationCategoryScores(
+             hate_score=score_results[self.HATE],
+             hate_threatening_score=score_results[self.HATE_THREATENING],
+             self_harm_score=score_results[self.SELF_HARM],
+             sexual_score=score_results[self.SEXUAL],
+             sexual_minors_score=score_results[self.SEXUAL_MINORS],
+             violence_score=score_results[self.VIOLENCE],
+             violence_graphic_score=score_results[self.VIOLENCE_GRAPHIC],
+         )
+         return ModerationAPIRequestResult(
+             success=True,
+             cached=cached,
+             flagged=moderation_result["flagged"],
+             flagged_results=flagged_results,
+             scores=scores,
+         )
+
+     def will_be_flagged(self, text: str) -> bool:
+         """Returns True if the text is against OpenAI's content policy and will be flagged, False otherwise."""
+         result: ModerationAPIRequestResult = self.get_moderation_results(
+             # Use the latest model so the account does not get banned
+             ModerationAPIRequest(text=text, use_latest_model=True)
+         )
+         assert result.flagged is not None
+         return result.flagged
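
For reference, the unpacking in get_moderation_results reduces to the self-contained sketch below; the payload is fabricated purely for illustration and only mirrors the shape the client reads (two of the seven categories shown).

# Fabricated example payload; shape mirrors response["results"][0] above.
response = {
    "results": [
        {
            "flagged": True,
            "categories": {"hate": False, "violence": True},
            "category_scores": {"hate": 0.01, "violence": 0.97},
        }
    ]
}
moderation_result = response["results"][0]
print(moderation_result["flagged"])                      # True
print(moderation_result["category_scores"]["violence"])  # 0.97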
helm/clients/open_lm_client.py
@@ -0,0 +1,43 @@
+ from threading import Lock
+ from typing import Optional
+
+ from transformers import AutoConfig, AutoModelForCausalLM
+ from helm.common.cache import CacheConfig
+
+ from helm.common.optional_dependencies import OptionalDependencyNotInstalled
+ from helm.clients.huggingface_client import HuggingFaceClient
+
+
+ _register_open_lm_lock = Lock()
+ _register_open_lm_done = False
+
+
+ def _register_open_lm_for_auto_model():
+     """Register OpenLMForCausalLM for AutoModelForCausalLM."""
+     try:
+         from open_lm.utils.transformers.hf_model import OpenLMforCausalLM
+         from open_lm.utils.transformers.hf_config import OpenLMConfig
+     except ModuleNotFoundError as e:
+         # Provide manual instructions for installing open_lm from GitHub
+         # because PyPI does not allow installing dependencies directly from GitHub.
+         raise OptionalDependencyNotInstalled(
+             f"Optional dependency {e.name} is not installed. "
+             "Please run `pip install open_lm@git+https://github.com/mlfoundations/open_lm.git@main` to install it."
+         ) from e
+
+     with _register_open_lm_lock:
+         global _register_open_lm_done
+         if not _register_open_lm_done:
+             AutoConfig.register("openlm", OpenLMConfig)
+             AutoModelForCausalLM.register(OpenLMConfig, OpenLMforCausalLM)
+             _register_open_lm_done = True
+
+
+ class OpenLMClient(HuggingFaceClient):
+     """Client for OpenLM: https://github.com/mlfoundations/open_lm"""
+
+     def __init__(self, cache_config: CacheConfig, pretrained_model_name_or_path: Optional[str] = None, **kwargs):
+         _register_open_lm_for_auto_model()
+         super().__init__(
+             cache_config=cache_config, pretrained_model_name_or_path=pretrained_model_name_or_path, **kwargs
+         )
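
The lock-plus-flag idiom in _register_open_lm_for_auto_model generalizes to any once-only side effect; a minimal standalone sketch of the same pattern:

# Minimal sketch of the once-only registration pattern: a module-level
# lock and a done flag make the side effect idempotent and thread-safe.
from threading import Lock

_lock = Lock()
_done = False

def register_once(register) -> None:
    global _done
    with _lock:
        if not _done:
            register()
            _done = True

register_once(lambda: print("registered"))  # prints "registered"
register_once(lambda: print("registered"))  # no-op on later calls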
helm/clients/openai_client.py
@@ -0,0 +1,301 @@
+ # mypy: check_untyped_defs = False
+ from dataclasses import replace
+ from typing import Any, Dict, List, Optional, cast, Union
+
+ from helm.benchmark.model_metadata_registry import is_vlm
+ from helm.common.cache import CacheConfig
+ from helm.common.media_object import TEXT_TYPE
+ from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput, Token
+ from helm.common.hierarchical_logger import hlog
+ from helm.common.optional_dependencies import handle_module_not_found_error
+ from helm.common.tokenization_request import (
+     TokenizationRequest,
+     TokenizationRequestResult,
+ )
+ from helm.tokenizers.tokenizer import Tokenizer
+ from .client import CachingClient, truncate_sequence, generate_uid_for_multimodal_prompt
+
+ try:
+     import openai
+     from openai import OpenAI
+ except ModuleNotFoundError as e:
+     handle_module_not_found_error(e, ["openai"])
+
+
+ class OpenAIClient(CachingClient):
+     END_OF_TEXT: str = "<|endoftext|>"
+
+     # Error OpenAI throws when the image in the prompt violates their content policy
+     INAPPROPRIATE_IMAGE_ERROR: str = "Your input image may contain content that is not allowed by our safety system"
+
+     # Set the finish reason to this if the prompt violates OpenAI's content policy
+     CONTENT_POLICY_VIOLATED_FINISH_REASON: str = (
+         "The prompt violates OpenAI's content policy. "
+         "See https://labs.openai.com/policies/content-policy for more information."
+     )
+
+     def __init__(
+         self,
+         tokenizer: Tokenizer,
+         tokenizer_name: str,
+         cache_config: CacheConfig,
+         api_key: Optional[str] = None,
+         org_id: Optional[str] = None,
+         base_url: Optional[str] = None,
+     ):
+         super().__init__(cache_config=cache_config)
+         self.tokenizer = tokenizer
+         self.tokenizer_name = tokenizer_name
+         self.client = OpenAI(api_key=api_key, organization=org_id, base_url=base_url)
+
+     def _is_chat_model_engine(self, model_engine: str) -> bool:
+         if model_engine == "gpt-3.5-turbo-instruct":
+             return False
+         elif model_engine.startswith("gpt-3.5") or model_engine.startswith("gpt-4"):
+             return True
+         return False
+
+     def _get_model_for_request(self, request: Request) -> str:
+         return request.model_engine
+
+     def _get_cache_key(self, raw_request: Dict, request: Request):
+         cache_key = CachingClient.make_cache_key(raw_request, request)
+         if is_vlm(request.model):
+             assert request.multimodal_prompt is not None
+             prompt_key: str = generate_uid_for_multimodal_prompt(request.multimodal_prompt)
+             cache_key = {**cache_key, "multimodal_prompt": prompt_key}
+             del cache_key["messages"]
+         return cache_key
+
+     def _make_embedding_request(self, request: Request) -> RequestResult:
+         raw_request: Dict[str, Any]
+         raw_request = {
+             "input": request.prompt,
+             # Note: In older deprecated versions of the OpenAI API, "model" used to be "engine".
+             "model": self._get_model_for_request(request),
+         }
+
+         def do_it() -> Dict[str, Any]:
+             return self.client.embeddings.create(**raw_request).model_dump(mode="json")
+
+         try:
+             cache_key = self._get_cache_key(raw_request, request)
+             response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
+         except openai.OpenAIError as e:
+             error: str = f"OpenAI error: {e}"
+             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
+
+         # If the user is requesting completions instead of an embedding, then `completions`
+         # needs to be populated, and `embedding` should be an empty list and vice-versa.
+         embedding: List[float] = []
+         # If the user is requesting an embedding instead of a completion,
+         # then completions would be left as an empty list. The embedding needs to be set.
+         embedding = response["data"][0]["embedding"]
+
+         return RequestResult(
+             success=True,
+             cached=cached,
+             request_time=response["request_time"],
+             request_datetime=response.get("request_datetime"),
+             completions=[],
+             embedding=embedding,
+         )
+
+     def _make_chat_request(self, request: Request) -> RequestResult:
+         messages: Optional[List[Dict[str, Union[str, Any]]]] = request.messages
+         if request.messages is not None:
+             # Checks that all messages have a role and some content
+             for message in request.messages:
+                 if not message.get("role") or not message.get("content"):
+                     raise ValueError("All messages must have a role and content")
+             # Checks that the last role is "user"
+             if request.messages[-1]["role"] != "user":
+                 raise ValueError("Last message must have role 'user'")
+             if request.prompt != "":
+                 hlog("WARNING: Since message is set, prompt will be ignored")
+         else:
+             # Convert prompt into a single message
+             # For now, put the whole prompt in a single user message, and expect the response
+             # to be returned in a single assistant message.
+             # TODO: Support ChatML for creating multiple messages with different roles.
+             # See: https://github.com/openai/openai-python/blob/main/chatml.md
+
+             # Content can either be text or a list of multimodal content made up of text and images:
+             # https://platform.openai.com/docs/guides/vision
+             content: Union[str, List[Union[str, Any]]]
+             if request.multimodal_prompt is not None:
+                 content = []
+                 for media_object in request.multimodal_prompt.media_objects:
+                     if media_object.is_type("image") and media_object.location:
+                         from helm.common.images_utils import encode_base64
+
+                         base64_image: str = encode_base64(media_object.location)
+                         image_object: Dict[str, str] = {"url": f"data:image/jpeg;base64,{base64_image}"}
+                         content.append({"type": "image_url", "image_url": image_object})
+                     elif media_object.is_type(TEXT_TYPE):
+                         if media_object.text is None:
+                             raise ValueError("MediaObject of text type has missing text field value")
+                         content.append({"type": media_object.type, "text": media_object.text})
+                     else:
+                         raise ValueError(f"Unrecognized MediaObject type {media_object.type}")
+
+             else:
+                 content = request.prompt
+
+             messages = [{"role": "user", "content": content}]
+
+         raw_request: Dict[str, Any] = {
+             "model": self._get_model_for_request(request),
+             "messages": messages,
+             "temperature": request.temperature,
+             "top_p": request.top_p,
+             "n": request.num_completions,
+             "stop": request.stop_sequences or None,  # API doesn't like empty list
+             # Note: Chat models may require adding an extra token to max_tokens
+             # for the internal special role token.
+             "max_tokens": request.max_tokens,
+             "presence_penalty": request.presence_penalty,
+             "frequency_penalty": request.frequency_penalty,
+         }
+
+         # OpenAI's vision API doesn't allow None values for stop.
+         # Fails with "body -> stop: none is not an allowed value" if None is passed.
+         if is_vlm(request.model) and raw_request["stop"] is None:
+             raw_request.pop("stop")
+
+         def do_it() -> Dict[str, Any]:
+             return self.client.chat.completions.create(**raw_request).model_dump(mode="json")
+
+         try:
+             cache_key = self._get_cache_key(raw_request, request)
+             response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
+         except openai.OpenAIError as e:
+             if self.INAPPROPRIATE_IMAGE_ERROR in str(e):
+                 hlog(f"Failed safety check: {str(request)}")
+                 empty_completion = GeneratedOutput(
+                     text="",
+                     logprob=0,
+                     tokens=[],
+                     finish_reason={"reason": self.CONTENT_POLICY_VIOLATED_FINISH_REASON},
+                 )
+                 return RequestResult(
+                     success=True,
+                     cached=False,
+                     request_time=0,
+                     completions=[empty_completion] * request.num_completions,
+                     embedding=[],
+                 )
+
+             error: str = f"OpenAI error: {e}"
+             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
+
+         completions: List[GeneratedOutput] = []
+         for raw_completion in response["choices"]:
+             # The OpenAI chat completion API doesn't support echo.
+             # If `echo_prompt` is true, combine the prompt and completion.
+             raw_completion_content = raw_completion["message"]["content"]
+             text: str = request.prompt + raw_completion_content if request.echo_prompt else raw_completion_content
+             # The OpenAI chat completion API doesn't return us tokens or logprobs, so we tokenize ourselves.
+             tokenization_result: TokenizationRequestResult = self.tokenizer.tokenize(
+                 TokenizationRequest(text, tokenizer=self.tokenizer_name)
+             )
+             # Log probs are not currently supported by the OpenAI chat completion API, so set to 0 for now.
+             tokens: List[Token] = [
+                 Token(text=cast(str, raw_token), logprob=0) for raw_token in tokenization_result.raw_tokens
+             ]
+             completion = GeneratedOutput(
+                 text=text,
+                 logprob=0,  # OpenAI does not provide logprobs
+                 tokens=tokens,
+                 finish_reason={"reason": raw_completion["finish_reason"]},
+             )
+             completions.append(truncate_sequence(completion, request))  # Truncate the text by stop sequences
+
+         return RequestResult(
+             success=True,
+             cached=cached,
+             request_time=response["request_time"],
+             request_datetime=response.get("request_datetime"),
+             completions=completions,
+             embedding=[],
+         )
+
+     def _to_raw_completion_request(self, request: Request) -> Dict[str, Any]:
+         raw_request: Dict[str, Any] = {
+             # Note: In older deprecated versions of the OpenAI API, "model" used to be "engine".
+             "model": self._get_model_for_request(request),
+             "prompt": request.prompt,
+             "temperature": request.temperature,
+             "n": request.num_completions,
+             "max_tokens": request.max_tokens,
+             "best_of": request.top_k_per_token,
+             "logprobs": request.top_k_per_token,
+             "stop": request.stop_sequences or None,  # API doesn't like empty list
+             "top_p": request.top_p,
+             "presence_penalty": request.presence_penalty,
+             "frequency_penalty": request.frequency_penalty,
+             "echo": request.echo_prompt,
+         }
+
+         # OpenAI doesn't let you ask for more completions than the number of
+         # per-token candidates.
+         raw_request["best_of"] = max(raw_request["best_of"], raw_request["n"])
+         raw_request["logprobs"] = max(raw_request["logprobs"], raw_request["n"])
+
+         return raw_request
+
+     def _make_completion_request(self, request: Request) -> RequestResult:
+         raw_request = self._to_raw_completion_request(request)
+
+         def do_it() -> Dict[str, Any]:
+             return self.client.completions.create(**raw_request).model_dump(mode="json")
+
+         try:
+             cache_key = self._get_cache_key(raw_request, request)
+             response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
+         except openai.OpenAIError as e:
+             error: str = f"OpenAI error: {e}"
+             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
+
+         completions: List[GeneratedOutput] = []
+         for raw_completion in response["choices"]:
+             sequence_logprob = 0
+             tokens: List[Token] = []
+
+             raw_data = raw_completion["logprobs"]
+             for (
+                 text,
+                 logprob,
+             ) in zip(raw_data["tokens"], raw_data["token_logprobs"]):
+                 tokens.append(Token(text=text, logprob=logprob or 0))
+                 sequence_logprob += logprob or 0
+             completion = GeneratedOutput(
+                 text=raw_completion["text"],
+                 logprob=sequence_logprob,
+                 tokens=tokens,
+                 finish_reason={"reason": raw_completion["finish_reason"]},
+             )
+             # OpenAI sends us back tokens past the end of text token,
+             # so we need to manually truncate the list of tokens.
+             # TODO: file an issue with their support to check what the expected behavior here is.
+             completion = truncate_sequence(
+                 completion, replace(request, stop_sequences=request.stop_sequences + [OpenAIClient.END_OF_TEXT])
+             )
+             completions.append(completion)
+
+         return RequestResult(
+             success=True,
+             cached=cached,
+             request_time=response["request_time"],
+             request_datetime=response.get("request_datetime"),
+             completions=completions,
+             embedding=[],
+         )
+
+     def make_request(self, request: Request) -> RequestResult:
+         if request.embedding:
+             return self._make_embedding_request(request)
+         elif self._is_chat_model_engine(request.model_engine):
+             return self._make_chat_request(request)
+         else:
+             return self._make_completion_request(request)
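
make_request routes each request to one of three endpoints: embeddings (when request.embedding is set), chat, or legacy completions, with _is_chat_model_engine deciding between the latter two. Restated as a standalone predicate (illustrative only, using the engine names visible in the code above):

# Standalone restatement of _is_chat_model_engine: gpt-3.5-turbo-instruct
# stays on the completions endpoint; other gpt-3.5*/gpt-4* engines use chat.
def is_chat_engine(engine: str) -> bool:
    if engine == "gpt-3.5-turbo-instruct":
        return False
    return engine.startswith("gpt-3.5") or engine.startswith("gpt-4")

assert not is_chat_engine("gpt-3.5-turbo-instruct")
assert is_chat_engine("gpt-4")
assert not is_chat_engine("davinci-002")  # example non-chat engine name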
helm/{proxy/clients → clients}/palmyra_client.py
@@ -5,12 +5,12 @@ from typing import Any, Dict, List
 
 from helm.common.cache import CacheConfig
 from helm.common.hierarchical_logger import hlog
- from helm.common.request import wrap_request_time, Request, RequestResult, Sequence, Token, ErrorFlags
+ from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput, Token, ErrorFlags
 from helm.common.tokenization_request import (
     TokenizationRequest,
     TokenizationRequestResult,
 )
- from helm.proxy.tokenizers.tokenizer import Tokenizer
+ from helm.tokenizers.tokenizer import Tokenizer
 from .client import CachingClient, truncate_sequence
 
 
@@ -67,14 +67,14 @@ class PalmyraClient(CachingClient):
             # "random_seed": request.random,
         }
 
-         completions: List[Sequence] = []
+         completions: List[GeneratedOutput] = []
         model_name: str = request.model_engine
 
         # `num_completions` is not supported, so instead make `num_completions` separate requests.
         for completion_index in range(request.num_completions):
             try:
 
-                 def do_it():
+                 def do_it() -> Dict[str, Any]:
                     # Add a timeout argument to raw_request to avoid hitting the default 60s timeout,
                     # which happens for long prompts.
                     request_with_timeout = {"timeout": 300, **raw_request}
@@ -128,11 +128,9 @@ class PalmyraClient(CachingClient):
             )
 
             # Log probs are not currently supported by the Writer, so set to 0 for now.
-             tokens: List[Token] = [
-                 Token(text=str(text), logprob=0, top_logprobs={}) for text in tokenization_result.raw_tokens
-             ]
+             tokens: List[Token] = [Token(text=str(text), logprob=0) for text in tokenization_result.raw_tokens]
 
-             completion = Sequence(text=response_text, logprob=0, tokens=tokens)
+             completion = GeneratedOutput(text=response_text, logprob=0, tokens=tokens)
             sequence = truncate_sequence(completion, request, print_warning=True)
             completions.append(sequence)
 
helm/{proxy/clients → clients}/perspective_api_client.py
@@ -1,14 +1,14 @@
 # mypy: check_untyped_defs = False
 import threading
 from dataclasses import asdict
- from typing import List, Dict, Optional
+ from typing import Any, List, Dict, Optional
 
 from dacite import from_dict
 from googleapiclient import discovery
 from googleapiclient.errors import BatchError, HttpError
 from googleapiclient.http import BatchHttpRequest
 from httplib2 import HttpLib2Error
- from helm.proxy.clients.toxicity_classifier_client import ToxicityClassifierClient
+ from helm.clients.toxicity_classifier_client import ToxicityClassifierClient
 from helm.proxy.retry import NonRetriableException
 
 from helm.common.cache import Cache, CacheConfig
@@ -91,14 +91,9 @@ class PerspectiveAPIClient(ToxicityClassifierClient):
         Batch several requests into a single API request and get the toxicity attributes and scores.
         For more information, see https://googleapis.github.io/google-api-python-client/docs/batch.html.
         """
-
-         with self._client_lock:
-             if not self._client:
-                 self._client = self._create_client()
-
         try:
 
-             def do_it():
+             def do_it() -> Dict[str, Any]:
                 text_to_response: Dict[str, Dict] = dict()
 
                 def callback(request_id: str, response: Dict, error: HttpError):
@@ -106,6 +101,10 @@ class PerspectiveAPIClient(ToxicityClassifierClient):
                         raise error
                     text_to_response[request_id] = response
 
+                 with self._client_lock:
+                     if not self._client:
+                         self._client = self._create_client()
+
                 # Create a batch request. We will add a request to the batch request for each text string
                 batch_request: BatchHttpRequest = self._client.new_batch_http_request()