crfm-helm 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (482)
  1. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +134 -31
  2. crfm_helm-0.5.0.dist-info/RECORD +642 -0
  3. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +31 -3
  5. helm/benchmark/adaptation/adapters/adapter.py +2 -2
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -4
  9. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +2 -3
  10. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  11. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +2 -1
  12. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  13. helm/benchmark/adaptation/adapters/test_generation_adapter.py +32 -8
  14. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +7 -19
  15. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +60 -6
  16. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  17. helm/benchmark/adaptation/request_state.py +6 -1
  18. helm/benchmark/adaptation/scenario_state.py +6 -2
  19. helm/benchmark/annotation/annotator.py +43 -0
  20. helm/benchmark/annotation/annotator_factory.py +61 -0
  21. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  22. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  23. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  24. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  25. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  26. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  27. helm/benchmark/annotation_executor.py +124 -0
  28. helm/benchmark/augmentations/data_augmenter.py +0 -2
  29. helm/benchmark/augmentations/gender_perturbation.py +1 -1
  30. helm/benchmark/augmentations/perturbation.py +8 -2
  31. helm/benchmark/augmentations/perturbation_description.py +1 -1
  32. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  33. helm/benchmark/augmentations/test_perturbation.py +11 -7
  34. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  35. helm/benchmark/config_registry.py +7 -1
  36. helm/benchmark/executor.py +46 -16
  37. helm/benchmark/huggingface_registration.py +20 -7
  38. helm/benchmark/metrics/basic_metrics.py +169 -664
  39. helm/benchmark/metrics/bbq_metrics.py +3 -4
  40. helm/benchmark/metrics/bias_metrics.py +6 -6
  41. helm/benchmark/metrics/classification_metrics.py +11 -8
  42. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  43. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  44. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  45. helm/benchmark/metrics/common_metric_specs.py +167 -0
  46. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  47. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  48. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  49. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  50. helm/benchmark/metrics/disinformation_metrics.py +4 -110
  51. helm/benchmark/metrics/dry_run_metrics.py +2 -2
  52. helm/benchmark/metrics/efficiency_metrics.py +206 -0
  53. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  54. helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
  55. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  56. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  57. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  58. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  59. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  60. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  61. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  62. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  63. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  64. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  65. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  66. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  67. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  68. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  69. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  70. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  71. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  72. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  73. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  74. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  75. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  76. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  77. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  78. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  79. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  80. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  81. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  82. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  83. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  84. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  85. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  86. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  87. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  88. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  89. helm/benchmark/metrics/machine_translation_metrics.py +89 -0
  90. helm/benchmark/metrics/metric.py +93 -172
  91. helm/benchmark/metrics/metric_name.py +0 -1
  92. helm/benchmark/metrics/metric_service.py +16 -0
  93. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  94. helm/benchmark/metrics/ranking_metrics.py +2 -2
  95. helm/benchmark/metrics/reference_metric.py +148 -0
  96. helm/benchmark/metrics/summac/model_summac.py +0 -2
  97. helm/benchmark/metrics/summarization_metrics.py +2 -2
  98. helm/benchmark/metrics/test_classification_metrics.py +8 -5
  99. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  100. helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +5 -1
  101. helm/benchmark/metrics/test_metric.py +2 -2
  102. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +10 -2
  103. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  104. helm/benchmark/metrics/toxicity_utils.py +23 -0
  105. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  106. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  107. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  108. helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
  109. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  110. helm/benchmark/model_deployment_registry.py +74 -0
  111. helm/benchmark/model_metadata_registry.py +36 -0
  112. helm/benchmark/multi_gpu_runner.py +133 -0
  113. helm/benchmark/presentation/create_plots.py +8 -7
  114. helm/benchmark/presentation/run_display.py +26 -10
  115. helm/benchmark/presentation/schema.py +15 -40
  116. helm/benchmark/presentation/summarize.py +119 -79
  117. helm/benchmark/presentation/table.py +8 -8
  118. helm/benchmark/presentation/test_contamination.py +2 -2
  119. helm/benchmark/presentation/test_run_entry.py +1 -2
  120. helm/benchmark/presentation/test_summarize.py +3 -3
  121. helm/benchmark/run.py +54 -26
  122. helm/benchmark/run_expander.py +214 -16
  123. helm/benchmark/run_spec.py +93 -0
  124. helm/benchmark/run_spec_factory.py +162 -0
  125. helm/benchmark/run_specs/__init__.py +0 -0
  126. helm/benchmark/run_specs/classic_run_specs.py +1510 -0
  127. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  128. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  129. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  130. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  131. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  132. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  133. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  134. helm/benchmark/run_specs/vlm_run_specs.py +501 -0
  135. helm/benchmark/runner.py +51 -57
  136. helm/benchmark/runner_config_registry.py +21 -0
  137. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  138. helm/benchmark/scenarios/bold_scenario.py +2 -2
  139. helm/benchmark/scenarios/code_scenario.py +1 -0
  140. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  141. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  142. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  143. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  144. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  145. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  146. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  147. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  148. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  149. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  150. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  151. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  152. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  153. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  154. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  155. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  156. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  157. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  158. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  159. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  160. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  161. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  162. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  163. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  164. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  165. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  166. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  167. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  168. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  169. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  170. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  171. helm/benchmark/scenarios/math_scenario.py +19 -2
  172. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  173. helm/benchmark/scenarios/numeracy_scenario.py +1 -1
  174. helm/benchmark/scenarios/opinions_qa_scenario.py +0 -4
  175. helm/benchmark/scenarios/scenario.py +4 -0
  176. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  177. helm/benchmark/scenarios/test_math_scenario.py +6 -0
  178. helm/benchmark/scenarios/test_scenario.py +6 -3
  179. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  180. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  181. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  182. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  183. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  184. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  185. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
  186. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  187. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  188. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  189. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  190. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  191. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  192. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  193. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  194. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  195. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  196. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  197. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  198. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  199. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  200. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  201. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  202. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  203. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  204. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  205. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -2
  206. helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
  207. helm/benchmark/scenarios/wmt_14_scenario.py +1 -1
  208. helm/benchmark/server.py +24 -1
  209. helm/benchmark/slurm_runner.py +70 -49
  210. helm/benchmark/static/benchmarking.js +1 -1
  211. helm/benchmark/static/schema_classic.yaml +258 -1066
  212. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  213. helm/benchmark/static/schema_lite.yaml +2 -227
  214. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  215. helm/benchmark/static/schema_unitxt.yaml +428 -0
  216. helm/benchmark/static/schema_vlm.yaml +576 -0
  217. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  218. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  219. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  220. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  221. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  222. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  223. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  224. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  225. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  226. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  227. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  228. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  229. helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
  230. helm/benchmark/static_build/assets/index-d839df55.js +9 -0
  231. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  232. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  233. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  234. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  235. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  236. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  237. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  238. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  239. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  240. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  241. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  242. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  243. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  244. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  245. helm/benchmark/static_build/config.js +4 -0
  246. helm/benchmark/static_build/index.html +20 -0
  247. helm/benchmark/test_data_preprocessor.py +3 -3
  248. helm/benchmark/test_model_deployment_definition.py +14 -16
  249. helm/benchmark/test_run_expander.py +1 -1
  250. helm/benchmark/window_services/ai21_window_service.py +22 -33
  251. helm/benchmark/window_services/cohere_window_service.py +1 -63
  252. helm/benchmark/window_services/default_window_service.py +2 -44
  253. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  254. helm/benchmark/window_services/ice_window_service.py +0 -34
  255. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  256. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  257. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  258. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  259. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  260. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  261. helm/benchmark/window_services/local_window_service.py +21 -4
  262. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  263. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  264. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  265. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  266. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  267. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  268. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  269. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  270. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  271. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  272. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  273. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  274. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  275. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  276. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  277. helm/benchmark/window_services/test_utils.py +3 -2
  278. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  279. helm/benchmark/window_services/window_service.py +42 -0
  280. helm/benchmark/window_services/window_service_factory.py +4 -1
  281. helm/benchmark/window_services/yalm_window_service.py +0 -27
  282. helm/clients/__init__.py +0 -0
  283. helm/{proxy/clients → clients}/ai21_client.py +3 -9
  284. helm/clients/aleph_alpha_client.py +112 -0
  285. helm/{proxy/clients → clients}/anthropic_client.py +203 -18
  286. helm/{proxy/clients → clients}/auto_client.py +59 -31
  287. helm/clients/bedrock_client.py +128 -0
  288. helm/clients/bedrock_utils.py +72 -0
  289. helm/{proxy/clients → clients}/client.py +65 -7
  290. helm/clients/clip_score_client.py +49 -0
  291. helm/clients/clip_scorers/__init__.py +0 -0
  292. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  293. helm/clients/clip_scorers/clip_scorer.py +50 -0
  294. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  295. helm/{proxy/clients → clients}/cohere_client.py +4 -11
  296. helm/clients/gcs_client.py +82 -0
  297. helm/{proxy/clients → clients}/google_client.py +5 -5
  298. helm/clients/google_translate_client.py +35 -0
  299. helm/{proxy/clients → clients}/http_model_client.py +5 -7
  300. helm/{proxy/clients → clients}/huggingface_client.py +43 -64
  301. helm/clients/image_generation/__init__.py +0 -0
  302. helm/clients/image_generation/adobe_vision_client.py +78 -0
  303. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  304. helm/clients/image_generation/cogview2/__init__.py +0 -0
  305. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  306. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  307. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  308. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  309. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  310. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  311. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  312. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  313. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  314. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  315. helm/clients/image_generation/cogview2_client.py +191 -0
  316. helm/clients/image_generation/dalle2_client.py +192 -0
  317. helm/clients/image_generation/dalle3_client.py +108 -0
  318. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  319. helm/clients/image_generation/dalle_mini/data.py +442 -0
  320. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  321. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  322. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  323. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  324. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  325. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  326. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  327. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  328. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  329. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  330. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  331. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  332. helm/clients/image_generation/dalle_mini_client.py +190 -0
  333. helm/clients/image_generation/deep_floyd_client.py +78 -0
  334. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  335. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  336. helm/clients/image_generation/lexica_client.py +86 -0
  337. helm/clients/image_generation/mindalle/__init__.py +0 -0
  338. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  339. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  340. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  341. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  342. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  343. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  344. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  345. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  346. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  347. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  348. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  349. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  350. helm/clients/image_generation/mindalle_client.py +115 -0
  351. helm/clients/image_generation/nudity_check_client.py +64 -0
  352. helm/clients/image_generation/together_image_generation_client.py +111 -0
  353. helm/{proxy/clients → clients}/lit_gpt_client.py +4 -4
  354. helm/{proxy/clients → clients}/megatron_client.py +5 -5
  355. helm/clients/mistral_client.py +134 -0
  356. helm/clients/moderation_api_client.py +109 -0
  357. helm/clients/open_lm_client.py +43 -0
  358. helm/clients/openai_client.py +302 -0
  359. helm/{proxy/clients → clients}/palmyra_client.py +6 -8
  360. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  361. helm/clients/simple_client.py +64 -0
  362. helm/{proxy/clients → clients}/test_auto_client.py +13 -15
  363. helm/clients/test_client.py +100 -0
  364. helm/{proxy/clients → clients}/test_huggingface_client.py +15 -16
  365. helm/clients/test_simple_client.py +19 -0
  366. helm/{proxy/clients → clients}/test_together_client.py +20 -8
  367. helm/{proxy/clients → clients}/together_client.py +12 -72
  368. helm/clients/vertexai_client.py +391 -0
  369. helm/clients/vision_language/__init__.py +0 -0
  370. helm/clients/vision_language/huggingface_vlm_client.py +104 -0
  371. helm/{proxy/clients → clients}/vision_language/idefics_client.py +53 -48
  372. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  373. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  374. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  375. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  376. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  377. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  378. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  379. helm/clients/vision_language/open_flamingo_client.py +155 -0
  380. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  381. helm/clients/vllm_client.py +46 -0
  382. helm/common/cache.py +16 -4
  383. helm/common/cache_backend_config.py +47 -0
  384. helm/common/clip_score_request.py +41 -0
  385. helm/common/file_caches/__init__.py +0 -0
  386. helm/common/file_caches/file_cache.py +16 -0
  387. helm/common/file_caches/local_file_cache.py +61 -0
  388. helm/common/file_caches/test_local_file_cache.py +25 -0
  389. helm/common/file_upload_request.py +27 -0
  390. helm/common/general.py +1 -1
  391. helm/common/image_generation_parameters.py +25 -0
  392. helm/common/images_utils.py +24 -1
  393. helm/common/key_value_store.py +35 -4
  394. helm/common/media_object.py +13 -0
  395. helm/common/moderations_api_request.py +71 -0
  396. helm/common/mongo_key_value_store.py +3 -3
  397. helm/common/multimodal_request_utils.py +31 -0
  398. helm/common/nudity_check_request.py +29 -0
  399. helm/common/request.py +15 -17
  400. helm/common/test_general.py +6 -0
  401. helm/common/tokenization_request.py +1 -1
  402. helm/config/model_deployments.yaml +1069 -546
  403. helm/config/model_metadata.yaml +753 -31
  404. helm/config/tokenizer_configs.yaml +142 -43
  405. helm/proxy/accounts.py +31 -4
  406. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  407. helm/proxy/critique/model_critique_client.py +8 -6
  408. helm/proxy/example_queries.py +29 -17
  409. helm/proxy/server.py +70 -5
  410. helm/proxy/services/remote_service.py +31 -0
  411. helm/proxy/services/server_service.py +96 -16
  412. helm/proxy/services/service.py +30 -0
  413. helm/proxy/services/test_remote_service.py +4 -3
  414. helm/proxy/services/test_service.py +0 -12
  415. helm/proxy/test_accounts.py +32 -0
  416. helm/proxy/token_counters/auto_token_counter.py +37 -37
  417. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  418. helm/proxy/token_counters/token_counter.py +3 -5
  419. helm/tokenizers/__init__.py +0 -0
  420. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  421. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +1 -1
  422. helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +6 -9
  423. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  424. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  425. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +7 -26
  426. helm/tokenizers/simple_tokenizer.py +33 -0
  427. helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +1 -1
  428. helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +3 -0
  429. helm/tokenizers/test_simple_tokenizer.py +33 -0
  430. helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
  431. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  432. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  433. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  434. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  435. crfm_helm-0.4.0.dist-info/RECORD +0 -397
  436. helm/benchmark/run_specs.py +0 -2762
  437. helm/benchmark/test_model_properties.py +0 -1570
  438. helm/benchmark/vlm_run_specs.py +0 -97
  439. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  440. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  441. helm/benchmark/window_services/huggingface_window_service.py +0 -60
  442. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  443. helm/benchmark/window_services/t511b_window_service.py +0 -30
  444. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  445. helm/benchmark/window_services/ul2_window_service.py +0 -30
  446. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  447. helm/common/cache_utils.py +0 -14
  448. helm/proxy/clients/aleph_alpha_client.py +0 -95
  449. helm/proxy/clients/goose_ai_client.py +0 -99
  450. helm/proxy/clients/microsoft_client.py +0 -180
  451. helm/proxy/clients/openai_client.py +0 -206
  452. helm/proxy/clients/simple_client.py +0 -60
  453. helm/proxy/clients/test_client.py +0 -49
  454. helm/proxy/clients/vertexai_client.py +0 -115
  455. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  456. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  457. helm/proxy/token_counters/free_token_counter.py +0 -12
  458. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  459. helm/proxy/token_counters/openai_token_counter.py +0 -22
  460. helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
  461. helm/proxy/token_counters/test_openai_token_counter.py +0 -81
  462. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  463. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
  464. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
  465. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
  466. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  467. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  468. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  469. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  470. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  471. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  472. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  473. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  474. /helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +0 -0
  475. /helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +0 -0
  476. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  477. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  478. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  479. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  480. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  481. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  482. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
@@ -0,0 +1,64 @@
1
+ import itertools
2
+ from typing import List, TypedDict
3
+ from typing import Dict, Any
4
+
5
+ from helm.common.cache import CacheConfig
6
+ from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput, Token
7
+ from helm.clients.client import CachingClient
8
+
9
+
10
+ class SimpleClientRequest(TypedDict):
11
+ engine: str
12
+ prompt: str
13
+ num_completions: int
14
+
15
+
16
+ class SimpleClient(CachingClient):
17
+ """Simple client for tutorials and for debugging."""
18
+
19
+ def __init__(self, cache_config: CacheConfig):
20
+ super().__init__(cache_config=cache_config)
21
+
22
+ def make_request(self, request: Request) -> RequestResult:
23
+ raw_request: SimpleClientRequest = {
24
+ "engine": request.model_engine,
25
+ "prompt": request.prompt,
26
+ "num_completions": request.num_completions,
27
+ }
28
+
29
+ def do_it() -> Dict[str, Any]:
30
+ return self.invoke_model(raw_request)
31
+
32
+ cache_key = CachingClient.make_cache_key(raw_request, request)
33
+ response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
34
+ logprob = 0
35
+ completions = [
36
+ GeneratedOutput(
37
+ text=text,
38
+ logprob=logprob,
39
+ tokens=[Token(text=text, logprob=logprob)],
40
+ )
41
+ for text in response["completions"]
42
+ ]
43
+
44
+ return RequestResult(
45
+ success=True,
46
+ cached=cached,
47
+ request_time=response["request_time"],
48
+ request_datetime=response.get("request_datetime"),
49
+ completions=completions,
50
+ embedding=[],
51
+ )
52
+
53
+ def invoke_model(self, raw_request: SimpleClientRequest) -> Dict[str, Any]:
54
+ """
55
+ Example:
56
+ Prompt: 7 2 4 6
57
+ Completions (num_completions = 3):
58
+ - 6
59
+ - 4
60
+ - 2
61
+ """
62
+ prompt_words: List[str] = raw_request["prompt"].split()
63
+ completions = list(itertools.islice(itertools.cycle(reversed(prompt_words)), raw_request["num_completions"]))
64
+ return {"completions": completions}
@@ -1,12 +1,13 @@
1
1
  import dataclasses
2
2
  from tempfile import TemporaryDirectory
3
- from helm.common.request import Sequence, Token
3
+ from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
4
+ from helm.common.request import GeneratedOutput, Token
4
5
 
5
6
  import pytest
6
7
 
7
8
  from helm.common.request import Request, RequestResult
8
9
  from helm.common.general import get_credentials
9
- from helm.proxy.clients.auto_client import AutoClient
10
+ from helm.clients.auto_client import AutoClient
10
11
 
11
12
 
12
13
  @pytest.mark.models
@@ -15,8 +16,8 @@ class TestAutoClient:
15
16
  credentials = get_credentials()
16
17
  if not credentials:
17
18
  pytest.skip("Skipping test because no credentials found")
18
- with TemporaryDirectory() as cache_path:
19
- auto_client = AutoClient(credentials, cache_path)
19
+ with TemporaryDirectory() as temp_dir_path:
20
+ auto_client = AutoClient(credentials, temp_dir_path, BlackHoleCacheBackendConfig())
20
21
  actual_result = auto_client.make_request(request)
21
22
  assert actual_result.request_time or actual_result.batch_request_time
22
23
  actual_result = dataclasses.replace(
@@ -36,32 +37,29 @@ class TestAutoClient:
36
37
  success=True,
37
38
  embedding=[],
38
39
  completions=[
39
- Sequence(
40
+ GeneratedOutput(
40
41
  text=" intelligent species on the planet. They are also one",
41
42
  logprob=-9.087313510477543,
42
43
  tokens=[
43
44
  Token(
44
45
  text="Ġintelligent",
45
46
  logprob=-1.9816237688064575,
46
- top_logprobs={"Ġintelligent": -1.9816237688064575},
47
47
  ),
48
48
  Token(
49
49
  text="Ġspecies",
50
50
  logprob=-1.2881066799163818,
51
- top_logprobs={"Ġspecies": -1.2881066799163818},
52
51
  ),
53
- Token(text="Ġon", logprob=-0.16092979907989502, top_logprobs={"Ġon": -0.16092979907989502}),
54
- Token(text="Ġthe", logprob=-0.23620447516441345, top_logprobs={"Ġthe": -0.23620447516441345}),
52
+ Token(text="Ġon", logprob=-0.16092979907989502),
53
+ Token(text="Ġthe", logprob=-0.23620447516441345),
55
54
  Token(
56
55
  text="Ġplanet",
57
56
  logprob=-0.015416033565998077,
58
- top_logprobs={"Ġplanet": -0.015416033565998077},
59
57
  ),
60
- Token(text=".", logprob=-0.6683081388473511, top_logprobs={".": -0.6683081388473511}),
61
- Token(text="ĠThey", logprob=-1.9231040477752686, top_logprobs={"ĠThey": -1.9231040477752686}),
62
- Token(text="Ġare", logprob=-0.9322243332862854, top_logprobs={"Ġare": -0.9322243332862854}),
63
- Token(text="Ġalso", logprob=-0.7750787138938904, top_logprobs={"Ġalso": -0.7750787138938904}),
64
- Token(text="Ġone", logprob=-1.1063175201416016, top_logprobs={"Ġone": -1.1063175201416016}),
58
+ Token(text=".", logprob=-0.6683081388473511),
59
+ Token(text="ĠThey", logprob=-1.9231040477752686),
60
+ Token(text="Ġare", logprob=-0.9322243332862854),
61
+ Token(text="Ġalso", logprob=-0.7750787138938904),
62
+ Token(text="Ġone", logprob=-1.1063175201416016),
65
63
  ],
66
64
  finish_reason={"reason": "length"},
67
65
  )
@@ -0,0 +1,100 @@
1
+ from helm.common.cache import BlackHoleCacheConfig
2
+ from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer
3
+ from .client import truncate_sequence, truncate_and_tokenize_response_text
4
+ from typing import List
5
+ from helm.common.request import Request, GeneratedOutput, Token
6
+
7
+
8
+ def truncate_sequence_helper(tokens: List[str], request: Request, expected_tokens: List[str]):
9
+ sequence = GeneratedOutput(
10
+ text="".join(tokens),
11
+ tokens=[Token(text=text, logprob=-1) for text in tokens],
12
+ logprob=-len(tokens),
13
+ )
14
+
15
+ output_sequence = truncate_sequence(sequence, request)
16
+
17
+ assert expected_tokens == [token.text for token in output_sequence.tokens]
18
+ assert "".join(expected_tokens) == output_sequence.text
19
+ assert output_sequence.logprob == sum(token.logprob for token in output_sequence.tokens)
20
+
21
+
22
+ def test_truncate_sequence():
23
+ # echo_prompt = True, nothing gets truncated
24
+ truncate_sequence_helper(
25
+ ["a", "b", "c"],
26
+ Request(
27
+ model="openai/text-davinci-002", model_deployment="openai/text-davinci-002", prompt="abc", echo_prompt=True
28
+ ),
29
+ ["a", "b", "c"],
30
+ )
31
+
32
+ # Nothing gets truncated
33
+ truncate_sequence_helper(
34
+ ["hello", " world"],
35
+ Request(model="openai/text-davinci-002", model_deployment="openai/text-davinci-002", stop_sequences=["#"]),
36
+ ["hello", " world"],
37
+ )
38
+
39
+ # Truncate using stop sequences
40
+ truncate_sequence_helper(
41
+ ["hello", " world", "\n", "what"],
42
+ Request(model="openai/text-davinci-002", model_deployment="openai/text-davinci-002", stop_sequences=["\n"]),
43
+ ["hello", " world"],
44
+ )
45
+
46
+ # Truncate using max tokens
47
+ truncate_sequence_helper(
48
+ ["a", "b", "c"],
49
+ Request(model="openai/text-davinci-002", model_deployment="openai/text-davinci-002", max_tokens=2),
50
+ ["a", "b"],
51
+ )
52
+
53
+
54
+ def test_truncate_and_tokenize_response_text():
55
+ tokenizer = HuggingFaceTokenizer(BlackHoleCacheConfig())
56
+ tokenizer_name = "huggingface/gpt2"
57
+
58
+ # No truncation
59
+ response = truncate_and_tokenize_response_text(
60
+ "I am a scientist. I am a scientist.", Request(max_tokens=100, stop_sequences=[]), tokenizer, tokenizer_name
61
+ )
62
+ assert response.finish_reason
63
+ assert response.finish_reason["reason"] == "endoftext"
64
+ assert response.text == "I am a scientist. I am a scientist."
65
+ assert response.tokens == [
66
+ Token("I", 0.0),
67
+ Token(" am", 0.0),
68
+ Token(" a", 0.0),
69
+ Token(" scientist", 0.0),
70
+ Token(".", 0.0),
71
+ Token(" I", 0.0),
72
+ Token(" am", 0.0),
73
+ Token(" a", 0.0),
74
+ Token(" scientist", 0.0),
75
+ Token(".", 0.0),
76
+ ]
77
+
78
+ response = truncate_and_tokenize_response_text(
79
+ "I am a scientist. I am a scientist.", Request(max_tokens=7, stop_sequences=["."]), tokenizer, tokenizer_name
80
+ )
81
+ assert response.finish_reason
82
+ assert response.finish_reason["reason"] == "stop"
83
+ assert response.text == "I am a scientist"
84
+ assert response.tokens == [Token("I", 0.0), Token(" am", 0.0), Token(" a", 0.0), Token(" scientist", 0.0)]
85
+
86
+ response = truncate_and_tokenize_response_text(
87
+ "I am a scientist. I am a scientist.", Request(max_tokens=3, stop_sequences=[]), tokenizer, tokenizer_name
88
+ )
89
+ assert response.finish_reason
90
+ assert response.finish_reason["reason"] == "length"
91
+ assert response.text == "I am a"
92
+ assert response.tokens == [Token("I", 0.0), Token(" am", 0.0), Token(" a", 0.0)]
93
+
94
+ response = truncate_and_tokenize_response_text(
95
+ "I am a scientist. I am a scientist.", Request(max_tokens=3, stop_sequences=["."]), tokenizer, tokenizer_name
96
+ )
97
+ assert response.finish_reason
98
+ assert response.finish_reason["reason"] == "length"
99
+ assert response.text == "I am a"
100
+ assert response.tokens == [Token("I", 0.0), Token(" am", 0.0), Token(" a", 0.0)]
@@ -1,31 +1,24 @@
1
- import os
2
1
  import pytest
3
- import tempfile
4
2
 
5
- from helm.common.cache import SqliteCacheConfig
3
+ from helm.common.cache import BlackHoleCacheConfig
6
4
  from helm.common.request import Request, RequestResult
7
- from .huggingface_client import HuggingFaceClient
5
+ from helm.clients.huggingface_client import HuggingFaceClient
8
6
 
9
7
 
10
8
  class TestHuggingFaceClient:
11
- def setup_method(self, method):
12
- cache_file = tempfile.NamedTemporaryFile(delete=False)
13
- self.cache_path: str = cache_file.name
14
- self.client = HuggingFaceClient(cache_config=SqliteCacheConfig(self.cache_path))
15
-
16
- def teardown_method(self, method):
17
- os.remove(self.cache_path)
18
-
19
9
  def test_gpt2(self):
10
+ client = HuggingFaceClient(
11
+ cache_config=BlackHoleCacheConfig(), pretrained_model_name_or_path="openai-community/gpt2"
12
+ )
20
13
  prompt: str = "I am a computer scientist."
21
- result: RequestResult = self.client.make_request(
14
+ result: RequestResult = client.make_request(
22
15
  Request(
23
16
  model="openai/gpt2",
24
17
  model_deployment="huggingface/gpt2",
25
18
  prompt=prompt,
26
19
  num_completions=3,
27
20
  top_k_per_token=5,
28
- max_tokens=0,
21
+ max_tokens=1,
29
22
  echo_prompt=True,
30
23
  )
31
24
  )
@@ -36,7 +29,10 @@ class TestHuggingFaceClient:
36
29
 
37
30
  @pytest.mark.skip(reason="GPT-J 6B is 22 GB and extremely slow without a GPU.")
38
31
  def test_gptj_6b(self):
39
- result: RequestResult = self.client.make_request(
32
+ client = HuggingFaceClient(
33
+ cache_config=BlackHoleCacheConfig(), pretrained_model_name_or_path="openai-community/gpt2"
34
+ )
35
+ result: RequestResult = client.make_request(
40
36
  Request(
41
37
  model="eleutherai/gpt-j-6b",
42
38
  model_deployment="huggingface/gpt-j-6b",
@@ -49,8 +45,11 @@ class TestHuggingFaceClient:
49
45
  assert len(result.completions) == 3
50
46
 
51
47
  def test_logprob(self):
48
+ client = HuggingFaceClient(
49
+ cache_config=BlackHoleCacheConfig(), pretrained_model_name_or_path="openai-community/gpt2"
50
+ )
52
51
  prompt: str = "I am a computer scientist."
53
- result: RequestResult = self.client.make_request(
52
+ result: RequestResult = client.make_request(
54
53
  Request(
55
54
  model="openai/gpt2",
56
55
  model_deployment="huggingface/gpt2",
@@ -0,0 +1,19 @@
1
+ from helm.clients.simple_client import SimpleClient
2
+ from helm.common.cache import BlackHoleCacheConfig
3
+ from helm.common.request import GeneratedOutput, Request, Token
4
+
5
+
6
+ def test_simple_client_make_request():
7
+ client = SimpleClient(BlackHoleCacheConfig())
8
+ request = Request(
9
+ model="simple/model1",
10
+ model_deployment="simple/model1",
11
+ prompt="Elephants are one of the most",
12
+ temperature=0.0,
13
+ max_tokens=10,
14
+ )
15
+ result = client.make_request(request)
16
+ assert result.success
17
+ assert not result.cached
18
+ assert result.embedding == []
19
+ assert result.completions == [GeneratedOutput(text="most", logprob=0, tokens=[Token(text="most", logprob=0)])]
@@ -12,15 +12,15 @@ class TestTogetherClient:
12
12
  def setup_method(self, method):
13
13
  cache_file = tempfile.NamedTemporaryFile(delete=False)
14
14
  self.cache_path: str = cache_file.name
15
- self.client = TogetherClient(cache_config=SqliteCacheConfig(self.cache_path))
16
15
 
17
16
  def teardown_method(self, method):
18
17
  os.remove(self.cache_path)
19
18
 
20
19
  @pytest.mark.parametrize(
21
- "test_input,expected",
20
+ "together_model,test_input,expected",
22
21
  [
23
22
  (
23
+ "togethercomputer/RedPajama-INCITE-Base-3B-v1",
24
24
  Request(
25
25
  model="together/redpajama-incite-base-3b-v1",
26
26
  model_deployment="together/redpajama-incite-base-3b-v1",
@@ -28,7 +28,6 @@ class TestTogetherClient:
28
28
  {
29
29
  "best_of": 1,
30
30
  "echo": False,
31
- "logprobs": 1,
32
31
  "max_tokens": 100,
33
32
  "model": "togethercomputer/RedPajama-INCITE-Base-3B-v1",
34
33
  "n": 1,
@@ -40,6 +39,7 @@ class TestTogetherClient:
40
39
  },
41
40
  ),
42
41
  (
42
+ "huggyllama/llama-7b",
43
43
  Request(
44
44
  model="meta/llama-7b",
45
45
  model_deployment="together/llama-7b",
@@ -55,7 +55,6 @@ class TestTogetherClient:
55
55
  {
56
56
  "best_of": 3,
57
57
  "echo": True,
58
- "logprobs": 3,
59
58
  "max_tokens": 24,
60
59
  "model": "huggyllama/llama-7b",
61
60
  "n": 4,
@@ -67,6 +66,7 @@ class TestTogetherClient:
67
66
  },
68
67
  ),
69
68
  (
69
+ "togethercomputer/alpaca-7b",
70
70
  Request(
71
71
  model="stanford/alpaca-7b",
72
72
  model_deployment="together/alpaca-7b",
@@ -75,7 +75,6 @@ class TestTogetherClient:
75
75
  {
76
76
  "best_of": 1,
77
77
  "echo": False,
78
- "logprobs": 1,
79
78
  "max_tokens": 100,
80
79
  "model": "togethercomputer/alpaca-7b",
81
80
  "n": 1,
@@ -89,9 +88,22 @@ class TestTogetherClient:
89
88
  # TODO(#1828): Add test for `SET_DETAILS_TO_TRUE` after Together supports it.
90
89
  ],
91
90
  )
92
- def test_convert_to_raw_request(self, test_input, expected):
93
- assert expected == TogetherClient.convert_to_raw_request(test_input)
91
+ def test_convert_to_raw_request(self, together_model, test_input, expected):
92
+ client = TogetherClient(
93
+ cache_config=SqliteCacheConfig(self.cache_path),
94
+ together_model=together_model,
95
+ )
96
+ assert expected == client.convert_to_raw_request(test_input)
94
97
 
95
98
  def test_api_key_error(self):
99
+ client = TogetherClient(
100
+ cache_config=SqliteCacheConfig(self.cache_path),
101
+ together_model="togethercomputer/RedPajama-INCITE-Base-3B-v1",
102
+ )
96
103
  with pytest.raises(TogetherClientError):
97
- self.client.make_request(Request(model="bigscience/bloom", model_deployment="together/bloom"))
104
+ client.make_request(
105
+ Request(
106
+ model="together/redpajama-incite-base-3b-v1",
107
+ model_deployment="together/redpajama-incite-base-3b-v1",
108
+ )
109
+ )
@@ -5,67 +5,10 @@ import requests
5
5
  from retrying import retry
6
6
 
7
7
  from helm.common.cache import CacheConfig
8
- from helm.common.request import wrap_request_time, Request, RequestResult, Sequence, Token
8
+ from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput, Token
9
9
  from .client import CachingClient, truncate_sequence, cleanup_str
10
10
 
11
11
 
12
- MODEL_ALIASES: Dict[str, str] = {
13
- # Legacy models
14
- "flan-t5-xxl": "flan-t5-xxl-hf",
15
- "h3-2.7b": "h3-2.7b-h3",
16
- "opt-1.3b": "opt-1.3b-ft-tp1",
17
- "opt-6.7b": "opt-6.7b-ft-tp1",
18
- "mpt-7b": "togethercomputer/mpt-7b",
19
- "mpt-instruct-7b": "togethercomputer/mpt-7b-instruct",
20
- "stablelm-base-alpha-3b": "stabilityai/stablelm-base-alpha-3b",
21
- "stablelm-base-alpha-7b": "stabilityai/stablelm-base-alpha-7b",
22
- # Production models
23
- "redpajama-incite-base-3b-v1": "togethercomputer/RedPajama-INCITE-Base-3B-v1",
24
- "redpajama-incite-instruct-3b-v1": "togethercomputer/RedPajama-INCITE-Instruct-3B-v1",
25
- "redpajama-incite-base-7b": "togethercomputer/RedPajama-INCITE-7B-Base",
26
- "redpajama-incite-instruct-7b": "togethercomputer/RedPajama-INCITE-7B-Instruct",
27
- "alpaca-7b": "togethercomputer/alpaca-7b",
28
- "dolly-v2-3b": "databricks/dolly-v2-3b",
29
- "dolly-v2-7b": "databricks/dolly-v2-7b",
30
- "dolly-v2-12b": "databricks/dolly-v2-12b",
31
- "falcon-7b": "togethercomputer/falcon-7b",
32
- "falcon-7b-instruct": "togethercomputer/falcon-7b-instruct",
33
- "falcon-40b": "togethercomputer/falcon-40b",
34
- "falcon-40b-instruct": "togethercomputer/falcon-40b-instruct",
35
- "gpt-jt-6b-v1": "togethercomputer/GPT-JT-6B-v1",
36
- "gpt-neoxt-chat-base-20b": "togethercomputer/GPT-NeoXT-Chat-Base-20B",
37
- "llama-7b": "huggyllama/llama-7b",
38
- "llama-13b": "huggyllama/llama-13b",
39
- "llama-30b": "huggyllama/llama-30b",
40
- "llama-65b": "huggyllama/llama-65b",
41
- "llama-2-7b": "togethercomputer/llama-2-7b",
42
- "llama-2-13b": "togethercomputer/llama-2-13b",
43
- "llama-2-70b": "togethercomputer/llama-2-70b",
44
- "mistral-7b-v0.1": "mistralai/Mistral-7B-v0.1",
45
- "mixtral-8x7b-32kseqlen": "mistralai/mixtral-8x7b-32kseqlen",
46
- "mpt-30b": "togethercomputer/mpt-30b",
47
- "mpt-instruct-30b": "togethercomputer/mpt-30b-instruct",
48
- "pythia-1b-v0": "EleutherAI/pythia-1b-v0",
49
- "pythia-2.8b-v0": "EleutherAI/pythia-2.8b-v0",
50
- "pythia-6.9b": "EleutherAI/pythia-6.9b",
51
- "pythia-12b-v0": "EleutherAI/pythia-12b-v0",
52
- "vicuna-7b-v1.3": "lmsys/vicuna-7b-v1.3",
53
- "vicuna-13b-v1.3": "lmsys/vicuna-13b-v1.3",
54
- "yi-6b": "zero-one-ai/Yi-6B",
55
- "yi-34b": "zero-one-ai/Yi-34B",
56
- }
57
- """Together model name aliases.
58
-
59
- HELM users use a shorter model name (e.g. together/flan-t5-xxl)
60
- whereas the Together client sends and caches requests using
61
- a longer model name that is suffixed with the implementation framework
62
- (e.g. flan-t5-xxl-hf). This allows tracking exactly which
63
- implementation was used in the cached results, since some results may
64
- be different depending on the implementation (e.g. efficiency metrics).
65
- This also allows future migration of results in the case of changes of
66
- available implementations on Together."""
67
-
68
-
69
12
  class _RewriteRequestTags:
70
13
  """Tags that indicate that the request for the model must be rewritten before sending to Together."""
71
14
 
@@ -154,36 +97,35 @@ class TogetherClient(CachingClient):
154
97
  INFERENCE_ENDPOINT: str = "https://api.together.xyz/api/inference"
155
98
  RETRIEVE_JOB_MAX_WAIT_SECONDS: int = 60
156
99
 
157
- @staticmethod
158
- def convert_to_raw_request(request: Request) -> Dict:
100
+ def convert_to_raw_request(self, request: Request) -> Dict:
159
101
  # Following the examples from https://github.com/togethercomputer/open-models-api
160
102
  raw_request = {
161
103
  "request_type": "language-model-inference",
162
- "model": MODEL_ALIASES.get(request.model_engine, request.model_engine),
104
+ "model": self.together_model or request.model,
163
105
  "prompt": request.prompt,
164
106
  "temperature": request.temperature,
165
107
  "n": request.num_completions,
166
108
  "max_tokens": request.max_tokens,
167
109
  "best_of": request.top_k_per_token,
168
- "logprobs": request.top_k_per_token,
169
110
  "stop": request.stop_sequences or None,
170
111
  "echo": request.echo_prompt,
171
112
  "top_p": request.top_p,
172
113
  }
173
114
  return _rewrite_raw_request_for_model_tags(raw_request, request.model_engine)
174
115
 
175
- def __init__(self, cache_config: CacheConfig, api_key: Optional[str] = None):
116
+ def __init__(self, cache_config: CacheConfig, together_model: Optional[str] = None, api_key: Optional[str] = None):
176
117
  super().__init__(cache_config=cache_config)
177
118
  # TODO: the endpoint currently doesn't require an API key. When an API key is not specified
178
119
  # in credentials.conf, we rely on offline evaluation only.
179
120
  self.api_key: Optional[str] = api_key
121
+ self.together_model = together_model
180
122
 
181
123
  def _get_job_url(self, job_id: str) -> str:
182
124
  return f"https://api.together.xyz/jobs/job/{job_id}"
183
125
 
184
126
  def make_request(self, request: Request) -> RequestResult:
185
- raw_request = TogetherClient.convert_to_raw_request(request)
186
- cache_key: Dict = CachingClient.make_cache_key(raw_request, request)
127
+ raw_request = self.convert_to_raw_request(request)
128
+ cache_key = CachingClient.make_cache_key(raw_request, request)
187
129
 
188
130
  if not self.api_key:
189
131
  raise TogetherClientError("togetherApiKey not set in credentials.conf")
@@ -278,7 +220,7 @@ class TogetherClient(CachingClient):
278
220
  )
279
221
 
280
222
  # Expect the result to be structured the same way as a response from OpenAI API.
281
- completions: List[Sequence] = []
223
+ completions: List[GeneratedOutput] = []
282
224
  for raw_completion in response["choices"]:
283
225
  sequence_logprob = 0
284
226
  tokens: List[Token] = []
@@ -288,22 +230,20 @@ class TogetherClient(CachingClient):
288
230
  # Waiting for a fix.
289
231
  if "logprobs" in raw_completion:
290
232
  raw_data = raw_completion["logprobs"]
291
- for text, logprob, top_logprobs in zip(
292
- raw_data["tokens"], raw_data["token_logprobs"], raw_data["top_logprobs"]
293
- ):
233
+ for text, logprob in zip(raw_data["tokens"], raw_data["token_logprobs"]):
294
234
  # TODO #1654: Check if this is still needed
295
235
  text = cleanup_str(text, "together")
296
- tokens.append(Token(text=text, logprob=logprob or 0, top_logprobs=dict(top_logprobs or {})))
236
+ tokens.append(Token(text=text, logprob=logprob or 0))
297
237
  sequence_logprob += logprob or 0
298
238
  else:
299
239
  # hack: just make the entire text one token so that something shows up in the frontend
300
240
  text = cleanup_str(raw_completion["text"], "together")
301
- tokens.append(Token(text=text, logprob=0, top_logprobs={}))
241
+ tokens.append(Token(text=text, logprob=0))
302
242
 
303
243
  raw_finish_reason: Optional[str] = raw_completion.get("finish_reason")
304
244
  finish_reason: Optional[Dict] = {"reason": raw_finish_reason} if raw_finish_reason else None
305
245
 
306
- completion = Sequence(
246
+ completion = GeneratedOutput(
307
247
  text=cleanup_str(raw_completion["text"], "together"),
308
248
  logprob=sequence_logprob,
309
249
  tokens=tokens,