crfm-helm 0.4.0__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (499):
  1. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/METADATA +138 -31
  2. crfm_helm-0.5.1.dist-info/RECORD +654 -0
  3. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +31 -3
  5. helm/benchmark/adaptation/adapters/adapter.py +2 -2
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -4
  9. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +2 -3
  10. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  11. helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
  12. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  13. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +2 -1
  14. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
  15. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  16. helm/benchmark/adaptation/adapters/test_generation_adapter.py +32 -8
  17. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +7 -19
  18. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +60 -6
  19. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  20. helm/benchmark/adaptation/request_state.py +6 -1
  21. helm/benchmark/adaptation/scenario_state.py +6 -2
  22. helm/benchmark/annotation/annotator.py +43 -0
  23. helm/benchmark/annotation/annotator_factory.py +61 -0
  24. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  25. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  26. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  27. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  28. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  29. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  30. helm/benchmark/annotation_executor.py +124 -0
  31. helm/benchmark/augmentations/data_augmenter.py +0 -2
  32. helm/benchmark/augmentations/gender_perturbation.py +1 -1
  33. helm/benchmark/augmentations/perturbation.py +25 -3
  34. helm/benchmark/augmentations/perturbation_description.py +1 -1
  35. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  36. helm/benchmark/augmentations/test_perturbation.py +41 -7
  37. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  38. helm/benchmark/config_registry.py +7 -1
  39. helm/benchmark/executor.py +46 -16
  40. helm/benchmark/huggingface_registration.py +20 -7
  41. helm/benchmark/metrics/basic_metrics.py +169 -664
  42. helm/benchmark/metrics/bbq_metrics.py +3 -4
  43. helm/benchmark/metrics/bias_metrics.py +6 -6
  44. helm/benchmark/metrics/classification_metrics.py +11 -8
  45. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  46. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  47. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  48. helm/benchmark/metrics/common_metric_specs.py +167 -0
  49. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  50. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  51. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  52. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  53. helm/benchmark/metrics/disinformation_metrics.py +4 -110
  54. helm/benchmark/metrics/dry_run_metrics.py +2 -2
  55. helm/benchmark/metrics/efficiency_metrics.py +213 -0
  56. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  57. helm/benchmark/metrics/evaluate_reference_metrics.py +392 -0
  58. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  59. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  60. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  61. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  62. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  63. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  64. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  65. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  66. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  67. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  68. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  69. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  70. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  71. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  72. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  73. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  74. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  75. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  76. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  77. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  78. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  79. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  80. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  81. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  82. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  83. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  84. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  85. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  86. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  87. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  88. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  89. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  90. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  91. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  92. helm/benchmark/metrics/machine_translation_metrics.py +89 -0
  93. helm/benchmark/metrics/metric.py +93 -172
  94. helm/benchmark/metrics/metric_name.py +0 -1
  95. helm/benchmark/metrics/metric_service.py +16 -0
  96. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  97. helm/benchmark/metrics/ranking_metrics.py +2 -2
  98. helm/benchmark/metrics/reference_metric.py +148 -0
  99. helm/benchmark/metrics/summac/model_summac.py +0 -2
  100. helm/benchmark/metrics/summarization_metrics.py +2 -2
  101. helm/benchmark/metrics/test_classification_metrics.py +8 -5
  102. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  103. helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +5 -1
  104. helm/benchmark/metrics/test_metric.py +2 -2
  105. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +10 -2
  106. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  107. helm/benchmark/metrics/toxicity_utils.py +23 -0
  108. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  109. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  110. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  111. helm/benchmark/metrics/vision_language/image_metrics.py +575 -0
  112. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  113. helm/benchmark/model_deployment_registry.py +74 -0
  114. helm/benchmark/model_metadata_registry.py +41 -1
  115. helm/benchmark/multi_gpu_runner.py +133 -0
  116. helm/benchmark/presentation/create_plots.py +8 -7
  117. helm/benchmark/presentation/run_display.py +26 -10
  118. helm/benchmark/presentation/schema.py +15 -40
  119. helm/benchmark/presentation/summarize.py +119 -79
  120. helm/benchmark/presentation/table.py +8 -8
  121. helm/benchmark/presentation/test_contamination.py +2 -2
  122. helm/benchmark/presentation/test_run_entry.py +1 -2
  123. helm/benchmark/presentation/test_summarize.py +3 -3
  124. helm/benchmark/run.py +54 -26
  125. helm/benchmark/run_expander.py +205 -35
  126. helm/benchmark/run_spec.py +93 -0
  127. helm/benchmark/run_spec_factory.py +163 -0
  128. helm/benchmark/run_specs/__init__.py +0 -0
  129. helm/benchmark/run_specs/classic_run_specs.py +1510 -0
  130. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  131. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  132. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  133. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  134. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  135. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  136. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  137. helm/benchmark/run_specs/vlm_run_specs.py +757 -0
  138. helm/benchmark/runner.py +51 -57
  139. helm/benchmark/runner_config_registry.py +21 -0
  140. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  141. helm/benchmark/scenarios/bold_scenario.py +2 -2
  142. helm/benchmark/scenarios/code_scenario.py +1 -0
  143. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  144. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  145. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  146. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  147. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  148. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  149. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  150. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  151. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  152. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  153. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  154. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  155. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  156. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  157. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  158. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  159. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  160. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  161. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  162. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  163. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  164. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  165. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  166. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  167. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  168. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  169. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  170. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  171. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  172. helm/benchmark/scenarios/legalbench_scenario.py +6 -2
  173. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  174. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  175. helm/benchmark/scenarios/math_scenario.py +19 -2
  176. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  177. helm/benchmark/scenarios/numeracy_scenario.py +1 -1
  178. helm/benchmark/scenarios/opinions_qa_scenario.py +0 -4
  179. helm/benchmark/scenarios/scenario.py +4 -0
  180. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  181. helm/benchmark/scenarios/test_math_scenario.py +6 -0
  182. helm/benchmark/scenarios/test_scenario.py +6 -3
  183. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  184. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  185. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  186. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  187. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  188. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
  189. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  190. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
  191. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
  192. helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
  193. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +94 -0
  194. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  195. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  196. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  197. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  198. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  199. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  200. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  201. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  202. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  203. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  204. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  205. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  206. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
  207. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  208. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
  209. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  210. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  211. helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
  212. helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
  213. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  214. helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
  215. helm/benchmark/scenarios/vision_language/pairs_scenario.py +246 -0
  216. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  217. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  218. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  219. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +3 -4
  220. helm/benchmark/scenarios/vision_language/vqa_scenario.py +5 -3
  221. helm/benchmark/scenarios/wmt_14_scenario.py +1 -1
  222. helm/benchmark/server.py +24 -1
  223. helm/benchmark/slurm_runner.py +70 -49
  224. helm/benchmark/static/benchmarking.js +1 -1
  225. helm/benchmark/static/schema_classic.yaml +258 -1066
  226. helm/benchmark/static/schema_image2structure.yaml +304 -0
  227. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  228. helm/benchmark/static/schema_lite.yaml +2 -227
  229. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  230. helm/benchmark/static/schema_unitxt.yaml +428 -0
  231. helm/benchmark/static/schema_vhelm_lite.yaml +164 -0
  232. helm/benchmark/static/schema_vlm.yaml +823 -0
  233. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  234. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  235. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  236. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  237. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  238. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  239. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  240. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  241. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  242. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  243. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  244. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  245. helm/benchmark/static_build/assets/index-737eef9e.js +10 -0
  246. helm/benchmark/static_build/assets/index-878a1094.css +1 -0
  247. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  248. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  249. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  250. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  251. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  252. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  253. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  254. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  255. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  256. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  257. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  258. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  259. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  260. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  261. helm/benchmark/static_build/config.js +4 -0
  262. helm/benchmark/static_build/index.html +20 -0
  263. helm/benchmark/test_data_preprocessor.py +3 -3
  264. helm/benchmark/test_run_expander.py +1 -1
  265. helm/benchmark/window_services/ai21_window_service.py +22 -33
  266. helm/benchmark/window_services/cohere_window_service.py +1 -63
  267. helm/benchmark/window_services/default_window_service.py +2 -44
  268. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  269. helm/benchmark/window_services/ice_window_service.py +0 -34
  270. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  271. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  272. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  273. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  274. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  275. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  276. helm/benchmark/window_services/local_window_service.py +21 -4
  277. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  278. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  279. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  280. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  281. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  282. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  283. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  284. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  285. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  286. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  287. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  288. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  289. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  290. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  291. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  292. helm/benchmark/window_services/test_utils.py +3 -2
  293. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  294. helm/benchmark/window_services/window_service.py +42 -0
  295. helm/benchmark/window_services/window_service_factory.py +4 -1
  296. helm/benchmark/window_services/yalm_window_service.py +0 -27
  297. helm/clients/__init__.py +0 -0
  298. helm/{proxy/clients → clients}/ai21_client.py +3 -9
  299. helm/clients/aleph_alpha_client.py +112 -0
  300. helm/{proxy/clients → clients}/anthropic_client.py +233 -18
  301. helm/{proxy/clients → clients}/auto_client.py +59 -31
  302. helm/clients/bedrock_client.py +128 -0
  303. helm/clients/bedrock_utils.py +72 -0
  304. helm/{proxy/clients → clients}/client.py +65 -7
  305. helm/clients/clip_score_client.py +49 -0
  306. helm/clients/clip_scorers/__init__.py +0 -0
  307. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  308. helm/clients/clip_scorers/clip_scorer.py +50 -0
  309. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  310. helm/{proxy/clients → clients}/cohere_client.py +4 -11
  311. helm/clients/gcs_client.py +82 -0
  312. helm/{proxy/clients → clients}/google_client.py +5 -5
  313. helm/clients/google_translate_client.py +35 -0
  314. helm/{proxy/clients → clients}/http_model_client.py +5 -7
  315. helm/{proxy/clients → clients}/huggingface_client.py +43 -64
  316. helm/clients/image_generation/__init__.py +0 -0
  317. helm/clients/image_generation/adobe_vision_client.py +78 -0
  318. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  319. helm/clients/image_generation/cogview2/__init__.py +0 -0
  320. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  321. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  322. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  323. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  324. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  325. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  326. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  327. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  328. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  329. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  330. helm/clients/image_generation/cogview2_client.py +191 -0
  331. helm/clients/image_generation/dalle2_client.py +192 -0
  332. helm/clients/image_generation/dalle3_client.py +108 -0
  333. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  334. helm/clients/image_generation/dalle_mini/data.py +442 -0
  335. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  336. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  337. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  338. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  339. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  340. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  341. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  342. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  343. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  344. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  345. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  346. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  347. helm/clients/image_generation/dalle_mini_client.py +190 -0
  348. helm/clients/image_generation/deep_floyd_client.py +78 -0
  349. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  350. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  351. helm/clients/image_generation/lexica_client.py +86 -0
  352. helm/clients/image_generation/mindalle/__init__.py +0 -0
  353. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  354. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  355. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  356. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  357. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  358. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  359. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  360. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  361. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  362. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  363. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  364. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  365. helm/clients/image_generation/mindalle_client.py +115 -0
  366. helm/clients/image_generation/nudity_check_client.py +64 -0
  367. helm/clients/image_generation/together_image_generation_client.py +111 -0
  368. helm/{proxy/clients → clients}/lit_gpt_client.py +4 -4
  369. helm/{proxy/clients → clients}/megatron_client.py +5 -5
  370. helm/clients/mistral_client.py +134 -0
  371. helm/clients/moderation_api_client.py +109 -0
  372. helm/clients/open_lm_client.py +43 -0
  373. helm/clients/openai_client.py +301 -0
  374. helm/{proxy/clients → clients}/palmyra_client.py +6 -8
  375. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  376. helm/clients/simple_client.py +64 -0
  377. helm/{proxy/clients → clients}/test_auto_client.py +13 -15
  378. helm/clients/test_client.py +100 -0
  379. helm/{proxy/clients → clients}/test_huggingface_client.py +15 -16
  380. helm/clients/test_simple_client.py +19 -0
  381. helm/{proxy/clients → clients}/test_together_client.py +20 -8
  382. helm/{proxy/clients → clients}/together_client.py +104 -73
  383. helm/clients/vertexai_client.py +400 -0
  384. helm/clients/vision_language/__init__.py +0 -0
  385. helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
  386. helm/clients/vision_language/huggingface_vlm_client.py +111 -0
  387. helm/{proxy/clients → clients}/vision_language/idefics_client.py +54 -49
  388. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  389. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  390. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  391. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  392. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  393. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  394. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  395. helm/clients/vision_language/open_flamingo_client.py +155 -0
  396. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  397. helm/clients/vllm_client.py +46 -0
  398. helm/common/cache.py +16 -4
  399. helm/common/cache_backend_config.py +47 -0
  400. helm/common/clip_score_request.py +41 -0
  401. helm/common/file_caches/__init__.py +0 -0
  402. helm/common/file_caches/file_cache.py +16 -0
  403. helm/common/file_caches/local_file_cache.py +61 -0
  404. helm/common/file_caches/test_local_file_cache.py +25 -0
  405. helm/common/file_upload_request.py +27 -0
  406. helm/common/general.py +1 -1
  407. helm/common/image_generation_parameters.py +25 -0
  408. helm/common/images_utils.py +33 -3
  409. helm/common/key_value_store.py +35 -4
  410. helm/common/media_object.py +13 -0
  411. helm/common/moderations_api_request.py +71 -0
  412. helm/common/mongo_key_value_store.py +3 -3
  413. helm/common/multimodal_request_utils.py +31 -0
  414. helm/common/nudity_check_request.py +29 -0
  415. helm/common/request.py +15 -17
  416. helm/common/test_general.py +6 -0
  417. helm/common/tokenization_request.py +1 -1
  418. helm/config/model_deployments.yaml +1159 -538
  419. helm/config/model_metadata.yaml +868 -41
  420. helm/config/tokenizer_configs.yaml +149 -43
  421. helm/proxy/accounts.py +31 -4
  422. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  423. helm/proxy/critique/model_critique_client.py +8 -6
  424. helm/proxy/example_queries.py +29 -17
  425. helm/proxy/server.py +70 -5
  426. helm/proxy/services/remote_service.py +31 -0
  427. helm/proxy/services/server_service.py +96 -16
  428. helm/proxy/services/service.py +30 -0
  429. helm/proxy/services/test_remote_service.py +4 -3
  430. helm/proxy/services/test_service.py +0 -12
  431. helm/proxy/test_accounts.py +32 -0
  432. helm/proxy/token_counters/auto_token_counter.py +37 -37
  433. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  434. helm/proxy/token_counters/token_counter.py +3 -5
  435. helm/tokenizers/__init__.py +0 -0
  436. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  437. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +1 -1
  438. helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +6 -9
  439. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  440. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  441. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +7 -26
  442. helm/tokenizers/simple_tokenizer.py +33 -0
  443. helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +1 -1
  444. helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +3 -0
  445. helm/tokenizers/test_simple_tokenizer.py +33 -0
  446. helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
  447. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  448. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  449. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  450. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  451. crfm_helm-0.4.0.dist-info/RECORD +0 -397
  452. helm/benchmark/run_specs.py +0 -2762
  453. helm/benchmark/test_model_deployment_definition.py +0 -92
  454. helm/benchmark/test_model_properties.py +0 -1570
  455. helm/benchmark/vlm_run_specs.py +0 -97
  456. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  457. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  458. helm/benchmark/window_services/huggingface_window_service.py +0 -60
  459. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  460. helm/benchmark/window_services/t511b_window_service.py +0 -30
  461. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  462. helm/benchmark/window_services/ul2_window_service.py +0 -30
  463. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  464. helm/common/cache_utils.py +0 -14
  465. helm/proxy/clients/aleph_alpha_client.py +0 -95
  466. helm/proxy/clients/goose_ai_client.py +0 -99
  467. helm/proxy/clients/microsoft_client.py +0 -180
  468. helm/proxy/clients/openai_client.py +0 -206
  469. helm/proxy/clients/simple_client.py +0 -60
  470. helm/proxy/clients/test_client.py +0 -49
  471. helm/proxy/clients/vertexai_client.py +0 -115
  472. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  473. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  474. helm/proxy/token_counters/free_token_counter.py +0 -12
  475. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  476. helm/proxy/token_counters/openai_token_counter.py +0 -22
  477. helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
  478. helm/proxy/token_counters/test_openai_token_counter.py +0 -81
  479. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  480. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/LICENSE +0 -0
  481. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/entry_points.txt +0 -0
  482. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/top_level.txt +0 -0
  483. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  484. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  485. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  486. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  487. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  488. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  489. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  490. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  491. /helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +0 -0
  492. /helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +0 -0
  493. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  494. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  495. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  496. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  497. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  498. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  499. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
@@ -0,0 +1,64 @@
1
+ import itertools
2
+ from typing import List, TypedDict
3
+ from typing import Dict, Any
4
+
5
+ from helm.common.cache import CacheConfig
6
+ from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput, Token
7
+ from helm.clients.client import CachingClient
8
+
9
+
10
# Typed payload handed to SimpleClient.invoke_model and used to build the cache key.
SimpleClientRequest = TypedDict(
    "SimpleClientRequest",
    {
        # Model engine name, taken from Request.model_engine.
        "engine": str,
        # Prompt text; its whitespace-split words seed the fabricated completions.
        "prompt": str,
        # How many completions to fabricate.
        "num_completions": int,
    },
)
14
+
15
+
16
class SimpleClient(CachingClient):
    """Deterministic stub client used in tutorials and for debugging.

    Instead of calling a real model API, it fabricates completions by
    echoing the prompt's words in reverse order (see ``invoke_model``).
    """

    def __init__(self, cache_config: CacheConfig):
        # No credentials or endpoint needed; only the cache is configured.
        super().__init__(cache_config=cache_config)

    def make_request(self, request: Request) -> RequestResult:
        """Serve ``request`` from the cache, fabricating a response on a miss."""
        raw_request: SimpleClientRequest = {
            "engine": request.model_engine,
            "prompt": request.prompt,
            "num_completions": request.num_completions,
        }

        def compute() -> Dict[str, Any]:
            return self.invoke_model(raw_request)

        cache_key = CachingClient.make_cache_key(raw_request, request)
        response, cached = self.cache.get(cache_key, wrap_request_time(compute))

        # Every fabricated completion (and its single token) gets a logprob of 0.
        completions: List[GeneratedOutput] = []
        for completion_text in response["completions"]:
            single_token = Token(text=completion_text, logprob=0)
            completions.append(
                GeneratedOutput(text=completion_text, logprob=0, tokens=[single_token])
            )

        return RequestResult(
            success=True,
            cached=cached,
            request_time=response["request_time"],
            request_datetime=response.get("request_datetime"),
            completions=completions,
            embedding=[],
        )

    def invoke_model(self, raw_request: SimpleClientRequest) -> Dict[str, Any]:
        """Fabricate completions from the prompt's words, last word first.

        Example:
            Prompt: 7 2 4 6
            Completions (num_completions = 3):
            - 6
            - 4
            - 2

        The reversed word list repeats cyclically if more completions are
        requested than there are words; an empty prompt yields no completions.
        """
        reversed_words: List[str] = raw_request["prompt"].split()[::-1]
        requested: int = raw_request["num_completions"]
        if not reversed_words:
            return {"completions": []}
        return {"completions": [reversed_words[i % len(reversed_words)] for i in range(requested)]}
@@ -1,12 +1,13 @@
1
1
  import dataclasses
2
2
  from tempfile import TemporaryDirectory
3
- from helm.common.request import Sequence, Token
3
+ from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
4
+ from helm.common.request import GeneratedOutput, Token
4
5
 
5
6
  import pytest
6
7
 
7
8
  from helm.common.request import Request, RequestResult
8
9
  from helm.common.general import get_credentials
9
- from helm.proxy.clients.auto_client import AutoClient
10
+ from helm.clients.auto_client import AutoClient
10
11
 
11
12
 
12
13
  @pytest.mark.models
@@ -15,8 +16,8 @@ class TestAutoClient:
15
16
  credentials = get_credentials()
16
17
  if not credentials:
17
18
  pytest.skip("Skipping test because no credentials found")
18
- with TemporaryDirectory() as cache_path:
19
- auto_client = AutoClient(credentials, cache_path)
19
+ with TemporaryDirectory() as temp_dir_path:
20
+ auto_client = AutoClient(credentials, temp_dir_path, BlackHoleCacheBackendConfig())
20
21
  actual_result = auto_client.make_request(request)
21
22
  assert actual_result.request_time or actual_result.batch_request_time
22
23
  actual_result = dataclasses.replace(
@@ -36,32 +37,29 @@ class TestAutoClient:
36
37
  success=True,
37
38
  embedding=[],
38
39
  completions=[
39
- Sequence(
40
+ GeneratedOutput(
40
41
  text=" intelligent species on the planet. They are also one",
41
42
  logprob=-9.087313510477543,
42
43
  tokens=[
43
44
  Token(
44
45
  text="Ġintelligent",
45
46
  logprob=-1.9816237688064575,
46
- top_logprobs={"Ġintelligent": -1.9816237688064575},
47
47
  ),
48
48
  Token(
49
49
  text="Ġspecies",
50
50
  logprob=-1.2881066799163818,
51
- top_logprobs={"Ġspecies": -1.2881066799163818},
52
51
  ),
53
- Token(text="Ġon", logprob=-0.16092979907989502, top_logprobs={"Ġon": -0.16092979907989502}),
54
- Token(text="Ġthe", logprob=-0.23620447516441345, top_logprobs={"Ġthe": -0.23620447516441345}),
52
+ Token(text="Ġon", logprob=-0.16092979907989502),
53
+ Token(text="Ġthe", logprob=-0.23620447516441345),
55
54
  Token(
56
55
  text="Ġplanet",
57
56
  logprob=-0.015416033565998077,
58
- top_logprobs={"Ġplanet": -0.015416033565998077},
59
57
  ),
60
- Token(text=".", logprob=-0.6683081388473511, top_logprobs={".": -0.6683081388473511}),
61
- Token(text="ĠThey", logprob=-1.9231040477752686, top_logprobs={"ĠThey": -1.9231040477752686}),
62
- Token(text="Ġare", logprob=-0.9322243332862854, top_logprobs={"Ġare": -0.9322243332862854}),
63
- Token(text="Ġalso", logprob=-0.7750787138938904, top_logprobs={"Ġalso": -0.7750787138938904}),
64
- Token(text="Ġone", logprob=-1.1063175201416016, top_logprobs={"Ġone": -1.1063175201416016}),
58
+ Token(text=".", logprob=-0.6683081388473511),
59
+ Token(text="ĠThey", logprob=-1.9231040477752686),
60
+ Token(text="Ġare", logprob=-0.9322243332862854),
61
+ Token(text="Ġalso", logprob=-0.7750787138938904),
62
+ Token(text="Ġone", logprob=-1.1063175201416016),
65
63
  ],
66
64
  finish_reason={"reason": "length"},
67
65
  )
@@ -0,0 +1,100 @@
1
+ from helm.common.cache import BlackHoleCacheConfig
2
+ from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer
3
+ from .client import truncate_sequence, truncate_and_tokenize_response_text
4
+ from typing import List
5
+ from helm.common.request import Request, GeneratedOutput, Token
6
+
7
+
8
def truncate_sequence_helper(tokens: List[str], request: Request, expected_tokens: List[str]):
    """Build a synthetic GeneratedOutput from ``tokens``, truncate it, and check the result.

    Each input token gets logprob -1, so the sequence logprob is -len(tokens).
    Asserts that the truncated output matches ``expected_tokens`` in token
    texts, joined text, and accumulated logprob.
    """
    token_objects = [Token(text=piece, logprob=-1) for piece in tokens]
    original = GeneratedOutput(
        text="".join(tokens),
        tokens=token_objects,
        logprob=-len(tokens),
    )

    truncated = truncate_sequence(original, request)

    actual_texts = [token.text for token in truncated.tokens]
    assert expected_tokens == actual_texts
    assert "".join(expected_tokens) == truncated.text
    assert truncated.logprob == sum(token.logprob for token in truncated.tokens)
20
+
21
+
22
def test_truncate_sequence():
    """truncate_sequence honors echo_prompt, stop_sequences, and max_tokens."""
    davinci = "openai/text-davinci-002"

    # echo_prompt = True: nothing gets truncated.
    truncate_sequence_helper(
        ["a", "b", "c"],
        Request(model=davinci, model_deployment=davinci, prompt="abc", echo_prompt=True),
        ["a", "b", "c"],
    )

    # Stop sequence never appears in the output: nothing gets truncated.
    truncate_sequence_helper(
        ["hello", " world"],
        Request(model=davinci, model_deployment=davinci, stop_sequences=["#"]),
        ["hello", " world"],
    )

    # Truncate at the stop sequence.
    truncate_sequence_helper(
        ["hello", " world", "\n", "what"],
        Request(model=davinci, model_deployment=davinci, stop_sequences=["\n"]),
        ["hello", " world"],
    )

    # Truncate down to max_tokens tokens.
    truncate_sequence_helper(
        ["a", "b", "c"],
        Request(model=davinci, model_deployment=davinci, max_tokens=2),
        ["a", "b"],
    )
52
+
53
+
54
def test_truncate_and_tokenize_response_text():
    """Truncation by stop sequences / max_tokens is applied and the finish reason reported."""
    tokenizer = HuggingFaceTokenizer(BlackHoleCacheConfig())
    tokenizer_name = "huggingface/gpt2"
    full_text = "I am a scientist. I am a scientist."

    def tokens_for(texts):
        # The expected tokens all carry logprob 0.0.
        return [Token(text, 0.0) for text in texts]

    # No truncation: the whole text survives and the reason is "endoftext".
    response = truncate_and_tokenize_response_text(
        full_text, Request(max_tokens=100, stop_sequences=[]), tokenizer, tokenizer_name
    )
    assert response.finish_reason
    assert response.finish_reason["reason"] == "endoftext"
    assert response.text == full_text
    assert response.tokens == tokens_for(
        ["I", " am", " a", " scientist", ".", " I", " am", " a", " scientist", "."]
    )

    # Truncated at the first "." with max_tokens to spare: reason is "stop".
    response = truncate_and_tokenize_response_text(
        full_text, Request(max_tokens=7, stop_sequences=["."]), tokenizer, tokenizer_name
    )
    assert response.finish_reason
    assert response.finish_reason["reason"] == "stop"
    assert response.text == "I am a scientist"
    assert response.tokens == tokens_for(["I", " am", " a", " scientist"])

    # Truncated by max_tokens with no stop sequences: reason is "length".
    response = truncate_and_tokenize_response_text(
        full_text, Request(max_tokens=3, stop_sequences=[]), tokenizer, tokenizer_name
    )
    assert response.finish_reason
    assert response.finish_reason["reason"] == "length"
    assert response.text == "I am a"
    assert response.tokens == tokens_for(["I", " am", " a"])

    # max_tokens wins when it truncates before the stop sequence is reached.
    response = truncate_and_tokenize_response_text(
        full_text, Request(max_tokens=3, stop_sequences=["."]), tokenizer, tokenizer_name
    )
    assert response.finish_reason
    assert response.finish_reason["reason"] == "length"
    assert response.text == "I am a"
    assert response.tokens == tokens_for(["I", " am", " a"])
@@ -1,31 +1,24 @@
1
- import os
2
1
  import pytest
3
- import tempfile
4
2
 
5
- from helm.common.cache import SqliteCacheConfig
3
+ from helm.common.cache import BlackHoleCacheConfig
6
4
  from helm.common.request import Request, RequestResult
7
- from .huggingface_client import HuggingFaceClient
5
+ from helm.clients.huggingface_client import HuggingFaceClient
8
6
 
9
7
 
10
8
  class TestHuggingFaceClient:
11
- def setup_method(self, method):
12
- cache_file = tempfile.NamedTemporaryFile(delete=False)
13
- self.cache_path: str = cache_file.name
14
- self.client = HuggingFaceClient(cache_config=SqliteCacheConfig(self.cache_path))
15
-
16
- def teardown_method(self, method):
17
- os.remove(self.cache_path)
18
-
19
9
  def test_gpt2(self):
10
+ client = HuggingFaceClient(
11
+ cache_config=BlackHoleCacheConfig(), pretrained_model_name_or_path="openai-community/gpt2"
12
+ )
20
13
  prompt: str = "I am a computer scientist."
21
- result: RequestResult = self.client.make_request(
14
+ result: RequestResult = client.make_request(
22
15
  Request(
23
16
  model="openai/gpt2",
24
17
  model_deployment="huggingface/gpt2",
25
18
  prompt=prompt,
26
19
  num_completions=3,
27
20
  top_k_per_token=5,
28
- max_tokens=0,
21
+ max_tokens=1,
29
22
  echo_prompt=True,
30
23
  )
31
24
  )
@@ -36,7 +29,10 @@ class TestHuggingFaceClient:
36
29
 
37
30
  @pytest.mark.skip(reason="GPT-J 6B is 22 GB and extremely slow without a GPU.")
38
31
  def test_gptj_6b(self):
39
- result: RequestResult = self.client.make_request(
32
+ client = HuggingFaceClient(
33
+ cache_config=BlackHoleCacheConfig(), pretrained_model_name_or_path="openai-community/gpt2"
34
+ )
35
+ result: RequestResult = client.make_request(
40
36
  Request(
41
37
  model="eleutherai/gpt-j-6b",
42
38
  model_deployment="huggingface/gpt-j-6b",
@@ -49,8 +45,11 @@ class TestHuggingFaceClient:
49
45
  assert len(result.completions) == 3
50
46
 
51
47
  def test_logprob(self):
48
+ client = HuggingFaceClient(
49
+ cache_config=BlackHoleCacheConfig(), pretrained_model_name_or_path="openai-community/gpt2"
50
+ )
52
51
  prompt: str = "I am a computer scientist."
53
- result: RequestResult = self.client.make_request(
52
+ result: RequestResult = client.make_request(
54
53
  Request(
55
54
  model="openai/gpt2",
56
55
  model_deployment="huggingface/gpt2",
@@ -0,0 +1,19 @@
1
+ from helm.clients.simple_client import SimpleClient
2
+ from helm.common.cache import BlackHoleCacheConfig
3
+ from helm.common.request import GeneratedOutput, Request, Token
4
+
5
+
6
def test_simple_client_make_request():
    """SimpleClient fabricates a completion from the prompt's last word."""
    client = SimpleClient(BlackHoleCacheConfig())
    request = Request(
        model="simple/model1",
        model_deployment="simple/model1",
        prompt="Elephants are one of the most",
        temperature=0.0,
        max_tokens=10,
    )

    result = client.make_request(request)

    assert result.success
    # NOTE: BlackHoleCacheConfig presumably never persists entries, so this is a cache miss.
    assert not result.cached
    assert result.embedding == []
    expected_token = Token(text="most", logprob=0)
    assert result.completions == [GeneratedOutput(text="most", logprob=0, tokens=[expected_token])]
@@ -12,15 +12,15 @@ class TestTogetherClient:
12
12
  def setup_method(self, method):
13
13
  cache_file = tempfile.NamedTemporaryFile(delete=False)
14
14
  self.cache_path: str = cache_file.name
15
- self.client = TogetherClient(cache_config=SqliteCacheConfig(self.cache_path))
16
15
 
17
16
  def teardown_method(self, method):
18
17
  os.remove(self.cache_path)
19
18
 
20
19
  @pytest.mark.parametrize(
21
- "test_input,expected",
20
+ "together_model,test_input,expected",
22
21
  [
23
22
  (
23
+ "togethercomputer/RedPajama-INCITE-Base-3B-v1",
24
24
  Request(
25
25
  model="together/redpajama-incite-base-3b-v1",
26
26
  model_deployment="together/redpajama-incite-base-3b-v1",
@@ -28,7 +28,6 @@ class TestTogetherClient:
28
28
  {
29
29
  "best_of": 1,
30
30
  "echo": False,
31
- "logprobs": 1,
32
31
  "max_tokens": 100,
33
32
  "model": "togethercomputer/RedPajama-INCITE-Base-3B-v1",
34
33
  "n": 1,
@@ -40,6 +39,7 @@ class TestTogetherClient:
40
39
  },
41
40
  ),
42
41
  (
42
+ "huggyllama/llama-7b",
43
43
  Request(
44
44
  model="meta/llama-7b",
45
45
  model_deployment="together/llama-7b",
@@ -55,7 +55,6 @@ class TestTogetherClient:
55
55
  {
56
56
  "best_of": 3,
57
57
  "echo": True,
58
- "logprobs": 3,
59
58
  "max_tokens": 24,
60
59
  "model": "huggyllama/llama-7b",
61
60
  "n": 4,
@@ -67,6 +66,7 @@ class TestTogetherClient:
67
66
  },
68
67
  ),
69
68
  (
69
+ "togethercomputer/alpaca-7b",
70
70
  Request(
71
71
  model="stanford/alpaca-7b",
72
72
  model_deployment="together/alpaca-7b",
@@ -75,7 +75,6 @@ class TestTogetherClient:
75
75
  {
76
76
  "best_of": 1,
77
77
  "echo": False,
78
- "logprobs": 1,
79
78
  "max_tokens": 100,
80
79
  "model": "togethercomputer/alpaca-7b",
81
80
  "n": 1,
@@ -89,9 +88,22 @@ class TestTogetherClient:
89
88
  # TODO(#1828): Add test for `SET_DETAILS_TO_TRUE` after Together supports it.
90
89
  ],
91
90
  )
92
- def test_convert_to_raw_request(self, test_input, expected):
93
- assert expected == TogetherClient.convert_to_raw_request(test_input)
91
+ def test_convert_to_raw_request(self, together_model, test_input, expected):
92
+ client = TogetherClient(
93
+ cache_config=SqliteCacheConfig(self.cache_path),
94
+ together_model=together_model,
95
+ )
96
+ assert expected == client.convert_to_raw_request(test_input)
94
97
 
95
98
  def test_api_key_error(self):
99
+ client = TogetherClient(
100
+ cache_config=SqliteCacheConfig(self.cache_path),
101
+ together_model="togethercomputer/RedPajama-INCITE-Base-3B-v1",
102
+ )
96
103
  with pytest.raises(TogetherClientError):
97
- self.client.make_request(Request(model="bigscience/bloom", model_deployment="together/bloom"))
104
+ client.make_request(
105
+ Request(
106
+ model="together/redpajama-incite-base-3b-v1",
107
+ model_deployment="together/redpajama-incite-base-3b-v1",
108
+ )
109
+ )
@@ -1,69 +1,20 @@
1
1
  from copy import deepcopy
2
- from typing import List, Dict, Any, Optional, Union
2
+ from itertools import zip_longest
3
+ from typing import List, Dict, Any, Optional, TypedDict, Union
3
4
 
4
5
  import requests
5
6
  from retrying import retry
6
7
 
7
8
  from helm.common.cache import CacheConfig
8
- from helm.common.request import wrap_request_time, Request, RequestResult, Sequence, Token
9
- from .client import CachingClient, truncate_sequence, cleanup_str
10
-
11
-
12
- MODEL_ALIASES: Dict[str, str] = {
13
- # Legacy models
14
- "flan-t5-xxl": "flan-t5-xxl-hf",
15
- "h3-2.7b": "h3-2.7b-h3",
16
- "opt-1.3b": "opt-1.3b-ft-tp1",
17
- "opt-6.7b": "opt-6.7b-ft-tp1",
18
- "mpt-7b": "togethercomputer/mpt-7b",
19
- "mpt-instruct-7b": "togethercomputer/mpt-7b-instruct",
20
- "stablelm-base-alpha-3b": "stabilityai/stablelm-base-alpha-3b",
21
- "stablelm-base-alpha-7b": "stabilityai/stablelm-base-alpha-7b",
22
- # Production models
23
- "redpajama-incite-base-3b-v1": "togethercomputer/RedPajama-INCITE-Base-3B-v1",
24
- "redpajama-incite-instruct-3b-v1": "togethercomputer/RedPajama-INCITE-Instruct-3B-v1",
25
- "redpajama-incite-base-7b": "togethercomputer/RedPajama-INCITE-7B-Base",
26
- "redpajama-incite-instruct-7b": "togethercomputer/RedPajama-INCITE-7B-Instruct",
27
- "alpaca-7b": "togethercomputer/alpaca-7b",
28
- "dolly-v2-3b": "databricks/dolly-v2-3b",
29
- "dolly-v2-7b": "databricks/dolly-v2-7b",
30
- "dolly-v2-12b": "databricks/dolly-v2-12b",
31
- "falcon-7b": "togethercomputer/falcon-7b",
32
- "falcon-7b-instruct": "togethercomputer/falcon-7b-instruct",
33
- "falcon-40b": "togethercomputer/falcon-40b",
34
- "falcon-40b-instruct": "togethercomputer/falcon-40b-instruct",
35
- "gpt-jt-6b-v1": "togethercomputer/GPT-JT-6B-v1",
36
- "gpt-neoxt-chat-base-20b": "togethercomputer/GPT-NeoXT-Chat-Base-20B",
37
- "llama-7b": "huggyllama/llama-7b",
38
- "llama-13b": "huggyllama/llama-13b",
39
- "llama-30b": "huggyllama/llama-30b",
40
- "llama-65b": "huggyllama/llama-65b",
41
- "llama-2-7b": "togethercomputer/llama-2-7b",
42
- "llama-2-13b": "togethercomputer/llama-2-13b",
43
- "llama-2-70b": "togethercomputer/llama-2-70b",
44
- "mistral-7b-v0.1": "mistralai/Mistral-7B-v0.1",
45
- "mixtral-8x7b-32kseqlen": "mistralai/mixtral-8x7b-32kseqlen",
46
- "mpt-30b": "togethercomputer/mpt-30b",
47
- "mpt-instruct-30b": "togethercomputer/mpt-30b-instruct",
48
- "pythia-1b-v0": "EleutherAI/pythia-1b-v0",
49
- "pythia-2.8b-v0": "EleutherAI/pythia-2.8b-v0",
50
- "pythia-6.9b": "EleutherAI/pythia-6.9b",
51
- "pythia-12b-v0": "EleutherAI/pythia-12b-v0",
52
- "vicuna-7b-v1.3": "lmsys/vicuna-7b-v1.3",
53
- "vicuna-13b-v1.3": "lmsys/vicuna-13b-v1.3",
54
- "yi-6b": "zero-one-ai/Yi-6B",
55
- "yi-34b": "zero-one-ai/Yi-34B",
56
- }
57
- """Together model name aliases.
9
+ from helm.common.optional_dependencies import handle_module_not_found_error
10
+ from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput, Token
11
+ from helm.clients.client import CachingClient, truncate_sequence, cleanup_str
58
12
 
59
- HELM users use a shorter model name (e.g. together/flan-t5-xxl)
60
- whereas the Together client sends and caches requests using
61
- a longer model name that is suffixed with the implementation framework
62
- (e.g. flan-t5-xxl-hf). This allows tracking exactly which
63
- implementation was used in the cached results, since some results may
64
- be different depending on the implementation (e.g. efficiency metrics).
65
- This also allows future migration of results in the case of changes of
66
- available implementations on Together."""
13
+ try:
14
+ from together import Together
15
+ from together.types import ChatCompletionResponse
16
+ except ModuleNotFoundError as e:
17
+ handle_module_not_found_error(e, ["together"])
67
18
 
68
19
 
69
20
  class _RewriteRequestTags:
@@ -154,36 +105,35 @@ class TogetherClient(CachingClient):
154
105
  INFERENCE_ENDPOINT: str = "https://api.together.xyz/api/inference"
155
106
  RETRIEVE_JOB_MAX_WAIT_SECONDS: int = 60
156
107
 
157
- @staticmethod
158
- def convert_to_raw_request(request: Request) -> Dict:
108
+ def convert_to_raw_request(self, request: Request) -> Dict:
159
109
  # Following the examples from https://github.com/togethercomputer/open-models-api
160
110
  raw_request = {
161
111
  "request_type": "language-model-inference",
162
- "model": MODEL_ALIASES.get(request.model_engine, request.model_engine),
112
+ "model": self.together_model or request.model,
163
113
  "prompt": request.prompt,
164
114
  "temperature": request.temperature,
165
115
  "n": request.num_completions,
166
116
  "max_tokens": request.max_tokens,
167
117
  "best_of": request.top_k_per_token,
168
- "logprobs": request.top_k_per_token,
169
118
  "stop": request.stop_sequences or None,
170
119
  "echo": request.echo_prompt,
171
120
  "top_p": request.top_p,
172
121
  }
173
122
  return _rewrite_raw_request_for_model_tags(raw_request, request.model_engine)
174
123
 
175
- def __init__(self, cache_config: CacheConfig, api_key: Optional[str] = None):
124
+ def __init__(self, cache_config: CacheConfig, together_model: Optional[str] = None, api_key: Optional[str] = None):
176
125
  super().__init__(cache_config=cache_config)
177
126
  # TODO: the endpoint currently doesn't require an API key. When an API key is not specified
178
127
  # in credentials.conf, we rely on offline evaluation only.
179
128
  self.api_key: Optional[str] = api_key
129
+ self.together_model = together_model
180
130
 
181
131
  def _get_job_url(self, job_id: str) -> str:
182
132
  return f"https://api.together.xyz/jobs/job/{job_id}"
183
133
 
184
134
  def make_request(self, request: Request) -> RequestResult:
185
- raw_request = TogetherClient.convert_to_raw_request(request)
186
- cache_key: Dict = CachingClient.make_cache_key(raw_request, request)
135
+ raw_request = self.convert_to_raw_request(request)
136
+ cache_key = CachingClient.make_cache_key(raw_request, request)
187
137
 
188
138
  if not self.api_key:
189
139
  raise TogetherClientError("togetherApiKey not set in credentials.conf")
@@ -278,7 +228,7 @@ class TogetherClient(CachingClient):
278
228
  )
279
229
 
280
230
  # Expect the result to be structured the same way as a response from OpenAI API.
281
- completions: List[Sequence] = []
231
+ completions: List[GeneratedOutput] = []
282
232
  for raw_completion in response["choices"]:
283
233
  sequence_logprob = 0
284
234
  tokens: List[Token] = []
@@ -288,22 +238,20 @@ class TogetherClient(CachingClient):
288
238
  # Waiting for a fix.
289
239
  if "logprobs" in raw_completion:
290
240
  raw_data = raw_completion["logprobs"]
291
- for text, logprob, top_logprobs in zip(
292
- raw_data["tokens"], raw_data["token_logprobs"], raw_data["top_logprobs"]
293
- ):
241
+ for text, logprob in zip(raw_data["tokens"], raw_data["token_logprobs"]):
294
242
  # TODO #1654: Check if this is still needed
295
243
  text = cleanup_str(text, "together")
296
- tokens.append(Token(text=text, logprob=logprob or 0, top_logprobs=dict(top_logprobs or {})))
244
+ tokens.append(Token(text=text, logprob=logprob or 0))
297
245
  sequence_logprob += logprob or 0
298
246
  else:
299
247
  # hack: just make the entire text one token so that something shows up in the frontend
300
248
  text = cleanup_str(raw_completion["text"], "together")
301
- tokens.append(Token(text=text, logprob=0, top_logprobs={}))
249
+ tokens.append(Token(text=text, logprob=0))
302
250
 
303
251
  raw_finish_reason: Optional[str] = raw_completion.get("finish_reason")
304
252
  finish_reason: Optional[Dict] = {"reason": raw_finish_reason} if raw_finish_reason else None
305
253
 
306
- completion = Sequence(
254
+ completion = GeneratedOutput(
307
255
  text=cleanup_str(raw_completion["text"], "together"),
308
256
  logprob=sequence_logprob,
309
257
  tokens=tokens,
@@ -332,3 +280,86 @@ class TogetherClient(CachingClient):
332
280
  completions=completions,
333
281
  embedding=[],
334
282
  )
283
+
284
+
285
# Typed payload for the Together chat-completions endpoint,
# produced by convert_to_raw_chat_request.
TogetherRawChatRequest = TypedDict(
    "TogetherRawChatRequest",
    {
        "messages": List[Dict[str, str]],
        "model": str,
        "max_tokens": int,
        "stop": List[str],
        "temperature": float,
        "top_p": float,
        "top_k": int,
        "logprobs": int,
        "echo": bool,
        "n": int,
    },
)
296
+
297
+
298
def convert_to_raw_chat_request(request: "Request") -> "TogetherRawChatRequest":
    """Translate a HELM Request into kwargs for Together's chat-completions API.

    Uses ``request.messages`` verbatim when present; otherwise wraps the plain
    prompt as a single user message.
    """
    if request.messages:
        chat_messages = request.messages
    else:
        chat_messages = [{"role": "user", "content": request.prompt}]
    return {
        "messages": chat_messages,
        "model": request.model,
        "max_tokens": request.max_tokens,
        "stop": request.stop_sequences,
        "temperature": request.temperature,
        "top_p": request.top_p,
        "top_k": request.top_k_per_token,
        # Request at most one logprob per token (capped at 1 even when
        # top_k_per_token is larger).
        "logprobs": min(request.top_k_per_token, 1),
        "echo": request.echo_prompt,
        "n": request.num_completions,
    }
315
+
316
+
317
class TogetherChatClient(CachingClient):
    """Client that uses the Python Together library for chat models."""

    def __init__(self, cache_config: CacheConfig, api_key: str, together_model: Optional[str] = None):
        super().__init__(cache_config=cache_config)
        self._client = Together(api_key=api_key)
        # NOTE(review): together_model is accepted but currently unused; requests
        # are sent with request.model (see convert_to_raw_chat_request) — confirm intended.

    def make_request(self, request: Request) -> RequestResult:
        """Send a chat-completions request, caching by the raw request payload."""
        raw_request = convert_to_raw_chat_request(request)
        cache_key = CachingClient.make_cache_key(raw_request, request)

        def call_api() -> Dict[Any, Any]:
            # Serialize the SDK response to JSON-compatible data so it can be cached.
            sdk_response = self._client.chat.completions.create(**raw_request)
            return sdk_response.model_dump(mode="json")

        try:
            raw_response, cached = self.cache.get(cache_key, wrap_request_time(call_api))
            response = ChatCompletionResponse.model_validate(raw_response)
        except Exception as error:
            return RequestResult(
                success=False,
                cached=False,
                error=str(error),
                completions=[],
                embedding=[],
            )

        generated_outputs: List[GeneratedOutput] = []
        for choice in response.choices:
            # NOTE: Together always returns None for choice.finish_reason
            # NOTE: Together does not return logprobs for the whole generated output, only for individual tokens
            tokens: List[Token] = []
            if choice.logprobs:
                token_texts = choice.logprobs.tokens or []
                token_logprobs = choice.logprobs.token_logprobs or []
                for token_text, token_logprob in zip_longest(token_texts, token_logprobs):
                    if token_text is None:
                        # token_logprobs was longer than tokens; ignore the surplus.
                        break
                    tokens.append(Token(text=token_text, logprob=token_logprob or 0.0))
            assert choice.message.role == "assistant"
            generated_outputs.append(GeneratedOutput(text=choice.message.content, logprob=0.0, tokens=tokens))

        return RequestResult(
            success=True,
            cached=cached,
            request_time=raw_response["request_time"],
            request_datetime=raw_response["request_datetime"],
            completions=generated_outputs,
            embedding=[],
        )