crfm-helm 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +144 -36
  2. crfm_helm-0.5.0.dist-info/RECORD +642 -0
  3. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +37 -2
  5. helm/benchmark/adaptation/adapters/adapter.py +4 -42
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/generation_adapter.py +2 -0
  9. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +21 -4
  10. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +12 -5
  11. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +1 -0
  12. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  13. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  14. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +5 -1
  15. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +1 -0
  16. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -0
  17. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  18. helm/benchmark/adaptation/adapters/test_generation_adapter.py +59 -14
  19. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +40 -5
  20. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +78 -10
  21. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  22. helm/benchmark/adaptation/prompt.py +7 -1
  23. helm/benchmark/adaptation/request_state.py +6 -1
  24. helm/benchmark/adaptation/scenario_state.py +6 -2
  25. helm/benchmark/annotation/annotator.py +43 -0
  26. helm/benchmark/annotation/annotator_factory.py +61 -0
  27. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  28. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  29. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  30. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  31. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  32. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  33. helm/benchmark/annotation_executor.py +124 -0
  34. helm/benchmark/augmentations/cleva_perturbation.py +7 -14
  35. helm/benchmark/augmentations/contraction_expansion_perturbation.py +3 -3
  36. helm/benchmark/augmentations/contrast_sets_perturbation.py +0 -3
  37. helm/benchmark/augmentations/data_augmenter.py +0 -2
  38. helm/benchmark/augmentations/dialect_perturbation.py +2 -2
  39. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  40. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  41. helm/benchmark/augmentations/gender_perturbation.py +3 -3
  42. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  43. helm/benchmark/augmentations/mild_mix_perturbation.py +2 -2
  44. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  45. helm/benchmark/augmentations/person_name_perturbation.py +0 -7
  46. helm/benchmark/augmentations/perturbation.py +20 -7
  47. helm/benchmark/augmentations/perturbation_description.py +1 -1
  48. helm/benchmark/augmentations/space_perturbation.py +2 -2
  49. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  50. helm/benchmark/augmentations/synonym_perturbation.py +2 -2
  51. helm/benchmark/augmentations/test_perturbation.py +11 -7
  52. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  53. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  54. helm/benchmark/config_registry.py +38 -0
  55. helm/benchmark/executor.py +46 -16
  56. helm/benchmark/huggingface_registration.py +37 -7
  57. helm/benchmark/metrics/basic_metrics.py +172 -641
  58. helm/benchmark/metrics/bbq_metrics.py +3 -4
  59. helm/benchmark/metrics/bias_metrics.py +6 -6
  60. helm/benchmark/metrics/classification_metrics.py +11 -8
  61. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  62. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  63. helm/benchmark/metrics/code_metrics.py +4 -3
  64. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  65. helm/benchmark/metrics/common_metric_specs.py +167 -0
  66. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  67. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  68. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  69. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  70. helm/benchmark/metrics/disinformation_metrics.py +6 -112
  71. helm/benchmark/metrics/dry_run_metrics.py +5 -3
  72. helm/benchmark/metrics/efficiency_metrics.py +206 -0
  73. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  74. helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
  75. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  76. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  77. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  78. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  79. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  80. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  81. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  82. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  83. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  84. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  85. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  86. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  87. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  88. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  89. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  90. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  91. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  92. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  93. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  94. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  95. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  96. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  97. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  98. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  99. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  100. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  101. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  102. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  103. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  104. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  105. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  106. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  107. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  108. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  109. helm/benchmark/metrics/machine_translation_metrics.py +5 -5
  110. helm/benchmark/metrics/metric.py +93 -172
  111. helm/benchmark/metrics/metric_name.py +0 -1
  112. helm/benchmark/metrics/metric_service.py +16 -0
  113. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  114. helm/benchmark/metrics/ranking_metrics.py +6 -7
  115. helm/benchmark/metrics/reference_metric.py +148 -0
  116. helm/benchmark/metrics/summac/model_summac.py +0 -2
  117. helm/benchmark/metrics/summarization_metrics.py +8 -8
  118. helm/benchmark/metrics/test_classification_metrics.py +9 -6
  119. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  120. helm/benchmark/metrics/test_evaluate_reference_metrics.py +30 -0
  121. helm/benchmark/metrics/test_metric.py +2 -2
  122. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +1 -1
  123. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +13 -3
  124. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  125. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -0
  126. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +9 -2
  127. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  128. helm/benchmark/metrics/toxicity_utils.py +23 -0
  129. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  130. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  131. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  132. helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
  133. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  134. helm/benchmark/model_deployment_registry.py +164 -41
  135. helm/benchmark/model_metadata_registry.py +181 -35
  136. helm/benchmark/multi_gpu_runner.py +133 -0
  137. helm/benchmark/presentation/contamination.py +3 -3
  138. helm/benchmark/presentation/create_plots.py +8 -7
  139. helm/benchmark/presentation/run_display.py +50 -17
  140. helm/benchmark/presentation/schema.py +28 -46
  141. helm/benchmark/presentation/summarize.py +213 -96
  142. helm/benchmark/presentation/table.py +8 -8
  143. helm/benchmark/presentation/test_contamination.py +2 -2
  144. helm/benchmark/presentation/test_run_entry.py +14 -9
  145. helm/benchmark/presentation/test_summarize.py +5 -0
  146. helm/benchmark/run.py +66 -54
  147. helm/benchmark/run_expander.py +342 -31
  148. helm/benchmark/run_spec.py +93 -0
  149. helm/benchmark/run_spec_factory.py +162 -0
  150. helm/benchmark/run_specs/__init__.py +0 -0
  151. helm/benchmark/{run_specs.py → run_specs/classic_run_specs.py} +217 -1330
  152. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  153. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  154. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  155. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  156. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  157. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  158. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  159. helm/benchmark/run_specs/vlm_run_specs.py +501 -0
  160. helm/benchmark/runner.py +116 -69
  161. helm/benchmark/runner_config_registry.py +21 -0
  162. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  163. helm/benchmark/scenarios/bold_scenario.py +2 -2
  164. helm/benchmark/scenarios/cleva_scenario.py +43 -46
  165. helm/benchmark/scenarios/code_scenario.py +3 -2
  166. helm/benchmark/scenarios/commonsense_scenario.py +171 -191
  167. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  168. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  169. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  170. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  171. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  172. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  173. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  174. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  175. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  176. helm/benchmark/scenarios/entity_matching_scenario.py +1 -1
  177. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  178. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  179. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  180. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  181. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  182. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  183. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  184. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  185. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  186. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  187. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  188. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  189. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  190. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  191. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  192. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  193. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  194. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  195. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  196. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  197. helm/benchmark/scenarios/legalbench_scenario.py +123 -0
  198. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  199. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  200. helm/benchmark/scenarios/lsat_qa_scenario.py +4 -2
  201. helm/benchmark/scenarios/math_scenario.py +19 -2
  202. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  203. helm/benchmark/scenarios/numeracy_scenario.py +3 -3
  204. helm/benchmark/scenarios/opinions_qa_scenario.py +6 -10
  205. helm/benchmark/scenarios/raft_scenario.py +2 -6
  206. helm/benchmark/scenarios/scenario.py +14 -2
  207. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  208. helm/benchmark/scenarios/test_math_scenario.py +22 -0
  209. helm/benchmark/scenarios/test_scenario.py +6 -3
  210. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  211. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  212. helm/benchmark/scenarios/the_pile_scenario.py +6 -7
  213. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  214. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  215. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  216. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  217. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
  218. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  219. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  220. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  221. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  222. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  223. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  224. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  225. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  226. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  227. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  228. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  229. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  230. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  231. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  232. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  233. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  234. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  235. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  236. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  237. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +107 -0
  238. helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
  239. helm/benchmark/scenarios/wmt_14_scenario.py +18 -18
  240. helm/benchmark/server.py +59 -2
  241. helm/benchmark/slurm_jobs.py +12 -0
  242. helm/benchmark/slurm_runner.py +79 -51
  243. helm/benchmark/static/benchmarking.js +3 -4
  244. helm/benchmark/static/contamination.yaml +1 -1
  245. helm/benchmark/static/images/organizations/together.png +0 -0
  246. helm/benchmark/static/json-urls.js +4 -0
  247. helm/benchmark/static/{schema.yaml → schema_classic.yaml} +346 -930
  248. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  249. helm/benchmark/static/schema_lite.yaml +824 -0
  250. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  251. helm/benchmark/static/schema_unitxt.yaml +428 -0
  252. helm/benchmark/static/schema_vlm.yaml +576 -0
  253. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  254. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  255. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  256. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  257. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  258. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  259. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  260. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  261. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  262. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  263. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  264. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  265. helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
  266. helm/benchmark/static_build/assets/index-d839df55.js +9 -0
  267. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  268. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  269. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  270. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  271. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  272. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  273. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  274. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  275. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  276. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  277. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  278. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  279. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  280. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  281. helm/benchmark/static_build/config.js +4 -0
  282. helm/benchmark/static_build/index.html +20 -0
  283. helm/benchmark/test_data_preprocessor.py +3 -3
  284. helm/benchmark/test_model_deployment_definition.py +90 -0
  285. helm/benchmark/test_run_expander.py +1 -1
  286. helm/benchmark/tokenizer_config_registry.py +10 -14
  287. helm/benchmark/window_services/ai21_window_service.py +22 -33
  288. helm/benchmark/window_services/cohere_window_service.py +1 -63
  289. helm/benchmark/window_services/default_window_service.py +2 -35
  290. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  291. helm/benchmark/window_services/ice_window_service.py +0 -34
  292. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  293. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  294. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  295. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  296. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  297. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  298. helm/benchmark/window_services/local_window_service.py +21 -4
  299. helm/benchmark/window_services/no_decoding_window_service.py +32 -0
  300. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  301. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  302. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  303. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  304. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  305. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  306. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  307. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  308. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  309. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  310. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  311. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  312. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  313. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  314. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  315. helm/benchmark/window_services/test_utils.py +3 -2
  316. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  317. helm/benchmark/window_services/window_service.py +42 -0
  318. helm/benchmark/window_services/window_service_factory.py +24 -269
  319. helm/benchmark/window_services/yalm_window_service.py +0 -27
  320. helm/clients/__init__.py +0 -0
  321. helm/{proxy/clients → clients}/ai21_client.py +5 -12
  322. helm/clients/aleph_alpha_client.py +112 -0
  323. helm/{proxy/clients → clients}/anthropic_client.py +213 -24
  324. helm/clients/auto_client.py +215 -0
  325. helm/clients/bedrock_client.py +128 -0
  326. helm/clients/bedrock_utils.py +72 -0
  327. helm/{proxy/clients → clients}/client.py +67 -55
  328. helm/clients/clip_score_client.py +49 -0
  329. helm/clients/clip_scorers/__init__.py +0 -0
  330. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  331. helm/clients/clip_scorers/clip_scorer.py +50 -0
  332. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  333. helm/{proxy/clients → clients}/cohere_client.py +6 -17
  334. helm/clients/gcs_client.py +82 -0
  335. helm/{proxy/clients → clients}/google_client.py +7 -8
  336. helm/clients/google_translate_client.py +35 -0
  337. helm/{proxy/clients → clients}/http_model_client.py +6 -10
  338. helm/{proxy/clients → clients}/huggingface_client.py +134 -92
  339. helm/clients/image_generation/__init__.py +0 -0
  340. helm/clients/image_generation/adobe_vision_client.py +78 -0
  341. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  342. helm/clients/image_generation/cogview2/__init__.py +0 -0
  343. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  344. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  345. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  346. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  347. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  348. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  349. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  350. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  351. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  352. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  353. helm/clients/image_generation/cogview2_client.py +191 -0
  354. helm/clients/image_generation/dalle2_client.py +192 -0
  355. helm/clients/image_generation/dalle3_client.py +108 -0
  356. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  357. helm/clients/image_generation/dalle_mini/data.py +442 -0
  358. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  359. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  360. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  361. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  362. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  363. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  364. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  365. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  366. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  367. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  368. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  369. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  370. helm/clients/image_generation/dalle_mini_client.py +190 -0
  371. helm/clients/image_generation/deep_floyd_client.py +78 -0
  372. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  373. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  374. helm/clients/image_generation/lexica_client.py +86 -0
  375. helm/clients/image_generation/mindalle/__init__.py +0 -0
  376. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  377. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  378. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  379. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  380. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  381. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  382. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  383. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  384. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  385. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  386. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  387. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  388. helm/clients/image_generation/mindalle_client.py +115 -0
  389. helm/clients/image_generation/nudity_check_client.py +64 -0
  390. helm/clients/image_generation/together_image_generation_client.py +111 -0
  391. helm/{proxy/clients → clients}/lit_gpt_client.py +7 -5
  392. helm/{proxy/clients → clients}/megatron_client.py +13 -7
  393. helm/clients/mistral_client.py +134 -0
  394. helm/clients/moderation_api_client.py +109 -0
  395. helm/clients/open_lm_client.py +43 -0
  396. helm/clients/openai_client.py +302 -0
  397. helm/{proxy/clients → clients}/palmyra_client.py +15 -12
  398. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  399. helm/clients/simple_client.py +64 -0
  400. helm/{proxy/clients → clients}/test_auto_client.py +15 -15
  401. helm/clients/test_client.py +100 -0
  402. helm/clients/test_huggingface_client.py +70 -0
  403. helm/clients/test_simple_client.py +19 -0
  404. helm/{proxy/clients → clients}/test_together_client.py +23 -12
  405. helm/{proxy/clients → clients}/together_client.py +18 -71
  406. helm/clients/vertexai_client.py +391 -0
  407. helm/clients/vision_language/__init__.py +0 -0
  408. helm/clients/vision_language/huggingface_vlm_client.py +104 -0
  409. helm/{proxy/clients → clients}/vision_language/idefics_client.py +59 -52
  410. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  411. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  412. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  413. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  414. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  415. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  416. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  417. helm/clients/vision_language/open_flamingo_client.py +155 -0
  418. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  419. helm/clients/vllm_client.py +46 -0
  420. helm/common/cache.py +24 -179
  421. helm/common/cache_backend_config.py +47 -0
  422. helm/common/clip_score_request.py +41 -0
  423. helm/common/concurrency.py +32 -0
  424. helm/common/credentials_utils.py +28 -0
  425. helm/common/file_caches/__init__.py +0 -0
  426. helm/common/file_caches/file_cache.py +16 -0
  427. helm/common/file_caches/local_file_cache.py +61 -0
  428. helm/common/file_caches/test_local_file_cache.py +25 -0
  429. helm/common/file_upload_request.py +27 -0
  430. helm/common/general.py +29 -10
  431. helm/common/image_generation_parameters.py +25 -0
  432. helm/common/images_utils.py +24 -1
  433. helm/common/key_value_store.py +113 -0
  434. helm/common/media_object.py +13 -0
  435. helm/common/moderations_api_request.py +71 -0
  436. helm/common/mongo_key_value_store.py +88 -0
  437. helm/common/multimodal_request_utils.py +31 -0
  438. helm/common/nudity_check_request.py +29 -0
  439. helm/common/object_spec.py +2 -2
  440. helm/common/request.py +36 -27
  441. helm/common/test_general.py +6 -0
  442. helm/common/tokenization_request.py +6 -3
  443. helm/config/__init__.py +0 -0
  444. helm/config/model_deployments.yaml +1942 -0
  445. helm/config/model_metadata.yaml +2201 -0
  446. helm/config/tokenizer_configs.yaml +362 -0
  447. helm/proxy/accounts.py +31 -4
  448. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  449. helm/proxy/critique/model_critique_client.py +13 -5
  450. helm/proxy/example_queries.py +29 -17
  451. helm/proxy/retry.py +8 -2
  452. helm/proxy/server.py +77 -5
  453. helm/proxy/services/remote_service.py +31 -0
  454. helm/proxy/services/server_service.py +103 -20
  455. helm/proxy/services/service.py +34 -2
  456. helm/proxy/services/test_remote_service.py +7 -6
  457. helm/proxy/services/test_service.py +27 -18
  458. helm/proxy/test_accounts.py +32 -0
  459. helm/proxy/token_counters/auto_token_counter.py +37 -37
  460. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  461. helm/proxy/token_counters/token_counter.py +3 -5
  462. helm/py.typed +0 -0
  463. helm/tokenizers/__init__.py +0 -0
  464. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  465. helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +3 -1
  466. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +17 -11
  467. helm/tokenizers/auto_tokenizer.py +93 -0
  468. helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +8 -2
  469. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  470. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  471. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +56 -60
  472. helm/tokenizers/simple_tokenizer.py +33 -0
  473. helm/tokenizers/test_anthropic_tokenizer.py +82 -0
  474. helm/tokenizers/test_huggingface_tokenizer.py +136 -0
  475. helm/tokenizers/test_simple_tokenizer.py +33 -0
  476. helm/tokenizers/vertexai_tokenizer.py +97 -0
  477. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  478. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  479. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  480. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  481. crfm_helm-0.3.0.dist-info/RECORD +0 -396
  482. helm/benchmark/vlm_run_specs.py +0 -71
  483. helm/benchmark/window_services/anthropic_window_service.py +0 -68
  484. helm/benchmark/window_services/bloom_window_service.py +0 -35
  485. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  486. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  487. helm/benchmark/window_services/gptj_window_service.py +0 -38
  488. helm/benchmark/window_services/gptneox_window_service.py +0 -41
  489. helm/benchmark/window_services/http_model_window_service.py +0 -28
  490. helm/benchmark/window_services/huggingface_window_service.py +0 -59
  491. helm/benchmark/window_services/lit_gpt_window_service.py +0 -27
  492. helm/benchmark/window_services/llama_window_service.py +0 -28
  493. helm/benchmark/window_services/luminous_window_service.py +0 -67
  494. helm/benchmark/window_services/megatron_window_service.py +0 -10
  495. helm/benchmark/window_services/mt_nlg_window_service.py +0 -27
  496. helm/benchmark/window_services/openai_window_service.py +0 -13
  497. helm/benchmark/window_services/opt_window_service.py +0 -35
  498. helm/benchmark/window_services/palmyra_window_service.py +0 -45
  499. helm/benchmark/window_services/remote_window_service.py +0 -48
  500. helm/benchmark/window_services/santacoder_window_service.py +0 -27
  501. helm/benchmark/window_services/starcoder_window_service.py +0 -27
  502. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  503. helm/benchmark/window_services/t511b_window_service.py +0 -30
  504. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  505. helm/benchmark/window_services/ul2_window_service.py +0 -30
  506. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  507. helm/benchmark/window_services/wider_openai_window_service.py +0 -52
  508. helm/proxy/clients/aleph_alpha_client.py +0 -99
  509. helm/proxy/clients/auto_client.py +0 -461
  510. helm/proxy/clients/goose_ai_client.py +0 -100
  511. helm/proxy/clients/microsoft_client.py +0 -182
  512. helm/proxy/clients/openai_client.py +0 -206
  513. helm/proxy/clients/remote_model_registry.py +0 -28
  514. helm/proxy/clients/simple_client.py +0 -61
  515. helm/proxy/clients/test_anthropic_client.py +0 -63
  516. helm/proxy/clients/test_client.py +0 -31
  517. helm/proxy/clients/test_huggingface_client.py +0 -87
  518. helm/proxy/models.py +0 -963
  519. helm/proxy/test_models.py +0 -27
  520. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  521. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  522. helm/proxy/token_counters/free_token_counter.py +0 -12
  523. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  524. helm/proxy/token_counters/openai_token_counter.py +0 -22
  525. helm/proxy/token_counters/test_ai21_token_counter.py +0 -86
  526. helm/proxy/token_counters/test_openai_token_counter.py +0 -79
  527. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  528. helm/proxy/tokenizers/test_huggingface_tokenizer.py +0 -56
  529. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
  530. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
  531. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
  532. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  533. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  534. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  535. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  536. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  537. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  538. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  539. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  540. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  541. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  542. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  543. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  544. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  545. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  546. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
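Note: the file moves above fold helm/proxy/clients and helm/proxy/tokenizers into the new top-level helm/clients and helm/tokenizers packages, and add static configuration under helm/config (model_deployments.yaml, model_metadata.yaml, tokenizer_configs.yaml). A hedged sketch of the resulting import migration for downstream code follows; the class names are assumptions inferred from the module names, not taken from this diff:

    # Hypothetical call-site migration from crfm-helm 0.3.0 to 0.5.0.
    # Old (0.3.0):
    #   from helm.proxy.clients.huggingface_client import HuggingFaceClient
    #   from helm.proxy.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer
    # New (0.5.0), following the package moves listed above:
    from helm.clients.huggingface_client import HuggingFaceClient
    from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer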
helm/benchmark/window_services/window_service_factory.py
@@ -1,41 +1,20 @@
-from helm.benchmark.model_deployment_registry import WindowServiceSpec, get_model_deployment
-from helm.proxy.models import (
-    get_model,
-    get_model_names_with_tag,
-    Model,
-    AI21_WIDER_CONTEXT_WINDOW_TAG,
-    AI21_JURASSIC_2_JUMBO_CONTEXT_WINDOW_TAG,
-    WIDER_CONTEXT_WINDOW_TAG,
-    GPT_TURBO_CONTEXT_WINDOW_TAG,
-    GPT_TURBO_16K_CONTEXT_WINDOW_TAG,
-    GPT4_CONTEXT_WINDOW_TAG,
-    GPT4_32K_CONTEXT_WINDOW_TAG,
-)
+from typing import Optional
 
-from helm.benchmark.window_services.huggingface_window_service import HuggingFaceWindowService
-from helm.benchmark.window_services.gpt2_window_service import GPT2WindowService
-from helm.benchmark.window_services.remote_window_service import get_remote_window_service
+from helm.benchmark.model_deployment_registry import ModelDeployment, WindowServiceSpec, get_model_deployment
+from helm.benchmark.tokenizer_config_registry import TokenizerConfig, get_tokenizer_config
 from helm.benchmark.window_services.window_service import WindowService
 from helm.benchmark.window_services.tokenizer_service import TokenizerService
-from helm.proxy.clients.remote_model_registry import get_remote_model
 from helm.common.object_spec import create_object, inject_object_spec_args
 
 
 class WindowServiceFactory:
     @staticmethod
-    def get_window_service(model_name: str, service: TokenizerService) -> WindowService:
+    def get_window_service(model_deployment_name: str, service: TokenizerService) -> WindowService:
         """
         Returns a `WindowService` given the name of the model.
         Make sure this function returns instantaneously on repeated calls.
         """
-        model: Model = get_model(model_name)
-        organization: str = model.organization
-        engine: str = model.engine
-
-        window_service: WindowService
-
-        # TODO: Migrate all window services to use use model deployments
-        model_deployment = get_model_deployment(model_name)
+        model_deployment: Optional[ModelDeployment] = get_model_deployment(model_deployment_name)
         if model_deployment:
             # If the model deployment specifies a WindowServiceSpec, instantiate it.
             window_service_spec: WindowServiceSpec
@@ -45,6 +24,16 @@ class WindowServiceFactory:
             window_service_spec = WindowServiceSpec(
                 class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService", args={}
             )
+
+            # If provided, look up special tokens from TokenizerConfig.
+            end_of_text_token: Optional[str] = None
+            prefix_token: Optional[str] = None
+            if model_deployment.tokenizer_name:
+                tokenizer_config: Optional[TokenizerConfig] = get_tokenizer_config(model_deployment.tokenizer_name)
+                if tokenizer_config:
+                    end_of_text_token = tokenizer_config.end_of_text_token
+                    prefix_token = tokenizer_config.prefix_token
+
             # Perform dependency injection to fill in remaining arguments.
             # Dependency injection is needed here for these reasons:
             #
@@ -54,253 +43,19 @@ class WindowServiceFactory:
             #     in the users configuration file. Instead, they have to be constructed dynamically at runtime.
             window_service_spec = inject_object_spec_args(
                 window_service_spec,
-                {
+                constant_bindings={
                     "service": service,
                     "tokenizer_name": model_deployment.tokenizer_name,
                     "max_sequence_length": model_deployment.max_sequence_length,
                     "max_request_length": model_deployment.max_request_length,
+                    "max_sequence_and_generated_tokens_length": model_deployment.max_sequence_and_generated_tokens_length,  # noqa
+                    "end_of_text_token": end_of_text_token,
+                    "prefix_token": prefix_token,
+                },
+                provider_bindings={
+                    "gpt2_window_service": lambda: WindowServiceFactory.get_window_service("huggingface/gpt2", service)
                 },
             )
-            window_service = create_object(window_service_spec)
-        elif get_remote_model(model_name):
-            window_service = get_remote_window_service(service, model_name)
-        elif organization == "neurips":
-            from helm.benchmark.window_services.http_model_window_service import HTTPModelWindowServce
-
-            window_service = HTTPModelWindowServce(service)
-        elif organization == "openai":
-            from helm.benchmark.window_services.openai_window_service import OpenAIWindowService
-            from helm.benchmark.window_services.wider_openai_window_service import (
-                WiderOpenAIWindowService,
-                GPTTurboWindowService,
-                GPTTurbo16KWindowService,
-                GPT4WindowService,
-                GPT432KWindowService,
-            )
-
-            if model_name in get_model_names_with_tag(GPT4_CONTEXT_WINDOW_TAG):
-                window_service = GPT4WindowService(service)
-            elif model_name in get_model_names_with_tag(GPT4_32K_CONTEXT_WINDOW_TAG):
-                window_service = GPT432KWindowService(service)
-            if model_name in get_model_names_with_tag(GPT_TURBO_CONTEXT_WINDOW_TAG):
-                window_service = GPTTurboWindowService(service)
-            elif model_name in get_model_names_with_tag(GPT_TURBO_16K_CONTEXT_WINDOW_TAG):
-                window_service = GPTTurbo16KWindowService(service)
-            elif model_name in get_model_names_with_tag(WIDER_CONTEXT_WINDOW_TAG):
-                window_service = WiderOpenAIWindowService(service)
-            else:
-                window_service = OpenAIWindowService(service)
-        # For the Google models, we approximate with the OpenAIWindowService
-        elif organization == "simple" or organization == "google":
-            from helm.benchmark.window_services.openai_window_service import OpenAIWindowService
-
-            window_service = OpenAIWindowService(service)
-        elif organization == "AlephAlpha":
-            from helm.benchmark.window_services.luminous_window_service import (
-                LuminousBaseWindowService,
-                LuminousExtendedWindowService,
-                LuminousSupremeWindowService,
-                LuminousWorldWindowService,
-            )
-
-            if engine == "luminous-base":
-                window_service = LuminousBaseWindowService(service)
-            elif engine == "luminous-extended":
-                window_service = LuminousExtendedWindowService(service)
-            elif engine == "luminous-supreme":
-                window_service = LuminousSupremeWindowService(service)
-            elif engine == "luminous-world":
-                window_service = LuminousWorldWindowService(service)
-            else:
-                raise ValueError(f"Unhandled Aleph Alpha model: {engine}")
-        elif organization == "microsoft":
-            from helm.benchmark.window_services.mt_nlg_window_service import MTNLGWindowService
-
-            window_service = MTNLGWindowService(service)
-        elif organization == "anthropic":
-            from helm.benchmark.window_services.anthropic_window_service import (
-                AnthropicWindowService,
-                LegacyAnthropicWindowService,
-            )
-
-            if engine == "stanford-online-all-v4-s3":
-                window_service = LegacyAnthropicWindowService(service)
-            else:
-                window_service = AnthropicWindowService(service)
-        elif organization == "writer":
-            from helm.benchmark.window_services.palmyra_window_service import (
-                PalmyraWindowService,
-                LongerPalmyraWindowService,
-            )
-
-            if engine in ["palmyra-base", "palmyra-large", "palmyra-instruct-30", "palmyra-e"]:
-                window_service = PalmyraWindowService(service)
-            elif engine in ["palmyra-x", "silk-road"]:
-                window_service = LongerPalmyraWindowService(service)
-            else:
-                raise ValueError(f"Unhandled Writer model: {engine}")
-        elif engine == "santacoder":
-            from helm.benchmark.window_services.santacoder_window_service import SantaCoderWindowService
-
-            window_service = SantaCoderWindowService(service)
-        elif engine == "starcoder":
-            from helm.benchmark.window_services.starcoder_window_service import StarCoderWindowService
-
-            window_service = StarCoderWindowService(service)
-        elif model_name == "huggingface/gpt2":
-            window_service = GPT2WindowService(service)
-        elif model_name == "together/bloom":
-            from helm.benchmark.window_services.bloom_window_service import BloomWindowService
-
-            window_service = BloomWindowService(service)
-        elif model_name == "together/glm":
-            # From https://github.com/THUDM/GLM-130B, "the tokenizer is implemented based on
-            # icetk---a unified multimodal tokenizer for images, Chinese, and English."
-            from helm.benchmark.window_services.ice_window_service import ICEWindowService
-
-            window_service = ICEWindowService(service)
-        elif model_name in ["huggingface/gpt-j-6b", "together/gpt-j-6b", "together/gpt-jt-6b-v1", "gooseai/gpt-j-6b"]:
-            from helm.benchmark.window_services.gptj_window_service import GPTJWindowService
-
-            window_service = GPTJWindowService(service)
-        elif model_name in [
-            "together/gpt-neox-20b",
-            "gooseai/gpt-neo-20b",
-            "together/gpt-neoxt-chat-base-20b",
-            "together/redpajama-incite-base-3b-v1",
-            "together/redpajama-incite-instruct-3b-v1",
-            "together/redpajama-incite-base-7b",
-            "together/redpajama-incite-instruct-7b",
-            # Pythia uses the same tokenizer as GPT-NeoX-20B.
-            # See: https://huggingface.co/EleutherAI/pythia-6.9b#training-procedure
-            "eleutherai/pythia-1b-v0",
-            "eleutherai/pythia-2.8b-v0",
-            "eleutherai/pythia-6.9b",
-            "eleutherai/pythia-12b-v0",
-            # MPT-7B model was trained with the EleutherAI/gpt-neox-20b tokenizer
-            # See: https://huggingface.co/mosaicml/mpt-7b
-            "mosaicml/mpt-7b",
-            "mosaicml/mpt-instruct-7b",
-            "mosaicml/mpt-30b",
-            "mosaicml/mpt-instruct-30b",
-            # Dolly models are based on Pythia.
-            # See: https://github.com/databrickslabs/dolly
-            "databricks/dolly-v2-3b",
-            "databricks/dolly-v2-7b",
-            "databricks/dolly-v2-12b",
-        ]:
-            from helm.benchmark.window_services.gptneox_window_service import GPTNeoXWindowService
-
-            window_service = GPTNeoXWindowService(service)
-        elif model_name in [
-            "tiiuae/falcon-7b",
-            "tiiuae/falcon-7b-instruct",
-            "tiiuae/falcon-40b",
-            "tiiuae/falcon-40b-instruct",
-        ]:
-            window_service = HuggingFaceWindowService(service=service, tokenizer_name="tiiuae/falcon-7b")
-        elif model_name in [
-            "stabilityai/stablelm-base-alpha-3b",
-            "stabilityai/stablelm-base-alpha-7b",
-        ]:
-            from helm.benchmark.window_services.gptneox_window_service import StableLMAlphaWindowService
-
-            window_service = StableLMAlphaWindowService(service)
-        elif model_name == "together/h3-2.7b":
-            window_service = GPT2WindowService(service)
-        elif model_name in [
-            "together/opt-1.3b",
-            "together/opt-6.7b",
-            "together/opt-66b",
-            "together/opt-175b",
-        ]:
-            from helm.benchmark.window_services.opt_window_service import OPTWindowService
-
-            window_service = OPTWindowService(service)
-        elif model_name == "together/t0pp":
-            from helm.benchmark.window_services.t0pp_window_service import T0ppWindowService
-
-            window_service = T0ppWindowService(service)
-        elif model_name == "together/t5-11b":
-            from helm.benchmark.window_services.t511b_window_service import T511bWindowService
-
-            window_service = T511bWindowService(service)
-        elif model_name == "together/flan-t5-xxl":
-            from helm.benchmark.window_services.flan_t5_window_service import FlanT5WindowService
-
-            window_service = FlanT5WindowService(service)
-        elif model_name == "together/ul2":
-            from helm.benchmark.window_services.ul2_window_service import UL2WindowService
-
-            window_service = UL2WindowService(service)
-        elif model_name == "together/yalm":
-            from helm.benchmark.window_services.yalm_window_service import YaLMWindowService
-
-            window_service = YaLMWindowService(service)
-        elif model_name == "nvidia/megatron-gpt2":
-            from helm.benchmark.window_services.megatron_window_service import MegatronWindowService
-
-            window_service = MegatronWindowService(service)
-        elif model_name in [
-            "lmsys/vicuna-7b-v1.3",
-            "lmsys/vicuna-13b-v1.3",
-            "meta/llama-7b",
-            "meta/llama-13b",
-            "meta/llama-30b",
-            "meta/llama-65b",
-            "stanford/alpaca-7b",
-        ]:
-            from helm.benchmark.window_services.llama_window_service import LlamaWindowService
-
-            window_service = LlamaWindowService(service)
-        elif model_name in [
-            "meta/llama-2-7b",
-            "meta/llama-2-13b",
-            "meta/llama-2-70b",
-        ]:
-            from helm.benchmark.window_services.llama_window_service import Llama2WindowService
-
-            window_service = Llama2WindowService(service)
-        elif organization == "cohere":
-            from helm.benchmark.window_services.cohere_window_service import (
-                CohereWindowService,
-                CohereCommandWindowService,
-            )
-
-            if "command" in engine:
-                window_service = CohereCommandWindowService(service)
-            else:
-                window_service = CohereWindowService(service)
-        elif organization == "ai21":
-            from helm.benchmark.window_services.wider_ai21_window_service import (
-                WiderAI21WindowService,
-                AI21Jurassic2JumboWindowService,
-            )
-            from helm.benchmark.window_services.ai21_window_service import AI21WindowService
-
-            if model_name in get_model_names_with_tag(AI21_WIDER_CONTEXT_WINDOW_TAG):
-                window_service = WiderAI21WindowService(service=service, gpt2_window_service=GPT2WindowService(service))
-            if model_name in get_model_names_with_tag(AI21_JURASSIC_2_JUMBO_CONTEXT_WINDOW_TAG):
-                window_service = AI21Jurassic2JumboWindowService(
-                    service=service, gpt2_window_service=GPT2WindowService(service)
-                )
-            else:
-                window_service = AI21WindowService(service=service, gpt2_window_service=GPT2WindowService(service))
-
-        elif organization == "lightningai":
-            from helm.benchmark.window_services.lit_gpt_window_service import LitGPTWindowServce
-
-            window_service = LitGPTWindowServce(service)
-        elif organization == "mistralai":
-            window_service = HuggingFaceWindowService(service, tokenizer_name="mistralai/Mistral-7B-v0.1")
-        elif model_name in [
-            "HuggingFaceM4/idefics-9b",
-            "HuggingFaceM4/idefics-9b-instruct",
-            "HuggingFaceM4/idefics-80b",
-            "HuggingFaceM4/idefics-80b-instruct",
-        ]:
-            window_service = HuggingFaceWindowService(service, model_name)
-        else:
-            raise ValueError(f"Unhandled model name: {model_name}")
+            return create_object(window_service_spec)
 
-        return window_service
+        raise ValueError(f"Unhandled model deployment name: {model_deployment_name}")
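Note: the refactored factory above resolves everything from the model deployment and tokenizer registries instead of the removed per-organization if/elif chain. A minimal usage sketch follows; only the get_window_service signature and the deployment name "huggingface/gpt2" come from the diff, the rest is illustrative:

    from helm.benchmark.window_services.tokenizer_service import TokenizerService
    from helm.benchmark.window_services.window_service_factory import WindowServiceFactory

    def build_window_service(service: TokenizerService):
        # The deployment name is resolved via get_model_deployment(); special tokens
        # (end_of_text_token, prefix_token) now come from the TokenizerConfig registry
        # rather than per-model WindowService subclasses.
        return WindowServiceFactory.get_window_service("huggingface/gpt2", service)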
helm/benchmark/window_services/yalm_window_service.py
@@ -1,34 +1,7 @@
-from helm.proxy.tokenizers.yalm_tokenizer_data.yalm_tokenizer import YaLMTokenizer
 from .local_window_service import LocalWindowService
-from .tokenizer_service import TokenizerService
 
 
 class YaLMWindowService(LocalWindowService):
-    def __init__(self, service: TokenizerService):
-        super().__init__(service)
-
-    @property
-    def tokenizer_name(self) -> str:
-        return "Yandex/yalm"
-
-    @property
-    def max_sequence_length(self) -> int:
-        return YaLMTokenizer.MAX_SEQUENCE_LENGTH
-
-    @property
-    def max_request_length(self) -> int:
-        return self.max_sequence_length + 1
-
-    @property
-    def end_of_text_token(self) -> str:
-        """The end of text token."""
-        return YaLMTokenizer.EOS_TOKEN
-
-    @property
-    def prefix_token(self) -> str:
-        """The prefix token"""
-        return self.end_of_text_token
-
     def truncate_from_right(self, text: str, expected_completion_token_length: int = 0) -> str:
         """
         Truncates text from the right to fit within the context window given by `max_request_length`
helm/clients/__init__.py: file without content changes
helm/clients/ai21_client.py
@@ -7,10 +7,9 @@ from helm.common.request import (
     EMBEDDING_UNAVAILABLE_REQUEST_RESULT,
     Request,
     RequestResult,
-    Sequence,
+    GeneratedOutput,
     Token,
 )
-from helm.proxy.tokenizers.tokenizer import Tokenizer
 from .client import CachingClient, truncate_sequence, cleanup_str
 from .ai21_utils import AI21RequestError, handle_failed_request
 
@@ -24,8 +23,8 @@ class AI21Client(CachingClient):
     COMPLETION_URL_TEMPLATE: str = "https://api.ai21.com/studio/v1/{model}/complete"
     EXPERIMENTAL_COMPLETION_URL_TEMPLATE: str = "https://api.ai21.com/studio/v1/experimental/{model}/complete"
 
-    def __init__(self, api_key: str, tokenizer: Tokenizer, cache_config: CacheConfig, url: Optional[str] = None):
-        super().__init__(cache_config=cache_config, tokenizer=tokenizer)
+    def __init__(self, api_key: str, cache_config: CacheConfig, url: Optional[str] = None):
+        super().__init__(cache_config=cache_config)
         self.api_key = api_key
         self.url = url
 
@@ -98,25 +97,19 @@ class AI21Client(CachingClient):
            # Compute the actual length of the token text
            # e.g. "▁burying"(0,8) -> 8 - 0 = 8; "▁burying"(0,7) -> 7 - 0 = 7
            text_length: int = raw["textRange"]["end"] - raw["textRange"]["start"]
-           # "topTokens" can be None when sending a request with topKReturn=0
-           # AI21 sends unscaled logprobs as `raw_logprob` so use this instead of `logprob`.
-           top_logprobs: Dict[str, float] = dict(
-               (fix_text(x["token"], first), x["raw_logprob"]) for x in raw["topTokens"] or []
-           )
 
            return Token(
                # Text should not be longer than text_length. Since "▁" is always inserted
                # in the beginning, we truncate the text from the right.
                text=fix_text(raw["generatedToken"]["token"], first)[-text_length:] if text_length else "",
                logprob=raw["generatedToken"]["raw_logprob"],
-               top_logprobs=top_logprobs,
            )
 
-       def parse_sequence(raw: Dict, first: bool, finish_reason: Optional[Dict] = None) -> Sequence:
+       def parse_sequence(raw: Dict, first: bool, finish_reason: Optional[Dict] = None) -> GeneratedOutput:
            text = raw["text"]
            tokens = [parse_token(token, first and i == 0) for i, token in enumerate(raw["tokens"])]
            logprob = sum(token.logprob for token in tokens)
-           return Sequence(text=text, logprob=logprob, tokens=tokens, finish_reason=finish_reason)
+           return GeneratedOutput(text=text, logprob=logprob, tokens=tokens, finish_reason=finish_reason)
 
        prompt = parse_sequence(response["prompt"], True)
        completions = []
helm/clients/aleph_alpha_client.py
@@ -0,0 +1,112 @@
+from typing import List
+
+from helm.common.cache import CacheConfig
+from helm.common.media_object import TEXT_TYPE
+from helm.common.optional_dependencies import handle_module_not_found_error
+from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput, Token
+from .client import CachingClient, truncate_sequence, generate_uid_for_multimodal_prompt
+
+try:
+    from aleph_alpha_client import Client, CompletionRequest, CompletionResponse, Image, Prompt
+except ModuleNotFoundError as e:
+    handle_module_not_found_error(e, ["aleph-alpha"])
+
+
+class AlephAlphaClient(CachingClient):
+    def __init__(self, api_key: str, cache_config: CacheConfig):
+        super().__init__(cache_config=cache_config)
+        self._api_key: str = api_key
+        self._aleph_alpha_client = Client(token=self._api_key) if self._api_key else None
+
+    def make_request(self, request: Request) -> RequestResult:
+        """Make a request following https://docs.aleph-alpha.com/api/complete."""
+        assert self._aleph_alpha_client is not None
+
+        model: str = request.model_engine
+        prompt: Prompt
+
+        # The prompt key is a unique identifier for the prompt
+        prompt_key: str = request.prompt
+
+        # Contents can either be text or a list of multimodal content made up of text, images or other content
+        if request.multimodal_prompt is not None:
+            from helm.common.images_utils import encode_base64
+
+            items = []
+            for media_object in request.multimodal_prompt.media_objects:
+                if media_object.is_type("image") and media_object.location:
+                    items.append(Image(base_64=encode_base64(media_object.location), cropping=None, controls=[]))
+                elif media_object.is_type(TEXT_TYPE):
+                    if media_object.text is None:
+                        raise ValueError("MediaObject of text type has missing text field value")
+                    items.append(media_object.text)
+                else:
+                    raise ValueError(f"Unrecognized MediaObject type {media_object.type}")
+
+            prompt = Prompt(items=items)
+            prompt_key = generate_uid_for_multimodal_prompt(request.multimodal_prompt)
+        else:
+            prompt = Prompt.from_text(request.prompt)
+
+        parameters = {
+            "maximum_tokens": request.max_tokens,
+            "temperature": request.temperature,
+            "top_k": request.top_k_per_token,
+            "top_p": request.top_p,
+            "presence_penalty": request.presence_penalty,
+            "frequency_penalty": request.frequency_penalty,
+            "n": request.num_completions,
+            "stop_sequences": request.stop_sequences,
+            "log_probs": request.top_k_per_token,
+            "echo": request.echo_prompt,
+            "tokens": True,  # Setting to True returns individual tokens of the completion
+        }
+
+        try:
+
+            def do_it():
+                assert self._aleph_alpha_client is not None
+                completion_response: CompletionResponse = self._aleph_alpha_client.complete(
+                    request=CompletionRequest(prompt=prompt, **parameters), model=model
+                )
+                result = dict(completion_response.to_json())
+                assert "completions" in result, f"Invalid response: {result}"
+                return result
+
+            cache_key = CachingClient.make_cache_key({"model": model, "prompt": prompt_key, **parameters}, request)
+            response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
+        except Exception as e:
+            error: str = f"AlephAlphaClient error: {e}"
+            return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
+
+        completions: List[GeneratedOutput] = []
+        for completion in response["completions"]:
+            sequence_logprob: float = 0
+            tokens: List[Token] = []
+
+            # `completion_tokens` is the list of selected tokens.
+            for i, token in enumerate(completion.get("completion_tokens", [])):
+                # Use the selected token value to get the logprob
+                logprob: float = completion["log_probs"][i][token]
+                sequence_logprob += logprob
+                tokens.append(
+                    Token(
+                        text=token,
+                        logprob=logprob,
+                    )
+                )
+
+            sequence: GeneratedOutput = GeneratedOutput(
+                text=completion["completion"], logprob=sequence_logprob, tokens=tokens
+            )
+            sequence = truncate_sequence(sequence, request)
+            completions.append(sequence)
+
+        return RequestResult(
+            success=True,
+            cached=cached,
+            request_time=response["request_time"],
+            request_datetime=response["request_datetime"],
+            completions=completions,
+            embedding=[],
+        )
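Note: a hedged sketch of wiring up the new AlephAlphaClient added above; the cache configuration class below is an assumption and not part of this diff:

    from helm.common.cache import BlackHoleCacheConfig  # assumed no-op cache config in helm.common.cache
    from helm.clients.aleph_alpha_client import AlephAlphaClient

    client = AlephAlphaClient(api_key="YOUR_ALEPH_ALPHA_API_KEY", cache_config=BlackHoleCacheConfig())
    # client.make_request(request) follows https://docs.aleph-alpha.com/api/complete and returns a
    # RequestResult whose completions are GeneratedOutput objects (see the hunks above).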