crfm-helm 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +144 -36
  2. crfm_helm-0.5.0.dist-info/RECORD +642 -0
  3. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +37 -2
  5. helm/benchmark/adaptation/adapters/adapter.py +4 -42
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/generation_adapter.py +2 -0
  9. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +21 -4
  10. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +12 -5
  11. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +1 -0
  12. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  13. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  14. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +5 -1
  15. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +1 -0
  16. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -0
  17. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  18. helm/benchmark/adaptation/adapters/test_generation_adapter.py +59 -14
  19. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +40 -5
  20. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +78 -10
  21. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  22. helm/benchmark/adaptation/prompt.py +7 -1
  23. helm/benchmark/adaptation/request_state.py +6 -1
  24. helm/benchmark/adaptation/scenario_state.py +6 -2
  25. helm/benchmark/annotation/annotator.py +43 -0
  26. helm/benchmark/annotation/annotator_factory.py +61 -0
  27. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  28. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  29. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  30. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  31. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  32. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  33. helm/benchmark/annotation_executor.py +124 -0
  34. helm/benchmark/augmentations/cleva_perturbation.py +7 -14
  35. helm/benchmark/augmentations/contraction_expansion_perturbation.py +3 -3
  36. helm/benchmark/augmentations/contrast_sets_perturbation.py +0 -3
  37. helm/benchmark/augmentations/data_augmenter.py +0 -2
  38. helm/benchmark/augmentations/dialect_perturbation.py +2 -2
  39. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  40. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  41. helm/benchmark/augmentations/gender_perturbation.py +3 -3
  42. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  43. helm/benchmark/augmentations/mild_mix_perturbation.py +2 -2
  44. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  45. helm/benchmark/augmentations/person_name_perturbation.py +0 -7
  46. helm/benchmark/augmentations/perturbation.py +20 -7
  47. helm/benchmark/augmentations/perturbation_description.py +1 -1
  48. helm/benchmark/augmentations/space_perturbation.py +2 -2
  49. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  50. helm/benchmark/augmentations/synonym_perturbation.py +2 -2
  51. helm/benchmark/augmentations/test_perturbation.py +11 -7
  52. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  53. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  54. helm/benchmark/config_registry.py +38 -0
  55. helm/benchmark/executor.py +46 -16
  56. helm/benchmark/huggingface_registration.py +37 -7
  57. helm/benchmark/metrics/basic_metrics.py +172 -641
  58. helm/benchmark/metrics/bbq_metrics.py +3 -4
  59. helm/benchmark/metrics/bias_metrics.py +6 -6
  60. helm/benchmark/metrics/classification_metrics.py +11 -8
  61. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  62. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  63. helm/benchmark/metrics/code_metrics.py +4 -3
  64. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  65. helm/benchmark/metrics/common_metric_specs.py +167 -0
  66. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  67. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  68. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  69. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  70. helm/benchmark/metrics/disinformation_metrics.py +6 -112
  71. helm/benchmark/metrics/dry_run_metrics.py +5 -3
  72. helm/benchmark/metrics/efficiency_metrics.py +206 -0
  73. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  74. helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
  75. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  76. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  77. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  78. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  79. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  80. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  81. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  82. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  83. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  84. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  85. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  86. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  87. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  88. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  89. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  90. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  91. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  92. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  93. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  94. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  95. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  96. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  97. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  98. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  99. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  100. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  101. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  102. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  103. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  104. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  105. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  106. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  107. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  108. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  109. helm/benchmark/metrics/machine_translation_metrics.py +5 -5
  110. helm/benchmark/metrics/metric.py +93 -172
  111. helm/benchmark/metrics/metric_name.py +0 -1
  112. helm/benchmark/metrics/metric_service.py +16 -0
  113. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  114. helm/benchmark/metrics/ranking_metrics.py +6 -7
  115. helm/benchmark/metrics/reference_metric.py +148 -0
  116. helm/benchmark/metrics/summac/model_summac.py +0 -2
  117. helm/benchmark/metrics/summarization_metrics.py +8 -8
  118. helm/benchmark/metrics/test_classification_metrics.py +9 -6
  119. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  120. helm/benchmark/metrics/test_evaluate_reference_metrics.py +30 -0
  121. helm/benchmark/metrics/test_metric.py +2 -2
  122. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +1 -1
  123. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +13 -3
  124. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  125. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -0
  126. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +9 -2
  127. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  128. helm/benchmark/metrics/toxicity_utils.py +23 -0
  129. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  130. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  131. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  132. helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
  133. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  134. helm/benchmark/model_deployment_registry.py +164 -41
  135. helm/benchmark/model_metadata_registry.py +181 -35
  136. helm/benchmark/multi_gpu_runner.py +133 -0
  137. helm/benchmark/presentation/contamination.py +3 -3
  138. helm/benchmark/presentation/create_plots.py +8 -7
  139. helm/benchmark/presentation/run_display.py +50 -17
  140. helm/benchmark/presentation/schema.py +28 -46
  141. helm/benchmark/presentation/summarize.py +213 -96
  142. helm/benchmark/presentation/table.py +8 -8
  143. helm/benchmark/presentation/test_contamination.py +2 -2
  144. helm/benchmark/presentation/test_run_entry.py +14 -9
  145. helm/benchmark/presentation/test_summarize.py +5 -0
  146. helm/benchmark/run.py +66 -54
  147. helm/benchmark/run_expander.py +342 -31
  148. helm/benchmark/run_spec.py +93 -0
  149. helm/benchmark/run_spec_factory.py +162 -0
  150. helm/benchmark/run_specs/__init__.py +0 -0
  151. helm/benchmark/{run_specs.py → run_specs/classic_run_specs.py} +217 -1330
  152. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  153. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  154. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  155. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  156. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  157. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  158. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  159. helm/benchmark/run_specs/vlm_run_specs.py +501 -0
  160. helm/benchmark/runner.py +116 -69
  161. helm/benchmark/runner_config_registry.py +21 -0
  162. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  163. helm/benchmark/scenarios/bold_scenario.py +2 -2
  164. helm/benchmark/scenarios/cleva_scenario.py +43 -46
  165. helm/benchmark/scenarios/code_scenario.py +3 -2
  166. helm/benchmark/scenarios/commonsense_scenario.py +171 -191
  167. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  168. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  169. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  170. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  171. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  172. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  173. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  174. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  175. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  176. helm/benchmark/scenarios/entity_matching_scenario.py +1 -1
  177. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  178. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  179. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  180. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  181. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  182. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  183. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  184. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  185. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  186. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  187. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  188. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  189. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  190. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  191. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  192. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  193. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  194. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  195. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  196. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  197. helm/benchmark/scenarios/legalbench_scenario.py +123 -0
  198. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  199. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  200. helm/benchmark/scenarios/lsat_qa_scenario.py +4 -2
  201. helm/benchmark/scenarios/math_scenario.py +19 -2
  202. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  203. helm/benchmark/scenarios/numeracy_scenario.py +3 -3
  204. helm/benchmark/scenarios/opinions_qa_scenario.py +6 -10
  205. helm/benchmark/scenarios/raft_scenario.py +2 -6
  206. helm/benchmark/scenarios/scenario.py +14 -2
  207. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  208. helm/benchmark/scenarios/test_math_scenario.py +22 -0
  209. helm/benchmark/scenarios/test_scenario.py +6 -3
  210. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  211. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  212. helm/benchmark/scenarios/the_pile_scenario.py +6 -7
  213. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  214. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  215. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  216. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  217. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
  218. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  219. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  220. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  221. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  222. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  223. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  224. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  225. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  226. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  227. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  228. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  229. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  230. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  231. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  232. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  233. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  234. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  235. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  236. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  237. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +107 -0
  238. helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
  239. helm/benchmark/scenarios/wmt_14_scenario.py +18 -18
  240. helm/benchmark/server.py +59 -2
  241. helm/benchmark/slurm_jobs.py +12 -0
  242. helm/benchmark/slurm_runner.py +79 -51
  243. helm/benchmark/static/benchmarking.js +3 -4
  244. helm/benchmark/static/contamination.yaml +1 -1
  245. helm/benchmark/static/images/organizations/together.png +0 -0
  246. helm/benchmark/static/json-urls.js +4 -0
  247. helm/benchmark/static/{schema.yaml → schema_classic.yaml} +346 -930
  248. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  249. helm/benchmark/static/schema_lite.yaml +824 -0
  250. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  251. helm/benchmark/static/schema_unitxt.yaml +428 -0
  252. helm/benchmark/static/schema_vlm.yaml +576 -0
  253. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  254. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  255. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  256. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  257. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  258. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  259. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  260. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  261. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  262. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  263. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  264. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  265. helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
  266. helm/benchmark/static_build/assets/index-d839df55.js +9 -0
  267. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  268. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  269. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  270. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  271. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  272. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  273. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  274. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  275. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  276. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  277. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  278. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  279. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  280. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  281. helm/benchmark/static_build/config.js +4 -0
  282. helm/benchmark/static_build/index.html +20 -0
  283. helm/benchmark/test_data_preprocessor.py +3 -3
  284. helm/benchmark/test_model_deployment_definition.py +90 -0
  285. helm/benchmark/test_run_expander.py +1 -1
  286. helm/benchmark/tokenizer_config_registry.py +10 -14
  287. helm/benchmark/window_services/ai21_window_service.py +22 -33
  288. helm/benchmark/window_services/cohere_window_service.py +1 -63
  289. helm/benchmark/window_services/default_window_service.py +2 -35
  290. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  291. helm/benchmark/window_services/ice_window_service.py +0 -34
  292. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  293. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  294. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  295. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  296. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  297. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  298. helm/benchmark/window_services/local_window_service.py +21 -4
  299. helm/benchmark/window_services/no_decoding_window_service.py +32 -0
  300. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  301. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  302. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  303. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  304. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  305. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  306. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  307. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  308. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  309. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  310. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  311. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  312. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  313. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  314. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  315. helm/benchmark/window_services/test_utils.py +3 -2
  316. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  317. helm/benchmark/window_services/window_service.py +42 -0
  318. helm/benchmark/window_services/window_service_factory.py +24 -269
  319. helm/benchmark/window_services/yalm_window_service.py +0 -27
  320. helm/clients/__init__.py +0 -0
  321. helm/{proxy/clients → clients}/ai21_client.py +5 -12
  322. helm/clients/aleph_alpha_client.py +112 -0
  323. helm/{proxy/clients → clients}/anthropic_client.py +213 -24
  324. helm/clients/auto_client.py +215 -0
  325. helm/clients/bedrock_client.py +128 -0
  326. helm/clients/bedrock_utils.py +72 -0
  327. helm/{proxy/clients → clients}/client.py +67 -55
  328. helm/clients/clip_score_client.py +49 -0
  329. helm/clients/clip_scorers/__init__.py +0 -0
  330. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  331. helm/clients/clip_scorers/clip_scorer.py +50 -0
  332. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  333. helm/{proxy/clients → clients}/cohere_client.py +6 -17
  334. helm/clients/gcs_client.py +82 -0
  335. helm/{proxy/clients → clients}/google_client.py +7 -8
  336. helm/clients/google_translate_client.py +35 -0
  337. helm/{proxy/clients → clients}/http_model_client.py +6 -10
  338. helm/{proxy/clients → clients}/huggingface_client.py +134 -92
  339. helm/clients/image_generation/__init__.py +0 -0
  340. helm/clients/image_generation/adobe_vision_client.py +78 -0
  341. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  342. helm/clients/image_generation/cogview2/__init__.py +0 -0
  343. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  344. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  345. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  346. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  347. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  348. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  349. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  350. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  351. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  352. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  353. helm/clients/image_generation/cogview2_client.py +191 -0
  354. helm/clients/image_generation/dalle2_client.py +192 -0
  355. helm/clients/image_generation/dalle3_client.py +108 -0
  356. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  357. helm/clients/image_generation/dalle_mini/data.py +442 -0
  358. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  359. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  360. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  361. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  362. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  363. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  364. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  365. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  366. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  367. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  368. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  369. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  370. helm/clients/image_generation/dalle_mini_client.py +190 -0
  371. helm/clients/image_generation/deep_floyd_client.py +78 -0
  372. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  373. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  374. helm/clients/image_generation/lexica_client.py +86 -0
  375. helm/clients/image_generation/mindalle/__init__.py +0 -0
  376. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  377. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  378. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  379. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  380. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  381. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  382. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  383. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  384. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  385. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  386. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  387. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  388. helm/clients/image_generation/mindalle_client.py +115 -0
  389. helm/clients/image_generation/nudity_check_client.py +64 -0
  390. helm/clients/image_generation/together_image_generation_client.py +111 -0
  391. helm/{proxy/clients → clients}/lit_gpt_client.py +7 -5
  392. helm/{proxy/clients → clients}/megatron_client.py +13 -7
  393. helm/clients/mistral_client.py +134 -0
  394. helm/clients/moderation_api_client.py +109 -0
  395. helm/clients/open_lm_client.py +43 -0
  396. helm/clients/openai_client.py +302 -0
  397. helm/{proxy/clients → clients}/palmyra_client.py +15 -12
  398. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  399. helm/clients/simple_client.py +64 -0
  400. helm/{proxy/clients → clients}/test_auto_client.py +15 -15
  401. helm/clients/test_client.py +100 -0
  402. helm/clients/test_huggingface_client.py +70 -0
  403. helm/clients/test_simple_client.py +19 -0
  404. helm/{proxy/clients → clients}/test_together_client.py +23 -12
  405. helm/{proxy/clients → clients}/together_client.py +18 -71
  406. helm/clients/vertexai_client.py +391 -0
  407. helm/clients/vision_language/__init__.py +0 -0
  408. helm/clients/vision_language/huggingface_vlm_client.py +104 -0
  409. helm/{proxy/clients → clients}/vision_language/idefics_client.py +59 -52
  410. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  411. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  412. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  413. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  414. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  415. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  416. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  417. helm/clients/vision_language/open_flamingo_client.py +155 -0
  418. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  419. helm/clients/vllm_client.py +46 -0
  420. helm/common/cache.py +24 -179
  421. helm/common/cache_backend_config.py +47 -0
  422. helm/common/clip_score_request.py +41 -0
  423. helm/common/concurrency.py +32 -0
  424. helm/common/credentials_utils.py +28 -0
  425. helm/common/file_caches/__init__.py +0 -0
  426. helm/common/file_caches/file_cache.py +16 -0
  427. helm/common/file_caches/local_file_cache.py +61 -0
  428. helm/common/file_caches/test_local_file_cache.py +25 -0
  429. helm/common/file_upload_request.py +27 -0
  430. helm/common/general.py +29 -10
  431. helm/common/image_generation_parameters.py +25 -0
  432. helm/common/images_utils.py +24 -1
  433. helm/common/key_value_store.py +113 -0
  434. helm/common/media_object.py +13 -0
  435. helm/common/moderations_api_request.py +71 -0
  436. helm/common/mongo_key_value_store.py +88 -0
  437. helm/common/multimodal_request_utils.py +31 -0
  438. helm/common/nudity_check_request.py +29 -0
  439. helm/common/object_spec.py +2 -2
  440. helm/common/request.py +36 -27
  441. helm/common/test_general.py +6 -0
  442. helm/common/tokenization_request.py +6 -3
  443. helm/config/__init__.py +0 -0
  444. helm/config/model_deployments.yaml +1942 -0
  445. helm/config/model_metadata.yaml +2201 -0
  446. helm/config/tokenizer_configs.yaml +362 -0
  447. helm/proxy/accounts.py +31 -4
  448. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  449. helm/proxy/critique/model_critique_client.py +13 -5
  450. helm/proxy/example_queries.py +29 -17
  451. helm/proxy/retry.py +8 -2
  452. helm/proxy/server.py +77 -5
  453. helm/proxy/services/remote_service.py +31 -0
  454. helm/proxy/services/server_service.py +103 -20
  455. helm/proxy/services/service.py +34 -2
  456. helm/proxy/services/test_remote_service.py +7 -6
  457. helm/proxy/services/test_service.py +27 -18
  458. helm/proxy/test_accounts.py +32 -0
  459. helm/proxy/token_counters/auto_token_counter.py +37 -37
  460. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  461. helm/proxy/token_counters/token_counter.py +3 -5
  462. helm/py.typed +0 -0
  463. helm/tokenizers/__init__.py +0 -0
  464. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  465. helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +3 -1
  466. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +17 -11
  467. helm/tokenizers/auto_tokenizer.py +93 -0
  468. helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +8 -2
  469. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  470. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  471. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +56 -60
  472. helm/tokenizers/simple_tokenizer.py +33 -0
  473. helm/tokenizers/test_anthropic_tokenizer.py +82 -0
  474. helm/tokenizers/test_huggingface_tokenizer.py +136 -0
  475. helm/tokenizers/test_simple_tokenizer.py +33 -0
  476. helm/tokenizers/vertexai_tokenizer.py +97 -0
  477. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  478. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  479. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  480. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  481. crfm_helm-0.3.0.dist-info/RECORD +0 -396
  482. helm/benchmark/vlm_run_specs.py +0 -71
  483. helm/benchmark/window_services/anthropic_window_service.py +0 -68
  484. helm/benchmark/window_services/bloom_window_service.py +0 -35
  485. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  486. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  487. helm/benchmark/window_services/gptj_window_service.py +0 -38
  488. helm/benchmark/window_services/gptneox_window_service.py +0 -41
  489. helm/benchmark/window_services/http_model_window_service.py +0 -28
  490. helm/benchmark/window_services/huggingface_window_service.py +0 -59
  491. helm/benchmark/window_services/lit_gpt_window_service.py +0 -27
  492. helm/benchmark/window_services/llama_window_service.py +0 -28
  493. helm/benchmark/window_services/luminous_window_service.py +0 -67
  494. helm/benchmark/window_services/megatron_window_service.py +0 -10
  495. helm/benchmark/window_services/mt_nlg_window_service.py +0 -27
  496. helm/benchmark/window_services/openai_window_service.py +0 -13
  497. helm/benchmark/window_services/opt_window_service.py +0 -35
  498. helm/benchmark/window_services/palmyra_window_service.py +0 -45
  499. helm/benchmark/window_services/remote_window_service.py +0 -48
  500. helm/benchmark/window_services/santacoder_window_service.py +0 -27
  501. helm/benchmark/window_services/starcoder_window_service.py +0 -27
  502. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  503. helm/benchmark/window_services/t511b_window_service.py +0 -30
  504. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  505. helm/benchmark/window_services/ul2_window_service.py +0 -30
  506. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  507. helm/benchmark/window_services/wider_openai_window_service.py +0 -52
  508. helm/proxy/clients/aleph_alpha_client.py +0 -99
  509. helm/proxy/clients/auto_client.py +0 -461
  510. helm/proxy/clients/goose_ai_client.py +0 -100
  511. helm/proxy/clients/microsoft_client.py +0 -182
  512. helm/proxy/clients/openai_client.py +0 -206
  513. helm/proxy/clients/remote_model_registry.py +0 -28
  514. helm/proxy/clients/simple_client.py +0 -61
  515. helm/proxy/clients/test_anthropic_client.py +0 -63
  516. helm/proxy/clients/test_client.py +0 -31
  517. helm/proxy/clients/test_huggingface_client.py +0 -87
  518. helm/proxy/models.py +0 -963
  519. helm/proxy/test_models.py +0 -27
  520. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  521. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  522. helm/proxy/token_counters/free_token_counter.py +0 -12
  523. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  524. helm/proxy/token_counters/openai_token_counter.py +0 -22
  525. helm/proxy/token_counters/test_ai21_token_counter.py +0 -86
  526. helm/proxy/token_counters/test_openai_token_counter.py +0 -79
  527. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  528. helm/proxy/tokenizers/test_huggingface_tokenizer.py +0 -56
  529. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
  530. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
  531. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
  532. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  533. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  534. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  535. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  536. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  537. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  538. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  539. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  540. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  541. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  542. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  543. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  544. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  545. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  546. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
@@ -0,0 +1,362 @@
1
+ # This file defines all the tokenizers that are supported by the Helm API.
2
+
3
+ # If you want to add a new tokenizer, you can technically do it here, but we recommend
4
+ # doing it in prod_env/tokenizer_configs.yaml instead.
5
+
6
+ # Follow the template of this file to add a new tokenizer. You can copy paste this to get started:
7
+ # # This file contains the tokenizer configs for the private tokenizers
8
+ # tokenizer_configs: [] # Leave empty to disable private tokenizers
9
+
10
+
11
+ tokenizer_configs:
12
+
13
+ - name: simple/tokenizer1
14
+ tokenizer_spec:
15
+ class_name: "helm.tokenizers.simple_tokenizer.SimpleTokenizer"
16
+ end_of_text_token: "</s>"
17
+ prefix_token: "<s>"
18
+
19
+ # AI21
20
+ - name: ai21/j1
21
+ tokenizer_spec:
22
+ class_name: "helm.tokenizers.ai21_tokenizer.AI21Tokenizer"
23
+ end_of_text_token: " "
24
+ prefix_token: ""
25
+
26
+ # AlephAlpha
27
+ - name: AlephAlpha/luminous-base
28
+ tokenizer_spec:
29
+ class_name: "helm.tokenizers.aleph_alpha_tokenizer.AlephAlphaTokenizer"
30
+ end_of_text_token: ""
31
+ prefix_token: ""
32
+ - name: AlephAlpha/luminous-extended
33
+ tokenizer_spec:
34
+ class_name: "helm.tokenizers.aleph_alpha_tokenizer.AlephAlphaTokenizer"
35
+ end_of_text_token: ""
36
+ prefix_token: ""
37
+ - name: AlephAlpha/luminous-supreme
38
+ tokenizer_spec:
39
+ class_name: "helm.tokenizers.aleph_alpha_tokenizer.AlephAlphaTokenizer"
40
+ end_of_text_token: ""
41
+ prefix_token: ""
42
+ - name: AlephAlpha/luminous-world
43
+ tokenizer_spec:
44
+ class_name: "helm.tokenizers.aleph_alpha_tokenizer.AlephAlphaTokenizer"
45
+ end_of_text_token: ""
46
+ prefix_token: ""
47
+
48
+ # Anthropic
49
+ - name: anthropic/claude
50
+ tokenizer_spec:
51
+ class_name: "helm.tokenizers.anthropic_tokenizer.AnthropicTokenizer"
52
+ end_of_text_token: "<|endoftext|>"
53
+ prefix_token: "<|endoftext|>"
54
+
55
+ # Bigcode
56
+ - name: bigcode/santacoder
57
+ tokenizer_spec:
58
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
59
+ end_of_text_token: "<|endoftext|>"
60
+ prefix_token: "<|endoftext|>"
61
+ - name: bigcode/starcoder
62
+ tokenizer_spec:
63
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
64
+ end_of_text_token: "<|endoftext|>"
65
+ prefix_token: "<|endoftext|>"
66
+
67
+ # Bigscience
68
+ - name: bigscience/bloom
69
+ tokenizer_spec:
70
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
71
+ end_of_text_token: "</s>"
72
+ prefix_token: "</s>"
73
+ - name: bigscience/T0pp
74
+ tokenizer_spec:
75
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
76
+ end_of_text_token: "</s>"
77
+ prefix_token: ""
78
+
79
+ # Cohere
80
+ - name: cohere/cohere
81
+ tokenizer_spec:
82
+ class_name: "helm.tokenizers.cohere_tokenizer.CohereTokenizer"
83
+ end_of_text_token: ""
84
+ prefix_token: ":"
85
+
86
+ # Databricks
87
+ - name: databricks/dbrx-instruct
88
+ tokenizer_spec:
89
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
90
+ end_of_text_token: "<|endoftext|>"
91
+ prefix_token: "<|endoftext|>"
92
+
93
+ # DeepSeek
94
+ - name: deepseek-ai/deepseek-llm-67b-chat
95
+ tokenizer_spec:
96
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
97
+ end_of_text_token: "<|end▁of▁sentence|>"
98
+ prefix_token: "<|begin▁of▁sentence|>"
99
+
100
+ # EleutherAI
101
+ - name: EleutherAI/gpt-j-6B
102
+ tokenizer_spec:
103
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
104
+ end_of_text_token: "<|endoftext|>"
105
+ prefix_token: "<|endoftext|>"
106
+ - name: EleutherAI/gpt-neox-20b
107
+ tokenizer_spec:
108
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
109
+ end_of_text_token: "<|endoftext|>"
110
+ prefix_token: "<|endoftext|>"
111
+
112
+ # Facebook
113
+ - name: facebook/opt-66b
114
+ tokenizer_spec:
115
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
116
+ end_of_text_token: "</s>"
117
+ prefix_token: "</s>"
118
+
119
+ # Google
120
+ - name: google/t5-11b
121
+ tokenizer_spec:
122
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
123
+ args:
124
+ pretrained_model_name_or_path: google-t5/t5-11b
125
+ end_of_text_token: "</s>"
126
+ prefix_token: ""
127
+ - name: google/flan-t5-xxl
128
+ tokenizer_spec:
129
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
130
+ end_of_text_token: "</s>"
131
+ prefix_token: ""
132
+ - name: google/ul2
133
+ tokenizer_spec:
134
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
135
+ end_of_text_token: "</s>"
136
+ prefix_token: ""
137
+ - name: google/mt5-base
138
+ tokenizer_spec:
139
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
140
+ end_of_text_token: "</s>"
141
+ prefix_token: ""
142
+ - name: google/text-bison@001
143
+ tokenizer_spec:
144
+ class_name: "helm.tokenizers.vertexai_tokenizer.VertexAITokenizer"
145
+ end_of_text_token: "</s>"
146
+ prefix_token: ""
147
+ - name: google/text-bison@002
148
+ tokenizer_spec:
149
+ class_name: "helm.tokenizers.vertexai_tokenizer.VertexAITokenizer"
150
+ end_of_text_token: "</s>"
151
+ prefix_token: ""
152
+ - name: google/text-unicorn@001
153
+ tokenizer_spec:
154
+ class_name: "helm.tokenizers.vertexai_tokenizer.VertexAITokenizer"
155
+ end_of_text_token: "</s>"
156
+ prefix_token: ""
157
+ - name: google/gemma-2b
158
+ tokenizer_spec:
159
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
160
+ end_of_text_token: "<eos>"
161
+ prefix_token: "<bos>"
162
+
163
+ # Hf-internal-testing
164
+
165
+ # Tokenizer name hf-internal-testing/llama-tokenizer is taken from:
166
+ # https://huggingface.co/docs/transformers/main/en/model_doc/llama#transformers.LlamaTokenizerFast.example
167
+ - name: hf-internal-testing/llama-tokenizer
168
+ tokenizer_spec:
169
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
170
+ end_of_text_token: "</s>"
171
+ prefix_token: "<s>"
172
+
173
+ # HuggingFaceM4
174
+ - name: HuggingFaceM4/idefics-9b
175
+ tokenizer_spec:
176
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
177
+ end_of_text_token: "</s>"
178
+ prefix_token: "<s>"
179
+ - name: HuggingFaceM4/idefics-9b-instruct
180
+ tokenizer_spec:
181
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
182
+ end_of_text_token: "</s>"
183
+ prefix_token: "<s>"
184
+ - name: HuggingFaceM4/idefics-80b
185
+ tokenizer_spec:
186
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
187
+ end_of_text_token: "</s>"
188
+ prefix_token: "<s>"
189
+ - name: HuggingFaceM4/idefics-80b-instruct
190
+ tokenizer_spec:
191
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
192
+ end_of_text_token: "</s>"
193
+ prefix_token: "<s>"
194
+
195
+ - name: anas-awadalla/mpt-7b
196
+ tokenizer_spec:
197
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
198
+ end_of_text_token: "<|endoftext|>"
199
+ prefix_token: ""
200
+
201
+ # Huggingface
202
+ - name: huggingface/gpt2
203
+ tokenizer_spec:
204
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
205
+ args:
206
+ pretrained_model_name_or_path: openai-community/gpt2
207
+ end_of_text_token: "<|endoftext|>"
208
+ prefix_token: "<|endoftext|>"
209
+
210
+ # Lightning AI
211
+ - name: lightningai/lit-gpt
212
+ tokenizer_spec:
213
+ class_name: "helm.tokenizers.lit_gpt_tokenizer.LitGPTTokenizer"
214
+ end_of_text_token: "<|endoftext|>"
215
+ prefix_token: "<|endoftext|>"
216
+
217
+ # Meta-llama
218
+
219
+ # To use the Llama-2 tokenizer:
220
+ #
221
+ # 1. Accept the license agreement: https://ai.meta.com/resources/models-and-libraries/llama-downloads/
222
+ # 2. Request to access the Hugging Face repository: https://huggingface.co/meta-llama/Llama-2-7b
223
+ # 3. Run `huggingface-cli login`
224
+ #
225
+ # If you encounter the following error, complete the above steps and try again:
226
+ #
227
+ # meta-llama/Llama-2-70b-hf is not a local folder and is not a valid model identifier listed on
228
+ # 'https://huggingface.co/models'
229
+ - name: meta-llama/Llama-2-7b-hf
230
+ tokenizer_spec:
231
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
232
+ end_of_text_token: "</s>"
233
+ prefix_token: "<s>"
234
+
235
+ - name: meta/llama-3-8b
236
+ tokenizer_spec:
237
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
238
+ args:
239
+ pretrained_model_name_or_path: meta-llama/Meta-Llama-3-8B
240
+ prefix_token: "<|begin_of_text|>"
241
+ end_of_text_token: "<|end_of_text|>"
242
+
243
+ # 01-ai
244
+ - name: 01-ai/Yi-6B
245
+ tokenizer_spec:
246
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
247
+ end_of_text_token: "</s>"
248
+ prefix_token: "<s>"
249
+
250
+
251
+ # Allen Institute for AI
252
+ # The allenai/olmo-7b requires Python 3.9 or newer.
253
+ # To use the allenai/olmo-7b tokenizer, run `pip install crfm-helm[allenai]` first.
254
+ - name: allenai/olmo-7b
255
+ tokenizer_spec:
256
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
257
+ args:
258
+ trust_remote_code: true
259
+ end_of_text_token: "<|endoftext|>"
260
+ prefix_token: ""
261
+
262
+
263
+ # Microsoft
264
+ - name: microsoft/phi-2
265
+ tokenizer_spec:
266
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
267
+ end_of_text_token: "<|endoftext|>"
268
+ prefix_token: "<|endoftext|>"
269
+
270
+ # Mistralai
271
+ - name: mistralai/Mistral-7B-v0.1
272
+ tokenizer_spec:
273
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
274
+ end_of_text_token: "</s>"
275
+ prefix_token: "<s>"
276
+
277
+ # Neurips
278
+ - name: neurips/local
279
+ tokenizer_spec:
280
+ class_name: "helm.tokenizers.http_model_tokenizer.HTTPModelTokenizer"
281
+ end_of_text_token: "<|endoftext|>"
282
+ prefix_token: "<|endoftext|>"
283
+
284
+ # Openai
285
+ - name: openai/cl100k_base
286
+ tokenizer_spec:
287
+ class_name: "helm.tokenizers.tiktoken_tokenizer.TiktokenTokenizer"
288
+ end_of_text_token: "<|endoftext|>"
289
+ prefix_token: "<|endoftext|>"
290
+
291
+ - name: openai/clip-vit-large-patch14
292
+ tokenizer_spec:
293
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
294
+ end_of_text_token: ""
295
+ prefix_token: ""
296
+
297
+ - name: qwen/qwen-7b
298
+ tokenizer_spec:
299
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
300
+ args:
301
+ pretrained_model_name_or_path: Qwen/Qwen-7B
302
+ trust_remote_code: true
303
+ end_of_text_token: "<|endoftext|>"
304
+ prefix_token: ""
305
+
306
+ - name: qwen/qwen1.5-7b
307
+ tokenizer_spec:
308
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
309
+ args:
310
+ pretrained_model_name_or_path: Qwen/Qwen1.5-7B
311
+ end_of_text_token: "<|endoftext|>"
312
+ prefix_token: ""
313
+
314
+ - name: qwen/qwen-vl
315
+ tokenizer_spec:
316
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
317
+ args:
318
+ pretrained_model_name_or_path: Qwen/Qwen-VL
319
+ trust_remote_code: true
320
+ # Source: https://github.com/QwenLM/Qwen-VL
321
+ end_of_text_token: "<|endoftext|>"
322
+ prefix_token: ""
323
+
324
+ - name: qwen/qwen-vl-chat
325
+ tokenizer_spec:
326
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
327
+ args:
328
+ pretrained_model_name_or_path: Qwen/Qwen-VL-Chat
329
+ trust_remote_code: true
330
+ # Source: https://github.com/QwenLM/Qwen-VL
331
+ end_of_text_token: "<|endoftext|>"
332
+ prefix_token: ""
333
+
334
+ # Tiiuae
335
+ - name: tiiuae/falcon-7b
336
+ tokenizer_spec:
337
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
338
+ end_of_text_token: "<|endoftext|>"
339
+ prefix_token: ""
340
+
341
+ # TsinghuaKEG
342
+ - name: TsinghuaKEG/ice
343
+ tokenizer_spec:
344
+ class_name: "helm.tokenizers.ice_tokenizer.ICETokenizer"
345
+ end_of_text_token: "</s>"
346
+ prefix_token: ""
347
+
348
+ # Writer
349
+ - name: writer/gpt2
350
+ tokenizer_spec:
351
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
352
+ args:
353
+ pretrained_model_name_or_path: openai-community/gpt2
354
+ end_of_text_token: ""
355
+ prefix_token: ""
356
+
357
+ # Yandex
358
+ - name: Yandex/yalm
359
+ tokenizer_spec:
360
+ class_name: "helm.tokenizers.yalm_tokenizer.YaLMTokenizer"
361
+ end_of_text_token: "</s>"
362
+ prefix_token: "</s>"
helm/proxy/accounts.py CHANGED
@@ -23,6 +23,9 @@ DEFAULT_QUOTAS = {
23
23
  "jurassic": {"daily": 10000},
24
24
  "gooseai": {"daily": 10000},
25
25
  "cohere": {"daily": 10000},
26
+ "dall_e": {"daily": 5}, # In terms of the number of generated images
27
+ "together_vision": {"daily": 30},
28
+ "simple": {"daily": 10000},
26
29
  }
27
30
 
28
31
 
@@ -303,7 +306,7 @@ class Accounts:
303
306
  model_group: str,
304
307
  granularity: str,
305
308
  compute_period: Callable[[], str],
306
- ):
309
+ ) -> None:
307
310
  """Helper that checks the usage at a certain granularity (e.g., daily, monthly, total)."""
308
311
 
309
312
  model_group_usages = account.usages.get(model_group)
@@ -321,14 +324,38 @@ class Accounts:
321
324
  if not usage.can_use():
322
325
  raise InsufficientQuotaError(f"{granularity} quota ({usage.quota}) for {model_group} already used up")
323
326
 
327
+ def check_non_empty_quota(
328
+ account: Account,
329
+ model_group: str,
330
+ ) -> None:
331
+ """Helper that checks that the account has quota at some granularity.
332
+
333
+ At each granularity, a quota of None means unlimited quota.
334
+ However, if the quota is None at every granularity, it means that there is no quota.
335
+ To enforce this rule, this helper raises an InsufficientQuotaError if the quota is None
336
+ at every granularity."""
337
+ model_group_usages = account.usages.get(model_group)
338
+ if model_group_usages is None:
339
+ raise InsufficientQuotaError(f"No quota for {model_group}")
340
+ if all(
341
+ [
342
+ granularity_usage.quota is None or granularity_usage.quota <= 0
343
+ for granularity_usage in model_group_usages.values()
344
+ ]
345
+ ):
346
+ raise InsufficientQuotaError(f"No quota for {model_group}")
347
+
324
348
  if self.root_mode:
325
349
  return
326
350
 
327
351
  with SqliteDict(self.path) as cache:
328
352
  account: Account = from_dict(Account, cache[api_key])
329
- granular_check_can_use(account, model_group, "daily", compute_daily_period)
330
- granular_check_can_use(account, model_group, "monthly", compute_monthly_period)
331
- granular_check_can_use(account, model_group, "total", compute_total_period)
353
+ if account.is_admin:
354
+ return
355
+ granular_check_can_use(account, model_group, "daily", compute_daily_period)
356
+ granular_check_can_use(account, model_group, "monthly", compute_monthly_period)
357
+ granular_check_can_use(account, model_group, "total", compute_total_period)
358
+ check_non_empty_quota(account, model_group)
332
359
 
333
360
  def use(self, api_key: str, model_group: str, delta: int):
334
361
  """
@@ -4,6 +4,7 @@ import os
4
4
  from threading import Lock
5
5
  from typing import Dict, List, Optional, Tuple, Union
6
6
  import re
7
+ import sys
7
8
 
8
9
  from helm.common.critique_request import (
9
10
  CritiqueRequest,
@@ -15,6 +16,8 @@ from helm.common.critique_request import (
15
16
  from helm.common.hierarchical_logger import hlog
16
17
  from helm.proxy.critique.mechanical_turk_utils import replace_emoji_characters
17
18
 
19
+ csv.field_size_limit(sys.maxsize)
20
+
18
21
  # A representation of fields that can be used as a dict key.
19
22
  _CritiqueRequestKey = Tuple[Tuple[str, str], ...]
20
23
 
@@ -2,6 +2,7 @@ from typing import Dict, List, Union, Optional
2
2
  import string
3
3
  import dataclasses
4
4
 
5
+ from helm.benchmark.run_spec_factory import get_default_model_deployment_for_model
5
6
  from helm.common.critique_request import (
6
7
  CritiqueRequest,
7
8
  CritiqueRequestResult,
@@ -11,8 +12,8 @@ from helm.common.critique_request import (
11
12
  )
12
13
  from helm.common.hierarchical_logger import hlog
13
14
  from helm.common.optional_dependencies import handle_module_not_found_error
14
- from helm.common.request import Request, RequestResult, Sequence
15
- from helm.proxy.clients.client import Client
15
+ from helm.common.request import Request, RequestResult, GeneratedOutput
16
+ from helm.clients.client import Client
16
17
  from helm.proxy.critique.critique_client import CritiqueClient
17
18
 
18
19
 
@@ -26,6 +27,10 @@ class ModelCritiqueClient(CritiqueClient):
26
27
  def __init__(self, client: Client, model_name):
27
28
  self._client = client
28
29
  self._model_name = model_name
30
+ self._model_deployment_name = (
31
+ get_default_model_deployment_for_model(model_name, warn_arg_deprecated=False, ignore_deprecated=True)
32
+ or self._model_name
33
+ )
29
34
 
30
35
  def _interpolate_fields(self, text: str, fields: Dict[str, str]) -> str:
31
36
  for key, value in fields.items():
@@ -75,6 +80,7 @@ class ModelCritiqueClient(CritiqueClient):
75
80
 
76
81
  request = Request(
77
82
  model=self._model_name,
83
+ model_deployment=self._model_deployment_name,
78
84
  prompt=prompt,
79
85
  max_tokens=max_tokens,
80
86
  echo_prompt=False,
@@ -108,7 +114,7 @@ class ModelCritiqueClient(CritiqueClient):
108
114
  return answers
109
115
 
110
116
  def _multiple_choice_completion_to_answer(
111
- self, question: CritiqueQuestionTemplate, completion: Sequence
117
+ self, question: CritiqueQuestionTemplate, completion: GeneratedOutput
112
118
  ) -> Optional[str]:
113
119
  """Convert a multiple choice completion to an answer."""
114
120
  assert question.question_type == "multiple_choice"
@@ -125,7 +131,7 @@ class ModelCritiqueClient(CritiqueClient):
125
131
  return None
126
132
 
127
133
  def _checkbox_completion_to_answer(
128
- self, question: CritiqueQuestionTemplate, completion: Sequence
134
+ self, question: CritiqueQuestionTemplate, completion: GeneratedOutput
129
135
  ) -> Optional[List[str]]:
130
136
  """Convert a checkbox completion to an answer."""
131
137
  assert question.question_type == "checkbox"
@@ -141,7 +147,9 @@ class ModelCritiqueClient(CritiqueClient):
141
147
  hlog(f"Error parsing answer: {e}. Skipping question (and so the respondent entirely)")
142
148
  return None
143
149
 
144
- def _free_response_completion_to_answer(self, question: CritiqueQuestionTemplate, completion: Sequence) -> str:
150
+ def _free_response_completion_to_answer(
151
+ self, question: CritiqueQuestionTemplate, completion: GeneratedOutput
152
+ ) -> str:
145
153
  """Convert a free response completion to an answer."""
146
154
  assert question.question_type == "free_response"
147
155
  return completion.text
@@ -21,6 +21,8 @@ example_queries = [
21
21
  """
22
22
  temperature: 0.5 # Medium amount of randomness
23
23
  stop_sequences: [.] # Stop when you hit a period
24
+ model: openai/gpt-3.5-turbo-0613
25
+ model_deployment: openai/gpt-3.5-turbo-0613
24
26
  """
25
27
  ),
26
28
  environments="",
@@ -31,7 +33,9 @@ example_queries = [
31
33
  """
32
34
  temperature: 0.5 # Medium amount of randomness
33
35
  stop_sequences: [\\n] # Stop when you hit a newline
34
- num_completions: 10 # Generate many samples
36
+ num_completions: 5 # Generate many samples
37
+ model: openai/gpt-3.5-turbo-0613
38
+ model_deployment: openai/gpt-3.5-turbo-0613
35
39
  """
36
40
  ),
37
41
  environments="",
@@ -42,7 +46,9 @@ example_queries = [
42
46
  """
43
47
  echo_prompt: true # Analyze the prompt
44
48
  max_tokens: 0 # Don't generate any more
45
- top_k_per_token: 10 # Show alternatives for each position
49
+ top_k_per_token: 5 # Show alternatives for each position
50
+ model: openai/davinci-002
51
+ model_deployment: openai/davinci-002
46
52
  """
47
53
  ),
48
54
  environments=dedent(""),
@@ -53,6 +59,8 @@ example_queries = [
53
59
  """
54
60
  temperature: 0 # Deterministic
55
61
  max_tokens: 50
62
+ model: openai/gpt-3.5-turbo-0613
63
+ model_deployment: openai/gpt-3.5-turbo-0613
56
64
  """
57
65
  ),
58
66
  environments="",
@@ -63,13 +71,15 @@ example_queries = [
63
71
  """
64
72
  temperature: 0
65
73
  stop_sequences: [.]
66
- model: ${model} # Try out multiple models
74
+ # Try out multiple models
75
+ model: ${model}
76
+ model_deployment: ${model}
67
77
  """
68
78
  ),
69
79
  environments=dedent(
70
80
  """
71
81
  occupation: [mathematician, lawyer, doctor]
72
- model: [openai/davinci, ai21/j1-jumbo]
82
+ model: [openai/gpt-3.5-turbo-0613, openai/gpt-3.5-turbo-1106]
73
83
  """
74
84
  ),
75
85
  ),
@@ -88,12 +98,14 @@ example_queries = [
88
98
  temperature: 0.5
89
99
  stop_sequences: [\\n]
90
100
  num_completions: 5
91
- model: ${model} # Try out GPT-3 and Jurassic
101
+ # Try out multiple models
102
+ model: ${model}
103
+ model_deployment: ${model}
92
104
  """
93
105
  ),
94
106
  environments=dedent(
95
107
  """
96
- model: [openai/davinci, ai21/j1-jumbo]
108
+ model: [openai/gpt-3.5-turbo-0613, openai/gpt-3.5-turbo-1106]
97
109
  """
98
110
  ),
99
111
  ),
@@ -122,20 +134,23 @@ example_queries = [
122
134
  temperature: 0
123
135
  max_tokens: 1
124
136
  top_k_per_token: 4
125
- model: ${model} # Try out GPT-3 and Jurassic
137
+ # Try out multiple models
138
+ model: ${model}
139
+ model_deployment: ${model}
126
140
  """
127
141
  ),
128
142
  environments=dedent(
129
143
  """
130
- model: [openai/davinci, ai21/j1-jumbo]
144
+ model: [openai/gpt-3.5-turbo-0613, openai/gpt-3.5-turbo-1106]
131
145
  """
132
146
  ),
133
147
  ),
134
148
  Query(
135
- prompt="Takes two vectors a and b and returns their Euclidean distance",
149
+ prompt="Write a Python function that takes two vectors a and b and returns their Euclidean distance.",
136
150
  settings=dedent(
137
151
  """
138
- model: openai/code-davinci-001 # Codex for code generation
152
+ model: openai/gpt-3.5-turbo-0613
153
+ model_deployment: openai/gpt-3.5-turbo-0613
139
154
  """
140
155
  ),
141
156
  environments="",
@@ -144,19 +159,16 @@ example_queries = [
144
159
  prompt="The quick brown fox",
145
160
  settings=dedent(
146
161
  """
147
- model: ${model}
148
162
  temperature: 0.3
149
163
  stop_sequences: [\\n]
164
+ # Try out multiple models
165
+ model: ${model}
166
+ model_deployment: ${model}
150
167
  """
151
168
  ),
152
169
  environments=dedent(
153
170
  """
154
- model: [
155
- "openai/davinci", "openai/text-davinci-002",
156
- "openai/text-davinci-003", "ai21/j1-grande-v2-beta",
157
- "together/gpt-j-6b", "together/gpt-jt-6b-v1",
158
- "together/bloom", "together/opt-175b"
159
- ]
171
+ model: [openai/gpt-3.5-turbo-0613, openai/gpt-3.5-turbo-1106]
160
172
  """
161
173
  ),
162
174
  ),
helm/proxy/retry.py CHANGED
@@ -41,11 +41,13 @@ def get_retry_decorator(
41
41
  Wait function to pass into `Retrying` that logs and returns the amount of time to sleep
42
42
  depending on the number of attempts and delay (in milliseconds).
43
43
  """
44
+ del delay # unused
45
+ next_delay = 2**attempts * wait_exponential_multiplier_seconds * 1000
44
46
  hlog(
45
- f"{operation} failed. Retrying (attempt #{attempts + 1}) in {delay // 1000} seconds... "
47
+ f"{operation} failed. Retrying (attempt #{attempts + 1}) in {next_delay // 1000} seconds... "
46
48
  "(See above for error details)"
47
49
  )
48
- return _retrying.exponential_sleep(attempts, delay)
50
+ return next_delay
49
51
 
50
52
  def print_exception_and_traceback(exception: Exception) -> bool:
51
53
  """
@@ -85,3 +87,7 @@ def retry_if_request_failed(result: Union[RequestResult, TokenizationRequestResu
85
87
  retry_request: Callable = get_retry_decorator(
86
88
  "Request", max_attempts=5, wait_exponential_multiplier_seconds=5, retry_on_result=retry_if_request_failed
87
89
  )
90
+
91
+ retry_tokenizer_request: Callable = get_retry_decorator(
92
+ "Request", max_attempts=5, wait_exponential_multiplier_seconds=1, retry_on_result=retry_if_request_failed
93
+ )