crfm-helm 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +144 -36
  2. crfm_helm-0.5.0.dist-info/RECORD +642 -0
  3. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +37 -2
  5. helm/benchmark/adaptation/adapters/adapter.py +4 -42
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/generation_adapter.py +2 -0
  9. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +21 -4
  10. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +12 -5
  11. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +1 -0
  12. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  13. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  14. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +5 -1
  15. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +1 -0
  16. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -0
  17. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  18. helm/benchmark/adaptation/adapters/test_generation_adapter.py +59 -14
  19. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +40 -5
  20. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +78 -10
  21. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  22. helm/benchmark/adaptation/prompt.py +7 -1
  23. helm/benchmark/adaptation/request_state.py +6 -1
  24. helm/benchmark/adaptation/scenario_state.py +6 -2
  25. helm/benchmark/annotation/annotator.py +43 -0
  26. helm/benchmark/annotation/annotator_factory.py +61 -0
  27. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  28. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  29. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  30. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  31. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  32. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  33. helm/benchmark/annotation_executor.py +124 -0
  34. helm/benchmark/augmentations/cleva_perturbation.py +7 -14
  35. helm/benchmark/augmentations/contraction_expansion_perturbation.py +3 -3
  36. helm/benchmark/augmentations/contrast_sets_perturbation.py +0 -3
  37. helm/benchmark/augmentations/data_augmenter.py +0 -2
  38. helm/benchmark/augmentations/dialect_perturbation.py +2 -2
  39. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  40. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  41. helm/benchmark/augmentations/gender_perturbation.py +3 -3
  42. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  43. helm/benchmark/augmentations/mild_mix_perturbation.py +2 -2
  44. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  45. helm/benchmark/augmentations/person_name_perturbation.py +0 -7
  46. helm/benchmark/augmentations/perturbation.py +20 -7
  47. helm/benchmark/augmentations/perturbation_description.py +1 -1
  48. helm/benchmark/augmentations/space_perturbation.py +2 -2
  49. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  50. helm/benchmark/augmentations/synonym_perturbation.py +2 -2
  51. helm/benchmark/augmentations/test_perturbation.py +11 -7
  52. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  53. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  54. helm/benchmark/config_registry.py +38 -0
  55. helm/benchmark/executor.py +46 -16
  56. helm/benchmark/huggingface_registration.py +37 -7
  57. helm/benchmark/metrics/basic_metrics.py +172 -641
  58. helm/benchmark/metrics/bbq_metrics.py +3 -4
  59. helm/benchmark/metrics/bias_metrics.py +6 -6
  60. helm/benchmark/metrics/classification_metrics.py +11 -8
  61. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  62. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  63. helm/benchmark/metrics/code_metrics.py +4 -3
  64. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  65. helm/benchmark/metrics/common_metric_specs.py +167 -0
  66. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  67. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  68. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  69. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  70. helm/benchmark/metrics/disinformation_metrics.py +6 -112
  71. helm/benchmark/metrics/dry_run_metrics.py +5 -3
  72. helm/benchmark/metrics/efficiency_metrics.py +206 -0
  73. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  74. helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
  75. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  76. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  77. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  78. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  79. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  80. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  81. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  82. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  83. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  84. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  85. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  86. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  87. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  88. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  89. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  90. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  91. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  92. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  93. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  94. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  95. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  96. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  97. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  98. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  99. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  100. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  101. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  102. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  103. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  104. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  105. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  106. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  107. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  108. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  109. helm/benchmark/metrics/machine_translation_metrics.py +5 -5
  110. helm/benchmark/metrics/metric.py +93 -172
  111. helm/benchmark/metrics/metric_name.py +0 -1
  112. helm/benchmark/metrics/metric_service.py +16 -0
  113. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  114. helm/benchmark/metrics/ranking_metrics.py +6 -7
  115. helm/benchmark/metrics/reference_metric.py +148 -0
  116. helm/benchmark/metrics/summac/model_summac.py +0 -2
  117. helm/benchmark/metrics/summarization_metrics.py +8 -8
  118. helm/benchmark/metrics/test_classification_metrics.py +9 -6
  119. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  120. helm/benchmark/metrics/test_evaluate_reference_metrics.py +30 -0
  121. helm/benchmark/metrics/test_metric.py +2 -2
  122. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +1 -1
  123. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +13 -3
  124. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  125. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -0
  126. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +9 -2
  127. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  128. helm/benchmark/metrics/toxicity_utils.py +23 -0
  129. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  130. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  131. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  132. helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
  133. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  134. helm/benchmark/model_deployment_registry.py +164 -41
  135. helm/benchmark/model_metadata_registry.py +181 -35
  136. helm/benchmark/multi_gpu_runner.py +133 -0
  137. helm/benchmark/presentation/contamination.py +3 -3
  138. helm/benchmark/presentation/create_plots.py +8 -7
  139. helm/benchmark/presentation/run_display.py +50 -17
  140. helm/benchmark/presentation/schema.py +28 -46
  141. helm/benchmark/presentation/summarize.py +213 -96
  142. helm/benchmark/presentation/table.py +8 -8
  143. helm/benchmark/presentation/test_contamination.py +2 -2
  144. helm/benchmark/presentation/test_run_entry.py +14 -9
  145. helm/benchmark/presentation/test_summarize.py +5 -0
  146. helm/benchmark/run.py +66 -54
  147. helm/benchmark/run_expander.py +342 -31
  148. helm/benchmark/run_spec.py +93 -0
  149. helm/benchmark/run_spec_factory.py +162 -0
  150. helm/benchmark/run_specs/__init__.py +0 -0
  151. helm/benchmark/{run_specs.py → run_specs/classic_run_specs.py} +217 -1330
  152. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  153. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  154. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  155. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  156. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  157. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  158. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  159. helm/benchmark/run_specs/vlm_run_specs.py +501 -0
  160. helm/benchmark/runner.py +116 -69
  161. helm/benchmark/runner_config_registry.py +21 -0
  162. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  163. helm/benchmark/scenarios/bold_scenario.py +2 -2
  164. helm/benchmark/scenarios/cleva_scenario.py +43 -46
  165. helm/benchmark/scenarios/code_scenario.py +3 -2
  166. helm/benchmark/scenarios/commonsense_scenario.py +171 -191
  167. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  168. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  169. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  170. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  171. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  172. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  173. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  174. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  175. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  176. helm/benchmark/scenarios/entity_matching_scenario.py +1 -1
  177. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  178. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  179. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  180. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  181. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  182. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  183. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  184. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  185. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  186. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  187. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  188. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  189. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  190. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  191. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  192. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  193. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  194. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  195. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  196. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  197. helm/benchmark/scenarios/legalbench_scenario.py +123 -0
  198. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  199. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  200. helm/benchmark/scenarios/lsat_qa_scenario.py +4 -2
  201. helm/benchmark/scenarios/math_scenario.py +19 -2
  202. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  203. helm/benchmark/scenarios/numeracy_scenario.py +3 -3
  204. helm/benchmark/scenarios/opinions_qa_scenario.py +6 -10
  205. helm/benchmark/scenarios/raft_scenario.py +2 -6
  206. helm/benchmark/scenarios/scenario.py +14 -2
  207. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  208. helm/benchmark/scenarios/test_math_scenario.py +22 -0
  209. helm/benchmark/scenarios/test_scenario.py +6 -3
  210. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  211. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  212. helm/benchmark/scenarios/the_pile_scenario.py +6 -7
  213. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  214. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  215. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  216. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  217. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
  218. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  219. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  220. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  221. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  222. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  223. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  224. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  225. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  226. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  227. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  228. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  229. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  230. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  231. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  232. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  233. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  234. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  235. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  236. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  237. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +107 -0
  238. helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
  239. helm/benchmark/scenarios/wmt_14_scenario.py +18 -18
  240. helm/benchmark/server.py +59 -2
  241. helm/benchmark/slurm_jobs.py +12 -0
  242. helm/benchmark/slurm_runner.py +79 -51
  243. helm/benchmark/static/benchmarking.js +3 -4
  244. helm/benchmark/static/contamination.yaml +1 -1
  245. helm/benchmark/static/images/organizations/together.png +0 -0
  246. helm/benchmark/static/json-urls.js +4 -0
  247. helm/benchmark/static/{schema.yaml → schema_classic.yaml} +346 -930
  248. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  249. helm/benchmark/static/schema_lite.yaml +824 -0
  250. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  251. helm/benchmark/static/schema_unitxt.yaml +428 -0
  252. helm/benchmark/static/schema_vlm.yaml +576 -0
  253. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  254. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  255. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  256. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  257. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  258. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  259. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  260. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  261. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  262. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  263. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  264. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  265. helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
  266. helm/benchmark/static_build/assets/index-d839df55.js +9 -0
  267. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  268. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  269. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  270. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  271. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  272. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  273. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  274. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  275. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  276. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  277. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  278. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  279. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  280. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  281. helm/benchmark/static_build/config.js +4 -0
  282. helm/benchmark/static_build/index.html +20 -0
  283. helm/benchmark/test_data_preprocessor.py +3 -3
  284. helm/benchmark/test_model_deployment_definition.py +90 -0
  285. helm/benchmark/test_run_expander.py +1 -1
  286. helm/benchmark/tokenizer_config_registry.py +10 -14
  287. helm/benchmark/window_services/ai21_window_service.py +22 -33
  288. helm/benchmark/window_services/cohere_window_service.py +1 -63
  289. helm/benchmark/window_services/default_window_service.py +2 -35
  290. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  291. helm/benchmark/window_services/ice_window_service.py +0 -34
  292. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  293. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  294. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  295. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  296. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  297. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  298. helm/benchmark/window_services/local_window_service.py +21 -4
  299. helm/benchmark/window_services/no_decoding_window_service.py +32 -0
  300. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  301. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  302. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  303. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  304. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  305. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  306. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  307. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  308. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  309. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  310. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  311. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  312. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  313. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  314. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  315. helm/benchmark/window_services/test_utils.py +3 -2
  316. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  317. helm/benchmark/window_services/window_service.py +42 -0
  318. helm/benchmark/window_services/window_service_factory.py +24 -269
  319. helm/benchmark/window_services/yalm_window_service.py +0 -27
  320. helm/clients/__init__.py +0 -0
  321. helm/{proxy/clients → clients}/ai21_client.py +5 -12
  322. helm/clients/aleph_alpha_client.py +112 -0
  323. helm/{proxy/clients → clients}/anthropic_client.py +213 -24
  324. helm/clients/auto_client.py +215 -0
  325. helm/clients/bedrock_client.py +128 -0
  326. helm/clients/bedrock_utils.py +72 -0
  327. helm/{proxy/clients → clients}/client.py +67 -55
  328. helm/clients/clip_score_client.py +49 -0
  329. helm/clients/clip_scorers/__init__.py +0 -0
  330. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  331. helm/clients/clip_scorers/clip_scorer.py +50 -0
  332. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  333. helm/{proxy/clients → clients}/cohere_client.py +6 -17
  334. helm/clients/gcs_client.py +82 -0
  335. helm/{proxy/clients → clients}/google_client.py +7 -8
  336. helm/clients/google_translate_client.py +35 -0
  337. helm/{proxy/clients → clients}/http_model_client.py +6 -10
  338. helm/{proxy/clients → clients}/huggingface_client.py +134 -92
  339. helm/clients/image_generation/__init__.py +0 -0
  340. helm/clients/image_generation/adobe_vision_client.py +78 -0
  341. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  342. helm/clients/image_generation/cogview2/__init__.py +0 -0
  343. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  344. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  345. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  346. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  347. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  348. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  349. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  350. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  351. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  352. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  353. helm/clients/image_generation/cogview2_client.py +191 -0
  354. helm/clients/image_generation/dalle2_client.py +192 -0
  355. helm/clients/image_generation/dalle3_client.py +108 -0
  356. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  357. helm/clients/image_generation/dalle_mini/data.py +442 -0
  358. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  359. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  360. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  361. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  362. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  363. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  364. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  365. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  366. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  367. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  368. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  369. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  370. helm/clients/image_generation/dalle_mini_client.py +190 -0
  371. helm/clients/image_generation/deep_floyd_client.py +78 -0
  372. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  373. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  374. helm/clients/image_generation/lexica_client.py +86 -0
  375. helm/clients/image_generation/mindalle/__init__.py +0 -0
  376. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  377. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  378. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  379. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  380. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  381. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  382. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  383. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  384. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  385. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  386. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  387. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  388. helm/clients/image_generation/mindalle_client.py +115 -0
  389. helm/clients/image_generation/nudity_check_client.py +64 -0
  390. helm/clients/image_generation/together_image_generation_client.py +111 -0
  391. helm/{proxy/clients → clients}/lit_gpt_client.py +7 -5
  392. helm/{proxy/clients → clients}/megatron_client.py +13 -7
  393. helm/clients/mistral_client.py +134 -0
  394. helm/clients/moderation_api_client.py +109 -0
  395. helm/clients/open_lm_client.py +43 -0
  396. helm/clients/openai_client.py +302 -0
  397. helm/{proxy/clients → clients}/palmyra_client.py +15 -12
  398. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  399. helm/clients/simple_client.py +64 -0
  400. helm/{proxy/clients → clients}/test_auto_client.py +15 -15
  401. helm/clients/test_client.py +100 -0
  402. helm/clients/test_huggingface_client.py +70 -0
  403. helm/clients/test_simple_client.py +19 -0
  404. helm/{proxy/clients → clients}/test_together_client.py +23 -12
  405. helm/{proxy/clients → clients}/together_client.py +18 -71
  406. helm/clients/vertexai_client.py +391 -0
  407. helm/clients/vision_language/__init__.py +0 -0
  408. helm/clients/vision_language/huggingface_vlm_client.py +104 -0
  409. helm/{proxy/clients → clients}/vision_language/idefics_client.py +59 -52
  410. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  411. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  412. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  413. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  414. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  415. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  416. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  417. helm/clients/vision_language/open_flamingo_client.py +155 -0
  418. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  419. helm/clients/vllm_client.py +46 -0
  420. helm/common/cache.py +24 -179
  421. helm/common/cache_backend_config.py +47 -0
  422. helm/common/clip_score_request.py +41 -0
  423. helm/common/concurrency.py +32 -0
  424. helm/common/credentials_utils.py +28 -0
  425. helm/common/file_caches/__init__.py +0 -0
  426. helm/common/file_caches/file_cache.py +16 -0
  427. helm/common/file_caches/local_file_cache.py +61 -0
  428. helm/common/file_caches/test_local_file_cache.py +25 -0
  429. helm/common/file_upload_request.py +27 -0
  430. helm/common/general.py +29 -10
  431. helm/common/image_generation_parameters.py +25 -0
  432. helm/common/images_utils.py +24 -1
  433. helm/common/key_value_store.py +113 -0
  434. helm/common/media_object.py +13 -0
  435. helm/common/moderations_api_request.py +71 -0
  436. helm/common/mongo_key_value_store.py +88 -0
  437. helm/common/multimodal_request_utils.py +31 -0
  438. helm/common/nudity_check_request.py +29 -0
  439. helm/common/object_spec.py +2 -2
  440. helm/common/request.py +36 -27
  441. helm/common/test_general.py +6 -0
  442. helm/common/tokenization_request.py +6 -3
  443. helm/config/__init__.py +0 -0
  444. helm/config/model_deployments.yaml +1942 -0
  445. helm/config/model_metadata.yaml +2201 -0
  446. helm/config/tokenizer_configs.yaml +362 -0
  447. helm/proxy/accounts.py +31 -4
  448. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  449. helm/proxy/critique/model_critique_client.py +13 -5
  450. helm/proxy/example_queries.py +29 -17
  451. helm/proxy/retry.py +8 -2
  452. helm/proxy/server.py +77 -5
  453. helm/proxy/services/remote_service.py +31 -0
  454. helm/proxy/services/server_service.py +103 -20
  455. helm/proxy/services/service.py +34 -2
  456. helm/proxy/services/test_remote_service.py +7 -6
  457. helm/proxy/services/test_service.py +27 -18
  458. helm/proxy/test_accounts.py +32 -0
  459. helm/proxy/token_counters/auto_token_counter.py +37 -37
  460. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  461. helm/proxy/token_counters/token_counter.py +3 -5
  462. helm/py.typed +0 -0
  463. helm/tokenizers/__init__.py +0 -0
  464. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  465. helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +3 -1
  466. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +17 -11
  467. helm/tokenizers/auto_tokenizer.py +93 -0
  468. helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +8 -2
  469. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  470. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  471. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +56 -60
  472. helm/tokenizers/simple_tokenizer.py +33 -0
  473. helm/tokenizers/test_anthropic_tokenizer.py +82 -0
  474. helm/tokenizers/test_huggingface_tokenizer.py +136 -0
  475. helm/tokenizers/test_simple_tokenizer.py +33 -0
  476. helm/tokenizers/vertexai_tokenizer.py +97 -0
  477. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  478. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  479. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  480. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  481. crfm_helm-0.3.0.dist-info/RECORD +0 -396
  482. helm/benchmark/vlm_run_specs.py +0 -71
  483. helm/benchmark/window_services/anthropic_window_service.py +0 -68
  484. helm/benchmark/window_services/bloom_window_service.py +0 -35
  485. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  486. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  487. helm/benchmark/window_services/gptj_window_service.py +0 -38
  488. helm/benchmark/window_services/gptneox_window_service.py +0 -41
  489. helm/benchmark/window_services/http_model_window_service.py +0 -28
  490. helm/benchmark/window_services/huggingface_window_service.py +0 -59
  491. helm/benchmark/window_services/lit_gpt_window_service.py +0 -27
  492. helm/benchmark/window_services/llama_window_service.py +0 -28
  493. helm/benchmark/window_services/luminous_window_service.py +0 -67
  494. helm/benchmark/window_services/megatron_window_service.py +0 -10
  495. helm/benchmark/window_services/mt_nlg_window_service.py +0 -27
  496. helm/benchmark/window_services/openai_window_service.py +0 -13
  497. helm/benchmark/window_services/opt_window_service.py +0 -35
  498. helm/benchmark/window_services/palmyra_window_service.py +0 -45
  499. helm/benchmark/window_services/remote_window_service.py +0 -48
  500. helm/benchmark/window_services/santacoder_window_service.py +0 -27
  501. helm/benchmark/window_services/starcoder_window_service.py +0 -27
  502. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  503. helm/benchmark/window_services/t511b_window_service.py +0 -30
  504. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  505. helm/benchmark/window_services/ul2_window_service.py +0 -30
  506. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  507. helm/benchmark/window_services/wider_openai_window_service.py +0 -52
  508. helm/proxy/clients/aleph_alpha_client.py +0 -99
  509. helm/proxy/clients/auto_client.py +0 -461
  510. helm/proxy/clients/goose_ai_client.py +0 -100
  511. helm/proxy/clients/microsoft_client.py +0 -182
  512. helm/proxy/clients/openai_client.py +0 -206
  513. helm/proxy/clients/remote_model_registry.py +0 -28
  514. helm/proxy/clients/simple_client.py +0 -61
  515. helm/proxy/clients/test_anthropic_client.py +0 -63
  516. helm/proxy/clients/test_client.py +0 -31
  517. helm/proxy/clients/test_huggingface_client.py +0 -87
  518. helm/proxy/models.py +0 -963
  519. helm/proxy/test_models.py +0 -27
  520. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  521. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  522. helm/proxy/token_counters/free_token_counter.py +0 -12
  523. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  524. helm/proxy/token_counters/openai_token_counter.py +0 -22
  525. helm/proxy/token_counters/test_ai21_token_counter.py +0 -86
  526. helm/proxy/token_counters/test_openai_token_counter.py +0 -79
  527. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  528. helm/proxy/tokenizers/test_huggingface_tokenizer.py +0 -56
  529. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
  530. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
  531. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
  532. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  533. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  534. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  535. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  536. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  537. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  538. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  539. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  540. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  541. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  542. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  543. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  544. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  545. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  546. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
@@ -0,0 +1,4 @@
1
+ window.RELEASE = "v1.0.0";
2
+ window.BENCHMARK_OUTPUT_BASE_URL =
3
+ "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/";
4
+ window.PROJECT_ID = "lite";
@@ -0,0 +1,20 @@
1
+ <!doctype html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8" />
5
+ <link rel="icon" type="image/svg+xml" href="./helm.svg" />
6
+ <meta name="viewport" content="width=device-width, initial-scale=1.0" />
7
+ <title>Holistic Evaluation of Language Models (HELM)</title>
8
+ <meta name="description" content="The Holistic Evaluation of Language Models (HELM) serves as a living benchmark for transparency in language models. Providing broad coverage and recognizing incompleteness, multi-metric measurements, and standardization. All data and analysis are freely accessible on the website for exploration and study." />
9
+ <script type="text/javascript" src="./config.js"></script>
10
+ <script type="module" crossorigin src="./assets/index-d839df55.js"></script>
11
+ <link rel="modulepreload" crossorigin href="./assets/react-d4a0b69b.js">
12
+ <link rel="modulepreload" crossorigin href="./assets/recharts-6d337683.js">
13
+ <link rel="modulepreload" crossorigin href="./assets/tremor-54a99cc4.js">
14
+ <link rel="stylesheet" href="./assets/index-5088afcb.css">
15
+ </head>
16
+ <body class="block">
17
+ <div id="root"></div>
18
+
19
+ </body>
20
+ </html>
@@ -4,14 +4,14 @@ from typing import List
4
4
  from helm.benchmark.augmentations.data_augmenter import DataAugmenterSpec
5
5
  from helm.benchmark.augmentations.perturbation import PerturbationSpec
6
6
  from helm.benchmark.data_preprocessor import DataPreprocessor
7
- from helm.benchmark.run_specs import get_scenario_spec1
7
+ from helm.benchmark.run_specs.simple_run_specs import get_simple1_spec
8
8
  from helm.benchmark.scenarios.scenario import create_scenario, Instance, Scenario, with_instance_ids
9
9
 
10
10
 
11
11
  def test_data_preprocessor():
12
12
  # Test that each Instance is given a unique ID and is preserved through data augmentation
13
13
  data_preprocessor = DataPreprocessor(DataAugmenterSpec())
14
- scenario: Scenario = create_scenario(get_scenario_spec1())
14
+ scenario: Scenario = create_scenario(get_simple1_spec().scenario_spec)
15
15
  instances = with_instance_ids(scenario.get_instances(output_path=""))
16
16
  instances: List[Instance] = data_preprocessor.preprocess(instances)
17
17
  for i, instance in enumerate(instances):
@@ -32,7 +32,7 @@ def test_data_preprocessor_with_data_augmentation():
32
32
  should_include_original_eval=True,
33
33
  )
34
34
  data_preprocessor = DataPreprocessor(data_augmenter_spec)
35
- scenario: Scenario = create_scenario(get_scenario_spec1())
35
+ scenario: Scenario = create_scenario(get_simple1_spec().scenario_spec)
36
36
  instances = with_instance_ids(scenario.get_instances(output_path=""))
37
37
  instances: List[Instance] = data_preprocessor.preprocess(instances)
38
38
  assert len(instances) == 10 + 10 + 10 # original train + original eval + perturbed eval
@@ -0,0 +1,90 @@
1
+ """Temporary test for preserving invariants during the model / tokenizer / window service refactor.
2
+
3
+ Delete this after the refactor is done."""
4
+
5
+ from typing import Optional
6
+
7
+ import pytest
8
+ from tempfile import TemporaryDirectory
9
+ from helm.benchmark.model_deployment_registry import (
10
+ get_model_deployment,
11
+ ModelDeployment,
12
+ ALL_MODEL_DEPLOYMENTS,
13
+ )
14
+ from helm.benchmark.model_metadata_registry import get_model_metadata, ModelMetadata
15
+ from helm.benchmark.tokenizer_config_registry import TokenizerConfig, get_tokenizer_config
16
+ from helm.benchmark.window_services.test_utils import get_tokenizer_service
17
+ from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
18
+ from helm.clients.client import Client
19
+ from helm.tokenizers.tokenizer import Tokenizer
20
+ from helm.benchmark.window_services.window_service import WindowService
21
+
22
+ from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
23
+ from helm.clients.auto_client import AutoClient
24
+ from helm.tokenizers.auto_tokenizer import AutoTokenizer
25
+
26
+
27
+ INT_MAX: int = 2**31 - 1
28
+
29
+
30
+ class TestModelProperties:
31
+ @pytest.mark.parametrize("deployment_name", [deployment.name for deployment in ALL_MODEL_DEPLOYMENTS])
32
+ def test_models_has_window_service(self, deployment_name: str):
33
+ with TemporaryDirectory() as tmpdir:
34
+ credentials = {"openaiApiKey": "test-openai-api-key"}
35
+ auto_client = AutoClient(credentials, tmpdir, BlackHoleCacheBackendConfig())
36
+ auto_tokenizer = AutoTokenizer({}, BlackHoleCacheBackendConfig())
37
+ tokenizer_service = get_tokenizer_service(tmpdir, BlackHoleCacheBackendConfig())
38
+
39
+ # Loading the TokenizerConfig and ModelMetadat ensures that they are valid.
40
+ deployment: ModelDeployment = get_model_deployment(deployment_name)
41
+ tokenizer_name: str = deployment.tokenizer_name if deployment.tokenizer_name else deployment_name
42
+ tokenizer_config: Optional[TokenizerConfig] = get_tokenizer_config(tokenizer_name)
43
+ assert tokenizer_config is not None
44
+ model: ModelMetadata = get_model_metadata(
45
+ deployment.model_name if deployment.model_name else deployment_name
46
+ )
47
+
48
+ # Can't test lit-gpt client because it requires manual dependencies
49
+ if "lit-gpt" in model.name:
50
+ return
51
+
52
+ # Can't test Llama 2 because it requires Hugging Face credentials
53
+ if "llama-2-" in model.name:
54
+ return
55
+
56
+ # Can't test Vertex AI because it requires Google credentials
57
+ if deployment_name.startswith("google/"):
58
+ return
59
+
60
+ # Can't test Bedrock because it requires Amazon credentials
61
+ if deployment_name.startswith("amazon/"):
62
+ return
63
+
64
+ # Loads the model, window service and tokenizer
65
+ # which checks that the model, window service and tokenizer are all valid,
66
+ # and that no Client, WindowService or Tokenizer are crashing.
67
+ client: Client = auto_client._get_client(deployment_name) # noqa: F841
68
+ window_service: WindowService = WindowServiceFactory.get_window_service(deployment_name, tokenizer_service)
69
+ tokenizer: Tokenizer = auto_tokenizer._get_tokenizer(tokenizer_name) # noqa: F841
70
+
71
+ # Verify that the parameters that are redundant between the ModelDeployment, Tokenizer and the
72
+ # WindowService are the same.
73
+ assert window_service.tokenizer_name == deployment.tokenizer_name
74
+ assert window_service.max_sequence_length == deployment.max_sequence_length
75
+ assert (
76
+ window_service.max_request_length == deployment.max_request_length
77
+ if deployment.max_request_length
78
+ else deployment.max_sequence_length
79
+ )
80
+ assert (
81
+ window_service.max_sequence_and_generated_tokens_length
82
+ == deployment.max_sequence_and_generated_tokens_length
83
+ if deployment.max_sequence_and_generated_tokens_length
84
+ else INT_MAX
85
+ )
86
+ assert tokenizer_config.end_of_text_token == window_service.end_of_text_token
87
+ assert tokenizer_config.prefix_token == window_service.prefix_token
88
+
89
+ # TODO: Add a dummy tokenize, decode and make_request request to each client/tokenizer
90
+ # Do this once we have a proper Cache for tests.
@@ -2,7 +2,7 @@ import unittest
2
2
 
3
3
  from helm.benchmark.adaptation.adapter_spec import AdapterSpec
4
4
  from helm.benchmark.run_expander import IncreaseMaxTokensRunExpander
5
- from helm.benchmark.runner import RunSpec
5
+ from helm.benchmark.run_spec import RunSpec
6
6
  from helm.benchmark.scenarios.scenario import ScenarioSpec
7
7
 
8
8
 
@@ -1,4 +1,3 @@
1
- import os
2
1
  from typing import Dict, Optional, List
3
2
  from dataclasses import dataclass
4
3
 
@@ -9,9 +8,6 @@ from helm.common.hierarchical_logger import hlog
9
8
  from helm.common.object_spec import ObjectSpec
10
9
 
11
10
 
12
- TOKENIEZR_CONFIGS_FILE = "tokenizer_configs.yaml"
13
-
14
-
15
11
  class TokenizerSpec(ObjectSpec):
16
12
  pass
17
13
 
@@ -26,7 +22,11 @@ class TokenizerConfig:
26
22
  tokenizer_spec: TokenizerSpec
27
23
  """Specification for instantiating the client for this tokenizer."""
28
24
 
29
- # TODO: Add `end_of_text_token`` and `prefix_token``
25
+ end_of_text_token: Optional[str] = None
26
+ """The end of text token."""
27
+
28
+ prefix_token: Optional[str] = None
29
+ """The prefix token."""
30
30
 
31
31
 
32
32
  @dataclass(frozen=True)
@@ -34,11 +34,13 @@ class TokenizerConfigs:
34
34
  tokenizer_configs: List[TokenizerConfig]
35
35
 
36
36
 
37
- _name_to_tokenizer_config: Dict[str, TokenizerConfig] = {}
37
+ ALL_TOKENIZER_CONFIGS: List[TokenizerConfig] = []
38
+ TOKENIZER_NAME_TO_CONFIG: Dict[str, TokenizerConfig] = {config.name: config for config in ALL_TOKENIZER_CONFIGS}
38
39
 
39
40
 
40
41
  def register_tokenizer_config(tokenizer_config: TokenizerConfig) -> None:
41
- _name_to_tokenizer_config[tokenizer_config.name] = tokenizer_config
42
+ ALL_TOKENIZER_CONFIGS.append(tokenizer_config)
43
+ TOKENIZER_NAME_TO_CONFIG[tokenizer_config.name] = tokenizer_config
42
44
 
43
45
 
44
46
  def register_tokenizer_configs_from_path(path: str) -> None:
@@ -50,11 +52,5 @@ def register_tokenizer_configs_from_path(path: str) -> None:
50
52
  register_tokenizer_config(tokenizer_config)
51
53
 
52
54
 
53
- def maybe_register_tokenizer_configs_from_base_path(base_path: str) -> None:
54
- path = os.path.join(base_path, TOKENIEZR_CONFIGS_FILE)
55
- if os.path.exists(path):
56
- register_tokenizer_configs_from_path(path)
57
-
58
-
59
55
  def get_tokenizer_config(name: str) -> Optional[TokenizerConfig]:
60
- return _name_to_tokenizer_config.get(name)
56
+ return TOKENIZER_NAME_TO_CONFIG.get(name)
@@ -9,12 +9,11 @@ from helm.common.tokenization_request import (
9
9
  TokenizationToken,
10
10
  TextRange,
11
11
  )
12
- from .window_service import WindowService, EncodeResult
12
+ from .window_service import ConfigurableWindowService, EncodeResult, WindowService
13
13
  from .tokenizer_service import TokenizerService
14
- from .gpt2_window_service import GPT2WindowService
15
14
 
16
15
 
17
- class AI21WindowService(WindowService):
16
+ class AI21WindowService(ConfigurableWindowService):
18
17
  """Tokenizes by making a request to the proxy server with REST endpoint: `/api/tokenize`."""
19
18
 
20
19
  # AI21's tokenizer API rejects a tokenization request if the input sequence is too long, so
@@ -32,39 +31,29 @@ class AI21WindowService(WindowService):
32
31
  "AI21 only gave API access to their tokenizer, so this method is not supported."
33
32
  )
34
33
 
35
- def __init__(self, service: TokenizerService, gpt2_window_service: GPT2WindowService):
34
+ def __init__(
35
+ self,
36
+ gpt2_window_service: WindowService,
37
+ service: TokenizerService,
38
+ tokenizer_name: str,
39
+ max_sequence_length: int,
40
+ max_request_length: Optional[int] = None,
41
+ max_sequence_and_generated_tokens_length: Optional[int] = None,
42
+ end_of_text_token: Optional[str] = None,
43
+ prefix_token: Optional[str] = None,
44
+ ):
45
+ super().__init__(
46
+ tokenizer_name=tokenizer_name,
47
+ max_sequence_length=max_sequence_length,
48
+ max_request_length=max_request_length,
49
+ max_sequence_and_generated_tokens_length=max_sequence_and_generated_tokens_length,
50
+ end_of_text_token=end_of_text_token,
51
+ prefix_token=prefix_token,
52
+ )
36
53
  # We need the `TokenizerService` to make requests to the server.
37
54
  self.service: TokenizerService = service
38
55
  # As explained above, we need a `GPT2WindowService` to help tokenize long text sequences.
39
- self.gpt2_window_service: GPT2WindowService = gpt2_window_service
40
-
41
- @property
42
- def tokenizer_name(self) -> str:
43
- """Name of the tokenizer to use when sending a request."""
44
- return "ai21/j1"
45
-
46
- @property
47
- def max_sequence_length(self) -> int:
48
- """
49
- The max token length of the model in. The AI21 server automatically prepends a token to every prompt,
50
- so the actual max sequence length is 2048-1 = 2047.
51
- """
52
- return 2047
53
-
54
- @property
55
- def max_request_length(self) -> int:
56
- """The max sequence length is the same as the max request length for AI21."""
57
- return self.max_sequence_length
58
-
59
- @property
60
- def end_of_text_token(self) -> str:
61
- # TODO: I'm not sure what their end of text token is. I don't think it's documented.
62
- return " "
63
-
64
- @property
65
- def prefix_token(self) -> str:
66
- """AI21 tokenizers do no have a prefix token"""
67
- return ""
56
+ self.gpt2_window_service: WindowService = gpt2_window_service
68
57
 
69
58
  def encode(self, text: str, truncation: bool = False, max_length: Optional[int] = None) -> EncodeResult:
70
59
  """
@@ -1,8 +1,7 @@
1
1
  from typing import List, Optional
2
2
 
3
- from helm.proxy.tokenizers.cohere_tokenizer import CohereTokenizer
3
+ from helm.tokenizers.cohere_tokenizer import CohereTokenizer
4
4
  from .local_window_service import LocalWindowService
5
- from .tokenizer_service import TokenizerService
6
5
  from .window_service import EncodeResult
7
6
  from helm.common.tokenization_request import (
8
7
  TokenizationRequest,
@@ -12,47 +11,6 @@ from helm.common.tokenization_request import (
12
11
 
13
12
 
14
13
  class CohereWindowService(LocalWindowService):
15
- def __init__(self, service: TokenizerService):
16
- super().__init__(service)
17
-
18
- @property
19
- def tokenizer_name(self) -> str:
20
- return "cohere/cohere"
21
-
22
- @property
23
- def max_sequence_length(self) -> int:
24
- """
25
- The max length of the model input. Similar to MT-NLG, Cohere does not predict the logprob of
26
- the first input token so `max_sequence_length` is one token shorter than `max_request_length`.
27
- """
28
- return self.max_request_length - 1
29
-
30
- @property
31
- def max_request_length(self) -> int:
32
- """
33
- The max request length of the model. For Cohere, this is the same as the `max_sequence_length`.
34
- If we exceed the `max_sequence_length`, we get the following error:
35
-
36
- Request failed with too many tokens: total number of tokens (prompt and prediction) cannot
37
- exceed 2048 - received 2049. Try using a shorter prompt or a smaller max_tokens value.
38
- """
39
- return 2048
40
-
41
- @property
42
- def end_of_text_token(self) -> str:
43
- """
44
- The end of text token. Cohere does not have one.
45
- """
46
- return ""
47
-
48
- @property
49
- def prefix_token(self) -> str:
50
- """
51
- The prefix token. Cohere does not return the log prob for the first token when `echo_prompt` is True.
52
- """
53
- # Cohere recommended ":", but we can try out different values
54
- return ":"
55
-
56
14
  def encode(self, text: str, truncation: bool = False, max_length: Optional[int] = None) -> EncodeResult:
57
15
  """
58
16
  Encodes the input text to tokens.
@@ -141,23 +99,3 @@ class CohereWindowService(LocalWindowService):
141
99
  result = result[:-1]
142
100
 
143
101
  return result
144
-
145
-
146
- class CohereCommandWindowService(CohereWindowService):
147
- def __init__(self, service: TokenizerService):
148
- super().__init__(service)
149
-
150
- @property
151
- def max_request_length(self) -> int:
152
- """
153
- The max request length of the model. For Cohere, this is the same as the `max_sequence_length`.
154
- If we exceed the `max_sequence_length`, we get the following error:
155
-
156
- Request failed with too many tokens: total number of tokens (prompt and prediction) cannot
157
- exceed 2048 - received 2049. Try using a shorter prompt or a smaller max_tokens value.
158
-
159
- For the Command model, in rare situations, the co.tokenize returns a shorter list of tokens
160
- than the co.generate. This causes sequence length errors for rare inputs. Cohere's advice is
161
- to reduce the sequence length to 2020 to avoid these issues.
162
- """
163
- return 2020
@@ -1,39 +1,6 @@
1
- from typing import Optional
2
1
  from .local_window_service import LocalWindowService
3
- from .tokenizer_service import TokenizerService
4
2
 
5
3
 
6
4
  class DefaultWindowService(LocalWindowService):
7
- def __init__(
8
- self,
9
- service: TokenizerService,
10
- tokenizer_name: str,
11
- max_sequence_length: int,
12
- max_request_length: Optional[int] = None,
13
- ):
14
- super().__init__(service)
15
- self._tokenizer_name = tokenizer_name
16
- self._max_sequence_length = max_sequence_length
17
- self._max_request_length = max_request_length
18
-
19
- @property
20
- def max_sequence_length(self) -> int:
21
- return self._max_sequence_length
22
-
23
- @property
24
- def max_request_length(self) -> int:
25
- return self._max_request_length or self._max_sequence_length
26
-
27
- @property
28
- def end_of_text_token(self) -> str:
29
- # TODO: Support this
30
- return ""
31
-
32
- @property
33
- def tokenizer_name(self) -> str:
34
- return self._tokenizer_name
35
-
36
- @property
37
- def prefix_token(self) -> str:
38
- # TODO: Support this
39
- return ""
5
+ # TODO: Delete this WindowService.
6
+ pass
@@ -2,20 +2,9 @@ from abc import ABC
2
2
 
3
3
  from helm.common.hierarchical_logger import hlog
4
4
  from .local_window_service import LocalWindowService
5
- from .tokenizer_service import TokenizerService
6
5
 
7
6
 
8
7
  class EncoderDecoderWindowService(LocalWindowService, ABC):
9
- def __init__(self, service: TokenizerService):
10
- super().__init__(service)
11
-
12
- @property
13
- def max_request_length(self) -> int:
14
- """
15
- Return the max request length. We set the max requests length to be `max_sequence_length`.
16
- """
17
- return self.max_sequence_length
18
-
19
8
  @property
20
9
  def max_output_length(self) -> int:
21
10
  """
@@ -1,41 +1,7 @@
1
1
  from .local_window_service import LocalWindowService
2
- from .tokenizer_service import TokenizerService
3
2
 
4
3
 
5
4
  class ICEWindowService(LocalWindowService):
6
- def __init__(self, service: TokenizerService):
7
- super().__init__(service)
8
-
9
- @property
10
- def tokenizer_name(self) -> str:
11
- return "TsinghuaKEG/ice"
12
-
13
- @property
14
- def max_sequence_length(self) -> int:
15
- """
16
- The max length of the model input.
17
- According to https://github.com/THUDM/GLM-130B, the max sequence length is 2048.
18
- """
19
- return 2048
20
-
21
- @property
22
- def max_request_length(self) -> int:
23
- return self.max_sequence_length + 1
24
-
25
- @property
26
- def end_of_text_token(self) -> str:
27
- """The end of text token."""
28
- # Followed up in https://github.com/THUDM/icetk/issues/1
29
- return "</s>"
30
-
31
- @property
32
- def prefix_token(self) -> str:
33
- """
34
- The prefix token.
35
- Inference with echo=True is not feasible, so just set it to the empty string.
36
- """
37
- return ""
38
-
39
5
  def truncate_from_right(self, text: str, expected_completion_token_length: int = 0) -> str:
40
6
  """
41
7
  Truncates text from the right to fit within the context window given by `max_request_length`
@@ -0,0 +1,15 @@
1
+ from abc import ABC
2
+
3
+ from helm.benchmark.window_services.local_window_service import LocalWindowService
4
+
5
+
6
+ class CLIPWindowService(LocalWindowService, ABC):
7
+ def truncate_from_right(self, text: str, expected_completion_token_length: int = 0) -> str:
8
+ result: str = self.decode(self.encode(text, truncation=True, max_length=self.max_request_length).tokens)
9
+
10
+ # HACK: For the vast majority of cases, the above logic works, but there are a few where the
11
+ # token count exceeds `max_length` by 1.
12
+ while not self.fits_within_context_window(result):
13
+ result = result[:-1]
14
+
15
+ return result
@@ -0,0 +1,9 @@
1
+ from .clip_window_service import CLIPWindowService
2
+
3
+
4
+ class LexicaSearchWindowService(CLIPWindowService):
5
+ def fits_within_context_window(self, text: str, expected_completion_token_length: int = 0) -> bool:
6
+ return len(text) <= self.max_sequence_length
7
+
8
+ def truncate_from_right(self, text: str, expected_completion_token_length: int = 0) -> str:
9
+ return text[: self.max_sequence_length]
@@ -0,0 +1,9 @@
1
+ from .clip_window_service import CLIPWindowService
2
+
3
+
4
+ class OpenAIDALLEWindowService(CLIPWindowService):
5
+ def fits_within_context_window(self, text: str, expected_completion_token_length: int = 0) -> bool:
6
+ return len(text) <= self.max_sequence_length
7
+
8
+ def truncate_from_right(self, text: str, expected_completion_token_length: int = 0) -> str:
9
+ return text[: self.max_sequence_length]
@@ -0,0 +1,29 @@
1
+ import shutil
2
+ import tempfile
3
+
4
+ from helm.benchmark.window_services.tokenizer_service import TokenizerService
5
+ from helm.benchmark.window_services.test_utils import get_tokenizer_service
6
+ from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
7
+ from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
8
+
9
+
10
+ class TestCLIPWindowService:
11
+ def setup_method(self):
12
+ self.path: str = tempfile.mkdtemp()
13
+ service: TokenizerService = get_tokenizer_service(self.path, BlackHoleCacheBackendConfig())
14
+ self.window_service = WindowServiceFactory.get_window_service("huggingface/dreamlike-photoreal-v2-0", service)
15
+
16
+ def teardown_method(self, method):
17
+ shutil.rmtree(self.path)
18
+
19
+ def test_truncate_from_right(self):
20
+ example_text: str = (
21
+ "an instqrumemnt used for cutting cloth , paper , axdz othr thdin mteroial , "
22
+ "consamistng of two blades lad one on tvopb of the other and fhastned in tle mixdqdjle "
23
+ "so as to bllow them txo be pened and closed by thumb and fitngesr inserted tgrough rings on"
24
+ )
25
+ assert not self.window_service.fits_within_context_window(example_text)
26
+
27
+ # Truncate and ensure it fits within the context window
28
+ truncated_prompt: str = self.window_service.truncate_from_right(example_text)
29
+ assert self.window_service.fits_within_context_window(truncated_prompt)
@@ -0,0 +1,30 @@
1
+ import shutil
2
+ import tempfile
3
+
4
+ from helm.benchmark.window_services.tokenizer_service import TokenizerService
5
+ from helm.clients.image_generation.dalle2_client import DALLE2Client
6
+ from helm.benchmark.window_services.test_utils import get_tokenizer_service, TEST_PROMPT
7
+ from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
8
+ from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
9
+
10
+
11
+ class TestOpenAIDALLEWindowService:
12
+ def setup_method(self):
13
+ self.path: str = tempfile.mkdtemp()
14
+ service: TokenizerService = get_tokenizer_service(self.path, BlackHoleCacheBackendConfig())
15
+ self.window_service = WindowServiceFactory.get_window_service("openai/dall-e-2", service)
16
+
17
+ def teardown_method(self, method):
18
+ shutil.rmtree(self.path)
19
+
20
+ def test_fits_within_context_window(self):
21
+ assert self.window_service.fits_within_context_window(TEST_PROMPT)
22
+
23
+ def test_truncate_from_right(self):
24
+ long_prompt: str = TEST_PROMPT * 10
25
+ assert not self.window_service.fits_within_context_window(long_prompt)
26
+
27
+ # Truncate and ensure it fits within the context window
28
+ truncated_long_prompt: str = self.window_service.truncate_from_right(long_prompt)
29
+ assert len(truncated_long_prompt) == DALLE2Client.MAX_PROMPT_LENGTH
30
+ assert self.window_service.fits_within_context_window(truncated_long_prompt)
@@ -1,7 +1,7 @@
1
1
  from abc import ABC
2
2
  from typing import List, Optional, cast
3
3
 
4
- from .window_service import WindowService, EncodeResult
4
+ from .window_service import ConfigurableWindowService, EncodeResult
5
5
  from .tokenizer_service import TokenizerService
6
6
  from helm.common.tokenization_request import (
7
7
  DecodeRequest,
@@ -10,11 +10,28 @@ from helm.common.tokenization_request import (
10
10
  TokenizationRequestResult,
11
11
  TokenizationToken,
12
12
  )
13
- from helm.proxy.clients.client import cleanup_tokens
13
+ from helm.clients.client import cleanup_tokens
14
14
 
15
15
 
16
- class LocalWindowService(WindowService, ABC):
17
- def __init__(self, service: TokenizerService):
16
+ class LocalWindowService(ConfigurableWindowService, ABC):
17
+ def __init__(
18
+ self,
19
+ service: TokenizerService,
20
+ tokenizer_name: str,
21
+ max_sequence_length: int,
22
+ max_request_length: Optional[int] = None,
23
+ max_sequence_and_generated_tokens_length: Optional[int] = None,
24
+ end_of_text_token: Optional[str] = None,
25
+ prefix_token: Optional[str] = None,
26
+ ):
27
+ super().__init__(
28
+ tokenizer_name=tokenizer_name,
29
+ max_sequence_length=max_sequence_length,
30
+ max_request_length=max_request_length,
31
+ max_sequence_and_generated_tokens_length=max_sequence_and_generated_tokens_length,
32
+ end_of_text_token=end_of_text_token,
33
+ prefix_token=prefix_token,
34
+ )
18
35
  self.service: TokenizerService = service
19
36
 
20
37
  def encode(self, text: str, truncation: bool = False, max_length: Optional[int] = None) -> EncodeResult:
@@ -0,0 +1,32 @@
1
+ from typing import List, Optional
2
+
3
+ from helm.benchmark.window_services.window_service import EncodeResult
4
+ from helm.benchmark.window_services.default_window_service import DefaultWindowService
5
+ from helm.common.tokenization_request import (
6
+ TokenizationRequest,
7
+ TokenizationRequestResult,
8
+ TokenizationToken,
9
+ )
10
+
11
+
12
+ class NoDecodingWindowService(DefaultWindowService):
13
+ """A window service for tokenizers that have a unimplemented decode() method.
14
+
15
+ This assumes that concatenating the tokens from the tokenize endpoint will result in the original string,
16
+ which is not always true for all tokenizers.
17
+
18
+ TODO(#2141): Come up with a more correct way of doing this."""
19
+
20
+ def encode(self, text: str, truncation: bool = False, max_length: Optional[int] = None) -> EncodeResult:
21
+ response: TokenizationRequestResult = self.service.tokenize(
22
+ TokenizationRequest(text, tokenizer=self.tokenizer_name, encode=False, truncation=truncation)
23
+ )
24
+ return EncodeResult(text=text, tokens=response.tokens[:max_length])
25
+
26
+ def decode(self, tokens: List[TokenizationToken], normalized_text: Optional[str] = None) -> str:
27
+ del normalized_text
28
+ token_strings = []
29
+ for token in tokens:
30
+ assert isinstance(token.value, str)
31
+ token_strings.append(token.value)
32
+ return "".join(token_strings)