crfm-helm: crfm_helm-0.3.0-py3-none-any.whl → crfm_helm-0.5.0-py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +144 -36
  2. crfm_helm-0.5.0.dist-info/RECORD +642 -0
  3. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +37 -2
  5. helm/benchmark/adaptation/adapters/adapter.py +4 -42
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/generation_adapter.py +2 -0
  9. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +21 -4
  10. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +12 -5
  11. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +1 -0
  12. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  13. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  14. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +5 -1
  15. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +1 -0
  16. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -0
  17. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  18. helm/benchmark/adaptation/adapters/test_generation_adapter.py +59 -14
  19. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +40 -5
  20. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +78 -10
  21. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  22. helm/benchmark/adaptation/prompt.py +7 -1
  23. helm/benchmark/adaptation/request_state.py +6 -1
  24. helm/benchmark/adaptation/scenario_state.py +6 -2
  25. helm/benchmark/annotation/annotator.py +43 -0
  26. helm/benchmark/annotation/annotator_factory.py +61 -0
  27. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  28. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  29. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  30. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  31. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  32. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  33. helm/benchmark/annotation_executor.py +124 -0
  34. helm/benchmark/augmentations/cleva_perturbation.py +7 -14
  35. helm/benchmark/augmentations/contraction_expansion_perturbation.py +3 -3
  36. helm/benchmark/augmentations/contrast_sets_perturbation.py +0 -3
  37. helm/benchmark/augmentations/data_augmenter.py +0 -2
  38. helm/benchmark/augmentations/dialect_perturbation.py +2 -2
  39. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  40. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  41. helm/benchmark/augmentations/gender_perturbation.py +3 -3
  42. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  43. helm/benchmark/augmentations/mild_mix_perturbation.py +2 -2
  44. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  45. helm/benchmark/augmentations/person_name_perturbation.py +0 -7
  46. helm/benchmark/augmentations/perturbation.py +20 -7
  47. helm/benchmark/augmentations/perturbation_description.py +1 -1
  48. helm/benchmark/augmentations/space_perturbation.py +2 -2
  49. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  50. helm/benchmark/augmentations/synonym_perturbation.py +2 -2
  51. helm/benchmark/augmentations/test_perturbation.py +11 -7
  52. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  53. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  54. helm/benchmark/config_registry.py +38 -0
  55. helm/benchmark/executor.py +46 -16
  56. helm/benchmark/huggingface_registration.py +37 -7
  57. helm/benchmark/metrics/basic_metrics.py +172 -641
  58. helm/benchmark/metrics/bbq_metrics.py +3 -4
  59. helm/benchmark/metrics/bias_metrics.py +6 -6
  60. helm/benchmark/metrics/classification_metrics.py +11 -8
  61. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  62. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  63. helm/benchmark/metrics/code_metrics.py +4 -3
  64. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  65. helm/benchmark/metrics/common_metric_specs.py +167 -0
  66. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  67. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  68. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  69. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  70. helm/benchmark/metrics/disinformation_metrics.py +6 -112
  71. helm/benchmark/metrics/dry_run_metrics.py +5 -3
  72. helm/benchmark/metrics/efficiency_metrics.py +206 -0
  73. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  74. helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
  75. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  76. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  77. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  78. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  79. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  80. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  81. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  82. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  83. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  84. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  85. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  86. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  87. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  88. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  89. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  90. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  91. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  92. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  93. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  94. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  95. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  96. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  97. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  98. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  99. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  100. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  101. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  102. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  103. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  104. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  105. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  106. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  107. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  108. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  109. helm/benchmark/metrics/machine_translation_metrics.py +5 -5
  110. helm/benchmark/metrics/metric.py +93 -172
  111. helm/benchmark/metrics/metric_name.py +0 -1
  112. helm/benchmark/metrics/metric_service.py +16 -0
  113. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  114. helm/benchmark/metrics/ranking_metrics.py +6 -7
  115. helm/benchmark/metrics/reference_metric.py +148 -0
  116. helm/benchmark/metrics/summac/model_summac.py +0 -2
  117. helm/benchmark/metrics/summarization_metrics.py +8 -8
  118. helm/benchmark/metrics/test_classification_metrics.py +9 -6
  119. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  120. helm/benchmark/metrics/test_evaluate_reference_metrics.py +30 -0
  121. helm/benchmark/metrics/test_metric.py +2 -2
  122. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +1 -1
  123. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +13 -3
  124. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  125. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -0
  126. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +9 -2
  127. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  128. helm/benchmark/metrics/toxicity_utils.py +23 -0
  129. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  130. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  131. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  132. helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
  133. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  134. helm/benchmark/model_deployment_registry.py +164 -41
  135. helm/benchmark/model_metadata_registry.py +181 -35
  136. helm/benchmark/multi_gpu_runner.py +133 -0
  137. helm/benchmark/presentation/contamination.py +3 -3
  138. helm/benchmark/presentation/create_plots.py +8 -7
  139. helm/benchmark/presentation/run_display.py +50 -17
  140. helm/benchmark/presentation/schema.py +28 -46
  141. helm/benchmark/presentation/summarize.py +213 -96
  142. helm/benchmark/presentation/table.py +8 -8
  143. helm/benchmark/presentation/test_contamination.py +2 -2
  144. helm/benchmark/presentation/test_run_entry.py +14 -9
  145. helm/benchmark/presentation/test_summarize.py +5 -0
  146. helm/benchmark/run.py +66 -54
  147. helm/benchmark/run_expander.py +342 -31
  148. helm/benchmark/run_spec.py +93 -0
  149. helm/benchmark/run_spec_factory.py +162 -0
  150. helm/benchmark/run_specs/__init__.py +0 -0
  151. helm/benchmark/{run_specs.py → run_specs/classic_run_specs.py} +217 -1330
  152. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  153. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  154. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  155. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  156. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  157. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  158. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  159. helm/benchmark/run_specs/vlm_run_specs.py +501 -0
  160. helm/benchmark/runner.py +116 -69
  161. helm/benchmark/runner_config_registry.py +21 -0
  162. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  163. helm/benchmark/scenarios/bold_scenario.py +2 -2
  164. helm/benchmark/scenarios/cleva_scenario.py +43 -46
  165. helm/benchmark/scenarios/code_scenario.py +3 -2
  166. helm/benchmark/scenarios/commonsense_scenario.py +171 -191
  167. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  168. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  169. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  170. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  171. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  172. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  173. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  174. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  175. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  176. helm/benchmark/scenarios/entity_matching_scenario.py +1 -1
  177. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  178. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  179. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  180. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  181. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  182. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  183. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  184. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  185. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  186. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  187. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  188. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  189. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  190. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  191. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  192. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  193. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  194. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  195. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  196. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  197. helm/benchmark/scenarios/legalbench_scenario.py +123 -0
  198. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  199. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  200. helm/benchmark/scenarios/lsat_qa_scenario.py +4 -2
  201. helm/benchmark/scenarios/math_scenario.py +19 -2
  202. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  203. helm/benchmark/scenarios/numeracy_scenario.py +3 -3
  204. helm/benchmark/scenarios/opinions_qa_scenario.py +6 -10
  205. helm/benchmark/scenarios/raft_scenario.py +2 -6
  206. helm/benchmark/scenarios/scenario.py +14 -2
  207. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  208. helm/benchmark/scenarios/test_math_scenario.py +22 -0
  209. helm/benchmark/scenarios/test_scenario.py +6 -3
  210. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  211. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  212. helm/benchmark/scenarios/the_pile_scenario.py +6 -7
  213. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  214. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  215. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  216. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  217. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
  218. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  219. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  220. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  221. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  222. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  223. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  224. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  225. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  226. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  227. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  228. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  229. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  230. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  231. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  232. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  233. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  234. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  235. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  236. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  237. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +107 -0
  238. helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
  239. helm/benchmark/scenarios/wmt_14_scenario.py +18 -18
  240. helm/benchmark/server.py +59 -2
  241. helm/benchmark/slurm_jobs.py +12 -0
  242. helm/benchmark/slurm_runner.py +79 -51
  243. helm/benchmark/static/benchmarking.js +3 -4
  244. helm/benchmark/static/contamination.yaml +1 -1
  245. helm/benchmark/static/images/organizations/together.png +0 -0
  246. helm/benchmark/static/json-urls.js +4 -0
  247. helm/benchmark/static/{schema.yaml → schema_classic.yaml} +346 -930
  248. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  249. helm/benchmark/static/schema_lite.yaml +824 -0
  250. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  251. helm/benchmark/static/schema_unitxt.yaml +428 -0
  252. helm/benchmark/static/schema_vlm.yaml +576 -0
  253. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  254. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  255. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  256. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  257. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  258. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  259. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  260. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  261. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  262. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  263. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  264. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  265. helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
  266. helm/benchmark/static_build/assets/index-d839df55.js +9 -0
  267. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  268. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  269. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  270. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  271. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  272. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  273. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  274. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  275. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  276. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  277. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  278. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  279. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  280. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  281. helm/benchmark/static_build/config.js +4 -0
  282. helm/benchmark/static_build/index.html +20 -0
  283. helm/benchmark/test_data_preprocessor.py +3 -3
  284. helm/benchmark/test_model_deployment_definition.py +90 -0
  285. helm/benchmark/test_run_expander.py +1 -1
  286. helm/benchmark/tokenizer_config_registry.py +10 -14
  287. helm/benchmark/window_services/ai21_window_service.py +22 -33
  288. helm/benchmark/window_services/cohere_window_service.py +1 -63
  289. helm/benchmark/window_services/default_window_service.py +2 -35
  290. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  291. helm/benchmark/window_services/ice_window_service.py +0 -34
  292. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  293. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  294. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  295. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  296. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  297. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  298. helm/benchmark/window_services/local_window_service.py +21 -4
  299. helm/benchmark/window_services/no_decoding_window_service.py +32 -0
  300. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  301. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  302. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  303. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  304. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  305. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  306. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  307. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  308. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  309. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  310. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  311. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  312. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  313. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  314. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  315. helm/benchmark/window_services/test_utils.py +3 -2
  316. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  317. helm/benchmark/window_services/window_service.py +42 -0
  318. helm/benchmark/window_services/window_service_factory.py +24 -269
  319. helm/benchmark/window_services/yalm_window_service.py +0 -27
  320. helm/clients/__init__.py +0 -0
  321. helm/{proxy/clients → clients}/ai21_client.py +5 -12
  322. helm/clients/aleph_alpha_client.py +112 -0
  323. helm/{proxy/clients → clients}/anthropic_client.py +213 -24
  324. helm/clients/auto_client.py +215 -0
  325. helm/clients/bedrock_client.py +128 -0
  326. helm/clients/bedrock_utils.py +72 -0
  327. helm/{proxy/clients → clients}/client.py +67 -55
  328. helm/clients/clip_score_client.py +49 -0
  329. helm/clients/clip_scorers/__init__.py +0 -0
  330. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  331. helm/clients/clip_scorers/clip_scorer.py +50 -0
  332. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  333. helm/{proxy/clients → clients}/cohere_client.py +6 -17
  334. helm/clients/gcs_client.py +82 -0
  335. helm/{proxy/clients → clients}/google_client.py +7 -8
  336. helm/clients/google_translate_client.py +35 -0
  337. helm/{proxy/clients → clients}/http_model_client.py +6 -10
  338. helm/{proxy/clients → clients}/huggingface_client.py +134 -92
  339. helm/clients/image_generation/__init__.py +0 -0
  340. helm/clients/image_generation/adobe_vision_client.py +78 -0
  341. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  342. helm/clients/image_generation/cogview2/__init__.py +0 -0
  343. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  344. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  345. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  346. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  347. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  348. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  349. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  350. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  351. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  352. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  353. helm/clients/image_generation/cogview2_client.py +191 -0
  354. helm/clients/image_generation/dalle2_client.py +192 -0
  355. helm/clients/image_generation/dalle3_client.py +108 -0
  356. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  357. helm/clients/image_generation/dalle_mini/data.py +442 -0
  358. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  359. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  360. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  361. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  362. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  363. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  364. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  365. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  366. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  367. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  368. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  369. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  370. helm/clients/image_generation/dalle_mini_client.py +190 -0
  371. helm/clients/image_generation/deep_floyd_client.py +78 -0
  372. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  373. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  374. helm/clients/image_generation/lexica_client.py +86 -0
  375. helm/clients/image_generation/mindalle/__init__.py +0 -0
  376. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  377. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  378. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  379. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  380. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  381. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  382. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  383. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  384. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  385. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  386. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  387. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  388. helm/clients/image_generation/mindalle_client.py +115 -0
  389. helm/clients/image_generation/nudity_check_client.py +64 -0
  390. helm/clients/image_generation/together_image_generation_client.py +111 -0
  391. helm/{proxy/clients → clients}/lit_gpt_client.py +7 -5
  392. helm/{proxy/clients → clients}/megatron_client.py +13 -7
  393. helm/clients/mistral_client.py +134 -0
  394. helm/clients/moderation_api_client.py +109 -0
  395. helm/clients/open_lm_client.py +43 -0
  396. helm/clients/openai_client.py +302 -0
  397. helm/{proxy/clients → clients}/palmyra_client.py +15 -12
  398. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  399. helm/clients/simple_client.py +64 -0
  400. helm/{proxy/clients → clients}/test_auto_client.py +15 -15
  401. helm/clients/test_client.py +100 -0
  402. helm/clients/test_huggingface_client.py +70 -0
  403. helm/clients/test_simple_client.py +19 -0
  404. helm/{proxy/clients → clients}/test_together_client.py +23 -12
  405. helm/{proxy/clients → clients}/together_client.py +18 -71
  406. helm/clients/vertexai_client.py +391 -0
  407. helm/clients/vision_language/__init__.py +0 -0
  408. helm/clients/vision_language/huggingface_vlm_client.py +104 -0
  409. helm/{proxy/clients → clients}/vision_language/idefics_client.py +59 -52
  410. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  411. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  412. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  413. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  414. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  415. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  416. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  417. helm/clients/vision_language/open_flamingo_client.py +155 -0
  418. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  419. helm/clients/vllm_client.py +46 -0
  420. helm/common/cache.py +24 -179
  421. helm/common/cache_backend_config.py +47 -0
  422. helm/common/clip_score_request.py +41 -0
  423. helm/common/concurrency.py +32 -0
  424. helm/common/credentials_utils.py +28 -0
  425. helm/common/file_caches/__init__.py +0 -0
  426. helm/common/file_caches/file_cache.py +16 -0
  427. helm/common/file_caches/local_file_cache.py +61 -0
  428. helm/common/file_caches/test_local_file_cache.py +25 -0
  429. helm/common/file_upload_request.py +27 -0
  430. helm/common/general.py +29 -10
  431. helm/common/image_generation_parameters.py +25 -0
  432. helm/common/images_utils.py +24 -1
  433. helm/common/key_value_store.py +113 -0
  434. helm/common/media_object.py +13 -0
  435. helm/common/moderations_api_request.py +71 -0
  436. helm/common/mongo_key_value_store.py +88 -0
  437. helm/common/multimodal_request_utils.py +31 -0
  438. helm/common/nudity_check_request.py +29 -0
  439. helm/common/object_spec.py +2 -2
  440. helm/common/request.py +36 -27
  441. helm/common/test_general.py +6 -0
  442. helm/common/tokenization_request.py +6 -3
  443. helm/config/__init__.py +0 -0
  444. helm/config/model_deployments.yaml +1942 -0
  445. helm/config/model_metadata.yaml +2201 -0
  446. helm/config/tokenizer_configs.yaml +362 -0
  447. helm/proxy/accounts.py +31 -4
  448. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  449. helm/proxy/critique/model_critique_client.py +13 -5
  450. helm/proxy/example_queries.py +29 -17
  451. helm/proxy/retry.py +8 -2
  452. helm/proxy/server.py +77 -5
  453. helm/proxy/services/remote_service.py +31 -0
  454. helm/proxy/services/server_service.py +103 -20
  455. helm/proxy/services/service.py +34 -2
  456. helm/proxy/services/test_remote_service.py +7 -6
  457. helm/proxy/services/test_service.py +27 -18
  458. helm/proxy/test_accounts.py +32 -0
  459. helm/proxy/token_counters/auto_token_counter.py +37 -37
  460. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  461. helm/proxy/token_counters/token_counter.py +3 -5
  462. helm/py.typed +0 -0
  463. helm/tokenizers/__init__.py +0 -0
  464. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  465. helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +3 -1
  466. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +17 -11
  467. helm/tokenizers/auto_tokenizer.py +93 -0
  468. helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +8 -2
  469. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  470. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  471. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +56 -60
  472. helm/tokenizers/simple_tokenizer.py +33 -0
  473. helm/tokenizers/test_anthropic_tokenizer.py +82 -0
  474. helm/tokenizers/test_huggingface_tokenizer.py +136 -0
  475. helm/tokenizers/test_simple_tokenizer.py +33 -0
  476. helm/tokenizers/vertexai_tokenizer.py +97 -0
  477. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  478. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  479. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  480. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  481. crfm_helm-0.3.0.dist-info/RECORD +0 -396
  482. helm/benchmark/vlm_run_specs.py +0 -71
  483. helm/benchmark/window_services/anthropic_window_service.py +0 -68
  484. helm/benchmark/window_services/bloom_window_service.py +0 -35
  485. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  486. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  487. helm/benchmark/window_services/gptj_window_service.py +0 -38
  488. helm/benchmark/window_services/gptneox_window_service.py +0 -41
  489. helm/benchmark/window_services/http_model_window_service.py +0 -28
  490. helm/benchmark/window_services/huggingface_window_service.py +0 -59
  491. helm/benchmark/window_services/lit_gpt_window_service.py +0 -27
  492. helm/benchmark/window_services/llama_window_service.py +0 -28
  493. helm/benchmark/window_services/luminous_window_service.py +0 -67
  494. helm/benchmark/window_services/megatron_window_service.py +0 -10
  495. helm/benchmark/window_services/mt_nlg_window_service.py +0 -27
  496. helm/benchmark/window_services/openai_window_service.py +0 -13
  497. helm/benchmark/window_services/opt_window_service.py +0 -35
  498. helm/benchmark/window_services/palmyra_window_service.py +0 -45
  499. helm/benchmark/window_services/remote_window_service.py +0 -48
  500. helm/benchmark/window_services/santacoder_window_service.py +0 -27
  501. helm/benchmark/window_services/starcoder_window_service.py +0 -27
  502. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  503. helm/benchmark/window_services/t511b_window_service.py +0 -30
  504. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  505. helm/benchmark/window_services/ul2_window_service.py +0 -30
  506. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  507. helm/benchmark/window_services/wider_openai_window_service.py +0 -52
  508. helm/proxy/clients/aleph_alpha_client.py +0 -99
  509. helm/proxy/clients/auto_client.py +0 -461
  510. helm/proxy/clients/goose_ai_client.py +0 -100
  511. helm/proxy/clients/microsoft_client.py +0 -182
  512. helm/proxy/clients/openai_client.py +0 -206
  513. helm/proxy/clients/remote_model_registry.py +0 -28
  514. helm/proxy/clients/simple_client.py +0 -61
  515. helm/proxy/clients/test_anthropic_client.py +0 -63
  516. helm/proxy/clients/test_client.py +0 -31
  517. helm/proxy/clients/test_huggingface_client.py +0 -87
  518. helm/proxy/models.py +0 -963
  519. helm/proxy/test_models.py +0 -27
  520. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  521. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  522. helm/proxy/token_counters/free_token_counter.py +0 -12
  523. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  524. helm/proxy/token_counters/openai_token_counter.py +0 -22
  525. helm/proxy/token_counters/test_ai21_token_counter.py +0 -86
  526. helm/proxy/token_counters/test_openai_token_counter.py +0 -79
  527. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  528. helm/proxy/tokenizers/test_huggingface_tokenizer.py +0 -56
  529. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
  530. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
  531. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
  532. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  533. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  534. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  535. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  536. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  537. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  538. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  539. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  540. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  541. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  542. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  543. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  544. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  545. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  546. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
helm/proxy/test_models.py DELETED
@@ -1,27 +0,0 @@
1
- from .models import get_model, get_model_group, get_models_by_organization, get_all_code_models, Model
2
-
3
-
4
- def test_get_model():
5
- model: Model = get_model("ai21/j1-jumbo")
6
- assert model.organization == "ai21"
7
- assert model.engine == "j1-jumbo"
8
-
9
-
10
- def test_get_model_with_invalid_model_name():
11
- try:
12
- get_model("invalid/model")
13
- assert False, "Expected to throw ValueError"
14
- except ValueError:
15
- pass
16
-
17
-
18
- def test_get_model_group():
19
- assert get_model_group("openai/text-curie-001") == "gpt3"
20
-
21
-
22
- def test_get_models_by_organization():
23
- assert get_models_by_organization("simple") == ["simple/model1"]
24
-
25
-
26
- def test_all_code_models():
27
- assert "openai/code-davinci-002" in get_all_code_models()
@@ -1,20 +0,0 @@
1
- from typing import List
2
-
3
- from helm.common.request import Request, Sequence
4
- from .token_counter import TokenCounter
5
-
6
-
7
- class AI21TokenCounter(TokenCounter):
8
- def count_tokens(self, request: Request, completions: List[Sequence]) -> int:
9
- """
10
- Counts the number of generated tokens and NOT the number of tokens in the prompt
11
- (https://studio.ai21.com/docs/calculating-usage).
12
-
13
- The AI21 documentation (https://studio.ai21.com/docs/calculating-usage/) defines
14
- generated tokens as:
15
- "the total number of all completion tokens you generate. For example, assume you post
16
- a complete request for J1-Jumbo with a prompt consisting of 10 tokens and requiring 3
17
- completions, i.e. numResults = 3, and the model generates completions with 5, 15, and
18
- 20 tokens. In total this request will consume 5+15+20=40 generated tokens."
19
- """
20
- return sum(len(sequence.tokens) for sequence in completions)
@@ -1,13 +0,0 @@
1
- from typing import List
2
-
3
- from helm.common.request import Request, Sequence
4
- from .token_counter import TokenCounter
5
-
6
-
7
- class CohereTokenCounter(TokenCounter):
8
- def count_tokens(self, request: Request, completions: List[Sequence]) -> int:
9
- """
10
- Counts the number of generated tokens.
11
- TODO: Cohere simply counts the number of generations, but we currently only support counting tokens.
12
- """
13
- return sum(len(sequence.tokens) for sequence in completions)
@@ -1,12 +0,0 @@
1
- from typing import List
2
-
3
- from helm.common.request import Request, Sequence
4
- from .token_counter import TokenCounter
5
-
6
-
7
- class FreeTokenCounter(TokenCounter):
8
- """For when we don't care about keeping track of the number of tokens."""
9
-
10
- def count_tokens(self, request: Request, completions: List[Sequence]) -> int:
11
- """No need to count tokens, since it's free. Return 0."""
12
- return 0
@@ -1,24 +0,0 @@
1
- from typing import List
2
-
3
- from helm.common.request import Request, Sequence
4
- from .token_counter import TokenCounter
5
-
6
-
7
- class GooseAITokenCounter(TokenCounter):
8
- # From https://goose.ai/pricing: "the base price includes your first 25 tokens
9
- # generated, and you can scale beyond that on a per-token basis."
10
- BASE_PRICE_TOKENS: int = 25
11
-
12
- @staticmethod
13
- def account_for_base_tokens(num_tokens: int):
14
- """Subtracts the number of tokens included in the base price."""
15
- return max(num_tokens - GooseAITokenCounter.BASE_PRICE_TOKENS, 0)
16
-
17
- def count_tokens(self, request: Request, completions: List[Sequence]) -> int:
18
- """
19
- Counts the number of generated tokens and NOT the number of tokens in the prompt.
20
- From https://goose.ai/pricing: "by charging only for output, you have control since
21
- you can configure the maximum number of tokens generated per API call
22
- (up to 2,048 tokens)."
23
- """
24
- return GooseAITokenCounter.account_for_base_tokens(sum(len(sequence.tokens) for sequence in completions))
@@ -1,22 +0,0 @@
1
- from typing import List
2
-
3
- from helm.common.request import Request, Sequence
4
- from helm.common.tokenization_request import TokenizationRequest, TokenizationRequestResult
5
- from helm.proxy.clients.huggingface_client import HuggingFaceClient
6
- from .token_counter import TokenCounter
7
-
8
-
9
- class OpenAITokenCounter(TokenCounter):
10
- def __init__(self, huggingface_client: HuggingFaceClient):
11
- self.huggingface_client: HuggingFaceClient = huggingface_client
12
-
13
- def count_tokens(self, request: Request, completions: List[Sequence]) -> int:
14
- """
15
- Counts the total number of tokens using the suggestion here:
16
- https://community.openai.com/t/how-do-i-calculate-the-pricing-for-generation-of-text/11662/5
17
- """
18
- tokenized_prompt: TokenizationRequestResult = self.huggingface_client.tokenize(
19
- TokenizationRequest(request.prompt)
20
- )
21
- # Number of tokens in the prompt + number of tokens in all the completions
22
- return len(tokenized_prompt.tokens) + sum([len(sequence.tokens) for sequence in completions])
@@ -1,86 +0,0 @@
1
- from typing import List
2
-
3
- from helm.common.request import Request, Sequence, Token
4
- from .ai21_token_counter import AI21TokenCounter
5
-
6
-
7
- class TestAI21TokenCounter:
8
- def setup_method(self, method):
9
- self.token_counter = AI21TokenCounter()
10
-
11
- def test_count_tokens(self):
12
- request = Request(
13
- prompt="The Center for Research on Foundation Models (CRFM) is "
14
- "an interdisciplinary initiative born out of the Stanford "
15
- "Institute for Human-Centered Artificial Intelligence (HAI) "
16
- "that aims to make fundamental advances in the study, development, "
17
- "and deployment of foundation models."
18
- )
19
- completions: List[Sequence] = [
20
- Sequence(
21
- text="\n\nFoundation models are an artificial intelligence paradigm that emphasizes: (1) reasoning "
22
- "about symbolic, structured knowledge, (2) learning to perform tasks from observation, ",
23
- logprob=-66.24831021729915,
24
- tokens=[
25
- Token(text="\n", logprob=-1.034758448600769, top_logprobs={"\n": -1.034758448600769}),
26
- Token(text="\n", logprob=-2.325321674346924, top_logprobs={" Foundation": -1.2628217935562134}),
27
- Token(
28
- text=" Foundation",
29
- logprob=-1.2575088739395142,
30
- top_logprobs={" Foundation": -1.2575088739395142},
31
- ),
32
- Token(
33
- text=" models are",
34
- logprob=-0.9496442079544067,
35
- top_logprobs={" models are": -0.9496442079544067},
36
- ),
37
- Token(text=" an", logprob=-5.785625457763672, top_logprobs={" a class of": -2.762187957763672}),
38
- Token(
39
- text=" artificial intelligence",
40
- logprob=-2.073937177658081,
41
- top_logprobs={" increasingly popular": -1.714562177658081},
42
- ),
43
- Token(text=" paradigm", logprob=-2.855092763900757, top_logprobs={" ": -1.2613427639007568}),
44
- Token(text=" that", logprob=-1.380897879600525, top_logprobs={" that": -1.380897879600525}),
45
- Token(
46
- text=" emphasizes",
47
- logprob=-4.230795860290527,
48
- top_logprobs={" attempts to": -3.5276708602905273},
49
- ),
50
- Token(text=":", logprob=-5.380178451538086, top_logprobs={" reasoning": -2.192678689956665}),
51
- Token(text=" ", logprob=-1.1408348083496094, top_logprobs={"\n": -0.6095848083496094}),
52
- Token(text="(", logprob=-0.41460439562797546, top_logprobs={"(": -0.41460439562797546}),
53
- Token(text="1", logprob=-0.5666008591651917, top_logprobs={"1": -0.5666008591651917}),
54
- Token(text=")", logprob=-0.001801535952836275, top_logprobs={")": -0.001801535952836275}),
55
- Token(text=" reasoning", logprob=-3.4144058227539062, top_logprobs={" the": -2.3987808227539062}),
56
- Token(text=" about", logprob=-1.3604949712753296, top_logprobs={" about": -1.3604949712753296}),
57
- Token(text=" symbolic", logprob=-7.108627796173096, top_logprobs={" and": -2.5617527961730957}),
58
- Token(text=",", logprob=-2.8421378135681152, top_logprobs={" knowledge": -1.6233878135681152}),
59
- Token(
60
- text=" structured",
61
- logprob=-2.6082611083984375,
62
- top_logprobs={" structured": -2.6082611083984375},
63
- ),
64
- Token(text=" knowledge", logprob=-0.91008061170578, top_logprobs={" knowledge": -0.91008061170578}),
65
- Token(text=",", logprob=-1.0750247240066528, top_logprobs={",": -1.0750247240066528}),
66
- Token(text=" ", logprob=-0.5834965705871582, top_logprobs={" ": -0.5834965705871582}),
67
- Token(text="(", logprob=-0.0004963834653608501, top_logprobs={"(": -0.0004963834653608501}),
68
- Token(text="2", logprob=-0.0009141556802205741, top_logprobs={"2": -0.0009141556802205741}),
69
- Token(text=")", logprob=-5.686121585313231e-05, top_logprobs={")": -5.686121585313231e-05}),
70
- Token(text=" learning", logprob=-2.123058319091797, top_logprobs={" learning": -2.123058319091797}),
71
- Token(
72
- text=" to perform", logprob=-5.197870254516602, top_logprobs={" through": -1.7916204929351807}
73
- ),
74
- Token(text=" tasks", logprob=-1.5782833099365234, top_logprobs={" complex": -1.5470333099365234}),
75
- Token(text=" from", logprob=-1.1503676176071167, top_logprobs={" from": -1.1503676176071167}),
76
- Token(
77
- text=" observation", logprob=-4.8489789962768555, top_logprobs={" human": -2.5989787578582764}
78
- ),
79
- Token(text=",", logprob=-0.7239797711372375, top_logprobs={",": -0.7239797711372375}),
80
- Token(text=" ", logprob=-1.3241727352142334, top_logprobs={" and": -0.5897976756095886}),
81
- ],
82
- )
83
- ]
84
-
85
- # Verified against https://studio.ai21.com/playground.
86
- assert self.token_counter.count_tokens(request, completions) == 32
@@ -1,79 +0,0 @@
1
- import os
2
- import tempfile
3
- from typing import List
4
-
5
- from helm.common.cache import SqliteCacheConfig
6
- from helm.common.request import Request, Sequence, Token
7
- from helm.proxy.clients.huggingface_client import HuggingFaceClient
8
- from helm.proxy.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer
9
- from .openai_token_counter import OpenAITokenCounter
10
-
11
-
12
- class TestOpenAITokenCounter:
13
- # The following prompt has 51 tokens according to the GPT-2 tokenizer
14
- TEST_PROMPT: str = (
15
- "The Center for Research on Foundation Models (CRFM) is "
16
- "an interdisciplinary initiative born out of the Stanford "
17
- "Institute for Human-Centered Artificial Intelligence (HAI) "
18
- "that aims to make fundamental advances in the study, development, "
19
- "and deployment of foundation models."
20
- )
21
-
22
- def setup_method(self, method):
23
- self.cache_path: str = tempfile.NamedTemporaryFile(delete=False).name
24
- self.token_counter = OpenAITokenCounter(
25
- HuggingFaceClient(
26
- tokenizer=HuggingFaceTokenizer(cache_config=SqliteCacheConfig(self.cache_path)),
27
- cache_config=SqliteCacheConfig(self.cache_path),
28
- )
29
- )
30
-
31
- def teardown_method(self, method):
32
- os.remove(self.cache_path)
33
-
34
- def test_count_tokens(self):
35
- request = Request(prompt=TestOpenAITokenCounter.TEST_PROMPT)
36
- completions: List[Sequence] = [
37
- Sequence(
38
- text=" The CRFM is dedicated to advancing our knowledge of the foundations of artificial intelligence "
39
- "(AI) and related fields. It focuses on foundational questions in AI, which are",
40
- logprob=-49.00783279519999,
41
- tokens=[
42
- Token(text=" The", logprob=-1.8096403, top_logprobs={"\n": -1.6654028}),
43
- Token(text=" CR", logprob=-1.2861944, top_logprobs={" CR": -1.2861944}),
44
- Token(text="FM", logprob=-0.0032369632, top_logprobs={"FM": -0.0032369632}),
45
- Token(text=" is", logprob=-1.4355252, top_logprobs={" is": -1.4355252}),
46
- Token(text=" dedicated", logprob=-3.814422, top_logprobs={" a": -1.8003343}),
47
- Token(text=" to", logprob=-0.009623392, top_logprobs={" to": -0.009623392}),
48
- Token(text=" advancing", logprob=-2.6732886, top_logprobs={" the": -1.859751}),
49
- Token(text=" our", logprob=-3.123714, top_logprobs={" the": -1.0504603}),
50
- Token(text=" knowledge", logprob=-3.030337, top_logprobs={" understanding": -0.34646907}),
51
- Token(text=" of", logprob=-0.46280858, top_logprobs={" of": -0.46280858}),
52
- Token(text=" the", logprob=-1.4058315, top_logprobs={" the": -1.4058315}),
53
- Token(text=" foundations", logprob=-2.0638132, top_logprobs={" foundations": -2.0638132}),
54
- Token(text=" of", logprob=-0.2607486, top_logprobs={" of": -0.2607486}),
55
- Token(text=" artificial", logprob=-1.1653417, top_logprobs={" artificial": -1.1653417}),
56
- Token(text=" intelligence", logprob=-0.03756146, top_logprobs={" intelligence": -0.03756146}),
57
- Token(text=" (", logprob=-2.019812, top_logprobs={",": -1.3503861}),
58
- Token(text="AI", logprob=-0.03869382, top_logprobs={"AI": -0.03869382}),
59
- Token(text=")", logprob=-0.49895737, top_logprobs={")": -0.49895737}),
60
- Token(text=" and", logprob=-0.81909865, top_logprobs={" and": -0.81909865}),
61
- Token(text=" related", logprob=-2.611718, top_logprobs={" to": -2.3555496}),
62
- Token(text=" fields", logprob=-0.7640527, top_logprobs={" fields": -0.7640527}),
63
- Token(text=".", logprob=-1.8066244, top_logprobs={",": -1.2972366}),
64
- Token(text=" It", logprob=-2.2856107, top_logprobs={"\n": -1.7865871}),
65
- Token(text=" focuses", logprob=-3.3735154, top_logprobs={" is": -1.4982711}),
66
- Token(text=" on", logprob=-0.13244776, top_logprobs={" on": -0.13244776}),
67
- Token(text=" foundational", logprob=-1.2640914, top_logprobs={" foundational": -1.2640914}),
68
- Token(text=" questions", logprob=-2.010647, top_logprobs={" issues": -1.673752}),
69
- Token(text=" in", logprob=-1.980726, top_logprobs={" that": -1.8133409}),
70
- Token(text=" AI", logprob=-0.5709368, top_logprobs={" AI": -0.5709368}),
71
- Token(text=",", logprob=-1.036094, top_logprobs={",": -1.036094}),
72
- Token(text=" which", logprob=-3.826836, top_logprobs={" such": -2.0843854}),
73
- Token(text=" are", logprob=-1.3858839, top_logprobs={" are": -1.3858839}),
74
- ],
75
- )
76
- ]
77
-
78
- # Verified against https://beta.openai.com/tokenizer. Prompt + completions = 51 + 32.
79
- assert self.token_counter.count_tokens(request, completions) == 51 + 32
@@ -1,32 +0,0 @@
1
- from typing import List
2
-
3
- from helm.common.tokenization_request import (
4
- DecodeRequest,
5
- DecodeRequestResult,
6
- TokenizationRequest,
7
- TokenizationRequestResult,
8
- TokenizationToken,
9
- )
10
- from .tokenizer import Tokenizer
11
-
12
-
13
- class SimpleTokenizer(Tokenizer):
14
- """Implements some "models" that just generate silly things quickly just to debug the infrastructure."""
15
-
16
- @staticmethod
17
- def tokenize_by_space(text: str) -> List[str]:
18
- """Simply tokenizes by a single white space."""
19
- return text.split(" ")
20
-
21
- def tokenize(self, request: TokenizationRequest) -> TokenizationRequestResult:
22
- # TODO: Does not support encoding
23
- if request.tokenizer == "simple/model1":
24
- raw_tokens: List[str] = SimpleTokenizer.tokenize_by_space(request.text)
25
- return TokenizationRequestResult(
26
- success=True, cached=False, tokens=[TokenizationToken(text) for text in raw_tokens], text=request.text
27
- )
28
- else:
29
- raise ValueError("Unknown model")
30
-
31
- def decode(self, request: DecodeRequest) -> DecodeRequestResult:
32
- raise NotImplementedError
@@ -1,56 +0,0 @@
1
- from typing import Optional
2
- from helm.common.general import singleton
3
- from .huggingface_tokenizer import HuggingFaceTokenizer
4
-
5
-
6
- class TestHuggingFaceTokenizer:
7
- # The following prompt has 51 tokens according to the GPT-2 tokenizer
8
- TEST_PROMPT: str = (
9
- "The Center for Research on Foundation Models (CRFM) is "
10
- "an interdisciplinary initiative born out of the Stanford "
11
- "Institute for Human-Centered Artificial Intelligence (HAI) "
12
- "that aims to make fundamental advances in the study, development, "
13
- "and deployment of foundation models."
14
- )
15
-
16
- @staticmethod
17
- def verify_get_tokenizer(
18
- tokenizer_name: str, expected_num_tokens: int, pretrained_model_name_or_path: Optional[str] = None
19
- ):
20
- tokenizer = HuggingFaceTokenizer.get_tokenizer(
21
- helm_tokenizer_name=tokenizer_name,
22
- pretrained_model_name_or_path=pretrained_model_name_or_path or tokenizer_name,
23
- )
24
- assert tokenizer_name in HuggingFaceTokenizer._tokenizers, "Tokenizer should be cached"
25
- assert len(tokenizer.encode(TestHuggingFaceTokenizer.TEST_PROMPT)) == expected_num_tokens
26
-
27
- def test_get_tokenizer_gpt2(self):
28
- TestHuggingFaceTokenizer.verify_get_tokenizer("huggingface/gpt2", 51, pretrained_model_name_or_path="gpt2")
29
-
30
- def test_get_tokenizer_gptj(self):
31
- TestHuggingFaceTokenizer.verify_get_tokenizer("EleutherAI/gpt-j-6B", 51)
32
-
33
- def test_get_tokenizer_gptneox(self):
34
- TestHuggingFaceTokenizer.verify_get_tokenizer("EleutherAI/gpt-neox-20b", 52)
35
-
36
- def test_get_tokenizer_bloom(self):
37
- TestHuggingFaceTokenizer.verify_get_tokenizer("bigscience/bloom", 51)
38
-
39
- def test_get_tokenizer_t0pp(self):
40
- TestHuggingFaceTokenizer.verify_get_tokenizer("bigscience/T0pp", 58)
41
-
42
- def test_get_tokenizer_t511b(self):
43
- TestHuggingFaceTokenizer.verify_get_tokenizer("google/t5-11b", 58, pretrained_model_name_or_path="t5-11b")
44
-
45
- def test_get_tokenizer_ul2(self):
46
- TestHuggingFaceTokenizer.verify_get_tokenizer("google/ul2", 58)
47
-
48
- def test_get_santacoder(self):
49
- TestHuggingFaceTokenizer.verify_get_tokenizer("bigcode/santacoder", 62)
50
-
51
- def test_gpt2_tokenize_eos(self):
52
- eos_token: str = "<|endoftext|>"
53
- tokenizer = HuggingFaceTokenizer.get_tokenizer("huggingface/gpt2", pretrained_model_name_or_path="gpt2")
54
- token_ids = tokenizer.encode(eos_token)
55
- assert singleton(token_ids) == 50256
56
- assert tokenizer.decode(token_ids) == eos_token
File without changes
File without changes
File without changes
File without changes
File without changes