crfm-helm: crfm_helm-0.3.0-py3-none-any.whl → crfm_helm-0.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +144 -36
  2. crfm_helm-0.5.0.dist-info/RECORD +642 -0
  3. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +37 -2
  5. helm/benchmark/adaptation/adapters/adapter.py +4 -42
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/generation_adapter.py +2 -0
  9. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +21 -4
  10. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +12 -5
  11. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +1 -0
  12. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  13. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  14. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +5 -1
  15. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +1 -0
  16. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -0
  17. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  18. helm/benchmark/adaptation/adapters/test_generation_adapter.py +59 -14
  19. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +40 -5
  20. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +78 -10
  21. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  22. helm/benchmark/adaptation/prompt.py +7 -1
  23. helm/benchmark/adaptation/request_state.py +6 -1
  24. helm/benchmark/adaptation/scenario_state.py +6 -2
  25. helm/benchmark/annotation/annotator.py +43 -0
  26. helm/benchmark/annotation/annotator_factory.py +61 -0
  27. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  28. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  29. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  30. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  31. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  32. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  33. helm/benchmark/annotation_executor.py +124 -0
  34. helm/benchmark/augmentations/cleva_perturbation.py +7 -14
  35. helm/benchmark/augmentations/contraction_expansion_perturbation.py +3 -3
  36. helm/benchmark/augmentations/contrast_sets_perturbation.py +0 -3
  37. helm/benchmark/augmentations/data_augmenter.py +0 -2
  38. helm/benchmark/augmentations/dialect_perturbation.py +2 -2
  39. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  40. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  41. helm/benchmark/augmentations/gender_perturbation.py +3 -3
  42. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  43. helm/benchmark/augmentations/mild_mix_perturbation.py +2 -2
  44. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  45. helm/benchmark/augmentations/person_name_perturbation.py +0 -7
  46. helm/benchmark/augmentations/perturbation.py +20 -7
  47. helm/benchmark/augmentations/perturbation_description.py +1 -1
  48. helm/benchmark/augmentations/space_perturbation.py +2 -2
  49. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  50. helm/benchmark/augmentations/synonym_perturbation.py +2 -2
  51. helm/benchmark/augmentations/test_perturbation.py +11 -7
  52. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  53. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  54. helm/benchmark/config_registry.py +38 -0
  55. helm/benchmark/executor.py +46 -16
  56. helm/benchmark/huggingface_registration.py +37 -7
  57. helm/benchmark/metrics/basic_metrics.py +172 -641
  58. helm/benchmark/metrics/bbq_metrics.py +3 -4
  59. helm/benchmark/metrics/bias_metrics.py +6 -6
  60. helm/benchmark/metrics/classification_metrics.py +11 -8
  61. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  62. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  63. helm/benchmark/metrics/code_metrics.py +4 -3
  64. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  65. helm/benchmark/metrics/common_metric_specs.py +167 -0
  66. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  67. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  68. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  69. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  70. helm/benchmark/metrics/disinformation_metrics.py +6 -112
  71. helm/benchmark/metrics/dry_run_metrics.py +5 -3
  72. helm/benchmark/metrics/efficiency_metrics.py +206 -0
  73. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  74. helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
  75. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  76. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  77. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  78. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  79. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  80. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  81. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  82. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  83. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  84. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  85. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  86. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  87. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  88. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  89. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  90. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  91. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  92. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  93. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  94. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  95. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  96. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  97. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  98. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  99. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  100. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  101. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  102. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  103. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  104. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  105. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  106. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  107. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  108. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  109. helm/benchmark/metrics/machine_translation_metrics.py +5 -5
  110. helm/benchmark/metrics/metric.py +93 -172
  111. helm/benchmark/metrics/metric_name.py +0 -1
  112. helm/benchmark/metrics/metric_service.py +16 -0
  113. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  114. helm/benchmark/metrics/ranking_metrics.py +6 -7
  115. helm/benchmark/metrics/reference_metric.py +148 -0
  116. helm/benchmark/metrics/summac/model_summac.py +0 -2
  117. helm/benchmark/metrics/summarization_metrics.py +8 -8
  118. helm/benchmark/metrics/test_classification_metrics.py +9 -6
  119. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  120. helm/benchmark/metrics/test_evaluate_reference_metrics.py +30 -0
  121. helm/benchmark/metrics/test_metric.py +2 -2
  122. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +1 -1
  123. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +13 -3
  124. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  125. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -0
  126. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +9 -2
  127. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  128. helm/benchmark/metrics/toxicity_utils.py +23 -0
  129. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  130. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  131. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  132. helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
  133. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  134. helm/benchmark/model_deployment_registry.py +164 -41
  135. helm/benchmark/model_metadata_registry.py +181 -35
  136. helm/benchmark/multi_gpu_runner.py +133 -0
  137. helm/benchmark/presentation/contamination.py +3 -3
  138. helm/benchmark/presentation/create_plots.py +8 -7
  139. helm/benchmark/presentation/run_display.py +50 -17
  140. helm/benchmark/presentation/schema.py +28 -46
  141. helm/benchmark/presentation/summarize.py +213 -96
  142. helm/benchmark/presentation/table.py +8 -8
  143. helm/benchmark/presentation/test_contamination.py +2 -2
  144. helm/benchmark/presentation/test_run_entry.py +14 -9
  145. helm/benchmark/presentation/test_summarize.py +5 -0
  146. helm/benchmark/run.py +66 -54
  147. helm/benchmark/run_expander.py +342 -31
  148. helm/benchmark/run_spec.py +93 -0
  149. helm/benchmark/run_spec_factory.py +162 -0
  150. helm/benchmark/run_specs/__init__.py +0 -0
  151. helm/benchmark/{run_specs.py → run_specs/classic_run_specs.py} +217 -1330
  152. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  153. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  154. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  155. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  156. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  157. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  158. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  159. helm/benchmark/run_specs/vlm_run_specs.py +501 -0
  160. helm/benchmark/runner.py +116 -69
  161. helm/benchmark/runner_config_registry.py +21 -0
  162. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  163. helm/benchmark/scenarios/bold_scenario.py +2 -2
  164. helm/benchmark/scenarios/cleva_scenario.py +43 -46
  165. helm/benchmark/scenarios/code_scenario.py +3 -2
  166. helm/benchmark/scenarios/commonsense_scenario.py +171 -191
  167. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  168. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  169. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  170. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  171. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  172. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  173. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  174. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  175. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  176. helm/benchmark/scenarios/entity_matching_scenario.py +1 -1
  177. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  178. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  179. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  180. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  181. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  182. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  183. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  184. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  185. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  186. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  187. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  188. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  189. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  190. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  191. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  192. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  193. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  194. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  195. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  196. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  197. helm/benchmark/scenarios/legalbench_scenario.py +123 -0
  198. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  199. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  200. helm/benchmark/scenarios/lsat_qa_scenario.py +4 -2
  201. helm/benchmark/scenarios/math_scenario.py +19 -2
  202. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  203. helm/benchmark/scenarios/numeracy_scenario.py +3 -3
  204. helm/benchmark/scenarios/opinions_qa_scenario.py +6 -10
  205. helm/benchmark/scenarios/raft_scenario.py +2 -6
  206. helm/benchmark/scenarios/scenario.py +14 -2
  207. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  208. helm/benchmark/scenarios/test_math_scenario.py +22 -0
  209. helm/benchmark/scenarios/test_scenario.py +6 -3
  210. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  211. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  212. helm/benchmark/scenarios/the_pile_scenario.py +6 -7
  213. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  214. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  215. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  216. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  217. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
  218. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  219. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  220. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  221. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  222. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  223. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  224. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  225. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  226. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  227. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  228. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  229. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  230. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  231. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  232. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  233. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  234. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  235. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  236. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  237. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +107 -0
  238. helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
  239. helm/benchmark/scenarios/wmt_14_scenario.py +18 -18
  240. helm/benchmark/server.py +59 -2
  241. helm/benchmark/slurm_jobs.py +12 -0
  242. helm/benchmark/slurm_runner.py +79 -51
  243. helm/benchmark/static/benchmarking.js +3 -4
  244. helm/benchmark/static/contamination.yaml +1 -1
  245. helm/benchmark/static/images/organizations/together.png +0 -0
  246. helm/benchmark/static/json-urls.js +4 -0
  247. helm/benchmark/static/{schema.yaml → schema_classic.yaml} +346 -930
  248. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  249. helm/benchmark/static/schema_lite.yaml +824 -0
  250. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  251. helm/benchmark/static/schema_unitxt.yaml +428 -0
  252. helm/benchmark/static/schema_vlm.yaml +576 -0
  253. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  254. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  255. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  256. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  257. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  258. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  259. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  260. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  261. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  262. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  263. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  264. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  265. helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
  266. helm/benchmark/static_build/assets/index-d839df55.js +9 -0
  267. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  268. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  269. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  270. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  271. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  272. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  273. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  274. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  275. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  276. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  277. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  278. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  279. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  280. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  281. helm/benchmark/static_build/config.js +4 -0
  282. helm/benchmark/static_build/index.html +20 -0
  283. helm/benchmark/test_data_preprocessor.py +3 -3
  284. helm/benchmark/test_model_deployment_definition.py +90 -0
  285. helm/benchmark/test_run_expander.py +1 -1
  286. helm/benchmark/tokenizer_config_registry.py +10 -14
  287. helm/benchmark/window_services/ai21_window_service.py +22 -33
  288. helm/benchmark/window_services/cohere_window_service.py +1 -63
  289. helm/benchmark/window_services/default_window_service.py +2 -35
  290. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  291. helm/benchmark/window_services/ice_window_service.py +0 -34
  292. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  293. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  294. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  295. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  296. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  297. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  298. helm/benchmark/window_services/local_window_service.py +21 -4
  299. helm/benchmark/window_services/no_decoding_window_service.py +32 -0
  300. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  301. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  302. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  303. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  304. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  305. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  306. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  307. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  308. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  309. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  310. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  311. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  312. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  313. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  314. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  315. helm/benchmark/window_services/test_utils.py +3 -2
  316. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  317. helm/benchmark/window_services/window_service.py +42 -0
  318. helm/benchmark/window_services/window_service_factory.py +24 -269
  319. helm/benchmark/window_services/yalm_window_service.py +0 -27
  320. helm/clients/__init__.py +0 -0
  321. helm/{proxy/clients → clients}/ai21_client.py +5 -12
  322. helm/clients/aleph_alpha_client.py +112 -0
  323. helm/{proxy/clients → clients}/anthropic_client.py +213 -24
  324. helm/clients/auto_client.py +215 -0
  325. helm/clients/bedrock_client.py +128 -0
  326. helm/clients/bedrock_utils.py +72 -0
  327. helm/{proxy/clients → clients}/client.py +67 -55
  328. helm/clients/clip_score_client.py +49 -0
  329. helm/clients/clip_scorers/__init__.py +0 -0
  330. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  331. helm/clients/clip_scorers/clip_scorer.py +50 -0
  332. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  333. helm/{proxy/clients → clients}/cohere_client.py +6 -17
  334. helm/clients/gcs_client.py +82 -0
  335. helm/{proxy/clients → clients}/google_client.py +7 -8
  336. helm/clients/google_translate_client.py +35 -0
  337. helm/{proxy/clients → clients}/http_model_client.py +6 -10
  338. helm/{proxy/clients → clients}/huggingface_client.py +134 -92
  339. helm/clients/image_generation/__init__.py +0 -0
  340. helm/clients/image_generation/adobe_vision_client.py +78 -0
  341. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  342. helm/clients/image_generation/cogview2/__init__.py +0 -0
  343. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  344. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  345. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  346. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  347. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  348. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  349. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  350. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  351. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  352. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  353. helm/clients/image_generation/cogview2_client.py +191 -0
  354. helm/clients/image_generation/dalle2_client.py +192 -0
  355. helm/clients/image_generation/dalle3_client.py +108 -0
  356. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  357. helm/clients/image_generation/dalle_mini/data.py +442 -0
  358. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  359. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  360. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  361. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  362. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  363. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  364. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  365. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  366. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  367. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  368. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  369. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  370. helm/clients/image_generation/dalle_mini_client.py +190 -0
  371. helm/clients/image_generation/deep_floyd_client.py +78 -0
  372. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  373. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  374. helm/clients/image_generation/lexica_client.py +86 -0
  375. helm/clients/image_generation/mindalle/__init__.py +0 -0
  376. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  377. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  378. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  379. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  380. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  381. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  382. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  383. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  384. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  385. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  386. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  387. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  388. helm/clients/image_generation/mindalle_client.py +115 -0
  389. helm/clients/image_generation/nudity_check_client.py +64 -0
  390. helm/clients/image_generation/together_image_generation_client.py +111 -0
  391. helm/{proxy/clients → clients}/lit_gpt_client.py +7 -5
  392. helm/{proxy/clients → clients}/megatron_client.py +13 -7
  393. helm/clients/mistral_client.py +134 -0
  394. helm/clients/moderation_api_client.py +109 -0
  395. helm/clients/open_lm_client.py +43 -0
  396. helm/clients/openai_client.py +302 -0
  397. helm/{proxy/clients → clients}/palmyra_client.py +15 -12
  398. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  399. helm/clients/simple_client.py +64 -0
  400. helm/{proxy/clients → clients}/test_auto_client.py +15 -15
  401. helm/clients/test_client.py +100 -0
  402. helm/clients/test_huggingface_client.py +70 -0
  403. helm/clients/test_simple_client.py +19 -0
  404. helm/{proxy/clients → clients}/test_together_client.py +23 -12
  405. helm/{proxy/clients → clients}/together_client.py +18 -71
  406. helm/clients/vertexai_client.py +391 -0
  407. helm/clients/vision_language/__init__.py +0 -0
  408. helm/clients/vision_language/huggingface_vlm_client.py +104 -0
  409. helm/{proxy/clients → clients}/vision_language/idefics_client.py +59 -52
  410. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  411. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  412. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  413. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  414. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  415. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  416. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  417. helm/clients/vision_language/open_flamingo_client.py +155 -0
  418. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  419. helm/clients/vllm_client.py +46 -0
  420. helm/common/cache.py +24 -179
  421. helm/common/cache_backend_config.py +47 -0
  422. helm/common/clip_score_request.py +41 -0
  423. helm/common/concurrency.py +32 -0
  424. helm/common/credentials_utils.py +28 -0
  425. helm/common/file_caches/__init__.py +0 -0
  426. helm/common/file_caches/file_cache.py +16 -0
  427. helm/common/file_caches/local_file_cache.py +61 -0
  428. helm/common/file_caches/test_local_file_cache.py +25 -0
  429. helm/common/file_upload_request.py +27 -0
  430. helm/common/general.py +29 -10
  431. helm/common/image_generation_parameters.py +25 -0
  432. helm/common/images_utils.py +24 -1
  433. helm/common/key_value_store.py +113 -0
  434. helm/common/media_object.py +13 -0
  435. helm/common/moderations_api_request.py +71 -0
  436. helm/common/mongo_key_value_store.py +88 -0
  437. helm/common/multimodal_request_utils.py +31 -0
  438. helm/common/nudity_check_request.py +29 -0
  439. helm/common/object_spec.py +2 -2
  440. helm/common/request.py +36 -27
  441. helm/common/test_general.py +6 -0
  442. helm/common/tokenization_request.py +6 -3
  443. helm/config/__init__.py +0 -0
  444. helm/config/model_deployments.yaml +1942 -0
  445. helm/config/model_metadata.yaml +2201 -0
  446. helm/config/tokenizer_configs.yaml +362 -0
  447. helm/proxy/accounts.py +31 -4
  448. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  449. helm/proxy/critique/model_critique_client.py +13 -5
  450. helm/proxy/example_queries.py +29 -17
  451. helm/proxy/retry.py +8 -2
  452. helm/proxy/server.py +77 -5
  453. helm/proxy/services/remote_service.py +31 -0
  454. helm/proxy/services/server_service.py +103 -20
  455. helm/proxy/services/service.py +34 -2
  456. helm/proxy/services/test_remote_service.py +7 -6
  457. helm/proxy/services/test_service.py +27 -18
  458. helm/proxy/test_accounts.py +32 -0
  459. helm/proxy/token_counters/auto_token_counter.py +37 -37
  460. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  461. helm/proxy/token_counters/token_counter.py +3 -5
  462. helm/py.typed +0 -0
  463. helm/tokenizers/__init__.py +0 -0
  464. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  465. helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +3 -1
  466. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +17 -11
  467. helm/tokenizers/auto_tokenizer.py +93 -0
  468. helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +8 -2
  469. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  470. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  471. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +56 -60
  472. helm/tokenizers/simple_tokenizer.py +33 -0
  473. helm/tokenizers/test_anthropic_tokenizer.py +82 -0
  474. helm/tokenizers/test_huggingface_tokenizer.py +136 -0
  475. helm/tokenizers/test_simple_tokenizer.py +33 -0
  476. helm/tokenizers/vertexai_tokenizer.py +97 -0
  477. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  478. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  479. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  480. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  481. crfm_helm-0.3.0.dist-info/RECORD +0 -396
  482. helm/benchmark/vlm_run_specs.py +0 -71
  483. helm/benchmark/window_services/anthropic_window_service.py +0 -68
  484. helm/benchmark/window_services/bloom_window_service.py +0 -35
  485. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  486. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  487. helm/benchmark/window_services/gptj_window_service.py +0 -38
  488. helm/benchmark/window_services/gptneox_window_service.py +0 -41
  489. helm/benchmark/window_services/http_model_window_service.py +0 -28
  490. helm/benchmark/window_services/huggingface_window_service.py +0 -59
  491. helm/benchmark/window_services/lit_gpt_window_service.py +0 -27
  492. helm/benchmark/window_services/llama_window_service.py +0 -28
  493. helm/benchmark/window_services/luminous_window_service.py +0 -67
  494. helm/benchmark/window_services/megatron_window_service.py +0 -10
  495. helm/benchmark/window_services/mt_nlg_window_service.py +0 -27
  496. helm/benchmark/window_services/openai_window_service.py +0 -13
  497. helm/benchmark/window_services/opt_window_service.py +0 -35
  498. helm/benchmark/window_services/palmyra_window_service.py +0 -45
  499. helm/benchmark/window_services/remote_window_service.py +0 -48
  500. helm/benchmark/window_services/santacoder_window_service.py +0 -27
  501. helm/benchmark/window_services/starcoder_window_service.py +0 -27
  502. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  503. helm/benchmark/window_services/t511b_window_service.py +0 -30
  504. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  505. helm/benchmark/window_services/ul2_window_service.py +0 -30
  506. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  507. helm/benchmark/window_services/wider_openai_window_service.py +0 -52
  508. helm/proxy/clients/aleph_alpha_client.py +0 -99
  509. helm/proxy/clients/auto_client.py +0 -461
  510. helm/proxy/clients/goose_ai_client.py +0 -100
  511. helm/proxy/clients/microsoft_client.py +0 -182
  512. helm/proxy/clients/openai_client.py +0 -206
  513. helm/proxy/clients/remote_model_registry.py +0 -28
  514. helm/proxy/clients/simple_client.py +0 -61
  515. helm/proxy/clients/test_anthropic_client.py +0 -63
  516. helm/proxy/clients/test_client.py +0 -31
  517. helm/proxy/clients/test_huggingface_client.py +0 -87
  518. helm/proxy/models.py +0 -963
  519. helm/proxy/test_models.py +0 -27
  520. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  521. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  522. helm/proxy/token_counters/free_token_counter.py +0 -12
  523. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  524. helm/proxy/token_counters/openai_token_counter.py +0 -22
  525. helm/proxy/token_counters/test_ai21_token_counter.py +0 -86
  526. helm/proxy/token_counters/test_openai_token_counter.py +0 -79
  527. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  528. helm/proxy/tokenizers/test_huggingface_tokenizer.py +0 -56
  529. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
  530. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
  531. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
  532. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  533. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  534. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  535. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  536. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  537. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  538. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  539. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  540. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  541. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  542. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  543. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  544. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  545. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  546. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
@@ -11,28 +11,32 @@ from helm.benchmark.scenarios.scenario import (
11
11
  Input,
12
12
  Output,
13
13
  )
14
- from helm.benchmark.run_specs import get_scenario_spec1, get_adapter_spec1
14
+ from helm.benchmark.run_specs.simple_run_specs import get_simple1_spec
15
15
  from helm.benchmark.adaptation.prompt import Prompt
16
16
  from helm.benchmark.adaptation.adapter_spec import AdapterSpec
17
17
  from .adapter_factory import AdapterFactory, ADAPT_GENERATION
18
+ from .generation_adapter import GenerationAdapter
18
19
  from .test_adapter import TestAdapter
19
20
 
20
21
 
21
22
  class TestGenerationAdapter(TestAdapter):
22
23
  def test_adapt(self):
23
- scenario = create_scenario(get_scenario_spec1())
24
- adapter_spec = get_adapter_spec1()
24
+ run_spec = get_simple1_spec()
25
+ scenario = create_scenario(run_spec.scenario_spec)
26
+ adapter_spec = run_spec.adapter_spec
25
27
  adapter = AdapterFactory.get_adapter(adapter_spec, self.tokenizer_service)
26
- scenario_state = adapter.adapt(scenario.get_instances(output_path=""), parallelism=1)
28
+ instances = scenario.get_instances(output_path="")
29
+ request_states = adapter.adapt(instances, parallelism=1)
30
+ non_train_instances = [instance for instance in instances if instance.split != TRAIN_SPLIT]
27
31
 
28
32
  # Make sure we generated the right number of request_states:
29
33
  # For each trial, instance and reference (+ 1 for free-form generation).
30
- num_instances = len(scenario_state.instances)
31
- assert num_instances * adapter_spec.num_train_trials == len(scenario_state.request_states)
34
+ assert len(non_train_instances) * adapter_spec.num_train_trials == len(request_states)
32
35
 
33
36
  def test_construct_prompt(self):
34
37
  adapter_spec = AdapterSpec(
35
38
  model="openai/davinci",
39
+ model_deployment="openai/davinci",
36
40
  method=ADAPT_GENERATION,
37
41
  input_prefix="",
38
42
  input_suffix="",
@@ -59,7 +63,12 @@ class TestGenerationAdapter(TestAdapter):
59
63
 
60
64
  def test_construct_prompt_with_truncation(self):
61
65
  adapter_spec = AdapterSpec(
62
- model="openai/davinci", method=ADAPT_GENERATION, input_prefix="", output_prefix="", max_tokens=100
66
+ model="openai/davinci",
67
+ model_deployment="openai/davinci",
68
+ method=ADAPT_GENERATION,
69
+ input_prefix="",
70
+ output_prefix="",
71
+ max_tokens=100,
63
72
  )
64
73
  adapter = AdapterFactory.get_adapter(adapter_spec, self.tokenizer_service)
65
74
  correct_reference = Reference(Output(text=""), tags=[CORRECT_TAG])
@@ -80,7 +89,9 @@ class TestGenerationAdapter(TestAdapter):
80
89
  assert prompt_text.count("eval") == 1948
81
90
 
82
91
  def test_sample_examples_without_references(self):
83
- adapter_spec = AdapterSpec(method=ADAPT_GENERATION, model="openai/ada", max_train_instances=1)
92
+ adapter_spec = AdapterSpec(
93
+ method=ADAPT_GENERATION, model="openai/ada", model_deployment="openai/ada", max_train_instances=1
94
+ )
84
95
  adapter = AdapterFactory.get_adapter(adapter_spec, self.tokenizer_service)
85
96
  all_train_instances = [
86
97
  Instance(Input(text="prompt1"), references=[]),
@@ -92,7 +103,9 @@ class TestGenerationAdapter(TestAdapter):
92
103
  assert len(examples) == 1
93
104
 
94
105
  def test_sample_examples_open_ended_generation(self):
95
- adapter_spec = AdapterSpec(method=ADAPT_GENERATION, model="openai/ada", max_train_instances=3)
106
+ adapter_spec = AdapterSpec(
107
+ method=ADAPT_GENERATION, model="openai/ada", model_deployment="openai/ada", max_train_instances=3
108
+ )
96
109
  adapter = AdapterFactory.get_adapter(adapter_spec, self.tokenizer_service)
97
110
 
98
111
  all_train_instances: List[Instance] = [
@@ -106,7 +119,9 @@ class TestGenerationAdapter(TestAdapter):
106
119
  assert seed0_examples != seed1_examples, "Examples should differ when changing the seed"
107
120
 
108
121
  def test_sample_examples_open_ended_generation_stress(self):
109
- adapter_spec = AdapterSpec(method=ADAPT_GENERATION, model="openai/ada", max_train_instances=5)
122
+ adapter_spec = AdapterSpec(
123
+ method=ADAPT_GENERATION, model="openai/ada", model_deployment="openai/ada", max_train_instances=5
124
+ )
110
125
  adapter = AdapterFactory.get_adapter(adapter_spec, self.tokenizer_service)
111
126
 
112
127
  all_train_instances: List[Instance] = [
@@ -146,7 +161,11 @@ class TestGenerationAdapter(TestAdapter):
146
161
 
147
162
  def test_multiple_correct_reference(self):
148
163
  adapter_spec = AdapterSpec(
149
- method=ADAPT_GENERATION, model="openai/ada", max_train_instances=2, sample_train=False
164
+ method=ADAPT_GENERATION,
165
+ model="openai/ada",
166
+ model_deployment="openai/ada",
167
+ max_train_instances=2,
168
+ sample_train=False,
150
169
  )
151
170
  adapter = AdapterFactory.get_adapter(adapter_spec, self.tokenizer_service)
152
171
  train_instances = [
@@ -178,7 +197,7 @@ class TestGenerationAdapter(TestAdapter):
178
197
  ],
179
198
  split=TEST_SPLIT,
180
199
  )
181
- actual_instances = adapter.adapt(train_instances + [eval_instance], parallelism=1).request_states
200
+ actual_instances = adapter.adapt(train_instances + [eval_instance], parallelism=1)
182
201
  assert len(actual_instances) == 1
183
202
  assert actual_instances[0].request.prompt == (
184
203
  "Input: Second reference is correct\n"
@@ -191,7 +210,12 @@ class TestGenerationAdapter(TestAdapter):
191
210
 
192
211
  def test_multiple_correct_reference_multi_label(self):
193
212
  adapter_spec = AdapterSpec(
194
- method=ADAPT_GENERATION, model="openai/ada", max_train_instances=2, multi_label=True, sample_train=False
213
+ method=ADAPT_GENERATION,
214
+ model="openai/ada",
215
+ model_deployment="openai/ada",
216
+ max_train_instances=2,
217
+ multi_label=True,
218
+ sample_train=False,
195
219
  )
196
220
  adapter = AdapterFactory.get_adapter(adapter_spec, self.tokenizer_service)
197
221
  train_instances = [
@@ -223,7 +247,7 @@ class TestGenerationAdapter(TestAdapter):
223
247
  ],
224
248
  split=TEST_SPLIT,
225
249
  )
226
- actual_instances = adapter.adapt(train_instances + [eval_instance], parallelism=1).request_states
250
+ actual_instances = adapter.adapt(train_instances + [eval_instance], parallelism=1)
227
251
  assert len(actual_instances) == 1
228
252
  assert actual_instances[0].request.prompt == (
229
253
  "Input: Second reference is correct\n"
@@ -233,3 +257,24 @@ class TestGenerationAdapter(TestAdapter):
233
257
  "Input: First reference is correct\n"
234
258
  "Output:"
235
259
  )
260
+
261
+ def test_construct_prompt_image_generation(self):
262
+ adapter_spec = AdapterSpec(
263
+ model_deployment="openai/dall-e-2",
264
+ method=ADAPT_GENERATION,
265
+ input_prefix="",
266
+ input_suffix="",
267
+ output_prefix="",
268
+ output_suffix="",
269
+ max_train_instances=0,
270
+ num_outputs=1,
271
+ max_tokens=0,
272
+ )
273
+ adapter = AdapterFactory.get_adapter(adapter_spec, self.tokenizer_service)
274
+ assert isinstance(adapter, GenerationAdapter)
275
+
276
+ eval_instance = Instance(Input(text="a blue dog"), references=[])
277
+ prompt: Prompt = adapter.construct_prompt([], eval_instance, include_output=False, reference_index=None)
278
+
279
+ assert adapter.window_service.fits_within_context_window(prompt.text)
280
+ assert prompt.text == "a blue dog"
@@ -7,7 +7,7 @@ from helm.common.request import Request
7
7
  from helm.benchmark.adaptation.adapter_spec import AdapterSpec
8
8
  from .adapter_factory import AdapterFactory, ADAPT_LANGUAGE_MODELING
9
9
  from .test_adapter import TestAdapter
10
- from helm.benchmark.scenarios.scenario import Instance, Input, Reference
10
+ from helm.benchmark.scenarios.scenario import TEST_SPLIT, Instance, Input, Reference
11
11
 
12
12
 
13
13
  class TestLanguageModelingAdapter(TestAdapter):
@@ -16,6 +16,7 @@ class TestLanguageModelingAdapter(TestAdapter):
16
16
  method=ADAPT_LANGUAGE_MODELING,
17
17
  input_prefix="",
18
18
  model="openai/davinci",
19
+ model_deployment="openai/davinci",
19
20
  output_prefix="",
20
21
  max_tokens=0,
21
22
  )
@@ -39,6 +40,7 @@ class TestLanguageModelingAdapter(TestAdapter):
39
40
  method=ADAPT_LANGUAGE_MODELING,
40
41
  input_prefix="",
41
42
  model="openai/curie",
43
+ model_deployment="openai/curie",
42
44
  output_prefix="",
43
45
  max_tokens=0,
44
46
  )
@@ -70,6 +72,7 @@ class TestLanguageModelingAdapter(TestAdapter):
70
72
  method=ADAPT_LANGUAGE_MODELING,
71
73
  input_prefix="",
72
74
  model="anthropic/claude-v1.3",
75
+ model_deployment="anthropic/claude-v1.3",
73
76
  output_prefix="",
74
77
  max_tokens=0,
75
78
  )
@@ -81,9 +84,10 @@ class TestLanguageModelingAdapter(TestAdapter):
81
84
  instance: Instance = Instance(
82
85
  input=input_text,
83
86
  references=[reference],
87
+ split=TEST_SPLIT,
84
88
  )
85
89
  # Ensure the adapter returns the correct prompt
86
- request_states: List[RequestState] = adapter.adapt([instance], parallelism=1).request_states
90
+ request_states: List[RequestState] = adapter.adapt([instance], parallelism=1)
87
91
  request: Request = request_states[0].request
88
92
  # The prompt should be "<|endoftext|>Excuse me, do you have the time?"
89
93
  assert request.prompt == "<|endoftext|>Excuse me, do you have the time?"
@@ -93,8 +97,9 @@ class TestLanguageModelingAdapter(TestAdapter):
93
97
  instance_long: Instance = Instance(
94
98
  input=input_text_long,
95
99
  references=[reference],
100
+ split=TEST_SPLIT,
96
101
  )
97
- request_states_long: List[RequestState] = adapter.adapt([instance_long], parallelism=1).request_states
102
+ request_states_long: List[RequestState] = adapter.adapt([instance_long], parallelism=1)
98
103
  request_long: Request = request_states_long[0].request
99
104
  # Count the number of tokens of the prompt
100
105
  num_tokens = len(adapter.window_service.encode(request_long.prompt).token_values)
@@ -105,13 +110,14 @@ class TestLanguageModelingAdapter(TestAdapter):
105
110
  method=ADAPT_LANGUAGE_MODELING,
106
111
  input_prefix="",
107
112
  model="anthropic/claude-v1.3",
113
+ model_deployment="anthropic/claude-v1.3",
108
114
  output_prefix="",
109
115
  max_tokens=2000,
110
116
  )
111
117
  adapter_2 = AdapterFactory.get_adapter(adapter_spec_2_, self.tokenizer_service)
112
118
 
113
119
  # Step 2.1. Check that if the prompt is not too long, it is not truncated
114
- request_state_2: List[RequestState] = adapter_2.adapt([instance], parallelism=1).request_states
120
+ request_state_2: List[RequestState] = adapter_2.adapt([instance], parallelism=1)
115
121
  request_2: Request = request_state_2[0].request
116
122
  # The prompt should be unchanged
117
123
  assert request_2.prompt == "<|endoftext|>Excuse me, do you have the time?"
@@ -119,9 +125,38 @@ class TestLanguageModelingAdapter(TestAdapter):
119
125
 
120
126
  # Step 2.2. Check that if the prompt + max_tokens is too long, it is truncated
121
127
  # but that we keep the same number of tokens as in the previous test
122
- request_states_long_2: List[RequestState] = adapter_2.adapt([instance_long], parallelism=1).request_states
128
+ request_states_long_2: List[RequestState] = adapter_2.adapt([instance_long], parallelism=1)
123
129
  request_long_2: Request = request_states_long_2[0].request
124
130
  # Count the number of tokens of the prompt
125
131
  num_tokens_2 = len(adapter_2.window_service.encode(request_long_2.prompt).token_values)
126
132
  assert num_tokens_2 == adapter.window_service.max_sequence_and_generated_tokens_length - 2000
127
133
  assert request_long_2.max_tokens == 2000
134
+
135
+ # TODO(#1969) Determine if this behavior is actually desirable.
136
+ def test_prompt_wrapping(self):
137
+ input_tokens = 25
138
+ max_sequence_length = 10
139
+ adapter_spec = AdapterSpec(
140
+ method=ADAPT_LANGUAGE_MODELING,
141
+ input_prefix="",
142
+ model="openai/code-davinci-002",
143
+ model_deployment="openai/code-davinci-002",
144
+ output_prefix="",
145
+ max_tokens=0,
146
+ )
147
+ adapter = AdapterFactory.get_adapter(adapter_spec, self.tokenizer_service)
148
+ # Monkey patch the window service to have really short max sequences.
149
+ adapter.window_service._max_sequence_length = max_sequence_length
150
+ adapter.window_service._max_request_length = max_sequence_length + 1
151
+ input_text = Input(text=" ".join(str(i) for i in range(input_tokens)))
152
+ instance = Instance(input=input_text, references=[], split=TEST_SPLIT)
153
+
154
+ # Generate the requests
155
+ request_states: List[RequestState] = adapter.adapt([instance], parallelism=1)
156
+ # A smaller window service creates more requests
157
+ assert len(request_states) == 3
158
+ assert request_states[0].request.prompt == "<|endoftext|>0 1 2 3 4 5 6 7 8 9"
159
+ # Only the first prompt inclues the prefix_token
160
+ assert request_states[1].request.prompt == " 9 10 11 12 13 14 15 16 17 18 19"
161
+ # The last prompt includes as many conditioning_tokens as will fit
162
+ assert request_states[2].request.prompt == " 14 15 16 17 18 19 20 21 22 23 24"
@@ -1,13 +1,28 @@
1
1
  # mypy: check_untyped_defs = False
2
+ from typing import List, Set
2
3
  from helm.benchmark.scenarios.scenario import TEST_SPLIT, TRAIN_SPLIT, Instance, Input, Output, Reference, CORRECT_TAG
3
4
  from helm.benchmark.adaptation.adapter_spec import AdapterSpec
4
5
  from .adapter_factory import AdapterFactory, ADAPT_MULTIPLE_CHOICE_JOINT
5
6
  from .test_adapter import TestAdapter
6
7
 
7
8
 
9
+ def _make_instance(
10
+ text: str, reference_texts: List[str], correct_references: Set[int], is_eval: bool = False
11
+ ) -> Instance:
12
+ references = []
13
+ for i, reference_text in enumerate(reference_texts):
14
+ tags = [CORRECT_TAG] if i in correct_references else []
15
+ references.append(Reference(Output(text=reference_text), tags=tags))
16
+
17
+ split = TEST_SPLIT if is_eval else TRAIN_SPLIT
18
+ return Instance(Input(text=text), references=references, split=split)
19
+
20
+
8
21
  class TestMultipleChoiceJointAdapter(TestAdapter):
9
22
  def test_sample_examples(self):
10
- adapter_spec = AdapterSpec(method=ADAPT_MULTIPLE_CHOICE_JOINT, model="openai/ada", max_train_instances=4)
23
+ adapter_spec = AdapterSpec(
24
+ method=ADAPT_MULTIPLE_CHOICE_JOINT, model="openai/ada", model_deployment="openai/ada", max_train_instances=4
25
+ )
11
26
  adapter = AdapterFactory.get_adapter(adapter_spec, self.tokenizer_service)
12
27
  all_train_instances = [
13
28
  Instance(Input(text="say no"), references=[Reference(Output(text="no"), tags=[CORRECT_TAG])]),
@@ -27,13 +42,20 @@ class TestMultipleChoiceJointAdapter(TestAdapter):
27
42
  assert examples[3].input.text == "say yes3"
28
43
 
29
44
  def test_sample_examples_no_train_instances(self):
30
- adapter_spec = AdapterSpec(method=ADAPT_MULTIPLE_CHOICE_JOINT, model="openai/ada", max_train_instances=2)
45
+ adapter_spec = AdapterSpec(
46
+ method=ADAPT_MULTIPLE_CHOICE_JOINT, model="openai/ada", model_deployment="openai/ada", max_train_instances=2
47
+ )
31
48
  adapter = AdapterFactory.get_adapter(adapter_spec, self.tokenizer_service)
32
49
  examples = adapter.sample_examples(all_train_instances=[], seed=0)
33
50
  assert len(examples) == 0
34
51
 
35
52
  def test_sample_examples_greater_max_train_instances(self):
36
- adapter_spec = AdapterSpec(method=ADAPT_MULTIPLE_CHOICE_JOINT, model="openai/ada", max_train_instances=10)
53
+ adapter_spec = AdapterSpec(
54
+ method=ADAPT_MULTIPLE_CHOICE_JOINT,
55
+ model="openai/ada",
56
+ model_deployment="openai/ada",
57
+ max_train_instances=10,
58
+ )
37
59
  adapter = AdapterFactory.get_adapter(adapter_spec, self.tokenizer_service)
38
60
  all_train_instances = [
39
61
  Instance(Input(text="say no"), references=[Reference(Output(text="no"), tags=[CORRECT_TAG])]),
@@ -44,9 +66,54 @@ class TestMultipleChoiceJointAdapter(TestAdapter):
44
66
  examples = adapter.sample_examples(all_train_instances, seed=0)
45
67
  assert len(examples) == 3
46
68
 
69
+ def test_sample_examples_unique_labels(self):
70
+ """This is a demonstration of behavior reported in issue #2224."""
71
+ adapter_spec = AdapterSpec(
72
+ method=ADAPT_MULTIPLE_CHOICE_JOINT, model="openai/ada", model_deployment="openai/ada", max_train_instances=3
73
+ )
74
+ adapter = AdapterFactory.get_adapter(adapter_spec, self.tokenizer_service)
75
+ all_train_instances = [
76
+ # Three with 0 being correct.
77
+ _make_instance("one", ["0", "1"], correct_references={0}),
78
+ _make_instance("two", ["2", "3"], correct_references={0}),
79
+ _make_instance("three", ["4", "5"], correct_references={0}),
80
+ # Two with 1 being correct.
81
+ _make_instance("four", ["6", "7"], correct_references={1}),
82
+ _make_instance("five", ["8", "9"], correct_references={1}),
83
+ ]
84
+ eval_instance = _make_instance("eval", ["10", "11"], correct_references={1}, is_eval=True)
85
+ request_states = adapter.adapt(all_train_instances + [eval_instance], parallelism=1)
86
+ assert len(request_states) == 1
87
+ # In every case, we are showing that model that Output should be "A".
88
+ assert request_states[0].request.prompt == (
89
+ "Input: three\n"
90
+ "A. 4\n"
91
+ "B. 5\n"
92
+ "Output: A\n"
93
+ "\n"
94
+ "Input: two\n"
95
+ "A. 2\n"
96
+ "B. 3\n"
97
+ "Output: A\n"
98
+ "\n"
99
+ "Input: one\n"
100
+ "A. 0\n"
101
+ "B. 1\n"
102
+ "Output: A\n"
103
+ "\n"
104
+ "Input: eval\n"
105
+ "A. 10\n"
106
+ "B. 11\n"
107
+ "Output:"
108
+ )
109
+
47
110
  def test_multiple_correct_reference(self):
48
111
  adapter_spec = AdapterSpec(
49
- method=ADAPT_MULTIPLE_CHOICE_JOINT, model="openai/ada", max_train_instances=10, sample_train=False
112
+ method=ADAPT_MULTIPLE_CHOICE_JOINT,
113
+ model="openai/ada",
114
+ model_deployment="openai/ada",
115
+ max_train_instances=10,
116
+ sample_train=False,
50
117
  )
51
118
  adapter = AdapterFactory.get_adapter(adapter_spec, self.tokenizer_service)
52
119
  train_instances = [
@@ -78,9 +145,9 @@ class TestMultipleChoiceJointAdapter(TestAdapter):
78
145
  ],
79
146
  split=TEST_SPLIT,
80
147
  )
81
- actual_instances = adapter.adapt(train_instances + [eval_instance], parallelism=1).request_states
82
- assert len(actual_instances) == 1
83
- assert actual_instances[0].request.prompt == (
148
+ request_states = adapter.adapt(train_instances + [eval_instance], parallelism=1)
149
+ assert len(request_states) == 1
150
+ assert request_states[0].request.prompt == (
84
151
  "Input: Second reference is correct\n"
85
152
  "A. First\n"
86
153
  "B. Second\n"
@@ -102,6 +169,7 @@ class TestMultipleChoiceJointAdapter(TestAdapter):
102
169
  adapter_spec = AdapterSpec(
103
170
  method=ADAPT_MULTIPLE_CHOICE_JOINT,
104
171
  model="openai/ada",
172
+ model_deployment="openai/ada",
105
173
  max_train_instances=10,
106
174
  multi_label=True,
107
175
  sample_train=False,
@@ -136,9 +204,9 @@ class TestMultipleChoiceJointAdapter(TestAdapter):
136
204
  ],
137
205
  split=TEST_SPLIT,
138
206
  )
139
- actual_instances = adapter.adapt(train_instances + [eval_instance], parallelism=1).request_states
140
- assert len(actual_instances) == 1
141
- assert actual_instances[0].request.prompt == (
207
+ request_states = adapter.adapt(train_instances + [eval_instance], parallelism=1)
208
+ assert len(request_states) == 1
209
+ assert request_states[0].request.prompt == (
142
210
  "Input: Second reference is correct\n"
143
211
  "A. First\n"
144
212
  "B. Second\n"