crfm-helm 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff compares the contents of two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
Files changed (546)
  1. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +144 -36
  2. crfm_helm-0.5.0.dist-info/RECORD +642 -0
  3. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +37 -2
  5. helm/benchmark/adaptation/adapters/adapter.py +4 -42
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/generation_adapter.py +2 -0
  9. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +21 -4
  10. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +12 -5
  11. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +1 -0
  12. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  13. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  14. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +5 -1
  15. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +1 -0
  16. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -0
  17. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  18. helm/benchmark/adaptation/adapters/test_generation_adapter.py +59 -14
  19. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +40 -5
  20. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +78 -10
  21. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  22. helm/benchmark/adaptation/prompt.py +7 -1
  23. helm/benchmark/adaptation/request_state.py +6 -1
  24. helm/benchmark/adaptation/scenario_state.py +6 -2
  25. helm/benchmark/annotation/annotator.py +43 -0
  26. helm/benchmark/annotation/annotator_factory.py +61 -0
  27. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  28. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  29. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  30. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  31. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  32. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  33. helm/benchmark/annotation_executor.py +124 -0
  34. helm/benchmark/augmentations/cleva_perturbation.py +7 -14
  35. helm/benchmark/augmentations/contraction_expansion_perturbation.py +3 -3
  36. helm/benchmark/augmentations/contrast_sets_perturbation.py +0 -3
  37. helm/benchmark/augmentations/data_augmenter.py +0 -2
  38. helm/benchmark/augmentations/dialect_perturbation.py +2 -2
  39. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  40. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  41. helm/benchmark/augmentations/gender_perturbation.py +3 -3
  42. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  43. helm/benchmark/augmentations/mild_mix_perturbation.py +2 -2
  44. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  45. helm/benchmark/augmentations/person_name_perturbation.py +0 -7
  46. helm/benchmark/augmentations/perturbation.py +20 -7
  47. helm/benchmark/augmentations/perturbation_description.py +1 -1
  48. helm/benchmark/augmentations/space_perturbation.py +2 -2
  49. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  50. helm/benchmark/augmentations/synonym_perturbation.py +2 -2
  51. helm/benchmark/augmentations/test_perturbation.py +11 -7
  52. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  53. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  54. helm/benchmark/config_registry.py +38 -0
  55. helm/benchmark/executor.py +46 -16
  56. helm/benchmark/huggingface_registration.py +37 -7
  57. helm/benchmark/metrics/basic_metrics.py +172 -641
  58. helm/benchmark/metrics/bbq_metrics.py +3 -4
  59. helm/benchmark/metrics/bias_metrics.py +6 -6
  60. helm/benchmark/metrics/classification_metrics.py +11 -8
  61. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  62. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  63. helm/benchmark/metrics/code_metrics.py +4 -3
  64. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  65. helm/benchmark/metrics/common_metric_specs.py +167 -0
  66. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  67. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  68. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  69. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  70. helm/benchmark/metrics/disinformation_metrics.py +6 -112
  71. helm/benchmark/metrics/dry_run_metrics.py +5 -3
  72. helm/benchmark/metrics/efficiency_metrics.py +206 -0
  73. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  74. helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
  75. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  76. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  77. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  78. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  79. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  80. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  81. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  82. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  83. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  84. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  85. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  86. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  87. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  88. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  89. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  90. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  91. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  92. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  93. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  94. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  95. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  96. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  97. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  98. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  99. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  100. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  101. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  102. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  103. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  104. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  105. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  106. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  107. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  108. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  109. helm/benchmark/metrics/machine_translation_metrics.py +5 -5
  110. helm/benchmark/metrics/metric.py +93 -172
  111. helm/benchmark/metrics/metric_name.py +0 -1
  112. helm/benchmark/metrics/metric_service.py +16 -0
  113. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  114. helm/benchmark/metrics/ranking_metrics.py +6 -7
  115. helm/benchmark/metrics/reference_metric.py +148 -0
  116. helm/benchmark/metrics/summac/model_summac.py +0 -2
  117. helm/benchmark/metrics/summarization_metrics.py +8 -8
  118. helm/benchmark/metrics/test_classification_metrics.py +9 -6
  119. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  120. helm/benchmark/metrics/test_evaluate_reference_metrics.py +30 -0
  121. helm/benchmark/metrics/test_metric.py +2 -2
  122. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +1 -1
  123. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +13 -3
  124. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  125. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -0
  126. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +9 -2
  127. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  128. helm/benchmark/metrics/toxicity_utils.py +23 -0
  129. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  130. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  131. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  132. helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
  133. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  134. helm/benchmark/model_deployment_registry.py +164 -41
  135. helm/benchmark/model_metadata_registry.py +181 -35
  136. helm/benchmark/multi_gpu_runner.py +133 -0
  137. helm/benchmark/presentation/contamination.py +3 -3
  138. helm/benchmark/presentation/create_plots.py +8 -7
  139. helm/benchmark/presentation/run_display.py +50 -17
  140. helm/benchmark/presentation/schema.py +28 -46
  141. helm/benchmark/presentation/summarize.py +213 -96
  142. helm/benchmark/presentation/table.py +8 -8
  143. helm/benchmark/presentation/test_contamination.py +2 -2
  144. helm/benchmark/presentation/test_run_entry.py +14 -9
  145. helm/benchmark/presentation/test_summarize.py +5 -0
  146. helm/benchmark/run.py +66 -54
  147. helm/benchmark/run_expander.py +342 -31
  148. helm/benchmark/run_spec.py +93 -0
  149. helm/benchmark/run_spec_factory.py +162 -0
  150. helm/benchmark/run_specs/__init__.py +0 -0
  151. helm/benchmark/{run_specs.py → run_specs/classic_run_specs.py} +217 -1330
  152. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  153. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  154. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  155. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  156. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  157. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  158. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  159. helm/benchmark/run_specs/vlm_run_specs.py +501 -0
  160. helm/benchmark/runner.py +116 -69
  161. helm/benchmark/runner_config_registry.py +21 -0
  162. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  163. helm/benchmark/scenarios/bold_scenario.py +2 -2
  164. helm/benchmark/scenarios/cleva_scenario.py +43 -46
  165. helm/benchmark/scenarios/code_scenario.py +3 -2
  166. helm/benchmark/scenarios/commonsense_scenario.py +171 -191
  167. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  168. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  169. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  170. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  171. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  172. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  173. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  174. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  175. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  176. helm/benchmark/scenarios/entity_matching_scenario.py +1 -1
  177. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  178. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  179. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  180. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  181. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  182. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  183. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  184. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  185. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  186. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  187. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  188. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  189. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  190. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  191. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  192. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  193. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  194. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  195. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  196. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  197. helm/benchmark/scenarios/legalbench_scenario.py +123 -0
  198. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  199. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  200. helm/benchmark/scenarios/lsat_qa_scenario.py +4 -2
  201. helm/benchmark/scenarios/math_scenario.py +19 -2
  202. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  203. helm/benchmark/scenarios/numeracy_scenario.py +3 -3
  204. helm/benchmark/scenarios/opinions_qa_scenario.py +6 -10
  205. helm/benchmark/scenarios/raft_scenario.py +2 -6
  206. helm/benchmark/scenarios/scenario.py +14 -2
  207. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  208. helm/benchmark/scenarios/test_math_scenario.py +22 -0
  209. helm/benchmark/scenarios/test_scenario.py +6 -3
  210. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  211. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  212. helm/benchmark/scenarios/the_pile_scenario.py +6 -7
  213. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  214. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  215. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  216. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  217. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
  218. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  219. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  220. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  221. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  222. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  223. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  224. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  225. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  226. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  227. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  228. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  229. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  230. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  231. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  232. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  233. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  234. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  235. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  236. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  237. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +107 -0
  238. helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
  239. helm/benchmark/scenarios/wmt_14_scenario.py +18 -18
  240. helm/benchmark/server.py +59 -2
  241. helm/benchmark/slurm_jobs.py +12 -0
  242. helm/benchmark/slurm_runner.py +79 -51
  243. helm/benchmark/static/benchmarking.js +3 -4
  244. helm/benchmark/static/contamination.yaml +1 -1
  245. helm/benchmark/static/images/organizations/together.png +0 -0
  246. helm/benchmark/static/json-urls.js +4 -0
  247. helm/benchmark/static/{schema.yaml → schema_classic.yaml} +346 -930
  248. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  249. helm/benchmark/static/schema_lite.yaml +824 -0
  250. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  251. helm/benchmark/static/schema_unitxt.yaml +428 -0
  252. helm/benchmark/static/schema_vlm.yaml +576 -0
  253. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  254. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  255. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  256. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  257. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  258. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  259. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  260. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  261. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  262. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  263. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  264. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  265. helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
  266. helm/benchmark/static_build/assets/index-d839df55.js +9 -0
  267. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  268. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  269. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  270. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  271. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  272. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  273. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  274. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  275. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  276. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  277. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  278. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  279. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  280. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  281. helm/benchmark/static_build/config.js +4 -0
  282. helm/benchmark/static_build/index.html +20 -0
  283. helm/benchmark/test_data_preprocessor.py +3 -3
  284. helm/benchmark/test_model_deployment_definition.py +90 -0
  285. helm/benchmark/test_run_expander.py +1 -1
  286. helm/benchmark/tokenizer_config_registry.py +10 -14
  287. helm/benchmark/window_services/ai21_window_service.py +22 -33
  288. helm/benchmark/window_services/cohere_window_service.py +1 -63
  289. helm/benchmark/window_services/default_window_service.py +2 -35
  290. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  291. helm/benchmark/window_services/ice_window_service.py +0 -34
  292. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  293. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  294. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  295. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  296. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  297. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  298. helm/benchmark/window_services/local_window_service.py +21 -4
  299. helm/benchmark/window_services/no_decoding_window_service.py +32 -0
  300. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  301. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  302. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  303. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  304. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  305. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  306. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  307. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  308. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  309. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  310. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  311. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  312. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  313. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  314. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  315. helm/benchmark/window_services/test_utils.py +3 -2
  316. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  317. helm/benchmark/window_services/window_service.py +42 -0
  318. helm/benchmark/window_services/window_service_factory.py +24 -269
  319. helm/benchmark/window_services/yalm_window_service.py +0 -27
  320. helm/clients/__init__.py +0 -0
  321. helm/{proxy/clients → clients}/ai21_client.py +5 -12
  322. helm/clients/aleph_alpha_client.py +112 -0
  323. helm/{proxy/clients → clients}/anthropic_client.py +213 -24
  324. helm/clients/auto_client.py +215 -0
  325. helm/clients/bedrock_client.py +128 -0
  326. helm/clients/bedrock_utils.py +72 -0
  327. helm/{proxy/clients → clients}/client.py +67 -55
  328. helm/clients/clip_score_client.py +49 -0
  329. helm/clients/clip_scorers/__init__.py +0 -0
  330. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  331. helm/clients/clip_scorers/clip_scorer.py +50 -0
  332. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  333. helm/{proxy/clients → clients}/cohere_client.py +6 -17
  334. helm/clients/gcs_client.py +82 -0
  335. helm/{proxy/clients → clients}/google_client.py +7 -8
  336. helm/clients/google_translate_client.py +35 -0
  337. helm/{proxy/clients → clients}/http_model_client.py +6 -10
  338. helm/{proxy/clients → clients}/huggingface_client.py +134 -92
  339. helm/clients/image_generation/__init__.py +0 -0
  340. helm/clients/image_generation/adobe_vision_client.py +78 -0
  341. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  342. helm/clients/image_generation/cogview2/__init__.py +0 -0
  343. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  344. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  345. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  346. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  347. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  348. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  349. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  350. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  351. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  352. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  353. helm/clients/image_generation/cogview2_client.py +191 -0
  354. helm/clients/image_generation/dalle2_client.py +192 -0
  355. helm/clients/image_generation/dalle3_client.py +108 -0
  356. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  357. helm/clients/image_generation/dalle_mini/data.py +442 -0
  358. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  359. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  360. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  361. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  362. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  363. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  364. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  365. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  366. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  367. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  368. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  369. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  370. helm/clients/image_generation/dalle_mini_client.py +190 -0
  371. helm/clients/image_generation/deep_floyd_client.py +78 -0
  372. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  373. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  374. helm/clients/image_generation/lexica_client.py +86 -0
  375. helm/clients/image_generation/mindalle/__init__.py +0 -0
  376. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  377. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  378. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  379. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  380. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  381. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  382. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  383. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  384. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  385. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  386. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  387. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  388. helm/clients/image_generation/mindalle_client.py +115 -0
  389. helm/clients/image_generation/nudity_check_client.py +64 -0
  390. helm/clients/image_generation/together_image_generation_client.py +111 -0
  391. helm/{proxy/clients → clients}/lit_gpt_client.py +7 -5
  392. helm/{proxy/clients → clients}/megatron_client.py +13 -7
  393. helm/clients/mistral_client.py +134 -0
  394. helm/clients/moderation_api_client.py +109 -0
  395. helm/clients/open_lm_client.py +43 -0
  396. helm/clients/openai_client.py +302 -0
  397. helm/{proxy/clients → clients}/palmyra_client.py +15 -12
  398. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  399. helm/clients/simple_client.py +64 -0
  400. helm/{proxy/clients → clients}/test_auto_client.py +15 -15
  401. helm/clients/test_client.py +100 -0
  402. helm/clients/test_huggingface_client.py +70 -0
  403. helm/clients/test_simple_client.py +19 -0
  404. helm/{proxy/clients → clients}/test_together_client.py +23 -12
  405. helm/{proxy/clients → clients}/together_client.py +18 -71
  406. helm/clients/vertexai_client.py +391 -0
  407. helm/clients/vision_language/__init__.py +0 -0
  408. helm/clients/vision_language/huggingface_vlm_client.py +104 -0
  409. helm/{proxy/clients → clients}/vision_language/idefics_client.py +59 -52
  410. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  411. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  412. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  413. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  414. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  415. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  416. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  417. helm/clients/vision_language/open_flamingo_client.py +155 -0
  418. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  419. helm/clients/vllm_client.py +46 -0
  420. helm/common/cache.py +24 -179
  421. helm/common/cache_backend_config.py +47 -0
  422. helm/common/clip_score_request.py +41 -0
  423. helm/common/concurrency.py +32 -0
  424. helm/common/credentials_utils.py +28 -0
  425. helm/common/file_caches/__init__.py +0 -0
  426. helm/common/file_caches/file_cache.py +16 -0
  427. helm/common/file_caches/local_file_cache.py +61 -0
  428. helm/common/file_caches/test_local_file_cache.py +25 -0
  429. helm/common/file_upload_request.py +27 -0
  430. helm/common/general.py +29 -10
  431. helm/common/image_generation_parameters.py +25 -0
  432. helm/common/images_utils.py +24 -1
  433. helm/common/key_value_store.py +113 -0
  434. helm/common/media_object.py +13 -0
  435. helm/common/moderations_api_request.py +71 -0
  436. helm/common/mongo_key_value_store.py +88 -0
  437. helm/common/multimodal_request_utils.py +31 -0
  438. helm/common/nudity_check_request.py +29 -0
  439. helm/common/object_spec.py +2 -2
  440. helm/common/request.py +36 -27
  441. helm/common/test_general.py +6 -0
  442. helm/common/tokenization_request.py +6 -3
  443. helm/config/__init__.py +0 -0
  444. helm/config/model_deployments.yaml +1942 -0
  445. helm/config/model_metadata.yaml +2201 -0
  446. helm/config/tokenizer_configs.yaml +362 -0
  447. helm/proxy/accounts.py +31 -4
  448. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  449. helm/proxy/critique/model_critique_client.py +13 -5
  450. helm/proxy/example_queries.py +29 -17
  451. helm/proxy/retry.py +8 -2
  452. helm/proxy/server.py +77 -5
  453. helm/proxy/services/remote_service.py +31 -0
  454. helm/proxy/services/server_service.py +103 -20
  455. helm/proxy/services/service.py +34 -2
  456. helm/proxy/services/test_remote_service.py +7 -6
  457. helm/proxy/services/test_service.py +27 -18
  458. helm/proxy/test_accounts.py +32 -0
  459. helm/proxy/token_counters/auto_token_counter.py +37 -37
  460. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  461. helm/proxy/token_counters/token_counter.py +3 -5
  462. helm/py.typed +0 -0
  463. helm/tokenizers/__init__.py +0 -0
  464. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  465. helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +3 -1
  466. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +17 -11
  467. helm/tokenizers/auto_tokenizer.py +93 -0
  468. helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +8 -2
  469. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  470. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  471. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +56 -60
  472. helm/tokenizers/simple_tokenizer.py +33 -0
  473. helm/tokenizers/test_anthropic_tokenizer.py +82 -0
  474. helm/tokenizers/test_huggingface_tokenizer.py +136 -0
  475. helm/tokenizers/test_simple_tokenizer.py +33 -0
  476. helm/tokenizers/vertexai_tokenizer.py +97 -0
  477. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  478. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  479. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  480. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  481. crfm_helm-0.3.0.dist-info/RECORD +0 -396
  482. helm/benchmark/vlm_run_specs.py +0 -71
  483. helm/benchmark/window_services/anthropic_window_service.py +0 -68
  484. helm/benchmark/window_services/bloom_window_service.py +0 -35
  485. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  486. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  487. helm/benchmark/window_services/gptj_window_service.py +0 -38
  488. helm/benchmark/window_services/gptneox_window_service.py +0 -41
  489. helm/benchmark/window_services/http_model_window_service.py +0 -28
  490. helm/benchmark/window_services/huggingface_window_service.py +0 -59
  491. helm/benchmark/window_services/lit_gpt_window_service.py +0 -27
  492. helm/benchmark/window_services/llama_window_service.py +0 -28
  493. helm/benchmark/window_services/luminous_window_service.py +0 -67
  494. helm/benchmark/window_services/megatron_window_service.py +0 -10
  495. helm/benchmark/window_services/mt_nlg_window_service.py +0 -27
  496. helm/benchmark/window_services/openai_window_service.py +0 -13
  497. helm/benchmark/window_services/opt_window_service.py +0 -35
  498. helm/benchmark/window_services/palmyra_window_service.py +0 -45
  499. helm/benchmark/window_services/remote_window_service.py +0 -48
  500. helm/benchmark/window_services/santacoder_window_service.py +0 -27
  501. helm/benchmark/window_services/starcoder_window_service.py +0 -27
  502. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  503. helm/benchmark/window_services/t511b_window_service.py +0 -30
  504. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  505. helm/benchmark/window_services/ul2_window_service.py +0 -30
  506. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  507. helm/benchmark/window_services/wider_openai_window_service.py +0 -52
  508. helm/proxy/clients/aleph_alpha_client.py +0 -99
  509. helm/proxy/clients/auto_client.py +0 -461
  510. helm/proxy/clients/goose_ai_client.py +0 -100
  511. helm/proxy/clients/microsoft_client.py +0 -182
  512. helm/proxy/clients/openai_client.py +0 -206
  513. helm/proxy/clients/remote_model_registry.py +0 -28
  514. helm/proxy/clients/simple_client.py +0 -61
  515. helm/proxy/clients/test_anthropic_client.py +0 -63
  516. helm/proxy/clients/test_client.py +0 -31
  517. helm/proxy/clients/test_huggingface_client.py +0 -87
  518. helm/proxy/models.py +0 -963
  519. helm/proxy/test_models.py +0 -27
  520. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  521. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  522. helm/proxy/token_counters/free_token_counter.py +0 -12
  523. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  524. helm/proxy/token_counters/openai_token_counter.py +0 -22
  525. helm/proxy/token_counters/test_ai21_token_counter.py +0 -86
  526. helm/proxy/token_counters/test_openai_token_counter.py +0 -79
  527. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  528. helm/proxy/tokenizers/test_huggingface_tokenizer.py +0 -56
  529. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
  530. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
  531. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
  532. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  533. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  534. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  535. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  536. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  537. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  538. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  539. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  540. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  541. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  542. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  543. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  544. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  545. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  546. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.41.3)
2
+ Generator: bdist_wheel (0.43.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,6 +1,26 @@
1
1
  from dataclasses import dataclass, field
2
2
  from typing import List, Optional
3
3
 
4
+ from helm.common.image_generation_parameters import ImageGenerationParameters
5
+
6
+
7
+ # Adaptation methods
8
+ ADAPT_GENERATION: str = "generation"
9
+ ADAPT_LANGUAGE_MODELING: str = "language_modeling"
10
+ ADAPT_MULTIPLE_CHOICE_JOINT: str = "multiple_choice_joint"
11
+ ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL: str = "multiple_choice_separate_original"
12
+ ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED: str = "multiple_choice_separate_calibrated"
13
+ ADAPT_RANKING_BINARY: str = "ranking_binary"
14
+
15
+ ADAPT_MULTIPLE_CHOICE_SEPARATE_METHODS: List[str] = [
16
+ ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
17
+ ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
18
+ ]
19
+
20
+ # Multimodal adaptation methods
21
+ ADAPT_GENERATION_MULTIMODAL: str = "generation_multimodal"
22
+ ADAPT_MULTIPLE_CHOICE_JOINT_MULTIMODAL: str = "multiple_choice_joint_multimodal"
23
+
4
24
 
5
25
  @dataclass(frozen=True)
6
26
  class Substitution:
@@ -26,6 +46,9 @@ class AdapterSpec:
26
46
  # For example, it is recommended to prefix all prompts with [NLG] for UL2.
27
47
  global_prefix: str = ""
28
48
 
49
+ # Append this string to all prompts.
50
+ global_suffix: str = ""
51
+
29
52
  # Prompt starts with instructions
30
53
  instructions: str = ""
31
54
 
@@ -68,12 +91,18 @@ class AdapterSpec:
68
91
  # set of training instances. Used to compute error bars.
69
92
  num_train_trials: int = 1
70
93
 
94
+ # Number of trials, where we query the model with the same requests, but different random seeds
95
+ num_trials: int = 1
96
+
71
97
  # If true, randomly sample N training examples; if false, select N consecutive training examples
72
98
  sample_train: bool = True
73
99
 
74
100
  # Decoding parameters (inherited by `Request`)
75
101
 
76
- # Model to make the request to (need to fill in)
102
+ # Model deployment to make the request to (need to fill in)
103
+ model_deployment: str = ""
104
+
105
+ # Model to make the request to
77
106
  model: str = ""
78
107
 
79
108
  # Temperature to use
@@ -89,5 +118,11 @@ class AdapterSpec:
89
118
  random: Optional[str] = None
90
119
 
91
120
  # If true, for instances with multiple correct references, the gold answer should be considered
92
- # to be all of the correct references rather than any of the correct references.
121
+ # to be all the correct references rather than any of the correct references.
93
122
  multi_label: bool = False
123
+
124
+ # Parameters for image generation
125
+ image_generation_parameters: Optional[ImageGenerationParameters] = None
126
+
127
+ # The splits from which evaluation instances will be drawn (set hash=False to make `AdapterSpec` hashable)
128
+ eval_splits: Optional[List[str]] = field(default=None, hash=False)
@@ -1,15 +1,12 @@
1
1
  from abc import ABC, abstractmethod
2
2
  from typing import List
3
3
 
4
- import numpy as np
5
-
6
4
  from helm.benchmark.adaptation.adapter_spec import AdapterSpec
7
- from helm.benchmark.adaptation.scenario_state import ScenarioState
8
- from helm.benchmark.scenarios.scenario import Instance, TRAIN_SPLIT, EVAL_SPLITS
5
+ from helm.benchmark.adaptation.request_state import RequestState
6
+ from helm.benchmark.scenarios.scenario import Instance
9
7
  from helm.benchmark.window_services.tokenizer_service import TokenizerService
10
8
  from helm.benchmark.window_services.window_service import WindowService
11
9
  from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
12
- from helm.common.hierarchical_logger import hlog
13
10
 
14
11
 
15
12
  class Adapter(ABC):
@@ -21,48 +18,13 @@ class Adapter(ABC):
21
18
  def __init__(self, adapter_spec: AdapterSpec, tokenizer_service: TokenizerService):
22
19
  self.adapter_spec: AdapterSpec = adapter_spec
23
20
  self.window_service: WindowService = WindowServiceFactory.get_window_service(
24
- adapter_spec.model, tokenizer_service
21
+ adapter_spec.model_deployment, tokenizer_service
25
22
  )
26
23
 
27
24
  @abstractmethod
28
- def adapt(self, instances: List[Instance], parallelism: int) -> ScenarioState:
25
+ def adapt(self, instances: List[Instance], parallelism: int) -> List[RequestState]:
29
26
  """
30
27
  Takes a list of `Instance`s and returns a `ScenarioState` with the
31
28
  list of corresponding `RequestState`s.
32
29
  """
33
30
  pass
34
-
35
- def get_run_instances(self, instances: List[Instance]) -> List[Instance]:
36
- """
37
- Get the instances necessary for this run:
38
- Train instances (split=train): keep all (if any) for in-context learning
39
- Eval instances (split=valid or test): keep at most `max_eval_instances` specified in `AdapterSpec` by sampling
40
- Return the resulting train and eval instances.
41
- """
42
- all_train_instances: List[Instance] = [instance for instance in instances if instance.split == TRAIN_SPLIT]
43
-
44
- all_eval_instances: List[Instance] = [instance for instance in instances if instance.split in EVAL_SPLITS]
45
- if (
46
- self.adapter_spec.max_eval_instances is not None
47
- and len(all_eval_instances) > self.adapter_spec.max_eval_instances
48
- ):
49
- # Pick the first `self.adapter_spec.max_eval_instances`.
50
- # The random sampling includes instances monotonically.
51
- np.random.seed(0)
52
- selected_eval_instances = list(
53
- np.random.choice(
54
- all_eval_instances, # type: ignore
55
- self.adapter_spec.max_eval_instances,
56
- replace=False,
57
- )
58
- )
59
- else:
60
- selected_eval_instances = all_eval_instances
61
-
62
- hlog(
63
- f"{len(instances)} instances, "
64
- f"{len(all_train_instances)} train instances, "
65
- f"{len(selected_eval_instances)}/{len(all_eval_instances)} eval instances"
66
- )
67
-
68
- return all_train_instances + selected_eval_instances
@@ -1,31 +1,26 @@
1
- from typing import List
2
-
3
- from helm.benchmark.adaptation.adapter_spec import AdapterSpec
4
- from helm.benchmark.window_services.tokenizer_service import TokenizerService
5
- from .adapter import Adapter
6
- from .generation_adapter import GenerationAdapter
7
- from .language_modeling_adapter import LanguageModelingAdapter
8
- from .multiple_choice_joint_adapter import MultipleChoiceJointAdapter
9
- from .multiple_choice_separate_adapter import MultipleChoiceSeparateAdapter
10
- from .multiple_choice_calibrated_adapter import MultipleChoiceCalibratedAdapter
11
- from .binary_ranking_adapter import BinaryRankingAdapter
12
- from .multimodal.generation_multimodal_adapter import GenerationMultimodalAdapter
13
-
14
- # Adaptation methods
15
- ADAPT_GENERATION: str = "generation"
16
- ADAPT_LANGUAGE_MODELING: str = "language_modeling"
17
- ADAPT_MULTIPLE_CHOICE_JOINT: str = "multiple_choice_joint"
18
- ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL: str = "multiple_choice_separate_original"
19
- ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED: str = "multiple_choice_separate_calibrated"
20
- ADAPT_RANKING_BINARY: str = "ranking_binary"
21
-
22
- ADAPT_MULTIPLE_CHOICE_SEPARATE_METHODS: List[str] = [
23
- ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
1
+ from helm.benchmark.adaptation.adapter_spec import (
2
+ ADAPT_GENERATION,
3
+ ADAPT_GENERATION_MULTIMODAL,
4
+ ADAPT_LANGUAGE_MODELING,
5
+ ADAPT_MULTIPLE_CHOICE_JOINT,
6
+ ADAPT_MULTIPLE_CHOICE_JOINT_MULTIMODAL,
24
7
  ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
25
- ]
26
-
27
- # Multimodal adaptation methods
28
- ADAPT_GENERATION_MULTIMODAL: str = "generation_multimodal"
8
+ ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
9
+ ADAPT_RANKING_BINARY,
10
+ AdapterSpec,
11
+ )
12
+ from helm.benchmark.adaptation.adapters.adapter import Adapter
13
+ from helm.benchmark.adaptation.adapters.binary_ranking_adapter import BinaryRankingAdapter
14
+ from helm.benchmark.adaptation.adapters.generation_adapter import GenerationAdapter
15
+ from helm.benchmark.adaptation.adapters.language_modeling_adapter import LanguageModelingAdapter
16
+ from helm.benchmark.adaptation.adapters.multimodal.generation_multimodal_adapter import GenerationMultimodalAdapter
17
+ from helm.benchmark.adaptation.adapters.multimodal.multiple_choice_joint_multimodal_adapter import (
18
+ MultipleChoiceJointMultimodalAdapter,
19
+ )
20
+ from helm.benchmark.adaptation.adapters.multiple_choice_calibrated_adapter import MultipleChoiceCalibratedAdapter
21
+ from helm.benchmark.adaptation.adapters.multiple_choice_joint_adapter import MultipleChoiceJointAdapter
22
+ from helm.benchmark.adaptation.adapters.multiple_choice_separate_adapter import MultipleChoiceSeparateAdapter
23
+ from helm.benchmark.window_services.tokenizer_service import TokenizerService
29
24
 
30
25
 
31
26
  class AdapterFactory:
@@ -51,6 +46,8 @@ class AdapterFactory:
51
46
  adapter = BinaryRankingAdapter(adapter_spec, tokenizer_service)
52
47
  elif method == ADAPT_GENERATION_MULTIMODAL:
53
48
  adapter = GenerationMultimodalAdapter(adapter_spec, tokenizer_service)
49
+ elif method == ADAPT_MULTIPLE_CHOICE_JOINT_MULTIMODAL:
50
+ adapter = MultipleChoiceJointMultimodalAdapter(adapter_spec, tokenizer_service)
54
51
  else:
55
52
  raise ValueError(f"Invalid adaptation method: {method}")
56
53
 
@@ -50,6 +50,7 @@ class BinaryRankingAdapter(InContextLearningAdapter):
50
50
  )
51
51
  request = Request(
52
52
  model=self.adapter_spec.model,
53
+ model_deployment=self.adapter_spec.model_deployment,
53
54
  prompt=prompt.text,
54
55
  num_completions=self.adapter_spec.num_outputs,
55
56
  temperature=self.adapter_spec.temperature,
@@ -39,12 +39,14 @@ class GenerationAdapter(InContextLearningAdapter):
39
39
  )
40
40
  request = Request(
41
41
  model=self.adapter_spec.model,
42
+ model_deployment=self.adapter_spec.model_deployment,
42
43
  prompt=prompt.text,
43
44
  num_completions=self.adapter_spec.num_outputs,
44
45
  temperature=self.adapter_spec.temperature,
45
46
  max_tokens=self.adapter_spec.max_tokens,
46
47
  stop_sequences=self.adapter_spec.stop_sequences,
47
48
  random=self.adapter_spec.random,
49
+ image_generation_parameters=self.adapter_spec.image_generation_parameters,
48
50
  )
49
51
  request_state = RequestState(
50
52
  instance=eval_instance,
@@ -7,9 +7,9 @@ from typing import List, Dict, Optional
7
7
 
8
8
  from helm.benchmark.adaptation.prompt import Prompt
9
9
  from helm.benchmark.adaptation.request_state import RequestState
10
- from helm.benchmark.adaptation.scenario_state import ScenarioState
11
10
  from helm.benchmark.scenarios.scenario import Instance, TRAIN_SPLIT, EVAL_SPLITS, Reference
12
11
  from helm.common.general import parallel_map
12
+ from helm.common.request import Request
13
13
  from helm.common.hierarchical_logger import hlog, htrack, htrack_block
14
14
  from .adapter import Adapter
15
15
 
@@ -30,7 +30,7 @@ class InContextLearningAdapter(Adapter, ABC):
30
30
  pass
31
31
 
32
32
  @htrack(None)
33
- def adapt(self, instances: List[Instance], parallelism: int) -> ScenarioState:
33
+ def adapt(self, instances: List[Instance], parallelism: int) -> List[RequestState]:
34
34
  """
35
35
  Takes a list of `Instance`s and builds a list of corresponding `RequestState`s.
36
36
  The reason we don't do this per eval instance is that we create a common set of
@@ -64,7 +64,7 @@ class InContextLearningAdapter(Adapter, ABC):
64
64
  )
65
65
 
66
66
  hlog(f"{len(all_request_states)} requests")
67
- return ScenarioState(self.adapter_spec, all_request_states)
67
+ return all_request_states
68
68
 
69
69
  def _adapt_trial_index(
70
70
  self,
@@ -101,7 +101,23 @@ class InContextLearningAdapter(Adapter, ABC):
101
101
  hlog(line)
102
102
 
103
103
  # Flatten and return
104
- return [request_state for result in results for request_state in result]
104
+ all_request_states: List[RequestState] = [request_state for result in results for request_state in result]
105
+ return self._add_trials(all_request_states)
106
+
107
+ def _add_trials(self, request_states: List[RequestState]) -> List[RequestState]:
108
+ """Expand the request states by adding trials."""
109
+ if self.adapter_spec.num_trials <= 1:
110
+ return request_states
111
+
112
+ all_request_states: List[RequestState] = request_states.copy()
113
+ for i in range(1, self.adapter_spec.num_trials):
114
+ seed: str = str(i)
115
+ for request_state in request_states:
116
+ request: Request = replace(request_state.request, random=seed)
117
+ all_request_states.append(replace(request_state, request=request))
118
+
119
+ assert len(all_request_states) == len(request_states) * self.adapter_spec.num_trials
120
+ return all_request_states
105
121
 
106
122
  def sample_examples(
107
123
  self, all_train_instances: List[Instance], seed: int, sample_train: bool = True
@@ -214,6 +230,7 @@ class InContextLearningAdapter(Adapter, ABC):
214
230
  # Prompt
215
231
  prompt = Prompt(
216
232
  global_prefix=self.adapter_spec.global_prefix,
233
+ global_suffix=self.adapter_spec.global_suffix,
217
234
  instructions_block=instructions_block,
218
235
  train_instance_blocks=train_instance_blocks,
219
236
  eval_instance_block=eval_instance_block,
@@ -1,7 +1,6 @@
1
1
  from typing import List, Tuple, Optional
2
2
 
3
3
  from helm.benchmark.adaptation.request_state import RequestState
4
- from helm.benchmark.adaptation.scenario_state import ScenarioState
5
4
  from helm.benchmark.scenarios.scenario import Instance, EVAL_SPLITS
6
5
  from helm.benchmark.window_services.window_service import EncodeResult
7
6
  from helm.common.general import flatten_list, parallel_map
@@ -26,7 +25,7 @@ class LanguageModelingAdapter(Adapter):
26
25
  """
27
26
 
28
27
  @htrack(None)
29
- def adapt(self, instances: List[Instance], parallelism: int) -> ScenarioState:
28
+ def adapt(self, instances: List[Instance], parallelism: int) -> List[RequestState]:
30
29
  """
31
30
  Takes a list of `Instance`s and builds a list of corresponding `RequestState`s.
32
31
  Only requires eval instances.
@@ -34,13 +33,19 @@ class LanguageModelingAdapter(Adapter):
34
33
  # Pick out evaluation instances. This includes both valid and test splits.
35
34
  eval_instances: List[Instance] = [instance for instance in instances if instance.split in EVAL_SPLITS]
36
35
  hlog(f"{len(eval_instances)} eval instances")
37
-
36
+ # Since at least 2023-01-01, this adapter was using `instances` instead of `eval_instances`
37
+ # https://github.com/stanford-crfm/helm/commit/ac9892f7449418d32ab55843702db312b58003ed#diff-69871182494f0d9f4bc6aeea76e99c13edf0213e2c123432a63cd2024d66ffcaR39
38
+ # This assert is intended to identify run specs (if any) that had been producing incorrect results.
39
+ assert len(eval_instances) == len(instances), (
40
+ "Non-evaluation instances were passed to LanguageModelingAdapter, but LanguageModelingAdapter "
41
+ + "expects evaluation instances only. Please open a GitHub issue with your RunSpec."
42
+ )
38
43
  all_request_states: List[RequestState] = flatten_list(
39
- parallel_map(self._generate_requests, instances, parallelism)
44
+ parallel_map(self._generate_requests, eval_instances, parallelism)
40
45
  )
41
46
  hlog(f"{len(all_request_states)} requests")
42
47
 
43
- return ScenarioState(self.adapter_spec, all_request_states)
48
+ return all_request_states
44
49
 
45
50
  def _generate_requests(self, eval_instance: Instance) -> List[RequestState]:
46
51
  """
@@ -114,6 +119,7 @@ class LanguageModelingAdapter(Adapter):
114
119
  )
115
120
  request = Request(
116
121
  model=self.adapter_spec.model,
122
+ model_deployment=self.adapter_spec.model_deployment,
117
123
  prompt=prompt_text,
118
124
  num_completions=1,
119
125
  temperature=0,
@@ -162,6 +168,7 @@ class LanguageModelingAdapter(Adapter):
162
168
 
163
169
  request = Request(
164
170
  model=self.adapter_spec.model,
171
+ model_deployment=self.adapter_spec.model_deployment,
165
172
  prompt=prompt_text,
166
173
  num_completions=1,
167
174
  temperature=0,
@@ -29,6 +29,7 @@ class GenerationMultimodalAdapter(InContextLearningMultimodalAdapter):
29
29
 
30
30
  request = Request(
31
31
  model=self.adapter_spec.model,
32
+ model_deployment=self.adapter_spec.model_deployment,
32
33
  multimodal_prompt=prompt.multimedia_object,
33
34
  num_completions=self.adapter_spec.num_outputs,
34
35
  temperature=self.adapter_spec.temperature,
@@ -27,6 +27,7 @@ class InContextLearningMultimodalAdapter(InContextLearningAdapter, ABC):
27
27
 
28
28
  request = Request(
29
29
  model=self.adapter_spec.model,
30
+ model_deployment=self.adapter_spec.model_deployment,
30
31
  multimodal_prompt=prompt.multimedia_object,
31
32
  num_completions=self.adapter_spec.num_outputs,
32
33
  temperature=self.adapter_spec.temperature,
@@ -0,0 +1,104 @@
1
+ from abc import ABC
2
+ from typing import Dict, List, Optional
3
+
4
+ from helm.benchmark.adaptation.request_state import RequestState
5
+ from helm.benchmark.scenarios.scenario import Instance
6
+ from helm.common.media_object import MediaObject, MultimediaObject
7
+ from helm.common.request import Request
8
+ from helm.benchmark.adaptation.adapters.multimodal.in_context_learning_multimodal_adapter import (
9
+ InContextLearningMultimodalAdapter,
10
+ )
11
+ from .multimodal_prompt import MultimodalPrompt
12
+
13
+
14
+ class MultipleChoiceJointMultimodalAdapter(InContextLearningMultimodalAdapter, ABC):
15
+ """
16
+ An `Adapter`, guided by the `AdapterSpec`, takes a `Scenario` and produces
17
+ a `ScenarioState`. This `Adapter` has additional logic to support in-context
18
+ learning for multimodal models.
19
+ """
20
+
21
+ @staticmethod
22
+ def get_reference_prefix(prefix: str, i: int) -> str:
23
+ """
24
+ Example: prefix = "\nA. ", i = 2, return "\nC. "
25
+ """
26
+ return prefix.replace("A", chr(ord("A") + i))
27
+
28
+ def generate_requests(
29
+ self, eval_instance: Instance, train_trial_index: int, training_instances: List[Instance]
30
+ ) -> List[RequestState]:
31
+ prompt: MultimodalPrompt = self.construct_prompt(
32
+ training_instances, eval_instance, include_output=False, reference_index=None
33
+ )
34
+ output_mapping: Dict[str, str] = dict(
35
+ (self.get_reference_prefix("A", reference_index), reference.output.text)
36
+ for reference_index, reference in enumerate(eval_instance.references)
37
+ )
38
+ request = Request(
39
+ model=self.adapter_spec.model,
40
+ model_deployment=self.adapter_spec.model_deployment,
41
+ multimodal_prompt=prompt.multimedia_object,
42
+ num_completions=self.adapter_spec.num_outputs,
43
+ temperature=self.adapter_spec.temperature,
44
+ max_tokens=self.adapter_spec.max_tokens,
45
+ stop_sequences=[],
46
+ random=self.adapter_spec.random,
47
+ )
48
+ request_state = RequestState(
49
+ instance=eval_instance,
50
+ reference_index=None,
51
+ request_mode=None,
52
+ train_trial_index=train_trial_index,
53
+ output_mapping=output_mapping,
54
+ request=request,
55
+ result=None,
56
+ num_train_instances=prompt.num_train_instances,
57
+ prompt_truncated=False,
58
+ )
59
+ return [request_state]
60
+
61
+ def construct_example_multimodal_prompt(
62
+ self, instance: Instance, include_output: bool, reference_index: Optional[int]
63
+ ) -> MultimediaObject:
64
+ """
65
+ Returns a single example of the prompt. `include_output` controls whether the gold output is included.
66
+ """
67
+ # Input
68
+ assert instance.input.multimedia_content is not None
69
+ result: MultimediaObject = instance.input.multimedia_content.add_textual_prefix(self.adapter_spec.input_prefix)
70
+ result = result.add_textual_suffix(self.adapter_spec.input_suffix)
71
+
72
+ # Include the references
73
+ delimiter: str = ", "
74
+ no_correct_references: str = "n/a"
75
+ output: str = no_correct_references
76
+ for reference_index, reference in enumerate(instance.references):
77
+ prefix = self.get_reference_prefix(self.adapter_spec.reference_prefix, reference_index)
78
+
79
+ if reference.output.multimedia_content is not None:
80
+ reference_output_content: MultimediaObject = reference.output.multimedia_content
81
+ reference_output_content = reference_output_content.add_textual_prefix(prefix)
82
+ reference_output_content = reference_output_content.add_textual_suffix(
83
+ self.adapter_spec.reference_suffix
84
+ )
85
+ result = result.combine(reference_output_content)
86
+ else:
87
+ result = result.add_textual_suffix(prefix + reference.output.text + self.adapter_spec.reference_suffix)
88
+
89
+ if reference.is_correct:
90
+ if output == no_correct_references:
91
+ output = self.get_reference_prefix("A", reference_index)
92
+ elif self.adapter_spec.multi_label:
93
+ output += delimiter
94
+ output += self.get_reference_prefix("A", reference_index)
95
+
96
+ if include_output:
97
+ output_content: MultimediaObject = MultimediaObject([MediaObject(text=output, content_type="text/plain")])
98
+ output_content = output_content.add_textual_prefix(self.adapter_spec.output_prefix)
99
+ output_content = output_content.add_textual_suffix(self.adapter_spec.output_suffix)
100
+ result = result.combine(output_content)
101
+ else:
102
+ result = result.add_textual_suffix(self.adapter_spec.output_prefix.rstrip())
103
+
104
+ return result
@@ -1,6 +1,7 @@
1
1
  import shutil
2
2
  import tempfile
3
3
  import unittest
4
+ from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
4
5
 
5
6
  from helm.common.media_object import MediaObject, MultimediaObject
6
7
  from helm.benchmark.scenarios.scenario import Instance, Reference, Input, Output, TEST_SPLIT, TRAIN_SPLIT, CORRECT_TAG
@@ -14,7 +15,7 @@ from .multimodal_prompt import MultimodalPrompt
14
15
  class TestInContextLearningMultimodalAdapter(unittest.TestCase):
15
16
  def setup_method(self, _):
16
17
  self._path: str = tempfile.mkdtemp()
17
- self._tokenizer_service = get_tokenizer_service(self._path)
18
+ self._tokenizer_service = get_tokenizer_service(self._path, BlackHoleCacheBackendConfig())
18
19
 
19
20
  def teardown_method(self, _):
20
21
  shutil.rmtree(self._path)
@@ -22,6 +23,7 @@ class TestInContextLearningMultimodalAdapter(unittest.TestCase):
22
23
  def test_construct_prompt(self):
23
24
  adapter_spec: AdapterSpec = AdapterSpec(
24
25
  model="simple/model1",
26
+ model_deployment="simple/model1",
25
27
  method=ADAPT_GENERATION_MULTIMODAL,
26
28
  global_prefix="[START]",
27
29
  instructions="Please answer the following question about the images.",
@@ -91,6 +93,7 @@ class TestInContextLearningMultimodalAdapter(unittest.TestCase):
91
93
  def test_construct_prompt_multi_label(self):
92
94
  adapter_spec: AdapterSpec = AdapterSpec(
93
95
  model="simple/model1",
96
+ model_deployment="simple/model1",
94
97
  method=ADAPT_GENERATION_MULTIMODAL,
95
98
  global_prefix="[START]",
96
99
  instructions="Please answer the following question about the images.",
@@ -171,6 +174,7 @@ class TestInContextLearningMultimodalAdapter(unittest.TestCase):
171
174
  """
172
175
  adapter_spec: AdapterSpec = AdapterSpec(
173
176
  model="simple/model1",
177
+ model_deployment="simple/model1",
174
178
  method=ADAPT_GENERATION_MULTIMODAL,
175
179
  input_prefix="User: ",
176
180
  input_suffix="<end_of_utterance>",
@@ -55,6 +55,7 @@ class MultipleChoiceJointAdapter(InContextLearningAdapter):
55
55
  )
56
56
  request = Request(
57
57
  model=self.adapter_spec.model,
58
+ model_deployment=self.adapter_spec.model_deployment,
58
59
  prompt=prompt.text,
59
60
  num_completions=1,
60
61
  top_k_per_token=self.adapter_spec.num_outputs,
@@ -41,6 +41,7 @@ class MultipleChoiceSeparateAdapter(InContextLearningAdapter):
41
41
  ) -> RequestState:
42
42
  request = Request(
43
43
  model=self.adapter_spec.model,
44
+ model_deployment=self.adapter_spec.model_deployment,
44
45
  prompt=prompt.text,
45
46
  num_completions=1,
46
47
  temperature=0,
@@ -2,6 +2,7 @@ import shutil
2
2
  import tempfile
3
3
 
4
4
  from helm.common.authentication import Authentication
5
+ from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
5
6
  from helm.proxy.services.server_service import ServerService
6
7
  from helm.benchmark.window_services.tokenizer_service import TokenizerService
7
8
 
@@ -13,7 +14,7 @@ class TestAdapter:
13
14
 
14
15
  def setup_method(self):
15
16
  self.path: str = tempfile.mkdtemp()
16
- service = ServerService(base_path=self.path, root_mode=True)
17
+ service = ServerService(base_path=self.path, root_mode=True, cache_backend_config=BlackHoleCacheBackendConfig())
17
18
  self.tokenizer_service = TokenizerService(service, Authentication("test"))
18
19
 
19
20
  def teardown_method(self, _):