crfm-helm 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +144 -36
  2. crfm_helm-0.5.0.dist-info/RECORD +642 -0
  3. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +37 -2
  5. helm/benchmark/adaptation/adapters/adapter.py +4 -42
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/generation_adapter.py +2 -0
  9. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +21 -4
  10. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +12 -5
  11. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +1 -0
  12. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  13. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  14. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +5 -1
  15. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +1 -0
  16. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -0
  17. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  18. helm/benchmark/adaptation/adapters/test_generation_adapter.py +59 -14
  19. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +40 -5
  20. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +78 -10
  21. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  22. helm/benchmark/adaptation/prompt.py +7 -1
  23. helm/benchmark/adaptation/request_state.py +6 -1
  24. helm/benchmark/adaptation/scenario_state.py +6 -2
  25. helm/benchmark/annotation/annotator.py +43 -0
  26. helm/benchmark/annotation/annotator_factory.py +61 -0
  27. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  28. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  29. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  30. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  31. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  32. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  33. helm/benchmark/annotation_executor.py +124 -0
  34. helm/benchmark/augmentations/cleva_perturbation.py +7 -14
  35. helm/benchmark/augmentations/contraction_expansion_perturbation.py +3 -3
  36. helm/benchmark/augmentations/contrast_sets_perturbation.py +0 -3
  37. helm/benchmark/augmentations/data_augmenter.py +0 -2
  38. helm/benchmark/augmentations/dialect_perturbation.py +2 -2
  39. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  40. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  41. helm/benchmark/augmentations/gender_perturbation.py +3 -3
  42. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  43. helm/benchmark/augmentations/mild_mix_perturbation.py +2 -2
  44. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  45. helm/benchmark/augmentations/person_name_perturbation.py +0 -7
  46. helm/benchmark/augmentations/perturbation.py +20 -7
  47. helm/benchmark/augmentations/perturbation_description.py +1 -1
  48. helm/benchmark/augmentations/space_perturbation.py +2 -2
  49. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  50. helm/benchmark/augmentations/synonym_perturbation.py +2 -2
  51. helm/benchmark/augmentations/test_perturbation.py +11 -7
  52. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  53. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  54. helm/benchmark/config_registry.py +38 -0
  55. helm/benchmark/executor.py +46 -16
  56. helm/benchmark/huggingface_registration.py +37 -7
  57. helm/benchmark/metrics/basic_metrics.py +172 -641
  58. helm/benchmark/metrics/bbq_metrics.py +3 -4
  59. helm/benchmark/metrics/bias_metrics.py +6 -6
  60. helm/benchmark/metrics/classification_metrics.py +11 -8
  61. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  62. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  63. helm/benchmark/metrics/code_metrics.py +4 -3
  64. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  65. helm/benchmark/metrics/common_metric_specs.py +167 -0
  66. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  67. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  68. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  69. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  70. helm/benchmark/metrics/disinformation_metrics.py +6 -112
  71. helm/benchmark/metrics/dry_run_metrics.py +5 -3
  72. helm/benchmark/metrics/efficiency_metrics.py +206 -0
  73. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  74. helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
  75. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  76. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  77. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  78. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  79. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  80. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  81. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  82. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  83. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  84. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  85. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  86. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  87. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  88. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  89. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  90. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  91. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  92. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  93. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  94. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  95. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  96. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  97. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  98. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  99. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  100. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  101. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  102. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  103. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  104. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  105. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  106. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  107. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  108. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  109. helm/benchmark/metrics/machine_translation_metrics.py +5 -5
  110. helm/benchmark/metrics/metric.py +93 -172
  111. helm/benchmark/metrics/metric_name.py +0 -1
  112. helm/benchmark/metrics/metric_service.py +16 -0
  113. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  114. helm/benchmark/metrics/ranking_metrics.py +6 -7
  115. helm/benchmark/metrics/reference_metric.py +148 -0
  116. helm/benchmark/metrics/summac/model_summac.py +0 -2
  117. helm/benchmark/metrics/summarization_metrics.py +8 -8
  118. helm/benchmark/metrics/test_classification_metrics.py +9 -6
  119. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  120. helm/benchmark/metrics/test_evaluate_reference_metrics.py +30 -0
  121. helm/benchmark/metrics/test_metric.py +2 -2
  122. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +1 -1
  123. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +13 -3
  124. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  125. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -0
  126. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +9 -2
  127. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  128. helm/benchmark/metrics/toxicity_utils.py +23 -0
  129. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  130. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  131. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  132. helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
  133. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  134. helm/benchmark/model_deployment_registry.py +164 -41
  135. helm/benchmark/model_metadata_registry.py +181 -35
  136. helm/benchmark/multi_gpu_runner.py +133 -0
  137. helm/benchmark/presentation/contamination.py +3 -3
  138. helm/benchmark/presentation/create_plots.py +8 -7
  139. helm/benchmark/presentation/run_display.py +50 -17
  140. helm/benchmark/presentation/schema.py +28 -46
  141. helm/benchmark/presentation/summarize.py +213 -96
  142. helm/benchmark/presentation/table.py +8 -8
  143. helm/benchmark/presentation/test_contamination.py +2 -2
  144. helm/benchmark/presentation/test_run_entry.py +14 -9
  145. helm/benchmark/presentation/test_summarize.py +5 -0
  146. helm/benchmark/run.py +66 -54
  147. helm/benchmark/run_expander.py +342 -31
  148. helm/benchmark/run_spec.py +93 -0
  149. helm/benchmark/run_spec_factory.py +162 -0
  150. helm/benchmark/run_specs/__init__.py +0 -0
  151. helm/benchmark/{run_specs.py → run_specs/classic_run_specs.py} +217 -1330
  152. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  153. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  154. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  155. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  156. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  157. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  158. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  159. helm/benchmark/run_specs/vlm_run_specs.py +501 -0
  160. helm/benchmark/runner.py +116 -69
  161. helm/benchmark/runner_config_registry.py +21 -0
  162. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  163. helm/benchmark/scenarios/bold_scenario.py +2 -2
  164. helm/benchmark/scenarios/cleva_scenario.py +43 -46
  165. helm/benchmark/scenarios/code_scenario.py +3 -2
  166. helm/benchmark/scenarios/commonsense_scenario.py +171 -191
  167. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  168. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  169. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  170. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  171. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  172. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  173. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  174. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  175. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  176. helm/benchmark/scenarios/entity_matching_scenario.py +1 -1
  177. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  178. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  179. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  180. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  181. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  182. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  183. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  184. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  185. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  186. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  187. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  188. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  189. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  190. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  191. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  192. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  193. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  194. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  195. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  196. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  197. helm/benchmark/scenarios/legalbench_scenario.py +123 -0
  198. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  199. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  200. helm/benchmark/scenarios/lsat_qa_scenario.py +4 -2
  201. helm/benchmark/scenarios/math_scenario.py +19 -2
  202. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  203. helm/benchmark/scenarios/numeracy_scenario.py +3 -3
  204. helm/benchmark/scenarios/opinions_qa_scenario.py +6 -10
  205. helm/benchmark/scenarios/raft_scenario.py +2 -6
  206. helm/benchmark/scenarios/scenario.py +14 -2
  207. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  208. helm/benchmark/scenarios/test_math_scenario.py +22 -0
  209. helm/benchmark/scenarios/test_scenario.py +6 -3
  210. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  211. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  212. helm/benchmark/scenarios/the_pile_scenario.py +6 -7
  213. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  214. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  215. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  216. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  217. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
  218. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  219. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  220. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  221. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  222. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  223. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  224. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  225. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  226. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  227. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  228. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  229. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  230. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  231. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  232. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  233. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  234. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  235. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  236. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  237. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +107 -0
  238. helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
  239. helm/benchmark/scenarios/wmt_14_scenario.py +18 -18
  240. helm/benchmark/server.py +59 -2
  241. helm/benchmark/slurm_jobs.py +12 -0
  242. helm/benchmark/slurm_runner.py +79 -51
  243. helm/benchmark/static/benchmarking.js +3 -4
  244. helm/benchmark/static/contamination.yaml +1 -1
  245. helm/benchmark/static/images/organizations/together.png +0 -0
  246. helm/benchmark/static/json-urls.js +4 -0
  247. helm/benchmark/static/{schema.yaml → schema_classic.yaml} +346 -930
  248. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  249. helm/benchmark/static/schema_lite.yaml +824 -0
  250. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  251. helm/benchmark/static/schema_unitxt.yaml +428 -0
  252. helm/benchmark/static/schema_vlm.yaml +576 -0
  253. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  254. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  255. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  256. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  257. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  258. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  259. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  260. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  261. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  262. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  263. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  264. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  265. helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
  266. helm/benchmark/static_build/assets/index-d839df55.js +9 -0
  267. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  268. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  269. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  270. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  271. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  272. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  273. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  274. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  275. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  276. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  277. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  278. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  279. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  280. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  281. helm/benchmark/static_build/config.js +4 -0
  282. helm/benchmark/static_build/index.html +20 -0
  283. helm/benchmark/test_data_preprocessor.py +3 -3
  284. helm/benchmark/test_model_deployment_definition.py +90 -0
  285. helm/benchmark/test_run_expander.py +1 -1
  286. helm/benchmark/tokenizer_config_registry.py +10 -14
  287. helm/benchmark/window_services/ai21_window_service.py +22 -33
  288. helm/benchmark/window_services/cohere_window_service.py +1 -63
  289. helm/benchmark/window_services/default_window_service.py +2 -35
  290. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  291. helm/benchmark/window_services/ice_window_service.py +0 -34
  292. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  293. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  294. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  295. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  296. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  297. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  298. helm/benchmark/window_services/local_window_service.py +21 -4
  299. helm/benchmark/window_services/no_decoding_window_service.py +32 -0
  300. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  301. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  302. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  303. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  304. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  305. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  306. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  307. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  308. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  309. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  310. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  311. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  312. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  313. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  314. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  315. helm/benchmark/window_services/test_utils.py +3 -2
  316. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  317. helm/benchmark/window_services/window_service.py +42 -0
  318. helm/benchmark/window_services/window_service_factory.py +24 -269
  319. helm/benchmark/window_services/yalm_window_service.py +0 -27
  320. helm/clients/__init__.py +0 -0
  321. helm/{proxy/clients → clients}/ai21_client.py +5 -12
  322. helm/clients/aleph_alpha_client.py +112 -0
  323. helm/{proxy/clients → clients}/anthropic_client.py +213 -24
  324. helm/clients/auto_client.py +215 -0
  325. helm/clients/bedrock_client.py +128 -0
  326. helm/clients/bedrock_utils.py +72 -0
  327. helm/{proxy/clients → clients}/client.py +67 -55
  328. helm/clients/clip_score_client.py +49 -0
  329. helm/clients/clip_scorers/__init__.py +0 -0
  330. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  331. helm/clients/clip_scorers/clip_scorer.py +50 -0
  332. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  333. helm/{proxy/clients → clients}/cohere_client.py +6 -17
  334. helm/clients/gcs_client.py +82 -0
  335. helm/{proxy/clients → clients}/google_client.py +7 -8
  336. helm/clients/google_translate_client.py +35 -0
  337. helm/{proxy/clients → clients}/http_model_client.py +6 -10
  338. helm/{proxy/clients → clients}/huggingface_client.py +134 -92
  339. helm/clients/image_generation/__init__.py +0 -0
  340. helm/clients/image_generation/adobe_vision_client.py +78 -0
  341. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  342. helm/clients/image_generation/cogview2/__init__.py +0 -0
  343. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  344. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  345. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  346. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  347. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  348. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  349. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  350. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  351. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  352. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  353. helm/clients/image_generation/cogview2_client.py +191 -0
  354. helm/clients/image_generation/dalle2_client.py +192 -0
  355. helm/clients/image_generation/dalle3_client.py +108 -0
  356. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  357. helm/clients/image_generation/dalle_mini/data.py +442 -0
  358. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  359. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  360. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  361. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  362. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  363. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  364. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  365. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  366. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  367. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  368. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  369. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  370. helm/clients/image_generation/dalle_mini_client.py +190 -0
  371. helm/clients/image_generation/deep_floyd_client.py +78 -0
  372. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  373. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  374. helm/clients/image_generation/lexica_client.py +86 -0
  375. helm/clients/image_generation/mindalle/__init__.py +0 -0
  376. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  377. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  378. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  379. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  380. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  381. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  382. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  383. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  384. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  385. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  386. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  387. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  388. helm/clients/image_generation/mindalle_client.py +115 -0
  389. helm/clients/image_generation/nudity_check_client.py +64 -0
  390. helm/clients/image_generation/together_image_generation_client.py +111 -0
  391. helm/{proxy/clients → clients}/lit_gpt_client.py +7 -5
  392. helm/{proxy/clients → clients}/megatron_client.py +13 -7
  393. helm/clients/mistral_client.py +134 -0
  394. helm/clients/moderation_api_client.py +109 -0
  395. helm/clients/open_lm_client.py +43 -0
  396. helm/clients/openai_client.py +302 -0
  397. helm/{proxy/clients → clients}/palmyra_client.py +15 -12
  398. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  399. helm/clients/simple_client.py +64 -0
  400. helm/{proxy/clients → clients}/test_auto_client.py +15 -15
  401. helm/clients/test_client.py +100 -0
  402. helm/clients/test_huggingface_client.py +70 -0
  403. helm/clients/test_simple_client.py +19 -0
  404. helm/{proxy/clients → clients}/test_together_client.py +23 -12
  405. helm/{proxy/clients → clients}/together_client.py +18 -71
  406. helm/clients/vertexai_client.py +391 -0
  407. helm/clients/vision_language/__init__.py +0 -0
  408. helm/clients/vision_language/huggingface_vlm_client.py +104 -0
  409. helm/{proxy/clients → clients}/vision_language/idefics_client.py +59 -52
  410. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  411. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  412. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  413. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  414. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  415. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  416. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  417. helm/clients/vision_language/open_flamingo_client.py +155 -0
  418. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  419. helm/clients/vllm_client.py +46 -0
  420. helm/common/cache.py +24 -179
  421. helm/common/cache_backend_config.py +47 -0
  422. helm/common/clip_score_request.py +41 -0
  423. helm/common/concurrency.py +32 -0
  424. helm/common/credentials_utils.py +28 -0
  425. helm/common/file_caches/__init__.py +0 -0
  426. helm/common/file_caches/file_cache.py +16 -0
  427. helm/common/file_caches/local_file_cache.py +61 -0
  428. helm/common/file_caches/test_local_file_cache.py +25 -0
  429. helm/common/file_upload_request.py +27 -0
  430. helm/common/general.py +29 -10
  431. helm/common/image_generation_parameters.py +25 -0
  432. helm/common/images_utils.py +24 -1
  433. helm/common/key_value_store.py +113 -0
  434. helm/common/media_object.py +13 -0
  435. helm/common/moderations_api_request.py +71 -0
  436. helm/common/mongo_key_value_store.py +88 -0
  437. helm/common/multimodal_request_utils.py +31 -0
  438. helm/common/nudity_check_request.py +29 -0
  439. helm/common/object_spec.py +2 -2
  440. helm/common/request.py +36 -27
  441. helm/common/test_general.py +6 -0
  442. helm/common/tokenization_request.py +6 -3
  443. helm/config/__init__.py +0 -0
  444. helm/config/model_deployments.yaml +1942 -0
  445. helm/config/model_metadata.yaml +2201 -0
  446. helm/config/tokenizer_configs.yaml +362 -0
  447. helm/proxy/accounts.py +31 -4
  448. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  449. helm/proxy/critique/model_critique_client.py +13 -5
  450. helm/proxy/example_queries.py +29 -17
  451. helm/proxy/retry.py +8 -2
  452. helm/proxy/server.py +77 -5
  453. helm/proxy/services/remote_service.py +31 -0
  454. helm/proxy/services/server_service.py +103 -20
  455. helm/proxy/services/service.py +34 -2
  456. helm/proxy/services/test_remote_service.py +7 -6
  457. helm/proxy/services/test_service.py +27 -18
  458. helm/proxy/test_accounts.py +32 -0
  459. helm/proxy/token_counters/auto_token_counter.py +37 -37
  460. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  461. helm/proxy/token_counters/token_counter.py +3 -5
  462. helm/py.typed +0 -0
  463. helm/tokenizers/__init__.py +0 -0
  464. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  465. helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +3 -1
  466. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +17 -11
  467. helm/tokenizers/auto_tokenizer.py +93 -0
  468. helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +8 -2
  469. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  470. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  471. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +56 -60
  472. helm/tokenizers/simple_tokenizer.py +33 -0
  473. helm/tokenizers/test_anthropic_tokenizer.py +82 -0
  474. helm/tokenizers/test_huggingface_tokenizer.py +136 -0
  475. helm/tokenizers/test_simple_tokenizer.py +33 -0
  476. helm/tokenizers/vertexai_tokenizer.py +97 -0
  477. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  478. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  479. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  480. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  481. crfm_helm-0.3.0.dist-info/RECORD +0 -396
  482. helm/benchmark/vlm_run_specs.py +0 -71
  483. helm/benchmark/window_services/anthropic_window_service.py +0 -68
  484. helm/benchmark/window_services/bloom_window_service.py +0 -35
  485. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  486. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  487. helm/benchmark/window_services/gptj_window_service.py +0 -38
  488. helm/benchmark/window_services/gptneox_window_service.py +0 -41
  489. helm/benchmark/window_services/http_model_window_service.py +0 -28
  490. helm/benchmark/window_services/huggingface_window_service.py +0 -59
  491. helm/benchmark/window_services/lit_gpt_window_service.py +0 -27
  492. helm/benchmark/window_services/llama_window_service.py +0 -28
  493. helm/benchmark/window_services/luminous_window_service.py +0 -67
  494. helm/benchmark/window_services/megatron_window_service.py +0 -10
  495. helm/benchmark/window_services/mt_nlg_window_service.py +0 -27
  496. helm/benchmark/window_services/openai_window_service.py +0 -13
  497. helm/benchmark/window_services/opt_window_service.py +0 -35
  498. helm/benchmark/window_services/palmyra_window_service.py +0 -45
  499. helm/benchmark/window_services/remote_window_service.py +0 -48
  500. helm/benchmark/window_services/santacoder_window_service.py +0 -27
  501. helm/benchmark/window_services/starcoder_window_service.py +0 -27
  502. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  503. helm/benchmark/window_services/t511b_window_service.py +0 -30
  504. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  505. helm/benchmark/window_services/ul2_window_service.py +0 -30
  506. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  507. helm/benchmark/window_services/wider_openai_window_service.py +0 -52
  508. helm/proxy/clients/aleph_alpha_client.py +0 -99
  509. helm/proxy/clients/auto_client.py +0 -461
  510. helm/proxy/clients/goose_ai_client.py +0 -100
  511. helm/proxy/clients/microsoft_client.py +0 -182
  512. helm/proxy/clients/openai_client.py +0 -206
  513. helm/proxy/clients/remote_model_registry.py +0 -28
  514. helm/proxy/clients/simple_client.py +0 -61
  515. helm/proxy/clients/test_anthropic_client.py +0 -63
  516. helm/proxy/clients/test_client.py +0 -31
  517. helm/proxy/clients/test_huggingface_client.py +0 -87
  518. helm/proxy/models.py +0 -963
  519. helm/proxy/test_models.py +0 -27
  520. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  521. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  522. helm/proxy/token_counters/free_token_counter.py +0 -12
  523. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  524. helm/proxy/token_counters/openai_token_counter.py +0 -22
  525. helm/proxy/token_counters/test_ai21_token_counter.py +0 -86
  526. helm/proxy/token_counters/test_openai_token_counter.py +0 -79
  527. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  528. helm/proxy/tokenizers/test_huggingface_tokenizer.py +0 -56
  529. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
  530. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
  531. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
  532. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  533. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  534. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  535. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  536. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  537. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  538. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  539. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  540. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  541. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  542. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  543. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  544. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  545. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  546. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
helm/benchmark/runner.py CHANGED
@@ -6,70 +6,69 @@ import traceback
 import typing
 from collections import Counter
 import dataclasses
-from dataclasses import dataclass, field
 from typing import Any, Dict, List
+import numpy as np
 
 from tqdm import tqdm
 
+from helm.benchmark.adaptation.request_state import RequestState
 from helm.common.general import ensure_directory_exists, write, asdict_without_nones
 from helm.common.hierarchical_logger import hlog, htrack_block
 from helm.common.cache import cache_stats
-from .augmentations.data_augmenter import DataAugmenterSpec
-from .scenarios.scenario import Scenario, ScenarioSpec, create_scenario, Instance, with_instance_ids
-from .adaptation.adapters.adapter import Adapter
-from .adaptation.adapters.adapter_factory import AdapterFactory
-from .adaptation.scenario_state import ScenarioState
-from .adaptation.adapter_spec import AdapterSpec
-from .data_preprocessor import DataPreprocessor
-from .executor import ExecutionSpec, Executor
-from .metrics.dry_run_metrics import DryRunMetric
-from .metrics.metric_name import MetricName
-from .metrics.metric_service import MetricService
-from .metrics.metric import Metric, MetricSpec, MetricResult, PerInstanceStats, create_metric, Stat
-from .window_services.tokenizer_service import TokenizerService
+from helm.benchmark.scenarios.scenario import (
+    EVAL_SPLITS,
+    TRAIN_SPLIT,
+    Scenario,
+    create_scenario,
+    Instance,
+    get_scenario_cache_path,
+    with_instance_ids,
+)
+from helm.benchmark.adaptation.adapters.adapter import Adapter
+from helm.benchmark.adaptation.adapters.adapter_factory import AdapterFactory
+from helm.benchmark.adaptation.scenario_state import ScenarioState
+from helm.benchmark.run_spec import RunSpec
+from helm.benchmark.data_preprocessor import DataPreprocessor
+from helm.benchmark.executor import ExecutionSpec, Executor
+from helm.benchmark.annotation_executor import AnnotationExecutionSpec, AnnotationExecutor
+from helm.benchmark.metrics.dry_run_metrics import DryRunMetric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.metric import MetricInterface, MetricResult, PerInstanceStats, create_metric, Stat
+from helm.benchmark.window_services.tokenizer_service import TokenizerService
 
 
 LATEST_SYMLINK: str = "latest"
+_BENCHMARK_OUTPUT_PATH: str = "benchmark_output"
+_CACHED_MODELS_FOLDER: str = "models"
 
 
-class RunnerError(Exception):
-    """Error that happens in the Runner."""
-
-    pass
-
+def get_benchmark_output_path() -> str:
+    """Get the benchmark output path.
 
-@dataclass(frozen=True)
-class RunSpec:
-    """
-    Specifies how to do a single run, which gets a scenario, adapts it, and
-    computes a list of stats based on the defined metrics.
-    """
+    Many run spec functions need to know the benchmark output path,
+    but there is no way to pass it via the run spec function,
+    so instead the run spec function should read this global variable."""
+    return _BENCHMARK_OUTPUT_PATH
 
-    # Unique identifier of the RunSpec
-    name: str
 
-    # Which scenario
-    scenario_spec: ScenarioSpec
+def get_cached_models_path() -> str:
+    """Get the cached models path within the benchmark output path."""
+    path: str = os.path.join(get_benchmark_output_path(), _CACHED_MODELS_FOLDER)
+    ensure_directory_exists(path)
+    return path
 
-    # Specifies how to adapt an instance into a set of requests
-    adapter_spec: AdapterSpec
 
-    # What to evaluate on
-    metric_specs: List[MetricSpec]
+def set_benchmark_output_path(benchmark_output_path: str) -> None:
+    """Set the benchmark output path."""
+    global _BENCHMARK_OUTPUT_PATH
+    _BENCHMARK_OUTPUT_PATH = benchmark_output_path
 
-    # Data augmenter. The default `DataAugmenterSpec` does nothing.
-    data_augmenter_spec: DataAugmenterSpec = DataAugmenterSpec()
 
-    # Groups that this run spec belongs to (for aggregation)
-    groups: List[str] = field(default_factory=list)
+class RunnerError(Exception):
+    """Error that happens in the Runner."""
 
-    def __post_init__(self):
-        """
-        `self.name` is used as the name of the output folder for the `RunSpec`.
-        Clean up `self.name` by replacing any "/"'s with "_".
-        """
-        # TODO: Don't mutate name! clean this up before passing it into the constructor here
-        object.__setattr__(self, "name", self.name.replace(os.path.sep, "_"))
+    pass
 
 
 def remove_stats_nans(stats: List[Stat]) -> List[Stat]:
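The RunSpec dataclass that used to be defined here now lives in helm/benchmark/run_spec.py, and runner.py instead exposes the benchmark output path through module-level helpers. A minimal sketch of how those helpers compose; the path value is illustrative, and in practice the CLI entry point sets it once at startup:

    import os

    from helm.benchmark.runner import (
        get_benchmark_output_path,
        get_cached_models_path,
        set_benchmark_output_path,
    )

    set_benchmark_output_path("my_output")  # illustrative; normally set once from CLI flags
    assert get_benchmark_output_path() == "my_output"
    # Joins the output path with the "models" folder, creating the directory if missing.
    assert get_cached_models_path() == os.path.join("my_output", "models")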
@@ -103,6 +102,40 @@ def remove_per_instance_stats_nans(per_instance_stats_list: List[PerInstanceStat
     return result
 
 
+def downsample_eval_instances(
+    instances: List[Instance], max_eval_instances: int, eval_splits: List[str]
+) -> List[Instance]:
+    """
+    Get the instances necessary for this run:
+    Train instances (split=train): keep all (if any) for in-context learning
+    Eval instances (split=valid or test): keep at most `max_eval_instances` specified in `AdapterSpec` by sampling
+    Return the resulting train and eval instances.
+    """
+    all_train_instances: List[Instance] = [instance for instance in instances if instance.split == TRAIN_SPLIT]
+
+    all_eval_instances: List[Instance] = [instance for instance in instances if instance.split in eval_splits]
+    if len(all_eval_instances) > max_eval_instances:
+        # The random sampling includes instances monotonically.
+        np.random.seed(0)
+        selected_eval_instances = list(
+            np.random.choice(
+                all_eval_instances,  # type: ignore
+                max_eval_instances,
+                replace=False,
+            )
+        )
+    else:
+        selected_eval_instances = all_eval_instances
+
+    hlog(
+        f"{len(instances)} instances, "
+        f"{len(all_train_instances)} train instances, "
+        f"{len(selected_eval_instances)}/{len(all_eval_instances)} eval instances"
+    )
+
+    return all_train_instances + selected_eval_instances
+
+
 class Runner:
     """
     The main entry point for running the entire benchmark. Mostly just
@@ -121,6 +154,15 @@ class Runner:
         exit_on_error: bool,
     ):
         self.executor = Executor(execution_spec)
+        self.annotator_executor = AnnotationExecutor(
+            AnnotationExecutionSpec(
+                local_path=execution_spec.local_path if execution_spec.local_path is not None else "",
+                parallelism=execution_spec.parallelism,
+                dry_run=execution_spec.dry_run,
+                sqlite_cache_backend_config=execution_spec.sqlite_cache_backend_config,
+                mongo_cache_backend_config=execution_spec.mongo_cache_backend_config,
+            )
+        )
         self.dry_run: bool = execution_spec.dry_run
         self.tokenizer_service = TokenizerService(self.executor.service, execution_spec.auth)
         self.metric_service = MetricService(self.executor.service, execution_spec.auth)
@@ -131,9 +173,8 @@ class Runner:
         self.exit_on_error: bool = exit_on_error
 
         ensure_directory_exists(output_path)
-        # Decide where to save the raw data (e.g., "output/scenarios/mmlu").
-        self.scenarios_path: str = os.path.join(output_path, "scenarios")
-        ensure_directory_exists(self.scenarios_path)
+        self.output_path = output_path
+
         # Decide where to save input instances
         self.instances_path: str = os.path.join(output_path, "scenario_instances")
         ensure_directory_exists(self.instances_path)
@@ -145,11 +186,13 @@ class Runner:
         self.eval_cache_path: str = os.path.join(self.runs_path, "eval_cache")
         ensure_directory_exists(self.eval_cache_path)
 
-    def _is_run_completed(self, run_spec: RunSpec):
+    def _get_run_path(self, run_spec: RunSpec) -> str:
+        return os.path.join(self.runs_path, run_spec.name)
+
+    def _is_run_completed(self, run_path: str):
         """Return whether the run was previously completed.
 
         A run is completed if all of the expected output files exist."""
-        run_path: str = os.path.join(self.runs_path, run_spec.name)
         if not os.path.isdir(run_path):
             return False
         output_paths = [
@@ -182,31 +225,21 @@ class Runner:
             raise RunnerError(f"Failed runs: [{failed_runs_str}]")
 
     def run_one(self, run_spec: RunSpec):
+        run_path: str = self._get_run_path(run_spec)
+        if self.skip_completed_runs and self._is_run_completed(run_path):
+            hlog(f"Skipping run {run_spec.name} because run is completed and all output files exist.")
+            return
+        ensure_directory_exists(run_path)
+
         # Load the scenario
         scenario: Scenario = create_scenario(run_spec.scenario_spec)
 
-        # This `output_path` will be used when `Adapter` calls `Scenario.get_instances`.
-        scenario_output_path = os.path.join(self.scenarios_path, scenario.name)
-        ensure_directory_exists(scenario_output_path)
-
         # This 'output_path' will be used when the model's input instances are saved.
         args_str = ",".join([f"{k}={v}" for k, v in sorted(run_spec.scenario_spec.args.items())])
         scenario_name_with_args = f"{scenario.name}:{args_str}" if args_str else f"{scenario.name}"
         input_instances_output_path = os.path.join(self.instances_path, scenario_name_with_args)
        input_instances_file_path = os.path.join(input_instances_output_path, "input_instances.json")
 
-        run_path: str = os.path.join(self.runs_path, run_spec.name)
-        ensure_directory_exists(run_path)
-
-        if self.skip_completed_runs and self._is_run_completed(run_spec):
-            # If scenario_state.json exists, assume that all other output files exist
-            # because scenario_state.json is the last output file to be written.
-            hlog(f"Skipping run {run_spec.name} because run is completed and all output files exist.")
-            return
-
-        # Fetch and initialize the Adapter based on the `AdapterSpec`.
-        adapter: Adapter = AdapterFactory.get_adapter(run_spec.adapter_spec, self.tokenizer_service)
-
         instances: List[Instance]
         if self.skip_instances:
             instances = []
@@ -217,6 +250,7 @@ class Runner:
                instances = [dacite.from_dict(Instance, instance) for instance in json_instances]
            else:
                # Create the instances of the scenario
+                scenario_output_path = get_scenario_cache_path(self.output_path, scenario.name)
                with htrack_block("scenario.get_instances"):
                    instances = scenario.get_instances(scenario_output_path)
            if self.cache_instances and not os.path.exists(input_instances_file_path):
@@ -230,10 +264,14 @@ class Runner:
            return  # Exit after saving the instances.
 
        # Give each instance a unique ID
-        instances = with_instance_ids(instances)
+        if any([instance.id is None for instance in instances]):
+            instances = with_instance_ids(instances)
 
        # Get the instances necessary for this run.
-        instances = adapter.get_run_instances(instances)
+        max_eval_instances = run_spec.adapter_spec.max_eval_instances
+        eval_splits = run_spec.adapter_spec.eval_splits or EVAL_SPLITS
+        if max_eval_instances is not None:
+            instances = downsample_eval_instances(instances, max_eval_instances, eval_splits)
 
        # Data preprocessing
        instances = DataPreprocessor(run_spec.data_augmenter_spec).preprocess(
@@ -241,15 +279,24 @@ class Runner:
        )
 
        # Adapt (convert to requests)
-        scenario_state: ScenarioState = adapter.adapt(instances, self.executor.execution_spec.parallelism)
+        adapter: Adapter = AdapterFactory.get_adapter(run_spec.adapter_spec, self.tokenizer_service)
+        request_states: List[RequestState] = adapter.adapt(instances, self.executor.execution_spec.parallelism)
+        scenario_state: ScenarioState = ScenarioState(
+            adapter_spec=run_spec.adapter_spec,
+            request_states=request_states,
+            annotator_specs=run_spec.annotators,
+        )
 
        # Execute (fill up results)
        scenario_state = self.executor.execute(scenario_state)
 
+        # Annotate (post-process the results)
+        scenario_state = self.annotator_executor.execute(scenario_state)
+
        # Apply the metrics
        # When performing a dry run, only estimate the number of tokens instead
        # of calculating the metrics.
-        metrics: List[Metric] = (
+        metrics: List[MetricInterface] = (
            [DryRunMetric()] if self.dry_run else [create_metric(metric_spec) for metric_spec in run_spec.metric_specs]
        )
        stats: List[Stat] = []
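The eval-instance selection that previously happened inside Adapter.get_run_instances is now the free function downsample_eval_instances shown above. A small usage sketch (the Instance fields follow the scenario module; the counts are illustrative):

    from helm.benchmark.runner import downsample_eval_instances
    from helm.benchmark.scenarios.scenario import TEST_SPLIT, Input, Instance

    instances = [
        Instance(input=Input(text=f"question {i}"), references=[], split=TEST_SPLIT)
        for i in range(100)
    ]
    # Train instances are always kept; eval instances are sampled without
    # replacement under a fixed seed (0), so reruns select the same subset.
    sampled = downsample_eval_instances(instances, max_eval_instances=10, eval_splits=[TEST_SPLIT])
    assert len(sampled) == 10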
helm/benchmark/runner_config_registry.py ADDED
@@ -0,0 +1,21 @@
+from dataclasses import dataclass
+from typing import Any, Dict, Optional
+import dacite
+import yaml
+
+
+@dataclass
+class RunnerConfigSpec:
+    helm_max_concurrent_workers: int = -1
+    slurm_monitor_interval: int = 60
+    slurm_args: Optional[Dict[str, Any]] = None
+
+
+RUNNER_CONFIG = RunnerConfigSpec()
+
+
+def register_runner_config_from_path(dir_path: str) -> None:
+    global RUNNER_CONFIG
+    with open(dir_path, "r") as f:
+        raw = yaml.safe_load(f)
+    RUNNER_CONFIG = dacite.from_dict(RunnerConfigSpec, raw)
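The registry populates RunnerConfigSpec from a YAML file whose keys mirror the dataclass fields. A sketch of loading one, with a hypothetical config file; note that dir_path is opened as a file despite its name, and that the global should be read through the module, since a from-import would keep a stale binding after the reload:

    # Hypothetical runner_config.yaml:
    #   helm_max_concurrent_workers: 8
    #   slurm_monitor_interval: 30
    #   slurm_args:
    #     partition: gpu
    import helm.benchmark.runner_config_registry as runner_config_registry

    runner_config_registry.register_runner_config_from_path("runner_config.yaml")
    print(runner_config_registry.RUNNER_CONFIG.helm_max_concurrent_workers)  # -> 8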
helm/benchmark/scenarios/bbq_scenario.py CHANGED
@@ -123,7 +123,7 @@ class BBQScenario(Scenario):
         # trying to book a cab on Uber.", "question": "Who was not comfortable using the phone?", "ans0":
         # "The grandfather", "ans1": "The grandson", "ans2": "Can't be determined", "label": 2}
 
-        for (idx, (loaded_line, category)) in enumerate(loaded_lines):
+        for idx, (loaded_line, category) in enumerate(loaded_lines):
             is_negative: bool = loaded_line["question_polarity"] == "neg"
             is_ambiguous: bool = loaded_line["context_condition"] == "ambig"
             context: str = loaded_line["context"]
helm/benchmark/scenarios/bold_scenario.py CHANGED
@@ -98,7 +98,7 @@ class BOLDScenario(Scenario):
         # with https://raw.githubusercontent.com/amazon-
         # research/bold/main/prompts/political_ideology_prompt.json as an example:
         # {"left-wing": {"left-wing_politics": }}
-        for (prompt_category, prompt_topic) in prompt_topics_dict.items():
+        for prompt_category, prompt_topic in prompt_topics_dict.items():
             for prompt_text in prompt_topic.values():
                 prompt_text = prompt_text[0]  # stored as a list containing a single string
                 prompt_tuples.append((prompt_category, prompt_text))
@@ -108,7 +108,7 @@ class BOLDScenario(Scenario):
 
         split_sizes = {"train": len(instances) - DEFAULT_TEST_SIZE, "test": DEFAULT_TEST_SIZE}
 
-        for (idx, prompt_tuple) in enumerate(prompt_tuples):
+        for idx, prompt_tuple in enumerate(prompt_tuples):
             prompt_category, prompt_text = prompt_tuple
             curr_split = TRAIN_SPLIT
 
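The BBQ and BOLD hunks above are the same mechanical cleanup: parentheses around a for-loop unpacking target are redundant in Python, so both forms below behave identically:

    pairs = [("a", 1), ("b", 2)]

    # Old style: parenthesized target.
    for (idx, (name, value)) in enumerate(pairs):
        pass
    # New style: bare target; same semantics, less noise.
    for idx, (name, value) in enumerate(pairs):
        pass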
helm/benchmark/scenarios/cleva_scenario.py CHANGED
@@ -10,14 +10,29 @@ from helm.benchmark.adaptation.adapters.adapter_factory import (
     ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
     ADAPT_GENERATION,
 )
-from helm.common.general import ensure_file_downloaded, ensure_directory_exists
+from helm.benchmark.runner import get_benchmark_output_path
+from helm.common.general import (
+    assert_is_str,
+    assert_is_str_list,
+    ensure_file_downloaded,
+    ensure_directory_exists,
+)
 from helm.common.hierarchical_logger import hlog
-from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, TEST_SPLIT, CORRECT_TAG, Input, Output
+from .scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+    get_scenario_cache_path,
+)
 from .code_scenario import CodeReference, CodeInstance
 
 
 CLEVA_DATA_URL = "http://39.108.215.175/data"
-CLEVA_DATA_PATH = "benchmark_output/scenarios/cleva"
 
 
 @dataclass(frozen=True)
@@ -69,26 +84,17 @@ class Converter:
         """Convert a data point in CLEVA format to a HELM instance according to a given CLEVA prompt template."""
         transformed_data = self._apply_all(copy.deepcopy(data), templates)
 
-        prompt: str = transformed_data["input"]  # type: ignore
-        assert isinstance(prompt, str)
+        prompt = assert_is_str(transformed_data["input"])
         if "choices" in transformed_data:
             # This is a multiple-choice task
-            choices: List[str] = transformed_data["choices"]  # type: ignore
-            # Guarantee `choices` must be `List[str]`
-            assert isinstance(choices, list)
-            for c in choices:
-                assert isinstance(c, str)
+            choices = assert_is_str_list(transformed_data["choices"])
             references: List[Reference] = [
                 Reference(Output(text=text), tags=[CORRECT_TAG] if idx in transformed_data["label"] else [])
                 for idx, text in enumerate(choices)
             ]
         else:
             # This is a generation task
-            correct_answer: List[str] = transformed_data["label"]  # type: ignore
-            # Guarantee `label` must be `List[str]`
-            assert isinstance(correct_answer, list)
-            for a in correct_answer:
-                assert isinstance(a, str)
+            correct_answer = assert_is_str_list(transformed_data["label"])
             references = [Reference(Output(text=answer), tags=[CORRECT_TAG]) for answer in correct_answer]
 
         instance = Instance(
@@ -109,15 +115,12 @@ class Converter:
         to a HELM CodeInstance according to a given CLEVA prompt template.
         """
 
-        assert isinstance(templates["input"], str)
-        data["prompt"] = templates["input"].format(**data)
-        assert isinstance(data["prompt"], str)
-        assert isinstance(data["canonical_solution"], str)
+        data["prompt"] = assert_is_str(templates["input"]).format(**data)
         instance = CodeInstance(
-            input=Input(text=data["prompt"]),
+            input=Input(text=assert_is_str(data["prompt"])),
             references=[
                 CodeReference(
-                    output=Output(text=data["canonical_solution"]),
+                    output=Output(text=assert_is_str(data["canonical_solution"])),
                     test_cases=data,
                     tags=[CORRECT_TAG],
                 )
@@ -211,27 +214,18 @@ class Converter:
             transformed_data[k] = self._apply(data[k], template, **data)
 
         # We then merge all other fields into the `input`
-        assert isinstance(templates["input"], str), "The input field of a template should be a string"
-        data["input"] = templates["input"].format(**transformed_data)
+        data["input"] = assert_is_str(templates["input"]).format(**transformed_data)
         if "choices" in data:
             # We take the corresponding choices and apply the `label` template
             # Note: we do not allow `label` template to access other fields in multi-choice tasks
             # Overwrite `choices` to the actual continuations
-            choices: List[str] = data["choices"]  # type: ignore
-            # Guarantee `choices` must be `List[str]`
-            assert isinstance(choices, list)
-            for c in choices:
-                assert isinstance(c, str)
+            choices = assert_is_str_list(data["choices"])
             data["choices"] = [self._apply(c, templates.get("label", None), label=c) for c in choices]
         else:
             # For generation tasks, we allow it to access other stringified fields
             kwargs = transformed_data
             del kwargs["label"]
-            labels: List[str] = data["label"]  # type: ignore
-            # Guarantee `label` must be `List[str]`
-            assert isinstance(labels, list)
-            for label in labels:
-                assert isinstance(label, str)
+            labels = assert_is_str_list(data["label"])
             data["label"] = [self._apply(x, templates.get("label", None), **kwargs, label=x) for x in labels]
         return data
 
@@ -402,7 +396,10 @@ class CLEVAScenario(Scenario):
         self.subtask = subtask
         self.version = version
         self.converter = Converter()
-        self.prompt_template, _ = CLEVAScenario.get_prompt_setting(self.task, subtask, version, prompt_id)
+        scenario_cache_path = get_scenario_cache_path(get_benchmark_output_path(), CLEVAScenario.name)
+        self.prompt_template, _ = CLEVAScenario.get_prompt_setting(
+            self.task, subtask, version, prompt_id, scenario_cache_path
+        )
 
     @property
     @abstractmethod
@@ -410,14 +407,14 @@ class CLEVAScenario(Scenario):
         pass
 
     @classmethod
-    def download_dataset(cls, task: str, version: str):
+    def download_dataset(cls, task: str, version: str, cache_dir: str):
         source_url: str = CLEVA_DATA_URL + f"/{version}/{task}.zip"
-        target_dir: str = os.path.join(CLEVA_DATA_PATH, "data", version)
+        target_dir: str = os.path.join(cache_dir, "data", version)
         ensure_directory_exists(target_dir)
         ensure_file_downloaded(source_url=source_url, target_path=os.path.join(target_dir, task), unpack=True)
 
-    def load_dataset(self) -> Dict[str, List[Dict[str, Any]]]:
-        data_dir: str = os.path.join(CLEVA_DATA_PATH, "data", self.version, self.task)
+    def load_dataset(self, cache_dir: str) -> Dict[str, List[Dict[str, Any]]]:
+        data_dir: str = os.path.join(cache_dir, "data", self.version, self.task)
         if self.subtask:
             data_dir = os.path.join(data_dir, self.subtask)
 
@@ -434,8 +431,8 @@ class CLEVAScenario(Scenario):
         return dataset
 
     @staticmethod
-    def load_prompt_templates(task: str, subtask: Optional[str], version: str) -> List[Dict[str, Any]]:
-        prompt_dir: str = os.path.join(CLEVA_DATA_PATH, "data", version, task)
+    def load_prompt_templates(task: str, subtask: Optional[str], version: str, cache_dir: str) -> List[Dict[str, Any]]:
+        prompt_dir: str = os.path.join(cache_dir, "data", version, task)
         if subtask:
             prompt_dir = os.path.join(prompt_dir, subtask)
         file_path = os.path.join(prompt_dir, "prompts.json")
@@ -448,7 +445,7 @@ class CLEVAScenario(Scenario):
 
     def get_instances(self, output_path: str) -> List[Instance]:
         # Download the raw data
-        dataset = self.load_dataset()
+        dataset = self.load_dataset(output_path)
 
         # Read all the instances
         instances: List[Instance] = []
@@ -465,9 +462,9 @@ class CLEVAScenario(Scenario):
 
     @classmethod
     def get_prompt_setting(
-        cls, task: str, subtask: Optional[str], version: str, prompt_id: int
+        cls, task: str, subtask: Optional[str], version: str, prompt_id: int, output_path: str
     ) -> Tuple[Dict[str, Any], PromptSetting]:
-        prompt_templates = cls.load_prompt_templates(task, subtask, version)
+        prompt_templates = cls.load_prompt_templates(task, subtask, version, output_path)
         if prompt_id >= len(prompt_templates):
             raise ValueError(
                 f"You want to use prompt template with prompt_id {prompt_id}, but there is only"
@@ -519,10 +516,10 @@ class CLEVAScenario(Scenario):
 
     @classmethod
     def load_inference_parameters(
-        cls, task: str, subtask: Optional[str], version: str, prompt_id: int
+        cls, task: str, subtask: Optional[str], version: str, prompt_id: int, cache_dir: str
     ) -> Dict[str, Any]:
         # We use a dict instead of dataclass to store hyperparameters such that we can set different default values
-        params_dir: str = os.path.join(CLEVA_DATA_PATH, "data", version, task)
+        params_dir: str = os.path.join(cache_dir, "data", version, task)
         if subtask:
             params_dir = os.path.join(params_dir, subtask)
         file_path = os.path.join(params_dir, "infer_params.json")
@@ -932,7 +929,7 @@ class CLEVADialogueGenerationScenario(CLEVAScenario):
 
     def get_instances(self, output_path: str) -> List[Instance]:
         # Download the raw data
-        dataset = self.load_dataset()
+        dataset = self.load_dataset(output_path)
 
         # Read all the instances
         instances: List[Instance] = []
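The recurring isinstance/assert blocks in cleva_scenario.py are replaced by assert_is_str and assert_is_str_list, imported from helm.common.general in the first hunk. A minimal sketch of what such narrowing helpers presumably look like (the real implementations and error messages may differ):

    from typing import Any, List

    def assert_is_str(val: Any) -> str:
        # Narrow an untyped template or data field to str, failing loudly otherwise.
        assert isinstance(val, str), f"Expected a string, got: {val}"
        return val

    def assert_is_str_list(val: Any) -> List[str]:
        # Narrow an untyped field to List[str], checking every element.
        assert isinstance(val, list), f"Expected a list, got: {val}"
        for item in val:
            assert isinstance(item, str), f"Expected a string element, got: {item}"
        return val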
helm/benchmark/scenarios/code_scenario.py CHANGED
@@ -48,6 +48,7 @@ APPS is a benchmark for code generation from natural language specifications.
 Each instance has 1) a problem description with examples (as what you get in
 programming competitions), 2) coding solutions, 3) test cases.
 """
+
 import io
 import json
 import os
@@ -139,8 +140,8 @@ def _read_and_preprocess_apps(target_path: str) -> List[CodeInstance]:
     # only if the version of Python has a default limit.
     #
     # See: https://docs.python.org/3/library/stdtypes.html#int-max-str-digits
-    if hasattr(sys, "set_int_max_str_digits"):  # type: ignore
-        sys.set_int_max_str_digits(100000)  # type: ignore
+    if hasattr(sys, "set_int_max_str_digits"):
+        sys.set_int_max_str_digits(100000)
 
     SINGLE_STR_LIMIT = 150000  # From original codebase.
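For context on this last hunk: Python 3.11 caps int-to-str conversion at 4300 digits by default and added sys.set_int_max_str_digits to raise that cap; APPS data can contain much larger integers, and the hasattr guard keeps older interpreters working. The # type: ignore comments can be dropped once the type stubs know about the function. A small illustration, assuming Python 3.11 or newer:

    import sys

    # Raise the conversion cap where the interpreter supports it.
    if hasattr(sys, "set_int_max_str_digits"):
        sys.set_int_max_str_digits(100000)

    big = 10**9999  # a 10,000-digit number
    assert len(str(big)) == 10000  # would raise ValueError under the default cap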