crfm-helm: crfm_helm-0.4.0-py3-none-any.whl → crfm_helm-0.5.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (499)
  1. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/METADATA +138 -31
  2. crfm_helm-0.5.1.dist-info/RECORD +654 -0
  3. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +31 -3
  5. helm/benchmark/adaptation/adapters/adapter.py +2 -2
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -4
  9. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +2 -3
  10. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  11. helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
  12. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  13. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +2 -1
  14. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
  15. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  16. helm/benchmark/adaptation/adapters/test_generation_adapter.py +32 -8
  17. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +7 -19
  18. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +60 -6
  19. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  20. helm/benchmark/adaptation/request_state.py +6 -1
  21. helm/benchmark/adaptation/scenario_state.py +6 -2
  22. helm/benchmark/annotation/annotator.py +43 -0
  23. helm/benchmark/annotation/annotator_factory.py +61 -0
  24. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  25. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  26. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  27. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  28. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  29. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  30. helm/benchmark/annotation_executor.py +124 -0
  31. helm/benchmark/augmentations/data_augmenter.py +0 -2
  32. helm/benchmark/augmentations/gender_perturbation.py +1 -1
  33. helm/benchmark/augmentations/perturbation.py +25 -3
  34. helm/benchmark/augmentations/perturbation_description.py +1 -1
  35. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  36. helm/benchmark/augmentations/test_perturbation.py +41 -7
  37. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  38. helm/benchmark/config_registry.py +7 -1
  39. helm/benchmark/executor.py +46 -16
  40. helm/benchmark/huggingface_registration.py +20 -7
  41. helm/benchmark/metrics/basic_metrics.py +169 -664
  42. helm/benchmark/metrics/bbq_metrics.py +3 -4
  43. helm/benchmark/metrics/bias_metrics.py +6 -6
  44. helm/benchmark/metrics/classification_metrics.py +11 -8
  45. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  46. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  47. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  48. helm/benchmark/metrics/common_metric_specs.py +167 -0
  49. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  50. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  51. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  52. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  53. helm/benchmark/metrics/disinformation_metrics.py +4 -110
  54. helm/benchmark/metrics/dry_run_metrics.py +2 -2
  55. helm/benchmark/metrics/efficiency_metrics.py +213 -0
  56. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  57. helm/benchmark/metrics/evaluate_reference_metrics.py +392 -0
  58. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  59. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  60. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  61. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  62. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  63. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  64. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  65. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  66. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  67. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  68. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  69. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  70. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  71. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  72. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  73. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  74. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  75. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  76. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  77. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  78. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  79. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  80. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  81. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  82. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  83. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  84. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  85. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  86. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  87. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  88. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  89. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  90. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  91. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  92. helm/benchmark/metrics/machine_translation_metrics.py +89 -0
  93. helm/benchmark/metrics/metric.py +93 -172
  94. helm/benchmark/metrics/metric_name.py +0 -1
  95. helm/benchmark/metrics/metric_service.py +16 -0
  96. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  97. helm/benchmark/metrics/ranking_metrics.py +2 -2
  98. helm/benchmark/metrics/reference_metric.py +148 -0
  99. helm/benchmark/metrics/summac/model_summac.py +0 -2
  100. helm/benchmark/metrics/summarization_metrics.py +2 -2
  101. helm/benchmark/metrics/test_classification_metrics.py +8 -5
  102. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  103. helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +5 -1
  104. helm/benchmark/metrics/test_metric.py +2 -2
  105. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +10 -2
  106. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  107. helm/benchmark/metrics/toxicity_utils.py +23 -0
  108. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  109. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  110. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  111. helm/benchmark/metrics/vision_language/image_metrics.py +575 -0
  112. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  113. helm/benchmark/model_deployment_registry.py +74 -0
  114. helm/benchmark/model_metadata_registry.py +41 -1
  115. helm/benchmark/multi_gpu_runner.py +133 -0
  116. helm/benchmark/presentation/create_plots.py +8 -7
  117. helm/benchmark/presentation/run_display.py +26 -10
  118. helm/benchmark/presentation/schema.py +15 -40
  119. helm/benchmark/presentation/summarize.py +119 -79
  120. helm/benchmark/presentation/table.py +8 -8
  121. helm/benchmark/presentation/test_contamination.py +2 -2
  122. helm/benchmark/presentation/test_run_entry.py +1 -2
  123. helm/benchmark/presentation/test_summarize.py +3 -3
  124. helm/benchmark/run.py +54 -26
  125. helm/benchmark/run_expander.py +205 -35
  126. helm/benchmark/run_spec.py +93 -0
  127. helm/benchmark/run_spec_factory.py +163 -0
  128. helm/benchmark/run_specs/__init__.py +0 -0
  129. helm/benchmark/run_specs/classic_run_specs.py +1510 -0
  130. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  131. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  132. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  133. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  134. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  135. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  136. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  137. helm/benchmark/run_specs/vlm_run_specs.py +757 -0
  138. helm/benchmark/runner.py +51 -57
  139. helm/benchmark/runner_config_registry.py +21 -0
  140. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  141. helm/benchmark/scenarios/bold_scenario.py +2 -2
  142. helm/benchmark/scenarios/code_scenario.py +1 -0
  143. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  144. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  145. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  146. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  147. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  148. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  149. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  150. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  151. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  152. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  153. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  154. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  155. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  156. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  157. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  158. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  159. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  160. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  161. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  162. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  163. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  164. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  165. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  166. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  167. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  168. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  169. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  170. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  171. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  172. helm/benchmark/scenarios/legalbench_scenario.py +6 -2
  173. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  174. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  175. helm/benchmark/scenarios/math_scenario.py +19 -2
  176. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  177. helm/benchmark/scenarios/numeracy_scenario.py +1 -1
  178. helm/benchmark/scenarios/opinions_qa_scenario.py +0 -4
  179. helm/benchmark/scenarios/scenario.py +4 -0
  180. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  181. helm/benchmark/scenarios/test_math_scenario.py +6 -0
  182. helm/benchmark/scenarios/test_scenario.py +6 -3
  183. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  184. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  185. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  186. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  187. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  188. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
  189. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  190. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
  191. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
  192. helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
  193. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +94 -0
  194. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  195. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  196. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  197. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  198. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  199. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  200. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  201. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  202. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  203. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  204. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  205. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  206. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
  207. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  208. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
  209. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  210. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  211. helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
  212. helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
  213. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  214. helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
  215. helm/benchmark/scenarios/vision_language/pairs_scenario.py +246 -0
  216. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  217. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  218. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  219. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +3 -4
  220. helm/benchmark/scenarios/vision_language/vqa_scenario.py +5 -3
  221. helm/benchmark/scenarios/wmt_14_scenario.py +1 -1
  222. helm/benchmark/server.py +24 -1
  223. helm/benchmark/slurm_runner.py +70 -49
  224. helm/benchmark/static/benchmarking.js +1 -1
  225. helm/benchmark/static/schema_classic.yaml +258 -1066
  226. helm/benchmark/static/schema_image2structure.yaml +304 -0
  227. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  228. helm/benchmark/static/schema_lite.yaml +2 -227
  229. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  230. helm/benchmark/static/schema_unitxt.yaml +428 -0
  231. helm/benchmark/static/schema_vhelm_lite.yaml +164 -0
  232. helm/benchmark/static/schema_vlm.yaml +823 -0
  233. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  234. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  235. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  236. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  237. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  238. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  239. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  240. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  241. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  242. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  243. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  244. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  245. helm/benchmark/static_build/assets/index-737eef9e.js +10 -0
  246. helm/benchmark/static_build/assets/index-878a1094.css +1 -0
  247. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  248. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  249. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  250. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  251. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  252. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  253. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  254. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  255. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  256. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  257. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  258. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  259. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  260. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  261. helm/benchmark/static_build/config.js +4 -0
  262. helm/benchmark/static_build/index.html +20 -0
  263. helm/benchmark/test_data_preprocessor.py +3 -3
  264. helm/benchmark/test_run_expander.py +1 -1
  265. helm/benchmark/window_services/ai21_window_service.py +22 -33
  266. helm/benchmark/window_services/cohere_window_service.py +1 -63
  267. helm/benchmark/window_services/default_window_service.py +2 -44
  268. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  269. helm/benchmark/window_services/ice_window_service.py +0 -34
  270. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  271. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  272. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  273. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  274. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  275. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  276. helm/benchmark/window_services/local_window_service.py +21 -4
  277. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  278. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  279. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  280. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  281. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  282. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  283. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  284. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  285. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  286. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  287. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  288. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  289. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  290. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  291. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  292. helm/benchmark/window_services/test_utils.py +3 -2
  293. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  294. helm/benchmark/window_services/window_service.py +42 -0
  295. helm/benchmark/window_services/window_service_factory.py +4 -1
  296. helm/benchmark/window_services/yalm_window_service.py +0 -27
  297. helm/clients/__init__.py +0 -0
  298. helm/{proxy/clients → clients}/ai21_client.py +3 -9
  299. helm/clients/aleph_alpha_client.py +112 -0
  300. helm/{proxy/clients → clients}/anthropic_client.py +233 -18
  301. helm/{proxy/clients → clients}/auto_client.py +59 -31
  302. helm/clients/bedrock_client.py +128 -0
  303. helm/clients/bedrock_utils.py +72 -0
  304. helm/{proxy/clients → clients}/client.py +65 -7
  305. helm/clients/clip_score_client.py +49 -0
  306. helm/clients/clip_scorers/__init__.py +0 -0
  307. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  308. helm/clients/clip_scorers/clip_scorer.py +50 -0
  309. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  310. helm/{proxy/clients → clients}/cohere_client.py +4 -11
  311. helm/clients/gcs_client.py +82 -0
  312. helm/{proxy/clients → clients}/google_client.py +5 -5
  313. helm/clients/google_translate_client.py +35 -0
  314. helm/{proxy/clients → clients}/http_model_client.py +5 -7
  315. helm/{proxy/clients → clients}/huggingface_client.py +43 -64
  316. helm/clients/image_generation/__init__.py +0 -0
  317. helm/clients/image_generation/adobe_vision_client.py +78 -0
  318. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  319. helm/clients/image_generation/cogview2/__init__.py +0 -0
  320. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  321. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  322. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  323. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  324. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  325. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  326. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  327. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  328. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  329. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  330. helm/clients/image_generation/cogview2_client.py +191 -0
  331. helm/clients/image_generation/dalle2_client.py +192 -0
  332. helm/clients/image_generation/dalle3_client.py +108 -0
  333. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  334. helm/clients/image_generation/dalle_mini/data.py +442 -0
  335. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  336. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  337. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  338. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  339. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  340. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  341. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  342. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  343. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  344. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  345. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  346. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  347. helm/clients/image_generation/dalle_mini_client.py +190 -0
  348. helm/clients/image_generation/deep_floyd_client.py +78 -0
  349. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  350. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  351. helm/clients/image_generation/lexica_client.py +86 -0
  352. helm/clients/image_generation/mindalle/__init__.py +0 -0
  353. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  354. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  355. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  356. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  357. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  358. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  359. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  360. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  361. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  362. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  363. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  364. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  365. helm/clients/image_generation/mindalle_client.py +115 -0
  366. helm/clients/image_generation/nudity_check_client.py +64 -0
  367. helm/clients/image_generation/together_image_generation_client.py +111 -0
  368. helm/{proxy/clients → clients}/lit_gpt_client.py +4 -4
  369. helm/{proxy/clients → clients}/megatron_client.py +5 -5
  370. helm/clients/mistral_client.py +134 -0
  371. helm/clients/moderation_api_client.py +109 -0
  372. helm/clients/open_lm_client.py +43 -0
  373. helm/clients/openai_client.py +301 -0
  374. helm/{proxy/clients → clients}/palmyra_client.py +6 -8
  375. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  376. helm/clients/simple_client.py +64 -0
  377. helm/{proxy/clients → clients}/test_auto_client.py +13 -15
  378. helm/clients/test_client.py +100 -0
  379. helm/{proxy/clients → clients}/test_huggingface_client.py +15 -16
  380. helm/clients/test_simple_client.py +19 -0
  381. helm/{proxy/clients → clients}/test_together_client.py +20 -8
  382. helm/{proxy/clients → clients}/together_client.py +104 -73
  383. helm/clients/vertexai_client.py +400 -0
  384. helm/clients/vision_language/__init__.py +0 -0
  385. helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
  386. helm/clients/vision_language/huggingface_vlm_client.py +111 -0
  387. helm/{proxy/clients → clients}/vision_language/idefics_client.py +54 -49
  388. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  389. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  390. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  391. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  392. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  393. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  394. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  395. helm/clients/vision_language/open_flamingo_client.py +155 -0
  396. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  397. helm/clients/vllm_client.py +46 -0
  398. helm/common/cache.py +16 -4
  399. helm/common/cache_backend_config.py +47 -0
  400. helm/common/clip_score_request.py +41 -0
  401. helm/common/file_caches/__init__.py +0 -0
  402. helm/common/file_caches/file_cache.py +16 -0
  403. helm/common/file_caches/local_file_cache.py +61 -0
  404. helm/common/file_caches/test_local_file_cache.py +25 -0
  405. helm/common/file_upload_request.py +27 -0
  406. helm/common/general.py +1 -1
  407. helm/common/image_generation_parameters.py +25 -0
  408. helm/common/images_utils.py +33 -3
  409. helm/common/key_value_store.py +35 -4
  410. helm/common/media_object.py +13 -0
  411. helm/common/moderations_api_request.py +71 -0
  412. helm/common/mongo_key_value_store.py +3 -3
  413. helm/common/multimodal_request_utils.py +31 -0
  414. helm/common/nudity_check_request.py +29 -0
  415. helm/common/request.py +15 -17
  416. helm/common/test_general.py +6 -0
  417. helm/common/tokenization_request.py +1 -1
  418. helm/config/model_deployments.yaml +1159 -538
  419. helm/config/model_metadata.yaml +868 -41
  420. helm/config/tokenizer_configs.yaml +149 -43
  421. helm/proxy/accounts.py +31 -4
  422. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  423. helm/proxy/critique/model_critique_client.py +8 -6
  424. helm/proxy/example_queries.py +29 -17
  425. helm/proxy/server.py +70 -5
  426. helm/proxy/services/remote_service.py +31 -0
  427. helm/proxy/services/server_service.py +96 -16
  428. helm/proxy/services/service.py +30 -0
  429. helm/proxy/services/test_remote_service.py +4 -3
  430. helm/proxy/services/test_service.py +0 -12
  431. helm/proxy/test_accounts.py +32 -0
  432. helm/proxy/token_counters/auto_token_counter.py +37 -37
  433. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  434. helm/proxy/token_counters/token_counter.py +3 -5
  435. helm/tokenizers/__init__.py +0 -0
  436. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  437. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +1 -1
  438. helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +6 -9
  439. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  440. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  441. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +7 -26
  442. helm/tokenizers/simple_tokenizer.py +33 -0
  443. helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +1 -1
  444. helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +3 -0
  445. helm/tokenizers/test_simple_tokenizer.py +33 -0
  446. helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
  447. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  448. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  449. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  450. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  451. crfm_helm-0.4.0.dist-info/RECORD +0 -397
  452. helm/benchmark/run_specs.py +0 -2762
  453. helm/benchmark/test_model_deployment_definition.py +0 -92
  454. helm/benchmark/test_model_properties.py +0 -1570
  455. helm/benchmark/vlm_run_specs.py +0 -97
  456. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  457. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  458. helm/benchmark/window_services/huggingface_window_service.py +0 -60
  459. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  460. helm/benchmark/window_services/t511b_window_service.py +0 -30
  461. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  462. helm/benchmark/window_services/ul2_window_service.py +0 -30
  463. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  464. helm/common/cache_utils.py +0 -14
  465. helm/proxy/clients/aleph_alpha_client.py +0 -95
  466. helm/proxy/clients/goose_ai_client.py +0 -99
  467. helm/proxy/clients/microsoft_client.py +0 -180
  468. helm/proxy/clients/openai_client.py +0 -206
  469. helm/proxy/clients/simple_client.py +0 -60
  470. helm/proxy/clients/test_client.py +0 -49
  471. helm/proxy/clients/vertexai_client.py +0 -115
  472. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  473. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  474. helm/proxy/token_counters/free_token_counter.py +0 -12
  475. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  476. helm/proxy/token_counters/openai_token_counter.py +0 -22
  477. helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
  478. helm/proxy/token_counters/test_openai_token_counter.py +0 -81
  479. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  480. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/LICENSE +0 -0
  481. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/entry_points.txt +0 -0
  482. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/top_level.txt +0 -0
  483. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  484. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  485. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  486. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  487. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  488. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  489. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  490. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  491. /helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +0 -0
  492. /helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +0 -0
  493. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  494. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  495. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  496. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  497. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  498. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  499. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
@@ -0,0 +1,108 @@
1
+ import os.path
2
+ from typing import Dict, List
3
+
4
+ from datasets import load_dataset
5
+ from tqdm import tqdm
6
+
7
+ from helm.benchmark.scenarios.scenario import (
8
+ CORRECT_TAG,
9
+ TEST_SPLIT,
10
+ Instance,
11
+ Input,
12
+ Output,
13
+ Reference,
14
+ Scenario,
15
+ )
16
+ from helm.common.media_object import MediaObject, MultimediaObject
17
+ from helm.common.general import ensure_directory_exists, ensure_file_downloaded
18
+
19
+
20
class UnicornScenario(Scenario):
    """
    Unicorn: a safety evaluation benchmark for vision LLMs, from the paper
    "How Many Unicorns Are in This Image? A Safety Evaluation Benchmark for Vision LLMs".

    Rather than measuring standard performance, Unicorn is a safety evaluation suite that covers
    out-of-distribution (OOD) generalization and adversarial robustness. The OOD part introduces two
    VQA datasets, OODCV-VQA and Sketchy-VQA, each with one variant, built to probe models under
    challenging conditions. Questions in the OOD scenario have boolean or numerical answers and are
    scored with exact match. Compared with its synthesized in-distribution counterpart, OOD Sketchy-VQA
    shows an average model output F1 drop of 8.9%, which illustrates how challenging the OOD scenario is.

    Each instance pairs one image with one question and carries a single correct reference answer.
    Only a test split exists.

    @article{tu2023unicorns,
      title={How Many Unicorns Are in This Image? A Safety Evaluation Benchmark for Vision LLMs},
      author={Tu, Haoqin and Cui, Chenhang and Wang, Zijun and Zhou, Yiyang and Zhao, Bingchen and Han,
      Junlin and Zhou, Wangchunshu and Yao, Huaxiu and Xie, Cihang},
      journal={arXiv preprint arXiv:2311.16101},
      year={2023}
    }

    Paper: https://arxiv.org/abs/2311.16101
    """

    # Hugging Face dataset repository that hosts the question files.
    UNICORN_HUGGINGFACE_DATASET_NAME: str = "PahaII/unicorn"

    # Download URL template for a single image; `image_path` comes from the dataset rows.
    IMAGE_URL: str = "https://huggingface.co/datasets/PahaII/unicorn/resolve/main/images/{image_path}?download=true"

    # The subjects accepted by the constructor.
    SUBJECTS: List[str] = ["OODCV-VQA", "OODCV-Counterfactual", "Sketchy-VQA", "Sketchy-Challenging"]

    # Image format per subject, used to build the image content type.
    IMG_TYPE: Dict[str, str] = {
        "OODCV-VQA": "jpeg",
        "OODCV-Counterfactual": "jpeg",
        "Sketchy-VQA": "png",
        "Sketchy-Challenging": "png",
    }

    name = "unicorn"
    description = (
        "Evaluate multimodal models on two out-of-distribution scenarios with four subjects "
        "([paper](https://arxiv.org/abs/2311.16101))."
    )
    tags = ["vision-language"]

    def __init__(self, subject: str):
        """Select which Unicorn subject to load; `subject` must be one of `SUBJECTS`."""
        super().__init__()
        assert subject in self.SUBJECTS, f"Invalid subject: {subject}"
        self._subject: str = subject
        self._image_type: str = self.IMG_TYPE[self._subject]

    def get_instances(self, output_path: str) -> List[Instance]:
        """Load the subject's questions, download each image, and return the test instances.

        The question file is fetched through `load_dataset` (cached under `output_path`), and every
        image is downloaded to `output_path` joined with the row's `image_path`.
        """
        ensure_directory_exists(os.path.join(output_path, "images"))

        # The Unicorn benchmark only ships a test split.
        dataset = load_dataset(
            self.UNICORN_HUGGINGFACE_DATASET_NAME,
            data_files={TEST_SPLIT: f"{self._subject}.json"},
            split=TEST_SPLIT,
            cache_dir=output_path,
        )

        instances: List[Instance] = []
        for example in tqdm(dataset):
            # Fetch the image referenced by this row.
            relative_image_path: str = example["image_path"]
            downloaded_image_path: str = os.path.join(output_path, relative_image_path)
            ensure_file_downloaded(
                source_url=self.IMAGE_URL.format(image_path=relative_image_path),
                target_path=downloaded_image_path,
                unpack=False,
            )

            # The prompt is the image followed by the question text.
            image_media = MediaObject(location=downloaded_image_path, content_type=f"image/{self._image_type}")
            question_media = MediaObject(text=example["question"], content_type="text/plain")
            prompt = MultimediaObject([image_media, question_media])

            gold_answer: str = example["answer"]
            gold_reference = Reference(Output(text=gold_answer), tags=[CORRECT_TAG])
            instances.append(
                Instance(
                    Input(multimedia_content=prompt),
                    references=[gold_reference],
                    split=TEST_SPLIT,
                )
            )

        return instances
@@ -13,13 +13,13 @@ from helm.benchmark.scenarios.scenario import (
13
13
  Scenario,
14
14
  )
15
15
  from helm.common.media_object import MediaObject, MultimediaObject
16
- from helm.common.general import ensure_directory_exists, ensure_file_downloaded
16
+ from helm.common.general import ensure_file_downloaded
17
17
 
18
18
 
19
19
  class VizWizScenario(Scenario):
20
20
  """
21
- VizWiz is a real-world visual question answering dataset consisting of questions
22
- asked by people who are blind. It originates from a natural visual question answering
21
+ VizWiz is a real-world visual question answering dataset consisting of questions asked by people who are
22
+ visually impaired. It originates from a natural visual question answering
23
23
  setting where blind people each took an image and recorded a spoken question about it,
24
24
  together with 10 crowdsourced answers per visual question.
25
25
 
@@ -60,7 +60,6 @@ class VizWizScenario(Scenario):
60
60
  def get_instances(self, output_path: str) -> List[Instance]:
61
61
  # Download the questions and annotations
62
62
  annotations_path: str = os.path.join(output_path, "annotations")
63
- ensure_directory_exists(annotations_path)
64
63
  ensure_file_downloaded(
65
64
  source_url=self.ANNOTATIONS_URL,
66
65
  target_path=annotations_path,
@@ -54,7 +54,7 @@ class VQAScenario(Scenario):
54
54
  TEST_SPLIT: "http://images.cocodataset.org/zips/test2015.zip",
55
55
  }
56
56
 
57
- name = "visual_question_answering"
57
+ name = "vqa"
58
58
  description = "Open-ended questions about images ([paper](https://arxiv.org/abs/1612.00837))."
59
59
  tags = ["vision-language", "visual question answering"]
60
60
 
@@ -111,11 +111,13 @@ class VQAScenario(Scenario):
111
111
  MediaObject(location=image_path, content_type="image/jpeg"),
112
112
  MediaObject(text=question_json["question"], content_type="text/plain"),
113
113
  ]
114
-
115
114
  instances.append(
116
115
  Instance(
117
116
  Input(multimedia_content=MultimediaObject(content)),
118
- references=[Reference(Output(text=answers_json["multiple_choice_answer"]), tags=[CORRECT_TAG])],
117
+ references=[
118
+ Reference(Output(text=answer_json["answer"]), tags=[CORRECT_TAG])
119
+ for answer_json in answers_json["answers"]
120
+ ],
119
121
  split=split,
120
122
  )
121
123
  )
@@ -61,7 +61,7 @@ class WMT14Scenario(Scenario):
61
61
  def get_instances(self, output_path: str) -> List[Instance]:
62
62
  with htrack_block("Loading the HuggingFace dataset. The first time could take several minutes."):
63
63
  subset_name = f"{self.source_language if self.source_language!='en' else self.target_language}-en"
64
- hf_dataset: Any = load_dataset("wmt14", subset_name)
64
+ hf_dataset: Any = load_dataset("wmt14", subset_name, trust_remote_code=True)
65
65
  splits = {"train": TRAIN_SPLIT, "validation": VALID_SPLIT, "test": TEST_SPLIT}
66
66
 
67
67
  instances: List[Instance] = []
helm/benchmark/server.py CHANGED
@@ -70,6 +70,14 @@ def serve_benchmark_output(filename):
70
70
  return response
71
71
 
72
72
 
73
@app.get("/cache/output/<filename:path>")
def serve_cache_output(filename):
    """Serve a file from the cache output folder with browser caching disabled.

    `filename` is the path captured from the URL, resolved relative to the
    directory stored in `app.config["helm.cacheoutputpath"]`.

    That config key is only populated when the benchmark output path is a
    location on disk. When the output path is a URL the key is never set, so
    the lookup is guarded and a 404 is returned instead of letting a KeyError
    surface as a 500 error.
    """
    # Local import: only needed on the unconfigured path.
    from bottle import HTTPError

    cache_output_path = app.config.get("helm.cacheoutputpath")
    if cache_output_path is None:
        return HTTPError(404, "Cache output is not being served by this server.")

    response = static_file(filename, root=cache_output_path)
    # Force the client to re-fetch on every request.
    response.set_header("Cache-Control", "no-cache, no-store, must-revalidate")
    response.set_header("Expires", "0")
    return response
79
+
80
+
73
81
  @app.get("/")
74
82
  @app.get("/<filename:path>")
75
83
  def serve_static(filename="index.html"):
@@ -87,6 +95,12 @@ def main():
87
95
  help="The location of the output path (filesystem path or URL)",
88
96
  default="benchmark_output",
89
97
  )
98
+ parser.add_argument(
99
+ "--cache-output-path",
100
+ type=str,
101
+ help="The location of the filesystem cache output folder (filesystem path or URL)",
102
+ default="prod_env/cache/output",
103
+ )
90
104
  parser.add_argument(
91
105
  "--suite",
92
106
  type=str,
@@ -99,6 +113,11 @@ def main():
99
113
  default=None,
100
114
  help="Experimental: The release to serve. If unset, don't serve a release, and serve the latest suite instead.",
101
115
  )
116
+ parser.add_argument(
117
+ "--jquery",
118
+ action="store_true",
119
+ help="Whether to serve the legacy jQuery frontend instead of the React frontend.",
120
+ )
102
121
  args = parser.parse_args()
103
122
 
104
123
  if args.suite and args.release:
@@ -107,7 +126,8 @@ def main():
107
126
  # Determine the location of the static directory.
108
127
  # This is a hack: it assumes that the static directory has a physical location,
109
128
  # which is not always the case (e.g. when using zipimport).
110
- resource_path = resources.files("helm.benchmark.static").joinpath("index.html")
129
+ static_package_name = "helm.benchmark.static" if args.jquery else "helm.benchmark.static_build"
130
+ resource_path = resources.files(static_package_name).joinpath("index.html")
111
131
  with resources.as_file(resource_path) as resource_filename:
112
132
  static_path = str(resource_filename.parent)
113
133
 
@@ -117,16 +137,19 @@ def main():
117
137
  # Output path is a URL, so set the output path base URL in the frontend to that URL
118
138
  # so that the frontend reads from that URL directly.
119
139
  app.config["helm.outputpath"] = None
140
+ # TODO: figure out helm.cacheoutputpath
120
141
  app.config["helm.outputurl"] = args.output_path
121
142
  else:
122
143
  # Output path is a location on disk, so set the output path base URL to /benchmark_output
123
144
  # and then serve files from the location on disk at that URL.
124
145
  app.config["helm.outputpath"] = path.abspath(args.output_path)
146
+ app.config["helm.cacheoutputpath"] = path.abspath(args.cache_output_path)
125
147
  app.config["helm.outputurl"] = "benchmark_output"
126
148
 
127
149
  app.config["helm.suite"] = args.suite or "latest"
128
150
  app.config["helm.release"] = args.release
129
151
 
152
+ print(f"After the web server has started, go to http://localhost:{args.port} to view your website.\n")
130
153
  app.run(host="0.0.0.0", port=args.port)
131
154
 
132
155
 
@@ -10,6 +10,10 @@ import sys
10
10
 
11
11
  from helm.common.codec import from_json, to_json
12
12
  from helm.common.general import write
13
+ from helm.benchmark.config_registry import (
14
+ register_configs_from_directory,
15
+ register_builtin_configs_from_helm_package,
16
+ )
13
17
  from helm.benchmark.executor import ExecutionSpec
14
18
  from helm.benchmark.runner import Runner, RunSpec, RunnerError
15
19
  from helm.benchmark.slurm_jobs import (
@@ -24,10 +28,11 @@ from helm.benchmark.slurm_jobs import (
24
28
  from helm.common.general import ensure_directory_exists
25
29
  from helm.common.hierarchical_logger import hlog, htrack_block
26
30
 
31
+ from helm.benchmark.runner_config_registry import RUNNER_CONFIG
27
32
 
28
- _DEFAULT_MAX_CONCURRENT_WORKER_SLURM_JOBS = 8
29
33
  _MAX_CONCURRENT_WORKER_SLURM_JOBS_ENV_NAME = "HELM_MAX_CONCURRENT_WORKER_SLURM_JOBS"
30
34
  _SLURM_NODE_NAMES_ENV_NAME = "HELM_SLURM_NODE_NAMES"
35
+ _DEFAULT_MAX_CONCURRENT_WORKER_SLURM = 8
31
36
 
32
37
 
33
38
  @dataclass
@@ -89,12 +94,15 @@ class SlurmRunner(Runner):
89
94
  self.slurm_runner_spec_path = os.path.join(self.slurm_base_dir, "slurm_runner_spec.json")
90
95
 
91
96
  # Configure max concurrent worker Slurm jobs from the environment variable.
92
- # TODO: Read from a configuration file instead
93
97
  env_max_concurrent_worker_slurm_jobs = os.getenv(_MAX_CONCURRENT_WORKER_SLURM_JOBS_ENV_NAME)
94
98
  self.max_concurrent_worker_slurm_jobs = (
95
99
  int(env_max_concurrent_worker_slurm_jobs)
96
100
  if env_max_concurrent_worker_slurm_jobs
97
- else _DEFAULT_MAX_CONCURRENT_WORKER_SLURM_JOBS
101
+ else (
102
+ RUNNER_CONFIG.helm_max_concurrent_workers
103
+ if RUNNER_CONFIG.helm_max_concurrent_workers > 0
104
+ else _DEFAULT_MAX_CONCURRENT_WORKER_SLURM
105
+ )
98
106
  )
99
107
 
100
108
  def run_all(self, run_specs: List[RunSpec]):
@@ -222,8 +230,7 @@ class SlurmRunner(Runner):
222
230
  break
223
231
 
224
232
  # Refresh every minute
225
- # TODO: Make this period configurable
226
- time.sleep(60)
233
+ time.sleep(RUNNER_CONFIG.slurm_monitor_interval)
227
234
  finally:
228
235
  # Cleanup by cancelling all jobs during program termination or if an exception is raised.
229
236
  cancel_all_jobs()
@@ -261,34 +268,48 @@ class SlurmRunner(Runner):
261
268
  run_spec_path,
262
269
  ]
263
270
  )
264
- # TODO: Make default Slurm arguments configurable.
265
- raw_slurm_args: Dict[str, str] = {
266
- "account": "nlp",
267
- "cpus_per_task": "4",
268
- "mem": "32G",
269
- "gres": "gpu:0",
270
- "open_mode": "append",
271
- "partition": "john",
272
- "time": "14-0", # Deadline of 14 days
273
- "mail_type": "FAIL",
274
- "job_name": run_name,
275
- "output": log_path,
276
- "chdir": os.getcwd(),
277
- }
278
- # TODO: Move resource requirements into RunSpec.
279
- slurm_node_names = os.getenv(_SLURM_NODE_NAMES_ENV_NAME)
280
- if run_spec.name.startswith("msmarco:"):
281
- raw_slurm_args["mem"] = "64G"
282
- if "device=cuda" in run_spec.name:
283
- raw_slurm_args["gres"] = "gpu:1"
284
- raw_slurm_args["partition"] = "jag-hi"
285
- if "model=huggingface" in run_spec.name:
286
- raw_slurm_args["gres"] = "gpu:1"
287
- raw_slurm_args["partition"] = "sphinx"
288
- if not slurm_node_names or "sphinx" not in slurm_node_names:
289
- raise Exception(f"Environment variable {_SLURM_NODE_NAMES_ENV_NAME} must be set to sphinx node names")
290
- if slurm_node_names:
291
- raw_slurm_args["nodelist"] = slurm_node_names
271
+ if RUNNER_CONFIG.slurm_args is None:
272
+ raw_slurm_args: Dict[str, str] = {
273
+ "account": "nlp",
274
+ "cpus_per_task": "4",
275
+ "mem": "32G",
276
+ "gres": "gpu:0",
277
+ "open_mode": "append",
278
+ "partition": "john",
279
+ "time": "14-0", # Deadline of 14 days
280
+ "mail_type": "FAIL",
281
+ "job_name": run_name,
282
+ "output": log_path,
283
+ "chdir": os.getcwd(),
284
+ }
285
+ # TODO: Move resource requirements into RunSpec.
286
+ slurm_node_names = os.getenv(_SLURM_NODE_NAMES_ENV_NAME)
287
+ if run_spec.name.startswith("msmarco:"):
288
+ raw_slurm_args["mem"] = "64G"
289
+ if "device=cuda" in run_spec.name:
290
+ raw_slurm_args["gres"] = "gpu:1"
291
+ raw_slurm_args["partition"] = "jag-hi"
292
+ if "model=huggingface" in run_spec.name:
293
+ raw_slurm_args["gres"] = "gpu:1"
294
+ raw_slurm_args["partition"] = "sphinx"
295
+ if not slurm_node_names or "sphinx" not in slurm_node_names:
296
+ raise Exception(
297
+ f"Environment variable {_SLURM_NODE_NAMES_ENV_NAME} must be set to sphinx node names"
298
+ )
299
+ if slurm_node_names:
300
+ raw_slurm_args["nodelist"] = slurm_node_names
301
+
302
+ else:
303
+ raw_slurm_args = RUNNER_CONFIG.slurm_args
304
+
305
+ dynamic_slurm_args = {
306
+ "job_name": run_name,
307
+ "output": log_path,
308
+ "chdir": os.getcwd(),
309
+ }
310
+
311
+ # User should not set these manually, overwrite them if necessary
312
+ raw_slurm_args.update(dynamic_slurm_args)
292
313
 
293
314
  slurm_args: Dict[str, str] = {key: shlex.quote(value) for key, value in raw_slurm_args.items()}
294
315
  # Uncomment this to get notification emails from Slurm for Slurm worker jobs.
@@ -300,27 +321,15 @@ class SlurmRunner(Runner):
300
321
  return slurm_job_id
301
322
 
302
323
 
303
- def run_as_worker(slurm_runner_spec_path: str, run_spec_path: str):
304
- """Deserialize SlurmRunner and RunSpec from the given files, then run the RunSpec with the SlurmRunner.
305
-
306
- Used by the worker Slurm jobs only."""
307
- with open(slurm_runner_spec_path, "r") as f:
308
- slurm_runner_spec = from_json(f.read(), SlurmRunnerSpec)
309
- with open(run_spec_path, "r") as f:
310
- run_spec = from_json(f.read(), RunSpec)
311
- slurm_runner = SlurmRunner(**slurm_runner_spec.to_kwargs())
312
- slurm_runner.run_one(run_spec)
313
-
314
-
315
324
  def main():
316
325
  """Entry point for the SlurmRunner's worker Slurm jobs that run a single RunSpec.
317
326
 
318
327
  This entry point should only be used by SlurmRunner. Users should use `helm-run` instead.
319
328
  SlurmRunner has to use this entry point instead of helm-run because there is no way to
320
329
  specify the worker Slurm job parameters through `helm-run`. In particular, there is no way
321
- to run a specific `RunSpec` using the `--run-specs` parameter of `helm-run`, because the
322
- `run-specs` argument is a `RunSpec` description (not a `RunSpec`), and there is no way to
323
- convert a `RunSpec` into a `RunSpec` description."""
330
+ to run a specific `RunSpec` using the `--run-entries` parameter of `helm-run`, because the
331
+ `run-entries` argument contains `RunEntry` description (not `RunSpec`s), and there is no way to
332
+ convert a `RunSpec` into a `RunEntry` description."""
324
333
  parser = argparse.ArgumentParser()
325
334
  parser.add_argument(
326
335
  "--slurm-runner-spec-path",
@@ -335,7 +344,19 @@ def main():
335
344
  required=True,
336
345
  )
337
346
  args = parser.parse_args()
338
- run_as_worker(slurm_runner_spec_path=args.slurm_runner_spec_path, run_spec_path=args.run_spec_path)
347
+
348
+ # Deserialize SlurmRunner and RunSpec from the given files, then run the RunSpec with the SlurmRunner.
349
+ with open(args.slurm_runner_spec_path, "r") as f:
350
+ slurm_runner_spec = from_json(f.read(), SlurmRunnerSpec)
351
+ with open(args.run_spec_path, "r") as f:
352
+ run_spec = from_json(f.read(), RunSpec)
353
+
354
+ register_builtin_configs_from_helm_package()
355
+ if slurm_runner_spec.execution_spec.local_path is not None:
356
+ register_configs_from_directory(slurm_runner_spec.execution_spec.local_path)
357
+
358
+ slurm_runner = SlurmRunner(**slurm_runner_spec.to_kwargs())
359
+ slurm_runner.run_one(run_spec)
339
360
 
340
361
 
341
362
  if __name__ == "__main__":
@@ -492,7 +492,7 @@ $(function () {
492
492
  {{~#if perturbation~}}
493
493
  {{highlightNewWords input.text ../unperturbedInstance.input.text}}
494
494
  {{~else~}}
495
- {{input.text}}
495
+ {{{input.text}}}
496
496
  {{~/if~}}
497
497
  </div>
498
498
  {{/if}}