crfm-helm 0.4.0__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (499)
  1. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/METADATA +138 -31
  2. crfm_helm-0.5.1.dist-info/RECORD +654 -0
  3. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +31 -3
  5. helm/benchmark/adaptation/adapters/adapter.py +2 -2
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -4
  9. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +2 -3
  10. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  11. helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
  12. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  13. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +2 -1
  14. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
  15. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  16. helm/benchmark/adaptation/adapters/test_generation_adapter.py +32 -8
  17. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +7 -19
  18. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +60 -6
  19. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  20. helm/benchmark/adaptation/request_state.py +6 -1
  21. helm/benchmark/adaptation/scenario_state.py +6 -2
  22. helm/benchmark/annotation/annotator.py +43 -0
  23. helm/benchmark/annotation/annotator_factory.py +61 -0
  24. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  25. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  26. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  27. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  28. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  29. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  30. helm/benchmark/annotation_executor.py +124 -0
  31. helm/benchmark/augmentations/data_augmenter.py +0 -2
  32. helm/benchmark/augmentations/gender_perturbation.py +1 -1
  33. helm/benchmark/augmentations/perturbation.py +25 -3
  34. helm/benchmark/augmentations/perturbation_description.py +1 -1
  35. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  36. helm/benchmark/augmentations/test_perturbation.py +41 -7
  37. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  38. helm/benchmark/config_registry.py +7 -1
  39. helm/benchmark/executor.py +46 -16
  40. helm/benchmark/huggingface_registration.py +20 -7
  41. helm/benchmark/metrics/basic_metrics.py +169 -664
  42. helm/benchmark/metrics/bbq_metrics.py +3 -4
  43. helm/benchmark/metrics/bias_metrics.py +6 -6
  44. helm/benchmark/metrics/classification_metrics.py +11 -8
  45. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  46. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  47. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  48. helm/benchmark/metrics/common_metric_specs.py +167 -0
  49. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  50. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  51. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  52. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  53. helm/benchmark/metrics/disinformation_metrics.py +4 -110
  54. helm/benchmark/metrics/dry_run_metrics.py +2 -2
  55. helm/benchmark/metrics/efficiency_metrics.py +213 -0
  56. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  57. helm/benchmark/metrics/evaluate_reference_metrics.py +392 -0
  58. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  59. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  60. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  61. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  62. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  63. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  64. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  65. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  66. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  67. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  68. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  69. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  70. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  71. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  72. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  73. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  74. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  75. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  76. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  77. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  78. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  79. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  80. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  81. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  82. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  83. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  84. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  85. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  86. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  87. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  88. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  89. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  90. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  91. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  92. helm/benchmark/metrics/machine_translation_metrics.py +89 -0
  93. helm/benchmark/metrics/metric.py +93 -172
  94. helm/benchmark/metrics/metric_name.py +0 -1
  95. helm/benchmark/metrics/metric_service.py +16 -0
  96. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  97. helm/benchmark/metrics/ranking_metrics.py +2 -2
  98. helm/benchmark/metrics/reference_metric.py +148 -0
  99. helm/benchmark/metrics/summac/model_summac.py +0 -2
  100. helm/benchmark/metrics/summarization_metrics.py +2 -2
  101. helm/benchmark/metrics/test_classification_metrics.py +8 -5
  102. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  103. helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +5 -1
  104. helm/benchmark/metrics/test_metric.py +2 -2
  105. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +10 -2
  106. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  107. helm/benchmark/metrics/toxicity_utils.py +23 -0
  108. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  109. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  110. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  111. helm/benchmark/metrics/vision_language/image_metrics.py +575 -0
  112. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  113. helm/benchmark/model_deployment_registry.py +74 -0
  114. helm/benchmark/model_metadata_registry.py +41 -1
  115. helm/benchmark/multi_gpu_runner.py +133 -0
  116. helm/benchmark/presentation/create_plots.py +8 -7
  117. helm/benchmark/presentation/run_display.py +26 -10
  118. helm/benchmark/presentation/schema.py +15 -40
  119. helm/benchmark/presentation/summarize.py +119 -79
  120. helm/benchmark/presentation/table.py +8 -8
  121. helm/benchmark/presentation/test_contamination.py +2 -2
  122. helm/benchmark/presentation/test_run_entry.py +1 -2
  123. helm/benchmark/presentation/test_summarize.py +3 -3
  124. helm/benchmark/run.py +54 -26
  125. helm/benchmark/run_expander.py +205 -35
  126. helm/benchmark/run_spec.py +93 -0
  127. helm/benchmark/run_spec_factory.py +163 -0
  128. helm/benchmark/run_specs/__init__.py +0 -0
  129. helm/benchmark/run_specs/classic_run_specs.py +1510 -0
  130. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  131. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  132. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  133. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  134. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  135. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  136. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  137. helm/benchmark/run_specs/vlm_run_specs.py +757 -0
  138. helm/benchmark/runner.py +51 -57
  139. helm/benchmark/runner_config_registry.py +21 -0
  140. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  141. helm/benchmark/scenarios/bold_scenario.py +2 -2
  142. helm/benchmark/scenarios/code_scenario.py +1 -0
  143. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  144. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  145. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  146. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  147. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  148. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  149. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  150. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  151. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  152. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  153. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  154. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  155. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  156. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  157. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  158. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  159. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  160. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  161. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  162. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  163. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  164. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  165. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  166. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  167. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  168. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  169. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  170. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  171. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  172. helm/benchmark/scenarios/legalbench_scenario.py +6 -2
  173. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  174. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  175. helm/benchmark/scenarios/math_scenario.py +19 -2
  176. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  177. helm/benchmark/scenarios/numeracy_scenario.py +1 -1
  178. helm/benchmark/scenarios/opinions_qa_scenario.py +0 -4
  179. helm/benchmark/scenarios/scenario.py +4 -0
  180. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  181. helm/benchmark/scenarios/test_math_scenario.py +6 -0
  182. helm/benchmark/scenarios/test_scenario.py +6 -3
  183. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  184. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  185. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  186. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  187. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  188. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
  189. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  190. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
  191. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
  192. helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
  193. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +94 -0
  194. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  195. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  196. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  197. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  198. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  199. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  200. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  201. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  202. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  203. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  204. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  205. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  206. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
  207. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  208. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
  209. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  210. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  211. helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
  212. helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
  213. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  214. helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
  215. helm/benchmark/scenarios/vision_language/pairs_scenario.py +246 -0
  216. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  217. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  218. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  219. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +3 -4
  220. helm/benchmark/scenarios/vision_language/vqa_scenario.py +5 -3
  221. helm/benchmark/scenarios/wmt_14_scenario.py +1 -1
  222. helm/benchmark/server.py +24 -1
  223. helm/benchmark/slurm_runner.py +70 -49
  224. helm/benchmark/static/benchmarking.js +1 -1
  225. helm/benchmark/static/schema_classic.yaml +258 -1066
  226. helm/benchmark/static/schema_image2structure.yaml +304 -0
  227. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  228. helm/benchmark/static/schema_lite.yaml +2 -227
  229. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  230. helm/benchmark/static/schema_unitxt.yaml +428 -0
  231. helm/benchmark/static/schema_vhelm_lite.yaml +164 -0
  232. helm/benchmark/static/schema_vlm.yaml +823 -0
  233. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  234. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  235. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  236. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  237. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  238. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  239. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  240. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  241. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  242. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  243. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  244. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  245. helm/benchmark/static_build/assets/index-737eef9e.js +10 -0
  246. helm/benchmark/static_build/assets/index-878a1094.css +1 -0
  247. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  248. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  249. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  250. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  251. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  252. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  253. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  254. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  255. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  256. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  257. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  258. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  259. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  260. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  261. helm/benchmark/static_build/config.js +4 -0
  262. helm/benchmark/static_build/index.html +20 -0
  263. helm/benchmark/test_data_preprocessor.py +3 -3
  264. helm/benchmark/test_run_expander.py +1 -1
  265. helm/benchmark/window_services/ai21_window_service.py +22 -33
  266. helm/benchmark/window_services/cohere_window_service.py +1 -63
  267. helm/benchmark/window_services/default_window_service.py +2 -44
  268. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  269. helm/benchmark/window_services/ice_window_service.py +0 -34
  270. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  271. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  272. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  273. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  274. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  275. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  276. helm/benchmark/window_services/local_window_service.py +21 -4
  277. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  278. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  279. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  280. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  281. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  282. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  283. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  284. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  285. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  286. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  287. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  288. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  289. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  290. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  291. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  292. helm/benchmark/window_services/test_utils.py +3 -2
  293. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  294. helm/benchmark/window_services/window_service.py +42 -0
  295. helm/benchmark/window_services/window_service_factory.py +4 -1
  296. helm/benchmark/window_services/yalm_window_service.py +0 -27
  297. helm/clients/__init__.py +0 -0
  298. helm/{proxy/clients → clients}/ai21_client.py +3 -9
  299. helm/clients/aleph_alpha_client.py +112 -0
  300. helm/{proxy/clients → clients}/anthropic_client.py +233 -18
  301. helm/{proxy/clients → clients}/auto_client.py +59 -31
  302. helm/clients/bedrock_client.py +128 -0
  303. helm/clients/bedrock_utils.py +72 -0
  304. helm/{proxy/clients → clients}/client.py +65 -7
  305. helm/clients/clip_score_client.py +49 -0
  306. helm/clients/clip_scorers/__init__.py +0 -0
  307. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  308. helm/clients/clip_scorers/clip_scorer.py +50 -0
  309. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  310. helm/{proxy/clients → clients}/cohere_client.py +4 -11
  311. helm/clients/gcs_client.py +82 -0
  312. helm/{proxy/clients → clients}/google_client.py +5 -5
  313. helm/clients/google_translate_client.py +35 -0
  314. helm/{proxy/clients → clients}/http_model_client.py +5 -7
  315. helm/{proxy/clients → clients}/huggingface_client.py +43 -64
  316. helm/clients/image_generation/__init__.py +0 -0
  317. helm/clients/image_generation/adobe_vision_client.py +78 -0
  318. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  319. helm/clients/image_generation/cogview2/__init__.py +0 -0
  320. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  321. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  322. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  323. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  324. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  325. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  326. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  327. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  328. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  329. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  330. helm/clients/image_generation/cogview2_client.py +191 -0
  331. helm/clients/image_generation/dalle2_client.py +192 -0
  332. helm/clients/image_generation/dalle3_client.py +108 -0
  333. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  334. helm/clients/image_generation/dalle_mini/data.py +442 -0
  335. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  336. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  337. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  338. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  339. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  340. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  341. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  342. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  343. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  344. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  345. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  346. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  347. helm/clients/image_generation/dalle_mini_client.py +190 -0
  348. helm/clients/image_generation/deep_floyd_client.py +78 -0
  349. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  350. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  351. helm/clients/image_generation/lexica_client.py +86 -0
  352. helm/clients/image_generation/mindalle/__init__.py +0 -0
  353. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  354. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  355. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  356. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  357. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  358. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  359. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  360. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  361. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  362. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  363. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  364. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  365. helm/clients/image_generation/mindalle_client.py +115 -0
  366. helm/clients/image_generation/nudity_check_client.py +64 -0
  367. helm/clients/image_generation/together_image_generation_client.py +111 -0
  368. helm/{proxy/clients → clients}/lit_gpt_client.py +4 -4
  369. helm/{proxy/clients → clients}/megatron_client.py +5 -5
  370. helm/clients/mistral_client.py +134 -0
  371. helm/clients/moderation_api_client.py +109 -0
  372. helm/clients/open_lm_client.py +43 -0
  373. helm/clients/openai_client.py +301 -0
  374. helm/{proxy/clients → clients}/palmyra_client.py +6 -8
  375. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  376. helm/clients/simple_client.py +64 -0
  377. helm/{proxy/clients → clients}/test_auto_client.py +13 -15
  378. helm/clients/test_client.py +100 -0
  379. helm/{proxy/clients → clients}/test_huggingface_client.py +15 -16
  380. helm/clients/test_simple_client.py +19 -0
  381. helm/{proxy/clients → clients}/test_together_client.py +20 -8
  382. helm/{proxy/clients → clients}/together_client.py +104 -73
  383. helm/clients/vertexai_client.py +400 -0
  384. helm/clients/vision_language/__init__.py +0 -0
  385. helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
  386. helm/clients/vision_language/huggingface_vlm_client.py +111 -0
  387. helm/{proxy/clients → clients}/vision_language/idefics_client.py +54 -49
  388. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  389. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  390. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  391. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  392. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  393. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  394. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  395. helm/clients/vision_language/open_flamingo_client.py +155 -0
  396. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  397. helm/clients/vllm_client.py +46 -0
  398. helm/common/cache.py +16 -4
  399. helm/common/cache_backend_config.py +47 -0
  400. helm/common/clip_score_request.py +41 -0
  401. helm/common/file_caches/__init__.py +0 -0
  402. helm/common/file_caches/file_cache.py +16 -0
  403. helm/common/file_caches/local_file_cache.py +61 -0
  404. helm/common/file_caches/test_local_file_cache.py +25 -0
  405. helm/common/file_upload_request.py +27 -0
  406. helm/common/general.py +1 -1
  407. helm/common/image_generation_parameters.py +25 -0
  408. helm/common/images_utils.py +33 -3
  409. helm/common/key_value_store.py +35 -4
  410. helm/common/media_object.py +13 -0
  411. helm/common/moderations_api_request.py +71 -0
  412. helm/common/mongo_key_value_store.py +3 -3
  413. helm/common/multimodal_request_utils.py +31 -0
  414. helm/common/nudity_check_request.py +29 -0
  415. helm/common/request.py +15 -17
  416. helm/common/test_general.py +6 -0
  417. helm/common/tokenization_request.py +1 -1
  418. helm/config/model_deployments.yaml +1159 -538
  419. helm/config/model_metadata.yaml +868 -41
  420. helm/config/tokenizer_configs.yaml +149 -43
  421. helm/proxy/accounts.py +31 -4
  422. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  423. helm/proxy/critique/model_critique_client.py +8 -6
  424. helm/proxy/example_queries.py +29 -17
  425. helm/proxy/server.py +70 -5
  426. helm/proxy/services/remote_service.py +31 -0
  427. helm/proxy/services/server_service.py +96 -16
  428. helm/proxy/services/service.py +30 -0
  429. helm/proxy/services/test_remote_service.py +4 -3
  430. helm/proxy/services/test_service.py +0 -12
  431. helm/proxy/test_accounts.py +32 -0
  432. helm/proxy/token_counters/auto_token_counter.py +37 -37
  433. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  434. helm/proxy/token_counters/token_counter.py +3 -5
  435. helm/tokenizers/__init__.py +0 -0
  436. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  437. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +1 -1
  438. helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +6 -9
  439. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  440. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  441. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +7 -26
  442. helm/tokenizers/simple_tokenizer.py +33 -0
  443. helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +1 -1
  444. helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +3 -0
  445. helm/tokenizers/test_simple_tokenizer.py +33 -0
  446. helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
  447. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  448. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  449. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  450. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  451. crfm_helm-0.4.0.dist-info/RECORD +0 -397
  452. helm/benchmark/run_specs.py +0 -2762
  453. helm/benchmark/test_model_deployment_definition.py +0 -92
  454. helm/benchmark/test_model_properties.py +0 -1570
  455. helm/benchmark/vlm_run_specs.py +0 -97
  456. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  457. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  458. helm/benchmark/window_services/huggingface_window_service.py +0 -60
  459. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  460. helm/benchmark/window_services/t511b_window_service.py +0 -30
  461. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  462. helm/benchmark/window_services/ul2_window_service.py +0 -30
  463. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  464. helm/common/cache_utils.py +0 -14
  465. helm/proxy/clients/aleph_alpha_client.py +0 -95
  466. helm/proxy/clients/goose_ai_client.py +0 -99
  467. helm/proxy/clients/microsoft_client.py +0 -180
  468. helm/proxy/clients/openai_client.py +0 -206
  469. helm/proxy/clients/simple_client.py +0 -60
  470. helm/proxy/clients/test_client.py +0 -49
  471. helm/proxy/clients/vertexai_client.py +0 -115
  472. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  473. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  474. helm/proxy/token_counters/free_token_counter.py +0 -12
  475. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  476. helm/proxy/token_counters/openai_token_counter.py +0 -22
  477. helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
  478. helm/proxy/token_counters/test_openai_token_counter.py +0 -81
  479. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  480. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/LICENSE +0 -0
  481. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/entry_points.txt +0 -0
  482. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/top_level.txt +0 -0
  483. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  484. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  485. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  486. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  487. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  488. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  489. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  490. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  491. /helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +0 -0
  492. /helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +0 -0
  493. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  494. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  495. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  496. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  497. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  498. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  499. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
@@ -22,13 +22,11 @@ from typing import List, Optional, Dict, Any, Tuple, Set
22
22
 
23
23
  from tqdm import tqdm
24
24
  from helm.benchmark.model_deployment_registry import get_model_deployment
25
-
26
25
  from helm.benchmark.model_metadata_registry import get_unknown_model_metadata
27
26
  from helm.common.general import (
28
27
  write,
29
28
  ensure_directory_exists,
30
29
  asdict_without_nones,
31
- serialize_dates,
32
30
  parallel_map,
33
31
  singleton,
34
32
  unique_simplification,
@@ -42,27 +40,22 @@ from helm.benchmark.data_overlap.light_scenario import ScenarioSpecInstanceIds
42
40
  from helm.benchmark.metrics.metric_name import MetricName
43
41
  from helm.benchmark.metrics.metric import get_all_stats_by_name
44
42
  from helm.benchmark.metrics.statistic import Stat, merge_stat
45
- from helm.benchmark.runner import RunSpec, LATEST_SYMLINK
43
+ from helm.benchmark.run_spec import RunSpec
44
+ from helm.benchmark.runner import LATEST_SYMLINK
46
45
  from helm.benchmark.presentation.table import Cell, HeaderCell, Table, Hyperlink, table_to_latex
47
46
  from helm.benchmark.presentation.schema import (
48
47
  MetricNameMatcher,
49
48
  RunGroup,
49
+ Field,
50
50
  read_schema,
51
- SCHEMA_CLASSIC_YAML_FILENAME,
51
+ get_default_schema_path,
52
52
  BY_GROUP,
53
53
  THIS_GROUP_ONLY,
54
54
  NO_GROUPS,
55
55
  )
56
- from helm.benchmark.presentation.contamination import (
57
- read_contamination,
58
- validate_contamination,
59
- CONTAMINATION_SYMBOLS,
60
- CONTAMINATION_STYLES,
61
- CONTAMINATION_LEVEL_STRONG,
62
- )
63
56
  from helm.benchmark.config_registry import register_builtin_configs_from_helm_package, register_configs_from_directory
64
57
  from helm.benchmark.presentation.run_display import write_run_display_json
65
- from helm.benchmark.model_metadata_registry import ModelMetadata, get_model_metadata
58
+ from helm.benchmark.model_metadata_registry import ModelMetadata, get_model_metadata, get_all_models
66
59
 
67
60
 
68
61
  OVERLAP_N_COUNT = 13
@@ -172,7 +165,7 @@ def get_model_metadata_for_adapter_spec(adapter_spec: AdapterSpec) -> ModelMetad
172
165
  except ValueError:
173
166
  pass
174
167
 
175
- # Return a placeholder "unknoown model" model metadata.
168
+ # Return a placeholder "unknown model" model metadata.
176
169
  return get_unknown_model_metadata(adapter_spec.model)
177
170
 
178
171
 
@@ -238,15 +231,7 @@ def compute_aggregate_row_win_rates(table: Table, aggregation: str = "mean") ->
238
231
  if lower_is_better is None: # column does not have a meaningful ordering
239
232
  continue
240
233
 
241
- # sort row indices by cell value and then compute the number of wins as the index in the sorted list
242
- def is_cell_valid(cell: Cell) -> bool: # ignore cells which are strongly contaminated or have no value
243
- if cell.value is None:
244
- return False
245
- if cell.contamination_level and cell.contamination_level == CONTAMINATION_LEVEL_STRONG:
246
- return False
247
- return True
248
-
249
- values = [(row[i].value, j) for j, row in enumerate(table.rows) if is_cell_valid(row[i])]
234
+ values = [(row[i].value, j) for j, row in enumerate(table.rows) if row[i].value is not None]
250
235
  if len(values) < 2: # don't rank a single model
251
236
  continue
252
237
  for wins, (v, j) in enumerate(sorted(values, reverse=lower_is_better)):
@@ -309,7 +294,7 @@ class Summarizer:
309
294
  release: Optional[str],
310
295
  suites: Optional[List[str]],
311
296
  suite: Optional[str],
312
- schema_file: str,
297
+ schema_path: str,
313
298
  output_path: str,
314
299
  verbose: bool,
315
300
  num_threads: int,
@@ -329,7 +314,7 @@ class Summarizer:
329
314
  self.suites: List[str]
330
315
  self.run_suite_paths: List[str]
331
316
  self.suite: Optional[str] = None
332
- self.schema_file = schema_file
317
+ self.schema_path = schema_path
333
318
  self.release: Optional[str] = None
334
319
  if suite:
335
320
  self.suite = suite
@@ -347,9 +332,7 @@ class Summarizer:
347
332
 
348
333
  ensure_directory_exists(self.run_release_path)
349
334
 
350
- self.schema = read_schema(schema_file)
351
- self.contamination = read_contamination()
352
- validate_contamination(self.contamination, self.schema)
335
+ self.schema = read_schema(schema_path)
353
336
 
354
337
  def read_run(self, run_path: str) -> Run:
355
338
  """Load the `Run` object from `run_path`."""
@@ -377,7 +360,7 @@ class Summarizer:
377
360
  if run_group_name not in self.schema.name_to_run_group:
378
361
  hlog(
379
362
  f"WARNING: group {run_group_name} mentioned in run spec {run.run_spec.name} "
380
- f"but undefined in {self.schema_file}, skipping"
363
+ f"but undefined in {self.schema_path}, skipping"
381
364
  )
382
365
  continue
383
366
  run_group = self.schema.name_to_run_group[run_group_name]
@@ -433,11 +416,61 @@ class Summarizer:
433
416
  self.group_adapter_to_runs[group_name][adapter_spec].append(run)
434
417
  self.group_scenario_adapter_to_runs[group_name][scenario_spec][adapter_spec].append(run)
435
418
 
436
- def write_schema(self):
419
+ @dataclass(frozen=True)
420
+ class _ModelField(Field):
421
+ """The frontend version of ModelMetadata.
422
+
423
+ The frontend expects schema.json to contain a field under "models" that contains a list of `ModelField`s.
424
+
425
+ All attributes have the same meaning as in ModelMetadata."""
426
+
427
+ # TODO: Migrate frontend to use ModelMetadata instead of ModelField and delete this.
428
+ creator_organization: Optional[str] = None
429
+ access: Optional[str] = None
430
+ todo: bool = False
431
+ release_date: Optional[str] = None
432
+ num_parameters: Optional[int] = None
433
+
434
+ def get_model_field_dicts(self) -> List[Dict]:
435
+ """Get a list of `ModelField`s dicts that will be written to schema.json.
436
+
437
+ The frontend expects schema.json to contain a field under "models" that contains a list of `ModelField`s.
438
+
439
+ This is populated by reading the `ModelMetadata` configs and filtering down to models that were
440
+ actually used, and converting each `ModelMetadata` to a `ModelField`."""
441
+ # TODO: Migrate frontend to use ModelMetadata instead of ModelField and delete this.
442
+ used_model_names: Set[str] = set()
443
+ for run in self.runs:
444
+ used_model_names.add(get_model_metadata_for_adapter_spec(run.run_spec.adapter_spec).name)
445
+
446
+ model_field_dicts: List[Dict] = []
447
+ for model_name in get_all_models():
448
+ if model_name not in used_model_names:
449
+ continue
450
+ model_metadata = get_model_metadata(model_name)
451
+ model_field = Summarizer._ModelField(
452
+ name=model_metadata.name,
453
+ display_name=model_metadata.display_name,
454
+ short_display_name=model_metadata.display_name,
455
+ description=model_metadata.description,
456
+ creator_organization=model_metadata.creator_organization_name,
457
+ access=model_metadata.access,
458
+ todo=False,
459
+ release_date=model_metadata.release_date.isoformat() if model_metadata.release_date else None,
460
+ num_parameters=model_metadata.num_parameters,
461
+ )
462
+ model_field_dicts.append(asdict_without_nones(model_field))
463
+ return model_field_dicts
464
+
465
+ def write_schema(self) -> None:
437
466
  """Write the schema file to benchmark_output so the frontend knows about it."""
467
+ # Manually add the model metadata to the schema.json, where the frontend expects it.
468
+ # TODO: Move model metadata out of schema.json into its own model_metadata.json file.
469
+ raw_schema = asdict_without_nones(self.schema)
470
+ raw_schema["models"] = self.get_model_field_dicts()
438
471
  write(
439
472
  os.path.join(self.run_release_path, "schema.json"),
440
- json.dumps(asdict_without_nones(self.schema), indent=2, default=serialize_dates),
473
+ json.dumps(raw_schema, indent=2),
441
474
  )
442
475
 
443
476
  def read_runs(self):
@@ -512,6 +545,7 @@ class Summarizer:
512
545
 
513
546
  return file_metadata
514
547
 
548
+ # TODO: Delete this after @andyzorigin's project is done.
515
549
  self._model_group_overlap_stats: Dict[Tuple[str, str], GroupOverlapStats] = {}
516
550
 
517
551
  data_overlap_dir = os.path.join(self.run_release_path, "data_overlap")
@@ -594,7 +628,7 @@ class Summarizer:
594
628
  for metric_name, run_spec_names in metric_name_to_run_spec_names.items():
595
629
  if metric_name not in defined_metric_names:
596
630
  hlog(
597
- f"WARNING: metric name {metric_name} undefined in {self.schema_file} "
631
+ f"WARNING: metric name {metric_name} undefined in {self.schema_path} "
598
632
  f"but appears in {len(run_spec_names)} run specs, including {run_spec_names[0]}"
599
633
  )
600
634
 
@@ -726,9 +760,6 @@ class Summarizer:
726
760
  num_prompt_tokens.extend(get_all_stats_by_name(run.stats, "num_prompt_tokens"))
727
761
  num_completion_tokens.extend(get_all_stats_by_name(run.stats, "num_completion_tokens"))
728
762
 
729
- if len(num_instances) == 0:
730
- continue
731
-
732
763
  rows.append(
733
764
  [
734
765
  Cell(group.display_name, href=get_benchmarking_url({"group": group.name})),
@@ -762,9 +793,9 @@ class Summarizer:
762
793
  self,
763
794
  runs: List[Run],
764
795
  matcher: MetricNameMatcher,
765
- contamination_level: Optional[str],
766
796
  additional_info: Optional[str],
767
797
  hide_value: bool = False,
798
+ is_scenario_table: bool = False,
768
799
  ) -> Cell:
769
800
  """
770
801
  Use the metric name identified by `matcher` to pull out the stats from
@@ -818,18 +849,33 @@ class Summarizer:
818
849
  if self.verbose:
819
850
  description += "\n-- ".join(["\nRun specs:", *aggregated_run_specs])
820
851
 
821
- style: Dict[str, Any] = {}
822
- if contamination_level is not None:
823
- style = CONTAMINATION_STYLES.get(contamination_level, style)
824
-
825
- return Cell(value=value, description=description, style=style, contamination_level=contamination_level)
852
+ # Link the runs that this cell was aggregated from, if this is not a scenario table.
853
+ # Scenario tables link to the runs in the model cells,
854
+ # whereas non-scenario tables link to the runs in the metrics cells.
855
+ run_spec_names: Optional[List] = None
856
+ if not is_scenario_table:
857
+ # Deduplicate run spec names because aggregated_run_specs may have duplicated
858
+ # run specs if a run spec belongs to multiple groups.
859
+ run_spec_names = []
860
+ run_spec_names_set = set()
861
+ for run_spec_name in aggregated_run_specs:
862
+ if run_spec_name not in run_spec_names_set:
863
+ run_spec_names.append(run_spec_name)
864
+ run_spec_names_set.add(run_spec_name)
865
+
866
+ return Cell(
867
+ value=value,
868
+ description=description,
869
+ style={},
870
+ run_spec_names=run_spec_names,
871
+ )
826
872
 
827
873
  def create_group_table(
828
874
  self,
829
875
  name: str,
830
876
  title: str,
831
877
  adapter_to_runs: Dict[AdapterSpec, List[Run]],
832
- link_to_runs: bool,
878
+ is_scenario_table: bool,
833
879
  columns: List[Tuple[RunGroup, str]], # run_group, metric_group
834
880
  sort_by_model_order: bool = True,
835
881
  sub_split: Optional[str] = None,
@@ -868,7 +914,7 @@ class Summarizer:
868
914
  matcher = replace(matcher, sub_split=sub_split)
869
915
  header_field = self.schema.name_to_metric.get(matcher.name)
870
916
  if header_field is None:
871
- hlog(f"WARNING: metric name {matcher.name} undefined in {self.schema_file}, skipping")
917
+ hlog(f"WARNING: metric name {matcher.name} undefined in {self.schema_path}, skipping")
872
918
  continue
873
919
  metadata = {
874
920
  "metric": header_field.get_short_display_name(),
@@ -921,10 +967,10 @@ class Summarizer:
921
967
 
922
968
  adapter_specs: List[AdapterSpec] = list(adapter_to_runs.keys())
923
969
  if sort_by_model_order:
924
- # Sort models by the order defined in the schema.
925
- # Models not defined in the schema will be sorted alphabetically and
926
- # placed before models in defined the schema.
927
- model_order = [model.name for model in self.schema.models]
970
+ # Sort models by the order defined in the model metadata config.
971
+ # Models not defined in the model metadata config will be sorted alphabetically and
972
+ # placed before models defined in the model metadata config.
973
+ model_order = get_all_models()
928
974
 
929
975
  def _adapter_spec_sort_key(spec):
930
976
  index = model_order.index(spec.model_deployment) if spec.model_deployment in model_order else -1
@@ -948,21 +994,18 @@ class Summarizer:
948
994
  runs = adapter_to_runs[adapter_spec]
949
995
  display_name = get_method_display_name(model_metadata.display_name, info)
950
996
 
951
- # Link to all the runs under this model
952
- if link_to_runs:
997
+ # Link the runs that this row was aggregated from, if this is a scenario table.
998
+ # Scenario tables link to the runs in the model cells,
999
+ # whereas non-scenario tables link to the runs in the metrics cells.
1000
+ run_spec_names: Optional[List[str]]
1001
+ if is_scenario_table:
953
1002
  run_spec_names = [run.run_spec.name for run in runs]
954
1003
  href = run_spec_names_to_url(run_spec_names)
955
1004
  else:
1005
+ run_spec_names = None
956
1006
  href = None
957
1007
 
958
- # Render contamination information
959
- point = self.contamination.get_point(model_name, columns[0][0].name)
960
- if num_groups == 1 and point is not None: # display contamination information at the adapter level
961
- cells = [
962
- Cell(display_name + CONTAMINATION_SYMBOLS[point.level], description=point.description, href=href)
963
- ]
964
- else:
965
- cells = [Cell(display_name, description="", href=href)]
1008
+ cells = [Cell(display_name, description="", href=href, run_spec_names=run_spec_names)]
966
1009
  assert len(group_names) == len(matchers)
967
1010
  for group_name, matcher in zip(group_names, matchers):
968
1011
  group_runs = [run for run in runs if group_name in run.run_spec.groups]
@@ -971,13 +1014,7 @@ class Summarizer:
971
1014
  if "babi" in group_name and "task:" not in name:
972
1015
  group_runs = [run for run in group_runs if "task=all" in run.run_spec.name]
973
1016
 
974
- point = self.contamination.get_point(model_name, group_name)
975
- if point is not None:
976
- description = CONTAMINATION_SYMBOLS[point.level] + " " + point.description
977
- contamination_level = point.level
978
- else:
979
- description = ""
980
- contamination_level = None
1017
+ description = ""
981
1018
 
982
1019
  group_overlap_stats = None
983
1020
  if (model_name, group_name) in self._model_group_overlap_stats:
@@ -999,9 +1036,9 @@ class Summarizer:
999
1036
  self.create_cell(
1000
1037
  group_runs,
1001
1038
  matcher,
1002
- contamination_level,
1003
1039
  additional_info=description,
1004
1040
  hide_value=hide_value,
1041
+ is_scenario_table=is_scenario_table,
1005
1042
  )
1006
1043
  )
1007
1044
 
@@ -1011,7 +1048,7 @@ class Summarizer:
1011
1048
  # There could be a ton of runs, so only do this if there are 2-5
1012
1049
  # TODO: replace in frontend with a selector to choose which rows to visualize.
1013
1050
  links = []
1014
- if link_to_runs:
1051
+ if is_scenario_table:
1015
1052
  all_run_spec_names = []
1016
1053
  for adapter_spec, runs in adapter_to_runs.items():
1017
1054
  if len(runs) > 1:
@@ -1094,8 +1131,8 @@ class Summarizer:
1094
1131
  title=display_name,
1095
1132
  adapter_to_runs=adapter_to_runs,
1096
1133
  columns=[(subgroup, metric_group) for subgroup in subgroups],
1097
- link_to_runs=False,
1098
- add_win_rate=True,
1134
+ is_scenario_table=False,
1135
+ add_win_rate=not self.schema.name_to_metric_group[metric_group].hide_win_rates,
1099
1136
  )
1100
1137
  tables.append(table)
1101
1138
  return tables
@@ -1126,7 +1163,7 @@ class Summarizer:
1126
1163
  name=scenario_name,
1127
1164
  adapter_to_runs=adapter_to_runs,
1128
1165
  columns=columns,
1129
- link_to_runs=True,
1166
+ is_scenario_table=True,
1130
1167
  )
1131
1168
  tables.append(table)
1132
1169
  scenarios_shown += 1
@@ -1138,7 +1175,7 @@ class Summarizer:
1138
1175
  name=f"{subgroup.name}:sub_split={sub_split}",
1139
1176
  adapter_to_runs=adapter_to_runs,
1140
1177
  columns=columns,
1141
- link_to_runs=False,
1178
+ is_scenario_table=False,
1142
1179
  sub_split=sub_split,
1143
1180
  )
1144
1181
  tables.append(table)
@@ -1158,7 +1195,7 @@ class Summarizer:
1158
1195
  name=subgroup.name,
1159
1196
  adapter_to_runs=adapter_to_runs,
1160
1197
  columns=columns,
1161
- link_to_runs=False,
1198
+ is_scenario_table=False,
1162
1199
  )
1163
1200
  tables = [table] + tables
1164
1201
  all_tables.extend(tables)
@@ -1260,9 +1297,9 @@ class Summarizer:
1260
1297
  for scenario_spec_instance_ids_json in scenario_spec_instance_ids_jsons:
1261
1298
  scenario_spec_instance_ids_dict = json.loads(scenario_spec_instance_ids_json)
1262
1299
  scenario_spec_instance_ids = cattrs.structure(scenario_spec_instance_ids_dict, ScenarioSpecInstanceIds)
1263
- self.scenario_spec_instance_id_dict[
1264
- scenario_spec_instance_ids.scenario_spec
1265
- ] = scenario_spec_instance_ids.instance_ids
1300
+ self.scenario_spec_instance_id_dict[scenario_spec_instance_ids.scenario_spec] = (
1301
+ scenario_spec_instance_ids.instance_ids
1302
+ )
1266
1303
 
1267
1304
  def write_scenario_spec_instance_ids_json(self, file_path) -> None:
1268
1305
  for run in self.runs:
@@ -1304,8 +1341,6 @@ class Summarizer:
1304
1341
 
1305
1342
  def run_pipeline(self, skip_completed: bool, num_instances: int) -> None:
1306
1343
  """Run the entire summarization pipeline."""
1307
- self.write_schema()
1308
-
1309
1344
  self.read_runs()
1310
1345
  self.group_runs()
1311
1346
  self.check_metrics_defined()
@@ -1320,6 +1355,10 @@ class Summarizer:
1320
1355
  # because it uses self.scenario_spec_instance_id_dict
1321
1356
  self.read_overlap_stats()
1322
1357
 
1358
+ # Must happen after self.read_runs()
1359
+ # because it uses self.runs
1360
+ self.write_schema()
1361
+
1323
1362
  self.write_executive_summary()
1324
1363
  self.write_runs()
1325
1364
  self.write_run_specs()
@@ -1337,10 +1376,9 @@ def main():
1337
1376
  "-o", "--output-path", type=str, help="Where the benchmarking output lives", default="benchmark_output"
1338
1377
  )
1339
1378
  parser.add_argument(
1340
- "--schema-file",
1379
+ "--schema-path",
1341
1380
  type=str,
1342
- help="File name of the schema to read (e.g., schema_classic.yaml).",
1343
- default=SCHEMA_CLASSIC_YAML_FILENAME,
1381
+ help="Path to the schema file (e.g., schema_classic.yaml).",
1344
1382
  )
1345
1383
  parser.add_argument(
1346
1384
  "--suite",
@@ -1407,6 +1445,8 @@ def main():
1407
1445
  else:
1408
1446
  raise ValueError("Exactly one of --release or --suite must be specified.")
1409
1447
 
1448
+ schema_path = args.schema_path if args.schema_path else get_default_schema_path()
1449
+
1410
1450
  register_builtin_configs_from_helm_package()
1411
1451
  register_configs_from_directory(args.local_path)
1412
1452
 
@@ -1415,7 +1455,7 @@ def main():
1415
1455
  release=release,
1416
1456
  suites=suites,
1417
1457
  suite=suite,
1418
- schema_file=args.schema_file,
1458
+ schema_path=schema_path,
1419
1459
  output_path=args.output_path,
1420
1460
  verbose=args.debug,
1421
1461
  num_threads=args.num_threads,
@@ -4,26 +4,26 @@ from typing import Any, Optional, List, Dict
4
4
 
5
5
  @dataclass(frozen=True)
6
6
  class Cell:
7
- # Semantic value (that can be used for sorting)
8
7
  value: Any
8
+ """Semantic value (that can be used for sorting)"""
9
9
 
10
- # Optionally, if we want to render things specially (floating points to 3 decimal points)
11
10
  display_value: Optional[str] = None
11
+ """Optionally, if we want to render things specially (floating points to 3 decimal points)"""
12
12
 
13
- # Detailed description if hover over the cell
14
13
  description: Optional[str] = None
14
+ """Detailed description if hover over the cell"""
15
15
 
16
- # If we click on the link for this cell, it takes us somewhere
17
16
  href: Optional[str] = None
17
+ """If we click on the link for this cell, it takes us somewhere"""
18
18
 
19
- # Styling
20
19
  style: Optional[Dict[str, Any]] = None
20
+ """Styling"""
21
21
 
22
- # If the value or display_value is markdown that needs to be interpreted
23
22
  markdown: bool = False
23
+ """If the value or display_value is markdown that needs to be interpreted"""
24
24
 
25
- # How much train-test contamination affects the cell's value (`contamination.CONTAMINATION_LEVEL_{WEAK/STRONG}`)
26
- contamination_level: Optional[str] = None
25
+ run_spec_names: Optional[List[str]] = None
26
+ """The names of the runs that this cell's value was aggregated from, if the cell contains an aggregate value."""
27
27
 
28
28
 
29
29
  @dataclass(frozen=True)
@@ -1,9 +1,9 @@
1
- from helm.benchmark.presentation.schema import read_schema, SCHEMA_CLASSIC_YAML_FILENAME
1
+ from helm.benchmark.presentation.schema import read_schema, get_default_schema_path
2
2
  from helm.benchmark.presentation.contamination import read_contamination, validate_contamination
3
3
 
4
4
 
5
5
  def test_contamination_schema():
6
- schema = read_schema(SCHEMA_CLASSIC_YAML_FILENAME)
6
+ schema = read_schema(get_default_schema_path())
7
7
  contamination = read_contamination()
8
8
  validate_contamination(contamination, schema)
9
9
 
@@ -3,8 +3,7 @@ import pytest
3
3
 
4
4
  from helm.common.object_spec import parse_object_spec
5
5
  from helm.benchmark.presentation.run_entry import read_run_entries
6
- from helm.benchmark.run_specs import construct_run_specs
7
- from helm.benchmark import vlm_run_specs # noqa
6
+ from helm.benchmark.run_spec_factory import construct_run_specs
8
7
 
9
8
 
10
9
  def list_fnames():
@@ -2,7 +2,7 @@ import os
2
2
  import tempfile
3
3
 
4
4
  from helm.benchmark.presentation.summarize import Summarizer
5
- from helm.benchmark.presentation.schema import SCHEMA_CLASSIC_YAML_FILENAME
5
+ from helm.benchmark.presentation.schema import get_default_schema_path
6
6
  from helm.common.general import ensure_directory_exists
7
7
 
8
8
 
@@ -13,7 +13,7 @@ def test_summarize_suite():
13
13
  release=None,
14
14
  suites=None,
15
15
  suite="test_suite",
16
- schema_file=SCHEMA_CLASSIC_YAML_FILENAME,
16
+ schema_path=get_default_schema_path(),
17
17
  output_path=output_path,
18
18
  verbose=False,
19
19
  num_threads=4,
@@ -31,7 +31,7 @@ def test_summarize_release():
31
31
  release="test_release",
32
32
  suites=["test_suite_1", "test_suite_2"],
33
33
  suite=None,
34
- schema_file=SCHEMA_CLASSIC_YAML_FILENAME,
34
+ schema_path=get_default_schema_path(),
35
35
  output_path=output_path,
36
36
  verbose=False,
37
37
  num_threads=4,
helm/benchmark/run.py CHANGED
@@ -1,27 +1,26 @@
1
1
  import argparse
2
2
  from dataclasses import replace
3
+ import os
3
4
  from typing import List, Optional
4
- from helm.benchmark.huggingface_registration import (
5
- register_huggingface_hub_model_from_flag_value,
6
- register_huggingface_local_model_from_flag_value,
7
- )
5
+
8
6
 
9
7
  from helm.benchmark.presentation.run_entry import RunEntry, read_run_entries
8
+ from helm.common.cache_backend_config import MongoCacheBackendConfig, SqliteCacheBackendConfig
10
9
  from helm.common.general import ensure_directory_exists
11
10
  from helm.common.hierarchical_logger import hlog, htrack, htrack_block
12
11
  from helm.common.authentication import Authentication
13
12
  from helm.common.object_spec import parse_object_spec, get_class_by_name
14
13
  from helm.proxy.services.remote_service import create_authentication, add_service_args
14
+ from helm.proxy.services.service import CACHE_DIR
15
15
 
16
16
  from helm.benchmark.config_registry import (
17
17
  register_configs_from_directory,
18
18
  register_builtin_configs_from_helm_package,
19
19
  )
20
20
  from helm.benchmark.adaptation.adapter_spec import AdapterSpec
21
- from helm.benchmark import vlm_run_specs # noqa
22
- from .executor import ExecutionSpec
23
- from .runner import Runner, RunSpec, LATEST_SYMLINK, set_benchmark_output_path
24
- from .run_specs import construct_run_specs
21
+ from helm.benchmark.executor import ExecutionSpec
22
+ from helm.benchmark.runner import Runner, RunSpec, LATEST_SYMLINK, set_benchmark_output_path
23
+ from helm.benchmark.run_spec_factory import construct_run_specs
25
24
 
26
25
 
27
26
  def run_entries_to_run_specs(
@@ -85,16 +84,29 @@ def run_benchmarking(
85
84
  skip_completed_runs: bool,
86
85
  exit_on_error: bool,
87
86
  runner_class_name: Optional[str],
88
- mongo_uri: str = "",
87
+ mongo_uri: Optional[str] = None,
88
+ disable_cache: Optional[bool] = None,
89
89
  ) -> List[RunSpec]:
90
90
  """Runs RunSpecs given a list of RunSpec descriptions."""
91
+ sqlite_cache_backend_config: Optional[SqliteCacheBackendConfig] = None
92
+ mongo_cache_backend_config: Optional[MongoCacheBackendConfig] = None
93
+
94
+ if not disable_cache:
95
+ if mongo_uri:
96
+ mongo_cache_backend_config = MongoCacheBackendConfig(mongo_uri)
97
+ else:
98
+ sqlite_cache_path = os.path.join(local_path, CACHE_DIR)
99
+ ensure_directory_exists(sqlite_cache_path)
100
+ sqlite_cache_backend_config = SqliteCacheBackendConfig(sqlite_cache_path)
101
+
91
102
  execution_spec = ExecutionSpec(
92
103
  auth=auth,
93
104
  url=url,
94
105
  local_path=local_path,
95
106
  parallelism=num_threads,
96
107
  dry_run=dry_run,
97
- mongo_uri=mongo_uri,
108
+ sqlite_cache_backend_config=sqlite_cache_backend_config,
109
+ mongo_cache_backend_config=mongo_cache_backend_config,
98
110
  )
99
111
  with htrack_block("run_specs"):
100
112
  for run_spec in run_specs:
@@ -160,13 +172,6 @@ def add_run_args(parser: argparse.ArgumentParser):
160
172
  help="Name of the suite this run belongs to (default is today's date).",
161
173
  required=True,
162
174
  )
163
- parser.add_argument(
164
- "--local",
165
- action="store_true",
166
- help="DEPRECATED: Does nothing. Do not use. Previously enabled local mode. "
167
- "Now does nothing and will be removed in the next released version. "
168
- "Local mode is enabled by default, and only disabled if the --server_url flag is set.",
169
- )
170
175
  parser.add_argument(
171
176
  "--local-path",
172
177
  type=str,
@@ -179,6 +184,11 @@ def add_run_args(parser: argparse.ArgumentParser):
179
184
  help="If non-empty, the URL of the MongoDB database that will be used for caching instead of SQLite",
180
185
  default="",
181
186
  )
187
+ parser.add_argument(
188
+ "--disable-cache",
189
+ action="store_true",
190
+ help="If true, the request-response cache for model clients and tokenizers will be disabled.",
191
+ )
182
192
 
183
193
 
184
194
  def validate_args(args):
@@ -227,7 +237,14 @@ def main():
227
237
  help="Run RunSpecs with priority less than or equal to this number. "
228
238
  "If a value for --priority is not specified, run on everything",
229
239
  )
230
- parser.add_argument("-r", "--run-specs", nargs="*", help="Specifies what to run", default=[])
240
+ parser.add_argument(
241
+ "--run-specs",
242
+ nargs="*",
243
+ help="DEPRECATED: Use --run-entries instead. Will be removed in a future release. "
244
+ "Specifies run entries to run.",
245
+ default=[],
246
+ )
247
+ parser.add_argument("-r", "--run-entries", nargs="*", help="Specifies run entries to run", default=[])
231
248
  parser.add_argument(
232
249
  "--enable-huggingface-models",
233
250
  nargs="+",
@@ -254,14 +271,25 @@ def main():
254
271
  register_builtin_configs_from_helm_package()
255
272
  register_configs_from_directory(args.local_path)
256
273
 
257
- for huggingface_model_name in args.enable_huggingface_models:
258
- register_huggingface_hub_model_from_flag_value(huggingface_model_name)
259
- for huggingface_model_path in args.enable_local_huggingface_models:
260
- register_huggingface_local_model_from_flag_value(huggingface_model_path)
274
+ if args.enable_huggingface_models:
275
+ from helm.benchmark.huggingface_registration import register_huggingface_hub_model_from_flag_value
276
+
277
+ for huggingface_model_name in args.enable_huggingface_models:
278
+ register_huggingface_hub_model_from_flag_value(huggingface_model_name)
279
+ if args.enable_local_huggingface_models:
280
+ from helm.benchmark.huggingface_registration import register_huggingface_local_model_from_flag_value
281
+
282
+ for huggingface_model_path in args.enable_local_huggingface_models:
283
+ register_huggingface_local_model_from_flag_value(huggingface_model_path)
261
284
 
262
285
  run_entries: List[RunEntry] = []
263
286
  if args.conf_paths:
264
287
  run_entries.extend(read_run_entries(args.conf_paths).entries)
288
+ if args.run_entries:
289
+ run_entries.extend(
290
+ [RunEntry(description=description, priority=1, groups=None) for description in args.run_entries]
291
+ )
292
+ # TODO: Remove this eventually.
265
293
  if args.run_specs:
266
294
  run_entries.extend(
267
295
  [RunEntry(description=description, priority=1, groups=None) for description in args.run_specs]
@@ -306,13 +334,13 @@ def main():
306
334
  exit_on_error=args.exit_on_error,
307
335
  runner_class_name=args.runner_class_name,
308
336
  mongo_uri=args.mongo_uri,
337
+ disable_cache=args.disable_cache,
309
338
  )
310
339
 
311
- if args.local:
340
+ if args.run_specs:
312
341
  hlog(
313
- "WARNING: The --local flag is deprecated. It now does nothing and will be removed in "
314
- "the next released version. Local mode is enabled by default, and only disabled if the "
315
- "--server_url flag is set. Please remove --local from your command."
342
+ "WARNING: The --run-specs flag is deprecated and will be removed in a future release. "
343
+ "Use --run-entries instead."
316
344
  )
317
345
 
318
346
  hlog("Done.")