crfm-helm 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +144 -36
  2. crfm_helm-0.5.0.dist-info/RECORD +642 -0
  3. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +37 -2
  5. helm/benchmark/adaptation/adapters/adapter.py +4 -42
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/generation_adapter.py +2 -0
  9. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +21 -4
  10. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +12 -5
  11. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +1 -0
  12. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  13. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  14. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +5 -1
  15. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +1 -0
  16. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -0
  17. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  18. helm/benchmark/adaptation/adapters/test_generation_adapter.py +59 -14
  19. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +40 -5
  20. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +78 -10
  21. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  22. helm/benchmark/adaptation/prompt.py +7 -1
  23. helm/benchmark/adaptation/request_state.py +6 -1
  24. helm/benchmark/adaptation/scenario_state.py +6 -2
  25. helm/benchmark/annotation/annotator.py +43 -0
  26. helm/benchmark/annotation/annotator_factory.py +61 -0
  27. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  28. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  29. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  30. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  31. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  32. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  33. helm/benchmark/annotation_executor.py +124 -0
  34. helm/benchmark/augmentations/cleva_perturbation.py +7 -14
  35. helm/benchmark/augmentations/contraction_expansion_perturbation.py +3 -3
  36. helm/benchmark/augmentations/contrast_sets_perturbation.py +0 -3
  37. helm/benchmark/augmentations/data_augmenter.py +0 -2
  38. helm/benchmark/augmentations/dialect_perturbation.py +2 -2
  39. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  40. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  41. helm/benchmark/augmentations/gender_perturbation.py +3 -3
  42. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  43. helm/benchmark/augmentations/mild_mix_perturbation.py +2 -2
  44. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  45. helm/benchmark/augmentations/person_name_perturbation.py +0 -7
  46. helm/benchmark/augmentations/perturbation.py +20 -7
  47. helm/benchmark/augmentations/perturbation_description.py +1 -1
  48. helm/benchmark/augmentations/space_perturbation.py +2 -2
  49. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  50. helm/benchmark/augmentations/synonym_perturbation.py +2 -2
  51. helm/benchmark/augmentations/test_perturbation.py +11 -7
  52. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  53. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  54. helm/benchmark/config_registry.py +38 -0
  55. helm/benchmark/executor.py +46 -16
  56. helm/benchmark/huggingface_registration.py +37 -7
  57. helm/benchmark/metrics/basic_metrics.py +172 -641
  58. helm/benchmark/metrics/bbq_metrics.py +3 -4
  59. helm/benchmark/metrics/bias_metrics.py +6 -6
  60. helm/benchmark/metrics/classification_metrics.py +11 -8
  61. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  62. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  63. helm/benchmark/metrics/code_metrics.py +4 -3
  64. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  65. helm/benchmark/metrics/common_metric_specs.py +167 -0
  66. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  67. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  68. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  69. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  70. helm/benchmark/metrics/disinformation_metrics.py +6 -112
  71. helm/benchmark/metrics/dry_run_metrics.py +5 -3
  72. helm/benchmark/metrics/efficiency_metrics.py +206 -0
  73. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  74. helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
  75. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  76. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  77. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  78. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  79. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  80. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  81. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  82. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  83. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  84. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  85. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  86. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  87. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  88. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  89. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  90. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  91. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  92. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  93. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  94. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  95. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  96. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  97. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  98. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  99. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  100. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  101. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  102. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  103. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  104. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  105. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  106. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  107. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  108. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  109. helm/benchmark/metrics/machine_translation_metrics.py +5 -5
  110. helm/benchmark/metrics/metric.py +93 -172
  111. helm/benchmark/metrics/metric_name.py +0 -1
  112. helm/benchmark/metrics/metric_service.py +16 -0
  113. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  114. helm/benchmark/metrics/ranking_metrics.py +6 -7
  115. helm/benchmark/metrics/reference_metric.py +148 -0
  116. helm/benchmark/metrics/summac/model_summac.py +0 -2
  117. helm/benchmark/metrics/summarization_metrics.py +8 -8
  118. helm/benchmark/metrics/test_classification_metrics.py +9 -6
  119. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  120. helm/benchmark/metrics/test_evaluate_reference_metrics.py +30 -0
  121. helm/benchmark/metrics/test_metric.py +2 -2
  122. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +1 -1
  123. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +13 -3
  124. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  125. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -0
  126. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +9 -2
  127. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  128. helm/benchmark/metrics/toxicity_utils.py +23 -0
  129. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  130. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  131. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  132. helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
  133. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  134. helm/benchmark/model_deployment_registry.py +164 -41
  135. helm/benchmark/model_metadata_registry.py +181 -35
  136. helm/benchmark/multi_gpu_runner.py +133 -0
  137. helm/benchmark/presentation/contamination.py +3 -3
  138. helm/benchmark/presentation/create_plots.py +8 -7
  139. helm/benchmark/presentation/run_display.py +50 -17
  140. helm/benchmark/presentation/schema.py +28 -46
  141. helm/benchmark/presentation/summarize.py +213 -96
  142. helm/benchmark/presentation/table.py +8 -8
  143. helm/benchmark/presentation/test_contamination.py +2 -2
  144. helm/benchmark/presentation/test_run_entry.py +14 -9
  145. helm/benchmark/presentation/test_summarize.py +5 -0
  146. helm/benchmark/run.py +66 -54
  147. helm/benchmark/run_expander.py +342 -31
  148. helm/benchmark/run_spec.py +93 -0
  149. helm/benchmark/run_spec_factory.py +162 -0
  150. helm/benchmark/run_specs/__init__.py +0 -0
  151. helm/benchmark/{run_specs.py → run_specs/classic_run_specs.py} +217 -1330
  152. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  153. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  154. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  155. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  156. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  157. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  158. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  159. helm/benchmark/run_specs/vlm_run_specs.py +501 -0
  160. helm/benchmark/runner.py +116 -69
  161. helm/benchmark/runner_config_registry.py +21 -0
  162. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  163. helm/benchmark/scenarios/bold_scenario.py +2 -2
  164. helm/benchmark/scenarios/cleva_scenario.py +43 -46
  165. helm/benchmark/scenarios/code_scenario.py +3 -2
  166. helm/benchmark/scenarios/commonsense_scenario.py +171 -191
  167. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  168. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  169. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  170. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  171. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  172. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  173. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  174. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  175. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  176. helm/benchmark/scenarios/entity_matching_scenario.py +1 -1
  177. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  178. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  179. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  180. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  181. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  182. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  183. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  184. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  185. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  186. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  187. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  188. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  189. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  190. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  191. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  192. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  193. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  194. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  195. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  196. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  197. helm/benchmark/scenarios/legalbench_scenario.py +123 -0
  198. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  199. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  200. helm/benchmark/scenarios/lsat_qa_scenario.py +4 -2
  201. helm/benchmark/scenarios/math_scenario.py +19 -2
  202. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  203. helm/benchmark/scenarios/numeracy_scenario.py +3 -3
  204. helm/benchmark/scenarios/opinions_qa_scenario.py +6 -10
  205. helm/benchmark/scenarios/raft_scenario.py +2 -6
  206. helm/benchmark/scenarios/scenario.py +14 -2
  207. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  208. helm/benchmark/scenarios/test_math_scenario.py +22 -0
  209. helm/benchmark/scenarios/test_scenario.py +6 -3
  210. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  211. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  212. helm/benchmark/scenarios/the_pile_scenario.py +6 -7
  213. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  214. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  215. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  216. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  217. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
  218. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  219. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  220. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  221. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  222. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  223. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  224. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  225. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  226. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  227. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  228. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  229. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  230. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  231. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  232. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  233. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  234. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  235. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  236. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  237. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +107 -0
  238. helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
  239. helm/benchmark/scenarios/wmt_14_scenario.py +18 -18
  240. helm/benchmark/server.py +59 -2
  241. helm/benchmark/slurm_jobs.py +12 -0
  242. helm/benchmark/slurm_runner.py +79 -51
  243. helm/benchmark/static/benchmarking.js +3 -4
  244. helm/benchmark/static/contamination.yaml +1 -1
  245. helm/benchmark/static/images/organizations/together.png +0 -0
  246. helm/benchmark/static/json-urls.js +4 -0
  247. helm/benchmark/static/{schema.yaml → schema_classic.yaml} +346 -930
  248. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  249. helm/benchmark/static/schema_lite.yaml +824 -0
  250. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  251. helm/benchmark/static/schema_unitxt.yaml +428 -0
  252. helm/benchmark/static/schema_vlm.yaml +576 -0
  253. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  254. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  255. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  256. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  257. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  258. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  259. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  260. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  261. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  262. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  263. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  264. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  265. helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
  266. helm/benchmark/static_build/assets/index-d839df55.js +9 -0
  267. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  268. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  269. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  270. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  271. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  272. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  273. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  274. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  275. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  276. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  277. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  278. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  279. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  280. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  281. helm/benchmark/static_build/config.js +4 -0
  282. helm/benchmark/static_build/index.html +20 -0
  283. helm/benchmark/test_data_preprocessor.py +3 -3
  284. helm/benchmark/test_model_deployment_definition.py +90 -0
  285. helm/benchmark/test_run_expander.py +1 -1
  286. helm/benchmark/tokenizer_config_registry.py +10 -14
  287. helm/benchmark/window_services/ai21_window_service.py +22 -33
  288. helm/benchmark/window_services/cohere_window_service.py +1 -63
  289. helm/benchmark/window_services/default_window_service.py +2 -35
  290. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  291. helm/benchmark/window_services/ice_window_service.py +0 -34
  292. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  293. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  294. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  295. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  296. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  297. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  298. helm/benchmark/window_services/local_window_service.py +21 -4
  299. helm/benchmark/window_services/no_decoding_window_service.py +32 -0
  300. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  301. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  302. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  303. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  304. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  305. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  306. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  307. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  308. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  309. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  310. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  311. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  312. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  313. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  314. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  315. helm/benchmark/window_services/test_utils.py +3 -2
  316. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  317. helm/benchmark/window_services/window_service.py +42 -0
  318. helm/benchmark/window_services/window_service_factory.py +24 -269
  319. helm/benchmark/window_services/yalm_window_service.py +0 -27
  320. helm/clients/__init__.py +0 -0
  321. helm/{proxy/clients → clients}/ai21_client.py +5 -12
  322. helm/clients/aleph_alpha_client.py +112 -0
  323. helm/{proxy/clients → clients}/anthropic_client.py +213 -24
  324. helm/clients/auto_client.py +215 -0
  325. helm/clients/bedrock_client.py +128 -0
  326. helm/clients/bedrock_utils.py +72 -0
  327. helm/{proxy/clients → clients}/client.py +67 -55
  328. helm/clients/clip_score_client.py +49 -0
  329. helm/clients/clip_scorers/__init__.py +0 -0
  330. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  331. helm/clients/clip_scorers/clip_scorer.py +50 -0
  332. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  333. helm/{proxy/clients → clients}/cohere_client.py +6 -17
  334. helm/clients/gcs_client.py +82 -0
  335. helm/{proxy/clients → clients}/google_client.py +7 -8
  336. helm/clients/google_translate_client.py +35 -0
  337. helm/{proxy/clients → clients}/http_model_client.py +6 -10
  338. helm/{proxy/clients → clients}/huggingface_client.py +134 -92
  339. helm/clients/image_generation/__init__.py +0 -0
  340. helm/clients/image_generation/adobe_vision_client.py +78 -0
  341. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  342. helm/clients/image_generation/cogview2/__init__.py +0 -0
  343. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  344. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  345. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  346. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  347. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  348. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  349. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  350. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  351. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  352. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  353. helm/clients/image_generation/cogview2_client.py +191 -0
  354. helm/clients/image_generation/dalle2_client.py +192 -0
  355. helm/clients/image_generation/dalle3_client.py +108 -0
  356. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  357. helm/clients/image_generation/dalle_mini/data.py +442 -0
  358. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  359. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  360. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  361. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  362. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  363. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  364. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  365. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  366. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  367. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  368. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  369. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  370. helm/clients/image_generation/dalle_mini_client.py +190 -0
  371. helm/clients/image_generation/deep_floyd_client.py +78 -0
  372. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  373. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  374. helm/clients/image_generation/lexica_client.py +86 -0
  375. helm/clients/image_generation/mindalle/__init__.py +0 -0
  376. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  377. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  378. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  379. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  380. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  381. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  382. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  383. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  384. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  385. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  386. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  387. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  388. helm/clients/image_generation/mindalle_client.py +115 -0
  389. helm/clients/image_generation/nudity_check_client.py +64 -0
  390. helm/clients/image_generation/together_image_generation_client.py +111 -0
  391. helm/{proxy/clients → clients}/lit_gpt_client.py +7 -5
  392. helm/{proxy/clients → clients}/megatron_client.py +13 -7
  393. helm/clients/mistral_client.py +134 -0
  394. helm/clients/moderation_api_client.py +109 -0
  395. helm/clients/open_lm_client.py +43 -0
  396. helm/clients/openai_client.py +302 -0
  397. helm/{proxy/clients → clients}/palmyra_client.py +15 -12
  398. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  399. helm/clients/simple_client.py +64 -0
  400. helm/{proxy/clients → clients}/test_auto_client.py +15 -15
  401. helm/clients/test_client.py +100 -0
  402. helm/clients/test_huggingface_client.py +70 -0
  403. helm/clients/test_simple_client.py +19 -0
  404. helm/{proxy/clients → clients}/test_together_client.py +23 -12
  405. helm/{proxy/clients → clients}/together_client.py +18 -71
  406. helm/clients/vertexai_client.py +391 -0
  407. helm/clients/vision_language/__init__.py +0 -0
  408. helm/clients/vision_language/huggingface_vlm_client.py +104 -0
  409. helm/{proxy/clients → clients}/vision_language/idefics_client.py +59 -52
  410. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  411. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  412. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  413. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  414. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  415. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  416. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  417. helm/clients/vision_language/open_flamingo_client.py +155 -0
  418. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  419. helm/clients/vllm_client.py +46 -0
  420. helm/common/cache.py +24 -179
  421. helm/common/cache_backend_config.py +47 -0
  422. helm/common/clip_score_request.py +41 -0
  423. helm/common/concurrency.py +32 -0
  424. helm/common/credentials_utils.py +28 -0
  425. helm/common/file_caches/__init__.py +0 -0
  426. helm/common/file_caches/file_cache.py +16 -0
  427. helm/common/file_caches/local_file_cache.py +61 -0
  428. helm/common/file_caches/test_local_file_cache.py +25 -0
  429. helm/common/file_upload_request.py +27 -0
  430. helm/common/general.py +29 -10
  431. helm/common/image_generation_parameters.py +25 -0
  432. helm/common/images_utils.py +24 -1
  433. helm/common/key_value_store.py +113 -0
  434. helm/common/media_object.py +13 -0
  435. helm/common/moderations_api_request.py +71 -0
  436. helm/common/mongo_key_value_store.py +88 -0
  437. helm/common/multimodal_request_utils.py +31 -0
  438. helm/common/nudity_check_request.py +29 -0
  439. helm/common/object_spec.py +2 -2
  440. helm/common/request.py +36 -27
  441. helm/common/test_general.py +6 -0
  442. helm/common/tokenization_request.py +6 -3
  443. helm/config/__init__.py +0 -0
  444. helm/config/model_deployments.yaml +1942 -0
  445. helm/config/model_metadata.yaml +2201 -0
  446. helm/config/tokenizer_configs.yaml +362 -0
  447. helm/proxy/accounts.py +31 -4
  448. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  449. helm/proxy/critique/model_critique_client.py +13 -5
  450. helm/proxy/example_queries.py +29 -17
  451. helm/proxy/retry.py +8 -2
  452. helm/proxy/server.py +77 -5
  453. helm/proxy/services/remote_service.py +31 -0
  454. helm/proxy/services/server_service.py +103 -20
  455. helm/proxy/services/service.py +34 -2
  456. helm/proxy/services/test_remote_service.py +7 -6
  457. helm/proxy/services/test_service.py +27 -18
  458. helm/proxy/test_accounts.py +32 -0
  459. helm/proxy/token_counters/auto_token_counter.py +37 -37
  460. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  461. helm/proxy/token_counters/token_counter.py +3 -5
  462. helm/py.typed +0 -0
  463. helm/tokenizers/__init__.py +0 -0
  464. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  465. helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +3 -1
  466. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +17 -11
  467. helm/tokenizers/auto_tokenizer.py +93 -0
  468. helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +8 -2
  469. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  470. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  471. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +56 -60
  472. helm/tokenizers/simple_tokenizer.py +33 -0
  473. helm/tokenizers/test_anthropic_tokenizer.py +82 -0
  474. helm/tokenizers/test_huggingface_tokenizer.py +136 -0
  475. helm/tokenizers/test_simple_tokenizer.py +33 -0
  476. helm/tokenizers/vertexai_tokenizer.py +97 -0
  477. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  478. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  479. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  480. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  481. crfm_helm-0.3.0.dist-info/RECORD +0 -396
  482. helm/benchmark/vlm_run_specs.py +0 -71
  483. helm/benchmark/window_services/anthropic_window_service.py +0 -68
  484. helm/benchmark/window_services/bloom_window_service.py +0 -35
  485. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  486. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  487. helm/benchmark/window_services/gptj_window_service.py +0 -38
  488. helm/benchmark/window_services/gptneox_window_service.py +0 -41
  489. helm/benchmark/window_services/http_model_window_service.py +0 -28
  490. helm/benchmark/window_services/huggingface_window_service.py +0 -59
  491. helm/benchmark/window_services/lit_gpt_window_service.py +0 -27
  492. helm/benchmark/window_services/llama_window_service.py +0 -28
  493. helm/benchmark/window_services/luminous_window_service.py +0 -67
  494. helm/benchmark/window_services/megatron_window_service.py +0 -10
  495. helm/benchmark/window_services/mt_nlg_window_service.py +0 -27
  496. helm/benchmark/window_services/openai_window_service.py +0 -13
  497. helm/benchmark/window_services/opt_window_service.py +0 -35
  498. helm/benchmark/window_services/palmyra_window_service.py +0 -45
  499. helm/benchmark/window_services/remote_window_service.py +0 -48
  500. helm/benchmark/window_services/santacoder_window_service.py +0 -27
  501. helm/benchmark/window_services/starcoder_window_service.py +0 -27
  502. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  503. helm/benchmark/window_services/t511b_window_service.py +0 -30
  504. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  505. helm/benchmark/window_services/ul2_window_service.py +0 -30
  506. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  507. helm/benchmark/window_services/wider_openai_window_service.py +0 -52
  508. helm/proxy/clients/aleph_alpha_client.py +0 -99
  509. helm/proxy/clients/auto_client.py +0 -461
  510. helm/proxy/clients/goose_ai_client.py +0 -100
  511. helm/proxy/clients/microsoft_client.py +0 -182
  512. helm/proxy/clients/openai_client.py +0 -206
  513. helm/proxy/clients/remote_model_registry.py +0 -28
  514. helm/proxy/clients/simple_client.py +0 -61
  515. helm/proxy/clients/test_anthropic_client.py +0 -63
  516. helm/proxy/clients/test_client.py +0 -31
  517. helm/proxy/clients/test_huggingface_client.py +0 -87
  518. helm/proxy/models.py +0 -963
  519. helm/proxy/test_models.py +0 -27
  520. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  521. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  522. helm/proxy/token_counters/free_token_counter.py +0 -12
  523. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  524. helm/proxy/token_counters/openai_token_counter.py +0 -22
  525. helm/proxy/token_counters/test_ai21_token_counter.py +0 -86
  526. helm/proxy/token_counters/test_openai_token_counter.py +0 -79
  527. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  528. helm/proxy/tokenizers/test_huggingface_tokenizer.py +0 -56
  529. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
  530. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
  531. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
  532. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  533. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  534. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  535. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  536. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  537. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  538. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  539. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  540. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  541. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  542. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  543. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  544. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  545. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  546. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
@@ -21,7 +21,8 @@ from statistics import mean, median
21
21
  from typing import List, Optional, Dict, Any, Tuple, Set
22
22
 
23
23
  from tqdm import tqdm
24
-
24
+ from helm.benchmark.model_deployment_registry import get_model_deployment
25
+ from helm.benchmark.model_metadata_registry import get_unknown_model_metadata
25
26
  from helm.common.general import (
26
27
  write,
27
28
  ensure_directory_exists,
@@ -39,25 +40,22 @@ from helm.benchmark.data_overlap.light_scenario import ScenarioSpecInstanceIds
39
40
  from helm.benchmark.metrics.metric_name import MetricName
40
41
  from helm.benchmark.metrics.metric import get_all_stats_by_name
41
42
  from helm.benchmark.metrics.statistic import Stat, merge_stat
42
- from helm.benchmark.runner import RunSpec, LATEST_SYMLINK
43
+ from helm.benchmark.run_spec import RunSpec
44
+ from helm.benchmark.runner import LATEST_SYMLINK
43
45
  from helm.benchmark.presentation.table import Cell, HeaderCell, Table, Hyperlink, table_to_latex
44
46
  from helm.benchmark.presentation.schema import (
45
47
  MetricNameMatcher,
46
48
  RunGroup,
49
+ Field,
47
50
  read_schema,
48
- SCHEMA_YAML_FILENAME,
51
+ get_default_schema_path,
49
52
  BY_GROUP,
50
53
  THIS_GROUP_ONLY,
51
54
  NO_GROUPS,
52
55
  )
53
- from helm.benchmark.presentation.contamination import (
54
- read_contamination,
55
- validate_contamination,
56
- CONTAMINATION_SYMBOLS,
57
- CONTAMINATION_STYLES,
58
- CONTAMINATION_LEVEL_STRONG,
59
- )
56
+ from helm.benchmark.config_registry import register_builtin_configs_from_helm_package, register_configs_from_directory
60
57
  from helm.benchmark.presentation.run_display import write_run_display_json
58
+ from helm.benchmark.model_metadata_registry import ModelMetadata, get_model_metadata, get_all_models
61
59
 
62
60
 
63
61
  OVERLAP_N_COUNT = 13
@@ -139,6 +137,38 @@ def get_scenario_name(group: RunGroup, scenario_spec: ScenarioSpec):
139
137
  return group.name + "_" + dict_to_str(scenario_spec.args).replace(" ", "").replace("/", "_")
140
138
 
141
139
 
140
+ def get_model_metadata_for_adapter_spec(adapter_spec: AdapterSpec) -> ModelMetadata:
141
+ """Return the ModelMetadata for the model in the given AdapterSpec."""
142
+ # Get model metadata based on `model` in `adapter_spec`
143
+ try:
144
+ return get_model_metadata(adapter_spec.model)
145
+ except ValueError:
146
+ pass
147
+
148
+ # Get model metadata based on `model_deployment` in `adapter_spec`
149
+ try:
150
+ model_deployment = get_model_deployment(adapter_spec.model_deployment)
151
+ if model_deployment.model_name:
152
+ return get_model_metadata(model_deployment.model_name)
153
+ except ValueError:
154
+ pass
155
+
156
+ # In some cases, some models were renamed such that the old model name is now the model deployment name
157
+ # For instance, the model called "huggingface/gpt2" is now called "openai/gpt2", but its model deployment
158
+ # is still called "huggingface/gpt2".
159
+ # Handle these cases here.
160
+ # TODO: Delete this block eventually.
161
+ try:
162
+ model_deployment = get_model_deployment(adapter_spec.model)
163
+ if model_deployment.model_name:
164
+ return get_model_metadata(model_deployment.model_name)
165
+ except ValueError:
166
+ pass
167
+
168
+ # Return a placeholder "unknown model" model metadata.
169
+ return get_unknown_model_metadata(adapter_spec.model)
170
+
171
+
142
172
  def get_coarse_adapter_spec(
143
173
  adapter_spec: AdapterSpec, scenario_spec: Optional[ScenarioSpec] = None, adapter_keys_shown: List[str] = []
144
174
  ) -> AdapterSpec:
@@ -165,7 +195,7 @@ def get_coarse_adapter_spec(
165
195
 
166
196
  # Create a new adapter_spec, keeping only the model and the keys in adapter_keys_shown
167
197
  adapter_spec_kwargs = {key: adapter_spec.__dict__[key] for key in adapter_keys_shown}
168
- return AdapterSpec(**adapter_spec_kwargs) # type: ignore
198
+ return AdapterSpec(**adapter_spec_kwargs)
169
199
 
170
200
 
171
201
  def get_method_display_name(model_display_name: Optional[str], info: Dict[str, Any]) -> str:
@@ -178,6 +208,8 @@ def get_method_display_name(model_display_name: Optional[str], info: Dict[str, A
178
208
  info = dict(info)
179
209
  if "model" in info:
180
210
  del info["model"]
211
+ if "model_deployment" in info:
212
+ del info["model_deployment"]
181
213
 
182
214
  return (model_display_name or "???") + (f" [{dict_to_str(info)}]" if len(info) > 0 else "")
183
215
 
@@ -199,15 +231,7 @@ def compute_aggregate_row_win_rates(table: Table, aggregation: str = "mean") ->
199
231
  if lower_is_better is None: # column does not have a meaningful ordering
200
232
  continue
201
233
 
202
- # sort row indices by cell value and then compute the number of wins as the index in the sorted list
203
- def is_cell_valid(cell: Cell) -> bool: # ignore cells which are strongly contaminated or have no value
204
- if cell.value is None:
205
- return False
206
- if cell.contamination_level and cell.contamination_level == CONTAMINATION_LEVEL_STRONG:
207
- return False
208
- return True
209
-
210
- values = [(row[i].value, j) for j, row in enumerate(table.rows) if is_cell_valid(row[i])]
234
+ values = [(row[i].value, j) for j, row in enumerate(table.rows) if row[i].value is not None]
211
235
  if len(values) < 2: # don't rank a single model
212
236
  continue
213
237
  for wins, (v, j) in enumerate(sorted(values, reverse=lower_is_better)):
@@ -270,9 +294,11 @@ class Summarizer:
270
294
  release: Optional[str],
271
295
  suites: Optional[List[str]],
272
296
  suite: Optional[str],
297
+ schema_path: str,
273
298
  output_path: str,
274
299
  verbose: bool,
275
300
  num_threads: int,
301
+ allow_unknown_models: bool,
276
302
  ):
277
303
  """
278
304
  A note on the relation between `release`, `suites`, and `suite`:
@@ -288,6 +314,7 @@ class Summarizer:
288
314
  self.suites: List[str]
289
315
  self.run_suite_paths: List[str]
290
316
  self.suite: Optional[str] = None
317
+ self.schema_path = schema_path
291
318
  self.release: Optional[str] = None
292
319
  if suite:
293
320
  self.suite = suite
@@ -301,12 +328,11 @@ class Summarizer:
301
328
  self.run_suite_paths = [os.path.join(output_path, "runs", suite) for suite in suites]
302
329
  self.verbose: bool = verbose
303
330
  self.num_threads: int = num_threads
331
+ self.allow_unknown_models: bool = allow_unknown_models
304
332
 
305
333
  ensure_directory_exists(self.run_release_path)
306
334
 
307
- self.schema = read_schema()
308
- self.contamination = read_contamination()
309
- validate_contamination(self.contamination, self.schema)
335
+ self.schema = read_schema(schema_path)
310
336
 
311
337
  def read_run(self, run_path: str) -> Run:
312
338
  """Load the `Run` object from `run_path`."""
@@ -334,7 +360,7 @@ class Summarizer:
334
360
  if run_group_name not in self.schema.name_to_run_group:
335
361
  hlog(
336
362
  f"WARNING: group {run_group_name} mentioned in run spec {run.run_spec.name} "
337
- f"but undefined in {SCHEMA_YAML_FILENAME}, skipping"
363
+ f"but undefined in {self.schema_path}, skipping"
338
364
  )
339
365
  continue
340
366
  run_group = self.schema.name_to_run_group[run_group_name]
@@ -355,7 +381,13 @@ class Summarizer:
355
381
  """Load the runs in the run suite path."""
356
382
  # run_suite_path can contain subdirectories that are not runs (e.g. eval_cache, groups)
357
383
  # so filter them out.
358
- run_dir_names = sorted([p for p in os.listdir(run_suite_path) if p != "eval_cache" and p != "groups"])
384
+ run_dir_names = sorted(
385
+ [
386
+ p
387
+ for p in os.listdir(run_suite_path)
388
+ if p != "eval_cache" and p != "groups" and os.path.isdir(os.path.join(run_suite_path, p))
389
+ ]
390
+ )
359
391
  for run_dir_name in tqdm(run_dir_names, disable=None):
360
392
  run_spec_path: str = os.path.join(run_suite_path, run_dir_name, "run_spec.json")
361
393
  stats_path: str = os.path.join(run_suite_path, run_dir_name, "stats.json")
@@ -363,13 +395,8 @@ class Summarizer:
363
395
  hlog(f"WARNING: {run_dir_name} doesn't have run_spec.json or stats.json, skipping")
364
396
  continue
365
397
  run_path: str = os.path.join(run_suite_path, run_dir_name)
366
- self.runs.append(self.read_run(run_path))
367
-
368
- # For each group (e.g., natural_qa), map
369
- # (i) scenario spec (e.g., subject=philosophy) [optional] and
370
- # (ii) adapter spec (e.g., model = openai/davinci)
371
- # to list of runs
372
- for run in self.runs:
398
+ run = self.read_run(run_path)
399
+ self.runs.append(run)
373
400
  if run.run_spec.name in self.runs_to_run_suites:
374
401
  hlog(
375
402
  f"WARNING: Run entry {run.run_spec.name} is present in two different Run Suites. "
@@ -377,12 +404,75 @@ class Summarizer:
377
404
  )
378
405
  self.runs_to_run_suites[run.run_spec.name] = suite
379
406
 
407
+ def group_runs(self):
408
+ # For each group (e.g., natural_qa), map
409
+ # (i) scenario spec (e.g., subject=philosophy) [optional] and
410
+ # (ii) adapter spec (e.g., model = openai/davinci)
411
+ # to list of runs
412
+ for run in self.runs:
380
413
  scenario_spec = run.run_spec.scenario_spec
381
414
  adapter_spec = run.run_spec.adapter_spec
382
415
  for group_name in run.run_spec.groups:
383
416
  self.group_adapter_to_runs[group_name][adapter_spec].append(run)
384
417
  self.group_scenario_adapter_to_runs[group_name][scenario_spec][adapter_spec].append(run)
385
418
 
419
+ @dataclass(frozen=True)
420
+ class _ModelField(Field):
421
+ """The frontend version of ModelMetadata.
422
+
423
+ The frontend expects schema.json to contain a field under "model" that contains a list of `ModelField`s.
424
+
425
+ All attributes have the same meaning as in ModelMetadata."""
426
+
427
+ # TODO: Migrate frontend to use ModelMetadata instead of ModelField and delete this.
428
+ creator_organization: Optional[str] = None
429
+ access: Optional[str] = None
430
+ todo: bool = False
431
+ release_date: Optional[str] = None
432
+ num_parameters: Optional[int] = None
433
+
434
+ def get_model_field_dicts(self) -> List[Dict]:
435
+ """Get a list of `ModelField`s dicts that will be written to schema.json.
436
+
437
+ The frontend expects schema.json to contain a field under "model" that contains a list of `ModelField`s.
438
+
439
+ This is populated by reading the `ModelMetadata` configs and filtering down to models that were
440
+ actually used, and converting each `ModelMetadata` to a `ModelField`."""
441
+ # TODO: Migrate frontend to use ModelMetadata instead of ModelField and delete this.
442
+ used_model_names: Set[str] = set()
443
+ for run in self.runs:
444
+ used_model_names.add(get_model_metadata_for_adapter_spec(run.run_spec.adapter_spec).name)
445
+
446
+ model_field_dicts: List[Dict] = []
447
+ for model_name in get_all_models():
448
+ if model_name not in used_model_names:
449
+ continue
450
+ model_metadata = get_model_metadata(model_name)
451
+ model_field = Summarizer._ModelField(
452
+ name=model_metadata.name,
453
+ display_name=model_metadata.display_name,
454
+ short_display_name=model_metadata.display_name,
455
+ description=model_metadata.description,
456
+ creator_organization=model_metadata.creator_organization_name,
457
+ access=model_metadata.access,
458
+ todo=False,
459
+ release_date=model_metadata.release_date.isoformat() if model_metadata.release_date else None,
460
+ num_parameters=model_metadata.num_parameters,
461
+ )
462
+ model_field_dicts.append(asdict_without_nones(model_field))
463
+ return model_field_dicts
464
+
465
+ def write_schema(self) -> None:
466
+ """Write the schema file to benchmark_output so the frontend knows about it."""
467
+ # Manually add the model metadata to the schema.json, where the frontend expects it.
468
+ # TODO: Move model metadata out of schema.json into its own model_metadata.json file.
469
+ raw_schema = asdict_without_nones(self.schema)
470
+ raw_schema["models"] = self.get_model_field_dicts()
471
+ write(
472
+ os.path.join(self.run_release_path, "schema.json"),
473
+ json.dumps(raw_schema, indent=2),
474
+ )
475
+
386
476
  def read_runs(self):
387
477
  self.runs: List[Run] = []
388
478
  self.runs_to_run_suites: Dict[str, str] = {}
@@ -455,6 +545,7 @@ class Summarizer:
455
545
 
456
546
  return file_metadata
457
547
 
548
+ # TODO: Delete this after @andyzorigin's project is done.
458
549
  self._model_group_overlap_stats: Dict[Tuple[str, str], GroupOverlapStats] = {}
459
550
 
460
551
  data_overlap_dir = os.path.join(self.run_release_path, "data_overlap")
@@ -537,7 +628,7 @@ class Summarizer:
537
628
  for metric_name, run_spec_names in metric_name_to_run_spec_names.items():
538
629
  if metric_name not in defined_metric_names:
539
630
  hlog(
540
- f"WARNING: metric name {metric_name} undefined in {SCHEMA_YAML_FILENAME} "
631
+ f"WARNING: metric name {metric_name} undefined in {self.schema_path} "
541
632
  f"but appears in {len(run_spec_names)} run specs, including {run_spec_names[0]}"
542
633
  )
543
634
 
@@ -564,12 +655,12 @@ class Summarizer:
564
655
  # TODO: move to write_executive_summary()
565
656
  models_to_costs: Dict[str, Dict[str]] = defaultdict(lambda: defaultdict(int))
566
657
  for run in self.runs:
567
- model: str = run.run_spec.adapter_spec.model
658
+ deployment: str = run.run_spec.adapter_spec.model_deployment
568
659
 
569
660
  for stat in run.stats:
570
661
  stat_name = stat.name.name
571
662
  if stat_name in Summarizer.COST_REPORT_FIELDS and not stat.name.split:
572
- models_to_costs[model][stat_name] += stat.sum
663
+ models_to_costs[deployment][stat_name] += stat.sum
573
664
 
574
665
  # Do a second pass to add up the total number of tokens
575
666
  for costs in models_to_costs.values():
@@ -639,7 +730,8 @@ class Summarizer:
639
730
  header = [
640
731
  HeaderCell("Group"),
641
732
  HeaderCell("Description"),
642
- # Synchronize these names with `schema.yaml`
733
+ # Synchronize these names with the appropriate schema file
734
+ # TODO: different schema files might have different fields (for multimodal)
643
735
  HeaderCell("Adaptation method", description="Adaptation strategy (e.g., generation)"),
644
736
  HeaderCell("# instances", description="Number of instances evaluated on"),
645
737
  HeaderCell("# references", description="Number of references provided per instance"),
@@ -660,7 +752,7 @@ class Summarizer:
660
752
  for subgroup in self.expand_subgroups(group):
661
753
  for adapter_spec, runs in self.group_adapter_to_runs[subgroup.name].items():
662
754
  filtered_runs = self.filter_runs_by_visibility(runs, subgroup)
663
- models.add(adapter_spec.model)
755
+ models.add(adapter_spec.model_deployment)
664
756
  methods.add(adapter_spec.method)
665
757
  for run in filtered_runs:
666
758
  num_instances.extend(get_all_stats_by_name(run.stats, "num_instances"))
@@ -668,9 +760,6 @@ class Summarizer:
668
760
  num_prompt_tokens.extend(get_all_stats_by_name(run.stats, "num_prompt_tokens"))
669
761
  num_completion_tokens.extend(get_all_stats_by_name(run.stats, "num_completion_tokens"))
670
762
 
671
- if len(num_instances) == 0:
672
- continue
673
-
674
763
  rows.append(
675
764
  [
676
765
  Cell(group.display_name, href=get_benchmarking_url({"group": group.name})),
@@ -704,9 +793,9 @@ class Summarizer:
704
793
  self,
705
794
  runs: List[Run],
706
795
  matcher: MetricNameMatcher,
707
- contamination_level: Optional[str],
708
796
  additional_info: Optional[str],
709
797
  hide_value: bool = False,
798
+ is_scenario_table: bool = False,
710
799
  ) -> Cell:
711
800
  """
712
801
  Use the metric name identified by `matcher` to pull out the stats from
@@ -760,18 +849,33 @@ class Summarizer:
760
849
  if self.verbose:
761
850
  description += "\n-- ".join(["\nRun specs:", *aggregated_run_specs])
762
851
 
763
- style: Dict[str, Any] = {}
764
- if contamination_level is not None:
765
- style = CONTAMINATION_STYLES.get(contamination_level, style)
766
-
767
- return Cell(value=value, description=description, style=style, contamination_level=contamination_level)
852
+ # Link the runs that this cell was aggregated from, if this is not a scenario table.
853
+ # Scenario tables link to the runs in the model cells,
854
+ # whereas non-scenario tables link to the runs in the metrics cells.
855
+ run_spec_names: Optional[List] = None
856
+ if not is_scenario_table:
857
+ # Deduplicate run spec names because aggregated_run_specs may have duplicated
858
+ # run specs if a run spec belongs to multiple groups.
859
+ run_spec_names = []
860
+ run_spec_names_set = set()
861
+ for run_spec_name in aggregated_run_specs:
862
+ if run_spec_name not in run_spec_names_set:
863
+ run_spec_names.append(run_spec_name)
864
+ run_spec_names_set.add(run_spec_name)
865
+
866
+ return Cell(
867
+ value=value,
868
+ description=description,
869
+ style={},
870
+ run_spec_names=run_spec_names,
871
+ )
768
872
 
769
873
  def create_group_table(
770
874
  self,
771
875
  name: str,
772
876
  title: str,
773
877
  adapter_to_runs: Dict[AdapterSpec, List[Run]],
774
- link_to_runs: bool,
878
+ is_scenario_table: bool,
775
879
  columns: List[Tuple[RunGroup, str]], # run_group, metric_group
776
880
  sort_by_model_order: bool = True,
777
881
  sub_split: Optional[str] = None,
@@ -810,7 +914,7 @@ class Summarizer:
810
914
  matcher = replace(matcher, sub_split=sub_split)
811
915
  header_field = self.schema.name_to_metric.get(matcher.name)
812
916
  if header_field is None:
813
- hlog(f"WARNING: metric name {matcher.name} undefined in {SCHEMA_YAML_FILENAME}, skipping")
917
+ hlog(f"WARNING: metric name {matcher.name} undefined in {self.schema_path}, skipping")
814
918
  continue
815
919
  metadata = {
816
920
  "metric": header_field.get_short_display_name(),
@@ -863,55 +967,45 @@ class Summarizer:
863
967
 
864
968
  adapter_specs: List[AdapterSpec] = list(adapter_to_runs.keys())
865
969
  if sort_by_model_order:
866
- # Sort models by the order defined in the schema.
867
- # Models not defined in the schema will be sorted alphabetically and
868
- # placed before models in defined the schema.
869
- model_order = [model.name for model in self.schema.models]
970
+ # Sort models by the order defined in the model metadata config.
971
+ # Models not defined in the model metadata config will be sorted alphabetically and
972
+ # placed before models defined in the model metadata config.
973
+ model_order = get_all_models()
870
974
 
871
975
  def _adapter_spec_sort_key(spec):
872
- index = model_order.index(spec.model) if spec.model in model_order else -1
873
- return (index, spec.model)
976
+ index = model_order.index(spec.model_deployment) if spec.model_deployment in model_order else -1
977
+ return (index, spec.model_deployment)
874
978
 
875
979
  adapter_specs = list(sorted(adapter_specs, key=_adapter_spec_sort_key))
876
980
 
877
981
  # Pull out only the keys of the method adapter_spec that is needed to
878
982
  # uniquely identify the method.
879
- infos = unique_simplification(list(map(asdict_without_nones, adapter_specs)), ["model"])
983
+ infos = unique_simplification(list(map(asdict_without_nones, adapter_specs)), ["model_deployment", "model"])
880
984
 
881
985
  assert len(adapter_specs) == len(infos), [adapter_specs, infos]
882
986
 
883
987
  # Populate the contents of the table
884
988
  rows = []
885
989
  for adapter_spec, info in zip(adapter_specs, infos):
886
- model_name: str = adapter_spec.model
887
-
888
- # Get the model display name from the schema.
889
- # Fall back to using the model name as the model display name if the model is not
890
- # defined in the schema.
891
- model_display_name = (
892
- self.schema.name_to_model[model_name].display_name
893
- if model_name in self.schema.name_to_model
894
- else model_name
895
- )
990
+ model_metadata = get_model_metadata_for_adapter_spec(adapter_spec)
991
+
992
+ model_name: str = model_metadata.name
896
993
 
897
994
  runs = adapter_to_runs[adapter_spec]
898
- display_name = get_method_display_name(model_display_name, info)
995
+ display_name = get_method_display_name(model_metadata.display_name, info)
899
996
 
900
- # Link to all the runs under this model
901
- if link_to_runs:
997
+ # Link the runs that this row was aggregated from, if this is a scenario table.
998
+ # Scenario tables link to the runs in the model cells,
999
+ # whereas non-scenario tables link to the runs in the metrics cells.
1000
+ run_spec_names: Optional[List[str]]
1001
+ if is_scenario_table:
902
1002
  run_spec_names = [run.run_spec.name for run in runs]
903
1003
  href = run_spec_names_to_url(run_spec_names)
904
1004
  else:
1005
+ run_spec_names = None
905
1006
  href = None
906
1007
 
907
- # Render contamination information
908
- point = self.contamination.get_point(model_name, columns[0][0].name)
909
- if num_groups == 1 and point is not None: # display contamination information at the adapter level
910
- cells = [
911
- Cell(display_name + CONTAMINATION_SYMBOLS[point.level], description=point.description, href=href)
912
- ]
913
- else:
914
- cells = [Cell(display_name, description="", href=href)]
1008
+ cells = [Cell(display_name, description="", href=href, run_spec_names=run_spec_names)]
915
1009
  assert len(group_names) == len(matchers)
916
1010
  for group_name, matcher in zip(group_names, matchers):
917
1011
  group_runs = [run for run in runs if group_name in run.run_spec.groups]
@@ -920,13 +1014,7 @@ class Summarizer:
920
1014
  if "babi" in group_name and "task:" not in name:
921
1015
  group_runs = [run for run in group_runs if "task=all" in run.run_spec.name]
922
1016
 
923
- point = self.contamination.get_point(model_name, group_name)
924
- if point is not None:
925
- description = CONTAMINATION_SYMBOLS[point.level] + " " + point.description
926
- contamination_level = point.level
927
- else:
928
- description = ""
929
- contamination_level = None
1017
+ description = ""
930
1018
 
931
1019
  group_overlap_stats = None
932
1020
  if (model_name, group_name) in self._model_group_overlap_stats:
@@ -948,9 +1036,9 @@ class Summarizer:
948
1036
  self.create_cell(
949
1037
  group_runs,
950
1038
  matcher,
951
- contamination_level,
952
1039
  additional_info=description,
953
1040
  hide_value=hide_value,
1041
+ is_scenario_table=is_scenario_table,
954
1042
  )
955
1043
  )
956
1044
 
@@ -960,7 +1048,7 @@ class Summarizer:
960
1048
  # There could be a ton of runs, so only do this if there are 2-5
961
1049
  # TODO: replace in frontend with a selector to choose which rows to visualize.
962
1050
  links = []
963
- if link_to_runs:
1051
+ if is_scenario_table:
964
1052
  all_run_spec_names = []
965
1053
  for adapter_spec, runs in adapter_to_runs.items():
966
1054
  if len(runs) > 1:
@@ -1043,8 +1131,8 @@ class Summarizer:
1043
1131
  title=display_name,
1044
1132
  adapter_to_runs=adapter_to_runs,
1045
1133
  columns=[(subgroup, metric_group) for subgroup in subgroups],
1046
- link_to_runs=False,
1047
- add_win_rate=True,
1134
+ is_scenario_table=False,
1135
+ add_win_rate=not self.schema.name_to_metric_group[metric_group].hide_win_rates,
1048
1136
  )
1049
1137
  tables.append(table)
1050
1138
  return tables
@@ -1075,7 +1163,7 @@ class Summarizer:
1075
1163
  name=scenario_name,
1076
1164
  adapter_to_runs=adapter_to_runs,
1077
1165
  columns=columns,
1078
- link_to_runs=True,
1166
+ is_scenario_table=True,
1079
1167
  )
1080
1168
  tables.append(table)
1081
1169
  scenarios_shown += 1
@@ -1087,7 +1175,7 @@ class Summarizer:
1087
1175
  name=f"{subgroup.name}:sub_split={sub_split}",
1088
1176
  adapter_to_runs=adapter_to_runs,
1089
1177
  columns=columns,
1090
- link_to_runs=False,
1178
+ is_scenario_table=False,
1091
1179
  sub_split=sub_split,
1092
1180
  )
1093
1181
  tables.append(table)
@@ -1107,7 +1195,7 @@ class Summarizer:
1107
1195
  name=subgroup.name,
1108
1196
  adapter_to_runs=adapter_to_runs,
1109
1197
  columns=columns,
1110
- link_to_runs=False,
1198
+ is_scenario_table=False,
1111
1199
  )
1112
1200
  tables = [table] + tables
1113
1201
  all_tables.extend(tables)
@@ -1209,9 +1297,9 @@ class Summarizer:
1209
1297
  for scenario_spec_instance_ids_json in scenario_spec_instance_ids_jsons:
1210
1298
  scenario_spec_instance_ids_dict = json.loads(scenario_spec_instance_ids_json)
1211
1299
  scenario_spec_instance_ids = cattrs.structure(scenario_spec_instance_ids_dict, ScenarioSpecInstanceIds)
1212
- self.scenario_spec_instance_id_dict[
1213
- scenario_spec_instance_ids.scenario_spec
1214
- ] = scenario_spec_instance_ids.instance_ids
1300
+ self.scenario_spec_instance_id_dict[scenario_spec_instance_ids.scenario_spec] = (
1301
+ scenario_spec_instance_ids.instance_ids
1302
+ )
1215
1303
 
1216
1304
  def write_scenario_spec_instance_ids_json(self, file_path) -> None:
1217
1305
  for run in self.runs:
@@ -1249,11 +1337,12 @@ class Summarizer:
1249
1337
  if os.path.islink(symlink_path):
1250
1338
  # Remove the previous symlink if it exists.
1251
1339
  os.unlink(symlink_path)
1252
- os.symlink(os.path.abspath(self.run_release_path), symlink_path)
1340
+ os.symlink(os.path.basename(self.run_release_path), symlink_path)
1253
1341
 
1254
1342
  def run_pipeline(self, skip_completed: bool, num_instances: int) -> None:
1255
- """Run the entire summarization pipeline pipeline."""
1343
+ """Run the entire summarization pipeline."""
1256
1344
  self.read_runs()
1345
+ self.group_runs()
1257
1346
  self.check_metrics_defined()
1258
1347
 
1259
1348
  self.write_run_display_json(skip_completed)
@@ -1266,6 +1355,10 @@ class Summarizer:
1266
1355
  # because it uses self.scenario_spec_instance_id_dict
1267
1356
  self.read_overlap_stats()
1268
1357
 
1358
+ # Must happen after self.read_runs()
1359
+ # because it uses self.runs
1360
+ self.write_schema()
1361
+
1269
1362
  self.write_executive_summary()
1270
1363
  self.write_runs()
1271
1364
  self.write_run_specs()
@@ -1276,12 +1369,17 @@ class Summarizer:
1276
1369
  self.symlink_latest()
1277
1370
 
1278
1371
 
1279
- @htrack(None)
1372
+ @htrack("summarize")
1280
1373
  def main():
1281
1374
  parser = argparse.ArgumentParser()
1282
1375
  parser.add_argument(
1283
1376
  "-o", "--output-path", type=str, help="Where the benchmarking output lives", default="benchmark_output"
1284
1377
  )
1378
+ parser.add_argument(
1379
+ "--schema-path",
1380
+ type=str,
1381
+ help="Path to the schema file (e.g., schema_classic.yaml).",
1382
+ )
1285
1383
  parser.add_argument(
1286
1384
  "--suite",
1287
1385
  type=str,
@@ -1312,6 +1410,18 @@ def main():
1312
1410
  help="Number of instance ids we're using; only for annotating scenario spec instance ids file",
1313
1411
  default=1000,
1314
1412
  )
1413
+ parser.add_argument(
1414
+ "--local-path",
1415
+ type=str,
1416
+ help="If running locally, the path for `ServerService`.",
1417
+ default="prod_env",
1418
+ )
1419
+ parser.add_argument(
1420
+ "--allow-unknown-models",
1421
+ type=bool,
1422
+ help="Whether to allow unknown models in the metadata file",
1423
+ default=True,
1424
+ )
1315
1425
  args = parser.parse_args()
1316
1426
 
1317
1427
  release: Optional[str] = None
@@ -1335,14 +1445,21 @@ def main():
1335
1445
  else:
1336
1446
  raise ValueError("Exactly one of --release or --suite must be specified.")
1337
1447
 
1448
+ schema_path = args.schema_path if args.schema_path else get_default_schema_path()
1449
+
1450
+ register_builtin_configs_from_helm_package()
1451
+ register_configs_from_directory(args.local_path)
1452
+
1338
1453
  # Output JSON files summarizing the benchmark results which will be loaded in the web interface
1339
1454
  summarizer = Summarizer(
1340
1455
  release=release,
1341
1456
  suites=suites,
1342
1457
  suite=suite,
1458
+ schema_path=schema_path,
1343
1459
  output_path=args.output_path,
1344
1460
  verbose=args.debug,
1345
1461
  num_threads=args.num_threads,
1462
+ allow_unknown_models=args.allow_unknown_models,
1346
1463
  )
1347
1464
  summarizer.run_pipeline(skip_completed=args.skip_completed_run_display_json, num_instances=args.num_instances)
1348
1465
  hlog("Done.")
@@ -4,26 +4,26 @@ from typing import Any, Optional, List, Dict
4
4
 
5
5
  @dataclass(frozen=True)
6
6
  class Cell:
7
- # Semantic value (that can be used for sorting)
8
7
  value: Any
8
+ """Semantic value (that can be used for sorting)"""
9
9
 
10
- # Optionally, if we want to render things specially (floating points to 3 decimal points)
11
10
  display_value: Optional[str] = None
11
+ """Optionally, if we want to render things specially (floating points to 3 decimal points)"""
12
12
 
13
- # Detailed description if hover over the cell
14
13
  description: Optional[str] = None
14
+ """Detailed description if hover over the cell"""
15
15
 
16
- # If we click on the link for this cell, it takes us somewhere
17
16
  href: Optional[str] = None
17
+ """If we click on the link for this cell, it takes us somewhere"""
18
18
 
19
- # Styling
20
19
  style: Optional[Dict[str, Any]] = None
20
+ """Styling"""
21
21
 
22
- # If the value or display_value is markdown that needs to be interpreted
23
22
  markdown: bool = False
23
+ """If the value or display_value is markdown that needs to be interpreted"""
24
24
 
25
- # How much train-test contamination affects the cell's value (`contamination.CONTAMINATION_LEVEL_{WEAK/STRONG}`)
26
- contamination_level: Optional[str] = None
25
+ run_spec_names: Optional[List[str]] = None
26
+ """The names of the runs that this cell's value was aggregated from, if the cell contains an aggregate value."""
27
27
 
28
28
 
29
29
  @dataclass(frozen=True)
@@ -1,9 +1,9 @@
1
- from helm.benchmark.presentation.schema import read_schema
1
+ from helm.benchmark.presentation.schema import read_schema, get_default_schema_path
2
2
  from helm.benchmark.presentation.contamination import read_contamination, validate_contamination
3
3
 
4
4
 
5
5
  def test_contamination_schema():
6
- schema = read_schema()
6
+ schema = read_schema(get_default_schema_path())
7
7
  contamination = read_contamination()
8
8
  validate_contamination(contamination, schema)
9
9