PyPI - crfm-helm - Versions diffs - 0.5.1__py3-none-any.whl → 0.5.2__py3-none-any.whl - Mend

crfm-helm: 0.5.1 (py3-none-any.whl) → 0.5.2 (py3-none-any.whl)

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of crfm-helm might be problematic. Click here for more details.

Files changed (98)

{crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/METADATA +13 -3
{crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/RECORD +96 -63
helm/benchmark/adaptation/adapter_spec.py +32 -31
helm/benchmark/annotation/air_bench_annotator.py +64 -0
helm/benchmark/annotation/annotator_factory.py +6 -0
helm/benchmark/annotation/live_qa_annotator.py +84 -0
helm/benchmark/annotation/medication_qa_annotator.py +81 -0
helm/benchmark/augmentations/translate_perturbation.py +1 -0
helm/benchmark/huggingface_registration.py +16 -6
helm/benchmark/metrics/air_bench_metrics.py +56 -0
helm/benchmark/metrics/fin_qa_metrics.py +60 -0
helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
helm/benchmark/metrics/live_qa_metrics.py +23 -0
helm/benchmark/metrics/medication_qa_metrics.py +23 -0
helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
helm/benchmark/metrics/unitxt_metrics.py +20 -10
helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
helm/benchmark/metrics/vision_language/image_metrics.py +29 -71
helm/benchmark/presentation/schema.py +54 -4
helm/benchmark/presentation/test_schema.py +11 -0
helm/benchmark/run.py +16 -2
helm/benchmark/run_expander.py +77 -0
helm/benchmark/run_spec_factory.py +4 -0
helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
helm/benchmark/run_specs/classic_run_specs.py +15 -11
helm/benchmark/run_specs/decodingtrust_run_specs.py +3 -1
helm/benchmark/run_specs/experimental_run_specs.py +33 -0
helm/benchmark/run_specs/finance_run_specs.py +33 -0
helm/benchmark/run_specs/vlm_run_specs.py +168 -45
helm/benchmark/scenarios/air_bench_scenario.py +50 -0
helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
helm/benchmark/scenarios/fin_qa_scenario.py +117 -0
helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
helm/benchmark/scenarios/vision_language/bingo_scenario.py +3 -3
helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +13 -2
helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +1 -5
helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +0 -4
helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +4 -2
helm/benchmark/scenarios/vision_language/pairs_scenario.py +6 -5
helm/benchmark/scenarios/vision_language/unicorn_scenario.py +3 -3
helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +95 -0
helm/benchmark/static/schema_air_bench.yaml +3149 -0
helm/benchmark/static/schema_classic.yaml +3 -59
helm/benchmark/static/schema_finance.yaml +143 -0
helm/benchmark/static/schema_image2structure.yaml +254 -111
helm/benchmark/static/schema_instruction_following.yaml +3 -52
helm/benchmark/static/schema_lite.yaml +3 -61
helm/benchmark/static/schema_medical.yaml +255 -0
helm/benchmark/static/schema_mmlu.yaml +3 -61
helm/benchmark/static/schema_tables.yaml +200 -0
helm/benchmark/static/schema_thai.yaml +223 -0
helm/benchmark/static/schema_unitxt.yaml +3 -61
helm/benchmark/static/{schema_vlm.yaml → schema_vhelm.yaml} +294 -293
helm/benchmark/static/schema_vhelm_lite.yaml +4 -59
helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
helm/benchmark/static_build/assets/index-30dbceba.js +10 -0
helm/benchmark/static_build/assets/index-66b02d40.css +1 -0
helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
helm/benchmark/static_build/index.html +2 -2
helm/clients/anthropic_client.py +43 -9
helm/clients/auto_client.py +11 -0
helm/clients/client.py +24 -7
helm/clients/cohere_client.py +98 -3
helm/clients/huggingface_client.py +71 -12
helm/clients/openai_client.py +9 -2
helm/clients/reka_client.py +189 -0
helm/clients/test_client.py +3 -3
helm/clients/test_huggingface_client.py +19 -3
helm/clients/test_together_client.py +72 -2
helm/clients/together_client.py +129 -23
helm/clients/vertexai_client.py +62 -18
helm/clients/vision_language/huggingface_vlm_client.py +1 -0
helm/clients/vision_language/paligemma_client.py +146 -0
helm/clients/vision_language/palmyra_vision_client.py +84 -0
helm/clients/yi_client.py +31 -0
helm/common/critique_request.py +10 -1
helm/common/images_utils.py +19 -0
helm/config/model_deployments.yaml +412 -18
helm/config/model_metadata.yaml +447 -25
helm/config/tokenizer_configs.yaml +93 -1
helm/proxy/critique/model_critique_client.py +32 -4
helm/proxy/services/server_service.py +1 -1
helm/tokenizers/auto_tokenizer.py +1 -1
helm/tokenizers/cohere_tokenizer.py +44 -2
helm/tokenizers/huggingface_tokenizer.py +36 -13
helm/tokenizers/test_cohere_tokenizer.py +39 -0
helm/tokenizers/test_huggingface_tokenizer.py +5 -1
helm/benchmark/static_build/assets/index-737eef9e.js +0 -10
helm/benchmark/static_build/assets/index-878a1094.css +0 -1
{crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/LICENSE +0 -0
{crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/WHEEL +0 -0
{crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/entry_points.txt +0 -0
{crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/top_level.txt +0 -0

{crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: crfm-helm
-Version: 0.5.1
+Version: 0.5.2
 Summary: Benchmark for language models
 Home-page: https://github.com/stanford-crfm/helm
 Author: Stanford CRFM
@@ -70,6 +70,8 @@ Requires-Dist: pypinyin ==0.49.0 ; extra == 'cleva'
 Requires-Dist: jieba ==0.42.1 ; extra == 'cleva'
 Requires-Dist: opencc ==1.1.6 ; extra == 'cleva'
 Requires-Dist: langdetect ==1.0.9 ; extra == 'cleva'
+Provides-Extra: cohere
+Requires-Dist: cohere ~=5.3 ; extra == 'cohere'
 Provides-Extra: decodingtrust
 Requires-Dist: fairlearn ~=0.9.0 ; extra == 'decodingtrust'
 Provides-Extra: dev
@@ -79,7 +81,7 @@ Requires-Dist: black ==24.3.0 ; extra == 'dev'
 Requires-Dist: mypy ==1.5.1 ; extra == 'dev'
 Requires-Dist: flake8 ==5.0.4 ; extra == 'dev'
 Provides-Extra: google
-Requires-Dist: google-cloud-aiplatform ~=1.44 ; extra == 'google'
+Requires-Dist: google-cloud-aiplatform ~=1.48 ; extra == 'google'
 Provides-Extra: heim
 Requires-Dist: gdown ~=4.4.0 ; extra == 'heim'
 Requires-Dist: diffusers ~=0.24.0 ; extra == 'heim'
@@ -133,24 +135,31 @@ Requires-Dist: crfm-helm[aleph-alpha] ; extra == 'models'
 Requires-Dist: crfm-helm[allenai] ; extra == 'models'
 Requires-Dist: crfm-helm[amazon] ; extra == 'models'
 Requires-Dist: crfm-helm[anthropic] ; extra == 'models'
+Requires-Dist: crfm-helm[cohere] ; extra == 'models'
 Requires-Dist: crfm-helm[google] ; extra == 'models'
 Requires-Dist: crfm-helm[mistral] ; extra == 'models'
 Requires-Dist: crfm-helm[openai] ; extra == 'models'
+Requires-Dist: crfm-helm[reka] ; extra == 'models'
 Requires-Dist: crfm-helm[together] ; extra == 'models'
 Requires-Dist: crfm-helm[tsinghua] ; extra == 'models'
 Requires-Dist: crfm-helm[yandex] ; extra == 'models'
+Requires-Dist: crfm-helm[openvino] ; extra == 'models'
 Provides-Extra: mongo
 Requires-Dist: pymongo ~=4.2 ; extra == 'mongo'
 Provides-Extra: openai
 Requires-Dist: openai ~=1.0 ; extra == 'openai'
-Requires-Dist: tiktoken ~=0.3.3 ; extra == 'openai'
+Requires-Dist: tiktoken ~=0.7 ; extra == 'openai'
 Requires-Dist: pydantic ~=2.0 ; extra == 'openai'
+Provides-Extra: openvino
+Requires-Dist: optimum[openvino] ~=1.19 ; extra == 'openvino'
 Provides-Extra: plots
 Requires-Dist: colorcet ~=3.0.1 ; extra == 'plots'
 Requires-Dist: matplotlib ~=3.6.0 ; extra == 'plots'
 Requires-Dist: seaborn ~=0.11.0 ; extra == 'plots'
 Provides-Extra: proxy-server
 Requires-Dist: gunicorn ~=20.1.0 ; extra == 'proxy-server'
+Provides-Extra: reka
+Requires-Dist: reka-api ~=2.0.0 ; extra == 'reka'
 Provides-Extra: scenarios
 Requires-Dist: gdown ~=4.4.0 ; extra == 'scenarios'
 Requires-Dist: sympy ~=1.11.1 ; extra == 'scenarios'
@@ -174,6 +183,7 @@ Requires-Dist: torch ~=2.1.2 ; extra == 'vlm'
 Requires-Dist: transformers-stream-generator ~=0.0.4 ; extra == 'vlm'
 Requires-Dist: scipy ~=1.10 ; extra == 'vlm'
 Requires-Dist: torchvision <3.0.0,>=0.14.1 ; extra == 'vlm'
+Requires-Dist: crfm-helm[reka] ; extra == 'vlm'
 Requires-Dist: crfm-helm[images] ; extra == 'vlm'
 Requires-Dist: crfm-helm[image2structure] ; extra == 'vlm'
 Requires-Dist: pycocoevalcap ~=1.2 ; extra == 'vlm'

{crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/RECORD RENAMED Viewed

@@ -5,14 +5,14 @@ helm/benchmark/annotation_executor.py,sha256=ZJCc5xT8E0E6gux8dq3HPS4YzQs2YPCNl4g
 helm/benchmark/config_registry.py,sha256=Cd25a8FHriUzAgvGGU5sBAPyhisdSIjdUJR4YbYs6T4,1603
 helm/benchmark/data_preprocessor.py,sha256=aNdM-o2t4qkLIQHiQeWUFg03DjjJ8HTBIphYCK8pXVo,2173
 helm/benchmark/executor.py,sha256=simd7SdJ7TciUpoq3D0uz_XUSCZj5KIWCIP57FYm4js,4906
-helm/benchmark/huggingface_registration.py,sha256=RzfOaLAnzAcoTphan1JNo836lNyxMSH67oQlolhNLS0,4154
+helm/benchmark/huggingface_registration.py,sha256=unEBO21V8K3-Ya0xLqjO9H1oq7RmU-f1MYV0tCIbXzY,4578
 helm/benchmark/model_deployment_registry.py,sha256=BjL0ghHgO7_Z5jZZ7kuSOj9saegI3BivaL-b699C0rc,9527
 helm/benchmark/model_metadata_registry.py,sha256=fXRJOLUIrLOHUG5duncEqhnpmfb9hyloUlGbOM2L9ds,8194
 helm/benchmark/multi_gpu_runner.py,sha256=WmTKpVfcKXyiiPzrmxpbvQoZy0Ua8IyPgxB8r_3jrRw,4773
-helm/benchmark/run.py,sha256=tF_aWy5GtfwBOT1ZRKWrcI74VpFWGzlR00EKiGG7zyI,12572
-helm/benchmark/run_expander.py,sha256=jolEPDrB4lL_VJNRpT1SQta6DZ_xyq2HaIfWHdeyNtA,47785
+helm/benchmark/run.py,sha256=WNj10uNCqxwS2pCmt_s5Bn_JIC-NItEjK1PyQl9SXmo,13193
+helm/benchmark/run_expander.py,sha256=sWfcL0caHTsp1NqqsGrG-fZaIbScY8LECJqQMVIPZtE,51191
 helm/benchmark/run_spec.py,sha256=GiIU8iGO2FGYFDWIxt51CeNPsW7rM7BzDqH1KgEL1cg,3217
-helm/benchmark/run_spec_factory.py,sha256=nRP9737niPReD5G7t9fgyQ8_EUQ1hvg2VBQe5rSZ08Y,6816
+helm/benchmark/run_spec_factory.py,sha256=hp29n_Stb7RMwRm2jrP_qpyzxi8X8ojdqXTFN3KRSiY,6978
 helm/benchmark/runner.py,sha256=zlHDJ2Ys5-HxtXcwpkXcrdfXy_i886fBcq1iNeLyC3Q,14669
 helm/benchmark/runner_config_registry.py,sha256=2gW5wBLkHdYb2WNbZulto06hTcto2ROvjy8HULw3jNM,515
 helm/benchmark/server.py,sha256=ysd5MT1TDu65NH-OzIGf9wmZlr8FHNRwoy2ybjSc5Yk,6140
@@ -22,7 +22,7 @@ helm/benchmark/test_data_preprocessor.py,sha256=_esdtkqyU_8Yp5ZOO7n1b-Y4Qc28wpD5
 helm/benchmark/test_run_expander.py,sha256=gLeHkNt_nLgbwEJiYxhwda-eKA3sJAxkYolCvgRN5TY,1163
 helm/benchmark/tokenizer_config_registry.py,sha256=ZOImg38ta0FXZYAWna6q7A5xrG2mU7Ofr-8j4EqGlUY,1585
 helm/benchmark/adaptation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-helm/benchmark/adaptation/adapter_spec.py,sha256=tZ40ovgNkRsxDOHan4lcD8ukutA1QPsoZUF5XOHq-VA,4382
+helm/benchmark/adaptation/adapter_spec.py,sha256=K5BwqTe2iimjswdw_SONlJo0xt-T-o5KH7VqxrPaov0,5072
 helm/benchmark/adaptation/common_adapter_specs.py,sha256=-ILsVxWjpEE6an1ncrRRrLkdP5ky_-2GN1TxSxJo38M,10449
 helm/benchmark/adaptation/prompt.py,sha256=n0Ka3RGSWMr3CBnJrPNPy626x9TJE3k677wKbG8hO9A,2133
 helm/benchmark/adaptation/request_state.py,sha256=WAPyubn35on-Ry7xKpXsVz3wYBMCMc_LidDOdcKxatI,3053
@@ -49,8 +49,11 @@ helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_a
 helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py,sha256=VjSqWiZEcW6K2jrokGUmky7syEOqJ6cbHImR7YZgwzU,10151
 helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py,sha256=KKOOlna6SHLJHSPgfgguPQysc2Nf4kKrqumqwlG27bs,3542
 helm/benchmark/annotation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+helm/benchmark/annotation/air_bench_annotator.py,sha256=9W3zLO2f4OzxGdavkDI2dDUStxpExa7sgrI-ATGG7NY,3048
 helm/benchmark/annotation/annotator.py,sha256=2UIXY71S5dRaZBLb1v4lcv8-O6pyJ9zTeSJl78AEWGI,1538
-helm/benchmark/annotation/annotator_factory.py,sha256=z5AGBylIuy-_IfgikX66VyGvRz4SxtnOcJsyESH8990,2699
+helm/benchmark/annotation/annotator_factory.py,sha256=3Soh0V3lbsIR_HGHLg-XTc3eKVRj7SL9lLT_AoqUVTs,2997
+helm/benchmark/annotation/live_qa_annotator.py,sha256=IlUV4K-ddbL1XsvIgBAfsLH0_bdKx8kyDev1G3Kwyek,4364
+helm/benchmark/annotation/medication_qa_annotator.py,sha256=7LRmx2a1JODP5puAM0IH0HFTextfeLOzK7ef4sw9XIU,4129
 helm/benchmark/annotation/test_annotator_factory.py,sha256=ifv5hxSbFe113AHeXLqTPkVJ-C2PW_gb9L3a0SHNi-M,986
 helm/benchmark/annotation/test_dummy_annotator.py,sha256=LfY1ErJDUJ7rD8JUy92RUDD1b91jUs4Nk8Gvope-Z98,1644
 helm/benchmark/annotation/image2structure/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -78,7 +81,7 @@ helm/benchmark/augmentations/space_perturbation.py,sha256=g4rbyoureBaOVf_lrRXIWY
 helm/benchmark/augmentations/suffix_perturbation.py,sha256=P3AfJj_ajTVdjO7AJRQ9dKS-cT1PyRSt8Un57iZQDVc,785
 helm/benchmark/augmentations/synonym_perturbation.py,sha256=komOV5M342_8unopnwN6gkPWpJIZXidywiu6PO9_riU,4151
 helm/benchmark/augmentations/test_perturbation.py,sha256=4EooKVcyub70I81trzpNx3Ij-m1vpFa5cFIo6O52icE,13185
-helm/benchmark/augmentations/translate_perturbation.py,sha256=dn8wO5UOgYbGtP9e77SmwaK2ginrQsTw-79nrzRzfeo,1054
+helm/benchmark/augmentations/translate_perturbation.py,sha256=vMXCYXGVSo8E78IAzH9HI4p2pvyLzcvO77BnvR2QB0k,1097
 helm/benchmark/augmentations/typos_perturbation.py,sha256=_F9zwvrLie8hX7mzUtQmYq6oq6yqaFiKGsvc9LAuBr4,2798
 helm/benchmark/data_overlap/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 helm/benchmark/data_overlap/data_overlap_spec.py,sha256=aj_l1l0qxUbUMrSWr70-Sb1j_JN-7WYop5BXPG_xj44,1998
@@ -88,6 +91,7 @@ helm/benchmark/efficiency_data/inference_denoised_runtimes.json,sha256=ios_dt-_8
 helm/benchmark/efficiency_data/inference_idealized_runtimes.json,sha256=5w7reeZc0yc4cjH8kJGxQQSoe8yaRVX2SSlSrx0QWFQ,12348
 helm/benchmark/efficiency_data/training_efficiency.json,sha256=aH2moiBLStOLVi8Ci2KTK5ZkWlTBLK-B3fRfNZwhoSg,9763
 helm/benchmark/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+helm/benchmark/metrics/air_bench_metrics.py,sha256=VMNQDDEtz2CiK4U55lCHLz0b_DxHprTAZ1WtYtGXjcY,2282
 helm/benchmark/metrics/basic_metrics.py,sha256=7hk5PZL7d09uG1y7wHBhY_ox8hlXw-n7Yt_FDv_AIKw,20375
 helm/benchmark/metrics/bbq_metrics.py,sha256=Dqccr7GdfKNs1S_1QSB75d8AY7moovEPAqvacGfrCAE,6157
 helm/benchmark/metrics/bias_metrics.py,sha256=GQ4CwOk1Sa9g-LcJCxcoQLD1vWY2Hvujck9l-9qsmf4,11418
@@ -109,16 +113,23 @@ helm/benchmark/metrics/dry_run_metrics.py,sha256=d8RgltW4nGTH1tZeGOIlQRwRaJLIxL6
 helm/benchmark/metrics/efficiency_metrics.py,sha256=v8Eg56HHIWEMQruODKBvwdUfR6ZLGgrNifo-senCaUo,11786
 helm/benchmark/metrics/evaluate_instances_metric.py,sha256=EBUf0ONnNoi7pcxYab7RD0B_JqGksqDX8TOaosSmJk8,2847
 helm/benchmark/metrics/evaluate_reference_metrics.py,sha256=vUJavaLVfbWtrwyrIA81npK_1iirhko7_zMF1kL7Gfw,15559
-helm/benchmark/metrics/instruction_following_critique_metrics.py,sha256=Pj1itUJi_KDy0D-FOPcOyHqm4ypHMfhbAVeDJzGlyeo,9773
+helm/benchmark/metrics/fin_qa_metrics.py,sha256=MtXxGMGYiCiwCD1CclBXPopzly-Tz3zJTrXJaHYTXn4,2470
+helm/benchmark/metrics/fin_qa_metrics_helper.py,sha256=sH5FIpsxxGUkXO21YGS2EtVsev1EdQ44lYoqFZPSSGo,11884
+helm/benchmark/metrics/gpt4v_originality_critique_metrics.py,sha256=1m7IWy9vu66svnmdBRjZQI-2YsGYzH2vXZMptlRGM0Y,5654
+helm/benchmark/metrics/instruction_following_critique_metrics.py,sha256=QJxGzyERQv_vMn3PM9fy3IxfBgSg0BjcOf_mv574lGA,9786
 helm/benchmark/metrics/language_modeling_metrics.py,sha256=ofqwj1PMJQu16QhLDULXBmZ5iFz91ducwLRpNsRYELE,4510
+helm/benchmark/metrics/live_qa_metrics.py,sha256=f2XFmQaohjQNqYqNg8NcDVavCzyP4cd8Cl8rLArn9EM,816
 helm/benchmark/metrics/machine_translation_metrics.py,sha256=bp_EDXyxntIty5gORDa7va-C73quOzoTc5o8MpxFmL4,3816
+helm/benchmark/metrics/medication_qa_metrics.py,sha256=Z939iAc0A5xn_GdnCtfiefhUZK9qk6jZjtde2-F7IH8,840
 helm/benchmark/metrics/metric.py,sha256=dPq7ZMB0w-LgJKMzWYDJtfn-oYD4oG4jJX0yiUEziJM,14245
 helm/benchmark/metrics/metric_name.py,sha256=POhgmUqqIWh_LjCbYpiKkzGqqChBLeW3FADy9u_FcWw,1354
 helm/benchmark/metrics/metric_service.py,sha256=mlX_MEFSYNzME6GFS3El_VVOvzPYnOMosKI0XIxygP4,1802
 helm/benchmark/metrics/numeracy_metrics.py,sha256=panMWD3a1NPerg3Ix7l6NhR7jGOIQOQV9i_KysBeDA8,2818
 helm/benchmark/metrics/paraphrase_generation_metrics.py,sha256=-VkAknRhAEBmC_lpz_1aeXU8OppL8KfEPtIYCJkHTmw,1981
+helm/benchmark/metrics/prometheus_vision_critique_metrics.py,sha256=pexBbEFF3-bzWoPWNFuVs-3fm7XJw2EC4xgiSb3gSa4,8508
 helm/benchmark/metrics/ranking_metrics.py,sha256=5hDRapsxx_cmo-ag_80kOQnrgZn3lfVsLZVtWxuxH-s,17391
 helm/benchmark/metrics/reference_metric.py,sha256=RlIM_PFTEkBo0_EEMq8d4_BSagNSBR_XyovMtjDeqqU,6026
+helm/benchmark/metrics/reka_vibe_critique_metrics.py,sha256=CwzzQ13bBT0r_o75TqFj2Zr0ST9vzQi74K_ezWTnLCU,6568
 helm/benchmark/metrics/statistic.py,sha256=FuxNxMtAfiCkOxBS9KHlhEyxe61e0YXt2emvsufgPZQ,3424
 helm/benchmark/metrics/summarization_critique_metrics.py,sha256=Lf7PDuce62HDzyofsyxaOvH0QvzcaS-vJvDWtIs8xKk,4694
 helm/benchmark/metrics/summarization_metrics.py,sha256=laLMGRDy1wjcFvgSWXvzOZwBXshkmPr0S2Ofu79Z01Q,16461
@@ -131,7 +142,7 @@ helm/benchmark/metrics/test_numeracy_metrics.py,sha256=ls1ZIHDePKpHMoqAbf4HmJ1SI
 helm/benchmark/metrics/test_statistic.py,sha256=AejuYLSeUwEOqpEMRKZFjnxu4HKUraeExU8TPmZEqW4,1229
 helm/benchmark/metrics/toxicity_metrics.py,sha256=6MCpHuCXbXZqWwvO57ifKYHnHWBzszN9cZjwgPQQF2Y,4027
 helm/benchmark/metrics/toxicity_utils.py,sha256=-bfittLtMkHyV5wu-hj6KVtaiNGgVIO5duUmThBlX8w,988
-helm/benchmark/metrics/unitxt_metrics.py,sha256=5rw_fBQGWpFLr1nR4HcRlAwYvDZfJ6_MzGozzNo5NOA,3605
+helm/benchmark/metrics/unitxt_metrics.py,sha256=2F9T4iQV0_BbDMCWrZrd9sc30XHYv8MR4xSBd_dD3eI,4053
 helm/benchmark/metrics/image_generation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 helm/benchmark/metrics/image_generation/aesthetics_metrics.py,sha256=AXQjWBd9zBZOoCF8vQV9FjUy33teC0IF7pdbq-XiHjM,2101
 helm/benchmark/metrics/image_generation/aesthetics_scorer.py,sha256=ISdThDKMrx-SHQe69dCcr8qUrMCa_GsxX3BeZnd0WPA,2538
@@ -180,32 +191,37 @@ helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py,sha256=l9UQZ0aAI
 helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py,sha256=_wJ3E3LbJB9XPLixTH82BYQbp32o3oij6Sz3lsZL30E,2648
 helm/benchmark/metrics/tokens/token_cost_estimator.py,sha256=fTGUfhHV6yMwpTkCEMTGMxKO8jskqJz4sAtwXT6M_C8,425
 helm/benchmark/metrics/vision_language/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-helm/benchmark/metrics/vision_language/emd_utils.py,sha256=3yN-DY5rxMabmtLV003lj59SRnp_T83sLAi96rycKEo,15043
-helm/benchmark/metrics/vision_language/image_metrics.py,sha256=aJ3zrVOLJJzdVKqXPcFsCXp9LSHET8VGEgtvwK-nkJc,25190
+helm/benchmark/metrics/vision_language/emd_utils.py,sha256=KdZdcqu3eo016FdAjAm_83v92-wWuR90EPsTogfTcok,15196
+helm/benchmark/metrics/vision_language/image_metrics.py,sha256=HyXeZiDszSV1Q99ScqeS_xYvyrp1dlWBYahfxt42N3E,23554
 helm/benchmark/metrics/vision_language/image_utils.py,sha256=XeYF3E6MnYyPJ5hYp4TtiTP27-y4S8LTBH5bZVcvJFg,3758
 helm/benchmark/presentation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 helm/benchmark/presentation/contamination.py,sha256=PiIdcaD3-xfExjOmyL5q4Ao2ASa-OlScJAB9u1Zxe7o,2811
 helm/benchmark/presentation/create_plots.py,sha256=2-ZOuEdRwqqF1biRmzWggMZjmODoxOQOBoz9GT7tVww,28737
 helm/benchmark/presentation/run_display.py,sha256=tC1DciLvDTQJog4BDo8StWDdX7DbBkhrG2sX_SwXSPQ,11838
 helm/benchmark/presentation/run_entry.py,sha256=J1QgLOP99N7N4bs7nzXWxyU3pOd-a1j8xwL9ag1nP_Y,1158
-helm/benchmark/presentation/schema.py,sha256=pOwHCLvAC1Nh6vh48HV83gb7T7WREkifvo4qdovFdv4,8511
+helm/benchmark/presentation/schema.py,sha256=fPw-794HbacZR5z1SmYGUqYgqXbZ8-BrcexWV4h6vgc,10809
 helm/benchmark/presentation/summarize.py,sha256=2fJ9BYOJRxe9eBylLUK3qcZZwAwRtJF_C8plEQlAPEU,67266
 helm/benchmark/presentation/table.py,sha256=-foH1BIfMiD6YvpwoGJ910CH7Hib-_pYtHH1hE8zwNc,2904
 helm/benchmark/presentation/test_contamination.py,sha256=RlihBOF6vx2tKEj6_EMnJojTYoStx0FUeJSLT1bdf8w,509
 helm/benchmark/presentation/test_create_plots.py,sha256=5PPPegMTdBZurxyyUxI4rN13AVsjV3eQrwFqlobJ8UA,1286
 helm/benchmark/presentation/test_run_entry.py,sha256=OM-027j2A0Lx-ai2zBprOxSqzZhS_dh0OKw3ThocZW0,751
+helm/benchmark/presentation/test_schema.py,sha256=6mq6CeAOLW2Kxi1lX_ZW8QCVqVR73XImR8ylcRGFkBE,378
 helm/benchmark/presentation/test_summarize.py,sha256=UfSp33Q9xvuGnPYfFmLJdH5y7KWp9qbZprRMyx8LGP0,1618
 helm/benchmark/run_specs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-helm/benchmark/run_specs/classic_run_specs.py,sha256=YKrjuuPXoVjUT6XGEtyouPHkkE0XfR6u2xHQDRqpNyA,57972
+helm/benchmark/run_specs/air_bench_run_specs.py,sha256=VdXis1HN8_KLrMHDCVi0J7WdqjRjAGbZMhrsnpzC-Kg,1604
+helm/benchmark/run_specs/classic_run_specs.py,sha256=Cn0z-6QY-ehbLaHJMvCwjw11DFBQgUyqVCaXwTVFyJ8,58331
 helm/benchmark/run_specs/cleva_run_specs.py,sha256=lEIHEqQY3Efx-sl2Z6Rq9Qq_1HEWHqFYuUkZbGvq66s,13387
-helm/benchmark/run_specs/decodingtrust_run_specs.py,sha256=D5g_--eFOI6-hy6fv9JNj_X4DHU03prKA5GZjlqaoRk,14254
+helm/benchmark/run_specs/decodingtrust_run_specs.py,sha256=fDyIxmOdgLLWVtwBfxcnd3nFnBZNFpJHbcM4Kyq5gZA,14315
+helm/benchmark/run_specs/experimental_run_specs.py,sha256=7aF-Ox8iBC2obfJkyKwobJaCjk1SqxtSDuRv_RxA3Eo,1310
+helm/benchmark/run_specs/finance_run_specs.py,sha256=7DCmeBQpETQjK0fvUKS1nDIbM_wxTXb2GhXcjzIDyIE,1181
 helm/benchmark/run_specs/heim_run_specs.py,sha256=Pt1eVbzvwZ5EXq8WB2b3XYw62SWYN_i1P_H3oE4i8KY,22096
 helm/benchmark/run_specs/instruction_following_run_specs.py,sha256=GElJhgbQhlZMYSAM4YyGcYq0pqycR32kBCoHqG6m-ZY,4177
 helm/benchmark/run_specs/lite_run_specs.py,sha256=ViCPJ86Aah8301GTEk6z4_MtP0g8iik33t4GudobhWQ,11113
 helm/benchmark/run_specs/simple_run_specs.py,sha256=0kK_e8U4JUWZ6wO4N-GPFRE1iGT4ilvSMUGfirvpIE0,3837
 helm/benchmark/run_specs/unitxt_run_specs.py,sha256=ejp_knrcIjf0J4WiKj9LTgDTcUr29-XFZYHYz0w_dkM,1518
-helm/benchmark/run_specs/vlm_run_specs.py,sha256=CmdyEF-pdFIlMhBV7UraQ0FuQgQl2rqVSdTz22uYuPQ,26808
+helm/benchmark/run_specs/vlm_run_specs.py,sha256=uwnk9DHZKQj8nnC14kGiSN8xKiZfpigoz5S86TiHc4k,31118
 helm/benchmark/scenarios/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+helm/benchmark/scenarios/air_bench_scenario.py,sha256=WUZvsUTqlsjNzQsd2baZZIgO30B4Zf3g0QjsyEaGmLc,1772
 helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py,sha256=Wyt7J5BAvAqC5JTqCW4fh7ex9-itX11P_9rLTocqvtk,4973
 helm/benchmark/scenarios/babi_qa_scenario.py,sha256=S1tPQY2x1I3hQL1JQ6wvUwvKyiSe7SqpRSW6N3_T0mo,5043
 helm/benchmark/scenarios/bbq_scenario.py,sha256=lT1XKSM-PXYtENI-ryScC4yb1TtII7YoH8kt_S1dZQo,9579
@@ -213,6 +229,7 @@ helm/benchmark/scenarios/big_bench_scenario.py,sha256=bSk8Ia4u_6OqMjiyadpYQAWN-8
 helm/benchmark/scenarios/blimp_scenario.py,sha256=o1MDcHT14KFDET4K9otx8pDiIgXrhsD19pvO0mR2ADU,6260
 helm/benchmark/scenarios/bold_scenario.py,sha256=NEfECMVzlVP_yo6sOuIzj6vZ5jd72_nvtEQ1lWrq85Q,4106
 helm/benchmark/scenarios/boolq_scenario.py,sha256=rvSp5SwXMCVzBo5BFxfhj1Xv06_ksqKrtTQR7nPiS-o,8013
+helm/benchmark/scenarios/ci_mcqa_scenario.py,sha256=slZZT74QI3OMQAgT-ybcR_xVcRDoopXw6mMu4iy3XCY,3074
 helm/benchmark/scenarios/civil_comments_scenario.py,sha256=VO5G-cQ9qctmBN0O76uSewnO_mFslMo5mbR2ZTrjuds,4851
 helm/benchmark/scenarios/cleva_scenario.py,sha256=xhwZ616iz0CN3fYIfrXHcV1XlcRQjyPSzML8fq8D3l4,57939
 helm/benchmark/scenarios/code_scenario.py,sha256=s4AGW8eBY0gFnu6EXvVWL0xbFYO28N9sgP1V8eBO7EI,12171
@@ -233,9 +250,10 @@ helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py,sha256=AI8HX
 helm/benchmark/scenarios/dialogue_scenarios.py,sha256=-I7FY6q1b11zpFd1_oAgar5qlfaFcXsNCKGVln9etPI,5629
 helm/benchmark/scenarios/disinformation_scenario.py,sha256=kQi0MVVoSDhx2vOTnUaCIttPXMf8zz7Eld2FD_77tnA,8504
 helm/benchmark/scenarios/dyck_language_scenario.py,sha256=vMxND9wPJenrGlCLhSw5UxOw3TV2Jq8cTmIXGpzEWaA,9318
-helm/benchmark/scenarios/entity_data_imputation_scenario.py,sha256=n2mnkmSeTznEy7S-GVumqpD9bt27yctbuEmtgQrG-Y0,6399
+helm/benchmark/scenarios/entity_data_imputation_scenario.py,sha256=4cv7u2lmUFcigkAX_eMwIn49Pa3p-aHClkT-r-0roLU,6616
 helm/benchmark/scenarios/entity_matching_scenario.py,sha256=YjBX61TlL3CDQ3X6D-JyR-qlOYGLdoRXJxl9AEeqxYs,7022
 helm/benchmark/scenarios/entity_matching_scenario_fixed_random_state.py,sha256=TklbX7Kx4y-estV-YHUbI5O08q2qCZRrOmX9D3gZS9c,2193
+helm/benchmark/scenarios/fin_qa_scenario.py,sha256=pXUeJ34KiRSlEjYERgXqVSbr7zxvdXnOuMSpXvnUw5I,5782
 helm/benchmark/scenarios/grammar.py,sha256=Pb9vEP_0Ki87UdQCj1ym7QWJ24M4DRP6TXB5d3GnhLs,5597
 helm/benchmark/scenarios/grammar_scenario.py,sha256=bl-Cm9caDs077zSu38mzaS9maZ2gM-QazgjOEMFvxYg,1454
 helm/benchmark/scenarios/gsm_scenario.py,sha256=9fV2SEw3ocKNAD-TrDZZTpq4l7mbttQQWbO0YNz4e6k,2613
@@ -279,6 +297,7 @@ helm/benchmark/scenarios/summarization_scenario.py,sha256=MlNMgsY369DC04nhMUdG2o
 helm/benchmark/scenarios/synthetic_efficiency_scenario.py,sha256=pzifpsJJbucmTjujNqQnwQa4Y7wpQjkS6QjNXOrgTAQ,3096
 helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py,sha256=1b3e3WpFMNBV3li17-0Ug6QCSKO4qRFaWDF23bYNsvQ,16326
 helm/benchmark/scenarios/synthetic_reasoning_scenario.py,sha256=k8IGK6VABOr6wuha4HynP47peoAkmIViAVhScOtCANo,8345
+helm/benchmark/scenarios/test_air_bench_scenario.py,sha256=9o92CK57xxgPaA9Xt9uJPPie4Cxllzq-KbMt3G35UQ0,1320
 helm/benchmark/scenarios/test_grammar.py,sha256=sPlA36sHpThbXgnGlXyOuqHfDPe2epIafmzIeL0nkoU,1364
 helm/benchmark/scenarios/test_math_scenario.py,sha256=s3-CllgCB8DL9-L4DmJ6Zcf9xi803nWYN84KlhN7PhM,1016
 helm/benchmark/scenarios/test_scenario.py,sha256=HexTZBKphMDJbhIYj-HRCDwltPTDqHFHdT7FjPmu8Xs,2070
@@ -314,7 +333,7 @@ helm/benchmark/scenarios/image_generation/time_most_significant_historical_figur
 helm/benchmark/scenarios/image_generation/winoground_scenario.py,sha256=E2xPQNQzylDSmqLjjMkQB8D7A6g7bzqtSF4bXPgfVbI,2889
 helm/benchmark/scenarios/vision_language/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py,sha256=zXR0LmXsD2tv_ovJsbY_HP53kdiFOvty7Y_Ai3ZCrT4,3037
-helm/benchmark/scenarios/vision_language/bingo_scenario.py,sha256=LiH14xUoEKXn5ZStDbGE4bz9iMEn3-5I39eJ6kvN2UY,4045
+helm/benchmark/scenarios/vision_language/bingo_scenario.py,sha256=jwGEouY30Yy5U9lRUbv0XAO98gUJ669g0dhdDCGQ-8w,4097
 helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py,sha256=82qplX4gJ4GsSVhBjwrsVU46TAHh-jym3F_M5A-odRE,4608
 helm/benchmark/scenarios/vision_language/flickr30k_scenario.py,sha256=3pBAQgOsnSyMCzt60s1m8Kf_fEJ4C7XgCDbtXatTlX0,2599
 helm/benchmark/scenarios/vision_language/gqa_scenario.py,sha256=sBQfqAxmP-Z0ifCgwTbP11aPsKA4vogcWBqSDiKlbE4,3512
@@ -329,19 +348,20 @@ helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py,sha256=HU
 helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py,sha256=c7YfclYMDtygsLnEfA8oP6Vl7evdrqqTZazmuD9Oy-8,5353
 helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py,sha256=HuizbYsN5Nlihfzu4bfGuC8KSBbeIc6TVknMS4kpVJY,7149
 helm/benchmark/scenarios/vision_language/originality_scenario.py,sha256=1inr-klQEz08CM2GWqbYdy-AuXQmMhOAywAlA0lJHik,1029
-helm/benchmark/scenarios/vision_language/pairs_scenario.py,sha256=rkPR_e_RWOeSyHIlSJGJ5lVu5DD-AR3x686XYJse-1E,9885
+helm/benchmark/scenarios/vision_language/pairs_scenario.py,sha256=wVcTNUql4TBClgm7oyLq5cmybsnlurc0MblqRRxXRyc,9929
 helm/benchmark/scenarios/vision_language/pope_scenario.py,sha256=uFkzMMsjhmuSYo3v_QdfJFX6RFse83JjzMfMa3ynvV4,3975
 helm/benchmark/scenarios/vision_language/seed_bench_scenario.py,sha256=5MwGb9BOyB2Xy70BGYZcjencf0ZskxBuzcPa7ABRuww,5106
-helm/benchmark/scenarios/vision_language/unicorn_scenario.py,sha256=e3lCq2nevy9tIFDDKEbJvmLibfk4UMQtAIyzrgnnaZs,4179
+helm/benchmark/scenarios/vision_language/unicorn_scenario.py,sha256=bH5FfAgwyzpVMPOJKNCmOgpX-lvJF-B42uVi4m1mY-I,4231
+helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py,sha256=2foCM7ik9RvYahauKIoNAxkGiluOYuT0w0r7FZi-MQo,3621
 helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py,sha256=hJ3sOSpPnOCwLtpVnfasI_X89oofI-2PBRjMnx8eiVA,4139
 helm/benchmark/scenarios/vision_language/vqa_scenario.py,sha256=2hY-qngKC69ZL9SHNei3IK3C2PvJDWvwLFVQ8yNSOVs,5196
 helm/benchmark/scenarios/vision_language/image2structure/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py,sha256=ImhfiC_y_hihAGvlj9zRsaoW614QFCBopBD2KxnbSs0,1805
-helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py,sha256=-eWRwo2x7kR46Z_I4vFbVlbqA_1f2UEb75Dx84XTlNE,9028
-helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py,sha256=FKKybU4IeglwXCj6GZC8cAUs_GOU7ymEa6P1dkDT7uw,1350
-helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py,sha256=SyAYkhsipjJG42XfM9sljz1vly5YF-dbSEWTj_dEHIU,1048
+helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py,sha256=uDYN10CuXWXvgZ2BYNxlTmBsdfPNlK9G9e_VMGDKvA4,9400
+helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py,sha256=RSLYpw3BsIIxkhS-6RfVM_UhjmwJDMoA3JQl3FBjv7I,1147
+helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py,sha256=_pgW_aNaM3E7MTl_tNExupvENdtAH3DvZuSwZIiopCg,837
 helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py,sha256=ovg8-FfJ8_I1xbajFGSLvERZIA1fQjaUn0zd04ZbI84,15316
-helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py,sha256=j2bDYeWdytYtkKskvuTMwLEIIqELDJJ6D2jdYzmdlJY,9628
+helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py,sha256=dOt-gif-4Z0JekI2KAel4KS1zyvzqyqoFLP3xoe5DKY,9710
 helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py,sha256=i-i0mlG5oRRDNYNqP7o7Ul56iL02p_anJoThXaSvFiM,2826
 helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py,sha256=9WntahzuhVv54IH1m7_z0IxwLma3dbaMOne_pUx751Y,7652
@@ -355,14 +375,19 @@ helm/benchmark/static/index.html,sha256=xIJGjMg0qn9eemfdBiNbTI0jzPfBD5x0v8HJF-dM
 helm/benchmark/static/info-icon.png,sha256=P-PW3Ek3NGiRAW5BXOjJRPBfMVqprjAqtQheGWu7zNI,3428
 helm/benchmark/static/json-urls.js,sha256=AaULgfHw8OLfrQLJpBHfcC013uavQnlNNFS9vzb0qOg,1981
 helm/benchmark/static/plot-captions.js,sha256=bTR8gYx-QqF_RJyKX-L-eQP7hSEtawfJSoADCvgjKag,3011
-helm/benchmark/static/schema_classic.yaml,sha256=p-yc2WMfyGehRtD7L5ZZHbFMMQovu2HNfvct3tBlV2I,108168
-helm/benchmark/static/schema_image2structure.yaml,sha256=gig7HVyJWSwcHa96mf-09e68_fU5L02YRWzNbkPmpGg,13520
-helm/benchmark/static/schema_instruction_following.yaml,sha256=mg2g5P8TAYSCEhZbLfshPt_Hq2GKjwbvyOsQrwDqh7w,8923
-helm/benchmark/static/schema_lite.yaml,sha256=62ByEWhAJT0tIUFi-euxJ7XFhE6e9E6PT9dF6V3qoSU,40255
-helm/benchmark/static/schema_mmlu.yaml,sha256=8kiZDEGGaBXs9ucDk_Gbo2agV-OgOmWuhcYFyodRjcw,53307
-helm/benchmark/static/schema_unitxt.yaml,sha256=89GnKrooG7kKU2xh0MeoYZUB54FDUAmOPrbzuBhG1Ik,15496
-helm/benchmark/static/schema_vhelm_lite.yaml,sha256=s8tQIetR2WKu3sd8k2uZO68_5E-YtlMdsBJsTehFZKE,7331
-helm/benchmark/static/schema_vlm.yaml,sha256=o9AzLTKwSbPES5pISI0tmpUPKWWT9GR-dleDKZqoI0w,33243
+helm/benchmark/static/schema_air_bench.yaml,sha256=ePZAGL4X-yH4cAQvzS5uU44duCKwdDrMwDSvCC9y7-k,139384
+helm/benchmark/static/schema_classic.yaml,sha256=sK3yVQCrk3Tn3Kmg9WITBmJZI7AKVjmIY0f3zgH_t0c,104611
+helm/benchmark/static/schema_finance.yaml,sha256=vZG0EssYr_BVZmyV4sZmRaeLFSX2ycjni8O_L_kGzzc,5283
+helm/benchmark/static/schema_image2structure.yaml,sha256=IV57vHTaZakH6EupIlT6PRjK8aI14OSNFYUAHD9QBxo,15593
+helm/benchmark/static/schema_instruction_following.yaml,sha256=mYLpMv-iNtsmrv9ewfN9ceDOBBg8nSxOWfc6ByATmIk,6056
+helm/benchmark/static/schema_lite.yaml,sha256=rFSoG7zGPNOtKkJyGgOViWf5WJbMiJMAXrgmqCAi9X4,36611
+helm/benchmark/static/schema_medical.yaml,sha256=hDk4834FKn-5cMr6pHcu1P60sh6cXJ2J0Z1ADIj2MSc,8455
+helm/benchmark/static/schema_mmlu.yaml,sha256=KI3XnzEwBRpzfYGjP77yKL-hBklEg72D3vL0kVl1BeI,49666
+helm/benchmark/static/schema_tables.yaml,sha256=i4ylaq5yZoIEUvxPS8dniPQWKHZF5bz3hMgjNbzC_MM,7064
+helm/benchmark/static/schema_thai.yaml,sha256=25-PjBhZMHM89M01XxLQWNg0mdQnfo4H0XInF9ZzDow,7900
+helm/benchmark/static/schema_unitxt.yaml,sha256=9FQhoueYNNYQ2xMuJ2KHzpg_9-_ZhZ9efk6jtTQ3tlc,11855
+helm/benchmark/static/schema_vhelm.yaml,sha256=IZ1oAmEjnoWQ6YtMpnwZ2IQkXx86bJS1j3686mvtAGc,29476
+helm/benchmark/static/schema_vhelm_lite.yaml,sha256=4I68Em9q5wW8sFzj5GCJz8m49fBEuMyVmSZM0-wbfOk,4024
 helm/benchmark/static/utils.js,sha256=bgN0PT53Dregc-nLmEmAEmg2psufWpS8jTf74WoypHw,7681
 helm/benchmark/static/images/crfm-logo.png,sha256=dDkauL_wJR_Luu7L7pltphS3a9HSLjDkpVLa6C9vcA4,62712
 helm/benchmark/static/images/helm-logo-simple.png,sha256=LtVAC4OgcWgMAob53rTrf7cRDu-O0z85ZOGGj9wR9hw,86133
@@ -384,9 +409,10 @@ helm/benchmark/static/images/organizations/together.png,sha256=pmWjW4r7GnlKqFhKL
 helm/benchmark/static/images/organizations/tsinghua-keg.png,sha256=l9SzlZCsLF18BY876wYJcVgiQbgvwte7uoILPDcVwHk,7776
 helm/benchmark/static/images/organizations/yandex.png,sha256=OOCdcKubAP4x7h4VW7z5a-AHPWBiSDTjsIJea6ZiovA,27964
 helm/benchmark/static_build/config.js,sha256=ER8utDIqVZi9uge7Qrk1gmlT88TOOkFF9xYp3j10m8U,165
-helm/benchmark/static_build/index.html,sha256=g3pMdAovQ4VMr7dPGgyzWv2K1tN-E8LLkAs45ppLPGw,1149
+helm/benchmark/static_build/index.html,sha256=J0TrGE5-kOkopr-iSRHvvCzDL00w8Si-8OaIt9vSX0M,1149
 helm/benchmark/static_build/assets/01-694cb9b7.png,sha256=aUy5t0DYCg4r52HDOmeNi1S2CHsnv3mE7ySokJg3Ouo,8903
 helm/benchmark/static_build/assets/ai21-0eb91ec3.png,sha256=Drkew6Vlwi2_4_S8hjagK2x8smOwLKTNiXIT3rDiurs,10208
+helm/benchmark/static_build/assets/air-overview-d2e6c49f.png,sha256=0ubEn4J0T51-jx7IlwjaEGSrofZWlW_e67MJw47Ujzg,733055
 helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png,sha256=fOEANHS8RymKaCzUWn9gQWebts2ghSmtW9Fdda_TjR8,7224
 helm/benchmark/static_build/assets/anthropic-70d8bc39.png,sha256=cNi8OdIshIIb8PdodcX8mAj-khaUD0O6nhah-_6nYfs,8017
 helm/benchmark/static_build/assets/bigscience-7f0400c0.png,sha256=fwQAwN1x2Fr_ztD_HZdcOkdFcyxuDjtS3B5-VuRNkuc,19036
@@ -397,13 +423,15 @@ helm/benchmark/static_build/assets/google-06d997ad.png,sha256=BtmXrVQZHr3WH5c8c2
 helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png,sha256=Pl46pKbC_TU3L6kZQ_3G-0wTseluAhIYwb3EqpdQAjQ,1344452
 helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png,sha256=LtVAC4OgcWgMAob53rTrf7cRDu-O0z85ZOGGj9wR9hw,86133
 helm/benchmark/static_build/assets/helmhero-28e90f4d.png,sha256=KOkPTf-q28PdvGOBp1G5O4q1eWUJjuij3z2h_SUUf8s,55314
-helm/benchmark/static_build/assets/index-737eef9e.js,sha256=PvNcOghX7gGSYAGk2bR3pvIBnwDbeWHu0JyfPNaan3o,70614
-helm/benchmark/static_build/assets/index-878a1094.css,sha256=h4oQlJUZdqMk6nS_TEkyXMZ6rtGmepw4ljoSAHZX1vY,486381
+helm/benchmark/static_build/assets/index-30dbceba.js,sha256=WXT0A-yH9f-3wCwQ3rwKWTCIOOpjETQwOQyZt2OMAwc,77064
+helm/benchmark/static_build/assets/index-66b02d40.css,sha256=ZrAtQOMv7vRJwOA9urNRk_rs8hJljom_xhn-wI89g08,486795
 helm/benchmark/static_build/assets/meta-5580e9f1.png,sha256=VYDp8arkAe2eYRJhAOcIAsZY1qY0hqyOEQDgVMbX9M8,4646
 helm/benchmark/static_build/assets/microsoft-f5ee5016.png,sha256=9e5QFl23yTbnAk8u7lZKaQOf4oPHbr_aiQda5n4MZqE,50850
 helm/benchmark/static_build/assets/mistral-18e1be23.png,sha256=GOG-Ix7XlctGOUmvJfO2oVSBM7E5O562G88OnoxsjBw,14402
 helm/benchmark/static_build/assets/nvidia-86fa75c1.png,sha256=hvp1wZMwYxkfrVMvJs73PX71JwY5L8ZvxIH_fL4n6Po,27945
 helm/benchmark/static_build/assets/openai-3f8653e4.png,sha256=P4ZT5ISIlt6Dl0mOp7juSM4Y7dfyRNPqdc0PJuwNoqg,16877
+helm/benchmark/static_build/assets/overview-74aea3d8.png,sha256=dK6j2Nn3j9O-FMUIVRT5HGBpR_GL78vrKi8oHdG1eaI,74685
+helm/benchmark/static_build/assets/process-flow-bd2eba96.png,sha256=vS66lq700aPEKTJR7maMrmepAyBZySaL42tBNCRjFWA,190822
 helm/benchmark/static_build/assets/react-d4a0b69b.js,sha256=rNTpl8Is3LkYXqJowRMc8vc4SXQwP94Ozy4DZZWwldU,275141
 helm/benchmark/static_build/assets/recharts-6d337683.js,sha256=rDrVmtTCCSLY2hpcxSDxhlQ6CQmTTSQOESNeO3oVQgg,432466
 helm/benchmark/static_build/assets/tii-24de195c.png,sha256=JN4ZXAa0rbR2IlxPfd_mKtntFZcYpDcXocSiqrC2rNg,63389
@@ -454,38 +482,40 @@ helm/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 helm/clients/ai21_client.py,sha256=LIdkmzcUDR9uIF2tIk5YgDNGNmfQ9JDYmgscvFoCHDs,5509
 helm/clients/ai21_utils.py,sha256=mlg3h615kyckccGZv9rqsP4Y60O3XpwyE-UURRMrxII,471
 helm/clients/aleph_alpha_client.py,sha256=koPqXF6uRD905atoiCaPg5yxr6B25J0g2OTWk8geebQ,4969
-helm/clients/anthropic_client.py,sha256=0hAmv3f6FQURScmDpcGbwGjnvskNRP2vhRH02OSe70I,33224
-helm/clients/auto_client.py,sha256=Qs0XFq9pyH4M9HTOLoI3_5m8kW305x3pzVukgETdrZM,10732
+helm/clients/anthropic_client.py,sha256=wptP4u4NhQknoy7VQsWqVzn9tv3IrCuJ3vUMq6fiq0E,34909
+helm/clients/auto_client.py,sha256=uK9EWQFWBt4DoV1oytm0dIeA3YpcfGi_H0rCRZSVE8c,11438
 helm/clients/bedrock_client.py,sha256=BsH9UopsP6ZHf-K0Yzg1PYSMLDwY0yIUmPHDhJVMUi0,5293
 helm/clients/bedrock_utils.py,sha256=okZ6Z8pviGOUNlrdF2QquAqFs8-QYgcqci95eij8giM,2574
-helm/clients/client.py,sha256=xoxPwV-aar7suM-3eAMsB9FmrempyqZ5FFXcLIQJz9w,8628
+helm/clients/client.py,sha256=InjCQi62TWhWHmfyi-mC3fSAVztd-YDyfB3BkpacHXk,9002
 helm/clients/clip_score_client.py,sha256=ct3GHZ2Zh3fGwyvQ9DyoIPT6PwDPI-nUaFkUFuc8PIE,1622
-helm/clients/cohere_client.py,sha256=0UUsFnHwZjEkKjXKPzM6EpZ_iuAduZTg3sCrPM1zGt0,7359
+helm/clients/cohere_client.py,sha256=PtVrDdm_-dXBiWzu_dfwiJPt5GLGw3wdN-Qw3u8ugtU,10976
 helm/clients/cohere_utils.py,sha256=aYmj60m0e9RF9BIdxp1vmA-uZv17TEALw0dbgTUSpCc,504
 helm/clients/gcs_client.py,sha256=1sK5x5uWtThgz9gqBLaA8oyiXGD_9nn1WyfMzJRyPQ8,3231
 helm/clients/google_client.py,sha256=EOpPzK5_9yzWkMjK-4ILiixDF3aeOa8AbR2SPnEO-nw,2900
 helm/clients/google_translate_client.py,sha256=TgiQEscjOae58Ptgp9f4n0LXUtl1Jf6v9BI-Z1_wcuw,1304
 helm/clients/http_model_client.py,sha256=DBgkVDZPmg99DCcO_1Xdf6nFQo2kyxLkgoQpwC-wkHI,2806
-helm/clients/huggingface_client.py,sha256=vzUmNJKsgIXLD8ho4kUGyFCRFGXC61C74X7No0yY7N4,13235
+helm/clients/huggingface_client.py,sha256=xmdqOWoioqoYQjtBqJFN-K9Fm3oHEQrOEjyzDz4ZWBY,15847
 helm/clients/lit_gpt_client.py,sha256=Sjec16bNODosEhDoBkRc4t-LNS-nCUY_jVivWj5zvfU,6205
 helm/clients/lit_gpt_generate.py,sha256=8DdBE9ReQ00NbV3KMFYc--PlO9X-HMOR0Rhm5CADWEA,3103
 helm/clients/megatron_client.py,sha256=KFL1BBBDqxr5mtd5iu0dA6uK8_v6d4g_D6RsZrHx3a0,4107
 helm/clients/mistral_client.py,sha256=thOLMcEfrzWR00JUabIZ_PnW2o9YZsdSmNf9z3jbYKo,5982
 helm/clients/moderation_api_client.py,sha256=I5pYWRb2MmcLDYrScnC3P5N7OUFzQiVQ828_hf7zjM4,4719
 helm/clients/open_lm_client.py,sha256=qFgYqlV_3UiW8WJKz66lLqRqg2jt1qtJ1bHMRAtBn40,1749
-helm/clients/openai_client.py,sha256=gWqr4dvYfbUnBtfySSUGGVZYV-pLtqcrnYaf7nPk5-s,13936
+helm/clients/openai_client.py,sha256=tXxi9nZsxz2I4YQLrQrV-GhlgZ1Z9ifrUhC_3Aw5SPE,14238
 helm/clients/palmyra_client.py,sha256=LBYFHNc5LdpPbiSp1AAHuMm8cUUCQ2EB03BB6XnDTYQ,6551
 helm/clients/perspective_api_client.py,sha256=WQDArqlKVWwcK2SicnSIAgV6JGVHsxibTzkdezT3z_U,5920
+helm/clients/reka_client.py,sha256=K8b9p7U6LLAy4PRjgYrUS06gF4G2xjhjRoMEO4XDe0o,8329
 helm/clients/simple_client.py,sha256=55S_y1eWD1bjktcG21Vs8G5bF6QbKKwmJyqs6lCUJeI,2048
 helm/clients/test_auto_client.py,sha256=bc-rsMJ8JM0MFnQ4B48hBJ1jL3RtRyVvmPwOgzF2mF8,3155
-helm/clients/test_client.py,sha256=g29C1WLUONnNuE2oGFZhaqMahb-doS4l_Ph4OHrQvrc,3895
-helm/clients/test_huggingface_client.py,sha256=WUPrA7VT3nnMNht7w20I6411hlpIS_77XbQC2vC0WU0,2723
+helm/clients/test_client.py,sha256=V7Y56Ahqa8C2Kc2_W2QE0VfGbBEJzFmnic3LGHZkOqQ,3940
+helm/clients/test_huggingface_client.py,sha256=x2NjMuIrinfUy0wQ1S6F5cYZVr09YfvN6LfhWmyGNAM,3388
 helm/clients/test_simple_client.py,sha256=G0JRQX69ypQN2VxhlNQXs5u2Tdtkcl_aeHqudDUVKi4,702
-helm/clients/test_together_client.py,sha256=lAtGKn3WdsYe5MEfTYVYRnu_rS4DPnfFr5jRn42rvoQ,3865
-helm/clients/together_client.py,sha256=fCPJ39fX3xm_Gp6cGsc1HIf1jVMLNiE2kIkee45-Ufk,16208
+helm/clients/test_together_client.py,sha256=yYNrhU3kQjmHwhILuoP5QwUgbmkm2gg2NHiNycHjoeE,6145
+helm/clients/together_client.py,sha256=rtYdx53ZE19ziJpBc7MYTeSHJjN3Ke51I3Uldg0IAbs,20595
 helm/clients/toxicity_classifier_client.py,sha256=AI_FizxMurubTIyeceRdkixSnhWQbcD-oEEONj5ve7o,464
-helm/clients/vertexai_client.py,sha256=Mt1rb9lWeQqJLGcBSR5mflYBvJvJfsv5OeIuQz4_ng0,19726
+helm/clients/vertexai_client.py,sha256=K_vCanJU97o2P_WJOeLhUFJA8SdfJDlVNl7Mi1HuIrQ,21860
 helm/clients/vllm_client.py,sha256=p9atBtq3PBOoPkOPSifkMrYZjNLnNM_sWM6tL_3N-WY,1675
+helm/clients/yi_client.py,sha256=0t4WJ8MTLOpB1LCZ-P6UdYa-KbGB7hkDrBluSkioot0,835
 helm/clients/clip_scorers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 helm/clients/clip_scorers/base_clip_scorer.py,sha256=NfXe79g6M4Wype3Xf-oXxscaUgjbZLmy9dRnBaLiWwk,695
 helm/clients/clip_scorers/clip_scorer.py,sha256=waLI_rI6dQPjmtywvGeQKK7bGCWXyoIgIuBc8P3zSB8,1907
@@ -544,9 +574,11 @@ helm/clients/image_generation/mindalle/utils/sampling.py,sha256=soTHaJrN4FV1lDdh
 helm/clients/image_generation/mindalle/utils/utils.py,sha256=ESugpzG-_73GKl07mj-8o-_nim_FOICxfYkczy3s9x4,3119
 helm/clients/vision_language/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 helm/clients/vision_language/huggingface_vision2seq_client.py,sha256=hTywh5nM95BmPoDyKOSDWg9G3-QwLO3KZEJZVkmFroo,6478
-helm/clients/vision_language/huggingface_vlm_client.py,sha256=X5SX2iMZkFe9Pmq4Gx0O4bnP4gBPnKvamLThRshAEik,4875
+helm/clients/vision_language/huggingface_vlm_client.py,sha256=H7AE8mm506PkEcUO8VaLVtptHTwVX58nZx1A_BWdKzA,4968
 helm/clients/vision_language/idefics_client.py,sha256=hi1VCDBegHfBssmW0C62H3OX3U2ISVRhaSkd24gb1K4,7692
 helm/clients/vision_language/open_flamingo_client.py,sha256=CkN0JCeR742ZG9Nc4A85hp4BSE0WLU-3Rs-ZwdmDkzs,6632
+helm/clients/vision_language/paligemma_client.py,sha256=IU_T8r1RgpGkEAqabLKBbmoUOWV6c1a9_FXgiTy8exE,6835
+helm/clients/vision_language/palmyra_vision_client.py,sha256=mY6vj918f-tbqhOmh7PCSEgnSpHzWY8UTqAdvYgXJ8Q,3757
 helm/clients/vision_language/qwen_vlm_client.py,sha256=6rCH4gJMDyQHyjAE_GDIrLsInH_bvd6to-4RMWbRLeM,7407
 helm/clients/vision_language/open_flamingo/__init__.py,sha256=i1tGJj6ckeE6eS1EWV5tbQKYLmPCrdSI45mPchfv_Ic,88
 helm/clients/vision_language/open_flamingo/src/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -563,13 +595,13 @@ helm/common/clip_score_request.py,sha256=WnNg89owDCmG7tyy8nnQL0RdKQLsUdMWiYH9Xqq
 helm/common/codec.py,sha256=gTh6AwIQ0Bbul_QSnIO7eItwMZmYtnkIrG1jkc4GOL4,7100
 helm/common/concurrency.py,sha256=8THtHlCtXo5c8iCuz_UcBBdzZX6aiEALLc4u0M4SYL0,856
 helm/common/credentials_utils.py,sha256=O-57nUgkWLbZF0k3lsSaVGPPHj2_OYeVuCMe0to3bRE,1118
-helm/common/critique_request.py,sha256=Exu8Ans05zCU5d5-AglEbG40mBwKYED2Z3WqY_XjXBY,2772
+helm/common/critique_request.py,sha256=yo4aRe-DEjudUmydthtpTj6LdhRXfZ3JZptxTkWzZ3U,3068
 helm/common/file_upload_request.py,sha256=OZeAW1_zsiNdXnWDwNNvhPs0b48TUmW_e4kzzCYmyiY,543
 helm/common/general.py,sha256=nMfHNPXyAAorAMmgDClD8r8XXeJcvfF0QXTP-FgH5PQ,11690
 helm/common/gpu_utils.py,sha256=pmLq6ipYNLEm28VxxSNeZuVt-gAw-WnYmBvxP1P1p6M,480
 helm/common/hierarchical_logger.py,sha256=EnKLnfbQftca08EJfjGEQb4tcnCKbx-JtwLnoCnhMQs,2908
 helm/common/image_generation_parameters.py,sha256=nsbuk_-BlRMK6IwP5y6BnTXbTRTOcvZ6uLblL5VHLOo,916
-helm/common/images_utils.py,sha256=zbzS8C_oCDb9dY2xpWY6nljI8of72rqwijryMeiBKKo,2527
+helm/common/images_utils.py,sha256=bsxgW9knrfa9NTa6V-O13_nDnflqrqHpnKlTRxul-aY,3187
 helm/common/key_value_store.py,sha256=iHi1WQuWttLNJnuM48QNOAXHoneNbmbBmtXYPq-dyys,3147
 helm/common/media_object.py,sha256=3VZqfb0py5dDKwWtnLp2kdl8svaike-Cn7Mjk-b0cvM,5130
 helm/common/moderations_api_request.py,sha256=3xTsErSsCr2PHD2jpdV1JglHaYHwP2Yqu25_JFtfa68,2234
@@ -590,9 +622,9 @@ helm/common/file_caches/file_cache.py,sha256=QfF1hlF8FQ-rcPn9Zyl6L0dOCokvYgd-dFq
 helm/common/file_caches/local_file_cache.py,sha256=wBOAbbkGLiClaX4YdunokRfSQCKNkTYmMVx2KTLy4Lc,1921
 helm/common/file_caches/test_local_file_cache.py,sha256=bOCWR9MglwQXV98xk8auyjgFxaOr85zRdxWwxMBQW9s,663
 helm/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-helm/config/model_deployments.yaml,sha256=KAD0FZ45ERfEjr3y7HbPxZmEnnJBQiiOHRHN7VxqiF4,74817
-helm/config/model_metadata.yaml,sha256=XpJnlu0kiI5sGEqswF_S6_ra0Iys3VOfsDs2Jiz_Vqk,112991
-helm/config/tokenizer_configs.yaml,sha256=3IhRANDTlN39TWqDWuPy507wQlZWOBlyaS8fA6WLDD0,12070
+helm/config/model_deployments.yaml,sha256=x4j3LMGHTV3jObKK0dT5SOtKJvReWOHyyjs6jV2D2L0,89739
+helm/config/model_metadata.yaml,sha256=M7EsOSnf4tcrSlNYBT50SiC6mReXfZ1q5rt7_OpdzpU,138011
+helm/config/tokenizer_configs.yaml,sha256=lBGPsRPRPeqlN_j194hEVP8HAMC6J5NLrIZpN95Y8ug,15078
 helm/proxy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 helm/proxy/accounts.py,sha256=gd5cKhKeqklf_cXCAISl65AUvZeD6afBNrs6WK3IBvQ,14764
 helm/proxy/cli.py,sha256=l8F7UYqrIOoBD9ZCIxJFA4fhxlzhae0-2Nn8A7FMkzk,8244
@@ -608,12 +640,12 @@ helm/proxy/critique/mechanical_turk_critique_client.py,sha256=OcppmFOMweBSfVTiLI
 helm/proxy/critique/mechanical_turk_critique_exporter.py,sha256=taULrc_cIP0O9c5UpGz3l9DmWQadTVzN_v-qzTgMoyo,8470
 helm/proxy/critique/mechanical_turk_critique_importer.py,sha256=NL97joO5pRkcICRdVyG4kf9JhfYRaySsxRoZ7KWDYv0,5581
 helm/proxy/critique/mechanical_turk_utils.py,sha256=mKpUv4zz3s5ptzDY7UrwuI7Cr5HmNgSjPC10BnN9AL4,1766
-helm/proxy/critique/model_critique_client.py,sha256=nrNjnvOFdcRk9tUk2MjoBugAfMM92X0hxKGSg4xsy9E,11187
+helm/proxy/critique/model_critique_client.py,sha256=QMFiMpALXnneumKbJpXOZDEb3lPPdkIaSCasmdXHB8o,12806
 helm/proxy/critique/scale_critique_client.py,sha256=B4povtceyfal95eE3N7em9cC_B5Vy4jMrHXcsXc_5m4,15889
 helm/proxy/critique/surge_ai_critique_client.py,sha256=HnzgAoF4Du9Me0GS_lbNaozZslS4a2OZx735gh-coo0,8357
 helm/proxy/services/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 helm/proxy/services/remote_service.py,sha256=emYN0qWOJLQ7q1n06V4TwlvXaqylQcUxmqDcGZXqPJ8,9097
-helm/proxy/services/server_service.py,sha256=ehKs1gITG8ZsPpxzjbzlHqWjAJVLahiKZn5odsLhcPM,11535
+helm/proxy/services/server_service.py,sha256=U-1g0VMjCY9bBK8BecbUxVzSx7hyC_rpwSNm67bqmCg,11534
 helm/proxy/services/service.py,sha256=Be-Z5F6AN4vMzsJr3BS6tJ9NHHy_dc_yn2Ex9cm0ChU,6193
 helm/proxy/services/test_remote_service.py,sha256=NFnLjg3QNHoDKdK0DlcrtylwlKXx1vdzheNZRrLEv7c,6605
 helm/proxy/services/test_service.py,sha256=FUZoI8pGiUg5adgB1wTJ869QOgFYjPtM6yf6FGMdE64,8968
@@ -625,16 +657,17 @@ helm/tokenizers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 helm/tokenizers/ai21_tokenizer.py,sha256=oXImuAY9kMohHH6Zm7BWysfT88b00NBoSELeGQ920y4,2255
 helm/tokenizers/aleph_alpha_tokenizer.py,sha256=UlWC_SjObBvexpZ3OfKZT2yjhbSsHlKjQe_oWuRrXno,3818
 helm/tokenizers/anthropic_tokenizer.py,sha256=d-HO9OEFkhYzFZu0VkOsHjxbqqSUseCNX0KQqgb3s2Q,2114
-helm/tokenizers/auto_tokenizer.py,sha256=xKL_rLnjiaCnyH5oJUlo5gfdVSen7PmBFFD60gl9R8A,4217
+helm/tokenizers/auto_tokenizer.py,sha256=Of-T-CFOhLAjjU45T1hnrEPG_k_hzPufuDE7FRAcSN8,4251
 helm/tokenizers/caching_tokenizer.py,sha256=kSegrCFotRevSDgJsn0g52dWiSUCNa7_EZpRNrELeUE,8163
-helm/tokenizers/cohere_tokenizer.py,sha256=-WuvEKHzwqcpnhDPauw7x8wyZ5eVWTZalygx1LkkLnQ,3739
+helm/tokenizers/cohere_tokenizer.py,sha256=6rahykq1SxqS8vCWOzYo_oeUoVwhg_zOfWFIkQxP6GY,5632
 helm/tokenizers/http_model_tokenizer.py,sha256=wBTtDA2UdEYspffa1wqgkT3y3YHoyLXXoucnJ5PGjhs,3109
-helm/tokenizers/huggingface_tokenizer.py,sha256=IY9RxJ3YwVKfXtvMXR9DLO4uTaz9j_8hr1MOyA60H7Y,7791
+helm/tokenizers/huggingface_tokenizer.py,sha256=_XXx8uApENK7-o81qxEn0SOeJL_L2UpiiuteSYiODpE,8734
 helm/tokenizers/ice_tokenizer.py,sha256=4ZTIRpmt2cqwcxnmrDpCRhiJ0BI3ELE-GHoBuHWgrDA,1200
 helm/tokenizers/lit_gpt_tokenizer.py,sha256=LMrpaje64UmnDKoYjPG_RQeXVA4xQUwW5t48IJIeLaQ,1660
 helm/tokenizers/simple_tokenizer.py,sha256=6_NROqVbygs-HRA7bYAZluN4YB5gUhVaRsYQeRTjA1E,1147
 helm/tokenizers/test_anthropic_tokenizer.py,sha256=_wzXp9FVR2Ml0s2A79TTXbSPHyTRp28i9tiEyQ9S6Ko,3792
-helm/tokenizers/test_huggingface_tokenizer.py,sha256=o1oqYT2MS-7xrnffj48WuvJfKAHd4p8pee9W4WxwQb8,6172
+helm/tokenizers/test_cohere_tokenizer.py,sha256=15z2GJtZ-VlrliC2_Fk5DIZhQYFkJS7J73fjxYMf8YM,1431
+helm/tokenizers/test_huggingface_tokenizer.py,sha256=8tFyZQb4DLg6MdKg13a66bLbp0yf4Ar1fGWM_sYeSjg,6309
 helm/tokenizers/test_ice_tokenizer.py,sha256=-xi_f8TBSkAYr5CcA56HDq7rZ9HAGd99J7twNfkLzFU,2619
 helm/tokenizers/test_simple_tokenizer.py,sha256=vUNdcnJqZV99-E8H1rwUH85AQPJ2HTnDr5DrZ_-zRL4,1219
 helm/tokenizers/test_yalm_tokenizer.py,sha256=qWpKnUuAlePd6t-UJB_mAiBwtAacnC8caKXLJ_GdTkk,2477
@@ -646,9 +679,9 @@ helm/tokenizers/yalm_tokenizer_data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQ
 helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py,sha256=W9p5QNn1GSm-y85yVEQe_82zn5CVK_vR6jvhk7JTs_k,869
 helm/tokenizers/yalm_tokenizer_data/voc_100b.sp,sha256=LmPD0_OIOXi8dWuNjXUYOSPhf8kPp2xhvK-g3bXcwrQ,2815034
 helm/tokenizers/yalm_tokenizer_data/yalm_tokenizer.py,sha256=kH5Qig1_6r_sKbAHinX7C83tqBUoTwbe-gGZCbGVkko,6389
-crfm_helm-0.5.1.dist-info/LICENSE,sha256=bJiay7Nn5SHQ2n_4ZIT3AE0W1RGq4O7pxOApgBsaT64,11349
-crfm_helm-0.5.1.dist-info/METADATA,sha256=dVxnv-vEsYZb3v-ALFNpSdpbxwi5WQG5_I1oD3cMs6Y,19157
-crfm_helm-0.5.1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-crfm_helm-0.5.1.dist-info/entry_points.txt,sha256=AvH9soAH3uey9xffisWewd0yrmPWGASC036jHd1SFyg,300
-crfm_helm-0.5.1.dist-info/top_level.txt,sha256=s9yl-XmuTId6n_W_xRjCS99MHTwPXOlkKxmTr8xZUNY,5
-crfm_helm-0.5.1.dist-info/RECORD,,
+crfm_helm-0.5.2.dist-info/LICENSE,sha256=bJiay7Nn5SHQ2n_4ZIT3AE0W1RGq4O7pxOApgBsaT64,11349
+crfm_helm-0.5.2.dist-info/METADATA,sha256=g-tT_a7wm7L7iaNCQVwNIrpUnVHK8PKfbXjel0KyhmQ,19591
+crfm_helm-0.5.2.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+crfm_helm-0.5.2.dist-info/entry_points.txt,sha256=AvH9soAH3uey9xffisWewd0yrmPWGASC036jHd1SFyg,300
+crfm_helm-0.5.2.dist-info/top_level.txt,sha256=s9yl-XmuTId6n_W_xRjCS99MHTwPXOlkKxmTr8xZUNY,5
+crfm_helm-0.5.2.dist-info/RECORD,,

helm/benchmark/adaptation/adapter_spec.py CHANGED Viewed

@@ -39,90 +39,91 @@ class AdapterSpec:
     Note that an `Instance` could produce many `Request`s (e.g., one for each `Reference`).
     """
-    # Method of adaptation
     method: str = ""
+    """The high-level strategy for converting instances into a prompt for the language model."""
-    # Prepend all prompts with this string.
-    # For example, it is recommended to prefix all prompts with [NLG] for UL2.
     global_prefix: str = ""
+    """The string that is prepended to the entire prompt."""
-    # Append all prompts with this string.
     global_suffix: str = ""
+    """The string that is appended to the entire prompt."""
-    # Prompt starts with instructions
     instructions: str = ""
+    """The description of the task that is included at the very beginning of the prompt."""
-    # What goes before the input
     input_prefix: str = "Input: "
+    """The string that is included before each input (e.g., 'Question:')."""
-    # What goes after the input
     input_suffix: str = "\n"
+    """The string that is included after each input (e.g., '\\n')."""
-    # What goes before the input (for multiple choice)
     reference_prefix: str = "A. "
+    """The string that is included before each reference (for multiple-choice questions)."""
-    # What goes before the input (for multiple choice)
     reference_suffix: str = "\n"
+    """The string that is included after each reference (for multiple-choice questions)."""
-    # What goes before the output
     output_prefix: str = "Output: "
+    """The string that is included before the correct answer/predicted output (e.g., 'Answer:')."""
-    # What goes after the output
     output_suffix: str = "\n"
+    """The string that is included after the correct answer/predicted output (e.g., '\\n')."""
-    # What goes between instruction and in-context example blocks in the constructed prompt
     instance_prefix: str = "\n"
+    """The string that is included before each instance (e.g., '\\n\\n')."""
-    # List of regular expression substitutions that we perform
     substitutions: List[Substitution] = field(default_factory=list, hash=False)
+    """A list of regular expression substitutions (e.g., replacing '\\n' with ';\\n')
+    to perform at the very end on the prompt."""
-    # Maximum number of (in-context) training instances to put into the prompt
     max_train_instances: int = 5
+    """Maximum number of training instances to include in the prompt (currently by randomly sampling)."""
-    # Maximum number of evaluation instances. For getting valid numbers, this
-    # should be the entire dataset; only reduce this for piloting.
     max_eval_instances: Optional[int] = None
+    """Maximum number of instances to evaluate on (over all splits - test, valid, etc.)."""
-    # Generate this many outputs (which could be realized by `num_completions`
-    # or `top_k_per_token`).
     num_outputs: int = 5
+    """Maximum number of possible outputs to generate by sampling multiple outputs."""
-    # Number of trials, where in each trial we choose an independent, random
-    # set of training instances.  Used to compute error bars.
     num_train_trials: int = 1
+    """Number of trials, where in each trial we choose an independent, random set of training instances.
+    Used to compute variance."""
-    # Number of trials, where we query the model with the same requests, but different random seeds
     num_trials: int = 1
+    """Number of trials, where we query the model with the same requests, but different random seeds."""
-    # If true, randomly sample N training examples; if false, select N consecutive training examples
     sample_train: bool = True
+    """If true, randomly sample N training examples; if false, select N consecutive training examples."""
     # Decoding parameters (inherited by `Request`)
-    # Model deployment to make the request to (need to fill in)
     model_deployment: str = ""
+    """Name of the language model deployment (<host_organization>/<model name>) to send requests to."""
-    # Model to make the request to
     model: str = ""
+    """Name of the language model (<creator_organization>/<model name>) to send requests to."""
-    # Temperature to use
     temperature: float = 1
+    """Temperature parameter used in generation."""
-    # Maximum number of tokens to generate
     max_tokens: int = 100
+    """Maximum number of tokens to generate."""
-    # When to stop (set hash=False to make `AdapterSpec` hashable)
+    # Set hash=False to make `AdapterSpec` hashable
     stop_sequences: List[str] = field(default_factory=list, hash=False)
+    """List of stop sequences. Output generation will be stopped if any stop sequence is encountered."""
     # Random string (used concretely to bypass cache / see diverse results)
     random: Optional[str] = None
+    """Random seed (string), which guarantees reproducibility."""
-    # If true, for instances with multiple correct reference, the gold answer should be considered
-    # to be all the correct references rather than any of the correct references.
     multi_label: bool = False
+    """If true, for instances with multiple correct references, the gold answer should be considered to be all
+    of the correct references rather than any of the correct references."""
-    # Parameters for image generation
     image_generation_parameters: Optional[ImageGenerationParameters] = None
+    """Parameters for image generation."""
-    # The splits from which evaluation instances will be drawn (set hash=False to make `AdapterSpec` hashable)
+    # Set hash=False to make `AdapterSpec` hashable
     eval_splits: Optional[List[str]] = field(default=None, hash=False)
+    """The splits from which evaluation instances will be drawn."""

crfm-helm 0.5.1__py3-none-any.whl → 0.5.2__py3-none-any.whl

Potentially problematic release.

crfm-helm 0.5.1py3-none-any.whl → 0.5.2py3-none-any.whl