PyPI - crfm-helm - Versions diffs - 0.5.0__py3-none-any.whl → 0.5.1__py3-none-any.whl - Mend

crfm-helm 0.5.0 py3-none-any.whl → 0.5.1 py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of crfm-helm might be problematic. Click here for more details.

Files changed (56) hide show

{crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: crfm-helm
-Version: 0.5.0
+Version: 0.5.1
 Summary: Benchmark for language models
 Home-page: https://github.com/stanford-crfm/helm
 Author: Stanford CRFM
@@ -25,7 +25,7 @@ Requires-Dist: tqdm ~=4.64
 Requires-Dist: zstandard ~=0.18.0
 Requires-Dist: sqlitedict ~=1.7
 Requires-Dist: bottle ~=0.12.23
-Requires-Dist: datasets ~=2.15
+Requires-Dist: datasets ~=2.17
 Requires-Dist: pyarrow >=11.0.0
 Requires-Dist: pyarrow-hotfix ~=0.6
 Requires-Dist: nltk ~=3.7
@@ -34,7 +34,7 @@ Requires-Dist: rouge-score ~=0.1.2
 Requires-Dist: scipy ~=1.10
 Requires-Dist: uncertainty-calibration ~=0.1.4
 Requires-Dist: scikit-learn ~=1.1
-Requires-Dist: transformers ~=4.37
+Requires-Dist: transformers ~=4.40
 Requires-Dist: torch <3.0.0,>=1.13.1
 Requires-Dist: torchvision <3.0.0,>=0.14.1
 Requires-Dist: google-api-python-client ~=2.64
@@ -136,6 +136,7 @@ Requires-Dist: crfm-helm[anthropic] ; extra == 'models'
 Requires-Dist: crfm-helm[google] ; extra == 'models'
 Requires-Dist: crfm-helm[mistral] ; extra == 'models'
 Requires-Dist: crfm-helm[openai] ; extra == 'models'
+Requires-Dist: crfm-helm[together] ; extra == 'models'
 Requires-Dist: crfm-helm[tsinghua] ; extra == 'models'
 Requires-Dist: crfm-helm[yandex] ; extra == 'models'
 Provides-Extra: mongo
@@ -158,6 +159,8 @@ Provides-Extra: slurm
 Requires-Dist: simple-slurm ~=0.2.6 ; extra == 'slurm'
 Provides-Extra: summarization
 Requires-Dist: summ-eval ~=0.892 ; extra == 'summarization'
+Provides-Extra: together
+Requires-Dist: together ~=1.1 ; extra == 'together'
 Provides-Extra: tsinghua
 Requires-Dist: icetk ~=0.0.4 ; extra == 'tsinghua'
 Provides-Extra: unitxt
@@ -173,6 +176,7 @@ Requires-Dist: scipy ~=1.10 ; extra == 'vlm'
 Requires-Dist: torchvision <3.0.0,>=0.14.1 ; extra == 'vlm'
 Requires-Dist: crfm-helm[images] ; extra == 'vlm'
 Requires-Dist: crfm-helm[image2structure] ; extra == 'vlm'
+Requires-Dist: pycocoevalcap ~=1.2 ; extra == 'vlm'
 Provides-Extra: yandex
 Requires-Dist: sentencepiece ~=0.1.97 ; extra == 'yandex'

{crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/RECORD RENAMED Viewed

@@ -7,19 +7,18 @@ helm/benchmark/data_preprocessor.py,sha256=aNdM-o2t4qkLIQHiQeWUFg03DjjJ8HTBIphYC
 helm/benchmark/executor.py,sha256=simd7SdJ7TciUpoq3D0uz_XUSCZj5KIWCIP57FYm4js,4906
 helm/benchmark/huggingface_registration.py,sha256=RzfOaLAnzAcoTphan1JNo836lNyxMSH67oQlolhNLS0,4154
 helm/benchmark/model_deployment_registry.py,sha256=BjL0ghHgO7_Z5jZZ7kuSOj9saegI3BivaL-b699C0rc,9527
-helm/benchmark/model_metadata_registry.py,sha256=q8-tERzoY6i6qFrKs9IM4R3BRjO31t0gmdT0lpCmdRE,8009
+helm/benchmark/model_metadata_registry.py,sha256=fXRJOLUIrLOHUG5duncEqhnpmfb9hyloUlGbOM2L9ds,8194
 helm/benchmark/multi_gpu_runner.py,sha256=WmTKpVfcKXyiiPzrmxpbvQoZy0Ua8IyPgxB8r_3jrRw,4773
 helm/benchmark/run.py,sha256=tF_aWy5GtfwBOT1ZRKWrcI74VpFWGzlR00EKiGG7zyI,12572
-helm/benchmark/run_expander.py,sha256=NiwP10enxRtBj3yJys1AYCvVUBD9CG4mrf_L6QVBRsk,47965
+helm/benchmark/run_expander.py,sha256=jolEPDrB4lL_VJNRpT1SQta6DZ_xyq2HaIfWHdeyNtA,47785
 helm/benchmark/run_spec.py,sha256=GiIU8iGO2FGYFDWIxt51CeNPsW7rM7BzDqH1KgEL1cg,3217
-helm/benchmark/run_spec_factory.py,sha256=KQIUmoNa6iHc-xp8-lDMG1PhfgE2_eTFejyyZ1jXFj0,6874
+helm/benchmark/run_spec_factory.py,sha256=nRP9737niPReD5G7t9fgyQ8_EUQ1hvg2VBQe5rSZ08Y,6816
 helm/benchmark/runner.py,sha256=zlHDJ2Ys5-HxtXcwpkXcrdfXy_i886fBcq1iNeLyC3Q,14669
 helm/benchmark/runner_config_registry.py,sha256=2gW5wBLkHdYb2WNbZulto06hTcto2ROvjy8HULw3jNM,515
 helm/benchmark/server.py,sha256=ysd5MT1TDu65NH-OzIGf9wmZlr8FHNRwoy2ybjSc5Yk,6140
 helm/benchmark/slurm_jobs.py,sha256=eNCAoaWDfT0Wk32ZJRIGo-x8kgjhDPnPB4Xrvw_eLB0,3225
 helm/benchmark/slurm_runner.py,sha256=Tozimrjr2R6mlKHcmrGgxTy9ga-ArIW6AoAWtxqzw-M,16567
 helm/benchmark/test_data_preprocessor.py,sha256=_esdtkqyU_8Yp5ZOO7n1b-Y4Qc28wpD5drG-4Y4UhIM,2219
-helm/benchmark/test_model_deployment_definition.py,sha256=wGN95ku-mROh-yiHH3bL8GC7OWkBOa0YJCS2RIGRv8k,4468
 helm/benchmark/test_run_expander.py,sha256=gLeHkNt_nLgbwEJiYxhwda-eKA3sJAxkYolCvgRN5TY,1163
 helm/benchmark/tokenizer_config_registry.py,sha256=ZOImg38ta0FXZYAWna6q7A5xrG2mU7Ofr-8j4EqGlUY,1585
 helm/benchmark/adaptation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -44,11 +43,11 @@ helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py,sha256=f_bg
 helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py,sha256=v6LLmVTopXNfzo9Qzq16EmmPPivFGGs9LuaPDJAX4vY,9506
 helm/benchmark/adaptation/adapters/multimodal/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py,sha256=o7CGClyVWYOuJ4G56-whq5fTvCr7QIn51Mo6DTdvwg0,1881
-helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py,sha256=9_9PgNIiJDoTm1NIhCKWytCYe3hHzHJUhaAdd8hM0PQ,6300
-helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py,sha256=5jdVVfWf0lBErwzDamIHozZSbf1mEkpmACX8mPlHJc0,1907
+helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py,sha256=bvY8xT2ak_3WG4m2Z5bCM6FLImPIWG1qAn9H2ZNwNv0,6359
+helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py,sha256=jyL61UxBsIr68hUz-jtjBUnyB2HBp5ESNyECGp_Gf6Q,2129
 helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py,sha256=PWI_WrfDolipj7Zs43YxFQk36jBgU76PU-kL8R9gRno,4759
 helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py,sha256=VjSqWiZEcW6K2jrokGUmky7syEOqJ6cbHImR7YZgwzU,10151
-helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py,sha256=2OZhhOtEZ7RvItHZD4AGsefXvVUwEQZL2T-ESHPsvdI,3482
+helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py,sha256=KKOOlna6SHLJHSPgfgguPQysc2Nf4kKrqumqwlG27bs,3542
 helm/benchmark/annotation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 helm/benchmark/annotation/annotator.py,sha256=2UIXY71S5dRaZBLb1v4lcv8-O6pyJ9zTeSJl78AEWGI,1538
 helm/benchmark/annotation/annotator_factory.py,sha256=z5AGBylIuy-_IfgikX66VyGvRz4SxtnOcJsyESH8990,2699
@@ -57,7 +56,7 @@ helm/benchmark/annotation/test_dummy_annotator.py,sha256=LfY1ErJDUJ7rD8JUy92RUDD
 helm/benchmark/annotation/image2structure/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 helm/benchmark/annotation/image2structure/image_compiler_annotator.py,sha256=eJFm3iyBe_eEN5Yt0G2IpeA1xdKxRmyR4krsNd6eXoE,3524
 helm/benchmark/annotation/image2structure/latex_compiler_annotator.py,sha256=yRifoqhGq_mQkkRcgKCFpGrZaI9gochOXYiCU8oY1KE,2477
-helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py,sha256=mvUg5tfJayACaqJW_wJOehZzBvLUlJppspphR9QYHOU,4034
+helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py,sha256=we6K1BynV907ZMnGI2zb_tru1uw2iGEI06Wtbnus23w,4010
 helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py,sha256=rvzdQCaVFM6ovF28TSUnNmB47f2hidlaZm6vO4DJpso,6404
 helm/benchmark/augmentations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 helm/benchmark/augmentations/cleva_perturbation.py,sha256=arUkY_luc274YEMZocOos9rpAZVbEFZphbMlobAxTy0,29208
@@ -73,12 +72,12 @@ helm/benchmark/augmentations/lowercase_perturbation.py,sha256=e-lhkuKOV5QR_GnOp7
 helm/benchmark/augmentations/mild_mix_perturbation.py,sha256=q27-c8_di1jaLhUuo5LzqOStnJcWIjSJ0a8Sr3S5TOs,1787
 helm/benchmark/augmentations/misspelling_perturbation.py,sha256=IE45qEE0HuIln62tAGvCP9B_RjNMJ5Xu06goL6b0fWQ,2144
 helm/benchmark/augmentations/person_name_perturbation.py,sha256=AIF5VVjNJ1OlUyCC1bt2dcP1KDn4gdgqd7Y2KGvHPqs,14422
-helm/benchmark/augmentations/perturbation.py,sha256=fsvyR2qxFqMKmxF1OCzVqmKkWa0FTk5Jff0wbgKHpDQ,3083
+helm/benchmark/augmentations/perturbation.py,sha256=GapQckD3zkzZZB-PIVO1KKOy7aISGOhRbGGXEzuAFeQ,3880
 helm/benchmark/augmentations/perturbation_description.py,sha256=VKOwBRPQY-0vuxhGvtac1Z5F10metPfpFnfs8ykFVmU,1184
 helm/benchmark/augmentations/space_perturbation.py,sha256=g4rbyoureBaOVf_lrRXIWYlL6YZpqbCOJk554iOCuIs,935
 helm/benchmark/augmentations/suffix_perturbation.py,sha256=P3AfJj_ajTVdjO7AJRQ9dKS-cT1PyRSt8Un57iZQDVc,785
 helm/benchmark/augmentations/synonym_perturbation.py,sha256=komOV5M342_8unopnwN6gkPWpJIZXidywiu6PO9_riU,4151
-helm/benchmark/augmentations/test_perturbation.py,sha256=ERQX6UySs5sep0lesNlQiPfdoTlRjFmHGTpg7vMxSWE,11902
+helm/benchmark/augmentations/test_perturbation.py,sha256=4EooKVcyub70I81trzpNx3Ij-m1vpFa5cFIo6O52icE,13185
 helm/benchmark/augmentations/translate_perturbation.py,sha256=dn8wO5UOgYbGtP9e77SmwaK2ginrQsTw-79nrzRzfeo,1054
 helm/benchmark/augmentations/typos_perturbation.py,sha256=_F9zwvrLie8hX7mzUtQmYq6oq6yqaFiKGsvc9LAuBr4,2798
 helm/benchmark/data_overlap/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -107,9 +106,9 @@ helm/benchmark/metrics/decodingtrust_privacy_metrics.py,sha256=l6TmAflBSgQGLjB-U
 helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py,sha256=KhiJ_7tnK8kHg2acKPAUWBQvAtyvjJTwcWJMUSUBHak,6416
 helm/benchmark/metrics/disinformation_metrics.py,sha256=BsLBG5krHGQh_nKDAeSDpp-li89s7fYoYTf5T0RZfMM,7782
 helm/benchmark/metrics/dry_run_metrics.py,sha256=d8RgltW4nGTH1tZeGOIlQRwRaJLIxL60h46NXv_wv1s,3674
-helm/benchmark/metrics/efficiency_metrics.py,sha256=Y3y-gVOgKfAcsWUPTINmlmyJ-mOyrQ22KuHvhMZPCRM,11522
+helm/benchmark/metrics/efficiency_metrics.py,sha256=v8Eg56HHIWEMQruODKBvwdUfR6ZLGgrNifo-senCaUo,11786
 helm/benchmark/metrics/evaluate_instances_metric.py,sha256=EBUf0ONnNoi7pcxYab7RD0B_JqGksqDX8TOaosSmJk8,2847
-helm/benchmark/metrics/evaluate_reference_metrics.py,sha256=CtVqdmQBdHxXyb1Cv3UZwCstHTOjNPgQ0cgmEJ6mMSQ,15063
+helm/benchmark/metrics/evaluate_reference_metrics.py,sha256=vUJavaLVfbWtrwyrIA81npK_1iirhko7_zMF1kL7Gfw,15559
 helm/benchmark/metrics/instruction_following_critique_metrics.py,sha256=Pj1itUJi_KDy0D-FOPcOyHqm4ypHMfhbAVeDJzGlyeo,9773
 helm/benchmark/metrics/language_modeling_metrics.py,sha256=ofqwj1PMJQu16QhLDULXBmZ5iFz91ducwLRpNsRYELE,4510
 helm/benchmark/metrics/machine_translation_metrics.py,sha256=bp_EDXyxntIty5gORDa7va-C73quOzoTc5o8MpxFmL4,3816
@@ -182,7 +181,7 @@ helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py,sha256=_wJ3E3L
 helm/benchmark/metrics/tokens/token_cost_estimator.py,sha256=fTGUfhHV6yMwpTkCEMTGMxKO8jskqJz4sAtwXT6M_C8,425
 helm/benchmark/metrics/vision_language/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 helm/benchmark/metrics/vision_language/emd_utils.py,sha256=3yN-DY5rxMabmtLV003lj59SRnp_T83sLAi96rycKEo,15043
-helm/benchmark/metrics/vision_language/image_metrics.py,sha256=8dtrEYAjYKtr1ID5ytjKxiTwXg0loppBjYAV8Av956c,20059
+helm/benchmark/metrics/vision_language/image_metrics.py,sha256=aJ3zrVOLJJzdVKqXPcFsCXp9LSHET8VGEgtvwK-nkJc,25190
 helm/benchmark/metrics/vision_language/image_utils.py,sha256=XeYF3E6MnYyPJ5hYp4TtiTP27-y4S8LTBH5bZVcvJFg,3758
 helm/benchmark/presentation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 helm/benchmark/presentation/contamination.py,sha256=PiIdcaD3-xfExjOmyL5q4Ao2ASa-OlScJAB9u1Zxe7o,2811
@@ -205,7 +204,7 @@ helm/benchmark/run_specs/instruction_following_run_specs.py,sha256=GElJhgbQhlZMY
 helm/benchmark/run_specs/lite_run_specs.py,sha256=ViCPJ86Aah8301GTEk6z4_MtP0g8iik33t4GudobhWQ,11113
 helm/benchmark/run_specs/simple_run_specs.py,sha256=0kK_e8U4JUWZ6wO4N-GPFRE1iGT4ilvSMUGfirvpIE0,3837
 helm/benchmark/run_specs/unitxt_run_specs.py,sha256=ejp_knrcIjf0J4WiKj9LTgDTcUr29-XFZYHYz0w_dkM,1518
-helm/benchmark/run_specs/vlm_run_specs.py,sha256=S3x80FbO7JQE3L6A0tZ70lBLKThfC0vAASdyP_cj3Ko,17477
+helm/benchmark/run_specs/vlm_run_specs.py,sha256=CmdyEF-pdFIlMhBV7UraQ0FuQgQl2rqVSdTz22uYuPQ,26808
 helm/benchmark/scenarios/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py,sha256=Wyt7J5BAvAqC5JTqCW4fh7ex9-itX11P_9rLTocqvtk,4973
 helm/benchmark/scenarios/babi_qa_scenario.py,sha256=S1tPQY2x1I3hQL1JQ6wvUwvKyiSe7SqpRSW6N3_T0mo,5043
@@ -248,13 +247,13 @@ helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py,sha256=PuwcuHnx3nCuRYEE
 helm/benchmark/scenarios/koala_scenario.py,sha256=ihx39W1C9KyOh5MkdMqw1twUwzA8S5QPsBKeQRZcZGM,1365
 helm/benchmark/scenarios/legal_summarization_scenario.py,sha256=DTYYsf-74NFjHvrNa5ZWSgMywq9ss16r5eFZI2-xZJk,8644
 helm/benchmark/scenarios/legal_support_scenario.py,sha256=hu8SlDGrU6oUTQjnrLEagjFNniW96wMahKjnrtuyjzo,3925
-helm/benchmark/scenarios/legalbench_scenario.py,sha256=55D09TaELT_B__ERyyWUp92gGwXifNgX1eIOzw2pgBE,4398
+helm/benchmark/scenarios/legalbench_scenario.py,sha256=jLo8HFE24M7pcS4PgrpAYhiZNuXSfPUb6LrGoaGCT1A,4490
 helm/benchmark/scenarios/lex_glue_scenario.py,sha256=vsy8Y03hoCwTJFgBdQf0koxlHDRFm2V9wT04n2K1tlk,10268
 helm/benchmark/scenarios/lextreme_scenario.py,sha256=HEpXLdLLywo6yfXaGF3zRgx1PiGPE2ILnpKYs9Dmsow,20396
 helm/benchmark/scenarios/live_qa_scenario.py,sha256=meJvkkbqRUQ2odKkmlOLYOVVFd0LGDhGzK5nHCjXW6Y,3642
 helm/benchmark/scenarios/lm_entry_scenario.py,sha256=_7lTn2kfUQHi3ynX7fEaoseUYkpBx-ANn62kqHr-bY4,9085
 helm/benchmark/scenarios/lsat_qa_scenario.py,sha256=Ui68mi05N3WxGMZfCznJBscKkqaJ0ZEUO3PhIiD1xNE,6129
-helm/benchmark/scenarios/math_scenario.py,sha256=mL6Xf0xTAKcrH_YGqTWJxACnctGRUIQh7alLoLqUL8s,14144
+helm/benchmark/scenarios/math_scenario.py,sha256=LvoQJUwZ0w78h5X1tIaa6SRsef_W_PNn4L_r5L3Cus8,14168
 helm/benchmark/scenarios/me_q_sum_scenario.py,sha256=WLfED6DiIOUPd4PowydPmrQ4v26OsjG4iHumq7ka9do,4001
 helm/benchmark/scenarios/med_dialog_scenario.py,sha256=f7hTcEpxGfoxJ-WXNrSRrgVfbyuYpB624HbyRuMP-KI,7295
 helm/benchmark/scenarios/med_mcqa_scenario.py,sha256=JdVQuzKM4TRyWT6c1zpsMvNk1RYHnKdy_fRM6W7PCLI,5035
@@ -314,25 +313,35 @@ helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py,s
 helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py,sha256=IB4_GbzQjjXBp-551XZ6PTNUCRX1jLcGfB3bVFI5lo4,3547
 helm/benchmark/scenarios/image_generation/winoground_scenario.py,sha256=E2xPQNQzylDSmqLjjMkQB8D7A6g7bzqtSF4bXPgfVbI,2889
 helm/benchmark/scenarios/vision_language/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py,sha256=zXR0LmXsD2tv_ovJsbY_HP53kdiFOvty7Y_Ai3ZCrT4,3037
 helm/benchmark/scenarios/vision_language/bingo_scenario.py,sha256=LiH14xUoEKXn5ZStDbGE4bz9iMEn3-5I39eJ6kvN2UY,4045
-helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py,sha256=us75yZygU7_xPrNLi5yvRBx3G9O3_HOnRRgfjcbqXzI,3709
+helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py,sha256=82qplX4gJ4GsSVhBjwrsVU46TAHh-jym3F_M5A-odRE,4608
+helm/benchmark/scenarios/vision_language/flickr30k_scenario.py,sha256=3pBAQgOsnSyMCzt60s1m8Kf_fEJ4C7XgCDbtXatTlX0,2599
+helm/benchmark/scenarios/vision_language/gqa_scenario.py,sha256=sBQfqAxmP-Z0ifCgwTbP11aPsKA4vogcWBqSDiKlbE4,3512
+helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py,sha256=7KjsXiAaiVHDRSyW08jZXNUTWogP3Sr2Og5ViT6Xz8I,3832
 helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py,sha256=7GK_jAOfCgRIGiN_GInDePwuT2wZqmWHp1rqdx18xQg,4994
+helm/benchmark/scenarios/vision_language/math_vista_scenario.py,sha256=kzZHeyWQHUphUfAixkms2t3-KKfHRjwIKi6qm-lMyXA,4728
 helm/benchmark/scenarios/vision_language/mementos_scenario.py,sha256=Yw4zxeYgUw8HKRR5ob9QEIT0bSPrdRUULMKCo_xzFpw,4337
+helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py,sha256=ly77pj_TorwM1kN2sW7Y2AIGHOBlDkdzV0STvZTBOtc,4332
 helm/benchmark/scenarios/vision_language/mme_scenario.py,sha256=zxtdub2akvxPYEG12pkW2c57TIFqN38C7ucAXAHAdx0,5455
 helm/benchmark/scenarios/vision_language/mmmu_scenario.py,sha256=XQv7uv2m6EdbI7h0-9eDag4_bL7qE_78PuHB7c4SsHA,7654
+helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py,sha256=HUO09uM2rBXOfCsxzwovmwtihq53xjuzDOtQO_S3J4I,4161
+helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py,sha256=c7YfclYMDtygsLnEfA8oP6Vl7evdrqqTZazmuD9Oy-8,5353
 helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py,sha256=HuizbYsN5Nlihfzu4bfGuC8KSBbeIc6TVknMS4kpVJY,7149
+helm/benchmark/scenarios/vision_language/originality_scenario.py,sha256=1inr-klQEz08CM2GWqbYdy-AuXQmMhOAywAlA0lJHik,1029
+helm/benchmark/scenarios/vision_language/pairs_scenario.py,sha256=rkPR_e_RWOeSyHIlSJGJ5lVu5DD-AR3x686XYJse-1E,9885
 helm/benchmark/scenarios/vision_language/pope_scenario.py,sha256=uFkzMMsjhmuSYo3v_QdfJFX6RFse83JjzMfMa3ynvV4,3975
 helm/benchmark/scenarios/vision_language/seed_bench_scenario.py,sha256=5MwGb9BOyB2Xy70BGYZcjencf0ZskxBuzcPa7ABRuww,5106
 helm/benchmark/scenarios/vision_language/unicorn_scenario.py,sha256=e3lCq2nevy9tIFDDKEbJvmLibfk4UMQtAIyzrgnnaZs,4179
-helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py,sha256=VJCTQU0jG_ROtGD-fu21BBrcMGHH1Fv-_DTEzEOHUjg,4127
-helm/benchmark/scenarios/vision_language/vqa_scenario.py,sha256=vAf6MkYNDgi6_ODhz-rCY_OVqXKxGVJiwsYGnUw7H18,5089
+helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py,sha256=hJ3sOSpPnOCwLtpVnfasI_X89oofI-2PBRjMnx8eiVA,4139
+helm/benchmark/scenarios/vision_language/vqa_scenario.py,sha256=2hY-qngKC69ZL9SHNei3IK3C2PvJDWvwLFVQ8yNSOVs,5196
 helm/benchmark/scenarios/vision_language/image2structure/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py,sha256=ImhfiC_y_hihAGvlj9zRsaoW614QFCBopBD2KxnbSs0,1805
 helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py,sha256=-eWRwo2x7kR46Z_I4vFbVlbqA_1f2UEb75Dx84XTlNE,9028
 helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py,sha256=FKKybU4IeglwXCj6GZC8cAUs_GOU7ymEa6P1dkDT7uw,1350
-helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py,sha256=UvYzQzAvWRrydPty4LEifCjo502xNaHlmRVUdduvk9k,1048
+helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py,sha256=SyAYkhsipjJG42XfM9sljz1vly5YF-dbSEWTj_dEHIU,1048
 helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py,sha256=ovg8-FfJ8_I1xbajFGSLvERZIA1fQjaUn0zd04ZbI84,15316
-helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py,sha256=PSVBqNxvE5GXEsa3iBuK51vzp3Nho25Gjv9KL_uOMLA,9628
+helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py,sha256=j2bDYeWdytYtkKskvuTMwLEIIqELDJJ6D2jdYzmdlJY,9628
 helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py,sha256=i-i0mlG5oRRDNYNqP7o7Ul56iL02p_anJoThXaSvFiM,2826
 helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py,sha256=9WntahzuhVv54IH1m7_z0IxwLma3dbaMOne_pUx751Y,7652
@@ -347,11 +356,13 @@ helm/benchmark/static/info-icon.png,sha256=P-PW3Ek3NGiRAW5BXOjJRPBfMVqprjAqtQheG
 helm/benchmark/static/json-urls.js,sha256=AaULgfHw8OLfrQLJpBHfcC013uavQnlNNFS9vzb0qOg,1981
 helm/benchmark/static/plot-captions.js,sha256=bTR8gYx-QqF_RJyKX-L-eQP7hSEtawfJSoADCvgjKag,3011
 helm/benchmark/static/schema_classic.yaml,sha256=p-yc2WMfyGehRtD7L5ZZHbFMMQovu2HNfvct3tBlV2I,108168
+helm/benchmark/static/schema_image2structure.yaml,sha256=gig7HVyJWSwcHa96mf-09e68_fU5L02YRWzNbkPmpGg,13520
 helm/benchmark/static/schema_instruction_following.yaml,sha256=mg2g5P8TAYSCEhZbLfshPt_Hq2GKjwbvyOsQrwDqh7w,8923
 helm/benchmark/static/schema_lite.yaml,sha256=62ByEWhAJT0tIUFi-euxJ7XFhE6e9E6PT9dF6V3qoSU,40255
 helm/benchmark/static/schema_mmlu.yaml,sha256=8kiZDEGGaBXs9ucDk_Gbo2agV-OgOmWuhcYFyodRjcw,53307
 helm/benchmark/static/schema_unitxt.yaml,sha256=89GnKrooG7kKU2xh0MeoYZUB54FDUAmOPrbzuBhG1Ik,15496
-helm/benchmark/static/schema_vlm.yaml,sha256=IJLEPRe9ZCqVPRLQSIc-VpuUmZx03AJ3r_K8C19_zNU,21823
+helm/benchmark/static/schema_vhelm_lite.yaml,sha256=s8tQIetR2WKu3sd8k2uZO68_5E-YtlMdsBJsTehFZKE,7331
+helm/benchmark/static/schema_vlm.yaml,sha256=o9AzLTKwSbPES5pISI0tmpUPKWWT9GR-dleDKZqoI0w,33243
 helm/benchmark/static/utils.js,sha256=bgN0PT53Dregc-nLmEmAEmg2psufWpS8jTf74WoypHw,7681
 helm/benchmark/static/images/crfm-logo.png,sha256=dDkauL_wJR_Luu7L7pltphS3a9HSLjDkpVLa6C9vcA4,62712
 helm/benchmark/static/images/helm-logo-simple.png,sha256=LtVAC4OgcWgMAob53rTrf7cRDu-O0z85ZOGGj9wR9hw,86133
@@ -373,7 +384,7 @@ helm/benchmark/static/images/organizations/together.png,sha256=pmWjW4r7GnlKqFhKL
 helm/benchmark/static/images/organizations/tsinghua-keg.png,sha256=l9SzlZCsLF18BY876wYJcVgiQbgvwte7uoILPDcVwHk,7776
 helm/benchmark/static/images/organizations/yandex.png,sha256=OOCdcKubAP4x7h4VW7z5a-AHPWBiSDTjsIJea6ZiovA,27964
 helm/benchmark/static_build/config.js,sha256=ER8utDIqVZi9uge7Qrk1gmlT88TOOkFF9xYp3j10m8U,165
-helm/benchmark/static_build/index.html,sha256=fqZee8nuPxeRbBNzPgY0AUmGFHF-ngY1dG_MfwfZGLc,1149
+helm/benchmark/static_build/index.html,sha256=g3pMdAovQ4VMr7dPGgyzWv2K1tN-E8LLkAs45ppLPGw,1149
 helm/benchmark/static_build/assets/01-694cb9b7.png,sha256=aUy5t0DYCg4r52HDOmeNi1S2CHsnv3mE7ySokJg3Ouo,8903
 helm/benchmark/static_build/assets/ai21-0eb91ec3.png,sha256=Drkew6Vlwi2_4_S8hjagK2x8smOwLKTNiXIT3rDiurs,10208
 helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png,sha256=fOEANHS8RymKaCzUWn9gQWebts2ghSmtW9Fdda_TjR8,7224
@@ -386,8 +397,8 @@ helm/benchmark/static_build/assets/google-06d997ad.png,sha256=BtmXrVQZHr3WH5c8c2
 helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png,sha256=Pl46pKbC_TU3L6kZQ_3G-0wTseluAhIYwb3EqpdQAjQ,1344452
 helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png,sha256=LtVAC4OgcWgMAob53rTrf7cRDu-O0z85ZOGGj9wR9hw,86133
 helm/benchmark/static_build/assets/helmhero-28e90f4d.png,sha256=KOkPTf-q28PdvGOBp1G5O4q1eWUJjuij3z2h_SUUf8s,55314
-helm/benchmark/static_build/assets/index-5088afcb.css,sha256=UIivyx7XvQHnXD0s-J7WaeOZKDl2mgNxUts8n9D57Es,486112
-helm/benchmark/static_build/assets/index-d839df55.js,sha256=d0xZkf7R2yZMo7pQTJb9J1dqZRg5hcqaaoFutsjocbQ,66843
+helm/benchmark/static_build/assets/index-737eef9e.js,sha256=PvNcOghX7gGSYAGk2bR3pvIBnwDbeWHu0JyfPNaan3o,70614
+helm/benchmark/static_build/assets/index-878a1094.css,sha256=h4oQlJUZdqMk6nS_TEkyXMZ6rtGmepw4ljoSAHZX1vY,486381
 helm/benchmark/static_build/assets/meta-5580e9f1.png,sha256=VYDp8arkAe2eYRJhAOcIAsZY1qY0hqyOEQDgVMbX9M8,4646
 helm/benchmark/static_build/assets/microsoft-f5ee5016.png,sha256=9e5QFl23yTbnAk8u7lZKaQOf4oPHbr_aiQda5n4MZqE,50850
 helm/benchmark/static_build/assets/mistral-18e1be23.png,sha256=GOG-Ix7XlctGOUmvJfO2oVSBM7E5O562G88OnoxsjBw,14402
@@ -443,7 +454,7 @@ helm/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 helm/clients/ai21_client.py,sha256=LIdkmzcUDR9uIF2tIk5YgDNGNmfQ9JDYmgscvFoCHDs,5509
 helm/clients/ai21_utils.py,sha256=mlg3h615kyckccGZv9rqsP4Y60O3XpwyE-UURRMrxII,471
 helm/clients/aleph_alpha_client.py,sha256=koPqXF6uRD905atoiCaPg5yxr6B25J0g2OTWk8geebQ,4969
-helm/clients/anthropic_client.py,sha256=woFek17fbUJ30vVdVj1cZX8AxI0VwMoc5eevu9z69C4,31585
+helm/clients/anthropic_client.py,sha256=0hAmv3f6FQURScmDpcGbwGjnvskNRP2vhRH02OSe70I,33224
 helm/clients/auto_client.py,sha256=Qs0XFq9pyH4M9HTOLoI3_5m8kW305x3pzVukgETdrZM,10732
 helm/clients/bedrock_client.py,sha256=BsH9UopsP6ZHf-K0Yzg1PYSMLDwY0yIUmPHDhJVMUi0,5293
 helm/clients/bedrock_utils.py,sha256=okZ6Z8pviGOUNlrdF2QquAqFs8-QYgcqci95eij8giM,2574
@@ -462,7 +473,7 @@ helm/clients/megatron_client.py,sha256=KFL1BBBDqxr5mtd5iu0dA6uK8_v6d4g_D6RsZrHx3
 helm/clients/mistral_client.py,sha256=thOLMcEfrzWR00JUabIZ_PnW2o9YZsdSmNf9z3jbYKo,5982
 helm/clients/moderation_api_client.py,sha256=I5pYWRb2MmcLDYrScnC3P5N7OUFzQiVQ828_hf7zjM4,4719
 helm/clients/open_lm_client.py,sha256=qFgYqlV_3UiW8WJKz66lLqRqg2jt1qtJ1bHMRAtBn40,1749
-helm/clients/openai_client.py,sha256=ghwmaFCUTy82HeEq49g-PD-Z9hRs89bS_T0JMlzHgEA,13922
+helm/clients/openai_client.py,sha256=gWqr4dvYfbUnBtfySSUGGVZYV-pLtqcrnYaf7nPk5-s,13936
 helm/clients/palmyra_client.py,sha256=LBYFHNc5LdpPbiSp1AAHuMm8cUUCQ2EB03BB6XnDTYQ,6551
 helm/clients/perspective_api_client.py,sha256=WQDArqlKVWwcK2SicnSIAgV6JGVHsxibTzkdezT3z_U,5920
 helm/clients/simple_client.py,sha256=55S_y1eWD1bjktcG21Vs8G5bF6QbKKwmJyqs6lCUJeI,2048
@@ -471,9 +482,9 @@ helm/clients/test_client.py,sha256=g29C1WLUONnNuE2oGFZhaqMahb-doS4l_Ph4OHrQvrc,3
 helm/clients/test_huggingface_client.py,sha256=WUPrA7VT3nnMNht7w20I6411hlpIS_77XbQC2vC0WU0,2723
 helm/clients/test_simple_client.py,sha256=G0JRQX69ypQN2VxhlNQXs5u2Tdtkcl_aeHqudDUVKi4,702
 helm/clients/test_together_client.py,sha256=lAtGKn3WdsYe5MEfTYVYRnu_rS4DPnfFr5jRn42rvoQ,3865
-helm/clients/together_client.py,sha256=xvqSaXvGeiJtf8c8MzgCMrXWM6m18y_kMFcry7NsmL4,12819
+helm/clients/together_client.py,sha256=fCPJ39fX3xm_Gp6cGsc1HIf1jVMLNiE2kIkee45-Ufk,16208
 helm/clients/toxicity_classifier_client.py,sha256=AI_FizxMurubTIyeceRdkixSnhWQbcD-oEEONj5ve7o,464
-helm/clients/vertexai_client.py,sha256=RMIuOXi7W0PGcZKy5ClsPc4dPVlVh6DHOCthQlUyrvo,18564
+helm/clients/vertexai_client.py,sha256=Mt1rb9lWeQqJLGcBSR5mflYBvJvJfsv5OeIuQz4_ng0,19726
 helm/clients/vllm_client.py,sha256=p9atBtq3PBOoPkOPSifkMrYZjNLnNM_sWM6tL_3N-WY,1675
 helm/clients/clip_scorers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 helm/clients/clip_scorers/base_clip_scorer.py,sha256=NfXe79g6M4Wype3Xf-oXxscaUgjbZLmy9dRnBaLiWwk,695
@@ -532,8 +543,9 @@ helm/clients/image_generation/mindalle/utils/config.py,sha256=lh8dXvL7ctKmuYEbeT
 helm/clients/image_generation/mindalle/utils/sampling.py,sha256=soTHaJrN4FV1lDdh9HMveJs6F49UMK57Xfa0ccnHqI8,5029
 helm/clients/image_generation/mindalle/utils/utils.py,sha256=ESugpzG-_73GKl07mj-8o-_nim_FOICxfYkczy3s9x4,3119
 helm/clients/vision_language/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-helm/clients/vision_language/huggingface_vlm_client.py,sha256=AMjOlg_Fb65Ah1d3XAt9M-842huhO_vOA8HQfNkAApk,4433
-helm/clients/vision_language/idefics_client.py,sha256=WQSDcoLNcVNmoKZ3PsAsIZOeZHiYQ33IXFz2-ktgVIk,7680
+helm/clients/vision_language/huggingface_vision2seq_client.py,sha256=hTywh5nM95BmPoDyKOSDWg9G3-QwLO3KZEJZVkmFroo,6478
+helm/clients/vision_language/huggingface_vlm_client.py,sha256=X5SX2iMZkFe9Pmq4Gx0O4bnP4gBPnKvamLThRshAEik,4875
+helm/clients/vision_language/idefics_client.py,sha256=hi1VCDBegHfBssmW0C62H3OX3U2ISVRhaSkd24gb1K4,7692
 helm/clients/vision_language/open_flamingo_client.py,sha256=CkN0JCeR742ZG9Nc4A85hp4BSE0WLU-3Rs-ZwdmDkzs,6632
 helm/clients/vision_language/qwen_vlm_client.py,sha256=6rCH4gJMDyQHyjAE_GDIrLsInH_bvd6to-4RMWbRLeM,7407
 helm/clients/vision_language/open_flamingo/__init__.py,sha256=i1tGJj6ckeE6eS1EWV5tbQKYLmPCrdSI45mPchfv_Ic,88
@@ -557,7 +569,7 @@ helm/common/general.py,sha256=nMfHNPXyAAorAMmgDClD8r8XXeJcvfF0QXTP-FgH5PQ,11690
 helm/common/gpu_utils.py,sha256=pmLq6ipYNLEm28VxxSNeZuVt-gAw-WnYmBvxP1P1p6M,480
 helm/common/hierarchical_logger.py,sha256=EnKLnfbQftca08EJfjGEQb4tcnCKbx-JtwLnoCnhMQs,2908
 helm/common/image_generation_parameters.py,sha256=nsbuk_-BlRMK6IwP5y6BnTXbTRTOcvZ6uLblL5VHLOo,916
-helm/common/images_utils.py,sha256=uYqxp3nnfDwzg0hAiFX58ZPat944xKekLU_pWSwadXQ,2319
+helm/common/images_utils.py,sha256=zbzS8C_oCDb9dY2xpWY6nljI8of72rqwijryMeiBKKo,2527
 helm/common/key_value_store.py,sha256=iHi1WQuWttLNJnuM48QNOAXHoneNbmbBmtXYPq-dyys,3147
 helm/common/media_object.py,sha256=3VZqfb0py5dDKwWtnLp2kdl8svaike-Cn7Mjk-b0cvM,5130
 helm/common/moderations_api_request.py,sha256=3xTsErSsCr2PHD2jpdV1JglHaYHwP2Yqu25_JFtfa68,2234
@@ -578,9 +590,9 @@ helm/common/file_caches/file_cache.py,sha256=QfF1hlF8FQ-rcPn9Zyl6L0dOCokvYgd-dFq
 helm/common/file_caches/local_file_cache.py,sha256=wBOAbbkGLiClaX4YdunokRfSQCKNkTYmMVx2KTLy4Lc,1921
 helm/common/file_caches/test_local_file_cache.py,sha256=bOCWR9MglwQXV98xk8auyjgFxaOr85zRdxWwxMBQW9s,663
 helm/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-helm/config/model_deployments.yaml,sha256=ptZz4bzZPbbuSCIvpyNm3UoKXZXxGsHyuAD0Kir9vH0,71211
-helm/config/model_metadata.yaml,sha256=LkHebb_FklRvm6DJCOO2leAcbeVQwfNI4xwcIuvMcfg,106991
-helm/config/tokenizer_configs.yaml,sha256=_XdrvaiBCPF09joBt6xKxuh9IIz3oW40whhThTfpx-c,11841
+helm/config/model_deployments.yaml,sha256=KAD0FZ45ERfEjr3y7HbPxZmEnnJBQiiOHRHN7VxqiF4,74817
+helm/config/model_metadata.yaml,sha256=XpJnlu0kiI5sGEqswF_S6_ra0Iys3VOfsDs2Jiz_Vqk,112991
+helm/config/tokenizer_configs.yaml,sha256=3IhRANDTlN39TWqDWuPy507wQlZWOBlyaS8fA6WLDD0,12070
 helm/proxy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 helm/proxy/accounts.py,sha256=gd5cKhKeqklf_cXCAISl65AUvZeD6afBNrs6WK3IBvQ,14764
 helm/proxy/cli.py,sha256=l8F7UYqrIOoBD9ZCIxJFA4fhxlzhae0-2Nn8A7FMkzk,8244
@@ -634,9 +646,9 @@ helm/tokenizers/yalm_tokenizer_data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQ
 helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py,sha256=W9p5QNn1GSm-y85yVEQe_82zn5CVK_vR6jvhk7JTs_k,869
 helm/tokenizers/yalm_tokenizer_data/voc_100b.sp,sha256=LmPD0_OIOXi8dWuNjXUYOSPhf8kPp2xhvK-g3bXcwrQ,2815034
 helm/tokenizers/yalm_tokenizer_data/yalm_tokenizer.py,sha256=kH5Qig1_6r_sKbAHinX7C83tqBUoTwbe-gGZCbGVkko,6389
-crfm_helm-0.5.0.dist-info/LICENSE,sha256=bJiay7Nn5SHQ2n_4ZIT3AE0W1RGq4O7pxOApgBsaT64,11349
-crfm_helm-0.5.0.dist-info/METADATA,sha256=KyvhONCTx1d4n_igPIOM-GhnFjbXoqWkYQjSUM1P3G0,18973
-crfm_helm-0.5.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-crfm_helm-0.5.0.dist-info/entry_points.txt,sha256=AvH9soAH3uey9xffisWewd0yrmPWGASC036jHd1SFyg,300
-crfm_helm-0.5.0.dist-info/top_level.txt,sha256=s9yl-XmuTId6n_W_xRjCS99MHTwPXOlkKxmTr8xZUNY,5
-crfm_helm-0.5.0.dist-info/RECORD,,
+crfm_helm-0.5.1.dist-info/LICENSE,sha256=bJiay7Nn5SHQ2n_4ZIT3AE0W1RGq4O7pxOApgBsaT64,11349
+crfm_helm-0.5.1.dist-info/METADATA,sha256=dVxnv-vEsYZb3v-ALFNpSdpbxwi5WQG5_I1oD3cMs6Y,19157
+crfm_helm-0.5.1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+crfm_helm-0.5.1.dist-info/entry_points.txt,sha256=AvH9soAH3uey9xffisWewd0yrmPWGASC036jHd1SFyg,300
+crfm_helm-0.5.1.dist-info/top_level.txt,sha256=s9yl-XmuTId6n_W_xRjCS99MHTwPXOlkKxmTr8xZUNY,5
+crfm_helm-0.5.1.dist-info/RECORD,,

helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py CHANGED Viewed

@@ -79,6 +79,7 @@ class InContextLearningMultimodalAdapter(InContextLearningAdapter, ABC):
         # Prompt
         prompt = MultimodalPrompt(
             global_prefix=self.adapter_spec.global_prefix,
+            global_suffix=self.adapter_spec.global_suffix,
             instructions=self.adapter_spec.instructions,
             train_instance_blocks=train_instance_blocks,
             eval_instance_block=eval_instance_block,

helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py CHANGED Viewed

@@ -11,6 +11,9 @@ class MultimodalPrompt:
     # Global prefix, carried over from `AdapterSpec`
     global_prefix: str
+    # Global suffix, carried over from `AdapterSpec`
+    global_suffix: str
     # Instance prefix, carried over from `AdapterSpec`. What goes between the instruction and instances.
     instance_prefix: str
@@ -47,6 +50,10 @@ class MultimodalPrompt:
         if self.global_prefix:
             result = result.add_textual_prefix(self.global_prefix)
+        # Add the global suffix if one exists
+        if self.global_suffix:
+            result = result.add_textual_suffix(self.global_suffix)
         return result
     @property

helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py CHANGED Viewed

@@ -32,6 +32,7 @@ class TestMultimodalContent(unittest.TestCase):
         prompt = MultimodalPrompt(
             global_prefix="[START]",
+            global_suffix="",
             instance_prefix="\n",
             instructions="Please answer the following questions about the images.",
             train_instance_blocks=train_instance_blocks,
@@ -67,6 +68,7 @@ class TestMultimodalContent(unittest.TestCase):
         prompt = MultimodalPrompt(
             global_prefix="",
+            global_suffix="",
             instance_prefix="\n",
             instructions="",
             train_instance_blocks=[],

helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py CHANGED Viewed

@@ -18,7 +18,7 @@ class LilypondCompilerAnnotator(ImageCompilerAnnotator):
     """Annotator that compiles the text completions into a music sheet with LilyPond."""
     name: str = "lilypond_compiler"
-    base_path = "/home/josselin/installs/lilypond-2.24.3/bin"
+    base_path = "lilypond-2.24.3/bin"
     def __init__(self, cache_config: CacheConfig, file_storage_path: str):
         super().__init__(cache_config, file_storage_path)

helm/benchmark/augmentations/perturbation.py CHANGED Viewed

@@ -48,11 +48,27 @@ class TextPerturbation(Perturbation, ABC):
         description = replace(self.description, seed=seed)
+        perturbed_input: Input
+        if instance.input.multimedia_content:
+            perturbed_media_objects = []
+            for media_object in instance.input.multimedia_content.media_objects:
+                # Apply perturbations to the text data of the multimedia content
+                if media_object.is_type("text") and media_object.text is not None:
+                    perturbed_media_objects.append(replace(media_object, text=self.perturb(media_object.text, rng)))
+                else:
+                    perturbed_media_objects.append(media_object)
+            perturbed_input = Input(
+                multimedia_content=replace(instance.input.multimedia_content, media_objects=perturbed_media_objects)
+            )
+        else:
+            perturbed_input = Input(text=self.perturb(instance.input.text, rng))
         # Don't modify `id` of `Instance` here.
         # All the perturbed Instances generated from a single Instance should have the same ID.
         return replace(
             instance,
-            input=Input(text=self.perturb(instance.input.text, rng)),
+            input=perturbed_input,
             references=references,
             perturbation=description,
             contrast_inputs=[instance.input],

helm/benchmark/augmentations/test_perturbation.py CHANGED Viewed

@@ -2,6 +2,7 @@
 from typing import List
 import unittest
+from helm.common.media_object import MediaObject, MultimediaObject
 from helm.benchmark.scenarios.scenario import Input, Instance, Output, Reference
 from .data_augmenter import DataAugmenter
 from .extra_space_perturbation import ExtraSpacePerturbation
@@ -33,6 +34,35 @@ def test_extra_space_perturbation():
     assert instances[1].references[0].output.text == "some name"
+def test_multimodal_text_perturbation():
+    data_augmenter = DataAugmenter(perturbations=[ExtraSpacePerturbation(num_spaces=3)])
+    input: Input = Input(
+        multimedia_content=MultimediaObject(
+            [
+                MediaObject(text="Hello what is", content_type="text/plain"),
+                MediaObject(text="your name", content_type="text/plain"),
+            ]
+        )
+    )
+    instance: Instance = Instance(id="id0", input=input, references=[Reference(Output(text="some name"), tags=[])])
+    instances: List[Instance] = data_augmenter.generate([instance], include_original=True)
+    assert len(instances) == 2
+    # Test that the first instance is unperturbed
+    assert instances[0].id == "id0"
+    assert instances[0].perturbation is None
+    media_objects = instances[0].input.multimedia_content.media_objects
+    assert media_objects[0].text == "Hello what is"
+    assert media_objects[1].text == "your name"
+    assert instances[1].id == "id0"
+    assert instances[1].perturbation.name == "extra_space"
+    media_objects = instances[1].input.multimedia_content.media_objects
+    assert media_objects[0].text == "Hello   what   is"
+    assert media_objects[1].text == "your   name"
 def test_misspelling_perturbation():
     data_augmenter = DataAugmenter(perturbations=[MisspellingPerturbation(prob=1.0)])
     instance: Instance = Instance(

helm/benchmark/metrics/efficiency_metrics.py CHANGED Viewed

@@ -91,8 +91,15 @@ class EfficiencyMetric:
         window_service: WindowService = WindowServiceFactory.get_window_service(
             adapter_spec.model_deployment, tokenizer_service
         )
-        prompt: str = request_state.request.prompt
-        num_prompt_tokens: int = window_service.get_num_tokens(prompt)
+        prompt: str
+        num_prompt_tokens: int
+        if request_state.request.multimodal_prompt is not None:
+            prompt = request_state.request.multimodal_prompt.text
+            num_prompt_tokens = window_service.get_num_tokens(prompt)
+        else:
+            prompt = request_state.request.prompt
+            num_prompt_tokens = window_service.get_num_tokens(prompt)
         # Total number of tokens in the completion.
         num_completion_tokens: int = sum([len(completion.tokens) for completion in request_state.result.completions])

helm/benchmark/metrics/evaluate_reference_metrics.py CHANGED Viewed

@@ -10,6 +10,7 @@ from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
 from helm.benchmark.scenarios.code_scenario import CodeReference
 from helm.benchmark.scenarios.scenario import Reference
+from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import GeneratedOutput
 from helm.benchmark.scenarios.math_scenario import is_equiv, is_equiv_chain_of_thought
 from nltk.metrics.scores import f_measure
@@ -21,6 +22,7 @@ import string
 from . import code_metrics_helper
 import nltk
 try:
     nltk.data.find("tokenizers/punkt")
 except LookupError:
@@ -188,6 +190,19 @@ def bleu_4(gold: str, pred: str) -> float:
     return sentence_bleu([word_tokenize(gold)], word_tokenize(pred), weights=(0, 0, 0, 1))
+def cider(gold: str, pred: str) -> float:
+    try:
+        from pycocoevalcap.cider.cider import Cider
+    except ModuleNotFoundError as e:
+        handle_module_not_found_error(e, ["vlm"])
+    cider_evaluator = Cider()
+    candidate = {"caption": [pred]}
+    reference = {"caption": [gold]}
+    average_score, _ = cider_evaluator.compute_score(reference, candidate)
+    return average_score
 def extract_set_from_text(
     set_str: str,
     set_start_str: str = " is ",
@@ -325,6 +340,7 @@ def compute_reference_metrics(
         "math_equiv_chain_of_thought": is_equiv_chain_of_thought,
         "code_eval_acc": code_eval,
         "pass": code_eval,
+        "cider": cider,
         "f1_score": f1_score,
         "rouge_1": get_rouge_function("rouge1"),
         "rouge_2": get_rouge_function("rouge2"),

crfm-helm 0.5.0__py3-none-any.whl → 0.5.1__py3-none-any.whl

Potentially problematic release.

crfm-helm 0.5.0__py3-none-any.whl → 0.5.1__py3-none-any.whl