crfm-helm 0.5.0__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (56) hide show
  1. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/METADATA +7 -3
  2. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/RECORD +53 -41
  3. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
  5. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
  6. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +1 -1
  7. helm/benchmark/augmentations/perturbation.py +17 -1
  8. helm/benchmark/augmentations/test_perturbation.py +30 -0
  9. helm/benchmark/metrics/efficiency_metrics.py +9 -2
  10. helm/benchmark/metrics/evaluate_reference_metrics.py +16 -0
  11. helm/benchmark/metrics/vision_language/image_metrics.py +142 -17
  12. helm/benchmark/model_metadata_registry.py +5 -1
  13. helm/benchmark/run_expander.py +35 -63
  14. helm/benchmark/run_spec_factory.py +11 -10
  15. helm/benchmark/run_specs/vlm_run_specs.py +294 -38
  16. helm/benchmark/scenarios/legalbench_scenario.py +6 -2
  17. helm/benchmark/scenarios/math_scenario.py +1 -1
  18. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
  19. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
  20. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
  21. helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
  22. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +4 -2
  23. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +1 -1
  24. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +1 -1
  25. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
  26. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
  27. helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
  28. helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
  29. helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
  30. helm/benchmark/scenarios/vision_language/pairs_scenario.py +246 -0
  31. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +2 -2
  32. helm/benchmark/scenarios/vision_language/vqa_scenario.py +4 -2
  33. helm/benchmark/static/schema_image2structure.yaml +304 -0
  34. helm/benchmark/static/schema_vhelm_lite.yaml +164 -0
  35. helm/benchmark/static/schema_vlm.yaml +257 -10
  36. helm/benchmark/static_build/assets/index-737eef9e.js +10 -0
  37. helm/benchmark/static_build/assets/index-878a1094.css +1 -0
  38. helm/benchmark/static_build/index.html +2 -2
  39. helm/clients/anthropic_client.py +36 -6
  40. helm/clients/openai_client.py +2 -3
  41. helm/clients/together_client.py +93 -2
  42. helm/clients/vertexai_client.py +59 -50
  43. helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
  44. helm/clients/vision_language/huggingface_vlm_client.py +11 -4
  45. helm/clients/vision_language/idefics_client.py +2 -2
  46. helm/common/images_utils.py +10 -3
  47. helm/config/model_deployments.yaml +100 -2
  48. helm/config/model_metadata.yaml +136 -31
  49. helm/config/tokenizer_configs.yaml +7 -0
  50. helm/benchmark/static_build/assets/index-5088afcb.css +0 -1
  51. helm/benchmark/static_build/assets/index-d839df55.js +0 -9
  52. helm/benchmark/test_model_deployment_definition.py +0 -90
  53. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/LICENSE +0 -0
  54. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/WHEEL +0 -0
  55. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/entry_points.txt +0 -0
  56. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: crfm-helm
3
- Version: 0.5.0
3
+ Version: 0.5.1
4
4
  Summary: Benchmark for language models
5
5
  Home-page: https://github.com/stanford-crfm/helm
6
6
  Author: Stanford CRFM
@@ -25,7 +25,7 @@ Requires-Dist: tqdm ~=4.64
25
25
  Requires-Dist: zstandard ~=0.18.0
26
26
  Requires-Dist: sqlitedict ~=1.7
27
27
  Requires-Dist: bottle ~=0.12.23
28
- Requires-Dist: datasets ~=2.15
28
+ Requires-Dist: datasets ~=2.17
29
29
  Requires-Dist: pyarrow >=11.0.0
30
30
  Requires-Dist: pyarrow-hotfix ~=0.6
31
31
  Requires-Dist: nltk ~=3.7
@@ -34,7 +34,7 @@ Requires-Dist: rouge-score ~=0.1.2
34
34
  Requires-Dist: scipy ~=1.10
35
35
  Requires-Dist: uncertainty-calibration ~=0.1.4
36
36
  Requires-Dist: scikit-learn ~=1.1
37
- Requires-Dist: transformers ~=4.37
37
+ Requires-Dist: transformers ~=4.40
38
38
  Requires-Dist: torch <3.0.0,>=1.13.1
39
39
  Requires-Dist: torchvision <3.0.0,>=0.14.1
40
40
  Requires-Dist: google-api-python-client ~=2.64
@@ -136,6 +136,7 @@ Requires-Dist: crfm-helm[anthropic] ; extra == 'models'
136
136
  Requires-Dist: crfm-helm[google] ; extra == 'models'
137
137
  Requires-Dist: crfm-helm[mistral] ; extra == 'models'
138
138
  Requires-Dist: crfm-helm[openai] ; extra == 'models'
139
+ Requires-Dist: crfm-helm[together] ; extra == 'models'
139
140
  Requires-Dist: crfm-helm[tsinghua] ; extra == 'models'
140
141
  Requires-Dist: crfm-helm[yandex] ; extra == 'models'
141
142
  Provides-Extra: mongo
@@ -158,6 +159,8 @@ Provides-Extra: slurm
158
159
  Requires-Dist: simple-slurm ~=0.2.6 ; extra == 'slurm'
159
160
  Provides-Extra: summarization
160
161
  Requires-Dist: summ-eval ~=0.892 ; extra == 'summarization'
162
+ Provides-Extra: together
163
+ Requires-Dist: together ~=1.1 ; extra == 'together'
161
164
  Provides-Extra: tsinghua
162
165
  Requires-Dist: icetk ~=0.0.4 ; extra == 'tsinghua'
163
166
  Provides-Extra: unitxt
@@ -173,6 +176,7 @@ Requires-Dist: scipy ~=1.10 ; extra == 'vlm'
173
176
  Requires-Dist: torchvision <3.0.0,>=0.14.1 ; extra == 'vlm'
174
177
  Requires-Dist: crfm-helm[images] ; extra == 'vlm'
175
178
  Requires-Dist: crfm-helm[image2structure] ; extra == 'vlm'
179
+ Requires-Dist: pycocoevalcap ~=1.2 ; extra == 'vlm'
176
180
  Provides-Extra: yandex
177
181
  Requires-Dist: sentencepiece ~=0.1.97 ; extra == 'yandex'
178
182
 
@@ -7,19 +7,18 @@ helm/benchmark/data_preprocessor.py,sha256=aNdM-o2t4qkLIQHiQeWUFg03DjjJ8HTBIphYC
7
7
  helm/benchmark/executor.py,sha256=simd7SdJ7TciUpoq3D0uz_XUSCZj5KIWCIP57FYm4js,4906
8
8
  helm/benchmark/huggingface_registration.py,sha256=RzfOaLAnzAcoTphan1JNo836lNyxMSH67oQlolhNLS0,4154
9
9
  helm/benchmark/model_deployment_registry.py,sha256=BjL0ghHgO7_Z5jZZ7kuSOj9saegI3BivaL-b699C0rc,9527
10
- helm/benchmark/model_metadata_registry.py,sha256=q8-tERzoY6i6qFrKs9IM4R3BRjO31t0gmdT0lpCmdRE,8009
10
+ helm/benchmark/model_metadata_registry.py,sha256=fXRJOLUIrLOHUG5duncEqhnpmfb9hyloUlGbOM2L9ds,8194
11
11
  helm/benchmark/multi_gpu_runner.py,sha256=WmTKpVfcKXyiiPzrmxpbvQoZy0Ua8IyPgxB8r_3jrRw,4773
12
12
  helm/benchmark/run.py,sha256=tF_aWy5GtfwBOT1ZRKWrcI74VpFWGzlR00EKiGG7zyI,12572
13
- helm/benchmark/run_expander.py,sha256=NiwP10enxRtBj3yJys1AYCvVUBD9CG4mrf_L6QVBRsk,47965
13
+ helm/benchmark/run_expander.py,sha256=jolEPDrB4lL_VJNRpT1SQta6DZ_xyq2HaIfWHdeyNtA,47785
14
14
  helm/benchmark/run_spec.py,sha256=GiIU8iGO2FGYFDWIxt51CeNPsW7rM7BzDqH1KgEL1cg,3217
15
- helm/benchmark/run_spec_factory.py,sha256=KQIUmoNa6iHc-xp8-lDMG1PhfgE2_eTFejyyZ1jXFj0,6874
15
+ helm/benchmark/run_spec_factory.py,sha256=nRP9737niPReD5G7t9fgyQ8_EUQ1hvg2VBQe5rSZ08Y,6816
16
16
  helm/benchmark/runner.py,sha256=zlHDJ2Ys5-HxtXcwpkXcrdfXy_i886fBcq1iNeLyC3Q,14669
17
17
  helm/benchmark/runner_config_registry.py,sha256=2gW5wBLkHdYb2WNbZulto06hTcto2ROvjy8HULw3jNM,515
18
18
  helm/benchmark/server.py,sha256=ysd5MT1TDu65NH-OzIGf9wmZlr8FHNRwoy2ybjSc5Yk,6140
19
19
  helm/benchmark/slurm_jobs.py,sha256=eNCAoaWDfT0Wk32ZJRIGo-x8kgjhDPnPB4Xrvw_eLB0,3225
20
20
  helm/benchmark/slurm_runner.py,sha256=Tozimrjr2R6mlKHcmrGgxTy9ga-ArIW6AoAWtxqzw-M,16567
21
21
  helm/benchmark/test_data_preprocessor.py,sha256=_esdtkqyU_8Yp5ZOO7n1b-Y4Qc28wpD5drG-4Y4UhIM,2219
22
- helm/benchmark/test_model_deployment_definition.py,sha256=wGN95ku-mROh-yiHH3bL8GC7OWkBOa0YJCS2RIGRv8k,4468
23
22
  helm/benchmark/test_run_expander.py,sha256=gLeHkNt_nLgbwEJiYxhwda-eKA3sJAxkYolCvgRN5TY,1163
24
23
  helm/benchmark/tokenizer_config_registry.py,sha256=ZOImg38ta0FXZYAWna6q7A5xrG2mU7Ofr-8j4EqGlUY,1585
25
24
  helm/benchmark/adaptation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -44,11 +43,11 @@ helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py,sha256=f_bg
44
43
  helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py,sha256=v6LLmVTopXNfzo9Qzq16EmmPPivFGGs9LuaPDJAX4vY,9506
45
44
  helm/benchmark/adaptation/adapters/multimodal/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
46
45
  helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py,sha256=o7CGClyVWYOuJ4G56-whq5fTvCr7QIn51Mo6DTdvwg0,1881
47
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py,sha256=9_9PgNIiJDoTm1NIhCKWytCYe3hHzHJUhaAdd8hM0PQ,6300
48
- helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py,sha256=5jdVVfWf0lBErwzDamIHozZSbf1mEkpmACX8mPlHJc0,1907
46
+ helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py,sha256=bvY8xT2ak_3WG4m2Z5bCM6FLImPIWG1qAn9H2ZNwNv0,6359
47
+ helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py,sha256=jyL61UxBsIr68hUz-jtjBUnyB2HBp5ESNyECGp_Gf6Q,2129
49
48
  helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py,sha256=PWI_WrfDolipj7Zs43YxFQk36jBgU76PU-kL8R9gRno,4759
50
49
  helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py,sha256=VjSqWiZEcW6K2jrokGUmky7syEOqJ6cbHImR7YZgwzU,10151
51
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py,sha256=2OZhhOtEZ7RvItHZD4AGsefXvVUwEQZL2T-ESHPsvdI,3482
50
+ helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py,sha256=KKOOlna6SHLJHSPgfgguPQysc2Nf4kKrqumqwlG27bs,3542
52
51
  helm/benchmark/annotation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
53
52
  helm/benchmark/annotation/annotator.py,sha256=2UIXY71S5dRaZBLb1v4lcv8-O6pyJ9zTeSJl78AEWGI,1538
54
53
  helm/benchmark/annotation/annotator_factory.py,sha256=z5AGBylIuy-_IfgikX66VyGvRz4SxtnOcJsyESH8990,2699
@@ -57,7 +56,7 @@ helm/benchmark/annotation/test_dummy_annotator.py,sha256=LfY1ErJDUJ7rD8JUy92RUDD
57
56
  helm/benchmark/annotation/image2structure/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
58
57
  helm/benchmark/annotation/image2structure/image_compiler_annotator.py,sha256=eJFm3iyBe_eEN5Yt0G2IpeA1xdKxRmyR4krsNd6eXoE,3524
59
58
  helm/benchmark/annotation/image2structure/latex_compiler_annotator.py,sha256=yRifoqhGq_mQkkRcgKCFpGrZaI9gochOXYiCU8oY1KE,2477
60
- helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py,sha256=mvUg5tfJayACaqJW_wJOehZzBvLUlJppspphR9QYHOU,4034
59
+ helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py,sha256=we6K1BynV907ZMnGI2zb_tru1uw2iGEI06Wtbnus23w,4010
61
60
  helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py,sha256=rvzdQCaVFM6ovF28TSUnNmB47f2hidlaZm6vO4DJpso,6404
62
61
  helm/benchmark/augmentations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
63
62
  helm/benchmark/augmentations/cleva_perturbation.py,sha256=arUkY_luc274YEMZocOos9rpAZVbEFZphbMlobAxTy0,29208
@@ -73,12 +72,12 @@ helm/benchmark/augmentations/lowercase_perturbation.py,sha256=e-lhkuKOV5QR_GnOp7
73
72
  helm/benchmark/augmentations/mild_mix_perturbation.py,sha256=q27-c8_di1jaLhUuo5LzqOStnJcWIjSJ0a8Sr3S5TOs,1787
74
73
  helm/benchmark/augmentations/misspelling_perturbation.py,sha256=IE45qEE0HuIln62tAGvCP9B_RjNMJ5Xu06goL6b0fWQ,2144
75
74
  helm/benchmark/augmentations/person_name_perturbation.py,sha256=AIF5VVjNJ1OlUyCC1bt2dcP1KDn4gdgqd7Y2KGvHPqs,14422
76
- helm/benchmark/augmentations/perturbation.py,sha256=fsvyR2qxFqMKmxF1OCzVqmKkWa0FTk5Jff0wbgKHpDQ,3083
75
+ helm/benchmark/augmentations/perturbation.py,sha256=GapQckD3zkzZZB-PIVO1KKOy7aISGOhRbGGXEzuAFeQ,3880
77
76
  helm/benchmark/augmentations/perturbation_description.py,sha256=VKOwBRPQY-0vuxhGvtac1Z5F10metPfpFnfs8ykFVmU,1184
78
77
  helm/benchmark/augmentations/space_perturbation.py,sha256=g4rbyoureBaOVf_lrRXIWYlL6YZpqbCOJk554iOCuIs,935
79
78
  helm/benchmark/augmentations/suffix_perturbation.py,sha256=P3AfJj_ajTVdjO7AJRQ9dKS-cT1PyRSt8Un57iZQDVc,785
80
79
  helm/benchmark/augmentations/synonym_perturbation.py,sha256=komOV5M342_8unopnwN6gkPWpJIZXidywiu6PO9_riU,4151
81
- helm/benchmark/augmentations/test_perturbation.py,sha256=ERQX6UySs5sep0lesNlQiPfdoTlRjFmHGTpg7vMxSWE,11902
80
+ helm/benchmark/augmentations/test_perturbation.py,sha256=4EooKVcyub70I81trzpNx3Ij-m1vpFa5cFIo6O52icE,13185
82
81
  helm/benchmark/augmentations/translate_perturbation.py,sha256=dn8wO5UOgYbGtP9e77SmwaK2ginrQsTw-79nrzRzfeo,1054
83
82
  helm/benchmark/augmentations/typos_perturbation.py,sha256=_F9zwvrLie8hX7mzUtQmYq6oq6yqaFiKGsvc9LAuBr4,2798
84
83
  helm/benchmark/data_overlap/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -107,9 +106,9 @@ helm/benchmark/metrics/decodingtrust_privacy_metrics.py,sha256=l6TmAflBSgQGLjB-U
107
106
  helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py,sha256=KhiJ_7tnK8kHg2acKPAUWBQvAtyvjJTwcWJMUSUBHak,6416
108
107
  helm/benchmark/metrics/disinformation_metrics.py,sha256=BsLBG5krHGQh_nKDAeSDpp-li89s7fYoYTf5T0RZfMM,7782
109
108
  helm/benchmark/metrics/dry_run_metrics.py,sha256=d8RgltW4nGTH1tZeGOIlQRwRaJLIxL60h46NXv_wv1s,3674
110
- helm/benchmark/metrics/efficiency_metrics.py,sha256=Y3y-gVOgKfAcsWUPTINmlmyJ-mOyrQ22KuHvhMZPCRM,11522
109
+ helm/benchmark/metrics/efficiency_metrics.py,sha256=v8Eg56HHIWEMQruODKBvwdUfR6ZLGgrNifo-senCaUo,11786
111
110
  helm/benchmark/metrics/evaluate_instances_metric.py,sha256=EBUf0ONnNoi7pcxYab7RD0B_JqGksqDX8TOaosSmJk8,2847
112
- helm/benchmark/metrics/evaluate_reference_metrics.py,sha256=CtVqdmQBdHxXyb1Cv3UZwCstHTOjNPgQ0cgmEJ6mMSQ,15063
111
+ helm/benchmark/metrics/evaluate_reference_metrics.py,sha256=vUJavaLVfbWtrwyrIA81npK_1iirhko7_zMF1kL7Gfw,15559
113
112
  helm/benchmark/metrics/instruction_following_critique_metrics.py,sha256=Pj1itUJi_KDy0D-FOPcOyHqm4ypHMfhbAVeDJzGlyeo,9773
114
113
  helm/benchmark/metrics/language_modeling_metrics.py,sha256=ofqwj1PMJQu16QhLDULXBmZ5iFz91ducwLRpNsRYELE,4510
115
114
  helm/benchmark/metrics/machine_translation_metrics.py,sha256=bp_EDXyxntIty5gORDa7va-C73quOzoTc5o8MpxFmL4,3816
@@ -182,7 +181,7 @@ helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py,sha256=_wJ3E3L
182
181
  helm/benchmark/metrics/tokens/token_cost_estimator.py,sha256=fTGUfhHV6yMwpTkCEMTGMxKO8jskqJz4sAtwXT6M_C8,425
183
182
  helm/benchmark/metrics/vision_language/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
184
183
  helm/benchmark/metrics/vision_language/emd_utils.py,sha256=3yN-DY5rxMabmtLV003lj59SRnp_T83sLAi96rycKEo,15043
185
- helm/benchmark/metrics/vision_language/image_metrics.py,sha256=8dtrEYAjYKtr1ID5ytjKxiTwXg0loppBjYAV8Av956c,20059
184
+ helm/benchmark/metrics/vision_language/image_metrics.py,sha256=aJ3zrVOLJJzdVKqXPcFsCXp9LSHET8VGEgtvwK-nkJc,25190
186
185
  helm/benchmark/metrics/vision_language/image_utils.py,sha256=XeYF3E6MnYyPJ5hYp4TtiTP27-y4S8LTBH5bZVcvJFg,3758
187
186
  helm/benchmark/presentation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
188
187
  helm/benchmark/presentation/contamination.py,sha256=PiIdcaD3-xfExjOmyL5q4Ao2ASa-OlScJAB9u1Zxe7o,2811
@@ -205,7 +204,7 @@ helm/benchmark/run_specs/instruction_following_run_specs.py,sha256=GElJhgbQhlZMY
205
204
  helm/benchmark/run_specs/lite_run_specs.py,sha256=ViCPJ86Aah8301GTEk6z4_MtP0g8iik33t4GudobhWQ,11113
206
205
  helm/benchmark/run_specs/simple_run_specs.py,sha256=0kK_e8U4JUWZ6wO4N-GPFRE1iGT4ilvSMUGfirvpIE0,3837
207
206
  helm/benchmark/run_specs/unitxt_run_specs.py,sha256=ejp_knrcIjf0J4WiKj9LTgDTcUr29-XFZYHYz0w_dkM,1518
208
- helm/benchmark/run_specs/vlm_run_specs.py,sha256=S3x80FbO7JQE3L6A0tZ70lBLKThfC0vAASdyP_cj3Ko,17477
207
+ helm/benchmark/run_specs/vlm_run_specs.py,sha256=CmdyEF-pdFIlMhBV7UraQ0FuQgQl2rqVSdTz22uYuPQ,26808
209
208
  helm/benchmark/scenarios/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
210
209
  helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py,sha256=Wyt7J5BAvAqC5JTqCW4fh7ex9-itX11P_9rLTocqvtk,4973
211
210
  helm/benchmark/scenarios/babi_qa_scenario.py,sha256=S1tPQY2x1I3hQL1JQ6wvUwvKyiSe7SqpRSW6N3_T0mo,5043
@@ -248,13 +247,13 @@ helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py,sha256=PuwcuHnx3nCuRYEE
248
247
  helm/benchmark/scenarios/koala_scenario.py,sha256=ihx39W1C9KyOh5MkdMqw1twUwzA8S5QPsBKeQRZcZGM,1365
249
248
  helm/benchmark/scenarios/legal_summarization_scenario.py,sha256=DTYYsf-74NFjHvrNa5ZWSgMywq9ss16r5eFZI2-xZJk,8644
250
249
  helm/benchmark/scenarios/legal_support_scenario.py,sha256=hu8SlDGrU6oUTQjnrLEagjFNniW96wMahKjnrtuyjzo,3925
251
- helm/benchmark/scenarios/legalbench_scenario.py,sha256=55D09TaELT_B__ERyyWUp92gGwXifNgX1eIOzw2pgBE,4398
250
+ helm/benchmark/scenarios/legalbench_scenario.py,sha256=jLo8HFE24M7pcS4PgrpAYhiZNuXSfPUb6LrGoaGCT1A,4490
252
251
  helm/benchmark/scenarios/lex_glue_scenario.py,sha256=vsy8Y03hoCwTJFgBdQf0koxlHDRFm2V9wT04n2K1tlk,10268
253
252
  helm/benchmark/scenarios/lextreme_scenario.py,sha256=HEpXLdLLywo6yfXaGF3zRgx1PiGPE2ILnpKYs9Dmsow,20396
254
253
  helm/benchmark/scenarios/live_qa_scenario.py,sha256=meJvkkbqRUQ2odKkmlOLYOVVFd0LGDhGzK5nHCjXW6Y,3642
255
254
  helm/benchmark/scenarios/lm_entry_scenario.py,sha256=_7lTn2kfUQHi3ynX7fEaoseUYkpBx-ANn62kqHr-bY4,9085
256
255
  helm/benchmark/scenarios/lsat_qa_scenario.py,sha256=Ui68mi05N3WxGMZfCznJBscKkqaJ0ZEUO3PhIiD1xNE,6129
257
- helm/benchmark/scenarios/math_scenario.py,sha256=mL6Xf0xTAKcrH_YGqTWJxACnctGRUIQh7alLoLqUL8s,14144
256
+ helm/benchmark/scenarios/math_scenario.py,sha256=LvoQJUwZ0w78h5X1tIaa6SRsef_W_PNn4L_r5L3Cus8,14168
258
257
  helm/benchmark/scenarios/me_q_sum_scenario.py,sha256=WLfED6DiIOUPd4PowydPmrQ4v26OsjG4iHumq7ka9do,4001
259
258
  helm/benchmark/scenarios/med_dialog_scenario.py,sha256=f7hTcEpxGfoxJ-WXNrSRrgVfbyuYpB624HbyRuMP-KI,7295
260
259
  helm/benchmark/scenarios/med_mcqa_scenario.py,sha256=JdVQuzKM4TRyWT6c1zpsMvNk1RYHnKdy_fRM6W7PCLI,5035
@@ -314,25 +313,35 @@ helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py,s
314
313
  helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py,sha256=IB4_GbzQjjXBp-551XZ6PTNUCRX1jLcGfB3bVFI5lo4,3547
315
314
  helm/benchmark/scenarios/image_generation/winoground_scenario.py,sha256=E2xPQNQzylDSmqLjjMkQB8D7A6g7bzqtSF4bXPgfVbI,2889
316
315
  helm/benchmark/scenarios/vision_language/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
316
+ helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py,sha256=zXR0LmXsD2tv_ovJsbY_HP53kdiFOvty7Y_Ai3ZCrT4,3037
317
317
  helm/benchmark/scenarios/vision_language/bingo_scenario.py,sha256=LiH14xUoEKXn5ZStDbGE4bz9iMEn3-5I39eJ6kvN2UY,4045
318
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py,sha256=us75yZygU7_xPrNLi5yvRBx3G9O3_HOnRRgfjcbqXzI,3709
318
+ helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py,sha256=82qplX4gJ4GsSVhBjwrsVU46TAHh-jym3F_M5A-odRE,4608
319
+ helm/benchmark/scenarios/vision_language/flickr30k_scenario.py,sha256=3pBAQgOsnSyMCzt60s1m8Kf_fEJ4C7XgCDbtXatTlX0,2599
320
+ helm/benchmark/scenarios/vision_language/gqa_scenario.py,sha256=sBQfqAxmP-Z0ifCgwTbP11aPsKA4vogcWBqSDiKlbE4,3512
321
+ helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py,sha256=7KjsXiAaiVHDRSyW08jZXNUTWogP3Sr2Og5ViT6Xz8I,3832
319
322
  helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py,sha256=7GK_jAOfCgRIGiN_GInDePwuT2wZqmWHp1rqdx18xQg,4994
323
+ helm/benchmark/scenarios/vision_language/math_vista_scenario.py,sha256=kzZHeyWQHUphUfAixkms2t3-KKfHRjwIKi6qm-lMyXA,4728
320
324
  helm/benchmark/scenarios/vision_language/mementos_scenario.py,sha256=Yw4zxeYgUw8HKRR5ob9QEIT0bSPrdRUULMKCo_xzFpw,4337
325
+ helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py,sha256=ly77pj_TorwM1kN2sW7Y2AIGHOBlDkdzV0STvZTBOtc,4332
321
326
  helm/benchmark/scenarios/vision_language/mme_scenario.py,sha256=zxtdub2akvxPYEG12pkW2c57TIFqN38C7ucAXAHAdx0,5455
322
327
  helm/benchmark/scenarios/vision_language/mmmu_scenario.py,sha256=XQv7uv2m6EdbI7h0-9eDag4_bL7qE_78PuHB7c4SsHA,7654
328
+ helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py,sha256=HUO09uM2rBXOfCsxzwovmwtihq53xjuzDOtQO_S3J4I,4161
329
+ helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py,sha256=c7YfclYMDtygsLnEfA8oP6Vl7evdrqqTZazmuD9Oy-8,5353
323
330
  helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py,sha256=HuizbYsN5Nlihfzu4bfGuC8KSBbeIc6TVknMS4kpVJY,7149
331
+ helm/benchmark/scenarios/vision_language/originality_scenario.py,sha256=1inr-klQEz08CM2GWqbYdy-AuXQmMhOAywAlA0lJHik,1029
332
+ helm/benchmark/scenarios/vision_language/pairs_scenario.py,sha256=rkPR_e_RWOeSyHIlSJGJ5lVu5DD-AR3x686XYJse-1E,9885
324
333
  helm/benchmark/scenarios/vision_language/pope_scenario.py,sha256=uFkzMMsjhmuSYo3v_QdfJFX6RFse83JjzMfMa3ynvV4,3975
325
334
  helm/benchmark/scenarios/vision_language/seed_bench_scenario.py,sha256=5MwGb9BOyB2Xy70BGYZcjencf0ZskxBuzcPa7ABRuww,5106
326
335
  helm/benchmark/scenarios/vision_language/unicorn_scenario.py,sha256=e3lCq2nevy9tIFDDKEbJvmLibfk4UMQtAIyzrgnnaZs,4179
327
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py,sha256=VJCTQU0jG_ROtGD-fu21BBrcMGHH1Fv-_DTEzEOHUjg,4127
328
- helm/benchmark/scenarios/vision_language/vqa_scenario.py,sha256=vAf6MkYNDgi6_ODhz-rCY_OVqXKxGVJiwsYGnUw7H18,5089
336
+ helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py,sha256=hJ3sOSpPnOCwLtpVnfasI_X89oofI-2PBRjMnx8eiVA,4139
337
+ helm/benchmark/scenarios/vision_language/vqa_scenario.py,sha256=2hY-qngKC69ZL9SHNei3IK3C2PvJDWvwLFVQ8yNSOVs,5196
329
338
  helm/benchmark/scenarios/vision_language/image2structure/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
330
339
  helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py,sha256=ImhfiC_y_hihAGvlj9zRsaoW614QFCBopBD2KxnbSs0,1805
331
340
  helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py,sha256=-eWRwo2x7kR46Z_I4vFbVlbqA_1f2UEb75Dx84XTlNE,9028
332
341
  helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py,sha256=FKKybU4IeglwXCj6GZC8cAUs_GOU7ymEa6P1dkDT7uw,1350
333
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py,sha256=UvYzQzAvWRrydPty4LEifCjo502xNaHlmRVUdduvk9k,1048
342
+ helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py,sha256=SyAYkhsipjJG42XfM9sljz1vly5YF-dbSEWTj_dEHIU,1048
334
343
  helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py,sha256=ovg8-FfJ8_I1xbajFGSLvERZIA1fQjaUn0zd04ZbI84,15316
335
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py,sha256=PSVBqNxvE5GXEsa3iBuK51vzp3Nho25Gjv9KL_uOMLA,9628
344
+ helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py,sha256=j2bDYeWdytYtkKskvuTMwLEIIqELDJJ6D2jdYzmdlJY,9628
336
345
  helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
337
346
  helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py,sha256=i-i0mlG5oRRDNYNqP7o7Ul56iL02p_anJoThXaSvFiM,2826
338
347
  helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py,sha256=9WntahzuhVv54IH1m7_z0IxwLma3dbaMOne_pUx751Y,7652
@@ -347,11 +356,13 @@ helm/benchmark/static/info-icon.png,sha256=P-PW3Ek3NGiRAW5BXOjJRPBfMVqprjAqtQheG
347
356
  helm/benchmark/static/json-urls.js,sha256=AaULgfHw8OLfrQLJpBHfcC013uavQnlNNFS9vzb0qOg,1981
348
357
  helm/benchmark/static/plot-captions.js,sha256=bTR8gYx-QqF_RJyKX-L-eQP7hSEtawfJSoADCvgjKag,3011
349
358
  helm/benchmark/static/schema_classic.yaml,sha256=p-yc2WMfyGehRtD7L5ZZHbFMMQovu2HNfvct3tBlV2I,108168
359
+ helm/benchmark/static/schema_image2structure.yaml,sha256=gig7HVyJWSwcHa96mf-09e68_fU5L02YRWzNbkPmpGg,13520
350
360
  helm/benchmark/static/schema_instruction_following.yaml,sha256=mg2g5P8TAYSCEhZbLfshPt_Hq2GKjwbvyOsQrwDqh7w,8923
351
361
  helm/benchmark/static/schema_lite.yaml,sha256=62ByEWhAJT0tIUFi-euxJ7XFhE6e9E6PT9dF6V3qoSU,40255
352
362
  helm/benchmark/static/schema_mmlu.yaml,sha256=8kiZDEGGaBXs9ucDk_Gbo2agV-OgOmWuhcYFyodRjcw,53307
353
363
  helm/benchmark/static/schema_unitxt.yaml,sha256=89GnKrooG7kKU2xh0MeoYZUB54FDUAmOPrbzuBhG1Ik,15496
354
- helm/benchmark/static/schema_vlm.yaml,sha256=IJLEPRe9ZCqVPRLQSIc-VpuUmZx03AJ3r_K8C19_zNU,21823
364
+ helm/benchmark/static/schema_vhelm_lite.yaml,sha256=s8tQIetR2WKu3sd8k2uZO68_5E-YtlMdsBJsTehFZKE,7331
365
+ helm/benchmark/static/schema_vlm.yaml,sha256=o9AzLTKwSbPES5pISI0tmpUPKWWT9GR-dleDKZqoI0w,33243
355
366
  helm/benchmark/static/utils.js,sha256=bgN0PT53Dregc-nLmEmAEmg2psufWpS8jTf74WoypHw,7681
356
367
  helm/benchmark/static/images/crfm-logo.png,sha256=dDkauL_wJR_Luu7L7pltphS3a9HSLjDkpVLa6C9vcA4,62712
357
368
  helm/benchmark/static/images/helm-logo-simple.png,sha256=LtVAC4OgcWgMAob53rTrf7cRDu-O0z85ZOGGj9wR9hw,86133
@@ -373,7 +384,7 @@ helm/benchmark/static/images/organizations/together.png,sha256=pmWjW4r7GnlKqFhKL
373
384
  helm/benchmark/static/images/organizations/tsinghua-keg.png,sha256=l9SzlZCsLF18BY876wYJcVgiQbgvwte7uoILPDcVwHk,7776
374
385
  helm/benchmark/static/images/organizations/yandex.png,sha256=OOCdcKubAP4x7h4VW7z5a-AHPWBiSDTjsIJea6ZiovA,27964
375
386
  helm/benchmark/static_build/config.js,sha256=ER8utDIqVZi9uge7Qrk1gmlT88TOOkFF9xYp3j10m8U,165
376
- helm/benchmark/static_build/index.html,sha256=fqZee8nuPxeRbBNzPgY0AUmGFHF-ngY1dG_MfwfZGLc,1149
387
+ helm/benchmark/static_build/index.html,sha256=g3pMdAovQ4VMr7dPGgyzWv2K1tN-E8LLkAs45ppLPGw,1149
377
388
  helm/benchmark/static_build/assets/01-694cb9b7.png,sha256=aUy5t0DYCg4r52HDOmeNi1S2CHsnv3mE7ySokJg3Ouo,8903
378
389
  helm/benchmark/static_build/assets/ai21-0eb91ec3.png,sha256=Drkew6Vlwi2_4_S8hjagK2x8smOwLKTNiXIT3rDiurs,10208
379
390
  helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png,sha256=fOEANHS8RymKaCzUWn9gQWebts2ghSmtW9Fdda_TjR8,7224
@@ -386,8 +397,8 @@ helm/benchmark/static_build/assets/google-06d997ad.png,sha256=BtmXrVQZHr3WH5c8c2
386
397
  helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png,sha256=Pl46pKbC_TU3L6kZQ_3G-0wTseluAhIYwb3EqpdQAjQ,1344452
387
398
  helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png,sha256=LtVAC4OgcWgMAob53rTrf7cRDu-O0z85ZOGGj9wR9hw,86133
388
399
  helm/benchmark/static_build/assets/helmhero-28e90f4d.png,sha256=KOkPTf-q28PdvGOBp1G5O4q1eWUJjuij3z2h_SUUf8s,55314
389
- helm/benchmark/static_build/assets/index-5088afcb.css,sha256=UIivyx7XvQHnXD0s-J7WaeOZKDl2mgNxUts8n9D57Es,486112
390
- helm/benchmark/static_build/assets/index-d839df55.js,sha256=d0xZkf7R2yZMo7pQTJb9J1dqZRg5hcqaaoFutsjocbQ,66843
400
+ helm/benchmark/static_build/assets/index-737eef9e.js,sha256=PvNcOghX7gGSYAGk2bR3pvIBnwDbeWHu0JyfPNaan3o,70614
401
+ helm/benchmark/static_build/assets/index-878a1094.css,sha256=h4oQlJUZdqMk6nS_TEkyXMZ6rtGmepw4ljoSAHZX1vY,486381
391
402
  helm/benchmark/static_build/assets/meta-5580e9f1.png,sha256=VYDp8arkAe2eYRJhAOcIAsZY1qY0hqyOEQDgVMbX9M8,4646
392
403
  helm/benchmark/static_build/assets/microsoft-f5ee5016.png,sha256=9e5QFl23yTbnAk8u7lZKaQOf4oPHbr_aiQda5n4MZqE,50850
393
404
  helm/benchmark/static_build/assets/mistral-18e1be23.png,sha256=GOG-Ix7XlctGOUmvJfO2oVSBM7E5O562G88OnoxsjBw,14402
@@ -443,7 +454,7 @@ helm/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
443
454
  helm/clients/ai21_client.py,sha256=LIdkmzcUDR9uIF2tIk5YgDNGNmfQ9JDYmgscvFoCHDs,5509
444
455
  helm/clients/ai21_utils.py,sha256=mlg3h615kyckccGZv9rqsP4Y60O3XpwyE-UURRMrxII,471
445
456
  helm/clients/aleph_alpha_client.py,sha256=koPqXF6uRD905atoiCaPg5yxr6B25J0g2OTWk8geebQ,4969
446
- helm/clients/anthropic_client.py,sha256=woFek17fbUJ30vVdVj1cZX8AxI0VwMoc5eevu9z69C4,31585
457
+ helm/clients/anthropic_client.py,sha256=0hAmv3f6FQURScmDpcGbwGjnvskNRP2vhRH02OSe70I,33224
447
458
  helm/clients/auto_client.py,sha256=Qs0XFq9pyH4M9HTOLoI3_5m8kW305x3pzVukgETdrZM,10732
448
459
  helm/clients/bedrock_client.py,sha256=BsH9UopsP6ZHf-K0Yzg1PYSMLDwY0yIUmPHDhJVMUi0,5293
449
460
  helm/clients/bedrock_utils.py,sha256=okZ6Z8pviGOUNlrdF2QquAqFs8-QYgcqci95eij8giM,2574
@@ -462,7 +473,7 @@ helm/clients/megatron_client.py,sha256=KFL1BBBDqxr5mtd5iu0dA6uK8_v6d4g_D6RsZrHx3
462
473
  helm/clients/mistral_client.py,sha256=thOLMcEfrzWR00JUabIZ_PnW2o9YZsdSmNf9z3jbYKo,5982
463
474
  helm/clients/moderation_api_client.py,sha256=I5pYWRb2MmcLDYrScnC3P5N7OUFzQiVQ828_hf7zjM4,4719
464
475
  helm/clients/open_lm_client.py,sha256=qFgYqlV_3UiW8WJKz66lLqRqg2jt1qtJ1bHMRAtBn40,1749
465
- helm/clients/openai_client.py,sha256=ghwmaFCUTy82HeEq49g-PD-Z9hRs89bS_T0JMlzHgEA,13922
476
+ helm/clients/openai_client.py,sha256=gWqr4dvYfbUnBtfySSUGGVZYV-pLtqcrnYaf7nPk5-s,13936
466
477
  helm/clients/palmyra_client.py,sha256=LBYFHNc5LdpPbiSp1AAHuMm8cUUCQ2EB03BB6XnDTYQ,6551
467
478
  helm/clients/perspective_api_client.py,sha256=WQDArqlKVWwcK2SicnSIAgV6JGVHsxibTzkdezT3z_U,5920
468
479
  helm/clients/simple_client.py,sha256=55S_y1eWD1bjktcG21Vs8G5bF6QbKKwmJyqs6lCUJeI,2048
@@ -471,9 +482,9 @@ helm/clients/test_client.py,sha256=g29C1WLUONnNuE2oGFZhaqMahb-doS4l_Ph4OHrQvrc,3
471
482
  helm/clients/test_huggingface_client.py,sha256=WUPrA7VT3nnMNht7w20I6411hlpIS_77XbQC2vC0WU0,2723
472
483
  helm/clients/test_simple_client.py,sha256=G0JRQX69ypQN2VxhlNQXs5u2Tdtkcl_aeHqudDUVKi4,702
473
484
  helm/clients/test_together_client.py,sha256=lAtGKn3WdsYe5MEfTYVYRnu_rS4DPnfFr5jRn42rvoQ,3865
474
- helm/clients/together_client.py,sha256=xvqSaXvGeiJtf8c8MzgCMrXWM6m18y_kMFcry7NsmL4,12819
485
+ helm/clients/together_client.py,sha256=fCPJ39fX3xm_Gp6cGsc1HIf1jVMLNiE2kIkee45-Ufk,16208
475
486
  helm/clients/toxicity_classifier_client.py,sha256=AI_FizxMurubTIyeceRdkixSnhWQbcD-oEEONj5ve7o,464
476
- helm/clients/vertexai_client.py,sha256=RMIuOXi7W0PGcZKy5ClsPc4dPVlVh6DHOCthQlUyrvo,18564
487
+ helm/clients/vertexai_client.py,sha256=Mt1rb9lWeQqJLGcBSR5mflYBvJvJfsv5OeIuQz4_ng0,19726
477
488
  helm/clients/vllm_client.py,sha256=p9atBtq3PBOoPkOPSifkMrYZjNLnNM_sWM6tL_3N-WY,1675
478
489
  helm/clients/clip_scorers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
479
490
  helm/clients/clip_scorers/base_clip_scorer.py,sha256=NfXe79g6M4Wype3Xf-oXxscaUgjbZLmy9dRnBaLiWwk,695
@@ -532,8 +543,9 @@ helm/clients/image_generation/mindalle/utils/config.py,sha256=lh8dXvL7ctKmuYEbeT
532
543
  helm/clients/image_generation/mindalle/utils/sampling.py,sha256=soTHaJrN4FV1lDdh9HMveJs6F49UMK57Xfa0ccnHqI8,5029
533
544
  helm/clients/image_generation/mindalle/utils/utils.py,sha256=ESugpzG-_73GKl07mj-8o-_nim_FOICxfYkczy3s9x4,3119
534
545
  helm/clients/vision_language/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
535
- helm/clients/vision_language/huggingface_vlm_client.py,sha256=AMjOlg_Fb65Ah1d3XAt9M-842huhO_vOA8HQfNkAApk,4433
536
- helm/clients/vision_language/idefics_client.py,sha256=WQSDcoLNcVNmoKZ3PsAsIZOeZHiYQ33IXFz2-ktgVIk,7680
546
+ helm/clients/vision_language/huggingface_vision2seq_client.py,sha256=hTywh5nM95BmPoDyKOSDWg9G3-QwLO3KZEJZVkmFroo,6478
547
+ helm/clients/vision_language/huggingface_vlm_client.py,sha256=X5SX2iMZkFe9Pmq4Gx0O4bnP4gBPnKvamLThRshAEik,4875
548
+ helm/clients/vision_language/idefics_client.py,sha256=hi1VCDBegHfBssmW0C62H3OX3U2ISVRhaSkd24gb1K4,7692
537
549
  helm/clients/vision_language/open_flamingo_client.py,sha256=CkN0JCeR742ZG9Nc4A85hp4BSE0WLU-3Rs-ZwdmDkzs,6632
538
550
  helm/clients/vision_language/qwen_vlm_client.py,sha256=6rCH4gJMDyQHyjAE_GDIrLsInH_bvd6to-4RMWbRLeM,7407
539
551
  helm/clients/vision_language/open_flamingo/__init__.py,sha256=i1tGJj6ckeE6eS1EWV5tbQKYLmPCrdSI45mPchfv_Ic,88
@@ -557,7 +569,7 @@ helm/common/general.py,sha256=nMfHNPXyAAorAMmgDClD8r8XXeJcvfF0QXTP-FgH5PQ,11690
557
569
  helm/common/gpu_utils.py,sha256=pmLq6ipYNLEm28VxxSNeZuVt-gAw-WnYmBvxP1P1p6M,480
558
570
  helm/common/hierarchical_logger.py,sha256=EnKLnfbQftca08EJfjGEQb4tcnCKbx-JtwLnoCnhMQs,2908
559
571
  helm/common/image_generation_parameters.py,sha256=nsbuk_-BlRMK6IwP5y6BnTXbTRTOcvZ6uLblL5VHLOo,916
560
- helm/common/images_utils.py,sha256=uYqxp3nnfDwzg0hAiFX58ZPat944xKekLU_pWSwadXQ,2319
572
+ helm/common/images_utils.py,sha256=zbzS8C_oCDb9dY2xpWY6nljI8of72rqwijryMeiBKKo,2527
561
573
  helm/common/key_value_store.py,sha256=iHi1WQuWttLNJnuM48QNOAXHoneNbmbBmtXYPq-dyys,3147
562
574
  helm/common/media_object.py,sha256=3VZqfb0py5dDKwWtnLp2kdl8svaike-Cn7Mjk-b0cvM,5130
563
575
  helm/common/moderations_api_request.py,sha256=3xTsErSsCr2PHD2jpdV1JglHaYHwP2Yqu25_JFtfa68,2234
@@ -578,9 +590,9 @@ helm/common/file_caches/file_cache.py,sha256=QfF1hlF8FQ-rcPn9Zyl6L0dOCokvYgd-dFq
578
590
  helm/common/file_caches/local_file_cache.py,sha256=wBOAbbkGLiClaX4YdunokRfSQCKNkTYmMVx2KTLy4Lc,1921
579
591
  helm/common/file_caches/test_local_file_cache.py,sha256=bOCWR9MglwQXV98xk8auyjgFxaOr85zRdxWwxMBQW9s,663
580
592
  helm/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
581
- helm/config/model_deployments.yaml,sha256=ptZz4bzZPbbuSCIvpyNm3UoKXZXxGsHyuAD0Kir9vH0,71211
582
- helm/config/model_metadata.yaml,sha256=LkHebb_FklRvm6DJCOO2leAcbeVQwfNI4xwcIuvMcfg,106991
583
- helm/config/tokenizer_configs.yaml,sha256=_XdrvaiBCPF09joBt6xKxuh9IIz3oW40whhThTfpx-c,11841
593
+ helm/config/model_deployments.yaml,sha256=KAD0FZ45ERfEjr3y7HbPxZmEnnJBQiiOHRHN7VxqiF4,74817
594
+ helm/config/model_metadata.yaml,sha256=XpJnlu0kiI5sGEqswF_S6_ra0Iys3VOfsDs2Jiz_Vqk,112991
595
+ helm/config/tokenizer_configs.yaml,sha256=3IhRANDTlN39TWqDWuPy507wQlZWOBlyaS8fA6WLDD0,12070
584
596
  helm/proxy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
585
597
  helm/proxy/accounts.py,sha256=gd5cKhKeqklf_cXCAISl65AUvZeD6afBNrs6WK3IBvQ,14764
586
598
  helm/proxy/cli.py,sha256=l8F7UYqrIOoBD9ZCIxJFA4fhxlzhae0-2Nn8A7FMkzk,8244
@@ -634,9 +646,9 @@ helm/tokenizers/yalm_tokenizer_data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQ
634
646
  helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py,sha256=W9p5QNn1GSm-y85yVEQe_82zn5CVK_vR6jvhk7JTs_k,869
635
647
  helm/tokenizers/yalm_tokenizer_data/voc_100b.sp,sha256=LmPD0_OIOXi8dWuNjXUYOSPhf8kPp2xhvK-g3bXcwrQ,2815034
636
648
  helm/tokenizers/yalm_tokenizer_data/yalm_tokenizer.py,sha256=kH5Qig1_6r_sKbAHinX7C83tqBUoTwbe-gGZCbGVkko,6389
637
- crfm_helm-0.5.0.dist-info/LICENSE,sha256=bJiay7Nn5SHQ2n_4ZIT3AE0W1RGq4O7pxOApgBsaT64,11349
638
- crfm_helm-0.5.0.dist-info/METADATA,sha256=KyvhONCTx1d4n_igPIOM-GhnFjbXoqWkYQjSUM1P3G0,18973
639
- crfm_helm-0.5.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
640
- crfm_helm-0.5.0.dist-info/entry_points.txt,sha256=AvH9soAH3uey9xffisWewd0yrmPWGASC036jHd1SFyg,300
641
- crfm_helm-0.5.0.dist-info/top_level.txt,sha256=s9yl-XmuTId6n_W_xRjCS99MHTwPXOlkKxmTr8xZUNY,5
642
- crfm_helm-0.5.0.dist-info/RECORD,,
649
+ crfm_helm-0.5.1.dist-info/LICENSE,sha256=bJiay7Nn5SHQ2n_4ZIT3AE0W1RGq4O7pxOApgBsaT64,11349
650
+ crfm_helm-0.5.1.dist-info/METADATA,sha256=dVxnv-vEsYZb3v-ALFNpSdpbxwi5WQG5_I1oD3cMs6Y,19157
651
+ crfm_helm-0.5.1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
652
+ crfm_helm-0.5.1.dist-info/entry_points.txt,sha256=AvH9soAH3uey9xffisWewd0yrmPWGASC036jHd1SFyg,300
653
+ crfm_helm-0.5.1.dist-info/top_level.txt,sha256=s9yl-XmuTId6n_W_xRjCS99MHTwPXOlkKxmTr8xZUNY,5
654
+ crfm_helm-0.5.1.dist-info/RECORD,,
@@ -79,6 +79,7 @@ class InContextLearningMultimodalAdapter(InContextLearningAdapter, ABC):
79
79
  # Prompt
80
80
  prompt = MultimodalPrompt(
81
81
  global_prefix=self.adapter_spec.global_prefix,
82
+ global_suffix=self.adapter_spec.global_suffix,
82
83
  instructions=self.adapter_spec.instructions,
83
84
  train_instance_blocks=train_instance_blocks,
84
85
  eval_instance_block=eval_instance_block,
@@ -11,6 +11,9 @@ class MultimodalPrompt:
11
11
  # Global prefix, carried over from `AdapterSpec`
12
12
  global_prefix: str
13
13
 
14
+ # Global suffix, carried over from `AdapterSpec`
15
+ global_suffix: str
16
+
14
17
  # Instance prefix, carried over from `AdapterSpec`. What goes between the instruction and instances.
15
18
  instance_prefix: str
16
19
 
@@ -47,6 +50,10 @@ class MultimodalPrompt:
47
50
  if self.global_prefix:
48
51
  result = result.add_textual_prefix(self.global_prefix)
49
52
 
53
+ # Add the global suffix if one exists
54
+ if self.global_suffix:
55
+ result = result.add_textual_suffix(self.global_suffix)
56
+
50
57
  return result
51
58
 
52
59
  @property
@@ -32,6 +32,7 @@ class TestMultimodalContent(unittest.TestCase):
32
32
 
33
33
  prompt = MultimodalPrompt(
34
34
  global_prefix="[START]",
35
+ global_suffix="",
35
36
  instance_prefix="\n",
36
37
  instructions="Please answer the following questions about the images.",
37
38
  train_instance_blocks=train_instance_blocks,
@@ -67,6 +68,7 @@ class TestMultimodalContent(unittest.TestCase):
67
68
 
68
69
  prompt = MultimodalPrompt(
69
70
  global_prefix="",
71
+ global_suffix="",
70
72
  instance_prefix="\n",
71
73
  instructions="",
72
74
  train_instance_blocks=[],
@@ -18,7 +18,7 @@ class LilypondCompilerAnnotator(ImageCompilerAnnotator):
18
18
  """Annotator that compiles the text completions into a music sheet with LilyPond."""
19
19
 
20
20
  name: str = "lilypond_compiler"
21
- base_path = "/home/josselin/installs/lilypond-2.24.3/bin"
21
+ base_path = "lilypond-2.24.3/bin"
22
22
 
23
23
  def __init__(self, cache_config: CacheConfig, file_storage_path: str):
24
24
  super().__init__(cache_config, file_storage_path)
@@ -48,11 +48,27 @@ class TextPerturbation(Perturbation, ABC):
48
48
 
49
49
  description = replace(self.description, seed=seed)
50
50
 
51
+ perturbed_input: Input
52
+ if instance.input.multimedia_content:
53
+ perturbed_media_objects = []
54
+ for media_object in instance.input.multimedia_content.media_objects:
55
+ # Apply perturbations to the text data of the multimedia content
56
+ if media_object.is_type("text") and media_object.text is not None:
57
+ perturbed_media_objects.append(replace(media_object, text=self.perturb(media_object.text, rng)))
58
+ else:
59
+ perturbed_media_objects.append(media_object)
60
+
61
+ perturbed_input = Input(
62
+ multimedia_content=replace(instance.input.multimedia_content, media_objects=perturbed_media_objects)
63
+ )
64
+ else:
65
+ perturbed_input = Input(text=self.perturb(instance.input.text, rng))
66
+
51
67
  # Don't modify `id` of `Instance` here.
52
68
  # All the perturbed Instances generated from a single Instance should have the same ID.
53
69
  return replace(
54
70
  instance,
55
- input=Input(text=self.perturb(instance.input.text, rng)),
71
+ input=perturbed_input,
56
72
  references=references,
57
73
  perturbation=description,
58
74
  contrast_inputs=[instance.input],
@@ -2,6 +2,7 @@
2
2
  from typing import List
3
3
  import unittest
4
4
 
5
+ from helm.common.media_object import MediaObject, MultimediaObject
5
6
  from helm.benchmark.scenarios.scenario import Input, Instance, Output, Reference
6
7
  from .data_augmenter import DataAugmenter
7
8
  from .extra_space_perturbation import ExtraSpacePerturbation
@@ -33,6 +34,35 @@ def test_extra_space_perturbation():
33
34
  assert instances[1].references[0].output.text == "some name"
34
35
 
35
36
 
37
+ def test_multimodal_text_perturbation():
38
+ data_augmenter = DataAugmenter(perturbations=[ExtraSpacePerturbation(num_spaces=3)])
39
+ input: Input = Input(
40
+ multimedia_content=MultimediaObject(
41
+ [
42
+ MediaObject(text="Hello what is", content_type="text/plain"),
43
+ MediaObject(text="your name", content_type="text/plain"),
44
+ ]
45
+ )
46
+ )
47
+ instance: Instance = Instance(id="id0", input=input, references=[Reference(Output(text="some name"), tags=[])])
48
+ instances: List[Instance] = data_augmenter.generate([instance], include_original=True)
49
+
50
+ assert len(instances) == 2
51
+
52
+ # Test that the first instance is unperturbed
53
+ assert instances[0].id == "id0"
54
+ assert instances[0].perturbation is None
55
+ media_objects = instances[0].input.multimedia_content.media_objects
56
+ assert media_objects[0].text == "Hello what is"
57
+ assert media_objects[1].text == "your name"
58
+
59
+ assert instances[1].id == "id0"
60
+ assert instances[1].perturbation.name == "extra_space"
61
+ media_objects = instances[1].input.multimedia_content.media_objects
62
+ assert media_objects[0].text == "Hello   what   is"
63
+ assert media_objects[1].text == "your   name"
64
+
65
+
36
66
  def test_misspelling_perturbation():
37
67
  data_augmenter = DataAugmenter(perturbations=[MisspellingPerturbation(prob=1.0)])
38
68
  instance: Instance = Instance(
@@ -91,8 +91,15 @@ class EfficiencyMetric:
91
91
  window_service: WindowService = WindowServiceFactory.get_window_service(
92
92
  adapter_spec.model_deployment, tokenizer_service
93
93
  )
94
- prompt: str = request_state.request.prompt
95
- num_prompt_tokens: int = window_service.get_num_tokens(prompt)
94
+
95
+ prompt: str
96
+ num_prompt_tokens: int
97
+ if request_state.request.multimodal_prompt is not None:
98
+ prompt = request_state.request.multimodal_prompt.text
99
+ num_prompt_tokens = window_service.get_num_tokens(prompt)
100
+ else:
101
+ prompt = request_state.request.prompt
102
+ num_prompt_tokens = window_service.get_num_tokens(prompt)
96
103
 
97
104
  # Total number of tokens in the completion.
98
105
  num_completion_tokens: int = sum([len(completion.tokens) for completion in request_state.result.completions])
@@ -10,6 +10,7 @@ from helm.benchmark.metrics.metric_service import MetricService
10
10
  from helm.benchmark.metrics.statistic import Stat
11
11
  from helm.benchmark.scenarios.code_scenario import CodeReference
12
12
  from helm.benchmark.scenarios.scenario import Reference
13
+ from helm.common.optional_dependencies import handle_module_not_found_error
13
14
  from helm.common.request import GeneratedOutput
14
15
  from helm.benchmark.scenarios.math_scenario import is_equiv, is_equiv_chain_of_thought
15
16
  from nltk.metrics.scores import f_measure
@@ -21,6 +22,7 @@ import string
21
22
  from . import code_metrics_helper
22
23
  import nltk
23
24
 
25
+
24
26
  try:
25
27
  nltk.data.find("tokenizers/punkt")
26
28
  except LookupError:
@@ -188,6 +190,19 @@ def bleu_4(gold: str, pred: str) -> float:
188
190
  return sentence_bleu([word_tokenize(gold)], word_tokenize(pred), weights=(0, 0, 0, 1))
189
191
 
190
192
 
193
+ def cider(gold: str, pred: str) -> float:
194
+ try:
195
+ from pycocoevalcap.cider.cider import Cider
196
+ except ModuleNotFoundError as e:
197
+ handle_module_not_found_error(e, ["vlm"])
198
+
199
+ cider_evaluator = Cider()
200
+ candidate = {"caption": [pred]}
201
+ reference = {"caption": [gold]}
202
+ average_score, _ = cider_evaluator.compute_score(reference, candidate)
203
+ return average_score
204
+
205
+
191
206
  def extract_set_from_text(
192
207
  set_str: str,
193
208
  set_start_str: str = " is ",
@@ -325,6 +340,7 @@ def compute_reference_metrics(
325
340
  "math_equiv_chain_of_thought": is_equiv_chain_of_thought,
326
341
  "code_eval_acc": code_eval,
327
342
  "pass": code_eval,
343
+ "cider": cider,
328
344
  "f1_score": f1_score,
329
345
  "rouge_1": get_rouge_function("rouge1"),
330
346
  "rouge_2": get_rouge_function("rouge2"),