crfm-helm 0.5.0__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic. Click here for more details.
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/METADATA +7 -3
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/RECORD +53 -41
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
- helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +1 -1
- helm/benchmark/augmentations/perturbation.py +17 -1
- helm/benchmark/augmentations/test_perturbation.py +30 -0
- helm/benchmark/metrics/efficiency_metrics.py +9 -2
- helm/benchmark/metrics/evaluate_reference_metrics.py +16 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +142 -17
- helm/benchmark/model_metadata_registry.py +5 -1
- helm/benchmark/run_expander.py +35 -63
- helm/benchmark/run_spec_factory.py +11 -10
- helm/benchmark/run_specs/vlm_run_specs.py +294 -38
- helm/benchmark/scenarios/legalbench_scenario.py +6 -2
- helm/benchmark/scenarios/math_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +4 -2
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +246 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +4 -2
- helm/benchmark/static/schema_image2structure.yaml +304 -0
- helm/benchmark/static/schema_vhelm_lite.yaml +164 -0
- helm/benchmark/static/schema_vlm.yaml +257 -10
- helm/benchmark/static_build/assets/index-737eef9e.js +10 -0
- helm/benchmark/static_build/assets/index-878a1094.css +1 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/clients/anthropic_client.py +36 -6
- helm/clients/openai_client.py +2 -3
- helm/clients/together_client.py +93 -2
- helm/clients/vertexai_client.py +59 -50
- helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
- helm/clients/vision_language/huggingface_vlm_client.py +11 -4
- helm/clients/vision_language/idefics_client.py +2 -2
- helm/common/images_utils.py +10 -3
- helm/config/model_deployments.yaml +100 -2
- helm/config/model_metadata.yaml +136 -31
- helm/config/tokenizer_configs.yaml +7 -0
- helm/benchmark/static_build/assets/index-5088afcb.css +0 -1
- helm/benchmark/static_build/assets/index-d839df55.js +0 -9
- helm/benchmark/test_model_deployment_definition.py +0 -90
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/LICENSE +0 -0
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: crfm-helm
|
|
3
|
-
Version: 0.5.0
|
|
3
|
+
Version: 0.5.1
|
|
4
4
|
Summary: Benchmark for language models
|
|
5
5
|
Home-page: https://github.com/stanford-crfm/helm
|
|
6
6
|
Author: Stanford CRFM
|
|
@@ -25,7 +25,7 @@ Requires-Dist: tqdm ~=4.64
|
|
|
25
25
|
Requires-Dist: zstandard ~=0.18.0
|
|
26
26
|
Requires-Dist: sqlitedict ~=1.7
|
|
27
27
|
Requires-Dist: bottle ~=0.12.23
|
|
28
|
-
Requires-Dist: datasets ~=2.
|
|
28
|
+
Requires-Dist: datasets ~=2.17
|
|
29
29
|
Requires-Dist: pyarrow >=11.0.0
|
|
30
30
|
Requires-Dist: pyarrow-hotfix ~=0.6
|
|
31
31
|
Requires-Dist: nltk ~=3.7
|
|
@@ -34,7 +34,7 @@ Requires-Dist: rouge-score ~=0.1.2
|
|
|
34
34
|
Requires-Dist: scipy ~=1.10
|
|
35
35
|
Requires-Dist: uncertainty-calibration ~=0.1.4
|
|
36
36
|
Requires-Dist: scikit-learn ~=1.1
|
|
37
|
-
Requires-Dist: transformers ~=4.
|
|
37
|
+
Requires-Dist: transformers ~=4.40
|
|
38
38
|
Requires-Dist: torch <3.0.0,>=1.13.1
|
|
39
39
|
Requires-Dist: torchvision <3.0.0,>=0.14.1
|
|
40
40
|
Requires-Dist: google-api-python-client ~=2.64
|
|
@@ -136,6 +136,7 @@ Requires-Dist: crfm-helm[anthropic] ; extra == 'models'
|
|
|
136
136
|
Requires-Dist: crfm-helm[google] ; extra == 'models'
|
|
137
137
|
Requires-Dist: crfm-helm[mistral] ; extra == 'models'
|
|
138
138
|
Requires-Dist: crfm-helm[openai] ; extra == 'models'
|
|
139
|
+
Requires-Dist: crfm-helm[together] ; extra == 'models'
|
|
139
140
|
Requires-Dist: crfm-helm[tsinghua] ; extra == 'models'
|
|
140
141
|
Requires-Dist: crfm-helm[yandex] ; extra == 'models'
|
|
141
142
|
Provides-Extra: mongo
|
|
@@ -158,6 +159,8 @@ Provides-Extra: slurm
|
|
|
158
159
|
Requires-Dist: simple-slurm ~=0.2.6 ; extra == 'slurm'
|
|
159
160
|
Provides-Extra: summarization
|
|
160
161
|
Requires-Dist: summ-eval ~=0.892 ; extra == 'summarization'
|
|
162
|
+
Provides-Extra: together
|
|
163
|
+
Requires-Dist: together ~=1.1 ; extra == 'together'
|
|
161
164
|
Provides-Extra: tsinghua
|
|
162
165
|
Requires-Dist: icetk ~=0.0.4 ; extra == 'tsinghua'
|
|
163
166
|
Provides-Extra: unitxt
|
|
@@ -173,6 +176,7 @@ Requires-Dist: scipy ~=1.10 ; extra == 'vlm'
|
|
|
173
176
|
Requires-Dist: torchvision <3.0.0,>=0.14.1 ; extra == 'vlm'
|
|
174
177
|
Requires-Dist: crfm-helm[images] ; extra == 'vlm'
|
|
175
178
|
Requires-Dist: crfm-helm[image2structure] ; extra == 'vlm'
|
|
179
|
+
Requires-Dist: pycocoevalcap ~=1.2 ; extra == 'vlm'
|
|
176
180
|
Provides-Extra: yandex
|
|
177
181
|
Requires-Dist: sentencepiece ~=0.1.97 ; extra == 'yandex'
|
|
178
182
|
|
|
@@ -7,19 +7,18 @@ helm/benchmark/data_preprocessor.py,sha256=aNdM-o2t4qkLIQHiQeWUFg03DjjJ8HTBIphYC
|
|
|
7
7
|
helm/benchmark/executor.py,sha256=simd7SdJ7TciUpoq3D0uz_XUSCZj5KIWCIP57FYm4js,4906
|
|
8
8
|
helm/benchmark/huggingface_registration.py,sha256=RzfOaLAnzAcoTphan1JNo836lNyxMSH67oQlolhNLS0,4154
|
|
9
9
|
helm/benchmark/model_deployment_registry.py,sha256=BjL0ghHgO7_Z5jZZ7kuSOj9saegI3BivaL-b699C0rc,9527
|
|
10
|
-
helm/benchmark/model_metadata_registry.py,sha256=
|
|
10
|
+
helm/benchmark/model_metadata_registry.py,sha256=fXRJOLUIrLOHUG5duncEqhnpmfb9hyloUlGbOM2L9ds,8194
|
|
11
11
|
helm/benchmark/multi_gpu_runner.py,sha256=WmTKpVfcKXyiiPzrmxpbvQoZy0Ua8IyPgxB8r_3jrRw,4773
|
|
12
12
|
helm/benchmark/run.py,sha256=tF_aWy5GtfwBOT1ZRKWrcI74VpFWGzlR00EKiGG7zyI,12572
|
|
13
|
-
helm/benchmark/run_expander.py,sha256=
|
|
13
|
+
helm/benchmark/run_expander.py,sha256=jolEPDrB4lL_VJNRpT1SQta6DZ_xyq2HaIfWHdeyNtA,47785
|
|
14
14
|
helm/benchmark/run_spec.py,sha256=GiIU8iGO2FGYFDWIxt51CeNPsW7rM7BzDqH1KgEL1cg,3217
|
|
15
|
-
helm/benchmark/run_spec_factory.py,sha256=
|
|
15
|
+
helm/benchmark/run_spec_factory.py,sha256=nRP9737niPReD5G7t9fgyQ8_EUQ1hvg2VBQe5rSZ08Y,6816
|
|
16
16
|
helm/benchmark/runner.py,sha256=zlHDJ2Ys5-HxtXcwpkXcrdfXy_i886fBcq1iNeLyC3Q,14669
|
|
17
17
|
helm/benchmark/runner_config_registry.py,sha256=2gW5wBLkHdYb2WNbZulto06hTcto2ROvjy8HULw3jNM,515
|
|
18
18
|
helm/benchmark/server.py,sha256=ysd5MT1TDu65NH-OzIGf9wmZlr8FHNRwoy2ybjSc5Yk,6140
|
|
19
19
|
helm/benchmark/slurm_jobs.py,sha256=eNCAoaWDfT0Wk32ZJRIGo-x8kgjhDPnPB4Xrvw_eLB0,3225
|
|
20
20
|
helm/benchmark/slurm_runner.py,sha256=Tozimrjr2R6mlKHcmrGgxTy9ga-ArIW6AoAWtxqzw-M,16567
|
|
21
21
|
helm/benchmark/test_data_preprocessor.py,sha256=_esdtkqyU_8Yp5ZOO7n1b-Y4Qc28wpD5drG-4Y4UhIM,2219
|
|
22
|
-
helm/benchmark/test_model_deployment_definition.py,sha256=wGN95ku-mROh-yiHH3bL8GC7OWkBOa0YJCS2RIGRv8k,4468
|
|
23
22
|
helm/benchmark/test_run_expander.py,sha256=gLeHkNt_nLgbwEJiYxhwda-eKA3sJAxkYolCvgRN5TY,1163
|
|
24
23
|
helm/benchmark/tokenizer_config_registry.py,sha256=ZOImg38ta0FXZYAWna6q7A5xrG2mU7Ofr-8j4EqGlUY,1585
|
|
25
24
|
helm/benchmark/adaptation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -44,11 +43,11 @@ helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py,sha256=f_bg
|
|
|
44
43
|
helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py,sha256=v6LLmVTopXNfzo9Qzq16EmmPPivFGGs9LuaPDJAX4vY,9506
|
|
45
44
|
helm/benchmark/adaptation/adapters/multimodal/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
46
45
|
helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py,sha256=o7CGClyVWYOuJ4G56-whq5fTvCr7QIn51Mo6DTdvwg0,1881
|
|
47
|
-
helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py,sha256=
|
|
48
|
-
helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py,sha256=
|
|
46
|
+
helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py,sha256=bvY8xT2ak_3WG4m2Z5bCM6FLImPIWG1qAn9H2ZNwNv0,6359
|
|
47
|
+
helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py,sha256=jyL61UxBsIr68hUz-jtjBUnyB2HBp5ESNyECGp_Gf6Q,2129
|
|
49
48
|
helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py,sha256=PWI_WrfDolipj7Zs43YxFQk36jBgU76PU-kL8R9gRno,4759
|
|
50
49
|
helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py,sha256=VjSqWiZEcW6K2jrokGUmky7syEOqJ6cbHImR7YZgwzU,10151
|
|
51
|
-
helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py,sha256=
|
|
50
|
+
helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py,sha256=KKOOlna6SHLJHSPgfgguPQysc2Nf4kKrqumqwlG27bs,3542
|
|
52
51
|
helm/benchmark/annotation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
53
52
|
helm/benchmark/annotation/annotator.py,sha256=2UIXY71S5dRaZBLb1v4lcv8-O6pyJ9zTeSJl78AEWGI,1538
|
|
54
53
|
helm/benchmark/annotation/annotator_factory.py,sha256=z5AGBylIuy-_IfgikX66VyGvRz4SxtnOcJsyESH8990,2699
|
|
@@ -57,7 +56,7 @@ helm/benchmark/annotation/test_dummy_annotator.py,sha256=LfY1ErJDUJ7rD8JUy92RUDD
|
|
|
57
56
|
helm/benchmark/annotation/image2structure/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
58
57
|
helm/benchmark/annotation/image2structure/image_compiler_annotator.py,sha256=eJFm3iyBe_eEN5Yt0G2IpeA1xdKxRmyR4krsNd6eXoE,3524
|
|
59
58
|
helm/benchmark/annotation/image2structure/latex_compiler_annotator.py,sha256=yRifoqhGq_mQkkRcgKCFpGrZaI9gochOXYiCU8oY1KE,2477
|
|
60
|
-
helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py,sha256=
|
|
59
|
+
helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py,sha256=we6K1BynV907ZMnGI2zb_tru1uw2iGEI06Wtbnus23w,4010
|
|
61
60
|
helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py,sha256=rvzdQCaVFM6ovF28TSUnNmB47f2hidlaZm6vO4DJpso,6404
|
|
62
61
|
helm/benchmark/augmentations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
63
62
|
helm/benchmark/augmentations/cleva_perturbation.py,sha256=arUkY_luc274YEMZocOos9rpAZVbEFZphbMlobAxTy0,29208
|
|
@@ -73,12 +72,12 @@ helm/benchmark/augmentations/lowercase_perturbation.py,sha256=e-lhkuKOV5QR_GnOp7
|
|
|
73
72
|
helm/benchmark/augmentations/mild_mix_perturbation.py,sha256=q27-c8_di1jaLhUuo5LzqOStnJcWIjSJ0a8Sr3S5TOs,1787
|
|
74
73
|
helm/benchmark/augmentations/misspelling_perturbation.py,sha256=IE45qEE0HuIln62tAGvCP9B_RjNMJ5Xu06goL6b0fWQ,2144
|
|
75
74
|
helm/benchmark/augmentations/person_name_perturbation.py,sha256=AIF5VVjNJ1OlUyCC1bt2dcP1KDn4gdgqd7Y2KGvHPqs,14422
|
|
76
|
-
helm/benchmark/augmentations/perturbation.py,sha256=
|
|
75
|
+
helm/benchmark/augmentations/perturbation.py,sha256=GapQckD3zkzZZB-PIVO1KKOy7aISGOhRbGGXEzuAFeQ,3880
|
|
77
76
|
helm/benchmark/augmentations/perturbation_description.py,sha256=VKOwBRPQY-0vuxhGvtac1Z5F10metPfpFnfs8ykFVmU,1184
|
|
78
77
|
helm/benchmark/augmentations/space_perturbation.py,sha256=g4rbyoureBaOVf_lrRXIWYlL6YZpqbCOJk554iOCuIs,935
|
|
79
78
|
helm/benchmark/augmentations/suffix_perturbation.py,sha256=P3AfJj_ajTVdjO7AJRQ9dKS-cT1PyRSt8Un57iZQDVc,785
|
|
80
79
|
helm/benchmark/augmentations/synonym_perturbation.py,sha256=komOV5M342_8unopnwN6gkPWpJIZXidywiu6PO9_riU,4151
|
|
81
|
-
helm/benchmark/augmentations/test_perturbation.py,sha256=
|
|
80
|
+
helm/benchmark/augmentations/test_perturbation.py,sha256=4EooKVcyub70I81trzpNx3Ij-m1vpFa5cFIo6O52icE,13185
|
|
82
81
|
helm/benchmark/augmentations/translate_perturbation.py,sha256=dn8wO5UOgYbGtP9e77SmwaK2ginrQsTw-79nrzRzfeo,1054
|
|
83
82
|
helm/benchmark/augmentations/typos_perturbation.py,sha256=_F9zwvrLie8hX7mzUtQmYq6oq6yqaFiKGsvc9LAuBr4,2798
|
|
84
83
|
helm/benchmark/data_overlap/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -107,9 +106,9 @@ helm/benchmark/metrics/decodingtrust_privacy_metrics.py,sha256=l6TmAflBSgQGLjB-U
|
|
|
107
106
|
helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py,sha256=KhiJ_7tnK8kHg2acKPAUWBQvAtyvjJTwcWJMUSUBHak,6416
|
|
108
107
|
helm/benchmark/metrics/disinformation_metrics.py,sha256=BsLBG5krHGQh_nKDAeSDpp-li89s7fYoYTf5T0RZfMM,7782
|
|
109
108
|
helm/benchmark/metrics/dry_run_metrics.py,sha256=d8RgltW4nGTH1tZeGOIlQRwRaJLIxL60h46NXv_wv1s,3674
|
|
110
|
-
helm/benchmark/metrics/efficiency_metrics.py,sha256=
|
|
109
|
+
helm/benchmark/metrics/efficiency_metrics.py,sha256=v8Eg56HHIWEMQruODKBvwdUfR6ZLGgrNifo-senCaUo,11786
|
|
111
110
|
helm/benchmark/metrics/evaluate_instances_metric.py,sha256=EBUf0ONnNoi7pcxYab7RD0B_JqGksqDX8TOaosSmJk8,2847
|
|
112
|
-
helm/benchmark/metrics/evaluate_reference_metrics.py,sha256=
|
|
111
|
+
helm/benchmark/metrics/evaluate_reference_metrics.py,sha256=vUJavaLVfbWtrwyrIA81npK_1iirhko7_zMF1kL7Gfw,15559
|
|
113
112
|
helm/benchmark/metrics/instruction_following_critique_metrics.py,sha256=Pj1itUJi_KDy0D-FOPcOyHqm4ypHMfhbAVeDJzGlyeo,9773
|
|
114
113
|
helm/benchmark/metrics/language_modeling_metrics.py,sha256=ofqwj1PMJQu16QhLDULXBmZ5iFz91ducwLRpNsRYELE,4510
|
|
115
114
|
helm/benchmark/metrics/machine_translation_metrics.py,sha256=bp_EDXyxntIty5gORDa7va-C73quOzoTc5o8MpxFmL4,3816
|
|
@@ -182,7 +181,7 @@ helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py,sha256=_wJ3E3L
|
|
|
182
181
|
helm/benchmark/metrics/tokens/token_cost_estimator.py,sha256=fTGUfhHV6yMwpTkCEMTGMxKO8jskqJz4sAtwXT6M_C8,425
|
|
183
182
|
helm/benchmark/metrics/vision_language/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
184
183
|
helm/benchmark/metrics/vision_language/emd_utils.py,sha256=3yN-DY5rxMabmtLV003lj59SRnp_T83sLAi96rycKEo,15043
|
|
185
|
-
helm/benchmark/metrics/vision_language/image_metrics.py,sha256=
|
|
184
|
+
helm/benchmark/metrics/vision_language/image_metrics.py,sha256=aJ3zrVOLJJzdVKqXPcFsCXp9LSHET8VGEgtvwK-nkJc,25190
|
|
186
185
|
helm/benchmark/metrics/vision_language/image_utils.py,sha256=XeYF3E6MnYyPJ5hYp4TtiTP27-y4S8LTBH5bZVcvJFg,3758
|
|
187
186
|
helm/benchmark/presentation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
188
187
|
helm/benchmark/presentation/contamination.py,sha256=PiIdcaD3-xfExjOmyL5q4Ao2ASa-OlScJAB9u1Zxe7o,2811
|
|
@@ -205,7 +204,7 @@ helm/benchmark/run_specs/instruction_following_run_specs.py,sha256=GElJhgbQhlZMY
|
|
|
205
204
|
helm/benchmark/run_specs/lite_run_specs.py,sha256=ViCPJ86Aah8301GTEk6z4_MtP0g8iik33t4GudobhWQ,11113
|
|
206
205
|
helm/benchmark/run_specs/simple_run_specs.py,sha256=0kK_e8U4JUWZ6wO4N-GPFRE1iGT4ilvSMUGfirvpIE0,3837
|
|
207
206
|
helm/benchmark/run_specs/unitxt_run_specs.py,sha256=ejp_knrcIjf0J4WiKj9LTgDTcUr29-XFZYHYz0w_dkM,1518
|
|
208
|
-
helm/benchmark/run_specs/vlm_run_specs.py,sha256=
|
|
207
|
+
helm/benchmark/run_specs/vlm_run_specs.py,sha256=CmdyEF-pdFIlMhBV7UraQ0FuQgQl2rqVSdTz22uYuPQ,26808
|
|
209
208
|
helm/benchmark/scenarios/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
210
209
|
helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py,sha256=Wyt7J5BAvAqC5JTqCW4fh7ex9-itX11P_9rLTocqvtk,4973
|
|
211
210
|
helm/benchmark/scenarios/babi_qa_scenario.py,sha256=S1tPQY2x1I3hQL1JQ6wvUwvKyiSe7SqpRSW6N3_T0mo,5043
|
|
@@ -248,13 +247,13 @@ helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py,sha256=PuwcuHnx3nCuRYEE
|
|
|
248
247
|
helm/benchmark/scenarios/koala_scenario.py,sha256=ihx39W1C9KyOh5MkdMqw1twUwzA8S5QPsBKeQRZcZGM,1365
|
|
249
248
|
helm/benchmark/scenarios/legal_summarization_scenario.py,sha256=DTYYsf-74NFjHvrNa5ZWSgMywq9ss16r5eFZI2-xZJk,8644
|
|
250
249
|
helm/benchmark/scenarios/legal_support_scenario.py,sha256=hu8SlDGrU6oUTQjnrLEagjFNniW96wMahKjnrtuyjzo,3925
|
|
251
|
-
helm/benchmark/scenarios/legalbench_scenario.py,sha256=
|
|
250
|
+
helm/benchmark/scenarios/legalbench_scenario.py,sha256=jLo8HFE24M7pcS4PgrpAYhiZNuXSfPUb6LrGoaGCT1A,4490
|
|
252
251
|
helm/benchmark/scenarios/lex_glue_scenario.py,sha256=vsy8Y03hoCwTJFgBdQf0koxlHDRFm2V9wT04n2K1tlk,10268
|
|
253
252
|
helm/benchmark/scenarios/lextreme_scenario.py,sha256=HEpXLdLLywo6yfXaGF3zRgx1PiGPE2ILnpKYs9Dmsow,20396
|
|
254
253
|
helm/benchmark/scenarios/live_qa_scenario.py,sha256=meJvkkbqRUQ2odKkmlOLYOVVFd0LGDhGzK5nHCjXW6Y,3642
|
|
255
254
|
helm/benchmark/scenarios/lm_entry_scenario.py,sha256=_7lTn2kfUQHi3ynX7fEaoseUYkpBx-ANn62kqHr-bY4,9085
|
|
256
255
|
helm/benchmark/scenarios/lsat_qa_scenario.py,sha256=Ui68mi05N3WxGMZfCznJBscKkqaJ0ZEUO3PhIiD1xNE,6129
|
|
257
|
-
helm/benchmark/scenarios/math_scenario.py,sha256=
|
|
256
|
+
helm/benchmark/scenarios/math_scenario.py,sha256=LvoQJUwZ0w78h5X1tIaa6SRsef_W_PNn4L_r5L3Cus8,14168
|
|
258
257
|
helm/benchmark/scenarios/me_q_sum_scenario.py,sha256=WLfED6DiIOUPd4PowydPmrQ4v26OsjG4iHumq7ka9do,4001
|
|
259
258
|
helm/benchmark/scenarios/med_dialog_scenario.py,sha256=f7hTcEpxGfoxJ-WXNrSRrgVfbyuYpB624HbyRuMP-KI,7295
|
|
260
259
|
helm/benchmark/scenarios/med_mcqa_scenario.py,sha256=JdVQuzKM4TRyWT6c1zpsMvNk1RYHnKdy_fRM6W7PCLI,5035
|
|
@@ -314,25 +313,35 @@ helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py,s
|
|
|
314
313
|
helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py,sha256=IB4_GbzQjjXBp-551XZ6PTNUCRX1jLcGfB3bVFI5lo4,3547
|
|
315
314
|
helm/benchmark/scenarios/image_generation/winoground_scenario.py,sha256=E2xPQNQzylDSmqLjjMkQB8D7A6g7bzqtSF4bXPgfVbI,2889
|
|
316
315
|
helm/benchmark/scenarios/vision_language/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
316
|
+
helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py,sha256=zXR0LmXsD2tv_ovJsbY_HP53kdiFOvty7Y_Ai3ZCrT4,3037
|
|
317
317
|
helm/benchmark/scenarios/vision_language/bingo_scenario.py,sha256=LiH14xUoEKXn5ZStDbGE4bz9iMEn3-5I39eJ6kvN2UY,4045
|
|
318
|
-
helm/benchmark/scenarios/vision_language/
|
|
318
|
+
helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py,sha256=82qplX4gJ4GsSVhBjwrsVU46TAHh-jym3F_M5A-odRE,4608
|
|
319
|
+
helm/benchmark/scenarios/vision_language/flickr30k_scenario.py,sha256=3pBAQgOsnSyMCzt60s1m8Kf_fEJ4C7XgCDbtXatTlX0,2599
|
|
320
|
+
helm/benchmark/scenarios/vision_language/gqa_scenario.py,sha256=sBQfqAxmP-Z0ifCgwTbP11aPsKA4vogcWBqSDiKlbE4,3512
|
|
321
|
+
helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py,sha256=7KjsXiAaiVHDRSyW08jZXNUTWogP3Sr2Og5ViT6Xz8I,3832
|
|
319
322
|
helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py,sha256=7GK_jAOfCgRIGiN_GInDePwuT2wZqmWHp1rqdx18xQg,4994
|
|
323
|
+
helm/benchmark/scenarios/vision_language/math_vista_scenario.py,sha256=kzZHeyWQHUphUfAixkms2t3-KKfHRjwIKi6qm-lMyXA,4728
|
|
320
324
|
helm/benchmark/scenarios/vision_language/mementos_scenario.py,sha256=Yw4zxeYgUw8HKRR5ob9QEIT0bSPrdRUULMKCo_xzFpw,4337
|
|
325
|
+
helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py,sha256=ly77pj_TorwM1kN2sW7Y2AIGHOBlDkdzV0STvZTBOtc,4332
|
|
321
326
|
helm/benchmark/scenarios/vision_language/mme_scenario.py,sha256=zxtdub2akvxPYEG12pkW2c57TIFqN38C7ucAXAHAdx0,5455
|
|
322
327
|
helm/benchmark/scenarios/vision_language/mmmu_scenario.py,sha256=XQv7uv2m6EdbI7h0-9eDag4_bL7qE_78PuHB7c4SsHA,7654
|
|
328
|
+
helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py,sha256=HUO09uM2rBXOfCsxzwovmwtihq53xjuzDOtQO_S3J4I,4161
|
|
329
|
+
helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py,sha256=c7YfclYMDtygsLnEfA8oP6Vl7evdrqqTZazmuD9Oy-8,5353
|
|
323
330
|
helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py,sha256=HuizbYsN5Nlihfzu4bfGuC8KSBbeIc6TVknMS4kpVJY,7149
|
|
331
|
+
helm/benchmark/scenarios/vision_language/originality_scenario.py,sha256=1inr-klQEz08CM2GWqbYdy-AuXQmMhOAywAlA0lJHik,1029
|
|
332
|
+
helm/benchmark/scenarios/vision_language/pairs_scenario.py,sha256=rkPR_e_RWOeSyHIlSJGJ5lVu5DD-AR3x686XYJse-1E,9885
|
|
324
333
|
helm/benchmark/scenarios/vision_language/pope_scenario.py,sha256=uFkzMMsjhmuSYo3v_QdfJFX6RFse83JjzMfMa3ynvV4,3975
|
|
325
334
|
helm/benchmark/scenarios/vision_language/seed_bench_scenario.py,sha256=5MwGb9BOyB2Xy70BGYZcjencf0ZskxBuzcPa7ABRuww,5106
|
|
326
335
|
helm/benchmark/scenarios/vision_language/unicorn_scenario.py,sha256=e3lCq2nevy9tIFDDKEbJvmLibfk4UMQtAIyzrgnnaZs,4179
|
|
327
|
-
helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py,sha256=
|
|
328
|
-
helm/benchmark/scenarios/vision_language/vqa_scenario.py,sha256=
|
|
336
|
+
helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py,sha256=hJ3sOSpPnOCwLtpVnfasI_X89oofI-2PBRjMnx8eiVA,4139
|
|
337
|
+
helm/benchmark/scenarios/vision_language/vqa_scenario.py,sha256=2hY-qngKC69ZL9SHNei3IK3C2PvJDWvwLFVQ8yNSOVs,5196
|
|
329
338
|
helm/benchmark/scenarios/vision_language/image2structure/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
330
339
|
helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py,sha256=ImhfiC_y_hihAGvlj9zRsaoW614QFCBopBD2KxnbSs0,1805
|
|
331
340
|
helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py,sha256=-eWRwo2x7kR46Z_I4vFbVlbqA_1f2UEb75Dx84XTlNE,9028
|
|
332
341
|
helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py,sha256=FKKybU4IeglwXCj6GZC8cAUs_GOU7ymEa6P1dkDT7uw,1350
|
|
333
|
-
helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py,sha256=
|
|
342
|
+
helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py,sha256=SyAYkhsipjJG42XfM9sljz1vly5YF-dbSEWTj_dEHIU,1048
|
|
334
343
|
helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py,sha256=ovg8-FfJ8_I1xbajFGSLvERZIA1fQjaUn0zd04ZbI84,15316
|
|
335
|
-
helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py,sha256=
|
|
344
|
+
helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py,sha256=j2bDYeWdytYtkKskvuTMwLEIIqELDJJ6D2jdYzmdlJY,9628
|
|
336
345
|
helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
337
346
|
helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py,sha256=i-i0mlG5oRRDNYNqP7o7Ul56iL02p_anJoThXaSvFiM,2826
|
|
338
347
|
helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py,sha256=9WntahzuhVv54IH1m7_z0IxwLma3dbaMOne_pUx751Y,7652
|
|
@@ -347,11 +356,13 @@ helm/benchmark/static/info-icon.png,sha256=P-PW3Ek3NGiRAW5BXOjJRPBfMVqprjAqtQheG
|
|
|
347
356
|
helm/benchmark/static/json-urls.js,sha256=AaULgfHw8OLfrQLJpBHfcC013uavQnlNNFS9vzb0qOg,1981
|
|
348
357
|
helm/benchmark/static/plot-captions.js,sha256=bTR8gYx-QqF_RJyKX-L-eQP7hSEtawfJSoADCvgjKag,3011
|
|
349
358
|
helm/benchmark/static/schema_classic.yaml,sha256=p-yc2WMfyGehRtD7L5ZZHbFMMQovu2HNfvct3tBlV2I,108168
|
|
359
|
+
helm/benchmark/static/schema_image2structure.yaml,sha256=gig7HVyJWSwcHa96mf-09e68_fU5L02YRWzNbkPmpGg,13520
|
|
350
360
|
helm/benchmark/static/schema_instruction_following.yaml,sha256=mg2g5P8TAYSCEhZbLfshPt_Hq2GKjwbvyOsQrwDqh7w,8923
|
|
351
361
|
helm/benchmark/static/schema_lite.yaml,sha256=62ByEWhAJT0tIUFi-euxJ7XFhE6e9E6PT9dF6V3qoSU,40255
|
|
352
362
|
helm/benchmark/static/schema_mmlu.yaml,sha256=8kiZDEGGaBXs9ucDk_Gbo2agV-OgOmWuhcYFyodRjcw,53307
|
|
353
363
|
helm/benchmark/static/schema_unitxt.yaml,sha256=89GnKrooG7kKU2xh0MeoYZUB54FDUAmOPrbzuBhG1Ik,15496
|
|
354
|
-
helm/benchmark/static/
|
|
364
|
+
helm/benchmark/static/schema_vhelm_lite.yaml,sha256=s8tQIetR2WKu3sd8k2uZO68_5E-YtlMdsBJsTehFZKE,7331
|
|
365
|
+
helm/benchmark/static/schema_vlm.yaml,sha256=o9AzLTKwSbPES5pISI0tmpUPKWWT9GR-dleDKZqoI0w,33243
|
|
355
366
|
helm/benchmark/static/utils.js,sha256=bgN0PT53Dregc-nLmEmAEmg2psufWpS8jTf74WoypHw,7681
|
|
356
367
|
helm/benchmark/static/images/crfm-logo.png,sha256=dDkauL_wJR_Luu7L7pltphS3a9HSLjDkpVLa6C9vcA4,62712
|
|
357
368
|
helm/benchmark/static/images/helm-logo-simple.png,sha256=LtVAC4OgcWgMAob53rTrf7cRDu-O0z85ZOGGj9wR9hw,86133
|
|
@@ -373,7 +384,7 @@ helm/benchmark/static/images/organizations/together.png,sha256=pmWjW4r7GnlKqFhKL
|
|
|
373
384
|
helm/benchmark/static/images/organizations/tsinghua-keg.png,sha256=l9SzlZCsLF18BY876wYJcVgiQbgvwte7uoILPDcVwHk,7776
|
|
374
385
|
helm/benchmark/static/images/organizations/yandex.png,sha256=OOCdcKubAP4x7h4VW7z5a-AHPWBiSDTjsIJea6ZiovA,27964
|
|
375
386
|
helm/benchmark/static_build/config.js,sha256=ER8utDIqVZi9uge7Qrk1gmlT88TOOkFF9xYp3j10m8U,165
|
|
376
|
-
helm/benchmark/static_build/index.html,sha256=
|
|
387
|
+
helm/benchmark/static_build/index.html,sha256=g3pMdAovQ4VMr7dPGgyzWv2K1tN-E8LLkAs45ppLPGw,1149
|
|
377
388
|
helm/benchmark/static_build/assets/01-694cb9b7.png,sha256=aUy5t0DYCg4r52HDOmeNi1S2CHsnv3mE7ySokJg3Ouo,8903
|
|
378
389
|
helm/benchmark/static_build/assets/ai21-0eb91ec3.png,sha256=Drkew6Vlwi2_4_S8hjagK2x8smOwLKTNiXIT3rDiurs,10208
|
|
379
390
|
helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png,sha256=fOEANHS8RymKaCzUWn9gQWebts2ghSmtW9Fdda_TjR8,7224
|
|
@@ -386,8 +397,8 @@ helm/benchmark/static_build/assets/google-06d997ad.png,sha256=BtmXrVQZHr3WH5c8c2
|
|
|
386
397
|
helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png,sha256=Pl46pKbC_TU3L6kZQ_3G-0wTseluAhIYwb3EqpdQAjQ,1344452
|
|
387
398
|
helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png,sha256=LtVAC4OgcWgMAob53rTrf7cRDu-O0z85ZOGGj9wR9hw,86133
|
|
388
399
|
helm/benchmark/static_build/assets/helmhero-28e90f4d.png,sha256=KOkPTf-q28PdvGOBp1G5O4q1eWUJjuij3z2h_SUUf8s,55314
|
|
389
|
-
helm/benchmark/static_build/assets/index-
|
|
390
|
-
helm/benchmark/static_build/assets/index-
|
|
400
|
+
helm/benchmark/static_build/assets/index-737eef9e.js,sha256=PvNcOghX7gGSYAGk2bR3pvIBnwDbeWHu0JyfPNaan3o,70614
|
|
401
|
+
helm/benchmark/static_build/assets/index-878a1094.css,sha256=h4oQlJUZdqMk6nS_TEkyXMZ6rtGmepw4ljoSAHZX1vY,486381
|
|
391
402
|
helm/benchmark/static_build/assets/meta-5580e9f1.png,sha256=VYDp8arkAe2eYRJhAOcIAsZY1qY0hqyOEQDgVMbX9M8,4646
|
|
392
403
|
helm/benchmark/static_build/assets/microsoft-f5ee5016.png,sha256=9e5QFl23yTbnAk8u7lZKaQOf4oPHbr_aiQda5n4MZqE,50850
|
|
393
404
|
helm/benchmark/static_build/assets/mistral-18e1be23.png,sha256=GOG-Ix7XlctGOUmvJfO2oVSBM7E5O562G88OnoxsjBw,14402
|
|
@@ -443,7 +454,7 @@ helm/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
|
443
454
|
helm/clients/ai21_client.py,sha256=LIdkmzcUDR9uIF2tIk5YgDNGNmfQ9JDYmgscvFoCHDs,5509
|
|
444
455
|
helm/clients/ai21_utils.py,sha256=mlg3h615kyckccGZv9rqsP4Y60O3XpwyE-UURRMrxII,471
|
|
445
456
|
helm/clients/aleph_alpha_client.py,sha256=koPqXF6uRD905atoiCaPg5yxr6B25J0g2OTWk8geebQ,4969
|
|
446
|
-
helm/clients/anthropic_client.py,sha256=
|
|
457
|
+
helm/clients/anthropic_client.py,sha256=0hAmv3f6FQURScmDpcGbwGjnvskNRP2vhRH02OSe70I,33224
|
|
447
458
|
helm/clients/auto_client.py,sha256=Qs0XFq9pyH4M9HTOLoI3_5m8kW305x3pzVukgETdrZM,10732
|
|
448
459
|
helm/clients/bedrock_client.py,sha256=BsH9UopsP6ZHf-K0Yzg1PYSMLDwY0yIUmPHDhJVMUi0,5293
|
|
449
460
|
helm/clients/bedrock_utils.py,sha256=okZ6Z8pviGOUNlrdF2QquAqFs8-QYgcqci95eij8giM,2574
|
|
@@ -462,7 +473,7 @@ helm/clients/megatron_client.py,sha256=KFL1BBBDqxr5mtd5iu0dA6uK8_v6d4g_D6RsZrHx3
|
|
|
462
473
|
helm/clients/mistral_client.py,sha256=thOLMcEfrzWR00JUabIZ_PnW2o9YZsdSmNf9z3jbYKo,5982
|
|
463
474
|
helm/clients/moderation_api_client.py,sha256=I5pYWRb2MmcLDYrScnC3P5N7OUFzQiVQ828_hf7zjM4,4719
|
|
464
475
|
helm/clients/open_lm_client.py,sha256=qFgYqlV_3UiW8WJKz66lLqRqg2jt1qtJ1bHMRAtBn40,1749
|
|
465
|
-
helm/clients/openai_client.py,sha256=
|
|
476
|
+
helm/clients/openai_client.py,sha256=gWqr4dvYfbUnBtfySSUGGVZYV-pLtqcrnYaf7nPk5-s,13936
|
|
466
477
|
helm/clients/palmyra_client.py,sha256=LBYFHNc5LdpPbiSp1AAHuMm8cUUCQ2EB03BB6XnDTYQ,6551
|
|
467
478
|
helm/clients/perspective_api_client.py,sha256=WQDArqlKVWwcK2SicnSIAgV6JGVHsxibTzkdezT3z_U,5920
|
|
468
479
|
helm/clients/simple_client.py,sha256=55S_y1eWD1bjktcG21Vs8G5bF6QbKKwmJyqs6lCUJeI,2048
|
|
@@ -471,9 +482,9 @@ helm/clients/test_client.py,sha256=g29C1WLUONnNuE2oGFZhaqMahb-doS4l_Ph4OHrQvrc,3
|
|
|
471
482
|
helm/clients/test_huggingface_client.py,sha256=WUPrA7VT3nnMNht7w20I6411hlpIS_77XbQC2vC0WU0,2723
|
|
472
483
|
helm/clients/test_simple_client.py,sha256=G0JRQX69ypQN2VxhlNQXs5u2Tdtkcl_aeHqudDUVKi4,702
|
|
473
484
|
helm/clients/test_together_client.py,sha256=lAtGKn3WdsYe5MEfTYVYRnu_rS4DPnfFr5jRn42rvoQ,3865
|
|
474
|
-
helm/clients/together_client.py,sha256=
|
|
485
|
+
helm/clients/together_client.py,sha256=fCPJ39fX3xm_Gp6cGsc1HIf1jVMLNiE2kIkee45-Ufk,16208
|
|
475
486
|
helm/clients/toxicity_classifier_client.py,sha256=AI_FizxMurubTIyeceRdkixSnhWQbcD-oEEONj5ve7o,464
|
|
476
|
-
helm/clients/vertexai_client.py,sha256=
|
|
487
|
+
helm/clients/vertexai_client.py,sha256=Mt1rb9lWeQqJLGcBSR5mflYBvJvJfsv5OeIuQz4_ng0,19726
|
|
477
488
|
helm/clients/vllm_client.py,sha256=p9atBtq3PBOoPkOPSifkMrYZjNLnNM_sWM6tL_3N-WY,1675
|
|
478
489
|
helm/clients/clip_scorers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
479
490
|
helm/clients/clip_scorers/base_clip_scorer.py,sha256=NfXe79g6M4Wype3Xf-oXxscaUgjbZLmy9dRnBaLiWwk,695
|
|
@@ -532,8 +543,9 @@ helm/clients/image_generation/mindalle/utils/config.py,sha256=lh8dXvL7ctKmuYEbeT
|
|
|
532
543
|
helm/clients/image_generation/mindalle/utils/sampling.py,sha256=soTHaJrN4FV1lDdh9HMveJs6F49UMK57Xfa0ccnHqI8,5029
|
|
533
544
|
helm/clients/image_generation/mindalle/utils/utils.py,sha256=ESugpzG-_73GKl07mj-8o-_nim_FOICxfYkczy3s9x4,3119
|
|
534
545
|
helm/clients/vision_language/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
535
|
-
helm/clients/vision_language/
|
|
536
|
-
helm/clients/vision_language/
|
|
546
|
+
helm/clients/vision_language/huggingface_vision2seq_client.py,sha256=hTywh5nM95BmPoDyKOSDWg9G3-QwLO3KZEJZVkmFroo,6478
|
|
547
|
+
helm/clients/vision_language/huggingface_vlm_client.py,sha256=X5SX2iMZkFe9Pmq4Gx0O4bnP4gBPnKvamLThRshAEik,4875
|
|
548
|
+
helm/clients/vision_language/idefics_client.py,sha256=hi1VCDBegHfBssmW0C62H3OX3U2ISVRhaSkd24gb1K4,7692
|
|
537
549
|
helm/clients/vision_language/open_flamingo_client.py,sha256=CkN0JCeR742ZG9Nc4A85hp4BSE0WLU-3Rs-ZwdmDkzs,6632
|
|
538
550
|
helm/clients/vision_language/qwen_vlm_client.py,sha256=6rCH4gJMDyQHyjAE_GDIrLsInH_bvd6to-4RMWbRLeM,7407
|
|
539
551
|
helm/clients/vision_language/open_flamingo/__init__.py,sha256=i1tGJj6ckeE6eS1EWV5tbQKYLmPCrdSI45mPchfv_Ic,88
|
|
@@ -557,7 +569,7 @@ helm/common/general.py,sha256=nMfHNPXyAAorAMmgDClD8r8XXeJcvfF0QXTP-FgH5PQ,11690
|
|
|
557
569
|
helm/common/gpu_utils.py,sha256=pmLq6ipYNLEm28VxxSNeZuVt-gAw-WnYmBvxP1P1p6M,480
|
|
558
570
|
helm/common/hierarchical_logger.py,sha256=EnKLnfbQftca08EJfjGEQb4tcnCKbx-JtwLnoCnhMQs,2908
|
|
559
571
|
helm/common/image_generation_parameters.py,sha256=nsbuk_-BlRMK6IwP5y6BnTXbTRTOcvZ6uLblL5VHLOo,916
|
|
560
|
-
helm/common/images_utils.py,sha256=
|
|
572
|
+
helm/common/images_utils.py,sha256=zbzS8C_oCDb9dY2xpWY6nljI8of72rqwijryMeiBKKo,2527
|
|
561
573
|
helm/common/key_value_store.py,sha256=iHi1WQuWttLNJnuM48QNOAXHoneNbmbBmtXYPq-dyys,3147
|
|
562
574
|
helm/common/media_object.py,sha256=3VZqfb0py5dDKwWtnLp2kdl8svaike-Cn7Mjk-b0cvM,5130
|
|
563
575
|
helm/common/moderations_api_request.py,sha256=3xTsErSsCr2PHD2jpdV1JglHaYHwP2Yqu25_JFtfa68,2234
|
|
@@ -578,9 +590,9 @@ helm/common/file_caches/file_cache.py,sha256=QfF1hlF8FQ-rcPn9Zyl6L0dOCokvYgd-dFq
|
|
|
578
590
|
helm/common/file_caches/local_file_cache.py,sha256=wBOAbbkGLiClaX4YdunokRfSQCKNkTYmMVx2KTLy4Lc,1921
|
|
579
591
|
helm/common/file_caches/test_local_file_cache.py,sha256=bOCWR9MglwQXV98xk8auyjgFxaOr85zRdxWwxMBQW9s,663
|
|
580
592
|
helm/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
581
|
-
helm/config/model_deployments.yaml,sha256=
|
|
582
|
-
helm/config/model_metadata.yaml,sha256=
|
|
583
|
-
helm/config/tokenizer_configs.yaml,sha256=
|
|
593
|
+
helm/config/model_deployments.yaml,sha256=KAD0FZ45ERfEjr3y7HbPxZmEnnJBQiiOHRHN7VxqiF4,74817
|
|
594
|
+
helm/config/model_metadata.yaml,sha256=XpJnlu0kiI5sGEqswF_S6_ra0Iys3VOfsDs2Jiz_Vqk,112991
|
|
595
|
+
helm/config/tokenizer_configs.yaml,sha256=3IhRANDTlN39TWqDWuPy507wQlZWOBlyaS8fA6WLDD0,12070
|
|
584
596
|
helm/proxy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
585
597
|
helm/proxy/accounts.py,sha256=gd5cKhKeqklf_cXCAISl65AUvZeD6afBNrs6WK3IBvQ,14764
|
|
586
598
|
helm/proxy/cli.py,sha256=l8F7UYqrIOoBD9ZCIxJFA4fhxlzhae0-2Nn8A7FMkzk,8244
|
|
@@ -634,9 +646,9 @@ helm/tokenizers/yalm_tokenizer_data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQ
|
|
|
634
646
|
helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py,sha256=W9p5QNn1GSm-y85yVEQe_82zn5CVK_vR6jvhk7JTs_k,869
|
|
635
647
|
helm/tokenizers/yalm_tokenizer_data/voc_100b.sp,sha256=LmPD0_OIOXi8dWuNjXUYOSPhf8kPp2xhvK-g3bXcwrQ,2815034
|
|
636
648
|
helm/tokenizers/yalm_tokenizer_data/yalm_tokenizer.py,sha256=kH5Qig1_6r_sKbAHinX7C83tqBUoTwbe-gGZCbGVkko,6389
|
|
637
|
-
crfm_helm-0.5.
|
|
638
|
-
crfm_helm-0.5.
|
|
639
|
-
crfm_helm-0.5.
|
|
640
|
-
crfm_helm-0.5.
|
|
641
|
-
crfm_helm-0.5.
|
|
642
|
-
crfm_helm-0.5.
|
|
649
|
+
crfm_helm-0.5.1.dist-info/LICENSE,sha256=bJiay7Nn5SHQ2n_4ZIT3AE0W1RGq4O7pxOApgBsaT64,11349
|
|
650
|
+
crfm_helm-0.5.1.dist-info/METADATA,sha256=dVxnv-vEsYZb3v-ALFNpSdpbxwi5WQG5_I1oD3cMs6Y,19157
|
|
651
|
+
crfm_helm-0.5.1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
|
652
|
+
crfm_helm-0.5.1.dist-info/entry_points.txt,sha256=AvH9soAH3uey9xffisWewd0yrmPWGASC036jHd1SFyg,300
|
|
653
|
+
crfm_helm-0.5.1.dist-info/top_level.txt,sha256=s9yl-XmuTId6n_W_xRjCS99MHTwPXOlkKxmTr8xZUNY,5
|
|
654
|
+
crfm_helm-0.5.1.dist-info/RECORD,,
|
|
@@ -79,6 +79,7 @@ class InContextLearningMultimodalAdapter(InContextLearningAdapter, ABC):
|
|
|
79
79
|
# Prompt
|
|
80
80
|
prompt = MultimodalPrompt(
|
|
81
81
|
global_prefix=self.adapter_spec.global_prefix,
|
|
82
|
+
global_suffix=self.adapter_spec.global_suffix,
|
|
82
83
|
instructions=self.adapter_spec.instructions,
|
|
83
84
|
train_instance_blocks=train_instance_blocks,
|
|
84
85
|
eval_instance_block=eval_instance_block,
|
|
@@ -11,6 +11,9 @@ class MultimodalPrompt:
|
|
|
11
11
|
# Global prefix, carried over from `AdapterSpec`
|
|
12
12
|
global_prefix: str
|
|
13
13
|
|
|
14
|
+
# Global suffix, carried over from `AdapterSpec`
|
|
15
|
+
global_suffix: str
|
|
16
|
+
|
|
14
17
|
# Instance prefix, carried over from `AdapterSpec`. What goes between the instruction and instances.
|
|
15
18
|
instance_prefix: str
|
|
16
19
|
|
|
@@ -47,6 +50,10 @@ class MultimodalPrompt:
|
|
|
47
50
|
if self.global_prefix:
|
|
48
51
|
result = result.add_textual_prefix(self.global_prefix)
|
|
49
52
|
|
|
53
|
+
# Add the global suffix if one exists
|
|
54
|
+
if self.global_suffix:
|
|
55
|
+
result = result.add_textual_suffix(self.global_suffix)
|
|
56
|
+
|
|
50
57
|
return result
|
|
51
58
|
|
|
52
59
|
@property
|
|
@@ -32,6 +32,7 @@ class TestMultimodalContent(unittest.TestCase):
|
|
|
32
32
|
|
|
33
33
|
prompt = MultimodalPrompt(
|
|
34
34
|
global_prefix="[START]",
|
|
35
|
+
global_suffix="",
|
|
35
36
|
instance_prefix="\n",
|
|
36
37
|
instructions="Please answer the following questions about the images.",
|
|
37
38
|
train_instance_blocks=train_instance_blocks,
|
|
@@ -67,6 +68,7 @@ class TestMultimodalContent(unittest.TestCase):
|
|
|
67
68
|
|
|
68
69
|
prompt = MultimodalPrompt(
|
|
69
70
|
global_prefix="",
|
|
71
|
+
global_suffix="",
|
|
70
72
|
instance_prefix="\n",
|
|
71
73
|
instructions="",
|
|
72
74
|
train_instance_blocks=[],
|
|
@@ -18,7 +18,7 @@ class LilypondCompilerAnnotator(ImageCompilerAnnotator):
|
|
|
18
18
|
"""Annotator that compiles the text completions into a music sheet with LilyPond."""
|
|
19
19
|
|
|
20
20
|
name: str = "lilypond_compiler"
|
|
21
|
-
base_path = "
|
|
21
|
+
base_path = "lilypond-2.24.3/bin"
|
|
22
22
|
|
|
23
23
|
def __init__(self, cache_config: CacheConfig, file_storage_path: str):
|
|
24
24
|
super().__init__(cache_config, file_storage_path)
|
|
@@ -48,11 +48,27 @@ class TextPerturbation(Perturbation, ABC):
|
|
|
48
48
|
|
|
49
49
|
description = replace(self.description, seed=seed)
|
|
50
50
|
|
|
51
|
+
perturbed_input: Input
|
|
52
|
+
if instance.input.multimedia_content:
|
|
53
|
+
perturbed_media_objects = []
|
|
54
|
+
for media_object in instance.input.multimedia_content.media_objects:
|
|
55
|
+
# Apply perturbations to the text data of the multimedia content
|
|
56
|
+
if media_object.is_type("text") and media_object.text is not None:
|
|
57
|
+
perturbed_media_objects.append(replace(media_object, text=self.perturb(media_object.text, rng)))
|
|
58
|
+
else:
|
|
59
|
+
perturbed_media_objects.append(media_object)
|
|
60
|
+
|
|
61
|
+
perturbed_input = Input(
|
|
62
|
+
multimedia_content=replace(instance.input.multimedia_content, media_objects=perturbed_media_objects)
|
|
63
|
+
)
|
|
64
|
+
else:
|
|
65
|
+
perturbed_input = Input(text=self.perturb(instance.input.text, rng))
|
|
66
|
+
|
|
51
67
|
# Don't modify `id` of `Instance` here.
|
|
52
68
|
# All the perturbed Instances generated from a single Instance should have the same ID.
|
|
53
69
|
return replace(
|
|
54
70
|
instance,
|
|
55
|
-
input=
|
|
71
|
+
input=perturbed_input,
|
|
56
72
|
references=references,
|
|
57
73
|
perturbation=description,
|
|
58
74
|
contrast_inputs=[instance.input],
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
from typing import List
|
|
3
3
|
import unittest
|
|
4
4
|
|
|
5
|
+
from helm.common.media_object import MediaObject, MultimediaObject
|
|
5
6
|
from helm.benchmark.scenarios.scenario import Input, Instance, Output, Reference
|
|
6
7
|
from .data_augmenter import DataAugmenter
|
|
7
8
|
from .extra_space_perturbation import ExtraSpacePerturbation
|
|
@@ -33,6 +34,35 @@ def test_extra_space_perturbation():
|
|
|
33
34
|
assert instances[1].references[0].output.text == "some name"
|
|
34
35
|
|
|
35
36
|
|
|
37
|
+
def test_multimodal_text_perturbation():
|
|
38
|
+
data_augmenter = DataAugmenter(perturbations=[ExtraSpacePerturbation(num_spaces=3)])
|
|
39
|
+
input: Input = Input(
|
|
40
|
+
multimedia_content=MultimediaObject(
|
|
41
|
+
[
|
|
42
|
+
MediaObject(text="Hello what is", content_type="text/plain"),
|
|
43
|
+
MediaObject(text="your name", content_type="text/plain"),
|
|
44
|
+
]
|
|
45
|
+
)
|
|
46
|
+
)
|
|
47
|
+
instance: Instance = Instance(id="id0", input=input, references=[Reference(Output(text="some name"), tags=[])])
|
|
48
|
+
instances: List[Instance] = data_augmenter.generate([instance], include_original=True)
|
|
49
|
+
|
|
50
|
+
assert len(instances) == 2
|
|
51
|
+
|
|
52
|
+
# Test that the first instance is unperturbed
|
|
53
|
+
assert instances[0].id == "id0"
|
|
54
|
+
assert instances[0].perturbation is None
|
|
55
|
+
media_objects = instances[0].input.multimedia_content.media_objects
|
|
56
|
+
assert media_objects[0].text == "Hello what is"
|
|
57
|
+
assert media_objects[1].text == "your name"
|
|
58
|
+
|
|
59
|
+
assert instances[1].id == "id0"
|
|
60
|
+
assert instances[1].perturbation.name == "extra_space"
|
|
61
|
+
media_objects = instances[1].input.multimedia_content.media_objects
|
|
62
|
+
assert media_objects[0].text == "Hello   what   is"
|
|
63
|
+
assert media_objects[1].text == "your   name"
|
|
64
|
+
|
|
65
|
+
|
|
36
66
|
def test_misspelling_perturbation():
|
|
37
67
|
data_augmenter = DataAugmenter(perturbations=[MisspellingPerturbation(prob=1.0)])
|
|
38
68
|
instance: Instance = Instance(
|
|
@@ -91,8 +91,15 @@ class EfficiencyMetric:
|
|
|
91
91
|
window_service: WindowService = WindowServiceFactory.get_window_service(
|
|
92
92
|
adapter_spec.model_deployment, tokenizer_service
|
|
93
93
|
)
|
|
94
|
-
|
|
95
|
-
|
|
94
|
+
|
|
95
|
+
prompt: str
|
|
96
|
+
num_prompt_tokens: int
|
|
97
|
+
if request_state.request.multimodal_prompt is not None:
|
|
98
|
+
prompt = request_state.request.multimodal_prompt.text
|
|
99
|
+
num_prompt_tokens = window_service.get_num_tokens(prompt)
|
|
100
|
+
else:
|
|
101
|
+
prompt = request_state.request.prompt
|
|
102
|
+
num_prompt_tokens = window_service.get_num_tokens(prompt)
|
|
96
103
|
|
|
97
104
|
# Total number of tokens in the completion.
|
|
98
105
|
num_completion_tokens: int = sum([len(completion.tokens) for completion in request_state.result.completions])
|
|
@@ -10,6 +10,7 @@ from helm.benchmark.metrics.metric_service import MetricService
|
|
|
10
10
|
from helm.benchmark.metrics.statistic import Stat
|
|
11
11
|
from helm.benchmark.scenarios.code_scenario import CodeReference
|
|
12
12
|
from helm.benchmark.scenarios.scenario import Reference
|
|
13
|
+
from helm.common.optional_dependencies import handle_module_not_found_error
|
|
13
14
|
from helm.common.request import GeneratedOutput
|
|
14
15
|
from helm.benchmark.scenarios.math_scenario import is_equiv, is_equiv_chain_of_thought
|
|
15
16
|
from nltk.metrics.scores import f_measure
|
|
@@ -21,6 +22,7 @@ import string
|
|
|
21
22
|
from . import code_metrics_helper
|
|
22
23
|
import nltk
|
|
23
24
|
|
|
25
|
+
|
|
24
26
|
try:
|
|
25
27
|
nltk.data.find("tokenizers/punkt")
|
|
26
28
|
except LookupError:
|
|
@@ -188,6 +190,19 @@ def bleu_4(gold: str, pred: str) -> float:
|
|
|
188
190
|
return sentence_bleu([word_tokenize(gold)], word_tokenize(pred), weights=(0, 0, 0, 1))
|
|
189
191
|
|
|
190
192
|
|
|
193
|
+
def cider(gold: str, pred: str) -> float:
|
|
194
|
+
try:
|
|
195
|
+
from pycocoevalcap.cider.cider import Cider
|
|
196
|
+
except ModuleNotFoundError as e:
|
|
197
|
+
handle_module_not_found_error(e, ["vlm"])
|
|
198
|
+
|
|
199
|
+
cider_evaluator = Cider()
|
|
200
|
+
candidate = {"caption": [pred]}
|
|
201
|
+
reference = {"caption": [gold]}
|
|
202
|
+
average_score, _ = cider_evaluator.compute_score(reference, candidate)
|
|
203
|
+
return average_score
|
|
204
|
+
|
|
205
|
+
|
|
191
206
|
def extract_set_from_text(
|
|
192
207
|
set_str: str,
|
|
193
208
|
set_start_str: str = " is ",
|
|
@@ -325,6 +340,7 @@ def compute_reference_metrics(
|
|
|
325
340
|
"math_equiv_chain_of_thought": is_equiv_chain_of_thought,
|
|
326
341
|
"code_eval_acc": code_eval,
|
|
327
342
|
"pass": code_eval,
|
|
343
|
+
"cider": cider,
|
|
328
344
|
"f1_score": f1_score,
|
|
329
345
|
"rouge_1": get_rouge_function("rouge1"),
|
|
330
346
|
"rouge_2": get_rouge_function("rouge2"),
|