crfm-helm 0.5.1__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (98) hide show
  1. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/METADATA +13 -3
  2. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/RECORD +96 -63
  3. helm/benchmark/adaptation/adapter_spec.py +32 -31
  4. helm/benchmark/annotation/air_bench_annotator.py +64 -0
  5. helm/benchmark/annotation/annotator_factory.py +6 -0
  6. helm/benchmark/annotation/live_qa_annotator.py +84 -0
  7. helm/benchmark/annotation/medication_qa_annotator.py +81 -0
  8. helm/benchmark/augmentations/translate_perturbation.py +1 -0
  9. helm/benchmark/huggingface_registration.py +16 -6
  10. helm/benchmark/metrics/air_bench_metrics.py +56 -0
  11. helm/benchmark/metrics/fin_qa_metrics.py +60 -0
  12. helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
  13. helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
  14. helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
  15. helm/benchmark/metrics/live_qa_metrics.py +23 -0
  16. helm/benchmark/metrics/medication_qa_metrics.py +23 -0
  17. helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
  18. helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
  19. helm/benchmark/metrics/unitxt_metrics.py +20 -10
  20. helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
  21. helm/benchmark/metrics/vision_language/image_metrics.py +29 -71
  22. helm/benchmark/presentation/schema.py +54 -4
  23. helm/benchmark/presentation/test_schema.py +11 -0
  24. helm/benchmark/run.py +16 -2
  25. helm/benchmark/run_expander.py +77 -0
  26. helm/benchmark/run_spec_factory.py +4 -0
  27. helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
  28. helm/benchmark/run_specs/classic_run_specs.py +15 -11
  29. helm/benchmark/run_specs/decodingtrust_run_specs.py +3 -1
  30. helm/benchmark/run_specs/experimental_run_specs.py +33 -0
  31. helm/benchmark/run_specs/finance_run_specs.py +33 -0
  32. helm/benchmark/run_specs/vlm_run_specs.py +168 -45
  33. helm/benchmark/scenarios/air_bench_scenario.py +50 -0
  34. helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
  35. helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
  36. helm/benchmark/scenarios/fin_qa_scenario.py +117 -0
  37. helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
  38. helm/benchmark/scenarios/vision_language/bingo_scenario.py +3 -3
  39. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +13 -2
  40. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +1 -5
  41. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +0 -4
  42. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +4 -2
  43. helm/benchmark/scenarios/vision_language/pairs_scenario.py +6 -5
  44. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +3 -3
  45. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +95 -0
  46. helm/benchmark/static/schema_air_bench.yaml +3149 -0
  47. helm/benchmark/static/schema_classic.yaml +3 -59
  48. helm/benchmark/static/schema_finance.yaml +143 -0
  49. helm/benchmark/static/schema_image2structure.yaml +254 -111
  50. helm/benchmark/static/schema_instruction_following.yaml +3 -52
  51. helm/benchmark/static/schema_lite.yaml +3 -61
  52. helm/benchmark/static/schema_medical.yaml +255 -0
  53. helm/benchmark/static/schema_mmlu.yaml +3 -61
  54. helm/benchmark/static/schema_tables.yaml +200 -0
  55. helm/benchmark/static/schema_thai.yaml +223 -0
  56. helm/benchmark/static/schema_unitxt.yaml +3 -61
  57. helm/benchmark/static/{schema_vlm.yaml → schema_vhelm.yaml} +294 -293
  58. helm/benchmark/static/schema_vhelm_lite.yaml +4 -59
  59. helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
  60. helm/benchmark/static_build/assets/index-30dbceba.js +10 -0
  61. helm/benchmark/static_build/assets/index-66b02d40.css +1 -0
  62. helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
  63. helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
  64. helm/benchmark/static_build/index.html +2 -2
  65. helm/clients/anthropic_client.py +43 -9
  66. helm/clients/auto_client.py +11 -0
  67. helm/clients/client.py +24 -7
  68. helm/clients/cohere_client.py +98 -3
  69. helm/clients/huggingface_client.py +71 -12
  70. helm/clients/openai_client.py +9 -2
  71. helm/clients/reka_client.py +189 -0
  72. helm/clients/test_client.py +3 -3
  73. helm/clients/test_huggingface_client.py +19 -3
  74. helm/clients/test_together_client.py +72 -2
  75. helm/clients/together_client.py +129 -23
  76. helm/clients/vertexai_client.py +62 -18
  77. helm/clients/vision_language/huggingface_vlm_client.py +1 -0
  78. helm/clients/vision_language/paligemma_client.py +146 -0
  79. helm/clients/vision_language/palmyra_vision_client.py +84 -0
  80. helm/clients/yi_client.py +31 -0
  81. helm/common/critique_request.py +10 -1
  82. helm/common/images_utils.py +19 -0
  83. helm/config/model_deployments.yaml +412 -18
  84. helm/config/model_metadata.yaml +447 -25
  85. helm/config/tokenizer_configs.yaml +93 -1
  86. helm/proxy/critique/model_critique_client.py +32 -4
  87. helm/proxy/services/server_service.py +1 -1
  88. helm/tokenizers/auto_tokenizer.py +1 -1
  89. helm/tokenizers/cohere_tokenizer.py +44 -2
  90. helm/tokenizers/huggingface_tokenizer.py +36 -13
  91. helm/tokenizers/test_cohere_tokenizer.py +39 -0
  92. helm/tokenizers/test_huggingface_tokenizer.py +5 -1
  93. helm/benchmark/static_build/assets/index-737eef9e.js +0 -10
  94. helm/benchmark/static_build/assets/index-878a1094.css +0 -1
  95. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/LICENSE +0 -0
  96. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/WHEEL +0 -0
  97. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/entry_points.txt +0 -0
  98. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: crfm-helm
3
- Version: 0.5.1
3
+ Version: 0.5.2
4
4
  Summary: Benchmark for language models
5
5
  Home-page: https://github.com/stanford-crfm/helm
6
6
  Author: Stanford CRFM
@@ -70,6 +70,8 @@ Requires-Dist: pypinyin ==0.49.0 ; extra == 'cleva'
70
70
  Requires-Dist: jieba ==0.42.1 ; extra == 'cleva'
71
71
  Requires-Dist: opencc ==1.1.6 ; extra == 'cleva'
72
72
  Requires-Dist: langdetect ==1.0.9 ; extra == 'cleva'
73
+ Provides-Extra: cohere
74
+ Requires-Dist: cohere ~=5.3 ; extra == 'cohere'
73
75
  Provides-Extra: decodingtrust
74
76
  Requires-Dist: fairlearn ~=0.9.0 ; extra == 'decodingtrust'
75
77
  Provides-Extra: dev
@@ -79,7 +81,7 @@ Requires-Dist: black ==24.3.0 ; extra == 'dev'
79
81
  Requires-Dist: mypy ==1.5.1 ; extra == 'dev'
80
82
  Requires-Dist: flake8 ==5.0.4 ; extra == 'dev'
81
83
  Provides-Extra: google
82
- Requires-Dist: google-cloud-aiplatform ~=1.44 ; extra == 'google'
84
+ Requires-Dist: google-cloud-aiplatform ~=1.48 ; extra == 'google'
83
85
  Provides-Extra: heim
84
86
  Requires-Dist: gdown ~=4.4.0 ; extra == 'heim'
85
87
  Requires-Dist: diffusers ~=0.24.0 ; extra == 'heim'
@@ -133,24 +135,31 @@ Requires-Dist: crfm-helm[aleph-alpha] ; extra == 'models'
133
135
  Requires-Dist: crfm-helm[allenai] ; extra == 'models'
134
136
  Requires-Dist: crfm-helm[amazon] ; extra == 'models'
135
137
  Requires-Dist: crfm-helm[anthropic] ; extra == 'models'
138
+ Requires-Dist: crfm-helm[cohere] ; extra == 'models'
136
139
  Requires-Dist: crfm-helm[google] ; extra == 'models'
137
140
  Requires-Dist: crfm-helm[mistral] ; extra == 'models'
138
141
  Requires-Dist: crfm-helm[openai] ; extra == 'models'
142
+ Requires-Dist: crfm-helm[reka] ; extra == 'models'
139
143
  Requires-Dist: crfm-helm[together] ; extra == 'models'
140
144
  Requires-Dist: crfm-helm[tsinghua] ; extra == 'models'
141
145
  Requires-Dist: crfm-helm[yandex] ; extra == 'models'
146
+ Requires-Dist: crfm-helm[openvino] ; extra == 'models'
142
147
  Provides-Extra: mongo
143
148
  Requires-Dist: pymongo ~=4.2 ; extra == 'mongo'
144
149
  Provides-Extra: openai
145
150
  Requires-Dist: openai ~=1.0 ; extra == 'openai'
146
- Requires-Dist: tiktoken ~=0.3.3 ; extra == 'openai'
151
+ Requires-Dist: tiktoken ~=0.7 ; extra == 'openai'
147
152
  Requires-Dist: pydantic ~=2.0 ; extra == 'openai'
153
+ Provides-Extra: openvino
154
+ Requires-Dist: optimum[openvino] ~=1.19 ; extra == 'openvino'
148
155
  Provides-Extra: plots
149
156
  Requires-Dist: colorcet ~=3.0.1 ; extra == 'plots'
150
157
  Requires-Dist: matplotlib ~=3.6.0 ; extra == 'plots'
151
158
  Requires-Dist: seaborn ~=0.11.0 ; extra == 'plots'
152
159
  Provides-Extra: proxy-server
153
160
  Requires-Dist: gunicorn ~=20.1.0 ; extra == 'proxy-server'
161
+ Provides-Extra: reka
162
+ Requires-Dist: reka-api ~=2.0.0 ; extra == 'reka'
154
163
  Provides-Extra: scenarios
155
164
  Requires-Dist: gdown ~=4.4.0 ; extra == 'scenarios'
156
165
  Requires-Dist: sympy ~=1.11.1 ; extra == 'scenarios'
@@ -174,6 +183,7 @@ Requires-Dist: torch ~=2.1.2 ; extra == 'vlm'
174
183
  Requires-Dist: transformers-stream-generator ~=0.0.4 ; extra == 'vlm'
175
184
  Requires-Dist: scipy ~=1.10 ; extra == 'vlm'
176
185
  Requires-Dist: torchvision <3.0.0,>=0.14.1 ; extra == 'vlm'
186
+ Requires-Dist: crfm-helm[reka] ; extra == 'vlm'
177
187
  Requires-Dist: crfm-helm[images] ; extra == 'vlm'
178
188
  Requires-Dist: crfm-helm[image2structure] ; extra == 'vlm'
179
189
  Requires-Dist: pycocoevalcap ~=1.2 ; extra == 'vlm'
@@ -5,14 +5,14 @@ helm/benchmark/annotation_executor.py,sha256=ZJCc5xT8E0E6gux8dq3HPS4YzQs2YPCNl4g
5
5
  helm/benchmark/config_registry.py,sha256=Cd25a8FHriUzAgvGGU5sBAPyhisdSIjdUJR4YbYs6T4,1603
6
6
  helm/benchmark/data_preprocessor.py,sha256=aNdM-o2t4qkLIQHiQeWUFg03DjjJ8HTBIphYCK8pXVo,2173
7
7
  helm/benchmark/executor.py,sha256=simd7SdJ7TciUpoq3D0uz_XUSCZj5KIWCIP57FYm4js,4906
8
- helm/benchmark/huggingface_registration.py,sha256=RzfOaLAnzAcoTphan1JNo836lNyxMSH67oQlolhNLS0,4154
8
+ helm/benchmark/huggingface_registration.py,sha256=unEBO21V8K3-Ya0xLqjO9H1oq7RmU-f1MYV0tCIbXzY,4578
9
9
  helm/benchmark/model_deployment_registry.py,sha256=BjL0ghHgO7_Z5jZZ7kuSOj9saegI3BivaL-b699C0rc,9527
10
10
  helm/benchmark/model_metadata_registry.py,sha256=fXRJOLUIrLOHUG5duncEqhnpmfb9hyloUlGbOM2L9ds,8194
11
11
  helm/benchmark/multi_gpu_runner.py,sha256=WmTKpVfcKXyiiPzrmxpbvQoZy0Ua8IyPgxB8r_3jrRw,4773
12
- helm/benchmark/run.py,sha256=tF_aWy5GtfwBOT1ZRKWrcI74VpFWGzlR00EKiGG7zyI,12572
13
- helm/benchmark/run_expander.py,sha256=jolEPDrB4lL_VJNRpT1SQta6DZ_xyq2HaIfWHdeyNtA,47785
12
+ helm/benchmark/run.py,sha256=WNj10uNCqxwS2pCmt_s5Bn_JIC-NItEjK1PyQl9SXmo,13193
13
+ helm/benchmark/run_expander.py,sha256=sWfcL0caHTsp1NqqsGrG-fZaIbScY8LECJqQMVIPZtE,51191
14
14
  helm/benchmark/run_spec.py,sha256=GiIU8iGO2FGYFDWIxt51CeNPsW7rM7BzDqH1KgEL1cg,3217
15
- helm/benchmark/run_spec_factory.py,sha256=nRP9737niPReD5G7t9fgyQ8_EUQ1hvg2VBQe5rSZ08Y,6816
15
+ helm/benchmark/run_spec_factory.py,sha256=hp29n_Stb7RMwRm2jrP_qpyzxi8X8ojdqXTFN3KRSiY,6978
16
16
  helm/benchmark/runner.py,sha256=zlHDJ2Ys5-HxtXcwpkXcrdfXy_i886fBcq1iNeLyC3Q,14669
17
17
  helm/benchmark/runner_config_registry.py,sha256=2gW5wBLkHdYb2WNbZulto06hTcto2ROvjy8HULw3jNM,515
18
18
  helm/benchmark/server.py,sha256=ysd5MT1TDu65NH-OzIGf9wmZlr8FHNRwoy2ybjSc5Yk,6140
@@ -22,7 +22,7 @@ helm/benchmark/test_data_preprocessor.py,sha256=_esdtkqyU_8Yp5ZOO7n1b-Y4Qc28wpD5
22
22
  helm/benchmark/test_run_expander.py,sha256=gLeHkNt_nLgbwEJiYxhwda-eKA3sJAxkYolCvgRN5TY,1163
23
23
  helm/benchmark/tokenizer_config_registry.py,sha256=ZOImg38ta0FXZYAWna6q7A5xrG2mU7Ofr-8j4EqGlUY,1585
24
24
  helm/benchmark/adaptation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
25
- helm/benchmark/adaptation/adapter_spec.py,sha256=tZ40ovgNkRsxDOHan4lcD8ukutA1QPsoZUF5XOHq-VA,4382
25
+ helm/benchmark/adaptation/adapter_spec.py,sha256=K5BwqTe2iimjswdw_SONlJo0xt-T-o5KH7VqxrPaov0,5072
26
26
  helm/benchmark/adaptation/common_adapter_specs.py,sha256=-ILsVxWjpEE6an1ncrRRrLkdP5ky_-2GN1TxSxJo38M,10449
27
27
  helm/benchmark/adaptation/prompt.py,sha256=n0Ka3RGSWMr3CBnJrPNPy626x9TJE3k677wKbG8hO9A,2133
28
28
  helm/benchmark/adaptation/request_state.py,sha256=WAPyubn35on-Ry7xKpXsVz3wYBMCMc_LidDOdcKxatI,3053
@@ -49,8 +49,11 @@ helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_a
49
49
  helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py,sha256=VjSqWiZEcW6K2jrokGUmky7syEOqJ6cbHImR7YZgwzU,10151
50
50
  helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py,sha256=KKOOlna6SHLJHSPgfgguPQysc2Nf4kKrqumqwlG27bs,3542
51
51
  helm/benchmark/annotation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
52
+ helm/benchmark/annotation/air_bench_annotator.py,sha256=9W3zLO2f4OzxGdavkDI2dDUStxpExa7sgrI-ATGG7NY,3048
52
53
  helm/benchmark/annotation/annotator.py,sha256=2UIXY71S5dRaZBLb1v4lcv8-O6pyJ9zTeSJl78AEWGI,1538
53
- helm/benchmark/annotation/annotator_factory.py,sha256=z5AGBylIuy-_IfgikX66VyGvRz4SxtnOcJsyESH8990,2699
54
+ helm/benchmark/annotation/annotator_factory.py,sha256=3Soh0V3lbsIR_HGHLg-XTc3eKVRj7SL9lLT_AoqUVTs,2997
55
+ helm/benchmark/annotation/live_qa_annotator.py,sha256=IlUV4K-ddbL1XsvIgBAfsLH0_bdKx8kyDev1G3Kwyek,4364
56
+ helm/benchmark/annotation/medication_qa_annotator.py,sha256=7LRmx2a1JODP5puAM0IH0HFTextfeLOzK7ef4sw9XIU,4129
54
57
  helm/benchmark/annotation/test_annotator_factory.py,sha256=ifv5hxSbFe113AHeXLqTPkVJ-C2PW_gb9L3a0SHNi-M,986
55
58
  helm/benchmark/annotation/test_dummy_annotator.py,sha256=LfY1ErJDUJ7rD8JUy92RUDD1b91jUs4Nk8Gvope-Z98,1644
56
59
  helm/benchmark/annotation/image2structure/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -78,7 +81,7 @@ helm/benchmark/augmentations/space_perturbation.py,sha256=g4rbyoureBaOVf_lrRXIWY
78
81
  helm/benchmark/augmentations/suffix_perturbation.py,sha256=P3AfJj_ajTVdjO7AJRQ9dKS-cT1PyRSt8Un57iZQDVc,785
79
82
  helm/benchmark/augmentations/synonym_perturbation.py,sha256=komOV5M342_8unopnwN6gkPWpJIZXidywiu6PO9_riU,4151
80
83
  helm/benchmark/augmentations/test_perturbation.py,sha256=4EooKVcyub70I81trzpNx3Ij-m1vpFa5cFIo6O52icE,13185
81
- helm/benchmark/augmentations/translate_perturbation.py,sha256=dn8wO5UOgYbGtP9e77SmwaK2ginrQsTw-79nrzRzfeo,1054
84
+ helm/benchmark/augmentations/translate_perturbation.py,sha256=vMXCYXGVSo8E78IAzH9HI4p2pvyLzcvO77BnvR2QB0k,1097
82
85
  helm/benchmark/augmentations/typos_perturbation.py,sha256=_F9zwvrLie8hX7mzUtQmYq6oq6yqaFiKGsvc9LAuBr4,2798
83
86
  helm/benchmark/data_overlap/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
84
87
  helm/benchmark/data_overlap/data_overlap_spec.py,sha256=aj_l1l0qxUbUMrSWr70-Sb1j_JN-7WYop5BXPG_xj44,1998
@@ -88,6 +91,7 @@ helm/benchmark/efficiency_data/inference_denoised_runtimes.json,sha256=ios_dt-_8
88
91
  helm/benchmark/efficiency_data/inference_idealized_runtimes.json,sha256=5w7reeZc0yc4cjH8kJGxQQSoe8yaRVX2SSlSrx0QWFQ,12348
89
92
  helm/benchmark/efficiency_data/training_efficiency.json,sha256=aH2moiBLStOLVi8Ci2KTK5ZkWlTBLK-B3fRfNZwhoSg,9763
90
93
  helm/benchmark/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
94
+ helm/benchmark/metrics/air_bench_metrics.py,sha256=VMNQDDEtz2CiK4U55lCHLz0b_DxHprTAZ1WtYtGXjcY,2282
91
95
  helm/benchmark/metrics/basic_metrics.py,sha256=7hk5PZL7d09uG1y7wHBhY_ox8hlXw-n7Yt_FDv_AIKw,20375
92
96
  helm/benchmark/metrics/bbq_metrics.py,sha256=Dqccr7GdfKNs1S_1QSB75d8AY7moovEPAqvacGfrCAE,6157
93
97
  helm/benchmark/metrics/bias_metrics.py,sha256=GQ4CwOk1Sa9g-LcJCxcoQLD1vWY2Hvujck9l-9qsmf4,11418
@@ -109,16 +113,23 @@ helm/benchmark/metrics/dry_run_metrics.py,sha256=d8RgltW4nGTH1tZeGOIlQRwRaJLIxL6
109
113
  helm/benchmark/metrics/efficiency_metrics.py,sha256=v8Eg56HHIWEMQruODKBvwdUfR6ZLGgrNifo-senCaUo,11786
110
114
  helm/benchmark/metrics/evaluate_instances_metric.py,sha256=EBUf0ONnNoi7pcxYab7RD0B_JqGksqDX8TOaosSmJk8,2847
111
115
  helm/benchmark/metrics/evaluate_reference_metrics.py,sha256=vUJavaLVfbWtrwyrIA81npK_1iirhko7_zMF1kL7Gfw,15559
112
- helm/benchmark/metrics/instruction_following_critique_metrics.py,sha256=Pj1itUJi_KDy0D-FOPcOyHqm4ypHMfhbAVeDJzGlyeo,9773
116
+ helm/benchmark/metrics/fin_qa_metrics.py,sha256=MtXxGMGYiCiwCD1CclBXPopzly-Tz3zJTrXJaHYTXn4,2470
117
+ helm/benchmark/metrics/fin_qa_metrics_helper.py,sha256=sH5FIpsxxGUkXO21YGS2EtVsev1EdQ44lYoqFZPSSGo,11884
118
+ helm/benchmark/metrics/gpt4v_originality_critique_metrics.py,sha256=1m7IWy9vu66svnmdBRjZQI-2YsGYzH2vXZMptlRGM0Y,5654
119
+ helm/benchmark/metrics/instruction_following_critique_metrics.py,sha256=QJxGzyERQv_vMn3PM9fy3IxfBgSg0BjcOf_mv574lGA,9786
113
120
  helm/benchmark/metrics/language_modeling_metrics.py,sha256=ofqwj1PMJQu16QhLDULXBmZ5iFz91ducwLRpNsRYELE,4510
121
+ helm/benchmark/metrics/live_qa_metrics.py,sha256=f2XFmQaohjQNqYqNg8NcDVavCzyP4cd8Cl8rLArn9EM,816
114
122
  helm/benchmark/metrics/machine_translation_metrics.py,sha256=bp_EDXyxntIty5gORDa7va-C73quOzoTc5o8MpxFmL4,3816
123
+ helm/benchmark/metrics/medication_qa_metrics.py,sha256=Z939iAc0A5xn_GdnCtfiefhUZK9qk6jZjtde2-F7IH8,840
115
124
  helm/benchmark/metrics/metric.py,sha256=dPq7ZMB0w-LgJKMzWYDJtfn-oYD4oG4jJX0yiUEziJM,14245
116
125
  helm/benchmark/metrics/metric_name.py,sha256=POhgmUqqIWh_LjCbYpiKkzGqqChBLeW3FADy9u_FcWw,1354
117
126
  helm/benchmark/metrics/metric_service.py,sha256=mlX_MEFSYNzME6GFS3El_VVOvzPYnOMosKI0XIxygP4,1802
118
127
  helm/benchmark/metrics/numeracy_metrics.py,sha256=panMWD3a1NPerg3Ix7l6NhR7jGOIQOQV9i_KysBeDA8,2818
119
128
  helm/benchmark/metrics/paraphrase_generation_metrics.py,sha256=-VkAknRhAEBmC_lpz_1aeXU8OppL8KfEPtIYCJkHTmw,1981
129
+ helm/benchmark/metrics/prometheus_vision_critique_metrics.py,sha256=pexBbEFF3-bzWoPWNFuVs-3fm7XJw2EC4xgiSb3gSa4,8508
120
130
  helm/benchmark/metrics/ranking_metrics.py,sha256=5hDRapsxx_cmo-ag_80kOQnrgZn3lfVsLZVtWxuxH-s,17391
121
131
  helm/benchmark/metrics/reference_metric.py,sha256=RlIM_PFTEkBo0_EEMq8d4_BSagNSBR_XyovMtjDeqqU,6026
132
+ helm/benchmark/metrics/reka_vibe_critique_metrics.py,sha256=CwzzQ13bBT0r_o75TqFj2Zr0ST9vzQi74K_ezWTnLCU,6568
122
133
  helm/benchmark/metrics/statistic.py,sha256=FuxNxMtAfiCkOxBS9KHlhEyxe61e0YXt2emvsufgPZQ,3424
123
134
  helm/benchmark/metrics/summarization_critique_metrics.py,sha256=Lf7PDuce62HDzyofsyxaOvH0QvzcaS-vJvDWtIs8xKk,4694
124
135
  helm/benchmark/metrics/summarization_metrics.py,sha256=laLMGRDy1wjcFvgSWXvzOZwBXshkmPr0S2Ofu79Z01Q,16461
@@ -131,7 +142,7 @@ helm/benchmark/metrics/test_numeracy_metrics.py,sha256=ls1ZIHDePKpHMoqAbf4HmJ1SI
131
142
  helm/benchmark/metrics/test_statistic.py,sha256=AejuYLSeUwEOqpEMRKZFjnxu4HKUraeExU8TPmZEqW4,1229
132
143
  helm/benchmark/metrics/toxicity_metrics.py,sha256=6MCpHuCXbXZqWwvO57ifKYHnHWBzszN9cZjwgPQQF2Y,4027
133
144
  helm/benchmark/metrics/toxicity_utils.py,sha256=-bfittLtMkHyV5wu-hj6KVtaiNGgVIO5duUmThBlX8w,988
134
- helm/benchmark/metrics/unitxt_metrics.py,sha256=5rw_fBQGWpFLr1nR4HcRlAwYvDZfJ6_MzGozzNo5NOA,3605
145
+ helm/benchmark/metrics/unitxt_metrics.py,sha256=2F9T4iQV0_BbDMCWrZrd9sc30XHYv8MR4xSBd_dD3eI,4053
135
146
  helm/benchmark/metrics/image_generation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
136
147
  helm/benchmark/metrics/image_generation/aesthetics_metrics.py,sha256=AXQjWBd9zBZOoCF8vQV9FjUy33teC0IF7pdbq-XiHjM,2101
137
148
  helm/benchmark/metrics/image_generation/aesthetics_scorer.py,sha256=ISdThDKMrx-SHQe69dCcr8qUrMCa_GsxX3BeZnd0WPA,2538
@@ -180,32 +191,37 @@ helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py,sha256=l9UQZ0aAI
180
191
  helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py,sha256=_wJ3E3LbJB9XPLixTH82BYQbp32o3oij6Sz3lsZL30E,2648
181
192
  helm/benchmark/metrics/tokens/token_cost_estimator.py,sha256=fTGUfhHV6yMwpTkCEMTGMxKO8jskqJz4sAtwXT6M_C8,425
182
193
  helm/benchmark/metrics/vision_language/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
183
- helm/benchmark/metrics/vision_language/emd_utils.py,sha256=3yN-DY5rxMabmtLV003lj59SRnp_T83sLAi96rycKEo,15043
184
- helm/benchmark/metrics/vision_language/image_metrics.py,sha256=aJ3zrVOLJJzdVKqXPcFsCXp9LSHET8VGEgtvwK-nkJc,25190
194
+ helm/benchmark/metrics/vision_language/emd_utils.py,sha256=KdZdcqu3eo016FdAjAm_83v92-wWuR90EPsTogfTcok,15196
195
+ helm/benchmark/metrics/vision_language/image_metrics.py,sha256=HyXeZiDszSV1Q99ScqeS_xYvyrp1dlWBYahfxt42N3E,23554
185
196
  helm/benchmark/metrics/vision_language/image_utils.py,sha256=XeYF3E6MnYyPJ5hYp4TtiTP27-y4S8LTBH5bZVcvJFg,3758
186
197
  helm/benchmark/presentation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
187
198
  helm/benchmark/presentation/contamination.py,sha256=PiIdcaD3-xfExjOmyL5q4Ao2ASa-OlScJAB9u1Zxe7o,2811
188
199
  helm/benchmark/presentation/create_plots.py,sha256=2-ZOuEdRwqqF1biRmzWggMZjmODoxOQOBoz9GT7tVww,28737
189
200
  helm/benchmark/presentation/run_display.py,sha256=tC1DciLvDTQJog4BDo8StWDdX7DbBkhrG2sX_SwXSPQ,11838
190
201
  helm/benchmark/presentation/run_entry.py,sha256=J1QgLOP99N7N4bs7nzXWxyU3pOd-a1j8xwL9ag1nP_Y,1158
191
- helm/benchmark/presentation/schema.py,sha256=pOwHCLvAC1Nh6vh48HV83gb7T7WREkifvo4qdovFdv4,8511
202
+ helm/benchmark/presentation/schema.py,sha256=fPw-794HbacZR5z1SmYGUqYgqXbZ8-BrcexWV4h6vgc,10809
192
203
  helm/benchmark/presentation/summarize.py,sha256=2fJ9BYOJRxe9eBylLUK3qcZZwAwRtJF_C8plEQlAPEU,67266
193
204
  helm/benchmark/presentation/table.py,sha256=-foH1BIfMiD6YvpwoGJ910CH7Hib-_pYtHH1hE8zwNc,2904
194
205
  helm/benchmark/presentation/test_contamination.py,sha256=RlihBOF6vx2tKEj6_EMnJojTYoStx0FUeJSLT1bdf8w,509
195
206
  helm/benchmark/presentation/test_create_plots.py,sha256=5PPPegMTdBZurxyyUxI4rN13AVsjV3eQrwFqlobJ8UA,1286
196
207
  helm/benchmark/presentation/test_run_entry.py,sha256=OM-027j2A0Lx-ai2zBprOxSqzZhS_dh0OKw3ThocZW0,751
208
+ helm/benchmark/presentation/test_schema.py,sha256=6mq6CeAOLW2Kxi1lX_ZW8QCVqVR73XImR8ylcRGFkBE,378
197
209
  helm/benchmark/presentation/test_summarize.py,sha256=UfSp33Q9xvuGnPYfFmLJdH5y7KWp9qbZprRMyx8LGP0,1618
198
210
  helm/benchmark/run_specs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
199
- helm/benchmark/run_specs/classic_run_specs.py,sha256=YKrjuuPXoVjUT6XGEtyouPHkkE0XfR6u2xHQDRqpNyA,57972
211
+ helm/benchmark/run_specs/air_bench_run_specs.py,sha256=VdXis1HN8_KLrMHDCVi0J7WdqjRjAGbZMhrsnpzC-Kg,1604
212
+ helm/benchmark/run_specs/classic_run_specs.py,sha256=Cn0z-6QY-ehbLaHJMvCwjw11DFBQgUyqVCaXwTVFyJ8,58331
200
213
  helm/benchmark/run_specs/cleva_run_specs.py,sha256=lEIHEqQY3Efx-sl2Z6Rq9Qq_1HEWHqFYuUkZbGvq66s,13387
201
- helm/benchmark/run_specs/decodingtrust_run_specs.py,sha256=D5g_--eFOI6-hy6fv9JNj_X4DHU03prKA5GZjlqaoRk,14254
214
+ helm/benchmark/run_specs/decodingtrust_run_specs.py,sha256=fDyIxmOdgLLWVtwBfxcnd3nFnBZNFpJHbcM4Kyq5gZA,14315
215
+ helm/benchmark/run_specs/experimental_run_specs.py,sha256=7aF-Ox8iBC2obfJkyKwobJaCjk1SqxtSDuRv_RxA3Eo,1310
216
+ helm/benchmark/run_specs/finance_run_specs.py,sha256=7DCmeBQpETQjK0fvUKS1nDIbM_wxTXb2GhXcjzIDyIE,1181
202
217
  helm/benchmark/run_specs/heim_run_specs.py,sha256=Pt1eVbzvwZ5EXq8WB2b3XYw62SWYN_i1P_H3oE4i8KY,22096
203
218
  helm/benchmark/run_specs/instruction_following_run_specs.py,sha256=GElJhgbQhlZMYSAM4YyGcYq0pqycR32kBCoHqG6m-ZY,4177
204
219
  helm/benchmark/run_specs/lite_run_specs.py,sha256=ViCPJ86Aah8301GTEk6z4_MtP0g8iik33t4GudobhWQ,11113
205
220
  helm/benchmark/run_specs/simple_run_specs.py,sha256=0kK_e8U4JUWZ6wO4N-GPFRE1iGT4ilvSMUGfirvpIE0,3837
206
221
  helm/benchmark/run_specs/unitxt_run_specs.py,sha256=ejp_knrcIjf0J4WiKj9LTgDTcUr29-XFZYHYz0w_dkM,1518
207
- helm/benchmark/run_specs/vlm_run_specs.py,sha256=CmdyEF-pdFIlMhBV7UraQ0FuQgQl2rqVSdTz22uYuPQ,26808
222
+ helm/benchmark/run_specs/vlm_run_specs.py,sha256=uwnk9DHZKQj8nnC14kGiSN8xKiZfpigoz5S86TiHc4k,31118
208
223
  helm/benchmark/scenarios/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
224
+ helm/benchmark/scenarios/air_bench_scenario.py,sha256=WUZvsUTqlsjNzQsd2baZZIgO30B4Zf3g0QjsyEaGmLc,1772
209
225
  helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py,sha256=Wyt7J5BAvAqC5JTqCW4fh7ex9-itX11P_9rLTocqvtk,4973
210
226
  helm/benchmark/scenarios/babi_qa_scenario.py,sha256=S1tPQY2x1I3hQL1JQ6wvUwvKyiSe7SqpRSW6N3_T0mo,5043
211
227
  helm/benchmark/scenarios/bbq_scenario.py,sha256=lT1XKSM-PXYtENI-ryScC4yb1TtII7YoH8kt_S1dZQo,9579
@@ -213,6 +229,7 @@ helm/benchmark/scenarios/big_bench_scenario.py,sha256=bSk8Ia4u_6OqMjiyadpYQAWN-8
213
229
  helm/benchmark/scenarios/blimp_scenario.py,sha256=o1MDcHT14KFDET4K9otx8pDiIgXrhsD19pvO0mR2ADU,6260
214
230
  helm/benchmark/scenarios/bold_scenario.py,sha256=NEfECMVzlVP_yo6sOuIzj6vZ5jd72_nvtEQ1lWrq85Q,4106
215
231
  helm/benchmark/scenarios/boolq_scenario.py,sha256=rvSp5SwXMCVzBo5BFxfhj1Xv06_ksqKrtTQR7nPiS-o,8013
232
+ helm/benchmark/scenarios/ci_mcqa_scenario.py,sha256=slZZT74QI3OMQAgT-ybcR_xVcRDoopXw6mMu4iy3XCY,3074
216
233
  helm/benchmark/scenarios/civil_comments_scenario.py,sha256=VO5G-cQ9qctmBN0O76uSewnO_mFslMo5mbR2ZTrjuds,4851
217
234
  helm/benchmark/scenarios/cleva_scenario.py,sha256=xhwZ616iz0CN3fYIfrXHcV1XlcRQjyPSzML8fq8D3l4,57939
218
235
  helm/benchmark/scenarios/code_scenario.py,sha256=s4AGW8eBY0gFnu6EXvVWL0xbFYO28N9sgP1V8eBO7EI,12171
@@ -233,9 +250,10 @@ helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py,sha256=AI8HX
233
250
  helm/benchmark/scenarios/dialogue_scenarios.py,sha256=-I7FY6q1b11zpFd1_oAgar5qlfaFcXsNCKGVln9etPI,5629
234
251
  helm/benchmark/scenarios/disinformation_scenario.py,sha256=kQi0MVVoSDhx2vOTnUaCIttPXMf8zz7Eld2FD_77tnA,8504
235
252
  helm/benchmark/scenarios/dyck_language_scenario.py,sha256=vMxND9wPJenrGlCLhSw5UxOw3TV2Jq8cTmIXGpzEWaA,9318
236
- helm/benchmark/scenarios/entity_data_imputation_scenario.py,sha256=n2mnkmSeTznEy7S-GVumqpD9bt27yctbuEmtgQrG-Y0,6399
253
+ helm/benchmark/scenarios/entity_data_imputation_scenario.py,sha256=4cv7u2lmUFcigkAX_eMwIn49Pa3p-aHClkT-r-0roLU,6616
237
254
  helm/benchmark/scenarios/entity_matching_scenario.py,sha256=YjBX61TlL3CDQ3X6D-JyR-qlOYGLdoRXJxl9AEeqxYs,7022
238
255
  helm/benchmark/scenarios/entity_matching_scenario_fixed_random_state.py,sha256=TklbX7Kx4y-estV-YHUbI5O08q2qCZRrOmX9D3gZS9c,2193
256
+ helm/benchmark/scenarios/fin_qa_scenario.py,sha256=pXUeJ34KiRSlEjYERgXqVSbr7zxvdXnOuMSpXvnUw5I,5782
239
257
  helm/benchmark/scenarios/grammar.py,sha256=Pb9vEP_0Ki87UdQCj1ym7QWJ24M4DRP6TXB5d3GnhLs,5597
240
258
  helm/benchmark/scenarios/grammar_scenario.py,sha256=bl-Cm9caDs077zSu38mzaS9maZ2gM-QazgjOEMFvxYg,1454
241
259
  helm/benchmark/scenarios/gsm_scenario.py,sha256=9fV2SEw3ocKNAD-TrDZZTpq4l7mbttQQWbO0YNz4e6k,2613
@@ -279,6 +297,7 @@ helm/benchmark/scenarios/summarization_scenario.py,sha256=MlNMgsY369DC04nhMUdG2o
279
297
  helm/benchmark/scenarios/synthetic_efficiency_scenario.py,sha256=pzifpsJJbucmTjujNqQnwQa4Y7wpQjkS6QjNXOrgTAQ,3096
280
298
  helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py,sha256=1b3e3WpFMNBV3li17-0Ug6QCSKO4qRFaWDF23bYNsvQ,16326
281
299
  helm/benchmark/scenarios/synthetic_reasoning_scenario.py,sha256=k8IGK6VABOr6wuha4HynP47peoAkmIViAVhScOtCANo,8345
300
+ helm/benchmark/scenarios/test_air_bench_scenario.py,sha256=9o92CK57xxgPaA9Xt9uJPPie4Cxllzq-KbMt3G35UQ0,1320
282
301
  helm/benchmark/scenarios/test_grammar.py,sha256=sPlA36sHpThbXgnGlXyOuqHfDPe2epIafmzIeL0nkoU,1364
283
302
  helm/benchmark/scenarios/test_math_scenario.py,sha256=s3-CllgCB8DL9-L4DmJ6Zcf9xi803nWYN84KlhN7PhM,1016
284
303
  helm/benchmark/scenarios/test_scenario.py,sha256=HexTZBKphMDJbhIYj-HRCDwltPTDqHFHdT7FjPmu8Xs,2070
@@ -314,7 +333,7 @@ helm/benchmark/scenarios/image_generation/time_most_significant_historical_figur
314
333
  helm/benchmark/scenarios/image_generation/winoground_scenario.py,sha256=E2xPQNQzylDSmqLjjMkQB8D7A6g7bzqtSF4bXPgfVbI,2889
315
334
  helm/benchmark/scenarios/vision_language/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
316
335
  helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py,sha256=zXR0LmXsD2tv_ovJsbY_HP53kdiFOvty7Y_Ai3ZCrT4,3037
317
- helm/benchmark/scenarios/vision_language/bingo_scenario.py,sha256=LiH14xUoEKXn5ZStDbGE4bz9iMEn3-5I39eJ6kvN2UY,4045
336
+ helm/benchmark/scenarios/vision_language/bingo_scenario.py,sha256=jwGEouY30Yy5U9lRUbv0XAO98gUJ669g0dhdDCGQ-8w,4097
318
337
  helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py,sha256=82qplX4gJ4GsSVhBjwrsVU46TAHh-jym3F_M5A-odRE,4608
319
338
  helm/benchmark/scenarios/vision_language/flickr30k_scenario.py,sha256=3pBAQgOsnSyMCzt60s1m8Kf_fEJ4C7XgCDbtXatTlX0,2599
320
339
  helm/benchmark/scenarios/vision_language/gqa_scenario.py,sha256=sBQfqAxmP-Z0ifCgwTbP11aPsKA4vogcWBqSDiKlbE4,3512
@@ -329,19 +348,20 @@ helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py,sha256=HU
329
348
  helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py,sha256=c7YfclYMDtygsLnEfA8oP6Vl7evdrqqTZazmuD9Oy-8,5353
330
349
  helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py,sha256=HuizbYsN5Nlihfzu4bfGuC8KSBbeIc6TVknMS4kpVJY,7149
331
350
  helm/benchmark/scenarios/vision_language/originality_scenario.py,sha256=1inr-klQEz08CM2GWqbYdy-AuXQmMhOAywAlA0lJHik,1029
332
- helm/benchmark/scenarios/vision_language/pairs_scenario.py,sha256=rkPR_e_RWOeSyHIlSJGJ5lVu5DD-AR3x686XYJse-1E,9885
351
+ helm/benchmark/scenarios/vision_language/pairs_scenario.py,sha256=wVcTNUql4TBClgm7oyLq5cmybsnlurc0MblqRRxXRyc,9929
333
352
  helm/benchmark/scenarios/vision_language/pope_scenario.py,sha256=uFkzMMsjhmuSYo3v_QdfJFX6RFse83JjzMfMa3ynvV4,3975
334
353
  helm/benchmark/scenarios/vision_language/seed_bench_scenario.py,sha256=5MwGb9BOyB2Xy70BGYZcjencf0ZskxBuzcPa7ABRuww,5106
335
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py,sha256=e3lCq2nevy9tIFDDKEbJvmLibfk4UMQtAIyzrgnnaZs,4179
354
+ helm/benchmark/scenarios/vision_language/unicorn_scenario.py,sha256=bH5FfAgwyzpVMPOJKNCmOgpX-lvJF-B42uVi4m1mY-I,4231
355
+ helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py,sha256=2foCM7ik9RvYahauKIoNAxkGiluOYuT0w0r7FZi-MQo,3621
336
356
  helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py,sha256=hJ3sOSpPnOCwLtpVnfasI_X89oofI-2PBRjMnx8eiVA,4139
337
357
  helm/benchmark/scenarios/vision_language/vqa_scenario.py,sha256=2hY-qngKC69ZL9SHNei3IK3C2PvJDWvwLFVQ8yNSOVs,5196
338
358
  helm/benchmark/scenarios/vision_language/image2structure/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
339
359
  helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py,sha256=ImhfiC_y_hihAGvlj9zRsaoW614QFCBopBD2KxnbSs0,1805
340
- helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py,sha256=-eWRwo2x7kR46Z_I4vFbVlbqA_1f2UEb75Dx84XTlNE,9028
341
- helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py,sha256=FKKybU4IeglwXCj6GZC8cAUs_GOU7ymEa6P1dkDT7uw,1350
342
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py,sha256=SyAYkhsipjJG42XfM9sljz1vly5YF-dbSEWTj_dEHIU,1048
360
+ helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py,sha256=uDYN10CuXWXvgZ2BYNxlTmBsdfPNlK9G9e_VMGDKvA4,9400
361
+ helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py,sha256=RSLYpw3BsIIxkhS-6RfVM_UhjmwJDMoA3JQl3FBjv7I,1147
362
+ helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py,sha256=_pgW_aNaM3E7MTl_tNExupvENdtAH3DvZuSwZIiopCg,837
343
363
  helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py,sha256=ovg8-FfJ8_I1xbajFGSLvERZIA1fQjaUn0zd04ZbI84,15316
344
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py,sha256=j2bDYeWdytYtkKskvuTMwLEIIqELDJJ6D2jdYzmdlJY,9628
364
+ helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py,sha256=dOt-gif-4Z0JekI2KAel4KS1zyvzqyqoFLP3xoe5DKY,9710
345
365
  helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
346
366
  helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py,sha256=i-i0mlG5oRRDNYNqP7o7Ul56iL02p_anJoThXaSvFiM,2826
347
367
  helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py,sha256=9WntahzuhVv54IH1m7_z0IxwLma3dbaMOne_pUx751Y,7652
@@ -355,14 +375,19 @@ helm/benchmark/static/index.html,sha256=xIJGjMg0qn9eemfdBiNbTI0jzPfBD5x0v8HJF-dM
355
375
  helm/benchmark/static/info-icon.png,sha256=P-PW3Ek3NGiRAW5BXOjJRPBfMVqprjAqtQheGWu7zNI,3428
356
376
  helm/benchmark/static/json-urls.js,sha256=AaULgfHw8OLfrQLJpBHfcC013uavQnlNNFS9vzb0qOg,1981
357
377
  helm/benchmark/static/plot-captions.js,sha256=bTR8gYx-QqF_RJyKX-L-eQP7hSEtawfJSoADCvgjKag,3011
358
- helm/benchmark/static/schema_classic.yaml,sha256=p-yc2WMfyGehRtD7L5ZZHbFMMQovu2HNfvct3tBlV2I,108168
359
- helm/benchmark/static/schema_image2structure.yaml,sha256=gig7HVyJWSwcHa96mf-09e68_fU5L02YRWzNbkPmpGg,13520
360
- helm/benchmark/static/schema_instruction_following.yaml,sha256=mg2g5P8TAYSCEhZbLfshPt_Hq2GKjwbvyOsQrwDqh7w,8923
361
- helm/benchmark/static/schema_lite.yaml,sha256=62ByEWhAJT0tIUFi-euxJ7XFhE6e9E6PT9dF6V3qoSU,40255
362
- helm/benchmark/static/schema_mmlu.yaml,sha256=8kiZDEGGaBXs9ucDk_Gbo2agV-OgOmWuhcYFyodRjcw,53307
363
- helm/benchmark/static/schema_unitxt.yaml,sha256=89GnKrooG7kKU2xh0MeoYZUB54FDUAmOPrbzuBhG1Ik,15496
364
- helm/benchmark/static/schema_vhelm_lite.yaml,sha256=s8tQIetR2WKu3sd8k2uZO68_5E-YtlMdsBJsTehFZKE,7331
365
- helm/benchmark/static/schema_vlm.yaml,sha256=o9AzLTKwSbPES5pISI0tmpUPKWWT9GR-dleDKZqoI0w,33243
378
+ helm/benchmark/static/schema_air_bench.yaml,sha256=ePZAGL4X-yH4cAQvzS5uU44duCKwdDrMwDSvCC9y7-k,139384
379
+ helm/benchmark/static/schema_classic.yaml,sha256=sK3yVQCrk3Tn3Kmg9WITBmJZI7AKVjmIY0f3zgH_t0c,104611
380
+ helm/benchmark/static/schema_finance.yaml,sha256=vZG0EssYr_BVZmyV4sZmRaeLFSX2ycjni8O_L_kGzzc,5283
381
+ helm/benchmark/static/schema_image2structure.yaml,sha256=IV57vHTaZakH6EupIlT6PRjK8aI14OSNFYUAHD9QBxo,15593
382
+ helm/benchmark/static/schema_instruction_following.yaml,sha256=mYLpMv-iNtsmrv9ewfN9ceDOBBg8nSxOWfc6ByATmIk,6056
383
+ helm/benchmark/static/schema_lite.yaml,sha256=rFSoG7zGPNOtKkJyGgOViWf5WJbMiJMAXrgmqCAi9X4,36611
384
+ helm/benchmark/static/schema_medical.yaml,sha256=hDk4834FKn-5cMr6pHcu1P60sh6cXJ2J0Z1ADIj2MSc,8455
385
+ helm/benchmark/static/schema_mmlu.yaml,sha256=KI3XnzEwBRpzfYGjP77yKL-hBklEg72D3vL0kVl1BeI,49666
386
+ helm/benchmark/static/schema_tables.yaml,sha256=i4ylaq5yZoIEUvxPS8dniPQWKHZF5bz3hMgjNbzC_MM,7064
387
+ helm/benchmark/static/schema_thai.yaml,sha256=25-PjBhZMHM89M01XxLQWNg0mdQnfo4H0XInF9ZzDow,7900
388
+ helm/benchmark/static/schema_unitxt.yaml,sha256=9FQhoueYNNYQ2xMuJ2KHzpg_9-_ZhZ9efk6jtTQ3tlc,11855
389
+ helm/benchmark/static/schema_vhelm.yaml,sha256=IZ1oAmEjnoWQ6YtMpnwZ2IQkXx86bJS1j3686mvtAGc,29476
390
+ helm/benchmark/static/schema_vhelm_lite.yaml,sha256=4I68Em9q5wW8sFzj5GCJz8m49fBEuMyVmSZM0-wbfOk,4024
366
391
  helm/benchmark/static/utils.js,sha256=bgN0PT53Dregc-nLmEmAEmg2psufWpS8jTf74WoypHw,7681
367
392
  helm/benchmark/static/images/crfm-logo.png,sha256=dDkauL_wJR_Luu7L7pltphS3a9HSLjDkpVLa6C9vcA4,62712
368
393
  helm/benchmark/static/images/helm-logo-simple.png,sha256=LtVAC4OgcWgMAob53rTrf7cRDu-O0z85ZOGGj9wR9hw,86133
@@ -384,9 +409,10 @@ helm/benchmark/static/images/organizations/together.png,sha256=pmWjW4r7GnlKqFhKL
384
409
  helm/benchmark/static/images/organizations/tsinghua-keg.png,sha256=l9SzlZCsLF18BY876wYJcVgiQbgvwte7uoILPDcVwHk,7776
385
410
  helm/benchmark/static/images/organizations/yandex.png,sha256=OOCdcKubAP4x7h4VW7z5a-AHPWBiSDTjsIJea6ZiovA,27964
386
411
  helm/benchmark/static_build/config.js,sha256=ER8utDIqVZi9uge7Qrk1gmlT88TOOkFF9xYp3j10m8U,165
387
- helm/benchmark/static_build/index.html,sha256=g3pMdAovQ4VMr7dPGgyzWv2K1tN-E8LLkAs45ppLPGw,1149
412
+ helm/benchmark/static_build/index.html,sha256=J0TrGE5-kOkopr-iSRHvvCzDL00w8Si-8OaIt9vSX0M,1149
388
413
  helm/benchmark/static_build/assets/01-694cb9b7.png,sha256=aUy5t0DYCg4r52HDOmeNi1S2CHsnv3mE7ySokJg3Ouo,8903
389
414
  helm/benchmark/static_build/assets/ai21-0eb91ec3.png,sha256=Drkew6Vlwi2_4_S8hjagK2x8smOwLKTNiXIT3rDiurs,10208
415
+ helm/benchmark/static_build/assets/air-overview-d2e6c49f.png,sha256=0ubEn4J0T51-jx7IlwjaEGSrofZWlW_e67MJw47Ujzg,733055
390
416
  helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png,sha256=fOEANHS8RymKaCzUWn9gQWebts2ghSmtW9Fdda_TjR8,7224
391
417
  helm/benchmark/static_build/assets/anthropic-70d8bc39.png,sha256=cNi8OdIshIIb8PdodcX8mAj-khaUD0O6nhah-_6nYfs,8017
392
418
  helm/benchmark/static_build/assets/bigscience-7f0400c0.png,sha256=fwQAwN1x2Fr_ztD_HZdcOkdFcyxuDjtS3B5-VuRNkuc,19036
@@ -397,13 +423,15 @@ helm/benchmark/static_build/assets/google-06d997ad.png,sha256=BtmXrVQZHr3WH5c8c2
397
423
  helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png,sha256=Pl46pKbC_TU3L6kZQ_3G-0wTseluAhIYwb3EqpdQAjQ,1344452
398
424
  helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png,sha256=LtVAC4OgcWgMAob53rTrf7cRDu-O0z85ZOGGj9wR9hw,86133
399
425
  helm/benchmark/static_build/assets/helmhero-28e90f4d.png,sha256=KOkPTf-q28PdvGOBp1G5O4q1eWUJjuij3z2h_SUUf8s,55314
400
- helm/benchmark/static_build/assets/index-737eef9e.js,sha256=PvNcOghX7gGSYAGk2bR3pvIBnwDbeWHu0JyfPNaan3o,70614
401
- helm/benchmark/static_build/assets/index-878a1094.css,sha256=h4oQlJUZdqMk6nS_TEkyXMZ6rtGmepw4ljoSAHZX1vY,486381
426
+ helm/benchmark/static_build/assets/index-30dbceba.js,sha256=WXT0A-yH9f-3wCwQ3rwKWTCIOOpjETQwOQyZt2OMAwc,77064
427
+ helm/benchmark/static_build/assets/index-66b02d40.css,sha256=ZrAtQOMv7vRJwOA9urNRk_rs8hJljom_xhn-wI89g08,486795
402
428
  helm/benchmark/static_build/assets/meta-5580e9f1.png,sha256=VYDp8arkAe2eYRJhAOcIAsZY1qY0hqyOEQDgVMbX9M8,4646
403
429
  helm/benchmark/static_build/assets/microsoft-f5ee5016.png,sha256=9e5QFl23yTbnAk8u7lZKaQOf4oPHbr_aiQda5n4MZqE,50850
404
430
  helm/benchmark/static_build/assets/mistral-18e1be23.png,sha256=GOG-Ix7XlctGOUmvJfO2oVSBM7E5O562G88OnoxsjBw,14402
405
431
  helm/benchmark/static_build/assets/nvidia-86fa75c1.png,sha256=hvp1wZMwYxkfrVMvJs73PX71JwY5L8ZvxIH_fL4n6Po,27945
406
432
  helm/benchmark/static_build/assets/openai-3f8653e4.png,sha256=P4ZT5ISIlt6Dl0mOp7juSM4Y7dfyRNPqdc0PJuwNoqg,16877
433
+ helm/benchmark/static_build/assets/overview-74aea3d8.png,sha256=dK6j2Nn3j9O-FMUIVRT5HGBpR_GL78vrKi8oHdG1eaI,74685
434
+ helm/benchmark/static_build/assets/process-flow-bd2eba96.png,sha256=vS66lq700aPEKTJR7maMrmepAyBZySaL42tBNCRjFWA,190822
407
435
  helm/benchmark/static_build/assets/react-d4a0b69b.js,sha256=rNTpl8Is3LkYXqJowRMc8vc4SXQwP94Ozy4DZZWwldU,275141
408
436
  helm/benchmark/static_build/assets/recharts-6d337683.js,sha256=rDrVmtTCCSLY2hpcxSDxhlQ6CQmTTSQOESNeO3oVQgg,432466
409
437
  helm/benchmark/static_build/assets/tii-24de195c.png,sha256=JN4ZXAa0rbR2IlxPfd_mKtntFZcYpDcXocSiqrC2rNg,63389
@@ -454,38 +482,40 @@ helm/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
454
482
  helm/clients/ai21_client.py,sha256=LIdkmzcUDR9uIF2tIk5YgDNGNmfQ9JDYmgscvFoCHDs,5509
455
483
  helm/clients/ai21_utils.py,sha256=mlg3h615kyckccGZv9rqsP4Y60O3XpwyE-UURRMrxII,471
456
484
  helm/clients/aleph_alpha_client.py,sha256=koPqXF6uRD905atoiCaPg5yxr6B25J0g2OTWk8geebQ,4969
457
- helm/clients/anthropic_client.py,sha256=0hAmv3f6FQURScmDpcGbwGjnvskNRP2vhRH02OSe70I,33224
458
- helm/clients/auto_client.py,sha256=Qs0XFq9pyH4M9HTOLoI3_5m8kW305x3pzVukgETdrZM,10732
485
+ helm/clients/anthropic_client.py,sha256=wptP4u4NhQknoy7VQsWqVzn9tv3IrCuJ3vUMq6fiq0E,34909
486
+ helm/clients/auto_client.py,sha256=uK9EWQFWBt4DoV1oytm0dIeA3YpcfGi_H0rCRZSVE8c,11438
459
487
  helm/clients/bedrock_client.py,sha256=BsH9UopsP6ZHf-K0Yzg1PYSMLDwY0yIUmPHDhJVMUi0,5293
460
488
  helm/clients/bedrock_utils.py,sha256=okZ6Z8pviGOUNlrdF2QquAqFs8-QYgcqci95eij8giM,2574
461
- helm/clients/client.py,sha256=xoxPwV-aar7suM-3eAMsB9FmrempyqZ5FFXcLIQJz9w,8628
489
+ helm/clients/client.py,sha256=InjCQi62TWhWHmfyi-mC3fSAVztd-YDyfB3BkpacHXk,9002
462
490
  helm/clients/clip_score_client.py,sha256=ct3GHZ2Zh3fGwyvQ9DyoIPT6PwDPI-nUaFkUFuc8PIE,1622
463
- helm/clients/cohere_client.py,sha256=0UUsFnHwZjEkKjXKPzM6EpZ_iuAduZTg3sCrPM1zGt0,7359
491
+ helm/clients/cohere_client.py,sha256=PtVrDdm_-dXBiWzu_dfwiJPt5GLGw3wdN-Qw3u8ugtU,10976
464
492
  helm/clients/cohere_utils.py,sha256=aYmj60m0e9RF9BIdxp1vmA-uZv17TEALw0dbgTUSpCc,504
465
493
  helm/clients/gcs_client.py,sha256=1sK5x5uWtThgz9gqBLaA8oyiXGD_9nn1WyfMzJRyPQ8,3231
466
494
  helm/clients/google_client.py,sha256=EOpPzK5_9yzWkMjK-4ILiixDF3aeOa8AbR2SPnEO-nw,2900
467
495
  helm/clients/google_translate_client.py,sha256=TgiQEscjOae58Ptgp9f4n0LXUtl1Jf6v9BI-Z1_wcuw,1304
468
496
  helm/clients/http_model_client.py,sha256=DBgkVDZPmg99DCcO_1Xdf6nFQo2kyxLkgoQpwC-wkHI,2806
469
- helm/clients/huggingface_client.py,sha256=vzUmNJKsgIXLD8ho4kUGyFCRFGXC61C74X7No0yY7N4,13235
497
+ helm/clients/huggingface_client.py,sha256=xmdqOWoioqoYQjtBqJFN-K9Fm3oHEQrOEjyzDz4ZWBY,15847
470
498
  helm/clients/lit_gpt_client.py,sha256=Sjec16bNODosEhDoBkRc4t-LNS-nCUY_jVivWj5zvfU,6205
471
499
  helm/clients/lit_gpt_generate.py,sha256=8DdBE9ReQ00NbV3KMFYc--PlO9X-HMOR0Rhm5CADWEA,3103
472
500
  helm/clients/megatron_client.py,sha256=KFL1BBBDqxr5mtd5iu0dA6uK8_v6d4g_D6RsZrHx3a0,4107
473
501
  helm/clients/mistral_client.py,sha256=thOLMcEfrzWR00JUabIZ_PnW2o9YZsdSmNf9z3jbYKo,5982
474
502
  helm/clients/moderation_api_client.py,sha256=I5pYWRb2MmcLDYrScnC3P5N7OUFzQiVQ828_hf7zjM4,4719
475
503
  helm/clients/open_lm_client.py,sha256=qFgYqlV_3UiW8WJKz66lLqRqg2jt1qtJ1bHMRAtBn40,1749
476
- helm/clients/openai_client.py,sha256=gWqr4dvYfbUnBtfySSUGGVZYV-pLtqcrnYaf7nPk5-s,13936
504
+ helm/clients/openai_client.py,sha256=tXxi9nZsxz2I4YQLrQrV-GhlgZ1Z9ifrUhC_3Aw5SPE,14238
477
505
  helm/clients/palmyra_client.py,sha256=LBYFHNc5LdpPbiSp1AAHuMm8cUUCQ2EB03BB6XnDTYQ,6551
478
506
  helm/clients/perspective_api_client.py,sha256=WQDArqlKVWwcK2SicnSIAgV6JGVHsxibTzkdezT3z_U,5920
507
+ helm/clients/reka_client.py,sha256=K8b9p7U6LLAy4PRjgYrUS06gF4G2xjhjRoMEO4XDe0o,8329
479
508
  helm/clients/simple_client.py,sha256=55S_y1eWD1bjktcG21Vs8G5bF6QbKKwmJyqs6lCUJeI,2048
480
509
  helm/clients/test_auto_client.py,sha256=bc-rsMJ8JM0MFnQ4B48hBJ1jL3RtRyVvmPwOgzF2mF8,3155
481
- helm/clients/test_client.py,sha256=g29C1WLUONnNuE2oGFZhaqMahb-doS4l_Ph4OHrQvrc,3895
482
- helm/clients/test_huggingface_client.py,sha256=WUPrA7VT3nnMNht7w20I6411hlpIS_77XbQC2vC0WU0,2723
510
+ helm/clients/test_client.py,sha256=V7Y56Ahqa8C2Kc2_W2QE0VfGbBEJzFmnic3LGHZkOqQ,3940
511
+ helm/clients/test_huggingface_client.py,sha256=x2NjMuIrinfUy0wQ1S6F5cYZVr09YfvN6LfhWmyGNAM,3388
483
512
  helm/clients/test_simple_client.py,sha256=G0JRQX69ypQN2VxhlNQXs5u2Tdtkcl_aeHqudDUVKi4,702
484
- helm/clients/test_together_client.py,sha256=lAtGKn3WdsYe5MEfTYVYRnu_rS4DPnfFr5jRn42rvoQ,3865
485
- helm/clients/together_client.py,sha256=fCPJ39fX3xm_Gp6cGsc1HIf1jVMLNiE2kIkee45-Ufk,16208
513
+ helm/clients/test_together_client.py,sha256=yYNrhU3kQjmHwhILuoP5QwUgbmkm2gg2NHiNycHjoeE,6145
514
+ helm/clients/together_client.py,sha256=rtYdx53ZE19ziJpBc7MYTeSHJjN3Ke51I3Uldg0IAbs,20595
486
515
  helm/clients/toxicity_classifier_client.py,sha256=AI_FizxMurubTIyeceRdkixSnhWQbcD-oEEONj5ve7o,464
487
- helm/clients/vertexai_client.py,sha256=Mt1rb9lWeQqJLGcBSR5mflYBvJvJfsv5OeIuQz4_ng0,19726
516
+ helm/clients/vertexai_client.py,sha256=K_vCanJU97o2P_WJOeLhUFJA8SdfJDlVNl7Mi1HuIrQ,21860
488
517
  helm/clients/vllm_client.py,sha256=p9atBtq3PBOoPkOPSifkMrYZjNLnNM_sWM6tL_3N-WY,1675
518
+ helm/clients/yi_client.py,sha256=0t4WJ8MTLOpB1LCZ-P6UdYa-KbGB7hkDrBluSkioot0,835
489
519
  helm/clients/clip_scorers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
490
520
  helm/clients/clip_scorers/base_clip_scorer.py,sha256=NfXe79g6M4Wype3Xf-oXxscaUgjbZLmy9dRnBaLiWwk,695
491
521
  helm/clients/clip_scorers/clip_scorer.py,sha256=waLI_rI6dQPjmtywvGeQKK7bGCWXyoIgIuBc8P3zSB8,1907
@@ -544,9 +574,11 @@ helm/clients/image_generation/mindalle/utils/sampling.py,sha256=soTHaJrN4FV1lDdh
544
574
  helm/clients/image_generation/mindalle/utils/utils.py,sha256=ESugpzG-_73GKl07mj-8o-_nim_FOICxfYkczy3s9x4,3119
545
575
  helm/clients/vision_language/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
546
576
  helm/clients/vision_language/huggingface_vision2seq_client.py,sha256=hTywh5nM95BmPoDyKOSDWg9G3-QwLO3KZEJZVkmFroo,6478
547
- helm/clients/vision_language/huggingface_vlm_client.py,sha256=X5SX2iMZkFe9Pmq4Gx0O4bnP4gBPnKvamLThRshAEik,4875
577
+ helm/clients/vision_language/huggingface_vlm_client.py,sha256=H7AE8mm506PkEcUO8VaLVtptHTwVX58nZx1A_BWdKzA,4968
548
578
  helm/clients/vision_language/idefics_client.py,sha256=hi1VCDBegHfBssmW0C62H3OX3U2ISVRhaSkd24gb1K4,7692
549
579
  helm/clients/vision_language/open_flamingo_client.py,sha256=CkN0JCeR742ZG9Nc4A85hp4BSE0WLU-3Rs-ZwdmDkzs,6632
580
+ helm/clients/vision_language/paligemma_client.py,sha256=IU_T8r1RgpGkEAqabLKBbmoUOWV6c1a9_FXgiTy8exE,6835
581
+ helm/clients/vision_language/palmyra_vision_client.py,sha256=mY6vj918f-tbqhOmh7PCSEgnSpHzWY8UTqAdvYgXJ8Q,3757
550
582
  helm/clients/vision_language/qwen_vlm_client.py,sha256=6rCH4gJMDyQHyjAE_GDIrLsInH_bvd6to-4RMWbRLeM,7407
551
583
  helm/clients/vision_language/open_flamingo/__init__.py,sha256=i1tGJj6ckeE6eS1EWV5tbQKYLmPCrdSI45mPchfv_Ic,88
552
584
  helm/clients/vision_language/open_flamingo/src/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -563,13 +595,13 @@ helm/common/clip_score_request.py,sha256=WnNg89owDCmG7tyy8nnQL0RdKQLsUdMWiYH9Xqq
563
595
  helm/common/codec.py,sha256=gTh6AwIQ0Bbul_QSnIO7eItwMZmYtnkIrG1jkc4GOL4,7100
564
596
  helm/common/concurrency.py,sha256=8THtHlCtXo5c8iCuz_UcBBdzZX6aiEALLc4u0M4SYL0,856
565
597
  helm/common/credentials_utils.py,sha256=O-57nUgkWLbZF0k3lsSaVGPPHj2_OYeVuCMe0to3bRE,1118
566
- helm/common/critique_request.py,sha256=Exu8Ans05zCU5d5-AglEbG40mBwKYED2Z3WqY_XjXBY,2772
598
+ helm/common/critique_request.py,sha256=yo4aRe-DEjudUmydthtpTj6LdhRXfZ3JZptxTkWzZ3U,3068
567
599
  helm/common/file_upload_request.py,sha256=OZeAW1_zsiNdXnWDwNNvhPs0b48TUmW_e4kzzCYmyiY,543
568
600
  helm/common/general.py,sha256=nMfHNPXyAAorAMmgDClD8r8XXeJcvfF0QXTP-FgH5PQ,11690
569
601
  helm/common/gpu_utils.py,sha256=pmLq6ipYNLEm28VxxSNeZuVt-gAw-WnYmBvxP1P1p6M,480
570
602
  helm/common/hierarchical_logger.py,sha256=EnKLnfbQftca08EJfjGEQb4tcnCKbx-JtwLnoCnhMQs,2908
571
603
  helm/common/image_generation_parameters.py,sha256=nsbuk_-BlRMK6IwP5y6BnTXbTRTOcvZ6uLblL5VHLOo,916
572
- helm/common/images_utils.py,sha256=zbzS8C_oCDb9dY2xpWY6nljI8of72rqwijryMeiBKKo,2527
604
+ helm/common/images_utils.py,sha256=bsxgW9knrfa9NTa6V-O13_nDnflqrqHpnKlTRxul-aY,3187
573
605
  helm/common/key_value_store.py,sha256=iHi1WQuWttLNJnuM48QNOAXHoneNbmbBmtXYPq-dyys,3147
574
606
  helm/common/media_object.py,sha256=3VZqfb0py5dDKwWtnLp2kdl8svaike-Cn7Mjk-b0cvM,5130
575
607
  helm/common/moderations_api_request.py,sha256=3xTsErSsCr2PHD2jpdV1JglHaYHwP2Yqu25_JFtfa68,2234
@@ -590,9 +622,9 @@ helm/common/file_caches/file_cache.py,sha256=QfF1hlF8FQ-rcPn9Zyl6L0dOCokvYgd-dFq
590
622
  helm/common/file_caches/local_file_cache.py,sha256=wBOAbbkGLiClaX4YdunokRfSQCKNkTYmMVx2KTLy4Lc,1921
591
623
  helm/common/file_caches/test_local_file_cache.py,sha256=bOCWR9MglwQXV98xk8auyjgFxaOr85zRdxWwxMBQW9s,663
592
624
  helm/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
593
- helm/config/model_deployments.yaml,sha256=KAD0FZ45ERfEjr3y7HbPxZmEnnJBQiiOHRHN7VxqiF4,74817
594
- helm/config/model_metadata.yaml,sha256=XpJnlu0kiI5sGEqswF_S6_ra0Iys3VOfsDs2Jiz_Vqk,112991
595
- helm/config/tokenizer_configs.yaml,sha256=3IhRANDTlN39TWqDWuPy507wQlZWOBlyaS8fA6WLDD0,12070
625
+ helm/config/model_deployments.yaml,sha256=x4j3LMGHTV3jObKK0dT5SOtKJvReWOHyyjs6jV2D2L0,89739
626
+ helm/config/model_metadata.yaml,sha256=M7EsOSnf4tcrSlNYBT50SiC6mReXfZ1q5rt7_OpdzpU,138011
627
+ helm/config/tokenizer_configs.yaml,sha256=lBGPsRPRPeqlN_j194hEVP8HAMC6J5NLrIZpN95Y8ug,15078
596
628
  helm/proxy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
597
629
  helm/proxy/accounts.py,sha256=gd5cKhKeqklf_cXCAISl65AUvZeD6afBNrs6WK3IBvQ,14764
598
630
  helm/proxy/cli.py,sha256=l8F7UYqrIOoBD9ZCIxJFA4fhxlzhae0-2Nn8A7FMkzk,8244
@@ -608,12 +640,12 @@ helm/proxy/critique/mechanical_turk_critique_client.py,sha256=OcppmFOMweBSfVTiLI
608
640
  helm/proxy/critique/mechanical_turk_critique_exporter.py,sha256=taULrc_cIP0O9c5UpGz3l9DmWQadTVzN_v-qzTgMoyo,8470
609
641
  helm/proxy/critique/mechanical_turk_critique_importer.py,sha256=NL97joO5pRkcICRdVyG4kf9JhfYRaySsxRoZ7KWDYv0,5581
610
642
  helm/proxy/critique/mechanical_turk_utils.py,sha256=mKpUv4zz3s5ptzDY7UrwuI7Cr5HmNgSjPC10BnN9AL4,1766
611
- helm/proxy/critique/model_critique_client.py,sha256=nrNjnvOFdcRk9tUk2MjoBugAfMM92X0hxKGSg4xsy9E,11187
643
+ helm/proxy/critique/model_critique_client.py,sha256=QMFiMpALXnneumKbJpXOZDEb3lPPdkIaSCasmdXHB8o,12806
612
644
  helm/proxy/critique/scale_critique_client.py,sha256=B4povtceyfal95eE3N7em9cC_B5Vy4jMrHXcsXc_5m4,15889
613
645
  helm/proxy/critique/surge_ai_critique_client.py,sha256=HnzgAoF4Du9Me0GS_lbNaozZslS4a2OZx735gh-coo0,8357
614
646
  helm/proxy/services/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
615
647
  helm/proxy/services/remote_service.py,sha256=emYN0qWOJLQ7q1n06V4TwlvXaqylQcUxmqDcGZXqPJ8,9097
616
- helm/proxy/services/server_service.py,sha256=ehKs1gITG8ZsPpxzjbzlHqWjAJVLahiKZn5odsLhcPM,11535
648
+ helm/proxy/services/server_service.py,sha256=U-1g0VMjCY9bBK8BecbUxVzSx7hyC_rpwSNm67bqmCg,11534
617
649
  helm/proxy/services/service.py,sha256=Be-Z5F6AN4vMzsJr3BS6tJ9NHHy_dc_yn2Ex9cm0ChU,6193
618
650
  helm/proxy/services/test_remote_service.py,sha256=NFnLjg3QNHoDKdK0DlcrtylwlKXx1vdzheNZRrLEv7c,6605
619
651
  helm/proxy/services/test_service.py,sha256=FUZoI8pGiUg5adgB1wTJ869QOgFYjPtM6yf6FGMdE64,8968
@@ -625,16 +657,17 @@ helm/tokenizers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
625
657
  helm/tokenizers/ai21_tokenizer.py,sha256=oXImuAY9kMohHH6Zm7BWysfT88b00NBoSELeGQ920y4,2255
626
658
  helm/tokenizers/aleph_alpha_tokenizer.py,sha256=UlWC_SjObBvexpZ3OfKZT2yjhbSsHlKjQe_oWuRrXno,3818
627
659
  helm/tokenizers/anthropic_tokenizer.py,sha256=d-HO9OEFkhYzFZu0VkOsHjxbqqSUseCNX0KQqgb3s2Q,2114
628
- helm/tokenizers/auto_tokenizer.py,sha256=xKL_rLnjiaCnyH5oJUlo5gfdVSen7PmBFFD60gl9R8A,4217
660
+ helm/tokenizers/auto_tokenizer.py,sha256=Of-T-CFOhLAjjU45T1hnrEPG_k_hzPufuDE7FRAcSN8,4251
629
661
  helm/tokenizers/caching_tokenizer.py,sha256=kSegrCFotRevSDgJsn0g52dWiSUCNa7_EZpRNrELeUE,8163
630
- helm/tokenizers/cohere_tokenizer.py,sha256=-WuvEKHzwqcpnhDPauw7x8wyZ5eVWTZalygx1LkkLnQ,3739
662
+ helm/tokenizers/cohere_tokenizer.py,sha256=6rahykq1SxqS8vCWOzYo_oeUoVwhg_zOfWFIkQxP6GY,5632
631
663
  helm/tokenizers/http_model_tokenizer.py,sha256=wBTtDA2UdEYspffa1wqgkT3y3YHoyLXXoucnJ5PGjhs,3109
632
- helm/tokenizers/huggingface_tokenizer.py,sha256=IY9RxJ3YwVKfXtvMXR9DLO4uTaz9j_8hr1MOyA60H7Y,7791
664
+ helm/tokenizers/huggingface_tokenizer.py,sha256=_XXx8uApENK7-o81qxEn0SOeJL_L2UpiiuteSYiODpE,8734
633
665
  helm/tokenizers/ice_tokenizer.py,sha256=4ZTIRpmt2cqwcxnmrDpCRhiJ0BI3ELE-GHoBuHWgrDA,1200
634
666
  helm/tokenizers/lit_gpt_tokenizer.py,sha256=LMrpaje64UmnDKoYjPG_RQeXVA4xQUwW5t48IJIeLaQ,1660
635
667
  helm/tokenizers/simple_tokenizer.py,sha256=6_NROqVbygs-HRA7bYAZluN4YB5gUhVaRsYQeRTjA1E,1147
636
668
  helm/tokenizers/test_anthropic_tokenizer.py,sha256=_wzXp9FVR2Ml0s2A79TTXbSPHyTRp28i9tiEyQ9S6Ko,3792
637
- helm/tokenizers/test_huggingface_tokenizer.py,sha256=o1oqYT2MS-7xrnffj48WuvJfKAHd4p8pee9W4WxwQb8,6172
669
+ helm/tokenizers/test_cohere_tokenizer.py,sha256=15z2GJtZ-VlrliC2_Fk5DIZhQYFkJS7J73fjxYMf8YM,1431
670
+ helm/tokenizers/test_huggingface_tokenizer.py,sha256=8tFyZQb4DLg6MdKg13a66bLbp0yf4Ar1fGWM_sYeSjg,6309
638
671
  helm/tokenizers/test_ice_tokenizer.py,sha256=-xi_f8TBSkAYr5CcA56HDq7rZ9HAGd99J7twNfkLzFU,2619
639
672
  helm/tokenizers/test_simple_tokenizer.py,sha256=vUNdcnJqZV99-E8H1rwUH85AQPJ2HTnDr5DrZ_-zRL4,1219
640
673
  helm/tokenizers/test_yalm_tokenizer.py,sha256=qWpKnUuAlePd6t-UJB_mAiBwtAacnC8caKXLJ_GdTkk,2477
@@ -646,9 +679,9 @@ helm/tokenizers/yalm_tokenizer_data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQ
646
679
  helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py,sha256=W9p5QNn1GSm-y85yVEQe_82zn5CVK_vR6jvhk7JTs_k,869
647
680
  helm/tokenizers/yalm_tokenizer_data/voc_100b.sp,sha256=LmPD0_OIOXi8dWuNjXUYOSPhf8kPp2xhvK-g3bXcwrQ,2815034
648
681
  helm/tokenizers/yalm_tokenizer_data/yalm_tokenizer.py,sha256=kH5Qig1_6r_sKbAHinX7C83tqBUoTwbe-gGZCbGVkko,6389
649
- crfm_helm-0.5.1.dist-info/LICENSE,sha256=bJiay7Nn5SHQ2n_4ZIT3AE0W1RGq4O7pxOApgBsaT64,11349
650
- crfm_helm-0.5.1.dist-info/METADATA,sha256=dVxnv-vEsYZb3v-ALFNpSdpbxwi5WQG5_I1oD3cMs6Y,19157
651
- crfm_helm-0.5.1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
652
- crfm_helm-0.5.1.dist-info/entry_points.txt,sha256=AvH9soAH3uey9xffisWewd0yrmPWGASC036jHd1SFyg,300
653
- crfm_helm-0.5.1.dist-info/top_level.txt,sha256=s9yl-XmuTId6n_W_xRjCS99MHTwPXOlkKxmTr8xZUNY,5
654
- crfm_helm-0.5.1.dist-info/RECORD,,
682
+ crfm_helm-0.5.2.dist-info/LICENSE,sha256=bJiay7Nn5SHQ2n_4ZIT3AE0W1RGq4O7pxOApgBsaT64,11349
683
+ crfm_helm-0.5.2.dist-info/METADATA,sha256=g-tT_a7wm7L7iaNCQVwNIrpUnVHK8PKfbXjel0KyhmQ,19591
684
+ crfm_helm-0.5.2.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
685
+ crfm_helm-0.5.2.dist-info/entry_points.txt,sha256=AvH9soAH3uey9xffisWewd0yrmPWGASC036jHd1SFyg,300
686
+ crfm_helm-0.5.2.dist-info/top_level.txt,sha256=s9yl-XmuTId6n_W_xRjCS99MHTwPXOlkKxmTr8xZUNY,5
687
+ crfm_helm-0.5.2.dist-info/RECORD,,
@@ -39,90 +39,91 @@ class AdapterSpec:
39
39
  Note that an `Instance` could produce many `Request`s (e.g., one for each `Reference`).
40
40
  """
41
41
 
42
- # Method of adaptation
43
42
  method: str = ""
43
+ """The high-level strategy for converting instances into a prompt for the language model."""
44
44
 
45
- # Prepend all prompts with this string.
46
- # For example, it is recommended to prefix all prompts with [NLG] for UL2.
47
45
  global_prefix: str = ""
46
+ """The string that is prepended to the entire prompt."""
48
47
 
49
- # Append all prompts with this string.
50
48
  global_suffix: str = ""
49
+ """The string that is appended to the entire prompt."""
51
50
 
52
- # Prompt starts with instructions
53
51
  instructions: str = ""
52
+ """The description of the task that is included at the very beginning of the prompt."""
54
53
 
55
- # What goes before the input
56
54
  input_prefix: str = "Input: "
55
+ """The string that is included before each input (e.g., 'Question:')."""
57
56
 
58
- # What goes after the input
59
57
  input_suffix: str = "\n"
58
+ """The string that is included after each input (e.g., '\\n')."""
60
59
 
61
- # What goes before the input (for multiple choice)
62
60
  reference_prefix: str = "A. "
61
+ """The string that is included before each reference (for multiple-choice questions)."""
63
62
 
64
- # What goes before the input (for multiple choice)
65
63
  reference_suffix: str = "\n"
64
+ """The string that is included after each reference (for multiple-choice questions)."""
66
65
 
67
- # What goes before the output
68
66
  output_prefix: str = "Output: "
67
+ """The string that is included before the correct answer/predicted output (e.g., 'Answer:')."""
69
68
 
70
- # What goes after the output
71
69
  output_suffix: str = "\n"
70
+ """The string that is included after the correct answer/predicted output (e.g., '\\n')."""
72
71
 
73
- # What goes between instruction and in-context example blocks in the constructed prompt
74
72
  instance_prefix: str = "\n"
73
+ """The string that is included before each instance (e.g., '\\n\\n')."""
75
74
 
76
- # List of regular expression substitutions that we perform
77
75
  substitutions: List[Substitution] = field(default_factory=list, hash=False)
76
+ """A list of regular expression substitutions (e.g., replacing '\\n' with ';\\n')
77
+ to perform at the very end on the prompt."""
78
78
 
79
- # Maximum number of (in-context) training instances to put into the prompt
80
79
  max_train_instances: int = 5
80
+ """Maximum number of training instances to include in the prompt (currently by randomly sampling)."""
81
81
 
82
- # Maximum number of evaluation instances. For getting valid numbers, this
83
- # should be the entire dataset; only reduce this for piloting.
84
82
  max_eval_instances: Optional[int] = None
83
+ """Maximum number of instances to evaluate on (over all splits - test, valid, etc.)."""
85
84
 
86
- # Generate this many outputs (which could be realized by `num_completions`
87
- # or `top_k_per_token`).
88
85
  num_outputs: int = 5
86
+ """Maximum number of possible outputs to generate by sampling multiple outputs."""
89
87
 
90
- # Number of trials, where in each trial we choose an independent, random
91
- # set of training instances. Used to compute error bars.
92
88
  num_train_trials: int = 1
89
+ """Number of trials, where in each trial we choose an independent, random set of training instances.
90
+ Used to compute variance."""
93
91
 
94
- # Number of trials, where we query the model with the same requests, but different random seeds
95
92
  num_trials: int = 1
93
+ """Number of trials, where we query the model with the same requests, but different random seeds."""
96
94
 
97
- # If true, randomly sample N training examples; if false, select N consecutive training examples
98
95
  sample_train: bool = True
96
+ """If true, randomly sample N training examples; if false, select N consecutive training examples"""
99
97
 
100
98
  # Decoding parameters (inherited by `Request`)
101
99
 
102
- # Model deployment to make the request to (need to fill in)
103
100
  model_deployment: str = ""
101
+ """Name of the language model deployment (<host_organization>/<model name>) to send requests to."""
104
102
 
105
- # Model to make the request to
106
103
  model: str = ""
104
+ """Name of the language model (<creator_organization>/<model name>) to send requests to."""
107
105
 
108
- # Temperature to use
109
106
  temperature: float = 1
107
+ """Temperature parameter used in generation."""
110
108
 
111
- # Maximum number of tokens to generate
112
109
  max_tokens: int = 100
110
+ """Maximum number of tokens to generate."""
113
111
 
114
- # When to stop (set hash=False to make `AdapterSpec` hashable)
112
+ # Set hash=False to make `AdapterSpec` hashable
115
113
  stop_sequences: List[str] = field(default_factory=list, hash=False)
114
+ """List of stop sequences. Output generation will be stopped if any stop sequence is encountered."""
116
115
 
117
116
  # Random string (used concretely to bypass cache / see diverse results)
118
117
  random: Optional[str] = None
118
+ """Random seed (string), which guarantees reproducibility."""
119
119
 
120
- # If true, for instances with multiple correct reference, the gold answer should be considered
121
- # to be all the correct references rather than any of the correct references.
122
120
  multi_label: bool = False
121
+ """If true, for instances with multiple correct reference, the gold answer should be considered to be all
122
+ of the correct references rather than any of the correct references."""
123
123
 
124
- # Parameters for image generation
125
124
  image_generation_parameters: Optional[ImageGenerationParameters] = None
125
+ """Parameters for image generation."""
126
126
 
127
- # The splits from which evaluation instances will be drawn (set hash=False to make `AdapterSpec` hashable)
127
+ # Set hash=False to make `AdapterSpec` hashable
128
128
  eval_splits: Optional[List[str]] = field(default=None, hash=False)
129
+ """The splits from which evaluation instances will be drawn."""