crfm-helm 0.5.5__py3-none-any.whl → 0.5.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (206) hide show
  1. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/METADATA +27 -13
  2. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/RECORD +203 -156
  3. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
  5. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  6. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  7. helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
  8. helm/benchmark/annotation/bird_sql_annotator.py +2 -2
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
  10. helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
  11. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
  12. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  13. helm/benchmark/annotation/model_as_judge.py +12 -16
  14. helm/benchmark/annotation/omni_math_annotator.py +13 -14
  15. helm/benchmark/annotation/wildbench_annotator.py +9 -9
  16. helm/benchmark/executor.py +11 -12
  17. helm/benchmark/metrics/aci_bench_metrics.py +9 -29
  18. helm/benchmark/metrics/bias_word_lists.py +1 -1
  19. helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
  20. helm/benchmark/metrics/classification_metrics.py +3 -3
  21. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  22. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
  23. helm/benchmark/metrics/dischargeme_metrics.py +9 -29
  24. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  25. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  26. helm/benchmark/metrics/ifeval_metrics.py +2 -2
  27. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  28. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  29. helm/benchmark/metrics/med_dialog_metrics.py +9 -29
  30. helm/benchmark/metrics/medalign_metrics.py +9 -29
  31. helm/benchmark/metrics/medi_qa_metrics.py +9 -29
  32. helm/benchmark/metrics/medication_qa_metrics.py +10 -30
  33. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  34. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  35. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  36. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  37. helm/benchmark/metrics/mental_health_metrics.py +9 -29
  38. helm/benchmark/metrics/metric_service.py +11 -11
  39. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  40. helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
  41. helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
  42. helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
  43. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  44. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  45. helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
  46. helm/benchmark/metrics/summac/model_summac.py +1 -2
  47. helm/benchmark/metrics/summarization_metrics.py +2 -1
  48. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
  49. helm/benchmark/metrics/toxicity_metrics.py +2 -2
  50. helm/benchmark/metrics/unitxt_metrics.py +3 -4
  51. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  52. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  53. helm/benchmark/model_deployment_registry.py +6 -8
  54. helm/benchmark/presentation/contamination.py +3 -3
  55. helm/benchmark/presentation/create_plots.py +33 -12
  56. helm/benchmark/presentation/run_display.py +13 -0
  57. helm/benchmark/presentation/schema.py +2 -1
  58. helm/benchmark/presentation/summarize.py +76 -59
  59. helm/benchmark/reeval_run.py +3 -4
  60. helm/benchmark/reeval_runner.py +3 -3
  61. helm/benchmark/run.py +78 -73
  62. helm/benchmark/run_expander.py +12 -1
  63. helm/benchmark/run_spec_factory.py +7 -6
  64. helm/benchmark/run_specs/audio_run_specs.py +52 -8
  65. helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
  66. helm/benchmark/run_specs/experimental_run_specs.py +31 -1
  67. helm/benchmark/run_specs/long_context_run_specs.py +67 -15
  68. helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
  69. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  70. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
  71. helm/benchmark/run_specs/vlm_run_specs.py +28 -0
  72. helm/benchmark/runner.py +5 -5
  73. helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
  74. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
  75. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
  76. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  77. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
  78. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
  79. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  80. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
  81. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
  82. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
  83. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
  84. helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
  85. helm/benchmark/scenarios/clear_scenario.py +11 -7
  86. helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
  87. helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
  88. helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
  89. helm/benchmark/scenarios/grammar.py +2 -2
  90. helm/benchmark/scenarios/headqa_scenario.py +6 -1
  91. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  92. helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
  93. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  94. helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
  95. helm/benchmark/scenarios/medalign_scenario.py +9 -3
  96. helm/benchmark/scenarios/medalign_scenario_helper.py +8 -5
  97. helm/benchmark/scenarios/medbullets_scenario.py +7 -2
  98. helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
  99. helm/benchmark/scenarios/medec_scenario.py +6 -1
  100. helm/benchmark/scenarios/medhallu_scenario.py +7 -1
  101. helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
  102. helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
  103. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  104. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  105. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  106. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  107. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  108. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  109. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  110. helm/benchmark/scenarios/mental_health_scenario.py +16 -5
  111. helm/benchmark/scenarios/mimic_bhc_scenario.py +12 -7
  112. helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
  113. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
  114. helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
  115. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
  116. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
  117. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
  118. helm/benchmark/scenarios/numeracy_scenario.py +2 -1
  119. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  120. helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
  121. helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
  122. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
  123. helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
  124. helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
  125. helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
  126. helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
  127. helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
  128. helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
  129. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  130. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  131. helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
  132. helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
  133. helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
  134. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
  135. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  136. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  137. helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
  138. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  139. helm/benchmark/server.py +2 -1
  140. helm/benchmark/static/schema_audio.yaml +60 -49
  141. helm/benchmark/static/schema_enterprise.yaml +21 -0
  142. helm/benchmark/static/schema_long_context.yaml +63 -20
  143. helm/benchmark/static/schema_medhelm.yaml +272 -213
  144. helm/benchmark/static/schema_melt.yaml +1257 -0
  145. helm/benchmark/static/schema_slphelm.yaml +162 -0
  146. helm/benchmark/static/schema_vhelm.yaml +26 -26
  147. helm/benchmark/static/schema_video.yaml +219 -0
  148. helm/benchmark/static_build/assets/index-94295e78.js +10 -0
  149. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  150. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  151. helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
  152. helm/benchmark/static_build/index.html +4 -4
  153. helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
  154. helm/benchmark/window_services/test_utils.py +3 -4
  155. helm/benchmark/window_services/tokenizer_service.py +7 -8
  156. helm/clients/anthropic_client.py +69 -29
  157. helm/clients/audio_language/diva_llama_client.py +4 -2
  158. helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
  159. helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
  160. helm/clients/audio_language/qwen_audiolm_client.py +4 -2
  161. helm/clients/audio_language/test.py +62 -0
  162. helm/clients/bedrock_client.py +3 -1
  163. helm/clients/client.py +7 -7
  164. helm/clients/grok_client.py +36 -0
  165. helm/clients/huggingface_client.py +42 -3
  166. helm/clients/huggingface_pipeline_client.py +138 -0
  167. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  168. helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
  169. helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
  170. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  171. helm/clients/openai_client.py +100 -54
  172. helm/clients/openai_responses_client.py +174 -0
  173. helm/clients/palmyra_client.py +2 -5
  174. helm/clients/reka_client.py +2 -2
  175. helm/clients/together_client.py +31 -4
  176. helm/clients/vertexai_client.py +6 -0
  177. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  178. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  179. helm/clients/vision_language/idefics_client.py +6 -2
  180. helm/clients/vision_language/paligemma_client.py +2 -2
  181. helm/clients/vision_language/qwen2_vlm_client.py +66 -53
  182. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  183. helm/clients/writer_client.py +102 -0
  184. helm/common/context.py +80 -0
  185. helm/common/credentials_utils.py +5 -5
  186. helm/common/general.py +9 -2
  187. helm/common/hierarchical_logger.py +46 -3
  188. helm/common/local_context.py +140 -0
  189. helm/common/remote_context.py +61 -0
  190. helm/common/request.py +8 -0
  191. helm/config/model_deployments.yaml +864 -193
  192. helm/config/model_metadata.yaml +667 -53
  193. helm/config/tokenizer_configs.yaml +144 -3
  194. helm/proxy/cli.py +3 -1
  195. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  196. helm/proxy/services/server_service.py +21 -85
  197. helm/tokenizers/grok_tokenizer.py +53 -0
  198. helm/tokenizers/huggingface_tokenizer.py +1 -1
  199. helm/tokenizers/test_grok_tokenizer.py +33 -0
  200. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
  201. helm/benchmark/static_build/assets/index-262903c1.js +0 -10
  202. helm/benchmark/static_build/assets/index-42060d71.css +0 -1
  203. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
  204. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/licenses/LICENSE +0 -0
  205. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
  206. /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
@@ -1,24 +1,24 @@
1
- crfm_helm-0.5.5.dist-info/licenses/LICENSE,sha256=bJiay7Nn5SHQ2n_4ZIT3AE0W1RGq4O7pxOApgBsaT64,11349
1
+ crfm_helm-0.5.6.dist-info/licenses/LICENSE,sha256=bJiay7Nn5SHQ2n_4ZIT3AE0W1RGq4O7pxOApgBsaT64,11349
2
2
  helm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  helm/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
4
  helm/benchmark/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
5
  helm/benchmark/annotation_executor.py,sha256=LEehcWmkmqV_bFFzzmdm3GqsObJGCqoAYi1ekwG-yQ4,5757
6
6
  helm/benchmark/config_registry.py,sha256=Cd25a8FHriUzAgvGGU5sBAPyhisdSIjdUJR4YbYs6T4,1603
7
7
  helm/benchmark/data_preprocessor.py,sha256=wqGzAiLwOYa4v6TVPe6ayrnuzdNbmfjeiofRQiO2uso,2201
8
- helm/benchmark/executor.py,sha256=simd7SdJ7TciUpoq3D0uz_XUSCZj5KIWCIP57FYm4js,4906
8
+ helm/benchmark/executor.py,sha256=E7cF1vMXBn5eT1z5Le5ng4M9AaIMLjxfLgMmF1EfZy0,4843
9
9
  helm/benchmark/huggingface_registration.py,sha256=DAiHffNmo4H90rBfvQ_LHADtUCnCk6dfpI7Wbat1DZA,4389
10
- helm/benchmark/model_deployment_registry.py,sha256=BjL0ghHgO7_Z5jZZ7kuSOj9saegI3BivaL-b699C0rc,9527
10
+ helm/benchmark/model_deployment_registry.py,sha256=zDpqsgjCvtesRan-z2TQA7G97g14UPgjG0Cbi9owWaY,9472
11
11
  helm/benchmark/model_metadata_registry.py,sha256=7XisV0an_edM8hvP8LSoCnTeUN2QLJrQknOCA6-OE7M,8841
12
12
  helm/benchmark/multi_gpu_runner.py,sha256=WmTKpVfcKXyiiPzrmxpbvQoZy0Ua8IyPgxB8r_3jrRw,4773
13
- helm/benchmark/reeval_run.py,sha256=ApGc7T3koXPLhW60e4g8KNMbOfhUTMwMXMWIPcHXWGo,7211
14
- helm/benchmark/reeval_runner.py,sha256=Qt9t47c6xJqGmkGYKfDLhf6idhQHThk_46fui9tsIwA,15593
15
- helm/benchmark/run.py,sha256=eVtwVYvm51R-maimOehonn3IvJObGUFPbGvqoedykKQ,13658
16
- helm/benchmark/run_expander.py,sha256=ZIVTmFUZlu9SJR0yTiNErOVT9-zSR-pU3cje8jdltuQ,55891
13
+ helm/benchmark/reeval_run.py,sha256=vImL8JNhveEOftZbRQ6JAxF0L-XCKIwh65M6fIYo4RU,7198
14
+ helm/benchmark/reeval_runner.py,sha256=bJPl7XVOVwK2fUA7voOVQYwVFEOfKVnrT2tbSGQzQY8,15584
15
+ helm/benchmark/run.py,sha256=F65P6eG3S6dHDxRK8HMqDFGQjPBGIJouX80ANsHb0Y8,13806
16
+ helm/benchmark/run_expander.py,sha256=hKFLpmq8W2KBl_mBf-ahHEbt67qZFgu-VxjvidOeQuE,56543
17
17
  helm/benchmark/run_spec.py,sha256=GiIU8iGO2FGYFDWIxt51CeNPsW7rM7BzDqH1KgEL1cg,3217
18
- helm/benchmark/run_spec_factory.py,sha256=lchT8iltTIYrkJ_uGAQkS5gmu9gvrZ-mVIkx2KhR10g,7728
19
- helm/benchmark/runner.py,sha256=zlHDJ2Ys5-HxtXcwpkXcrdfXy_i886fBcq1iNeLyC3Q,14669
18
+ helm/benchmark/run_spec_factory.py,sha256=Hxeft3fXoWNz9yGo-2nIfb5pd3GDWlwYWc6YYvAkTjM,7785
19
+ helm/benchmark/runner.py,sha256=O-91eRRrNgE4_tlCVeLq9_0QsRfNELvaQT-KWtJw894,14618
20
20
  helm/benchmark/runner_config_registry.py,sha256=2gW5wBLkHdYb2WNbZulto06hTcto2ROvjy8HULw3jNM,515
21
- helm/benchmark/server.py,sha256=_L5rb3NqtShQMkpESPKpo04KcMcRzR_ZQkWFokAb-nU,6344
21
+ helm/benchmark/server.py,sha256=uphh9L0FQnVZVVoGx50MMb_jXh-uen6ouE3uDN5GKFE,6422
22
22
  helm/benchmark/slurm_jobs.py,sha256=eNCAoaWDfT0Wk32ZJRIGo-x8kgjhDPnPB4Xrvw_eLB0,3225
23
23
  helm/benchmark/slurm_runner.py,sha256=RjmwMqMdKwOzd9B2S6fkuSqB2UjybmiSRVjraiLtzgM,16567
24
24
  helm/benchmark/test_data_preprocessor.py,sha256=_esdtkqyU_8Yp5ZOO7n1b-Y4Qc28wpD5drG-4Y4UhIM,2219
@@ -37,13 +37,13 @@ helm/benchmark/adaptation/adapters/binary_ranking_adapter.py,sha256=dvwirvz4dRzJ
37
37
  helm/benchmark/adaptation/adapters/chat_adapter.py,sha256=1Pf2XgdtrqAxbZPkUfw7TUH2lrulYoDTkC8Q0sckQHA,1852
38
38
  helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py,sha256=dhDZANH5lyL5VdR_Ks72cNlP-NHbJqThZVP6xKHmXaE,5034
39
39
  helm/benchmark/adaptation/adapters/generation_adapter.py,sha256=LI7uWpKIHvTUjGiygmjB_1HLk26vNkYYCBWIx0EEyL4,2180
40
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py,sha256=QckQyDe_BvEj3sOZ65UEqR7rMcOVPEq7MREeE7DHrjA,15031
40
+ helm/benchmark/adaptation/adapters/in_context_learning_adapter.py,sha256=8LepCkI5b0MOL70pRPGb7vEH0KFMxIlpCQIVIzQT_vE,15030
41
41
  helm/benchmark/adaptation/adapters/language_modeling_adapter.py,sha256=u_GFEgg5wmpate-s5U5aMsmcHuFmreJcA8J0TO1kPCc,14907
42
42
  helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py,sha256=-fY4mvzoGCCoR0HesT_xf2U2m2arVjgDuj59lm07_tg,1923
43
43
  helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py,sha256=lzmHwvDOHWl9IWC3NTLGfJDbduXtK_zrS2_YoUQmdc8,4464
44
44
  helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py,sha256=RV6B3i5juBbJCtPDWzSfma49YXeDq3vQAQ5xQwnH-cA,3282
45
45
  helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py,sha256=hhH9ehK092j1WdUwrKYSy5PvNJ73gsIu6-5W8aLoYVI,2190
46
- helm/benchmark/adaptation/adapters/test_adapter.py,sha256=0-JrYnogZu4kENQG1eQMXHWnuSurCLRbkLpDuSnfRqs,745
46
+ helm/benchmark/adaptation/adapters/test_adapter.py,sha256=7Nr6kMK3JN0UjMjjZ6P1fsD5xhOeaqh0D1xI6LFKCos,641
47
47
  helm/benchmark/adaptation/adapters/test_generation_adapter.py,sha256=Iq5q0HpBHrI3d2SodI0OwQ-COXuM7KvCjlBk_zNguNI,12868
48
48
  helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py,sha256=HASZNtKXYWOOIMKVe16yokWNfCNJITJXoUhDLVkk-FQ,8048
49
49
  helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py,sha256=-412yPKMylDMDXpbG-SlssXEjZlr3dshecrTFZoE-wY,11942
@@ -56,39 +56,40 @@ helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimoda
56
56
  helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py,sha256=6nuz0Vn89A1mOedutsiq2SwTOG3qn8dUZTiaXhKffiw,3587
57
57
  helm/benchmark/annotation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
58
58
  helm/benchmark/annotation/aci_bench_annotator.py,sha256=SjXidlbpm5HOhdhNXg3HjabMEQvt3hq1iJ5GPajxt8M,3228
59
- helm/benchmark/annotation/air_bench_annotator.py,sha256=wC9LKP_I8XX-Qa9Na5OQes4QOYmJoVN9fi8tcXPlKuc,3500
59
+ helm/benchmark/annotation/air_bench_annotator.py,sha256=Xvqzf-f29dzLGuAMeNiQe_kSkMbXEN1_U1LwCAn6nJQ,3500
60
60
  helm/benchmark/annotation/annotator.py,sha256=__BkMVpAEpSs1pbwPK5sVWLdCAXnjsHcPYgmOqmNPu0,1843
61
61
  helm/benchmark/annotation/annotator_factory.py,sha256=8uo5uz1UpIVCHUd7CRvmy6b9XB1gspdHmgxH5UZMPVI,2335
62
62
  helm/benchmark/annotation/anthropic_red_team_annotator.py,sha256=4hob15m2k9e2A97E0aG9FstCbJ_oMM7-9y-nh2EaYqc,2395
63
63
  helm/benchmark/annotation/autobencher_capabilities_annotator.py,sha256=TkW3xbcEuaPeGwuFrlu0YNSmj896WarmVT0WYL1it_E,4913
64
64
  helm/benchmark/annotation/autobencher_safety_annotator.py,sha256=w_xjZmY1zuLjVvVbcbUygNvqcfn5dtwpXeV99yqm9aU,3914
65
- helm/benchmark/annotation/bigcodebench_annotator.py,sha256=_p_keqJ6WwOGP7wTfNFY_zAADN3HUHHNpb8QenEkcQE,4449
66
- helm/benchmark/annotation/bird_sql_annotator.py,sha256=mYK-2LeMzo9RrWzZFqaIRtn-1VyOe4ArCqBqtF6RAD0,2443
65
+ helm/benchmark/annotation/bigcodebench_annotator.py,sha256=CJG2pn1DeHJCp3yHETRquNIkCHfd6ZNuOiUjG1cQ_JY,4448
66
+ helm/benchmark/annotation/bird_sql_annotator.py,sha256=FQDZs1-O1jfJOET0eDeU7lf5xLaiMPohC5BdmQ4XkzI,2436
67
67
  helm/benchmark/annotation/call_center_annotator.py,sha256=pTEjwfA4tgZhroFbamoQ8IO_D1O9r6k5GIlD50JEg5c,11601
68
- helm/benchmark/annotation/chw_care_plan_annotator.py,sha256=LdY1GBQsU6O5z4KsVyan5z38vS6sNqpQak6ZacMmqfk,3073
68
+ helm/benchmark/annotation/chw_care_plan_annotator.py,sha256=6ybNBvJi59i0cpAhI_fLwXoSnqhAH6m7Lo6ad_PufBs,2966
69
69
  helm/benchmark/annotation/czech_bank_qa_annotator.py,sha256=YIH5g4zHe3BQF2Y-6uRVw7g9u_SPBncqBobdvZdIzyA,3096
70
70
  helm/benchmark/annotation/dischargeme_annotator.py,sha256=Z6xnUK1cNrFco9x0w8B_qhlLOEZrzXBwT6TKZPKoPBk,3676
71
- helm/benchmark/annotation/ehr_sql_annotator.py,sha256=q99HGDcnG7_YcU47nK4Yi6ZoykURCNDWW6wIwQa5lms,4028
71
+ helm/benchmark/annotation/ehr_sql_annotator.py,sha256=Izpq0biZ9lkJOPk6NwTuv2wk8Bg88vj56BKZrY8XhT4,4021
72
72
  helm/benchmark/annotation/financebench_annotator.py,sha256=gNERLY35t2kcpayXGGrY4-pBs2jbEUomqElRYbb9nho,4150
73
73
  helm/benchmark/annotation/harm_bench_annotator.py,sha256=zhkWnV3qZgY-nvHgQRHGrrCMC7605JwFHesY7UC3ZnQ,2293
74
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py,sha256=ufvfXMTKcuk11Hfe3C7bEAyoqlqbrrv8D3hBEN3na6I,6057
74
+ helm/benchmark/annotation/helpdesk_call_summarization_annotator.py,sha256=I7TjpN502Sa-Z4uUKemJXSAdOiVA3MMO92YIAAXeDBg,6034
75
75
  helm/benchmark/annotation/live_qa_annotator.py,sha256=8DXsjwmeSyvC0kfp1uYds4cwpxqzF7FcskeZaXxXiOw,3552
76
76
  helm/benchmark/annotation/med_dialog_annotator.py,sha256=OVTFIlvdhcOr_hdK0tnrDes9hYdN1mDWFTp4GDYY7O0,3162
77
77
  helm/benchmark/annotation/medalign_annotator.py,sha256=8edAZh8oQgDKUT1bQ3Hp2NBE-QnBZ_-ZQjHkV7YKWhs,3240
78
78
  helm/benchmark/annotation/medi_qa_annotator.py,sha256=v8e6hkHZX1x9KtTedCnpCseh-Y72z5kUgUrXHWPUkX8,3074
79
79
  helm/benchmark/annotation/medication_qa_annotator.py,sha256=uZ3VpJ0nsDyF70_kn8kSSBPr4OlfiNdZC7q8wq_jJFE,3090
80
80
  helm/benchmark/annotation/mental_health_annotator.py,sha256=JwgSeXtwf4KFZxNtAxsnqdLJQSvP-F-ZoCcCWdasrMQ,3275
81
+ helm/benchmark/annotation/mimic_bhc_annotator.py,sha256=pwwniNlu5VTa1ZdyO0KFcMFZcpqM5CjguujgSpEGslw,3174
81
82
  helm/benchmark/annotation/mimic_rrs_annotator.py,sha256=zABO1FJH9pOFhUe5vc2B-c14Hf5RsuU9jQAGiMg6G0I,3204
82
- helm/benchmark/annotation/model_as_judge.py,sha256=G6mDrbxNp4roC-smrhqZb5swt18Coa9b2-aJMPOaGuM,12116
83
+ helm/benchmark/annotation/model_as_judge.py,sha256=FIJOUzIhf2QpxqFf6hjgAM5hPEm0VlXzB-jiHJUrPDs,11985
83
84
  helm/benchmark/annotation/mtsamples_procedures_annotator.py,sha256=qqWHY2HfCwMP5GqvObS3JpMIYVs4yyITCsA1B7lcDks,3201
84
85
  helm/benchmark/annotation/mtsamples_replicate_annotator.py,sha256=TUxNzJcItErsw0gw76hiKZAWeQTNHGHnC0qf-_CGeF0,3316
85
- helm/benchmark/annotation/omni_math_annotator.py,sha256=fAgABWlSEs8jnmNbd8RWbU7KNBP-a32kqxTWirs229Q,6207
86
+ helm/benchmark/annotation/omni_math_annotator.py,sha256=PvZZb1oGw60qT-oHRIs93AZbh5wTbpsmD8BforudFhA,6144
86
87
  helm/benchmark/annotation/simple_safety_tests_annotator.py,sha256=if4S8MaENr1HZ42ZsOjDPXZ-kJ0p4l4B2j9m994RuxQ,2140
87
88
  helm/benchmark/annotation/spider_annotator.py,sha256=B48ylGg5J7xuTSUio7VztdXk3lI6ilMqrUvAD-ve0sE,621
88
89
  helm/benchmark/annotation/starr_patient_instructions_annotator.py,sha256=5jU-dK_0OvB_jXNLDZtQ5E3gaSUcAxFNzv6prA17eAg,3186
89
90
  helm/benchmark/annotation/test_annotator_factory.py,sha256=ifv5hxSbFe113AHeXLqTPkVJ-C2PW_gb9L3a0SHNi-M,986
90
91
  helm/benchmark/annotation/test_dummy_annotator.py,sha256=LfY1ErJDUJ7rD8JUy92RUDD1b91jUs4Nk8Gvope-Z98,1644
91
- helm/benchmark/annotation/wildbench_annotator.py,sha256=sk_GJnPeaIBC0frV04XNq3piOG7Hikn2bCF-_DqRe2A,5488
92
+ helm/benchmark/annotation/wildbench_annotator.py,sha256=OXR59zdKw9W7v3Q_sFnt1cEPN3nOzQDVqSbh4jDbEUs,5457
92
93
  helm/benchmark/annotation/xstest_annotator.py,sha256=arL5DyA_nYkiSCAtl6G7MliZz5ZYRsyc7xQJNu0RBcA,3604
93
94
  helm/benchmark/annotation/image2struct/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
94
95
  helm/benchmark/annotation/image2struct/image_compiler_annotator.py,sha256=iWqPDXscrXDkmzRGDg0o6ibmDVo5bQqvcWxZkr6P-d0,3620
@@ -125,35 +126,35 @@ helm/benchmark/efficiency_data/inference_denoised_runtimes.json,sha256=ios_dt-_8
125
126
  helm/benchmark/efficiency_data/inference_idealized_runtimes.json,sha256=5w7reeZc0yc4cjH8kJGxQQSoe8yaRVX2SSlSrx0QWFQ,12348
126
127
  helm/benchmark/efficiency_data/training_efficiency.json,sha256=aH2moiBLStOLVi8Ci2KTK5ZkWlTBLK-B3fRfNZwhoSg,9763
127
128
  helm/benchmark/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
128
- helm/benchmark/metrics/aci_bench_metrics.py,sha256=X1HCoGfoOzcNRsnYcfdqs50cy-hZcxJYDwWK69LcMuc,1324
129
+ helm/benchmark/metrics/aci_bench_metrics.py,sha256=fAuTm8Sr1vvyd7Tjcz9WWKrFkqrwCV-CiF6lqUO3dKU,442
129
130
  helm/benchmark/metrics/air_bench_metrics.py,sha256=VMNQDDEtz2CiK4U55lCHLz0b_DxHprTAZ1WtYtGXjcY,2282
130
131
  helm/benchmark/metrics/annotation_metrics.py,sha256=JbXNleQsPJVF2uc1xXgUW2bzvJqwLPZyhnndqc6THv0,4268
131
132
  helm/benchmark/metrics/basic_metrics.py,sha256=d0iwYnwrbF7w7CFtazx8vPIsZnj51U2PVVoscCb-HJA,20495
132
133
  helm/benchmark/metrics/bbq_metrics.py,sha256=GeZhSSJzqGD0e5EAiRHitIC3XtPICF7rDI6GfeYQc8E,6201
133
134
  helm/benchmark/metrics/bias_metrics.py,sha256=8qcInRJwQsuCI-lMC1umd-ZZaYvorUPrMjnuC6vSeb4,11602
134
- helm/benchmark/metrics/bias_word_lists.py,sha256=mx5JjW3mHffXIqo4GcQN-zENUEttBqQnEjPTz3J3J_4,13909
135
+ helm/benchmark/metrics/bias_word_lists.py,sha256=eyk6we2J4SW8ZaZxQUWLB7Yapn92uM5TCekhFB5vg-U,13908
135
136
  helm/benchmark/metrics/bigcodebench_metrics.py,sha256=JcPZrSiHR-kxT-MFM8zXqOs6wTC5Hus3TbxuHFQVZow,860
136
137
  helm/benchmark/metrics/bird_sql_metrics.py,sha256=ooCuXW5nPpRs_-4seCONQmn25DzTbcUgGXznXTK9y0Y,1153
137
- helm/benchmark/metrics/chw_care_plan_metrics.py,sha256=E-TNRngWb_q7vVKs4QN8AfcpJndaKXd-XX0Ggxt96Qo,1341
138
- helm/benchmark/metrics/classification_metrics.py,sha256=TjsD7RjkIn7VpTrWtt88aJUsiGVtlv5N06FJl8VaJ-g,8858
138
+ helm/benchmark/metrics/chw_care_plan_metrics.py,sha256=WOAdwuF4vusZhjaXSAB3r7PD_ZxeNmVu2oAmOqzVLtU,460
139
+ helm/benchmark/metrics/classification_metrics.py,sha256=1Xa_bO4PqIAV2iZitE69kc4VKS4A7PloG5ElZAgvmh8,8851
139
140
  helm/benchmark/metrics/cleva_accuracy_metrics.py,sha256=1eDxHxVk-JW1mF9SBcuplIefAoi_edUwKpp-XxYbmeU,2740
140
- helm/benchmark/metrics/cleva_harms_metrics.py,sha256=PILZDbVOeUflCFbs_6cE-3qaBt5vwL8R-BirbB2jTn8,11278
141
+ helm/benchmark/metrics/cleva_harms_metrics.py,sha256=xVubv2pG3iinVs3namoVHWAmV9oUPywZwFB_0JGhP_w,11277
141
142
  helm/benchmark/metrics/cleva_metrics_helper.py,sha256=8UwiGhekUmp7DxYWU4rxqX2v3ewkg-O5-jOh49iOGmc,304
142
143
  helm/benchmark/metrics/code_metrics.py,sha256=SebQ5MXJe_phTiMfGMfhgYago-hwh_g9ctBWEHGqCnU,5230
143
144
  helm/benchmark/metrics/code_metrics_helper.py,sha256=UNai154RuhYRZM_YK-rveLct4Ui5iEBNPYmYdKq34Xs,22712
144
145
  helm/benchmark/metrics/comet_metric.py,sha256=qOvwE0ov1plb6SwwT3CbX1XuSo4GJ-M3iRe98yMiMaM,4797
145
146
  helm/benchmark/metrics/common_metric_specs.py,sha256=JKqmO4ovBdfOYKC-00OSzOMv--g9NTCVfUHLaz-1Uns,6025
146
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py,sha256=q7s6B-O11jVrRSAZDSoM3PzhksXCsoAwIZkPOXkiVFY,2663
147
+ helm/benchmark/metrics/conv_fin_qa_calc_metrics.py,sha256=Zrf6HyH_WNe7gGFgW0j8FJlX5KZvbk-05iX8QFPJDyU,2656
147
148
  helm/benchmark/metrics/copyright_metrics.py,sha256=_Lp7sKWgacY_13kFadNfnhrM2Ks8syBXnUW7zYuJkwo,7817
148
149
  helm/benchmark/metrics/czech_bank_qa_metrics.py,sha256=bKoooK2T5v_fFKNbUnsuW6Mv9muAirJD5lTrzuHfpz8,1113
149
150
  helm/benchmark/metrics/decodingtrust_fairness_metrics.py,sha256=x66XP0iQGk4ThT7ddmrlLCA0XF4arRbQMDT42LHf2kE,3297
150
151
  helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py,sha256=TxTkkWdx6d6ym0MirZTiucl_TWFdn4uJLnlTfLjQvgk,2925
151
152
  helm/benchmark/metrics/decodingtrust_privacy_metrics.py,sha256=OU7lka-hm6PubR5Gjj4uNyrqhjlfhe0mmjBCAz9vlRs,3456
152
153
  helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py,sha256=BKDD3lblqT6Ebi5kEC4zbN_OvQwD1SdEtBv5Wf0kzWw,6460
153
- helm/benchmark/metrics/dischargeme_metrics.py,sha256=Z5EOn6uvrOZCqUQeBK_mGWTzOCqJwewh4SAndeCn9CI,1336
154
+ helm/benchmark/metrics/dischargeme_metrics.py,sha256=D8LI52E17hNSPDpEvb2tw1za4QWDE3p9xgx7Nm9l7_Y,454
154
155
  helm/benchmark/metrics/disinformation_metrics.py,sha256=5n8wgRBb6FaDjqe1nR3Cj9aS48esmMsIUq4KpBHoQoU,7870
155
156
  helm/benchmark/metrics/dry_run_metrics.py,sha256=Ss0lzf944HIbL1CX6QuJpGFPqOzhBT0qVWLNR1BoEjk,3784
156
- helm/benchmark/metrics/efficiency_metrics.py,sha256=cLnPCvOzbUETOJh-lu65iNgYwVOOZAJO_s5iTUAd0MI,11852
157
+ helm/benchmark/metrics/efficiency_metrics.py,sha256=SJqpA1d_GfBPl9H6moai8ra1GVe7tlaCfg3PeiWT54c,11845
157
158
  helm/benchmark/metrics/ehr_sql_metrics.py,sha256=YRjvPIty7zlyoyGD6wo3HYOz7y_PThySOZzVRJ38iww,4797
158
159
  helm/benchmark/metrics/evaluate_instances_metric.py,sha256=LGk1Dv_76Ak0YUlWKFTsOLEFiBSmcGVhNrbj_4zg9g4,2913
159
160
  helm/benchmark/metrics/evaluate_reference_metrics.py,sha256=t0251_2aA0CrXB8oUBKlPRgPl-xYjzdVhLcGjwuhOgo,19621
@@ -161,30 +162,39 @@ helm/benchmark/metrics/fin_qa_metrics.py,sha256=MtXxGMGYiCiwCD1CclBXPopzly-Tz3zJ
161
162
  helm/benchmark/metrics/fin_qa_metrics_helper.py,sha256=sH5FIpsxxGUkXO21YGS2EtVsev1EdQ44lYoqFZPSSGo,11884
162
163
  helm/benchmark/metrics/gpqa_chain_of_thought_metric.py,sha256=HRRKkcTbCu5ScOVwmjzYaA7UAEGE_AJUZVOCDRuv4Po,4321
163
164
  helm/benchmark/metrics/gpt4_audio_critique_metrics.py,sha256=L9tGFwvl1-Ew3MdInQ7KPa8OlI5YexIB2KuCYVYsuPY,7023
165
+ helm/benchmark/metrics/gpt4_audio_refusal_metrics.py,sha256=vYPRJq-4uNhUWUWMrDkpHmfIBkhEyAgaMNEI6RKPP80,5896
164
166
  helm/benchmark/metrics/gpt4v_originality_critique_metrics.py,sha256=1m7IWy9vu66svnmdBRjZQI-2YsGYzH2vXZMptlRGM0Y,5654
165
167
  helm/benchmark/metrics/helpdesk_call_summarization_metrics.py,sha256=9-kB3NeBacI6nxs2oQ7Km_1SHyiz98UVZuR8PAlvCHM,1442
166
- helm/benchmark/metrics/ifeval_metrics.py,sha256=iYj-880nHHXECC8t8B93f1LZL9e6PMB-M0nxRdRBZcg,2572
168
+ helm/benchmark/metrics/ifeval_metrics.py,sha256=4_Vp9bNnrctKtv6xZ1RpvBstPAZPwv1xiohH-ogs99U,2565
167
169
  helm/benchmark/metrics/instruction_following_critique_metrics.py,sha256=RR9cMIG113oXUnBjU_denn7DaCGB11k1oGtQ5dQON3M,9874
170
+ helm/benchmark/metrics/kpi_edgar_metrics.py,sha256=1GsW-nBz8TgP4wFIVEGA4_BhI17kihmk96zuLpD4NZc,4636
168
171
  helm/benchmark/metrics/language_modeling_metrics.py,sha256=yS7k8iFjxfkckSBA0RVA7VdOivSEBtNzCjczK6We7y0,4598
169
172
  helm/benchmark/metrics/live_qa_metrics.py,sha256=f2XFmQaohjQNqYqNg8NcDVavCzyP4cd8Cl8rLArn9EM,816
173
+ helm/benchmark/metrics/llm_jury_metrics.py,sha256=yzAsdacyX0MFJy2qKIjhI0y7JvtflELpCh6R14wuCgk,1704
170
174
  helm/benchmark/metrics/machine_translation_metrics.py,sha256=22vaGBCSw12uM1wmtDG-MBBZW8OiTZwNPaerjckdtDE,3860
171
- helm/benchmark/metrics/med_dialog_metrics.py,sha256=uDa4xQkU_-zv6WVC1gzoW6YVC1lSt7bzSD9kN0cmkE8,1329
172
- helm/benchmark/metrics/medalign_metrics.py,sha256=TVqkqlQYuSePt_pG1jAJHhulrAhODFI2-hT4-RWhUkc,1321
175
+ helm/benchmark/metrics/med_dialog_metrics.py,sha256=kzmrkQcmJ15zuOF9_Onk9N0oeNeyl9Rri1JEb1AqRT4,447
176
+ helm/benchmark/metrics/medalign_metrics.py,sha256=q6l8p5Pie-H9pxhaA-lQkSOnliJWXr6zUeN8syEQ91Q,439
173
177
  helm/benchmark/metrics/medcalc_bench_metrics.py,sha256=9wZgg20-9QBNk0_XhuwR3LT940fqDPkCM4Kl0dPkbAs,5353
174
178
  helm/benchmark/metrics/medec_metrics.py,sha256=hNBOGX52G_QOmgTCp9LnIMrmGSRxbb5vgjxKU069TMQ,4152
175
- helm/benchmark/metrics/medi_qa_metrics.py,sha256=e8nZ7aMLgg7r088AQ7DBadQsncqCagkxpI81_X88qZA,1314
176
- helm/benchmark/metrics/medication_qa_metrics.py,sha256=2UEH1DbcrDzWpTfLgKQC68_AhGhJ_igACvYWO2yvspo,1338
177
- helm/benchmark/metrics/mental_health_metrics.py,sha256=T8Y2lQo0IZvDvsK7G30nnIN9djj_GlGK2CFMuFRBSBA,1344
179
+ helm/benchmark/metrics/medi_qa_metrics.py,sha256=JWAEMuT0UXDZrb7qHn13W6W79ilbprk492V_9vWrB4s,432
180
+ helm/benchmark/metrics/medication_qa_metrics.py,sha256=wit3nKNWpGFfgauu6Xye2IDTePAS0VHAQI_7OO9HR6M,462
181
+ helm/benchmark/metrics/melt_bias_metric.py,sha256=mHDCkRGLD-0pyJA_depi_KX3sn7g7Bgd3_m0XdLQahY,11520
182
+ helm/benchmark/metrics/melt_bias_word_lists.py,sha256=xA0araUdszAIOqfxiTi6MIJhKYwr_Gwsc1L9qinZx9U,27891
183
+ helm/benchmark/metrics/melt_metric_specs.py,sha256=zaeV57LQEl8qK7be36NaojiUJlzmkoKY8JyOkOVuPqs,1619
184
+ helm/benchmark/metrics/melt_toxicity_metric.py,sha256=8HxViwOJCAZ-luE_Br55xUfJn5XAVXg6lqcAUsP0GT8,4187
185
+ helm/benchmark/metrics/mental_health_metrics.py,sha256=4HXCXl2GxFPn6wDzHptHeBTuP4BJVLUzEUKffpd5R_k,462
178
186
  helm/benchmark/metrics/metric.py,sha256=jqQyiKDq_pQv-ulGqfZI56ydRDQs3N3XhfHIPysUhrk,14311
179
187
  helm/benchmark/metrics/metric_name.py,sha256=POhgmUqqIWh_LjCbYpiKkzGqqChBLeW3FADy9u_FcWw,1354
180
- helm/benchmark/metrics/metric_service.py,sha256=mlX_MEFSYNzME6GFS3El_VVOvzPYnOMosKI0XIxygP4,1802
181
- helm/benchmark/metrics/mimic_rrs_metrics.py,sha256=YPU0cwH12L0VqdLhXd12P-eKUDqn39z_sLFx3YdGrP4,1324
188
+ helm/benchmark/metrics/metric_service.py,sha256=bJaM7GisEgSWR3vPTcg7b67XF9X2K5viODacIgbGb24,1692
189
+ helm/benchmark/metrics/mimic_bhc_metrics.py,sha256=da1YYrE8fL3YHeIJ9hf4WCKZtuj_8cksm3rJ24rcy70,442
190
+ helm/benchmark/metrics/mimic_rrs_metrics.py,sha256=x3vSj1VG1UkNF3gbgJYDeA4z-crxfGIkK7iZo0xjq8c,442
182
191
  helm/benchmark/metrics/mimiciv_billing_code_metrics.py,sha256=Pu9efXoBrhsvxSeGHqwbUA5k365-pJTeXpMNhmcg0L0,3927
183
- helm/benchmark/metrics/mtsamples_procedures_metrics.py,sha256=HfmNYyqHplEEM-ABzuMSL_vX92gFrZchO2DITl1Ukiw,1379
184
- helm/benchmark/metrics/mtsamples_replicate_metrics.py,sha256=A0Ir6B0f99SwCf5KBGGUBFXCqV1Jo693BsYU_wIN3Ws,1374
192
+ helm/benchmark/metrics/mtsamples_procedures_metrics.py,sha256=XrddVk-gnc8jF8amCI1RBa_XTS9yEXD2Y9Ld9W7Q-m8,497
193
+ helm/benchmark/metrics/mtsamples_replicate_metrics.py,sha256=rmH34aTX_wZWxLi4jrxf3sR1RIqNRF0QDANLRQUGhqM,492
185
194
  helm/benchmark/metrics/nltk_helper.py,sha256=QMEps-lqJZ_pCgvjlMf4BvC0pzDu3ez5jit5F4p8dAk,1313
186
195
  helm/benchmark/metrics/numeracy_metrics.py,sha256=3E-CMmB2wuGW5tLjmEm8wFMf85DJ1ZDUANfh84SQuP0,2906
187
196
  helm/benchmark/metrics/omni_math_metrics.py,sha256=Gqih87UrE93-a0hbRhTBkjmfGLNTkuKQGaG-sTQeuG8,1287
197
+ helm/benchmark/metrics/openai_mrcr_metrics.py,sha256=TAop7G50FKaR-Jyo2EGLqmMOfJRmS2vNRDFiifa6mhg,2313
188
198
  helm/benchmark/metrics/output_processing_metric.py,sha256=ey9UBi2f3780OwFlp82ymzfjLR3MA2fpA9vW5R4W5TA,2581
189
199
  helm/benchmark/metrics/output_processors.py,sha256=ULZlDBOf6NupAXzDKBKyTDdgPZ5PSxOAlOYTbrQEek8,472
190
200
  helm/benchmark/metrics/paraphrase_generation_metrics.py,sha256=771CjpW5Ek00OCaCFfEsO6Cdy9eZb1fMlgWASvQgiK4,2025
@@ -192,14 +202,15 @@ helm/benchmark/metrics/prometheus_vision_critique_metrics.py,sha256=pexBbEFF3-bz
192
202
  helm/benchmark/metrics/ranking_metrics.py,sha256=hSNKy4h7zRkGYSgo6RWt4PXQztA5ZX1PCJorVqpCvpA,17457
193
203
  helm/benchmark/metrics/reference_metric.py,sha256=hseI7A16SOC8ymYZYFCL6nxnyxn0q9_Gywuvb1r9FLE,6092
194
204
  helm/benchmark/metrics/reka_vibe_critique_metrics.py,sha256=CwzzQ13bBT0r_o75TqFj2Zr0ST9vzQi74K_ezWTnLCU,6568
205
+ helm/benchmark/metrics/ruler_qa_metrics.py,sha256=OuiA0ksByl0Tw1Oal7zbedhKjTrhJgQJDLXAgoTLXuc,1473
195
206
  helm/benchmark/metrics/safety_metrics.py,sha256=oARko_EwVnykBKYxi-w3ytKme4qcb1waz_0N2GKbSlg,3348
196
207
  helm/benchmark/metrics/seahelm_metrics.py,sha256=egRkeXnnb8Nqi9qJJMDXJRSl4NK6WvdUxAc_LffBips,6964
197
208
  helm/benchmark/metrics/seahelm_metrics_specs.py,sha256=cx8p4kwTuEOWxZioK9CVoeTNJT0fZjxRy_6_EM9F394,452
198
209
  helm/benchmark/metrics/spider_metrics.py,sha256=RSrFJoA5SNcNxfmgVqCQixcSLrfJBYuVQw5jsfrc9Xg,189
199
- helm/benchmark/metrics/starr_patient_instructions_metrics.py,sha256=lylCQ9rj1Y990bWr_h_BfYrikGGTuejxptTRZoXeaKM,1407
210
+ helm/benchmark/metrics/starr_patient_instructions_metrics.py,sha256=YHdTeIFdZxRbvqBnlWpAyIsWzZyWAjjDFuKOXhHYiSM,525
200
211
  helm/benchmark/metrics/statistic.py,sha256=ATuOm0jU3L-0ELiZaF2GVMNF22W66-rMvzxRtlfqcII,3446
201
212
  helm/benchmark/metrics/summarization_critique_metrics.py,sha256=-mki8-zvZx54dQg8X0BG2Y6wmfypQhkIuD_9ZjNBl78,4782
202
- helm/benchmark/metrics/summarization_metrics.py,sha256=LNLGFi4DAKJEL0P60rnPlS_-yLMNLUprJbuJ6VsdL0g,16842
213
+ helm/benchmark/metrics/summarization_metrics.py,sha256=FJCdGRmlCJX5A-AmbtpGGlGRfNgg5Z8Bo0d9yFiE33E,16876
203
214
  helm/benchmark/metrics/test_bias_metrics.py,sha256=qEZsCULvwjVdIyfNgJSc2L7Xp9suKKW7L5OuQmGrwZ8,6393
204
215
  helm/benchmark/metrics/test_classification_metrics.py,sha256=CRDMGmVmzEUnNaM0C02qUTOU2AS11Mt2-GdEl89y7lw,9541
205
216
  helm/benchmark/metrics/test_disinformation_metrics.py,sha256=U3ZmS9s33oimTQbKO-7pgWeX_WiDB9chlOCtf_vslXw,2249
@@ -207,9 +218,9 @@ helm/benchmark/metrics/test_evaluate_reference_metrics.py,sha256=B7xtDDWPAxF7d-v
207
218
  helm/benchmark/metrics/test_metric.py,sha256=0sGlXE3_Al_VyKpOPBhQR_xT-XrcVgGepLpwut37DmA,771
208
219
  helm/benchmark/metrics/test_numeracy_metrics.py,sha256=ls1ZIHDePKpHMoqAbf4HmJ1SIBjLFuLIzGbfg6OiZvM,4162
209
220
  helm/benchmark/metrics/test_statistic.py,sha256=yK6m2BZ5UXWmb2D1cQzDH_2ELvrNDaR_lyzX4WoHw9Q,1273
210
- helm/benchmark/metrics/toxicity_metrics.py,sha256=SkVp91-LnZMa5ouEspcYa-PYvPGMi4H_bU3uuc6ve5I,4115
221
+ helm/benchmark/metrics/toxicity_metrics.py,sha256=ZLOzxDlMgbljl-9y6vT2ZgwdhsBZ4MfV-T66VpKk00U,4114
211
222
  helm/benchmark/metrics/toxicity_utils.py,sha256=-bfittLtMkHyV5wu-hj6KVtaiNGgVIO5duUmThBlX8w,988
212
- helm/benchmark/metrics/unitxt_metrics.py,sha256=OBJ6Ph-4mpAOAEcl4loBemcEQjz-UrbmPVnWmzifhG4,4863
223
+ helm/benchmark/metrics/unitxt_metrics.py,sha256=8fawxnrg0xsAe0xO2wbL7S_yisj8RzJnrn6xtk8C6q8,4852
213
224
  helm/benchmark/metrics/wildbench_metrics.py,sha256=sY7MNTzRlJJK3yph3rCijgbMaajtLyCCquThlsoE5wU,1380
214
225
  helm/benchmark/metrics/ifeval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
215
226
  helm/benchmark/metrics/ifeval/instructions.py,sha256=qNoa1vMPDNz6ORWfyMv_efwKZ4U5zkI-cf4aApyfSqU,53247
@@ -251,7 +262,7 @@ helm/benchmark/metrics/image_generation/watermark/__init__.py,sha256=47DEQpj8HBS
251
262
  helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py,sha256=Ir4u8blJWTRtEBogb6u22qCy3JXAIzvx-Th6dSBLfdw,698
252
263
  helm/benchmark/metrics/image_generation/watermark/watermark_detector.py,sha256=w6WnTc6t6zx0W0gTjgedXC9OO5dq5iWpx9UcnioKml4,3641
253
264
  helm/benchmark/metrics/summac/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
254
- helm/benchmark/metrics/summac/model_summac.py,sha256=nDB8sAJNnQ5TkBKoQBGUejFK6ynrRiaV4oyEKcm3qkg,17488
265
+ helm/benchmark/metrics/summac/model_summac.py,sha256=82S9BpPJENr_jiY-cNubECEhniu5Y3Arzv7AXK93PmE,17442
255
266
  helm/benchmark/metrics/summac/utils_misc.py,sha256=7_Q1c72cKt8PWtxn8u4R8nB53HK6_JF2nP8bBXYNk-A,1485
256
267
  helm/benchmark/metrics/tokens/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
257
268
  helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py,sha256=XDZGK8h84F2w_pK8Zjko8ssKZmVxKFqTOuHL0mLBzMY,694
@@ -261,19 +272,19 @@ helm/benchmark/metrics/tokens/free_token_cost_estimator.py,sha256=PiraoV3WtAYtcF
261
272
  helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py,sha256=sa7Cu0S9IPF35puSVU-gYnLg1uXEZYAdRyKmCc-_5ss,1549
262
273
  helm/benchmark/metrics/tokens/openai_token_cost_estimator.py,sha256=CovkJ4zeVn89bjno2gP0K8ix_Ie0EC2tUJLHLCEl378,1427
263
274
  helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py,sha256=n9f2rcgaNHROORvSYjULXC_LEA4KZZjs8wASk0vAG7o,1100
264
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py,sha256=Gk1ihZsT-EhcTyMsbmNWPP2Z6FlS4nRYOpq7v41f3j0,2657
275
+ helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py,sha256=eDooaAAtkmIGGbK672Db9simp2soXXr5GiEG3hEQBq8,2649
265
276
  helm/benchmark/metrics/tokens/token_cost_estimator.py,sha256=fTGUfhHV6yMwpTkCEMTGMxKO8jskqJz4sAtwXT6M_C8,425
266
277
  helm/benchmark/metrics/vision_language/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
267
- helm/benchmark/metrics/vision_language/emd_utils.py,sha256=KdZdcqu3eo016FdAjAm_83v92-wWuR90EPsTogfTcok,15196
278
+ helm/benchmark/metrics/vision_language/emd_utils.py,sha256=nVqQ7oosjKjhpR5YPPvO4ssB92bGChgODOtsqMYVEpU,15230
268
279
  helm/benchmark/metrics/vision_language/image_metrics.py,sha256=RgKAn7ftl4KCZ86V3zO_LUstNbc6Lla-0hdQq77JDXw,23841
269
- helm/benchmark/metrics/vision_language/image_utils.py,sha256=4E0NYh09O6-5sGhAPo6KZqYaZfBpCtuYbD3vLt-wQzk,3755
280
+ helm/benchmark/metrics/vision_language/image_utils.py,sha256=xwtydR8-s23cJacIGXDXL_pUhAqi6O5CbhM4XNEFlDo,3787
270
281
  helm/benchmark/presentation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
271
- helm/benchmark/presentation/contamination.py,sha256=PiIdcaD3-xfExjOmyL5q4Ao2ASa-OlScJAB9u1Zxe7o,2811
272
- helm/benchmark/presentation/create_plots.py,sha256=T6ewj8rXZfRlqg01bgbhdU1rsABK4xyrLpruhRG-7Fc,28691
273
- helm/benchmark/presentation/run_display.py,sha256=tC1DciLvDTQJog4BDo8StWDdX7DbBkhrG2sX_SwXSPQ,11838
282
+ helm/benchmark/presentation/contamination.py,sha256=07IuIP92vfuI0GwfeNC-i_NZUlF8N1azzagC19YHOMQ,2802
283
+ helm/benchmark/presentation/create_plots.py,sha256=m51mFsYD51Y1rbEQgwTbKZjCI3xQir437WyOS5z5k64,28916
284
+ helm/benchmark/presentation/run_display.py,sha256=LmY2HES4dU94kRYuUxt-c9LTMDN6MU5CspWTF6rZwDo,12419
274
285
  helm/benchmark/presentation/run_entry.py,sha256=J1QgLOP99N7N4bs7nzXWxyU3pOd-a1j8xwL9ag1nP_Y,1158
275
- helm/benchmark/presentation/schema.py,sha256=cblGmgkhuqQRWPh-IT75u3Il_-SVXipeq-mh64lvgWY,10947
276
- helm/benchmark/presentation/summarize.py,sha256=Y4rNMgnQYEwOKX8Syd9R0HybjnaW_tJQZcWF4ZFrHvc,59749
286
+ helm/benchmark/presentation/schema.py,sha256=gYlMysq_rIzQTE9I1K3mIC1fFjBdDe1yHqgwb4EIciU,10989
287
+ helm/benchmark/presentation/summarize.py,sha256=Xk5FJRnWz7xAbPu6JQ96TJ6Fvb1-xWUGBdfetrTsmbA,59882
277
288
  helm/benchmark/presentation/table.py,sha256=-foH1BIfMiD6YvpwoGJ910CH7Hib-_pYtHH1hE8zwNc,2904
278
289
  helm/benchmark/presentation/test_contamination.py,sha256=RlihBOF6vx2tKEj6_EMnJojTYoStx0FUeJSLT1bdf8w,509
279
290
  helm/benchmark/presentation/test_create_plots.py,sha256=5PPPegMTdBZurxyyUxI4rN13AVsjV3eQrwFqlobJ8UA,1286
@@ -283,38 +294,40 @@ helm/benchmark/presentation/test_summarize.py,sha256=GzZNwBDybpstzl6wT0Rgqn75N9i
283
294
  helm/benchmark/presentation/torr_robustness_summarizer.py,sha256=SmMOZWCQ-KaJBp78otwvAeE1btWignyWalaQ8QG87r4,8242
284
295
  helm/benchmark/run_specs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
285
296
  helm/benchmark/run_specs/air_bench_run_specs.py,sha256=K86SqpINMBOiLIpuHz-jwlQL3SrH6n6WbqjD90i4LQQ,2231
286
- helm/benchmark/run_specs/audio_run_specs.py,sha256=g6uncT8dIK59qU3aEcyvpPLiblx3Ks8yCUY6s8IMO9U,21957
297
+ helm/benchmark/run_specs/audio_run_specs.py,sha256=baJz5LZiwWZP3KD0hluKgpidtswzdorQnshX0CoqKAc,23383
287
298
  helm/benchmark/run_specs/call_center_run_specs.py,sha256=QhRQw91WblB9UaB319XNCO5K8PX8Riiza41Ym-1CcRU,7044
288
299
  helm/benchmark/run_specs/capabilities_run_specs.py,sha256=sbqhIj4AoujV45erwoVK61lWdlkjg4qssmGlu0eSr1U,12067
289
300
  helm/benchmark/run_specs/classic_run_specs.py,sha256=1NYeYIwC2F7EjkPEPxNoFb3Ap6BUcUJK_hxBKq4lzt0,56144
290
301
  helm/benchmark/run_specs/cleva_run_specs.py,sha256=lEIHEqQY3Efx-sl2Z6Rq9Qq_1HEWHqFYuUkZbGvq66s,13387
291
302
  helm/benchmark/run_specs/decodingtrust_run_specs.py,sha256=7slILDS9f0_Z0y-Pz5xEspoGQUmOCOI2K2r4XWUVsm8,14428
292
303
  helm/benchmark/run_specs/enem_challenge_specs.py,sha256=5UWeP2bsnwCHMMXI3DFRMUPKcnJ9_EL01qPUthbWIvE,1351
293
- helm/benchmark/run_specs/enterprise_run_specs.py,sha256=J6NbdgDO3sFvBf-Zqzug05T_JbFIk1Vx50QmkDG6QHc,8966
294
- helm/benchmark/run_specs/experimental_run_specs.py,sha256=pqK3_yD_2Qw1OWjj0biiV1G2BlZhAVnTPcUEbLnz2Wc,6765
304
+ helm/benchmark/run_specs/enterprise_run_specs.py,sha256=ul2YMPpvThOmi7yIc6xR3W0rtE-8tUIaIzuhGlMg2rY,9598
305
+ helm/benchmark/run_specs/experimental_run_specs.py,sha256=tIgAdK3cm4t6ZBGkcPcPkxx0XAslKShYA1i3QxWVJEY,7675
295
306
  helm/benchmark/run_specs/finance_run_specs.py,sha256=5mwb7GbAcSLVZiumqCiAr9dr8qBYApkEt5Oben5CFXs,4371
296
307
  helm/benchmark/run_specs/heim_run_specs.py,sha256=Pt1eVbzvwZ5EXq8WB2b3XYw62SWYN_i1P_H3oE4i8KY,22096
297
308
  helm/benchmark/run_specs/imdb_ptbr_run_specs.py,sha256=nkW5A_xeD5kCKeJVxsL8RFS8r3UpP_WCcwSdMh2s850,1215
298
309
  helm/benchmark/run_specs/instruction_following_run_specs.py,sha256=GElJhgbQhlZMYSAM4YyGcYq0pqycR32kBCoHqG6m-ZY,4177
299
310
  helm/benchmark/run_specs/lite_run_specs.py,sha256=8OkL9g3wQBG96g0ijGZ9L1Trb59b7VPDyYMqvA3hXfE,11129
300
- helm/benchmark/run_specs/long_context_run_specs.py,sha256=A1ysL7pmyvCmfnokZaBSyWEKfpMh9XCaOejFqABwL38,3033
301
- helm/benchmark/run_specs/medhelm_run_specs.py,sha256=97fWtee-VpnS9ydudc3285kHayK-JYhIM5j6qZX6p8A,40440
311
+ helm/benchmark/run_specs/long_context_run_specs.py,sha256=mxgFgjdHnatOif4-xmTicGmpr4U720mfkhPIigeTrGQ,4773
312
+ helm/benchmark/run_specs/medhelm_run_specs.py,sha256=--KgkjVwKt4uyiTebalrbeGV4FB-jGqPciYjFZED7zA,43407
313
+ helm/benchmark/run_specs/melt_run_specs.py,sha256=729MkALud2wG07yulx9zqAzejdXW_eVGkfF5cQWeGGY,32031
302
314
  helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py,sha256=kenpGGMK1XXaNtvNXsshPvdvN9ubv1sOfaPdjFM4obA,2034
303
315
  helm/benchmark/run_specs/oab_exams_specs.py,sha256=ws7Vppo_zJvxKqQ_sNhm9N7-5eQbX2CBkcDI5c_sRG4,1658
304
316
  helm/benchmark/run_specs/safety_run_specs.py,sha256=3X6tYaq2SlRsZs9q6SCtBUgjNEpOwUtV6M7iY2Kowm0,6807
305
317
  helm/benchmark/run_specs/seahelm_run_specs.py,sha256=R3mg4_OoaRizZ5n0FHcUQpJLny3j-ulBlHzOyF0a0Ok,23904
306
318
  helm/benchmark/run_specs/simple_run_specs.py,sha256=0kK_e8U4JUWZ6wO4N-GPFRE1iGT4ilvSMUGfirvpIE0,3837
319
+ helm/benchmark/run_specs/speech_disorder_audio_run_specs.py,sha256=FvigS7LXxKkg9ipTaIPXDN47qFk__Vrv47hb46_cR3Y,7441
307
320
  helm/benchmark/run_specs/sql_run_specs.py,sha256=JWCICELKi81m11MggyR6CJNl3vpWPwk4kr8DZSsWvj4,1965
308
321
  helm/benchmark/run_specs/tweetsentbr_run_specs.py,sha256=qogc-fb83Rh1DooKKaskhak52ycvu8DAnhabw9rc7yA,1129
309
322
  helm/benchmark/run_specs/unitxt_run_specs.py,sha256=4Vbsq0MPpSe4cIJOXzeVpMm60N9Qafa2R85X5BeFQew,1873
310
- helm/benchmark/run_specs/vlm_run_specs.py,sha256=qjo0YzyIKq8UIbkKIUhHYh7iErPDQSG76_m-5kiPKEc,36648
323
+ helm/benchmark/run_specs/vlm_run_specs.py,sha256=v-eWuDYc8u5HO46isLONPfAWv5zdA1ZOQrdyOvX3vlU,37512
311
324
  helm/benchmark/run_specs/winogrande_afr_run_specs.py,sha256=dhOm8z6Q_ZpnzYKrsS0nEbRQPWs_phkXxmL5pxCJzQA,1853
312
325
  helm/benchmark/scenarios/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
313
- helm/benchmark/scenarios/aci_bench_scenario.py,sha256=u-Vo3J16o1d3GPb3qqclYv4mzSaPOa_RblmZbYV_xik,5345
326
+ helm/benchmark/scenarios/aci_bench_scenario.py,sha256=W8h7eWz9mjR0kRAffKWSnA1Fs8t2l83sPyW8fjPOxWQ,5670
314
327
  helm/benchmark/scenarios/air_bench_scenario.py,sha256=B6_WMowLFe4gWfnoFA_yrHe0kagbIkZabEnK4kGGqSU,1884
315
328
  helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py,sha256=jnUGbppDGEsbe5xoJjmv7nW_RvwPIYm6cwSULeqk2Fk,5133
316
329
  helm/benchmark/scenarios/anthropic_red_team_scenario.py,sha256=_OWE33eVRaZI0gmfP7bLd572uOi_6jb39z_J6nkcvfg,3182
317
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py,sha256=cx5Um-crMG3cNHc8tX51r--K7sVYyM9WbhEqnrb9bag,2503
330
+ helm/benchmark/scenarios/autobencher_capabilities_scenario.py,sha256=fOCHumFWZa4OJZcTZefJiJbdWsb3zjQnWLJYd10Cctw,2496
318
331
  helm/benchmark/scenarios/autobencher_safety_scenario.py,sha256=MFt3f5baN5r-FmzWZfUChGR1mX_PUB_5hxoINac_Whs,1854
319
332
  helm/benchmark/scenarios/babi_qa_scenario.py,sha256=SyM6RP4v08B1PjumkdQnuKrM9L8SyK0bXbx-LgmyTPo,5067
320
333
  helm/benchmark/scenarios/banking77_scenario.py,sha256=dtiM-Q_pMDWqkLi-hgl0tH-aGuDdgHkXgweE1JqrPYs,1883
@@ -328,10 +341,10 @@ helm/benchmark/scenarios/bold_scenario.py,sha256=iE9drB9IeXfRn3xvLnaQi3-nJAp-bV1
328
341
  helm/benchmark/scenarios/boolq_scenario.py,sha256=wPETIu5jcI4jgP5GoFa_xi4SsvHtS9gxQ5TD8neHmdk,8037
329
342
  helm/benchmark/scenarios/call_center_scenario.py,sha256=19J2N57WnUkPMGRRbJyZak8YCeMTRwD3BRK1SArQlL0,3037
330
343
  helm/benchmark/scenarios/casehold_scenario.py,sha256=QSe0D3KQJhlTOo6kM9OHwdKy6NlclsFGRVCAB3mTG7s,3174
331
- helm/benchmark/scenarios/chw_care_plan_scenario.py,sha256=y0B1g8wMyjK7zncJjaHUBSbvIK_4DNiAVE-Xk8KBsP0,3695
344
+ helm/benchmark/scenarios/chw_care_plan_scenario.py,sha256=BbEjDqa4C5wpdil5jIb1nzj16CCZ29hKoZVsfapSfho,4005
332
345
  helm/benchmark/scenarios/ci_mcqa_scenario.py,sha256=slZZT74QI3OMQAgT-ybcR_xVcRDoopXw6mMu4iy3XCY,3074
333
346
  helm/benchmark/scenarios/civil_comments_scenario.py,sha256=pnZU2U_cYFYOJmlmwTehHU5oLIPx_Yg8Ayxinroh4IQ,4875
334
- helm/benchmark/scenarios/clear_scenario.py,sha256=-r4YIQLKgbjT54J96urcxEcQ1bhxjKVtfyajuFOaEoA,5915
347
+ helm/benchmark/scenarios/clear_scenario.py,sha256=yGdPxWO6vY4JHNa4xywtvD-9lOn6s5cr3njpZyFA0D0,6183
335
348
  helm/benchmark/scenarios/cleva_scenario.py,sha256=yPIiToKow76YMc0EDYeqQEPx-9a_6Bm3w4S1IsRRV5E,57987
336
349
  helm/benchmark/scenarios/code_scenario.py,sha256=lSbZWw67ie9osOjXDZukj3EEZGa3L6TrMvTg--IbuxE,12520
337
350
  helm/benchmark/scenarios/code_scenario_apps_pinned_file_order.py,sha256=KC-5MQ-d8Nn46aDN4FaPxmd6yk1DtVUmVR-CIZsNCp4,1738
@@ -352,12 +365,12 @@ helm/benchmark/scenarios/decodingtrust_privacy_scenario.py,sha256=zaXn4sRPUEZiqP
352
365
  helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py,sha256=NjutVTOVVze-IJniRFecz8gqh_BUpuJG3-BUboTGKRw,2933
353
366
  helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py,sha256=EYKoXDWMesbY5dCNY-N0eYMRL0rjEfGsuS_TkeD3Suo,2952
354
367
  helm/benchmark/scenarios/dialogue_scenarios.py,sha256=yXCMZegzlgL0CXTY1W5lXdkFFHicUvq4z7_284MfRpw,5778
355
- helm/benchmark/scenarios/dischargeme_scenario.py,sha256=sTh3bj3dqvh20FCn7bGMycFzH5xphgespVS063XZ_Wg,6759
368
+ helm/benchmark/scenarios/dischargeme_scenario.py,sha256=rBzagg0JVVN3o0VUfmHy2cN7gutV_RAJAo5Fa_El0GY,7842
356
369
  helm/benchmark/scenarios/disinformation_scenario.py,sha256=0T7LhXguzBP645Fruc2udfTaMuy7XGtOEMJKpFMIFRk,8565
357
370
  helm/benchmark/scenarios/dyck_language_scenario.py,sha256=hygFPTcICGUEPwjtxULLKBSbuBOXLYpozIgiGcT__W0,9379
358
371
  helm/benchmark/scenarios/echr_judgment_classification_scenario.py,sha256=IqODoUY1-zJD1KW4Qkg3VwJcUeeLgGUKThr62bW-wx8,4915
359
- helm/benchmark/scenarios/ehr_sql_scenario.py,sha256=ufrY7zmeXlgOxsq1Sr0x0vhR7xbL6FTJJWiM0pzwIpg,5119
360
- helm/benchmark/scenarios/ehrshot_scenario.py,sha256=ROPfWBDOAaHxcnnh5eGkCh-qhwvpxORcGmpA8DrjD0A,68721
372
+ helm/benchmark/scenarios/ehr_sql_scenario.py,sha256=Gm7Kw_TSUUxHW8ns-2e4E_tTBVX7h6Ta273VOpkMCQ8,5480
373
+ helm/benchmark/scenarios/ehrshot_scenario.py,sha256=MWcTejCtwohBPbZYWei_WNZ-Hdnhml7ovTVbJAgUetU,67770
361
374
  helm/benchmark/scenarios/enem_challenge_scenario.py,sha256=sxYXKvf-mGNqctTkemwI9rrA_Rg2xA8mz3_W3TIfzUE,2147
362
375
  helm/benchmark/scenarios/entity_data_imputation_scenario.py,sha256=4V426oOuexGg59q0djHCTQjQmqYgyLT191Z5fayubmU,6681
363
376
  helm/benchmark/scenarios/entity_matching_scenario.py,sha256=kzzDaoVikL2P7Z-17EkLIVR_W7IHcNVerUts2oXDKLA,7111
@@ -368,12 +381,12 @@ helm/benchmark/scenarios/financebench_scenario.py,sha256=cHMljdg0_9HA3FbwcwwMt3D
368
381
  helm/benchmark/scenarios/financial_phrasebank_scenario.py,sha256=dMTfI9MRHKXnECsXOIY8xvX6w5vAPEIa6A7TYyIu2Fw,4457
369
382
  helm/benchmark/scenarios/gold_commodity_news_scenario.py,sha256=-O4ilLwNcycmpQG5h_5WtQP7yJEr4mjWjKBe2eNP0uY,4806
370
383
  helm/benchmark/scenarios/gpqa_scenario.py,sha256=369E0JvaR12EcgcEFKKRcDw1iztt4sb8ghIsk9Brzi4,2884
371
- helm/benchmark/scenarios/grammar.py,sha256=Pb9vEP_0Ki87UdQCj1ym7QWJ24M4DRP6TXB5d3GnhLs,5597
384
+ helm/benchmark/scenarios/grammar.py,sha256=58tQYKPj013V9jIpW7fXUqZBLuboqEi_WLlDjx74spM,5590
372
385
  helm/benchmark/scenarios/grammar_scenario.py,sha256=Hz59gp5ivH3tIP5UAcHZbnk8pBX6GhIABSQlG33gIRI,1502
373
386
  helm/benchmark/scenarios/gsm_scenario.py,sha256=QIj0QK5ncF31ES0GUlxbdBk6SIiJJnj5wzamj0do0tQ,2674
374
387
  helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py,sha256=8_ShEuOoEGu7iRE2b0tgi-cfBrCPF9k1L-Pgb__n3Bg,2005
375
388
  helm/benchmark/scenarios/harm_bench_scenario.py,sha256=CBo_AfbtHTlvJdsiquP0EDTKApVmDZc7EW0VTENNAfQ,2478
376
- helm/benchmark/scenarios/headqa_scenario.py,sha256=TufgA1tjcEyq8vQ6Wk1oYxYXhSm0pjxvG14lL3y8GAI,5417
389
+ helm/benchmark/scenarios/headqa_scenario.py,sha256=m6Kqt16JeqA1-OLJvmBPZzhVOVt7O6rbJGAwG9C7FZs,5658
377
390
  helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py,sha256=iv1khpdiWW0Z7lshyWOhhjRfYFdAU6etN8X5EDEQCrc,1302
378
391
  helm/benchmark/scenarios/ice_scenario.py,sha256=NCbeqvpDFIIG7kSCrJrS-Z9S3iG2THZ7HpAqghpi_y4,16725
379
392
  helm/benchmark/scenarios/ice_scenario_pinned_file_order.py,sha256=fuirubIdi-rkJMfSd7YoDdBX2q0f5K7GGTN4XVapAUY,1613
@@ -381,9 +394,11 @@ helm/benchmark/scenarios/ifeval_scenario.py,sha256=SYn9itpFG0tlWSayf6v0P8bRgdtc-
381
394
  helm/benchmark/scenarios/imdb_ptbr_scenario.py,sha256=laq9UwyvBvZZuo54rf-8SdKTLrMdDHTdGWJ4TdC8Eng,2340
382
395
  helm/benchmark/scenarios/imdb_scenario.py,sha256=qHXd-QIXTCBq8rWW3N5I2Rvg6Pz9v1zFhZkwc73w9io,6259
383
396
  helm/benchmark/scenarios/imdb_scenario_pinned_file_order.py,sha256=fjW0Gkzg2Y3IAbtYJ3KC7MueWd9U8h0tlcBCqxYmRrM,1621
384
- helm/benchmark/scenarios/infinite_bench_sum_scenario.py,sha256=jp5a2p_yqlCQXmhJRsqpTiKN8EGZi8Xyw3h37elb2OI,2785
397
+ helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py,sha256=RxK5T6Nu_KP3rLMaKkJWiI_3Sqpskgqwgn4Zj95lEvI,2854
398
+ helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py,sha256=6z3VlcucrwK2B30artWiSpo-mOTr9tiwYV6Fu8XD0VY,2657
385
399
  helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py,sha256=F-gDO6r4GPBJTLirhF5noRaV0edvoIT7tiIDlovBFfE,2253
386
400
  helm/benchmark/scenarios/koala_scenario.py,sha256=A5M6SD7Jjg7r9QlbHCtMaydBe-wpOtB6oc6gFXuZ47o,1389
401
+ helm/benchmark/scenarios/kpi_edgar_scenario.py,sha256=23rZM3IA-phf2VnuPY9QWd64scE6eaJks49apDUNfic,6355
387
402
  helm/benchmark/scenarios/legal_contract_summarization_scenario.py,sha256=xjw3iKRf8P50Wo58n7ssnFiWHR2QFehzHlZhh9P1XKs,5374
388
403
  helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py,sha256=q_iezJo23_HNNoIXYT4cLYCbwNzLYJx6uvxgPSE5bQA,2804
389
404
  helm/benchmark/scenarios/legal_summarization_scenario.py,sha256=BFK524H7uLfz_ZURuRS7KrhzRCP-WyhIcOgdcBrsldA,8709
@@ -396,59 +411,69 @@ helm/benchmark/scenarios/lm_entry_scenario.py,sha256=kQTnj5gKJmDxCgynmzQOmghwNyS
396
411
  helm/benchmark/scenarios/lsat_qa_scenario.py,sha256=2VUJ36vHUZp6fZuLfRIuPSsU_K6Z3Im2ums06sZENqo,6153
397
412
  helm/benchmark/scenarios/math_scenario.py,sha256=UtNj0UaCxt0RjM-uwD_Evm7SjKnvMlfCt6K0HQOAVC0,14377
398
413
  helm/benchmark/scenarios/me_q_sum_scenario.py,sha256=7DOqQmO70BpDeJy_S4fJ5i2UcCH8tunxzjFgTIim9bQ,4062
399
- helm/benchmark/scenarios/med_dialog_scenario.py,sha256=A-OhCSsbyrkIiyScfGXf5mWJJ9mUXhWQ1S2hHFUYxQk,7254
414
+ helm/benchmark/scenarios/med_dialog_scenario.py,sha256=AE10W1UWhOrgKUnz7e2brKSaQR1WJkQUcPoo4s6n0Fs,7553
400
415
  helm/benchmark/scenarios/med_mcqa_scenario.py,sha256=XEipvuIA-QoyZrtlm8nnaPuyZzdDaeTskAhnseD3Q68,5096
401
416
  helm/benchmark/scenarios/med_paragraph_simplification_scenario.py,sha256=0Z1JrizLygjd9v_LLFMk8uZ805IWjJPvg-ZvPVhtMm4,7652
402
417
  helm/benchmark/scenarios/med_qa_scenario.py,sha256=m0W-FgFi58psLglZyQy_ouMQIDP-2j3aL7uInkdVtms,4478
403
- helm/benchmark/scenarios/medalign_scenario.py,sha256=yNaEyCGdeMMTZmPQcAyQeHFDD3mHZVIrauCC-WEuiZQ,3040
404
- helm/benchmark/scenarios/medalign_scenario_helper.py,sha256=wFR15zRuuvtGc1MXaecndfHH-_uCLF6O_3twpqXZOLo,15292
405
- helm/benchmark/scenarios/medbullets_scenario.py,sha256=mAQ1-jgsbd5hM78C0E5cgFs6fPt2KYErdemZBTQ12iE,6447
406
- helm/benchmark/scenarios/medcalc_bench_scenario.py,sha256=fq1qBjGkL9EA7wQBJrg_V5fIewNgpiQVK70-qShXnaA,5374
407
- helm/benchmark/scenarios/medec_scenario.py,sha256=uS567bCzOQx08euNU6vA01YqxxXadIcSqModkqT22LQ,4969
408
- helm/benchmark/scenarios/medhallu_scenario.py,sha256=Ed2JesQzU41P_rv_9zgBnQCGD-EEkG-EkIBw1qEIXbI,2223
409
- helm/benchmark/scenarios/medi_qa_scenario.py,sha256=JtsRryV88nFy0UlFaUuR2QyEdYkY2vLYMScvKC9ndTQ,3770
410
- helm/benchmark/scenarios/medication_qa_scenario.py,sha256=YJdluvYLb2_m96JkcCOTBIQOPY2h8dfmjTbJ__7Jydk,2266
411
- helm/benchmark/scenarios/mental_health_scenario.py,sha256=nGTXWYWfeO_t1u5va4u-S_OD70qo9IKbCHk5vLEBhT4,4518
412
- helm/benchmark/scenarios/mimic_bhc_scenario.py,sha256=RLUV_ydURfA6kTUJQ48TtaYzIYu8TSY9vl_ahNtSa7c,3777
413
- helm/benchmark/scenarios/mimic_rrs_scenario.py,sha256=4baXBaW_zcZU5RhQM3JsRrzHGhFvLwGiOu0irZShjps,3401
414
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py,sha256=IECc3j1RSIZeO-JqTyxOBFCbc7ovl4XsYMdGL1k6pcw,2355
418
+ helm/benchmark/scenarios/medalign_scenario.py,sha256=mhd8REXpPwxftH48-KKb0ZURJ1mdOlvPRmvN4g4M9Ho,3383
419
+ helm/benchmark/scenarios/medalign_scenario_helper.py,sha256=itxQxG0igEr-8PY3cXmUafM45bqxtov-iHEIy_ZuQYQ,15612
420
+ helm/benchmark/scenarios/medbullets_scenario.py,sha256=8O0UsPWw-ESkrgiuWz4f8gR99jH5-wS5HtCKYwZ1ycs,6713
421
+ helm/benchmark/scenarios/medcalc_bench_scenario.py,sha256=vwmEQZ119tOVeZtl6Zt-nXKwkA8Qt4WRiH2HogIkV0w,5560
422
+ helm/benchmark/scenarios/medec_scenario.py,sha256=Lo7iVkek7C9omJ5LX-C83pA_Q5OrAfdNhJY4rslJWTQ,5270
423
+ helm/benchmark/scenarios/medhallu_scenario.py,sha256=d4HlEi1cQtvh1a39jvIHezDDmjuIEsSPdqDLLkDTzw4,2544
424
+ helm/benchmark/scenarios/medi_qa_scenario.py,sha256=FmXI3UwfbL8zinFPtSyTyw4X5VIe2d32HAg93vbXR94,4118
425
+ helm/benchmark/scenarios/medication_qa_scenario.py,sha256=StQmfHTYi8pZLP9FMPzyS-VB9gilZS0XBme7MzAL2QA,2583
426
+ helm/benchmark/scenarios/melt_ir_scenario.py,sha256=d88DEGKVJZCeGnbrXrQZO_W4VJeqW8XNaYc8wIUiJtA,5978
427
+ helm/benchmark/scenarios/melt_knowledge_scenario.py,sha256=FDG4OGYEV6Ac40VC7KAeikzbFKAK2XXFhH1-QUTw8jo,7923
428
+ helm/benchmark/scenarios/melt_lm_scenarios.py,sha256=kSm0lRRixhnXctMprPnzi09PLOmgfs-C7TAW3QI8RmE,8969
429
+ helm/benchmark/scenarios/melt_scenarios.py,sha256=Zg_Uyq-e9Y-Er4IpWU1o29YC07Q9rOxxhokPyKq57Ik,30140
430
+ helm/benchmark/scenarios/melt_srn_scenario.py,sha256=EQSOZIXbfvVWCJMJ4H2e_CiBz6wc8THJndnbK2WwTHM,14674
431
+ helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py,sha256=ptMQWgNn6R-XpAVAAjutSdZg_9ZUqG6fVotzAgeead4,7945
432
+ helm/benchmark/scenarios/melt_translation_scenario.py,sha256=j9YrY60DQHZz4m1MJZaGLzyI6FERlHRx2wy9auyAVB8,5415
433
+ helm/benchmark/scenarios/mental_health_scenario.py,sha256=O1Lfd0MxqawLZLKUDSynaqqbaGHRjDglmePIqepnJI4,4961
434
+ helm/benchmark/scenarios/mimic_bhc_scenario.py,sha256=PGa0Nvbad_wH3qRSMPHgg9CgicOi7n25qLDnEucXapo,4097
435
+ helm/benchmark/scenarios/mimic_rrs_scenario.py,sha256=bxwVWjE_z4I_Nk5eD78g3QAGyjpsNg7DVWpkp8IGWXM,3841
436
+ helm/benchmark/scenarios/mimiciv_billing_code_scenario.py,sha256=tZBUZEaUMZvfSlsU6hcPs-pxQ0kDIL6qebGd7JmpDbk,2699
415
437
  helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py,sha256=-OkPMRyB7aO6QBFwoTl6a2rpzcoHeEl84tqz7k9kpCM,2982
416
- helm/benchmark/scenarios/mmlu_pro_scenario.py,sha256=zUY0Nb8PzwxvohS1C2Me9utRfFM-8OLr0CmUfyjiVgc,4013
438
+ helm/benchmark/scenarios/mmlu_pro_scenario.py,sha256=pwpp0wqNhsGc8v2V11aUyEWbwdkmIm-42N676j1T3Ws,4031
417
439
  helm/benchmark/scenarios/mmlu_scenario.py,sha256=_5cX2uI7CxD7K_GvO3MD8CRJLuN4EzS2o_EFvbrfjSU,3855
418
440
  helm/benchmark/scenarios/msmarco_scenario.py,sha256=-l7_rIMQjMWcpTyn6dGotmNJ5XxN_Ze8dEJyv5ftWFA,34050
419
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py,sha256=ftRkhL8oLZgsDvLzI8ya5jv9xv77YcDT9TU9JZBss8o,5333
420
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py,sha256=eB6PT0wwH521r6uXvoiQEo7fZQQcKATuElHuPmyVyW0,5301
421
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py,sha256=DAN3xeupuZEzxubf60C938Hr3WjhkzgaI1MbNwgPu8I,13194
441
+ helm/benchmark/scenarios/mtsamples_procedures_scenario.py,sha256=13pXjs9lFduM-QL03mpM10hU0iA8Vr2jJG2FVBQdKOI,5577
442
+ helm/benchmark/scenarios/mtsamples_replicate_scenario.py,sha256=RlyWrlI9e5MLsGbkQWpO2WRsIOZJi39xHskOIBypHdo,5399
443
+ helm/benchmark/scenarios/n2c2_ct_matching_scenario.py,sha256=-Et7hJnQJOGl1U9Xdb5mLckYTpU_Ve1sCe450M-5haw,13513
422
444
  helm/benchmark/scenarios/narrativeqa_scenario.py,sha256=MiSq0UnUllJxHFU2gO7m4vr_vmulavJxc4ruZhsAt2U,5632
423
445
  helm/benchmark/scenarios/natural_qa_scenario.py,sha256=g-fP8L1lXs7zwNVQOc0ZUnbYkCyElQtLVt5fe5dtvSE,12564
424
446
  helm/benchmark/scenarios/newsqa_scenario.py,sha256=G25VYaLrV_JyyoT0jpzJ6p4l5qsOydm8rlzTvSptNKQ,7284
425
- helm/benchmark/scenarios/numeracy_scenario.py,sha256=E1WkVgqPiZwaKuskD5iVwoypbG3DKI_r3bPXPqZ_SSk,30885
447
+ helm/benchmark/scenarios/numeracy_scenario.py,sha256=lgTGzZc81RyL8iB4K67PAHbyYz6BM2ieub8RSFi2aRc,30895
426
448
  helm/benchmark/scenarios/oab_exams_scenario.py,sha256=vbjUzQP0zU4ckvMbsk4lh24NddVWbUAtfWmsq1h24_w,2101
427
449
  helm/benchmark/scenarios/omni_math_scenario.py,sha256=5qb2cO-Ibb3kDbwYvkzsoU_aOsoKV3ROLgZbi83OyGU,1955
428
450
  helm/benchmark/scenarios/open_assistant_scenario.py,sha256=zd8T6eLOlYMZiFyKrRjc-EPwk5_KpbBedAcKDbZ-TdI,5609
451
+ helm/benchmark/scenarios/openai_mrcr_scenario.py,sha256=XbO8Wpjjq2e8OsC2s_ZScV4TcZg3hlpVGy56hgxXY9w,3253
429
452
  helm/benchmark/scenarios/opinions_qa_scenario.py,sha256=JK39tq306tKe0RDBDLz1AfAdZwNjK_Ng-rHvu6bTRY4,7395
430
- helm/benchmark/scenarios/pubmed_qa_scenario.py,sha256=AmgdTGffaxNOJ_xDqA1ju5jXjlvEVdx3Gz7Cp7mqsd4,7789
453
+ helm/benchmark/scenarios/pubmed_qa_scenario.py,sha256=73D9D_q4Zw904qfd3tVPPhHxpGN4IZcWjlA6ZHEfp2s,8070
431
454
  helm/benchmark/scenarios/quac_scenario.py,sha256=RpJpOPbvhB0jv3R91Odc20LcNyZsny9J4IF24GNEygQ,6689
432
- helm/benchmark/scenarios/race_based_med_scenario.py,sha256=FQl99ttwk-SQdix9UpOCG1nI92JHuemLRgWjFTHGiTA,5295
455
+ helm/benchmark/scenarios/race_based_med_scenario.py,sha256=vZB43jtM47PWrl9L4HYOf1i7orpscKcHX01m0oVmk2g,5778
433
456
  helm/benchmark/scenarios/raft_scenario.py,sha256=Yk56dUMqDGXpp6SxoGWhyxa4lAIniSQfivjkoPqMuFA,4644
434
457
  helm/benchmark/scenarios/real_toxicity_prompts_scenario.py,sha256=zpQthgDi-AyEgOUFO5F0qaWCctLEI5WGHBEGlPEVpqc,2424
435
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py,sha256=bonCyKO9hCSce4EQCTwYAv0pgRlBYwlXAmEHl6_ljHA,6323
436
- helm/benchmark/scenarios/ruler_qa_scenarios.py,sha256=bhm8IINDa1asCFHTLkB0BztzPvww0Uy0Lv6_G9By77M,2876
458
+ helm/benchmark/scenarios/ruler_qa_scenario_helper.py,sha256=jgVf1D4eTSxwxQsW0GBou5hfSo2dnlEJvHpVJqk3BxM,6327
459
+ helm/benchmark/scenarios/ruler_qa_scenarios.py,sha256=Dy0INRMzxSiIs9Pm3fa0hYodN-W--WPSv4kcmeQhucM,3270
437
460
  helm/benchmark/scenarios/scenario.py,sha256=kSy7tmtFeC6-QSEsBuvlrMTA1PB6fOY9jycMld-vBVM,8592
438
461
  helm/benchmark/scenarios/seahelm_scenario.py,sha256=GA46ShNGUjVdMLK0ZbN4vPuGEWFQsDPJXEGHQbs1qf8,78150
439
462
  helm/benchmark/scenarios/self_instruct_scenario.py,sha256=3Kvi3pLL6eGOEezjoQoGv9c1UxKiRVlFmILKzqst4pI,2309
440
- helm/benchmark/scenarios/shc_bmt_scenario.py,sha256=bbl3XczUrPW_mzHPtNxSC2SHRKBzgZP7RueIi8vc5y0,2362
441
- helm/benchmark/scenarios/shc_cdi_scenario.py,sha256=AJlKAiSoUJQ4N4WeeBjqOfYNsl2teq8G8B_8pOZOW8E,2422
442
- helm/benchmark/scenarios/shc_conf_scenario.py,sha256=AQUukzN3lT5dSQWyc6t6ZSZTEIEvOj-TC7K8BA1Q5fs,2535
443
- helm/benchmark/scenarios/shc_ent_scenario.py,sha256=urspIw8idVLiuK__cG3IvnYnky4AZWiTWzK2nzjwAVg,2530
444
- helm/benchmark/scenarios/shc_gip_scenario.py,sha256=ectxVRWal0LnqFlBsRGhtJgWN5RZls7tHAVDeMzcW4w,2337
445
- helm/benchmark/scenarios/shc_ptbm_scenario.py,sha256=bAHIu7YKwUhwvGJuS5hplo7JedwLFGxyLub_ALLZo98,3077
446
- helm/benchmark/scenarios/shc_sei_scenario.py,sha256=udGZAIQ4Fpi_bV6WsuuQyIpSY74qc8VIc1MP9yRFIRs,4213
447
- helm/benchmark/scenarios/shc_sequoia_scenario.py,sha256=Z_7LM-RHuwWGdwFwCAgAQvIz4dfXNE0uHhLM1_9m7n8,2410
463
+ helm/benchmark/scenarios/shc_bmt_scenario.py,sha256=wF_sD61IZ4RDznBVQ1HYbGh3Vc2qjbcBuU0jdmp1aD8,2803
464
+ helm/benchmark/scenarios/shc_cdi_scenario.py,sha256=5aVEiRgFCutEWW9yMcJBxEo11FlwW0SiZTaOyXY6ioc,2693
465
+ helm/benchmark/scenarios/shc_conf_scenario.py,sha256=3LDB2pT6yi-ubSooGAD_0Ao7sYLo_MMAHNfm5Ux9Yvk,2889
466
+ helm/benchmark/scenarios/shc_ent_scenario.py,sha256=PS_O_keZ5s5_nSKxAC1k_WV2W8umEbyyKmlFtxvaReI,2855
467
+ helm/benchmark/scenarios/shc_gip_scenario.py,sha256=cxMpMmS05QpZ4xW2eogPH1hcDv6GzA6UQoAi9OSFO_Q,2702
468
+ helm/benchmark/scenarios/shc_privacy_scenario.py,sha256=dbQI_pDqXepV6EyxMUNumIpyQ8oDwnu37qyQ29rxZfY,2998
469
+ helm/benchmark/scenarios/shc_proxy_scenario.py,sha256=edepzg5qrN_GKa7u1W0RRhkpmfUi2vFHCvI1ma205WQ,2908
470
+ helm/benchmark/scenarios/shc_ptbm_scenario.py,sha256=QOQdz21s_YaRyGz-ciCPHH-fCy6hiGIrHUZz0SWPm5o,3391
471
+ helm/benchmark/scenarios/shc_sei_scenario.py,sha256=pTcb7n97VkesyRuqUqe5JGed1jDsQEd19udciDras8E,4532
472
+ helm/benchmark/scenarios/shc_sequoia_scenario.py,sha256=vjDyRZXP9UjkQzmA6u7SmKtMBuUwwn6KRQ4rT3vZqqc,2796
448
473
  helm/benchmark/scenarios/simple_safety_tests_scenario.py,sha256=sjIHT5NZlHv_IcXr_15-pOiBUPKKwykyH-QpMfvrHAY,1247
449
474
  helm/benchmark/scenarios/simple_scenarios.py,sha256=ersSzp9bFEFfpJ-SNy368AuonwswLnuyA1n7FOgkw4U,6459
450
475
  helm/benchmark/scenarios/spider_scenario.py,sha256=mhiV3XWGwpnIQkaHFM_rvZlrwE7nqS12-F9t1eB8kdI,3306
451
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py,sha256=X7AY99m8Lv8bVpOg-Bzld0vyFzpvi6fOAqE3rstRfpw,3843
476
+ helm/benchmark/scenarios/starr_patient_instructions_scenario.py,sha256=zdokiMy2Lrg5mS3V2QEakcZyJxIkqcoT5CqVCAtyoKU,4146
452
477
  helm/benchmark/scenarios/summarization_scenario.py,sha256=WZnqhMQED6UBmRjHSboygdenLecOqIhvgdYVXzy6Q-I,6912
453
478
  helm/benchmark/scenarios/sumosum_scenario.py,sha256=HG3wrKj5alV0a2aKb_nau8bB4oKDtTOLtdf3bx8h7sw,7695
454
479
  helm/benchmark/scenarios/synthetic_efficiency_scenario.py,sha256=OaxEvT1H9VjOjBSw_yKs3dcYt33vFE_UARr-UIP9pBY,3120
@@ -467,7 +492,8 @@ helm/benchmark/scenarios/test_grammar.py,sha256=sPlA36sHpThbXgnGlXyOuqHfDPe2epIa
467
492
  helm/benchmark/scenarios/test_gsm_scenario.py,sha256=I-Sl8Sg8kmFd7u0zZbwbNmeFV1mQLuOHoQ1cQDDwovs,1123
468
493
  helm/benchmark/scenarios/test_ifeval_scenario.py,sha256=h3CBg13VKwyb1Xaddwg2GWOzAXz4stK5lXdQtHenAw0,1646
469
494
  helm/benchmark/scenarios/test_imdb_ptbr_scenario.py,sha256=8kfCkMRUMU7N4WIrWawFDoxaLB2iTvQ-sPj4RoE2Osg,887
470
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py,sha256=Jy8HDZPtYS48-bBFIStKaQtxvQv_GcAwh42wCYku0vw,1969
495
+ helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py,sha256=qZE-fi1tdNOybpvEQZJUpq9fHsyrPW7NYqj_RTwsv2A,746
496
+ helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py,sha256=t7BJ7ouT83oNtMFFoBvdyQRu2vWW15I1HUdtmzzQKLI,1221
471
497
  helm/benchmark/scenarios/test_legalbench_scenario.py,sha256=FqbgwBAhHWyTIUYSzI5FOnTDx0A3u1o2ANKa_6bfA4g,1212
472
498
  helm/benchmark/scenarios/test_math_scenario.py,sha256=8Raix_ykxUENh7UREw1RhpM287oav1p59P1Dn2gXktI,829
473
499
  helm/benchmark/scenarios/test_med_qa_scenario.py,sha256=Ekp6r5eYPkCxV3FCzVvLemKxlhENhelqdO0Mdhg5yFo,1515
@@ -484,7 +510,7 @@ helm/benchmark/scenarios/test_wildbench_scenario.py,sha256=pmQ87MNoGAXwAmPf0eoep
484
510
  helm/benchmark/scenarios/test_winogrande_afr_scenario.py,sha256=LZfE4J42BZ7OF3BvfKgMWuCHpdw4-LpWnFiKyrHGXp8,910
485
511
  helm/benchmark/scenarios/thai_exam_scenario.py,sha256=YjFsom1yiu-xBZ3SGenNuczVCwQcmyoITTMavGv-QEk,6069
486
512
  helm/benchmark/scenarios/the_pile_scenario.py,sha256=X3GWABiJ5cSoZzeNpgNUVAz7_A9SyM5MhgpJseKpZow,5019
487
- helm/benchmark/scenarios/truthful_qa_scenario.py,sha256=p3KAHk3C01TW7ya_XZIuK3JMJE51uoMpOnARV2UKgJM,6096
513
+ helm/benchmark/scenarios/truthful_qa_scenario.py,sha256=kUQ-Bpu1N1s525EP3pa7v3sp9Wybl0RuJv2pVu0pAGQ,6155
488
514
  helm/benchmark/scenarios/tweetsentbr_scenario.py,sha256=ppugbPWd_3hHesLC52QbC-wUknctr9ZX4tmHefnPf6w,2879
489
515
  helm/benchmark/scenarios/twitter_aae_scenario.py,sha256=wnP-zH38J62zmbdeOLzdU-E3iclbQPApgEk4AGyhdoo,2120
490
516
  helm/benchmark/scenarios/unitxt_scenario.py,sha256=uL8Gni-Uw_eIp9xKQefp4J7XtKSttjJHzJE4USyoC2U,1930
@@ -497,14 +523,15 @@ helm/benchmark/scenarios/winogrande_afr_scenario.py,sha256=3SOVyrQ8D7Wzz06uSbczD
497
523
  helm/benchmark/scenarios/wmt_14_scenario.py,sha256=1YYjz4x2RbYfJAXBTux9X30dxYTSC-YNngCCLhEiNfI,4646
498
524
  helm/benchmark/scenarios/xstest_scenario.py,sha256=ndRNB5ApW4th5iltlmT9-Nfw9eTaVZQw5AMC4HZCI-k,1309
499
525
  helm/benchmark/scenarios/audio_language/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
500
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py,sha256=wufgORN0vPuTUnp7-VeCUoLH03C5MzSa-PMYku8D0P8,5626
501
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py,sha256=8piJKvbTeKdxLTWnazp0_ydC6ESRHeb-Pj2ri-86U28,6619
526
+ helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py,sha256=NtTEHzmbeCicbjTRxPBUueZrBGOPwF6RVc2Yftc-VKs,5634
527
+ helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py,sha256=IJlM1I0MxtBX5bhvwPPrsBfUwEm_ZqqVmPze8UH_tl4,6622
502
528
  helm/benchmark/scenarios/audio_language/ami_scenario.py,sha256=SH4r2YyW2kQ8r6-nSRI_F4unJC-l-lzikr2O7hMKgEM,4371
503
529
  helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py,sha256=kiUngeoAVOXfuKgqo96RgK_volpJUPFziu-cYDqT8WM,2685
504
530
  helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py,sha256=oLOeBGjQCa3hpzjhX2bNS6637VD9VF1KbSJri9BJ3PI,2698
505
531
  helm/benchmark/scenarios/audio_language/audiocaps_scenario.py,sha256=PkVqQM1zX6ecXYk-Pz4YWlST3Hnla8NyeBHbuHvhSlY,2447
506
532
  helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py,sha256=uoiB3mnkudRH_rY1qeZRgobYYZ0xDn93F1Mn6Avl24Q,6724
507
533
  helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py,sha256=CbcoGPW65xXRRkrDthotDfoVn51ozANG9s3LCsjxkLA,3706
534
+ helm/benchmark/scenarios/audio_language/corebench_scenario.py,sha256=R8RAUtdRAQcUAN0PFXybQUekdQFNtT8hXtoR1A1hMGk,3155
508
535
  helm/benchmark/scenarios/audio_language/covost2_scenario.py,sha256=3YiaQXuLGfths2XswRw30Vf26bO9jEW_kAj5wZQSOSI,5119
509
536
  helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py,sha256=OKawk6Mq6ONOxcttkk-qodeFkNet7nvP0UbeEu5EgJw,3079
510
537
  helm/benchmark/scenarios/audio_language/fleurs_scenario.py,sha256=k8AFujDJYtH37Zaquy4TH8xYcxE62cvOK6DVDfp1TKA,9235
@@ -517,9 +544,13 @@ helm/benchmark/scenarios/audio_language/mustard_scenario.py,sha256=9bpcvFtWq5Pd9
517
544
  helm/benchmark/scenarios/audio_language/mutox_scenario.py,sha256=bDCQbhsRDR6iQGNlCu_35kjmjGjuzjOIoraSncfOlOY,10277
518
545
  helm/benchmark/scenarios/audio_language/parade_scenario.py,sha256=UuOa5cSrHh5n3VF_SuJp4cy1MxlI3uEKHLrNEhGuyuw,4186
519
546
  helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py,sha256=oN4vBkElVzjccaEK2JFqoXMCGFTTHD0gcYwSDhvHTpQ,5438
520
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py,sha256=Ar7IgtfZXFpsHJ76QacEB1KKwXVrOBE0BcSBO_GN2T4,2718
547
+ helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py,sha256=TfMTdQ_D4foKO4NRPXygDgdF0ST2LYiOcV3gXO3WEYE,3691
548
+ helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py,sha256=OUPFMOpRCTLN0o_lo7JJ7oOHxp9VuwC0fz4abWVS7hA,4713
549
+ helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py,sha256=7-M5HXNE-YDM44f6LO4aYKBeFQxa3PfvN7q4u4BBYxU,4089
550
+ helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py,sha256=c36E2RkeSDumLZgN6dBGzGz1ltgPdcBSqx8XD0qNH-U,5078
551
+ helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py,sha256=wkKyTCtx4isQSMufap_6DsNdGkHi7L8FQ2p7n58kKYI,3124
521
552
  helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py,sha256=4M_gTWs4CoJ1Ce9dDFBTAe9dzSovpsve_sN1eco2V2A,3155
522
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py,sha256=oaUeTgmr2AkSvEJYua4SItCbXsiK6cSSrIjlqsSQC7g,4431
553
+ helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py,sha256=L04ee5bM5E0UNNmkwEzVwug4HJXQoIcVjujPgxtU2h0,4366
523
554
  helm/benchmark/scenarios/image_generation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
524
555
  helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py,sha256=c8zcoGCOFqBGE4TAEx1uLsUmGXw_jIS8alI99ubGeDA,5477
525
556
  helm/benchmark/scenarios/image_generation/cub200_scenario.py,sha256=7p3G4mJRc8QHR4Mw2GLsfAFuJcEe6OeZbezVhbyc55E,4103
@@ -558,6 +589,7 @@ helm/benchmark/scenarios/vision_language/mme_scenario.py,sha256=7Aa3y0TWGZH3QrPD
558
589
  helm/benchmark/scenarios/vision_language/mmmu_scenario.py,sha256=deDMdg2-ORZPV623ngncDPlRn6z6cq_QbQtMu-z0Ydo,7665
559
590
  helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py,sha256=HUO09uM2rBXOfCsxzwovmwtihq53xjuzDOtQO_S3J4I,4161
560
591
  helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py,sha256=c7YfclYMDtygsLnEfA8oP6Vl7evdrqqTZazmuD9Oy-8,5353
592
+ helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py,sha256=qWz71kAlH4TxFSTBgAmZ7DLMVA8ir4X7jXnS4cArpZo,3024
561
593
  helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py,sha256=HuizbYsN5Nlihfzu4bfGuC8KSBbeIc6TVknMS4kpVJY,7149
562
594
  helm/benchmark/scenarios/vision_language/originality_scenario.py,sha256=1inr-klQEz08CM2GWqbYdy-AuXQmMhOAywAlA0lJHik,1029
563
595
  helm/benchmark/scenarios/vision_language/pairs_scenario.py,sha256=D3nNu3uU87eMDiMZZafuRTntXjwbqPaSDygUgQm45F8,9943
@@ -582,7 +614,7 @@ helm/benchmark/scenarios/vision_language/image2struct/webpage/jekyll_server.py,s
582
614
  helm/benchmark/scenarios/vision_language/image2struct/webpage/utils.py,sha256=UYe3PnxCKBYEbZTTEzdIoTY9gW7ZZAWmVISRIdItD-A,940
583
615
  helm/benchmark/static/contamination.yaml,sha256=rAfh1DqwyUcDtyzHPQ2QiUK5eY7QfuuRtBXpZMn4TeA,3171
584
616
  helm/benchmark/static/schema_air_bench.yaml,sha256=LapSMj3Ecl1Gp9XIwVCYfrerqS93GNErvp6oDnBCtgw,142378
585
- helm/benchmark/static/schema_audio.yaml,sha256=RD8XMxFlHJ3oM578SGZln_DTjOh72EQqQz_lexSTJHE,28952
617
+ helm/benchmark/static/schema_audio.yaml,sha256=lVslZX7JmFo0ZgLU4n6amrs9DK8y43Ux0I9QyDUG-14,29119
586
618
  helm/benchmark/static/schema_autobencher.yaml,sha256=yb-NkF5w5R2YOg7RIsadNHJ_5G7lG1gbcDVq_25luEk,5716
587
619
  helm/benchmark/static/schema_call_center.yaml,sha256=i30aFzWqdOJRyAHN8vAzyHEX1v95DEK0TI1SMKTN4TE,9106
588
620
  helm/benchmark/static/schema_capabilities.yaml,sha256=HHy0aafhOaqL0C4TZw2mMt1Dce2_wuN062ORNZIbwYg,8733
@@ -591,7 +623,7 @@ helm/benchmark/static/schema_cleva.yaml,sha256=TDh-zcCzzTTs7bu0IWlY5dXYaTFhxly8s
591
623
  helm/benchmark/static/schema_czech_bank.yaml,sha256=jkTRQVmmbKkbB0zPH9AtYh6Lt33ymMInRBQnHE5lIOo,5462
592
624
  helm/benchmark/static/schema_decodingtrust.yaml,sha256=2VPxzcyKYea7mx-qmswyVRjPfVatjVH4Rs3OU82mgII,15670
593
625
  helm/benchmark/static/schema_enem_challenge.yaml,sha256=ZDcOfonL0z-ehsW5OkwaQOeiG1jLPk_toN8s2jhVIdM,5540
594
- helm/benchmark/static/schema_enterprise.yaml,sha256=TRYP0uNKi_Ln7kKIRYMqbOnGBlf7hF7aiE4dn8OVu1w,11040
626
+ helm/benchmark/static/schema_enterprise.yaml,sha256=W6eP79bBhKsvsxD8ve-lC-ELDtPXyGmRJ2Z35uK9pLo,11969
595
627
  helm/benchmark/static/schema_ewok.yaml,sha256=MluPnZSy22wZLFB2pR7ycBRgUSvIUsqvq4qM0Vk2ur4,12113
596
628
  helm/benchmark/static/schema_finance.yaml,sha256=I5-rcZmYpfwS9jVsZM53h6Iv6Um33IhQqt-LUrc4_GU,7165
597
629
  helm/benchmark/static/schema_heim.yaml,sha256=EK5F51C6vDZtbVFKqo5GDIi4tG-sfdVm3XcYpfthqNA,44396
@@ -599,42 +631,46 @@ helm/benchmark/static/schema_image2struct.yaml,sha256=cD1X99YcPI8BMAnNfDmXlM-FN0
599
631
  helm/benchmark/static/schema_instruction_following.yaml,sha256=mYLpMv-iNtsmrv9ewfN9ceDOBBg8nSxOWfc6ByATmIk,6056
600
632
  helm/benchmark/static/schema_legal.yaml,sha256=RpoFOuVSIowNgxlPn3UMfJC-68RFr3CGDciUGLPfVqc,28806
601
633
  helm/benchmark/static/schema_lite.yaml,sha256=rFSoG7zGPNOtKkJyGgOViWf5WJbMiJMAXrgmqCAi9X4,36611
602
- helm/benchmark/static/schema_long_context.yaml,sha256=sTz1CWqsXRnR5yAluWzJZJJmfnesV8MHG03wD07LiMo,8430
603
- helm/benchmark/static/schema_medhelm.yaml,sha256=l31CYuop0hkPaSmwYMMq4DxSCyeB6LRnEAPQZAQF2gE,42013
634
+ helm/benchmark/static/schema_long_context.yaml,sha256=0xcyw8WI4SiLM1QPnjhTM-1SMGIyA5IDwWKpJzfQt9g,10795
635
+ helm/benchmark/static/schema_medhelm.yaml,sha256=84BrIengbq0m42ICWvyEWoYtdERR-8J8-8QbPOqUzvA,50747
636
+ helm/benchmark/static/schema_melt.yaml,sha256=mmPqwDa26DVZXsRJkmKQSyD0OStvjlxaMoSPM25SpD4,47494
604
637
  helm/benchmark/static/schema_mmlu.yaml,sha256=KI3XnzEwBRpzfYGjP77yKL-hBklEg72D3vL0kVl1BeI,49666
605
638
  helm/benchmark/static/schema_mmlu_winogrande_afr.yaml,sha256=YIVYf-mOFPq82UVBdMhnCWNOr4sV8Oi3-ozOszJ2tWQ,40143
606
639
  helm/benchmark/static/schema_safety.yaml,sha256=7RfZDX4wr8Xr1BJ149ZwmplPzPkNL0-BKbEZuzUsl_0,9278
607
640
  helm/benchmark/static/schema_seahelm.yaml,sha256=9XF9Rlr7I-g-uW6R0LNh7Xg52Xs3_058QybXEiN-hnM,28296
641
+ helm/benchmark/static/schema_slphelm.yaml,sha256=3avOfp-ZEmVRGei3_M_WX6cSP5hQjbfHsDr1XrjayMY,5294
608
642
  helm/benchmark/static/schema_social_audio.yaml,sha256=Nj3ORXDT4RHD52cyo1RHfueWwbhqp1qW06TaVJ2lUfE,8653
609
643
  helm/benchmark/static/schema_sql.yaml,sha256=8rRff6p_i1CsH7oDbUjau2qRWbLGspuM1Hy-g5pOQiU,6047
610
644
  helm/benchmark/static/schema_thai.yaml,sha256=yJUrevvgTJ46TpyXfNecW_B9urh7LPwSbBi_mT4ZngA,8348
611
645
  helm/benchmark/static/schema_torr.yaml,sha256=9R6HgT9ZuCnbMdhYB-pFect9apwEVuLEr3R1fx-Txd0,14583
612
646
  helm/benchmark/static/schema_tweetsentbr.yaml,sha256=DwHE5Y2STJPDT0fFNm-GPFXq_n3DStQ1ubzhSu4xsoI,5453
613
647
  helm/benchmark/static/schema_unitxt.yaml,sha256=9FQhoueYNNYQ2xMuJ2KHzpg_9-_ZhZ9efk6jtTQ3tlc,11855
614
- helm/benchmark/static/schema_vhelm.yaml,sha256=_Yr04KPL8T2ZqOcQiXnUDOqxcuMn1bjZGCeOFSjbbEM,33974
648
+ helm/benchmark/static/schema_vhelm.yaml,sha256=0slYep2eepUefgtK_m4iSS785sHdJzljmO-kwDRriK0,34262
615
649
  helm/benchmark/static/schema_vhelm_lite.yaml,sha256=4I68Em9q5wW8sFzj5GCJz8m49fBEuMyVmSZM0-wbfOk,4024
650
+ helm/benchmark/static/schema_video.yaml,sha256=FkpI5Slc4w-ty4hns82ArXIvTdqppWDnkJSpIp74QN4,9713
616
651
  helm/benchmark/static_build/config.js,sha256=o98g6QSly1NAfqhYWbU4lEoZB4LEpIrePZtmimiuoXc,165
617
- helm/benchmark/static_build/index.html,sha256=_t225NmMVglYdTTKPzwQ7Ab-cq_4g4oJgYbfkk3F2Dg,1149
652
+ helm/benchmark/static_build/index.html,sha256=kpJ5Riw0YUmOOo2lSyWPgWx5XOwxxiLvPmG3wHwn2tM,1178
618
653
  helm/benchmark/static_build/assets/air-overview-d2e6c49f.png,sha256=0ubEn4J0T51-jx7IlwjaEGSrofZWlW_e67MJw47Ujzg,733055
619
654
  helm/benchmark/static_build/assets/crfm-logo-74391ab8.png,sha256=dDkauL_wJR_Luu7L7pltphS3a9HSLjDkpVLa6C9vcA4,62712
620
655
  helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png,sha256=Pl46pKbC_TU3L6kZQ_3G-0wTseluAhIYwb3EqpdQAjQ,1344452
621
656
  helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png,sha256=LtVAC4OgcWgMAob53rTrf7cRDu-O0z85ZOGGj9wR9hw,86133
622
657
  helm/benchmark/static_build/assets/helm-safety-2907a7b6.png,sha256=KQentq_1e3uGwiWMViAPxHu2XZ60gqFgovP3UWTyMmw,72312
623
658
  helm/benchmark/static_build/assets/helmhero-28e90f4d.png,sha256=KOkPTf-q28PdvGOBp1G5O4q1eWUJjuij3z2h_SUUf8s,55314
624
- helm/benchmark/static_build/assets/index-262903c1.js,sha256=mnPpe8lHsSWqq9GivvCxMwPVlZusXFP3AQChU3-bDAs,95853
625
- helm/benchmark/static_build/assets/index-42060d71.css,sha256=QgYNcW4kJWHl4GN2T1ep6DTI9tgFbZoj3MXmwTg3sfM,489884
626
- helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png,sha256=Pd_NZfAf1ZeU2BIGx9zNT6WmypZNP2bk5z5AxDkbwoU,270625
659
+ helm/benchmark/static_build/assets/index-94295e78.js,sha256=yvo6hRwNE6Ns7NxJHOdVfUOhc8HsW8eZVadLMW0Wn0w,124386
660
+ helm/benchmark/static_build/assets/index-b9779128.css,sha256=uXeRKCUzQAC32ofNoaK3-WC7kRWR--KnR6--1m9NdQA,491471
661
+ helm/benchmark/static_build/assets/medhelm-overview-eac29843.png,sha256=6sKYQ79cN07-cUsnt-JPsdoVwUBWu5KxOaHWSdwjdgA,284408
662
+ helm/benchmark/static_build/assets/medhelm-v1-overview-3ddfcd65.png,sha256=Pd_NZfAf1ZeU2BIGx9zNT6WmypZNP2bk5z5AxDkbwoU,270625
627
663
  helm/benchmark/static_build/assets/overview-74aea3d8.png,sha256=dK6j2Nn3j9O-FMUIVRT5HGBpR_GL78vrKi8oHdG1eaI,74685
628
664
  helm/benchmark/static_build/assets/process-flow-bd2eba96.png,sha256=vS66lq700aPEKTJR7maMrmepAyBZySaL42tBNCRjFWA,190822
629
665
  helm/benchmark/static_build/assets/react-f82877fd.js,sha256=ijg4n6eANaZKXPWIVTQITqrtf-zzicjslJMm6DniDkA,275149
630
666
  helm/benchmark/static_build/assets/recharts-4037aff0.js,sha256=SP08CFvsw8cMMMMdqcXvsLviuOxkAhXGwvUIMvYUdxk,432466
631
- helm/benchmark/static_build/assets/tremor-9cefc3c5.js,sha256=5iR--BuAQHnEFO_jWnh-3hG34ezpt9LRJkTZNHc__pM,293015
667
+ helm/benchmark/static_build/assets/tremor-38a10867.js,sha256=prOrg5S4EeKHSd6RkgnBIbVfXIUq3xjeVE0MRdqvenI,293019
632
668
  helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png,sha256=FDfWcwGcJhJco4qmZli_ROomLiASrrnsX-wtKSDvMkc,542231
633
669
  helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png,sha256=oco_P6kwqp0cC3YaT_2H2RhJ6p1sh3sEQq3R0RA_cT0,71934
634
670
  helm/benchmark/static_build/assets/vhelm-model-8afb7616.png,sha256=ivt2FhDk8dwnzp1MAle5WfbXzht_Mxg4rpy-xHRybjs,180285
635
671
  helm/benchmark/window_services/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
636
672
  helm/benchmark/window_services/default_window_service.py,sha256=HlLI3be8s-GNxDygNGrvo9exEhbrO8Vtr3w0rnSIx7M,181
637
- helm/benchmark/window_services/encoder_decoder_window_service.py,sha256=4_WCz6NpaW-71OoUCpuYgSbRbYhV4fmB3wSg7kEZb20,2155
673
+ helm/benchmark/window_services/encoder_decoder_window_service.py,sha256=wfdydJY6AmpYCfAv5PQu9D6nFXbuxIRum7Tsv0DemJE,2148
638
674
  helm/benchmark/window_services/ice_window_service.py,sha256=snyIWVeeknf202_pzBUmvPcA7UcN_FKyIpCGpO2CmFU,1100
639
675
  helm/benchmark/window_services/local_window_service.py,sha256=-6wlg8gN_dN80lptRWJQsPALCK6W80-KHA7gghs2-5M,5292
640
676
  helm/benchmark/window_services/no_decoding_window_service.py,sha256=s_i_cqIuU9p0GDRIBApaOHzjH7gHrBPTJ2X5NEcN33Y,1375
@@ -651,9 +687,9 @@ helm/benchmark/window_services/test_palmyra_window_service.py,sha256=u7xb7syXCxj
651
687
  helm/benchmark/window_services/test_t0pp_window_service.py,sha256=rmoMW8YsNpD_zC-GBi6M5GugT_lT9lfn5CbwNbr7d7I,4088
652
688
  helm/benchmark/window_services/test_t511b_window_service.py,sha256=zmFGL4Nwg3xQ7nRe-IEkl37wx59C33xBUS8qKHqBQeU,4091
653
689
  helm/benchmark/window_services/test_ul2_window_service.py,sha256=RhIK4i9XaUfgeqTZEEXxyqaIxdyu29BRKb0pBl7orKk,4151
654
- helm/benchmark/window_services/test_utils.py,sha256=Lej1zx3q-o5C4uhIIsAbexJjNMobY--c0wy8epXvfOk,3406
690
+ helm/benchmark/window_services/test_utils.py,sha256=O1jHGB0Dn0h03ayuosF_8AtikIe8p50d5HcfzT99rBU,3301
655
691
  helm/benchmark/window_services/test_yalm_window_service.py,sha256=PJqw2ySLOMg_iiAzJGzj-1YOrDbxFkmP6wjiDcj1RWA,4391
656
- helm/benchmark/window_services/tokenizer_service.py,sha256=RNznJBAxcCUMCurb7mbraZULx_ZtB0G7IxbrnUe0Urk,865
692
+ helm/benchmark/window_services/tokenizer_service.py,sha256=rf6VAZkPRkwH-KKxXoQnfQ2uozC0_A_9egGPyk1P0E4,755
657
693
  helm/benchmark/window_services/window_service.py,sha256=y6BthPY1V-ugmYfaJElm5Wfy3PSgoJLj10vHcXZZGNA,4727
658
694
  helm/benchmark/window_services/window_service_factory.py,sha256=T55F0Y2jiOYxUHHZxT4YX4fFXY5gfFhn56zIwUBhc7s,3423
659
695
  helm/benchmark/window_services/yalm_window_service.py,sha256=EwwCoMpr9WVLhCI7OI_7tmZHQfTUwn9FFWjbhIBFRfA,1089
@@ -667,20 +703,22 @@ helm/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
667
703
  helm/clients/ai21_client.py,sha256=RAXQufajYnxr3b_1Hl-wAZkeE_j6O8zX-vngWEits6c,8158
668
704
  helm/clients/ai21_utils.py,sha256=mlg3h615kyckccGZv9rqsP4Y60O3XpwyE-UURRMrxII,471
669
705
  helm/clients/aleph_alpha_client.py,sha256=BK2eQIHYMxLMsZNWld85ZCj17JAoy5lU7rHuSBa4fOM,4981
670
- helm/clients/anthropic_client.py,sha256=s3eCwHh8mbhxLi8up1WtQWKkUsHJa-LO44prNd7XYFc,34059
706
+ helm/clients/anthropic_client.py,sha256=R85gLYrheN2YWSGTnf3pkYTjCkTl300ktdlGLe1_1-o,36181
671
707
  helm/clients/auto_client.py,sha256=J5bCxIDZJUdV1dCv_EtbvwPzd1p2Ogtg207vpb3PhgI,11624
672
708
  helm/clients/azure_openai_client.py,sha256=mZ0udOAjadp7ZyE2KEtq8XuQp45eHlX_qM_getyzbA0,2009
673
- helm/clients/bedrock_client.py,sha256=RjkYkWCHhGFA5oB_Bry6K-WHryopkZtL5Zfh48gS34s,12145
709
+ helm/clients/bedrock_client.py,sha256=sXxzNTs3pwVIwvir5lyJWLRajI9p2lMiJq21XsZ_FZo,12267
674
710
  helm/clients/bedrock_utils.py,sha256=8ZZfyOuZkgxL_naJ-wwBnH4GKv425fu3MfyakGHxeb4,3764
675
- helm/clients/client.py,sha256=InjCQi62TWhWHmfyi-mC3fSAVztd-YDyfB3BkpacHXk,9002
711
+ helm/clients/client.py,sha256=fWJ_Eg4NyhPqlvpDvM7AjWN7cr2LU2uWdsnENLJXlTs,8963
676
712
  helm/clients/clip_score_client.py,sha256=ct3GHZ2Zh3fGwyvQ9DyoIPT6PwDPI-nUaFkUFuc8PIE,1622
677
713
  helm/clients/cohere_client.py,sha256=edQO5raoJYmYzfVREqHhNvjTcqPevG0M8EPMLOANqXY,10975
678
714
  helm/clients/cohere_utils.py,sha256=aYmj60m0e9RF9BIdxp1vmA-uZv17TEALw0dbgTUSpCc,504
679
715
  helm/clients/gcs_client.py,sha256=1sK5x5uWtThgz9gqBLaA8oyiXGD_9nn1WyfMzJRyPQ8,3231
680
716
  helm/clients/google_client.py,sha256=mIaUzK7GHCa9pqK1BEVhdt6dZsJfHv1Qdsf3I0Ayq8A,2912
681
717
  helm/clients/google_translate_client.py,sha256=TgiQEscjOae58Ptgp9f4n0LXUtl1Jf6v9BI-Z1_wcuw,1304
718
+ helm/clients/grok_client.py,sha256=SbVB6AduTwfElzUgEMnQW2kQUFVTCv4TpPPJvElQEe0,1127
682
719
  helm/clients/http_model_client.py,sha256=_F3_y2UWqbzESQdzV0FMEsECIKjporVSAW6iUQhJ35c,2818
683
- helm/clients/huggingface_client.py,sha256=adnFKZni9DiFDDVDkpQjWXf4HLyYLvpzy3aB9PD3HyY,15428
720
+ helm/clients/huggingface_client.py,sha256=FYrg8XoCHXi5eUWjS0S_n-eiva-Ri0g1oaaeT_ky-tE,17615
721
+ helm/clients/huggingface_pipeline_client.py,sha256=ivFTMNHBwwIUjkeOHkl-veZi5nNAjtnkYvneRFWs-6Q,6154
684
722
  helm/clients/ibm_client.py,sha256=4W4fbjnDNjXrP4gVwSfBHPus0QcqFOQzFvfaST1BE1Y,9701
685
723
  helm/clients/lit_gpt_client.py,sha256=pgLfSvusNpdj8F5DVxzQdHxTDRNX4RVt6unegao803U,6229
686
724
  helm/clients/lit_gpt_generate.py,sha256=8DdBE9ReQ00NbV3KMFYc--PlO9X-HMOR0Rhm5CADWEA,3103
@@ -689,10 +727,11 @@ helm/clients/mistral_client.py,sha256=ceM8KLAcniAqK1BNVdUGzqy4av2SEEau6PVmPivxc0
689
727
  helm/clients/moderation_api_client.py,sha256=I5pYWRb2MmcLDYrScnC3P5N7OUFzQiVQ828_hf7zjM4,4719
690
728
  helm/clients/nvidia_nim_client.py,sha256=Z1UAqR2jHacIO_QGqQl1JUZ_82JiSPstBOtj6xURmQk,902
691
729
  helm/clients/open_lm_client.py,sha256=qFgYqlV_3UiW8WJKz66lLqRqg2jt1qtJ1bHMRAtBn40,1749
692
- helm/clients/openai_client.py,sha256=BImOqF2fVdxJrgI74KCsXeUffABFc_pZ5jgSK5NdZ-o,25936
693
- helm/clients/palmyra_client.py,sha256=U8iD3IENbA4iEpFLNKc8O2UFNYxffgt1QrBvSzctsWs,7151
730
+ helm/clients/openai_client.py,sha256=s62_qafDVbDu5pzIkfQsflIwRzc4sXkSiDNkmZz68Ow,27775
731
+ helm/clients/openai_responses_client.py,sha256=zua7DZWLeOdpb1yY8YV10gmuGdqvvo_9YQPW3OIGPDU,7219
732
+ helm/clients/palmyra_client.py,sha256=4AaZcV2tPHU4HJ9FWSkOY8_C9ndEckH3PH715QxJQ8E,7086
694
733
  helm/clients/perspective_api_client.py,sha256=o_1FFTCrTny6AZ4EJTstX1H9t8SQSQ8dvhi321RTcL4,6105
695
- helm/clients/reka_client.py,sha256=8PW-NFsqohRQMR-JNWn9xhlG0YfghO_X-QQAnSt9Vqc,8341
734
+ helm/clients/reka_client.py,sha256=hA0tq3Hc9669q2sYa4Jr5yWy2NAbvoFDnVqQ6vds62w,8334
696
735
  helm/clients/simple_client.py,sha256=55S_y1eWD1bjktcG21Vs8G5bF6QbKKwmJyqs6lCUJeI,2048
697
736
  helm/clients/stanfordhealthcare_azure_openai_client.py,sha256=NGbeI6sMenmgqPQTWxYF3C1Aen29LybRcHcsmS3Jqmg,2059
698
737
  helm/clients/stanfordhealthcare_claude_client.py,sha256=ShhbLttPDRa-Pnvr35_2WmVx5s0XpsJMGzu5qhzLoLI,1020
@@ -705,17 +744,20 @@ helm/clients/test_client.py,sha256=T27UsIPWsbE1JK_8DN_DW9LkEcIGRbgDjio14YOIAb0,3
705
744
  helm/clients/test_huggingface_client.py,sha256=x2NjMuIrinfUy0wQ1S6F5cYZVr09YfvN6LfhWmyGNAM,3388
706
745
  helm/clients/test_simple_client.py,sha256=G0JRQX69ypQN2VxhlNQXs5u2Tdtkcl_aeHqudDUVKi4,702
707
746
  helm/clients/test_together_client.py,sha256=kyBLu-2i4EJyuJm5ft0yg8W-H1IqmULRXggEbChuxdo,6178
708
- helm/clients/together_client.py,sha256=xA_a0R0adb9vNkMfrXOIwwdpGoIPa4Nso2tXT_2YSVg,23215
747
+ helm/clients/together_client.py,sha256=tgjMlWscrauLFfMxDenh14oEBfLWyP9XYhz--YlvKVw,24264
709
748
  helm/clients/toxicity_classifier_client.py,sha256=AI_FizxMurubTIyeceRdkixSnhWQbcD-oEEONj5ve7o,464
710
749
  helm/clients/upstage_client.py,sha256=iSL1G8G3jWSbrpacz4I0l6Lwc5T01fsLR-wZzF39ftM,679
711
- helm/clients/vertexai_client.py,sha256=haOImGAaYwCyxr4__feG6nHUeHRbCV6ExE6Kp9aKtWs,22665
750
+ helm/clients/vertexai_client.py,sha256=PjMnz4u5YQdpIbfLLBFsrPuHCNrj0_fatf1rY89d-nQ,23113
712
751
  helm/clients/vllm_client.py,sha256=YLIxGoQ_ZXejA4nfVpmFE4tmHROEFxEbFsV8Ba25Eac,1658
752
+ helm/clients/writer_client.py,sha256=flKLeMbFkyGfNmv1ozZGU4dxNy-QF5bFJF0mGHqpU3c,4467
713
753
  helm/clients/yi_client.py,sha256=nC60d2HiUL2W59FTne9tWmZ9bGGY1OvI7Ob3Ng4wSPE,750
714
754
  helm/clients/audio_language/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
715
- helm/clients/audio_language/diva_llama_client.py,sha256=On6oNnGN_zZVkNq_kEFh4BgeNgvbNTbHCx7WRm4LYfI,4778
755
+ helm/clients/audio_language/diva_llama_client.py,sha256=Bvcf4wE7yMZlqETgKEMtCug8-2fQI8QCDdaGWSeQ2X8,4864
716
756
  helm/clients/audio_language/llama_omni_client.py,sha256=OCak716q97uEk9CBXQqnmUsbLFR-dddMzg5eyIZ4gzE,8718
717
- helm/clients/audio_language/qwen2_audiolm_client.py,sha256=cY2mScgTWr_No_MFZ8bZn5wKlNd9ae_IndShlegLtrs,8831
718
- helm/clients/audio_language/qwen_audiolm_client.py,sha256=_SHJh-0R3wj0qWJp3HSO7nPrDtr5G_nH3CaRSofFBxg,6236
757
+ helm/clients/audio_language/qwen2_5_omni_client.py,sha256=lbv6Hr22p0ReyR1bnN-dR8BzdPgilvGES7G03of8BWA,9090
758
+ helm/clients/audio_language/qwen2_audiolm_client.py,sha256=s9eH8fnVgw5xV39b_8AGt6IyNN3q9Uhcx6HZVxt7TM8,8981
759
+ helm/clients/audio_language/qwen_audiolm_client.py,sha256=RvYweXANEyzhHYDx38H10F0ZEFaL8kj7n7TZ-UrRmZs,6338
760
+ helm/clients/audio_language/test.py,sha256=FrKpirOwJW1__E2egq4VPgsTrgiSHZHBwfUCvxNjC0o,1969
719
761
  helm/clients/clip_scorers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
720
762
  helm/clients/clip_scorers/base_clip_scorer.py,sha256=NfXe79g6M4Wype3Xf-oXxscaUgjbZLmy9dRnBaLiWwk,695
721
763
  helm/clients/clip_scorers/clip_scorer.py,sha256=5KzYTrGuy5zA8yHX6c67Is98HLkqQooWhioPxHNLJ7s,1932
@@ -748,12 +790,12 @@ helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py,sha256=IpDLoPBDA-
748
790
  helm/clients/image_generation/dalle_mini/__init__.py,sha256=4RmnjfGTmgYaWsQmaDkOHxgo0Wxr9qqwtpMBC_5XeGg,112
749
791
  helm/clients/image_generation/dalle_mini/data.py,sha256=1unTc4lkUZ-6A2DfcbcglGtnE2KP3OuL4YWFROlsEQo,17622
750
792
  helm/clients/image_generation/dalle_mini/model/__init__.py,sha256=fyMDjpuzHxWjF5Fk9Rkfyn7KpvFAwxyRCJFoA2RDPdM,428
751
- helm/clients/image_generation/dalle_mini/model/configuration.py,sha256=8DvL27TPmbntElIy7FrOzlSqWdlkX3R2eNVxiV_QwOM,7861
752
- helm/clients/image_generation/dalle_mini/model/modeling.py,sha256=YVbiEQSVNmN16Sg1Sn-qR9SVXYfn6UD1-eCt9QcsRwQ,69702
793
+ helm/clients/image_generation/dalle_mini/model/configuration.py,sha256=AAeqmSiGOPd831VrytkWMbSSAv-4uEGk190svHsUGNU,7859
794
+ helm/clients/image_generation/dalle_mini/model/modeling.py,sha256=w9TSQYBjOygqj-QCQSqjzujahGicXRtnJObtXrCpCEQ,69700
753
795
  helm/clients/image_generation/dalle_mini/model/partitions.py,sha256=_fDpk34GL6NhNecHuP78y_gmKpWjbfw3fxMCWVEO4pc,2721
754
- helm/clients/image_generation/dalle_mini/model/processor.py,sha256=oTx5KHXKhZjVYaS0rmtlzCIbWUTJLh0plLNUWl8xxZ8,2406
796
+ helm/clients/image_generation/dalle_mini/model/processor.py,sha256=2JvF8XmYMiFrxxi4YcGDF1JrTFQPqBXfzYmb_ylCRls,2404
755
797
  helm/clients/image_generation/dalle_mini/model/text.py,sha256=Kfba8JdO2LrSmCVlQtgc7J2kSordCgjeg7WV9V45B80,7302
756
- helm/clients/image_generation/dalle_mini/model/tokenizer.py,sha256=SnPUzrfZXSAXXcQRCR8Ykhn5hJfUB3p5wNuriW5GWy0,245
798
+ helm/clients/image_generation/dalle_mini/model/tokenizer.py,sha256=fggtXzlh8HHHgT0T0d78KX6i16zFApnpkp7xOMAuD6c,243
757
799
  helm/clients/image_generation/dalle_mini/model/utils.py,sha256=clu2IiIpAT0DzTc2HvmI0ySnETFsJtpi7tocPkqOreY,1171
758
800
  helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py,sha256=01pV_QWUmcIpj5kBVihle_VGrJyw2AmV3QuhWASds2M,66
759
801
  helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py,sha256=4q39kdTUxeW55SN8NNkA9MdFZtH6rWssN8XauuOwyi0,1213
@@ -773,14 +815,14 @@ helm/clients/image_generation/mindalle/utils/config.py,sha256=lh8dXvL7ctKmuYEbeT
773
815
  helm/clients/image_generation/mindalle/utils/sampling.py,sha256=soTHaJrN4FV1lDdh9HMveJs6F49UMK57Xfa0ccnHqI8,5029
774
816
  helm/clients/image_generation/mindalle/utils/utils.py,sha256=ESugpzG-_73GKl07mj-8o-_nim_FOICxfYkczy3s9x4,3119
775
817
  helm/clients/vision_language/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
776
- helm/clients/vision_language/huggingface_vision2seq_client.py,sha256=hTywh5nM95BmPoDyKOSDWg9G3-QwLO3KZEJZVkmFroo,6478
777
- helm/clients/vision_language/huggingface_vlm_client.py,sha256=H7AE8mm506PkEcUO8VaLVtptHTwVX58nZx1A_BWdKzA,4968
778
- helm/clients/vision_language/idefics_client.py,sha256=hi1VCDBegHfBssmW0C62H3OX3U2ISVRhaSkd24gb1K4,7692
818
+ helm/clients/vision_language/huggingface_vision2seq_client.py,sha256=S4FDbSO917bUw3cK64xnxwH5HFH_Eb-w2zQ8ZL4eSSk,6588
819
+ helm/clients/vision_language/huggingface_vlm_client.py,sha256=OHV41AA-WZo_CnsHymwslgjDcVK0uHmIrvGbrxBDK5w,5000
820
+ helm/clients/vision_language/idefics_client.py,sha256=DURync-8rh2ccdlGDPl3NMgryBcMn5yCrrmFZisf5m0,7784
779
821
  helm/clients/vision_language/open_flamingo_client.py,sha256=QH6el-wkEl4PMZM9b3_H-o2PRaMvumGbN29ee9dmkMU,6519
780
- helm/clients/vision_language/paligemma_client.py,sha256=IU_T8r1RgpGkEAqabLKBbmoUOWV6c1a9_FXgiTy8exE,6835
822
+ helm/clients/vision_language/paligemma_client.py,sha256=K9MzXlgjXoiVafA8bbu-mKNt3Z9kq8v8AJL286DyQqI,6867
781
823
  helm/clients/vision_language/palmyra_vision_client.py,sha256=4elEdmwllMr2qzTzBdlRC8L5Ut3vOXFtanGGYrx4lv8,4074
782
- helm/clients/vision_language/qwen2_vlm_client.py,sha256=XQ6SB1mkpIuYPNZMQe6jkduvwQxMfrFtVnHKv3osFGo,7310
783
- helm/clients/vision_language/qwen_vlm_client.py,sha256=6rCH4gJMDyQHyjAE_GDIrLsInH_bvd6to-4RMWbRLeM,7407
824
+ helm/clients/vision_language/qwen2_vlm_client.py,sha256=jvh_-jyvFL4r3LPX-gWPCYHT503JtJ73FVHQS2KyQ2c,8325
825
+ helm/clients/vision_language/qwen_vlm_client.py,sha256=wNxEuYOrhjaW5s4vtdRxKvJ-LCTTGyKqiqD84j7H1Do,7565
784
826
  helm/clients/vision_language/open_flamingo/__init__.py,sha256=RTxnxjYnTmTZv-608o66_W74qmKLpEO6hx0cxaZaYv8,172
785
827
  helm/clients/vision_language/open_flamingo/src/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
786
828
  helm/clients/vision_language/open_flamingo/src/factory.py,sha256=4KRXLV5mOEZ34-Foq2zVgTye3sQD-Buz6NZTSp2X9_A,5790
@@ -796,15 +838,17 @@ helm/common/cache_backend_config.py,sha256=4u5A6BHNBmGnnrDNhCVgrdwhXQtyAbWcUeoo7
796
838
  helm/common/clip_score_request.py,sha256=WnNg89owDCmG7tyy8nnQL0RdKQLsUdMWiYH9XqqbGw8,840
797
839
  helm/common/codec.py,sha256=gTh6AwIQ0Bbul_QSnIO7eItwMZmYtnkIrG1jkc4GOL4,7100
798
840
  helm/common/concurrency.py,sha256=8THtHlCtXo5c8iCuz_UcBBdzZX6aiEALLc4u0M4SYL0,856
799
- helm/common/credentials_utils.py,sha256=O-57nUgkWLbZF0k3lsSaVGPPHj2_OYeVuCMe0to3bRE,1118
841
+ helm/common/context.py,sha256=0U5KNNKLHiiqjb8JVq03mninagEp9zTzFKP0He8o7A8,2788
842
+ helm/common/credentials_utils.py,sha256=BX_P6wUpLKA7Bg3Dztm7jVI2j4ls7H-h38UbmGMBt3A,1101
800
843
  helm/common/critique_request.py,sha256=yo4aRe-DEjudUmydthtpTj6LdhRXfZ3JZptxTkWzZ3U,3068
801
844
  helm/common/file_upload_request.py,sha256=OZeAW1_zsiNdXnWDwNNvhPs0b48TUmW_e4kzzCYmyiY,543
802
- helm/common/general.py,sha256=nMfHNPXyAAorAMmgDClD8r8XXeJcvfF0QXTP-FgH5PQ,11690
845
+ helm/common/general.py,sha256=TcdPXn_bgPFvXtFP2lJhncz4Q8SdTXnKOinHOTBsegw,12027
803
846
  helm/common/gpu_utils.py,sha256=pmLq6ipYNLEm28VxxSNeZuVt-gAw-WnYmBvxP1P1p6M,480
804
- helm/common/hierarchical_logger.py,sha256=EnKLnfbQftca08EJfjGEQb4tcnCKbx-JtwLnoCnhMQs,2908
847
+ helm/common/hierarchical_logger.py,sha256=KR5R7tjUJN-hTFdnfzEyfwAhvgTFH3JJCH-LSiilqLk,4192
805
848
  helm/common/image_generation_parameters.py,sha256=nsbuk_-BlRMK6IwP5y6BnTXbTRTOcvZ6uLblL5VHLOo,916
806
849
  helm/common/images_utils.py,sha256=8BsN0fd8pc0rh_TSDvippWhTfwmJJXKNF2zqKLB8cps,3372
807
850
  helm/common/key_value_store.py,sha256=D9ZBORzZncf3zHQOP4AuNbQnV8cZpO_kqHY1mDRugqQ,3174
851
+ helm/common/local_context.py,sha256=lpQSLqybZda7LDg5drYQrT8blWORvOOB4yXyCU9d8Ts,6493
808
852
  helm/common/media_object.py,sha256=1SlilnsrfZVVpfci1atin8hbREnGoNQwjBcNAH8RgBU,5151
809
853
  helm/common/moderations_api_request.py,sha256=3xTsErSsCr2PHD2jpdV1JglHaYHwP2Yqu25_JFtfa68,2234
810
854
  helm/common/mongo_key_value_store.py,sha256=G0TIWQcvwMjyXh4TnN6xJ462HKHUAZtQJJYQOrHK-K8,3887
@@ -814,7 +858,8 @@ helm/common/object_spec.py,sha256=_usgTDQULBF6_jy7C6m-9ZNVvNxbGoTE_CdGcSvBASU,43
814
858
  helm/common/optional_dependencies.py,sha256=Qam3QCHff8tuXbS-fCw-MVe-pK18gSvHw-uQoXXxT7M,616
815
859
  helm/common/perspective_api_request.py,sha256=WAVwtajNVmi5XJNsPcorGEAVrqkpPSk-Kd3b0hJghbA,2427
816
860
  helm/common/reeval_parameters.py,sha256=exaEucXnSI8a076uq_qhO3CTBztMMRoRzL_7v1N4adE,300
817
- helm/common/request.py,sha256=w6N1TmVnc6C1gzyFyhspU1nf5pOE4zBwdeGMFcsBZLk,9022
861
+ helm/common/remote_context.py,sha256=DzFMii9AN03CoWp1J3k703-7oQJYHwEf9TDV5YzM6v4,2825
862
+ helm/common/request.py,sha256=HWj6IizIwJm9_NigO-geira_rI6aqhj5CevQB694m94,9161
818
863
  helm/common/response_format.py,sha256=wIptA8FydZoRjMvO5SFIplgDXhwpZvZmFI-Bi-7mcGU,516
819
864
  helm/common/test_cache.py,sha256=j19p-qzv_98X_TMW4b39ZHwSJ-MX3p91PrkYumarS6Y,4870
820
865
  helm/common/test_codec.py,sha256=igL--k-2DwAy0eoMr8D9Xs8MOjBoT0LutbMPzDlTNkM,5885
@@ -826,12 +871,12 @@ helm/common/file_caches/file_cache.py,sha256=QfF1hlF8FQ-rcPn9Zyl6L0dOCokvYgd-dFq
826
871
  helm/common/file_caches/local_file_cache.py,sha256=NiXbat1BBGl5P27oERqSLFfhIHpYqA1IQrvE_N1sWR8,1944
827
872
  helm/common/file_caches/test_local_file_cache.py,sha256=ANb01ctUV-J4i1ab3l4uhg9Ce54U_56xq9Hayjt1WhQ,686
828
873
  helm/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
829
- helm/config/model_deployments.yaml,sha256=_zdXhMH50hqKFbtFymUFb_LmlCz3XR1-WARsDAH6ESs,135205
830
- helm/config/model_metadata.yaml,sha256=M-23M608OVESCUai6iBLBIT_17O80pI-YWkEYRax-gk,226327
831
- helm/config/tokenizer_configs.yaml,sha256=o7oX0jQXqKuoLC2z5YgdvJlcMcr15WtNjlqAkYLLDq8,32860
874
+ helm/config/model_deployments.yaml,sha256=ec7CZLii6mpJeNC93J4gMgh1YrkU6Fj2XpXJaes01xY,160890
875
+ helm/config/model_metadata.yaml,sha256=JvvKKEePcGCQf_cHGQv-k_Yj4GmB71lvRY2Is176a9s,263155
876
+ helm/config/tokenizer_configs.yaml,sha256=Xju6-GcWJD-nmS5U0dUgkOexHWVWCd-J59EiVufoOCs,37687
832
877
  helm/proxy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
833
878
  helm/proxy/accounts.py,sha256=gd5cKhKeqklf_cXCAISl65AUvZeD6afBNrs6WK3IBvQ,14764
834
- helm/proxy/cli.py,sha256=3bG-w71CsnPgVzN53aYWebAf0avBNJCVaxxDLupEXk4,8264
879
+ helm/proxy/cli.py,sha256=apG3ByfyMciZFXV5wX2177p1B5eqkxCY6VoRgwJ81Kk,8316
835
880
  helm/proxy/example_queries.py,sha256=EB2vVpAryOUAFiLrwsMiFz0zGl_UAQ8TJ9SkWngvsu4,4389
836
881
  helm/proxy/query.py,sha256=eftbiUICMh8QIHVs-7cLtv_rDXKeKdRPmwjLMu0TDxQ,645
837
882
  helm/proxy/retry.py,sha256=iLZmKATEJQa9jsSpOIx6YDRhmrA8G1Qm21cUxCuo2Ug,3490
@@ -843,13 +888,13 @@ helm/proxy/critique/critique_client.py,sha256=ATZuXw77lejwtpgLg3Soy3VDyv8D8xetl0
843
888
  helm/proxy/critique/mechanical_turk_critique_client.py,sha256=OcppmFOMweBSfVTiLIICIwjvPpHHTkdu9fFUTaubitQ,574
844
889
  helm/proxy/critique/mechanical_turk_critique_exporter.py,sha256=taULrc_cIP0O9c5UpGz3l9DmWQadTVzN_v-qzTgMoyo,8470
845
890
  helm/proxy/critique/mechanical_turk_critique_importer.py,sha256=NL97joO5pRkcICRdVyG4kf9JhfYRaySsxRoZ7KWDYv0,5581
846
- helm/proxy/critique/mechanical_turk_utils.py,sha256=mKpUv4zz3s5ptzDY7UrwuI7Cr5HmNgSjPC10BnN9AL4,1766
891
+ helm/proxy/critique/mechanical_turk_utils.py,sha256=MUMcxMA08OXJTtgCX7ejGQQivMNF3Xfu4AAHkvuft9s,1766
847
892
  helm/proxy/critique/model_critique_client.py,sha256=QMFiMpALXnneumKbJpXOZDEb3lPPdkIaSCasmdXHB8o,12806
848
893
  helm/proxy/critique/scale_critique_client.py,sha256=B4povtceyfal95eE3N7em9cC_B5Vy4jMrHXcsXc_5m4,15889
849
894
  helm/proxy/critique/surge_ai_critique_client.py,sha256=HnzgAoF4Du9Me0GS_lbNaozZslS4a2OZx735gh-coo0,8357
850
895
  helm/proxy/services/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
851
896
  helm/proxy/services/remote_service.py,sha256=zehXO0JYIR6fIgqSZ1p7icPBITYPYfjgTX1ZbxiN1dI,8806
852
- helm/proxy/services/server_service.py,sha256=vIf0GxDRuHFmCQHpgn5wYURlBkMNfF9e0jMQitc80-w,10691
897
+ helm/proxy/services/server_service.py,sha256=VTDkULezp2vniGKfH2fP7PHf_DAtsh4qXwKQ0tD_Wxc,7357
853
898
  helm/proxy/services/service.py,sha256=YFG5ZlBYBz3IdSVRKDIKVlAmA-oLjFCeBHE3iIe_SU8,6020
854
899
  helm/proxy/services/test_remote_service.py,sha256=xzkyptctXw3y5d1fgbidBMyw8B4rILZStC_C-hLgLUc,6643
855
900
  helm/proxy/services/test_service.py,sha256=oDYen-71iwZ6YMNBVbVSdEFsH6GMvZYw5tS5Eg4YHjY,8987
@@ -869,13 +914,15 @@ helm/tokenizers/aleph_alpha_tokenizer.py,sha256=Ofc5thTfW_eb5ztiU-y_0p6e2PIGbHMb
869
914
  helm/tokenizers/auto_tokenizer.py,sha256=Of-T-CFOhLAjjU45T1hnrEPG_k_hzPufuDE7FRAcSN8,4251
870
915
  helm/tokenizers/caching_tokenizer.py,sha256=BwcyVzG7vy3R2O0UgbNxNP2nN4wBnsvpG_9mXQuDYfw,7300
871
916
  helm/tokenizers/cohere_tokenizer.py,sha256=6WwHIt7SsICmYR2QQpwDJ7pfNF8VWrFHFxF5Kynq6aY,2116
917
+ helm/tokenizers/grok_tokenizer.py,sha256=Ms7QFYNookeq29AIfHUIXfKhrpRrPOPsNs0zBzWdLKA,2084
872
918
  helm/tokenizers/http_model_tokenizer.py,sha256=J5Myg6JVDNgHMN7XOHwGV3WrhilUZ9Sw_FrgO4frYuY,3124
873
- helm/tokenizers/huggingface_tokenizer.py,sha256=fpKwSnZl94AnXQybzJhVnTda5zJnGsjGphKrlPFa_Fg,8726
919
+ helm/tokenizers/huggingface_tokenizer.py,sha256=P2ri4n-SUWB9ShMlxlJ9kO-mPmbSTizMGwAf41JE5ds,8734
874
920
  helm/tokenizers/lit_gpt_tokenizer.py,sha256=0c6KDeLNHPd6h27SXQvkUfmrCSLYa1kQY1GqCHVfhvw,1675
875
921
  helm/tokenizers/simple_tokenizer.py,sha256=6_NROqVbygs-HRA7bYAZluN4YB5gUhVaRsYQeRTjA1E,1147
876
922
  helm/tokenizers/test_ai21_tokenizer.py,sha256=V8orjdKxmEV44VYoZ9Sq5E7CIq2caNnr6vjdk0T_w1A,1646
877
923
  helm/tokenizers/test_anthropic_tokenizer.py,sha256=h7sJMRv_O2yAuEzbrXLJJIo9Gy8wkTycc4gu6UFvDaw,3937
878
924
  helm/tokenizers/test_cohere_tokenizer.py,sha256=15z2GJtZ-VlrliC2_Fk5DIZhQYFkJS7J73fjxYMf8YM,1431
925
+ helm/tokenizers/test_grok_tokenizer.py,sha256=b094C_M2a1zNM3SsGzp9cNNm8aDmmoz1kFbPkubbVTQ,1212
879
926
  helm/tokenizers/test_huggingface_tokenizer.py,sha256=7OB2d0PaCp-qmGXVt0V3yf0ciilN3Kd2qnAYprWRl64,6324
880
927
  helm/tokenizers/test_simple_tokenizer.py,sha256=vUNdcnJqZV99-E8H1rwUH85AQPJ2HTnDr5DrZ_-zRL4,1219
881
928
  helm/tokenizers/test_yalm_tokenizer.py,sha256=8IeJM3X61p3ygBfK_bJtPh_xOJ83IluaZ3UM2xTtbEY,2492
@@ -887,8 +934,8 @@ helm/tokenizers/yalm_tokenizer_data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQ
887
934
  helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py,sha256=1ZcPL3srfk031LmA8bEdPcIraAPnHGiYi_CqTiJSTlc,904
888
935
  helm/tokenizers/yalm_tokenizer_data/voc_100b.sp,sha256=LmPD0_OIOXi8dWuNjXUYOSPhf8kPp2xhvK-g3bXcwrQ,2815034
889
936
  helm/tokenizers/yalm_tokenizer_data/yalm_tokenizer.py,sha256=kH5Qig1_6r_sKbAHinX7C83tqBUoTwbe-gGZCbGVkko,6389
890
- crfm_helm-0.5.5.dist-info/METADATA,sha256=xmTkUJOFSJ_wrES6pixgfpOjzgS4eJlWaEpGpidNEo8,22210
891
- crfm_helm-0.5.5.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
892
- crfm_helm-0.5.5.dist-info/entry_points.txt,sha256=AvH9soAH3uey9xffisWewd0yrmPWGASC036jHd1SFyg,300
893
- crfm_helm-0.5.5.dist-info/top_level.txt,sha256=s9yl-XmuTId6n_W_xRjCS99MHTwPXOlkKxmTr8xZUNY,5
894
- crfm_helm-0.5.5.dist-info/RECORD,,
937
+ crfm_helm-0.5.6.dist-info/METADATA,sha256=QlR8qMFpWzt_gIs6aCdrEEUuOS5uCdg1kbRMoI7YGYc,23069
938
+ crfm_helm-0.5.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
939
+ crfm_helm-0.5.6.dist-info/entry_points.txt,sha256=AvH9soAH3uey9xffisWewd0yrmPWGASC036jHd1SFyg,300
940
+ crfm_helm-0.5.6.dist-info/top_level.txt,sha256=s9yl-XmuTId6n_W_xRjCS99MHTwPXOlkKxmTr8xZUNY,5
941
+ crfm_helm-0.5.6.dist-info/RECORD,,