crfm-helm 0.5.5__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (268) hide show
  1. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +74 -53
  2. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +262 -182
  3. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
  5. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  6. helm/benchmark/annotation/air_bench_annotator.py +2 -2
  7. helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
  8. helm/benchmark/annotation/bird_sql_annotator.py +2 -2
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
  10. helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
  11. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
  12. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  13. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  14. helm/benchmark/annotation/model_as_judge.py +12 -16
  15. helm/benchmark/annotation/omni_math_annotator.py +13 -14
  16. helm/benchmark/annotation/wildbench_annotator.py +9 -9
  17. helm/benchmark/executor.py +11 -12
  18. helm/benchmark/metrics/aci_bench_metrics.py +9 -29
  19. helm/benchmark/metrics/bias_word_lists.py +1 -1
  20. helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
  21. helm/benchmark/metrics/classification_metrics.py +3 -3
  22. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  23. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  24. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  25. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  26. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  27. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  28. helm/benchmark/metrics/comet_metric.py +1 -1
  29. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
  30. helm/benchmark/metrics/copyright_metrics.py +1 -1
  31. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  32. helm/benchmark/metrics/dischargeme_metrics.py +9 -29
  33. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  34. helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
  35. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  36. helm/benchmark/metrics/ifeval_metrics.py +2 -2
  37. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  38. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  39. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  40. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  41. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  42. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  43. helm/benchmark/metrics/med_dialog_metrics.py +9 -29
  44. helm/benchmark/metrics/medalign_metrics.py +9 -29
  45. helm/benchmark/metrics/medi_qa_metrics.py +9 -29
  46. helm/benchmark/metrics/medication_qa_metrics.py +10 -30
  47. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  48. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  49. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  50. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  51. helm/benchmark/metrics/mental_health_metrics.py +9 -29
  52. helm/benchmark/metrics/metric_service.py +11 -11
  53. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  54. helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
  55. helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
  56. helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
  57. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  58. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  59. helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
  60. helm/benchmark/metrics/summac/model_summac.py +2 -3
  61. helm/benchmark/metrics/summarization_metrics.py +2 -1
  62. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
  63. helm/benchmark/metrics/toxicity_metrics.py +2 -2
  64. helm/benchmark/metrics/unitxt_metrics.py +3 -4
  65. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  66. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  67. helm/benchmark/model_deployment_registry.py +16 -26
  68. helm/benchmark/presentation/contamination.py +3 -3
  69. helm/benchmark/presentation/create_plots.py +43 -13
  70. helm/benchmark/presentation/run_display.py +13 -0
  71. helm/benchmark/presentation/schema.py +7 -1
  72. helm/benchmark/presentation/summarize.py +84 -61
  73. helm/benchmark/presentation/test_create_plots.py +4 -1
  74. helm/benchmark/reeval_run.py +3 -4
  75. helm/benchmark/reeval_runner.py +3 -3
  76. helm/benchmark/run.py +84 -73
  77. helm/benchmark/run_expander.py +12 -1
  78. helm/benchmark/run_spec_factory.py +7 -6
  79. helm/benchmark/run_specs/arabic_run_specs.py +73 -0
  80. helm/benchmark/run_specs/audio_run_specs.py +52 -8
  81. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  82. helm/benchmark/run_specs/classic_run_specs.py +0 -53
  83. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  84. helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
  85. helm/benchmark/run_specs/experimental_run_specs.py +31 -1
  86. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  87. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  88. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  89. helm/benchmark/run_specs/long_context_run_specs.py +114 -15
  90. helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
  91. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  92. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  93. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +163 -0
  94. helm/benchmark/run_specs/vlm_run_specs.py +28 -0
  95. helm/benchmark/runner.py +5 -5
  96. helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
  97. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  98. helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
  99. helm/benchmark/scenarios/aratrust_scenario.py +76 -0
  100. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
  101. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
  102. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  103. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  104. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  105. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +104 -0
  106. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
  107. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +118 -0
  108. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +86 -0
  109. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +117 -0
  110. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
  111. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
  112. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
  113. helm/benchmark/scenarios/bluex_scenario.py +66 -0
  114. helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
  115. helm/benchmark/scenarios/clear_scenario.py +11 -7
  116. helm/benchmark/scenarios/cleva_scenario.py +1 -1
  117. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  118. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  119. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  120. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  121. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  122. helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
  123. helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
  124. helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
  125. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  126. helm/benchmark/scenarios/grammar.py +2 -2
  127. helm/benchmark/scenarios/headqa_scenario.py +6 -1
  128. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  129. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
  130. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  131. helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
  132. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  133. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  134. helm/benchmark/scenarios/math_scenario.py +21 -20
  135. helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
  136. helm/benchmark/scenarios/medalign_scenario.py +9 -3
  137. helm/benchmark/scenarios/medalign_scenario_helper.py +27 -130
  138. helm/benchmark/scenarios/medbullets_scenario.py +7 -2
  139. helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
  140. helm/benchmark/scenarios/medec_scenario.py +6 -1
  141. helm/benchmark/scenarios/medhallu_scenario.py +7 -1
  142. helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
  143. helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
  144. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  145. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  146. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  147. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  148. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  149. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  150. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  151. helm/benchmark/scenarios/mental_health_scenario.py +16 -5
  152. helm/benchmark/scenarios/mimic_bhc_scenario.py +13 -8
  153. helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
  154. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
  155. helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
  156. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  157. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
  158. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
  159. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
  160. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  161. helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
  162. helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
  163. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
  164. helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
  165. helm/benchmark/scenarios/seahelm_scenario.py +2 -2
  166. helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
  167. helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
  168. helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
  169. helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
  170. helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
  171. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  172. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  173. helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
  174. helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
  175. helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
  176. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
  177. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  178. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  179. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  180. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  181. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  182. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  183. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  184. helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
  185. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  186. helm/benchmark/server.py +2 -1
  187. helm/benchmark/slurm_jobs.py +1 -2
  188. helm/benchmark/slurm_runner.py +8 -1
  189. helm/benchmark/static/schema_arabic.yaml +228 -0
  190. helm/benchmark/static/schema_audio.yaml +60 -49
  191. helm/benchmark/static/schema_classic.yaml +0 -17
  192. helm/benchmark/static/schema_enterprise.yaml +21 -0
  193. helm/benchmark/static/schema_long_context.yaml +81 -20
  194. helm/benchmark/static/schema_medhelm.yaml +272 -213
  195. helm/benchmark/static/schema_melt.yaml +1257 -0
  196. helm/benchmark/static/schema_slphelm.yaml +162 -0
  197. helm/benchmark/static/schema_vhelm.yaml +26 -26
  198. helm/benchmark/static/schema_video.yaml +219 -0
  199. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  200. helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
  201. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  202. helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
  203. helm/benchmark/static_build/index.html +4 -4
  204. helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
  205. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  206. helm/benchmark/window_services/test_utils.py +3 -4
  207. helm/benchmark/window_services/tokenizer_service.py +7 -8
  208. helm/clients/anthropic_client.py +69 -29
  209. helm/clients/audio_language/diva_llama_client.py +4 -2
  210. helm/clients/audio_language/qwen2_5_omni_client.py +209 -0
  211. helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
  212. helm/clients/audio_language/qwen_audiolm_client.py +4 -2
  213. helm/clients/audio_language/test.py +62 -0
  214. helm/clients/bedrock_client.py +3 -1
  215. helm/clients/client.py +7 -7
  216. helm/clients/grok_client.py +36 -0
  217. helm/clients/huggingface_client.py +42 -3
  218. helm/clients/huggingface_pipeline_client.py +138 -0
  219. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  220. helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
  221. helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
  222. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  223. helm/clients/openai_client.py +102 -55
  224. helm/clients/openai_responses_client.py +176 -0
  225. helm/clients/palmyra_client.py +2 -5
  226. helm/clients/reka_client.py +2 -2
  227. helm/clients/test_huggingface_client.py +3 -3
  228. helm/clients/together_client.py +31 -6
  229. helm/clients/vertexai_client.py +17 -9
  230. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  231. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  232. helm/clients/vision_language/idefics_client.py +6 -2
  233. helm/clients/vision_language/paligemma_client.py +2 -2
  234. helm/clients/vision_language/qwen2_vlm_client.py +66 -53
  235. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  236. helm/clients/vllm_client.py +43 -7
  237. helm/clients/vllm_granite_thinking_client.py +56 -0
  238. helm/clients/writer_client.py +102 -0
  239. helm/common/context.py +80 -0
  240. helm/common/credentials_utils.py +5 -5
  241. helm/common/critique_request.py +0 -1
  242. helm/common/general.py +9 -2
  243. helm/common/hierarchical_logger.py +104 -12
  244. helm/common/local_context.py +140 -0
  245. helm/common/object_spec.py +23 -8
  246. helm/common/remote_context.py +61 -0
  247. helm/common/request.py +8 -0
  248. helm/common/test_logging.py +94 -0
  249. helm/config/model_deployments.yaml +995 -45
  250. helm/config/model_metadata.yaml +780 -59
  251. helm/config/tokenizer_configs.yaml +224 -3
  252. helm/proxy/cli.py +4 -2
  253. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  254. helm/proxy/retry.py +5 -0
  255. helm/proxy/services/server_service.py +21 -85
  256. helm/tokenizers/grok_tokenizer.py +55 -0
  257. helm/tokenizers/huggingface_tokenizer.py +1 -1
  258. helm/tokenizers/test_grok_tokenizer.py +33 -0
  259. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  260. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  261. helm/benchmark/scenarios/numeracy_scenario.py +0 -793
  262. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
  263. helm/benchmark/static_build/assets/index-262903c1.js +0 -10
  264. helm/benchmark/static_build/assets/index-42060d71.css +0 -1
  265. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
  266. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
  267. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
  268. /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
@@ -1,26 +1,26 @@
1
- crfm_helm-0.5.5.dist-info/licenses/LICENSE,sha256=bJiay7Nn5SHQ2n_4ZIT3AE0W1RGq4O7pxOApgBsaT64,11349
1
+ crfm_helm-0.5.7.dist-info/licenses/LICENSE,sha256=bJiay7Nn5SHQ2n_4ZIT3AE0W1RGq4O7pxOApgBsaT64,11349
2
2
  helm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  helm/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
4
  helm/benchmark/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
5
  helm/benchmark/annotation_executor.py,sha256=LEehcWmkmqV_bFFzzmdm3GqsObJGCqoAYi1ekwG-yQ4,5757
6
6
  helm/benchmark/config_registry.py,sha256=Cd25a8FHriUzAgvGGU5sBAPyhisdSIjdUJR4YbYs6T4,1603
7
7
  helm/benchmark/data_preprocessor.py,sha256=wqGzAiLwOYa4v6TVPe6ayrnuzdNbmfjeiofRQiO2uso,2201
8
- helm/benchmark/executor.py,sha256=simd7SdJ7TciUpoq3D0uz_XUSCZj5KIWCIP57FYm4js,4906
8
+ helm/benchmark/executor.py,sha256=E7cF1vMXBn5eT1z5Le5ng4M9AaIMLjxfLgMmF1EfZy0,4843
9
9
  helm/benchmark/huggingface_registration.py,sha256=DAiHffNmo4H90rBfvQ_LHADtUCnCk6dfpI7Wbat1DZA,4389
10
- helm/benchmark/model_deployment_registry.py,sha256=BjL0ghHgO7_Z5jZZ7kuSOj9saegI3BivaL-b699C0rc,9527
10
+ helm/benchmark/model_deployment_registry.py,sha256=aPBkSr59jqx6ThFW-DYFhi3tPsLLhSKF5JC4-pxqLrk,9011
11
11
  helm/benchmark/model_metadata_registry.py,sha256=7XisV0an_edM8hvP8LSoCnTeUN2QLJrQknOCA6-OE7M,8841
12
12
  helm/benchmark/multi_gpu_runner.py,sha256=WmTKpVfcKXyiiPzrmxpbvQoZy0Ua8IyPgxB8r_3jrRw,4773
13
- helm/benchmark/reeval_run.py,sha256=ApGc7T3koXPLhW60e4g8KNMbOfhUTMwMXMWIPcHXWGo,7211
14
- helm/benchmark/reeval_runner.py,sha256=Qt9t47c6xJqGmkGYKfDLhf6idhQHThk_46fui9tsIwA,15593
15
- helm/benchmark/run.py,sha256=eVtwVYvm51R-maimOehonn3IvJObGUFPbGvqoedykKQ,13658
16
- helm/benchmark/run_expander.py,sha256=ZIVTmFUZlu9SJR0yTiNErOVT9-zSR-pU3cje8jdltuQ,55891
13
+ helm/benchmark/reeval_run.py,sha256=vImL8JNhveEOftZbRQ6JAxF0L-XCKIwh65M6fIYo4RU,7198
14
+ helm/benchmark/reeval_runner.py,sha256=bJPl7XVOVwK2fUA7voOVQYwVFEOfKVnrT2tbSGQzQY8,15584
15
+ helm/benchmark/run.py,sha256=ZyqkKnqkMqM2AH4HL6sH72H8-mrDWu0NW0piE7BY0HM,13973
16
+ helm/benchmark/run_expander.py,sha256=hKFLpmq8W2KBl_mBf-ahHEbt67qZFgu-VxjvidOeQuE,56543
17
17
  helm/benchmark/run_spec.py,sha256=GiIU8iGO2FGYFDWIxt51CeNPsW7rM7BzDqH1KgEL1cg,3217
18
- helm/benchmark/run_spec_factory.py,sha256=lchT8iltTIYrkJ_uGAQkS5gmu9gvrZ-mVIkx2KhR10g,7728
19
- helm/benchmark/runner.py,sha256=zlHDJ2Ys5-HxtXcwpkXcrdfXy_i886fBcq1iNeLyC3Q,14669
18
+ helm/benchmark/run_spec_factory.py,sha256=Hxeft3fXoWNz9yGo-2nIfb5pd3GDWlwYWc6YYvAkTjM,7785
19
+ helm/benchmark/runner.py,sha256=O-91eRRrNgE4_tlCVeLq9_0QsRfNELvaQT-KWtJw894,14618
20
20
  helm/benchmark/runner_config_registry.py,sha256=2gW5wBLkHdYb2WNbZulto06hTcto2ROvjy8HULw3jNM,515
21
- helm/benchmark/server.py,sha256=_L5rb3NqtShQMkpESPKpo04KcMcRzR_ZQkWFokAb-nU,6344
22
- helm/benchmark/slurm_jobs.py,sha256=eNCAoaWDfT0Wk32ZJRIGo-x8kgjhDPnPB4Xrvw_eLB0,3225
23
- helm/benchmark/slurm_runner.py,sha256=RjmwMqMdKwOzd9B2S6fkuSqB2UjybmiSRVjraiLtzgM,16567
21
+ helm/benchmark/server.py,sha256=uphh9L0FQnVZVVoGx50MMb_jXh-uen6ouE3uDN5GKFE,6422
22
+ helm/benchmark/slurm_jobs.py,sha256=6m11gyMo-cA2dwxR2pBXv4tEds5Aok4YCQQyHRmPoPk,3164
23
+ helm/benchmark/slurm_runner.py,sha256=T4vSoxwdRR8gqyL4S2sw_Le-9rv9BPC0BlOy88pwt70,16785
24
24
  helm/benchmark/test_data_preprocessor.py,sha256=_esdtkqyU_8Yp5ZOO7n1b-Y4Qc28wpD5drG-4Y4UhIM,2219
25
25
  helm/benchmark/test_run_expander.py,sha256=gLeHkNt_nLgbwEJiYxhwda-eKA3sJAxkYolCvgRN5TY,1163
26
26
  helm/benchmark/tokenizer_config_registry.py,sha256=ZOImg38ta0FXZYAWna6q7A5xrG2mU7Ofr-8j4EqGlUY,1585
@@ -37,13 +37,13 @@ helm/benchmark/adaptation/adapters/binary_ranking_adapter.py,sha256=dvwirvz4dRzJ
37
37
  helm/benchmark/adaptation/adapters/chat_adapter.py,sha256=1Pf2XgdtrqAxbZPkUfw7TUH2lrulYoDTkC8Q0sckQHA,1852
38
38
  helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py,sha256=dhDZANH5lyL5VdR_Ks72cNlP-NHbJqThZVP6xKHmXaE,5034
39
39
  helm/benchmark/adaptation/adapters/generation_adapter.py,sha256=LI7uWpKIHvTUjGiygmjB_1HLk26vNkYYCBWIx0EEyL4,2180
40
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py,sha256=QckQyDe_BvEj3sOZ65UEqR7rMcOVPEq7MREeE7DHrjA,15031
40
+ helm/benchmark/adaptation/adapters/in_context_learning_adapter.py,sha256=8LepCkI5b0MOL70pRPGb7vEH0KFMxIlpCQIVIzQT_vE,15030
41
41
  helm/benchmark/adaptation/adapters/language_modeling_adapter.py,sha256=u_GFEgg5wmpate-s5U5aMsmcHuFmreJcA8J0TO1kPCc,14907
42
42
  helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py,sha256=-fY4mvzoGCCoR0HesT_xf2U2m2arVjgDuj59lm07_tg,1923
43
43
  helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py,sha256=lzmHwvDOHWl9IWC3NTLGfJDbduXtK_zrS2_YoUQmdc8,4464
44
44
  helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py,sha256=RV6B3i5juBbJCtPDWzSfma49YXeDq3vQAQ5xQwnH-cA,3282
45
45
  helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py,sha256=hhH9ehK092j1WdUwrKYSy5PvNJ73gsIu6-5W8aLoYVI,2190
46
- helm/benchmark/adaptation/adapters/test_adapter.py,sha256=0-JrYnogZu4kENQG1eQMXHWnuSurCLRbkLpDuSnfRqs,745
46
+ helm/benchmark/adaptation/adapters/test_adapter.py,sha256=7Nr6kMK3JN0UjMjjZ6P1fsD5xhOeaqh0D1xI6LFKCos,641
47
47
  helm/benchmark/adaptation/adapters/test_generation_adapter.py,sha256=Iq5q0HpBHrI3d2SodI0OwQ-COXuM7KvCjlBk_zNguNI,12868
48
48
  helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py,sha256=HASZNtKXYWOOIMKVe16yokWNfCNJITJXoUhDLVkk-FQ,8048
49
49
  helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py,sha256=-412yPKMylDMDXpbG-SlssXEjZlr3dshecrTFZoE-wY,11942
@@ -56,39 +56,40 @@ helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimoda
56
56
  helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py,sha256=6nuz0Vn89A1mOedutsiq2SwTOG3qn8dUZTiaXhKffiw,3587
57
57
  helm/benchmark/annotation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
58
58
  helm/benchmark/annotation/aci_bench_annotator.py,sha256=SjXidlbpm5HOhdhNXg3HjabMEQvt3hq1iJ5GPajxt8M,3228
59
- helm/benchmark/annotation/air_bench_annotator.py,sha256=wC9LKP_I8XX-Qa9Na5OQes4QOYmJoVN9fi8tcXPlKuc,3500
59
+ helm/benchmark/annotation/air_bench_annotator.py,sha256=CDyHVwD4eoymfLduJC5WvvhDX1DOgYBqgjvqBjoCfU8,3501
60
60
  helm/benchmark/annotation/annotator.py,sha256=__BkMVpAEpSs1pbwPK5sVWLdCAXnjsHcPYgmOqmNPu0,1843
61
61
  helm/benchmark/annotation/annotator_factory.py,sha256=8uo5uz1UpIVCHUd7CRvmy6b9XB1gspdHmgxH5UZMPVI,2335
62
62
  helm/benchmark/annotation/anthropic_red_team_annotator.py,sha256=4hob15m2k9e2A97E0aG9FstCbJ_oMM7-9y-nh2EaYqc,2395
63
63
  helm/benchmark/annotation/autobencher_capabilities_annotator.py,sha256=TkW3xbcEuaPeGwuFrlu0YNSmj896WarmVT0WYL1it_E,4913
64
64
  helm/benchmark/annotation/autobencher_safety_annotator.py,sha256=w_xjZmY1zuLjVvVbcbUygNvqcfn5dtwpXeV99yqm9aU,3914
65
- helm/benchmark/annotation/bigcodebench_annotator.py,sha256=_p_keqJ6WwOGP7wTfNFY_zAADN3HUHHNpb8QenEkcQE,4449
66
- helm/benchmark/annotation/bird_sql_annotator.py,sha256=mYK-2LeMzo9RrWzZFqaIRtn-1VyOe4ArCqBqtF6RAD0,2443
65
+ helm/benchmark/annotation/bigcodebench_annotator.py,sha256=CJG2pn1DeHJCp3yHETRquNIkCHfd6ZNuOiUjG1cQ_JY,4448
66
+ helm/benchmark/annotation/bird_sql_annotator.py,sha256=FQDZs1-O1jfJOET0eDeU7lf5xLaiMPohC5BdmQ4XkzI,2436
67
67
  helm/benchmark/annotation/call_center_annotator.py,sha256=pTEjwfA4tgZhroFbamoQ8IO_D1O9r6k5GIlD50JEg5c,11601
68
- helm/benchmark/annotation/chw_care_plan_annotator.py,sha256=LdY1GBQsU6O5z4KsVyan5z38vS6sNqpQak6ZacMmqfk,3073
68
+ helm/benchmark/annotation/chw_care_plan_annotator.py,sha256=6ybNBvJi59i0cpAhI_fLwXoSnqhAH6m7Lo6ad_PufBs,2966
69
69
  helm/benchmark/annotation/czech_bank_qa_annotator.py,sha256=YIH5g4zHe3BQF2Y-6uRVw7g9u_SPBncqBobdvZdIzyA,3096
70
70
  helm/benchmark/annotation/dischargeme_annotator.py,sha256=Z6xnUK1cNrFco9x0w8B_qhlLOEZrzXBwT6TKZPKoPBk,3676
71
- helm/benchmark/annotation/ehr_sql_annotator.py,sha256=q99HGDcnG7_YcU47nK4Yi6ZoykURCNDWW6wIwQa5lms,4028
71
+ helm/benchmark/annotation/ehr_sql_annotator.py,sha256=Izpq0biZ9lkJOPk6NwTuv2wk8Bg88vj56BKZrY8XhT4,4021
72
72
  helm/benchmark/annotation/financebench_annotator.py,sha256=gNERLY35t2kcpayXGGrY4-pBs2jbEUomqElRYbb9nho,4150
73
73
  helm/benchmark/annotation/harm_bench_annotator.py,sha256=zhkWnV3qZgY-nvHgQRHGrrCMC7605JwFHesY7UC3ZnQ,2293
74
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py,sha256=ufvfXMTKcuk11Hfe3C7bEAyoqlqbrrv8D3hBEN3na6I,6057
75
- helm/benchmark/annotation/live_qa_annotator.py,sha256=8DXsjwmeSyvC0kfp1uYds4cwpxqzF7FcskeZaXxXiOw,3552
74
+ helm/benchmark/annotation/helpdesk_call_summarization_annotator.py,sha256=I7TjpN502Sa-Z4uUKemJXSAdOiVA3MMO92YIAAXeDBg,6034
75
+ helm/benchmark/annotation/live_qa_annotator.py,sha256=PSff59mU_t3ypmptYsYRKU3m1vMLF0dMyUySIOxBrPw,3553
76
76
  helm/benchmark/annotation/med_dialog_annotator.py,sha256=OVTFIlvdhcOr_hdK0tnrDes9hYdN1mDWFTp4GDYY7O0,3162
77
77
  helm/benchmark/annotation/medalign_annotator.py,sha256=8edAZh8oQgDKUT1bQ3Hp2NBE-QnBZ_-ZQjHkV7YKWhs,3240
78
78
  helm/benchmark/annotation/medi_qa_annotator.py,sha256=v8e6hkHZX1x9KtTedCnpCseh-Y72z5kUgUrXHWPUkX8,3074
79
79
  helm/benchmark/annotation/medication_qa_annotator.py,sha256=uZ3VpJ0nsDyF70_kn8kSSBPr4OlfiNdZC7q8wq_jJFE,3090
80
80
  helm/benchmark/annotation/mental_health_annotator.py,sha256=JwgSeXtwf4KFZxNtAxsnqdLJQSvP-F-ZoCcCWdasrMQ,3275
81
+ helm/benchmark/annotation/mimic_bhc_annotator.py,sha256=pwwniNlu5VTa1ZdyO0KFcMFZcpqM5CjguujgSpEGslw,3174
81
82
  helm/benchmark/annotation/mimic_rrs_annotator.py,sha256=zABO1FJH9pOFhUe5vc2B-c14Hf5RsuU9jQAGiMg6G0I,3204
82
- helm/benchmark/annotation/model_as_judge.py,sha256=G6mDrbxNp4roC-smrhqZb5swt18Coa9b2-aJMPOaGuM,12116
83
+ helm/benchmark/annotation/model_as_judge.py,sha256=FIJOUzIhf2QpxqFf6hjgAM5hPEm0VlXzB-jiHJUrPDs,11985
83
84
  helm/benchmark/annotation/mtsamples_procedures_annotator.py,sha256=qqWHY2HfCwMP5GqvObS3JpMIYVs4yyITCsA1B7lcDks,3201
84
85
  helm/benchmark/annotation/mtsamples_replicate_annotator.py,sha256=TUxNzJcItErsw0gw76hiKZAWeQTNHGHnC0qf-_CGeF0,3316
85
- helm/benchmark/annotation/omni_math_annotator.py,sha256=fAgABWlSEs8jnmNbd8RWbU7KNBP-a32kqxTWirs229Q,6207
86
+ helm/benchmark/annotation/omni_math_annotator.py,sha256=PvZZb1oGw60qT-oHRIs93AZbh5wTbpsmD8BforudFhA,6144
86
87
  helm/benchmark/annotation/simple_safety_tests_annotator.py,sha256=if4S8MaENr1HZ42ZsOjDPXZ-kJ0p4l4B2j9m994RuxQ,2140
87
88
  helm/benchmark/annotation/spider_annotator.py,sha256=B48ylGg5J7xuTSUio7VztdXk3lI6ilMqrUvAD-ve0sE,621
88
89
  helm/benchmark/annotation/starr_patient_instructions_annotator.py,sha256=5jU-dK_0OvB_jXNLDZtQ5E3gaSUcAxFNzv6prA17eAg,3186
89
90
  helm/benchmark/annotation/test_annotator_factory.py,sha256=ifv5hxSbFe113AHeXLqTPkVJ-C2PW_gb9L3a0SHNi-M,986
90
91
  helm/benchmark/annotation/test_dummy_annotator.py,sha256=LfY1ErJDUJ7rD8JUy92RUDD1b91jUs4Nk8Gvope-Z98,1644
91
- helm/benchmark/annotation/wildbench_annotator.py,sha256=sk_GJnPeaIBC0frV04XNq3piOG7Hikn2bCF-_DqRe2A,5488
92
+ helm/benchmark/annotation/wildbench_annotator.py,sha256=OXR59zdKw9W7v3Q_sFnt1cEPN3nOzQDVqSbh4jDbEUs,5457
92
93
  helm/benchmark/annotation/xstest_annotator.py,sha256=arL5DyA_nYkiSCAtl6G7MliZz5ZYRsyc7xQJNu0RBcA,3604
93
94
  helm/benchmark/annotation/image2struct/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
94
95
  helm/benchmark/annotation/image2struct/image_compiler_annotator.py,sha256=iWqPDXscrXDkmzRGDg0o6ibmDVo5bQqvcWxZkr6P-d0,3620
@@ -125,66 +126,81 @@ helm/benchmark/efficiency_data/inference_denoised_runtimes.json,sha256=ios_dt-_8
125
126
  helm/benchmark/efficiency_data/inference_idealized_runtimes.json,sha256=5w7reeZc0yc4cjH8kJGxQQSoe8yaRVX2SSlSrx0QWFQ,12348
126
127
  helm/benchmark/efficiency_data/training_efficiency.json,sha256=aH2moiBLStOLVi8Ci2KTK5ZkWlTBLK-B3fRfNZwhoSg,9763
127
128
  helm/benchmark/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
128
- helm/benchmark/metrics/aci_bench_metrics.py,sha256=X1HCoGfoOzcNRsnYcfdqs50cy-hZcxJYDwWK69LcMuc,1324
129
+ helm/benchmark/metrics/aci_bench_metrics.py,sha256=fAuTm8Sr1vvyd7Tjcz9WWKrFkqrwCV-CiF6lqUO3dKU,442
129
130
  helm/benchmark/metrics/air_bench_metrics.py,sha256=VMNQDDEtz2CiK4U55lCHLz0b_DxHprTAZ1WtYtGXjcY,2282
130
131
  helm/benchmark/metrics/annotation_metrics.py,sha256=JbXNleQsPJVF2uc1xXgUW2bzvJqwLPZyhnndqc6THv0,4268
131
132
  helm/benchmark/metrics/basic_metrics.py,sha256=d0iwYnwrbF7w7CFtazx8vPIsZnj51U2PVVoscCb-HJA,20495
132
133
  helm/benchmark/metrics/bbq_metrics.py,sha256=GeZhSSJzqGD0e5EAiRHitIC3XtPICF7rDI6GfeYQc8E,6201
133
134
  helm/benchmark/metrics/bias_metrics.py,sha256=8qcInRJwQsuCI-lMC1umd-ZZaYvorUPrMjnuC6vSeb4,11602
134
- helm/benchmark/metrics/bias_word_lists.py,sha256=mx5JjW3mHffXIqo4GcQN-zENUEttBqQnEjPTz3J3J_4,13909
135
+ helm/benchmark/metrics/bias_word_lists.py,sha256=eyk6we2J4SW8ZaZxQUWLB7Yapn92uM5TCekhFB5vg-U,13908
135
136
  helm/benchmark/metrics/bigcodebench_metrics.py,sha256=JcPZrSiHR-kxT-MFM8zXqOs6wTC5Hus3TbxuHFQVZow,860
136
137
  helm/benchmark/metrics/bird_sql_metrics.py,sha256=ooCuXW5nPpRs_-4seCONQmn25DzTbcUgGXznXTK9y0Y,1153
137
- helm/benchmark/metrics/chw_care_plan_metrics.py,sha256=E-TNRngWb_q7vVKs4QN8AfcpJndaKXd-XX0Ggxt96Qo,1341
138
- helm/benchmark/metrics/classification_metrics.py,sha256=TjsD7RjkIn7VpTrWtt88aJUsiGVtlv5N06FJl8VaJ-g,8858
138
+ helm/benchmark/metrics/chw_care_plan_metrics.py,sha256=WOAdwuF4vusZhjaXSAB3r7PD_ZxeNmVu2oAmOqzVLtU,460
139
+ helm/benchmark/metrics/classification_metrics.py,sha256=1Xa_bO4PqIAV2iZitE69kc4VKS4A7PloG5ElZAgvmh8,8851
139
140
  helm/benchmark/metrics/cleva_accuracy_metrics.py,sha256=1eDxHxVk-JW1mF9SBcuplIefAoi_edUwKpp-XxYbmeU,2740
140
- helm/benchmark/metrics/cleva_harms_metrics.py,sha256=PILZDbVOeUflCFbs_6cE-3qaBt5vwL8R-BirbB2jTn8,11278
141
+ helm/benchmark/metrics/cleva_harms_metrics.py,sha256=xVubv2pG3iinVs3namoVHWAmV9oUPywZwFB_0JGhP_w,11277
141
142
  helm/benchmark/metrics/cleva_metrics_helper.py,sha256=8UwiGhekUmp7DxYWU4rxqX2v3ewkg-O5-jOh49iOGmc,304
142
143
  helm/benchmark/metrics/code_metrics.py,sha256=SebQ5MXJe_phTiMfGMfhgYago-hwh_g9ctBWEHGqCnU,5230
143
144
  helm/benchmark/metrics/code_metrics_helper.py,sha256=UNai154RuhYRZM_YK-rveLct4Ui5iEBNPYmYdKq34Xs,22712
144
- helm/benchmark/metrics/comet_metric.py,sha256=qOvwE0ov1plb6SwwT3CbX1XuSo4GJ-M3iRe98yMiMaM,4797
145
+ helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py,sha256=biKk67r4ij3pK2L0OuGTJ4BAb8ig5tpGAV86uBD1qNs,7832
146
+ helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py,sha256=QrePgX-1UALQKs1dHMfOm1qoALvOU1pbLyC4JmcINx8,19083
147
+ helm/benchmark/metrics/codeinsights_correct_code_metrics.py,sha256=CQs9HXh7P1vzkKWdpvugvttD_8ZF6W_QPp7_rhYFwsY,13873
148
+ helm/benchmark/metrics/codeinsights_edge_case_metrics.py,sha256=B7EEELwwH67VxmgrTBSP25Etyb5XYIDuadfggMrHmcE,3866
149
+ helm/benchmark/metrics/codeinsights_metric_specs.py,sha256=BkKWII9yTkChdZVsGeeeCbiWQDYvvcAKo0nxi_RTTUk,1798
150
+ helm/benchmark/metrics/comet_metric.py,sha256=EJWZ9x8CGeDDQlfxYrY-np_NVJBt5gun0XLJvtpjXVI,4798
145
151
  helm/benchmark/metrics/common_metric_specs.py,sha256=JKqmO4ovBdfOYKC-00OSzOMv--g9NTCVfUHLaz-1Uns,6025
146
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py,sha256=q7s6B-O11jVrRSAZDSoM3PzhksXCsoAwIZkPOXkiVFY,2663
147
- helm/benchmark/metrics/copyright_metrics.py,sha256=_Lp7sKWgacY_13kFadNfnhrM2Ks8syBXnUW7zYuJkwo,7817
152
+ helm/benchmark/metrics/conv_fin_qa_calc_metrics.py,sha256=Zrf6HyH_WNe7gGFgW0j8FJlX5KZvbk-05iX8QFPJDyU,2656
153
+ helm/benchmark/metrics/copyright_metrics.py,sha256=RYOWKFN97UCD2Vj51gzKGbnnY9wAq6KJgiRt2cecVfs,7824
148
154
  helm/benchmark/metrics/czech_bank_qa_metrics.py,sha256=bKoooK2T5v_fFKNbUnsuW6Mv9muAirJD5lTrzuHfpz8,1113
149
155
  helm/benchmark/metrics/decodingtrust_fairness_metrics.py,sha256=x66XP0iQGk4ThT7ddmrlLCA0XF4arRbQMDT42LHf2kE,3297
150
156
  helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py,sha256=TxTkkWdx6d6ym0MirZTiucl_TWFdn4uJLnlTfLjQvgk,2925
151
157
  helm/benchmark/metrics/decodingtrust_privacy_metrics.py,sha256=OU7lka-hm6PubR5Gjj4uNyrqhjlfhe0mmjBCAz9vlRs,3456
152
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py,sha256=BKDD3lblqT6Ebi5kEC4zbN_OvQwD1SdEtBv5Wf0kzWw,6460
153
- helm/benchmark/metrics/dischargeme_metrics.py,sha256=Z5EOn6uvrOZCqUQeBK_mGWTzOCqJwewh4SAndeCn9CI,1336
158
+ helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py,sha256=bW4zafRyKFa__8fGrdiTPUu848ovNnvakLCfqcMrcHk,6461
159
+ helm/benchmark/metrics/dischargeme_metrics.py,sha256=D8LI52E17hNSPDpEvb2tw1za4QWDE3p9xgx7Nm9l7_Y,454
154
160
  helm/benchmark/metrics/disinformation_metrics.py,sha256=5n8wgRBb6FaDjqe1nR3Cj9aS48esmMsIUq4KpBHoQoU,7870
155
161
  helm/benchmark/metrics/dry_run_metrics.py,sha256=Ss0lzf944HIbL1CX6QuJpGFPqOzhBT0qVWLNR1BoEjk,3784
156
- helm/benchmark/metrics/efficiency_metrics.py,sha256=cLnPCvOzbUETOJh-lu65iNgYwVOOZAJO_s5iTUAd0MI,11852
162
+ helm/benchmark/metrics/efficiency_metrics.py,sha256=SJqpA1d_GfBPl9H6moai8ra1GVe7tlaCfg3PeiWT54c,11845
157
163
  helm/benchmark/metrics/ehr_sql_metrics.py,sha256=YRjvPIty7zlyoyGD6wo3HYOz7y_PThySOZzVRJ38iww,4797
158
164
  helm/benchmark/metrics/evaluate_instances_metric.py,sha256=LGk1Dv_76Ak0YUlWKFTsOLEFiBSmcGVhNrbj_4zg9g4,2913
159
- helm/benchmark/metrics/evaluate_reference_metrics.py,sha256=t0251_2aA0CrXB8oUBKlPRgPl-xYjzdVhLcGjwuhOgo,19621
165
+ helm/benchmark/metrics/evaluate_reference_metrics.py,sha256=T3pftPfYEUR88NEZEZuzYOTNoHELo7nSbz4qmxN8oQc,19628
160
166
  helm/benchmark/metrics/fin_qa_metrics.py,sha256=MtXxGMGYiCiwCD1CclBXPopzly-Tz3zJTrXJaHYTXn4,2470
161
167
  helm/benchmark/metrics/fin_qa_metrics_helper.py,sha256=sH5FIpsxxGUkXO21YGS2EtVsev1EdQ44lYoqFZPSSGo,11884
162
168
  helm/benchmark/metrics/gpqa_chain_of_thought_metric.py,sha256=HRRKkcTbCu5ScOVwmjzYaA7UAEGE_AJUZVOCDRuv4Po,4321
163
169
  helm/benchmark/metrics/gpt4_audio_critique_metrics.py,sha256=L9tGFwvl1-Ew3MdInQ7KPa8OlI5YexIB2KuCYVYsuPY,7023
170
+ helm/benchmark/metrics/gpt4_audio_refusal_metrics.py,sha256=vYPRJq-4uNhUWUWMrDkpHmfIBkhEyAgaMNEI6RKPP80,5896
164
171
  helm/benchmark/metrics/gpt4v_originality_critique_metrics.py,sha256=1m7IWy9vu66svnmdBRjZQI-2YsGYzH2vXZMptlRGM0Y,5654
165
172
  helm/benchmark/metrics/helpdesk_call_summarization_metrics.py,sha256=9-kB3NeBacI6nxs2oQ7Km_1SHyiz98UVZuR8PAlvCHM,1442
166
- helm/benchmark/metrics/ifeval_metrics.py,sha256=iYj-880nHHXECC8t8B93f1LZL9e6PMB-M0nxRdRBZcg,2572
173
+ helm/benchmark/metrics/ifeval_metrics.py,sha256=4_Vp9bNnrctKtv6xZ1RpvBstPAZPwv1xiohH-ogs99U,2565
167
174
  helm/benchmark/metrics/instruction_following_critique_metrics.py,sha256=RR9cMIG113oXUnBjU_denn7DaCGB11k1oGtQ5dQON3M,9874
175
+ helm/benchmark/metrics/kpi_edgar_metrics.py,sha256=1GsW-nBz8TgP4wFIVEGA4_BhI17kihmk96zuLpD4NZc,4636
168
176
  helm/benchmark/metrics/language_modeling_metrics.py,sha256=yS7k8iFjxfkckSBA0RVA7VdOivSEBtNzCjczK6We7y0,4598
169
177
  helm/benchmark/metrics/live_qa_metrics.py,sha256=f2XFmQaohjQNqYqNg8NcDVavCzyP4cd8Cl8rLArn9EM,816
178
+ helm/benchmark/metrics/llm_jury_metrics.py,sha256=yzAsdacyX0MFJy2qKIjhI0y7JvtflELpCh6R14wuCgk,1704
179
+ helm/benchmark/metrics/lmkt_metric_specs.py,sha256=0Fa0xLjQDXwsRCE5VqGzEfb5ZdzKsDoSCwR_zHogFcc,376
180
+ helm/benchmark/metrics/lmkt_metrics.py,sha256=GaZTfl-NQXa1YSzcJUGlZ5wZURH1CnJxGkPFBj8ydTQ,1856
170
181
  helm/benchmark/metrics/machine_translation_metrics.py,sha256=22vaGBCSw12uM1wmtDG-MBBZW8OiTZwNPaerjckdtDE,3860
171
- helm/benchmark/metrics/med_dialog_metrics.py,sha256=uDa4xQkU_-zv6WVC1gzoW6YVC1lSt7bzSD9kN0cmkE8,1329
172
- helm/benchmark/metrics/medalign_metrics.py,sha256=TVqkqlQYuSePt_pG1jAJHhulrAhODFI2-hT4-RWhUkc,1321
182
+ helm/benchmark/metrics/med_dialog_metrics.py,sha256=kzmrkQcmJ15zuOF9_Onk9N0oeNeyl9Rri1JEb1AqRT4,447
183
+ helm/benchmark/metrics/medalign_metrics.py,sha256=q6l8p5Pie-H9pxhaA-lQkSOnliJWXr6zUeN8syEQ91Q,439
173
184
  helm/benchmark/metrics/medcalc_bench_metrics.py,sha256=9wZgg20-9QBNk0_XhuwR3LT940fqDPkCM4Kl0dPkbAs,5353
174
185
  helm/benchmark/metrics/medec_metrics.py,sha256=hNBOGX52G_QOmgTCp9LnIMrmGSRxbb5vgjxKU069TMQ,4152
175
- helm/benchmark/metrics/medi_qa_metrics.py,sha256=e8nZ7aMLgg7r088AQ7DBadQsncqCagkxpI81_X88qZA,1314
176
- helm/benchmark/metrics/medication_qa_metrics.py,sha256=2UEH1DbcrDzWpTfLgKQC68_AhGhJ_igACvYWO2yvspo,1338
177
- helm/benchmark/metrics/mental_health_metrics.py,sha256=T8Y2lQo0IZvDvsK7G30nnIN9djj_GlGK2CFMuFRBSBA,1344
186
+ helm/benchmark/metrics/medi_qa_metrics.py,sha256=JWAEMuT0UXDZrb7qHn13W6W79ilbprk492V_9vWrB4s,432
187
+ helm/benchmark/metrics/medication_qa_metrics.py,sha256=wit3nKNWpGFfgauu6Xye2IDTePAS0VHAQI_7OO9HR6M,462
188
+ helm/benchmark/metrics/melt_bias_metric.py,sha256=mHDCkRGLD-0pyJA_depi_KX3sn7g7Bgd3_m0XdLQahY,11520
189
+ helm/benchmark/metrics/melt_bias_word_lists.py,sha256=xA0araUdszAIOqfxiTi6MIJhKYwr_Gwsc1L9qinZx9U,27891
190
+ helm/benchmark/metrics/melt_metric_specs.py,sha256=zaeV57LQEl8qK7be36NaojiUJlzmkoKY8JyOkOVuPqs,1619
191
+ helm/benchmark/metrics/melt_toxicity_metric.py,sha256=ni6bb_QC51NM5jQpbFYLWtsQy3tNOLwQ_5b3PDV5vVk,4193
192
+ helm/benchmark/metrics/mental_health_metrics.py,sha256=4HXCXl2GxFPn6wDzHptHeBTuP4BJVLUzEUKffpd5R_k,462
178
193
  helm/benchmark/metrics/metric.py,sha256=jqQyiKDq_pQv-ulGqfZI56ydRDQs3N3XhfHIPysUhrk,14311
179
194
  helm/benchmark/metrics/metric_name.py,sha256=POhgmUqqIWh_LjCbYpiKkzGqqChBLeW3FADy9u_FcWw,1354
180
- helm/benchmark/metrics/metric_service.py,sha256=mlX_MEFSYNzME6GFS3El_VVOvzPYnOMosKI0XIxygP4,1802
181
- helm/benchmark/metrics/mimic_rrs_metrics.py,sha256=YPU0cwH12L0VqdLhXd12P-eKUDqn39z_sLFx3YdGrP4,1324
195
+ helm/benchmark/metrics/metric_service.py,sha256=bJaM7GisEgSWR3vPTcg7b67XF9X2K5viODacIgbGb24,1692
196
+ helm/benchmark/metrics/mimic_bhc_metrics.py,sha256=da1YYrE8fL3YHeIJ9hf4WCKZtuj_8cksm3rJ24rcy70,442
197
+ helm/benchmark/metrics/mimic_rrs_metrics.py,sha256=x3vSj1VG1UkNF3gbgJYDeA4z-crxfGIkK7iZo0xjq8c,442
182
198
  helm/benchmark/metrics/mimiciv_billing_code_metrics.py,sha256=Pu9efXoBrhsvxSeGHqwbUA5k365-pJTeXpMNhmcg0L0,3927
183
- helm/benchmark/metrics/mtsamples_procedures_metrics.py,sha256=HfmNYyqHplEEM-ABzuMSL_vX92gFrZchO2DITl1Ukiw,1379
184
- helm/benchmark/metrics/mtsamples_replicate_metrics.py,sha256=A0Ir6B0f99SwCf5KBGGUBFXCqV1Jo693BsYU_wIN3Ws,1374
199
+ helm/benchmark/metrics/mtsamples_procedures_metrics.py,sha256=XrddVk-gnc8jF8amCI1RBa_XTS9yEXD2Y9Ld9W7Q-m8,497
200
+ helm/benchmark/metrics/mtsamples_replicate_metrics.py,sha256=rmH34aTX_wZWxLi4jrxf3sR1RIqNRF0QDANLRQUGhqM,492
185
201
  helm/benchmark/metrics/nltk_helper.py,sha256=QMEps-lqJZ_pCgvjlMf4BvC0pzDu3ez5jit5F4p8dAk,1313
186
- helm/benchmark/metrics/numeracy_metrics.py,sha256=3E-CMmB2wuGW5tLjmEm8wFMf85DJ1ZDUANfh84SQuP0,2906
187
202
  helm/benchmark/metrics/omni_math_metrics.py,sha256=Gqih87UrE93-a0hbRhTBkjmfGLNTkuKQGaG-sTQeuG8,1287
203
+ helm/benchmark/metrics/openai_mrcr_metrics.py,sha256=TAop7G50FKaR-Jyo2EGLqmMOfJRmS2vNRDFiifa6mhg,2313
188
204
  helm/benchmark/metrics/output_processing_metric.py,sha256=ey9UBi2f3780OwFlp82ymzfjLR3MA2fpA9vW5R4W5TA,2581
189
205
  helm/benchmark/metrics/output_processors.py,sha256=ULZlDBOf6NupAXzDKBKyTDdgPZ5PSxOAlOYTbrQEek8,472
190
206
  helm/benchmark/metrics/paraphrase_generation_metrics.py,sha256=771CjpW5Ek00OCaCFfEsO6Cdy9eZb1fMlgWASvQgiK4,2025
@@ -192,24 +208,24 @@ helm/benchmark/metrics/prometheus_vision_critique_metrics.py,sha256=pexBbEFF3-bz
192
208
  helm/benchmark/metrics/ranking_metrics.py,sha256=hSNKy4h7zRkGYSgo6RWt4PXQztA5ZX1PCJorVqpCvpA,17457
193
209
  helm/benchmark/metrics/reference_metric.py,sha256=hseI7A16SOC8ymYZYFCL6nxnyxn0q9_Gywuvb1r9FLE,6092
194
210
  helm/benchmark/metrics/reka_vibe_critique_metrics.py,sha256=CwzzQ13bBT0r_o75TqFj2Zr0ST9vzQi74K_ezWTnLCU,6568
211
+ helm/benchmark/metrics/ruler_qa_metrics.py,sha256=OuiA0ksByl0Tw1Oal7zbedhKjTrhJgQJDLXAgoTLXuc,1473
195
212
  helm/benchmark/metrics/safety_metrics.py,sha256=oARko_EwVnykBKYxi-w3ytKme4qcb1waz_0N2GKbSlg,3348
196
213
  helm/benchmark/metrics/seahelm_metrics.py,sha256=egRkeXnnb8Nqi9qJJMDXJRSl4NK6WvdUxAc_LffBips,6964
197
214
  helm/benchmark/metrics/seahelm_metrics_specs.py,sha256=cx8p4kwTuEOWxZioK9CVoeTNJT0fZjxRy_6_EM9F394,452
198
215
  helm/benchmark/metrics/spider_metrics.py,sha256=RSrFJoA5SNcNxfmgVqCQixcSLrfJBYuVQw5jsfrc9Xg,189
199
- helm/benchmark/metrics/starr_patient_instructions_metrics.py,sha256=lylCQ9rj1Y990bWr_h_BfYrikGGTuejxptTRZoXeaKM,1407
216
+ helm/benchmark/metrics/starr_patient_instructions_metrics.py,sha256=YHdTeIFdZxRbvqBnlWpAyIsWzZyWAjjDFuKOXhHYiSM,525
200
217
  helm/benchmark/metrics/statistic.py,sha256=ATuOm0jU3L-0ELiZaF2GVMNF22W66-rMvzxRtlfqcII,3446
201
218
  helm/benchmark/metrics/summarization_critique_metrics.py,sha256=-mki8-zvZx54dQg8X0BG2Y6wmfypQhkIuD_9ZjNBl78,4782
202
- helm/benchmark/metrics/summarization_metrics.py,sha256=LNLGFi4DAKJEL0P60rnPlS_-yLMNLUprJbuJ6VsdL0g,16842
219
+ helm/benchmark/metrics/summarization_metrics.py,sha256=FJCdGRmlCJX5A-AmbtpGGlGRfNgg5Z8Bo0d9yFiE33E,16876
203
220
  helm/benchmark/metrics/test_bias_metrics.py,sha256=qEZsCULvwjVdIyfNgJSc2L7Xp9suKKW7L5OuQmGrwZ8,6393
204
221
  helm/benchmark/metrics/test_classification_metrics.py,sha256=CRDMGmVmzEUnNaM0C02qUTOU2AS11Mt2-GdEl89y7lw,9541
205
222
  helm/benchmark/metrics/test_disinformation_metrics.py,sha256=U3ZmS9s33oimTQbKO-7pgWeX_WiDB9chlOCtf_vslXw,2249
206
223
  helm/benchmark/metrics/test_evaluate_reference_metrics.py,sha256=B7xtDDWPAxF7d-vcUx_R51hFMae-DD52nUwbu_eWt6Y,1601
207
224
  helm/benchmark/metrics/test_metric.py,sha256=0sGlXE3_Al_VyKpOPBhQR_xT-XrcVgGepLpwut37DmA,771
208
- helm/benchmark/metrics/test_numeracy_metrics.py,sha256=ls1ZIHDePKpHMoqAbf4HmJ1SIBjLFuLIzGbfg6OiZvM,4162
209
225
  helm/benchmark/metrics/test_statistic.py,sha256=yK6m2BZ5UXWmb2D1cQzDH_2ELvrNDaR_lyzX4WoHw9Q,1273
210
- helm/benchmark/metrics/toxicity_metrics.py,sha256=SkVp91-LnZMa5ouEspcYa-PYvPGMi4H_bU3uuc6ve5I,4115
226
+ helm/benchmark/metrics/toxicity_metrics.py,sha256=ZLOzxDlMgbljl-9y6vT2ZgwdhsBZ4MfV-T66VpKk00U,4114
211
227
  helm/benchmark/metrics/toxicity_utils.py,sha256=-bfittLtMkHyV5wu-hj6KVtaiNGgVIO5duUmThBlX8w,988
212
- helm/benchmark/metrics/unitxt_metrics.py,sha256=OBJ6Ph-4mpAOAEcl4loBemcEQjz-UrbmPVnWmzifhG4,4863
228
+ helm/benchmark/metrics/unitxt_metrics.py,sha256=8fawxnrg0xsAe0xO2wbL7S_yisj8RzJnrn6xtk8C6q8,4852
213
229
  helm/benchmark/metrics/wildbench_metrics.py,sha256=sY7MNTzRlJJK3yph3rCijgbMaajtLyCCquThlsoE5wU,1380
214
230
  helm/benchmark/metrics/ifeval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
215
231
  helm/benchmark/metrics/ifeval/instructions.py,sha256=qNoa1vMPDNz6ORWfyMv_efwKZ4U5zkI-cf4aApyfSqU,53247
@@ -219,7 +235,7 @@ helm/benchmark/metrics/ifeval/instructions_util.py,sha256=VhkJfZLCaHi094rZSoeQbo
219
235
  helm/benchmark/metrics/image_generation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
220
236
  helm/benchmark/metrics/image_generation/aesthetics_metrics.py,sha256=UqjBgAi1ylegvHBjALJ8vxINhHEqqr2fSvN9lXgyIZk,2140
221
237
  helm/benchmark/metrics/image_generation/aesthetics_scorer.py,sha256=ISdThDKMrx-SHQe69dCcr8qUrMCa_GsxX3BeZnd0WPA,2538
222
- helm/benchmark/metrics/image_generation/clip_score_metrics.py,sha256=tUnAoew24jjjbjPaoE2-4iyRTq6YNW8Xfk1p5JWZkAU,3338
238
+ helm/benchmark/metrics/image_generation/clip_score_metrics.py,sha256=0B2WCTP5LDHDbWGoMW2mKnnImHt-QYEU2QzqYf4HxjQ,3812
223
239
  helm/benchmark/metrics/image_generation/denoised_runtime_metric.py,sha256=Nom_yw15ePU7wUuV2DFHpLnEAqaZQjlkW9LowRElOAI,1646
224
240
  helm/benchmark/metrics/image_generation/detection_metrics.py,sha256=mfYoPbLCmqWxqMSXbcX6TM0niNnpCeipcHImuV3mZ3c,2160
225
241
  helm/benchmark/metrics/image_generation/efficiency_metrics.py,sha256=neeNJNtHAVUMWqr5rvRIRlPKl225cXUGCURLB0z-rKQ,1459
@@ -242,7 +258,7 @@ helm/benchmark/metrics/image_generation/detectors/__init__.py,sha256=47DEQpj8HBS
242
258
  helm/benchmark/metrics/image_generation/detectors/base_detector.py,sha256=e4c8vPfioGzl2ftYzWOFIBDJcZJxBmpjU13n4fXaSvY,226
243
259
  helm/benchmark/metrics/image_generation/detectors/vitdet.py,sha256=kxXS8uNBC0pQ7LatuN85CXU8pJHZn0pJXY0rOLd_39g,7526
244
260
  helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
245
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py,sha256=vFO6s8QHo6Pt1QfbOKAI0m3mJrc0BeH1Hcf7u2uWMIk,2116
261
+ helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py,sha256=NwE85dtiVSlCJc50E57pkckgnCiKBsW0nF3cqgc2EUo,2128
246
262
  helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py,sha256=5qKL-gHnEVmzSDW2GKDq6Uox_EJMDLe0QA55Nrl4H6s,1472
247
263
  helm/benchmark/metrics/image_generation/q16/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
248
264
  helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py,sha256=8Y5h-6RMjivm50RnNbNwV7wCug4RhKT5g8R_YeEp54I,3467
@@ -251,7 +267,7 @@ helm/benchmark/metrics/image_generation/watermark/__init__.py,sha256=47DEQpj8HBS
251
267
  helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py,sha256=Ir4u8blJWTRtEBogb6u22qCy3JXAIzvx-Th6dSBLfdw,698
252
268
  helm/benchmark/metrics/image_generation/watermark/watermark_detector.py,sha256=w6WnTc6t6zx0W0gTjgedXC9OO5dq5iWpx9UcnioKml4,3641
253
269
  helm/benchmark/metrics/summac/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
254
- helm/benchmark/metrics/summac/model_summac.py,sha256=nDB8sAJNnQ5TkBKoQBGUejFK6ynrRiaV4oyEKcm3qkg,17488
270
+ helm/benchmark/metrics/summac/model_summac.py,sha256=ccOP0z4WEpR26iAzzTWviFfX33Cg9MdpZgKgSRQc9D8,17445
255
271
  helm/benchmark/metrics/summac/utils_misc.py,sha256=7_Q1c72cKt8PWtxn8u4R8nB53HK6_JF2nP8bBXYNk-A,1485
256
272
  helm/benchmark/metrics/tokens/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
257
273
  helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py,sha256=XDZGK8h84F2w_pK8Zjko8ssKZmVxKFqTOuHL0mLBzMY,694
@@ -261,60 +277,71 @@ helm/benchmark/metrics/tokens/free_token_cost_estimator.py,sha256=PiraoV3WtAYtcF
261
277
  helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py,sha256=sa7Cu0S9IPF35puSVU-gYnLg1uXEZYAdRyKmCc-_5ss,1549
262
278
  helm/benchmark/metrics/tokens/openai_token_cost_estimator.py,sha256=CovkJ4zeVn89bjno2gP0K8ix_Ie0EC2tUJLHLCEl378,1427
263
279
  helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py,sha256=n9f2rcgaNHROORvSYjULXC_LEA4KZZjs8wASk0vAG7o,1100
264
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py,sha256=Gk1ihZsT-EhcTyMsbmNWPP2Z6FlS4nRYOpq7v41f3j0,2657
280
+ helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py,sha256=eDooaAAtkmIGGbK672Db9simp2soXXr5GiEG3hEQBq8,2649
265
281
  helm/benchmark/metrics/tokens/token_cost_estimator.py,sha256=fTGUfhHV6yMwpTkCEMTGMxKO8jskqJz4sAtwXT6M_C8,425
266
282
  helm/benchmark/metrics/vision_language/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
267
- helm/benchmark/metrics/vision_language/emd_utils.py,sha256=KdZdcqu3eo016FdAjAm_83v92-wWuR90EPsTogfTcok,15196
283
+ helm/benchmark/metrics/vision_language/emd_utils.py,sha256=nVqQ7oosjKjhpR5YPPvO4ssB92bGChgODOtsqMYVEpU,15230
268
284
  helm/benchmark/metrics/vision_language/image_metrics.py,sha256=RgKAn7ftl4KCZ86V3zO_LUstNbc6Lla-0hdQq77JDXw,23841
269
- helm/benchmark/metrics/vision_language/image_utils.py,sha256=4E0NYh09O6-5sGhAPo6KZqYaZfBpCtuYbD3vLt-wQzk,3755
285
+ helm/benchmark/metrics/vision_language/image_utils.py,sha256=xwtydR8-s23cJacIGXDXL_pUhAqi6O5CbhM4XNEFlDo,3787
270
286
  helm/benchmark/presentation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
271
- helm/benchmark/presentation/contamination.py,sha256=PiIdcaD3-xfExjOmyL5q4Ao2ASa-OlScJAB9u1Zxe7o,2811
272
- helm/benchmark/presentation/create_plots.py,sha256=T6ewj8rXZfRlqg01bgbhdU1rsABK4xyrLpruhRG-7Fc,28691
273
- helm/benchmark/presentation/run_display.py,sha256=tC1DciLvDTQJog4BDo8StWDdX7DbBkhrG2sX_SwXSPQ,11838
287
+ helm/benchmark/presentation/contamination.py,sha256=07IuIP92vfuI0GwfeNC-i_NZUlF8N1azzagC19YHOMQ,2802
288
+ helm/benchmark/presentation/create_plots.py,sha256=bM6UNzH0Bx8Bv2iKcyMoYp7IwfCZSQob-w_XOOI6r1M,29090
289
+ helm/benchmark/presentation/run_display.py,sha256=LmY2HES4dU94kRYuUxt-c9LTMDN6MU5CspWTF6rZwDo,12419
274
290
  helm/benchmark/presentation/run_entry.py,sha256=J1QgLOP99N7N4bs7nzXWxyU3pOd-a1j8xwL9ag1nP_Y,1158
275
- helm/benchmark/presentation/schema.py,sha256=cblGmgkhuqQRWPh-IT75u3Il_-SVXipeq-mh64lvgWY,10947
276
- helm/benchmark/presentation/summarize.py,sha256=Y4rNMgnQYEwOKX8Syd9R0HybjnaW_tJQZcWF4ZFrHvc,59749
291
+ helm/benchmark/presentation/schema.py,sha256=j3gOhj-okQ4qzYoMh5N3ltsL0OXiOGuB7ydF-SI-Ug4,11229
292
+ helm/benchmark/presentation/summarize.py,sha256=_d3gd45eBpx8yMnVq1XgF9D-pPMcpbuwseSZz4giybo,60092
277
293
  helm/benchmark/presentation/table.py,sha256=-foH1BIfMiD6YvpwoGJ910CH7Hib-_pYtHH1hE8zwNc,2904
278
294
  helm/benchmark/presentation/test_contamination.py,sha256=RlihBOF6vx2tKEj6_EMnJojTYoStx0FUeJSLT1bdf8w,509
279
- helm/benchmark/presentation/test_create_plots.py,sha256=5PPPegMTdBZurxyyUxI4rN13AVsjV3eQrwFqlobJ8UA,1286
295
+ helm/benchmark/presentation/test_create_plots.py,sha256=1FrJZnPW-5QUQKt_pf4y47uDha4B8wHyY1o5hqhKWhc,1293
280
296
  helm/benchmark/presentation/test_run_entry.py,sha256=4n484sSYT0gQ4WVt67Fs3ctKa4vi97hI32O5XXxGY1o,794
281
297
  helm/benchmark/presentation/test_schema.py,sha256=6mq6CeAOLW2Kxi1lX_ZW8QCVqVR73XImR8ylcRGFkBE,378
282
298
  helm/benchmark/presentation/test_summarize.py,sha256=GzZNwBDybpstzl6wT0Rgqn75N9iCNrUIzrdjOfUolu0,6317
283
299
  helm/benchmark/presentation/torr_robustness_summarizer.py,sha256=SmMOZWCQ-KaJBp78otwvAeE1btWignyWalaQ8QG87r4,8242
284
300
  helm/benchmark/run_specs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
285
301
  helm/benchmark/run_specs/air_bench_run_specs.py,sha256=K86SqpINMBOiLIpuHz-jwlQL3SrH6n6WbqjD90i4LQQ,2231
286
- helm/benchmark/run_specs/audio_run_specs.py,sha256=g6uncT8dIK59qU3aEcyvpPLiblx3Ks8yCUY6s8IMO9U,21957
302
+ helm/benchmark/run_specs/arabic_run_specs.py,sha256=p5KPvcugJI3ERYhO7Le_aiKOZ4IM2EOvsXEmZE8R4Wc,3014
303
+ helm/benchmark/run_specs/audio_run_specs.py,sha256=baJz5LZiwWZP3KD0hluKgpidtswzdorQnshX0CoqKAc,23383
304
+ helm/benchmark/run_specs/bluex_run_specs.py,sha256=OHweBHS8JC-k9_e5Zq1LUU2FZhJ2P7SDshatX-N15Ls,1798
287
305
  helm/benchmark/run_specs/call_center_run_specs.py,sha256=QhRQw91WblB9UaB319XNCO5K8PX8Riiza41Ym-1CcRU,7044
288
306
  helm/benchmark/run_specs/capabilities_run_specs.py,sha256=sbqhIj4AoujV45erwoVK61lWdlkjg4qssmGlu0eSr1U,12067
289
- helm/benchmark/run_specs/classic_run_specs.py,sha256=1NYeYIwC2F7EjkPEPxNoFb3Ap6BUcUJK_hxBKq4lzt0,56144
307
+ helm/benchmark/run_specs/classic_run_specs.py,sha256=4DA-21Tiz87dQ_iklyrKpfsyTw2f51tbwtRvv3Zs57s,53727
290
308
  helm/benchmark/run_specs/cleva_run_specs.py,sha256=lEIHEqQY3Efx-sl2Z6Rq9Qq_1HEWHqFYuUkZbGvq66s,13387
309
+ helm/benchmark/run_specs/codeinsights_run_specs.py,sha256=lz3yysrPjCIiObzrIkRjJsWzkABh9qIXn-o7FSqZPl0,9207
291
310
  helm/benchmark/run_specs/decodingtrust_run_specs.py,sha256=7slILDS9f0_Z0y-Pz5xEspoGQUmOCOI2K2r4XWUVsm8,14428
292
311
  helm/benchmark/run_specs/enem_challenge_specs.py,sha256=5UWeP2bsnwCHMMXI3DFRMUPKcnJ9_EL01qPUthbWIvE,1351
293
- helm/benchmark/run_specs/enterprise_run_specs.py,sha256=J6NbdgDO3sFvBf-Zqzug05T_JbFIk1Vx50QmkDG6QHc,8966
294
- helm/benchmark/run_specs/experimental_run_specs.py,sha256=pqK3_yD_2Qw1OWjj0biiV1G2BlZhAVnTPcUEbLnz2Wc,6765
312
+ helm/benchmark/run_specs/enterprise_run_specs.py,sha256=ul2YMPpvThOmi7yIc6xR3W0rtE-8tUIaIzuhGlMg2rY,9598
313
+ helm/benchmark/run_specs/experimental_run_specs.py,sha256=tIgAdK3cm4t6ZBGkcPcPkxx0XAslKShYA1i3QxWVJEY,7675
295
314
  helm/benchmark/run_specs/finance_run_specs.py,sha256=5mwb7GbAcSLVZiumqCiAr9dr8qBYApkEt5Oben5CFXs,4371
296
- helm/benchmark/run_specs/heim_run_specs.py,sha256=Pt1eVbzvwZ5EXq8WB2b3XYw62SWYN_i1P_H3oE4i8KY,22096
315
+ helm/benchmark/run_specs/healthqa_br_run_specs.py,sha256=515pDZf8rTpvebPmhr9pqY2c08Ey_OtWIGsFDVVcQqI,1416
316
+ helm/benchmark/run_specs/heim_run_specs.py,sha256=9uOB_eW5bQqoP9eYRaJ2bcigPg75pQLQnyQ67fG9wHo,22226
297
317
  helm/benchmark/run_specs/imdb_ptbr_run_specs.py,sha256=nkW5A_xeD5kCKeJVxsL8RFS8r3UpP_WCcwSdMh2s850,1215
298
318
  helm/benchmark/run_specs/instruction_following_run_specs.py,sha256=GElJhgbQhlZMYSAM4YyGcYq0pqycR32kBCoHqG6m-ZY,4177
299
319
  helm/benchmark/run_specs/lite_run_specs.py,sha256=8OkL9g3wQBG96g0ijGZ9L1Trb59b7VPDyYMqvA3hXfE,11129
300
- helm/benchmark/run_specs/long_context_run_specs.py,sha256=A1ysL7pmyvCmfnokZaBSyWEKfpMh9XCaOejFqABwL38,3033
301
- helm/benchmark/run_specs/medhelm_run_specs.py,sha256=97fWtee-VpnS9ydudc3285kHayK-JYhIM5j6qZX6p8A,40440
320
+ helm/benchmark/run_specs/lmkt_run_specs.py,sha256=tNZvlA4mXUX-NBC9enRR90qFLeh8SNGFq701rXmXc18,5376
321
+ helm/benchmark/run_specs/long_context_run_specs.py,sha256=A3yhg1IEds7kQWxkRYH7WVkMPouA1xDz28uxpHgwJvE,6229
322
+ helm/benchmark/run_specs/medhelm_run_specs.py,sha256=--KgkjVwKt4uyiTebalrbeGV4FB-jGqPciYjFZED7zA,43407
323
+ helm/benchmark/run_specs/melt_run_specs.py,sha256=729MkALud2wG07yulx9zqAzejdXW_eVGkfF5cQWeGGY,32031
302
324
  helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py,sha256=kenpGGMK1XXaNtvNXsshPvdvN9ubv1sOfaPdjFM4obA,2034
325
+ helm/benchmark/run_specs/multilingual_run_specs.py,sha256=umf8e6ZDgRXiU0G_BPoovj1UZ_dxyrXtIQ7i9WC6USg,2296
303
326
  helm/benchmark/run_specs/oab_exams_specs.py,sha256=ws7Vppo_zJvxKqQ_sNhm9N7-5eQbX2CBkcDI5c_sRG4,1658
304
327
  helm/benchmark/run_specs/safety_run_specs.py,sha256=3X6tYaq2SlRsZs9q6SCtBUgjNEpOwUtV6M7iY2Kowm0,6807
305
328
  helm/benchmark/run_specs/seahelm_run_specs.py,sha256=R3mg4_OoaRizZ5n0FHcUQpJLny3j-ulBlHzOyF0a0Ok,23904
306
329
  helm/benchmark/run_specs/simple_run_specs.py,sha256=0kK_e8U4JUWZ6wO4N-GPFRE1iGT4ilvSMUGfirvpIE0,3837
330
+ helm/benchmark/run_specs/speech_disorder_audio_run_specs.py,sha256=P1mxSu7ErjiK0ARbRmbIzFwYO3fC-6QpsZQeez4U3qI,7346
307
331
  helm/benchmark/run_specs/sql_run_specs.py,sha256=JWCICELKi81m11MggyR6CJNl3vpWPwk4kr8DZSsWvj4,1965
308
332
  helm/benchmark/run_specs/tweetsentbr_run_specs.py,sha256=qogc-fb83Rh1DooKKaskhak52ycvu8DAnhabw9rc7yA,1129
309
333
  helm/benchmark/run_specs/unitxt_run_specs.py,sha256=4Vbsq0MPpSe4cIJOXzeVpMm60N9Qafa2R85X5BeFQew,1873
310
- helm/benchmark/run_specs/vlm_run_specs.py,sha256=qjo0YzyIKq8UIbkKIUhHYh7iErPDQSG76_m-5kiPKEc,36648
334
+ helm/benchmark/run_specs/vlm_run_specs.py,sha256=v-eWuDYc8u5HO46isLONPfAWv5zdA1ZOQrdyOvX3vlU,37512
311
335
  helm/benchmark/run_specs/winogrande_afr_run_specs.py,sha256=dhOm8z6Q_ZpnzYKrsS0nEbRQPWs_phkXxmL5pxCJzQA,1853
312
336
  helm/benchmark/scenarios/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
313
- helm/benchmark/scenarios/aci_bench_scenario.py,sha256=u-Vo3J16o1d3GPb3qqclYv4mzSaPOa_RblmZbYV_xik,5345
337
+ helm/benchmark/scenarios/aci_bench_scenario.py,sha256=W8h7eWz9mjR0kRAffKWSnA1Fs8t2l83sPyW8fjPOxWQ,5670
314
338
  helm/benchmark/scenarios/air_bench_scenario.py,sha256=B6_WMowLFe4gWfnoFA_yrHe0kagbIkZabEnK4kGGqSU,1884
339
+ helm/benchmark/scenarios/alghafa_scenario.py,sha256=FJXO3W6qYzCgLJMSiJEhpddNcFyR3N5Brh8pATW_9GM,5217
315
340
  helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py,sha256=jnUGbppDGEsbe5xoJjmv7nW_RvwPIYm6cwSULeqk2Fk,5133
316
341
  helm/benchmark/scenarios/anthropic_red_team_scenario.py,sha256=_OWE33eVRaZI0gmfP7bLd572uOi_6jb39z_J6nkcvfg,3182
317
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py,sha256=cx5Um-crMG3cNHc8tX51r--K7sVYyM9WbhEqnrb9bag,2503
342
+ helm/benchmark/scenarios/arabic_mmlu_scenario.py,sha256=mI6ttMFAT3sH_v87qVNxYptqDS2EMUhK0b8vpfePSdY,2807
343
+ helm/benchmark/scenarios/aratrust_scenario.py,sha256=G20j6Z-C_6bUJf-bpdyUN23Hb7XK0YtieUprq_5Z5hA,2552
344
+ helm/benchmark/scenarios/autobencher_capabilities_scenario.py,sha256=fOCHumFWZa4OJZcTZefJiJbdWsb3zjQnWLJYd10Cctw,2496
318
345
  helm/benchmark/scenarios/autobencher_safety_scenario.py,sha256=MFt3f5baN5r-FmzWZfUChGR1mX_PUB_5hxoINac_Whs,1854
319
346
  helm/benchmark/scenarios/babi_qa_scenario.py,sha256=SyM6RP4v08B1PjumkdQnuKrM9L8SyK0bXbx-LgmyTPo,5067
320
347
  helm/benchmark/scenarios/banking77_scenario.py,sha256=dtiM-Q_pMDWqkLi-hgl0tH-aGuDdgHkXgweE1JqrPYs,1883
@@ -324,18 +351,24 @@ helm/benchmark/scenarios/bigcodebench_scenario.py,sha256=zQLv91uwfGAR9N4jm_iBUmY
324
351
  helm/benchmark/scenarios/bird_sql_scenario.py,sha256=n5elzanKEX9YclAl2l1y33aCjihTmaw1VF_ZsAU5IaM,3613
325
352
  helm/benchmark/scenarios/bird_sql_scenario_helper.py,sha256=FIwPk-dwfTY-8gDXeAiTZbfbS0Oe1OuWRlYiJOhZwk4,4664
326
353
  helm/benchmark/scenarios/blimp_scenario.py,sha256=9Ge3QKRgtVHpWy7aehZVKiO6JrsxK7zrEdtqAb4zxtQ,6284
354
+ helm/benchmark/scenarios/bluex_scenario.py,sha256=eHAltiFqZ_bS0AVi0kbskTlxJbQXy7Sqj6E9nZPWqCc,2500
327
355
  helm/benchmark/scenarios/bold_scenario.py,sha256=iE9drB9IeXfRn3xvLnaQi3-nJAp-bV1RE0GJGnp9dJc,4130
328
356
  helm/benchmark/scenarios/boolq_scenario.py,sha256=wPETIu5jcI4jgP5GoFa_xi4SsvHtS9gxQ5TD8neHmdk,8037
329
357
  helm/benchmark/scenarios/call_center_scenario.py,sha256=19J2N57WnUkPMGRRbJyZak8YCeMTRwD3BRK1SArQlL0,3037
330
358
  helm/benchmark/scenarios/casehold_scenario.py,sha256=QSe0D3KQJhlTOo6kM9OHwdKy6NlclsFGRVCAB3mTG7s,3174
331
- helm/benchmark/scenarios/chw_care_plan_scenario.py,sha256=y0B1g8wMyjK7zncJjaHUBSbvIK_4DNiAVE-Xk8KBsP0,3695
359
+ helm/benchmark/scenarios/chw_care_plan_scenario.py,sha256=BbEjDqa4C5wpdil5jIb1nzj16CCZ29hKoZVsfapSfho,4005
332
360
  helm/benchmark/scenarios/ci_mcqa_scenario.py,sha256=slZZT74QI3OMQAgT-ybcR_xVcRDoopXw6mMu4iy3XCY,3074
333
361
  helm/benchmark/scenarios/civil_comments_scenario.py,sha256=pnZU2U_cYFYOJmlmwTehHU5oLIPx_Yg8Ayxinroh4IQ,4875
334
- helm/benchmark/scenarios/clear_scenario.py,sha256=-r4YIQLKgbjT54J96urcxEcQ1bhxjKVtfyajuFOaEoA,5915
335
- helm/benchmark/scenarios/cleva_scenario.py,sha256=yPIiToKow76YMc0EDYeqQEPx-9a_6Bm3w4S1IsRRV5E,57987
362
+ helm/benchmark/scenarios/clear_scenario.py,sha256=yGdPxWO6vY4JHNa4xywtvD-9lOn6s5cr3njpZyFA0D0,6183
363
+ helm/benchmark/scenarios/cleva_scenario.py,sha256=n-h2urZ06GUOuAC_60HMwspTTpBFid72Fx8eZGQppdA,57988
336
364
  helm/benchmark/scenarios/code_scenario.py,sha256=lSbZWw67ie9osOjXDZukj3EEZGa3L6TrMvTg--IbuxE,12520
337
365
  helm/benchmark/scenarios/code_scenario_apps_pinned_file_order.py,sha256=KC-5MQ-d8Nn46aDN4FaPxmd6yk1DtVUmVR-CIZsNCp4,1738
338
366
  helm/benchmark/scenarios/code_scenario_helper.py,sha256=TnXAlY-wdAFwIDylFItf0z7HOu93WD6dNThwzZYe330,5904
367
+ helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py,sha256=PK4wtuBXs4cPPwOoGfhBA4J4cGLQYC_MvRWuvWrkrv8,9068
368
+ helm/benchmark/scenarios/codeinsights_correct_code_scenario.py,sha256=7BpcezugYHleSuG8hreHe5oXpm3bxoxQ4RCnx6rjKbU,3734
369
+ helm/benchmark/scenarios/codeinsights_edge_case_scenario.py,sha256=csTwe-mv1f6Tyvnj9uZ0SYuj1GRVvgjzukV28gIhNpk,8703
370
+ helm/benchmark/scenarios/codeinsights_student_coding_scenario.py,sha256=wc5Fefn4jpCw03dQ6WswCztJ8AO5j0Vrn6omcOVUq2k,7409
371
+ helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py,sha256=qX3yckZdMojYhiwvokvEuQpRXOzmN2zmzKjQb96Ljg8,9651
339
372
  helm/benchmark/scenarios/commonsense_scenario.py,sha256=yZ6n9aqOi7UWY3q4uTDNc2JRNZxaBZPIp7n_Snt_8g8,9511
340
373
  helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py,sha256=gKEwqHDD8KlKmW8z3xAxSIGmALTXrRRPcoDUzbv_IXg,3854
341
374
  helm/benchmark/scenarios/copyright_scenario.py,sha256=FHzUYEabj-BTKl90fgq7jSCq5_Yf9cO9MA9djn50B1Q,3697
@@ -352,28 +385,30 @@ helm/benchmark/scenarios/decodingtrust_privacy_scenario.py,sha256=zaXn4sRPUEZiqP
352
385
  helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py,sha256=NjutVTOVVze-IJniRFecz8gqh_BUpuJG3-BUboTGKRw,2933
353
386
  helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py,sha256=EYKoXDWMesbY5dCNY-N0eYMRL0rjEfGsuS_TkeD3Suo,2952
354
387
  helm/benchmark/scenarios/dialogue_scenarios.py,sha256=yXCMZegzlgL0CXTY1W5lXdkFFHicUvq4z7_284MfRpw,5778
355
- helm/benchmark/scenarios/dischargeme_scenario.py,sha256=sTh3bj3dqvh20FCn7bGMycFzH5xphgespVS063XZ_Wg,6759
388
+ helm/benchmark/scenarios/dischargeme_scenario.py,sha256=rBzagg0JVVN3o0VUfmHy2cN7gutV_RAJAo5Fa_El0GY,7842
356
389
  helm/benchmark/scenarios/disinformation_scenario.py,sha256=0T7LhXguzBP645Fruc2udfTaMuy7XGtOEMJKpFMIFRk,8565
357
390
  helm/benchmark/scenarios/dyck_language_scenario.py,sha256=hygFPTcICGUEPwjtxULLKBSbuBOXLYpozIgiGcT__W0,9379
358
391
  helm/benchmark/scenarios/echr_judgment_classification_scenario.py,sha256=IqODoUY1-zJD1KW4Qkg3VwJcUeeLgGUKThr62bW-wx8,4915
359
- helm/benchmark/scenarios/ehr_sql_scenario.py,sha256=ufrY7zmeXlgOxsq1Sr0x0vhR7xbL6FTJJWiM0pzwIpg,5119
360
- helm/benchmark/scenarios/ehrshot_scenario.py,sha256=ROPfWBDOAaHxcnnh5eGkCh-qhwvpxORcGmpA8DrjD0A,68721
392
+ helm/benchmark/scenarios/ehr_sql_scenario.py,sha256=Gm7Kw_TSUUxHW8ns-2e4E_tTBVX7h6Ta273VOpkMCQ8,5480
393
+ helm/benchmark/scenarios/ehrshot_scenario.py,sha256=MWcTejCtwohBPbZYWei_WNZ-Hdnhml7ovTVbJAgUetU,67770
361
394
  helm/benchmark/scenarios/enem_challenge_scenario.py,sha256=sxYXKvf-mGNqctTkemwI9rrA_Rg2xA8mz3_W3TIfzUE,2147
362
395
  helm/benchmark/scenarios/entity_data_imputation_scenario.py,sha256=4V426oOuexGg59q0djHCTQjQmqYgyLT191Z5fayubmU,6681
363
396
  helm/benchmark/scenarios/entity_matching_scenario.py,sha256=kzzDaoVikL2P7Z-17EkLIVR_W7IHcNVerUts2oXDKLA,7111
364
397
  helm/benchmark/scenarios/entity_matching_scenario_fixed_random_state.py,sha256=TklbX7Kx4y-estV-YHUbI5O08q2qCZRrOmX9D3gZS9c,2193
365
398
  helm/benchmark/scenarios/ewok_scenario.py,sha256=vrbJg9vakAxE6n-1jURUcwb-ihrsYoY9e32BpnEGDaQ,4684
399
+ helm/benchmark/scenarios/exams_multilingual_scenario.py,sha256=c9zMGGL8EbCeNogTm-88g_5wWUiX1Zr7z_tsyjUq2h0,5404
366
400
  helm/benchmark/scenarios/fin_qa_scenario.py,sha256=Dm_kGOivaxiKVhcqFgN8pRPs1eqm2LdBZxWy0yFhFuE,5958
367
401
  helm/benchmark/scenarios/financebench_scenario.py,sha256=cHMljdg0_9HA3FbwcwwMt3DR9rxl0jkyFN9jNrUStSE,1956
368
402
  helm/benchmark/scenarios/financial_phrasebank_scenario.py,sha256=dMTfI9MRHKXnECsXOIY8xvX6w5vAPEIa6A7TYyIu2Fw,4457
369
403
  helm/benchmark/scenarios/gold_commodity_news_scenario.py,sha256=-O4ilLwNcycmpQG5h_5WtQP7yJEr4mjWjKBe2eNP0uY,4806
370
404
  helm/benchmark/scenarios/gpqa_scenario.py,sha256=369E0JvaR12EcgcEFKKRcDw1iztt4sb8ghIsk9Brzi4,2884
371
- helm/benchmark/scenarios/grammar.py,sha256=Pb9vEP_0Ki87UdQCj1ym7QWJ24M4DRP6TXB5d3GnhLs,5597
405
+ helm/benchmark/scenarios/grammar.py,sha256=58tQYKPj013V9jIpW7fXUqZBLuboqEi_WLlDjx74spM,5590
372
406
  helm/benchmark/scenarios/grammar_scenario.py,sha256=Hz59gp5ivH3tIP5UAcHZbnk8pBX6GhIABSQlG33gIRI,1502
373
407
  helm/benchmark/scenarios/gsm_scenario.py,sha256=QIj0QK5ncF31ES0GUlxbdBk6SIiJJnj5wzamj0do0tQ,2674
374
408
  helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py,sha256=8_ShEuOoEGu7iRE2b0tgi-cfBrCPF9k1L-Pgb__n3Bg,2005
375
409
  helm/benchmark/scenarios/harm_bench_scenario.py,sha256=CBo_AfbtHTlvJdsiquP0EDTKApVmDZc7EW0VTENNAfQ,2478
376
- helm/benchmark/scenarios/headqa_scenario.py,sha256=TufgA1tjcEyq8vQ6Wk1oYxYXhSm0pjxvG14lL3y8GAI,5417
410
+ helm/benchmark/scenarios/headqa_scenario.py,sha256=m6Kqt16JeqA1-OLJvmBPZzhVOVt7O6rbJGAwG9C7FZs,5658
411
+ helm/benchmark/scenarios/healthqa_br_scenario.py,sha256=YneXTfp8V6k8rYCF3BTX6bxN2ASxdG3qrBr7uH_IFWc,3406
377
412
  helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py,sha256=iv1khpdiWW0Z7lshyWOhhjRfYFdAU6etN8X5EDEQCrc,1302
378
413
  helm/benchmark/scenarios/ice_scenario.py,sha256=NCbeqvpDFIIG7kSCrJrS-Z9S3iG2THZ7HpAqghpi_y4,16725
379
414
  helm/benchmark/scenarios/ice_scenario_pinned_file_order.py,sha256=fuirubIdi-rkJMfSd7YoDdBX2q0f5K7GGTN4XVapAUY,1613
@@ -381,9 +416,12 @@ helm/benchmark/scenarios/ifeval_scenario.py,sha256=SYn9itpFG0tlWSayf6v0P8bRgdtc-
381
416
  helm/benchmark/scenarios/imdb_ptbr_scenario.py,sha256=laq9UwyvBvZZuo54rf-8SdKTLrMdDHTdGWJ4TdC8Eng,2340
382
417
  helm/benchmark/scenarios/imdb_scenario.py,sha256=qHXd-QIXTCBq8rWW3N5I2Rvg6Pz9v1zFhZkwc73w9io,6259
383
418
  helm/benchmark/scenarios/imdb_scenario_pinned_file_order.py,sha256=fjW0Gkzg2Y3IAbtYJ3KC7MueWd9U8h0tlcBCqxYmRrM,1621
384
- helm/benchmark/scenarios/infinite_bench_sum_scenario.py,sha256=jp5a2p_yqlCQXmhJRsqpTiKN8EGZi8Xyw3h37elb2OI,2785
419
+ helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py,sha256=JRTLaQc3PDpYeX9ewGnBteT9jXeaGbmJ1VzYGT8TsXI,3067
420
+ helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py,sha256=5fJHFonb7Ko7exHFtoUtvHar_7PhK2HjW9uDlU8Ljj0,2872
421
+ helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py,sha256=6z3VlcucrwK2B30artWiSpo-mOTr9tiwYV6Fu8XD0VY,2657
385
422
  helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py,sha256=F-gDO6r4GPBJTLirhF5noRaV0edvoIT7tiIDlovBFfE,2253
386
423
  helm/benchmark/scenarios/koala_scenario.py,sha256=A5M6SD7Jjg7r9QlbHCtMaydBe-wpOtB6oc6gFXuZ47o,1389
424
+ helm/benchmark/scenarios/kpi_edgar_scenario.py,sha256=23rZM3IA-phf2VnuPY9QWd64scE6eaJks49apDUNfic,6355
387
425
  helm/benchmark/scenarios/legal_contract_summarization_scenario.py,sha256=xjw3iKRf8P50Wo58n7ssnFiWHR2QFehzHlZhh9P1XKs,5374
388
426
  helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py,sha256=q_iezJo23_HNNoIXYT4cLYCbwNzLYJx6uvxgPSE5bQA,2804
389
427
  helm/benchmark/scenarios/legal_summarization_scenario.py,sha256=BFK524H7uLfz_ZURuRS7KrhzRCP-WyhIcOgdcBrsldA,8709
@@ -393,81 +431,98 @@ helm/benchmark/scenarios/lex_glue_scenario.py,sha256=-3fsSjTXjgRN96Hl4GzDIMB_dlx
393
431
  helm/benchmark/scenarios/lextreme_scenario.py,sha256=gVTHoMYX6Q_Itt5rOVO5lYmqWfAtuuf63CnKAF8b_ak,20461
394
432
  helm/benchmark/scenarios/live_qa_scenario.py,sha256=TnWaOPOcA4U1_8JdahQOUZ9KBj0MpMf4BcK2TDBl3BE,3666
395
433
  helm/benchmark/scenarios/lm_entry_scenario.py,sha256=kQTnj5gKJmDxCgynmzQOmghwNySpna7aTY7K7RPD2x4,9109
434
+ helm/benchmark/scenarios/lmkt_scenarios.py,sha256=K51CdOZqMOMOozUmADjrJuNCpUtXVEZwcOeIY-EZrwM,11162
396
435
  helm/benchmark/scenarios/lsat_qa_scenario.py,sha256=2VUJ36vHUZp6fZuLfRIuPSsU_K6Z3Im2ums06sZENqo,6153
397
- helm/benchmark/scenarios/math_scenario.py,sha256=UtNj0UaCxt0RjM-uwD_Evm7SjKnvMlfCt6K0HQOAVC0,14377
436
+ helm/benchmark/scenarios/math_scenario.py,sha256=tW-nGKxyDOwOo2siqu1ZzPrCGzw_lFYGK5uiUK0lF7A,14525
398
437
  helm/benchmark/scenarios/me_q_sum_scenario.py,sha256=7DOqQmO70BpDeJy_S4fJ5i2UcCH8tunxzjFgTIim9bQ,4062
399
- helm/benchmark/scenarios/med_dialog_scenario.py,sha256=A-OhCSsbyrkIiyScfGXf5mWJJ9mUXhWQ1S2hHFUYxQk,7254
438
+ helm/benchmark/scenarios/med_dialog_scenario.py,sha256=AE10W1UWhOrgKUnz7e2brKSaQR1WJkQUcPoo4s6n0Fs,7553
400
439
  helm/benchmark/scenarios/med_mcqa_scenario.py,sha256=XEipvuIA-QoyZrtlm8nnaPuyZzdDaeTskAhnseD3Q68,5096
401
440
  helm/benchmark/scenarios/med_paragraph_simplification_scenario.py,sha256=0Z1JrizLygjd9v_LLFMk8uZ805IWjJPvg-ZvPVhtMm4,7652
402
441
  helm/benchmark/scenarios/med_qa_scenario.py,sha256=m0W-FgFi58psLglZyQy_ouMQIDP-2j3aL7uInkdVtms,4478
403
- helm/benchmark/scenarios/medalign_scenario.py,sha256=yNaEyCGdeMMTZmPQcAyQeHFDD3mHZVIrauCC-WEuiZQ,3040
404
- helm/benchmark/scenarios/medalign_scenario_helper.py,sha256=wFR15zRuuvtGc1MXaecndfHH-_uCLF6O_3twpqXZOLo,15292
405
- helm/benchmark/scenarios/medbullets_scenario.py,sha256=mAQ1-jgsbd5hM78C0E5cgFs6fPt2KYErdemZBTQ12iE,6447
406
- helm/benchmark/scenarios/medcalc_bench_scenario.py,sha256=fq1qBjGkL9EA7wQBJrg_V5fIewNgpiQVK70-qShXnaA,5374
407
- helm/benchmark/scenarios/medec_scenario.py,sha256=uS567bCzOQx08euNU6vA01YqxxXadIcSqModkqT22LQ,4969
408
- helm/benchmark/scenarios/medhallu_scenario.py,sha256=Ed2JesQzU41P_rv_9zgBnQCGD-EEkG-EkIBw1qEIXbI,2223
409
- helm/benchmark/scenarios/medi_qa_scenario.py,sha256=JtsRryV88nFy0UlFaUuR2QyEdYkY2vLYMScvKC9ndTQ,3770
410
- helm/benchmark/scenarios/medication_qa_scenario.py,sha256=YJdluvYLb2_m96JkcCOTBIQOPY2h8dfmjTbJ__7Jydk,2266
411
- helm/benchmark/scenarios/mental_health_scenario.py,sha256=nGTXWYWfeO_t1u5va4u-S_OD70qo9IKbCHk5vLEBhT4,4518
412
- helm/benchmark/scenarios/mimic_bhc_scenario.py,sha256=RLUV_ydURfA6kTUJQ48TtaYzIYu8TSY9vl_ahNtSa7c,3777
413
- helm/benchmark/scenarios/mimic_rrs_scenario.py,sha256=4baXBaW_zcZU5RhQM3JsRrzHGhFvLwGiOu0irZShjps,3401
414
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py,sha256=IECc3j1RSIZeO-JqTyxOBFCbc7ovl4XsYMdGL1k6pcw,2355
442
+ helm/benchmark/scenarios/medalign_scenario.py,sha256=mhd8REXpPwxftH48-KKb0ZURJ1mdOlvPRmvN4g4M9Ho,3383
443
+ helm/benchmark/scenarios/medalign_scenario_helper.py,sha256=fKXJFVLGnLcZKRBLsbjJA6YA4WqMaQAjkEU-i6YzSTQ,11626
444
+ helm/benchmark/scenarios/medbullets_scenario.py,sha256=8O0UsPWw-ESkrgiuWz4f8gR99jH5-wS5HtCKYwZ1ycs,6713
445
+ helm/benchmark/scenarios/medcalc_bench_scenario.py,sha256=vwmEQZ119tOVeZtl6Zt-nXKwkA8Qt4WRiH2HogIkV0w,5560
446
+ helm/benchmark/scenarios/medec_scenario.py,sha256=Lo7iVkek7C9omJ5LX-C83pA_Q5OrAfdNhJY4rslJWTQ,5270
447
+ helm/benchmark/scenarios/medhallu_scenario.py,sha256=d4HlEi1cQtvh1a39jvIHezDDmjuIEsSPdqDLLkDTzw4,2544
448
+ helm/benchmark/scenarios/medi_qa_scenario.py,sha256=FmXI3UwfbL8zinFPtSyTyw4X5VIe2d32HAg93vbXR94,4118
449
+ helm/benchmark/scenarios/medication_qa_scenario.py,sha256=StQmfHTYi8pZLP9FMPzyS-VB9gilZS0XBme7MzAL2QA,2583
450
+ helm/benchmark/scenarios/melt_ir_scenario.py,sha256=d88DEGKVJZCeGnbrXrQZO_W4VJeqW8XNaYc8wIUiJtA,5978
451
+ helm/benchmark/scenarios/melt_knowledge_scenario.py,sha256=FDG4OGYEV6Ac40VC7KAeikzbFKAK2XXFhH1-QUTw8jo,7923
452
+ helm/benchmark/scenarios/melt_lm_scenarios.py,sha256=kSm0lRRixhnXctMprPnzi09PLOmgfs-C7TAW3QI8RmE,8969
453
+ helm/benchmark/scenarios/melt_scenarios.py,sha256=_WShDpmPaKrujGbZcazCqleDn0TKDhFg1h-vu3ieS8E,30144
454
+ helm/benchmark/scenarios/melt_srn_scenario.py,sha256=EQSOZIXbfvVWCJMJ4H2e_CiBz6wc8THJndnbK2WwTHM,14674
455
+ helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py,sha256=ptMQWgNn6R-XpAVAAjutSdZg_9ZUqG6fVotzAgeead4,7945
456
+ helm/benchmark/scenarios/melt_translation_scenario.py,sha256=j9YrY60DQHZz4m1MJZaGLzyI6FERlHRx2wy9auyAVB8,5415
457
+ helm/benchmark/scenarios/mental_health_scenario.py,sha256=O1Lfd0MxqawLZLKUDSynaqqbaGHRjDglmePIqepnJI4,4961
458
+ helm/benchmark/scenarios/mimic_bhc_scenario.py,sha256=GSUlvK-NVPYB83emucc1cPj-HgAQVu2aXGuutfXJUHc,4098
459
+ helm/benchmark/scenarios/mimic_rrs_scenario.py,sha256=bxwVWjE_z4I_Nk5eD78g3QAGyjpsNg7DVWpkp8IGWXM,3841
460
+ helm/benchmark/scenarios/mimiciv_billing_code_scenario.py,sha256=tZBUZEaUMZvfSlsU6hcPs-pxQ0kDIL6qebGd7JmpDbk,2699
415
461
  helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py,sha256=-OkPMRyB7aO6QBFwoTl6a2rpzcoHeEl84tqz7k9kpCM,2982
416
- helm/benchmark/scenarios/mmlu_pro_scenario.py,sha256=zUY0Nb8PzwxvohS1C2Me9utRfFM-8OLr0CmUfyjiVgc,4013
462
+ helm/benchmark/scenarios/mmlu_pro_scenario.py,sha256=pwpp0wqNhsGc8v2V11aUyEWbwdkmIm-42N676j1T3Ws,4031
417
463
  helm/benchmark/scenarios/mmlu_scenario.py,sha256=_5cX2uI7CxD7K_GvO3MD8CRJLuN4EzS2o_EFvbrfjSU,3855
464
+ helm/benchmark/scenarios/mmmlu_scenario.py,sha256=CyOISLOsXF9IEYGfeqWyYYkWGvrUvGivlWSJ5ttN9qY,2762
418
465
  helm/benchmark/scenarios/msmarco_scenario.py,sha256=-l7_rIMQjMWcpTyn6dGotmNJ5XxN_Ze8dEJyv5ftWFA,34050
419
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py,sha256=ftRkhL8oLZgsDvLzI8ya5jv9xv77YcDT9TU9JZBss8o,5333
420
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py,sha256=eB6PT0wwH521r6uXvoiQEo7fZQQcKATuElHuPmyVyW0,5301
421
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py,sha256=DAN3xeupuZEzxubf60C938Hr3WjhkzgaI1MbNwgPu8I,13194
466
+ helm/benchmark/scenarios/mtsamples_procedures_scenario.py,sha256=13pXjs9lFduM-QL03mpM10hU0iA8Vr2jJG2FVBQdKOI,5577
467
+ helm/benchmark/scenarios/mtsamples_replicate_scenario.py,sha256=RlyWrlI9e5MLsGbkQWpO2WRsIOZJi39xHskOIBypHdo,5399
468
+ helm/benchmark/scenarios/n2c2_ct_matching_scenario.py,sha256=-Et7hJnQJOGl1U9Xdb5mLckYTpU_Ve1sCe450M-5haw,13513
422
469
  helm/benchmark/scenarios/narrativeqa_scenario.py,sha256=MiSq0UnUllJxHFU2gO7m4vr_vmulavJxc4ruZhsAt2U,5632
423
470
  helm/benchmark/scenarios/natural_qa_scenario.py,sha256=g-fP8L1lXs7zwNVQOc0ZUnbYkCyElQtLVt5fe5dtvSE,12564
424
471
  helm/benchmark/scenarios/newsqa_scenario.py,sha256=G25VYaLrV_JyyoT0jpzJ6p4l5qsOydm8rlzTvSptNKQ,7284
425
- helm/benchmark/scenarios/numeracy_scenario.py,sha256=E1WkVgqPiZwaKuskD5iVwoypbG3DKI_r3bPXPqZ_SSk,30885
426
472
  helm/benchmark/scenarios/oab_exams_scenario.py,sha256=vbjUzQP0zU4ckvMbsk4lh24NddVWbUAtfWmsq1h24_w,2101
427
473
  helm/benchmark/scenarios/omni_math_scenario.py,sha256=5qb2cO-Ibb3kDbwYvkzsoU_aOsoKV3ROLgZbi83OyGU,1955
428
474
  helm/benchmark/scenarios/open_assistant_scenario.py,sha256=zd8T6eLOlYMZiFyKrRjc-EPwk5_KpbBedAcKDbZ-TdI,5609
475
+ helm/benchmark/scenarios/openai_mrcr_scenario.py,sha256=XbO8Wpjjq2e8OsC2s_ZScV4TcZg3hlpVGy56hgxXY9w,3253
429
476
  helm/benchmark/scenarios/opinions_qa_scenario.py,sha256=JK39tq306tKe0RDBDLz1AfAdZwNjK_Ng-rHvu6bTRY4,7395
430
- helm/benchmark/scenarios/pubmed_qa_scenario.py,sha256=AmgdTGffaxNOJ_xDqA1ju5jXjlvEVdx3Gz7Cp7mqsd4,7789
477
+ helm/benchmark/scenarios/pubmed_qa_scenario.py,sha256=73D9D_q4Zw904qfd3tVPPhHxpGN4IZcWjlA6ZHEfp2s,8070
431
478
  helm/benchmark/scenarios/quac_scenario.py,sha256=RpJpOPbvhB0jv3R91Odc20LcNyZsny9J4IF24GNEygQ,6689
432
- helm/benchmark/scenarios/race_based_med_scenario.py,sha256=FQl99ttwk-SQdix9UpOCG1nI92JHuemLRgWjFTHGiTA,5295
479
+ helm/benchmark/scenarios/race_based_med_scenario.py,sha256=vZB43jtM47PWrl9L4HYOf1i7orpscKcHX01m0oVmk2g,5778
433
480
  helm/benchmark/scenarios/raft_scenario.py,sha256=Yk56dUMqDGXpp6SxoGWhyxa4lAIniSQfivjkoPqMuFA,4644
434
481
  helm/benchmark/scenarios/real_toxicity_prompts_scenario.py,sha256=zpQthgDi-AyEgOUFO5F0qaWCctLEI5WGHBEGlPEVpqc,2424
435
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py,sha256=bonCyKO9hCSce4EQCTwYAv0pgRlBYwlXAmEHl6_ljHA,6323
436
- helm/benchmark/scenarios/ruler_qa_scenarios.py,sha256=bhm8IINDa1asCFHTLkB0BztzPvww0Uy0Lv6_G9By77M,2876
482
+ helm/benchmark/scenarios/ruler_qa_scenario_helper.py,sha256=jgVf1D4eTSxwxQsW0GBou5hfSo2dnlEJvHpVJqk3BxM,6327
483
+ helm/benchmark/scenarios/ruler_qa_scenarios.py,sha256=Dy0INRMzxSiIs9Pm3fa0hYodN-W--WPSv4kcmeQhucM,3270
437
484
  helm/benchmark/scenarios/scenario.py,sha256=kSy7tmtFeC6-QSEsBuvlrMTA1PB6fOY9jycMld-vBVM,8592
438
- helm/benchmark/scenarios/seahelm_scenario.py,sha256=GA46ShNGUjVdMLK0ZbN4vPuGEWFQsDPJXEGHQbs1qf8,78150
485
+ helm/benchmark/scenarios/seahelm_scenario.py,sha256=i8SnuYDQtFGFkm686h3_FF9J3vr-Bd9w_jd7h5tV5yY,78152
439
486
  helm/benchmark/scenarios/self_instruct_scenario.py,sha256=3Kvi3pLL6eGOEezjoQoGv9c1UxKiRVlFmILKzqst4pI,2309
440
- helm/benchmark/scenarios/shc_bmt_scenario.py,sha256=bbl3XczUrPW_mzHPtNxSC2SHRKBzgZP7RueIi8vc5y0,2362
441
- helm/benchmark/scenarios/shc_cdi_scenario.py,sha256=AJlKAiSoUJQ4N4WeeBjqOfYNsl2teq8G8B_8pOZOW8E,2422
442
- helm/benchmark/scenarios/shc_conf_scenario.py,sha256=AQUukzN3lT5dSQWyc6t6ZSZTEIEvOj-TC7K8BA1Q5fs,2535
443
- helm/benchmark/scenarios/shc_ent_scenario.py,sha256=urspIw8idVLiuK__cG3IvnYnky4AZWiTWzK2nzjwAVg,2530
444
- helm/benchmark/scenarios/shc_gip_scenario.py,sha256=ectxVRWal0LnqFlBsRGhtJgWN5RZls7tHAVDeMzcW4w,2337
445
- helm/benchmark/scenarios/shc_ptbm_scenario.py,sha256=bAHIu7YKwUhwvGJuS5hplo7JedwLFGxyLub_ALLZo98,3077
446
- helm/benchmark/scenarios/shc_sei_scenario.py,sha256=udGZAIQ4Fpi_bV6WsuuQyIpSY74qc8VIc1MP9yRFIRs,4213
447
- helm/benchmark/scenarios/shc_sequoia_scenario.py,sha256=Z_7LM-RHuwWGdwFwCAgAQvIz4dfXNE0uHhLM1_9m7n8,2410
487
+ helm/benchmark/scenarios/shc_bmt_scenario.py,sha256=wF_sD61IZ4RDznBVQ1HYbGh3Vc2qjbcBuU0jdmp1aD8,2803
488
+ helm/benchmark/scenarios/shc_cdi_scenario.py,sha256=5aVEiRgFCutEWW9yMcJBxEo11FlwW0SiZTaOyXY6ioc,2693
489
+ helm/benchmark/scenarios/shc_conf_scenario.py,sha256=3LDB2pT6yi-ubSooGAD_0Ao7sYLo_MMAHNfm5Ux9Yvk,2889
490
+ helm/benchmark/scenarios/shc_ent_scenario.py,sha256=PS_O_keZ5s5_nSKxAC1k_WV2W8umEbyyKmlFtxvaReI,2855
491
+ helm/benchmark/scenarios/shc_gip_scenario.py,sha256=cxMpMmS05QpZ4xW2eogPH1hcDv6GzA6UQoAi9OSFO_Q,2702
492
+ helm/benchmark/scenarios/shc_privacy_scenario.py,sha256=dbQI_pDqXepV6EyxMUNumIpyQ8oDwnu37qyQ29rxZfY,2998
493
+ helm/benchmark/scenarios/shc_proxy_scenario.py,sha256=edepzg5qrN_GKa7u1W0RRhkpmfUi2vFHCvI1ma205WQ,2908
494
+ helm/benchmark/scenarios/shc_ptbm_scenario.py,sha256=QOQdz21s_YaRyGz-ciCPHH-fCy6hiGIrHUZz0SWPm5o,3391
495
+ helm/benchmark/scenarios/shc_sei_scenario.py,sha256=pTcb7n97VkesyRuqUqe5JGed1jDsQEd19udciDras8E,4532
496
+ helm/benchmark/scenarios/shc_sequoia_scenario.py,sha256=vjDyRZXP9UjkQzmA6u7SmKtMBuUwwn6KRQ4rT3vZqqc,2796
448
497
  helm/benchmark/scenarios/simple_safety_tests_scenario.py,sha256=sjIHT5NZlHv_IcXr_15-pOiBUPKKwykyH-QpMfvrHAY,1247
449
498
  helm/benchmark/scenarios/simple_scenarios.py,sha256=ersSzp9bFEFfpJ-SNy368AuonwswLnuyA1n7FOgkw4U,6459
450
499
  helm/benchmark/scenarios/spider_scenario.py,sha256=mhiV3XWGwpnIQkaHFM_rvZlrwE7nqS12-F9t1eB8kdI,3306
451
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py,sha256=X7AY99m8Lv8bVpOg-Bzld0vyFzpvi6fOAqE3rstRfpw,3843
500
+ helm/benchmark/scenarios/starr_patient_instructions_scenario.py,sha256=zdokiMy2Lrg5mS3V2QEakcZyJxIkqcoT5CqVCAtyoKU,4146
452
501
  helm/benchmark/scenarios/summarization_scenario.py,sha256=WZnqhMQED6UBmRjHSboygdenLecOqIhvgdYVXzy6Q-I,6912
453
502
  helm/benchmark/scenarios/sumosum_scenario.py,sha256=HG3wrKj5alV0a2aKb_nau8bB4oKDtTOLtdf3bx8h7sw,7695
454
503
  helm/benchmark/scenarios/synthetic_efficiency_scenario.py,sha256=OaxEvT1H9VjOjBSw_yKs3dcYt33vFE_UARr-UIP9pBY,3120
455
504
  helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py,sha256=pt2Aln_dX1YMSl-9hV1HJmwW90MC3fWwGsMxZg-Q-UY,16391
456
505
  helm/benchmark/scenarios/synthetic_reasoning_scenario.py,sha256=7STCSHiHGIQ2aaN_PwDE5jXUJ-qcu8PaS4pC-pbOceE,8410
457
506
  helm/benchmark/scenarios/test_air_bench_scenario.py,sha256=9o92CK57xxgPaA9Xt9uJPPie4Cxllzq-KbMt3G35UQ0,1320
507
+ helm/benchmark/scenarios/test_alghafa_scenario.py,sha256=ARQyzjmEpX_FpN2QLnIB7P-ToAeMtE4dqsolzlq8KPQ,1696
508
+ helm/benchmark/scenarios/test_aratrust_scenario.py,sha256=3rsIBfFCAmemNT_IJQ6RZ5liwrHseKGvtGmm2VHoBo0,904
458
509
  helm/benchmark/scenarios/test_bigcodebench_scenario.py,sha256=q9FWJsxLJoFaB3PSMLjI_-YyPoZYusOsMPwn6X6NKXw,1304
510
+ helm/benchmark/scenarios/test_bluex_scenario.py,sha256=fLTyMTmSiJ8MoJKYJ2pcE39yXwZm6zv3_oWsQbRbwH0,1930
459
511
  helm/benchmark/scenarios/test_commonsense_scenario.py,sha256=V5Mq4cxWqU6j1U3icfIuzcnCZsZO7NTKLQgF0lEpdyc,924
460
512
  helm/benchmark/scenarios/test_czech_bank_qa_scenario.py,sha256=bZNLEGu58iHmutGlSp-2uVC2931TO6Rxw7giqFh9RHY,828
461
513
  helm/benchmark/scenarios/test_enem_challenge_scenario.py,sha256=XfPkYaSwdGa63ToC_BLuVKTRSldWNBlKsZYK6CFzL3w,2000
462
514
  helm/benchmark/scenarios/test_ewok_scenario.py,sha256=WY2vqbHF1120ht4PER0uviKMb2jnoPM3ff4KwvwcU4I,1291
515
+ helm/benchmark/scenarios/test_exams_multilingual_scenario.py,sha256=vHLTcEzo5SkZgy2yXYm1Sex641qkr4HQWmVsOrlCQ_s,1764
463
516
  helm/benchmark/scenarios/test_financebench_scenario.py,sha256=EFZLJXXBoyjlTiMQFaQ6MiYkve1lfQDjQWjn4BjqgAQ,1184
464
517
  helm/benchmark/scenarios/test_gold_commodity_news_scenario.py,sha256=RO0NcIkJuujdPVO6tDygmDxhZ5YlmIIYlhwx9LeXlQs,731
465
518
  helm/benchmark/scenarios/test_gpqa_scenario.py,sha256=QQJ_-nmujZBSmhBhikRUWznFJ4jHPbGDnUVCP_17poI,1884
466
519
  helm/benchmark/scenarios/test_grammar.py,sha256=sPlA36sHpThbXgnGlXyOuqHfDPe2epIafmzIeL0nkoU,1364
467
520
  helm/benchmark/scenarios/test_gsm_scenario.py,sha256=I-Sl8Sg8kmFd7u0zZbwbNmeFV1mQLuOHoQ1cQDDwovs,1123
521
+ helm/benchmark/scenarios/test_healtha_br_scenario.py,sha256=YmhXK24MuTPyLFCkXXI7IlwwiiJxytAbONOEh6wSJWI,1935
468
522
  helm/benchmark/scenarios/test_ifeval_scenario.py,sha256=h3CBg13VKwyb1Xaddwg2GWOzAXz4stK5lXdQtHenAw0,1646
469
523
  helm/benchmark/scenarios/test_imdb_ptbr_scenario.py,sha256=8kfCkMRUMU7N4WIrWawFDoxaLB2iTvQ-sPj4RoE2Osg,887
470
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py,sha256=Jy8HDZPtYS48-bBFIStKaQtxvQv_GcAwh42wCYku0vw,1969
524
+ helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py,sha256=qZE-fi1tdNOybpvEQZJUpq9fHsyrPW7NYqj_RTwsv2A,746
525
+ helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py,sha256=t7BJ7ouT83oNtMFFoBvdyQRu2vWW15I1HUdtmzzQKLI,1221
471
526
  helm/benchmark/scenarios/test_legalbench_scenario.py,sha256=FqbgwBAhHWyTIUYSzI5FOnTDx0A3u1o2ANKa_6bfA4g,1212
472
527
  helm/benchmark/scenarios/test_math_scenario.py,sha256=8Raix_ykxUENh7UREw1RhpM287oav1p59P1Dn2gXktI,829
473
528
  helm/benchmark/scenarios/test_med_qa_scenario.py,sha256=Ekp6r5eYPkCxV3FCzVvLemKxlhENhelqdO0Mdhg5yFo,1515
@@ -484,7 +539,7 @@ helm/benchmark/scenarios/test_wildbench_scenario.py,sha256=pmQ87MNoGAXwAmPf0eoep
484
539
  helm/benchmark/scenarios/test_winogrande_afr_scenario.py,sha256=LZfE4J42BZ7OF3BvfKgMWuCHpdw4-LpWnFiKyrHGXp8,910
485
540
  helm/benchmark/scenarios/thai_exam_scenario.py,sha256=YjFsom1yiu-xBZ3SGenNuczVCwQcmyoITTMavGv-QEk,6069
486
541
  helm/benchmark/scenarios/the_pile_scenario.py,sha256=X3GWABiJ5cSoZzeNpgNUVAz7_A9SyM5MhgpJseKpZow,5019
487
- helm/benchmark/scenarios/truthful_qa_scenario.py,sha256=p3KAHk3C01TW7ya_XZIuK3JMJE51uoMpOnARV2UKgJM,6096
542
+ helm/benchmark/scenarios/truthful_qa_scenario.py,sha256=kUQ-Bpu1N1s525EP3pa7v3sp9Wybl0RuJv2pVu0pAGQ,6155
488
543
  helm/benchmark/scenarios/tweetsentbr_scenario.py,sha256=ppugbPWd_3hHesLC52QbC-wUknctr9ZX4tmHefnPf6w,2879
489
544
  helm/benchmark/scenarios/twitter_aae_scenario.py,sha256=wnP-zH38J62zmbdeOLzdU-E3iclbQPApgEk4AGyhdoo,2120
490
545
  helm/benchmark/scenarios/unitxt_scenario.py,sha256=uL8Gni-Uw_eIp9xKQefp4J7XtKSttjJHzJE4USyoC2U,1930
@@ -497,14 +552,15 @@ helm/benchmark/scenarios/winogrande_afr_scenario.py,sha256=3SOVyrQ8D7Wzz06uSbczD
497
552
  helm/benchmark/scenarios/wmt_14_scenario.py,sha256=1YYjz4x2RbYfJAXBTux9X30dxYTSC-YNngCCLhEiNfI,4646
498
553
  helm/benchmark/scenarios/xstest_scenario.py,sha256=ndRNB5ApW4th5iltlmT9-Nfw9eTaVZQw5AMC4HZCI-k,1309
499
554
  helm/benchmark/scenarios/audio_language/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
500
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py,sha256=wufgORN0vPuTUnp7-VeCUoLH03C5MzSa-PMYku8D0P8,5626
501
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py,sha256=8piJKvbTeKdxLTWnazp0_ydC6ESRHeb-Pj2ri-86U28,6619
555
+ helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py,sha256=NtTEHzmbeCicbjTRxPBUueZrBGOPwF6RVc2Yftc-VKs,5634
556
+ helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py,sha256=IJlM1I0MxtBX5bhvwPPrsBfUwEm_ZqqVmPze8UH_tl4,6622
502
557
  helm/benchmark/scenarios/audio_language/ami_scenario.py,sha256=SH4r2YyW2kQ8r6-nSRI_F4unJC-l-lzikr2O7hMKgEM,4371
503
558
  helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py,sha256=kiUngeoAVOXfuKgqo96RgK_volpJUPFziu-cYDqT8WM,2685
504
559
  helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py,sha256=oLOeBGjQCa3hpzjhX2bNS6637VD9VF1KbSJri9BJ3PI,2698
505
560
  helm/benchmark/scenarios/audio_language/audiocaps_scenario.py,sha256=PkVqQM1zX6ecXYk-Pz4YWlST3Hnla8NyeBHbuHvhSlY,2447
506
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py,sha256=uoiB3mnkudRH_rY1qeZRgobYYZ0xDn93F1Mn6Avl24Q,6724
561
+ helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py,sha256=4X_C68yoMKRUC3SuNHYK4_fcboOz-9gbjhbUK1g3VVY,6725
507
562
  helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py,sha256=CbcoGPW65xXRRkrDthotDfoVn51ozANG9s3LCsjxkLA,3706
563
+ helm/benchmark/scenarios/audio_language/corebench_scenario.py,sha256=R8RAUtdRAQcUAN0PFXybQUekdQFNtT8hXtoR1A1hMGk,3155
508
564
  helm/benchmark/scenarios/audio_language/covost2_scenario.py,sha256=3YiaQXuLGfths2XswRw30Vf26bO9jEW_kAj5wZQSOSI,5119
509
565
  helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py,sha256=OKawk6Mq6ONOxcttkk-qodeFkNet7nvP0UbeEu5EgJw,3079
510
566
  helm/benchmark/scenarios/audio_language/fleurs_scenario.py,sha256=k8AFujDJYtH37Zaquy4TH8xYcxE62cvOK6DVDfp1TKA,9235
@@ -513,13 +569,18 @@ helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py,sha256=
513
569
  helm/benchmark/scenarios/audio_language/librispeech_scenario.py,sha256=ogMXxnyTG05tCyJ2d4hiuiVsbQvf4TbndksYeaJXl1s,3475
514
570
  helm/benchmark/scenarios/audio_language/meld_audio_scenario.py,sha256=j1JFX0jGfcqX0QZBKSjYjDWo1jHJbW5Q9jHyOs6Kgls,4903
515
571
  helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py,sha256=Jo_-3zC226iKGT-ac0JNMhlEccazMMiHbomx_qU0rxg,3098
516
- helm/benchmark/scenarios/audio_language/mustard_scenario.py,sha256=9bpcvFtWq5Pd9i9X8iaY9jod3YcRqk88xnXfjwcNMoY,6130
572
+ helm/benchmark/scenarios/audio_language/mustard_scenario.py,sha256=7YHgfSpua5OdEGPlmxoufwGXQjvGJMTlEWFiJ_ap5ME,6131
517
573
  helm/benchmark/scenarios/audio_language/mutox_scenario.py,sha256=bDCQbhsRDR6iQGNlCu_35kjmjGjuzjOIoraSncfOlOY,10277
518
574
  helm/benchmark/scenarios/audio_language/parade_scenario.py,sha256=UuOa5cSrHh5n3VF_SuJp4cy1MxlI3uEKHLrNEhGuyuw,4186
519
575
  helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py,sha256=oN4vBkElVzjccaEK2JFqoXMCGFTTHD0gcYwSDhvHTpQ,5438
520
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py,sha256=Ar7IgtfZXFpsHJ76QacEB1KKwXVrOBE0BcSBO_GN2T4,2718
576
+ helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py,sha256=2qzPYfn0YYzzOtffD50kQu_ePpFJj_sSW7Bq8ZS6M2g,3559
577
+ helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py,sha256=TEyfAsas3ihN4b4bpGkbK_M_uDt39fVrL5k8vl2Cdyw,3389
578
+ helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py,sha256=qPOP6eIEwxPKu6q5EzcrRmhMxMUQk5F9iq8zdJ1Ccrc,4819
579
+ helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py,sha256=CGteDFCd31vbu_eg5oal1cnfjQ2J0Ty3C2HYyBLhI5M,4186
580
+ helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py,sha256=sL93Q2ERzYiWcTOFEyvjUNbX0BgPdsyHKt6eTr51-Kc,5177
581
+ helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py,sha256=wkKyTCtx4isQSMufap_6DsNdGkHi7L8FQ2p7n58kKYI,3124
521
582
  helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py,sha256=4M_gTWs4CoJ1Ce9dDFBTAe9dzSovpsve_sN1eco2V2A,3155
522
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py,sha256=oaUeTgmr2AkSvEJYua4SItCbXsiK6cSSrIjlqsSQC7g,4431
583
+ helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py,sha256=L04ee5bM5E0UNNmkwEzVwug4HJXQoIcVjujPgxtU2h0,4366
523
584
  helm/benchmark/scenarios/image_generation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
524
585
  helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py,sha256=c8zcoGCOFqBGE4TAEx1uLsUmGXw_jIS8alI99ubGeDA,5477
525
586
  helm/benchmark/scenarios/image_generation/cub200_scenario.py,sha256=7p3G4mJRc8QHR4Mw2GLsfAFuJcEe6OeZbezVhbyc55E,4103
@@ -558,6 +619,7 @@ helm/benchmark/scenarios/vision_language/mme_scenario.py,sha256=7Aa3y0TWGZH3QrPD
558
619
  helm/benchmark/scenarios/vision_language/mmmu_scenario.py,sha256=deDMdg2-ORZPV623ngncDPlRn6z6cq_QbQtMu-z0Ydo,7665
559
620
  helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py,sha256=HUO09uM2rBXOfCsxzwovmwtihq53xjuzDOtQO_S3J4I,4161
560
621
  helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py,sha256=c7YfclYMDtygsLnEfA8oP6Vl7evdrqqTZazmuD9Oy-8,5353
622
+ helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py,sha256=qWz71kAlH4TxFSTBgAmZ7DLMVA8ir4X7jXnS4cArpZo,3024
561
623
  helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py,sha256=HuizbYsN5Nlihfzu4bfGuC8KSBbeIc6TVknMS4kpVJY,7149
562
624
  helm/benchmark/scenarios/vision_language/originality_scenario.py,sha256=1inr-klQEz08CM2GWqbYdy-AuXQmMhOAywAlA0lJHik,1029
563
625
  helm/benchmark/scenarios/vision_language/pairs_scenario.py,sha256=D3nNu3uU87eMDiMZZafuRTntXjwbqPaSDygUgQm45F8,9943
@@ -582,16 +644,17 @@ helm/benchmark/scenarios/vision_language/image2struct/webpage/jekyll_server.py,s
582
644
  helm/benchmark/scenarios/vision_language/image2struct/webpage/utils.py,sha256=UYe3PnxCKBYEbZTTEzdIoTY9gW7ZZAWmVISRIdItD-A,940
583
645
  helm/benchmark/static/contamination.yaml,sha256=rAfh1DqwyUcDtyzHPQ2QiUK5eY7QfuuRtBXpZMn4TeA,3171
584
646
  helm/benchmark/static/schema_air_bench.yaml,sha256=LapSMj3Ecl1Gp9XIwVCYfrerqS93GNErvp6oDnBCtgw,142378
585
- helm/benchmark/static/schema_audio.yaml,sha256=RD8XMxFlHJ3oM578SGZln_DTjOh72EQqQz_lexSTJHE,28952
647
+ helm/benchmark/static/schema_arabic.yaml,sha256=PoudK_u7hV2lalGRvYDI5b89tSfch6Dx_bn5681Um_c,7688
648
+ helm/benchmark/static/schema_audio.yaml,sha256=lVslZX7JmFo0ZgLU4n6amrs9DK8y43Ux0I9QyDUG-14,29119
586
649
  helm/benchmark/static/schema_autobencher.yaml,sha256=yb-NkF5w5R2YOg7RIsadNHJ_5G7lG1gbcDVq_25luEk,5716
587
650
  helm/benchmark/static/schema_call_center.yaml,sha256=i30aFzWqdOJRyAHN8vAzyHEX1v95DEK0TI1SMKTN4TE,9106
588
651
  helm/benchmark/static/schema_capabilities.yaml,sha256=HHy0aafhOaqL0C4TZw2mMt1Dce2_wuN062ORNZIbwYg,8733
589
- helm/benchmark/static/schema_classic.yaml,sha256=sK3yVQCrk3Tn3Kmg9WITBmJZI7AKVjmIY0f3zgH_t0c,104611
652
+ helm/benchmark/static/schema_classic.yaml,sha256=pRkfy6jrdslx5onmeCUdkRi9y2DQrcPIjVyZLJ7uKCs,104147
590
653
  helm/benchmark/static/schema_cleva.yaml,sha256=TDh-zcCzzTTs7bu0IWlY5dXYaTFhxly8sJIBGQdBvug,25401
591
654
  helm/benchmark/static/schema_czech_bank.yaml,sha256=jkTRQVmmbKkbB0zPH9AtYh6Lt33ymMInRBQnHE5lIOo,5462
592
655
  helm/benchmark/static/schema_decodingtrust.yaml,sha256=2VPxzcyKYea7mx-qmswyVRjPfVatjVH4Rs3OU82mgII,15670
593
656
  helm/benchmark/static/schema_enem_challenge.yaml,sha256=ZDcOfonL0z-ehsW5OkwaQOeiG1jLPk_toN8s2jhVIdM,5540
594
- helm/benchmark/static/schema_enterprise.yaml,sha256=TRYP0uNKi_Ln7kKIRYMqbOnGBlf7hF7aiE4dn8OVu1w,11040
657
+ helm/benchmark/static/schema_enterprise.yaml,sha256=W6eP79bBhKsvsxD8ve-lC-ELDtPXyGmRJ2Z35uK9pLo,11969
595
658
  helm/benchmark/static/schema_ewok.yaml,sha256=MluPnZSy22wZLFB2pR7ycBRgUSvIUsqvq4qM0Vk2ur4,12113
596
659
  helm/benchmark/static/schema_finance.yaml,sha256=I5-rcZmYpfwS9jVsZM53h6Iv6Um33IhQqt-LUrc4_GU,7165
597
660
  helm/benchmark/static/schema_heim.yaml,sha256=EK5F51C6vDZtbVFKqo5GDIi4tG-sfdVm3XcYpfthqNA,44396
@@ -599,42 +662,46 @@ helm/benchmark/static/schema_image2struct.yaml,sha256=cD1X99YcPI8BMAnNfDmXlM-FN0
599
662
  helm/benchmark/static/schema_instruction_following.yaml,sha256=mYLpMv-iNtsmrv9ewfN9ceDOBBg8nSxOWfc6ByATmIk,6056
600
663
  helm/benchmark/static/schema_legal.yaml,sha256=RpoFOuVSIowNgxlPn3UMfJC-68RFr3CGDciUGLPfVqc,28806
601
664
  helm/benchmark/static/schema_lite.yaml,sha256=rFSoG7zGPNOtKkJyGgOViWf5WJbMiJMAXrgmqCAi9X4,36611
602
- helm/benchmark/static/schema_long_context.yaml,sha256=sTz1CWqsXRnR5yAluWzJZJJmfnesV8MHG03wD07LiMo,8430
603
- helm/benchmark/static/schema_medhelm.yaml,sha256=l31CYuop0hkPaSmwYMMq4DxSCyeB6LRnEAPQZAQF2gE,42013
665
+ helm/benchmark/static/schema_long_context.yaml,sha256=3YjlNkQBgp4hS4PE1EjZvjpvX9v4QjaBPALtOYLpPCs,11486
666
+ helm/benchmark/static/schema_medhelm.yaml,sha256=84BrIengbq0m42ICWvyEWoYtdERR-8J8-8QbPOqUzvA,50747
667
+ helm/benchmark/static/schema_melt.yaml,sha256=mmPqwDa26DVZXsRJkmKQSyD0OStvjlxaMoSPM25SpD4,47494
604
668
  helm/benchmark/static/schema_mmlu.yaml,sha256=KI3XnzEwBRpzfYGjP77yKL-hBklEg72D3vL0kVl1BeI,49666
605
669
  helm/benchmark/static/schema_mmlu_winogrande_afr.yaml,sha256=YIVYf-mOFPq82UVBdMhnCWNOr4sV8Oi3-ozOszJ2tWQ,40143
606
670
  helm/benchmark/static/schema_safety.yaml,sha256=7RfZDX4wr8Xr1BJ149ZwmplPzPkNL0-BKbEZuzUsl_0,9278
607
671
  helm/benchmark/static/schema_seahelm.yaml,sha256=9XF9Rlr7I-g-uW6R0LNh7Xg52Xs3_058QybXEiN-hnM,28296
672
+ helm/benchmark/static/schema_slphelm.yaml,sha256=3avOfp-ZEmVRGei3_M_WX6cSP5hQjbfHsDr1XrjayMY,5294
608
673
  helm/benchmark/static/schema_social_audio.yaml,sha256=Nj3ORXDT4RHD52cyo1RHfueWwbhqp1qW06TaVJ2lUfE,8653
609
674
  helm/benchmark/static/schema_sql.yaml,sha256=8rRff6p_i1CsH7oDbUjau2qRWbLGspuM1Hy-g5pOQiU,6047
610
675
  helm/benchmark/static/schema_thai.yaml,sha256=yJUrevvgTJ46TpyXfNecW_B9urh7LPwSbBi_mT4ZngA,8348
611
676
  helm/benchmark/static/schema_torr.yaml,sha256=9R6HgT9ZuCnbMdhYB-pFect9apwEVuLEr3R1fx-Txd0,14583
612
677
  helm/benchmark/static/schema_tweetsentbr.yaml,sha256=DwHE5Y2STJPDT0fFNm-GPFXq_n3DStQ1ubzhSu4xsoI,5453
613
678
  helm/benchmark/static/schema_unitxt.yaml,sha256=9FQhoueYNNYQ2xMuJ2KHzpg_9-_ZhZ9efk6jtTQ3tlc,11855
614
- helm/benchmark/static/schema_vhelm.yaml,sha256=_Yr04KPL8T2ZqOcQiXnUDOqxcuMn1bjZGCeOFSjbbEM,33974
679
+ helm/benchmark/static/schema_vhelm.yaml,sha256=0slYep2eepUefgtK_m4iSS785sHdJzljmO-kwDRriK0,34262
615
680
  helm/benchmark/static/schema_vhelm_lite.yaml,sha256=4I68Em9q5wW8sFzj5GCJz8m49fBEuMyVmSZM0-wbfOk,4024
681
+ helm/benchmark/static/schema_video.yaml,sha256=FkpI5Slc4w-ty4hns82ArXIvTdqppWDnkJSpIp74QN4,9713
616
682
  helm/benchmark/static_build/config.js,sha256=o98g6QSly1NAfqhYWbU4lEoZB4LEpIrePZtmimiuoXc,165
617
- helm/benchmark/static_build/index.html,sha256=_t225NmMVglYdTTKPzwQ7Ab-cq_4g4oJgYbfkk3F2Dg,1149
683
+ helm/benchmark/static_build/index.html,sha256=hlkvPO8WVcvIJXentHj3Kn5Cd3QwOoi7OqRAou0pRVQ,1178
618
684
  helm/benchmark/static_build/assets/air-overview-d2e6c49f.png,sha256=0ubEn4J0T51-jx7IlwjaEGSrofZWlW_e67MJw47Ujzg,733055
619
685
  helm/benchmark/static_build/assets/crfm-logo-74391ab8.png,sha256=dDkauL_wJR_Luu7L7pltphS3a9HSLjDkpVLa6C9vcA4,62712
620
686
  helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png,sha256=Pl46pKbC_TU3L6kZQ_3G-0wTseluAhIYwb3EqpdQAjQ,1344452
621
687
  helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png,sha256=LtVAC4OgcWgMAob53rTrf7cRDu-O0z85ZOGGj9wR9hw,86133
622
688
  helm/benchmark/static_build/assets/helm-safety-2907a7b6.png,sha256=KQentq_1e3uGwiWMViAPxHu2XZ60gqFgovP3UWTyMmw,72312
623
689
  helm/benchmark/static_build/assets/helmhero-28e90f4d.png,sha256=KOkPTf-q28PdvGOBp1G5O4q1eWUJjuij3z2h_SUUf8s,55314
624
- helm/benchmark/static_build/assets/index-262903c1.js,sha256=mnPpe8lHsSWqq9GivvCxMwPVlZusXFP3AQChU3-bDAs,95853
625
- helm/benchmark/static_build/assets/index-42060d71.css,sha256=QgYNcW4kJWHl4GN2T1ep6DTI9tgFbZoj3MXmwTg3sfM,489884
626
- helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png,sha256=Pd_NZfAf1ZeU2BIGx9zNT6WmypZNP2bk5z5AxDkbwoU,270625
690
+ helm/benchmark/static_build/assets/index-b9779128.css,sha256=uXeRKCUzQAC32ofNoaK3-WC7kRWR--KnR6--1m9NdQA,491471
691
+ helm/benchmark/static_build/assets/index-e439d5e1.js,sha256=t7AnJSBjGs43kxIev2uLVumaInyBUxad9KVtvA86oUw,124597
692
+ helm/benchmark/static_build/assets/medhelm-overview-eac29843.png,sha256=6sKYQ79cN07-cUsnt-JPsdoVwUBWu5KxOaHWSdwjdgA,284408
693
+ helm/benchmark/static_build/assets/medhelm-v1-overview-3ddfcd65.png,sha256=Pd_NZfAf1ZeU2BIGx9zNT6WmypZNP2bk5z5AxDkbwoU,270625
627
694
  helm/benchmark/static_build/assets/overview-74aea3d8.png,sha256=dK6j2Nn3j9O-FMUIVRT5HGBpR_GL78vrKi8oHdG1eaI,74685
628
695
  helm/benchmark/static_build/assets/process-flow-bd2eba96.png,sha256=vS66lq700aPEKTJR7maMrmepAyBZySaL42tBNCRjFWA,190822
629
696
  helm/benchmark/static_build/assets/react-f82877fd.js,sha256=ijg4n6eANaZKXPWIVTQITqrtf-zzicjslJMm6DniDkA,275149
630
697
  helm/benchmark/static_build/assets/recharts-4037aff0.js,sha256=SP08CFvsw8cMMMMdqcXvsLviuOxkAhXGwvUIMvYUdxk,432466
631
- helm/benchmark/static_build/assets/tremor-9cefc3c5.js,sha256=5iR--BuAQHnEFO_jWnh-3hG34ezpt9LRJkTZNHc__pM,293015
698
+ helm/benchmark/static_build/assets/tremor-38a10867.js,sha256=prOrg5S4EeKHSd6RkgnBIbVfXIUq3xjeVE0MRdqvenI,293019
632
699
  helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png,sha256=FDfWcwGcJhJco4qmZli_ROomLiASrrnsX-wtKSDvMkc,542231
633
700
  helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png,sha256=oco_P6kwqp0cC3YaT_2H2RhJ6p1sh3sEQq3R0RA_cT0,71934
634
701
  helm/benchmark/static_build/assets/vhelm-model-8afb7616.png,sha256=ivt2FhDk8dwnzp1MAle5WfbXzht_Mxg4rpy-xHRybjs,180285
635
702
  helm/benchmark/window_services/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
636
703
  helm/benchmark/window_services/default_window_service.py,sha256=HlLI3be8s-GNxDygNGrvo9exEhbrO8Vtr3w0rnSIx7M,181
637
- helm/benchmark/window_services/encoder_decoder_window_service.py,sha256=4_WCz6NpaW-71OoUCpuYgSbRbYhV4fmB3wSg7kEZb20,2155
704
+ helm/benchmark/window_services/encoder_decoder_window_service.py,sha256=wfdydJY6AmpYCfAv5PQu9D6nFXbuxIRum7Tsv0DemJE,2148
638
705
  helm/benchmark/window_services/ice_window_service.py,sha256=snyIWVeeknf202_pzBUmvPcA7UcN_FKyIpCGpO2CmFU,1100
639
706
  helm/benchmark/window_services/local_window_service.py,sha256=-6wlg8gN_dN80lptRWJQsPALCK6W80-KHA7gghs2-5M,5292
640
707
  helm/benchmark/window_services/no_decoding_window_service.py,sha256=s_i_cqIuU9p0GDRIBApaOHzjH7gHrBPTJ2X5NEcN33Y,1375
@@ -651,14 +718,14 @@ helm/benchmark/window_services/test_palmyra_window_service.py,sha256=u7xb7syXCxj
651
718
  helm/benchmark/window_services/test_t0pp_window_service.py,sha256=rmoMW8YsNpD_zC-GBi6M5GugT_lT9lfn5CbwNbr7d7I,4088
652
719
  helm/benchmark/window_services/test_t511b_window_service.py,sha256=zmFGL4Nwg3xQ7nRe-IEkl37wx59C33xBUS8qKHqBQeU,4091
653
720
  helm/benchmark/window_services/test_ul2_window_service.py,sha256=RhIK4i9XaUfgeqTZEEXxyqaIxdyu29BRKb0pBl7orKk,4151
654
- helm/benchmark/window_services/test_utils.py,sha256=Lej1zx3q-o5C4uhIIsAbexJjNMobY--c0wy8epXvfOk,3406
721
+ helm/benchmark/window_services/test_utils.py,sha256=O1jHGB0Dn0h03ayuosF_8AtikIe8p50d5HcfzT99rBU,3301
655
722
  helm/benchmark/window_services/test_yalm_window_service.py,sha256=PJqw2ySLOMg_iiAzJGzj-1YOrDbxFkmP6wjiDcj1RWA,4391
656
- helm/benchmark/window_services/tokenizer_service.py,sha256=RNznJBAxcCUMCurb7mbraZULx_ZtB0G7IxbrnUe0Urk,865
723
+ helm/benchmark/window_services/tokenizer_service.py,sha256=rf6VAZkPRkwH-KKxXoQnfQ2uozC0_A_9egGPyk1P0E4,755
657
724
  helm/benchmark/window_services/window_service.py,sha256=y6BthPY1V-ugmYfaJElm5Wfy3PSgoJLj10vHcXZZGNA,4727
658
725
  helm/benchmark/window_services/window_service_factory.py,sha256=T55F0Y2jiOYxUHHZxT4YX4fFXY5gfFhn56zIwUBhc7s,3423
659
726
  helm/benchmark/window_services/yalm_window_service.py,sha256=EwwCoMpr9WVLhCI7OI_7tmZHQfTUwn9FFWjbhIBFRfA,1089
660
727
  helm/benchmark/window_services/image_generation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
661
- helm/benchmark/window_services/image_generation/clip_window_service.py,sha256=2JHld8GiR_eIQyHMPSN8K2VOswmKJEPMPJLsxlLpU-Q,631
728
+ helm/benchmark/window_services/image_generation/clip_window_service.py,sha256=bhCZXzQDm2fEDKEslWDzkwPihQgmZS0DLVo__Ll9aLI,605
662
729
  helm/benchmark/window_services/image_generation/lexica_search_window_service.py,sha256=uDCUclHvo8toxSTMztK3zG7Eb-hjueobGQaBqPqVJlk,454
663
730
  helm/benchmark/window_services/image_generation/openai_dalle_window_service.py,sha256=8U2qDrUB1QJHRy5STV5FywkeVm6qfNOaeVBkMQhyMGc,453
664
731
  helm/benchmark/window_services/image_generation/test_clip_window_service.py,sha256=domn2MRduHVAdruSUuGPDIGKyDrh-gFxW-fZaBYR7cg,1430
@@ -667,20 +734,22 @@ helm/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
667
734
  helm/clients/ai21_client.py,sha256=RAXQufajYnxr3b_1Hl-wAZkeE_j6O8zX-vngWEits6c,8158
668
735
  helm/clients/ai21_utils.py,sha256=mlg3h615kyckccGZv9rqsP4Y60O3XpwyE-UURRMrxII,471
669
736
  helm/clients/aleph_alpha_client.py,sha256=BK2eQIHYMxLMsZNWld85ZCj17JAoy5lU7rHuSBa4fOM,4981
670
- helm/clients/anthropic_client.py,sha256=s3eCwHh8mbhxLi8up1WtQWKkUsHJa-LO44prNd7XYFc,34059
737
+ helm/clients/anthropic_client.py,sha256=R85gLYrheN2YWSGTnf3pkYTjCkTl300ktdlGLe1_1-o,36181
671
738
  helm/clients/auto_client.py,sha256=J5bCxIDZJUdV1dCv_EtbvwPzd1p2Ogtg207vpb3PhgI,11624
672
739
  helm/clients/azure_openai_client.py,sha256=mZ0udOAjadp7ZyE2KEtq8XuQp45eHlX_qM_getyzbA0,2009
673
- helm/clients/bedrock_client.py,sha256=RjkYkWCHhGFA5oB_Bry6K-WHryopkZtL5Zfh48gS34s,12145
740
+ helm/clients/bedrock_client.py,sha256=sXxzNTs3pwVIwvir5lyJWLRajI9p2lMiJq21XsZ_FZo,12267
674
741
  helm/clients/bedrock_utils.py,sha256=8ZZfyOuZkgxL_naJ-wwBnH4GKv425fu3MfyakGHxeb4,3764
675
- helm/clients/client.py,sha256=InjCQi62TWhWHmfyi-mC3fSAVztd-YDyfB3BkpacHXk,9002
742
+ helm/clients/client.py,sha256=fWJ_Eg4NyhPqlvpDvM7AjWN7cr2LU2uWdsnENLJXlTs,8963
676
743
  helm/clients/clip_score_client.py,sha256=ct3GHZ2Zh3fGwyvQ9DyoIPT6PwDPI-nUaFkUFuc8PIE,1622
677
744
  helm/clients/cohere_client.py,sha256=edQO5raoJYmYzfVREqHhNvjTcqPevG0M8EPMLOANqXY,10975
678
745
  helm/clients/cohere_utils.py,sha256=aYmj60m0e9RF9BIdxp1vmA-uZv17TEALw0dbgTUSpCc,504
679
746
  helm/clients/gcs_client.py,sha256=1sK5x5uWtThgz9gqBLaA8oyiXGD_9nn1WyfMzJRyPQ8,3231
680
747
  helm/clients/google_client.py,sha256=mIaUzK7GHCa9pqK1BEVhdt6dZsJfHv1Qdsf3I0Ayq8A,2912
681
748
  helm/clients/google_translate_client.py,sha256=TgiQEscjOae58Ptgp9f4n0LXUtl1Jf6v9BI-Z1_wcuw,1304
749
+ helm/clients/grok_client.py,sha256=SbVB6AduTwfElzUgEMnQW2kQUFVTCv4TpPPJvElQEe0,1127
682
750
  helm/clients/http_model_client.py,sha256=_F3_y2UWqbzESQdzV0FMEsECIKjporVSAW6iUQhJ35c,2818
683
- helm/clients/huggingface_client.py,sha256=adnFKZni9DiFDDVDkpQjWXf4HLyYLvpzy3aB9PD3HyY,15428
751
+ helm/clients/huggingface_client.py,sha256=oWR4yNFk28nrnB3IoznrhcEuU0pZkNywP0E82z1-NGM,17671
752
+ helm/clients/huggingface_pipeline_client.py,sha256=ivFTMNHBwwIUjkeOHkl-veZi5nNAjtnkYvneRFWs-6Q,6154
684
753
  helm/clients/ibm_client.py,sha256=4W4fbjnDNjXrP4gVwSfBHPus0QcqFOQzFvfaST1BE1Y,9701
685
754
  helm/clients/lit_gpt_client.py,sha256=pgLfSvusNpdj8F5DVxzQdHxTDRNX4RVt6unegao803U,6229
686
755
  helm/clients/lit_gpt_generate.py,sha256=8DdBE9ReQ00NbV3KMFYc--PlO9X-HMOR0Rhm5CADWEA,3103
@@ -689,10 +758,11 @@ helm/clients/mistral_client.py,sha256=ceM8KLAcniAqK1BNVdUGzqy4av2SEEau6PVmPivxc0
689
758
  helm/clients/moderation_api_client.py,sha256=I5pYWRb2MmcLDYrScnC3P5N7OUFzQiVQ828_hf7zjM4,4719
690
759
  helm/clients/nvidia_nim_client.py,sha256=Z1UAqR2jHacIO_QGqQl1JUZ_82JiSPstBOtj6xURmQk,902
691
760
  helm/clients/open_lm_client.py,sha256=qFgYqlV_3UiW8WJKz66lLqRqg2jt1qtJ1bHMRAtBn40,1749
692
- helm/clients/openai_client.py,sha256=BImOqF2fVdxJrgI74KCsXeUffABFc_pZ5jgSK5NdZ-o,25936
693
- helm/clients/palmyra_client.py,sha256=U8iD3IENbA4iEpFLNKc8O2UFNYxffgt1QrBvSzctsWs,7151
761
+ helm/clients/openai_client.py,sha256=prSASL8IE3XO0_CaAuZ45iYSITAMQIwtD2q7UqM4qjA,27803
762
+ helm/clients/openai_responses_client.py,sha256=aixsZwO_swP4dhOhJPe1ZcOIav3rxmovPGY0Ug6s5ZA,7308
763
+ helm/clients/palmyra_client.py,sha256=4AaZcV2tPHU4HJ9FWSkOY8_C9ndEckH3PH715QxJQ8E,7086
694
764
  helm/clients/perspective_api_client.py,sha256=o_1FFTCrTny6AZ4EJTstX1H9t8SQSQ8dvhi321RTcL4,6105
695
- helm/clients/reka_client.py,sha256=8PW-NFsqohRQMR-JNWn9xhlG0YfghO_X-QQAnSt9Vqc,8341
765
+ helm/clients/reka_client.py,sha256=hA0tq3Hc9669q2sYa4Jr5yWy2NAbvoFDnVqQ6vds62w,8334
696
766
  helm/clients/simple_client.py,sha256=55S_y1eWD1bjktcG21Vs8G5bF6QbKKwmJyqs6lCUJeI,2048
697
767
  helm/clients/stanfordhealthcare_azure_openai_client.py,sha256=NGbeI6sMenmgqPQTWxYF3C1Aen29LybRcHcsmS3Jqmg,2059
698
768
  helm/clients/stanfordhealthcare_claude_client.py,sha256=ShhbLttPDRa-Pnvr35_2WmVx5s0XpsJMGzu5qhzLoLI,1020
@@ -702,20 +772,24 @@ helm/clients/stanfordhealthcare_openai_client.py,sha256=Qyl8voGz1hJPqT6g4PunMuN9
702
772
  helm/clients/stanfordhealthcare_shc_openai_client.py,sha256=V7K4KZaSjIiE0FkoY4qy6ifJ8pUiNa3vBcWiDsIwXFI,1343
703
773
  helm/clients/test_auto_client.py,sha256=bc-rsMJ8JM0MFnQ4B48hBJ1jL3RtRyVvmPwOgzF2mF8,3155
704
774
  helm/clients/test_client.py,sha256=T27UsIPWsbE1JK_8DN_DW9LkEcIGRbgDjio14YOIAb0,3854
705
- helm/clients/test_huggingface_client.py,sha256=x2NjMuIrinfUy0wQ1S6F5cYZVr09YfvN6LfhWmyGNAM,3388
775
+ helm/clients/test_huggingface_client.py,sha256=8Shzrf1Pad1UsiUAdeOSqsTPQaay0CrWXmdNeIfrJ2Y,3418
706
776
  helm/clients/test_simple_client.py,sha256=G0JRQX69ypQN2VxhlNQXs5u2Tdtkcl_aeHqudDUVKi4,702
707
777
  helm/clients/test_together_client.py,sha256=kyBLu-2i4EJyuJm5ft0yg8W-H1IqmULRXggEbChuxdo,6178
708
- helm/clients/together_client.py,sha256=xA_a0R0adb9vNkMfrXOIwwdpGoIPa4Nso2tXT_2YSVg,23215
778
+ helm/clients/together_client.py,sha256=ByImeitpWRhXpZ9U6c0Kol1D8X7Fxno5xgo6D7sZYOY,24201
709
779
  helm/clients/toxicity_classifier_client.py,sha256=AI_FizxMurubTIyeceRdkixSnhWQbcD-oEEONj5ve7o,464
710
780
  helm/clients/upstage_client.py,sha256=iSL1G8G3jWSbrpacz4I0l6Lwc5T01fsLR-wZzF39ftM,679
711
- helm/clients/vertexai_client.py,sha256=haOImGAaYwCyxr4__feG6nHUeHRbCV6ExE6Kp9aKtWs,22665
712
- helm/clients/vllm_client.py,sha256=YLIxGoQ_ZXejA4nfVpmFE4tmHROEFxEbFsV8Ba25Eac,1658
781
+ helm/clients/vertexai_client.py,sha256=AxZRpZTRrzxwPs2xwKTgHH0eh7WEmHSS1ArTZwI_q3E,23268
782
+ helm/clients/vllm_client.py,sha256=xmXf35WX2oOZhpQnRxeooXGshENySOHZCUQ1E4pbQbA,2647
783
+ helm/clients/vllm_granite_thinking_client.py,sha256=fds2i8LUG78OJYke1uYdDy6XRFqE3rZgSornFjzu4Sk,2172
784
+ helm/clients/writer_client.py,sha256=flKLeMbFkyGfNmv1ozZGU4dxNy-QF5bFJF0mGHqpU3c,4467
713
785
  helm/clients/yi_client.py,sha256=nC60d2HiUL2W59FTne9tWmZ9bGGY1OvI7Ob3Ng4wSPE,750
714
786
  helm/clients/audio_language/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
715
- helm/clients/audio_language/diva_llama_client.py,sha256=On6oNnGN_zZVkNq_kEFh4BgeNgvbNTbHCx7WRm4LYfI,4778
787
+ helm/clients/audio_language/diva_llama_client.py,sha256=Bvcf4wE7yMZlqETgKEMtCug8-2fQI8QCDdaGWSeQ2X8,4864
716
788
  helm/clients/audio_language/llama_omni_client.py,sha256=OCak716q97uEk9CBXQqnmUsbLFR-dddMzg5eyIZ4gzE,8718
717
- helm/clients/audio_language/qwen2_audiolm_client.py,sha256=cY2mScgTWr_No_MFZ8bZn5wKlNd9ae_IndShlegLtrs,8831
718
- helm/clients/audio_language/qwen_audiolm_client.py,sha256=_SHJh-0R3wj0qWJp3HSO7nPrDtr5G_nH3CaRSofFBxg,6236
789
+ helm/clients/audio_language/qwen2_5_omni_client.py,sha256=ftAVtOG0azvRQEcFjkSSBMU6SDk9Bi8WIks6o6UCbKQ,9684
790
+ helm/clients/audio_language/qwen2_audiolm_client.py,sha256=s9eH8fnVgw5xV39b_8AGt6IyNN3q9Uhcx6HZVxt7TM8,8981
791
+ helm/clients/audio_language/qwen_audiolm_client.py,sha256=RvYweXANEyzhHYDx38H10F0ZEFaL8kj7n7TZ-UrRmZs,6338
792
+ helm/clients/audio_language/test.py,sha256=FrKpirOwJW1__E2egq4VPgsTrgiSHZHBwfUCvxNjC0o,1969
719
793
  helm/clients/clip_scorers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
720
794
  helm/clients/clip_scorers/base_clip_scorer.py,sha256=NfXe79g6M4Wype3Xf-oXxscaUgjbZLmy9dRnBaLiWwk,695
721
795
  helm/clients/clip_scorers/clip_scorer.py,sha256=5KzYTrGuy5zA8yHX6c67Is98HLkqQooWhioPxHNLJ7s,1932
@@ -748,12 +822,12 @@ helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py,sha256=IpDLoPBDA-
748
822
  helm/clients/image_generation/dalle_mini/__init__.py,sha256=4RmnjfGTmgYaWsQmaDkOHxgo0Wxr9qqwtpMBC_5XeGg,112
749
823
  helm/clients/image_generation/dalle_mini/data.py,sha256=1unTc4lkUZ-6A2DfcbcglGtnE2KP3OuL4YWFROlsEQo,17622
750
824
  helm/clients/image_generation/dalle_mini/model/__init__.py,sha256=fyMDjpuzHxWjF5Fk9Rkfyn7KpvFAwxyRCJFoA2RDPdM,428
751
- helm/clients/image_generation/dalle_mini/model/configuration.py,sha256=8DvL27TPmbntElIy7FrOzlSqWdlkX3R2eNVxiV_QwOM,7861
752
- helm/clients/image_generation/dalle_mini/model/modeling.py,sha256=YVbiEQSVNmN16Sg1Sn-qR9SVXYfn6UD1-eCt9QcsRwQ,69702
825
+ helm/clients/image_generation/dalle_mini/model/configuration.py,sha256=AAeqmSiGOPd831VrytkWMbSSAv-4uEGk190svHsUGNU,7859
826
+ helm/clients/image_generation/dalle_mini/model/modeling.py,sha256=w9TSQYBjOygqj-QCQSqjzujahGicXRtnJObtXrCpCEQ,69700
753
827
  helm/clients/image_generation/dalle_mini/model/partitions.py,sha256=_fDpk34GL6NhNecHuP78y_gmKpWjbfw3fxMCWVEO4pc,2721
754
- helm/clients/image_generation/dalle_mini/model/processor.py,sha256=oTx5KHXKhZjVYaS0rmtlzCIbWUTJLh0plLNUWl8xxZ8,2406
828
+ helm/clients/image_generation/dalle_mini/model/processor.py,sha256=2JvF8XmYMiFrxxi4YcGDF1JrTFQPqBXfzYmb_ylCRls,2404
755
829
  helm/clients/image_generation/dalle_mini/model/text.py,sha256=Kfba8JdO2LrSmCVlQtgc7J2kSordCgjeg7WV9V45B80,7302
756
- helm/clients/image_generation/dalle_mini/model/tokenizer.py,sha256=SnPUzrfZXSAXXcQRCR8Ykhn5hJfUB3p5wNuriW5GWy0,245
830
+ helm/clients/image_generation/dalle_mini/model/tokenizer.py,sha256=fggtXzlh8HHHgT0T0d78KX6i16zFApnpkp7xOMAuD6c,243
757
831
  helm/clients/image_generation/dalle_mini/model/utils.py,sha256=clu2IiIpAT0DzTc2HvmI0ySnETFsJtpi7tocPkqOreY,1171
758
832
  helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py,sha256=01pV_QWUmcIpj5kBVihle_VGrJyw2AmV3QuhWASds2M,66
759
833
  helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py,sha256=4q39kdTUxeW55SN8NNkA9MdFZtH6rWssN8XauuOwyi0,1213
@@ -773,14 +847,14 @@ helm/clients/image_generation/mindalle/utils/config.py,sha256=lh8dXvL7ctKmuYEbeT
773
847
  helm/clients/image_generation/mindalle/utils/sampling.py,sha256=soTHaJrN4FV1lDdh9HMveJs6F49UMK57Xfa0ccnHqI8,5029
774
848
  helm/clients/image_generation/mindalle/utils/utils.py,sha256=ESugpzG-_73GKl07mj-8o-_nim_FOICxfYkczy3s9x4,3119
775
849
  helm/clients/vision_language/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
776
- helm/clients/vision_language/huggingface_vision2seq_client.py,sha256=hTywh5nM95BmPoDyKOSDWg9G3-QwLO3KZEJZVkmFroo,6478
777
- helm/clients/vision_language/huggingface_vlm_client.py,sha256=H7AE8mm506PkEcUO8VaLVtptHTwVX58nZx1A_BWdKzA,4968
778
- helm/clients/vision_language/idefics_client.py,sha256=hi1VCDBegHfBssmW0C62H3OX3U2ISVRhaSkd24gb1K4,7692
850
+ helm/clients/vision_language/huggingface_vision2seq_client.py,sha256=S4FDbSO917bUw3cK64xnxwH5HFH_Eb-w2zQ8ZL4eSSk,6588
851
+ helm/clients/vision_language/huggingface_vlm_client.py,sha256=OHV41AA-WZo_CnsHymwslgjDcVK0uHmIrvGbrxBDK5w,5000
852
+ helm/clients/vision_language/idefics_client.py,sha256=DURync-8rh2ccdlGDPl3NMgryBcMn5yCrrmFZisf5m0,7784
779
853
  helm/clients/vision_language/open_flamingo_client.py,sha256=QH6el-wkEl4PMZM9b3_H-o2PRaMvumGbN29ee9dmkMU,6519
780
- helm/clients/vision_language/paligemma_client.py,sha256=IU_T8r1RgpGkEAqabLKBbmoUOWV6c1a9_FXgiTy8exE,6835
854
+ helm/clients/vision_language/paligemma_client.py,sha256=K9MzXlgjXoiVafA8bbu-mKNt3Z9kq8v8AJL286DyQqI,6867
781
855
  helm/clients/vision_language/palmyra_vision_client.py,sha256=4elEdmwllMr2qzTzBdlRC8L5Ut3vOXFtanGGYrx4lv8,4074
782
- helm/clients/vision_language/qwen2_vlm_client.py,sha256=XQ6SB1mkpIuYPNZMQe6jkduvwQxMfrFtVnHKv3osFGo,7310
783
- helm/clients/vision_language/qwen_vlm_client.py,sha256=6rCH4gJMDyQHyjAE_GDIrLsInH_bvd6to-4RMWbRLeM,7407
856
+ helm/clients/vision_language/qwen2_vlm_client.py,sha256=jvh_-jyvFL4r3LPX-gWPCYHT503JtJ73FVHQS2KyQ2c,8325
857
+ helm/clients/vision_language/qwen_vlm_client.py,sha256=wNxEuYOrhjaW5s4vtdRxKvJ-LCTTGyKqiqD84j7H1Do,7565
784
858
  helm/clients/vision_language/open_flamingo/__init__.py,sha256=RTxnxjYnTmTZv-608o66_W74qmKLpEO6hx0cxaZaYv8,172
785
859
  helm/clients/vision_language/open_flamingo/src/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
786
860
  helm/clients/vision_language/open_flamingo/src/factory.py,sha256=4KRXLV5mOEZ34-Foq2zVgTye3sQD-Buz6NZTSp2X9_A,5790
@@ -796,29 +870,33 @@ helm/common/cache_backend_config.py,sha256=4u5A6BHNBmGnnrDNhCVgrdwhXQtyAbWcUeoo7
796
870
  helm/common/clip_score_request.py,sha256=WnNg89owDCmG7tyy8nnQL0RdKQLsUdMWiYH9XqqbGw8,840
797
871
  helm/common/codec.py,sha256=gTh6AwIQ0Bbul_QSnIO7eItwMZmYtnkIrG1jkc4GOL4,7100
798
872
  helm/common/concurrency.py,sha256=8THtHlCtXo5c8iCuz_UcBBdzZX6aiEALLc4u0M4SYL0,856
799
- helm/common/credentials_utils.py,sha256=O-57nUgkWLbZF0k3lsSaVGPPHj2_OYeVuCMe0to3bRE,1118
800
- helm/common/critique_request.py,sha256=yo4aRe-DEjudUmydthtpTj6LdhRXfZ3JZptxTkWzZ3U,3068
873
+ helm/common/context.py,sha256=0U5KNNKLHiiqjb8JVq03mninagEp9zTzFKP0He8o7A8,2788
874
+ helm/common/credentials_utils.py,sha256=BX_P6wUpLKA7Bg3Dztm7jVI2j4ls7H-h38UbmGMBt3A,1101
875
+ helm/common/critique_request.py,sha256=DZhJ_sY2IMluOxz-FeHvuEkA2Ujsx65HXT__7T3UxGk,3005
801
876
  helm/common/file_upload_request.py,sha256=OZeAW1_zsiNdXnWDwNNvhPs0b48TUmW_e4kzzCYmyiY,543
802
- helm/common/general.py,sha256=nMfHNPXyAAorAMmgDClD8r8XXeJcvfF0QXTP-FgH5PQ,11690
877
+ helm/common/general.py,sha256=TcdPXn_bgPFvXtFP2lJhncz4Q8SdTXnKOinHOTBsegw,12027
803
878
  helm/common/gpu_utils.py,sha256=pmLq6ipYNLEm28VxxSNeZuVt-gAw-WnYmBvxP1P1p6M,480
804
- helm/common/hierarchical_logger.py,sha256=EnKLnfbQftca08EJfjGEQb4tcnCKbx-JtwLnoCnhMQs,2908
879
+ helm/common/hierarchical_logger.py,sha256=qIbhwh-dlCcnYG10qTSMxIMM7_Q9VJj8ymDqnWlseuo,6151
805
880
  helm/common/image_generation_parameters.py,sha256=nsbuk_-BlRMK6IwP5y6BnTXbTRTOcvZ6uLblL5VHLOo,916
806
881
  helm/common/images_utils.py,sha256=8BsN0fd8pc0rh_TSDvippWhTfwmJJXKNF2zqKLB8cps,3372
807
882
  helm/common/key_value_store.py,sha256=D9ZBORzZncf3zHQOP4AuNbQnV8cZpO_kqHY1mDRugqQ,3174
883
+ helm/common/local_context.py,sha256=lpQSLqybZda7LDg5drYQrT8blWORvOOB4yXyCU9d8Ts,6493
808
884
  helm/common/media_object.py,sha256=1SlilnsrfZVVpfci1atin8hbREnGoNQwjBcNAH8RgBU,5151
809
885
  helm/common/moderations_api_request.py,sha256=3xTsErSsCr2PHD2jpdV1JglHaYHwP2Yqu25_JFtfa68,2234
810
886
  helm/common/mongo_key_value_store.py,sha256=G0TIWQcvwMjyXh4TnN6xJ462HKHUAZtQJJYQOrHK-K8,3887
811
887
  helm/common/multimodal_request_utils.py,sha256=n6HgTyHNqfGmU9qmVK-wxQzrkPZ5Wdh-lO_y_ln6VYc,2184
812
888
  helm/common/nudity_check_request.py,sha256=VMsujI_RBy5u_cGEk0teE4KyX1dL2Zt3Pb4U6LpBdSY,728
813
- helm/common/object_spec.py,sha256=_usgTDQULBF6_jy7C6m-9ZNVvNxbGoTE_CdGcSvBASU,4327
889
+ helm/common/object_spec.py,sha256=sKcEdggqRa3a8TovHAS4lf1LaahOFInvMl5DUF4tE6c,5186
814
890
  helm/common/optional_dependencies.py,sha256=Qam3QCHff8tuXbS-fCw-MVe-pK18gSvHw-uQoXXxT7M,616
815
891
  helm/common/perspective_api_request.py,sha256=WAVwtajNVmi5XJNsPcorGEAVrqkpPSk-Kd3b0hJghbA,2427
816
892
  helm/common/reeval_parameters.py,sha256=exaEucXnSI8a076uq_qhO3CTBztMMRoRzL_7v1N4adE,300
817
- helm/common/request.py,sha256=w6N1TmVnc6C1gzyFyhspU1nf5pOE4zBwdeGMFcsBZLk,9022
893
+ helm/common/remote_context.py,sha256=DzFMii9AN03CoWp1J3k703-7oQJYHwEf9TDV5YzM6v4,2825
894
+ helm/common/request.py,sha256=HWj6IizIwJm9_NigO-geira_rI6aqhj5CevQB694m94,9161
818
895
  helm/common/response_format.py,sha256=wIptA8FydZoRjMvO5SFIplgDXhwpZvZmFI-Bi-7mcGU,516
819
896
  helm/common/test_cache.py,sha256=j19p-qzv_98X_TMW4b39ZHwSJ-MX3p91PrkYumarS6Y,4870
820
897
  helm/common/test_codec.py,sha256=igL--k-2DwAy0eoMr8D9Xs8MOjBoT0LutbMPzDlTNkM,5885
821
898
  helm/common/test_general.py,sha256=c8Lh0mK8I-SfcMprq909B6zWRBxSBngq2nNL1L6-cYA,1788
899
+ helm/common/test_logging.py,sha256=tkb_QDPkKBfaEQ5Y8Xip9PgMYhqOFakcENqyzO5Mj2o,2681
822
900
  helm/common/test_media_object.py,sha256=SUWLfms_vkXNivRYM0ZT8AI3_2ru6GON5l-Hb-lk-t0,1661
823
901
  helm/common/tokenization_request.py,sha256=NND9ESiiDE0H8QRNpfHVjXS7MQfKKIwtVRKDIjPnnJM,3344
824
902
  helm/common/file_caches/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -826,15 +904,15 @@ helm/common/file_caches/file_cache.py,sha256=QfF1hlF8FQ-rcPn9Zyl6L0dOCokvYgd-dFq
826
904
  helm/common/file_caches/local_file_cache.py,sha256=NiXbat1BBGl5P27oERqSLFfhIHpYqA1IQrvE_N1sWR8,1944
827
905
  helm/common/file_caches/test_local_file_cache.py,sha256=ANb01ctUV-J4i1ab3l4uhg9Ce54U_56xq9Hayjt1WhQ,686
828
906
  helm/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
829
- helm/config/model_deployments.yaml,sha256=_zdXhMH50hqKFbtFymUFb_LmlCz3XR1-WARsDAH6ESs,135205
830
- helm/config/model_metadata.yaml,sha256=M-23M608OVESCUai6iBLBIT_17O80pI-YWkEYRax-gk,226327
831
- helm/config/tokenizer_configs.yaml,sha256=o7oX0jQXqKuoLC2z5YgdvJlcMcr15WtNjlqAkYLLDq8,32860
907
+ helm/config/model_deployments.yaml,sha256=sB3cV6io0NzUQXuKlA49-H3UzOEvWpFDP_MZ30gH0I0,171682
908
+ helm/config/model_metadata.yaml,sha256=0Ps6WlsgElxOpCHVGiWu7QfS0o3Ls4zi1iuwC8PTUgE,269972
909
+ helm/config/tokenizer_configs.yaml,sha256=Vq6MY2nplhYgiyLR98xCXBJWQgEpm64yenrskmkm2NI,40415
832
910
  helm/proxy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
833
911
  helm/proxy/accounts.py,sha256=gd5cKhKeqklf_cXCAISl65AUvZeD6afBNrs6WK3IBvQ,14764
834
- helm/proxy/cli.py,sha256=3bG-w71CsnPgVzN53aYWebAf0avBNJCVaxxDLupEXk4,8264
912
+ helm/proxy/cli.py,sha256=kEDoHpisFO0EJ0Wfm1FLpJdP9sXk9j8WCILEq42RKb0,8317
835
913
  helm/proxy/example_queries.py,sha256=EB2vVpAryOUAFiLrwsMiFz0zGl_UAQ8TJ9SkWngvsu4,4389
836
914
  helm/proxy/query.py,sha256=eftbiUICMh8QIHVs-7cLtv_rDXKeKdRPmwjLMu0TDxQ,645
837
- helm/proxy/retry.py,sha256=iLZmKATEJQa9jsSpOIx6YDRhmrA8G1Qm21cUxCuo2Ug,3490
915
+ helm/proxy/retry.py,sha256=o64BZsW2vwu2iewRA18wdsru2xC3eNBQ7WUw3IjC_5g,3698
838
916
  helm/proxy/server.py,sha256=Q4Mzts8mketktGVJ5AoOEA-_SGCue5QeOlK8dqPUuHI,10853
839
917
  helm/proxy/test_accounts.py,sha256=Vs1iOzTPN29LosDAAEs6IagQ3PccvutrJTlR1qNIcj0,1146
840
918
  helm/proxy/test_retry.py,sha256=db0owyGTThmIMhYWU_Eh1U-AJvQ-Wa9j_kRmC9DNjOA,1059
@@ -843,13 +921,13 @@ helm/proxy/critique/critique_client.py,sha256=ATZuXw77lejwtpgLg3Soy3VDyv8D8xetl0
843
921
  helm/proxy/critique/mechanical_turk_critique_client.py,sha256=OcppmFOMweBSfVTiLIICIwjvPpHHTkdu9fFUTaubitQ,574
844
922
  helm/proxy/critique/mechanical_turk_critique_exporter.py,sha256=taULrc_cIP0O9c5UpGz3l9DmWQadTVzN_v-qzTgMoyo,8470
845
923
  helm/proxy/critique/mechanical_turk_critique_importer.py,sha256=NL97joO5pRkcICRdVyG4kf9JhfYRaySsxRoZ7KWDYv0,5581
846
- helm/proxy/critique/mechanical_turk_utils.py,sha256=mKpUv4zz3s5ptzDY7UrwuI7Cr5HmNgSjPC10BnN9AL4,1766
924
+ helm/proxy/critique/mechanical_turk_utils.py,sha256=MUMcxMA08OXJTtgCX7ejGQQivMNF3Xfu4AAHkvuft9s,1766
847
925
  helm/proxy/critique/model_critique_client.py,sha256=QMFiMpALXnneumKbJpXOZDEb3lPPdkIaSCasmdXHB8o,12806
848
926
  helm/proxy/critique/scale_critique_client.py,sha256=B4povtceyfal95eE3N7em9cC_B5Vy4jMrHXcsXc_5m4,15889
849
927
  helm/proxy/critique/surge_ai_critique_client.py,sha256=HnzgAoF4Du9Me0GS_lbNaozZslS4a2OZx735gh-coo0,8357
850
928
  helm/proxy/services/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
851
929
  helm/proxy/services/remote_service.py,sha256=zehXO0JYIR6fIgqSZ1p7icPBITYPYfjgTX1ZbxiN1dI,8806
852
- helm/proxy/services/server_service.py,sha256=vIf0GxDRuHFmCQHpgn5wYURlBkMNfF9e0jMQitc80-w,10691
930
+ helm/proxy/services/server_service.py,sha256=VTDkULezp2vniGKfH2fP7PHf_DAtsh4qXwKQ0tD_Wxc,7357
853
931
  helm/proxy/services/service.py,sha256=YFG5ZlBYBz3IdSVRKDIKVlAmA-oLjFCeBHE3iIe_SU8,6020
854
932
  helm/proxy/services/test_remote_service.py,sha256=xzkyptctXw3y5d1fgbidBMyw8B4rILZStC_C-hLgLUc,6643
855
933
  helm/proxy/services/test_service.py,sha256=oDYen-71iwZ6YMNBVbVSdEFsH6GMvZYw5tS5Eg4YHjY,8987
@@ -869,13 +947,15 @@ helm/tokenizers/aleph_alpha_tokenizer.py,sha256=Ofc5thTfW_eb5ztiU-y_0p6e2PIGbHMb
869
947
  helm/tokenizers/auto_tokenizer.py,sha256=Of-T-CFOhLAjjU45T1hnrEPG_k_hzPufuDE7FRAcSN8,4251
870
948
  helm/tokenizers/caching_tokenizer.py,sha256=BwcyVzG7vy3R2O0UgbNxNP2nN4wBnsvpG_9mXQuDYfw,7300
871
949
  helm/tokenizers/cohere_tokenizer.py,sha256=6WwHIt7SsICmYR2QQpwDJ7pfNF8VWrFHFxF5Kynq6aY,2116
950
+ helm/tokenizers/grok_tokenizer.py,sha256=t_cl1BnjRNCW24mU3Z6eAMhh-86FnCcSo-jB2AhvlL4,2142
872
951
  helm/tokenizers/http_model_tokenizer.py,sha256=J5Myg6JVDNgHMN7XOHwGV3WrhilUZ9Sw_FrgO4frYuY,3124
873
- helm/tokenizers/huggingface_tokenizer.py,sha256=fpKwSnZl94AnXQybzJhVnTda5zJnGsjGphKrlPFa_Fg,8726
952
+ helm/tokenizers/huggingface_tokenizer.py,sha256=P2ri4n-SUWB9ShMlxlJ9kO-mPmbSTizMGwAf41JE5ds,8734
874
953
  helm/tokenizers/lit_gpt_tokenizer.py,sha256=0c6KDeLNHPd6h27SXQvkUfmrCSLYa1kQY1GqCHVfhvw,1675
875
954
  helm/tokenizers/simple_tokenizer.py,sha256=6_NROqVbygs-HRA7bYAZluN4YB5gUhVaRsYQeRTjA1E,1147
876
955
  helm/tokenizers/test_ai21_tokenizer.py,sha256=V8orjdKxmEV44VYoZ9Sq5E7CIq2caNnr6vjdk0T_w1A,1646
877
956
  helm/tokenizers/test_anthropic_tokenizer.py,sha256=h7sJMRv_O2yAuEzbrXLJJIo9Gy8wkTycc4gu6UFvDaw,3937
878
957
  helm/tokenizers/test_cohere_tokenizer.py,sha256=15z2GJtZ-VlrliC2_Fk5DIZhQYFkJS7J73fjxYMf8YM,1431
958
+ helm/tokenizers/test_grok_tokenizer.py,sha256=b094C_M2a1zNM3SsGzp9cNNm8aDmmoz1kFbPkubbVTQ,1212
879
959
  helm/tokenizers/test_huggingface_tokenizer.py,sha256=7OB2d0PaCp-qmGXVt0V3yf0ciilN3Kd2qnAYprWRl64,6324
880
960
  helm/tokenizers/test_simple_tokenizer.py,sha256=vUNdcnJqZV99-E8H1rwUH85AQPJ2HTnDr5DrZ_-zRL4,1219
881
961
  helm/tokenizers/test_yalm_tokenizer.py,sha256=8IeJM3X61p3ygBfK_bJtPh_xOJ83IluaZ3UM2xTtbEY,2492
@@ -887,8 +967,8 @@ helm/tokenizers/yalm_tokenizer_data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQ
887
967
  helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py,sha256=1ZcPL3srfk031LmA8bEdPcIraAPnHGiYi_CqTiJSTlc,904
888
968
  helm/tokenizers/yalm_tokenizer_data/voc_100b.sp,sha256=LmPD0_OIOXi8dWuNjXUYOSPhf8kPp2xhvK-g3bXcwrQ,2815034
889
969
  helm/tokenizers/yalm_tokenizer_data/yalm_tokenizer.py,sha256=kH5Qig1_6r_sKbAHinX7C83tqBUoTwbe-gGZCbGVkko,6389
890
- crfm_helm-0.5.5.dist-info/METADATA,sha256=xmTkUJOFSJ_wrES6pixgfpOjzgS4eJlWaEpGpidNEo8,22210
891
- crfm_helm-0.5.5.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
892
- crfm_helm-0.5.5.dist-info/entry_points.txt,sha256=AvH9soAH3uey9xffisWewd0yrmPWGASC036jHd1SFyg,300
893
- crfm_helm-0.5.5.dist-info/top_level.txt,sha256=s9yl-XmuTId6n_W_xRjCS99MHTwPXOlkKxmTr8xZUNY,5
894
- crfm_helm-0.5.5.dist-info/RECORD,,
970
+ crfm_helm-0.5.7.dist-info/METADATA,sha256=TMyCY6K4C2Z3wO2Jh5XVDq-hHQ1xxCArIm31BUeGbgM,23548
971
+ crfm_helm-0.5.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
972
+ crfm_helm-0.5.7.dist-info/entry_points.txt,sha256=AvH9soAH3uey9xffisWewd0yrmPWGASC036jHd1SFyg,300
973
+ crfm_helm-0.5.7.dist-info/top_level.txt,sha256=s9yl-XmuTId6n_W_xRjCS99MHTwPXOlkKxmTr8xZUNY,5
974
+ crfm_helm-0.5.7.dist-info/RECORD,,