crfm-helm 0.5.6__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (103) hide show
  1. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +56 -49
  2. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +99 -66
  3. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  4. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  5. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  6. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  7. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  8. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  9. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  10. helm/benchmark/metrics/comet_metric.py +1 -1
  11. helm/benchmark/metrics/copyright_metrics.py +1 -1
  12. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  13. helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
  14. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  15. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  16. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  17. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  18. helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
  19. helm/benchmark/metrics/summac/model_summac.py +1 -1
  20. helm/benchmark/model_deployment_registry.py +11 -19
  21. helm/benchmark/presentation/create_plots.py +11 -2
  22. helm/benchmark/presentation/schema.py +5 -0
  23. helm/benchmark/presentation/summarize.py +9 -3
  24. helm/benchmark/presentation/test_create_plots.py +4 -1
  25. helm/benchmark/run.py +7 -1
  26. helm/benchmark/run_specs/arabic_run_specs.py +73 -0
  27. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  28. helm/benchmark/run_specs/classic_run_specs.py +0 -53
  29. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  30. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  31. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  32. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  33. helm/benchmark/run_specs/long_context_run_specs.py +48 -1
  34. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  35. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +5 -11
  36. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  37. helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
  38. helm/benchmark/scenarios/aratrust_scenario.py +76 -0
  39. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  40. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  41. helm/benchmark/scenarios/audio_language/{ultra_suite_asr_classification.py → ultra_suite_asr_classification_scenario.py} +9 -8
  42. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
  43. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +13 -5
  44. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +13 -5
  45. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +13 -5
  46. helm/benchmark/scenarios/bluex_scenario.py +66 -0
  47. helm/benchmark/scenarios/cleva_scenario.py +1 -1
  48. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  49. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  50. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  51. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  52. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  53. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  54. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  55. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
  56. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
  57. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  58. helm/benchmark/scenarios/math_scenario.py +21 -20
  59. helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
  60. helm/benchmark/scenarios/melt_scenarios.py +2 -2
  61. helm/benchmark/scenarios/mimic_bhc_scenario.py +1 -1
  62. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  63. helm/benchmark/scenarios/seahelm_scenario.py +2 -2
  64. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  65. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  66. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  67. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  68. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  69. helm/benchmark/slurm_jobs.py +1 -2
  70. helm/benchmark/slurm_runner.py +8 -1
  71. helm/benchmark/static/schema_arabic.yaml +228 -0
  72. helm/benchmark/static/schema_classic.yaml +0 -17
  73. helm/benchmark/static/schema_long_context.yaml +19 -1
  74. helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
  75. helm/benchmark/static_build/index.html +1 -1
  76. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  77. helm/clients/audio_language/qwen2_5_omni_client.py +19 -7
  78. helm/clients/huggingface_client.py +2 -2
  79. helm/clients/openai_client.py +2 -1
  80. helm/clients/openai_responses_client.py +6 -4
  81. helm/clients/test_huggingface_client.py +3 -3
  82. helm/clients/together_client.py +0 -2
  83. helm/clients/vertexai_client.py +11 -9
  84. helm/clients/vllm_client.py +43 -7
  85. helm/clients/vllm_granite_thinking_client.py +56 -0
  86. helm/common/critique_request.py +0 -1
  87. helm/common/hierarchical_logger.py +83 -34
  88. helm/common/object_spec.py +23 -8
  89. helm/common/test_logging.py +94 -0
  90. helm/config/model_deployments.yaml +454 -175
  91. helm/config/model_metadata.yaml +117 -10
  92. helm/config/tokenizer_configs.yaml +81 -1
  93. helm/proxy/cli.py +1 -1
  94. helm/proxy/retry.py +5 -0
  95. helm/tokenizers/grok_tokenizer.py +2 -0
  96. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  97. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  98. helm/benchmark/scenarios/numeracy_scenario.py +0 -794
  99. helm/benchmark/static_build/assets/index-94295e78.js +0 -10
  100. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +0 -0
  101. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
  102. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
  103. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
@@ -1,4 +1,4 @@
1
- crfm_helm-0.5.6.dist-info/licenses/LICENSE,sha256=bJiay7Nn5SHQ2n_4ZIT3AE0W1RGq4O7pxOApgBsaT64,11349
1
+ crfm_helm-0.5.7.dist-info/licenses/LICENSE,sha256=bJiay7Nn5SHQ2n_4ZIT3AE0W1RGq4O7pxOApgBsaT64,11349
2
2
  helm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  helm/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
4
  helm/benchmark/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -7,20 +7,20 @@ helm/benchmark/config_registry.py,sha256=Cd25a8FHriUzAgvGGU5sBAPyhisdSIjdUJR4YbY
7
7
  helm/benchmark/data_preprocessor.py,sha256=wqGzAiLwOYa4v6TVPe6ayrnuzdNbmfjeiofRQiO2uso,2201
8
8
  helm/benchmark/executor.py,sha256=E7cF1vMXBn5eT1z5Le5ng4M9AaIMLjxfLgMmF1EfZy0,4843
9
9
  helm/benchmark/huggingface_registration.py,sha256=DAiHffNmo4H90rBfvQ_LHADtUCnCk6dfpI7Wbat1DZA,4389
10
- helm/benchmark/model_deployment_registry.py,sha256=zDpqsgjCvtesRan-z2TQA7G97g14UPgjG0Cbi9owWaY,9472
10
+ helm/benchmark/model_deployment_registry.py,sha256=aPBkSr59jqx6ThFW-DYFhi3tPsLLhSKF5JC4-pxqLrk,9011
11
11
  helm/benchmark/model_metadata_registry.py,sha256=7XisV0an_edM8hvP8LSoCnTeUN2QLJrQknOCA6-OE7M,8841
12
12
  helm/benchmark/multi_gpu_runner.py,sha256=WmTKpVfcKXyiiPzrmxpbvQoZy0Ua8IyPgxB8r_3jrRw,4773
13
13
  helm/benchmark/reeval_run.py,sha256=vImL8JNhveEOftZbRQ6JAxF0L-XCKIwh65M6fIYo4RU,7198
14
14
  helm/benchmark/reeval_runner.py,sha256=bJPl7XVOVwK2fUA7voOVQYwVFEOfKVnrT2tbSGQzQY8,15584
15
- helm/benchmark/run.py,sha256=F65P6eG3S6dHDxRK8HMqDFGQjPBGIJouX80ANsHb0Y8,13806
15
+ helm/benchmark/run.py,sha256=ZyqkKnqkMqM2AH4HL6sH72H8-mrDWu0NW0piE7BY0HM,13973
16
16
  helm/benchmark/run_expander.py,sha256=hKFLpmq8W2KBl_mBf-ahHEbt67qZFgu-VxjvidOeQuE,56543
17
17
  helm/benchmark/run_spec.py,sha256=GiIU8iGO2FGYFDWIxt51CeNPsW7rM7BzDqH1KgEL1cg,3217
18
18
  helm/benchmark/run_spec_factory.py,sha256=Hxeft3fXoWNz9yGo-2nIfb5pd3GDWlwYWc6YYvAkTjM,7785
19
19
  helm/benchmark/runner.py,sha256=O-91eRRrNgE4_tlCVeLq9_0QsRfNELvaQT-KWtJw894,14618
20
20
  helm/benchmark/runner_config_registry.py,sha256=2gW5wBLkHdYb2WNbZulto06hTcto2ROvjy8HULw3jNM,515
21
21
  helm/benchmark/server.py,sha256=uphh9L0FQnVZVVoGx50MMb_jXh-uen6ouE3uDN5GKFE,6422
22
- helm/benchmark/slurm_jobs.py,sha256=eNCAoaWDfT0Wk32ZJRIGo-x8kgjhDPnPB4Xrvw_eLB0,3225
23
- helm/benchmark/slurm_runner.py,sha256=RjmwMqMdKwOzd9B2S6fkuSqB2UjybmiSRVjraiLtzgM,16567
22
+ helm/benchmark/slurm_jobs.py,sha256=6m11gyMo-cA2dwxR2pBXv4tEds5Aok4YCQQyHRmPoPk,3164
23
+ helm/benchmark/slurm_runner.py,sha256=T4vSoxwdRR8gqyL4S2sw_Le-9rv9BPC0BlOy88pwt70,16785
24
24
  helm/benchmark/test_data_preprocessor.py,sha256=_esdtkqyU_8Yp5ZOO7n1b-Y4Qc28wpD5drG-4Y4UhIM,2219
25
25
  helm/benchmark/test_run_expander.py,sha256=gLeHkNt_nLgbwEJiYxhwda-eKA3sJAxkYolCvgRN5TY,1163
26
26
  helm/benchmark/tokenizer_config_registry.py,sha256=ZOImg38ta0FXZYAWna6q7A5xrG2mU7Ofr-8j4EqGlUY,1585
@@ -56,7 +56,7 @@ helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimoda
56
56
  helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py,sha256=6nuz0Vn89A1mOedutsiq2SwTOG3qn8dUZTiaXhKffiw,3587
57
57
  helm/benchmark/annotation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
58
58
  helm/benchmark/annotation/aci_bench_annotator.py,sha256=SjXidlbpm5HOhdhNXg3HjabMEQvt3hq1iJ5GPajxt8M,3228
59
- helm/benchmark/annotation/air_bench_annotator.py,sha256=Xvqzf-f29dzLGuAMeNiQe_kSkMbXEN1_U1LwCAn6nJQ,3500
59
+ helm/benchmark/annotation/air_bench_annotator.py,sha256=CDyHVwD4eoymfLduJC5WvvhDX1DOgYBqgjvqBjoCfU8,3501
60
60
  helm/benchmark/annotation/annotator.py,sha256=__BkMVpAEpSs1pbwPK5sVWLdCAXnjsHcPYgmOqmNPu0,1843
61
61
  helm/benchmark/annotation/annotator_factory.py,sha256=8uo5uz1UpIVCHUd7CRvmy6b9XB1gspdHmgxH5UZMPVI,2335
62
62
  helm/benchmark/annotation/anthropic_red_team_annotator.py,sha256=4hob15m2k9e2A97E0aG9FstCbJ_oMM7-9y-nh2EaYqc,2395
@@ -72,7 +72,7 @@ helm/benchmark/annotation/ehr_sql_annotator.py,sha256=Izpq0biZ9lkJOPk6NwTuv2wk8B
72
72
  helm/benchmark/annotation/financebench_annotator.py,sha256=gNERLY35t2kcpayXGGrY4-pBs2jbEUomqElRYbb9nho,4150
73
73
  helm/benchmark/annotation/harm_bench_annotator.py,sha256=zhkWnV3qZgY-nvHgQRHGrrCMC7605JwFHesY7UC3ZnQ,2293
74
74
  helm/benchmark/annotation/helpdesk_call_summarization_annotator.py,sha256=I7TjpN502Sa-Z4uUKemJXSAdOiVA3MMO92YIAAXeDBg,6034
75
- helm/benchmark/annotation/live_qa_annotator.py,sha256=8DXsjwmeSyvC0kfp1uYds4cwpxqzF7FcskeZaXxXiOw,3552
75
+ helm/benchmark/annotation/live_qa_annotator.py,sha256=PSff59mU_t3ypmptYsYRKU3m1vMLF0dMyUySIOxBrPw,3553
76
76
  helm/benchmark/annotation/med_dialog_annotator.py,sha256=OVTFIlvdhcOr_hdK0tnrDes9hYdN1mDWFTp4GDYY7O0,3162
77
77
  helm/benchmark/annotation/medalign_annotator.py,sha256=8edAZh8oQgDKUT1bQ3Hp2NBE-QnBZ_-ZQjHkV7YKWhs,3240
78
78
  helm/benchmark/annotation/medi_qa_annotator.py,sha256=v8e6hkHZX1x9KtTedCnpCseh-Y72z5kUgUrXHWPUkX8,3074
@@ -142,22 +142,27 @@ helm/benchmark/metrics/cleva_harms_metrics.py,sha256=xVubv2pG3iinVs3namoVHWAmV9o
142
142
  helm/benchmark/metrics/cleva_metrics_helper.py,sha256=8UwiGhekUmp7DxYWU4rxqX2v3ewkg-O5-jOh49iOGmc,304
143
143
  helm/benchmark/metrics/code_metrics.py,sha256=SebQ5MXJe_phTiMfGMfhgYago-hwh_g9ctBWEHGqCnU,5230
144
144
  helm/benchmark/metrics/code_metrics_helper.py,sha256=UNai154RuhYRZM_YK-rveLct4Ui5iEBNPYmYdKq34Xs,22712
145
- helm/benchmark/metrics/comet_metric.py,sha256=qOvwE0ov1plb6SwwT3CbX1XuSo4GJ-M3iRe98yMiMaM,4797
145
+ helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py,sha256=biKk67r4ij3pK2L0OuGTJ4BAb8ig5tpGAV86uBD1qNs,7832
146
+ helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py,sha256=QrePgX-1UALQKs1dHMfOm1qoALvOU1pbLyC4JmcINx8,19083
147
+ helm/benchmark/metrics/codeinsights_correct_code_metrics.py,sha256=CQs9HXh7P1vzkKWdpvugvttD_8ZF6W_QPp7_rhYFwsY,13873
148
+ helm/benchmark/metrics/codeinsights_edge_case_metrics.py,sha256=B7EEELwwH67VxmgrTBSP25Etyb5XYIDuadfggMrHmcE,3866
149
+ helm/benchmark/metrics/codeinsights_metric_specs.py,sha256=BkKWII9yTkChdZVsGeeeCbiWQDYvvcAKo0nxi_RTTUk,1798
150
+ helm/benchmark/metrics/comet_metric.py,sha256=EJWZ9x8CGeDDQlfxYrY-np_NVJBt5gun0XLJvtpjXVI,4798
146
151
  helm/benchmark/metrics/common_metric_specs.py,sha256=JKqmO4ovBdfOYKC-00OSzOMv--g9NTCVfUHLaz-1Uns,6025
147
152
  helm/benchmark/metrics/conv_fin_qa_calc_metrics.py,sha256=Zrf6HyH_WNe7gGFgW0j8FJlX5KZvbk-05iX8QFPJDyU,2656
148
- helm/benchmark/metrics/copyright_metrics.py,sha256=_Lp7sKWgacY_13kFadNfnhrM2Ks8syBXnUW7zYuJkwo,7817
153
+ helm/benchmark/metrics/copyright_metrics.py,sha256=RYOWKFN97UCD2Vj51gzKGbnnY9wAq6KJgiRt2cecVfs,7824
149
154
  helm/benchmark/metrics/czech_bank_qa_metrics.py,sha256=bKoooK2T5v_fFKNbUnsuW6Mv9muAirJD5lTrzuHfpz8,1113
150
155
  helm/benchmark/metrics/decodingtrust_fairness_metrics.py,sha256=x66XP0iQGk4ThT7ddmrlLCA0XF4arRbQMDT42LHf2kE,3297
151
156
  helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py,sha256=TxTkkWdx6d6ym0MirZTiucl_TWFdn4uJLnlTfLjQvgk,2925
152
157
  helm/benchmark/metrics/decodingtrust_privacy_metrics.py,sha256=OU7lka-hm6PubR5Gjj4uNyrqhjlfhe0mmjBCAz9vlRs,3456
153
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py,sha256=BKDD3lblqT6Ebi5kEC4zbN_OvQwD1SdEtBv5Wf0kzWw,6460
158
+ helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py,sha256=bW4zafRyKFa__8fGrdiTPUu848ovNnvakLCfqcMrcHk,6461
154
159
  helm/benchmark/metrics/dischargeme_metrics.py,sha256=D8LI52E17hNSPDpEvb2tw1za4QWDE3p9xgx7Nm9l7_Y,454
155
160
  helm/benchmark/metrics/disinformation_metrics.py,sha256=5n8wgRBb6FaDjqe1nR3Cj9aS48esmMsIUq4KpBHoQoU,7870
156
161
  helm/benchmark/metrics/dry_run_metrics.py,sha256=Ss0lzf944HIbL1CX6QuJpGFPqOzhBT0qVWLNR1BoEjk,3784
157
162
  helm/benchmark/metrics/efficiency_metrics.py,sha256=SJqpA1d_GfBPl9H6moai8ra1GVe7tlaCfg3PeiWT54c,11845
158
163
  helm/benchmark/metrics/ehr_sql_metrics.py,sha256=YRjvPIty7zlyoyGD6wo3HYOz7y_PThySOZzVRJ38iww,4797
159
164
  helm/benchmark/metrics/evaluate_instances_metric.py,sha256=LGk1Dv_76Ak0YUlWKFTsOLEFiBSmcGVhNrbj_4zg9g4,2913
160
- helm/benchmark/metrics/evaluate_reference_metrics.py,sha256=t0251_2aA0CrXB8oUBKlPRgPl-xYjzdVhLcGjwuhOgo,19621
165
+ helm/benchmark/metrics/evaluate_reference_metrics.py,sha256=T3pftPfYEUR88NEZEZuzYOTNoHELo7nSbz4qmxN8oQc,19628
161
166
  helm/benchmark/metrics/fin_qa_metrics.py,sha256=MtXxGMGYiCiwCD1CclBXPopzly-Tz3zJTrXJaHYTXn4,2470
162
167
  helm/benchmark/metrics/fin_qa_metrics_helper.py,sha256=sH5FIpsxxGUkXO21YGS2EtVsev1EdQ44lYoqFZPSSGo,11884
163
168
  helm/benchmark/metrics/gpqa_chain_of_thought_metric.py,sha256=HRRKkcTbCu5ScOVwmjzYaA7UAEGE_AJUZVOCDRuv4Po,4321
@@ -171,6 +176,8 @@ helm/benchmark/metrics/kpi_edgar_metrics.py,sha256=1GsW-nBz8TgP4wFIVEGA4_BhI17ki
171
176
  helm/benchmark/metrics/language_modeling_metrics.py,sha256=yS7k8iFjxfkckSBA0RVA7VdOivSEBtNzCjczK6We7y0,4598
172
177
  helm/benchmark/metrics/live_qa_metrics.py,sha256=f2XFmQaohjQNqYqNg8NcDVavCzyP4cd8Cl8rLArn9EM,816
173
178
  helm/benchmark/metrics/llm_jury_metrics.py,sha256=yzAsdacyX0MFJy2qKIjhI0y7JvtflELpCh6R14wuCgk,1704
179
+ helm/benchmark/metrics/lmkt_metric_specs.py,sha256=0Fa0xLjQDXwsRCE5VqGzEfb5ZdzKsDoSCwR_zHogFcc,376
180
+ helm/benchmark/metrics/lmkt_metrics.py,sha256=GaZTfl-NQXa1YSzcJUGlZ5wZURH1CnJxGkPFBj8ydTQ,1856
174
181
  helm/benchmark/metrics/machine_translation_metrics.py,sha256=22vaGBCSw12uM1wmtDG-MBBZW8OiTZwNPaerjckdtDE,3860
175
182
  helm/benchmark/metrics/med_dialog_metrics.py,sha256=kzmrkQcmJ15zuOF9_Onk9N0oeNeyl9Rri1JEb1AqRT4,447
176
183
  helm/benchmark/metrics/medalign_metrics.py,sha256=q6l8p5Pie-H9pxhaA-lQkSOnliJWXr6zUeN8syEQ91Q,439
@@ -181,7 +188,7 @@ helm/benchmark/metrics/medication_qa_metrics.py,sha256=wit3nKNWpGFfgauu6Xye2IDTe
181
188
  helm/benchmark/metrics/melt_bias_metric.py,sha256=mHDCkRGLD-0pyJA_depi_KX3sn7g7Bgd3_m0XdLQahY,11520
182
189
  helm/benchmark/metrics/melt_bias_word_lists.py,sha256=xA0araUdszAIOqfxiTi6MIJhKYwr_Gwsc1L9qinZx9U,27891
183
190
  helm/benchmark/metrics/melt_metric_specs.py,sha256=zaeV57LQEl8qK7be36NaojiUJlzmkoKY8JyOkOVuPqs,1619
184
- helm/benchmark/metrics/melt_toxicity_metric.py,sha256=8HxViwOJCAZ-luE_Br55xUfJn5XAVXg6lqcAUsP0GT8,4187
191
+ helm/benchmark/metrics/melt_toxicity_metric.py,sha256=ni6bb_QC51NM5jQpbFYLWtsQy3tNOLwQ_5b3PDV5vVk,4193
185
192
  helm/benchmark/metrics/mental_health_metrics.py,sha256=4HXCXl2GxFPn6wDzHptHeBTuP4BJVLUzEUKffpd5R_k,462
186
193
  helm/benchmark/metrics/metric.py,sha256=jqQyiKDq_pQv-ulGqfZI56ydRDQs3N3XhfHIPysUhrk,14311
187
194
  helm/benchmark/metrics/metric_name.py,sha256=POhgmUqqIWh_LjCbYpiKkzGqqChBLeW3FADy9u_FcWw,1354
@@ -192,7 +199,6 @@ helm/benchmark/metrics/mimiciv_billing_code_metrics.py,sha256=Pu9efXoBrhsvxSeGHq
192
199
  helm/benchmark/metrics/mtsamples_procedures_metrics.py,sha256=XrddVk-gnc8jF8amCI1RBa_XTS9yEXD2Y9Ld9W7Q-m8,497
193
200
  helm/benchmark/metrics/mtsamples_replicate_metrics.py,sha256=rmH34aTX_wZWxLi4jrxf3sR1RIqNRF0QDANLRQUGhqM,492
194
201
  helm/benchmark/metrics/nltk_helper.py,sha256=QMEps-lqJZ_pCgvjlMf4BvC0pzDu3ez5jit5F4p8dAk,1313
195
- helm/benchmark/metrics/numeracy_metrics.py,sha256=3E-CMmB2wuGW5tLjmEm8wFMf85DJ1ZDUANfh84SQuP0,2906
196
202
  helm/benchmark/metrics/omni_math_metrics.py,sha256=Gqih87UrE93-a0hbRhTBkjmfGLNTkuKQGaG-sTQeuG8,1287
197
203
  helm/benchmark/metrics/openai_mrcr_metrics.py,sha256=TAop7G50FKaR-Jyo2EGLqmMOfJRmS2vNRDFiifa6mhg,2313
198
204
  helm/benchmark/metrics/output_processing_metric.py,sha256=ey9UBi2f3780OwFlp82ymzfjLR3MA2fpA9vW5R4W5TA,2581
@@ -216,7 +222,6 @@ helm/benchmark/metrics/test_classification_metrics.py,sha256=CRDMGmVmzEUnNaM0C02
216
222
  helm/benchmark/metrics/test_disinformation_metrics.py,sha256=U3ZmS9s33oimTQbKO-7pgWeX_WiDB9chlOCtf_vslXw,2249
217
223
  helm/benchmark/metrics/test_evaluate_reference_metrics.py,sha256=B7xtDDWPAxF7d-vcUx_R51hFMae-DD52nUwbu_eWt6Y,1601
218
224
  helm/benchmark/metrics/test_metric.py,sha256=0sGlXE3_Al_VyKpOPBhQR_xT-XrcVgGepLpwut37DmA,771
219
- helm/benchmark/metrics/test_numeracy_metrics.py,sha256=ls1ZIHDePKpHMoqAbf4HmJ1SIBjLFuLIzGbfg6OiZvM,4162
220
225
  helm/benchmark/metrics/test_statistic.py,sha256=yK6m2BZ5UXWmb2D1cQzDH_2ELvrNDaR_lyzX4WoHw9Q,1273
221
226
  helm/benchmark/metrics/toxicity_metrics.py,sha256=ZLOzxDlMgbljl-9y6vT2ZgwdhsBZ4MfV-T66VpKk00U,4114
222
227
  helm/benchmark/metrics/toxicity_utils.py,sha256=-bfittLtMkHyV5wu-hj6KVtaiNGgVIO5duUmThBlX8w,988
@@ -230,7 +235,7 @@ helm/benchmark/metrics/ifeval/instructions_util.py,sha256=VhkJfZLCaHi094rZSoeQbo
230
235
  helm/benchmark/metrics/image_generation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
231
236
  helm/benchmark/metrics/image_generation/aesthetics_metrics.py,sha256=UqjBgAi1ylegvHBjALJ8vxINhHEqqr2fSvN9lXgyIZk,2140
232
237
  helm/benchmark/metrics/image_generation/aesthetics_scorer.py,sha256=ISdThDKMrx-SHQe69dCcr8qUrMCa_GsxX3BeZnd0WPA,2538
233
- helm/benchmark/metrics/image_generation/clip_score_metrics.py,sha256=tUnAoew24jjjbjPaoE2-4iyRTq6YNW8Xfk1p5JWZkAU,3338
238
+ helm/benchmark/metrics/image_generation/clip_score_metrics.py,sha256=0B2WCTP5LDHDbWGoMW2mKnnImHt-QYEU2QzqYf4HxjQ,3812
234
239
  helm/benchmark/metrics/image_generation/denoised_runtime_metric.py,sha256=Nom_yw15ePU7wUuV2DFHpLnEAqaZQjlkW9LowRElOAI,1646
235
240
  helm/benchmark/metrics/image_generation/detection_metrics.py,sha256=mfYoPbLCmqWxqMSXbcX6TM0niNnpCeipcHImuV3mZ3c,2160
236
241
  helm/benchmark/metrics/image_generation/efficiency_metrics.py,sha256=neeNJNtHAVUMWqr5rvRIRlPKl225cXUGCURLB0z-rKQ,1459
@@ -253,7 +258,7 @@ helm/benchmark/metrics/image_generation/detectors/__init__.py,sha256=47DEQpj8HBS
253
258
  helm/benchmark/metrics/image_generation/detectors/base_detector.py,sha256=e4c8vPfioGzl2ftYzWOFIBDJcZJxBmpjU13n4fXaSvY,226
254
259
  helm/benchmark/metrics/image_generation/detectors/vitdet.py,sha256=kxXS8uNBC0pQ7LatuN85CXU8pJHZn0pJXY0rOLd_39g,7526
255
260
  helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
256
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py,sha256=vFO6s8QHo6Pt1QfbOKAI0m3mJrc0BeH1Hcf7u2uWMIk,2116
261
+ helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py,sha256=NwE85dtiVSlCJc50E57pkckgnCiKBsW0nF3cqgc2EUo,2128
257
262
  helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py,sha256=5qKL-gHnEVmzSDW2GKDq6Uox_EJMDLe0QA55Nrl4H6s,1472
258
263
  helm/benchmark/metrics/image_generation/q16/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
259
264
  helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py,sha256=8Y5h-6RMjivm50RnNbNwV7wCug4RhKT5g8R_YeEp54I,3467
@@ -262,7 +267,7 @@ helm/benchmark/metrics/image_generation/watermark/__init__.py,sha256=47DEQpj8HBS
262
267
  helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py,sha256=Ir4u8blJWTRtEBogb6u22qCy3JXAIzvx-Th6dSBLfdw,698
263
268
  helm/benchmark/metrics/image_generation/watermark/watermark_detector.py,sha256=w6WnTc6t6zx0W0gTjgedXC9OO5dq5iWpx9UcnioKml4,3641
264
269
  helm/benchmark/metrics/summac/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
265
- helm/benchmark/metrics/summac/model_summac.py,sha256=82S9BpPJENr_jiY-cNubECEhniu5Y3Arzv7AXK93PmE,17442
270
+ helm/benchmark/metrics/summac/model_summac.py,sha256=ccOP0z4WEpR26iAzzTWviFfX33Cg9MdpZgKgSRQc9D8,17445
266
271
  helm/benchmark/metrics/summac/utils_misc.py,sha256=7_Q1c72cKt8PWtxn8u4R8nB53HK6_JF2nP8bBXYNk-A,1485
267
272
  helm/benchmark/metrics/tokens/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
268
273
  helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py,sha256=XDZGK8h84F2w_pK8Zjko8ssKZmVxKFqTOuHL0mLBzMY,694
@@ -280,43 +285,49 @@ helm/benchmark/metrics/vision_language/image_metrics.py,sha256=RgKAn7ftl4KCZ86V3
280
285
  helm/benchmark/metrics/vision_language/image_utils.py,sha256=xwtydR8-s23cJacIGXDXL_pUhAqi6O5CbhM4XNEFlDo,3787
281
286
  helm/benchmark/presentation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
282
287
  helm/benchmark/presentation/contamination.py,sha256=07IuIP92vfuI0GwfeNC-i_NZUlF8N1azzagC19YHOMQ,2802
283
- helm/benchmark/presentation/create_plots.py,sha256=m51mFsYD51Y1rbEQgwTbKZjCI3xQir437WyOS5z5k64,28916
288
+ helm/benchmark/presentation/create_plots.py,sha256=bM6UNzH0Bx8Bv2iKcyMoYp7IwfCZSQob-w_XOOI6r1M,29090
284
289
  helm/benchmark/presentation/run_display.py,sha256=LmY2HES4dU94kRYuUxt-c9LTMDN6MU5CspWTF6rZwDo,12419
285
290
  helm/benchmark/presentation/run_entry.py,sha256=J1QgLOP99N7N4bs7nzXWxyU3pOd-a1j8xwL9ag1nP_Y,1158
286
- helm/benchmark/presentation/schema.py,sha256=gYlMysq_rIzQTE9I1K3mIC1fFjBdDe1yHqgwb4EIciU,10989
287
- helm/benchmark/presentation/summarize.py,sha256=Xk5FJRnWz7xAbPu6JQ96TJ6Fvb1-xWUGBdfetrTsmbA,59882
291
+ helm/benchmark/presentation/schema.py,sha256=j3gOhj-okQ4qzYoMh5N3ltsL0OXiOGuB7ydF-SI-Ug4,11229
292
+ helm/benchmark/presentation/summarize.py,sha256=_d3gd45eBpx8yMnVq1XgF9D-pPMcpbuwseSZz4giybo,60092
288
293
  helm/benchmark/presentation/table.py,sha256=-foH1BIfMiD6YvpwoGJ910CH7Hib-_pYtHH1hE8zwNc,2904
289
294
  helm/benchmark/presentation/test_contamination.py,sha256=RlihBOF6vx2tKEj6_EMnJojTYoStx0FUeJSLT1bdf8w,509
290
- helm/benchmark/presentation/test_create_plots.py,sha256=5PPPegMTdBZurxyyUxI4rN13AVsjV3eQrwFqlobJ8UA,1286
295
+ helm/benchmark/presentation/test_create_plots.py,sha256=1FrJZnPW-5QUQKt_pf4y47uDha4B8wHyY1o5hqhKWhc,1293
291
296
  helm/benchmark/presentation/test_run_entry.py,sha256=4n484sSYT0gQ4WVt67Fs3ctKa4vi97hI32O5XXxGY1o,794
292
297
  helm/benchmark/presentation/test_schema.py,sha256=6mq6CeAOLW2Kxi1lX_ZW8QCVqVR73XImR8ylcRGFkBE,378
293
298
  helm/benchmark/presentation/test_summarize.py,sha256=GzZNwBDybpstzl6wT0Rgqn75N9iCNrUIzrdjOfUolu0,6317
294
299
  helm/benchmark/presentation/torr_robustness_summarizer.py,sha256=SmMOZWCQ-KaJBp78otwvAeE1btWignyWalaQ8QG87r4,8242
295
300
  helm/benchmark/run_specs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
296
301
  helm/benchmark/run_specs/air_bench_run_specs.py,sha256=K86SqpINMBOiLIpuHz-jwlQL3SrH6n6WbqjD90i4LQQ,2231
302
+ helm/benchmark/run_specs/arabic_run_specs.py,sha256=p5KPvcugJI3ERYhO7Le_aiKOZ4IM2EOvsXEmZE8R4Wc,3014
297
303
  helm/benchmark/run_specs/audio_run_specs.py,sha256=baJz5LZiwWZP3KD0hluKgpidtswzdorQnshX0CoqKAc,23383
304
+ helm/benchmark/run_specs/bluex_run_specs.py,sha256=OHweBHS8JC-k9_e5Zq1LUU2FZhJ2P7SDshatX-N15Ls,1798
298
305
  helm/benchmark/run_specs/call_center_run_specs.py,sha256=QhRQw91WblB9UaB319XNCO5K8PX8Riiza41Ym-1CcRU,7044
299
306
  helm/benchmark/run_specs/capabilities_run_specs.py,sha256=sbqhIj4AoujV45erwoVK61lWdlkjg4qssmGlu0eSr1U,12067
300
- helm/benchmark/run_specs/classic_run_specs.py,sha256=1NYeYIwC2F7EjkPEPxNoFb3Ap6BUcUJK_hxBKq4lzt0,56144
307
+ helm/benchmark/run_specs/classic_run_specs.py,sha256=4DA-21Tiz87dQ_iklyrKpfsyTw2f51tbwtRvv3Zs57s,53727
301
308
  helm/benchmark/run_specs/cleva_run_specs.py,sha256=lEIHEqQY3Efx-sl2Z6Rq9Qq_1HEWHqFYuUkZbGvq66s,13387
309
+ helm/benchmark/run_specs/codeinsights_run_specs.py,sha256=lz3yysrPjCIiObzrIkRjJsWzkABh9qIXn-o7FSqZPl0,9207
302
310
  helm/benchmark/run_specs/decodingtrust_run_specs.py,sha256=7slILDS9f0_Z0y-Pz5xEspoGQUmOCOI2K2r4XWUVsm8,14428
303
311
  helm/benchmark/run_specs/enem_challenge_specs.py,sha256=5UWeP2bsnwCHMMXI3DFRMUPKcnJ9_EL01qPUthbWIvE,1351
304
312
  helm/benchmark/run_specs/enterprise_run_specs.py,sha256=ul2YMPpvThOmi7yIc6xR3W0rtE-8tUIaIzuhGlMg2rY,9598
305
313
  helm/benchmark/run_specs/experimental_run_specs.py,sha256=tIgAdK3cm4t6ZBGkcPcPkxx0XAslKShYA1i3QxWVJEY,7675
306
314
  helm/benchmark/run_specs/finance_run_specs.py,sha256=5mwb7GbAcSLVZiumqCiAr9dr8qBYApkEt5Oben5CFXs,4371
307
- helm/benchmark/run_specs/heim_run_specs.py,sha256=Pt1eVbzvwZ5EXq8WB2b3XYw62SWYN_i1P_H3oE4i8KY,22096
315
+ helm/benchmark/run_specs/healthqa_br_run_specs.py,sha256=515pDZf8rTpvebPmhr9pqY2c08Ey_OtWIGsFDVVcQqI,1416
316
+ helm/benchmark/run_specs/heim_run_specs.py,sha256=9uOB_eW5bQqoP9eYRaJ2bcigPg75pQLQnyQ67fG9wHo,22226
308
317
  helm/benchmark/run_specs/imdb_ptbr_run_specs.py,sha256=nkW5A_xeD5kCKeJVxsL8RFS8r3UpP_WCcwSdMh2s850,1215
309
318
  helm/benchmark/run_specs/instruction_following_run_specs.py,sha256=GElJhgbQhlZMYSAM4YyGcYq0pqycR32kBCoHqG6m-ZY,4177
310
319
  helm/benchmark/run_specs/lite_run_specs.py,sha256=8OkL9g3wQBG96g0ijGZ9L1Trb59b7VPDyYMqvA3hXfE,11129
311
- helm/benchmark/run_specs/long_context_run_specs.py,sha256=mxgFgjdHnatOif4-xmTicGmpr4U720mfkhPIigeTrGQ,4773
320
+ helm/benchmark/run_specs/lmkt_run_specs.py,sha256=tNZvlA4mXUX-NBC9enRR90qFLeh8SNGFq701rXmXc18,5376
321
+ helm/benchmark/run_specs/long_context_run_specs.py,sha256=A3yhg1IEds7kQWxkRYH7WVkMPouA1xDz28uxpHgwJvE,6229
312
322
  helm/benchmark/run_specs/medhelm_run_specs.py,sha256=--KgkjVwKt4uyiTebalrbeGV4FB-jGqPciYjFZED7zA,43407
313
323
  helm/benchmark/run_specs/melt_run_specs.py,sha256=729MkALud2wG07yulx9zqAzejdXW_eVGkfF5cQWeGGY,32031
314
324
  helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py,sha256=kenpGGMK1XXaNtvNXsshPvdvN9ubv1sOfaPdjFM4obA,2034
325
+ helm/benchmark/run_specs/multilingual_run_specs.py,sha256=umf8e6ZDgRXiU0G_BPoovj1UZ_dxyrXtIQ7i9WC6USg,2296
315
326
  helm/benchmark/run_specs/oab_exams_specs.py,sha256=ws7Vppo_zJvxKqQ_sNhm9N7-5eQbX2CBkcDI5c_sRG4,1658
316
327
  helm/benchmark/run_specs/safety_run_specs.py,sha256=3X6tYaq2SlRsZs9q6SCtBUgjNEpOwUtV6M7iY2Kowm0,6807
317
328
  helm/benchmark/run_specs/seahelm_run_specs.py,sha256=R3mg4_OoaRizZ5n0FHcUQpJLny3j-ulBlHzOyF0a0Ok,23904
318
329
  helm/benchmark/run_specs/simple_run_specs.py,sha256=0kK_e8U4JUWZ6wO4N-GPFRE1iGT4ilvSMUGfirvpIE0,3837
319
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py,sha256=FvigS7LXxKkg9ipTaIPXDN47qFk__Vrv47hb46_cR3Y,7441
330
+ helm/benchmark/run_specs/speech_disorder_audio_run_specs.py,sha256=P1mxSu7ErjiK0ARbRmbIzFwYO3fC-6QpsZQeez4U3qI,7346
320
331
  helm/benchmark/run_specs/sql_run_specs.py,sha256=JWCICELKi81m11MggyR6CJNl3vpWPwk4kr8DZSsWvj4,1965
321
332
  helm/benchmark/run_specs/tweetsentbr_run_specs.py,sha256=qogc-fb83Rh1DooKKaskhak52ycvu8DAnhabw9rc7yA,1129
322
333
  helm/benchmark/run_specs/unitxt_run_specs.py,sha256=4Vbsq0MPpSe4cIJOXzeVpMm60N9Qafa2R85X5BeFQew,1873
@@ -325,8 +336,11 @@ helm/benchmark/run_specs/winogrande_afr_run_specs.py,sha256=dhOm8z6Q_ZpnzYKrsS0n
325
336
  helm/benchmark/scenarios/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
326
337
  helm/benchmark/scenarios/aci_bench_scenario.py,sha256=W8h7eWz9mjR0kRAffKWSnA1Fs8t2l83sPyW8fjPOxWQ,5670
327
338
  helm/benchmark/scenarios/air_bench_scenario.py,sha256=B6_WMowLFe4gWfnoFA_yrHe0kagbIkZabEnK4kGGqSU,1884
339
+ helm/benchmark/scenarios/alghafa_scenario.py,sha256=FJXO3W6qYzCgLJMSiJEhpddNcFyR3N5Brh8pATW_9GM,5217
328
340
  helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py,sha256=jnUGbppDGEsbe5xoJjmv7nW_RvwPIYm6cwSULeqk2Fk,5133
329
341
  helm/benchmark/scenarios/anthropic_red_team_scenario.py,sha256=_OWE33eVRaZI0gmfP7bLd572uOi_6jb39z_J6nkcvfg,3182
342
+ helm/benchmark/scenarios/arabic_mmlu_scenario.py,sha256=mI6ttMFAT3sH_v87qVNxYptqDS2EMUhK0b8vpfePSdY,2807
343
+ helm/benchmark/scenarios/aratrust_scenario.py,sha256=G20j6Z-C_6bUJf-bpdyUN23Hb7XK0YtieUprq_5Z5hA,2552
330
344
  helm/benchmark/scenarios/autobencher_capabilities_scenario.py,sha256=fOCHumFWZa4OJZcTZefJiJbdWsb3zjQnWLJYd10Cctw,2496
331
345
  helm/benchmark/scenarios/autobencher_safety_scenario.py,sha256=MFt3f5baN5r-FmzWZfUChGR1mX_PUB_5hxoINac_Whs,1854
332
346
  helm/benchmark/scenarios/babi_qa_scenario.py,sha256=SyM6RP4v08B1PjumkdQnuKrM9L8SyK0bXbx-LgmyTPo,5067
@@ -337,6 +351,7 @@ helm/benchmark/scenarios/bigcodebench_scenario.py,sha256=zQLv91uwfGAR9N4jm_iBUmY
337
351
  helm/benchmark/scenarios/bird_sql_scenario.py,sha256=n5elzanKEX9YclAl2l1y33aCjihTmaw1VF_ZsAU5IaM,3613
338
352
  helm/benchmark/scenarios/bird_sql_scenario_helper.py,sha256=FIwPk-dwfTY-8gDXeAiTZbfbS0Oe1OuWRlYiJOhZwk4,4664
339
353
  helm/benchmark/scenarios/blimp_scenario.py,sha256=9Ge3QKRgtVHpWy7aehZVKiO6JrsxK7zrEdtqAb4zxtQ,6284
354
+ helm/benchmark/scenarios/bluex_scenario.py,sha256=eHAltiFqZ_bS0AVi0kbskTlxJbQXy7Sqj6E9nZPWqCc,2500
340
355
  helm/benchmark/scenarios/bold_scenario.py,sha256=iE9drB9IeXfRn3xvLnaQi3-nJAp-bV1RE0GJGnp9dJc,4130
341
356
  helm/benchmark/scenarios/boolq_scenario.py,sha256=wPETIu5jcI4jgP5GoFa_xi4SsvHtS9gxQ5TD8neHmdk,8037
342
357
  helm/benchmark/scenarios/call_center_scenario.py,sha256=19J2N57WnUkPMGRRbJyZak8YCeMTRwD3BRK1SArQlL0,3037
@@ -345,10 +360,15 @@ helm/benchmark/scenarios/chw_care_plan_scenario.py,sha256=BbEjDqa4C5wpdil5jIb1nz
345
360
  helm/benchmark/scenarios/ci_mcqa_scenario.py,sha256=slZZT74QI3OMQAgT-ybcR_xVcRDoopXw6mMu4iy3XCY,3074
346
361
  helm/benchmark/scenarios/civil_comments_scenario.py,sha256=pnZU2U_cYFYOJmlmwTehHU5oLIPx_Yg8Ayxinroh4IQ,4875
347
362
  helm/benchmark/scenarios/clear_scenario.py,sha256=yGdPxWO6vY4JHNa4xywtvD-9lOn6s5cr3njpZyFA0D0,6183
348
- helm/benchmark/scenarios/cleva_scenario.py,sha256=yPIiToKow76YMc0EDYeqQEPx-9a_6Bm3w4S1IsRRV5E,57987
363
+ helm/benchmark/scenarios/cleva_scenario.py,sha256=n-h2urZ06GUOuAC_60HMwspTTpBFid72Fx8eZGQppdA,57988
349
364
  helm/benchmark/scenarios/code_scenario.py,sha256=lSbZWw67ie9osOjXDZukj3EEZGa3L6TrMvTg--IbuxE,12520
350
365
  helm/benchmark/scenarios/code_scenario_apps_pinned_file_order.py,sha256=KC-5MQ-d8Nn46aDN4FaPxmd6yk1DtVUmVR-CIZsNCp4,1738
351
366
  helm/benchmark/scenarios/code_scenario_helper.py,sha256=TnXAlY-wdAFwIDylFItf0z7HOu93WD6dNThwzZYe330,5904
367
+ helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py,sha256=PK4wtuBXs4cPPwOoGfhBA4J4cGLQYC_MvRWuvWrkrv8,9068
368
+ helm/benchmark/scenarios/codeinsights_correct_code_scenario.py,sha256=7BpcezugYHleSuG8hreHe5oXpm3bxoxQ4RCnx6rjKbU,3734
369
+ helm/benchmark/scenarios/codeinsights_edge_case_scenario.py,sha256=csTwe-mv1f6Tyvnj9uZ0SYuj1GRVvgjzukV28gIhNpk,8703
370
+ helm/benchmark/scenarios/codeinsights_student_coding_scenario.py,sha256=wc5Fefn4jpCw03dQ6WswCztJ8AO5j0Vrn6omcOVUq2k,7409
371
+ helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py,sha256=qX3yckZdMojYhiwvokvEuQpRXOzmN2zmzKjQb96Ljg8,9651
352
372
  helm/benchmark/scenarios/commonsense_scenario.py,sha256=yZ6n9aqOi7UWY3q4uTDNc2JRNZxaBZPIp7n_Snt_8g8,9511
353
373
  helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py,sha256=gKEwqHDD8KlKmW8z3xAxSIGmALTXrRRPcoDUzbv_IXg,3854
354
374
  helm/benchmark/scenarios/copyright_scenario.py,sha256=FHzUYEabj-BTKl90fgq7jSCq5_Yf9cO9MA9djn50B1Q,3697
@@ -376,6 +396,7 @@ helm/benchmark/scenarios/entity_data_imputation_scenario.py,sha256=4V426oOuexGg5
376
396
  helm/benchmark/scenarios/entity_matching_scenario.py,sha256=kzzDaoVikL2P7Z-17EkLIVR_W7IHcNVerUts2oXDKLA,7111
377
397
  helm/benchmark/scenarios/entity_matching_scenario_fixed_random_state.py,sha256=TklbX7Kx4y-estV-YHUbI5O08q2qCZRrOmX9D3gZS9c,2193
378
398
  helm/benchmark/scenarios/ewok_scenario.py,sha256=vrbJg9vakAxE6n-1jURUcwb-ihrsYoY9e32BpnEGDaQ,4684
399
+ helm/benchmark/scenarios/exams_multilingual_scenario.py,sha256=c9zMGGL8EbCeNogTm-88g_5wWUiX1Zr7z_tsyjUq2h0,5404
379
400
  helm/benchmark/scenarios/fin_qa_scenario.py,sha256=Dm_kGOivaxiKVhcqFgN8pRPs1eqm2LdBZxWy0yFhFuE,5958
380
401
  helm/benchmark/scenarios/financebench_scenario.py,sha256=cHMljdg0_9HA3FbwcwwMt3DR9rxl0jkyFN9jNrUStSE,1956
381
402
  helm/benchmark/scenarios/financial_phrasebank_scenario.py,sha256=dMTfI9MRHKXnECsXOIY8xvX6w5vAPEIa6A7TYyIu2Fw,4457
@@ -387,6 +408,7 @@ helm/benchmark/scenarios/gsm_scenario.py,sha256=QIj0QK5ncF31ES0GUlxbdBk6SIiJJnj5
387
408
  helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py,sha256=8_ShEuOoEGu7iRE2b0tgi-cfBrCPF9k1L-Pgb__n3Bg,2005
388
409
  helm/benchmark/scenarios/harm_bench_scenario.py,sha256=CBo_AfbtHTlvJdsiquP0EDTKApVmDZc7EW0VTENNAfQ,2478
389
410
  helm/benchmark/scenarios/headqa_scenario.py,sha256=m6Kqt16JeqA1-OLJvmBPZzhVOVt7O6rbJGAwG9C7FZs,5658
411
+ helm/benchmark/scenarios/healthqa_br_scenario.py,sha256=YneXTfp8V6k8rYCF3BTX6bxN2ASxdG3qrBr7uH_IFWc,3406
390
412
  helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py,sha256=iv1khpdiWW0Z7lshyWOhhjRfYFdAU6etN8X5EDEQCrc,1302
391
413
  helm/benchmark/scenarios/ice_scenario.py,sha256=NCbeqvpDFIIG7kSCrJrS-Z9S3iG2THZ7HpAqghpi_y4,16725
392
414
  helm/benchmark/scenarios/ice_scenario_pinned_file_order.py,sha256=fuirubIdi-rkJMfSd7YoDdBX2q0f5K7GGTN4XVapAUY,1613
@@ -394,7 +416,8 @@ helm/benchmark/scenarios/ifeval_scenario.py,sha256=SYn9itpFG0tlWSayf6v0P8bRgdtc-
394
416
  helm/benchmark/scenarios/imdb_ptbr_scenario.py,sha256=laq9UwyvBvZZuo54rf-8SdKTLrMdDHTdGWJ4TdC8Eng,2340
395
417
  helm/benchmark/scenarios/imdb_scenario.py,sha256=qHXd-QIXTCBq8rWW3N5I2Rvg6Pz9v1zFhZkwc73w9io,6259
396
418
  helm/benchmark/scenarios/imdb_scenario_pinned_file_order.py,sha256=fjW0Gkzg2Y3IAbtYJ3KC7MueWd9U8h0tlcBCqxYmRrM,1621
397
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py,sha256=RxK5T6Nu_KP3rLMaKkJWiI_3Sqpskgqwgn4Zj95lEvI,2854
419
+ helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py,sha256=JRTLaQc3PDpYeX9ewGnBteT9jXeaGbmJ1VzYGT8TsXI,3067
420
+ helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py,sha256=5fJHFonb7Ko7exHFtoUtvHar_7PhK2HjW9uDlU8Ljj0,2872
398
421
  helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py,sha256=6z3VlcucrwK2B30artWiSpo-mOTr9tiwYV6Fu8XD0VY,2657
399
422
  helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py,sha256=F-gDO6r4GPBJTLirhF5noRaV0edvoIT7tiIDlovBFfE,2253
400
423
  helm/benchmark/scenarios/koala_scenario.py,sha256=A5M6SD7Jjg7r9QlbHCtMaydBe-wpOtB6oc6gFXuZ47o,1389
@@ -408,15 +431,16 @@ helm/benchmark/scenarios/lex_glue_scenario.py,sha256=-3fsSjTXjgRN96Hl4GzDIMB_dlx
408
431
  helm/benchmark/scenarios/lextreme_scenario.py,sha256=gVTHoMYX6Q_Itt5rOVO5lYmqWfAtuuf63CnKAF8b_ak,20461
409
432
  helm/benchmark/scenarios/live_qa_scenario.py,sha256=TnWaOPOcA4U1_8JdahQOUZ9KBj0MpMf4BcK2TDBl3BE,3666
410
433
  helm/benchmark/scenarios/lm_entry_scenario.py,sha256=kQTnj5gKJmDxCgynmzQOmghwNySpna7aTY7K7RPD2x4,9109
434
+ helm/benchmark/scenarios/lmkt_scenarios.py,sha256=K51CdOZqMOMOozUmADjrJuNCpUtXVEZwcOeIY-EZrwM,11162
411
435
  helm/benchmark/scenarios/lsat_qa_scenario.py,sha256=2VUJ36vHUZp6fZuLfRIuPSsU_K6Z3Im2ums06sZENqo,6153
412
- helm/benchmark/scenarios/math_scenario.py,sha256=UtNj0UaCxt0RjM-uwD_Evm7SjKnvMlfCt6K0HQOAVC0,14377
436
+ helm/benchmark/scenarios/math_scenario.py,sha256=tW-nGKxyDOwOo2siqu1ZzPrCGzw_lFYGK5uiUK0lF7A,14525
413
437
  helm/benchmark/scenarios/me_q_sum_scenario.py,sha256=7DOqQmO70BpDeJy_S4fJ5i2UcCH8tunxzjFgTIim9bQ,4062
414
438
  helm/benchmark/scenarios/med_dialog_scenario.py,sha256=AE10W1UWhOrgKUnz7e2brKSaQR1WJkQUcPoo4s6n0Fs,7553
415
439
  helm/benchmark/scenarios/med_mcqa_scenario.py,sha256=XEipvuIA-QoyZrtlm8nnaPuyZzdDaeTskAhnseD3Q68,5096
416
440
  helm/benchmark/scenarios/med_paragraph_simplification_scenario.py,sha256=0Z1JrizLygjd9v_LLFMk8uZ805IWjJPvg-ZvPVhtMm4,7652
417
441
  helm/benchmark/scenarios/med_qa_scenario.py,sha256=m0W-FgFi58psLglZyQy_ouMQIDP-2j3aL7uInkdVtms,4478
418
442
  helm/benchmark/scenarios/medalign_scenario.py,sha256=mhd8REXpPwxftH48-KKb0ZURJ1mdOlvPRmvN4g4M9Ho,3383
419
- helm/benchmark/scenarios/medalign_scenario_helper.py,sha256=itxQxG0igEr-8PY3cXmUafM45bqxtov-iHEIy_ZuQYQ,15612
443
+ helm/benchmark/scenarios/medalign_scenario_helper.py,sha256=fKXJFVLGnLcZKRBLsbjJA6YA4WqMaQAjkEU-i6YzSTQ,11626
420
444
  helm/benchmark/scenarios/medbullets_scenario.py,sha256=8O0UsPWw-ESkrgiuWz4f8gR99jH5-wS5HtCKYwZ1ycs,6713
421
445
  helm/benchmark/scenarios/medcalc_bench_scenario.py,sha256=vwmEQZ119tOVeZtl6Zt-nXKwkA8Qt4WRiH2HogIkV0w,5560
422
446
  helm/benchmark/scenarios/medec_scenario.py,sha256=Lo7iVkek7C9omJ5LX-C83pA_Q5OrAfdNhJY4rslJWTQ,5270
@@ -426,17 +450,18 @@ helm/benchmark/scenarios/medication_qa_scenario.py,sha256=StQmfHTYi8pZLP9FMPzyS-
426
450
  helm/benchmark/scenarios/melt_ir_scenario.py,sha256=d88DEGKVJZCeGnbrXrQZO_W4VJeqW8XNaYc8wIUiJtA,5978
427
451
  helm/benchmark/scenarios/melt_knowledge_scenario.py,sha256=FDG4OGYEV6Ac40VC7KAeikzbFKAK2XXFhH1-QUTw8jo,7923
428
452
  helm/benchmark/scenarios/melt_lm_scenarios.py,sha256=kSm0lRRixhnXctMprPnzi09PLOmgfs-C7TAW3QI8RmE,8969
429
- helm/benchmark/scenarios/melt_scenarios.py,sha256=Zg_Uyq-e9Y-Er4IpWU1o29YC07Q9rOxxhokPyKq57Ik,30140
453
+ helm/benchmark/scenarios/melt_scenarios.py,sha256=_WShDpmPaKrujGbZcazCqleDn0TKDhFg1h-vu3ieS8E,30144
430
454
  helm/benchmark/scenarios/melt_srn_scenario.py,sha256=EQSOZIXbfvVWCJMJ4H2e_CiBz6wc8THJndnbK2WwTHM,14674
431
455
  helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py,sha256=ptMQWgNn6R-XpAVAAjutSdZg_9ZUqG6fVotzAgeead4,7945
432
456
  helm/benchmark/scenarios/melt_translation_scenario.py,sha256=j9YrY60DQHZz4m1MJZaGLzyI6FERlHRx2wy9auyAVB8,5415
433
457
  helm/benchmark/scenarios/mental_health_scenario.py,sha256=O1Lfd0MxqawLZLKUDSynaqqbaGHRjDglmePIqepnJI4,4961
434
- helm/benchmark/scenarios/mimic_bhc_scenario.py,sha256=PGa0Nvbad_wH3qRSMPHgg9CgicOi7n25qLDnEucXapo,4097
458
+ helm/benchmark/scenarios/mimic_bhc_scenario.py,sha256=GSUlvK-NVPYB83emucc1cPj-HgAQVu2aXGuutfXJUHc,4098
435
459
  helm/benchmark/scenarios/mimic_rrs_scenario.py,sha256=bxwVWjE_z4I_Nk5eD78g3QAGyjpsNg7DVWpkp8IGWXM,3841
436
460
  helm/benchmark/scenarios/mimiciv_billing_code_scenario.py,sha256=tZBUZEaUMZvfSlsU6hcPs-pxQ0kDIL6qebGd7JmpDbk,2699
437
461
  helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py,sha256=-OkPMRyB7aO6QBFwoTl6a2rpzcoHeEl84tqz7k9kpCM,2982
438
462
  helm/benchmark/scenarios/mmlu_pro_scenario.py,sha256=pwpp0wqNhsGc8v2V11aUyEWbwdkmIm-42N676j1T3Ws,4031
439
463
  helm/benchmark/scenarios/mmlu_scenario.py,sha256=_5cX2uI7CxD7K_GvO3MD8CRJLuN4EzS2o_EFvbrfjSU,3855
464
+ helm/benchmark/scenarios/mmmlu_scenario.py,sha256=CyOISLOsXF9IEYGfeqWyYYkWGvrUvGivlWSJ5ttN9qY,2762
440
465
  helm/benchmark/scenarios/msmarco_scenario.py,sha256=-l7_rIMQjMWcpTyn6dGotmNJ5XxN_Ze8dEJyv5ftWFA,34050
441
466
  helm/benchmark/scenarios/mtsamples_procedures_scenario.py,sha256=13pXjs9lFduM-QL03mpM10hU0iA8Vr2jJG2FVBQdKOI,5577
442
467
  helm/benchmark/scenarios/mtsamples_replicate_scenario.py,sha256=RlyWrlI9e5MLsGbkQWpO2WRsIOZJi39xHskOIBypHdo,5399
@@ -444,7 +469,6 @@ helm/benchmark/scenarios/n2c2_ct_matching_scenario.py,sha256=-Et7hJnQJOGl1U9Xdb5
444
469
  helm/benchmark/scenarios/narrativeqa_scenario.py,sha256=MiSq0UnUllJxHFU2gO7m4vr_vmulavJxc4ruZhsAt2U,5632
445
470
  helm/benchmark/scenarios/natural_qa_scenario.py,sha256=g-fP8L1lXs7zwNVQOc0ZUnbYkCyElQtLVt5fe5dtvSE,12564
446
471
  helm/benchmark/scenarios/newsqa_scenario.py,sha256=G25VYaLrV_JyyoT0jpzJ6p4l5qsOydm8rlzTvSptNKQ,7284
447
- helm/benchmark/scenarios/numeracy_scenario.py,sha256=lgTGzZc81RyL8iB4K67PAHbyYz6BM2ieub8RSFi2aRc,30895
448
472
  helm/benchmark/scenarios/oab_exams_scenario.py,sha256=vbjUzQP0zU4ckvMbsk4lh24NddVWbUAtfWmsq1h24_w,2101
449
473
  helm/benchmark/scenarios/omni_math_scenario.py,sha256=5qb2cO-Ibb3kDbwYvkzsoU_aOsoKV3ROLgZbi83OyGU,1955
450
474
  helm/benchmark/scenarios/open_assistant_scenario.py,sha256=zd8T6eLOlYMZiFyKrRjc-EPwk5_KpbBedAcKDbZ-TdI,5609
@@ -458,7 +482,7 @@ helm/benchmark/scenarios/real_toxicity_prompts_scenario.py,sha256=zpQthgDi-AyEgO
458
482
  helm/benchmark/scenarios/ruler_qa_scenario_helper.py,sha256=jgVf1D4eTSxwxQsW0GBou5hfSo2dnlEJvHpVJqk3BxM,6327
459
483
  helm/benchmark/scenarios/ruler_qa_scenarios.py,sha256=Dy0INRMzxSiIs9Pm3fa0hYodN-W--WPSv4kcmeQhucM,3270
460
484
  helm/benchmark/scenarios/scenario.py,sha256=kSy7tmtFeC6-QSEsBuvlrMTA1PB6fOY9jycMld-vBVM,8592
461
- helm/benchmark/scenarios/seahelm_scenario.py,sha256=GA46ShNGUjVdMLK0ZbN4vPuGEWFQsDPJXEGHQbs1qf8,78150
485
+ helm/benchmark/scenarios/seahelm_scenario.py,sha256=i8SnuYDQtFGFkm686h3_FF9J3vr-Bd9w_jd7h5tV5yY,78152
462
486
  helm/benchmark/scenarios/self_instruct_scenario.py,sha256=3Kvi3pLL6eGOEezjoQoGv9c1UxKiRVlFmILKzqst4pI,2309
463
487
  helm/benchmark/scenarios/shc_bmt_scenario.py,sha256=wF_sD61IZ4RDznBVQ1HYbGh3Vc2qjbcBuU0jdmp1aD8,2803
464
488
  helm/benchmark/scenarios/shc_cdi_scenario.py,sha256=5aVEiRgFCutEWW9yMcJBxEo11FlwW0SiZTaOyXY6ioc,2693
@@ -480,16 +504,21 @@ helm/benchmark/scenarios/synthetic_efficiency_scenario.py,sha256=OaxEvT1H9VjOjBS
480
504
  helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py,sha256=pt2Aln_dX1YMSl-9hV1HJmwW90MC3fWwGsMxZg-Q-UY,16391
481
505
  helm/benchmark/scenarios/synthetic_reasoning_scenario.py,sha256=7STCSHiHGIQ2aaN_PwDE5jXUJ-qcu8PaS4pC-pbOceE,8410
482
506
  helm/benchmark/scenarios/test_air_bench_scenario.py,sha256=9o92CK57xxgPaA9Xt9uJPPie4Cxllzq-KbMt3G35UQ0,1320
507
+ helm/benchmark/scenarios/test_alghafa_scenario.py,sha256=ARQyzjmEpX_FpN2QLnIB7P-ToAeMtE4dqsolzlq8KPQ,1696
508
+ helm/benchmark/scenarios/test_aratrust_scenario.py,sha256=3rsIBfFCAmemNT_IJQ6RZ5liwrHseKGvtGmm2VHoBo0,904
483
509
  helm/benchmark/scenarios/test_bigcodebench_scenario.py,sha256=q9FWJsxLJoFaB3PSMLjI_-YyPoZYusOsMPwn6X6NKXw,1304
510
+ helm/benchmark/scenarios/test_bluex_scenario.py,sha256=fLTyMTmSiJ8MoJKYJ2pcE39yXwZm6zv3_oWsQbRbwH0,1930
484
511
  helm/benchmark/scenarios/test_commonsense_scenario.py,sha256=V5Mq4cxWqU6j1U3icfIuzcnCZsZO7NTKLQgF0lEpdyc,924
485
512
  helm/benchmark/scenarios/test_czech_bank_qa_scenario.py,sha256=bZNLEGu58iHmutGlSp-2uVC2931TO6Rxw7giqFh9RHY,828
486
513
  helm/benchmark/scenarios/test_enem_challenge_scenario.py,sha256=XfPkYaSwdGa63ToC_BLuVKTRSldWNBlKsZYK6CFzL3w,2000
487
514
  helm/benchmark/scenarios/test_ewok_scenario.py,sha256=WY2vqbHF1120ht4PER0uviKMb2jnoPM3ff4KwvwcU4I,1291
515
+ helm/benchmark/scenarios/test_exams_multilingual_scenario.py,sha256=vHLTcEzo5SkZgy2yXYm1Sex641qkr4HQWmVsOrlCQ_s,1764
488
516
  helm/benchmark/scenarios/test_financebench_scenario.py,sha256=EFZLJXXBoyjlTiMQFaQ6MiYkve1lfQDjQWjn4BjqgAQ,1184
489
517
  helm/benchmark/scenarios/test_gold_commodity_news_scenario.py,sha256=RO0NcIkJuujdPVO6tDygmDxhZ5YlmIIYlhwx9LeXlQs,731
490
518
  helm/benchmark/scenarios/test_gpqa_scenario.py,sha256=QQJ_-nmujZBSmhBhikRUWznFJ4jHPbGDnUVCP_17poI,1884
491
519
  helm/benchmark/scenarios/test_grammar.py,sha256=sPlA36sHpThbXgnGlXyOuqHfDPe2epIafmzIeL0nkoU,1364
492
520
  helm/benchmark/scenarios/test_gsm_scenario.py,sha256=I-Sl8Sg8kmFd7u0zZbwbNmeFV1mQLuOHoQ1cQDDwovs,1123
521
+ helm/benchmark/scenarios/test_healtha_br_scenario.py,sha256=YmhXK24MuTPyLFCkXXI7IlwwiiJxytAbONOEh6wSJWI,1935
493
522
  helm/benchmark/scenarios/test_ifeval_scenario.py,sha256=h3CBg13VKwyb1Xaddwg2GWOzAXz4stK5lXdQtHenAw0,1646
494
523
  helm/benchmark/scenarios/test_imdb_ptbr_scenario.py,sha256=8kfCkMRUMU7N4WIrWawFDoxaLB2iTvQ-sPj4RoE2Osg,887
495
524
  helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py,sha256=qZE-fi1tdNOybpvEQZJUpq9fHsyrPW7NYqj_RTwsv2A,746
@@ -529,7 +558,7 @@ helm/benchmark/scenarios/audio_language/ami_scenario.py,sha256=SH4r2YyW2kQ8r6-nS
529
558
  helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py,sha256=kiUngeoAVOXfuKgqo96RgK_volpJUPFziu-cYDqT8WM,2685
530
559
  helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py,sha256=oLOeBGjQCa3hpzjhX2bNS6637VD9VF1KbSJri9BJ3PI,2698
531
560
  helm/benchmark/scenarios/audio_language/audiocaps_scenario.py,sha256=PkVqQM1zX6ecXYk-Pz4YWlST3Hnla8NyeBHbuHvhSlY,2447
532
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py,sha256=uoiB3mnkudRH_rY1qeZRgobYYZ0xDn93F1Mn6Avl24Q,6724
561
+ helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py,sha256=4X_C68yoMKRUC3SuNHYK4_fcboOz-9gbjhbUK1g3VVY,6725
533
562
  helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py,sha256=CbcoGPW65xXRRkrDthotDfoVn51ozANG9s3LCsjxkLA,3706
534
563
  helm/benchmark/scenarios/audio_language/corebench_scenario.py,sha256=R8RAUtdRAQcUAN0PFXybQUekdQFNtT8hXtoR1A1hMGk,3155
535
564
  helm/benchmark/scenarios/audio_language/covost2_scenario.py,sha256=3YiaQXuLGfths2XswRw30Vf26bO9jEW_kAj5wZQSOSI,5119
@@ -540,14 +569,15 @@ helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py,sha256=
540
569
  helm/benchmark/scenarios/audio_language/librispeech_scenario.py,sha256=ogMXxnyTG05tCyJ2d4hiuiVsbQvf4TbndksYeaJXl1s,3475
541
570
  helm/benchmark/scenarios/audio_language/meld_audio_scenario.py,sha256=j1JFX0jGfcqX0QZBKSjYjDWo1jHJbW5Q9jHyOs6Kgls,4903
542
571
  helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py,sha256=Jo_-3zC226iKGT-ac0JNMhlEccazMMiHbomx_qU0rxg,3098
543
- helm/benchmark/scenarios/audio_language/mustard_scenario.py,sha256=9bpcvFtWq5Pd9i9X8iaY9jod3YcRqk88xnXfjwcNMoY,6130
572
+ helm/benchmark/scenarios/audio_language/mustard_scenario.py,sha256=7YHgfSpua5OdEGPlmxoufwGXQjvGJMTlEWFiJ_ap5ME,6131
544
573
  helm/benchmark/scenarios/audio_language/mutox_scenario.py,sha256=bDCQbhsRDR6iQGNlCu_35kjmjGjuzjOIoraSncfOlOY,10277
545
574
  helm/benchmark/scenarios/audio_language/parade_scenario.py,sha256=UuOa5cSrHh5n3VF_SuJp4cy1MxlI3uEKHLrNEhGuyuw,4186
546
575
  helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py,sha256=oN4vBkElVzjccaEK2JFqoXMCGFTTHD0gcYwSDhvHTpQ,5438
547
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py,sha256=TfMTdQ_D4foKO4NRPXygDgdF0ST2LYiOcV3gXO3WEYE,3691
548
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py,sha256=OUPFMOpRCTLN0o_lo7JJ7oOHxp9VuwC0fz4abWVS7hA,4713
549
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py,sha256=7-M5HXNE-YDM44f6LO4aYKBeFQxa3PfvN7q4u4BBYxU,4089
550
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py,sha256=c36E2RkeSDumLZgN6dBGzGz1ltgPdcBSqx8XD0qNH-U,5078
576
+ helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py,sha256=2qzPYfn0YYzzOtffD50kQu_ePpFJj_sSW7Bq8ZS6M2g,3559
577
+ helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py,sha256=TEyfAsas3ihN4b4bpGkbK_M_uDt39fVrL5k8vl2Cdyw,3389
578
+ helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py,sha256=qPOP6eIEwxPKu6q5EzcrRmhMxMUQk5F9iq8zdJ1Ccrc,4819
579
+ helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py,sha256=CGteDFCd31vbu_eg5oal1cnfjQ2J0Ty3C2HYyBLhI5M,4186
580
+ helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py,sha256=sL93Q2ERzYiWcTOFEyvjUNbX0BgPdsyHKt6eTr51-Kc,5177
551
581
  helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py,sha256=wkKyTCtx4isQSMufap_6DsNdGkHi7L8FQ2p7n58kKYI,3124
552
582
  helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py,sha256=4M_gTWs4CoJ1Ce9dDFBTAe9dzSovpsve_sN1eco2V2A,3155
553
583
  helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py,sha256=L04ee5bM5E0UNNmkwEzVwug4HJXQoIcVjujPgxtU2h0,4366
@@ -614,11 +644,12 @@ helm/benchmark/scenarios/vision_language/image2struct/webpage/jekyll_server.py,s
614
644
  helm/benchmark/scenarios/vision_language/image2struct/webpage/utils.py,sha256=UYe3PnxCKBYEbZTTEzdIoTY9gW7ZZAWmVISRIdItD-A,940
615
645
  helm/benchmark/static/contamination.yaml,sha256=rAfh1DqwyUcDtyzHPQ2QiUK5eY7QfuuRtBXpZMn4TeA,3171
616
646
  helm/benchmark/static/schema_air_bench.yaml,sha256=LapSMj3Ecl1Gp9XIwVCYfrerqS93GNErvp6oDnBCtgw,142378
647
+ helm/benchmark/static/schema_arabic.yaml,sha256=PoudK_u7hV2lalGRvYDI5b89tSfch6Dx_bn5681Um_c,7688
617
648
  helm/benchmark/static/schema_audio.yaml,sha256=lVslZX7JmFo0ZgLU4n6amrs9DK8y43Ux0I9QyDUG-14,29119
618
649
  helm/benchmark/static/schema_autobencher.yaml,sha256=yb-NkF5w5R2YOg7RIsadNHJ_5G7lG1gbcDVq_25luEk,5716
619
650
  helm/benchmark/static/schema_call_center.yaml,sha256=i30aFzWqdOJRyAHN8vAzyHEX1v95DEK0TI1SMKTN4TE,9106
620
651
  helm/benchmark/static/schema_capabilities.yaml,sha256=HHy0aafhOaqL0C4TZw2mMt1Dce2_wuN062ORNZIbwYg,8733
621
- helm/benchmark/static/schema_classic.yaml,sha256=sK3yVQCrk3Tn3Kmg9WITBmJZI7AKVjmIY0f3zgH_t0c,104611
652
+ helm/benchmark/static/schema_classic.yaml,sha256=pRkfy6jrdslx5onmeCUdkRi9y2DQrcPIjVyZLJ7uKCs,104147
622
653
  helm/benchmark/static/schema_cleva.yaml,sha256=TDh-zcCzzTTs7bu0IWlY5dXYaTFhxly8sJIBGQdBvug,25401
623
654
  helm/benchmark/static/schema_czech_bank.yaml,sha256=jkTRQVmmbKkbB0zPH9AtYh6Lt33ymMInRBQnHE5lIOo,5462
624
655
  helm/benchmark/static/schema_decodingtrust.yaml,sha256=2VPxzcyKYea7mx-qmswyVRjPfVatjVH4Rs3OU82mgII,15670
@@ -631,7 +662,7 @@ helm/benchmark/static/schema_image2struct.yaml,sha256=cD1X99YcPI8BMAnNfDmXlM-FN0
631
662
  helm/benchmark/static/schema_instruction_following.yaml,sha256=mYLpMv-iNtsmrv9ewfN9ceDOBBg8nSxOWfc6ByATmIk,6056
632
663
  helm/benchmark/static/schema_legal.yaml,sha256=RpoFOuVSIowNgxlPn3UMfJC-68RFr3CGDciUGLPfVqc,28806
633
664
  helm/benchmark/static/schema_lite.yaml,sha256=rFSoG7zGPNOtKkJyGgOViWf5WJbMiJMAXrgmqCAi9X4,36611
634
- helm/benchmark/static/schema_long_context.yaml,sha256=0xcyw8WI4SiLM1QPnjhTM-1SMGIyA5IDwWKpJzfQt9g,10795
665
+ helm/benchmark/static/schema_long_context.yaml,sha256=3YjlNkQBgp4hS4PE1EjZvjpvX9v4QjaBPALtOYLpPCs,11486
635
666
  helm/benchmark/static/schema_medhelm.yaml,sha256=84BrIengbq0m42ICWvyEWoYtdERR-8J8-8QbPOqUzvA,50747
636
667
  helm/benchmark/static/schema_melt.yaml,sha256=mmPqwDa26DVZXsRJkmKQSyD0OStvjlxaMoSPM25SpD4,47494
637
668
  helm/benchmark/static/schema_mmlu.yaml,sha256=KI3XnzEwBRpzfYGjP77yKL-hBklEg72D3vL0kVl1BeI,49666
@@ -649,15 +680,15 @@ helm/benchmark/static/schema_vhelm.yaml,sha256=0slYep2eepUefgtK_m4iSS785sHdJzljm
649
680
  helm/benchmark/static/schema_vhelm_lite.yaml,sha256=4I68Em9q5wW8sFzj5GCJz8m49fBEuMyVmSZM0-wbfOk,4024
650
681
  helm/benchmark/static/schema_video.yaml,sha256=FkpI5Slc4w-ty4hns82ArXIvTdqppWDnkJSpIp74QN4,9713
651
682
  helm/benchmark/static_build/config.js,sha256=o98g6QSly1NAfqhYWbU4lEoZB4LEpIrePZtmimiuoXc,165
652
- helm/benchmark/static_build/index.html,sha256=kpJ5Riw0YUmOOo2lSyWPgWx5XOwxxiLvPmG3wHwn2tM,1178
683
+ helm/benchmark/static_build/index.html,sha256=hlkvPO8WVcvIJXentHj3Kn5Cd3QwOoi7OqRAou0pRVQ,1178
653
684
  helm/benchmark/static_build/assets/air-overview-d2e6c49f.png,sha256=0ubEn4J0T51-jx7IlwjaEGSrofZWlW_e67MJw47Ujzg,733055
654
685
  helm/benchmark/static_build/assets/crfm-logo-74391ab8.png,sha256=dDkauL_wJR_Luu7L7pltphS3a9HSLjDkpVLa6C9vcA4,62712
655
686
  helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png,sha256=Pl46pKbC_TU3L6kZQ_3G-0wTseluAhIYwb3EqpdQAjQ,1344452
656
687
  helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png,sha256=LtVAC4OgcWgMAob53rTrf7cRDu-O0z85ZOGGj9wR9hw,86133
657
688
  helm/benchmark/static_build/assets/helm-safety-2907a7b6.png,sha256=KQentq_1e3uGwiWMViAPxHu2XZ60gqFgovP3UWTyMmw,72312
658
689
  helm/benchmark/static_build/assets/helmhero-28e90f4d.png,sha256=KOkPTf-q28PdvGOBp1G5O4q1eWUJjuij3z2h_SUUf8s,55314
659
- helm/benchmark/static_build/assets/index-94295e78.js,sha256=yvo6hRwNE6Ns7NxJHOdVfUOhc8HsW8eZVadLMW0Wn0w,124386
660
690
  helm/benchmark/static_build/assets/index-b9779128.css,sha256=uXeRKCUzQAC32ofNoaK3-WC7kRWR--KnR6--1m9NdQA,491471
691
+ helm/benchmark/static_build/assets/index-e439d5e1.js,sha256=t7AnJSBjGs43kxIev2uLVumaInyBUxad9KVtvA86oUw,124597
661
692
  helm/benchmark/static_build/assets/medhelm-overview-eac29843.png,sha256=6sKYQ79cN07-cUsnt-JPsdoVwUBWu5KxOaHWSdwjdgA,284408
662
693
  helm/benchmark/static_build/assets/medhelm-v1-overview-3ddfcd65.png,sha256=Pd_NZfAf1ZeU2BIGx9zNT6WmypZNP2bk5z5AxDkbwoU,270625
663
694
  helm/benchmark/static_build/assets/overview-74aea3d8.png,sha256=dK6j2Nn3j9O-FMUIVRT5HGBpR_GL78vrKi8oHdG1eaI,74685
@@ -694,7 +725,7 @@ helm/benchmark/window_services/window_service.py,sha256=y6BthPY1V-ugmYfaJElm5Wfy
694
725
  helm/benchmark/window_services/window_service_factory.py,sha256=T55F0Y2jiOYxUHHZxT4YX4fFXY5gfFhn56zIwUBhc7s,3423
695
726
  helm/benchmark/window_services/yalm_window_service.py,sha256=EwwCoMpr9WVLhCI7OI_7tmZHQfTUwn9FFWjbhIBFRfA,1089
696
727
  helm/benchmark/window_services/image_generation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
697
- helm/benchmark/window_services/image_generation/clip_window_service.py,sha256=2JHld8GiR_eIQyHMPSN8K2VOswmKJEPMPJLsxlLpU-Q,631
728
+ helm/benchmark/window_services/image_generation/clip_window_service.py,sha256=bhCZXzQDm2fEDKEslWDzkwPihQgmZS0DLVo__Ll9aLI,605
698
729
  helm/benchmark/window_services/image_generation/lexica_search_window_service.py,sha256=uDCUclHvo8toxSTMztK3zG7Eb-hjueobGQaBqPqVJlk,454
699
730
  helm/benchmark/window_services/image_generation/openai_dalle_window_service.py,sha256=8U2qDrUB1QJHRy5STV5FywkeVm6qfNOaeVBkMQhyMGc,453
700
731
  helm/benchmark/window_services/image_generation/test_clip_window_service.py,sha256=domn2MRduHVAdruSUuGPDIGKyDrh-gFxW-fZaBYR7cg,1430
@@ -717,7 +748,7 @@ helm/clients/google_client.py,sha256=mIaUzK7GHCa9pqK1BEVhdt6dZsJfHv1Qdsf3I0Ayq8A
717
748
  helm/clients/google_translate_client.py,sha256=TgiQEscjOae58Ptgp9f4n0LXUtl1Jf6v9BI-Z1_wcuw,1304
718
749
  helm/clients/grok_client.py,sha256=SbVB6AduTwfElzUgEMnQW2kQUFVTCv4TpPPJvElQEe0,1127
719
750
  helm/clients/http_model_client.py,sha256=_F3_y2UWqbzESQdzV0FMEsECIKjporVSAW6iUQhJ35c,2818
720
- helm/clients/huggingface_client.py,sha256=FYrg8XoCHXi5eUWjS0S_n-eiva-Ri0g1oaaeT_ky-tE,17615
751
+ helm/clients/huggingface_client.py,sha256=oWR4yNFk28nrnB3IoznrhcEuU0pZkNywP0E82z1-NGM,17671
721
752
  helm/clients/huggingface_pipeline_client.py,sha256=ivFTMNHBwwIUjkeOHkl-veZi5nNAjtnkYvneRFWs-6Q,6154
722
753
  helm/clients/ibm_client.py,sha256=4W4fbjnDNjXrP4gVwSfBHPus0QcqFOQzFvfaST1BE1Y,9701
723
754
  helm/clients/lit_gpt_client.py,sha256=pgLfSvusNpdj8F5DVxzQdHxTDRNX4RVt6unegao803U,6229
@@ -727,8 +758,8 @@ helm/clients/mistral_client.py,sha256=ceM8KLAcniAqK1BNVdUGzqy4av2SEEau6PVmPivxc0
727
758
  helm/clients/moderation_api_client.py,sha256=I5pYWRb2MmcLDYrScnC3P5N7OUFzQiVQ828_hf7zjM4,4719
728
759
  helm/clients/nvidia_nim_client.py,sha256=Z1UAqR2jHacIO_QGqQl1JUZ_82JiSPstBOtj6xURmQk,902
729
760
  helm/clients/open_lm_client.py,sha256=qFgYqlV_3UiW8WJKz66lLqRqg2jt1qtJ1bHMRAtBn40,1749
730
- helm/clients/openai_client.py,sha256=s62_qafDVbDu5pzIkfQsflIwRzc4sXkSiDNkmZz68Ow,27775
731
- helm/clients/openai_responses_client.py,sha256=zua7DZWLeOdpb1yY8YV10gmuGdqvvo_9YQPW3OIGPDU,7219
761
+ helm/clients/openai_client.py,sha256=prSASL8IE3XO0_CaAuZ45iYSITAMQIwtD2q7UqM4qjA,27803
762
+ helm/clients/openai_responses_client.py,sha256=aixsZwO_swP4dhOhJPe1ZcOIav3rxmovPGY0Ug6s5ZA,7308
732
763
  helm/clients/palmyra_client.py,sha256=4AaZcV2tPHU4HJ9FWSkOY8_C9ndEckH3PH715QxJQ8E,7086
733
764
  helm/clients/perspective_api_client.py,sha256=o_1FFTCrTny6AZ4EJTstX1H9t8SQSQ8dvhi321RTcL4,6105
734
765
  helm/clients/reka_client.py,sha256=hA0tq3Hc9669q2sYa4Jr5yWy2NAbvoFDnVqQ6vds62w,8334
@@ -741,20 +772,21 @@ helm/clients/stanfordhealthcare_openai_client.py,sha256=Qyl8voGz1hJPqT6g4PunMuN9
741
772
  helm/clients/stanfordhealthcare_shc_openai_client.py,sha256=V7K4KZaSjIiE0FkoY4qy6ifJ8pUiNa3vBcWiDsIwXFI,1343
742
773
  helm/clients/test_auto_client.py,sha256=bc-rsMJ8JM0MFnQ4B48hBJ1jL3RtRyVvmPwOgzF2mF8,3155
743
774
  helm/clients/test_client.py,sha256=T27UsIPWsbE1JK_8DN_DW9LkEcIGRbgDjio14YOIAb0,3854
744
- helm/clients/test_huggingface_client.py,sha256=x2NjMuIrinfUy0wQ1S6F5cYZVr09YfvN6LfhWmyGNAM,3388
775
+ helm/clients/test_huggingface_client.py,sha256=8Shzrf1Pad1UsiUAdeOSqsTPQaay0CrWXmdNeIfrJ2Y,3418
745
776
  helm/clients/test_simple_client.py,sha256=G0JRQX69ypQN2VxhlNQXs5u2Tdtkcl_aeHqudDUVKi4,702
746
777
  helm/clients/test_together_client.py,sha256=kyBLu-2i4EJyuJm5ft0yg8W-H1IqmULRXggEbChuxdo,6178
747
- helm/clients/together_client.py,sha256=tgjMlWscrauLFfMxDenh14oEBfLWyP9XYhz--YlvKVw,24264
778
+ helm/clients/together_client.py,sha256=ByImeitpWRhXpZ9U6c0Kol1D8X7Fxno5xgo6D7sZYOY,24201
748
779
  helm/clients/toxicity_classifier_client.py,sha256=AI_FizxMurubTIyeceRdkixSnhWQbcD-oEEONj5ve7o,464
749
780
  helm/clients/upstage_client.py,sha256=iSL1G8G3jWSbrpacz4I0l6Lwc5T01fsLR-wZzF39ftM,679
750
- helm/clients/vertexai_client.py,sha256=PjMnz4u5YQdpIbfLLBFsrPuHCNrj0_fatf1rY89d-nQ,23113
751
- helm/clients/vllm_client.py,sha256=YLIxGoQ_ZXejA4nfVpmFE4tmHROEFxEbFsV8Ba25Eac,1658
781
+ helm/clients/vertexai_client.py,sha256=AxZRpZTRrzxwPs2xwKTgHH0eh7WEmHSS1ArTZwI_q3E,23268
782
+ helm/clients/vllm_client.py,sha256=xmXf35WX2oOZhpQnRxeooXGshENySOHZCUQ1E4pbQbA,2647
783
+ helm/clients/vllm_granite_thinking_client.py,sha256=fds2i8LUG78OJYke1uYdDy6XRFqE3rZgSornFjzu4Sk,2172
752
784
  helm/clients/writer_client.py,sha256=flKLeMbFkyGfNmv1ozZGU4dxNy-QF5bFJF0mGHqpU3c,4467
753
785
  helm/clients/yi_client.py,sha256=nC60d2HiUL2W59FTne9tWmZ9bGGY1OvI7Ob3Ng4wSPE,750
754
786
  helm/clients/audio_language/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
755
787
  helm/clients/audio_language/diva_llama_client.py,sha256=Bvcf4wE7yMZlqETgKEMtCug8-2fQI8QCDdaGWSeQ2X8,4864
756
788
  helm/clients/audio_language/llama_omni_client.py,sha256=OCak716q97uEk9CBXQqnmUsbLFR-dddMzg5eyIZ4gzE,8718
757
- helm/clients/audio_language/qwen2_5_omni_client.py,sha256=lbv6Hr22p0ReyR1bnN-dR8BzdPgilvGES7G03of8BWA,9090
789
+ helm/clients/audio_language/qwen2_5_omni_client.py,sha256=ftAVtOG0azvRQEcFjkSSBMU6SDk9Bi8WIks6o6UCbKQ,9684
758
790
  helm/clients/audio_language/qwen2_audiolm_client.py,sha256=s9eH8fnVgw5xV39b_8AGt6IyNN3q9Uhcx6HZVxt7TM8,8981
759
791
  helm/clients/audio_language/qwen_audiolm_client.py,sha256=RvYweXANEyzhHYDx38H10F0ZEFaL8kj7n7TZ-UrRmZs,6338
760
792
  helm/clients/audio_language/test.py,sha256=FrKpirOwJW1__E2egq4VPgsTrgiSHZHBwfUCvxNjC0o,1969
@@ -840,11 +872,11 @@ helm/common/codec.py,sha256=gTh6AwIQ0Bbul_QSnIO7eItwMZmYtnkIrG1jkc4GOL4,7100
840
872
  helm/common/concurrency.py,sha256=8THtHlCtXo5c8iCuz_UcBBdzZX6aiEALLc4u0M4SYL0,856
841
873
  helm/common/context.py,sha256=0U5KNNKLHiiqjb8JVq03mninagEp9zTzFKP0He8o7A8,2788
842
874
  helm/common/credentials_utils.py,sha256=BX_P6wUpLKA7Bg3Dztm7jVI2j4ls7H-h38UbmGMBt3A,1101
843
- helm/common/critique_request.py,sha256=yo4aRe-DEjudUmydthtpTj6LdhRXfZ3JZptxTkWzZ3U,3068
875
+ helm/common/critique_request.py,sha256=DZhJ_sY2IMluOxz-FeHvuEkA2Ujsx65HXT__7T3UxGk,3005
844
876
  helm/common/file_upload_request.py,sha256=OZeAW1_zsiNdXnWDwNNvhPs0b48TUmW_e4kzzCYmyiY,543
845
877
  helm/common/general.py,sha256=TcdPXn_bgPFvXtFP2lJhncz4Q8SdTXnKOinHOTBsegw,12027
846
878
  helm/common/gpu_utils.py,sha256=pmLq6ipYNLEm28VxxSNeZuVt-gAw-WnYmBvxP1P1p6M,480
847
- helm/common/hierarchical_logger.py,sha256=KR5R7tjUJN-hTFdnfzEyfwAhvgTFH3JJCH-LSiilqLk,4192
879
+ helm/common/hierarchical_logger.py,sha256=qIbhwh-dlCcnYG10qTSMxIMM7_Q9VJj8ymDqnWlseuo,6151
848
880
  helm/common/image_generation_parameters.py,sha256=nsbuk_-BlRMK6IwP5y6BnTXbTRTOcvZ6uLblL5VHLOo,916
849
881
  helm/common/images_utils.py,sha256=8BsN0fd8pc0rh_TSDvippWhTfwmJJXKNF2zqKLB8cps,3372
850
882
  helm/common/key_value_store.py,sha256=D9ZBORzZncf3zHQOP4AuNbQnV8cZpO_kqHY1mDRugqQ,3174
@@ -854,7 +886,7 @@ helm/common/moderations_api_request.py,sha256=3xTsErSsCr2PHD2jpdV1JglHaYHwP2Yqu2
854
886
  helm/common/mongo_key_value_store.py,sha256=G0TIWQcvwMjyXh4TnN6xJ462HKHUAZtQJJYQOrHK-K8,3887
855
887
  helm/common/multimodal_request_utils.py,sha256=n6HgTyHNqfGmU9qmVK-wxQzrkPZ5Wdh-lO_y_ln6VYc,2184
856
888
  helm/common/nudity_check_request.py,sha256=VMsujI_RBy5u_cGEk0teE4KyX1dL2Zt3Pb4U6LpBdSY,728
857
- helm/common/object_spec.py,sha256=_usgTDQULBF6_jy7C6m-9ZNVvNxbGoTE_CdGcSvBASU,4327
889
+ helm/common/object_spec.py,sha256=sKcEdggqRa3a8TovHAS4lf1LaahOFInvMl5DUF4tE6c,5186
858
890
  helm/common/optional_dependencies.py,sha256=Qam3QCHff8tuXbS-fCw-MVe-pK18gSvHw-uQoXXxT7M,616
859
891
  helm/common/perspective_api_request.py,sha256=WAVwtajNVmi5XJNsPcorGEAVrqkpPSk-Kd3b0hJghbA,2427
860
892
  helm/common/reeval_parameters.py,sha256=exaEucXnSI8a076uq_qhO3CTBztMMRoRzL_7v1N4adE,300
@@ -864,6 +896,7 @@ helm/common/response_format.py,sha256=wIptA8FydZoRjMvO5SFIplgDXhwpZvZmFI-Bi-7mcG
864
896
  helm/common/test_cache.py,sha256=j19p-qzv_98X_TMW4b39ZHwSJ-MX3p91PrkYumarS6Y,4870
865
897
  helm/common/test_codec.py,sha256=igL--k-2DwAy0eoMr8D9Xs8MOjBoT0LutbMPzDlTNkM,5885
866
898
  helm/common/test_general.py,sha256=c8Lh0mK8I-SfcMprq909B6zWRBxSBngq2nNL1L6-cYA,1788
899
+ helm/common/test_logging.py,sha256=tkb_QDPkKBfaEQ5Y8Xip9PgMYhqOFakcENqyzO5Mj2o,2681
867
900
  helm/common/test_media_object.py,sha256=SUWLfms_vkXNivRYM0ZT8AI3_2ru6GON5l-Hb-lk-t0,1661
868
901
  helm/common/tokenization_request.py,sha256=NND9ESiiDE0H8QRNpfHVjXS7MQfKKIwtVRKDIjPnnJM,3344
869
902
  helm/common/file_caches/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -871,15 +904,15 @@ helm/common/file_caches/file_cache.py,sha256=QfF1hlF8FQ-rcPn9Zyl6L0dOCokvYgd-dFq
871
904
  helm/common/file_caches/local_file_cache.py,sha256=NiXbat1BBGl5P27oERqSLFfhIHpYqA1IQrvE_N1sWR8,1944
872
905
  helm/common/file_caches/test_local_file_cache.py,sha256=ANb01ctUV-J4i1ab3l4uhg9Ce54U_56xq9Hayjt1WhQ,686
873
906
  helm/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
874
- helm/config/model_deployments.yaml,sha256=ec7CZLii6mpJeNC93J4gMgh1YrkU6Fj2XpXJaes01xY,160890
875
- helm/config/model_metadata.yaml,sha256=JvvKKEePcGCQf_cHGQv-k_Yj4GmB71lvRY2Is176a9s,263155
876
- helm/config/tokenizer_configs.yaml,sha256=Xju6-GcWJD-nmS5U0dUgkOexHWVWCd-J59EiVufoOCs,37687
907
+ helm/config/model_deployments.yaml,sha256=sB3cV6io0NzUQXuKlA49-H3UzOEvWpFDP_MZ30gH0I0,171682
908
+ helm/config/model_metadata.yaml,sha256=0Ps6WlsgElxOpCHVGiWu7QfS0o3Ls4zi1iuwC8PTUgE,269972
909
+ helm/config/tokenizer_configs.yaml,sha256=Vq6MY2nplhYgiyLR98xCXBJWQgEpm64yenrskmkm2NI,40415
877
910
  helm/proxy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
878
911
  helm/proxy/accounts.py,sha256=gd5cKhKeqklf_cXCAISl65AUvZeD6afBNrs6WK3IBvQ,14764
879
- helm/proxy/cli.py,sha256=apG3ByfyMciZFXV5wX2177p1B5eqkxCY6VoRgwJ81Kk,8316
912
+ helm/proxy/cli.py,sha256=kEDoHpisFO0EJ0Wfm1FLpJdP9sXk9j8WCILEq42RKb0,8317
880
913
  helm/proxy/example_queries.py,sha256=EB2vVpAryOUAFiLrwsMiFz0zGl_UAQ8TJ9SkWngvsu4,4389
881
914
  helm/proxy/query.py,sha256=eftbiUICMh8QIHVs-7cLtv_rDXKeKdRPmwjLMu0TDxQ,645
882
- helm/proxy/retry.py,sha256=iLZmKATEJQa9jsSpOIx6YDRhmrA8G1Qm21cUxCuo2Ug,3490
915
+ helm/proxy/retry.py,sha256=o64BZsW2vwu2iewRA18wdsru2xC3eNBQ7WUw3IjC_5g,3698
883
916
  helm/proxy/server.py,sha256=Q4Mzts8mketktGVJ5AoOEA-_SGCue5QeOlK8dqPUuHI,10853
884
917
  helm/proxy/test_accounts.py,sha256=Vs1iOzTPN29LosDAAEs6IagQ3PccvutrJTlR1qNIcj0,1146
885
918
  helm/proxy/test_retry.py,sha256=db0owyGTThmIMhYWU_Eh1U-AJvQ-Wa9j_kRmC9DNjOA,1059
@@ -914,7 +947,7 @@ helm/tokenizers/aleph_alpha_tokenizer.py,sha256=Ofc5thTfW_eb5ztiU-y_0p6e2PIGbHMb
914
947
  helm/tokenizers/auto_tokenizer.py,sha256=Of-T-CFOhLAjjU45T1hnrEPG_k_hzPufuDE7FRAcSN8,4251
915
948
  helm/tokenizers/caching_tokenizer.py,sha256=BwcyVzG7vy3R2O0UgbNxNP2nN4wBnsvpG_9mXQuDYfw,7300
916
949
  helm/tokenizers/cohere_tokenizer.py,sha256=6WwHIt7SsICmYR2QQpwDJ7pfNF8VWrFHFxF5Kynq6aY,2116
917
- helm/tokenizers/grok_tokenizer.py,sha256=Ms7QFYNookeq29AIfHUIXfKhrpRrPOPsNs0zBzWdLKA,2084
950
+ helm/tokenizers/grok_tokenizer.py,sha256=t_cl1BnjRNCW24mU3Z6eAMhh-86FnCcSo-jB2AhvlL4,2142
918
951
  helm/tokenizers/http_model_tokenizer.py,sha256=J5Myg6JVDNgHMN7XOHwGV3WrhilUZ9Sw_FrgO4frYuY,3124
919
952
  helm/tokenizers/huggingface_tokenizer.py,sha256=P2ri4n-SUWB9ShMlxlJ9kO-mPmbSTizMGwAf41JE5ds,8734
920
953
  helm/tokenizers/lit_gpt_tokenizer.py,sha256=0c6KDeLNHPd6h27SXQvkUfmrCSLYa1kQY1GqCHVfhvw,1675
@@ -934,8 +967,8 @@ helm/tokenizers/yalm_tokenizer_data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQ
934
967
  helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py,sha256=1ZcPL3srfk031LmA8bEdPcIraAPnHGiYi_CqTiJSTlc,904
935
968
  helm/tokenizers/yalm_tokenizer_data/voc_100b.sp,sha256=LmPD0_OIOXi8dWuNjXUYOSPhf8kPp2xhvK-g3bXcwrQ,2815034
936
969
  helm/tokenizers/yalm_tokenizer_data/yalm_tokenizer.py,sha256=kH5Qig1_6r_sKbAHinX7C83tqBUoTwbe-gGZCbGVkko,6389
937
- crfm_helm-0.5.6.dist-info/METADATA,sha256=QlR8qMFpWzt_gIs6aCdrEEUuOS5uCdg1kbRMoI7YGYc,23069
938
- crfm_helm-0.5.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
939
- crfm_helm-0.5.6.dist-info/entry_points.txt,sha256=AvH9soAH3uey9xffisWewd0yrmPWGASC036jHd1SFyg,300
940
- crfm_helm-0.5.6.dist-info/top_level.txt,sha256=s9yl-XmuTId6n_W_xRjCS99MHTwPXOlkKxmTr8xZUNY,5
941
- crfm_helm-0.5.6.dist-info/RECORD,,
970
+ crfm_helm-0.5.7.dist-info/METADATA,sha256=TMyCY6K4C2Z3wO2Jh5XVDq-hHQ1xxCArIm31BUeGbgM,23548
971
+ crfm_helm-0.5.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
972
+ crfm_helm-0.5.7.dist-info/entry_points.txt,sha256=AvH9soAH3uey9xffisWewd0yrmPWGASC036jHd1SFyg,300
973
+ crfm_helm-0.5.7.dist-info/top_level.txt,sha256=s9yl-XmuTId6n_W_xRjCS99MHTwPXOlkKxmTr8xZUNY,5
974
+ crfm_helm-0.5.7.dist-info/RECORD,,
@@ -37,7 +37,7 @@ class AIRBench2024Annotator(Annotator):
37
37
  )
38
38
  self._category_id_to_judge_prompt = {row["cate-idx"]: row["judge_prompt"] for row in dataset}
39
39
  # Regex pattern is lenient to allow for typos e.g. extra whitespace
40
- self._pattern = re.compile("##\s*short_reasoning\s*:(.*)##\s*the_score\s*:(.*)", re.DOTALL)
40
+ self._pattern = re.compile(r"##\s*short_reasoning\s*:(.*)##\s*the_score\s*:(.*)", re.DOTALL)
41
41
  self._model = model or self._DEFAULT_MODEL
42
42
  self._model_deployment = model_deployment or self._DEFAULT_MODEL_DEPLOYMENT
43
43
 
@@ -50,7 +50,7 @@ class LiveQAAnnotator(Annotator):
50
50
  cache_dir = os.path.join(file_storage_path, "data")
51
51
  ensure_directory_exists(cache_dir)
52
52
  # Regex pattern is lenient to allow for typos e.g. extra whitespace
53
- self._pattern = re.compile("##\s*short_reasoning\s*:(.*)##\s*the_score\s*:(.*)", re.DOTALL)
53
+ self._pattern = re.compile(r"##\s*short_reasoning\s*:(.*)##\s*the_score\s*:(.*)", re.DOTALL)
54
54
 
55
55
  def annotate(self, request_state: RequestState) -> Any:
56
56
  assert request_state.result