crfm-helm 0.5.6__py3-none-any.whl → 0.5.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic. Click here for more details.
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +56 -49
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +99 -66
- helm/benchmark/annotation/air_bench_annotator.py +1 -1
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +1 -1
- helm/benchmark/metrics/copyright_metrics.py +1 -1
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +1 -1
- helm/benchmark/model_deployment_registry.py +11 -19
- helm/benchmark/presentation/create_plots.py +11 -2
- helm/benchmark/presentation/schema.py +5 -0
- helm/benchmark/presentation/summarize.py +9 -3
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/run.py +7 -1
- helm/benchmark/run_specs/arabic_run_specs.py +73 -0
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +0 -53
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +3 -1
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +48 -1
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +5 -11
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
- helm/benchmark/scenarios/aratrust_scenario.py +76 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/{ultra_suite_asr_classification.py → ultra_suite_asr_classification_scenario.py} +9 -8
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +13 -5
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +13 -5
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +13 -5
- helm/benchmark/scenarios/bluex_scenario.py +66 -0
- helm/benchmark/scenarios/cleva_scenario.py +1 -1
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/math_scenario.py +21 -20
- helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
- helm/benchmark/scenarios/melt_scenarios.py +2 -2
- helm/benchmark/scenarios/mimic_bhc_scenario.py +1 -1
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/seahelm_scenario.py +2 -2
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +8 -1
- helm/benchmark/static/schema_arabic.yaml +228 -0
- helm/benchmark/static/schema_classic.yaml +0 -17
- helm/benchmark/static/schema_long_context.yaml +19 -1
- helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
- helm/benchmark/static_build/index.html +1 -1
- helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
- helm/clients/audio_language/qwen2_5_omni_client.py +19 -7
- helm/clients/huggingface_client.py +2 -2
- helm/clients/openai_client.py +2 -1
- helm/clients/openai_responses_client.py +6 -4
- helm/clients/test_huggingface_client.py +3 -3
- helm/clients/together_client.py +0 -2
- helm/clients/vertexai_client.py +11 -9
- helm/clients/vllm_client.py +43 -7
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/common/critique_request.py +0 -1
- helm/common/hierarchical_logger.py +83 -34
- helm/common/object_spec.py +23 -8
- helm/common/test_logging.py +94 -0
- helm/config/model_deployments.yaml +454 -175
- helm/config/model_metadata.yaml +117 -10
- helm/config/tokenizer_configs.yaml +81 -1
- helm/proxy/cli.py +1 -1
- helm/proxy/retry.py +5 -0
- helm/tokenizers/grok_tokenizer.py +2 -0
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/scenarios/numeracy_scenario.py +0 -794
- helm/benchmark/static_build/assets/index-94295e78.js +0 -10
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
crfm_helm-0.5.
|
|
1
|
+
crfm_helm-0.5.7.dist-info/licenses/LICENSE,sha256=bJiay7Nn5SHQ2n_4ZIT3AE0W1RGq4O7pxOApgBsaT64,11349
|
|
2
2
|
helm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
3
|
helm/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
4
|
helm/benchmark/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -7,20 +7,20 @@ helm/benchmark/config_registry.py,sha256=Cd25a8FHriUzAgvGGU5sBAPyhisdSIjdUJR4YbY
|
|
|
7
7
|
helm/benchmark/data_preprocessor.py,sha256=wqGzAiLwOYa4v6TVPe6ayrnuzdNbmfjeiofRQiO2uso,2201
|
|
8
8
|
helm/benchmark/executor.py,sha256=E7cF1vMXBn5eT1z5Le5ng4M9AaIMLjxfLgMmF1EfZy0,4843
|
|
9
9
|
helm/benchmark/huggingface_registration.py,sha256=DAiHffNmo4H90rBfvQ_LHADtUCnCk6dfpI7Wbat1DZA,4389
|
|
10
|
-
helm/benchmark/model_deployment_registry.py,sha256=
|
|
10
|
+
helm/benchmark/model_deployment_registry.py,sha256=aPBkSr59jqx6ThFW-DYFhi3tPsLLhSKF5JC4-pxqLrk,9011
|
|
11
11
|
helm/benchmark/model_metadata_registry.py,sha256=7XisV0an_edM8hvP8LSoCnTeUN2QLJrQknOCA6-OE7M,8841
|
|
12
12
|
helm/benchmark/multi_gpu_runner.py,sha256=WmTKpVfcKXyiiPzrmxpbvQoZy0Ua8IyPgxB8r_3jrRw,4773
|
|
13
13
|
helm/benchmark/reeval_run.py,sha256=vImL8JNhveEOftZbRQ6JAxF0L-XCKIwh65M6fIYo4RU,7198
|
|
14
14
|
helm/benchmark/reeval_runner.py,sha256=bJPl7XVOVwK2fUA7voOVQYwVFEOfKVnrT2tbSGQzQY8,15584
|
|
15
|
-
helm/benchmark/run.py,sha256=
|
|
15
|
+
helm/benchmark/run.py,sha256=ZyqkKnqkMqM2AH4HL6sH72H8-mrDWu0NW0piE7BY0HM,13973
|
|
16
16
|
helm/benchmark/run_expander.py,sha256=hKFLpmq8W2KBl_mBf-ahHEbt67qZFgu-VxjvidOeQuE,56543
|
|
17
17
|
helm/benchmark/run_spec.py,sha256=GiIU8iGO2FGYFDWIxt51CeNPsW7rM7BzDqH1KgEL1cg,3217
|
|
18
18
|
helm/benchmark/run_spec_factory.py,sha256=Hxeft3fXoWNz9yGo-2nIfb5pd3GDWlwYWc6YYvAkTjM,7785
|
|
19
19
|
helm/benchmark/runner.py,sha256=O-91eRRrNgE4_tlCVeLq9_0QsRfNELvaQT-KWtJw894,14618
|
|
20
20
|
helm/benchmark/runner_config_registry.py,sha256=2gW5wBLkHdYb2WNbZulto06hTcto2ROvjy8HULw3jNM,515
|
|
21
21
|
helm/benchmark/server.py,sha256=uphh9L0FQnVZVVoGx50MMb_jXh-uen6ouE3uDN5GKFE,6422
|
|
22
|
-
helm/benchmark/slurm_jobs.py,sha256=
|
|
23
|
-
helm/benchmark/slurm_runner.py,sha256=
|
|
22
|
+
helm/benchmark/slurm_jobs.py,sha256=6m11gyMo-cA2dwxR2pBXv4tEds5Aok4YCQQyHRmPoPk,3164
|
|
23
|
+
helm/benchmark/slurm_runner.py,sha256=T4vSoxwdRR8gqyL4S2sw_Le-9rv9BPC0BlOy88pwt70,16785
|
|
24
24
|
helm/benchmark/test_data_preprocessor.py,sha256=_esdtkqyU_8Yp5ZOO7n1b-Y4Qc28wpD5drG-4Y4UhIM,2219
|
|
25
25
|
helm/benchmark/test_run_expander.py,sha256=gLeHkNt_nLgbwEJiYxhwda-eKA3sJAxkYolCvgRN5TY,1163
|
|
26
26
|
helm/benchmark/tokenizer_config_registry.py,sha256=ZOImg38ta0FXZYAWna6q7A5xrG2mU7Ofr-8j4EqGlUY,1585
|
|
@@ -56,7 +56,7 @@ helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimoda
|
|
|
56
56
|
helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py,sha256=6nuz0Vn89A1mOedutsiq2SwTOG3qn8dUZTiaXhKffiw,3587
|
|
57
57
|
helm/benchmark/annotation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
58
58
|
helm/benchmark/annotation/aci_bench_annotator.py,sha256=SjXidlbpm5HOhdhNXg3HjabMEQvt3hq1iJ5GPajxt8M,3228
|
|
59
|
-
helm/benchmark/annotation/air_bench_annotator.py,sha256=
|
|
59
|
+
helm/benchmark/annotation/air_bench_annotator.py,sha256=CDyHVwD4eoymfLduJC5WvvhDX1DOgYBqgjvqBjoCfU8,3501
|
|
60
60
|
helm/benchmark/annotation/annotator.py,sha256=__BkMVpAEpSs1pbwPK5sVWLdCAXnjsHcPYgmOqmNPu0,1843
|
|
61
61
|
helm/benchmark/annotation/annotator_factory.py,sha256=8uo5uz1UpIVCHUd7CRvmy6b9XB1gspdHmgxH5UZMPVI,2335
|
|
62
62
|
helm/benchmark/annotation/anthropic_red_team_annotator.py,sha256=4hob15m2k9e2A97E0aG9FstCbJ_oMM7-9y-nh2EaYqc,2395
|
|
@@ -72,7 +72,7 @@ helm/benchmark/annotation/ehr_sql_annotator.py,sha256=Izpq0biZ9lkJOPk6NwTuv2wk8B
|
|
|
72
72
|
helm/benchmark/annotation/financebench_annotator.py,sha256=gNERLY35t2kcpayXGGrY4-pBs2jbEUomqElRYbb9nho,4150
|
|
73
73
|
helm/benchmark/annotation/harm_bench_annotator.py,sha256=zhkWnV3qZgY-nvHgQRHGrrCMC7605JwFHesY7UC3ZnQ,2293
|
|
74
74
|
helm/benchmark/annotation/helpdesk_call_summarization_annotator.py,sha256=I7TjpN502Sa-Z4uUKemJXSAdOiVA3MMO92YIAAXeDBg,6034
|
|
75
|
-
helm/benchmark/annotation/live_qa_annotator.py,sha256=
|
|
75
|
+
helm/benchmark/annotation/live_qa_annotator.py,sha256=PSff59mU_t3ypmptYsYRKU3m1vMLF0dMyUySIOxBrPw,3553
|
|
76
76
|
helm/benchmark/annotation/med_dialog_annotator.py,sha256=OVTFIlvdhcOr_hdK0tnrDes9hYdN1mDWFTp4GDYY7O0,3162
|
|
77
77
|
helm/benchmark/annotation/medalign_annotator.py,sha256=8edAZh8oQgDKUT1bQ3Hp2NBE-QnBZ_-ZQjHkV7YKWhs,3240
|
|
78
78
|
helm/benchmark/annotation/medi_qa_annotator.py,sha256=v8e6hkHZX1x9KtTedCnpCseh-Y72z5kUgUrXHWPUkX8,3074
|
|
@@ -142,22 +142,27 @@ helm/benchmark/metrics/cleva_harms_metrics.py,sha256=xVubv2pG3iinVs3namoVHWAmV9o
|
|
|
142
142
|
helm/benchmark/metrics/cleva_metrics_helper.py,sha256=8UwiGhekUmp7DxYWU4rxqX2v3ewkg-O5-jOh49iOGmc,304
|
|
143
143
|
helm/benchmark/metrics/code_metrics.py,sha256=SebQ5MXJe_phTiMfGMfhgYago-hwh_g9ctBWEHGqCnU,5230
|
|
144
144
|
helm/benchmark/metrics/code_metrics_helper.py,sha256=UNai154RuhYRZM_YK-rveLct4Ui5iEBNPYmYdKq34Xs,22712
|
|
145
|
-
helm/benchmark/metrics/
|
|
145
|
+
helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py,sha256=biKk67r4ij3pK2L0OuGTJ4BAb8ig5tpGAV86uBD1qNs,7832
|
|
146
|
+
helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py,sha256=QrePgX-1UALQKs1dHMfOm1qoALvOU1pbLyC4JmcINx8,19083
|
|
147
|
+
helm/benchmark/metrics/codeinsights_correct_code_metrics.py,sha256=CQs9HXh7P1vzkKWdpvugvttD_8ZF6W_QPp7_rhYFwsY,13873
|
|
148
|
+
helm/benchmark/metrics/codeinsights_edge_case_metrics.py,sha256=B7EEELwwH67VxmgrTBSP25Etyb5XYIDuadfggMrHmcE,3866
|
|
149
|
+
helm/benchmark/metrics/codeinsights_metric_specs.py,sha256=BkKWII9yTkChdZVsGeeeCbiWQDYvvcAKo0nxi_RTTUk,1798
|
|
150
|
+
helm/benchmark/metrics/comet_metric.py,sha256=EJWZ9x8CGeDDQlfxYrY-np_NVJBt5gun0XLJvtpjXVI,4798
|
|
146
151
|
helm/benchmark/metrics/common_metric_specs.py,sha256=JKqmO4ovBdfOYKC-00OSzOMv--g9NTCVfUHLaz-1Uns,6025
|
|
147
152
|
helm/benchmark/metrics/conv_fin_qa_calc_metrics.py,sha256=Zrf6HyH_WNe7gGFgW0j8FJlX5KZvbk-05iX8QFPJDyU,2656
|
|
148
|
-
helm/benchmark/metrics/copyright_metrics.py,sha256=
|
|
153
|
+
helm/benchmark/metrics/copyright_metrics.py,sha256=RYOWKFN97UCD2Vj51gzKGbnnY9wAq6KJgiRt2cecVfs,7824
|
|
149
154
|
helm/benchmark/metrics/czech_bank_qa_metrics.py,sha256=bKoooK2T5v_fFKNbUnsuW6Mv9muAirJD5lTrzuHfpz8,1113
|
|
150
155
|
helm/benchmark/metrics/decodingtrust_fairness_metrics.py,sha256=x66XP0iQGk4ThT7ddmrlLCA0XF4arRbQMDT42LHf2kE,3297
|
|
151
156
|
helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py,sha256=TxTkkWdx6d6ym0MirZTiucl_TWFdn4uJLnlTfLjQvgk,2925
|
|
152
157
|
helm/benchmark/metrics/decodingtrust_privacy_metrics.py,sha256=OU7lka-hm6PubR5Gjj4uNyrqhjlfhe0mmjBCAz9vlRs,3456
|
|
153
|
-
helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py,sha256=
|
|
158
|
+
helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py,sha256=bW4zafRyKFa__8fGrdiTPUu848ovNnvakLCfqcMrcHk,6461
|
|
154
159
|
helm/benchmark/metrics/dischargeme_metrics.py,sha256=D8LI52E17hNSPDpEvb2tw1za4QWDE3p9xgx7Nm9l7_Y,454
|
|
155
160
|
helm/benchmark/metrics/disinformation_metrics.py,sha256=5n8wgRBb6FaDjqe1nR3Cj9aS48esmMsIUq4KpBHoQoU,7870
|
|
156
161
|
helm/benchmark/metrics/dry_run_metrics.py,sha256=Ss0lzf944HIbL1CX6QuJpGFPqOzhBT0qVWLNR1BoEjk,3784
|
|
157
162
|
helm/benchmark/metrics/efficiency_metrics.py,sha256=SJqpA1d_GfBPl9H6moai8ra1GVe7tlaCfg3PeiWT54c,11845
|
|
158
163
|
helm/benchmark/metrics/ehr_sql_metrics.py,sha256=YRjvPIty7zlyoyGD6wo3HYOz7y_PThySOZzVRJ38iww,4797
|
|
159
164
|
helm/benchmark/metrics/evaluate_instances_metric.py,sha256=LGk1Dv_76Ak0YUlWKFTsOLEFiBSmcGVhNrbj_4zg9g4,2913
|
|
160
|
-
helm/benchmark/metrics/evaluate_reference_metrics.py,sha256=
|
|
165
|
+
helm/benchmark/metrics/evaluate_reference_metrics.py,sha256=T3pftPfYEUR88NEZEZuzYOTNoHELo7nSbz4qmxN8oQc,19628
|
|
161
166
|
helm/benchmark/metrics/fin_qa_metrics.py,sha256=MtXxGMGYiCiwCD1CclBXPopzly-Tz3zJTrXJaHYTXn4,2470
|
|
162
167
|
helm/benchmark/metrics/fin_qa_metrics_helper.py,sha256=sH5FIpsxxGUkXO21YGS2EtVsev1EdQ44lYoqFZPSSGo,11884
|
|
163
168
|
helm/benchmark/metrics/gpqa_chain_of_thought_metric.py,sha256=HRRKkcTbCu5ScOVwmjzYaA7UAEGE_AJUZVOCDRuv4Po,4321
|
|
@@ -171,6 +176,8 @@ helm/benchmark/metrics/kpi_edgar_metrics.py,sha256=1GsW-nBz8TgP4wFIVEGA4_BhI17ki
|
|
|
171
176
|
helm/benchmark/metrics/language_modeling_metrics.py,sha256=yS7k8iFjxfkckSBA0RVA7VdOivSEBtNzCjczK6We7y0,4598
|
|
172
177
|
helm/benchmark/metrics/live_qa_metrics.py,sha256=f2XFmQaohjQNqYqNg8NcDVavCzyP4cd8Cl8rLArn9EM,816
|
|
173
178
|
helm/benchmark/metrics/llm_jury_metrics.py,sha256=yzAsdacyX0MFJy2qKIjhI0y7JvtflELpCh6R14wuCgk,1704
|
|
179
|
+
helm/benchmark/metrics/lmkt_metric_specs.py,sha256=0Fa0xLjQDXwsRCE5VqGzEfb5ZdzKsDoSCwR_zHogFcc,376
|
|
180
|
+
helm/benchmark/metrics/lmkt_metrics.py,sha256=GaZTfl-NQXa1YSzcJUGlZ5wZURH1CnJxGkPFBj8ydTQ,1856
|
|
174
181
|
helm/benchmark/metrics/machine_translation_metrics.py,sha256=22vaGBCSw12uM1wmtDG-MBBZW8OiTZwNPaerjckdtDE,3860
|
|
175
182
|
helm/benchmark/metrics/med_dialog_metrics.py,sha256=kzmrkQcmJ15zuOF9_Onk9N0oeNeyl9Rri1JEb1AqRT4,447
|
|
176
183
|
helm/benchmark/metrics/medalign_metrics.py,sha256=q6l8p5Pie-H9pxhaA-lQkSOnliJWXr6zUeN8syEQ91Q,439
|
|
@@ -181,7 +188,7 @@ helm/benchmark/metrics/medication_qa_metrics.py,sha256=wit3nKNWpGFfgauu6Xye2IDTe
|
|
|
181
188
|
helm/benchmark/metrics/melt_bias_metric.py,sha256=mHDCkRGLD-0pyJA_depi_KX3sn7g7Bgd3_m0XdLQahY,11520
|
|
182
189
|
helm/benchmark/metrics/melt_bias_word_lists.py,sha256=xA0araUdszAIOqfxiTi6MIJhKYwr_Gwsc1L9qinZx9U,27891
|
|
183
190
|
helm/benchmark/metrics/melt_metric_specs.py,sha256=zaeV57LQEl8qK7be36NaojiUJlzmkoKY8JyOkOVuPqs,1619
|
|
184
|
-
helm/benchmark/metrics/melt_toxicity_metric.py,sha256=
|
|
191
|
+
helm/benchmark/metrics/melt_toxicity_metric.py,sha256=ni6bb_QC51NM5jQpbFYLWtsQy3tNOLwQ_5b3PDV5vVk,4193
|
|
185
192
|
helm/benchmark/metrics/mental_health_metrics.py,sha256=4HXCXl2GxFPn6wDzHptHeBTuP4BJVLUzEUKffpd5R_k,462
|
|
186
193
|
helm/benchmark/metrics/metric.py,sha256=jqQyiKDq_pQv-ulGqfZI56ydRDQs3N3XhfHIPysUhrk,14311
|
|
187
194
|
helm/benchmark/metrics/metric_name.py,sha256=POhgmUqqIWh_LjCbYpiKkzGqqChBLeW3FADy9u_FcWw,1354
|
|
@@ -192,7 +199,6 @@ helm/benchmark/metrics/mimiciv_billing_code_metrics.py,sha256=Pu9efXoBrhsvxSeGHq
|
|
|
192
199
|
helm/benchmark/metrics/mtsamples_procedures_metrics.py,sha256=XrddVk-gnc8jF8amCI1RBa_XTS9yEXD2Y9Ld9W7Q-m8,497
|
|
193
200
|
helm/benchmark/metrics/mtsamples_replicate_metrics.py,sha256=rmH34aTX_wZWxLi4jrxf3sR1RIqNRF0QDANLRQUGhqM,492
|
|
194
201
|
helm/benchmark/metrics/nltk_helper.py,sha256=QMEps-lqJZ_pCgvjlMf4BvC0pzDu3ez5jit5F4p8dAk,1313
|
|
195
|
-
helm/benchmark/metrics/numeracy_metrics.py,sha256=3E-CMmB2wuGW5tLjmEm8wFMf85DJ1ZDUANfh84SQuP0,2906
|
|
196
202
|
helm/benchmark/metrics/omni_math_metrics.py,sha256=Gqih87UrE93-a0hbRhTBkjmfGLNTkuKQGaG-sTQeuG8,1287
|
|
197
203
|
helm/benchmark/metrics/openai_mrcr_metrics.py,sha256=TAop7G50FKaR-Jyo2EGLqmMOfJRmS2vNRDFiifa6mhg,2313
|
|
198
204
|
helm/benchmark/metrics/output_processing_metric.py,sha256=ey9UBi2f3780OwFlp82ymzfjLR3MA2fpA9vW5R4W5TA,2581
|
|
@@ -216,7 +222,6 @@ helm/benchmark/metrics/test_classification_metrics.py,sha256=CRDMGmVmzEUnNaM0C02
|
|
|
216
222
|
helm/benchmark/metrics/test_disinformation_metrics.py,sha256=U3ZmS9s33oimTQbKO-7pgWeX_WiDB9chlOCtf_vslXw,2249
|
|
217
223
|
helm/benchmark/metrics/test_evaluate_reference_metrics.py,sha256=B7xtDDWPAxF7d-vcUx_R51hFMae-DD52nUwbu_eWt6Y,1601
|
|
218
224
|
helm/benchmark/metrics/test_metric.py,sha256=0sGlXE3_Al_VyKpOPBhQR_xT-XrcVgGepLpwut37DmA,771
|
|
219
|
-
helm/benchmark/metrics/test_numeracy_metrics.py,sha256=ls1ZIHDePKpHMoqAbf4HmJ1SIBjLFuLIzGbfg6OiZvM,4162
|
|
220
225
|
helm/benchmark/metrics/test_statistic.py,sha256=yK6m2BZ5UXWmb2D1cQzDH_2ELvrNDaR_lyzX4WoHw9Q,1273
|
|
221
226
|
helm/benchmark/metrics/toxicity_metrics.py,sha256=ZLOzxDlMgbljl-9y6vT2ZgwdhsBZ4MfV-T66VpKk00U,4114
|
|
222
227
|
helm/benchmark/metrics/toxicity_utils.py,sha256=-bfittLtMkHyV5wu-hj6KVtaiNGgVIO5duUmThBlX8w,988
|
|
@@ -230,7 +235,7 @@ helm/benchmark/metrics/ifeval/instructions_util.py,sha256=VhkJfZLCaHi094rZSoeQbo
|
|
|
230
235
|
helm/benchmark/metrics/image_generation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
231
236
|
helm/benchmark/metrics/image_generation/aesthetics_metrics.py,sha256=UqjBgAi1ylegvHBjALJ8vxINhHEqqr2fSvN9lXgyIZk,2140
|
|
232
237
|
helm/benchmark/metrics/image_generation/aesthetics_scorer.py,sha256=ISdThDKMrx-SHQe69dCcr8qUrMCa_GsxX3BeZnd0WPA,2538
|
|
233
|
-
helm/benchmark/metrics/image_generation/clip_score_metrics.py,sha256=
|
|
238
|
+
helm/benchmark/metrics/image_generation/clip_score_metrics.py,sha256=0B2WCTP5LDHDbWGoMW2mKnnImHt-QYEU2QzqYf4HxjQ,3812
|
|
234
239
|
helm/benchmark/metrics/image_generation/denoised_runtime_metric.py,sha256=Nom_yw15ePU7wUuV2DFHpLnEAqaZQjlkW9LowRElOAI,1646
|
|
235
240
|
helm/benchmark/metrics/image_generation/detection_metrics.py,sha256=mfYoPbLCmqWxqMSXbcX6TM0niNnpCeipcHImuV3mZ3c,2160
|
|
236
241
|
helm/benchmark/metrics/image_generation/efficiency_metrics.py,sha256=neeNJNtHAVUMWqr5rvRIRlPKl225cXUGCURLB0z-rKQ,1459
|
|
@@ -253,7 +258,7 @@ helm/benchmark/metrics/image_generation/detectors/__init__.py,sha256=47DEQpj8HBS
|
|
|
253
258
|
helm/benchmark/metrics/image_generation/detectors/base_detector.py,sha256=e4c8vPfioGzl2ftYzWOFIBDJcZJxBmpjU13n4fXaSvY,226
|
|
254
259
|
helm/benchmark/metrics/image_generation/detectors/vitdet.py,sha256=kxXS8uNBC0pQ7LatuN85CXU8pJHZn0pJXY0rOLd_39g,7526
|
|
255
260
|
helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
256
|
-
helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py,sha256=
|
|
261
|
+
helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py,sha256=NwE85dtiVSlCJc50E57pkckgnCiKBsW0nF3cqgc2EUo,2128
|
|
257
262
|
helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py,sha256=5qKL-gHnEVmzSDW2GKDq6Uox_EJMDLe0QA55Nrl4H6s,1472
|
|
258
263
|
helm/benchmark/metrics/image_generation/q16/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
259
264
|
helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py,sha256=8Y5h-6RMjivm50RnNbNwV7wCug4RhKT5g8R_YeEp54I,3467
|
|
@@ -262,7 +267,7 @@ helm/benchmark/metrics/image_generation/watermark/__init__.py,sha256=47DEQpj8HBS
|
|
|
262
267
|
helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py,sha256=Ir4u8blJWTRtEBogb6u22qCy3JXAIzvx-Th6dSBLfdw,698
|
|
263
268
|
helm/benchmark/metrics/image_generation/watermark/watermark_detector.py,sha256=w6WnTc6t6zx0W0gTjgedXC9OO5dq5iWpx9UcnioKml4,3641
|
|
264
269
|
helm/benchmark/metrics/summac/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
265
|
-
helm/benchmark/metrics/summac/model_summac.py,sha256=
|
|
270
|
+
helm/benchmark/metrics/summac/model_summac.py,sha256=ccOP0z4WEpR26iAzzTWviFfX33Cg9MdpZgKgSRQc9D8,17445
|
|
266
271
|
helm/benchmark/metrics/summac/utils_misc.py,sha256=7_Q1c72cKt8PWtxn8u4R8nB53HK6_JF2nP8bBXYNk-A,1485
|
|
267
272
|
helm/benchmark/metrics/tokens/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
268
273
|
helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py,sha256=XDZGK8h84F2w_pK8Zjko8ssKZmVxKFqTOuHL0mLBzMY,694
|
|
@@ -280,43 +285,49 @@ helm/benchmark/metrics/vision_language/image_metrics.py,sha256=RgKAn7ftl4KCZ86V3
|
|
|
280
285
|
helm/benchmark/metrics/vision_language/image_utils.py,sha256=xwtydR8-s23cJacIGXDXL_pUhAqi6O5CbhM4XNEFlDo,3787
|
|
281
286
|
helm/benchmark/presentation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
282
287
|
helm/benchmark/presentation/contamination.py,sha256=07IuIP92vfuI0GwfeNC-i_NZUlF8N1azzagC19YHOMQ,2802
|
|
283
|
-
helm/benchmark/presentation/create_plots.py,sha256=
|
|
288
|
+
helm/benchmark/presentation/create_plots.py,sha256=bM6UNzH0Bx8Bv2iKcyMoYp7IwfCZSQob-w_XOOI6r1M,29090
|
|
284
289
|
helm/benchmark/presentation/run_display.py,sha256=LmY2HES4dU94kRYuUxt-c9LTMDN6MU5CspWTF6rZwDo,12419
|
|
285
290
|
helm/benchmark/presentation/run_entry.py,sha256=J1QgLOP99N7N4bs7nzXWxyU3pOd-a1j8xwL9ag1nP_Y,1158
|
|
286
|
-
helm/benchmark/presentation/schema.py,sha256=
|
|
287
|
-
helm/benchmark/presentation/summarize.py,sha256=
|
|
291
|
+
helm/benchmark/presentation/schema.py,sha256=j3gOhj-okQ4qzYoMh5N3ltsL0OXiOGuB7ydF-SI-Ug4,11229
|
|
292
|
+
helm/benchmark/presentation/summarize.py,sha256=_d3gd45eBpx8yMnVq1XgF9D-pPMcpbuwseSZz4giybo,60092
|
|
288
293
|
helm/benchmark/presentation/table.py,sha256=-foH1BIfMiD6YvpwoGJ910CH7Hib-_pYtHH1hE8zwNc,2904
|
|
289
294
|
helm/benchmark/presentation/test_contamination.py,sha256=RlihBOF6vx2tKEj6_EMnJojTYoStx0FUeJSLT1bdf8w,509
|
|
290
|
-
helm/benchmark/presentation/test_create_plots.py,sha256=
|
|
295
|
+
helm/benchmark/presentation/test_create_plots.py,sha256=1FrJZnPW-5QUQKt_pf4y47uDha4B8wHyY1o5hqhKWhc,1293
|
|
291
296
|
helm/benchmark/presentation/test_run_entry.py,sha256=4n484sSYT0gQ4WVt67Fs3ctKa4vi97hI32O5XXxGY1o,794
|
|
292
297
|
helm/benchmark/presentation/test_schema.py,sha256=6mq6CeAOLW2Kxi1lX_ZW8QCVqVR73XImR8ylcRGFkBE,378
|
|
293
298
|
helm/benchmark/presentation/test_summarize.py,sha256=GzZNwBDybpstzl6wT0Rgqn75N9iCNrUIzrdjOfUolu0,6317
|
|
294
299
|
helm/benchmark/presentation/torr_robustness_summarizer.py,sha256=SmMOZWCQ-KaJBp78otwvAeE1btWignyWalaQ8QG87r4,8242
|
|
295
300
|
helm/benchmark/run_specs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
296
301
|
helm/benchmark/run_specs/air_bench_run_specs.py,sha256=K86SqpINMBOiLIpuHz-jwlQL3SrH6n6WbqjD90i4LQQ,2231
|
|
302
|
+
helm/benchmark/run_specs/arabic_run_specs.py,sha256=p5KPvcugJI3ERYhO7Le_aiKOZ4IM2EOvsXEmZE8R4Wc,3014
|
|
297
303
|
helm/benchmark/run_specs/audio_run_specs.py,sha256=baJz5LZiwWZP3KD0hluKgpidtswzdorQnshX0CoqKAc,23383
|
|
304
|
+
helm/benchmark/run_specs/bluex_run_specs.py,sha256=OHweBHS8JC-k9_e5Zq1LUU2FZhJ2P7SDshatX-N15Ls,1798
|
|
298
305
|
helm/benchmark/run_specs/call_center_run_specs.py,sha256=QhRQw91WblB9UaB319XNCO5K8PX8Riiza41Ym-1CcRU,7044
|
|
299
306
|
helm/benchmark/run_specs/capabilities_run_specs.py,sha256=sbqhIj4AoujV45erwoVK61lWdlkjg4qssmGlu0eSr1U,12067
|
|
300
|
-
helm/benchmark/run_specs/classic_run_specs.py,sha256=
|
|
307
|
+
helm/benchmark/run_specs/classic_run_specs.py,sha256=4DA-21Tiz87dQ_iklyrKpfsyTw2f51tbwtRvv3Zs57s,53727
|
|
301
308
|
helm/benchmark/run_specs/cleva_run_specs.py,sha256=lEIHEqQY3Efx-sl2Z6Rq9Qq_1HEWHqFYuUkZbGvq66s,13387
|
|
309
|
+
helm/benchmark/run_specs/codeinsights_run_specs.py,sha256=lz3yysrPjCIiObzrIkRjJsWzkABh9qIXn-o7FSqZPl0,9207
|
|
302
310
|
helm/benchmark/run_specs/decodingtrust_run_specs.py,sha256=7slILDS9f0_Z0y-Pz5xEspoGQUmOCOI2K2r4XWUVsm8,14428
|
|
303
311
|
helm/benchmark/run_specs/enem_challenge_specs.py,sha256=5UWeP2bsnwCHMMXI3DFRMUPKcnJ9_EL01qPUthbWIvE,1351
|
|
304
312
|
helm/benchmark/run_specs/enterprise_run_specs.py,sha256=ul2YMPpvThOmi7yIc6xR3W0rtE-8tUIaIzuhGlMg2rY,9598
|
|
305
313
|
helm/benchmark/run_specs/experimental_run_specs.py,sha256=tIgAdK3cm4t6ZBGkcPcPkxx0XAslKShYA1i3QxWVJEY,7675
|
|
306
314
|
helm/benchmark/run_specs/finance_run_specs.py,sha256=5mwb7GbAcSLVZiumqCiAr9dr8qBYApkEt5Oben5CFXs,4371
|
|
307
|
-
helm/benchmark/run_specs/
|
|
315
|
+
helm/benchmark/run_specs/healthqa_br_run_specs.py,sha256=515pDZf8rTpvebPmhr9pqY2c08Ey_OtWIGsFDVVcQqI,1416
|
|
316
|
+
helm/benchmark/run_specs/heim_run_specs.py,sha256=9uOB_eW5bQqoP9eYRaJ2bcigPg75pQLQnyQ67fG9wHo,22226
|
|
308
317
|
helm/benchmark/run_specs/imdb_ptbr_run_specs.py,sha256=nkW5A_xeD5kCKeJVxsL8RFS8r3UpP_WCcwSdMh2s850,1215
|
|
309
318
|
helm/benchmark/run_specs/instruction_following_run_specs.py,sha256=GElJhgbQhlZMYSAM4YyGcYq0pqycR32kBCoHqG6m-ZY,4177
|
|
310
319
|
helm/benchmark/run_specs/lite_run_specs.py,sha256=8OkL9g3wQBG96g0ijGZ9L1Trb59b7VPDyYMqvA3hXfE,11129
|
|
311
|
-
helm/benchmark/run_specs/
|
|
320
|
+
helm/benchmark/run_specs/lmkt_run_specs.py,sha256=tNZvlA4mXUX-NBC9enRR90qFLeh8SNGFq701rXmXc18,5376
|
|
321
|
+
helm/benchmark/run_specs/long_context_run_specs.py,sha256=A3yhg1IEds7kQWxkRYH7WVkMPouA1xDz28uxpHgwJvE,6229
|
|
312
322
|
helm/benchmark/run_specs/medhelm_run_specs.py,sha256=--KgkjVwKt4uyiTebalrbeGV4FB-jGqPciYjFZED7zA,43407
|
|
313
323
|
helm/benchmark/run_specs/melt_run_specs.py,sha256=729MkALud2wG07yulx9zqAzejdXW_eVGkfF5cQWeGGY,32031
|
|
314
324
|
helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py,sha256=kenpGGMK1XXaNtvNXsshPvdvN9ubv1sOfaPdjFM4obA,2034
|
|
325
|
+
helm/benchmark/run_specs/multilingual_run_specs.py,sha256=umf8e6ZDgRXiU0G_BPoovj1UZ_dxyrXtIQ7i9WC6USg,2296
|
|
315
326
|
helm/benchmark/run_specs/oab_exams_specs.py,sha256=ws7Vppo_zJvxKqQ_sNhm9N7-5eQbX2CBkcDI5c_sRG4,1658
|
|
316
327
|
helm/benchmark/run_specs/safety_run_specs.py,sha256=3X6tYaq2SlRsZs9q6SCtBUgjNEpOwUtV6M7iY2Kowm0,6807
|
|
317
328
|
helm/benchmark/run_specs/seahelm_run_specs.py,sha256=R3mg4_OoaRizZ5n0FHcUQpJLny3j-ulBlHzOyF0a0Ok,23904
|
|
318
329
|
helm/benchmark/run_specs/simple_run_specs.py,sha256=0kK_e8U4JUWZ6wO4N-GPFRE1iGT4ilvSMUGfirvpIE0,3837
|
|
319
|
-
helm/benchmark/run_specs/speech_disorder_audio_run_specs.py,sha256=
|
|
330
|
+
helm/benchmark/run_specs/speech_disorder_audio_run_specs.py,sha256=P1mxSu7ErjiK0ARbRmbIzFwYO3fC-6QpsZQeez4U3qI,7346
|
|
320
331
|
helm/benchmark/run_specs/sql_run_specs.py,sha256=JWCICELKi81m11MggyR6CJNl3vpWPwk4kr8DZSsWvj4,1965
|
|
321
332
|
helm/benchmark/run_specs/tweetsentbr_run_specs.py,sha256=qogc-fb83Rh1DooKKaskhak52ycvu8DAnhabw9rc7yA,1129
|
|
322
333
|
helm/benchmark/run_specs/unitxt_run_specs.py,sha256=4Vbsq0MPpSe4cIJOXzeVpMm60N9Qafa2R85X5BeFQew,1873
|
|
@@ -325,8 +336,11 @@ helm/benchmark/run_specs/winogrande_afr_run_specs.py,sha256=dhOm8z6Q_ZpnzYKrsS0n
|
|
|
325
336
|
helm/benchmark/scenarios/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
326
337
|
helm/benchmark/scenarios/aci_bench_scenario.py,sha256=W8h7eWz9mjR0kRAffKWSnA1Fs8t2l83sPyW8fjPOxWQ,5670
|
|
327
338
|
helm/benchmark/scenarios/air_bench_scenario.py,sha256=B6_WMowLFe4gWfnoFA_yrHe0kagbIkZabEnK4kGGqSU,1884
|
|
339
|
+
helm/benchmark/scenarios/alghafa_scenario.py,sha256=FJXO3W6qYzCgLJMSiJEhpddNcFyR3N5Brh8pATW_9GM,5217
|
|
328
340
|
helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py,sha256=jnUGbppDGEsbe5xoJjmv7nW_RvwPIYm6cwSULeqk2Fk,5133
|
|
329
341
|
helm/benchmark/scenarios/anthropic_red_team_scenario.py,sha256=_OWE33eVRaZI0gmfP7bLd572uOi_6jb39z_J6nkcvfg,3182
|
|
342
|
+
helm/benchmark/scenarios/arabic_mmlu_scenario.py,sha256=mI6ttMFAT3sH_v87qVNxYptqDS2EMUhK0b8vpfePSdY,2807
|
|
343
|
+
helm/benchmark/scenarios/aratrust_scenario.py,sha256=G20j6Z-C_6bUJf-bpdyUN23Hb7XK0YtieUprq_5Z5hA,2552
|
|
330
344
|
helm/benchmark/scenarios/autobencher_capabilities_scenario.py,sha256=fOCHumFWZa4OJZcTZefJiJbdWsb3zjQnWLJYd10Cctw,2496
|
|
331
345
|
helm/benchmark/scenarios/autobencher_safety_scenario.py,sha256=MFt3f5baN5r-FmzWZfUChGR1mX_PUB_5hxoINac_Whs,1854
|
|
332
346
|
helm/benchmark/scenarios/babi_qa_scenario.py,sha256=SyM6RP4v08B1PjumkdQnuKrM9L8SyK0bXbx-LgmyTPo,5067
|
|
@@ -337,6 +351,7 @@ helm/benchmark/scenarios/bigcodebench_scenario.py,sha256=zQLv91uwfGAR9N4jm_iBUmY
|
|
|
337
351
|
helm/benchmark/scenarios/bird_sql_scenario.py,sha256=n5elzanKEX9YclAl2l1y33aCjihTmaw1VF_ZsAU5IaM,3613
|
|
338
352
|
helm/benchmark/scenarios/bird_sql_scenario_helper.py,sha256=FIwPk-dwfTY-8gDXeAiTZbfbS0Oe1OuWRlYiJOhZwk4,4664
|
|
339
353
|
helm/benchmark/scenarios/blimp_scenario.py,sha256=9Ge3QKRgtVHpWy7aehZVKiO6JrsxK7zrEdtqAb4zxtQ,6284
|
|
354
|
+
helm/benchmark/scenarios/bluex_scenario.py,sha256=eHAltiFqZ_bS0AVi0kbskTlxJbQXy7Sqj6E9nZPWqCc,2500
|
|
340
355
|
helm/benchmark/scenarios/bold_scenario.py,sha256=iE9drB9IeXfRn3xvLnaQi3-nJAp-bV1RE0GJGnp9dJc,4130
|
|
341
356
|
helm/benchmark/scenarios/boolq_scenario.py,sha256=wPETIu5jcI4jgP5GoFa_xi4SsvHtS9gxQ5TD8neHmdk,8037
|
|
342
357
|
helm/benchmark/scenarios/call_center_scenario.py,sha256=19J2N57WnUkPMGRRbJyZak8YCeMTRwD3BRK1SArQlL0,3037
|
|
@@ -345,10 +360,15 @@ helm/benchmark/scenarios/chw_care_plan_scenario.py,sha256=BbEjDqa4C5wpdil5jIb1nz
|
|
|
345
360
|
helm/benchmark/scenarios/ci_mcqa_scenario.py,sha256=slZZT74QI3OMQAgT-ybcR_xVcRDoopXw6mMu4iy3XCY,3074
|
|
346
361
|
helm/benchmark/scenarios/civil_comments_scenario.py,sha256=pnZU2U_cYFYOJmlmwTehHU5oLIPx_Yg8Ayxinroh4IQ,4875
|
|
347
362
|
helm/benchmark/scenarios/clear_scenario.py,sha256=yGdPxWO6vY4JHNa4xywtvD-9lOn6s5cr3njpZyFA0D0,6183
|
|
348
|
-
helm/benchmark/scenarios/cleva_scenario.py,sha256=
|
|
363
|
+
helm/benchmark/scenarios/cleva_scenario.py,sha256=n-h2urZ06GUOuAC_60HMwspTTpBFid72Fx8eZGQppdA,57988
|
|
349
364
|
helm/benchmark/scenarios/code_scenario.py,sha256=lSbZWw67ie9osOjXDZukj3EEZGa3L6TrMvTg--IbuxE,12520
|
|
350
365
|
helm/benchmark/scenarios/code_scenario_apps_pinned_file_order.py,sha256=KC-5MQ-d8Nn46aDN4FaPxmd6yk1DtVUmVR-CIZsNCp4,1738
|
|
351
366
|
helm/benchmark/scenarios/code_scenario_helper.py,sha256=TnXAlY-wdAFwIDylFItf0z7HOu93WD6dNThwzZYe330,5904
|
|
367
|
+
helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py,sha256=PK4wtuBXs4cPPwOoGfhBA4J4cGLQYC_MvRWuvWrkrv8,9068
|
|
368
|
+
helm/benchmark/scenarios/codeinsights_correct_code_scenario.py,sha256=7BpcezugYHleSuG8hreHe5oXpm3bxoxQ4RCnx6rjKbU,3734
|
|
369
|
+
helm/benchmark/scenarios/codeinsights_edge_case_scenario.py,sha256=csTwe-mv1f6Tyvnj9uZ0SYuj1GRVvgjzukV28gIhNpk,8703
|
|
370
|
+
helm/benchmark/scenarios/codeinsights_student_coding_scenario.py,sha256=wc5Fefn4jpCw03dQ6WswCztJ8AO5j0Vrn6omcOVUq2k,7409
|
|
371
|
+
helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py,sha256=qX3yckZdMojYhiwvokvEuQpRXOzmN2zmzKjQb96Ljg8,9651
|
|
352
372
|
helm/benchmark/scenarios/commonsense_scenario.py,sha256=yZ6n9aqOi7UWY3q4uTDNc2JRNZxaBZPIp7n_Snt_8g8,9511
|
|
353
373
|
helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py,sha256=gKEwqHDD8KlKmW8z3xAxSIGmALTXrRRPcoDUzbv_IXg,3854
|
|
354
374
|
helm/benchmark/scenarios/copyright_scenario.py,sha256=FHzUYEabj-BTKl90fgq7jSCq5_Yf9cO9MA9djn50B1Q,3697
|
|
@@ -376,6 +396,7 @@ helm/benchmark/scenarios/entity_data_imputation_scenario.py,sha256=4V426oOuexGg5
|
|
|
376
396
|
helm/benchmark/scenarios/entity_matching_scenario.py,sha256=kzzDaoVikL2P7Z-17EkLIVR_W7IHcNVerUts2oXDKLA,7111
|
|
377
397
|
helm/benchmark/scenarios/entity_matching_scenario_fixed_random_state.py,sha256=TklbX7Kx4y-estV-YHUbI5O08q2qCZRrOmX9D3gZS9c,2193
|
|
378
398
|
helm/benchmark/scenarios/ewok_scenario.py,sha256=vrbJg9vakAxE6n-1jURUcwb-ihrsYoY9e32BpnEGDaQ,4684
|
|
399
|
+
helm/benchmark/scenarios/exams_multilingual_scenario.py,sha256=c9zMGGL8EbCeNogTm-88g_5wWUiX1Zr7z_tsyjUq2h0,5404
|
|
379
400
|
helm/benchmark/scenarios/fin_qa_scenario.py,sha256=Dm_kGOivaxiKVhcqFgN8pRPs1eqm2LdBZxWy0yFhFuE,5958
|
|
380
401
|
helm/benchmark/scenarios/financebench_scenario.py,sha256=cHMljdg0_9HA3FbwcwwMt3DR9rxl0jkyFN9jNrUStSE,1956
|
|
381
402
|
helm/benchmark/scenarios/financial_phrasebank_scenario.py,sha256=dMTfI9MRHKXnECsXOIY8xvX6w5vAPEIa6A7TYyIu2Fw,4457
|
|
@@ -387,6 +408,7 @@ helm/benchmark/scenarios/gsm_scenario.py,sha256=QIj0QK5ncF31ES0GUlxbdBk6SIiJJnj5
|
|
|
387
408
|
helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py,sha256=8_ShEuOoEGu7iRE2b0tgi-cfBrCPF9k1L-Pgb__n3Bg,2005
|
|
388
409
|
helm/benchmark/scenarios/harm_bench_scenario.py,sha256=CBo_AfbtHTlvJdsiquP0EDTKApVmDZc7EW0VTENNAfQ,2478
|
|
389
410
|
helm/benchmark/scenarios/headqa_scenario.py,sha256=m6Kqt16JeqA1-OLJvmBPZzhVOVt7O6rbJGAwG9C7FZs,5658
|
|
411
|
+
helm/benchmark/scenarios/healthqa_br_scenario.py,sha256=YneXTfp8V6k8rYCF3BTX6bxN2ASxdG3qrBr7uH_IFWc,3406
|
|
390
412
|
helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py,sha256=iv1khpdiWW0Z7lshyWOhhjRfYFdAU6etN8X5EDEQCrc,1302
|
|
391
413
|
helm/benchmark/scenarios/ice_scenario.py,sha256=NCbeqvpDFIIG7kSCrJrS-Z9S3iG2THZ7HpAqghpi_y4,16725
|
|
392
414
|
helm/benchmark/scenarios/ice_scenario_pinned_file_order.py,sha256=fuirubIdi-rkJMfSd7YoDdBX2q0f5K7GGTN4XVapAUY,1613
|
|
@@ -394,7 +416,8 @@ helm/benchmark/scenarios/ifeval_scenario.py,sha256=SYn9itpFG0tlWSayf6v0P8bRgdtc-
|
|
|
394
416
|
helm/benchmark/scenarios/imdb_ptbr_scenario.py,sha256=laq9UwyvBvZZuo54rf-8SdKTLrMdDHTdGWJ4TdC8Eng,2340
|
|
395
417
|
helm/benchmark/scenarios/imdb_scenario.py,sha256=qHXd-QIXTCBq8rWW3N5I2Rvg6Pz9v1zFhZkwc73w9io,6259
|
|
396
418
|
helm/benchmark/scenarios/imdb_scenario_pinned_file_order.py,sha256=fjW0Gkzg2Y3IAbtYJ3KC7MueWd9U8h0tlcBCqxYmRrM,1621
|
|
397
|
-
helm/benchmark/scenarios/
|
|
419
|
+
helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py,sha256=JRTLaQc3PDpYeX9ewGnBteT9jXeaGbmJ1VzYGT8TsXI,3067
|
|
420
|
+
helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py,sha256=5fJHFonb7Ko7exHFtoUtvHar_7PhK2HjW9uDlU8Ljj0,2872
|
|
398
421
|
helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py,sha256=6z3VlcucrwK2B30artWiSpo-mOTr9tiwYV6Fu8XD0VY,2657
|
|
399
422
|
helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py,sha256=F-gDO6r4GPBJTLirhF5noRaV0edvoIT7tiIDlovBFfE,2253
|
|
400
423
|
helm/benchmark/scenarios/koala_scenario.py,sha256=A5M6SD7Jjg7r9QlbHCtMaydBe-wpOtB6oc6gFXuZ47o,1389
|
|
@@ -408,15 +431,16 @@ helm/benchmark/scenarios/lex_glue_scenario.py,sha256=-3fsSjTXjgRN96Hl4GzDIMB_dlx
|
|
|
408
431
|
helm/benchmark/scenarios/lextreme_scenario.py,sha256=gVTHoMYX6Q_Itt5rOVO5lYmqWfAtuuf63CnKAF8b_ak,20461
|
|
409
432
|
helm/benchmark/scenarios/live_qa_scenario.py,sha256=TnWaOPOcA4U1_8JdahQOUZ9KBj0MpMf4BcK2TDBl3BE,3666
|
|
410
433
|
helm/benchmark/scenarios/lm_entry_scenario.py,sha256=kQTnj5gKJmDxCgynmzQOmghwNySpna7aTY7K7RPD2x4,9109
|
|
434
|
+
helm/benchmark/scenarios/lmkt_scenarios.py,sha256=K51CdOZqMOMOozUmADjrJuNCpUtXVEZwcOeIY-EZrwM,11162
|
|
411
435
|
helm/benchmark/scenarios/lsat_qa_scenario.py,sha256=2VUJ36vHUZp6fZuLfRIuPSsU_K6Z3Im2ums06sZENqo,6153
|
|
412
|
-
helm/benchmark/scenarios/math_scenario.py,sha256=
|
|
436
|
+
helm/benchmark/scenarios/math_scenario.py,sha256=tW-nGKxyDOwOo2siqu1ZzPrCGzw_lFYGK5uiUK0lF7A,14525
|
|
413
437
|
helm/benchmark/scenarios/me_q_sum_scenario.py,sha256=7DOqQmO70BpDeJy_S4fJ5i2UcCH8tunxzjFgTIim9bQ,4062
|
|
414
438
|
helm/benchmark/scenarios/med_dialog_scenario.py,sha256=AE10W1UWhOrgKUnz7e2brKSaQR1WJkQUcPoo4s6n0Fs,7553
|
|
415
439
|
helm/benchmark/scenarios/med_mcqa_scenario.py,sha256=XEipvuIA-QoyZrtlm8nnaPuyZzdDaeTskAhnseD3Q68,5096
|
|
416
440
|
helm/benchmark/scenarios/med_paragraph_simplification_scenario.py,sha256=0Z1JrizLygjd9v_LLFMk8uZ805IWjJPvg-ZvPVhtMm4,7652
|
|
417
441
|
helm/benchmark/scenarios/med_qa_scenario.py,sha256=m0W-FgFi58psLglZyQy_ouMQIDP-2j3aL7uInkdVtms,4478
|
|
418
442
|
helm/benchmark/scenarios/medalign_scenario.py,sha256=mhd8REXpPwxftH48-KKb0ZURJ1mdOlvPRmvN4g4M9Ho,3383
|
|
419
|
-
helm/benchmark/scenarios/medalign_scenario_helper.py,sha256=
|
|
443
|
+
helm/benchmark/scenarios/medalign_scenario_helper.py,sha256=fKXJFVLGnLcZKRBLsbjJA6YA4WqMaQAjkEU-i6YzSTQ,11626
|
|
420
444
|
helm/benchmark/scenarios/medbullets_scenario.py,sha256=8O0UsPWw-ESkrgiuWz4f8gR99jH5-wS5HtCKYwZ1ycs,6713
|
|
421
445
|
helm/benchmark/scenarios/medcalc_bench_scenario.py,sha256=vwmEQZ119tOVeZtl6Zt-nXKwkA8Qt4WRiH2HogIkV0w,5560
|
|
422
446
|
helm/benchmark/scenarios/medec_scenario.py,sha256=Lo7iVkek7C9omJ5LX-C83pA_Q5OrAfdNhJY4rslJWTQ,5270
|
|
@@ -426,17 +450,18 @@ helm/benchmark/scenarios/medication_qa_scenario.py,sha256=StQmfHTYi8pZLP9FMPzyS-
|
|
|
426
450
|
helm/benchmark/scenarios/melt_ir_scenario.py,sha256=d88DEGKVJZCeGnbrXrQZO_W4VJeqW8XNaYc8wIUiJtA,5978
|
|
427
451
|
helm/benchmark/scenarios/melt_knowledge_scenario.py,sha256=FDG4OGYEV6Ac40VC7KAeikzbFKAK2XXFhH1-QUTw8jo,7923
|
|
428
452
|
helm/benchmark/scenarios/melt_lm_scenarios.py,sha256=kSm0lRRixhnXctMprPnzi09PLOmgfs-C7TAW3QI8RmE,8969
|
|
429
|
-
helm/benchmark/scenarios/melt_scenarios.py,sha256=
|
|
453
|
+
helm/benchmark/scenarios/melt_scenarios.py,sha256=_WShDpmPaKrujGbZcazCqleDn0TKDhFg1h-vu3ieS8E,30144
|
|
430
454
|
helm/benchmark/scenarios/melt_srn_scenario.py,sha256=EQSOZIXbfvVWCJMJ4H2e_CiBz6wc8THJndnbK2WwTHM,14674
|
|
431
455
|
helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py,sha256=ptMQWgNn6R-XpAVAAjutSdZg_9ZUqG6fVotzAgeead4,7945
|
|
432
456
|
helm/benchmark/scenarios/melt_translation_scenario.py,sha256=j9YrY60DQHZz4m1MJZaGLzyI6FERlHRx2wy9auyAVB8,5415
|
|
433
457
|
helm/benchmark/scenarios/mental_health_scenario.py,sha256=O1Lfd0MxqawLZLKUDSynaqqbaGHRjDglmePIqepnJI4,4961
|
|
434
|
-
helm/benchmark/scenarios/mimic_bhc_scenario.py,sha256=
|
|
458
|
+
helm/benchmark/scenarios/mimic_bhc_scenario.py,sha256=GSUlvK-NVPYB83emucc1cPj-HgAQVu2aXGuutfXJUHc,4098
|
|
435
459
|
helm/benchmark/scenarios/mimic_rrs_scenario.py,sha256=bxwVWjE_z4I_Nk5eD78g3QAGyjpsNg7DVWpkp8IGWXM,3841
|
|
436
460
|
helm/benchmark/scenarios/mimiciv_billing_code_scenario.py,sha256=tZBUZEaUMZvfSlsU6hcPs-pxQ0kDIL6qebGd7JmpDbk,2699
|
|
437
461
|
helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py,sha256=-OkPMRyB7aO6QBFwoTl6a2rpzcoHeEl84tqz7k9kpCM,2982
|
|
438
462
|
helm/benchmark/scenarios/mmlu_pro_scenario.py,sha256=pwpp0wqNhsGc8v2V11aUyEWbwdkmIm-42N676j1T3Ws,4031
|
|
439
463
|
helm/benchmark/scenarios/mmlu_scenario.py,sha256=_5cX2uI7CxD7K_GvO3MD8CRJLuN4EzS2o_EFvbrfjSU,3855
|
|
464
|
+
helm/benchmark/scenarios/mmmlu_scenario.py,sha256=CyOISLOsXF9IEYGfeqWyYYkWGvrUvGivlWSJ5ttN9qY,2762
|
|
440
465
|
helm/benchmark/scenarios/msmarco_scenario.py,sha256=-l7_rIMQjMWcpTyn6dGotmNJ5XxN_Ze8dEJyv5ftWFA,34050
|
|
441
466
|
helm/benchmark/scenarios/mtsamples_procedures_scenario.py,sha256=13pXjs9lFduM-QL03mpM10hU0iA8Vr2jJG2FVBQdKOI,5577
|
|
442
467
|
helm/benchmark/scenarios/mtsamples_replicate_scenario.py,sha256=RlyWrlI9e5MLsGbkQWpO2WRsIOZJi39xHskOIBypHdo,5399
|
|
@@ -444,7 +469,6 @@ helm/benchmark/scenarios/n2c2_ct_matching_scenario.py,sha256=-Et7hJnQJOGl1U9Xdb5
|
|
|
444
469
|
helm/benchmark/scenarios/narrativeqa_scenario.py,sha256=MiSq0UnUllJxHFU2gO7m4vr_vmulavJxc4ruZhsAt2U,5632
|
|
445
470
|
helm/benchmark/scenarios/natural_qa_scenario.py,sha256=g-fP8L1lXs7zwNVQOc0ZUnbYkCyElQtLVt5fe5dtvSE,12564
|
|
446
471
|
helm/benchmark/scenarios/newsqa_scenario.py,sha256=G25VYaLrV_JyyoT0jpzJ6p4l5qsOydm8rlzTvSptNKQ,7284
|
|
447
|
-
helm/benchmark/scenarios/numeracy_scenario.py,sha256=lgTGzZc81RyL8iB4K67PAHbyYz6BM2ieub8RSFi2aRc,30895
|
|
448
472
|
helm/benchmark/scenarios/oab_exams_scenario.py,sha256=vbjUzQP0zU4ckvMbsk4lh24NddVWbUAtfWmsq1h24_w,2101
|
|
449
473
|
helm/benchmark/scenarios/omni_math_scenario.py,sha256=5qb2cO-Ibb3kDbwYvkzsoU_aOsoKV3ROLgZbi83OyGU,1955
|
|
450
474
|
helm/benchmark/scenarios/open_assistant_scenario.py,sha256=zd8T6eLOlYMZiFyKrRjc-EPwk5_KpbBedAcKDbZ-TdI,5609
|
|
@@ -458,7 +482,7 @@ helm/benchmark/scenarios/real_toxicity_prompts_scenario.py,sha256=zpQthgDi-AyEgO
|
|
|
458
482
|
helm/benchmark/scenarios/ruler_qa_scenario_helper.py,sha256=jgVf1D4eTSxwxQsW0GBou5hfSo2dnlEJvHpVJqk3BxM,6327
|
|
459
483
|
helm/benchmark/scenarios/ruler_qa_scenarios.py,sha256=Dy0INRMzxSiIs9Pm3fa0hYodN-W--WPSv4kcmeQhucM,3270
|
|
460
484
|
helm/benchmark/scenarios/scenario.py,sha256=kSy7tmtFeC6-QSEsBuvlrMTA1PB6fOY9jycMld-vBVM,8592
|
|
461
|
-
helm/benchmark/scenarios/seahelm_scenario.py,sha256=
|
|
485
|
+
helm/benchmark/scenarios/seahelm_scenario.py,sha256=i8SnuYDQtFGFkm686h3_FF9J3vr-Bd9w_jd7h5tV5yY,78152
|
|
462
486
|
helm/benchmark/scenarios/self_instruct_scenario.py,sha256=3Kvi3pLL6eGOEezjoQoGv9c1UxKiRVlFmILKzqst4pI,2309
|
|
463
487
|
helm/benchmark/scenarios/shc_bmt_scenario.py,sha256=wF_sD61IZ4RDznBVQ1HYbGh3Vc2qjbcBuU0jdmp1aD8,2803
|
|
464
488
|
helm/benchmark/scenarios/shc_cdi_scenario.py,sha256=5aVEiRgFCutEWW9yMcJBxEo11FlwW0SiZTaOyXY6ioc,2693
|
|
@@ -480,16 +504,21 @@ helm/benchmark/scenarios/synthetic_efficiency_scenario.py,sha256=OaxEvT1H9VjOjBS
|
|
|
480
504
|
helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py,sha256=pt2Aln_dX1YMSl-9hV1HJmwW90MC3fWwGsMxZg-Q-UY,16391
|
|
481
505
|
helm/benchmark/scenarios/synthetic_reasoning_scenario.py,sha256=7STCSHiHGIQ2aaN_PwDE5jXUJ-qcu8PaS4pC-pbOceE,8410
|
|
482
506
|
helm/benchmark/scenarios/test_air_bench_scenario.py,sha256=9o92CK57xxgPaA9Xt9uJPPie4Cxllzq-KbMt3G35UQ0,1320
|
|
507
|
+
helm/benchmark/scenarios/test_alghafa_scenario.py,sha256=ARQyzjmEpX_FpN2QLnIB7P-ToAeMtE4dqsolzlq8KPQ,1696
|
|
508
|
+
helm/benchmark/scenarios/test_aratrust_scenario.py,sha256=3rsIBfFCAmemNT_IJQ6RZ5liwrHseKGvtGmm2VHoBo0,904
|
|
483
509
|
helm/benchmark/scenarios/test_bigcodebench_scenario.py,sha256=q9FWJsxLJoFaB3PSMLjI_-YyPoZYusOsMPwn6X6NKXw,1304
|
|
510
|
+
helm/benchmark/scenarios/test_bluex_scenario.py,sha256=fLTyMTmSiJ8MoJKYJ2pcE39yXwZm6zv3_oWsQbRbwH0,1930
|
|
484
511
|
helm/benchmark/scenarios/test_commonsense_scenario.py,sha256=V5Mq4cxWqU6j1U3icfIuzcnCZsZO7NTKLQgF0lEpdyc,924
|
|
485
512
|
helm/benchmark/scenarios/test_czech_bank_qa_scenario.py,sha256=bZNLEGu58iHmutGlSp-2uVC2931TO6Rxw7giqFh9RHY,828
|
|
486
513
|
helm/benchmark/scenarios/test_enem_challenge_scenario.py,sha256=XfPkYaSwdGa63ToC_BLuVKTRSldWNBlKsZYK6CFzL3w,2000
|
|
487
514
|
helm/benchmark/scenarios/test_ewok_scenario.py,sha256=WY2vqbHF1120ht4PER0uviKMb2jnoPM3ff4KwvwcU4I,1291
|
|
515
|
+
helm/benchmark/scenarios/test_exams_multilingual_scenario.py,sha256=vHLTcEzo5SkZgy2yXYm1Sex641qkr4HQWmVsOrlCQ_s,1764
|
|
488
516
|
helm/benchmark/scenarios/test_financebench_scenario.py,sha256=EFZLJXXBoyjlTiMQFaQ6MiYkve1lfQDjQWjn4BjqgAQ,1184
|
|
489
517
|
helm/benchmark/scenarios/test_gold_commodity_news_scenario.py,sha256=RO0NcIkJuujdPVO6tDygmDxhZ5YlmIIYlhwx9LeXlQs,731
|
|
490
518
|
helm/benchmark/scenarios/test_gpqa_scenario.py,sha256=QQJ_-nmujZBSmhBhikRUWznFJ4jHPbGDnUVCP_17poI,1884
|
|
491
519
|
helm/benchmark/scenarios/test_grammar.py,sha256=sPlA36sHpThbXgnGlXyOuqHfDPe2epIafmzIeL0nkoU,1364
|
|
492
520
|
helm/benchmark/scenarios/test_gsm_scenario.py,sha256=I-Sl8Sg8kmFd7u0zZbwbNmeFV1mQLuOHoQ1cQDDwovs,1123
|
|
521
|
+
helm/benchmark/scenarios/test_healtha_br_scenario.py,sha256=YmhXK24MuTPyLFCkXXI7IlwwiiJxytAbONOEh6wSJWI,1935
|
|
493
522
|
helm/benchmark/scenarios/test_ifeval_scenario.py,sha256=h3CBg13VKwyb1Xaddwg2GWOzAXz4stK5lXdQtHenAw0,1646
|
|
494
523
|
helm/benchmark/scenarios/test_imdb_ptbr_scenario.py,sha256=8kfCkMRUMU7N4WIrWawFDoxaLB2iTvQ-sPj4RoE2Osg,887
|
|
495
524
|
helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py,sha256=qZE-fi1tdNOybpvEQZJUpq9fHsyrPW7NYqj_RTwsv2A,746
|
|
@@ -529,7 +558,7 @@ helm/benchmark/scenarios/audio_language/ami_scenario.py,sha256=SH4r2YyW2kQ8r6-nS
|
|
|
529
558
|
helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py,sha256=kiUngeoAVOXfuKgqo96RgK_volpJUPFziu-cYDqT8WM,2685
|
|
530
559
|
helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py,sha256=oLOeBGjQCa3hpzjhX2bNS6637VD9VF1KbSJri9BJ3PI,2698
|
|
531
560
|
helm/benchmark/scenarios/audio_language/audiocaps_scenario.py,sha256=PkVqQM1zX6ecXYk-Pz4YWlST3Hnla8NyeBHbuHvhSlY,2447
|
|
532
|
-
helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py,sha256=
|
|
561
|
+
helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py,sha256=4X_C68yoMKRUC3SuNHYK4_fcboOz-9gbjhbUK1g3VVY,6725
|
|
533
562
|
helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py,sha256=CbcoGPW65xXRRkrDthotDfoVn51ozANG9s3LCsjxkLA,3706
|
|
534
563
|
helm/benchmark/scenarios/audio_language/corebench_scenario.py,sha256=R8RAUtdRAQcUAN0PFXybQUekdQFNtT8hXtoR1A1hMGk,3155
|
|
535
564
|
helm/benchmark/scenarios/audio_language/covost2_scenario.py,sha256=3YiaQXuLGfths2XswRw30Vf26bO9jEW_kAj5wZQSOSI,5119
|
|
@@ -540,14 +569,15 @@ helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py,sha256=
|
|
|
540
569
|
helm/benchmark/scenarios/audio_language/librispeech_scenario.py,sha256=ogMXxnyTG05tCyJ2d4hiuiVsbQvf4TbndksYeaJXl1s,3475
|
|
541
570
|
helm/benchmark/scenarios/audio_language/meld_audio_scenario.py,sha256=j1JFX0jGfcqX0QZBKSjYjDWo1jHJbW5Q9jHyOs6Kgls,4903
|
|
542
571
|
helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py,sha256=Jo_-3zC226iKGT-ac0JNMhlEccazMMiHbomx_qU0rxg,3098
|
|
543
|
-
helm/benchmark/scenarios/audio_language/mustard_scenario.py,sha256=
|
|
572
|
+
helm/benchmark/scenarios/audio_language/mustard_scenario.py,sha256=7YHgfSpua5OdEGPlmxoufwGXQjvGJMTlEWFiJ_ap5ME,6131
|
|
544
573
|
helm/benchmark/scenarios/audio_language/mutox_scenario.py,sha256=bDCQbhsRDR6iQGNlCu_35kjmjGjuzjOIoraSncfOlOY,10277
|
|
545
574
|
helm/benchmark/scenarios/audio_language/parade_scenario.py,sha256=UuOa5cSrHh5n3VF_SuJp4cy1MxlI3uEKHLrNEhGuyuw,4186
|
|
546
575
|
helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py,sha256=oN4vBkElVzjccaEK2JFqoXMCGFTTHD0gcYwSDhvHTpQ,5438
|
|
547
|
-
helm/benchmark/scenarios/audio_language/
|
|
548
|
-
helm/benchmark/scenarios/audio_language/
|
|
549
|
-
helm/benchmark/scenarios/audio_language/
|
|
550
|
-
helm/benchmark/scenarios/audio_language/
|
|
576
|
+
helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py,sha256=2qzPYfn0YYzzOtffD50kQu_ePpFJj_sSW7Bq8ZS6M2g,3559
|
|
577
|
+
helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py,sha256=TEyfAsas3ihN4b4bpGkbK_M_uDt39fVrL5k8vl2Cdyw,3389
|
|
578
|
+
helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py,sha256=qPOP6eIEwxPKu6q5EzcrRmhMxMUQk5F9iq8zdJ1Ccrc,4819
|
|
579
|
+
helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py,sha256=CGteDFCd31vbu_eg5oal1cnfjQ2J0Ty3C2HYyBLhI5M,4186
|
|
580
|
+
helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py,sha256=sL93Q2ERzYiWcTOFEyvjUNbX0BgPdsyHKt6eTr51-Kc,5177
|
|
551
581
|
helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py,sha256=wkKyTCtx4isQSMufap_6DsNdGkHi7L8FQ2p7n58kKYI,3124
|
|
552
582
|
helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py,sha256=4M_gTWs4CoJ1Ce9dDFBTAe9dzSovpsve_sN1eco2V2A,3155
|
|
553
583
|
helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py,sha256=L04ee5bM5E0UNNmkwEzVwug4HJXQoIcVjujPgxtU2h0,4366
|
|
@@ -614,11 +644,12 @@ helm/benchmark/scenarios/vision_language/image2struct/webpage/jekyll_server.py,s
|
|
|
614
644
|
helm/benchmark/scenarios/vision_language/image2struct/webpage/utils.py,sha256=UYe3PnxCKBYEbZTTEzdIoTY9gW7ZZAWmVISRIdItD-A,940
|
|
615
645
|
helm/benchmark/static/contamination.yaml,sha256=rAfh1DqwyUcDtyzHPQ2QiUK5eY7QfuuRtBXpZMn4TeA,3171
|
|
616
646
|
helm/benchmark/static/schema_air_bench.yaml,sha256=LapSMj3Ecl1Gp9XIwVCYfrerqS93GNErvp6oDnBCtgw,142378
|
|
647
|
+
helm/benchmark/static/schema_arabic.yaml,sha256=PoudK_u7hV2lalGRvYDI5b89tSfch6Dx_bn5681Um_c,7688
|
|
617
648
|
helm/benchmark/static/schema_audio.yaml,sha256=lVslZX7JmFo0ZgLU4n6amrs9DK8y43Ux0I9QyDUG-14,29119
|
|
618
649
|
helm/benchmark/static/schema_autobencher.yaml,sha256=yb-NkF5w5R2YOg7RIsadNHJ_5G7lG1gbcDVq_25luEk,5716
|
|
619
650
|
helm/benchmark/static/schema_call_center.yaml,sha256=i30aFzWqdOJRyAHN8vAzyHEX1v95DEK0TI1SMKTN4TE,9106
|
|
620
651
|
helm/benchmark/static/schema_capabilities.yaml,sha256=HHy0aafhOaqL0C4TZw2mMt1Dce2_wuN062ORNZIbwYg,8733
|
|
621
|
-
helm/benchmark/static/schema_classic.yaml,sha256=
|
|
652
|
+
helm/benchmark/static/schema_classic.yaml,sha256=pRkfy6jrdslx5onmeCUdkRi9y2DQrcPIjVyZLJ7uKCs,104147
|
|
622
653
|
helm/benchmark/static/schema_cleva.yaml,sha256=TDh-zcCzzTTs7bu0IWlY5dXYaTFhxly8sJIBGQdBvug,25401
|
|
623
654
|
helm/benchmark/static/schema_czech_bank.yaml,sha256=jkTRQVmmbKkbB0zPH9AtYh6Lt33ymMInRBQnHE5lIOo,5462
|
|
624
655
|
helm/benchmark/static/schema_decodingtrust.yaml,sha256=2VPxzcyKYea7mx-qmswyVRjPfVatjVH4Rs3OU82mgII,15670
|
|
@@ -631,7 +662,7 @@ helm/benchmark/static/schema_image2struct.yaml,sha256=cD1X99YcPI8BMAnNfDmXlM-FN0
|
|
|
631
662
|
helm/benchmark/static/schema_instruction_following.yaml,sha256=mYLpMv-iNtsmrv9ewfN9ceDOBBg8nSxOWfc6ByATmIk,6056
|
|
632
663
|
helm/benchmark/static/schema_legal.yaml,sha256=RpoFOuVSIowNgxlPn3UMfJC-68RFr3CGDciUGLPfVqc,28806
|
|
633
664
|
helm/benchmark/static/schema_lite.yaml,sha256=rFSoG7zGPNOtKkJyGgOViWf5WJbMiJMAXrgmqCAi9X4,36611
|
|
634
|
-
helm/benchmark/static/schema_long_context.yaml,sha256=
|
|
665
|
+
helm/benchmark/static/schema_long_context.yaml,sha256=3YjlNkQBgp4hS4PE1EjZvjpvX9v4QjaBPALtOYLpPCs,11486
|
|
635
666
|
helm/benchmark/static/schema_medhelm.yaml,sha256=84BrIengbq0m42ICWvyEWoYtdERR-8J8-8QbPOqUzvA,50747
|
|
636
667
|
helm/benchmark/static/schema_melt.yaml,sha256=mmPqwDa26DVZXsRJkmKQSyD0OStvjlxaMoSPM25SpD4,47494
|
|
637
668
|
helm/benchmark/static/schema_mmlu.yaml,sha256=KI3XnzEwBRpzfYGjP77yKL-hBklEg72D3vL0kVl1BeI,49666
|
|
@@ -649,15 +680,15 @@ helm/benchmark/static/schema_vhelm.yaml,sha256=0slYep2eepUefgtK_m4iSS785sHdJzljm
|
|
|
649
680
|
helm/benchmark/static/schema_vhelm_lite.yaml,sha256=4I68Em9q5wW8sFzj5GCJz8m49fBEuMyVmSZM0-wbfOk,4024
|
|
650
681
|
helm/benchmark/static/schema_video.yaml,sha256=FkpI5Slc4w-ty4hns82ArXIvTdqppWDnkJSpIp74QN4,9713
|
|
651
682
|
helm/benchmark/static_build/config.js,sha256=o98g6QSly1NAfqhYWbU4lEoZB4LEpIrePZtmimiuoXc,165
|
|
652
|
-
helm/benchmark/static_build/index.html,sha256=
|
|
683
|
+
helm/benchmark/static_build/index.html,sha256=hlkvPO8WVcvIJXentHj3Kn5Cd3QwOoi7OqRAou0pRVQ,1178
|
|
653
684
|
helm/benchmark/static_build/assets/air-overview-d2e6c49f.png,sha256=0ubEn4J0T51-jx7IlwjaEGSrofZWlW_e67MJw47Ujzg,733055
|
|
654
685
|
helm/benchmark/static_build/assets/crfm-logo-74391ab8.png,sha256=dDkauL_wJR_Luu7L7pltphS3a9HSLjDkpVLa6C9vcA4,62712
|
|
655
686
|
helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png,sha256=Pl46pKbC_TU3L6kZQ_3G-0wTseluAhIYwb3EqpdQAjQ,1344452
|
|
656
687
|
helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png,sha256=LtVAC4OgcWgMAob53rTrf7cRDu-O0z85ZOGGj9wR9hw,86133
|
|
657
688
|
helm/benchmark/static_build/assets/helm-safety-2907a7b6.png,sha256=KQentq_1e3uGwiWMViAPxHu2XZ60gqFgovP3UWTyMmw,72312
|
|
658
689
|
helm/benchmark/static_build/assets/helmhero-28e90f4d.png,sha256=KOkPTf-q28PdvGOBp1G5O4q1eWUJjuij3z2h_SUUf8s,55314
|
|
659
|
-
helm/benchmark/static_build/assets/index-94295e78.js,sha256=yvo6hRwNE6Ns7NxJHOdVfUOhc8HsW8eZVadLMW0Wn0w,124386
|
|
660
690
|
helm/benchmark/static_build/assets/index-b9779128.css,sha256=uXeRKCUzQAC32ofNoaK3-WC7kRWR--KnR6--1m9NdQA,491471
|
|
691
|
+
helm/benchmark/static_build/assets/index-e439d5e1.js,sha256=t7AnJSBjGs43kxIev2uLVumaInyBUxad9KVtvA86oUw,124597
|
|
661
692
|
helm/benchmark/static_build/assets/medhelm-overview-eac29843.png,sha256=6sKYQ79cN07-cUsnt-JPsdoVwUBWu5KxOaHWSdwjdgA,284408
|
|
662
693
|
helm/benchmark/static_build/assets/medhelm-v1-overview-3ddfcd65.png,sha256=Pd_NZfAf1ZeU2BIGx9zNT6WmypZNP2bk5z5AxDkbwoU,270625
|
|
663
694
|
helm/benchmark/static_build/assets/overview-74aea3d8.png,sha256=dK6j2Nn3j9O-FMUIVRT5HGBpR_GL78vrKi8oHdG1eaI,74685
|
|
@@ -694,7 +725,7 @@ helm/benchmark/window_services/window_service.py,sha256=y6BthPY1V-ugmYfaJElm5Wfy
|
|
|
694
725
|
helm/benchmark/window_services/window_service_factory.py,sha256=T55F0Y2jiOYxUHHZxT4YX4fFXY5gfFhn56zIwUBhc7s,3423
|
|
695
726
|
helm/benchmark/window_services/yalm_window_service.py,sha256=EwwCoMpr9WVLhCI7OI_7tmZHQfTUwn9FFWjbhIBFRfA,1089
|
|
696
727
|
helm/benchmark/window_services/image_generation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
697
|
-
helm/benchmark/window_services/image_generation/clip_window_service.py,sha256=
|
|
728
|
+
helm/benchmark/window_services/image_generation/clip_window_service.py,sha256=bhCZXzQDm2fEDKEslWDzkwPihQgmZS0DLVo__Ll9aLI,605
|
|
698
729
|
helm/benchmark/window_services/image_generation/lexica_search_window_service.py,sha256=uDCUclHvo8toxSTMztK3zG7Eb-hjueobGQaBqPqVJlk,454
|
|
699
730
|
helm/benchmark/window_services/image_generation/openai_dalle_window_service.py,sha256=8U2qDrUB1QJHRy5STV5FywkeVm6qfNOaeVBkMQhyMGc,453
|
|
700
731
|
helm/benchmark/window_services/image_generation/test_clip_window_service.py,sha256=domn2MRduHVAdruSUuGPDIGKyDrh-gFxW-fZaBYR7cg,1430
|
|
@@ -717,7 +748,7 @@ helm/clients/google_client.py,sha256=mIaUzK7GHCa9pqK1BEVhdt6dZsJfHv1Qdsf3I0Ayq8A
|
|
|
717
748
|
helm/clients/google_translate_client.py,sha256=TgiQEscjOae58Ptgp9f4n0LXUtl1Jf6v9BI-Z1_wcuw,1304
|
|
718
749
|
helm/clients/grok_client.py,sha256=SbVB6AduTwfElzUgEMnQW2kQUFVTCv4TpPPJvElQEe0,1127
|
|
719
750
|
helm/clients/http_model_client.py,sha256=_F3_y2UWqbzESQdzV0FMEsECIKjporVSAW6iUQhJ35c,2818
|
|
720
|
-
helm/clients/huggingface_client.py,sha256=
|
|
751
|
+
helm/clients/huggingface_client.py,sha256=oWR4yNFk28nrnB3IoznrhcEuU0pZkNywP0E82z1-NGM,17671
|
|
721
752
|
helm/clients/huggingface_pipeline_client.py,sha256=ivFTMNHBwwIUjkeOHkl-veZi5nNAjtnkYvneRFWs-6Q,6154
|
|
722
753
|
helm/clients/ibm_client.py,sha256=4W4fbjnDNjXrP4gVwSfBHPus0QcqFOQzFvfaST1BE1Y,9701
|
|
723
754
|
helm/clients/lit_gpt_client.py,sha256=pgLfSvusNpdj8F5DVxzQdHxTDRNX4RVt6unegao803U,6229
|
|
@@ -727,8 +758,8 @@ helm/clients/mistral_client.py,sha256=ceM8KLAcniAqK1BNVdUGzqy4av2SEEau6PVmPivxc0
|
|
|
727
758
|
helm/clients/moderation_api_client.py,sha256=I5pYWRb2MmcLDYrScnC3P5N7OUFzQiVQ828_hf7zjM4,4719
|
|
728
759
|
helm/clients/nvidia_nim_client.py,sha256=Z1UAqR2jHacIO_QGqQl1JUZ_82JiSPstBOtj6xURmQk,902
|
|
729
760
|
helm/clients/open_lm_client.py,sha256=qFgYqlV_3UiW8WJKz66lLqRqg2jt1qtJ1bHMRAtBn40,1749
|
|
730
|
-
helm/clients/openai_client.py,sha256=
|
|
731
|
-
helm/clients/openai_responses_client.py,sha256=
|
|
761
|
+
helm/clients/openai_client.py,sha256=prSASL8IE3XO0_CaAuZ45iYSITAMQIwtD2q7UqM4qjA,27803
|
|
762
|
+
helm/clients/openai_responses_client.py,sha256=aixsZwO_swP4dhOhJPe1ZcOIav3rxmovPGY0Ug6s5ZA,7308
|
|
732
763
|
helm/clients/palmyra_client.py,sha256=4AaZcV2tPHU4HJ9FWSkOY8_C9ndEckH3PH715QxJQ8E,7086
|
|
733
764
|
helm/clients/perspective_api_client.py,sha256=o_1FFTCrTny6AZ4EJTstX1H9t8SQSQ8dvhi321RTcL4,6105
|
|
734
765
|
helm/clients/reka_client.py,sha256=hA0tq3Hc9669q2sYa4Jr5yWy2NAbvoFDnVqQ6vds62w,8334
|
|
@@ -741,20 +772,21 @@ helm/clients/stanfordhealthcare_openai_client.py,sha256=Qyl8voGz1hJPqT6g4PunMuN9
|
|
|
741
772
|
helm/clients/stanfordhealthcare_shc_openai_client.py,sha256=V7K4KZaSjIiE0FkoY4qy6ifJ8pUiNa3vBcWiDsIwXFI,1343
|
|
742
773
|
helm/clients/test_auto_client.py,sha256=bc-rsMJ8JM0MFnQ4B48hBJ1jL3RtRyVvmPwOgzF2mF8,3155
|
|
743
774
|
helm/clients/test_client.py,sha256=T27UsIPWsbE1JK_8DN_DW9LkEcIGRbgDjio14YOIAb0,3854
|
|
744
|
-
helm/clients/test_huggingface_client.py,sha256=
|
|
775
|
+
helm/clients/test_huggingface_client.py,sha256=8Shzrf1Pad1UsiUAdeOSqsTPQaay0CrWXmdNeIfrJ2Y,3418
|
|
745
776
|
helm/clients/test_simple_client.py,sha256=G0JRQX69ypQN2VxhlNQXs5u2Tdtkcl_aeHqudDUVKi4,702
|
|
746
777
|
helm/clients/test_together_client.py,sha256=kyBLu-2i4EJyuJm5ft0yg8W-H1IqmULRXggEbChuxdo,6178
|
|
747
|
-
helm/clients/together_client.py,sha256=
|
|
778
|
+
helm/clients/together_client.py,sha256=ByImeitpWRhXpZ9U6c0Kol1D8X7Fxno5xgo6D7sZYOY,24201
|
|
748
779
|
helm/clients/toxicity_classifier_client.py,sha256=AI_FizxMurubTIyeceRdkixSnhWQbcD-oEEONj5ve7o,464
|
|
749
780
|
helm/clients/upstage_client.py,sha256=iSL1G8G3jWSbrpacz4I0l6Lwc5T01fsLR-wZzF39ftM,679
|
|
750
|
-
helm/clients/vertexai_client.py,sha256=
|
|
751
|
-
helm/clients/vllm_client.py,sha256=
|
|
781
|
+
helm/clients/vertexai_client.py,sha256=AxZRpZTRrzxwPs2xwKTgHH0eh7WEmHSS1ArTZwI_q3E,23268
|
|
782
|
+
helm/clients/vllm_client.py,sha256=xmXf35WX2oOZhpQnRxeooXGshENySOHZCUQ1E4pbQbA,2647
|
|
783
|
+
helm/clients/vllm_granite_thinking_client.py,sha256=fds2i8LUG78OJYke1uYdDy6XRFqE3rZgSornFjzu4Sk,2172
|
|
752
784
|
helm/clients/writer_client.py,sha256=flKLeMbFkyGfNmv1ozZGU4dxNy-QF5bFJF0mGHqpU3c,4467
|
|
753
785
|
helm/clients/yi_client.py,sha256=nC60d2HiUL2W59FTne9tWmZ9bGGY1OvI7Ob3Ng4wSPE,750
|
|
754
786
|
helm/clients/audio_language/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
755
787
|
helm/clients/audio_language/diva_llama_client.py,sha256=Bvcf4wE7yMZlqETgKEMtCug8-2fQI8QCDdaGWSeQ2X8,4864
|
|
756
788
|
helm/clients/audio_language/llama_omni_client.py,sha256=OCak716q97uEk9CBXQqnmUsbLFR-dddMzg5eyIZ4gzE,8718
|
|
757
|
-
helm/clients/audio_language/qwen2_5_omni_client.py,sha256=
|
|
789
|
+
helm/clients/audio_language/qwen2_5_omni_client.py,sha256=ftAVtOG0azvRQEcFjkSSBMU6SDk9Bi8WIks6o6UCbKQ,9684
|
|
758
790
|
helm/clients/audio_language/qwen2_audiolm_client.py,sha256=s9eH8fnVgw5xV39b_8AGt6IyNN3q9Uhcx6HZVxt7TM8,8981
|
|
759
791
|
helm/clients/audio_language/qwen_audiolm_client.py,sha256=RvYweXANEyzhHYDx38H10F0ZEFaL8kj7n7TZ-UrRmZs,6338
|
|
760
792
|
helm/clients/audio_language/test.py,sha256=FrKpirOwJW1__E2egq4VPgsTrgiSHZHBwfUCvxNjC0o,1969
|
|
@@ -840,11 +872,11 @@ helm/common/codec.py,sha256=gTh6AwIQ0Bbul_QSnIO7eItwMZmYtnkIrG1jkc4GOL4,7100
|
|
|
840
872
|
helm/common/concurrency.py,sha256=8THtHlCtXo5c8iCuz_UcBBdzZX6aiEALLc4u0M4SYL0,856
|
|
841
873
|
helm/common/context.py,sha256=0U5KNNKLHiiqjb8JVq03mninagEp9zTzFKP0He8o7A8,2788
|
|
842
874
|
helm/common/credentials_utils.py,sha256=BX_P6wUpLKA7Bg3Dztm7jVI2j4ls7H-h38UbmGMBt3A,1101
|
|
843
|
-
helm/common/critique_request.py,sha256=
|
|
875
|
+
helm/common/critique_request.py,sha256=DZhJ_sY2IMluOxz-FeHvuEkA2Ujsx65HXT__7T3UxGk,3005
|
|
844
876
|
helm/common/file_upload_request.py,sha256=OZeAW1_zsiNdXnWDwNNvhPs0b48TUmW_e4kzzCYmyiY,543
|
|
845
877
|
helm/common/general.py,sha256=TcdPXn_bgPFvXtFP2lJhncz4Q8SdTXnKOinHOTBsegw,12027
|
|
846
878
|
helm/common/gpu_utils.py,sha256=pmLq6ipYNLEm28VxxSNeZuVt-gAw-WnYmBvxP1P1p6M,480
|
|
847
|
-
helm/common/hierarchical_logger.py,sha256=
|
|
879
|
+
helm/common/hierarchical_logger.py,sha256=qIbhwh-dlCcnYG10qTSMxIMM7_Q9VJj8ymDqnWlseuo,6151
|
|
848
880
|
helm/common/image_generation_parameters.py,sha256=nsbuk_-BlRMK6IwP5y6BnTXbTRTOcvZ6uLblL5VHLOo,916
|
|
849
881
|
helm/common/images_utils.py,sha256=8BsN0fd8pc0rh_TSDvippWhTfwmJJXKNF2zqKLB8cps,3372
|
|
850
882
|
helm/common/key_value_store.py,sha256=D9ZBORzZncf3zHQOP4AuNbQnV8cZpO_kqHY1mDRugqQ,3174
|
|
@@ -854,7 +886,7 @@ helm/common/moderations_api_request.py,sha256=3xTsErSsCr2PHD2jpdV1JglHaYHwP2Yqu2
|
|
|
854
886
|
helm/common/mongo_key_value_store.py,sha256=G0TIWQcvwMjyXh4TnN6xJ462HKHUAZtQJJYQOrHK-K8,3887
|
|
855
887
|
helm/common/multimodal_request_utils.py,sha256=n6HgTyHNqfGmU9qmVK-wxQzrkPZ5Wdh-lO_y_ln6VYc,2184
|
|
856
888
|
helm/common/nudity_check_request.py,sha256=VMsujI_RBy5u_cGEk0teE4KyX1dL2Zt3Pb4U6LpBdSY,728
|
|
857
|
-
helm/common/object_spec.py,sha256=
|
|
889
|
+
helm/common/object_spec.py,sha256=sKcEdggqRa3a8TovHAS4lf1LaahOFInvMl5DUF4tE6c,5186
|
|
858
890
|
helm/common/optional_dependencies.py,sha256=Qam3QCHff8tuXbS-fCw-MVe-pK18gSvHw-uQoXXxT7M,616
|
|
859
891
|
helm/common/perspective_api_request.py,sha256=WAVwtajNVmi5XJNsPcorGEAVrqkpPSk-Kd3b0hJghbA,2427
|
|
860
892
|
helm/common/reeval_parameters.py,sha256=exaEucXnSI8a076uq_qhO3CTBztMMRoRzL_7v1N4adE,300
|
|
@@ -864,6 +896,7 @@ helm/common/response_format.py,sha256=wIptA8FydZoRjMvO5SFIplgDXhwpZvZmFI-Bi-7mcG
|
|
|
864
896
|
helm/common/test_cache.py,sha256=j19p-qzv_98X_TMW4b39ZHwSJ-MX3p91PrkYumarS6Y,4870
|
|
865
897
|
helm/common/test_codec.py,sha256=igL--k-2DwAy0eoMr8D9Xs8MOjBoT0LutbMPzDlTNkM,5885
|
|
866
898
|
helm/common/test_general.py,sha256=c8Lh0mK8I-SfcMprq909B6zWRBxSBngq2nNL1L6-cYA,1788
|
|
899
|
+
helm/common/test_logging.py,sha256=tkb_QDPkKBfaEQ5Y8Xip9PgMYhqOFakcENqyzO5Mj2o,2681
|
|
867
900
|
helm/common/test_media_object.py,sha256=SUWLfms_vkXNivRYM0ZT8AI3_2ru6GON5l-Hb-lk-t0,1661
|
|
868
901
|
helm/common/tokenization_request.py,sha256=NND9ESiiDE0H8QRNpfHVjXS7MQfKKIwtVRKDIjPnnJM,3344
|
|
869
902
|
helm/common/file_caches/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -871,15 +904,15 @@ helm/common/file_caches/file_cache.py,sha256=QfF1hlF8FQ-rcPn9Zyl6L0dOCokvYgd-dFq
|
|
|
871
904
|
helm/common/file_caches/local_file_cache.py,sha256=NiXbat1BBGl5P27oERqSLFfhIHpYqA1IQrvE_N1sWR8,1944
|
|
872
905
|
helm/common/file_caches/test_local_file_cache.py,sha256=ANb01ctUV-J4i1ab3l4uhg9Ce54U_56xq9Hayjt1WhQ,686
|
|
873
906
|
helm/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
874
|
-
helm/config/model_deployments.yaml,sha256=
|
|
875
|
-
helm/config/model_metadata.yaml,sha256=
|
|
876
|
-
helm/config/tokenizer_configs.yaml,sha256=
|
|
907
|
+
helm/config/model_deployments.yaml,sha256=sB3cV6io0NzUQXuKlA49-H3UzOEvWpFDP_MZ30gH0I0,171682
|
|
908
|
+
helm/config/model_metadata.yaml,sha256=0Ps6WlsgElxOpCHVGiWu7QfS0o3Ls4zi1iuwC8PTUgE,269972
|
|
909
|
+
helm/config/tokenizer_configs.yaml,sha256=Vq6MY2nplhYgiyLR98xCXBJWQgEpm64yenrskmkm2NI,40415
|
|
877
910
|
helm/proxy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
878
911
|
helm/proxy/accounts.py,sha256=gd5cKhKeqklf_cXCAISl65AUvZeD6afBNrs6WK3IBvQ,14764
|
|
879
|
-
helm/proxy/cli.py,sha256=
|
|
912
|
+
helm/proxy/cli.py,sha256=kEDoHpisFO0EJ0Wfm1FLpJdP9sXk9j8WCILEq42RKb0,8317
|
|
880
913
|
helm/proxy/example_queries.py,sha256=EB2vVpAryOUAFiLrwsMiFz0zGl_UAQ8TJ9SkWngvsu4,4389
|
|
881
914
|
helm/proxy/query.py,sha256=eftbiUICMh8QIHVs-7cLtv_rDXKeKdRPmwjLMu0TDxQ,645
|
|
882
|
-
helm/proxy/retry.py,sha256=
|
|
915
|
+
helm/proxy/retry.py,sha256=o64BZsW2vwu2iewRA18wdsru2xC3eNBQ7WUw3IjC_5g,3698
|
|
883
916
|
helm/proxy/server.py,sha256=Q4Mzts8mketktGVJ5AoOEA-_SGCue5QeOlK8dqPUuHI,10853
|
|
884
917
|
helm/proxy/test_accounts.py,sha256=Vs1iOzTPN29LosDAAEs6IagQ3PccvutrJTlR1qNIcj0,1146
|
|
885
918
|
helm/proxy/test_retry.py,sha256=db0owyGTThmIMhYWU_Eh1U-AJvQ-Wa9j_kRmC9DNjOA,1059
|
|
@@ -914,7 +947,7 @@ helm/tokenizers/aleph_alpha_tokenizer.py,sha256=Ofc5thTfW_eb5ztiU-y_0p6e2PIGbHMb
|
|
|
914
947
|
helm/tokenizers/auto_tokenizer.py,sha256=Of-T-CFOhLAjjU45T1hnrEPG_k_hzPufuDE7FRAcSN8,4251
|
|
915
948
|
helm/tokenizers/caching_tokenizer.py,sha256=BwcyVzG7vy3R2O0UgbNxNP2nN4wBnsvpG_9mXQuDYfw,7300
|
|
916
949
|
helm/tokenizers/cohere_tokenizer.py,sha256=6WwHIt7SsICmYR2QQpwDJ7pfNF8VWrFHFxF5Kynq6aY,2116
|
|
917
|
-
helm/tokenizers/grok_tokenizer.py,sha256=
|
|
950
|
+
helm/tokenizers/grok_tokenizer.py,sha256=t_cl1BnjRNCW24mU3Z6eAMhh-86FnCcSo-jB2AhvlL4,2142
|
|
918
951
|
helm/tokenizers/http_model_tokenizer.py,sha256=J5Myg6JVDNgHMN7XOHwGV3WrhilUZ9Sw_FrgO4frYuY,3124
|
|
919
952
|
helm/tokenizers/huggingface_tokenizer.py,sha256=P2ri4n-SUWB9ShMlxlJ9kO-mPmbSTizMGwAf41JE5ds,8734
|
|
920
953
|
helm/tokenizers/lit_gpt_tokenizer.py,sha256=0c6KDeLNHPd6h27SXQvkUfmrCSLYa1kQY1GqCHVfhvw,1675
|
|
@@ -934,8 +967,8 @@ helm/tokenizers/yalm_tokenizer_data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQ
|
|
|
934
967
|
helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py,sha256=1ZcPL3srfk031LmA8bEdPcIraAPnHGiYi_CqTiJSTlc,904
|
|
935
968
|
helm/tokenizers/yalm_tokenizer_data/voc_100b.sp,sha256=LmPD0_OIOXi8dWuNjXUYOSPhf8kPp2xhvK-g3bXcwrQ,2815034
|
|
936
969
|
helm/tokenizers/yalm_tokenizer_data/yalm_tokenizer.py,sha256=kH5Qig1_6r_sKbAHinX7C83tqBUoTwbe-gGZCbGVkko,6389
|
|
937
|
-
crfm_helm-0.5.
|
|
938
|
-
crfm_helm-0.5.
|
|
939
|
-
crfm_helm-0.5.
|
|
940
|
-
crfm_helm-0.5.
|
|
941
|
-
crfm_helm-0.5.
|
|
970
|
+
crfm_helm-0.5.7.dist-info/METADATA,sha256=TMyCY6K4C2Z3wO2Jh5XVDq-hHQ1xxCArIm31BUeGbgM,23548
|
|
971
|
+
crfm_helm-0.5.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
972
|
+
crfm_helm-0.5.7.dist-info/entry_points.txt,sha256=AvH9soAH3uey9xffisWewd0yrmPWGASC036jHd1SFyg,300
|
|
973
|
+
crfm_helm-0.5.7.dist-info/top_level.txt,sha256=s9yl-XmuTId6n_W_xRjCS99MHTwPXOlkKxmTr8xZUNY,5
|
|
974
|
+
crfm_helm-0.5.7.dist-info/RECORD,,
|
|
@@ -37,7 +37,7 @@ class AIRBench2024Annotator(Annotator):
|
|
|
37
37
|
)
|
|
38
38
|
self._category_id_to_judge_prompt = {row["cate-idx"]: row["judge_prompt"] for row in dataset}
|
|
39
39
|
# Regex pattern is lenient to allow for typos e.g. extra whitespace
|
|
40
|
-
self._pattern = re.compile("##\s*short_reasoning\s*:(.*)##\s*the_score\s*:(.*)", re.DOTALL)
|
|
40
|
+
self._pattern = re.compile(r"##\s*short_reasoning\s*:(.*)##\s*the_score\s*:(.*)", re.DOTALL)
|
|
41
41
|
self._model = model or self._DEFAULT_MODEL
|
|
42
42
|
self._model_deployment = model_deployment or self._DEFAULT_MODEL_DEPLOYMENT
|
|
43
43
|
|
|
@@ -50,7 +50,7 @@ class LiveQAAnnotator(Annotator):
|
|
|
50
50
|
cache_dir = os.path.join(file_storage_path, "data")
|
|
51
51
|
ensure_directory_exists(cache_dir)
|
|
52
52
|
# Regex pattern is lenient to allow for typos e.g. extra whitespace
|
|
53
|
-
self._pattern = re.compile("##\s*short_reasoning\s*:(.*)##\s*the_score\s*:(.*)", re.DOTALL)
|
|
53
|
+
self._pattern = re.compile(r"##\s*short_reasoning\s*:(.*)##\s*the_score\s*:(.*)", re.DOTALL)
|
|
54
54
|
|
|
55
55
|
def annotate(self, request_state: RequestState) -> Any:
|
|
56
56
|
assert request_state.result
|