crfm-helm 0.5.5__py3-none-any.whl → 0.5.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic.
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/METADATA +27 -13
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/RECORD +203 -156
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
- helm/benchmark/annotation/air_bench_annotator.py +1 -1
- helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
- helm/benchmark/annotation/bird_sql_annotator.py +2 -2
- helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
- helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
- helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +12 -16
- helm/benchmark/annotation/omni_math_annotator.py +13 -14
- helm/benchmark/annotation/wildbench_annotator.py +9 -9
- helm/benchmark/executor.py +11 -12
- helm/benchmark/metrics/aci_bench_metrics.py +9 -29
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
- helm/benchmark/metrics/classification_metrics.py +3 -3
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +9 -29
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/ifeval_metrics.py +2 -2
- helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
- helm/benchmark/metrics/llm_jury_metrics.py +46 -0
- helm/benchmark/metrics/med_dialog_metrics.py +9 -29
- helm/benchmark/metrics/medalign_metrics.py +9 -29
- helm/benchmark/metrics/medi_qa_metrics.py +9 -29
- helm/benchmark/metrics/medication_qa_metrics.py +10 -30
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/mental_health_metrics.py +9 -29
- helm/benchmark/metrics/metric_service.py +11 -11
- helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
- helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
- helm/benchmark/metrics/summac/model_summac.py +1 -2
- helm/benchmark/metrics/summarization_metrics.py +2 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
- helm/benchmark/metrics/toxicity_metrics.py +2 -2
- helm/benchmark/metrics/unitxt_metrics.py +3 -4
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
- helm/benchmark/metrics/vision_language/image_utils.py +2 -2
- helm/benchmark/model_deployment_registry.py +6 -8
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +33 -12
- helm/benchmark/presentation/run_display.py +13 -0
- helm/benchmark/presentation/schema.py +2 -1
- helm/benchmark/presentation/summarize.py +76 -59
- helm/benchmark/reeval_run.py +3 -4
- helm/benchmark/reeval_runner.py +3 -3
- helm/benchmark/run.py +78 -73
- helm/benchmark/run_expander.py +12 -1
- helm/benchmark/run_spec_factory.py +7 -6
- helm/benchmark/run_specs/audio_run_specs.py +52 -8
- helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
- helm/benchmark/run_specs/experimental_run_specs.py +31 -1
- helm/benchmark/run_specs/long_context_run_specs.py +67 -15
- helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
- helm/benchmark/run_specs/vlm_run_specs.py +28 -0
- helm/benchmark/runner.py +5 -5
- helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
- helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
- helm/benchmark/scenarios/clear_scenario.py +11 -7
- helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
- helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
- helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/headqa_scenario.py +6 -1
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
- helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
- helm/benchmark/scenarios/medalign_scenario.py +9 -3
- helm/benchmark/scenarios/medalign_scenario_helper.py +8 -5
- helm/benchmark/scenarios/medbullets_scenario.py +7 -2
- helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
- helm/benchmark/scenarios/medec_scenario.py +6 -1
- helm/benchmark/scenarios/medhallu_scenario.py +7 -1
- helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
- helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +16 -5
- helm/benchmark/scenarios/mimic_bhc_scenario.py +12 -7
- helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
- helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
- helm/benchmark/scenarios/numeracy_scenario.py +2 -1
- helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
- helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
- helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
- helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
- helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
- helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
- helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
- helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
- helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
- helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/server.py +2 -1
- helm/benchmark/static/schema_audio.yaml +60 -49
- helm/benchmark/static/schema_enterprise.yaml +21 -0
- helm/benchmark/static/schema_long_context.yaml +63 -20
- helm/benchmark/static/schema_medhelm.yaml +272 -213
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_vhelm.yaml +26 -26
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/index-94295e78.js +10 -0
- helm/benchmark/static_build/assets/index-b9779128.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
- helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
- helm/benchmark/static_build/index.html +4 -4
- helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +3 -4
- helm/benchmark/window_services/tokenizer_service.py +7 -8
- helm/clients/anthropic_client.py +69 -29
- helm/clients/audio_language/diva_llama_client.py +4 -2
- helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
- helm/clients/audio_language/qwen_audiolm_client.py +4 -2
- helm/clients/audio_language/test.py +62 -0
- helm/clients/bedrock_client.py +3 -1
- helm/clients/client.py +7 -7
- helm/clients/grok_client.py +36 -0
- helm/clients/huggingface_client.py +42 -3
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
- helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/openai_client.py +100 -54
- helm/clients/openai_responses_client.py +174 -0
- helm/clients/palmyra_client.py +2 -5
- helm/clients/reka_client.py +2 -2
- helm/clients/together_client.py +31 -4
- helm/clients/vertexai_client.py +6 -0
- helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
- helm/clients/vision_language/huggingface_vlm_client.py +2 -2
- helm/clients/vision_language/idefics_client.py +6 -2
- helm/clients/vision_language/paligemma_client.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +66 -53
- helm/clients/vision_language/qwen_vlm_client.py +7 -5
- helm/clients/writer_client.py +102 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/general.py +9 -2
- helm/common/hierarchical_logger.py +46 -3
- helm/common/local_context.py +140 -0
- helm/common/remote_context.py +61 -0
- helm/common/request.py +8 -0
- helm/config/model_deployments.yaml +864 -193
- helm/config/model_metadata.yaml +667 -53
- helm/config/tokenizer_configs.yaml +144 -3
- helm/proxy/cli.py +3 -1
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/services/server_service.py +21 -85
- helm/tokenizers/grok_tokenizer.py +53 -0
- helm/tokenizers/huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
- helm/benchmark/static_build/assets/index-262903c1.js +0 -10
- helm/benchmark/static_build/assets/index-42060d71.css +0 -1
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
- /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
helm/benchmark/run_expander.py
CHANGED
@@ -21,7 +21,10 @@ from helm.benchmark.model_metadata_registry import (
     AUDIO_LANGUAGE_MODEL_TAG,
     INSTRUCTION_FOLLOWING_MODEL_TAG,
 )
-from helm.benchmark.adaptation.adapters.adapter_factory import
+from helm.benchmark.adaptation.adapters.adapter_factory import (
+    ADAPT_GENERATION,
+    ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
+)
 from helm.benchmark.model_deployment_registry import get_model_names_with_tokenizer
 from helm.benchmark.run_spec import RunSpec
 from helm.benchmark.adaptation.adapter_spec import ADAPT_MULTIPLE_CHOICE_JOINT, AdapterSpec, Substitution
@@ -537,6 +540,7 @@ class MaxTrainInstancesRunExpander(ReplaceValueRunExpander):
         "all": [0, 1, 2, 4, 8, 16],  # Cap at 16 due to limited context length
         "big_bench_few_shot_setting": [0, 1, 2, 3],  # Commonly used few-shot setting in BIG-bench
         "vhelm": [0, 1, 2, 4, 8],
+        "melt": [0, 1, 5],
     }
 
 
@@ -1476,6 +1480,8 @@ class OutputFormatInstructions(RunExpander):
             instructions = "Answer with only a single letter."
         elif self.scenario == "mcqa":
             instructions = "Answer with only a single letter."
+        elif self.scenario == "mcqa_no_period":
+            instructions = "Answer with only a single letter. Do not include a period in your answer."
         elif self.scenario == "mcqa_only_last_question":
             instructions = "Answer only the last question with only a single letter."
         else:
@@ -1521,6 +1527,11 @@ class OutputFormatInstructions(RunExpander):
                )
            else:
                raise ValueError(f"Unknown scenario {self.scenario}")
+        elif run_spec.adapter_spec.method == ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT:
+            if self.scenario == "mmlu_pro" or self.scenario == "gpqa":
+                instructions = 'In your response, replace "insert answer here" with the single uppercase letter corresponding to your answer.'  # noqa: E501
+            else:
+                raise ValueError(f"Unknown scenario {self.scenario}")
 
         if self.no_prefix:
             if instructions:
helm/benchmark/run_spec_factory.py
CHANGED
@@ -143,12 +143,13 @@ def construct_run_specs(spec: ObjectSpec) -> List[RunSpec]:
    ):
        run_spec = singleton(IncreaseMaxTokensRunExpander(value=1).expand(run_spec))
 
-
-
-
-
-
-
+    # TODO: find a better solution for this
+    # if model.name.startswith("openai/o"):
+    #     # From https://platform.openai.com/docs/guides/reasoning,
+    #     # "OpenAI recommends reserving at least 25,000 tokens for reasoning and outputs when you start
+    #     # experimenting with these models. As you become familiar with the number of reasoning tokens your
+    #     # prompts require, you can adjust this buffer accordingly."
+    #     run_spec = singleton(IncreaseMaxTokensRunExpander(value=25_000).expand(run_spec))
 
     # IDEFICS special handling
     if IDEFICS_MODEL_TAG in model.tags:
helm/benchmark/run_specs/audio_run_specs.py
CHANGED
@@ -113,6 +113,18 @@ def _get_gpt4_critique_metric_specs(num_respondents: int, max_tokens: int) -> List[MetricSpec]:
     ]
 
 
+def _get_gpt4_refusal_metric_specs(num_respondents: int, max_tokens: int) -> List[MetricSpec]:
+    return [
+        MetricSpec(
+            class_name="helm.benchmark.metrics.gpt4_audio_refusal_metrics.GPT4AudioRefusalCritiqueMetric",
+            args={
+                "num_respondents": num_respondents,
+                "max_tokens": max_tokens,
+            },
+        )
+    ]
+
+
 ########################################################################################################################
 # RunSpecs
 
@@ -215,16 +227,20 @@ def get_mustard_audio_run_spec() -> RunSpec:
 
 
 @run_spec_function("voice_jailbreak_attacks")
-def get_voice_jailbreak_attacks_run_spec(subset: str) -> RunSpec:
+def get_voice_jailbreak_attacks_run_spec(subset: str, num_respondents: int = 1) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.audio_language.voice_jailbreak_attacks_scenario."
         "VoiceJailbreakAttacksScenario",
         args={"subset": subset},
     )
-    adapter_spec = _get_generation_adapter_spec(
-
-
+    adapter_spec = _get_generation_adapter_spec(
+        instructions="Listen to the audio and respond according to its instructions.",
+        max_tokens=1024,
     )
+    metric_specs: List[MetricSpec] = _get_gpt4_refusal_metric_specs(
+        num_respondents=num_respondents,
+        max_tokens=200,
+    ) + get_generative_harms_metric_specs(include_basic_metrics=True, include_generative_harms_metrics=True)
 
     run_spec_name: str = "voice_jailbreak_attacks"
     return RunSpec(
@@ -258,19 +274,20 @@ def get_covost2_run_spec(source_language: str, target_language: str) -> RunSpec:
 
 
 @run_spec_function("vocal_sound")
-def get_vocal_sound_run_spec() -> RunSpec:
+def get_vocal_sound_run_spec(sound: str) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.audio_language.vocal_sound_scenario.VocalSoundScenario",
+        args={"sound": sound},
     )
     adapter_spec = _get_generation_adapter_spec(
         instructions="Listen to the audio and classify the speaker behavior. Choose only from these options:"
         '"Cough", "Laughter", "Sigh", "Sneeze", "Sniff", or "Throat clearing". Respond with just the behavior.',
         max_tokens=5,
     )
-    metric_specs = get_exact_match_metric_specs()
+    metric_specs = get_exact_match_metric_specs()
     run_spec_name: str = "vocal_sound"
     return RunSpec(
-        name=run_spec_name,
+        name=f"{run_spec_name}:sound={sound}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
@@ -501,13 +518,20 @@ def get_air_bench_chat_run_spec(subject: str, num_respondents: int = 1) -> RunSpec:
         )
         + _get_open_ended_generation_metric_specs()
     )
+
     run_spec_name: str = "air_bench_chat"
+    group_name: str = run_spec_name
+    if subject in ["mix", "speech"]:
+        group_name += "_reasoning"
+    elif subject in ["sound", "music"]:
+        group_name += "_knowledge"
+
     return RunSpec(
         name=f"{run_spec_name}:subject={subject}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
-        groups=[
+        groups=[group_name],
     )
 
 
@@ -611,3 +635,23 @@ def get_parade_run_spec(voice: str, subset: str) -> RunSpec:
         metric_specs=metric_specs,
         groups=[run_spec_name],
     )
+
+
+@run_spec_function("corebench")
+def get_corebench_run_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.audio_language.corebench_scenario.COREBenchScenario",
+    )
+    adapter_spec = _get_generation_adapter_spec(
+        instructions="",
+        max_tokens=10,
+    )
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
+    run_spec_name: str = "corebench"
+    return RunSpec(
+        name=f"{run_spec_name}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
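As a sanity check, the reworked vocal_sound entry point above can be exercised directly in Python. This is only an illustrative sketch, not part of the diff; the import path comes from the file listing and the name/groups behavior from the hunk itself:

from helm.benchmark.run_specs.audio_run_specs import get_vocal_sound_run_spec

# The run spec is now parameterized by the sound subset, and the subset is
# encoded in the run spec name.
run_spec = get_vocal_sound_run_spec(sound="Laughter")
print(run_spec.name)    # vocal_sound:sound=Laughter
print(run_spec.groups)  # ['vocal_sound']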
helm/benchmark/run_specs/enterprise_run_specs.py
CHANGED
@@ -100,6 +100,26 @@ def get_conv_fin_qa_calc_spec() -> RunSpec:
     )
 
 
+@run_spec_function("kpi_edgar")
+def get_kpi_edgar_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.kpi_edgar_scenario.KPIEDGARScenario",
+    )
+
+    adapter_spec = get_generation_adapter_spec(
+        input_noun=None, output_noun="Answer", max_tokens=100, max_train_instances=20
+    )
+
+    return RunSpec(
+        name="kpi_edgar",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_basic_metric_specs([])
+        + [MetricSpec(class_name="helm.benchmark.metrics.kpi_edgar_metrics.KPIEdgarMetric")],
+        groups=["kpi_edgar"],
+    )
+
+
 # Legal
 
 
helm/benchmark/run_specs/experimental_run_specs.py
CHANGED
@@ -6,7 +6,11 @@ from helm.benchmark.adaptation.adapter_spec import ADAPT_GENERATION, AdapterSpec
 from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_MULTIPLE_CHOICE_JOINT
 from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec, get_generation_adapter_spec
 from helm.benchmark.annotation.annotator import AnnotatorSpec
-from helm.benchmark.metrics.common_metric_specs import
+from helm.benchmark.metrics.common_metric_specs import (
+    get_basic_metric_specs,
+    get_exact_match_metric_specs,
+    get_open_ended_generation_metric_specs,
+)
 from helm.benchmark.metrics.metric import MetricSpec
 from helm.benchmark.run_spec import RunSpec, run_spec_function
 from helm.benchmark.scenarios.scenario import ScenarioSpec
@@ -192,3 +196,29 @@ def get_czech_bank_qa_spec(config_name: str = "berka_queries_1024_2024_12_18") -> RunSpec:
         annotators=[AnnotatorSpec("helm.benchmark.annotation.czech_bank_qa_annotator.CzechBankQAAnnotator")],
         groups=["czech_bank_qa"],
     )
+
+
+@run_spec_function("medi_qa_without_annotator")
+def get_medi_qa_without_annotator_spec() -> RunSpec:
+    """A version of medi_qa that does not use annotators.
+
+    EXPERIMENTAL: You should probably use medi_qa instead."""
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.medi_qa_scenario.MediQAScenario", args={})
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions="Answer the following consumer health question.",
+        input_noun="Question",
+        output_noun="Answer",
+        max_tokens=1024,
+        max_train_instances=0,
+        stop_sequences=[],
+    )
+
+    metric_specs = get_open_ended_generation_metric_specs()
+    return RunSpec(
+        name="medi_qa",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=["medi_qa"],
+    )
helm/benchmark/run_specs/long_context_run_specs.py
CHANGED
@@ -1,5 +1,9 @@
-from helm.benchmark.adaptation.adapter_spec import ADAPT_GENERATION, AdapterSpec
-from helm.benchmark.metrics.common_metric_specs import
+from helm.benchmark.adaptation.adapter_spec import ADAPT_CHAT, ADAPT_GENERATION, AdapterSpec
+from helm.benchmark.metrics.common_metric_specs import (
+    get_exact_match_metric_specs,
+    get_open_ended_generation_metric_specs,
+)
+from helm.benchmark.metrics.metric import MetricSpec
 from helm.benchmark.run_spec import RunSpec, run_spec_function
 from helm.benchmark.scenarios.scenario import ScenarioSpec
 
@@ -26,7 +30,7 @@ def _get_long_context_generation_adapter_spec(max_tokens: int) -> AdapterSpec:
 
 
 @run_spec_function("ruler_hotpotqa")
-def get_ruler_hotpotqa_spec(max_num_words: int = 65536) -> RunSpec:
+def get_ruler_hotpotqa_spec(max_num_words: int = 131072) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.ruler_qa_scenarios.RULERHotpotQAScenario",
         args={
@@ -35,18 +39,21 @@ def get_ruler_hotpotqa_spec(max_num_words: int = 65536) -> RunSpec:
     )
 
     adapter_spec = _get_long_context_generation_adapter_spec(max_tokens=100)
+    metric_specs = get_open_ended_generation_metric_specs() + [
+        MetricSpec(class_name="helm.benchmark.metrics.ruler_qa_metrics.RulerQAMetric")
+    ]
 
     return RunSpec(
         name=f"ruler_hotpotqa:max_num_words={max_num_words}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=
+        metric_specs=metric_specs,
         groups=["ruler_hotpotqa"],
     )
 
 
 @run_spec_function("ruler_squad")
-def get_ruler_squad_spec(max_num_words: int = 65536) -> RunSpec:
+def get_ruler_squad_spec(max_num_words: int = 131072) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.ruler_qa_scenarios.RULERSQuADScenario",
         args={
@@ -55,35 +62,80 @@ def get_ruler_squad_spec(max_num_words: int = 65536) -> RunSpec:
     )
 
     adapter_spec = _get_long_context_generation_adapter_spec(max_tokens=100)
+    metric_specs = get_open_ended_generation_metric_specs() + [
+        MetricSpec(class_name="helm.benchmark.metrics.ruler_qa_metrics.RulerQAMetric")
+    ]
 
     return RunSpec(
         name=f"ruler_squad:max_num_words={max_num_words}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=
+        metric_specs=metric_specs,
         groups=["ruler_squad"],
     )
 
 
-@run_spec_function("
-def
+@run_spec_function("infinite_bench_en_qa")
+def get_infinite_bench_en_qa_spec(max_num_words: int = 131072) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.infinite_bench_en_qa_scenario.InfiniteBenchEnQAScenario",
+        args={
+            "max_num_words": max_num_words,
+        },
+    )
+
+    adapter_spec = _get_long_context_generation_adapter_spec(max_tokens=40)
+    metric_specs = get_open_ended_generation_metric_specs()
+
+    return RunSpec(
+        name=f"infinite_bench_en_qa:max_num_words={max_num_words}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=["infinite_bench_en_qa"],
+    )
+
+
+@run_spec_function("infinite_bench_en_sum")
+def get_infinite_bench_en_sum_spec(max_num_words: int = 131072) -> RunSpec:
 
     scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.
+        class_name="helm.benchmark.scenarios.infinite_bench_en_sum_scenario.InfiniteBenchEnSumScenario",
         args={
-            "min_num_words": min_num_words,
             "max_num_words": max_num_words,
         },
     )
 
-
-
-
+    adapter_spec = _get_long_context_generation_adapter_spec(max_tokens=1200)
+    metric_specs = get_open_ended_generation_metric_specs()
+
+    return RunSpec(
+        name=f"infinite_bench_en_sum:max_num_words={max_num_words}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=["infinite_bench_en_sum"],
+    )
+
+
+@run_spec_function("openai_mrcr")
+def get_openai_mrcr_spec(needles: int, max_num_words: int = 131072) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.openai_mrcr_scenario.OpenAIMRCRScenario",
+        args={"needles": needles, "max_num_words": max_num_words},
+    )
+
+    adapter_spec = AdapterSpec(
+        method=ADAPT_CHAT, input_prefix="", output_prefix="", max_tokens=2000, num_outputs=1, temperature=0.0
+    )
+    metric_specs = get_exact_match_metric_specs() + [
+        MetricSpec(class_name="helm.benchmark.metrics.openai_mrcr_metrics.OpenAIMRCRMetric")
+    ]
 
     return RunSpec(
-        name=f"
+        name=f"openai_mrcr:needles={needles},max_num_words={max_num_words}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
-        groups=["
+        groups=["openai_mrcr"],
    )
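A similar illustrative sketch, not part of the diff, for the new openai_mrcr entry point added above; the import path comes from the file listing and the name/groups behavior from the hunk itself:

from helm.benchmark.run_specs.long_context_run_specs import get_openai_mrcr_spec

# Builds a chat-style RunSpec whose name encodes both parameters.
run_spec = get_openai_mrcr_spec(needles=2, max_num_words=131072)
print(run_spec.name)    # openai_mrcr:needles=2,max_num_words=131072
print(run_spec.groups)  # ['openai_mrcr']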