crfm-helm 0.5.7__py3-none-any.whl → 0.5.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of crfm-helm might be problematic.
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/METADATA +5 -77
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/RECORD +228 -197
- helm/benchmark/adaptation/adapter_spec.py +5 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
- helm/benchmark/annotation/aci_bench_annotator.py +11 -22
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
- helm/benchmark/annotation/dischargeme_annotator.py +11 -22
- helm/benchmark/annotation/med_dialog_annotator.py +11 -22
- helm/benchmark/annotation/medalign_annotator.py +11 -22
- helm/benchmark/annotation/medi_qa_annotator.py +11 -22
- helm/benchmark/annotation/medication_qa_annotator.py +11 -22
- helm/benchmark/annotation/mental_health_annotator.py +11 -22
- helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
- helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
- helm/benchmark/annotation/model_as_judge.py +23 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
- helm/benchmark/metrics/air_bench_metrics.py +3157 -1
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/basic_metrics.py +267 -2
- helm/benchmark/metrics/classification_metrics.py +19 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
- helm/benchmark/metrics/dry_run_metrics.py +30 -1
- helm/benchmark/metrics/efficiency_metrics.py +74 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +299 -0
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
- helm/benchmark/metrics/ifeval_metrics.py +13 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
- helm/benchmark/metrics/language_modeling_metrics.py +13 -1
- helm/benchmark/metrics/live_qa_metrics.py +13 -1
- helm/benchmark/metrics/llm_jury_metrics.py +13 -1
- helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
- helm/benchmark/metrics/medec_metrics.py +25 -2
- helm/benchmark/metrics/metric.py +25 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
- helm/benchmark/metrics/omni_math_metrics.py +13 -1
- helm/benchmark/metrics/seahelm_metrics.py +14 -1
- helm/benchmark/metrics/summac/model_summac.py +2 -2
- helm/benchmark/metrics/summarization_metrics.py +129 -1
- helm/benchmark/metrics/toxicity_metrics.py +31 -1
- helm/benchmark/metrics/wildbench_metrics.py +21 -1
- helm/benchmark/presentation/schema.py +5 -22
- helm/benchmark/presentation/summarize.py +180 -11
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/run_expander.py +4 -0
- helm/benchmark/run_specs/arabic_run_specs.py +134 -16
- helm/benchmark/run_specs/bluex_run_specs.py +1 -1
- helm/benchmark/run_specs/classic_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +2 -2
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +360 -50
- helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
- helm/benchmark/scenarios/air_bench_scenario.py +21 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +8 -4
- helm/benchmark/scenarios/aratrust_scenario.py +19 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
- helm/benchmark/scenarios/bbq_scenario.py +15 -0
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/bluex_scenario.py +6 -2
- helm/benchmark/scenarios/bold_scenario.py +15 -0
- helm/benchmark/scenarios/boolq_scenario.py +20 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
- helm/benchmark/scenarios/clear_scenario.py +23 -0
- helm/benchmark/scenarios/cleva_scenario.py +479 -0
- helm/benchmark/scenarios/code_scenario.py +28 -0
- helm/benchmark/scenarios/commonsense_scenario.py +26 -0
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
- helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
- helm/benchmark/scenarios/disinformation_scenario.py +22 -0
- helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
- helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
- helm/benchmark/scenarios/gpqa_scenario.py +18 -0
- helm/benchmark/scenarios/grammar_scenario.py +20 -1
- helm/benchmark/scenarios/gsm_scenario.py +15 -0
- helm/benchmark/scenarios/headqa_scenario.py +22 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
- helm/benchmark/scenarios/ice_scenario.py +21 -1
- helm/benchmark/scenarios/ifeval_scenario.py +18 -0
- helm/benchmark/scenarios/imdb_scenario.py +15 -0
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/legal_support_scenario.py +13 -0
- helm/benchmark/scenarios/legalbench_scenario.py +20 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
- helm/benchmark/scenarios/lextreme_scenario.py +11 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +26 -0
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
- helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
- helm/benchmark/scenarios/med_qa_scenario.py +14 -0
- helm/benchmark/scenarios/medalign_scenario.py +23 -0
- helm/benchmark/scenarios/medbullets_scenario.py +22 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
- helm/benchmark/scenarios/medec_scenario.py +23 -0
- helm/benchmark/scenarios/medhallu_scenario.py +23 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +23 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
- helm/benchmark/scenarios/mental_health_scenario.py +23 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +24 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
- helm/benchmark/scenarios/mmlu_scenario.py +15 -0
- helm/benchmark/scenarios/msmarco_scenario.py +30 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +20 -0
- helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
- helm/benchmark/scenarios/omni_math_scenario.py +18 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
- helm/benchmark/scenarios/quac_scenario.py +14 -0
- helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
- helm/benchmark/scenarios/raft_scenario.py +15 -0
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/scenario.py +31 -0
- helm/benchmark/scenarios/seahelm_scenario.py +348 -0
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
- helm/benchmark/scenarios/summarization_scenario.py +37 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +1 -1
- helm/benchmark/scenarios/test_bluex_scenario.py +2 -2
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/vicuna_scenario.py +21 -1
- helm/benchmark/scenarios/wikifact_scenario.py +20 -0
- helm/benchmark/scenarios/wildbench_scenario.py +18 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +12 -0
- helm/benchmark/static/schema_arabic.yaml +55 -12
- helm/benchmark/static/schema_long_context.yaml +17 -17
- helm/benchmark/static/schema_medhelm.yaml +36 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static_build/assets/index-671a5e06.js +10 -0
- helm/benchmark/static_build/assets/index-9352595e.css +1 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
- helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
- helm/clients/openai_client.py +31 -19
- helm/clients/openai_responses_client.py +27 -3
- helm/clients/openrouter_client.py +31 -0
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/together_client.py +48 -11
- helm/clients/vertexai_client.py +8 -2
- helm/config/model_deployments.yaml +75 -1
- helm/config/model_metadata.yaml +70 -2
- helm/config/tokenizer_configs.yaml +19 -1
- helm/proxy/example_queries.py +8 -8
- helm/proxy/server.py +2 -1
- helm/proxy/static/index.css +4 -0
- helm/proxy/static/index.js +7 -1
- helm/benchmark/metrics/aci_bench_metrics.py +0 -14
- helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
- helm/benchmark/metrics/dischargeme_metrics.py +0 -14
- helm/benchmark/metrics/med_dialog_metrics.py +0 -14
- helm/benchmark/metrics/medalign_metrics.py +0 -14
- helm/benchmark/metrics/medi_qa_metrics.py +0 -14
- helm/benchmark/metrics/medication_qa_metrics.py +0 -14
- helm/benchmark/metrics/mental_health_metrics.py +0 -14
- helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
- helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
- helm/benchmark/static_build/assets/index-b9779128.css +0 -1
- helm/benchmark/static_build/assets/index-e439d5e1.js +0 -10
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/top_level.txt +0 -0
helm/benchmark/presentation/taxonomy_info.py
ADDED
@@ -0,0 +1,20 @@
+from dataclasses import dataclass
+from typing import Optional
+
+
+@dataclass(frozen=True)
+class TaxonomyInfo:
+    # Task (e.g., question answering)
+    task: Optional[str] = None
+
+    # Domain - genre (e.g., Wikipedia)
+    what: Optional[str] = None
+
+    # Domain - when it was written (e.g., 2010s)
+    when: Optional[str] = None
+
+    # Domain - demographics (e.g., web users)
+    who: Optional[str] = None
+
+    # Language (e.g., English)
+    language: Optional[str] = None
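For context, a minimal usage sketch (not part of the diff) of the new frozen dataclass; the field values simply echo the examples given in its comments.

# Hypothetical usage of TaxonomyInfo; values mirror the examples in the field comments above.
from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo

taxonomy = TaxonomyInfo(
    task="question answering",
    what="Wikipedia",
    when="2010s",
    who="web users",
    language="English",
)
print(taxonomy)  # frozen dataclass: fields are read-only after construction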
helm/benchmark/run_expander.py
CHANGED
@@ -1484,6 +1484,8 @@ class OutputFormatInstructions(RunExpander):
                 instructions = "Answer with only a single letter. Do not include a period in your answer."
             elif self.scenario == "mcqa_only_last_question":
                 instructions = "Answer only the last question with only a single letter."
+            elif self.scenario == "arabic_mcqa":
+                instructions = "اكتب حرف الإجابة فقط، دون أي إضافات أخرى."
             else:
                 instructions = "Answer with only a single letter."
         elif run_spec.adapter_spec.method == ADAPT_GENERATION:
@@ -1525,6 +1527,8 @@ class OutputFormatInstructions(RunExpander):
                     "Answer only the last question with a short answer. "
                     "Avoid extra, unnecessary information in the answer."
                 )
+            elif self.scenario == "arabic_mcqa":
+                instructions = "اكتب حرف الإجابة فقط، دون أي إضافات أخرى."
             else:
                 raise ValueError(f"Unknown scenario {self.scenario}")
         elif run_spec.adapter_spec.method == ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT:
helm/benchmark/run_specs/arabic_run_specs.py
CHANGED
@@ -4,25 +4,35 @@ EXPERIMENTAL: Run specs here may have future reverse incompatible changes."""
 
 from helm.benchmark.adaptation.adapter_spec import ADAPT_MULTIPLE_CHOICE_JOINT
 from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec, get_generation_adapter_spec
-from helm.benchmark.
+from helm.benchmark.annotation.annotator import AnnotatorSpec
+from helm.benchmark.metrics.common_metric_specs import get_basic_metric_specs, get_exact_match_metric_specs
+from helm.benchmark.metrics.metric import MetricSpec
 from helm.benchmark.run_spec import RunSpec, run_spec_function
 from helm.benchmark.scenarios.scenario import ScenarioSpec
 
 
+_ARABIC_REFERENCE_PREFIX_CHARACTERS = ["أ", "ب", "ج", "د", "هـ"]
+
+
 @run_spec_function("arabic_mmlu")
-def get_arabic_mmlu_spec() -> RunSpec:
+def get_arabic_mmlu_spec(subset: str) -> RunSpec:
     """EXPERIMENTAL: This run spec here may have future reverse incompatible changes."""
-
+
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.arabic_mmlu_scenario.ArabicMMLUScenario", args={"subset": subset}
+    )
 
     adapter_spec = get_multiple_choice_adapter_spec(
         method=ADAPT_MULTIPLE_CHOICE_JOINT,
-        instructions="
-        input_noun="
-        output_noun="
+        instructions="السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة",  # noqa: E501
+        input_noun="السؤال",
+        output_noun="الإجابة",
+        max_tokens=100,
+        reference_prefix_characters=_ARABIC_REFERENCE_PREFIX_CHARACTERS,
     )
 
     return RunSpec(
-        name="arabic_mmlu",
+        name=f"arabic_mmlu:subset={subset}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs(),
@@ -39,9 +49,11 @@ def get_alghafa_spec(subset: str) -> RunSpec:
 
     adapter_spec = get_multiple_choice_adapter_spec(
         method=ADAPT_MULTIPLE_CHOICE_JOINT,
-        instructions="
-        input_noun="
-        output_noun="
+        instructions="الأسئلة التالية هي أسئلة متعددة الإختيارات مع الجواب الصحيح",  # noqa: E501
+        input_noun="السؤال",
+        output_noun="الإجابة",
+        max_tokens=100,
+        reference_prefix_characters=_ARABIC_REFERENCE_PREFIX_CHARACTERS,
     )
 
     return RunSpec(
@@ -54,20 +66,126 @@ def get_alghafa_spec(subset: str) -> RunSpec:
 
 
 @run_spec_function("aratrust")
-def get_aratrust_spec() -> RunSpec:
+def get_aratrust_spec(category: str) -> RunSpec:
     """EXPERIMENTAL: This run spec here may have future reverse incompatible changes."""
-    scenario_spec = ScenarioSpec(
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.aratrust_scenario.AraTrustScenario",
+        args={"category": category},
+    )
 
     adapter_spec = get_generation_adapter_spec(
-        instructions="
-        input_noun="
-        output_noun="
+        instructions="السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة: أ، ب أو ج",  # noqa: E501
+        input_noun="السؤال",
+        output_noun="الإجابة",
+        max_tokens=100,
     )
 
     return RunSpec(
-        name="aratrust",
+        name=f"aratrust:category={category}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs(),
         groups=["aratrust"],
     )
+
+
+@run_spec_function("alrage")
+def get_alrage_spec() -> RunSpec:
+    """EXPERIMENTAL: This run spec here may have future reverse incompatible changes."""
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.alrage_scenario.ALRAGEScenario")
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions="بناءً على السياقات المقترحة التالية، اجب عن السؤال التالي",  # noqa: E501
+        input_noun="السؤال",
+        output_noun="الإجابة",
+        max_tokens=100,
+    )
+
+    annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.alrage_annotator.ALRAGEAnnotator")]
+
+    metric_specs = [
+        MetricSpec(class_name="helm.benchmark.metrics.alrage_metric.ALRAGEMetric")
+    ] + get_basic_metric_specs([])
+
+    return RunSpec(
+        name="alrage",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        annotators=annotator_specs,
+        metric_specs=metric_specs,
+        groups=["alrage"],
+    )
+
+
+@run_spec_function("madinah_qa")
+def get_madinah_qa_spec(subset: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.madinah_qa_scenario.MadinahQAScenario", args={"subset": subset}
+    )
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions="السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة",  # noqa: E501
+        input_noun="السؤال",
+        output_noun="الإجابة",
+        max_tokens=100,
+        reference_prefix_characters=_ARABIC_REFERENCE_PREFIX_CHARACTERS,
+    )
+
+    return RunSpec(
+        name=f"madinah_qa:subset={subset}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["madinah_qa"],
+    )
+
+
+@run_spec_function("mbzuai_human_translated_arabic_mmlu")
+def get_arabic_mmmlu_spec(subject: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.mbzuai_human_translated_arabic_mmlu.MBZUAIHumanTranslatedArabicMMLUScenario",
+        args={"subject": subject},
+    )
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions="السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة",  # noqa: E501
+        input_noun="السؤال",
+        output_noun="الإجابة",
+        max_tokens=100,
+        reference_prefix_characters=_ARABIC_REFERENCE_PREFIX_CHARACTERS,
+    )
+
+    return RunSpec(
+        name=f"mbzuai_human_translated_arabic_mmlu:subject={subject}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["mbzuai_human_translated_arabic_mmlu"],
+    )
+
+
+@run_spec_function("arabic_exams")
+def get_arabic_exams_spec(subject: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.arabic_exams_scenario.ArabicEXAMSScenario",
+        args={"subject": subject},
+    )
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions="السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة",  # noqa: E501
+        input_noun="السؤال",
+        output_noun="الإجابة",
+        max_tokens=100,
+        reference_prefix_characters=_ARABIC_REFERENCE_PREFIX_CHARACTERS,
+    )
+
+    return RunSpec(
+        name=f"arabic_exams:subject={subject}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["arabic_exams"],
+    )
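As an aside, a minimal usage sketch (not part of the diff) showing that the new Arabic run specs are parameterized, so run names encode their arguments; the subset value here is a made-up placeholder.

# Hypothetical usage of the new parameterized Arabic run spec ("Biology" is an assumed subset name).
from helm.benchmark.run_specs.arabic_run_specs import get_arabic_mmlu_spec

run_spec = get_arabic_mmlu_spec(subset="Biology")
print(run_spec.name)          # -> "arabic_mmlu:subset=Biology"
print(run_spec.adapter_spec)  # carries the Arabic instructions, أ/ب/ج/د/هـ reference prefixes, max_tokens=100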
helm/benchmark/run_specs/bluex_run_specs.py
CHANGED
@@ -7,7 +7,7 @@ from helm.benchmark.scenarios.scenario import ScenarioSpec
 
 @run_spec_function("bluex")
 def get_bluex_spec() -> RunSpec:
-    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bluex_scenario.
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bluex_scenario.BLUEXScenario", args={})
 
     adapter_spec = get_multiple_choice_adapter_spec(
         method=ADAPT_MULTIPLE_CHOICE_JOINT,
helm/benchmark/run_specs/classic_run_specs.py
CHANGED
@@ -753,12 +753,12 @@ def get_xsum_sampled_summarization_spec(temperature: float = 0.3, device: str =
     )
 
     return RunSpec(
-        name=f"
+        name=f"summarization_xsum_sampled:temperature={temperature},device={device}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_summarization_metric_specs({"task": "summarization_xsum_sampled", "device": device})
         + get_generative_harms_metric_specs(),
-        groups=["
+        groups=["summarization_xsum_sampled"],
     )
 
 
helm/benchmark/run_specs/long_context_run_specs.py
CHANGED
@@ -39,12 +39,12 @@ def _get_long_context_multiple_choice_adapter_spec(max_tokens: int) -> AdapterSp
         method=ADAPT_MULTIPLE_CHOICE_JOINT,
         global_prefix="",
         global_suffix="",
-        instructions="Read the passage and answer the following question. Respond with only a single letter corresponding to your choice.",  # noqa: E501
+        instructions="Read the passage and answer the following question. Respond with only a single letter corresponding to your choice. Do not include a period in your answer.\n\n",  # noqa: E501
         input_prefix="",
         input_suffix="\n",
         reference_prefix="A. ",
         reference_suffix="\n",
-        output_prefix="",
+        output_prefix="\nAnswer the question above based on the passage. Respond with only a single letter corresponding to your choice. Do not include a period in your answer.\n",  # noqa: E501
         output_suffix="",
         instance_prefix="",
         max_train_instances=0,
helm/benchmark/run_specs/medhelm/__init__.py
File without changes
helm/benchmark/run_specs/medhelm/benchmark_config.py
ADDED
@@ -0,0 +1,219 @@
+import yaml
+import json
+import re
+
+from dataclasses import dataclass
+from typing import Any, Dict, List, Union
+from abc import ABC
+
+from helm.benchmark.annotation.annotator import AnnotatorSpec
+from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo
+from helm.benchmark.metrics.metric import MetricSpec
+from helm.benchmark.metrics.common_metric_specs import (
+    get_basic_metric_specs,
+    get_exact_match_metric_specs,
+    get_summarization_metric_specs,
+)
+from helm.common.gpu_utils import get_torch_device_name
+
+
+SUMMARIZATION_METRICS = {
+    "rouge_1",
+    "rouge_2",
+    "rouge_l",
+    "BERTScore-P",
+    "BERTScore-R",
+    "BERTScore-F",
+}
+
+
+@dataclass(frozen=True)
+class MetricConfig(ABC):
+    """Base class for all metric configurations"""
+
+    name: str
+
+
+@dataclass(frozen=True)
+class SimpleMetricConfig(MetricConfig):
+    """Configuration for simple string-based metrics like 'exact_match'"""
+
+    pass
+
+
+@dataclass(frozen=True)
+class JuryMetricConfig(MetricConfig):
+    """Configuration for jury-based metrics with multiple judges"""
+
+    prompt_file: str
+    judges: List[AnnotatorModelInfo]
+
+
+@dataclass(frozen=True)
+class BenchmarkConfig:
+    """
+    A benchmark configuration is an immutable data structure that holds
+    the configuration for a specific benchmark, including prompt, dataset and metric
+    """
+
+    name: str
+    """Name of the benchmark"""
+
+    description: str
+    """Description of the benchmark"""
+
+    prompt_file: str
+    """Path to the prompt file. This prompt will be used for all instances of the benchmark."""
+
+    dataset_file: str
+    """Path to the dataset file. This dataset will be used to populate the context in the prompt."""
+
+    main_metric: Union[SimpleMetricConfig, JuryMetricConfig]
+    """The main metric for the benchmark"""
+
+    metrics: List[Union[SimpleMetricConfig, JuryMetricConfig]]
+    """List of structured metric configurations for the benchmark"""
+
+    max_tokens: int = 1024
+    """Maximum number of tokens to generate in the response"""
+
+    def get_metric_specs(self) -> List[MetricSpec]:
+        """Get the metric specifications for the benchmark"""
+        metric_specs: List[MetricSpec] = []
+        summarization = False
+        for metric in self.metrics:
+            if metric.name == "exact_match":
+                metric_specs.extend(get_exact_match_metric_specs())
+
+            elif metric.name == "jury_score":
+                if not isinstance(metric, JuryMetricConfig):
+                    raise AssertionError("Metric 'jury_score' must be a JuryMetricConfig")
+                annotator_models = {judge.model_deployment: judge for judge in metric.judges}
+                metric_specs.append(
+                    MetricSpec(
+                        class_name="helm.benchmark.metrics.llm_jury_metrics.LLMJuryMetric",
+                        args={
+                            "metric_name": "jury_score",
+                            "scenario_name": self.name,
+                            "annotator_models": annotator_models,
+                        },
+                    )
+                )
+                if len(self.metrics) == 1:
+                    metric_specs.extend(get_basic_metric_specs([]))
+
+            elif metric.name in SUMMARIZATION_METRICS:
+                if not summarization:
+                    summarization = True
+                    metric_args = {
+                        "task": self.name,
+                        "device": get_torch_device_name(),
+                        "bertscore_model": "distilbert-base-uncased",
+                        "rescale_with_baseline": False,
+                    }
+                    metric_specs.extend(get_summarization_metric_specs(metric_args))
+            else:
+                raise ValueError(f"Unknown metric name: {metric.name}")
+        return metric_specs
+
+    def _get_annotation_criteria(self, prompt_template: str) -> Dict[str, List[str]]:
+        criteria_tag = re.compile(r"<rubric_criteria>\s*(\{.*?\})\s*</rubric_criteria>", re.DOTALL)
+        m = criteria_tag.search(prompt_template)
+        if not m:
+            raise ValueError("No <rubric_criteria>{...}</rubric_criteria> block found in prompt_template.")
+        raw = json.loads(m.group(1))
+        # normalize to Dict[str, Set[str]]
+        return {k: list(v) for k, v in raw.items()}
+
+    def get_annotator_specs(self) -> List[AnnotatorSpec]:
+        """Convert jury metrics to AnnotatorSpec objects"""
+        annotator_specs = []
+        # return annotator_specs
+        for metric in self.metrics:
+            if isinstance(metric, JuryMetricConfig):
+                with open(metric.prompt_file, "r") as f:
+                    prompt_template = f.read()
+                annotator_models = {judge.model_deployment: judge for judge in metric.judges}
+                annotator_criteria = self._get_annotation_criteria(prompt_template)
+                # Create a generic annotator spec - you may need to customize the class_name
+                # based on your specific use case
+                annotator_specs.append(
+                    AnnotatorSpec(
+                        class_name="helm.benchmark.annotation.model_as_judge.LLMAsJuryAnnotator",
+                        args={
+                            "name": self.name,
+                            "prompt_template": prompt_template,
+                            "annotation_criteria": annotator_criteria,
+                            "annotator_models": annotator_models,
+                        },
+                    )
+                )
+
+        return annotator_specs
+
+
+def _convert_metrics(raw_metrics: List[Dict[str, Any]]) -> List[MetricConfig]:
+    """
+    Convert raw metrics from YAML into structured MetricConfig objects.
+    """
+    converted_metrics: List[MetricConfig] = []
+
+    for metric in raw_metrics:
+        if not isinstance(metric, dict) or "name" not in metric:
+            raise ValueError(
+                f"Invalid metric format: {metric}. Each metric must be a dict with at least a 'name' field."
+            )
+
+        metric_name = metric["name"]
+
+        if metric_name == "jury_score":
+            if "prompt_file" not in metric or "judges" not in metric:
+                raise ValueError(f"jury_score metric requires 'prompt_file' and 'judges': {metric}")
+
+            judges = [
+                AnnotatorModelInfo(
+                    model_name=j["model_name"],
+                    model_deployment=j["name"],
+                )
+                for j in metric["judges"]
+            ]
+
+            converted_metrics.append(
+                JuryMetricConfig(name=metric_name, prompt_file=metric["prompt_file"], judges=judges)
+            )
+        else:
+            converted_metrics.append(SimpleMetricConfig(name=metric_name))
+
+    return converted_metrics
+
+
+def _structure_benchmark_config(data: Dict[str, Any], cls) -> BenchmarkConfig:
+    """
+    Custom structure function for BenchmarkConfig that handles metrics conversion
+    """
+    if "metrics" in data:
+        data = data.copy()  # Don't modify the original
+        raw_metrics = data["metrics"]
+        data["metrics"] = _convert_metrics(raw_metrics)
+        data["main_metric"] = data["metrics"][0]
+    else:
+        raise ValueError("No metrics specified.")
+
+    return BenchmarkConfig(
+        name=data["name"],
+        description=data["description"],
+        prompt_file=data["prompt_file"],
+        dataset_file=data["dataset_file"],
+        main_metric=data["main_metric"],
+        metrics=data["metrics"],
+        max_tokens=data.get("max_tokens", 1024),
+    )
+
+
+def get_benchmark_config_from_path(path: str) -> BenchmarkConfig:
+    """Load and parse benchmark configuration from YAML file"""
+    with open(path) as f:
+        config = yaml.safe_load(f)
+
+    benchmark_config = _structure_benchmark_config(config, BenchmarkConfig)
+    return benchmark_config
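To illustrate the shape of configuration that get_benchmark_config_from_path parses, here is a hedged sketch: the field names mirror BenchmarkConfig and _convert_metrics above, while the file paths and judge entries are placeholders invented for the example.

# Hypothetical benchmark config; paths and the judge deployment/model names are placeholders.
import yaml

example_config = {
    "name": "my_medhelm_benchmark",
    "description": "Example configurable MedHELM-style benchmark.",
    "prompt_file": "prompts/my_benchmark_prompt.txt",
    "dataset_file": "data/my_benchmark.jsonl",
    "max_tokens": 512,
    "metrics": [
        {
            "name": "jury_score",
            "prompt_file": "prompts/my_judge_prompt.txt",
            "judges": [
                {"name": "openai/gpt-4o-2024-05-13", "model_name": "openai/gpt-4o-2024-05-13"},
            ],
        },
        {"name": "exact_match"},
    ],
}

with open("my_benchmark.yaml", "w") as f:
    yaml.safe_dump(example_config, f, allow_unicode=True)

# get_benchmark_config_from_path("my_benchmark.yaml") would then return a BenchmarkConfig
# whose main_metric is the first metric listed (here, the jury_score metric).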