crfm-helm 0.5.7__py3-none-any.whl → 0.5.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/METADATA +7 -77
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/RECORD +315 -282
- helm/benchmark/adaptation/adapter_spec.py +10 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
- helm/benchmark/annotation/aci_bench_annotator.py +11 -22
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
- helm/benchmark/annotation/dischargeme_annotator.py +11 -22
- helm/benchmark/annotation/med_dialog_annotator.py +11 -22
- helm/benchmark/annotation/medalign_annotator.py +11 -22
- helm/benchmark/annotation/medi_qa_annotator.py +11 -22
- helm/benchmark/annotation/medication_qa_annotator.py +11 -22
- helm/benchmark/annotation/mental_health_annotator.py +11 -22
- helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
- helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
- helm/benchmark/annotation/model_as_judge.py +23 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
- helm/benchmark/metrics/air_bench_metrics.py +3157 -1
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/basic_metrics.py +267 -2
- helm/benchmark/metrics/bbq_metrics.py +12 -0
- helm/benchmark/metrics/classification_metrics.py +19 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
- helm/benchmark/metrics/dry_run_metrics.py +30 -1
- helm/benchmark/metrics/efficiency_metrics.py +74 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +311 -0
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
- helm/benchmark/metrics/ifeval_metrics.py +13 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
- helm/benchmark/metrics/language_modeling_metrics.py +13 -1
- helm/benchmark/metrics/live_qa_metrics.py +13 -1
- helm/benchmark/metrics/llm_jury_metrics.py +13 -1
- helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
- helm/benchmark/metrics/medec_metrics.py +25 -2
- helm/benchmark/metrics/metric.py +25 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
- helm/benchmark/metrics/omni_math_metrics.py +13 -1
- helm/benchmark/metrics/safety_metrics.py +13 -1
- helm/benchmark/metrics/seahelm_metrics.py +14 -1
- helm/benchmark/metrics/summac/model_summac.py +2 -2
- helm/benchmark/metrics/summarization_metrics.py +129 -1
- helm/benchmark/metrics/toxicity_metrics.py +31 -1
- helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
- helm/benchmark/metrics/wildbench_metrics.py +21 -1
- helm/benchmark/presentation/run_display.py +13 -3
- helm/benchmark/presentation/run_entry.py +2 -2
- helm/benchmark/presentation/schema.py +5 -22
- helm/benchmark/presentation/summarize.py +180 -11
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/run.py +1 -1
- helm/benchmark/run_expander.py +4 -0
- helm/benchmark/run_specs/arabic_run_specs.py +140 -16
- helm/benchmark/run_specs/bluex_run_specs.py +1 -1
- helm/benchmark/run_specs/classic_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +2 -2
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +362 -52
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +6 -2
- helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
- helm/benchmark/scenarios/air_bench_scenario.py +21 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +8 -4
- helm/benchmark/scenarios/aratrust_scenario.py +19 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +24 -54
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +19 -48
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -61
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -29
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -60
- helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
- helm/benchmark/scenarios/banking77_scenario.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +15 -0
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
- helm/benchmark/scenarios/bluex_scenario.py +6 -2
- helm/benchmark/scenarios/bold_scenario.py +15 -0
- helm/benchmark/scenarios/boolq_scenario.py +20 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
- helm/benchmark/scenarios/clear_scenario.py +23 -0
- helm/benchmark/scenarios/cleva_scenario.py +479 -0
- helm/benchmark/scenarios/code_scenario.py +28 -0
- helm/benchmark/scenarios/commonsense_scenario.py +32 -0
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
- helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
- helm/benchmark/scenarios/disinformation_scenario.py +22 -0
- helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
- helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
- helm/benchmark/scenarios/financebench_scenario.py +21 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
- helm/benchmark/scenarios/gpqa_scenario.py +18 -0
- helm/benchmark/scenarios/grammar_scenario.py +20 -1
- helm/benchmark/scenarios/gsm_scenario.py +21 -0
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
- helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
- helm/benchmark/scenarios/headqa_scenario.py +22 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
- helm/benchmark/scenarios/ice_scenario.py +21 -1
- helm/benchmark/scenarios/ifeval_scenario.py +18 -0
- helm/benchmark/scenarios/imdb_scenario.py +15 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +21 -0
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/legal_support_scenario.py +13 -0
- helm/benchmark/scenarios/legalbench_scenario.py +19 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
- helm/benchmark/scenarios/lextreme_scenario.py +11 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +33 -0
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
- helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
- helm/benchmark/scenarios/med_qa_scenario.py +20 -0
- helm/benchmark/scenarios/medalign_scenario.py +23 -0
- helm/benchmark/scenarios/medbullets_scenario.py +22 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
- helm/benchmark/scenarios/medec_scenario.py +23 -0
- helm/benchmark/scenarios/medhallu_scenario.py +23 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
- helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
- helm/benchmark/scenarios/mental_health_scenario.py +23 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +24 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
- helm/benchmark/scenarios/mmlu_scenario.py +21 -0
- helm/benchmark/scenarios/msmarco_scenario.py +30 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
- helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
- helm/benchmark/scenarios/omni_math_scenario.py +18 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
- helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
- helm/benchmark/scenarios/quac_scenario.py +14 -0
- helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
- helm/benchmark/scenarios/raft_scenario.py +15 -0
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
- helm/benchmark/scenarios/scenario.py +31 -0
- helm/benchmark/scenarios/seahelm_scenario.py +348 -0
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/spider_scenario.py +18 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
- helm/benchmark/scenarios/summarization_scenario.py +37 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +1 -1
- helm/benchmark/scenarios/test_bluex_scenario.py +2 -2
- helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/vicuna_scenario.py +21 -1
- helm/benchmark/scenarios/wikifact_scenario.py +20 -0
- helm/benchmark/scenarios/wildbench_scenario.py +18 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
- helm/benchmark/static/schema_arabic.yaml +55 -12
- helm/benchmark/static/schema_long_context.yaml +11 -30
- helm/benchmark/static/schema_medhelm.yaml +36 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
- helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
- helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
- helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
- helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
- helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
- helm/benchmark/static_build/index.html +5 -6
- helm/clients/ai21_client.py +2 -0
- helm/clients/aleph_alpha_client.py +2 -0
- helm/clients/anthropic_client.py +7 -1
- helm/clients/audio_language/diva_llama_client.py +2 -0
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/llama_omni_client.py +2 -1
- helm/clients/audio_language/qwen2_5_omni_client.py +2 -1
- helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
- helm/clients/audio_language/qwen_audiolm_client.py +2 -1
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/bedrock_client.py +2 -0
- helm/clients/cohere_client.py +3 -0
- helm/clients/google_client.py +2 -0
- helm/clients/http_model_client.py +2 -0
- helm/clients/huggingface_client.py +2 -1
- helm/clients/ibm_client.py +3 -1
- helm/clients/image_generation/adobe_vision_client.py +2 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
- helm/clients/image_generation/cogview2_client.py +2 -1
- helm/clients/image_generation/dalle2_client.py +2 -0
- helm/clients/image_generation/dalle_mini_client.py +2 -1
- helm/clients/image_generation/deep_floyd_client.py +2 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
- helm/clients/image_generation/lexica_client.py +2 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
- helm/clients/image_generation/mindalle_client.py +2 -1
- helm/clients/image_generation/together_image_generation_client.py +2 -0
- helm/clients/megatron_client.py +2 -0
- helm/clients/mistral_client.py +2 -0
- helm/clients/moderation_api_client.py +2 -0
- helm/clients/openai_client.py +36 -20
- helm/clients/openai_responses_client.py +27 -3
- helm/clients/openrouter_client.py +31 -0
- helm/clients/palmyra_client.py +2 -1
- helm/clients/reka_client.py +2 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
- helm/clients/stanfordhealthcare_http_model_client.py +2 -0
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/together_client.py +52 -11
- helm/clients/vertexai_client.py +12 -2
- helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
- helm/clients/vision_language/huggingface_vlm_client.py +2 -0
- helm/clients/vision_language/idefics_client.py +2 -1
- helm/clients/vision_language/open_flamingo_client.py +2 -1
- helm/clients/vision_language/paligemma_client.py +2 -1
- helm/clients/vision_language/palmyra_vision_client.py +2 -0
- helm/clients/vision_language/qwen2_vlm_client.py +2 -1
- helm/clients/vision_language/qwen_vlm_client.py +2 -1
- helm/clients/writer_client.py +2 -0
- helm/common/hierarchical_logger.py +20 -0
- helm/common/optional_dependencies.py +1 -1
- helm/common/test_general.py +4 -0
- helm/config/model_deployments.yaml +300 -1
- helm/config/model_metadata.yaml +302 -9
- helm/config/tokenizer_configs.yaml +92 -4
- helm/proxy/example_queries.py +8 -8
- helm/proxy/server.py +2 -1
- helm/proxy/static/index.css +4 -0
- helm/proxy/static/index.js +7 -1
- helm/benchmark/metrics/aci_bench_metrics.py +0 -14
- helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
- helm/benchmark/metrics/dischargeme_metrics.py +0 -14
- helm/benchmark/metrics/med_dialog_metrics.py +0 -14
- helm/benchmark/metrics/medalign_metrics.py +0 -14
- helm/benchmark/metrics/medi_qa_metrics.py +0 -14
- helm/benchmark/metrics/medication_qa_metrics.py +0 -14
- helm/benchmark/metrics/mental_health_metrics.py +0 -14
- helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
- helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
- helm/benchmark/static_build/assets/index-b9779128.css +0 -1
- helm/benchmark/static_build/assets/index-e439d5e1.js +0 -10
- helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
- helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
- helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/top_level.txt +0 -0
- /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
- /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
- /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
- /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
- /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
- /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
helm/benchmark/presentation/summarize.py
CHANGED
@@ -9,6 +9,7 @@ Usage:
 """
 
 import argparse
+import dataclasses
 import os
 import datetime
 import urllib.parse
@@ -31,18 +32,26 @@ from helm.common.general import (
 )
 from helm.common.codec import from_json
 from helm.common.hierarchical_logger import hlog, htrack, htrack_block, hwarn, setup_default_logging
-from helm.benchmark.scenarios.scenario import ScenarioSpec
+from helm.benchmark.scenarios.scenario import Scenario, ScenarioMetadata, ScenarioSpec, create_scenario
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.metrics.metric_name import MetricName
-from helm.benchmark.metrics.metric import
+from helm.benchmark.metrics.metric import (
+    MetricInterface,
+    MetricMetadata,
+    MetricSpec,
+    create_metric,
+    get_all_stats_by_name,
+)
 from helm.benchmark.metrics.statistic import Stat, merge_stat
 from helm.benchmark.run_spec import RunSpec
 from helm.benchmark.runner import LATEST_SYMLINK
 from helm.benchmark.presentation.table import Cell, HeaderCell, Table, Hyperlink, table_to_latex
 from helm.benchmark.presentation.schema import (
+    MetricGroup,
     MetricNameMatcher,
     RunGroup,
     Field,
+    Schema,
     read_schema,
     get_default_schema_path,
     BY_GROUP,
@@ -341,7 +350,7 @@ class Summarizer:
         release: Optional[str],
         suites: Optional[List[str]],
         suite: Optional[str],
-        schema_path: str,
+        schema_path: Optional[str],
         output_path: str,
         verbose: bool,
         num_threads: int,
@@ -376,10 +385,8 @@ class Summarizer:
         self.verbose: bool = verbose
         self.num_threads: int = num_threads
         self.allow_unknown_models: bool = allow_unknown_models
-
-
-
-        self.schema = read_schema(schema_path)
+        self.schema = read_schema(schema_path) if schema_path else Schema()
+        self.metric_metadata: List[MetricMetadata] = []
 
     def read_run(self, run_path: str) -> Run:
         """Load the `Run` object from `run_path`."""
@@ -426,6 +433,8 @@ class Summarizer:
 
     def read_runs_for_suite(self, suite, run_suite_path):
         """Load the runs in the run suite path."""
+        if not os.path.exists(run_suite_path):
+            raise Exception(f"Suite {suite} does not exist at {run_suite_path}")
         # run_suite_path can contain subdirectories that are not runs (e.g. eval_cache, groups)
         # so filter them out.
         run_dir_names = sorted(
@@ -509,6 +518,150 @@ class Summarizer:
             model_field_dicts.append(asdict_without_nones(model_field))
         return model_field_dicts
 
+    def get_metric_metadata(self) -> List[MetricMetadata]:
+        if self.metric_metadata:
+            return self.metric_metadata
+        metric_specs: List[MetricSpec] = []
+        for run in self.runs:
+            metric_specs.extend(run.run_spec.metric_specs)
+        metric_specs = list(set(metric_specs))
+        metric_name_to_metadata: Dict[str, MetricMetadata] = {}
+        for metric_spec in metric_specs:
+            try:
+                metric: MetricInterface = create_metric(metric_spec)
+                metric_metadata_list = metric.get_metadata()
+                for metric_metadata in metric_metadata_list:
+                    metric_name_to_metadata[metric_metadata.name] = metric_metadata
+            except NotImplementedError:
+                pass
+            except (ModuleNotFoundError, AttributeError, TypeError):
+                pass
+
+        run_stat_names: Set[str] = set()
+        for run in self.runs:
+            for stat in run.stats:
+                run_stat_names.add(stat.name.name)
+
+        metric_names_to_prune = set(metric_name_to_metadata.keys()) - run_stat_names
+        for metric_name_to_prune in metric_names_to_prune:
+            del metric_name_to_metadata[metric_name_to_prune]
+        self.metric_metadata = list(metric_name_to_metadata.values())
+        return self.metric_metadata
+
+    def metric_metadata_to_field(self, metric_metadata: MetricMetadata) -> Field:
+        return Field(
+            name=metric_metadata.name,
+            display_name=metric_metadata.display_name,
+            short_display_name=metric_metadata.short_display_name,
+            description=metric_metadata.description,
+            lower_is_better=metric_metadata.lower_is_better,
+        )
+
+    def auto_generate_metric_fields(self) -> List[Field]:
+        return [self.metric_metadata_to_field(metric_metadata) for metric_metadata in self.get_metric_metadata()]
+
+    def auto_generate_metric_groups(self) -> List[MetricGroup]:
+        metric_groups = [
+            MetricGroup(
+                name="main_metric",
+                display_name="Main Metric",
+                description="Main Metric",
+                metrics=[MetricNameMatcher(name="${main_name}", split="${main_split}")],
+            )
+        ]
+        metric_group_to_metrics: Dict[str, List[str]] = {}
+        for metric_metadata in self.metric_metadata:
+            if metric_metadata.group:
+                if metric_metadata.group not in metric_group_to_metrics:
+                    metric_group_to_metrics[metric_metadata.group] = []
+                metric_group_to_metrics[metric_metadata.group].append(metric_metadata.name)
+        for metric_group, metric_names in metric_group_to_metrics.items():
+            display_name = metric_group.replace("_", " ").capitalize()
+            metric_groups.append(
+                MetricGroup(
+                    name=metric_group,
+                    # TODO: Make display_name and description nicer
+                    display_name=display_name,
+                    description=display_name,
+                    aggregation_strategies=[],
+                    metrics=[
+                        MetricNameMatcher(name=metric_name, split="${main_split}") for metric_name in metric_names
+                    ],
+                )
+            )
+        return metric_groups
+
+    def get_scenario_metadata(self) -> List[ScenarioMetadata]:
+        scenario_specs = [run.run_spec.scenario_spec for run in self.runs]
+        scenario_specs = list(set(scenario_specs))
+        scenario_name_to_metadata: Dict[str, ScenarioMetadata] = {}
+        for scenario_spec in scenario_specs:
+            try:
+                scenario: Scenario = create_scenario(scenario_spec)
+                scenario_metadata = scenario.get_metadata()
+                scenario_name_to_metadata[scenario_metadata.name] = scenario_metadata
+            except NotImplementedError:
+                pass
+            except (ModuleNotFoundError, AttributeError, TypeError):
+                pass
+
+        run_groups: Set[str] = set()
+        for run in self.runs:
+            for run_group in run.run_spec.groups:
+                run_groups.add(run_group)
+
+        scenario_names_to_prune = set(scenario_name_to_metadata.keys()) - run_groups
+        for scenario_name_to_prune in scenario_names_to_prune:
+            del scenario_name_to_metadata[scenario_name_to_prune]
+        return list(scenario_name_to_metadata.values())
+
+    def scenario_metadata_to_run_group(self, scenario_metadata: ScenarioMetadata) -> RunGroup:
+        metric_group_names = [metric_group.name for metric_group in self.schema.metric_groups]
+        return RunGroup(
+            name=scenario_metadata.name,
+            display_name=scenario_metadata.display_name,
+            short_display_name=scenario_metadata.short_display_name,
+            description=scenario_metadata.description,
+            metric_groups=metric_group_names,
+            environment={
+                "main_name": scenario_metadata.main_metric,
+                "main_split": scenario_metadata.main_split,
+            },
+            taxonomy=scenario_metadata.taxonomy,
+        )
+
+    def auto_generate_all_scenarios_run_group(self) -> RunGroup:
+        return RunGroup(
+            name="all_scenarios",
+            display_name="All Scenarios",
+            description="All scenarios",
+            category="Scenario Groups",
+            subgroups=[run_group.name for run_group in self.schema.run_groups if len(run_group.subgroups) == 0],
+        )
+
+    def auto_generate_scenario_run_groups(self) -> List[RunGroup]:
+        return [
+            self.scenario_metadata_to_run_group(scenario_metadata) for scenario_metadata in self.get_scenario_metadata()
+        ]
+
+    def fix_up_schema(self) -> None:
+        # if not self.schema.run_groups:
+        if not self.schema.metrics:
+            self.schema = dataclasses.replace(self.schema, metrics=self.auto_generate_metric_fields())
+            # Can only auto-generate metric groups if metrics were also auto-generated
+            # because auto_generate_metric_groups() requires self.metric_metadata()
+            # which is populated by auto_generate_metric_fields()
+            if not self.schema.metric_groups:
+                self.schema = dataclasses.replace(self.schema, metric_groups=self.auto_generate_metric_groups())
+        if not any([len(run_group.subgroups) == 0 for run_group in self.schema.run_groups]):
+            self.schema = dataclasses.replace(
+                self.schema, run_groups=self.schema.run_groups + self.auto_generate_scenario_run_groups()
+            )
+        if not any([len(run_group.subgroups) > 0 for run_group in self.schema.run_groups]):
+            self.schema = dataclasses.replace(
+                self.schema, run_groups=[self.auto_generate_all_scenarios_run_group()] + self.schema.run_groups
+            )
+
     def write_schema(self) -> None:
         """Write the schema file to benchmark_output so the frontend knows about it."""
         # Manually add the model metadata to the schema.json, where the frontend expects it.
@@ -1070,7 +1223,8 @@ class Summarizer:
                 is_scenario_table=False,
                 aggregation_strategies=aggregate_strategies,
             )
-
+            if len(table.header) > 1:
+                tables.append(table)
         return tables
 
     def create_group_tables_by_subgroup(self, group: RunGroup) -> List[Table]:
@@ -1213,14 +1367,16 @@ class Summarizer:
         """Run the entire summarization pipeline."""
         self.read_runs()
         self.group_runs()
-        self.check_metrics_defined()
 
-        self.
+        ensure_directory_exists(self.run_release_path)
 
         # Must happen after self.read_runs()
         # because it uses self.runs
+        self.fix_up_schema()
+        self.check_metrics_defined()
         self.write_schema()
 
+        self.write_run_display_json(skip_completed)
         self.write_executive_summary()
         self.write_runs()
         self.write_run_specs()
@@ -1254,7 +1410,15 @@ def summarize(args):
     else:
         raise ValueError("Exactly one of --release or --suite must be specified.")
 
-    schema_path
+    schema_path: Optional[str]
+    if args.auto_generate_schema:
+        if args.schema_path:
+            raise ValueError("--schema-path must be unset if --auto-generate-schema is set")
+        schema_path = None
+    elif args.schema_path:
+        schema_path = args.schema_path
+    else:
+        schema_path = get_default_schema_path()
 
     register_builtin_configs_from_helm_package()
     register_configs_from_directory(args.local_path)
@@ -1346,6 +1510,11 @@ def main():
         default=None,
         help="PATH to a YAML file to customize logging",
     )
+    parser.add_argument(
+        "--auto-generate-schema",
+        action="store_true",
+        help="EXPERIMENTAL: Auto-generate schema",
+    )
    args = parser.parse_args()
    setup_default_logging(args.log_config)
    summarize(args)
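Note: with the new flag, schema_path is left as None, so the Summarizer above starts from an empty Schema() and fills in metrics, metric groups, and run groups from the loaded runs via fix_up_schema(). A hypothetical invocation (the suite name is illustrative):

helm-summarize --suite my-suite --auto-generate-schema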
helm/benchmark/presentation/taxonomy_info.py
ADDED
@@ -0,0 +1,20 @@
+from dataclasses import dataclass
+from typing import Optional
+
+
+@dataclass(frozen=True)
+class TaxonomyInfo:
+    # Task (e.g., question answering)
+    task: Optional[str] = None
+
+    # Domain - genre (e.g., Wikipedia)
+    what: Optional[str] = None
+
+    # Domain - when it was written (e.g., 2010s)
+    when: Optional[str] = None
+
+    # Domain - demographics (e.g., web users)
+    who: Optional[str] = None
+
+    # Language (e.g., English)
+    language: Optional[str] = None
helm/benchmark/run.py
CHANGED
@@ -37,7 +37,7 @@ def run_entries_to_run_specs(
     run_specs: List[RunSpec] = []
     for entry in run_entries:
         # Filter by priority
-        if priority is not None and entry.priority > priority:
+        if priority is not None and entry.priority is not None and entry.priority > priority:
             continue
 
         for run_spec in construct_run_specs(parse_object_spec(entry.description)):
helm/benchmark/run_expander.py
CHANGED
@@ -1484,6 +1484,8 @@ class OutputFormatInstructions(RunExpander):
                 instructions = "Answer with only a single letter. Do not include a period in your answer."
             elif self.scenario == "mcqa_only_last_question":
                 instructions = "Answer only the last question with only a single letter."
+            elif self.scenario == "arabic_mcqa":
+                instructions = "اكتب حرف الإجابة فقط، دون أي إضافات أخرى."
             else:
                 instructions = "Answer with only a single letter."
         elif run_spec.adapter_spec.method == ADAPT_GENERATION:
@@ -1525,6 +1527,8 @@ class OutputFormatInstructions(RunExpander):
                     "Answer only the last question with a short answer. "
                     "Avoid extra, unnecessary information in the answer."
                 )
+            elif self.scenario == "arabic_mcqa":
+                instructions = "اكتب حرف الإجابة فقط، دون أي إضافات أخرى."
             else:
                 raise ValueError(f"Unknown scenario {self.scenario}")
         elif run_spec.adapter_spec.method == ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT:
helm/benchmark/run_specs/arabic_run_specs.py
CHANGED
@@ -4,25 +4,37 @@ EXPERIMENTAL: Run specs here may have future reverse incompatible changes."""
 
 from helm.benchmark.adaptation.adapter_spec import ADAPT_MULTIPLE_CHOICE_JOINT
 from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec, get_generation_adapter_spec
-from helm.benchmark.
+from helm.benchmark.annotation.annotator import AnnotatorSpec
+from helm.benchmark.metrics.common_metric_specs import get_basic_metric_specs, get_exact_match_metric_specs
+from helm.benchmark.metrics.metric import MetricSpec
 from helm.benchmark.run_spec import RunSpec, run_spec_function
 from helm.benchmark.scenarios.scenario import ScenarioSpec
 
 
+_ARABIC_REFERENCE_PREFIX_CHARACTERS = ["أ", "ب", "ج", "د", "هـ"]
+_ARABIC_OUTPUT_MAPPING_PATTERN = "(أ|ب|ج|د|هـ)"
+
+
 @run_spec_function("arabic_mmlu")
-def get_arabic_mmlu_spec() -> RunSpec:
+def get_arabic_mmlu_spec(subset: str) -> RunSpec:
     """EXPERIMENTAL: This run spec here may have future reverse incompatible changes."""
-
+
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.arabic_mmlu_scenario.ArabicMMLUScenario", args={"subset": subset}
+    )
 
     adapter_spec = get_multiple_choice_adapter_spec(
         method=ADAPT_MULTIPLE_CHOICE_JOINT,
-        instructions="
-        input_noun="
-        output_noun="
+        instructions="السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة",  # noqa: E501
+        input_noun="السؤال",
+        output_noun="الإجابة",
+        max_tokens=100,
+        reference_prefix_characters=_ARABIC_REFERENCE_PREFIX_CHARACTERS,
+        output_mapping_pattern=_ARABIC_OUTPUT_MAPPING_PATTERN,
     )
 
     return RunSpec(
-        name="arabic_mmlu",
+        name=f"arabic_mmlu:subset={subset}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs(),
@@ -39,9 +51,12 @@ def get_alghafa_spec(subset: str) -> RunSpec:
 
     adapter_spec = get_multiple_choice_adapter_spec(
         method=ADAPT_MULTIPLE_CHOICE_JOINT,
-        instructions="
-        input_noun="
-        output_noun="
+        instructions="الأسئلة التالية هي أسئلة متعددة الإختيارات مع الجواب الصحيح",  # noqa: E501
+        input_noun="السؤال",
+        output_noun="الإجابة",
+        max_tokens=100,
+        reference_prefix_characters=_ARABIC_REFERENCE_PREFIX_CHARACTERS,
+        output_mapping_pattern=_ARABIC_OUTPUT_MAPPING_PATTERN,
     )
 
     return RunSpec(
@@ -54,20 +69,129 @@ def get_alghafa_spec(subset: str) -> RunSpec:
 
 
 @run_spec_function("aratrust")
-def get_aratrust_spec() -> RunSpec:
+def get_aratrust_spec(category: str) -> RunSpec:
     """EXPERIMENTAL: This run spec here may have future reverse incompatible changes."""
-    scenario_spec = ScenarioSpec(
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.aratrust_scenario.AraTrustScenario",
+        args={"category": category},
+    )
 
     adapter_spec = get_generation_adapter_spec(
-        instructions="
-        input_noun="
-        output_noun="
+        instructions="السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة: أ، ب أو ج",  # noqa: E501
+        input_noun="السؤال",
+        output_noun="الإجابة",
+        max_tokens=100,
     )
 
     return RunSpec(
-        name="aratrust",
+        name=f"aratrust:category={category}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs(),
         groups=["aratrust"],
     )
+
+
+@run_spec_function("alrage")
+def get_alrage_spec() -> RunSpec:
+    """EXPERIMENTAL: This run spec here may have future reverse incompatible changes."""
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.alrage_scenario.ALRAGEScenario")
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions="بناءً على السياقات المقترحة التالية، اجب عن السؤال التالي",  # noqa: E501
+        input_noun="السؤال",
+        output_noun="الإجابة",
+        max_tokens=100,
+    )
+
+    annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.alrage_annotator.ALRAGEAnnotator")]
+
+    metric_specs = [
+        MetricSpec(class_name="helm.benchmark.metrics.alrage_metric.ALRAGEMetric")
+    ] + get_basic_metric_specs([])
+
+    return RunSpec(
+        name="alrage",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        annotators=annotator_specs,
+        metric_specs=metric_specs,
+        groups=["alrage"],
+    )
+
+
+@run_spec_function("madinah_qa")
+def get_madinah_qa_spec(subset: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.madinah_qa_scenario.MadinahQAScenario", args={"subset": subset}
+    )
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions="السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة",  # noqa: E501
+        input_noun="السؤال",
+        output_noun="الإجابة",
+        max_tokens=100,
+        reference_prefix_characters=_ARABIC_REFERENCE_PREFIX_CHARACTERS,
+        output_mapping_pattern=_ARABIC_OUTPUT_MAPPING_PATTERN,
+    )
+
+    return RunSpec(
+        name=f"madinah_qa:subset={subset}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["madinah_qa"],
+    )
+
+
+@run_spec_function("mbzuai_human_translated_arabic_mmlu")
+def get_arabic_mmmlu_spec(subject: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.mbzuai_human_translated_arabic_mmlu.MBZUAIHumanTranslatedArabicMMLUScenario",
+        args={"subject": subject},
+    )
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions="السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة",  # noqa: E501
+        input_noun="السؤال",
+        output_noun="الإجابة",
+        max_tokens=100,
+        reference_prefix_characters=_ARABIC_REFERENCE_PREFIX_CHARACTERS,
+        output_mapping_pattern=_ARABIC_OUTPUT_MAPPING_PATTERN,
+    )
+
+    return RunSpec(
+        name=f"mbzuai_human_translated_arabic_mmlu:subject={subject}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["mbzuai_human_translated_arabic_mmlu"],
+    )
+
+
+@run_spec_function("arabic_exams")
+def get_arabic_exams_spec(subject: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.arabic_exams_scenario.ArabicEXAMSScenario",
+        args={"subject": subject},
+    )
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions="السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة",  # noqa: E501
+        input_noun="السؤال",
+        output_noun="الإجابة",
+        max_tokens=100,
+        reference_prefix_characters=_ARABIC_REFERENCE_PREFIX_CHARACTERS,
+        output_mapping_pattern=_ARABIC_OUTPUT_MAPPING_PATTERN,
+    )
+
+    return RunSpec(
+        name=f"arabic_exams:subject={subject}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["arabic_exams"],
+    )
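For context, a hypothetical invocation of one of the new Arabic run specs via helm-run (the subset value is a placeholder, not taken from the diff):

helm-run --run-entries "arabic_mmlu:subset=<subset>" --suite my-suite --max-eval-instances 10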
helm/benchmark/run_specs/bluex_run_specs.py
CHANGED
@@ -7,7 +7,7 @@ from helm.benchmark.scenarios.scenario import ScenarioSpec
 
 @run_spec_function("bluex")
 def get_bluex_spec() -> RunSpec:
-    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bluex_scenario.
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bluex_scenario.BLUEXScenario", args={})
 
     adapter_spec = get_multiple_choice_adapter_spec(
         method=ADAPT_MULTIPLE_CHOICE_JOINT,
helm/benchmark/run_specs/classic_run_specs.py
CHANGED
@@ -753,12 +753,12 @@ def get_xsum_sampled_summarization_spec(temperature: float = 0.3, device: str =
     )
 
     return RunSpec(
-        name=f"
+        name=f"summarization_xsum_sampled:temperature={temperature},device={device}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_summarization_metric_specs({"task": "summarization_xsum_sampled", "device": device})
         + get_generative_harms_metric_specs(),
-        groups=["
+        groups=["summarization_xsum_sampled"],
     )
 
 
helm/benchmark/run_specs/long_context_run_specs.py
CHANGED
@@ -39,12 +39,12 @@ def _get_long_context_multiple_choice_adapter_spec(max_tokens: int) -> AdapterSpec:
         method=ADAPT_MULTIPLE_CHOICE_JOINT,
         global_prefix="",
         global_suffix="",
-        instructions="Read the passage and answer the following question. Respond with only a single letter corresponding to your choice.",  # noqa: E501
+        instructions="Read the passage and answer the following question. Respond with only a single letter corresponding to your choice. Do not include a period in your answer.\n\n",  # noqa: E501
         input_prefix="",
         input_suffix="\n",
         reference_prefix="A. ",
         reference_suffix="\n",
-        output_prefix="",
+        output_prefix="\nAnswer the question above based on the passage. Respond with only a single letter corresponding to your choice. Do not include a period in your answer.\n",  # noqa: E501
         output_suffix="",
         instance_prefix="",
         max_train_instances=0,