crfm-helm 0.5.6__py3-none-any.whl → 0.5.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic.
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/METADATA +72 -130
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/RECORD +372 -305
- helm/benchmark/adaptation/adapter_spec.py +10 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
- helm/benchmark/annotation/aci_bench_annotator.py +11 -22
- helm/benchmark/annotation/air_bench_annotator.py +1 -1
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
- helm/benchmark/annotation/dischargeme_annotator.py +11 -22
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/med_dialog_annotator.py +11 -22
- helm/benchmark/annotation/medalign_annotator.py +11 -22
- helm/benchmark/annotation/medi_qa_annotator.py +11 -22
- helm/benchmark/annotation/medication_qa_annotator.py +11 -22
- helm/benchmark/annotation/mental_health_annotator.py +11 -22
- helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
- helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
- helm/benchmark/annotation/model_as_judge.py +23 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
- helm/benchmark/metrics/air_bench_metrics.py +3157 -1
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/basic_metrics.py +267 -2
- helm/benchmark/metrics/bbq_metrics.py +12 -0
- helm/benchmark/metrics/classification_metrics.py +19 -1
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +1 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
- helm/benchmark/metrics/copyright_metrics.py +1 -1
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
- helm/benchmark/metrics/dry_run_metrics.py +30 -1
- helm/benchmark/metrics/efficiency_metrics.py +74 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +312 -1
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
- helm/benchmark/metrics/ifeval_metrics.py +13 -1
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
- helm/benchmark/metrics/language_modeling_metrics.py +13 -1
- helm/benchmark/metrics/live_qa_metrics.py +13 -1
- helm/benchmark/metrics/llm_jury_metrics.py +13 -1
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
- helm/benchmark/metrics/medec_metrics.py +25 -2
- helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
- helm/benchmark/metrics/metric.py +25 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
- helm/benchmark/metrics/omni_math_metrics.py +13 -1
- helm/benchmark/metrics/safety_metrics.py +13 -1
- helm/benchmark/metrics/seahelm_metrics.py +14 -1
- helm/benchmark/metrics/summac/model_summac.py +3 -3
- helm/benchmark/metrics/summarization_metrics.py +129 -1
- helm/benchmark/metrics/toxicity_metrics.py +31 -1
- helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
- helm/benchmark/metrics/wildbench_metrics.py +21 -1
- helm/benchmark/model_deployment_registry.py +11 -19
- helm/benchmark/presentation/create_plots.py +11 -2
- helm/benchmark/presentation/run_display.py +13 -3
- helm/benchmark/presentation/run_entry.py +2 -2
- helm/benchmark/presentation/schema.py +10 -22
- helm/benchmark/presentation/summarize.py +189 -14
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/run.py +15 -4
- helm/benchmark/run_expander.py +4 -0
- helm/benchmark/run_specs/arabic_run_specs.py +197 -0
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +2 -55
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +3 -1
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +48 -1
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +363 -53
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +11 -13
- helm/benchmark/runner.py +7 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
- helm/benchmark/scenarios/air_bench_scenario.py +21 -0
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
- helm/benchmark/scenarios/aratrust_scenario.py +95 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -53
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -21
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -52
- helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
- helm/benchmark/scenarios/banking77_scenario.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +15 -0
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
- helm/benchmark/scenarios/bluex_scenario.py +70 -0
- helm/benchmark/scenarios/bold_scenario.py +15 -0
- helm/benchmark/scenarios/boolq_scenario.py +20 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
- helm/benchmark/scenarios/clear_scenario.py +23 -0
- helm/benchmark/scenarios/cleva_scenario.py +480 -1
- helm/benchmark/scenarios/code_scenario.py +28 -0
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/commonsense_scenario.py +32 -0
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
- helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
- helm/benchmark/scenarios/disinformation_scenario.py +22 -0
- helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
- helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
- helm/benchmark/scenarios/financebench_scenario.py +21 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
- helm/benchmark/scenarios/gpqa_scenario.py +18 -0
- helm/benchmark/scenarios/grammar_scenario.py +20 -1
- helm/benchmark/scenarios/gsm_scenario.py +21 -0
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
- helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
- helm/benchmark/scenarios/headqa_scenario.py +22 -0
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
- helm/benchmark/scenarios/ice_scenario.py +21 -1
- helm/benchmark/scenarios/ifeval_scenario.py +18 -0
- helm/benchmark/scenarios/imdb_scenario.py +15 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/legal_support_scenario.py +13 -0
- helm/benchmark/scenarios/legalbench_scenario.py +19 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
- helm/benchmark/scenarios/lextreme_scenario.py +11 -0
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +54 -20
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
- helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
- helm/benchmark/scenarios/med_qa_scenario.py +20 -0
- helm/benchmark/scenarios/medalign_scenario.py +23 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
- helm/benchmark/scenarios/medbullets_scenario.py +22 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
- helm/benchmark/scenarios/medec_scenario.py +23 -0
- helm/benchmark/scenarios/medhallu_scenario.py +23 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
- helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
- helm/benchmark/scenarios/melt_scenarios.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +23 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
- helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
- helm/benchmark/scenarios/mmlu_scenario.py +21 -0
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/msmarco_scenario.py +30 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
- helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
- helm/benchmark/scenarios/omni_math_scenario.py +18 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
- helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
- helm/benchmark/scenarios/quac_scenario.py +14 -0
- helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
- helm/benchmark/scenarios/raft_scenario.py +15 -0
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
- helm/benchmark/scenarios/scenario.py +31 -0
- helm/benchmark/scenarios/seahelm_scenario.py +350 -2
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +23 -1
- helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/spider_scenario.py +18 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
- helm/benchmark/scenarios/summarization_scenario.py +37 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/vicuna_scenario.py +21 -1
- helm/benchmark/scenarios/wikifact_scenario.py +20 -0
- helm/benchmark/scenarios/wildbench_scenario.py +18 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +8 -1
- helm/benchmark/static/schema_arabic.yaml +271 -0
- helm/benchmark/static/schema_classic.yaml +0 -17
- helm/benchmark/static/schema_long_context.yaml +17 -18
- helm/benchmark/static/schema_medhelm.yaml +36 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
- helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
- helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
- helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
- helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
- helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
- helm/benchmark/static_build/index.html +5 -6
- helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
- helm/clients/ai21_client.py +2 -0
- helm/clients/aleph_alpha_client.py +2 -0
- helm/clients/anthropic_client.py +7 -1
- helm/clients/audio_language/diva_llama_client.py +2 -0
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/llama_omni_client.py +2 -1
- helm/clients/audio_language/qwen2_5_omni_client.py +21 -8
- helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
- helm/clients/audio_language/qwen_audiolm_client.py +2 -1
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/bedrock_client.py +63 -6
- helm/clients/cohere_client.py +3 -0
- helm/clients/dspy_client.py +135 -0
- helm/clients/google_client.py +2 -0
- helm/clients/http_model_client.py +2 -0
- helm/clients/huggingface_client.py +4 -3
- helm/clients/ibm_client.py +3 -1
- helm/clients/image_generation/adobe_vision_client.py +2 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
- helm/clients/image_generation/cogview2_client.py +2 -1
- helm/clients/image_generation/dalle2_client.py +2 -0
- helm/clients/image_generation/dalle_mini_client.py +2 -1
- helm/clients/image_generation/deep_floyd_client.py +2 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
- helm/clients/image_generation/lexica_client.py +2 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
- helm/clients/image_generation/mindalle_client.py +2 -1
- helm/clients/image_generation/together_image_generation_client.py +2 -0
- helm/clients/megatron_client.py +2 -0
- helm/clients/mistral_client.py +2 -0
- helm/clients/moderation_api_client.py +2 -0
- helm/clients/openai_client.py +38 -21
- helm/clients/openai_responses_client.py +34 -8
- helm/clients/openrouter_client.py +31 -0
- helm/clients/palmyra_client.py +2 -1
- helm/clients/reka_client.py +2 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
- helm/clients/stanfordhealthcare_http_model_client.py +2 -0
- helm/clients/test_huggingface_client.py +3 -3
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/together_client.py +52 -13
- helm/clients/vertexai_client.py +23 -11
- helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
- helm/clients/vision_language/huggingface_vlm_client.py +2 -0
- helm/clients/vision_language/idefics_client.py +2 -1
- helm/clients/vision_language/open_flamingo_client.py +2 -1
- helm/clients/vision_language/paligemma_client.py +2 -1
- helm/clients/vision_language/palmyra_vision_client.py +2 -0
- helm/clients/vision_language/qwen2_vlm_client.py +2 -1
- helm/clients/vision_language/qwen_vlm_client.py +2 -1
- helm/clients/vllm_client.py +43 -7
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/clients/writer_client.py +5 -2
- helm/common/critique_request.py +0 -1
- helm/common/hierarchical_logger.py +103 -34
- helm/common/object_spec.py +23 -8
- helm/common/optional_dependencies.py +1 -1
- helm/common/test_general.py +4 -0
- helm/common/test_logging.py +94 -0
- helm/config/model_deployments.yaml +1001 -187
- helm/config/model_metadata.yaml +602 -18
- helm/config/tokenizer_configs.yaml +202 -5
- helm/proxy/cli.py +1 -1
- helm/proxy/example_queries.py +8 -8
- helm/proxy/retry.py +5 -0
- helm/proxy/server.py +2 -1
- helm/proxy/static/index.css +4 -0
- helm/proxy/static/index.js +7 -1
- helm/tokenizers/auto_tokenizer.py +2 -2
- helm/tokenizers/grok_tokenizer.py +2 -0
- helm/benchmark/metrics/aci_bench_metrics.py +0 -14
- helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
- helm/benchmark/metrics/dischargeme_metrics.py +0 -14
- helm/benchmark/metrics/med_dialog_metrics.py +0 -14
- helm/benchmark/metrics/medalign_metrics.py +0 -14
- helm/benchmark/metrics/medi_qa_metrics.py +0 -14
- helm/benchmark/metrics/medication_qa_metrics.py +0 -14
- helm/benchmark/metrics/mental_health_metrics.py +0 -14
- helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
- helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +0 -103
- helm/benchmark/scenarios/numeracy_scenario.py +0 -794
- helm/benchmark/static_build/assets/index-94295e78.js +0 -10
- helm/benchmark/static_build/assets/index-b9779128.css +0 -1
- helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
- helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
- helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
- /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
- /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
- /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
- /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
- /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
- /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
helm/benchmark/run_specs/multilingual_run_specs.py
ADDED
@@ -0,0 +1,50 @@
+from helm.benchmark.adaptation.adapter_spec import ADAPT_MULTIPLE_CHOICE_JOINT
+from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec
+from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+@run_spec_function("mmmlu")
+def get_mmmlu_spec(locale: str, subject: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.mmmlu_scenario.MMMLUScenario", args={"locale": locale, "subject": subject}
+    )
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions=f"The following are multiple choice questions (with answers) about {subject.replace('_', ' ')}. Answer the last question. Respond only with only a single letter corresponding to your choice.",  # noqa: E501
+        input_noun="Question",
+        output_noun="Answer",
+    )
+
+    return RunSpec(
+        name=f"mmmlu:locale={locale},subject={subject}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["mmmlu", f"mmmlu_{locale}_{subject}"],
+    )
+
+
+@run_spec_function("exams_multilingual")
+def get_exams_multilingual_spec(language: str, subject: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.exams_multilingual_scenario.EXAMSMultilingualScenario",
+        args={"language": language, "subject": subject},
+    )
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions=f"The following are multiple choice questions (with answers) about {subject.replace('_', ' ')}. Answer the last question. Respond only with only a single letter corresponding to your choice.",  # noqa: E501
+        input_noun="Question",
+        output_noun="Answer",
+    )
+
+    return RunSpec(
+        name=f"exams_multilingual:locale={language},subject={subject}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["exams_multilingual", f"exams_multilingual_{language}_{subject}"],
+    )
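For orientation (not part of the diff): functions registered with @run_spec_function are looked up by name when runs are composed, and they can also be called directly. The sketch below assumes the new file above is helm/benchmark/run_specs/multilingual_run_specs.py and uses placeholder locale/subject values that may not correspond to real dataset subsets.

# Sketch only: module path and argument values are assumptions for illustration.
from helm.benchmark.run_specs.multilingual_run_specs import get_mmmlu_spec

run_spec = get_mmmlu_spec(locale="AR_XY", subject="philosophy")
print(run_spec.name)    # mmmlu:locale=AR_XY,subject=philosophy
print(run_spec.groups)  # ['mmmlu', 'mmmlu_AR_XY_philosophy']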
helm/benchmark/run_specs/speech_disorder_audio_run_specs.py
CHANGED
@@ -73,9 +73,7 @@ def get_ultra_suite_classification_run_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.audio_language.ultra_suite_classification_scenario.UltraSuiteClassificationScenario",  # noqa: E501
     )
-    adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(
-        input_noun=None, output_noun="Answer", max_train_instances=0
-    )
+    adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(input_noun=None, output_noun="Answer")
     metric_specs: List[MetricSpec] = audio_classification_metric_specs()
     run_spec_name: str = "ultra_suite_classification"
     return RunSpec(
@@ -92,9 +90,7 @@ def get_ultra_suite_disorder_breakdown_run_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.audio_language.ultra_suite_disorder_breakdown_scenario.UltraSuiteDisorderBreakdownScenario",  # noqa: E501
     )
-    adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(
-        input_noun=None, output_noun="Answer", max_train_instances=0
-    )
+    adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(input_noun=None, output_noun="Answer")
     metric_specs: List[MetricSpec] = audio_classification_metric_specs()
     run_spec_name: str = "ultra_suite_classification_breakdown"
     return RunSpec(
@@ -112,13 +108,17 @@ def get_ultra_suite_disorder_breakdown_run_spec() -> RunSpec:
 @run_spec_function("ultra_suite_asr_classification")
 def get_ultra_suite_asr_classification_run_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.audio_language.
+        class_name="helm.benchmark.scenarios.audio_language.ultra_suite_asr_classification_scenario.UltraSuiteASRClassificationScenario",  # noqa: E501
     )
     adapter_spec = _get_generation_adapter_spec(
         instructions="""You are a highly experienced Speech-Language Pathologist (SLP). An audio recording is provided to you, typically consisting of a speech prompt from a pathologist followed by a child's repetition. Based on your expertise transcribe the child's speech into text. Do not make any assumptions about the words the child is expected to say. Only transcribe based on the words that the child actually says. Only respond with the text transcription, no other text or commentary.""",  # noqa: E501
-        max_tokens=
+        max_tokens=50,
     )
-    metric_specs: List[MetricSpec] =
+    metric_specs: List[MetricSpec] = [
+        MetricSpec(
+            class_name="helm.benchmark.metrics.ultra_suite_asr_classification_metrics.UltraSuiteASRMetric", args={}
+        )
+    ]
     run_spec_name: str = "ultra_suite_asr_classification"
     return RunSpec(
         name=run_spec_name,
@@ -133,7 +133,7 @@ def get_ultra_suite_asr_classification_run_spec() -> RunSpec:
 @run_spec_function("ultra_suite_asr_transcription")
 def get_ultra_suite_asr_transcription_run_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.audio_language.
+        class_name="helm.benchmark.scenarios.audio_language.ultra_suite_asr_transcription_scenario.UltraSuiteASRTranscriptionScenario",  # noqa: E501
     )
     adapter_spec = _get_generation_adapter_spec(
         instructions="""You are a highly experienced Speech-Language Pathologist (SLP). An audio recording will be provided, typically consisting of a speech prompt from a pathologist followed by a child's repetition. Based on your expertise transcribe the child's speech into text. Try to understand what the child is expected to say. And only respond with the transcription of the child's speech. Not the pathologist's prompt or any other commentary. Only respond with the text transcription, no other text, commentary or punctuations.""",  # noqa: E501
@@ -155,9 +155,7 @@ def get_ultra_suite_disorder_symptoms_run_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.audio_language.ultra_suite_disorder_symptoms_scenario.UltraSuiteDisorderSymptomsScenario",  # noqa: E501
     )
-    adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(
-        input_noun=None, output_noun="Answer", max_train_instances=0
-    )
+    adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(input_noun=None, output_noun="Answer")
     metric_specs: List[MetricSpec] = audio_classification_metric_specs()
     run_spec_name: str = "ultra_suite_disorder_symptoms"
     return RunSpec(
helm/benchmark/runner.py
CHANGED
@@ -39,10 +39,15 @@ from helm.benchmark.window_services.tokenizer_service import TokenizerService
 
 
 LATEST_SYMLINK: str = "latest"
+_CURRENT_RUN_SPEC_NAME: typing.Optional[str] = None
 _BENCHMARK_OUTPUT_PATH: str = "benchmark_output"
 _CACHED_MODELS_FOLDER: str = "models"
 
 
+def _get_current_run_spec_name() -> typing.Optional[str]:
+    return _CURRENT_RUN_SPEC_NAME
+
+
 def get_benchmark_output_path() -> str:
     """Get the benchmark output path.
 
@@ -225,6 +230,8 @@ class Runner:
             raise RunnerError(f"Failed runs: [{failed_runs_str}]")
 
     def run_one(self, run_spec: RunSpec):
+        global _CURRENT_RUN_SPEC_NAME
+        _CURRENT_RUN_SPEC_NAME = run_spec.name
         run_path: str = self._get_run_path(run_spec)
         if self.skip_completed_runs and self._is_run_completed(run_path):
             hlog(f"Skipping run {run_spec.name} because run is completed and all output files exist.")
helm/benchmark/scenarios/aci_bench_scenario.py
CHANGED
@@ -1,6 +1,7 @@
 import json
 import os
 from typing import List
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_file_downloaded
 
@@ -124,3 +126,24 @@ class ACIBenchScenario(Scenario):
         instances.extend(self.process_json(test_json, TEST_SPLIT))
 
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="aci_bench",
+            display_name="ACI-Bench",
+            description="ACI-Bench is a benchmark of real-world patient-doctor conversations paired "
+            "with structured clinical notes. The benchmark evaluates a model's ability to "
+            "understand spoken medical dialogue and convert it into formal clinical "
+            "documentation, covering sections such as history of present illness, physical "
+            "exam findings, results, and assessment and plan [(Yim et al., "
+            "2024)](https://www.nature.com/articles/s41597-023-02487-3).",
+            taxonomy=TaxonomyInfo(
+                task="Text generation",
+                what="Extract and structure information from patient-doctor " "conversations",
+                when="Any",
+                who="Clinician",
+                language="English",
+            ),
+            main_metric="aci_bench_accuracy",
+            main_split="test",
+        )
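A minimal sketch (not part of the diff) of the ScenarioMetadata/TaxonomyInfo objects that the new get_metadata methods return. The keyword arguments mirror the diff above; the assumption is that both types are plain attribute containers (dataclass-style) whose fields can be read back directly, and the description string is shortened here for illustration.

# Sketch only: assumes ScenarioMetadata and TaxonomyInfo are dataclass-style containers.
from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
from helm.benchmark.scenarios.scenario import ScenarioMetadata

metadata = ScenarioMetadata(
    name="aci_bench",
    display_name="ACI-Bench",
    description="ACI-Bench pairs patient-doctor conversations with structured clinical notes.",  # shortened
    taxonomy=TaxonomyInfo(task="Text generation", what="...", when="Any", who="Clinician", language="English"),
    main_metric="aci_bench_accuracy",
    main_split="test",
)
print(metadata.main_metric)        # aci_bench_accuracy
print(metadata.taxonomy.language)  # English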
helm/benchmark/scenarios/air_bench_scenario.py
CHANGED
@@ -2,6 +2,7 @@ import datasets
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -9,6 +10,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_directory_exists
 
@@ -53,3 +55,22 @@ class AIRBench2024Scenario(Scenario):
             instance = Instance(input=input, references=references, split=TEST_SPLIT)
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="air_bench_2024",
+            display_name="AIRBench 2024",
+            description="AIRBench 2024 is a AI safety benchmark that aligns with emerging government "
+            "regulations and company policies. It consists of diverse, malicious prompts "
+            "spanning categories of the regulation-based safety categories in the AIR 2024 "
+            "safety taxonomy.\n",
+            taxonomy=TaxonomyInfo(
+                task="open-ended instruction-following text generation",
+                what="malicious prompts",
+                when="2024",
+                who="dataset authors and language models",
+                language="English",
+            ),
+            main_metric="air_score",
+            main_split="test",
+        )
helm/benchmark/scenarios/alghafa_scenario.py
ADDED
@@ -0,0 +1,126 @@
+import os
+from typing import Dict, List
+
+import datasets
+
+from helm.common.general import ensure_directory_exists
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    TRAIN_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+
+class AlGhafaScenario(Scenario):
+    """AlGhafa Evaluation Benchmark for Arabic Language Models
+
+    EXPERIMENTAL: This scenario may have future reverse incompatible changes.
+
+    Multiple-choice evaluation benchmark for zero- and few-shot evaluation of Arabic LLMs,
+    consisting of
+
+    - https://huggingface.co/datasets/OALL/AlGhafa-Arabic-LLM-Benchmark-Native/
+    - https://aclanthology.org/2023.arabicnlp-1.21/
+
+    Citation:
+
+    ```
+    @inproceedings{almazrouei-etal-2023-alghafa,
+        title = "{A}l{G}hafa Evaluation Benchmark for {A}rabic Language Models",
+        author = "Almazrouei, Ebtesam and
+          Cojocaru, Ruxandra and
+          Baldo, Michele and
+          Malartic, Quentin and
+          Alobeidli, Hamza and
+          Mazzotta, Daniele and
+          Penedo, Guilherme and
+          Campesan, Giulia and
+          Farooq, Mugariya and
+          Alhammadi, Maitha and
+          Launay, Julien and
+          Noune, Badreddine",
+        editor = "Sawaf, Hassan and
+          El-Beltagy, Samhaa and
+          Zaghouani, Wajdi and
+          Magdy, Walid and
+          Abdelali, Ahmed and
+          Tomeh, Nadi and
+          Abu Farha, Ibrahim and
+          Habash, Nizar and
+          Khalifa, Salam and
+          Keleg, Amr and
+          Haddad, Hatem and
+          Zitouni, Imed and
+          Mrini, Khalil and
+          Almatham, Rawan",
+        booktitle = "Proceedings of ArabicNLP 2023",
+        month = dec,
+        year = "2023",
+        address = "Singapore (Hybrid)",
+        publisher = "Association for Computational Linguistics",
+        url = "https://aclanthology.org/2023.arabicnlp-1.21/",
+        doi = "10.18653/v1/2023.arabicnlp-1.21",
+        pages = "244--275",
+        abstract = "Recent advances in the space of Arabic large language models have opened up a wealth of potential practical applications. From optimal training strategies, large scale data acquisition and continuously increasing NLP resources, the Arabic LLM landscape has improved in a very short span of time, despite being plagued by training data scarcity and limited evaluation resources compared to English. In line with contributing towards this ever-growing field, we introduce AlGhafa, a new multiple-choice evaluation benchmark for Arabic LLMs. For showcasing purposes, we train a new suite of models, including a 14 billion parameter model, the largest monolingual Arabic decoder-only model to date. We use a collection of publicly available datasets, as well as a newly introduced HandMade dataset consisting of 8 billion tokens. Finally, we explore the quantitative and qualitative toxicity of several Arabic models, comparing our models to existing public Arabic LLMs."
+    }
+    ```
+    """  # noqa: E501
+
+    name = "alghafa"
+    description = "AlGhafa"
+    tags = ["multiple choice"]
+
+    HF_SPLIT_TO_HELM_SPLIT = {"validation": TRAIN_SPLIT, "test": TEST_SPLIT}
+    REFERENCE_PREFIX = "sol"
+
+    def __init__(self, subset: str):
+        super().__init__()
+        self.subset = subset
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+        dataset_splits: Dict[str, datasets.Dataset] = datasets.load_dataset(
+            "OALL/AlGhafa-Arabic-LLM-Benchmark-Native",
+            self.subset,
+            revision="a31ebd34ca311d7e0cfc6ad7f458b3435af280f5",
+            cache_dir=cache_dir,
+        )
+
+        # Read all instances
+        instances: List[Instance] = []
+        for split_name, dataset in dataset_splits.items():
+            assert isinstance(dataset, datasets.Dataset)
+            option_indexes = [
+                int(s.removeprefix(self.REFERENCE_PREFIX))
+                for s in dataset[0].keys()
+                if s.startswith(self.REFERENCE_PREFIX)
+            ]
+            for row_index, row in enumerate(dataset):
+                input = Input(text=row["query"])
+                references: List[Reference] = []
+                # Need to add 1 because label is zero-indexed and has a value from 0 to (N - 1),
+                # but column names are 1 indexed and have values from "sol1" to "solN"
+                correct_option_index = int(row["label"]) + 1
+                for option_index in option_indexes:
+                    column_name = f"{self.REFERENCE_PREFIX}{option_index}"
+                    references.append(
+                        Reference(
+                            output=Output(text=row[column_name]),
+                            tags=[CORRECT_TAG] if option_index == correct_option_index else [],
+                        )
+                    )
+                instance = Instance(
+                    id=f"id{row_index}_{split_name}",
+                    input=input,
+                    references=references,
+                    split=self.HF_SPLIT_TO_HELM_SPLIT[split_name],
+                )
+                instances.append(instance)
+
+        return instances
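Worked example (not part of the diff) of the label-to-column mapping used by AlGhafaScenario above: the dataset's zero-indexed "label" is shifted by one to select the matching one-indexed "solN" reference column. The row below is made up for illustration.

# Sketch only: a fabricated row showing why the +1 shift is needed.
row = {"query": "...", "sol1": "first option", "sol2": "second option", "sol3": "third option", "label": 1}
REFERENCE_PREFIX = "sol"
option_indexes = [int(k.removeprefix(REFERENCE_PREFIX)) for k in row if k.startswith(REFERENCE_PREFIX)]
correct_option_index = int(row["label"]) + 1  # label 1 (zero-indexed) -> column "sol2"
assert option_indexes == [1, 2, 3]
assert row[f"{REFERENCE_PREFIX}{correct_option_index}"] == "second option"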
helm/benchmark/scenarios/alrage_scenario.py
ADDED
@@ -0,0 +1,54 @@
+import os
+from typing import List
+
+import datasets
+
+from helm.common.general import ensure_directory_exists
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+
+class ALRAGEScenario(Scenario):
+    """ALRAGE"""  # noqa: E501
+
+    name = "alrage"
+    description = "ALRAGE"
+    tags = ["open-book question answering"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+        dataset: datasets.Dataset = datasets.load_dataset(
+            "OALL/ALRAGE",
+            revision="4827b2ed2436aea578e84d9bd4150b66ab8bbe0e",
+            split="train",
+            cache_dir=cache_dir,
+        )
+
+        # Read all instances
+        instances: List[Instance] = []
+        for row in dataset:
+            input = Input(text=f"السؤال:\n{row['question']}\n\nالسياقات المقترحة:\n{row['candidates']}\n")
+            references: List[Reference] = []
+            references = [
+                Reference(
+                    output=Output(text=row["gold_answer"]),
+                    tags=[CORRECT_TAG],
+                )
+            ]
+            instance = Instance(
+                id=row["id"],
+                input=input,
+                references=references,
+                split=TEST_SPLIT,
+            )
+            instances.append(instance)
+
+        return instances
helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py
CHANGED
@@ -2,7 +2,8 @@ import re
 from typing import List, Any, Dict
 from datasets import load_dataset
 
-from helm.benchmark.
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TRAIN_SPLIT, TEST_SPLIT, ScenarioMetadata
 
 
 class AnthropicHHRLHFScenario(Scenario):
@@ -88,3 +89,24 @@ class AnthropicHHRLHFScenario(Scenario):
             )
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="anthropic_hh_rlhf",
+            display_name="Anthropic RLHF dataset",
+            short_display_name="Anthropic RLHF dataset",
+            description="The dialogue datasets released by Anthropic to facilitate research in model "
+            "helpfulness and harmlessness ([Bai et al., "
+            "2022](https://arxiv.org/pdf/2204.05862.pdf); [Ganguli et al., "
+            "2022](https://arxiv.org/pdf/2209.07858.pdf)). We only use the first utterance "
+            "of each dialogue.",
+            taxonomy=TaxonomyInfo(
+                task="open-ended instruction following",
+                what="Human-LM dialogues and preference labels",
+                when="2022",
+                who="Workers from MTurk and Upwork, language models from Anthropic",
+                language="English",
+            ),
+            main_metric="Helpfulness",
+            main_split="test",
+        )
helm/benchmark/scenarios/anthropic_red_team_scenario.py
CHANGED
@@ -2,7 +2,8 @@ import re
 from typing import List, Any, Dict
 from datasets import load_dataset
 
-from helm.benchmark.
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TRAIN_SPLIT, TEST_SPLIT, ScenarioMetadata
 
 
 class AnthropicRedTeamScenario(Scenario):
@@ -69,3 +70,13 @@ class AnthropicRedTeamScenario(Scenario):
             )
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="anthropic_red_team",
+            display_name="Anthropic Red Team",
+            description="Anthropic Red Team",
+            taxonomy=TaxonomyInfo(task="instruction following sfaety", what="?", when="?", who="?", language="English"),
+            main_metric="safety_score",
+            main_split="test",
+        )
helm/benchmark/scenarios/arabic_exams_scenario.py
ADDED
@@ -0,0 +1,114 @@
+import os
+from typing import List
+
+import datasets
+
+from helm.common.general import ensure_directory_exists
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    TRAIN_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+from helm.common.hierarchical_logger import hwarn
+
+
+class ArabicEXAMSScenario(Scenario):
+    """The Arabic subset of the EXAMS High School Examinations Dataset for Multilingual Question Answering
+
+    We use the Open Arabic LLM Leaderboard (OALL) version mirror of the Arabic subset of EXAMS, which is in-turn based
+    on the AceGPT version.
+
+    See: https://www.tii.ae/news/introducing-open-arabic-llm-leaderboard-empowering-arabic-language-modeling-community
+
+    References:
+
+    ```
+    @misc{huang2024acegptlocalizinglargelanguage,
+          title={AceGPT, Localizing Large Language Models in Arabic},
+          author={Huang Huang and Fei Yu and Jianqing Zhu and Xuening Sun and Hao Cheng and Dingjie Song and Zhihong Chen and Abdulmohsen Alharthi and Bang An and Juncai He and Ziche Liu and Zhiyi Zhang and Junying Chen and Jianquan Li and Benyou Wang and Lian Zhang and Ruoyu Sun and Xiang Wan and Haizhou Li and Jinchao Xu},
+          year={2024},
+          eprint={2309.12053},
+          archivePrefix={arXiv},
+          primaryClass={cs.CL},
+          url={https://arxiv.org/abs/2309.12053},
+    }```
+
+    ```
+    @inproceedings{hardalov-etal-2020-exams,
+        title = "{EXAMS}: A Multi-subject High School Examinations Dataset for Cross-lingual and Multilingual Question Answering",
+        author = "Hardalov, Momchil and
+          Mihaylov, Todor and
+          Zlatkova, Dimitrina and
+          Dinkov, Yoan and
+          Koychev, Ivan and
+          Nakov, Preslav",
+        editor = "Webber, Bonnie and
+          Cohn, Trevor and
+          He, Yulan and
+          Liu, Yang",
+        booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)",
+        month = nov,
+        year = "2020",
+        address = "Online",
+        publisher = "Association for Computational Linguistics",
+        url = "https://aclanthology.org/2020.emnlp-main.438/",
+        doi = "10.18653/v1/2020.emnlp-main.438",
+        pages = "5427--5444",
+        abstract = "We propose EXAMS {--} a new benchmark dataset for cross-lingual and multilingual question answering for high school examinations. We collected more than 24,000 high-quality high school exam questions in 16 languages, covering 8 language families and 24 school subjects from Natural Sciences and Social Sciences, among others.EXAMS offers unique fine-grained evaluation framework across multiple languages and subjects, which allows precise analysis and comparison of the proposed models. We perform various experiments with existing top-performing multilingual pre-trained models and show that EXAMS offers multiple challenges that require multilingual knowledge and reasoning in multiple domains. We hope that EXAMS will enable researchers to explore challenging reasoning and knowledge transfer methods and pre-trained models for school question answering in various languages which was not possible by now. The data, code, pre-trained models, and evaluation are available at http://github.com/mhardalov/exams-qa."
+    }```
+    """  # noqa: E501
+
+    name = "arabic_exams"
+    description = "EXAMS is a benchmark dataset for multilingual and cross-lingual question answering from high school examinations. "  # noqa: E501
+    tags = ["knowledge", "multiple_choice"]
+
+    CHOICES = ["A", "B", "C", "D"]
+    # Remap validation split to train split
+    HF_SPLIT_TO_HELM_SPLIT = {"validation": TRAIN_SPLIT, "test": TEST_SPLIT}
+
+    def __init__(self, subject: str):
+        super().__init__()
+        self.subject: str = subject.replace("_", " ")
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+        dataset_splits = datasets.load_dataset(
+            "OALL/Arabic_EXAMS",
+            revision="bc7a29346dbcaa16a8cd883b1f3e681ab2b7ff2a",
+            cache_dir=cache_dir,
+        )
+
+        instances: List[Instance] = []
+        for split_name, dataset in dataset_splits.items():
+            for row in dataset:
+                subject = row["id"].split("-")[0]
+                if self.subject != "all" and self.subject != subject:
+                    continue
+                input = Input(text=row["question"])
+                references: List[Reference] = []
+                if row["answer"] not in self.CHOICES:
+                    hwarn(f"Invalid value in answer column in row: {row}")
+                    continue
+                correct_choice = row["answer"]
+                for choice in self.CHOICES:
+                    references.append(
+                        Reference(
+                            output=Output(text=row[choice]),
+                            tags=[CORRECT_TAG] if choice == correct_choice else [],
+                        )
+                    )
+                instance = Instance(
+                    id=row["id"],
+                    input=input,
+                    references=references,
+                    split=self.HF_SPLIT_TO_HELM_SPLIT[split_name],
+                )
+                instances.append(instance)
+
+        return instances
helm/benchmark/scenarios/arabic_mmlu_scenario.py
ADDED
@@ -0,0 +1,82 @@
+import os
+from typing import Dict, List
+
+import datasets
+
+from helm.common.general import ensure_directory_exists
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    TRAIN_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+
+class ArabicMMLUScenario(Scenario):
+    """ArabicMMLU
+
+    ArabicMMLU is the first multi-task language understanding benchmark
+    for Arabic language, sourced from school exams across diverse educational
+    levels in different countries spanning North Africa, the Levant, and the
+    Gulf regions. The data comprises 40 tasks and 14,575 multiple-choice questions
+    in Modern Standard Arabic (MSA), and is carefully constructed by collaborating
+    with native speakers in the region.
+
+    - https://huggingface.co/datasets/MBZUAI/ArabicMMLU
+    - https://aclanthology.org/2024.findings-acl.334/
+    """
+
+    name = "arabic_mmlu"
+    description = "Arabic Massive Multitask Language Understanding"
+    tags = ["knowledge", "multiple_choice"]
+
+    OPTIONS = ["A", "B", "C", "D"]
+    HF_SPLIT_TO_HELM_SPLIT = {"dev": TRAIN_SPLIT, "test": TEST_SPLIT}
+
+    def __init__(self, subset: str):
+        super().__init__()
+        self.subset = subset.replace("_", " ")
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+        dataset_splits: Dict[str, datasets.Dataset] = datasets.load_dataset(
+            "MBZUAI/ArabicMMLU",
+            self.subset,
+            revision="7aa530e2893ac420352b3f5c1a1310c010e9758b",
+            cache_dir=cache_dir,
+        )
+
+        # Read all instances
+        instances: List[Instance] = []
+        for split_name, dataset in dataset_splits.items():
+            assert isinstance(dataset, datasets.Dataset)
+            for row_index, row in enumerate(dataset):
+                input = Input(text=row["Question"])
+                references: List[Reference] = []
+                correct_option_index = ord(row["Answer Key"]) - ord("A") + 1
+                for option_index in range(1, 6):
+                    column_name = f"Option {option_index}"
+                    if not row[column_name]:
+                        continue
+                    references.append(
+                        Reference(
+                            # Need to convert column to string because the references are floats
+                            # for the subject "Math (Primary School)"
+                            output=Output(text=str(row[column_name])),
+                            tags=[CORRECT_TAG] if option_index == correct_option_index else [],
+                        )
+                    )
+                instance = Instance(
+                    id=f"id{row_index}",
+                    input=input,
+                    references=references,
+                    split=self.HF_SPLIT_TO_HELM_SPLIT[split_name],
+                )
+                instances.append(instance)
+
+        return instances
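Worked example (not part of the diff) of the answer-key mapping in ArabicMMLUScenario above: the "Answer Key" letter is converted to a one-indexed option number and compared against the "Option 1" through "Option 5" columns. Values below are made up for illustration.

# Sketch only: fabricated values showing the letter-to-index conversion.
answer_key = "B"
correct_option_index = ord(answer_key) - ord("A") + 1  # "A" -> 1, "B" -> 2, ..., "E" -> 5
assert correct_option_index == 2
assert f"Option {correct_option_index}" == "Option 2"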