crfm-helm 0.5.6-py3-none-any.whl → 0.5.10-py3-none-any.whl
This diff shows the content changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release: this version of crfm-helm might be problematic.
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/METADATA +72 -130
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/RECORD +372 -305
- helm/benchmark/adaptation/adapter_spec.py +10 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
- helm/benchmark/annotation/aci_bench_annotator.py +11 -22
- helm/benchmark/annotation/air_bench_annotator.py +1 -1
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
- helm/benchmark/annotation/dischargeme_annotator.py +11 -22
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/med_dialog_annotator.py +11 -22
- helm/benchmark/annotation/medalign_annotator.py +11 -22
- helm/benchmark/annotation/medi_qa_annotator.py +11 -22
- helm/benchmark/annotation/medication_qa_annotator.py +11 -22
- helm/benchmark/annotation/mental_health_annotator.py +11 -22
- helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
- helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
- helm/benchmark/annotation/model_as_judge.py +23 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
- helm/benchmark/metrics/air_bench_metrics.py +3157 -1
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/basic_metrics.py +267 -2
- helm/benchmark/metrics/bbq_metrics.py +12 -0
- helm/benchmark/metrics/classification_metrics.py +19 -1
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +1 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
- helm/benchmark/metrics/copyright_metrics.py +1 -1
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
- helm/benchmark/metrics/dry_run_metrics.py +30 -1
- helm/benchmark/metrics/efficiency_metrics.py +74 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +312 -1
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
- helm/benchmark/metrics/ifeval_metrics.py +13 -1
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
- helm/benchmark/metrics/language_modeling_metrics.py +13 -1
- helm/benchmark/metrics/live_qa_metrics.py +13 -1
- helm/benchmark/metrics/llm_jury_metrics.py +13 -1
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
- helm/benchmark/metrics/medec_metrics.py +25 -2
- helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
- helm/benchmark/metrics/metric.py +25 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
- helm/benchmark/metrics/omni_math_metrics.py +13 -1
- helm/benchmark/metrics/safety_metrics.py +13 -1
- helm/benchmark/metrics/seahelm_metrics.py +14 -1
- helm/benchmark/metrics/summac/model_summac.py +3 -3
- helm/benchmark/metrics/summarization_metrics.py +129 -1
- helm/benchmark/metrics/toxicity_metrics.py +31 -1
- helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
- helm/benchmark/metrics/wildbench_metrics.py +21 -1
- helm/benchmark/model_deployment_registry.py +11 -19
- helm/benchmark/presentation/create_plots.py +11 -2
- helm/benchmark/presentation/run_display.py +13 -3
- helm/benchmark/presentation/run_entry.py +2 -2
- helm/benchmark/presentation/schema.py +10 -22
- helm/benchmark/presentation/summarize.py +189 -14
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/run.py +15 -4
- helm/benchmark/run_expander.py +4 -0
- helm/benchmark/run_specs/arabic_run_specs.py +197 -0
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +2 -55
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +3 -1
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +48 -1
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +363 -53
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +11 -13
- helm/benchmark/runner.py +7 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
- helm/benchmark/scenarios/air_bench_scenario.py +21 -0
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
- helm/benchmark/scenarios/aratrust_scenario.py +95 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -53
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -21
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -52
- helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
- helm/benchmark/scenarios/banking77_scenario.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +15 -0
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
- helm/benchmark/scenarios/bluex_scenario.py +70 -0
- helm/benchmark/scenarios/bold_scenario.py +15 -0
- helm/benchmark/scenarios/boolq_scenario.py +20 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
- helm/benchmark/scenarios/clear_scenario.py +23 -0
- helm/benchmark/scenarios/cleva_scenario.py +480 -1
- helm/benchmark/scenarios/code_scenario.py +28 -0
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/commonsense_scenario.py +32 -0
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
- helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
- helm/benchmark/scenarios/disinformation_scenario.py +22 -0
- helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
- helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
- helm/benchmark/scenarios/financebench_scenario.py +21 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
- helm/benchmark/scenarios/gpqa_scenario.py +18 -0
- helm/benchmark/scenarios/grammar_scenario.py +20 -1
- helm/benchmark/scenarios/gsm_scenario.py +21 -0
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
- helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
- helm/benchmark/scenarios/headqa_scenario.py +22 -0
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
- helm/benchmark/scenarios/ice_scenario.py +21 -1
- helm/benchmark/scenarios/ifeval_scenario.py +18 -0
- helm/benchmark/scenarios/imdb_scenario.py +15 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/legal_support_scenario.py +13 -0
- helm/benchmark/scenarios/legalbench_scenario.py +19 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
- helm/benchmark/scenarios/lextreme_scenario.py +11 -0
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +54 -20
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
- helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
- helm/benchmark/scenarios/med_qa_scenario.py +20 -0
- helm/benchmark/scenarios/medalign_scenario.py +23 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
- helm/benchmark/scenarios/medbullets_scenario.py +22 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
- helm/benchmark/scenarios/medec_scenario.py +23 -0
- helm/benchmark/scenarios/medhallu_scenario.py +23 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
- helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
- helm/benchmark/scenarios/melt_scenarios.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +23 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
- helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
- helm/benchmark/scenarios/mmlu_scenario.py +21 -0
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/msmarco_scenario.py +30 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
- helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
- helm/benchmark/scenarios/omni_math_scenario.py +18 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
- helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
- helm/benchmark/scenarios/quac_scenario.py +14 -0
- helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
- helm/benchmark/scenarios/raft_scenario.py +15 -0
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
- helm/benchmark/scenarios/scenario.py +31 -0
- helm/benchmark/scenarios/seahelm_scenario.py +350 -2
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +23 -1
- helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/spider_scenario.py +18 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
- helm/benchmark/scenarios/summarization_scenario.py +37 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/vicuna_scenario.py +21 -1
- helm/benchmark/scenarios/wikifact_scenario.py +20 -0
- helm/benchmark/scenarios/wildbench_scenario.py +18 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +8 -1
- helm/benchmark/static/schema_arabic.yaml +271 -0
- helm/benchmark/static/schema_classic.yaml +0 -17
- helm/benchmark/static/schema_long_context.yaml +17 -18
- helm/benchmark/static/schema_medhelm.yaml +36 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
- helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
- helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
- helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
- helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
- helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
- helm/benchmark/static_build/index.html +5 -6
- helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
- helm/clients/ai21_client.py +2 -0
- helm/clients/aleph_alpha_client.py +2 -0
- helm/clients/anthropic_client.py +7 -1
- helm/clients/audio_language/diva_llama_client.py +2 -0
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/llama_omni_client.py +2 -1
- helm/clients/audio_language/qwen2_5_omni_client.py +21 -8
- helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
- helm/clients/audio_language/qwen_audiolm_client.py +2 -1
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/bedrock_client.py +63 -6
- helm/clients/cohere_client.py +3 -0
- helm/clients/dspy_client.py +135 -0
- helm/clients/google_client.py +2 -0
- helm/clients/http_model_client.py +2 -0
- helm/clients/huggingface_client.py +4 -3
- helm/clients/ibm_client.py +3 -1
- helm/clients/image_generation/adobe_vision_client.py +2 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
- helm/clients/image_generation/cogview2_client.py +2 -1
- helm/clients/image_generation/dalle2_client.py +2 -0
- helm/clients/image_generation/dalle_mini_client.py +2 -1
- helm/clients/image_generation/deep_floyd_client.py +2 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
- helm/clients/image_generation/lexica_client.py +2 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
- helm/clients/image_generation/mindalle_client.py +2 -1
- helm/clients/image_generation/together_image_generation_client.py +2 -0
- helm/clients/megatron_client.py +2 -0
- helm/clients/mistral_client.py +2 -0
- helm/clients/moderation_api_client.py +2 -0
- helm/clients/openai_client.py +38 -21
- helm/clients/openai_responses_client.py +34 -8
- helm/clients/openrouter_client.py +31 -0
- helm/clients/palmyra_client.py +2 -1
- helm/clients/reka_client.py +2 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
- helm/clients/stanfordhealthcare_http_model_client.py +2 -0
- helm/clients/test_huggingface_client.py +3 -3
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/together_client.py +52 -13
- helm/clients/vertexai_client.py +23 -11
- helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
- helm/clients/vision_language/huggingface_vlm_client.py +2 -0
- helm/clients/vision_language/idefics_client.py +2 -1
- helm/clients/vision_language/open_flamingo_client.py +2 -1
- helm/clients/vision_language/paligemma_client.py +2 -1
- helm/clients/vision_language/palmyra_vision_client.py +2 -0
- helm/clients/vision_language/qwen2_vlm_client.py +2 -1
- helm/clients/vision_language/qwen_vlm_client.py +2 -1
- helm/clients/vllm_client.py +43 -7
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/clients/writer_client.py +5 -2
- helm/common/critique_request.py +0 -1
- helm/common/hierarchical_logger.py +103 -34
- helm/common/object_spec.py +23 -8
- helm/common/optional_dependencies.py +1 -1
- helm/common/test_general.py +4 -0
- helm/common/test_logging.py +94 -0
- helm/config/model_deployments.yaml +1001 -187
- helm/config/model_metadata.yaml +602 -18
- helm/config/tokenizer_configs.yaml +202 -5
- helm/proxy/cli.py +1 -1
- helm/proxy/example_queries.py +8 -8
- helm/proxy/retry.py +5 -0
- helm/proxy/server.py +2 -1
- helm/proxy/static/index.css +4 -0
- helm/proxy/static/index.js +7 -1
- helm/tokenizers/auto_tokenizer.py +2 -2
- helm/tokenizers/grok_tokenizer.py +2 -0
- helm/benchmark/metrics/aci_bench_metrics.py +0 -14
- helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
- helm/benchmark/metrics/dischargeme_metrics.py +0 -14
- helm/benchmark/metrics/med_dialog_metrics.py +0 -14
- helm/benchmark/metrics/medalign_metrics.py +0 -14
- helm/benchmark/metrics/medi_qa_metrics.py +0 -14
- helm/benchmark/metrics/medication_qa_metrics.py +0 -14
- helm/benchmark/metrics/mental_health_metrics.py +0 -14
- helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
- helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +0 -103
- helm/benchmark/scenarios/numeracy_scenario.py +0 -794
- helm/benchmark/static_build/assets/index-94295e78.js +0 -10
- helm/benchmark/static_build/assets/index-b9779128.css +0 -1
- helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
- helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
- helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
- /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
- /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
- /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
- /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
- /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
- /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
helm/benchmark/scenarios/seahelm_scenario.py (+350 -2)

@@ -5,6 +5,7 @@ from typing import List, Dict
 
 import pandas as pd
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Input,
     Instance,
@@ -15,6 +16,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     TEST_SPLIT,
     TRAIN_SPLIT,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_file_downloaded
 from helm.common.hierarchical_logger import hlog
@@ -129,6 +131,27 @@ class TyDiQAScenario(Scenario):
             outputs.append(instance)
         return outputs
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="tydiqa",
+            display_name="TyDiQA",
+            short_display_name=None,
+            description="TyDiQA [(Clark, 2020)](https://aclanthology.org/2020.tacl-1.30) is an "
+            "open-book question answering dataset for 11 typologically-diverse languages. "
+            "The questions are written by people who want to know the answer, but do not "
+            "know the answer yet, and the data is collected directly in each language "
+            "without the use of translation.\n",
+            taxonomy=TaxonomyInfo(
+                task="question answering",
+                what="questions by human annotators about Wikipedia articles",
+                when="?",
+                who="human annotators",
+                language="Indonesian",
+            ),
+            main_metric="squad_f1_score",
+            main_split="test",
+        )
+
 
 # 1.2 Vietnamese & Thai: XQuAD
 class XQuADScenario(Scenario):
@@ -232,6 +255,28 @@ class XQuADScenario(Scenario):
             outputs.append(instance)
         return outputs
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name=f"xquad_{self.language}",
+            display_name=f"XQuAD ({self.language})",
+            short_display_name=None,
+            description="XQuAD [(Artetxe, 2019)](https://arxiv.org/abs/1910.11856) is an open-book "
+            "question answering dataset that is parallel across 10 languages. The dataset "
+            "consists of a subset of 240 paragraphs and 1190 question-answer pairs from the "
+            "development set of SQuAD v1.1 (Rajpurkar et al., 2016) together with their "
+            "professional translations.\n",
+            taxonomy=TaxonomyInfo(
+                task="question answering",
+                what="questions by crowdworkers about Wikipedia articles translated "
+                f"from English to {self.language}",
+                when="?",
+                who="?",
+                language=self.language,
+            ),
+            main_metric="squad_f1_score",
+            main_split="test",
+        )
+
 
 # 1.3 Tamil: IndicQA
 class IndicQAScenario(Scenario):
@@ -341,6 +386,27 @@ class IndicQAScenario(Scenario):
             outputs.append(instance)
         return outputs
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="indicqa",
+            display_name="IndicQA",
+            short_display_name=None,
+            description="IndicQA [(Doddapaneni, 2023)](https://aclanthology.org/2023.acl-long.693)is an "
+            "open-book question answering dataset for 11 Indic languages. Answers to "
+            "questions are to be extracted from the text provided. The data is taken from "
+            "Wikipedia articles across various domains and questions and answers were "
+            "manually created by native speakers.\n",
+            taxonomy=TaxonomyInfo(
+                task="question answering",
+                what="questions about Wikipedia articles translated by native " "speakers from English to Tamil",
+                when="?",
+                who="?",
+                language="Tamil",
+            ),
+            main_metric="squad_f1_score",
+            main_split="test",
+        )
+
 
 # 2. Sentiment Analysis
 # 2.1 Indonesian: NusaX Sentiment
@@ -445,6 +511,25 @@ class NusaXScenario(Scenario):
             outputs.append(instance)
         return outputs
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="nusax",
+            display_name="NusaX",
+            short_display_name=None,
+            description="NusaX [(Winata, 2023)](https://aclanthology.org/2023.eacl-main.57) is an "
+            "Indonesian sentiment analysis dataset. The data consists of comments and "
+            "reviews from various online platforms.\n",
+            taxonomy=TaxonomyInfo(
+                task="sentiment analysis",
+                what="online comments and reviews",
+                when="?",
+                who="internet users",
+                language="Indonesian",
+            ),
+            main_metric="classification_macro_f1",
+            main_split="test",
+        )
+
 
 # 2.2 Vietnamese: UIT-VSFC
 class UITVSFCScenario(Scenario):
@@ -543,6 +628,25 @@ class UITVSFCScenario(Scenario):
             outputs.append(instance)
         return outputs
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="uitvsfc",
+            display_name="UIT-VSFC",
+            short_display_name=None,
+            description="UIT-VSFC [(Nguyen, 2018)](https://ieeexplore.ieee.org/document/8573337) is a "
+            "Vietnamese sentiment analysis dataset. The data consists of student feedback "
+            "obtained from end-of-semester surveys at a Vietnamese university.\n",
+            taxonomy=TaxonomyInfo(
+                task="sentiment analysis",
+                what="university student end-of-semester survey responses",
+                when="?",
+                who="university students",
+                language="Vietnamese",
+            ),
+            main_metric="classification_macro_f1",
+            main_split="test",
+        )
+
 
 # 2.3 Thai: Wisesight Sentiment
 class WisesightScenario(Scenario):
@@ -634,6 +738,25 @@ class WisesightScenario(Scenario):
             outputs.append(instance)
         return outputs
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="wisesight",
+            display_name="Wisesight",
+            short_display_name=None,
+            description="Wisesight [(Suriyawongkul, 2019)](https://doi.org/10.5281/zenodo.3457447) is "
+            "an Thai sentiment analysis scenario. The data consists of social media "
+            "messages regarding consumer products and services. \n",
+            taxonomy=TaxonomyInfo(
+                task="sentiment analysis",
+                what="social media messages regarding consumer products and services",
+                when="?",
+                who="social media users",
+                language="Thai",
+            ),
+            main_metric="classification_macro_f1",
+            main_split="test",
+        )
+
 
 # 2.4 Tamil: IndicSentiment
 class IndicSentimentScenario(Scenario):
@@ -723,6 +846,22 @@ class IndicSentimentScenario(Scenario):
             outputs.append(instance)
         return outputs
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="indicsentiment",
+            display_name="IndicSentiment",
+            short_display_name=None,
+            description="IndicSentiment is a Tamil sentiment analysis dataset that comes from "
+            "IndicXTREME [(Doddapaneni, "
+            "2022)](https://aclanthology.org/2023.acl-long.693/), and consists of product "
+            "reviews that were written by annotators. Labels are positive or negative.\n",
+            taxonomy=TaxonomyInfo(
+                task="sentiment analysis", what="product reviews", when="?", who="human annotators", language="Tamil"
+            ),
+            main_metric="classification_macro_f1",
+            main_split="test",
+        )
+
 
 # 3. Toxicity Detection/Classification
 # 3.1 Indonesian: Multi-Label Hate Speech Detection
@@ -835,6 +974,24 @@ class MLHSDScenario(Scenario):
             outputs.append(instance)
         return outputs
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="mlhsd",
+            display_name="MLHSD",
+            short_display_name=None,
+            description="MLHSD [(Ibrohim, 2019)](https://aclanthology.org/W19-3506) is an Indonesian "
+            "toxicity detection dataset obtained from tweets on Twitter.\n",
+            taxonomy=TaxonomyInfo(
+                task="toxicity detection/classification",
+                what="tweets",
+                when="?",
+                who="Twitter users",
+                language="Indonesian",
+            ),
+            main_metric="classification_macro_f1",
+            main_split="test",
+        )
+
 
 # 3.2 Vietnamese: ViHSD
 class ViHSDScenario(Scenario):
@@ -927,6 +1084,26 @@ class ViHSDScenario(Scenario):
             outputs.append(instance)
         return outputs
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="vihsd",
+            display_name="ViHSD",
+            short_display_name=None,
+            description="ViHSD [(Luu, "
+            "2021)](https://link.springer.com/chapter/10.1007/978-3-030-79457-6_35 )is a "
+            "Vietnamese toxicity detection dataset obtained from comments on Facebook, "
+            "Youtube, Instagram, and Tiktok.\n",
+            taxonomy=TaxonomyInfo(
+                task="toxicity detection/classification",
+                what="social media comments",
+                when="?",
+                who="Social media users",
+                language="Vietnamese",
+            ),
+            main_metric="classification_macro_f1",
+            main_split="test",
+        )
+
 
 # 3.3 Thai: Thai Toxicity Tweets
 class ThaiToxicityTweetsScenario(Scenario):
@@ -1013,6 +1190,21 @@ class ThaiToxicityTweetsScenario(Scenario):
             outputs.append(instance)
         return outputs
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="thaitoxicitytweets",
+            display_name="Thai Toxicity Tweets",
+            short_display_name=None,
+            description="Thai Toxicity Tweets [(Sirihattasak, "
+            "2018)](http://www.lrec-conf.org/workshops/lrec2018/W32/pdf/1_W32.pdf) is a "
+            "Thai toxicity detection dataset obtained from tweets on Twitter. \n",
+            taxonomy=TaxonomyInfo(
+                task="toxicity detection/classification", what="tweets", when="", who="Twitter users", language="Thai"
+            ),
+            main_metric="classification_macro_f1",
+            main_split="test",
+        )
+
 
 # B. Natural Language Generation
 # 1. Machine Translation
@@ -1111,6 +1303,28 @@ class FloresScenario(Scenario):
             outputs.append(instance)
         return outputs
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name=f"flores_{self.source}_{self.target}",
+            display_name=f"Flores ({self.source} to {self.target})",
+            short_display_name=None,
+            description="Flores [(NLLB Team, "
+            "2022)](https://research.facebook.com/publications/no-language-left-behind/) "
+            "was created with professional human translators who translate the FLORES "
+            "source dataset into the target languages and a separate group of independent "
+            "translation reviewers who perform quality assessments of the human "
+            "translations and provide translation feedback to the translators.\n",
+            taxonomy=TaxonomyInfo(
+                task="machine translation",
+                what="translations from professional human translators",
+                when="?",
+                who="professional human translators",
+                language=f"{self.source}, {self.target}",
+            ),
+            main_metric="chr_f_plus_plus",
+            main_split="test",
+        )
+
 
 # C. Natural Language Reasoning
 # 1. Natural Language Inference
@@ -1207,6 +1421,26 @@ class IndoNLIScenario(Scenario):
             outputs.append(instance)
         return outputs
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="indonli",
+            display_name="IndoNLI",
+            short_display_name=None,
+            description="IndoNLI [(Mahendra, 2021)](https://aclanthology.org/2021.emnlp-main.821) is a "
+            "natural language inference dataset obtained from Wikipedia, news, and web "
+            "articles that incorporates various linguistic phenomena such as numerical "
+            "reasoning, structural changes, idioms, or temporal and spatial reasoning. \n",
+            taxonomy=TaxonomyInfo(
+                task="natural language inference",
+                what="Wikipedia, news, and web articles",
+                when="?",
+                who="?",
+                language="Indonesian",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
+
 
 # 1.2 Vietnamese & Thai: XNLI
 class XNLIScenario(Scenario):
@@ -1305,6 +1539,25 @@ class XNLIScenario(Scenario):
             outputs.append(instance)
         return outputs
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name=f"xnli_{self.language}",
+            display_name=f"XNLI ({self.language})",
+            short_display_name=None,
+            description="XNLI [(Conneau, 2018)](https://aclanthology.org/D18-1269) is a natural "
+            "language inference dataset obtained from crowdsourced NLI data then "
+            "professionally translated across 14 other languages.\n",
+            taxonomy=TaxonomyInfo(
+                task="natural language inference",
+                what="crowdsourced NLI data professionally translated",
+                when="?",
+                who="?",
+                language=self.language,
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
+
 
 # 1.3 Tamil: IndicXNLI
 class IndicXNLIScenario(Scenario):
@@ -1398,6 +1651,25 @@ class IndicXNLIScenario(Scenario):
             outputs.append(instance)
         return outputs
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="indicxnli",
+            display_name="IndicXNLI",
+            short_display_name=None,
+            description="IndicXNLI is a Tamil sentiment analysis dataset that comes from IndicXTREME "
+            "[(Doddapaneni, 2022)](https://aclanthology.org/2023.acl-long.693/), which "
+            "automatically translated from XNLI into 11 Indic languages.\n",
+            taxonomy=TaxonomyInfo(
+                task="natural language inference",
+                what="crowdsourced NLI data professionally translated into Tamil",
+                when="?",
+                who="?",
+                language="Tamil",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
+
 
 # 2. Causal Reasoning: XCOPA
 class XCOPAScenario(Scenario):
@@ -1529,6 +1801,25 @@ class XCOPAScenario(Scenario):
             outputs.append(instance)
         return outputs
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name=f"xcopa_{self.language}",
+            display_name=f"XCOPA ({self.language})",
+            short_display_name=None,
+            description="XCOPA [(Ponti, 2020)](https://ducdauge.github.io/files/xcopa.pdf) is causal "
+            "reasoning dataset, a translation and reannotation of the English COPA. English "
+            "COPA included questions that directly assess commonsense causal reasoning.\n",
+            taxonomy=TaxonomyInfo(
+                task="causal reasoning",
+                what="commonsense causal reasoning questions translated into " "Indonesian",
+                when="?",
+                who="?",
+                language=self.language,
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
+
 
 # 1. Syntax: LINDSEA Minimal Pairs
 class LINDSEASyntaxMinimalPairsScenario(Scenario):
@@ -1650,6 +1941,26 @@ class LINDSEASyntaxMinimalPairsScenario(Scenario):
             outputs.append(instance)
         return outputs
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name=f"lindsea_syntax_minimal_pairs_{self.language}",
+            display_name="LINDSEA Syntax Minimal Pairs",
+            short_display_name=None,
+            description="LINDSEA minimal pairs is a linguistic diagnostic for syntax dataset from BHASA "
+            "[(Leong, 2023)](https://arxiv.org/abs/2309.06085), involving pairs of "
+            "sentences that differ minimally from each other and contrast in grammatical "
+            "acceptability.\n",
+            taxonomy=TaxonomyInfo(
+                task="minimal pairs",
+                what="sentence pairs with minimal differences and constrasting " "grammatical acceptability",
+                when="?",
+                who="?",
+                language=self.language,
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
+
 
 # 2.1 Pragmatics: LINDSEA Presuppositions
 class LINDSEAPragmaticsPresuppositionsScenario(Scenario):
@@ -1750,7 +2061,7 @@ class LINDSEAPragmaticsPresuppositionsScenario(Scenario):
         text_noun = self.prompt_components["text_noun"]
         instruction = self.prompt_components["single_instruction"]
 
-        passage = "{question}
+        passage = "{question}\\{text_noun}: {text}\n{instruction}".format(
             question=question.format(row["question_translated"]),
             text_noun=text_noun,
             text=row["text"],
@@ -1798,6 +2109,24 @@ class LINDSEAPragmaticsPresuppositionsScenario(Scenario):
             outputs.append(instance)
         return outputs
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name=f"lindsea_pragmatics_presuppositions_{self.language}",
+            display_name="LINDSEA Pragmatics Presuppositions",
+            short_display_name=None,
+            description="LINDSEA Pragmatics Presuppositions is a linguistic diagnostic for pragmatics "
+            "dataset from BHASA [(Leong, 2023)](https://arxiv.org/abs/2309.06085), "
+            "involving two formats: single and pair sentences. For single sentence "
+            "questions, the system under test needs to determine if the sentence is "
+            "true/false. For pair sentence questions, the system under test needs to "
+            "determine whether a conclusion can be drawn from another sentence.\n",
+            taxonomy=TaxonomyInfo(
+                task="pragmatic reasoning", what="presuppositions", when="?", who="?", language=self.language
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
+
 
 # 2.2 Pragmatics: LINDSEA Scalar Implicatures
 class LINDSEAPragmaticsScalarImplicaturesScenario(Scenario):
@@ -1898,7 +2227,7 @@ class LINDSEAPragmaticsScalarImplicaturesScenario(Scenario):
         text_noun = self.prompt_components["text_noun"]
        instruction = self.prompt_components["single_instruction"]
 
-        passage = "{question}
+        passage = "{question}\\{text_noun}: {text}\n{instruction}".format(
            question=question.format(row["question_translated"]),
            text_noun=text_noun,
            text=row["text"],
@@ -1945,3 +2274,22 @@ class LINDSEAPragmaticsScalarImplicaturesScenario(Scenario):
             )
             outputs.append(instance)
         return outputs
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name=f"lindsea_pragmatics_scalar_implicatures_{self.language}",
+            display_name="LINDSEA Pragmatics Scalar Implicatures",
+            short_display_name=None,
+            description="LINDSEA Pragmatics Scalar Implicatures is a linguistic diagnostic for "
+            "pragmatics dataset from BHASA [(Leong, "
+            "2023)](https://arxiv.org/abs/2309.06085), , involving two formats: single and "
+            "pair sentences. For single sentence questions, the system under test needs to "
+            "determine if the sentence is true/false. For pair sentence questions, the "
+            "system under test needs to determine whether a conclusion can be drawn from "
+            "another sentence.\n",
+            taxonomy=TaxonomyInfo(
+                task="pragmatic reasoning", what="scalar implicatures", when="?", who="?", language=self.language
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
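Every `get_metadata` method added above builds the same two records, `ScenarioMetadata` and `TaxonomyInfo`. As a reading aid, here is a minimal sketch of what those containers presumably look like, with field names inferred from the keyword arguments in this diff; the actual definitions live in `helm/benchmark/presentation/taxonomy_info.py` and `helm/benchmark/scenarios/scenario.py` (both touched in this release, per the file list above) and may differ in types and defaults.

```python
# Hypothetical sketch only: field names are inferred from the keyword
# arguments used in the get_metadata() methods above; the types and
# defaults are assumptions, not the actual HELM definitions.
from dataclasses import dataclass
from typing import Optional


@dataclass(frozen=True)
class TaxonomyInfo:
    task: Optional[str] = None      # e.g. "question answering"
    what: Optional[str] = None      # what the data consists of
    when: Optional[str] = None      # when it was collected ("?" if unknown)
    who: Optional[str] = None       # who produced it
    language: Optional[str] = None  # e.g. "Indonesian"


@dataclass(frozen=True)
class ScenarioMetadata:
    name: str                                 # machine-readable id, e.g. "tydiqa"
    display_name: str                         # human-readable name, e.g. "TyDiQA"
    description: str                          # markdown description with citation
    short_display_name: Optional[str] = None
    taxonomy: Optional[TaxonomyInfo] = None
    main_metric: str = "exact_match"          # headline metric for the scenario
    main_split: str = "test"                  # split the headline metric is computed on
```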
helm/benchmark/scenarios/self_instruct_scenario.py (+29 -1)

@@ -2,8 +2,18 @@ import json
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
-from helm.benchmark.scenarios.scenario import
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    Reference,
+    Scenario,
+    Instance,
+    Input,
+    TEST_SPLIT,
+    Output,
+    ScenarioMetadata,
+)
 
 
 class SelfInstructScenario(Scenario):
@@ -46,3 +56,21 @@ class SelfInstructScenario(Scenario):
             )
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="self_instruct",
+            display_name="Self Instruct",
+            short_display_name="Self Instruct",
+            description="The manually-curated instructions from the Self-Instruct paper ([Wang et al., "
+            "2023](https://aclanthology.org/2023.acl-long.754.pdf)).",
+            taxonomy=TaxonomyInfo(
+                task="open-ended instruction following",
+                what="Instructions for LLMs",
+                when="2022",
+                who="Authors of the research paper",
+                language="English",
+            ),
+            main_metric="Helpfulness",
+            main_split="test",
+        )
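For orientation, a sketch of how downstream code might consume the new hook. The call site below is illustrative only and assumes a no-argument constructor; the real consumers appear to live in the presentation layer (e.g. `helm/benchmark/presentation/summarize.py`, also changed in this release).

```python
# Hypothetical usage of the new get_metadata() hook; illustrative only.
from helm.benchmark.scenarios.self_instruct_scenario import SelfInstructScenario

scenario = SelfInstructScenario()   # assumption: no constructor arguments
metadata = scenario.get_metadata()
print(metadata.name)          # "self_instruct"
print(metadata.main_metric)   # "Helpfulness"
print(metadata.taxonomy.who)  # "Authors of the research paper"
```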
helm/benchmark/scenarios/shc_bmt_scenario.py (+22 -0)

@@ -2,6 +2,7 @@ import sys
 import csv
 from typing import Dict, List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Input,
     Scenario,
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Reference,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import check_file_exists
 
@@ -73,3 +75,23 @@ class SHCBMTMedScenario(Scenario):
             )
 
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="shc_bmt_med",
+            display_name="BMT-Status",
+            description="BMT-Status is a benchmark composed of clinical notes and associated binary "
+            "questions related to bone marrow transplant (BMT), hematopoietic stem cell "
+            "transplant (HSCT), or hematopoietic cell transplant (HCT) status. The goal is "
+            "to determine whether the patient received a subsequent transplant based on the "
+            "provided clinical documentation.",
+            taxonomy=TaxonomyInfo(
+                task="question answering",
+                what="Answer bone marrow transplant questions",
+                when="Any",
+                who="Researcher",
+                language="English",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
helm/benchmark/scenarios/shc_cdi_scenario.py (+20 -0)

@@ -2,6 +2,7 @@ import sys
 import csv
 from typing import Dict, List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Input,
     Scenario,
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Reference,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import check_file_exists
 
@@ -73,3 +75,21 @@ class SHCCDIMedScenario(Scenario):
             )
 
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="shc_cdi_med",
+            display_name="CDI-QA",
+            description="CDI-QA is a benchmark constructed from Clinical Documentation Integrity (CDI) "
+            "notes. It is used to evaluate a model's ability to verify clinical conditions "
+            "based on documented evidence in patient records.",
+            taxonomy=TaxonomyInfo(
+                task="Classification",
+                what="Answer verification questions from CDI notes",
+                when="Any",
+                who="Hospital Admistrator",
+                language="English",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
helm/benchmark/scenarios/shc_conf_scenario.py (+23 -0)

@@ -2,6 +2,7 @@ import sys
 import csv
 from typing import Dict, List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Input,
     Scenario,
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Reference,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import check_file_exists
 
@@ -74,3 +76,24 @@ class SHCCONFMedScenario(Scenario):
             )
 
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="shc_conf_med",
+            display_name="MedConfInfo",
+            description="MedConfInfo is a benchmark comprising clinical notes from adolescent patients. "
+            "It is used to evaluate whether the content contains sensitive protected health "
+            "information (PHI) that should be restricted from parental access, in "
+            "accordance with adolescent confidentiality policies in clinical care. "
+            "[(Rabbani et al., "
+            "2024)](https://jamanetwork.com/journals/jamapediatrics/fullarticle/2814109).",
+            taxonomy=TaxonomyInfo(
+                task="Classification",
+                what="Identify sensitive health info in adolescent notes",
+                when="Any",
+                who="Clinician",
+                language="English",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )