crfm-helm 0.5.6__py3-none-any.whl → 0.5.10__py3-none-any.whl
This diff covers the contents of two publicly released versions of this package, as published to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/METADATA +72 -130
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/RECORD +372 -305
- helm/benchmark/adaptation/adapter_spec.py +10 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
- helm/benchmark/annotation/aci_bench_annotator.py +11 -22
- helm/benchmark/annotation/air_bench_annotator.py +1 -1
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
- helm/benchmark/annotation/dischargeme_annotator.py +11 -22
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/med_dialog_annotator.py +11 -22
- helm/benchmark/annotation/medalign_annotator.py +11 -22
- helm/benchmark/annotation/medi_qa_annotator.py +11 -22
- helm/benchmark/annotation/medication_qa_annotator.py +11 -22
- helm/benchmark/annotation/mental_health_annotator.py +11 -22
- helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
- helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
- helm/benchmark/annotation/model_as_judge.py +23 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
- helm/benchmark/metrics/air_bench_metrics.py +3157 -1
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/basic_metrics.py +267 -2
- helm/benchmark/metrics/bbq_metrics.py +12 -0
- helm/benchmark/metrics/classification_metrics.py +19 -1
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +1 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
- helm/benchmark/metrics/copyright_metrics.py +1 -1
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
- helm/benchmark/metrics/dry_run_metrics.py +30 -1
- helm/benchmark/metrics/efficiency_metrics.py +74 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +312 -1
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
- helm/benchmark/metrics/ifeval_metrics.py +13 -1
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
- helm/benchmark/metrics/language_modeling_metrics.py +13 -1
- helm/benchmark/metrics/live_qa_metrics.py +13 -1
- helm/benchmark/metrics/llm_jury_metrics.py +13 -1
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
- helm/benchmark/metrics/medec_metrics.py +25 -2
- helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
- helm/benchmark/metrics/metric.py +25 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
- helm/benchmark/metrics/omni_math_metrics.py +13 -1
- helm/benchmark/metrics/safety_metrics.py +13 -1
- helm/benchmark/metrics/seahelm_metrics.py +14 -1
- helm/benchmark/metrics/summac/model_summac.py +3 -3
- helm/benchmark/metrics/summarization_metrics.py +129 -1
- helm/benchmark/metrics/toxicity_metrics.py +31 -1
- helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
- helm/benchmark/metrics/wildbench_metrics.py +21 -1
- helm/benchmark/model_deployment_registry.py +11 -19
- helm/benchmark/presentation/create_plots.py +11 -2
- helm/benchmark/presentation/run_display.py +13 -3
- helm/benchmark/presentation/run_entry.py +2 -2
- helm/benchmark/presentation/schema.py +10 -22
- helm/benchmark/presentation/summarize.py +189 -14
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/run.py +15 -4
- helm/benchmark/run_expander.py +4 -0
- helm/benchmark/run_specs/arabic_run_specs.py +197 -0
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +2 -55
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +3 -1
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +48 -1
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +363 -53
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +11 -13
- helm/benchmark/runner.py +7 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
- helm/benchmark/scenarios/air_bench_scenario.py +21 -0
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
- helm/benchmark/scenarios/aratrust_scenario.py +95 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -53
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -21
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -52
- helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
- helm/benchmark/scenarios/banking77_scenario.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +15 -0
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
- helm/benchmark/scenarios/bluex_scenario.py +70 -0
- helm/benchmark/scenarios/bold_scenario.py +15 -0
- helm/benchmark/scenarios/boolq_scenario.py +20 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
- helm/benchmark/scenarios/clear_scenario.py +23 -0
- helm/benchmark/scenarios/cleva_scenario.py +480 -1
- helm/benchmark/scenarios/code_scenario.py +28 -0
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/commonsense_scenario.py +32 -0
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
- helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
- helm/benchmark/scenarios/disinformation_scenario.py +22 -0
- helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
- helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
- helm/benchmark/scenarios/financebench_scenario.py +21 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
- helm/benchmark/scenarios/gpqa_scenario.py +18 -0
- helm/benchmark/scenarios/grammar_scenario.py +20 -1
- helm/benchmark/scenarios/gsm_scenario.py +21 -0
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
- helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
- helm/benchmark/scenarios/headqa_scenario.py +22 -0
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
- helm/benchmark/scenarios/ice_scenario.py +21 -1
- helm/benchmark/scenarios/ifeval_scenario.py +18 -0
- helm/benchmark/scenarios/imdb_scenario.py +15 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/legal_support_scenario.py +13 -0
- helm/benchmark/scenarios/legalbench_scenario.py +19 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
- helm/benchmark/scenarios/lextreme_scenario.py +11 -0
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +54 -20
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
- helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
- helm/benchmark/scenarios/med_qa_scenario.py +20 -0
- helm/benchmark/scenarios/medalign_scenario.py +23 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
- helm/benchmark/scenarios/medbullets_scenario.py +22 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
- helm/benchmark/scenarios/medec_scenario.py +23 -0
- helm/benchmark/scenarios/medhallu_scenario.py +23 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
- helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
- helm/benchmark/scenarios/melt_scenarios.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +23 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
- helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
- helm/benchmark/scenarios/mmlu_scenario.py +21 -0
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/msmarco_scenario.py +30 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
- helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
- helm/benchmark/scenarios/omni_math_scenario.py +18 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
- helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
- helm/benchmark/scenarios/quac_scenario.py +14 -0
- helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
- helm/benchmark/scenarios/raft_scenario.py +15 -0
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
- helm/benchmark/scenarios/scenario.py +31 -0
- helm/benchmark/scenarios/seahelm_scenario.py +350 -2
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +23 -1
- helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/spider_scenario.py +18 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
- helm/benchmark/scenarios/summarization_scenario.py +37 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/vicuna_scenario.py +21 -1
- helm/benchmark/scenarios/wikifact_scenario.py +20 -0
- helm/benchmark/scenarios/wildbench_scenario.py +18 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +8 -1
- helm/benchmark/static/schema_arabic.yaml +271 -0
- helm/benchmark/static/schema_classic.yaml +0 -17
- helm/benchmark/static/schema_long_context.yaml +17 -18
- helm/benchmark/static/schema_medhelm.yaml +36 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
- helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
- helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
- helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
- helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
- helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
- helm/benchmark/static_build/index.html +5 -6
- helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
- helm/clients/ai21_client.py +2 -0
- helm/clients/aleph_alpha_client.py +2 -0
- helm/clients/anthropic_client.py +7 -1
- helm/clients/audio_language/diva_llama_client.py +2 -0
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/llama_omni_client.py +2 -1
- helm/clients/audio_language/qwen2_5_omni_client.py +21 -8
- helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
- helm/clients/audio_language/qwen_audiolm_client.py +2 -1
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/bedrock_client.py +63 -6
- helm/clients/cohere_client.py +3 -0
- helm/clients/dspy_client.py +135 -0
- helm/clients/google_client.py +2 -0
- helm/clients/http_model_client.py +2 -0
- helm/clients/huggingface_client.py +4 -3
- helm/clients/ibm_client.py +3 -1
- helm/clients/image_generation/adobe_vision_client.py +2 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
- helm/clients/image_generation/cogview2_client.py +2 -1
- helm/clients/image_generation/dalle2_client.py +2 -0
- helm/clients/image_generation/dalle_mini_client.py +2 -1
- helm/clients/image_generation/deep_floyd_client.py +2 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
- helm/clients/image_generation/lexica_client.py +2 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
- helm/clients/image_generation/mindalle_client.py +2 -1
- helm/clients/image_generation/together_image_generation_client.py +2 -0
- helm/clients/megatron_client.py +2 -0
- helm/clients/mistral_client.py +2 -0
- helm/clients/moderation_api_client.py +2 -0
- helm/clients/openai_client.py +38 -21
- helm/clients/openai_responses_client.py +34 -8
- helm/clients/openrouter_client.py +31 -0
- helm/clients/palmyra_client.py +2 -1
- helm/clients/reka_client.py +2 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
- helm/clients/stanfordhealthcare_http_model_client.py +2 -0
- helm/clients/test_huggingface_client.py +3 -3
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/together_client.py +52 -13
- helm/clients/vertexai_client.py +23 -11
- helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
- helm/clients/vision_language/huggingface_vlm_client.py +2 -0
- helm/clients/vision_language/idefics_client.py +2 -1
- helm/clients/vision_language/open_flamingo_client.py +2 -1
- helm/clients/vision_language/paligemma_client.py +2 -1
- helm/clients/vision_language/palmyra_vision_client.py +2 -0
- helm/clients/vision_language/qwen2_vlm_client.py +2 -1
- helm/clients/vision_language/qwen_vlm_client.py +2 -1
- helm/clients/vllm_client.py +43 -7
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/clients/writer_client.py +5 -2
- helm/common/critique_request.py +0 -1
- helm/common/hierarchical_logger.py +103 -34
- helm/common/object_spec.py +23 -8
- helm/common/optional_dependencies.py +1 -1
- helm/common/test_general.py +4 -0
- helm/common/test_logging.py +94 -0
- helm/config/model_deployments.yaml +1001 -187
- helm/config/model_metadata.yaml +602 -18
- helm/config/tokenizer_configs.yaml +202 -5
- helm/proxy/cli.py +1 -1
- helm/proxy/example_queries.py +8 -8
- helm/proxy/retry.py +5 -0
- helm/proxy/server.py +2 -1
- helm/proxy/static/index.css +4 -0
- helm/proxy/static/index.js +7 -1
- helm/tokenizers/auto_tokenizer.py +2 -2
- helm/tokenizers/grok_tokenizer.py +2 -0
- helm/benchmark/metrics/aci_bench_metrics.py +0 -14
- helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
- helm/benchmark/metrics/dischargeme_metrics.py +0 -14
- helm/benchmark/metrics/med_dialog_metrics.py +0 -14
- helm/benchmark/metrics/medalign_metrics.py +0 -14
- helm/benchmark/metrics/medi_qa_metrics.py +0 -14
- helm/benchmark/metrics/medication_qa_metrics.py +0 -14
- helm/benchmark/metrics/mental_health_metrics.py +0 -14
- helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
- helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +0 -103
- helm/benchmark/scenarios/numeracy_scenario.py +0 -794
- helm/benchmark/static_build/assets/index-94295e78.js +0 -10
- helm/benchmark/static_build/assets/index-b9779128.css +0 -1
- helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
- helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
- helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
- /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
- /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
- /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
- /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
- /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
- /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
helm/benchmark/scenarios/decodingtrust_privacy_scenario.py

```diff
@@ -4,8 +4,9 @@ import string
 import random
 from tqdm import tqdm
 from typing import List, Dict, Optional
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
-from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, Reference, Output, TEST_SPLIT
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, Reference, Output, TEST_SPLIT, ScenarioMetadata

 option_keys = ["pii", "privacy_understanding", "enron_email_extraction"]

@@ -217,6 +218,17 @@ class DecodingTrustPrivacyScenario(Scenario):

         return instances

+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="decodingtrust_privacy",
+            display_name="DecodingTrust - Privacy",
+            short_display_name="Privacy",
+            description="Evaluation of the privacy understanding and privacy preserving properties of " "LLMs",
+            taxonomy=TaxonomyInfo(task="text classification", what="?", when="?", who="?", language="English"),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )
+

 def get_local_domain(email):
     return email.split("@")
```
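Every scenario file touched below follows this same pattern: it imports `TaxonomyInfo` and `ScenarioMetadata` and adds a `get_metadata` method. The real definitions ship in this release (see `helm/benchmark/presentation/taxonomy_info.py +20 -0` and `helm/benchmark/scenarios/scenario.py +31 -0` in the file list above); the sketch below is only inferred from the keyword arguments used in these hunks, so field types, defaults, and decorators are assumptions.

```python
# Inferred sketch, not package source: reconstructed from the call sites in
# the hunks in this diff. Field types, defaults, and ordering are assumptions.
from dataclasses import dataclass
from typing import Optional


@dataclass(frozen=True)
class TaxonomyInfo:
    task: Optional[str] = None
    what: Optional[str] = None
    when: Optional[str] = None
    who: Optional[str] = None
    language: Optional[str] = None


@dataclass(frozen=True)
class ScenarioMetadata:
    name: str
    display_name: str
    description: str
    main_metric: str
    main_split: str
    short_display_name: Optional[str] = None
    taxonomy: Optional[TaxonomyInfo] = None
```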
helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py

```diff
@@ -1,7 +1,8 @@
 import json
 import os
 from typing import List, Dict
-from helm.benchmark.
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT, ScenarioMetadata
 from helm.common.general import ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import Reference, Output

@@ -66,3 +67,14 @@ class DecodingTrustStereotypeBiasScenario(Scenario):
             instances.append(instance)

         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="decodingtrust_stereotype_bias",
+            display_name="DecodingTrust - Stereotype Bias",
+            short_display_name="Stereotype",
+            description="Manually crafted stereotype user prompts from DecodingTrust",
+            taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+            main_metric="unknown",
+            main_split="test",
+        )
```
helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py

```diff
@@ -3,8 +3,9 @@ import os
 import random
 from typing import List, Dict

+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
-from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input
+from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input, ScenarioMetadata


 DATA_REPO_HASH = "38972f6ccbf376a8d0660babafb4d2b3b9cca3f4"
@@ -76,3 +77,14 @@ class DecodingTrustToxicityPromptsScenario(Scenario):
         random.shuffle(instances)

         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="decodingtrust_toxicity_prompts",
+            display_name="DecodingTrust - Toxicity",
+            short_display_name="Toxicity",
+            description="Evaluation of the privacy understanding and privacy preserving properties of " "LLMs",
+            taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+            main_metric="unknown",
+            main_split="test",
+        )
```
helm/benchmark/scenarios/dischargeme_scenario.py

```diff
@@ -1,4 +1,5 @@
 from typing import List
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import check_file_exists
 from helm.benchmark.scenarios.scenario import (
     Input,
@@ -8,6 +9,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Reference,
     Output,
+    ScenarioMetadata,
 )
 import pandas as pd

@@ -170,3 +172,25 @@ class DischargeMeScenario(Scenario):
             lines = file.readlines()
             lines = [line.strip() for line in lines]
         return lines
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="dischargeme",
+            display_name="DischargeMe",
+            short_display_name="DischargeMe",
+            description="DischargeMe is a benchmark designed to evaluate clinical text generation. It "
+            "pairs discharge summaries and radiology reports from MIMIC-IV with generation "
+            "tasks such as writing discharge instructions or summarizing the brief hospital "
+            "course. The benchmark assesses a model's ability to generate patient-facing "
+            "documentation that is complete, empathetic, and clinically accurate [(Xu, "
+            "2024)](https://physionet.org/content/discharge-me/1.3/).",
+            taxonomy=TaxonomyInfo(
+                task="Text generation",
+                what="Generate discharge instructions from hospital notes",
+                when="Upon hospital discharge",
+                who="Clinician",
+                language="English",
+            ),
+            main_metric="dischargeme_accuracy",
+            main_split="test",
+        )
```
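A small readability note on these hunks: the multi-line `description` values rely on Python's implicit concatenation of adjacent string literals, so the fragments join with no separator at compile time. A minimal illustration (the strings here are shortened from the DischargeMe description above):

```python
# Adjacent string literals concatenate at compile time; this is how the
# long description values in these hunks are assembled. Shortened example.
description = (
    "DischargeMe is a benchmark designed to evaluate clinical text generation. It "
    "pairs discharge summaries and radiology reports from MIMIC-IV."
)
assert "generation. It pairs" in description
```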
helm/benchmark/scenarios/disinformation_scenario.py

```diff
@@ -2,6 +2,7 @@ import json
 import os
 from typing import List, Dict, Optional

+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -12,6 +13,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )

 REITERATION_DATA_URL = "https://drive.google.com/uc?export=download&id=1uVJbsgPCHFAvH43I6SVvU3Ayo8dh-y_N"
@@ -175,3 +177,23 @@ class DisinformationScenario(Scenario):
             instances = self.create_wedging_instances(data)

         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        if self.capability == "reiteration":
+            name = "disinformation_reiteration"
+            display_name = "Disinformation (reiteration)"
+        elif self.capability == "wedging":
+            name = "disinformation_wedging"
+            display_name = "Disinformation (wedging)"
+        else:
+            raise Exception(f"Unknown capability {self.capability}")
+        return ScenarioMetadata(
+            name=name,
+            display_name=display_name,
+            description="Scenario from [Buchanan et al. "
+            "(2021)](https://cset.georgetown.edu/publication/truth-lies-and-automation/) "
+            "that tests the ability to generate divisive and wedging content.",
+            taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+            main_metric="unknown",
+            main_split="valid",
+        )
```
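Unlike the other scenarios, `DisinformationScenario.get_metadata` branches on `self.capability`, so the reported name and display name track the scenario configuration. A hypothetical usage sketch (the constructor argument is assumed from the attribute use above; the diff does not show the constructor):

```python
# Hypothetical usage sketch: assumes DisinformationScenario(capability=...)
# is a valid constructor call, which this diff implies but does not show.
from helm.benchmark.scenarios.disinformation_scenario import DisinformationScenario

metadata = DisinformationScenario(capability="wedging").get_metadata()
assert metadata.name == "disinformation_wedging"
assert metadata.display_name == "Disinformation (wedging)"
```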
helm/benchmark/scenarios/dyck_language_scenario.py

```diff
@@ -2,6 +2,7 @@ import numpy as np
 import random
 from typing import List, Tuple

+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )


@@ -234,3 +236,16 @@ class DyckLanguageScenario(Scenario):
             not_allowed=train_inputs,
         )
         return train_instances + test_instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="dyck_language",
+            display_name="Dyck",
+            description="Scenario testing hierarchical reasoning through the Dyck formal languages "
+            "[(Suzgun et al., 2019)](https://aclanthology.org/W19-3905/).",
+            taxonomy=TaxonomyInfo(
+                task="next-word prediction", what="Dyck formal language", when="n/a", who="n/a", language="synthetic"
+            ),
+            main_metric="exact_match_indicator",
+            main_split="test",
+        )
```
helm/benchmark/scenarios/ehrshot_scenario.py

```diff
@@ -7,6 +7,7 @@ from functools import partial
 from tqdm import tqdm
 from typing import Any, Dict, List, Optional, Mapping

+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import check_file_exists, ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
@@ -16,6 +17,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Reference,
     Output,
+    ScenarioMetadata,
 )

 ##################################
@@ -1517,3 +1519,23 @@ class EHRSHOTScenario(Scenario):
         )

         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="ehrshot",
+            display_name="EHRSHOT",
+            description="EHRSHOT is a benchmark designed to evaluate a model's ability to predict "
+            "future clinical events using structured EHR code sequences. Each instance "
+            "contains a patient's historical EHR data and a forward-looking clinical "
+            "question about whether a particular diagnosis, lab result, or hospital event "
+            "will occur [(Wornow et al., 2023)](https://arxiv.org/abs/2307.02028).",
+            taxonomy=TaxonomyInfo(
+                task="Classification",
+                what="Predict whether a medical event will occur in the future based " "on EHR codes",
+                when="Future prediction",
+                who="Clinician, Insurer",
+                language="English",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
```
helm/benchmark/scenarios/enem_challenge_scenario.py

```diff
@@ -2,6 +2,7 @@ from typing import List, Any
 from pathlib import Path
 from datasets import load_dataset

+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )


@@ -56,3 +58,20 @@ class ENEMChallengeScenario(Scenario):
             )
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="enem_challenge",
+            display_name="ENEM Challenge",
+            short_display_name=None,
+            description="ENEM Challenge",
+            taxonomy=TaxonomyInfo(
+                task="multiple-choice question answering",
+                what="general academic subjects",
+                when="between 2009 and 2023",
+                who="brazilian ministry of education",
+                language="Portuguese",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
```
helm/benchmark/scenarios/entity_data_imputation_scenario.py

```diff
@@ -3,6 +3,7 @@ import pandas as pd
 from pathlib import Path
 from typing import List, Tuple

+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.hierarchical_logger import hlog
 from helm.common.general import ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
@@ -15,6 +16,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )


@@ -160,3 +162,15 @@ class EntityDataImputationScenario(Scenario):
             instances.append(instance)

         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="entity_data_imputation",
+            display_name="Data imputation",
+            description="Scenario from [Mei et al. "
+            "(2021)](https://ieeexplore.ieee.org/document/9458712/) that tests the ability "
+            "to impute missing entities in a data table.",
+            taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )
```
helm/benchmark/scenarios/entity_matching_scenario.py

```diff
@@ -2,6 +2,7 @@ import pandas as pd
 from pathlib import Path
 from typing import Dict, List, Tuple

+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.hierarchical_logger import hlog
 from helm.common.general import ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
@@ -14,6 +15,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 from helm.benchmark.scenarios.entity_matching_scenario_fixed_random_state import set_fixed_random_state_for_dataset

@@ -155,3 +157,15 @@ class EntityMatchingScenario(Scenario):
             instances.append(instance)

         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="entity_matching",
+            display_name="Entity matching",
+            description="Scenario from Magellan [(Konda et al., "
+            "2016)](https://dl.acm.org/doi/10.14778/3007263.3007314) that tests the ability "
+            "to determine if two entities match.",
+            taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )
```
helm/benchmark/scenarios/exams_multilingual_scenario.py (new file)

````diff
@@ -0,0 +1,115 @@
+import os
+from typing import Dict, List
+
+import datasets
+
+from helm.common.general import ensure_directory_exists
+from helm.benchmark.scenarios.scenario import (
+    VALID_SPLIT,
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    TRAIN_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+from helm.common.hierarchical_logger import hwarn
+
+
+class EXAMSMultilingualScenario(Scenario):
+    """EXAMS: A Multi-subject High School Examinations Dataset
+
+    EXAMS is a benchmark dataset for multilingual and cross-lingual
+    question answering from high school examinations. It consists of
+    more than 24,000 high-quality high school exam questions in 16
+    languages, covering 8 language families and 24 school subjects
+    from Natural Sciences and Social Sciences, among others.
+
+    - https://huggingface.co/datasets/mhardalov/exams
+    - https://aclanthology.org/2020.emnlp-main.438/
+
+    Note: Some dataset rows have the value '@' in the `answerKey` column.
+    These rows will be ignored.
+
+    ```
+    @inproceedings{hardalov-etal-2020-exams,
+        title = "{EXAMS}: A Multi-subject High School Examinations Dataset for Cross-lingual and Multilingual Question Answering",
+        author = "Hardalov, Momchil and
+        Mihaylov, Todor and
+        Zlatkova, Dimitrina and
+        Dinkov, Yoan and
+        Koychev, Ivan and
+        Nakov, Preslav",
+        editor = "Webber, Bonnie and
+        Cohn, Trevor and
+        He, Yulan and
+        Liu, Yang",
+        booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)",
+        month = nov,
+        year = "2020",
+        address = "Online",
+        publisher = "Association for Computational Linguistics",
+        url = "https://aclanthology.org/2020.emnlp-main.438/",
+        doi = "10.18653/v1/2020.emnlp-main.438",
+        pages = "5427--5444",
+        abstract = "We propose EXAMS {--} a new benchmark dataset for cross-lingual and multilingual question answering for high school examinations. We collected more than 24,000 high-quality high school exam questions in 16 languages, covering 8 language families and 24 school subjects from Natural Sciences and Social Sciences, among others.EXAMS offers unique fine-grained evaluation framework across multiple languages and subjects, which allows precise analysis and comparison of the proposed models. We perform various experiments with existing top-performing multilingual pre-trained models and show that EXAMS offers multiple challenges that require multilingual knowledge and reasoning in multiple domains. We hope that EXAMS will enable researchers to explore challenging reasoning and knowledge transfer methods and pre-trained models for school question answering in various languages which was not possible by now. The data, code, pre-trained models, and evaluation are available at http://github.com/mhardalov/exams-qa."
+    }```
+    """ # noqa: E501
+
+    name = "exams_multilingual"
+    description = "EXAMS is a benchmark dataset for multilingual and cross-lingual question answering from high school examinations. "  # noqa: E501
+    tags = ["knowledge", "multiple_choice"]
+
+    CHOICES = ["A", "B", "C", "D", "E"]
+    HF_SPLIT_TO_HELM_SPLIT = {"train": TRAIN_SPLIT, "test": TEST_SPLIT, "validation": VALID_SPLIT}
+
+    def __init__(self, language: str, subject: str):
+        super().__init__()
+        self.language = language
+        self.subject = subject
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+        dataset_splits: Dict[str, datasets.Dataset] = datasets.load_dataset(
+            "mhardalov/exams",
+            "multilingual",
+            revision="4ff10804abb3341f8815cacd778181177bba7edd",
+            cache_dir=cache_dir,
+        )
+
+        # Read all instances
+        instances: List[Instance] = []
+        for split_name, dataset in dataset_splits.items():
+            assert isinstance(dataset, datasets.Dataset)
+            for row in dataset:
+                question = row["question"]
+                question_info = row["info"]
+                if self.subject != "all" and question_info["subject"] != self.subject:
+                    continue
+                if self.language != "all" and question_info["language"] != self.language:
+                    continue
+                input = Input(text=question["stem"])
+                references: List[Reference] = []
+                if row["answerKey"] not in self.CHOICES:
+                    hwarn(f"Invalid value in answerKey column in row: {row}")
+                    continue
+                correct_choice_index = ord(row["answerKey"]) - ord("A")
+                for choice_index, choice_text in enumerate(question["choices"]["text"]):
+                    references.append(
+                        Reference(
+                            output=Output(text=choice_text),
+                            tags=[CORRECT_TAG] if choice_index == correct_choice_index else [],
+                        )
+                    )
+                instance = Instance(
+                    id=row["id"],
+                    input=input,
+                    references=references,
+                    split=self.HF_SPLIT_TO_HELM_SPLIT[split_name],
+                )
+                instances.append(instance)
+
+        return instances
````
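The answer-key handling in the new EXAMS scenario is the one non-obvious step: letter keys map to 0-based choice indices by character arithmetic, and rows whose `answerKey` falls outside `CHOICES` (such as the '@' values called out in the docstring) are warned about and skipped. A standalone sketch of that logic (`answer_index` is a hypothetical helper name, not part of the package):

```python
# Standalone sketch of the answerKey handling in the scenario above;
# answer_index is a hypothetical helper, not part of crfm-helm.
from typing import Optional

CHOICES = ["A", "B", "C", "D", "E"]


def answer_index(answer_key: str) -> Optional[int]:
    """Map a letter key to a 0-based choice index, or None for invalid
    keys such as '@' (the scenario warns and skips those rows)."""
    if answer_key not in CHOICES:
        return None
    return ord(answer_key) - ord("A")


assert answer_index("C") == 2
assert answer_index("@") is None
```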
helm/benchmark/scenarios/fin_qa_scenario.py

```diff
@@ -2,6 +2,7 @@ import os
 import json
 from typing import List

+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_directory_exists, ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -12,6 +13,7 @@ from helm.benchmark.scenarios.scenario import (
     TRAIN_SPLIT,
     TEST_SPLIT,
     CORRECT_TAG,
+    ScenarioMetadata,
 )


@@ -117,3 +119,21 @@ class FinQAScenario(Scenario):
             )
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="fin_qa",
+            display_name="FinQA",
+            description="The FinQA benchmark for numeric reasoning over financial data, with question "
+            "answering pairs written by financial experts over financial reports [(Chen et "
+            "al., 2021)](https://arxiv.org/abs/2109.00122/).",
+            taxonomy=TaxonomyInfo(
+                task="question answering with numeric reasoning",
+                what="financial reports",
+                when="1999 to 2019",
+                who="financial experts",
+                language="English",
+            ),
+            main_metric="program_accuracy",
+            main_split="test",
+        )
```
helm/benchmark/scenarios/financebench_scenario.py

```diff
@@ -4,6 +4,7 @@ import os
 import random
 from typing import List

+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     TRAIN_SPLIT,
@@ -13,6 +14,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_directory_exists, ensure_file_downloaded

@@ -51,3 +53,22 @@ class FinanceBenchScenario(Scenario):
         for train_index in train_indexes:
             instances[train_index] = dataclasses.replace(instances[train_index], split=TRAIN_SPLIT)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="financebench",
+            display_name="FinanceBench",
+            description="FinanceBench is a benchmark for open book financial question answering. It "
+            "comprises 10,231 questions about publicly traded companies, with corresponding "
+            "answers and evidence strings [(Islam et al., "
+            "2023)](https://arxiv.org/abs/2311.11944/).",
+            taxonomy=TaxonomyInfo(
+                task="question answering with numeric reasoning",
+                what="financial reports",
+                when="2015 to 2023",
+                who="financial experts",
+                language="English",
+            ),
+            main_metric="annotation_financebench_label_correct_answer",
+            main_split="test",
+        )
```
helm/benchmark/scenarios/financial_phrasebank_scenario.py

```diff
@@ -2,6 +2,7 @@ import os
 import random
 from typing import List

+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -12,6 +13,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )


@@ -92,3 +94,22 @@ Possible labels:\n1. positive\n2. neutral\n3. negative""" # noqa: E501
             )
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="financial_phrasebank",
+            display_name="Financial Phrasebank (Sentiment Classification)",
+            short_display_name=None,
+            description="A sentiment classification benchmark based on the dataset from Good Debt or "
+            "Bad Debt - Detecting Semantic Orientations in Economic Texts [(Malo et al., "
+            "2013)](https://arxiv.org/abs/1307.5336).",
+            taxonomy=TaxonomyInfo(
+                task="sentiment analysis",
+                what="phrases from financial news texts and company press releases",
+                when="before 2013",
+                who="annotators with adequate business education background",
+                language="English",
+            ),
+            main_metric="classification_weighted_f1",
+            main_split="test",
+        )
```
helm/benchmark/scenarios/gold_commodity_news_scenario.py

```diff
@@ -6,6 +6,7 @@ from typing import List

 import pandas as pd

+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.runner import TRAIN_SPLIT
 from helm.common.general import ensure_directory_exists, ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
@@ -16,6 +17,7 @@ from helm.benchmark.scenarios.scenario import (
     Reference,
     Scenario,
     Output,
+    ScenarioMetadata,
 )


@@ -122,3 +124,22 @@ class GoldCommodityNewsScenario(Scenario):
         for train_index in train_indexes:
             instances[train_index] = dataclasses.replace(instances[train_index], split=TRAIN_SPLIT)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="gold_commodity_news",
+            display_name="Gold Commodity News",
+            short_display_name=None,
+            description="A classification benchmark based on a dataset of human-annotated gold "
+            "commodity news headlines ([Sinha & Khandait, "
+            "2019](https://arxiv.org/abs/2009.04202)).",
+            taxonomy=TaxonomyInfo(
+                task="text classification",
+                what="gold commodity news headlines",
+                when="2000-2019",
+                who="financial journalists",
+                language="English",
+            ),
+            main_metric="classification_weighted_f1",
+            main_split="test",
+        )
```
helm/benchmark/scenarios/gpqa_scenario.py

```diff
@@ -2,6 +2,7 @@ import datasets
 import os
 import random
 from typing import List
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_directory_exists

@@ -78,3 +80,19 @@ class GPQAScenario(Scenario):
             instances.append(instance)

         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name=self.name,
+            display_name="GPQA",
+            description=self.description,
+            main_metric="chain_of_thought_correctness",
+            main_split="test",
+            taxonomy=TaxonomyInfo(
+                task="question answering",
+                what="complex questions across various disciplines",
+                who="domain experts",
+                when="2024",
+                language="English",
+            ),
+        )
```
helm/benchmark/scenarios/grammar_scenario.py

```diff
@@ -1,6 +1,7 @@
 from typing import List

-from helm.benchmark.
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT, ScenarioMetadata
 from helm.benchmark.scenarios.grammar import read_grammar, generate_derivations, Derivation, get_values, get_tags


@@ -41,3 +42,21 @@ class GrammarScenario(Scenario):
         instances: List[Instance] = list(map(derivation_to_instance, derivations))

         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="grammar",
+            display_name="Best ChatGPT Prompts",
+            short_display_name="Best ChatGPT Prompts",
+            description="A list of “best ChatGPT prompts to power your workflow” summarized by "
+            "[GRIDFITI](https://gridfiti.com/best-chatgpt-prompts/).",
+            taxonomy=TaxonomyInfo(
+                task="open-ended instruction following",
+                what="Instructions for LLMs",
+                when="2023",
+                who="Gridfiti Staff",
+                language="English",
+            ),
+            main_metric="Helpfulness",
+            main_split="test",
+        )
```
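Taken together, these hunks establish a convention that the release applies across scenarios: implement `get_metadata` and return a `ScenarioMetadata` describing the benchmark. A minimal hypothetical scenario following that convention (the class and all field values are invented for illustration):

```python
# Hypothetical example of the get_metadata convention in this diff;
# MyScenario and its metadata values are invented, not from crfm-helm.
from typing import List

from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
from helm.benchmark.scenarios.scenario import Instance, Scenario, ScenarioMetadata


class MyScenario(Scenario):
    name = "my_scenario"
    description = "An illustrative scenario."
    tags = ["example"]

    def get_instances(self, output_path: str) -> List[Instance]:
        return []  # a real scenario would build Instances here

    def get_metadata(self) -> ScenarioMetadata:
        return ScenarioMetadata(
            name="my_scenario",
            display_name="My Scenario",
            description="An illustrative scenario.",
            taxonomy=TaxonomyInfo(task="question answering", language="English"),
            main_metric="exact_match",
            main_split="test",
        )
```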