crfm-helm 0.5.6-py3-none-any.whl → 0.5.10-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crfm-helm might be problematic.
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/METADATA +72 -130
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/RECORD +372 -305
- helm/benchmark/adaptation/adapter_spec.py +10 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
- helm/benchmark/annotation/aci_bench_annotator.py +11 -22
- helm/benchmark/annotation/air_bench_annotator.py +1 -1
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
- helm/benchmark/annotation/dischargeme_annotator.py +11 -22
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/med_dialog_annotator.py +11 -22
- helm/benchmark/annotation/medalign_annotator.py +11 -22
- helm/benchmark/annotation/medi_qa_annotator.py +11 -22
- helm/benchmark/annotation/medication_qa_annotator.py +11 -22
- helm/benchmark/annotation/mental_health_annotator.py +11 -22
- helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
- helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
- helm/benchmark/annotation/model_as_judge.py +23 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
- helm/benchmark/metrics/air_bench_metrics.py +3157 -1
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/basic_metrics.py +267 -2
- helm/benchmark/metrics/bbq_metrics.py +12 -0
- helm/benchmark/metrics/classification_metrics.py +19 -1
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +1 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
- helm/benchmark/metrics/copyright_metrics.py +1 -1
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
- helm/benchmark/metrics/dry_run_metrics.py +30 -1
- helm/benchmark/metrics/efficiency_metrics.py +74 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +312 -1
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
- helm/benchmark/metrics/ifeval_metrics.py +13 -1
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
- helm/benchmark/metrics/language_modeling_metrics.py +13 -1
- helm/benchmark/metrics/live_qa_metrics.py +13 -1
- helm/benchmark/metrics/llm_jury_metrics.py +13 -1
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
- helm/benchmark/metrics/medec_metrics.py +25 -2
- helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
- helm/benchmark/metrics/metric.py +25 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
- helm/benchmark/metrics/omni_math_metrics.py +13 -1
- helm/benchmark/metrics/safety_metrics.py +13 -1
- helm/benchmark/metrics/seahelm_metrics.py +14 -1
- helm/benchmark/metrics/summac/model_summac.py +3 -3
- helm/benchmark/metrics/summarization_metrics.py +129 -1
- helm/benchmark/metrics/toxicity_metrics.py +31 -1
- helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
- helm/benchmark/metrics/wildbench_metrics.py +21 -1
- helm/benchmark/model_deployment_registry.py +11 -19
- helm/benchmark/presentation/create_plots.py +11 -2
- helm/benchmark/presentation/run_display.py +13 -3
- helm/benchmark/presentation/run_entry.py +2 -2
- helm/benchmark/presentation/schema.py +10 -22
- helm/benchmark/presentation/summarize.py +189 -14
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/run.py +15 -4
- helm/benchmark/run_expander.py +4 -0
- helm/benchmark/run_specs/arabic_run_specs.py +197 -0
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +2 -55
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +3 -1
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +48 -1
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +363 -53
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +11 -13
- helm/benchmark/runner.py +7 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
- helm/benchmark/scenarios/air_bench_scenario.py +21 -0
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
- helm/benchmark/scenarios/aratrust_scenario.py +95 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -53
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -21
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -52
- helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
- helm/benchmark/scenarios/banking77_scenario.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +15 -0
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
- helm/benchmark/scenarios/bluex_scenario.py +70 -0
- helm/benchmark/scenarios/bold_scenario.py +15 -0
- helm/benchmark/scenarios/boolq_scenario.py +20 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
- helm/benchmark/scenarios/clear_scenario.py +23 -0
- helm/benchmark/scenarios/cleva_scenario.py +480 -1
- helm/benchmark/scenarios/code_scenario.py +28 -0
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/commonsense_scenario.py +32 -0
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
- helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
- helm/benchmark/scenarios/disinformation_scenario.py +22 -0
- helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
- helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
- helm/benchmark/scenarios/financebench_scenario.py +21 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
- helm/benchmark/scenarios/gpqa_scenario.py +18 -0
- helm/benchmark/scenarios/grammar_scenario.py +20 -1
- helm/benchmark/scenarios/gsm_scenario.py +21 -0
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
- helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
- helm/benchmark/scenarios/headqa_scenario.py +22 -0
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
- helm/benchmark/scenarios/ice_scenario.py +21 -1
- helm/benchmark/scenarios/ifeval_scenario.py +18 -0
- helm/benchmark/scenarios/imdb_scenario.py +15 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/legal_support_scenario.py +13 -0
- helm/benchmark/scenarios/legalbench_scenario.py +19 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
- helm/benchmark/scenarios/lextreme_scenario.py +11 -0
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +54 -20
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
- helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
- helm/benchmark/scenarios/med_qa_scenario.py +20 -0
- helm/benchmark/scenarios/medalign_scenario.py +23 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
- helm/benchmark/scenarios/medbullets_scenario.py +22 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
- helm/benchmark/scenarios/medec_scenario.py +23 -0
- helm/benchmark/scenarios/medhallu_scenario.py +23 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
- helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
- helm/benchmark/scenarios/melt_scenarios.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +23 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
- helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
- helm/benchmark/scenarios/mmlu_scenario.py +21 -0
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/msmarco_scenario.py +30 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
- helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
- helm/benchmark/scenarios/omni_math_scenario.py +18 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
- helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
- helm/benchmark/scenarios/quac_scenario.py +14 -0
- helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
- helm/benchmark/scenarios/raft_scenario.py +15 -0
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
- helm/benchmark/scenarios/scenario.py +31 -0
- helm/benchmark/scenarios/seahelm_scenario.py +350 -2
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +23 -1
- helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/spider_scenario.py +18 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
- helm/benchmark/scenarios/summarization_scenario.py +37 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/vicuna_scenario.py +21 -1
- helm/benchmark/scenarios/wikifact_scenario.py +20 -0
- helm/benchmark/scenarios/wildbench_scenario.py +18 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +8 -1
- helm/benchmark/static/schema_arabic.yaml +271 -0
- helm/benchmark/static/schema_classic.yaml +0 -17
- helm/benchmark/static/schema_long_context.yaml +17 -18
- helm/benchmark/static/schema_medhelm.yaml +36 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
- helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
- helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
- helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
- helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
- helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
- helm/benchmark/static_build/index.html +5 -6
- helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
- helm/clients/ai21_client.py +2 -0
- helm/clients/aleph_alpha_client.py +2 -0
- helm/clients/anthropic_client.py +7 -1
- helm/clients/audio_language/diva_llama_client.py +2 -0
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/llama_omni_client.py +2 -1
- helm/clients/audio_language/qwen2_5_omni_client.py +21 -8
- helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
- helm/clients/audio_language/qwen_audiolm_client.py +2 -1
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/bedrock_client.py +63 -6
- helm/clients/cohere_client.py +3 -0
- helm/clients/dspy_client.py +135 -0
- helm/clients/google_client.py +2 -0
- helm/clients/http_model_client.py +2 -0
- helm/clients/huggingface_client.py +4 -3
- helm/clients/ibm_client.py +3 -1
- helm/clients/image_generation/adobe_vision_client.py +2 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
- helm/clients/image_generation/cogview2_client.py +2 -1
- helm/clients/image_generation/dalle2_client.py +2 -0
- helm/clients/image_generation/dalle_mini_client.py +2 -1
- helm/clients/image_generation/deep_floyd_client.py +2 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
- helm/clients/image_generation/lexica_client.py +2 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
- helm/clients/image_generation/mindalle_client.py +2 -1
- helm/clients/image_generation/together_image_generation_client.py +2 -0
- helm/clients/megatron_client.py +2 -0
- helm/clients/mistral_client.py +2 -0
- helm/clients/moderation_api_client.py +2 -0
- helm/clients/openai_client.py +38 -21
- helm/clients/openai_responses_client.py +34 -8
- helm/clients/openrouter_client.py +31 -0
- helm/clients/palmyra_client.py +2 -1
- helm/clients/reka_client.py +2 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
- helm/clients/stanfordhealthcare_http_model_client.py +2 -0
- helm/clients/test_huggingface_client.py +3 -3
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/together_client.py +52 -13
- helm/clients/vertexai_client.py +23 -11
- helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
- helm/clients/vision_language/huggingface_vlm_client.py +2 -0
- helm/clients/vision_language/idefics_client.py +2 -1
- helm/clients/vision_language/open_flamingo_client.py +2 -1
- helm/clients/vision_language/paligemma_client.py +2 -1
- helm/clients/vision_language/palmyra_vision_client.py +2 -0
- helm/clients/vision_language/qwen2_vlm_client.py +2 -1
- helm/clients/vision_language/qwen_vlm_client.py +2 -1
- helm/clients/vllm_client.py +43 -7
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/clients/writer_client.py +5 -2
- helm/common/critique_request.py +0 -1
- helm/common/hierarchical_logger.py +103 -34
- helm/common/object_spec.py +23 -8
- helm/common/optional_dependencies.py +1 -1
- helm/common/test_general.py +4 -0
- helm/common/test_logging.py +94 -0
- helm/config/model_deployments.yaml +1001 -187
- helm/config/model_metadata.yaml +602 -18
- helm/config/tokenizer_configs.yaml +202 -5
- helm/proxy/cli.py +1 -1
- helm/proxy/example_queries.py +8 -8
- helm/proxy/retry.py +5 -0
- helm/proxy/server.py +2 -1
- helm/proxy/static/index.css +4 -0
- helm/proxy/static/index.js +7 -1
- helm/tokenizers/auto_tokenizer.py +2 -2
- helm/tokenizers/grok_tokenizer.py +2 -0
- helm/benchmark/metrics/aci_bench_metrics.py +0 -14
- helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
- helm/benchmark/metrics/dischargeme_metrics.py +0 -14
- helm/benchmark/metrics/med_dialog_metrics.py +0 -14
- helm/benchmark/metrics/medalign_metrics.py +0 -14
- helm/benchmark/metrics/medi_qa_metrics.py +0 -14
- helm/benchmark/metrics/medication_qa_metrics.py +0 -14
- helm/benchmark/metrics/mental_health_metrics.py +0 -14
- helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
- helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +0 -103
- helm/benchmark/scenarios/numeracy_scenario.py +0 -794
- helm/benchmark/static_build/assets/index-94295e78.js +0 -10
- helm/benchmark/static_build/assets/index-b9779128.css +0 -1
- helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
- helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
- helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
- /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
- /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
- /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
- /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
- /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
- /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
--- a/helm/benchmark/scenarios/legal_support_scenario.py
+++ b/helm/benchmark/scenarios/legal_support_scenario.py
@@ -2,6 +2,7 @@ import json
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -13,6 +14,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -102,3 +104,14 @@ class LegalSupportScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="legal_support",
+            display_name="LegalSupport",
+            description="Scenario introduced in this work to measure fine-grained legal reasoning "
+            "through reverse entailment.",
+            taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )
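The `get_metadata` addition above is the recurring pattern in this release: scenario classes now describe themselves with a `ScenarioMetadata` record, optionally carrying a `TaxonomyInfo`, presumably feeding the reworked presentation layer (note the companion changes to `helm/benchmark/presentation/taxonomy_info.py` and `helm/benchmark/presentation/summarize.py` in the file list). The same shape repeats in the LegalBench, LexGLUE, LEXTREME, LSAT, and MATH diffs below. A minimal sketch of the pattern, using stand-in dataclasses for illustration rather than the real classes from `helm.benchmark.scenarios.scenario` and `helm.benchmark.presentation.taxonomy_info`:

```python
from dataclasses import dataclass
from typing import Optional


@dataclass(frozen=True)
class TaxonomyInfo:
    """Stand-in mirroring the fields used in the diffs in this release."""
    task: str
    what: str
    when: str
    who: str
    language: str


@dataclass(frozen=True)
class ScenarioMetadata:
    """Stand-in for helm.benchmark.scenarios.scenario.ScenarioMetadata."""
    name: str
    display_name: str
    description: str
    taxonomy: Optional[TaxonomyInfo]  # LexGLUE and LEXTREME pass None here
    main_metric: str
    main_split: str


# Field values copied from the LegalSupport diff above.
metadata = ScenarioMetadata(
    name="legal_support",
    display_name="LegalSupport",
    description="Measures fine-grained legal reasoning through reverse entailment.",
    taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
    main_metric="quasi_exact_match",
    main_split="test",
)
print(f"{metadata.display_name}: ranked by {metadata.main_metric} on the {metadata.main_split} split")
```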
--- a/helm/benchmark/scenarios/legalbench_scenario.py
+++ b/helm/benchmark/scenarios/legalbench_scenario.py
@@ -5,6 +5,7 @@ import datasets
 from pathlib import Path
 from typing import List, Dict
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -15,6 +16,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 PROMPT_SETTINGS_URL = "https://raw.githubusercontent.com/HazyResearch/legalbench/main/helm_prompt_settings.jsonl"
@@ -144,3 +146,20 @@ class LegalBenchScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name=self.name,
+            display_name="LegalBench",
+            description="LegalBench is a large collaboratively constructed benchmark of legal reasoning "
+            "tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).",
+            taxonomy=TaxonomyInfo(
+                task="multiple-choice question answering",
+                what="public legal and admininstrative documents, manually " "constructed questions",
+                when="before 2023",
+                who="lawyers",
+                language="English",
+            ),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )
--- a/helm/benchmark/scenarios/lex_glue_scenario.py
+++ b/helm/benchmark/scenarios/lex_glue_scenario.py
@@ -16,6 +16,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 ECTHR_A = "ecthr_a"
@@ -261,3 +262,13 @@ class LexGLUEScenario(Scenario):
         for subset in self.subsets:
             instances.extend(self.get_instances_for_subset(subset, output_path))
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="lex_glue",
+            display_name="LexGLUE",
+            description="A Benchmark Dataset for Legal Language Understanding in English",
+            taxonomy=None,
+            main_metric="classification_macro_f1",
+            main_split="test",
+        )
--- a/helm/benchmark/scenarios/lextreme_scenario.py
+++ b/helm/benchmark/scenarios/lextreme_scenario.py
@@ -16,6 +16,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Output,
     Input,
+    ScenarioMetadata,
 )
 
 
@@ -466,3 +467,13 @@ class LEXTREMEScenario(Scenario):
         for subset in self.subsets:
             instances.extend(self.get_instances_for_subset(subset, output_path))
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="lextreme",
+            display_name="LEXTREME",
+            description="A Multilingual Legal Benchmark for Natural Language Understanding",
+            taxonomy=None,
+            main_metric="classification_macro_f1",
+            main_split="test",
+        )
--- /dev/null
+++ b/helm/benchmark/scenarios/lmkt_scenarios.py
@@ -0,0 +1,288 @@
+"""Cultural alignment evaluation scenario based on Vietnam World Values Survey responses."""
+
+import os
+import json
+import random
+from typing import List
+from datasets import load_dataset
+from huggingface_hub import snapshot_download
+
+from helm.common.hierarchical_logger import hlog, hwarn
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+SUPPORTED_LANGUAGES = ["en", "vi"]
+
+
+class CulturalValueUnderstandingWVSScenario(Scenario):
+    """Cultural values understanding evaluation based on Vietnam World Values Survey responses."""
+
+    name = "cultural_value_understanding_wvs"
+    description = "Evaluates model understanding of cultural values from WVS Wave 7"
+    tags = ["cultural_value_understanding"]
+
+    def __init__(
+        self,
+        language: str,
+        personas_filename: str = "vn_personas_with_filtered_responses.json",
+        questions_filename: str = "wvs_questions_dump.en.json",
+        repo_id: str = "stair-lab/cultural-alignment-dataset",
+        num_personas: int = 10,
+        num_question_variants: int = 2,
+        include_few_shot_examples: bool = False,
+    ):
+        super().__init__()
+        self.language = language
+        self.personas_filename = personas_filename
+        self.questions_filename = questions_filename
+        self.repo_id = repo_id
+        self.num_personas = 10 if isinstance(num_personas, str) else num_personas
+        self.num_question_variants = min(num_question_variants, 4)
+        self.include_few_shot_examples = include_few_shot_examples
+
+        if self.language not in SUPPORTED_LANGUAGES:
+            raise ValueError(f"Unsupported language: {self.language}. Supported languages are: {SUPPORTED_LANGUAGES}")
+
+    def get_few_shot_examples(self) -> str:
+        """Return a few-shot learning examples formatted as a string."""
+        examples = """Here are some examples:
+
+Example 1:
+You are a 35 year old male from Vietnam. You have higher education and consider yourself middle class. You are married.
+
+How important is family in your life?
+
+1. Very important
+2. Rather important
+3. Not very important
+4. Not at all important
+
+Please select the option number that best represents your view.
+1
+
+Example 2:
+You are a 28 year old female from Vietnam. You have middle education and consider yourself working class. You are single.
+
+Would you say that most people can be trusted or that you need to be very careful in dealing with people?
+
+1. Most people can be trusted
+2. Need to be very careful
+
+Please select the option number that best represents your view.
+2
+
+Example 3:
+You are a 50 year old male from Vietnam. You have lower education and consider yourself lower class. You are married.
+
+Do you think that homosexuality is justifiable?
+
+1. Never justifiable
+2. Rarely justifiable
+3. Sometimes justifiable
+4. Always justifiable
+
+Please select the option number that best represents your view.
+1
+
+Now answer the following question:
+"""  # noqa: E501
+        return examples
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        """Generate test instances from Vietnam personas and WVS questions."""
+        instances: List[Instance] = []
+
+        # Download files from Hugging Face Hub
+        repo_local_path = snapshot_download(
+            repo_id=self.repo_id, repo_type="dataset", revision="fe54b6f5d75cfca5377707cd7199e39f517e3a1f"
+        )
+
+        # Load the downloaded files
+        with open(os.path.join(repo_local_path, self.personas_filename), "r", encoding="utf-8") as f:
+            personas = json.load(f)
+
+        with open(os.path.join(repo_local_path, self.questions_filename), "r", encoding="utf-8") as f:
+            questions = json.load(f)
+
+        # Get few-shot examples
+        few_shot_examples = self.get_few_shot_examples() if self.include_few_shot_examples else ""
+
+        # Sample personas
+        sampled_personas = random.sample(personas, min(self.num_personas, len(personas)))
+
+        # Create instances for each persona and question
+        for persona in sampled_personas:
+            # Get demographic info for persona description
+            persona_desc = (
+                f"You are a {persona.get('age', 'adult')} year old {persona.get('sex', 'person')} from Vietnam. "
+            )
+            persona_desc += f"You have {persona.get('education', 'some')} education and consider yourself {persona.get('social_class', 'middle class')}. "  # noqa: E501
+            persona_desc += f"You are {persona.get('marital_status', 'single')}."
+
+            # Process each question this persona answered
+            for qid, human_response in persona.get("responses", {}).items():
+                # Skip if no human response or if it's 0 (which might be a "Don't know" response)
+                if human_response is None:
+                    continue
+
+                # Convert human_response to int (if possible)
+                try:
+                    human_response_int = int(human_response)
+                except (ValueError, TypeError):
+                    # Skip if human_response can't be converted to int
+                    continue
+
+                # Get question info
+                question_data = questions.get(qid, {})
+                if not question_data:
+                    continue
+
+                # Get options directly from question data
+                q_options = question_data.get("options", [])
+                if not q_options:
+                    continue
+
+                # Skip if human_response is out of range
+                if human_response_int < 0 or human_response_int > len(q_options):
+                    continue
+
+                # Special handling for "Don't know" or zero responses
+                if human_response_int == 0:
+                    # Some questions might encode "Don't know" as 0
+                    # Skip for now, or you could add special handling
+                    continue
+
+                # Use the predefined question variations
+                question_variants = question_data.get("questions", [])
+                if not question_variants:
+                    question_variants = [f"Question {qid}: {question_data.get('description', '')}"]
+
+                # Use the specified number of variants
+                variants_to_use = min(self.num_question_variants, len(question_variants))
+                selected_variants = question_variants[:variants_to_use]
+
+                # Create instances for each selected question variant
+                for q_text in selected_variants:
+                    # Format the prompt with or without few-shot examples
+                    if self.include_few_shot_examples:
+                        prompt = f"{few_shot_examples}{persona_desc}\n\n{q_text}\n\n"
+                    else:
+                        prompt = f"{persona_desc}\n\n{q_text}\n\n"
+
+                    # Add options from question data - with numbers, not letters
+                    for i, opt in enumerate(q_options, 1):
+                        prompt += f"{i}. {opt}\n"
+
+                    prompt += "\nPlease select the option number that best represents your view. Return only the option number. Do not return anything else."  # noqa: E501
+
+                    # Create a reference with just the human response number
+                    # We don't create multiple references, just use the actual human response
+                    reference = Reference(Output(text=str(human_response_int)), tags=[CORRECT_TAG])
+
+                    # Create the instance
+                    instance = Instance(
+                        input=Input(text=prompt),
+                        references=[reference],
+                        split=TEST_SPLIT,
+                    )
+                    instances.append(instance)
+
+        if not instances:
+            hwarn("No valid instances were created. Check the input data and parameters.")
+        else:
+            hlog(f"Created {len(instances)} instances for cultural value understanding scenario.")
+
+        return instances
+
+
+class SocialNormApplicationNormADScenario(Scenario):
+    """Social norms application evaluation based on Vietnam World Values Survey responses."""
+
+    name = "social_norm_application_normad"
+    description = "Evaluates model understanding of social norms from WVS Wave 7"
+    tags = ["social_norm_application"]
+
+    def __init__(
+        self,
+        language: str,
+    ):
+        super().__init__()
+        self.language = language
+
+        if self.language not in SUPPORTED_LANGUAGES:
+            raise ValueError(f"Unsupported language: {self.language}. Supported languages are: {SUPPORTED_LANGUAGES}")
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        dataset = load_dataset(
+            "akhilayerukola/NormAd", split="train", revision="311049f5610e932792672e4e13e7df62b6290336"
+        )
+
+        instances: List[Instance] = []
+
+        for item in dataset:
+            input_text = (
+                f"Assumming that below norms and story are from {item['Country'].capitalize()}:\n"
+                f"{item['Background']}\n\n{item['Story']}\n"
+                "Return one of the following options: 'yes', 'no', or 'neutral'. Do not return anything else.\n\n"
+            )
+
+            instance = Instance(
+                input=Input(text=input_text),
+                references=[Reference(Output(text=item["Gold Label"]), tags=[CORRECT_TAG])],
+                split=TEST_SPLIT,
+            )
+            instances.append(instance)
+
+        return instances
+
+
+class SocialNormExplanationNormADScenario(Scenario):
+    """Social norms explanation evaluation based on Vietnam World Values Survey responses."""
+
+    name = "social_norm_explanation_normad"
+    description = "Evaluates model understanding of social norms from WVS Wave 7"
+    tags = ["social_norm_explanation"]
+
+    def __init__(
+        self,
+        language: str,
+    ):
+        super().__init__()
+        self.language = language
+
+        if self.language not in SUPPORTED_LANGUAGES:
+            raise ValueError(f"Unsupported language: {self.language}. Supported languages are: {SUPPORTED_LANGUAGES}")
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        dataset = load_dataset(
+            "akhilayerukola/NormAd", split="train", revision="311049f5610e932792672e4e13e7df62b6290336"
+        )
+
+        instances: List[Instance] = []
+
+        for item in dataset:
+            if not item["Explanation"]:
+                continue
+
+            input_text = (
+                f"Assumming that below norms and story are from {item['Country'].capitalize()}:\n"
+                f"{item['Background']}\n\n{item['Story']}\n"
+                f"The answer is {item['Gold Label']}. "
+                "Briefly explain the reasoning behind this answer in one or two sentences.\n\n"
+            )
+
+            instance = Instance(
+                input=Input(text=input_text),
+                references=[Reference(Output(text=item["Explanation"]), tags=[CORRECT_TAG])],
+                split=TEST_SPLIT,
+            )
+            instances.append(instance)
+
+        return instances
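The heart of the new `lmkt_scenarios.py` file is the prompt-assembly loop in `CulturalValueUnderstandingWVSScenario.get_instances`. The standalone sketch below replays that logic with an invented persona record and WVS question entry (the real data comes from the pinned `stair-lab/cultural-alignment-dataset` snapshot):

```python
# Invented example records in the shape the scenario reads from JSON.
persona = {"age": 35, "sex": "male", "education": "higher",
           "social_class": "middle class", "marital_status": "married"}
question = {
    "questions": ["How important is family in your life?"],
    "options": ["Very important", "Rather important", "Not very important", "Not at all important"],
}

# Demographic preamble, built the same way as in the diff above.
persona_desc = (
    f"You are a {persona.get('age', 'adult')} year old {persona.get('sex', 'person')} from Vietnam. "
    f"You have {persona.get('education', 'some')} education and consider yourself "
    f"{persona.get('social_class', 'middle class')}. "
    f"You are {persona.get('marital_status', 'single')}."
)

prompt = f"{persona_desc}\n\n{question['questions'][0]}\n\n"
for i, opt in enumerate(question["options"], 1):  # options are numbered, not lettered
    prompt += f"{i}. {opt}\n"
prompt += "\nPlease select the option number that best represents your view. Return only the option number. Do not return anything else."
print(prompt)
```

Note that the gold reference is the surveyed human's own option number, so the scenario scores agreement with real respondents rather than a single objectively correct answer; the NormAd scenarios likewise pin `load_dataset` to a fixed revision so results stay reproducible if the upstream dataset changes.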
--- a/helm/benchmark/scenarios/lsat_qa_scenario.py
+++ b/helm/benchmark/scenarios/lsat_qa_scenario.py
@@ -2,6 +2,7 @@ import os
 import json
 from typing import Dict, List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -13,6 +14,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     PassageQuestionInput,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -143,3 +145,15 @@ class LSATScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="lsat_qa",
+            display_name="LSAT",
+            description="The LSAT benchmark for measuring analytical reasoning on the Law School "
+            "Admission Test (LSAT; [Zhong et al., "
+            "2021](https://arxiv.org/pdf/2104.06598.pdf)).",
+            taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )
--- /dev/null
+++ b/helm/benchmark/scenarios/madinah_qa_scenario.py
@@ -0,0 +1,73 @@
+import os
+from typing import Dict, List
+
+import datasets
+
+from helm.common.general import ensure_directory_exists
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    TRAIN_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+
+class MadinahQAScenario(Scenario):
+    """MadinahQA Scenario"""
+
+    name = "madinah_qa"
+    description = "Arabic language competency benchmark"
+    tags = ["language", "multiple_choice"]
+
+    OPTIONS = ["A", "B", "C", "D"]
+    HF_SPLIT_TO_HELM_SPLIT = {"dev": TRAIN_SPLIT, "test": TEST_SPLIT}
+    SUBSETS = ["Arabic Language (General)", "Arabic Language (Grammar)"]
+
+    def __init__(self, subset: str):
+        super().__init__()
+        subset = subset.replace("_", " ")
+        if subset not in self.SUBSETS:
+            raise Exception(f"Unknown subset: {subset}")
+        self.subset = subset
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+        instances: List[Instance] = []
+        dataset_splits: Dict[str, datasets.Dataset] = datasets.load_dataset(
+            "MBZUAI/MadinahQA",
+            self.subset,
+            revision="62e7c86ac5c07245a5a952722691d77ddb41f695",
+            cache_dir=cache_dir,
+        )
+
+        # Read all instances
+        for split_name, dataset in dataset_splits.items():
+            assert isinstance(dataset, datasets.Dataset)
+            for row_index, row in enumerate(dataset):
+                input = Input(text=row["Question"])
+                references: List[Reference] = []
+                correct_option_index = ord(row["Answer Key"]) - ord("A") + 1
+                for option_index in range(1, 6):
+                    column_name = f"Option {option_index}"
+                    if not row[column_name]:
+                        continue
+                    references.append(
+                        Reference(
+                            output=Output(text=row[column_name]),
+                            tags=[CORRECT_TAG] if option_index == correct_option_index else [],
+                        )
+                    )
+                instance = Instance(
+                    id=f"id{row_index}",
+                    input=input,
+                    references=references,
+                    split=self.HF_SPLIT_TO_HELM_SPLIT[split_name],
+                )
+                instances.append(instance)
+
+        return instances
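The loader above maps a letter answer key onto 1-based `Option N` columns and silently skips empty columns, so questions may carry anywhere from two to five options. A small sketch of that mapping with an invented row (field values are placeholders, not real MadinahQA data):

```python
# Invented row in the shape the HF dataset provides.
row = {"Question": "...", "Answer Key": "C",
       "Option 1": "first", "Option 2": "second", "Option 3": "third",
       "Option 4": "fourth", "Option 5": ""}

correct_option_index = ord(row["Answer Key"]) - ord("A") + 1  # "A" -> 1, "B" -> 2, ...
for option_index in range(1, 6):
    text = row[f"Option {option_index}"]
    if not text:
        continue  # some questions have fewer than five options
    marker = "correct" if option_index == correct_option_index else "       "
    print(marker, text)
```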
--- a/helm/benchmark/scenarios/math_scenario.py
+++ b/helm/benchmark/scenarios/math_scenario.py
@@ -4,6 +4,7 @@ import typing
 from typing import Dict, List, Optional
 from datasets import load_dataset, DatasetDict
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -14,17 +15,19 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
 def remove_boxed(string: str) -> Optional[str]:
-    """Source: https://github.com/hendrycks/math
+    r"""Source: https://github.com/hendrycks/math
 
-    Extract the text within a
+    Extract the text within a \boxed{...} environment.
 
     Example:
-
-
+    >>> from helm.benchmark.scenarios.math_scenario import *  # NOQA
+    >>> remove_boxed(r'\boxed{\frac{2}{3}}')
+    '\\frac{2}{3}'
     """
     left = "\\boxed{"
     try:
@@ -68,17 +71,17 @@ def last_boxed_only_string(string: str) -> Optional[str]:
 
 
 def _fix_fracs(string: str) -> str:
-    """Source: https://github.com/hendrycks/math
+    r"""Source: https://github.com/hendrycks/math
 
     Reformat fractions.
 
     Examples:
-
-
-
-
-
-
+    >>> _fix_fracs(r"\frac1b")
+    '\\frac{1}{b}'
+    >>> _fix_fracs(r"\frac12")
+    '\\frac{1}{2}'
+    >>> _fix_fracs(r"\frac1{72}")
+    '\\frac{1}{72}'
     """
     substrs = string.split("\\frac")
     new_str = substrs[0]
@@ -112,13 +115,13 @@ def _fix_fracs(string: str) -> str:
 
 
 def _fix_a_slash_b(string: str) -> str:
-    """Source: https://github.com/hendrycks/math
+    r"""Source: https://github.com/hendrycks/math
 
     Reformat fractions formatted as a/b to \\frac{a}{b}.
 
     Example:
-
-
+    >>> _fix_a_slash_b(r"2/3")
+    '\\frac{2}{3}'
     """
     if len(string.split("/")) != 2:
         return string
@@ -149,13 +152,13 @@ def _remove_right_units(string: str) -> str:
 
 
 def _fix_sqrt(string: str) -> str:
-    """Source: https://github.com/hendrycks/math
+    r"""Source: https://github.com/hendrycks/math
 
     Reformat square roots.
 
     Example:
-
-
+    >>> _fix_sqrt("\\sqrt3")
+    '\\sqrt{3}'
     """
     if "\\sqrt" not in string:
         return string
@@ -210,7 +213,7 @@ def _strip_string(string: str) -> str:
 
     # remove percentage
     string = string.replace("\\%", "")
-    string = string.replace("\%", "")
+    string = string.replace(r"\%", "")
 
     # " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string
     string = string.replace(" .", " 0.")
@@ -391,13 +394,13 @@ class MATHScenario(Scenario):
         for split, split_name in zip([TRAIN_SPLIT, TEST_SPLIT], ["train", "test"]):
             if split == TRAIN_SPLIT and self.use_official_examples:
                 train_instances = [
-                    ("What is
+                    ("What is $\\left(\\frac{7}{8}\\right)^3 \\cdot \\left(\\frac{7}{8}\\right)^{-3}$?", "1"),
                     (
                         "In how many ways can 4 books be selected from a shelf of 6 books"
                         + " if the order in which the books are selected does not matter?",
                         "15",
                     ),
-                    ("Find the distance between the points $(2,1,-4)$ and $(5,8,-3).$", "
+                    ("Find the distance between the points $(2,1,-4)$ and $(5,8,-3).$", "\\sqrt{59}"),
                     (
                         "The faces of an octahedral die are labeled with digits $1$ through $8$."
                         + " What is the probability, expressed as a common fraction,"
@@ -449,3 +452,34 @@ class MATHScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        taxonomy = TaxonomyInfo(
+            task="numeric answer question answering",
+            what="math competitions (AMC, AIME, etc.)",
+            when="before 2021",
+            who="problem setters",
+            language="synthetic",
+        )
+        if self.use_chain_of_thought:
+            return ScenarioMetadata(
+                name="math_chain_of_thought",
+                display_name="MATH",
+                description="The MATH benchmark for measuring mathematical problem solving on competition "
+                "math problems with chain-of-thought style reasoning [(Hendrycks et al., "
+                "2021)](https://arxiv.org/pdf/2103.03874.pdf).",
+                taxonomy=taxonomy,
+                main_metric="math_equiv_chain_of_thought",
+                main_split="test",
+            )
+        else:
+            return ScenarioMetadata(
+                name="math_regular",
+                display_name="MATH",
+                description="The MATH benchmark for measuring mathematical problem solving on competition "
+                "math problems [(Hendrycks et al., "
+                "2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/be83ab3ecd0db773eb2dc1b0a17836a1-Abstract-round2.html).",
+                taxonomy=taxonomy,
+                main_metric="math_equiv",
+                main_split="test",
+            )
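The docstring edits in `math_scenario.py` all serve one purpose: converting plain docstrings to raw docstrings (`r"""..."""`) so that backslash-heavy LaTeX such as `\boxed{...}` and `\frac{a}{b}` survives literally, which is what lets the restored doctest examples match. A quick illustration of why the `r` prefix matters (the doctest command shown is illustrative):

```python
# In a plain string "\b" is an escape sequence (backspace), so LaTeX
# markup silently loses characters; a raw string keeps the backslash.
print(len("\boxed"))   # 5 -- "\b" collapsed into one backspace character
print(len(r"\boxed"))  # 6 -- the backslash is preserved

# With raw docstrings in place, the examples added in this diff can be
# checked with the standard library doctest runner:
#   python -m doctest helm/benchmark/scenarios/math_scenario.py -v
```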