crfm-helm 0.5.6__py3-none-any.whl → 0.5.10__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to their public registry. The information is provided for informational purposes only.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (394)
  1. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/METADATA +72 -130
  2. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/RECORD +372 -305
  3. helm/benchmark/adaptation/adapter_spec.py +10 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  8. helm/benchmark/annotation/alrage_annotator.py +90 -0
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  10. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  11. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  12. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  13. helm/benchmark/annotation/medalign_annotator.py +11 -22
  14. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  15. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  16. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  17. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  18. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  19. helm/benchmark/annotation/model_as_judge.py +23 -18
  20. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  21. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  22. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  23. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  24. helm/benchmark/metrics/alrage_metric.py +35 -0
  25. helm/benchmark/metrics/basic_metrics.py +267 -2
  26. helm/benchmark/metrics/bbq_metrics.py +12 -0
  27. helm/benchmark/metrics/classification_metrics.py +19 -1
  28. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  29. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  30. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  31. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  32. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  33. helm/benchmark/metrics/comet_metric.py +1 -1
  34. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  35. helm/benchmark/metrics/copyright_metrics.py +1 -1
  36. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  37. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  38. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  39. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  40. helm/benchmark/metrics/evaluate_reference_metrics.py +312 -1
  41. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  42. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  43. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  44. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  45. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  46. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  47. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  48. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  49. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  50. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  51. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  52. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  53. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  54. helm/benchmark/metrics/medec_metrics.py +25 -2
  55. helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
  56. helm/benchmark/metrics/metric.py +25 -0
  57. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  58. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  59. helm/benchmark/metrics/safety_metrics.py +13 -1
  60. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  61. helm/benchmark/metrics/summac/model_summac.py +3 -3
  62. helm/benchmark/metrics/summarization_metrics.py +129 -1
  63. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  64. helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
  65. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  66. helm/benchmark/model_deployment_registry.py +11 -19
  67. helm/benchmark/presentation/create_plots.py +11 -2
  68. helm/benchmark/presentation/run_display.py +13 -3
  69. helm/benchmark/presentation/run_entry.py +2 -2
  70. helm/benchmark/presentation/schema.py +10 -22
  71. helm/benchmark/presentation/summarize.py +189 -14
  72. helm/benchmark/presentation/taxonomy_info.py +20 -0
  73. helm/benchmark/presentation/test_create_plots.py +4 -1
  74. helm/benchmark/run.py +15 -4
  75. helm/benchmark/run_expander.py +4 -0
  76. helm/benchmark/run_specs/arabic_run_specs.py +197 -0
  77. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  78. helm/benchmark/run_specs/classic_run_specs.py +2 -55
  79. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  80. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  81. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  82. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  83. helm/benchmark/run_specs/long_context_run_specs.py +48 -1
  84. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  85. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  86. helm/benchmark/run_specs/medhelm_run_specs.py +363 -53
  87. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  88. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +11 -13
  89. helm/benchmark/runner.py +7 -0
  90. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  91. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  92. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  93. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  94. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  95. helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
  96. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  97. helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
  98. helm/benchmark/scenarios/aratrust_scenario.py +95 -0
  99. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  100. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  101. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
  102. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
  103. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -53
  104. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -21
  105. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -52
  106. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  107. helm/benchmark/scenarios/banking77_scenario.py +21 -0
  108. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  109. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  110. helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
  111. helm/benchmark/scenarios/bluex_scenario.py +70 -0
  112. helm/benchmark/scenarios/bold_scenario.py +15 -0
  113. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  114. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  115. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  116. helm/benchmark/scenarios/clear_scenario.py +23 -0
  117. helm/benchmark/scenarios/cleva_scenario.py +480 -1
  118. helm/benchmark/scenarios/code_scenario.py +28 -0
  119. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  120. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  121. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  122. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  123. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  124. helm/benchmark/scenarios/commonsense_scenario.py +32 -0
  125. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  126. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  127. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  128. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  129. helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
  130. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  131. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  132. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  133. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  134. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  135. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  136. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  137. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  138. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  139. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  140. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  141. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  142. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  143. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  144. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  145. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  146. helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
  147. helm/benchmark/scenarios/financebench_scenario.py +21 -0
  148. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  149. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  150. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  151. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  152. helm/benchmark/scenarios/gsm_scenario.py +21 -0
  153. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
  154. helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
  155. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  156. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  157. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  158. helm/benchmark/scenarios/ice_scenario.py +21 -1
  159. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  160. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  161. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
  162. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
  163. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
  164. helm/benchmark/scenarios/koala_scenario.py +21 -1
  165. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  166. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  167. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  168. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  169. helm/benchmark/scenarios/legalbench_scenario.py +19 -0
  170. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  171. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  172. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  173. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  174. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  175. helm/benchmark/scenarios/math_scenario.py +54 -20
  176. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  177. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  178. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  179. helm/benchmark/scenarios/med_qa_scenario.py +20 -0
  180. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  181. helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
  182. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  183. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  184. helm/benchmark/scenarios/medec_scenario.py +23 -0
  185. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  186. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  187. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  188. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  189. helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
  190. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  191. helm/benchmark/scenarios/melt_scenarios.py +2 -2
  192. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  193. helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
  194. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  195. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  196. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  197. helm/benchmark/scenarios/mmlu_scenario.py +21 -0
  198. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  199. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  200. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  201. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  202. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  203. helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
  204. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  205. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  206. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  207. helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
  208. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  209. helm/benchmark/scenarios/quac_scenario.py +14 -0
  210. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  211. helm/benchmark/scenarios/raft_scenario.py +15 -0
  212. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  213. helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
  214. helm/benchmark/scenarios/scenario.py +31 -0
  215. helm/benchmark/scenarios/seahelm_scenario.py +350 -2
  216. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  217. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  218. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  219. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  220. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  221. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  222. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  223. helm/benchmark/scenarios/shc_proxy_scenario.py +23 -1
  224. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  225. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  226. helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
  227. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  228. helm/benchmark/scenarios/spider_scenario.py +18 -0
  229. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  230. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  231. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  232. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  233. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  234. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  235. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  236. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  237. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  238. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  239. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  240. helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
  241. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  242. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  243. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  244. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  245. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  246. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  247. helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
  248. helm/benchmark/slurm_jobs.py +1 -2
  249. helm/benchmark/slurm_runner.py +8 -1
  250. helm/benchmark/static/schema_arabic.yaml +271 -0
  251. helm/benchmark/static/schema_classic.yaml +0 -17
  252. helm/benchmark/static/schema_long_context.yaml +17 -18
  253. helm/benchmark/static/schema_medhelm.yaml +36 -0
  254. helm/benchmark/static/schema_slp.yaml +219 -0
  255. helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
  256. helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
  257. helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
  258. helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
  259. helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
  260. helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
  261. helm/benchmark/static_build/index.html +5 -6
  262. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  263. helm/clients/ai21_client.py +2 -0
  264. helm/clients/aleph_alpha_client.py +2 -0
  265. helm/clients/anthropic_client.py +7 -1
  266. helm/clients/audio_language/diva_llama_client.py +2 -0
  267. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  268. helm/clients/audio_language/llama_omni/constants.py +9 -0
  269. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  270. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  271. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  272. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  273. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  274. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  275. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  276. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  277. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  278. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  279. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  280. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  281. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  282. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  283. helm/clients/audio_language/llama_omni/utils.py +202 -0
  284. helm/clients/audio_language/llama_omni_client.py +2 -1
  285. helm/clients/audio_language/qwen2_5_omni_client.py +21 -8
  286. helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
  287. helm/clients/audio_language/qwen_audiolm_client.py +2 -1
  288. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  289. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  290. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  291. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  292. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  293. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  294. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  295. helm/clients/bedrock_client.py +63 -6
  296. helm/clients/cohere_client.py +3 -0
  297. helm/clients/dspy_client.py +135 -0
  298. helm/clients/google_client.py +2 -0
  299. helm/clients/http_model_client.py +2 -0
  300. helm/clients/huggingface_client.py +4 -3
  301. helm/clients/ibm_client.py +3 -1
  302. helm/clients/image_generation/adobe_vision_client.py +2 -0
  303. helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
  304. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  305. helm/clients/image_generation/cogview2_client.py +2 -1
  306. helm/clients/image_generation/dalle2_client.py +2 -0
  307. helm/clients/image_generation/dalle_mini_client.py +2 -1
  308. helm/clients/image_generation/deep_floyd_client.py +2 -0
  309. helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
  310. helm/clients/image_generation/lexica_client.py +2 -0
  311. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  312. helm/clients/image_generation/mindalle_client.py +2 -1
  313. helm/clients/image_generation/together_image_generation_client.py +2 -0
  314. helm/clients/megatron_client.py +2 -0
  315. helm/clients/mistral_client.py +2 -0
  316. helm/clients/moderation_api_client.py +2 -0
  317. helm/clients/openai_client.py +38 -21
  318. helm/clients/openai_responses_client.py +34 -8
  319. helm/clients/openrouter_client.py +31 -0
  320. helm/clients/palmyra_client.py +2 -1
  321. helm/clients/reka_client.py +2 -1
  322. helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
  323. helm/clients/stanfordhealthcare_http_model_client.py +2 -0
  324. helm/clients/test_huggingface_client.py +3 -3
  325. helm/clients/test_openrouter_client.py +69 -0
  326. helm/clients/together_client.py +52 -13
  327. helm/clients/vertexai_client.py +23 -11
  328. helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
  329. helm/clients/vision_language/huggingface_vlm_client.py +2 -0
  330. helm/clients/vision_language/idefics_client.py +2 -1
  331. helm/clients/vision_language/open_flamingo_client.py +2 -1
  332. helm/clients/vision_language/paligemma_client.py +2 -1
  333. helm/clients/vision_language/palmyra_vision_client.py +2 -0
  334. helm/clients/vision_language/qwen2_vlm_client.py +2 -1
  335. helm/clients/vision_language/qwen_vlm_client.py +2 -1
  336. helm/clients/vllm_client.py +43 -7
  337. helm/clients/vllm_granite_thinking_client.py +56 -0
  338. helm/clients/writer_client.py +5 -2
  339. helm/common/critique_request.py +0 -1
  340. helm/common/hierarchical_logger.py +103 -34
  341. helm/common/object_spec.py +23 -8
  342. helm/common/optional_dependencies.py +1 -1
  343. helm/common/test_general.py +4 -0
  344. helm/common/test_logging.py +94 -0
  345. helm/config/model_deployments.yaml +1001 -187
  346. helm/config/model_metadata.yaml +602 -18
  347. helm/config/tokenizer_configs.yaml +202 -5
  348. helm/proxy/cli.py +1 -1
  349. helm/proxy/example_queries.py +8 -8
  350. helm/proxy/retry.py +5 -0
  351. helm/proxy/server.py +2 -1
  352. helm/proxy/static/index.css +4 -0
  353. helm/proxy/static/index.js +7 -1
  354. helm/tokenizers/auto_tokenizer.py +2 -2
  355. helm/tokenizers/grok_tokenizer.py +2 -0
  356. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  357. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  358. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  359. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  360. helm/benchmark/metrics/medalign_metrics.py +0 -14
  361. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  362. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  363. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  364. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  365. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  366. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  367. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  368. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  369. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  370. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  371. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +0 -103
  372. helm/benchmark/scenarios/numeracy_scenario.py +0 -794
  373. helm/benchmark/static_build/assets/index-94295e78.js +0 -10
  374. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  375. helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
  376. helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
  377. helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
  378. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +0 -0
  379. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
  380. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/licenses/LICENSE +0 -0
  381. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
  382. /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
  383. /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
  384. /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
  385. /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
  386. /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
  387. /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
  388. /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
  389. /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
  390. /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
  391. /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
  392. /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
  393. /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
  394. /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
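
The scenario hunks below repeatedly construct a TaxonomyInfo, imported from the new helm/benchmark/presentation/taxonomy_info.py (entry 72 above, +20 lines), which is not itself shown in this excerpt. As a reading aid, here is a hedged sketch of what that module plausibly contains, inferred purely from the keyword arguments used at the call sites below:

    # Hedged reconstruction, not the packaged source: the field names come from
    # the call sites in the hunks below; the Optional[str] defaults are an assumption.
    from dataclasses import dataclass
    from typing import Optional


    @dataclass(frozen=True)
    class TaxonomyInfo:
        task: Optional[str] = None      # e.g. "question answering"
        what: Optional[str] = None      # e.g. "tweets"
        when: Optional[str] = None      # time period of the data; "?" where unknown
        who: Optional[str] = None       # text producers, e.g. "Twitter users"
        language: Optional[str] = None  # e.g. "Indonesian"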

helm/benchmark/scenarios/seahelm_scenario.py
@@ -5,6 +5,7 @@ from typing import List, Dict
 
 import pandas as pd
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Input,
     Instance,
@@ -15,6 +16,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     TEST_SPLIT,
     TRAIN_SPLIT,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_file_downloaded
 from helm.common.hierarchical_logger import hlog
@@ -129,6 +131,27 @@ class TyDiQAScenario(Scenario):
             outputs.append(instance)
         return outputs
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="tydiqa",
+            display_name="TyDiQA",
+            short_display_name=None,
+            description="TyDiQA [(Clark, 2020)](https://aclanthology.org/2020.tacl-1.30) is an "
+            "open-book question answering dataset for 11 typologically-diverse languages. "
+            "The questions are written by people who want to know the answer, but do not "
+            "know the answer yet, and the data is collected directly in each language "
+            "without the use of translation.\n",
+            taxonomy=TaxonomyInfo(
+                task="question answering",
+                what="questions by human annotators about Wikipedia articles",
+                when="?",
+                who="human annotators",
+                language="Indonesian",
+            ),
+            main_metric="squad_f1_score",
+            main_split="test",
+        )
+
 
 # 1.2 Vietnamese & Thai: XQuAD
 class XQuADScenario(Scenario):
@@ -232,6 +255,28 @@ class XQuADScenario(Scenario):
             outputs.append(instance)
         return outputs
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name=f"xquad_{self.language}",
+            display_name=f"XQuAD ({self.language})",
+            short_display_name=None,
+            description="XQuAD [(Artetxe, 2019)](https://arxiv.org/abs/1910.11856) is an open-book "
+            "question answering dataset that is parallel across 10 languages. The dataset "
+            "consists of a subset of 240 paragraphs and 1190 question-answer pairs from the "
+            "development set of SQuAD v1.1 (Rajpurkar et al., 2016) together with their "
+            "professional translations.\n",
+            taxonomy=TaxonomyInfo(
+                task="question answering",
+                what="questions by crowdworkers about Wikipedia articles translated "
+                f"from English to {self.language}",
+                when="?",
+                who="?",
+                language=self.language,
+            ),
+            main_metric="squad_f1_score",
+            main_split="test",
+        )
+
 
 # 1.3 Tamil: IndicQA
 class IndicQAScenario(Scenario):
@@ -341,6 +386,27 @@ class IndicQAScenario(Scenario):
             outputs.append(instance)
         return outputs
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="indicqa",
+            display_name="IndicQA",
+            short_display_name=None,
+            description="IndicQA [(Doddapaneni, 2023)](https://aclanthology.org/2023.acl-long.693) is an "
+            "open-book question answering dataset for 11 Indic languages. Answers to "
+            "questions are to be extracted from the text provided. The data is taken from "
+            "Wikipedia articles across various domains and questions and answers were "
+            "manually created by native speakers.\n",
+            taxonomy=TaxonomyInfo(
+                task="question answering",
+                what="questions about Wikipedia articles translated by native " "speakers from English to Tamil",
+                when="?",
+                who="?",
+                language="Tamil",
+            ),
+            main_metric="squad_f1_score",
+            main_split="test",
+        )
+
 
 # 2. Sentiment Analysis
 # 2.1 Indonesian: NusaX Sentiment
@@ -445,6 +511,25 @@ class NusaXScenario(Scenario):
             outputs.append(instance)
         return outputs
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="nusax",
+            display_name="NusaX",
+            short_display_name=None,
+            description="NusaX [(Winata, 2023)](https://aclanthology.org/2023.eacl-main.57) is an "
+            "Indonesian sentiment analysis dataset. The data consists of comments and "
+            "reviews from various online platforms.\n",
+            taxonomy=TaxonomyInfo(
+                task="sentiment analysis",
+                what="online comments and reviews",
+                when="?",
+                who="internet users",
+                language="Indonesian",
+            ),
+            main_metric="classification_macro_f1",
+            main_split="test",
+        )
+
 
 # 2.2 Vietnamese: UIT-VSFC
 class UITVSFCScenario(Scenario):
@@ -543,6 +628,25 @@ class UITVSFCScenario(Scenario):
             outputs.append(instance)
         return outputs
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="uitvsfc",
+            display_name="UIT-VSFC",
+            short_display_name=None,
+            description="UIT-VSFC [(Nguyen, 2018)](https://ieeexplore.ieee.org/document/8573337) is a "
+            "Vietnamese sentiment analysis dataset. The data consists of student feedback "
+            "obtained from end-of-semester surveys at a Vietnamese university.\n",
+            taxonomy=TaxonomyInfo(
+                task="sentiment analysis",
+                what="university student end-of-semester survey responses",
+                when="?",
+                who="university students",
+                language="Vietnamese",
+            ),
+            main_metric="classification_macro_f1",
+            main_split="test",
+        )
+
 
 # 2.3 Thai: Wisesight Sentiment
 class WisesightScenario(Scenario):
@@ -634,6 +738,25 @@ class WisesightScenario(Scenario):
             outputs.append(instance)
         return outputs
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="wisesight",
+            display_name="Wisesight",
+            short_display_name=None,
+            description="Wisesight [(Suriyawongkul, 2019)](https://doi.org/10.5281/zenodo.3457447) is "
+            "a Thai sentiment analysis scenario. The data consists of social media "
+            "messages regarding consumer products and services.\n",
+            taxonomy=TaxonomyInfo(
+                task="sentiment analysis",
+                what="social media messages regarding consumer products and services",
+                when="?",
+                who="social media users",
+                language="Thai",
+            ),
+            main_metric="classification_macro_f1",
+            main_split="test",
+        )
+
 
 # 2.4 Tamil: IndicSentiment
 class IndicSentimentScenario(Scenario):
@@ -723,6 +846,22 @@ class IndicSentimentScenario(Scenario):
             outputs.append(instance)
         return outputs
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="indicsentiment",
+            display_name="IndicSentiment",
+            short_display_name=None,
+            description="IndicSentiment is a Tamil sentiment analysis dataset that comes from "
+            "IndicXTREME [(Doddapaneni, "
+            "2022)](https://aclanthology.org/2023.acl-long.693/), and consists of product "
+            "reviews that were written by annotators. Labels are positive or negative.\n",
+            taxonomy=TaxonomyInfo(
+                task="sentiment analysis", what="product reviews", when="?", who="human annotators", language="Tamil"
+            ),
+            main_metric="classification_macro_f1",
+            main_split="test",
+        )
+
 
 # 3. Toxicity Detection/Classification
 # 3.1 Indonesian: Multi-Label Hate Speech Detection
@@ -835,6 +974,24 @@ class MLHSDScenario(Scenario):
             outputs.append(instance)
         return outputs
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="mlhsd",
+            display_name="MLHSD",
+            short_display_name=None,
+            description="MLHSD [(Ibrohim, 2019)](https://aclanthology.org/W19-3506) is an Indonesian "
+            "toxicity detection dataset obtained from tweets on Twitter.\n",
+            taxonomy=TaxonomyInfo(
+                task="toxicity detection/classification",
+                what="tweets",
+                when="?",
+                who="Twitter users",
+                language="Indonesian",
+            ),
+            main_metric="classification_macro_f1",
+            main_split="test",
+        )
+
 
 # 3.2 Vietnamese: ViHSD
 class ViHSDScenario(Scenario):
@@ -927,6 +1084,26 @@ class ViHSDScenario(Scenario):
             outputs.append(instance)
         return outputs
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="vihsd",
+            display_name="ViHSD",
+            short_display_name=None,
+            description="ViHSD [(Luu, "
+            "2021)](https://link.springer.com/chapter/10.1007/978-3-030-79457-6_35) is a "
+            "Vietnamese toxicity detection dataset obtained from comments on Facebook, "
+            "Youtube, Instagram, and Tiktok.\n",
+            taxonomy=TaxonomyInfo(
+                task="toxicity detection/classification",
+                what="social media comments",
+                when="?",
+                who="Social media users",
+                language="Vietnamese",
+            ),
+            main_metric="classification_macro_f1",
+            main_split="test",
+        )
+
 
 # 3.3 Thai: Thai Toxicity Tweets
 class ThaiToxicityTweetsScenario(Scenario):
@@ -1013,6 +1190,21 @@ class ThaiToxicityTweetsScenario(Scenario):
             outputs.append(instance)
         return outputs
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="thaitoxicitytweets",
+            display_name="Thai Toxicity Tweets",
+            short_display_name=None,
+            description="Thai Toxicity Tweets [(Sirihattasak, "
+            "2018)](http://www.lrec-conf.org/workshops/lrec2018/W32/pdf/1_W32.pdf) is a "
+            "Thai toxicity detection dataset obtained from tweets on Twitter.\n",
+            taxonomy=TaxonomyInfo(
+                task="toxicity detection/classification", what="tweets", when="", who="Twitter users", language="Thai"
+            ),
+            main_metric="classification_macro_f1",
+            main_split="test",
+        )
+
 
 # B. Natural Language Generation
 # 1. Machine Translation
@@ -1111,6 +1303,28 @@ class FloresScenario(Scenario):
             outputs.append(instance)
         return outputs
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name=f"flores_{self.source}_{self.target}",
+            display_name=f"Flores ({self.source} to {self.target})",
+            short_display_name=None,
+            description="Flores [(NLLB Team, "
+            "2022)](https://research.facebook.com/publications/no-language-left-behind/) "
+            "was created with professional human translators who translate the FLORES "
+            "source dataset into the target languages and a separate group of independent "
+            "translation reviewers who perform quality assessments of the human "
+            "translations and provide translation feedback to the translators.\n",
+            taxonomy=TaxonomyInfo(
+                task="machine translation",
+                what="translations from professional human translators",
+                when="?",
+                who="professional human translators",
+                language=f"{self.source}, {self.target}",
+            ),
+            main_metric="chr_f_plus_plus",
+            main_split="test",
+        )
+
 
 # C. Natural Language Reasoning
 # 1. Natural Language Inference
@@ -1207,6 +1421,26 @@ class IndoNLIScenario(Scenario):
             outputs.append(instance)
         return outputs
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="indonli",
+            display_name="IndoNLI",
+            short_display_name=None,
+            description="IndoNLI [(Mahendra, 2021)](https://aclanthology.org/2021.emnlp-main.821) is a "
+            "natural language inference dataset obtained from Wikipedia, news, and web "
+            "articles that incorporates various linguistic phenomena such as numerical "
+            "reasoning, structural changes, idioms, or temporal and spatial reasoning.\n",
+            taxonomy=TaxonomyInfo(
+                task="natural language inference",
+                what="Wikipedia, news, and web articles",
+                when="?",
+                who="?",
+                language="Indonesian",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
+
 
 # 1.2 Vietnamese & Thai: XNLI
 class XNLIScenario(Scenario):
@@ -1305,6 +1539,25 @@ class XNLIScenario(Scenario):
             outputs.append(instance)
         return outputs
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name=f"xnli_{self.language}",
+            display_name=f"XNLI ({self.language})",
+            short_display_name=None,
+            description="XNLI [(Conneau, 2018)](https://aclanthology.org/D18-1269) is a natural "
+            "language inference dataset obtained from crowdsourced NLI data then "
+            "professionally translated across 14 other languages.\n",
+            taxonomy=TaxonomyInfo(
+                task="natural language inference",
+                what="crowdsourced NLI data professionally translated",
+                when="?",
+                who="?",
+                language=self.language,
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
+
 
 # 1.3 Tamil: IndicXNLI
 class IndicXNLIScenario(Scenario):
@@ -1398,6 +1651,25 @@ class IndicXNLIScenario(Scenario):
             outputs.append(instance)
         return outputs
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="indicxnli",
+            display_name="IndicXNLI",
+            short_display_name=None,
+            description="IndicXNLI is a Tamil natural language inference dataset that comes from "
+            "IndicXTREME [(Doddapaneni, 2022)](https://aclanthology.org/2023.acl-long.693/), "
+            "which was automatically translated from XNLI into 11 Indic languages.\n",
+            taxonomy=TaxonomyInfo(
+                task="natural language inference",
+                what="crowdsourced NLI data professionally translated into Tamil",
+                when="?",
+                who="?",
+                language="Tamil",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
+
 
 # 2. Causal Reasoning: XCOPA
 class XCOPAScenario(Scenario):
@@ -1529,6 +1801,25 @@ class XCOPAScenario(Scenario):
             outputs.append(instance)
         return outputs
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name=f"xcopa_{self.language}",
+            display_name=f"XCOPA ({self.language})",
+            short_display_name=None,
+            description="XCOPA [(Ponti, 2020)](https://ducdauge.github.io/files/xcopa.pdf) is a causal "
+            "reasoning dataset, a translation and reannotation of the English COPA. English "
+            "COPA included questions that directly assess commonsense causal reasoning.\n",
+            taxonomy=TaxonomyInfo(
+                task="causal reasoning",
+                what="commonsense causal reasoning questions translated into " "Indonesian",
+                when="?",
+                who="?",
+                language=self.language,
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
+
 
 # 1. Syntax: LINDSEA Minimal Pairs
 class LINDSEASyntaxMinimalPairsScenario(Scenario):
@@ -1650,6 +1941,26 @@ class LINDSEASyntaxMinimalPairsScenario(Scenario):
             outputs.append(instance)
         return outputs
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name=f"lindsea_syntax_minimal_pairs_{self.language}",
+            display_name="LINDSEA Syntax Minimal Pairs",
+            short_display_name=None,
+            description="LINDSEA minimal pairs is a linguistic diagnostic for syntax dataset from BHASA "
+            "[(Leong, 2023)](https://arxiv.org/abs/2309.06085), involving pairs of "
+            "sentences that differ minimally from each other and contrast in grammatical "
+            "acceptability.\n",
+            taxonomy=TaxonomyInfo(
+                task="minimal pairs",
+                what="sentence pairs with minimal differences and contrasting " "grammatical acceptability",
+                when="?",
+                who="?",
+                language=self.language,
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
+
 
 # 2.1 Pragmatics: LINDSEA Presuppositions
 class LINDSEAPragmaticsPresuppositionsScenario(Scenario):
@@ -1750,7 +2061,7 @@ class LINDSEAPragmaticsPresuppositionsScenario(Scenario):
         text_noun = self.prompt_components["text_noun"]
         instruction = self.prompt_components["single_instruction"]
 
-        passage = "{question}\{text_noun}: {text}\n{instruction}".format(
+        passage = "{question}\\{text_noun}: {text}\n{instruction}".format(
             question=question.format(row["question_translated"]),
             text_noun=text_noun,
             text=row["text"],
@@ -1798,6 +2109,24 @@ class LINDSEAPragmaticsPresuppositionsScenario(Scenario):
             outputs.append(instance)
         return outputs
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name=f"lindsea_pragmatics_presuppositions_{self.language}",
+            display_name="LINDSEA Pragmatics Presuppositions",
+            short_display_name=None,
+            description="LINDSEA Pragmatics Presuppositions is a linguistic diagnostic for pragmatics "
+            "dataset from BHASA [(Leong, 2023)](https://arxiv.org/abs/2309.06085), "
+            "involving two formats: single and pair sentences. For single sentence "
+            "questions, the system under test needs to determine if the sentence is "
+            "true/false. For pair sentence questions, the system under test needs to "
+            "determine whether a conclusion can be drawn from another sentence.\n",
+            taxonomy=TaxonomyInfo(
+                task="pragmatic reasoning", what="presuppositions", when="?", who="?", language=self.language
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
+
 
 # 2.2 Pragmatics: LINDSEA Scalar Implicatures
 class LINDSEAPragmaticsScalarImplicaturesScenario(Scenario):
@@ -1898,7 +2227,7 @@ class LINDSEAPragmaticsScalarImplicaturesScenario(Scenario):
         text_noun = self.prompt_components["text_noun"]
         instruction = self.prompt_components["single_instruction"]
 
-        passage = "{question}\{text_noun}: {text}\n{instruction}".format(
+        passage = "{question}\\{text_noun}: {text}\n{instruction}".format(
             question=question.format(row["question_translated"]),
             text_noun=text_noun,
             text=row["text"],
@@ -1945,3 +2274,22 @@ class LINDSEAPragmaticsScalarImplicaturesScenario(Scenario):
             )
             outputs.append(instance)
         return outputs
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name=f"lindsea_pragmatics_scalar_implicatures_{self.language}",
+            display_name="LINDSEA Pragmatics Scalar Implicatures",
+            short_display_name=None,
+            description="LINDSEA Pragmatics Scalar Implicatures is a linguistic diagnostic for "
+            "pragmatics dataset from BHASA [(Leong, "
+            "2023)](https://arxiv.org/abs/2309.06085), involving two formats: single and "
+            "pair sentences. For single sentence questions, the system under test needs to "
+            "determine if the sentence is true/false. For pair sentence questions, the "
+            "system under test needs to determine whether a conclusion can be drawn from "
+            "another sentence.\n",
+            taxonomy=TaxonomyInfo(
+                task="pragmatic reasoning", what="scalar implicatures", when="?", who="?", language=self.language
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
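
Each get_metadata implementation above returns a ScenarioMetadata built from the same handful of keyword arguments. A hedged sketch of its likely shape, inferred from these call sites rather than from the packaged helm/benchmark/scenarios/scenario.py (the SHC hunks further down omit short_display_name, so that field presumably has a default):

    # Hedged sketch of ScenarioMetadata as exported from
    # helm.benchmark.scenarios.scenario; inferred from call sites in this diff.
    from dataclasses import dataclass
    from typing import Optional

    from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo


    @dataclass(frozen=True)
    class ScenarioMetadata:
        name: str                                 # machine-readable id, e.g. "tydiqa"
        display_name: str                         # human-readable name, e.g. "TyDiQA"
        description: str                          # markdown description with citation link
        short_display_name: Optional[str] = None
        taxonomy: Optional[TaxonomyInfo] = None
        main_metric: Optional[str] = None         # e.g. "squad_f1_score"
        main_split: Optional[str] = None          # e.g. "test"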

helm/benchmark/scenarios/self_instruct_scenario.py
@@ -2,8 +2,18 @@ import json
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
-from helm.benchmark.scenarios.scenario import CORRECT_TAG, Reference, Scenario, Instance, Input, TEST_SPLIT, Output
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    Reference,
+    Scenario,
+    Instance,
+    Input,
+    TEST_SPLIT,
+    Output,
+    ScenarioMetadata,
+)
 
 
 class SelfInstructScenario(Scenario):
@@ -46,3 +56,21 @@ class SelfInstructScenario(Scenario):
             )
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="self_instruct",
+            display_name="Self Instruct",
+            short_display_name="Self Instruct",
+            description="The manually-curated instructions from the Self-Instruct paper ([Wang et al., "
+            "2023](https://aclanthology.org/2023.acl-long.754.pdf)).",
+            taxonomy=TaxonomyInfo(
+                task="open-ended instruction following",
+                what="Instructions for LLMs",
+                when="2022",
+                who="Authors of the research paper",
+                language="English",
+            ),
+            main_metric="Helpfulness",
+            main_split="test",
+        )

helm/benchmark/scenarios/shc_bmt_scenario.py
@@ -2,6 +2,7 @@ import sys
 import csv
 from typing import Dict, List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Input,
     Scenario,
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Reference,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import check_file_exists
 
@@ -73,3 +75,23 @@ class SHCBMTMedScenario(Scenario):
             )
 
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="shc_bmt_med",
+            display_name="BMT-Status",
+            description="BMT-Status is a benchmark composed of clinical notes and associated binary "
+            "questions related to bone marrow transplant (BMT), hematopoietic stem cell "
+            "transplant (HSCT), or hematopoietic cell transplant (HCT) status. The goal is "
+            "to determine whether the patient received a subsequent transplant based on the "
+            "provided clinical documentation.",
+            taxonomy=TaxonomyInfo(
+                task="question answering",
+                what="Answer bone marrow transplant questions",
+                when="Any",
+                who="Researcher",
+                language="English",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )

helm/benchmark/scenarios/shc_cdi_scenario.py
@@ -2,6 +2,7 @@ import sys
 import csv
 from typing import Dict, List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Input,
     Scenario,
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Reference,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import check_file_exists
 
@@ -73,3 +75,21 @@ class SHCCDIMedScenario(Scenario):
             )
 
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="shc_cdi_med",
+            display_name="CDI-QA",
+            description="CDI-QA is a benchmark constructed from Clinical Documentation Integrity (CDI) "
+            "notes. It is used to evaluate a model's ability to verify clinical conditions "
+            "based on documented evidence in patient records.",
+            taxonomy=TaxonomyInfo(
+                task="Classification",
+                what="Answer verification questions from CDI notes",
+                when="Any",
+                who="Hospital Administrator",
+                language="English",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )

helm/benchmark/scenarios/shc_conf_scenario.py
@@ -2,6 +2,7 @@ import sys
 import csv
 from typing import Dict, List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Input,
     Scenario,
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Reference,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import check_file_exists
 
@@ -74,3 +76,24 @@ class SHCCONFMedScenario(Scenario):
             )
 
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="shc_conf_med",
+            display_name="MedConfInfo",
+            description="MedConfInfo is a benchmark comprising clinical notes from adolescent patients. "
+            "It is used to evaluate whether the content contains sensitive protected health "
+            "information (PHI) that should be restricted from parental access, in "
+            "accordance with adolescent confidentiality policies in clinical care "
+            "[(Rabbani et al., "
+            "2024)](https://jamanetwork.com/journals/jamapediatrics/fullarticle/2814109).",
+            taxonomy=TaxonomyInfo(
+                task="Classification",
+                what="Identify sensitive health info in adolescent notes",
+                when="Any",
+                who="Clinician",
+                language="English",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
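
Taken together, these hunks give each scenario a self-describing metadata hook. For illustration only (the helper below is hypothetical and not part of crfm-helm), a caller could use the new API to print a one-line summary per scenario, assuming the ScenarioMetadata fields sketched earlier:

    # Illustrative usage sketch; print_metadata_summary is hypothetical and
    # assumes the ScenarioMetadata shape inferred from the call sites above.
    from typing import Iterable

    from helm.benchmark.scenarios.scenario import Scenario, ScenarioMetadata


    def print_metadata_summary(scenarios: Iterable[Scenario]) -> None:
        for scenario in scenarios:
            md: ScenarioMetadata = scenario.get_metadata()
            language = md.taxonomy.language if md.taxonomy else "?"
            print(f"{md.name}: {md.main_metric} on '{md.main_split}' split ({language})")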