crfm-helm 0.5.6__py3-none-any.whl → 0.5.10__py3-none-any.whl
This diff reflects the changes between publicly released package versions as they appear in their public registries and is provided for informational purposes only.
Potentially problematic release.
This version of crfm-helm might be problematic.
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/METADATA +72 -130
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/RECORD +372 -305
- helm/benchmark/adaptation/adapter_spec.py +10 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
- helm/benchmark/annotation/aci_bench_annotator.py +11 -22
- helm/benchmark/annotation/air_bench_annotator.py +1 -1
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
- helm/benchmark/annotation/dischargeme_annotator.py +11 -22
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/med_dialog_annotator.py +11 -22
- helm/benchmark/annotation/medalign_annotator.py +11 -22
- helm/benchmark/annotation/medi_qa_annotator.py +11 -22
- helm/benchmark/annotation/medication_qa_annotator.py +11 -22
- helm/benchmark/annotation/mental_health_annotator.py +11 -22
- helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
- helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
- helm/benchmark/annotation/model_as_judge.py +23 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
- helm/benchmark/metrics/air_bench_metrics.py +3157 -1
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/basic_metrics.py +267 -2
- helm/benchmark/metrics/bbq_metrics.py +12 -0
- helm/benchmark/metrics/classification_metrics.py +19 -1
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +1 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
- helm/benchmark/metrics/copyright_metrics.py +1 -1
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
- helm/benchmark/metrics/dry_run_metrics.py +30 -1
- helm/benchmark/metrics/efficiency_metrics.py +74 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +312 -1
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
- helm/benchmark/metrics/ifeval_metrics.py +13 -1
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
- helm/benchmark/metrics/language_modeling_metrics.py +13 -1
- helm/benchmark/metrics/live_qa_metrics.py +13 -1
- helm/benchmark/metrics/llm_jury_metrics.py +13 -1
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
- helm/benchmark/metrics/medec_metrics.py +25 -2
- helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
- helm/benchmark/metrics/metric.py +25 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
- helm/benchmark/metrics/omni_math_metrics.py +13 -1
- helm/benchmark/metrics/safety_metrics.py +13 -1
- helm/benchmark/metrics/seahelm_metrics.py +14 -1
- helm/benchmark/metrics/summac/model_summac.py +3 -3
- helm/benchmark/metrics/summarization_metrics.py +129 -1
- helm/benchmark/metrics/toxicity_metrics.py +31 -1
- helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
- helm/benchmark/metrics/wildbench_metrics.py +21 -1
- helm/benchmark/model_deployment_registry.py +11 -19
- helm/benchmark/presentation/create_plots.py +11 -2
- helm/benchmark/presentation/run_display.py +13 -3
- helm/benchmark/presentation/run_entry.py +2 -2
- helm/benchmark/presentation/schema.py +10 -22
- helm/benchmark/presentation/summarize.py +189 -14
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/run.py +15 -4
- helm/benchmark/run_expander.py +4 -0
- helm/benchmark/run_specs/arabic_run_specs.py +197 -0
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +2 -55
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +3 -1
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +48 -1
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +363 -53
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +11 -13
- helm/benchmark/runner.py +7 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
- helm/benchmark/scenarios/air_bench_scenario.py +21 -0
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
- helm/benchmark/scenarios/aratrust_scenario.py +95 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -53
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -21
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -52
- helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
- helm/benchmark/scenarios/banking77_scenario.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +15 -0
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
- helm/benchmark/scenarios/bluex_scenario.py +70 -0
- helm/benchmark/scenarios/bold_scenario.py +15 -0
- helm/benchmark/scenarios/boolq_scenario.py +20 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
- helm/benchmark/scenarios/clear_scenario.py +23 -0
- helm/benchmark/scenarios/cleva_scenario.py +480 -1
- helm/benchmark/scenarios/code_scenario.py +28 -0
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/commonsense_scenario.py +32 -0
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
- helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
- helm/benchmark/scenarios/disinformation_scenario.py +22 -0
- helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
- helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
- helm/benchmark/scenarios/financebench_scenario.py +21 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
- helm/benchmark/scenarios/gpqa_scenario.py +18 -0
- helm/benchmark/scenarios/grammar_scenario.py +20 -1
- helm/benchmark/scenarios/gsm_scenario.py +21 -0
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
- helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
- helm/benchmark/scenarios/headqa_scenario.py +22 -0
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
- helm/benchmark/scenarios/ice_scenario.py +21 -1
- helm/benchmark/scenarios/ifeval_scenario.py +18 -0
- helm/benchmark/scenarios/imdb_scenario.py +15 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/legal_support_scenario.py +13 -0
- helm/benchmark/scenarios/legalbench_scenario.py +19 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
- helm/benchmark/scenarios/lextreme_scenario.py +11 -0
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +54 -20
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
- helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
- helm/benchmark/scenarios/med_qa_scenario.py +20 -0
- helm/benchmark/scenarios/medalign_scenario.py +23 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
- helm/benchmark/scenarios/medbullets_scenario.py +22 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
- helm/benchmark/scenarios/medec_scenario.py +23 -0
- helm/benchmark/scenarios/medhallu_scenario.py +23 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
- helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
- helm/benchmark/scenarios/melt_scenarios.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +23 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
- helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
- helm/benchmark/scenarios/mmlu_scenario.py +21 -0
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/msmarco_scenario.py +30 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
- helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
- helm/benchmark/scenarios/omni_math_scenario.py +18 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
- helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
- helm/benchmark/scenarios/quac_scenario.py +14 -0
- helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
- helm/benchmark/scenarios/raft_scenario.py +15 -0
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
- helm/benchmark/scenarios/scenario.py +31 -0
- helm/benchmark/scenarios/seahelm_scenario.py +350 -2
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +23 -1
- helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/spider_scenario.py +18 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
- helm/benchmark/scenarios/summarization_scenario.py +37 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/vicuna_scenario.py +21 -1
- helm/benchmark/scenarios/wikifact_scenario.py +20 -0
- helm/benchmark/scenarios/wildbench_scenario.py +18 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +8 -1
- helm/benchmark/static/schema_arabic.yaml +271 -0
- helm/benchmark/static/schema_classic.yaml +0 -17
- helm/benchmark/static/schema_long_context.yaml +17 -18
- helm/benchmark/static/schema_medhelm.yaml +36 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
- helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
- helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
- helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
- helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
- helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
- helm/benchmark/static_build/index.html +5 -6
- helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
- helm/clients/ai21_client.py +2 -0
- helm/clients/aleph_alpha_client.py +2 -0
- helm/clients/anthropic_client.py +7 -1
- helm/clients/audio_language/diva_llama_client.py +2 -0
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/llama_omni_client.py +2 -1
- helm/clients/audio_language/qwen2_5_omni_client.py +21 -8
- helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
- helm/clients/audio_language/qwen_audiolm_client.py +2 -1
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/bedrock_client.py +63 -6
- helm/clients/cohere_client.py +3 -0
- helm/clients/dspy_client.py +135 -0
- helm/clients/google_client.py +2 -0
- helm/clients/http_model_client.py +2 -0
- helm/clients/huggingface_client.py +4 -3
- helm/clients/ibm_client.py +3 -1
- helm/clients/image_generation/adobe_vision_client.py +2 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
- helm/clients/image_generation/cogview2_client.py +2 -1
- helm/clients/image_generation/dalle2_client.py +2 -0
- helm/clients/image_generation/dalle_mini_client.py +2 -1
- helm/clients/image_generation/deep_floyd_client.py +2 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
- helm/clients/image_generation/lexica_client.py +2 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
- helm/clients/image_generation/mindalle_client.py +2 -1
- helm/clients/image_generation/together_image_generation_client.py +2 -0
- helm/clients/megatron_client.py +2 -0
- helm/clients/mistral_client.py +2 -0
- helm/clients/moderation_api_client.py +2 -0
- helm/clients/openai_client.py +38 -21
- helm/clients/openai_responses_client.py +34 -8
- helm/clients/openrouter_client.py +31 -0
- helm/clients/palmyra_client.py +2 -1
- helm/clients/reka_client.py +2 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
- helm/clients/stanfordhealthcare_http_model_client.py +2 -0
- helm/clients/test_huggingface_client.py +3 -3
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/together_client.py +52 -13
- helm/clients/vertexai_client.py +23 -11
- helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
- helm/clients/vision_language/huggingface_vlm_client.py +2 -0
- helm/clients/vision_language/idefics_client.py +2 -1
- helm/clients/vision_language/open_flamingo_client.py +2 -1
- helm/clients/vision_language/paligemma_client.py +2 -1
- helm/clients/vision_language/palmyra_vision_client.py +2 -0
- helm/clients/vision_language/qwen2_vlm_client.py +2 -1
- helm/clients/vision_language/qwen_vlm_client.py +2 -1
- helm/clients/vllm_client.py +43 -7
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/clients/writer_client.py +5 -2
- helm/common/critique_request.py +0 -1
- helm/common/hierarchical_logger.py +103 -34
- helm/common/object_spec.py +23 -8
- helm/common/optional_dependencies.py +1 -1
- helm/common/test_general.py +4 -0
- helm/common/test_logging.py +94 -0
- helm/config/model_deployments.yaml +1001 -187
- helm/config/model_metadata.yaml +602 -18
- helm/config/tokenizer_configs.yaml +202 -5
- helm/proxy/cli.py +1 -1
- helm/proxy/example_queries.py +8 -8
- helm/proxy/retry.py +5 -0
- helm/proxy/server.py +2 -1
- helm/proxy/static/index.css +4 -0
- helm/proxy/static/index.js +7 -1
- helm/tokenizers/auto_tokenizer.py +2 -2
- helm/tokenizers/grok_tokenizer.py +2 -0
- helm/benchmark/metrics/aci_bench_metrics.py +0 -14
- helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
- helm/benchmark/metrics/dischargeme_metrics.py +0 -14
- helm/benchmark/metrics/med_dialog_metrics.py +0 -14
- helm/benchmark/metrics/medalign_metrics.py +0 -14
- helm/benchmark/metrics/medi_qa_metrics.py +0 -14
- helm/benchmark/metrics/medication_qa_metrics.py +0 -14
- helm/benchmark/metrics/mental_health_metrics.py +0 -14
- helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
- helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +0 -103
- helm/benchmark/scenarios/numeracy_scenario.py +0 -794
- helm/benchmark/static_build/assets/index-94295e78.js +0 -10
- helm/benchmark/static_build/assets/index-b9779128.css +0 -1
- helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
- helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
- helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
- /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
- /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
- /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
- /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
- /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
- /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
helm/benchmark/metrics/alrage_metric.py (new file) +35 -0
@@ -0,0 +1,35 @@
+from typing import List
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics.metric import Metric, MetricMetadata
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+
+
+class ALRAGEMetric(Metric):
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        assert request_state.annotations
+        assert "alrage" in request_state.annotations
+        return [
+            Stat(MetricName("alrage_score")).add(request_state.annotations["alrage"]["score"]),
+        ]
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="alrage_score",
+                display_name="ALRAGE Score",
+                short_display_name="Score",
+                description="Score of the output judged by GPT-4o.",
+                lower_is_better=False,
+                group="accuracy",
+            ),
+        ]
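The metric above only reads a judge score that the ALRAGE annotator (helm/benchmark/annotation/alrage_annotator.py, also added in this release) is assumed to have attached beforehand. A minimal sketch of the annotation shape it consumes, with a made-up value:

# Hypothetical illustration only; the real annotation is produced by the ALRAGE judge annotator.
annotations = {"alrage": {"score": 0.8}}
assert "alrage" in annotations
score = annotations["alrage"]["score"]  # recorded as Stat(MetricName("alrage_score"))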
helm/benchmark/metrics/basic_metrics.py +267 -2
@@ -8,7 +8,7 @@ import numpy as np
 import scipy  # type: ignore
 import calibration as cal  # type: ignore
 from helm.benchmark.adaptation.scenario_state import ScenarioState
-from helm.benchmark.metrics.evaluate_reference_metrics import compute_reference_metrics
+from helm.benchmark.metrics.evaluate_reference_metrics import compute_reference_metrics, get_reference_metrics_metadata
 from helm.benchmark.metrics.efficiency_metrics import EfficiencyMetric
 from helm.benchmark.metrics.reference_metric import ReferenceMetric

@@ -25,7 +25,14 @@ from helm.benchmark.window_services.window_service import WindowService
 from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
 from helm.benchmark.window_services.tokenizer_service import TokenizerService
 from helm.benchmark.scenarios.scenario import CORRECT_TAG, Instance
-from helm.benchmark.metrics.metric import
+from helm.benchmark.metrics.metric import (
+    Metric,
+    MetricInterface,
+    MetricMetadata,
+    MetricResult,
+    add_context,
+    get_unique_stat_by_name,
+)
 from helm.benchmark.metrics.metric_name import MetricContext, MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat, merge_stat
@@ -104,6 +111,35 @@ def compute_perplexity_metrics(stats: Dict[MetricName, Stat]) -> List[Stat]:
     return derived_stats


+def _get_perplexity_metrics_metadata() -> List[MetricMetadata]:
+    return [
+        MetricMetadata(
+            name="perplexity",
+            display_name="Perplexity",
+            short_display_name="PPL",
+            description="Perplexity of the output completion (effective branching factor per output token).",
+            lower_is_better=True,
+            group=None,
+        ),
+        MetricMetadata(
+            name="logprob_per_byte",
+            display_name="Log probability / byte",
+            short_display_name="Logprob/byte",
+            description="Predicted output's average log probability normalized by the number of bytes.",
+            lower_is_better=False,
+            group=None,
+        ),
+        MetricMetadata(
+            name="bits_per_byte",
+            display_name="Bits/byte",
+            short_display_name="BPB",
+            description="Average number of bits per byte according to model probabilities.",
+            lower_is_better=True,
+            group=None,
+        ),
+    ]
+
+
 class InstancesPerSplitMetric(MetricInterface):
     """Report the average num_instances in each MetricContext across train_trials."""

@@ -133,6 +169,16 @@ class InstancesPerSplitMetric(MetricInterface):
         # There are no per-instance Stats.
         return MetricResult(list(global_stats.values()), [])

+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="num_instances",
+                display_name="# eval",
+                description="Number of evaluation instances.",
+                lower_is_better=None,
+            )
+        ]
+

 class BasicGenerationMetric(Metric):
     """
@@ -180,6 +226,15 @@ class BasicGenerationMetric(Metric):
         derived_stats.extend(compute_calibration_metrics(per_instance_stats))
         return derived_stats

+    def get_metadata(self) -> List[MetricMetadata]:
+        return (
+            get_request_state_metrics_metadata(self.efficiency_metric)
+            + get_reference_metrics_metadata(self.names)
+            + _get_language_modeling_metrics_metadata()
+            + _get_perplexity_metrics_metadata()
+            + _get_calibration_metrics_metadata()
+        )
+

 class BasicReferenceMetric(ReferenceMetric):
     """
@@ -295,6 +350,33 @@ class BasicReferenceMetric(ReferenceMetric):
         )
         return stats

+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="max_prob",
+                display_name="Max prob",
+                description="Model's average confidence in its prediction (only computed for classification tasks)",
+                lower_is_better=False,
+                group="calibration_detailed",
+            ),
+            MetricMetadata(
+                name="exact_match",
+                display_name="Exact match",
+                short_display_name="EM",
+                description="Fraction of instances that the predicted output matches a correct reference exactly.",
+                lower_is_better=False,
+                group="accuracy",
+            ),
+            MetricMetadata(
+                name="predicted_index",
+                display_name="Predicted index",
+                description="Integer index of the reference (0, 1, ...) that was predicted by the model (for "
+                "multiple-choice).",
+                lower_is_better=None,
+                group=None,
+            ),
+        ]
+

 def compute_request_state_metrics(
     efficiency_metric: EfficiencyMetric,
@@ -319,6 +401,34 @@ def compute_request_state_metrics(
     return stats


+def get_request_state_metrics_metadata(
+    efficiency_metric: EfficiencyMetric,
+) -> List[MetricMetadata]:
+    metric_metadata = [
+        MetricMetadata(
+            name="num_references",
+            display_name="# ref",
+            description="Number of references.",
+            lower_is_better=None,
+            group=None,
+        ),
+        MetricMetadata(
+            name="num_train_trials",
+            display_name="# trials",
+            description="Number of trials, where in each trial we choose an independent, random set of training "
+            "instances.",
+            lower_is_better=None,
+            group="general_information",
+        ),
+    ]
+    return (
+        metric_metadata
+        + efficiency_metric.get_metadata()
+        + _get_finish_reason_metrics_metadata()
+        + _get_truncation_metrics_metadata()
+    )
+
+
 def _compute_finish_reason_metrics(
     adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService
 ) -> List[Stat]:
@@ -341,6 +451,40 @@ def _compute_finish_reason_metrics(
     ]


+def _get_finish_reason_metrics_metadata():
+    return [
+        MetricMetadata(
+            name="finish_reason_endoftext",
+            display_name="finish b/c endoftext",
+            description="Fraction of instances where the the output was terminated because the end of text token "
+            "was generated.",
+            lower_is_better=None,
+            group=None,
+        ),
+        MetricMetadata(
+            name="finish_reason_length",
+            display_name="finish b/c length",
+            description="Fraction of instances where the the output was terminated because of the max tokens limit.",
+            lower_is_better=None,
+            group=None,
+        ),
+        MetricMetadata(
+            name="finish_reason_stop",
+            display_name="finish b/c stop",
+            description="Fraction of instances where the the output was terminated because of the stop sequences.",
+            lower_is_better=None,
+            group=None,
+        ),
+        MetricMetadata(
+            name="finish_reason_unknown",
+            display_name="finish b/c unknown",
+            description="Fraction of instances where the the output was terminated for unknown reasons.",
+            lower_is_better=None,
+            group=None,
+        ),
+    ]
+
+
 def _compute_truncation_metrics(
     adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService
 ) -> List[Stat]:
@@ -354,6 +498,26 @@ def _compute_truncation_metrics(
     ]


+def _get_truncation_metrics_metadata() -> List[MetricMetadata]:
+    return [
+        MetricMetadata(
+            name="num_train_instances",
+            display_name="# train",
+            description="Number of training instances (e.g., in-context examples).",
+            lower_is_better=None,
+        ),
+        MetricMetadata(
+            name="prompt_truncated",
+            display_name="truncated",
+            description="Fraction of instances where the "
+            "prompt itself was truncated (implies "
+            "that there were no in-context "
+            "examples).",
+            lower_is_better=None,
+        ),
+    ]
+
+
 def compute_language_modeling_metrics(
     adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService
 ) -> List[Stat]:
@@ -387,6 +551,30 @@ def compute_language_modeling_metrics(
     ]


+def _get_language_modeling_metrics_metadata() -> List[MetricMetadata]:
+    return [
+        MetricMetadata(
+            name="logprob",
+            display_name="Log probability",
+            short_display_name="Logprob",
+            description="Predicted output's average log probability (input's log prob for language modeling).",
+            lower_is_better=False,
+        ),
+        MetricMetadata(
+            name="num_perplexity_tokens",
+            display_name="# tokens",
+            description="Average number of tokens in the predicted output (for language modeling, the input too).",
+            lower_is_better=None,
+        ),
+        MetricMetadata(
+            name="num_bytes",
+            display_name="# bytes",
+            description="Average number of bytes in the predicted output (for language modeling, the input too).",
+            lower_is_better=None,
+        ),
+    ]
+
+
 def _has_non_zero_valued_logprobs(per_instance_stats: Dict[Instance, List[Stat]]) -> bool:
     """Return whether the per-instance stats contain non-zero-valued logprobs.

@@ -448,3 +636,80 @@ def compute_calibration_metrics(per_instance_stats: Dict[Instance, List[Stat]])
     stats.append(Stat(MetricName("platt_ece_1_bin")).add(platt_ece_1_bin))

     return stats
+
+
+def _get_calibration_metrics_metadata() -> List[MetricMetadata]:
+    return [
+        MetricMetadata(
+            name="ece_10_bin",
+            display_name="10-bin expected calibration error",
+            short_display_name="ECE (10-bin)",
+            description="The average difference between the model's confidence and accuracy, averaged across 10 "
+            "bins where each bin contains an equal number of points (only computed for classification "
+            "tasks). Warning - not reliable for small datasets (e.g., with < 300 examples) because "
+            "each bin will have very few examples.",
+            lower_is_better=True,
+            group="calibration",
+        ),
+        MetricMetadata(
+            name="ece_1_bin",
+            display_name="1-bin expected calibration error",
+            short_display_name="ECE (1-bin)",
+            description="The (absolute value) difference between the model's average confidence and accuracy "
+            "(only computed for classification tasks).",
+            lower_is_better=True,
+            group="calibration_detailed",
+        ),
+        MetricMetadata(
+            name="selective_acc@10",
+            display_name="Accuracy at 10% coverage",
+            short_display_name="Acc@10%",
+            description="The accuracy for the 10% of predictions that the model is most confident on (only "
+            "computed for classification tasks).",
+            lower_is_better=False,
+            group="calibration_detailed",
+        ),
+        MetricMetadata(
+            name="selective_cov_acc_area",
+            display_name="Selective coverage-accuracy area",
+            short_display_name="Selective Acc",
+            description="The area under the coverage-accuracy curve, a standard selective classification metric "
+            "(only computed for classification tasks).",
+            lower_is_better=False,
+            group="calibration_detailed",
+        ),
+        MetricMetadata(
+            name="platt_coef",
+            display_name="Platt Scaling Coefficient",
+            short_display_name="Platt Coef",
+            description="Coefficient of the Platt scaling classifier (can compare this across tasks).",
+            lower_is_better=False,
+            group="calibration_detailed",
+        ),
+        MetricMetadata(
+            name="platt_intercept",
+            display_name="Platt Scaling Intercept",
+            short_display_name="Platt Intercept",
+            description="Intercept of the Platt scaling classifier (can compare this across tasks).",
+            lower_is_better=False,
+            group="calibration_detailed",
+        ),
+        MetricMetadata(
+            name="platt_ece_10_bin",
+            display_name="10-bin Expected Calibration Error (after Platt scaling)",
+            short_display_name="Platt-scaled ECE (10-bin)",
+            description="10-bin ECE computed after applying Platt scaling to recalibrate the model's predicted "
+            "probabilities.",
+            lower_is_better=True,
+            group="calibration_detailed",
+        ),
+        MetricMetadata(
+            name="platt_ece_1_bin",
+            display_name="1-bin expected calibration error (after Platt scaling)",
+            short_display_name="Platt-scaled ECE (1-bin)",
+            description="1-bin ECE computed after applying Platt scaling to recalibrate the model's predicted "
+            "probabilities.",
+            lower_is_better=True,
+            group="calibration_detailed",
+        ),
+    ]
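To make the calibration metadata above concrete: the 1-bin expected calibration error is simply the absolute gap between average confidence and accuracy, per its description. A worked example with made-up numbers:

# Hypothetical per-instance confidences (max_prob) and correctness flags; values are made up.
confidences = [0.9, 0.8, 0.6, 0.7]
correct = [1, 1, 0, 0]
avg_confidence = sum(confidences) / len(confidences)  # 0.75
accuracy = sum(correct) / len(correct)                 # 0.50
ece_1_bin = abs(avg_confidence - accuracy)             # 0.25; lower is better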
helm/benchmark/metrics/bbq_metrics.py +12 -0
@@ -1,6 +1,7 @@
 from typing import List
 from helm.benchmark.metrics.evaluate_instances_metric import EvaluateInstancesMetric

+from helm.benchmark.metrics.metric import MetricMetadata
 from helm.common.request import RequestResult
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.metrics.metric_name import MetricName
@@ -145,3 +146,14 @@ class BBQMetric(EvaluateInstancesMetric):
         stats = [acc, amb_bias_stat, disamb_bias_stat]

         return stats
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="bbq_accuracy",
+                display_name="BBQ accuracy",
+                description="BBQ accuracy",
+                lower_is_better=False,
+                group=None,
+            ),
+        ]
helm/benchmark/metrics/classification_metrics.py +19 -1
@@ -6,7 +6,7 @@ from sklearn.preprocessing import MultiLabelBinarizer
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.metrics.evaluate_instances_metric import EvaluateInstancesMetric
 from helm.benchmark.metrics.evaluate_reference_metrics import normalize_text
-from helm.benchmark.metrics.metric import MetricName
+from helm.benchmark.metrics.metric import MetricMetadata, MetricName
 from helm.benchmark.metrics.statistic import Stat
 from helm.benchmark.scenarios.scenario import Reference
 from helm.common.hierarchical_logger import hwarn
@@ -168,3 +168,21 @@ class MultipleChoiceClassificationMetric(EvaluateInstancesMetric):
             Stat(MetricName("classification_macro_f1")).add(f1_score(y_pred=y_pred, y_true=y_true, average="macro")),
             Stat(MetricName("classification_micro_f1")).add(f1_score(y_pred=y_pred, y_true=y_true, average="micro")),
         ]
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="classification_macro_f1",
+                display_name="Macro F1",
+                description="Macro F1",
+                lower_is_better=False,
+                group="classification_metrics",
+            ),
+            MetricMetadata(
+                name="classification_micro_f1",
+                display_name="Micro F1",
+                description="Population-level micro-averaged F1 score.",
+                lower_is_better=False,
+                group="classification_metrics",
+            ),
+        ]
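For the two F1 entries documented above, a small illustration with made-up labels, using the same sklearn call as the metric code:

# Hypothetical labels only; macro F1 averages per-class F1 equally, micro F1 pools all decisions.
from sklearn.metrics import f1_score

y_true = ["a", "a", "a", "b"]
y_pred = ["a", "a", "b", "b"]
print(f1_score(y_true=y_true, y_pred=y_pred, average="macro"))  # ~0.73: mean of per-class F1 (0.80 and 0.67)
print(f1_score(y_true=y_true, y_pred=y_pred, average="micro"))  # 0.75: F1 pooled over all decisions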
helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py (new file) +186 -0
@@ -0,0 +1,186 @@
+from typing import List, Tuple, Dict, Any
+import time
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+from helm.benchmark.metrics.codeinsights_correct_code_metrics import (
+    CodeInsightsFunctionalCorrectnessMetric,
+    CPPEvaluator,
+)
+
+
+class CodeInsightsCodeEfficiencyMetric(CodeInsightsFunctionalCorrectnessMetric):
+    """
+    Comprehensive metric combining functional correctness and runtime efficiency evaluation.
+
+    This metric first evaluates functional correctness and then measures runtime efficiency
+    alignment between LLM-generated code and student reference code when both are correct.
+    """
+
+    def __init__(
+        self,
+        num_runtime_runs: int = 5,
+        timeout_seconds: int = 10,
+    ):
+        """
+        Initializes the CodeInsightsFunctionalCorrectnessMetric.
+
+        Args:
+            timeout (int): Timeout for each test case execution.
+        """
+        super().__init__()
+        self.num_runtime_runs = num_runtime_runs
+        self.timeout_seconds = timeout_seconds
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        """
+        Evaluate LLM-generated code by running unit tests and computing pass rate.
+
+        Returns:
+            List of Stat objects containing the functional correctness score
+        """
+        print("\n=== FUNCTIONAL CORRECTNESS METRIC DEBUG ===")
+        print(f"Instance ID: {getattr(request_state.instance, 'id', 'UNKNOWN')}")
+
+        # Get the generated code from the request state
+        if not request_state.result or not request_state.result.completions:
+            print("ERROR: No output generated")
+            return self._create_failure_stats("No output generated")
+
+        generated_code = request_state.result.completions[0].text.strip()
+        generated_code = self._extract_student_code(generated_code)
+        print(f"Generated code length: {len(generated_code)}")
+        print(f"Generated code preview: {generated_code[:200]}...")
+
+        # Get the student code from the instance references
+        student_code = request_state.instance.references[0].output.text.strip()
+        print(f"Student code length: {len(student_code)}")
+
+        # Get test cases from instance extra_data
+        if not hasattr(request_state.instance, "extra_data") or not request_state.instance.extra_data:
+            print("ERROR: No extra_data available")
+            print(f"Instance attributes: {dir(request_state.instance)}")
+            return self._create_failure_stats("No test data available")
+
+        extra_data = request_state.instance.extra_data
+        print(f"Extra data keys: {list(extra_data.keys())}")
+
+        test_cases = extra_data.get("test_cases", [])
+        question_template = extra_data.get("question_template", "")
+        question_name = extra_data.get("question_name", "UNKNOWN")
+
+        print(f"Question name: {question_name}")
+        print(f"Number of test cases: {len(test_cases)}")
+        print(f"Template length: {len(question_template)}")
+
+        if not test_cases:
+            print("ERROR: No test cases available")
+            return self._create_failure_stats("No test cases available")
+
+        print(f"First test case preview: {test_cases[0] if test_cases else 'NONE'}")
+
+        # Run unit tests and calculate pass rate
+        evaluator = CPPEvaluator(
+            question_template,
+            test_cases,
+            timeout=self.timeout_seconds,
+            max_workers=1,
+        )
+
+        llm_output, llm_avg_runtime = self._timed_run(evaluator, generated_code, self.num_runtime_runs)
+        stu_output, stu_avg_runtime = self._timed_run(evaluator, student_code, self.num_runtime_runs)
+
+        # Compute functional correctness score
+        if not llm_output or "score" not in llm_output:
+            stats = [Stat(MetricName("functional_correctness")).add(0.0)]
+        else:
+            stats = [Stat(MetricName("functional_correctness")).add(llm_output["score"])]
+
+        # Calculate runtime metrics if we have data for both solutions
+        if llm_avg_runtime > 0 and stu_avg_runtime > 0:
+            # Runtime ratio (LLM / Student) - values > 1 mean LLM is slower
+            runtime_ratio = llm_avg_runtime / stu_avg_runtime if stu_avg_runtime > 0 else float("inf")
+
+            # Efficiency alignment score (closer to 1.0 is better alignment)
+            # Use reciprocal if LLM is faster to normalize the scale
+            if runtime_ratio > 1:
+                efficiency_alignment = 1.0 / runtime_ratio
+            else:
+                efficiency_alignment = runtime_ratio
+
+            print(f"Runtime ratio (LLM/Student): {runtime_ratio:.4f}")
+            print(f"Efficiency alignment score: {efficiency_alignment:.4f}")
+
+            stats.extend(
+                [
+                    Stat(MetricName("runtime_efficiency_ratio")).add(runtime_ratio),
+                    Stat(MetricName("efficiency_alignment_score")).add(efficiency_alignment),
+                ]
+            )
+
+        # Handle cases where only one solution has runtime data
+        elif llm_avg_runtime > 0 and stu_avg_runtime <= 0:
+            print("Only LLM runtime available - student solution failed to run")
+            stats.extend(
+                [
+                    Stat(MetricName("runtime_efficiency_ratio")).add(float("inf")),  # LLM runs, student doesn't
+                    Stat(MetricName("efficiency_alignment_score")).add(0.0),  # No alignment possible
+                ]
+            )
+
+        elif llm_avg_runtime <= 0 and stu_avg_runtime > 0:
+            print("Only student runtime available - LLM solution failed to run")
+            stats.extend(
+                [
+                    Stat(MetricName("runtime_efficiency_ratio")).add(0.0),  # Student runs, LLM doesn't
+                    Stat(MetricName("efficiency_alignment_score")).add(0.0),  # No alignment possible
+                ]
+            )
+
+        else:
+            # Neither solution has runtime data
+            print("Runtime measurement failed for both solutions")
+            stats.extend(
+                [
+                    Stat(MetricName("runtime_efficiency_ratio")).add(0.0),
+                    Stat(MetricName("efficiency_alignment_score")).add(0.0),
+                ]
+            )
+
+        return stats
+
+    def _timed_run(self, evaluator: CPPEvaluator, code: str, num_runtime_runs: int = 1) -> Tuple[Dict[str, Any], float]:
+        list_runtimes: List[float] = []
+        last_output: Dict[str, Any] = {}
+
+        for _ in range(num_runtime_runs):
+            start_time = time.perf_counter()
+            output = evaluator.evaluate(code)
+            passed = sum(output.get("testcases", []))
+
+            if passed > 0:
+                elapsed = time.perf_counter() - start_time
+                list_runtimes.append(elapsed / passed)
+                last_output = output
+            # if passed == 0, we simply skip recording this run
+
+        avg_runtime = sum(list_runtimes) / len(list_runtimes) if list_runtimes else 0.0
+        return last_output, avg_runtime
+
+    def _create_failure_stats(self, error_message: str) -> List[Stat]:
+        """Create default statistics for failure cases."""
+        print(f"RUNTIME EFFICIENCY METRIC FAILURE: {error_message}")
+        return [
+            Stat(MetricName("functional_correctness")).add(0.0),
+            Stat(MetricName("runtime_efficiency_ratio")).add(0.0),
+            Stat(MetricName("efficiency_alignment_score")).add(0.0),
+        ]
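To make the efficiency bookkeeping above concrete, a worked example with made-up per-test runtimes (the variable names mirror the code in this hunk):

# Hypothetical averages (seconds per passed test); values are made up.
llm_avg_runtime = 0.030
stu_avg_runtime = 0.020
runtime_ratio = llm_avg_runtime / stu_avg_runtime  # 1.5 (> 1 means the LLM solution is slower)
efficiency_alignment = 1.0 / runtime_ratio if runtime_ratio > 1 else runtime_ratio  # ~0.67 (1.0 = perfectly aligned)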