crfm-helm 0.5.6__py3-none-any.whl → 0.5.10__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, exactly as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of crfm-helm has been flagged as potentially problematic.
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/METADATA +72 -130
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/RECORD +372 -305
- helm/benchmark/adaptation/adapter_spec.py +10 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
- helm/benchmark/annotation/aci_bench_annotator.py +11 -22
- helm/benchmark/annotation/air_bench_annotator.py +1 -1
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
- helm/benchmark/annotation/dischargeme_annotator.py +11 -22
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/med_dialog_annotator.py +11 -22
- helm/benchmark/annotation/medalign_annotator.py +11 -22
- helm/benchmark/annotation/medi_qa_annotator.py +11 -22
- helm/benchmark/annotation/medication_qa_annotator.py +11 -22
- helm/benchmark/annotation/mental_health_annotator.py +11 -22
- helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
- helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
- helm/benchmark/annotation/model_as_judge.py +23 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
- helm/benchmark/metrics/air_bench_metrics.py +3157 -1
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/basic_metrics.py +267 -2
- helm/benchmark/metrics/bbq_metrics.py +12 -0
- helm/benchmark/metrics/classification_metrics.py +19 -1
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +1 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
- helm/benchmark/metrics/copyright_metrics.py +1 -1
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
- helm/benchmark/metrics/dry_run_metrics.py +30 -1
- helm/benchmark/metrics/efficiency_metrics.py +74 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +312 -1
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
- helm/benchmark/metrics/ifeval_metrics.py +13 -1
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
- helm/benchmark/metrics/language_modeling_metrics.py +13 -1
- helm/benchmark/metrics/live_qa_metrics.py +13 -1
- helm/benchmark/metrics/llm_jury_metrics.py +13 -1
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
- helm/benchmark/metrics/medec_metrics.py +25 -2
- helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
- helm/benchmark/metrics/metric.py +25 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
- helm/benchmark/metrics/omni_math_metrics.py +13 -1
- helm/benchmark/metrics/safety_metrics.py +13 -1
- helm/benchmark/metrics/seahelm_metrics.py +14 -1
- helm/benchmark/metrics/summac/model_summac.py +3 -3
- helm/benchmark/metrics/summarization_metrics.py +129 -1
- helm/benchmark/metrics/toxicity_metrics.py +31 -1
- helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
- helm/benchmark/metrics/wildbench_metrics.py +21 -1
- helm/benchmark/model_deployment_registry.py +11 -19
- helm/benchmark/presentation/create_plots.py +11 -2
- helm/benchmark/presentation/run_display.py +13 -3
- helm/benchmark/presentation/run_entry.py +2 -2
- helm/benchmark/presentation/schema.py +10 -22
- helm/benchmark/presentation/summarize.py +189 -14
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/run.py +15 -4
- helm/benchmark/run_expander.py +4 -0
- helm/benchmark/run_specs/arabic_run_specs.py +197 -0
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +2 -55
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +3 -1
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +48 -1
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +363 -53
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +11 -13
- helm/benchmark/runner.py +7 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
- helm/benchmark/scenarios/air_bench_scenario.py +21 -0
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
- helm/benchmark/scenarios/aratrust_scenario.py +95 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -53
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -21
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -52
- helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
- helm/benchmark/scenarios/banking77_scenario.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +15 -0
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
- helm/benchmark/scenarios/bluex_scenario.py +70 -0
- helm/benchmark/scenarios/bold_scenario.py +15 -0
- helm/benchmark/scenarios/boolq_scenario.py +20 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
- helm/benchmark/scenarios/clear_scenario.py +23 -0
- helm/benchmark/scenarios/cleva_scenario.py +480 -1
- helm/benchmark/scenarios/code_scenario.py +28 -0
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/commonsense_scenario.py +32 -0
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
- helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
- helm/benchmark/scenarios/disinformation_scenario.py +22 -0
- helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
- helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
- helm/benchmark/scenarios/financebench_scenario.py +21 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
- helm/benchmark/scenarios/gpqa_scenario.py +18 -0
- helm/benchmark/scenarios/grammar_scenario.py +20 -1
- helm/benchmark/scenarios/gsm_scenario.py +21 -0
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
- helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
- helm/benchmark/scenarios/headqa_scenario.py +22 -0
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
- helm/benchmark/scenarios/ice_scenario.py +21 -1
- helm/benchmark/scenarios/ifeval_scenario.py +18 -0
- helm/benchmark/scenarios/imdb_scenario.py +15 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/legal_support_scenario.py +13 -0
- helm/benchmark/scenarios/legalbench_scenario.py +19 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
- helm/benchmark/scenarios/lextreme_scenario.py +11 -0
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +54 -20
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
- helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
- helm/benchmark/scenarios/med_qa_scenario.py +20 -0
- helm/benchmark/scenarios/medalign_scenario.py +23 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
- helm/benchmark/scenarios/medbullets_scenario.py +22 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
- helm/benchmark/scenarios/medec_scenario.py +23 -0
- helm/benchmark/scenarios/medhallu_scenario.py +23 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
- helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
- helm/benchmark/scenarios/melt_scenarios.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +23 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
- helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
- helm/benchmark/scenarios/mmlu_scenario.py +21 -0
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/msmarco_scenario.py +30 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
- helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
- helm/benchmark/scenarios/omni_math_scenario.py +18 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
- helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
- helm/benchmark/scenarios/quac_scenario.py +14 -0
- helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
- helm/benchmark/scenarios/raft_scenario.py +15 -0
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
- helm/benchmark/scenarios/scenario.py +31 -0
- helm/benchmark/scenarios/seahelm_scenario.py +350 -2
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +23 -1
- helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/spider_scenario.py +18 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
- helm/benchmark/scenarios/summarization_scenario.py +37 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/vicuna_scenario.py +21 -1
- helm/benchmark/scenarios/wikifact_scenario.py +20 -0
- helm/benchmark/scenarios/wildbench_scenario.py +18 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +8 -1
- helm/benchmark/static/schema_arabic.yaml +271 -0
- helm/benchmark/static/schema_classic.yaml +0 -17
- helm/benchmark/static/schema_long_context.yaml +17 -18
- helm/benchmark/static/schema_medhelm.yaml +36 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
- helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
- helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
- helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
- helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
- helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
- helm/benchmark/static_build/index.html +5 -6
- helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
- helm/clients/ai21_client.py +2 -0
- helm/clients/aleph_alpha_client.py +2 -0
- helm/clients/anthropic_client.py +7 -1
- helm/clients/audio_language/diva_llama_client.py +2 -0
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/llama_omni_client.py +2 -1
- helm/clients/audio_language/qwen2_5_omni_client.py +21 -8
- helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
- helm/clients/audio_language/qwen_audiolm_client.py +2 -1
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/bedrock_client.py +63 -6
- helm/clients/cohere_client.py +3 -0
- helm/clients/dspy_client.py +135 -0
- helm/clients/google_client.py +2 -0
- helm/clients/http_model_client.py +2 -0
- helm/clients/huggingface_client.py +4 -3
- helm/clients/ibm_client.py +3 -1
- helm/clients/image_generation/adobe_vision_client.py +2 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
- helm/clients/image_generation/cogview2_client.py +2 -1
- helm/clients/image_generation/dalle2_client.py +2 -0
- helm/clients/image_generation/dalle_mini_client.py +2 -1
- helm/clients/image_generation/deep_floyd_client.py +2 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
- helm/clients/image_generation/lexica_client.py +2 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
- helm/clients/image_generation/mindalle_client.py +2 -1
- helm/clients/image_generation/together_image_generation_client.py +2 -0
- helm/clients/megatron_client.py +2 -0
- helm/clients/mistral_client.py +2 -0
- helm/clients/moderation_api_client.py +2 -0
- helm/clients/openai_client.py +38 -21
- helm/clients/openai_responses_client.py +34 -8
- helm/clients/openrouter_client.py +31 -0
- helm/clients/palmyra_client.py +2 -1
- helm/clients/reka_client.py +2 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
- helm/clients/stanfordhealthcare_http_model_client.py +2 -0
- helm/clients/test_huggingface_client.py +3 -3
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/together_client.py +52 -13
- helm/clients/vertexai_client.py +23 -11
- helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
- helm/clients/vision_language/huggingface_vlm_client.py +2 -0
- helm/clients/vision_language/idefics_client.py +2 -1
- helm/clients/vision_language/open_flamingo_client.py +2 -1
- helm/clients/vision_language/paligemma_client.py +2 -1
- helm/clients/vision_language/palmyra_vision_client.py +2 -0
- helm/clients/vision_language/qwen2_vlm_client.py +2 -1
- helm/clients/vision_language/qwen_vlm_client.py +2 -1
- helm/clients/vllm_client.py +43 -7
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/clients/writer_client.py +5 -2
- helm/common/critique_request.py +0 -1
- helm/common/hierarchical_logger.py +103 -34
- helm/common/object_spec.py +23 -8
- helm/common/optional_dependencies.py +1 -1
- helm/common/test_general.py +4 -0
- helm/common/test_logging.py +94 -0
- helm/config/model_deployments.yaml +1001 -187
- helm/config/model_metadata.yaml +602 -18
- helm/config/tokenizer_configs.yaml +202 -5
- helm/proxy/cli.py +1 -1
- helm/proxy/example_queries.py +8 -8
- helm/proxy/retry.py +5 -0
- helm/proxy/server.py +2 -1
- helm/proxy/static/index.css +4 -0
- helm/proxy/static/index.js +7 -1
- helm/tokenizers/auto_tokenizer.py +2 -2
- helm/tokenizers/grok_tokenizer.py +2 -0
- helm/benchmark/metrics/aci_bench_metrics.py +0 -14
- helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
- helm/benchmark/metrics/dischargeme_metrics.py +0 -14
- helm/benchmark/metrics/med_dialog_metrics.py +0 -14
- helm/benchmark/metrics/medalign_metrics.py +0 -14
- helm/benchmark/metrics/medi_qa_metrics.py +0 -14
- helm/benchmark/metrics/medication_qa_metrics.py +0 -14
- helm/benchmark/metrics/mental_health_metrics.py +0 -14
- helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
- helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +0 -103
- helm/benchmark/scenarios/numeracy_scenario.py +0 -794
- helm/benchmark/static_build/assets/index-94295e78.js +0 -10
- helm/benchmark/static_build/assets/index-b9779128.css +0 -1
- helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
- helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
- helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
- /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
- /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
- /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
- /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
- /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
- /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0

helm/benchmark/run_specs/arabic_run_specs.py
@@ -0,0 +1,197 @@
+"""Run specs for Arabic leaderboard
+
+EXPERIMENTAL: Run specs here may have future reverse incompatible changes."""
+
+from helm.benchmark.adaptation.adapter_spec import ADAPT_MULTIPLE_CHOICE_JOINT
+from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec, get_generation_adapter_spec
+from helm.benchmark.annotation.annotator import AnnotatorSpec
+from helm.benchmark.metrics.common_metric_specs import get_basic_metric_specs, get_exact_match_metric_specs
+from helm.benchmark.metrics.metric import MetricSpec
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+_ARABIC_REFERENCE_PREFIX_CHARACTERS = ["أ", "ب", "ج", "د", "هـ"]
+_ARABIC_OUTPUT_MAPPING_PATTERN = "(أ|ب|ج|د|هـ)"
+
+
+@run_spec_function("arabic_mmlu")
+def get_arabic_mmlu_spec(subset: str) -> RunSpec:
+    """EXPERIMENTAL: This run spec here may have future reverse incompatible changes."""
+
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.arabic_mmlu_scenario.ArabicMMLUScenario", args={"subset": subset}
+    )
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions="السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة",  # noqa: E501
+        input_noun="السؤال",
+        output_noun="الإجابة",
+        max_tokens=100,
+        reference_prefix_characters=_ARABIC_REFERENCE_PREFIX_CHARACTERS,
+        output_mapping_pattern=_ARABIC_OUTPUT_MAPPING_PATTERN,
+    )
+
+    return RunSpec(
+        name=f"arabic_mmlu:subset={subset}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["arabic_mmlu"],
+    )
+
+
+@run_spec_function("alghafa")
+def get_alghafa_spec(subset: str) -> RunSpec:
+    """EXPERIMENTAL: This run spec here may have future reverse incompatible changes."""
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.alghafa_scenario.AlGhafaScenario", args={"subset": subset}
+    )
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions="الأسئلة التالية هي أسئلة متعددة الإختيارات مع الجواب الصحيح",  # noqa: E501
+        input_noun="السؤال",
+        output_noun="الإجابة",
+        max_tokens=100,
+        reference_prefix_characters=_ARABIC_REFERENCE_PREFIX_CHARACTERS,
+        output_mapping_pattern=_ARABIC_OUTPUT_MAPPING_PATTERN,
+    )
+
+    return RunSpec(
+        name=f"alghafa:subset={subset}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["alghafa", f"alghafa_{subset}"],
+    )
+
+
+@run_spec_function("aratrust")
+def get_aratrust_spec(category: str) -> RunSpec:
+    """EXPERIMENTAL: This run spec here may have future reverse incompatible changes."""
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.aratrust_scenario.AraTrustScenario",
+        args={"category": category},
+    )
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions="السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة: أ، ب أو ج",  # noqa: E501
+        input_noun="السؤال",
+        output_noun="الإجابة",
+        max_tokens=100,
+    )
+
+    return RunSpec(
+        name=f"aratrust:category={category}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["aratrust"],
+    )
+
+
+@run_spec_function("alrage")
+def get_alrage_spec() -> RunSpec:
+    """EXPERIMENTAL: This run spec here may have future reverse incompatible changes."""
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.alrage_scenario.ALRAGEScenario")
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions="بناءً على السياقات المقترحة التالية، اجب عن السؤال التالي",  # noqa: E501
+        input_noun="السؤال",
+        output_noun="الإجابة",
+        max_tokens=100,
+    )
+
+    annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.alrage_annotator.ALRAGEAnnotator")]
+
+    metric_specs = [
+        MetricSpec(class_name="helm.benchmark.metrics.alrage_metric.ALRAGEMetric")
+    ] + get_basic_metric_specs([])
+
+    return RunSpec(
+        name="alrage",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        annotators=annotator_specs,
+        metric_specs=metric_specs,
+        groups=["alrage"],
+    )
+
+
+@run_spec_function("madinah_qa")
+def get_madinah_qa_spec(subset: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.madinah_qa_scenario.MadinahQAScenario", args={"subset": subset}
+    )
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions="السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة",  # noqa: E501
+        input_noun="السؤال",
+        output_noun="الإجابة",
+        max_tokens=100,
+        reference_prefix_characters=_ARABIC_REFERENCE_PREFIX_CHARACTERS,
+        output_mapping_pattern=_ARABIC_OUTPUT_MAPPING_PATTERN,
+    )
+
+    return RunSpec(
+        name=f"madinah_qa:subset={subset}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["madinah_qa"],
+    )
+
+
+@run_spec_function("mbzuai_human_translated_arabic_mmlu")
+def get_arabic_mmmlu_spec(subject: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.mbzuai_human_translated_arabic_mmlu.MBZUAIHumanTranslatedArabicMMLUScenario",
+        args={"subject": subject},
+    )
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions="السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة",  # noqa: E501
+        input_noun="السؤال",
+        output_noun="الإجابة",
+        max_tokens=100,
+        reference_prefix_characters=_ARABIC_REFERENCE_PREFIX_CHARACTERS,
+        output_mapping_pattern=_ARABIC_OUTPUT_MAPPING_PATTERN,
+    )
+
+    return RunSpec(
+        name=f"mbzuai_human_translated_arabic_mmlu:subject={subject}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["mbzuai_human_translated_arabic_mmlu"],
+    )
+
+
+@run_spec_function("arabic_exams")
+def get_arabic_exams_spec(subject: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.arabic_exams_scenario.ArabicEXAMSScenario",
+        args={"subject": subject},
+    )
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions="السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة",  # noqa: E501
+        input_noun="السؤال",
+        output_noun="الإجابة",
+        max_tokens=100,
+        reference_prefix_characters=_ARABIC_REFERENCE_PREFIX_CHARACTERS,
+        output_mapping_pattern=_ARABIC_OUTPUT_MAPPING_PATTERN,
+    )
+
+    return RunSpec(
+        name=f"arabic_exams:subject={subject}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["arabic_exams"],
+    )
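Side note on the constants above: _ARABIC_OUTPUT_MAPPING_PATTERN is an ordinary regular expression over the five Arabic choice letters. A minimal sketch of how such a pattern picks a choice letter out of a model completion (the sample completion below is invented; the adapter's real extraction logic lives in multiple_choice_joint_adapter.py):

    import re

    # Same pattern string the run specs above pass as output_mapping_pattern.
    pattern = re.compile("(أ|ب|ج|د|هـ)")

    completion = "الإجابة الصحيحة هي ب"  # invented sample model output
    match = pattern.search(completion)
    if match:
        print(match.group(1))  # prints the matched choice letter: ب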
helm/benchmark/run_specs/bluex_run_specs.py
@@ -0,0 +1,40 @@
+from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_MULTIPLE_CHOICE_JOINT
+from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec
+from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+@run_spec_function("bluex")
+def get_bluex_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bluex_scenario.BLUEXScenario", args={})
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions="""
+Escolha a alternativa correta para as questões de vestibulares (responda apenas com a letra).
+Exemplo de Pergunta com a resposta:
+Em um romance narrado em primeira pessoa, o narrador participa dos acontecimentos da trama,
+relatando suas próprias experiências e sentimentos. Qual alternativa apresenta essa característica?
+
+(A) Narrador onisciente que conhece os pensamentos de todas as personagens.
+(B) Narrador que descreve os fatos de forma imparcial, sem envolvimento emocional.
+(C) Narrador-personagem que vivencia e relata os eventos da história.
+(D) Narrador observador que apenas registra as ações visíveis.
+(E) Narrador em segunda pessoa que se dirige constantemente ao leitor.
+
+Resposta correta: C
+
+A partir disso, responda:
+""",
+        input_noun="Pergunta",
+        output_noun="Resposta",
+    )
+
+    return RunSpec(
+        name="bluex",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["bluex"],
+    )
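As a usage sketch, assuming this wheel is installed: functions registered with run_spec_function can also be called directly to inspect the RunSpec they build.

    from helm.benchmark.run_specs.bluex_run_specs import get_bluex_spec

    spec = get_bluex_spec()
    print(spec.name)    # bluex
    print(spec.groups)  # ['bluex']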
helm/benchmark/run_specs/classic_run_specs.py
@@ -35,7 +35,6 @@ from helm.benchmark.metrics.common_metric_specs import (
     get_f1_metric_specs,
     get_generative_harms_metric_specs,
     get_language_modeling_metric_specs,
-    get_numeracy_metric_specs,
     get_open_ended_generation_metric_specs,
     get_summarization_metric_specs,
     get_basic_generation_metric_specs,
@@ -381,58 +380,6 @@ def get_raft_spec(subset: str) -> RunSpec:
     )


-@run_spec_function("numeracy")
-def get_numeracy_spec(
-    relation_type: str = "linear", mode: str = "function", seed: str = "0", run_solver: str = "False"
-) -> RunSpec:
-    from helm.benchmark.scenarios.numeracy_scenario import get_numeracy_adapter_spec, RELTYPE_INFO
-
-    run_solver_bool: bool = True if run_solver.lower() == "true" else False
-    del run_solver
-    random_seed = int(seed)
-    scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.numeracy_scenario.NumeracyScenario",
-        args={"seed": random_seed, "relation_type": relation_type, "mode": mode},
-    )
-
-    if mode in ["example", "standard"]:
-        # Test a model's ability to impute datapoints for a given (example or randomly sampled) relation.
-        adapter_args: Dict[str, Any] = {
-            "max_train_instances": 100,
-            "max_eval_instances": 100,
-            "dim": RELTYPE_INFO[relation_type].num_variables + 1,
-        }
-    elif mode == "function":
-        # Test a model's ability to impute datapoints for randomly sampled relations
-        # (resampled for each evaluation point).
-        adapter_args = {
-            "instructions": "",
-            "max_train_instances": 0,  # Turn off general version of `function` mode because it doesn't cleanly
-            # capture a higher-order version of this task / is a little convoluted
-            # for models, currently.
-            # (In the general version, the model sees other relations of the same class,
-            # and needs to impute a datapoint for the last one. Presumably, inferring
-            # the class - eg. the degree of the relation - would help.)
-            "max_eval_instances": 1000,
-            "dim": RELTYPE_INFO[relation_type].num_variables + 1,
-            "instance_prefix": "\n\n",
-        }
-    else:
-        raise ValueError(f"Invalid mode: {mode}")
-
-    adapter_spec = get_numeracy_adapter_spec(**adapter_args)  # Construct the AdapterSpec using a helper function.
-    # `get_numeracy_adapter_spec` is defined in numeracy_scenario.py
-    # because it is used within the scenario to construct the instances themselves.
-
-    return RunSpec(
-        name=f"numeracy:relation_type={relation_type},mode={mode}",
-        scenario_spec=scenario_spec,
-        adapter_spec=adapter_spec,
-        metric_specs=get_numeracy_metric_specs(run_solver_bool),
-        groups=["numeracy"],
-    )
-
-
 @run_spec_function("boolq")
 def get_boolq_spec(only_contrast=False) -> RunSpec:
     scenario_spec = ScenarioSpec(
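One detail worth noting in the removed numeracy code: run-entry arguments reach run spec functions as strings (hence seed: str = "0" and run_solver: str = "False" above), so booleans and integers had to be coerced by hand. The same coercion as a standalone sketch, independent of HELM:

    def parse_bool(value: str) -> bool:
        # Mirrors the removed run_solver_bool logic: only "true" (any casing) is truthy.
        return value.lower() == "true"

    assert parse_bool("True") is True
    assert parse_bool("false") is False
    assert parse_bool("0") is False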
@@ -806,12 +753,12 @@ def get_xsum_sampled_summarization_spec(temperature: float = 0.3, device: str =
     )

     return RunSpec(
-        name=f"
+        name=f"summarization_xsum_sampled:temperature={temperature},device={device}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_summarization_metric_specs({"task": "summarization_xsum_sampled", "device": device})
         + get_generative_harms_metric_specs(),
-        groups=["
+        groups=["summarization_xsum_sampled"],
     )

helm/benchmark/run_specs/codeinsights_run_specs.py
@@ -0,0 +1,192 @@
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+from helm.benchmark.adaptation.common_adapter_specs import get_generation_adapter_spec
+from helm.benchmark.metrics.common_metric_specs import get_basic_metric_specs
+from helm.benchmark.metrics.codeinsights_metric_specs import (
+    get_functional_correctness_metric_specs,
+    get_comprehensive_code_evaluation_metric_specs,
+    get_edge_case_metric_specs,
+    get_code_efficiency_metric_specs,
+)
+
+
+@run_spec_function("codeinsights_correct_code")
+def get_codeinsights_correct_code_run_spec(tpr: float = 0.0, num_testcases: int = 1) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.codeinsights_correct_code_scenario.CodeInsightsCorrectCodeScenario",
+        args={"num_testcases": num_testcases},
+    )
+
+    instruction = (
+        "You are a skilled C++ programmer working on a foundational programming course assignment. "
+        "Your task is to write correct, efficient C++ code that solves the given problem. "
+        "Write clean, well-structured code that follows good programming practices. "
+        "Provide ONLY your C++ implementation following the given template, where the answer will replace the {{ STUDENT_ANSWER }} block in the template."
+        "DO NOT reproduce the template part as the generated code would be inserted to the template,"
+        "and make sure the code is compatible with the Unit Test Input"
+        "Ensure your code is correct, efficient, includes any class definition when needed, and handles all edge cases properly."
+    )
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions=instruction,
+        output_noun="Your code",
+        stop_sequences=[],
+        max_tokens=4000,
+        temperature=tpr,
+    )
+
+    return RunSpec(
+        name=f"codeinsights_correct_code:temperature={adapter_spec.temperature},num_testcases={num_testcases}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_functional_correctness_metric_specs() + get_basic_metric_specs([]),
+        groups=["codeinsights", "codeinsights_correct_code"],
+    )
+
+
+@run_spec_function("codeinsights_student_coding")
+def get_codeinsights_student_coding_run_spec(tpr: float = 0.0, num_testcases: int = 1) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.codeinsights_student_coding_scenario.CodeInsightsStudentCodingScenario",
+        args={"num_testcases": num_testcases},
+    )
+
+    instruction = (
+        "You are the same student who wrote the three examples below in your foundational C++ course. "
+        "Mimic exactly your personal coding style, conventions, and level of proficiency—"
+        "do not over‐optimize or introduce unfamiliar patterns. "
+        "Include the same sort of formatting, variable names, and minor imperfections you demonstrated. "
+        "Provide ONLY your C++ implementation following the given template, where the answer will replace the {{ STUDENT_ANSWER }} block in the template."
+        "DO NOT reproduce the template part as the generated code would be inserted to the template,"
+        "and make sure the code is compatible with the Unit Test Input"
+        "Ensure your code includes any class definition when needed."
+    )
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions=instruction,
+        output_noun="Your code",
+        stop_sequences=[],
+        max_tokens=4000,
+        temperature=tpr,
+    )
+
+    return RunSpec(
+        name=f"codeinsights_student_coding:temperature={adapter_spec.temperature},num_testcases={num_testcases}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_comprehensive_code_evaluation_metric_specs() + get_basic_metric_specs([]),
+        groups=["codeinsights", "codeinsights_student_coding"],
+    )
+
+
+@run_spec_function("codeinsights_student_mistake")
+def get_codeinsights_student_mistake_run_spec(tpr: float = 0.0, num_testcases: int = 1) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.codeinsights_student_mistake_scenario.CodeInsightsStudentMistakeScenario",
+        args={"num_testcases": num_testcases},
+    )
+
+    instruction = (
+        "You are a C++ student with a consistent personal style, conventions, and proficiency level.\n"
+        "Your task is to attempt the target problem **but introduce realistic mistake** you would typically make—"
+        "Provide ONLY your C++ implementation following the given template, where the answer will replace the {{ STUDENT_ANSWER }} block in the template."
+        "DO NOT reproduce the template part as the generated code would be inserted to the template,"
+        "and make sure the code is compatible with the Unit Test Input"
+        "Ensure your code is includes any class definition when needed."
+    )
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions=instruction,
+        output_noun="Your code",
+        stop_sequences=[],
+        max_tokens=4000,
+        temperature=tpr,
+    )
+
+    return RunSpec(
+        name=f"codeinsights_student_mistake:temperature={adapter_spec.temperature},num_testcases={num_testcases}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_comprehensive_code_evaluation_metric_specs() + get_basic_metric_specs([]),
+        groups=["codeinsights", "codeinsights_student_mistake"],
+    )
+
+
+@run_spec_function("codeinsights_code_efficiency")
+def get_codeinsights_code_efficiency_run_spec(tpr: float = 0.0, num_testcases: int = 1) -> RunSpec:
+    """
+    Run specification for code efficiency evaluation scenario.
+
+    This scenario evaluates whether LLM-generated code has similar runtime efficiency
+    as the original student code. It focuses on problems where both solutions are
+    functionally correct and measures runtime performance alignment.
+
+    Requires C++ compiler (g++) to be available for actual compilation and execution.
+    """
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.codeinsights_code_efficiency_scenario.CodeInsightsCodeEfficiencyScenario",
+        args={"num_testcases": num_testcases},
+    )
+
+    instruction = (
+        "You are the same student who wrote the three examples below in your foundational C++ course. "
+        "Mimic exactly your personal coding style, conventions, and make sure to generate a correct code. "
+        "Do not over-optimize or introduce unfamiliar patterns. If the code is correct but inefficient, "
+        "imitate the inefficiency. "
+        "If the student writes efficiently, write efficiently too. "
+        "Include the same sort of formatting, variable names, and minor imperfections you demonstrated. "
+        "Provide ONLY your C++ implementation following the given template, where the answer will replace the {{ STUDENT_ANSWER }} block in the template."
+        "DO NOT reproduce the template part as the generated code would be inserted to the template,"
+        "and make sure the code is compatible with the Unit Test Input"
+        "Ensure your code is correct, includes any class definition when needed, and handles all edge cases properly."
+    )
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions=instruction,
+        output_noun="Your code",
+        stop_sequences=[],
+        max_tokens=4000,
+        temperature=tpr,
+    )
+
+    return RunSpec(
+        name=f"codeinsights_code_efficiency:temperature={adapter_spec.temperature},num_testcases={num_testcases}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_code_efficiency_metric_specs(
+            num_runtime_runs=5,  # Run each solution 5 times for averaging
+            timeout_seconds=10,  # 10 second timeout per execution
+        )
+        + get_basic_metric_specs([]),
+        groups=["codeinsights", "codeinsights_code_efficiency"],
+    )
+
+
+@run_spec_function("codeinsights_edge_case")
+def get_codeinsights_edge_case_run_spec(tpr: float = 0.0, num_testcases: int = 1) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.codeinsights_edge_case_scenario.CodeInsightsEdgeCaseScenario",
+        args={"num_testcases": num_testcases},
+    )
+
+    instruction = (
+        "You are a student studying C++ with a consistent personal style, conventions, and proficiency level.\n"
+        "Your task is to identify which test case you would likely to fail for a given question with unit tests.\n"
+        "Respond only with integer of the unittest number\n\n"
+    )
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions=instruction,
+        output_noun="Your code",
+        stop_sequences=[],
+        max_tokens=4000,
+        temperature=tpr,
+    )
+
+    return RunSpec(
+        name=f"codeinsights_edge_case:temperature={adapter_spec.temperature},num_testcases={num_testcases}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_edge_case_metric_specs() + get_basic_metric_specs([]),
+        groups=["codeinsights", "codeinsights_edge_case"],
+    )
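A usage sketch for the new codeinsights run specs, assuming this wheel is installed: the sampling temperature (tpr) and test-case count are baked into the RunSpec name, so individual runs are distinguishable by name alone.

    from helm.benchmark.run_specs.codeinsights_run_specs import (
        get_codeinsights_correct_code_run_spec,
    )

    spec = get_codeinsights_correct_code_run_spec(tpr=0.2, num_testcases=3)
    print(spec.name)
    # codeinsights_correct_code:temperature=0.2,num_testcases=3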
helm/benchmark/run_specs/healthqa_br_run_specs.py
@@ -0,0 +1,40 @@
+from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_MULTIPLE_CHOICE_JOINT
+from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec
+from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+@run_spec_function("healthqa_br")
+def get_healthqa_br_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.healthqa_br_scenario.HEALTHQA_BR_Scenario", args={}
+    )
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions="""
+Escolha a alternativa correta para as questões de medicina (responda apenas com a letra).
+Exemplo de Pergunta com a resposta:
+Qual dos seguintes órgãos é responsável pela produção da insulina no corpo humano?
+A) Fígado
+B) Rins
+C) Pâncreas
+D) Baço
+E) Coração
+
+Resposta correta: C
+
+A partir disso, responda:
+""",
+        input_noun="Pergunta",
+        output_noun="Resposta",
+    )
+
+    return RunSpec(
+        name="healthqa_br",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["healthqa_br"],
+    )
helm/benchmark/run_specs/heim_run_specs.py
@@ -60,7 +60,9 @@ def get_core_heim_metric_specs() -> List[MetricSpec]:
             class_name="helm.benchmark.metrics.image_generation.fractal_dimension_metric.FractalDimensionMetric",
             args={},
         ),
-        MetricSpec(class_name="helm.benchmark.metrics.image_generation.nsfw_metrics.NSFWMetric", args={}),
+        # Disabled due to keras issue.
+        # See: https://github.com/stanford-crfm/helm/issues/3741#issuecomment-3109478877
+        # MetricSpec(class_name="helm.benchmark.metrics.image_generation.nsfw_metrics.NSFWMetric", args={}),
         MetricSpec(class_name="helm.benchmark.metrics.image_generation.nudity_metrics.NudityMetric", args={}),
         MetricSpec(class_name="helm.benchmark.metrics.image_generation.watermark_metrics.WatermarkMetric", args={}),
     ] + get_basic_metric_specs(names=[])
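A quick check of this change's effect, assuming the HEIM optional dependencies are installed: the core HEIM metric list from this release should no longer contain the NSFW metric.

    from helm.benchmark.run_specs.heim_run_specs import get_core_heim_metric_specs

    class_names = [spec.class_name for spec in get_core_heim_metric_specs()]
    assert all("NSFWMetric" not in name for name in class_names)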