crfm-helm 0.5.7__py3-none-any.whl → 0.5.9__py3-none-any.whl
This diff compares the contents of two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
Potentially problematic release: this version of crfm-helm might be problematic.
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/METADATA +7 -77
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/RECORD +315 -282
- helm/benchmark/adaptation/adapter_spec.py +10 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
- helm/benchmark/annotation/aci_bench_annotator.py +11 -22
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
- helm/benchmark/annotation/dischargeme_annotator.py +11 -22
- helm/benchmark/annotation/med_dialog_annotator.py +11 -22
- helm/benchmark/annotation/medalign_annotator.py +11 -22
- helm/benchmark/annotation/medi_qa_annotator.py +11 -22
- helm/benchmark/annotation/medication_qa_annotator.py +11 -22
- helm/benchmark/annotation/mental_health_annotator.py +11 -22
- helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
- helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
- helm/benchmark/annotation/model_as_judge.py +23 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
- helm/benchmark/metrics/air_bench_metrics.py +3157 -1
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/basic_metrics.py +267 -2
- helm/benchmark/metrics/bbq_metrics.py +12 -0
- helm/benchmark/metrics/classification_metrics.py +19 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
- helm/benchmark/metrics/dry_run_metrics.py +30 -1
- helm/benchmark/metrics/efficiency_metrics.py +74 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +311 -0
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
- helm/benchmark/metrics/ifeval_metrics.py +13 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
- helm/benchmark/metrics/language_modeling_metrics.py +13 -1
- helm/benchmark/metrics/live_qa_metrics.py +13 -1
- helm/benchmark/metrics/llm_jury_metrics.py +13 -1
- helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
- helm/benchmark/metrics/medec_metrics.py +25 -2
- helm/benchmark/metrics/metric.py +25 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
- helm/benchmark/metrics/omni_math_metrics.py +13 -1
- helm/benchmark/metrics/safety_metrics.py +13 -1
- helm/benchmark/metrics/seahelm_metrics.py +14 -1
- helm/benchmark/metrics/summac/model_summac.py +2 -2
- helm/benchmark/metrics/summarization_metrics.py +129 -1
- helm/benchmark/metrics/toxicity_metrics.py +31 -1
- helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
- helm/benchmark/metrics/wildbench_metrics.py +21 -1
- helm/benchmark/presentation/run_display.py +13 -3
- helm/benchmark/presentation/run_entry.py +2 -2
- helm/benchmark/presentation/schema.py +5 -22
- helm/benchmark/presentation/summarize.py +180 -11
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/run.py +1 -1
- helm/benchmark/run_expander.py +4 -0
- helm/benchmark/run_specs/arabic_run_specs.py +140 -16
- helm/benchmark/run_specs/bluex_run_specs.py +1 -1
- helm/benchmark/run_specs/classic_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +2 -2
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +362 -52
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +6 -2
- helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
- helm/benchmark/scenarios/air_bench_scenario.py +21 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +8 -4
- helm/benchmark/scenarios/aratrust_scenario.py +19 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +24 -54
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +19 -48
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -61
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -29
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -60
- helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
- helm/benchmark/scenarios/banking77_scenario.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +15 -0
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
- helm/benchmark/scenarios/bluex_scenario.py +6 -2
- helm/benchmark/scenarios/bold_scenario.py +15 -0
- helm/benchmark/scenarios/boolq_scenario.py +20 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
- helm/benchmark/scenarios/clear_scenario.py +23 -0
- helm/benchmark/scenarios/cleva_scenario.py +479 -0
- helm/benchmark/scenarios/code_scenario.py +28 -0
- helm/benchmark/scenarios/commonsense_scenario.py +32 -0
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
- helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
- helm/benchmark/scenarios/disinformation_scenario.py +22 -0
- helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
- helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
- helm/benchmark/scenarios/financebench_scenario.py +21 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
- helm/benchmark/scenarios/gpqa_scenario.py +18 -0
- helm/benchmark/scenarios/grammar_scenario.py +20 -1
- helm/benchmark/scenarios/gsm_scenario.py +21 -0
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
- helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
- helm/benchmark/scenarios/headqa_scenario.py +22 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
- helm/benchmark/scenarios/ice_scenario.py +21 -1
- helm/benchmark/scenarios/ifeval_scenario.py +18 -0
- helm/benchmark/scenarios/imdb_scenario.py +15 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +21 -0
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/legal_support_scenario.py +13 -0
- helm/benchmark/scenarios/legalbench_scenario.py +19 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
- helm/benchmark/scenarios/lextreme_scenario.py +11 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +33 -0
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
- helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
- helm/benchmark/scenarios/med_qa_scenario.py +20 -0
- helm/benchmark/scenarios/medalign_scenario.py +23 -0
- helm/benchmark/scenarios/medbullets_scenario.py +22 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
- helm/benchmark/scenarios/medec_scenario.py +23 -0
- helm/benchmark/scenarios/medhallu_scenario.py +23 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
- helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
- helm/benchmark/scenarios/mental_health_scenario.py +23 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +24 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
- helm/benchmark/scenarios/mmlu_scenario.py +21 -0
- helm/benchmark/scenarios/msmarco_scenario.py +30 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
- helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
- helm/benchmark/scenarios/omni_math_scenario.py +18 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
- helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
- helm/benchmark/scenarios/quac_scenario.py +14 -0
- helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
- helm/benchmark/scenarios/raft_scenario.py +15 -0
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
- helm/benchmark/scenarios/scenario.py +31 -0
- helm/benchmark/scenarios/seahelm_scenario.py +348 -0
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/spider_scenario.py +18 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
- helm/benchmark/scenarios/summarization_scenario.py +37 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +1 -1
- helm/benchmark/scenarios/test_bluex_scenario.py +2 -2
- helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/vicuna_scenario.py +21 -1
- helm/benchmark/scenarios/wikifact_scenario.py +20 -0
- helm/benchmark/scenarios/wildbench_scenario.py +18 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
- helm/benchmark/static/schema_arabic.yaml +55 -12
- helm/benchmark/static/schema_long_context.yaml +11 -30
- helm/benchmark/static/schema_medhelm.yaml +36 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
- helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
- helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
- helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
- helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
- helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
- helm/benchmark/static_build/index.html +5 -6
- helm/clients/ai21_client.py +2 -0
- helm/clients/aleph_alpha_client.py +2 -0
- helm/clients/anthropic_client.py +7 -1
- helm/clients/audio_language/diva_llama_client.py +2 -0
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/llama_omni_client.py +2 -1
- helm/clients/audio_language/qwen2_5_omni_client.py +2 -1
- helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
- helm/clients/audio_language/qwen_audiolm_client.py +2 -1
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/bedrock_client.py +2 -0
- helm/clients/cohere_client.py +3 -0
- helm/clients/google_client.py +2 -0
- helm/clients/http_model_client.py +2 -0
- helm/clients/huggingface_client.py +2 -1
- helm/clients/ibm_client.py +3 -1
- helm/clients/image_generation/adobe_vision_client.py +2 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
- helm/clients/image_generation/cogview2_client.py +2 -1
- helm/clients/image_generation/dalle2_client.py +2 -0
- helm/clients/image_generation/dalle_mini_client.py +2 -1
- helm/clients/image_generation/deep_floyd_client.py +2 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
- helm/clients/image_generation/lexica_client.py +2 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
- helm/clients/image_generation/mindalle_client.py +2 -1
- helm/clients/image_generation/together_image_generation_client.py +2 -0
- helm/clients/megatron_client.py +2 -0
- helm/clients/mistral_client.py +2 -0
- helm/clients/moderation_api_client.py +2 -0
- helm/clients/openai_client.py +36 -20
- helm/clients/openai_responses_client.py +27 -3
- helm/clients/openrouter_client.py +31 -0
- helm/clients/palmyra_client.py +2 -1
- helm/clients/reka_client.py +2 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
- helm/clients/stanfordhealthcare_http_model_client.py +2 -0
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/together_client.py +52 -11
- helm/clients/vertexai_client.py +12 -2
- helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
- helm/clients/vision_language/huggingface_vlm_client.py +2 -0
- helm/clients/vision_language/idefics_client.py +2 -1
- helm/clients/vision_language/open_flamingo_client.py +2 -1
- helm/clients/vision_language/paligemma_client.py +2 -1
- helm/clients/vision_language/palmyra_vision_client.py +2 -0
- helm/clients/vision_language/qwen2_vlm_client.py +2 -1
- helm/clients/vision_language/qwen_vlm_client.py +2 -1
- helm/clients/writer_client.py +2 -0
- helm/common/hierarchical_logger.py +20 -0
- helm/common/optional_dependencies.py +1 -1
- helm/common/test_general.py +4 -0
- helm/config/model_deployments.yaml +300 -1
- helm/config/model_metadata.yaml +302 -9
- helm/config/tokenizer_configs.yaml +92 -4
- helm/proxy/example_queries.py +8 -8
- helm/proxy/server.py +2 -1
- helm/proxy/static/index.css +4 -0
- helm/proxy/static/index.js +7 -1
- helm/benchmark/metrics/aci_bench_metrics.py +0 -14
- helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
- helm/benchmark/metrics/dischargeme_metrics.py +0 -14
- helm/benchmark/metrics/med_dialog_metrics.py +0 -14
- helm/benchmark/metrics/medalign_metrics.py +0 -14
- helm/benchmark/metrics/medi_qa_metrics.py +0 -14
- helm/benchmark/metrics/medication_qa_metrics.py +0 -14
- helm/benchmark/metrics/mental_health_metrics.py +0 -14
- helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
- helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
- helm/benchmark/static_build/assets/index-b9779128.css +0 -1
- helm/benchmark/static_build/assets/index-e439d5e1.js +0 -10
- helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
- helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
- helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/top_level.txt +0 -0
- /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
- /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
- /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
- /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
- /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
- /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0

helm/benchmark/metrics/mimiciv_billing_code_metrics.py

@@ -1,7 +1,7 @@
 from typing import List
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.adaptation.request_state import RequestState
-from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric import Metric, MetricMetadata
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
@@ -94,3 +94,34 @@ class MIMICIVBillingCodeMetric(Metric):
             Stat(MetricName("mimiciv_billing_code_recall")).add(recall),
             Stat(MetricName("mimiciv_billing_code_f1")).add(f1),
         ]
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="mimiciv_billing_code_precision",
+                display_name="Precision for MIMIC Billing Codes",
+                short_display_name="MIMICBillingPre",
+                description="Measures the proportion of correctly predicted ICD codes among all ICD codes predicted by "
+                "the model.",
+                lower_is_better=False,
+                group=None,
+            ),
+            MetricMetadata(
+                name="mimiciv_billing_code_recall",
+                display_name="Recall for MIMIC Billing Codes",
+                short_display_name="MIMICBillingRec",
+                description="Measures the proportion of correctly predicted ICD codes among all ICD codes present in "
+                "the gold standard.",
+                lower_is_better=False,
+                group=None,
+            ),
+            MetricMetadata(
+                name="mimiciv_billing_code_f1",
+                display_name="F1 Score for MIMIC Billing Codes",
+                short_display_name="MIMICBillingF1",
+                description="Measures the harmonic mean of precision and recall for ICD codes, providing a balanced "
+                "evaluation of the model's performance.",
+                lower_is_better=False,
+                group=None,
+            ),
+        ]
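
The same change recurs across the metric modules listed above: each Metric subclass gains a get_metadata() method that returns MetricMetadata records describing the statistics it emits, and the existing import of Metric is extended to also pull in MetricMetadata. A minimal sketch of the pattern follows; ExampleMetric and its field values are illustrative placeholders rather than code from this release, while Metric and MetricMetadata are the names that appear in the diffs.

    from typing import List

    from helm.benchmark.metrics.metric import Metric, MetricMetadata


    class ExampleMetric(Metric):  # hypothetical metric, not part of crfm-helm
        def get_metadata(self) -> List[MetricMetadata]:
            # One MetricMetadata entry per Stat name that the metric produces.
            return [
                MetricMetadata(
                    name="example_score",  # should match the MetricName used for the Stat
                    display_name="Example Score",
                    short_display_name="ExScore",
                    description="Illustrative description of what the statistic measures.",
                    lower_is_better=False,
                    group=None,
                ),
            ]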

helm/benchmark/metrics/omni_math_metrics.py

@@ -2,7 +2,7 @@ from typing import Any, Dict, List
 
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.adaptation.request_state import RequestState
-from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric import Metric, MetricMetadata
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
@@ -30,3 +30,15 @@ class OmniMATHMetric(Metric):
         return [
             Stat(MetricName("omni_math_accuracy")).add(score),
         ]
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="omni_math_accuracy",
+                display_name="Omni-MATH Accuracy",
+                short_display_name="Acc",
+                description="Accuracy of the AI output judged by GPT-4.",
+                lower_is_better=False,
+                group="accuracy",
+            ),
+        ]

helm/benchmark/metrics/safety_metrics.py

@@ -5,7 +5,7 @@ from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.metrics.basic_metrics import compute_request_state_metrics
 from helm.benchmark.metrics.efficiency_metrics import EfficiencyMetric
-from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric import Metric, MetricMetadata
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
@@ -77,3 +77,15 @@ class SafetyScoreMetric(Metric):
             raise SafetyScoreMetricException("Could not compute safety score because all annotators failed.")
         stats.append(safety_score_stat)
         return stats
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="safety_score",
+                display_name="LM Evaluated Safety score",
+                description="LM Evaluated Safety score (mean of Llama3.1 and GPT4o ratings) representing whether the "
+                "output constitutes a successful refusal of unsafe actions (0 is worst, 1 is best).",
+                lower_is_better=False,
+                group=None,
+            ),
+        ]

helm/benchmark/metrics/seahelm_metrics.py

@@ -8,7 +8,7 @@ from sacrebleu.metrics import CHRF
 
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.adaptation.request_state import RequestState
-from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric import Metric, MetricMetadata
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
@@ -73,6 +73,19 @@ class SEAHELMMachineTranslationMetric(Metric):
 
         return result
 
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="chr_f_plus_plus",
+                display_name="ChrF++",
+                description="Character n-gram F-score with word n-gram order (ChrF++) [(Popovic, "
+                "2015)](https://aclanthology.org/W15-3049/). Code can be found "
+                "[here](https://github.com/mjpost/sacrebleu).",
+                lower_is_better=False,
+                group=None,
+            ),
+        ]
+
 
 class SEAHELMQAMetric(Metric):
     """SEAHELM QA Metrics

helm/benchmark/metrics/summac/model_summac.py

@@ -219,7 +219,7 @@ class SummaCConv(torch.nn.Module):
         imager_load_cache=True,
         agg="mean",
         norm_histo=False,
-        **kwargs
+        **kwargs,
     ):
         # `bins` should be `even%d` or `percentiles`
         assert nli_labels in ["e", "c", "n", "ec", "en", "cn", "ecn"], "Unrecognized nli_labels argument %s" % (
@@ -405,7 +405,7 @@ class SummaCZS:
         use_con=True,
         imager_load_cache=True,
         device="cuda",
-        **kwargs
+        **kwargs,
     ):
         assert op2 in ["min", "mean", "max"], "Unrecognized `op2`"
         assert op1 in ["max", "mean", "min"], "Unrecognized `op1`"

helm/benchmark/metrics/summarization_metrics.py

@@ -16,7 +16,7 @@ from helm.benchmark.metrics.evaluate_reference_metrics import get_rouge_function
 from helm.common.hierarchical_logger import hlog
 from helm.common.general import ensure_file_downloaded
 from helm.common.optional_dependencies import handle_module_not_found_error
-from helm.benchmark.metrics.metric import Metric, MetricResult
+from helm.benchmark.metrics.metric import Metric, MetricMetadata, MetricResult
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
@@ -240,6 +240,134 @@ class SummarizationMetric(Metric):
 
         return result
 
+    def get_metadata(self):
+        metadata: List[MetricMetadata] = [
+            MetricMetadata(
+                name="QAFactEval",
+                display_name="QAFactEval",
+                description="Faithfulness scores based on the SummaC method of [Laban et al. "
+                "(2022)](https://aclanthology.org/2022.tacl-1.10/).",
+                lower_is_better=False,
+                group="summarization_metrics",
+            ),
+            MetricMetadata(
+                name="summarization_coverage",
+                display_name="Coverage",
+                description="Extent to which the model-generated summaries are extractive fragments from the source "
+                "document [(Grusky et al., 2018)](https://aclanthology.org/N18-1065/).",
+                lower_is_better=None,
+                group="summarization_metrics",
+            ),
+            MetricMetadata(
+                name="summarization_density",
+                display_name="Density",
+                description="Extent to which the model-generated summaries are extractive summaries based on the "
+                "source document [(Grusky et al., 2018)](https://aclanthology.org/N18-1065/).",
+                lower_is_better=None,
+                group="summarization_metrics",
+            ),
+            MetricMetadata(
+                name="summarization_compression",
+                display_name="Compression",
+                description="Extent to which the model-generated summaries are compressed relative to the source "
+                "document [(Grusky et al., 2018)](https://aclanthology.org/N18-1065/).",
+                lower_is_better=None,
+                group="summarization_metrics",
+            ),
+            MetricMetadata(
+                name="rouge_1",
+                display_name="ROUGE-1",
+                short_display_name="ROUGE-1",
+                description="ROUGE-1",
+                lower_is_better=False,
+                group="summarization_metrics",
+            ),
+            MetricMetadata(
+                name="rouge-2",
+                display_name="ROUGE-2",
+                short_display_name="ROUGE-2",
+                description="ROUGE-2",
+                lower_is_better=False,
+                group="summarization_metrics",
+            ),
+            MetricMetadata(
+                name="rouge-l",
+                display_name="ROUGE-L",
+                short_display_name="ROUGE-L",
+                description="ROUGE-L",
+                lower_is_better=False,
+                group="summarization_metrics",
+            ),
+        ]
+        if self.humaneval is not None:
+            metadata.extend(
+                [
+                    MetricMetadata(
+                        name="HumanEval-faithfulness",
+                        display_name="HumanEval-faithfulness",
+                        description="Human evaluation score for faithfulness.",
+                        lower_is_better=False,
+                        group="summarization_metrics",
+                    ),
+                    MetricMetadata(
+                        name="HumanEval-relevance",
+                        display_name="HumanEval-relevance",
+                        description="Human evaluation score for relevance.",
+                        lower_is_better=False,
+                        group="summarization_metrics",
+                    ),
+                    MetricMetadata(
+                        name="HumanEval-coherence",
+                        display_name="HumanEval-coherence",
+                        description="Human evaluation score for coherence.",
+                        lower_is_better=False,
+                        group="summarization_metrics",
+                    ),
+                ]
+            )
+        if self.compute_faithfulness:
+            metadata.append(
+                MetricMetadata(
+                    name="summac",
+                    display_name="SummaC",
+                    description="Faithfulness scores based on the SummaC method of [Laban et al. "
+                    "(2022)](https://aclanthology.org/2022.tacl-1.10/).",
+                    lower_is_better=False,
+                    group="summarization_metrics",
+                )
+            )
+        if self.compute_bertscore:
+            metadata.extend(
+                [
+                    MetricMetadata(
+                        name="BERTScore-P",
+                        display_name="BERTScore (P)",
+                        description="Average BERTScore precision [(Zhang et al., "
+                        "2020)](https://openreview.net/pdf?id=SkeHuCVFDr) between model generation and reference "
+                        "summary.",
+                        lower_is_better=False,
+                        group=None,
+                    ),
+                    MetricMetadata(
+                        name="BERTScore-R",
+                        display_name="BERTScore (R)",
+                        description="Average BERTScore recall [(Zhang et al., 2020)](https://openreview.net/pdf?id=SkeHuCVFDr) "
+                        "between model generation and reference summary.",
+                        lower_is_better=False,
+                        group=None,
+                    ),
+                    MetricMetadata(
+                        name="BERTScore-F",
+                        display_name="BERTScore (F1)",
+                        description="Average BERTScore F1 [(Zhang et al., 2020)](https://openreview.net/pdf?id=SkeHuCVFDr) "
+                        "between model generation and reference summary.",
+                        lower_is_better=False,
+                        group="summarization_metrics",
+                    ),
+                ]
+            )
+        return metadata
+
 
 def _paired_bootstrap_test(treatment_list: list, control_list: list, nboot: int = 10000):
     """

helm/benchmark/metrics/toxicity_metrics.py

@@ -6,7 +6,7 @@ from helm.common.hierarchical_logger import hlog, hwarn
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.clients.perspective_api_client import PerspectiveAPIClientCredentialsError
-from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric import Metric, MetricMetadata
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
@@ -89,3 +89,33 @@ class ToxicityMetric(Metric):
             ]
 
         return stats
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="expected_max_toxicity",
+                display_name="Average max toxicity",
+                short_display_name="Avg. max toxicity",
+                description="Average max toxicity score across all completions (based on the PerspectiveAPI toxicity "
+                "classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).",
+                lower_is_better=True,
+                group=None,
+            ),
+            MetricMetadata(
+                name="max_toxicity_probability",
+                display_name="max toxicity",
+                description="Fraction of instances where most toxic completion is above 0.5 toxicity (based on the "
+                "PerspectiveAPI toxicity classifier; [Gehman et al. "
+                "(2020)](https://aclanthology.org/2020.findings-emnlp.301/)).",
+                lower_is_better=True,
+                group=None,
+            ),
+            MetricMetadata(
+                name="toxic_frac",
+                display_name="Toxic fraction",
+                description="Fraction of model outputs that are toxic (based on the PerspectiveAPI toxicity "
+                "classifier).",
+                lower_is_better=True,
+                group="toxicity",
+            ),
+        ]

helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py (new file)

@@ -0,0 +1,52 @@
+from typing import List
+
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.statistic import Stat
+from helm.benchmark.metrics.evaluate_reference_metrics import normalize_text
+from helm.benchmark.metrics.evaluate_instances_metric import EvaluateInstancesMetric
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+)
+from sklearn.metrics import f1_score, accuracy_score
+
+
+class UltraSuiteASRMetric(EvaluateInstancesMetric):
+    """Score metrics for UltraSuite ASR."""
+
+    def evaluate_instances(self, request_states: List[RequestState], eval_cache_path: str) -> List[Stat]:
+        y_pred: List[str] = []
+        y_pred_quasi: List[str] = []
+        y_true: List[str] = []
+        for request_state in request_states:  # one request state per instance
+
+            for reference in request_state.instance.references:
+                if reference.tags == [CORRECT_TAG]:
+                    true_label = reference.output.text
+                    break
+
+            assert request_state.result
+            model_output_text = request_state.result.completions[0].text.strip().lower()
+            assert request_state.instance.extra_data
+            ground_truth_text = request_state.instance.extra_data["transcription"].strip().lower()
+
+            if model_output_text == ground_truth_text:
+                predicted_label = "typically_developing"
+            else:
+                predicted_label = "speech_disorder"
+
+            if normalize_text(predicted_label) == normalize_text(true_label):
+                quasi_label = "typically_developing"
+            else:
+                quasi_label = "speech_disorder"
+
+            y_true.append(true_label)
+            y_pred.append(predicted_label)
+            y_pred_quasi.append(quasi_label)
+
+        return [
+            Stat(MetricName("classification_macro_f1")).add(f1_score(y_pred=y_pred, y_true=y_true, average="macro")),
+            Stat(MetricName("classification_micro_f1")).add(f1_score(y_pred=y_pred, y_true=y_true, average="micro")),
+            Stat(MetricName("exact_match")).add(accuracy_score(y_pred=y_pred, y_true=y_true)),
+            Stat(MetricName("quasi_exact_match")).add(accuracy_score(y_pred=y_pred_quasi, y_true=y_true)),
+        ]

helm/benchmark/metrics/wildbench_metrics.py

@@ -2,7 +2,7 @@ from typing import Any, Dict, List
 
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.adaptation.request_state import RequestState
-from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric import Metric, MetricMetadata
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
@@ -32,3 +32,23 @@ class WildBenchScoreMetric(Metric):
             Stat(MetricName("wildbench_score")).add(score),
             Stat(MetricName("wildbench_score_rescaled")).add(score_rescaled),
         ]
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="wildbench_score",
+                display_name="WildBench Score",
+                short_display_name="WB Score",
+                description="Score of the AI output judged by GPT-4o.",
+                lower_is_better=False,
+                group="accuracy",
+            ),
+            MetricMetadata(
+                name="wildbench_score_rescaled",
+                display_name="WildBench Score",
+                short_display_name="WB Score",
+                description="Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+                lower_is_better=False,
+                group="accuracy",
+            ),
+        ]

helm/benchmark/presentation/run_display.py

@@ -1,6 +1,7 @@
 from collections import OrderedDict, defaultdict
 from dataclasses import dataclass
 import os
+import re
 from typing import Dict, Iterable, List, Optional, Set, Tuple, Any
 
 from helm.benchmark.adaptation.adapter_spec import (
@@ -262,9 +263,18 @@ def write_run_display_json(run_path: str, run_spec: RunSpec, schema: Schema, ski
             if request_state.result is not None and request_state.result.completions
             else ""
         )
-        mapped_output =
-
-
+        mapped_output: Optional[str] = None
+        if request_state.output_mapping is not None:
+            output_to_map = predicted_text.strip()
+            if run_spec.adapter_spec.output_mapping_pattern:
+                match = re.search(run_spec.adapter_spec.output_mapping_pattern, output_to_map)
+                if not match:
+                    output_to_map = ""
+                elif match.groups():
+                    output_to_map = match.group(0)
+                else:
+                    output_to_map = match.string
+            mapped_output = request_state.output_mapping.get(output_to_map)
         instance_id_to_instance[(request_state.instance.id, request_state.instance.perturbation)] = (
             request_state.instance
         )
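
For clarity, the new output-mapping branch in write_run_display_json behaves roughly as sketched below. This standalone helper is a hedged, illustrative re-implementation (map_output is a hypothetical name, not a function in the release); it mirrors the logic shown in the hunk above.

    import re
    from typing import Dict, Optional

    def map_output(predicted_text: str, pattern: Optional[str], output_mapping: Dict[str, str]) -> Optional[str]:
        # Mirrors the branch added to write_run_display_json (illustrative only).
        output_to_map = predicted_text.strip()
        if pattern:
            match = re.search(pattern, output_to_map)
            if not match:
                output_to_map = ""  # no match: nothing to look up
            elif match.groups():
                output_to_map = match.group(0)  # pattern has capture groups: use the full matched span
            else:
                output_to_map = match.string  # no capture groups: fall back to the whole string
        return output_mapping.get(output_to_map)

    # e.g. map_output("Answer: B", r"Answer: ([A-E])", {"Answer: B": "B"}) returns "B"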

helm/benchmark/presentation/run_entry.py

@@ -14,10 +14,10 @@ class RunEntry:
     description: str
 
     # Priority for this run spec (1 is highest priority, 5 is lowest priority)
-    priority: int
+    priority: Optional[int] = None
 
     # Additional groups to add to the run spec
-    groups: Optional[List[str]]
+    groups: Optional[List[str]] = None
 
 
 @dataclass(frozen=True)

helm/benchmark/presentation/schema.py

@@ -8,6 +8,7 @@ import mako.template
 import yaml
 import importlib_resources as resources
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import hlog
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.augmentations.perturbation_description import PERTURBATION_WORST
@@ -131,24 +132,6 @@ THIS_GROUP_ONLY = "this_group_only"
 NO_GROUPS = "no_groups"
 
 
-@dataclass(frozen=True)
-class TaxonomyInfo:
-    # Task (e.g., question answering)
-    task: Optional[str] = None
-
-    # Domain - genre (e.g., Wikipedia)
-    what: Optional[str] = None
-
-    # Domain - when it was written (e.g., 2010s)
-    when: Optional[str] = None
-
-    # Domain - demographics (e.g., web users)
-    who: Optional[str] = None
-
-    # Language (e.g., English)
-    language: Optional[str] = None
-
-
 @dataclass(frozen=True)
 class RunGroup(Field):
     """
@@ -216,16 +199,16 @@ class Schema:
     """Specifies information about what to display on the frontend."""
 
     # Information about each field
-    metrics: List[Field]
+    metrics: List[Field] = field(default_factory=list)
 
     # Information about each perturbation
-    perturbations: List[Field]
+    perturbations: List[Field] = field(default_factory=list)
 
     # Group the metrics
-    metric_groups: List[MetricGroup]
+    metric_groups: List[MetricGroup] = field(default_factory=list)
 
     # Group the scenarios
-    run_groups: List[RunGroup]
+    run_groups: List[RunGroup] = field(default_factory=list)
 
     # Adapter fields (e.g., temperature)
     # Automatically populated from the docstrings in the AdapterSpec class definition.