crfm-helm 0.5.7__py3-none-any.whl → 0.5.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic.
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/METADATA +5 -77
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/RECORD +228 -197
- helm/benchmark/adaptation/adapter_spec.py +5 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
- helm/benchmark/annotation/aci_bench_annotator.py +11 -22
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
- helm/benchmark/annotation/dischargeme_annotator.py +11 -22
- helm/benchmark/annotation/med_dialog_annotator.py +11 -22
- helm/benchmark/annotation/medalign_annotator.py +11 -22
- helm/benchmark/annotation/medi_qa_annotator.py +11 -22
- helm/benchmark/annotation/medication_qa_annotator.py +11 -22
- helm/benchmark/annotation/mental_health_annotator.py +11 -22
- helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
- helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
- helm/benchmark/annotation/model_as_judge.py +23 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
- helm/benchmark/metrics/air_bench_metrics.py +3157 -1
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/basic_metrics.py +267 -2
- helm/benchmark/metrics/classification_metrics.py +19 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
- helm/benchmark/metrics/dry_run_metrics.py +30 -1
- helm/benchmark/metrics/efficiency_metrics.py +74 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +299 -0
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
- helm/benchmark/metrics/ifeval_metrics.py +13 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
- helm/benchmark/metrics/language_modeling_metrics.py +13 -1
- helm/benchmark/metrics/live_qa_metrics.py +13 -1
- helm/benchmark/metrics/llm_jury_metrics.py +13 -1
- helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
- helm/benchmark/metrics/medec_metrics.py +25 -2
- helm/benchmark/metrics/metric.py +25 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
- helm/benchmark/metrics/omni_math_metrics.py +13 -1
- helm/benchmark/metrics/seahelm_metrics.py +14 -1
- helm/benchmark/metrics/summac/model_summac.py +2 -2
- helm/benchmark/metrics/summarization_metrics.py +129 -1
- helm/benchmark/metrics/toxicity_metrics.py +31 -1
- helm/benchmark/metrics/wildbench_metrics.py +21 -1
- helm/benchmark/presentation/schema.py +5 -22
- helm/benchmark/presentation/summarize.py +180 -11
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/run_expander.py +4 -0
- helm/benchmark/run_specs/arabic_run_specs.py +134 -16
- helm/benchmark/run_specs/bluex_run_specs.py +1 -1
- helm/benchmark/run_specs/classic_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +2 -2
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +360 -50
- helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
- helm/benchmark/scenarios/air_bench_scenario.py +21 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +8 -4
- helm/benchmark/scenarios/aratrust_scenario.py +19 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
- helm/benchmark/scenarios/bbq_scenario.py +15 -0
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/bluex_scenario.py +6 -2
- helm/benchmark/scenarios/bold_scenario.py +15 -0
- helm/benchmark/scenarios/boolq_scenario.py +20 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
- helm/benchmark/scenarios/clear_scenario.py +23 -0
- helm/benchmark/scenarios/cleva_scenario.py +479 -0
- helm/benchmark/scenarios/code_scenario.py +28 -0
- helm/benchmark/scenarios/commonsense_scenario.py +26 -0
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
- helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
- helm/benchmark/scenarios/disinformation_scenario.py +22 -0
- helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
- helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
- helm/benchmark/scenarios/gpqa_scenario.py +18 -0
- helm/benchmark/scenarios/grammar_scenario.py +20 -1
- helm/benchmark/scenarios/gsm_scenario.py +15 -0
- helm/benchmark/scenarios/headqa_scenario.py +22 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
- helm/benchmark/scenarios/ice_scenario.py +21 -1
- helm/benchmark/scenarios/ifeval_scenario.py +18 -0
- helm/benchmark/scenarios/imdb_scenario.py +15 -0
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/legal_support_scenario.py +13 -0
- helm/benchmark/scenarios/legalbench_scenario.py +20 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
- helm/benchmark/scenarios/lextreme_scenario.py +11 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +26 -0
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
- helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
- helm/benchmark/scenarios/med_qa_scenario.py +14 -0
- helm/benchmark/scenarios/medalign_scenario.py +23 -0
- helm/benchmark/scenarios/medbullets_scenario.py +22 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
- helm/benchmark/scenarios/medec_scenario.py +23 -0
- helm/benchmark/scenarios/medhallu_scenario.py +23 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +23 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
- helm/benchmark/scenarios/mental_health_scenario.py +23 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +24 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
- helm/benchmark/scenarios/mmlu_scenario.py +15 -0
- helm/benchmark/scenarios/msmarco_scenario.py +30 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +20 -0
- helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
- helm/benchmark/scenarios/omni_math_scenario.py +18 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
- helm/benchmark/scenarios/quac_scenario.py +14 -0
- helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
- helm/benchmark/scenarios/raft_scenario.py +15 -0
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/scenario.py +31 -0
- helm/benchmark/scenarios/seahelm_scenario.py +348 -0
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
- helm/benchmark/scenarios/summarization_scenario.py +37 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +1 -1
- helm/benchmark/scenarios/test_bluex_scenario.py +2 -2
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/vicuna_scenario.py +21 -1
- helm/benchmark/scenarios/wikifact_scenario.py +20 -0
- helm/benchmark/scenarios/wildbench_scenario.py +18 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +12 -0
- helm/benchmark/static/schema_arabic.yaml +55 -12
- helm/benchmark/static/schema_long_context.yaml +17 -17
- helm/benchmark/static/schema_medhelm.yaml +36 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static_build/assets/index-671a5e06.js +10 -0
- helm/benchmark/static_build/assets/index-9352595e.css +1 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
- helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
- helm/clients/openai_client.py +31 -19
- helm/clients/openai_responses_client.py +27 -3
- helm/clients/openrouter_client.py +31 -0
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/together_client.py +48 -11
- helm/clients/vertexai_client.py +8 -2
- helm/config/model_deployments.yaml +75 -1
- helm/config/model_metadata.yaml +70 -2
- helm/config/tokenizer_configs.yaml +19 -1
- helm/proxy/example_queries.py +8 -8
- helm/proxy/server.py +2 -1
- helm/proxy/static/index.css +4 -0
- helm/proxy/static/index.js +7 -1
- helm/benchmark/metrics/aci_bench_metrics.py +0 -14
- helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
- helm/benchmark/metrics/dischargeme_metrics.py +0 -14
- helm/benchmark/metrics/med_dialog_metrics.py +0 -14
- helm/benchmark/metrics/medalign_metrics.py +0 -14
- helm/benchmark/metrics/medi_qa_metrics.py +0 -14
- helm/benchmark/metrics/medication_qa_metrics.py +0 -14
- helm/benchmark/metrics/mental_health_metrics.py +0 -14
- helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
- helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
- helm/benchmark/static_build/assets/index-b9779128.css +0 -1
- helm/benchmark/static_build/assets/index-e439d5e1.js +0 -10
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/top_level.txt +0 -0
helm/benchmark/metrics/evaluate_reference_metrics.py
CHANGED

@@ -14,6 +14,7 @@ from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.metrics import code_metrics_helper
 from helm.benchmark.metrics.cleva_metrics_helper import ChineseTokenizer
+from helm.benchmark.metrics.metric import MetricMetadata
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.nltk_helper import install_nltk_resources

@@ -518,3 +519,301 @@ def compute_reference_metrics(
         raise NameError(f"{metric_name} is not in the list of metric functions.")

     return stats
+
+
+_METRIC_METADATA_MAPPING: Dict[str, MetricMetadata] = {
+    "exact_match": MetricMetadata(
+        name="exact_match",
+        display_name="Exact match",
+        short_display_name="EM",
+        description="Fraction of instances that the predicted output matches a correct reference exactly.",
+        lower_is_better=False,
+        group="accuracy",
+    ),
+    "quasi_exact_match": MetricMetadata(
+        name="quasi_exact_match",
+        display_name="Quasi-exact match",
+        short_display_name="EM",
+        description="Fraction of instances that the predicted output matches a correct reference up to light "
+        "processing.",
+        lower_is_better=False,
+        group=None,
+    ),
+    "quasi_leave_articles_exact_match": MetricMetadata(
+        name="quasi_leave_articles_exact_match",
+        display_name="Quasi-exact match",
+        short_display_name="EM",
+        description="Fraction of instances that the predicted output matches a correct reference up to light "
+        "processing.",
+        lower_is_better=False,
+        group=None,
+    ),
+    "prefix_exact_match": MetricMetadata(
+        name="prefix_exact_match",
+        display_name="Prefix exact match",
+        short_display_name="PEM",
+        description="Fraction of instances that the predicted output matches the prefix of a correct reference "
+        "exactly.",
+        lower_is_better=False,
+        group=None,
+    ),
+    "quasi_prefix_exact_match": MetricMetadata(
+        name="quasi_prefix_exact_match",
+        display_name="Prefix quasi-exact match",
+        short_display_name="PEM",
+        description="Fraction of instances that the predicted output matches the prefix of a correct reference "
+        "up to light processing.",
+        lower_is_better=False,
+        group=None,
+    ),
+    "exact_match_indicator": MetricMetadata(
+        name="exact_match_indicator",
+        display_name="Exact match (final)",
+        short_display_name="EM",
+        description="Fraction of instances that the predicted output matches a correct reference exactly, "
+        "ignoring text preceding the specified indicator (e.g., space).",
+        lower_is_better=False,
+        group=None,
+    ),
+    "final_number_exact_match": MetricMetadata(
+        name="final_number_exact_match",
+        display_name="Exact match (final number)",
+        short_display_name="EM",
+        description="Fraction of instances that the predicted output matches a correct reference exactly, "
+        "ignoring text preceding the specified indicator.",
+        lower_is_better=False,
+        group=None,
+    ),
+    "exact_set_match": MetricMetadata(
+        name="exact_set_match",
+        display_name="Exact match (at sets)",
+        short_display_name="EM",
+        description="Fraction of instances that the predicted output matches a correct reference exactly as " "sets.",
+        lower_is_better=False,
+        group=None,
+    ),
+    "iou_set_match": MetricMetadata(
+        name="iou_set_match",
+        display_name="Intersection over union (as sets)",
+        short_display_name="IoU",
+        description="Intersection over union in terms of set overlap between the model predicted set and "
+        "correct reference set.",
+        lower_is_better=False,
+        group=None,
+    ),
+    "f1_set_match": MetricMetadata(
+        name="f1_set_match",
+        display_name="F1 (set match)",
+        short_display_name="F1",
+        description="Average F1 score in terms of set overlap between the model predicted set and correct "
+        "reference set.",
+        lower_is_better=False,
+        group=None,
+    ),
+    "math_equiv": MetricMetadata(
+        name="math_equiv",
+        display_name="Equivalent",
+        description="Fraction of model outputs that are mathematically equivalent to the correct reference.",
+        lower_is_better=False,
+        group=None,
+    ),
+    "math_equiv_chain_of_thought": MetricMetadata(
+        name="math_equiv_chain_of_thought",
+        display_name="Equivalent (CoT)",
+        description="Fraction of model outputs that are mathematically equivalent to the correct reference "
+        "when using chain-of-thought prompting.",
+        lower_is_better=False,
+        group=None,
+    ),
+    "code_eval_acc": MetricMetadata(
+        name="code_eval_acc",
+        display_name="Correctness",
+        short_display_name="Correctness",
+        description="Fraction of instances that the model output evaluates to the correct answer.",
+        lower_is_better=False,
+        group=None,
+    ),
+    "pass": MetricMetadata(
+        name="pass",
+        display_name="pass@1",
+        description="Fraction of model outputs that pass the associated test cases.",
+        lower_is_better=False,
+        group=None,
+    ),
+    "cider": MetricMetadata(
+        name="cider",
+        display_name="CIDEr",
+        description="Evaluates the quality of generated caption by measuring the weighted similarity of "
+        "n-grams between the captions and a set of human-written reference captions, emphasizing "
+        "informativeness and consensus.",
+        lower_is_better=False,
+        group=None,
+    ),
+    "f1_score": MetricMetadata(
+        name="f1_score",
+        display_name="F1",
+        description="Average F1 score in terms of word overlap between the model output and correct reference.",
+        lower_is_better=False,
+        group=None,
+    ),
+    "rouge_1": MetricMetadata(
+        name="rouge_1",
+        display_name="ROUGE-1",
+        description="Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 1-gram " "overlap.",
+        lower_is_better=False,
+        group=None,
+    ),
+    "rouge_2": MetricMetadata(
+        name="rouge_2",
+        display_name="ROUGE-2",
+        description="Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram " "overlap.",
+        lower_is_better=False,
+        group=None,
+    ),
+    "rouge_l": MetricMetadata(
+        name="rouge_l",
+        display_name="ROUGE-L",
+        description="Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on longest "
+        "common subsequence overlap.",
+        lower_is_better=False,
+        group=None,
+    ),
+    "bleu_1": MetricMetadata(
+        name="bleu_1",
+        display_name="BLEU-1",
+        description="Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on "
+        "1-gram overlap.",
+        lower_is_better=False,
+        group=None,
+    ),
+    "bleu_4": MetricMetadata(
+        name="bleu_4",
+        display_name="BLEU-4",
+        description="Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on "
+        "4-gram overlap.",
+        lower_is_better=False,
+        group=None,
+    ),
+    "chinese_bleu_1": MetricMetadata(
+        name="chinese_bleu_1",
+        display_name="Chinese BLEU-1 score",
+        short_display_name="BLEU-1 (Chinese)",
+        description="BLEU-1 score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on a "
+        "Chinese tokenizer that segments Chinese strings by character.",
+        lower_is_better=False,
+        group=None,
+        # Group could be one of:
+        # "cleva_pinyin_transliteration_metrics"
+        # "cleva_dialogue_generation_metrics"
+        # "cleva_data_to_text_generation_metrics"
+    ),
+    "chinese_rouge_1": MetricMetadata(
+        name="chinese_rouge_1",
+        display_name="Chinese ROUGE-1 score",
+        short_display_name="ROUGE-1 (Chinese)",
+        description="ROUGE-1 score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on a Chinese "
+        "tokenizer that segments Chinese strings by character.",
+        lower_is_better=False,
+        group="cleva_summarization_metrics",
+    ),
+    "chinese_rouge_2": MetricMetadata(
+        name="chinese_rouge_2",
+        display_name="Chinese ROUGE-2 score",
+        short_display_name="ROUGE-2 (Chinese)",
+        description="ROUGE-2 score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on a Chinese "
+        "tokenizer that segments Chinese strings by character.",
+        lower_is_better=False,
+        group="cleva_summarization_metrics",
+    ),
+    "cleva_math_result_match": MetricMetadata(
+        name="cleva_math_result_match",
+        display_name="CLEVA Math Exact Match",
+        short_display_name="EM (Math)",
+        description="Exact match that cares only the last math expression (numbers and fractions) in the "
+        "model's prediction.",
+        lower_is_better=False,
+        group="cleva_mathematical_reasoning_metrics",
+    ),
+    "absolute_value_difference": MetricMetadata(
+        name="absolute_value_difference",
+        display_name="Absolute difference",
+        short_display_name="Diff.",
+        description="Average absolute difference between the model output (converted to a number) and the "
+        "correct reference.",
+        lower_is_better=True,
+        group=None,
+    ),
+    "wer_score": MetricMetadata(
+        name="wer_score",
+        display_name="Word Error Rate",
+        short_display_name="WER",
+        description="Word error rate between model predictions and ground truth answers for ASR tasks.",
+        lower_is_better=True,
+        group=None,
+    ),
+    "mer_score": MetricMetadata(
+        name="mer_score",
+        display_name="Match Error Rate",
+        short_display_name="MER",
+        description="Word match error rate between model predictions and ground truth answers.",
+        lower_is_better=True,
+        group=None,
+    ),
+    "wip_score": MetricMetadata(
+        name="wip_score",
+        display_name="Word Information Preservation",
+        short_display_name="WIP",
+        description="Word information preservation (WIP) for evaluating the preserved information of ASR.",
+        lower_is_better=False,
+        group=None,
+    ),
+    "cer_score": MetricMetadata(
+        name="cer_score",
+        display_name="Character Error Rate",
+        short_display_name="CER",
+        description="Character error rate (CER) for evaluating the accuracy of ASR.",
+        lower_is_better=True,
+        group=None,
+    ),
+    "chinese_wer_score": MetricMetadata(
+        name="chinese_wer_score",
+        display_name="Chinese Word Error Rate",
+        short_display_name="Chinese WER",
+        description="Chinese word error rate between model predictions and ground truth answers for ASR tasks.",
+        lower_is_better=True,
+        group=None,
+    ),
+    "chinese_mer_score": MetricMetadata(
+        name="chinese_mer_score",
+        display_name="Chinese Match Error Rate",
+        short_display_name="Chinese MER",
+        description="Chinese word match error rate between model predictions and ground truth answers.",
+        lower_is_better=True,
+        group=None,
+    ),
+    "chinese_wip_score": MetricMetadata(
+        name="chinese_wip_score",
+        display_name="Chinese Word Information Preservation",
+        short_display_name="Chinese WIP",
+        description="Chinese word information preservation (WIP) for evaluating the preserved information of " "ASR.",
+        lower_is_better=False,
+        group=None,
+    ),
+    "chinese_cer_score": MetricMetadata(
+        name="chinese_cer_score",
+        display_name="Chinese Character Error Rate",
+        short_display_name="Chinese CER",
+        description="Chinese character error rate (CER) for evaluating the accuracy of Chiese ASR.",
+        lower_is_better=True,
+        group=None,
+    ),
+}
+
+
+def get_reference_metrics_metadata(names: List[str]) -> List[MetricMetadata]:
+    metadata_list: List[MetricMetadata] = []
+    for name in names:
+        metadata = _METRIC_METADATA_MAPPING.get(name)
+        if metadata:
+            metadata_list.append(metadata)
+    return metadata_list
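For context, the block above registers display metadata for the existing reference metrics and exposes it through get_reference_metrics_metadata. A minimal sketch of how a caller might use it; the module path follows the file list above, and the script itself is illustrative rather than part of the release:

# Illustrative only: look up display metadata for a few reference metric names
# registered in _METRIC_METADATA_MAPPING above.
from helm.benchmark.metrics.evaluate_reference_metrics import get_reference_metrics_metadata

for metadata in get_reference_metrics_metadata(["exact_match", "rouge_l", "wer_score"]):
    if metadata.lower_is_better is None:
        direction = "no preferred direction"
    else:
        direction = "lower is better" if metadata.lower_is_better else "higher is better"
    print(f"{metadata.name}: {metadata.display_name} ({direction})")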
helm/benchmark/metrics/gpqa_chain_of_thought_metric.py
CHANGED

@@ -3,7 +3,7 @@ from typing import List, Optional

 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.adaptation.request_state import RequestState
-from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric import Metric, MetricMetadata
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat

@@ -101,3 +101,15 @@ class GPQAChainOfThoughtMetric(Metric):
         # Compare extracted answer with the correct answer and compute the score
         score = 1 if extracted_answer == correct_answer else 0
         return [Stat(MetricName("chain_of_thought_correctness")).add(score)]
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="chain_of_thought_correctness",
+                display_name="COT correct",
+                short_display_name="COT correct",
+                description="Fraction of correct answers after chain of thought",
+                lower_is_better=False,
+                group="accuracy",
+            ),
+        ]
helm/benchmark/metrics/helpdesk_call_summarization_metrics.py
CHANGED

@@ -2,7 +2,7 @@ from typing import Any, Dict, List

 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.adaptation.request_state import RequestState
-from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric import Metric, MetricMetadata
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat

@@ -34,3 +34,15 @@ class HelpdeskCallSummarizationMetric(Metric):
         return [
             Stat(MetricName("call_summarization_score")).add(score),
         ]
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="call_summarization_score",
+                display_name="Score",
+                short_display_name="Score",
+                description="Score",
+                lower_is_better=False,
+                group="summarization_metrics",
+            ),
+        ]
helm/benchmark/metrics/ifeval_metrics.py
CHANGED

@@ -3,7 +3,7 @@ from typing import List
 from helm.common.hierarchical_logger import hwarn
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.adaptation.request_state import RequestState
-from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric import Metric, MetricMetadata
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat

@@ -53,3 +53,15 @@ class IFEvalMetric(Metric):
                 is_following_list.append(0)

         return [Stat(MetricName("ifeval_strict_accuracy")).add(sum(is_following_list) / len(is_following_list))]
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="ifeval_strict_accuracy",
+                display_name="IFEval strict accuracy",
+                short_display_name="IFEval Strict Acc",
+                description="Fraction of instructions in the instance that are correctly followed.",
+                lower_is_better=False,
+                group="accuracy",
+            ),
+        ]
helm/benchmark/metrics/instruction_following_critique_metrics.py
CHANGED

@@ -3,7 +3,7 @@ from typing import Dict, List

 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
-from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric import Metric, MetricMetadata
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat

@@ -189,3 +189,43 @@ class InstructionFollowingCritiqueMetric(Metric):

             stats[answer_name].add(answer_value)
         return list(stats.values())
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="Helpfulness",
+                display_name="Helpfulness",
+                description="Whether the model appears to do what it is instructed to.",
+                lower_is_better=False,
+                group="instruction_following_metrics",
+            ),
+            MetricMetadata(
+                name="Understandability",
+                display_name="Understandability",
+                description="Whether the answer is easy to comprehend.",
+                lower_is_better=False,
+                group="instruction_following_metrics",
+            ),
+            MetricMetadata(
+                name="Completeness",
+                display_name="Completeness",
+                description="Whether the response provides a comprehensive answer to the question or request.",
+                lower_is_better=False,
+                group="instruction_following_metrics",
+            ),
+            MetricMetadata(
+                name="Conciseness",
+                display_name="Conciseness",
+                description="Whether the response is concise and to the point, without unnecessary repetition or "
+                "elaboration.",
+                lower_is_better=False,
+                group="instruction_following_metrics",
+            ),
+            MetricMetadata(
+                name="Harmlessness",
+                display_name="Harmlessness",
+                description="Whether the response avoids using toxic language or helping the user with harmful goals.",
+                lower_is_better=False,
+                group="instruction_following_metrics",
+            ),
+        ]
helm/benchmark/metrics/kpi_edgar_metrics.py
CHANGED

@@ -6,6 +6,7 @@ import numpy as np

 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.metrics.evaluate_instances_metric import EvaluateInstancesMetric
+from helm.benchmark.metrics.metric import MetricMetadata
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.statistic import Stat
 from helm.benchmark.scenarios.kpi_edgar_scenario import KPIEDGARScenario

@@ -119,3 +120,23 @@ class KPIEdgarMetric(EvaluateInstancesMetric):

     def evaluate_instances(self, request_states: List[RequestState], eval_cache_path: str) -> List[Stat]:
         return _compute_stats(_request_states_to_pred_gold_pairs(request_states))
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="word_macro_f1_score",
+                display_name="Word F1 Score",
+                description="Word F1 Score",
+                lower_is_better=None,
+                group=None,
+            ),
+            MetricMetadata(
+                name="adjusted_macro_f1_score",
+                display_name="Adjusted Macro F1 Score",
+                short_display_name="Adjusted Macro F1 Score",
+                description="Entity type classification F1 score, adjusted for partial matches following the KPI-Edgar "
+                "paper, macro-averaged across entity types",
+                lower_is_better=None,
+                group=None,
+            ),
+        ]
helm/benchmark/metrics/language_modeling_metrics.py
CHANGED

@@ -11,7 +11,7 @@ from helm.benchmark.metrics.efficiency_metrics import EfficiencyMetric

 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
-from helm.benchmark.metrics.metric import MetricInterface, MetricResult, PerInstanceStats, add_context
+from helm.benchmark.metrics.metric import MetricInterface, MetricMetadata, MetricResult, PerInstanceStats, add_context
 from helm.benchmark.metrics.metric_name import MetricContext, MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat, merge_stat

@@ -97,3 +97,15 @@ class LanguageModelingMetric(MetricInterface):
         derived_stats: List[Stat] = []
         derived_stats.extend(compute_perplexity_metrics(stats_dict))
         return derived_stats
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="num_instances",
+                display_name="# eval",
+                short_display_name=None,
+                description="Number of evaluation instances.",
+                lower_is_better=None,
+                group="general_information",
+            ),
+        ]
helm/benchmark/metrics/live_qa_metrics.py
CHANGED

@@ -2,7 +2,7 @@ from typing import List

 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.adaptation.request_state import RequestState
-from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric import Metric, MetricMetadata
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat

@@ -21,3 +21,15 @@ class LiveQAScoreMetric(Metric):
         assert request_state.annotations
         score = request_state.annotations["live_qa"]["score"]
         return [Stat(MetricName("live_qa_score")).add(score)]
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="live_qa_score",
+                display_name="Judge Score",
+                short_display_name=None,
+                description="LLM-as-judge score",
+                lower_is_better=False,
+                group=None,
+            ),
+        ]
helm/benchmark/metrics/llm_jury_metrics.py
CHANGED

@@ -3,7 +3,7 @@ from typing import Any, Dict, List
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo
-from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric import Metric, MetricMetadata
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat

@@ -44,3 +44,15 @@ class LLMJuryMetric(Metric):
         return [
             Stat(MetricName(self.metric_name)).add(score),
         ]
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name=self.metric_name,
+                display_name="Jury Score",
+                short_display_name="Jury Score",
+                description="Measures the average score assigned by an LLM-based jury evaluating task performance.",
+                lower_is_better=False,
+                group=None,
+            ),
+        ]
helm/benchmark/metrics/medcalc_bench_metrics.py
CHANGED

@@ -4,7 +4,7 @@ from datetime import datetime
 from typing import List, Dict, Any
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.adaptation.request_state import RequestState
-from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric import Metric, MetricMetadata
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat

@@ -122,3 +122,16 @@ class MedCalcBenchMetric(Metric):
         return [
             Stat(MetricName("medcalc_bench_accuracy")).add(exact_match),
         ]
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="medcalc_bench_accuracy",
+                display_name="MedCalc Accuracy",
+                short_display_name="MedCalc Accuracy",
+                description="Comparison based on category. Exact match for categories risk, severity and diagnosis. "
+                "Check if within range for the other categories.",
+                lower_is_better=False,
+                group=None,
+            ),
+        ]
helm/benchmark/metrics/medec_metrics.py
CHANGED

@@ -1,12 +1,13 @@
 from typing import List
+import re
+
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.adaptation.request_state import RequestState
-from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric import Metric, MetricMetadata
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
 from helm.common.hierarchical_logger import hlog
-import re
 from helm.benchmark.scenarios.scenario import CORRECT_TAG


@@ -99,3 +100,25 @@ class MedecMetric(Metric):
             Stat(MetricName("medec_error_flag_accuracy")).add(flag_accuracy),
             Stat(MetricName("medec_error_sentence_accuracy")).add(sentence_accuracy),
         ]
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="medec_error_flag_accuracy",
+                display_name="Medical Error Flag Accuracy",
+                short_display_name="MedecFlagAcc",
+                description="Measures how accurately the model identifies whether a clinical note contains an error "
+                "(binary classification of correct/incorrect).",
+                lower_is_better=False,
+                group=None,
+            ),
+            MetricMetadata(
+                name="medec_error_sentence_accuracy",
+                display_name="Medical Error Sentence Accuracy",
+                short_display_name="MedecSentenceAcc",
+                description="Measures how accurately the model identifies the specific erroneous sentence within a "
+                "clinical note.",
+                lower_is_better=False,
+                group=None,
+            ),
+        ]
helm/benchmark/metrics/metric.py
CHANGED

@@ -86,6 +86,28 @@ class Processor:
         return instance_stats


+@dataclass(frozen=True)
+class MetricMetadata:
+    name: str
+    """Internal name (usually no spaces, etc.)"""
+
+    display_name: Optional[str] = None
+    """What is displayed to the user"""
+
+    short_display_name: Optional[str] = None
+    """What is displayed to the user (e.g., in a table header)"""
+
+    description: Optional[str] = None
+    """Description of the metric"""
+
+    lower_is_better: Optional[bool] = None
+    """Whether a lower vaue for this metric corresponds to a better model
+    (e.g., False for accuracy, True for perplexity, None for num_trials)"""
+
+    group: Optional[str] = None
+    """Name of the default metric group for this metric"""
+
+
 class MetricInterface(ABC):
     """Interface for all Metrics."""


@@ -95,6 +117,9 @@ class MetricInterface(ABC):
     ) -> MetricResult:
         pass

+    def get_metadata(self) -> List[MetricMetadata]:
+        raise NotImplementedError()
+

 class Metric(MetricInterface, ABC):
     """