crfm-helm 0.5.7__py3-none-any.whl → 0.5.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/METADATA +5 -77
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/RECORD +228 -197
- helm/benchmark/adaptation/adapter_spec.py +5 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
- helm/benchmark/annotation/aci_bench_annotator.py +11 -22
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
- helm/benchmark/annotation/dischargeme_annotator.py +11 -22
- helm/benchmark/annotation/med_dialog_annotator.py +11 -22
- helm/benchmark/annotation/medalign_annotator.py +11 -22
- helm/benchmark/annotation/medi_qa_annotator.py +11 -22
- helm/benchmark/annotation/medication_qa_annotator.py +11 -22
- helm/benchmark/annotation/mental_health_annotator.py +11 -22
- helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
- helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
- helm/benchmark/annotation/model_as_judge.py +23 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
- helm/benchmark/metrics/air_bench_metrics.py +3157 -1
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/basic_metrics.py +267 -2
- helm/benchmark/metrics/classification_metrics.py +19 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
- helm/benchmark/metrics/dry_run_metrics.py +30 -1
- helm/benchmark/metrics/efficiency_metrics.py +74 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +299 -0
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
- helm/benchmark/metrics/ifeval_metrics.py +13 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
- helm/benchmark/metrics/language_modeling_metrics.py +13 -1
- helm/benchmark/metrics/live_qa_metrics.py +13 -1
- helm/benchmark/metrics/llm_jury_metrics.py +13 -1
- helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
- helm/benchmark/metrics/medec_metrics.py +25 -2
- helm/benchmark/metrics/metric.py +25 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
- helm/benchmark/metrics/omni_math_metrics.py +13 -1
- helm/benchmark/metrics/seahelm_metrics.py +14 -1
- helm/benchmark/metrics/summac/model_summac.py +2 -2
- helm/benchmark/metrics/summarization_metrics.py +129 -1
- helm/benchmark/metrics/toxicity_metrics.py +31 -1
- helm/benchmark/metrics/wildbench_metrics.py +21 -1
- helm/benchmark/presentation/schema.py +5 -22
- helm/benchmark/presentation/summarize.py +180 -11
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/run_expander.py +4 -0
- helm/benchmark/run_specs/arabic_run_specs.py +134 -16
- helm/benchmark/run_specs/bluex_run_specs.py +1 -1
- helm/benchmark/run_specs/classic_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +2 -2
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +360 -50
- helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
- helm/benchmark/scenarios/air_bench_scenario.py +21 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +8 -4
- helm/benchmark/scenarios/aratrust_scenario.py +19 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
- helm/benchmark/scenarios/bbq_scenario.py +15 -0
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/bluex_scenario.py +6 -2
- helm/benchmark/scenarios/bold_scenario.py +15 -0
- helm/benchmark/scenarios/boolq_scenario.py +20 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
- helm/benchmark/scenarios/clear_scenario.py +23 -0
- helm/benchmark/scenarios/cleva_scenario.py +479 -0
- helm/benchmark/scenarios/code_scenario.py +28 -0
- helm/benchmark/scenarios/commonsense_scenario.py +26 -0
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
- helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
- helm/benchmark/scenarios/disinformation_scenario.py +22 -0
- helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
- helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
- helm/benchmark/scenarios/gpqa_scenario.py +18 -0
- helm/benchmark/scenarios/grammar_scenario.py +20 -1
- helm/benchmark/scenarios/gsm_scenario.py +15 -0
- helm/benchmark/scenarios/headqa_scenario.py +22 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
- helm/benchmark/scenarios/ice_scenario.py +21 -1
- helm/benchmark/scenarios/ifeval_scenario.py +18 -0
- helm/benchmark/scenarios/imdb_scenario.py +15 -0
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/legal_support_scenario.py +13 -0
- helm/benchmark/scenarios/legalbench_scenario.py +20 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
- helm/benchmark/scenarios/lextreme_scenario.py +11 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +26 -0
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
- helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
- helm/benchmark/scenarios/med_qa_scenario.py +14 -0
- helm/benchmark/scenarios/medalign_scenario.py +23 -0
- helm/benchmark/scenarios/medbullets_scenario.py +22 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
- helm/benchmark/scenarios/medec_scenario.py +23 -0
- helm/benchmark/scenarios/medhallu_scenario.py +23 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +23 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
- helm/benchmark/scenarios/mental_health_scenario.py +23 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +24 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
- helm/benchmark/scenarios/mmlu_scenario.py +15 -0
- helm/benchmark/scenarios/msmarco_scenario.py +30 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +20 -0
- helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
- helm/benchmark/scenarios/omni_math_scenario.py +18 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
- helm/benchmark/scenarios/quac_scenario.py +14 -0
- helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
- helm/benchmark/scenarios/raft_scenario.py +15 -0
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/scenario.py +31 -0
- helm/benchmark/scenarios/seahelm_scenario.py +348 -0
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
- helm/benchmark/scenarios/summarization_scenario.py +37 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +1 -1
- helm/benchmark/scenarios/test_bluex_scenario.py +2 -2
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/vicuna_scenario.py +21 -1
- helm/benchmark/scenarios/wikifact_scenario.py +20 -0
- helm/benchmark/scenarios/wildbench_scenario.py +18 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +12 -0
- helm/benchmark/static/schema_arabic.yaml +55 -12
- helm/benchmark/static/schema_long_context.yaml +17 -17
- helm/benchmark/static/schema_medhelm.yaml +36 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static_build/assets/index-671a5e06.js +10 -0
- helm/benchmark/static_build/assets/index-9352595e.css +1 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
- helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
- helm/clients/openai_client.py +31 -19
- helm/clients/openai_responses_client.py +27 -3
- helm/clients/openrouter_client.py +31 -0
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/together_client.py +48 -11
- helm/clients/vertexai_client.py +8 -2
- helm/config/model_deployments.yaml +75 -1
- helm/config/model_metadata.yaml +70 -2
- helm/config/tokenizer_configs.yaml +19 -1
- helm/proxy/example_queries.py +8 -8
- helm/proxy/server.py +2 -1
- helm/proxy/static/index.css +4 -0
- helm/proxy/static/index.js +7 -1
- helm/benchmark/metrics/aci_bench_metrics.py +0 -14
- helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
- helm/benchmark/metrics/dischargeme_metrics.py +0 -14
- helm/benchmark/metrics/med_dialog_metrics.py +0 -14
- helm/benchmark/metrics/medalign_metrics.py +0 -14
- helm/benchmark/metrics/medi_qa_metrics.py +0 -14
- helm/benchmark/metrics/medication_qa_metrics.py +0 -14
- helm/benchmark/metrics/mental_health_metrics.py +0 -14
- helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
- helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
- helm/benchmark/static_build/assets/index-b9779128.css +0 -1
- helm/benchmark/static_build/assets/index-e439d5e1.js +0 -10
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/top_level.txt +0 -0
@@ -1,7 +1,7 @@
 from typing import List
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.adaptation.request_state import RequestState
-from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric import Metric, MetricMetadata
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
@@ -94,3 +94,34 @@ class MIMICIVBillingCodeMetric(Metric):
             Stat(MetricName("mimiciv_billing_code_recall")).add(recall),
             Stat(MetricName("mimiciv_billing_code_f1")).add(f1),
         ]
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="mimiciv_billing_code_precision",
+                display_name="Precision for MIMIC Billing Codes",
+                short_display_name="MIMICBillingPre",
+                description="Measures the proportion of correctly predicted ICD codes among all ICD codes predicted by "
+                "the model.",
+                lower_is_better=False,
+                group=None,
+            ),
+            MetricMetadata(
+                name="mimiciv_billing_code_recall",
+                display_name="Recall for MIMIC Billing Codes",
+                short_display_name="MIMICBillingRec",
+                description="Measures the proportion of correctly predicted ICD codes among all ICD codes present in "
+                "the gold standard.",
+                lower_is_better=False,
+                group=None,
+            ),
+            MetricMetadata(
+                name="mimiciv_billing_code_f1",
+                display_name="F1 Score for MIMIC Billing Codes",
+                short_display_name="MIMICBillingF1",
+                description="Measures the harmonic mean of precision and recall for ICD codes, providing a balanced "
+                "evaluation of the model's performance.",
+                lower_is_better=False,
+                group=None,
+            ),
+        ]
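The hunks above (evidently helm/benchmark/metrics/mimiciv_billing_code_metrics.py, per the file list) show the pattern repeated across most metric modules in this release: each metric imports MetricMetadata and gains a get_metadata() hook describing its stats. The MetricMetadata type itself is introduced in helm/benchmark/metrics/metric.py (+25 -0), whose hunk is not expanded on this page; the sketch below is only an inference from the keyword arguments used above, not the actual definition.

# Hypothetical sketch of MetricMetadata; field names inferred from call sites.
from dataclasses import dataclass
from typing import Optional


@dataclass(frozen=True)
class MetricMetadata:
    name: str
    display_name: Optional[str] = None
    short_display_name: Optional[str] = None
    description: Optional[str] = None
    lower_is_better: Optional[bool] = None  # None where neither direction is "better"
    group: Optional[str] = None  # metric group used when auto-generating schemas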
@@ -2,7 +2,7 @@ from typing import Any, Dict, List
 
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.adaptation.request_state import RequestState
-from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric import Metric, MetricMetadata
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
@@ -30,3 +30,15 @@ class OmniMATHMetric(Metric):
         return [
             Stat(MetricName("omni_math_accuracy")).add(score),
         ]
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="omni_math_accuracy",
+                display_name="Omni-MATH Accuracy",
+                short_display_name="Acc",
+                description="Accuracy of the AI output judged by GPT-4.",
+                lower_is_better=False,
+                group="accuracy",
+            ),
+        ]
@@ -8,7 +8,7 @@ from sacrebleu.metrics import CHRF
 
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.adaptation.request_state import RequestState
-from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric import Metric, MetricMetadata
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
@@ -73,6 +73,19 @@ class SEAHELMMachineTranslationMetric(Metric):
 
         return result
 
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="chr_f_plus_plus",
+                display_name="ChrF++",
+                description="Character n-gram F-score with word n-gram order (ChrF++) [(Popovic, "
+                "2015)](https://aclanthology.org/W15-3049/). Code can be found "
+                "[here](https://github.com/mjpost/sacrebleu).",
+                lower_is_better=False,
+                group=None,
+            ),
+        ]
+
 
 class SEAHELMQAMetric(Metric):
     """SEAHELM QA Metrics
@@ -219,7 +219,7 @@ class SummaCConv(torch.nn.Module):
         imager_load_cache=True,
         agg="mean",
         norm_histo=False,
-        **kwargs
+        **kwargs,
     ):
         # `bins` should be `even%d` or `percentiles`
         assert nli_labels in ["e", "c", "n", "ec", "en", "cn", "ecn"], "Unrecognized nli_labels argument %s" % (
@@ -405,7 +405,7 @@ class SummaCZS:
         use_con=True,
         imager_load_cache=True,
         device="cuda",
-        **kwargs
+        **kwargs,
     ):
         assert op2 in ["min", "mean", "max"], "Unrecognized `op2`"
         assert op1 in ["max", "mean", "min"], "Unrecognized `op1`"
@@ -16,7 +16,7 @@ from helm.benchmark.metrics.evaluate_reference_metrics import get_rouge_function
 from helm.common.hierarchical_logger import hlog
 from helm.common.general import ensure_file_downloaded
 from helm.common.optional_dependencies import handle_module_not_found_error
-from helm.benchmark.metrics.metric import Metric, MetricResult
+from helm.benchmark.metrics.metric import Metric, MetricMetadata, MetricResult
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
@@ -240,6 +240,134 @@ class SummarizationMetric(Metric):
 
         return result
 
+    def get_metadata(self):
+        metadata: List[MetricMetadata] = [
+            MetricMetadata(
+                name="QAFactEval",
+                display_name="QAFactEval",
+                description="Faithfulness scores based on the SummaC method of [Laban et al. "
+                "(2022)](https://aclanthology.org/2022.tacl-1.10/).",
+                lower_is_better=False,
+                group="summarization_metrics",
+            ),
+            MetricMetadata(
+                name="summarization_coverage",
+                display_name="Coverage",
+                description="Extent to which the model-generated summaries are extractive fragments from the source "
+                "document [(Grusky et al., 2018)](https://aclanthology.org/N18-1065/).",
+                lower_is_better=None,
+                group="summarization_metrics",
+            ),
+            MetricMetadata(
+                name="summarization_density",
+                display_name="Density",
+                description="Extent to which the model-generated summaries are extractive summaries based on the "
+                "source document [(Grusky et al., 2018)](https://aclanthology.org/N18-1065/).",
+                lower_is_better=None,
+                group="summarization_metrics",
+            ),
+            MetricMetadata(
+                name="summarization_compression",
+                display_name="Compression",
+                description="Extent to which the model-generated summaries are compressed relative to the source "
+                "document [(Grusky et al., 2018)](https://aclanthology.org/N18-1065/).",
+                lower_is_better=None,
+                group="summarization_metrics",
+            ),
+            MetricMetadata(
+                name="rouge_1",
+                display_name="ROUGE-1",
+                short_display_name="ROUGE-1",
+                description="ROUGE-1",
+                lower_is_better=False,
+                group="summarization_metrics",
+            ),
+            MetricMetadata(
+                name="rouge-2",
+                display_name="ROUGE-2",
+                short_display_name="ROUGE-2",
+                description="ROUGE-2",
+                lower_is_better=False,
+                group="summarization_metrics",
+            ),
+            MetricMetadata(
+                name="rouge-l",
+                display_name="ROUGE-L",
+                short_display_name="ROUGE-L",
+                description="ROUGE-L",
+                lower_is_better=False,
+                group="summarization_metrics",
+            ),
+        ]
+        if self.humaneval is not None:
+            metadata.extend(
+                [
+                    MetricMetadata(
+                        name="HumanEval-faithfulness",
+                        display_name="HumanEval-faithfulness",
+                        description="Human evaluation score for faithfulness.",
+                        lower_is_better=False,
+                        group="summarization_metrics",
+                    ),
+                    MetricMetadata(
+                        name="HumanEval-relevance",
+                        display_name="HumanEval-relevance",
+                        description="Human evaluation score for relevance.",
+                        lower_is_better=False,
+                        group="summarization_metrics",
+                    ),
+                    MetricMetadata(
+                        name="HumanEval-coherence",
+                        display_name="HumanEval-coherence",
+                        description="Human evaluation score for coherence.",
+                        lower_is_better=False,
+                        group="summarization_metrics",
+                    ),
+                ]
+            )
+        if self.compute_faithfulness:
+            metadata.append(
+                MetricMetadata(
+                    name="summac",
+                    display_name="SummaC",
+                    description="Faithfulness scores based on the SummaC method of [Laban et al. "
+                    "(2022)](https://aclanthology.org/2022.tacl-1.10/).",
+                    lower_is_better=False,
+                    group="summarization_metrics",
+                )
+            )
+        if self.compute_bertscore:
+            metadata.extend(
+                [
+                    MetricMetadata(
+                        name="BERTScore-P",
+                        display_name="BERTScore (P)",
+                        description="Average BERTScore precision [(Zhang et al., "
+                        "2020)](https://openreview.net/pdf?id=SkeHuCVFDr) between model generation and reference "
+                        "summary.",
+                        lower_is_better=False,
+                        group=None,
+                    ),
+                    MetricMetadata(
+                        name="BERTScore-R",
+                        display_name="BERTScore (R)",
+                        description="Average BERTScore recall [(Zhang et al., 2020)](https://openreview.net/pdf?id=SkeHuCVFDr) "
+                        "between model generation and reference summary.",
+                        lower_is_better=False,
+                        group=None,
+                    ),
+                    MetricMetadata(
+                        name="BERTScore-F",
+                        display_name="BERTScore (F1)",
+                        description="Average BERTScore F1 [(Zhang et al., 2020)](https://openreview.net/pdf?id=SkeHuCVFDr) "
+                        "between model generation and reference summary.",
+                        lower_is_better=False,
+                        group="summarization_metrics",
+                    ),
+                ]
+            )
+        return metadata
+
 
 def _paired_bootstrap_test(treatment_list: list, control_list: list, nboot: int = 10000):
     """
@@ -6,7 +6,7 @@ from helm.common.hierarchical_logger import hlog, hwarn
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.clients.perspective_api_client import PerspectiveAPIClientCredentialsError
-from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric import Metric, MetricMetadata
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
@@ -89,3 +89,33 @@ class ToxicityMetric(Metric):
         ]
 
         return stats
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="expected_max_toxicity",
+                display_name="Average max toxicity",
+                short_display_name="Avg. max toxicity",
+                description="Average max toxicity score across all completions (based on the PerspectiveAPI toxicity "
+                "classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).",
+                lower_is_better=True,
+                group=None,
+            ),
+            MetricMetadata(
+                name="max_toxicity_probability",
+                display_name="max toxicity",
+                description="Fraction of instances where most toxic completion is above 0.5 toxicity (based on the "
+                "PerspectiveAPI toxicity classifier; [Gehman et al. "
+                "(2020)](https://aclanthology.org/2020.findings-emnlp.301/)).",
+                lower_is_better=True,
+                group=None,
+            ),
+            MetricMetadata(
+                name="toxic_frac",
+                display_name="Toxic fraction",
+                description="Fraction of model outputs that are toxic (based on the PerspectiveAPI toxicity "
+                "classifier).",
+                lower_is_better=True,
+                group="toxicity",
+            ),
+        ]
@@ -2,7 +2,7 @@ from typing import Any, Dict, List
 
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.adaptation.request_state import RequestState
-from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric import Metric, MetricMetadata
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
@@ -32,3 +32,23 @@ class WildBenchScoreMetric(Metric):
             Stat(MetricName("wildbench_score")).add(score),
             Stat(MetricName("wildbench_score_rescaled")).add(score_rescaled),
         ]
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="wildbench_score",
+                display_name="WildBench Score",
+                short_display_name="WB Score",
+                description="Score of the AI output judged by GPT-4o.",
+                lower_is_better=False,
+                group="accuracy",
+            ),
+            MetricMetadata(
+                name="wildbench_score_rescaled",
+                display_name="WildBench Score",
+                short_display_name="WB Score",
+                description="Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+                lower_is_better=False,
+                group="accuracy",
+            ),
+        ]
@@ -8,6 +8,7 @@ import mako.template
 import yaml
 import importlib_resources as resources
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import hlog
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.augmentations.perturbation_description import PERTURBATION_WORST
@@ -131,24 +132,6 @@ THIS_GROUP_ONLY = "this_group_only"
 NO_GROUPS = "no_groups"
 
 
-@dataclass(frozen=True)
-class TaxonomyInfo:
-    # Task (e.g., question answering)
-    task: Optional[str] = None
-
-    # Domain - genre (e.g., Wikipedia)
-    what: Optional[str] = None
-
-    # Domain - when it was written (e.g., 2010s)
-    when: Optional[str] = None
-
-    # Domain - demographics (e.g., web users)
-    who: Optional[str] = None
-
-    # Language (e.g., English)
-    language: Optional[str] = None
-
-
 @dataclass(frozen=True)
 class RunGroup(Field):
     """
@@ -216,16 +199,16 @@ class Schema:
     """Specifies information about what to display on the frontend."""
 
     # Information about each field
-    metrics: List[Field]
+    metrics: List[Field] = field(default_factory=list)
 
     # Information about each perturbation
-    perturbations: List[Field]
+    perturbations: List[Field] = field(default_factory=list)
 
     # Group the metrics
-    metric_groups: List[MetricGroup]
+    metric_groups: List[MetricGroup] = field(default_factory=list)
 
     # Group the scenarios
-    run_groups: List[RunGroup]
+    run_groups: List[RunGroup] = field(default_factory=list)
 
     # Adapter fields (e.g., temperature)
     # Automatically populated from the docstrings in the AdapterSpec class definition.
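With these default factories, an empty Schema() can be constructed when no schema file is supplied, which is what the auto-generation path in summarize.py (next hunks) relies on. A minimal sketch of that behavior, assuming the remaining Schema fields also carry defaults:

# Minimal sketch: Schema() now needs no arguments
# (see `read_schema(schema_path) if schema_path else Schema()` in the summarize.py hunks below).
from helm.benchmark.presentation.schema import Schema

schema = Schema()
assert schema.metrics == [] and schema.metric_groups == []
assert schema.perturbations == [] and schema.run_groups == []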
@@ -9,6 +9,7 @@ Usage:
 """
 
 import argparse
+import dataclasses
 import os
 import datetime
 import urllib.parse
@@ -31,18 +32,26 @@ from helm.common.general import (
 )
 from helm.common.codec import from_json
 from helm.common.hierarchical_logger import hlog, htrack, htrack_block, hwarn, setup_default_logging
-from helm.benchmark.scenarios.scenario import ScenarioSpec
+from helm.benchmark.scenarios.scenario import Scenario, ScenarioMetadata, ScenarioSpec, create_scenario
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.metrics.metric_name import MetricName
-from helm.benchmark.metrics.metric import
+from helm.benchmark.metrics.metric import (
+    MetricInterface,
+    MetricMetadata,
+    MetricSpec,
+    create_metric,
+    get_all_stats_by_name,
+)
 from helm.benchmark.metrics.statistic import Stat, merge_stat
 from helm.benchmark.run_spec import RunSpec
 from helm.benchmark.runner import LATEST_SYMLINK
 from helm.benchmark.presentation.table import Cell, HeaderCell, Table, Hyperlink, table_to_latex
 from helm.benchmark.presentation.schema import (
+    MetricGroup,
     MetricNameMatcher,
     RunGroup,
     Field,
+    Schema,
     read_schema,
     get_default_schema_path,
     BY_GROUP,
@@ -341,7 +350,7 @@ class Summarizer:
         release: Optional[str],
         suites: Optional[List[str]],
         suite: Optional[str],
-        schema_path: str,
+        schema_path: Optional[str],
         output_path: str,
         verbose: bool,
         num_threads: int,
@@ -376,10 +385,8 @@ class Summarizer:
         self.verbose: bool = verbose
         self.num_threads: int = num_threads
         self.allow_unknown_models: bool = allow_unknown_models
-
-
-
-        self.schema = read_schema(schema_path)
+        self.schema = read_schema(schema_path) if schema_path else Schema()
+        self.metric_metadata: List[MetricMetadata] = []
 
     def read_run(self, run_path: str) -> Run:
         """Load the `Run` object from `run_path`."""
@@ -426,6 +433,8 @@ class Summarizer:
 
     def read_runs_for_suite(self, suite, run_suite_path):
         """Load the runs in the run suite path."""
+        if not os.path.exists(run_suite_path):
+            raise Exception(f"Suite {suite} does not exist at {run_suite_path}")
         # run_suite_path can contain subdirectories that are not runs (e.g. eval_cache, groups)
         # so filter them out.
         run_dir_names = sorted(
@@ -509,6 +518,150 @@ class Summarizer:
             model_field_dicts.append(asdict_without_nones(model_field))
         return model_field_dicts
 
+    def get_metric_metadata(self) -> List[MetricMetadata]:
+        if self.metric_metadata:
+            return self.metric_metadata
+        metric_specs: List[MetricSpec] = []
+        for run in self.runs:
+            metric_specs.extend(run.run_spec.metric_specs)
+        metric_specs = list(set(metric_specs))
+        metric_name_to_metadata: Dict[str, MetricMetadata] = {}
+        for metric_spec in metric_specs:
+            try:
+                metric: MetricInterface = create_metric(metric_spec)
+                metric_metadata_list = metric.get_metadata()
+                for metric_metadata in metric_metadata_list:
+                    metric_name_to_metadata[metric_metadata.name] = metric_metadata
+            except NotImplementedError:
+                pass
+            except (ModuleNotFoundError, AttributeError, TypeError):
+                pass
+
+        run_stat_names: Set[str] = set()
+        for run in self.runs:
+            for stat in run.stats:
+                run_stat_names.add(stat.name.name)
+
+        metric_names_to_prune = set(metric_name_to_metadata.keys()) - run_stat_names
+        for metric_name_to_prune in metric_names_to_prune:
+            del metric_name_to_metadata[metric_name_to_prune]
+        self.metric_metadata = list(metric_name_to_metadata.values())
+        return self.metric_metadata
+
+    def metric_metadata_to_field(self, metric_metadata: MetricMetadata) -> Field:
+        return Field(
+            name=metric_metadata.name,
+            display_name=metric_metadata.display_name,
+            short_display_name=metric_metadata.short_display_name,
+            description=metric_metadata.description,
+            lower_is_better=metric_metadata.lower_is_better,
+        )
+
+    def auto_generate_metric_fields(self) -> List[Field]:
+        return [self.metric_metadata_to_field(metric_metadata) for metric_metadata in self.get_metric_metadata()]
+
+    def auto_generate_metric_groups(self) -> List[MetricGroup]:
+        metric_groups = [
+            MetricGroup(
+                name="main_metric",
+                display_name="Main Metric",
+                description="Main Metric",
+                metrics=[MetricNameMatcher(name="${main_name}", split="${main_split}")],
+            )
+        ]
+        metric_group_to_metrics: Dict[str, List[str]] = {}
+        for metric_metadata in self.metric_metadata:
+            if metric_metadata.group:
+                if metric_metadata.group not in metric_group_to_metrics:
+                    metric_group_to_metrics[metric_metadata.group] = []
+                metric_group_to_metrics[metric_metadata.group].append(metric_metadata.name)
+        for metric_group, metric_names in metric_group_to_metrics.items():
+            display_name = metric_group.replace("_", " ").capitalize()
+            metric_groups.append(
+                MetricGroup(
+                    name=metric_group,
+                    # TODO: Make display_name and description nicer
+                    display_name=display_name,
+                    description=display_name,
+                    aggregation_strategies=[],
+                    metrics=[
+                        MetricNameMatcher(name=metric_name, split="${main_split}") for metric_name in metric_names
+                    ],
+                )
+            )
+        return metric_groups
+
+    def get_scenario_metadata(self) -> List[ScenarioMetadata]:
+        scenario_specs = [run.run_spec.scenario_spec for run in self.runs]
+        scenario_specs = list(set(scenario_specs))
+        scenario_name_to_metadata: Dict[str, ScenarioMetadata] = {}
+        for scenario_spec in scenario_specs:
+            try:
+                scenario: Scenario = create_scenario(scenario_spec)
+                scenario_metadata = scenario.get_metadata()
+                scenario_name_to_metadata[scenario_metadata.name] = scenario_metadata
+            except NotImplementedError:
+                pass
+            except (ModuleNotFoundError, AttributeError, TypeError):
+                pass
+
+        run_groups: Set[str] = set()
+        for run in self.runs:
+            for run_group in run.run_spec.groups:
+                run_groups.add(run_group)
+
+        scenario_names_to_prune = set(scenario_name_to_metadata.keys()) - run_groups
+        for scenario_name_to_prune in scenario_names_to_prune:
+            del scenario_name_to_metadata[scenario_name_to_prune]
+        return list(scenario_name_to_metadata.values())
+
+    def scenario_metadata_to_run_group(self, scenario_metadata: ScenarioMetadata) -> RunGroup:
+        metric_group_names = [metric_group.name for metric_group in self.schema.metric_groups]
+        return RunGroup(
+            name=scenario_metadata.name,
+            display_name=scenario_metadata.display_name,
+            short_display_name=scenario_metadata.short_display_name,
+            description=scenario_metadata.description,
+            metric_groups=metric_group_names,
+            environment={
+                "main_name": scenario_metadata.main_metric,
+                "main_split": scenario_metadata.main_split,
+            },
+            taxonomy=scenario_metadata.taxonomy,
+        )
+
+    def auto_generate_all_scenarios_run_group(self) -> RunGroup:
+        return RunGroup(
+            name="all_scenarios",
+            display_name="All Scenarios",
+            description="All scenarios",
+            category="Scenario Groups",
+            subgroups=[run_group.name for run_group in self.schema.run_groups if len(run_group.subgroups) == 0],
+        )
+
+    def auto_generate_scenario_run_groups(self) -> List[RunGroup]:
+        return [
+            self.scenario_metadata_to_run_group(scenario_metadata) for scenario_metadata in self.get_scenario_metadata()
+        ]
+
+    def fix_up_schema(self) -> None:
+        # if not self.schema.run_groups:
+        if not self.schema.metrics:
+            self.schema = dataclasses.replace(self.schema, metrics=self.auto_generate_metric_fields())
+        # Can only auto-generate metric groups if metrics were also auto-generated
+        # because auto_generate_metric_groups() requires self.metric_metadata()
+        # which is populated by auto_generate_metric_fields()
+        if not self.schema.metric_groups:
+            self.schema = dataclasses.replace(self.schema, metric_groups=self.auto_generate_metric_groups())
+        if not any([len(run_group.subgroups) == 0 for run_group in self.schema.run_groups]):
+            self.schema = dataclasses.replace(
+                self.schema, run_groups=self.schema.run_groups + self.auto_generate_scenario_run_groups()
+            )
+        if not any([len(run_group.subgroups) > 0 for run_group in self.schema.run_groups]):
+            self.schema = dataclasses.replace(
+                self.schema, run_groups=[self.auto_generate_all_scenarios_run_group()] + self.schema.run_groups
+            )
+
     def write_schema(self) -> None:
         """Write the schema file to benchmark_output so the frontend knows about it."""
         # Manually add the model metadata to the schema.json, where the frontend expects it.
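The auto-generation methods above consume Scenario.get_metadata(), which returns a ScenarioMetadata record; that type is added in helm/benchmark/scenarios/scenario.py (+31 -0), whose hunk is not expanded on this page. The sketch below is only an inference from the attributes read in scenario_metadata_to_run_group(), not the actual definition.

# Hypothetical sketch of ScenarioMetadata; attributes inferred from usage above.
from dataclasses import dataclass
from typing import Optional

from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo


@dataclass(frozen=True)
class ScenarioMetadata:
    name: str
    display_name: Optional[str] = None
    short_display_name: Optional[str] = None
    description: Optional[str] = None
    main_metric: Optional[str] = None  # e.g. "omni_math_accuracy"
    main_split: Optional[str] = None  # e.g. "test"
    taxonomy: Optional[TaxonomyInfo] = None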
@@ -1070,7 +1223,8 @@ class Summarizer:
                 is_scenario_table=False,
                 aggregation_strategies=aggregate_strategies,
             )
-
+            if len(table.header) > 1:
+                tables.append(table)
         return tables
 
     def create_group_tables_by_subgroup(self, group: RunGroup) -> List[Table]:
@@ -1213,14 +1367,16 @@ class Summarizer:
         """Run the entire summarization pipeline."""
         self.read_runs()
         self.group_runs()
-        self.check_metrics_defined()
 
-        self.
+        ensure_directory_exists(self.run_release_path)
 
         # Must happen after self.read_runs()
         # because it uses self.runs
+        self.fix_up_schema()
+        self.check_metrics_defined()
         self.write_schema()
 
+        self.write_run_display_json(skip_completed)
         self.write_executive_summary()
         self.write_runs()
         self.write_run_specs()
@@ -1254,7 +1410,15 @@ def summarize(args):
     else:
         raise ValueError("Exactly one of --release or --suite must be specified.")
 
-    schema_path
+    schema_path: Optional[str]
+    if args.auto_generate_schema:
+        if args.schema_path:
+            raise ValueError("--schema-path must be unset if --auto-generate-schema is set")
+        schema_path = None
+    elif args.schema_path:
+        schema_path = args.schema_path
+    else:
+        schema_path = get_default_schema_path()
 
     register_builtin_configs_from_helm_package()
     register_configs_from_directory(args.local_path)
@@ -1346,6 +1510,11 @@ def main():
         default=None,
         help="PATH to a YAML file to customize logging",
     )
+    parser.add_argument(
+        "--auto-generate-schema",
+        action="store_true",
+        help="EXPERIMENTAL: Auto-generate schema",
+    )
     args = parser.parse_args()
     setup_default_logging(args.log_config)
     summarize(args)