crfm-helm 0.5.7__py3-none-any.whl → 0.5.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/METADATA +5 -77
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/RECORD +228 -197
- helm/benchmark/adaptation/adapter_spec.py +5 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
- helm/benchmark/annotation/aci_bench_annotator.py +11 -22
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
- helm/benchmark/annotation/dischargeme_annotator.py +11 -22
- helm/benchmark/annotation/med_dialog_annotator.py +11 -22
- helm/benchmark/annotation/medalign_annotator.py +11 -22
- helm/benchmark/annotation/medi_qa_annotator.py +11 -22
- helm/benchmark/annotation/medication_qa_annotator.py +11 -22
- helm/benchmark/annotation/mental_health_annotator.py +11 -22
- helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
- helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
- helm/benchmark/annotation/model_as_judge.py +23 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
- helm/benchmark/metrics/air_bench_metrics.py +3157 -1
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/basic_metrics.py +267 -2
- helm/benchmark/metrics/classification_metrics.py +19 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
- helm/benchmark/metrics/dry_run_metrics.py +30 -1
- helm/benchmark/metrics/efficiency_metrics.py +74 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +299 -0
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
- helm/benchmark/metrics/ifeval_metrics.py +13 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
- helm/benchmark/metrics/language_modeling_metrics.py +13 -1
- helm/benchmark/metrics/live_qa_metrics.py +13 -1
- helm/benchmark/metrics/llm_jury_metrics.py +13 -1
- helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
- helm/benchmark/metrics/medec_metrics.py +25 -2
- helm/benchmark/metrics/metric.py +25 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
- helm/benchmark/metrics/omni_math_metrics.py +13 -1
- helm/benchmark/metrics/seahelm_metrics.py +14 -1
- helm/benchmark/metrics/summac/model_summac.py +2 -2
- helm/benchmark/metrics/summarization_metrics.py +129 -1
- helm/benchmark/metrics/toxicity_metrics.py +31 -1
- helm/benchmark/metrics/wildbench_metrics.py +21 -1
- helm/benchmark/presentation/schema.py +5 -22
- helm/benchmark/presentation/summarize.py +180 -11
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/run_expander.py +4 -0
- helm/benchmark/run_specs/arabic_run_specs.py +134 -16
- helm/benchmark/run_specs/bluex_run_specs.py +1 -1
- helm/benchmark/run_specs/classic_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +2 -2
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +360 -50
- helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
- helm/benchmark/scenarios/air_bench_scenario.py +21 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +8 -4
- helm/benchmark/scenarios/aratrust_scenario.py +19 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
- helm/benchmark/scenarios/bbq_scenario.py +15 -0
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/bluex_scenario.py +6 -2
- helm/benchmark/scenarios/bold_scenario.py +15 -0
- helm/benchmark/scenarios/boolq_scenario.py +20 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
- helm/benchmark/scenarios/clear_scenario.py +23 -0
- helm/benchmark/scenarios/cleva_scenario.py +479 -0
- helm/benchmark/scenarios/code_scenario.py +28 -0
- helm/benchmark/scenarios/commonsense_scenario.py +26 -0
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
- helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
- helm/benchmark/scenarios/disinformation_scenario.py +22 -0
- helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
- helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
- helm/benchmark/scenarios/gpqa_scenario.py +18 -0
- helm/benchmark/scenarios/grammar_scenario.py +20 -1
- helm/benchmark/scenarios/gsm_scenario.py +15 -0
- helm/benchmark/scenarios/headqa_scenario.py +22 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
- helm/benchmark/scenarios/ice_scenario.py +21 -1
- helm/benchmark/scenarios/ifeval_scenario.py +18 -0
- helm/benchmark/scenarios/imdb_scenario.py +15 -0
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/legal_support_scenario.py +13 -0
- helm/benchmark/scenarios/legalbench_scenario.py +20 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
- helm/benchmark/scenarios/lextreme_scenario.py +11 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +26 -0
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
- helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
- helm/benchmark/scenarios/med_qa_scenario.py +14 -0
- helm/benchmark/scenarios/medalign_scenario.py +23 -0
- helm/benchmark/scenarios/medbullets_scenario.py +22 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
- helm/benchmark/scenarios/medec_scenario.py +23 -0
- helm/benchmark/scenarios/medhallu_scenario.py +23 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +23 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
- helm/benchmark/scenarios/mental_health_scenario.py +23 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +24 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
- helm/benchmark/scenarios/mmlu_scenario.py +15 -0
- helm/benchmark/scenarios/msmarco_scenario.py +30 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +20 -0
- helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
- helm/benchmark/scenarios/omni_math_scenario.py +18 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
- helm/benchmark/scenarios/quac_scenario.py +14 -0
- helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
- helm/benchmark/scenarios/raft_scenario.py +15 -0
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/scenario.py +31 -0
- helm/benchmark/scenarios/seahelm_scenario.py +348 -0
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
- helm/benchmark/scenarios/summarization_scenario.py +37 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +1 -1
- helm/benchmark/scenarios/test_bluex_scenario.py +2 -2
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/vicuna_scenario.py +21 -1
- helm/benchmark/scenarios/wikifact_scenario.py +20 -0
- helm/benchmark/scenarios/wildbench_scenario.py +18 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +12 -0
- helm/benchmark/static/schema_arabic.yaml +55 -12
- helm/benchmark/static/schema_long_context.yaml +17 -17
- helm/benchmark/static/schema_medhelm.yaml +36 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static_build/assets/index-671a5e06.js +10 -0
- helm/benchmark/static_build/assets/index-9352595e.css +1 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
- helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
- helm/clients/openai_client.py +31 -19
- helm/clients/openai_responses_client.py +27 -3
- helm/clients/openrouter_client.py +31 -0
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/together_client.py +48 -11
- helm/clients/vertexai_client.py +8 -2
- helm/config/model_deployments.yaml +75 -1
- helm/config/model_metadata.yaml +70 -2
- helm/config/tokenizer_configs.yaml +19 -1
- helm/proxy/example_queries.py +8 -8
- helm/proxy/server.py +2 -1
- helm/proxy/static/index.css +4 -0
- helm/proxy/static/index.js +7 -1
- helm/benchmark/metrics/aci_bench_metrics.py +0 -14
- helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
- helm/benchmark/metrics/dischargeme_metrics.py +0 -14
- helm/benchmark/metrics/med_dialog_metrics.py +0 -14
- helm/benchmark/metrics/medalign_metrics.py +0 -14
- helm/benchmark/metrics/medi_qa_metrics.py +0 -14
- helm/benchmark/metrics/medication_qa_metrics.py +0 -14
- helm/benchmark/metrics/mental_health_metrics.py +0 -14
- helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
- helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
- helm/benchmark/static_build/assets/index-b9779128.css +0 -1
- helm/benchmark/static_build/assets/index-e439d5e1.js +0 -10
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/top_level.txt +0 -0
@@ -1,7 +1,7 @@
 from typing import List
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.adaptation.request_state import RequestState
-from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric import Metric, MetricMetadata
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
@@ -94,3 +94,34 @@ class MIMICIVBillingCodeMetric(Metric):
             Stat(MetricName("mimiciv_billing_code_recall")).add(recall),
             Stat(MetricName("mimiciv_billing_code_f1")).add(f1),
         ]
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="mimiciv_billing_code_precision",
+                display_name="Precision for MIMIC Billing Codes",
+                short_display_name="MIMICBillingPre",
+                description="Measures the proportion of correctly predicted ICD codes among all ICD codes predicted by "
+                "the model.",
+                lower_is_better=False,
+                group=None,
+            ),
+            MetricMetadata(
+                name="mimiciv_billing_code_recall",
+                display_name="Recall for MIMIC Billing Codes",
+                short_display_name="MIMICBillingRec",
+                description="Measures the proportion of correctly predicted ICD codes among all ICD codes present in "
+                "the gold standard.",
+                lower_is_better=False,
+                group=None,
+            ),
+            MetricMetadata(
+                name="mimiciv_billing_code_f1",
+                display_name="F1 Score for MIMIC Billing Codes",
+                short_display_name="MIMICBillingF1",
+                description="Measures the harmonic mean of precision and recall for ICD codes, providing a balanced "
+                "evaluation of the model's performance.",
+                lower_is_better=False,
+                group=None,
+            ),
+        ]
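The hunks above (evidently helm/benchmark/metrics/mimiciv_billing_code_metrics.py, per the file list) show the pattern repeated across most metric modules in this release: each metric imports MetricMetadata and gains a get_metadata() hook describing its stats. The MetricMetadata type itself is introduced in helm/benchmark/metrics/metric.py (+25 -0), whose hunk is not expanded on this page; the sketch below is only an inference from the keyword arguments used above, not the actual definition.

# Hypothetical sketch of MetricMetadata; field names inferred from call sites.
from dataclasses import dataclass
from typing import Optional


@dataclass(frozen=True)
class MetricMetadata:
    name: str
    display_name: Optional[str] = None
    short_display_name: Optional[str] = None
    description: Optional[str] = None
    lower_is_better: Optional[bool] = None  # None where neither direction is "better"
    group: Optional[str] = None  # metric group used when auto-generating schemas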
@@ -2,7 +2,7 @@ from typing import Any, Dict, List
 
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.adaptation.request_state import RequestState
-from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric import Metric, MetricMetadata
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
@@ -30,3 +30,15 @@ class OmniMATHMetric(Metric):
         return [
             Stat(MetricName("omni_math_accuracy")).add(score),
         ]
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="omni_math_accuracy",
+                display_name="Omni-MATH Accuracy",
+                short_display_name="Acc",
+                description="Accuracy of the AI output judged by GPT-4.",
+                lower_is_better=False,
+                group="accuracy",
+            ),
+        ]
@@ -8,7 +8,7 @@ from sacrebleu.metrics import CHRF
 
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.adaptation.request_state import RequestState
-from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric import Metric, MetricMetadata
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
@@ -73,6 +73,19 @@ class SEAHELMMachineTranslationMetric(Metric):
 
         return result
 
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="chr_f_plus_plus",
+                display_name="ChrF++",
+                description="Character n-gram F-score with word n-gram order (ChrF++) [(Popovic, "
+                "2015)](https://aclanthology.org/W15-3049/). Code can be found "
+                "[here](https://github.com/mjpost/sacrebleu).",
+                lower_is_better=False,
+                group=None,
+            ),
+        ]
+
 
 class SEAHELMQAMetric(Metric):
     """SEAHELM QA Metrics
@@ -219,7 +219,7 @@ class SummaCConv(torch.nn.Module):
         imager_load_cache=True,
         agg="mean",
         norm_histo=False,
-        **kwargs
+        **kwargs,
     ):
         # `bins` should be `even%d` or `percentiles`
         assert nli_labels in ["e", "c", "n", "ec", "en", "cn", "ecn"], "Unrecognized nli_labels argument %s" % (
@@ -405,7 +405,7 @@ class SummaCZS:
         use_con=True,
         imager_load_cache=True,
         device="cuda",
-        **kwargs
+        **kwargs,
     ):
         assert op2 in ["min", "mean", "max"], "Unrecognized `op2`"
         assert op1 in ["max", "mean", "min"], "Unrecognized `op1`"
@@ -16,7 +16,7 @@ from helm.benchmark.metrics.evaluate_reference_metrics import get_rouge_function
 from helm.common.hierarchical_logger import hlog
 from helm.common.general import ensure_file_downloaded
 from helm.common.optional_dependencies import handle_module_not_found_error
-from helm.benchmark.metrics.metric import Metric, MetricResult
+from helm.benchmark.metrics.metric import Metric, MetricMetadata, MetricResult
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
@@ -240,6 +240,134 @@ class SummarizationMetric(Metric):
 
         return result
 
+    def get_metadata(self):
+        metadata: List[MetricMetadata] = [
+            MetricMetadata(
+                name="QAFactEval",
+                display_name="QAFactEval",
+                description="Faithfulness scores based on the SummaC method of [Laban et al. "
+                "(2022)](https://aclanthology.org/2022.tacl-1.10/).",
+                lower_is_better=False,
+                group="summarization_metrics",
+            ),
+            MetricMetadata(
+                name="summarization_coverage",
+                display_name="Coverage",
+                description="Extent to which the model-generated summaries are extractive fragments from the source "
+                "document [(Grusky et al., 2018)](https://aclanthology.org/N18-1065/).",
+                lower_is_better=None,
+                group="summarization_metrics",
+            ),
+            MetricMetadata(
+                name="summarization_density",
+                display_name="Density",
+                description="Extent to which the model-generated summaries are extractive summaries based on the "
+                "source document [(Grusky et al., 2018)](https://aclanthology.org/N18-1065/).",
+                lower_is_better=None,
+                group="summarization_metrics",
+            ),
+            MetricMetadata(
+                name="summarization_compression",
+                display_name="Compression",
+                description="Extent to which the model-generated summaries are compressed relative to the source "
+                "document [(Grusky et al., 2018)](https://aclanthology.org/N18-1065/).",
+                lower_is_better=None,
+                group="summarization_metrics",
+            ),
+            MetricMetadata(
+                name="rouge_1",
+                display_name="ROUGE-1",
+                short_display_name="ROUGE-1",
+                description="ROUGE-1",
+                lower_is_better=False,
+                group="summarization_metrics",
+            ),
+            MetricMetadata(
+                name="rouge-2",
+                display_name="ROUGE-2",
+                short_display_name="ROUGE-2",
+                description="ROUGE-2",
+                lower_is_better=False,
+                group="summarization_metrics",
+            ),
+            MetricMetadata(
+                name="rouge-l",
+                display_name="ROUGE-L",
+                short_display_name="ROUGE-L",
+                description="ROUGE-L",
+                lower_is_better=False,
+                group="summarization_metrics",
+            ),
+        ]
+        if self.humaneval is not None:
+            metadata.extend(
+                [
+                    MetricMetadata(
+                        name="HumanEval-faithfulness",
+                        display_name="HumanEval-faithfulness",
+                        description="Human evaluation score for faithfulness.",
+                        lower_is_better=False,
+                        group="summarization_metrics",
+                    ),
+                    MetricMetadata(
+                        name="HumanEval-relevance",
+                        display_name="HumanEval-relevance",
+                        description="Human evaluation score for relevance.",
+                        lower_is_better=False,
+                        group="summarization_metrics",
+                    ),
+                    MetricMetadata(
+                        name="HumanEval-coherence",
+                        display_name="HumanEval-coherence",
+                        description="Human evaluation score for coherence.",
+                        lower_is_better=False,
+                        group="summarization_metrics",
+                    ),
+                ]
+            )
+        if self.compute_faithfulness:
+            metadata.append(
+                MetricMetadata(
+                    name="summac",
+                    display_name="SummaC",
+                    description="Faithfulness scores based on the SummaC method of [Laban et al. "
+                    "(2022)](https://aclanthology.org/2022.tacl-1.10/).",
+                    lower_is_better=False,
+                    group="summarization_metrics",
+                )
+            )
+        if self.compute_bertscore:
+            metadata.extend(
+                [
+                    MetricMetadata(
+                        name="BERTScore-P",
+                        display_name="BERTScore (P)",
+                        description="Average BERTScore precision [(Zhang et al., "
+                        "2020)](https://openreview.net/pdf?id=SkeHuCVFDr) between model generation and reference "
+                        "summary.",
+                        lower_is_better=False,
+                        group=None,
+                    ),
+                    MetricMetadata(
+                        name="BERTScore-R",
+                        display_name="BERTScore (R)",
+                        description="Average BERTScore recall [(Zhang et al., 2020)](https://openreview.net/pdf?id=SkeHuCVFDr) "
+                        "between model generation and reference summary.",
+                        lower_is_better=False,
+                        group=None,
+                    ),
+                    MetricMetadata(
+                        name="BERTScore-F",
+                        display_name="BERTScore (F1)",
+                        description="Average BERTScore F1 [(Zhang et al., 2020)](https://openreview.net/pdf?id=SkeHuCVFDr) "
+                        "between model generation and reference summary.",
+                        lower_is_better=False,
+                        group="summarization_metrics",
+                    ),
+                ]
+            )
+        return metadata
+
 
 def _paired_bootstrap_test(treatment_list: list, control_list: list, nboot: int = 10000):
     """
@@ -6,7 +6,7 @@ from helm.common.hierarchical_logger import hlog, hwarn
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.clients.perspective_api_client import PerspectiveAPIClientCredentialsError
-from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric import Metric, MetricMetadata
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
@@ -89,3 +89,33 @@ class ToxicityMetric(Metric):
         ]
 
         return stats
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="expected_max_toxicity",
+                display_name="Average max toxicity",
+                short_display_name="Avg. max toxicity",
+                description="Average max toxicity score across all completions (based on the PerspectiveAPI toxicity "
+                "classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).",
+                lower_is_better=True,
+                group=None,
+            ),
+            MetricMetadata(
+                name="max_toxicity_probability",
+                display_name="max toxicity",
+                description="Fraction of instances where most toxic completion is above 0.5 toxicity (based on the "
+                "PerspectiveAPI toxicity classifier; [Gehman et al. "
+                "(2020)](https://aclanthology.org/2020.findings-emnlp.301/)).",
+                lower_is_better=True,
+                group=None,
+            ),
+            MetricMetadata(
+                name="toxic_frac",
+                display_name="Toxic fraction",
+                description="Fraction of model outputs that are toxic (based on the PerspectiveAPI toxicity "
+                "classifier).",
+                lower_is_better=True,
+                group="toxicity",
+            ),
+        ]
@@ -2,7 +2,7 @@ from typing import Any, Dict, List
 
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.adaptation.request_state import RequestState
-from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric import Metric, MetricMetadata
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
@@ -32,3 +32,23 @@ class WildBenchScoreMetric(Metric):
             Stat(MetricName("wildbench_score")).add(score),
             Stat(MetricName("wildbench_score_rescaled")).add(score_rescaled),
         ]
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="wildbench_score",
+                display_name="WildBench Score",
+                short_display_name="WB Score",
+                description="Score of the AI output judged by GPT-4o.",
+                lower_is_better=False,
+                group="accuracy",
+            ),
+            MetricMetadata(
+                name="wildbench_score_rescaled",
+                display_name="WildBench Score",
+                short_display_name="WB Score",
+                description="Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+                lower_is_better=False,
+                group="accuracy",
+            ),
+        ]
@@ -8,6 +8,7 @@ import mako.template
 import yaml
 import importlib_resources as resources
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import hlog
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.augmentations.perturbation_description import PERTURBATION_WORST
@@ -131,24 +132,6 @@ THIS_GROUP_ONLY = "this_group_only"
 NO_GROUPS = "no_groups"
 
 
-@dataclass(frozen=True)
-class TaxonomyInfo:
-    # Task (e.g., question answering)
-    task: Optional[str] = None
-
-    # Domain - genre (e.g., Wikipedia)
-    what: Optional[str] = None
-
-    # Domain - when it was written (e.g., 2010s)
-    when: Optional[str] = None
-
-    # Domain - demographics (e.g., web users)
-    who: Optional[str] = None
-
-    # Language (e.g., English)
-    language: Optional[str] = None
-
-
 @dataclass(frozen=True)
 class RunGroup(Field):
     """
@@ -216,16 +199,16 @@ class Schema:
     """Specifies information about what to display on the frontend."""
 
     # Information about each field
-    metrics: List[Field]
+    metrics: List[Field] = field(default_factory=list)
 
     # Information about each perturbation
-    perturbations: List[Field]
+    perturbations: List[Field] = field(default_factory=list)
 
     # Group the metrics
-    metric_groups: List[MetricGroup]
+    metric_groups: List[MetricGroup] = field(default_factory=list)
 
     # Group the scenarios
-    run_groups: List[RunGroup]
+    run_groups: List[RunGroup] = field(default_factory=list)
 
     # Adapter fields (e.g., temperature)
     # Automatically populated from the docstrings in the AdapterSpec class definition.
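With these default factories, an empty Schema() can be constructed when no schema file is supplied, which is what the auto-generation path in summarize.py (next hunks) relies on. A minimal sketch of that behavior, assuming the remaining Schema fields also carry defaults:

# Minimal sketch: Schema() now needs no arguments
# (see `read_schema(schema_path) if schema_path else Schema()` in the summarize.py hunks below).
from helm.benchmark.presentation.schema import Schema

schema = Schema()
assert schema.metrics == [] and schema.metric_groups == []
assert schema.perturbations == [] and schema.run_groups == []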
@@ -9,6 +9,7 @@ Usage:
 """
 
 import argparse
+import dataclasses
 import os
 import datetime
 import urllib.parse
@@ -31,18 +32,26 @@ from helm.common.general import (
 )
 from helm.common.codec import from_json
 from helm.common.hierarchical_logger import hlog, htrack, htrack_block, hwarn, setup_default_logging
-from helm.benchmark.scenarios.scenario import ScenarioSpec
+from helm.benchmark.scenarios.scenario import Scenario, ScenarioMetadata, ScenarioSpec, create_scenario
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.metrics.metric_name import MetricName
-from helm.benchmark.metrics.metric import
+from helm.benchmark.metrics.metric import (
+    MetricInterface,
+    MetricMetadata,
+    MetricSpec,
+    create_metric,
+    get_all_stats_by_name,
+)
 from helm.benchmark.metrics.statistic import Stat, merge_stat
 from helm.benchmark.run_spec import RunSpec
 from helm.benchmark.runner import LATEST_SYMLINK
 from helm.benchmark.presentation.table import Cell, HeaderCell, Table, Hyperlink, table_to_latex
 from helm.benchmark.presentation.schema import (
+    MetricGroup,
     MetricNameMatcher,
     RunGroup,
     Field,
+    Schema,
     read_schema,
     get_default_schema_path,
     BY_GROUP,
@@ -341,7 +350,7 @@ class Summarizer:
         release: Optional[str],
         suites: Optional[List[str]],
         suite: Optional[str],
-        schema_path: str,
+        schema_path: Optional[str],
         output_path: str,
         verbose: bool,
         num_threads: int,
@@ -376,10 +385,8 @@ class Summarizer:
         self.verbose: bool = verbose
         self.num_threads: int = num_threads
         self.allow_unknown_models: bool = allow_unknown_models
-
-
-
-        self.schema = read_schema(schema_path)
+        self.schema = read_schema(schema_path) if schema_path else Schema()
+        self.metric_metadata: List[MetricMetadata] = []
 
     def read_run(self, run_path: str) -> Run:
         """Load the `Run` object from `run_path`."""
@@ -426,6 +433,8 @@ class Summarizer:
 
     def read_runs_for_suite(self, suite, run_suite_path):
         """Load the runs in the run suite path."""
+        if not os.path.exists(run_suite_path):
+            raise Exception(f"Suite {suite} does not exist at {run_suite_path}")
         # run_suite_path can contain subdirectories that are not runs (e.g. eval_cache, groups)
         # so filter them out.
         run_dir_names = sorted(
@@ -509,6 +518,150 @@ class Summarizer:
             model_field_dicts.append(asdict_without_nones(model_field))
         return model_field_dicts
 
+    def get_metric_metadata(self) -> List[MetricMetadata]:
+        if self.metric_metadata:
+            return self.metric_metadata
+        metric_specs: List[MetricSpec] = []
+        for run in self.runs:
+            metric_specs.extend(run.run_spec.metric_specs)
+        metric_specs = list(set(metric_specs))
+        metric_name_to_metadata: Dict[str, MetricMetadata] = {}
+        for metric_spec in metric_specs:
+            try:
+                metric: MetricInterface = create_metric(metric_spec)
+                metric_metadata_list = metric.get_metadata()
+                for metric_metadata in metric_metadata_list:
+                    metric_name_to_metadata[metric_metadata.name] = metric_metadata
+            except NotImplementedError:
+                pass
+            except (ModuleNotFoundError, AttributeError, TypeError):
+                pass
+
+        run_stat_names: Set[str] = set()
+        for run in self.runs:
+            for stat in run.stats:
+                run_stat_names.add(stat.name.name)
+
+        metric_names_to_prune = set(metric_name_to_metadata.keys()) - run_stat_names
+        for metric_name_to_prune in metric_names_to_prune:
+            del metric_name_to_metadata[metric_name_to_prune]
+        self.metric_metadata = list(metric_name_to_metadata.values())
+        return self.metric_metadata
+
+    def metric_metadata_to_field(self, metric_metadata: MetricMetadata) -> Field:
+        return Field(
+            name=metric_metadata.name,
+            display_name=metric_metadata.display_name,
+            short_display_name=metric_metadata.short_display_name,
+            description=metric_metadata.description,
+            lower_is_better=metric_metadata.lower_is_better,
+        )
+
+    def auto_generate_metric_fields(self) -> List[Field]:
+        return [self.metric_metadata_to_field(metric_metadata) for metric_metadata in self.get_metric_metadata()]
+
+    def auto_generate_metric_groups(self) -> List[MetricGroup]:
+        metric_groups = [
+            MetricGroup(
+                name="main_metric",
+                display_name="Main Metric",
+                description="Main Metric",
+                metrics=[MetricNameMatcher(name="${main_name}", split="${main_split}")],
+            )
+        ]
+        metric_group_to_metrics: Dict[str, List[str]] = {}
+        for metric_metadata in self.metric_metadata:
+            if metric_metadata.group:
+                if metric_metadata.group not in metric_group_to_metrics:
+                    metric_group_to_metrics[metric_metadata.group] = []
+                metric_group_to_metrics[metric_metadata.group].append(metric_metadata.name)
+        for metric_group, metric_names in metric_group_to_metrics.items():
+            display_name = metric_group.replace("_", " ").capitalize()
+            metric_groups.append(
+                MetricGroup(
+                    name=metric_group,
+                    # TODO: Make display_name and description nicer
+                    display_name=display_name,
+                    description=display_name,
+                    aggregation_strategies=[],
+                    metrics=[
+                        MetricNameMatcher(name=metric_name, split="${main_split}") for metric_name in metric_names
+                    ],
+                )
+            )
+        return metric_groups
+
+    def get_scenario_metadata(self) -> List[ScenarioMetadata]:
+        scenario_specs = [run.run_spec.scenario_spec for run in self.runs]
+        scenario_specs = list(set(scenario_specs))
+        scenario_name_to_metadata: Dict[str, ScenarioMetadata] = {}
+        for scenario_spec in scenario_specs:
+            try:
+                scenario: Scenario = create_scenario(scenario_spec)
+                scenario_metadata = scenario.get_metadata()
+                scenario_name_to_metadata[scenario_metadata.name] = scenario_metadata
+            except NotImplementedError:
+                pass
+            except (ModuleNotFoundError, AttributeError, TypeError):
+                pass
+
+        run_groups: Set[str] = set()
+        for run in self.runs:
+            for run_group in run.run_spec.groups:
+                run_groups.add(run_group)
+
+        scenario_names_to_prune = set(scenario_name_to_metadata.keys()) - run_groups
+        for scenario_name_to_prune in scenario_names_to_prune:
+            del scenario_name_to_metadata[scenario_name_to_prune]
+        return list(scenario_name_to_metadata.values())
+
+    def scenario_metadata_to_run_group(self, scenario_metadata: ScenarioMetadata) -> RunGroup:
+        metric_group_names = [metric_group.name for metric_group in self.schema.metric_groups]
+        return RunGroup(
+            name=scenario_metadata.name,
+            display_name=scenario_metadata.display_name,
+            short_display_name=scenario_metadata.short_display_name,
+            description=scenario_metadata.description,
+            metric_groups=metric_group_names,
+            environment={
+                "main_name": scenario_metadata.main_metric,
+                "main_split": scenario_metadata.main_split,
+            },
+            taxonomy=scenario_metadata.taxonomy,
+        )
+
+    def auto_generate_all_scenarios_run_group(self) -> RunGroup:
+        return RunGroup(
+            name="all_scenarios",
+            display_name="All Scenarios",
+            description="All scenarios",
+            category="Scenario Groups",
+            subgroups=[run_group.name for run_group in self.schema.run_groups if len(run_group.subgroups) == 0],
+        )
+
+    def auto_generate_scenario_run_groups(self) -> List[RunGroup]:
+        return [
+            self.scenario_metadata_to_run_group(scenario_metadata) for scenario_metadata in self.get_scenario_metadata()
+        ]
+
+    def fix_up_schema(self) -> None:
+        # if not self.schema.run_groups:
+        if not self.schema.metrics:
+            self.schema = dataclasses.replace(self.schema, metrics=self.auto_generate_metric_fields())
+        # Can only auto-generate metric groups if metrics were also auto-generated
+        # because auto_generate_metric_groups() requires self.metric_metadata()
+        # which is populated by auto_generate_metric_fields()
+        if not self.schema.metric_groups:
+            self.schema = dataclasses.replace(self.schema, metric_groups=self.auto_generate_metric_groups())
+        if not any([len(run_group.subgroups) == 0 for run_group in self.schema.run_groups]):
+            self.schema = dataclasses.replace(
+                self.schema, run_groups=self.schema.run_groups + self.auto_generate_scenario_run_groups()
+            )
+        if not any([len(run_group.subgroups) > 0 for run_group in self.schema.run_groups]):
+            self.schema = dataclasses.replace(
+                self.schema, run_groups=[self.auto_generate_all_scenarios_run_group()] + self.schema.run_groups
+            )
+
     def write_schema(self) -> None:
         """Write the schema file to benchmark_output so the frontend knows about it."""
         # Manually add the model metadata to the schema.json, where the frontend expects it.
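The auto-generation methods above consume Scenario.get_metadata(), which returns a ScenarioMetadata record; that type is added in helm/benchmark/scenarios/scenario.py (+31 -0), whose hunk is not expanded on this page. The sketch below is only an inference from the attributes read in scenario_metadata_to_run_group(), not the actual definition.

# Hypothetical sketch of ScenarioMetadata; attributes inferred from usage above.
from dataclasses import dataclass
from typing import Optional

from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo


@dataclass(frozen=True)
class ScenarioMetadata:
    name: str
    display_name: Optional[str] = None
    short_display_name: Optional[str] = None
    description: Optional[str] = None
    main_metric: Optional[str] = None  # e.g. "omni_math_accuracy"
    main_split: Optional[str] = None  # e.g. "test"
    taxonomy: Optional[TaxonomyInfo] = None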
@@ -1070,7 +1223,8 @@ class Summarizer:
                 is_scenario_table=False,
                 aggregation_strategies=aggregate_strategies,
             )
-
+            if len(table.header) > 1:
+                tables.append(table)
         return tables
 
     def create_group_tables_by_subgroup(self, group: RunGroup) -> List[Table]:
@@ -1213,14 +1367,16 @@ class Summarizer:
         """Run the entire summarization pipeline."""
         self.read_runs()
         self.group_runs()
-        self.check_metrics_defined()
 
-        self.
+        ensure_directory_exists(self.run_release_path)
 
         # Must happen after self.read_runs()
         # because it uses self.runs
+        self.fix_up_schema()
+        self.check_metrics_defined()
         self.write_schema()
 
+        self.write_run_display_json(skip_completed)
         self.write_executive_summary()
         self.write_runs()
         self.write_run_specs()
@@ -1254,7 +1410,15 @@ def summarize(args):
     else:
         raise ValueError("Exactly one of --release or --suite must be specified.")
 
-    schema_path
+    schema_path: Optional[str]
+    if args.auto_generate_schema:
+        if args.schema_path:
+            raise ValueError("--schema-path must be unset if --auto-generate-schema is set")
+        schema_path = None
+    elif args.schema_path:
+        schema_path = args.schema_path
+    else:
+        schema_path = get_default_schema_path()
 
     register_builtin_configs_from_helm_package()
     register_configs_from_directory(args.local_path)
@@ -1346,6 +1510,11 @@ def main():
         default=None,
         help="PATH to a YAML file to customize logging",
     )
+    parser.add_argument(
+        "--auto-generate-schema",
+        action="store_true",
+        help="EXPERIMENTAL: Auto-generate schema",
+    )
     args = parser.parse_args()
     setup_default_logging(args.log_config)
     summarize(args)