crfm-helm 0.5.7__py3-none-any.whl → 0.5.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic.
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/METADATA +5 -77
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/RECORD +228 -197
- helm/benchmark/adaptation/adapter_spec.py +5 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
- helm/benchmark/annotation/aci_bench_annotator.py +11 -22
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
- helm/benchmark/annotation/dischargeme_annotator.py +11 -22
- helm/benchmark/annotation/med_dialog_annotator.py +11 -22
- helm/benchmark/annotation/medalign_annotator.py +11 -22
- helm/benchmark/annotation/medi_qa_annotator.py +11 -22
- helm/benchmark/annotation/medication_qa_annotator.py +11 -22
- helm/benchmark/annotation/mental_health_annotator.py +11 -22
- helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
- helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
- helm/benchmark/annotation/model_as_judge.py +23 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
- helm/benchmark/metrics/air_bench_metrics.py +3157 -1
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/basic_metrics.py +267 -2
- helm/benchmark/metrics/classification_metrics.py +19 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
- helm/benchmark/metrics/dry_run_metrics.py +30 -1
- helm/benchmark/metrics/efficiency_metrics.py +74 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +299 -0
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
- helm/benchmark/metrics/ifeval_metrics.py +13 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
- helm/benchmark/metrics/language_modeling_metrics.py +13 -1
- helm/benchmark/metrics/live_qa_metrics.py +13 -1
- helm/benchmark/metrics/llm_jury_metrics.py +13 -1
- helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
- helm/benchmark/metrics/medec_metrics.py +25 -2
- helm/benchmark/metrics/metric.py +25 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
- helm/benchmark/metrics/omni_math_metrics.py +13 -1
- helm/benchmark/metrics/seahelm_metrics.py +14 -1
- helm/benchmark/metrics/summac/model_summac.py +2 -2
- helm/benchmark/metrics/summarization_metrics.py +129 -1
- helm/benchmark/metrics/toxicity_metrics.py +31 -1
- helm/benchmark/metrics/wildbench_metrics.py +21 -1
- helm/benchmark/presentation/schema.py +5 -22
- helm/benchmark/presentation/summarize.py +180 -11
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/run_expander.py +4 -0
- helm/benchmark/run_specs/arabic_run_specs.py +134 -16
- helm/benchmark/run_specs/bluex_run_specs.py +1 -1
- helm/benchmark/run_specs/classic_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +2 -2
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +360 -50
- helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
- helm/benchmark/scenarios/air_bench_scenario.py +21 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +8 -4
- helm/benchmark/scenarios/aratrust_scenario.py +19 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
- helm/benchmark/scenarios/bbq_scenario.py +15 -0
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/bluex_scenario.py +6 -2
- helm/benchmark/scenarios/bold_scenario.py +15 -0
- helm/benchmark/scenarios/boolq_scenario.py +20 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
- helm/benchmark/scenarios/clear_scenario.py +23 -0
- helm/benchmark/scenarios/cleva_scenario.py +479 -0
- helm/benchmark/scenarios/code_scenario.py +28 -0
- helm/benchmark/scenarios/commonsense_scenario.py +26 -0
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
- helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
- helm/benchmark/scenarios/disinformation_scenario.py +22 -0
- helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
- helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
- helm/benchmark/scenarios/gpqa_scenario.py +18 -0
- helm/benchmark/scenarios/grammar_scenario.py +20 -1
- helm/benchmark/scenarios/gsm_scenario.py +15 -0
- helm/benchmark/scenarios/headqa_scenario.py +22 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
- helm/benchmark/scenarios/ice_scenario.py +21 -1
- helm/benchmark/scenarios/ifeval_scenario.py +18 -0
- helm/benchmark/scenarios/imdb_scenario.py +15 -0
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/legal_support_scenario.py +13 -0
- helm/benchmark/scenarios/legalbench_scenario.py +20 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
- helm/benchmark/scenarios/lextreme_scenario.py +11 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +26 -0
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
- helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
- helm/benchmark/scenarios/med_qa_scenario.py +14 -0
- helm/benchmark/scenarios/medalign_scenario.py +23 -0
- helm/benchmark/scenarios/medbullets_scenario.py +22 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
- helm/benchmark/scenarios/medec_scenario.py +23 -0
- helm/benchmark/scenarios/medhallu_scenario.py +23 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +23 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
- helm/benchmark/scenarios/mental_health_scenario.py +23 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +24 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
- helm/benchmark/scenarios/mmlu_scenario.py +15 -0
- helm/benchmark/scenarios/msmarco_scenario.py +30 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +20 -0
- helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
- helm/benchmark/scenarios/omni_math_scenario.py +18 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
- helm/benchmark/scenarios/quac_scenario.py +14 -0
- helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
- helm/benchmark/scenarios/raft_scenario.py +15 -0
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/scenario.py +31 -0
- helm/benchmark/scenarios/seahelm_scenario.py +348 -0
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
- helm/benchmark/scenarios/summarization_scenario.py +37 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +1 -1
- helm/benchmark/scenarios/test_bluex_scenario.py +2 -2
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/vicuna_scenario.py +21 -1
- helm/benchmark/scenarios/wikifact_scenario.py +20 -0
- helm/benchmark/scenarios/wildbench_scenario.py +18 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +12 -0
- helm/benchmark/static/schema_arabic.yaml +55 -12
- helm/benchmark/static/schema_long_context.yaml +17 -17
- helm/benchmark/static/schema_medhelm.yaml +36 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static_build/assets/index-671a5e06.js +10 -0
- helm/benchmark/static_build/assets/index-9352595e.css +1 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
- helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
- helm/clients/openai_client.py +31 -19
- helm/clients/openai_responses_client.py +27 -3
- helm/clients/openrouter_client.py +31 -0
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/together_client.py +48 -11
- helm/clients/vertexai_client.py +8 -2
- helm/config/model_deployments.yaml +75 -1
- helm/config/model_metadata.yaml +70 -2
- helm/config/tokenizer_configs.yaml +19 -1
- helm/proxy/example_queries.py +8 -8
- helm/proxy/server.py +2 -1
- helm/proxy/static/index.css +4 -0
- helm/proxy/static/index.js +7 -1
- helm/benchmark/metrics/aci_bench_metrics.py +0 -14
- helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
- helm/benchmark/metrics/dischargeme_metrics.py +0 -14
- helm/benchmark/metrics/med_dialog_metrics.py +0 -14
- helm/benchmark/metrics/medalign_metrics.py +0 -14
- helm/benchmark/metrics/medi_qa_metrics.py +0 -14
- helm/benchmark/metrics/medication_qa_metrics.py +0 -14
- helm/benchmark/metrics/mental_health_metrics.py +0 -14
- helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
- helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
- helm/benchmark/static_build/assets/index-b9779128.css +0 -1
- helm/benchmark/static_build/assets/index-e439d5e1.js +0 -10
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/top_level.txt +0 -0
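
Nearly all of the scenario changes listed above follow one pattern: each scenario module imports TaxonomyInfo and ScenarioMetadata and gains a get_metadata() method, as the hunks below show. The following minimal sketch only illustrates that pattern; ExampleScenario, its class attributes, and every field value are illustrative placeholders rather than code from this release.

from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
from helm.benchmark.scenarios.scenario import Scenario, ScenarioMetadata


class ExampleScenario(Scenario):
    """Hypothetical scenario used only to illustrate the new metadata hook."""

    name = "example_scenario"
    description = "Illustrative placeholder scenario."
    tags = ["example"]

    def get_metadata(self) -> ScenarioMetadata:
        # Same fields that the real scenarios in the hunks below populate.
        return ScenarioMetadata(
            name=self.name,
            display_name="Example Scenario",
            description=self.description,
            taxonomy=TaxonomyInfo(
                task="question answering",
                what="placeholder subject matter",
                when="n/a",
                who="n/a",
                language="English",
            ),
            main_metric="exact_match",
            main_split="test",
        )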
helm/benchmark/scenarios/ehrshot_scenario.py
@@ -7,6 +7,7 @@ from functools import partial
 from tqdm import tqdm
 from typing import Any, Dict, List, Optional, Mapping
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import check_file_exists, ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
@@ -16,6 +17,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Reference,
     Output,
+    ScenarioMetadata,
 )
 
 ##################################
@@ -1517,3 +1519,23 @@ class EHRSHOTScenario(Scenario):
         )
 
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="ehrshot",
+            display_name="EHRSHOT",
+            description="EHRSHOT is a benchmark designed to evaluate a model's ability to predict "
+            "future clinical events using structured EHR code sequences. Each instance "
+            "contains a patient's historical EHR data and a forward-looking clinical "
+            "question about whether a particular diagnosis, lab result, or hospital event "
+            "will occur [(Wornow et al., 2023)](https://arxiv.org/abs/2307.02028).",
+            taxonomy=TaxonomyInfo(
+                task="Classification",
+                what="Predict whether a medical event will occur in the future based " "on EHR codes",
+                when="Future prediction",
+                who="Clinician, Insurer",
+                language="English",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
helm/benchmark/scenarios/enem_challenge_scenario.py
@@ -2,6 +2,7 @@ from typing import List, Any
 from pathlib import Path
 from datasets import load_dataset
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -56,3 +58,20 @@ class ENEMChallengeScenario(Scenario):
             )
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="enem_challenge",
+            display_name="ENEM Challenge",
+            short_display_name=None,
+            description="ENEM Challenge",
+            taxonomy=TaxonomyInfo(
+                task="multiple-choice question answering",
+                what="general academic subjects",
+                when="between 2009 and 2023",
+                who="brazilian ministry of education",
+                language="Portuguese",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
helm/benchmark/scenarios/entity_data_imputation_scenario.py
@@ -3,6 +3,7 @@ import pandas as pd
 from pathlib import Path
 from typing import List, Tuple
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.hierarchical_logger import hlog
 from helm.common.general import ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
@@ -15,6 +16,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -160,3 +162,15 @@ class EntityDataImputationScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="entity_data_imputation",
+            display_name="Data imputation",
+            description="Scenario from [Mei et al. "
+            "(2021)](https://ieeexplore.ieee.org/document/9458712/) that tests the ability "
+            "to impute missing entities in a data table.",
+            taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )
helm/benchmark/scenarios/entity_matching_scenario.py
@@ -2,6 +2,7 @@ import pandas as pd
 from pathlib import Path
 from typing import Dict, List, Tuple
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.hierarchical_logger import hlog
 from helm.common.general import ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
@@ -14,6 +15,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 from helm.benchmark.scenarios.entity_matching_scenario_fixed_random_state import set_fixed_random_state_for_dataset
 
@@ -155,3 +157,15 @@ class EntityMatchingScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="entity_matching",
+            display_name="Entity matching",
+            description="Scenario from Magellan [(Konda et al., "
+            "2016)](https://dl.acm.org/doi/10.14778/3007263.3007314) that tests the ability "
+            "to determine if two entities match.",
+            taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )
helm/benchmark/scenarios/financial_phrasebank_scenario.py
@@ -2,6 +2,7 @@ import os
 import random
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -12,6 +13,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -92,3 +94,22 @@ Possible labels:\n1. positive\n2. neutral\n3. negative""" # noqa: E501
             )
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="financial_phrasebank",
+            display_name="Financial Phrasebank (Sentiment Classification)",
+            short_display_name=None,
+            description="A sentiment classification benchmark based on the dataset from Good Debt or "
+            "Bad Debt - Detecting Semantic Orientations in Economic Texts [(Malo et al., "
+            "2013)](https://arxiv.org/abs/1307.5336).",
+            taxonomy=TaxonomyInfo(
+                task="sentiment analysis",
+                what="phrases from financial news texts and company press releases",
+                when="before 2013",
+                who="annotators with adequate business education background",
+                language="English",
+            ),
+            main_metric="classification_weighted_f1",
+            main_split="test",
+        )
helm/benchmark/scenarios/gold_commodity_news_scenario.py
@@ -6,6 +6,7 @@ from typing import List
 
 import pandas as pd
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.runner import TRAIN_SPLIT
 from helm.common.general import ensure_directory_exists, ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
@@ -16,6 +17,7 @@ from helm.benchmark.scenarios.scenario import (
     Reference,
     Scenario,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -122,3 +124,22 @@ class GoldCommodityNewsScenario(Scenario):
         for train_index in train_indexes:
             instances[train_index] = dataclasses.replace(instances[train_index], split=TRAIN_SPLIT)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="gold_commodity_news",
+            display_name="Gold Commodity News",
+            short_display_name=None,
+            description="A classification benchmark based on a dataset of human-annotated gold "
+            "commodity news headlines ([Sinha & Khandait, "
+            "2019](https://arxiv.org/abs/2009.04202)).",
+            taxonomy=TaxonomyInfo(
+                task="text classification",
+                what="gold commodity news headlines",
+                when="2000-2019",
+                who="financial journalists",
+                language="English",
+            ),
+            main_metric="classification_weighted_f1",
+            main_split="test",
+        )
helm/benchmark/scenarios/gpqa_scenario.py
@@ -2,6 +2,7 @@ import datasets
 import os
 import random
 from typing import List
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_directory_exists
 
@@ -78,3 +80,19 @@ class GPQAScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name=self.name,
+            display_name="GPQA",
+            description=self.description,
+            main_metric="chain_of_thought_correctness",
+            main_split="test",
+            taxonomy=TaxonomyInfo(
+                task="question answering",
+                what="complex questions across various disciplines",
+                who="domain experts",
+                when="2024",
+                language="English",
+            ),
+        )
helm/benchmark/scenarios/grammar_scenario.py
@@ -1,6 +1,7 @@
 from typing import List
 
-from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT, ScenarioMetadata
 from helm.benchmark.scenarios.grammar import read_grammar, generate_derivations, Derivation, get_values, get_tags
 
 
@@ -41,3 +42,21 @@ class GrammarScenario(Scenario):
         instances: List[Instance] = list(map(derivation_to_instance, derivations))
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="grammar",
+            display_name="Best ChatGPT Prompts",
+            short_display_name="Best ChatGPT Prompts",
+            description="A list of “best ChatGPT prompts to power your workflow” summarized by "
+            "[GRIDFITI](https://gridfiti.com/best-chatgpt-prompts/).",
+            taxonomy=TaxonomyInfo(
+                task="open-ended instruction following",
+                what="Instructions for LLMs",
+                when="2023",
+                who="Gridfiti Staff",
+                language="English",
+            ),
+            main_metric="Helpfulness",
+            main_split="test",
+        )
helm/benchmark/scenarios/gsm_scenario.py
@@ -2,6 +2,7 @@ import json
 import os
 from typing import Dict, List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -12,6 +13,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -65,3 +67,16 @@ class GSM8KScenario(Scenario):
                 ),
             )
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="gsm",
+            display_name="GSM8K (Grade school math word problems)",
+            short_display_name="GSM8K",
+            description="The grade school math word problems dataset (GSM8K) for testing mathematical "
+            "reasoning on grade-school math problems [(Cobbe et al., "
+            "2021)](https://arxiv.org/pdf/2110.14168.pdf).",
+            taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+            main_metric="exact_match_indicator",
+            main_split="test",
+        )
helm/benchmark/scenarios/headqa_scenario.py
@@ -3,6 +3,7 @@ from typing import List, Optional
 
 from datasets import DatasetDict, load_dataset
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     TEST_SPLIT,
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
     Output,
     Reference,
     Scenario,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_directory_exists
 
@@ -134,3 +136,23 @@ class HeadQAScenario(Scenario):
         )
 
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="head_qa",
+            display_name="HeadQA",
+            description="HeadQA is a benchmark consisting of biomedical multiple-choice questions "
+            "intended to evaluate a model's medical knowledge and reasoning. Each instance "
+            "presents a clinical or scientific question with four answer options, requiring "
+            "the model to select the most appropriate answer [(Vilares et al., "
+            "2019)](https://arxiv.org/abs/1906.04701).",
+            taxonomy=TaxonomyInfo(
+                task="Question answering",
+                what="Medical knowledge testing",
+                when="Any",
+                who="Medical student, Researcher",
+                language="English",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py
@@ -2,11 +2,13 @@ import csv
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
     TEST_SPLIT,
     Input,
+    ScenarioMetadata,
 )
 
 
@@ -35,3 +37,14 @@ class HelpdeskCallSummarizationScenario(Scenario):
             instance = Instance(id=instance_id, input=input, references=[], split=TEST_SPLIT)
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="helpdesk_call_summarization",
+            display_name="Helpdesk Call summarization",
+            short_display_name=None,
+            description="Helpdesk Call summarization",
+            taxonomy=TaxonomyInfo(task="summarization", what="n/a", when="?", who="n/a", language="English"),
+            main_metric="unknown",
+            main_split="test",
+        )
helm/benchmark/scenarios/ice_scenario.py
@@ -4,9 +4,10 @@ from typing import List, Union
 from enum import Enum
 import pandas as pd
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.benchmark.scenarios.ice_scenario_pinned_file_order import listdir_with_pinned_file_order
-from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input
+from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input, ScenarioMetadata
 
 try:
     # pd.read_excel() uses xlrd
@@ -467,3 +468,22 @@ class ICEScenario(Scenario):
             instances.append(Instance(Input(text=t), references=[], split=TEST_SPLIT))
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="ice",
+            display_name="ICE (International Corpus of English)",
+            short_display_name="ICE",
+            description="The International Corpus of English (ICE) drawn from English speakers from "
+            "various places in the world, initiated by [Greenbaum "
+            "(1991)](https://www.cambridge.org/core/journals/english-today/article/abs/ice-the-international-corpus-of-english/47808205394C538393C3FD8E62E5E701).",
+            taxonomy=TaxonomyInfo(
+                task="language modeling",
+                what="?",
+                when="?",
+                who="?",
+                language="English varieties from different nations",
+            ),
+            main_metric="bits_per_byte",
+            main_split="test",
+        )
helm/benchmark/scenarios/ifeval_scenario.py
@@ -1,8 +1,10 @@
 import datasets
 import os
 from typing import List
+from helm.benchmark.presentation.schema import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
+    ScenarioMetadata,
     Instance,
     Input,
     TEST_SPLIT,
@@ -51,3 +53,19 @@ class IFEvalScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name=self.name,
+            display_name="IFEval",
+            description=self.description,
+            main_metric="ifeval_strict_accuracy",
+            main_split="test",
+            taxonomy=TaxonomyInfo(
+                task="instruction following",
+                what="verifiable general domain instruction following",
+                who="human annotators",
+                when="2023",
+                language="English",
+            ),
+        )
helm/benchmark/scenarios/imdb_scenario.py
@@ -1,6 +1,7 @@
 import os
 from typing import List, Dict, Optional
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
     VALID_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 from helm.benchmark.scenarios.imdb_scenario_pinned_file_order import listdir_with_pinned_file_order
 
@@ -143,3 +145,16 @@ class IMDBScenario(Scenario):
         for split in [TRAIN_SPLIT, VALID_SPLIT]:
             instances.extend(self.get_split_instances(target_path, split, contrast_map))
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="imdb",
+            display_name="IMDB",
+            description="The IMDB benchmark for sentiment analysis in movie review [(Maas et al., "
+            "2011)](https://aclanthology.org/P11-1015/).",
+            taxonomy=TaxonomyInfo(
+                task="sentiment analysis", what="movie reviews", when="?", who="?", language="English"
+            ),
+            main_metric="quasi_exact_match",
+            main_split="valid",
+        )
helm/benchmark/scenarios/koala_scenario.py
@@ -2,8 +2,9 @@ import json
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
-from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT, ScenarioMetadata
 
 
 class KoalaScenario(Scenario):
@@ -39,3 +40,22 @@ class KoalaScenario(Scenario):
             )
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="koala",
+            display_name="Koala test dataset",
+            short_display_name="Koala test dataset",
+            description="The test dataset from the [Koala "
+            "paper](https://bair.berkeley.edu/blog/2023/04/03/koala/) for evaluating "
+            "instruction-following models.",
+            taxonomy=TaxonomyInfo(
+                task="open-ended instruction following",
+                what="Instructions for LLMs",
+                when="Before 2023",
+                who="Web users",
+                language="English",
+            ),
+            main_metric="Helpfulness",
+            main_split="test",
+        )
helm/benchmark/scenarios/kpi_edgar_scenario.py
@@ -3,6 +3,7 @@ from typing import List, Dict
 import json
 import re
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -14,6 +15,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -149,3 +151,22 @@ class KPIEDGARScenario(Scenario):
         with open(target_path, "r") as f:
             raw_dataset = json.load(f)
         return KPIEDGARScenario.sentences_to_instances(KPIEDGARScenario.get_sentences(raw_dataset))
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="kpi_edgar",
+            display_name="KPI-EDGAR Financial Documents (Named Entity Recognition)",
+            short_display_name=None,
+            description="A named entity recognition beenchmark based on the paper KPI-EDGAR - A Novel "
+            "Dataset and Accompanying Metric for Relation Extraction from Financial "
+            "Documents [(Deußer et al., 2022)](https://arxiv.org/pdf/2210.09163.pdf).",
+            taxonomy=TaxonomyInfo(
+                task="named entity recognition",
+                what="financial reports",
+                when="before 2022",
+                who="financial experts",
+                language="English",
+            ),
+            main_metric="adjusted_macro_f1_score",
+            main_split="test",
+        )
helm/benchmark/scenarios/legal_contract_summarization_scenario.py
@@ -4,6 +4,7 @@ import json
 import re
 
 from typing import List
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     Input,
@@ -14,6 +15,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     CORRECT_TAG,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -127,3 +129,21 @@ class LegalContractSummarizationScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="legal_contract_summarization",
+            display_name="Legal Contract Summarization",
+            short_display_name=None,
+            description="Plain English Summarization of Contracts [(Manor et al., "
+            "2019)](https://aclanthology.org/W19-2201.pdf).",
+            taxonomy=TaxonomyInfo(
+                task="summarization",
+                what="legal contracts (e.g. terms of service, license agreements)",
+                when="before 2019",
+                who="lawyers",
+                language="English",
+            ),
+            main_metric="rouge_l",
+            main_split="test",
+        )
helm/benchmark/scenarios/legal_summarization_scenario.py
@@ -5,6 +5,7 @@ from typing import List, Optional, Any
 import datasets
 from datasets import load_dataset
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -15,6 +16,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 _ALL_LANGUAGES = {
@@ -205,3 +207,51 @@ class LegalSummarizationScenario(Scenario):
         )
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        if self.dataset_name == "BillSum":
+            return ScenarioMetadata(
+                name="billsum_legal_summarization",
+                display_name="BillSum",
+                description="The BillSum benchmark for legal text summarization ([Kornilova & Eidelmann, "
+                "2020](https://aclanthology.org/D19-5406/)).",
+                taxonomy=TaxonomyInfo(
+                    task="summarization", what="legal text from US bills", when=None, who="lawyers", language="English"
+                ),
+                main_metric="rouge_2",
+                main_split="test",
+            )
+        elif self.dataset_name == "MultiLexSum":
+            return ScenarioMetadata(
+                name="multilexsum_legal_summarization",
+                display_name="MultiLexSum",
+                description="The MultiLexSum benchmark for legal text summarization ([Shen et al., "
+                "2022](https://arxiv.org/abs/2206.10883)).",
+                taxonomy=TaxonomyInfo(
+                    task="summarization",
+                    what="legal text from US civil rights lawsuits",
+                    when=None,
+                    who="lawyers",
+                    language="English",
+                ),
+                main_metric="rouge_2",
+                main_split="test",
+            )
+        elif self.dataset_name == "EurLexSum":
+            return ScenarioMetadata(
+                name="eurlexsum_legal_summarization",
+                display_name="EurLexSum",
+                description="The EurLexSum benchmark for legal text summarization ([Aumiller et al., "
+                "2022](https://arxiv.org/abs/2210.13448)).",
+                taxonomy=TaxonomyInfo(
+                    task="summarization",
+                    what="legal text from EU legislation",
+                    when="1960 - 2020",
+                    who="lawyers",
+                    language="English",
+                ),
+                main_metric="rouge_2",
+                main_split="test",
+            )
+        else:
+            raise Exception(f"Unknown dataset {self.dataset_name}")