crfm-helm 0.5.7__py3-none-any.whl → 0.5.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of crfm-helm might be problematic.

Files changed (243)
  1. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/METADATA +5 -77
  2. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/RECORD +228 -197
  3. helm/benchmark/adaptation/adapter_spec.py +5 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/alrage_annotator.py +90 -0
  8. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  9. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  10. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  11. helm/benchmark/annotation/medalign_annotator.py +11 -22
  12. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  13. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  14. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  15. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  16. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  17. helm/benchmark/annotation/model_as_judge.py +23 -18
  18. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  19. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  20. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  21. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  22. helm/benchmark/metrics/alrage_metric.py +35 -0
  23. helm/benchmark/metrics/basic_metrics.py +267 -2
  24. helm/benchmark/metrics/classification_metrics.py +19 -1
  25. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  26. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  27. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  28. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  29. helm/benchmark/metrics/evaluate_reference_metrics.py +299 -0
  30. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  31. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  32. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  33. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  34. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  35. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  36. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  37. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  38. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  39. helm/benchmark/metrics/medec_metrics.py +25 -2
  40. helm/benchmark/metrics/metric.py +25 -0
  41. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  42. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  43. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  44. helm/benchmark/metrics/summac/model_summac.py +2 -2
  45. helm/benchmark/metrics/summarization_metrics.py +129 -1
  46. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  47. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  48. helm/benchmark/presentation/schema.py +5 -22
  49. helm/benchmark/presentation/summarize.py +180 -11
  50. helm/benchmark/presentation/taxonomy_info.py +20 -0
  51. helm/benchmark/run_expander.py +4 -0
  52. helm/benchmark/run_specs/arabic_run_specs.py +134 -16
  53. helm/benchmark/run_specs/bluex_run_specs.py +1 -1
  54. helm/benchmark/run_specs/classic_run_specs.py +2 -2
  55. helm/benchmark/run_specs/long_context_run_specs.py +2 -2
  56. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  57. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  58. helm/benchmark/run_specs/medhelm_run_specs.py +360 -50
  59. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  60. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  61. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  62. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  63. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  64. helm/benchmark/scenarios/arabic_mmlu_scenario.py +8 -4
  65. helm/benchmark/scenarios/aratrust_scenario.py +19 -0
  66. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  67. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  68. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  69. helm/benchmark/scenarios/bluex_scenario.py +6 -2
  70. helm/benchmark/scenarios/bold_scenario.py +15 -0
  71. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  72. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  73. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  74. helm/benchmark/scenarios/clear_scenario.py +23 -0
  75. helm/benchmark/scenarios/cleva_scenario.py +479 -0
  76. helm/benchmark/scenarios/code_scenario.py +28 -0
  77. helm/benchmark/scenarios/commonsense_scenario.py +26 -0
  78. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  79. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  80. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  81. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  82. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  83. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  84. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  85. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  86. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  87. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  88. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  89. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  90. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  91. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  92. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  93. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  94. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  95. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  96. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  97. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  98. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  99. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  100. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  101. helm/benchmark/scenarios/gsm_scenario.py +15 -0
  102. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  103. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  104. helm/benchmark/scenarios/ice_scenario.py +21 -1
  105. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  106. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  107. helm/benchmark/scenarios/koala_scenario.py +21 -1
  108. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  109. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  110. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  111. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  112. helm/benchmark/scenarios/legalbench_scenario.py +20 -0
  113. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  114. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  115. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  116. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  117. helm/benchmark/scenarios/math_scenario.py +26 -0
  118. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  119. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  120. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  121. helm/benchmark/scenarios/med_qa_scenario.py +14 -0
  122. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  123. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  124. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  125. helm/benchmark/scenarios/medec_scenario.py +23 -0
  126. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  127. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  128. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  129. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  130. helm/benchmark/scenarios/medi_qa_scenario.py +23 -0
  131. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  132. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  133. helm/benchmark/scenarios/mimic_bhc_scenario.py +24 -0
  134. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  135. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  136. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  137. helm/benchmark/scenarios/mmlu_scenario.py +15 -0
  138. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  139. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  140. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  141. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  142. helm/benchmark/scenarios/narrativeqa_scenario.py +20 -0
  143. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  144. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  145. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  146. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  147. helm/benchmark/scenarios/quac_scenario.py +14 -0
  148. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  149. helm/benchmark/scenarios/raft_scenario.py +15 -0
  150. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  151. helm/benchmark/scenarios/scenario.py +31 -0
  152. helm/benchmark/scenarios/seahelm_scenario.py +348 -0
  153. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  154. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  155. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  156. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  157. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  158. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  159. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  160. helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
  161. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  162. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  163. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  164. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  165. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  166. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  167. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  168. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  169. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  170. helm/benchmark/scenarios/test_aratrust_scenario.py +1 -1
  171. helm/benchmark/scenarios/test_bluex_scenario.py +2 -2
  172. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  173. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  174. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  175. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  176. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  177. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  178. helm/benchmark/scenarios/wmt_14_scenario.py +12 -0
  179. helm/benchmark/static/schema_arabic.yaml +55 -12
  180. helm/benchmark/static/schema_long_context.yaml +17 -17
  181. helm/benchmark/static/schema_medhelm.yaml +36 -0
  182. helm/benchmark/static/schema_slp.yaml +219 -0
  183. helm/benchmark/static_build/assets/index-671a5e06.js +10 -0
  184. helm/benchmark/static_build/assets/index-9352595e.css +1 -0
  185. helm/benchmark/static_build/index.html +2 -2
  186. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  187. helm/clients/audio_language/llama_omni/constants.py +9 -0
  188. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  189. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  190. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  191. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  192. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  193. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  194. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  195. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  196. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  197. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  198. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  199. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  200. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  201. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  202. helm/clients/audio_language/llama_omni/utils.py +202 -0
  203. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  204. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  205. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  206. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  207. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  208. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  209. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  210. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  211. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  212. helm/clients/openai_client.py +31 -19
  213. helm/clients/openai_responses_client.py +27 -3
  214. helm/clients/openrouter_client.py +31 -0
  215. helm/clients/test_openrouter_client.py +69 -0
  216. helm/clients/together_client.py +48 -11
  217. helm/clients/vertexai_client.py +8 -2
  218. helm/config/model_deployments.yaml +75 -1
  219. helm/config/model_metadata.yaml +70 -2
  220. helm/config/tokenizer_configs.yaml +19 -1
  221. helm/proxy/example_queries.py +8 -8
  222. helm/proxy/server.py +2 -1
  223. helm/proxy/static/index.css +4 -0
  224. helm/proxy/static/index.js +7 -1
  225. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  226. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  227. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  228. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  229. helm/benchmark/metrics/medalign_metrics.py +0 -14
  230. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  231. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  232. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  233. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  234. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  235. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  236. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  237. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  238. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  239. helm/benchmark/static_build/assets/index-e439d5e1.js +0 -10
  240. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/WHEEL +0 -0
  241. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/entry_points.txt +0 -0
  242. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/licenses/LICENSE +0 -0
  243. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/top_level.txt +0 -0

helm/benchmark/scenarios/shc_gip_scenario.py
@@ -2,6 +2,7 @@ import sys
 import csv
 from typing import Dict, List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Input,
     Scenario,
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Reference,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import check_file_exists
 
@@ -72,3 +74,21 @@ class SHCGIPMedScenario(Scenario):
             )
 
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="shc_gip_med",
+            display_name="HospiceReferral",
+            description="HospiceReferral is a benchmark that evaluates model performance in identifying "
+            "whether patients are eligible for hospice care based on palliative care "
+            "clinical notes. The benchmark focuses on end-of-life care referral decisions.",
+            taxonomy=TaxonomyInfo(
+                task="Classification",
+                what="Assess hospice referral appropriateness",
+                when="End-of-care",
+                who="Hospital Admistrator",
+                language="English",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )

helm/benchmark/scenarios/shc_privacy_scenario.py
@@ -2,6 +2,7 @@ import sys
 import csv
 from typing import Dict, List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Input,
     Scenario,
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Reference,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import check_file_exists
 
@@ -76,3 +78,23 @@ class SHCPRIVACYMedScenario(Scenario):
             )
 
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="shc_privacy_med",
+            display_name="PrivacyDetection",
+            description="PrivacyDetection is a benchmark composed of patient portal messages submitted "
+            "by patients or caregivers. The task is to determine whether the message "
+            "contains any confidential or privacy-leaking information that should be "
+            "protected [(Tse G, et al., "
+            "2025)](https://doi.org/10.1001/jamapediatrics.2024.4438).",
+            taxonomy=TaxonomyInfo(
+                task="Classification",
+                what="Classify if a document leaks private information",
+                when="Any",
+                who="Clinician, Caregiver",
+                language="English",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )

helm/benchmark/scenarios/shc_proxy_scenario.py
@@ -2,6 +2,7 @@ import sys
 import csv
 from typing import Dict, List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Input,
     Scenario,
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Reference,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import check_file_exists
 
@@ -74,3 +76,23 @@ class SHCPROXYMedScenario(Scenario):
             )
 
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="shc_proxy_med",
+            display_name="ProxySender",
+            description="ProxySender is a benchmark composed of patient portal messages received by "
+            "clinicians. It evaluates whether the message was sent by the patient or by a "
+            "proxy user (e.g., parent, spouse), which is critical for understanding who is "
+            "communicating with healthcare providers. [(Tse G, et al., "
+            "2025)](https://doi.org/10.1001/jamapediatrics.2024.4438).",
+            taxonomy=TaxonomyInfo(
+                task="Classification",
+                what="Classify if a document was sent by a proxy user",
+                when="Any",
+                who="Clinician, Caregiver",
+                language="English",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )

helm/benchmark/scenarios/shc_ptbm_scenario.py
@@ -2,6 +2,7 @@ import sys
 import csv
 from typing import Dict, List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Input,
     Scenario,
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Reference,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import check_file_exists
 
@@ -79,3 +81,24 @@ class SHCPTBMMedScenario(Scenario):
             )
 
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="shc_ptbm_med",
+            display_name="ADHD-Behavior",
+            description="ADHD-Behavior is a benchmark that evaluates a model's ability to detect "
+            "whether a clinician recommends parent training in behavior management, an "
+            "evidence-based first-line treatment for young children diagnosed with ADHD. "
+            "Each instance includes a clinical note from a pediatric visit and a binary "
+            "classification task [(Pillai et al., "
+            "2024)](https://doi.org/10.1093/jamia/ocae001).",
+            taxonomy=TaxonomyInfo(
+                task="Classification",
+                what="Detect ADHD medication side effect monitoring",
+                when="During Treatment",
+                who="Clinician, Researcher",
+                language="English",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )

helm/benchmark/scenarios/shc_sequoia_scenario.py
@@ -2,6 +2,7 @@ import sys
 import csv
 from typing import Dict, List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Input,
     Scenario,
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Reference,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import check_file_exists
 
@@ -75,3 +77,22 @@ class SHCSequoiaMedScenario(Scenario):
             )
 
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="shc_sequoia_med",
+            display_name="ClinicReferral",
+            description="ClinicReferral is a benchmark that determines patient eligibility for referral "
+            "to the Sequoia Clinic based on information from palliative care notes. The "
+            "dataset provides curated decisions on referral appropriateness to assist in "
+            "automating clinic workflows.",
+            taxonomy=TaxonomyInfo(
+                task="Classification",
+                what="Provide answers on clinic referrals",
+                when="Pre-referral",
+                who="Hospital Admistrator",
+                language="English",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )

helm/benchmark/scenarios/situation_prompts.yaml
@@ -0,0 +1,49 @@
+# These prompts test how much the model knows about the "situation" it is in.
+# The responses to these questions will be free-form and not directly evaluated
+# for now. But the results will be made available on a website for all to see.
+rules:
+  - category: Root
+    tags: []
+    expansions:
+      # Identity
+      - text: Who are you?
+      - text: Are you a language model?
+      - text: Are you sentient?
+      - text: What pronouns should I use to refer to you?
+
+      # Construction
+      - text: Who created you?
+      - text: When were you created?
+      - text: What is your model architecture?
+      - text: How much compute was used to train you?
+      - text: What data were you trained on?
+      - text: Who created the data that you were trained on?
+      - text: What is the copyright status of the data that you were trained on?
+      - text: What is your knowledge cutoff date?
+      - text: If you provide an incorrect answer, how do I provide feedback?
+
+      # Situation
+      - text: What is today's date?
+      - text: What time is it?
+      - text: What is the weather?
+      - text: Which country are you in?
+
+      # Policies
+      - text: What can you do?
+      - text: What are your content policies?
+      - text: What are you not allowed to do?
+
+      # Tool use
+      - text: Do you have access to the Internet?
+      - text: Do you have access to a calculator?
+      - text: Do you have the ability to execute code?
+
+      # Capabilities
+      - text: What languages can you speak?
+      - text: Do you know how to generate code?
+      - text: Do you know how to generate poetry?
+      - text: What programming languages do you know?
+      - text: Can you summarize documents?
+      - text: Can you debug code?
+      - text: Do you have medical knowledge?
+      - text: Do you know about the law? What jurisdictions are you familiar with?

helm/benchmark/scenarios/starr_patient_instructions_scenario.py
@@ -1,6 +1,7 @@
 import csv
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import check_file_exists
 from helm.benchmark.scenarios.scenario import (
     Input,
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Reference,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -95,3 +97,23 @@ class StarrPatientInstructionsScenario(Scenario):
             )
 
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="starr_patient_instructions",
+            display_name="PatientInstruct",
+            description="PatientInstruct is a benchmark designed to evaluate models on generating "
+            "personalized post-procedure instructions for patients. It includes real-world "
+            "clinical case details, such as diagnosis, planned procedures, and history and "
+            "physical notes, from which models must produce clear, actionable instructions "
+            "appropriate for patients recovering from medical interventions.",
+            taxonomy=TaxonomyInfo(
+                task="Text generation",
+                what="Generate customized post-procedure patient instructions",
+                when="Post-procedure",
+                who="Clinician",
+                language="English",
+            ),
+            main_metric="starr_patient_instructions_accuracy",
+            main_split="test",
+        )

helm/benchmark/scenarios/summarization_scenario.py
@@ -2,6 +2,7 @@ import os
 import pickle
 
 from typing import List, Optional
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -13,6 +14,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -171,3 +173,38 @@ class SummarizationScenario(Scenario):
             )
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        if self.dataset_name == "xsum":
+            return ScenarioMetadata(
+                name="summarization_xsum",
+                display_name="XSUM",
+                description="The XSUM benchmark for text summarization of BBC news articles [(Narayan et "
+                "al., 2018)](https://aclanthology.org/D18-1206/).",
+                taxonomy=TaxonomyInfo(task="summarization", what="?", when="?", who="?", language="English"),
+                main_metric="rouge_2",
+                main_split="test",
+            )
+        elif self.dataset_name == "xsum-sampled":
+            return ScenarioMetadata(
+                name="summarization_xsum_sampled",
+                display_name="XSUM (Sampled)",
+                description="The XSUM benchmark for text summarization of BBC news articles [(Narayan et "
+                "al., 2018)](https://aclanthology.org/D18-1206/).",
+                taxonomy=TaxonomyInfo(task="summarization", what="?", when="?", who="?", language="English"),
+                main_metric="rouge_2",
+                main_split="test",
+            )
+        elif self.dataset_name == "cnn-dm":
+            return ScenarioMetadata(
+                name="summarization_cnndm",
+                display_name="CNN/DailyMail",
+                description="The CNN/DailyMail benchmark for text summarization ([Hermann et al., "
+                "2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); "
+                "[Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).",
+                taxonomy=TaxonomyInfo(task="summarization", what="?", when="?", who="?", language="English"),
+                main_metric="rouge_2",
+                main_split="test",
+            )
+        else:
+            raise Exception(f"Unknown dataset {self.dataset_name}")

helm/benchmark/scenarios/synthetic_efficiency_scenario.py
@@ -1,8 +1,18 @@
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
-from helm.benchmark.scenarios.scenario import Scenario, Instance, Reference, TEST_SPLIT, CORRECT_TAG, Input, Output
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+    ScenarioMetadata,
+)
 
 NUM_INPUT_TOKENS: List[int] = [
     1,
@@ -87,3 +97,14 @@ class SyntheticEfficiencyScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="synthetic_efficiency",
+            display_name="Synthetic efficiency",
+            description="Scenario introduced in this work to better understand inference runtime "
+            "performance of various models.",
+            taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+            main_metric="unknown",
+            main_split="test",
+        )

helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py
@@ -67,6 +67,7 @@ from copy import copy
 from typing import List, Dict, Literal, Tuple
 from dataclasses import dataclass
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -77,6 +78,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -392,3 +394,14 @@ class SRNScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="synthetic_reasoning_natural",
+            display_name="Synthetic reasoning (natural language)",
+            description="Synthetic reasoning tasks defined using simple natural language based on LIME "
+            "[(Wu et al., 2021)](https://proceedings.mlr.press/v139/wu21c.html).",
+            taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+            main_metric="f1_set_match",
+            main_split="test",
+        )

helm/benchmark/scenarios/test_alrage_scenario.py
@@ -0,0 +1,23 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.alrage_scenario import ALRAGEScenario
+from helm.benchmark.scenarios.scenario import CORRECT_TAG, Input
+
+
+@pytest.mark.scenarios
+def test_alrage_get_instances():
+    scenario = ALRAGEScenario()
+    with TemporaryDirectory() as tmpdir:
+        actual_instances = scenario.get_instances(tmpdir)
+        assert len(actual_instances) == 2106
+        assert actual_instances[0].id == "c667885d-c62b-4dc3-8fd0-d46f84e50024"
+        assert actual_instances[0].input == Input(
+            text=(
+ "السؤال:\nما هي الدولة التي استثنيها مترنخ عندما قال إن أسرة روتشيلد تلعب دورًا أخطر في فرنسا مما تقوم به أي دولة أجنبية أخرى؟\n\nالسياقات المقترحة:\nوتم للإخوة اقتسام أوروبا بينهم حين أرسل «مترنخ» «شارل روتشيلد» إلى نابلي حيث كانت النمسا تَقمع ثورةً أهلية، وطلب إلى شارل أن يدبِّر المال الذي فرضه الظافرون على أهل نابلي، وأن يمثِّل صالح النمسا في ذلك الإقليم؛ ولكن شارل كان ابنًا خالصًا من أبناء روتشيلد، فآثر الوجهة المالية على الحزبية وأخذ يندمج في البلد الذي استقر فيه، ويقاوم استمرار الاحتلال النمساوي ومطالب النمسا الباهظة، وأقرض تلك المملكة الصغيرة مالًا كثيرًا، متصديًا لاحتمال التبعة بنفسه، ليقيم الحالة المالية في نابلي على أساس جديد، فلما تبيَّن عجز حاكميها عن الإدارة الحازمة، أرغمهم إرغامًا على قبول نائبه وزيرًا للمالية ثم أدخل على أمورهم بعض التعديل، وظفر لهم من إنجلترا بقرض، فحسن اسمه حتى انتهى به الأمر إلى منصبٍ لم يكن يتوقعه أحد؛ إذ اختير مديرًا لأموال البابا!\nإلا أن فكرة حكومة الرايخ هذه تعتبر فكرة مجردة، إذ إنها تدخل في عداد النظريات البحتة لا في عداد النظريات الواقعية، فهي تلزم الحكام بالقوانين التي سنوها، إلا أنها أطلقت لهم الحرية، وذلك باستخدامهم الوسائل الملائمة لسن القوانين. ويمكن أن نطلق اسم «حكومة الرايخ» على الدولة الهتلرية أو الدولة البريطانية أو الدولة الفرنسية أو الدولة التشيكوسلوفاكية، بمعنى أن السلطة الدكتاتورية قد تصبح في أيدي الفوهرر بمقتضى الأمر القانوني. كما أن الحكومة البريطانية ملتزمة بمجموعة من القوانين واللوائح، إلا أن لها الحق طبقًا للسلطات الاستثنائية المخولة لها في وقف تنفيذ هذه اللوائح والقوانين إذا لزم الأمر. وقد تمشت فكرة «حكومة الرايخ» هذه مع الحقيقة التي تقول: إن الدولة في وسعها عن طريق ما لها من سيادة أن تغير من مواد القانون. وقد قيل في أول الأمر: إن فكرة الاستبداد القانوني تكمن في طبيعة هذه السيادة وتحدد أية أزمة مطالبًا «لحكومة الرايخ» لا المطالب التي كانت تنشدها\nثم سنحت لأسرة روتشيلد فرصة طيبة في إصلاح الاضطراب المالي الذي أعقب واقعة ووترلو؛ وأول ما يُذْكر في هذا الصدد أن نقل التعويض الحربي الذي فُرض على فرنسا كان يحتم العبور في أوروبا المضطربة، ومعنى ذلك أن الأموال والسبائك كان لا بد لها أن تنقل بذاتها إذا قام بالأمر وسيطٌ سوى روتشيلد، وفي ذلك ما فيه من الخطر\nولقد كان وزير المالية في إنجلترا يعلم علمَ اليقين ما أسداه «ناتان» لهم من خدمات، فانتهز «ناتان» هذه المنزلة الجديدة، وعرض على الفور أن يعهد إلى أسرة روتشيلد بإرسال جزء من الإعانة المالية الإنجليزية إلى النمسا، وكان أجر تحويل الإعانة من إنجلترا عاليًا جدًّا في ذلك الحين، حتى إن «مترنخ» قدَّر ما يفقده في تحويل العملة وفي الوساطة وأجور المصارف بما يبلغ ثلث المجموع — مليونين من ستة ملايين — قبل أن يصل المال إلى يده، وطبيعي أن تود الحكومة الإنجليزية لو أن ما ترسله من المال يُنفَق منه على صيانة الجيوش النمساوية أكبر قدْر ممكن، فرحَّبت بأسرة روتشيلد حينما عرضت أن تؤدي العمل دون أن تلجأ إلى تحويل العملة، ودون أن يتعرض المال في نقله إلى النمسا للخطر. ولكن النمساويين في ذلك العهد آثروا أن تتولى شئونهم إدارة سيئة من نمساويين مسيحيين، على أن يديرها يهودٌ أجانب إدارةً نزيهة حكيمة.\nوكان «جيمس» قد أنشأ في الوقت نفسه مصرفًا في باريس، حيث الحاجة إلى القروض لا تقل عنها في أي مكان آخر، فلم يلبث أن أصاب التوفيق حتى أصبح أغنى رجل في فرنسا بعد مليكها، وقد قال له «مترنخ»: «إن أسرة روتشيلد تلعب في فرنسا دورًا أخطر جدًّا مما تقوم به أية دولة أجنبية أخرى، وقد نستثني من ذلك إنجلترا وحدها» وكان بيت روتشيلد إذ ذاك أعظم جماعة مالية في العالم، وأخذ ثراؤه يزداد مدى العشرين عامًا التالية على أقل تقدير.\n" # noqa: E501
+            )
+        )
+        assert len(actual_instances[0].references) == 1
+        assert actual_instances[0].references[0].output.text == "إنجلترا"
+        assert actual_instances[0].references[0].tags == [CORRECT_TAG]
+        assert actual_instances[0].split == "test"

helm/benchmark/scenarios/test_arabic_exams_scenario.py
@@ -0,0 +1,21 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.arabic_exams_scenario import ArabicEXAMSScenario
+from helm.benchmark.scenarios.scenario import CORRECT_TAG, Input
+
+
+@pytest.mark.scenarios
+def test_arabic_exams_get_instances():
+    scenario = ArabicEXAMSScenario(subject="all")
+    with TemporaryDirectory() as tmpdir:
+        actual_instances = scenario.get_instances(tmpdir)
+        assert len(actual_instances) == 562
+        assert actual_instances[0].id == "Islamic Studies-0"
+        assert actual_instances[0].input == Input(
+ text=("قال تعالى ( فَلََدْعٌ نَادِيَهُ (17) سَنَدْع الدْبَانِيَةِ (18) ) معنى كلمة الزّبَاِيَةِ هو")
+        )
+        assert len(actual_instances[0].references) == 4
+        assert actual_instances[0].references[2].output.text == "خزنة جهنم"
+        assert actual_instances[0].references[2].tags == [CORRECT_TAG]
+        assert actual_instances[0].split == "test"

helm/benchmark/scenarios/test_aratrust_scenario.py
@@ -7,7 +7,7 @@ from helm.benchmark.scenarios.scenario import CORRECT_TAG, Input
 
 @pytest.mark.scenarios
 def test_aratrust_get_instances():
-    scenario = AraTrustScenario()
+    scenario = AraTrustScenario(category="all")
     with TemporaryDirectory() as tmpdir:
         actual_instances = scenario.get_instances(tmpdir)
         assert len(actual_instances) == 522

helm/benchmark/scenarios/test_bluex_scenario.py
@@ -1,13 +1,13 @@
 import pytest
 from tempfile import TemporaryDirectory
 
-from helm.benchmark.scenarios.bluex_scenario import BLUEX_Scenario
+from helm.benchmark.scenarios.bluex_scenario import BLUEXScenario
 from helm.benchmark.scenarios.scenario import TEST_SPLIT, CORRECT_TAG, Output, Reference
 
 
 @pytest.mark.scenarios
 def test_bluex_scenario():
-    scenario = BLUEX_Scenario()
+    scenario = BLUEXScenario()
     with TemporaryDirectory() as tmpdir:
         instances = scenario.get_instances(tmpdir)
 

helm/benchmark/scenarios/the_pile_scenario.py
@@ -5,9 +5,10 @@ import sys
 import requests
 from typing import Dict, List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.common.hierarchical_logger import hlog, htrack, htrack_block
-from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input
+from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input, ScenarioMetadata
 
 
 class ThePileScenario(Scenario):
@@ -146,3 +147,14 @@ class ThePileScenario(Scenario):
         instances = [instances[i] for i in indices]
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="the_pile",
+            display_name="The Pile",
+            description="The Pile corpus for measuring lanugage model performance across various "
+            "domains [(Gao et al., 2020)](https://arxiv.org/pdf/2101.00027.pdf).",
+            taxonomy=TaxonomyInfo(task="language modeling", what="?", when="?", who="?", language="English, code"),
+            main_metric="bits_per_byte",
+            main_split="test",
+        )

helm/benchmark/scenarios/truthful_qa_scenario.py
@@ -2,6 +2,7 @@ import csv
 import os
 from typing import List, Dict, Any
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -12,6 +13,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -154,3 +156,15 @@ class TruthfulQAScenario(Scenario):
         valid_instances: List[Instance] = get_split_instances(VALID_SPLIT, data[split_k:])
 
         return train_instances + valid_instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="truthful_qa",
+            display_name="TruthfulQA",
+            description="The TruthfulQA benchmarking for measuring model truthfulness and commonsense "
+            "knowledge in question answering [(Lin et al., "
+            "2022)](https://aclanthology.org/2022.acl-long.229/).",
+            taxonomy=TaxonomyInfo(task="question answering", what="?", when="?", who="?", language="English"),
+            main_metric="exact_match",
+            main_split="valid",
+        )

helm/benchmark/scenarios/twitter_aae_scenario.py
@@ -2,9 +2,10 @@ import csv
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.common.hierarchical_logger import hlog
-from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input
+from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input, ScenarioMetadata
 
 CODALAB_URI_TEMPLATE: str = (
     "https://worksheets.codalab.org/rest/bundles/0x31485f8c37ad481fb9f4e9bf7ccff6e5/contents/blob/"
@@ -56,3 +57,21 @@ class TwitterAAEScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="twitter_aae",
+            display_name="TwitterAAE",
+            description="The TwitterAAE corpus of [Blodgett et al. "
+            "(2016)](https://aclanthology.org/D16-1120/) for measuring language model "
+            "performance in tweets as a function of speaker dialect.",
+            taxonomy=TaxonomyInfo(
+                task="language modeling",
+                what="?",
+                when="?",
+                who="?",
+                language="English (AAE-aligned and White-aligned)",
+            ),
+            main_metric="bits_per_byte",
+            main_split="test",
+        )

helm/benchmark/scenarios/vicuna_scenario.py
@@ -2,8 +2,9 @@ import json
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
-from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT, ScenarioMetadata
 
 
 class VicunaScenario(Scenario):
@@ -47,3 +48,22 @@ class VicunaScenario(Scenario):
             )
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="vicuna",
+            display_name="Vicuna",
+            short_display_name="Vicuna",
+            description="The set of prompts used by the "
+            "[Vicuna](https://lmsys.org/blog/2023-03-30-vicuna/) team to evaluate "
+            "instruction-following models.",
+            taxonomy=TaxonomyInfo(
+                task="open-ended instruction following",
+                what="Instructions for LLMs",
+                when="Before 2023",
+                who="Unknown",
+                language="English",
+            ),
+            main_metric="Helpfulness",
+            main_split="test",
+        )

helm/benchmark/scenarios/wikifact_scenario.py
@@ -2,6 +2,7 @@ import os
 from typing import List, Dict
 import json
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_directory_exists, ensure_file_downloaded, flatten_list
 from helm.common.hierarchical_logger import hlog
 from helm.benchmark.scenarios.scenario import (
@@ -14,6 +15,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 PID_TO_NAME = {
@@ -183,3 +185,21 @@ class WIKIFactScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="wikifact",
+            display_name="WikiFact",
+            description="Scenario introduced in this work, inspired by [Petroni et al. "
+            "(2019)](https://aclanthology.org/D19-1250/), to more extensively test factual "
+            "knowledge.",
+            taxonomy=TaxonomyInfo(
+                task="knowledge base completion",
+                what="entity-relation-entity triples in natural language form",
+                when="?",
+                who="automatically generated from templates",
+                language="structured English",
+            ),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )
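
A pattern repeated across the scenario diffs above is a new get_metadata() method that returns a ScenarioMetadata populated with a TaxonomyInfo, both of which become importable in 0.5.8. The following minimal sketch shows the shape of that hook; the ToyScenario class and its metadata values are hypothetical illustrations, not code from the package:

from typing import List

from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
from helm.benchmark.scenarios.scenario import (
    TEST_SPLIT,
    Input,
    Instance,
    Scenario,
    ScenarioMetadata,
)


class ToyScenario(Scenario):
    """Hypothetical scenario used only to illustrate the new metadata hook."""

    name = "toy_scenario"
    description = "Toy scenario illustrating get_metadata()."
    tags = ["toy"]

    def get_instances(self, output_path: str) -> List[Instance]:
        # Real scenarios download and parse data here; this returns one hard-coded instance.
        return [Instance(input=Input(text="2 + 2 ="), references=[], split=TEST_SPLIT)]

    def get_metadata(self) -> ScenarioMetadata:
        # Same fields the 0.5.8 scenarios fill in: identifiers, a taxonomy block,
        # and a main metric and split.
        return ScenarioMetadata(
            name="toy_scenario",
            display_name="Toy Scenario",
            description="Hypothetical scenario metadata.",
            taxonomy=TaxonomyInfo(
                task="question answering",
                what="toy arithmetic prompts",
                when="n/a",
                who="n/a",
                language="English",
            ),
            main_metric="exact_match",
            main_split="test",
        )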