crfm-helm 0.5.6__py3-none-any.whl → 0.5.8__py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (311)
  1. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/METADATA +60 -125
  2. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/RECORD +293 -229
  3. helm/benchmark/adaptation/adapter_spec.py +5 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  8. helm/benchmark/annotation/alrage_annotator.py +90 -0
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  10. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  11. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  12. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  13. helm/benchmark/annotation/medalign_annotator.py +11 -22
  14. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  15. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  16. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  17. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  18. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  19. helm/benchmark/annotation/model_as_judge.py +23 -18
  20. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  21. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  22. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  23. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  24. helm/benchmark/metrics/alrage_metric.py +35 -0
  25. helm/benchmark/metrics/basic_metrics.py +267 -2
  26. helm/benchmark/metrics/classification_metrics.py +19 -1
  27. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  28. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  29. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  30. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  31. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  32. helm/benchmark/metrics/comet_metric.py +1 -1
  33. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  34. helm/benchmark/metrics/copyright_metrics.py +1 -1
  35. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  36. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  37. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  38. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  39. helm/benchmark/metrics/evaluate_reference_metrics.py +300 -1
  40. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  41. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  42. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  43. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  44. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  45. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  46. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  47. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  48. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  49. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  50. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  51. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  52. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  53. helm/benchmark/metrics/medec_metrics.py +25 -2
  54. helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
  55. helm/benchmark/metrics/metric.py +25 -0
  56. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  57. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  58. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  59. helm/benchmark/metrics/summac/model_summac.py +3 -3
  60. helm/benchmark/metrics/summarization_metrics.py +129 -1
  61. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  62. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  63. helm/benchmark/model_deployment_registry.py +11 -19
  64. helm/benchmark/presentation/create_plots.py +11 -2
  65. helm/benchmark/presentation/schema.py +10 -22
  66. helm/benchmark/presentation/summarize.py +189 -14
  67. helm/benchmark/presentation/taxonomy_info.py +20 -0
  68. helm/benchmark/presentation/test_create_plots.py +4 -1
  69. helm/benchmark/run.py +7 -1
  70. helm/benchmark/run_expander.py +4 -0
  71. helm/benchmark/run_specs/arabic_run_specs.py +191 -0
  72. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  73. helm/benchmark/run_specs/classic_run_specs.py +2 -55
  74. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  75. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  76. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  77. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  78. helm/benchmark/run_specs/long_context_run_specs.py +48 -1
  79. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  80. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  81. helm/benchmark/run_specs/medhelm_run_specs.py +360 -50
  82. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  83. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +5 -11
  84. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  85. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  86. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  87. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  88. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  89. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  90. helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
  91. helm/benchmark/scenarios/aratrust_scenario.py +95 -0
  92. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  93. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  94. helm/benchmark/scenarios/audio_language/{ultra_suite_asr_classification.py → ultra_suite_asr_classification_scenario.py} +9 -8
  95. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
  96. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +13 -5
  97. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +13 -5
  98. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +13 -5
  99. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  100. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  101. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  102. helm/benchmark/scenarios/bluex_scenario.py +70 -0
  103. helm/benchmark/scenarios/bold_scenario.py +15 -0
  104. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  105. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  106. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  107. helm/benchmark/scenarios/clear_scenario.py +23 -0
  108. helm/benchmark/scenarios/cleva_scenario.py +480 -1
  109. helm/benchmark/scenarios/code_scenario.py +28 -0
  110. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  111. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  112. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  113. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  114. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  115. helm/benchmark/scenarios/commonsense_scenario.py +26 -0
  116. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  117. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  118. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  119. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  120. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  121. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  122. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  123. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  124. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  125. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  126. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  127. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  128. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  129. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  130. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  131. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  132. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  133. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  134. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  135. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  136. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  137. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  138. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  139. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  140. helm/benchmark/scenarios/gsm_scenario.py +15 -0
  141. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  142. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  143. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  144. helm/benchmark/scenarios/ice_scenario.py +21 -1
  145. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  146. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  147. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
  148. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
  149. helm/benchmark/scenarios/koala_scenario.py +21 -1
  150. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  151. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  152. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  153. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  154. helm/benchmark/scenarios/legalbench_scenario.py +20 -0
  155. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  156. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  157. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  158. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  159. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  160. helm/benchmark/scenarios/math_scenario.py +47 -20
  161. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  162. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  163. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  164. helm/benchmark/scenarios/med_qa_scenario.py +14 -0
  165. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  166. helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
  167. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  168. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  169. helm/benchmark/scenarios/medec_scenario.py +23 -0
  170. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  171. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  172. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  173. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  174. helm/benchmark/scenarios/medi_qa_scenario.py +23 -0
  175. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  176. helm/benchmark/scenarios/melt_scenarios.py +2 -2
  177. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  178. helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
  179. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  180. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  181. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  182. helm/benchmark/scenarios/mmlu_scenario.py +15 -0
  183. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  184. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  185. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  186. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  187. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  188. helm/benchmark/scenarios/narrativeqa_scenario.py +20 -0
  189. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  190. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  191. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  192. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  193. helm/benchmark/scenarios/quac_scenario.py +14 -0
  194. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  195. helm/benchmark/scenarios/raft_scenario.py +15 -0
  196. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  197. helm/benchmark/scenarios/scenario.py +31 -0
  198. helm/benchmark/scenarios/seahelm_scenario.py +350 -2
  199. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  200. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  201. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  202. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  203. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  204. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  205. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  206. helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
  207. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  208. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  209. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  210. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  211. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  212. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  213. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  214. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  215. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  216. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  217. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  218. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  219. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  220. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  221. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  222. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  223. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  224. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  225. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  226. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  227. helm/benchmark/scenarios/wmt_14_scenario.py +12 -0
  228. helm/benchmark/slurm_jobs.py +1 -2
  229. helm/benchmark/slurm_runner.py +8 -1
  230. helm/benchmark/static/schema_arabic.yaml +271 -0
  231. helm/benchmark/static/schema_classic.yaml +0 -17
  232. helm/benchmark/static/schema_long_context.yaml +24 -6
  233. helm/benchmark/static/schema_medhelm.yaml +36 -0
  234. helm/benchmark/static/schema_slp.yaml +219 -0
  235. helm/benchmark/static_build/assets/index-671a5e06.js +10 -0
  236. helm/benchmark/static_build/assets/index-9352595e.css +1 -0
  237. helm/benchmark/static_build/index.html +2 -2
  238. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  239. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  240. helm/clients/audio_language/llama_omni/constants.py +9 -0
  241. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  242. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  243. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  244. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  245. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  246. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  247. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  248. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  249. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  250. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  251. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  252. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  253. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  254. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  255. helm/clients/audio_language/llama_omni/utils.py +202 -0
  256. helm/clients/audio_language/qwen2_5_omni_client.py +19 -7
  257. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  258. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  259. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  260. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  261. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  262. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  263. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  264. helm/clients/huggingface_client.py +2 -2
  265. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  266. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  267. helm/clients/openai_client.py +33 -20
  268. helm/clients/openai_responses_client.py +34 -8
  269. helm/clients/openrouter_client.py +31 -0
  270. helm/clients/test_huggingface_client.py +3 -3
  271. helm/clients/test_openrouter_client.py +69 -0
  272. helm/clients/together_client.py +48 -13
  273. helm/clients/vertexai_client.py +19 -11
  274. helm/clients/vllm_client.py +43 -7
  275. helm/clients/vllm_granite_thinking_client.py +56 -0
  276. helm/common/critique_request.py +0 -1
  277. helm/common/hierarchical_logger.py +83 -34
  278. helm/common/object_spec.py +23 -8
  279. helm/common/test_logging.py +94 -0
  280. helm/config/model_deployments.yaml +525 -172
  281. helm/config/model_metadata.yaml +185 -10
  282. helm/config/tokenizer_configs.yaml +100 -2
  283. helm/proxy/cli.py +1 -1
  284. helm/proxy/example_queries.py +8 -8
  285. helm/proxy/retry.py +5 -0
  286. helm/proxy/server.py +2 -1
  287. helm/proxy/static/index.css +4 -0
  288. helm/proxy/static/index.js +7 -1
  289. helm/tokenizers/grok_tokenizer.py +2 -0
  290. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  291. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  292. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  293. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  294. helm/benchmark/metrics/medalign_metrics.py +0 -14
  295. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  296. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  297. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  298. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  299. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  300. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  301. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  302. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  303. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  304. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  305. helm/benchmark/scenarios/numeracy_scenario.py +0 -794
  306. helm/benchmark/static_build/assets/index-94295e78.js +0 -10
  307. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  308. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/WHEEL +0 -0
  309. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/entry_points.txt +0 -0
  310. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/licenses/LICENSE +0 -0
  311. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/top_level.txt +0 -0
@@ -3,8 +3,9 @@ import os
  import random
  from typing import List, Dict

+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
  from helm.common.general import ensure_file_downloaded
- from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input
+ from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input, ScenarioMetadata


  DATA_REPO_HASH = "38972f6ccbf376a8d0660babafb4d2b3b9cca3f4"
@@ -76,3 +77,14 @@ class DecodingTrustToxicityPromptsScenario(Scenario):
  random.shuffle(instances)

  return instances
+
+ def get_metadata(self) -> ScenarioMetadata:
+ return ScenarioMetadata(
+ name="decodingtrust_toxicity_prompts",
+ display_name="DecodingTrust - Toxicity",
+ short_display_name="Toxicity",
+ description="Evaluation of the privacy understanding and privacy preserving properties of " "LLMs",
+ taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+ main_metric="unknown",
+ main_split="test",
+ )
@@ -1,4 +1,5 @@
  from typing import List
+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
  from helm.common.general import check_file_exists
  from helm.benchmark.scenarios.scenario import (
  Input,
@@ -8,6 +9,7 @@ from helm.benchmark.scenarios.scenario import (
  CORRECT_TAG,
  Reference,
  Output,
+ ScenarioMetadata,
  )
  import pandas as pd

@@ -170,3 +172,25 @@ class DischargeMeScenario(Scenario):
  lines = file.readlines()
  lines = [line.strip() for line in lines]
  return lines
+
+ def get_metadata(self):
+ return ScenarioMetadata(
+ name="dischargeme",
+ display_name="DischargeMe",
+ short_display_name="DischargeMe",
+ description="DischargeMe is a benchmark designed to evaluate clinical text generation. It "
+ "pairs discharge summaries and radiology reports from MIMIC-IV with generation "
+ "tasks such as writing discharge instructions or summarizing the brief hospital "
+ "course. The benchmark assesses a model's ability to generate patient-facing "
+ "documentation that is complete, empathetic, and clinically accurate [(Xu, "
+ "2024)](https://physionet.org/content/discharge-me/1.3/).",
+ taxonomy=TaxonomyInfo(
+ task="Text generation",
+ what="Generate discharge instructions from hospital notes",
+ when="Upon hospital discharge",
+ who="Clinician",
+ language="English",
+ ),
+ main_metric="dischargeme_accuracy",
+ main_split="test",
+ )
@@ -2,6 +2,7 @@ import json
  import os
  from typing import List, Dict, Optional

+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
  from helm.common.general import ensure_file_downloaded
  from helm.benchmark.scenarios.scenario import (
  Scenario,
@@ -12,6 +13,7 @@ from helm.benchmark.scenarios.scenario import (
  CORRECT_TAG,
  Input,
  Output,
+ ScenarioMetadata,
  )

  REITERATION_DATA_URL = "https://drive.google.com/uc?export=download&id=1uVJbsgPCHFAvH43I6SVvU3Ayo8dh-y_N"
@@ -175,3 +177,23 @@ class DisinformationScenario(Scenario):
  instances = self.create_wedging_instances(data)

  return instances
+
+ def get_metadata(self) -> ScenarioMetadata:
+ if self.capability == "reiteration":
+ name = "disinformation_reiteration"
+ display_name = "Disinformation (reiteration)"
+ elif self.capability == "wedging":
+ name = "disinformation_wedging"
+ display_name = "Disinformation (wedging)"
+ else:
+ raise Exception(f"Unknown capability {self.capability}")
+ return ScenarioMetadata(
+ name=name,
+ display_name=display_name,
+ description="Scenario from [Buchanan et al. "
+ "(2021)](https://cset.georgetown.edu/publication/truth-lies-and-automation/) "
+ "that tests the ability to generate divisive and wedging content.",
+ taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+ main_metric="unknown",
+ main_split="valid",
+ )
@@ -2,6 +2,7 @@ import numpy as np
  import random
  from typing import List, Tuple

+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
  from helm.benchmark.scenarios.scenario import (
  Scenario,
  Instance,
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
  CORRECT_TAG,
  Input,
  Output,
+ ScenarioMetadata,
  )


@@ -234,3 +236,16 @@ class DyckLanguageScenario(Scenario):
  not_allowed=train_inputs,
  )
  return train_instances + test_instances
+
+ def get_metadata(self) -> ScenarioMetadata:
+ return ScenarioMetadata(
+ name="dyck_language",
+ display_name="Dyck",
+ description="Scenario testing hierarchical reasoning through the Dyck formal languages "
+ "[(Suzgun et al., 2019)](https://aclanthology.org/W19-3905/).",
+ taxonomy=TaxonomyInfo(
+ task="next-word prediction", what="Dyck formal language", when="n/a", who="n/a", language="synthetic"
+ ),
+ main_metric="exact_match_indicator",
+ main_split="test",
+ )
@@ -7,6 +7,7 @@ from functools import partial
  from tqdm import tqdm
  from typing import Any, Dict, List, Optional, Mapping

+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
  from helm.common.general import check_file_exists, ensure_directory_exists
  from helm.benchmark.scenarios.scenario import (
  TEST_SPLIT,
@@ -16,6 +17,7 @@ from helm.benchmark.scenarios.scenario import (
  CORRECT_TAG,
  Reference,
  Output,
+ ScenarioMetadata,
  )

  ##################################
@@ -1517,3 +1519,23 @@ class EHRSHOTScenario(Scenario):
  )

  return instances
+
+ def get_metadata(self):
+ return ScenarioMetadata(
+ name="ehrshot",
+ display_name="EHRSHOT",
+ description="EHRSHOT is a benchmark designed to evaluate a model's ability to predict "
+ "future clinical events using structured EHR code sequences. Each instance "
+ "contains a patient's historical EHR data and a forward-looking clinical "
+ "question about whether a particular diagnosis, lab result, or hospital event "
+ "will occur [(Wornow et al., 2023)](https://arxiv.org/abs/2307.02028).",
+ taxonomy=TaxonomyInfo(
+ task="Classification",
+ what="Predict whether a medical event will occur in the future based " "on EHR codes",
+ when="Future prediction",
+ who="Clinician, Insurer",
+ language="English",
+ ),
+ main_metric="exact_match",
+ main_split="test",
+ )
@@ -2,6 +2,7 @@ from typing import List, Any
  from pathlib import Path
  from datasets import load_dataset

+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
  from helm.benchmark.scenarios.scenario import (
  Scenario,
  Instance,
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
  TEST_SPLIT,
  Input,
  Output,
+ ScenarioMetadata,
  )


@@ -56,3 +58,20 @@ class ENEMChallengeScenario(Scenario):
  )
  instances.append(instance)
  return instances
+
+ def get_metadata(self) -> ScenarioMetadata:
+ return ScenarioMetadata(
+ name="enem_challenge",
+ display_name="ENEM Challenge",
+ short_display_name=None,
+ description="ENEM Challenge",
+ taxonomy=TaxonomyInfo(
+ task="multiple-choice question answering",
+ what="general academic subjects",
+ when="between 2009 and 2023",
+ who="brazilian ministry of education",
+ language="Portuguese",
+ ),
+ main_metric="exact_match",
+ main_split="test",
+ )
@@ -3,6 +3,7 @@ import pandas as pd
  from pathlib import Path
  from typing import List, Tuple

+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
  from helm.common.hierarchical_logger import hlog
  from helm.common.general import ensure_file_downloaded
  from helm.benchmark.scenarios.scenario import (
@@ -15,6 +16,7 @@ from helm.benchmark.scenarios.scenario import (
  CORRECT_TAG,
  Input,
  Output,
+ ScenarioMetadata,
  )


@@ -160,3 +162,15 @@ class EntityDataImputationScenario(Scenario):
  instances.append(instance)

  return instances
+
+ def get_metadata(self) -> ScenarioMetadata:
+ return ScenarioMetadata(
+ name="entity_data_imputation",
+ display_name="Data imputation",
+ description="Scenario from [Mei et al. "
+ "(2021)](https://ieeexplore.ieee.org/document/9458712/) that tests the ability "
+ "to impute missing entities in a data table.",
+ taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+ main_metric="quasi_exact_match",
+ main_split="test",
+ )
@@ -2,6 +2,7 @@ import pandas as pd
  from pathlib import Path
  from typing import Dict, List, Tuple

+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
  from helm.common.hierarchical_logger import hlog
  from helm.common.general import ensure_file_downloaded
  from helm.benchmark.scenarios.scenario import (
@@ -14,6 +15,7 @@ from helm.benchmark.scenarios.scenario import (
  CORRECT_TAG,
  Input,
  Output,
+ ScenarioMetadata,
  )
  from helm.benchmark.scenarios.entity_matching_scenario_fixed_random_state import set_fixed_random_state_for_dataset

@@ -155,3 +157,15 @@ class EntityMatchingScenario(Scenario):
  instances.append(instance)

  return instances
+
+ def get_metadata(self) -> ScenarioMetadata:
+ return ScenarioMetadata(
+ name="entity_matching",
+ display_name="Entity matching",
+ description="Scenario from Magellan [(Konda et al., "
+ "2016)](https://dl.acm.org/doi/10.14778/3007263.3007314) that tests the ability "
+ "to determine if two entities match.",
+ taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+ main_metric="quasi_exact_match",
+ main_split="test",
+ )
@@ -0,0 +1,115 @@
+ import os
+ from typing import Dict, List
+
+ import datasets
+
+ from helm.common.general import ensure_directory_exists
+ from helm.benchmark.scenarios.scenario import (
+ VALID_SPLIT,
+ Scenario,
+ Instance,
+ Reference,
+ TEST_SPLIT,
+ TRAIN_SPLIT,
+ CORRECT_TAG,
+ Input,
+ Output,
+ )
+ from helm.common.hierarchical_logger import hwarn
+
+
+ class EXAMSMultilingualScenario(Scenario):
+ """EXAMS: A Multi-subject High School Examinations Dataset
+
+ EXAMS is a benchmark dataset for multilingual and cross-lingual
+ question answering from high school examinations. It consists of
+ more than 24,000 high-quality high school exam questions in 16
+ languages, covering 8 language families and 24 school subjects
+ from Natural Sciences and Social Sciences, among others.
+
+ - https://huggingface.co/datasets/mhardalov/exams
+ - https://aclanthology.org/2020.emnlp-main.438/
+
+ Note: Some dataset rows have the value '@' in the `answerKey` column.
+ These rows will be ignored.
+
+ ```
+ @inproceedings{hardalov-etal-2020-exams,
+ title = "{EXAMS}: A Multi-subject High School Examinations Dataset for Cross-lingual and Multilingual Question Answering",
+ author = "Hardalov, Momchil and
+ Mihaylov, Todor and
+ Zlatkova, Dimitrina and
+ Dinkov, Yoan and
+ Koychev, Ivan and
+ Nakov, Preslav",
+ editor = "Webber, Bonnie and
+ Cohn, Trevor and
+ He, Yulan and
+ Liu, Yang",
+ booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)",
+ month = nov,
+ year = "2020",
+ address = "Online",
+ publisher = "Association for Computational Linguistics",
+ url = "https://aclanthology.org/2020.emnlp-main.438/",
+ doi = "10.18653/v1/2020.emnlp-main.438",
+ pages = "5427--5444",
+ abstract = "We propose EXAMS {--} a new benchmark dataset for cross-lingual and multilingual question answering for high school examinations. We collected more than 24,000 high-quality high school exam questions in 16 languages, covering 8 language families and 24 school subjects from Natural Sciences and Social Sciences, among others.EXAMS offers unique fine-grained evaluation framework across multiple languages and subjects, which allows precise analysis and comparison of the proposed models. We perform various experiments with existing top-performing multilingual pre-trained models and show that EXAMS offers multiple challenges that require multilingual knowledge and reasoning in multiple domains. We hope that EXAMS will enable researchers to explore challenging reasoning and knowledge transfer methods and pre-trained models for school question answering in various languages which was not possible by now. The data, code, pre-trained models, and evaluation are available at http://github.com/mhardalov/exams-qa."
+ }```
+ """ # noqa: E501
+
+ name = "exams_multilingual"
+ description = "EXAMS is a benchmark dataset for multilingual and cross-lingual question answering from high school examinations. " # noqa: E501
+ tags = ["knowledge", "multiple_choice"]
+
+ CHOICES = ["A", "B", "C", "D", "E"]
+ HF_SPLIT_TO_HELM_SPLIT = {"train": TRAIN_SPLIT, "test": TEST_SPLIT, "validation": VALID_SPLIT}
+
+ def __init__(self, language: str, subject: str):
+ super().__init__()
+ self.language = language
+ self.subject = subject
+
+ def get_instances(self, output_path: str) -> List[Instance]:
+ cache_dir = os.path.join(output_path, "data")
+ ensure_directory_exists(cache_dir)
+ dataset_splits: Dict[str, datasets.Dataset] = datasets.load_dataset(
+ "mhardalov/exams",
+ "multilingual",
+ revision="4ff10804abb3341f8815cacd778181177bba7edd",
+ cache_dir=cache_dir,
+ )
+
+ # Read all instances
+ instances: List[Instance] = []
+ for split_name, dataset in dataset_splits.items():
+ assert isinstance(dataset, datasets.Dataset)
+ for row in dataset:
+ question = row["question"]
+ question_info = row["info"]
+ if self.subject != "all" and question_info["subject"] != self.subject:
+ continue
+ if self.language != "all" and question_info["language"] != self.language:
+ continue
+ input = Input(text=question["stem"])
+ references: List[Reference] = []
+ if row["answerKey"] not in self.CHOICES:
+ hwarn(f"Invalid value in answerKey column in row: {row}")
+ continue
+ correct_choice_index = ord(row["answerKey"]) - ord("A")
+ for choice_index, choice_text in enumerate(question["choices"]["text"]):
+ references.append(
+ Reference(
+ output=Output(text=choice_text),
+ tags=[CORRECT_TAG] if choice_index == correct_choice_index else [],
+ )
+ )
+ instance = Instance(
+ id=row["id"],
+ input=input,
+ references=references,
+ split=self.HF_SPLIT_TO_HELM_SPLIT[split_name],
+ )
+ instances.append(instance)
+
+ return instances
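For orientation, here is a minimal usage sketch of the new `EXAMSMultilingualScenario` added above (the file list shows it as `helm/benchmark/scenarios/exams_multilingual_scenario.py`). The output directory path is a placeholder; passing `"all"` for `language` and `subject` simply disables the filtering shown in the diff, which compares those values against `row["info"]`.

```python
# Hypothetical sketch, not part of the package diff.
# Assumes the constructor and get_instances() signature shown in the hunk above.
from helm.benchmark.scenarios.exams_multilingual_scenario import EXAMSMultilingualScenario

scenario = EXAMSMultilingualScenario(language="all", subject="all")
# "benchmark_output/scenarios/exams_multilingual" is an illustrative path.
instances = scenario.get_instances(output_path="benchmark_output/scenarios/exams_multilingual")
print(f"Loaded {len(instances)} instances")
if instances:
    print(instances[0].input.text)  # the question stem, per Input(text=question["stem"])
```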
@@ -2,6 +2,7 @@ import os
  import random
  from typing import List

+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
  from helm.common.general import ensure_file_downloaded
  from helm.benchmark.scenarios.scenario import (
  Scenario,
@@ -12,6 +13,7 @@ from helm.benchmark.scenarios.scenario import (
  TEST_SPLIT,
  Input,
  Output,
+ ScenarioMetadata,
  )


@@ -92,3 +94,22 @@ Possible labels:\n1. positive\n2. neutral\n3. negative""" # noqa: E501
  )
  instances.append(instance)
  return instances
+
+ def get_metadata(self) -> ScenarioMetadata:
+ return ScenarioMetadata(
+ name="financial_phrasebank",
+ display_name="Financial Phrasebank (Sentiment Classification)",
+ short_display_name=None,
+ description="A sentiment classification benchmark based on the dataset from Good Debt or "
+ "Bad Debt - Detecting Semantic Orientations in Economic Texts [(Malo et al., "
+ "2013)](https://arxiv.org/abs/1307.5336).",
+ taxonomy=TaxonomyInfo(
+ task="sentiment analysis",
+ what="phrases from financial news texts and company press releases",
+ when="before 2013",
+ who="annotators with adequate business education background",
+ language="English",
+ ),
+ main_metric="classification_weighted_f1",
+ main_split="test",
+ )
@@ -6,6 +6,7 @@ from typing import List

  import pandas as pd

+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
  from helm.benchmark.runner import TRAIN_SPLIT
  from helm.common.general import ensure_directory_exists, ensure_file_downloaded
  from helm.benchmark.scenarios.scenario import (
@@ -16,6 +17,7 @@ from helm.benchmark.scenarios.scenario import (
  Reference,
  Scenario,
  Output,
+ ScenarioMetadata,
  )


@@ -122,3 +124,22 @@ class GoldCommodityNewsScenario(Scenario):
  for train_index in train_indexes:
  instances[train_index] = dataclasses.replace(instances[train_index], split=TRAIN_SPLIT)
  return instances
+
+ def get_metadata(self) -> ScenarioMetadata:
+ return ScenarioMetadata(
+ name="gold_commodity_news",
+ display_name="Gold Commodity News",
+ short_display_name=None,
+ description="A classification benchmark based on a dataset of human-annotated gold "
+ "commodity news headlines ([Sinha & Khandait, "
+ "2019](https://arxiv.org/abs/2009.04202)).",
+ taxonomy=TaxonomyInfo(
+ task="text classification",
+ what="gold commodity news headlines",
+ when="2000-2019",
+ who="financial journalists",
+ language="English",
+ ),
+ main_metric="classification_weighted_f1",
+ main_split="test",
+ )
@@ -2,6 +2,7 @@ import datasets
  import os
  import random
  from typing import List
+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
  from helm.benchmark.scenarios.scenario import (
  Scenario,
  Instance,
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
  CORRECT_TAG,
  Input,
  Output,
+ ScenarioMetadata,
  )
  from helm.common.general import ensure_directory_exists

@@ -78,3 +80,19 @@ class GPQAScenario(Scenario):
  instances.append(instance)

  return instances
+
+ def get_metadata(self) -> ScenarioMetadata:
+ return ScenarioMetadata(
+ name=self.name,
+ display_name="GPQA",
+ description=self.description,
+ main_metric="chain_of_thought_correctness",
+ main_split="test",
+ taxonomy=TaxonomyInfo(
+ task="question answering",
+ what="complex questions across various disciplines",
+ who="domain experts",
+ when="2024",
+ language="English",
+ ),
+ )
@@ -1,6 +1,7 @@
  from typing import List

- from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT
+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
+ from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT, ScenarioMetadata
  from helm.benchmark.scenarios.grammar import read_grammar, generate_derivations, Derivation, get_values, get_tags


@@ -41,3 +42,21 @@ class GrammarScenario(Scenario):
  instances: List[Instance] = list(map(derivation_to_instance, derivations))

  return instances
+
+ def get_metadata(self) -> ScenarioMetadata:
+ return ScenarioMetadata(
+ name="grammar",
+ display_name="Best ChatGPT Prompts",
+ short_display_name="Best ChatGPT Prompts",
+ description="A list of “best ChatGPT prompts to power your workflow” summarized by "
+ "[GRIDFITI](https://gridfiti.com/best-chatgpt-prompts/).",
+ taxonomy=TaxonomyInfo(
+ task="open-ended instruction following",
+ what="Instructions for LLMs",
+ when="2023",
+ who="Gridfiti Staff",
+ language="English",
+ ),
+ main_metric="Helpfulness",
+ main_split="test",
+ )
@@ -2,6 +2,7 @@ import json
  import os
  from typing import Dict, List

+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
  from helm.common.general import ensure_file_downloaded
  from helm.benchmark.scenarios.scenario import (
  Scenario,
@@ -12,6 +13,7 @@ from helm.benchmark.scenarios.scenario import (
  TEST_SPLIT,
  Input,
  Output,
+ ScenarioMetadata,
  )


@@ -65,3 +67,16 @@ class GSM8KScenario(Scenario):
  ),
  )
  return instances
+
+ def get_metadata(self) -> ScenarioMetadata:
+ return ScenarioMetadata(
+ name="gsm",
+ display_name="GSM8K (Grade school math word problems)",
+ short_display_name="GSM8K",
+ description="The grade school math word problems dataset (GSM8K) for testing mathematical "
+ "reasoning on grade-school math problems [(Cobbe et al., "
+ "2021)](https://arxiv.org/pdf/2110.14168.pdf).",
+ taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+ main_metric="exact_match_indicator",
+ main_split="test",
+ )
@@ -3,6 +3,7 @@ from typing import List, Optional

  from datasets import DatasetDict, load_dataset

+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
  from helm.benchmark.scenarios.scenario import (
  CORRECT_TAG,
  TEST_SPLIT,
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
  Output,
  Reference,
  Scenario,
+ ScenarioMetadata,
  )
  from helm.common.general import ensure_directory_exists

@@ -134,3 +136,23 @@ class HeadQAScenario(Scenario):
  )

  return instances
+
+ def get_metadata(self):
+ return ScenarioMetadata(
+ name="head_qa",
+ display_name="HeadQA",
+ description="HeadQA is a benchmark consisting of biomedical multiple-choice questions "
+ "intended to evaluate a model's medical knowledge and reasoning. Each instance "
+ "presents a clinical or scientific question with four answer options, requiring "
+ "the model to select the most appropriate answer [(Vilares et al., "
+ "2019)](https://arxiv.org/abs/1906.04701).",
+ taxonomy=TaxonomyInfo(
+ task="Question answering",
+ what="Medical knowledge testing",
+ when="Any",
+ who="Medical student, Researcher",
+ language="English",
+ ),
+ main_metric="exact_match",
+ main_split="test",
+ )
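The hunks above all follow the same pattern: each scenario gains a `get_metadata()` method returning a `ScenarioMetadata` whose `taxonomy` field is a `TaxonomyInfo`. Below is a hedged sketch of how a custom scenario might adopt the same pattern; `MyCustomScenario` and its field values are illustrative, and only the keyword arguments mirror what the hunks show.

```python
# Illustrative sketch only; class name and field values are hypothetical.
from typing import List

from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
from helm.benchmark.scenarios.scenario import Instance, Scenario, ScenarioMetadata


class MyCustomScenario(Scenario):
    name = "my_custom_scenario"
    description = "Illustrative scenario following the metadata pattern in crfm-helm 0.5.8."
    tags = ["example"]

    def get_instances(self, output_path: str) -> List[Instance]:
        # A real scenario would download data and build Instance objects here.
        return []

    def get_metadata(self) -> ScenarioMetadata:
        # Same keyword arguments as the get_metadata() methods added in this release.
        return ScenarioMetadata(
            name=self.name,
            display_name="My Custom Scenario",
            short_display_name="Custom",
            description=self.description,
            taxonomy=TaxonomyInfo(
                task="question answering",
                what="illustrative data",
                when="n/a",
                who="n/a",
                language="English",
            ),
            main_metric="exact_match",
            main_split="test",
        )
```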