crfm-helm 0.5.6__py3-none-any.whl → 0.5.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic.
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/METADATA +60 -125
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/RECORD +293 -229
- helm/benchmark/adaptation/adapter_spec.py +5 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
- helm/benchmark/annotation/aci_bench_annotator.py +11 -22
- helm/benchmark/annotation/air_bench_annotator.py +1 -1
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
- helm/benchmark/annotation/dischargeme_annotator.py +11 -22
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/med_dialog_annotator.py +11 -22
- helm/benchmark/annotation/medalign_annotator.py +11 -22
- helm/benchmark/annotation/medi_qa_annotator.py +11 -22
- helm/benchmark/annotation/medication_qa_annotator.py +11 -22
- helm/benchmark/annotation/mental_health_annotator.py +11 -22
- helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
- helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
- helm/benchmark/annotation/model_as_judge.py +23 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
- helm/benchmark/metrics/air_bench_metrics.py +3157 -1
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/basic_metrics.py +267 -2
- helm/benchmark/metrics/classification_metrics.py +19 -1
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +1 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
- helm/benchmark/metrics/copyright_metrics.py +1 -1
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
- helm/benchmark/metrics/dry_run_metrics.py +30 -1
- helm/benchmark/metrics/efficiency_metrics.py +74 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +300 -1
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
- helm/benchmark/metrics/ifeval_metrics.py +13 -1
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
- helm/benchmark/metrics/language_modeling_metrics.py +13 -1
- helm/benchmark/metrics/live_qa_metrics.py +13 -1
- helm/benchmark/metrics/llm_jury_metrics.py +13 -1
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
- helm/benchmark/metrics/medec_metrics.py +25 -2
- helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
- helm/benchmark/metrics/metric.py +25 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
- helm/benchmark/metrics/omni_math_metrics.py +13 -1
- helm/benchmark/metrics/seahelm_metrics.py +14 -1
- helm/benchmark/metrics/summac/model_summac.py +3 -3
- helm/benchmark/metrics/summarization_metrics.py +129 -1
- helm/benchmark/metrics/toxicity_metrics.py +31 -1
- helm/benchmark/metrics/wildbench_metrics.py +21 -1
- helm/benchmark/model_deployment_registry.py +11 -19
- helm/benchmark/presentation/create_plots.py +11 -2
- helm/benchmark/presentation/schema.py +10 -22
- helm/benchmark/presentation/summarize.py +189 -14
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/run.py +7 -1
- helm/benchmark/run_expander.py +4 -0
- helm/benchmark/run_specs/arabic_run_specs.py +191 -0
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +2 -55
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +3 -1
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +48 -1
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +360 -50
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +5 -11
- helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
- helm/benchmark/scenarios/air_bench_scenario.py +21 -0
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
- helm/benchmark/scenarios/aratrust_scenario.py +95 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/{ultra_suite_asr_classification.py → ultra_suite_asr_classification_scenario.py} +9 -8
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +13 -5
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +13 -5
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +13 -5
- helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
- helm/benchmark/scenarios/bbq_scenario.py +15 -0
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/bluex_scenario.py +70 -0
- helm/benchmark/scenarios/bold_scenario.py +15 -0
- helm/benchmark/scenarios/boolq_scenario.py +20 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
- helm/benchmark/scenarios/clear_scenario.py +23 -0
- helm/benchmark/scenarios/cleva_scenario.py +480 -1
- helm/benchmark/scenarios/code_scenario.py +28 -0
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/commonsense_scenario.py +26 -0
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
- helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
- helm/benchmark/scenarios/disinformation_scenario.py +22 -0
- helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
- helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
- helm/benchmark/scenarios/gpqa_scenario.py +18 -0
- helm/benchmark/scenarios/grammar_scenario.py +20 -1
- helm/benchmark/scenarios/gsm_scenario.py +15 -0
- helm/benchmark/scenarios/headqa_scenario.py +22 -0
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
- helm/benchmark/scenarios/ice_scenario.py +21 -1
- helm/benchmark/scenarios/ifeval_scenario.py +18 -0
- helm/benchmark/scenarios/imdb_scenario.py +15 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/legal_support_scenario.py +13 -0
- helm/benchmark/scenarios/legalbench_scenario.py +20 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
- helm/benchmark/scenarios/lextreme_scenario.py +11 -0
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +47 -20
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
- helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
- helm/benchmark/scenarios/med_qa_scenario.py +14 -0
- helm/benchmark/scenarios/medalign_scenario.py +23 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
- helm/benchmark/scenarios/medbullets_scenario.py +22 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
- helm/benchmark/scenarios/medec_scenario.py +23 -0
- helm/benchmark/scenarios/medhallu_scenario.py +23 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +23 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
- helm/benchmark/scenarios/melt_scenarios.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +23 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
- helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
- helm/benchmark/scenarios/mmlu_scenario.py +15 -0
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/msmarco_scenario.py +30 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +20 -0
- helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
- helm/benchmark/scenarios/omni_math_scenario.py +18 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
- helm/benchmark/scenarios/quac_scenario.py +14 -0
- helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
- helm/benchmark/scenarios/raft_scenario.py +15 -0
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/scenario.py +31 -0
- helm/benchmark/scenarios/seahelm_scenario.py +350 -2
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
- helm/benchmark/scenarios/summarization_scenario.py +37 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/vicuna_scenario.py +21 -1
- helm/benchmark/scenarios/wikifact_scenario.py +20 -0
- helm/benchmark/scenarios/wildbench_scenario.py +18 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +12 -0
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +8 -1
- helm/benchmark/static/schema_arabic.yaml +271 -0
- helm/benchmark/static/schema_classic.yaml +0 -17
- helm/benchmark/static/schema_long_context.yaml +24 -6
- helm/benchmark/static/schema_medhelm.yaml +36 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static_build/assets/index-671a5e06.js +10 -0
- helm/benchmark/static_build/assets/index-9352595e.css +1 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/qwen2_5_omni_client.py +19 -7
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/huggingface_client.py +2 -2
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
- helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
- helm/clients/openai_client.py +33 -20
- helm/clients/openai_responses_client.py +34 -8
- helm/clients/openrouter_client.py +31 -0
- helm/clients/test_huggingface_client.py +3 -3
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/together_client.py +48 -13
- helm/clients/vertexai_client.py +19 -11
- helm/clients/vllm_client.py +43 -7
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/common/critique_request.py +0 -1
- helm/common/hierarchical_logger.py +83 -34
- helm/common/object_spec.py +23 -8
- helm/common/test_logging.py +94 -0
- helm/config/model_deployments.yaml +525 -172
- helm/config/model_metadata.yaml +185 -10
- helm/config/tokenizer_configs.yaml +100 -2
- helm/proxy/cli.py +1 -1
- helm/proxy/example_queries.py +8 -8
- helm/proxy/retry.py +5 -0
- helm/proxy/server.py +2 -1
- helm/proxy/static/index.css +4 -0
- helm/proxy/static/index.js +7 -1
- helm/tokenizers/grok_tokenizer.py +2 -0
- helm/benchmark/metrics/aci_bench_metrics.py +0 -14
- helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
- helm/benchmark/metrics/dischargeme_metrics.py +0 -14
- helm/benchmark/metrics/med_dialog_metrics.py +0 -14
- helm/benchmark/metrics/medalign_metrics.py +0 -14
- helm/benchmark/metrics/medi_qa_metrics.py +0 -14
- helm/benchmark/metrics/medication_qa_metrics.py +0 -14
- helm/benchmark/metrics/mental_health_metrics.py +0 -14
- helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
- helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/scenarios/numeracy_scenario.py +0 -794
- helm/benchmark/static_build/assets/index-94295e78.js +0 -10
- helm/benchmark/static_build/assets/index-b9779128.css +0 -1
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/top_level.txt +0 -0

helm/benchmark/scenarios/healthqa_br_scenario.py (new file)
@@ -0,0 +1,80 @@
+from typing import Any, List
+import re
+from pathlib import Path
+from datasets import load_dataset
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+
+class HEALTHQA_BR_Scenario(Scenario):
+    """
+    HealthQA-BR is a large-scale benchmark designed to evaluate the clinical knowledge of Large Language Models (LLMs)
+    within the Brazilian Unified Health System (SUS) context. It comprises 5,632 multiple-choice questions sourced from
+    nationwide licensing exams and residency tests, reflecting real challenges faced by Brazil's public health sector.
+    Unlike benchmarks focused on the U.S. medical landscape, HealthQA-BR targets the Brazilian healthcare ecosystem,
+    covering a wide range of medical specialties and interdisciplinary professions such as nursing, dentistry,
+    psychology, social work, pharmacy, and physiotherapy. This comprehensive approach enables a detailed assessment
+    of AI models’ ability to collaborate effectively in the team-based patient care typical of SUS.
+    """
+
+    name = "healthqa_br"
+    description = "MQA benchmark with questions from Brazilian entrance exams"
+    tags = ["knowledge", "multiple_choice", "pt-br"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Download the raw data and read all the dialogues
+        dataset: Any
+        # Read all the instances
+        instances: List[Instance] = []
+        cache_dir = str(Path(output_path) / "data")
+
+        dataset = load_dataset("Larxel/healthqa-br", cache_dir=cache_dir)
+        for example in dataset["train"]:
+            question_choices = example["question"]
+            answer = example["answer"].strip().upper()
+
+            # Separate the question statement from the alternatives
+            question_text, choices_text = self.split_question_and_choices(question_choices)
+
+            # Extract alternatives from text choices_text
+            pattern = r"'([A-Z])':\s*'([^']+)'"
+            matches = re.findall(pattern, choices_text)
+            answers_dict = {label: text for label, text in matches}
+
+            if answer not in answers_dict:
+                continue
+
+            correct_answer_text = answers_dict[answer]
+
+            def answer_to_reference(answer: str) -> Reference:
+                return Reference(Output(text=answer), tags=[CORRECT_TAG] if correct_answer_text == answer else [])
+
+            instance = Instance(
+                input=Input(text=question_text),
+                split=TEST_SPLIT,
+                references=[answer_to_reference(text) for text in answers_dict.values()],
+            )
+            instances.append(instance)
+        return instances
+
+    def split_question_and_choices(self, full_text: str):
+        # Search for the first occurrence of the alternative pattern
+        match = re.search(r"\n'[A-Z]':\s*'.+", full_text)
+        if match:
+            # Everything before the alternatives
+            question_part = full_text[: match.start()].strip()
+            # All of the alternatives (from match to end)
+            choices_part = full_text[match.start() :].strip()
+        else:
+            # If you don't find a pattern, consider everything as a question, and no alternative.
+            question_part = full_text.strip()
+            choices_part = ""
+
+        return question_part, choices_part
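For readers skimming the new scenario above: the choice-extraction regex assumes each dataset row serializes its alternatives as quoted `'A': '...'` pairs appended after the question text. A minimal sketch of how that pattern behaves on a made-up row (the sample string below is hypothetical, not taken from the Larxel/healthqa-br dataset):

```python
import re

# Hypothetical row text in the format the scenario expects: question statement,
# then one "'<letter>': '<choice>'" pair per line.
sample = "Qual conduta inicial é a mais adequada?\n'A': 'Iniciar antibiótico'\n'B': 'Solicitar tomografia'"

pattern = r"'([A-Z])':\s*'([^']+)'"
answers_dict = dict(re.findall(pattern, sample))
print(answers_dict)
# {'A': 'Iniciar antibiótico', 'B': 'Solicitar tomografia'}
```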
helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py
@@ -2,11 +2,13 @@ import csv
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
     TEST_SPLIT,
     Input,
+    ScenarioMetadata,
 )
 
 
@@ -35,3 +37,14 @@ class HelpdeskCallSummarizationScenario(Scenario):
             instance = Instance(id=instance_id, input=input, references=[], split=TEST_SPLIT)
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="helpdesk_call_summarization",
+            display_name="Helpdesk Call summarization",
+            short_display_name=None,
+            description="Helpdesk Call summarization",
+            taxonomy=TaxonomyInfo(task="summarization", what="n/a", when="?", who="n/a", language="English"),
+            main_metric="unknown",
+            main_split="test",
+        )
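The change repeated across the scenario files in this diff is a new `get_metadata` hook that returns a `ScenarioMetadata` record, optionally carrying a `TaxonomyInfo`. A minimal sketch of that shape, generalized from the hunks in this diff; the `MyScenario` class and every field value below are hypothetical placeholders, not code from the package:

```python
from typing import List

from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
from helm.benchmark.scenarios.scenario import Instance, Scenario, ScenarioMetadata


class MyScenario(Scenario):  # hypothetical scenario, for illustration only
    name = "my_scenario"
    description = "Toy scenario showing the new metadata hook"
    tags = ["example"]

    def get_instances(self, output_path: str) -> List[Instance]:
        return []  # a real scenario builds its instances here

    def get_metadata(self) -> ScenarioMetadata:
        # Static descriptive metadata plus the headline metric and split,
        # mirroring the pattern added to the scenarios in this release.
        return ScenarioMetadata(
            name=self.name,
            display_name="My Scenario",
            description=self.description,
            taxonomy=TaxonomyInfo(task="summarization", what="?", when="?", who="?", language="English"),
            main_metric="rouge_l",
            main_split="test",
        )
```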
helm/benchmark/scenarios/ice_scenario.py
@@ -4,9 +4,10 @@ from typing import List, Union
 from enum import Enum
 import pandas as pd
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.benchmark.scenarios.ice_scenario_pinned_file_order import listdir_with_pinned_file_order
-from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input
+from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input, ScenarioMetadata
 
 try:
     # pd.read_excel() uses xlrd
@@ -467,3 +468,22 @@ class ICEScenario(Scenario):
             instances.append(Instance(Input(text=t), references=[], split=TEST_SPLIT))
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="ice",
+            display_name="ICE (International Corpus of English)",
+            short_display_name="ICE",
+            description="The International Corpus of English (ICE) drawn from English speakers from "
+            "various places in the world, initiated by [Greenbaum "
+            "(1991)](https://www.cambridge.org/core/journals/english-today/article/abs/ice-the-international-corpus-of-english/47808205394C538393C3FD8E62E5E701).",
+            taxonomy=TaxonomyInfo(
+                task="language modeling",
+                what="?",
+                when="?",
+                who="?",
+                language="English varieties from different nations",
+            ),
+            main_metric="bits_per_byte",
+            main_split="test",
+        )
helm/benchmark/scenarios/ifeval_scenario.py
@@ -1,8 +1,10 @@
 import datasets
 import os
 from typing import List
+from helm.benchmark.presentation.schema import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
+    ScenarioMetadata,
     Instance,
     Input,
     TEST_SPLIT,
@@ -51,3 +53,19 @@ class IFEvalScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name=self.name,
+            display_name="IFEval",
+            description=self.description,
+            main_metric="ifeval_strict_accuracy",
+            main_split="test",
+            taxonomy=TaxonomyInfo(
+                task="instruction following",
+                what="verifiable general domain instruction following",
+                who="human annotators",
+                when="2023",
+                language="English",
+            ),
+        )
helm/benchmark/scenarios/imdb_scenario.py
@@ -1,6 +1,7 @@
 import os
 from typing import List, Dict, Optional
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
     VALID_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 from helm.benchmark.scenarios.imdb_scenario_pinned_file_order import listdir_with_pinned_file_order
 
@@ -143,3 +145,16 @@ class IMDBScenario(Scenario):
         for split in [TRAIN_SPLIT, VALID_SPLIT]:
             instances.extend(self.get_split_instances(target_path, split, contrast_map))
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="imdb",
+            display_name="IMDB",
+            description="The IMDB benchmark for sentiment analysis in movie review [(Maas et al., "
+            "2011)](https://aclanthology.org/P11-1015/).",
+            taxonomy=TaxonomyInfo(
+                task="sentiment analysis", what="movie reviews", when="?", who="?", language="English"
+            ),
+            main_metric="quasi_exact_match",
+            main_split="valid",
+        )
helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py (new file)
@@ -0,0 +1,90 @@
+import os
+import re
+from typing import List
+
+from datasets import load_dataset, Features, Value, Sequence, Dataset
+
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Input,
+    Reference,
+    Output,
+    CORRECT_TAG,
+    TEST_SPLIT,
+)
+from helm.common.general import ensure_directory_exists
+
+
+class InfiniteBenchEnMCScenario(Scenario):
+    """InfiniteBench En.MC
+
+    InfiniteBench is a benchmark tailored for evaluating the capabilities of language models to process,
+    understand, and reason over long contexts (100k+ tokens). InfiniteBench En.MC is a subset of
+    InfiniteBench that requires models to perform multiple-choice question answering on questions that necessitate
+    long-range dependency and reasoning, beyond simple short passage retrieval.
+    """
+
+    name = "infinite_bench_en_mc"
+    description = "∞Bench En.MC is a multiple-choice question answering task that necessitates long-range dependency and reasoning. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))"  # noqa: E501
+    tags = ["question_answering"]
+
+    def __init__(self, max_num_words: int):
+        self.max_num_words = max_num_words
+        super().__init__()
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Get InfiniteBench from HuggingFace
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+
+        # Define the features schema
+        ft = Features(
+            {
+                "id": Value("int64"),
+                "context": Value("string"),
+                "input": Value("string"),
+                "answer": Sequence(Value("string")),
+                "options": Sequence(Value("string")),
+            }
+        )
+
+        # Load the dataset with the specified features
+        dataset = load_dataset(
+            "xinrongzhang2022/InfiniteBench",
+            split="longbook_choice_eng",
+            features=ft,
+            revision="90f0394333616266d9fe85824ceaf505093cbaa5",
+        )
+
+        assert isinstance(dataset, Dataset)
+
+        def count_words(text: str) -> int:
+            return len(re.split(r"\s+", text.strip()))
+
+        dataset = dataset.filter(
+            lambda example: count_words(example["context"])
+            + count_words(example["input"])
+            + sum(count_words(option) for option in example["options"])
+            <= self.max_num_words
+        )
+
+        # Read all instances
+        instances: List[Instance] = []
+        for row in dataset:
+            assert len(row["answer"]) == 1
+            id = row["id"]
+            input = Input(text=row["context"] + "\n\n" + row["input"])
+            references = [
+                Reference(Output(text=option), tags=[CORRECT_TAG] if option == row["answer"][0] else [])
+                for option in row["options"]
+            ]
+            instance = Instance(
+                id=id,
+                input=input,
+                references=references,
+                split=TEST_SPLIT,
+            )
+            instances.append(instance)
+
+        return instances
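The En.MC scenario above keeps only rows whose combined context, question, and options fit within `max_num_words`, counting words by splitting on whitespace. A small sketch of that filter in isolation; the example row and the 12-word cap are made up for illustration:

```python
import re


def count_words(text: str) -> int:
    # Same whitespace-based word count the scenario's filter uses.
    return len(re.split(r"\s+", text.strip()))


# Hypothetical row; real rows carry a book-length "context" of 100k+ tokens.
example = {"context": "A long book ...", "input": "Who hid the letter?", "options": ["Ann", "Bob"]}
max_num_words = 12

total = (
    count_words(example["context"])
    + count_words(example["input"])
    + sum(count_words(option) for option in example["options"])
)
print(total, total <= max_num_words)  # 10 True -> this row would be kept
```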
helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py
@@ -26,7 +26,7 @@ class InfiniteBenchEnQAScenario(Scenario):
     """
 
     name = "infinite_bench_en_qa"
-    description = "∞Bench En.QA is
+    description = "∞Bench En.QA is an open-ended question answering task that necessitates long-range dependency and reasoning. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))"  # noqa: E501
     tags = ["question_answering"]
 
     def __init__(self, max_num_words: int):
helm/benchmark/scenarios/koala_scenario.py
@@ -2,8 +2,9 @@ import json
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
-from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT, ScenarioMetadata
 
 
 class KoalaScenario(Scenario):
@@ -39,3 +40,22 @@ class KoalaScenario(Scenario):
             )
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="koala",
+            display_name="Koala test dataset",
+            short_display_name="Koala test dataset",
+            description="The test dataset from the [Koala "
+            "paper](https://bair.berkeley.edu/blog/2023/04/03/koala/) for evaluating "
+            "instruction-following models.",
+            taxonomy=TaxonomyInfo(
+                task="open-ended instruction following",
+                what="Instructions for LLMs",
+                when="Before 2023",
+                who="Web users",
+                language="English",
+            ),
+            main_metric="Helpfulness",
+            main_split="test",
+        )
helm/benchmark/scenarios/kpi_edgar_scenario.py
@@ -3,6 +3,7 @@ from typing import List, Dict
 import json
 import re
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -14,6 +15,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -149,3 +151,22 @@ class KPIEDGARScenario(Scenario):
         with open(target_path, "r") as f:
             raw_dataset = json.load(f)
         return KPIEDGARScenario.sentences_to_instances(KPIEDGARScenario.get_sentences(raw_dataset))
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="kpi_edgar",
+            display_name="KPI-EDGAR Financial Documents (Named Entity Recognition)",
+            short_display_name=None,
+            description="A named entity recognition beenchmark based on the paper KPI-EDGAR - A Novel "
+            "Dataset and Accompanying Metric for Relation Extraction from Financial "
+            "Documents [(Deußer et al., 2022)](https://arxiv.org/pdf/2210.09163.pdf).",
+            taxonomy=TaxonomyInfo(
+                task="named entity recognition",
+                what="financial reports",
+                when="before 2022",
+                who="financial experts",
+                language="English",
+            ),
+            main_metric="adjusted_macro_f1_score",
+            main_split="test",
+        )
helm/benchmark/scenarios/legal_contract_summarization_scenario.py
@@ -4,6 +4,7 @@ import json
 import re
 
 from typing import List
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     Input,
@@ -14,6 +15,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     CORRECT_TAG,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -127,3 +129,21 @@ class LegalContractSummarizationScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="legal_contract_summarization",
+            display_name="Legal Contract Summarization",
+            short_display_name=None,
+            description="Plain English Summarization of Contracts [(Manor et al., "
+            "2019)](https://aclanthology.org/W19-2201.pdf).",
+            taxonomy=TaxonomyInfo(
+                task="summarization",
+                what="legal contracts (e.g. terms of service, license agreements)",
+                when="before 2019",
+                who="lawyers",
+                language="English",
+            ),
+            main_metric="rouge_l",
+            main_split="test",
+        )
helm/benchmark/scenarios/legal_summarization_scenario.py
@@ -5,6 +5,7 @@ from typing import List, Optional, Any
 import datasets
 from datasets import load_dataset
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -15,6 +16,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 _ALL_LANGUAGES = {
@@ -205,3 +207,51 @@ class LegalSummarizationScenario(Scenario):
                 )
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        if self.dataset_name == "BillSum":
+            return ScenarioMetadata(
+                name="billsum_legal_summarization",
+                display_name="BillSum",
+                description="The BillSum benchmark for legal text summarization ([Kornilova & Eidelmann, "
+                "2020](https://aclanthology.org/D19-5406/)).",
+                taxonomy=TaxonomyInfo(
+                    task="summarization", what="legal text from US bills", when=None, who="lawyers", language="English"
+                ),
+                main_metric="rouge_2",
+                main_split="test",
+            )
+        elif self.dataset_name == "MultiLexSum":
+            return ScenarioMetadata(
+                name="multilexsum_legal_summarization",
+                display_name="MultiLexSum",
+                description="The MultiLexSum benchmark for legal text summarization ([Shen et al., "
+                "2022](https://arxiv.org/abs/2206.10883)).",
+                taxonomy=TaxonomyInfo(
+                    task="summarization",
+                    what="legal text from US civil rights lawsuits",
+                    when=None,
+                    who="lawyers",
+                    language="English",
+                ),
+                main_metric="rouge_2",
+                main_split="test",
+            )
+        elif self.dataset_name == "EurLexSum":
+            return ScenarioMetadata(
+                name="eurlexsum_legal_summarization",
+                display_name="EurLexSum",
+                description="The EurLexSum benchmark for legal text summarization ([Aumiller et al., "
+                "2022](https://arxiv.org/abs/2210.13448)).",
+                taxonomy=TaxonomyInfo(
+                    task="summarization",
+                    what="legal text from EU legislation",
+                    when="1960 - 2020",
+                    who="lawyers",
+                    language="English",
+                ),
+                main_metric="rouge_2",
+                main_split="test",
+            )
+        else:
+            raise Exception(f"Unknown dataset {self.dataset_name}")
helm/benchmark/scenarios/legal_support_scenario.py
@@ -2,6 +2,7 @@ import json
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -13,6 +14,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -102,3 +104,14 @@ class LegalSupportScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="legal_support",
+            display_name="LegalSupport",
+            description="Scenario introduced in this work to measure fine-grained legal reasoning "
+            "through reverse entailment.",
+            taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )
helm/benchmark/scenarios/legalbench_scenario.py
@@ -5,6 +5,7 @@ import datasets
 from pathlib import Path
 from typing import List, Dict
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -15,6 +16,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 PROMPT_SETTINGS_URL = "https://raw.githubusercontent.com/HazyResearch/legalbench/main/helm_prompt_settings.jsonl"
@@ -144,3 +146,21 @@ class LegalBenchScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="legalbench",
+            display_name="LegalBench",
+            description="LegalBench is a large collaboratively constructed benchmark of legal "
+            "reasoning. Five representative tasks are included here. See [(Guha et al, "
+            "2023)[https://arxiv.org/abs/2308.11462] for more details.",
+            taxonomy=TaxonomyInfo(
+                task="text classification",
+                what="fact patterns, questions, and legal documents",
+                when="n/a",
+                who="lawyers",
+                language="English",
+            ),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )
helm/benchmark/scenarios/lex_glue_scenario.py
@@ -16,6 +16,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 ECTHR_A = "ecthr_a"
@@ -261,3 +262,13 @@ class LexGLUEScenario(Scenario):
         for subset in self.subsets:
             instances.extend(self.get_instances_for_subset(subset, output_path))
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="lex_glue",
+            display_name="LexGLUE",
+            description="A Benchmark Dataset for Legal Language Understanding in English",
+            taxonomy=None,
+            main_metric="classification_macro_f1",
+            main_split="test",
+        )
helm/benchmark/scenarios/lextreme_scenario.py
@@ -16,6 +16,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Output,
     Input,
+    ScenarioMetadata,
 )
 
 
@@ -466,3 +467,13 @@ class LEXTREMEScenario(Scenario):
         for subset in self.subsets:
             instances.extend(self.get_instances_for_subset(subset, output_path))
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="lextreme",
+            display_name="LEXTREME",
+            description="A Multilingual Legal Benchmark for Natural Language Understanding",
+            taxonomy=None,
+            main_metric="classification_macro_f1",
+            main_split="test",
+        )