crfm-helm 0.5.5__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of crfm-helm might be problematic.

Files changed (268)
  1. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +74 -53
  2. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +262 -182
  3. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
  5. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  6. helm/benchmark/annotation/air_bench_annotator.py +2 -2
  7. helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
  8. helm/benchmark/annotation/bird_sql_annotator.py +2 -2
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
  10. helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
  11. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
  12. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  13. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  14. helm/benchmark/annotation/model_as_judge.py +12 -16
  15. helm/benchmark/annotation/omni_math_annotator.py +13 -14
  16. helm/benchmark/annotation/wildbench_annotator.py +9 -9
  17. helm/benchmark/executor.py +11 -12
  18. helm/benchmark/metrics/aci_bench_metrics.py +9 -29
  19. helm/benchmark/metrics/bias_word_lists.py +1 -1
  20. helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
  21. helm/benchmark/metrics/classification_metrics.py +3 -3
  22. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  23. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  24. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  25. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  26. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  27. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  28. helm/benchmark/metrics/comet_metric.py +1 -1
  29. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
  30. helm/benchmark/metrics/copyright_metrics.py +1 -1
  31. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  32. helm/benchmark/metrics/dischargeme_metrics.py +9 -29
  33. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  34. helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
  35. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  36. helm/benchmark/metrics/ifeval_metrics.py +2 -2
  37. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  38. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  39. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  40. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  41. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  42. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  43. helm/benchmark/metrics/med_dialog_metrics.py +9 -29
  44. helm/benchmark/metrics/medalign_metrics.py +9 -29
  45. helm/benchmark/metrics/medi_qa_metrics.py +9 -29
  46. helm/benchmark/metrics/medication_qa_metrics.py +10 -30
  47. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  48. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  49. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  50. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  51. helm/benchmark/metrics/mental_health_metrics.py +9 -29
  52. helm/benchmark/metrics/metric_service.py +11 -11
  53. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  54. helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
  55. helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
  56. helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
  57. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  58. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  59. helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
  60. helm/benchmark/metrics/summac/model_summac.py +2 -3
  61. helm/benchmark/metrics/summarization_metrics.py +2 -1
  62. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
  63. helm/benchmark/metrics/toxicity_metrics.py +2 -2
  64. helm/benchmark/metrics/unitxt_metrics.py +3 -4
  65. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  66. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  67. helm/benchmark/model_deployment_registry.py +16 -26
  68. helm/benchmark/presentation/contamination.py +3 -3
  69. helm/benchmark/presentation/create_plots.py +43 -13
  70. helm/benchmark/presentation/run_display.py +13 -0
  71. helm/benchmark/presentation/schema.py +7 -1
  72. helm/benchmark/presentation/summarize.py +84 -61
  73. helm/benchmark/presentation/test_create_plots.py +4 -1
  74. helm/benchmark/reeval_run.py +3 -4
  75. helm/benchmark/reeval_runner.py +3 -3
  76. helm/benchmark/run.py +84 -73
  77. helm/benchmark/run_expander.py +12 -1
  78. helm/benchmark/run_spec_factory.py +7 -6
  79. helm/benchmark/run_specs/arabic_run_specs.py +73 -0
  80. helm/benchmark/run_specs/audio_run_specs.py +52 -8
  81. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  82. helm/benchmark/run_specs/classic_run_specs.py +0 -53
  83. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  84. helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
  85. helm/benchmark/run_specs/experimental_run_specs.py +31 -1
  86. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  87. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  88. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  89. helm/benchmark/run_specs/long_context_run_specs.py +114 -15
  90. helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
  91. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  92. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  93. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +163 -0
  94. helm/benchmark/run_specs/vlm_run_specs.py +28 -0
  95. helm/benchmark/runner.py +5 -5
  96. helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
  97. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  98. helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
  99. helm/benchmark/scenarios/aratrust_scenario.py +76 -0
  100. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
  101. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
  102. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  103. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  104. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  105. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +104 -0
  106. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
  107. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +118 -0
  108. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +86 -0
  109. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +117 -0
  110. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
  111. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
  112. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
  113. helm/benchmark/scenarios/bluex_scenario.py +66 -0
  114. helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
  115. helm/benchmark/scenarios/clear_scenario.py +11 -7
  116. helm/benchmark/scenarios/cleva_scenario.py +1 -1
  117. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  118. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  119. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  120. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  121. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  122. helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
  123. helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
  124. helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
  125. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  126. helm/benchmark/scenarios/grammar.py +2 -2
  127. helm/benchmark/scenarios/headqa_scenario.py +6 -1
  128. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  129. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
  130. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  131. helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
  132. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  133. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  134. helm/benchmark/scenarios/math_scenario.py +21 -20
  135. helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
  136. helm/benchmark/scenarios/medalign_scenario.py +9 -3
  137. helm/benchmark/scenarios/medalign_scenario_helper.py +27 -130
  138. helm/benchmark/scenarios/medbullets_scenario.py +7 -2
  139. helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
  140. helm/benchmark/scenarios/medec_scenario.py +6 -1
  141. helm/benchmark/scenarios/medhallu_scenario.py +7 -1
  142. helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
  143. helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
  144. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  145. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  146. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  147. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  148. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  149. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  150. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  151. helm/benchmark/scenarios/mental_health_scenario.py +16 -5
  152. helm/benchmark/scenarios/mimic_bhc_scenario.py +13 -8
  153. helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
  154. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
  155. helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
  156. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  157. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
  158. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
  159. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
  160. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  161. helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
  162. helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
  163. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
  164. helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
  165. helm/benchmark/scenarios/seahelm_scenario.py +2 -2
  166. helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
  167. helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
  168. helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
  169. helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
  170. helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
  171. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  172. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  173. helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
  174. helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
  175. helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
  176. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
  177. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  178. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  179. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  180. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  181. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  182. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  183. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  184. helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
  185. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  186. helm/benchmark/server.py +2 -1
  187. helm/benchmark/slurm_jobs.py +1 -2
  188. helm/benchmark/slurm_runner.py +8 -1
  189. helm/benchmark/static/schema_arabic.yaml +228 -0
  190. helm/benchmark/static/schema_audio.yaml +60 -49
  191. helm/benchmark/static/schema_classic.yaml +0 -17
  192. helm/benchmark/static/schema_enterprise.yaml +21 -0
  193. helm/benchmark/static/schema_long_context.yaml +81 -20
  194. helm/benchmark/static/schema_medhelm.yaml +272 -213
  195. helm/benchmark/static/schema_melt.yaml +1257 -0
  196. helm/benchmark/static/schema_slphelm.yaml +162 -0
  197. helm/benchmark/static/schema_vhelm.yaml +26 -26
  198. helm/benchmark/static/schema_video.yaml +219 -0
  199. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  200. helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
  201. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  202. helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
  203. helm/benchmark/static_build/index.html +4 -4
  204. helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
  205. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  206. helm/benchmark/window_services/test_utils.py +3 -4
  207. helm/benchmark/window_services/tokenizer_service.py +7 -8
  208. helm/clients/anthropic_client.py +69 -29
  209. helm/clients/audio_language/diva_llama_client.py +4 -2
  210. helm/clients/audio_language/qwen2_5_omni_client.py +209 -0
  211. helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
  212. helm/clients/audio_language/qwen_audiolm_client.py +4 -2
  213. helm/clients/audio_language/test.py +62 -0
  214. helm/clients/bedrock_client.py +3 -1
  215. helm/clients/client.py +7 -7
  216. helm/clients/grok_client.py +36 -0
  217. helm/clients/huggingface_client.py +42 -3
  218. helm/clients/huggingface_pipeline_client.py +138 -0
  219. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  220. helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
  221. helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
  222. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  223. helm/clients/openai_client.py +102 -55
  224. helm/clients/openai_responses_client.py +176 -0
  225. helm/clients/palmyra_client.py +2 -5
  226. helm/clients/reka_client.py +2 -2
  227. helm/clients/test_huggingface_client.py +3 -3
  228. helm/clients/together_client.py +31 -6
  229. helm/clients/vertexai_client.py +17 -9
  230. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  231. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  232. helm/clients/vision_language/idefics_client.py +6 -2
  233. helm/clients/vision_language/paligemma_client.py +2 -2
  234. helm/clients/vision_language/qwen2_vlm_client.py +66 -53
  235. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  236. helm/clients/vllm_client.py +43 -7
  237. helm/clients/vllm_granite_thinking_client.py +56 -0
  238. helm/clients/writer_client.py +102 -0
  239. helm/common/context.py +80 -0
  240. helm/common/credentials_utils.py +5 -5
  241. helm/common/critique_request.py +0 -1
  242. helm/common/general.py +9 -2
  243. helm/common/hierarchical_logger.py +104 -12
  244. helm/common/local_context.py +140 -0
  245. helm/common/object_spec.py +23 -8
  246. helm/common/remote_context.py +61 -0
  247. helm/common/request.py +8 -0
  248. helm/common/test_logging.py +94 -0
  249. helm/config/model_deployments.yaml +995 -45
  250. helm/config/model_metadata.yaml +780 -59
  251. helm/config/tokenizer_configs.yaml +224 -3
  252. helm/proxy/cli.py +4 -2
  253. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  254. helm/proxy/retry.py +5 -0
  255. helm/proxy/services/server_service.py +21 -85
  256. helm/tokenizers/grok_tokenizer.py +55 -0
  257. helm/tokenizers/huggingface_tokenizer.py +1 -1
  258. helm/tokenizers/test_grok_tokenizer.py +33 -0
  259. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  260. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  261. helm/benchmark/scenarios/numeracy_scenario.py +0 -793
  262. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
  263. helm/benchmark/static_build/assets/index-262903c1.js +0 -10
  264. helm/benchmark/static_build/assets/index-42060d71.css +0 -1
  265. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
  266. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
  267. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
  268. /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0

helm/benchmark/scenarios/exams_multilingual_scenario.py

@@ -0,0 +1,115 @@
+import os
+from typing import Dict, List
+
+import datasets
+
+from helm.common.general import ensure_directory_exists
+from helm.benchmark.scenarios.scenario import (
+    VALID_SPLIT,
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    TRAIN_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+from helm.common.hierarchical_logger import hwarn
+
+
+class EXAMSMultilingualScenario(Scenario):
+    """EXAMS: A Multi-subject High School Examinations Dataset
+
+    EXAMS is a benchmark dataset for multilingual and cross-lingual
+    question answering from high school examinations. It consists of
+    more than 24,000 high-quality high school exam questions in 16
+    languages, covering 8 language families and 24 school subjects
+    from Natural Sciences and Social Sciences, among others.
+
+    - https://huggingface.co/datasets/mhardalov/exams
+    - https://aclanthology.org/2020.emnlp-main.438/
+
+    Note: Some dataset rows have the value '@' in the `answerKey` column.
+    These rows will be ignored.
+
+    ```
+    @inproceedings{hardalov-etal-2020-exams,
+        title = "{EXAMS}: A Multi-subject High School Examinations Dataset for Cross-lingual and Multilingual Question Answering",
+        author = "Hardalov, Momchil and
+          Mihaylov, Todor and
+          Zlatkova, Dimitrina and
+          Dinkov, Yoan and
+          Koychev, Ivan and
+          Nakov, Preslav",
+        editor = "Webber, Bonnie and
+          Cohn, Trevor and
+          He, Yulan and
+          Liu, Yang",
+        booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)",
+        month = nov,
+        year = "2020",
+        address = "Online",
+        publisher = "Association for Computational Linguistics",
+        url = "https://aclanthology.org/2020.emnlp-main.438/",
+        doi = "10.18653/v1/2020.emnlp-main.438",
+        pages = "5427--5444",
+        abstract = "We propose EXAMS {--} a new benchmark dataset for cross-lingual and multilingual question answering for high school examinations. We collected more than 24,000 high-quality high school exam questions in 16 languages, covering 8 language families and 24 school subjects from Natural Sciences and Social Sciences, among others.EXAMS offers unique fine-grained evaluation framework across multiple languages and subjects, which allows precise analysis and comparison of the proposed models. We perform various experiments with existing top-performing multilingual pre-trained models and show that EXAMS offers multiple challenges that require multilingual knowledge and reasoning in multiple domains. We hope that EXAMS will enable researchers to explore challenging reasoning and knowledge transfer methods and pre-trained models for school question answering in various languages which was not possible by now. The data, code, pre-trained models, and evaluation are available at http://github.com/mhardalov/exams-qa."
+    }```
+    """  # noqa: E501
+
+    name = "exams_multilingual"
+    description = "EXAMS is a benchmark dataset for multilingual and cross-lingual question answering from high school examinations. "  # noqa: E501
+    tags = ["knowledge", "multiple_choice"]
+
+    CHOICES = ["A", "B", "C", "D", "E"]
+    HF_SPLIT_TO_HELM_SPLIT = {"train": TRAIN_SPLIT, "test": TEST_SPLIT, "validation": VALID_SPLIT}
+
+    def __init__(self, language: str, subject: str):
+        super().__init__()
+        self.language = language
+        self.subject = subject
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+        dataset_splits: Dict[str, datasets.Dataset] = datasets.load_dataset(
+            "mhardalov/exams",
+            "multilingual",
+            revision="4ff10804abb3341f8815cacd778181177bba7edd",
+            cache_dir=cache_dir,
+        )
+
+        # Read all instances
+        instances: List[Instance] = []
+        for split_name, dataset in dataset_splits.items():
+            assert isinstance(dataset, datasets.Dataset)
+            for row in dataset:
+                question = row["question"]
+                question_info = row["info"]
+                if self.subject != "all" and question_info["subject"] != self.subject:
+                    continue
+                if self.language != "all" and question_info["language"] != self.language:
+                    continue
+                input = Input(text=question["stem"])
+                references: List[Reference] = []
+                if row["answerKey"] not in self.CHOICES:
+                    hwarn(f"Invalid value in answerKey column in row: {row}")
+                    continue
+                correct_choice_index = ord(row["answerKey"]) - ord("A")
+                for choice_index, choice_text in enumerate(question["choices"]["text"]):
+                    references.append(
+                        Reference(
+                            output=Output(text=choice_text),
+                            tags=[CORRECT_TAG] if choice_index == correct_choice_index else [],
+                        )
+                    )
+                instance = Instance(
+                    id=row["id"],
+                    input=input,
+                    references=references,
+                    split=self.HF_SPLIT_TO_HELM_SPLIT[split_name],
+                )
+                instances.append(instance)
+
+        return instances
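
The scenario above keeps only rows whose `answerKey` is one of A–E and marks the correct choice by its letter offset. A minimal, self-contained sketch of that mapping, using a made-up row (real rows come from the `mhardalov/exams` dataset):

```python
# Minimal sketch of the answer-key handling in EXAMSMultilingualScenario.
# The row below is made up; real rows come from the "mhardalov/exams" dataset.
CHOICES = ["A", "B", "C", "D", "E"]

row = {
    "id": "q-0001",
    "answerKey": "C",
    "question": {
        "stem": "Which planet is known as the Red Planet?",
        "choices": {"text": ["Venus", "Jupiter", "Mars", "Saturn"]},
    },
}

if row["answerKey"] not in CHOICES:
    # Rows with an invalid answerKey (e.g. '@') are skipped with a warning.
    print("skipped")
else:
    # "A" -> 0, "B" -> 1, "C" -> 2, ...
    correct_choice_index = ord(row["answerKey"]) - ord("A")
    for choice_index, choice_text in enumerate(row["question"]["choices"]["text"]):
        tag = "CORRECT" if choice_index == correct_choice_index else ""
        print(choice_index, choice_text, tag)
```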

helm/benchmark/scenarios/grammar.py

@@ -2,7 +2,7 @@ from collections import defaultdict
 from dataclasses import dataclass, field, replace
 from functools import cached_property
 from typing import List, Optional
-from helm.common.hierarchical_logger import hlog
+from helm.common.hierarchical_logger import hwarn
 
 import dacite
 import re
@@ -111,7 +111,7 @@ def validate_grammar(grammar: Grammar):
            # Make sure all categories are defined
            for category in expansion.categories:
                if category not in grammar.category_to_rules:
-                    hlog(f"WARNING: Category {category} is not defined")
+                    hwarn(f"Category {category} is not defined")
 
 
 def read_grammar(path: str) -> Grammar:

helm/benchmark/scenarios/headqa_scenario.py

@@ -57,7 +57,12 @@ class HeadQAScenario(Scenario):
     SKIP_TEXTQA: bool = False
 
     name = "head_qa"
-    description = "A collection of biomedical multiple-choice questions for testing medical knowledge."
+    description = (
+        "HeadQA is a benchmark consisting of biomedical multiple-choice questions intended to"
+        "evaluate a model's medical knowledge and reasoning. Each instance presents a clinical"
+        "or scientific question with four answer options, requiring the model to select the most"
+        "appropriate answer."
+    )
     tags = ["question_answering", "biomedical", "medicine"]
 
     def __init__(self, language: str = "en", category: Optional[str] = None):

helm/benchmark/scenarios/healthqa_br_scenario.py

@@ -0,0 +1,80 @@
+from typing import Any, List
+import re
+from pathlib import Path
+from datasets import load_dataset
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+
+class HEALTHQA_BR_Scenario(Scenario):
+    """
+    HealthQA-BR is a large-scale benchmark designed to evaluate the clinical knowledge of Large Language Models (LLMs)
+    within the Brazilian Unified Health System (SUS) context. It comprises 5,632 multiple-choice questions sourced from
+    nationwide licensing exams and residency tests, reflecting real challenges faced by Brazil's public health sector.
+    Unlike benchmarks focused on the U.S. medical landscape, HealthQA-BR targets the Brazilian healthcare ecosystem,
+    covering a wide range of medical specialties and interdisciplinary professions such as nursing, dentistry,
+    psychology, social work, pharmacy, and physiotherapy. This comprehensive approach enables a detailed assessment
+    of AI models’ ability to collaborate effectively in the team-based patient care typical of SUS.
+    """
+
+    name = "healthqa_br"
+    description = "MQA benchmark with questions from Brazilian entrance exams"
+    tags = ["knowledge", "multiple_choice", "pt-br"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Download the raw data and read all the dialogues
+        dataset: Any
+        # Read all the instances
+        instances: List[Instance] = []
+        cache_dir = str(Path(output_path) / "data")
+
+        dataset = load_dataset("Larxel/healthqa-br", cache_dir=cache_dir)
+        for example in dataset["train"]:
+            question_choices = example["question"]
+            answer = example["answer"].strip().upper()
+
+            # Separate the question statement from the alternatives
+            question_text, choices_text = self.split_question_and_choices(question_choices)
+
+            # Extract alternatives from text choices_text
+            pattern = r"'([A-Z])':\s*'([^']+)'"
+            matches = re.findall(pattern, choices_text)
+            answers_dict = {label: text for label, text in matches}
+
+            if answer not in answers_dict:
+                continue
+
+            correct_answer_text = answers_dict[answer]
+
+            def answer_to_reference(answer: str) -> Reference:
+                return Reference(Output(text=answer), tags=[CORRECT_TAG] if correct_answer_text == answer else [])
+
+            instance = Instance(
+                input=Input(text=question_text),
+                split=TEST_SPLIT,
+                references=[answer_to_reference(text) for text in answers_dict.values()],
+            )
+            instances.append(instance)
+        return instances
+
+    def split_question_and_choices(self, full_text: str):
+        # Search for the first occurrence of the alternative pattern
+        match = re.search(r"\n'[A-Z]':\s*'.+", full_text)
+        if match:
+            # Everything before the alternatives
+            question_part = full_text[: match.start()].strip()
+            # All of the alternatives (from match to end)
+            choices_part = full_text[match.start() :].strip()
+        else:
+            # If you don't find a pattern, consider everything as a question, and no alternative.
+            question_part = full_text.strip()
+            choices_part = ""
+
+        return question_part, choices_part
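
The core of the new HealthQA-BR scenario is regex parsing that separates the question stem from its quoted, lettered alternatives. A small illustrative sketch of that parsing, using a hypothetical input string rather than a real dataset row:

```python
import re

# Illustrative sketch of how HEALTHQA_BR_Scenario splits a raw example into a
# question stem and lettered alternatives. The input string is hypothetical;
# real examples come from the "Larxel/healthqa-br" dataset.
full_text = (
    "What is the recommended initial management?\n"
    "'A': 'Option one'\n"
    "'B': 'Option two'\n"
    "'C': 'Option three'"
)

# Same pattern the scenario uses to locate where the alternatives begin.
match = re.search(r"\n'[A-Z]':\s*'.+", full_text)
question_part = full_text[: match.start()].strip() if match else full_text.strip()
choices_part = full_text[match.start() :].strip() if match else ""

# Same pattern the scenario uses to extract (label, text) pairs.
answers_dict = dict(re.findall(r"'([A-Z])':\s*'([^']+)'", choices_part))

print(question_part)  # What is the recommended initial management?
print(answers_dict)   # {'A': 'Option one', 'B': 'Option two', 'C': 'Option three'}
```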

helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py

@@ -0,0 +1,90 @@
+import os
+import re
+from typing import List
+
+from datasets import load_dataset, Features, Value, Sequence, Dataset
+
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Input,
+    Reference,
+    Output,
+    CORRECT_TAG,
+    TEST_SPLIT,
+)
+from helm.common.general import ensure_directory_exists
+
+
+class InfiniteBenchEnMCScenario(Scenario):
+    """InfiniteBench En.MC
+
+    InfiniteBench is a benchmark tailored for evaluating the capabilities of language models to process,
+    understand, and reason over long contexts (100k+ tokens). InfiniteBench En.MC is a subset of
+    InfiniteBench that requires models to perform multiple-choice question answering on questions that necessitate
+    long-range dependency and reasoning, beyond simple short passage retrieval.
+    """
+
+    name = "infinite_bench_en_mc"
+    description = "∞Bench En.MC is a multiple-choice question answering task that necessitates long-range dependency and reasoning. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))"  # noqa: E501
+    tags = ["question_answering"]
+
+    def __init__(self, max_num_words: int):
+        self.max_num_words = max_num_words
+        super().__init__()
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Get InfiniteBench from HuggingFace
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+
+        # Define the features schema
+        ft = Features(
+            {
+                "id": Value("int64"),
+                "context": Value("string"),
+                "input": Value("string"),
+                "answer": Sequence(Value("string")),
+                "options": Sequence(Value("string")),
+            }
+        )
+
+        # Load the dataset with the specified features
+        dataset = load_dataset(
+            "xinrongzhang2022/InfiniteBench",
+            split="longbook_choice_eng",
+            features=ft,
+            revision="90f0394333616266d9fe85824ceaf505093cbaa5",
+        )
+
+        assert isinstance(dataset, Dataset)
+
+        def count_words(text: str) -> int:
+            return len(re.split(r"\s+", text.strip()))
+
+        dataset = dataset.filter(
+            lambda example: count_words(example["context"])
+            + count_words(example["input"])
+            + sum(count_words(option) for option in example["options"])
+            <= self.max_num_words
+        )
+
+        # Read all instances
+        instances: List[Instance] = []
+        for row in dataset:
+            assert len(row["answer"]) == 1
+            id = row["id"]
+            input = Input(text=row["context"] + "\n\n" + row["input"])
+            references = [
+                Reference(Output(text=option), tags=[CORRECT_TAG] if option == row["answer"][0] else [])
+                for option in row["options"]
+            ]
+            instance = Instance(
+                id=id,
+                input=input,
+                references=references,
+                split=TEST_SPLIT,
+            )
+            instances.append(instance)
+
+        return instances
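
The InfiniteBench scenarios cap prompt length with a simple whitespace word count rather than a tokenizer. A brief sketch of that filter logic; the texts and the 115,000-word cap below are illustrative values only:

```python
import re

# Sketch of the whitespace word-count filter used by the InfiniteBench
# scenarios to cap prompt length. The texts and the cap are illustrative.
def count_words(text: str) -> int:
    return len(re.split(r"\s+", text.strip()))

context = "word " * 120_000  # stand-in for a very long novel excerpt
question = "Who is the narrator of the story?"
options = ["Alice", "Bob", "Carol", "Dave"]

max_num_words = 115_000
total = count_words(context) + count_words(question) + sum(count_words(o) for o in options)
print(total, total <= max_num_words)  # 120011 False -> this example would be filtered out
```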

helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py

@@ -0,0 +1,85 @@
+import os
+import re
+from typing import List
+
+from datasets import load_dataset, Features, Value, Sequence, Dataset
+
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Input,
+    Reference,
+    Output,
+    CORRECT_TAG,
+    TEST_SPLIT,
+)
+from helm.common.general import ensure_directory_exists
+
+
+class InfiniteBenchEnQAScenario(Scenario):
+    """InfiniteBench En.QA
+
+    InfiniteBench is a benchmark tailored for evaluating the capabilities of language models to process,
+    understand, and reason over long contexts (100k+ tokens). InfiniteBench En.QA is a subset of
+    InfiniteBench that requires models to perform open-form question answering on questions that necessitate
+    long-range dependency and reasoning, beyond simple short passage retrieval.
+    """
+
+    name = "infinite_bench_en_qa"
+    description = "∞Bench En.QA is an open-ended question answering task that necessitates long-range dependency and reasoning. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))"  # noqa: E501
+    tags = ["question_answering"]
+
+    def __init__(self, max_num_words: int):
+        self.max_num_words = max_num_words
+        super().__init__()
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Get InfiniteBench from HuggingFace
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+
+        # Define the features schema
+        ft = Features(
+            {
+                "id": Value("int64"),
+                "context": Value("string"),
+                "input": Value("string"),
+                "answer": Sequence(Value("string")),
+                "options": Sequence(Value("string")),
+            }
+        )
+
+        # Load the dataset with the specified features
+        dataset = load_dataset(
+            "xinrongzhang2022/InfiniteBench",
+            split="longbook_qa_eng",
+            features=ft,
+            revision="90f0394333616266d9fe85824ceaf505093cbaa5",
+        )
+
+        assert isinstance(dataset, Dataset)
+
+        def count_words(text: str) -> int:
+            return len(re.split(r"\s+", text.strip()))
+
+        dataset = dataset.filter(
+            lambda example: count_words(example["context"])
+            + count_words(example["input"])
+            + sum(count_words(option) for option in example["options"])
+            <= self.max_num_words
+        )
+
+        # Read all instances
+        instances: List[Instance] = []
+        for row in dataset:
+            id = row["id"]
+            input = Input(text=row["context"] + "\n\n" + row["input"])
+            instance = Instance(
+                id=id,
+                input=input,
+                references=[Reference(Output(text=row["answer"][0]), tags=[CORRECT_TAG])],
+                split=TEST_SPLIT,
+            )
+            instances.append(instance)
+
+        return instances

helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py}

@@ -14,21 +14,19 @@ from helm.benchmark.scenarios.scenario import (
 from helm.common.general import ensure_directory_exists
 
 
-class InfiniteBenchSumScenario(Scenario):
-    """InfiniteBench Sum
+class InfiniteBenchEnSumScenario(Scenario):
+    """InfiniteBench En.Sum
 
     InfiniteBench is a benchmark tailored for evaluating the capabilities of language models to process,
-    understand, and reason over super long contexts (100k+ tokens). InfiniteBench Sum is a subset of
-    InfiniteBench that requires models to generate a concise summary of the novel. The subset is referred
-    to as "En.Sum" in the original paper.
+    understand, and reason over super long contexts (100k+ tokens). InfiniteBench En.Sum is a subset of
+    InfiniteBench that requires models to generate a concise summary of the novel.
     """
 
-    name = "infinite_bench_sum"
-    description = "Summarize a novel from InfiniteBench"
+    name = "infinite_bench_en_sum"
+    description = "∞Bench En.Sum is a summarization task that requires generating a concise summary of a novel. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))"  # noqa: E501
     tags = ["summarization"]
 
-    def __init__(self, min_num_words: int, max_num_words: int):
-        self.min_num_words = min_num_words
+    def __init__(self, max_num_words: int):
         self.max_num_words = max_num_words
         super().__init__()
 
@@ -61,9 +59,9 @@ class InfiniteBenchSumScenario(Scenario):
         def count_words(text: str) -> int:
             return len(re.split(r"\s+", text.strip()))
 
-        dataset = dataset.map(
-            lambda example: {"prompt_wc": count_words(example["context"]) + count_words(example["input"])}
-        ).filter(lambda example: self.min_num_words <= example["prompt_wc"] <= self.max_num_words)
+        dataset = dataset.filter(
+            lambda example: count_words(example["context"]) + count_words(example["input"]) <= self.max_num_words
+        )
 
         # Read all instances
         instances: List[Instance] = []
@@ -75,7 +73,6 @@ class InfiniteBenchSumScenario(Scenario):
                 input=input,
                 references=[Reference(Output(text=row["answer"][0]), tags=[CORRECT_TAG])],
                 split=TEST_SPLIT,
-                extra_data={"word_count": row["prompt_wc"]},
             )
             instances.append(instance)
 

helm/benchmark/scenarios/kpi_edgar_scenario.py

@@ -0,0 +1,151 @@
+import os
+from typing import List, Dict
+import json
+import re
+
+from helm.common.general import ensure_file_downloaded, ensure_directory_exists
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+
+class KPIEDGARScenario(Scenario):
+    """A financial named entity recognition (NER) scenario based on KPI-EDGAR (T. Deußer et al., 2022).
+
+    This scenario has been modified from the paper. The original paper has 12 entity types and requires the model
+    to extract pairs of related entities. This scenario only use four named entity types (kpi, cy, py, py1) and only
+    requires the model to extract individual entities.
+
+    Paper:
+    T. Deußer et al.,
+    “KPI-EDGAR: A Novel Dataset and Accompanying Metric for Relation Extraction from Financial Documents.” 2022.
+    https://arxiv.org/abs/2210.09163
+
+    Prompt format:
+
+    ```
+    Context: {Sentence}
+    Task: Extract key performance indicators (KPIs) and values from the above text. Also, specify one of the following categories to each of the extracted KPIs and values in brackets.
+    kpi: Key Performance Indicators expressible in numerical and monetary value, cy: Current Year monetary value, py: Prior Year monetary value, py1: Two Year Past Value.
+    Answer:
+    ```
+
+    Example input:
+
+    ```
+    Context: The following table summarizes our total share-based compensation expense and excess tax benefits recognized : As of December 28 , 2019 , there was $ 284 million of total unrecognized compensation cost related to nonvested share-based compensation grants .
+    Task: Extract key performance indicators (KPIs) and values from the above text. Also, specify one of the following categories to each of the extracted KPIs and values in brackets.
+    kpi: Key Performance Indicators expressible in numerical and monetary value, cy: Current Year monetary value, py: Prior Year monetary value, py1: Two Year Past Value.
+    Answer:
+    ```
+
+    Example reference:
+    ```
+    284 [cy], total unrecognized compensation cost [kpi]
+    ```"""  # noqa: E501
+
+    name = "kpi_edgar"
+    description = "Named Entity Recognition from financial documents."
+    tags = ["named_entity_recognition", "finance"]
+
+    TAG_DICT = {
+        "kpi": "Key Performance Indicators expressible in numerical and monetary value",
+        "cy": "Current Year monetary value",
+        "py": "Prior Year monetary value",
+        "py1": "Two Year Past Value",
+    }
+    TAG_DESCRIPTIONS = ", ".join(["%s: %s" % (key, val) for (key, val) in TAG_DICT.items()]) + "."
+    TAG_PAREN_RE = (r"\[", r"\]")
+    TAG_PAREN = tuple((e.strip("\\") for e in TAG_PAREN_RE))
+    TAG_PAREN_ESC = ("(", ")")
+    DATASET_SPLIT_TO_HELM_SPLIT = {"train": TRAIN_SPLIT, "valid": VALID_SPLIT, "test": TEST_SPLIT}
+    JSON_URL = "https://raw.githubusercontent.com/tobideusser/kpi-edgar/2ec7084dcd55b4979bbe288d4aa1e962c685c9ab/data/kpi_edgar.json"  # noqa: E501
+    JSON_FILENAME = "kpi_edgar.json"
+
+    @staticmethod
+    def get_sentences(dataset: List[Dict]) -> List[Dict]:
+        return [
+            sentence
+            for document in dataset
+            for segment in document["segments"]
+            for sentence in segment["sentences"] or []
+        ]
+
+    @staticmethod
+    def escape_parenthesis(text: str) -> str:
+        tmp0 = re.sub(KPIEDGARScenario.TAG_PAREN_RE[0], KPIEDGARScenario.TAG_PAREN_ESC[0], text)
+        tmp1 = re.sub(KPIEDGARScenario.TAG_PAREN_RE[1], KPIEDGARScenario.TAG_PAREN_ESC[1], tmp0)
+        return tmp1
+
+    @staticmethod
+    def get_output_text(
+        words: List[str],
+        annotations: List[Dict],
+    ) -> str:
+        # def get_entity_for_annotation(words: List[str], annotation: Dict) -> str
+        entities: List[str] = []
+        for annotation in annotations:
+            annotation_type = annotation["type_"]
+            if annotation_type not in KPIEDGARScenario.TAG_DICT:
+                continue
+            start_idx = annotation["start"]
+            end_idx = annotation["end"]
+            annotated_words = words[start_idx:end_idx]
+            phrase = KPIEDGARScenario.escape_parenthesis(" ".join(annotated_words))
+            entities.append(
+                "%s %s%s%s" % (phrase, KPIEDGARScenario.TAG_PAREN[0], annotation_type, KPIEDGARScenario.TAG_PAREN[1])
+            )
+
+        return ", ".join(entities)
+
+    @staticmethod
+    def sentences_to_instances(sentences: List[Dict]) -> List[Instance]:
+        instances: List[Instance] = []
+        for sentence in sentences:
+            dataset_split: str = sentence["split_type"]
+            if dataset_split is None:
+                continue
+            split = KPIEDGARScenario.DATASET_SPLIT_TO_HELM_SPLIT[dataset_split]
+
+            words: List[str] = [word_dict["value"] for word_dict in sentence["words"]]
+            passage = KPIEDGARScenario.escape_parenthesis(" ".join(words))
+            input_text = (
+                "Context: %s\n"
+                "Task: Extract key performance indicators (KPIs) and values from the above text. Also, specify one of the following categories to each of the extracted KPIs and values in brackets.\n"  # noqa: E501
+                "%s" % (passage, KPIEDGARScenario.TAG_DESCRIPTIONS)
+            )
+
+            annotations = sentence["entities_anno"]
+            output_text = KPIEDGARScenario.get_output_text(words, annotations)
+            if not output_text:
+                continue
+
+            instances.append(
+                Instance(
+                    input=Input(text=input_text),
+                    references=[Reference(Output(text=output_text), tags=[CORRECT_TAG])],
+                    split=split,
+                )
+            )
+        return instances
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_path = os.path.join(output_path, "data")
+        ensure_directory_exists(data_path)
+
+        base_url = self.JSON_URL
+        dataset_file_name = self.JSON_FILENAME
+        target_path = os.path.join(data_path, dataset_file_name)
+        ensure_file_downloaded(source_url=base_url, target_path=target_path)
+
+        with open(target_path, "r") as f:
+            raw_dataset = json.load(f)
+        return KPIEDGARScenario.sentences_to_instances(KPIEDGARScenario.get_sentences(raw_dataset))
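
The reference strings produced by `KPIEDGARScenario.get_output_text` follow the `<phrase> [<tag>]` format shown in the docstring above. A compact sketch of that construction, with hypothetical annotation offsets in the `kpi_edgar.json` style:

```python
# Sketch of how KPIEDGARScenario.get_output_text assembles the
# "<phrase> [<tag>]" reference string shown in its docstring. The annotation
# offsets are hypothetical but follow the kpi_edgar.json layout
# (a "type_" plus "start"/"end" word indices into the sentence).
TAG_DICT = {"kpi": "...", "cy": "...", "py": "...", "py1": "..."}

words = ["there", "was", "$", "284", "million", "of", "total",
         "unrecognized", "compensation", "cost"]
annotations = [
    {"type_": "cy", "start": 3, "end": 4},     # "284"
    {"type_": "kpi", "start": 6, "end": 10},   # "total unrecognized compensation cost"
    {"type_": "other", "start": 4, "end": 5},  # hypothetical type outside TAG_DICT: skipped
]

entities = []
for annotation in annotations:
    if annotation["type_"] not in TAG_DICT:
        continue
    phrase = " ".join(words[annotation["start"] : annotation["end"]])
    entities.append("%s [%s]" % (phrase, annotation["type_"]))

print(", ".join(entities))  # 284 [cy], total unrecognized compensation cost [kpi]
```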