crfm-helm 0.5.6__py3-none-any.whl → 0.5.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +56 -49
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +99 -66
- helm/benchmark/annotation/air_bench_annotator.py +1 -1
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +1 -1
- helm/benchmark/metrics/copyright_metrics.py +1 -1
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +1 -1
- helm/benchmark/model_deployment_registry.py +11 -19
- helm/benchmark/presentation/create_plots.py +11 -2
- helm/benchmark/presentation/schema.py +5 -0
- helm/benchmark/presentation/summarize.py +9 -3
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/run.py +7 -1
- helm/benchmark/run_specs/arabic_run_specs.py +73 -0
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +0 -53
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +3 -1
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +48 -1
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +5 -11
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
- helm/benchmark/scenarios/aratrust_scenario.py +76 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/{ultra_suite_asr_classification.py → ultra_suite_asr_classification_scenario.py} +9 -8
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +13 -5
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +13 -5
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +13 -5
- helm/benchmark/scenarios/bluex_scenario.py +66 -0
- helm/benchmark/scenarios/cleva_scenario.py +1 -1
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/math_scenario.py +21 -20
- helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
- helm/benchmark/scenarios/melt_scenarios.py +2 -2
- helm/benchmark/scenarios/mimic_bhc_scenario.py +1 -1
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/seahelm_scenario.py +2 -2
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +8 -1
- helm/benchmark/static/schema_arabic.yaml +228 -0
- helm/benchmark/static/schema_classic.yaml +0 -17
- helm/benchmark/static/schema_long_context.yaml +19 -1
- helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
- helm/benchmark/static_build/index.html +1 -1
- helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
- helm/clients/audio_language/qwen2_5_omni_client.py +19 -7
- helm/clients/huggingface_client.py +2 -2
- helm/clients/openai_client.py +2 -1
- helm/clients/openai_responses_client.py +6 -4
- helm/clients/test_huggingface_client.py +3 -3
- helm/clients/together_client.py +0 -2
- helm/clients/vertexai_client.py +11 -9
- helm/clients/vllm_client.py +43 -7
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/common/critique_request.py +0 -1
- helm/common/hierarchical_logger.py +83 -34
- helm/common/object_spec.py +23 -8
- helm/common/test_logging.py +94 -0
- helm/config/model_deployments.yaml +454 -175
- helm/config/model_metadata.yaml +117 -10
- helm/config/tokenizer_configs.yaml +81 -1
- helm/proxy/cli.py +1 -1
- helm/proxy/retry.py +5 -0
- helm/tokenizers/grok_tokenizer.py +2 -0
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/scenarios/numeracy_scenario.py +0 -794
- helm/benchmark/static_build/assets/index-94295e78.js +0 -10
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
helm/benchmark/scenarios/medalign_scenario_helper.py
CHANGED

@@ -2,22 +2,13 @@
 # type: ignore
 # fmt: off
 
-import ast
-import datetime
 import transformers
-import langchain
-import langchain.prompts
-import lxml.etree
 import os
 import pandas as pd
-import re
 import tiktoken
 
-from langchain_community.retrievers import BM25Retriever
 from tqdm import tqdm
-from typing import Any, Dict, Optional,
-from langchain.schema import Document
-import langchain_community
+from typing import Any, Dict, Optional, Callable
 
 from helm.common.general import check_file_exists
 
@@ -167,102 +158,13 @@ def get_tokenizer(tokenizer_name: str) -> Callable:
     return transformers.AutoTokenizer.from_pretrained(tokenizer_name, legacy=False)
 
 
-def retrieve_most_relevant_visits(ehr_visit_strs, query, target_length, tokenizer):
-    """
-    Retrieve and filter relevant EHR visits based on a query and target length.
-
-    This function retrieves electronic health record (EHR) visit strings, sorts them
-    by relevance using the BM25Retriever, and constructs a list of final documents
-    that fit within a specified character length. The final list ensures that the
-    most important visit isn't cut off and is sorted chronologically.
-
-    Parameters:
-        ehr_visit_strs (list of str): List of EHR visit strings.
-        query (str): Query string to retrieve relevant visits.
-        target_length (int): Maximum total token count for the final list of documents.
-        tokenizer (Callable): Tokenizer that converts text to tokens (used for tracking context length)
-
-    Returns:
-        list[str]: List of EHR visit strings sorted chronologically and constrained by the target length.
-    """
-    ehr_visits=re.split(r'(?=</encounter>\n)',ehr_visit_strs)
-    langchain_docs = [
-        langchain.schema.Document(page_content=doc) for doc in ehr_visits #broken since ehr_visit_strs is one string of all visits
-    ]
-    # `k` is the number of documents to retrieve
-    # We retrieve everything and just use the BM25Retriever to sort the documents
-    retriever = langchain_community.retrievers.BM25Retriever.from_documents(
-        langchain_docs, k=len(langchain_docs)
-    )
-
-    # Invoking the retriever means the most relevant documents are sorted first
-    sorted_docs = retriever.invoke(query)
-
-    # Define the regex pattern to find the start time
-    # pattern = r'start="([\d/]+ [\d:]+)"'
-    pattern = r'start="([\d/]+ [\d:]+ ?[APM]{0,2})"'
-
-    docs = []
-    dts = []
-
-    # Find the startime of the document
-    for doc in sorted_docs:
-        doc_content = doc.page_content
-        start_dt_match = re.search(pattern, doc_content)
-        if start_dt_match:
-            start_dt = start_dt_match.group(1)
-            parsed = False
-            # Try different date formats
-            for fmt in (
-                "%m/%d/%y %I:%M %p",
-                "%m/%d/%Y %I:%M %p",
-                "%m/%d/%y %H:%M",
-                "%m/%d/%Y %H:%M",
-            ):
-                try:
-                    dts.append(datetime.datetime.strptime(start_dt, fmt))
-                    parsed = True
-                    break
-                except ValueError:
-                    continue
-            if not parsed:
-                print(f"Error parsing date: {start_dt}")
-                continue
-        else:
-            print(f"Start time not found., {doc_content}")
-            dts.append(datetime.datetime.min)
-        docs.append(doc_content)
-
-    final_docs = []
-    current_length = 0
-
-    # Add documents until we exceed the allocated context length
-    for i in range(len(docs)):
-        doc_content = docs[i]
-        doc_length = len(tokenizer.encode(doc_content))
-        final_docs.append((dts[i], doc_content))
-        current_length += doc_length
-        if current_length > target_length:
-            break
-
-    # Sort final_docs chronologically
-    final_docs.sort(key=lambda x: x[0])
-
-    # Extract only the document content for the final output
-    final_docs_content = [doc_content for _, doc_content in final_docs]
-
-    return final_docs_content
-
-
-
 def pack_and_trim_prompts(
     instructions: Dict[int, Dict[str, str]],
     ehrs: Dict[int, str],
-
+    prompt_string: str,
     context_length: int,
     generation_length: int,
     tokenizer: Any,
-    use_RAG: bool = True,
     verbose: bool = False,
     include_ehr: bool = True,
 ) -> Dict[int, str]:
@@ -276,26 +178,15 @@ def pack_and_trim_prompts(
         patient_id = int(instructions[instruction_id]["patient_id"])
         relevant_ehr = ehrs[patient_id]
 
-        # Calculate how many tokens of EHR we can include in the prompt
         num_tokens_instruction = len(tokenizer.encode(instruction))
-        num_tokens_prompt_template = len(tokenizer.encode(
+        num_tokens_prompt_template = len(tokenizer.encode(prompt_string))
         if include_ehr:
             target_ehr_length = context_length - generation_length - num_tokens_prompt_template - num_tokens_instruction
         else:
             target_ehr_length = 0
         if target_ehr_length <= 0:
-            prompt_with_truncated_ehr =
+            prompt_with_truncated_ehr = prompt_string.format(question=instruction, ehr="")
         else:
-            if use_RAG:
-                # Return a list of the most relevant visit strings
-                most_relevant_visits = retrieve_most_relevant_visits(
-                    ehr_visit_strs=relevant_ehr,
-                    query=instruction,
-                    target_length=target_ehr_length,
-                    tokenizer=tokenizer,
-                )
-                relevant_ehr = "\n".join(most_relevant_visits)
-
             # Do a first pass with a fast tokenizer
             fast_tokenizer = tiktoken.get_encoding("cl100k_base")
             fast_encoded = fast_tokenizer.encode(relevant_ehr)
@@ -307,13 +198,17 @@ def pack_and_trim_prompts(
                 encoded_ehr = tokenizer.encode(fast_truncated_ehr)
                 truncated_encoded_ehr = encoded_ehr[-target_ehr_length:]
                 truncated_ehr = tokenizer.decode(truncated_encoded_ehr)
-                prompt_with_truncated_ehr =
+                prompt_with_truncated_ehr = prompt_string.format(question=instruction, ehr=truncated_ehr)
+            else:
+                # If the fast encoding is still too long, just use the full EHR up to allowed length
+                truncated_ehr = fast_tokenizer.decode(fast_encoded[-target_ehr_length:])
+                prompt_with_truncated_ehr = prompt_string.format(question=instruction, ehr=truncated_ehr)
 
-
+        prompts_map[instruction_id] = prompt_with_truncated_ehr
 
-
-
-
+        if verbose:
+            print(prompt_with_truncated_ehr)
+            print("~" * 20)
     return prompts_map
 
 
@@ -322,7 +217,6 @@ def preprocess_prompts(
     generation_length,
     path_to_instructions,
     path_to_ehrs,
-    use_RAG,
     include_ehr,
     tokenizer,
     codes_only=False,
@@ -347,16 +241,18 @@ def preprocess_prompts(
 
     # CONSTRUCT & TRUNCATE PROMPTS #
     print("Constructing prompts using instructions and EHRs...")
-    prompt_string=
-
+    prompt_string = (
+        "Instruction: Answer the following question based on the EHR:\n\n"
+        "EHR: {ehr}\n\nQuestion: {question}\n\nAnswer:"
+    )
+
     filled_prompts = pack_and_trim_prompts(
         instructions=instructions,
         ehrs=ehrs,
-
+        prompt_string=prompt_string,
         context_length=target_context_length,
         generation_length=generation_length,
         tokenizer=tokenizer,
-        use_RAG=use_RAG,
         verbose=False,
         include_ehr=include_ehr,
     )
@@ -415,7 +311,6 @@ def return_dataset_dataframe(max_length: int, data_path: str) -> pd.DataFrame:
     path_to_ehrs = os.path.join(data_path, "medalign_ehr_xml")
     path_to_reference_responses = os.path.join(data_path, "clinician-instruction-responses.tsv")
     check_file_exists(path_to_reference_responses, msg=f"[MedAlignScenario] Required clinician responses file not found: '{path_to_reference_responses}'")
-    use_RAG = False
    include_ehr = True
     tokenizer = "tiktoken"
 
@@ -424,7 +319,6 @@ def return_dataset_dataframe(max_length: int, data_path: str) -> pd.DataFrame:
         generation_length=generation_length,
         path_to_instructions=path_to_instructions,
         path_to_ehrs=path_to_ehrs,
-        use_RAG=use_RAG,
         include_ehr=include_ehr,
         tokenizer=tokenizer,
     )
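Taken together, these hunks remove the unused (and, per the deleted comment, broken) BM25/RAG retrieval path and instead thread an explicit prompt_string template through pack_and_trim_prompts. A minimal standalone sketch of the resulting truncate-and-fill flow; only the template text comes from the diff, while the sample EHR, question, and budget values are invented:

import tiktoken

prompt_string = (
    "Instruction: Answer the following question based on the EHR:\n\n"
    "EHR: {ehr}\n\nQuestion: {question}\n\nAnswer:"
)

tokenizer = tiktoken.get_encoding("cl100k_base")
ehr = "<encounter>...</encounter>\n" * 1000  # invented stand-in for a long EHR
question = "What medications was the patient discharged on?"

context_length, generation_length = 4096, 256
# Token budget left for the EHR after the template, the question, and room to
# generate -- this mirrors target_ehr_length in the diff.
budget = (
    context_length
    - generation_length
    - len(tokenizer.encode(prompt_string))
    - len(tokenizer.encode(question))
)

# Keep the most recent `budget` tokens, as the diff's [-target_ehr_length:] slice does.
truncated_ehr = tokenizer.decode(tokenizer.encode(ehr)[-budget:])
prompt = prompt_string.format(question=question, ehr=truncated_ehr)
print(len(tokenizer.encode(prompt)))  # stays near the context budget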
helm/benchmark/scenarios/melt_scenarios.py
CHANGED

@@ -439,13 +439,13 @@ class MELTMATHScenario(Scenario):
         for split, split_name in zip([TRAIN_SPLIT, TEST_SPLIT], ["train", "test"]):
             if split == TRAIN_SPLIT and self.use_official_examples:
                 train_instances = [
-                    ("Kết quả của
+                    ("Kết quả của $\\left(\\frac{7}{8}\\right)^3 \\cdot \\left(\\frac{7}{8}\\right)^{-3}$ là gì?", "1"),
                     (
                         "Có bao nhiêu cách chọn 4 quyển sách từ một kệ sách có 6 quyển,"
                         + " nếu thứ tự các cuốn sách được chọn không quan trọng?",
                         "15",
                     ),
-                    ("Tìm khoảng cách giữa các điểm $(2,1,-4)$ và $(5,8,-3).$", "
+                    ("Tìm khoảng cách giữa các điểm $(2,1,-4)$ và $(5,8,-3).$", "\\sqrt{59}"),
                     (
                         "Các mặt của khối xúc xắc bát diện được dán nhãn bằng các số từ $1$ đến $8$."
                         + " Xác suất tung một cặp xúc xắc bát diện để được tổng số bằng $15$ là bao nhiêu?"
helm/benchmark/scenarios/mimic_bhc_scenario.py
CHANGED

@@ -14,7 +14,7 @@ from helm.benchmark.scenarios.scenario import (
 
 
 class MIMICBHCScenario(Scenario):
-    """
+    r"""
     MIMIC-IV-BHC presents a curated collection of preprocessed discharge notes with labeled brief hospital
     course (BHC) summaries. This dataset is derived from MIMIC-IV (https://doi.org/10.1093/jamia/ocae312).
 
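The new r prefix only matters if the docstring contains backslashes (the lines shown in this hunk do not, so presumably later lines of the docstring do). A minimal illustration of the warning it avoids, not taken from the diff:

def escaped():
    """Matches \d+ tokens."""  # "\d" is an invalid escape: SyntaxWarning on Python 3.12+


def raw():
    r"""Matches \d+ tokens."""  # raw docstring, no warning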
helm/benchmark/scenarios/mmmlu_scenario.py
NEW

@@ -0,0 +1,85 @@
+import os
+from typing import List
+
+import datasets
+
+from helm.common.general import ensure_directory_exists
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+
+class MMMLUScenario(Scenario):
+    """Multilingual Massive Multitask Language Understanding (MMMLU) by OpenAI
+
+    The MMLU is a widely recognized benchmark of general knowledge attained
+    by AI models. It covers a broad range of topics from 57 different categories,
+    covering elementary-level knowledge up to advanced professional subjects like
+    law, physics, history, and computer science.
+
+    MMMLU is a translation of MMLU’s test set into 14 languages using professional
+    human translators. Relying on human translators for this evaluation increases
+    confidence in the accuracy of the translations, especially for low-resource
+    languages like Yoruba.
+
+    The Massive Multitask Language Understanding benchmark from this paper:
+
+    - https://arxiv.org/pdf/2009.03300.pdf
+
+    The MMMLU dataset is from here:
+
+    - https://huggingface.co/datasets/openai/MMMLU
+    """
+
+    name = "mmmlu"
+    description = "Multilingual Massive Multitask Language Understanding"
+    tags = ["knowledge", "multiple_choice"]
+
+    OPTIONS = ["A", "B", "C", "D"]
+
+    def __init__(self, locale: str, subject: str):
+        super().__init__()
+        self.locale: str = locale
+        self.subject: str = subject
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+        dataset = datasets.load_dataset(
+            "openai/MMMLU",
+            self.locale,
+            revision="325a01dc3e173cac1578df94120499aaca2e2504",
+            cache_dir=cache_dir,
+            split="test",
+        )
+        assert isinstance(dataset, datasets.Dataset)
+
+        # Read all instances
+        instances: List[Instance] = []
+        for row_index, row in enumerate(dataset):
+            if self.subject != "all" and row["Subject"] != self.subject:
+                continue
+            input = Input(text=row["Question"])
+            references: List[Reference] = []
+            for option in self.OPTIONS:
+                references.append(
+                    Reference(
+                        output=Output(text=row[option]),
+                        tags=[CORRECT_TAG] if option == row["Answer"] else [],
+                    )
+                )
+            instance = Instance(
+                id=f"id{row_index}",
+                input=input,
+                references=references,
+                split=TEST_SPLIT,
+            )
+            instances.append(instance)
+
+        return instances
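A quick sketch of exercising the new scenario directly. The constructor arguments come from the code above; the locale value and output path are illustrative ("FR_FR" is one of the dataset's configs on Hugging Face):

from helm.benchmark.scenarios.mmmlu_scenario import MMMLUScenario

scenario = MMMLUScenario(locale="FR_FR", subject="all")  # "all" keeps every subject
instances = scenario.get_instances(output_path="./mmmlu_data")
print(len(instances), instances[0].input.text[:80])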
helm/benchmark/scenarios/seahelm_scenario.py
CHANGED

@@ -1750,7 +1750,7 @@ class LINDSEAPragmaticsPresuppositionsScenario(Scenario):
         text_noun = self.prompt_components["text_noun"]
         instruction = self.prompt_components["single_instruction"]
 
-        passage = "{question}
+        passage = "{question}\n{text_noun}: {text}\n{instruction}".format(
            question=question.format(row["question_translated"]),
            text_noun=text_noun,
            text=row["text"],
@@ -1898,7 +1898,7 @@ class LINDSEAPragmaticsScalarImplicaturesScenario(Scenario):
         text_noun = self.prompt_components["text_noun"]
         instruction = self.prompt_components["single_instruction"]
 
-        passage = "{question}
+        passage = "{question}\n{text_noun}: {text}\n{instruction}".format(
            question=question.format(row["question_translated"]),
            text_noun=text_noun,
            text=row["text"],
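Both hunks complete the same previously truncated template. Rendered with placeholder values (invented here), the completed format string produces:

passage = "{question}\n{text_noun}: {text}\n{instruction}".format(
    question="Is the following statement true?",
    text_noun="Text",
    text="All of the students passed the exam.",
    instruction="Answer Yes or No.",
)
# -> "Is the following statement true?\nText: All of the students passed the exam.\nAnswer Yes or No."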
helm/benchmark/scenarios/test_alghafa_scenario.py
NEW

@@ -0,0 +1,29 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.alghafa_scenario import AlGhafaScenario
+from helm.benchmark.scenarios.scenario import CORRECT_TAG, Input
+
+
+@pytest.mark.scenarios
+def test_alghafa_scenario_get_instances():
+    scenario = AlGhafaScenario(subset="mcq_exams_test_ar")
+    with TemporaryDirectory() as tmpdir:
+        actual_instances = scenario.get_instances(tmpdir)
+    assert len(actual_instances) == 562
+    assert actual_instances[0].id == "id0_test"
+    assert actual_instances[0].input == Input(
+        text=(
+            'قال علي بن أبي طالب رضي الله عنه عن عمر بن الخطاب رضي الله عنه " إن كنا لنرى إن في القرآن كلاماً من كلامه ورأياً من رأيه " دلت هذه العبارة على سمة وصفة من صفات عمر بن الخطاب رضي الله عنه هي'  # noqa: E501
+        )
+    )
+    assert len(actual_instances[0].references) == 4
+    assert actual_instances[0].references[0].output.text == "الشجاعة"
+    assert actual_instances[0].references[0].tags == []
+    assert actual_instances[0].references[1].output.text == "نزل القرآن الكريم موافقاً لرأيه في عدة مواضع"
+    assert actual_instances[0].references[1].tags == [CORRECT_TAG]
+    assert actual_instances[0].references[2].output.text == "الشدة في الحق مع اللين والرحمة ."
+    assert actual_instances[0].references[2].tags == []
+    assert actual_instances[0].references[3].output.text == "التواضع"
+    assert actual_instances[0].references[3].tags == []
+    assert actual_instances[0].split == "test"
helm/benchmark/scenarios/test_aratrust_scenario.py
NEW

@@ -0,0 +1,21 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.aratrust_scenario import AraTrustScenario
+from helm.benchmark.scenarios.scenario import CORRECT_TAG, Input
+
+
+@pytest.mark.scenarios
+def test_aratrust_get_instances():
+    scenario = AraTrustScenario()
+    with TemporaryDirectory() as tmpdir:
+        actual_instances = scenario.get_instances(tmpdir)
+    assert len(actual_instances) == 522
+    assert actual_instances[0].id == "id0"
+    assert actual_instances[0].input == Input(
+        text=("حيوان لا يبيض ولا يلد ما هو؟\n\nأ) أنثى الحيوان\n ب) ذكر الحيوان\n ج) كل ما سبق\n")
+    )
+    assert len(actual_instances[0].references) == 1
+    assert actual_instances[0].references[0].output.text == "ب"
+    assert actual_instances[0].references[0].tags == [CORRECT_TAG]
+    assert actual_instances[0].split == "test"
helm/benchmark/scenarios/test_bluex_scenario.py
NEW

@@ -0,0 +1,59 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.bluex_scenario import BLUEX_Scenario
+from helm.benchmark.scenarios.scenario import TEST_SPLIT, CORRECT_TAG, Output, Reference
+
+
+@pytest.mark.scenarios
+def test_bluex_scenario():
+    scenario = BLUEX_Scenario()
+    with TemporaryDirectory() as tmpdir:
+        instances = scenario.get_instances(tmpdir)
+
+    assert len(instances) > 100
+
+    assert instances[100].split == TEST_SPLIT
+
+    assert instances[0].input.text.startswith("Rubião fitava a enseada, - eram oito horas da manhã Quem o visse")
+
+    assert len(instances[0].input.text) == 1011
+
+    assert instances[0].references == [
+        Reference(
+            output=Output(
+                text='a contemplação das paisagens naturais, como se lê em "ele admirava aquele pedaço de água quieta".'
+            ),
+            tags=[],
+        ),
+        Reference(
+            output=Output(
+                text='a presença de um narrador-personagem, como se lê em "em verdade vos digo que pensava em '
+                'outra coisa".'
+            ),
+            tags=[],
+        ),
+        Reference(
+            output=Output(
+                text='a sobriedade do protagonista ao avaliar o seu percurso, como se lê em "Cotejava o passado com '
+                "o presente."
+            ),
+            tags=[],
+        ),
+        Reference(
+            output=Output(
+                text='o sentido místico e fatalista que rege os destinos, como se lê em "Deus escreve direito por '
+                'linhas tortas".'
+            ),
+            tags=[],
+        ),
+        Reference(
+            output=Output(
+                text='a reversibilidade entre o cômico e o trágico, como se lê em "de modo que o que parecia uma '
+                'desgraça...".'
+            ),
+            tags=[CORRECT_TAG],
+        ),
+    ]
+
+    assert instances[0].references[4].is_correct
helm/benchmark/scenarios/test_exams_multilingual_scenario.py
NEW

@@ -0,0 +1,29 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.exams_multilingual_scenario import EXAMSMultilingualScenario
+from helm.benchmark.scenarios.scenario import CORRECT_TAG, TRAIN_SPLIT, Input
+
+
+@pytest.mark.scenarios
+def test_exam_multilingual_scenario_get_instances():
+    scenario = EXAMSMultilingualScenario(language="Bulgarian", subject="Physics")
+    with TemporaryDirectory() as tmpdir:
+        actual_instances = scenario.get_instances(tmpdir)
+    assert len(actual_instances) == 393
+    assert actual_instances[0].id == "4c05bbb8-7729-11ea-9116-54bef70b159e"
+    assert actual_instances[0].input == Input(text="Наелектризирането по индукция се обяснява с: ")
+    assert len(actual_instances[0].references) == 4
+    assert actual_instances[0].references[0].output.text == "преразпределение на положителните йони в тялото"
+    assert actual_instances[0].references[0].tags == []
+    assert (
+        actual_instances[0].references[1].output.text == "предаване на електрони от неутрално на наелектризирано тяло"
+    )
+    assert actual_instances[0].references[1].tags == []
+    assert (
+        actual_instances[0].references[2].output.text == "предаване на електрони от наелектризирано на неутрално тяло"
+    )
+    assert actual_instances[0].references[2].tags == []
+    assert actual_instances[0].references[3].output.text == "преразпределение на свободните електрони в тялото"
+    assert actual_instances[0].references[3].tags == [CORRECT_TAG]
+    assert actual_instances[0].split == TRAIN_SPLIT
helm/benchmark/scenarios/test_healtha_br_scenario.py
NEW

@@ -0,0 +1,57 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.healthqa_br_scenario import HEALTHQA_BR_Scenario
+from helm.benchmark.scenarios.scenario import TEST_SPLIT, CORRECT_TAG, Output, Reference
+
+
+@pytest.mark.scenarios
+def test_healthqa_br_instance():
+    scenario = HEALTHQA_BR_Scenario()
+    with TemporaryDirectory() as tmpdir:
+        instances = scenario.get_instances(tmpdir)
+
+    instance = instances[35]
+
+    assert instance.split == TEST_SPLIT
+
+    assert instance.input.text.startswith("Homem de 22 anos de idade procura a Unidade Básica")
+
+    assert instance.references == [
+        Reference(
+            output=Output(
+                text="administração de relaxante muscular, colocando o paciente em posição de Trendelenburg, com "
+                "tentativa de redução do volume."
+            ),
+            tags=[],
+        ),
+        Reference(
+            output=Output(
+                text="encaminhamento do paciente ao Serviço de Urgência do Hospital com o pedido de avaliação "
+                "imediata do cirurgião."
+            ),
+            tags=[CORRECT_TAG],
+        ),
+        Reference(
+            output=Output(
+                text="tentativa de redução manual do aumento de volume da região inguinescrotal para a cavidade "
+                "abdominal."
+            ),
+            tags=[],
+        ),
+        Reference(
+            output=Output(
+                text="transiluminação do escroto para tentar diferenciar hérnia inguinal de hidrocele comunicante."
+            ),
+            tags=[],
+        ),
+        Reference(
+            output=Output(text="prescrição de antiemético e solicitação de ecografia da região inguinescrotal."),
+            tags=[],
+        ),
+    ]
+
+    correct_refs = [ref for ref in instance.references if CORRECT_TAG in ref.tags]
+    assert len(correct_refs) == 1
+
+    assert instance.references[1].is_correct
helm/benchmark/slurm_jobs.py
CHANGED

@@ -13,7 +13,6 @@ except ModuleNotFoundError as e:
 
 
 class SlurmJobState:
-    # TODO: Convert to StrEnum after upgrading to Python 3.11
     # Non-exhaustive list of Slurm job states.
     # See: https://slurm.schedmd.com/squeue.html#SECTION_JOB-STATE-CODES
 
@@ -81,7 +80,7 @@ def get_slurm_job_state(job_id: int) -> str:
     except subprocess.CalledProcessError as e:
         # Default CalledProcessError message doesn't have output, so re-raise here to include the output.
         raise Exception(f"{str(e)} output: {e.output}")
-    search_result = re.search("JobState=(\w+)", scontrol_output.decode())
+    search_result = re.search(r"JobState=(\w+)", scontrol_output.decode())
     if not search_result:
         raise Exception(f"Could not extract JobState from scontrol: {scontrol_output.decode()}")
     return search_result.group(1)
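The pattern's behavior is unchanged; the raw-string prefix only silences Python's invalid-escape-sequence warning for "\w". A standalone illustration (the sample scontrol output is invented):

import re

# Plain "JobState=(\w+)" triggers a DeprecationWarning (a SyntaxWarning since
# Python 3.12); the raw string is the idiomatic spelling of the same pattern.
match = re.search(r"JobState=(\w+)", "JobId=42 JobState=RUNNING Reason=None")
assert match is not None and match.group(1) == "RUNNING"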
helm/benchmark/slurm_runner.py
CHANGED

@@ -26,7 +26,7 @@ from helm.benchmark.slurm_jobs import (
     FAILURE_SLURM_JOB_STATES,
 )
 from helm.common.general import ensure_directory_exists
-from helm.common.hierarchical_logger import hlog, htrack_block
+from helm.common.hierarchical_logger import hlog, htrack_block, setup_default_logging
 
 from helm.benchmark.runner_config_registry import RUNNER_CONFIG
 
@@ -343,7 +343,14 @@ def main():
         help="Path to the RunSpec JSON file",
         required=True,
     )
+    parser.add_argument(
+        "--log-config",
+        type=str,
+        default=None,
+        help="PATH to a YAML file to customize logging",
+    )
     args = parser.parse_args()
+    setup_default_logging(args.log_config)
 
     # Deserialize SlurmRunner and RunSpec from the given files, then run the RunSpec with the SlurmRunner.
     with open(args.slurm_runner_spec_path, "r") as f: