crfm-helm 0.5.8__py3-none-any.whl → 0.5.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of crfm-helm might be problematic.

Files changed (121)
  1. {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/METADATA +3 -1
  2. {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/RECORD +117 -115
  3. helm/benchmark/adaptation/adapter_spec.py +5 -0
  4. helm/benchmark/metrics/bbq_metrics.py +12 -0
  5. helm/benchmark/metrics/evaluate_reference_metrics.py +12 -0
  6. helm/benchmark/metrics/safety_metrics.py +13 -1
  7. helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
  8. helm/benchmark/presentation/run_display.py +13 -3
  9. helm/benchmark/presentation/run_entry.py +2 -2
  10. helm/benchmark/run.py +1 -1
  11. helm/benchmark/run_specs/arabic_run_specs.py +6 -0
  12. helm/benchmark/run_specs/medhelm_run_specs.py +2 -2
  13. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +6 -2
  14. helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
  15. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +24 -54
  16. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +19 -48
  17. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -61
  18. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -29
  19. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -60
  20. helm/benchmark/scenarios/banking77_scenario.py +21 -0
  21. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  22. helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
  23. helm/benchmark/scenarios/commonsense_scenario.py +7 -1
  24. helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
  25. helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
  26. helm/benchmark/scenarios/financebench_scenario.py +21 -0
  27. helm/benchmark/scenarios/gsm_scenario.py +9 -3
  28. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
  29. helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
  30. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +21 -0
  31. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
  32. helm/benchmark/scenarios/legalbench_scenario.py +6 -7
  33. helm/benchmark/scenarios/math_scenario.py +11 -4
  34. helm/benchmark/scenarios/med_qa_scenario.py +7 -1
  35. helm/benchmark/scenarios/medi_qa_scenario.py +2 -2
  36. helm/benchmark/scenarios/mmlu_scenario.py +8 -2
  37. helm/benchmark/scenarios/narrativeqa_scenario.py +3 -4
  38. helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
  39. helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
  40. helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
  41. helm/benchmark/scenarios/spider_scenario.py +18 -0
  42. helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
  43. helm/benchmark/scenarios/wmt_14_scenario.py +9 -2
  44. helm/benchmark/static/schema_long_context.yaml +12 -31
  45. helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
  46. helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
  47. helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
  48. helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
  49. helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
  50. helm/benchmark/static_build/index.html +5 -6
  51. helm/clients/ai21_client.py +2 -0
  52. helm/clients/aleph_alpha_client.py +2 -0
  53. helm/clients/anthropic_client.py +7 -1
  54. helm/clients/audio_language/diva_llama_client.py +2 -0
  55. helm/clients/audio_language/llama_omni_client.py +2 -1
  56. helm/clients/audio_language/qwen2_5_omni_client.py +2 -1
  57. helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
  58. helm/clients/audio_language/qwen_audiolm_client.py +2 -1
  59. helm/clients/bedrock_client.py +2 -0
  60. helm/clients/cohere_client.py +3 -0
  61. helm/clients/google_client.py +2 -0
  62. helm/clients/http_model_client.py +2 -0
  63. helm/clients/huggingface_client.py +2 -1
  64. helm/clients/ibm_client.py +3 -1
  65. helm/clients/image_generation/adobe_vision_client.py +2 -0
  66. helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
  67. helm/clients/image_generation/cogview2_client.py +2 -1
  68. helm/clients/image_generation/dalle2_client.py +2 -0
  69. helm/clients/image_generation/dalle_mini_client.py +2 -1
  70. helm/clients/image_generation/deep_floyd_client.py +2 -0
  71. helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
  72. helm/clients/image_generation/lexica_client.py +2 -0
  73. helm/clients/image_generation/mindalle_client.py +2 -1
  74. helm/clients/image_generation/together_image_generation_client.py +2 -0
  75. helm/clients/megatron_client.py +2 -0
  76. helm/clients/mistral_client.py +2 -0
  77. helm/clients/moderation_api_client.py +2 -0
  78. helm/clients/openai_client.py +5 -1
  79. helm/clients/palmyra_client.py +2 -1
  80. helm/clients/reka_client.py +2 -1
  81. helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
  82. helm/clients/stanfordhealthcare_http_model_client.py +2 -0
  83. helm/clients/together_client.py +4 -0
  84. helm/clients/vertexai_client.py +4 -0
  85. helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
  86. helm/clients/vision_language/huggingface_vlm_client.py +2 -0
  87. helm/clients/vision_language/idefics_client.py +2 -1
  88. helm/clients/vision_language/open_flamingo_client.py +2 -1
  89. helm/clients/vision_language/paligemma_client.py +2 -1
  90. helm/clients/vision_language/palmyra_vision_client.py +2 -0
  91. helm/clients/vision_language/qwen2_vlm_client.py +2 -1
  92. helm/clients/vision_language/qwen_vlm_client.py +2 -1
  93. helm/clients/writer_client.py +2 -0
  94. helm/common/hierarchical_logger.py +20 -0
  95. helm/common/optional_dependencies.py +1 -1
  96. helm/common/test_general.py +4 -0
  97. helm/config/model_deployments.yaml +225 -0
  98. helm/config/model_metadata.yaml +232 -7
  99. helm/config/tokenizer_configs.yaml +74 -4
  100. helm/benchmark/static_build/assets/index-671a5e06.js +0 -10
  101. helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
  102. helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
  103. helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
  104. {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/WHEEL +0 -0
  105. {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/entry_points.txt +0 -0
  106. {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/licenses/LICENSE +0 -0
  107. {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/top_level.txt +0 -0
  108. /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
  109. /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
  110. /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
  111. /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
  112. /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
  113. /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
  114. /helm/benchmark/static_build/assets/{index-9352595e.css → index-oIeiQW2g.css} +0 -0
  115. /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
  116. /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
  117. /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
  118. /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
  119. /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
  120. /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
  121. /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
helm/benchmark/adaptation/adapter_spec.py CHANGED
@@ -144,3 +144,8 @@ class AdapterSpec:
  # Set hash=False to make `AdapterSpec` hashable
  eval_splits: Optional[List[str]] = field(default=None, hash=False)
  """The splits from which evaluation instances will be drawn."""
+
+ output_mapping_pattern: Optional[str] = None
+ """Pattern to apply to the output before applying the output mapping for the joint multiple choice adapter.
+ If the pattern has no group, the output mapping will be applied to the first match.
+ If the pattern has a group, the output mapping will be applied to the group of the first match."""
helm/benchmark/metrics/bbq_metrics.py CHANGED
@@ -1,6 +1,7 @@
  from typing import List
  from helm.benchmark.metrics.evaluate_instances_metric import EvaluateInstancesMetric

+ from helm.benchmark.metrics.metric import MetricMetadata
  from helm.common.request import RequestResult
  from helm.benchmark.adaptation.request_state import RequestState
  from helm.benchmark.metrics.metric_name import MetricName
@@ -145,3 +146,14 @@ class BBQMetric(EvaluateInstancesMetric):
  stats = [acc, amb_bias_stat, disamb_bias_stat]

  return stats
+
+ def get_metadata(self) -> List[MetricMetadata]:
+ return [
+ MetricMetadata(
+ name="bbq_accuracy",
+ display_name="BBQ accuracy",
+ description="BBQ accuracy",
+ lower_is_better=False,
+ group=None,
+ ),
+ ]
helm/benchmark/metrics/evaluate_reference_metrics.py CHANGED
@@ -397,6 +397,16 @@ def code_eval(gold: Tuple[str, Optional[Dict]], pred: str) -> float:
  return float(code_metrics_helper.check_correctness(gold[1], pred, 3.0)["passed"]) # type: ignore


+ def _apply_output_mapping_pattern(pattern: str, prediction: str) -> str:
+ match = re.search(pattern, prediction)
+ if not match:
+ return ""
+ elif match.groups():
+ return match.group(0)
+ else:
+ return match.string
+
+
  # TODO This should probably be made into an implementation of MetricInterface. For now it lives here
  # just to separate it from basic_metrics.py.
  def compute_reference_metrics(
@@ -498,6 +508,8 @@ def compute_reference_metrics(
  # Note: If 'A' and 'B' were the only possible choices, smaller language models like GPT-2 would
  # sometimes predict a random letter like 'M'.
  if request_state.output_mapping is not None:
+ if adapter_spec.output_mapping_pattern:
+ preds = [_apply_output_mapping_pattern(adapter_spec.output_mapping_pattern, pred) for pred in preds]
  preds = [request_state.output_mapping.get(pred) for pred in preds] # type: ignore

  # Compute max_prob, the probability that the model assigns to its generated text.
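
To make the new adapter option concrete, here is a small standalone sketch (illustration only, not part of the package; the mapping values are invented) of how a raw prediction is narrowed by `output_mapping_pattern` before the `output_mapping` lookup, following the `_apply_output_mapping_pattern` helper added above:

    import re
    from typing import Dict, Optional

    ARABIC_PATTERN = "(أ|ب|ج|د|هـ)"  # the pattern the Arabic run specs pass in (see arabic_run_specs.py below)
    output_mapping: Dict[str, str] = {"ب": "some choice text"}  # invented mapping for illustration

    def narrow(pattern: str, prediction: str) -> str:
        # Same resolution as _apply_output_mapping_pattern: no match -> "",
        # a pattern with a group -> the text of the first match, otherwise the whole prediction.
        match = re.search(pattern, prediction)
        if not match:
            return ""
        if match.groups():
            return match.group(0)
        return match.string

    raw_prediction = "الإجابة: ب"
    mapped: Optional[str] = output_mapping.get(narrow(ARABIC_PATTERN, raw_prediction))
    print(mapped)  # "some choice text"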
helm/benchmark/metrics/safety_metrics.py CHANGED
@@ -5,7 +5,7 @@ from helm.benchmark.adaptation.adapter_spec import AdapterSpec
  from helm.benchmark.adaptation.request_state import RequestState
  from helm.benchmark.metrics.basic_metrics import compute_request_state_metrics
  from helm.benchmark.metrics.efficiency_metrics import EfficiencyMetric
- from helm.benchmark.metrics.metric import Metric
+ from helm.benchmark.metrics.metric import Metric, MetricMetadata
  from helm.benchmark.metrics.metric_name import MetricName
  from helm.benchmark.metrics.metric_service import MetricService
  from helm.benchmark.metrics.statistic import Stat
@@ -77,3 +77,15 @@ class SafetyScoreMetric(Metric):
  raise SafetyScoreMetricException("Could not compute safety score because all annotators failed.")
  stats.append(safety_score_stat)
  return stats
+
+ def get_metadata(self) -> List[MetricMetadata]:
+ return [
+ MetricMetadata(
+ name="safety_score",
+ display_name="LM Evaluated Safety score",
+ description="LM Evaluated Safety score (mean of Llama3.1 and GPT4o ratings) representing whether the "
+ "output constitutes a successful refusal of unsafe actions (0 is worst, 1 is best).",
+ lower_is_better=False,
+ group=None,
+ ),
+ ]
helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py ADDED
@@ -0,0 +1,52 @@
+ from typing import List
+
+ from helm.benchmark.adaptation.request_state import RequestState
+ from helm.benchmark.metrics.metric_name import MetricName
+ from helm.benchmark.metrics.statistic import Stat
+ from helm.benchmark.metrics.evaluate_reference_metrics import normalize_text
+ from helm.benchmark.metrics.evaluate_instances_metric import EvaluateInstancesMetric
+ from helm.benchmark.scenarios.scenario import (
+ CORRECT_TAG,
+ )
+ from sklearn.metrics import f1_score, accuracy_score
+
+
+ class UltraSuiteASRMetric(EvaluateInstancesMetric):
+ """Score metrics for UltraSuite ASR."""
+
+ def evaluate_instances(self, request_states: List[RequestState], eval_cache_path: str) -> List[Stat]:
+ y_pred: List[str] = []
+ y_pred_quasi: List[str] = []
+ y_true: List[str] = []
+ for request_state in request_states: # one request state per instance
+
+ for reference in request_state.instance.references:
+ if reference.tags == [CORRECT_TAG]:
+ true_label = reference.output.text
+ break
+
+ assert request_state.result
+ model_output_text = request_state.result.completions[0].text.strip().lower()
+ assert request_state.instance.extra_data
+ ground_truth_text = request_state.instance.extra_data["transcription"].strip().lower()
+
+ if model_output_text == ground_truth_text:
+ predicted_label = "typically_developing"
+ else:
+ predicted_label = "speech_disorder"
+
+ if normalize_text(predicted_label) == normalize_text(true_label):
+ quasi_label = "typically_developing"
+ else:
+ quasi_label = "speech_disorder"
+
+ y_true.append(true_label)
+ y_pred.append(predicted_label)
+ y_pred_quasi.append(quasi_label)
+
+ return [
+ Stat(MetricName("classification_macro_f1")).add(f1_score(y_pred=y_pred, y_true=y_true, average="macro")),
+ Stat(MetricName("classification_micro_f1")).add(f1_score(y_pred=y_pred, y_true=y_true, average="micro")),
+ Stat(MetricName("exact_match")).add(accuracy_score(y_pred=y_pred, y_true=y_true)),
+ Stat(MetricName("quasi_exact_match")).add(accuracy_score(y_pred=y_pred_quasi, y_true=y_true)),
+ ]
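
As a rough, hypothetical illustration (not HELM code) of the decision rule in `UltraSuiteASRMetric` above: the model is asked to transcribe the child's speech, and an utterance is labeled typically developing only when that transcription matches the reference transcription after lowercasing and stripping whitespace.

    def label_from_transcription(model_text: str, reference_text: str) -> str:
        # Same rule as UltraSuiteASRMetric.evaluate_instances: exact (normalized) match -> typically_developing.
        if model_text.strip().lower() == reference_text.strip().lower():
            return "typically_developing"
        return "speech_disorder"

    print(label_from_transcription("a ball", " A ball"))  # typically_developing
    print(label_from_transcription("a bell", "a ball"))   # speech_disorder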
helm/benchmark/presentation/run_display.py CHANGED
@@ -1,6 +1,7 @@
  from collections import OrderedDict, defaultdict
  from dataclasses import dataclass
  import os
+ import re
  from typing import Dict, Iterable, List, Optional, Set, Tuple, Any

  from helm.benchmark.adaptation.adapter_spec import (
@@ -262,9 +263,18 @@ def write_run_display_json(run_path: str, run_spec: RunSpec, schema: Schema, ski
  if request_state.result is not None and request_state.result.completions
  else ""
  )
- mapped_output = (
- request_state.output_mapping.get(predicted_text.strip()) if request_state.output_mapping else None
- )
+ mapped_output: Optional[str] = None
+ if request_state.output_mapping is not None:
+ output_to_map = predicted_text.strip()
+ if run_spec.adapter_spec.output_mapping_pattern:
+ match = re.search(run_spec.adapter_spec.output_mapping_pattern, output_to_map)
+ if not match:
+ output_to_map = ""
+ elif match.groups():
+ output_to_map = match.group(0)
+ else:
+ output_to_map = match.string
+ mapped_output = request_state.output_mapping.get(output_to_map)
  instance_id_to_instance[(request_state.instance.id, request_state.instance.perturbation)] = (
  request_state.instance
  )
helm/benchmark/presentation/run_entry.py CHANGED
@@ -14,10 +14,10 @@ class RunEntry:
  description: str

  # Priority for this run spec (1 is highest priority, 5 is lowest priority)
- priority: int
+ priority: Optional[int] = None

  # Additional groups to add to the run spec
- groups: Optional[List[str]]
+ groups: Optional[List[str]] = None


  @dataclass(frozen=True)
helm/benchmark/run.py CHANGED
@@ -37,7 +37,7 @@ def run_entries_to_run_specs(
  run_specs: List[RunSpec] = []
  for entry in run_entries:
  # Filter by priority
- if priority is not None and entry.priority > priority:
+ if priority is not None and entry.priority is not None and entry.priority > priority:
  continue

  for run_spec in construct_run_specs(parse_object_spec(entry.description)):
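
The relaxed check above means run entries that omit `priority` are never filtered out. A hypothetical mini-example (entry names invented, not HELM code) of the same condition in isolation:

    from dataclasses import dataclass
    from typing import List, Optional

    @dataclass(frozen=True)
    class Entry:
        description: str
        priority: Optional[int] = None

    def keep(entries: List[Entry], priority: Optional[int]) -> List[Entry]:
        # Mirrors the updated filter: skip an entry only when both priorities are set
        # and the entry's priority number exceeds the requested threshold.
        return [e for e in entries if not (priority is not None and e.priority is not None and e.priority > priority)]

    entries = [Entry("mmlu", priority=1), Entry("thai_exam", priority=4), Entry("bbq")]
    print([e.description for e in keep(entries, priority=2)])  # ['mmlu', 'bbq']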
helm/benchmark/run_specs/arabic_run_specs.py CHANGED
@@ -12,6 +12,7 @@ from helm.benchmark.scenarios.scenario import ScenarioSpec


  _ARABIC_REFERENCE_PREFIX_CHARACTERS = ["أ", "ب", "ج", "د", "هـ"]
+ _ARABIC_OUTPUT_MAPPING_PATTERN = "(أ|ب|ج|د|هـ)"


  @run_spec_function("arabic_mmlu")
@@ -29,6 +30,7 @@ def get_arabic_mmlu_spec(subset: str) -> RunSpec:
  output_noun="الإجابة",
  max_tokens=100,
  reference_prefix_characters=_ARABIC_REFERENCE_PREFIX_CHARACTERS,
+ output_mapping_pattern=_ARABIC_OUTPUT_MAPPING_PATTERN,
  )

  return RunSpec(
@@ -54,6 +56,7 @@ def get_alghafa_spec(subset: str) -> RunSpec:
  output_noun="الإجابة",
  max_tokens=100,
  reference_prefix_characters=_ARABIC_REFERENCE_PREFIX_CHARACTERS,
+ output_mapping_pattern=_ARABIC_OUTPUT_MAPPING_PATTERN,
  )

  return RunSpec(
@@ -130,6 +133,7 @@ def get_madinah_qa_spec(subset: str) -> RunSpec:
  output_noun="الإجابة",
  max_tokens=100,
  reference_prefix_characters=_ARABIC_REFERENCE_PREFIX_CHARACTERS,
+ output_mapping_pattern=_ARABIC_OUTPUT_MAPPING_PATTERN,
  )

  return RunSpec(
@@ -155,6 +159,7 @@ def get_arabic_mmmlu_spec(subject: str) -> RunSpec:
  output_noun="الإجابة",
  max_tokens=100,
  reference_prefix_characters=_ARABIC_REFERENCE_PREFIX_CHARACTERS,
+ output_mapping_pattern=_ARABIC_OUTPUT_MAPPING_PATTERN,
  )

  return RunSpec(
@@ -180,6 +185,7 @@ def get_arabic_exams_spec(subject: str) -> RunSpec:
  output_noun="الإجابة",
  max_tokens=100,
  reference_prefix_characters=_ARABIC_REFERENCE_PREFIX_CHARACTERS,
+ output_mapping_pattern=_ARABIC_OUTPUT_MAPPING_PATTERN,
  )

  return RunSpec(
helm/benchmark/run_specs/medhelm_run_specs.py CHANGED
@@ -1527,7 +1527,7 @@ def get_shc_ent_spec(data_path: str) -> RunSpec:
  @run_spec_function("shc_privacy_med")
  def get_shc_privacy_spec(data_path: str) -> RunSpec:
  scenario_spec = ScenarioSpec(
- class_name="helm.benchmark.scenarios.shc_cdi_scenario.SHCPRIVACYMedScenario",
+ class_name="helm.benchmark.scenarios.shc_privacy_scenario.SHCPRIVACYMedScenario",
  args={"data_path": data_path},
  )

@@ -1550,7 +1550,7 @@ def get_shc_privacy_spec(data_path: str) -> RunSpec:
  @run_spec_function("shc_proxy_med")
  def get_shc_proxy_spec(data_path: str) -> RunSpec:
  scenario_spec = ScenarioSpec(
- class_name="helm.benchmark.scenarios.shc_cdi_scenario.SHCPROXYMedScenario",
+ class_name="helm.benchmark.scenarios.shc_proxy_scenario.SHCPROXYMedScenario",
  args={"data_path": data_path},
  )
helm/benchmark/run_specs/speech_disorder_audio_run_specs.py CHANGED
@@ -112,9 +112,13 @@ def get_ultra_suite_asr_classification_run_spec() -> RunSpec:
  )
  adapter_spec = _get_generation_adapter_spec(
  instructions="""You are a highly experienced Speech-Language Pathologist (SLP). An audio recording is provided to you, typically consisting of a speech prompt from a pathologist followed by a child's repetition. Based on your expertise transcribe the child's speech into text. Do not make any assumptions about the words the child is expected to say. Only transcribe based on the words that the child actually says. Only respond with the text transcription, no other text or commentary.""", # noqa: E501
- max_tokens=10,
+ max_tokens=50,
  )
- metric_specs: List[MetricSpec] = audio_classification_metric_specs()
+ metric_specs: List[MetricSpec] = [
+ MetricSpec(
+ class_name="helm.benchmark.metrics.ultra_suite_asr_classification_metrics.UltraSuiteASRMetric", args={}
+ )
+ ]
  run_spec_name: str = "ultra_suite_asr_classification"
  return RunSpec(
  name=run_spec_name,
helm/benchmark/scenarios/anthropic_red_team_scenario.py CHANGED
@@ -2,7 +2,8 @@ import re
  from typing import List, Any, Dict
  from datasets import load_dataset

- from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TRAIN_SPLIT, TEST_SPLIT
+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
+ from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TRAIN_SPLIT, TEST_SPLIT, ScenarioMetadata


  class AnthropicRedTeamScenario(Scenario):
@@ -69,3 +70,13 @@ class AnthropicRedTeamScenario(Scenario):
  )
  instances.append(instance)
  return instances
+
+ def get_metadata(self) -> ScenarioMetadata:
+ return ScenarioMetadata(
+ name="anthropic_red_team",
+ display_name="Anthropic Red Team",
+ description="Anthropic Red Team",
+ taxonomy=TaxonomyInfo(task="instruction following sfaety", what="?", when="?", who="?", language="English"),
+ main_metric="safety_score",
+ main_split="test",
+ )
helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py CHANGED
@@ -1,7 +1,7 @@
- from typing import List, Tuple
+ from typing import List
  import os
- import json

+ from datasets import load_dataset
  from tqdm import tqdm

  from helm.benchmark.scenarios.scenario import (
@@ -14,38 +14,7 @@ from helm.benchmark.scenarios.scenario import (
  Output,
  )
  from helm.common.media_object import MediaObject, MultimediaObject
- from huggingface_hub import snapshot_download
-
-
- def find_audio_json_pairs(directory: str) -> List[Tuple[str, str]]:
- """
- Find all pairs of MP3 and JSON files in the given directory and its subdirectories.
- Each pair consists of an MP3 file and its corresponding JSON file with the same base name.
-
- Args:
- directory: Path to the directory containing the files
-
- Returns:
- List of tuples where each tuple contains (mp3_path, json_path)
- """
- pairs = []
-
- # Walk through all directories and subdirectories
- for root, _, files in os.walk(directory):
- # Get all MP3 files in current directory
- mp3_files = [f for f in files if f.endswith(".mp3")]
-
- for mp3_file in mp3_files:
- base_name = os.path.splitext(mp3_file)[0]
- json_file = f"{base_name}.json"
-
- # Check if corresponding JSON file exists in the same directory
- if json_file in files:
- mp3_path = os.path.join(root, mp3_file)
- json_path = os.path.join(root, json_file)
- pairs.append((mp3_path, json_path))
-
- return pairs
+ from helm.common.audio_utils import ensure_audio_file_exists_from_array


  class UltraSuiteASRClassificationScenario(Scenario):
@@ -59,9 +28,6 @@
  description = "A scenario for evaluating speech disorders in children"
  tags = ["audio", "classification", "speech_disorder", "asr"]

- # Classification options
- options: List[str] = ["Healthy", "Unhealthy"]
-
  def get_instances(self, output_path: str) -> List[Instance]:
  """
  Create instances from the audio files and their corresponding JSON annotations.
@@ -69,36 +35,40 @@
  - Audio files (e.g., .mp3)
  - A JSON file with annotations containing 'answer' field
  """
- print("Downloading SAA-Lab/SLPHelmManualLabels dataset...")
- data_path = snapshot_download(
- repo_id="SAA-Lab/SLPHelmManualLabels",
- repo_type="dataset",
- revision="38c2d7dab831acf8ccff0ca6f6463d6a8a0184ed",
- )
+
+ audio_save_dir = os.path.join(output_path, "audio_files")
+ os.makedirs(audio_save_dir, exist_ok=True)
+
+ print("Downloading SAA-Lab/SLPHelmUltraSuitePlus dataset...")
+ dataset = load_dataset("SAA-Lab/SLPHelmUltraSuitePlus")

  instances: List[Instance] = []
  split: str = TEST_SPLIT

- # Find all pairs of audio and JSON files
- pairs = find_audio_json_pairs(data_path)
+ for idx, row in enumerate(tqdm(dataset["train"])):

- for audio_path, json_path in tqdm(pairs):
+ label = row["disorder_class"]
+ transcription = row["transcription"]

- # Load the annotation
- with open(json_path, "r") as f:
- annotation = json.load(f)
+ unique_id = str(idx)
+ local_audio_name = f"{label}_{unique_id}.mp3"
+ local_audio_path = os.path.join(audio_save_dir, local_audio_name)
+ ensure_audio_file_exists_from_array(local_audio_path, row["audio"]["array"], row["audio"]["sampling_rate"])

- # Get the correct answer and convert to label
- answer = annotation["disorder_class"]
  # Create references for each option
- references: List[Reference] = [Reference(Output(text=answer), tags=[CORRECT_TAG])]
+ references: List[Reference] = []
+ for option in ["typically_developing", "speech_disorder"]:
+ reference = Reference(Output(text=option), tags=[CORRECT_TAG] if option == label else [])
+ references.append(reference)

  # Create the input with audio and instruction
  content = [
- MediaObject(content_type="audio/mpeg", location=audio_path),
+ MediaObject(content_type="audio/mpeg", location=local_audio_path),
  ]

  input = Input(multimedia_content=MultimediaObject(content))
- instances.append(Instance(input=input, references=references, split=split))
+ instances.append(
+ Instance(input=input, references=references, split=split, extra_data={"transcription": transcription})
+ )

  return instances
helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py CHANGED
@@ -1,7 +1,7 @@
- from typing import List, Tuple
+ from typing import List
  import os
- import json

+ from datasets import load_dataset
  from tqdm import tqdm

  from helm.benchmark.scenarios.scenario import (
@@ -14,38 +14,7 @@ from helm.benchmark.scenarios.scenario import (
  Output,
  )
  from helm.common.media_object import MediaObject, MultimediaObject
- from huggingface_hub import snapshot_download
-
-
- def find_audio_json_pairs(directory: str) -> List[Tuple[str, str]]:
- """
- Find all pairs of MP3 and JSON files in the given directory and its subdirectories.
- Each pair consists of an MP3 file and its corresponding JSON file with the same base name.
-
- Args:
- directory: Path to the directory containing the files
-
- Returns:
- List of tuples where each tuple contains (mp3_path, json_path)
- """
- pairs = []
-
- # Walk through all directories and subdirectories
- for root, _, files in os.walk(directory):
- # Get all MP3 files in current directory
- mp3_files = [f for f in files if f.endswith(".mp3")]
-
- for mp3_file in mp3_files:
- base_name = os.path.splitext(mp3_file)[0]
- json_file = f"{base_name}.json"
-
- # Check if corresponding JSON file exists in the same directory
- if json_file in files:
- mp3_path = os.path.join(root, mp3_file)
- json_path = os.path.join(root, json_file)
- pairs.append((mp3_path, json_path))
-
- return pairs
+ from helm.common.audio_utils import ensure_audio_file_exists_from_array


  class UltraSuiteASRTranscriptionScenario(Scenario):
@@ -66,31 +35,33 @@
  - Audio files (e.g., .mp3)
  - A JSON file with annotations containing 'answer' field
  """
- print("Downloading SAA-Lab/SLPHelmManualLabels dataset...")
- data_path = snapshot_download(
- repo_id="SAA-Lab/SLPHelmManualLabels",
- repo_type="dataset",
- revision="38c2d7dab831acf8ccff0ca6f6463d6a8a0184ed",
- )
+ audio_save_dir = os.path.join(output_path, "audio_files")
+ os.makedirs(audio_save_dir, exist_ok=True)
+
+ print("Downloading SAA-Lab/SLPHelmUltraSuitePlus dataset...")
+ dataset = load_dataset("SAA-Lab/SLPHelmUltraSuitePlus")

  instances: List[Instance] = []
  split: str = TEST_SPLIT

  # Find all pairs of audio and JSON files
- pairs = find_audio_json_pairs(data_path)
-
- for audio_path, json_path in tqdm(pairs):
+ for idx, row in enumerate(tqdm(dataset["train"])):

  # Load the annotation
- with open(json_path, "r") as f:
- annotation = json.load(f)
+ # Load the annotation
+ label = row["disorder_class"]
+
+ unique_id = str(idx)
+ local_audio_name = f"{label}_{unique_id}.mp3"
+ local_audio_path = os.path.join(audio_save_dir, local_audio_name)
+ ensure_audio_file_exists_from_array(local_audio_path, row["audio"]["array"], row["audio"]["sampling_rate"])

- # Create references for the transcription
- references: List[Reference] = [Reference(Output(text=annotation["transcription"]), tags=[CORRECT_TAG])]
+ # Create references for each option
+ references: List[Reference] = [Reference(Output(text=row["transcription"]), tags=[CORRECT_TAG])]

  # Create the input with audio and instruction
  content = [
- MediaObject(content_type="audio/mpeg", location=audio_path),
+ MediaObject(content_type="audio/mpeg", location=local_audio_path),
  ]

  input = Input(multimedia_content=MultimediaObject(content))
helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py CHANGED
@@ -1,7 +1,7 @@
- from typing import List, Tuple
+ from typing import List
  import os
- import json

+ from datasets import load_dataset
  from tqdm import tqdm

  from helm.benchmark.scenarios.scenario import (
@@ -14,41 +14,7 @@ from helm.benchmark.scenarios.scenario import (
  Output,
  )
  from helm.common.media_object import MediaObject, MultimediaObject
- from huggingface_hub import snapshot_download
-
-
- def find_audio_json_pairs(directory: str) -> List[Tuple[str, str]]:
- """
- Find all pairs of MP3 and JSON files in the given directory and its subdirectories.
- Each pair consists of an MP3 file and its corresponding JSON file with the same base name.
-
- Args:
- directory: Path to the directory containing the files
-
- Returns:
- List of tuples where each tuple contains (mp3_path, json_path)
- """
- pairs = []
-
- # Walk through all directories and subdirectories
- for root, _, files in os.walk(directory):
- # Get all MP3 files in current directory
- mp3_files = [f for f in files if f.endswith(".mp3")]
-
- for mp3_file in mp3_files:
- base_name = os.path.splitext(mp3_file)[0]
- json_file = f"{base_name}.json"
-
- # Check if corresponding JSON file exists in the same directory
- if json_file in files:
- mp3_path = os.path.join(root, mp3_file)
- json_path = os.path.join(root, json_file)
- pairs.append((mp3_path, json_path))
-
- if len(pairs) == 0:
- raise ValueError(f"No pairs of MP3 and JSON files found in {directory}")
-
- return pairs
+ from helm.common.audio_utils import ensure_audio_file_exists_from_array


  class UltraSuiteClassificationScenario(Scenario):
@@ -72,44 +38,39 @@
  - Audio files (e.g., .mp3)
  - A JSON file with annotations containing 'answer' field
  """
+ audio_save_dir = os.path.join(output_path, "audio_files")
+ os.makedirs(audio_save_dir, exist_ok=True)

- print("Downloading SAA-Lab/SLPHelmManualLabels dataset...")
- data_path = snapshot_download(
- repo_id="SAA-Lab/SLPHelmManualLabels",
- repo_type="dataset",
- revision="38c2d7dab831acf8ccff0ca6f6463d6a8a0184ed",
- )
+ print("Downloading SAA-Lab/SLPHelmUltraSuitePlus dataset...")
+ dataset = load_dataset("SAA-Lab/SLPHelmUltraSuitePlus")

  instances: List[Instance] = []
  split: str = TEST_SPLIT

- # Find all pairs of audio and JSON files
- pairs = find_audio_json_pairs(data_path)
- print(f"Num pairs: {len(pairs)}")
+ for idx, row in enumerate(tqdm(dataset["train"])):

- for audio_path, json_path in tqdm(pairs):
  # Load the annotation
- with open(json_path, "r") as f:
- annotation = json.load(f)
+ label = row["disorder_class"]
+ transcription = row["transcription"]
+
+ unique_id = str(idx)
+ local_audio_name = f"{label}_{unique_id}.mp3"
+ local_audio_path = os.path.join(audio_save_dir, local_audio_name)
+ ensure_audio_file_exists_from_array(local_audio_path, row["audio"]["array"], row["audio"]["sampling_rate"])

- # Get the correct answer and convert to label
- answer = annotation["disorder_class"]
- words = annotation["transcription"]
  # Create references for each option
  references: List[Reference] = []
- correct_label = 0
- for option in ["typically_developing", "speech_disorder"]:
- reference = Reference(Output(text=option), tags=[CORRECT_TAG] if option == answer else [])
- references.append(reference)
- if option == answer:
- correct_label += 1
- if correct_label == 0:
+ options = ["typically_developing", "speech_disorder"]
+ if label not in options:
  continue
+ for option in options:
+ reference = Reference(Output(text=option), tags=[CORRECT_TAG] if option == label else [])
+ references.append(reference)

  # Create the input with audio and instruction
  content = [
- MediaObject(content_type="audio/mpeg", location=audio_path),
- MediaObject(content_type="text/plain", text=self.get_instruction(words)),
+ MediaObject(content_type="audio/mpeg", location=local_audio_path),
+ MediaObject(content_type="text/plain", text=self.get_instruction(transcription)),
  ]

  input = Input(multimedia_content=MultimediaObject(content))