PyPI - crfm-helm - Versions diffs - 0.5.5__py3-none-any.whl → 0.5.7__py3-none-any.whl - Mend

crfm-helm 0.5.5__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of crfm-helm might be problematic. Click here for more details.

Files changed (268) hide show

{crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +74 -53
{crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +262 -182
{crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +1 -1
helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
helm/benchmark/annotation/air_bench_annotator.py +2 -2
helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
helm/benchmark/annotation/bird_sql_annotator.py +2 -2
helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
helm/benchmark/annotation/live_qa_annotator.py +1 -1
helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
helm/benchmark/annotation/model_as_judge.py +12 -16
helm/benchmark/annotation/omni_math_annotator.py +13 -14
helm/benchmark/annotation/wildbench_annotator.py +9 -9
helm/benchmark/executor.py +11 -12
helm/benchmark/metrics/aci_bench_metrics.py +9 -29
helm/benchmark/metrics/bias_word_lists.py +1 -1
helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
helm/benchmark/metrics/classification_metrics.py +3 -3
helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
helm/benchmark/metrics/comet_metric.py +1 -1
helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
helm/benchmark/metrics/copyright_metrics.py +1 -1
helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
helm/benchmark/metrics/dischargeme_metrics.py +9 -29
helm/benchmark/metrics/efficiency_metrics.py +3 -3
helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
helm/benchmark/metrics/ifeval_metrics.py +2 -2
helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
helm/benchmark/metrics/llm_jury_metrics.py +46 -0
helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
helm/benchmark/metrics/lmkt_metrics.py +47 -0
helm/benchmark/metrics/med_dialog_metrics.py +9 -29
helm/benchmark/metrics/medalign_metrics.py +9 -29
helm/benchmark/metrics/medi_qa_metrics.py +9 -29
helm/benchmark/metrics/medication_qa_metrics.py +10 -30
helm/benchmark/metrics/melt_bias_metric.py +234 -0
helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
helm/benchmark/metrics/melt_metric_specs.py +43 -0
helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
helm/benchmark/metrics/mental_health_metrics.py +9 -29
helm/benchmark/metrics/metric_service.py +11 -11
helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
helm/benchmark/metrics/summac/model_summac.py +2 -3
helm/benchmark/metrics/summarization_metrics.py +2 -1
helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
helm/benchmark/metrics/toxicity_metrics.py +2 -2
helm/benchmark/metrics/unitxt_metrics.py +3 -4
helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
helm/benchmark/metrics/vision_language/image_utils.py +2 -2
helm/benchmark/model_deployment_registry.py +16 -26
helm/benchmark/presentation/contamination.py +3 -3
helm/benchmark/presentation/create_plots.py +43 -13
helm/benchmark/presentation/run_display.py +13 -0
helm/benchmark/presentation/schema.py +7 -1
helm/benchmark/presentation/summarize.py +84 -61
helm/benchmark/presentation/test_create_plots.py +4 -1
helm/benchmark/reeval_run.py +3 -4
helm/benchmark/reeval_runner.py +3 -3
helm/benchmark/run.py +84 -73
helm/benchmark/run_expander.py +12 -1
helm/benchmark/run_spec_factory.py +7 -6
helm/benchmark/run_specs/arabic_run_specs.py +73 -0
helm/benchmark/run_specs/audio_run_specs.py +52 -8
helm/benchmark/run_specs/bluex_run_specs.py +40 -0
helm/benchmark/run_specs/classic_run_specs.py +0 -53
helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
helm/benchmark/run_specs/experimental_run_specs.py +31 -1
helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
helm/benchmark/run_specs/heim_run_specs.py +3 -1
helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
helm/benchmark/run_specs/long_context_run_specs.py +114 -15
helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
helm/benchmark/run_specs/melt_run_specs.py +783 -0
helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +163 -0
helm/benchmark/run_specs/vlm_run_specs.py +28 -0
helm/benchmark/runner.py +5 -5
helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
helm/benchmark/scenarios/alghafa_scenario.py +126 -0
helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
helm/benchmark/scenarios/aratrust_scenario.py +76 -0
helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +104 -0
helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +118 -0
helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +86 -0
helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +117 -0
helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
helm/benchmark/scenarios/bluex_scenario.py +66 -0
helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
helm/benchmark/scenarios/clear_scenario.py +11 -7
helm/benchmark/scenarios/cleva_scenario.py +1 -1
helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
helm/benchmark/scenarios/grammar.py +2 -2
helm/benchmark/scenarios/headqa_scenario.py +6 -1
helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
helm/benchmark/scenarios/math_scenario.py +21 -20
helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
helm/benchmark/scenarios/medalign_scenario.py +9 -3
helm/benchmark/scenarios/medalign_scenario_helper.py +27 -130
helm/benchmark/scenarios/medbullets_scenario.py +7 -2
helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
helm/benchmark/scenarios/medec_scenario.py +6 -1
helm/benchmark/scenarios/medhallu_scenario.py +7 -1
helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
helm/benchmark/scenarios/melt_scenarios.py +793 -0
helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
helm/benchmark/scenarios/mental_health_scenario.py +16 -5
helm/benchmark/scenarios/mimic_bhc_scenario.py +13 -8
helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
helm/benchmark/scenarios/seahelm_scenario.py +2 -2
helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
helm/benchmark/server.py +2 -1
helm/benchmark/slurm_jobs.py +1 -2
helm/benchmark/slurm_runner.py +8 -1
helm/benchmark/static/schema_arabic.yaml +228 -0
helm/benchmark/static/schema_audio.yaml +60 -49
helm/benchmark/static/schema_classic.yaml +0 -17
helm/benchmark/static/schema_enterprise.yaml +21 -0
helm/benchmark/static/schema_long_context.yaml +81 -20
helm/benchmark/static/schema_medhelm.yaml +272 -213
helm/benchmark/static/schema_melt.yaml +1257 -0
helm/benchmark/static/schema_slphelm.yaml +162 -0
helm/benchmark/static/schema_vhelm.yaml +26 -26
helm/benchmark/static/schema_video.yaml +219 -0
helm/benchmark/static_build/assets/index-b9779128.css +1 -0
helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
helm/benchmark/static_build/index.html +4 -4
helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
helm/benchmark/window_services/test_utils.py +3 -4
helm/benchmark/window_services/tokenizer_service.py +7 -8
helm/clients/anthropic_client.py +69 -29
helm/clients/audio_language/diva_llama_client.py +4 -2
helm/clients/audio_language/qwen2_5_omni_client.py +209 -0
helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
helm/clients/audio_language/qwen_audiolm_client.py +4 -2
helm/clients/audio_language/test.py +62 -0
helm/clients/bedrock_client.py +3 -1
helm/clients/client.py +7 -7
helm/clients/grok_client.py +36 -0
helm/clients/huggingface_client.py +42 -3
helm/clients/huggingface_pipeline_client.py +138 -0
helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
helm/clients/openai_client.py +102 -55
helm/clients/openai_responses_client.py +176 -0
helm/clients/palmyra_client.py +2 -5
helm/clients/reka_client.py +2 -2
helm/clients/test_huggingface_client.py +3 -3
helm/clients/together_client.py +31 -6
helm/clients/vertexai_client.py +17 -9
helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
helm/clients/vision_language/huggingface_vlm_client.py +2 -2
helm/clients/vision_language/idefics_client.py +6 -2
helm/clients/vision_language/paligemma_client.py +2 -2
helm/clients/vision_language/qwen2_vlm_client.py +66 -53
helm/clients/vision_language/qwen_vlm_client.py +7 -5
helm/clients/vllm_client.py +43 -7
helm/clients/vllm_granite_thinking_client.py +56 -0
helm/clients/writer_client.py +102 -0
helm/common/context.py +80 -0
helm/common/credentials_utils.py +5 -5
helm/common/critique_request.py +0 -1
helm/common/general.py +9 -2
helm/common/hierarchical_logger.py +104 -12
helm/common/local_context.py +140 -0
helm/common/object_spec.py +23 -8
helm/common/remote_context.py +61 -0
helm/common/request.py +8 -0
helm/common/test_logging.py +94 -0
helm/config/model_deployments.yaml +995 -45
helm/config/model_metadata.yaml +780 -59
helm/config/tokenizer_configs.yaml +224 -3
helm/proxy/cli.py +4 -2
helm/proxy/critique/mechanical_turk_utils.py +1 -1
helm/proxy/retry.py +5 -0
helm/proxy/services/server_service.py +21 -85
helm/tokenizers/grok_tokenizer.py +55 -0
helm/tokenizers/huggingface_tokenizer.py +1 -1
helm/tokenizers/test_grok_tokenizer.py +33 -0
helm/benchmark/metrics/numeracy_metrics.py +0 -72
helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
helm/benchmark/scenarios/numeracy_scenario.py +0 -793
helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
helm/benchmark/static_build/assets/index-262903c1.js +0 -10
helm/benchmark/static_build/assets/index-42060d71.css +0 -1
{crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
{crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
{crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
/helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0

helm/benchmark/scenarios/audio_language/corebench_scenario.py ADDED Viewed

@@ -0,0 +1,77 @@
+from typing import List
+import os
+import json
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+from tqdm import tqdm
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.general import ensure_file_downloaded
+class COREBenchScenario(Scenario):
+    """COREBench
+    COREBench is a new audio benchmark incorporating multi-speaker conversations. It consists of conversational
+    audio, transcript, question, and answer. There are two challenging features of this benchmark: (1) the questions
+    are designed to require reasoning over multiple turns of conversation, and (2) the average audio length is
+    longer than 1 minute, which is significantly longer than existing benchmarks.
+    Each instance pairs one audio clip with its question followed by a fixed answer-format instruction; the
+    single correct reference is the lower-cased annotated answer.
+    """
+    # JSON Lines file with one annotation record per line
+    ANNOT_URL = (
+        "https://huggingface.co/datasets/stanford-crfm/COnversationalREasoningBench_v0.1/resolve/"
+        "main/test/instances.jsonl"
+    )
+    # Base URL that each record's "audio_path" is appended to
+    HF_AUDIO_FOLDER = (
+        "https://huggingface.co/datasets/stanford-crfm/COnversationalREasoningBench_v0.1/resolve/main/test/audio"
+    )
+    # Appended verbatim to every question
+    COREBENCH_INSTRUCTION = (
+        "\n\n Answer the question by just giving the final answer and nothing else. "
+        "Answer 'unanswerable' if the question is irrelevant to the audio or cannot be inferred."
+    )
+    name = "corebench"
+    description = "Exploring multi-speaker conversational audio reasoning task."
+    tags: List[str] = ["audio", "reasoning"]
+    def load_jsonl(self, file_path):
+        """Parse a JSON Lines file into a list of decoded records.
+        Blank and whitespace-only lines are skipped rather than handed to the JSON decoder.
+        Args:
+            file_path: Path of the UTF-8 encoded JSONL file
+        Returns:
+            One decoded object per non-empty line, in file order
+        """
+        with open(file_path, "r", encoding="utf-8") as f:
+            stripped_lines = (line.strip() for line in f)
+            return [json.loads(text) for text in stripped_lines if text]
+    def get_instances(self, output_path: str) -> List[Instance]:
+        """Download the annotations and audio clips, then build one test instance per annotation.
+        Args:
+            output_path: Directory that receives "instances.jsonl" and the "audio" sub-folder
+        Returns:
+            Instances in annotation order, all assigned to the test split
+        """
+        instances: List[Instance] = []
+        annot_save_path = os.path.join(output_path, "instances.jsonl")
+        ensure_file_downloaded(source_url=COREBenchScenario.ANNOT_URL, target_path=annot_save_path)
+        annotations = self.load_jsonl(annot_save_path)
+        audio_save_dir = os.path.join(output_path, "audio")
+        # Download audio files first
+        for row in tqdm(annotations):
+            audio_path = row["audio_path"]
+            ensure_file_downloaded(
+                # URLs always use "/"; os.path.join would insert "\" on Windows
+                source_url=f"{COREBenchScenario.HF_AUDIO_FOLDER}/{audio_path}",
+                target_path=os.path.join(audio_save_dir, audio_path),
+            )
+        for row in tqdm(annotations):
+            local_audio_path = os.path.join(audio_save_dir, row["audio_path"])
+            answer = row["answer"].lower()
+            question = row["question"]
+            # Named audio_input so the builtin input() is not shadowed
+            audio_input = Input(
+                multimedia_content=MultimediaObject(
+                    [
+                        MediaObject(content_type="audio/mpeg", location=local_audio_path),
+                        MediaObject(content_type="text/plain", text=question + self.COREBENCH_INSTRUCTION),
+                    ]
+                )
+            )
+            references = [Reference(Output(text=answer), tags=[CORRECT_TAG])]
+            instances.append(Instance(input=audio_input, references=references, split=TEST_SPLIT))
+        return instances

helm/benchmark/scenarios/audio_language/mustard_scenario.py CHANGED Viewed

@@ -19,7 +19,7 @@ from helm.common.general import ensure_directory_exists, ensure_file_downloaded
 class MUStARDScenario(Scenario):
-    """
+    r"""
     MUStARD: Multimodal Sarcasm Detection Dataset
     A multimodal video corpus for research in automated sarcasm discovery. The dataset is compiled from popular

helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py ADDED Viewed

@@ -0,0 +1,104 @@
+from typing import List, Tuple
+import os
+import json
+from tqdm import tqdm
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from huggingface_hub import snapshot_download
+def find_audio_json_pairs(directory: str) -> List[Tuple[str, str]]:
+    """
+    Find all pairs of MP3 and JSON files in the given directory and its subdirectories.
+    Each pair consists of an MP3 file and the JSON file in the same directory with the same base name.
+    MP3 files without a matching JSON file are ignored.
+    Args:
+        directory: Path to the directory containing the files
+    Returns:
+        List of (mp3_path, json_path) tuples, sorted so the order is reproducible across file systems
+    """
+    pairs: List[Tuple[str, str]] = []
+    # Walk through all directories and subdirectories
+    for root, _, files in os.walk(directory):
+        # A set keeps the lookup for each candidate JSON name constant-time
+        available = set(files)
+        for mp3_file in files:
+            if not mp3_file.endswith(".mp3"):
+                continue
+            json_file = f"{os.path.splitext(mp3_file)[0]}.json"
+            # Check if corresponding JSON file exists in the same directory
+            if json_file in available:
+                pairs.append((os.path.join(root, mp3_file), os.path.join(root, json_file)))
+    # os.walk lists entries in arbitrary order, so sort before returning
+    pairs.sort()
+    return pairs
+class UltraSuiteASRClassificationScenario(Scenario):
+    """
+    A scenario for evaluating whether a child speaker has a speech disorder or not.
+    The audio files contain speech from children, potentially with an adult present.
+    The task is to classify whether the child speaker is typically developing or has a speech disorder.
+    Each instance carries only the audio clip (no text prompt); its single correct reference is the
+    "disorder_class" value read from the clip's JSON annotation.
+    """
+    name = "speech_disorder"
+    description = "A scenario for evaluating speech disorders in children"
+    tags = ["audio", "classification", "speech_disorder", "asr"]
+    # Classification options
+    # NOTE(review): not referenced inside this class; confirm how these relate to the "disorder_class" values
+    options: List[str] = ["Healthy", "Unhealthy"]
+    def get_instances(self, output_path: str) -> List[Instance]:
+        """
+        Create instances from the audio files and their corresponding JSON annotations.
+        The downloaded dataset snapshot should contain:
+        - Audio files (.mp3)
+        - For each audio file, a JSON file with the same base name containing a 'disorder_class' field
+        Args:
+            output_path: Not used by this method; the data comes from `snapshot_download`
+        Returns:
+            One test-split instance per (audio, JSON) pair
+        """
+        print("Downloading SAA-Lab/SLPHelmManualLabels dataset...")
+        data_path = snapshot_download(
+            repo_id="SAA-Lab/SLPHelmManualLabels",
+            repo_type="dataset",
+            revision="38c2d7dab831acf8ccff0ca6f6463d6a8a0184ed",
+        )
+        instances: List[Instance] = []
+        split: str = TEST_SPLIT
+        # Find all pairs of audio and JSON files
+        pairs = find_audio_json_pairs(data_path)
+        for audio_path, json_path in tqdm(pairs):
+            # Load the annotation; JSON is UTF-8, so do not rely on the platform default encoding
+            with open(json_path, "r", encoding="utf-8") as f:
+                annotation = json.load(f)
+            # The annotated class is used verbatim as the only (correct) reference
+            answer = annotation["disorder_class"]
+            references: List[Reference] = [Reference(Output(text=answer), tags=[CORRECT_TAG])]
+            # Create the input from the audio alone
+            content = [
+                MediaObject(content_type="audio/mpeg", location=audio_path),
+            ]
+            # Named audio_input so the builtin input() is not shadowed
+            audio_input = Input(multimedia_content=MultimediaObject(content))
+            instances.append(Instance(input=audio_input, references=references, split=split))
+        return instances

helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py ADDED Viewed

@@ -0,0 +1,99 @@
+from typing import List, Tuple
+import os
+import json
+from tqdm import tqdm
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from huggingface_hub import snapshot_download
+def find_audio_json_pairs(directory: str) -> List[Tuple[str, str]]:
+    """
+    Find all pairs of MP3 and JSON files in the given directory and its subdirectories.
+    Each pair consists of an MP3 file and the JSON file in the same directory with the same base name.
+    MP3 files without a matching JSON file are ignored.
+    Args:
+        directory: Path to the directory containing the files
+    Returns:
+        List of (mp3_path, json_path) tuples, sorted so the order is reproducible across file systems
+    """
+    pairs: List[Tuple[str, str]] = []
+    # Walk through all directories and subdirectories
+    for root, _, files in os.walk(directory):
+        # A set keeps the lookup for each candidate JSON name constant-time
+        available = set(files)
+        for mp3_file in files:
+            if not mp3_file.endswith(".mp3"):
+                continue
+            json_file = f"{os.path.splitext(mp3_file)[0]}.json"
+            # Check if corresponding JSON file exists in the same directory
+            if json_file in available:
+                pairs.append((os.path.join(root, mp3_file), os.path.join(root, json_file)))
+    # os.walk lists entries in arbitrary order, so sort before returning
+    pairs.sort()
+    return pairs
+class UltraSuiteASRTranscriptionScenario(Scenario):
+    """
+    A scenario for evaluating the transcription capabilities of ASR systems.
+    The audio files contain speech from children, potentially with an adult present.
+    The task is to transcribe the audio; the single correct reference of each instance is the
+    "transcription" value read from the clip's JSON annotation.
+    """
+    name = "speech_disorder"
+    description = "A scenario for evaluating speech disorders in children"
+    tags = ["audio", "transcription", "speech_disorder", "asr"]
+    def get_instances(self, output_path: str) -> List[Instance]:
+        """
+        Create instances from the audio files and their corresponding JSON annotations.
+        The downloaded dataset snapshot should contain:
+        - Audio files (.mp3)
+        - For each audio file, a JSON file with the same base name containing a 'transcription' field
+        Args:
+            output_path: Not used by this method; the data comes from `snapshot_download`
+        Returns:
+            One test-split instance per (audio, JSON) pair
+        """
+        print("Downloading SAA-Lab/SLPHelmManualLabels dataset...")
+        data_path = snapshot_download(
+            repo_id="SAA-Lab/SLPHelmManualLabels",
+            repo_type="dataset",
+            revision="38c2d7dab831acf8ccff0ca6f6463d6a8a0184ed",
+        )
+        instances: List[Instance] = []
+        split: str = TEST_SPLIT
+        # Find all pairs of audio and JSON files
+        pairs = find_audio_json_pairs(data_path)
+        for audio_path, json_path in tqdm(pairs):
+            # Load the annotation; JSON is UTF-8, so do not rely on the platform default encoding
+            with open(json_path, "r", encoding="utf-8") as f:
+                annotation = json.load(f)
+            # Create references for the transcription
+            references: List[Reference] = [Reference(Output(text=annotation["transcription"]), tags=[CORRECT_TAG])]
+            # Create the input from the audio alone
+            content = [
+                MediaObject(content_type="audio/mpeg", location=audio_path),
+            ]
+            # Named audio_input so the builtin input() is not shadowed
+            audio_input = Input(multimedia_content=MultimediaObject(content))
+            instances.append(Instance(input=audio_input, references=references, split=split))
+        return instances

helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py ADDED Viewed

@@ -0,0 +1,118 @@
+from typing import List, Tuple
+import os
+import json
+from tqdm import tqdm
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from huggingface_hub import snapshot_download
+def find_audio_json_pairs(directory: str) -> List[Tuple[str, str]]:
+    """
+    Find all pairs of MP3 and JSON files in the given directory and its subdirectories.
+    Each pair consists of an MP3 file and the JSON file in the same directory with the same base name.
+    MP3 files without a matching JSON file are ignored.
+    Args:
+        directory: Path to the directory containing the files
+    Returns:
+        List of (mp3_path, json_path) tuples, sorted so the order is reproducible across file systems
+    Raises:
+        ValueError: If no MP3/JSON pair is found anywhere under the directory
+    """
+    pairs: List[Tuple[str, str]] = []
+    # Walk through all directories and subdirectories
+    for root, _, files in os.walk(directory):
+        # A set keeps the lookup for each candidate JSON name constant-time
+        available = set(files)
+        for mp3_file in files:
+            if not mp3_file.endswith(".mp3"):
+                continue
+            json_file = f"{os.path.splitext(mp3_file)[0]}.json"
+            # Check if corresponding JSON file exists in the same directory
+            if json_file in available:
+                pairs.append((os.path.join(root, mp3_file), os.path.join(root, json_file)))
+    if not pairs:
+        raise ValueError(f"No pairs of MP3 and JSON files found in {directory}")
+    # os.walk lists entries in arbitrary order, so sort before returning
+    pairs.sort()
+    return pairs
+class UltraSuiteClassificationScenario(Scenario):
+    """
+    A scenario for evaluating whether a child speaker has a speech disorder or not.
+    The audio files contain speech from children, potentially with an adult present.
+    The task is to classify whether the child speaker is typically developing or has a speech disorder.
+    Each instance holds the audio clip plus a text instruction that embeds the annotated transcription,
+    and one reference per label with the annotated "disorder_class" tagged as correct.
+    """
+    name = "speech_disorder"
+    description = "A scenario for evaluating speech disorders in children"
+    tags = ["audio", "classification", "speech_disorder"]
+    def get_instruction(self, words: str) -> str:
+        """Return the prompt text for one clip.
+        Args:
+            words: The text the child is trying to repeat; inserted verbatim into the prompt
+        """
+        # NOTE(review): step 2 lists word labels while step 3 asks for "a single letter" - confirm the intended format
+        return f"""You are a highly experienced Speech-Language Pathologist (SLP). An audio recording will be provided, typically consisting of a speech prompt from a pathologist followed by a child's repetition. The prompt the child is trying to repeat is as follows: {words}. Based on your professional expertise: 1. Assess the child's speech in the recording for signs of typical development or potential speech-language disorder. 2. Conclude your analysis with one of the following labels only: 'typically_developing' or 'speech_disorder'. 3. Provide your response as a single letter without any additional explanation, commentary, or unnecessary text."""  # noqa: E501
+    def get_instances(self, output_path: str) -> List[Instance]:
+        """
+        Create instances from the audio files and their corresponding JSON annotations.
+        The downloaded dataset snapshot should contain:
+        - Audio files (.mp3)
+        - For each audio file, a JSON file with the same base name containing the 'disorder_class'
+          and 'transcription' fields
+        Clips whose 'disorder_class' is neither 'typically_developing' nor 'speech_disorder' are skipped.
+        Args:
+            output_path: Not used by this method; the data comes from `snapshot_download`
+        Returns:
+            One test-split instance per retained (audio, JSON) pair
+        """
+        print("Downloading SAA-Lab/SLPHelmManualLabels dataset...")
+        data_path = snapshot_download(
+            repo_id="SAA-Lab/SLPHelmManualLabels",
+            repo_type="dataset",
+            revision="38c2d7dab831acf8ccff0ca6f6463d6a8a0184ed",
+        )
+        instances: List[Instance] = []
+        split: str = TEST_SPLIT
+        # The label set is the same for every clip, so build it once
+        labels: List[str] = ["typically_developing", "speech_disorder"]
+        # Find all pairs of audio and JSON files
+        pairs = find_audio_json_pairs(data_path)
+        print(f"Num pairs: {len(pairs)}")
+        for audio_path, json_path in tqdm(pairs):
+            # Load the annotation; JSON is UTF-8, so do not rely on the platform default encoding
+            with open(json_path, "r", encoding="utf-8") as f:
+                annotation = json.load(f)
+            # Read the annotated class and the prompt text the child repeats
+            answer = annotation["disorder_class"]
+            words = annotation["transcription"]
+            if answer not in labels:
+                # No option would be tagged correct, so the clip cannot be scored
+                continue
+            # Create references for each option
+            references: List[Reference] = [
+                Reference(Output(text=label), tags=[CORRECT_TAG] if label == answer else []) for label in labels
+            ]
+            # Create the input with audio and instruction
+            content = [
+                MediaObject(content_type="audio/mpeg", location=audio_path),
+                MediaObject(content_type="text/plain", text=self.get_instruction(words)),
+            ]
+            # Named audio_input so the builtin input() is not shadowed
+            audio_input = Input(multimedia_content=MultimediaObject(content))
+            instances.append(Instance(input=audio_input, references=references, split=split))
+        return instances

helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py ADDED Viewed

@@ -0,0 +1,86 @@
+from typing import List
+import json
+from tqdm import tqdm
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from huggingface_hub import snapshot_download
+from .ultra_suite_classification_scenario import find_audio_json_pairs
+class UltraSuiteDisorderBreakdownScenario(Scenario):
+    """
+    A scenario for evaluating and classifying specific types of speech disorders in children.
+    This scenario extends the basic speech disorder classification by breaking down disorders
+    into specific categories: articulation and phonological disorders.
+    Each instance pairs one audio recording with a text instruction and carries one reference
+    per candidate label; the reference matching the annotation is tagged as correct.
+    """
+    name = "speech_disorder"
+    description = "A scenario for evaluating and classifying specific types of speech disorders in children"
+    tags = ["audio", "classification", "speech_disorder", "disorder_breakdown"]
+    def get_instruction(self, words: str) -> str:
+        """Return the instruction prompt, embedding the text the child is trying to repeat."""
+        return f"""You are a highly experienced Speech-Language Pathologist (SLP). An audio recording will be provided, typically consisting of a speech prompt from a pathologist followed by a child's repetition. The prompt text the child is trying to repeat is as follows: {words}. Based on your professional expertise: 1. Assess the child's speech in the recording for signs of typical development or potential speech-language disorder. 2. Conclude your analysis with one of the following labels only: A - 'typically developing' (child's speech patterns and development are within normal age-appropriate ranges), B - 'articulation' (difficulty producing specific speech sounds correctly, such as substituting, omitting, or distorting sounds), C - 'phonological' (difficulty understanding and using the sound system of language, affecting sounds of a particular type). 3. Provide your response as a single letter without any additional explanation, commentary, or unnecessary text."""  # noqa: E501
+    def get_instances(self, output_path: str) -> List[Instance]:
+        """
+        Create instances from the audio files and their corresponding JSON annotations.
+        The downloaded dataset snapshot should contain:
+        - Audio files (.mp3)
+        - A JSON file next to each audio file, with the same base name, containing the
+          'disorder_type' and 'transcription' fields
+        Pairs whose annotation lacks either field, or whose 'disorder_type' is not one of the
+        candidate labels, are skipped.
+        Args:
+            output_path: Not used by this scenario; the data comes from the Hugging Face snapshot.
+        Returns:
+            One test-split instance per usable (audio, annotation) pair.
+        """
+        print("Downloading SAA-Lab/SLPHelmManualLabels dataset...")
+        data_path = snapshot_download(
+            repo_id="SAA-Lab/SLPHelmManualLabels",
+            repo_type="dataset",
+            revision="38c2d7dab831acf8ccff0ca6f6463d6a8a0184ed",
+        )
+        instances: List[Instance] = []
+        split: str = TEST_SPLIT
+        # Candidate labels, in the order the references are emitted
+        options: List[str] = ["typically_developing", "articulation", "phonological"]
+        # Find all pairs of audio and JSON files
+        pairs = find_audio_json_pairs(data_path)
+        print(f"Num pairs: {len(pairs)}")
+        for audio_path, json_path in tqdm(pairs):
+            # JSON is UTF-8 by specification; do not depend on the platform's locale encoding
+            with open(json_path, "r", encoding="utf-8") as f:
+                annotation = json.load(f)
+            if "disorder_type" not in annotation or "transcription" not in annotation:
+                continue
+            label = annotation["disorder_type"]
+            prompt = annotation["transcription"]
+            # An instance without a correct reference cannot be scored, so skip unknown labels
+            if label not in options:
+                continue
+            # Create one reference per option, tagging the annotated one as correct
+            references: List[Reference] = [
+                Reference(Output(text=option), tags=[CORRECT_TAG] if option == label else [])
+                for option in options
+            ]
+            # Create the input with audio and instruction
+            content = [
+                MediaObject(content_type="audio/mpeg", location=audio_path),
+                MediaObject(content_type="text/plain", text=self.get_instruction(prompt)),
+            ]
+            instance_input = Input(multimedia_content=MultimediaObject(content))
+            instances.append(Instance(input=instance_input, references=references, split=split))
+        return instances

helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py ADDED Viewed

@@ -0,0 +1,117 @@
+from typing import List, Tuple
+import os
+import json
+from tqdm import tqdm
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from huggingface_hub import snapshot_download
+def find_audio_json_pairs(directory: str) -> List[Tuple[str, str]]:
+    """
+    Find all pairs of MP3 and JSON files in the given directory and its subdirectories.
+    Each pair consists of an MP3 file and its corresponding JSON file with the same base name,
+    located in the same directory. MP3 files without a matching JSON file are ignored.
+    The result is ordered deterministically (directories and file names are visited in sorted
+    order), so the instances built from it do not depend on filesystem enumeration order.
+    Args:
+        directory: Path to the directory containing the files
+    Returns:
+        List of tuples where each tuple contains (mp3_path, json_path)
+    """
+    pairs: List[Tuple[str, str]] = []
+    # Walk through all directories and subdirectories
+    for root, dirs, files in os.walk(directory):
+        # Sorting in place fixes the order in which os.walk descends into subdirectories
+        dirs.sort()
+        # Set lookup avoids rescanning the whole file list for every MP3 file
+        file_names = set(files)
+        for mp3_file in sorted(f for f in files if f.endswith(".mp3")):
+            json_file = f"{os.path.splitext(mp3_file)[0]}.json"
+            # Keep the MP3 only if its JSON annotation exists in the same directory
+            if json_file in file_names:
+                pairs.append((os.path.join(root, mp3_file), os.path.join(root, json_file)))
+    return pairs
+class UltraSuiteDisorderSymptomsScenario(Scenario):
+    """
+    A scenario identifying features of speech disorders within the provided audio.
+    The audio files contain speech from children, potentially with an adult present.
+    Each instance pairs one audio recording with a text instruction and carries one reference
+    per candidate symptom; the reference matching the annotation is tagged as correct.
+    """
+    name = "speech_disorder"
+    description = "A scenario for evaluating speech disorders in children"
+    tags = ["audio", "classification", "speech_disorder"]
+    def get_instruction(self, words: str) -> str:
+        """Return the instruction prompt, embedding the text the child is trying to repeat."""
+        prompt = f"""You are a highly experienced Speech-Language Pathologist (SLP). An audio recording will be provided, typically consisting of a speech prompt from a pathologist followed by a child's repetition. The prompt the child is trying to repeat is as follows: {words}. Based on your professional expertise: 1. Assess the child's speech in the recording and recognize any abnormal features in the child's speech. 2. These features can be on of the following: A - 'substitution', B - 'omission', C - 'addition', D - 'typically_developing', or E - 'stuttering'. Here, 'substitution' is when the child substitutes one word/phrase/syllable for another. 'omission' is when the child omits one word/phrase/syllable. 'addition' is when the child adds one word/phrase/syllable. 'typically_developing' is when the child's speech is typical of a child of their age. 'stuttering' is when the child stutters, has difficulty speaking, repeats sounds/words or prolongs sounds/words. 3. Provide your response as a single letter without any additional explanation, commentary, or unnecessary text."""  # noqa: E501
+        return prompt
+    def get_instances(self, output_path: str) -> List[Instance]:
+        """
+        Create instances from the audio files and their corresponding JSON annotations.
+        The downloaded dataset snapshot should contain:
+        - Audio files (.mp3)
+        - A JSON file next to each audio file, with the same base name, containing the
+          'disorder_symptom' and 'transcription' fields
+        Pairs whose annotation lacks either field, or whose 'disorder_symptom' is not one of the
+        candidate labels, are skipped.
+        Args:
+            output_path: Not used by this scenario; the data comes from the Hugging Face snapshot.
+        Returns:
+            One test-split instance per usable (audio, annotation) pair.
+        """
+        print("Downloading SAA-Lab/SLPHelmManualLabels dataset...")
+        data_path = snapshot_download(
+            repo_id="SAA-Lab/SLPHelmManualLabels",
+            repo_type="dataset",
+            revision="38c2d7dab831acf8ccff0ca6f6463d6a8a0184ed",
+        )
+        instances: List[Instance] = []
+        split: str = TEST_SPLIT
+        # Candidate labels, in the order the references are emitted
+        options: List[str] = ["substitution", "omission", "addition", "typically_developing", "stuttering"]
+        # Find all pairs of audio and JSON files
+        pairs = find_audio_json_pairs(data_path)
+        for audio_path, json_path in tqdm(pairs):
+            # JSON is UTF-8 by specification; do not depend on the platform's locale encoding
+            with open(json_path, "r", encoding="utf-8") as f:
+                annotation = json.load(f)
+            if "disorder_symptom" not in annotation or "transcription" not in annotation:
+                continue
+            label = annotation["disorder_symptom"]
+            prompt = annotation["transcription"]
+            # An instance without a correct reference cannot be scored, so skip unknown labels
+            if label not in options:
+                continue
+            # Create one reference per option, tagging the annotated one as correct
+            references: List[Reference] = [
+                Reference(Output(text=option), tags=[CORRECT_TAG] if option == label else [])
+                for option in options
+            ]
+            # Create the input with audio and instruction
+            content = [
+                MediaObject(content_type="audio/mpeg", location=audio_path),
+                MediaObject(content_type="text/plain", text=self.get_instruction(prompt)),
+            ]
+            instance_input = Input(multimedia_content=MultimediaObject(content))
+            instances.append(Instance(input=instance_input, references=references, split=split))
+        return instances

helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py CHANGED Viewed

@@ -46,6 +46,10 @@ class VocalSoundScenario(Scenario):
     description = "Classify an audio sample of a spoken digit ([Gong et al, 2022](https://arxiv.org/abs/2205.03433))."
     tags: List[str] = ["audio", "classification"]
+    def __init__(self, sound: str) -> None:
+        super().__init__()
+        self._sound: str = sound
     def get_instances(self, output_path: str) -> List[Instance]:
         instances: List[Instance] = []
         down_loading_path = os.path.join(output_path, "download")
@@ -53,7 +57,12 @@ class VocalSoundScenario(Scenario):
         wav_save_dir = os.path.join(down_loading_path, "audio_16k")
         for file_name in tqdm(os.listdir(wav_save_dir)):
             local_audio_path: str = os.path.join(wav_save_dir, file_name)
-            if not file_name.endswith(".wav") or is_invalid_audio_file(local_audio_path):
+            if (
+                not file_name.endswith(".wav")
+                or is_invalid_audio_file(local_audio_path)
+                # Skip this problematic file
+                or file_name == "m0083_0_sneeze.wav"
+            ):
                 continue
             input = Input(
@@ -61,9 +70,14 @@ class VocalSoundScenario(Scenario):
             )
             answer: str = file_name.split("_")[-1].split(".")[0]
+            if answer.lower() != self._sound:
+                continue
             if answer == "throatclearing":
                 answer = "throat clearing"
             references = [Reference(Output(text=str(answer)), tags=[CORRECT_TAG])]
             instances.append(Instance(input=input, references=references, split=TEST_SPLIT))
+        assert len(instances) > 0, f"No instances found for sound: {self._sound}"
         return instances

helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py CHANGED Viewed

@@ -40,8 +40,7 @@ class VoxCeleb2Scenario(Scenario):
         "https://huggingface.co/datasets/LAOS-Y/VoxCeleb2-AudioIdentity/resolve/main/voxceleb2_audioidentity.csv"
     )
     IDENTITY_INSTRUCTION = (
-        "Listen to the audio and take your best guess to determine if the two speakers are the same person. "
-        "Give just the letter of your answer and nothing else."
+        "Listen to the audio and take your best guess to determine if the two speakers are the same person."
     )
     name = "voxceleb2"

crfm-helm 0.5.5__py3-none-any.whl → 0.5.7__py3-none-any.whl

Potentially problematic release.

crfm-helm 0.5.5__py3-none-any.whl → 0.5.7__py3-none-any.whl