crfm-helm 0.5.6__py3-none-any.whl → 0.5.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic.
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +56 -49
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +99 -66
- helm/benchmark/annotation/air_bench_annotator.py +1 -1
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +1 -1
- helm/benchmark/metrics/copyright_metrics.py +1 -1
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +1 -1
- helm/benchmark/model_deployment_registry.py +11 -19
- helm/benchmark/presentation/create_plots.py +11 -2
- helm/benchmark/presentation/schema.py +5 -0
- helm/benchmark/presentation/summarize.py +9 -3
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/run.py +7 -1
- helm/benchmark/run_specs/arabic_run_specs.py +73 -0
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +0 -53
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +3 -1
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +48 -1
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +5 -11
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
- helm/benchmark/scenarios/aratrust_scenario.py +76 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/{ultra_suite_asr_classification.py → ultra_suite_asr_classification_scenario.py} +9 -8
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +13 -5
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +13 -5
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +13 -5
- helm/benchmark/scenarios/bluex_scenario.py +66 -0
- helm/benchmark/scenarios/cleva_scenario.py +1 -1
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/math_scenario.py +21 -20
- helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
- helm/benchmark/scenarios/melt_scenarios.py +2 -2
- helm/benchmark/scenarios/mimic_bhc_scenario.py +1 -1
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/seahelm_scenario.py +2 -2
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +8 -1
- helm/benchmark/static/schema_arabic.yaml +228 -0
- helm/benchmark/static/schema_classic.yaml +0 -17
- helm/benchmark/static/schema_long_context.yaml +19 -1
- helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
- helm/benchmark/static_build/index.html +1 -1
- helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
- helm/clients/audio_language/qwen2_5_omni_client.py +19 -7
- helm/clients/huggingface_client.py +2 -2
- helm/clients/openai_client.py +2 -1
- helm/clients/openai_responses_client.py +6 -4
- helm/clients/test_huggingface_client.py +3 -3
- helm/clients/together_client.py +0 -2
- helm/clients/vertexai_client.py +11 -9
- helm/clients/vllm_client.py +43 -7
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/common/critique_request.py +0 -1
- helm/common/hierarchical_logger.py +83 -34
- helm/common/object_spec.py +23 -8
- helm/common/test_logging.py +94 -0
- helm/config/model_deployments.yaml +454 -175
- helm/config/model_metadata.yaml +117 -10
- helm/config/tokenizer_configs.yaml +81 -1
- helm/proxy/cli.py +1 -1
- helm/proxy/retry.py +5 -0
- helm/tokenizers/grok_tokenizer.py +2 -0
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/scenarios/numeracy_scenario.py +0 -794
- helm/benchmark/static_build/assets/index-94295e78.js +0 -10
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py (new file)
@@ -0,0 +1,99 @@
+from typing import List, Tuple
+import os
+import json
+
+from tqdm import tqdm
+
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from huggingface_hub import snapshot_download
+
+
+def find_audio_json_pairs(directory: str) -> List[Tuple[str, str]]:
+    """
+    Find all pairs of MP3 and JSON files in the given directory and its subdirectories.
+    Each pair consists of an MP3 file and its corresponding JSON file with the same base name.
+
+    Args:
+        directory: Path to the directory containing the files
+
+    Returns:
+        List of tuples where each tuple contains (mp3_path, json_path)
+    """
+    pairs = []
+
+    # Walk through all directories and subdirectories
+    for root, _, files in os.walk(directory):
+        # Get all MP3 files in current directory
+        mp3_files = [f for f in files if f.endswith(".mp3")]
+
+        for mp3_file in mp3_files:
+            base_name = os.path.splitext(mp3_file)[0]
+            json_file = f"{base_name}.json"
+
+            # Check if corresponding JSON file exists in the same directory
+            if json_file in files:
+                mp3_path = os.path.join(root, mp3_file)
+                json_path = os.path.join(root, json_file)
+                pairs.append((mp3_path, json_path))
+
+    return pairs
+
+
+class UltraSuiteASRTranscriptionScenario(Scenario):
+    """
+    A scenario for evaluating the transcription capabilities of ASR systems.
+    The audio files contain speech from children, potentially with an adult present.
+    The task is to transcribe the child's speech.
+    """
+
+    name = "speech_disorder"
+    description = "A scenario for evaluating speech disorders in children"
+    tags = ["audio", "transcription", "speech_disorder", "asr"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        """
+        Create instances from the audio files and their corresponding JSON annotations.
+        The data directory should contain:
+        - Audio files (e.g., .mp3)
+        - A JSON file with annotations containing a 'transcription' field
+        """
+        print("Downloading SAA-Lab/SLPHelmManualLabels dataset...")
+        data_path = snapshot_download(
+            repo_id="SAA-Lab/SLPHelmManualLabels",
+            repo_type="dataset",
+            revision="38c2d7dab831acf8ccff0ca6f6463d6a8a0184ed",
+        )
+
+        instances: List[Instance] = []
+        split: str = TEST_SPLIT
+
+        # Find all pairs of audio and JSON files
+        pairs = find_audio_json_pairs(data_path)
+
+        for audio_path, json_path in tqdm(pairs):
+
+            # Load the annotation
+            with open(json_path, "r") as f:
+                annotation = json.load(f)
+
+            # Create references for the transcription
+            references: List[Reference] = [Reference(Output(text=annotation["transcription"]), tags=[CORRECT_TAG])]
+
+            # Create the input with audio and instruction
+            content = [
+                MediaObject(content_type="audio/mpeg", location=audio_path),
+            ]
+
+            input = Input(multimedia_content=MultimediaObject(content))
+            instances.append(Instance(input=input, references=references, split=split))
+
+        return instances
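For readers skimming the diff, here is a minimal sketch of how the new find_audio_json_pairs helper behaves; the toy directory tree and file names below are hypothetical, not part of the dataset:

import os
import tempfile

from helm.benchmark.scenarios.audio_language.ultra_suite_asr_transcription_scenario import (
    find_audio_json_pairs,
)

# Hypothetical layout: clip_a has a same-named JSON sidecar, clip_b does not.
root = tempfile.mkdtemp()
os.makedirs(os.path.join(root, "speaker_01"))
for name in ["clip_a.mp3", "clip_a.json", "clip_b.mp3"]:
    open(os.path.join(root, "speaker_01", name), "w").close()

pairs = find_audio_json_pairs(root)
# Only clip_a is paired; clip_b.mp3 lacks a matching JSON and is skipped.
assert pairs == [
    (
        os.path.join(root, "speaker_01", "clip_a.mp3"),
        os.path.join(root, "speaker_01", "clip_a.json"),
    )
]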
helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py
@@ -14,7 +14,7 @@ from helm.benchmark.scenarios.scenario import (
     Output,
 )
 from helm.common.media_object import MediaObject, MultimediaObject
-from
+from huggingface_hub import snapshot_download
 
 
 def find_audio_json_pairs(directory: str) -> List[Tuple[str, str]]:
@@ -61,7 +61,6 @@ class UltraSuiteClassificationScenario(Scenario):
     name = "speech_disorder"
     description = "A scenario for evaluating speech disorders in children"
     tags = ["audio", "classification", "speech_disorder"]
-    HF_MAPPING_URL = "https://https://huggingface.co/datasets/SAA-Lab/SLPHelmUltraSuite"
 
     def get_instruction(self, words: str) -> str:
         return f"""You are a highly experienced Speech-Language Pathologist (SLP). An audio recording will be provided, typically consisting of a speech prompt from a pathologist followed by a child's repetition. The prompt the child is trying to repeat is as follows: {words}. Based on your professional expertise: 1. Assess the child's speech in the recording for signs of typical development or potential speech-language disorder. 2. Conclude your analysis with one of the following labels only: 'typically_developing' or 'speech_disorder'. 3. Provide your response as a single letter without any additional explanation, commentary, or unnecessary text.""" # noqa: E501
@@ -74,14 +73,18 @@ class UltraSuiteClassificationScenario(Scenario):
         - A JSON file with annotations containing 'answer' field
         """
 
-        print(
-
+        print("Downloading SAA-Lab/SLPHelmManualLabels dataset...")
+        data_path = snapshot_download(
+            repo_id="SAA-Lab/SLPHelmManualLabels",
+            repo_type="dataset",
+            revision="38c2d7dab831acf8ccff0ca6f6463d6a8a0184ed",
+        )
 
         instances: List[Instance] = []
         split: str = TEST_SPLIT
 
         # Find all pairs of audio and JSON files
-        pairs = find_audio_json_pairs(
+        pairs = find_audio_json_pairs(data_path)
         print(f"Num pairs: {len(pairs)}")
 
         for audio_path, json_path in tqdm(pairs):
@@ -94,9 +97,14 @@ class UltraSuiteClassificationScenario(Scenario):
             words = annotation["transcription"]
             # Create references for each option
             references: List[Reference] = []
+            correct_label = 0
             for option in ["typically_developing", "speech_disorder"]:
                 reference = Reference(Output(text=option), tags=[CORRECT_TAG] if option == answer else [])
                 references.append(reference)
+                if option == answer:
+                    correct_label += 1
+            if correct_label == 0:
+                continue
 
             # Create the input with audio and instruction
             content = [
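The correct_label counter added here (and repeated in the disorder-breakdown and disorder-symptoms scenarios below) skips any instance whose annotation matches none of the candidate options, since such an instance would carry no reference tagged CORRECT_TAG. A standalone sketch of the guard, with an illustrative option list and plain tuples standing in for HELM's Reference objects:

OPTIONS = ["typically_developing", "speech_disorder"]  # illustrative option list

def build_references(answer):
    """Build (option, is_correct) pairs; None signals "skip this instance"."""
    references = []
    correct_label = 0
    for option in OPTIONS:
        references.append((option, option == answer))  # stand-in for Reference(..., tags=...)
        if option == answer:
            correct_label += 1
    if correct_label == 0:
        return None  # the scenario loop does `continue` in this case
    return references

assert build_references("speech_disorder") is not None
assert build_references("mislabeled") is None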
helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py
@@ -13,7 +13,7 @@ from helm.benchmark.scenarios.scenario import (
     Output,
 )
 from helm.common.media_object import MediaObject, MultimediaObject
-from
+from huggingface_hub import snapshot_download
 from .ultra_suite_classification_scenario import find_audio_json_pairs
 
 
@@ -27,7 +27,6 @@ class UltraSuiteDisorderBreakdownScenario(Scenario):
     name = "speech_disorder"
     description = "A scenario for evaluating and classifying specific types of speech disorders in children"
     tags = ["audio", "classification", "speech_disorder", "disorder_breakdown"]
-    HF_MAPPING_URL = "https://https://huggingface.co/datasets/SAA-Lab/SLPHelmManualLabels"
 
     def get_instruction(self, words: str) -> str:
         return f"""You are a highly experienced Speech-Language Pathologist (SLP). An audio recording will be provided, typically consisting of a speech prompt from a pathologist followed by a child's repetition. The prompt text the child is trying to repeat is as follows: {words}. Based on your professional expertise: 1. Assess the child's speech in the recording for signs of typical development or potential speech-language disorder. 2. Conclude your analysis with one of the following labels only: A - 'typically developing' (child's speech patterns and development are within normal age-appropriate ranges), B - 'articulation' (difficulty producing specific speech sounds correctly, such as substituting, omitting, or distorting sounds), C - 'phonological' (difficulty understanding and using the sound system of language, affecting sounds of a particular type). 3. Provide your response as a single letter without any additional explanation, commentary, or unnecessary text.""" # noqa: E501
@@ -39,14 +38,18 @@ class UltraSuiteDisorderBreakdownScenario(Scenario):
         - Audio files (e.g., .mp3)
         - A JSON file with annotations containing 'disorder_class' field
         """
-        print(
-
+        print("Downloading SAA-Lab/SLPHelmManualLabels dataset...")
+        data_path = snapshot_download(
+            repo_id="SAA-Lab/SLPHelmManualLabels",
+            repo_type="dataset",
+            revision="38c2d7dab831acf8ccff0ca6f6463d6a8a0184ed",
+        )
 
         instances: List[Instance] = []
         split: str = TEST_SPLIT
 
         # Find all pairs of audio and JSON files
-        pairs = find_audio_json_pairs(
+        pairs = find_audio_json_pairs(data_path)
         print(f"Num pairs: {len(pairs)}")
 
         for audio_path, json_path in tqdm(pairs):
@@ -62,9 +65,14 @@ class UltraSuiteDisorderBreakdownScenario(Scenario):
 
             # Create references for each option
             references: List[Reference] = []
+            correct_label = 0
             for option in ["typically_developing", "articulation", "phonological"]:
                 reference = Reference(Output(text=option), tags=[CORRECT_TAG] if option == label else [])
                 references.append(reference)
+                if option == label:
+                    correct_label += 1
+            if correct_label == 0:
+                continue
 
             # Create the input with audio and instruction
             content = [
helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py
@@ -14,7 +14,7 @@ from helm.benchmark.scenarios.scenario import (
     Output,
 )
 from helm.common.media_object import MediaObject, MultimediaObject
-from
+from huggingface_hub import snapshot_download
 
 
 def find_audio_json_pairs(directory: str) -> List[Tuple[str, str]]:
@@ -57,7 +57,6 @@ class UltraSuiteDisorderSymptomsScenario(Scenario):
     name = "speech_disorder"
     description = "A scenario for evaluating speech disorders in children"
     tags = ["audio", "classification", "speech_disorder"]
-    HF_MAPPING_URL = "https://https://huggingface.co/datasets/SAA-Lab/SLPHelmManualLabels"
 
     def get_instruction(self, words: str) -> str:
         prompt = f"""You are a highly experienced Speech-Language Pathologist (SLP). An audio recording will be provided, typically consisting of a speech prompt from a pathologist followed by a child's repetition. The prompt the child is trying to repeat is as follows: {words}. Based on your professional expertise: 1. Assess the child's speech in the recording and recognize any abnormal features in the child's speech. 2. These features can be one of the following: A - 'substitution', B - 'omission', C - 'addition', D - 'typically_developing', or E - 'stuttering'. Here, 'substitution' is when the child substitutes one word/phrase/syllable for another. 'omission' is when the child omits one word/phrase/syllable. 'addition' is when the child adds one word/phrase/syllable. 'typically_developing' is when the child's speech is typical of a child of their age. 'stuttering' is when the child stutters, has difficulty speaking, repeats sounds/words or prolongs sounds/words. 3. Provide your response as a single letter without any additional explanation, commentary, or unnecessary text.""" # noqa: E501
@@ -71,14 +70,18 @@ class UltraSuiteDisorderSymptomsScenario(Scenario):
         - Audio files (e.g., .mp3)
         - A JSON file with annotations containing 'answer' field
         """
-        print(
-
+        print("Downloading SAA-Lab/SLPHelmManualLabels dataset...")
+        data_path = snapshot_download(
+            repo_id="SAA-Lab/SLPHelmManualLabels",
+            repo_type="dataset",
+            revision="38c2d7dab831acf8ccff0ca6f6463d6a8a0184ed",
+        )
 
         instances: List[Instance] = []
         split: str = TEST_SPLIT
 
         # Find all pairs of audio and JSON files
-        pairs = find_audio_json_pairs(
+        pairs = find_audio_json_pairs(data_path)
 
         for audio_path, json_path in tqdm(pairs):
 
@@ -93,9 +96,14 @@ class UltraSuiteDisorderSymptomsScenario(Scenario):
             prompt = annotation["transcription"]
             # Create references for each option
             references: List[Reference] = []
+            correct_label = 0
             for option in ["substitution", "omission", "addition", "typically_developing", "stuttering"]:
                 reference = Reference(Output(text=option), tags=[CORRECT_TAG] if option == label else [])
                 references.append(reference)
+                if option == label:
+                    correct_label += 1
+            if correct_label == 0:
+                continue
 
             # Create the input with audio and instruction
             content = [
helm/benchmark/scenarios/bluex_scenario.py (new file)
@@ -0,0 +1,66 @@
+from typing import Any, List
+from pathlib import Path
+from datasets import load_dataset
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+
+class BLUEX_Scenario(Scenario):
+    """
+    The BLUEX dataset is a benchmark used for evaluating natural language processing models in Brazilian Portuguese.
+    It consists of multiple-choice questions taken from official entrance exams of Unicamp (Convest) and USP (Fuvest),
+    covering various high school subjects. The questions include both textual prompts and visual elements. This dataset
+    was developed to assess the performance of models on tasks involving comprehension and reasoning, with a specific
+    focus on texts and exams originally written in Portuguese.
+    """
+
+    name = "bluex"
+    description = "MQA benchmark with questions from Brazilian entrance exams"
+    tags = ["knowledge", "multiple_choice", "pt-br"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Download the raw data and read all the questions
+        dataset: Any
+        # Read all the instances
+        instances: List[Instance] = []
+        cache_dir = str(Path(output_path) / "data")
+
+        dataset = load_dataset("portuguese-benchmark-datasets/BLUEX", cache_dir=cache_dir)
+        for example in dataset["questions"]:
+            # This scenario disregards questions with associated images
+            if example["has_associated_images"]:
+                continue
+            question = example["question"]
+            choices = example["alternatives"]
+            answer = example["answer"]
+
+            answers_dict = {}
+            for alt in choices:
+                if ")" in alt:
+                    label, text = alt.split(")", 1)
+                    label = label.strip().upper()
+                    text = text.strip()
+                    answers_dict[label] = text
+
+            if answer not in answers_dict:
+                continue
+
+            correct_answer = answers_dict[answer]
+
+            def answer_to_reference(answer: str) -> Reference:
+                return Reference(Output(text=answer), tags=[CORRECT_TAG] if answer == correct_answer else [])
+
+            instance = Instance(
+                input=Input(text=question),
+                split=TEST_SPLIT,
+                references=[answer_to_reference(text) for text in answers_dict.values()],
+            )
+            instances.append(instance)
+        return instances
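The alternative-parsing loop in BLUEX_Scenario assumes each entry is formatted as "<letter>) <text>". A quick sketch with made-up alternatives showing what the loop keeps, normalizes, and drops:

# Made-up BLUEX-style alternatives: "<letter>) <text>"; the last has no ")".
choices = ["A) 12 m/s", "b) 15 m/s", "sem rotulo"]

answers_dict = {}
for alt in choices:
    if ")" in alt:
        label, text = alt.split(")", 1)
        answers_dict[label.strip().upper()] = text.strip()

# Labels are normalized to uppercase; entries without ")" are dropped.
assert answers_dict == {"A": "12 m/s", "B": "15 m/s"}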
helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py (new file)
@@ -0,0 +1,197 @@
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, Output, Reference, VALID_SPLIT, CORRECT_TAG
+import pandas as pd
+import requests
+
+
+class CodeInsightsCodeEfficiencyScenario(Scenario):
+    name = "codeinsights_code_efficiency"
+    description = "Evaluate runtime efficiency alignment between LLM-generated code and student code"
+    tags = ["codeinsights", "c++", "code_efficiency"]
+
+    def __init__(self, num_testcases: int = 1):
+        super().__init__()
+        self.num_testcases = num_testcases
+
+    def get_instances(self, output_path: str):
+        df = pd.read_csv("https://huggingface.co/datasets/Kazchoko/my_dataset/resolve/main/Scenario4_data.csv")
+
+        # Load test cases (unit tests)
+        test_cases = self._load_test_cases()
+
+        # Get available question IDs with test cases
+        available_question_ids = set()
+        if test_cases:
+            available_question_ids = set(test_cases.keys())
+            print(f"Loaded test cases for {len(available_question_ids)} questions")
+        else:
+            print("WARNING: No test cases loaded!")
+            return []
+
+        instances = []
+        skipped_no_tests = 0
+        skipped_insufficient_data = 0
+
+        for student_id, student_df in df.groupby("student_id"):
+            student_df = student_df.sort_values("timestamp")
+            if len(student_df) < 4:
+                skipped_insufficient_data += 1
+                continue
+
+            first = student_df.iloc[0]
+            second = student_df.iloc[1]
+            third = student_df.iloc[2]
+            target = student_df.iloc[3]
+
+            # Check if target question has test cases BEFORE processing
+            target_question_id = target.get("question_unittest_id", None)
+            if not target_question_id or str(target_question_id) not in available_question_ids:
+                skipped_no_tests += 1
+                print(f"SKIPPING Student {student_id}, Question {target_question_id}: No test cases available")
+                continue
+
+            # Get test cases for this question (we know they exist now)
+            question_test_cases = []
+            tc_parsing_success = True
+
+            for testcase_str in target["question_unittests"].split("Unittest")[1:]:
+                testcase_str = testcase_str[testcase_str.find(":") + 1 :]
+                input_idx = testcase_str.find("Input:")
+                std_in_idx = testcase_str.find("STD input:")
+                output_idx = testcase_str.find("Output:")
+                if input_idx == -1 or std_in_idx == -1 or output_idx == -1:
+                    tc_parsing_success = False
+                    break
+
+                testcase = {
+                    "input": testcase_str[input_idx + 6 : std_in_idx].strip(),
+                    "std_in": testcase_str[std_in_idx + 10 : output_idx].strip(),
+                    "output": testcase_str[output_idx + 7 :].strip(),
+                }
+                question_test_cases.append(testcase)
+
+            if not tc_parsing_success:
+                print(f"SKIPPING Student {student_id}, Question {target_question_id}: Empty test cases")
+                continue
+
+            if len(question_test_cases) < self.num_testcases:
+                # If not enough test cases, skip this question
+                continue
+            if self.num_testcases >= 0:
+                # If more than one test case is requested, only take the first ones
+                question_test_cases = question_test_cases[: self.num_testcases]
+
+            # Get student pass pattern for the target question
+            student_correctness_pattern = target.get("pass", None)
+            if student_correctness_pattern is not None:
+                main_part = int(student_correctness_pattern)
+                # Convert each character to an int
+                student_correctness_list = [int(ch) for ch in str(main_part)]
+            else:
+                student_correctness_list = []
+
+            print(f"\n=== ACCEPTED INSTANCE: Student {student_id}, Question {target_question_id} ===")
+            print(f"Test cases loaded: {len(question_test_cases)}")
+            print(f"Student correctness pattern: {student_correctness_list}")
+            print(f"Question name: {target.get('question_name', 'MISSING')}")
+
+            prompt = (
+                f"Week: {target['week']}\n"
+                f"Topic: {target['topic']}\n\n"
+                "Example 1:\n"
+                f"Question: {first['question_name']} — {first['question_text']}\n"
+                "Template:\n"
+                f"{first['question_template']}\n"
+                "Your Code:\n"
+                f"{first['response']}\n\n"
+                "Example 2:\n"
+                f"Question: {second['question_name']} — {second['question_text']}\n"
+                "Template:\n"
+                f"{second['question_template']}\n"
+                "Your Code:\n"
+                f"{second['response']}\n\n"
+                "Example 3:\n"
+                f"Question: {third['question_name']} — {third['question_text']}\n"
+                "Template:\n"
+                f"{third['question_template']}\n"
+                "Your Code:\n"
+                f"{third['response']}\n\n"
+                "Now, using that same student's coding style, attempt this:\n"
+                "Ensure that the code works perfectly, but its efficiency should be based on students' past examples.\n"
+                "If a student has a tendency to write correct but inefficient code, imitate the inefficiency "
+                "but if they write efficiently, write efficiently too.\n"
+                f"Question: {target['question_name']} — {target['question_text']}\n\n"
+                + (f"Unit Test Input: {question_test_cases}\n\n"
+                   if question_test_cases
+                   else "")
+                + "Template:\n"
+                f"{target['question_template']}\n\n"
+                "Provide ONLY your C++ implementation following the given template, where the answer will replace the {{ STUDENT_ANSWER }} block in the template. "
+                "DO NOT reproduce the template part as the generated code would be inserted to the template, "
+                "and make sure the code is compatible with the Unit Test Input. "
+                "int main() is always declared already so DO NOT produce that initialization on the code. "
+                "Ensure your code is correct, includes any class definition when needed, and handles all edge cases properly. "
+                "Return the code in C++ code block format, and nothing else, and produce only one set of code."
+            )
+
+            instances.append(
+                Instance(
+                    id=f"{student_id}_{target['question_unittest_id']}",
+                    input=Input(text=prompt),
+                    references=[Reference(output=Output(text=target["response"]), tags=[CORRECT_TAG])],
+                    extra_data={
+                        "question_template": target["question_template"],
+                        "test_cases": question_test_cases,
+                        "question_id": str(target_question_id),
+                        "question_name": target.get("question_name", ""),
+                        "student_id": str(student_id),
+                        "student_correctness_pattern": student_correctness_list,
+                    },
+                    split=VALID_SPLIT,
+                )
+            )
+
+        # Print summary statistics
+        print("\n=== INSTANCE CREATION SUMMARY ===")
+        print(f"Total instances created: {len(instances)}")
+        print(f"Skipped (insufficient data): {skipped_insufficient_data}")
+        print(f"Skipped (no test cases): {skipped_no_tests}")
+        print(f"Available test case question IDs: {len(available_question_ids)}")
+
+        if instances:
+            print("Sample created instances:")
+            for i, inst in enumerate(instances[:5]):
+                if inst.extra_data is None:
+                    test_count = 0
+                else:
+                    test_count = len(inst.extra_data.get("test_cases", []))
+                print(f"  {inst.id}: {test_count} test cases")
+
+        return instances
+
+    def _load_test_cases(self):
+        """
+        Load test cases from external source or return None if not available.
+        This method should be implemented based on where your test cases are stored.
+
+        Expected format:
+        {
+            "question_id": [
+                {
+                    "unittest": "test_id",
+                    "input": "test input code",
+                    "output": "expected output"
+                },
+                ...
+            ],
+            ...
+        }
+        """
+        try:
+            response = requests.get(
+                "https://huggingface.co/datasets/Kazchoko/my_dataset/resolve/main/test_cases_by_qid.json"
+            )
+            if response.status_code == 200:
+                return response.json()
+        except Exception as e:
+            print(f"Failed to load test cases from URL: {e}")
+        return {}
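The test-case parser in both CodeInsights scenarios splits the question_unittests column on the literal "Unittest" and then slices between the "Input:", "STD input:", and "Output:" markers. A sketch on a made-up column value (the real rows come from the Kazchoko/my_dataset CSVs):

# Hypothetical contents of the question_unittests column for one question.
raw = (
    "Unittest 1: Input: add(1, 2) STD input:  Output: 3"
    "Unittest 2: Input: add(5, 7) STD input:  Output: 12"
)

cases = []
for chunk in raw.split("Unittest")[1:]:
    chunk = chunk[chunk.find(":") + 1 :]  # drop the "N:" test number
    i, s, o = chunk.find("Input:"), chunk.find("STD input:"), chunk.find("Output:")
    cases.append({
        "input": chunk[i + 6 : s].strip(),
        "std_in": chunk[s + 10 : o].strip(),
        "output": chunk[o + 7 :].strip(),
    })

assert cases[0] == {"input": "add(1, 2)", "std_in": "", "output": "3"}
assert cases[1]["output"] == "12"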
helm/benchmark/scenarios/codeinsights_correct_code_scenario.py (new file)
@@ -0,0 +1,78 @@
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, VALID_SPLIT
+import pandas as pd
+
+
+class CodeInsightsCorrectCodeScenario(Scenario):
+    name = "codeinsights_correct_code"
+    description = "Generate correct response code for C++ programming questions"
+    tags = ["codeinsights", "c++", "correct_code"]
+
+    def __init__(self, num_testcases: int = 1):
+        super().__init__()
+        self.num_testcases = num_testcases
+
+    def get_instances(self, output_path: str):
+        df = pd.read_csv("https://huggingface.co/datasets/Kazchoko/my_dataset/resolve/main/Scenario1_2_data.csv")
+
+        # Load test cases (unit tests)
+        instances = []
+        for question_id, question_df in df.groupby("question_unittest_id"):
+            target = question_df.iloc[0]
+            question_test_cases = []
+            tc_parsing_success = True
+
+            for testcase_str in target["question_unittests"].split("Unittest")[1:]:
+                testcase_str = testcase_str[testcase_str.find(":") + 1 :]
+                input_idx = testcase_str.find("Input:")
+                std_in_idx = testcase_str.find("STD input:")
+                output_idx = testcase_str.find("Output:")
+                if input_idx == -1 or std_in_idx == -1 or output_idx == -1:
+                    tc_parsing_success = False
+                    break
+
+                testcase = {
+                    "input": testcase_str[input_idx + 6 : std_in_idx].strip(),
+                    "std_in": testcase_str[std_in_idx + 10 : output_idx].strip(),
+                    "output": testcase_str[output_idx + 7 :].strip(),
+                }
+                question_test_cases.append(testcase)
+
+            if not tc_parsing_success:
+                continue
+
+            if len(question_test_cases) < self.num_testcases:
+                # If not enough test cases, skip this question
+                continue
+            if self.num_testcases >= 0:
+                # If more than one test case is requested, only take the first ones
+                question_test_cases = question_test_cases[: self.num_testcases]
+
+            prompt = (
+                f"Question: {target['question_name']} — {target['question_text']}\n\n"
+                + (f"Unit Test Input: {question_test_cases}\n\n"
+                   if question_test_cases
+                   else "")
+                + "Template:\n"
+                f"{target['question_template']}\n\n"
+                "Provide ONLY your C++ implementation following the given template, where the answer will replace the {{ STUDENT_ANSWER }} block in the template. "
+                "DO NOT reproduce the template part as the generated code would be inserted to the template, "
+                "and make sure the code is compatible with the Unit Test Input. "
+                "int main() is always declared already so DO NOT produce that initialization on the code. "
+                "Ensure your code is correct, efficient, includes any class definition when needed, and handles all edge cases properly. "
+                "Return the code in C++ code block format, and nothing else."
+            )
+            instances.append(
+                Instance(
+                    id=f"{question_id}",
+                    input=Input(text=prompt),
+                    references=[],
+                    extra_data={
+                        "question_template": target["question_template"],
+                        "test_cases": question_test_cases,
+                        "question_id": str(question_id) if question_id else None,
+                        "question_name": target.get("question_name", ""),
+                    },
+                    split=VALID_SPLIT,
+                )
+            )
+        return instances
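One subtlety both CodeInsights prompt builders depend on: inside Python's implicit string concatenation, a bare conditional expression binds every preceding literal as its if-arm and every following literal as its else-arm, which is why the optional "Unit Test Input" segment above is parenthesized and joined with +. A minimal demonstration:

cond = True

# Bare conditional: "A" "B" collapses into the if-arm, "C" into the else-arm.
s = (
    "A"
    "B" if cond else ""
    "C"
)
assert s == "AB"  # "C" silently disappears when cond is True

# Parenthesizing the optional piece keeps the surrounding parts unconditional.
t = "A" + ("B" if cond else "") + "C"
assert t == "ABC"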