crfm-helm 0.5.8__py3-none-any.whl → 0.5.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic.
- {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/METADATA +3 -1
- {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/RECORD +117 -115
- helm/benchmark/adaptation/adapter_spec.py +5 -0
- helm/benchmark/metrics/bbq_metrics.py +12 -0
- helm/benchmark/metrics/evaluate_reference_metrics.py +12 -0
- helm/benchmark/metrics/safety_metrics.py +13 -1
- helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
- helm/benchmark/presentation/run_display.py +13 -3
- helm/benchmark/presentation/run_entry.py +2 -2
- helm/benchmark/run.py +1 -1
- helm/benchmark/run_specs/arabic_run_specs.py +6 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +2 -2
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +6 -2
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +24 -54
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +19 -48
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -61
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -29
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -60
- helm/benchmark/scenarios/banking77_scenario.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
- helm/benchmark/scenarios/commonsense_scenario.py +7 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
- helm/benchmark/scenarios/financebench_scenario.py +21 -0
- helm/benchmark/scenarios/gsm_scenario.py +9 -3
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
- helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +21 -0
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
- helm/benchmark/scenarios/legalbench_scenario.py +6 -7
- helm/benchmark/scenarios/math_scenario.py +11 -4
- helm/benchmark/scenarios/med_qa_scenario.py +7 -1
- helm/benchmark/scenarios/medi_qa_scenario.py +2 -2
- helm/benchmark/scenarios/mmlu_scenario.py +8 -2
- helm/benchmark/scenarios/narrativeqa_scenario.py +3 -4
- helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
- helm/benchmark/scenarios/spider_scenario.py +18 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +9 -2
- helm/benchmark/static/schema_long_context.yaml +12 -31
- helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
- helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
- helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
- helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
- helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
- helm/benchmark/static_build/index.html +5 -6
- helm/clients/ai21_client.py +2 -0
- helm/clients/aleph_alpha_client.py +2 -0
- helm/clients/anthropic_client.py +7 -1
- helm/clients/audio_language/diva_llama_client.py +2 -0
- helm/clients/audio_language/llama_omni_client.py +2 -1
- helm/clients/audio_language/qwen2_5_omni_client.py +2 -1
- helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
- helm/clients/audio_language/qwen_audiolm_client.py +2 -1
- helm/clients/bedrock_client.py +2 -0
- helm/clients/cohere_client.py +3 -0
- helm/clients/google_client.py +2 -0
- helm/clients/http_model_client.py +2 -0
- helm/clients/huggingface_client.py +2 -1
- helm/clients/ibm_client.py +3 -1
- helm/clients/image_generation/adobe_vision_client.py +2 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
- helm/clients/image_generation/cogview2_client.py +2 -1
- helm/clients/image_generation/dalle2_client.py +2 -0
- helm/clients/image_generation/dalle_mini_client.py +2 -1
- helm/clients/image_generation/deep_floyd_client.py +2 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
- helm/clients/image_generation/lexica_client.py +2 -0
- helm/clients/image_generation/mindalle_client.py +2 -1
- helm/clients/image_generation/together_image_generation_client.py +2 -0
- helm/clients/megatron_client.py +2 -0
- helm/clients/mistral_client.py +2 -0
- helm/clients/moderation_api_client.py +2 -0
- helm/clients/openai_client.py +5 -1
- helm/clients/palmyra_client.py +2 -1
- helm/clients/reka_client.py +2 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
- helm/clients/stanfordhealthcare_http_model_client.py +2 -0
- helm/clients/together_client.py +4 -0
- helm/clients/vertexai_client.py +4 -0
- helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
- helm/clients/vision_language/huggingface_vlm_client.py +2 -0
- helm/clients/vision_language/idefics_client.py +2 -1
- helm/clients/vision_language/open_flamingo_client.py +2 -1
- helm/clients/vision_language/paligemma_client.py +2 -1
- helm/clients/vision_language/palmyra_vision_client.py +2 -0
- helm/clients/vision_language/qwen2_vlm_client.py +2 -1
- helm/clients/vision_language/qwen_vlm_client.py +2 -1
- helm/clients/writer_client.py +2 -0
- helm/common/hierarchical_logger.py +20 -0
- helm/common/optional_dependencies.py +1 -1
- helm/common/test_general.py +4 -0
- helm/config/model_deployments.yaml +225 -0
- helm/config/model_metadata.yaml +232 -7
- helm/config/tokenizer_configs.yaml +74 -4
- helm/benchmark/static_build/assets/index-671a5e06.js +0 -10
- helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
- helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
- helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
- {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/top_level.txt +0 -0
- /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
- /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
- /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
- /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
- /helm/benchmark/static_build/assets/{index-9352595e.css → index-oIeiQW2g.css} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
- /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
- /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
helm/benchmark/adaptation/adapter_spec.py
CHANGED
@@ -144,3 +144,8 @@ class AdapterSpec:
     # Set hash=False to make `AdapterSpec` hashable
     eval_splits: Optional[List[str]] = field(default=None, hash=False)
     """The splits from which evaluation instances will be drawn."""
+
+    output_mapping_pattern: Optional[str] = None
+    """Pattern to apply to the output before applying the output mapping for the joint multiple choice adapter.
+    If the pattern has no group, the output mapping will be applied to the first match.
+    If the pattern has a group, the output mapping will be applied to the group of the first match."""
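
The two cases in the docstring are easiest to see with plain re. A minimal sketch of the intended semantics (the patterns and strings below are illustrative, not taken from the package):

import re

# No group: the whole first match is used for the mapping lookup.
m = re.search(r"[A-D]", "the answer is B")
print(m.group(0))  # "B"

# With a group: the group of the first match is used instead.
m = re.search(r"answer is ([A-D])", "the answer is B, probably")
print(m.group(1))  # "B"
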
helm/benchmark/metrics/bbq_metrics.py
CHANGED
@@ -1,6 +1,7 @@
 from typing import List
 from helm.benchmark.metrics.evaluate_instances_metric import EvaluateInstancesMetric
 
+from helm.benchmark.metrics.metric import MetricMetadata
 from helm.common.request import RequestResult
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.metrics.metric_name import MetricName
@@ -145,3 +146,14 @@ class BBQMetric(EvaluateInstancesMetric):
         stats = [acc, amb_bias_stat, disamb_bias_stat]
 
         return stats
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="bbq_accuracy",
+                display_name="BBQ accuracy",
+                description="BBQ accuracy",
+                lower_is_better=False,
+                group=None,
+            ),
+        ]
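
Judging only from the keyword arguments used here and in safety_metrics.py below, MetricMetadata is a plain record describing how a metric should be reported. A hypothetical stand-in for orientation; the real class lives in helm.benchmark.metrics.metric:

from dataclasses import dataclass
from typing import Optional

@dataclass(frozen=True)
class MetricMetadataSketch:  # hypothetical stand-in, not the real class
    name: str                    # machine-readable identifier, e.g. "bbq_accuracy"
    display_name: str            # human-readable name shown in result tables
    description: str             # longer description for the UI
    lower_is_better: bool        # False for accuracy-style metrics
    group: Optional[str] = None  # schema group; None leaves the metric ungrouped
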
helm/benchmark/metrics/evaluate_reference_metrics.py
CHANGED
@@ -397,6 +397,16 @@ def code_eval(gold: Tuple[str, Optional[Dict]], pred: str) -> float:
     return float(code_metrics_helper.check_correctness(gold[1], pred, 3.0)["passed"])  # type: ignore
 
 
+def _apply_output_mapping_pattern(pattern: str, prediction: str) -> str:
+    match = re.search(pattern, prediction)
+    if not match:
+        return ""
+    elif match.groups():
+        return match.group(0)
+    else:
+        return match.string
+
+
 # TODO This should probably be made into an implementation of MetricInterface. For now it lives here
 # just to separate it from basic_metrics.py.
 def compute_reference_metrics(
@@ -498,6 +508,8 @@ def compute_reference_metrics(
     # Note: If 'A' and 'B' were the only possible choices, smaller language models like GPT-2 would
     # sometimes predict a random letter like 'M'.
     if request_state.output_mapping is not None:
+        if adapter_spec.output_mapping_pattern:
+            preds = [_apply_output_mapping_pattern(adapter_spec.output_mapping_pattern, pred) for pred in preds]
         preds = [request_state.output_mapping.get(pred) for pred in preds]  # type: ignore
 
     # Compute max_prob, the probability that the model assigns to its generated text.
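
To see what this feeds into output_mapping.get, the helper can be exercised directly. The function body below is copied verbatim from the hunk above; the example predictions are made up:

import re

def _apply_output_mapping_pattern(pattern: str, prediction: str) -> str:
    match = re.search(pattern, prediction)
    if not match:
        return ""
    elif match.groups():
        return match.group(0)
    else:
        return match.string

print(_apply_output_mapping_pattern(r"([A-D])", "the answer is B"))  # "B": grouped pattern, matched text is used
print(_apply_output_mapping_pattern(r"([A-D])", "no idea"))          # "": no match yields an empty string
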
helm/benchmark/metrics/safety_metrics.py
CHANGED
@@ -5,7 +5,7 @@ from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.metrics.basic_metrics import compute_request_state_metrics
 from helm.benchmark.metrics.efficiency_metrics import EfficiencyMetric
-from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric import Metric, MetricMetadata
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
@@ -77,3 +77,15 @@ class SafetyScoreMetric(Metric):
             raise SafetyScoreMetricException("Could not compute safety score because all annotators failed.")
         stats.append(safety_score_stat)
         return stats
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="safety_score",
+                display_name="LM Evaluated Safety score",
+                description="LM Evaluated Safety score (mean of Llama3.1 and GPT4o ratings) representing whether the "
+                "output constitutes a successful refusal of unsafe actions (0 is worst, 1 is best).",
+                lower_is_better=False,
+                group=None,
+            ),
+        ]
helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py
ADDED
@@ -0,0 +1,52 @@
+from typing import List
+
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.statistic import Stat
+from helm.benchmark.metrics.evaluate_reference_metrics import normalize_text
+from helm.benchmark.metrics.evaluate_instances_metric import EvaluateInstancesMetric
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+)
+from sklearn.metrics import f1_score, accuracy_score
+
+
+class UltraSuiteASRMetric(EvaluateInstancesMetric):
+    """Score metrics for UltraSuite ASR."""
+
+    def evaluate_instances(self, request_states: List[RequestState], eval_cache_path: str) -> List[Stat]:
+        y_pred: List[str] = []
+        y_pred_quasi: List[str] = []
+        y_true: List[str] = []
+        for request_state in request_states:  # one request state per instance
+
+            for reference in request_state.instance.references:
+                if reference.tags == [CORRECT_TAG]:
+                    true_label = reference.output.text
+                    break
+
+            assert request_state.result
+            model_output_text = request_state.result.completions[0].text.strip().lower()
+            assert request_state.instance.extra_data
+            ground_truth_text = request_state.instance.extra_data["transcription"].strip().lower()
+
+            if model_output_text == ground_truth_text:
+                predicted_label = "typically_developing"
+            else:
+                predicted_label = "speech_disorder"
+
+            if normalize_text(predicted_label) == normalize_text(true_label):
+                quasi_label = "typically_developing"
+            else:
+                quasi_label = "speech_disorder"
+
+            y_true.append(true_label)
+            y_pred.append(predicted_label)
+            y_pred_quasi.append(quasi_label)
+
+        return [
+            Stat(MetricName("classification_macro_f1")).add(f1_score(y_pred=y_pred, y_true=y_true, average="macro")),
+            Stat(MetricName("classification_micro_f1")).add(f1_score(y_pred=y_pred, y_true=y_true, average="micro")),
+            Stat(MetricName("exact_match")).add(accuracy_score(y_pred=y_pred, y_true=y_true)),
+            Stat(MetricName("quasi_exact_match")).add(accuracy_score(y_pred=y_pred_quasi, y_true=y_true)),
+        ]
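
Note that the metric never asks the model for a label: it derives one from whether the child's utterance was transcribed verbatim, then scores the derived labels like an ordinary binary classifier. A worked example of that decision rule on made-up data:

from sklearn.metrics import accuracy_score, f1_score

# (model transcription, ground-truth transcription, true label) -- illustrative rows
rows = [
    ("thank you", "thank you", "typically_developing"),  # exact match -> predicted typically_developing
    ("tank oo", "thank you", "speech_disorder"),         # mismatch -> predicted speech_disorder
    ("window", "window", "speech_disorder"),             # exact match despite a true disorder -> a miss
]
y_true = [label for _, _, label in rows]
y_pred = [
    "typically_developing" if hyp.strip().lower() == ref.strip().lower() else "speech_disorder"
    for hyp, ref, _ in rows
]
print(accuracy_score(y_true, y_pred))             # 2 of 3 correct: ~0.667
print(f1_score(y_true, y_pred, average="macro"))  # ~0.667 (per-class F1 is 2/3 for both classes)
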
helm/benchmark/presentation/run_display.py
CHANGED
@@ -1,6 +1,7 @@
 from collections import OrderedDict, defaultdict
 from dataclasses import dataclass
 import os
+import re
 from typing import Dict, Iterable, List, Optional, Set, Tuple, Any
 
 from helm.benchmark.adaptation.adapter_spec import (
@@ -262,9 +263,18 @@ def write_run_display_json(run_path: str, run_spec: RunSpec, schema: Schema, ski
             if request_state.result is not None and request_state.result.completions
             else ""
         )
-        mapped_output =
-
-
+        mapped_output: Optional[str] = None
+        if request_state.output_mapping is not None:
+            output_to_map = predicted_text.strip()
+            if run_spec.adapter_spec.output_mapping_pattern:
+                match = re.search(run_spec.adapter_spec.output_mapping_pattern, output_to_map)
+                if not match:
+                    output_to_map = ""
+                elif match.groups():
+                    output_to_map = match.group(0)
+                else:
+                    output_to_map = match.string
+            mapped_output = request_state.output_mapping.get(output_to_map)
         instance_id_to_instance[(request_state.instance.id, request_state.instance.perturbation)] = (
             request_state.instance
         )
helm/benchmark/presentation/run_entry.py
CHANGED
@@ -14,10 +14,10 @@ class RunEntry:
     description: str
 
     # Priority for this run spec (1 is highest priority, 5 is lowest priority)
-    priority: int
+    priority: Optional[int] = None
 
     # Additional groups to add to the run spec
-    groups: Optional[List[str]]
+    groups: Optional[List[str]] = None
 
 
 @dataclass(frozen=True)
helm/benchmark/run.py
CHANGED
@@ -37,7 +37,7 @@ def run_entries_to_run_specs(
     run_specs: List[RunSpec] = []
     for entry in run_entries:
        # Filter by priority
-        if priority is not None and entry.priority > priority:
+        if priority is not None and entry.priority is not None and entry.priority > priority:
            continue
 
        for run_spec in construct_run_specs(parse_object_spec(entry.description)):
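
Together with the run_entry.py change above, this makes priority optional end to end: now that priority defaults to None, the extra check keeps a None priority from being compared against an int, so entries without a priority are simply always kept. A condensed illustration of the updated condition (inputs are made up):

from typing import Optional

def keep(entry_priority: Optional[int], max_priority: Optional[int]) -> bool:
    # mirrors the new filter: skip only when both priorities are set and the entry's is lower
    return not (max_priority is not None and entry_priority is not None and entry_priority > max_priority)

print(keep(2, 1))     # False: a priority-2 entry is filtered when running at priority 1
print(keep(1, 1))     # True
print(keep(None, 1))  # True: entries without a priority are always kept
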
helm/benchmark/run_specs/arabic_run_specs.py
CHANGED
@@ -12,6 +12,7 @@ from helm.benchmark.scenarios.scenario import ScenarioSpec
 
 
 _ARABIC_REFERENCE_PREFIX_CHARACTERS = ["أ", "ب", "ج", "د", "هـ"]
+_ARABIC_OUTPUT_MAPPING_PATTERN = "(أ|ب|ج|د|هـ)"
 
 
 @run_spec_function("arabic_mmlu")
@@ -29,6 +30,7 @@ def get_arabic_mmlu_spec(subset: str) -> RunSpec:
         output_noun="الإجابة",
         max_tokens=100,
         reference_prefix_characters=_ARABIC_REFERENCE_PREFIX_CHARACTERS,
+        output_mapping_pattern=_ARABIC_OUTPUT_MAPPING_PATTERN,
     )
 
     return RunSpec(
@@ -54,6 +56,7 @@ def get_alghafa_spec(subset: str) -> RunSpec:
         output_noun="الإجابة",
         max_tokens=100,
         reference_prefix_characters=_ARABIC_REFERENCE_PREFIX_CHARACTERS,
+        output_mapping_pattern=_ARABIC_OUTPUT_MAPPING_PATTERN,
     )
 
     return RunSpec(
@@ -130,6 +133,7 @@ def get_madinah_qa_spec(subset: str) -> RunSpec:
         output_noun="الإجابة",
         max_tokens=100,
         reference_prefix_characters=_ARABIC_REFERENCE_PREFIX_CHARACTERS,
+        output_mapping_pattern=_ARABIC_OUTPUT_MAPPING_PATTERN,
     )
 
     return RunSpec(
@@ -155,6 +159,7 @@ def get_arabic_mmmlu_spec(subject: str) -> RunSpec:
         output_noun="الإجابة",
         max_tokens=100,
         reference_prefix_characters=_ARABIC_REFERENCE_PREFIX_CHARACTERS,
+        output_mapping_pattern=_ARABIC_OUTPUT_MAPPING_PATTERN,
     )
 
     return RunSpec(
@@ -180,6 +185,7 @@ def get_arabic_exams_spec(subject: str) -> RunSpec:
         output_noun="الإجابة",
         max_tokens=100,
         reference_prefix_characters=_ARABIC_REFERENCE_PREFIX_CHARACTERS,
+        output_mapping_pattern=_ARABIC_OUTPUT_MAPPING_PATTERN,
     )
 
     return RunSpec(
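
All five Arabic run specs now share one grouped pattern, so a verbose completion is reduced to its bare choice letter before the output-mapping lookup. A quick regex-only illustration (the completion string is made up):

import re

_ARABIC_OUTPUT_MAPPING_PATTERN = "(أ|ب|ج|د|هـ)"

completion = "ب. لأن هذا هو الجواب الصحيح"  # illustrative verbose model output
match = re.search(_ARABIC_OUTPUT_MAPPING_PATTERN, completion)
print(match.group(0))  # "ب" -- the bare letter that the output mapping then resolves to a reference
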
helm/benchmark/run_specs/medhelm_run_specs.py
CHANGED
@@ -1527,7 +1527,7 @@ def get_shc_ent_spec(data_path: str) -> RunSpec:
 @run_spec_function("shc_privacy_med")
 def get_shc_privacy_spec(data_path: str) -> RunSpec:
     scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.
+        class_name="helm.benchmark.scenarios.shc_privacy_scenario.SHCPRIVACYMedScenario",
         args={"data_path": data_path},
     )
 
@@ -1550,7 +1550,7 @@ def get_shc_privacy_spec(data_path: str) -> RunSpec:
 @run_spec_function("shc_proxy_med")
 def get_shc_proxy_spec(data_path: str) -> RunSpec:
     scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.
+        class_name="helm.benchmark.scenarios.shc_proxy_scenario.SHCPROXYMedScenario",
         args={"data_path": data_path},
     )
 
helm/benchmark/run_specs/speech_disorder_audio_run_specs.py
CHANGED
@@ -112,9 +112,13 @@ def get_ultra_suite_asr_classification_run_spec() -> RunSpec:
     )
     adapter_spec = _get_generation_adapter_spec(
         instructions="""You are a highly experienced Speech-Language Pathologist (SLP). An audio recording is provided to you, typically consisting of a speech prompt from a pathologist followed by a child's repetition. Based on your expertise transcribe the child's speech into text. Do not make any assumptions about the words the child is expected to say. Only transcribe based on the words that the child actually says. Only respond with the text transcription, no other text or commentary.""",  # noqa: E501
-        max_tokens=
+        max_tokens=50,
     )
-    metric_specs: List[MetricSpec] =
+    metric_specs: List[MetricSpec] = [
+        MetricSpec(
+            class_name="helm.benchmark.metrics.ultra_suite_asr_classification_metrics.UltraSuiteASRMetric", args={}
+        )
+    ]
     run_spec_name: str = "ultra_suite_asr_classification"
     return RunSpec(
         name=run_spec_name,
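
MetricSpec points at the new metric class by its dotted path, and args={} means it is constructed with no extra keyword arguments. The resolution step is ordinary dynamic import; a generic sketch of that kind of lookup, not HELM's actual loader:

import importlib

def resolve(class_name: str):
    # split "package.module.ClassName" into a module path and an attribute
    module_name, _, attr = class_name.rpartition(".")
    return getattr(importlib.import_module(module_name), attr)

metric_cls = resolve("helm.benchmark.metrics.ultra_suite_asr_classification_metrics.UltraSuiteASRMetric")
metric = metric_cls()  # args={} in the spec: no constructor arguments
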
helm/benchmark/scenarios/anthropic_red_team_scenario.py
CHANGED
@@ -2,7 +2,8 @@ import re
 from typing import List, Any, Dict
 from datasets import load_dataset
 
-from helm.benchmark.
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TRAIN_SPLIT, TEST_SPLIT, ScenarioMetadata
 
 
 class AnthropicRedTeamScenario(Scenario):
@@ -69,3 +70,13 @@ class AnthropicRedTeamScenario(Scenario):
             )
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="anthropic_red_team",
+            display_name="Anthropic Red Team",
+            description="Anthropic Red Team",
+            taxonomy=TaxonomyInfo(task="instruction following sfaety", what="?", when="?", who="?", language="English"),
+            main_metric="safety_score",
+            main_split="test",
+        )
helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py
CHANGED
@@ -1,7 +1,7 @@
-from typing import List
+from typing import List
 import os
-import json
 
+from datasets import load_dataset
 from tqdm import tqdm
 
 from helm.benchmark.scenarios.scenario import (
@@ -14,38 +14,7 @@ from helm.benchmark.scenarios.scenario import (
     Output,
 )
 from helm.common.media_object import MediaObject, MultimediaObject
-from
-
-
-def find_audio_json_pairs(directory: str) -> List[Tuple[str, str]]:
-    """
-    Find all pairs of MP3 and JSON files in the given directory and its subdirectories.
-    Each pair consists of an MP3 file and its corresponding JSON file with the same base name.
-
-    Args:
-        directory: Path to the directory containing the files
-
-    Returns:
-        List of tuples where each tuple contains (mp3_path, json_path)
-    """
-    pairs = []
-
-    # Walk through all directories and subdirectories
-    for root, _, files in os.walk(directory):
-        # Get all MP3 files in current directory
-        mp3_files = [f for f in files if f.endswith(".mp3")]
-
-        for mp3_file in mp3_files:
-            base_name = os.path.splitext(mp3_file)[0]
-            json_file = f"{base_name}.json"
-
-            # Check if corresponding JSON file exists in the same directory
-            if json_file in files:
-                mp3_path = os.path.join(root, mp3_file)
-                json_path = os.path.join(root, json_file)
-                pairs.append((mp3_path, json_path))
-
-    return pairs
+from helm.common.audio_utils import ensure_audio_file_exists_from_array
 
 
 class UltraSuiteASRClassificationScenario(Scenario):
@@ -59,9 +28,6 @@ class UltraSuiteASRClassificationScenario(Scenario):
     description = "A scenario for evaluating speech disorders in children"
     tags = ["audio", "classification", "speech_disorder", "asr"]
 
-    # Classification options
-    options: List[str] = ["Healthy", "Unhealthy"]
-
     def get_instances(self, output_path: str) -> List[Instance]:
         """
         Create instances from the audio files and their corresponding JSON annotations.
@@ -69,36 +35,40 @@ class UltraSuiteASRClassificationScenario(Scenario):
         - Audio files (e.g., .mp3)
         - A JSON file with annotations containing 'answer' field
         """
-
-
-
-
-
-        )
+
+        audio_save_dir = os.path.join(output_path, "audio_files")
+        os.makedirs(audio_save_dir, exist_ok=True)
+
+        print("Downloading SAA-Lab/SLPHelmUltraSuitePlus dataset...")
+        dataset = load_dataset("SAA-Lab/SLPHelmUltraSuitePlus")
 
         instances: List[Instance] = []
         split: str = TEST_SPLIT
 
-
-        pairs = find_audio_json_pairs(data_path)
+        for idx, row in enumerate(tqdm(dataset["train"])):
 
-
+            label = row["disorder_class"]
+            transcription = row["transcription"]
 
-
-
-
+            unique_id = str(idx)
+            local_audio_name = f"{label}_{unique_id}.mp3"
+            local_audio_path = os.path.join(audio_save_dir, local_audio_name)
+            ensure_audio_file_exists_from_array(local_audio_path, row["audio"]["array"], row["audio"]["sampling_rate"])
 
-            # Get the correct answer and convert to label
-            answer = annotation["disorder_class"]
             # Create references for each option
-            references: List[Reference] = [
+            references: List[Reference] = []
+            for option in ["typically_developing", "speech_disorder"]:
+                reference = Reference(Output(text=option), tags=[CORRECT_TAG] if option == label else [])
+                references.append(reference)
 
             # Create the input with audio and instruction
             content = [
-                MediaObject(content_type="audio/mpeg", location=
+                MediaObject(content_type="audio/mpeg", location=local_audio_path),
             ]
 
             input = Input(multimedia_content=MultimediaObject(content))
-            instances.append(
+            instances.append(
+                Instance(input=input, references=references, split=split, extra_data={"transcription": transcription})
+            )
 
         return instances
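
All three rewritten UltraSuite scenarios share the same materialization step: rows are streamed from the SAA-Lab/SLPHelmUltraSuitePlus dataset on the Hugging Face Hub, and each row's decoded audio column is written to a local mp3 whose name encodes the label and row index (ensure_audio_file_exists_from_array presumably skips files that already exist, as the name suggests). A condensed sketch of that loop, under the same assumptions about row structure that the scenarios make:

import os

from datasets import load_dataset
from helm.common.audio_utils import ensure_audio_file_exists_from_array

dataset = load_dataset("SAA-Lab/SLPHelmUltraSuitePlus")
audio_save_dir = "audio_files"  # the scenarios place this under their output_path
os.makedirs(audio_save_dir, exist_ok=True)

for idx, row in enumerate(dataset["train"]):
    # each row is assumed to carry a decoded audio dict with "array" and "sampling_rate"
    local_path = os.path.join(audio_save_dir, f"{row['disorder_class']}_{idx}.mp3")
    ensure_audio_file_exists_from_array(local_path, row["audio"]["array"], row["audio"]["sampling_rate"])
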
helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py
CHANGED
@@ -1,7 +1,7 @@
-from typing import List
+from typing import List
 import os
-import json
 
+from datasets import load_dataset
 from tqdm import tqdm
 
 from helm.benchmark.scenarios.scenario import (
@@ -14,38 +14,7 @@ from helm.benchmark.scenarios.scenario import (
     Output,
 )
 from helm.common.media_object import MediaObject, MultimediaObject
-from
-
-
-def find_audio_json_pairs(directory: str) -> List[Tuple[str, str]]:
-    """
-    Find all pairs of MP3 and JSON files in the given directory and its subdirectories.
-    Each pair consists of an MP3 file and its corresponding JSON file with the same base name.
-
-    Args:
-        directory: Path to the directory containing the files
-
-    Returns:
-        List of tuples where each tuple contains (mp3_path, json_path)
-    """
-    pairs = []
-
-    # Walk through all directories and subdirectories
-    for root, _, files in os.walk(directory):
-        # Get all MP3 files in current directory
-        mp3_files = [f for f in files if f.endswith(".mp3")]
-
-        for mp3_file in mp3_files:
-            base_name = os.path.splitext(mp3_file)[0]
-            json_file = f"{base_name}.json"
-
-            # Check if corresponding JSON file exists in the same directory
-            if json_file in files:
-                mp3_path = os.path.join(root, mp3_file)
-                json_path = os.path.join(root, json_file)
-                pairs.append((mp3_path, json_path))
-
-    return pairs
+from helm.common.audio_utils import ensure_audio_file_exists_from_array
 
 
 class UltraSuiteASRTranscriptionScenario(Scenario):
@@ -66,31 +35,33 @@ class UltraSuiteASRTranscriptionScenario(Scenario):
         - Audio files (e.g., .mp3)
         - A JSON file with annotations containing 'answer' field
         """
-
-
-
-
-
-        )
+        audio_save_dir = os.path.join(output_path, "audio_files")
+        os.makedirs(audio_save_dir, exist_ok=True)
+
+        print("Downloading SAA-Lab/SLPHelmUltraSuitePlus dataset...")
+        dataset = load_dataset("SAA-Lab/SLPHelmUltraSuitePlus")
 
         instances: List[Instance] = []
         split: str = TEST_SPLIT
 
         # Find all pairs of audio and JSON files
-
-
-        for audio_path, json_path in tqdm(pairs):
+        for idx, row in enumerate(tqdm(dataset["train"])):
 
             # Load the annotation
-
-
+            # Load the annotation
+            label = row["disorder_class"]
+
+            unique_id = str(idx)
+            local_audio_name = f"{label}_{unique_id}.mp3"
+            local_audio_path = os.path.join(audio_save_dir, local_audio_name)
+            ensure_audio_file_exists_from_array(local_audio_path, row["audio"]["array"], row["audio"]["sampling_rate"])
 
-            # Create references for
-            references: List[Reference] = [Reference(Output(text=
+            # Create references for each option
+            references: List[Reference] = [Reference(Output(text=row["transcription"]), tags=[CORRECT_TAG])]
 
             # Create the input with audio and instruction
             content = [
-                MediaObject(content_type="audio/mpeg", location=
+                MediaObject(content_type="audio/mpeg", location=local_audio_path),
             ]
 
             input = Input(multimedia_content=MultimediaObject(content))
helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py
CHANGED
@@ -1,7 +1,7 @@
-from typing import List
+from typing import List
 import os
-import json
 
+from datasets import load_dataset
 from tqdm import tqdm
 
 from helm.benchmark.scenarios.scenario import (
@@ -14,41 +14,7 @@ from helm.benchmark.scenarios.scenario import (
     Output,
 )
 from helm.common.media_object import MediaObject, MultimediaObject
-from
-
-
-def find_audio_json_pairs(directory: str) -> List[Tuple[str, str]]:
-    """
-    Find all pairs of MP3 and JSON files in the given directory and its subdirectories.
-    Each pair consists of an MP3 file and its corresponding JSON file with the same base name.
-
-    Args:
-        directory: Path to the directory containing the files
-
-    Returns:
-        List of tuples where each tuple contains (mp3_path, json_path)
-    """
-    pairs = []
-
-    # Walk through all directories and subdirectories
-    for root, _, files in os.walk(directory):
-        # Get all MP3 files in current directory
-        mp3_files = [f for f in files if f.endswith(".mp3")]
-
-        for mp3_file in mp3_files:
-            base_name = os.path.splitext(mp3_file)[0]
-            json_file = f"{base_name}.json"
-
-            # Check if corresponding JSON file exists in the same directory
-            if json_file in files:
-                mp3_path = os.path.join(root, mp3_file)
-                json_path = os.path.join(root, json_file)
-                pairs.append((mp3_path, json_path))
-
-    if len(pairs) == 0:
-        raise ValueError(f"No pairs of MP3 and JSON files found in {directory}")
-
-    return pairs
+from helm.common.audio_utils import ensure_audio_file_exists_from_array
 
 
 class UltraSuiteClassificationScenario(Scenario):
@@ -72,44 +38,39 @@ class UltraSuiteClassificationScenario(Scenario):
         - Audio files (e.g., .mp3)
         - A JSON file with annotations containing 'answer' field
         """
+        audio_save_dir = os.path.join(output_path, "audio_files")
+        os.makedirs(audio_save_dir, exist_ok=True)
 
-        print("Downloading SAA-Lab/
-
-            repo_id="SAA-Lab/SLPHelmManualLabels",
-            repo_type="dataset",
-            revision="38c2d7dab831acf8ccff0ca6f6463d6a8a0184ed",
-        )
+        print("Downloading SAA-Lab/SLPHelmUltraSuitePlus dataset...")
+        dataset = load_dataset("SAA-Lab/SLPHelmUltraSuitePlus")
 
         instances: List[Instance] = []
         split: str = TEST_SPLIT
 
-
-        pairs = find_audio_json_pairs(data_path)
-        print(f"Num pairs: {len(pairs)}")
+        for idx, row in enumerate(tqdm(dataset["train"])):
 
-        for audio_path, json_path in tqdm(pairs):
            # Load the annotation
-
-
+            label = row["disorder_class"]
+            transcription = row["transcription"]
+
+            unique_id = str(idx)
+            local_audio_name = f"{label}_{unique_id}.mp3"
+            local_audio_path = os.path.join(audio_save_dir, local_audio_name)
+            ensure_audio_file_exists_from_array(local_audio_path, row["audio"]["array"], row["audio"]["sampling_rate"])
 
-            # Get the correct answer and convert to label
-            answer = annotation["disorder_class"]
-            words = annotation["transcription"]
            # Create references for each option
            references: List[Reference] = []
-
-
-                reference = Reference(Output(text=option), tags=[CORRECT_TAG] if option == answer else [])
-                references.append(reference)
-                if option == answer:
-                    correct_label += 1
-            if correct_label == 0:
+            options = ["typically_developing", "speech_disorder"]
+            if label not in options:
                continue
+            for option in options:
+                reference = Reference(Output(text=option), tags=[CORRECT_TAG] if option == label else [])
+                references.append(reference)
 
            # Create the input with audio and instruction
            content = [
-                MediaObject(content_type="audio/mpeg", location=
-                MediaObject(content_type="text/plain", text=self.get_instruction(
+                MediaObject(content_type="audio/mpeg", location=local_audio_path),
+                MediaObject(content_type="text/plain", text=self.get_instruction(transcription)),
            ]
 
            input = Input(multimedia_content=MultimediaObject(content))