crfm-helm 0.5.8__py3-none-any.whl → 0.5.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of crfm-helm might be problematic.

Files changed (121)
  1. {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/METADATA +3 -1
  2. {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/RECORD +117 -115
  3. helm/benchmark/adaptation/adapter_spec.py +5 -0
  4. helm/benchmark/metrics/bbq_metrics.py +12 -0
  5. helm/benchmark/metrics/evaluate_reference_metrics.py +12 -0
  6. helm/benchmark/metrics/safety_metrics.py +13 -1
  7. helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
  8. helm/benchmark/presentation/run_display.py +13 -3
  9. helm/benchmark/presentation/run_entry.py +2 -2
  10. helm/benchmark/run.py +1 -1
  11. helm/benchmark/run_specs/arabic_run_specs.py +6 -0
  12. helm/benchmark/run_specs/medhelm_run_specs.py +2 -2
  13. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +6 -2
  14. helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
  15. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +24 -54
  16. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +19 -48
  17. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -61
  18. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -29
  19. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -60
  20. helm/benchmark/scenarios/banking77_scenario.py +21 -0
  21. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  22. helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
  23. helm/benchmark/scenarios/commonsense_scenario.py +7 -1
  24. helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
  25. helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
  26. helm/benchmark/scenarios/financebench_scenario.py +21 -0
  27. helm/benchmark/scenarios/gsm_scenario.py +9 -3
  28. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
  29. helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
  30. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +21 -0
  31. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
  32. helm/benchmark/scenarios/legalbench_scenario.py +6 -7
  33. helm/benchmark/scenarios/math_scenario.py +11 -4
  34. helm/benchmark/scenarios/med_qa_scenario.py +7 -1
  35. helm/benchmark/scenarios/medi_qa_scenario.py +2 -2
  36. helm/benchmark/scenarios/mmlu_scenario.py +8 -2
  37. helm/benchmark/scenarios/narrativeqa_scenario.py +3 -4
  38. helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
  39. helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
  40. helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
  41. helm/benchmark/scenarios/spider_scenario.py +18 -0
  42. helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
  43. helm/benchmark/scenarios/wmt_14_scenario.py +9 -2
  44. helm/benchmark/static/schema_long_context.yaml +12 -31
  45. helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
  46. helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
  47. helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
  48. helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
  49. helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
  50. helm/benchmark/static_build/index.html +5 -6
  51. helm/clients/ai21_client.py +2 -0
  52. helm/clients/aleph_alpha_client.py +2 -0
  53. helm/clients/anthropic_client.py +7 -1
  54. helm/clients/audio_language/diva_llama_client.py +2 -0
  55. helm/clients/audio_language/llama_omni_client.py +2 -1
  56. helm/clients/audio_language/qwen2_5_omni_client.py +2 -1
  57. helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
  58. helm/clients/audio_language/qwen_audiolm_client.py +2 -1
  59. helm/clients/bedrock_client.py +2 -0
  60. helm/clients/cohere_client.py +3 -0
  61. helm/clients/google_client.py +2 -0
  62. helm/clients/http_model_client.py +2 -0
  63. helm/clients/huggingface_client.py +2 -1
  64. helm/clients/ibm_client.py +3 -1
  65. helm/clients/image_generation/adobe_vision_client.py +2 -0
  66. helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
  67. helm/clients/image_generation/cogview2_client.py +2 -1
  68. helm/clients/image_generation/dalle2_client.py +2 -0
  69. helm/clients/image_generation/dalle_mini_client.py +2 -1
  70. helm/clients/image_generation/deep_floyd_client.py +2 -0
  71. helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
  72. helm/clients/image_generation/lexica_client.py +2 -0
  73. helm/clients/image_generation/mindalle_client.py +2 -1
  74. helm/clients/image_generation/together_image_generation_client.py +2 -0
  75. helm/clients/megatron_client.py +2 -0
  76. helm/clients/mistral_client.py +2 -0
  77. helm/clients/moderation_api_client.py +2 -0
  78. helm/clients/openai_client.py +5 -1
  79. helm/clients/palmyra_client.py +2 -1
  80. helm/clients/reka_client.py +2 -1
  81. helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
  82. helm/clients/stanfordhealthcare_http_model_client.py +2 -0
  83. helm/clients/together_client.py +4 -0
  84. helm/clients/vertexai_client.py +4 -0
  85. helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
  86. helm/clients/vision_language/huggingface_vlm_client.py +2 -0
  87. helm/clients/vision_language/idefics_client.py +2 -1
  88. helm/clients/vision_language/open_flamingo_client.py +2 -1
  89. helm/clients/vision_language/paligemma_client.py +2 -1
  90. helm/clients/vision_language/palmyra_vision_client.py +2 -0
  91. helm/clients/vision_language/qwen2_vlm_client.py +2 -1
  92. helm/clients/vision_language/qwen_vlm_client.py +2 -1
  93. helm/clients/writer_client.py +2 -0
  94. helm/common/hierarchical_logger.py +20 -0
  95. helm/common/optional_dependencies.py +1 -1
  96. helm/common/test_general.py +4 -0
  97. helm/config/model_deployments.yaml +225 -0
  98. helm/config/model_metadata.yaml +232 -7
  99. helm/config/tokenizer_configs.yaml +74 -4
  100. helm/benchmark/static_build/assets/index-671a5e06.js +0 -10
  101. helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
  102. helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
  103. helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
  104. {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/WHEEL +0 -0
  105. {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/entry_points.txt +0 -0
  106. {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/licenses/LICENSE +0 -0
  107. {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/top_level.txt +0 -0
  108. /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
  109. /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
  110. /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
  111. /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
  112. /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
  113. /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
  114. /helm/benchmark/static_build/assets/{index-9352595e.css → index-oIeiQW2g.css} +0 -0
  115. /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
  116. /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
  117. /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
  118. /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
  119. /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
  120. /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
  121. /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
helm/benchmark/adaptation/adapter_spec.py CHANGED
@@ -144,3 +144,8 @@ class AdapterSpec:
  # Set hash=False to make `AdapterSpec` hashable
  eval_splits: Optional[List[str]] = field(default=None, hash=False)
  """The splits from which evaluation instances will be drawn."""
+
+ output_mapping_pattern: Optional[str] = None
+ """Pattern to apply to the output before applying the output mapping for the joint multiple choice adapter.
+ If the pattern has no group, the output mapping will be applied to the first match.
+ If the pattern has a group, the output mapping will be applied to the group of the first match."""
helm/benchmark/metrics/bbq_metrics.py CHANGED
@@ -1,6 +1,7 @@
  from typing import List
  from helm.benchmark.metrics.evaluate_instances_metric import EvaluateInstancesMetric

+ from helm.benchmark.metrics.metric import MetricMetadata
  from helm.common.request import RequestResult
  from helm.benchmark.adaptation.request_state import RequestState
  from helm.benchmark.metrics.metric_name import MetricName
@@ -145,3 +146,14 @@ class BBQMetric(EvaluateInstancesMetric):
  stats = [acc, amb_bias_stat, disamb_bias_stat]

  return stats
+
+ def get_metadata(self) -> List[MetricMetadata]:
+ return [
+ MetricMetadata(
+ name="bbq_accuracy",
+ display_name="BBQ accuracy",
+ description="BBQ accuracy",
+ lower_is_better=False,
+ group=None,
+ ),
+ ]
helm/benchmark/metrics/evaluate_reference_metrics.py CHANGED
@@ -397,6 +397,16 @@ def code_eval(gold: Tuple[str, Optional[Dict]], pred: str) -> float:
  return float(code_metrics_helper.check_correctness(gold[1], pred, 3.0)["passed"]) # type: ignore


+ def _apply_output_mapping_pattern(pattern: str, prediction: str) -> str:
+ match = re.search(pattern, prediction)
+ if not match:
+ return ""
+ elif match.groups():
+ return match.group(0)
+ else:
+ return match.string
+
+
  # TODO This should probably be made into an implementation of MetricInterface. For now it lives here
  # just to separate it from basic_metrics.py.
  def compute_reference_metrics(
@@ -498,6 +508,8 @@ def compute_reference_metrics(
  # Note: If 'A' and 'B' were the only possible choices, smaller language models like GPT-2 would
  # sometimes predict a random letter like 'M'.
  if request_state.output_mapping is not None:
+ if adapter_spec.output_mapping_pattern:
+ preds = [_apply_output_mapping_pattern(adapter_spec.output_mapping_pattern, pred) for pred in preds]
  preds = [request_state.output_mapping.get(pred) for pred in preds] # type: ignore

  # Compute max_prob, the probability that the model assigns to its generated text.
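
To make the new adapter option concrete, here is a small standalone sketch (illustration only, not part of the package; the mapping values are invented) of how a raw prediction is narrowed by `output_mapping_pattern` before the `output_mapping` lookup, following the `_apply_output_mapping_pattern` helper added above:

    import re
    from typing import Dict, Optional

    ARABIC_PATTERN = "(أ|ب|ج|د|هـ)"  # the pattern the Arabic run specs pass in (see arabic_run_specs.py below)
    output_mapping: Dict[str, str] = {"ب": "some choice text"}  # invented mapping for illustration

    def narrow(pattern: str, prediction: str) -> str:
        # Same resolution as _apply_output_mapping_pattern: no match -> "",
        # a pattern with a group -> the text of the first match, otherwise the whole prediction.
        match = re.search(pattern, prediction)
        if not match:
            return ""
        if match.groups():
            return match.group(0)
        return match.string

    raw_prediction = "الإجابة: ب"
    mapped: Optional[str] = output_mapping.get(narrow(ARABIC_PATTERN, raw_prediction))
    print(mapped)  # "some choice text"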
helm/benchmark/metrics/safety_metrics.py CHANGED
@@ -5,7 +5,7 @@ from helm.benchmark.adaptation.adapter_spec import AdapterSpec
  from helm.benchmark.adaptation.request_state import RequestState
  from helm.benchmark.metrics.basic_metrics import compute_request_state_metrics
  from helm.benchmark.metrics.efficiency_metrics import EfficiencyMetric
- from helm.benchmark.metrics.metric import Metric
+ from helm.benchmark.metrics.metric import Metric, MetricMetadata
  from helm.benchmark.metrics.metric_name import MetricName
  from helm.benchmark.metrics.metric_service import MetricService
  from helm.benchmark.metrics.statistic import Stat
@@ -77,3 +77,15 @@ class SafetyScoreMetric(Metric):
  raise SafetyScoreMetricException("Could not compute safety score because all annotators failed.")
  stats.append(safety_score_stat)
  return stats
+
+ def get_metadata(self) -> List[MetricMetadata]:
+ return [
+ MetricMetadata(
+ name="safety_score",
+ display_name="LM Evaluated Safety score",
+ description="LM Evaluated Safety score (mean of Llama3.1 and GPT4o ratings) representing whether the "
+ "output constitutes a successful refusal of unsafe actions (0 is worst, 1 is best).",
+ lower_is_better=False,
+ group=None,
+ ),
+ ]
helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py ADDED
@@ -0,0 +1,52 @@
+ from typing import List
+
+ from helm.benchmark.adaptation.request_state import RequestState
+ from helm.benchmark.metrics.metric_name import MetricName
+ from helm.benchmark.metrics.statistic import Stat
+ from helm.benchmark.metrics.evaluate_reference_metrics import normalize_text
+ from helm.benchmark.metrics.evaluate_instances_metric import EvaluateInstancesMetric
+ from helm.benchmark.scenarios.scenario import (
+ CORRECT_TAG,
+ )
+ from sklearn.metrics import f1_score, accuracy_score
+
+
+ class UltraSuiteASRMetric(EvaluateInstancesMetric):
+ """Score metrics for UltraSuite ASR."""
+
+ def evaluate_instances(self, request_states: List[RequestState], eval_cache_path: str) -> List[Stat]:
+ y_pred: List[str] = []
+ y_pred_quasi: List[str] = []
+ y_true: List[str] = []
+ for request_state in request_states: # one request state per instance
+
+ for reference in request_state.instance.references:
+ if reference.tags == [CORRECT_TAG]:
+ true_label = reference.output.text
+ break
+
+ assert request_state.result
+ model_output_text = request_state.result.completions[0].text.strip().lower()
+ assert request_state.instance.extra_data
+ ground_truth_text = request_state.instance.extra_data["transcription"].strip().lower()
+
+ if model_output_text == ground_truth_text:
+ predicted_label = "typically_developing"
+ else:
+ predicted_label = "speech_disorder"
+
+ if normalize_text(predicted_label) == normalize_text(true_label):
+ quasi_label = "typically_developing"
+ else:
+ quasi_label = "speech_disorder"
+
+ y_true.append(true_label)
+ y_pred.append(predicted_label)
+ y_pred_quasi.append(quasi_label)
+
+ return [
+ Stat(MetricName("classification_macro_f1")).add(f1_score(y_pred=y_pred, y_true=y_true, average="macro")),
+ Stat(MetricName("classification_micro_f1")).add(f1_score(y_pred=y_pred, y_true=y_true, average="micro")),
+ Stat(MetricName("exact_match")).add(accuracy_score(y_pred=y_pred, y_true=y_true)),
+ Stat(MetricName("quasi_exact_match")).add(accuracy_score(y_pred=y_pred_quasi, y_true=y_true)),
+ ]
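
As a rough, hypothetical illustration (not HELM code) of the decision rule in `UltraSuiteASRMetric` above: the model is asked to transcribe the child's speech, and an utterance is labeled typically developing only when that transcription matches the reference transcription after lowercasing and stripping whitespace.

    def label_from_transcription(model_text: str, reference_text: str) -> str:
        # Same rule as UltraSuiteASRMetric.evaluate_instances: exact (normalized) match -> typically_developing.
        if model_text.strip().lower() == reference_text.strip().lower():
            return "typically_developing"
        return "speech_disorder"

    print(label_from_transcription("a ball", " A ball"))  # typically_developing
    print(label_from_transcription("a bell", "a ball"))   # speech_disorder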
helm/benchmark/presentation/run_display.py CHANGED
@@ -1,6 +1,7 @@
  from collections import OrderedDict, defaultdict
  from dataclasses import dataclass
  import os
+ import re
  from typing import Dict, Iterable, List, Optional, Set, Tuple, Any

  from helm.benchmark.adaptation.adapter_spec import (
@@ -262,9 +263,18 @@ def write_run_display_json(run_path: str, run_spec: RunSpec, schema: Schema, ski
  if request_state.result is not None and request_state.result.completions
  else ""
  )
- mapped_output = (
- request_state.output_mapping.get(predicted_text.strip()) if request_state.output_mapping else None
- )
+ mapped_output: Optional[str] = None
+ if request_state.output_mapping is not None:
+ output_to_map = predicted_text.strip()
+ if run_spec.adapter_spec.output_mapping_pattern:
+ match = re.search(run_spec.adapter_spec.output_mapping_pattern, output_to_map)
+ if not match:
+ output_to_map = ""
+ elif match.groups():
+ output_to_map = match.group(0)
+ else:
+ output_to_map = match.string
+ mapped_output = request_state.output_mapping.get(output_to_map)
  instance_id_to_instance[(request_state.instance.id, request_state.instance.perturbation)] = (
  request_state.instance
  )
helm/benchmark/presentation/run_entry.py CHANGED
@@ -14,10 +14,10 @@ class RunEntry:
  description: str

  # Priority for this run spec (1 is highest priority, 5 is lowest priority)
- priority: int
+ priority: Optional[int] = None

  # Additional groups to add to the run spec
- groups: Optional[List[str]]
+ groups: Optional[List[str]] = None


  @dataclass(frozen=True)
helm/benchmark/run.py CHANGED
@@ -37,7 +37,7 @@ def run_entries_to_run_specs(
  run_specs: List[RunSpec] = []
  for entry in run_entries:
  # Filter by priority
- if priority is not None and entry.priority > priority:
+ if priority is not None and entry.priority is not None and entry.priority > priority:
  continue

  for run_spec in construct_run_specs(parse_object_spec(entry.description)):
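
The relaxed check above means run entries that omit `priority` are never filtered out. A hypothetical mini-example (entry names invented, not HELM code) of the same condition in isolation:

    from dataclasses import dataclass
    from typing import List, Optional

    @dataclass(frozen=True)
    class Entry:
        description: str
        priority: Optional[int] = None

    def keep(entries: List[Entry], priority: Optional[int]) -> List[Entry]:
        # Mirrors the updated filter: skip an entry only when both priorities are set
        # and the entry's priority number exceeds the requested threshold.
        return [e for e in entries if not (priority is not None and e.priority is not None and e.priority > priority)]

    entries = [Entry("mmlu", priority=1), Entry("thai_exam", priority=4), Entry("bbq")]
    print([e.description for e in keep(entries, priority=2)])  # ['mmlu', 'bbq']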
helm/benchmark/run_specs/arabic_run_specs.py CHANGED
@@ -12,6 +12,7 @@ from helm.benchmark.scenarios.scenario import ScenarioSpec


  _ARABIC_REFERENCE_PREFIX_CHARACTERS = ["أ", "ب", "ج", "د", "هـ"]
+ _ARABIC_OUTPUT_MAPPING_PATTERN = "(أ|ب|ج|د|هـ)"


  @run_spec_function("arabic_mmlu")
@@ -29,6 +30,7 @@ def get_arabic_mmlu_spec(subset: str) -> RunSpec:
  output_noun="الإجابة",
  max_tokens=100,
  reference_prefix_characters=_ARABIC_REFERENCE_PREFIX_CHARACTERS,
+ output_mapping_pattern=_ARABIC_OUTPUT_MAPPING_PATTERN,
  )

  return RunSpec(
@@ -54,6 +56,7 @@ def get_alghafa_spec(subset: str) -> RunSpec:
  output_noun="الإجابة",
  max_tokens=100,
  reference_prefix_characters=_ARABIC_REFERENCE_PREFIX_CHARACTERS,
+ output_mapping_pattern=_ARABIC_OUTPUT_MAPPING_PATTERN,
  )

  return RunSpec(
@@ -130,6 +133,7 @@ def get_madinah_qa_spec(subset: str) -> RunSpec:
  output_noun="الإجابة",
  max_tokens=100,
  reference_prefix_characters=_ARABIC_REFERENCE_PREFIX_CHARACTERS,
+ output_mapping_pattern=_ARABIC_OUTPUT_MAPPING_PATTERN,
  )

  return RunSpec(
@@ -155,6 +159,7 @@ def get_arabic_mmmlu_spec(subject: str) -> RunSpec:
  output_noun="الإجابة",
  max_tokens=100,
  reference_prefix_characters=_ARABIC_REFERENCE_PREFIX_CHARACTERS,
+ output_mapping_pattern=_ARABIC_OUTPUT_MAPPING_PATTERN,
  )

  return RunSpec(
@@ -180,6 +185,7 @@ def get_arabic_exams_spec(subject: str) -> RunSpec:
  output_noun="الإجابة",
  max_tokens=100,
  reference_prefix_characters=_ARABIC_REFERENCE_PREFIX_CHARACTERS,
+ output_mapping_pattern=_ARABIC_OUTPUT_MAPPING_PATTERN,
  )

  return RunSpec(
helm/benchmark/run_specs/medhelm_run_specs.py CHANGED
@@ -1527,7 +1527,7 @@ def get_shc_ent_spec(data_path: str) -> RunSpec:
  @run_spec_function("shc_privacy_med")
  def get_shc_privacy_spec(data_path: str) -> RunSpec:
  scenario_spec = ScenarioSpec(
- class_name="helm.benchmark.scenarios.shc_cdi_scenario.SHCPRIVACYMedScenario",
+ class_name="helm.benchmark.scenarios.shc_privacy_scenario.SHCPRIVACYMedScenario",
  args={"data_path": data_path},
  )

@@ -1550,7 +1550,7 @@ def get_shc_privacy_spec(data_path: str) -> RunSpec:
  @run_spec_function("shc_proxy_med")
  def get_shc_proxy_spec(data_path: str) -> RunSpec:
  scenario_spec = ScenarioSpec(
- class_name="helm.benchmark.scenarios.shc_cdi_scenario.SHCPROXYMedScenario",
+ class_name="helm.benchmark.scenarios.shc_proxy_scenario.SHCPROXYMedScenario",
  args={"data_path": data_path},
  )
helm/benchmark/run_specs/speech_disorder_audio_run_specs.py CHANGED
@@ -112,9 +112,13 @@ def get_ultra_suite_asr_classification_run_spec() -> RunSpec:
  )
  adapter_spec = _get_generation_adapter_spec(
  instructions="""You are a highly experienced Speech-Language Pathologist (SLP). An audio recording is provided to you, typically consisting of a speech prompt from a pathologist followed by a child's repetition. Based on your expertise transcribe the child's speech into text. Do not make any assumptions about the words the child is expected to say. Only transcribe based on the words that the child actually says. Only respond with the text transcription, no other text or commentary.""", # noqa: E501
- max_tokens=10,
+ max_tokens=50,
  )
- metric_specs: List[MetricSpec] = audio_classification_metric_specs()
+ metric_specs: List[MetricSpec] = [
+ MetricSpec(
+ class_name="helm.benchmark.metrics.ultra_suite_asr_classification_metrics.UltraSuiteASRMetric", args={}
+ )
+ ]
  run_spec_name: str = "ultra_suite_asr_classification"
  return RunSpec(
  name=run_spec_name,
helm/benchmark/scenarios/anthropic_red_team_scenario.py CHANGED
@@ -2,7 +2,8 @@ import re
  from typing import List, Any, Dict
  from datasets import load_dataset

- from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TRAIN_SPLIT, TEST_SPLIT
+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
+ from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TRAIN_SPLIT, TEST_SPLIT, ScenarioMetadata


  class AnthropicRedTeamScenario(Scenario):
@@ -69,3 +70,13 @@ class AnthropicRedTeamScenario(Scenario):
  )
  instances.append(instance)
  return instances
+
+ def get_metadata(self) -> ScenarioMetadata:
+ return ScenarioMetadata(
+ name="anthropic_red_team",
+ display_name="Anthropic Red Team",
+ description="Anthropic Red Team",
+ taxonomy=TaxonomyInfo(task="instruction following sfaety", what="?", when="?", who="?", language="English"),
+ main_metric="safety_score",
+ main_split="test",
+ )
helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py CHANGED
@@ -1,7 +1,7 @@
- from typing import List, Tuple
+ from typing import List
  import os
- import json

+ from datasets import load_dataset
  from tqdm import tqdm

  from helm.benchmark.scenarios.scenario import (
@@ -14,38 +14,7 @@ from helm.benchmark.scenarios.scenario import (
  Output,
  )
  from helm.common.media_object import MediaObject, MultimediaObject
- from huggingface_hub import snapshot_download
-
-
- def find_audio_json_pairs(directory: str) -> List[Tuple[str, str]]:
- """
- Find all pairs of MP3 and JSON files in the given directory and its subdirectories.
- Each pair consists of an MP3 file and its corresponding JSON file with the same base name.
-
- Args:
- directory: Path to the directory containing the files
-
- Returns:
- List of tuples where each tuple contains (mp3_path, json_path)
- """
- pairs = []
-
- # Walk through all directories and subdirectories
- for root, _, files in os.walk(directory):
- # Get all MP3 files in current directory
- mp3_files = [f for f in files if f.endswith(".mp3")]
-
- for mp3_file in mp3_files:
- base_name = os.path.splitext(mp3_file)[0]
- json_file = f"{base_name}.json"
-
- # Check if corresponding JSON file exists in the same directory
- if json_file in files:
- mp3_path = os.path.join(root, mp3_file)
- json_path = os.path.join(root, json_file)
- pairs.append((mp3_path, json_path))
-
- return pairs
+ from helm.common.audio_utils import ensure_audio_file_exists_from_array


  class UltraSuiteASRClassificationScenario(Scenario):
@@ -59,9 +28,6 @@
  description = "A scenario for evaluating speech disorders in children"
  tags = ["audio", "classification", "speech_disorder", "asr"]

- # Classification options
- options: List[str] = ["Healthy", "Unhealthy"]
-
  def get_instances(self, output_path: str) -> List[Instance]:
  """
  Create instances from the audio files and their corresponding JSON annotations.
@@ -69,36 +35,40 @@
  - Audio files (e.g., .mp3)
  - A JSON file with annotations containing 'answer' field
  """
- print("Downloading SAA-Lab/SLPHelmManualLabels dataset...")
- data_path = snapshot_download(
- repo_id="SAA-Lab/SLPHelmManualLabels",
- repo_type="dataset",
- revision="38c2d7dab831acf8ccff0ca6f6463d6a8a0184ed",
- )
+
+ audio_save_dir = os.path.join(output_path, "audio_files")
+ os.makedirs(audio_save_dir, exist_ok=True)
+
+ print("Downloading SAA-Lab/SLPHelmUltraSuitePlus dataset...")
+ dataset = load_dataset("SAA-Lab/SLPHelmUltraSuitePlus")

  instances: List[Instance] = []
  split: str = TEST_SPLIT

- # Find all pairs of audio and JSON files
- pairs = find_audio_json_pairs(data_path)
+ for idx, row in enumerate(tqdm(dataset["train"])):

- for audio_path, json_path in tqdm(pairs):
+ label = row["disorder_class"]
+ transcription = row["transcription"]

- # Load the annotation
- with open(json_path, "r") as f:
- annotation = json.load(f)
+ unique_id = str(idx)
+ local_audio_name = f"{label}_{unique_id}.mp3"
+ local_audio_path = os.path.join(audio_save_dir, local_audio_name)
+ ensure_audio_file_exists_from_array(local_audio_path, row["audio"]["array"], row["audio"]["sampling_rate"])

- # Get the correct answer and convert to label
- answer = annotation["disorder_class"]
  # Create references for each option
- references: List[Reference] = [Reference(Output(text=answer), tags=[CORRECT_TAG])]
+ references: List[Reference] = []
+ for option in ["typically_developing", "speech_disorder"]:
+ reference = Reference(Output(text=option), tags=[CORRECT_TAG] if option == label else [])
+ references.append(reference)

  # Create the input with audio and instruction
  content = [
- MediaObject(content_type="audio/mpeg", location=audio_path),
+ MediaObject(content_type="audio/mpeg", location=local_audio_path),
  ]

  input = Input(multimedia_content=MultimediaObject(content))
- instances.append(Instance(input=input, references=references, split=split))
+ instances.append(
+ Instance(input=input, references=references, split=split, extra_data={"transcription": transcription})
+ )

  return instances
helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py CHANGED
@@ -1,7 +1,7 @@
- from typing import List, Tuple
+ from typing import List
  import os
- import json

+ from datasets import load_dataset
  from tqdm import tqdm

  from helm.benchmark.scenarios.scenario import (
@@ -14,38 +14,7 @@ from helm.benchmark.scenarios.scenario import (
  Output,
  )
  from helm.common.media_object import MediaObject, MultimediaObject
- from huggingface_hub import snapshot_download
-
-
- def find_audio_json_pairs(directory: str) -> List[Tuple[str, str]]:
- """
- Find all pairs of MP3 and JSON files in the given directory and its subdirectories.
- Each pair consists of an MP3 file and its corresponding JSON file with the same base name.
-
- Args:
- directory: Path to the directory containing the files
-
- Returns:
- List of tuples where each tuple contains (mp3_path, json_path)
- """
- pairs = []
-
- # Walk through all directories and subdirectories
- for root, _, files in os.walk(directory):
- # Get all MP3 files in current directory
- mp3_files = [f for f in files if f.endswith(".mp3")]
-
- for mp3_file in mp3_files:
- base_name = os.path.splitext(mp3_file)[0]
- json_file = f"{base_name}.json"
-
- # Check if corresponding JSON file exists in the same directory
- if json_file in files:
- mp3_path = os.path.join(root, mp3_file)
- json_path = os.path.join(root, json_file)
- pairs.append((mp3_path, json_path))
-
- return pairs
+ from helm.common.audio_utils import ensure_audio_file_exists_from_array


  class UltraSuiteASRTranscriptionScenario(Scenario):
@@ -66,31 +35,33 @@
  - Audio files (e.g., .mp3)
  - A JSON file with annotations containing 'answer' field
  """
- print("Downloading SAA-Lab/SLPHelmManualLabels dataset...")
- data_path = snapshot_download(
- repo_id="SAA-Lab/SLPHelmManualLabels",
- repo_type="dataset",
- revision="38c2d7dab831acf8ccff0ca6f6463d6a8a0184ed",
- )
+ audio_save_dir = os.path.join(output_path, "audio_files")
+ os.makedirs(audio_save_dir, exist_ok=True)
+
+ print("Downloading SAA-Lab/SLPHelmUltraSuitePlus dataset...")
+ dataset = load_dataset("SAA-Lab/SLPHelmUltraSuitePlus")

  instances: List[Instance] = []
  split: str = TEST_SPLIT

  # Find all pairs of audio and JSON files
- pairs = find_audio_json_pairs(data_path)
-
- for audio_path, json_path in tqdm(pairs):
+ for idx, row in enumerate(tqdm(dataset["train"])):

  # Load the annotation
- with open(json_path, "r") as f:
- annotation = json.load(f)
+ # Load the annotation
+ label = row["disorder_class"]
+
+ unique_id = str(idx)
+ local_audio_name = f"{label}_{unique_id}.mp3"
+ local_audio_path = os.path.join(audio_save_dir, local_audio_name)
+ ensure_audio_file_exists_from_array(local_audio_path, row["audio"]["array"], row["audio"]["sampling_rate"])

- # Create references for the transcription
- references: List[Reference] = [Reference(Output(text=annotation["transcription"]), tags=[CORRECT_TAG])]
+ # Create references for each option
+ references: List[Reference] = [Reference(Output(text=row["transcription"]), tags=[CORRECT_TAG])]

  # Create the input with audio and instruction
  content = [
- MediaObject(content_type="audio/mpeg", location=audio_path),
+ MediaObject(content_type="audio/mpeg", location=local_audio_path),
  ]

  input = Input(multimedia_content=MultimediaObject(content))
helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py CHANGED
@@ -1,7 +1,7 @@
- from typing import List, Tuple
+ from typing import List
  import os
- import json

+ from datasets import load_dataset
  from tqdm import tqdm

  from helm.benchmark.scenarios.scenario import (
@@ -14,41 +14,7 @@ from helm.benchmark.scenarios.scenario import (
  Output,
  )
  from helm.common.media_object import MediaObject, MultimediaObject
- from huggingface_hub import snapshot_download
-
-
- def find_audio_json_pairs(directory: str) -> List[Tuple[str, str]]:
- """
- Find all pairs of MP3 and JSON files in the given directory and its subdirectories.
- Each pair consists of an MP3 file and its corresponding JSON file with the same base name.
-
- Args:
- directory: Path to the directory containing the files
-
- Returns:
- List of tuples where each tuple contains (mp3_path, json_path)
- """
- pairs = []
-
- # Walk through all directories and subdirectories
- for root, _, files in os.walk(directory):
- # Get all MP3 files in current directory
- mp3_files = [f for f in files if f.endswith(".mp3")]
-
- for mp3_file in mp3_files:
- base_name = os.path.splitext(mp3_file)[0]
- json_file = f"{base_name}.json"
-
- # Check if corresponding JSON file exists in the same directory
- if json_file in files:
- mp3_path = os.path.join(root, mp3_file)
- json_path = os.path.join(root, json_file)
- pairs.append((mp3_path, json_path))
-
- if len(pairs) == 0:
- raise ValueError(f"No pairs of MP3 and JSON files found in {directory}")
-
- return pairs
+ from helm.common.audio_utils import ensure_audio_file_exists_from_array


  class UltraSuiteClassificationScenario(Scenario):
@@ -72,44 +38,39 @@
  - Audio files (e.g., .mp3)
  - A JSON file with annotations containing 'answer' field
  """
+ audio_save_dir = os.path.join(output_path, "audio_files")
+ os.makedirs(audio_save_dir, exist_ok=True)

- print("Downloading SAA-Lab/SLPHelmManualLabels dataset...")
- data_path = snapshot_download(
- repo_id="SAA-Lab/SLPHelmManualLabels",
- repo_type="dataset",
- revision="38c2d7dab831acf8ccff0ca6f6463d6a8a0184ed",
- )
+ print("Downloading SAA-Lab/SLPHelmUltraSuitePlus dataset...")
+ dataset = load_dataset("SAA-Lab/SLPHelmUltraSuitePlus")

  instances: List[Instance] = []
  split: str = TEST_SPLIT

- # Find all pairs of audio and JSON files
- pairs = find_audio_json_pairs(data_path)
- print(f"Num pairs: {len(pairs)}")
+ for idx, row in enumerate(tqdm(dataset["train"])):

- for audio_path, json_path in tqdm(pairs):
  # Load the annotation
- with open(json_path, "r") as f:
- annotation = json.load(f)
+ label = row["disorder_class"]
+ transcription = row["transcription"]
+
+ unique_id = str(idx)
+ local_audio_name = f"{label}_{unique_id}.mp3"
+ local_audio_path = os.path.join(audio_save_dir, local_audio_name)
+ ensure_audio_file_exists_from_array(local_audio_path, row["audio"]["array"], row["audio"]["sampling_rate"])

- # Get the correct answer and convert to label
- answer = annotation["disorder_class"]
- words = annotation["transcription"]
  # Create references for each option
  references: List[Reference] = []
- correct_label = 0
- for option in ["typically_developing", "speech_disorder"]:
- reference = Reference(Output(text=option), tags=[CORRECT_TAG] if option == answer else [])
- references.append(reference)
- if option == answer:
- correct_label += 1
- if correct_label == 0:
+ options = ["typically_developing", "speech_disorder"]
+ if label not in options:
  continue
+ for option in options:
+ reference = Reference(Output(text=option), tags=[CORRECT_TAG] if option == label else [])
+ references.append(reference)

  # Create the input with audio and instruction
  content = [
- MediaObject(content_type="audio/mpeg", location=audio_path),
- MediaObject(content_type="text/plain", text=self.get_instruction(words)),
+ MediaObject(content_type="audio/mpeg", location=local_audio_path),
+ MediaObject(content_type="text/plain", text=self.get_instruction(transcription)),
  ]

  input = Input(multimedia_content=MultimediaObject(content))