crfm-helm 0.5.5__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (268)
  1. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +74 -53
  2. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +262 -182
  3. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
  5. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  6. helm/benchmark/annotation/air_bench_annotator.py +2 -2
  7. helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
  8. helm/benchmark/annotation/bird_sql_annotator.py +2 -2
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
  10. helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
  11. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
  12. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  13. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  14. helm/benchmark/annotation/model_as_judge.py +12 -16
  15. helm/benchmark/annotation/omni_math_annotator.py +13 -14
  16. helm/benchmark/annotation/wildbench_annotator.py +9 -9
  17. helm/benchmark/executor.py +11 -12
  18. helm/benchmark/metrics/aci_bench_metrics.py +9 -29
  19. helm/benchmark/metrics/bias_word_lists.py +1 -1
  20. helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
  21. helm/benchmark/metrics/classification_metrics.py +3 -3
  22. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  23. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  24. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  25. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  26. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  27. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  28. helm/benchmark/metrics/comet_metric.py +1 -1
  29. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
  30. helm/benchmark/metrics/copyright_metrics.py +1 -1
  31. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  32. helm/benchmark/metrics/dischargeme_metrics.py +9 -29
  33. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  34. helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
  35. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  36. helm/benchmark/metrics/ifeval_metrics.py +2 -2
  37. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  38. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  39. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  40. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  41. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  42. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  43. helm/benchmark/metrics/med_dialog_metrics.py +9 -29
  44. helm/benchmark/metrics/medalign_metrics.py +9 -29
  45. helm/benchmark/metrics/medi_qa_metrics.py +9 -29
  46. helm/benchmark/metrics/medication_qa_metrics.py +10 -30
  47. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  48. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  49. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  50. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  51. helm/benchmark/metrics/mental_health_metrics.py +9 -29
  52. helm/benchmark/metrics/metric_service.py +11 -11
  53. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  54. helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
  55. helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
  56. helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
  57. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  58. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  59. helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
  60. helm/benchmark/metrics/summac/model_summac.py +2 -3
  61. helm/benchmark/metrics/summarization_metrics.py +2 -1
  62. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
  63. helm/benchmark/metrics/toxicity_metrics.py +2 -2
  64. helm/benchmark/metrics/unitxt_metrics.py +3 -4
  65. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  66. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  67. helm/benchmark/model_deployment_registry.py +16 -26
  68. helm/benchmark/presentation/contamination.py +3 -3
  69. helm/benchmark/presentation/create_plots.py +43 -13
  70. helm/benchmark/presentation/run_display.py +13 -0
  71. helm/benchmark/presentation/schema.py +7 -1
  72. helm/benchmark/presentation/summarize.py +84 -61
  73. helm/benchmark/presentation/test_create_plots.py +4 -1
  74. helm/benchmark/reeval_run.py +3 -4
  75. helm/benchmark/reeval_runner.py +3 -3
  76. helm/benchmark/run.py +84 -73
  77. helm/benchmark/run_expander.py +12 -1
  78. helm/benchmark/run_spec_factory.py +7 -6
  79. helm/benchmark/run_specs/arabic_run_specs.py +73 -0
  80. helm/benchmark/run_specs/audio_run_specs.py +52 -8
  81. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  82. helm/benchmark/run_specs/classic_run_specs.py +0 -53
  83. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  84. helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
  85. helm/benchmark/run_specs/experimental_run_specs.py +31 -1
  86. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  87. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  88. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  89. helm/benchmark/run_specs/long_context_run_specs.py +114 -15
  90. helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
  91. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  92. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  93. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +163 -0
  94. helm/benchmark/run_specs/vlm_run_specs.py +28 -0
  95. helm/benchmark/runner.py +5 -5
  96. helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
  97. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  98. helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
  99. helm/benchmark/scenarios/aratrust_scenario.py +76 -0
  100. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
  101. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
  102. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  103. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  104. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  105. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +104 -0
  106. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
  107. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +118 -0
  108. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +86 -0
  109. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +117 -0
  110. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
  111. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
  112. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
  113. helm/benchmark/scenarios/bluex_scenario.py +66 -0
  114. helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
  115. helm/benchmark/scenarios/clear_scenario.py +11 -7
  116. helm/benchmark/scenarios/cleva_scenario.py +1 -1
  117. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  118. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  119. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  120. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  121. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  122. helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
  123. helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
  124. helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
  125. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  126. helm/benchmark/scenarios/grammar.py +2 -2
  127. helm/benchmark/scenarios/headqa_scenario.py +6 -1
  128. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  129. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
  130. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  131. helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
  132. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  133. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  134. helm/benchmark/scenarios/math_scenario.py +21 -20
  135. helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
  136. helm/benchmark/scenarios/medalign_scenario.py +9 -3
  137. helm/benchmark/scenarios/medalign_scenario_helper.py +27 -130
  138. helm/benchmark/scenarios/medbullets_scenario.py +7 -2
  139. helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
  140. helm/benchmark/scenarios/medec_scenario.py +6 -1
  141. helm/benchmark/scenarios/medhallu_scenario.py +7 -1
  142. helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
  143. helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
  144. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  145. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  146. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  147. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  148. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  149. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  150. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  151. helm/benchmark/scenarios/mental_health_scenario.py +16 -5
  152. helm/benchmark/scenarios/mimic_bhc_scenario.py +13 -8
  153. helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
  154. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
  155. helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
  156. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  157. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
  158. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
  159. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
  160. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  161. helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
  162. helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
  163. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
  164. helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
  165. helm/benchmark/scenarios/seahelm_scenario.py +2 -2
  166. helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
  167. helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
  168. helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
  169. helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
  170. helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
  171. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  172. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  173. helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
  174. helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
  175. helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
  176. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
  177. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  178. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  179. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  180. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  181. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  182. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  183. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  184. helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
  185. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  186. helm/benchmark/server.py +2 -1
  187. helm/benchmark/slurm_jobs.py +1 -2
  188. helm/benchmark/slurm_runner.py +8 -1
  189. helm/benchmark/static/schema_arabic.yaml +228 -0
  190. helm/benchmark/static/schema_audio.yaml +60 -49
  191. helm/benchmark/static/schema_classic.yaml +0 -17
  192. helm/benchmark/static/schema_enterprise.yaml +21 -0
  193. helm/benchmark/static/schema_long_context.yaml +81 -20
  194. helm/benchmark/static/schema_medhelm.yaml +272 -213
  195. helm/benchmark/static/schema_melt.yaml +1257 -0
  196. helm/benchmark/static/schema_slphelm.yaml +162 -0
  197. helm/benchmark/static/schema_vhelm.yaml +26 -26
  198. helm/benchmark/static/schema_video.yaml +219 -0
  199. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  200. helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
  201. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  202. helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
  203. helm/benchmark/static_build/index.html +4 -4
  204. helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
  205. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  206. helm/benchmark/window_services/test_utils.py +3 -4
  207. helm/benchmark/window_services/tokenizer_service.py +7 -8
  208. helm/clients/anthropic_client.py +69 -29
  209. helm/clients/audio_language/diva_llama_client.py +4 -2
  210. helm/clients/audio_language/qwen2_5_omni_client.py +209 -0
  211. helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
  212. helm/clients/audio_language/qwen_audiolm_client.py +4 -2
  213. helm/clients/audio_language/test.py +62 -0
  214. helm/clients/bedrock_client.py +3 -1
  215. helm/clients/client.py +7 -7
  216. helm/clients/grok_client.py +36 -0
  217. helm/clients/huggingface_client.py +42 -3
  218. helm/clients/huggingface_pipeline_client.py +138 -0
  219. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  220. helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
  221. helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
  222. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  223. helm/clients/openai_client.py +102 -55
  224. helm/clients/openai_responses_client.py +176 -0
  225. helm/clients/palmyra_client.py +2 -5
  226. helm/clients/reka_client.py +2 -2
  227. helm/clients/test_huggingface_client.py +3 -3
  228. helm/clients/together_client.py +31 -6
  229. helm/clients/vertexai_client.py +17 -9
  230. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  231. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  232. helm/clients/vision_language/idefics_client.py +6 -2
  233. helm/clients/vision_language/paligemma_client.py +2 -2
  234. helm/clients/vision_language/qwen2_vlm_client.py +66 -53
  235. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  236. helm/clients/vllm_client.py +43 -7
  237. helm/clients/vllm_granite_thinking_client.py +56 -0
  238. helm/clients/writer_client.py +102 -0
  239. helm/common/context.py +80 -0
  240. helm/common/credentials_utils.py +5 -5
  241. helm/common/critique_request.py +0 -1
  242. helm/common/general.py +9 -2
  243. helm/common/hierarchical_logger.py +104 -12
  244. helm/common/local_context.py +140 -0
  245. helm/common/object_spec.py +23 -8
  246. helm/common/remote_context.py +61 -0
  247. helm/common/request.py +8 -0
  248. helm/common/test_logging.py +94 -0
  249. helm/config/model_deployments.yaml +995 -45
  250. helm/config/model_metadata.yaml +780 -59
  251. helm/config/tokenizer_configs.yaml +224 -3
  252. helm/proxy/cli.py +4 -2
  253. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  254. helm/proxy/retry.py +5 -0
  255. helm/proxy/services/server_service.py +21 -85
  256. helm/tokenizers/grok_tokenizer.py +55 -0
  257. helm/tokenizers/huggingface_tokenizer.py +1 -1
  258. helm/tokenizers/test_grok_tokenizer.py +33 -0
  259. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  260. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  261. helm/benchmark/scenarios/numeracy_scenario.py +0 -793
  262. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
  263. helm/benchmark/static_build/assets/index-262903c1.js +0 -10
  264. helm/benchmark/static_build/assets/index-42060d71.css +0 -1
  265. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
  266. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
  267. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
  268. /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
--- /dev/null
+++ b/helm/benchmark/scenarios/audio_language/corebench_scenario.py
@@ -0,0 +1,77 @@
+from typing import List
+import os
+import json
+
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+from tqdm import tqdm
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.general import ensure_file_downloaded
+
+
+class COREBenchScenario(Scenario):
+    """COREBench
+
+    COREBench is a new audio benchmark incorporating multi-speaker conversations. It consists of conversational
+    audio, transcript, question, and answer. There are two challenging features of this benchmark: (1) the questions
+    are designed to require reasoning over multiple turns of conversation, and (2) the average audio length is
+    longer than 1 minute, which is significantly longer than existing benchmarks.
+    """
+
+    ANNOT_URL = (
+        "https://huggingface.co/datasets/stanford-crfm/COnversationalREasoningBench_v0.1/resolve/"
+        "main/test/instances.jsonl"
+    )
+    HF_AUDIO_FOLDER = (
+        "https://huggingface.co/datasets/stanford-crfm/COnversationalREasoningBench_v0.1/resolve/main/test/audio"
+    )
+
+    COREBENCH_INSTRUCTION = (
+        "\n\n Answer the question by just giving the final answer and nothing else. "
+        "Answer 'unanswerable' if the question is irrelevant to the audio or cannot be inferred."
+    )
+
+    name = "corebench"
+    description = "Exploring multi-speaker conversational audio reasoning task."
+    tags: List[str] = ["audio", "reasoning"]
+
+    def load_jsonl(self, file_path):
+        with open(file_path, "r", encoding="utf-8") as f:
+            return [json.loads(line.strip()) for line in f]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        instances: List[Instance] = []
+        annot_save_path = os.path.join(output_path, "instances.jsonl")
+        ensure_file_downloaded(source_url=COREBenchScenario.ANNOT_URL, target_path=annot_save_path)
+        annotations = self.load_jsonl(annot_save_path)
+        audio_save_dir = os.path.join(output_path, "audio")
+        # Download audio files first
+        for row in tqdm(annotations):
+            audio_path = row["audio_path"]
+            local_audio_path = os.path.join(audio_save_dir, audio_path)
+            ensure_file_downloaded(
+                source_url=os.path.join(COREBenchScenario.HF_AUDIO_FOLDER, audio_path), target_path=local_audio_path
+            )
+        for row in tqdm(annotations):
+            local_audio_path = os.path.join(audio_save_dir, row["audio_path"])
+            answer = row["answer"].lower()
+            question = row["question"]
+
+            input = Input(
+                multimedia_content=MultimediaObject(
+                    [
+                        MediaObject(content_type="audio/mpeg", location=local_audio_path),
+                        MediaObject(content_type="text/plain", text=question + self.COREBENCH_INSTRUCTION),
+                    ]
+                )
+            )
+            references = [Reference(Output(text=answer), tags=[CORRECT_TAG])]
+            instances.append(Instance(input=input, references=references, split=TEST_SPLIT))
+        return instances
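
The new scenario can be exercised on its own for a quick sanity check; a minimal sketch (not part of the diff), assuming the helm package from this wheel is installed and the HuggingFace URLs above are reachable. The output directory name is arbitrary:

    # Hypothetical local smoke test for COREBenchScenario.
    from helm.benchmark.scenarios.audio_language.corebench_scenario import COREBenchScenario

    scenario = COREBenchScenario()
    # Downloads instances.jsonl plus the referenced audio files into ./corebench_data.
    instances = scenario.get_instances(output_path="./corebench_data")
    print(len(instances), instances[0].references[0].output.text)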
--- a/helm/benchmark/scenarios/audio_language/mustard_scenario.py
+++ b/helm/benchmark/scenarios/audio_language/mustard_scenario.py
@@ -19,7 +19,7 @@ from helm.common.general import ensure_directory_exists, ensure_file_downloaded
 
 
 class MUStARDScenario(Scenario):
-    """
+    r"""
     MUStARD: Multimodal Sarcasm Detection Dataset
 
     A multimodal video corpus for research in automated sarcasm discovery. The dataset is compiled from popular
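
The only change here is the r prefix on the class docstring, presumably because the docstring body contains literal backslashes; a raw string keeps Python from treating them as escape sequences. A minimal illustration (hypothetical docstrings, not from the repo):

    def plain():
        """Matches \s whitespace."""  # SyntaxWarning: invalid escape sequence on recent Python

    def raw():
        r"""Matches \s whitespace."""  # backslash kept literally, no warning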
--- /dev/null
+++ b/helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py
@@ -0,0 +1,104 @@
+from typing import List, Tuple
+import os
+import json
+
+from tqdm import tqdm
+
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from huggingface_hub import snapshot_download
+
+
+def find_audio_json_pairs(directory: str) -> List[Tuple[str, str]]:
+    """
+    Find all pairs of MP3 and JSON files in the given directory and its subdirectories.
+    Each pair consists of an MP3 file and its corresponding JSON file with the same base name.
+
+    Args:
+        directory: Path to the directory containing the files
+
+    Returns:
+        List of tuples where each tuple contains (mp3_path, json_path)
+    """
+    pairs = []
+
+    # Walk through all directories and subdirectories
+    for root, _, files in os.walk(directory):
+        # Get all MP3 files in current directory
+        mp3_files = [f for f in files if f.endswith(".mp3")]
+
+        for mp3_file in mp3_files:
+            base_name = os.path.splitext(mp3_file)[0]
+            json_file = f"{base_name}.json"
+
+            # Check if corresponding JSON file exists in the same directory
+            if json_file in files:
+                mp3_path = os.path.join(root, mp3_file)
+                json_path = os.path.join(root, json_file)
+                pairs.append((mp3_path, json_path))
+
+    return pairs
+
+
+class UltraSuiteASRClassificationScenario(Scenario):
+    """
+    A scenario for evaluating whether a child speaker has a speech disorder or not.
+    The audio files contain speech from children, potentially with an adult present.
+    The task is to classify whether the child speaker is typically developing or has a speech disorder.
+    """
+
+    name = "speech_disorder"
+    description = "A scenario for evaluating speech disorders in children"
+    tags = ["audio", "classification", "speech_disorder", "asr"]
+
+    # Classification options
+    options: List[str] = ["Healthy", "Unhealthy"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        """
+        Create instances from the audio files and their corresponding JSON annotations.
+        The data directory should contain:
+        - Audio files (e.g., .mp3)
+        - A JSON file with annotations containing 'answer' field
+        """
+        print("Downloading SAA-Lab/SLPHelmManualLabels dataset...")
+        data_path = snapshot_download(
+            repo_id="SAA-Lab/SLPHelmManualLabels",
+            repo_type="dataset",
+            revision="38c2d7dab831acf8ccff0ca6f6463d6a8a0184ed",
+        )
+
+        instances: List[Instance] = []
+        split: str = TEST_SPLIT
+
+        # Find all pairs of audio and JSON files
+        pairs = find_audio_json_pairs(data_path)
+
+        for audio_path, json_path in tqdm(pairs):
+
+            # Load the annotation
+            with open(json_path, "r") as f:
+                annotation = json.load(f)
+
+            # Get the correct answer and convert to label
+            answer = annotation["disorder_class"]
+            # Create references for each option
+            references: List[Reference] = [Reference(Output(text=answer), tags=[CORRECT_TAG])]
+
+            # Create the input with audio and instruction
+            content = [
+                MediaObject(content_type="audio/mpeg", location=audio_path),
+            ]
+
+            input = Input(multimedia_content=MultimediaObject(content))
+            instances.append(Instance(input=input, references=references, split=split))
+
+        return instances
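
find_audio_json_pairs pairs each MP3 with a JSON sidecar of the same base name in the same directory, silently skipping unmatched audio. A runnable sketch of that behavior (temporary files, hypothetical names):

    import os
    import tempfile

    from helm.benchmark.scenarios.audio_language.ultra_suite_asr_classification_scenario import find_audio_json_pairs

    root = tempfile.mkdtemp()
    os.makedirs(os.path.join(root, "session1"))
    for name in ("word1.mp3", "word1.json", "word2.mp3"):  # word2.mp3 has no JSON sidecar
        open(os.path.join(root, "session1", name), "w").close()

    print(find_audio_json_pairs(root))
    # -> [('.../session1/word1.mp3', '.../session1/word1.json')]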
--- /dev/null
+++ b/helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py
@@ -0,0 +1,99 @@
+from typing import List, Tuple
+import os
+import json
+
+from tqdm import tqdm
+
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from huggingface_hub import snapshot_download
+
+
+def find_audio_json_pairs(directory: str) -> List[Tuple[str, str]]:
+    """
+    Find all pairs of MP3 and JSON files in the given directory and its subdirectories.
+    Each pair consists of an MP3 file and its corresponding JSON file with the same base name.
+
+    Args:
+        directory: Path to the directory containing the files
+
+    Returns:
+        List of tuples where each tuple contains (mp3_path, json_path)
+    """
+    pairs = []
+
+    # Walk through all directories and subdirectories
+    for root, _, files in os.walk(directory):
+        # Get all MP3 files in current directory
+        mp3_files = [f for f in files if f.endswith(".mp3")]
+
+        for mp3_file in mp3_files:
+            base_name = os.path.splitext(mp3_file)[0]
+            json_file = f"{base_name}.json"
+
+            # Check if corresponding JSON file exists in the same directory
+            if json_file in files:
+                mp3_path = os.path.join(root, mp3_file)
+                json_path = os.path.join(root, json_file)
+                pairs.append((mp3_path, json_path))
+
+    return pairs
+
+
+class UltraSuiteASRTranscriptionScenario(Scenario):
+    """
+    A scenario for evaluating the transcription capabilities of ASR systems.
+    The audio files contain speech from children, potentially with an adult present.
+    The task is to transcribe the child's speech.
+    """
+
+    name = "speech_disorder"
+    description = "A scenario for evaluating speech disorders in children"
+    tags = ["audio", "transcription", "speech_disorder", "asr"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        """
+        Create instances from the audio files and their corresponding JSON annotations.
+        The data directory should contain:
+        - Audio files (e.g., .mp3)
+        - A JSON file with annotations containing 'answer' field
+        """
+        print("Downloading SAA-Lab/SLPHelmManualLabels dataset...")
+        data_path = snapshot_download(
+            repo_id="SAA-Lab/SLPHelmManualLabels",
+            repo_type="dataset",
+            revision="38c2d7dab831acf8ccff0ca6f6463d6a8a0184ed",
+        )
+
+        instances: List[Instance] = []
+        split: str = TEST_SPLIT
+
+        # Find all pairs of audio and JSON files
+        pairs = find_audio_json_pairs(data_path)
+
+        for audio_path, json_path in tqdm(pairs):
+
+            # Load the annotation
+            with open(json_path, "r") as f:
+                annotation = json.load(f)
+
+            # Create references for the transcription
+            references: List[Reference] = [Reference(Output(text=annotation["transcription"]), tags=[CORRECT_TAG])]
+
+            # Create the input with audio and instruction
+            content = [
+                MediaObject(content_type="audio/mpeg", location=audio_path),
+            ]
+
+            input = Input(multimedia_content=MultimediaObject(content))
+            instances.append(Instance(input=input, references=references, split=split))
+
+        return instances
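
Note that this module repeats find_audio_json_pairs verbatim from the ASR classification scenario above; of the five new UltraSuite modules, only the disorder-breakdown one below shares the helper by import, which avoids the drift risk of maintaining copies:

    # Pattern used in ultra_suite_disorder_breakdown_scenario.py:
    from .ultra_suite_classification_scenario import find_audio_json_pairs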
--- /dev/null
+++ b/helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py
@@ -0,0 +1,118 @@
+from typing import List, Tuple
+import os
+import json
+
+from tqdm import tqdm
+
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from huggingface_hub import snapshot_download
+
+
+def find_audio_json_pairs(directory: str) -> List[Tuple[str, str]]:
+    """
+    Find all pairs of MP3 and JSON files in the given directory and its subdirectories.
+    Each pair consists of an MP3 file and its corresponding JSON file with the same base name.
+
+    Args:
+        directory: Path to the directory containing the files
+
+    Returns:
+        List of tuples where each tuple contains (mp3_path, json_path)
+    """
+    pairs = []
+
+    # Walk through all directories and subdirectories
+    for root, _, files in os.walk(directory):
+        # Get all MP3 files in current directory
+        mp3_files = [f for f in files if f.endswith(".mp3")]
+
+        for mp3_file in mp3_files:
+            base_name = os.path.splitext(mp3_file)[0]
+            json_file = f"{base_name}.json"
+
+            # Check if corresponding JSON file exists in the same directory
+            if json_file in files:
+                mp3_path = os.path.join(root, mp3_file)
+                json_path = os.path.join(root, json_file)
+                pairs.append((mp3_path, json_path))
+
+    if len(pairs) == 0:
+        raise ValueError(f"No pairs of MP3 and JSON files found in {directory}")
+
+    return pairs
+
+
+class UltraSuiteClassificationScenario(Scenario):
+    """
+    A scenario for evaluating whether a child speaker has a speech disorder or not.
+    The audio files contain speech from children, potentially with an adult present.
+    The task is to classify whether the child speaker is typically developing or has a speech disorder.
+    """
+
+    name = "speech_disorder"
+    description = "A scenario for evaluating speech disorders in children"
+    tags = ["audio", "classification", "speech_disorder"]
+
+    def get_instruction(self, words: str) -> str:
+        return f"""You are a highly experienced Speech-Language Pathologist (SLP). An audio recording will be provided, typically consisting of a speech prompt from a pathologist followed by a child's repetition. The prompt the child is trying to repeat is as follows: {words}. Based on your professional expertise: 1. Assess the child's speech in the recording for signs of typical development or potential speech-language disorder. 2. Conclude your analysis with one of the following labels only: 'typically_developing' or 'speech_disorder'. 3. Provide your response as a single letter without any additional explanation, commentary, or unnecessary text."""  # noqa: E501
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        """
+        Create instances from the audio files and their corresponding JSON annotations.
+        The data directory should contain:
+        - Audio files (e.g., .mp3)
+        - A JSON file with annotations containing 'answer' field
+        """
+
+        print("Downloading SAA-Lab/SLPHelmManualLabels dataset...")
+        data_path = snapshot_download(
+            repo_id="SAA-Lab/SLPHelmManualLabels",
+            repo_type="dataset",
+            revision="38c2d7dab831acf8ccff0ca6f6463d6a8a0184ed",
+        )
+
+        instances: List[Instance] = []
+        split: str = TEST_SPLIT
+
+        # Find all pairs of audio and JSON files
+        pairs = find_audio_json_pairs(data_path)
+        print(f"Num pairs: {len(pairs)}")
+
+        for audio_path, json_path in tqdm(pairs):
+            # Load the annotation
+            with open(json_path, "r") as f:
+                annotation = json.load(f)
+
+            # Get the correct answer and convert to label
+            answer = annotation["disorder_class"]
+            words = annotation["transcription"]
+            # Create references for each option
+            references: List[Reference] = []
+            correct_label = 0
+            for option in ["typically_developing", "speech_disorder"]:
+                reference = Reference(Output(text=option), tags=[CORRECT_TAG] if option == answer else [])
+                references.append(reference)
+                if option == answer:
+                    correct_label += 1
+            if correct_label == 0:
+                continue
+
+            # Create the input with audio and instruction
+            content = [
+                MediaObject(content_type="audio/mpeg", location=audio_path),
+                MediaObject(content_type="text/plain", text=self.get_instruction(words)),
+            ]
+
+            input = Input(multimedia_content=MultimediaObject(content))
+            instances.append(Instance(input=input, references=references, split=split))
+
+        return instances
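
In the loop above, correct_label only ever serves to skip annotations whose disorder_class is not one of the two expected labels. A sketch of a more direct equivalent (same behavior, not what ships in the wheel):

    OPTIONS = ["typically_developing", "speech_disorder"]
    if answer not in OPTIONS:
        continue  # unrecognized label: skip this recording
    references = [
        Reference(Output(text=option), tags=[CORRECT_TAG] if option == answer else [])
        for option in OPTIONS
    ]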
--- /dev/null
+++ b/helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py
@@ -0,0 +1,86 @@
+from typing import List
+import json
+
+from tqdm import tqdm
+
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from huggingface_hub import snapshot_download
+from .ultra_suite_classification_scenario import find_audio_json_pairs
+
+
+class UltraSuiteDisorderBreakdownScenario(Scenario):
+    """
+    A scenario for evaluating and classifying specific types of speech disorders in children.
+    This scenario extends the basic speech disorder classification by breaking down disorders
+    into specific categories: articulation and phonological disorders.
+    """
+
+    name = "speech_disorder"
+    description = "A scenario for evaluating and classifying specific types of speech disorders in children"
+    tags = ["audio", "classification", "speech_disorder", "disorder_breakdown"]
+
+    def get_instruction(self, words: str) -> str:
+        return f"""You are a highly experienced Speech-Language Pathologist (SLP). An audio recording will be provided, typically consisting of a speech prompt from a pathologist followed by a child's repetition. The prompt text the child is trying to repeat is as follows: {words}. Based on your professional expertise: 1. Assess the child's speech in the recording for signs of typical development or potential speech-language disorder. 2. Conclude your analysis with one of the following labels only: A - 'typically developing' (child's speech patterns and development are within normal age-appropriate ranges), B - 'articulation' (difficulty producing specific speech sounds correctly, such as substituting, omitting, or distorting sounds), C - 'phonological' (difficulty understanding and using the sound system of language, affecting sounds of a particular type). 3. Provide your response as a single letter without any additional explanation, commentary, or unnecessary text."""  # noqa: E501
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        """
+        Create instances from the audio files and their corresponding JSON annotations.
+        The data directory should contain:
+        - Audio files (e.g., .mp3)
+        - A JSON file with annotations containing 'disorder_class' field
+        """
+        print("Downloading SAA-Lab/SLPHelmManualLabels dataset...")
+        data_path = snapshot_download(
+            repo_id="SAA-Lab/SLPHelmManualLabels",
+            repo_type="dataset",
+            revision="38c2d7dab831acf8ccff0ca6f6463d6a8a0184ed",
+        )
+
+        instances: List[Instance] = []
+        split: str = TEST_SPLIT
+
+        # Find all pairs of audio and JSON files
+        pairs = find_audio_json_pairs(data_path)
+        print(f"Num pairs: {len(pairs)}")
+
+        for audio_path, json_path in tqdm(pairs):
+            # Load the annotation
+            with open(json_path, "r") as f:
+                annotation = json.load(f)
+
+            # Get the correct answer and convert to label
+            if "disorder_type" not in annotation or "transcription" not in annotation:
+                continue
+            label = annotation["disorder_type"]
+            prompt = annotation["transcription"]
+
+            # Create references for each option
+            references: List[Reference] = []
+            correct_label = 0
+            for option in ["typically_developing", "articulation", "phonological"]:
+                reference = Reference(Output(text=option), tags=[CORRECT_TAG] if option == label else [])
+                references.append(reference)
+                if option == label:
+                    correct_label += 1
+            if correct_label == 0:
+                continue
+
+            # Create the input with audio and instruction
+            content = [
+                MediaObject(content_type="audio/mpeg", location=audio_path),
+                MediaObject(content_type="text/plain", text=self.get_instruction(prompt)),
+            ]
+
+            input = Input(multimedia_content=MultimediaObject(content))
+            instances.append(Instance(input=input, references=references, split=split))
+
+        return instances
--- /dev/null
+++ b/helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py
@@ -0,0 +1,117 @@
+from typing import List, Tuple
+import os
+import json
+
+from tqdm import tqdm
+
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from huggingface_hub import snapshot_download
+
+
+def find_audio_json_pairs(directory: str) -> List[Tuple[str, str]]:
+    """
+    Find all pairs of MP3 and JSON files in the given directory and its subdirectories.
+    Each pair consists of an MP3 file and its corresponding JSON file with the same base name.
+
+    Args:
+        directory: Path to the directory containing the files
+
+    Returns:
+        List of tuples where each tuple contains (mp3_path, json_path)
+    """
+    pairs = []
+
+    # Walk through all directories and subdirectories
+    for root, _, files in os.walk(directory):
+        # Get all MP3 files in current directory
+        mp3_files = [f for f in files if f.endswith(".mp3")]
+
+        for mp3_file in mp3_files:
+            base_name = os.path.splitext(mp3_file)[0]
+            json_file = f"{base_name}.json"
+
+            # Check if corresponding JSON file exists in the same directory
+            if json_file in files:
+                mp3_path = os.path.join(root, mp3_file)
+                json_path = os.path.join(root, json_file)
+                pairs.append((mp3_path, json_path))
+
+    return pairs
+
+
+class UltraSuiteDisorderSymptomsScenario(Scenario):
+    """
+    A scenario identifying features of speech disorders within the provided audio.
+    The audio files contain speech from children, potentially with an adult present.
+    """
+
+    name = "speech_disorder"
+    description = "A scenario for evaluating speech disorders in children"
+    tags = ["audio", "classification", "speech_disorder"]
+
+    def get_instruction(self, words: str) -> str:
+        prompt = f"""You are a highly experienced Speech-Language Pathologist (SLP). An audio recording will be provided, typically consisting of a speech prompt from a pathologist followed by a child's repetition. The prompt the child is trying to repeat is as follows: {words}. Based on your professional expertise: 1. Assess the child's speech in the recording and recognize any abnormal features in the child's speech. 2. These features can be one of the following: A - 'substitution', B - 'omission', C - 'addition', D - 'typically_developing', or E - 'stuttering'. Here, 'substitution' is when the child substitutes one word/phrase/syllable for another. 'omission' is when the child omits one word/phrase/syllable. 'addition' is when the child adds one word/phrase/syllable. 'typically_developing' is when the child's speech is typical of a child of their age. 'stuttering' is when the child stutters, has difficulty speaking, repeats sounds/words or prolongs sounds/words. 3. Provide your response as a single letter without any additional explanation, commentary, or unnecessary text."""  # noqa: E501
+
+        return prompt
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        """
+        Create instances from the audio files and their corresponding JSON annotations.
+        The data directory should contain:
+        - Audio files (e.g., .mp3)
+        - A JSON file with annotations containing 'answer' field
+        """
+        print("Downloading SAA-Lab/SLPHelmManualLabels dataset...")
+        data_path = snapshot_download(
+            repo_id="SAA-Lab/SLPHelmManualLabels",
+            repo_type="dataset",
+            revision="38c2d7dab831acf8ccff0ca6f6463d6a8a0184ed",
+        )
+
+        instances: List[Instance] = []
+        split: str = TEST_SPLIT
+
+        # Find all pairs of audio and JSON files
+        pairs = find_audio_json_pairs(data_path)
+
+        for audio_path, json_path in tqdm(pairs):
+
+            # Load the annotation
+            with open(json_path, "r") as f:
+                annotation = json.load(f)
+
+            # Get the correct answer and convert to label
+            if "disorder_symptom" not in annotation or "transcription" not in annotation:
+                continue
+            label = annotation["disorder_symptom"]
+            prompt = annotation["transcription"]
+            # Create references for each option
+            references: List[Reference] = []
+            correct_label = 0
+            for option in ["substitution", "omission", "addition", "typically_developing", "stuttering"]:
+                reference = Reference(Output(text=option), tags=[CORRECT_TAG] if option == label else [])
+                references.append(reference)
+                if option == label:
+                    correct_label += 1
+            if correct_label == 0:
+                continue
+
+            # Create the input with audio and instruction
+            content = [
+                MediaObject(content_type="audio/mpeg", location=audio_path),
+                MediaObject(content_type="text/plain", text=self.get_instruction(prompt)),
+            ]
+
+            input = Input(multimedia_content=MultimediaObject(content))
+            instances.append(Instance(input=input, references=references, split=split))
+
+        return instances
--- a/helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py
+++ b/helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py
@@ -46,6 +46,10 @@ class VocalSoundScenario(Scenario):
     description = "Classify an audio sample of a spoken digit ([Gong et al, 2022](https://arxiv.org/abs/2205.03433))."
     tags: List[str] = ["audio", "classification"]
 
+    def __init__(self, sound: str) -> None:
+        super().__init__()
+        self._sound: str = sound
+
     def get_instances(self, output_path: str) -> List[Instance]:
         instances: List[Instance] = []
         down_loading_path = os.path.join(output_path, "download")
@@ -53,7 +57,12 @@
         wav_save_dir = os.path.join(down_loading_path, "audio_16k")
         for file_name in tqdm(os.listdir(wav_save_dir)):
             local_audio_path: str = os.path.join(wav_save_dir, file_name)
-            if not file_name.endswith(".wav") or is_invalid_audio_file(local_audio_path):
+            if (
+                not file_name.endswith(".wav")
+                or is_invalid_audio_file(local_audio_path)
+                # Skip this problematic file
+                or file_name == "m0083_0_sneeze.wav"
+            ):
                 continue
 
             input = Input(
@@ -61,9 +70,14 @@
             )
 
             answer: str = file_name.split("_")[-1].split(".")[0]
+            if answer.lower() != self._sound:
+                continue
+
             if answer == "throatclearing":
                 answer = "throat clearing"
 
             references = [Reference(Output(text=str(answer)), tags=[CORRECT_TAG])]
            instances.append(Instance(input=input, references=references, split=TEST_SPLIT))
+
+        assert len(instances) > 0, f"No instances found for sound: {self._sound}"
         return instances
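
With the new sound constructor argument, VocalSoundScenario now keeps only clips of a single sound class and fails fast if none match. A hedged usage sketch (the actual wiring lives in the changed audio_run_specs.py, which is not shown in this section):

    # Hypothetical direct use: evaluate only cough recordings.
    scenario = VocalSoundScenario(sound="cough")
    instances = scenario.get_instances(output_path="./vocal_sound_data")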
--- a/helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py
+++ b/helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py
@@ -40,8 +40,7 @@
         "https://huggingface.co/datasets/LAOS-Y/VoxCeleb2-AudioIdentity/resolve/main/voxceleb2_audioidentity.csv"
     )
     IDENTITY_INSTRUCTION = (
-        "Listen to the audio and take your best guess to determine if the two speakers are the same person. "
-        "Give just the letter of your answer and nothing else."
+        "Listen to the audio and take your best guess to determine if the two speakers are the same person."
     )
 
     name = "voxceleb2"