pelican-nlp 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. pelican_nlp/Nils_backup/__init__.py +0 -0
  2. pelican_nlp/Nils_backup/extract_acoustic_features.py +274 -0
  3. pelican_nlp/Nils_backup/fluency/__init__.py +0 -0
  4. pelican_nlp/Nils_backup/fluency/aggregate_fluency_results.py +186 -0
  5. pelican_nlp/Nils_backup/fluency/behavioral_data.py +42 -0
  6. pelican_nlp/Nils_backup/fluency/check_duplicates.py +169 -0
  7. pelican_nlp/Nils_backup/fluency/coherence.py +653 -0
  8. pelican_nlp/Nils_backup/fluency/config.py +231 -0
  9. pelican_nlp/Nils_backup/fluency/main.py +182 -0
  10. pelican_nlp/Nils_backup/fluency/optimality_without_tsa.py +466 -0
  11. pelican_nlp/Nils_backup/fluency/plot_fluency.py +573 -0
  12. pelican_nlp/Nils_backup/fluency/plotting_utils.py +170 -0
  13. pelican_nlp/Nils_backup/fluency/questionnaires_data.py +43 -0
  14. pelican_nlp/Nils_backup/fluency/stats_fluency.py +930 -0
  15. pelican_nlp/Nils_backup/fluency/utils.py +41 -0
  16. pelican_nlp/Nils_backup/speaker_diarization_Nils.py +328 -0
  17. pelican_nlp/Nils_backup/transcription/__init__.py +0 -0
  18. pelican_nlp/Nils_backup/transcription/annotation_tool.py +1001 -0
  19. pelican_nlp/Nils_backup/transcription/annotation_tool_boundaries.py +1122 -0
  20. pelican_nlp/Nils_backup/transcription/annotation_tool_sandbox.py +985 -0
  21. pelican_nlp/Nils_backup/transcription/output/holmes_control_nova_all_outputs.json +7948 -0
  22. pelican_nlp/Nils_backup/transcription/test.json +1 -0
  23. pelican_nlp/Nils_backup/transcription/transcribe_audio.py +314 -0
  24. pelican_nlp/Nils_backup/transcription/transcribe_audio_chunked.py +695 -0
  25. pelican_nlp/Nils_backup/transcription/transcription.py +801 -0
  26. pelican_nlp/Nils_backup/transcription/transcription_gui.py +955 -0
  27. pelican_nlp/Nils_backup/transcription/word_boundaries.py +190 -0
  28. pelican_nlp/Silvia_files/Opensmile/opensmile_feature_extraction.py +66 -0
  29. pelican_nlp/Silvia_files/prosogram/prosogram.py +104 -0
  30. pelican_nlp/__init__.py +1 -1
  31. pelican_nlp/_version.py +1 -0
  32. pelican_nlp/configuration_files/config_audio.yml +150 -0
  33. pelican_nlp/configuration_files/config_discourse.yml +104 -0
  34. pelican_nlp/configuration_files/config_fluency.yml +108 -0
  35. pelican_nlp/configuration_files/config_general.yml +131 -0
  36. pelican_nlp/configuration_files/config_morteza.yml +103 -0
  37. pelican_nlp/praat/__init__.py +29 -0
  38. {pelican_nlp-0.1.0.dist-info → pelican_nlp-0.1.2.dist-info}/METADATA +14 -21
  39. pelican_nlp-0.1.2.dist-info/RECORD +75 -0
  40. pelican_nlp-0.1.0.dist-info/RECORD +0 -39
  41. {pelican_nlp-0.1.0.dist-info → pelican_nlp-0.1.2.dist-info}/WHEEL +0 -0
  42. {pelican_nlp-0.1.0.dist-info → pelican_nlp-0.1.2.dist-info}/licenses/LICENSE +0 -0
  43. {pelican_nlp-0.1.0.dist-info → pelican_nlp-0.1.2.dist-info}/top_level.txt +0 -0
pelican_nlp/Nils_backup/transcription/transcribe_audio_chunked.py
@@ -0,0 +1,695 @@
+# Standard Library Imports
+import io
+import re
+import unicodedata
+from typing import List, Dict
+from pathlib import Path
+# Third-party Library Imports
+import librosa
+import numpy as np
+import soundfile as sf
+import torch
+import torchaudio
+import torchaudio.transforms as T
+from pydub import AudioSegment
+from pydub.silence import detect_silence
+from transformers import pipeline
+from pyannote.audio import Pipeline as DiarizationPipeline
+import uroman as ur
+import pandas as pd
+import textgrids
+import json
+
+
+def detect_silence_intervals(audio_segment, min_silence_len=1000, silence_thresh=-40):
+    """
+    Detects silent intervals in the audio segment.
+    Returns a list of [start_ms, end_ms] pairs representing silence periods.
+    """
+    return detect_silence(
+        audio_segment,
+        min_silence_len=min_silence_len,
+        silence_thresh=silence_thresh
+    )
+
+
+def get_splitting_points(silence_ranges, audio_length_ms):
+    """
+    Computes splitting points based on silence ranges.
+    Returns a sorted list of splitting points in milliseconds.
+    """
+    # Use list comprehension for faster execution
+    splitting_points = [0] + [(start + end) // 2 for start, end in silence_ranges] + [audio_length_ms]
+    return splitting_points
+
+
+def create_initial_chunks(splitting_points):
+    """
+    Creates initial chunks based on splitting points.
+    Returns a list of (start_ms, end_ms) tuples.
+    """
+    # Utilize zip for efficient pairing
+    return list(zip(splitting_points[:-1], splitting_points[1:]))
+
+
+def adjust_intervals_by_length(intervals, min_length=30000, max_length=180000):
+    adjusted_intervals = []
+    buffer_start, buffer_end = intervals[0]
+
+    for start, end in intervals[1:]:
+        buffer_end = end
+        buffer_length = buffer_end - buffer_start
+
+        if buffer_length < min_length:
+            # Merge with the next interval by extending the buffer
+            continue
+        else:
+            # Split the buffer if it exceeds max_length
+            if buffer_length > max_length:
+                num_splits = int(np.ceil(buffer_length / max_length))
+                split_size = int(np.ceil(buffer_length / num_splits))
+                for i in range(num_splits):
+                    split_start = buffer_start + i * split_size
+                    split_end = min(buffer_start + (i + 1) * split_size, buffer_end)
+                    adjusted_intervals.append((split_start, split_end))
+            else:
+                adjusted_intervals.append((buffer_start, buffer_end))
+            # Correctly update buffer_start to prevent overlap
+            buffer_start = buffer_end  # Start from the end of the last buffer
+
+    # Handle any remaining buffer
+    if buffer_end > buffer_start:
+        buffer_length = buffer_end - buffer_start
+        if buffer_length >= min_length:
+            if buffer_length > max_length:
+                num_splits = int(np.ceil(buffer_length / max_length))
+                split_size = int(np.ceil(buffer_length / num_splits))
+                for i in range(num_splits):
+                    split_start = buffer_start + i * split_size
+                    split_end = min(buffer_start + (i + 1) * split_size, buffer_end)
+                    adjusted_intervals.append((split_start, split_end))
+            else:
+                adjusted_intervals.append((buffer_start, buffer_end))
+        else:
+            # Decide how to handle intervals shorter than min_length
+            pass
+
+    return adjusted_intervals
+
+
+def split_audio_by_intervals(audio_segment, intervals):
+    """
+    Splits the audio segment into chunks based on the provided intervals.
+    Returns a list of (chunk_audio, start_time_ms, end_time_ms) tuples.
+    """
+    return [(audio_segment[start_ms:end_ms], start_ms, end_ms) for start_ms, end_ms in intervals]
+
+
+class Chunk:
+    def __init__(self, audio_segment, start_time):
+        self.audio_segment = audio_segment
+        self.start_time = start_time  # Start time in the original audio (seconds)
+        self.transcript = ""
+        self.whisper_alignments = []
+        self.forced_alignments = []
+
+
+# Define the AudioFile class
+class AudioFile:
+    def __init__(self, file_path: str, target_rms_db: float = -20):
+        self.file_path = file_path
+        self.target_rms_db = target_rms_db
+        self.normalized_path = None
+        self.audio = None
+        self.sample_rate = None
+        self.chunks: List[Chunk] = []
+        self.transcript_text = ""
+        self.whisper_alignments = []
+        self.forced_alignments = []
+        self.speaker_segments = []
+        self.combined_data = []
+        self.combined_utterances = []
+        self.load_audio()
+
+    def load_audio(self):
+        # Load the audio file
+        self.audio, self.sample_rate = librosa.load(self.file_path, sr=None)
+        print(f"Loaded audio file {self.file_path}")
+
+    def rms_normalization(self):
+        # Convert target RMS dB to linear scale
+        target_rms = 10 ** (self.target_rms_db / 20)
+
+        # Calculate current RMS of the audio
+        rms = np.sqrt(np.mean(self.audio ** 2))
+
+        # Calculate gain required to reach target RMS
+        gain = target_rms / rms
+        normalized_audio = self.audio * gain
+
+        # Save the normalized audio to a temporary file
+        self.normalized_path = self.file_path.replace(".wav", "_normalized.wav")
+        sf.write(self.normalized_path, normalized_audio, self.sample_rate)
+        print(f"Normalized audio saved as {self.normalized_path}")
+
+    def split_on_silence(self, min_silence_len=1000, silence_thresh=-30,
+                         min_length=30000, max_length=180000):
+        # Load the normalized audio using pydub
+        audio_segment = AudioSegment.from_file(self.normalized_path)
+        audio_length_ms = len(audio_segment)
+
+        # Step 1: Detect silence intervals
+        silence_ranges = detect_silence_intervals(
+            audio_segment,
+            min_silence_len=min_silence_len,
+            silence_thresh=silence_thresh
+        )
+
+        # Step 2: Identify splitting points
+        splitting_points = get_splitting_points(silence_ranges, audio_length_ms)
+
+        # Step 3: Create initial chunks covering the entire audio
+        initial_intervals = create_initial_chunks(splitting_points)
+
+        # Step 4: Adjust intervals based on length constraints
+        adjusted_intervals = adjust_intervals_by_length(
+            initial_intervals,
+            min_length=min_length,
+            max_length=max_length
+        )
+
+        # Step 5: Extract chunks
+        chunks_with_timestamps = split_audio_by_intervals(audio_segment, adjusted_intervals)
+
+        # Create Chunk instances with accurate start times
+        self.chunks = []
+        for chunk_audio, start_i, end_i in chunks_with_timestamps:
+            self.chunks.append(Chunk(chunk_audio, start_i / 1000.0))  # Convert ms to seconds
+        print(f"Total chunks after splitting: {len(self.chunks)}")
+
+    def combine_chunks(self):
+        # Combine transcripts
+        self.transcript_text = " ".join([chunk.transcript for chunk in self.chunks])
+        # Combine word alignments
+        for chunk in self.chunks:
+            self.whisper_alignments.extend(chunk.whisper_alignments)
+            self.forced_alignments.extend(chunk.forced_alignments)
+
+    def aggregate_to_utterances(self):
+        if not self.combined_data:
+            print("No combined data available to adjust.")
+            return
+
+        utterances = []
+        current_utterance = {
+            "text": "",
+            "start_time": None,
+            "end_time": None,
+            "speakers": {}
+        }
+
+        sentence_endings = re.compile(r'[.?!]$')
+        print(self.combined_data)
+        for word_data in self.combined_data:
+            word = word_data["word"]
+            start_time = word_data["start_time"]
+            end_time = word_data["end_time"]
+            speaker = word_data["speaker"]
+
+            # Initialize the current utterance start time if not already set
+            if current_utterance["start_time"] is None:
+                current_utterance["start_time"] = start_time
+
+            # Append the word to the current utterance text
+            current_utterance["text"] += ("" if current_utterance["text"] == "" else " ") + word
+
+            # Update the end time of the current utterance
+            current_utterance["end_time"] = end_time
+
+            # Update the speaker count for this utterance
+            if speaker not in current_utterance["speakers"]:
+                current_utterance["speakers"][speaker] = 0
+            current_utterance["speakers"][speaker] += 1
+
+            # Check if this word ends the sentence
+            if sentence_endings.search(word):
+                # Determine the majority speaker
+                majority_speaker, majority_count = max(
+                    current_utterance["speakers"].items(), key=lambda item: item[1]
+                )
+                total_words = sum(current_utterance["speakers"].values())
+                confidence = majority_count / total_words
+
+                # Append the completed utterance
+                utterances.append({
+                    "text": current_utterance["text"],
+                    "start_time": current_utterance["start_time"],
+                    "end_time": current_utterance["end_time"],
+                    "speaker": majority_speaker,
+                    "confidence": round(confidence, 2),
+                })
+
+                # Reset the current utterance
+                current_utterance = {
+                    "text": "",
+                    "start_time": None,
+                    "end_time": None,
+                    "speakers": {}
+                }
+
+        # Handle any remaining words as the last utterance
+        if current_utterance["text"]:
+            majority_speaker, majority_count = max(
+                current_utterance["speakers"].items(), key=lambda item: item[1]
+            )
+            total_words = sum(current_utterance["speakers"].values())
+            confidence = majority_count / total_words
+
+            utterances.append({
+                "text": current_utterance["text"],
+                "start_time": current_utterance["start_time"],
+                "end_time": current_utterance["end_time"],
+                "speaker": majority_speaker,
+                "confidence": round(confidence, 2),
+            })
+
+        self.combined_utterances = utterances
+
+    def combine_alignment_and_diarization(self, alignment_source: str):
+        """
+        Combines alignment and diarization data by assigning speaker labels to each word.
+
+        Parameters:
+        - alignment_source (str): The alignment data to use. Options:
+            - 'whisper_alignments'
+            - 'forced_alignments'
+
+        Updates:
+        - self.combined_data: List of words with assigned speakers.
+        """
+        # Validate the alignment_source
+        if alignment_source not in ['whisper_alignments', 'forced_alignments']:
+            raise ValueError("Invalid alignment_source. Choose 'whisper_alignments' or 'forced_alignments'.")
+
+        # Select the appropriate alignment list
+        alignment = getattr(self, alignment_source, None)
+        if alignment is None:
+            raise ValueError(f"The alignment source '{alignment_source}' does not exist in the AudioFile object.")
+
+        if not self.speaker_segments:
+            print("No speaker segments available for diarization. All words will be labeled as 'UNKNOWN'.")
+            # Assign 'UNKNOWN' to all words
+            self.combined_data = [
+                {**word, 'speaker': 'UNKNOWN'} for word in alignment
+            ]
+            return
+
+        combined = []
+        seg_idx = 0
+        num_segments = len(self.speaker_segments)
+        speaker_segments = self.speaker_segments
+
+        for word in alignment:
+            word_start = word['start_time']
+            word_end = word['end_time']
+            word_duration = max(1e-6, word_end - word_start)  # Avoid zero-duration
+
+            speaker_overlap = {}
+
+            while seg_idx < num_segments and speaker_segments[seg_idx]['end'] < word_start:
+                seg_idx += 1
+
+            temp_idx = seg_idx
+            while temp_idx < num_segments and speaker_segments[temp_idx]['start'] < word_end:
+                seg = speaker_segments[temp_idx]
+                seg_start = seg['start']
+                seg_end = seg['end']
+                speaker = seg['speaker']
+
+                if seg_start <= word_start < seg_end:  # Handle zero-duration case
+                    overlap = word_duration  # Treat as full overlap for zero-duration
+                else:
+                    overlap_start = max(word_start, seg_start)
+                    overlap_end = min(word_end, seg_end)
+                    overlap = max(0.0, overlap_end - overlap_start)
+
+                if overlap > 0:
+                    speaker_overlap[speaker] = speaker_overlap.get(speaker, 0.0) + overlap
+
+                temp_idx += 1
+
+            assigned_speaker = max(speaker_overlap, key=speaker_overlap.get) if speaker_overlap else 'UNKNOWN'
+
+            word_with_speaker = word.copy()
+            word_with_speaker['speaker'] = assigned_speaker
+            combined.append(word_with_speaker)
+
+        self.combined_data = combined
+        print(self.combined_data)
+        print(f"Combined alignment and diarization data with {len(self.combined_data)} entries.")
+        # self.adjust_overlapping_intervals()
+        self.aggregate_to_utterances()
+
+
+    def save_all_data_as_json(self, output_file="all_audio_data.json"):
+        """
+        Saves multiple attributes of the AudioFile instance into a single JSON file.
+
+        Parameters:
+        - output_file: str, optional
+            The path to save the JSON file. Defaults to 'all_audio_data.json'.
+        """
+        # Check if combined data exists
+        if not self.combined_data:
+            print("No combined data available to save. Ensure 'combine_alignment_and_diarization' is run first.")
+            return
+
+        # Prepare the data dictionary
+        data = {
+            "transcript_text": self.transcript_text,
+            "whisper_alignments": self.whisper_alignments,
+            "forced_alignments": self.forced_alignments,
+            "speaker_segments": self.speaker_segments,
+            "combined_data": self.combined_data,
+            "utterance_data": self.combined_utterances
+        }
+
+        # Convert the data to JSON format
+        try:
+            json_output = json.dumps(data, indent=4)
+        except TypeError as e:
+            print(f"Error serializing data to JSON: {e}")
+            return
+
+        # Optional: Warn if output_file already exists
+        if Path(output_file).exists():
+            print(f"Warning: '{output_file}' already exists and will be overwritten.")
+
+        # Save JSON to the specified file
+        try:
+            with open(output_file, "w", encoding="utf-8") as f:
+                f.write(json_output)
+            print(f"All audio data successfully saved to '{output_file}'.")
+        except Exception as e:
+            print(f"Error saving JSON file: {e}")
+
+        print(self.combined_data)
+
+# Define the AudioTranscriber class
+class AudioTranscriber:
+    """
+    Handles transcription of audio chunks using Whisper.
+    """
+    def __init__(self):
+        # Use 'cuda' if available, else 'mps' for Apple devices, else 'cpu'
+        if torch.cuda.is_available():
+            self.device = torch.device("cuda")
+        elif torch.backends.mps.is_available():
+            self.device = torch.device("mps")
+        else:
+            self.device = torch.device("cpu")
+
+        self.transcriber = pipeline(
+            "automatic-speech-recognition",
+            model="openai/whisper-medium",
+            device=self.device,
+            return_timestamps="word"  # Ensure word-level timestamps are returned
+        )
+        print(f"Initialized AudioTranscriber on device: {self.device}")
+
+    def transcribe(self, audio_file: AudioFile):
+        """
+        Transcribes each speech chunk and extracts word-level timestamps.
+        """
+        print("Transcribing audio chunks...")
+        for idx, chunk in enumerate(audio_file.chunks, start=1):
+            try:
+                with io.BytesIO() as wav_io:
+                    chunk.audio_segment.export(wav_io, format="wav")
+                    wav_io.seek(0)
+                    # Pass the audio data to the transcriber
+                    transcription_result = self.transcriber(wav_io.read())
+
+                chunk.transcript = transcription_result.get('text', "").strip()
+
+                # Extract word alignments from 'chunks' instead of 'words'
+                raw_chunks = transcription_result.get('chunks', [])
+                clean_chunks = []
+                for word_info in raw_chunks:
+                    # Ensure 'timestamp' exists and has two elements
+                    if 'timestamp' in word_info and len(word_info['timestamp']) == 2:
+                        # Convert timestamps to float before addition
+                        start_time = float(word_info['timestamp'][0]) + chunk.start_time
+                        end_time = float(word_info['timestamp'][1]) + chunk.start_time
+                        word_text = word_info.get('text', "").strip()
+                        if word_text:  # Ensure word_text is not empty
+                            clean_chunks.append({
+                                "word": word_text,
+                                "start_time": start_time,
+                                "end_time": end_time
+                            })
+                chunk.whisper_alignments = clean_chunks
+                print(f"Transcribed chunk {idx} successfully with {len(clean_chunks)} words.")
+            except Exception as e:
+                print(f"Error during transcription of chunk {idx}: {e}")
+                chunk.transcript = ""
+                chunk.whisper_alignments = []
+
+
+# Define the ForcedAligner class
+class ForcedAligner:
+    def __init__(self, device: str = None):
+        # Use CUDA when available; otherwise fall back to CPU (MPS is skipped due to potential compatibility issues)
+        if torch.cuda.is_available():
+            self.device = torch.device("cuda")
+        else:
+            self.device = torch.device("cpu")
+
+        self.bundle = torchaudio.pipelines.MMS_FA
+        self.model = self.bundle.get_model().to(self.device)
+        self.tokenizer = self.bundle.get_tokenizer()
+        self.aligner = self.bundle.get_aligner()
+        self.uroman = ur.Uroman()
+        self.sample_rate = self.bundle.sample_rate
+        print(f"Initialized ForcedAligner on device: {self.device}")
+
+    def normalize_uroman(self, text: str) -> str:
+        text = text.encode('utf-8').decode('utf-8')
+        text = text.lower()
+        text = text.replace("’", "'")
+        text = unicodedata.normalize('NFC', text)
+        text = re.sub("([^a-z' ])", " ", text)
+        text = re.sub(' +', ' ', text)
+        return text.strip()
+
+    def align(self, audio_file: AudioFile):
+        for idx, chunk in enumerate(audio_file.chunks):
+            print(f"Aligning chunk {idx + 1}/{len(audio_file.chunks)}")
+            # Export chunk to a WAV file in memory
+            with io.BytesIO() as wav_io:
+                chunk.audio_segment.export(wav_io, format="wav")
+                wav_io.seek(0)
+                waveform, sample_rate = torchaudio.load(wav_io)
+            # Resample if needed
+            if sample_rate != self.sample_rate:
+                resampler = T.Resample(orig_freq=sample_rate, new_freq=self.sample_rate)
+                waveform = resampler(waveform)
+                sample_rate = self.sample_rate
+            # Normalize and tokenize the transcript
+            text_roman = self.uroman.romanize_string(chunk.transcript)
+            text_normalized = self.normalize_uroman(text_roman)
+            transcript_list = text_normalized.split()
+            tokens = self.tokenizer(transcript_list)
+            # Perform forced alignment
+            with torch.inference_mode():
+                emission, _ = self.model(waveform.to(self.device))
+                token_spans = self.aligner(emission[0], tokens)
+            # Extract timestamps
+            num_frames = emission.size(1)
+            ratio = waveform.size(1) / num_frames
+            for spans, word in zip(token_spans, transcript_list):
+                start_sec = (spans[0].start * ratio / sample_rate) + chunk.start_time
+                end_sec = (spans[-1].end * ratio / sample_rate) + chunk.start_time
+                chunk.forced_alignments.append({
+                    "word": word,
+                    "start_time": start_sec,
+                    "end_time": end_sec
+                })
+
+
+# Define the SpeakerDiarizer class
+class SpeakerDiarizer:
+    def __init__(self, hf_token: str, parameters):
+        if torch.cuda.is_available():
+            self.device = torch.device("cuda")
+        elif torch.backends.mps.is_available():
+            self.device = torch.device("mps")
+        else:
+            self.device = torch.device("cpu")
+
+        self.diarization_pipeline = DiarizationPipeline.from_pretrained(
+            "pyannote/speaker-diarization-3.1",
+            use_auth_token=hf_token
+        )
+        print("Initializing SpeakerDiarizer with parameters...")
+        self.diarization_pipeline.instantiate(parameters)
+        self.diarization_pipeline.to(self.device)
+        print("Initialized SpeakerDiarizer successfully.")
+
+    def diarize(self, audio_file: AudioFile, num_speakers: int = None):
+        """
+        Performs speaker diarization on the given audio file.
+
+        Parameters:
+        - audio_file (AudioFile): The audio file to diarize.
+        - num_speakers (int, optional): The expected number of speakers.
+        """
+        print("Performing diarization on the entire audio file")
+        try:
+            if num_speakers is not None:
+                diarization_result = self.diarization_pipeline(
+                    audio_file.normalized_path,
+                    num_speakers=num_speakers
+                )
+                print(f"Diarization completed with {num_speakers} speakers.")
+            else:
+                diarization_result = self.diarization_pipeline(
+                    audio_file.normalized_path
+                )
+                print("Diarization completed without specifying number of speakers.")
+
+            # Prepare speaker segments
+            audio_file.speaker_segments = []
+            for segment, _, speaker in diarization_result.itertracks(yield_label=True):
+                audio_file.speaker_segments.append({
+                    "start": segment.start,
+                    "end": segment.end,
+                    "speaker": speaker
+                })
+            print(f"Detected {len(audio_file.speaker_segments)} speaker segments.")
+
+        except Exception as e:
+            print(f"An error occurred during diarization: {e}")
+
+
+def process_audio_files(files, hf_token="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", diarizer_params={
+        "segmentation": {
+            "min_duration_off": 0.0,
+        },
+        "clustering": {
+            "method": "centroid",
+            "min_cluster_size": 12,
+            "threshold": 0.8,
+        }},
+
+        num_speakers=2,
+
+        output_folder="output",
+        min_silence_len=1000,
+        silence_thresh=-30,
+        min_length=90000,
+        max_length=150000,
+
+        timestamp_source="whisper_alignments"
+):
+    """
+    Processes one or more audio files through all steps of the pipeline.
+
+    Parameters:
+    - files: str or List[str]
+        A single file path or a list of file paths to process.
+    - hf_token: str, optional
+        Hugging Face token for accessing diarization models.
+    - diarizer_params: dict, optional
+        Parameters for the SpeakerDiarizer model.
+    - output_folder: str, optional
+        Folder to save the output JSON files. Defaults to 'output'.
+    - min_silence_len: int, optional
+        Minimum silence length for splitting. Defaults to 1000ms.
+    - silence_thresh: int, optional
+        Silence threshold in dB for splitting. Defaults to -30dB.
+    - min_length: int, optional
+        Minimum chunk length in milliseconds. Defaults to 90000ms.
+    - max_length: int, optional
+        Maximum chunk length in milliseconds. Defaults to 150000ms.
+    """
+
+    Path(output_folder).mkdir(exist_ok=True)  # Create output folder if it doesn't exist
+
+    # Ensure `files` is a list
+    if isinstance(files, str):
+        files = [files]
+
+    # Initialize necessary classes
+    transcriber = AudioTranscriber()
+    aligner = ForcedAligner()
+    diarizer = SpeakerDiarizer(hf_token, parameters=diarizer_params)
+
+    # Process each file
+    for file_path in files:
+        print(f"Processing file: {file_path}")
+
+        # Step 1: Load and normalize audio
+        audio_file = AudioFile(file_path)
+        print("Step 1/6: Normalizing audio...")
+        audio_file.rms_normalization()
+
+        # Step 2: Split audio into chunks
+        print("Step 2/6: Splitting audio on silence...")
+        audio_file.split_on_silence(
+            min_silence_len=min_silence_len,
+            silence_thresh=silence_thresh,
+            min_length=min_length,
+            max_length=max_length
+        )
+
+        # Step 3: Transcribe chunks
+        print("Step 3/6: Transcribing audio chunks...")
+        transcriber.transcribe(audio_file)
+
+        for chunk in audio_file.chunks:
+            print(chunk.transcript)
+            print("\n")
+
+        # Step 4: Align transcription with audio
+        print("Step 4/6: Performing forced alignment...")
+        aligner.align(audio_file)
+
+        # Step 5: Perform speaker diarization
+        print("Step 5/6: Performing speaker diarization...")
+        diarizer.diarize(audio_file, num_speakers)
+
+        # Step 6: Combine alignment and diarization
+        print("Step 6/6: Combining alignment and diarization data...")
+        audio_file.combine_chunks()
+        audio_file.combine_alignment_and_diarization(timestamp_source)
+
+        # Save output
+        all_output_file = Path(output_folder) / f"{Path(file_path).stem}_all_outputs.json"
+        print(f"Saving results to: {all_output_file}")
+
+        # audio_file.create_textgrid(f"{Path(file_path).stem}_output.TextGrid")
+        audio_file.save_all_data_as_json(all_output_file)
+        print(f"Finished processing: {file_path}\n{'-' * 40}")
+
+
+# Example Usage
+if __name__ == "__main__":
+    import os
+
+    # Define input and output paths
+    audio_file_path = "audio.wav"  # Replace with your actual audio file path
+    output_directory = "output"
+
+    # Create output directory if it doesn't exist
+    os.makedirs(output_directory, exist_ok=True)
+
+    # List of files to process
+    files = [audio_file_path]
+
+    # Process the audio files
+    process_audio_files(
+        files=files,
+        hf_token="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",  # Replace with your actual Hugging Face token
+        output_folder=output_directory,
+    )