pelican-nlp 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pelican_nlp/Nils_backup/__init__.py +0 -0
- pelican_nlp/Nils_backup/extract_acoustic_features.py +274 -0
- pelican_nlp/Nils_backup/fluency/__init__.py +0 -0
- pelican_nlp/Nils_backup/fluency/aggregate_fluency_results.py +186 -0
- pelican_nlp/Nils_backup/fluency/behavioral_data.py +42 -0
- pelican_nlp/Nils_backup/fluency/check_duplicates.py +169 -0
- pelican_nlp/Nils_backup/fluency/coherence.py +653 -0
- pelican_nlp/Nils_backup/fluency/config.py +231 -0
- pelican_nlp/Nils_backup/fluency/main.py +182 -0
- pelican_nlp/Nils_backup/fluency/optimality_without_tsa.py +466 -0
- pelican_nlp/Nils_backup/fluency/plot_fluency.py +573 -0
- pelican_nlp/Nils_backup/fluency/plotting_utils.py +170 -0
- pelican_nlp/Nils_backup/fluency/questionnaires_data.py +43 -0
- pelican_nlp/Nils_backup/fluency/stats_fluency.py +930 -0
- pelican_nlp/Nils_backup/fluency/utils.py +41 -0
- pelican_nlp/Nils_backup/speaker_diarization_Nils.py +328 -0
- pelican_nlp/Nils_backup/transcription/__init__.py +0 -0
- pelican_nlp/Nils_backup/transcription/annotation_tool.py +1001 -0
- pelican_nlp/Nils_backup/transcription/annotation_tool_boundaries.py +1122 -0
- pelican_nlp/Nils_backup/transcription/annotation_tool_sandbox.py +985 -0
- pelican_nlp/Nils_backup/transcription/output/holmes_control_nova_all_outputs.json +7948 -0
- pelican_nlp/Nils_backup/transcription/test.json +1 -0
- pelican_nlp/Nils_backup/transcription/transcribe_audio.py +314 -0
- pelican_nlp/Nils_backup/transcription/transcribe_audio_chunked.py +695 -0
- pelican_nlp/Nils_backup/transcription/transcription.py +801 -0
- pelican_nlp/Nils_backup/transcription/transcription_gui.py +955 -0
- pelican_nlp/Nils_backup/transcription/word_boundaries.py +190 -0
- pelican_nlp/Silvia_files/Opensmile/opensmile_feature_extraction.py +66 -0
- pelican_nlp/Silvia_files/prosogram/prosogram.py +104 -0
- pelican_nlp/__init__.py +1 -1
- pelican_nlp/_version.py +1 -0
- pelican_nlp/configuration_files/config_audio.yml +150 -0
- pelican_nlp/configuration_files/config_discourse.yml +104 -0
- pelican_nlp/configuration_files/config_fluency.yml +108 -0
- pelican_nlp/configuration_files/config_general.yml +131 -0
- pelican_nlp/configuration_files/config_morteza.yml +103 -0
- pelican_nlp/praat/__init__.py +29 -0
- {pelican_nlp-0.1.1.dist-info → pelican_nlp-0.1.2.dist-info}/METADATA +4 -3
- pelican_nlp-0.1.2.dist-info/RECORD +75 -0
- pelican_nlp-0.1.1.dist-info/RECORD +0 -39
- {pelican_nlp-0.1.1.dist-info → pelican_nlp-0.1.2.dist-info}/WHEEL +0 -0
- {pelican_nlp-0.1.1.dist-info → pelican_nlp-0.1.2.dist-info}/licenses/LICENSE +0 -0
- {pelican_nlp-0.1.1.dist-info → pelican_nlp-0.1.2.dist-info}/top_level.txt +0 -0
pelican_nlp/Nils_backup/transcription/transcribe_audio_chunked.py
@@ -0,0 +1,695 @@
# Standard Library Imports
import io
import re
import unicodedata
from typing import List, Dict
from pathlib import Path
# Third-party Library Imports
import librosa
import numpy as np
import soundfile as sf
import torch
import torchaudio
import torchaudio.transforms as T
from pydub import AudioSegment
from pydub.silence import detect_silence
from transformers import pipeline
from pyannote.audio import Pipeline as DiarizationPipeline
import uroman as ur
import pandas as pd
import textgrids
import json


def detect_silence_intervals(audio_segment, min_silence_len=1000, silence_thresh=-40):
    """
    Detects silent intervals in the audio segment.
    Returns a list of [start_ms, end_ms] pairs representing silence periods.
    """
    return detect_silence(
        audio_segment,
        min_silence_len=min_silence_len,
        silence_thresh=silence_thresh
    )


def get_splitting_points(silence_ranges, audio_length_ms):
    """
    Computes splitting points based on silence ranges.
    Returns a sorted list of splitting points in milliseconds.
    """
    # Use list comprehension for faster execution
    splitting_points = [0] + [(start + end) // 2 for start, end in silence_ranges] + [audio_length_ms]
    return splitting_points


def create_initial_chunks(splitting_points):
    """
    Creates initial chunks based on splitting points.
    Returns a list of (start_ms, end_ms) tuples.
    """
    # Utilize zip for efficient pairing
    return list(zip(splitting_points[:-1], splitting_points[1:]))


def adjust_intervals_by_length(intervals, min_length=30000, max_length=180000):
    """
    Merges short intervals and splits long ones so that chunks fall between
    min_length and max_length (in milliseconds).
    Returns a list of (start_ms, end_ms) tuples.
    """
    adjusted_intervals = []
    buffer_start, buffer_end = intervals[0]

    for start, end in intervals[1:]:
        buffer_end = end
        buffer_length = buffer_end - buffer_start

        if buffer_length < min_length:
            # Merge with the next interval by extending the buffer
            continue
        else:
            # Split the buffer if it exceeds max_length
            if buffer_length > max_length:
                num_splits = int(np.ceil(buffer_length / max_length))
                split_size = int(np.ceil(buffer_length / num_splits))
                for i in range(num_splits):
                    split_start = buffer_start + i * split_size
                    split_end = min(buffer_start + (i + 1) * split_size, buffer_end)
                    adjusted_intervals.append((split_start, split_end))
            else:
                adjusted_intervals.append((buffer_start, buffer_end))
            # Correctly update buffer_start to prevent overlap
            buffer_start = buffer_end  # Start from the end of the last buffer

    # Handle any remaining buffer
    if buffer_end > buffer_start:
        buffer_length = buffer_end - buffer_start
        if buffer_length >= min_length:
            if buffer_length > max_length:
                num_splits = int(np.ceil(buffer_length / max_length))
                split_size = int(np.ceil(buffer_length / num_splits))
                for i in range(num_splits):
                    split_start = buffer_start + i * split_size
                    split_end = min(buffer_start + (i + 1) * split_size, buffer_end)
                    adjusted_intervals.append((split_start, split_end))
            else:
                adjusted_intervals.append((buffer_start, buffer_end))
        else:
            # Decide how to handle intervals shorter than min_length
            pass

    return adjusted_intervals


def split_audio_by_intervals(audio_segment, intervals):
    """
    Splits the audio segment into chunks based on the provided intervals.
    Returns a list of (chunk_audio, start_time_ms, end_time_ms) tuples.
    """
    return [(audio_segment[start_ms:end_ms], start_ms, end_ms) for start_ms, end_ms in intervals]


class Chunk:
    def __init__(self, audio_segment, start_time):
        self.audio_segment = audio_segment
        self.start_time = start_time  # Start time in the original audio (seconds)
        self.transcript = ""
        self.whisper_alignments = []
        self.forced_alignments = []


# Define the AudioFile class
class AudioFile:
    def __init__(self, file_path: str, target_rms_db: float = -20):
        self.file_path = file_path
        self.target_rms_db = target_rms_db
        self.normalized_path = None
        self.audio = None
        self.sample_rate = None
        self.chunks: List[Chunk] = []
        self.transcript_text = ""
        self.whisper_alignments = []
        self.forced_alignments = []
        self.speaker_segments = []
        self.combined_data = []
        self.combined_utterances = []
        self.load_audio()

    def load_audio(self):
        # Load the audio file
        self.audio, self.sample_rate = librosa.load(self.file_path, sr=None)
        print(f"Loaded audio file {self.file_path}")

    def rms_normalization(self):
        # Convert target RMS dB to linear scale
        target_rms = 10 ** (self.target_rms_db / 20)

        # Calculate current RMS of the audio
        rms = np.sqrt(np.mean(self.audio ** 2))

        # Calculate gain required to reach target RMS
        gain = target_rms / rms
        normalized_audio = self.audio * gain

        # Save the normalized audio to a temporary file
        self.normalized_path = self.file_path.replace(".wav", "_normalized.wav")
        sf.write(self.normalized_path, normalized_audio, self.sample_rate)
        print(f"Normalized audio saved as {self.normalized_path}")

    def split_on_silence(self, min_silence_len=1000, silence_thresh=-30,
                         min_length=30000, max_length=180000):
        # Load the normalized audio using pydub
        audio_segment = AudioSegment.from_file(self.normalized_path)
        audio_length_ms = len(audio_segment)

        # Step 1: Detect silence intervals
        silence_ranges = detect_silence_intervals(
            audio_segment,
            min_silence_len=min_silence_len,
            silence_thresh=silence_thresh
        )

        # Step 2: Identify splitting points
        splitting_points = get_splitting_points(silence_ranges, audio_length_ms)

        # Step 3: Create initial chunks covering the entire audio
        initial_intervals = create_initial_chunks(splitting_points)

        # Step 4: Adjust intervals based on length constraints
        adjusted_intervals = adjust_intervals_by_length(
            initial_intervals,
            min_length=min_length,
            max_length=max_length
        )

        # Step 5: Extract chunks
        chunks_with_timestamps = split_audio_by_intervals(audio_segment, adjusted_intervals)

        # Create Chunk instances with accurate start times
        self.chunks = []
        for chunk_audio, start_i, end_i in chunks_with_timestamps:
            self.chunks.append(Chunk(chunk_audio, start_i / 1000.0))  # Convert ms to seconds
        print(f"Total chunks after splitting: {len(self.chunks)}")

    def combine_chunks(self):
        # Combine transcripts
        self.transcript_text = " ".join([chunk.transcript for chunk in self.chunks])
        # Combine word alignments
        for chunk in self.chunks:
            self.whisper_alignments.extend(chunk.whisper_alignments)
            self.forced_alignments.extend(chunk.forced_alignments)

    def aggregate_to_utterances(self):
        if not self.combined_data:
            print("No combined data available to adjust.")
            return

        utterances = []
        current_utterance = {
            "text": "",
            "start_time": None,
            "end_time": None,
            "speakers": {}
        }

        sentence_endings = re.compile(r'[.?!]$')
        print(self.combined_data)
        for word_data in self.combined_data:
            word = word_data["word"]
            start_time = word_data["start_time"]
            end_time = word_data["end_time"]
            speaker = word_data["speaker"]

            # Initialize the current utterance start time if not already set
            if current_utterance["start_time"] is None:
                current_utterance["start_time"] = start_time

            # Append the word to the current utterance text
            current_utterance["text"] += ("" if current_utterance["text"] == "" else " ") + word

            # Update the end time of the current utterance
            current_utterance["end_time"] = end_time

            # Update the speaker count for this utterance
            if speaker not in current_utterance["speakers"]:
                current_utterance["speakers"][speaker] = 0
            current_utterance["speakers"][speaker] += 1

            # Check if this word ends the sentence
            if sentence_endings.search(word):
                # Determine the majority speaker
                majority_speaker, majority_count = max(
                    current_utterance["speakers"].items(), key=lambda item: item[1]
                )
                total_words = sum(current_utterance["speakers"].values())
                confidence = majority_count / total_words

                # Append the completed utterance
                utterances.append({
                    "text": current_utterance["text"],
                    "start_time": current_utterance["start_time"],
                    "end_time": current_utterance["end_time"],
                    "speaker": majority_speaker,
                    "confidence": round(confidence, 2),
                })

                # Reset the current utterance
                current_utterance = {
                    "text": "",
                    "start_time": None,
                    "end_time": None,
                    "speakers": {}
                }

        # Handle any remaining words as the last utterance
        if current_utterance["text"]:
            majority_speaker, majority_count = max(
                current_utterance["speakers"].items(), key=lambda item: item[1]
            )
            total_words = sum(current_utterance["speakers"].values())
            confidence = majority_count / total_words

            utterances.append({
                "text": current_utterance["text"],
                "start_time": current_utterance["start_time"],
                "end_time": current_utterance["end_time"],
                "speaker": majority_speaker,
                "confidence": round(confidence, 2),
            })

        self.combined_utterances = utterances

    def combine_alignment_and_diarization(self, alignment_source: str):
        """
        Combines alignment and diarization data by assigning speaker labels to each word.

        Parameters:
        - alignment_source (str): The alignment data to use. Options:
            - 'whisper_alignments'
            - 'forced_alignments'

        Updates:
        - self.combined_data: List of words with assigned speakers.
        """
        # Validate the alignment_source
        if alignment_source not in ['whisper_alignments', 'forced_alignments']:
            raise ValueError("Invalid alignment_source. Choose 'whisper_alignments' or 'forced_alignments'.")

        # Select the appropriate alignment list
        alignment = getattr(self, alignment_source, None)
        if alignment is None:
            raise ValueError(f"The alignment source '{alignment_source}' does not exist in the AudioFile object.")

        if not self.speaker_segments:
            print("No speaker segments available for diarization. All words will be labeled as 'UNKNOWN'.")
            # Assign 'UNKNOWN' to all words
            self.combined_data = [
                {**word, 'speaker': 'UNKNOWN'} for word in alignment
            ]
            return

        combined = []
        seg_idx = 0
        num_segments = len(self.speaker_segments)
        speaker_segments = self.speaker_segments

        for word in alignment:
            word_start = word['start_time']
            word_end = word['end_time']
            word_duration = max(1e-6, word_end - word_start)  # Avoid zero-duration

            speaker_overlap = {}

            while seg_idx < num_segments and speaker_segments[seg_idx]['end'] < word_start:
                seg_idx += 1

            temp_idx = seg_idx
            while temp_idx < num_segments and speaker_segments[temp_idx]['start'] < word_end:
                seg = speaker_segments[temp_idx]
                seg_start = seg['start']
                seg_end = seg['end']
                speaker = seg['speaker']

                if seg_start <= word_start < seg_end:  # Handle zero-duration case
                    overlap = word_duration  # Treat as full overlap for zero-duration
                else:
                    overlap_start = max(word_start, seg_start)
                    overlap_end = min(word_end, seg_end)
                    overlap = max(0.0, overlap_end - overlap_start)

                if overlap > 0:
                    speaker_overlap[speaker] = speaker_overlap.get(speaker, 0.0) + overlap

                temp_idx += 1

            assigned_speaker = max(speaker_overlap, key=speaker_overlap.get) if speaker_overlap else 'UNKNOWN'

            word_with_speaker = word.copy()
            word_with_speaker['speaker'] = assigned_speaker
            combined.append(word_with_speaker)

        self.combined_data = combined
        print(self.combined_data)
        print(f"Combined alignment and diarization data with {len(self.combined_data)} entries.")
        # self.adjust_overlapping_intervals()
        self.aggregate_to_utterances()

    def save_all_data_as_json(self, output_file="all_audio_data.json"):
        """
        Saves multiple attributes of the AudioFile instance into a single JSON file.

        Parameters:
        - output_file: str, optional
            The path to save the JSON file. Defaults to 'all_audio_data.json'.
        """
        # Check if combined data exists
        if not self.combined_data:
            print("No combined data available to save. Ensure 'combine_alignment_and_diarization' is run first.")
            return

        # Prepare the data dictionary
        data = {
            "transcript_text": self.transcript_text,
            "whisper_alignments": self.whisper_alignments,
            "forced_alignments": self.forced_alignments,
            "speaker_segments": self.speaker_segments,
            "combined_data": self.combined_data,
            "utterance_data": self.combined_utterances
        }

        # Convert the data to JSON format
        try:
            json_output = json.dumps(data, indent=4)
        except TypeError as e:
            print(f"Error serializing data to JSON: {e}")
            return

        # Optional: Warn if output_file already exists
        if Path(output_file).exists():
            print(f"Warning: '{output_file}' already exists and will be overwritten.")

        # Save JSON to the specified file
        try:
            with open(output_file, "w", encoding="utf-8") as f:
                f.write(json_output)
            print(f"All audio data successfully saved to '{output_file}'.")
        except Exception as e:
            print(f"Error saving JSON file: {e}")

        print(self.combined_data)


# Define the AudioTranscriber class
class AudioTranscriber:
    """
    Handles transcription of audio chunks using Whisper.
    """
    def __init__(self):
        # Use 'cuda' if available, else 'mps' for Apple devices, else 'cpu'
        if torch.cuda.is_available():
            self.device = torch.device("cuda")
        elif torch.backends.mps.is_available():
            self.device = torch.device("mps")
        else:
            self.device = torch.device("cpu")

        self.transcriber = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-medium",
            device=self.device,
            return_timestamps="word"  # Ensure word-level timestamps are returned
        )
        print(f"Initialized AudioTranscriber on device: {self.device}")

    def transcribe(self, audio_file: AudioFile):
        """
        Transcribes each speech chunk and extracts word-level timestamps.
        """
        print("Transcribing audio chunks...")
        for idx, chunk in enumerate(audio_file.chunks, start=1):
            try:
                with io.BytesIO() as wav_io:
                    chunk.audio_segment.export(wav_io, format="wav")
                    wav_io.seek(0)
                    # Pass the audio data to the transcriber
                    transcription_result = self.transcriber(wav_io.read())

                chunk.transcript = transcription_result.get('text', "").strip()

                # Extract word alignments from 'chunks' instead of 'words'
                raw_chunks = transcription_result.get('chunks', [])
                clean_chunks = []
                for word_info in raw_chunks:
                    # Ensure 'timestamp' exists and has two elements
                    if 'timestamp' in word_info and len(word_info['timestamp']) == 2:
                        # Convert timestamps to float before addition
                        start_time = float(word_info['timestamp'][0]) + chunk.start_time
                        end_time = float(word_info['timestamp'][1]) + chunk.start_time
                        word_text = word_info.get('text', "").strip()
                        if word_text:  # Ensure word_text is not empty
                            clean_chunks.append({
                                "word": word_text,
                                "start_time": start_time,
                                "end_time": end_time
                            })
                chunk.whisper_alignments = clean_chunks
                print(f"Transcribed chunk {idx} successfully with {len(clean_chunks)} words.")
            except Exception as e:
                print(f"Error during transcription of chunk {idx}: {e}")
                chunk.transcript = ""
                chunk.whisper_alignments = []


# Define the ForcedAligner class
class ForcedAligner:
    def __init__(self, device: str = None):
        # Use CUDA if available; otherwise fall back to CPU (MPS is not used here)
        if torch.cuda.is_available():
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")

        self.bundle = torchaudio.pipelines.MMS_FA
        self.model = self.bundle.get_model().to(self.device)
        self.tokenizer = self.bundle.get_tokenizer()
        self.aligner = self.bundle.get_aligner()
        self.uroman = ur.Uroman()
        self.sample_rate = self.bundle.sample_rate
        print(f"Initialized ForcedAligner on device: {self.device}")

    def normalize_uroman(self, text: str) -> str:
        text = text.encode('utf-8').decode('utf-8')
        text = text.lower()
        text = text.replace("’", "'")
        text = unicodedata.normalize('NFC', text)
        text = re.sub("([^a-z' ])", " ", text)
        text = re.sub(' +', ' ', text)
        return text.strip()

    def align(self, audio_file: AudioFile):
        for idx, chunk in enumerate(audio_file.chunks):
            print(f"Aligning chunk {idx + 1}/{len(audio_file.chunks)}")
            # Export chunk to a WAV file in memory
            with io.BytesIO() as wav_io:
                chunk.audio_segment.export(wav_io, format="wav")
                wav_io.seek(0)
                waveform, sample_rate = torchaudio.load(wav_io)
            # Resample if needed
            if sample_rate != self.sample_rate:
                resampler = T.Resample(orig_freq=sample_rate, new_freq=self.sample_rate)
                waveform = resampler(waveform)
                sample_rate = self.sample_rate
            # Normalize and tokenize the transcript
            text_roman = self.uroman.romanize_string(chunk.transcript)
            text_normalized = self.normalize_uroman(text_roman)
            transcript_list = text_normalized.split()
            tokens = self.tokenizer(transcript_list)
            # Perform forced alignment
            with torch.inference_mode():
                emission, _ = self.model(waveform.to(self.device))
                token_spans = self.aligner(emission[0], tokens)
            # Extract timestamps
            num_frames = emission.size(1)
            ratio = waveform.size(1) / num_frames
            for spans, word in zip(token_spans, transcript_list):
                start_sec = (spans[0].start * ratio / sample_rate) + chunk.start_time
                end_sec = (spans[-1].end * ratio / sample_rate) + chunk.start_time
                chunk.forced_alignments.append({
                    "word": word,
                    "start_time": start_sec,
                    "end_time": end_sec
                })


# Define the SpeakerDiarizer class
class SpeakerDiarizer:
    def __init__(self, hf_token: str, parameters):
        if torch.cuda.is_available():
            self.device = torch.device("cuda")
        elif torch.backends.mps.is_available():
            self.device = torch.device("mps")
        else:
            self.device = torch.device("cpu")

        self.diarization_pipeline = DiarizationPipeline.from_pretrained(
            "pyannote/speaker-diarization-3.1",
            use_auth_token=hf_token
        )
        print("Initializing SpeakerDiarizer with parameters...")
        self.diarization_pipeline.instantiate(parameters)
        self.diarization_pipeline.to(self.device)
        print("Initialized SpeakerDiarizer successfully.")

    def diarize(self, audio_file: AudioFile, num_speakers: int = None):
        """
        Performs speaker diarization on the given audio file.

        Parameters:
        - audio_file (AudioFile): The audio file to diarize.
        - num_speakers (int, optional): The expected number of speakers.
        """
        print("Performing diarization on the entire audio file")
        try:
            if num_speakers is not None:
                diarization_result = self.diarization_pipeline(
                    audio_file.normalized_path,
                    num_speakers=num_speakers
                )
                print(f"Diarization completed with {num_speakers} speakers.")
            else:
                diarization_result = self.diarization_pipeline(
                    audio_file.normalized_path
                )
                print("Diarization completed without specifying number of speakers.")

            # Prepare speaker segments
            audio_file.speaker_segments = []
            for segment, _, speaker in diarization_result.itertracks(yield_label=True):
                audio_file.speaker_segments.append({
                    "start": segment.start,
                    "end": segment.end,
                    "speaker": speaker
                })
            print(f"Detected {len(audio_file.speaker_segments)} speaker segments.")

        except Exception as e:
            print(f"An error occurred during diarization: {e}")


def process_audio_files(files, hf_token="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", diarizer_params={
            "segmentation": {
                "min_duration_off": 0.0,
            },
            "clustering": {
                "method": "centroid",
                "min_cluster_size": 12,
                "threshold": 0.8,
            }},
        num_speakers=2,
        output_folder="output",
        min_silence_len=1000,
        silence_thresh=-30,
        min_length=90000,
        max_length=150000,
        timestamp_source="whisper_alignments"
    ):
    """
    Processes one or more audio files through all steps of the pipeline.

    Parameters:
    - files: str or List[str]
        A single file path or a list of file paths to process.
    - hf_token: str, optional
        Hugging Face token for accessing diarization models.
    - diarizer_params: dict, optional
        Parameters for the SpeakerDiarizer model.
    - num_speakers: int, optional
        Expected number of speakers passed to the diarizer. Defaults to 2.
    - output_folder: str, optional
        Folder to save the output JSON files. Defaults to 'output'.
    - min_silence_len: int, optional
        Minimum silence length for splitting. Defaults to 1000 ms.
    - silence_thresh: int, optional
        Silence threshold in dB for splitting. Defaults to -30 dB.
    - min_length: int, optional
        Minimum chunk length in milliseconds. Defaults to 90000 ms.
    - max_length: int, optional
        Maximum chunk length in milliseconds. Defaults to 150000 ms.
    - timestamp_source: str, optional
        Alignment source used when combining with diarization
        ('whisper_alignments' or 'forced_alignments'). Defaults to 'whisper_alignments'.
    """

    Path(output_folder).mkdir(exist_ok=True)  # Create output folder if it doesn't exist

    # Ensure `files` is a list
    if isinstance(files, str):
        files = [files]

    # Initialize necessary classes
    transcriber = AudioTranscriber()
    aligner = ForcedAligner()
    diarizer = SpeakerDiarizer(hf_token, parameters=diarizer_params)

    # Process each file
    for file_path in files:
        print(f"Processing file: {file_path}")

        # Step 1: Load and normalize audio
        audio_file = AudioFile(file_path)
        print("Step 1/6: Normalizing audio...")
        audio_file.rms_normalization()

        # Step 2: Split audio into chunks
        print("Step 2/6: Splitting audio on silence...")
        audio_file.split_on_silence(
            min_silence_len=min_silence_len,
            silence_thresh=silence_thresh,
            min_length=min_length,
            max_length=max_length
        )

        # Step 3: Transcribe chunks
        print("Step 3/6: Transcribing audio chunks...")
        transcriber.transcribe(audio_file)

        for chunk in audio_file.chunks:
            print(chunk.transcript)
            print("\n")

        # Step 4: Align transcription with audio
        print("Step 4/6: Performing forced alignment...")
        aligner.align(audio_file)

        # Step 5: Perform speaker diarization
        print("Step 5/6: Performing speaker diarization...")
        diarizer.diarize(audio_file, num_speakers)

        # Step 6: Combine alignment and diarization
        print("Step 6/6: Combining alignment and diarization data...")
        audio_file.combine_chunks()
        audio_file.combine_alignment_and_diarization(timestamp_source)

        # Save output
        all_output_file = Path(output_folder) / f"{Path(file_path).stem}_all_outputs.json"
        print(f"Saving results to: {all_output_file}")

        # audio_file.create_textgrid(f"{Path(file_path).stem}_output.TextGrid")
        audio_file.save_all_data_as_json(all_output_file)
        print(f"Finished processing: {file_path}\n{'-' * 40}")


# Example Usage
if __name__ == "__main__":
    import os

    # Define input and output paths
    audio_file_path = "audio.wav"  # Replace with your actual audio file path
    output_directory = "output"

    # Create output directory if it doesn't exist
    os.makedirs(output_directory, exist_ok=True)

    # List of files to process
    files = [audio_file_path]

    # Process the audio files
    process_audio_files(
        files=files,
        hf_token="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",  # Replace with your actual Hugging Face token
        output_folder=output_directory,
    )