pelican-nlp 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff compares the contents of two publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their public registry.
- pelican_nlp/Nils_backup/__init__.py +0 -0
- pelican_nlp/Nils_backup/extract_acoustic_features.py +274 -0
- pelican_nlp/Nils_backup/fluency/__init__.py +0 -0
- pelican_nlp/Nils_backup/fluency/aggregate_fluency_results.py +186 -0
- pelican_nlp/Nils_backup/fluency/behavioral_data.py +42 -0
- pelican_nlp/Nils_backup/fluency/check_duplicates.py +169 -0
- pelican_nlp/Nils_backup/fluency/coherence.py +653 -0
- pelican_nlp/Nils_backup/fluency/config.py +231 -0
- pelican_nlp/Nils_backup/fluency/main.py +182 -0
- pelican_nlp/Nils_backup/fluency/optimality_without_tsa.py +466 -0
- pelican_nlp/Nils_backup/fluency/plot_fluency.py +573 -0
- pelican_nlp/Nils_backup/fluency/plotting_utils.py +170 -0
- pelican_nlp/Nils_backup/fluency/questionnaires_data.py +43 -0
- pelican_nlp/Nils_backup/fluency/stats_fluency.py +930 -0
- pelican_nlp/Nils_backup/fluency/utils.py +41 -0
- pelican_nlp/Nils_backup/speaker_diarization_Nils.py +328 -0
- pelican_nlp/Nils_backup/transcription/__init__.py +0 -0
- pelican_nlp/Nils_backup/transcription/annotation_tool.py +1001 -0
- pelican_nlp/Nils_backup/transcription/annotation_tool_boundaries.py +1122 -0
- pelican_nlp/Nils_backup/transcription/annotation_tool_sandbox.py +985 -0
- pelican_nlp/Nils_backup/transcription/output/holmes_control_nova_all_outputs.json +7948 -0
- pelican_nlp/Nils_backup/transcription/test.json +1 -0
- pelican_nlp/Nils_backup/transcription/transcribe_audio.py +314 -0
- pelican_nlp/Nils_backup/transcription/transcribe_audio_chunked.py +695 -0
- pelican_nlp/Nils_backup/transcription/transcription.py +801 -0
- pelican_nlp/Nils_backup/transcription/transcription_gui.py +955 -0
- pelican_nlp/Nils_backup/transcription/word_boundaries.py +190 -0
- pelican_nlp/Silvia_files/Opensmile/opensmile_feature_extraction.py +66 -0
- pelican_nlp/Silvia_files/prosogram/prosogram.py +104 -0
- pelican_nlp/__init__.py +1 -1
- pelican_nlp/_version.py +1 -0
- pelican_nlp/configuration_files/config_audio.yml +150 -0
- pelican_nlp/configuration_files/config_discourse.yml +104 -0
- pelican_nlp/configuration_files/config_fluency.yml +108 -0
- pelican_nlp/configuration_files/config_general.yml +131 -0
- pelican_nlp/configuration_files/config_morteza.yml +103 -0
- pelican_nlp/praat/__init__.py +29 -0
- {pelican_nlp-0.1.0.dist-info → pelican_nlp-0.1.2.dist-info}/METADATA +14 -21
- pelican_nlp-0.1.2.dist-info/RECORD +75 -0
- pelican_nlp-0.1.0.dist-info/RECORD +0 -39
- {pelican_nlp-0.1.0.dist-info → pelican_nlp-0.1.2.dist-info}/WHEEL +0 -0
- {pelican_nlp-0.1.0.dist-info → pelican_nlp-0.1.2.dist-info}/licenses/LICENSE +0 -0
- {pelican_nlp-0.1.0.dist-info → pelican_nlp-0.1.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1 @@
+[]
@@ -0,0 +1,314 @@
+import os
+import json
+import librosa
+import numpy as np
+import soundfile as sf
+import torch
+import torchaudio
+import torchaudio.transforms as T
+import uroman as ur
+import re
+import unicodedata
+from typing import List, Dict
+from transformers import pipeline
+from pyannote.audio import Pipeline as DiarizationPipeline
+from sklearn.cluster import AgglomerativeClustering
+
+
+class AudioFile:
+    def __init__(self, file_path: str, target_rms_db: float = -20):
+        self.file_path = file_path
+        self.target_rms_db = target_rms_db
+        self.normalized_path = None
+        self.audio = None
+        self.sample_rate = None
+        self.transcript_text = ""
+        self.word_alignments = []
+        self.speaker_segments = []
+        self.combined_data = []
+        self.whisper_word_alignments = []
+        self.forced_alignment_output = []
+        self.whisper_output = []
+        self.diarization_output = []
+        self.aggregated_output = []
+        self.load_audio()
+
+    def load_audio(self):
+        self.audio, self.sample_rate = librosa.load(self.file_path, sr=None)
+        print(f"Loaded audio file {self.file_path}")
+
+    def rms_normalization(self):
+        target_rms = 10 ** (self.target_rms_db / 20)
+        rms = np.sqrt(np.mean(self.audio ** 2))
+        gain = target_rms / rms
+        normalized_audio = self.audio * gain
+        self.normalized_path = self.file_path.replace(".wav", "_normalized.wav")
+        sf.write(self.normalized_path, normalized_audio, self.sample_rate)
+        print(f"Normalized audio saved as {self.normalized_path}")
+
+    def aggregate_transcripts(self, whisper_transcriptions: List[Dict], forced_alignments: List[Dict]):
+        print("Aggregating transcripts from Whisper and Forced Alignment")
+        self.whisper_output = whisper_transcriptions
+        self.forced_alignment_output = forced_alignments
+
+        merged_words = {}
+        for source in [whisper_transcriptions, forced_alignments]:
+            for word_info in source:
+                word = word_info['word'].lower()
+                if word not in merged_words:
+                    merged_words[word] = {'count': 0, 'start_time': 0.0, 'end_time': 0.0}
+                merged_words[word]['count'] += 1
+                merged_words[word]['start_time'] += word_info['start_time']
+                merged_words[word]['end_time'] += word_info['end_time']
+
+        aggregated_words = []
+        for word, data in merged_words.items():
+            avg_start = data['start_time'] / data['count']
+            avg_end = data['end_time'] / data['count']
+            aggregated_words.append({
+                'word': word,
+                'start_time': avg_start,
+                'end_time': avg_end
+            })
+
+        aggregated_words.sort(key=lambda x: x['start_time'])
+        self.aggregated_output = aggregated_words.copy()
+
+        print("Assigning speakers to words based on diarization segments")
+        for word in aggregated_words:
+            word_start = word['start_time']
+            word_end = word['end_time']
+            assigned_speaker = self.assign_speaker(word_start, word_end)
+            aggregated_entry = {
+                'word': word['word'],
+                'start_time': word_start,
+                'end_time': word_end,
+                'speaker': assigned_speaker
+            }
+            self.combined_data.append(aggregated_entry)
+            self.aggregated_output.append(aggregated_entry)
+
+    def assign_speaker(self, word_start: float, word_end: float) -> str:
+        max_overlap = 0
+        assigned_speaker = "Unknown"
+        for segment in self.speaker_segments:
+            overlap_start = max(word_start, segment['start'])
+            overlap_end = min(word_end, segment['end'])
+            overlap = max(0.0, overlap_end - overlap_start)
+            if overlap > max_overlap:
+                max_overlap = overlap
+                assigned_speaker = segment['speaker']
+        return assigned_speaker
+
+    def save_individual_outputs(self, output_dir: str):
+        try:
+            os.makedirs(output_dir, exist_ok=True)
+            base_filename = os.path.splitext(os.path.basename(self.file_path))[0]
+
+            whisper_path = os.path.join(output_dir, f"{base_filename}_whisper_timestamps.json")
+            with open(whisper_path, 'w', encoding='utf-8') as f:
+                json.dump(self.whisper_output, f, ensure_ascii=False, indent=4)
+            print(f"Saved Whisper timestamps to {whisper_path}")
+
+            forced_align_path = os.path.join(output_dir, f"{base_filename}_forced_alignment_timestamps.json")
+            with open(forced_align_path, 'w', encoding='utf-8') as f:
+                json.dump(self.forced_alignment_output, f, ensure_ascii=False, indent=4)
+            print(f"Saved Forced Alignment timestamps to {forced_align_path}")
+
+            diarization_path = os.path.join(output_dir, f"{base_filename}_speaker_segments.json")
+            with open(diarization_path, 'w', encoding='utf-8') as f:
+                json.dump(self.speaker_segments, f, ensure_ascii=False, indent=4)
+            print(f"Saved Speaker Diarization segments to {diarization_path}")
+
+        except Exception as e:
+            print(f"Error saving individual outputs: {e}")
+
+    def save_aggregated_output(self, output_dir: str):
+        try:
+            os.makedirs(output_dir, exist_ok=True)
+            base_filename = os.path.splitext(os.path.basename(self.file_path))[0]
+            aggregated_path = os.path.join(output_dir, f"{base_filename}_aggregated_transcript.json")
+            with open(aggregated_path, 'w', encoding='utf-8') as f:
+                json.dump(self.aggregated_output, f, ensure_ascii=False, indent=4)
+            print(f"Saved Aggregated Transcript to {aggregated_path}")
+        except Exception as e:
+            print(f"Error saving aggregated output: {e}")
+
+
+class AudioTranscriber:
+    def __init__(self, device: str = None):
+        if torch.cuda.is_available():
+            self.device = torch.device("cuda")
+        elif torch.backends.mps.is_available():
+            self.device = torch.device("mps")
+        else:
+            self.device = torch.device("cpu")
+        self.transcriber = pipeline(
+            "automatic-speech-recognition",
+            model="openai/whisper-medium",
+            device=self.device
+        )
+        print(f"Initialized AudioTranscriber on device: {self.device}")
+
+    def transcribe(self, audio_file: AudioFile):
+        print("Transcribing the entire audio file")
+        try:
+            with open(audio_file.normalized_path, 'rb') as f:
+                audio_data = f.read()
+            transcription_result = self.transcriber(
+                audio_data,
+                return_timestamps="word",
+
+            )
+            audio_file.transcript_text = transcription_result['text'].strip()
+            audio_file.whisper_word_alignments = transcription_result.get('words', [])
+            print("Transcription completed")
+        except Exception as e:
+            print(f"Error during transcription: {e}")
+
+
+class ForcedAligner:
+    def __init__(self, device: str = None):
+        if torch.cuda.is_available():
+            self.device = torch.device("cuda")
+        else:
+            self.device = torch.device("cpu")
+        self.bundle = torchaudio.pipelines.MMS_FA
+        self.model = self.bundle.get_model().to(self.device)
+        self.tokenizer = self.bundle.get_tokenizer()
+        self.aligner = self.bundle.get_aligner()
+        self.uroman = ur.Uroman()
+        self.sample_rate = self.bundle.sample_rate
+        print(f"Initialized ForcedAligner on device: {self.device}")
+
+    def normalize_uroman(self, text: str) -> str:
+        text = text.encode('utf-8').decode('utf-8')
+        text = text.lower()
+        text = text.replace("’", "'")
+        text = unicodedata.normalize('NFC', text)
+        text = re.sub("([^a-z' ])", " ", text)
+        text = re.sub(' +', ' ', text)
+        return text.strip()
+
+    def align(self, audio_file: AudioFile):
+        print("Performing forced alignment on the entire audio")
+        try:
+            waveform, sample_rate = torchaudio.load(audio_file.normalized_path)
+            if sample_rate != self.sample_rate:
+                resampler = T.Resample(orig_freq=sample_rate, new_freq=self.sample_rate)
+                waveform = resampler(waveform)
+                sample_rate = self.sample_rate
+
+            text_roman = self.uroman.romanize_string(audio_file.transcript_text)
+            text_normalized = self.normalize_uroman(text_roman)
+            transcript_list = text_normalized.split()
+
+            tokens = self.tokenizer(transcript_list)
+
+            with torch.inference_mode():
+                emission, _ = self.model(waveform.to(self.device))
+                token_spans = self.aligner(emission[0], tokens)
+
+            num_frames = emission.size(1)
+            ratio = waveform.size(1) / num_frames
+
+            word_alignments = []
+            for spans, word in zip(token_spans, transcript_list):
+                start_sec = spans[0].start * ratio / sample_rate
+                end_sec = spans[-1].end * ratio / sample_rate
+                word_alignments.append({
+                    "word": word,
+                    "start_time": start_sec,
+                    "end_time": end_sec
+                })
+
+            audio_file.word_alignments = word_alignments
+            print("Forced alignment completed")
+        except Exception as e:
+            print(f"Error during forced alignment: {e}")
+
+
+class SpeakerDiarizer:
+    def __init__(self, hf_token: str, parameters):
+        if torch.cuda.is_available():
+            self.device = torch.device("cuda")
+        elif torch.backends.mps.is_available():
+            self.device = torch.device("mps")
+        else:
+            self.device = torch.device("cpu")
+        self.diarization_pipeline = DiarizationPipeline.from_pretrained(
+            "pyannote/speaker-diarization-3.1",
+            use_auth_token=hf_token
+        )
+        self.diarization_pipeline.instantiate(parameters)
+        self.diarization_pipeline.to(self.device)
+        print("Initialized SpeakerDiarizer")
+
+    def diarize(self, audio_file: AudioFile):
+        print("Performing diarization on the entire audio file")
+        try:
+            diarization_result = self.diarization_pipeline(audio_file.normalized_path)
+            audio_file.speaker_segments = []
+            for segment, _, speaker in diarization_result.itertracks(yield_label=True):
+                audio_file.speaker_segments.append({
+                    "start": segment.start,
+                    "end": segment.end,
+                    "speaker": speaker
+                })
+            print("Diarization completed")
+        except Exception as e:
+            print(f"Error during diarization: {e}")
+
+
+def process_audio(file_path: str, hf_token: str, output_dir: str,
+                  diarizer_params={
+                      "segmentation": {
+                          "min_duration_off": 0.0,
+                      },
+                      "clustering": {
+                          "method": "centroid",
+                          "min_cluster_size": 12,
+                          "threshold": 0.7,
+                      }}, ):
+
+    audio_file = AudioFile(file_path)
+    audio_file.rms_normalization()
+
+    transcriber = AudioTranscriber()
+    transcriber.transcribe(audio_file)
+
+    aligner = ForcedAligner()
+    aligner.align(audio_file)
+
+    diarizer = SpeakerDiarizer(hf_token, diarizer_params)
+    diarizer.diarize(audio_file)
+
+    audio_file.aggregate_transcripts(
+        whisper_transcriptions=audio_file.whisper_word_alignments,
+        forced_alignments=audio_file.word_alignments
+    )
+
+    audio_file.save_individual_outputs(output_dir=output_dir)
+
+    audio_file.save_aggregated_output(output_dir=output_dir)
+
+    try:
+        combined_path = os.path.join(output_dir, f"{os.path.splitext(os.path.basename(file_path))[0]}_combined_data.json")
+        with open(combined_path, 'w', encoding='utf-8') as f:
+            json.dump(audio_file.combined_data, f, ensure_ascii=False, indent=4)
+        print(f"Saved Combined Data to {combined_path}")
+    except Exception as e:
+        print(f"Error saving combined data: {e}")
+
+    print("\nFinal Aggregated and Diarized Transcript:")
+    for entry in audio_file.combined_data:
+        print(f"[{entry['start_time']:.2f}-{entry['end_time']:.2f}] {entry['speaker']}: {entry['word']}")
+
+
+if __name__ == "__main__":
+    file_path = "audio.wav"
+    output_dir = "output"
+    hf_token = ""
+    process_audio(file_path, hf_token, output_dir)
+
+
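The 314-line hunk above corresponds to pelican_nlp/Nils_backup/transcription/transcribe_audio.py in the file listing (the only file adding exactly 314 lines). The module chains RMS normalization, Whisper transcription, MMS forced alignment, and pyannote speaker diarization, then merges the word timestamps and writes per-stage JSON files plus a combined word-level transcript. For orientation only, the sketch below shows one way the process_audio entry point might be driven; the audio path, output directory, and HF_TOKEN environment variable are hypothetical, and the import path assumes these backup modules are importable as subpackages of the installed wheel.

    # Minimal usage sketch (assumptions: "interview.wav" exists, HF_TOKEN holds a
    # Hugging Face token with access to pyannote/speaker-diarization-3.1, and the
    # Nils_backup subpackages ship in the wheel as importable modules).
    import os

    from pelican_nlp.Nils_backup.transcription.transcribe_audio import process_audio

    if __name__ == "__main__":
        process_audio(
            file_path="interview.wav",        # hypothetical input recording; rms_normalization expects a .wav path
            hf_token=os.environ["HF_TOKEN"],  # passed through to the gated pyannote diarization pipeline
            output_dir="transcripts",         # directory receiving the per-stage JSON outputs
        )

If the run succeeds, the output directory holds <basename>_whisper_timestamps.json, <basename>_forced_alignment_timestamps.json, <basename>_speaker_segments.json, <basename>_aggregated_transcript.json, and <basename>_combined_data.json, and the final diarized word-level transcript is printed to stdout.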