pelican-nlp 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. pelican_nlp/Nils_backup/__init__.py +0 -0
  2. pelican_nlp/Nils_backup/extract_acoustic_features.py +274 -0
  3. pelican_nlp/Nils_backup/fluency/__init__.py +0 -0
  4. pelican_nlp/Nils_backup/fluency/aggregate_fluency_results.py +186 -0
  5. pelican_nlp/Nils_backup/fluency/behavioral_data.py +42 -0
  6. pelican_nlp/Nils_backup/fluency/check_duplicates.py +169 -0
  7. pelican_nlp/Nils_backup/fluency/coherence.py +653 -0
  8. pelican_nlp/Nils_backup/fluency/config.py +231 -0
  9. pelican_nlp/Nils_backup/fluency/main.py +182 -0
  10. pelican_nlp/Nils_backup/fluency/optimality_without_tsa.py +466 -0
  11. pelican_nlp/Nils_backup/fluency/plot_fluency.py +573 -0
  12. pelican_nlp/Nils_backup/fluency/plotting_utils.py +170 -0
  13. pelican_nlp/Nils_backup/fluency/questionnaires_data.py +43 -0
  14. pelican_nlp/Nils_backup/fluency/stats_fluency.py +930 -0
  15. pelican_nlp/Nils_backup/fluency/utils.py +41 -0
  16. pelican_nlp/Nils_backup/speaker_diarization_Nils.py +328 -0
  17. pelican_nlp/Nils_backup/transcription/__init__.py +0 -0
  18. pelican_nlp/Nils_backup/transcription/annotation_tool.py +1001 -0
  19. pelican_nlp/Nils_backup/transcription/annotation_tool_boundaries.py +1122 -0
  20. pelican_nlp/Nils_backup/transcription/annotation_tool_sandbox.py +985 -0
  21. pelican_nlp/Nils_backup/transcription/output/holmes_control_nova_all_outputs.json +7948 -0
  22. pelican_nlp/Nils_backup/transcription/test.json +1 -0
  23. pelican_nlp/Nils_backup/transcription/transcribe_audio.py +314 -0
  24. pelican_nlp/Nils_backup/transcription/transcribe_audio_chunked.py +695 -0
  25. pelican_nlp/Nils_backup/transcription/transcription.py +801 -0
  26. pelican_nlp/Nils_backup/transcription/transcription_gui.py +955 -0
  27. pelican_nlp/Nils_backup/transcription/word_boundaries.py +190 -0
  28. pelican_nlp/Silvia_files/Opensmile/opensmile_feature_extraction.py +66 -0
  29. pelican_nlp/Silvia_files/prosogram/prosogram.py +104 -0
  30. pelican_nlp/__init__.py +1 -1
  31. pelican_nlp/_version.py +1 -0
  32. pelican_nlp/configuration_files/config_audio.yml +150 -0
  33. pelican_nlp/configuration_files/config_discourse.yml +104 -0
  34. pelican_nlp/configuration_files/config_fluency.yml +108 -0
  35. pelican_nlp/configuration_files/config_general.yml +131 -0
  36. pelican_nlp/configuration_files/config_morteza.yml +103 -0
  37. pelican_nlp/praat/__init__.py +29 -0
  38. {pelican_nlp-0.1.0.dist-info → pelican_nlp-0.1.2.dist-info}/METADATA +14 -21
  39. pelican_nlp-0.1.2.dist-info/RECORD +75 -0
  40. pelican_nlp-0.1.0.dist-info/RECORD +0 -39
  41. {pelican_nlp-0.1.0.dist-info → pelican_nlp-0.1.2.dist-info}/WHEEL +0 -0
  42. {pelican_nlp-0.1.0.dist-info → pelican_nlp-0.1.2.dist-info}/licenses/LICENSE +0 -0
  43. {pelican_nlp-0.1.0.dist-info → pelican_nlp-0.1.2.dist-info}/top_level.txt +0 -0
pelican_nlp/Nils_backup/transcription/transcribe_audio.py ADDED
@@ -0,0 +1,314 @@
+ import os
+ import json
+ import librosa
+ import numpy as np
+ import soundfile as sf
+ import torch
+ import torchaudio
+ import torchaudio.transforms as T
+ import uroman as ur
+ import re
+ import unicodedata
+ from typing import List, Dict
+ from transformers import pipeline
+ from pyannote.audio import Pipeline as DiarizationPipeline
+ from sklearn.cluster import AgglomerativeClustering
+
+
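+ # Pipeline overview: load and RMS-normalize the audio, transcribe it with
+ # Whisper, force-align the transcript with torchaudio's MMS_FA bundle,
+ # diarize speakers with pyannote, then merge word timings and speaker
+ # labels into a single combined transcript.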
+ class AudioFile:
+     def __init__(self, file_path: str, target_rms_db: float = -20):
+         self.file_path = file_path
+         self.target_rms_db = target_rms_db
+         self.normalized_path = None
+         self.audio = None
+         self.sample_rate = None
+         self.transcript_text = ""
+         self.word_alignments = []
+         self.speaker_segments = []
+         self.combined_data = []
+         self.whisper_word_alignments = []
+         self.forced_alignment_output = []
+         self.whisper_output = []
+         self.diarization_output = []
+         self.aggregated_output = []
+         self.load_audio()
+
+     def load_audio(self):
+         self.audio, self.sample_rate = librosa.load(self.file_path, sr=None)
+         print(f"Loaded audio file {self.file_path}")
+
+     def rms_normalization(self):
+         # Convert the dB target to linear scale: -20 dB -> 10 ** (-20 / 20) = 0.1.
+         target_rms = 10 ** (self.target_rms_db / 20)
+         rms = np.sqrt(np.mean(self.audio ** 2))
+         if rms == 0:
+             raise ValueError("Cannot normalize silent audio (RMS is zero)")
+         gain = target_rms / rms
+         normalized_audio = self.audio * gain
+         self.normalized_path = self.file_path.replace(".wav", "_normalized.wav")
+         sf.write(self.normalized_path, normalized_audio, self.sample_rate)
+         print(f"Normalized audio saved as {self.normalized_path}")
+
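+     # NOTE: words are merged by their lowercased text, so repeated words are
+     # collapsed into one entry whose timestamps are averaged across all
+     # occurrences and both sources. This keeps the merge simple but loses
+     # per-occurrence timing for words spoken more than once.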
+     def aggregate_transcripts(self, whisper_transcriptions: List[Dict], forced_alignments: List[Dict]):
+         print("Aggregating transcripts from Whisper and Forced Alignment")
+         self.whisper_output = whisper_transcriptions
+         self.forced_alignment_output = forced_alignments
+
+         merged_words = {}
+         for source in [whisper_transcriptions, forced_alignments]:
+             for word_info in source:
+                 word = word_info['word'].lower()
+                 if word not in merged_words:
+                     merged_words[word] = {'count': 0, 'start_time': 0.0, 'end_time': 0.0}
+                 merged_words[word]['count'] += 1
+                 merged_words[word]['start_time'] += word_info['start_time']
+                 merged_words[word]['end_time'] += word_info['end_time']
+
+         aggregated_words = []
+         for word, data in merged_words.items():
+             avg_start = data['start_time'] / data['count']
+             avg_end = data['end_time'] / data['count']
+             aggregated_words.append({
+                 'word': word,
+                 'start_time': avg_start,
+                 'end_time': avg_end
+             })
+
+         aggregated_words.sort(key=lambda x: x['start_time'])
+         self.aggregated_output = aggregated_words.copy()
+
+         print("Assigning speakers to words based on diarization segments")
+         for word in aggregated_words:
+             word_start = word['start_time']
+             word_end = word['end_time']
+             assigned_speaker = self.assign_speaker(word_start, word_end)
+             aggregated_entry = {
+                 'word': word['word'],
+                 'start_time': word_start,
+                 'end_time': word_end,
+                 'speaker': assigned_speaker
+             }
+             self.combined_data.append(aggregated_entry)
+
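+     # A word is attributed to the diarization segment it overlaps the most;
+     # words falling outside every segment are labeled "Unknown".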
+     def assign_speaker(self, word_start: float, word_end: float) -> str:
+         max_overlap = 0.0
+         assigned_speaker = "Unknown"
+         for segment in self.speaker_segments:
+             overlap_start = max(word_start, segment['start'])
+             overlap_end = min(word_end, segment['end'])
+             overlap = max(0.0, overlap_end - overlap_start)
+             if overlap > max_overlap:
+                 max_overlap = overlap
+                 assigned_speaker = segment['speaker']
+         return assigned_speaker
+
+     def save_individual_outputs(self, output_dir: str):
+         try:
+             os.makedirs(output_dir, exist_ok=True)
+             base_filename = os.path.splitext(os.path.basename(self.file_path))[0]
+
+             whisper_path = os.path.join(output_dir, f"{base_filename}_whisper_timestamps.json")
+             with open(whisper_path, 'w', encoding='utf-8') as f:
+                 json.dump(self.whisper_output, f, ensure_ascii=False, indent=4)
+             print(f"Saved Whisper timestamps to {whisper_path}")
+
+             forced_align_path = os.path.join(output_dir, f"{base_filename}_forced_alignment_timestamps.json")
+             with open(forced_align_path, 'w', encoding='utf-8') as f:
+                 json.dump(self.forced_alignment_output, f, ensure_ascii=False, indent=4)
+             print(f"Saved Forced Alignment timestamps to {forced_align_path}")
+
+             diarization_path = os.path.join(output_dir, f"{base_filename}_speaker_segments.json")
+             with open(diarization_path, 'w', encoding='utf-8') as f:
+                 json.dump(self.speaker_segments, f, ensure_ascii=False, indent=4)
+             print(f"Saved Speaker Diarization segments to {diarization_path}")
+
+         except Exception as e:
+             print(f"Error saving individual outputs: {e}")
+
+     def save_aggregated_output(self, output_dir: str):
+         try:
+             os.makedirs(output_dir, exist_ok=True)
+             base_filename = os.path.splitext(os.path.basename(self.file_path))[0]
+             aggregated_path = os.path.join(output_dir, f"{base_filename}_aggregated_transcript.json")
+             with open(aggregated_path, 'w', encoding='utf-8') as f:
+                 json.dump(self.aggregated_output, f, ensure_ascii=False, indent=4)
+             print(f"Saved Aggregated Transcript to {aggregated_path}")
+         except Exception as e:
+             print(f"Error saving aggregated output: {e}")
+
+
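+ # Whisper-based ASR. Device selection prefers CUDA, then Apple MPS, then CPU.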
+ class AudioTranscriber:
+     def __init__(self, device: str = None):
+         if torch.cuda.is_available():
+             self.device = torch.device("cuda")
+         elif torch.backends.mps.is_available():
+             self.device = torch.device("mps")
+         else:
+             self.device = torch.device("cpu")
+         self.transcriber = pipeline(
+             "automatic-speech-recognition",
+             model="openai/whisper-medium",
+             device=self.device
+         )
+         print(f"Initialized AudioTranscriber on device: {self.device}")
+
+     def transcribe(self, audio_file: AudioFile):
+         print("Transcribing the entire audio file")
+         try:
+             with open(audio_file.normalized_path, 'rb') as f:
+                 audio_data = f.read()
+             transcription_result = self.transcriber(
+                 audio_data,
+                 return_timestamps="word"
+             )
+             audio_file.transcript_text = transcription_result['text'].strip()
+             # The HF pipeline returns word timestamps under 'chunks'; convert
+             # them to the {'word', 'start_time', 'end_time'} format expected
+             # by aggregate_transcripts.
+             audio_file.whisper_word_alignments = [
+                 {
+                     'word': chunk['text'].strip(),
+                     'start_time': chunk['timestamp'][0],
+                     'end_time': chunk['timestamp'][1]
+                 }
+                 for chunk in transcription_result.get('chunks', [])
+                 if chunk.get('timestamp')
+             ]
+             print("Transcription completed")
+         except Exception as e:
+             print(f"Error during transcription: {e}")
+
+
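+ # Forced alignment via torchaudio's MMS_FA bundle. The transcript is first
+ # romanized with uroman because MMS_FA's tokenizer only covers a-z and "'".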
+ class ForcedAligner:
+     def __init__(self, device: str = None):
+         if torch.cuda.is_available():
+             self.device = torch.device("cuda")
+         else:
+             self.device = torch.device("cpu")
+         self.bundle = torchaudio.pipelines.MMS_FA
+         self.model = self.bundle.get_model().to(self.device)
+         self.tokenizer = self.bundle.get_tokenizer()
+         self.aligner = self.bundle.get_aligner()
+         self.uroman = ur.Uroman()
+         self.sample_rate = self.bundle.sample_rate
+         print(f"Initialized ForcedAligner on device: {self.device}")
+
+     def normalize_uroman(self, text: str) -> str:
+         text = text.lower()
+         text = text.replace("’", "'")
+         text = unicodedata.normalize('NFC', text)
+         text = re.sub("([^a-z' ])", " ", text)
+         text = re.sub(' +', ' ', text)
+         return text.strip()
+
+     def align(self, audio_file: AudioFile):
+         print("Performing forced alignment on the entire audio")
+         try:
+             waveform, sample_rate = torchaudio.load(audio_file.normalized_path)
+             # MMS_FA expects mono input; average the channels if necessary.
+             if waveform.size(0) > 1:
+                 waveform = waveform.mean(dim=0, keepdim=True)
+             if sample_rate != self.sample_rate:
+                 resampler = T.Resample(orig_freq=sample_rate, new_freq=self.sample_rate)
+                 waveform = resampler(waveform)
+                 sample_rate = self.sample_rate
+
+             text_roman = self.uroman.romanize_string(audio_file.transcript_text)
+             text_normalized = self.normalize_uroman(text_roman)
+             transcript_list = text_normalized.split()
+
+             tokens = self.tokenizer(transcript_list)
+
+             with torch.inference_mode():
+                 emission, _ = self.model(waveform.to(self.device))
+                 token_spans = self.aligner(emission[0], tokens)
+
+             # Map emission frames back to seconds: samples per frame / sample rate.
+             num_frames = emission.size(1)
+             ratio = waveform.size(1) / num_frames
+
+             word_alignments = []
+             for spans, word in zip(token_spans, transcript_list):
+                 start_sec = spans[0].start * ratio / sample_rate
+                 end_sec = spans[-1].end * ratio / sample_rate
+                 word_alignments.append({
+                     "word": word,
+                     "start_time": start_sec,
+                     "end_time": end_sec
+                 })
+
+             audio_file.word_alignments = word_alignments
+             print("Forced alignment completed")
+         except Exception as e:
+             print(f"Error during forced alignment: {e}")
+
+
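+ # Speaker diarization with pyannote's pretrained 3.1 pipeline; requires a
+ # Hugging Face token that has accepted the model's gating conditions.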
+ class SpeakerDiarizer:
+     def __init__(self, hf_token: str, parameters: Dict):
+         if torch.cuda.is_available():
+             self.device = torch.device("cuda")
+         elif torch.backends.mps.is_available():
+             self.device = torch.device("mps")
+         else:
+             self.device = torch.device("cpu")
+         self.diarization_pipeline = DiarizationPipeline.from_pretrained(
+             "pyannote/speaker-diarization-3.1",
+             use_auth_token=hf_token
+         )
+         self.diarization_pipeline.instantiate(parameters)
+         self.diarization_pipeline.to(self.device)
+         print("Initialized SpeakerDiarizer")
+
+     def diarize(self, audio_file: AudioFile):
+         print("Performing diarization on the entire audio file")
+         try:
+             diarization_result = self.diarization_pipeline(audio_file.normalized_path)
+             audio_file.speaker_segments = []
+             for segment, _, speaker in diarization_result.itertracks(yield_label=True):
+                 audio_file.speaker_segments.append({
+                     "start": segment.start,
+                     "end": segment.end,
+                     "speaker": speaker
+                 })
+             print("Diarization completed")
+         except Exception as e:
+             print(f"Error during diarization: {e}")
+
+
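+ # End-to-end driver: normalize -> transcribe -> force-align -> diarize ->
+ # aggregate, then write all intermediate and combined JSON outputs.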
+ def process_audio(file_path: str, hf_token: str, output_dir: str,
+                   diarizer_params: Dict = None):
+     # Avoid a mutable default argument; fall back to the tuned defaults here.
+     if diarizer_params is None:
+         diarizer_params = {
+             "segmentation": {
+                 "min_duration_off": 0.0,
+             },
+             "clustering": {
+                 "method": "centroid",
+                 "min_cluster_size": 12,
+                 "threshold": 0.7,
+             },
+         }
+
+     audio_file = AudioFile(file_path)
+     audio_file.rms_normalization()
+
+     transcriber = AudioTranscriber()
+     transcriber.transcribe(audio_file)
+
+     aligner = ForcedAligner()
+     aligner.align(audio_file)
+
+     diarizer = SpeakerDiarizer(hf_token, diarizer_params)
+     diarizer.diarize(audio_file)
+
+     audio_file.aggregate_transcripts(
+         whisper_transcriptions=audio_file.whisper_word_alignments,
+         forced_alignments=audio_file.word_alignments
+     )
+
+     audio_file.save_individual_outputs(output_dir=output_dir)
+
+     audio_file.save_aggregated_output(output_dir=output_dir)
+
+     try:
+         combined_path = os.path.join(output_dir, f"{os.path.splitext(os.path.basename(file_path))[0]}_combined_data.json")
+         with open(combined_path, 'w', encoding='utf-8') as f:
+             json.dump(audio_file.combined_data, f, ensure_ascii=False, indent=4)
+         print(f"Saved Combined Data to {combined_path}")
+     except Exception as e:
+         print(f"Error saving combined data: {e}")
+
+     print("\nFinal Aggregated and Diarized Transcript:")
+     for entry in audio_file.combined_data:
+         print(f"[{entry['start_time']:.2f}-{entry['end_time']:.2f}] {entry['speaker']}: {entry['word']}")
+
+
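+ # NOTE: running this entry point requires a valid Hugging Face access token
+ # in hf_token and an input WAV file at the path given below.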
+ if __name__ == "__main__":
+     file_path = "audio.wav"
+     output_dir = "output"
+     hf_token = ""
+     process_audio(file_path, hf_token, output_dir)
+