pelican-nlp 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff compares publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registry.
- pelican_nlp/Nils_backup/__init__.py +0 -0
- pelican_nlp/Nils_backup/extract_acoustic_features.py +274 -0
- pelican_nlp/Nils_backup/fluency/__init__.py +0 -0
- pelican_nlp/Nils_backup/fluency/aggregate_fluency_results.py +186 -0
- pelican_nlp/Nils_backup/fluency/behavioral_data.py +42 -0
- pelican_nlp/Nils_backup/fluency/check_duplicates.py +169 -0
- pelican_nlp/Nils_backup/fluency/coherence.py +653 -0
- pelican_nlp/Nils_backup/fluency/config.py +231 -0
- pelican_nlp/Nils_backup/fluency/main.py +182 -0
- pelican_nlp/Nils_backup/fluency/optimality_without_tsa.py +466 -0
- pelican_nlp/Nils_backup/fluency/plot_fluency.py +573 -0
- pelican_nlp/Nils_backup/fluency/plotting_utils.py +170 -0
- pelican_nlp/Nils_backup/fluency/questionnaires_data.py +43 -0
- pelican_nlp/Nils_backup/fluency/stats_fluency.py +930 -0
- pelican_nlp/Nils_backup/fluency/utils.py +41 -0
- pelican_nlp/Nils_backup/speaker_diarization_Nils.py +328 -0
- pelican_nlp/Nils_backup/transcription/__init__.py +0 -0
- pelican_nlp/Nils_backup/transcription/annotation_tool.py +1001 -0
- pelican_nlp/Nils_backup/transcription/annotation_tool_boundaries.py +1122 -0
- pelican_nlp/Nils_backup/transcription/annotation_tool_sandbox.py +985 -0
- pelican_nlp/Nils_backup/transcription/output/holmes_control_nova_all_outputs.json +7948 -0
- pelican_nlp/Nils_backup/transcription/test.json +1 -0
- pelican_nlp/Nils_backup/transcription/transcribe_audio.py +314 -0
- pelican_nlp/Nils_backup/transcription/transcribe_audio_chunked.py +695 -0
- pelican_nlp/Nils_backup/transcription/transcription.py +801 -0
- pelican_nlp/Nils_backup/transcription/transcription_gui.py +955 -0
- pelican_nlp/Nils_backup/transcription/word_boundaries.py +190 -0
- pelican_nlp/Silvia_files/Opensmile/opensmile_feature_extraction.py +66 -0
- pelican_nlp/Silvia_files/prosogram/prosogram.py +104 -0
- pelican_nlp/__init__.py +1 -1
- pelican_nlp/_version.py +1 -0
- pelican_nlp/configuration_files/config_audio.yml +150 -0
- pelican_nlp/configuration_files/config_discourse.yml +104 -0
- pelican_nlp/configuration_files/config_fluency.yml +108 -0
- pelican_nlp/configuration_files/config_general.yml +131 -0
- pelican_nlp/configuration_files/config_morteza.yml +103 -0
- pelican_nlp/praat/__init__.py +29 -0
- {pelican_nlp-0.1.0.dist-info → pelican_nlp-0.1.2.dist-info}/METADATA +14 -21
- pelican_nlp-0.1.2.dist-info/RECORD +75 -0
- pelican_nlp-0.1.0.dist-info/RECORD +0 -39
- {pelican_nlp-0.1.0.dist-info → pelican_nlp-0.1.2.dist-info}/WHEEL +0 -0
- {pelican_nlp-0.1.0.dist-info → pelican_nlp-0.1.2.dist-info}/licenses/LICENSE +0 -0
- {pelican_nlp-0.1.0.dist-info → pelican_nlp-0.1.2.dist-info}/top_level.txt +0 -0
pelican_nlp/Nils_backup/fluency/utils.py
@@ -0,0 +1,41 @@
+"""
+Utility functions for the VELAS fluency analysis pipeline.
+"""
+import os
+from pathlib import Path
+from typing import List, Dict
+
+def ensure_output_dir(path: str) -> None:
+    """
+    Create output directory if it doesn't exist.
+
+    Args:
+        path: Path to create. Can be a directory path or a file path.
+            If it's a file path, the directory containing the file will be created.
+            If it's a directory path, the directory itself will be created.
+    """
+    # If path ends with a file extension, get its directory
+    if '.' in os.path.basename(path):
+        dir_path = os.path.dirname(os.path.abspath(path))
+    else:
+        dir_path = os.path.abspath(path)
+
+    Path(dir_path).mkdir(parents=True, exist_ok=True)
+
+def validate_input_data(filepaths: Dict[str, str]) -> Dict[str, str]:
+    """
+    Check if input files exist and return any errors.
+
+    Args:
+        filepaths: Dictionary mapping file descriptions to their paths
+            e.g., {"behavioral_data": "/path/to/behav.csv"}
+
+    Returns:
+        Dictionary of error messages for missing files.
+        Empty dict if all files exist.
+    """
+    errors = {}
+    for description, path in filepaths.items():
+        if not os.path.exists(path):
+            errors[description] = f"File not found: {path}"
+    return errors
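For orientation, the hunk above adds two small helpers to pelican_nlp/Nils_backup/fluency/utils.py. A minimal usage sketch follows; the file paths below are hypothetical examples, not paths shipped with the package:

    from pelican_nlp.Nils_backup.fluency.utils import ensure_output_dir, validate_input_data

    # Create the directory part of an output file path before writing to it.
    ensure_output_dir("results/fluency/coherence_scores.csv")

    # Check that required inputs exist before running the pipeline;
    # returns an empty dict when all files are present.
    errors = validate_input_data({
        "behavioral_data": "data/behav.csv",
        "word_list": "data/animals_de.txt",
    })
    for name, message in errors.items():
        print(f"{name}: {message}")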
pelican_nlp/Nils_backup/speaker_diarization_Nils.py
@@ -0,0 +1,328 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Jul 3 14:21:27 2024
+
+@author: nilsl
+"""
+
+import re
+from collections import defaultdict, OrderedDict
+import os
+import re
+import chardet
+from collections import defaultdict, OrderedDict
+from nltk.tokenize import sent_tokenize, word_tokenize
+from nltk.corpus import stopwords
+from striprtf.striprtf import rtf_to_text
+import numpy as np
+from text_cleaner import TextCleaner
+
+protocol = {
+    "1": [
+        "täglichen Leben",
+        "ein paar Dinge",
+        "ein paar Sachen",
+        "über sich erzählen",
+    ],
+    "2": [
+        "etwas Wichtiges",
+        "Ihrer Kindheit",
+        "letzte Woche",
+        "in Ihrem Leben",
+        "beliebigen Zeit",
+        "zurückdenken",
+    ],
+    "3": [
+        "Ihre Gesundheit",
+        "Gesundheit sprechen",
+        "psychische Krankheit",
+        "seit Beginn",
+        "wie Sie sich gefühlt haben",
+    ],
+    "4": [
+        "erste Bild",
+        "Bild Nummer",
+        "zweite Bild",
+        "dritte Bild",
+        "auf dem Bild",
+        "Bild sehen",
+    ],
+    "5": [
+        "Geschichte zeigen",
+        "der Reihe nach",
+        "Bilder aus einer Geschichte",
+        "Bilder weg",
+        "ein paar Bilder",
+        "Bilderreihe",
+        "Geschichte aus Bilder",
+        "Geschichte aus Bildern",
+        "so viel Zeit",
+    ],
+    "6": [
+        "wiederkehrende Träume",
+        "solche Träume",
+        "Träume",
+        "wiederkehrenden Traum",
+    ],
+    "7": [
+        "einseitige",
+        "geschriebene Geschichte",
+        "Geschichte geschrieben",
+        "Blatt",
+        "eine Minute",
+        "eigenen Worten",
+        "eigene Worte",
+        "laut vorzulesen",
+        "laut vorlesen",
+    ],
+}
+
+def load_rtf_files(directory):
+    """Load and aggregate RTF files by patient and task."""
+    content_by_patient_and_task = defaultdict(lambda: defaultdict(str))
+
+    for filename in os.listdir(directory):
+        if filename.endswith(".rtf") and not filename.startswith("."):
+            file_path = os.path.join(directory, filename)
+            patient_id, task = parse_filename(filename)
+            base_task_name = (
+                "_".join(task.split("_")[:-1]) if "teil" in task else task
+            )
+            try:
+                rtf_content = read_rtf(file_path)
+                content_by_patient_and_task[patient_id][
+                    base_task_name
+                ] += f" {rtf_content}"
+            except:
+                continue
+
+    return content_by_patient_and_task
+
+
+def parse_filename(filename):
+    """Parse filename to extract patient ID and task."""
+    filename = filename[:-4]
+    parts = filename.split("_")
+    patient_id = parts[0]
+    task = parts[2] if len(parts) > 2 else "unknown"
+    part_info = parts[3] if len(parts) > 3 else ""
+    full_task = f"{task}_{part_info}" if part_info else task
+    return patient_id, full_task
+
+
+def read_rtf(file_path):
+    """Read RTF file and convert its content to plain text."""
+    with open(file_path, "rb") as file:
+        raw_data = file.read()
+        result = chardet.detect(raw_data)
+        encoding = result["encoding"]
+
+    with open(file_path, "r", encoding=encoding, errors="ignore") as file:
+        rtf_content = file.read()
+
+    return rtf_to_text(rtf_content)
+
+
+def split_into_lines(text):
+    """Split text into lines, filtering out empty lines and unwanted content."""
+    lines = text.splitlines()
+    return [
+        line
+        for line in lines
+        if line.strip() and ".mp3" not in line and "audio" not in line
+    ]
+
+
+def extract_and_remove_hashtags(text):
+    """Extract and remove hashtags from the text."""
+    pattern = r"#(.*?)#"
+    matches = re.findall(pattern, text)
+    text_without_hashtags = re.sub(pattern, "", text)
+    return text_without_hashtags, matches
+
+
+class Line:
+    """Represents a line of text with associated metadata."""
+
+    def __init__(self, speaker, text, line_number, tokenizer=None):
+        self.speaker = speaker
+        self.text = text
+        self.line_number = line_number
+        self.length_in_words = len(self.text.split())
+
+
+def process_lines(
+    pat_id,
+    task,
+    lines,
+    stopwords_list,
+    remove_numbers=False,
+
+):
+    """Process lines of text to create a Document object."""
+    document = Document(pat_id, task)
+    for i, line_text in enumerate(lines, start=1):
+        speaker = (
+            "Investigator"
+            if line_text.startswith(("I:", "I::", "I1:", "I2:"))
+            else "Subject"
+        )
+        cleaned_line = TextCleaner.clean_text_diarization_all(line_text, stopwords_list, remove_numbers)
+        if cleaned_line != ".":
+            line = Line(speaker, cleaned_line, i)
+            document.add_line(line)
+    return document
+
+
+def main(
+    transcripts_dict,
+    output_path,
+    task_dir,
+    protocol,
+    remove_stopwords,
+    remove_numbers,
+):
+    """Main function to process documents."""
+    if not os.path.exists(output_path):
+        os.makedirs(output_path)
+        print(f"Folder '{output_path}' was created.")
+    else:
+        print(f"Folder '{output_path}' already exists.")
+
+    if remove_stopwords:
+        stop_list = list(stopwords.words("german"))
+    else:
+        stop_list = []
+
+    for patient_id, tasks in transcripts_dict.items():
+        for task, rtf_content in tasks.items():
+
+            no_hashtags, hashtags = extract_and_remove_hashtags(
+                rtf_content
+            )
+
+            lines = split_into_lines(no_hashtags)
+            document = process_lines(
+                patient_id,
+                task,
+                lines,
+                stop_list,
+                remove_numbers=remove_numbers,
+            )
+
+            if task_dir == "discourse":
+                document.segment_task(protocol)
+
+            document.compile_texts_and_tags()
+            s_tok = np.array(document.word_tags) == "s"
+            words = np.array(document.words)
+
+            with open(output_path + f"{patient_id}_{task}.txt", 'w', encoding='utf-8') as file:
+                file.write(" ".join(words[s_tok]))
+
+
+#Document class from original diarization script
+class Document:
+    """Represents a document with multiple lines of text and associated metadata."""
+
+    def __init__(self, pat_id, task, lines=None):
+        self.pat_id = pat_id
+        self.task = task
+        self.lines = lines if lines is not None else []
+        self.has_segments = True if task == "discourse" else False
+        self.sections = {}
+        self.section_metrics = {}
+        self.length_in_lines = len(self.lines)
+        self.length_in_words = sum(line.length_in_words for line in self.lines)
+
+        # Initialize segments to a default value if no sections are applicable
+        self.segments = ['default'] * len(self.lines) if not self.has_segments else []
+
+    def add_line(self, line):
+        self.lines.append(line)
+        self.length_in_lines = len(self.lines)
+        self.length_in_words += line.length_in_words
+
+        if not self.has_segments:
+            self.segments.append('default')
+
+    def compile_texts_and_tags(self):
+        """Compile lists of all words and tokens with corresponding speaker tags."""
+        self.words, self.word_tags, = (
+            [],
+            [],
+        )
+        self.word_segments = []
+
+        for line, segment in zip(self.lines, self.segments):
+            line_words = line.text.split()
+            tag = "i" if line.speaker.lower() == "investigator" else "s"
+
+            self.word_segments.extend([segment] * len(line_words))
+            self.words.extend(line_words)
+            self.word_tags.extend([tag] * len(line_words))
+
+    def segment_task(self, protocol, cutoff=1):
+        """Segment the document based on the given protocol and store sections."""
+        if not self.has_segments:
+            return self.segments  # Return default segments if segmentation not applicable
+
+        patterns = {
+            section: re.compile(
+                "|".join(f"(?:\\b{re.escape(term)}\\b)" for term in terms),
+                re.IGNORECASE,
+            )
+            for section, terms in protocol.items()
+        }
+
+        match_scores = defaultdict(list)
+        for section, pattern in patterns.items():
+            for line_index, line in enumerate(self.lines):
+                if pattern.search(line.text):
+                    match_scores[section].append(line_index)
+
+        section_order = sorted(protocol.keys(), key=lambda x: int(x))
+        section_starts = OrderedDict()
+        last_index_used = -1
+
+        for section in section_order:
+            line_indices = match_scores[section]
+            valid_starts = [idx for idx in line_indices if idx > last_index_used and len(line_indices) >= cutoff]
+            if valid_starts:
+                start_line = min(valid_starts)
+                section_starts[section] = start_line
+                last_index_used = start_line
+
+        segment_names = ["1"] * len(self.lines)
+        current_section = None
+        for i in range(len(self.lines)):
+            if i in section_starts.values():
+                current_section = [sec for sec, start in section_starts.items() if start == i][0]
+            segment_names[i] = current_section if current_section else "default"
+
+        self.segments = segment_names
+        self.sections = self._create_sections(segment_names)
+        return segment_names
+
+    def _create_sections(self, segment_names):
+        sections = defaultdict(list)
+        for line, segment in zip(self.lines, segment_names):
+            sections[segment].append(line)
+        return sections
+
+
+if __name__ == "__main__":
+    for task_dir in ["interview"]:
+        transcripts_directory = os.path.join(
+            "..", "..", "..", "data", "language", task_dir, "transcripts")
+
+        transcripts_dict = load_rtf_files(transcripts_directory)
+        print(transcripts_dict.keys())
+
+        args = {
+            "remove_stopwords": False,
+            "remove_numbers": False,
+        }
+        output_path = f"/Users/nilsl/Documents/PUK/VELAS/data/language/{task_dir}/preprocessed_transcripts/"
+        main(transcripts_dict, output_path, task_dir, protocol, **args)
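The segmentation logic in Document.segment_task above turns each protocol section into one case-insensitive regex of word-bounded phrases and marks the first matching line that comes after the previous section's start. A minimal, self-contained sketch of that idea (ignoring the cutoff parameter, which defaults to 1), using made-up protocol entries and transcript lines rather than package data:

    import re

    protocol = {"4": ["erste Bild", "auf dem Bild"], "6": ["Träume"]}
    lines = ["I: Was sehen Sie auf dem Bild?", "S: Ein Haus.", "I: Haben Sie Träume?"]

    # One pattern per section: an alternation of escaped, word-bounded phrases.
    patterns = {
        section: re.compile("|".join(f"(?:\\b{re.escape(term)}\\b)" for term in terms), re.IGNORECASE)
        for section, terms in protocol.items()
    }

    # Walk sections in numeric order; a section can only start after the previous one.
    section_starts, last_start = {}, -1
    for section in sorted(protocol, key=int):
        hits = [i for i, line in enumerate(lines) if patterns[section].search(line)]
        valid = [i for i in hits if i > last_start]
        if valid:
            section_starts[section] = min(valid)
            last_start = section_starts[section]

    print(section_starts)  # {'4': 0, '6': 2}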