pelican-nlp 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff compares publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between versions as they appear in their respective public registries.
Files changed (43)
  1. pelican_nlp/Nils_backup/__init__.py +0 -0
  2. pelican_nlp/Nils_backup/extract_acoustic_features.py +274 -0
  3. pelican_nlp/Nils_backup/fluency/__init__.py +0 -0
  4. pelican_nlp/Nils_backup/fluency/aggregate_fluency_results.py +186 -0
  5. pelican_nlp/Nils_backup/fluency/behavioral_data.py +42 -0
  6. pelican_nlp/Nils_backup/fluency/check_duplicates.py +169 -0
  7. pelican_nlp/Nils_backup/fluency/coherence.py +653 -0
  8. pelican_nlp/Nils_backup/fluency/config.py +231 -0
  9. pelican_nlp/Nils_backup/fluency/main.py +182 -0
  10. pelican_nlp/Nils_backup/fluency/optimality_without_tsa.py +466 -0
  11. pelican_nlp/Nils_backup/fluency/plot_fluency.py +573 -0
  12. pelican_nlp/Nils_backup/fluency/plotting_utils.py +170 -0
  13. pelican_nlp/Nils_backup/fluency/questionnaires_data.py +43 -0
  14. pelican_nlp/Nils_backup/fluency/stats_fluency.py +930 -0
  15. pelican_nlp/Nils_backup/fluency/utils.py +41 -0
  16. pelican_nlp/Nils_backup/speaker_diarization_Nils.py +328 -0
  17. pelican_nlp/Nils_backup/transcription/__init__.py +0 -0
  18. pelican_nlp/Nils_backup/transcription/annotation_tool.py +1001 -0
  19. pelican_nlp/Nils_backup/transcription/annotation_tool_boundaries.py +1122 -0
  20. pelican_nlp/Nils_backup/transcription/annotation_tool_sandbox.py +985 -0
  21. pelican_nlp/Nils_backup/transcription/output/holmes_control_nova_all_outputs.json +7948 -0
  22. pelican_nlp/Nils_backup/transcription/test.json +1 -0
  23. pelican_nlp/Nils_backup/transcription/transcribe_audio.py +314 -0
  24. pelican_nlp/Nils_backup/transcription/transcribe_audio_chunked.py +695 -0
  25. pelican_nlp/Nils_backup/transcription/transcription.py +801 -0
  26. pelican_nlp/Nils_backup/transcription/transcription_gui.py +955 -0
  27. pelican_nlp/Nils_backup/transcription/word_boundaries.py +190 -0
  28. pelican_nlp/Silvia_files/Opensmile/opensmile_feature_extraction.py +66 -0
  29. pelican_nlp/Silvia_files/prosogram/prosogram.py +104 -0
  30. pelican_nlp/__init__.py +1 -1
  31. pelican_nlp/_version.py +1 -0
  32. pelican_nlp/configuration_files/config_audio.yml +150 -0
  33. pelican_nlp/configuration_files/config_discourse.yml +104 -0
  34. pelican_nlp/configuration_files/config_fluency.yml +108 -0
  35. pelican_nlp/configuration_files/config_general.yml +131 -0
  36. pelican_nlp/configuration_files/config_morteza.yml +103 -0
  37. pelican_nlp/praat/__init__.py +29 -0
  38. {pelican_nlp-0.1.1.dist-info → pelican_nlp-0.1.3.dist-info}/METADATA +15 -14
  39. pelican_nlp-0.1.3.dist-info/RECORD +75 -0
  40. pelican_nlp-0.1.1.dist-info/RECORD +0 -39
  41. {pelican_nlp-0.1.1.dist-info → pelican_nlp-0.1.3.dist-info}/WHEEL +0 -0
  42. {pelican_nlp-0.1.1.dist-info → pelican_nlp-0.1.3.dist-info}/licenses/LICENSE +0 -0
  43. {pelican_nlp-0.1.1.dist-info → pelican_nlp-0.1.3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,41 @@
+ """
+ Utility functions for the VELAS fluency analysis pipeline.
+ """
+ import os
+ from pathlib import Path
+ from typing import List, Dict
+
+ def ensure_output_dir(path: str) -> None:
+     """
+     Create output directory if it doesn't exist.
+
+     Args:
+         path: Path to create. Can be a directory path or a file path.
+             If it's a file path, the directory containing the file will be created.
+             If it's a directory path, the directory itself will be created.
+     """
+     # If path ends with a file extension, get its directory
+     if '.' in os.path.basename(path):
+         dir_path = os.path.dirname(os.path.abspath(path))
+     else:
+         dir_path = os.path.abspath(path)
+
+     Path(dir_path).mkdir(parents=True, exist_ok=True)
+
+ def validate_input_data(filepaths: Dict[str, str]) -> Dict[str, str]:
+     """
+     Check if input files exist and return any errors.
+
+     Args:
+         filepaths: Dictionary mapping file descriptions to their paths
+             e.g., {"behavioral_data": "/path/to/behav.csv"}
+
+     Returns:
+         Dictionary of error messages for missing files.
+         Empty dict if all files exist.
+     """
+     errors = {}
+     for description, path in filepaths.items():
+         if not os.path.exists(path):
+             errors[description] = f"File not found: {path}"
+     return errors
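The hunk above (+41 lines) corresponds to pelican_nlp/Nils_backup/fluency/utils.py (entry 15 in the file list). A minimal usage sketch for the two helpers, assuming the module import path follows the file layout shown above; the input and output paths are hypothetical placeholders, not files shipped with the package:

from pelican_nlp.Nils_backup.fluency.utils import ensure_output_dir, validate_input_data

# Hypothetical input files, for illustration only.
inputs = {"behavioral_data": "data/behavioral/behav.csv"}

missing = validate_input_data(inputs)  # returns {} when every file exists
if missing:
    raise FileNotFoundError(missing)

# Creates results/fluency/ because the path ends in a file name.
ensure_output_dir("results/fluency/summary.csv")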
@@ -0,0 +1,328 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ """
+ Created on Wed Jul 3 14:21:27 2024
+
+ @author: nilsl
+ """
+
+ import re
+ from collections import defaultdict, OrderedDict
+ import os
+ import re
+ import chardet
+ from collections import defaultdict, OrderedDict
+ from nltk.tokenize import sent_tokenize, word_tokenize
+ from nltk.corpus import stopwords
+ from striprtf.striprtf import rtf_to_text
+ import numpy as np
+ from text_cleaner import TextCleaner
+
+ protocol = {
+     "1": [
+         "täglichen Leben",
+         "ein paar Dinge",
+         "ein paar Sachen",
+         "über sich erzählen",
+     ],
+     "2": [
+         "etwas Wichtiges",
+         "Ihrer Kindheit",
+         "letzte Woche",
+         "in Ihrem Leben",
+         "beliebigen Zeit",
+         "zurückdenken",
+     ],
+     "3": [
+         "Ihre Gesundheit",
+         "Gesundheit sprechen",
+         "psychische Krankheit",
+         "seit Beginn",
+         "wie Sie sich gefühlt haben",
+     ],
+     "4": [
+         "erste Bild",
+         "Bild Nummer",
+         "zweite Bild",
+         "dritte Bild",
+         "auf dem Bild",
+         "Bild sehen",
+     ],
+     "5": [
+         "Geschichte zeigen",
+         "der Reihe nach",
+         "Bilder aus einer Geschichte",
+         "Bilder weg",
+         "ein paar Bilder",
+         "Bilderreihe",
+         "Geschichte aus Bilder",
+         "Geschichte aus Bildern",
+         "so viel Zeit",
+     ],
+     "6": [
+         "wiederkehrende Träume",
+         "solche Träume",
+         "Träume",
+         "wiederkehrenden Traum",
+     ],
+     "7": [
+         "einseitige",
+         "geschriebene Geschichte",
+         "Geschichte geschrieben",
+         "Blatt",
+         "eine Minute",
+         "eigenen Worten",
+         "eigene Worte",
+         "laut vorzulesen",
+         "laut vorlesen",
+     ],
+ }
+
+ def load_rtf_files(directory):
+     """Load and aggregate RTF files by patient and task."""
+     content_by_patient_and_task = defaultdict(lambda: defaultdict(str))
+
+     for filename in os.listdir(directory):
+         if filename.endswith(".rtf") and not filename.startswith("."):
+             file_path = os.path.join(directory, filename)
+             patient_id, task = parse_filename(filename)
+             base_task_name = (
+                 "_".join(task.split("_")[:-1]) if "teil" in task else task
+             )
+             try:
+                 rtf_content = read_rtf(file_path)
+                 content_by_patient_and_task[patient_id][
+                     base_task_name
+                 ] += f" {rtf_content}"
+             except:
+                 continue
+
+     return content_by_patient_and_task
+
+
+ def parse_filename(filename):
+     """Parse filename to extract patient ID and task."""
+     filename = filename[:-4]
+     parts = filename.split("_")
+     patient_id = parts[0]
+     task = parts[2] if len(parts) > 2 else "unknown"
+     part_info = parts[3] if len(parts) > 3 else ""
+     full_task = f"{task}_{part_info}" if part_info else task
+     return patient_id, full_task
+
+
+ def read_rtf(file_path):
+     """Read RTF file and convert its content to plain text."""
+     with open(file_path, "rb") as file:
+         raw_data = file.read()
+         result = chardet.detect(raw_data)
+         encoding = result["encoding"]
+
+     with open(file_path, "r", encoding=encoding, errors="ignore") as file:
+         rtf_content = file.read()
+
+     return rtf_to_text(rtf_content)
+
+
+ def split_into_lines(text):
+     """Split text into lines, filtering out empty lines and unwanted content."""
+     lines = text.splitlines()
+     return [
+         line
+         for line in lines
+         if line.strip() and ".mp3" not in line and "audio" not in line
+     ]
+
+
+ def extract_and_remove_hashtags(text):
+     """Extract and remove hashtags from the text."""
+     pattern = r"#(.*?)#"
+     matches = re.findall(pattern, text)
+     text_without_hashtags = re.sub(pattern, "", text)
+     return text_without_hashtags, matches
+
+
+ class Line:
+     """Represents a line of text with associated metadata."""
+
+     def __init__(self, speaker, text, line_number, tokenizer=None):
+         self.speaker = speaker
+         self.text = text
+         self.line_number = line_number
+         self.length_in_words = len(self.text.split())
+
+
+ def process_lines(
+     pat_id,
+     task,
+     lines,
+     stopwords_list,
+     remove_numbers=False,
+
+ ):
+     """Process lines of text to create a Document object."""
+     document = Document(pat_id, task)
+     for i, line_text in enumerate(lines, start=1):
+         speaker = (
+             "Investigator"
+             if line_text.startswith(("I:", "I::", "I1:", "I2:"))
+             else "Subject"
+         )
+         cleaned_line = TextCleaner.clean_text_diarization_all(line_text, stopwords_list, remove_numbers)
+         if cleaned_line != ".":
+             line = Line(speaker, cleaned_line, i)
+             document.add_line(line)
+     return document
+
+
+ def main(
+     transcripts_dict,
+     output_path,
+     task_dir,
+     protocol,
+     remove_stopwords,
+     remove_numbers,
+ ):
+     """Main function to process documents."""
+     if not os.path.exists(output_path):
+         os.makedirs(output_path)
+         print(f"Folder '{output_path}' was created.")
+     else:
+         print(f"Folder '{output_path}' already exists.")
+
+     if remove_stopwords:
+         stop_list = list(stopwords.words("german"))
+     else:
+         stop_list = []
+
+     for patient_id, tasks in transcripts_dict.items():
+         for task, rtf_content in tasks.items():
+
+             no_hashtags, hashtags = extract_and_remove_hashtags(
+                 rtf_content
+             )
+
+             lines = split_into_lines(no_hashtags)
+             document = process_lines(
+                 patient_id,
+                 task,
+                 lines,
+                 stop_list,
+                 remove_numbers=remove_numbers,
+             )
+
+             if task_dir == "discourse":
+                 document.segment_task(protocol)
+
+             document.compile_texts_and_tags()
+             s_tok = np.array(document.word_tags) == "s"
+             words = np.array(document.words)
+
+             with open(output_path + f"{patient_id}_{task}.txt", 'w', encoding='utf-8') as file:
+                 file.write(" ".join(words[s_tok]))
+
+
+ #Document class from original diarization script
+ class Document:
+     """Represents a document with multiple lines of text and associated metadata."""
+
+     def __init__(self, pat_id, task, lines=None):
+         self.pat_id = pat_id
+         self.task = task
+         self.lines = lines if lines is not None else []
+         self.has_segments = True if task == "discourse" else False
+         self.sections = {}
+         self.section_metrics = {}
+         self.length_in_lines = len(self.lines)
+         self.length_in_words = sum(line.length_in_words for line in self.lines)
+
+         # Initialize segments to a default value if no sections are applicable
+         self.segments = ['default'] * len(self.lines) if not self.has_segments else []
+
+     def add_line(self, line):
+         self.lines.append(line)
+         self.length_in_lines = len(self.lines)
+         self.length_in_words += line.length_in_words
+
+         if not self.has_segments:
+             self.segments.append('default')
+
+     def compile_texts_and_tags(self):
+         """Compile lists of all words and tokens with corresponding speaker tags."""
+         self.words, self.word_tags, = (
+             [],
+             [],
+         )
+         self.word_segments = []
+
+         for line, segment in zip(self.lines, self.segments):
+             line_words = line.text.split()
+             tag = "i" if line.speaker.lower() == "investigator" else "s"
+
+             self.word_segments.extend([segment] * len(line_words))
+             self.words.extend(line_words)
+             self.word_tags.extend([tag] * len(line_words))
+
+     def segment_task(self, protocol, cutoff=1):
+         """Segment the document based on the given protocol and store sections."""
+         if not self.has_segments:
+             return self.segments  # Return default segments if segmentation not applicable
+
+         patterns = {
+             section: re.compile(
+                 "|".join(f"(?:\\b{re.escape(term)}\\b)" for term in terms),
+                 re.IGNORECASE,
+             )
+             for section, terms in protocol.items()
+         }
+
+         match_scores = defaultdict(list)
+         for section, pattern in patterns.items():
+             for line_index, line in enumerate(self.lines):
+                 if pattern.search(line.text):
+                     match_scores[section].append(line_index)
+
+         section_order = sorted(protocol.keys(), key=lambda x: int(x))
+         section_starts = OrderedDict()
+         last_index_used = -1
+
+         for section in section_order:
+             line_indices = match_scores[section]
+             valid_starts = [idx for idx in line_indices if idx > last_index_used and len(line_indices) >= cutoff]
+             if valid_starts:
+                 start_line = min(valid_starts)
+                 section_starts[section] = start_line
+                 last_index_used = start_line
+
+         segment_names = ["1"] * len(self.lines)
+         current_section = None
+         for i in range(len(self.lines)):
+             if i in section_starts.values():
+                 current_section = [sec for sec, start in section_starts.items() if start == i][0]
+             segment_names[i] = current_section if current_section else "default"
+
+         self.segments = segment_names
+         self.sections = self._create_sections(segment_names)
+         return segment_names
+
+     def _create_sections(self, segment_names):
+         sections = defaultdict(list)
+         for line, segment in zip(self.lines, segment_names):
+             sections[segment].append(line)
+         return sections
+
+
+ if __name__ == "__main__":
+     for task_dir in ["interview"]:
+         transcripts_directory = os.path.join(
+             "..", "..", "..", "data", "language", task_dir, "transcripts")
+
+         transcripts_dict = load_rtf_files(transcripts_directory)
+         print(transcripts_dict.keys())
+
+         args = {
+             "remove_stopwords": False,
+             "remove_numbers": False,
+         }
+         output_path = f"/Users/nilsl/Documents/PUK/VELAS/data/language/{task_dir}/preprocessed_transcripts/"
+         main(transcripts_dict, output_path, task_dir, protocol, **args)
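The hunk above (+328 lines) corresponds to pelican_nlp/Nils_backup/speaker_diarization_Nils.py (entry 16 in the file list). For orientation, a minimal sketch of how the Line and Document classes interact, assuming both classes from this file are in scope; the speaker labels and sample text are illustrative only:

# task != "discourse", so no protocol-based segmentation is applied
doc = Document("sub-01", "fluency")
doc.add_line(Line("Investigator", "bitte beginnen Sie", 1))
doc.add_line(Line("Subject", "Hund Katze Maus", 2))
doc.compile_texts_and_tags()

print(doc.words)      # ['bitte', 'beginnen', 'Sie', 'Hund', 'Katze', 'Maus']
print(doc.word_tags)  # ['i', 'i', 'i', 's', 's', 's']
# main() above keeps only the words tagged "s" (the subject's speech) when writing each output file.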
File without changes