pelican-nlp 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff compares the contents of two publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions exactly as they appear in the public registry.
Files changed (43)
  1. pelican_nlp/Nils_backup/__init__.py +0 -0
  2. pelican_nlp/Nils_backup/extract_acoustic_features.py +274 -0
  3. pelican_nlp/Nils_backup/fluency/__init__.py +0 -0
  4. pelican_nlp/Nils_backup/fluency/aggregate_fluency_results.py +186 -0
  5. pelican_nlp/Nils_backup/fluency/behavioral_data.py +42 -0
  6. pelican_nlp/Nils_backup/fluency/check_duplicates.py +169 -0
  7. pelican_nlp/Nils_backup/fluency/coherence.py +653 -0
  8. pelican_nlp/Nils_backup/fluency/config.py +231 -0
  9. pelican_nlp/Nils_backup/fluency/main.py +182 -0
  10. pelican_nlp/Nils_backup/fluency/optimality_without_tsa.py +466 -0
  11. pelican_nlp/Nils_backup/fluency/plot_fluency.py +573 -0
  12. pelican_nlp/Nils_backup/fluency/plotting_utils.py +170 -0
  13. pelican_nlp/Nils_backup/fluency/questionnaires_data.py +43 -0
  14. pelican_nlp/Nils_backup/fluency/stats_fluency.py +930 -0
  15. pelican_nlp/Nils_backup/fluency/utils.py +41 -0
  16. pelican_nlp/Nils_backup/speaker_diarization_Nils.py +328 -0
  17. pelican_nlp/Nils_backup/transcription/__init__.py +0 -0
  18. pelican_nlp/Nils_backup/transcription/annotation_tool.py +1001 -0
  19. pelican_nlp/Nils_backup/transcription/annotation_tool_boundaries.py +1122 -0
  20. pelican_nlp/Nils_backup/transcription/annotation_tool_sandbox.py +985 -0
  21. pelican_nlp/Nils_backup/transcription/output/holmes_control_nova_all_outputs.json +7948 -0
  22. pelican_nlp/Nils_backup/transcription/test.json +1 -0
  23. pelican_nlp/Nils_backup/transcription/transcribe_audio.py +314 -0
  24. pelican_nlp/Nils_backup/transcription/transcribe_audio_chunked.py +695 -0
  25. pelican_nlp/Nils_backup/transcription/transcription.py +801 -0
  26. pelican_nlp/Nils_backup/transcription/transcription_gui.py +955 -0
  27. pelican_nlp/Nils_backup/transcription/word_boundaries.py +190 -0
  28. pelican_nlp/Silvia_files/Opensmile/opensmile_feature_extraction.py +66 -0
  29. pelican_nlp/Silvia_files/prosogram/prosogram.py +104 -0
  30. pelican_nlp/__init__.py +1 -1
  31. pelican_nlp/_version.py +1 -0
  32. pelican_nlp/configuration_files/config_audio.yml +150 -0
  33. pelican_nlp/configuration_files/config_discourse.yml +104 -0
  34. pelican_nlp/configuration_files/config_fluency.yml +108 -0
  35. pelican_nlp/configuration_files/config_general.yml +131 -0
  36. pelican_nlp/configuration_files/config_morteza.yml +103 -0
  37. pelican_nlp/praat/__init__.py +29 -0
  38. {pelican_nlp-0.1.1.dist-info → pelican_nlp-0.1.2.dist-info}/METADATA +4 -3
  39. pelican_nlp-0.1.2.dist-info/RECORD +75 -0
  40. pelican_nlp-0.1.1.dist-info/RECORD +0 -39
  41. {pelican_nlp-0.1.1.dist-info → pelican_nlp-0.1.2.dist-info}/WHEEL +0 -0
  42. {pelican_nlp-0.1.1.dist-info → pelican_nlp-0.1.2.dist-info}/licenses/LICENSE +0 -0
  43. {pelican_nlp-0.1.1.dist-info → pelican_nlp-0.1.2.dist-info}/top_level.txt +0 -0
pelican_nlp/Nils_backup/fluency/optimality_without_tsa.py
@@ -0,0 +1,466 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ """
+ Optimality Analysis for Verbal Fluency Tasks
+
+ This module computes optimality metrics for verbal fluency responses by analyzing:
+ 1. Semantic distances (using FastText embeddings)
+ 2. Phonetic distances (using Epitran/PanPhon)
+ 3. Orthographic distances (using edit distance)
+
+ For each response sequence, it:
+ 1. Computes distance matrices between consecutive items
+ 2. Calculates actual path costs vs random permutations
+ 3. Generates z-scores to measure path optimality
+ 4. Supports different shuffling modes for baseline comparison
+
+ Key Parameters:
+ - bootstrap: Number of random permutations (default: 10000)
+ - min_len: Minimum sequence length to analyze (default: 8)
+ - shuffle_mode: How to handle first/last items in permutations
+ """
+
+ import os
+ import re
+ from typing import Dict, List, Tuple, Any
+ import numpy as np
+ import pandas as pd
+ import fasttext.util
+ import editdistance
+ import panphon.distance
+ import scipy
+ import epitran
+ from concurrent.futures import ProcessPoolExecutor, as_completed
+ from config import OPTIMALITY_CONFIG
+ from utils import ensure_output_dir, validate_input_data
+
+ # Type aliases for clarity
+ DistanceMatrix = np.ndarray
+ EmbeddingDict = Dict[str, Dict[str, List[Any]]]
+
+ def load_data_dict(directory_path: str, lower: bool = False) -> Dict[str, List[str]]:
+     """
+     Load and preprocess verbal fluency responses from text files.
+
+     Args:
+         directory_path: Path to directory containing response files
+         lower: Whether to convert text to lowercase
+
+     Returns:
+         Dictionary mapping task IDs to lists of responses
+     """
+     data_dict = {}
+
+     for filename in os.listdir(directory_path):
+         if not filename.endswith(".txt"):
+             continue
+
+         key = filename.split(".")[0]
+         file_path = os.path.join(directory_path, filename)
+
+         with open(file_path, "r") as file:
+             content = file.read().strip()
+             words = [word.strip() for word in re.split(";|,", content) if word.strip()]
+
+             if lower:
+                 words = [word.lower() for word in words]
+
+             data_dict[key] = words
+             print(f"Loaded {filename}: {words}")
+
+     return data_dict
+
+
+ def embedded_data_dict(
+     data_dict: Dict[str, List[str]],
+     min_len: int,
+     epi: epitran.Epitran,
+     ft_model: Any
+ ) -> EmbeddingDict:
+     """
+     Convert word sequences into embeddings and phonetic transcriptions.
+
+     Args:
+         data_dict: Dictionary of word sequences
+         min_len: Minimum sequence length to process
+         epi: Epitran model for phonetic transcription
+         ft_model: FastText model for word embeddings
+
+     Returns:
+         Dictionary containing words, phonemes and embeddings for each sequence
+     """
+     embeddings_dict = {}
+
+     for key, words in data_dict.items():
+         if len(words) < min_len:
+             continue
+
+         embeddings_dict[key] = {
+             "words": words,
+             "phonemes": [epi.transliterate(word) for word in words],
+             "embeddings": [ft_model.get_word_vector(word) for word in words]
+         }
+
+     return embeddings_dict
+
+
+ def create_semantic_distance_matrix(embedding_list: List[np.ndarray]) -> DistanceMatrix:
+     """
+     Create a distance matrix using cosine distances between word embeddings.
+
+     Args:
+         embedding_list: List of word embeddings
+
+     Returns:
+         Matrix of pairwise cosine distances between embeddings
+     """
+     distances = scipy.spatial.distance.cdist(
+         np.array(embedding_list),
+         np.array(embedding_list),
+         'cosine'
+     )
+     np.fill_diagonal(distances, 0)
+     return distances
+
+
+ def create_phonetic_distance_matrix(
+     words: List[str],
+     distance_fun: callable,
+     norm_range: Tuple[float, float] = (0, 1)
+ ) -> DistanceMatrix:
+     """
+     Create a distance matrix using phonetic or orthographic distances.
+
+     Args:
+         words: List of words or phonetic transcriptions
+         distance_fun: Function to compute distance between two strings
+         norm_range: Range to normalize distances to
+
+     Returns:
+         Matrix of pairwise distances between words
+     """
+     num_words = len(words)
+     dist_matrix = np.zeros((num_words, num_words))
+
+     for i in range(num_words):
+         for j in range(i + 1, num_words):
+             # Normalize by max length to get relative distance
+             distance = distance_fun(words[i], words[j]) / max(len(words[i]), len(words[j]))
+             dist_matrix[i, j] = distance
+             dist_matrix[j, i] = distance
+
+     return dist_matrix
+
+
+ def calculate_total_distance_covered(dist_matrix: DistanceMatrix, order: np.ndarray) -> float:
+     """
+     Calculate total distance covered by a path through items.
+
+     Args:
+         dist_matrix: Matrix of pairwise distances
+         order: Sequence of indices defining the path
+
+     Returns:
+         Total distance covered by the path
+     """
+     distances = dist_matrix[order[:-1], order[1:]]
+     return float(np.sum(distances))
+
+
+ def average_similarity(matrix: DistanceMatrix) -> float:
+     """
+     Calculate average similarity between all pairs of items.
+
+     Args:
+         matrix: Matrix of pairwise distances/similarities
+
+     Returns:
+         Average similarity across all pairs
+     """
+     n = matrix.shape[0]
+
+     # Only count upper triangle to avoid double counting
+     upper_tri = np.triu(matrix, k=1)
+     total = np.sum(upper_tri)
+     count = (n * (n - 1)) // 2  # Number of pairs
+
+     return float(total / count) if count > 0 else 0.0
+
+
+ def get_shuffled_order(n: int, shuffle_mode: str, seed: int) -> np.ndarray:
+     """
+     Generate shuffled sequence based on specified mode.
+
+     Args:
+         n: Length of sequence
+         shuffle_mode: How to handle first/last items:
+             - include0_includeN: Shuffle all items
+             - exclude0_includeN: Keep first item fixed
+             - exclude0_excludeN: Keep first and last items fixed
+         seed: Random seed for reproducibility
+
+     Returns:
+         Shuffled sequence of indices
+     """
+     np.random.seed(seed)
+
+     if shuffle_mode == "include0_includeN":
+         order = np.arange(n)
+         np.random.shuffle(order)
+     elif shuffle_mode == "exclude0_includeN":
+         rest = np.arange(1, n)
+         np.random.shuffle(rest)
+         order = np.concatenate(([0], rest))
+     elif shuffle_mode == "exclude0_excludeN":
+         middle = np.arange(1, n-1)
+         np.random.shuffle(middle)
+         order = np.concatenate(([0], middle, [n-1]))
+     else:
+         raise ValueError(f"Invalid shuffle mode: {shuffle_mode}")
+
+     return order
+
+
+ def analyze_optimality_transcript(
+     key: str,
+     embeddings: Dict[str, List[Any]],
+     mode: str,
+     min_len: int,
+     bootstrap: int,
+     phon_dist: Any,
+     shuffle_mode: str
+ ) -> List[Dict[str, Any]]:
+     """
+     Analyze optimality of a single transcript using bootstrap permutations.
+
+     Args:
+         key: Identifier for the transcript
+         embeddings: Dictionary containing words, phonemes, and embeddings
+         mode: Analysis mode ('semantic', 'phonetic', or 'orthographic')
+         min_len: Window size for analysis
+         bootstrap: Number of random permutations
+         phon_dist: PanPhon distance calculator
+         shuffle_mode: How to handle permutations
+
+     Returns:
+         List of results for each window position
+     """
+     answer_res = []
+     answer_len = len(embeddings["words"])
+
+     # Analyze each possible window position
+     for i in range((answer_len - min_len) + 1):
+         # Get window of items to analyze
+         if mode == "semantic":
+             window = embeddings["embeddings"][i:i + min_len]
+             dist_matrix = create_semantic_distance_matrix(window)
+         elif mode == "orthographic":
+             window = embeddings["words"][i:i + min_len]
+             dist_matrix = create_phonetic_distance_matrix(window, editdistance.eval)
+         elif mode == "phonetic":
+             window = embeddings["phonemes"][i:i + min_len]
+             dist_matrix = create_phonetic_distance_matrix(window, phon_dist.feature_edit_distance)
+         else:
+             raise ValueError(f"Invalid mode: {mode}")
+
+         # Calculate costs for actual sequence and permutations
+         perm_costs = []
+         for j in range(bootstrap):
+             order = (np.arange(len(window)) if j == 0
+                      else get_shuffled_order(len(window), shuffle_mode, j))
+             cost = calculate_total_distance_covered(dist_matrix, order)
+             perm_costs.append(cost)
+
+             if j == 0:
+                 all_pairs_avg = average_similarity(dist_matrix)
+
+         # Normalize costs by number of edges
+         costs_per_edge = np.array(perm_costs) / (min_len - 1)
+         true_cost = costs_per_edge[0]
+
+         # Store results for this window
+         window_results = {
+             "analysis_mode": mode,
+             "study_id": key.split("_")[1],
+             "task": key.split("_")[-3],
+             "sub_task": key.split("_")[-1],
+             "window_index": i,
+             "all_pairs_average": all_pairs_avg,
+             "actual_dist": true_cost,
+             "average_dist": np.mean(costs_per_edge[1:]),
+             "std_dist": np.std(costs_per_edge[1:])
+         }
+         answer_res.append(window_results)
+
+     return answer_res
+
+
+ def process_key(
+     key: str,
+     embeddings: Dict[str, List[Any]],
+     mode: str,
+     min_len: int,
+     bootstrap: int,
+     phon_dist: Any,
+     shuffle_mode: str
+ ) -> List[Dict[str, Any]]:
+     """Wrapper to process a single key with progress printing."""
+     print(f"Processing {key}")
+     return analyze_optimality_transcript(
+         key, embeddings, mode, min_len, bootstrap, phon_dist, shuffle_mode
+     )
+
+
+ def process_data_parallel(
+     embeddings_dict: EmbeddingDict,
+     modes: List[str],
+     min_len: int,
+     bootstrap: int,
+     phon_dist: Any,
+     shuffle_mode: str,
+     max_workers: int = 16
+ ) -> Dict[str, List[Dict[str, Any]]]:
+     """
+     Process data in parallel using ProcessPoolExecutor.
+
+     Args:
+         embeddings_dict: Dictionary of embeddings to process
+         modes: List of analysis modes to run
+         min_len: Window size for analysis
+         bootstrap: Number of permutations
+         phon_dist: PanPhon distance calculator
+         shuffle_mode: Permutation mode
+         max_workers: Maximum number of parallel workers
+
+     Returns:
+         Dictionary mapping keys to analysis results
+     """
+     results_dict = {}
+
+     with ProcessPoolExecutor(max_workers=max_workers) as executor:
+         futures = {}
+         for mode in modes:
+             for key, embeddings in embeddings_dict.items():
+                 future = executor.submit(
+                     process_key,
+                     f"{mode}_{key}",
+                     embeddings,
+                     mode,
+                     min_len,
+                     bootstrap,
+                     phon_dist,
+                     shuffle_mode
+                 )
+                 futures[future] = (key, mode)
+
+         for future in as_completed(futures):
+             key, mode = futures[future]
+             try:
+                 result = future.result()
+                 results_dict[f"{mode}_{key}"] = result
+                 print(f"Completed processing {mode}_{key}")
+             except Exception as exc:
+                 print(f"{key} generated an exception: {exc}")
+
+     return results_dict
+
+
+ def process_data_sequential(
+     embeddings_dict: EmbeddingDict,
+     modes: List[str],
+     min_len: int,
+     bootstrap: int,
+     phon_dist: Any,
+     shuffle_mode: str
+ ) -> Dict[str, List[Dict[str, Any]]]:
+     """Process data sequentially (for debugging/testing)."""
+     results_dict = {}
+     for key, embeddings in embeddings_dict.items():
+         for mode in modes:
+             result = analyze_optimality_transcript(
+                 key, embeddings, mode, min_len, bootstrap, phon_dist, shuffle_mode
+             )
+             results_dict[f"{key}_{mode}"] = result
+     return results_dict
+
+
+ def save_results(results_dict: Dict[str, List[Dict[str, Any]]], output_path: str) -> None:
+     """Save results to CSV file."""
+     results_list = []
+     for lines in results_dict.values():
+         results_list.extend(lines)
+
+     pd.DataFrame(results_list).to_csv(output_path)
+     print(f"Results saved to {output_path}")
+
+
+ def main():
+     """Main execution function."""
+     # Validate input and output paths
+     paths_to_validate = {
+         "fasttext_model": OPTIMALITY_CONFIG["model"]["fasttext_path"],
+         "data_directory": OPTIMALITY_CONFIG["paths"]["data_dir"]
+     }
+
+     # Check input paths
+     path_errors = validate_input_data(paths_to_validate)
+     if path_errors:
+         for desc, error in path_errors.items():
+             print(f"Error with {desc}: {error}")
+         raise FileNotFoundError("Required files/directories not found")
+
+     # Ensure results directory exists
+     ensure_output_dir(OPTIMALITY_CONFIG["paths"]["results_dir"])
+
+     # Initialize models
+     ft_model = fasttext.load_model(OPTIMALITY_CONFIG["model"]["fasttext_path"])
+     epi = epitran.Epitran(OPTIMALITY_CONFIG["model"]["language_code"])
+     phon_dist = panphon.distance.Distance()
+
+     # Load and preprocess data
+     data_dict = load_data_dict(
+         OPTIMALITY_CONFIG["paths"]["data_dir"],
+         OPTIMALITY_CONFIG["preprocessing"]["lower"]
+     )
+
+     # Process each window size and shuffle mode
+     for shuffle_mode in OPTIMALITY_CONFIG["shuffle_modes"]:
+         for min_len in OPTIMALITY_CONFIG["window_sizes"]:
+             print(f"\nProcessing window size {min_len} with mode {shuffle_mode}")
+
+             # Prepare embeddings
+             embeddings_dict = embedded_data_dict(data_dict, min_len, epi, ft_model)
+
+             # Process data
+             results_dict = (
+                 process_data_parallel(
+                     embeddings_dict,
+                     OPTIMALITY_CONFIG["modes"],
+                     min_len,
+                     OPTIMALITY_CONFIG["bootstrap"],
+                     phon_dist,
+                     shuffle_mode,
+                     OPTIMALITY_CONFIG["max_workers"]
+                 ) if OPTIMALITY_CONFIG["parallelize"] else
+                 process_data_sequential(
+                     embeddings_dict,
+                     OPTIMALITY_CONFIG["modes"],
+                     min_len,
+                     OPTIMALITY_CONFIG["bootstrap"],
+                     phon_dist,
+                     shuffle_mode
+                 )
+             )
+
+             # Save results
+             results_path = os.path.join(
+                 OPTIMALITY_CONFIG["paths"]["results_dir"],
+                 f"optimality_{OPTIMALITY_CONFIG['bootstrap']}_window_{min_len}_{shuffle_mode}_"
+                 f"{'lower' if OPTIMALITY_CONFIG['preprocessing']['lower'] else 'upper'}.csv"
+             )
+             save_results(results_dict, results_path)
+
+     return True
+
+ if __name__ == "__main__":
+     main()
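
A note on reading the new optimality script: for each analysis window it stores the per-edge cost of the actual response order (actual_dist) together with the mean and standard deviation of per-edge costs over the shuffled orderings (average_dist, std_dist). The z-score mentioned in the module docstring is not written by this file; it would be derived from those columns in a downstream step. Below is a minimal sketch of that step, assuming pandas and the CSV layout produced by save_results above; the file name and the z_score/summary names are illustrative only, since the real path is built from OPTIMALITY_CONFIG.

import pandas as pd

# Example output file name; the actual name is assembled in main() from
# OPTIMALITY_CONFIG (bootstrap count, window size, shuffle mode, casing).
df = pd.read_csv("optimality_10000_window_8_exclude0_excludeN_lower.csv")

# Standardize the actual path cost against the permutation baseline.
# Negative values indicate the produced order covers less distance per edge
# than a random ordering of the same items (a more "optimal" path).
df["z_score"] = (df["actual_dist"] - df["average_dist"]) / df["std_dist"]

# Collapse window positions to one value per transcript and analysis mode.
summary = (
    df.groupby(["analysis_mode", "study_id", "task", "sub_task"])["z_score"]
    .mean()
    .reset_index()
)
print(summary.head())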