pelican-nlp 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. pelican_nlp/Nils_backup/__init__.py +0 -0
  2. pelican_nlp/Nils_backup/extract_acoustic_features.py +274 -0
  3. pelican_nlp/Nils_backup/fluency/__init__.py +0 -0
  4. pelican_nlp/Nils_backup/fluency/aggregate_fluency_results.py +186 -0
  5. pelican_nlp/Nils_backup/fluency/behavioral_data.py +42 -0
  6. pelican_nlp/Nils_backup/fluency/check_duplicates.py +169 -0
  7. pelican_nlp/Nils_backup/fluency/coherence.py +653 -0
  8. pelican_nlp/Nils_backup/fluency/config.py +231 -0
  9. pelican_nlp/Nils_backup/fluency/main.py +182 -0
  10. pelican_nlp/Nils_backup/fluency/optimality_without_tsa.py +466 -0
  11. pelican_nlp/Nils_backup/fluency/plot_fluency.py +573 -0
  12. pelican_nlp/Nils_backup/fluency/plotting_utils.py +170 -0
  13. pelican_nlp/Nils_backup/fluency/questionnaires_data.py +43 -0
  14. pelican_nlp/Nils_backup/fluency/stats_fluency.py +930 -0
  15. pelican_nlp/Nils_backup/fluency/utils.py +41 -0
  16. pelican_nlp/Nils_backup/speaker_diarization_Nils.py +328 -0
  17. pelican_nlp/Nils_backup/transcription/__init__.py +0 -0
  18. pelican_nlp/Nils_backup/transcription/annotation_tool.py +1001 -0
  19. pelican_nlp/Nils_backup/transcription/annotation_tool_boundaries.py +1122 -0
  20. pelican_nlp/Nils_backup/transcription/annotation_tool_sandbox.py +985 -0
  21. pelican_nlp/Nils_backup/transcription/output/holmes_control_nova_all_outputs.json +7948 -0
  22. pelican_nlp/Nils_backup/transcription/test.json +1 -0
  23. pelican_nlp/Nils_backup/transcription/transcribe_audio.py +314 -0
  24. pelican_nlp/Nils_backup/transcription/transcribe_audio_chunked.py +695 -0
  25. pelican_nlp/Nils_backup/transcription/transcription.py +801 -0
  26. pelican_nlp/Nils_backup/transcription/transcription_gui.py +955 -0
  27. pelican_nlp/Nils_backup/transcription/word_boundaries.py +190 -0
  28. pelican_nlp/Silvia_files/Opensmile/opensmile_feature_extraction.py +66 -0
  29. pelican_nlp/Silvia_files/prosogram/prosogram.py +104 -0
  30. pelican_nlp/__init__.py +1 -1
  31. pelican_nlp/_version.py +1 -0
  32. pelican_nlp/configuration_files/config_audio.yml +150 -0
  33. pelican_nlp/configuration_files/config_discourse.yml +104 -0
  34. pelican_nlp/configuration_files/config_fluency.yml +108 -0
  35. pelican_nlp/configuration_files/config_general.yml +131 -0
  36. pelican_nlp/configuration_files/config_morteza.yml +103 -0
  37. pelican_nlp/praat/__init__.py +29 -0
  38. {pelican_nlp-0.1.1.dist-info → pelican_nlp-0.1.3.dist-info}/METADATA +15 -14
  39. pelican_nlp-0.1.3.dist-info/RECORD +75 -0
  40. pelican_nlp-0.1.1.dist-info/RECORD +0 -39
  41. {pelican_nlp-0.1.1.dist-info → pelican_nlp-0.1.3.dist-info}/WHEEL +0 -0
  42. {pelican_nlp-0.1.1.dist-info → pelican_nlp-0.1.3.dist-info}/licenses/LICENSE +0 -0
  43. {pelican_nlp-0.1.1.dist-info → pelican_nlp-0.1.3.dist-info}/top_level.txt +0 -0
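
The largest addition in this release is pelican_nlp/Nils_backup/fluency/coherence.py (+653 lines), shown in the hunk below. Its module docstring describes a sliding-window coherence measure: for every window of n consecutive words, all pairwise cosine similarities between word embeddings are computed, and the per-window means and standard deviations are then aggregated across windows. The following is an illustrative sketch of that aggregation only, not code from the package; it uses toy 3-dimensional vectors in place of the real 300-dimensional FastText embeddings:

    # Illustrative sketch -- mirrors the windowed mean/std aggregation described
    # in coherence.py's docstring, with toy vectors standing in for FastText embeddings.
    import numpy as np
    from itertools import combinations
    from scipy.spatial.distance import cosine

    def windowed_coherence(vectors, window_size):
        window_means, window_stds = [], []
        for i in range(len(vectors) - window_size + 1):
            window = vectors[i:i + window_size]
            # All pairwise cosine similarities within the window
            sims = [1 - cosine(a, b) for a, b in combinations(window, 2)]
            window_means.append(np.mean(sims))
            window_stds.append(np.std(sims))
        # Aggregate across windows: mean/std of window means, mean/std of window stds
        return (np.mean(window_means), np.std(window_means),
                np.mean(window_stds), np.std(window_stds))

    toy_vectors = [np.array(v, dtype=float) for v in
                   [[1, 0, 0], [0.9, 0.1, 0], [0, 1, 0], [0, 0.8, 0.2]]]
    print(windowed_coherence(toy_vectors, window_size=2))
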
@@ -0,0 +1,653 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ """
+ Compute semantic and phonetic coherence metrics for VELAS fluency tasks.
+
+ Algorithm Overview:
+ ------------------
+ The script implements two main coherence metrics:
+
+ 1. Semantic Coherence:
+    - Uses FastText word embeddings to convert words to 300d vectors
+    - Calculates cosine similarity between word pairs
+    - Analyzes in sliding windows (e.g., 2-word, 8-word)
+    - For each window:
+      a) Creates similarity matrix of all word pairs
+      b) Computes mean and std of similarities
+      c) Aggregates across windows
+
+ 2. Phonetic Coherence:
+    - Converts words to IPA (International Phonetic Alphabet)
+    - Computes feature-based edit distance between phonetic transcriptions
+    - Uses same window approach as semantic coherence
+    - Normalizes distances by word length
+
+ Window Analysis:
+ ---------------
+ - Window size n (e.g., n=2 or n=8)
+ - For each position i in text:
+   1. Take words[i:i+n]
+   2. Compute all pairwise similarities
+   3. Calculate window statistics
+   4. Move to next position
+ - Final metrics:
+   * Mean of window means (average coherence)
+   * Std of window means (coherence variability)
+   * Mean of window stds (within-window variability)
+   * Std of window stds (variability of within-window variability)
+ """
+ import os
+ import re
+ import time
+ import logging
+ from typing import List, Tuple, Dict, Union, Optional, Any
+ from pathlib import Path
+ from collections import Counter
+ from itertools import combinations
+ from concurrent.futures import ProcessPoolExecutor
+
+ import numpy as np
+ import pandas as pd
+ import scipy.spatial.distance  # import the spatial.distance submodule explicitly (used throughout)
+ import fasttext
+ import fasttext.util
+ import epitran
+ import panphon
+ import panphon.distance  # needed for panphon.distance.Distance()
+ from utils import ensure_output_dir
+ from config import COHERENCE_CONFIG, RESULTS_DIR
+
+ # Set up logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+     handlers=[
+         logging.StreamHandler(),
+         logging.FileHandler(str(RESULTS_DIR / 'coherence.log'))
+     ]
+ )
+ logger = logging.getLogger(__name__)
+
+ # Type aliases
+ Vector = np.ndarray
+ Matrix = np.ndarray
+ TokenList = List[str]
+ VectorList = List[Vector]
+
+ def preprocess(
+     in_text: str,
+     lower: bool = COHERENCE_CONFIG["preprocessing"]["lower"],
+     free_text: bool = COHERENCE_CONFIG["preprocessing"]["free_text"]
+ ) -> TokenList:
+     """
+     Preprocess text by tokenizing and optionally lowercasing.
+
+     Args:
+         in_text: Input text to process
+         lower: Whether to convert to lowercase
+         free_text: If True, split on whitespace; if False, split on semicolons/commas
+
+     Returns:
+         List of preprocessed tokens
+     """
+     # Strip leading/trailing whitespace
+     in_text = in_text.strip()
+
+     if free_text:
+         # Remove punctuation and split on whitespace
+         in_text = re.sub(r"[^\w\s]", "", in_text)
+         words = in_text.split()
+     else:
+         # Split structured text on delimiters
+         words = re.split(";|,", in_text)
+
+     # Process tokens
+     if lower:
+         processed_words = [word.lower().strip() for word in words if word.strip()]
+     else:
+         processed_words = [word.strip() for word in words if word.strip()]
+
+     return processed_words
+
+ def get_vector(
+     model: Any,
+     mode: str,
+     tokens: TokenList,
+     error_messages: bool = COHERENCE_CONFIG["error_messages"]
+ ) -> List[Union[Vector, str]]:
+     """
+     Get vector representations for tokens.
+
+     Args:
+         model: Either FastText model (semantic) or Epitran model (phonetic)
+         mode: Either "semantic" or "phonetic"
+         tokens: List of words to vectorize
+         error_messages: Whether to print error messages
+
+     Returns:
+         List of vectors (semantic) or IPA transcriptions (phonetic)
+     """
+     if mode == "semantic":
+         try:
+             return [model.get_word_vector(token) for token in tokens]
+         except KeyError as e:
+             if error_messages:
+                 print(f"Vector lookup error: {e}")
+             return [np.nan]
+     elif mode == "phonetic":
+         try:
+             return [model.transliterate(token) for token in tokens]
+         except KeyError as e:
+             if error_messages:
+                 print(f"Phonetic transcription error: {e}")
+             return [np.nan]
+     else:
+         raise ValueError(f"Unknown mode: {mode}")
+
+ def get_semantic_similarity(vec1: Vector, vec2: Vector) -> float:
+     """
+     Compute cosine similarity between two vectors.
+
+     Args:
+         vec1: First vector
+         vec2: Second vector
+
+     Returns:
+         Cosine similarity (1 - cosine distance)
+     """
+     try:
+         return 1 - scipy.spatial.distance.cosine(vec1, vec2)
+     except ValueError:
+         return np.nan
+
+ def get_phonetic_similarity(vec1: str, vec2: str, dist: Any) -> float:
+     """
+     Compute normalized phonetic similarity using feature edit distance.
+
+     Args:
+         vec1: First IPA transcription
+         vec2: Second IPA transcription
+         dist: Panphon distance calculator
+
+     Returns:
+         Normalized similarity score (1 - normalized edit distance)
+     """
+     try:
+         return 1 - dist.feature_edit_distance(vec1, vec2) / max(len(vec1), len(vec2))
+     except ValueError:
+         return np.nan
+
+ def wordseq(
+     tokens: TokenList,
+     vectors: Union[VectorList, List[str]],
+     mode: str,
+     dist: Optional[Any] = None,
+     word_sim: bool = False
+ ) -> Union[float, Tuple[float, List[Union[str, float]]]]:
+     """
+     Calculate mean similarity between consecutive words.
+
+     Args:
+         tokens: List of original words
+         vectors: List of vector representations
+         mode: Either "semantic" or "phonetic"
+         dist: Panphon distance calculator (required for phonetic mode)
+         word_sim: If True, return word-by-word similarities
+
+     Returns:
+         If word_sim=False: Mean similarity
+         If word_sim=True: Tuple of (mean_similarity, list of alternating words and similarities)
+     """
+     similarities = np.array([])
+
+     for position, vec in enumerate(vectors):
+         if position == 0:
+             vec1 = vec
+             continue
+
+         vec2 = vec
+         # Calculate similarity based on mode
+         if mode == "semantic":
+             similarity = get_semantic_similarity(vec1, vec2)
+         elif mode == "phonetic":
+             similarity = get_phonetic_similarity(vec1, vec2, dist)
+         vec1 = vec2
+
+         similarities = np.append(similarities, similarity)
+
+     mean_sim = similarities[~np.isnan(similarities)].mean()
+
+     if word_sim:
+         word_sims = [i for sublist in list(zip(tokens, np.append(similarities, np.nan))) for i in sublist][:-1]
+         return mean_sim, word_sims
+
+     return mean_sim
+
+ def wordmatrix(
+     tokens: TokenList,
+     vectors: Union[VectorList, List[str]],
+     mode: str,
+     dist: Optional[Any] = None,
+     dataframe: bool = False
+ ) -> Union[Matrix, pd.DataFrame]:
+     """
+     Compute similarity matrix for all word pairs.
+
+     Args:
+         tokens: List of words
+         vectors: List of vector representations
+         mode: Either "semantic" or "phonetic"
+         dist: Panphon distance calculator (required for phonetic mode)
+         dataframe: If True, return pandas DataFrame; if False, return numpy array
+
+     Returns:
+         Similarity matrix as either numpy array or pandas DataFrame
+     """
+     vectors = np.array(vectors)
+     if mode == "semantic":
+         # Compute cosine similarity matrix
+         similarity_matrix = 1 - scipy.spatial.distance.cdist(vectors, vectors, 'cosine')
+         # Mask the upper triangle (including the diagonal) with NaN
+         upper_triangle_indices = np.triu_indices_from(similarity_matrix)
+         similarity_matrix[upper_triangle_indices] = np.nan
+     elif mode == "phonetic":
+         # Compute pairwise phonetic similarities
+         similarities = [get_phonetic_similarity(i, j, dist) for (i, j) in combinations(vectors, 2)]
+         similarity_matrix = scipy.spatial.distance.squareform(similarities)
+         upper_triangle_indices = np.triu_indices_from(similarity_matrix)
+         similarity_matrix[upper_triangle_indices] = np.nan
+
+     if dataframe:
+         return pd.DataFrame(similarity_matrix, index=tokens, columns=tokens)
+
+     # Return lower triangle as list of lists
+     return [list(row[:i+1]) for i, row in enumerate(similarity_matrix)]
+
+ def calculate_segment_avg(
+     tokens: TokenList,
+     vectors: Union[VectorList, List[str]],
+     mode: str,
+     start_idx: int,
+     window_size: int,
+     dist: Optional[Any] = None
+ ) -> Tuple[float, float]:
+     """
+     Calculate average similarity for a segment of tokens.
+
+     Args:
+         tokens: List of words
+         vectors: List of vector representations
+         mode: Either "semantic" or "phonetic"
+         start_idx: Starting index of segment
+         window_size: Size of window
+         dist: Panphon distance calculator (required for phonetic mode)
+
+     Returns:
+         Tuple of (mean_similarity, std_similarity)
+     """
+     segment_tokens = tokens[start_idx:start_idx + window_size]
+     segment_vectors = vectors[start_idx:start_idx + window_size]
+
+     # Get similarity matrix for segment
+     segment_df = wordmatrix(segment_tokens, segment_vectors, mode, dist=dist, dataframe=True)
+     segment_values = segment_df.stack()
+
+     return segment_values.mean(), segment_values.std()
+
+ def coherence(
+     tokens: TokenList,
+     vectors: Union[VectorList, List[str]],
+     mode: str,
+     window_size: int,
+     dist: Optional[Any] = None
+ ) -> Tuple[float, float, float, float]:
+     """
+     Compute coherence metrics for a sequence of words.
+
+     Args:
+         tokens: List of words
+         vectors: List of vector representations
+         mode: Either "semantic" or "phonetic"
+         window_size: Size of sliding window (0 for whole text)
+         dist: Panphon distance calculator (required for phonetic mode)
+
+     Returns:
+         Tuple of (mean_of_means, std_of_means, mean_of_stds, std_of_stds)
+     """
+     # Handle empty or single-word sequences
+     if len(tokens) < 2:
+         return np.nan, np.nan, np.nan, np.nan
+
+     # Handle whole-text analysis
+     elif window_size == 0:
+         tokens_df = wordmatrix(tokens, vectors, mode, dist=dist, dataframe=True)
+         matrix_values = tokens_df.stack()
+         return matrix_values.mean(), np.nan, matrix_values.std(), np.nan
+
+     # Handle sliding window analysis
+     else:
+         window_means = []
+         window_stds = []
+
+         for i in range(len(tokens) - window_size + 1):
+             window_tokens = tokens[i:i + window_size]
+             window_vectors = vectors[i:i + window_size]
+
+             # Get similarity matrix for window
+             window_df = wordmatrix(window_tokens, window_vectors, mode, dist=dist, dataframe=True)
+             window_values = window_df.stack()
+
+             # Calculate window statistics
+             window_means.append(window_values.mean())
+             window_stds.append(window_values.std())
+
+         # Calculate aggregate statistics
+         mean_mean = np.mean(window_means)
+         mean_std = np.std(window_means)
+         std_mean = np.mean(window_stds)
+         std_std = np.std(window_stds)
+
+         return mean_mean, mean_std, std_mean, std_std
+
+ def split_coherence(
+     row: pd.Series,
+     window: int,
+     mode: str,
+     dist: Optional[Any] = None
+ ) -> pd.Series:
+     """
+     Compute coherence metrics for a single row with given window size.
+
+     Args:
+         row: DataFrame row containing tokens and vectors
+         window: Window size for coherence calculation
+         mode: Either "semantic" or "phonetic"
+         dist: Panphon distance calculator (required for phonetic mode)
+
+     Returns:
+         Series with coherence metrics for this window size
+     """
+     # Both modes use 'tokens' column
+     tokens = row['tokens']
+     vectors = row["phonetic_vectors" if mode == "phonetic" else "embeddings"]
+
+     mean_mean, mean_std, std_mean, std_std = coherence(
+         tokens, vectors, mode, window_size=window, dist=dist
+     )
+
+     return pd.Series({
+         f"{mode}_coherence_{window}_mean_of_window_means": mean_mean,
+         f"{mode}_coherence_{window}_std_of_window_means": mean_std,
+         f"{mode}_coherence_{window}_mean_of_window_stds": std_mean,
+         f"{mode}_coherence_{window}_std_of_window_stds": std_std
+     })
+
+ def process_window(
+     data_df: pd.DataFrame,
+     window: int,
+     mode: str,
+     dist: Optional[Any] = None
+ ) -> pd.DataFrame:
+     """
+     Process entire DataFrame for a given window size.
+
+     Args:
+         data_df: DataFrame containing tokens and vectors
+         window: Window size for coherence calculation
+         mode: Either "semantic" or "phonetic"
+         dist: Panphon distance calculator (required for phonetic mode)
+
+     Returns:
+         DataFrame with coherence metrics for this window size
+     """
+     logger.info(f"Processing {mode} coherence with window size: {window}")
+     return data_df.apply(lambda row: split_coherence(row, window, mode, dist), axis=1)
+
+ def apply_coherence(
+     data_df: pd.DataFrame,
+     windows: List[int],
+     mode: str,
+     dist: Optional[Any] = None,
+     parallelize: bool = False
+ ) -> pd.DataFrame:
+     """
+     Apply coherence calculation across multiple window sizes.
+
+     Args:
+         data_df: DataFrame containing tokens and embedding vectors
+         windows: List of window sizes to process
+         mode: Either "semantic" or "phonetic"
+         dist: Panphon distance calculator (required for phonetic mode)
+         parallelize: Whether to use parallel processing
+
+     Returns:
+         DataFrame with coherence metrics for all window sizes
+     """
+     if parallelize:
+         with ProcessPoolExecutor() as executor:
+             futures = [
+                 executor.submit(
+                     process_window,
+                     data_df.copy(),
+                     window,
+                     mode,
+                     dist
+                 )
+                 for window in windows
+             ]
+             results = [future.result() for future in futures]
+     else:
+         results = [
+             process_window(data_df.copy(), window, mode, dist)
+             for window in windows
+         ]
+
+     # Combine results into main DataFrame
+     for result in results:
+         for col in result.columns:
+             data_df[col] = result[col]
+
+     return data_df
+
+ def ipa_to_features(ipa_tokens: List[str]) -> List[List[float]]:
+     """
+     Convert IPA tokens into phonetic feature vectors.
+
+     Args:
+         ipa_tokens: List of IPA transcriptions
+
+     Returns:
+         List of feature vectors for each token
+     """
+     # Feature vectors come from panphon, not fasttext: FeatureTable maps each
+     # IPA segment to an articulatory feature vector.
+     feature_table = panphon.FeatureTable()
+     return [feature_table.word_to_vector_list(ipa, numeric=True) for ipa in ipa_tokens]
+
+ def load_models() -> Tuple[Any, Any, Any]:
+     """Load FastText, Epitran, and Panphon models."""
+     logger.info("Loading models...")
+
+     # Load FastText model
+     logger.info("Loading FastText model...")
+     ft_model = fasttext.load_model(COHERENCE_CONFIG["model"]["fasttext_path"])
+
+     # Load Epitran model
+     logger.info("Loading Epitran model...")
+     epi = epitran.Epitran(COHERENCE_CONFIG["model"]["language_code"])
+
+     # Load Panphon distance calculator
+     logger.info("Loading Panphon model...")
+     dist = panphon.distance.Distance()
+
+     return ft_model, epi, dist
+
+ def load_transcripts() -> pd.DataFrame:
+     """Load and preprocess transcripts."""
+     logger.info("Loading transcripts...")
+
+     # Get list of transcript files
+     transcript_files = [f for f in os.listdir(COHERENCE_CONFIG["paths"]["data_dir"]) if f.endswith('.txt')]
+     logger.info(f"Found {len(transcript_files)} transcript files")
+
+     # Load each transcript
+     data = []
+     for filename in transcript_files:
+         filepath = os.path.join(COHERENCE_CONFIG["paths"]["data_dir"], filename)
+
+         try:
+             # Read the file as UTF-8
+             with open(filepath, 'r', encoding='utf-8') as f:
+                 content = f.read()
+             tokens = preprocess(content)
+             data.append({
+                 'file_name': filename,
+                 'transcript': content,
+                 'tokens': tokens
+             })
+         except UnicodeDecodeError:
+             print(f"Warning: UnicodeDecodeError in file {filename}. Trying alternative encoding...")
+
+             # Fall back to ISO-8859-1 if the file is not valid UTF-8
+             try:
+                 with open(filepath, 'r', encoding='ISO-8859-1') as f:
+                     content = f.read()
+                 tokens = preprocess(content)
+                 data.append({
+                     'file_name': filename,
+                     'transcript': content,
+                     'tokens': tokens
+                 })
+             except Exception as e:
+                 print(f"Failed to read {filename} with alternative encoding: {e}")
+                 continue  # Skip the file
+
+     return pd.DataFrame(data)
+
+ def process_transcripts(
+     data_df: pd.DataFrame,
+     ft_model: Any,
+     epi: Any,
+     dist: Any,
+     config: Dict
+ ) -> pd.DataFrame:
+     """
+     Process transcripts to compute coherence metrics.
+
+     Args:
+         data_df: DataFrame with raw transcripts
+         ft_model: FastText model
+         epi: Epitran model
+         dist: Panphon distance calculator
+         config: Configuration dictionary
+
+     Returns:
+         DataFrame with computed metrics
+     """
+     logger.info("Processing transcripts...")
+
+     # Tokenization
+     logger.info("Tokenizing transcripts...")
+     data_df["tokens"] = data_df["transcript"].apply(
+         lambda x: preprocess(x, config["preprocessing"]["lower"], config["preprocessing"]["free_text"])
+     )
+
+     # Process each mode
+     for mode in config["modes"]:
+         if mode == "semantic":
+             # Generate embeddings
+             logger.info("Generating FastText embeddings...")
+             data_df["embeddings"] = data_df["tokens"].apply(
+                 lambda x: get_vector(ft_model, "semantic", x)
+             )
+
+             # Calculate word sequence similarities
+             logger.info("Computing word sequence similarities...")
+             data_df["wordseq_ft"] = data_df.apply(
+                 lambda row: wordseq(row["tokens"], row["embeddings"], "semantic"),
+                 axis=1
+             )
+
+             # Apply coherence calculations
+             logger.info("Computing coherence metrics...")
+             data_df = apply_coherence(data_df, config["windows"], "semantic")
+
+         elif mode == "phonetic":
+             # Generate phonetic vectors
+             logger.info("Generating phonetic vectors...")
+             data_df["phonetic_vectors"] = data_df["tokens"].apply(
+                 lambda x: get_vector(epi, "phonetic", x)
+             )
+
+             # Calculate word sequence similarities
+             data_df["wordseq_phon"] = data_df.apply(
+                 lambda row: wordseq(row["tokens"], row["phonetic_vectors"], "phonetic", dist),
+                 axis=1
+             )
+
+             # Apply coherence calculations
+             data_df = apply_coherence(data_df, config["windows"], "phonetic", dist=dist)
+
+     # Calculate additional metrics
+     logger.info("Computing additional metrics...")
+     data_df["number_tokens"] = data_df["tokens"].apply(len)
+     data_df["multiword_count"] = data_df["tokens"].apply(
+         lambda x: len([i for i in x if len(i.split()) > 1])
+     )
+
+     return data_df
+
+ def save_results(data_df: pd.DataFrame) -> None:
+     """Save processed results to CSV."""
+     # Extract study_id and sub_task from filename
+     data_df["study_id"] = data_df["file_name"].apply(lambda x: x.split("_")[0])  # Get first part (0029)
+     data_df["sub_task"] = data_df["file_name"].apply(lambda x: x.split("_")[-1].replace(".txt", ""))
+
+     # Determine output path
+     out_file = os.path.join(
+         COHERENCE_CONFIG["paths"]["results_dir"],
+         f'coherence_results{"_lower" if COHERENCE_CONFIG["preprocessing"]["lower"] else "_upper"}.csv'
+     )
+
+     # Ensure output directory exists
+     ensure_output_dir(out_file)
+
+     # Save results, dropping intermediate columns if they exist
+     columns_to_drop = [
+         col for col in [
+             "transcript", "tokens",
+             "embeddings", "phonetic_vectors", "file_name"
+         ] if col in data_df.columns
+     ]
+
+     # Reorder columns to put study_id and sub_task first
+     final_df = data_df.drop(columns=columns_to_drop)
+     cols = ["study_id", "sub_task"] + [col for col in final_df.columns if col not in ["study_id", "sub_task"]]
+     final_df = final_df[cols]
+
+     final_df.to_csv(out_file, index=None)
+     logger.info(f"Results saved to: {out_file}")
+
+ def main():
+     """Main execution function."""
+     try:
+         logger.info("Starting coherence analysis...")
+
+         # Load models
+         ft_model, epi, dist = load_models()
+
+         # Load and process transcripts
+         data_df = load_transcripts()
+         data_df = process_transcripts(data_df, ft_model, epi, dist, COHERENCE_CONFIG)
+
+         # Save results
+         save_results(data_df)
+
+         logger.info("Coherence analysis completed successfully!")
+
+     except Exception as e:
+         logger.error(f"Error during execution: {str(e)}", exc_info=True)
+         raise
+
+ if __name__ == "__main__":
+     main()
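
For the phonetic side, coherence.py transliterates each word to IPA with Epitran and scores word pairs by a panphon feature edit distance normalized by the length of the longer transcription (see get_phonetic_similarity above). A rough, self-contained sketch of that scoring, not taken from the package; the language code "deu-Latn" and the example words are assumptions, and it requires the epitran and panphon packages to be installed:

    # Illustrative sketch -- mirrors get_phonetic_similarity() in coherence.py.
    import epitran
    import panphon.distance

    epi = epitran.Epitran("deu-Latn")      # grapheme-to-IPA transliterator (assumed language code)
    dist = panphon.distance.Distance()     # articulatory feature-based distances

    ipa1, ipa2 = epi.transliterate("Hund"), epi.transliterate("Katze")
    # Normalized similarity: 1 - feature edit distance / length of longer transcription
    similarity = 1 - dist.feature_edit_distance(ipa1, ipa2) / max(len(ipa1), len(ipa2))
    print(ipa1, ipa2, round(similarity, 3))
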