pelican-nlp 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pelican_nlp/Nils_backup/__init__.py +0 -0
- pelican_nlp/Nils_backup/extract_acoustic_features.py +274 -0
- pelican_nlp/Nils_backup/fluency/__init__.py +0 -0
- pelican_nlp/Nils_backup/fluency/aggregate_fluency_results.py +186 -0
- pelican_nlp/Nils_backup/fluency/behavioral_data.py +42 -0
- pelican_nlp/Nils_backup/fluency/check_duplicates.py +169 -0
- pelican_nlp/Nils_backup/fluency/coherence.py +653 -0
- pelican_nlp/Nils_backup/fluency/config.py +231 -0
- pelican_nlp/Nils_backup/fluency/main.py +182 -0
- pelican_nlp/Nils_backup/fluency/optimality_without_tsa.py +466 -0
- pelican_nlp/Nils_backup/fluency/plot_fluency.py +573 -0
- pelican_nlp/Nils_backup/fluency/plotting_utils.py +170 -0
- pelican_nlp/Nils_backup/fluency/questionnaires_data.py +43 -0
- pelican_nlp/Nils_backup/fluency/stats_fluency.py +930 -0
- pelican_nlp/Nils_backup/fluency/utils.py +41 -0
- pelican_nlp/Nils_backup/speaker_diarization_Nils.py +328 -0
- pelican_nlp/Nils_backup/transcription/__init__.py +0 -0
- pelican_nlp/Nils_backup/transcription/annotation_tool.py +1001 -0
- pelican_nlp/Nils_backup/transcription/annotation_tool_boundaries.py +1122 -0
- pelican_nlp/Nils_backup/transcription/annotation_tool_sandbox.py +985 -0
- pelican_nlp/Nils_backup/transcription/output/holmes_control_nova_all_outputs.json +7948 -0
- pelican_nlp/Nils_backup/transcription/test.json +1 -0
- pelican_nlp/Nils_backup/transcription/transcribe_audio.py +314 -0
- pelican_nlp/Nils_backup/transcription/transcribe_audio_chunked.py +695 -0
- pelican_nlp/Nils_backup/transcription/transcription.py +801 -0
- pelican_nlp/Nils_backup/transcription/transcription_gui.py +955 -0
- pelican_nlp/Nils_backup/transcription/word_boundaries.py +190 -0
- pelican_nlp/Silvia_files/Opensmile/opensmile_feature_extraction.py +66 -0
- pelican_nlp/Silvia_files/prosogram/prosogram.py +104 -0
- pelican_nlp/__init__.py +1 -1
- pelican_nlp/_version.py +1 -0
- pelican_nlp/configuration_files/config_audio.yml +150 -0
- pelican_nlp/configuration_files/config_discourse.yml +104 -0
- pelican_nlp/configuration_files/config_fluency.yml +108 -0
- pelican_nlp/configuration_files/config_general.yml +131 -0
- pelican_nlp/configuration_files/config_morteza.yml +103 -0
- pelican_nlp/praat/__init__.py +29 -0
- {pelican_nlp-0.1.1.dist-info → pelican_nlp-0.1.2.dist-info}/METADATA +4 -3
- pelican_nlp-0.1.2.dist-info/RECORD +75 -0
- pelican_nlp-0.1.1.dist-info/RECORD +0 -39
- {pelican_nlp-0.1.1.dist-info → pelican_nlp-0.1.2.dist-info}/WHEEL +0 -0
- {pelican_nlp-0.1.1.dist-info → pelican_nlp-0.1.2.dist-info}/licenses/LICENSE +0 -0
- {pelican_nlp-0.1.1.dist-info → pelican_nlp-0.1.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,653 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
"""
|
4
|
+
Compute semantic and phonetic coherence metrics for VELAS fluency tasks.
|
5
|
+
|
6
|
+
Algorithm Overview:
|
7
|
+
------------------
|
8
|
+
The script implements two main coherence metrics:
|
9
|
+
|
10
|
+
1. Semantic Coherence:
|
11
|
+
- Uses FastText word embeddings to convert words to 300d vectors
|
12
|
+
- Calculates cosine similarity between word pairs
|
13
|
+
- Analyzes in sliding windows (e.g., 2-word, 8-word)
|
14
|
+
- For each window:
|
15
|
+
a) Creates similarity matrix of all word pairs
|
16
|
+
b) Computes mean and std of similarities
|
17
|
+
c) Aggregates across windows
|
18
|
+
|
19
|
+
2. Phonetic Coherence:
|
20
|
+
- Converts words to IPA (International Phonetic Alphabet)
|
21
|
+
- Computes feature-based edit distance between phonetic transcriptions
|
22
|
+
- Uses same window approach as semantic coherence
|
23
|
+
- Normalizes distances by word length
|
24
|
+
|
25
|
+
Window Analysis:
|
26
|
+
---------------
|
27
|
+
- Window size n (e.g., n=2 or n=8)
|
28
|
+
- For each position i in text:
|
29
|
+
1. Take words[i:i+n]
|
30
|
+
2. Compute all pairwise similarities
|
31
|
+
3. Calculate window statistics
|
32
|
+
4. Move to next position
|
33
|
+
- Final metrics:
|
34
|
+
* Mean of window means (average coherence)
|
35
|
+
* Std of window means (coherence variability)
|
36
|
+
* Mean of window stds (within-window variability)
|
37
|
+
* Std of window stds (variability of within-window variability)
|
38
|
+
"""
|
39
|
+
import os
|
40
|
+
import re
|
41
|
+
import time
|
42
|
+
import logging
|
43
|
+
from typing import List, Tuple, Dict, Union, Optional, Any
|
44
|
+
from pathlib import Path
|
45
|
+
from collections import Counter
|
46
|
+
from itertools import combinations
|
47
|
+
from concurrent.futures import ProcessPoolExecutor
|
48
|
+
|
49
|
+
import numpy as np
|
50
|
+
import pandas as pd
|
51
|
+
import scipy
|
52
|
+
import fasttext
|
53
|
+
import fasttext.util
|
54
|
+
import epitran
|
55
|
+
import panphon
|
56
|
+
from utils import ensure_output_dir
|
57
|
+
from config import COHERENCE_CONFIG, RESULTS_DIR
|
58
|
+
|
59
|
+
# Logging configuration: INFO level, one stream handler plus a file handler
# writing to 'coherence.log' inside RESULTS_DIR.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler(str(RESULTS_DIR / 'coherence.log')),
    ],
)
logger = logging.getLogger(__name__)

# Type aliases used by the function signatures in this module
TokenList = List[str]
Vector = np.ndarray
Matrix = np.ndarray
VectorList = List[Vector]
|
75
|
+
|
76
|
+
def preprocess(
    in_text: str,
    lower: bool = COHERENCE_CONFIG["preprocessing"]["lower"],
    free_text: bool = COHERENCE_CONFIG["preprocessing"]["free_text"]
) -> TokenList:
    """
    Split a text into cleaned tokens.

    Args:
        in_text: Text to tokenize
        lower: If True, lowercase every token
        free_text: If True, drop punctuation and split on whitespace;
            if False, split on semicolons and commas

    Returns:
        Tokens with surrounding whitespace removed; tokens that are empty
        after stripping are discarded
    """
    trimmed = in_text.strip()

    if free_text:
        # Free text: remove everything that is neither a word character nor
        # whitespace, then split on runs of whitespace
        pieces = re.sub(r"[^\w\s]", "", trimmed).split()
    else:
        # Structured list: items are separated by ';' or ','
        pieces = re.split(";|,", trimmed)

    # Discard pieces that contain nothing but whitespace
    kept = [piece for piece in pieces if piece.strip()]

    if lower:
        return [piece.lower().strip() for piece in kept]
    return [piece.strip() for piece in kept]
|
110
|
+
|
111
|
+
def get_vector(
    model: Any,
    mode: str,
    tokens: TokenList,
    error_messages: bool = COHERENCE_CONFIG["error_messages"]
) -> List[Union[Vector, str]]:
    """
    Turn tokens into the representation required by the given mode.

    Args:
        model: Object providing ``get_word_vector`` (semantic mode) or
            ``transliterate`` (phonetic mode)
        mode: Either "semantic" or "phonetic"
        tokens: Words to convert
        error_messages: If True, print a message when a KeyError occurs

    Returns:
        One converted item per token. If a KeyError is raised while
        converting, the single-element list ``[np.nan]`` is returned instead.

    Raises:
        ValueError: If mode is neither "semantic" nor "phonetic"
    """
    if mode == "semantic":
        try:
            converted = [model.get_word_vector(token) for token in tokens]
        except KeyError as e:
            if error_messages:
                print(f"Vector lookup error: {e}")
            converted = [np.nan]
        return converted

    if mode == "phonetic":
        try:
            converted = [model.transliterate(token) for token in tokens]
        except KeyError as e:
            if error_messages:
                print(f"Phonetic transcription error: {e}")
            converted = [np.nan]
        return converted

    raise ValueError(f"Unknown mode: {mode}")
|
145
|
+
|
146
|
+
def get_semantic_similarity(vec1: Vector, vec2: Vector) -> float:
    """
    Return the cosine similarity of two vectors.

    Args:
        vec1: First vector
        vec2: Second vector

    Returns:
        1 minus the cosine distance, or NaN if the distance computation
        raises a ValueError
    """
    try:
        cosine_distance = scipy.spatial.distance.cosine(vec1, vec2)
    except ValueError:
        return np.nan
    return 1 - cosine_distance
|
161
|
+
|
162
|
+
def get_phonetic_similarity(vec1: str, vec2: str, dist: Any) -> float:
    """
    Compute normalized phonetic similarity using feature edit distance.

    Args:
        vec1: First IPA transcription
        vec2: Second IPA transcription
        dist: Object providing ``feature_edit_distance(a, b)``
            (a Panphon distance calculator)

    Returns:
        1 minus the feature edit distance divided by the length of the longer
        transcription. NaN is returned when both transcriptions are empty
        (the normalization would divide by zero) or when the distance
        computation raises a ValueError.
    """
    longest = max(len(vec1), len(vec2))
    if longest == 0:
        # Nothing to compare; avoid ZeroDivisionError in the normalization
        return np.nan

    try:
        return 1 - dist.feature_edit_distance(vec1, vec2) / longest
    except ValueError:
        return np.nan
|
178
|
+
|
179
|
+
def wordseq(
    tokens: TokenList,
    vectors: Union[VectorList, List[str]],
    mode: str,
    dist: Optional[Any] = None,
    word_sim: bool = False
) -> Union[float, Tuple[float, List[Union[str, float]]]]:
    """
    Calculate mean similarity between consecutive words.

    Args:
        tokens: List of original words
        vectors: List of vector representations (one per word)
        mode: Either "semantic" or "phonetic"
        dist: Panphon distance calculator (required for phonetic mode)
        word_sim: If True, also return word-by-word similarities

    Returns:
        If word_sim=False: mean of the non-NaN consecutive similarities
        (NaN when there is no valid similarity, e.g. fewer than two vectors).
        If word_sim=True: tuple of (mean_similarity, list alternating
        word, similarity-to-next-word, word, ...).

    Raises:
        ValueError: If mode is neither "semantic" nor "phonetic"
    """
    if mode not in ("semantic", "phonetic"):
        raise ValueError(f"Unknown mode: {mode}")

    # Collect in a plain list; converting once avoids re-allocating an array
    # on every iteration
    pair_similarities = []
    previous = None
    for position, current in enumerate(vectors):
        if position > 0:
            if mode == "semantic":
                pair_similarities.append(get_semantic_similarity(previous, current))
            else:
                pair_similarities.append(get_phonetic_similarity(previous, current, dist))
        previous = current

    similarities = np.array(pair_similarities, dtype=float)
    valid = similarities[~np.isnan(similarities)]
    # Explicit guard: the mean of an empty array is NaN but emits a warning
    mean_sim = valid.mean() if valid.size else np.nan

    if word_sim:
        # Pad with NaN so the last word also has a partner in zip(); the
        # trailing pad value is then sliced off again
        padded = np.append(similarities, np.nan)
        word_sims = [item for pair in zip(tokens, padded) for item in pair][:-1]
        return mean_sim, word_sims

    return mean_sim
|
224
|
+
|
225
|
+
def wordmatrix(
    tokens: TokenList,
    vectors: Union[VectorList, List[str]],
    mode: str,
    dist: Optional[Any] = None,
    dataframe: bool = False
) -> Union[List[List[float]], pd.DataFrame]:
    """
    Compute the pairwise similarity matrix for a list of words.

    Entries on and above the diagonal are set to NaN, so every unordered
    word pair appears exactly once (in the lower triangle).

    Args:
        tokens: List of words (used as row/column labels of the DataFrame)
        vectors: List of vector representations (one per word)
        mode: Either "semantic" or "phonetic"
        dist: Panphon distance calculator (required for phonetic mode)
        dataframe: If True, return a pandas DataFrame; if False, return the
            lower triangle as a list of rows

    Returns:
        DataFrame indexed by tokens, or a list in which row i holds the
        first i + 1 entries of matrix row i (the last entry is the NaN
        diagonal element)

    Raises:
        ValueError: If mode is neither "semantic" nor "phonetic"
    """
    if mode not in ("semantic", "phonetic"):
        raise ValueError(f"Unknown mode: {mode}")

    vectors = np.array(vectors)

    if mode == "semantic":
        # Cosine similarity = 1 - cosine distance for every pair
        similarity_matrix = 1 - scipy.spatial.distance.cdist(vectors, vectors, 'cosine')
    else:
        # combinations() yields pairs in condensed order, which squareform
        # expands into a symmetric square matrix
        pair_similarities = [
            get_phonetic_similarity(first, second, dist)
            for first, second in combinations(vectors, 2)
        ]
        similarity_matrix = scipy.spatial.distance.squareform(pair_similarities)

    # Mask the diagonal and the upper triangle with NaN
    similarity_matrix[np.triu_indices_from(similarity_matrix)] = np.nan

    if dataframe:
        return pd.DataFrame(similarity_matrix, index=tokens, columns=tokens)

    # Return lower triangle as list of lists
    return [list(row[:i + 1]) for i, row in enumerate(similarity_matrix)]
|
264
|
+
|
265
|
+
def calculate_segment_avg(
    tokens: TokenList,
    vectors: Union[VectorList, List[str]],
    mode: str,
    start_idx: int,
    window_size: int,
    dist: Optional[Any] = None
) -> Tuple[float, float]:
    """
    Summarize pairwise similarity inside one segment of the sequence.

    Args:
        tokens: List of words
        vectors: List of vector representations
        mode: Either "semantic" or "phonetic"
        start_idx: Index of the first word of the segment
        window_size: Number of words in the segment
        dist: Panphon distance calculator (required for phonetic mode)

    Returns:
        Tuple of (mean, standard deviation) of the stacked similarity
        values of the segment
    """
    stop_idx = start_idx + window_size

    # Pairwise similarities of the segment, flattened to a Series
    pair_values = wordmatrix(
        tokens[start_idx:stop_idx],
        vectors[start_idx:stop_idx],
        mode,
        dist=dist,
        dataframe=True,
    ).stack()

    return pair_values.mean(), pair_values.std()
|
295
|
+
|
296
|
+
def coherence(
    tokens: TokenList,
    vectors: Union[VectorList, List[str]],
    mode: str,
    window_size: int,
    dist: Optional[Any] = None
) -> Tuple[float, float, float, float]:
    """
    Compute sliding-window coherence statistics for a word sequence.

    Args:
        tokens: List of words
        vectors: List of vector representations
        mode: Either "semantic" or "phonetic"
        window_size: Number of words per window (0 analyzes the whole text
            as a single window)
        dist: Panphon distance calculator (required for phonetic mode)

    Returns:
        Tuple of (mean_of_means, std_of_means, mean_of_stds, std_of_stds).
        All four are NaN for fewer than two tokens; for window_size 0 the
        second and fourth entries are NaN.
    """
    # Fewer than two words: no pair to compare
    if len(tokens) < 2:
        return np.nan, np.nan, np.nan, np.nan

    # Whole text treated as one window
    if window_size == 0:
        all_pairs = wordmatrix(tokens, vectors, mode, dist=dist, dataframe=True).stack()
        return all_pairs.mean(), np.nan, all_pairs.std(), np.nan

    # Slide the window one word at a time and record its statistics
    per_window_means = []
    per_window_stds = []
    for offset in range(len(tokens) - window_size + 1):
        end = offset + window_size
        pair_values = wordmatrix(
            tokens[offset:end], vectors[offset:end], mode, dist=dist, dataframe=True
        ).stack()
        per_window_means.append(pair_values.mean())
        per_window_stds.append(pair_values.std())

    # Aggregate across windows
    return (
        np.mean(per_window_means),
        np.std(per_window_means),
        np.mean(per_window_stds),
        np.std(per_window_stds),
    )
|
350
|
+
|
351
|
+
def split_coherence(
    row: pd.Series,
    window: int,
    mode: str,
    dist: Optional[Any] = None
) -> pd.Series:
    """
    Compute the coherence metrics of one DataFrame row for one window size.

    Args:
        row: Row holding a 'tokens' entry and either 'phonetic_vectors'
            (phonetic mode) or 'embeddings' (any other mode)
        window: Window size for coherence calculation
        mode: Either "semantic" or "phonetic"
        dist: Panphon distance calculator (required for phonetic mode)

    Returns:
        Series with four entries whose labels are prefixed by the mode and
        the window size
    """
    words = row['tokens']
    # The vector column depends on the mode; the token column does not
    representations = row["phonetic_vectors" if mode == "phonetic" else "embeddings"]

    avg_of_avgs, sd_of_avgs, avg_of_sds, sd_of_sds = coherence(
        words, representations, mode, window_size=window, dist=dist
    )

    metrics = {}
    metrics[f"{mode}_coherence_{window}_mean_of_window_means"] = avg_of_avgs
    metrics[f"{mode}_coherence_{window}_std_of_window_means"] = sd_of_avgs
    metrics[f"{mode}_coherence_{window}_mean_of_window_stds"] = avg_of_sds
    metrics[f"{mode}_coherence_{window}_std_of_window_stds"] = sd_of_sds
    return pd.Series(metrics)
|
383
|
+
|
384
|
+
def process_window(
    data_df: pd.DataFrame,
    window: int,
    mode: str,
    dist: Optional[Any] = None
) -> pd.DataFrame:
    """
    Compute coherence metrics for every row at a single window size.

    Args:
        data_df: DataFrame containing tokens and vectors
        window: Window size for coherence calculation
        mode: Either "semantic" or "phonetic"
        dist: Panphon distance calculator (required for phonetic mode)

    Returns:
        Result of applying split_coherence to each row of data_df
    """
    logger.info(f"Processing {mode} coherence with window size: {window}")

    def _row_metrics(row):
        return split_coherence(row, window, mode, dist)

    return data_df.apply(_row_metrics, axis=1)
|
404
|
+
|
405
|
+
def apply_coherence(
    data_df: pd.DataFrame,
    windows: List[int],
    mode: str,
    dist: Optional[Any] = None,
    parallelize: bool = False
) -> pd.DataFrame:
    """
    Add coherence metric columns for several window sizes to a DataFrame.

    Args:
        data_df: DataFrame containing tokens and vectors; the metric columns
            are written into this same object
        windows: Window sizes to process
        mode: Either "semantic" or "phonetic"
        dist: Panphon distance calculator (required for phonetic mode)
        parallelize: If True, submit one task per window size to a
            ProcessPoolExecutor; otherwise process the sizes sequentially

    Returns:
        data_df with the metric columns of every window size added
    """
    if not parallelize:
        per_window = [
            process_window(data_df.copy(), size, mode, dist)
            for size in windows
        ]
    else:
        with ProcessPoolExecutor() as pool:
            pending = [
                pool.submit(process_window, data_df.copy(), size, mode, dist)
                for size in windows
            ]
            # Collect in submission order so columns are added per window
            # in the order given by `windows`
            per_window = [job.result() for job in pending]

    # Copy every metric column into the caller's DataFrame
    for frame in per_window:
        for column in frame.columns:
            data_df[column] = frame[column]

    return data_df
|
450
|
+
|
451
|
+
def ipa_to_features(ipa_tokens: List[str]) -> List[List[List[int]]]:
    """
    Convert IPA tokens into phonetic feature vectors.

    The conversion is done by Panphon's FeatureTable; the fasttext module
    has no ``word_to_vector_list`` function.

    Args:
        ipa_tokens: List of IPA transcriptions

    Returns:
        For each token, the result of
        ``FeatureTable.word_to_vector_list(token, numeric=True)``.
        NOTE(review): per the Panphon README this is one numeric feature
        vector per segment of the token -- confirm against the installed
        Panphon version.
    """
    # Build the feature table once rather than once per token
    feature_table = panphon.FeatureTable()
    return [feature_table.word_to_vector_list(ipa, numeric=True) for ipa in ipa_tokens]
|
462
|
+
|
463
|
+
def load_models() -> Tuple[Any, Any, Any]:
    """
    Load FastText, Epitran, and Panphon models.

    The FastText model path and the Epitran language code are read from
    COHERENCE_CONFIG["model"].

    Returns:
        Tuple of (FastText model, Epitran instance, Panphon Distance instance)
    """
    # A bare ``import panphon`` does not by itself guarantee that the
    # ``panphon.distance`` submodule is loaded, so import it explicitly.
    import panphon.distance

    logger.info("Loading models...")

    # Load FastText model
    logger.info("Loading FastText model...")
    ft_model = fasttext.load_model(COHERENCE_CONFIG["model"]["fasttext_path"])

    # Load Epitran model
    logger.info("Loading Epitran model...")
    epi = epitran.Epitran(COHERENCE_CONFIG["model"]["language_code"])

    # Load Panphon distance calculator
    logger.info("Loading Panphon model...")
    dist = panphon.distance.Distance()

    return ft_model, epi, dist
|
480
|
+
|
481
|
+
def load_transcripts() -> pd.DataFrame:
    """
    Load and preprocess all '.txt' transcripts from the configured data directory.

    Each file is decoded as UTF-8; if that raises UnicodeDecodeError the file
    is read again as ISO-8859-1. Files that still cannot be read are skipped.

    Returns:
        DataFrame with the columns 'file_name', 'transcript' and 'tokens',
        one row per successfully loaded file. The columns are present even
        when no file was loaded.
    """
    logger.info("Loading transcripts...")

    # Get list of transcript files
    data_dir = COHERENCE_CONFIG["paths"]["data_dir"]
    transcript_files = [f for f in os.listdir(data_dir) if f.endswith('.txt')]
    logger.info(f"Found {len(transcript_files)} transcript files")

    # Load each transcript
    data = []
    for filename in transcript_files:
        filepath = os.path.join(data_dir, filename)

        try:
            # Strict UTF-8 decoding; undecodable bytes raise UnicodeDecodeError
            content = Path(filepath).read_text(encoding='utf-8')
        except UnicodeDecodeError:
            logger.warning(f"Warning: UnicodeDecodeError in file {filename}. Trying alternative encoding...")
            try:
                # ISO-8859-1 maps every byte to a character, so only
                # I/O errors can occur on this second attempt
                content = Path(filepath).read_text(encoding='ISO-8859-1')
            except OSError as e:
                logger.error(f"Failed to read {filename} with alternative encoding: {e}")
                continue  # Skip the file

        data.append({
            'file_name': filename,
            'transcript': content,
            'tokens': preprocess(content)
        })

    # Explicit columns keep the schema stable for an empty result
    return pd.DataFrame(data, columns=['file_name', 'transcript', 'tokens'])
|
526
|
+
|
527
|
+
def process_transcripts(
    data_df: pd.DataFrame,
    ft_model: Any,
    epi: Any,
    dist: Any,
    config: Dict
) -> pd.DataFrame:
    """
    Tokenize transcripts and compute coherence metrics for each configured mode.

    Args:
        data_df: DataFrame with a 'transcript' column
        ft_model: FastText model (used in semantic mode)
        epi: Epitran model (used in phonetic mode)
        dist: Panphon distance calculator (used in phonetic mode)
        config: Configuration dictionary; the keys read here are
            'preprocessing', 'modes' and 'windows'

    Returns:
        DataFrame with token, vector, similarity, coherence and count columns
    """
    logger.info("Processing transcripts...")

    def _tokenize(text):
        return preprocess(text, config["preprocessing"]["lower"], config["preprocessing"]["free_text"])

    def _embed(words):
        return get_vector(ft_model, "semantic", words)

    def _transcribe(words):
        return get_vector(epi, "phonetic", words)

    def _semantic_sequence(row):
        return wordseq(row["tokens"], row["embeddings"], "semantic")

    def _phonetic_sequence(row):
        return wordseq(row["tokens"], row["phonetic_vectors"], "phonetic", dist)

    def _count_multiword(words):
        return len([word for word in words if len(word.split()) > 1])

    # Tokenization
    logger.info("Tokenizing transcripts...")
    data_df["tokens"] = data_df["transcript"].apply(_tokenize)

    # Modes are handled in the order listed in the configuration
    for mode in config["modes"]:
        if mode == "semantic":
            # FastText embeddings
            logger.info("Generating FastText embeddings...")
            data_df["embeddings"] = data_df["tokens"].apply(_embed)

            # Similarity of consecutive words
            logger.info("Computing word sequence similarities...")
            data_df["wordseq_ft"] = data_df.apply(_semantic_sequence, axis=1)

            # Windowed coherence metrics
            logger.info("Computing coherence metrics...")
            data_df = apply_coherence(data_df, config["windows"], "semantic")

        elif mode == "phonetic":
            # IPA transcriptions
            logger.info("Generating phonetic vectors...")
            data_df["phonetic_vectors"] = data_df["tokens"].apply(_transcribe)

            # Similarity of consecutive words
            data_df["wordseq_phon"] = data_df.apply(_phonetic_sequence, axis=1)

            # Windowed coherence metrics
            data_df = apply_coherence(data_df, config["windows"], "phonetic", dist=dist)

    # Token count and number of tokens that contain whitespace
    logger.info("Computing additional metrics...")
    data_df["number_tokens"] = data_df["tokens"].apply(len)
    data_df["multiword_count"] = data_df["tokens"].apply(_count_multiword)

    return data_df
|
599
|
+
|
600
|
+
def save_results(data_df: pd.DataFrame) -> None:
    """
    Write the computed metrics to a CSV file in the configured results directory.

    Adds 'study_id' and 'sub_task' columns (derived from 'file_name') to
    data_df, drops the intermediate columns and writes the remaining
    columns without the index.

    Args:
        data_df: DataFrame with a 'file_name' column and the metric columns
    """
    # study_id: text before the first underscore of the file name;
    # sub_task: text after the last underscore with ".txt" removed
    data_df["study_id"] = data_df["file_name"].apply(lambda x: x.split("_")[0])
    data_df["sub_task"] = data_df["file_name"].apply(lambda x: x.split("_")[-1].replace(".txt", ""))

    # The file name records whether tokens were lowercased
    results_dir = COHERENCE_CONFIG["paths"]["results_dir"]
    out_name = f'coherence_results{"_lower" if COHERENCE_CONFIG["preprocessing"]["lower"] else "_upper"}.csv'
    out_file = os.path.join(results_dir, out_name)

    ensure_output_dir(out_file)

    # Intermediate columns are dropped only if they are present
    intermediate = ("transcript", "tokens", "embeddings", "phonetic_vectors", "file_name")
    present = [name for name in intermediate if name in data_df.columns]
    final_df = data_df.drop(columns=present)

    # Identifier columns first, everything else in its existing order
    leading = ["study_id", "sub_task"]
    trailing = [name for name in final_df.columns if name not in ["study_id", "sub_task"]]
    final_df = final_df[leading + trailing]

    final_df.to_csv(out_file, index=None)
    logger.info(f"Results saved to: {out_file}")
|
630
|
+
|
631
|
+
def main():
    """Run the full pipeline: load models, load and process transcripts, save results.

    Any exception is logged with its traceback and then re-raised.
    """
    try:
        logger.info("Starting coherence analysis...")

        # Models first, then the transcripts they are applied to
        ft_model, epi, dist = load_models()
        transcripts = load_transcripts()
        results = process_transcripts(transcripts, ft_model, epi, dist, COHERENCE_CONFIG)

        save_results(results)

        logger.info("Coherence analysis completed successfully!")

    except Exception as e:
        # logger.exception logs at ERROR level and attaches the traceback
        logger.exception(f"Error during execution: {str(e)}")
        raise


if __name__ == "__main__":
    main()
|