pelican-nlp 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. pelican_nlp/Nils_backup/__init__.py +0 -0
  2. pelican_nlp/Nils_backup/extract_acoustic_features.py +274 -0
  3. pelican_nlp/Nils_backup/fluency/__init__.py +0 -0
  4. pelican_nlp/Nils_backup/fluency/aggregate_fluency_results.py +186 -0
  5. pelican_nlp/Nils_backup/fluency/behavioral_data.py +42 -0
  6. pelican_nlp/Nils_backup/fluency/check_duplicates.py +169 -0
  7. pelican_nlp/Nils_backup/fluency/coherence.py +653 -0
  8. pelican_nlp/Nils_backup/fluency/config.py +231 -0
  9. pelican_nlp/Nils_backup/fluency/main.py +182 -0
  10. pelican_nlp/Nils_backup/fluency/optimality_without_tsa.py +466 -0
  11. pelican_nlp/Nils_backup/fluency/plot_fluency.py +573 -0
  12. pelican_nlp/Nils_backup/fluency/plotting_utils.py +170 -0
  13. pelican_nlp/Nils_backup/fluency/questionnaires_data.py +43 -0
  14. pelican_nlp/Nils_backup/fluency/stats_fluency.py +930 -0
  15. pelican_nlp/Nils_backup/fluency/utils.py +41 -0
  16. pelican_nlp/Nils_backup/speaker_diarization_Nils.py +328 -0
  17. pelican_nlp/Nils_backup/transcription/__init__.py +0 -0
  18. pelican_nlp/Nils_backup/transcription/annotation_tool.py +1001 -0
  19. pelican_nlp/Nils_backup/transcription/annotation_tool_boundaries.py +1122 -0
  20. pelican_nlp/Nils_backup/transcription/annotation_tool_sandbox.py +985 -0
  21. pelican_nlp/Nils_backup/transcription/output/holmes_control_nova_all_outputs.json +7948 -0
  22. pelican_nlp/Nils_backup/transcription/test.json +1 -0
  23. pelican_nlp/Nils_backup/transcription/transcribe_audio.py +314 -0
  24. pelican_nlp/Nils_backup/transcription/transcribe_audio_chunked.py +695 -0
  25. pelican_nlp/Nils_backup/transcription/transcription.py +801 -0
  26. pelican_nlp/Nils_backup/transcription/transcription_gui.py +955 -0
  27. pelican_nlp/Nils_backup/transcription/word_boundaries.py +190 -0
  28. pelican_nlp/Silvia_files/Opensmile/opensmile_feature_extraction.py +66 -0
  29. pelican_nlp/Silvia_files/prosogram/prosogram.py +104 -0
  30. pelican_nlp/__init__.py +1 -1
  31. pelican_nlp/_version.py +1 -0
  32. pelican_nlp/configuration_files/config_audio.yml +150 -0
  33. pelican_nlp/configuration_files/config_discourse.yml +104 -0
  34. pelican_nlp/configuration_files/config_fluency.yml +108 -0
  35. pelican_nlp/configuration_files/config_general.yml +131 -0
  36. pelican_nlp/configuration_files/config_morteza.yml +103 -0
  37. pelican_nlp/praat/__init__.py +29 -0
  38. {pelican_nlp-0.1.1.dist-info → pelican_nlp-0.1.2.dist-info}/METADATA +4 -3
  39. pelican_nlp-0.1.2.dist-info/RECORD +75 -0
  40. pelican_nlp-0.1.1.dist-info/RECORD +0 -39
  41. {pelican_nlp-0.1.1.dist-info → pelican_nlp-0.1.2.dist-info}/WHEEL +0 -0
  42. {pelican_nlp-0.1.1.dist-info → pelican_nlp-0.1.2.dist-info}/licenses/LICENSE +0 -0
  43. {pelican_nlp-0.1.1.dist-info → pelican_nlp-0.1.2.dist-info}/top_level.txt +0 -0
pelican_nlp/Nils_backup/__init__.py (file without changes)

pelican_nlp/Nils_backup/extract_acoustic_features.py
@@ -0,0 +1,274 @@
+ import os
+ import pandas as pd
+ import numpy as np
+ from pydub import AudioSegment
+ from pyannote.audio import Model, Inference
+
+ class AudioFeatureExtractor:
+     def __init__(self, model_name, token, device="cpu"):
+         """
+         Initializes the AudioFeatureExtractor class.
+
+         Parameters:
+         - model_name: str, name of the pretrained model from pyannote
+         - token: str, the Hugging Face authentication token for downloading the model
+         - device: str, device to run the model on (default is "cpu")
+         """
+         self.model = Model.from_pretrained(model_name, use_auth_token=token).to(device)
+
+     def extract_audio_window(self, audio, start_time=0, duration=60000):
+         """
+         Extract a segment from the audio starting at `start_time` with a specified duration.
+
+         Parameters:
+         - audio: AudioSegment object, the input audio
+         - start_time: int, starting point of the window in milliseconds (default is 0)
+         - duration: int, duration of the window to extract in milliseconds (default is 60000)
+
+         Returns:
+         - AudioSegment object of the extracted window
+         """
+         end_time = start_time + duration
+         return audio[start_time:end_time]
+
+     def extract_embeddings(self, inference, file_path):
+         """
+         Extract embeddings from an audio file using the inference model.
+
+         Parameters:
+         - inference: Inference object from pyannote
+         - file_path: str, path to the audio file
+
+         Returns:
+         - numpy array of embeddings
+         """
+         embeddings = inference(file_path)
+         return np.asarray(embeddings)
+
+     def process_audio(self, file_path, mode="whole", start_time=0, duration=60000, window_step=None):
+         """
+         Process an audio file, extracting either whole or windowed embeddings based on mode.
+
+         Parameters:
+         - file_path: str, path to the audio file
+         - mode: str, "whole" for whole file extraction or "window" for windowed extraction (default is "whole")
+         - start_time: int, start time for the audio segment in milliseconds (only for window mode, default is 0)
+         - duration: int, duration for the audio segment in milliseconds (default is 60000)
+         - window_step: int, step size for window extraction in milliseconds (only for "window" mode)
+
+         Returns:
+         - numpy array of embeddings
+         """
+         audio = AudioSegment.from_file(file_path)
+         if mode == "whole":
+             inference = Inference(self.model, window="whole")
+             embeddings = self.extract_embeddings(inference, file_path)
+         elif mode == "window":
+             # If window mode is specified, extract in a sliding-window fashion
+             embeddings = []
+             # pyannote's Inference expects duration and step in seconds, not milliseconds
+             inference = Inference(self.model, window="sliding", duration=duration / 1000.0, step=window_step / 1000.0)
+
+             # Split audio into windows and extract embeddings for each window
+             for i in range(0, len(audio), window_step):
+                 window_audio = self.extract_audio_window(audio, start_time=i, duration=duration)
+                 temp_path = f"temp_window_{i}.wav"
+                 window_audio.export(temp_path, format="wav")
+                 window_embeddings = self.extract_embeddings(inference, temp_path)
+                 embeddings.append(window_embeddings)
+                 os.remove(temp_path)
+             embeddings = np.vstack(embeddings)  # Stack all window embeddings
+         else:
+             raise ValueError("Invalid mode. Use 'whole' or 'window'.")
+
+         return embeddings
+
+     def save_embeddings(self, embeddings, output_path):
+         """
+         Save the embeddings to a CSV file.
+
+         Parameters:
+         - embeddings: numpy array of embeddings
+         - output_path: str, path to save the CSV file
+         """
+         df = pd.DataFrame(embeddings)
+         df.to_csv(output_path, index=False)
+
+ # Example usage:
+ if __name__ == "__main__":
+     # Initialize the extractor
+     extractor = AudioFeatureExtractor(
+         model_name="pyannote/embedding",
+         token="YOUR_HUGGING_FACE_TOKEN",  # replace with your token; never commit a real one
+         device="mps"
+     )
+
+     # Process a whole file
+     whole_embeddings = extractor.process_audio(
+         file_path="path/to/audio_file.wav",
+         mode="whole"
+     )
+
+     # Process a file using sliding window extraction
+     window_embeddings = extractor.process_audio(
+         file_path="path/to/audio_file.wav",
+         mode="window",
+         start_time=0,
+         duration=10000,  # e.g., 10-second window
+         window_step=5000  # e.g., 5-second step
+     )
+
+     # Save the embeddings
+     extractor.save_embeddings(whole_embeddings, "path/to/output_whole.csv")
+     extractor.save_embeddings(window_embeddings, "path/to/output_window.csv")
+
+
+
+ '''import os
+ import numpy as np
+ from pydub import AudioSegment
+ from pyannote.audio import Model, Inference'''
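+ # NOTE: the class below redefines AudioFeatureExtractor and supersedes the
+ # version above: it also accepts an existing Model instance and converts
+ # window parameters from milliseconds to the seconds pyannote expects.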
+
+
+ class AudioFeatureExtractor:
+     def __init__(self, model_name_or_instance, device="cpu", use_auth_token=None):
+         """
+         Initializes the AudioFeatureExtractor class.
+
+         Parameters:
+         - model_name_or_instance: str or Model, the name of the pretrained model from pyannote, or a Model instance
+         - device: str, device to run the model on (default is "cpu")
+         - use_auth_token: str, Hugging Face authentication token if required
+         """
+         if isinstance(model_name_or_instance, str):
+             self.model = Model.from_pretrained(
+                 model_name_or_instance, use_auth_token=use_auth_token
+             ).to(device)
+         else:
+             self.model = model_name_or_instance.to(device)
+         self.device = device
+
+     def extract_audio_window(self, audio, start_time=0, duration=None):
+         """
+         Extract a segment from the audio starting at 'start_time' with a specified 'duration'.
+
+         Parameters:
+         - audio: AudioSegment object, the input audio
+         - start_time: int, starting point of the window in milliseconds (default is 0)
+         - duration: int, duration of the window to extract in milliseconds (default is None, i.e. until the end)
+
+         Returns:
+         - AudioSegment object of the extracted window
+         """
+         if duration is None:
+             duration = len(audio) - start_time
+         end_time = start_time + duration
+         return audio[start_time:end_time]
+
+     def extract_embeddings(self, inference, file_path):
+         """
+         Extract embeddings from the audio file using the specified inference model.
+
+         Parameters:
+         - inference: Inference object from pyannote
+         - file_path: str, path to the audio file
+
+         Returns:
+         - numpy array of embeddings
+         """
+         embeddings = inference(file_path)
+         return np.asarray(embeddings)
+
+     def process_audio(self, file_path, mode="whole", window_duration=None, window_step=None, start_time=0, end_time=None):
+         """
+         Process an audio file, extracting embeddings based on the specified mode.
+
+         Parameters:
+         - file_path: str, path to the audio file
+         - mode: str, "whole" for whole file extraction or "windowed" for windowed extraction (default is "whole")
+         - window_duration: int, duration of the window in milliseconds (required for "windowed" mode)
+         - window_step: int, step size in milliseconds between windows (required for "windowed" mode)
+         - start_time: int, start time in milliseconds for processing (default is 0)
+         - end_time: int, end time in milliseconds for processing (default is None, till the end)
+
+         Returns:
+         - numpy array of embeddings
+         """
+         # Load and optionally trim the audio file
+         audio = AudioSegment.from_file(file_path)
+         if end_time is None or end_time > len(audio):
+             end_time = len(audio)
+         audio = audio[start_time:end_time]
+
+         # Export the (possibly trimmed) audio to a temporary file
+         temp_dir = "temp_audio"
+         os.makedirs(temp_dir, exist_ok=True)
+         temp_path = os.path.join(temp_dir, "temp_audio.wav")
+         audio.export(temp_path, format="wav")
+
+         if mode == "whole":
+             inference = Inference(self.model, window="whole")
+             embeddings = self.extract_embeddings(inference, temp_path)
+         elif mode == "windowed":
+             if window_duration is None or window_step is None:
+                 raise ValueError("window_duration and window_step must be specified for 'windowed' mode.")
+             # Convert milliseconds to seconds for pyannote
+             window_duration_sec = window_duration / 1000.0
+             window_step_sec = window_step / 1000.0
+             inference = Inference(
+                 self.model,
+                 window="sliding",
+                 duration=window_duration_sec,
+                 step=window_step_sec
+             )
+             embeddings = self.extract_embeddings(inference, temp_path)
+         else:
+             os.remove(temp_path)
+             raise ValueError("Invalid mode. Use 'whole' or 'windowed'.")
+
+         # Clean up temporary file
+         os.remove(temp_path)
+         return embeddings
+
+     def save_embeddings(self, embeddings, output_path):
+         """
+         Save the embeddings to a file.
+
+         Parameters:
+         - embeddings: numpy array, the embeddings to save
+         - output_path: str, the path where embeddings will be saved
+         """
+         np.save(output_path, embeddings)
+         # Alternatively, to save as CSV:
+         # np.savetxt(output_path, embeddings, delimiter=",")
+
+
+ # Example usage:
+ if __name__ == "__main__":
+     # Initialize the extractor with a model name and token if required
+     extractor = AudioFeatureExtractor(
+         model_name_or_instance="pyannote/embedding",
+         device="cpu",
+         use_auth_token="YOUR_HUGGING_FACE_TOKEN"  # Replace with your token if necessary
+     )
+
+     # Path to your audio file
+     audio_file_path = "path/to/your/audio_file.wav"
+
+     # Extract embeddings from the whole audio file
+     whole_embeddings = extractor.process_audio(
+         file_path=audio_file_path,
+         mode="whole"
+     )
+
+     # Save the embeddings
+     extractor.save_embeddings(whole_embeddings, "whole_embeddings.npy")
+
+     # Extract embeddings using sliding windows
+     windowed_embeddings = extractor.process_audio(
+         file_path=audio_file_path,
+         mode="windowed",
+         window_duration=5000,  # Window duration in milliseconds (e.g., 5000 ms = 5 seconds)
+         window_step=1000  # Window step in milliseconds (e.g., 1000 ms = 1 second)
+     )
+
+     # Save the windowed embeddings
+     extractor.save_embeddings(windowed_embeddings, "windowed_embeddings.npy")
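For downstream comparisons, two embeddings saved by save_embeddings can be scored with cosine similarity. A minimal sketch, assuming the .npy outputs produced above and SciPy installed (file names are illustrative):

    import numpy as np
    from scipy.spatial.distance import cosine

    # Load two embeddings previously written with np.save (hypothetical paths)
    emb_a = np.load("whole_embeddings.npy")
    emb_b = np.load("other_recording.npy")

    # Cosine similarity: 1.0 = same direction, 0.0 = orthogonal
    similarity = 1.0 - cosine(emb_a.ravel(), emb_b.ravel())
    print(f"embedding similarity: {similarity:.3f}")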
pelican_nlp/Nils_backup/fluency/__init__.py (file without changes)

pelican_nlp/Nils_backup/fluency/aggregate_fluency_results.py
@@ -0,0 +1,186 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ """
+ Created on Tue Jun 18 12:28:18 2024
+
+ @author: nilsl
+ """
+ import os
+ import pandas as pd
+ import numpy as np
+ from config import CONFIG
+
+
+ def add_prefix_if_short(text, length, prefix):
+     if len(text) < length:
+         return prefix + text
+     return text
+
+
+ def parse_array_from_string(s):
+     """Parses a numpy array from a string representation."""
+     cleaned = s.strip("[]")
+     return np.array([float(x) for x in cleaned.split()])
+
+ def load_csvs_to_dataframe(directory, match_str, indices):
+     df_list = []
+     # Loop through all files in the specified directory
+     for filename in os.listdir(directory):
+         # Check if the file is a CSV and contains match_str in the filename
+         if filename.endswith(".csv") and match_str in filename:
+             # Construct the full path to the file
+             file_path = os.path.join(directory, filename)
+             # Read the CSV file into a DataFrame
+             df = pd.read_csv(file_path, index_col=0, dtype=str)
+             # Store selected underscore-separated parts of the filename as attribute columns
+             for i, index in enumerate(indices):
+                 df[f"attr_{i}"] = filename.split(".")[0].split("_")[index]
+             df_list.append(df)
+
+     return pd.concat(df_list, axis=0)
+
+
+ def pivot_df(df, index_cols, pivot_cols):
+     # Perform the pivot operation
+     df_pivot = df.pivot_table(
+         index=index_cols, columns=pivot_cols, aggfunc="first"
+     )
+
+     # Flatten the MultiIndex columns by joining level names with corresponding values
+     df_pivot.columns = [
+         "{}_{}".format("_".join(map(str, col[:-1])), col[-1])
+         for col in df_pivot.columns
+     ]
+
+     # Reset index to make index columns regular columns again
+     df_pivot.reset_index(inplace=True)
+
+     # Handling the columns after resetting index
+     all_cols = df_pivot.columns.tolist()
+     non_index_cols = [col for col in all_cols if col not in index_cols]
+
+     # Sort non-index columns by base name and trailing number, to handle multiple pivots
+     sorted_cols = sorted(
+         non_index_cols, key=lambda x: (x.split("_")[0], int(x.split("_")[-1]))
+     )
+
+     # Reorder DataFrame columns: index columns first, then sorted value columns
+     df_pivot = df_pivot[index_cols + sorted_cols]
+
+     return df_pivot
+
+ def main():
+     """Main execution function."""
+     lower = CONFIG["shared"]["preprocessing"]["lower"]
+
+     behav = pd.read_csv(CONFIG["aggregation"]["paths"]["behav_agg"], dtype=str)
+     # Applied twice so that IDs up to two characters short are padded to four digits
+     behav["study_id"] = (
+         behav["study_id"]
+         .apply(add_prefix_if_short, length=4, prefix="0")
+         .apply(add_prefix_if_short, length=4, prefix="0")
+     )
+     behav_scores = [col for col in behav.columns if col not in ["study_id", "group", "gender", "first_language", "diagnosis"]]
+     behav[behav_scores] = behav[behav_scores].astype(float)
+
+     demo = pd.read_csv(
+         CONFIG["aggregation"]["paths"]["demo_clinical"],
+         dtype=str
+     )[CONFIG["aggregation"]["demo_columns"]]
+
+     demo_scores = [col for col in demo.columns if col not in ["study_id", "group", "gender", "first_language", "diagnosis"]]
+     demo[demo_scores] = demo[demo_scores].astype(float)
+
+     behav_mss = pd.read_csv(CONFIG["aggregation"]["paths"]["questionnaires"], dtype=str)
+     behav_mss[behav_mss.columns.drop("study_id")] = behav_mss[behav_mss.columns.drop("study_id")].astype(float)
+
+     behav_mss["study_id"] = (
+         behav_mss["study_id"]
+         .apply(add_prefix_if_short, length=4, prefix="0")
+         .apply(add_prefix_if_short, length=4, prefix="0")
+     )
+
+     fluency_optimality = (
+         load_csvs_to_dataframe(
+             CONFIG["optimality"]["paths"]["results_dir"],
+             "lower" if lower else "upper",
+             [3, 4, 5],
+         )
+         .rename(
+             columns={
+                 "attr_0": "min_length",
+                 "attr_1": "index_0_shuffle",
+                 "attr_2": "index_-1_shuffle",
+             }
+         )
+         .drop("task", axis=1)
+     )
+
+     fluency_optimality["z_Real"] = (
+         fluency_optimality["actual_dist"].astype(float)
+         - fluency_optimality["average_dist"].astype(float)
+     ) / fluency_optimality["std_dist"].astype(float)
+     fluency_optimality["z_Real"] = fluency_optimality["z_Real"].astype(float)
+     fluency_optimality["all_pairs_average"] = fluency_optimality[
+         "all_pairs_average"
+     ].astype(float)
+     fluency_optimality["actual_dist"] = fluency_optimality["actual_dist"].astype(
+         float
+     )
+     fluency_optimality["min_length"] = fluency_optimality["min_length"].astype(
+         int
+     )
+
+     fluency_optimality = (
+         fluency_optimality.drop("window_index", axis=1)
+         .groupby(
+             [
+                 "min_length",
+                 "index_0_shuffle",
+                 "index_-1_shuffle",
+                 "analysis_mode",
+                 "study_id",
+                 "sub_task",
+             ]
+         )["z_Real"]
+         .mean()
+         .reset_index()
+     )
+
+     fluency_optimality_pivot = pivot_df(
+         fluency_optimality,
+         ["study_id", "sub_task"],
+         ["analysis_mode", "index_0_shuffle", "index_-1_shuffle", "min_length"],
+     )
+
+     fluency_coherence = pd.read_csv(
+         os.path.join(CONFIG["coherence"]["paths"]["results_dir"],
+                      f"coherence_results{'_lower' if lower else '_upper'}.csv"),
+         dtype=str
+     )
+
+     print(fluency_coherence.columns)
+     print(fluency_coherence.head())
+
+     fluency_coherence[fluency_coherence.columns.drop(["study_id", "sub_task"])] = fluency_coherence[fluency_coherence.columns.drop(["study_id", "sub_task"])].astype(float)
+
+     index = demo.merge(behav).merge(behav_mss)
+     metrics = fluency_coherence.merge(fluency_optimality_pivot, how="outer")
+
+     paper_df = index.merge(metrics)
+
+     # Map each fluency sub-task to its task family
+     paper_df["task"] = ""
+
+     paper_df.loc[paper_df["sub_task"] == "b", "task"] = "phonetic"
+     paper_df.loc[paper_df["sub_task"] == "k", "task"] = "phonetic"
+     paper_df.loc[paper_df["sub_task"] == "m", "task"] = "phonetic"
+
+     paper_df.loc[paper_df["sub_task"] == "animals", "task"] = "semantic"
+     paper_df.loc[paper_df["sub_task"] == "clothes", "task"] = "semantic"
+     paper_df.loc[paper_df["sub_task"] == "food", "task"] = "semantic"
+
+     paper_df.to_csv(CONFIG["aggregation"]["paths"]["output"])
+
+     return True
+
+ if __name__ == "__main__":
+     main()
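To make the shape of pivot_df's output concrete, here is a minimal sketch with toy data (assumes pivot_df from this file is in scope; the values are made up, but the column naming mirrors the z_Real/min_length usage in main):

    import pandas as pd

    toy = pd.DataFrame({
        "study_id": ["0001", "0001", "0002", "0002"],
        "min_length": [4, 8, 4, 8],
        "z_Real": [0.1, 0.5, -0.2, 0.3],
    })
    wide = pivot_df(toy, index_cols=["study_id"], pivot_cols=["min_length"])
    print(wide.columns.tolist())
    # expected: ['study_id', 'z_Real_4', 'z_Real_8'] -- one column per
    # (value column, pivot value) pair, sorted by the trailing number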
pelican_nlp/Nils_backup/fluency/behavioral_data.py
@@ -0,0 +1,42 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ """
+ Process VELAS behavioral data.
+
+ This script:
+ 1. Loads the VELAS behavioral master data
+ 2. Renames cognitive test variables for clarity
+ 3. Selects relevant columns
+ 4. Outputs cleaned CSV
+ """
+ import pandas as pd
+ from utils import ensure_output_dir
+ from config import BEHAVIORAL_CONFIG
+
+ def load_behavioral_data(filepath):
+     """Load behavioral data from CSV."""
+     return pd.read_csv(filepath)
+
+ def rename_cognitive_variables(df):
+     """Rename cognitive test variables for clarity."""
+     return df.rename(columns=BEHAVIORAL_CONFIG["cognitive_variable_mapping"])
+
+ def save_aggregated_data(df, output_path):
+     """Save relevant columns to CSV."""
+     ensure_output_dir(output_path)
+     df[BEHAVIORAL_CONFIG["columns_to_save"]].to_csv(output_path, index=False)
+
+ def main():
+     # Get paths from config
+     paths = BEHAVIORAL_CONFIG["paths"]
+
+     # Process data
+     df = load_behavioral_data(paths["input"])
+     print(df.columns)
+     df = rename_cognitive_variables(df)
+     save_aggregated_data(df, paths["output"])
+
+     print(f"Processed behavioral data saved to: {paths['output']}")
+
+ if __name__ == "__main__":
+     main()
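The script relies on a BEHAVIORAL_CONFIG dict defined in config.py. A minimal sketch with the three keys the functions above actually read (paths and column names here are purely illustrative):

    # config.py (illustrative sketch)
    BEHAVIORAL_CONFIG = {
        "paths": {
            "input": "data/velas_behavioral_master.csv",   # hypothetical path
            "output": "output/behavioral_clean.csv",       # hypothetical path
        },
        # raw column name -> readable name (hypothetical entries)
        "cognitive_variable_mapping": {
            "cog_01": "working_memory",
            "cog_02": "processing_speed",
        },
        "columns_to_save": ["study_id", "working_memory", "processing_speed"],
    }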
pelican_nlp/Nils_backup/fluency/check_duplicates.py
@@ -0,0 +1,169 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ """
+ Process VELAS fluency transcripts by cleaning duplicates and hyphenated words.
+
+ This script:
+ 1. Analyzes text files for duplicates and hyphenated words
+ 2. Cleans transcripts by removing duplicates and hyphens
+ 3. Saves processed transcripts to output directory
+ """
+ import os
+ import re
+ from collections import Counter
+ from pathlib import Path
+ from typing import Dict, List, Tuple
+ from utils import ensure_output_dir, validate_input_data
+ from config import DUPLICATES_CONFIG
+
+
+ # implemented in fluency_cleaner ==================================
+ def analyze_transcript(content: str) -> Tuple[int, int]:
+     """
+     Count duplicates and hyphenated words in a transcript.
+
+     Args:
+         content: Semicolon-separated transcript content
+
+     Returns:
+         Tuple of (duplicate_count, hyphenated_word_count)
+     """
+     words = content.split(';')
+     word_counter = Counter(words)
+
+     duplicates = sum(count - 1 for count in word_counter.values() if count > 1)
+     hyphenated = sum(1 for word in words if '-' in word)
+
+     return duplicates, hyphenated
+ # =============================================================================
+
+ def analyze_directory(directory: str) -> Dict[str, int]:
+     """
+     Analyze all transcripts in directory for duplicates and hyphenated words.
+
+     Args:
+         directory: Path to transcript directory
+
+     Returns:
+         Dictionary with total counts of duplicates and hyphenated words
+     """
+     total_duplicates = 0
+     total_hyphenated = 0
+
+     for filename in os.listdir(directory):
+         if filename.endswith('.txt') and DUPLICATES_CONFIG["file_filter"] in filename:
+             filepath = os.path.join(directory, filename)
+             with open(filepath, 'r') as file:
+                 content = file.read()
+                 duplicates, hyphenated = analyze_transcript(content)
+                 total_duplicates += duplicates
+                 total_hyphenated += hyphenated
+
+     return {
+         'duplicates': total_duplicates,
+         'hyphenated': total_hyphenated
+     }
+
+ # =======================================================
+ # implemented in fluency_cleaner
+ def clean_transcript(content: str) -> str:
+     """Remove whitespace, hyphens, and duplicate words from a semicolon-separated transcript."""
+     # Remove all whitespace
+     content = re.sub(r'\s+', '', content).strip()
+
+     # Split and clean words
+     words = [word for word in content.split(';') if word]
+     words = [word.replace('-', '') for word in words]
+
+     # Remove duplicate words while preserving order of first occurrence
+     word_counter = Counter(words)
+     seen = set()
+     cleaned_words = []
+
+     for word in words:
+         if word in seen and word_counter[word] > 1:
+             word_counter[word] -= 1
+         else:
+             cleaned_words.append(word)
+             seen.add(word)
+
+     return ';'.join(cleaned_words)
+ # ===========================================================
+
+ def process_directory(input_dir: str, output_dir: str) -> None:
+     """
+     Process all transcripts in directory, cleaning and saving to output directory.
+
+     Args:
+         input_dir: Directory containing raw transcripts
+         output_dir: Directory for cleaned transcripts
+     """
+     # Create the output directory and any necessary parent directories
+     print(f"Creating output directory: {output_dir}")
+     print(f"Output directory exists before ensure_output_dir? {os.path.exists(output_dir)}")
+     ensure_output_dir(output_dir)
+     print(f"Output directory exists after ensure_output_dir? {os.path.exists(output_dir)}")
+
+     for filename in os.listdir(input_dir):
+         if filename.endswith('.txt') and DUPLICATES_CONFIG["file_filter"] in filename:
+             input_path = os.path.join(input_dir, filename)
+             output_path = os.path.join(output_dir, filename)
+
+             print(f"\nProcessing file: {filename}")
+             print(f"Input path: {input_path}")
+             print(f"Output path: {output_path}")
+             print(f"Output dir exists? {os.path.exists(os.path.dirname(output_path))}")
+
+             # Ensure the directory for this specific file exists
+             ensure_output_dir(os.path.dirname(output_path))
+             print(f"Output dir exists after ensure? {os.path.exists(os.path.dirname(output_path))}")
+
+             with open(input_path, 'r') as infile:
+                 content = infile.read()
+                 cleaned_content = clean_transcript(content)
+
+             with open(output_path, 'w') as outfile:
+                 outfile.write(cleaned_content)
+
+ def print_analysis_results(results: Dict[str, int], stage: str) -> None:
+     """Print analysis results in a formatted way."""
+     print(f"\nAnalysis results ({stage}):")
+     print(f"- Total duplicates: {results['duplicates']}")
+     print(f"- Total hyphenated words: {results['hyphenated']}")
+
+ def main():
+     # Get paths from config
+     paths = DUPLICATES_CONFIG["paths"]
+
+     # Validate input paths and create output directories
+     input_errors = validate_input_data({"transcripts": paths["input"]})
+     if input_errors:
+         for desc, error in input_errors.items():
+             print(f"Error with {desc}: {error}")
+         return
+
+     print(f"\nInput directory: {paths['input']}")
+     print(f"Output directory: {paths['output']}")
+     print(f"Input directory exists? {os.path.exists(paths['input'])}")
+
+     ensure_output_dir(paths["output"])
+     print(f"Output directory exists after ensure? {os.path.exists(paths['output'])}")
+
+     # Analyze original transcripts
+     print("\nAnalyzing original transcripts...")
+     original_results = analyze_directory(paths["input"])
+     print_analysis_results(original_results, "before cleaning")
+
+     # Process transcripts
+     print("\nCleaning transcripts...")
+     process_directory(paths["input"], paths["output"])
+
+     # Analyze cleaned transcripts
+     print("\nAnalyzing cleaned transcripts...")
+     cleaned_results = analyze_directory(paths["output"])
+     print_analysis_results(cleaned_results, "after cleaning")
+
+     print(f"\nCleaned transcripts saved to: {paths['output']}")
+
+ if __name__ == "__main__":
+     main()
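A quick sanity check of the cleaning logic, with a toy semicolon-separated transcript (illustrative only):

    raw = "apple; banana;apple;ice-cream;banana"
    print(analyze_transcript(raw.replace(" ", "")))  # -> (2, 1): two duplicates, one hyphenated word
    print(clean_transcript(raw))                     # -> 'apple;banana;icecream'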