pelican-nlp 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pelican_nlp/__init__.py +9 -0
- pelican_nlp/core/__init__.py +5 -0
- pelican_nlp/core/audio_document.py +20 -0
- pelican_nlp/core/corpus.py +296 -0
- pelican_nlp/core/document.py +226 -0
- pelican_nlp/core/subject.py +30 -0
- pelican_nlp/extraction/__init__.py +2 -0
- pelican_nlp/extraction/acoustic_feature_extraction.py +71 -0
- pelican_nlp/extraction/distance_from_randomness.py +109 -0
- pelican_nlp/extraction/extract_embeddings.py +57 -0
- pelican_nlp/extraction/extract_logits.py +102 -0
- pelican_nlp/extraction/language_model.py +71 -0
- pelican_nlp/extraction/semantic_similarity.py +60 -0
- pelican_nlp/extraction/test_documents/test_features.csv +4 -0
- pelican_nlp/extraction/test_documents/wallace_1.15_3.txt +1 -0
- pelican_nlp/extraction/test_documents/wallace_1.1_3.txt +1 -0
- pelican_nlp/extraction/test_documents/wallace_1_4.txt +1 -0
- pelican_nlp/main.py +211 -0
- pelican_nlp/metrics_statistics/embeddings_metrics_statistics.py +34 -0
- pelican_nlp/preprocessing/LPDS.py +77 -0
- pelican_nlp/preprocessing/__init__.py +7 -0
- pelican_nlp/preprocessing/pipeline.py +50 -0
- pelican_nlp/preprocessing/speaker_diarization.py +33 -0
- pelican_nlp/preprocessing/text_cleaner.py +224 -0
- pelican_nlp/preprocessing/text_importer.py +42 -0
- pelican_nlp/preprocessing/text_normalizer.py +24 -0
- pelican_nlp/preprocessing/text_tokenizer.py +43 -0
- pelican_nlp/sample_configuration_files/config_discourse.yml +103 -0
- pelican_nlp/sample_configuration_files/config_fluency.yml +108 -0
- pelican_nlp/sample_configuration_files/config_general.yml +131 -0
- pelican_nlp/utils/__init__.py +3 -0
- pelican_nlp/utils/csv_functions.py +193 -0
- pelican_nlp/utils/sample_usage.py +17 -0
- pelican_nlp/utils/setup_functions.py +93 -0
- pelican_nlp-0.1.0.dist-info/METADATA +146 -0
- pelican_nlp-0.1.0.dist-info/RECORD +39 -0
- pelican_nlp-0.1.0.dist-info/WHEEL +5 -0
- pelican_nlp-0.1.0.dist-info/licenses/LICENSE +400 -0
- pelican_nlp-0.1.0.dist-info/top_level.txt +1 -0
pelican_nlp/core/audio_document.py
ADDED
@@ -0,0 +1,20 @@
import os

class AudioFile:
    def __init__(self, file_path, name, **kwargs):
        self.file_path = file_path
        self.name = name
        self.file = os.path.join(file_path, name)

        # Initialize optional attributes
        self.subject_ID = kwargs.get('subject_ID')
        self.task = kwargs.get('task')
        self.num_speakers = kwargs.get('num_speakers')
        self.corpus_name = None
        self.recording_length = None

        self.opensmile_results = None
        self.prosogram_features = None

    def __repr__(self):
        return f"file_name={self.name}"
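For orientation, a minimal usage sketch of the AudioFile container above; the module path follows the file layout in this package, while the directory, file name, and metadata are hypothetical:

from pelican_nlp.core.audio_document import AudioFile

# Hypothetical recording; optional metadata is passed via **kwargs.
audio = AudioFile('/data/project/audio', 'sub-01_interview.wav',
                  subject_ID='sub-01', task='interview', num_speakers=2)
print(audio.file)  # /data/project/audio/sub-01_interview.wav
print(audio)       # file_name=sub-01_interview.wav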
pelican_nlp/core/corpus.py
ADDED
@@ -0,0 +1,296 @@
"""
This module provides the Corpus class, which aggregates documents that share the same
processing steps and whose results should be aggregated
(e.g. all fluency files from the task 'animals', or all image descriptions for the same image).

This class contains the pipelines for homogeneous processing and metric extraction of all grouped files.
"""

from ..preprocessing import TextPreprocessingPipeline
from ..utils.csv_functions import store_features_to_csv
from ..extraction.language_model import Model
from ..preprocessing.speaker_diarization import TextDiarizer
from ..preprocessing import text_cleaner as textcleaner
import os
import pandas as pd
import re

class Corpus:
    def __init__(self, corpus_name, documents, configuration_settings):
        self.name = corpus_name
        self.documents = documents
        self.config = configuration_settings
        self.derivative_dir = os.path.join(self.config['PATH_TO_PROJECT_FOLDER'], 'derivatives')
        self.pipeline = TextPreprocessingPipeline(self.config)
        self.task = configuration_settings['task_name']
        self.results_path = None

    def preprocess_all_documents(self):
        """Preprocess all documents."""
        print('Preprocessing all documents...')
        for document in self.documents:
            document.detect_sections()
            document.process_document(self.pipeline)

    def get_all_processed_texts(self):
        result = {}
        for document in self.documents:
            result[document.name] = document.get_processed_text()
        return result

    def create_corpus_results_consolidation_csv(self):
        """Create separate aggregated results CSV files for each metric."""
        print("Creating aggregated results files per metric...")

        try:
            derivatives_path = os.path.dirname(os.path.dirname(self.documents[0].results_path))
        except (AttributeError, IndexError):
            print("Error: No valid results path found in documents")
            return

        # Create aggregations folder
        aggregation_path = os.path.join(derivatives_path, 'aggregations')
        os.makedirs(aggregation_path, exist_ok=True)

        # Initialize results dictionary with metrics as keys
        results_by_metric = {}

        # Walk through all directories in derivatives
        for root, dirs, files in os.walk(derivatives_path):
            # Skip the aggregations directory itself
            if 'aggregations' in root:
                continue

            for file in files:
                if not file.endswith('.csv'):
                    continue

                file_path = os.path.join(root, file)
                try:
                    df = pd.read_csv(file_path)
                    subject_key = os.path.basename(file).split('_')[0]

                    # Determine metric type from file name
                    if 'semantic-similarity-window' in file:
                        metric = 'semantic-similarity'
                    elif 'distance-from-randomness' in file:
                        metric = 'distance-from-randomness'
                    else:
                        continue

                    # Initialize metric dict if it does not exist yet
                    if metric not in results_by_metric:
                        results_by_metric[metric] = {}

                    # Initialize subject dict if it does not exist yet
                    if subject_key not in results_by_metric[metric]:
                        results_by_metric[metric][subject_key] = {}

                    # Process based on metric type
                    if metric == 'semantic-similarity':
                        window_size = re.search(r'window-(\d+)', file).group(1)
                        for _, row in df.iterrows():
                            if 'Metric' in df.columns and 'Similarity_Score' in df.columns:
                                metric_name = f"window_{window_size}_{row['Metric']}"
                                results_by_metric[metric][subject_key][metric_name] = row['Similarity_Score']

                except Exception as e:
                    print(f"Error processing {file_path}: {e}")
                    continue

        # Save separate aggregated results for each metric
        for metric, metric_results in results_by_metric.items():
            if metric_results:
                output_file = os.path.join(aggregation_path, f'{self.name}_{metric}_aggregated_results.csv')
                pd.DataFrame(metric_results).T.to_csv(output_file)
                print(f"Aggregated results for {metric} saved to: {output_file}")

        if not results_by_metric:
            print("No results to aggregate")

    def extract_logits(self):
        from pelican_nlp.extraction.extract_logits import LogitsExtractor
        from pelican_nlp.preprocessing.text_tokenizer import TextTokenizer
        logits_options = self.config['options_logits']
        project_path = self.config['PATH_TO_PROJECT_FOLDER']

        print('logits extraction in progress')
        model_name = logits_options['model_name']
        logitsExtractor = LogitsExtractor(logits_options,
                                          self.pipeline,
                                          project_path)
        model = Model(model_name, project_path)
        model.load_model()
        model_instance = model.model_instance
        tokenizer = TextTokenizer(logits_options['tokenization_method'], model_name=logits_options['model_name'])
        for i in range(len(self.documents)):

            for key, section in self.documents[i].cleaned_sections.items():

                if self.config['discourse']:
                    section = TextDiarizer.parse_speaker(section, self.config['subject_speakertag'],
                                                         logits_options['keep_speakertags'])
                    #print(f'parsed section is {section}')
                else:
                    section = [section]

                print(f'Extracting Logits for section {key}')

                for part in section:
                    print(part)
                    logits = logitsExtractor.extract_features(part, tokenizer, model_instance)
                    print(logits)
                    self.documents[i].logits.append(logits)

                    # 'logits' is a list of dictionaries; keys: token, logprob_actual, logprob_max, entropy, most_likely_token
                    store_features_to_csv(logits,
                                          self.derivative_dir,
                                          self.documents[i],
                                          metric='logits')

    def extract_embeddings(self):
        from pelican_nlp.extraction.extract_embeddings import EmbeddingsExtractor

        embedding_options = self.config['options_embeddings']
        print('Embeddings extraction in progress...')
        embeddingsExtractor = EmbeddingsExtractor(embedding_options, self.config['PATH_TO_PROJECT_FOLDER'])
        for i in range(len(self.documents)):
            for key, section in self.documents[i].cleaned_sections.items():
                print(f'Processing section {key}')

                if self.config['discourse']:
                    section = TextDiarizer.parse_speaker(section, self.config['subject_speakertag'], embedding_options['keep_speakertags'])
                else:
                    section = [section]

                embeddings, token_count = embeddingsExtractor.extract_embeddings_from_text(section)
                self.documents[i].embeddings.append(embeddings)

                if self.task == 'fluency':
                    self.documents[i].fluency_word_count = token_count

                for utterance in embeddings:

                    if self.config['options_embeddings']['semantic-similarity']:
                        from pelican_nlp.extraction.semantic_similarity import calculate_semantic_similarity, \
                            get_semantic_similarity_windows
                        consecutive_similarities, mean_similarity = calculate_semantic_similarity(utterance)
                        print(f'Mean semantic similarity: {mean_similarity:.4f}')

                        for window_size in self.config['options_semantic-similarity']['window_sizes']:
                            window_stats = get_semantic_similarity_windows(utterance, window_size)
                            if isinstance(window_stats, tuple) and len(window_stats) == 5:
                                window_data = {
                                    'mean_of_window_means': window_stats[0],
                                    'std_of_window_means': window_stats[1],
                                    'mean_of_window_stds': window_stats[2],
                                    'std_of_window_stds': window_stats[3],
                                    'mean_of_window_medians': window_stats[4]
                                }
                                print(f'Window {window_size} stats - mean: {window_stats[0]:.4f}, std: {window_stats[1]:.4f}, median: {window_stats[4]:.4f}')
                            else:
                                window_data = {
                                    'mean': window_stats[0] if isinstance(window_stats, tuple) else window_stats,
                                    'std': window_stats[1] if isinstance(window_stats, tuple) and len(window_stats) > 1 else None
                                }

                            store_features_to_csv(window_data,
                                                  self.derivative_dir,
                                                  self.documents[i],
                                                  metric=f'semantic-similarity-window-{window_size}')

                    if self.config['options_embeddings']['distance-from-randomness']:
                        from pelican_nlp.extraction.distance_from_randomness import get_distance_from_randomness
                        divergence = get_distance_from_randomness(utterance, self.config["options_dis_from_randomness"])
                        print(f'Divergence from optimality metrics: {divergence}')
                        store_features_to_csv(divergence,
                                              self.derivative_dir,
                                              self.documents[i],
                                              metric='distance-from-randomness')

                    # Process tokens
                    if embedding_options['clean_embedding_tokens']:
                        cleaned_embeddings = []
                        if isinstance(utterance, dict):
                            # Handle dictionary case (PyTorch models)
                            for token, embedding in utterance.items():
                                if 'xlm-roberta-base' in self.config['options_embeddings']['model_name'].lower():
                                    cleaned_token = textcleaner.clean_subword_token_RoBERTa(token)
                                else:
                                    cleaned_token = textcleaner.clean_token_generic(token)
                                if cleaned_token is not None:
                                    cleaned_embeddings.append((cleaned_token, embedding))
                        else:
                            # Handle list of tuples case (fastText)
                            for token, embedding in utterance:
                                cleaned_token = textcleaner.clean_token_generic(token)
                                if cleaned_token is not None:
                                    cleaned_embeddings.append((cleaned_token, embedding))
                    else:
                        cleaned_embeddings = utterance if isinstance(utterance, list) else [(k, v) for k, v in utterance.items()]

                    store_features_to_csv(cleaned_embeddings,
                                          self.derivative_dir,
                                          self.documents[i],
                                          metric='embeddings')
        return

    def extract_opensmile_features(self):
        from pelican_nlp.extraction.acoustic_feature_extraction import AudioFeatureExtraction
        for i in range(len(self.documents)):
            results, recording_length = AudioFeatureExtraction.opensmile_extraction(self.documents[i].file, self.config['opensmile_configurations'])
            self.documents[i].recording_length = recording_length  # Store the recording length
            results['subject_ID'] = self.documents[i].subject_ID  # Set the subject ID
            print('results obtained')
            store_features_to_csv(results,
                                  self.derivative_dir,
                                  self.documents[i],
                                  metric='opensmile-features')

    def extract_prosogram(self):
        from pelican_nlp.extraction.acoustic_feature_extraction import AudioFeatureExtraction
        for i in range(len(self.documents)):
            results = AudioFeatureExtraction.extract_prosogram_profile(self.documents[i].file)
            print('prosogram obtained')

    def create_document_information_csv(self):
        """Create CSV file with summarized document parameters based on config specifications."""
        print("Creating document information summary...")

        try:
            derivatives_path = os.path.dirname(os.path.dirname(self.documents[0].results_path))
        except (AttributeError, IndexError):
            print("Error: No valid results path found in documents")
            return

        # Create document_information folder inside aggregations
        doc_info_path = os.path.join(derivatives_path, 'aggregations', 'document_information')
        os.makedirs(doc_info_path, exist_ok=True)

        # Define output file path
        output_file = os.path.join(doc_info_path, f'{self.name}_document-information.csv')

        # Get parameters to include from config
        parameters_to_include = self.config.get('document_information_output', {}).get('parameters', [])

        if not parameters_to_include:
            print("Warning: No parameters specified in config for document information output")
            return

        # Get document information based on specified parameters
        document_info = []
        for doc in self.documents:
            # Get all attributes using vars()
            attrs = vars(doc)
            # Filter based on specified parameters
            info = {
                param: attrs.get(param)
                for param in parameters_to_include
                if param in attrs
            }
            document_info.append(info)

        # Convert to DataFrame and save to CSV
        df = pd.DataFrame(document_info)
        df.to_csv(output_file, index=False)
        print(f"Document information saved to: {output_file}")
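For context, a minimal sketch of how this class is driven, assuming documents have already been created and a configuration dict shaped like the sample YAML files; only keys actually referenced in this module are shown, and all paths and values are hypothetical:

from pelican_nlp.core.corpus import Corpus

config = {
    'PATH_TO_PROJECT_FOLDER': '/data/my_project',      # hypothetical project root
    'task_name': 'fluency',
    'discourse': False,                                # no speaker diarization
    'options_embeddings': {
        'model_name': 'fastText',
        'clean_embedding_tokens': True,
        'keep_speakertags': False,
        'semantic-similarity': True,
        'distance-from-randomness': False,
    },
    'options_semantic-similarity': {'window_sizes': [2, 8]},
}

corpus = Corpus('animal-fluency', documents, config)   # 'documents': list of Document instances
corpus.preprocess_all_documents()                      # section detection + cleaning pipeline
corpus.extract_embeddings()                            # writes per-document CSVs under <project>/derivatives
corpus.create_corpus_results_consolidation_csv()       # aggregated CSVs under derivatives/aggregations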
pelican_nlp/core/document.py
ADDED
@@ -0,0 +1,226 @@
"""
This module provides the Document class, each instance representing one file within a corpus.
The Document class stores all document-specific information.
"""

import os
import re
from pelican_nlp.preprocessing import TextImporter
from collections import defaultdict, OrderedDict

class Document:

    def __init__(self, file_path, name, **kwargs):
        """Initialize Document object.

        Args:
            file_path: Path to document file
            name: Document name
            **kwargs: Optional document attributes
        """
        self.file_path = file_path
        self.name = name
        self.file = os.path.join(file_path, name)

        # Initialize optional attributes
        self.subject_ID = kwargs.get('subject_ID')
        self.task = kwargs.get('task')
        self.num_speakers = kwargs.get('num_speakers')
        self.has_sections = kwargs.get('has_sections', False)
        self.has_section_titles = kwargs.get('has_section_titles')
        self.section_identifier = kwargs.get('section_identifier')
        self.number_of_sections = kwargs.get('number_of_sections')
        self.lines = kwargs.get('lines', [])
        self.new_parameter = kwargs.get('new_parameter')
        self.another_metric = kwargs.get('another_metric')

        # Derived attributes
        self.has_segments = self.task == "discourse"
        self.segments = [] if self.has_segments else ["default"] * len(self.lines)
        self.sections = None

        # Initialize processing attributes
        self._init_processing_attributes()
        self._init_document_metrics()

    def _init_processing_attributes(self):
        """Initialize attributes related to text processing."""
        self.extension = None
        self.session = None
        self.corpus_name = None
        self.sections = {}
        self.section_metrics = {}

        # Load raw text
        self.importer = TextImporter(self.file_path)
        self.raw_text = self.importer.load_text(self.file)

        # Text processing state
        self.fluency_word_count = None
        self.fluency_duplicate_count = None
        self.cleaned_sections = {}
        self.tokens_logits = []
        self.tokens_embeddings = []
        self.normalized_tokens = None
        self.processed_text = None
        self.logits = []
        self.embeddings = []
        self.acoustic_features = None

    def _init_document_metrics(self):
        """Initialize document metrics."""
        self.length_in_lines = len(self.lines)
        self.length_in_words = sum(line.length_in_words for line in self.lines)
        self.fluency = None
        self.number_of_duplicates = None
        self.number_of_hyphenated_words = None

    def __repr__(self):
        return f"file_name={self.name}"

    def add_line(self, line):
        self.lines.append(line)
        self.length_in_lines = len(self.lines)
        self.length_in_words += line.length_in_words
        if not self.has_segments:
            self.segments.append("default")

    def compile_texts_and_tags(self):
        self.words, self.word_tags, self.word_segments = [], [], []
        for line, segment in zip(self.lines, self.segments):
            line_words = line.text.split()
            tag = "i" if line.speaker.lower() == "investigator" else "s"

            self.word_segments.extend([segment] * len(line_words))
            self.words.extend(line_words)
            self.word_tags.extend([tag] * len(line_words))

    def segment_task(self, protocol, cutoff=1):
        if not self.has_segments:
            return self.segments

        patterns = {
            section: re.compile("|".join(f"(?:\\b{re.escape(term)}\\b)" for term in terms), re.IGNORECASE)
            for section, terms in protocol.items()
        }

        match_scores = defaultdict(list)
        for section, pattern in patterns.items():
            for line_index, line in enumerate(self.lines):
                if pattern.search(line.text):
                    match_scores[section].append(line_index)

        section_order = sorted(protocol.keys(), key=lambda x: int(x))
        section_starts = OrderedDict()
        last_index_used = -1

        for section in section_order:
            line_indices = match_scores[section]
            valid_starts = [idx for idx in line_indices if idx > last_index_used and len(line_indices) >= cutoff]
            if valid_starts:
                start_line = min(valid_starts)
                section_starts[section] = start_line
                last_index_used = start_line

        segment_names = ["1"] * len(self.lines)
        current_section = None
        for i in range(len(self.lines)):
            if i in section_starts.values():
                current_section = [sec for sec, start in section_starts.items() if start == i][0]
            segment_names[i] = current_section if current_section else "default"

        self.segments = segment_names
        self.sections = self._create_sections(segment_names)
        return segment_names

    def _create_sections(self, segment_names):
        sections = defaultdict(list)
        for line, segment in zip(self.lines, segment_names):
            sections[segment].append(line)
        return sections

    def detect_sections(self):
        print('detecting sections...')
        if not self.raw_text:
            raise ValueError("Raw text must be loaded before detecting sections.")

        lines = self.raw_text.splitlines()
        if not self.has_sections:
            if self.has_section_titles and lines:
                title, content = (lines[0].strip(), "\n".join(lines[1:]).strip()) if lines else ("untitled section", "")
            else:
                title, content = "untitled section", "\n".join(lines).strip()
            self.sections = {title: content}
            print(self.sections)
            return

        sections = {}
        current_title, current_content = None, []
        section_titles = []

        for line in lines:
            if line.startswith(self.section_identifier):
                if current_title:
                    sections[current_title] = "\n".join(current_content).strip()

                current_title = line.strip()
                section_titles.append(current_title)
                current_content = []
            else:
                if current_title:
                    current_content.append(line)

        if current_title:
            sections[current_title] = "\n".join(current_content).strip()

        self.sections = sections

        if self.number_of_sections is not None and len(self.sections) != self.number_of_sections:
            raise ValueError("Incorrect number of sections detected.")

    def process_document(self, pipeline):
        print(f"Processing document: {self.name}")
        pipeline.process_document(self)

    def clean_text(self, cleaner):
        if not self.sections:
            raise ValueError("Text must be divided into sections before cleaning.")

        self.cleaned_sections = self.sections.copy()
        for title, content in self.sections.items():
            if self.fluency:
                self.cleaned_sections[title] = (
                    cleaner.clean_fluency_transcripts(self, content)
                )
            else:
                self.cleaned_sections[title] = (
                    cleaner.clean(self, content)
                )

    def tokenize_text(self, tokenizer, purpose):
        if not self.cleaned_sections:
            raise ValueError("Text must be cleaned before tokenizing.")

        for _, content in self.cleaned_sections.items():
            tokens = tokenizer.tokenize(content)
            if purpose == "logits":
                self.tokens_logits.append(tokens)
            elif purpose == "embeddings":
                self.tokens_embeddings.append(tokens)

    def normalize_text(self, normalizer):
        if not self.tokens_logits:
            raise ValueError("Text must be tokenized before normalization.")

        self.normalized_tokens = normalizer.normalize(self.tokens_logits)

    def get_processed_text(self):
        return " ".join(self.normalized_tokens) if self.normalized_tokens else self.tokens_logits

    def get_document_metadata(self):
        return {
            "file_path": self.file_path,
            "task": self.task,
            "num_speakers": self.num_speakers,
            "has_sections": self.has_sections,
        }
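To illustrate the section-detection contract, a small sketch that bypasses __init__ (and its file I/O) via __new__ and feeds raw text in directly; the marker text and identifier are invented for the example:

from pelican_nlp.core.document import Document

doc = Document.__new__(Document)        # skip __init__ for the demo: no file is read
doc.raw_text = "# Part A\nhello world\n# Part B\nmore text"
doc.has_sections = True
doc.has_section_titles = True
doc.section_identifier = '#'
doc.number_of_sections = None

doc.detect_sections()
print(doc.sections)
# {'# Part A': 'hello world', '# Part B': 'more text'}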
pelican_nlp/core/subject.py
ADDED
@@ -0,0 +1,30 @@
"""
This module provides the Subject class, each instance representing one subject.
The Subject class stores all subject-specific information and a list of corresponding documents.
"""

class Subject:
    def __init__(self, subjectID, description=None):

        self.subjectID = subjectID
        self.gender = None
        self.age = None
        self.name = None
        self.description = description  # Description of the subject
        self.documents = []  # List of TextDocument instances
        self.numberOfSessions = None

    def __repr__(self):
        return f"Subject(ID={self.subjectID})"

    def add_document(self, document):
        self.documents.append(document)
        document.subject = self

    def process_subject(self, importer, cleaner, tokenizer, normalizer):
        print(f'Subject {self.subjectID} is being processed')
        for document in self.documents:
            continue  # per-document processing is a placeholder; not implemented yet

    def get_subject_info(self):
        return f"Subject: {self.name}\nDescription: {self.description}\nNumber of files: {len(self.documents)}"
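A brief sketch of the intended Subject/Document relationship; the ID, description, and the 'doc' variable are hypothetical:

from pelican_nlp.core.subject import Subject

subject = Subject('sub-01', description='healthy control')
subject.add_document(doc)               # 'doc': a Document instance; also sets doc.subject = subject
print(subject)                          # Subject(ID=sub-01)
print(subject.get_subject_info())       # name/description plus number of attached files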
pelican_nlp/extraction/acoustic_feature_extraction.py
ADDED
@@ -0,0 +1,71 @@
import io

import audiofile
import pandas as pd

class AudioFeatureExtraction:

    @staticmethod
    def opensmile_extraction(file, opensmile_configurations):
        print('opensmile extraction in progress...')
        import opensmile

        print(f'audio file is: {file}')

        signal, sampling_rate = audiofile.read(
            file,
            always_2d=True,
            duration=opensmile_configurations['duration'],
            offset=opensmile_configurations['offset']
        )

        # extract eGeMAPSv02 feature set
        smile = opensmile.Smile(
            feature_set=opensmile.FeatureSet.eGeMAPSv02,
            feature_level=opensmile.FeatureLevel.Functionals
        )

        output = smile.process_signal(
            signal,
            sampling_rate
        )

        # Create result dictionary with only the values we want
        result = {}
        result['subject_ID'] = None  # This will be set by the calling function
        for feature in smile.feature_names:
            # Extract just the numerical value from the output
            result[feature] = float(output[feature].values[0])

        # Get recording length from the index
        recording_length = output.index[0][1].total_seconds()  # This gets the end time from the MultiIndex

        return result, recording_length

    @staticmethod
    def extract_prosogram_profile(file):
        """
        Extract prosodic features using prosogram through parselmouth.

        Returns:
            profile (DataFrame): Prosogram analysis results
        """
        import parselmouth
        from pelican.praat import PROSOGRAM_SCRIPT
        try:
            sound = parselmouth.Sound(file)
            # Common Prosogram parameters
            result = parselmouth.praat.run_file(
                PROSOGRAM_SCRIPT,
                arguments=[sound, "save=yes", "draw=no"],
                capture_output=True
            )

            # Convert result into a DataFrame with the same format
            # (io.StringIO replaces the long-removed pd.compat.StringIO)
            profile = pd.read_csv(io.StringIO(result), sep="\t")

            return profile

        except Exception as e:
            print(f"Error processing {file}")
            print(f"Full error message: {str(e)}")
            raise  # This will show the full stack trace
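A minimal sketch of calling the openSMILE extraction directly, assuming the opensmile and audiofile packages are installed; the path is hypothetical, and duration/offset mirror the config keys read above:

from pelican_nlp.extraction.acoustic_feature_extraction import AudioFeatureExtraction

opensmile_config = {'duration': None, 'offset': 0}     # read the whole recording from the start
features, length = AudioFeatureExtraction.opensmile_extraction(
    '/data/audio/sub-01_interview.wav', opensmile_config)

print(f'recording length: {length:.1f}s')
print(f'{len(features) - 1} eGeMAPSv02 functionals')   # minus the subject_ID placeholder key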