pelican_nlp-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. pelican_nlp/__init__.py +9 -0
  2. pelican_nlp/core/__init__.py +5 -0
  3. pelican_nlp/core/audio_document.py +20 -0
  4. pelican_nlp/core/corpus.py +296 -0
  5. pelican_nlp/core/document.py +226 -0
  6. pelican_nlp/core/subject.py +30 -0
  7. pelican_nlp/extraction/__init__.py +2 -0
  8. pelican_nlp/extraction/acoustic_feature_extraction.py +71 -0
  9. pelican_nlp/extraction/distance_from_randomness.py +109 -0
  10. pelican_nlp/extraction/extract_embeddings.py +57 -0
  11. pelican_nlp/extraction/extract_logits.py +102 -0
  12. pelican_nlp/extraction/language_model.py +71 -0
  13. pelican_nlp/extraction/semantic_similarity.py +60 -0
  14. pelican_nlp/extraction/test_documents/test_features.csv +4 -0
  15. pelican_nlp/extraction/test_documents/wallace_1.15_3.txt +1 -0
  16. pelican_nlp/extraction/test_documents/wallace_1.1_3.txt +1 -0
  17. pelican_nlp/extraction/test_documents/wallace_1_4.txt +1 -0
  18. pelican_nlp/main.py +211 -0
  19. pelican_nlp/metrics_statistics/embeddings_metrics_statistics.py +34 -0
  20. pelican_nlp/preprocessing/LPDS.py +77 -0
  21. pelican_nlp/preprocessing/__init__.py +7 -0
  22. pelican_nlp/preprocessing/pipeline.py +50 -0
  23. pelican_nlp/preprocessing/speaker_diarization.py +33 -0
  24. pelican_nlp/preprocessing/text_cleaner.py +224 -0
  25. pelican_nlp/preprocessing/text_importer.py +42 -0
  26. pelican_nlp/preprocessing/text_normalizer.py +24 -0
  27. pelican_nlp/preprocessing/text_tokenizer.py +43 -0
  28. pelican_nlp/sample_configuration_files/config_discourse.yml +103 -0
  29. pelican_nlp/sample_configuration_files/config_fluency.yml +108 -0
  30. pelican_nlp/sample_configuration_files/config_general.yml +131 -0
  31. pelican_nlp/utils/__init__.py +3 -0
  32. pelican_nlp/utils/csv_functions.py +193 -0
  33. pelican_nlp/utils/sample_usage.py +17 -0
  34. pelican_nlp/utils/setup_functions.py +93 -0
  35. pelican_nlp-0.1.0.dist-info/METADATA +146 -0
  36. pelican_nlp-0.1.0.dist-info/RECORD +39 -0
  37. pelican_nlp-0.1.0.dist-info/WHEEL +5 -0
  38. pelican_nlp-0.1.0.dist-info/licenses/LICENSE +400 -0
  39. pelican_nlp-0.1.0.dist-info/top_level.txt +1 -0
pelican_nlp/__init__.py
@@ -0,0 +1,9 @@
+ # Version and metadata
+ __version__ = "0.1.0"
+ __author__ = "Yves Pauli"
+
+ try:
+     from .main import Pelican
+ except ImportError as e:
+     print(f"Warning: Could not import Pelican class: {e}")
+
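Note: the guarded import above means the top-level package always exposes __version__, while Pelican is only available when pelican_nlp.main and its dependencies import cleanly. A minimal illustrative check (not part of the package):

import pelican_nlp

print(pelican_nlp.__version__)        # "0.1.0"
if hasattr(pelican_nlp, "Pelican"):   # absent if the guarded import above failed
    print("Pelican entry point available")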
pelican_nlp/core/__init__.py
@@ -0,0 +1,5 @@
+ # Import core classes for easier access
+ from .corpus import Corpus
+ from .document import Document
+ from .audio_document import AudioFile
+ from .subject import Subject
pelican_nlp/core/audio_document.py
@@ -0,0 +1,20 @@
+ import os
+
+ class AudioFile:
+     def __init__(self, file_path, name, **kwargs):
+         self.file_path = file_path
+         self.name = name
+         self.file = os.path.join(file_path, name)
+
+         # Initialize optional attributes
+         self.subject_ID = kwargs.get('subject_ID')
+         self.task = kwargs.get('task')
+         self.num_speakers = kwargs.get('num_speakers')
+         self.corpus_name = None
+         self.recording_length = None
+
+         self.opensmile_results = None
+         self.prosogram_features = None
+
+     def __repr__(self):
+         return f"file_name={self.name}"
pelican_nlp/core/corpus.py
@@ -0,0 +1,296 @@
+ """
+ This module provides the Corpus class, which aggregates documents that share the same processing
+ steps and whose results should be aggregated
+ (e.g. all fluency files from the task 'animals', or all image descriptions of the same image).
+
+ The class contains the pipelines for homogeneous processing and metric extraction of all grouped files.
+ """
+
+ from ..preprocessing import TextPreprocessingPipeline
+ from ..utils.csv_functions import store_features_to_csv
+ from ..extraction.language_model import Model
+ from ..preprocessing.speaker_diarization import TextDiarizer
+ from ..preprocessing import text_cleaner as textcleaner
+ import os
+ import pandas as pd
+ import re
+
+ class Corpus:
+     def __init__(self, corpus_name, documents, configuration_settings):
+         self.name = corpus_name
+         self.documents = documents
+         self.config = configuration_settings
+         self.derivative_dir = self.config['PATH_TO_PROJECT_FOLDER'] + '/derivatives'
+         self.pipeline = TextPreprocessingPipeline(self.config)
+         self.task = configuration_settings['task_name']
+         self.results_path = None
+
+     def preprocess_all_documents(self):
+         """Preprocess all documents."""
+         print('Preprocessing all documents...')
+         for document in self.documents:
+             document.detect_sections()
+             document.process_document(self.pipeline)
+
+     def get_all_processed_texts(self):
+         result = {}
+         for document in self.documents:
+             result[document.name] = document.get_processed_text()
+         return result
+
+     def create_corpus_results_consolidation_csv(self):
+         """Create separate aggregated results CSV files for each metric."""
+         print("Creating aggregated results files per metric...")
+
+         try:
+             derivatives_path = os.path.dirname(os.path.dirname(self.documents[0].results_path))
+         except (AttributeError, IndexError):
+             print("Error: No valid results path found in documents")
+             return
+
+         # Create aggregations folder
+         aggregation_path = os.path.join(derivatives_path, 'aggregations')
+         os.makedirs(aggregation_path, exist_ok=True)
+
+         # Initialize results dictionary with metrics as keys
+         results_by_metric = {}
+
+         # Walk through all directories in derivatives
+         for root, dirs, files in os.walk(derivatives_path):
+             # Skip the aggregations directory itself
+             if 'aggregations' in root:
+                 continue
+
+             for file in files:
+                 if not file.endswith('.csv'):
+                     continue
+
+                 file_path = os.path.join(root, file)
+                 try:
+                     df = pd.read_csv(file_path)
+                     subject_key = os.path.basename(file).split('_')[0]
+
+                     # Determine metric type from file path
+                     if 'semantic-similarity-window' in file:
+                         metric = 'semantic-similarity'
+                     elif 'distance-from-randomness' in file:
+                         metric = 'distance-from-randomness'
+                     else:
+                         continue
+
+                     # Initialize metric dict if not exists
+                     if metric not in results_by_metric:
+                         results_by_metric[metric] = {}
+
+                     # Initialize subject dict if not exists
+                     if subject_key not in results_by_metric[metric]:
+                         results_by_metric[metric][subject_key] = {}
+
+                     # Process based on metric type
+                     if metric == 'semantic-similarity':
+                         window_size = re.search(r'window-(\d+)', file).group(1)
+                         for _, row in df.iterrows():
+                             if 'Metric' in df.columns and 'Similarity_Score' in df.columns:
+                                 metric_name = f"window_{window_size}_{row['Metric']}"
+                                 results_by_metric[metric][subject_key][metric_name] = row['Similarity_Score']
+
+                 except Exception as e:
+                     print(f"Error processing {file_path}: {e}")
+                     continue
+
+         # Save separate aggregated results for each metric
+         for metric, metric_results in results_by_metric.items():
+             if metric_results:
+                 output_file = os.path.join(aggregation_path, f'{self.name}_{metric}_aggregated_results.csv')
+                 pd.DataFrame(metric_results).T.to_csv(output_file)
+                 print(f"Aggregated results for {metric} saved to: {output_file}")
+
+         if not results_by_metric:
+             print("No results to aggregate")
+
+     def extract_logits(self):
+         from pelican_nlp.extraction.extract_logits import LogitsExtractor
+         from pelican_nlp.preprocessing.text_tokenizer import TextTokenizer
+         logits_options = self.config['options_logits']
+         project_path = self.config['PATH_TO_PROJECT_FOLDER']
+
+         print('logits extraction in progress')
+         model_name = logits_options['model_name']
+         logitsExtractor = LogitsExtractor(logits_options,
+                                           self.pipeline,
+                                           project_path)
+         model = Model(model_name, project_path)
+         model.load_model()
+         model_instance = model.model_instance
+         tokenizer = TextTokenizer(logits_options['tokenization_method'], model_name=logits_options['model_name'])
+         for i in range(len(self.documents)):
+
+             for key, section in self.documents[i].cleaned_sections.items():
+
+                 if self.config['discourse']:
+                     section = TextDiarizer.parse_speaker(section, self.config['subject_speakertag'],
+                                                          logits_options['keep_speakertags'])
+                     #print(f'parsed section is {section}')
+                 else:
+                     section = [section]
+
+                 print(f'Extracting Logits for section {key}')
+
+                 for part in section:
+                     print(part)
+                     logits = logitsExtractor.extract_features(part, tokenizer, model_instance)
+                     print(logits)
+                     self.documents[i].logits.append(logits)
+
+                     # 'logits': list of dictionaries; keys: token, logprob_actual, logprob_max, entropy, most_likely_token
+                     store_features_to_csv(logits,
+                                           self.derivative_dir,
+                                           self.documents[i],
+                                           metric='logits')
+
+     def extract_embeddings(self):
+         from pelican_nlp.extraction.extract_embeddings import EmbeddingsExtractor
+
+         embedding_options = self.config['options_embeddings']
+         print('Embeddings extraction in progress...')
+         embeddingsExtractor = EmbeddingsExtractor(embedding_options, self.config['PATH_TO_PROJECT_FOLDER'])
+         for i in range(len(self.documents)):
+             for key, section in self.documents[i].cleaned_sections.items():
+                 print(f'Processing section {key}')
+
+                 if self.config['discourse']:
+                     section = TextDiarizer.parse_speaker(section, self.config['subject_speakertag'], embedding_options['keep_speakertags'])
+                 else:
+                     section = [section]
+
+                 embeddings, token_count = embeddingsExtractor.extract_embeddings_from_text(section)
+                 self.documents[i].embeddings.append(embeddings)
+
+                 if self.task == 'fluency':
+                     self.documents[i].fluency_word_count = token_count
+
+                 for utterance in embeddings:
+
+                     if self.config['options_embeddings']['semantic-similarity']:
+                         from pelican_nlp.extraction.semantic_similarity import calculate_semantic_similarity, \
+                             get_semantic_similarity_windows
+                         consecutive_similarities, mean_similarity = calculate_semantic_similarity(utterance)
+                         print(f'Mean semantic similarity: {mean_similarity:.4f}')
+
+                         for window_size in self.config['options_semantic-similarity']['window_sizes']:
+                             window_stats = get_semantic_similarity_windows(utterance, window_size)
+                             if isinstance(window_stats, tuple) and len(window_stats) == 5:
+                                 window_data = {
+                                     'mean_of_window_means': window_stats[0],
+                                     'std_of_window_means': window_stats[1],
+                                     'mean_of_window_stds': window_stats[2],
+                                     'std_of_window_stds': window_stats[3],
+                                     'mean_of_window_medians': window_stats[4]
+                                 }
+                                 print(f'Window {window_size} stats - mean: {window_stats[0]:.4f}, std: {window_stats[1]:.4f}, median: {window_stats[4]:.4f}')
+                             else:
+                                 window_data = {
+                                     'mean': window_stats[0] if isinstance(window_stats, tuple) else window_stats,
+                                     'std': window_stats[1] if isinstance(window_stats, tuple) and len(window_stats) > 1 else None
+                                 }
+
+                             store_features_to_csv(window_data,
+                                                   self.derivative_dir,
+                                                   self.documents[i],
+                                                   metric=f'semantic-similarity-window-{window_size}')
+
+                     if self.config['options_embeddings']['distance-from-randomness']:
+                         from pelican_nlp.extraction.distance_from_randomness import get_distance_from_randomness
+                         divergence = get_distance_from_randomness(utterance, self.config["options_dis_from_randomness"])
+                         print(f'Divergence from optimality metrics: {divergence}')
+                         store_features_to_csv(divergence,
+                                               self.derivative_dir,
+                                               self.documents[i],
+                                               metric='distance-from-randomness')
+
+                     # Process tokens
+                     if embedding_options['clean_embedding_tokens']:
+                         cleaned_embeddings = []
+                         if isinstance(utterance, dict):
+                             # Handle dictionary case (PyTorch models)
+                             for token, embedding in utterance.items():
+                                 if 'xlm-roberta-base' in self.config['options_embeddings']['model_name'].lower():
+                                     cleaned_token = textcleaner.clean_subword_token_RoBERTa(token)
+                                 else:
+                                     cleaned_token = textcleaner.clean_token_generic(token)
+                                 if cleaned_token is not None:
+                                     cleaned_embeddings.append((cleaned_token, embedding))
+                         else:
+                             # Handle list of tuples case (fastText)
+                             for token, embedding in utterance:
+                                 cleaned_token = textcleaner.clean_token_generic(token)
+                                 if cleaned_token is not None:
+                                     cleaned_embeddings.append((cleaned_token, embedding))
+                     else:
+                         cleaned_embeddings = utterance if isinstance(utterance, list) else [(k, v) for k, v in utterance.items()]
+
+                     store_features_to_csv(cleaned_embeddings,
+                                           self.derivative_dir,
+                                           self.documents[i],
+                                           metric='embeddings')
+         return
+
+     def extract_opensmile_features(self):
+         from pelican_nlp.extraction.acoustic_feature_extraction import AudioFeatureExtraction
+         for i in range(len(self.documents)):
+             results, recording_length = AudioFeatureExtraction.opensmile_extraction(self.documents[i].file, self.config['opensmile_configurations'])
+             self.documents[i].recording_length = recording_length  # Store the recording length
+             results['subject_ID'] = self.documents[i].subject_ID  # Set the subject ID
+             print('results obtained')
+             store_features_to_csv(results,
+                                   self.derivative_dir,
+                                   self.documents[i],
+                                   metric='opensmile-features')
+
+     def extract_prosogram(self):
+         from pelican_nlp.extraction.acoustic_feature_extraction import AudioFeatureExtraction
+         for i in range(len(self.documents)):
+             results = AudioFeatureExtraction.extract_prosogram_profile(self.documents[i].file)
+             print('prosogram obtained')
+
+     def create_document_information_csv(self):
+         """Create a CSV file with summarized document parameters based on config specifications."""
+         print("Creating document information summary...")
+
+         try:
+             derivatives_path = os.path.dirname(os.path.dirname(self.documents[0].results_path))
+         except (AttributeError, IndexError):
+             print("Error: No valid results path found in documents")
+             return
+
+         # Create document_information folder inside aggregations
+         doc_info_path = os.path.join(derivatives_path, 'aggregations', 'document_information')
+         os.makedirs(doc_info_path, exist_ok=True)
+
+         # Define output file path
+         output_file = os.path.join(doc_info_path, f'{self.name}_document-information.csv')
+
+         # Get parameters to include from config
+         parameters_to_include = self.config.get('document_information_output', {}).get('parameters', [])
+
+         if not parameters_to_include:
+             print("Warning: No parameters specified in config for document information output")
+             return
+
+         # Get document information based on specified parameters
+         document_info = []
+         for doc in self.documents:
+             # Get all attributes using vars()
+             attrs = vars(doc)
+             # Filter based on specified parameters
+             info = {
+                 param: attrs.get(param)
+                 for param in parameters_to_include
+                 if param in attrs
+             }
+             document_info.append(info)
+
+         # Convert to DataFrame and save to CSV
+         df = pd.DataFrame(document_info)
+         df.to_csv(output_file, index=False)
+         print(f"Document information saved to: {output_file}")
pelican_nlp/core/document.py
@@ -0,0 +1,226 @@
+ """
+ This module provides the Document class, each instance representing one file within a corpus.
+ The Document class stores all document-specific information.
+ """
+
+ import os
+ import re
+ from pelican_nlp.preprocessing import TextImporter
+ from collections import defaultdict, OrderedDict
+
+ class Document:
+
+     def __init__(self, file_path, name, **kwargs):
+         """Initialize Document object.
+
+         Args:
+             file_path: Path to document file
+             name: Document name
+             **kwargs: Optional document attributes
+         """
+         self.file_path = file_path
+         self.name = name
+         self.file = os.path.join(file_path, name)
+
+         # Initialize optional attributes
+         self.subject_ID = kwargs.get('subject_ID')
+         self.task = kwargs.get('task')
+         self.num_speakers = kwargs.get('num_speakers')
+         self.has_sections = kwargs.get('has_sections', False)
+         self.has_section_titles = kwargs.get('has_section_titles')
+         self.section_identifier = kwargs.get('section_identifier')
+         self.number_of_sections = kwargs.get('number_of_sections')
+         self.lines = kwargs.get('lines', [])
+         self.new_parameter = kwargs.get('new_parameter')
+         self.another_metric = kwargs.get('another_metric')
+
+         # Derived attributes
+         self.has_segments = self.task == "discourse"
+         self.segments = [] if self.has_segments else ["default"] * len(self.lines)
+         self.sections = None
+
+         # Initialize processing attributes
+         self._init_processing_attributes()
+         self._init_document_metrics()
+
+     def _init_processing_attributes(self):
+         """Initialize attributes related to text processing."""
+         self.extension = None
+         self.session = None
+         self.corpus_name = None
+         self.sections = {}
+         self.section_metrics = {}
+
+         # Load raw text
+         self.importer = TextImporter(self.file_path)
+         self.raw_text = self.importer.load_text(self.file)
+
+         # Text processing state
+         self.fluency_word_count = None
+         self.fluency_duplicate_count = None
+         self.cleaned_sections = {}
+         self.tokens_logits = []
+         self.tokens_embeddings = []
+         self.normalized_tokens = None
+         self.processed_text = None
+         self.logits = []
+         self.embeddings = []
+         self.acoustic_features = None
+
+     def _init_document_metrics(self):
+         """Initialize document metrics."""
+         self.length_in_lines = len(self.lines)
+         self.length_in_words = sum(line.length_in_words for line in self.lines)
+         self.fluency = None
+         self.number_of_duplicates = None
+         self.number_of_hyphenated_words = None
+
+     def __repr__(self):
+         return f"file_name={self.name}"
+
+     def add_line(self, line):
+         self.lines.append(line)
+         self.length_in_lines = len(self.lines)
+         self.length_in_words += line.length_in_words
+         if not self.has_segments:
+             self.segments.append("default")
+
+     def compile_texts_and_tags(self):
+         self.words, self.word_tags, self.word_segments = [], [], []
+         for line, segment in zip(self.lines, self.segments):
+             line_words = line.text.split()
+             tag = "i" if line.speaker.lower() == "investigator" else "s"
+
+             self.word_segments.extend([segment] * len(line_words))
+             self.words.extend(line_words)
+             self.word_tags.extend([tag] * len(line_words))
+
+     def segment_task(self, protocol, cutoff=1):
+         if not self.has_segments:
+             return self.segments
+
+         patterns = {
+             section: re.compile("|".join(f"(?:\\b{re.escape(term)}\\b)" for term in terms), re.IGNORECASE)
+             for section, terms in protocol.items()
+         }
+
+         match_scores = defaultdict(list)
+         for section, pattern in patterns.items():
+             for line_index, line in enumerate(self.lines):
+                 if pattern.search(line.text):
+                     match_scores[section].append(line_index)
+
+         section_order = sorted(protocol.keys(), key=lambda x: int(x))
+         section_starts = OrderedDict()
+         last_index_used = -1
+
+         for section in section_order:
+             line_indices = match_scores[section]
+             valid_starts = [idx for idx in line_indices if idx > last_index_used and len(line_indices) >= cutoff]
+             if valid_starts:
+                 start_line = min(valid_starts)
+                 section_starts[section] = start_line
+                 last_index_used = start_line
+
+         segment_names = ["1"] * len(self.lines)
+         current_section = None
+         for i in range(len(self.lines)):
+             if i in section_starts.values():
+                 current_section = [sec for sec, start in section_starts.items() if start == i][0]
+             segment_names[i] = current_section if current_section else "default"
+
+         self.segments = segment_names
+         self.sections = self._create_sections(segment_names)
+         return segment_names
+
+     def _create_sections(self, segment_names):
+         sections = defaultdict(list)
+         for line, segment in zip(self.lines, segment_names):
+             sections[segment].append(line)
+         return sections
+
+     def detect_sections(self):
+         print('detecting sections...')
+         if not self.raw_text:
+             raise ValueError("Raw text must be loaded before detecting sections.")
+
+         lines = self.raw_text.splitlines()
+         if not self.has_sections:
+             if self.has_section_titles and lines:
+                 title, content = (lines[0].strip(), "\n".join(lines[1:]).strip()) if lines else ("untitled section", "")
+             else:
+                 title, content = "untitled section", "\n".join(lines).strip()
+             self.sections = {title: content}
+             print(self.sections)
+             return
+
+         sections = {}
+         current_title, current_content = None, []
+         section_titles = []
+
+         for line in lines:
+             if line.startswith(self.section_identifier):
+                 if current_title:
+                     sections[current_title] = "\n".join(current_content).strip()
+
+                 current_title = line.strip()
+                 section_titles.append(current_title)
+                 current_content = []
+             else:
+                 if current_title:
+                     current_content.append(line)
+
+         if current_title:
+             sections[current_title] = "\n".join(current_content).strip()
+
+         self.sections = sections
+
+         if self.number_of_sections is not None and len(self.sections) != self.number_of_sections:
+             raise ValueError("Incorrect number of sections detected.")
+
+     def process_document(self, pipeline):
+         print(f"Processing document: {self.name}")
+         pipeline.process_document(self)
+
+     def clean_text(self, cleaner):
+         if not self.sections:
+             raise ValueError("Text must be divided into sections before cleaning.")
+
+         self.cleaned_sections = self.sections.copy()
+         for title, content in self.sections.items():
+             if self.fluency:
+                 self.cleaned_sections[title] = (
+                     cleaner.clean_fluency_transcripts(self, content)
+                 )
+             else:
+                 self.cleaned_sections[title] = (
+                     cleaner.clean(self, content)
+                 )
+
+     def tokenize_text(self, tokenizer, purpose):
+         if not self.cleaned_sections:
+             raise ValueError("Text must be cleaned before tokenizing.")
+
+         for _, content in self.cleaned_sections.items():
+             tokens = tokenizer.tokenize(content)
+             if purpose == "logits":
+                 self.tokens_logits.append(tokens)
+             elif purpose == "embeddings":
+                 self.tokens_embeddings.append(tokens)
+
+     def normalize_text(self, normalizer):
+         if not self.tokens_logits:
+             raise ValueError("Text must be tokenized before normalization.")
+
+         self.normalized_tokens = normalizer.normalize(self.tokens_logits)
+
+     def get_processed_text(self):
+         return " ".join(self.normalized_tokens) if self.normalized_tokens else self.tokens_logits
+
+     def get_document_metadata(self):
+         return {
+             "file_path": self.file_path,
+             "task": self.task,
+             "num_speakers": self.num_speakers,
+             "has_sections": self.has_sections,
+         }
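The methods above imply a fixed order: raw text is loaded in the constructor, detect_sections() must run before clean_text(), which must run before tokenize_text(), which must run before normalize_text() (each step guards the previous one with a ValueError). A hedged sketch with hypothetical paths and a transcript whose section titles start with '#':

from pelican_nlp.core import Document

doc = Document('/path/to/project/subjects/sub-01', 'sub-01_interview.txt',
               task='interview', has_sections=True,
               section_identifier='#', number_of_sections=2)

doc.detect_sections()                # populates doc.sections as {title: content}
print(doc.get_document_metadata())   # file_path, task, num_speakers, has_sections

# clean_text / tokenize_text / normalize_text expect helper objects exposing
# clean()/clean_fluency_transcripts(), tokenize() and normalize() respectively
# (see pelican_nlp.preprocessing); they must be called in that order.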
pelican_nlp/core/subject.py
@@ -0,0 +1,30 @@
+ """
+ This module provides the Subject class, each instance representing one subject.
+ The Subject class stores all subject-specific information and a list of corresponding documents.
+ """
+
+ class Subject:
+     def __init__(self, subjectID, description=None):
+
+         self.subjectID = subjectID
+         self.gender = None
+         self.age = None
+         self.name = None
+         self.description = description  # Description of the subject
+         self.documents = []  # List of Document instances
+         self.numberOfSessions = None
+
+     def __repr__(self):
+         return f"Subject(ID={self.subjectID})"
+
+     def add_document(self, document):
+         self.documents.append(document)
+         document.subject = self
+
+     def process_subject(self, importer, cleaner, tokenizer, normalizer):
+         print(f'Subject {self.subjectID} is being processed')
+         for document in self.documents:
+             continue
+
+     def get_subject_info(self):
+         return f"Subject: {self.name}\nDescription: {self.description}\nNumber of files: {len(self.documents)}"
pelican_nlp/extraction/__init__.py
@@ -0,0 +1,2 @@
+ # Import extraction related classes and functions
+ from .language_model import Model
pelican_nlp/extraction/acoustic_feature_extraction.py
@@ -0,0 +1,71 @@
+ import io
+
+ import audiofile
+ import pandas as pd
+
+ class AudioFeatureExtraction:
+
+     @staticmethod
+     def opensmile_extraction(file, opensmile_configurations):
+         print('opensmile extraction in progress...')
+         import opensmile
+
+         print(f'audio file is: {file}')
+
+         signal, sampling_rate = audiofile.read(
+             file,
+             always_2d=True,
+             duration=opensmile_configurations['duration'],
+             offset=opensmile_configurations['offset']
+         )
+
+         # Extract the eGeMAPSv02 feature set
+         smile = opensmile.Smile(
+             feature_set=opensmile.FeatureSet.eGeMAPSv02,
+             feature_level=opensmile.FeatureLevel.Functionals
+         )
+
+         output = smile.process_signal(
+             signal,
+             sampling_rate
+         )
+
+         # Create a result dictionary with only the values we want
+         result = {}
+         result['subject_ID'] = None  # This will be set by the calling function
+         for feature in smile.feature_names:
+             # Extract just the numerical value from the output
+             result[feature] = float(output[feature].values[0])
+
+         # Get the recording length from the index
+         recording_length = output.index[0][1].total_seconds()  # End time from the MultiIndex
+
+         return result, recording_length
+
+     @staticmethod
+     def extract_prosogram_profile(file):
+         """
+         Extract prosodic features using Prosogram through parselmouth.
+
+         Returns:
+             profile (DataFrame): Prosogram analysis results
+         """
+         import parselmouth
+         from pelican.praat import PROSOGRAM_SCRIPT
+         try:
+             sound = parselmouth.Sound(file)
+             # Common Prosogram parameters
+             result = parselmouth.praat.run_file(
+                 PROSOGRAM_SCRIPT,
+                 arguments=[sound, "save=yes", "draw=no"],
+                 capture_output=True
+             )
+
+             # Convert the result into a DataFrame with the same format
+             profile = pd.read_csv(io.StringIO(result), sep="\t")
+
+             return profile
+
+         except Exception as e:
+             print(f"Error processing {file}")
+             print(f"Full error message: {str(e)}")
+             raise  # Re-raise to show the full stack trace
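A hedged sketch of calling the openSMILE path above; the WAV path is an assumption, and the configuration keys mirror exactly what opensmile_extraction reads (duration and offset are passed straight through to audiofile.read):

from pelican_nlp.extraction.acoustic_feature_extraction import AudioFeatureExtraction

opensmile_config = {
    'duration': None,   # None reads the whole file
    'offset': 0,        # start at the beginning of the recording
}

features, length_s = AudioFeatureExtraction.opensmile_extraction(
    '/path/to/recording.wav',   # assumed example path
    opensmile_config,
)
# 'features' holds the eGeMAPSv02 functionals plus a 'subject_ID' placeholder;
# 'length_s' is the recording length in seconds taken from the openSMILE output index.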