pelican-nlp 0.2.7__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pelican_nlp-0.2.7/pelican_nlp.egg-info → pelican_nlp-0.3.0}/PKG-INFO +1 -1
- pelican_nlp-0.3.0/pelican_nlp/_version.py +1 -0
- pelican_nlp-0.3.0/pelican_nlp/config.py +14 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/core/corpus.py +26 -30
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/core/subject.py +3 -3
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/extraction/acoustic_feature_extraction.py +1 -1
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/extraction/extract_embeddings.py +3 -1
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/extraction/extract_logits.py +4 -2
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/extraction/language_model.py +40 -10
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/main.py +49 -27
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/preprocessing/LPDS.py +9 -6
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/utils/csv_functions.py +45 -35
- pelican_nlp-0.3.0/pelican_nlp/utils/filename_parser.py +23 -0
- pelican_nlp-0.3.0/pelican_nlp/utils/setup_functions.py +118 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0/pelican_nlp.egg-info}/PKG-INFO +1 -1
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp.egg-info/SOURCES.txt +2 -0
- pelican_nlp-0.2.7/pelican_nlp/_version.py +0 -1
- pelican_nlp-0.2.7/pelican_nlp/utils/setup_functions.py +0 -92
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/LICENSE +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/MANIFEST.in +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/README.rst +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/examples/PyPI_testing_discourse/config_discourse.yml +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/examples/PyPI_testing_discourse/subjects/sub-01/interview/sub-01_interview_schizophrenia_run-01.rtf +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/examples/PyPI_testing_fluency/config_fluency.yml +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/examples/PyPI_testing_fluency/subjects/sub-01/fluency/sub-01_fluency_sem_animals.txt +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/examples/PyPI_testing_fluency/subjects/sub-01/fluency/sub-01_fluency_sem_clothes.txt +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/examples/PyPI_testing_fluency/subjects/sub-01/fluency/sub-01_fluency_sem_food.txt +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/examples/PyPI_testing_fluency/subjects/sub-02/fluency/sub-02_fluency_sem_animals.txt +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/examples/PyPI_testing_fluency/subjects/sub-02/fluency/sub-02_fluency_sem_clothes.txt +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/examples/PyPI_testing_fluency/subjects/sub-02/fluency/sub-02_fluency_sem_food.txt +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/examples/PyPI_testing_image-descriptions/config_image-descriptions.yml +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/examples/PyPI_testing_image-descriptions/subjects/sub-01/ses-01/image-description/sub-01_ses-01_image-description_drug.docx +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/examples/PyPI_testing_image-descriptions/subjects/sub-01/ses-01/image-description/sub-01_ses-01_image-description_placebo.docx +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/examples/PyPI_testing_image-descriptions/subjects/sub-01/ses-02/image-description/sub-01_ses-02_image-description_drug.docx +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/examples/PyPI_testing_image-descriptions/subjects/sub-01/ses-02/image-description/sub-01_ses-02_image-description_placebo.docx +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/examples/PyPI_testing_image-descriptions/subjects/sub-02/ses-01/image-description/sub-02_ses-01_image-description_drug.docx +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/examples/PyPI_testing_image-descriptions/subjects/sub-02/ses-01/image-description/sub-02_ses-01_image-description_placebo.docx +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/Nils_backup/__init__.py +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/Nils_backup/extract_acoustic_features.py +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/Nils_backup/fluency/__init__.py +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/Nils_backup/fluency/aggregate_fluency_results.py +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/Nils_backup/fluency/behavioral_data.py +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/Nils_backup/fluency/check_duplicates.py +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/Nils_backup/fluency/coherence.py +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/Nils_backup/fluency/config.py +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/Nils_backup/fluency/main.py +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/Nils_backup/fluency/optimality_without_tsa.py +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/Nils_backup/fluency/plot_fluency.py +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/Nils_backup/fluency/plotting_utils.py +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/Nils_backup/fluency/questionnaires_data.py +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/Nils_backup/fluency/stats_fluency.py +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/Nils_backup/fluency/utils.py +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/Nils_backup/speaker_diarization_Nils.py +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/Nils_backup/transcription/__init__.py +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/Nils_backup/transcription/annotation_tool.py +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/Nils_backup/transcription/annotation_tool_boundaries.py +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/Nils_backup/transcription/annotation_tool_sandbox.py +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/Nils_backup/transcription/output/holmes_control_nova_all_outputs.json +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/Nils_backup/transcription/test.json +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/Nils_backup/transcription/transcribe_audio.py +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/Nils_backup/transcription/transcribe_audio_chunked.py +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/Nils_backup/transcription/transcription.py +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/Nils_backup/transcription/transcription_gui.py +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/Nils_backup/transcription/word_boundaries.py +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/Silvia_files/Opensmile/opensmile_feature_extraction.py +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/Silvia_files/prosogram/prosogram.py +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/__init__.py +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/cli.py +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/configuration_files/config_audio.yml +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/configuration_files/config_discourse.yml +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/configuration_files/config_fluency.yml +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/configuration_files/config_general.yml +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/configuration_files/config_morteza.yml +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/core/__init__.py +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/core/audio_document.py +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/core/document.py +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/extraction/__init__.py +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/extraction/distance_from_randomness.py +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/extraction/semantic_similarity.py +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/extraction/test_documents/test_features.csv +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/extraction/test_documents/wallace_1.15_3.txt +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/extraction/test_documents/wallace_1.1_3.txt +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/extraction/test_documents/wallace_1_4.txt +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/metrics_statistics/embeddings_metrics_statistics.py +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/praat/__init__.py +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/preprocessing/__init__.py +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/preprocessing/pipeline.py +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/preprocessing/speaker_diarization.py +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/preprocessing/text_cleaner.py +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/preprocessing/text_importer.py +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/preprocessing/text_normalizer.py +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/preprocessing/text_tokenizer.py +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/sample_configuration_files/config_discourse.yml +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/sample_configuration_files/config_fluency.yml +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/sample_configuration_files/config_general.yml +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/utils/__init__.py +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/utils/sample_usage.py +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp.egg-info/dependency_links.txt +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp.egg-info/entry_points.txt +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp.egg-info/requires.txt +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp.egg-info/top_level.txt +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pyproject.toml +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/requirements.txt +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/setup.cfg +0 -0
- {pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/tests/__init__.py +0 -0
pelican_nlp-0.3.0/pelican_nlp/_version.py
ADDED
@@ -0,0 +1 @@
+__version__ = "0.3.0"
pelican_nlp-0.3.0/pelican_nlp/config.py
ADDED
@@ -0,0 +1,14 @@
+"""
+Global configuration settings for the Pelican project.
+
+This file is not the configuration.yml file created for the users adaptations.
+For consistency of pipeline, DO NOT CHANGE.
+"""
+
+# Debug flag
+DEBUG_MODE = False
+
+def debug_print(*args, **kwargs):
+    """Print only if debug mode is enabled."""
+    if DEBUG_MODE:
+        print(*args, **kwargs)
{pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/core/corpus.py
RENAMED
@@ -15,20 +15,24 @@ import os
 import pandas as pd
 import re
 
+from pelican_nlp.config import debug_print
+
 class Corpus:
     def __init__(self, corpus_name, documents, configuration_settings, project_folder):
         self.name = corpus_name
+        self.key = corpus_name.split('-')[0]
+        self.value = corpus_name.split('-')[1]
         self.documents = documents
         self.config = configuration_settings
         self.project_folder = project_folder
-        self.
+        self.derivatives_dir = project_folder / 'derivatives'
         self.pipeline = TextPreprocessingPipeline(self.config)
         self.task = configuration_settings['task_name']
         self.results_path = None
 
     def preprocess_all_documents(self):
         """Preprocess all documents"""
-        print('Preprocessing all documents...')
+        print(f'Preprocessing all documents of corpus {self.name}...')
         for document in self.documents:
             document.detect_sections()
             document.process_document(self.pipeline)
@@ -43,21 +47,15 @@ class Corpus:
         """Create separate aggregated results CSV files for each metric."""
         print("Creating aggregated results files per metric...")
 
-        try:
-            derivatives_path = os.path.dirname(os.path.dirname(self.documents[0].results_path))
-        except (AttributeError, IndexError):
-            print("Error: No valid results path found in documents")
-            return
-
         # Create aggregations folder
-        aggregation_path = os.path.join(
+        aggregation_path = os.path.join(self.derivatives_dir, 'aggregations')
         os.makedirs(aggregation_path, exist_ok=True)
 
         # Initialize results dictionary with metrics as keys
         results_by_metric = {}
 
         # Walk through all directories in derivatives
-        for root, dirs, files in os.walk(
+        for root, dirs, files in os.walk(self.derivatives_dir):
             # Skip the aggregations directory itself
             if 'aggregations' in root:
                 continue
@@ -115,6 +113,7 @@ class Corpus:
         logits_options = self.config['options_logits']
 
         print('logits extraction in progress')
+
         model_name = logits_options['model_name']
         logitsExtractor = LogitsExtractor(logits_options,
                                           self.pipeline,
@@ -144,7 +143,7 @@ class Corpus:
 
             #'logits' list of dictionaries; keys token, logprob_actual, logprob_max, entropy, most_likely_token
             store_features_to_csv(logits,
-                                  self.
+                                  self.derivatives_dir,
                                   self.documents[i],
                                   metric='logits')
 
@@ -154,9 +153,12 @@ class Corpus:
         embedding_options = self.config['options_embeddings']
         print('Embeddings extraction in progress...')
         embeddingsExtractor = EmbeddingsExtractor(embedding_options, self.project_folder)
+        debug_print(len(self.documents))
         for i in range(len(self.documents)):
+
+            debug_print(f'cleaned sections: {self.documents[i].cleaned_sections}')
             for key, section in self.documents[i].cleaned_sections.items():
-
+                debug_print(f'Processing section {key}')
 
                 if self.config['discourse']:
                     section = TextDiarizer.parse_speaker(section, self.config['subject_speakertag'], embedding_options['keep_speakertags'])
@@ -175,7 +177,7 @@ class Corpus:
                     from pelican_nlp.extraction.semantic_similarity import calculate_semantic_similarity, \
                         get_semantic_similarity_windows
                     consecutive_similarities, mean_similarity = calculate_semantic_similarity(utterance)
-
+                    debug_print(f'Mean semantic similarity: {mean_similarity:.4f}')
 
                     for window_size in self.config['options_semantic-similarity']['window_sizes']:
                         window_stats = get_semantic_similarity_windows(utterance, window_size)
@@ -187,7 +189,7 @@ class Corpus:
                                 'std_of_window_stds': window_stats[3],
                                 'mean_of_window_medians': window_stats[4]
                             }
-
+                            debug_print(f'Window {window_size} stats - mean: {window_stats[0]:.4f}, std: {window_stats[1]:.4f}, median: {window_stats[4]:.4f}')
                         else:
                             window_data = {
                                 'mean': window_stats[0] if isinstance(window_stats, tuple) else window_stats,
@@ -195,16 +197,16 @@ class Corpus:
                             }
 
                         store_features_to_csv(window_data,
-                                              self.
+                                              self.derivatives_dir,
                                               self.documents[i],
                                               metric=f'semantic-similarity-window-{window_size}')
 
                 if self.config['options_embeddings']['distance-from-randomness']:
                     from pelican_nlp.extraction.distance_from_randomness import get_distance_from_randomness
                     divergence = get_distance_from_randomness(utterance, self.config["options_dis_from_randomness"])
-
+                    debug_print(f'Divergence from optimality metrics: {divergence}')
                     store_features_to_csv(divergence,
-                                          self.
+                                          self.derivatives_dir,
                                           self.documents[i],
                                           metric='distance-from-randomness')
 
@@ -230,7 +232,7 @@ class Corpus:
                 cleaned_embeddings = utterance if isinstance(utterance, list) else [(k, v) for k, v in utterance.items()]
 
                 store_features_to_csv(cleaned_embeddings,
-                                      self.
+                                      self.derivatives_dir,
                                       self.documents[i],
                                       metric='embeddings')
         return
@@ -241,11 +243,11 @@ class Corpus:
             results, recording_length = AudioFeatureExtraction.opensmile_extraction(self.documents[i].file, self.config['opensmile_configurations'])
             self.documents[i].recording_length = recording_length  # Store the recording length
             results['subject_ID'] = self.documents[i].subject_ID  # Set the subject ID
-            print('results obtained')
+            print('opensmile results obtained')
             store_features_to_csv(results,
-
-
-
+                                  self.derivatives_dir,
+                                  self.documents[i],
+                                  metric='opensmile-features')
 
     def extract_prosogram(self):
         from pelican_nlp.extraction.acoustic_feature_extraction import AudioFeatureExtraction
@@ -257,14 +259,8 @@ class Corpus:
         """Create CSV file with summarized document parameters based on config specifications."""
         print("Creating document information summary...")
 
-        try:
-            derivatives_path = os.path.dirname(os.path.dirname(self.documents[0].results_path))
-        except (AttributeError, IndexError):
-            print("Error: No valid results path found in documents")
-            return
-
         # Create document_information folder inside aggregations
-        doc_info_path = os.path.join(
+        doc_info_path = os.path.join(self.derivatives_dir, 'aggregations', 'document_information')
        os.makedirs(doc_info_path, exist_ok=True)
 
         # Define output file path
@@ -293,4 +289,4 @@ class Corpus:
         # Convert to DataFrame and save to CSV
         df = pd.DataFrame(document_info)
         df.to_csv(output_file, index=False)
-
+        debug_print(f"Document information saved to: {output_file}")
{pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/core/subject.py
RENAMED
@@ -4,12 +4,12 @@ The Subject class stores all subject specific information and a list of correspo
 """
 
 class Subject:
-    def __init__(self,
+    def __init__(self, name, description=None):
 
-        self.
+        self.name = name
+        self.subjectID = None
         self.gender = None
         self.age = None
-        self.name = None
         self.description = description  # Description of the subject
         self.documents = []  # List of TextDocument instances
         self.numberOfSessions = None
{pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/extraction/acoustic_feature_extraction.py
RENAMED
@@ -49,7 +49,7 @@ class AudioFeatureExtraction:
         profile (DataFrame): Prosogram analysis results
         """
         import parselmouth
-        from
+        from pelican_nlp.praat import PROSOGRAM_SCRIPT
         try:
             sound = parselmouth.Sound(file)
             # Common Prosogram parameters
{pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/extraction/extract_embeddings.py
RENAMED
@@ -1,6 +1,8 @@
 from pelican_nlp.extraction.language_model import Model
 from pelican_nlp.preprocessing.text_tokenizer import TextTokenizer
 
+from pelican_nlp.config import debug_print
+
 class EmbeddingsExtractor:
     def __init__(self, embeddings_configurations, project_path):
         self.embeddings_configurations = embeddings_configurations
@@ -22,7 +24,7 @@ class EmbeddingsExtractor:
 
         # Tokenize the input text
         inputs = self.Tokenizer.tokenize_text(text)
-
+        debug_print(f'inputs are: {inputs}')
 
         if self.embeddings_configurations['pytorch_based_model']:
             #e.g. RoBERTa Model or Llama Model
{pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/extraction/extract_logits.py
RENAMED
@@ -2,6 +2,8 @@ import torch
 import torch.nn.functional as F
 from tqdm import tqdm
 
+from pelican_nlp.config import debug_print
+
 class LogitsExtractor:
     def __init__(self, options, pipeline, project_path):
 
@@ -13,9 +15,9 @@ class LogitsExtractor:
 
     def extract_features(self, section, tokenizer, model):
 
-
+        debug_print(f'section to tokenize: {section}')
         tokens = tokenizer.tokenize_text(section)
-
+        debug_print(tokens)
 
         chunk_size = self.options['chunk_size']
         overlap_size = self.options['overlap_size']
{pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/extraction/language_model.py
RENAMED
@@ -1,6 +1,7 @@
 import torch
 import psutil
 import os
+import shutil
 
 from accelerate import init_empty_weights, infer_auto_device_map, dispatch_model
 from transformers import AutoModelForCausalLM
@@ -26,24 +27,53 @@ class Model:
             # Set the model path using proper OS path joining
             model_path = os.path.join(model_dir, 'cc.de.300.bin')
 
-            # Download only if model doesn't exist
-
+            # Download only if model doesn't exist or is invalid
+            need_download = True
+            if os.path.exists(model_path):
                 try:
+                    self.model_instance = fasttext.load_model(model_path)
+                    need_download = False
+                except ValueError:
+                    print(f"Existing model file is corrupted, re-downloading...")
+                    os.remove(model_path)
+
+            if need_download:
+                print("Downloading FastText model...")
+                try:
+                    # Try the built-in FastText downloader first
                     fasttext.util.download_model('de', if_exists='ignore')
-
-
+                    # Find the downloaded file in current directory
+                    downloaded_file = 'cc.de.300.bin'
+                    if os.path.exists(downloaded_file):
+                        # Move the file to the correct location
+                        shutil.move(downloaded_file, model_path)
+                    else:
+                        raise FileNotFoundError("FastText downloader didn't create the expected file")
+                except (OSError, ValueError, FileNotFoundError) as e:
+                    print(f"FastText downloader failed, using direct download: {str(e)}")
+                    # Direct download fallback
                     import urllib.request
                     url = 'https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.de.300.bin.gz'
-
+                    print(f"Downloading from {url}...")
+                    temp_gz_path = model_path + '.gz'
+                    urllib.request.urlretrieve(url, temp_gz_path)
+
                     # Decompress the file
+                    print("Decompressing model file...")
                     import gzip
-                    with gzip.open(
+                    with gzip.open(temp_gz_path, 'rb') as f_in:
                         with open(model_path, 'wb') as f_out:
                             f_out.write(f_in.read())
-                    os.remove(
+                    os.remove(temp_gz_path)
+                    print("Model decompressed successfully")
+
+                # Verify the downloaded model
+                try:
+                    self.model_instance = fasttext.load_model(model_path)
+                except ValueError as e:
+                    raise ValueError(f"Failed to load downloaded model: {str(e)}. Please try removing {model_path} and running again.")
 
-
-            print('FastText model loaded.')
+            print(f'FastText model loaded successfully from {model_path}')
         elif self.model_name == 'xlm-roberta-base':
             from transformers import AutoModel
             self.model_instance = AutoModel.from_pretrained(
@@ -80,7 +110,7 @@ class Model:
     def device_map_creation(self):
         #check if cuda is available
         if not torch.cuda.is_available():
-            print('Careful: Cuda not available, using CPU. This
+            print('Careful: Cuda not available, using CPU. This can be slow. Consider running pipeline on different device')
         else:
             print(f'{torch.cuda.get_device_name(0)} available.')
 
{pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/main.py
RENAMED
@@ -1,9 +1,9 @@
 #!/usr/bin/env python3
 """
-Pelican Project
-
+Pelican-nlp Project
+===================
 
-Pelican is a tool developed to enable consistent and reproducible language processing.
+Pelican-nlp is a tool developed to enable consistent and reproducible language processing.
 Main entry point for the Pelican project handling document processing and metric extraction.
 
 Author: Yves Pauli
@@ -23,6 +23,9 @@ import sys
 from pelican_nlp.core import Corpus
 from pelican_nlp.utils.setup_functions import subject_instantiator, load_config, remove_previous_derivative_dir
 from pelican_nlp.preprocessing import LPDS
+from pelican_nlp.utils.filename_parser import parse_lpds_filename
+
+from config import debug_print
 
 project_path = '/home/yvespauli/PycharmProjects/PyPI_testing_fluency/config_fluency.yml'
 
@@ -30,7 +33,8 @@ class Pelican:
 
     """Main class for the Pelican project handling document processing and metric extraction."""
 
-    def __init__(self, config_path: str = None, dev_mode: bool =
+    def __init__(self, config_path: str = None, dev_mode: bool = False) -> None:
+
         self.dev_mode = dev_mode
 
         # If no config path is provided, use the default config from package; used for dev-mode
@@ -83,23 +87,25 @@ class Pelican:
         subjects = subject_instantiator(self.config, self.project_path)
 
         # Process each corpus
-        for
-            self._process_corpus(
-
-    def _process_corpus(self, corpus_name: str, subjects: List) -> None:
+        for corpus_value in self.config['corpus_values']:
+            self._process_corpus(self.config['corpus_key'], corpus_value, subjects)
 
+    def _process_corpus(self, corpus_key: str, corpus_value: str, subjects: List) -> None:
         """Process a single corpus including preprocessing and metric extraction."""
-        print(f'Processing corpus: {corpus_name}')
 
-
-
+        corpus_entity = corpus_key + '-' + corpus_value
+        print(f'Processing corpus: {corpus_entity}')
+        debug_print(subjects, corpus_entity)
+        corpus_documents = self._identify_corpus_files(subjects, corpus_entity)
+        debug_print(len(corpus_documents))
+        corpus = Corpus(corpus_entity, corpus_documents[corpus_entity], self.config, self.project_path)
 
-        for document in corpus_documents[
-            document.corpus_name =
+        for document in corpus_documents[corpus_entity]:
+            document.corpus_name = corpus_entity
 
         if self.config['input_file']=='text':
             corpus.preprocess_all_documents()
-            print(f'Corpus {
+            print(f'Corpus {corpus_key} is preprocessed')
 
         self._extract_metrics(corpus)
 
@@ -140,18 +146,34 @@ class Pelican:
 
         self._clear_gpu_memory()
 
-    def _identify_corpus_files(self, subjects: List,
-        """Identify and group files
-
-
-
-
-
-
-
-
-
+    def _identify_corpus_files(self, subjects: List, entity: str) -> Dict:
+        """Identify and group files based on specified entity-value pair."""
+        debug_print(f'identifying corpus files')
+        corpus_dict = {entity: []}
+        debug_print(len(subjects))
+
+        # Check if entity is in key-value format
+        if '-' in entity:
+            key, value = entity.split('-', 1)
+
+            for subject in subjects:
+                debug_print(subject.documents)
+                for document in subject.documents:
+                    entities = parse_lpds_filename(document.name)
+                    debug_print(entities)
+                    if key in entities and str(entities[key]) == value:
+                        corpus_dict[entity].append(document)
+        else:
+            # Entity is just a value, check all keys
+            for subject in subjects:
+                debug_print(subject.documents)
+                for document in subject.documents:
+                    entities = parse_lpds_filename(document.name)
+                    debug_print(entities)
+                    # Convert all values to strings for comparison
+                    if any(str(val) == entity for val in entities.values()):
+                        corpus_dict[entity].append(document)
+
         return corpus_dict
 
     def _handle_output_directory(self) -> None:
@@ -207,4 +229,4 @@ class Pelican:
 
 
 if __name__ == '__main__':
-    Pelican(project_path).run()
+    Pelican(project_path, dev_mode=True).run()
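Corpus selection in 0.3.0 is driven by the new corpus_key and corpus_values configuration entries: run() builds one corpus entity per value as '<key>-<value>', and _identify_corpus_files keeps every document whose parsed filename entities match that pair. A small sketch of the matching logic; the key 'sub' and value '01' are hypothetical choices made to fit the bundled fluency example files, not a shipped configuration:

from pelican_nlp.utils.filename_parser import parse_lpds_filename

# hypothetical config: corpus_key: 'sub', corpus_values: ['01', '02']
corpus_entity = 'sub' + '-' + '01'   # as assembled in _process_corpus
entities = parse_lpds_filename('sub-01_fluency_sem_animals.txt')

key, value = corpus_entity.split('-', 1)
belongs = key in entities and str(entities[key]) == value  # True: document joins corpus 'sub-01'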
{pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/preprocessing/LPDS.py
RENAMED
@@ -1,6 +1,8 @@
 import re
 import os
 
+from pelican_nlp.config import debug_print
+
 class LPDS:
     def __init__(self, project_folder, multiple_sessions):
         self.project_folder = project_folder
@@ -18,7 +20,7 @@ class LPDS:
         suggested_files = ["dataset_description.json", "README", "CHANGES", "participants.tsv"]
         for file in suggested_files:
             if not os.path.isfile(os.path.join(self.project_folder, file)):
-
+                debug_print(f"Warning: Missing suggested file '{file}' in the project folder.")
 
         # Check for the 'subjects' folder
         if not os.path.isdir(self.subjects_folder):
@@ -38,15 +40,16 @@ class LPDS:
         if self.multiple_sessions:
             session_folders = [f for f in os.listdir(subject_path) if
                                os.path.isdir(os.path.join(subject_path, f))]
-            if
+            if session_folders:
+                if 'ses-01' not in session_folders:
+                    print(f"Warning: Ideally, the session folders should follow the naming convention 'ses-x'.")
+            else:
                 print(f"Warning: No session folders found in '{subject_folder}'.")
-            if 'ses-01' not in session_folders:
-                print(f"Warning: Ideally, the session folders should follow the naming convention 'ses-x'.")
 
             # Check for optional subject_metadata file
             metadata_file = os.path.join(subject_path, "subject_metadata")
             if not os.path.isfile(metadata_file):
-
+                debug_print(f"Note: Optional 'subject_metadata' file is missing in '{subject_folder}'.")
                 continue
 
             session_folders = subject_folder
@@ -68,7 +71,7 @@ class LPDS:
                 else:
                     pattern = fr"^{subject_folder}_{task_folder}.*"
                     if not re.match(pattern, file):
-
+                        debug_print(f"Warning: File '{file}' in '{task_folder}' does not follow the LPDS naming conventions")
 
     def derivative_dir_creator(self):
         # Create the 'derivatives' folder if it doesn't exist
{pelican_nlp-0.2.7 → pelican_nlp-0.3.0}/pelican_nlp/utils/csv_functions.py
RENAMED
@@ -1,58 +1,66 @@
 import os
 import csv
-
+from .filename_parser import parse_lpds_filename
+from pelican_nlp.config import debug_print
 
 def store_features_to_csv(input_data, derivatives_dir, doc_class, metric):
-    """Store various types of features to CSV files with consistent formatting.
+    """Store various types of features to CSV files with consistent formatting."""
+
+    # Parse entities from the document name
+    entities = parse_lpds_filename(doc_class.name)
 
-
-
-    derivatives_dir: Base directory for all derivatives
-    doc_class: Document class containing subject, session (optional), task, and task_addition (optional) info
-    metric: Type of metric being stored
-    """
-    # Get the appropriate metric folder
-    metric_folder = metric
+    # Get the base filename without extension and current suffix
+    base_filename = os.path.splitext(doc_class.name)[0]  # Remove extension
 
-    #
-
-
-
-        doc_class.corpus_name
-    ]
+    # If there's a suffix in the entities, remove it from the base filename
+    if 'suffix' in entities:
+        # Remove the current suffix
+        base_filename = base_filename.replace(f"_{entities['suffix']}", "")
 
-    #
-
-        filename_parts.insert(1, doc_class.session)
+    # Create the new filename with the metric as suffix
+    filename = f"{base_filename}_{metric}.csv"
 
-    #
-
+    # Extract core information from entities for directory structure
+    subject_ID = f"sub-{entities['sub']}" if 'sub' in entities else None
+    if not subject_ID:
+        raise ValueError(f"Missing required 'sub' entity in filename: {doc_class.name}")
 
-
-
-        filename += f"_{doc_class.task_addition}"
+    session = f"ses-{entities['ses']}" if 'ses' in entities else None
+    task = f"task-{entities['task']}" if 'task' in entities else None
 
-    #
-    filename += f"_{metric}.csv"
-
-    # Build the full path
+    # Build the full path components
     path_components = [
         derivatives_dir,
-
-
+        metric,  # Use metric as the folder name
+        subject_ID,
     ]
 
     # Add session to path if it exists
-    if
-        path_components.append(
+    if session:
+        path_components.append(session)
 
-
+    # Add task to path if it exists
+    if task:
+        path_components.append(task)
 
     # Create directory and get final filepath
-
+    # Ensure all components have compatible types by using str() conversion
+    base_path = os.path.join(str(derivatives_dir), str(metric), str(subject_ID))
+
+    # Build path incrementally with explicit type conversion
+    if session:
+        final_results_path = os.path.join(base_path, str(session))
+    else:
+        final_results_path = base_path
+
+    if task:
+        final_results_path = os.path.join(final_results_path, str(task))
+
+
+    debug_print(final_results_path)
     os.makedirs(final_results_path, exist_ok=True)
 
-    output_filepath = os.path.join(final_results_path, filename)
+    output_filepath = os.path.join(final_results_path, str(filename))
     file_exists = os.path.exists(output_filepath)
 
     # Write data based on metric type
@@ -146,6 +154,8 @@ def store_features_to_csv(input_data, derivatives_dir, doc_class, metric):
             row_data.append(value)
             writer.writerow(row_data)
 
+    return output_filepath
+
 
 def _build_filename_parts(path_parts, corpus, metric, config=None):
     """Helper function to build filename components."""
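With this rewrite, both the output filename and the directory layout are derived from the parsed filename entities rather than from document attributes. A worked example of a hypothetical call, traced by hand from the code above and using one of the bundled fluency test files (parse_lpds_filename is defined in the new filename_parser.py shown next):

# doc_class.name = 'sub-01_fluency_sem_animals.txt', metric = 'embeddings'
# entities -> {'extension': '.txt', 'sub': '01', 'suffix': 'animals'}
# base_filename 'sub-01_fluency_sem_animals' loses '_animals', so
# filename == 'sub-01_fluency_sem_embeddings.csv'
# no 'ses' or 'task' entity, so no session or task subfolder is added:
# output_filepath == <derivatives_dir>/embeddings/sub-01/sub-01_fluency_sem_embeddings.csv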
pelican_nlp-0.3.0/pelican_nlp/utils/filename_parser.py
ADDED
@@ -0,0 +1,23 @@
+from pathlib import Path
+
+def parse_lpds_filename(filename):
+    """Parse LPDS-style filename into entity-value pairs."""
+
+    entities = {}
+    name = Path(filename)
+
+    # Handle extension
+    entities['extension'] = name.suffix
+
+    # Split into components
+    parts = name.stem.split('_')
+
+    # Parse each entity-value pair
+    for part in parts:
+        if '-' in part:
+            key, value = part.split('-', 1)
+            entities[key] = value
+        else:
+            entities['suffix'] = part
+
+    return entities