pelican-nlp 0.2.7__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pelican_nlp/_version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.2.7"
+ __version__ = "0.3.1"
pelican_nlp/config.py ADDED
@@ -0,0 +1,14 @@
+ """
+ Global configuration settings for the Pelican project.
+
+ This file is not the configuration.yml file created for the users adaptations.
+ For consistency of pipeline, DO NOT CHANGE.
+ """
+
+ # Debug flag
+ DEBUG_MODE = False
+
+ def debug_print(*args, **kwargs):
+     """Print only if debug mode is enabled."""
+     if DEBUG_MODE:
+         print(*args, **kwargs)
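
Usage note: the modules touched in this release route their diagnostic output through debug_print, so verbosity is controlled from one place. A minimal sketch of how the flag could be flipped for a local debugging session; setting DEBUG_MODE at runtime is an assumption made here for illustration, since the file itself asks not to be edited:

    import pelican_nlp.config as pelican_config
    from pelican_nlp.config import debug_print

    pelican_config.DEBUG_MODE = True  # hypothetical: enable diagnostics for this session only
    debug_print('visible only while DEBUG_MODE is True')
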
pelican_nlp/core/corpus.py CHANGED
@@ -15,20 +15,24 @@ import os
  import pandas as pd
  import re
 
+ from pelican_nlp.config import debug_print
+
  class Corpus:
      def __init__(self, corpus_name, documents, configuration_settings, project_folder):
          self.name = corpus_name
+         self.key = corpus_name.split('-')[0]
+         self.value = corpus_name.split('-')[1]
          self.documents = documents
          self.config = configuration_settings
          self.project_folder = project_folder
-         self.derivative_dir = project_folder / 'derivatives'
+         self.derivatives_dir = project_folder / 'derivatives'
          self.pipeline = TextPreprocessingPipeline(self.config)
          self.task = configuration_settings['task_name']
          self.results_path = None
 
      def preprocess_all_documents(self):
          """Preprocess all documents"""
-         print('Preprocessing all documents...')
+         print(f'Preprocessing all documents of corpus {self.name}...')
          for document in self.documents:
              document.detect_sections()
              document.process_document(self.pipeline)
@@ -43,21 +47,15 @@ class Corpus:
          """Create separate aggregated results CSV files for each metric."""
          print("Creating aggregated results files per metric...")
 
-         try:
-             derivatives_path = os.path.dirname(os.path.dirname(self.documents[0].results_path))
-         except (AttributeError, IndexError):
-             print("Error: No valid results path found in documents")
-             return
-
          # Create aggregations folder
-         aggregation_path = os.path.join(derivatives_path, 'aggregations')
+         aggregation_path = os.path.join(self.derivatives_dir, 'aggregations')
          os.makedirs(aggregation_path, exist_ok=True)
 
          # Initialize results dictionary with metrics as keys
          results_by_metric = {}
 
          # Walk through all directories in derivatives
-         for root, dirs, files in os.walk(derivatives_path):
+         for root, dirs, files in os.walk(self.derivatives_dir):
              # Skip the aggregations directory itself
              if 'aggregations' in root:
                  continue
@@ -115,6 +113,7 @@ class Corpus:
          logits_options = self.config['options_logits']
 
          print('logits extraction in progress')
+
          model_name = logits_options['model_name']
          logitsExtractor = LogitsExtractor(logits_options,
                                            self.pipeline,
@@ -144,7 +143,7 @@ class Corpus:
 
              #'logits' list of dictionaries; keys token, logprob_actual, logprob_max, entropy, most_likely_token
              store_features_to_csv(logits,
-                                   self.derivative_dir,
+                                   self.derivatives_dir,
                                    self.documents[i],
                                    metric='logits')
 
@@ -154,9 +153,12 @@ class Corpus:
          embedding_options = self.config['options_embeddings']
          print('Embeddings extraction in progress...')
          embeddingsExtractor = EmbeddingsExtractor(embedding_options, self.project_folder)
+         debug_print(len(self.documents))
          for i in range(len(self.documents)):
+
+             debug_print(f'cleaned sections: {self.documents[i].cleaned_sections}')
              for key, section in self.documents[i].cleaned_sections.items():
-                 print(f'Processing section {key}')
+                 debug_print(f'Processing section {key}')
 
                  if self.config['discourse']:
                      section = TextDiarizer.parse_speaker(section, self.config['subject_speakertag'], embedding_options['keep_speakertags'])
@@ -175,7 +177,7 @@ class Corpus:
                      from pelican_nlp.extraction.semantic_similarity import calculate_semantic_similarity, \
                          get_semantic_similarity_windows
                      consecutive_similarities, mean_similarity = calculate_semantic_similarity(utterance)
-                     print(f'Mean semantic similarity: {mean_similarity:.4f}')
+                     debug_print(f'Mean semantic similarity: {mean_similarity:.4f}')
 
                      for window_size in self.config['options_semantic-similarity']['window_sizes']:
                          window_stats = get_semantic_similarity_windows(utterance, window_size)
@@ -187,7 +189,7 @@ class Corpus:
                                  'std_of_window_stds': window_stats[3],
                                  'mean_of_window_medians': window_stats[4]
                              }
-                             print(f'Window {window_size} stats - mean: {window_stats[0]:.4f}, std: {window_stats[1]:.4f}, median: {window_stats[4]:.4f}')
+                             debug_print(f'Window {window_size} stats - mean: {window_stats[0]:.4f}, std: {window_stats[1]:.4f}, median: {window_stats[4]:.4f}')
                          else:
                              window_data = {
                                  'mean': window_stats[0] if isinstance(window_stats, tuple) else window_stats,
@@ -195,16 +197,16 @@ class Corpus:
                              }
 
                          store_features_to_csv(window_data,
-                                               self.derivative_dir,
+                                               self.derivatives_dir,
                                                self.documents[i],
                                                metric=f'semantic-similarity-window-{window_size}')
 
                      if self.config['options_embeddings']['distance-from-randomness']:
                          from pelican_nlp.extraction.distance_from_randomness import get_distance_from_randomness
                          divergence = get_distance_from_randomness(utterance, self.config["options_dis_from_randomness"])
-                         print(f'Divergence from optimality metrics: {divergence}')
+                         debug_print(f'Divergence from optimality metrics: {divergence}')
                          store_features_to_csv(divergence,
-                                               self.derivative_dir,
+                                               self.derivatives_dir,
                                                self.documents[i],
                                                metric='distance-from-randomness')
 
@@ -230,7 +232,7 @@ class Corpus:
                  cleaned_embeddings = utterance if isinstance(utterance, list) else [(k, v) for k, v in utterance.items()]
 
                  store_features_to_csv(cleaned_embeddings,
-                                       self.derivative_dir,
+                                       self.derivatives_dir,
                                        self.documents[i],
                                        metric='embeddings')
          return
@@ -241,11 +243,11 @@ class Corpus:
              results, recording_length = AudioFeatureExtraction.opensmile_extraction(self.documents[i].file, self.config['opensmile_configurations'])
              self.documents[i].recording_length = recording_length # Store the recording length
              results['subject_ID'] = self.documents[i].subject_ID # Set the subject ID
-             print('results obtained')
+             print('opensmile results obtained')
              store_features_to_csv(results,
-                                 self.derivative_dir,
-                                 self.documents[i],
-                                 metric='opensmile-features')
+                                   self.derivatives_dir,
+                                   self.documents[i],
+                                   metric='opensmile-features')
 
      def extract_prosogram(self):
          from pelican_nlp.extraction.acoustic_feature_extraction import AudioFeatureExtraction
@@ -257,14 +259,8 @@ class Corpus:
          """Create CSV file with summarized document parameters based on config specifications."""
          print("Creating document information summary...")
 
-         try:
-             derivatives_path = os.path.dirname(os.path.dirname(self.documents[0].results_path))
-         except (AttributeError, IndexError):
-             print("Error: No valid results path found in documents")
-             return
-
          # Create document_information folder inside aggregations
-         doc_info_path = os.path.join(derivatives_path, 'aggregations', 'document_information')
+         doc_info_path = os.path.join(self.derivatives_dir, 'aggregations', 'document_information')
          os.makedirs(doc_info_path, exist_ok=True)
 
          # Define output file path
@@ -293,4 +289,4 @@ class Corpus:
          # Convert to DataFrame and save to CSV
          df = pd.DataFrame(document_info)
          df.to_csv(output_file, index=False)
-         print(f"Document information saved to: {output_file}")
+         debug_print(f"Document information saved to: {output_file}")
pelican_nlp/core/subject.py CHANGED
@@ -4,12 +4,12 @@ The Subject class stores all subject specific information and a list of correspo
  """
 
  class Subject:
-     def __init__(self, subjectID, description=None):
+     def __init__(self, name, description=None):
 
-         self.subjectID = subjectID
+         self.name = name
+         self.subjectID = None
          self.gender = None
          self.age = None
-         self.name = None
          self.description = description # Description of the subject
         self.documents = [] # List of TextDocument instances
          self.numberOfSessions = None
pelican_nlp/extraction/acoustic_feature_extraction.py CHANGED
@@ -49,7 +49,7 @@ class AudioFeatureExtraction:
              profile (DataFrame): Prosogram analysis results
          """
          import parselmouth
-         from pelican.praat import PROSOGRAM_SCRIPT
+         from pelican_nlp.praat import PROSOGRAM_SCRIPT
          try:
              sound = parselmouth.Sound(file)
              # Common Prosogram parameters
pelican_nlp/extraction/extract_embeddings.py CHANGED
@@ -1,6 +1,8 @@
  from pelican_nlp.extraction.language_model import Model
  from pelican_nlp.preprocessing.text_tokenizer import TextTokenizer
 
+ from pelican_nlp.config import debug_print
+
  class EmbeddingsExtractor:
      def __init__(self, embeddings_configurations, project_path):
          self.embeddings_configurations = embeddings_configurations
@@ -22,7 +24,7 @@ class EmbeddingsExtractor:
 
          # Tokenize the input text
          inputs = self.Tokenizer.tokenize_text(text)
-         print(f'inputs are: {inputs}')
+         debug_print(f'inputs are: {inputs}')
 
          if self.embeddings_configurations['pytorch_based_model']:
              #e.g. RoBERTa Model or Llama Model
pelican_nlp/extraction/extract_logits.py CHANGED
@@ -2,6 +2,8 @@ import torch
  import torch.nn.functional as F
  from tqdm import tqdm
 
+ from pelican_nlp.config import debug_print
+
  class LogitsExtractor:
      def __init__(self, options, pipeline, project_path):
 
@@ -13,9 +15,9 @@ class LogitsExtractor:
 
      def extract_features(self, section, tokenizer, model):
 
-         print(f'section to tokenize: {section}')
+         debug_print(f'section to tokenize: {section}')
          tokens = tokenizer.tokenize_text(section)
-         print(tokens)
+         debug_print(tokens)
 
          chunk_size = self.options['chunk_size']
          overlap_size = self.options['overlap_size']
pelican_nlp/extraction/language_model.py CHANGED
@@ -1,6 +1,7 @@
  import torch
  import psutil
  import os
+ import shutil
 
  from accelerate import init_empty_weights, infer_auto_device_map, dispatch_model
  from transformers import AutoModelForCausalLM
@@ -26,24 +27,53 @@ class Model:
              # Set the model path using proper OS path joining
              model_path = os.path.join(model_dir, 'cc.de.300.bin')
 
-             # Download only if model doesn't exist
-             if not os.path.exists(model_path):
+             # Download only if model doesn't exist or is invalid
+             need_download = True
+             if os.path.exists(model_path):
                  try:
+                     self.model_instance = fasttext.load_model(model_path)
+                     need_download = False
+                 except ValueError:
+                     print(f"Existing model file is corrupted, re-downloading...")
+                     os.remove(model_path)
+
+             if need_download:
+                 print("Downloading FastText model...")
+                 try:
+                     # Try the built-in FastText downloader first
                      fasttext.util.download_model('de', if_exists='ignore')
-                 except OSError:
-                     # Direct download fallback for Windows
+                     # Find the downloaded file in current directory
+                     downloaded_file = 'cc.de.300.bin'
+                     if os.path.exists(downloaded_file):
+                         # Move the file to the correct location
+                         shutil.move(downloaded_file, model_path)
+                     else:
+                         raise FileNotFoundError("FastText downloader didn't create the expected file")
+                 except (OSError, ValueError, FileNotFoundError) as e:
+                     print(f"FastText downloader failed, using direct download: {str(e)}")
+                     # Direct download fallback
                      import urllib.request
                      url = 'https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.de.300.bin.gz'
-                     urllib.request.urlretrieve(url, model_path + '.gz')
+                     print(f"Downloading from {url}...")
+                     temp_gz_path = model_path + '.gz'
+                     urllib.request.urlretrieve(url, temp_gz_path)
+
                      # Decompress the file
+                     print("Decompressing model file...")
                      import gzip
-                     with gzip.open(model_path + '.gz', 'rb') as f_in:
+                     with gzip.open(temp_gz_path, 'rb') as f_in:
                          with open(model_path, 'wb') as f_out:
                              f_out.write(f_in.read())
-                     os.remove(model_path + '.gz')
+                     os.remove(temp_gz_path)
+                     print("Model decompressed successfully")
+
+             # Verify the downloaded model
+             try:
+                 self.model_instance = fasttext.load_model(model_path)
+             except ValueError as e:
+                 raise ValueError(f"Failed to load downloaded model: {str(e)}. Please try removing {model_path} and running again.")
 
-             self.model_instance = fasttext.load_model(model_path)
-             print('FastText model loaded.')
+             print(f'FastText model loaded successfully from {model_path}')
          elif self.model_name == 'xlm-roberta-base':
              from transformers import AutoModel
              self.model_instance = AutoModel.from_pretrained(
@@ -80,7 +110,7 @@ class Model:
      def device_map_creation(self):
          #check if cuda is available
          if not torch.cuda.is_available():
-             print('Careful: Cuda not available, using CPU. This will be very slow.')
+             print('Careful: Cuda not available, using CPU. This can be slow. Consider running pipeline on different device')
          else:
              print(f'{torch.cuda.get_device_name(0)} available.')
 
pelican_nlp/main.py CHANGED
@@ -1,9 +1,9 @@
  #!/usr/bin/env python3
  """
- Pelican Project
- ===============
+ Pelican-nlp Project
+ ===================
 
- Pelican is a tool developed to enable consistent and reproducible language processing.
+ Pelican-nlp is a tool developed to enable consistent and reproducible language processing.
  Main entry point for the Pelican project handling document processing and metric extraction.
 
  Author: Yves Pauli
@@ -23,6 +23,9 @@ import sys
  from pelican_nlp.core import Corpus
  from pelican_nlp.utils.setup_functions import subject_instantiator, load_config, remove_previous_derivative_dir
  from pelican_nlp.preprocessing import LPDS
+ from pelican_nlp.utils.filename_parser import parse_lpds_filename
+
+ from config import debug_print
 
  project_path = '/home/yvespauli/PycharmProjects/PyPI_testing_fluency/config_fluency.yml'
 
@@ -30,7 +33,8 @@ class Pelican:
 
      """Main class for the Pelican project handling document processing and metric extraction."""
 
-     def __init__(self, config_path: str = None, dev_mode: bool = True) -> None:
+     def __init__(self, config_path: str = None, dev_mode: bool = False) -> None:
+
          self.dev_mode = dev_mode
 
          # If no config path is provided, use the default config from package; used for dev-mode
@@ -83,23 +87,25 @@ class Pelican:
          subjects = subject_instantiator(self.config, self.project_path)
 
          # Process each corpus
-         for corpus_name in self.config['corpus_names']:
-             self._process_corpus(corpus_name, subjects)
-
-     def _process_corpus(self, corpus_name: str, subjects: List) -> None:
+         for corpus_value in self.config['corpus_values']:
+             self._process_corpus(self.config['corpus_key'], corpus_value, subjects)
 
+     def _process_corpus(self, corpus_key: str, corpus_value: str, subjects: List) -> None:
          """Process a single corpus including preprocessing and metric extraction."""
-         print(f'Processing corpus: {corpus_name}')
 
-         corpus_documents = self._identify_corpus_files(subjects, corpus_name)
-         corpus = Corpus(corpus_name, corpus_documents[corpus_name], self.config, self.project_path)
+         corpus_entity = corpus_key + '-' + corpus_value
+         print(f'Processing corpus: {corpus_entity}')
+         debug_print(subjects, corpus_entity)
+         corpus_documents = self._identify_corpus_files(subjects, corpus_entity)
+         debug_print(len(corpus_documents))
+         corpus = Corpus(corpus_entity, corpus_documents[corpus_entity], self.config, self.project_path)
 
-         for document in corpus_documents[corpus_name]:
-             document.corpus_name = corpus_name
+         for document in corpus_documents[corpus_entity]:
+             document.corpus_name = corpus_entity
 
          if self.config['input_file']=='text':
              corpus.preprocess_all_documents()
-             print(f'Corpus {corpus_name} is preprocessed')
+             print(f'Corpus {corpus_key} is preprocessed')
 
          self._extract_metrics(corpus)
 
@@ -140,18 +146,34 @@ class Pelican:
 
          self._clear_gpu_memory()
 
-     def _identify_corpus_files(self, subjects: List, corpus: str) -> Dict:
-         """Identify and group files belonging to a specific corpus."""
-         corpus_dict = {corpus: []}
-         for subject in subjects:
-             for document in subject.documents:
-                 name = Path(document.name)
-                 document.extension = name.suffix
-                 # Split by both '_' and '.' to get all parts
-                 parts = name.stem.replace('.', '_').split('_')
-                 # Check if corpus name appears in any part
-                 if corpus in parts:
-                     corpus_dict[corpus].append(document)
+     def _identify_corpus_files(self, subjects: List, entity: str) -> Dict:
+         """Identify and group files based on specified entity-value pair."""
+         debug_print(f'identifying corpus files')
+         corpus_dict = {entity: []}
+         debug_print(len(subjects))
+
+         # Check if entity is in key-value format
+         if '-' in entity:
+             key, value = entity.split('-', 1)
+
+             for subject in subjects:
+                 debug_print(subject.documents)
+                 for document in subject.documents:
+                     entities = parse_lpds_filename(document.name)
+                     debug_print(entities)
+                     if key in entities and str(entities[key]) == value:
+                         corpus_dict[entity].append(document)
+         else:
+             # Entity is just a value, check all keys
+             for subject in subjects:
+                 debug_print(subject.documents)
+                 for document in subject.documents:
+                     entities = parse_lpds_filename(document.name)
+                     debug_print(entities)
+                     # Convert all values to strings for comparison
+                     if any(str(val) == entity for val in entities.values()):
+                         corpus_dict[entity].append(document)
+
          return corpus_dict
 
      def _handle_output_directory(self) -> None:
@@ -207,4 +229,4 @@ class Pelican:
 
 
  if __name__ == '__main__':
-     Pelican(project_path).run()
+     Pelican(project_path, dev_mode=True).run()
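
Usage note: corpus selection in main.py now works on an entity key plus a list of values ('corpus_key' and 'corpus_values') instead of the old 'corpus_names' list, and the joined entity string is what Corpus.__init__ splits back into key and value. A hedged sketch of that relationship; the key 'acq' and the values below are hypothetical examples, the real ones come from the project's configuration file:

    # Hypothetical configuration entries replacing the old 'corpus_names' list:
    config = {'corpus_key': 'acq', 'corpus_values': ['semantic', 'phonemic']}

    # _process_corpus() builds the entity string that Corpus later splits on '-':
    corpus_entity = config['corpus_key'] + '-' + config['corpus_values'][0]  # 'acq-semantic'
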
pelican_nlp/preprocessing/LPDS.py CHANGED
@@ -1,6 +1,8 @@
  import re
  import os
 
+ from pelican_nlp.config import debug_print
+
  class LPDS:
      def __init__(self, project_folder, multiple_sessions):
          self.project_folder = project_folder
@@ -18,7 +20,7 @@ class LPDS:
          suggested_files = ["dataset_description.json", "README", "CHANGES", "participants.tsv"]
          for file in suggested_files:
              if not os.path.isfile(os.path.join(self.project_folder, file)):
-                 print(f"Warning: Missing suggested file '{file}' in the project folder.")
+                 debug_print(f"Warning: Missing suggested file '{file}' in the project folder.")
 
          # Check for the 'subjects' folder
          if not os.path.isdir(self.subjects_folder):
@@ -38,15 +40,16 @@ class LPDS:
              if self.multiple_sessions:
                  session_folders = [f for f in os.listdir(subject_path) if
                                     os.path.isdir(os.path.join(subject_path, f))]
-                 if not session_folders:
+                 if session_folders:
+                     if 'ses-01' not in session_folders:
+                         print(f"Warning: Ideally, the session folders should follow the naming convention 'ses-x'.")
+                 else:
                      print(f"Warning: No session folders found in '{subject_folder}'.")
-                 if 'ses-01' not in session_folders:
-                     print(f"Warning: Ideally, the session folders should follow the naming convention 'ses-x'.")
 
              # Check for optional subject_metadata file
              metadata_file = os.path.join(subject_path, "subject_metadata")
              if not os.path.isfile(metadata_file):
-                 #print(f"Note: Optional 'subject_metadata' file is missing in '{subject_folder}'.")
+                 debug_print(f"Note: Optional 'subject_metadata' file is missing in '{subject_folder}'.")
                  continue
 
              session_folders = subject_folder
@@ -68,7 +71,7 @@ class LPDS:
                      else:
                          pattern = fr"^{subject_folder}_{task_folder}.*"
                      if not re.match(pattern, file):
-                         print(f"Warning: File '{file}' in '{task_folder}' does not follow the LPDS naming conventions")
+                         debug_print(f"Warning: File '{file}' in '{task_folder}' does not follow the LPDS naming conventions")
 
      def derivative_dir_creator(self):
          # Create the 'derivatives' folder if it doesn't exist
pelican_nlp/utils/csv_functions.py CHANGED
@@ -1,58 +1,66 @@
  import os
  import csv
- import numpy as np
+ from .filename_parser import parse_lpds_filename
+ from pelican_nlp.config import debug_print
 
  def store_features_to_csv(input_data, derivatives_dir, doc_class, metric):
-     """Store various types of features to CSV files with consistent formatting.
+     """Store various types of features to CSV files with consistent formatting."""
+
+     # Parse entities from the document name
+     entities = parse_lpds_filename(doc_class.name)
 
-     Args:
-         input_data: The data to be stored in CSV format
-         derivatives_dir: Base directory for all derivatives
-         doc_class: Document class containing subject, session (optional), task, and task_addition (optional) info
-         metric: Type of metric being stored
-     """
-     # Get the appropriate metric folder
-     metric_folder = metric
+     # Get the base filename without extension and current suffix
+     base_filename = os.path.splitext(doc_class.name)[0] # Remove extension
 
-     # Build base filename parts from doc_class
-     filename_parts = [
-         doc_class.subject_ID,
-         doc_class.task,
-         doc_class.corpus_name
-     ]
+     # If there's a suffix in the entities, remove it from the base filename
+     if 'suffix' in entities:
+         # Remove the current suffix
+         base_filename = base_filename.replace(f"_{entities['suffix']}", "")
 
-     # Add session to filename if it exists
-     if hasattr(doc_class, 'session') and doc_class.session:
-         filename_parts.insert(1, doc_class.session)
+     # Create the new filename with the metric as suffix
+     filename = f"{base_filename}_{metric}.csv"
 
-     # Join the base parts with underscores
-     filename = "_".join(filename_parts)
+     # Extract core information from entities for directory structure
+     subject_ID = f"sub-{entities['sub']}" if 'sub' in entities else None
+     if not subject_ID:
+         raise ValueError(f"Missing required 'sub' entity in filename: {doc_class.name}")
 
-     # Add task_addition with underscore if it exists
-     if hasattr(doc_class, 'task_addition') and doc_class.task_addition:
-         filename += f"_{doc_class.task_addition}"
+     session = f"ses-{entities['ses']}" if 'ses' in entities else None
+     task = f"task-{entities['task']}" if 'task' in entities else None
 
-     # Add the metric with an underscore
-     filename += f"_{metric}.csv"
-
-     # Build the full path
+     # Build the full path components
      path_components = [
          derivatives_dir,
-         metric_folder,
-         doc_class.subject_ID,
+         metric, # Use metric as the folder name
+         subject_ID,
      ]
 
      # Add session to path if it exists
-     if hasattr(doc_class, 'session') and doc_class.session:
-         path_components.append(doc_class.session)
+     if session:
+         path_components.append(session)
 
-     path_components.append(doc_class.task)
+     # Add task to path if it exists
+     if task:
+         path_components.append(task)
 
      # Create directory and get final filepath
-     final_results_path = os.path.join(*path_components)
+     # Ensure all components have compatible types by using str() conversion
+     base_path = os.path.join(str(derivatives_dir), str(metric), str(subject_ID))
+
+     # Build path incrementally with explicit type conversion
+     if session:
+         final_results_path = os.path.join(base_path, str(session))
+     else:
+         final_results_path = base_path
+
+     if task:
+         final_results_path = os.path.join(final_results_path, str(task))
+
+
+     debug_print(final_results_path)
      os.makedirs(final_results_path, exist_ok=True)
 
-     output_filepath = os.path.join(final_results_path, filename)
+     output_filepath = os.path.join(final_results_path, str(filename))
      file_exists = os.path.exists(output_filepath)
 
      # Write data based on metric type
@@ -146,6 +154,8 @@ def store_features_to_csv(input_data, derivatives_dir, doc_class, metric):
              row_data.append(value)
          writer.writerow(row_data)
 
+     return output_filepath
+
 
 
  def _build_filename_parts(path_parts, corpus, metric, config=None):
pelican_nlp/utils/filename_parser.py ADDED
@@ -0,0 +1,23 @@
+ from pathlib import Path
+
+ def parse_lpds_filename(filename):
+     """Parse LPDS-style filename into entity-value pairs."""
+
+     entities = {}
+     name = Path(filename)
+
+     # Handle extension
+     entities['extension'] = name.suffix
+
+     # Split into components
+     parts = name.stem.split('_')
+
+     # Parse each entity-value pair
+     for part in parts:
+         if '-' in part:
+             key, value = part.split('-', 1)
+             entities[key] = value
+         else:
+             entities['suffix'] = part
+
+     return entities
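
For illustration, the parser maps each underscore-separated key-value component of the stem to a dictionary entry and stores the last bare component under 'suffix'. A hypothetical LPDS-style filename (the entity names here are examples, not prescribed by the package):

    >>> parse_lpds_filename('sub-01_ses-01_task-fluency_acq-semantic_transcript.txt')
    {'extension': '.txt', 'sub': '01', 'ses': '01', 'task': 'fluency', 'acq': 'semantic', 'suffix': 'transcript'}
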
pelican_nlp/utils/setup_functions.py CHANGED
@@ -1,81 +1,107 @@
  import os
- from pelican_nlp.core.subject import Subject
  import shutil
  import yaml
  import sys
+ from pelican_nlp.core.subject import Subject
+ from .filename_parser import parse_lpds_filename
+ from ..config import debug_print
+
 
  def subject_instantiator(config, project_folder):
      path_to_subjects = os.path.join(project_folder, 'subjects')
      print('Instantiating Subjects...')
-     subjects = [Subject(subject) for subject in os.listdir(path_to_subjects)]
+
+     # Get all subject directories that match sub-* pattern
+     subjects = [
+         Subject(subject_dir)
+         for subject_dir in os.listdir(path_to_subjects)
+     ]
 
      # Identifying all subject files
      for subject in subjects:
-         if config['multiple_sessions']:
-             paths = _get_subject_sessions(subject, project_folder)
-         else:
-             paths = [os.path.join(path_to_subjects, subject.subjectID)]
+         # Get subject ID from directory name (e.g., 'sub-01' -> '01')
+         subject.subjectID = subject.name.split('-')[1]
+
+         # Find all files for this subject recursively
+         subject_path = os.path.join(path_to_subjects, subject.name)
+         all_files = []
+         for root, _, files in os.walk(subject_path):
+             all_files.extend([os.path.join(root, f) for f in files])
+
+         # Filter files by task name from config
+         task_files = []
+         for file_path in all_files:
+             filename = os.path.basename(file_path)
+             entities = parse_lpds_filename(filename)
+             if entities.get('task') == config['task_name']:
+                 task_files.append((file_path, filename))
+
+         # Instantiate documents for matching files
+         for file_path, filename in task_files:
+             entities = parse_lpds_filename(filename)
+             document = _instantiate_document(file_path, filename, entities, config)
+             subject.documents.append(document)
 
-         for path in paths:
-             file_path = os.path.join(path, config['task_name'])
-             subject.documents.extend(_instantiate_documents(file_path, subject.subjectID, config))
-         print(f'all identified subject documents for subject {subject.subjectID}: {subject.documents}')
+         debug_print(f'all identified subject documents for subject {subject.subjectID}: {subject.documents}')
+
+         # Set up results paths for each document
          for document in subject.documents:
-             parts = document.file_path.split(os.sep)
+             entities = parse_lpds_filename(document.name)
+
+             # Build derivatives path based on entities
+             derivatives_parts = [project_folder, 'derivatives']
 
-             # Adjust path components based on whether session exists
-             if config.get('multiple_sessions', False):
-                 subject_ID, session, task = parts[-4], parts[-3], parts[-2]
-                 document.results_path = os.path.join(project_folder, 'derivatives', subject_ID, session, task)
-             else:
-                 subject_ID, task = parts[-3], parts[-2]
-                 document.results_path = os.path.join(project_folder, 'derivatives', subject_ID, task)
+             # Always include subject
+             derivatives_parts.append(f"sub-{entities['sub']}")
+
+             # Add session if present
+             if 'ses' in entities:
+                 derivatives_parts.append(f"ses-{entities['ses']}")
+
+             # Add task
+             derivatives_parts.append(f"task-{entities['task']}")
+
+             document.results_path = os.path.join(*derivatives_parts)
 
      return subjects
 
- def _get_subject_sessions(subject, project_path):
-     session_dir = os.path.join(os.path.join(project_path, 'subjects'), subject.subjectID)
-     session_paths = [
-         os.path.join(session_dir, session)
-         for session in os.listdir(session_dir)
-         if os.path.isdir(os.path.join(session_dir, session))
-     ]
-     subject.numberOfSessions = len(session_paths)
-     return session_paths
+ def _instantiate_document(filepath, filename, entities, config):
+     """Create appropriate document instance based on config and entities"""
 
- def _instantiate_documents(filepath, subject, config):
+     common_kwargs = {
+         'file_path': os.path.dirname(filepath),
+         'name': filename,
+         'subject_ID': entities.get('sub'),
+         'task': entities.get('task'),
+         # Check for specific entities that might indicate document type
+         'fluency': 'cat' in entities and entities['cat'] == 'semantic',
+         'num_speakers': config['number_of_speakers'],
+     }
 
-     if config['input_file']=='text':
+     if config['input_file'] == 'text':
          from pelican_nlp.core.document import Document
-         return [
-             Document(
-                 filepath,
-                 file_name,
-                 subject_ID = subject,
-                 task=config['task_name'],
-                 fluency=config['fluency_task'],
-                 has_sections=config['has_multiple_sections'],
-                 section_identifier=config['section_identification'],
-                 number_of_sections=config['number_of_sections'],
-                 num_speakers=config['number_of_speakers'],
-                 has_section_titles=config['has_section_titles']
-             )
-             for file_name in os.listdir(filepath)
-         ]
-
-     elif config['input_file']=='audio':
+         return Document(
+             **common_kwargs,
+             # Use entities for section information if available, fall back to config
+             has_sections=bool(entities.get('sections', config['has_multiple_sections'])),
+             section_identifier=config['section_identification'],
+             number_of_sections=config['number_of_sections'],
+             has_section_titles=config['has_section_titles'],
+             # Add any additional entities as attributes
+             session=entities.get('ses'),
+             acquisition=entities.get('acq'),
+             category=entities.get('cat'),
+             run=entities.get('run'),
+         )
+     elif config['input_file'] == 'audio':
          from pelican_nlp.core.audio_document import AudioFile
-         return [
-             AudioFile(
-                 filepath,
-                 file_name,
-                 subject_ID=subject,
-                 task=config['task_name'],
-                 fluency=config['fluency_task'],
-                 num_speakers=config['number_of_speakers'],
-             )
-             for file_name in os.listdir(filepath)
-         ]
+         return AudioFile(
+             **common_kwargs,
+             # Add audio-specific entities
+             recording_type=entities.get('rec'),
+             channel=entities.get('ch'),
+             run=entities.get('run'),
+         )
 
  def remove_previous_derivative_dir(output_directory):
      if os.path.isdir(output_directory):
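
Usage note: subject_instantiator now walks the whole subjects/ tree and keeps only files whose parsed 'task' entity matches config['task_name'], and each document's results_path mirrors the sub/ses/task entities under derivatives/. A hypothetical project layout consistent with that logic (folder and file names are illustrative, not prescribed by the package):

    subjects/
        sub-01/
            ses-01/
                sub-01_ses-01_task-fluency_transcript.txt    # kept when task_name == 'fluency'
    derivatives/
        sub-01/
            ses-01/
                task-fluency/                                # document.results_path
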
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: pelican_nlp
- Version: 0.2.7
+ Version: 0.3.1
  Summary: Preprocessing and Extraction of Linguistic Information for Computational Analysis
  Author-email: Yves Pauli <yves.pauli@gmail.com>
  License-Expression: CC-BY-NC-4.0
@@ -45,20 +45,31 @@ Requires-Dist: huggingface_hub==0.29.2
  Dynamic: license-file
 
  ====================================
- PELICAN_nlp
+ pelican_nlp
  ====================================
 
- pelican_nlp stands for "Preprocessing and Extraction of Linguistic Information for Computational Analysis - Natural Language Processing". This package enables the creation of standardized and reproducible language processing pipelines, extracting linguistic features from various tasks like discourse, fluency, and image descriptions.
+ .. |logo| image:: docs/images/pelican_logo.png
+    :alt: PELICAN_nlp Logo
+    :width: 200px
 
- .. image:: https://img.shields.io/pypi/v/package-name.svg
+ +------------+-------------------------------------------------------------------+
+ | |logo|     | pelican_nlp stands for "Preprocessing and Extraction of Linguistic|
+ |            | Information for Computational Analysis - Natural Language         |
+ |            | Processing". This package enables the creation of standardized and|
+ |            | reproducible language processing pipelines, extracting linguistic |
+ |            | features from various tasks like discourse, fluency, and image    |
+ |            | descriptions.                                                      |
+ +------------+-------------------------------------------------------------------+
+
+ .. image:: https://img.shields.io/pypi/v/pelican_nlp.svg
     :target: https://pypi.org/project/pelican_nlp/
     :alt: PyPI version
 
- .. image:: https://img.shields.io/github/license/username/package-name.svg
+ .. image:: https://img.shields.io/badge/License-CC%20BY--NC%204.0-lightgrey.svg
     :target: https://github.com/ypauli/pelican_nlp/blob/main/LICENSE
-    :alt: License
+    :alt: License CC BY-NC 4.0
 
- .. image:: https://img.shields.io/pypi/pyversions/package-name.svg
+ .. image:: https://img.shields.io/pypi/pyversions/pelican_nlp.svg
     :target: https://pypi.org/project/pelican_nlp/
     :alt: Supported Python Versions
 
@@ -1,7 +1,8 @@
  pelican_nlp/__init__.py,sha256=TD5xjKeXXAH6nUWG-6igbClgovi5r8RIEqI_ix1QeYo,204
- pelican_nlp/_version.py,sha256=LIho7asb0pp1iNbJvXEhRMluyGN4gB4RHIIbAKpROsc,21
+ pelican_nlp/_version.py,sha256=v-ExhFzOD_GemLcOptv2ZODgnklv9iqEEospk_bU1_w,21
  pelican_nlp/cli.py,sha256=mPz-ASIMUme69G6YGVpTnHr5VfM3XA4h29WFd7DXpa4,588
- pelican_nlp/main.py,sha256=HX2Rbl4j7RXaMXlGCtggBBqcg3gRh-ey1PdLsQcDX30,7660
+ pelican_nlp/config.py,sha256=cqUYLeqQB_Y-drR4dpxz8l-aLKl7TcfiB8SeN_rNq4I,352
+ pelican_nlp/main.py,sha256=43jz94Zit931nZXs1hSAAPimRbX8Vmj-bEx7rDoYtZ4,8674
  pelican_nlp/Nils_backup/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  pelican_nlp/Nils_backup/extract_acoustic_features.py,sha256=eSP8lXxbZ15YE1HqxGtma9uWOcSN-fI-ig-NwQ9eOA8,10771
  pelican_nlp/Nils_backup/speaker_diarization_Nils.py,sha256=3RIhjKihu4Z1rruMt9KESFE2lqesfzIpRr7rLummUEo,10219
@@ -38,15 +39,15 @@ pelican_nlp/configuration_files/config_general.yml,sha256=Dx06lK77yHSiH5U8vxrfm5
  pelican_nlp/configuration_files/config_morteza.yml,sha256=ZUcEIHrXWH9H3r42kTWIFEfgtqZBpyYUMOErVC7X3z8,3152
  pelican_nlp/core/__init__.py,sha256=whJc5dWsGsKn2IAw-D4BvCvUKW1sVtWYE1WJIuUr5uI,165
  pelican_nlp/core/audio_document.py,sha256=hhSJNgeqSYa6_uws2ho66agHhAdHuKN3EIEdIsIcXKg,586
- pelican_nlp/core/corpus.py,sha256=EIt-3giRaFe0vcJoKla_J8uVF_zR6oGmbQnNbllO9C0,15142
+ pelican_nlp/core/corpus.py,sha256=bP8exSraPIekc8WD7GdUIJrV03lS2p1FMdiAV_6HTDY,14989
  pelican_nlp/core/document.py,sha256=j2HP5FX6cfmXHo7OWVFCX6cMsDyqsOmNlnGNNNfCm2c,8467
- pelican_nlp/core/subject.py,sha256=-pi3jDzb2zLiG8JNAi9i-9Jd-VtsPxDO4ShQci2QSMg,1059
+ pelican_nlp/core/subject.py,sha256=Jx99vPn0K0KT_9BsJOY8XviFU_GuZGuwtb1rbLNkiUI,1049
  pelican_nlp/extraction/__init__.py,sha256=hfqFiaKpQBS6cwRm9Yd7MpOcV60_xJmwuQ2Kegary5k,84
- pelican_nlp/extraction/acoustic_feature_extraction.py,sha256=6Csrr6uotarhuAzxYlGFAil9K4PLUqa9vWw607peRoA,2319
+ pelican_nlp/extraction/acoustic_feature_extraction.py,sha256=Ol6fqyy94Iym1Z-eTVoz8EmqfV58boz5WAoamAK7JVE,2323
  pelican_nlp/extraction/distance_from_randomness.py,sha256=yikZ3GK2dqpzuNFPVsjuUK0lo6kHOIoIhKPaVrGXRMQ,3365
- pelican_nlp/extraction/extract_embeddings.py,sha256=e5bcNlskd7f-JkWtfd7YutGV5bqcURKrAkETRyTx93Q,2457
- pelican_nlp/extraction/extract_logits.py,sha256=Lc7Es86T8mlSvLMhiDHpFdCc0kCZ9fNr3-VFnOyeybs,3869
- pelican_nlp/extraction/language_model.py,sha256=npew_4ziTCNE87pjN8LL0eTPujlewVr8pMT7BsmzEjo,4038
+ pelican_nlp/extraction/extract_embeddings.py,sha256=6lzKbZpe5kCWHMh_ca0M-Xl_UF64bmGXEeQjFFTnsOA,2507
+ pelican_nlp/extraction/extract_logits.py,sha256=kvZn9dZWsZiSPcbQ8hKtFcS9XxNlMmL-WGvpToMMo7c,3925
+ pelican_nlp/extraction/language_model.py,sha256=37vVNFL31DVIBPSuyQK1rkEm8kiCXHTpGYv4Vk8w2bM,5676
  pelican_nlp/extraction/semantic_similarity.py,sha256=QhY5CAOAorxEo3UBWPlMegFvbySF0KH6j4j3m2I3_NY,2552
  pelican_nlp/extraction/test_documents/test_features.csv,sha256=LR_3m4vIm-YWKw5gI5ziswhS-NF9VhKv14c2udLxtJU,488482
  pelican_nlp/extraction/test_documents/wallace_1.15_3.txt,sha256=ShXxOHUZzGPNUqIcOn6-OYkarzNtTC22V05a_Xpvtlw,3731
@@ -54,7 +55,7 @@ pelican_nlp/extraction/test_documents/wallace_1.1_3.txt,sha256=gs5REE10myK3Nm9JB
  pelican_nlp/extraction/test_documents/wallace_1_4.txt,sha256=95Z7gS92KERCocrbOAFbJntf5QoE-6p0GL67XQEffqI,3963
  pelican_nlp/metrics_statistics/embeddings_metrics_statistics.py,sha256=svXXyLEA62mLa0KUfSiOSFFMjYk17K7BJbxUoLf0l9w,1468
  pelican_nlp/praat/__init__.py,sha256=uSEaUZ2nw7lH0twbRJL5BltJTJpopj5XCVhIbeM42bg,1035
- pelican_nlp/preprocessing/LPDS.py,sha256=4UWkMMSrdU-nWVi8eKiWQSGD7f7lemB42aI0fFn6ZLU,4097
+ pelican_nlp/preprocessing/LPDS.py,sha256=reaIqxDz_g7VG_J_8mCK2vcg4I1XrvZPTLgglupKiv4,4184
  pelican_nlp/preprocessing/__init__.py,sha256=ZYgOUlKPXmltYez3urPZmsAWRWSEqZ3_l_gN2aqd15s,293
  pelican_nlp/preprocessing/pipeline.py,sha256=t2zJAvZRO12MdAKQgm8XZxfZND7_8gFtzHF9Rq2L2aE,1796
  pelican_nlp/preprocessing/speaker_diarization.py,sha256=N6dZCa2AHHGw__g9e-ZUyZM_In0-nzFOkZ44cBnoKLk,1122
@@ -66,12 +67,13 @@ pelican_nlp/sample_configuration_files/config_discourse.yml,sha256=OaTCoMwhDjrOI
  pelican_nlp/sample_configuration_files/config_fluency.yml,sha256=JYpq90K4AF5TslzESJK6Nidw6-D1IiqD_6cdmlCd5-w,2990
  pelican_nlp/sample_configuration_files/config_general.yml,sha256=-GAVATlqXuQq4ANSW0JauwIGhr7ET_oZiBiM7I40AkA,3424
  pelican_nlp/utils/__init__.py,sha256=q1tGdOOj5UPRC2mGhoMUh8p4cbFCkkbD21bQaOVvFao,189
- pelican_nlp/utils/csv_functions.py,sha256=hsG73gm3Up9sAerp6gIxuNHaeP1vJj6HSh7ggVm1SSo,7272
+ pelican_nlp/utils/csv_functions.py,sha256=7X8pGh49TGZGs7h6JrJD846swCqSHL32mmXJ-8qLWPE,7774
+ pelican_nlp/utils/filename_parser.py,sha256=PGSKjiYDe_JVAFGcaYHdIYazB3p4MUiG6n8h_uZl8d8,551
  pelican_nlp/utils/sample_usage.py,sha256=W__OVMjWND-ZtxxRhfGJDHwbVpGlB-anXDxyA5P4cME,353
- pelican_nlp/utils/setup_functions.py,sha256=t4WG5qd5iYpNNBGklje_8ukwmJp_C9RMLLi7veDgNeA,3574
- pelican_nlp-0.2.7.dist-info/licenses/LICENSE,sha256=m3jshBZIXKiBX6qhmhtJcLTVJ1N6BEkQGIflneXvpYg,19336
- pelican_nlp-0.2.7.dist-info/METADATA,sha256=YyZBYza89dtKbvLLHXkxOEZ1BODloXBjh-zZSODLfVI,6155
- pelican_nlp-0.2.7.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
- pelican_nlp-0.2.7.dist-info/entry_points.txt,sha256=znlG0paAfju9P10UM3rm5HcCHoj4tarTllNpeaqH_gc,53
- pelican_nlp-0.2.7.dist-info/top_level.txt,sha256=F0qlyqy5FCd3sTS_npUYPeLKN9_BZq6wD4qo9pI0xbg,12
- pelican_nlp-0.2.7.dist-info/RECORD,,
+ pelican_nlp/utils/setup_functions.py,sha256=Ovd3VMCRpVg_BU8gcF6rGc9mp0zsD2iqJRqRB61lxOg,4529
+ pelican_nlp-0.3.1.dist-info/licenses/LICENSE,sha256=m3jshBZIXKiBX6qhmhtJcLTVJ1N6BEkQGIflneXvpYg,19336
+ pelican_nlp-0.3.1.dist-info/METADATA,sha256=sgUAHpBqowrsg_yFXs6-HDSgI77js6uqf8josFxjpcM,6593
+ pelican_nlp-0.3.1.dist-info/WHEEL,sha256=pxyMxgL8-pra_rKaQ4drOZAegBVuX-G_4nRHjjgWbmo,91
+ pelican_nlp-0.3.1.dist-info/entry_points.txt,sha256=znlG0paAfju9P10UM3rm5HcCHoj4tarTllNpeaqH_gc,53
+ pelican_nlp-0.3.1.dist-info/top_level.txt,sha256=F0qlyqy5FCd3sTS_npUYPeLKN9_BZq6wD4qo9pI0xbg,12
+ pelican_nlp-0.3.1.dist-info/RECORD,,
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (78.1.0)
+ Generator: setuptools (79.0.0)
  Root-Is-Purelib: true
  Tag: py3-none-any
 