pelican-nlp 0.2.7__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pelican_nlp/_version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.2.7"
+ __version__ = "0.3.1"
pelican_nlp/config.py ADDED
@@ -0,0 +1,14 @@
+ """
+ Global configuration settings for the Pelican project.
+
+ This file is not the configuration.yml file created for the users adaptations.
+ For consistency of pipeline, DO NOT CHANGE.
+ """
+
+ # Debug flag
+ DEBUG_MODE = False
+
+ def debug_print(*args, **kwargs):
+     """Print only if debug mode is enabled."""
+     if DEBUG_MODE:
+         print(*args, **kwargs)
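
Usage note: the modules touched in this release route their diagnostic output through debug_print, so verbosity is controlled from one place. A minimal sketch of how the flag could be flipped for a local debugging session; setting DEBUG_MODE at runtime is an assumption made here for illustration, since the file itself asks not to be edited:

    import pelican_nlp.config as pelican_config
    from pelican_nlp.config import debug_print

    pelican_config.DEBUG_MODE = True  # hypothetical: enable diagnostics for this session only
    debug_print('visible only while DEBUG_MODE is True')
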
pelican_nlp/core/corpus.py CHANGED
@@ -15,20 +15,24 @@ import os
  import pandas as pd
  import re
 
+ from pelican_nlp.config import debug_print
+
  class Corpus:
      def __init__(self, corpus_name, documents, configuration_settings, project_folder):
          self.name = corpus_name
+         self.key = corpus_name.split('-')[0]
+         self.value = corpus_name.split('-')[1]
          self.documents = documents
          self.config = configuration_settings
          self.project_folder = project_folder
-         self.derivative_dir = project_folder / 'derivatives'
+         self.derivatives_dir = project_folder / 'derivatives'
          self.pipeline = TextPreprocessingPipeline(self.config)
          self.task = configuration_settings['task_name']
          self.results_path = None
 
      def preprocess_all_documents(self):
          """Preprocess all documents"""
-         print('Preprocessing all documents...')
+         print(f'Preprocessing all documents of corpus {self.name}...')
          for document in self.documents:
              document.detect_sections()
              document.process_document(self.pipeline)
@@ -43,21 +47,15 @@ class Corpus:
          """Create separate aggregated results CSV files for each metric."""
          print("Creating aggregated results files per metric...")
 
-         try:
-             derivatives_path = os.path.dirname(os.path.dirname(self.documents[0].results_path))
-         except (AttributeError, IndexError):
-             print("Error: No valid results path found in documents")
-             return
-
          # Create aggregations folder
-         aggregation_path = os.path.join(derivatives_path, 'aggregations')
+         aggregation_path = os.path.join(self.derivatives_dir, 'aggregations')
          os.makedirs(aggregation_path, exist_ok=True)
 
          # Initialize results dictionary with metrics as keys
          results_by_metric = {}
 
          # Walk through all directories in derivatives
-         for root, dirs, files in os.walk(derivatives_path):
+         for root, dirs, files in os.walk(self.derivatives_dir):
              # Skip the aggregations directory itself
              if 'aggregations' in root:
                  continue
@@ -115,6 +113,7 @@ class Corpus:
          logits_options = self.config['options_logits']
 
          print('logits extraction in progress')
+
          model_name = logits_options['model_name']
          logitsExtractor = LogitsExtractor(logits_options,
                                            self.pipeline,
@@ -144,7 +143,7 @@ class Corpus:
 
              #'logits' list of dictionaries; keys token, logprob_actual, logprob_max, entropy, most_likely_token
              store_features_to_csv(logits,
-                                   self.derivative_dir,
+                                   self.derivatives_dir,
                                    self.documents[i],
                                    metric='logits')
 
@@ -154,9 +153,12 @@ class Corpus:
          embedding_options = self.config['options_embeddings']
          print('Embeddings extraction in progress...')
          embeddingsExtractor = EmbeddingsExtractor(embedding_options, self.project_folder)
+         debug_print(len(self.documents))
          for i in range(len(self.documents)):
+
+             debug_print(f'cleaned sections: {self.documents[i].cleaned_sections}')
              for key, section in self.documents[i].cleaned_sections.items():
-                 print(f'Processing section {key}')
+                 debug_print(f'Processing section {key}')
 
                  if self.config['discourse']:
                      section = TextDiarizer.parse_speaker(section, self.config['subject_speakertag'], embedding_options['keep_speakertags'])
@@ -175,7 +177,7 @@ class Corpus:
                      from pelican_nlp.extraction.semantic_similarity import calculate_semantic_similarity, \
                          get_semantic_similarity_windows
                      consecutive_similarities, mean_similarity = calculate_semantic_similarity(utterance)
-                     print(f'Mean semantic similarity: {mean_similarity:.4f}')
+                     debug_print(f'Mean semantic similarity: {mean_similarity:.4f}')
 
                      for window_size in self.config['options_semantic-similarity']['window_sizes']:
                          window_stats = get_semantic_similarity_windows(utterance, window_size)
@@ -187,7 +189,7 @@ class Corpus:
                                  'std_of_window_stds': window_stats[3],
                                  'mean_of_window_medians': window_stats[4]
                              }
-                             print(f'Window {window_size} stats - mean: {window_stats[0]:.4f}, std: {window_stats[1]:.4f}, median: {window_stats[4]:.4f}')
+                             debug_print(f'Window {window_size} stats - mean: {window_stats[0]:.4f}, std: {window_stats[1]:.4f}, median: {window_stats[4]:.4f}')
                          else:
                              window_data = {
                                  'mean': window_stats[0] if isinstance(window_stats, tuple) else window_stats,
@@ -195,16 +197,16 @@ class Corpus:
                              }
 
                          store_features_to_csv(window_data,
-                                               self.derivative_dir,
+                                               self.derivatives_dir,
                                                self.documents[i],
                                                metric=f'semantic-similarity-window-{window_size}')
 
                      if self.config['options_embeddings']['distance-from-randomness']:
                          from pelican_nlp.extraction.distance_from_randomness import get_distance_from_randomness
                          divergence = get_distance_from_randomness(utterance, self.config["options_dis_from_randomness"])
-                         print(f'Divergence from optimality metrics: {divergence}')
+                         debug_print(f'Divergence from optimality metrics: {divergence}')
                          store_features_to_csv(divergence,
-                                               self.derivative_dir,
+                                               self.derivatives_dir,
                                                self.documents[i],
                                                metric='distance-from-randomness')
 
@@ -230,7 +232,7 @@ class Corpus:
                  cleaned_embeddings = utterance if isinstance(utterance, list) else [(k, v) for k, v in utterance.items()]
 
                  store_features_to_csv(cleaned_embeddings,
-                                       self.derivative_dir,
+                                       self.derivatives_dir,
                                        self.documents[i],
                                        metric='embeddings')
          return
@@ -241,11 +243,11 @@ class Corpus:
              results, recording_length = AudioFeatureExtraction.opensmile_extraction(self.documents[i].file, self.config['opensmile_configurations'])
              self.documents[i].recording_length = recording_length # Store the recording length
              results['subject_ID'] = self.documents[i].subject_ID # Set the subject ID
-             print('results obtained')
+             print('opensmile results obtained')
              store_features_to_csv(results,
-                                 self.derivative_dir,
-                                 self.documents[i],
-                                 metric='opensmile-features')
+                                   self.derivatives_dir,
+                                   self.documents[i],
+                                   metric='opensmile-features')
 
      def extract_prosogram(self):
          from pelican_nlp.extraction.acoustic_feature_extraction import AudioFeatureExtraction
@@ -257,14 +259,8 @@ class Corpus:
          """Create CSV file with summarized document parameters based on config specifications."""
          print("Creating document information summary...")
 
-         try:
-             derivatives_path = os.path.dirname(os.path.dirname(self.documents[0].results_path))
-         except (AttributeError, IndexError):
-             print("Error: No valid results path found in documents")
-             return
-
          # Create document_information folder inside aggregations
-         doc_info_path = os.path.join(derivatives_path, 'aggregations', 'document_information')
+         doc_info_path = os.path.join(self.derivatives_dir, 'aggregations', 'document_information')
          os.makedirs(doc_info_path, exist_ok=True)
 
          # Define output file path
@@ -293,4 +289,4 @@ class Corpus:
          # Convert to DataFrame and save to CSV
          df = pd.DataFrame(document_info)
          df.to_csv(output_file, index=False)
-         print(f"Document information saved to: {output_file}")
+         debug_print(f"Document information saved to: {output_file}")
pelican_nlp/core/subject.py CHANGED
@@ -4,12 +4,12 @@ The Subject class stores all subject specific information and a list of correspo
  """
 
  class Subject:
-     def __init__(self, subjectID, description=None):
+     def __init__(self, name, description=None):
 
-         self.subjectID = subjectID
+         self.name = name
+         self.subjectID = None
          self.gender = None
          self.age = None
-         self.name = None
          self.description = description # Description of the subject
         self.documents = [] # List of TextDocument instances
          self.numberOfSessions = None
pelican_nlp/extraction/acoustic_feature_extraction.py CHANGED
@@ -49,7 +49,7 @@ class AudioFeatureExtraction:
              profile (DataFrame): Prosogram analysis results
          """
          import parselmouth
-         from pelican.praat import PROSOGRAM_SCRIPT
+         from pelican_nlp.praat import PROSOGRAM_SCRIPT
          try:
              sound = parselmouth.Sound(file)
              # Common Prosogram parameters
pelican_nlp/extraction/extract_embeddings.py CHANGED
@@ -1,6 +1,8 @@
  from pelican_nlp.extraction.language_model import Model
  from pelican_nlp.preprocessing.text_tokenizer import TextTokenizer
 
+ from pelican_nlp.config import debug_print
+
  class EmbeddingsExtractor:
      def __init__(self, embeddings_configurations, project_path):
          self.embeddings_configurations = embeddings_configurations
@@ -22,7 +24,7 @@ class EmbeddingsExtractor:
 
          # Tokenize the input text
          inputs = self.Tokenizer.tokenize_text(text)
-         print(f'inputs are: {inputs}')
+         debug_print(f'inputs are: {inputs}')
 
          if self.embeddings_configurations['pytorch_based_model']:
              #e.g. RoBERTa Model or Llama Model
pelican_nlp/extraction/extract_logits.py CHANGED
@@ -2,6 +2,8 @@ import torch
  import torch.nn.functional as F
  from tqdm import tqdm
 
+ from pelican_nlp.config import debug_print
+
  class LogitsExtractor:
      def __init__(self, options, pipeline, project_path):
 
@@ -13,9 +15,9 @@ class LogitsExtractor:
 
      def extract_features(self, section, tokenizer, model):
 
-         print(f'section to tokenize: {section}')
+         debug_print(f'section to tokenize: {section}')
          tokens = tokenizer.tokenize_text(section)
-         print(tokens)
+         debug_print(tokens)
 
          chunk_size = self.options['chunk_size']
          overlap_size = self.options['overlap_size']
pelican_nlp/extraction/language_model.py CHANGED
@@ -1,6 +1,7 @@
  import torch
  import psutil
  import os
+ import shutil
 
  from accelerate import init_empty_weights, infer_auto_device_map, dispatch_model
  from transformers import AutoModelForCausalLM
@@ -26,24 +27,53 @@ class Model:
              # Set the model path using proper OS path joining
              model_path = os.path.join(model_dir, 'cc.de.300.bin')
 
-             # Download only if model doesn't exist
-             if not os.path.exists(model_path):
+             # Download only if model doesn't exist or is invalid
+             need_download = True
+             if os.path.exists(model_path):
                  try:
+                     self.model_instance = fasttext.load_model(model_path)
+                     need_download = False
+                 except ValueError:
+                     print(f"Existing model file is corrupted, re-downloading...")
+                     os.remove(model_path)
+
+             if need_download:
+                 print("Downloading FastText model...")
+                 try:
+                     # Try the built-in FastText downloader first
                      fasttext.util.download_model('de', if_exists='ignore')
-                 except OSError:
-                     # Direct download fallback for Windows
+                     # Find the downloaded file in current directory
+                     downloaded_file = 'cc.de.300.bin'
+                     if os.path.exists(downloaded_file):
+                         # Move the file to the correct location
+                         shutil.move(downloaded_file, model_path)
+                     else:
+                         raise FileNotFoundError("FastText downloader didn't create the expected file")
+                 except (OSError, ValueError, FileNotFoundError) as e:
+                     print(f"FastText downloader failed, using direct download: {str(e)}")
+                     # Direct download fallback
                      import urllib.request
                      url = 'https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.de.300.bin.gz'
-                     urllib.request.urlretrieve(url, model_path + '.gz')
+                     print(f"Downloading from {url}...")
+                     temp_gz_path = model_path + '.gz'
+                     urllib.request.urlretrieve(url, temp_gz_path)
+
                      # Decompress the file
+                     print("Decompressing model file...")
                      import gzip
-                     with gzip.open(model_path + '.gz', 'rb') as f_in:
+                     with gzip.open(temp_gz_path, 'rb') as f_in:
                          with open(model_path, 'wb') as f_out:
                              f_out.write(f_in.read())
-                     os.remove(model_path + '.gz')
+                     os.remove(temp_gz_path)
+                     print("Model decompressed successfully")
+
+             # Verify the downloaded model
+             try:
+                 self.model_instance = fasttext.load_model(model_path)
+             except ValueError as e:
+                 raise ValueError(f"Failed to load downloaded model: {str(e)}. Please try removing {model_path} and running again.")
 
-             self.model_instance = fasttext.load_model(model_path)
-             print('FastText model loaded.')
+             print(f'FastText model loaded successfully from {model_path}')
          elif self.model_name == 'xlm-roberta-base':
              from transformers import AutoModel
              self.model_instance = AutoModel.from_pretrained(
@@ -80,7 +110,7 @@ class Model:
      def device_map_creation(self):
          #check if cuda is available
          if not torch.cuda.is_available():
-             print('Careful: Cuda not available, using CPU. This will be very slow.')
+             print('Careful: Cuda not available, using CPU. This can be slow. Consider running pipeline on different device')
          else:
              print(f'{torch.cuda.get_device_name(0)} available.')
 
pelican_nlp/main.py CHANGED
@@ -1,9 +1,9 @@
  #!/usr/bin/env python3
  """
- Pelican Project
- ===============
+ Pelican-nlp Project
+ ===================
 
- Pelican is a tool developed to enable consistent and reproducible language processing.
+ Pelican-nlp is a tool developed to enable consistent and reproducible language processing.
  Main entry point for the Pelican project handling document processing and metric extraction.
 
  Author: Yves Pauli
@@ -23,6 +23,9 @@ import sys
  from pelican_nlp.core import Corpus
  from pelican_nlp.utils.setup_functions import subject_instantiator, load_config, remove_previous_derivative_dir
  from pelican_nlp.preprocessing import LPDS
+ from pelican_nlp.utils.filename_parser import parse_lpds_filename
+
+ from config import debug_print
 
  project_path = '/home/yvespauli/PycharmProjects/PyPI_testing_fluency/config_fluency.yml'
 
@@ -30,7 +33,8 @@ class Pelican:
 
      """Main class for the Pelican project handling document processing and metric extraction."""
 
-     def __init__(self, config_path: str = None, dev_mode: bool = True) -> None:
+     def __init__(self, config_path: str = None, dev_mode: bool = False) -> None:
+
          self.dev_mode = dev_mode
 
          # If no config path is provided, use the default config from package; used for dev-mode
@@ -83,23 +87,25 @@ class Pelican:
          subjects = subject_instantiator(self.config, self.project_path)
 
          # Process each corpus
-         for corpus_name in self.config['corpus_names']:
-             self._process_corpus(corpus_name, subjects)
-
-     def _process_corpus(self, corpus_name: str, subjects: List) -> None:
+         for corpus_value in self.config['corpus_values']:
+             self._process_corpus(self.config['corpus_key'], corpus_value, subjects)
 
+     def _process_corpus(self, corpus_key: str, corpus_value: str, subjects: List) -> None:
          """Process a single corpus including preprocessing and metric extraction."""
-         print(f'Processing corpus: {corpus_name}')
 
-         corpus_documents = self._identify_corpus_files(subjects, corpus_name)
-         corpus = Corpus(corpus_name, corpus_documents[corpus_name], self.config, self.project_path)
+         corpus_entity = corpus_key + '-' + corpus_value
+         print(f'Processing corpus: {corpus_entity}')
+         debug_print(subjects, corpus_entity)
+         corpus_documents = self._identify_corpus_files(subjects, corpus_entity)
+         debug_print(len(corpus_documents))
+         corpus = Corpus(corpus_entity, corpus_documents[corpus_entity], self.config, self.project_path)
 
-         for document in corpus_documents[corpus_name]:
-             document.corpus_name = corpus_name
+         for document in corpus_documents[corpus_entity]:
+             document.corpus_name = corpus_entity
 
          if self.config['input_file']=='text':
              corpus.preprocess_all_documents()
-             print(f'Corpus {corpus_name} is preprocessed')
+             print(f'Corpus {corpus_key} is preprocessed')
 
          self._extract_metrics(corpus)
 
@@ -140,18 +146,34 @@ class Pelican:
 
          self._clear_gpu_memory()
 
-     def _identify_corpus_files(self, subjects: List, corpus: str) -> Dict:
-         """Identify and group files belonging to a specific corpus."""
-         corpus_dict = {corpus: []}
-         for subject in subjects:
-             for document in subject.documents:
-                 name = Path(document.name)
-                 document.extension = name.suffix
-                 # Split by both '_' and '.' to get all parts
-                 parts = name.stem.replace('.', '_').split('_')
-                 # Check if corpus name appears in any part
-                 if corpus in parts:
-                     corpus_dict[corpus].append(document)
+     def _identify_corpus_files(self, subjects: List, entity: str) -> Dict:
+         """Identify and group files based on specified entity-value pair."""
+         debug_print(f'identifying corpus files')
+         corpus_dict = {entity: []}
+         debug_print(len(subjects))
+
+         # Check if entity is in key-value format
+         if '-' in entity:
+             key, value = entity.split('-', 1)
+
+             for subject in subjects:
+                 debug_print(subject.documents)
+                 for document in subject.documents:
+                     entities = parse_lpds_filename(document.name)
+                     debug_print(entities)
+                     if key in entities and str(entities[key]) == value:
+                         corpus_dict[entity].append(document)
+         else:
+             # Entity is just a value, check all keys
+             for subject in subjects:
+                 debug_print(subject.documents)
+                 for document in subject.documents:
+                     entities = parse_lpds_filename(document.name)
+                     debug_print(entities)
+                     # Convert all values to strings for comparison
+                     if any(str(val) == entity for val in entities.values()):
+                         corpus_dict[entity].append(document)
+
          return corpus_dict
 
      def _handle_output_directory(self) -> None:
@@ -207,4 +229,4 @@ class Pelican:
 
 
  if __name__ == '__main__':
-     Pelican(project_path).run()
+     Pelican(project_path, dev_mode=True).run()
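
Usage note: corpus selection in main.py now works on an entity key plus a list of values ('corpus_key' and 'corpus_values') instead of the old 'corpus_names' list, and the joined entity string is what Corpus.__init__ splits back into key and value. A hedged sketch of that relationship; the key 'acq' and the values below are hypothetical examples, the real ones come from the project's configuration file:

    # Hypothetical configuration entries replacing the old 'corpus_names' list:
    config = {'corpus_key': 'acq', 'corpus_values': ['semantic', 'phonemic']}

    # _process_corpus() builds the entity string that Corpus later splits on '-':
    corpus_entity = config['corpus_key'] + '-' + config['corpus_values'][0]  # 'acq-semantic'
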
pelican_nlp/preprocessing/LPDS.py CHANGED
@@ -1,6 +1,8 @@
  import re
  import os
 
+ from pelican_nlp.config import debug_print
+
  class LPDS:
      def __init__(self, project_folder, multiple_sessions):
          self.project_folder = project_folder
@@ -18,7 +20,7 @@ class LPDS:
          suggested_files = ["dataset_description.json", "README", "CHANGES", "participants.tsv"]
          for file in suggested_files:
              if not os.path.isfile(os.path.join(self.project_folder, file)):
-                 print(f"Warning: Missing suggested file '{file}' in the project folder.")
+                 debug_print(f"Warning: Missing suggested file '{file}' in the project folder.")
 
          # Check for the 'subjects' folder
          if not os.path.isdir(self.subjects_folder):
@@ -38,15 +40,16 @@ class LPDS:
              if self.multiple_sessions:
                  session_folders = [f for f in os.listdir(subject_path) if
                                     os.path.isdir(os.path.join(subject_path, f))]
-                 if not session_folders:
+                 if session_folders:
+                     if 'ses-01' not in session_folders:
+                         print(f"Warning: Ideally, the session folders should follow the naming convention 'ses-x'.")
+                 else:
                      print(f"Warning: No session folders found in '{subject_folder}'.")
-                 if 'ses-01' not in session_folders:
-                     print(f"Warning: Ideally, the session folders should follow the naming convention 'ses-x'.")
 
              # Check for optional subject_metadata file
              metadata_file = os.path.join(subject_path, "subject_metadata")
              if not os.path.isfile(metadata_file):
-                 #print(f"Note: Optional 'subject_metadata' file is missing in '{subject_folder}'.")
+                 debug_print(f"Note: Optional 'subject_metadata' file is missing in '{subject_folder}'.")
                  continue
 
              session_folders = subject_folder
@@ -68,7 +71,7 @@ class LPDS:
                      else:
                          pattern = fr"^{subject_folder}_{task_folder}.*"
                      if not re.match(pattern, file):
-                         print(f"Warning: File '{file}' in '{task_folder}' does not follow the LPDS naming conventions")
+                         debug_print(f"Warning: File '{file}' in '{task_folder}' does not follow the LPDS naming conventions")
 
      def derivative_dir_creator(self):
          # Create the 'derivatives' folder if it doesn't exist
pelican_nlp/utils/csv_functions.py CHANGED
@@ -1,58 +1,66 @@
  import os
  import csv
- import numpy as np
+ from .filename_parser import parse_lpds_filename
+ from pelican_nlp.config import debug_print
 
  def store_features_to_csv(input_data, derivatives_dir, doc_class, metric):
-     """Store various types of features to CSV files with consistent formatting.
+     """Store various types of features to CSV files with consistent formatting."""
+
+     # Parse entities from the document name
+     entities = parse_lpds_filename(doc_class.name)
 
-     Args:
-         input_data: The data to be stored in CSV format
-         derivatives_dir: Base directory for all derivatives
-         doc_class: Document class containing subject, session (optional), task, and task_addition (optional) info
-         metric: Type of metric being stored
-     """
-     # Get the appropriate metric folder
-     metric_folder = metric
+     # Get the base filename without extension and current suffix
+     base_filename = os.path.splitext(doc_class.name)[0] # Remove extension
 
-     # Build base filename parts from doc_class
-     filename_parts = [
-         doc_class.subject_ID,
-         doc_class.task,
-         doc_class.corpus_name
-     ]
+     # If there's a suffix in the entities, remove it from the base filename
+     if 'suffix' in entities:
+         # Remove the current suffix
+         base_filename = base_filename.replace(f"_{entities['suffix']}", "")
 
-     # Add session to filename if it exists
-     if hasattr(doc_class, 'session') and doc_class.session:
-         filename_parts.insert(1, doc_class.session)
+     # Create the new filename with the metric as suffix
+     filename = f"{base_filename}_{metric}.csv"
 
-     # Join the base parts with underscores
-     filename = "_".join(filename_parts)
+     # Extract core information from entities for directory structure
+     subject_ID = f"sub-{entities['sub']}" if 'sub' in entities else None
+     if not subject_ID:
+         raise ValueError(f"Missing required 'sub' entity in filename: {doc_class.name}")
 
-     # Add task_addition with underscore if it exists
-     if hasattr(doc_class, 'task_addition') and doc_class.task_addition:
-         filename += f"_{doc_class.task_addition}"
+     session = f"ses-{entities['ses']}" if 'ses' in entities else None
+     task = f"task-{entities['task']}" if 'task' in entities else None
 
-     # Add the metric with an underscore
-     filename += f"_{metric}.csv"
-
-     # Build the full path
+     # Build the full path components
      path_components = [
          derivatives_dir,
-         metric_folder,
-         doc_class.subject_ID,
+         metric, # Use metric as the folder name
+         subject_ID,
      ]
 
      # Add session to path if it exists
-     if hasattr(doc_class, 'session') and doc_class.session:
-         path_components.append(doc_class.session)
+     if session:
+         path_components.append(session)
 
-     path_components.append(doc_class.task)
+     # Add task to path if it exists
+     if task:
+         path_components.append(task)
 
      # Create directory and get final filepath
-     final_results_path = os.path.join(*path_components)
+     # Ensure all components have compatible types by using str() conversion
+     base_path = os.path.join(str(derivatives_dir), str(metric), str(subject_ID))
+
+     # Build path incrementally with explicit type conversion
+     if session:
+         final_results_path = os.path.join(base_path, str(session))
+     else:
+         final_results_path = base_path
+
+     if task:
+         final_results_path = os.path.join(final_results_path, str(task))
+
+
+     debug_print(final_results_path)
      os.makedirs(final_results_path, exist_ok=True)
 
-     output_filepath = os.path.join(final_results_path, filename)
+     output_filepath = os.path.join(final_results_path, str(filename))
      file_exists = os.path.exists(output_filepath)
 
      # Write data based on metric type
@@ -146,6 +154,8 @@ def store_features_to_csv(input_data, derivatives_dir, doc_class, metric):
              row_data.append(value)
          writer.writerow(row_data)
 
+     return output_filepath
+
 
 
  def _build_filename_parts(path_parts, corpus, metric, config=None):
pelican_nlp/utils/filename_parser.py ADDED
@@ -0,0 +1,23 @@
+ from pathlib import Path
+
+ def parse_lpds_filename(filename):
+     """Parse LPDS-style filename into entity-value pairs."""
+
+     entities = {}
+     name = Path(filename)
+
+     # Handle extension
+     entities['extension'] = name.suffix
+
+     # Split into components
+     parts = name.stem.split('_')
+
+     # Parse each entity-value pair
+     for part in parts:
+         if '-' in part:
+             key, value = part.split('-', 1)
+             entities[key] = value
+         else:
+             entities['suffix'] = part
+
+     return entities
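
For illustration, the parser maps each underscore-separated key-value component of the stem to a dictionary entry and stores the last bare component under 'suffix'. A hypothetical LPDS-style filename (the entity names here are examples, not prescribed by the package):

    >>> parse_lpds_filename('sub-01_ses-01_task-fluency_acq-semantic_transcript.txt')
    {'extension': '.txt', 'sub': '01', 'ses': '01', 'task': 'fluency', 'acq': 'semantic', 'suffix': 'transcript'}
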
pelican_nlp/utils/setup_functions.py CHANGED
@@ -1,81 +1,107 @@
  import os
- from pelican_nlp.core.subject import Subject
  import shutil
  import yaml
  import sys
+ from pelican_nlp.core.subject import Subject
+ from .filename_parser import parse_lpds_filename
+ from ..config import debug_print
+
 
  def subject_instantiator(config, project_folder):
      path_to_subjects = os.path.join(project_folder, 'subjects')
      print('Instantiating Subjects...')
-     subjects = [Subject(subject) for subject in os.listdir(path_to_subjects)]
+
+     # Get all subject directories that match sub-* pattern
+     subjects = [
+         Subject(subject_dir)
+         for subject_dir in os.listdir(path_to_subjects)
+     ]
 
      # Identifying all subject files
      for subject in subjects:
-         if config['multiple_sessions']:
-             paths = _get_subject_sessions(subject, project_folder)
-         else:
-             paths = [os.path.join(path_to_subjects, subject.subjectID)]
+         # Get subject ID from directory name (e.g., 'sub-01' -> '01')
+         subject.subjectID = subject.name.split('-')[1]
+
+         # Find all files for this subject recursively
+         subject_path = os.path.join(path_to_subjects, subject.name)
+         all_files = []
+         for root, _, files in os.walk(subject_path):
+             all_files.extend([os.path.join(root, f) for f in files])
+
+         # Filter files by task name from config
+         task_files = []
+         for file_path in all_files:
+             filename = os.path.basename(file_path)
+             entities = parse_lpds_filename(filename)
+             if entities.get('task') == config['task_name']:
+                 task_files.append((file_path, filename))
+
+         # Instantiate documents for matching files
+         for file_path, filename in task_files:
+             entities = parse_lpds_filename(filename)
+             document = _instantiate_document(file_path, filename, entities, config)
+             subject.documents.append(document)
 
-         for path in paths:
-             file_path = os.path.join(path, config['task_name'])
-             subject.documents.extend(_instantiate_documents(file_path, subject.subjectID, config))
-         print(f'all identified subject documents for subject {subject.subjectID}: {subject.documents}')
+         debug_print(f'all identified subject documents for subject {subject.subjectID}: {subject.documents}')
+
+         # Set up results paths for each document
          for document in subject.documents:
-             parts = document.file_path.split(os.sep)
+             entities = parse_lpds_filename(document.name)
+
+             # Build derivatives path based on entities
+             derivatives_parts = [project_folder, 'derivatives']
 
-             # Adjust path components based on whether session exists
-             if config.get('multiple_sessions', False):
-                 subject_ID, session, task = parts[-4], parts[-3], parts[-2]
-                 document.results_path = os.path.join(project_folder, 'derivatives', subject_ID, session, task)
-             else:
-                 subject_ID, task = parts[-3], parts[-2]
-                 document.results_path = os.path.join(project_folder, 'derivatives', subject_ID, task)
+             # Always include subject
+             derivatives_parts.append(f"sub-{entities['sub']}")
+
+             # Add session if present
+             if 'ses' in entities:
+                 derivatives_parts.append(f"ses-{entities['ses']}")
+
+             # Add task
+             derivatives_parts.append(f"task-{entities['task']}")
+
+             document.results_path = os.path.join(*derivatives_parts)
 
      return subjects
 
- def _get_subject_sessions(subject, project_path):
-     session_dir = os.path.join(os.path.join(project_path, 'subjects'), subject.subjectID)
-     session_paths = [
-         os.path.join(session_dir, session)
-         for session in os.listdir(session_dir)
-         if os.path.isdir(os.path.join(session_dir, session))
-     ]
-     subject.numberOfSessions = len(session_paths)
-     return session_paths
+ def _instantiate_document(filepath, filename, entities, config):
+     """Create appropriate document instance based on config and entities"""
 
- def _instantiate_documents(filepath, subject, config):
+     common_kwargs = {
+         'file_path': os.path.dirname(filepath),
+         'name': filename,
+         'subject_ID': entities.get('sub'),
+         'task': entities.get('task'),
+         # Check for specific entities that might indicate document type
+         'fluency': 'cat' in entities and entities['cat'] == 'semantic',
+         'num_speakers': config['number_of_speakers'],
+     }
 
-     if config['input_file']=='text':
+     if config['input_file'] == 'text':
          from pelican_nlp.core.document import Document
-         return [
-             Document(
-                 filepath,
-                 file_name,
-                 subject_ID = subject,
-                 task=config['task_name'],
-                 fluency=config['fluency_task'],
-                 has_sections=config['has_multiple_sections'],
-                 section_identifier=config['section_identification'],
-                 number_of_sections=config['number_of_sections'],
-                 num_speakers=config['number_of_speakers'],
-                 has_section_titles=config['has_section_titles']
-             )
-             for file_name in os.listdir(filepath)
-         ]
-
-     elif config['input_file']=='audio':
+         return Document(
+             **common_kwargs,
+             # Use entities for section information if available, fall back to config
+             has_sections=bool(entities.get('sections', config['has_multiple_sections'])),
+             section_identifier=config['section_identification'],
+             number_of_sections=config['number_of_sections'],
+             has_section_titles=config['has_section_titles'],
+             # Add any additional entities as attributes
+             session=entities.get('ses'),
+             acquisition=entities.get('acq'),
+             category=entities.get('cat'),
+             run=entities.get('run'),
+         )
+     elif config['input_file'] == 'audio':
          from pelican_nlp.core.audio_document import AudioFile
-         return [
-             AudioFile(
-                 filepath,
-                 file_name,
-                 subject_ID=subject,
-                 task=config['task_name'],
-                 fluency=config['fluency_task'],
-                 num_speakers=config['number_of_speakers'],
-             )
-             for file_name in os.listdir(filepath)
-         ]
+         return AudioFile(
+             **common_kwargs,
+             # Add audio-specific entities
+             recording_type=entities.get('rec'),
+             channel=entities.get('ch'),
+             run=entities.get('run'),
+         )
 
  def remove_previous_derivative_dir(output_directory):
      if os.path.isdir(output_directory):
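
Usage note: subject_instantiator now walks the whole subjects/ tree and keeps only files whose parsed 'task' entity matches config['task_name'], and each document's results_path mirrors the sub/ses/task entities under derivatives/. A hypothetical project layout consistent with that logic (folder and file names are illustrative, not prescribed by the package):

    subjects/
        sub-01/
            ses-01/
                sub-01_ses-01_task-fluency_transcript.txt    # kept when task_name == 'fluency'
    derivatives/
        sub-01/
            ses-01/
                task-fluency/                                # document.results_path
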
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: pelican_nlp
- Version: 0.2.7
+ Version: 0.3.1
  Summary: Preprocessing and Extraction of Linguistic Information for Computational Analysis
  Author-email: Yves Pauli <yves.pauli@gmail.com>
  License-Expression: CC-BY-NC-4.0
@@ -45,20 +45,31 @@ Requires-Dist: huggingface_hub==0.29.2
  Dynamic: license-file
 
  ====================================
- PELICAN_nlp
+ pelican_nlp
  ====================================
 
- pelican_nlp stands for "Preprocessing and Extraction of Linguistic Information for Computational Analysis - Natural Language Processing". This package enables the creation of standardized and reproducible language processing pipelines, extracting linguistic features from various tasks like discourse, fluency, and image descriptions.
+ .. |logo| image:: docs/images/pelican_logo.png
+    :alt: PELICAN_nlp Logo
+    :width: 200px
 
- .. image:: https://img.shields.io/pypi/v/package-name.svg
+ +------------+-------------------------------------------------------------------+
+ | |logo|     | pelican_nlp stands for "Preprocessing and Extraction of Linguistic|
+ |            | Information for Computational Analysis - Natural Language         |
+ |            | Processing". This package enables the creation of standardized and|
+ |            | reproducible language processing pipelines, extracting linguistic |
+ |            | features from various tasks like discourse, fluency, and image    |
+ |            | descriptions.                                                      |
+ +------------+-------------------------------------------------------------------+
+
+ .. image:: https://img.shields.io/pypi/v/pelican_nlp.svg
     :target: https://pypi.org/project/pelican_nlp/
     :alt: PyPI version
 
- .. image:: https://img.shields.io/github/license/username/package-name.svg
+ .. image:: https://img.shields.io/badge/License-CC%20BY--NC%204.0-lightgrey.svg
     :target: https://github.com/ypauli/pelican_nlp/blob/main/LICENSE
-    :alt: License
+    :alt: License CC BY-NC 4.0
 
- .. image:: https://img.shields.io/pypi/pyversions/package-name.svg
+ .. image:: https://img.shields.io/pypi/pyversions/pelican_nlp.svg
     :target: https://pypi.org/project/pelican_nlp/
     :alt: Supported Python Versions
 
@@ -1,7 +1,8 @@
  pelican_nlp/__init__.py,sha256=TD5xjKeXXAH6nUWG-6igbClgovi5r8RIEqI_ix1QeYo,204
- pelican_nlp/_version.py,sha256=LIho7asb0pp1iNbJvXEhRMluyGN4gB4RHIIbAKpROsc,21
+ pelican_nlp/_version.py,sha256=v-ExhFzOD_GemLcOptv2ZODgnklv9iqEEospk_bU1_w,21
  pelican_nlp/cli.py,sha256=mPz-ASIMUme69G6YGVpTnHr5VfM3XA4h29WFd7DXpa4,588
- pelican_nlp/main.py,sha256=HX2Rbl4j7RXaMXlGCtggBBqcg3gRh-ey1PdLsQcDX30,7660
+ pelican_nlp/config.py,sha256=cqUYLeqQB_Y-drR4dpxz8l-aLKl7TcfiB8SeN_rNq4I,352
+ pelican_nlp/main.py,sha256=43jz94Zit931nZXs1hSAAPimRbX8Vmj-bEx7rDoYtZ4,8674
  pelican_nlp/Nils_backup/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  pelican_nlp/Nils_backup/extract_acoustic_features.py,sha256=eSP8lXxbZ15YE1HqxGtma9uWOcSN-fI-ig-NwQ9eOA8,10771
  pelican_nlp/Nils_backup/speaker_diarization_Nils.py,sha256=3RIhjKihu4Z1rruMt9KESFE2lqesfzIpRr7rLummUEo,10219
@@ -38,15 +39,15 @@ pelican_nlp/configuration_files/config_general.yml,sha256=Dx06lK77yHSiH5U8vxrfm5
  pelican_nlp/configuration_files/config_morteza.yml,sha256=ZUcEIHrXWH9H3r42kTWIFEfgtqZBpyYUMOErVC7X3z8,3152
  pelican_nlp/core/__init__.py,sha256=whJc5dWsGsKn2IAw-D4BvCvUKW1sVtWYE1WJIuUr5uI,165
  pelican_nlp/core/audio_document.py,sha256=hhSJNgeqSYa6_uws2ho66agHhAdHuKN3EIEdIsIcXKg,586
- pelican_nlp/core/corpus.py,sha256=EIt-3giRaFe0vcJoKla_J8uVF_zR6oGmbQnNbllO9C0,15142
+ pelican_nlp/core/corpus.py,sha256=bP8exSraPIekc8WD7GdUIJrV03lS2p1FMdiAV_6HTDY,14989
  pelican_nlp/core/document.py,sha256=j2HP5FX6cfmXHo7OWVFCX6cMsDyqsOmNlnGNNNfCm2c,8467
- pelican_nlp/core/subject.py,sha256=-pi3jDzb2zLiG8JNAi9i-9Jd-VtsPxDO4ShQci2QSMg,1059
+ pelican_nlp/core/subject.py,sha256=Jx99vPn0K0KT_9BsJOY8XviFU_GuZGuwtb1rbLNkiUI,1049
  pelican_nlp/extraction/__init__.py,sha256=hfqFiaKpQBS6cwRm9Yd7MpOcV60_xJmwuQ2Kegary5k,84
- pelican_nlp/extraction/acoustic_feature_extraction.py,sha256=6Csrr6uotarhuAzxYlGFAil9K4PLUqa9vWw607peRoA,2319
+ pelican_nlp/extraction/acoustic_feature_extraction.py,sha256=Ol6fqyy94Iym1Z-eTVoz8EmqfV58boz5WAoamAK7JVE,2323
  pelican_nlp/extraction/distance_from_randomness.py,sha256=yikZ3GK2dqpzuNFPVsjuUK0lo6kHOIoIhKPaVrGXRMQ,3365
- pelican_nlp/extraction/extract_embeddings.py,sha256=e5bcNlskd7f-JkWtfd7YutGV5bqcURKrAkETRyTx93Q,2457
- pelican_nlp/extraction/extract_logits.py,sha256=Lc7Es86T8mlSvLMhiDHpFdCc0kCZ9fNr3-VFnOyeybs,3869
- pelican_nlp/extraction/language_model.py,sha256=npew_4ziTCNE87pjN8LL0eTPujlewVr8pMT7BsmzEjo,4038
+ pelican_nlp/extraction/extract_embeddings.py,sha256=6lzKbZpe5kCWHMh_ca0M-Xl_UF64bmGXEeQjFFTnsOA,2507
+ pelican_nlp/extraction/extract_logits.py,sha256=kvZn9dZWsZiSPcbQ8hKtFcS9XxNlMmL-WGvpToMMo7c,3925
+ pelican_nlp/extraction/language_model.py,sha256=37vVNFL31DVIBPSuyQK1rkEm8kiCXHTpGYv4Vk8w2bM,5676
  pelican_nlp/extraction/semantic_similarity.py,sha256=QhY5CAOAorxEo3UBWPlMegFvbySF0KH6j4j3m2I3_NY,2552
  pelican_nlp/extraction/test_documents/test_features.csv,sha256=LR_3m4vIm-YWKw5gI5ziswhS-NF9VhKv14c2udLxtJU,488482
  pelican_nlp/extraction/test_documents/wallace_1.15_3.txt,sha256=ShXxOHUZzGPNUqIcOn6-OYkarzNtTC22V05a_Xpvtlw,3731
@@ -54,7 +55,7 @@ pelican_nlp/extraction/test_documents/wallace_1.1_3.txt,sha256=gs5REE10myK3Nm9JB
  pelican_nlp/extraction/test_documents/wallace_1_4.txt,sha256=95Z7gS92KERCocrbOAFbJntf5QoE-6p0GL67XQEffqI,3963
  pelican_nlp/metrics_statistics/embeddings_metrics_statistics.py,sha256=svXXyLEA62mLa0KUfSiOSFFMjYk17K7BJbxUoLf0l9w,1468
  pelican_nlp/praat/__init__.py,sha256=uSEaUZ2nw7lH0twbRJL5BltJTJpopj5XCVhIbeM42bg,1035
- pelican_nlp/preprocessing/LPDS.py,sha256=4UWkMMSrdU-nWVi8eKiWQSGD7f7lemB42aI0fFn6ZLU,4097
+ pelican_nlp/preprocessing/LPDS.py,sha256=reaIqxDz_g7VG_J_8mCK2vcg4I1XrvZPTLgglupKiv4,4184
  pelican_nlp/preprocessing/__init__.py,sha256=ZYgOUlKPXmltYez3urPZmsAWRWSEqZ3_l_gN2aqd15s,293
  pelican_nlp/preprocessing/pipeline.py,sha256=t2zJAvZRO12MdAKQgm8XZxfZND7_8gFtzHF9Rq2L2aE,1796
  pelican_nlp/preprocessing/speaker_diarization.py,sha256=N6dZCa2AHHGw__g9e-ZUyZM_In0-nzFOkZ44cBnoKLk,1122
@@ -66,12 +67,13 @@ pelican_nlp/sample_configuration_files/config_discourse.yml,sha256=OaTCoMwhDjrOI
  pelican_nlp/sample_configuration_files/config_fluency.yml,sha256=JYpq90K4AF5TslzESJK6Nidw6-D1IiqD_6cdmlCd5-w,2990
  pelican_nlp/sample_configuration_files/config_general.yml,sha256=-GAVATlqXuQq4ANSW0JauwIGhr7ET_oZiBiM7I40AkA,3424
  pelican_nlp/utils/__init__.py,sha256=q1tGdOOj5UPRC2mGhoMUh8p4cbFCkkbD21bQaOVvFao,189
- pelican_nlp/utils/csv_functions.py,sha256=hsG73gm3Up9sAerp6gIxuNHaeP1vJj6HSh7ggVm1SSo,7272
+ pelican_nlp/utils/csv_functions.py,sha256=7X8pGh49TGZGs7h6JrJD846swCqSHL32mmXJ-8qLWPE,7774
+ pelican_nlp/utils/filename_parser.py,sha256=PGSKjiYDe_JVAFGcaYHdIYazB3p4MUiG6n8h_uZl8d8,551
  pelican_nlp/utils/sample_usage.py,sha256=W__OVMjWND-ZtxxRhfGJDHwbVpGlB-anXDxyA5P4cME,353
- pelican_nlp/utils/setup_functions.py,sha256=t4WG5qd5iYpNNBGklje_8ukwmJp_C9RMLLi7veDgNeA,3574
- pelican_nlp-0.2.7.dist-info/licenses/LICENSE,sha256=m3jshBZIXKiBX6qhmhtJcLTVJ1N6BEkQGIflneXvpYg,19336
- pelican_nlp-0.2.7.dist-info/METADATA,sha256=YyZBYza89dtKbvLLHXkxOEZ1BODloXBjh-zZSODLfVI,6155
- pelican_nlp-0.2.7.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
- pelican_nlp-0.2.7.dist-info/entry_points.txt,sha256=znlG0paAfju9P10UM3rm5HcCHoj4tarTllNpeaqH_gc,53
- pelican_nlp-0.2.7.dist-info/top_level.txt,sha256=F0qlyqy5FCd3sTS_npUYPeLKN9_BZq6wD4qo9pI0xbg,12
- pelican_nlp-0.2.7.dist-info/RECORD,,
+ pelican_nlp/utils/setup_functions.py,sha256=Ovd3VMCRpVg_BU8gcF6rGc9mp0zsD2iqJRqRB61lxOg,4529
+ pelican_nlp-0.3.1.dist-info/licenses/LICENSE,sha256=m3jshBZIXKiBX6qhmhtJcLTVJ1N6BEkQGIflneXvpYg,19336
+ pelican_nlp-0.3.1.dist-info/METADATA,sha256=sgUAHpBqowrsg_yFXs6-HDSgI77js6uqf8josFxjpcM,6593
+ pelican_nlp-0.3.1.dist-info/WHEEL,sha256=pxyMxgL8-pra_rKaQ4drOZAegBVuX-G_4nRHjjgWbmo,91
+ pelican_nlp-0.3.1.dist-info/entry_points.txt,sha256=znlG0paAfju9P10UM3rm5HcCHoj4tarTllNpeaqH_gc,53
+ pelican_nlp-0.3.1.dist-info/top_level.txt,sha256=F0qlyqy5FCd3sTS_npUYPeLKN9_BZq6wD4qo9pI0xbg,12
+ pelican_nlp-0.3.1.dist-info/RECORD,,
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (78.1.0)
+ Generator: setuptools (79.0.0)
  Root-Is-Purelib: true
  Tag: py3-none-any
 