pelican-nlp 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pelican_nlp/__init__.py +9 -0
- pelican_nlp/core/__init__.py +5 -0
- pelican_nlp/core/audio_document.py +20 -0
- pelican_nlp/core/corpus.py +296 -0
- pelican_nlp/core/document.py +226 -0
- pelican_nlp/core/subject.py +30 -0
- pelican_nlp/extraction/__init__.py +2 -0
- pelican_nlp/extraction/acoustic_feature_extraction.py +71 -0
- pelican_nlp/extraction/distance_from_randomness.py +109 -0
- pelican_nlp/extraction/extract_embeddings.py +57 -0
- pelican_nlp/extraction/extract_logits.py +102 -0
- pelican_nlp/extraction/language_model.py +71 -0
- pelican_nlp/extraction/semantic_similarity.py +60 -0
- pelican_nlp/extraction/test_documents/test_features.csv +4 -0
- pelican_nlp/extraction/test_documents/wallace_1.15_3.txt +1 -0
- pelican_nlp/extraction/test_documents/wallace_1.1_3.txt +1 -0
- pelican_nlp/extraction/test_documents/wallace_1_4.txt +1 -0
- pelican_nlp/main.py +211 -0
- pelican_nlp/metrics_statistics/embeddings_metrics_statistics.py +34 -0
- pelican_nlp/preprocessing/LPDS.py +77 -0
- pelican_nlp/preprocessing/__init__.py +7 -0
- pelican_nlp/preprocessing/pipeline.py +50 -0
- pelican_nlp/preprocessing/speaker_diarization.py +33 -0
- pelican_nlp/preprocessing/text_cleaner.py +224 -0
- pelican_nlp/preprocessing/text_importer.py +42 -0
- pelican_nlp/preprocessing/text_normalizer.py +24 -0
- pelican_nlp/preprocessing/text_tokenizer.py +43 -0
- pelican_nlp/sample_configuration_files/config_discourse.yml +103 -0
- pelican_nlp/sample_configuration_files/config_fluency.yml +108 -0
- pelican_nlp/sample_configuration_files/config_general.yml +131 -0
- pelican_nlp/utils/__init__.py +3 -0
- pelican_nlp/utils/csv_functions.py +193 -0
- pelican_nlp/utils/sample_usage.py +17 -0
- pelican_nlp/utils/setup_functions.py +93 -0
- pelican_nlp-0.1.0.dist-info/METADATA +146 -0
- pelican_nlp-0.1.0.dist-info/RECORD +39 -0
- pelican_nlp-0.1.0.dist-info/WHEEL +5 -0
- pelican_nlp-0.1.0.dist-info/licenses/LICENSE +400 -0
- pelican_nlp-0.1.0.dist-info/top_level.txt +1 -0
pelican_nlp/preprocessing/text_importer.py
@@ -0,0 +1,42 @@
import os

class TextImporter:
    def __init__(self, file_path):
        self.file_path = file_path

    def load_text(self, file_path):
        # Supported file formats: .txt, .docx and .rtf; expand if necessary
        ext = os.path.splitext(file_path)[-1].lower()

        if ext == '.txt':
            return self._load_txt(file_path)
        elif ext == '.docx':
            return self._load_docx(file_path)
        elif ext == '.rtf':
            return self._load_rtf(file_path)
        else:
            raise ValueError(f"Unsupported file format: {ext}")

    def _load_txt(self, file_path):
        with open(file_path, 'r') as file:
            return file.read()

    def _load_docx(self, file_path):
        import docx2txt
        doc = docx2txt.process(file_path)
        return doc
        #return '\n'.join([para.text for para in doc.paragraphs])

    def _load_rtf(self, file_path):
        """Read an RTF file and convert its content to plain text."""
        from striprtf.striprtf import rtf_to_text
        import chardet

        with open(file_path, "rb") as file:
            raw_data = file.read()
            result = chardet.detect(raw_data)
            encoding = result["encoding"]

        with open(file_path, "r", encoding=encoding, errors="ignore") as file:
            rtf_content = file.read()

        return rtf_to_text(rtf_content)
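A minimal usage sketch for the importer above, assuming the module path from the file list; the transcript path is a placeholder:

from pelican_nlp.preprocessing.text_importer import TextImporter

# Placeholder path; any .txt, .docx, or .rtf transcript works.
importer = TextImporter("data/subject-01_interview.txt")
raw_text = importer.load_text(importer.file_path)
print(raw_text[:200])  # first 200 characters of the imported transcript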
pelican_nlp/preprocessing/text_normalizer.py
@@ -0,0 +1,24 @@
import spacy

class TextNormalizer:
    def __init__(self, options):
        self.options = options
        self.nlp = None  # Initialize as None, load only when needed

    def _load_model(self):
        """Load spaCy model if not already loaded."""
        if self.nlp is None:
            self.nlp = spacy.load('de_core_news_sm')

    def normalize(self, tokens):
        method = self.options.get('method')

        if method == 'lemmatization':
            self._load_model()  # Load model only when lemmatization is needed
            return [self.nlp(token)[0].lemma_ for token in tokens]
        elif method == 'stemming':
            self._load_model()  # Load model only when stemming is needed
            doc = self.nlp(" ".join(tokens))
            # Note: '._.stemmed' is a custom attribute; it requires a registered spaCy token extension.
            return [token._.stemmed for token in doc]
        else:
            raise ValueError(f"Unsupported normalization method: {method}")
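A short lemmatization sketch for the normalizer above; it assumes the German spaCy model is installed, and the exact lemmas depend on that model:

from pelican_nlp.preprocessing.text_normalizer import TextNormalizer

# Requires the German spaCy model: python -m spacy download de_core_news_sm
normalizer = TextNormalizer({'method': 'lemmatization'})
print(normalizer.normalize(['Häuser', 'liefen', 'Kinder']))
# -> lemmata such as ['Haus', 'laufen', 'Kind'] (exact output depends on the model)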
pelican_nlp/preprocessing/text_tokenizer.py
@@ -0,0 +1,43 @@
import torch

class TextTokenizer:
    def __init__(self, method, model_name=None, max_length=None):
        self.tokenization_method = method
        self.model_name = model_name
        self.max_sequence_length = max_length

        self.tokenizer = self.get_tokenizer()

        self.device_used = 'cuda' if torch.cuda.is_available() else 'cpu'

    def tokenize_text(self, text):
        method = self.tokenization_method

        if not isinstance(text, str):
            raise ValueError(f"Text to tokenize must be a string, but it is of type {type(text)}")

        if method == 'whitespace':
            # Tokenize by whitespace
            return text.split()
        elif method == 'model_roberta':
            # Tokenize using the model's tokenizer
            return self.tokenizer(text, return_tensors='pt', truncation=True, padding=True,
                                  max_length=self.max_sequence_length).to(self.device_used)
        elif method == 'model':
            return self.tokenizer.encode(text, return_tensors='pt')
        else:
            raise ValueError(f"Unsupported tokenization method: {method}")

    def convert_ids_to_tokens(self, ids):
        return self.tokenizer.convert_ids_to_tokens(ids)

    def get_tokenizer(self):
        if self.tokenization_method in ('model', 'model_roberta'):
            from transformers import AutoTokenizer
            return AutoTokenizer.from_pretrained(
                self.model_name,
                trust_remote_code=False,  # Don't execute arbitrary model code
                use_safetensors=True
            )
        else:
            return None
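A usage sketch for the tokenizer above; the whitespace method needs no model, while the model-based methods assume a Hugging Face model name such as xlm-roberta-base:

from pelican_nlp.preprocessing.text_tokenizer import TextTokenizer

# Whitespace tokenization requires no model download.
ws_tokenizer = TextTokenizer(method='whitespace')
print(ws_tokenizer.tokenize_text("der Hund läuft schnell"))
# -> ['der', 'Hund', 'läuft', 'schnell']

# Model-based tokenization (downloads the tokenizer on first use):
# model_tokenizer = TextTokenizer(method='model', model_name='xlm-roberta-base', max_length=512)
# ids = model_tokenizer.tokenize_text("der Hund läuft schnell")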
pelican_nlp/sample_configuration_files/config_discourse.yml
@@ -0,0 +1,103 @@
# Configuration file for discourse protocols
#=======================================
input_file: "text" #or 'audio'
discourse: &discourse_flag true
#=====================================

#general configurations; always adapt
PATH_TO_PROJECT_FOLDER: "/path/to/your/project"
language: "german" # Possibly add options for German and English

task_name: "interview" # Name of the task used to create the input file (e.g., 'fluency', 'interview')
corpus_names:
  - "placebo"
  - "schizophrenia"

metric_to_extract: "embeddings" #Possible options: 'logits' or 'embeddings'

number_of_speakers: 3
subject_speakertag: "B"
#=========================================================

#Optional configurations; change with preference. However, default settings are recommended.
fluency_task: &fluency_flag false
cleaning_options:
  general_cleaning: true # General cleaning options used for most text preprocessing, default: true
  remove_brackets_and_bracketcontent: true
  remove_timestamps: true
  timestamp_pattern_example: "#00:00:19-0#"
  remove_punctuation: false
  lowercase: false
  #Options for fluency tasks
  fluency_task: *fluency_flag
  word_splitter: null
  remove_hyphens: null
  remove_duplicates: null

options_logits:
  chunk_size: 128
  overlap_size: 64
  tokenization_method: "model"
  #method: "model_instance" # Options: model, regex, nltk, etc.
  model_name: "DiscoResearch/Llama3-German-8B-32k" # Replace with your model instance name
  remove_punctuation: true
  lowercase: true
  keep_speakertags: true

options_embeddings:
  tokenization_method: "whitespace" #"model" or "whitespace"
  max_length: 512 #max sequence length
  model_name: "fastText" #e.g. "fastText", "xlm-roberta-base"
  pytorch_based_model: false
  method: "model_instance"
  remove_punctuation: false
  lowercase: false
  keep_speakertags: true
  clean_embedding_tokens: true
  output_options:
    exclude_special_tokens: true
    remove_'_'_character: true
    remove_speaker_labels: true
    remove_punctuation_and_symbols: true
    remove_brackets_and_content: true
  semantic-similarity: false
  window_size: null
  clean_tokens: false
  divergence_from_optimality: false
#================================================================================

#Extra configurations:
pipeline_options:
  quality_check: false
  clean_text: true
  tokenize_text: false
  normalize_text: false

general_cleaning_options:
  strip_whitespace: true
  merge_multiple_whitespaces: true
  remove_whitespace_before_punctuation: true
  merge_newline_characters: true
  remove_backslashes: true

has_multiple_sections: false #evaluated independently
has_section_titles: false
section_identification: null #e.g. "Section:"; use null if the file has no multiple sections, and choose a pattern unlikely to appear elsewhere in the transcript
number_of_sections: null #if null, the number of sections is detected automatically; specifying it is recommended when known

# Options for extract_embeddings
window_sizes: [2]
metric_function: cosine_similarity
aggregation_functions: mean_of_means

normalization_options:
  method: "lemmatization" #Options: lemmatization or stemming
#================================================================

#Detail configurations; changes optional, mostly used for quality checking / error handling
number_of_subjects: null # Specify the number of subjects; if null, it is detected automatically
multiple_sessions: false # Set to true if there are multiple sessions per subject

recompute_everything: true #If set to false, pelican-nlp will try to reuse previously computed results stored on your drive
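A sketch of how such a configuration file can be loaded and inspected with PyYAML; the path is a placeholder and pelican-nlp's own loading code may differ:

import yaml

# Placeholder path to the sample configuration shown above.
with open("config_discourse.yml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

print(config["metric_to_extract"])                  # "embeddings"
print(config["corpus_names"])                       # ["placebo", "schizophrenia"]
print(config["options_embeddings"]["model_name"])   # "fastText"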
pelican_nlp/sample_configuration_files/config_fluency.yml
@@ -0,0 +1,108 @@
# Configuration file for fluency task
# =======================================
input_file: "text" #or 'audio'
fluency_task: &fluency_flag true
#========================================

#general configurations; always adapt
PATH_TO_PROJECT_FOLDER: "/path/to/your/project"
language: "german"
multiple_sessions: &session_flag false

corpus_names: #names of fluency tasks (e.g. "animals", "clothes")
  - "animals"
  - "clothes"
  - "food"

#Specify linguistic metrics to extract
metric_to_extract: 'embeddings' #Possible options: 'embeddings', 'logits'
output_document_information: true
#====================================================================

#Optional configurations; change with preference. However, default settings are recommended.
cleaning_options:
  general_cleaning: true
  #Options for fluency tasks
  fluency_task: *fluency_flag
  word_splitter: ';' #default split is ','; set a different word_splitter if necessary
  remove_hyphens: true
  remove_duplicates: false
  lowercase: false
  #Optional cleaning
  remove_brackets_and_bracketcontent: false #default 'false'
  remove_timestamps: false #default 'false'
  timestamp_pattern_example: null #e.g. "#00:00:23-00#"
  remove_punctuation: false #Careful: if set to true, the word_splitter might be removed

options_embeddings:
  tokenization_method: "whitespace" #or "model"
  model_name: "fastText" #e.g. "fastText", "xlm-roberta-base"
  pytorch_based_model: false
  method: "model_instance"
  max_length: null
  clean_embedding_tokens: true

  semantic-similarity: true
  distance-from-randomness: false

options_dis_from_randomness:
  window_size: 8
  min_len: null
  bootstrap: 10000
  shuffle_mode: 'include0_includeN'
  parallel_computing: false #not yet set up

options_semantic-similarity:
  window_sizes: #'all' or window size as integer
    - 2
    - 8
#==================================================================

#Extra configurations;
task_name: "fluency"
create_aggregation_of_results: true

pipeline_options:
  quality_check: false
  clean_text: true
  tokenize_text: false
  normalize_text: false

general_cleaning_options:
  strip_whitespace: true
  merge_multiple_whitespaces: true
  remove_whitespace_before_punctuation: true
  merge_newline_characters: true
  remove_backslashes: true

has_multiple_sections: false
has_section_titles: false
section_identification: null
number_of_sections: 1
number_of_speakers: 1
discourse: false

document_information_output:
  parameters:
    - subject_ID
    - fluency_word_count
    - fluency_duplicate_count

#================================================================

#Detail configurations; changes optional, mostly used for quality checking / error handling
recompute_everything: true
number_of_subjects: null

# Filename components configuration
filename_components:
  subject: true # mandatory
  session: *session_flag
  task: true # mandatory
  task_addition: false
  corpus: true # typically true for fluency tasks (e.g., "animals", "clothes")
  metric: true
  additional_tags: []
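The fluency configuration relies on YAML anchors (&fluency_flag, &session_flag) so that a single flag propagates to several places. A small stand-alone sketch of how the aliases resolve when parsed; the snippet below is illustrative, not pelican-nlp code:

import yaml

snippet = """
fluency_task: &fluency_flag true
multiple_sessions: &session_flag false
cleaning_options:
  fluency_task: *fluency_flag
filename_components:
  session: *session_flag
"""
parsed = yaml.safe_load(snippet)
print(parsed["cleaning_options"]["fluency_task"])   # True, copied from the anchor
print(parsed["filename_components"]["session"])     # False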
pelican_nlp/sample_configuration_files/config_general.yml
@@ -0,0 +1,131 @@
# Master Configuration File
# ========================

# Basic Settings
# -------------
input_file: "text" # Options: 'text' or 'audio'
PATH_TO_PROJECT_FOLDER: "/path/to/your/project"
language: "german"
recompute_everything: true # If false, reuses previously computed results

# Task Configuration
# -----------------
task_name: # Name of task used for creation of data
fluency_task: &fluency_flag false # Flag for fluency-specific settings
discourse: &discourse_flag false # Flag for discourse-specific settings
corpus_names: # List of task corpora
  - "healthy-control"

# Session and Subject Settings
# --------------------------
multiple_sessions: false
number_of_subjects: null # If null, auto-detected
number_of_speakers: 1
subject_speakertag: null # Speaker tag for subject (e.g., "B")

# Document Structure
# ----------------
has_multiple_sections: false
has_section_titles: false
section_identification: null # e.g., "Section:"
number_of_sections: 1 # If null, auto-detected

# Processing Pipeline
# -----------------
pipeline_options:
  quality_check: false
  clean_text: true
  tokenize_text: false
  normalize_text: false

# Metric Extraction
# ---------------
metric_to_extract: "embeddings" # Options: 'embeddings', 'logits'
extract_logits: null
extract_embeddings: true

# Cleaning Options
# --------------
cleaning_options:
  general_cleaning: true
  remove_punctuation: false
  lowercase: true
  remove_brackets_and_bracketcontent: false
  remove_timestamps: false
  timestamp_pattern_example: null # e.g., "#00:00:23-00#"
  # Fluency-specific options
  fluency_task: *fluency_flag
  word_splitter: ';'
  remove_hyphens: true
  remove_duplicates: true

general_cleaning_options:
  strip_whitespace: true
  merge_multiple_whitespaces: true
  remove_whitespace_before_punctuation: true
  merge_newline_characters: true
  remove_backslashes: true

# Embedding Options
# ---------------
options_embeddings:
  tokenization_method: "whitespace" # Options: 'whitespace', 'model'
  model_name: "fastText" # Options: 'fastText', 'xlm-roberta-base'
  pytorch_based_model: false
  method: "model_instance"
  max_length: 512
  clean_embedding_tokens: true
  remove_punctuation: false
  lowercase: false
  keep_speakertags: false
  semantic-similarity: true
  window_size: null
  clean_tokens: true
  divergence_from_optimality: false
  output_options:
    exclude_special_tokens: true
    remove_'_'_character: true
    remove_speaker_labels: true
    remove_punctuation_and_symbols: true
    remove_brackets_and_content: true

# Logits Options
# -------------
options_logits:
  chunk_size: 128
  overlap_size: 64
  tokenization_method: "model"
  model_name: "DiscoResearch/Llama3-German-8B-32k"
  remove_punctuation: true
  lowercase: true
  keep_speakertags: true

# Analysis Options
# --------------
options_semantic-similarity:
  window_sizes: # 'all' or window size as integer
    - 2
    - 8

options_dis_from_randomness:
  window_size: 8
  min_len: null
  bootstrap: 10000
  shuffle_mode: 'include0_includeN'
  parallel_computing: false

# Normalization Options
# -------------------
normalization_options:
  method: "lemmatization" # Options: 'lemmatization', 'stemming'

# Filename Configuration
# --------------------
filename_components:
  subject: true # mandatory
  session: false
  task: true # mandatory
  task_addition: false
  corpus: true # mandatory
  metric: true
  additional_tags: []
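The window_sizes and semantic-similarity options above refer to averaged cosine similarity over sliding windows of token embeddings. A toy sketch of that idea (illustrative only, with made-up 3-dimensional vectors; it loosely mirrors the window_sizes / mean_of_means settings, not the package's extraction code):

import numpy as np

def mean_window_similarity(embeddings, window_size):
    """Average pairwise cosine similarity within each sliding window, then average over windows."""
    sims_per_window = []
    for start in range(len(embeddings) - window_size + 1):
        window = embeddings[start:start + window_size]
        pair_sims = []
        for i in range(len(window)):
            for j in range(i + 1, len(window)):
                a, b = window[i], window[j]
                pair_sims.append(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
        sims_per_window.append(np.mean(pair_sims))
    return float(np.mean(sims_per_window))

# Toy "embeddings" for four tokens
toy = [np.array(v, dtype=float) for v in ([1, 0, 0], [0.9, 0.1, 0], [0, 1, 0], [0, 0.8, 0.2])]
print(mean_window_similarity(toy, window_size=2))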
pelican_nlp/utils/csv_functions.py
@@ -0,0 +1,193 @@
import os
import csv
import numpy as np

def store_features_to_csv(input_data, derivatives_dir, doc_class, metric):
    """Store various types of features to CSV files with consistent formatting.

    Args:
        input_data: The data to be stored in CSV format
        derivatives_dir: Base directory for all derivatives
        doc_class: Document class containing subject, session (optional), task, and task_addition (optional) info
        metric: Type of metric being stored
    """
    # Get the appropriate metric folder
    metric_folder = metric

    # Build base filename parts from doc_class
    filename_parts = [
        doc_class.subject_ID,
        doc_class.task,
        doc_class.corpus_name
    ]

    # Add session to filename if it exists
    if hasattr(doc_class, 'session') and doc_class.session:
        filename_parts.insert(1, doc_class.session)

    # Join the base parts with underscores
    filename = "_".join(filename_parts)

    # Add task_addition with underscore if it exists
    if hasattr(doc_class, 'task_addition') and doc_class.task_addition:
        filename += f"_{doc_class.task_addition}"

    # Add the metric with an underscore
    filename += f"_{metric}.csv"

    # Build the full path
    path_components = [
        derivatives_dir,
        metric_folder,
        doc_class.subject_ID,
    ]

    # Add session to path if it exists
    if hasattr(doc_class, 'session') and doc_class.session:
        path_components.append(doc_class.session)

    path_components.append(doc_class.task)

    # Create directory and get final filepath
    final_results_path = os.path.join(*path_components)
    os.makedirs(final_results_path, exist_ok=True)

    output_filepath = os.path.join(final_results_path, filename)
    file_exists = os.path.exists(output_filepath)

    # Write data based on metric type
    with open(output_filepath, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)

        if metric == 'embeddings':
            if not isinstance(input_data, list) or not input_data:
                raise ValueError("Input data must be a non-empty list of tuples.")

            # Get the dimensionality from the first embedding
            embedding_dim = len(input_data[0][1])
            header = ['Token'] + [f"Dim_{i}" for i in range(embedding_dim)]
            _write_csv_header(writer, header, file_exists)

            for token, embedding in input_data:
                # Handle both list and tensor/array types
                if hasattr(embedding, 'tolist'):
                    embedding_list = embedding.tolist()
                elif isinstance(embedding, list):
                    embedding_list = embedding
                else:
                    raise ValueError(f"Embedding must be either a list or have tolist() method, got {type(embedding)}")
                writer.writerow([token] + embedding_list)

        elif metric == 'cosine-similarity-matrix':
            _write_csv_header(writer, ['Matrix'], file_exists)
            for row in input_data:
                writer.writerow(row)

        elif metric.startswith('semantic-similarity-window-'):
            header = ['Metric', 'Similarity_Score']
            _write_csv_header(writer, header, file_exists)

            for metric_name, score in input_data.items():
                writer.writerow([metric_name, score])

        elif metric == 'distance-from-randomness':
            header = ['window_index', 'all_pairs_average', 'actual_dist', 'average_dist', 'std_dist']
            _write_csv_header(writer, header, file_exists)

            # Input data is a dictionary with 'section' key containing list of window results
            for window_result in input_data['section']:
                writer.writerow([
                    window_result['window_index'],
                    window_result['all_pairs_average'],
                    window_result['actual_dist'],
                    window_result['average_dist'],
                    window_result['std_dist']
                ])

        elif metric == 'logits':
            if not input_data:
                return
            header = list(input_data[0].keys())
            _write_csv_header(writer, header, file_exists)

            for entry in input_data:
                writer.writerow(entry.values())

        elif metric == 'opensmile-features':
            if not input_data:
                return

            # Get all column names from the first entry
            csv_columns = list(input_data[0].keys()) if isinstance(input_data, list) else list(input_data.keys())

            # Only write header if file doesn't exist
            if not file_exists:
                writer.writerow(csv_columns)

            # Handle both list of dictionaries and single dictionary cases
            if isinstance(input_data, list):
                for entry in input_data:
                    # Create a new array for the row data
                    row_data = []
                    for column in csv_columns:
                        # Convert numerical values to float
                        value = entry[column]
                        if isinstance(value, (int, float)):
                            value = float(value)
                        row_data.append(value)
                    writer.writerow(row_data)
            else:
                # Handle single dictionary case
                row_data = []
                for column in csv_columns:
                    value = input_data[column]
                    if isinstance(value, (int, float)):
                        value = float(value)
                    row_data.append(value)
                writer.writerow(row_data)


def _build_filename_parts(path_parts, corpus, metric, config=None):
    """Helper function to build filename components."""
    filename_config = config.get('filename_components', {}) if config else {}

    # Extract mandatory components
    if len(path_parts) < 3:
        raise ValueError("Invalid path format. Expected at least 'project/subject/task'.")

    subject = path_parts[-3]
    task = path_parts[-1]

    # Build filename components
    parts = [subject]

    # Add optional session
    if filename_config.get('session', False) and len(path_parts) >= 4:
        parts.append(path_parts[-3])

    parts.append(task)

    # Add optional components
    if filename_config.get('corpus', True):
        parts.append(corpus)
    parts.extend(filename_config.get('additional_tags', []))
    parts.append(metric)

    return parts


def _get_metric_folder(metric):
    """Determine the appropriate metric folder."""
    if metric.startswith('semantic-similarity') or metric in ['consecutive-similarities', 'cosine-similarity-matrix']:
        return 'semantic-similarity'
    return 'embeddings'


def _write_csv_header(writer, header, file_exists):
    """Write CSV header with section separation if file exists."""
    if not file_exists:
        writer.writerow(header)
    else:
        writer.writerow([])  # Separate sections
        writer.writerow(['New Section'])
        writer.writerow(header)
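A sketch of how store_features_to_csv can be called; the document object below is a hypothetical stand-in exposing only the attributes the function reads (subject_ID, task, corpus_name, session, task_addition), not the package's actual Document class:

from types import SimpleNamespace
from pelican_nlp.utils.csv_functions import store_features_to_csv

# Hypothetical stand-in for the package's document class.
doc = SimpleNamespace(subject_ID="sub-01", task="fluency", corpus_name="animals",
                      session=None, task_addition=None)

embeddings = [("hund", [0.12, 0.53, 0.91]), ("katze", [0.10, 0.48, 0.87])]
store_features_to_csv(embeddings, "derivatives", doc, metric="embeddings")
# -> derivatives/embeddings/sub-01/fluency/sub-01_fluency_animals_embeddings.csv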
pelican_nlp/utils/sample_usage.py
@@ -0,0 +1,17 @@
import pelican

file_path = 'your/file/path'

#return preprocessed transcript
preprocessed_files = pelican.preprocess(
    file_path=file_path,
    task='image_descriptions',
    general_cleaning=True,
    lowercase=True
)

#return embeddings from transcript
file_embeddings = pelican.extract_embeddings(
    file_path=file_path,
    mode="example_mode"
)