PyPI - pelican-nlp - Versions diffs - 0.3.2__py3-none-any.whl → 0.3.4__py3-none-any.whl - Mend

pelican-nlp 0.3.2__py3-none-any.whl → 0.3.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36) hide show

pelican_nlp/_version.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = "0.3.2"
1	+ __version__ = "0.3.4"

pelican_nlp/cli.py CHANGED Viewed

@@ -1,17 +1,35 @@
 import os
+from pathlib import Path
 from pelican_nlp.main import Pelican
+from pelican_nlp.config import RUN_TESTS, run_tests
 def main():
-    config_files = [f for f in os.listdir(".") if f.endswith(".yml")]
+    # Run tests if enabled
+    if RUN_TESTS:
+        print("Running tests...")
+        run_tests()
+        return
+    # Look for configuration files in the current working directory
+    config_dir = Path.cwd()
+    print(f"Looking for configuration files in: {config_dir}")
+    config_files = [f for f in os.listdir(config_dir) if f.endswith((".yml", ".yaml"))]
     if not config_files:
-        print("No .yml configuration file found in the current directory.")
+        print("No .yml or .yaml configuration file found in the current directory.")
+        print("Please ensure you have a configuration file in your current working directory.")
         return
     if len(config_files) > 1:
-        print("More than one configuration file found - remove unneeded files from project directory")
+        print("Warning: Multiple configuration files found in current directory:")
+        for i, file in enumerate(config_files, 1):
+            print(f"  {i}. {file}")
+        print("Please ensure only one configuration file is present in the current directory.")
         return
-    config_file = config_files[0]  # You could also add logic to choose or validate
+    config_file = str(config_dir / config_files[0])
     print(f"Using configuration file: {config_file}")
     pelican = Pelican(config_file)

pelican_nlp/config.py CHANGED Viewed

@@ -6,9 +6,30 @@ For consistency of pipeline, DO NOT CHANGE.
 """
 # Debug flag
-DEBUG_MODE = False
+DEBUG_MODE = True
+# Test flag - set to True to run all example tests
+RUN_TESTS = False
 def debug_print(*args, **kwargs):
     """Print only if debug mode is enabled."""
+    DEBUG_MODE = True
     if DEBUG_MODE:
-        print(*args, **kwargs)
+        print(*args, **kwargs)
+def run_tests():
+    """Run all example tests if RUN_TESTS is enabled."""
+    if RUN_TESTS:
+        import unittest
+        from pathlib import Path
+        # Get the path to the test file
+        test_file = Path(__file__).parent / "utils" / "unittests" / "test_examples.py"
+        # Create a test suite and add the test file
+        loader = unittest.TestLoader()
+        suite = loader.discover(str(test_file.parent), pattern="test_examples.py")
+        # Run the tests
+        runner = unittest.TextTestRunner(verbosity=2)
+        runner.run(suite)

pelican_nlp/extraction/extract_embeddings.py CHANGED Viewed

@@ -35,13 +35,60 @@ class EmbeddingsExtractor:
                         outputs = self.model_instance(input_ids=inputs['input_ids'])
                     else:
                         # Handle RoBERTa and other models that accept **inputs
-                        outputs = self.model_instance(**inputs)
+                        if isinstance(inputs, dict):
+                            # Ensure inputs are on the same device as the model
+                            inputs = {k: v.to(self.model_instance.device) for k, v in inputs.items()}
+                            debug_print(f"Model inputs: {inputs}")
+                            outputs = self.model_instance(**inputs, output_hidden_states=True)
+                        else:
+                            debug_print(f"Input type: {type(inputs)}")
+                            debug_print(f"Input content: {inputs}")
+                            # If inputs is a list of strings, convert to token IDs first
+                            if isinstance(inputs, list):
+                                if isinstance(inputs[0], str):
+                                    # Convert tokens to IDs
+                                    token_ids = self.Tokenizer.tokenizer.convert_tokens_to_ids(inputs)
+                                    debug_print(f"Token IDs: {token_ids}")
+                                    inputs = torch.tensor([token_ids], device=self.model_instance.device)
+                                else:
+                                    # If it's already a list of numbers, convert directly
+                                    inputs = torch.tensor([inputs], device=self.model_instance.device)
+                            else:
+                                # If it's already a tensor, just move to device
+                                inputs = inputs.to(self.model_instance.device)
+                            debug_print(f"Final tensor shape: {inputs.shape}")
+                            # Ensure proper shape
+                            if len(inputs.shape) == 1:
+                                inputs = inputs.unsqueeze(0)  # Add batch dimension
+                            # Create attention mask
+                            attention_mask = torch.ones_like(inputs)
+                            debug_print(f"Model inputs - input_ids: {inputs.shape}, attention_mask: {attention_mask.shape}")
+                            outputs = self.model_instance(input_ids=inputs, attention_mask=attention_mask, output_hidden_states=True)
+                            debug_print(f"Model outputs type: {type(outputs)}")
+                            debug_print(f"Model outputs attributes: {dir(outputs)}")
                 # Get word embeddings (last hidden state)
-                word_embeddings = outputs.last_hidden_state
+                if outputs is None:
+                    raise ValueError("Model returned None output")
+                if hasattr(outputs, 'hidden_states') and outputs.hidden_states is not None:
+                    word_embeddings = outputs.hidden_states[-1]
+                    debug_print(f"Using hidden_states, shape: {word_embeddings.shape}")
+                elif hasattr(outputs, 'last_hidden_state'):
+                    word_embeddings = outputs.last_hidden_state
+                    debug_print(f"Using last_hidden_state, shape: {word_embeddings.shape}")
+                else:
+                    raise ValueError(f"Model output has neither hidden_states nor last_hidden_state. Available attributes: {dir(outputs)}")
                 # Extract input_ids and convert them back to tokens
-                input_ids = inputs['input_ids'][0].tolist()
+                if isinstance(inputs, dict):
+                    input_ids = inputs['input_ids'][0].tolist()
+                else:
+                    input_ids = inputs[0].tolist()
                 tokens = self.Tokenizer.tokenizer.convert_ids_to_tokens(input_ids)
                 # Now align the tokens and embeddings

pelican_nlp/extraction/extract_logits.py CHANGED Viewed

@@ -22,7 +22,12 @@ class LogitsExtractor:
         chunk_size = self.options['chunk_size']
         overlap_size = self.options['overlap_size']
-        input_ids = tokens.to(self.device)
+        # Convert list of token IDs to tensor if needed
+        if isinstance(tokens, list):
+            input_ids = torch.tensor([tokens], device=self.device)
+        else:
+            input_ids = tokens.to(self.device)
         chunks = self._split_into_chunks(input_ids, chunk_size, overlap_size)
         per_token_data = []

pelican_nlp/extraction/language_model.py CHANGED Viewed

@@ -4,7 +4,7 @@ import os
 import shutil
 from accelerate import init_empty_weights, infer_auto_device_map, dispatch_model
-from transformers import AutoModelForCausalLM
+from transformers import AutoModelForCausalLM, AutoModelForMaskedLM, AutoModel
 class Model:
     def __init__(self, model_name, project_path):
@@ -75,7 +75,6 @@ class Model:
             print(f'FastText model loaded successfully from {model_path}')
         elif self.model_name == 'xlm-roberta-base':
-            from transformers import AutoModel
             self.model_instance = AutoModel.from_pretrained(
                 self.model_name,
                 trust_remote_code=trust_remote_code,

pelican_nlp/main.py CHANGED Viewed

@@ -25,9 +25,10 @@ from pelican_nlp.utils.setup_functions import subject_instantiator, load_config,
 from pelican_nlp.preprocessing import LPDS
 from pelican_nlp.utils.filename_parser import parse_lpds_filename
-from pelican_nlp.config import debug_print
+from pelican_nlp.config import debug_print, RUN_TESTS, run_tests
 project_path = '/home/yvespauli/PycharmProjects/PyPI_testing_fluency/config_fluency.yml'
+#project_path = '/home/yvespauli/PycharmProjects/PyPI_testing_discourse/config_discourse.yml'
 class Pelican:
@@ -40,12 +41,12 @@ class Pelican:
         # If no config path is provided, use the default config from package; used for dev-mode
         if config_path is None:
             package_dir = Path(__file__).parent
-            default_config = package_dir / 'configuration_files' / 'config_fluency.yml'
+            default_config = package_dir / 'sample_configuration_files' / 'config_fluency.yml'
             if default_config.exists():
                 config_path = str(default_config)
                 print(f"Using default configuration file: {config_path}")
             else:
-                sys.exit('Error: Default configuration file not found in package.')
+                sys.exit('Error: Default configuration file not found in sample_configuration_files folder.')
         # Verify the provided path is a YAML file
         elif not config_path.endswith(('.yml', '.yaml')):
@@ -72,12 +73,6 @@ class Pelican:
         """Execute the main processing pipeline."""
         self._clear_gpu_memory()
-        '''
-        #run unittests in dev_mode; not yet implemented
-        if self.dev_mode:
-            self._run_tests()
-        '''
         self._handle_output_directory()
         # Check/Create LPDS
@@ -229,4 +224,8 @@ class Pelican:
 if __name__ == '__main__':
-    Pelican(project_path, dev_mode=True).run()
+    if RUN_TESTS:
+        print("Running tests...")
+        run_tests()
+    else:
+        Pelican(project_path, dev_mode=True).run()

pelican_nlp/preprocessing/text_tokenizer.py CHANGED Viewed

@@ -24,7 +24,8 @@ class TextTokenizer:
             # Tokenize using the model's tokenizer
             return self.tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=self.max_sequence_length).to(self.device_used)
         elif method == 'model':
-            return self.tokenizer.encode(text, return_tensors='pt')
+            # For model method, return token IDs directly
+            return self.tokenizer.encode(text, add_special_tokens=True)
         else:
             raise ValueError(f"Unsupported tokenization method: {method}")
@@ -34,10 +35,14 @@ class TextTokenizer:
     def get_tokenizer(self):
         if self.tokenization_method == 'model' or self.tokenization_method == 'model_roberta':
             from transformers import AutoTokenizer
+            if not self.model_name:
+                raise ValueError("model_name must be provided for model-based tokenization methods")
             return AutoTokenizer.from_pretrained(
                 self.model_name,
                 trust_remote_code=False,  # Don't execute arbitrary model code
                 use_safetensors=True
             )
-        else:
+        elif self.tokenization_method == 'whitespace':
             return None
+        else:
+            raise ValueError(f"Unsupported tokenization method: {self.tokenization_method}")

pelican_nlp/project_graph/graph_visualization.py ADDED Viewed

@@ -0,0 +1,109 @@
+#!/usr/bin/env python3
+"""
+Graph Visualization for Pelican-nlp Project
+===========================================
+This script creates a visual representation of the Pelican-nlp project structure
+using graphviz.
+"""
+from graphviz import Digraph
+def create_pelican_graph():
+    # Create a new directed graph
+    dot = Digraph(comment='Pelican-nlp Project Structure')
+    dot.attr(rankdir='TB')
+    # Set node styles
+    dot.attr('node', shape='box', style='rounded,filled')
+    # Main Components
+    with dot.subgraph(name='cluster_main') as c:
+        c.attr(label='Main Components')
+        c.attr('node', fillcolor='lightblue')
+        c.node('Pelican', 'Pelican\n(Main Controller)')
+        c.node('LPDS', 'LPDS\n(Data Structure)')
+        c.node('Corpus', 'Corpus\n(Document Collection)')
+        c.node('Subject', 'Subject\n(Grouping Unit)')
+        c.node('Document', 'Document\n(Data Container)')
+        c.node('AudioDocument', 'AudioDocument\n(Audio Data)')
+    # Core Processing
+    with dot.subgraph(name='cluster_core') as c:
+        c.attr(label='Core Processing')
+        c.attr('node', fillcolor='lightgreen')
+        c.node('Config', 'Configuration\n(config.py)')
+        c.node('CLI', 'Command Line Interface\n(cli.py)')
+        c.node('Main', 'Main Entry Point\n(main.py)')
+    # Preprocessing Components
+    with dot.subgraph(name='cluster_preprocessing') as c:
+        c.attr(label='Preprocessing')
+        c.attr('node', fillcolor='lightyellow')
+        c.node('TextTokenizer', 'Text Tokenizer\n(text_tokenizer.py)')
+        c.node('TextNormalizer', 'Text Normalizer\n(text_normalizer.py)')
+        c.node('TextCleaner', 'Text Cleaner\n(text_cleaner.py)')
+        c.node('TextImporter', 'Text Importer\n(text_importer.py)')
+        c.node('SpeakerDiarization', 'Speaker Diarization\n(speaker_diarization.py)')
+        c.node('Pipeline', 'Preprocessing Pipeline\n(pipeline.py)')
+    # Extraction Components
+    with dot.subgraph(name='cluster_extraction') as c:
+        c.attr(label='Feature Extraction')
+        c.attr('node', fillcolor='lightpink')
+        c.node('LogitsExtractor', 'Logits Extractor\n(extract_logits.py)')
+        c.node('EmbeddingsExtractor', 'Embeddings Extractor\n(extract_embeddings.py)')
+        c.node('LanguageModel', 'Language Model\n(language_model.py)')
+        c.node('AcousticFeatures', 'Acoustic Features\n(acoustic_feature_extraction.py)')
+        c.node('SemanticSimilarity', 'Semantic Similarity\n(semantic_similarity.py)')
+        c.node('RandomnessDistance', 'Distance from Randomness\n(distance_from_randomness.py)')
+    # Utility Components
+    with dot.subgraph(name='cluster_utils') as c:
+        c.attr(label='Utilities')
+        c.attr('node', fillcolor='lightgrey')
+        c.node('FilenameParser', 'Filename Parser\n(filename_parser.py)')
+        c.node('CSVFunctions', 'CSV Functions\n(csv_functions.py)')
+        c.node('SetupFunctions', 'Setup Functions\n(setup_functions.py)')
+    # Main Relationships
+    dot.edge('Pelican', 'LPDS', 'manages')
+    dot.edge('Pelican', 'Corpus', 'processes')
+    dot.edge('Pelican', 'Subject', 'instantiates')
+    dot.edge('Corpus', 'Document', 'contains')
+    dot.edge('Subject', 'Document', 'groups')
+    dot.edge('Document', 'AudioDocument', 'extends')
+    # Core Processing Relationships
+    dot.edge('CLI', 'Main', 'calls')
+    dot.edge('Main', 'Pelican', 'instantiates')
+    dot.edge('Pelican', 'Config', 'uses')
+    # Preprocessing Relationships
+    dot.edge('Pipeline', 'TextTokenizer', 'uses')
+    dot.edge('Pipeline', 'TextNormalizer', 'uses')
+    dot.edge('Pipeline', 'TextCleaner', 'uses')
+    dot.edge('Pipeline', 'TextImporter', 'uses')
+    dot.edge('Pipeline', 'SpeakerDiarization', 'uses')
+    dot.edge('Corpus', 'Pipeline', 'executes')
+    # Extraction Relationships
+    dot.edge('Corpus', 'LogitsExtractor', 'uses')
+    dot.edge('Corpus', 'EmbeddingsExtractor', 'uses')
+    dot.edge('LogitsExtractor', 'LanguageModel', 'uses')
+    dot.edge('EmbeddingsExtractor', 'LanguageModel', 'uses')
+    dot.edge('Corpus', 'AcousticFeatures', 'uses')
+    dot.edge('Corpus', 'SemanticSimilarity', 'uses')
+    dot.edge('Corpus', 'RandomnessDistance', 'uses')
+    # Utility Relationships
+    dot.edge('Pelican', 'FilenameParser', 'uses')
+    dot.edge('Corpus', 'CSVFunctions', 'uses')
+    dot.edge('Pelican', 'SetupFunctions', 'uses')
+    # Save the graph
+    dot.render('pelican_structure_detailed', format='png', cleanup=True)
+    print("Detailed graph visualization has been created as 'pelican_structure_detailed.png'")
+if __name__ == '__main__':
+    create_pelican_graph()

pelican_nlp/sample_configuration_files/config_discourse.yml CHANGED Viewed

@@ -7,12 +7,15 @@ discourse: &discourse_flag true
 #general configurations; always adapt
 language: "german" # Possibly add options for German and English
-task_name: "interview" # Give name of task used for creation of the input file (e.g., ['fluency', 'interview'])
-corpus_names:
+task_name: "interview"
+#Create analysis corpus, group files based on corpus entity.
+corpus_key: "acq"
+corpus_values: #group names
   - "placebo"
   - "schizophrenia"
-metric_to_extract: "embeddings" #Possible options: 'logits' or 'embeddings'
+metric_to_extract: "logits" #Possible options: 'logits' or 'embeddings'
 number_of_speakers: 3
 subject_speakertag: "B"
@@ -24,7 +27,7 @@ cleaning_options:
   general_cleaning: true # General cleaning options used for most text preprocessing, default: True.
   remove_brackets_and_bracketcontent: true
   remove_timestamps: true
-  timestamp_pattern_example: "#00:00:19-0#"
+  timestamp_pattern_example: "#00:00:19-00#"
   remove_punctuation: false
   lowercase: false
   #Options for fluency tasks
@@ -44,10 +47,10 @@ options_logits:
   keep_speakertags: true
 options_embeddings:
-  tokenization_method: "whitespace" #"model" or "whitespace"
+  tokenization_method: "model" #"model" or "whitespace"
   max_length: 512 #max sequence length
-  model_name: "fastText" #e.g. "fastText", "xlm-roberta-base"
-  pytorch_based_model: false
+  model_name: "xlm-roberta-base" #e.g. "fastText", "xlm-roberta-base"
+  pytorch_based_model: true
   method: "model_instance"
   remove_punctuation: false
   lowercase: false
@@ -60,6 +63,7 @@ options_embeddings:
     remove_punctuation_and_symbols: true
     remove_brackets_and_content: true
   semantic-similarity: false
+  distance-from-randomness: false
   window_size: null
   clean_tokens: false
   divergence_from_optimality: false
@@ -93,6 +97,9 @@ normalization_options:
   method: "lemmatization" #Options: lemmatization or stemming
 #================================================================
+create_aggregation_of_results: false
+output_document_information: false
 #Detail configurations; Changes optional, mostly used for quality checking / error handling
 number_of_subjects: null # Specify number of subjects; if 'null', number of subjects is automatically detected
 multiple_sessions: false # Set to True if multiple sessions per subject

pelican_nlp/sample_configuration_files/config_fluency.yml CHANGED Viewed

@@ -8,7 +8,8 @@ fluency_task: &fluency_flag true
 language: "german"
 multiple_sessions: &session_flag false
-corpus_names: #names of fluency tasks (e.g. "animals", "clothes")
+corpus_key: "acq"
+corpus_values: #names of fluency tasks (e.g. "animals", "clothes")
   - "animals"
   - "clothes"
   - "food"
@@ -103,5 +104,3 @@ filename_components:
   metric: true
   additional_tags: []

pelican_nlp/sample_configuration_files/config_general.yml CHANGED Viewed

@@ -5,33 +5,38 @@
 # -------------
 input_file: "text"  # Options: 'text' or 'audio'
 language: "german"
-recompute_everything: true #If false will give warning if output folder already exists
+recompute_everything: true  # If false will give warning if output folder already exists
 # Task Configuration
 # -----------------
-task_name: # Name of task used for creation of data
+task_name: null  # Name of task used for creation of data
 fluency_task: &fluency_flag false  # Flag for fluency-specific settings
 discourse: &discourse_flag false  # Flag for discourse-specific settings
-corpus_names:  # List of task corpora
+# Corpus Configuration
+# ------------------
+corpus_key: null # Entity key to group files for analysis
+corpus_values:  # Corresponding entity values found in dataset
   - "healthy-control"
+  - "placebo"
 # Session and Subject Settings
 # --------------------------
 multiple_sessions: false
 number_of_subjects: null  # If null, auto-detected
-number_of_speakers: 1
+number_of_speakers: 1 # Specify amount of speakers for discourse files
 subject_speakertag: null  # Speaker tag for subject (e.g., "B"), only for discourse
 # Document Structure
 # ----------------
 has_multiple_sections: false
 has_section_titles: false
-section_identification: null  # e.g., "Section:"
-number_of_sections: 1  # If null, auto-detected
+section_identification: null  # e.g., "Section:", in case of multiple sections
+number_of_sections: null  # If null, auto-detected, specify for multiple sections to check section detection
 # Processing Pipeline
 # -----------------
-pipeline_options:
+pipeline_options: # Just for data preprocessing without metric extraction
   quality_check: false
   clean_text: true
   tokenize_text: false
@@ -40,18 +45,17 @@ pipeline_options:
 # Metric Extraction
 # ---------------
 metric_to_extract: "embeddings"  # Options: 'embeddings', 'logits'
-extract_logits: null
-extract_embeddings: true
+output_document_information: true
 # Cleaning Options
 # --------------
 cleaning_options:
-  general_cleaning: true
+  general_cleaning: true # General cleaning applied to most datasets, check specifications in section "general_cleaning_options"
   remove_punctuation: false
   lowercase: true
   remove_brackets_and_bracketcontent: false
   remove_timestamps: false
-  timestamp_pattern_example: null  # e.g., "#00:00:23-00#"
+  timestamp_pattern_example: null  # e.g., "#00:00:23-00#", only if remove_timestamps = True
   # Fluency-specific options
   fluency_task: *fluency_flag
   word_splitter: ';'
@@ -74,8 +78,8 @@ options_embeddings:
   method: "model_instance"
   max_length: 512
   clean_embedding_tokens: true
-  remove_punctuation: false
-  lowercase: false
+  remove_punctuation_embeddings: false
+  lowercase_embeddings: false
   keep_speakertags: false
   semantic-similarity: true
   window_size: null
@@ -118,6 +122,14 @@ options_dis_from_randomness:
 normalization_options:
   method: "lemmatization"  # Options: 'lemmatization', 'stemming'
+# Document Information Output
+# -------------------------
+document_information_output:
+  parameters:
+    - subject_ID
+    - fluency_word_count
+    - fluency_duplicate_count
 # Filename Configuration
 # --------------------
 filename_components:
@@ -125,6 +137,10 @@ filename_components:
   session: false
   task: true       # mandatory
   task_addition: false
-  corpus: true     #mandatory
+  corpus: true     # mandatory
   metric: true
   additional_tags: []
+# Additional Settings
+# -----------------
+create_aggregation_of_results: true

pelican_nlp/utils/setup_functions.py CHANGED Viewed

@@ -4,7 +4,7 @@ import yaml
 import sys
 from pelican_nlp.core.subject import Subject
 from .filename_parser import parse_lpds_filename
-from ..config import debug_print
+from pelican_nlp.config import debug_print
 def subject_instantiator(config, project_folder):

pelican_nlp/utils/unittests/examples/example_discourse/config_discourse.yml ADDED Viewed

@@ -0,0 +1,109 @@
+# Configuration file for discourse protocols
+#=======================================
+input_file: "text" #or 'audio'
+discourse: &discourse_flag true
+#=====================================
+#general configurations; always adapt
+language: "german" # Possibly add options for German and English
+task_name: "interview"
+#Create analysis corpus, group files based on corpus entity.
+corpus_key: "acq"
+corpus_values: #group names
+  - "placebo"
+  - "schizophrenia"
+metric_to_extract: "embeddings" #Possible options: 'logits' or 'embeddings'
+number_of_speakers: 2
+subject_speakertag: "B"
+#=========================================================
+#Optional configurations; Change with preference. However, default settings recommended
+fluency_task: &fluency_flag false
+cleaning_options:
+  general_cleaning: true # General cleaning options used for most text preprocessing, default: True.
+  remove_brackets_and_bracketcontent: true
+  remove_timestamps: true
+  timestamp_pattern_example: "#00:00:19-00#"
+  remove_punctuation: false
+  lowercase: false
+  #Options for fluency tasks
+  fluency_task: *fluency_flag
+  word_splitter: null
+  remove_hyphens: null
+  remove_duplicates: null
+options_logits:
+  chunk_size: 128
+  overlap_size: 64
+  tokenization_method: "model"
+  #method: "model_instance" # Options: model, regex, nltk, etc.
+  model_name: "xlm-roberta-base" #Replace with your model instance name, e.g. "DiscoResearch/Llama3-German-8B-32k"
+  remove_punctuation: true
+  lowercase: true
+  keep_speakertags: true
+options_embeddings:
+  tokenization_method: "model" #"model" or "whitespace"
+  max_length: 512 #max sequence length
+  model_name: "xlm-roberta-base" #e.g. "fastText", "xlm-roberta-base"
+  pytorch_based_model: true
+  method: "model_instance"
+  remove_punctuation: false
+  lowercase: false
+  keep_speakertags: true
+  clean_embedding_tokens: true
+  output_options:
+    exclude_special_tokens: true
+    remove_'_'_character: true
+    remove_speaker_labels: true
+    remove_punctuation_and_symbols: true
+    remove_brackets_and_content: true
+  semantic-similarity: false
+  distance-from-randomness: false
+  window_size: null
+  clean_tokens: false
+  divergence_from_optimality: false
+#================================================================================
+#Extra configurations:
+pipeline_options:
+  quality_check: false
+  clean_text: true
+  tokenize_text: false
+  normalize_text: false
+general_cleaning_options:
+  strip_whitespace: true
+  merge_multiple_whitespaces: true
+  remove_whitespace_before_punctuation: true
+  merge_newline_characters: true
+  remove_backslashes: true
+has_multiple_sections: false #evaluated independently
+has_section_titles: false
+section_identification: null #e.g. "Section:", 'null' if file does not have multiple sections, use pattern that is unlikely to appear in rest of transcript
+number_of_sections: null #if 'null' number of sections automatically detected, however, specifying number recommended if known.
+# Options for extract_embeddings
+window_sizes: [2]
+metric_function: cosine_similarity
+aggregation_functions: mean_of_means
+normalization_options:
+  method: "lemmatization" #Options: lemmatization or stemming
+#================================================================
+create_aggregation_of_results: false
+output_document_information: false
+#Detail configurations; Changes optional, mostly used for quality checking / error handling
+number_of_subjects: null # Specify number of subjects; if 'null', number of subjects is automatically detected
+multiple_sessions: false # Set to True if multiple sessions per subject
+recompute_everything: true #If set to 'false' pelican-nlp will try to reuse previously computed results stored on your drive

pelican-nlp 0.3.2__py3-none-any.whl → 0.3.4__py3-none-any.whl

pelican-nlp 0.3.2__py3-none-any.whl → 0.3.4__py3-none-any.whl