pelican-nlp 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pelican_nlp/_version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.3.1"
+ __version__ = "0.3.3"
pelican_nlp/cli.py CHANGED
@@ -1,17 +1,33 @@
  import os
+ from pathlib import Path
  from pelican_nlp.main import Pelican
+ from pelican_nlp.config import RUN_TESTS, run_tests

  def main():
-     config_files = [f for f in os.listdir(".") if f.endswith(".yml")]
+     # Run tests if enabled
+     if RUN_TESTS:
+         print("Running tests...")
+         run_tests()
+         return
+
+     # Get the package directory's sample_configuration_files folder
+     package_dir = Path(__file__).parent
+     config_dir = package_dir / 'sample_configuration_files'
+
+     if not config_dir.exists():
+         print("sample_configuration_files directory not found in package directory.")
+         return
+
+     config_files = [f for f in os.listdir(config_dir) if f.endswith(".yml")]
      if not config_files:
-         print("No .yml configuration file found in the current directory.")
+         print("No .yml configuration file found in the sample_configuration_files directory.")
          return

      if len(config_files) > 1:
-         print("More than one configuration file found - remove unneeded files from project directory")
+         print("More than one configuration file found in sample_configuration_files directory - please specify which one to use")
          return

-     config_file = config_files[0]  # You could also add logic to choose or validate
+     config_file = str(config_dir / config_files[0])
      print(f"Using configuration file: {config_file}")

      pelican = Pelican(config_file)
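The rewritten CLI no longer scans the current working directory; it resolves exactly one .yml file from the package's bundled sample_configuration_files folder and refuses to guess when the choice is ambiguous. A minimal standalone sketch of that resolution logic (resolve_sample_config is an illustrative helper name, not part of the package):

import os
from pathlib import Path
from typing import Optional

def resolve_sample_config(package_dir: Path) -> Optional[str]:
    # Mirrors the new CLI behaviour: require exactly one .yml file
    # inside <package>/sample_configuration_files.
    config_dir = package_dir / "sample_configuration_files"
    if not config_dir.exists():
        return None
    candidates = [f for f in os.listdir(config_dir) if f.endswith(".yml")]
    if len(candidates) != 1:
        return None  # none found, or more than one: caller must decide
    return str(config_dir / candidates[0])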
pelican_nlp/config.py CHANGED
@@ -6,9 +6,30 @@ For consistency of pipeline, DO NOT CHANGE.
  """

  # Debug flag
- DEBUG_MODE = False
+ DEBUG_MODE = True
+
+ # Test flag - set to True to run all example tests
+ RUN_TESTS = False

  def debug_print(*args, **kwargs):
      """Print only if debug mode is enabled."""
+     DEBUG_MODE = True
      if DEBUG_MODE:
-         print(*args, **kwargs)
+         print(*args, **kwargs)
+
+ def run_tests():
+     """Run all example tests if RUN_TESTS is enabled."""
+     if RUN_TESTS:
+         import unittest
+         from pathlib import Path
+
+         # Get the path to the test file
+         test_file = Path(__file__).parent / "utils" / "unittests" / "test_examples.py"
+
+         # Create a test suite and add the test file
+         loader = unittest.TestLoader()
+         suite = loader.discover(str(test_file.parent), pattern="test_examples.py")
+
+         # Run the tests
+         runner = unittest.TextTestRunner(verbosity=2)
+         runner.run(suite)
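Two behavioural notes follow from this hunk: RUN_TESTS still ships as False, so the new test path is opt-in, and the DEBUG_MODE = True assignment inside debug_print shadows the module-level flag, so debug output is effectively always on in this release. The discovery that run_tests performs can also be reproduced directly with unittest's public API; a sketch assuming the package is installed and importable:

import unittest
from pathlib import Path

import pelican_nlp

# Point the loader at the bundled unittests directory, matching run_tests().
tests_dir = Path(pelican_nlp.__file__).parent / "utils" / "unittests"
suite = unittest.TestLoader().discover(str(tests_dir), pattern="test_examples.py")
unittest.TextTestRunner(verbosity=2).run(suite)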
pelican_nlp/extraction/extract_embeddings.py CHANGED
@@ -35,13 +35,60 @@ class EmbeddingsExtractor:
              outputs = self.model_instance(input_ids=inputs['input_ids'])
          else:
              # Handle RoBERTa and other models that accept **inputs
-             outputs = self.model_instance(**inputs)
+             if isinstance(inputs, dict):
+                 # Ensure inputs are on the same device as the model
+                 inputs = {k: v.to(self.model_instance.device) for k, v in inputs.items()}
+                 debug_print(f"Model inputs: {inputs}")
+                 outputs = self.model_instance(**inputs, output_hidden_states=True)
+             else:
+                 debug_print(f"Input type: {type(inputs)}")
+                 debug_print(f"Input content: {inputs}")
+
+                 # If inputs is a list of strings, convert to token IDs first
+                 if isinstance(inputs, list):
+                     if isinstance(inputs[0], str):
+                         # Convert tokens to IDs
+                         token_ids = self.Tokenizer.tokenizer.convert_tokens_to_ids(inputs)
+                         debug_print(f"Token IDs: {token_ids}")
+                         inputs = torch.tensor([token_ids], device=self.model_instance.device)
+                     else:
+                         # If it's already a list of numbers, convert directly
+                         inputs = torch.tensor([inputs], device=self.model_instance.device)
+                 else:
+                     # If it's already a tensor, just move to device
+                     inputs = inputs.to(self.model_instance.device)
+
+                 debug_print(f"Final tensor shape: {inputs.shape}")
+
+                 # Ensure proper shape
+                 if len(inputs.shape) == 1:
+                     inputs = inputs.unsqueeze(0)  # Add batch dimension
+
+                 # Create attention mask
+                 attention_mask = torch.ones_like(inputs)
+                 debug_print(f"Model inputs - input_ids: {inputs.shape}, attention_mask: {attention_mask.shape}")
+                 outputs = self.model_instance(input_ids=inputs, attention_mask=attention_mask, output_hidden_states=True)
+                 debug_print(f"Model outputs type: {type(outputs)}")
+                 debug_print(f"Model outputs attributes: {dir(outputs)}")

          # Get word embeddings (last hidden state)
-         word_embeddings = outputs.last_hidden_state
+         if outputs is None:
+             raise ValueError("Model returned None output")
+
+         if hasattr(outputs, 'hidden_states') and outputs.hidden_states is not None:
+             word_embeddings = outputs.hidden_states[-1]
+             debug_print(f"Using hidden_states, shape: {word_embeddings.shape}")
+         elif hasattr(outputs, 'last_hidden_state'):
+             word_embeddings = outputs.last_hidden_state
+             debug_print(f"Using last_hidden_state, shape: {word_embeddings.shape}")
+         else:
+             raise ValueError(f"Model output has neither hidden_states nor last_hidden_state. Available attributes: {dir(outputs)}")

          # Extract input_ids and convert them back to tokens
-         input_ids = inputs['input_ids'][0].tolist()
+         if isinstance(inputs, dict):
+             input_ids = inputs['input_ids'][0].tolist()
+         else:
+             input_ids = inputs[0].tolist()
          tokens = self.Tokenizer.tokenizer.convert_ids_to_tokens(input_ids)

          # Now align the tokens and embeddings
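The reworked branch asks the model for hidden states explicitly and prefers outputs.hidden_states[-1], falling back to last_hidden_state. The same pattern in isolation, as a sketch with an illustrative Hugging Face encoder:

import torch
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
model = AutoModel.from_pretrained("xlm-roberta-base")

inputs = tokenizer("ein kurzer Beispielsatz", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs, output_hidden_states=True)

# For standard encoders hidden_states[-1] equals last_hidden_state;
# the fallback keeps models that expose only one of the two working.
if getattr(outputs, "hidden_states", None) is not None:
    word_embeddings = outputs.hidden_states[-1]
else:
    word_embeddings = outputs.last_hidden_state
print(word_embeddings.shape)  # (batch, sequence_length, hidden_size)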
pelican_nlp/extraction/extract_logits.py CHANGED
@@ -22,7 +22,12 @@ class LogitsExtractor:
      chunk_size = self.options['chunk_size']
      overlap_size = self.options['overlap_size']

-     input_ids = tokens.to(self.device)
+     # Convert list of token IDs to tensor if needed
+     if isinstance(tokens, list):
+         input_ids = torch.tensor([tokens], device=self.device)
+     else:
+         input_ids = tokens.to(self.device)
+
      chunks = self._split_into_chunks(input_ids, chunk_size, overlap_size)

      per_token_data = []
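With this change LogitsExtractor accepts either a tensor or the plain list of token IDs that the updated tokenizer now returns; wrapping the list in another list adds the batch dimension. A compact sketch with hypothetical IDs:

import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
tokens = [0, 1010, 41380, 2]  # hypothetical output of tokenizer.encode(...)

if isinstance(tokens, list):
    input_ids = torch.tensor([tokens], device=device)  # shape (1, seq_len)
else:
    input_ids = tokens.to(device)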
pelican_nlp/extraction/language_model.py CHANGED
@@ -4,7 +4,7 @@ import os
  import shutil

  from accelerate import init_empty_weights, infer_auto_device_map, dispatch_model
- from transformers import AutoModelForCausalLM
+ from transformers import AutoModelForCausalLM, AutoModelForMaskedLM, AutoModel

  class Model:
      def __init__(self, model_name, project_path):
@@ -75,7 +75,6 @@ class Model:

          print(f'FastText model loaded successfully from {model_path}')
      elif self.model_name == 'xlm-roberta-base':
-         from transformers import AutoModel
          self.model_instance = AutoModel.from_pretrained(
              self.model_name,
              trust_remote_code=trust_remote_code,
pelican_nlp/main.py CHANGED
@@ -25,9 +25,10 @@ from pelican_nlp.utils.setup_functions import subject_instantiator, load_config,
  from pelican_nlp.preprocessing import LPDS
  from pelican_nlp.utils.filename_parser import parse_lpds_filename

- from config import debug_print
+ from pelican_nlp.config import debug_print, RUN_TESTS, run_tests

  project_path = '/home/yvespauli/PycharmProjects/PyPI_testing_fluency/config_fluency.yml'
+ #project_path = '/home/yvespauli/PycharmProjects/PyPI_testing_discourse/config_discourse.yml'

  class Pelican:

@@ -40,12 +41,12 @@ class Pelican:
          # If no config path is provided, use the default config from package; used for dev-mode
          if config_path is None:
              package_dir = Path(__file__).parent
-             default_config = package_dir / 'configuration_files' / 'config_fluency.yml'
+             default_config = package_dir / 'sample_configuration_files' / 'config_fluency.yml'
              if default_config.exists():
                  config_path = str(default_config)
                  print(f"Using default configuration file: {config_path}")
              else:
-                 sys.exit('Error: Default configuration file not found in package.')
+                 sys.exit('Error: Default configuration file not found in sample_configuration_files folder.')

          # Verify the provided path is a YAML file
          elif not config_path.endswith(('.yml', '.yaml')):
@@ -72,12 +73,6 @@ class Pelican:
          """Execute the main processing pipeline."""
          self._clear_gpu_memory()

-         '''
-         #run unittests in dev_mode; not yet implemented
-         if self.dev_mode:
-             self._run_tests()
-         '''
-
          self._handle_output_directory()

          # Check/Create LPDS
@@ -229,4 +224,8 @@ class Pelican:


  if __name__ == '__main__':
-     Pelican(project_path, dev_mode=True).run()
+     if RUN_TESTS:
+         print("Running tests...")
+         run_tests()
+     else:
+         Pelican(project_path, dev_mode=True).run()
pelican_nlp/preprocessing/text_tokenizer.py CHANGED
@@ -24,7 +24,8 @@ class TextTokenizer:
          # Tokenize using the model's tokenizer
          return self.tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=self.max_sequence_length).to(self.device_used)
      elif method == 'model':
-         return self.tokenizer.encode(text, return_tensors='pt')
+         # For model method, return token IDs directly
+         return self.tokenizer.encode(text, add_special_tokens=True)
      else:
          raise ValueError(f"Unsupported tokenization method: {method}")

@@ -34,10 +35,14 @@ class TextTokenizer:
      def get_tokenizer(self):
          if self.tokenization_method == 'model' or self.tokenization_method == 'model_roberta':
              from transformers import AutoTokenizer
+             if not self.model_name:
+                 raise ValueError("model_name must be provided for model-based tokenization methods")
              return AutoTokenizer.from_pretrained(
                  self.model_name,
                  trust_remote_code=False,  # Don't execute arbitrary model code
                  use_safetensors=True
              )
-         else:
+         elif self.tokenization_method == 'whitespace':
              return None
+         else:
+             raise ValueError(f"Unsupported tokenization method: {self.tokenization_method}")
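The practical difference for the 'model' method: encode(..., add_special_tokens=True) yields a plain Python list of token IDs, while the old return_tensors='pt' call produced a batched tensor. A quick comparison, with an illustrative tokenizer:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("xlm-roberta-base")

ids = tok.encode("ein Beispiel", add_special_tokens=True)
# new behaviour: list of ints, e.g. [0, ..., 2], including special tokens

tensor = tok.encode("ein Beispiel", return_tensors="pt")
# old behaviour: torch.Tensor of shape (1, seq_len)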
pelican_nlp/project_graph/graph_visualization.py ADDED
@@ -0,0 +1,109 @@
+ #!/usr/bin/env python3
+ """
+ Graph Visualization for Pelican-nlp Project
+ ===========================================
+
+ This script creates a visual representation of the Pelican-nlp project structure
+ using graphviz.
+ """
+
+ from graphviz import Digraph
+
+ def create_pelican_graph():
+     # Create a new directed graph
+     dot = Digraph(comment='Pelican-nlp Project Structure')
+     dot.attr(rankdir='TB')
+
+     # Set node styles
+     dot.attr('node', shape='box', style='rounded,filled')
+
+     # Main Components
+     with dot.subgraph(name='cluster_main') as c:
+         c.attr(label='Main Components')
+         c.attr('node', fillcolor='lightblue')
+         c.node('Pelican', 'Pelican\n(Main Controller)')
+         c.node('LPDS', 'LPDS\n(Data Structure)')
+         c.node('Corpus', 'Corpus\n(Document Collection)')
+         c.node('Subject', 'Subject\n(Grouping Unit)')
+         c.node('Document', 'Document\n(Data Container)')
+         c.node('AudioDocument', 'AudioDocument\n(Audio Data)')
+
+     # Core Processing
+     with dot.subgraph(name='cluster_core') as c:
+         c.attr(label='Core Processing')
+         c.attr('node', fillcolor='lightgreen')
+         c.node('Config', 'Configuration\n(config.py)')
+         c.node('CLI', 'Command Line Interface\n(cli.py)')
+         c.node('Main', 'Main Entry Point\n(main.py)')
+
+     # Preprocessing Components
+     with dot.subgraph(name='cluster_preprocessing') as c:
+         c.attr(label='Preprocessing')
+         c.attr('node', fillcolor='lightyellow')
+         c.node('TextTokenizer', 'Text Tokenizer\n(text_tokenizer.py)')
+         c.node('TextNormalizer', 'Text Normalizer\n(text_normalizer.py)')
+         c.node('TextCleaner', 'Text Cleaner\n(text_cleaner.py)')
+         c.node('TextImporter', 'Text Importer\n(text_importer.py)')
+         c.node('SpeakerDiarization', 'Speaker Diarization\n(speaker_diarization.py)')
+         c.node('Pipeline', 'Preprocessing Pipeline\n(pipeline.py)')
+
+     # Extraction Components
+     with dot.subgraph(name='cluster_extraction') as c:
+         c.attr(label='Feature Extraction')
+         c.attr('node', fillcolor='lightpink')
+         c.node('LogitsExtractor', 'Logits Extractor\n(extract_logits.py)')
+         c.node('EmbeddingsExtractor', 'Embeddings Extractor\n(extract_embeddings.py)')
+         c.node('LanguageModel', 'Language Model\n(language_model.py)')
+         c.node('AcousticFeatures', 'Acoustic Features\n(acoustic_feature_extraction.py)')
+         c.node('SemanticSimilarity', 'Semantic Similarity\n(semantic_similarity.py)')
+         c.node('RandomnessDistance', 'Distance from Randomness\n(distance_from_randomness.py)')
+
+     # Utility Components
+     with dot.subgraph(name='cluster_utils') as c:
+         c.attr(label='Utilities')
+         c.attr('node', fillcolor='lightgrey')
+         c.node('FilenameParser', 'Filename Parser\n(filename_parser.py)')
+         c.node('CSVFunctions', 'CSV Functions\n(csv_functions.py)')
+         c.node('SetupFunctions', 'Setup Functions\n(setup_functions.py)')
+
+     # Main Relationships
+     dot.edge('Pelican', 'LPDS', 'manages')
+     dot.edge('Pelican', 'Corpus', 'processes')
+     dot.edge('Pelican', 'Subject', 'instantiates')
+     dot.edge('Corpus', 'Document', 'contains')
+     dot.edge('Subject', 'Document', 'groups')
+     dot.edge('Document', 'AudioDocument', 'extends')
+
+     # Core Processing Relationships
+     dot.edge('CLI', 'Main', 'calls')
+     dot.edge('Main', 'Pelican', 'instantiates')
+     dot.edge('Pelican', 'Config', 'uses')
+
+     # Preprocessing Relationships
+     dot.edge('Pipeline', 'TextTokenizer', 'uses')
+     dot.edge('Pipeline', 'TextNormalizer', 'uses')
+     dot.edge('Pipeline', 'TextCleaner', 'uses')
+     dot.edge('Pipeline', 'TextImporter', 'uses')
+     dot.edge('Pipeline', 'SpeakerDiarization', 'uses')
+     dot.edge('Corpus', 'Pipeline', 'executes')
+
+     # Extraction Relationships
+     dot.edge('Corpus', 'LogitsExtractor', 'uses')
+     dot.edge('Corpus', 'EmbeddingsExtractor', 'uses')
+     dot.edge('LogitsExtractor', 'LanguageModel', 'uses')
+     dot.edge('EmbeddingsExtractor', 'LanguageModel', 'uses')
+     dot.edge('Corpus', 'AcousticFeatures', 'uses')
+     dot.edge('Corpus', 'SemanticSimilarity', 'uses')
+     dot.edge('Corpus', 'RandomnessDistance', 'uses')
+
+     # Utility Relationships
+     dot.edge('Pelican', 'FilenameParser', 'uses')
+     dot.edge('Corpus', 'CSVFunctions', 'uses')
+     dot.edge('Pelican', 'SetupFunctions', 'uses')
+
+     # Save the graph
+     dot.render('pelican_structure_detailed', format='png', cleanup=True)
+     print("Detailed graph visualization has been created as 'pelican_structure_detailed.png'")
+
+ if __name__ == '__main__':
+     create_pelican_graph()
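This new script needs the graphviz Python bindings plus a system Graphviz installation (the dot executable); render(..., cleanup=True) writes the PNG and deletes the intermediate DOT source. A minimal usage sketch, assuming the module is importable as packaged:

# pip install graphviz  (and a system Graphviz install, e.g. apt install graphviz)
from pelican_nlp.project_graph.graph_visualization import create_pelican_graph

create_pelican_graph()  # writes pelican_structure_detailed.png to the working directory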
pelican_nlp/sample_configuration_files/config_discourse.yml CHANGED
@@ -7,12 +7,15 @@ discourse: &discourse_flag true
  #general configurations; always adapt
  language: "german" # Possibly add options for German and English

- task_name: "interview" # Give name of task used for creation of the input file (e.g., ['fluency', 'interview'])
- corpus_names:
+ task_name: "interview"
+
+ #Create analysis corpus, group files based on corpus entity.
+ corpus_key: "acq"
+ corpus_values: #group names
    - "placebo"
    - "schizophrenia"

- metric_to_extract: "embeddings" #Possible options: 'logits' or 'embeddings'
+ metric_to_extract: "logits" #Possible options: 'logits' or 'embeddings'

  number_of_speakers: 3
  subject_speakertag: "B"
@@ -24,7 +27,7 @@ cleaning_options:
    general_cleaning: true # General cleaning options used for most text preprocessing, default: True.
    remove_brackets_and_bracketcontent: true
    remove_timestamps: true
-   timestamp_pattern_example: "#00:00:19-0#"
+   timestamp_pattern_example: "#00:00:19-00#"
    remove_punctuation: false
    lowercase: false
    #Options for fluency tasks
@@ -44,10 +47,10 @@ options_logits:
    keep_speakertags: true

  options_embeddings:
-   tokenization_method: "whitespace" #"model" or "whitespace"
+   tokenization_method: "model" #"model" or "whitespace"
    max_length: 512 #max sequence length
-   model_name: "fastText" #e.g. "fastText", "xlm-roberta-base"
-   pytorch_based_model: false
+   model_name: "xlm-roberta-base" #e.g. "fastText", "xlm-roberta-base"
+   pytorch_based_model: true
    method: "model_instance"
    remove_punctuation: false
    lowercase: false
@@ -60,6 +63,7 @@ options_embeddings:
      remove_punctuation_and_symbols: true
      remove_brackets_and_content: true
    semantic-similarity: false
+   distance-from-randomness: false
    window_size: null
    clean_tokens: false
    divergence_from_optimality: false
@@ -93,6 +97,9 @@ normalization_options:
    method: "lemmatization" #Options: lemmatization or stemming
  #================================================================

+ create_aggregation_of_results: false
+ output_document_information: false
+
  #Detail configurations; Changes optional, mostly used for quality checking / error handling
  number_of_subjects: null # Specify number of subjects; if 'null', number of subjects is automatically detected
  multiple_sessions: false # Set to True if multiple sessions per subject
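The corpus_names list is replaced by a corpus_key/corpus_values pair: instead of naming corpora directly, files are grouped by the value of one filename entity (here acq). A rough sketch of that grouping, assuming LPDS-style underscore-separated key-value filenames (the actual parsing lives in parse_lpds_filename):

# Illustrative only: group files by the configured corpus entity.
filenames = [
    "sub-01_task-interview_acq-placebo_transcript.txt",
    "sub-02_task-interview_acq-schizophrenia_transcript.txt",
]
corpus_key, corpus_values = "acq", ["placebo", "schizophrenia"]

corpora = {value: [] for value in corpus_values}
for name in filenames:
    stem = name.rsplit("_", 1)[0]  # drop the trailing suffix segment
    entities = dict(part.split("-", 1) for part in stem.split("_"))
    if entities.get(corpus_key) in corpora:
        corpora[entities[corpus_key]].append(name)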
pelican_nlp/sample_configuration_files/config_fluency.yml CHANGED
@@ -8,7 +8,8 @@ fluency_task: &fluency_flag true
  language: "german"
  multiple_sessions: &session_flag false

- corpus_names: #names of fluency tasks (e.g. "animals", "clothes")
+ corpus_key: "acq"
+ corpus_values: #names of fluency tasks (e.g. "animals", "clothes")
    - "animals"
    - "clothes"
    - "food"
@@ -103,5 +104,3 @@ filename_components:
    metric: true
    additional_tags: []

-
-
pelican_nlp/sample_configuration_files/config_general.yml CHANGED
@@ -5,33 +5,38 @@
  # -------------
  input_file: "text" # Options: 'text' or 'audio'
  language: "german"
- recompute_everything: true #If false will give warning if output folder already exists
+ recompute_everything: true # If false will give warning if output folder already exists

  # Task Configuration
  # -----------------
- task_name: # Name of task used for creation of data
+ task_name: null # Name of task used for creation of data
  fluency_task: &fluency_flag false # Flag for fluency-specific settings
  discourse: &discourse_flag false # Flag for discourse-specific settings
- corpus_names: # List of task corpora
+
+ # Corpus Configuration
+ # ------------------
+ corpus_key: null # Entity key to group files for analysis
+ corpus_values: # Corresponding entity values found in dataset
    - "healthy-control"
+   - "placebo"

  # Session and Subject Settings
  # --------------------------
  multiple_sessions: false
  number_of_subjects: null # If null, auto-detected
- number_of_speakers: 1
+ number_of_speakers: 1 # Specify amount of speakers for discourse files
  subject_speakertag: null # Speaker tag for subject (e.g., "B"), only for discourse

  # Document Structure
  # ----------------
  has_multiple_sections: false
  has_section_titles: false
- section_identification: null # e.g., "Section:"
- number_of_sections: 1 # If null, auto-detected
+ section_identification: null # e.g., "Section:", in case of multiple sections
+ number_of_sections: null # If null, auto-detected, specify for multiple sections to check section detection

  # Processing Pipeline
  # -----------------
- pipeline_options:
+ pipeline_options: # Just for data preprocessing without metric extraction
    quality_check: false
    clean_text: true
    tokenize_text: false
@@ -40,18 +45,17 @@ pipeline_options:
  # Metric Extraction
  # ---------------
  metric_to_extract: "embeddings" # Options: 'embeddings', 'logits'
- extract_logits: null
- extract_embeddings: true
+ output_document_information: true

  # Cleaning Options
  # --------------
  cleaning_options:
-   general_cleaning: true
+   general_cleaning: true # General cleaning applied to most datasets, check specifications in section "general_cleaning_options"
    remove_punctuation: false
    lowercase: true
    remove_brackets_and_bracketcontent: false
    remove_timestamps: false
-   timestamp_pattern_example: null # e.g., "#00:00:23-00#"
+   timestamp_pattern_example: null # e.g., "#00:00:23-00#", only if remove_timestamps = True
    # Fluency-specific options
    fluency_task: *fluency_flag
    word_splitter: ';'
@@ -74,8 +78,8 @@ options_embeddings:
    method: "model_instance"
    max_length: 512
    clean_embedding_tokens: true
-   remove_punctuation: false
-   lowercase: false
+   remove_punctuation_embeddings: false
+   lowercase_embeddings: false
    keep_speakertags: false
    semantic-similarity: true
    window_size: null
@@ -118,6 +122,14 @@ options_dis_from_randomness:
  normalization_options:
    method: "lemmatization" # Options: 'lemmatization', 'stemming'

+ # Document Information Output
+ # -------------------------
+ document_information_output:
+   parameters:
+     - subject_ID
+     - fluency_word_count
+     - fluency_duplicate_count
+
  # Filename Configuration
  # --------------------
  filename_components:
@@ -125,6 +137,10 @@ filename_components:
    session: false
    task: true # mandatory
    task_addition: false
-   corpus: true #mandatory
+   corpus: true # mandatory
    metric: true
    additional_tags: []
+
+ # Additional Settings
+ # -----------------
+ create_aggregation_of_results: true
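A quick way to check a project config against this new general template is to load it and look for the renamed keys; a sketch with an illustrative path:

import yaml

with open("config_general.yml") as f:
    config = yaml.safe_load(f)

# corpus_names was replaced by corpus_key/corpus_values in this template.
assert "corpus_names" not in config
print(config["corpus_key"], config["corpus_values"])
print(config["create_aggregation_of_results"], config["output_document_information"])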
pelican_nlp/utils/setup_functions.py CHANGED
@@ -4,7 +4,7 @@ import yaml
  import sys
  from pelican_nlp.core.subject import Subject
  from .filename_parser import parse_lpds_filename
- from ..config import debug_print
+ from pelican_nlp.config import debug_print


  def subject_instantiator(config, project_folder):
pelican_nlp/utils/unittests/examples/example_discourse/config_discourse.yml ADDED
@@ -0,0 +1,109 @@
+ # Configuration file for discourse protocols
+ #=======================================
+ input_file: "text" #or 'audio'
+ discourse: &discourse_flag true
+ #=====================================
+
+ #general configurations; always adapt
+ language: "german" # Possibly add options for German and English
+
+ task_name: "interview"
+
+ #Create analysis corpus, group files based on corpus entity.
+ corpus_key: "acq"
+ corpus_values: #group names
+   - "placebo"
+   - "schizophrenia"
+
+ metric_to_extract: "embeddings" #Possible options: 'logits' or 'embeddings'
+
+ number_of_speakers: 2
+ subject_speakertag: "B"
+ #=========================================================
+
+ #Optional configurations; Change with preference. However, default settings recommended
+ fluency_task: &fluency_flag false
+ cleaning_options:
+   general_cleaning: true # General cleaning options used for most text preprocessing, default: True.
+   remove_brackets_and_bracketcontent: true
+   remove_timestamps: true
+   timestamp_pattern_example: "#00:00:19-00#"
+   remove_punctuation: false
+   lowercase: false
+   #Options for fluency tasks
+   fluency_task: *fluency_flag
+   word_splitter: null
+   remove_hyphens: null
+   remove_duplicates: null
+
+ options_logits:
+   chunk_size: 128
+   overlap_size: 64
+   tokenization_method: "model"
+   #method: "model_instance" # Options: model, regex, nltk, etc.
+   model_name: "xlm-roberta-base" #Replace with your model instance name, e.g. "DiscoResearch/Llama3-German-8B-32k"
+   remove_punctuation: true
+   lowercase: true
+   keep_speakertags: true
+
+ options_embeddings:
+   tokenization_method: "model" #"model" or "whitespace"
+   max_length: 512 #max sequence length
+   model_name: "xlm-roberta-base" #e.g. "fastText", "xlm-roberta-base"
+   pytorch_based_model: true
+   method: "model_instance"
+   remove_punctuation: false
+   lowercase: false
+   keep_speakertags: true
+   clean_embedding_tokens: true
+   output_options:
+     exclude_special_tokens: true
+     remove_'_'_character: true
+     remove_speaker_labels: true
+     remove_punctuation_and_symbols: true
+     remove_brackets_and_content: true
+   semantic-similarity: false
+   distance-from-randomness: false
+   window_size: null
+   clean_tokens: false
+   divergence_from_optimality: false
+ #================================================================================
+
+ #Extra configurations:
+ pipeline_options:
+   quality_check: false
+   clean_text: true
+   tokenize_text: false
+   normalize_text: false
+
+ general_cleaning_options:
+   strip_whitespace: true
+   merge_multiple_whitespaces: true
+   remove_whitespace_before_punctuation: true
+   merge_newline_characters: true
+   remove_backslashes: true
+
+ has_multiple_sections: false #evaluated independently
+ has_section_titles: false
+ section_identification: null #e.g. "Section:", 'null' if file does not have multiple sections, use pattern that is unlikely to appear in rest of transcript
+ number_of_sections: null #if 'null' number of sections automatically detected, however, specifying number recommended if known.
+
+ # Options for extract_embeddings
+ window_sizes: [2]
+ metric_function: cosine_similarity
+ aggregation_functions: mean_of_means
+
+ normalization_options:
+   method: "lemmatization" #Options: lemmatization or stemming
+ #================================================================
+
+ create_aggregation_of_results: false
+ output_document_information: false
+
+ #Detail configurations; Changes optional, mostly used for quality checking / error handling
+ number_of_subjects: null # Specify number of subjects; if 'null', number of subjects is automatically detected
+ multiple_sessions: false # Set to True if multiple sessions per subject
+
+ recompute_everything: true #If set to 'false' pelican-nlp will try to reuse previously computed results stored on your drive
+
+
pelican_nlp/utils/unittests/examples/example_fluency/config_fluency.yml ADDED
@@ -0,0 +1,106 @@
+ # Configuration file for fluency task
+ # =======================================
+ input_file: "text" #or 'audio'
+ fluency_task: &fluency_flag true
+ #========================================
+
+ #general configurations; always adapt
+ language: "german"
+ multiple_sessions: &session_flag false
+
+ corpus_key: "acq"
+ corpus_values: #names of fluency tasks (e.g. "animals", "clothes")
+   - "animals"
+   - "clothes"
+   - "food"
+
+ #Specify linguistic metrics to extract
+ metric_to_extract: 'embeddings' #Possible options: 'embeddings', 'logits'
+ output_document_information: true
+ #====================================================================
+
+ #Optional configurations; Change with preference. However, default settings recommended
+ cleaning_options:
+   general_cleaning: true
+   #Options for fluency tasks
+   fluency_task: *fluency_flag
+   word_splitter: ';' #default split with ',' add different word_splitter if necessary
+   remove_hyphens: true
+   remove_duplicates: false
+   lowercase: false
+   #Optional cleaning
+   remove_brackets_and_bracketcontent: false #default 'false'
+   remove_timestamps: false #default 'false'
+   timestamp_pattern_example: null #e.g. "#00:00:23-00#"
+   remove_punctuation: false #Careful!: If set to true word_splitter might be removed
+
+ options_embeddings:
+   tokenization_method: "whitespace" #or "model"
+   model_name: "fastText" #e.g. "fastText", "xlm-roberta-base"
+   pytorch_based_model: false
+   method: "model_instance"
+   max_length: null
+   clean_embedding_tokens: true
+
+   semantic-similarity: true
+   distance-from-randomness: false
+
+ options_dis_from_randomness:
+   window_size: 8
+   min_len: null
+   bootstrap: 10000
+   shuffle_mode: 'include0_includeN'
+   parallel_computing: false #not yet set up
+
+ options_semantic-similarity:
+   window_sizes: #'all' or window size as integer
+     - 2
+     - 8
+ #==================================================================
+
+ #Extra configurations;
+ task_name: "fluency"
+ create_aggregation_of_results: true
+
+ pipeline_options:
+   quality_check: false
+   clean_text: true
+   tokenize_text: false
+   normalize_text: false
+
+ general_cleaning_options:
+   strip_whitespace: true
+   merge_multiple_whitespaces: true
+   remove_whitespace_before_punctuation: true
+   merge_newline_characters: true
+   remove_backslashes: true
+
+ has_multiple_sections: false
+ has_section_titles: false
+ section_identification: null
+ number_of_sections: 1
+ number_of_speakers: 1
+ discourse: false
+
+ document_information_output:
+   parameters:
+     - subject_ID
+     - fluency_word_count
+     - fluency_duplicate_count
+
+ #================================================================
+
+ #Detail configurations; Changes optional, mostly used for quality checking / error handling
+ recompute_everything: true
+ number_of_subjects: null
+
+ # Filename components configuration
+ filename_components:
+   subject: true # mandatory
+   session: *session_flag
+   task: true # mandatory
+   task_addition: false
+   corpus: true # typically true for fluency tasks (e.g., "animals", "clothes")
+   metric: true
+   additional_tags: []
+
pelican_nlp/utils/unittests/examples/example_image-descriptions/config_image-descriptions.yml ADDED
@@ -0,0 +1,135 @@
+ # Master Configuration File
+ # ========================
+
+ # Basic Settings
+ # -------------
+ input_file: "text" # Options: 'text' or 'audio'
+ language: "german" # Options: 'german', 'english'
+ recompute_everything: true # If false, reuses previously computed results
+
+ # Task Configuration
+ # -----------------
+ task_name: "image-description" # Options: 'fluency', 'interview'
+ fluency_task: &fluency_flag false # Flag for fluency-specific settings
+ discourse: &discourse_flag false # Flag for discourse-specific settings
+ corpus_names: # List of task corpora
+   - "placebo"
+   - "drug"
+
+ # Session and Subject Settings
+ # --------------------------
+ multiple_sessions: true
+ number_of_subjects: null # If null, auto-detected
+ number_of_speakers: 1
+ subject_speakertag: null # Speaker tag for subject (e.g., "B")
+
+ # Document Structure
+ # ----------------
+ has_multiple_sections: true
+ has_section_titles: true
+ section_identification: "Bild:" # e.g., "Section:"
+ number_of_sections: null # If null, auto-detected
+
+ # Processing Pipeline
+ # -----------------
+ pipeline_options:
+   quality_check: false
+   clean_text: true
+   tokenize_text: false
+   normalize_text: false
+
+ # Metric Extraction
+ # ---------------
+ metric_to_extract: "embeddings" # Options: 'embeddings', 'logits'
+ extract_logits: null
+ extract_embeddings: true
+
+ # Cleaning Options
+ # --------------
+ cleaning_options:
+   general_cleaning: true
+   remove_punctuation: false
+   lowercase: false
+   remove_brackets_and_bracketcontent: true
+   remove_timestamps: false
+   timestamp_pattern_example: null # e.g., "#00:00:23-00#"
+   # Fluency-specific options
+   fluency_task: *fluency_flag
+   word_splitter: ';'
+   remove_hyphens: false
+   remove_duplicates: false
+
+ general_cleaning_options:
+   strip_whitespace: true
+   merge_multiple_whitespaces: true
+   remove_whitespace_before_punctuation: true
+   merge_newline_characters: true
+   remove_backslashes: true
+
+ # Embedding Options
+ # ---------------
+ options_embeddings:
+   tokenization_method: "model_roberta" #or "whitespace", "model"
+   max_length: 512 #max sequence length
+   model_name: "xlm-roberta-base" #e.g. "fastText", "xlm-roberta-base"
+   pytorch_based_model: true
+   method: "model_instance"
+   max_length: 512
+   clean_embedding_tokens: true
+   remove_punctuation: false
+   lowercase: false
+   keep_speakertags: false
+   semantic-similarity: false
+   window_size: null
+   clean_tokens: true
+   distance-from-randomness: false
+   output_options:
+     exclude_special_tokens: true
+     remove_'_'_character: true
+     remove_speaker_labels: true
+     remove_punctuation_and_symbols: true
+     remove_brackets_and_content: true
+
+ # Logits Options
+ # -------------
+ options_logits:
+   chunk_size: 128
+   overlap_size: 64
+   tokenization_method: "model"
+   model_name: "DiscoResearch/Llama3-German-8B-32k"
+   remove_punctuation: true
+   lowercase: true
+   keep_speakertags: true
+
+ # Analysis Options
+ # --------------
+ options_semantic-similarity:
+   window_sizes: # 'all' or window size as integer
+     - 2
+     - 8
+
+ options_dis_from_randomness:
+   window_size: 8
+   min_len: null
+   bootstrap: 10000
+   shuffle_mode: 'include0_includeN'
+   parallel_computing: false
+
+ # Normalization Options
+ # -------------------
+ normalization_options:
+   method: "lemmatization" # Options: 'lemmatization', 'stemming'
+
+ # Filename Configuration
+ # --------------------
+ filename_components:
+   subject: true # mandatory
+   session: false
+   task: true # mandatory
+   task_addition: false
+   corpus: true
+   metric: true
+   additional_tags: []
+
+ create_aggregation_of_results: false
+ output_document_information: false
pelican_nlp/utils/unittests/test_examples.py ADDED
@@ -0,0 +1,211 @@
+ import unittest
+ import os
+ import yaml
+ from pathlib import Path
+ import shutil
+ import tempfile
+ import json
+ import subprocess
+ import sys
+ import logging
+ import signal
+ from contextlib import contextmanager
+
+ # Add the project root to the Python path
+ project_root = Path(__file__).parent.parent.parent
+ sys.path.append(str(project_root))
+
+ from pelican_nlp.config import DEBUG_MODE, debug_print
+
+ class TimeoutError(Exception):
+     pass
+
+ @contextmanager
+ def timeout(seconds):
+     def signal_handler(signum, frame):
+         raise TimeoutError(f"Timed out after {seconds} seconds")
+
+     # Register the signal handler
+     signal.signal(signal.SIGALRM, signal_handler)
+     signal.alarm(seconds)
+
+     try:
+         yield
+     finally:
+         # Disable the alarm
+         signal.alarm(0)
+
+ class TestExamples(unittest.TestCase):
+     @classmethod
+     def setUpClass(cls):
+         debug_print("Setting up test environment...")
+         # Create a temporary directory for test outputs
+         cls.test_dir = tempfile.mkdtemp()
+         cls.examples_dir = Path(__file__).parent / "examples"
+
+         # Load all example configurations
+         cls.examples = {}
+         for example_type in ["fluency", "discourse", "image-descriptions"]:
+             example_dir = cls.examples_dir / f"example_{example_type}"
+             config_path = example_dir / f"config_{example_type}.yml"
+
+             debug_print(f"Loading configuration for {example_type}...")
+             if not config_path.exists():
+                 debug_print(f"Warning: Config file not found: {config_path}")
+                 continue
+
+             with open(config_path, 'r') as f:
+                 config = yaml.safe_load(f)
+
+             cls.examples[example_type] = {
+                 "config_path": config_path,
+                 "config": config,
+                 "example_dir": example_dir
+             }
+
+     @classmethod
+     def tearDownClass(cls):
+         debug_print("Cleaning up test environment...")
+         # Clean up temporary directory
+         shutil.rmtree(cls.test_dir)
+
+     def setUp(self):
+         # Create a fresh output directory for each test
+         self.output_dir = Path(self.test_dir) / "test_output"
+         self.output_dir.mkdir(exist_ok=True)
+
+     def run_pelican_pipeline(self, example_dir, config_path, output_dir):
+         """Run the pelican pipeline with the given configuration file"""
+         debug_print(f"Running pipeline with config: {config_path}")
+         try:
+             # Change to the example directory before running the command
+             original_dir = os.getcwd()
+             os.chdir(example_dir)
+
+             # Print current directory and files
+             debug_print(f"Current directory: {os.getcwd()}")
+             debug_print("Files in current directory:")
+             for f in os.listdir('.'):
+                 debug_print(f"  - {f}")
+
+             # Run pelican-run with the configuration file and timeout
+             with timeout(300):  # 5 minute timeout
+                 # Use run with real-time output
+                 process = subprocess.run(
+                     ["pelican-run", "--config", str(config_path), "--output", str(output_dir)],
+                     stdout=subprocess.PIPE,
+                     stderr=subprocess.PIPE,
+                     text=True,
+                     check=True
+                 )
+
+                 # Print output after completion
+                 if process.stdout:
+                     print("Pipeline output:")
+                     print(process.stdout)
+                 if process.stderr:
+                     print("Pipeline errors:")
+                     print(process.stderr)
+
+             # Change back to original directory
+             os.chdir(original_dir)
+
+             debug_print("Pipeline completed successfully")
+             return True, "Pipeline completed successfully"
+         except TimeoutError as e:
+             os.chdir(original_dir)
+             debug_print(f"Pipeline timed out: {str(e)}")
+             return False, f"Error: Pipeline timed out after 5 minutes"
+         except subprocess.CalledProcessError as e:
+             # Change back to original directory even if there's an error
+             os.chdir(original_dir)
+             debug_print(f"Pipeline failed with exit code {e.returncode}")
+             if e.stdout:
+                 print("Pipeline output:")
+                 print(e.stdout)
+             if e.stderr:
+                 print("Pipeline errors:")
+                 print(e.stderr)
+             return False, f"Error: Pipeline failed with exit code {e.returncode}"
+         except Exception as e:
+             os.chdir(original_dir)
+             debug_print(f"Unexpected error: {str(e)}")
+             return False, f"Error: {str(e)}"
+
+     def test_discourse_example(self):
+         """Test running the discourse example through the pipeline"""
+         debug_print("Testing discourse example...")
+         if "discourse" not in self.examples:
+             self.skipTest("Discourse example configuration not found")
+
+         example = self.examples["discourse"]
+         output_dir = self.output_dir / "discourse"
+         output_dir.mkdir(exist_ok=True)
+
+         success, output = self.run_pelican_pipeline(
+             example["example_dir"],
+             example["config_path"],
+             output_dir
+         )
+         self.assertTrue(success, f"Pipeline failed: {output}")
+
+         # Verify output files were created
+         self.assertTrue(output_dir.exists())
+         self.assertTrue(len(list(output_dir.glob("*"))) > 0)
+         debug_print("Discourse example test completed")
+
+     def test_fluency_example(self):
+         """Test running the fluency example through the pipeline"""
+         debug_print("Testing fluency example...")
+         if "fluency" not in self.examples:
+             self.skipTest("Fluency example configuration not found")
+
+         example = self.examples["fluency"]
+         output_dir = self.output_dir / "fluency"
+         output_dir.mkdir(exist_ok=True)
+
+         success, output = self.run_pelican_pipeline(
+             example["example_dir"],
+             example["config_path"],
+             output_dir
+         )
+         self.assertTrue(success, f"Pipeline failed: {output}")
+
+         # Verify output files were created
+         self.assertTrue(output_dir.exists())
+         self.assertTrue(len(list(output_dir.glob("*"))) > 0)
+         debug_print("Fluency example test completed")
+
+     def test_image_descriptions_example(self):
+         """Test running the image descriptions example through the pipeline"""
+         debug_print("Testing image descriptions example...")
+         if "image-descriptions" not in self.examples:
+             self.skipTest("Image descriptions example configuration not found")
+
+         example = self.examples["image-descriptions"]
+         output_dir = self.output_dir / "image-descriptions"
+         output_dir.mkdir(exist_ok=True)
+
+         success, output = self.run_pelican_pipeline(
+             example["example_dir"],
+             example["config_path"],
+             output_dir
+         )
+         self.assertTrue(success, f"Pipeline failed: {output}")
+
+         # Verify output files were created
+         self.assertTrue(output_dir.exists())
+         self.assertTrue(len(list(output_dir.glob("*"))) > 0)
+         debug_print("Image descriptions example test completed")
+
+ def suite():
+     """Create a test suite with all test cases"""
+     suite = unittest.TestSuite()
+     suite.addTest(TestExamples('test_discourse_example'))
+     suite.addTest(TestExamples('test_fluency_example'))
+     suite.addTest(TestExamples('test_image_descriptions_example'))
+     return suite
+
+ if __name__ == '__main__':
+     runner = unittest.TextTestRunner(verbosity=2)
+     runner.run(suite())
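Two portability notes on this new test module: the timeout helper relies on signal.SIGALRM, which exists only on Unix, and each test shells out to the installed pelican-run entry point, so the package must be installed in the active environment. The suite can also be run directly, bypassing the RUN_TESTS flag; a sketch assuming the installed layout above:

import unittest
from pelican_nlp.utils.unittests import test_examples

# suite() pins the three example tests in a fixed order.
unittest.TextTestRunner(verbosity=2).run(test_examples.suite())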
pelican_nlp-0.3.3.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: pelican_nlp
- Version: 0.3.1
+ Version: 0.3.3
  Summary: Preprocessing and Extraction of Linguistic Information for Computational Analysis
  Author-email: Yves Pauli <yves.pauli@gmail.com>
  License-Expression: CC-BY-NC-4.0
@@ -48,8 +48,8 @@ Dynamic: license-file
  pelican_nlp
  ====================================

- .. |logo| image:: docs/images/pelican_logo.png
-     :alt: PELICAN_nlp Logo
+ .. |logo| image:: https://raw.githubusercontent.com/ypauli/pelican_nlp/main/docs/images/pelican_logo.png
+     :alt: pelican_nlp Logo
      :width: 200px

  +------------+-------------------------------------------------------------------+
@@ -73,6 +73,10 @@ pelican_nlp
      :target: https://pypi.org/project/pelican_nlp/
      :alt: Supported Python Versions

+ .. image:: https://img.shields.io/badge/Contributions-Welcome-brightgreen.svg
+     :target: https://github.com/ypauli/pelican_nlp/blob/main/CONTRIBUTING.md
+     :alt: Contributions Welcome
+
  Installation
  ============

pelican_nlp-0.3.3.dist-info/RECORD CHANGED
@@ -1,8 +1,8 @@
  pelican_nlp/__init__.py,sha256=TD5xjKeXXAH6nUWG-6igbClgovi5r8RIEqI_ix1QeYo,204
- pelican_nlp/_version.py,sha256=v-ExhFzOD_GemLcOptv2ZODgnklv9iqEEospk_bU1_w,21
- pelican_nlp/cli.py,sha256=mPz-ASIMUme69G6YGVpTnHr5VfM3XA4h29WFd7DXpa4,588
- pelican_nlp/config.py,sha256=cqUYLeqQB_Y-drR4dpxz8l-aLKl7TcfiB8SeN_rNq4I,352
- pelican_nlp/main.py,sha256=43jz94Zit931nZXs1hSAAPimRbX8Vmj-bEx7rDoYtZ4,8674
+ pelican_nlp/_version.py,sha256=G3p10uWMvNQiA3ZdxMz0QlmyKECmnauS0Ym9wMP2tEI,21
+ pelican_nlp/cli.py,sha256=Z11qwivHvuly07FAcEG8-Fl6_MqGauriZ8U5iZWf4lc,1116
+ pelican_nlp/config.py,sha256=LuZnuaq0Z49FgRgKJ7F6mwl1yr60QQDfMtD29ocbKfw,1000
+ pelican_nlp/main.py,sha256=CAYVrOHOG1gIJ_WkjlYeXUQPNvsNbAGDd0we92Z0sGI,8784
  pelican_nlp/Nils_backup/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  pelican_nlp/Nils_backup/extract_acoustic_features.py,sha256=eSP8lXxbZ15YE1HqxGtma9uWOcSN-fI-ig-NwQ9eOA8,10771
  pelican_nlp/Nils_backup/speaker_diarization_Nils.py,sha256=3RIhjKihu4Z1rruMt9KESFE2lqesfzIpRr7rLummUEo,10219
@@ -45,9 +45,9 @@ pelican_nlp/core/subject.py,sha256=Jx99vPn0K0KT_9BsJOY8XviFU_GuZGuwtb1rbLNkiUI,1
  pelican_nlp/extraction/__init__.py,sha256=hfqFiaKpQBS6cwRm9Yd7MpOcV60_xJmwuQ2Kegary5k,84
  pelican_nlp/extraction/acoustic_feature_extraction.py,sha256=Ol6fqyy94Iym1Z-eTVoz8EmqfV58boz5WAoamAK7JVE,2323
  pelican_nlp/extraction/distance_from_randomness.py,sha256=yikZ3GK2dqpzuNFPVsjuUK0lo6kHOIoIhKPaVrGXRMQ,3365
- pelican_nlp/extraction/extract_embeddings.py,sha256=6lzKbZpe5kCWHMh_ca0M-Xl_UF64bmGXEeQjFFTnsOA,2507
- pelican_nlp/extraction/extract_logits.py,sha256=kvZn9dZWsZiSPcbQ8hKtFcS9XxNlMmL-WGvpToMMo7c,3925
- pelican_nlp/extraction/language_model.py,sha256=37vVNFL31DVIBPSuyQK1rkEm8kiCXHTpGYv4Vk8w2bM,5676
+ pelican_nlp/extraction/extract_embeddings.py,sha256=LobzYEtjOpI_SvMZUb0u3QiOyZ2gPQD9bjQI9qwaogw,5719
+ pelican_nlp/extraction/extract_logits.py,sha256=4r8KbsqejD3UR3gCAAjm_sQhBkz8t_ePkv30bVGZg18,4116
+ pelican_nlp/extraction/language_model.py,sha256=EZE5bl-7pzPKoBIM9rnk7MJSFdMO6iQVnWmpQQsP8MU,5662
  pelican_nlp/extraction/semantic_similarity.py,sha256=QhY5CAOAorxEo3UBWPlMegFvbySF0KH6j4j3m2I3_NY,2552
  pelican_nlp/extraction/test_documents/test_features.csv,sha256=LR_3m4vIm-YWKw5gI5ziswhS-NF9VhKv14c2udLxtJU,488482
  pelican_nlp/extraction/test_documents/wallace_1.15_3.txt,sha256=ShXxOHUZzGPNUqIcOn6-OYkarzNtTC22V05a_Xpvtlw,3731
@@ -62,18 +62,23 @@ pelican_nlp/preprocessing/speaker_diarization.py,sha256=N6dZCa2AHHGw__g9e-ZUyZM_
  pelican_nlp/preprocessing/text_cleaner.py,sha256=QKqxwoRR8dnuBYiY-PXK1kB7744TVUcUMJb7dbKvXGk,7512
  pelican_nlp/preprocessing/text_importer.py,sha256=FtSyJjFXDxVle7Jpyw6EqCLDbLTCRxqVQi9ymWWtPB4,1356
  pelican_nlp/preprocessing/text_normalizer.py,sha256=huo5VFqJ0p2jq-ud1047XvMu1qNeaiuG879SF3zkJoM,894
- pelican_nlp/preprocessing/text_tokenizer.py,sha256=h875bXr0YuMrLh4HtQUvpHmASScddtkQXGaF9mm7uwU,1642
- pelican_nlp/sample_configuration_files/config_discourse.yml,sha256=OaTCoMwhDjrOIBpw1nKXWIoSWRUUFNjGQdgQZHVrJn0,3570
- pelican_nlp/sample_configuration_files/config_fluency.yml,sha256=JYpq90K4AF5TslzESJK6Nidw6-D1IiqD_6cdmlCd5-w,2990
- pelican_nlp/sample_configuration_files/config_general.yml,sha256=-GAVATlqXuQq4ANSW0JauwIGhr7ET_oZiBiM7I40AkA,3424
+ pelican_nlp/preprocessing/text_tokenizer.py,sha256=vUYayGLtMHTtJunTaEgiqjxJt658puEsFi3wTFfW6qw,1989
+ pelican_nlp/project_graph/graph_visualization.py,sha256=M99hGFKAun4_U2VQk9VQBMCF-imNAhQBHMhOvArPvMk,4648
+ pelican_nlp/sample_configuration_files/config_discourse.yml,sha256=l1FN8NcgEbb4s8PqPFErnspDqjSD-SEiIWYcDfSS0Xw,3681
+ pelican_nlp/sample_configuration_files/config_fluency.yml,sha256=BESisuMG9JPEBpbRPzEDmYVMIEHDY5Pf6HKqTWTa624,3007
+ pelican_nlp/sample_configuration_files/config_general.yml,sha256=FsGfcc8bK-di5dYuD-ri4sJlh2johQVEWUqsH7T6cCA,4172
  pelican_nlp/utils/__init__.py,sha256=q1tGdOOj5UPRC2mGhoMUh8p4cbFCkkbD21bQaOVvFao,189
  pelican_nlp/utils/csv_functions.py,sha256=7X8pGh49TGZGs7h6JrJD846swCqSHL32mmXJ-8qLWPE,7774
  pelican_nlp/utils/filename_parser.py,sha256=PGSKjiYDe_JVAFGcaYHdIYazB3p4MUiG6n8h_uZl8d8,551
  pelican_nlp/utils/sample_usage.py,sha256=W__OVMjWND-ZtxxRhfGJDHwbVpGlB-anXDxyA5P4cME,353
- pelican_nlp/utils/setup_functions.py,sha256=Ovd3VMCRpVg_BU8gcF6rGc9mp0zsD2iqJRqRB61lxOg,4529
- pelican_nlp-0.3.1.dist-info/licenses/LICENSE,sha256=m3jshBZIXKiBX6qhmhtJcLTVJ1N6BEkQGIflneXvpYg,19336
- pelican_nlp-0.3.1.dist-info/METADATA,sha256=sgUAHpBqowrsg_yFXs6-HDSgI77js6uqf8josFxjpcM,6593
- pelican_nlp-0.3.1.dist-info/WHEEL,sha256=pxyMxgL8-pra_rKaQ4drOZAegBVuX-G_4nRHjjgWbmo,91
- pelican_nlp-0.3.1.dist-info/entry_points.txt,sha256=znlG0paAfju9P10UM3rm5HcCHoj4tarTllNpeaqH_gc,53
- pelican_nlp-0.3.1.dist-info/top_level.txt,sha256=F0qlyqy5FCd3sTS_npUYPeLKN9_BZq6wD4qo9pI0xbg,12
- pelican_nlp-0.3.1.dist-info/RECORD,,
+ pelican_nlp/utils/setup_functions.py,sha256=Xk-9W1-ylex8De5w6jxAqWJUlmbe5z-c2EvwptTZ7RQ,4539
+ pelican_nlp/utils/unittests/test_examples.py,sha256=XLc9my0cwpPN9W8gBAPnPTVnBf77kMQeO0Xo38oH4Tg,7849
+ pelican_nlp/utils/unittests/examples/example_discourse/config_discourse.yml,sha256=jBSJ07dPujoZo2bOK15_RW4_dKALOWTzI55KljmWJKg,3709
+ pelican_nlp/utils/unittests/examples/example_fluency/config_fluency.yml,sha256=BESisuMG9JPEBpbRPzEDmYVMIEHDY5Pf6HKqTWTa624,3007
+ pelican_nlp/utils/unittests/examples/example_image-descriptions/config_image-descriptions.yml,sha256=HuPI7Py_57FwyfHEdIPk0LcdsMKze3XjmEuP6kPirP4,3540
+ pelican_nlp-0.3.3.dist-info/licenses/LICENSE,sha256=m3jshBZIXKiBX6qhmhtJcLTVJ1N6BEkQGIflneXvpYg,19336
+ pelican_nlp-0.3.3.dist-info/METADATA,sha256=MV71aLEm8KappnKjsVzEHKlQYMWEkBfQw1ZhOgETKyM,6839
+ pelican_nlp-0.3.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ pelican_nlp-0.3.3.dist-info/entry_points.txt,sha256=znlG0paAfju9P10UM3rm5HcCHoj4tarTllNpeaqH_gc,53
+ pelican_nlp-0.3.3.dist-info/top_level.txt,sha256=F0qlyqy5FCd3sTS_npUYPeLKN9_BZq6wD4qo9pI0xbg,12
+ pelican_nlp-0.3.3.dist-info/RECORD,,
pelican_nlp-0.3.3.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (79.0.0)
+ Generator: setuptools (80.9.0)
  Root-Is-Purelib: true
  Tag: py3-none-any