pelican-nlp 0.3.2__py3-none-any.whl → 0.3.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. pelican_nlp/_version.py +1 -1
  2. pelican_nlp/cli.py +22 -4
  3. pelican_nlp/config.py +23 -2
  4. pelican_nlp/extraction/extract_embeddings.py +50 -3
  5. pelican_nlp/extraction/extract_logits.py +6 -1
  6. pelican_nlp/extraction/language_model.py +1 -2
  7. pelican_nlp/main.py +9 -10
  8. pelican_nlp/preprocessing/text_tokenizer.py +7 -2
  9. pelican_nlp/project_graph/graph_visualization.py +109 -0
  10. pelican_nlp/sample_configuration_files/config_discourse.yml +14 -7
  11. pelican_nlp/sample_configuration_files/config_fluency.yml +2 -3
  12. pelican_nlp/sample_configuration_files/config_general.yml +30 -14
  13. pelican_nlp/utils/setup_functions.py +1 -1
  14. pelican_nlp/utils/unittests/examples/example_discourse/config_discourse.yml +109 -0
  15. pelican_nlp/utils/unittests/examples/example_discourse/subjects/sub-01/interview/sub-01_task-interview_acq-schizophrenia_run-01_transcript.rtf +40 -0
  16. pelican_nlp/utils/unittests/examples/example_fluency/config_fluency.yml +106 -0
  17. pelican_nlp/utils/unittests/examples/example_fluency/subjects/sub-01/fluency/sub-01_task-fluency_cat-semantic_acq-animals_text.txt +1 -0
  18. pelican_nlp/utils/unittests/examples/example_fluency/subjects/sub-01/fluency/sub-01_task-fluency_cat-semantic_acq-clothes_text.txt +1 -0
  19. pelican_nlp/utils/unittests/examples/example_fluency/subjects/sub-01/fluency/sub-01_task-fluency_cat-semantic_acq-food_text.txt +1 -0
  20. pelican_nlp/utils/unittests/examples/example_fluency/subjects/sub-02/fluency/sub-02_task-fluency_cat-semantic_acq-animals_text.txt +1 -0
  21. pelican_nlp/utils/unittests/examples/example_fluency/subjects/sub-02/fluency/sub-02_task-fluency_cat-semantic_acq-clothes_text.txt +1 -0
  22. pelican_nlp/utils/unittests/examples/example_fluency/subjects/sub-02/fluency/sub-02_task-fluency_cat-semantic_acq-food_text.txt +1 -0
  23. pelican_nlp/utils/unittests/examples/example_image-descriptions/config_image-descriptions.yml +135 -0
  24. pelican_nlp/utils/unittests/examples/example_image-descriptions/subjects/sub-01/ses-01/image-description/sub-01_ses-01_task-imgdesc_acq-drug_transcript.docx +0 -0
  25. pelican_nlp/utils/unittests/examples/example_image-descriptions/subjects/sub-01/ses-01/image-description/sub-01_ses-01_task-imgdesc_acq-placebo_transcript.docx +0 -0
  26. pelican_nlp/utils/unittests/examples/example_image-descriptions/subjects/sub-01/ses-02/image-description/sub-01_ses-02_task-imgdesc_acq-drug_transcript.docx +0 -0
  27. pelican_nlp/utils/unittests/examples/example_image-descriptions/subjects/sub-01/ses-02/image-description/sub-01_ses-02_task-imgdesc_acq-placebo_transcript.docx +0 -0
  28. pelican_nlp/utils/unittests/examples/example_image-descriptions/subjects/sub-02/ses-01/image-description/sub-02_ses-01_task-imgdesc_acq-drug_transcript.docx +0 -0
  29. pelican_nlp/utils/unittests/examples/example_image-descriptions/subjects/sub-02/ses-01/image-description/sub-02_ses-01_task-imgdesc_acq-placebo_transcript.docx +0 -0
  30. pelican_nlp/utils/unittests/test_examples.py +211 -0
  31. {pelican_nlp-0.3.2.dist-info → pelican_nlp-0.3.4.dist-info}/METADATA +1 -1
  32. {pelican_nlp-0.3.2.dist-info → pelican_nlp-0.3.4.dist-info}/RECORD +36 -18
  33. {pelican_nlp-0.3.2.dist-info → pelican_nlp-0.3.4.dist-info}/WHEEL +1 -1
  34. {pelican_nlp-0.3.2.dist-info → pelican_nlp-0.3.4.dist-info}/entry_points.txt +0 -0
  35. {pelican_nlp-0.3.2.dist-info → pelican_nlp-0.3.4.dist-info}/licenses/LICENSE +0 -0
  36. {pelican_nlp-0.3.2.dist-info → pelican_nlp-0.3.4.dist-info}/top_level.txt +0 -0
pelican_nlp/_version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.3.2"
+ __version__ = "0.3.4"
pelican_nlp/cli.py CHANGED
@@ -1,17 +1,35 @@
  import os
+ from pathlib import Path
  from pelican_nlp.main import Pelican
+ from pelican_nlp.config import RUN_TESTS, run_tests

  def main():
-     config_files = [f for f in os.listdir(".") if f.endswith(".yml")]
+     # Run tests if enabled
+     if RUN_TESTS:
+         print("Running tests...")
+         run_tests()
+         return
+
+     # Look for configuration files in the current working directory
+     config_dir = Path.cwd()
+
+     print(f"Looking for configuration files in: {config_dir}")
+
+     config_files = [f for f in os.listdir(config_dir) if f.endswith((".yml", ".yaml"))]
+
      if not config_files:
-         print("No .yml configuration file found in the current directory.")
+         print("No .yml or .yaml configuration file found in the current directory.")
+         print("Please ensure you have a configuration file in your current working directory.")
          return

      if len(config_files) > 1:
-         print("More than one configuration file found - remove unneeded files from project directory")
+         print("Warning: Multiple configuration files found in current directory:")
+         for i, file in enumerate(config_files, 1):
+             print(f"  {i}. {file}")
+         print("Please ensure only one configuration file is present in the current directory.")
          return

-     config_file = config_files[0]  # You could also add logic to choose or validate
+     config_file = str(config_dir / config_files[0])
      print(f"Using configuration file: {config_file}")

      pelican = Pelican(config_file)
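Note on this change: configuration discovery is now anchored to the current working directory, accepts both .yml and .yaml extensions, and hands Pelican an absolute path. A minimal standalone sketch of the same discovery contract (illustrative only, not the packaged code):

from pathlib import Path

def find_config(directory: str = ".") -> Path:
    """Mimics the 0.3.4 CLI rules: exactly one .yml/.yaml file must be present."""
    candidates = sorted(p for p in Path(directory).iterdir()
                        if p.suffix in (".yml", ".yaml"))
    if not candidates:
        raise FileNotFoundError("No .yml or .yaml configuration file found.")
    if len(candidates) > 1:
        raise RuntimeError(f"Multiple configuration files found: {[p.name for p in candidates]}")
    return candidates[0].resolve()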
pelican_nlp/config.py CHANGED
@@ -6,9 +6,30 @@ For consistency of pipeline, DO NOT CHANGE.
  """

  # Debug flag
- DEBUG_MODE = False
+ DEBUG_MODE = True
+
+ # Test flag - set to True to run all example tests
+ RUN_TESTS = False

  def debug_print(*args, **kwargs):
      """Print only if debug mode is enabled."""
+     DEBUG_MODE = True
      if DEBUG_MODE:
-         print(*args, **kwargs)
+         print(*args, **kwargs)
+
+ def run_tests():
+     """Run all example tests if RUN_TESTS is enabled."""
+     if RUN_TESTS:
+         import unittest
+         from pathlib import Path
+
+         # Get the path to the test file
+         test_file = Path(__file__).parent / "utils" / "unittests" / "test_examples.py"
+
+         # Create a test suite and add the test file
+         loader = unittest.TestLoader()
+         suite = loader.discover(str(test_file.parent), pattern="test_examples.py")
+
+         # Run the tests
+         runner = unittest.TextTestRunner(verbosity=2)
+         runner.run(suite)
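The new run_tests() helper wraps standard unittest discovery over the bundled examples (see test_examples.py in the file list above). Assuming a normal install, the same suite can be driven directly, which avoids editing the RUN_TESTS flag inside the installed package:

import unittest
from pathlib import Path

import pelican_nlp

# Discover the packaged example tests (path taken from the file list above).
tests_dir = Path(pelican_nlp.__file__).parent / "utils" / "unittests"
suite = unittest.TestLoader().discover(str(tests_dir), pattern="test_examples.py")
unittest.TextTestRunner(verbosity=2).run(suite)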
pelican_nlp/extraction/extract_embeddings.py CHANGED
@@ -35,13 +35,60 @@ class EmbeddingsExtractor:
              outputs = self.model_instance(input_ids=inputs['input_ids'])
          else:
              # Handle RoBERTa and other models that accept **inputs
-             outputs = self.model_instance(**inputs)
+             if isinstance(inputs, dict):
+                 # Ensure inputs are on the same device as the model
+                 inputs = {k: v.to(self.model_instance.device) for k, v in inputs.items()}
+                 debug_print(f"Model inputs: {inputs}")
+                 outputs = self.model_instance(**inputs, output_hidden_states=True)
+             else:
+                 debug_print(f"Input type: {type(inputs)}")
+                 debug_print(f"Input content: {inputs}")
+
+                 # If inputs is a list of strings, convert to token IDs first
+                 if isinstance(inputs, list):
+                     if isinstance(inputs[0], str):
+                         # Convert tokens to IDs
+                         token_ids = self.Tokenizer.tokenizer.convert_tokens_to_ids(inputs)
+                         debug_print(f"Token IDs: {token_ids}")
+                         inputs = torch.tensor([token_ids], device=self.model_instance.device)
+                     else:
+                         # If it's already a list of numbers, convert directly
+                         inputs = torch.tensor([inputs], device=self.model_instance.device)
+                 else:
+                     # If it's already a tensor, just move to device
+                     inputs = inputs.to(self.model_instance.device)
+
+                 debug_print(f"Final tensor shape: {inputs.shape}")
+
+                 # Ensure proper shape
+                 if len(inputs.shape) == 1:
+                     inputs = inputs.unsqueeze(0)  # Add batch dimension
+
+                 # Create attention mask
+                 attention_mask = torch.ones_like(inputs)
+                 debug_print(f"Model inputs - input_ids: {inputs.shape}, attention_mask: {attention_mask.shape}")
+                 outputs = self.model_instance(input_ids=inputs, attention_mask=attention_mask, output_hidden_states=True)
+                 debug_print(f"Model outputs type: {type(outputs)}")
+                 debug_print(f"Model outputs attributes: {dir(outputs)}")

          # Get word embeddings (last hidden state)
-         word_embeddings = outputs.last_hidden_state
+         if outputs is None:
+             raise ValueError("Model returned None output")
+
+         if hasattr(outputs, 'hidden_states') and outputs.hidden_states is not None:
+             word_embeddings = outputs.hidden_states[-1]
+             debug_print(f"Using hidden_states, shape: {word_embeddings.shape}")
+         elif hasattr(outputs, 'last_hidden_state'):
+             word_embeddings = outputs.last_hidden_state
+             debug_print(f"Using last_hidden_state, shape: {word_embeddings.shape}")
+         else:
+             raise ValueError(f"Model output has neither hidden_states nor last_hidden_state. Available attributes: {dir(outputs)}")

          # Extract input_ids and convert them back to tokens
-         input_ids = inputs['input_ids'][0].tolist()
+         if isinstance(inputs, dict):
+             input_ids = inputs['input_ids'][0].tolist()
+         else:
+             input_ids = inputs[0].tolist()
          tokens = self.Tokenizer.tokenizer.convert_ids_to_tokens(input_ids)

          # Now align the tokens and embeddings
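The switch from outputs.last_hidden_state to a hidden_states[-1]-first lookup is what makes the extractor model-agnostic: bare encoders expose both attributes (and they are identical), while models wrapped with an LM head expose only hidden_states when output_hidden_states=True is requested. A quick check of that equivalence, assuming the xlm-roberta-base checkpoint named in the configs below:

import torch
from transformers import AutoModel, AutoTokenizer

tok = AutoTokenizer.from_pretrained("xlm-roberta-base")
model = AutoModel.from_pretrained("xlm-roberta-base")

inputs = tok("ein kurzer Testsatz", return_tensors="pt")
with torch.no_grad():
    out = model(**inputs, output_hidden_states=True)

# For a bare encoder, the last hidden_states entry is the last_hidden_state.
assert torch.equal(out.hidden_states[-1], out.last_hidden_state)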
pelican_nlp/extraction/extract_logits.py CHANGED
@@ -22,7 +22,12 @@ class LogitsExtractor:
          chunk_size = self.options['chunk_size']
          overlap_size = self.options['overlap_size']

-         input_ids = tokens.to(self.device)
+         # Convert list of token IDs to tensor if needed
+         if isinstance(tokens, list):
+             input_ids = torch.tensor([tokens], device=self.device)
+         else:
+             input_ids = tokens.to(self.device)
+
          chunks = self._split_into_chunks(input_ids, chunk_size, overlap_size)

          per_token_data = []
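This guard pairs with the text_tokenizer.py change below: with the 'model' method, encode() now returns a plain Python list of token IDs, so LogitsExtractor normalizes either form into a batched tensor before chunking. A minimal sketch of the same normalization, independent of pelican-nlp internals:

import torch

def to_input_ids(tokens, device="cpu"):
    """Accept a list of token IDs or a pre-built tensor; return a [1, n] tensor."""
    if isinstance(tokens, list):
        return torch.tensor([tokens], device=device)
    return tokens.to(device)

print(to_input_ids([101, 2023, 102]).shape)  # torch.Size([1, 3])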
pelican_nlp/extraction/language_model.py CHANGED
@@ -4,7 +4,7 @@ import os
  import shutil

  from accelerate import init_empty_weights, infer_auto_device_map, dispatch_model
- from transformers import AutoModelForCausalLM
+ from transformers import AutoModelForCausalLM, AutoModelForMaskedLM, AutoModel

  class Model:
      def __init__(self, model_name, project_path):
@@ -75,7 +75,6 @@ class Model:

              print(f'FastText model loaded successfully from {model_path}')
          elif self.model_name == 'xlm-roberta-base':
-             from transformers import AutoModel
              self.model_instance = AutoModel.from_pretrained(
                  self.model_name,
                  trust_remote_code=trust_remote_code,
pelican_nlp/main.py CHANGED
@@ -25,9 +25,10 @@ from pelican_nlp.utils.setup_functions import subject_instantiator, load_config,
  from pelican_nlp.preprocessing import LPDS
  from pelican_nlp.utils.filename_parser import parse_lpds_filename

- from pelican_nlp.config import debug_print
+ from pelican_nlp.config import debug_print, RUN_TESTS, run_tests

  project_path = '/home/yvespauli/PycharmProjects/PyPI_testing_fluency/config_fluency.yml'
+ #project_path = '/home/yvespauli/PycharmProjects/PyPI_testing_discourse/config_discourse.yml'

  class Pelican:

@@ -40,12 +41,12 @@ class Pelican:
          # If no config path is provided, use the default config from package; used for dev-mode
          if config_path is None:
              package_dir = Path(__file__).parent
-             default_config = package_dir / 'configuration_files' / 'config_fluency.yml'
+             default_config = package_dir / 'sample_configuration_files' / 'config_fluency.yml'
              if default_config.exists():
                  config_path = str(default_config)
                  print(f"Using default configuration file: {config_path}")
              else:
-                 sys.exit('Error: Default configuration file not found in package.')
+                 sys.exit('Error: Default configuration file not found in sample_configuration_files folder.')

          # Verify the provided path is a YAML file
          elif not config_path.endswith(('.yml', '.yaml')):
@@ -72,12 +73,6 @@ class Pelican:
          """Execute the main processing pipeline."""
          self._clear_gpu_memory()

-         '''
-         #run unittests in dev_mode; not yet implemented
-         if self.dev_mode:
-             self._run_tests()
-         '''
-
          self._handle_output_directory()

          # Check/Create LPDS
@@ -229,4 +224,8 @@ class Pelican:


  if __name__ == '__main__':
-     Pelican(project_path, dev_mode=True).run()
+     if RUN_TESTS:
+         print("Running tests...")
+         run_tests()
+     else:
+         Pelican(project_path, dev_mode=True).run()
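Since project_path is hard-coded to the maintainer's machine and only used in dev mode, regular use goes through an explicit configuration path, as the CLI does. A minimal sketch (the path below is a placeholder):

from pelican_nlp.main import Pelican

# Placeholder path: point this at your own project's configuration file.
pelican = Pelican("/path/to/your_project/config_fluency.yml")
pelican.run()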
pelican_nlp/preprocessing/text_tokenizer.py CHANGED
@@ -24,7 +24,8 @@ class TextTokenizer:
              # Tokenize using the model's tokenizer
              return self.tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=self.max_sequence_length).to(self.device_used)
          elif method == 'model':
-             return self.tokenizer.encode(text, return_tensors='pt')
+             # For model method, return token IDs directly
+             return self.tokenizer.encode(text, add_special_tokens=True)
          else:
              raise ValueError(f"Unsupported tokenization method: {method}")

@@ -34,10 +35,14 @@ class TextTokenizer:
      def get_tokenizer(self):
          if self.tokenization_method == 'model' or self.tokenization_method == 'model_roberta':
              from transformers import AutoTokenizer
+             if not self.model_name:
+                 raise ValueError("model_name must be provided for model-based tokenization methods")
              return AutoTokenizer.from_pretrained(
                  self.model_name,
                  trust_remote_code=False,  # Don't execute arbitrary model code
                  use_safetensors=True
              )
-         else:
+         elif self.tokenization_method == 'whitespace':
              return None
+         else:
+             raise ValueError(f"Unsupported tokenization method: {self.tokenization_method}")
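The return type of tokenize_text now differs by method: 'model_roberta' yields a dict of batched tensors, while 'model' yields a flat list of IDs, which is exactly the case the LogitsExtractor guard above converts. A short illustration with the Hugging Face tokenizer (model name taken from the sample configs):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("xlm-roberta-base")

ids = tok.encode("ein Test", add_special_tokens=True)  # 'model': plain list of ints
batch = tok("ein Test", return_tensors="pt")           # 'model_roberta': dict of tensors
print(type(ids), type(batch["input_ids"]))             # <class 'list'> <class 'torch.Tensor'>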
pelican_nlp/project_graph/graph_visualization.py ADDED
@@ -0,0 +1,109 @@
+ #!/usr/bin/env python3
+ """
+ Graph Visualization for Pelican-nlp Project
+ ===========================================
+
+ This script creates a visual representation of the Pelican-nlp project structure
+ using graphviz.
+ """
+
+ from graphviz import Digraph
+
+ def create_pelican_graph():
+     # Create a new directed graph
+     dot = Digraph(comment='Pelican-nlp Project Structure')
+     dot.attr(rankdir='TB')
+
+     # Set node styles
+     dot.attr('node', shape='box', style='rounded,filled')
+
+     # Main Components
+     with dot.subgraph(name='cluster_main') as c:
+         c.attr(label='Main Components')
+         c.attr('node', fillcolor='lightblue')
+         c.node('Pelican', 'Pelican\n(Main Controller)')
+         c.node('LPDS', 'LPDS\n(Data Structure)')
+         c.node('Corpus', 'Corpus\n(Document Collection)')
+         c.node('Subject', 'Subject\n(Grouping Unit)')
+         c.node('Document', 'Document\n(Data Container)')
+         c.node('AudioDocument', 'AudioDocument\n(Audio Data)')
+
+     # Core Processing
+     with dot.subgraph(name='cluster_core') as c:
+         c.attr(label='Core Processing')
+         c.attr('node', fillcolor='lightgreen')
+         c.node('Config', 'Configuration\n(config.py)')
+         c.node('CLI', 'Command Line Interface\n(cli.py)')
+         c.node('Main', 'Main Entry Point\n(main.py)')
+
+     # Preprocessing Components
+     with dot.subgraph(name='cluster_preprocessing') as c:
+         c.attr(label='Preprocessing')
+         c.attr('node', fillcolor='lightyellow')
+         c.node('TextTokenizer', 'Text Tokenizer\n(text_tokenizer.py)')
+         c.node('TextNormalizer', 'Text Normalizer\n(text_normalizer.py)')
+         c.node('TextCleaner', 'Text Cleaner\n(text_cleaner.py)')
+         c.node('TextImporter', 'Text Importer\n(text_importer.py)')
+         c.node('SpeakerDiarization', 'Speaker Diarization\n(speaker_diarization.py)')
+         c.node('Pipeline', 'Preprocessing Pipeline\n(pipeline.py)')
+
+     # Extraction Components
+     with dot.subgraph(name='cluster_extraction') as c:
+         c.attr(label='Feature Extraction')
+         c.attr('node', fillcolor='lightpink')
+         c.node('LogitsExtractor', 'Logits Extractor\n(extract_logits.py)')
+         c.node('EmbeddingsExtractor', 'Embeddings Extractor\n(extract_embeddings.py)')
+         c.node('LanguageModel', 'Language Model\n(language_model.py)')
+         c.node('AcousticFeatures', 'Acoustic Features\n(acoustic_feature_extraction.py)')
+         c.node('SemanticSimilarity', 'Semantic Similarity\n(semantic_similarity.py)')
+         c.node('RandomnessDistance', 'Distance from Randomness\n(distance_from_randomness.py)')
+
+     # Utility Components
+     with dot.subgraph(name='cluster_utils') as c:
+         c.attr(label='Utilities')
+         c.attr('node', fillcolor='lightgrey')
+         c.node('FilenameParser', 'Filename Parser\n(filename_parser.py)')
+         c.node('CSVFunctions', 'CSV Functions\n(csv_functions.py)')
+         c.node('SetupFunctions', 'Setup Functions\n(setup_functions.py)')
+
+     # Main Relationships
+     dot.edge('Pelican', 'LPDS', 'manages')
+     dot.edge('Pelican', 'Corpus', 'processes')
+     dot.edge('Pelican', 'Subject', 'instantiates')
+     dot.edge('Corpus', 'Document', 'contains')
+     dot.edge('Subject', 'Document', 'groups')
+     dot.edge('Document', 'AudioDocument', 'extends')
+
+     # Core Processing Relationships
+     dot.edge('CLI', 'Main', 'calls')
+     dot.edge('Main', 'Pelican', 'instantiates')
+     dot.edge('Pelican', 'Config', 'uses')
+
+     # Preprocessing Relationships
+     dot.edge('Pipeline', 'TextTokenizer', 'uses')
+     dot.edge('Pipeline', 'TextNormalizer', 'uses')
+     dot.edge('Pipeline', 'TextCleaner', 'uses')
+     dot.edge('Pipeline', 'TextImporter', 'uses')
+     dot.edge('Pipeline', 'SpeakerDiarization', 'uses')
+     dot.edge('Corpus', 'Pipeline', 'executes')
+
+     # Extraction Relationships
+     dot.edge('Corpus', 'LogitsExtractor', 'uses')
+     dot.edge('Corpus', 'EmbeddingsExtractor', 'uses')
+     dot.edge('LogitsExtractor', 'LanguageModel', 'uses')
+     dot.edge('EmbeddingsExtractor', 'LanguageModel', 'uses')
+     dot.edge('Corpus', 'AcousticFeatures', 'uses')
+     dot.edge('Corpus', 'SemanticSimilarity', 'uses')
+     dot.edge('Corpus', 'RandomnessDistance', 'uses')
+
+     # Utility Relationships
+     dot.edge('Pelican', 'FilenameParser', 'uses')
+     dot.edge('Corpus', 'CSVFunctions', 'uses')
+     dot.edge('Pelican', 'SetupFunctions', 'uses')
+
+     # Save the graph
+     dot.render('pelican_structure_detailed', format='png', cleanup=True)
+     print("Detailed graph visualization has been created as 'pelican_structure_detailed.png'")
+
+ if __name__ == '__main__':
+     create_pelican_graph()
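Rendering the graph needs the graphviz Python bindings plus the Graphviz 'dot' binary on PATH; neither requirement is visible in the wheel metadata shown here, so treat the environment setup as an assumption:

# Assumes: pip install graphviz, and the system Graphviz 'dot' binary on PATH.
from pelican_nlp.project_graph.graph_visualization import create_pelican_graph

create_pelican_graph()  # writes pelican_structure_detailed.png to the current directory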
pelican_nlp/sample_configuration_files/config_discourse.yml CHANGED
@@ -7,12 +7,15 @@ discourse: &discourse_flag true
  #general configurations; always adapt
  language: "german" # Possibly add options for German and English

- task_name: "interview" # Give name of task used for creation of the input file (e.g., ['fluency', 'interview'])
- corpus_names:
+ task_name: "interview"
+
+ #Create analysis corpus, group files based on corpus entity.
+ corpus_key: "acq"
+ corpus_values: #group names
    - "placebo"
    - "schizophrenia"

- metric_to_extract: "embeddings" #Possible options: 'logits' or 'embeddings'
+ metric_to_extract: "logits" #Possible options: 'logits' or 'embeddings'

  number_of_speakers: 3
  subject_speakertag: "B"
@@ -24,7 +27,7 @@ cleaning_options:
    general_cleaning: true # General cleaning options used for most text preprocessing, default: True.
    remove_brackets_and_bracketcontent: true
    remove_timestamps: true
-   timestamp_pattern_example: "#00:00:19-0#"
+   timestamp_pattern_example: "#00:00:19-00#"
    remove_punctuation: false
    lowercase: false
    #Options for fluency tasks
@@ -44,10 +47,10 @@ options_logits:
    keep_speakertags: true

  options_embeddings:
-   tokenization_method: "whitespace" #"model" or "whitespace"
+   tokenization_method: "model" #"model" or "whitespace"
    max_length: 512 #max sequence length
-   model_name: "fastText" #e.g. "fastText", "xlm-roberta-base"
-   pytorch_based_model: false
+   model_name: "xlm-roberta-base" #e.g. "fastText", "xlm-roberta-base"
+   pytorch_based_model: true
    method: "model_instance"
    remove_punctuation: false
    lowercase: false
@@ -60,6 +63,7 @@ options_embeddings:
      remove_punctuation_and_symbols: true
      remove_brackets_and_content: true
    semantic-similarity: false
+   distance-from-randomness: false
    window_size: null
    clean_tokens: false
    divergence_from_optimality: false
@@ -93,6 +97,9 @@ normalization_options:
    method: "lemmatization" #Options: lemmatization or stemming
  #================================================================

+ create_aggregation_of_results: false
+ output_document_information: false
+
  #Detail configurations; Changes optional, mostly used for quality checking / error handling
  number_of_subjects: null # Specify number of subjects; if 'null', number of subjects is automatically detected
  multiple_sessions: false # Set to True if multiple sessions per subject
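The corpus_key/corpus_values pair replaces corpus_names: files are now grouped by an LPDS filename entity (here acq), matching names like sub-01_task-interview_acq-schizophrenia_run-01_transcript.rtf from the example data listed above. The package's own parser is parse_lpds_filename; the regex below is only a stand-in to show the matching:

import re

def entity_value(filename: str, key: str):
    """Stand-in for LPDS entity parsing: extract the value after 'key-'."""
    match = re.search(rf"(?:^|_){key}-([^_.]+)", filename)
    return match.group(1) if match else None

fname = "sub-01_task-interview_acq-schizophrenia_run-01_transcript.rtf"
print(entity_value(fname, "acq"))  # 'schizophrenia' -> grouped under that corpus_values entry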
pelican_nlp/sample_configuration_files/config_fluency.yml CHANGED
@@ -8,7 +8,8 @@ fluency_task: &fluency_flag true
  language: "german"
  multiple_sessions: &session_flag false

- corpus_names: #names of fluency tasks (e.g. "animals", "clothes")
+ corpus_key: "acq"
+ corpus_values: #names of fluency tasks (e.g. "animals", "clothes")
    - "animals"
    - "clothes"
    - "food"
@@ -103,5 +104,3 @@ filename_components:
    metric: true
    additional_tags: []

-
-
pelican_nlp/sample_configuration_files/config_general.yml CHANGED
@@ -5,33 +5,38 @@
  # -------------
  input_file: "text" # Options: 'text' or 'audio'
  language: "german"
- recompute_everything: true #If false will give warning if output folder already exists
+ recompute_everything: true # If false will give warning if output folder already exists

  # Task Configuration
  # -----------------
- task_name: # Name of task used for creation of data
+ task_name: null # Name of task used for creation of data
  fluency_task: &fluency_flag false # Flag for fluency-specific settings
  discourse: &discourse_flag false # Flag for discourse-specific settings
- corpus_names: # List of task corpora
+
+ # Corpus Configuration
+ # ------------------
+ corpus_key: null # Entity key to group files for analysis
+ corpus_values: # Corresponding entity values found in dataset
    - "healthy-control"
+   - "placebo"

  # Session and Subject Settings
  # --------------------------
  multiple_sessions: false
  number_of_subjects: null # If null, auto-detected
- number_of_speakers: 1
+ number_of_speakers: 1 # Specify amount of speakers for discourse files
  subject_speakertag: null # Speaker tag for subject (e.g., "B"), only for discourse

  # Document Structure
  # ----------------
  has_multiple_sections: false
  has_section_titles: false
- section_identification: null # e.g., "Section:"
- number_of_sections: 1 # If null, auto-detected
+ section_identification: null # e.g., "Section:", in case of multiple sections
+ number_of_sections: null # If null, auto-detected, specify for multiple sections to check section detection

  # Processing Pipeline
  # -----------------
- pipeline_options:
+ pipeline_options: # Just for data preprocessing without metric extraction
    quality_check: false
    clean_text: true
    tokenize_text: false
@@ -40,18 +45,17 @@ pipeline_options:

  # Metric Extraction
  # ---------------
  metric_to_extract: "embeddings" # Options: 'embeddings', 'logits'
- extract_logits: null
- extract_embeddings: true
+ output_document_information: true

  # Cleaning Options
  # --------------
  cleaning_options:
-   general_cleaning: true
+   general_cleaning: true # General cleaning applied to most datasets, check specifications in section "general_cleaning_options"
    remove_punctuation: false
    lowercase: true
    remove_brackets_and_bracketcontent: false
    remove_timestamps: false
-   timestamp_pattern_example: null # e.g., "#00:00:23-00#"
+   timestamp_pattern_example: null # e.g., "#00:00:23-00#", only if remove_timestamps = True
  # Fluency-specific options
  fluency_task: *fluency_flag
  word_splitter: ';'
@@ -74,8 +78,8 @@ options_embeddings:
    method: "model_instance"
    max_length: 512
    clean_embedding_tokens: true
-   remove_punctuation: false
-   lowercase: false
+   remove_punctuation_embeddings: false
+   lowercase_embeddings: false
    keep_speakertags: false
    semantic-similarity: true
    window_size: null
@@ -118,6 +122,14 @@ options_dis_from_randomness:
  normalization_options:
    method: "lemmatization" # Options: 'lemmatization', 'stemming'

+ # Document Information Output
+ # -------------------------
+ document_information_output:
+   parameters:
+     - subject_ID
+     - fluency_word_count
+     - fluency_duplicate_count
+
  # Filename Configuration
  # --------------------
  filename_components:
@@ -125,6 +137,10 @@ filename_components:
    session: false
    task: true # mandatory
    task_addition: false
-   corpus: true #mandatory
+   corpus: true # mandatory
    metric: true
    additional_tags: []
+
+ # Additional Settings
+ # -----------------
+ create_aggregation_of_results: true
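A quick way to sanity-check a project configuration against the keys introduced in this release is to load it with PyYAML (which the package itself imports in setup_functions.py below) and inspect them:

import yaml

with open("config_general.yml") as fh:
    cfg = yaml.safe_load(fh)

# Keys added or renamed in 0.3.4:
print(cfg["corpus_key"], cfg["corpus_values"])
print(cfg["output_document_information"])
print(cfg["document_information_output"]["parameters"])
print(cfg["create_aggregation_of_results"])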
pelican_nlp/utils/setup_functions.py CHANGED
@@ -4,7 +4,7 @@ import yaml
  import sys
  from pelican_nlp.core.subject import Subject
  from .filename_parser import parse_lpds_filename
- from ..config import debug_print
+ from pelican_nlp.config import debug_print


  def subject_instantiator(config, project_folder):
pelican_nlp/utils/unittests/examples/example_discourse/config_discourse.yml ADDED
@@ -0,0 +1,109 @@
+ # Configuration file for discourse protocols
+ #=======================================
+ input_file: "text" #or 'audio'
+ discourse: &discourse_flag true
+ #=====================================
+
+ #general configurations; always adapt
+ language: "german" # Possibly add options for German and English
+
+ task_name: "interview"
+
+ #Create analysis corpus, group files based on corpus entity.
+ corpus_key: "acq"
+ corpus_values: #group names
+   - "placebo"
+   - "schizophrenia"
+
+ metric_to_extract: "embeddings" #Possible options: 'logits' or 'embeddings'
+
+ number_of_speakers: 2
+ subject_speakertag: "B"
+ #=========================================================
+
+ #Optional configurations; Change with preference. However, default settings recommended
+ fluency_task: &fluency_flag false
+ cleaning_options:
+   general_cleaning: true # General cleaning options used for most text preprocessing, default: True.
+   remove_brackets_and_bracketcontent: true
+   remove_timestamps: true
+   timestamp_pattern_example: "#00:00:19-00#"
+   remove_punctuation: false
+   lowercase: false
+   #Options for fluency tasks
+   fluency_task: *fluency_flag
+   word_splitter: null
+   remove_hyphens: null
+   remove_duplicates: null
+
+ options_logits:
+   chunk_size: 128
+   overlap_size: 64
+   tokenization_method: "model"
+   #method: "model_instance" # Options: model, regex, nltk, etc.
+   model_name: "xlm-roberta-base" #Replace with your model instance name, e.g. "DiscoResearch/Llama3-German-8B-32k"
+   remove_punctuation: true
+   lowercase: true
+   keep_speakertags: true
+
+ options_embeddings:
+   tokenization_method: "model" #"model" or "whitespace"
+   max_length: 512 #max sequence length
+   model_name: "xlm-roberta-base" #e.g. "fastText", "xlm-roberta-base"
+   pytorch_based_model: true
+   method: "model_instance"
+   remove_punctuation: false
+   lowercase: false
+   keep_speakertags: true
+   clean_embedding_tokens: true
+   output_options:
+     exclude_special_tokens: true
+     remove_'_'_character: true
+     remove_speaker_labels: true
+     remove_punctuation_and_symbols: true
+     remove_brackets_and_content: true
+   semantic-similarity: false
+   distance-from-randomness: false
+   window_size: null
+   clean_tokens: false
+   divergence_from_optimality: false
+ #================================================================================
+
+ #Extra configurations:
+ pipeline_options:
+   quality_check: false
+   clean_text: true
+   tokenize_text: false
+   normalize_text: false
+
+ general_cleaning_options:
+   strip_whitespace: true
+   merge_multiple_whitespaces: true
+   remove_whitespace_before_punctuation: true
+   merge_newline_characters: true
+   remove_backslashes: true
+
+ has_multiple_sections: false #evaluated independently
+ has_section_titles: false
+ section_identification: null #e.g. "Section:", 'null' if file does not have multiple sections, use pattern that is unlikely to appear in rest of transcript
+ number_of_sections: null #if 'null' number of sections automatically detected, however, specifying number recommended if known.
+
+ # Options for extract_embeddings
+ window_sizes: [2]
+ metric_function: cosine_similarity
+ aggregation_functions: mean_of_means
+
+ normalization_options:
+   method: "lemmatization" #Options: lemmatization or stemming
+ #================================================================
+
+ create_aggregation_of_results: false
+ output_document_information: false
+
+ #Detail configurations; Changes optional, mostly used for quality checking / error handling
+ number_of_subjects: null # Specify number of subjects; if 'null', number of subjects is automatically detected
+ multiple_sessions: false # Set to True if multiple sessions per subject
+
+ recompute_everything: true #If set to 'false' pelican-nlp will try to reuse previously computed results stored on your drive
+