pelican-nlp 0.3.2__py3-none-any.whl → 0.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pelican_nlp/_version.py +1 -1
- pelican_nlp/cli.py +20 -4
- pelican_nlp/config.py +23 -2
- pelican_nlp/extraction/extract_embeddings.py +50 -3
- pelican_nlp/extraction/extract_logits.py +6 -1
- pelican_nlp/extraction/language_model.py +1 -2
- pelican_nlp/main.py +9 -10
- pelican_nlp/preprocessing/text_tokenizer.py +7 -2
- pelican_nlp/project_graph/graph_visualization.py +109 -0
- pelican_nlp/sample_configuration_files/config_discourse.yml +14 -7
- pelican_nlp/sample_configuration_files/config_fluency.yml +2 -3
- pelican_nlp/sample_configuration_files/config_general.yml +30 -14
- pelican_nlp/utils/setup_functions.py +1 -1
- pelican_nlp/utils/unittests/examples/example_discourse/config_discourse.yml +109 -0
- pelican_nlp/utils/unittests/examples/example_fluency/config_fluency.yml +106 -0
- pelican_nlp/utils/unittests/examples/example_image-descriptions/config_image-descriptions.yml +135 -0
- pelican_nlp/utils/unittests/test_examples.py +211 -0
- {pelican_nlp-0.3.2.dist-info → pelican_nlp-0.3.3.dist-info}/METADATA +1 -1
- {pelican_nlp-0.3.2.dist-info → pelican_nlp-0.3.3.dist-info}/RECORD +23 -18
- {pelican_nlp-0.3.2.dist-info → pelican_nlp-0.3.3.dist-info}/WHEEL +1 -1
- {pelican_nlp-0.3.2.dist-info → pelican_nlp-0.3.3.dist-info}/entry_points.txt +0 -0
- {pelican_nlp-0.3.2.dist-info → pelican_nlp-0.3.3.dist-info}/licenses/LICENSE +0 -0
- {pelican_nlp-0.3.2.dist-info → pelican_nlp-0.3.3.dist-info}/top_level.txt +0 -0
pelican_nlp/_version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.3.2"
+__version__ = "0.3.3"
pelican_nlp/cli.py
CHANGED
@@ -1,17 +1,33 @@
 import os
+from pathlib import Path
 from pelican_nlp.main import Pelican
+from pelican_nlp.config import RUN_TESTS, run_tests
 
 def main():
-
+    # Run tests if enabled
+    if RUN_TESTS:
+        print("Running tests...")
+        run_tests()
+        return
+
+    # Get the package directory's sample_configuration_files folder
+    package_dir = Path(__file__).parent
+    config_dir = package_dir / 'sample_configuration_files'
+
+    if not config_dir.exists():
+        print("sample_configuration_files directory not found in package directory.")
+        return
+
+    config_files = [f for f in os.listdir(config_dir) if f.endswith(".yml")]
     if not config_files:
-        print("No .yml configuration file found in the
+        print("No .yml configuration file found in the sample_configuration_files directory.")
         return
 
     if len(config_files) > 1:
-        print("More than one configuration file found -
+        print("More than one configuration file found in sample_configuration_files directory - please specify which one to use")
        return
 
-    config_file = config_files[0]
+    config_file = str(config_dir / config_files[0])
     print(f"Using configuration file: {config_file}")
 
     pelican = Pelican(config_file)
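The CLI no longer scans the current working directory; it now resolves .yml configs from the packaged sample_configuration_files folder. A minimal sketch of how a user could inspect that folder after a standard install (the glob pattern is an assumption; the CLI itself filters with endswith(".yml")):

    from pathlib import Path
    import pelican_nlp

    # locate the bundled sample configurations next to the installed package
    config_dir = Path(pelican_nlp.__file__).parent / 'sample_configuration_files'
    print(sorted(p.name for p in config_dir.glob('*.yml')))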
pelican_nlp/config.py
CHANGED
@@ -6,9 +6,30 @@ For consistency of pipeline, DO NOT CHANGE.
 """
 
 # Debug flag
-DEBUG_MODE =
+DEBUG_MODE = True
+
+# Test flag - set to True to run all example tests
+RUN_TESTS = False
 
 def debug_print(*args, **kwargs):
     """Print only if debug mode is enabled."""
+    DEBUG_MODE = True
     if DEBUG_MODE:
-        print(*args, **kwargs)
+        print(*args, **kwargs)
+
+def run_tests():
+    """Run all example tests if RUN_TESTS is enabled."""
+    if RUN_TESTS:
+        import unittest
+        from pathlib import Path
+
+        # Get the path to the test file
+        test_file = Path(__file__).parent / "utils" / "unittests" / "test_examples.py"
+
+        # Create a test suite and add the test file
+        loader = unittest.TestLoader()
+        suite = loader.discover(str(test_file.parent), pattern="test_examples.py")
+
+        # Run the tests
+        runner = unittest.TextTestRunner(verbosity=2)
+        runner.run(suite)
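Note that the added DEBUG_MODE = True inside debug_print rebinds the name locally, so the if DEBUG_MODE: check below it is always true regardless of the module-level flag. A minimal sketch of the presumably intended behavior, keeping the module-level flag authoritative:

    # a minimal sketch, assuming the module-level flag should gate output
    DEBUG_MODE = True

    def debug_print(*args, **kwargs):
        """Print only if debug mode is enabled."""
        if DEBUG_MODE:  # reads the module-level flag; no local rebinding
            print(*args, **kwargs)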
pelican_nlp/extraction/extract_embeddings.py
CHANGED
@@ -35,13 +35,60 @@ class EmbeddingsExtractor:
             outputs = self.model_instance(input_ids=inputs['input_ids'])
         else:
             # Handle RoBERTa and other models that accept **inputs
-
+            if isinstance(inputs, dict):
+                # Ensure inputs are on the same device as the model
+                inputs = {k: v.to(self.model_instance.device) for k, v in inputs.items()}
+                debug_print(f"Model inputs: {inputs}")
+                outputs = self.model_instance(**inputs, output_hidden_states=True)
+            else:
+                debug_print(f"Input type: {type(inputs)}")
+                debug_print(f"Input content: {inputs}")
+
+                # If inputs is a list of strings, convert to token IDs first
+                if isinstance(inputs, list):
+                    if isinstance(inputs[0], str):
+                        # Convert tokens to IDs
+                        token_ids = self.Tokenizer.tokenizer.convert_tokens_to_ids(inputs)
+                        debug_print(f"Token IDs: {token_ids}")
+                        inputs = torch.tensor([token_ids], device=self.model_instance.device)
+                    else:
+                        # If it's already a list of numbers, convert directly
+                        inputs = torch.tensor([inputs], device=self.model_instance.device)
+                else:
+                    # If it's already a tensor, just move to device
+                    inputs = inputs.to(self.model_instance.device)
+
+                debug_print(f"Final tensor shape: {inputs.shape}")
+
+                # Ensure proper shape
+                if len(inputs.shape) == 1:
+                    inputs = inputs.unsqueeze(0)  # Add batch dimension
+
+                # Create attention mask
+                attention_mask = torch.ones_like(inputs)
+                debug_print(f"Model inputs - input_ids: {inputs.shape}, attention_mask: {attention_mask.shape}")
+                outputs = self.model_instance(input_ids=inputs, attention_mask=attention_mask, output_hidden_states=True)
+                debug_print(f"Model outputs type: {type(outputs)}")
+                debug_print(f"Model outputs attributes: {dir(outputs)}")
 
         # Get word embeddings (last hidden state)
-
+        if outputs is None:
+            raise ValueError("Model returned None output")
+
+        if hasattr(outputs, 'hidden_states') and outputs.hidden_states is not None:
+            word_embeddings = outputs.hidden_states[-1]
+            debug_print(f"Using hidden_states, shape: {word_embeddings.shape}")
+        elif hasattr(outputs, 'last_hidden_state'):
+            word_embeddings = outputs.last_hidden_state
+            debug_print(f"Using last_hidden_state, shape: {word_embeddings.shape}")
+        else:
+            raise ValueError(f"Model output has neither hidden_states nor last_hidden_state. Available attributes: {dir(outputs)}")
 
         # Extract input_ids and convert them back to tokens
-
+        if isinstance(inputs, dict):
+            input_ids = inputs['input_ids'][0].tolist()
+        else:
+            input_ids = inputs[0].tolist()
         tokens = self.Tokenizer.tokenizer.convert_ids_to_tokens(input_ids)
 
         # Now align the tokens and embeddings
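The new fallback path boils down to: token list → ID tensor with a batch dimension → forward pass with an all-ones attention mask → last entry of hidden_states. A standalone sketch of that flow with Hugging Face transformers (the model choice mirrors the sample configs, which use xlm-roberta-base):

    import torch
    from transformers import AutoTokenizer, AutoModel

    tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
    model = AutoModel.from_pretrained("xlm-roberta-base")

    tokens = tokenizer.tokenize("ein kurzer Satz")
    input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])  # add batch dim
    attention_mask = torch.ones_like(input_ids)

    outputs = model(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True)
    word_embeddings = outputs.hidden_states[-1]  # shape: (1, seq_len, hidden_size)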
pelican_nlp/extraction/extract_logits.py
CHANGED
@@ -22,7 +22,12 @@ class LogitsExtractor:
         chunk_size = self.options['chunk_size']
         overlap_size = self.options['overlap_size']
 
-
+        # Convert list of token IDs to tensor if needed
+        if isinstance(tokens, list):
+            input_ids = torch.tensor([tokens], device=self.device)
+        else:
+            input_ids = tokens.to(self.device)
+
         chunks = self._split_into_chunks(input_ids, chunk_size, overlap_size)
 
         per_token_data = []
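_split_into_chunks itself is not part of this diff; a hypothetical illustration of overlapped chunking over the resulting (1, seq_len) tensor, assuming a stride of chunk_size - overlap_size (the real helper may differ):

    import torch

    def split_into_chunks(input_ids, chunk_size, overlap_size):
        # hypothetical sketch of overlapped windowing; not the package's implementation
        step = chunk_size - overlap_size
        return [input_ids[:, i:i + chunk_size]
                for i in range(0, input_ids.shape[1], step)]

    chunks = split_into_chunks(torch.arange(300).unsqueeze(0), chunk_size=128, overlap_size=64)
    print([c.shape for c in chunks])  # overlapping windows of at most 128 tokens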
pelican_nlp/extraction/language_model.py
CHANGED
@@ -4,7 +4,7 @@ import os
 import shutil
 
 from accelerate import init_empty_weights, infer_auto_device_map, dispatch_model
-from transformers import AutoModelForCausalLM
+from transformers import AutoModelForCausalLM, AutoModelForMaskedLM, AutoModel
 
 class Model:
     def __init__(self, model_name, project_path):
@@ -75,7 +75,6 @@ class Model:
 
         print(f'FastText model loaded successfully from {model_path}')
     elif self.model_name == 'xlm-roberta-base':
-        from transformers import AutoModel
         self.model_instance = AutoModel.from_pretrained(
             self.model_name,
             trust_remote_code=trust_remote_code,
pelican_nlp/main.py
CHANGED
@@ -25,9 +25,10 @@ from pelican_nlp.utils.setup_functions import subject_instantiator, load_config,
 from pelican_nlp.preprocessing import LPDS
 from pelican_nlp.utils.filename_parser import parse_lpds_filename
 
-from pelican_nlp.config import debug_print
+from pelican_nlp.config import debug_print, RUN_TESTS, run_tests
 
 project_path = '/home/yvespauli/PycharmProjects/PyPI_testing_fluency/config_fluency.yml'
+#project_path = '/home/yvespauli/PycharmProjects/PyPI_testing_discourse/config_discourse.yml'
 
 class Pelican:
 
@@ -40,12 +41,12 @@ class Pelican:
         # If no config path is provided, use the default config from package; used for dev-mode
         if config_path is None:
             package_dir = Path(__file__).parent
-            default_config = package_dir / '
+            default_config = package_dir / 'sample_configuration_files' / 'config_fluency.yml'
             if default_config.exists():
                 config_path = str(default_config)
                 print(f"Using default configuration file: {config_path}")
             else:
-                sys.exit('Error: Default configuration file not found in
+                sys.exit('Error: Default configuration file not found in sample_configuration_files folder.')
 
         # Verify the provided path is a YAML file
         elif not config_path.endswith(('.yml', '.yaml')):
@@ -72,12 +73,6 @@ class Pelican:
         """Execute the main processing pipeline."""
         self._clear_gpu_memory()
 
-        '''
-        #run unittests in dev_mode; not yet implemented
-        if self.dev_mode:
-            self._run_tests()
-        '''
-
         self._handle_output_directory()
 
         # Check/Create LPDS
@@ -229,4 +224,8 @@ class Pelican:
 
 
 if __name__ == '__main__':
-
+    if RUN_TESTS:
+        print("Running tests...")
+        run_tests()
+    else:
+        Pelican(project_path, dev_mode=True).run()
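Because main.py imports RUN_TESTS by value (from pelican_nlp.config import ...), the flag is frozen at import time; flipping it on the config module afterwards does not affect modules that already imported it. A sketch of the ordering that matters when driving this from your own script:

    # set the flag on the config module *before* pelican_nlp.main is imported,
    # otherwise main's RUN_TESTS keeps the value it saw at import time
    import pelican_nlp.config as cfg
    cfg.RUN_TESTS = True

    from pelican_nlp.main import Pelican  # binds RUN_TESTS now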
pelican_nlp/preprocessing/text_tokenizer.py
CHANGED
@@ -24,7 +24,8 @@ class TextTokenizer:
             # Tokenize using the model's tokenizer
             return self.tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=self.max_sequence_length).to(self.device_used)
         elif method == 'model':
-
+            # For model method, return token IDs directly
+            return self.tokenizer.encode(text, add_special_tokens=True)
         else:
             raise ValueError(f"Unsupported tokenization method: {method}")
 
@@ -34,10 +35,14 @@ class TextTokenizer:
     def get_tokenizer(self):
         if self.tokenization_method == 'model' or self.tokenization_method == 'model_roberta':
             from transformers import AutoTokenizer
+            if not self.model_name:
+                raise ValueError("model_name must be provided for model-based tokenization methods")
             return AutoTokenizer.from_pretrained(
                 self.model_name,
                 trust_remote_code=False,  # Don't execute arbitrary model code
                 use_safetensors=True
             )
-
+        elif self.tokenization_method == 'whitespace':
             return None
+        else:
+            raise ValueError(f"Unsupported tokenization method: {self.tokenization_method}")
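With the 'model' method now returning plain token IDs, the two model-based paths yield different types: 'model_instance' returns a dict of tensors, while 'model' returns a list of ints. A quick check of the new 'model' behavior (xlm-roberta-base as in the sample configs):

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
    ids = tokenizer.encode("ein kurzer Satz", add_special_tokens=True)
    print(ids)                                   # plain list of ints, incl. special tokens
    print(tokenizer.convert_ids_to_tokens(ids))  # round-trip back to tokens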
pelican_nlp/project_graph/graph_visualization.py
ADDED
@@ -0,0 +1,109 @@
+#!/usr/bin/env python3
+"""
+Graph Visualization for Pelican-nlp Project
+===========================================
+
+This script creates a visual representation of the Pelican-nlp project structure
+using graphviz.
+"""
+
+from graphviz import Digraph
+
+def create_pelican_graph():
+    # Create a new directed graph
+    dot = Digraph(comment='Pelican-nlp Project Structure')
+    dot.attr(rankdir='TB')
+
+    # Set node styles
+    dot.attr('node', shape='box', style='rounded,filled')
+
+    # Main Components
+    with dot.subgraph(name='cluster_main') as c:
+        c.attr(label='Main Components')
+        c.attr('node', fillcolor='lightblue')
+        c.node('Pelican', 'Pelican\n(Main Controller)')
+        c.node('LPDS', 'LPDS\n(Data Structure)')
+        c.node('Corpus', 'Corpus\n(Document Collection)')
+        c.node('Subject', 'Subject\n(Grouping Unit)')
+        c.node('Document', 'Document\n(Data Container)')
+        c.node('AudioDocument', 'AudioDocument\n(Audio Data)')
+
+    # Core Processing
+    with dot.subgraph(name='cluster_core') as c:
+        c.attr(label='Core Processing')
+        c.attr('node', fillcolor='lightgreen')
+        c.node('Config', 'Configuration\n(config.py)')
+        c.node('CLI', 'Command Line Interface\n(cli.py)')
+        c.node('Main', 'Main Entry Point\n(main.py)')
+
+    # Preprocessing Components
+    with dot.subgraph(name='cluster_preprocessing') as c:
+        c.attr(label='Preprocessing')
+        c.attr('node', fillcolor='lightyellow')
+        c.node('TextTokenizer', 'Text Tokenizer\n(text_tokenizer.py)')
+        c.node('TextNormalizer', 'Text Normalizer\n(text_normalizer.py)')
+        c.node('TextCleaner', 'Text Cleaner\n(text_cleaner.py)')
+        c.node('TextImporter', 'Text Importer\n(text_importer.py)')
+        c.node('SpeakerDiarization', 'Speaker Diarization\n(speaker_diarization.py)')
+        c.node('Pipeline', 'Preprocessing Pipeline\n(pipeline.py)')
+
+    # Extraction Components
+    with dot.subgraph(name='cluster_extraction') as c:
+        c.attr(label='Feature Extraction')
+        c.attr('node', fillcolor='lightpink')
+        c.node('LogitsExtractor', 'Logits Extractor\n(extract_logits.py)')
+        c.node('EmbeddingsExtractor', 'Embeddings Extractor\n(extract_embeddings.py)')
+        c.node('LanguageModel', 'Language Model\n(language_model.py)')
+        c.node('AcousticFeatures', 'Acoustic Features\n(acoustic_feature_extraction.py)')
+        c.node('SemanticSimilarity', 'Semantic Similarity\n(semantic_similarity.py)')
+        c.node('RandomnessDistance', 'Distance from Randomness\n(distance_from_randomness.py)')
+
+    # Utility Components
+    with dot.subgraph(name='cluster_utils') as c:
+        c.attr(label='Utilities')
+        c.attr('node', fillcolor='lightgrey')
+        c.node('FilenameParser', 'Filename Parser\n(filename_parser.py)')
+        c.node('CSVFunctions', 'CSV Functions\n(csv_functions.py)')
+        c.node('SetupFunctions', 'Setup Functions\n(setup_functions.py)')
+
+    # Main Relationships
+    dot.edge('Pelican', 'LPDS', 'manages')
+    dot.edge('Pelican', 'Corpus', 'processes')
+    dot.edge('Pelican', 'Subject', 'instantiates')
+    dot.edge('Corpus', 'Document', 'contains')
+    dot.edge('Subject', 'Document', 'groups')
+    dot.edge('Document', 'AudioDocument', 'extends')
+
+    # Core Processing Relationships
+    dot.edge('CLI', 'Main', 'calls')
+    dot.edge('Main', 'Pelican', 'instantiates')
+    dot.edge('Pelican', 'Config', 'uses')
+
+    # Preprocessing Relationships
+    dot.edge('Pipeline', 'TextTokenizer', 'uses')
+    dot.edge('Pipeline', 'TextNormalizer', 'uses')
+    dot.edge('Pipeline', 'TextCleaner', 'uses')
+    dot.edge('Pipeline', 'TextImporter', 'uses')
+    dot.edge('Pipeline', 'SpeakerDiarization', 'uses')
+    dot.edge('Corpus', 'Pipeline', 'executes')
+
+    # Extraction Relationships
+    dot.edge('Corpus', 'LogitsExtractor', 'uses')
+    dot.edge('Corpus', 'EmbeddingsExtractor', 'uses')
+    dot.edge('LogitsExtractor', 'LanguageModel', 'uses')
+    dot.edge('EmbeddingsExtractor', 'LanguageModel', 'uses')
+    dot.edge('Corpus', 'AcousticFeatures', 'uses')
+    dot.edge('Corpus', 'SemanticSimilarity', 'uses')
+    dot.edge('Corpus', 'RandomnessDistance', 'uses')
+
+    # Utility Relationships
+    dot.edge('Pelican', 'FilenameParser', 'uses')
+    dot.edge('Corpus', 'CSVFunctions', 'uses')
+    dot.edge('Pelican', 'SetupFunctions', 'uses')
+
+    # Save the graph
+    dot.render('pelican_structure_detailed', format='png', cleanup=True)
+    print("Detailed graph visualization has been created as 'pelican_structure_detailed.png'")
+
+if __name__ == '__main__':
+    create_pelican_graph()
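Rendering needs both the graphviz Python package and the system Graphviz binaries (the dot executable) on PATH; render(..., cleanup=True) writes the PNG and removes the intermediate DOT source. Minimal usage, assuming both are installed:

    from pelican_nlp.project_graph.graph_visualization import create_pelican_graph

    create_pelican_graph()  # writes pelican_structure_detailed.png to the working directory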
pelican_nlp/sample_configuration_files/config_discourse.yml
CHANGED
@@ -7,12 +7,15 @@ discourse: &discourse_flag true
 #general configurations; always adapt
 language: "german" # Possibly add options for German and English
 
-task_name: "interview"
-
+task_name: "interview"
+
+#Create analysis corpus, group files based on corpus entity.
+corpus_key: "acq"
+corpus_values: #group names
   - "placebo"
   - "schizophrenia"
 
-metric_to_extract: "
+metric_to_extract: "logits" #Possible options: 'logits' or 'embeddings'
 
 number_of_speakers: 3
 subject_speakertag: "B"
@@ -24,7 +27,7 @@ cleaning_options:
   general_cleaning: true # General cleaning options used for most text preprocessing, default: True.
   remove_brackets_and_bracketcontent: true
   remove_timestamps: true
-  timestamp_pattern_example: "#00:00:19-
+  timestamp_pattern_example: "#00:00:19-00#"
   remove_punctuation: false
   lowercase: false
   #Options for fluency tasks
@@ -44,10 +47,10 @@ options_logits:
   keep_speakertags: true
 
 options_embeddings:
-  tokenization_method: "
+  tokenization_method: "model" #"model" or "whitespace"
   max_length: 512 #max sequence length
-  model_name: "
-  pytorch_based_model:
+  model_name: "xlm-roberta-base" #e.g. "fastText", "xlm-roberta-base"
+  pytorch_based_model: true
   method: "model_instance"
   remove_punctuation: false
   lowercase: false
@@ -60,6 +63,7 @@ options_embeddings:
     remove_punctuation_and_symbols: true
     remove_brackets_and_content: true
   semantic-similarity: false
+  distance-from-randomness: false
   window_size: null
   clean_tokens: false
   divergence_from_optimality: false
@@ -93,6 +97,9 @@ normalization_options:
   method: "lemmatization" #Options: lemmatization or stemming
 #================================================================
 
+create_aggregation_of_results: false
+output_document_information: false
+
 #Detail configurations; Changes optional, mostly used for quality checking / error handling
 number_of_subjects: null # Specify number of subjects; if 'null', number of subjects is automatically detected
 multiple_sessions: false # Set to True if multiple sessions per subject
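The new corpus_key/corpus_values pair replaces the previous flat corpus list: files are grouped by the value of a single filename entity (here "acq"). A sketch of how the grouping pairs fall out of the YAML (the entity-in-filename interpretation is an assumption based on the LPDS-style names used elsewhere in the package):

    import yaml

    with open("config_discourse.yml") as f:
        cfg = yaml.safe_load(f)

    groups = [(cfg["corpus_key"], value) for value in cfg["corpus_values"]]
    print(groups)  # [('acq', 'placebo'), ('acq', 'schizophrenia')]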
pelican_nlp/sample_configuration_files/config_fluency.yml
CHANGED
@@ -8,7 +8,8 @@ fluency_task: &fluency_flag true
 language: "german"
 multiple_sessions: &session_flag false
 
-
+corpus_key: "acq"
+corpus_values: #names of fluency tasks (e.g. "animals", "clothes")
   - "animals"
   - "clothes"
   - "food"
@@ -103,5 +104,3 @@ filename_components:
   metric: true
   additional_tags: []
 
-
-
pelican_nlp/sample_configuration_files/config_general.yml
CHANGED
@@ -5,33 +5,38 @@
 # -------------
 input_file: "text" # Options: 'text' or 'audio'
 language: "german"
-recompute_everything: true
+recompute_everything: true # If false will give warning if output folder already exists
 
 # Task Configuration
 # -----------------
-task_name: # Name of task used for creation of data
+task_name: null # Name of task used for creation of data
 fluency_task: &fluency_flag false # Flag for fluency-specific settings
 discourse: &discourse_flag false # Flag for discourse-specific settings
-
+
+# Corpus Configuration
+# ------------------
+corpus_key: null # Entity key to group files for analysis
+corpus_values: # Corresponding entity values found in dataset
   - "healthy-control"
+  - "placebo"
 
 # Session and Subject Settings
 # --------------------------
 multiple_sessions: false
 number_of_subjects: null # If null, auto-detected
-number_of_speakers: 1
+number_of_speakers: 1 # Specify amount of speakers for discourse files
 subject_speakertag: null # Speaker tag for subject (e.g., "B"), only for discourse
 
 # Document Structure
 # ----------------
 has_multiple_sections: false
 has_section_titles: false
-section_identification: null # e.g., "Section:"
-number_of_sections:
+section_identification: null # e.g., "Section:", in case of multiple sections
+number_of_sections: null # If null, auto-detected, specify for multiple sections to check section detection
 
 # Processing Pipeline
 # -----------------
-pipeline_options:
+pipeline_options: # Just for data preprocessing without metric extraction
   quality_check: false
   clean_text: true
   tokenize_text: false
@@ -40,18 +45,17 @@ pipeline_options:
 # Metric Extraction
 # ---------------
 metric_to_extract: "embeddings" # Options: 'embeddings', 'logits'
-
-extract_embeddings: true
+output_document_information: true
 
 # Cleaning Options
 # --------------
 cleaning_options:
-  general_cleaning: true
+  general_cleaning: true # General cleaning applied to most datasets, check specifications in section "general_cleaning_options"
   remove_punctuation: false
   lowercase: true
   remove_brackets_and_bracketcontent: false
   remove_timestamps: false
-  timestamp_pattern_example: null # e.g., "#00:00:23-00#"
+  timestamp_pattern_example: null # e.g., "#00:00:23-00#", only if remove_timestamps = True
   # Fluency-specific options
   fluency_task: *fluency_flag
   word_splitter: ';'
@@ -74,8 +78,8 @@ options_embeddings:
   method: "model_instance"
   max_length: 512
   clean_embedding_tokens: true
-
-
+  remove_punctuation_embeddings: false
+  lowercase_embeddings: false
   keep_speakertags: false
   semantic-similarity: true
   window_size: null
@@ -118,6 +122,14 @@ options_dis_from_randomness:
 normalization_options:
   method: "lemmatization" # Options: 'lemmatization', 'stemming'
 
+# Document Information Output
+# -------------------------
+document_information_output:
+  parameters:
+    - subject_ID
+    - fluency_word_count
+    - fluency_duplicate_count
+
 # Filename Configuration
 # --------------------
 filename_components:
@@ -125,6 +137,10 @@ filename_components:
   session: false
   task: true # mandatory
   task_addition: false
-  corpus: true #mandatory
+  corpus: true # mandatory
   metric: true
   additional_tags: []
+
+# Additional Settings
+# -----------------
+create_aggregation_of_results: true
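The &fluency_flag / *fluency_flag pair is a plain YAML anchor and alias, so the top-level flag propagates into cleaning_options without repetition:

    import yaml

    doc = "fluency_task: &fluency_flag false\ncleaning_options:\n  fluency_task: *fluency_flag\n"
    print(yaml.safe_load(doc))
    # {'fluency_task': False, 'cleaning_options': {'fluency_task': False}}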
pelican_nlp/utils/unittests/examples/example_discourse/config_discourse.yml
ADDED
@@ -0,0 +1,109 @@
+# Configuration file for discourse protocols
+#=======================================
+input_file: "text" #or 'audio'
+discourse: &discourse_flag true
+#=====================================
+
+#general configurations; always adapt
+language: "german" # Possibly add options for German and English
+
+task_name: "interview"
+
+#Create analysis corpus, group files based on corpus entity.
+corpus_key: "acq"
+corpus_values: #group names
+  - "placebo"
+  - "schizophrenia"
+
+metric_to_extract: "embeddings" #Possible options: 'logits' or 'embeddings'
+
+number_of_speakers: 2
+subject_speakertag: "B"
+#=========================================================
+
+#Optional configurations; Change with preference. However, default settings recommended
+fluency_task: &fluency_flag false
+cleaning_options:
+  general_cleaning: true # General cleaning options used for most text preprocessing, default: True.
+  remove_brackets_and_bracketcontent: true
+  remove_timestamps: true
+  timestamp_pattern_example: "#00:00:19-00#"
+  remove_punctuation: false
+  lowercase: false
+  #Options for fluency tasks
+  fluency_task: *fluency_flag
+  word_splitter: null
+  remove_hyphens: null
+  remove_duplicates: null
+
+options_logits:
+  chunk_size: 128
+  overlap_size: 64
+  tokenization_method: "model"
+  #method: "model_instance" # Options: model, regex, nltk, etc.
+  model_name: "xlm-roberta-base" #Replace with your model instance name, e.g. "DiscoResearch/Llama3-German-8B-32k"
+  remove_punctuation: true
+  lowercase: true
+  keep_speakertags: true
+
+options_embeddings:
+  tokenization_method: "model" #"model" or "whitespace"
+  max_length: 512 #max sequence length
+  model_name: "xlm-roberta-base" #e.g. "fastText", "xlm-roberta-base"
+  pytorch_based_model: true
+  method: "model_instance"
+  remove_punctuation: false
+  lowercase: false
+  keep_speakertags: true
+  clean_embedding_tokens: true
+  output_options:
+    exclude_special_tokens: true
+    remove_'_'_character: true
+    remove_speaker_labels: true
+    remove_punctuation_and_symbols: true
+    remove_brackets_and_content: true
+  semantic-similarity: false
+  distance-from-randomness: false
+  window_size: null
+  clean_tokens: false
+  divergence_from_optimality: false
+#================================================================================
+
+#Extra configurations:
+pipeline_options:
+  quality_check: false
+  clean_text: true
+  tokenize_text: false
+  normalize_text: false
+
+general_cleaning_options:
+  strip_whitespace: true
+  merge_multiple_whitespaces: true
+  remove_whitespace_before_punctuation: true
+  merge_newline_characters: true
+  remove_backslashes: true
+
+has_multiple_sections: false #evaluated independently
+has_section_titles: false
+section_identification: null #e.g. "Section:", 'null' if file does not have multiple sections, use pattern that is unlikely to appear in rest of transcript
+number_of_sections: null #if 'null' number of sections automatically detected, however, specifying number recommended if known.
+
+# Options for extract_embeddings
+window_sizes: [2]
+metric_function: cosine_similarity
+aggregation_functions: mean_of_means
+
+normalization_options:
+  method: "lemmatization" #Options: lemmatization or stemming
+#================================================================
+
+create_aggregation_of_results: false
+output_document_information: false
+
+#Detail configurations; Changes optional, mostly used for quality checking / error handling
+number_of_subjects: null # Specify number of subjects; if 'null', number of subjects is automatically detected
+multiple_sessions: false # Set to True if multiple sessions per subject
+
+recompute_everything: true #If set to 'false' pelican-nlp will try to reuse previously computed results stored on your drive
+
+
pelican_nlp/utils/unittests/examples/example_fluency/config_fluency.yml
ADDED
@@ -0,0 +1,106 @@
+# Configuration file for fluency task
+# =======================================
+input_file: "text" #or 'audio'
+fluency_task: &fluency_flag true
+#========================================
+
+#general configurations; always adapt
+language: "german"
+multiple_sessions: &session_flag false
+
+corpus_key: "acq"
+corpus_values: #names of fluency tasks (e.g. "animals", "clothes")
+  - "animals"
+  - "clothes"
+  - "food"
+
+#Specify linguistic metrics to extract
+metric_to_extract: 'embeddings' #Possible options: 'embeddings', 'logits'
+output_document_information: true
+#====================================================================
+
+#Optional configurations; Change with preference. However, default settings recommended
+cleaning_options:
+  general_cleaning: true
+  #Options for fluency tasks
+  fluency_task: *fluency_flag
+  word_splitter: ';' #default split with ',' add different word_splitter if necessary
+  remove_hyphens: true
+  remove_duplicates: false
+  lowercase: false
+  #Optional cleaning
+  remove_brackets_and_bracketcontent: false #default 'false'
+  remove_timestamps: false #default 'false'
+  timestamp_pattern_example: null #e.g. "#00:00:23-00#"
+  remove_punctuation: false #Careful!: If set to true word_splitter might be removed
+
+options_embeddings:
+  tokenization_method: "whitespace" #or "model"
+  model_name: "fastText" #e.g. "fastText", "xlm-roberta-base"
+  pytorch_based_model: false
+  method: "model_instance"
+  max_length: null
+  clean_embedding_tokens: true
+
+  semantic-similarity: true
+  distance-from-randomness: false
+
+options_dis_from_randomness:
+  window_size: 8
+  min_len: null
+  bootstrap: 10000
+  shuffle_mode: 'include0_includeN'
+  parallel_computing: false #not yet set up
+
+options_semantic-similarity:
+  window_sizes: #'all' or window size as integer
+    - 2
+    - 8
+#==================================================================
+
+#Extra configurations;
+task_name: "fluency"
+create_aggregation_of_results: true
+
+pipeline_options:
+  quality_check: false
+  clean_text: true
+  tokenize_text: false
+  normalize_text: false
+
+general_cleaning_options:
+  strip_whitespace: true
+  merge_multiple_whitespaces: true
+  remove_whitespace_before_punctuation: true
+  merge_newline_characters: true
+  remove_backslashes: true
+
+has_multiple_sections: false
+has_section_titles: false
+section_identification: null
+number_of_sections: 1
+number_of_speakers: 1
+discourse: false
+
+document_information_output:
+  parameters:
+    - subject_ID
+    - fluency_word_count
+    - fluency_duplicate_count
+
+#================================================================
+
+#Detail configurations; Changes optional, mostly used for quality checking / error handling
+recompute_everything: true
+number_of_subjects: null
+
+# Filename components configuration
+filename_components:
+  subject: true # mandatory
+  session: *session_flag
+  task: true # mandatory
+  task_addition: false
+  corpus: true # typically true for fluency tasks (e.g., "animals", "clothes")
+  metric: true
+  additional_tags: []
+
pelican_nlp/utils/unittests/examples/example_image-descriptions/config_image-descriptions.yml
ADDED
@@ -0,0 +1,135 @@
+# Master Configuration File
+# ========================
+
+# Basic Settings
+# -------------
+input_file: "text" # Options: 'text' or 'audio'
+language: "german" # Options: 'german', 'english'
+recompute_everything: true # If false, reuses previously computed results
+
+# Task Configuration
+# -----------------
+task_name: "image-description" # Options: 'fluency', 'interview'
+fluency_task: &fluency_flag false # Flag for fluency-specific settings
+discourse: &discourse_flag false # Flag for discourse-specific settings
+corpus_names: # List of task corpora
+  - "placebo"
+  - "drug"
+
+# Session and Subject Settings
+# --------------------------
+multiple_sessions: true
+number_of_subjects: null # If null, auto-detected
+number_of_speakers: 1
+subject_speakertag: null # Speaker tag for subject (e.g., "B")
+
+# Document Structure
+# ----------------
+has_multiple_sections: true
+has_section_titles: true
+section_identification: "Bild:" # e.g., "Section:"
+number_of_sections: null # If null, auto-detected
+
+# Processing Pipeline
+# -----------------
+pipeline_options:
+  quality_check: false
+  clean_text: true
+  tokenize_text: false
+  normalize_text: false
+
+# Metric Extraction
+# ---------------
+metric_to_extract: "embeddings" # Options: 'embeddings', 'logits'
+extract_logits: null
+extract_embeddings: true
+
+# Cleaning Options
+# --------------
+cleaning_options:
+  general_cleaning: true
+  remove_punctuation: false
+  lowercase: false
+  remove_brackets_and_bracketcontent: true
+  remove_timestamps: false
+  timestamp_pattern_example: null # e.g., "#00:00:23-00#"
+  # Fluency-specific options
+  fluency_task: *fluency_flag
+  word_splitter: ';'
+  remove_hyphens: false
+  remove_duplicates: false
+
+general_cleaning_options:
+  strip_whitespace: true
+  merge_multiple_whitespaces: true
+  remove_whitespace_before_punctuation: true
+  merge_newline_characters: true
+  remove_backslashes: true
+
+# Embedding Options
+# ---------------
+options_embeddings:
+  tokenization_method: "model_roberta" #or "whitespace", "model"
+  max_length: 512 #max sequence length
+  model_name: "xlm-roberta-base" #e.g. "fastText", "xlm-roberta-base"
+  pytorch_based_model: true
+  method: "model_instance"
+  max_length: 512
+  clean_embedding_tokens: true
+  remove_punctuation: false
+  lowercase: false
+  keep_speakertags: false
+  semantic-similarity: false
+  window_size: null
+  clean_tokens: true
+  distance-from-randomness: false
+  output_options:
+    exclude_special_tokens: true
+    remove_'_'_character: true
+    remove_speaker_labels: true
+    remove_punctuation_and_symbols: true
+    remove_brackets_and_content: true
+
+# Logits Options
+# -------------
+options_logits:
+  chunk_size: 128
+  overlap_size: 64
+  tokenization_method: "model"
+  model_name: "DiscoResearch/Llama3-German-8B-32k"
+  remove_punctuation: true
+  lowercase: true
+  keep_speakertags: true
+
+# Analysis Options
+# --------------
+options_semantic-similarity:
+  window_sizes: # 'all' or window size as integer
+    - 2
+    - 8
+
+options_dis_from_randomness:
+  window_size: 8
+  min_len: null
+  bootstrap: 10000
+  shuffle_mode: 'include0_includeN'
+  parallel_computing: false
+
+# Normalization Options
+# -------------------
+normalization_options:
+  method: "lemmatization" # Options: 'lemmatization', 'stemming'
+
+# Filename Configuration
+# --------------------
+filename_components:
+  subject: true # mandatory
+  session: false
+  task: true # mandatory
+  task_addition: false
+  corpus: true
+  metric: true
+  additional_tags: []
+
+create_aggregation_of_results: false
+output_document_information: false
pelican_nlp/utils/unittests/test_examples.py
ADDED
@@ -0,0 +1,211 @@
+import unittest
+import os
+import yaml
+from pathlib import Path
+import shutil
+import tempfile
+import json
+import subprocess
+import sys
+import logging
+import signal
+from contextlib import contextmanager
+
+# Add the project root to the Python path
+project_root = Path(__file__).parent.parent.parent
+sys.path.append(str(project_root))
+
+from pelican_nlp.config import DEBUG_MODE, debug_print
+
+class TimeoutError(Exception):
+    pass
+
+@contextmanager
+def timeout(seconds):
+    def signal_handler(signum, frame):
+        raise TimeoutError(f"Timed out after {seconds} seconds")
+
+    # Register the signal handler
+    signal.signal(signal.SIGALRM, signal_handler)
+    signal.alarm(seconds)
+
+    try:
+        yield
+    finally:
+        # Disable the alarm
+        signal.alarm(0)
+
+class TestExamples(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        debug_print("Setting up test environment...")
+        # Create a temporary directory for test outputs
+        cls.test_dir = tempfile.mkdtemp()
+        cls.examples_dir = Path(__file__).parent / "examples"
+
+        # Load all example configurations
+        cls.examples = {}
+        for example_type in ["fluency", "discourse", "image-descriptions"]:
+            example_dir = cls.examples_dir / f"example_{example_type}"
+            config_path = example_dir / f"config_{example_type}.yml"
+
+            debug_print(f"Loading configuration for {example_type}...")
+            if not config_path.exists():
+                debug_print(f"Warning: Config file not found: {config_path}")
+                continue
+
+            with open(config_path, 'r') as f:
+                config = yaml.safe_load(f)
+
+            cls.examples[example_type] = {
+                "config_path": config_path,
+                "config": config,
+                "example_dir": example_dir
+            }
+
+    @classmethod
+    def tearDownClass(cls):
+        debug_print("Cleaning up test environment...")
+        # Clean up temporary directory
+        shutil.rmtree(cls.test_dir)
+
+    def setUp(self):
+        # Create a fresh output directory for each test
+        self.output_dir = Path(self.test_dir) / "test_output"
+        self.output_dir.mkdir(exist_ok=True)
+
+    def run_pelican_pipeline(self, example_dir, config_path, output_dir):
+        """Run the pelican pipeline with the given configuration file"""
+        debug_print(f"Running pipeline with config: {config_path}")
+        try:
+            # Change to the example directory before running the command
+            original_dir = os.getcwd()
+            os.chdir(example_dir)
+
+            # Print current directory and files
+            debug_print(f"Current directory: {os.getcwd()}")
+            debug_print("Files in current directory:")
+            for f in os.listdir('.'):
+                debug_print(f"  - {f}")
+
+            # Run pelican-run with the configuration file and timeout
+            with timeout(300):  # 5 minute timeout
+                # Use run with real-time output
+                process = subprocess.run(
+                    ["pelican-run", "--config", str(config_path), "--output", str(output_dir)],
+                    stdout=subprocess.PIPE,
+                    stderr=subprocess.PIPE,
+                    text=True,
+                    check=True
+                )
+
+                # Print output after completion
+                if process.stdout:
+                    print("Pipeline output:")
+                    print(process.stdout)
+                if process.stderr:
+                    print("Pipeline errors:")
+                    print(process.stderr)
+
+            # Change back to original directory
+            os.chdir(original_dir)
+
+            debug_print("Pipeline completed successfully")
+            return True, "Pipeline completed successfully"
+        except TimeoutError as e:
+            os.chdir(original_dir)
+            debug_print(f"Pipeline timed out: {str(e)}")
+            return False, f"Error: Pipeline timed out after 5 minutes"
+        except subprocess.CalledProcessError as e:
+            # Change back to original directory even if there's an error
+            os.chdir(original_dir)
+            debug_print(f"Pipeline failed with exit code {e.returncode}")
+            if e.stdout:
+                print("Pipeline output:")
+                print(e.stdout)
+            if e.stderr:
+                print("Pipeline errors:")
+                print(e.stderr)
+            return False, f"Error: Pipeline failed with exit code {e.returncode}"
+        except Exception as e:
+            os.chdir(original_dir)
+            debug_print(f"Unexpected error: {str(e)}")
+            return False, f"Error: {str(e)}"
+
+    def test_discourse_example(self):
+        """Test running the discourse example through the pipeline"""
+        debug_print("Testing discourse example...")
+        if "discourse" not in self.examples:
+            self.skipTest("Discourse example configuration not found")
+
+        example = self.examples["discourse"]
+        output_dir = self.output_dir / "discourse"
+        output_dir.mkdir(exist_ok=True)
+
+        success, output = self.run_pelican_pipeline(
+            example["example_dir"],
+            example["config_path"],
+            output_dir
+        )
+        self.assertTrue(success, f"Pipeline failed: {output}")
+
+        # Verify output files were created
+        self.assertTrue(output_dir.exists())
+        self.assertTrue(len(list(output_dir.glob("*"))) > 0)
+        debug_print("Discourse example test completed")
+
+    def test_fluency_example(self):
+        """Test running the fluency example through the pipeline"""
+        debug_print("Testing fluency example...")
+        if "fluency" not in self.examples:
+            self.skipTest("Fluency example configuration not found")
+
+        example = self.examples["fluency"]
+        output_dir = self.output_dir / "fluency"
+        output_dir.mkdir(exist_ok=True)
+
+        success, output = self.run_pelican_pipeline(
+            example["example_dir"],
+            example["config_path"],
+            output_dir
+        )
+        self.assertTrue(success, f"Pipeline failed: {output}")
+
+        # Verify output files were created
+        self.assertTrue(output_dir.exists())
+        self.assertTrue(len(list(output_dir.glob("*"))) > 0)
+        debug_print("Fluency example test completed")
+
+    def test_image_descriptions_example(self):
+        """Test running the image descriptions example through the pipeline"""
+        debug_print("Testing image descriptions example...")
+        if "image-descriptions" not in self.examples:
+            self.skipTest("Image descriptions example configuration not found")
+
+        example = self.examples["image-descriptions"]
+        output_dir = self.output_dir / "image-descriptions"
+        output_dir.mkdir(exist_ok=True)
+
+        success, output = self.run_pelican_pipeline(
+            example["example_dir"],
+            example["config_path"],
+            output_dir
+        )
+        self.assertTrue(success, f"Pipeline failed: {output}")
+
+        # Verify output files were created
+        self.assertTrue(output_dir.exists())
+        self.assertTrue(len(list(output_dir.glob("*"))) > 0)
+        debug_print("Image descriptions example test completed")
+
+def suite():
+    """Create a test suite with all test cases"""
+    suite = unittest.TestSuite()
+    suite.addTest(TestExamples('test_discourse_example'))
+    suite.addTest(TestExamples('test_fluency_example'))
+    suite.addTest(TestExamples('test_image_descriptions_example'))
+    return suite
+
+if __name__ == '__main__':
+    runner = unittest.TextTestRunner(verbosity=2)
+    runner.run(suite())
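signal.SIGALRM is only available on Unix, so the timeout context manager above will fail on Windows; subprocess.run has a built-in timeout parameter that is portable. A sketch of the alternative (the pelican-run --config/--output invocation is taken from the test above):

    import subprocess

    try:
        subprocess.run(
            ["pelican-run", "--config", "config_fluency.yml", "--output", "out"],
            timeout=300,  # seconds; raises instead of hanging
            check=True,
        )
    except subprocess.TimeoutExpired:
        print("Pipeline timed out after 5 minutes")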
{pelican_nlp-0.3.2.dist-info → pelican_nlp-0.3.3.dist-info}/RECORD
CHANGED
@@ -1,8 +1,8 @@
 pelican_nlp/__init__.py,sha256=TD5xjKeXXAH6nUWG-6igbClgovi5r8RIEqI_ix1QeYo,204
-pelican_nlp/_version.py,sha256=
-pelican_nlp/cli.py,sha256=
-pelican_nlp/config.py,sha256=
-pelican_nlp/main.py,sha256=
+pelican_nlp/_version.py,sha256=G3p10uWMvNQiA3ZdxMz0QlmyKECmnauS0Ym9wMP2tEI,21
+pelican_nlp/cli.py,sha256=Z11qwivHvuly07FAcEG8-Fl6_MqGauriZ8U5iZWf4lc,1116
+pelican_nlp/config.py,sha256=LuZnuaq0Z49FgRgKJ7F6mwl1yr60QQDfMtD29ocbKfw,1000
+pelican_nlp/main.py,sha256=CAYVrOHOG1gIJ_WkjlYeXUQPNvsNbAGDd0we92Z0sGI,8784
 pelican_nlp/Nils_backup/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pelican_nlp/Nils_backup/extract_acoustic_features.py,sha256=eSP8lXxbZ15YE1HqxGtma9uWOcSN-fI-ig-NwQ9eOA8,10771
 pelican_nlp/Nils_backup/speaker_diarization_Nils.py,sha256=3RIhjKihu4Z1rruMt9KESFE2lqesfzIpRr7rLummUEo,10219
@@ -45,9 +45,9 @@ pelican_nlp/core/subject.py,sha256=Jx99vPn0K0KT_9BsJOY8XviFU_GuZGuwtb1rbLNkiUI,1
 pelican_nlp/extraction/__init__.py,sha256=hfqFiaKpQBS6cwRm9Yd7MpOcV60_xJmwuQ2Kegary5k,84
 pelican_nlp/extraction/acoustic_feature_extraction.py,sha256=Ol6fqyy94Iym1Z-eTVoz8EmqfV58boz5WAoamAK7JVE,2323
 pelican_nlp/extraction/distance_from_randomness.py,sha256=yikZ3GK2dqpzuNFPVsjuUK0lo6kHOIoIhKPaVrGXRMQ,3365
-pelican_nlp/extraction/extract_embeddings.py,sha256=
-pelican_nlp/extraction/extract_logits.py,sha256=
-pelican_nlp/extraction/language_model.py,sha256=
+pelican_nlp/extraction/extract_embeddings.py,sha256=LobzYEtjOpI_SvMZUb0u3QiOyZ2gPQD9bjQI9qwaogw,5719
+pelican_nlp/extraction/extract_logits.py,sha256=4r8KbsqejD3UR3gCAAjm_sQhBkz8t_ePkv30bVGZg18,4116
+pelican_nlp/extraction/language_model.py,sha256=EZE5bl-7pzPKoBIM9rnk7MJSFdMO6iQVnWmpQQsP8MU,5662
 pelican_nlp/extraction/semantic_similarity.py,sha256=QhY5CAOAorxEo3UBWPlMegFvbySF0KH6j4j3m2I3_NY,2552
 pelican_nlp/extraction/test_documents/test_features.csv,sha256=LR_3m4vIm-YWKw5gI5ziswhS-NF9VhKv14c2udLxtJU,488482
 pelican_nlp/extraction/test_documents/wallace_1.15_3.txt,sha256=ShXxOHUZzGPNUqIcOn6-OYkarzNtTC22V05a_Xpvtlw,3731
@@ -62,18 +62,23 @@ pelican_nlp/preprocessing/speaker_diarization.py,sha256=N6dZCa2AHHGw__g9e-ZUyZM_
 pelican_nlp/preprocessing/text_cleaner.py,sha256=QKqxwoRR8dnuBYiY-PXK1kB7744TVUcUMJb7dbKvXGk,7512
 pelican_nlp/preprocessing/text_importer.py,sha256=FtSyJjFXDxVle7Jpyw6EqCLDbLTCRxqVQi9ymWWtPB4,1356
 pelican_nlp/preprocessing/text_normalizer.py,sha256=huo5VFqJ0p2jq-ud1047XvMu1qNeaiuG879SF3zkJoM,894
-pelican_nlp/preprocessing/text_tokenizer.py,sha256=
-pelican_nlp/
-pelican_nlp/sample_configuration_files/
-pelican_nlp/sample_configuration_files/
+pelican_nlp/preprocessing/text_tokenizer.py,sha256=vUYayGLtMHTtJunTaEgiqjxJt658puEsFi3wTFfW6qw,1989
+pelican_nlp/project_graph/graph_visualization.py,sha256=M99hGFKAun4_U2VQk9VQBMCF-imNAhQBHMhOvArPvMk,4648
+pelican_nlp/sample_configuration_files/config_discourse.yml,sha256=l1FN8NcgEbb4s8PqPFErnspDqjSD-SEiIWYcDfSS0Xw,3681
+pelican_nlp/sample_configuration_files/config_fluency.yml,sha256=BESisuMG9JPEBpbRPzEDmYVMIEHDY5Pf6HKqTWTa624,3007
+pelican_nlp/sample_configuration_files/config_general.yml,sha256=FsGfcc8bK-di5dYuD-ri4sJlh2johQVEWUqsH7T6cCA,4172
 pelican_nlp/utils/__init__.py,sha256=q1tGdOOj5UPRC2mGhoMUh8p4cbFCkkbD21bQaOVvFao,189
 pelican_nlp/utils/csv_functions.py,sha256=7X8pGh49TGZGs7h6JrJD846swCqSHL32mmXJ-8qLWPE,7774
 pelican_nlp/utils/filename_parser.py,sha256=PGSKjiYDe_JVAFGcaYHdIYazB3p4MUiG6n8h_uZl8d8,551
 pelican_nlp/utils/sample_usage.py,sha256=W__OVMjWND-ZtxxRhfGJDHwbVpGlB-anXDxyA5P4cME,353
-pelican_nlp/utils/setup_functions.py,sha256=
-pelican_nlp
-pelican_nlp
-pelican_nlp
-pelican_nlp-
-pelican_nlp-0.3.
-pelican_nlp-0.3.
+pelican_nlp/utils/setup_functions.py,sha256=Xk-9W1-ylex8De5w6jxAqWJUlmbe5z-c2EvwptTZ7RQ,4539
+pelican_nlp/utils/unittests/test_examples.py,sha256=XLc9my0cwpPN9W8gBAPnPTVnBf77kMQeO0Xo38oH4Tg,7849
+pelican_nlp/utils/unittests/examples/example_discourse/config_discourse.yml,sha256=jBSJ07dPujoZo2bOK15_RW4_dKALOWTzI55KljmWJKg,3709
+pelican_nlp/utils/unittests/examples/example_fluency/config_fluency.yml,sha256=BESisuMG9JPEBpbRPzEDmYVMIEHDY5Pf6HKqTWTa624,3007
+pelican_nlp/utils/unittests/examples/example_image-descriptions/config_image-descriptions.yml,sha256=HuPI7Py_57FwyfHEdIPk0LcdsMKze3XjmEuP6kPirP4,3540
+pelican_nlp-0.3.3.dist-info/licenses/LICENSE,sha256=m3jshBZIXKiBX6qhmhtJcLTVJ1N6BEkQGIflneXvpYg,19336
+pelican_nlp-0.3.3.dist-info/METADATA,sha256=MV71aLEm8KappnKjsVzEHKlQYMWEkBfQw1ZhOgETKyM,6839
+pelican_nlp-0.3.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+pelican_nlp-0.3.3.dist-info/entry_points.txt,sha256=znlG0paAfju9P10UM3rm5HcCHoj4tarTllNpeaqH_gc,53
+pelican_nlp-0.3.3.dist-info/top_level.txt,sha256=F0qlyqy5FCd3sTS_npUYPeLKN9_BZq6wD4qo9pI0xbg,12
+pelican_nlp-0.3.3.dist-info/RECORD,,
{pelican_nlp-0.3.2.dist-info → pelican_nlp-0.3.3.dist-info}/entry_points.txt
File without changes
{pelican_nlp-0.3.2.dist-info → pelican_nlp-0.3.3.dist-info}/licenses/LICENSE
File without changes
{pelican_nlp-0.3.2.dist-info → pelican_nlp-0.3.3.dist-info}/top_level.txt
File without changes