pelican-nlp 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff shows the changes between publicly available package versions as they were released to a supported registry. It is provided for informational purposes only.
Files changed (43)
  1. pelican_nlp/Nils_backup/__init__.py +0 -0
  2. pelican_nlp/Nils_backup/extract_acoustic_features.py +274 -0
  3. pelican_nlp/Nils_backup/fluency/__init__.py +0 -0
  4. pelican_nlp/Nils_backup/fluency/aggregate_fluency_results.py +186 -0
  5. pelican_nlp/Nils_backup/fluency/behavioral_data.py +42 -0
  6. pelican_nlp/Nils_backup/fluency/check_duplicates.py +169 -0
  7. pelican_nlp/Nils_backup/fluency/coherence.py +653 -0
  8. pelican_nlp/Nils_backup/fluency/config.py +231 -0
  9. pelican_nlp/Nils_backup/fluency/main.py +182 -0
  10. pelican_nlp/Nils_backup/fluency/optimality_without_tsa.py +466 -0
  11. pelican_nlp/Nils_backup/fluency/plot_fluency.py +573 -0
  12. pelican_nlp/Nils_backup/fluency/plotting_utils.py +170 -0
  13. pelican_nlp/Nils_backup/fluency/questionnaires_data.py +43 -0
  14. pelican_nlp/Nils_backup/fluency/stats_fluency.py +930 -0
  15. pelican_nlp/Nils_backup/fluency/utils.py +41 -0
  16. pelican_nlp/Nils_backup/speaker_diarization_Nils.py +328 -0
  17. pelican_nlp/Nils_backup/transcription/__init__.py +0 -0
  18. pelican_nlp/Nils_backup/transcription/annotation_tool.py +1001 -0
  19. pelican_nlp/Nils_backup/transcription/annotation_tool_boundaries.py +1122 -0
  20. pelican_nlp/Nils_backup/transcription/annotation_tool_sandbox.py +985 -0
  21. pelican_nlp/Nils_backup/transcription/output/holmes_control_nova_all_outputs.json +7948 -0
  22. pelican_nlp/Nils_backup/transcription/test.json +1 -0
  23. pelican_nlp/Nils_backup/transcription/transcribe_audio.py +314 -0
  24. pelican_nlp/Nils_backup/transcription/transcribe_audio_chunked.py +695 -0
  25. pelican_nlp/Nils_backup/transcription/transcription.py +801 -0
  26. pelican_nlp/Nils_backup/transcription/transcription_gui.py +955 -0
  27. pelican_nlp/Nils_backup/transcription/word_boundaries.py +190 -0
  28. pelican_nlp/Silvia_files/Opensmile/opensmile_feature_extraction.py +66 -0
  29. pelican_nlp/Silvia_files/prosogram/prosogram.py +104 -0
  30. pelican_nlp/__init__.py +1 -1
  31. pelican_nlp/_version.py +1 -0
  32. pelican_nlp/configuration_files/config_audio.yml +150 -0
  33. pelican_nlp/configuration_files/config_discourse.yml +104 -0
  34. pelican_nlp/configuration_files/config_fluency.yml +108 -0
  35. pelican_nlp/configuration_files/config_general.yml +131 -0
  36. pelican_nlp/configuration_files/config_morteza.yml +103 -0
  37. pelican_nlp/praat/__init__.py +29 -0
  38. {pelican_nlp-0.1.1.dist-info → pelican_nlp-0.1.2.dist-info}/METADATA +4 -3
  39. pelican_nlp-0.1.2.dist-info/RECORD +75 -0
  40. pelican_nlp-0.1.1.dist-info/RECORD +0 -39
  41. {pelican_nlp-0.1.1.dist-info → pelican_nlp-0.1.2.dist-info}/WHEEL +0 -0
  42. {pelican_nlp-0.1.1.dist-info → pelican_nlp-0.1.2.dist-info}/licenses/LICENSE +0 -0
  43. {pelican_nlp-0.1.1.dist-info → pelican_nlp-0.1.2.dist-info}/top_level.txt +0 -0
pelican_nlp/Nils_backup/fluency/config.py
@@ -0,0 +1,231 @@
+"""
+Configuration for VELAS fluency analysis pipeline.
+
+This file contains all configuration settings for the analysis pipeline, organized by component.
+Each section corresponds to a specific analysis step and contains paths, parameters, and settings
+used by that component.
+
+Manual Mounting Instructions:
+----------------------------
+Windows:
+1. Open File Explorer
+2. Right-click on 'This PC'
+3. Select 'Map Network Drive'
+4. Enter: \\nas01.bli.uzh.ch\studies\homan.puk.uzh\VELAS\VELAS_Master_Folder
+
+macOS:
+1. Finder > Go > Connect to Server
+2. Enter: smb://nas01.bli.uzh.ch/studies/homan.puk.uzh/VELAS/VELAS_Master_Folder
+
+Linux:
+1. Create mount point: mkdir -p ~/VELAS_mount
+2. Mount: sudo mount -t cifs //nas01.bli.uzh.ch/studies/homan.puk.uzh/VELAS/VELAS_Master_Folder ~/VELAS_mount
+"""
+from pathlib import Path
+import os
+import sys
+import platform
+
+
+def get_default_mount_point():
+    """Get the default mount point based on the operating system."""
+    system = platform.system().lower()
+    if system == "windows":
+        # Use the UNC path directly for Windows
+        return Path(r"\\nas01.bli.uzh.ch\Studies\homan.puk.uzh\VELAS\VELAS_Master_Folder")
+    elif system == "darwin":  # macOS
+        return Path("/Volumes/VELAS_Master_Folder")
+    else:  # Linux and others
+        return Path.home() / "VELAS_mount"
+
+def check_mount_point(mount_point):
+    """Check if the mount point exists and is accessible."""
+    if not mount_point.exists():
+        print(f"ERROR: Mount point {mount_point} does not exist!")
+        print("\nPlease mount the VELAS network share first.")
+        print("See the instructions in the config.py file header.")
+        sys.exit(1)
+    return mount_point
+
+# Server configuration
+SERVER_CONFIG = {
+    "server": "nas01.bli.uzh.ch",  # NAS server address
+    "share": "studies",  # Share name
+    "project_path": "homan.puk.uzh/VELAS/VELAS_Master_Folder",  # Project directory on server
+    "mount_point": os.environ.get("VELAS_MOUNT", str(get_default_mount_point()))  # Allow override via env var
+}
+
+# Base paths for the project
+BASE_DIR = check_mount_point(Path(SERVER_CONFIG["mount_point"]))  # Root directory with mount check
+DATA_DIR = BASE_DIR / "Master_Files"  # Directory containing master data files
+
+MODELS_DIR = BASE_DIR / "Sub_Projects" / "VELAS_Fluency" / "00_Nils" / "fluency-main" / "code"  # Directory containing trained models
+RESULTS_DIR = BASE_DIR / "Sub_Projects" / "VELAS_Fluency" / "Results"  # Local directory for all output files
+
+# Shared configuration settings
+SHARED_CONFIG = {
+    "preprocessing": {
+        "lower": True,  # Whether to convert text to lowercase
+        "free_text": False,  # Whether input is free text
+    },
+    "parallelization": {
+        "parallelize": True,
+        "max_workers": 16
+    },
+    "model": {
+        "fasttext_path": str(MODELS_DIR / "cc.de.300.bin"),
+        "language_code": "deu-Latn"
+    }
+}
+
+# Configuration for questionnaire data processing
+QUESTIONNAIRES_CONFIG = {
+    "paths": {
+        "input": str(DATA_DIR / "Online_Questionnaire_Data/VELAS_Questionnaire_Master.csv"),  # Control group responses
+        "output": str(RESULTS_DIR / "aggregates/questionnaires.csv")  # Processed questionnaire results
+    },
+    "columns_to_save": ["study_id", "mss_total", "mss_pos_sum", "mss_neg_sum", "mss_dis_sum"]  # Columns to retain in output
+}
+
+# Configuration for behavioral data processing
+BEHAVIORAL_CONFIG = {
+    "paths": {
+        "input": str(DATA_DIR / "Behavioral_Data/VELAS_Behav_Master.csv"),  # Raw behavioral data
+        "output": str(RESULTS_DIR / "aggregates/behav_agg.csv")  # Processed behavioral metrics
+    },
+    "columns_to_save": [  # Columns to retain in output
+        "study_id",
+        "panss_pos_sum", "panss_neg_sum", "panss_gen_sum", "panss_total",  # PANSS scores
+        "working_memory", "stroop_psychomotor", "stroop_attention", "stroop_inhibition"  # Cognitive measures
+    ],
+    "cognitive_variable_mapping": {  # Mapping of raw variable names to standardized names
+        "stroop_time_1": "stroop_psychomotor",
+        "stroop_time_2": "stroop_attention",
+        "stroop_time_3": "stroop_inhibition",
+        "ds_bw_total": "working_memory"
+    }
+}
+
+# Configuration for duplicate checking in transcripts
+DUPLICATES_CONFIG = {
+    "paths": {
+        "input": str(DATA_DIR / "Language_Data/NLP_Data/fluency_transcripts"),  # Raw transcript files
+        "output": str(RESULTS_DIR / "fluency_transcripts_cleaned")  # Cleaned transcript files
+    },
+    "file_filter": "sem_flu"  # Only process files containing this string in filename
+}
+
+# Configuration for coherence analysis
+COHERENCE_CONFIG = {
+    "modes": ["semantic"],  # Types of coherence to analyze
+    "windows": [0, 2, 8],  # Window sizes (0=whole text, 2/8=sliding windows)
+    **SHARED_CONFIG["parallelization"],  # Include shared parallelization settings
+    "error_messages": True,  # Whether to print error messages
+    "model": SHARED_CONFIG["model"],  # Use shared model settings
+    "paths": {
+        "data_dir": str(RESULTS_DIR / "fluency_transcripts_cleaned"),
+        "results_dir": str(RESULTS_DIR / "coherence")
+    },
+    "preprocessing": SHARED_CONFIG["preprocessing"]  # Use shared preprocessing settings
+}
+
+# Configuration for optimality analysis
+OPTIMALITY_CONFIG = {
+    "modes": ["semantic"],
+    "window_sizes": [8],  # Specific window size for optimality
+    **SHARED_CONFIG["parallelization"],  # Include shared parallelization settings
+    "bootstrap": 10000,
+    "shuffle_modes": ["include0_includeN", "exclude0_excludeN"],  # Whether to include or exclude the first and last word of the window
+    "model": SHARED_CONFIG["model"],  # Use shared model settings
+    "paths": {
+        "data_dir": str(RESULTS_DIR / "fluency_transcripts_cleaned"),
+        "results_dir": str(RESULTS_DIR / "optimality")
+    },
+    "preprocessing": SHARED_CONFIG["preprocessing"]  # Use shared preprocessing settings
+}
+
+# Configuration for statistical analysis
+STATS_CONFIG = {
+    "paths": {
+        "data_dir": str(RESULTS_DIR / "fluency_transcripts_cleaned"),  # Raw transcript data
+        "results_dir": str(RESULTS_DIR / "stats"),  # Statistical analysis results
+        "figures_dir": str(RESULTS_DIR / "figures")  # Generated figures
+    },
+    "demographics": ["age", "gender", "education", "first_language"],  # Demographic variables
+    "outcomes": {
+        "clinical": ["panss_pos_sum", "panss_neg_sum", "panss_gen_sum", "panss_total"],  # Clinical outcome measures
+        "cognitive": ["working_memory", "stroop_psychomotor", "stroop_attention", "stroop_inhibition"]  # Cognitive outcome measures
+    },
+    "groups": {  # Mapping of group codes to labels
+        "none": "0",
+        "schizophrenia": "1",
+        "delusional": "2",
+        "brief_psychotic": "3",
+        "schizoaffective": "4",
+        "other_psychotic": "5",
+        "manic_psychotic": "6",
+        "mdd_psychotic": "7",
+        "other": "8"
+    },
+    "min_tokens": 8,  # Minimum number of tokens required for analysis
+    "task_type": "semantic",  # Type of task to analyze
+    "metrics": [  # Main metrics for analysis
+        "semantic_coherence_2_mean_of_window_means",
+        "semantic_coherence_8_mean_of_window_means",
+        "z_Real_semantic_include0_includeN_8",
+        "number_tokens"
+    ],
+    "new_metrics": [  # Additional metrics for analysis
+        "semantic_coherence_2_mean_of_window_means",
+        "semantic_coherence_8_mean_of_window_means",
+        "z_Real_semantic_include0_includeN_8"
+    ],
+    "exclusions_bev": [  # Behavioral data columns to exclude
+        "panss_pos_sum",
+        "panss_neg_sum",
+        "panss_gen_sum",
+        "panss_total",
+        "mss_total",
+        "mss_pos_sum",
+        "mss_neg_sum",
+        "mss_dis_sum",
+        "working_memory",
+        "stroop_psychomotor",
+        "stroop_attention",
+        "stroop_inhibition"
+    ],
+    "alpha": 0.05,  # Significance level
+    "num_tests": 4  # Number of tests for multiple comparison correction
+}
+
+# Configuration for aggregation
+AGGREGATION_CONFIG = {
+    "paths": {
+        "behav_agg": str(RESULTS_DIR / "aggregates/behav_agg.csv"),
+        "questionnaires": str(RESULTS_DIR / "aggregates/questionnaires.csv"),
+        "demo_clinical": str(DATA_DIR / "Demographic_Clinical_Data/VELAS_Demo_Clin_Master.csv"),
+        "output": str(RESULTS_DIR / f"master_fluency{'_lower' if SHARED_CONFIG['preprocessing']['lower'] else '_upper'}.csv")
+    },
+    "demo_columns": [
+        'study_id', 'group', 'age', 'gender', 'first_language', 'education',
+        'diagnosis', 'duration_untreated', 'age_onset', 'antipsy_duration',
+    ]
+}
+
+# Combine all configs into a single dictionary for easy access
+CONFIG = {
+    "questionnaires": QUESTIONNAIRES_CONFIG,
+    "behavioral": BEHAVIORAL_CONFIG,
+    "duplicates": DUPLICATES_CONFIG,
+    "coherence": COHERENCE_CONFIG,
+    "optimality": OPTIMALITY_CONFIG,
+    "stats": STATS_CONFIG,
+    "aggregation": AGGREGATION_CONFIG,
+    "shared": SHARED_CONFIG,
+    "min_tokens": STATS_CONFIG["min_tokens"],
+    "task_type": STATS_CONFIG["task_type"],
+    "metrics": STATS_CONFIG["metrics"],
+    "new_metrics": STATS_CONFIG["new_metrics"],
+    "exclusions_bev": STATS_CONFIG["exclusions_bev"]
+}
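
Worth noting in the hunk above: the **SHARED_CONFIG["parallelization"] entries splice the shared keys directly into each component dictionary, so "parallelize" and "max_workers" land at the top level of COHERENCE_CONFIG and OPTIMALITY_CONFIG rather than under a "parallelization" key, while the settings referenced by name ("model", "preprocessing") stay nested. A minimal usage sketch of this layout; the VELAS_MOUNT override is documented in SERVER_CONFIG, but the /tmp stand-in path is purely illustrative and not part of the package:

    # Hypothetical usage sketch -- not shipped in the package.
    import os

    # Pre-create a local stand-in for the NAS share so check_mount_point() passes,
    # then point the config at it via the documented VELAS_MOUNT override.
    os.makedirs("/tmp/VELAS_Master_Folder", exist_ok=True)
    os.environ["VELAS_MOUNT"] = "/tmp/VELAS_Master_Folder"

    from config import CONFIG  # the mount check runs at import time

    # Dict-unpacking flattens the shared parallelization keys to the top level:
    assert CONFIG["coherence"]["max_workers"] == 16
    assert CONFIG["coherence"]["parallelize"] is True

    # Settings referenced by name (not unpacked) stay nested:
    print(CONFIG["optimality"]["model"]["fasttext_path"])
    print(CONFIG["stats"]["paths"]["results_dir"])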
pelican_nlp/Nils_backup/fluency/main.py
@@ -0,0 +1,182 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Main script for VELAS fluency analysis pipeline.
+
+This script orchestrates the entire analysis pipeline:
+1. Validates input data
+2. Processes behavioral data
+3. Computes NLP metrics
+4. Performs statistical analysis
+5. Generates visualizations
+"""
+import os
+import sys
+from pathlib import Path
+import logging
+from typing import Dict, List, Any
+from config import CONFIG, RESULTS_DIR
+from utils import ensure_output_dir  # Add import
+
+# Debug print CONFIG structure
+print("Initial CONFIG structure:")
+print("CONFIG type:", type(CONFIG))
+print("CONFIG keys:", list(CONFIG.keys()))
+if "questionnaires" in CONFIG:
+    print("questionnaires keys:", list(CONFIG["questionnaires"].keys()))
+
+# Get absolute path and ensure results directory exists
+results_path = Path(os.getcwd()) / 'results'
+results_path.mkdir(parents=True, exist_ok=True)
+
+# Set up logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.StreamHandler(sys.stdout),
+        logging.FileHandler(str(results_path / 'pipeline.log'))
+    ]
+)
+logger = logging.getLogger(__name__)
+
+def validate_paths(config: Dict[str, Any], required_paths: List[str]) -> bool:
+    """Validate that all required paths exist."""
+    logger.info(f"CONFIG keys: {list(config.keys())}")  # Debug print
+    logger.info(f"Required paths: {required_paths}")  # Debug print
+
+    for path_key in required_paths:
+        # Split into section and path
+        section, *path_parts = path_key.split('.')
+        if section not in config:
+            logger.error(f"Missing config section: {section}")
+            return False
+
+        current = config[section]
+        try:
+            for key in path_parts:
+                current = current[key]
+
+            # Handle paths with format strings
+            if isinstance(current, str) and '{' in current:
+                # If path contains format string, check parent directory
+                path = Path(current.format(case='lower')).parent
+            else:
+                path = Path(current)
+
+            # For input paths, check existence
+            if 'input' in path_parts:
+                if not path.exists():
+                    logger.error(f"Input path does not exist: {path}")
+                    return False
+            # For output paths, create if doesn't exist
+            else:
+                ensure_output_dir(str(path))
+                logger.info(f"Created output directory: {path}")
+
+        except KeyError:
+            logger.error(f"Missing required path key: {path_key}")
+            return False
+        except Exception as e:
+            logger.error(f"Error validating path {path_key}: {str(e)}")
+            return False
+    return True
+
+def log_config_section(section_name: str, config: Dict[str, Any]):
+    """Log the configuration section being used."""
+    logger.info(f"\nConfiguration for {section_name}:")
+    for key, value in config.items():
+        if isinstance(value, dict):
+            logger.info(f"{key}:")
+            for subkey, subvalue in value.items():
+                logger.info(f"  {subkey}: {subvalue}")
+        else:
+            logger.info(f"{key}: {value}")
+
+def run_questionnaires():
+    """Process questionnaire data."""
+    logger.info("\nProcessing questionnaire data...")
+    import questionnaires_data
+    questionnaires_data.main()
+    return True
+
+def run_behavioral_data():
+    """Run behavioral data processing."""
+    logger.info("\nRunning behavioral data processing...")
+    import behavioral_data
+    behavioral_data.main()
+    return True
+
+def run_check_duplicates():
+    """Check for duplicates in processed data."""
+    logger.info("\nChecking for duplicates...")
+    import check_duplicates
+    check_duplicates.main()
+    return True
+
+def run_coherence():
+    """Run coherence analysis."""
+    logger.info("\nRunning coherence analysis...")
+    import coherence
+    coherence.main()
+    return True
+
+def run_optimality():
+    """Run optimality analysis."""
+    logger.info("\nRunning optimality analysis...")
+    import optimality_without_tsa
+    optimality_without_tsa.main()
+    return True
+
+def run_aggregate_results():
+    """Aggregate fluency results."""
+    logger.info("\nAggregating results...")
+    import aggregate_fluency_results
+    aggregate_fluency_results.main()
+    return True
+
+def run_stats():
+    """Run statistical analysis."""
+    logger.info("\nRunning statistical analysis...")
+    import stats_fluency
+    stats_fluency.main()
+    return True
+
+def main():
+    """Main execution pipeline."""
+    logger.info("Starting VELAS fluency analysis pipeline...")
+
+    # Create necessary directories
+    RESULTS_DIR.mkdir(parents=True, exist_ok=True)
+
+    # Pipeline execution order
+    pipeline_steps = [
+        ("Questionnaire Data Processing", run_questionnaires),
+        ("Behavioral Data Processing", run_behavioral_data),
+        ("Duplicate Check", run_check_duplicates),
+        ("Coherence Analysis", run_coherence),
+        ("Optimality Analysis", run_optimality),
+        ("Result Aggregation", run_aggregate_results),
+        ("Statistical Analysis", run_stats)
+    ]
+
+    # Execute pipeline
+    for step_name, step_func in pipeline_steps:
+        logger.info(f"\n{'='*50}")
+        logger.info(f"Starting {step_name}")
+        logger.info(f"{'='*50}")
+
+        try:
+            success = step_func()
+            if not success:
+                logger.error(f"{step_name} failed. Stopping pipeline.")
+                return
+            logger.info(f"{step_name} completed successfully.")
+        except Exception as e:
+            logger.exception(f"Error in {step_name}: {str(e)}")
+            return
+
+    logger.info("\nPipeline completed successfully!")
+
+if __name__ == "__main__":
+    main()
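
In this hunk, validate_paths() resolves dot-separated keys against CONFIG: any key whose path contains 'input' must already exist, and every other key is treated as an output location and created via ensure_output_dir(). In the code shown, main() itself never invokes validate_paths. A hedged sketch of how it might be called; the dotted keys below are illustrative, chosen to match the CONFIG layout from config.py, and are not shipped in main.py:

    # Hypothetical invocation sketch -- not part of the released file.
    from config import CONFIG
    from main import validate_paths  # importing main runs its module-level setup

    required = [
        "behavioral.paths.input",       # 'input' in the key path -> must already exist
        "coherence.paths.results_dir",  # otherwise -> created via ensure_output_dir()
    ]
    if validate_paths(CONFIG, required):
        print("All required paths are ready.")

Run as "python main.py", the script executes the seven pipeline steps in order and stops at the first step that fails or raises.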