pelican-nlp 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff shows the changes between publicly available package versions as they were released to a supported registry. It is provided for informational purposes only.
Files changed (43)
  1. pelican_nlp/Nils_backup/__init__.py +0 -0
  2. pelican_nlp/Nils_backup/extract_acoustic_features.py +274 -0
  3. pelican_nlp/Nils_backup/fluency/__init__.py +0 -0
  4. pelican_nlp/Nils_backup/fluency/aggregate_fluency_results.py +186 -0
  5. pelican_nlp/Nils_backup/fluency/behavioral_data.py +42 -0
  6. pelican_nlp/Nils_backup/fluency/check_duplicates.py +169 -0
  7. pelican_nlp/Nils_backup/fluency/coherence.py +653 -0
  8. pelican_nlp/Nils_backup/fluency/config.py +231 -0
  9. pelican_nlp/Nils_backup/fluency/main.py +182 -0
  10. pelican_nlp/Nils_backup/fluency/optimality_without_tsa.py +466 -0
  11. pelican_nlp/Nils_backup/fluency/plot_fluency.py +573 -0
  12. pelican_nlp/Nils_backup/fluency/plotting_utils.py +170 -0
  13. pelican_nlp/Nils_backup/fluency/questionnaires_data.py +43 -0
  14. pelican_nlp/Nils_backup/fluency/stats_fluency.py +930 -0
  15. pelican_nlp/Nils_backup/fluency/utils.py +41 -0
  16. pelican_nlp/Nils_backup/speaker_diarization_Nils.py +328 -0
  17. pelican_nlp/Nils_backup/transcription/__init__.py +0 -0
  18. pelican_nlp/Nils_backup/transcription/annotation_tool.py +1001 -0
  19. pelican_nlp/Nils_backup/transcription/annotation_tool_boundaries.py +1122 -0
  20. pelican_nlp/Nils_backup/transcription/annotation_tool_sandbox.py +985 -0
  21. pelican_nlp/Nils_backup/transcription/output/holmes_control_nova_all_outputs.json +7948 -0
  22. pelican_nlp/Nils_backup/transcription/test.json +1 -0
  23. pelican_nlp/Nils_backup/transcription/transcribe_audio.py +314 -0
  24. pelican_nlp/Nils_backup/transcription/transcribe_audio_chunked.py +695 -0
  25. pelican_nlp/Nils_backup/transcription/transcription.py +801 -0
  26. pelican_nlp/Nils_backup/transcription/transcription_gui.py +955 -0
  27. pelican_nlp/Nils_backup/transcription/word_boundaries.py +190 -0
  28. pelican_nlp/Silvia_files/Opensmile/opensmile_feature_extraction.py +66 -0
  29. pelican_nlp/Silvia_files/prosogram/prosogram.py +104 -0
  30. pelican_nlp/__init__.py +1 -1
  31. pelican_nlp/_version.py +1 -0
  32. pelican_nlp/configuration_files/config_audio.yml +150 -0
  33. pelican_nlp/configuration_files/config_discourse.yml +104 -0
  34. pelican_nlp/configuration_files/config_fluency.yml +108 -0
  35. pelican_nlp/configuration_files/config_general.yml +131 -0
  36. pelican_nlp/configuration_files/config_morteza.yml +103 -0
  37. pelican_nlp/praat/__init__.py +29 -0
  38. {pelican_nlp-0.1.1.dist-info → pelican_nlp-0.1.2.dist-info}/METADATA +4 -3
  39. pelican_nlp-0.1.2.dist-info/RECORD +75 -0
  40. pelican_nlp-0.1.1.dist-info/RECORD +0 -39
  41. {pelican_nlp-0.1.1.dist-info → pelican_nlp-0.1.2.dist-info}/WHEEL +0 -0
  42. {pelican_nlp-0.1.1.dist-info → pelican_nlp-0.1.2.dist-info}/licenses/LICENSE +0 -0
  43. {pelican_nlp-0.1.1.dist-info → pelican_nlp-0.1.2.dist-info}/top_level.txt +0 -0
pelican_nlp/Nils_backup/fluency/config.py
@@ -0,0 +1,231 @@
+"""
+Configuration for VELAS fluency analysis pipeline.
+
+This file contains all configuration settings for the analysis pipeline, organized by component.
+Each section corresponds to a specific analysis step and contains paths, parameters, and settings
+used by that component.
+
+Manual Mounting Instructions:
+----------------------------
+Windows:
+1. Open File Explorer
+2. Right-click on 'This PC'
+3. Select 'Map Network Drive'
+4. Enter: \\nas01.bli.uzh.ch\studies\homan.puk.uzh\VELAS\VELAS_Master_Folder
+
+macOS:
+1. Finder > Go > Connect to Server
+2. Enter: smb://nas01.bli.uzh.ch/studies/homan.puk.uzh/VELAS/VELAS_Master_Folder
+
+Linux:
+1. Create mount point: mkdir -p ~/VELAS_mount
+2. Mount: sudo mount -t cifs //nas01.bli.uzh.ch/studies/homan.puk.uzh/VELAS/VELAS_Master_Folder ~/VELAS_mount
+"""
+from pathlib import Path
+import os
+import sys
+import platform
+
+
+def get_default_mount_point():
+    """Get the default mount point based on the operating system."""
+    system = platform.system().lower()
+    if system == "windows":
+        # Use the UNC path directly for Windows
+        return Path(r"\\nas01.bli.uzh.ch\Studies\homan.puk.uzh\VELAS\VELAS_Master_Folder")
+    elif system == "darwin":  # macOS
+        return Path("/Volumes/VELAS_Master_Folder")
+    else:  # Linux and others
+        return Path.home() / "VELAS_mount"
+
+def check_mount_point(mount_point):
+    """Check if the mount point exists and is accessible."""
+    if not mount_point.exists():
+        print(f"ERROR: Mount point {mount_point} does not exist!")
+        print("\nPlease mount the VELAS network share first.")
+        print("See the instructions in the config.py file header.")
+        sys.exit(1)
+    return mount_point
+
+# Server configuration
+SERVER_CONFIG = {
+    "server": "nas01.bli.uzh.ch",  # NAS server address
+    "share": "studies",  # Share name
+    "project_path": "homan.puk.uzh/VELAS/VELAS_Master_Folder",  # Project directory on server
+    "mount_point": os.environ.get("VELAS_MOUNT", str(get_default_mount_point()))  # Allow override via env var
+}
+
+# Base paths for the project
+BASE_DIR = check_mount_point(Path(SERVER_CONFIG["mount_point"]))  # Root directory with mount check
+DATA_DIR = BASE_DIR / "Master_Files"  # Directory containing master data files
+
+MODELS_DIR = BASE_DIR / "Sub_Projects" / "VELAS_Fluency" / "00_Nils" / "fluency-main" / "code"  # Directory containing trained models
+RESULTS_DIR = BASE_DIR / "Sub_Projects" / "VELAS_Fluency" / "Results"  # Local directory for all output files
+
+# Shared configuration settings
+SHARED_CONFIG = {
+    "preprocessing": {
+        "lower": True,  # Whether to convert text to lowercase
+        "free_text": False,  # Whether input is free text
+    },
+    "parallelization": {
+        "parallelize": True,
+        "max_workers": 16
+    },
+    "model": {
+        "fasttext_path": str(MODELS_DIR / "cc.de.300.bin"),
+        "language_code": "deu-Latn"
+    }
+}
+
+# Configuration for questionnaire data processing
+QUESTIONNAIRES_CONFIG = {
+    "paths": {
+        "input": str(DATA_DIR / "Online_Questionnaire_Data/VELAS_Questionnaire_Master.csv"),  # Control group responses
+        "output": str(RESULTS_DIR / "aggregates/questionnaires.csv")  # Processed questionnaire results
+    },
+    "columns_to_save": ["study_id", "mss_total", "mss_pos_sum", "mss_neg_sum", "mss_dis_sum"]  # Columns to retain in output
+}
+
+# Configuration for behavioral data processing
+BEHAVIORAL_CONFIG = {
+    "paths": {
+        "input": str(DATA_DIR / "Behavioral_Data/VELAS_Behav_Master.csv"),  # Raw behavioral data
+        "output": str(RESULTS_DIR / "aggregates/behav_agg.csv")  # Processed behavioral metrics
+    },
+    "columns_to_save": [  # Columns to retain in output
+        "study_id",
+        "panss_pos_sum", "panss_neg_sum", "panss_gen_sum", "panss_total",  # PANSS scores
+        "working_memory", "stroop_psychomotor", "stroop_attention", "stroop_inhibition"  # Cognitive measures
+    ],
+    "cognitive_variable_mapping": {  # Mapping of raw variable names to standardized names
+        "stroop_time_1": "stroop_psychomotor",
+        "stroop_time_2": "stroop_attention",
+        "stroop_time_3": "stroop_inhibition",
+        "ds_bw_total": "working_memory"
+    }
+}
+
+# Configuration for duplicate checking in transcripts
+DUPLICATES_CONFIG = {
+    "paths": {
+        "input": str(DATA_DIR / "Language_Data/NLP_Data/fluency_transcripts"),  # Raw transcript files
+        "output": str(RESULTS_DIR / "fluency_transcripts_cleaned")  # Cleaned transcript files
+    },
+    "file_filter": "sem_flu"  # Only process files containing this string in filename
+}
+
+# Configuration for coherence analysis
+COHERENCE_CONFIG = {
+    "modes": ["semantic"],  # Types of coherence to analyze
+    "windows": [0, 2, 8],  # Window sizes (0=whole text, 2/8=sliding windows)
+    **SHARED_CONFIG["parallelization"],  # Include shared parallelization settings
+    "error_messages": True,  # Whether to print error messages
+    "model": SHARED_CONFIG["model"],  # Use shared model settings
+    "paths": {
+        "data_dir": str(RESULTS_DIR / "fluency_transcripts_cleaned"),
+        "results_dir": str(RESULTS_DIR / "coherence")
+    },
+    "preprocessing": SHARED_CONFIG["preprocessing"]  # Use shared preprocessing settings
+}
+
+# Configuration for optimality analysis
+OPTIMALITY_CONFIG = {
+    "modes": ["semantic"],
+    "window_sizes": [8],  # Specific window size for optimality
+    **SHARED_CONFIG["parallelization"],  # Include shared parallelization settings
+    "bootstrap": 10000,
+    "shuffle_modes": ["include0_includeN", "exclude0_excludeN"],  # Whether to include or exclude the first and last word of the window
+    "model": SHARED_CONFIG["model"],  # Use shared model settings
+    "paths": {
+        "data_dir": str(RESULTS_DIR / "fluency_transcripts_cleaned"),
+        "results_dir": str(RESULTS_DIR / "optimality")
+    },
+    "preprocessing": SHARED_CONFIG["preprocessing"]  # Use shared preprocessing settings
+}
+
+# Configuration for statistical analysis
+STATS_CONFIG = {
+    "paths": {
+        "data_dir": str(RESULTS_DIR / "fluency_transcripts_cleaned"),  # Raw transcript data
+        "results_dir": str(RESULTS_DIR / "stats"),  # Statistical analysis results
+        "figures_dir": str(RESULTS_DIR / "figures")  # Generated figures
+    },
+    "demographics": ["age", "gender", "education", "first_language"],  # Demographic variables
+    "outcomes": {
+        "clinical": ["panss_pos_sum", "panss_neg_sum", "panss_gen_sum", "panss_total"],  # Clinical outcome measures
+        "cognitive": ["working_memory", "stroop_psychomotor", "stroop_attention", "stroop_inhibition"]  # Cognitive outcome measures
+    },
+    "groups": {  # Mapping of group codes to labels
+        "none": "0",
+        "schizophrenia": "1",
+        "delusional": "2",
+        "brief_psychotic": "3",
+        "schizoaffective": "4",
+        "other_psychotic": "5",
+        "manic_psychotic": "6",
+        "mdd_psychotic": "7",
+        "other": "8"
+    },
+    "min_tokens": 8,  # Minimum number of tokens required for analysis
+    "task_type": "semantic",  # Type of task to analyze
+    "metrics": [  # Main metrics for analysis
+        "semantic_coherence_2_mean_of_window_means",
+        "semantic_coherence_8_mean_of_window_means",
+        "z_Real_semantic_include0_includeN_8",
+        "number_tokens"
+    ],
+    "new_metrics": [  # Additional metrics for analysis
+        "semantic_coherence_2_mean_of_window_means",
+        "semantic_coherence_8_mean_of_window_means",
+        "z_Real_semantic_include0_includeN_8"
+    ],
+    "exclusions_bev": [  # Behavioral data columns to exclude
+        "panss_pos_sum",
+        "panss_neg_sum",
+        "panss_gen_sum",
+        "panss_total",
+        "mss_total",
+        "mss_pos_sum",
+        "mss_neg_sum",
+        "mss_dis_sum",
+        "working_memory",
+        "stroop_psychomotor",
+        "stroop_attention",
+        "stroop_inhibition"
+    ],
+    "alpha": 0.05,  # Significance level
+    "num_tests": 4  # Number of tests for multiple comparison correction
+}
+
+# Configuration for aggregation
+AGGREGATION_CONFIG = {
+    "paths": {
+        "behav_agg": str(RESULTS_DIR / "aggregates/behav_agg.csv"),
+        "questionnaires": str(RESULTS_DIR / "aggregates/questionnaires.csv"),
+        "demo_clinical": str(DATA_DIR / "Demographic_Clinical_Data/VELAS_Demo_Clin_Master.csv"),
+        "output": str(RESULTS_DIR / f"master_fluency{'_lower' if SHARED_CONFIG['preprocessing']['lower'] else '_upper'}.csv")
+    },
+    "demo_columns": [
+        'study_id', 'group', 'age', 'gender', 'first_language', 'education',
+        'diagnosis', 'duration_untreated', 'age_onset', 'antipsy_duration',
+    ]
+}
+
+# Combine all configs into a single dictionary for easy access
+CONFIG = {
+    "questionnaires": QUESTIONNAIRES_CONFIG,
+    "behavioral": BEHAVIORAL_CONFIG,
+    "duplicates": DUPLICATES_CONFIG,
+    "coherence": COHERENCE_CONFIG,
+    "optimality": OPTIMALITY_CONFIG,
+    "stats": STATS_CONFIG,
+    "aggregation": AGGREGATION_CONFIG,
+    "shared": SHARED_CONFIG,
+    "min_tokens": STATS_CONFIG["min_tokens"],
+    "task_type": STATS_CONFIG["task_type"],
+    "metrics": STATS_CONFIG["metrics"],
+    "new_metrics": STATS_CONFIG["new_metrics"],
+    "exclusions_bev": STATS_CONFIG["exclusions_bev"]
+}
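
Worth noting in the hunk above: the **SHARED_CONFIG["parallelization"] entries splice the shared keys directly into each component dictionary, so "parallelize" and "max_workers" land at the top level of COHERENCE_CONFIG and OPTIMALITY_CONFIG rather than under a "parallelization" key, while the settings referenced by name ("model", "preprocessing") stay nested. A minimal usage sketch of this layout; the VELAS_MOUNT override is documented in SERVER_CONFIG, but the /tmp stand-in path is purely illustrative and not part of the package:

    # Hypothetical usage sketch -- not shipped in the package.
    import os

    # Pre-create a local stand-in for the NAS share so check_mount_point() passes,
    # then point the config at it via the documented VELAS_MOUNT override.
    os.makedirs("/tmp/VELAS_Master_Folder", exist_ok=True)
    os.environ["VELAS_MOUNT"] = "/tmp/VELAS_Master_Folder"

    from config import CONFIG  # the mount check runs at import time

    # Dict-unpacking flattens the shared parallelization keys to the top level:
    assert CONFIG["coherence"]["max_workers"] == 16
    assert CONFIG["coherence"]["parallelize"] is True

    # Settings referenced by name (not unpacked) stay nested:
    print(CONFIG["optimality"]["model"]["fasttext_path"])
    print(CONFIG["stats"]["paths"]["results_dir"])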
pelican_nlp/Nils_backup/fluency/main.py
@@ -0,0 +1,182 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Main script for VELAS fluency analysis pipeline.
+
+This script orchestrates the entire analysis pipeline:
+1. Validates input data
+2. Processes behavioral data
+3. Computes NLP metrics
+4. Performs statistical analysis
+5. Generates visualizations
+"""
+import os
+import sys
+from pathlib import Path
+import logging
+from typing import Dict, List, Any
+from config import CONFIG, RESULTS_DIR
+from utils import ensure_output_dir  # Add import
+
+# Debug print CONFIG structure
+print("Initial CONFIG structure:")
+print("CONFIG type:", type(CONFIG))
+print("CONFIG keys:", list(CONFIG.keys()))
+if "questionnaires" in CONFIG:
+    print("questionnaires keys:", list(CONFIG["questionnaires"].keys()))
+
+# Get absolute path and ensure results directory exists
+results_path = Path(os.getcwd()) / 'results'
+results_path.mkdir(parents=True, exist_ok=True)
+
+# Set up logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.StreamHandler(sys.stdout),
+        logging.FileHandler(str(results_path / 'pipeline.log'))
+    ]
+)
+logger = logging.getLogger(__name__)
+
+def validate_paths(config: Dict[str, Any], required_paths: List[str]) -> bool:
+    """Validate that all required paths exist."""
+    logger.info(f"CONFIG keys: {list(config.keys())}")  # Debug print
+    logger.info(f"Required paths: {required_paths}")  # Debug print
+
+    for path_key in required_paths:
+        # Split into section and path
+        section, *path_parts = path_key.split('.')
+        if section not in config:
+            logger.error(f"Missing config section: {section}")
+            return False
+
+        current = config[section]
+        try:
+            for key in path_parts:
+                current = current[key]
+
+            # Handle paths with format strings
+            if isinstance(current, str) and '{' in current:
+                # If path contains format string, check parent directory
+                path = Path(current.format(case='lower')).parent
+            else:
+                path = Path(current)
+
+            # For input paths, check existence
+            if 'input' in path_parts:
+                if not path.exists():
+                    logger.error(f"Input path does not exist: {path}")
+                    return False
+            # For output paths, create if doesn't exist
+            else:
+                ensure_output_dir(str(path))
+                logger.info(f"Created output directory: {path}")
+
+        except KeyError:
+            logger.error(f"Missing required path key: {path_key}")
+            return False
+        except Exception as e:
+            logger.error(f"Error validating path {path_key}: {str(e)}")
+            return False
+    return True
+
+def log_config_section(section_name: str, config: Dict[str, Any]):
+    """Log the configuration section being used."""
+    logger.info(f"\nConfiguration for {section_name}:")
+    for key, value in config.items():
+        if isinstance(value, dict):
+            logger.info(f"{key}:")
+            for subkey, subvalue in value.items():
+                logger.info(f"  {subkey}: {subvalue}")
+        else:
+            logger.info(f"{key}: {value}")
+
+def run_questionnaires():
+    """Process questionnaire data."""
+    logger.info("\nProcessing questionnaire data...")
+    import questionnaires_data
+    questionnaires_data.main()
+    return True
+
+def run_behavioral_data():
+    """Run behavioral data processing."""
+    logger.info("\nRunning behavioral data processing...")
+    import behavioral_data
+    behavioral_data.main()
+    return True
+
+def run_check_duplicates():
+    """Check for duplicates in processed data."""
+    logger.info("\nChecking for duplicates...")
+    import check_duplicates
+    check_duplicates.main()
+    return True
+
+def run_coherence():
+    """Run coherence analysis."""
+    logger.info("\nRunning coherence analysis...")
+    import coherence
+    coherence.main()
+    return True
+
+def run_optimality():
+    """Run optimality analysis."""
+    logger.info("\nRunning optimality analysis...")
+    import optimality_without_tsa
+    optimality_without_tsa.main()
+    return True
+
+def run_aggregate_results():
+    """Aggregate fluency results."""
+    logger.info("\nAggregating results...")
+    import aggregate_fluency_results
+    aggregate_fluency_results.main()
+    return True
+
+def run_stats():
+    """Run statistical analysis."""
+    logger.info("\nRunning statistical analysis...")
+    import stats_fluency
+    stats_fluency.main()
+    return True
+
+def main():
+    """Main execution pipeline."""
+    logger.info("Starting VELAS fluency analysis pipeline...")
+
+    # Create necessary directories
+    RESULTS_DIR.mkdir(parents=True, exist_ok=True)
+
+    # Pipeline execution order
+    pipeline_steps = [
+        ("Questionnaire Data Processing", run_questionnaires),
+        ("Behavioral Data Processing", run_behavioral_data),
+        ("Duplicate Check", run_check_duplicates),
+        ("Coherence Analysis", run_coherence),
+        ("Optimality Analysis", run_optimality),
+        ("Result Aggregation", run_aggregate_results),
+        ("Statistical Analysis", run_stats)
+    ]
+
+    # Execute pipeline
+    for step_name, step_func in pipeline_steps:
+        logger.info(f"\n{'='*50}")
+        logger.info(f"Starting {step_name}")
+        logger.info(f"{'='*50}")
+
+        try:
+            success = step_func()
+            if not success:
+                logger.error(f"{step_name} failed. Stopping pipeline.")
+                return
+            logger.info(f"{step_name} completed successfully.")
+        except Exception as e:
+            logger.exception(f"Error in {step_name}: {str(e)}")
+            return
+
+    logger.info("\nPipeline completed successfully!")
+
+if __name__ == "__main__":
+    main()
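
In this hunk, validate_paths() resolves dot-separated keys against CONFIG: any key whose path contains 'input' must already exist, and every other key is treated as an output location and created via ensure_output_dir(). In the code shown, main() itself never invokes validate_paths. A hedged sketch of how it might be called; the dotted keys below are illustrative, chosen to match the CONFIG layout from config.py, and are not shipped in main.py:

    # Hypothetical invocation sketch -- not part of the released file.
    from config import CONFIG
    from main import validate_paths  # importing main runs its module-level setup

    required = [
        "behavioral.paths.input",       # 'input' in the key path -> must already exist
        "coherence.paths.results_dir",  # otherwise -> created via ensure_output_dir()
    ]
    if validate_paths(CONFIG, required):
        print("All required paths are ready.")

Run as "python main.py", the script executes the seven pipeline steps in order and stops at the first step that fails or raises.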