pelican-nlp 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pelican_nlp/Nils_backup/__init__.py +0 -0
- pelican_nlp/Nils_backup/extract_acoustic_features.py +274 -0
- pelican_nlp/Nils_backup/fluency/__init__.py +0 -0
- pelican_nlp/Nils_backup/fluency/aggregate_fluency_results.py +186 -0
- pelican_nlp/Nils_backup/fluency/behavioral_data.py +42 -0
- pelican_nlp/Nils_backup/fluency/check_duplicates.py +169 -0
- pelican_nlp/Nils_backup/fluency/coherence.py +653 -0
- pelican_nlp/Nils_backup/fluency/config.py +231 -0
- pelican_nlp/Nils_backup/fluency/main.py +182 -0
- pelican_nlp/Nils_backup/fluency/optimality_without_tsa.py +466 -0
- pelican_nlp/Nils_backup/fluency/plot_fluency.py +573 -0
- pelican_nlp/Nils_backup/fluency/plotting_utils.py +170 -0
- pelican_nlp/Nils_backup/fluency/questionnaires_data.py +43 -0
- pelican_nlp/Nils_backup/fluency/stats_fluency.py +930 -0
- pelican_nlp/Nils_backup/fluency/utils.py +41 -0
- pelican_nlp/Nils_backup/speaker_diarization_Nils.py +328 -0
- pelican_nlp/Nils_backup/transcription/__init__.py +0 -0
- pelican_nlp/Nils_backup/transcription/annotation_tool.py +1001 -0
- pelican_nlp/Nils_backup/transcription/annotation_tool_boundaries.py +1122 -0
- pelican_nlp/Nils_backup/transcription/annotation_tool_sandbox.py +985 -0
- pelican_nlp/Nils_backup/transcription/output/holmes_control_nova_all_outputs.json +7948 -0
- pelican_nlp/Nils_backup/transcription/test.json +1 -0
- pelican_nlp/Nils_backup/transcription/transcribe_audio.py +314 -0
- pelican_nlp/Nils_backup/transcription/transcribe_audio_chunked.py +695 -0
- pelican_nlp/Nils_backup/transcription/transcription.py +801 -0
- pelican_nlp/Nils_backup/transcription/transcription_gui.py +955 -0
- pelican_nlp/Nils_backup/transcription/word_boundaries.py +190 -0
- pelican_nlp/Silvia_files/Opensmile/opensmile_feature_extraction.py +66 -0
- pelican_nlp/Silvia_files/prosogram/prosogram.py +104 -0
- pelican_nlp/__init__.py +1 -1
- pelican_nlp/_version.py +1 -0
- pelican_nlp/configuration_files/config_audio.yml +150 -0
- pelican_nlp/configuration_files/config_discourse.yml +104 -0
- pelican_nlp/configuration_files/config_fluency.yml +108 -0
- pelican_nlp/configuration_files/config_general.yml +131 -0
- pelican_nlp/configuration_files/config_morteza.yml +103 -0
- pelican_nlp/praat/__init__.py +29 -0
- {pelican_nlp-0.1.0.dist-info → pelican_nlp-0.1.2.dist-info}/METADATA +14 -21
- pelican_nlp-0.1.2.dist-info/RECORD +75 -0
- pelican_nlp-0.1.0.dist-info/RECORD +0 -39
- {pelican_nlp-0.1.0.dist-info → pelican_nlp-0.1.2.dist-info}/WHEEL +0 -0
- {pelican_nlp-0.1.0.dist-info → pelican_nlp-0.1.2.dist-info}/licenses/LICENSE +0 -0
- {pelican_nlp-0.1.0.dist-info → pelican_nlp-0.1.2.dist-info}/top_level.txt +0 -0
pelican_nlp/Nils_backup/__init__.py: File without changes

pelican_nlp/Nils_backup/extract_acoustic_features.py:
@@ -0,0 +1,274 @@
import os

import pandas as pd
import numpy as np
from pydub import AudioSegment
from pyannote.audio import Model, Inference


class AudioFeatureExtractor:
    def __init__(self, model_name, token, device="cpu"):
        """
        Initializes the AudioFeatureExtractor class.

        Parameters:
        - model_name: str, name of the pretrained model_instance from pyannote
        - token: str, the Hugging Face authentication token for downloading the model_instance
        - device: str, device to run the model_instance on (default is "cpu")
        """
        self.model = Model.from_pretrained(model_name, use_auth_token=token).to(device)

    def extract_audio_window(self, audio, start_time=0, duration=60000):
        """
        Extract a segment from the audio starting at `start_time` with a specified duration.

        Parameters:
        - audio: AudioSegment object, the input audio
        - start_time: int, starting point of the window in milliseconds (default is 0)
        - duration: int, duration of the window to extract in milliseconds (default is 60000)

        Returns:
        - AudioSegment object of the extracted window
        """
        end_time = start_time + duration
        return audio[start_time:end_time]

    def extract_embeddings(self, inference, file_path):
        """
        Extract embeddings from an audio file using the inference model_instance.

        Parameters:
        - inference: Inference object from pyannote
        - file_path: str, path to the audio file

        Returns:
        - numpy array of embeddings
        """
        embeddings = inference(file_path)
        return np.asarray(embeddings)

    def process_audio(self, file_path, mode="whole", start_time=0, duration=60000, window_step=None):
        """
        Process an audio file, extracting either whole or windowed embeddings based on mode.

        Parameters:
        - file_path: str, path to the audio file
        - mode: str, "whole" for whole file extraction or "window" for windowed extraction (default is "whole")
        - start_time: int, start time for the audio segment in milliseconds (only for window mode, default is 0)
        - duration: int, duration for the audio segment in milliseconds (default is 60000)
        - window_step: int, step size for window extraction in milliseconds (only for "window" mode)

        Returns:
        - numpy array of embeddings
        """
        audio = AudioSegment.from_file(file_path)
        if mode == "whole":
            inference = Inference(self.model, window="whole")
            embeddings = self.extract_embeddings(inference, file_path)
        elif mode == "window":
            # If window mode is specified, we extract in a sliding window fashion
            if window_step is None:
                raise ValueError("window_step must be specified for 'window' mode.")
            embeddings = []
            # NOTE: pyannote's sliding Inference expects duration/step in seconds;
            # here they are passed in milliseconds (corrected in the second class below).
            inference = Inference(self.model, window="sliding", duration=duration, step=window_step)

            # Split audio into windows and extract embeddings for each window
            for i in range(0, len(audio), window_step):
                window_audio = self.extract_audio_window(audio, start_time=i, duration=duration)
                temp_path = f"temp_window_{i}.wav"
                window_audio.export(temp_path, format="wav")
                window_embeddings = self.extract_embeddings(inference, temp_path)
                embeddings.append(window_embeddings)
                os.remove(temp_path)
            embeddings = np.vstack(embeddings)  # Stack all window embeddings
        else:
            raise ValueError("Invalid mode. Use 'whole' or 'window'.")

        return embeddings

    def save_embeddings(self, embeddings, output_path):
        """
        Save the embeddings to a CSV file.

        Parameters:
        - embeddings: numpy array of embeddings
        - output_path: str, path to save the CSV file
        """
        df = pd.DataFrame(embeddings)
        df.to_csv(output_path, index=False)

# Example usage:
if __name__ == "__main__":
    # Initialize the extractor
    extractor = AudioFeatureExtractor(
        model_name="pyannote/embedding",
        token="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",  # NOTE: hard-coded token shipped in the published wheel
        device="mps"
    )

    # Process a whole file
    whole_embeddings = extractor.process_audio(
        file_path="path/to/audio_file.wav",
        mode="whole"
    )

    # Process a file using sliding window extraction
    window_embeddings = extractor.process_audio(
        file_path="path/to/audio_file.wav",
        mode="window",
        start_time=0,
        duration=10000,   # e.g., 10 seconds window
        window_step=5000  # e.g., 5 seconds step
    )

    # Save the embeddings
    extractor.save_embeddings(whole_embeddings, "path/to/output_whole.csv")
    extractor.save_embeddings(window_embeddings, "path/to/output_window.csv")


'''import os
import numpy as np
from pydub import AudioSegment
from pyannote.audio import Model, Inference'''


class AudioFeatureExtractor:
    def __init__(self, model_name_or_instance, device="cpu", use_auth_token=None):
        """
        Initializes the AudioFeatureExtractor class.

        Parameters:
        - model_name_or_instance: str or Model, the name of the pretrained model_instance from pyannote or an instance of Model
        - device: str, device to run the model_instance on (default is "cpu")
        - use_auth_token: str, Hugging Face authentication token if required
        """
        if isinstance(model_name_or_instance, str):
            self.model = Model.from_pretrained(
                model_name_or_instance, use_auth_token=use_auth_token
            ).to(device)
        else:
            self.model = model_name_or_instance.to(device)
        self.device = device

    def extract_audio_window(self, audio, start_time=0, duration=None):
        """
        Extract a segment from the audio starting at 'start_time' with a specified 'duration'.

        Parameters:
        - audio: AudioSegment object, the input audio
        - start_time: int, starting point of the window in milliseconds (default is 0)
        - duration: int, duration of the window to extract in milliseconds (default is None, till the end)

        Returns:
        - AudioSegment object of the extracted window
        """
        if duration is None:
            duration = len(audio) - start_time
        end_time = start_time + duration
        return audio[start_time:end_time]

    def extract_embeddings(self, inference, file_path):
        """
        Extract embeddings from the audio file using the specified inference model_instance.

        Parameters:
        - inference: Inference object from pyannote
        - file_path: str, path to the audio file

        Returns:
        - numpy array of embeddings
        """
        embeddings = inference(file_path)
        return np.asarray(embeddings)

    def process_audio(self, file_path, mode="whole", window_duration=None, window_step=None, start_time=0, end_time=None):
        """
        Process an audio file, extracting embeddings based on the specified mode.

        Parameters:
        - file_path: str, path to the audio file
        - mode: str, "whole" for whole file extraction or "windowed" for windowed extraction (default is "whole")
        - window_duration: int, duration of the window in milliseconds (required for "windowed" mode)
        - window_step: int, step size in milliseconds between windows (required for "windowed" mode)
        - start_time: int, start time in milliseconds for processing (default is 0)
        - end_time: int, end time in milliseconds for processing (default is None, till the end)

        Returns:
        - numpy array of embeddings
        """
        # Load and optionally trim the audio file
        audio = AudioSegment.from_file(file_path)
        if end_time is None or end_time > len(audio):
            end_time = len(audio)
        audio = audio[start_time:end_time]

        # Export the (possibly trimmed) audio to a temporary file
        temp_dir = "temp_audio"
        os.makedirs(temp_dir, exist_ok=True)
        temp_path = os.path.join(temp_dir, "temp_audio.wav")
        audio.export(temp_path, format="wav")

        if mode == "whole":
            inference = Inference(self.model, window="whole")
            embeddings = self.extract_embeddings(inference, temp_path)
        elif mode == "windowed":
            if window_duration is None or window_step is None:
                raise ValueError("window_duration and window_step must be specified for 'windowed' mode.")
            # Convert milliseconds to seconds for pyannote
            window_duration_sec = window_duration / 1000.0
            window_step_sec = window_step / 1000.0
            inference = Inference(
                self.model,
                window="sliding",
                duration=window_duration_sec,
                step=window_step_sec
            )
            embeddings = self.extract_embeddings(inference, temp_path)
        else:
            os.remove(temp_path)
            raise ValueError("Invalid mode. Use 'whole' or 'windowed'.")

        # Clean up temporary file
        os.remove(temp_path)
        return embeddings

    def save_embeddings(self, embeddings, output_path):
        """
        Save the embeddings to a file.

        Parameters:
        - embeddings: numpy array, the embeddings to save
        - output_path: str, the path where embeddings will be saved
        """
        np.save(output_path, embeddings)
        # Alternatively, to save as CSV:
        # np.savetxt(output_path, embeddings, delimiter=",")


# Example usage:
if __name__ == "__main__":
    # Initialize the extractor with a model_instance name and token if required
    extractor = AudioFeatureExtractor(
        model_name_or_instance="pyannote/embedding",
        device="cpu",
        use_auth_token="YOUR_HUGGING_FACE_TOKEN"  # Replace with your token if necessary
    )

    # Path to your audio file
    audio_file_path = "path/to/your/audio_file.wav"

    # Extract embeddings from the whole audio file
    whole_embeddings = extractor.process_audio(
        file_path=audio_file_path,
        mode="whole"
    )

    # Save the embeddings
    extractor.save_embeddings(whole_embeddings, "whole_embeddings.npy")

    # Extract embeddings using sliding windows
    windowed_embeddings = extractor.process_audio(
        file_path=audio_file_path,
        mode="windowed",
        window_duration=5000,  # Window duration in milliseconds (e.g., 5000 ms = 5 seconds)
        window_step=1000       # Window step in milliseconds (e.g., 1000 ms = 1 second)
    )

    # Save the windowed embeddings
    extractor.save_embeddings(windowed_embeddings, "windowed_embeddings.npy")
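Because the module defines AudioFeatureExtractor twice, only the second definition survives at import time; its substantive fixes over the first are the millisecond-to-second conversion for pyannote's sliding-window Inference and routing all inference through a single temporary WAV export. A minimal sketch of that underlying pyannote.audio call pattern, with placeholder token and file path (the np.asarray conversion mirrors the module's own extract_embeddings):

import numpy as np
from pyannote.audio import Model, Inference

model = Model.from_pretrained("pyannote/embedding", use_auth_token="YOUR_HF_TOKEN")  # placeholder token
inference = Inference(model, window="sliding", duration=5.0, step=1.0)  # seconds, not milliseconds
embeddings = np.asarray(inference("sample.wav"))  # placeholder path; one embedding row per sliding window
print(embeddings.shape)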
pelican_nlp/Nils_backup/fluency/__init__.py: File without changes

pelican_nlp/Nils_backup/fluency/aggregate_fluency_results.py:
@@ -0,0 +1,186 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jun 18 12:28:18 2024

@author: nilsl
"""
import os
import pandas as pd
import numpy as np
from config import CONFIG


def add_prefix_if_short(text, length, prefix):
    if len(text) < length:
        return prefix + text
    return text


def parse_array_from_string(s):
    """Parses a numpy array from a string representation."""
    cleaned = s.strip("[]")
    return np.array([float(x) for x in cleaned.split()])


def load_csvs_to_dataframe(directory, match_str, indeces):
    df_list = []
    # Loop through all files in the specified directory
    for filename in os.listdir(directory):
        # Check if the file is a CSV and contains the match_str in the filename
        if filename.endswith(".csv") and match_str in filename:
            # Construct the full path to the file
            file_path = os.path.join(directory, filename)
            # Read the CSV file into a DataFrame
            df = pd.read_csv(file_path, index_col=0, dtype=str)
            # Tag the frame with attributes parsed from the underscore-separated filename
            for i, index in enumerate(indeces):
                df[f"attr_{i}"] = filename.split(".")[0].split("_")[index]
            df_list.append(df)

    return pd.concat(df_list, axis=0)


def pivot_df(df, index_cols, pivot_cols):
    # Perform the pivot operation
    df_pivot = df.pivot_table(
        index=index_cols, columns=pivot_cols, aggfunc="first"
    )

    # Flatten the MultiIndex columns by joining level names with corresponding values
    df_pivot.columns = [
        "{}_{}".format("_".join(map(str, col[:-1])), col[-1])
        for col in df_pivot.columns
    ]

    # Reset index to make index columns regular columns again
    df_pivot.reset_index(inplace=True)

    # Handling the columns after resetting index
    all_cols = df_pivot.columns.tolist()
    non_index_cols = [col for col in all_cols if col not in index_cols]

    # Sorting non-index columns by base name and context number, modified to handle multiple pivots
    sorted_cols = sorted(
        non_index_cols, key=lambda x: (x.split("_")[0], int(x.split("_")[-1]))
    )

    # Reordering DataFrame columns including index and sorted other columns
    df_pivot = df_pivot[index_cols + sorted_cols]

    return df_pivot


def main():
    """Main execution function."""
    lower = CONFIG["shared"]["preprocessing"]["lower"]

    behav = pd.read_csv(CONFIG["aggregation"]["paths"]["behav_agg"], dtype=str)
    behav["study_id"] = (
        behav["study_id"]
        .apply(add_prefix_if_short, length=4, prefix="0")
        .apply(add_prefix_if_short, length=4, prefix="0")
    )
    behav_scores = [col for col in behav.columns if col not in ["study_id", "group", "gender", "first_language", "diagnosis"]]
    behav[behav_scores] = behav[behav_scores].astype(float)

    demo = pd.read_csv(
        CONFIG["aggregation"]["paths"]["demo_clinical"],
        dtype=str
    )[CONFIG["aggregation"]["demo_columns"]]

    demo_scores = [col for col in demo.columns if col not in ["study_id", "group", "gender", "first_language", "diagnosis"]]
    demo[demo_scores] = demo[demo_scores].astype(float)

    behav_mss = pd.read_csv(CONFIG["aggregation"]["paths"]["questionnaires"], dtype=str)
    behav_mss[behav_mss.columns.drop("study_id")] = behav_mss[behav_mss.columns.drop("study_id")].astype(float)

    behav_mss["study_id"] = (
        behav_mss["study_id"]
        .apply(add_prefix_if_short, length=4, prefix="0")
        .apply(add_prefix_if_short, length=4, prefix="0")
    )

    fluency_optimality = (
        load_csvs_to_dataframe(
            CONFIG["optimality"]["paths"]["results_dir"],
            "lower" if lower else "upper",
            [3, 4, 5],
        )
        .rename(
            columns={
                "attr_0": "min_length",
                "attr_1": "index_0_shuffle",
                "attr_2": "index_-1_shuffle",
            }
        )
        .drop("task", axis=1)
    )

    fluency_optimality["z_Real"] = (
        fluency_optimality["actual_dist"].astype(float)
        - fluency_optimality["average_dist"].astype(float)
    ) / fluency_optimality["std_dist"].astype(float)
    fluency_optimality["z_Real"] = fluency_optimality["z_Real"].astype(float)
    fluency_optimality["all_pairs_average"] = fluency_optimality[
        "all_pairs_average"
    ].astype(float)
    fluency_optimality["actual_dist"] = fluency_optimality["actual_dist"].astype(float)
    fluency_optimality["min_length"] = fluency_optimality["min_length"].astype(int)

    fluency_optimality = (
        fluency_optimality.drop("window_index", axis=1)
        .groupby(
            [
                "min_length",
                "index_0_shuffle",
                "index_-1_shuffle",
                "analysis_mode",
                "study_id",
                "sub_task",
            ]
        )["z_Real"]
        .mean()
        .reset_index()
    )

    fluency_optimality_pivot = pivot_df(
        fluency_optimality,
        ["study_id", "sub_task"],
        ["analysis_mode", "index_0_shuffle", "index_-1_shuffle", "min_length"],
    )

    fluency_coherence = pd.read_csv(
        os.path.join(CONFIG["coherence"]["paths"]["results_dir"],
                     f"coherence_results{'_lower' if lower else '_upper'}.csv"),
        dtype=str
    )

    print(fluency_coherence.columns)
    print(fluency_coherence.head())

    fluency_coherence[fluency_coherence.columns.drop(["study_id", "sub_task"])] = \
        fluency_coherence[fluency_coherence.columns.drop(["study_id", "sub_task"])].astype(float)

    index = demo.merge(behav).merge(behav_mss)
    metrics = fluency_coherence.merge(fluency_optimality_pivot, how="outer")

    paper_df = index.merge(metrics)

    paper_df["task"] = ""

    paper_df.loc[paper_df["sub_task"] == "b", "task"] = "phonetic"
    paper_df.loc[paper_df["sub_task"] == "k", "task"] = "phonetic"
    paper_df.loc[paper_df["sub_task"] == "m", "task"] = "phonetic"

    paper_df.loc[paper_df["sub_task"] == "animals", "task"] = "semantic"
    paper_df.loc[paper_df["sub_task"] == "clothes", "task"] = "semantic"
    paper_df.loc[paper_df["sub_task"] == "food", "task"] = "semantic"

    paper_df.to_csv(CONFIG["aggregation"]["paths"]["output"])

    return True


if __name__ == "__main__":
    main()
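The pivot step is the least obvious part of this aggregation: pivot_table(..., aggfunc="first") spreads each metric over one column per combination of the pivot keys, and the resulting MultiIndex is flattened into plain column names. A toy illustration of pivot_df with made-up values (the column names are illustrative, not the study's actual ones):

import pandas as pd

toy = pd.DataFrame({
    "study_id": ["0001", "0001", "0002", "0002"],
    "sub_task": ["animals"] * 4,
    "analysis_mode": ["semantic"] * 4,
    "min_length": [8, 16, 8, 16],
    "z_Real": [-0.4, -0.9, 0.2, -0.1],
})
wide = pivot_df(toy, ["study_id", "sub_task"], ["analysis_mode", "min_length"])
print(wide.columns.tolist())
# ['study_id', 'sub_task', 'z_Real_semantic_8', 'z_Real_semantic_16']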
pelican_nlp/Nils_backup/fluency/behavioral_data.py:
@@ -0,0 +1,42 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Process VELAS behavioral data.

This script:
1. Loads the VELAS behavioral master data
2. Renames cognitive test variables for clarity
3. Selects relevant columns
4. Outputs cleaned CSV
"""
import pandas as pd
from utils import ensure_output_dir
from config import BEHAVIORAL_CONFIG

def load_behavioral_data(filepath):
    """Load behavioral data from CSV."""
    return pd.read_csv(filepath)

def rename_cognitive_variables(df):
    """Rename cognitive test variables for clarity."""
    return df.rename(columns=BEHAVIORAL_CONFIG["cognitive_variable_mapping"])

def save_aggregated_data(df, output_path):
    """Save relevant columns to CSV."""
    ensure_output_dir(output_path)
    df[BEHAVIORAL_CONFIG["columns_to_save"]].to_csv(output_path, index=False)

def main():
    # Get paths from config
    paths = BEHAVIORAL_CONFIG["paths"]

    # Process data
    df = load_behavioral_data(paths["input"])
    print(df.columns)
    df = rename_cognitive_variables(df)
    save_aggregated_data(df, paths["output"])

    print(f"Processed behavioral data saved to: {paths['output']}")

if __name__ == "__main__":
    main()
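Everything here is driven by BEHAVIORAL_CONFIG, imported from the sibling config.py (listed above with +231 lines). A hypothetical minimal shape showing just the three keys this script reads (the paths, mapping, and column list below are illustrative placeholders, not the real study values):

BEHAVIORAL_CONFIG = {
    "paths": {
        "input": "data/behavioral_master.csv",    # illustrative path
        "output": "output/behavioral_clean.csv",  # illustrative path
    },
    "cognitive_variable_mapping": {"wst_raw": "verbal_iq"},  # illustrative mapping
    "columns_to_save": ["study_id", "verbal_iq"],            # illustrative columns
}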
pelican_nlp/Nils_backup/fluency/check_duplicates.py:
@@ -0,0 +1,169 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Process VELAS fluency transcripts by cleaning duplicates and hyphenated words.

This script:
1. Analyzes text files for duplicates and hyphenated words
2. Cleans transcripts by removing duplicates and hyphens
3. Saves processed transcripts to output directory
"""
import os
import re
from collections import Counter
from pathlib import Path
from typing import Dict, List, Tuple
from utils import ensure_output_dir, validate_input_data
from config import DUPLICATES_CONFIG


# implemented in fluency_cleaner ==============================================
def analyze_transcript(content: str) -> Tuple[int, int]:
    """
    Count duplicates and hyphenated words in a transcript.

    Args:
        content: Semicolon-separated transcript content

    Returns:
        Tuple of (duplicate_count, hyphenated_word_count)
    """
    words = content.split(';')
    word_counter = Counter(words)

    duplicates = sum(count - 1 for count in word_counter.values() if count > 1)
    hyphenated = sum(1 for word in words if '-' in word)

    return duplicates, hyphenated
# =============================================================================

def analyze_directory(directory: str) -> Dict[str, int]:
    """
    Analyze all transcripts in directory for duplicates and hyphenated words.

    Args:
        directory: Path to transcript directory

    Returns:
        Dictionary with total counts of duplicates and hyphenated words
    """
    total_duplicates = 0
    total_hyphenated = 0

    for filename in os.listdir(directory):
        if filename.endswith('.txt') and DUPLICATES_CONFIG["file_filter"] in filename:
            filepath = os.path.join(directory, filename)
            with open(filepath, 'r') as file:
                content = file.read()
            duplicates, hyphenated = analyze_transcript(content)
            total_duplicates += duplicates
            total_hyphenated += hyphenated

    return {
        'duplicates': total_duplicates,
        'hyphenated': total_hyphenated
    }

# =============================================================================
# implemented in fluency_cleaner
def clean_transcript(content: str) -> str:
    """Clean a transcript: strip whitespace, remove hyphens, and de-duplicate words."""
    # Remove all whitespace
    content = re.sub(r'\s+', '', content).strip()

    # Split on ';', drop empty tokens, and strip hyphens from words
    words = [word for word in content.split(';') if word]
    words = [word.replace('-', '') for word in words]

    # Remove duplicate words while preserving order
    word_counter = Counter(words)
    seen = set()
    cleaned_words = []

    for word in words:
        if word in seen and word_counter[word] > 1:
            word_counter[word] -= 1
        else:
            cleaned_words.append(word)
            seen.add(word)

    return ';'.join(cleaned_words)
# =============================================================================

def process_directory(input_dir: str, output_dir: str) -> None:
    """
    Process all transcripts in directory, cleaning and saving to output directory.

    Args:
        input_dir: Directory containing raw transcripts
        output_dir: Directory for cleaned transcripts
    """
    # Create the output directory and any necessary parent directories
    print(f"Creating output directory: {output_dir}")
    print(f"Output directory exists before ensure_output_dir? {os.path.exists(output_dir)}")
    ensure_output_dir(output_dir)
    print(f"Output directory exists after ensure_output_dir? {os.path.exists(output_dir)}")

    for filename in os.listdir(input_dir):
        if filename.endswith('.txt') and DUPLICATES_CONFIG["file_filter"] in filename:
            input_path = os.path.join(input_dir, filename)
            output_path = os.path.join(output_dir, filename)

            print(f"\nProcessing file: {filename}")
            print(f"Input path: {input_path}")
            print(f"Output path: {output_path}")
            print(f"Output dir exists? {os.path.exists(os.path.dirname(output_path))}")

            # Ensure the directory for this specific file exists
            ensure_output_dir(os.path.dirname(output_path))
            print(f"Output dir exists after ensure? {os.path.exists(os.path.dirname(output_path))}")

            with open(input_path, 'r') as infile:
                content = infile.read()
            cleaned_content = clean_transcript(content)

            with open(output_path, 'w') as outfile:
                outfile.write(cleaned_content)

def print_analysis_results(results: Dict[str, int], stage: str) -> None:
    """Print analysis results in a formatted way."""
    print(f"\nAnalysis results ({stage}):")
    print(f"- Total duplicates: {results['duplicates']}")
    print(f"- Total hyphenated words: {results['hyphenated']}")

def main():
    # Get paths from config
    paths = DUPLICATES_CONFIG["paths"]

    # Validate input paths and create output directories
    input_errors = validate_input_data({"transcripts": paths["input"]})
    if input_errors:
        for desc, error in input_errors.items():
            print(f"Error with {desc}: {error}")
        return

    print(f"\nInput directory: {paths['input']}")
    print(f"Output directory: {paths['output']}")
    print(f"Input directory exists? {os.path.exists(paths['input'])}")

    ensure_output_dir(paths["output"])
    print(f"Output directory exists after ensure? {os.path.exists(paths['output'])}")

    # Analyze original transcripts
    print("\nAnalyzing original transcripts...")
    original_results = analyze_directory(paths["input"])
    print_analysis_results(original_results, "before cleaning")

    # Process transcripts
    print("\nCleaning transcripts...")
    process_directory(paths["input"], paths["output"])

    # Analyze cleaned transcripts
    print("\nAnalyzing cleaned transcripts...")
    cleaned_results = analyze_directory(paths["output"])
    print_analysis_results(cleaned_results, "after cleaning")

    print(f"\nCleaned transcripts saved to: {paths['output']}")

if __name__ == "__main__":
    main()
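Traced end to end, clean_transcript removes all whitespace, drops empty tokens, strips hyphens, and keeps only the first occurrence of each word. An illustrative call with made-up words:

>>> clean_transcript("apple; banana;apple;ice-cream")
'apple;banana;icecream'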