smftools 0.1.7__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (174)
  1. smftools/__init__.py +7 -6
  2. smftools/_version.py +1 -1
  3. smftools/cli/cli_flows.py +94 -0
  4. smftools/cli/hmm_adata.py +338 -0
  5. smftools/cli/load_adata.py +577 -0
  6. smftools/cli/preprocess_adata.py +363 -0
  7. smftools/cli/spatial_adata.py +564 -0
  8. smftools/cli_entry.py +435 -0
  9. smftools/config/__init__.py +1 -0
  10. smftools/config/conversion.yaml +38 -0
  11. smftools/config/deaminase.yaml +61 -0
  12. smftools/config/default.yaml +264 -0
  13. smftools/config/direct.yaml +41 -0
  14. smftools/config/discover_input_files.py +115 -0
  15. smftools/config/experiment_config.py +1288 -0
  16. smftools/hmm/HMM.py +1576 -0
  17. smftools/hmm/__init__.py +20 -0
  18. smftools/{tools → hmm}/apply_hmm_batched.py +8 -7
  19. smftools/hmm/call_hmm_peaks.py +106 -0
  20. smftools/{tools → hmm}/display_hmm.py +3 -3
  21. smftools/{tools → hmm}/nucleosome_hmm_refinement.py +2 -2
  22. smftools/{tools → hmm}/train_hmm.py +1 -1
  23. smftools/informatics/__init__.py +13 -9
  24. smftools/informatics/archived/deaminase_smf.py +132 -0
  25. smftools/informatics/archived/fast5_to_pod5.py +43 -0
  26. smftools/informatics/archived/helpers/archived/__init__.py +71 -0
  27. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
  28. smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +87 -0
  29. smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
  30. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
  31. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
  32. smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
  33. smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
  34. smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +30 -4
  35. smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
  36. smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +4 -2
  37. smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +5 -4
  38. smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
  39. smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
  40. smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
  41. smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
  42. smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
  43. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +250 -0
  44. smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +8 -7
  45. smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +8 -12
  46. smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
  47. smftools/informatics/bam_functions.py +812 -0
  48. smftools/informatics/basecalling.py +67 -0
  49. smftools/informatics/bed_functions.py +366 -0
  50. smftools/informatics/binarize_converted_base_identities.py +172 -0
  51. smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +198 -50
  52. smftools/informatics/fasta_functions.py +255 -0
  53. smftools/informatics/h5ad_functions.py +197 -0
  54. smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +147 -61
  55. smftools/informatics/modkit_functions.py +129 -0
  56. smftools/informatics/ohe.py +160 -0
  57. smftools/informatics/pod5_functions.py +224 -0
  58. smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
  59. smftools/machine_learning/__init__.py +12 -0
  60. smftools/machine_learning/data/__init__.py +2 -0
  61. smftools/machine_learning/data/anndata_data_module.py +234 -0
  62. smftools/machine_learning/evaluation/__init__.py +2 -0
  63. smftools/machine_learning/evaluation/eval_utils.py +31 -0
  64. smftools/machine_learning/evaluation/evaluators.py +223 -0
  65. smftools/machine_learning/inference/__init__.py +3 -0
  66. smftools/machine_learning/inference/inference_utils.py +27 -0
  67. smftools/machine_learning/inference/lightning_inference.py +68 -0
  68. smftools/machine_learning/inference/sklearn_inference.py +55 -0
  69. smftools/machine_learning/inference/sliding_window_inference.py +114 -0
  70. smftools/machine_learning/models/base.py +295 -0
  71. smftools/machine_learning/models/cnn.py +138 -0
  72. smftools/machine_learning/models/lightning_base.py +345 -0
  73. smftools/machine_learning/models/mlp.py +26 -0
  74. smftools/{tools → machine_learning}/models/positional.py +3 -2
  75. smftools/{tools → machine_learning}/models/rnn.py +2 -1
  76. smftools/machine_learning/models/sklearn_models.py +273 -0
  77. smftools/machine_learning/models/transformer.py +303 -0
  78. smftools/machine_learning/training/__init__.py +2 -0
  79. smftools/machine_learning/training/train_lightning_model.py +135 -0
  80. smftools/machine_learning/training/train_sklearn_model.py +114 -0
  81. smftools/plotting/__init__.py +4 -1
  82. smftools/plotting/autocorrelation_plotting.py +609 -0
  83. smftools/plotting/general_plotting.py +1292 -140
  84. smftools/plotting/hmm_plotting.py +260 -0
  85. smftools/plotting/qc_plotting.py +270 -0
  86. smftools/preprocessing/__init__.py +15 -8
  87. smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
  88. smftools/preprocessing/append_base_context.py +122 -0
  89. smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
  90. smftools/preprocessing/binarize.py +17 -0
  91. smftools/preprocessing/binarize_on_Youden.py +2 -2
  92. smftools/preprocessing/calculate_complexity_II.py +248 -0
  93. smftools/preprocessing/calculate_coverage.py +10 -1
  94. smftools/preprocessing/calculate_position_Youden.py +1 -1
  95. smftools/preprocessing/calculate_read_modification_stats.py +101 -0
  96. smftools/preprocessing/clean_NaN.py +17 -1
  97. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
  98. smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
  99. smftools/preprocessing/flag_duplicate_reads.py +1326 -124
  100. smftools/preprocessing/invert_adata.py +12 -5
  101. smftools/preprocessing/load_sample_sheet.py +19 -4
  102. smftools/readwrite.py +1021 -89
  103. smftools/tools/__init__.py +3 -32
  104. smftools/tools/calculate_umap.py +5 -5
  105. smftools/tools/general_tools.py +3 -3
  106. smftools/tools/position_stats.py +468 -106
  107. smftools/tools/read_stats.py +115 -1
  108. smftools/tools/spatial_autocorrelation.py +562 -0
  109. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/METADATA +14 -9
  110. smftools-0.2.3.dist-info/RECORD +173 -0
  111. smftools-0.2.3.dist-info/entry_points.txt +2 -0
  112. smftools/informatics/fast5_to_pod5.py +0 -21
  113. smftools/informatics/helpers/LoadExperimentConfig.py +0 -75
  114. smftools/informatics/helpers/__init__.py +0 -74
  115. smftools/informatics/helpers/align_and_sort_BAM.py +0 -59
  116. smftools/informatics/helpers/aligned_BAM_to_bed.py +0 -74
  117. smftools/informatics/helpers/bam_qc.py +0 -66
  118. smftools/informatics/helpers/bed_to_bigwig.py +0 -39
  119. smftools/informatics/helpers/binarize_converted_base_identities.py +0 -79
  120. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -55
  121. smftools/informatics/helpers/index_fasta.py +0 -12
  122. smftools/informatics/helpers/make_dirs.py +0 -21
  123. smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +0 -53
  124. smftools/informatics/load_adata.py +0 -182
  125. smftools/informatics/readwrite.py +0 -106
  126. smftools/informatics/subsample_fasta_from_bed.py +0 -47
  127. smftools/preprocessing/append_C_context.py +0 -82
  128. smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -94
  129. smftools/preprocessing/filter_converted_reads_on_methylation.py +0 -44
  130. smftools/preprocessing/filter_reads_on_length.py +0 -51
  131. smftools/tools/call_hmm_peaks.py +0 -105
  132. smftools/tools/data/__init__.py +0 -2
  133. smftools/tools/data/anndata_data_module.py +0 -90
  134. smftools/tools/inference/__init__.py +0 -1
  135. smftools/tools/inference/lightning_inference.py +0 -41
  136. smftools/tools/models/base.py +0 -14
  137. smftools/tools/models/cnn.py +0 -34
  138. smftools/tools/models/lightning_base.py +0 -41
  139. smftools/tools/models/mlp.py +0 -17
  140. smftools/tools/models/sklearn_models.py +0 -40
  141. smftools/tools/models/transformer.py +0 -133
  142. smftools/tools/training/__init__.py +0 -1
  143. smftools/tools/training/train_lightning_model.py +0 -47
  144. smftools-0.1.7.dist-info/RECORD +0 -136
  145. /smftools/{tools/evaluation → cli}/__init__.py +0 -0
  146. /smftools/{tools → hmm}/calculate_distances.py +0 -0
  147. /smftools/{tools → hmm}/hmm_readwrite.py +0 -0
  148. /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
  149. /smftools/informatics/{conversion_smf.py → archived/conversion_smf.py} +0 -0
  150. /smftools/informatics/{direct_smf.py → archived/direct_smf.py} +0 -0
  151. /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
  152. /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
  153. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
  154. /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
  155. /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
  156. /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
  157. /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
  158. /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
  159. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
  160. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
  161. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
  162. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
  163. /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
  164. /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
  165. /smftools/{tools → machine_learning}/data/preprocessing.py +0 -0
  166. /smftools/{tools → machine_learning}/models/__init__.py +0 -0
  167. /smftools/{tools → machine_learning}/models/wrappers.py +0 -0
  168. /smftools/{tools → machine_learning}/utils/__init__.py +0 -0
  169. /smftools/{tools → machine_learning}/utils/device.py +0 -0
  170. /smftools/{tools → machine_learning}/utils/grl.py +0 -0
  171. /smftools/tools/{apply_hmm.py → archived/apply_hmm.py} +0 -0
  172. /smftools/tools/{classifiers.py → archived/classifiers.py} +0 -0
  173. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/WHEEL +0 -0
  174. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/licenses/LICENSE +0 -0
smftools/informatics/ohe.py (new file)
@@ -0,0 +1,160 @@
+ import numpy as np
+ import anndata as ad
+
+ import os
+ import concurrent.futures
+
+ def one_hot_encode(sequence, device='auto'):
+     """
+     One-hot encodes a DNA sequence.
+
+     Parameters:
+         sequence (str or list): DNA sequence (e.g., "ACGTN" or ['A', 'C', 'G', 'T', 'N']).
+
+     Returns:
+         ndarray: Flattened one-hot encoded representation of the input sequence.
+     """
+     mapping = np.array(['A', 'C', 'G', 'T', 'N'])
+
+     # Ensure input is a list of characters
+     if not isinstance(sequence, list):
+         sequence = list(sequence) # Convert string to list of characters
+
+     # Handle empty sequences
+     if len(sequence) == 0:
+         print("Warning: Empty sequence encountered in one_hot_encode()")
+         return np.zeros(len(mapping)) # Return empty encoding instead of failing
+
+     # Convert sequence to NumPy array
+     seq_array = np.array(sequence, dtype='<U1')
+
+     # Replace invalid bases with 'N'
+     seq_array = np.where(np.isin(seq_array, mapping), seq_array, 'N')
+
+     # Create one-hot encoding matrix
+     one_hot_matrix = (seq_array[:, None] == mapping).astype(int)
+
+     # Flatten and return
+     return one_hot_matrix.flatten()
+
+ def one_hot_decode(ohe_array):
+     """
+     Takes a flattened one hot encoded array and returns the sequence string from that array.
+     Parameters:
+         ohe_array (np.array): A one hot encoded array
+
+     Returns:
+         sequence (str): Sequence string of the one hot encoded array
+     """
+     # Define the mapping of one-hot encoded indices to DNA bases
+     mapping = ['A', 'C', 'G', 'T', 'N']
+
+     # Reshape the flattened array into a 2D matrix with 5 columns (one for each base)
+     one_hot_matrix = ohe_array.reshape(-1, 5)
+
+     # Get the index of the maximum value (which will be 1) in each row
+     decoded_indices = np.argmax(one_hot_matrix, axis=1)
+
+     # Map the indices back to the corresponding bases
+     sequence_list = [mapping[i] for i in decoded_indices]
+     sequence = ''.join(sequence_list)
+
+     return sequence
+
+ def ohe_layers_decode(adata, obs_names):
+     """
+     Takes an anndata object and a list of observation names. Returns a list of sequence strings for the reads of interest.
+     Parameters:
+         adata (AnnData): An anndata object.
+         obs_names (list): A list of observation name strings to retrieve sequences for.
+
+     Returns:
+         sequences (list of str): List of strings of the one hot encoded array
+     """
+     # Define the mapping of one-hot encoded indices to DNA bases
+     mapping = ['A', 'C', 'G', 'T', 'N']
+
+     ohe_layers = [f"{base}_binary_encoding" for base in mapping]
+     sequences = []
+
+     for obs_name in obs_names:
+         obs_subset = adata[obs_name]
+         ohe_list = []
+         for layer in ohe_layers:
+             ohe_list += list(obs_subset.layers[layer])
+         ohe_array = np.array(ohe_list)
+         sequence = one_hot_decode(ohe_array)
+         sequences.append(sequence)
+
+     return sequences
+
+ def _encode_sequence(args):
+     """Parallel helper function for one-hot encoding."""
+     read_name, seq, device = args
+     try:
+         one_hot_matrix = one_hot_encode(seq, device)
+         return read_name, one_hot_matrix
+     except Exception:
+         return None # Skip invalid sequences
+
+ def _encode_and_save_batch(batch_data, tmp_dir, prefix, record, batch_number):
+     """Encodes a batch and writes to disk immediately."""
+     batch = {read_name: matrix for read_name, matrix in batch_data if matrix is not None}
+
+     if batch:
+         save_name = os.path.join(tmp_dir, f'tmp_{prefix}_{record}_{batch_number}.h5ad')
+         tmp_ad = ad.AnnData(X=np.zeros((1, 1)), uns=batch) # Placeholder X
+         tmp_ad.write_h5ad(save_name)
+         return save_name
+     return None
+
+ def ohe_batching(base_identities, tmp_dir, record, prefix='', batch_size=100000, progress_bar=None, device='auto', threads=None):
+     """
+     Efficient version of ohe_batching: one-hot encodes sequences in parallel and writes batches immediately.
+
+     Parameters:
+         base_identities (dict): Dictionary mapping read names to sequences.
+         tmp_dir (str): Directory for storing temporary files.
+         record (str): Record name.
+         prefix (str): Prefix for file naming.
+         batch_size (int): Number of reads per batch.
+         progress_bar (tqdm instance, optional): Shared progress bar.
+         device (str): Device for encoding.
+         threads (int, optional): Number of parallel workers.
+
+     Returns:
+         list: List of valid H5AD file paths.
+     """
+     threads = threads or os.cpu_count() # Default to max available CPU cores
+     batch_data = []
+     batch_number = 0
+     file_names = []
+
+     # Step 1: Prepare Data for Parallel Encoding
+     encoding_args = [(read_name, seq, device) for read_name, seq in base_identities.items() if seq is not None]
+
+     # Step 2: Parallel One-Hot Encoding using threads (to avoid nested processes)
+     with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
+         for result in executor.map(_encode_sequence, encoding_args):
+             if result:
+                 batch_data.append(result)
+
+             if len(batch_data) >= batch_size:
+                 # Step 3: Process and Write Batch Immediately
+                 file_name = _encode_and_save_batch(batch_data.copy(), tmp_dir, prefix, record, batch_number)
+                 if file_name:
+                     file_names.append(file_name)
+
+                 batch_data.clear()
+                 batch_number += 1
+
+             if progress_bar:
+                 progress_bar.update(1)
+
+     # Step 4: Process Remaining Batch
+     if batch_data:
+         file_name = _encode_and_save_batch(batch_data, tmp_dir, prefix, record, batch_number)
+         if file_name:
+             file_names.append(file_name)
+
+     return file_names
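
For orientation, a minimal round trip through the ohe.py helpers above might look like the sketch below. It assumes the module is importable as smftools.informatics.ohe (matching the file layout in the list above); the read names, prefix, and temporary directory are made up for illustration.

    import os
    import numpy as np
    from smftools.informatics.ohe import one_hot_encode, one_hot_decode, ohe_batching

    encoded = one_hot_encode("ACGTN")          # flattened 5x5 one-hot, length 25
    assert one_hot_decode(encoded) == "ACGTN"  # bases outside A/C/G/T/N are encoded as 'N'

    # Hypothetical reads; ohe_batching writes one placeholder-X .h5ad per batch into tmp_dir,
    # with the per-read encodings stored in .uns.
    reads = {"read_1": "ACGT", "read_2": "GGTTAC"}
    os.makedirs("tmp_ohe", exist_ok=True)
    files = ohe_batching(reads, tmp_dir="tmp_ohe", record="chr1", prefix="demo", batch_size=1000)
    print(files)                               # e.g. ['tmp_ohe/tmp_demo_chr1_0.h5ad']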
smftools/informatics/pod5_functions.py (new file)
@@ -0,0 +1,224 @@
+ from ..config import LoadExperimentConfig
+ from ..readwrite import make_dirs
+
+ import os
+ import subprocess
+ from pathlib import Path
+
+ import pod5 as p5
+
+ from typing import Union, List
+
+ def basecall_pod5s(config_path):
+     """
+     Basecall from pod5s given a config file.
+
+     Parameters:
+         config_path (str): File path to the basecall configuration file
+
+     Returns:
+         None
+     """
+     # Default params
+     bam_suffix = '.bam' # If different, change from here.
+
+     # Load experiment config parameters into global variables
+     experiment_config = LoadExperimentConfig(config_path)
+     var_dict = experiment_config.var_dict
+
+     # These below variables will point to default_value if they are empty in the experiment_config.csv or if the variable is fully omitted from the csv.
+     default_value = None
+
+     # General config variable init
+     input_data_path = Path(var_dict.get('input_data_path', default_value)) # Path to a directory of POD5s/FAST5s or to a BAM/FASTQ file. Necessary.
+     output_directory = Path(var_dict.get('output_directory', default_value)) # Path to the output directory to make for the analysis. Necessary.
+     model = var_dict.get('model', default_value) # needed for dorado basecaller
+     model_dir = Path(var_dict.get('model_dir', default_value)) # model directory
+     barcode_kit = var_dict.get('barcode_kit', default_value) # needed for dorado basecaller
+     barcode_both_ends = var_dict.get('barcode_both_ends', default_value) # dorado demultiplexing
+     trim = var_dict.get('trim', default_value) # dorado adapter and barcode removal
+     device = var_dict.get('device', 'auto')
+
+     # Modified basecalling specific variable init
+     filter_threshold = var_dict.get('filter_threshold', default_value)
+     m6A_threshold = var_dict.get('m6A_threshold', default_value)
+     m5C_threshold = var_dict.get('m5C_threshold', default_value)
+     hm5C_threshold = var_dict.get('hm5C_threshold', default_value)
+     thresholds = [filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold]
+     mod_list = var_dict.get('mod_list', default_value)
+
+     # Make initial output directory
+     make_dirs([output_directory])
+
+     # Get the input filetype
+     if input_data_path.is_file():
+         input_data_filetype = input_data_path.suffixes[0]
+         input_is_pod5 = input_data_filetype in ['.pod5','.p5']
+         input_is_fast5 = input_data_filetype in ['.fast5','.f5']
+
+     elif input_data_path.is_dir():
+         # Get the file names in the input data dir
+         input_files = input_data_path.iterdir()
+         input_is_pod5 = sum([True for file in input_files if '.pod5' in file or '.p5' in file])
+         input_is_fast5 = sum([True for file in input_files if '.fast5' in file or '.f5' in file])
+
+     # If the input files are not pod5 files, and they are fast5 files, convert the files to a pod5 file before proceeding.
+     if input_is_fast5 and not input_is_pod5:
+         # take the input directory of fast5 files and write out a single pod5 file into the output directory.
+         output_pod5 = output_directory / 'FAST5s_to_POD5.pod5'
+         print(f'Input directory contains fast5 files, converting them and concatenating into a single pod5 file in the {output_pod5}')
+         fast5_to_pod5(input_data_path, output_pod5)
+         # Reassign the pod5_dir variable to point to the new pod5 file.
+         input_data_path = output_pod5
+
+     model_basename = model.name
+     model_basename = model_basename.replace('.', '_')
+
+     if mod_list:
+         mod_string = "_".join(mod_list)
+         bam = output_directory / f"{model_basename}_{mod_string}_calls"
+         modcall(model, input_data_path, barcode_kit, mod_list, bam, bam_suffix, barcode_both_ends, trim, device)
+     else:
+         bam = output_directory / f"{model_basename}_canonical_basecalls"
+         canoncall(model, input_data_path, barcode_kit, bam, bam_suffix, barcode_both_ends, trim, device)
+
+
+ def fast5_to_pod5(
+     fast5_dir: Union[str, Path, List[Union[str, Path]]],
+     output_pod5: Union[str, Path] = "FAST5s_to_POD5.pod5"
+ ) -> None:
+     """
+     Convert Nanopore FAST5 files (single file, list of files, or directory)
+     into a single .pod5 output using the 'pod5 convert fast5' CLI tool.
+     """
+
+     output_pod5 = str(output_pod5) # ensure string
+
+     # 1) If user gives a list of FAST5 files
+     if isinstance(fast5_dir, (list, tuple)):
+         fast5_paths = [str(Path(f)) for f in fast5_dir]
+         cmd = ["pod5", "convert", "fast5", *fast5_paths, "--output", output_pod5]
+         subprocess.run(cmd, check=True)
+         return
+
+     # Ensure Path object
+     p = Path(fast5_dir)
+
+     # 2) If user gives a single file
+     if p.is_file():
+         cmd = ["pod5", "convert", "fast5", str(p), "--output", output_pod5]
+         subprocess.run(cmd, check=True)
+         return
+
+     # 3) If user gives a directory → collect FAST5s
+     if p.is_dir():
+         fast5_paths = sorted(str(f) for f in p.glob("*.fast5"))
+         if not fast5_paths:
+             raise FileNotFoundError(f"No FAST5 files found in {p}")
+
+         cmd = ["pod5", "convert", "fast5", *fast5_paths, "--output", output_pod5]
+         subprocess.run(cmd, check=True)
+         return
+
+     raise FileNotFoundError(f"Input path invalid: {fast5_dir}")
+
+ def subsample_pod5(pod5_path, read_name_path, output_directory):
+     """
+     Takes a POD5 file and a text file containing read names of interest and writes out a subsampled POD5 for just those reads.
+     This is a useful function when you have a list of read names that mapped to a region of interest that you want to reanalyze from the pod5 level.
+
+     Parameters:
+         pod5_path (str): File path to the POD5 file (or directory of multiple pod5 files) to subsample.
+         read_name_path (str | int): File path to a text file of read names. One read name per line. If an int value is passed, a random subset of that many reads will occur
+         output_directory (str): A file path to the directory to output the file.
+
+     Returns:
+         None
+     """
+
+     if os.path.isdir(pod5_path):
+         pod5_path_is_dir = True
+         input_pod5_base = 'input_pod5s.pod5'
+         files = os.listdir(pod5_path)
+         pod5_files = [os.path.join(pod5_path, file) for file in files if '.pod5' in file]
+         pod5_files.sort()
+         print(f'Found input pod5s: {pod5_files}')
+
+     elif os.path.exists(pod5_path):
+         pod5_path_is_dir = False
+         input_pod5_base = os.path.basename(pod5_path)
+
+     else:
+         print('Error: pod5_path passed does not exist')
+         return None
+
+     if type(read_name_path) == str:
+         input_read_name_base = os.path.basename(read_name_path)
+         output_base = input_pod5_base.split('.pod5')[0] + '_' + input_read_name_base.split('.txt')[0] + '_subsampled.pod5'
+
+         # extract read names into a list of strings
+         with open(read_name_path, 'r') as file:
+             read_names = [line.strip() for line in file]
+
+         print(f'Looking for read_ids: {read_names}')
+         read_records = []
+
+         if pod5_path_is_dir:
+             for input_pod5 in pod5_files:
+                 with p5.Reader(input_pod5) as reader:
+                     try:
+                         for read_record in reader.reads(selection=read_names, missing_ok=True):
+                             read_records.append(read_record.to_read())
+                             print(f'Found read in {input_pod5}: {read_record.read_id}')
+                     except:
+                         print('Skipping pod5, could not find reads')
+         else:
+             with p5.Reader(pod5_path) as reader:
+                 try:
+                     for read_record in reader.reads(selection=read_names):
+                         read_records.append(read_record.to_read())
+                         print(f'Found read in {input_pod5}: {read_record}')
+                 except:
+                     print('Could not find reads')
+
+     elif type(read_name_path) == int:
+         import random
+         output_base = input_pod5_base.split('.pod5')[0] + f'_{read_name_path}_randomly_subsampled.pod5'
+         all_read_records = []
+
+         if pod5_path_is_dir:
+             # Shuffle the list of input pod5 paths
+             random.shuffle(pod5_files)
+             for input_pod5 in pod5_files:
+                 # iterate over the input pod5s
+                 print(f'Opening pod5 file {input_pod5}')
+                 with p5.Reader(pod5_path) as reader:
+                     for read_record in reader.reads():
+                         all_read_records.append(read_record.to_read())
+                 # When enough reads are in all_read_records, stop accumulating reads.
+                 if len(all_read_records) >= read_name_path:
+                     break
+
+             if read_name_path <= len(all_read_records):
+                 read_records = random.sample(all_read_records, read_name_path)
+             else:
+                 print('Trying to sample more reads than are contained in the input pod5s, taking all reads')
+                 read_records = all_read_records
+
+         else:
+             with p5.Reader(pod5_path) as reader:
+                 for read_record in reader.reads():
+                     # get all read records from the input pod5
+                     all_read_records.append(read_record.to_read())
+             if read_name_path <= len(all_read_records):
+                 # if the subsampling amount is less than the record amount in the file, randomly subsample the reads
+                 read_records = random.sample(all_read_records, read_name_path)
+             else:
+                 print('Trying to sample more reads than are contained in the input pod5s, taking all reads')
+                 read_records = all_read_records
+
+     output_pod5 = os.path.join(output_directory, output_base)
+
+     # Write the subsampled POD5
+     with p5.Writer(output_pod5) as writer:
+         writer.add_reads(read_records)
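
A hedged usage sketch of the pod5_functions.py helpers above, assuming the module is importable as smftools.informatics.pod5_functions. The paths are hypothetical, the pod5 package and its `pod5 convert fast5` CLI must be installed, and the output directory passed to subsample_pod5 must already exist.

    from smftools.informatics.pod5_functions import fast5_to_pod5, subsample_pod5

    # Convert a directory of FAST5s into a single POD5 by shelling out to `pod5 convert fast5`.
    fast5_to_pod5("runs/fast5_pass", output_pod5="runs/converted.pod5")

    # Keep only the reads named in a text file (one read ID per line).
    subsample_pod5("runs/converted.pod5", "reads_of_interest.txt", "runs/subsampled")

    # Or draw a random subsample of 500 reads from a single POD5 instead of a named list.
    subsample_pod5("runs/converted.pod5", 500, "runs/subsampled")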
smftools/informatics/run_multiqc.py
@@ -9,10 +9,13 @@ def run_multiqc(input_dir, output_dir):
      Returns:
      - None: The function executes MultiQC and prints the status.
      """
-     import os
+     from ..readwrite import make_dirs
      import subprocess
      # Ensure the output directory exists
-     os.makedirs(output_dir, exist_ok=True)
+     make_dirs(output_dir)
+
+     input_dir = str(input_dir)
+     output_dir = str(output_dir)

      # Construct MultiQC command
      command = ["multiqc", input_dir, "-o", output_dir]
smftools/machine_learning/__init__.py (new file)
@@ -0,0 +1,12 @@
+ from . import models
+ from . import data
+ from . import utils
+ from . import evaluation
+ from . import inference
+ from . import training
+
+ __all__ = [
+     "calculate_relative_risk_on_activity",
+     "evaluate_models_by_subgroup",
+     "prepare_melted_model_data",
+ ]
smftools/machine_learning/data/__init__.py (new file)
@@ -0,0 +1,2 @@
+ from .anndata_data_module import AnnDataModule, build_anndata_loader
+ from .preprocessing import random_fill_nans
smftools/machine_learning/data/anndata_data_module.py (new file)
@@ -0,0 +1,234 @@
+ import torch
+ from torch.utils.data import DataLoader, TensorDataset, random_split, Dataset, Subset
+ import pytorch_lightning as pl
+ import numpy as np
+ import pandas as pd
+ from .preprocessing import random_fill_nans
+ from sklearn.utils.class_weight import compute_class_weight
+
+
+ class AnnDataDataset(Dataset):
+     """
+     Generic PyTorch Dataset from AnnData.
+     """
+     def __init__(self, adata, tensor_source="X", tensor_key=None, label_col=None, window_start=None, window_size=None):
+         self.adata = adata
+         self.tensor_source = tensor_source
+         self.tensor_key = tensor_key
+         self.label_col = label_col
+         self.window_start = window_start
+         self.window_size = window_size
+
+         if tensor_source == "X":
+             X = adata.X
+         elif tensor_source == "layers":
+             assert tensor_key in adata.layers
+             X = adata.layers[tensor_key]
+         elif tensor_source == "obsm":
+             assert tensor_key in adata.obsm
+             X = adata.obsm[tensor_key]
+         else:
+             raise ValueError(f"Invalid tensor_source: {tensor_source}")
+
+         if self.window_start is not None and self.window_size is not None:
+             X = X[:, self.window_start : self.window_start + self.window_size]
+
+         X = random_fill_nans(X)
+
+         self.X_tensor = torch.tensor(X, dtype=torch.float32)
+
+         if label_col is not None:
+             y = adata.obs[label_col]
+             if y.dtype.name == 'category':
+                 y = y.cat.codes
+             self.y_tensor = torch.tensor(y.values, dtype=torch.long)
+         else:
+             self.y_tensor = None
+
+     def numpy(self, indices):
+         return self.X_tensor[indices].numpy(), self.y_tensor[indices].numpy()
+
+     def __len__(self):
+         return len(self.X_tensor)
+
+     def __getitem__(self, idx):
+         x = self.X_tensor[idx]
+         if self.y_tensor is not None:
+             y = self.y_tensor[idx]
+             return x, y
+         else:
+             return (x,)
+
+
+ def split_dataset(adata, dataset, train_frac=0.6, val_frac=0.1, test_frac=0.3,
+                   random_seed=42, split_col="train_val_test_split",
+                   load_existing_split=False, split_save_path=None):
+     """
+     Perform split and record assignment into adata.obs[split_col].
+     """
+     total_len = len(dataset)
+
+     if load_existing_split:
+         if split_col in adata.obs:
+             pass # use existing
+         elif split_save_path:
+             split_df = pd.read_csv(split_save_path, index_col=0)
+             adata.obs[split_col] = split_df.loc[adata.obs_names][split_col].values
+         else:
+             raise ValueError("No existing split column found and no file provided.")
+     else:
+         indices = np.arange(total_len)
+         np.random.seed(random_seed)
+         np.random.shuffle(indices)
+
+         n_train = int(train_frac * total_len)
+         n_val = int(val_frac * total_len)
+         n_test = total_len - n_train - n_val
+
+         split_array = np.full(total_len, "test", dtype=object)
+         split_array[indices[:n_train]] = "train"
+         split_array[indices[n_train:n_train + n_val]] = "val"
+         adata.obs[split_col] = split_array
+
+         if split_save_path:
+             adata.obs[[split_col]].to_csv(split_save_path)
+
+     split_labels = adata.obs[split_col].values
+     train_indices = np.where(split_labels == "train")[0]
+     val_indices = np.where(split_labels == "val")[0]
+     test_indices = np.where(split_labels == "test")[0]
+
+     train_set = Subset(dataset, train_indices)
+     val_set = Subset(dataset, val_indices)
+     test_set = Subset(dataset, test_indices)
+
+     return train_set, val_set, test_set
+
+ class AnnDataModule(pl.LightningDataModule):
+     """
+     Unified LightningDataModule version of AnnDataDataset + splitting with adata.obs recording.
+     """
+     def __init__(self, adata, tensor_source="X", tensor_key=None, label_col="labels",
+                  batch_size=64, train_frac=0.6, val_frac=0.1, test_frac=0.3, random_seed=42,
+                  inference_mode=False, split_col="train_val_test_split", split_save_path=None,
+                  load_existing_split=False, window_start=None, window_size=None, num_workers=None, persistent_workers=False):
+         super().__init__()
+         self.adata = adata
+         self.tensor_source = tensor_source
+         self.tensor_key = tensor_key
+         self.label_col = label_col
+         self.batch_size = batch_size
+         self.train_frac = train_frac
+         self.val_frac = val_frac
+         self.test_frac = test_frac
+         self.random_seed = random_seed
+         self.inference_mode = inference_mode
+         self.split_col = split_col
+         self.split_save_path = split_save_path
+         self.load_existing_split = load_existing_split
+         self.var_names = adata.var_names.copy()
+         self.window_start = window_start
+         self.window_size = window_size
+         self.num_workers = num_workers
+         self.persistent_workers = persistent_workers
+
+     def setup(self, stage=None):
+         dataset = AnnDataDataset(self.adata, self.tensor_source, self.tensor_key,
+                                  None if self.inference_mode else self.label_col,
+                                  window_start=self.window_start, window_size=self.window_size)
+
+         if self.inference_mode:
+             self.infer_dataset = dataset
+             return
+
+         self.train_set, self.val_set, self.test_set = split_dataset(
+             self.adata, dataset, train_frac=self.train_frac, val_frac=self.val_frac,
+             test_frac=self.test_frac, random_seed=self.random_seed,
+             split_col=self.split_col, split_save_path=self.split_save_path,
+             load_existing_split=self.load_existing_split
+         )
+
+     def train_dataloader(self):
+         if self.num_workers:
+             return DataLoader(self.train_set, batch_size=self.batch_size, shuffle=True, num_workers=self.num_workers, persistent_workers=self.persistent_workers)
+         else:
+             return DataLoader(self.train_set, batch_size=self.batch_size, shuffle=True)
+
+     def val_dataloader(self):
+         if self.num_workers:
+             return DataLoader(self.val_set, batch_size=self.batch_size, num_workers=self.num_workers, persistent_workers=self.persistent_workers)
+         else:
+             return DataLoader(self.train_set, batch_size=self.batch_size, shuffle=False)
+
+     def test_dataloader(self):
+         if self.num_workers:
+             return DataLoader(self.test_set, batch_size=self.batch_size, num_workers=self.num_workers, persistent_workers=self.persistent_workers)
+         else:
+             return DataLoader(self.train_set, batch_size=self.batch_size, shuffle=False)
+
+     def predict_dataloader(self):
+         if not self.inference_mode:
+             raise RuntimeError("Only valid in inference mode")
+         return DataLoader(self.infer_dataset, batch_size=self.batch_size)
+
+     def compute_class_weights(self):
+         train_indices = self.train_set.indices # get the indices of the training set
+         y_all = self.train_set.dataset.y_tensor # get labels for the entire dataset (We are pulling from a Subset object, so this syntax can be confusing)
+         y_train = y_all[train_indices].cpu().numpy() # get the labels for the training set and move to a numpy array
+
+         class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
+         return torch.tensor(class_weights, dtype=torch.float32)
+
+     def inference_numpy(self):
+         """
+         Return inference data as numpy for use in sklearn inference.
+         """
+         if not self.inference_mode:
+             raise RuntimeError("Must be in inference_mode=True to use inference_numpy()")
+         X_np = self.infer_dataset.X_tensor.numpy()
+         return X_np
+
+     def to_numpy(self):
+         """
+         Move the AnnDataModule tensors into numpy arrays
+         """
+         if not self.inference_mode:
+             train_X, train_y = self.train_set.dataset.numpy(self.train_set.indices)
+             val_X, val_y = self.val_set.dataset.numpy(self.val_set.indices)
+             test_X, test_Y = self.test_set.dataset.numpy(self.test_set.indices)
+             return train_X, train_y, val_X, val_y, test_X, test_Y
+         else:
+             return self.inference_numpy()
+
+
+ def build_anndata_loader(
+     adata, tensor_source="X", tensor_key=None, label_col=None, train_frac=0.6, val_frac=0.1,
+     test_frac=0.3, random_seed=42, batch_size=64, lightning=True, inference_mode=False,
+     split_col="train_val_test_split", split_save_path=None, load_existing_split=False
+ ):
+     """
+     Unified pipeline for both Lightning and raw PyTorch.
+     The lightning loader works for both Lightning and the Sklearn wrapper.
+     Set lightning to False if you want to make data loaders for base PyTorch or base sklearn models
+     """
+     if lightning:
+         return AnnDataModule(
+             adata, tensor_source=tensor_source, tensor_key=tensor_key, label_col=label_col,
+             batch_size=batch_size, train_frac=train_frac, val_frac=val_frac, test_frac=test_frac,
+             random_seed=random_seed, inference_mode=inference_mode,
+             split_col=split_col, split_save_path=split_save_path, load_existing_split=load_existing_split
+         )
+     else:
+         var_names = adata.var_names.copy()
+         dataset = AnnDataDataset(adata, tensor_source, tensor_key, None if inference_mode else label_col)
+         if inference_mode:
+             return DataLoader(dataset, batch_size=batch_size)
+         else:
+             train_set, val_set, test_set = split_dataset(
+                 adata, dataset, train_frac, val_frac, test_frac, random_seed,
+                 split_col, split_save_path, load_existing_split
+             )
+             train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
+             val_loader = DataLoader(val_set, batch_size=batch_size)
+             test_loader = DataLoader(test_set, batch_size=batch_size)
+             return train_loader, val_loader, test_loader
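
A hedged sketch of driving the new data module end to end; the AnnData below is synthetic and the label column name is invented. build_anndata_loader with lightning=True returns an AnnDataModule, and setup() must run before the dataloaders or compute_class_weights() are available. Note that in the code above, val_dataloader() and test_dataloader() return a loader over self.train_set when num_workers is not set, so validation and test batches come from the training split unless num_workers is passed to AnnDataModule directly.

    import anndata as ad
    import numpy as np
    import pandas as pd
    from smftools.machine_learning.data import build_anndata_loader

    # Toy AnnData: 100 reads x 50 positions with a categorical label column (illustrative only).
    X = np.random.rand(100, 50).astype(np.float32)
    obs = pd.DataFrame({"activity_status": pd.Categorical(np.random.choice(["active", "silent"], 100))})
    adata = ad.AnnData(X=X, obs=obs)

    dm = build_anndata_loader(adata, tensor_source="X", label_col="activity_status", batch_size=16)
    dm.setup()
    train_loader = dm.train_dataloader()        # shuffled training batches
    class_weights = dm.compute_class_weights()  # balanced weights from the training split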
smftools/machine_learning/evaluation/__init__.py (new file)
@@ -0,0 +1,2 @@
+ from .evaluators import ModelEvaluator, PostInferenceModelEvaluator
+ from .eval_utils import flatten_sliding_window_results
smftools/machine_learning/evaluation/eval_utils.py (new file)
@@ -0,0 +1,31 @@
+ import pandas as pd
+
+ def flatten_sliding_window_results(results_dict):
+     """
+     Flatten nested sliding window results into pandas DataFrame.
+
+     Expects structure:
+         results[model_name][window_size][window_center]['metrics'][metric_name]
+     """
+     records = []
+
+     for model_name, model_results in results_dict.items():
+         for window_size, window_results in model_results.items():
+             for center_var, result in window_results.items():
+                 metrics = result['metrics']
+                 record = {
+                     'model': model_name,
+                     'window_size': window_size,
+                     'center_var': center_var
+                 }
+                 # Add all metrics
+                 record.update(metrics)
+                 records.append(record)
+
+     df = pd.DataFrame.from_records(records)
+
+     # Convert center_var to numeric if possible (optional but helpful for plotting)
+     df['center_var'] = pd.to_numeric(df['center_var'], errors='coerce')
+     df = df.sort_values(['model', 'window_size', 'center_var'])
+
+     return df
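
To illustrate the nesting flatten_sliding_window_results expects, a small synthetic example (the model name, window sizes, centers, and metric names are invented):

    from smftools.machine_learning.evaluation import flatten_sliding_window_results

    results = {
        "cnn": {
            51: {
                "1200": {"metrics": {"accuracy": 0.81, "auroc": 0.88}},
                "1250": {"metrics": {"accuracy": 0.79, "auroc": 0.86}},
            },
        },
    }
    df = flatten_sliding_window_results(results)
    # df has columns: model, window_size, center_var, accuracy, auroc,
    # with center_var coerced to numeric and rows sorted by model, window_size, center_var.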