smftools 0.1.3__py3-none-any.whl → 0.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +5 -1
- smftools/_version.py +1 -1
- smftools/informatics/__init__.py +2 -0
- smftools/informatics/archived/print_bam_query_seq.py +29 -0
- smftools/informatics/basecall_pod5s.py +80 -0
- smftools/informatics/conversion_smf.py +63 -10
- smftools/informatics/direct_smf.py +66 -18
- smftools/informatics/helpers/LoadExperimentConfig.py +1 -0
- smftools/informatics/helpers/__init__.py +16 -2
- smftools/informatics/helpers/align_and_sort_BAM.py +27 -16
- smftools/informatics/helpers/aligned_BAM_to_bed.py +49 -48
- smftools/informatics/helpers/bam_qc.py +66 -0
- smftools/informatics/helpers/binarize_converted_base_identities.py +69 -21
- smftools/informatics/helpers/canoncall.py +12 -3
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py +5 -4
- smftools/informatics/helpers/converted_BAM_to_adata.py +34 -22
- smftools/informatics/helpers/converted_BAM_to_adata_II.py +369 -0
- smftools/informatics/helpers/demux_and_index_BAM.py +52 -0
- smftools/informatics/helpers/extract_base_identities.py +33 -46
- smftools/informatics/helpers/extract_mods.py +55 -23
- smftools/informatics/helpers/extract_read_features_from_bam.py +31 -0
- smftools/informatics/helpers/extract_read_lengths_from_bed.py +25 -0
- smftools/informatics/helpers/find_conversion_sites.py +33 -44
- smftools/informatics/helpers/generate_converted_FASTA.py +87 -86
- smftools/informatics/helpers/modcall.py +13 -5
- smftools/informatics/helpers/modkit_extract_to_adata.py +762 -396
- smftools/informatics/helpers/ohe_batching.py +65 -41
- smftools/informatics/helpers/ohe_layers_decode.py +32 -0
- smftools/informatics/helpers/one_hot_decode.py +27 -0
- smftools/informatics/helpers/one_hot_encode.py +45 -9
- smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +1 -0
- smftools/informatics/helpers/run_multiqc.py +28 -0
- smftools/informatics/helpers/split_and_index_BAM.py +3 -8
- smftools/informatics/load_adata.py +58 -3
- smftools/plotting/__init__.py +15 -0
- smftools/plotting/classifiers.py +355 -0
- smftools/plotting/general_plotting.py +205 -0
- smftools/plotting/position_stats.py +462 -0
- smftools/preprocessing/__init__.py +6 -7
- smftools/preprocessing/append_C_context.py +22 -9
- smftools/preprocessing/{mark_duplicates.py → archives/mark_duplicates.py} +38 -26
- smftools/preprocessing/binarize_on_Youden.py +35 -32
- smftools/preprocessing/binary_layers_to_ohe.py +13 -3
- smftools/preprocessing/calculate_complexity.py +3 -2
- smftools/preprocessing/calculate_converted_read_methylation_stats.py +44 -46
- smftools/preprocessing/calculate_coverage.py +26 -25
- smftools/preprocessing/calculate_pairwise_differences.py +49 -0
- smftools/preprocessing/calculate_position_Youden.py +18 -7
- smftools/preprocessing/calculate_read_length_stats.py +39 -46
- smftools/preprocessing/clean_NaN.py +33 -25
- smftools/preprocessing/filter_adata_by_nan_proportion.py +31 -0
- smftools/preprocessing/filter_converted_reads_on_methylation.py +20 -5
- smftools/preprocessing/filter_reads_on_length.py +14 -4
- smftools/preprocessing/flag_duplicate_reads.py +149 -0
- smftools/preprocessing/invert_adata.py +18 -11
- smftools/preprocessing/load_sample_sheet.py +30 -16
- smftools/preprocessing/recipes.py +22 -20
- smftools/preprocessing/subsample_adata.py +58 -0
- smftools/readwrite.py +105 -13
- smftools/tools/__init__.py +49 -0
- smftools/tools/apply_hmm.py +202 -0
- smftools/tools/apply_hmm_batched.py +241 -0
- smftools/tools/archived/classify_methylated_features.py +66 -0
- smftools/tools/archived/classify_non_methylated_features.py +75 -0
- smftools/tools/archived/subset_adata_v1.py +32 -0
- smftools/tools/archived/subset_adata_v2.py +46 -0
- smftools/tools/calculate_distances.py +18 -0
- smftools/tools/calculate_umap.py +62 -0
- smftools/tools/call_hmm_peaks.py +105 -0
- smftools/tools/classifiers.py +787 -0
- smftools/tools/cluster_adata_on_methylation.py +105 -0
- smftools/tools/data/__init__.py +2 -0
- smftools/tools/data/anndata_data_module.py +90 -0
- smftools/tools/data/preprocessing.py +6 -0
- smftools/tools/display_hmm.py +18 -0
- smftools/tools/general_tools.py +69 -0
- smftools/tools/hmm_readwrite.py +16 -0
- smftools/tools/inference/__init__.py +1 -0
- smftools/tools/inference/lightning_inference.py +41 -0
- smftools/tools/models/__init__.py +9 -0
- smftools/tools/models/base.py +14 -0
- smftools/tools/models/cnn.py +34 -0
- smftools/tools/models/lightning_base.py +41 -0
- smftools/tools/models/mlp.py +17 -0
- smftools/tools/models/positional.py +17 -0
- smftools/tools/models/rnn.py +16 -0
- smftools/tools/models/sklearn_models.py +40 -0
- smftools/tools/models/transformer.py +133 -0
- smftools/tools/models/wrappers.py +20 -0
- smftools/tools/nucleosome_hmm_refinement.py +104 -0
- smftools/tools/position_stats.py +239 -0
- smftools/tools/read_stats.py +70 -0
- smftools/tools/subset_adata.py +19 -23
- smftools/tools/train_hmm.py +78 -0
- smftools/tools/training/__init__.py +1 -0
- smftools/tools/training/train_lightning_model.py +47 -0
- smftools/tools/utils/__init__.py +2 -0
- smftools/tools/utils/device.py +10 -0
- smftools/tools/utils/grl.py +14 -0
- {smftools-0.1.3.dist-info → smftools-0.1.7.dist-info}/METADATA +47 -11
- smftools-0.1.7.dist-info/RECORD +136 -0
- smftools/tools/apply_HMM.py +0 -1
- smftools/tools/read_HMM.py +0 -1
- smftools/tools/train_HMM.py +0 -43
- smftools-0.1.3.dist-info/RECORD +0 -84
- /smftools/preprocessing/{remove_duplicates.py → archives/remove_duplicates.py} +0 -0
- /smftools/tools/{cluster.py → evaluation/__init__.py} +0 -0
- {smftools-0.1.3.dist-info → smftools-0.1.7.dist-info}/WHEEL +0 -0
- {smftools-0.1.3.dist-info → smftools-0.1.7.dist-info}/licenses/LICENSE +0 -0
smftools/informatics/helpers/ohe_batching.py
CHANGED
@@ -1,52 +1,76 @@
-
+import os
+import anndata as ad
+import numpy as np
+import concurrent.futures
+from .one_hot_encode import one_hot_encode
 
-def
+def encode_sequence(args):
+    """Parallel helper function for one-hot encoding."""
+    read_name, seq, device = args
+    try:
+        one_hot_matrix = one_hot_encode(seq, device)
+        return read_name, one_hot_matrix
+    except Exception:
+        return None  # Skip invalid sequences
+
+def encode_and_save_batch(batch_data, tmp_dir, prefix, record, batch_number):
+    """Encodes a batch and writes to disk immediately."""
+    batch = {read_name: matrix for read_name, matrix in batch_data if matrix is not None}
+
+    if batch:
+        save_name = os.path.join(tmp_dir, f'tmp_{prefix}_{record}_{batch_number}.h5ad')
+        tmp_ad = ad.AnnData(X=np.zeros((1, 1)), uns=batch)  # Placeholder X
+        tmp_ad.write_h5ad(save_name)
+        return save_name
+    return None
+
+def ohe_batching(base_identities, tmp_dir, record, prefix='', batch_size=100000, progress_bar=None, device='auto', threads=None):
     """
-
+    Efficient version of ohe_batching: one-hot encodes sequences in parallel and writes batches immediately.
 
     Parameters:
-        base_identities (dict):
-        tmp_dir (str):
-        record (str):
-        prefix (str): Prefix
-        batch_size (int): Number of reads
+        base_identities (dict): Dictionary mapping read names to sequences.
+        tmp_dir (str): Directory for storing temporary files.
+        record (str): Record name.
+        prefix (str): Prefix for file naming.
+        batch_size (int): Number of reads per batch.
+        progress_bar (tqdm instance, optional): Shared progress bar.
+        device (str): Device for encoding.
+        threads (int, optional): Number of parallel workers.
 
     Returns:
-
+        list: List of valid H5AD file paths.
     """
-
-
-    import numpy as np
-    from tqdm import tqdm
-    from .one_hot_encode import one_hot_encode
-
-    batch = {}
-    count = 0
+    threads = threads or os.cpu_count()  # Default to max available CPU cores
+    batch_data = []
     batch_number = 0
-    total_reads = len(base_identities)
     file_names = []
-    [old lines 28-50: body of the previous implementation, not captured in this view]
+
+    # Step 1: Prepare Data for Parallel Encoding
+    encoding_args = [(read_name, seq, device) for read_name, seq in base_identities.items() if seq is not None]
+
+    # Step 2: Parallel One-Hot Encoding using threads (to avoid nested processes)
+    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
+        for result in executor.map(encode_sequence, encoding_args):
+            if result:
+                batch_data.append(result)
+
+                if len(batch_data) >= batch_size:
+                    # Step 3: Process and Write Batch Immediately
+                    file_name = encode_and_save_batch(batch_data.copy(), tmp_dir, prefix, record, batch_number)
+                    if file_name:
+                        file_names.append(file_name)
+
+                    batch_data.clear()
+                    batch_number += 1
+
+            if progress_bar:
+                progress_bar.update(1)
+
+    # Step 4: Process Remaining Batch
+    if batch_data:
+        file_name = encode_and_save_batch(batch_data, tmp_dir, prefix, record, batch_number)
+        if file_name:
+            file_names.append(file_name)
 
     return file_names
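For reference, a minimal usage sketch of the reworked helper; the read names, sequences, and directory below are invented for illustration, and the import path simply mirrors the file location above. Each temporary H5AD stores its batch of encodings in `.uns`, keyed by read name:

    import os
    import anndata as ad
    from tqdm import tqdm
    from smftools.informatics.helpers.ohe_batching import ohe_batching

    # Hypothetical inputs: read names mapped to basecalled sequences
    base_identities = {"read_0001": "ACGTN", "read_0002": "GGATC"}
    os.makedirs("tmp", exist_ok=True)

    with tqdm(total=len(base_identities)) as pbar:
        tmp_files = ohe_batching(base_identities, "tmp", record="chr1",
                                 prefix="demo", batch_size=1000,
                                 progress_bar=pbar, threads=2)

    # Inspect which reads landed in each temporary file
    for path in tmp_files:
        print(path, list(ad.read_h5ad(path).uns))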
smftools/informatics/helpers/ohe_layers_decode.py
ADDED
@@ -0,0 +1,32 @@
+# ohe_layers_decode
+
+def ohe_layers_decode(adata, obs_names):
+    """
+    Takes an anndata object and a list of observation names. Returns a list of sequence strings for the reads of interest.
+    Parameters:
+        adata (AnnData): An anndata object.
+        obs_names (list): A list of observation name strings to retrieve sequences for.
+
+    Returns:
+        sequences (list of str): List of strings of the one hot encoded array
+    """
+    import anndata as ad
+    import numpy as np
+    from .ohe_decode import ohe_decode
+
+    # Define the mapping of one-hot encoded indices to DNA bases
+    mapping = ['A', 'C', 'G', 'T', 'N']
+
+    ohe_layers = [f"{base}_binary_encoding" for base in mapping]
+    sequences = []
+
+    for obs_name in obs_names:
+        obs_subset = adata[obs_name]
+        ohe_list = []
+        for layer in ohe_layers:
+            ohe_list += list(obs_subset.layers[layer])
+        ohe_array = np.array(ohe_list)
+        sequence = ohe_decode(ohe_array)
+        sequences.append(sequence)
+
+    return sequences
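The decoder above assumes one binary reads-by-positions matrix per base, stored in layers named `A_binary_encoding` through `N_binary_encoding`. (Note it imports `.ohe_decode`, while the new helper file in the list above is `one_hot_decode.py`.) A sketch of that assumed layout on a toy AnnData, decoded here directly with NumPy rather than through the package:

    import anndata as ad
    import numpy as np

    mapping = ['A', 'C', 'G', 'T', 'N']
    seqs = ["ACGT", "TTGA"]  # two toy reads, four positions

    # One binary matrix per base: 1 where that read/position carries the base
    layers = {f"{b}_binary_encoding": np.array([[int(c == b) for c in s] for s in seqs])
              for b in mapping}
    adata = ad.AnnData(X=np.zeros((2, 4)), layers=layers)
    adata.obs_names = ["read_1", "read_2"]

    # Stack the five layers for one read and take the argmax over bases
    stacked = np.stack([adata["read_1"].layers[f"{b}_binary_encoding"][0] for b in mapping])
    print(''.join(mapping[i] for i in np.argmax(stacked, axis=0)))  # ACGT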
smftools/informatics/helpers/one_hot_decode.py
ADDED
@@ -0,0 +1,27 @@
+# one_hot_decode
+
+# String encodings
+def one_hot_decode(ohe_array):
+    """
+    Takes a flattened one hot encoded array and returns the sequence string from that array.
+    Parameters:
+        ohe_array (np.array): A one hot encoded array
+
+    Returns:
+        sequence (str): Sequence string of the one hot encoded array
+    """
+    import numpy as np
+    # Define the mapping of one-hot encoded indices to DNA bases
+    mapping = ['A', 'C', 'G', 'T', 'N']
+
+    # Reshape the flattened array into a 2D matrix with 5 columns (one for each base)
+    one_hot_matrix = ohe_array.reshape(-1, 5)
+
+    # Get the index of the maximum value (which will be 1) in each row
+    decoded_indices = np.argmax(one_hot_matrix, axis=1)
+
+    # Map the indices back to the corresponding bases
+    sequence_list = [mapping[i] for i in decoded_indices]
+    sequence = ''.join(sequence_list)
+
+    return sequence
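A quick check of the decoding convention: the flat vector is read in groups of five, one indicator per base in A/C/G/T/N order. The values below are hand-built for illustration:

    import numpy as np
    from smftools.informatics.helpers.one_hot_decode import one_hot_decode

    # Three positions, five indicators each: A at position 1, C at 2, N at 3
    flat = np.array([1, 0, 0, 0, 0,
                     0, 1, 0, 0, 0,
                     0, 0, 0, 0, 1])
    print(one_hot_decode(flat))  # -> 'ACN'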
smftools/informatics/helpers/one_hot_encode.py
CHANGED
@@ -1,21 +1,57 @@
 # one_hot_encode
 
-
-def one_hot_encode(sequence):
+def one_hot_encode(sequence, device='auto'):
     """
-    One
+    One-hot encodes a DNA sequence.
+
     Parameters:
-        sequence (list): A
+        sequence (str or list): DNA sequence (e.g., "ACGTN" or ['A', 'C', 'G', 'T', 'N']).
 
     Returns:
-
+        ndarray: Flattened one-hot encoded representation of the input sequence.
     """
     import numpy as np
 
-    seq_array = np.array(sequence, dtype='<U1') # String dtype
     mapping = np.array(['A', 'C', 'G', 'T', 'N'])
-
+
+    # Ensure input is a list of characters
+    if not isinstance(sequence, list):
+        sequence = list(sequence)  # Convert string to list of characters
+
+    # Handle empty sequences
+    if len(sequence) == 0:
+        print("Warning: Empty sequence encountered in one_hot_encode()")
+        return np.zeros(len(mapping))  # Return empty encoding instead of failing
+
+    # Convert sequence to NumPy array
+    seq_array = np.array(sequence, dtype='<U1')
+
+    # Replace invalid bases with 'N'
+    seq_array = np.where(np.isin(seq_array, mapping), seq_array, 'N')
+
+    # Create one-hot encoding matrix
     one_hot_matrix = (seq_array[:, None] == mapping).astype(int)
-    flattened = one_hot_matrix.flatten()
 
-    return
+    # Flatten and return
+    return one_hot_matrix.flatten()
+
+    # import torch
+    # bases = torch.tensor([ord('A'), ord('C'), ord('G'), ord('T'), ord('N')], dtype=torch.int8, device=device)
+
+    # # Convert input to tensor of character ASCII codes
+    # seq_tensor = torch.tensor([ord(c) for c in sequence], dtype=torch.int8, device=device)
+
+    # # Handle empty sequence
+    # if seq_tensor.numel() == 0:
+    #     print("Warning: Empty sequence encountered in one_hot_encode_torch()")
+    #     return torch.zeros(len(bases), device=device)
+
+    # # Replace invalid bases with 'N'
+    # is_valid = (seq_tensor[:, None] == bases)  # Compare each base with mapping
+    # seq_tensor = torch.where(is_valid.any(dim=1), seq_tensor, ord('N'))
+
+    # # Create one-hot encoding matrix
+    # one_hot_matrix = (seq_tensor[:, None] == bases).int()
+
+    # # Flatten and return
+    # return one_hot_matrix.flatten()
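Pairing the two helpers gives a round trip; note that in this version the `device` argument is accepted but unused by the active NumPy path (the torch variant is left commented out). A sketch, assuming both modules import from their file locations above:

    from smftools.informatics.helpers.one_hot_encode import one_hot_encode
    from smftools.informatics.helpers.one_hot_decode import one_hot_decode

    flat = one_hot_encode("ACGTX")  # the invalid base 'X' is coerced to 'N'
    print(flat.shape)               # (25,): 5 positions x 5 indicator columns
    print(one_hot_decode(flat))     # -> 'ACGTN'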
smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py
CHANGED
@@ -18,6 +18,7 @@ def plot_read_length_and_coverage_histograms(bed_file, plotting_directory):
 
     bed_basename = os.path.basename(bed_file).split('.bed')[0]
     # Load the BED file into a DataFrame
+    print(f"Loading BED to plot read length and coverage histograms: {bed_file}")
     df = pd.read_csv(bed_file, sep='\t', header=None, names=['chromosome', 'start', 'end', 'length', 'read_name'])
 
     # Group by chromosome
smftools/informatics/helpers/run_multiqc.py
ADDED
@@ -0,0 +1,28 @@
+def run_multiqc(input_dir, output_dir):
+    """
+    Runs MultiQC on a given directory and saves the report to the specified output directory.
+
+    Parameters:
+    - input_dir (str): Path to the directory containing QC reports (e.g., FastQC, Samtools, bcftools outputs).
+    - output_dir (str): Path to the directory where MultiQC reports should be saved.
+
+    Returns:
+    - None: The function executes MultiQC and prints the status.
+    """
+    import os
+    import subprocess
+    # Ensure the output directory exists
+    os.makedirs(output_dir, exist_ok=True)
+
+    # Construct MultiQC command
+    command = ["multiqc", input_dir, "-o", output_dir]
+
+    print(f"Running MultiQC on '{input_dir}' and saving results to '{output_dir}'...")
+
+    # Run MultiQC
+    try:
+        subprocess.run(command, check=True)
+        print(f"MultiQC report generated successfully in: {output_dir}")
+    except subprocess.CalledProcessError as e:
+        print(f"Error running MultiQC: {e}")
+
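Usage is a thin wrapper over the `multiqc` CLI, which must be on PATH; the directory names below are placeholders:

    from smftools.informatics.helpers.run_multiqc import run_multiqc

    # Aggregate FastQC/samtools/bcftools outputs under qc/ into one report
    run_multiqc("qc", "qc/multiqc")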
smftools/informatics/helpers/split_and_index_BAM.py
CHANGED
@@ -1,6 +1,6 @@
 ## split_and_index_BAM
 
-def split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, output_directory
+def split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, output_directory):
     """
     A wrapper function for splitting BAMS and indexing them.
     Parameters:
@@ -8,7 +8,6 @@ def split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, output_direct
         split_dir (str): A string representing the file path to the directory to split the BAMs into.
         bam_suffix (str): A suffix to add to the bam file.
         output_directory (str): A file path to the directory to output all the analyses.
-        fasta (str): File path to the reference genome to align to.
 
     Returns:
         None
@@ -19,8 +18,6 @@ def split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, output_direct
     import subprocess
     import glob
     from .separate_bam_by_bc import separate_bam_by_bc
-    from .aligned_BAM_to_bed import aligned_BAM_to_bed
-    from .extract_readnames_from_BAM import extract_readnames_from_BAM
     from .make_dirs import make_dirs
 
     plotting_dir = os.path.join(output_directory, 'demultiplexed_bed_histograms')
@@ -35,7 +32,5 @@ def split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, output_direct
     bam_files = [bam for bam in bam_files if '.bai' not in bam]
     for input_file in bam_files:
         subprocess.run(["samtools", "index", input_file])
-
-
-        # Make a text file of reads for the BAM
-        extract_readnames_from_BAM(input_file)
+
+    return bam_files
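With this change the wrapper now returns the demultiplexed BAM paths (its docstring still says None), so callers can chain on the result. A sketch with placeholder paths, assuming samtools is on PATH:

    from smftools.informatics.helpers.split_and_index_BAM import split_and_index_BAM

    bam_files = split_and_index_BAM("aligned.sorted.bam", "split_bams",
                                    ".bam", "analysis")
    print(bam_files)  # per-barcode BAMs, each now indexed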
smftools/informatics/load_adata.py
CHANGED
@@ -14,11 +14,12 @@ def load_adata(config_path):
         None
     """
     # Lazy importing of packages
-    from .helpers import LoadExperimentConfig, make_dirs, concatenate_fastqs_to_bam
+    from .helpers import LoadExperimentConfig, make_dirs, concatenate_fastqs_to_bam, extract_read_features_from_bam
     from .fast5_to_pod5 import fast5_to_pod5
     from .subsample_fasta_from_bed import subsample_fasta_from_bed
     import os
     import numpy as np
+    import anndata as ad
     from pathlib import Path
 
     # Default params
@@ -42,8 +43,13 @@ def load_adata(config_path):
     fasta_regions_of_interest = var_dict.get("fasta_regions_of_interest", default_value) # Path to a bed file listing coordinate regions of interest within the FASTA to include. Optional.
     mapping_threshold = var_dict.get('mapping_threshold', default_value) # Minimum proportion of mapped reads that need to fall within a region to include in the final AnnData.
     experiment_name = var_dict.get('experiment_name', default_value) # A key term to add to the AnnData file name.
+    model_dir = var_dict.get('model_dir', default_value) # needed for dorado basecaller
     model = var_dict.get('model', default_value) # needed for dorado basecaller
     barcode_kit = var_dict.get('barcode_kit', default_value) # needed for dorado basecaller
+    barcode_both_ends = var_dict.get('barcode_both_ends', default_value) # dorado demultiplexing
+    trim = var_dict.get('trim', default_value) # dorado adapter and barcode removal
+    input_already_demuxed = var_dict.get('input_already_demuxed', default_value) # If the input files are already demultiplexed.
+    threads = var_dict.get('threads', default_value) # number of cpu threads available for multiprocessing
     # Conversion specific variable init
     conversion_types = var_dict.get('conversion_types', default_value)
     # Direct methylation specific variable init
@@ -54,6 +60,10 @@ def load_adata(config_path):
     thresholds = [filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold]
     mod_list = var_dict.get('mod_list', default_value)
     batch_size = var_dict.get('batch_size', default_value)
+    device = var_dict.get('device', 'auto')
+    make_bigwigs = var_dict.get('make_bigwigs', default_value)
+    skip_unclassified = var_dict.get('skip_unclassified', True)
+    delete_batch_hdfs = var_dict.get('delete_batch_hdfs', True)
 
     # Make initial output directory
     make_dirs([output_directory])
@@ -119,9 +129,54 @@ def load_adata(config_path):
 
     if smf_modality == 'conversion':
         from .conversion_smf import conversion_smf
-        conversion_smf(fasta, output_directory, conversions, strands, model, input_data_path, split_path
+        final_adata, final_adata_path, sorted_output, bam_files = conversion_smf(fasta, output_directory, conversions, strands, model_dir, model, input_data_path, split_path
+        , barcode_kit, mapping_threshold, experiment_name, bam_suffix, basecall, barcode_both_ends, trim, device, make_bigwigs, threads, input_already_demuxed)
     elif smf_modality == 'direct':
         from .direct_smf import direct_smf
-
+        # need to add input_already_demuxed workflow here.
+        final_adata, final_adata_path, sorted_output, bam_files = direct_smf(fasta, output_directory, mod_list,model_dir, model, thresholds, input_data_path, split_path
+        , barcode_kit, mapping_threshold, experiment_name, bam_suffix, batch_size, basecall, barcode_both_ends, trim, device, make_bigwigs, skip_unclassified, delete_batch_hdfs, threads)
     else:
         print("Error")
+
+    # Read in the final adata object and append final metadata
+    #print(f'Reading in adata from {final_adata_path} to add final metadata')
+    # final_adata = ad.read_h5ad(final_adata_path)
+
+    # Adding read query length metadata to adata object.
+    read_metrics = {}
+    for bam_file in bam_files:
+        bam_read_metrics = extract_read_features_from_bam(bam_file)
+        read_metrics.update(bam_read_metrics)
+    #read_metrics = extract_read_features_from_bam(sorted_output)
+
+    query_read_length_values = []
+    query_read_quality_values = []
+    reference_lengths = []
+    # Iterate over each row of the AnnData object
+    for obs_name in final_adata.obs_names:
+        # Fetch the value from the dictionary using the obs_name as the key
+        value = read_metrics.get(obs_name, np.nan) # Use np.nan if the key is not found
+        if type(value) is list:
+            query_read_length_values.append(value[0])
+            query_read_quality_values.append(value[1])
+            reference_lengths.append(value[2])
+        else:
+            query_read_length_values.append(value)
+            query_read_quality_values.append(value)
+            reference_lengths.append(value)
+
+    # Add the new column to adata.obs
+    final_adata.obs['query_read_length'] = query_read_length_values
+    final_adata.obs['query_read_quality'] = query_read_quality_values
+    final_adata.obs['query_length_to_reference_length_ratio'] = np.array(query_read_length_values) / np.array(reference_lengths)
+
+    final_adata.obs['Raw_methylation_signal'] = np.nansum(final_adata.X, axis=1)
+    final_adata.obs['Raw_per_base_methylation_average'] = final_adata.obs['Raw_methylation_signal'] / final_adata.obs['query_read_length']
+
+    print('Saving final adata')
+    if ".gz" in final_adata_path:
+        final_adata.write_h5ad(f"{final_adata_path}", compression='gzip')
+    else:
+        final_adata.write_h5ad(f"{final_adata_path}.gz", compression='gzip')
+    print('Final adata saved')
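After `load_adata` finishes, the saved AnnData carries the new per-read metadata appended above. A sketch of inspecting it (the file path is illustrative):

    import anndata as ad

    adata = ad.read_h5ad("outputs/experiment_final.h5ad.gz")
    print(adata.obs[["query_read_length",
                     "query_read_quality",
                     "query_length_to_reference_length_ratio",
                     "Raw_methylation_signal",
                     "Raw_per_base_methylation_average"]].head())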
smftools/plotting/__init__.py
CHANGED
@@ -0,0 +1,15 @@
+from .position_stats import plot_bar_relative_risk, plot_volcano_relative_risk, plot_positionwise_matrix, plot_positionwise_matrix_grid
+from .general_plotting import combined_hmm_raw_clustermap
+from .classifiers import plot_model_performance, plot_feature_importances_or_saliency, plot_model_curves_from_adata, plot_model_curves_from_adata_with_frequency_grid
+
+__all__ = [
+    "combined_hmm_raw_clustermap",
+    "plot_bar_relative_risk",
+    "plot_positionwise_matrix",
+    "plot_positionwise_matrix_grid",
+    "plot_volcano_relative_risk",
+    "plot_feature_importances_or_saliency",
+    "plot_model_performance",
+    "plot_model_curves_from_adata",
+    "plot_model_curves_from_adata_with_frequency_grid"
+]
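The new `__init__.py` re-exports the plotting helpers at the package level, so they import directly:

    from smftools.plotting import (
        combined_hmm_raw_clustermap,
        plot_bar_relative_risk,
        plot_volcano_relative_risk,
    )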