smftools 0.1.3__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109)
  1. smftools/__init__.py +5 -1
  2. smftools/_version.py +1 -1
  3. smftools/informatics/__init__.py +2 -0
  4. smftools/informatics/archived/print_bam_query_seq.py +29 -0
  5. smftools/informatics/basecall_pod5s.py +80 -0
  6. smftools/informatics/conversion_smf.py +63 -10
  7. smftools/informatics/direct_smf.py +66 -18
  8. smftools/informatics/helpers/LoadExperimentConfig.py +1 -0
  9. smftools/informatics/helpers/__init__.py +16 -2
  10. smftools/informatics/helpers/align_and_sort_BAM.py +27 -16
  11. smftools/informatics/helpers/aligned_BAM_to_bed.py +49 -48
  12. smftools/informatics/helpers/bam_qc.py +66 -0
  13. smftools/informatics/helpers/binarize_converted_base_identities.py +69 -21
  14. smftools/informatics/helpers/canoncall.py +12 -3
  15. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +5 -4
  16. smftools/informatics/helpers/converted_BAM_to_adata.py +34 -22
  17. smftools/informatics/helpers/converted_BAM_to_adata_II.py +369 -0
  18. smftools/informatics/helpers/demux_and_index_BAM.py +52 -0
  19. smftools/informatics/helpers/extract_base_identities.py +33 -46
  20. smftools/informatics/helpers/extract_mods.py +55 -23
  21. smftools/informatics/helpers/extract_read_features_from_bam.py +31 -0
  22. smftools/informatics/helpers/extract_read_lengths_from_bed.py +25 -0
  23. smftools/informatics/helpers/find_conversion_sites.py +33 -44
  24. smftools/informatics/helpers/generate_converted_FASTA.py +87 -86
  25. smftools/informatics/helpers/modcall.py +13 -5
  26. smftools/informatics/helpers/modkit_extract_to_adata.py +762 -396
  27. smftools/informatics/helpers/ohe_batching.py +65 -41
  28. smftools/informatics/helpers/ohe_layers_decode.py +32 -0
  29. smftools/informatics/helpers/one_hot_decode.py +27 -0
  30. smftools/informatics/helpers/one_hot_encode.py +45 -9
  31. smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +1 -0
  32. smftools/informatics/helpers/run_multiqc.py +28 -0
  33. smftools/informatics/helpers/split_and_index_BAM.py +3 -8
  34. smftools/informatics/load_adata.py +58 -3
  35. smftools/plotting/__init__.py +15 -0
  36. smftools/plotting/classifiers.py +355 -0
  37. smftools/plotting/general_plotting.py +205 -0
  38. smftools/plotting/position_stats.py +462 -0
  39. smftools/preprocessing/__init__.py +6 -7
  40. smftools/preprocessing/append_C_context.py +22 -9
  41. smftools/preprocessing/{mark_duplicates.py → archives/mark_duplicates.py} +38 -26
  42. smftools/preprocessing/binarize_on_Youden.py +35 -32
  43. smftools/preprocessing/binary_layers_to_ohe.py +13 -3
  44. smftools/preprocessing/calculate_complexity.py +3 -2
  45. smftools/preprocessing/calculate_converted_read_methylation_stats.py +44 -46
  46. smftools/preprocessing/calculate_coverage.py +26 -25
  47. smftools/preprocessing/calculate_pairwise_differences.py +49 -0
  48. smftools/preprocessing/calculate_position_Youden.py +18 -7
  49. smftools/preprocessing/calculate_read_length_stats.py +39 -46
  50. smftools/preprocessing/clean_NaN.py +33 -25
  51. smftools/preprocessing/filter_adata_by_nan_proportion.py +31 -0
  52. smftools/preprocessing/filter_converted_reads_on_methylation.py +20 -5
  53. smftools/preprocessing/filter_reads_on_length.py +14 -4
  54. smftools/preprocessing/flag_duplicate_reads.py +149 -0
  55. smftools/preprocessing/invert_adata.py +18 -11
  56. smftools/preprocessing/load_sample_sheet.py +30 -16
  57. smftools/preprocessing/recipes.py +22 -20
  58. smftools/preprocessing/subsample_adata.py +58 -0
  59. smftools/readwrite.py +105 -13
  60. smftools/tools/__init__.py +49 -0
  61. smftools/tools/apply_hmm.py +202 -0
  62. smftools/tools/apply_hmm_batched.py +241 -0
  63. smftools/tools/archived/classify_methylated_features.py +66 -0
  64. smftools/tools/archived/classify_non_methylated_features.py +75 -0
  65. smftools/tools/archived/subset_adata_v1.py +32 -0
  66. smftools/tools/archived/subset_adata_v2.py +46 -0
  67. smftools/tools/calculate_distances.py +18 -0
  68. smftools/tools/calculate_umap.py +62 -0
  69. smftools/tools/call_hmm_peaks.py +105 -0
  70. smftools/tools/classifiers.py +787 -0
  71. smftools/tools/cluster_adata_on_methylation.py +105 -0
  72. smftools/tools/data/__init__.py +2 -0
  73. smftools/tools/data/anndata_data_module.py +90 -0
  74. smftools/tools/data/preprocessing.py +6 -0
  75. smftools/tools/display_hmm.py +18 -0
  76. smftools/tools/general_tools.py +69 -0
  77. smftools/tools/hmm_readwrite.py +16 -0
  78. smftools/tools/inference/__init__.py +1 -0
  79. smftools/tools/inference/lightning_inference.py +41 -0
  80. smftools/tools/models/__init__.py +9 -0
  81. smftools/tools/models/base.py +14 -0
  82. smftools/tools/models/cnn.py +34 -0
  83. smftools/tools/models/lightning_base.py +41 -0
  84. smftools/tools/models/mlp.py +17 -0
  85. smftools/tools/models/positional.py +17 -0
  86. smftools/tools/models/rnn.py +16 -0
  87. smftools/tools/models/sklearn_models.py +40 -0
  88. smftools/tools/models/transformer.py +133 -0
  89. smftools/tools/models/wrappers.py +20 -0
  90. smftools/tools/nucleosome_hmm_refinement.py +104 -0
  91. smftools/tools/position_stats.py +239 -0
  92. smftools/tools/read_stats.py +70 -0
  93. smftools/tools/subset_adata.py +19 -23
  94. smftools/tools/train_hmm.py +78 -0
  95. smftools/tools/training/__init__.py +1 -0
  96. smftools/tools/training/train_lightning_model.py +47 -0
  97. smftools/tools/utils/__init__.py +2 -0
  98. smftools/tools/utils/device.py +10 -0
  99. smftools/tools/utils/grl.py +14 -0
  100. {smftools-0.1.3.dist-info → smftools-0.1.7.dist-info}/METADATA +47 -11
  101. smftools-0.1.7.dist-info/RECORD +136 -0
  102. smftools/tools/apply_HMM.py +0 -1
  103. smftools/tools/read_HMM.py +0 -1
  104. smftools/tools/train_HMM.py +0 -43
  105. smftools-0.1.3.dist-info/RECORD +0 -84
  106. /smftools/preprocessing/{remove_duplicates.py → archives/remove_duplicates.py} +0 -0
  107. /smftools/tools/{cluster.py → evaluation/__init__.py} +0 -0
  108. {smftools-0.1.3.dist-info → smftools-0.1.7.dist-info}/WHEEL +0 -0
  109. {smftools-0.1.3.dist-info → smftools-0.1.7.dist-info}/licenses/LICENSE +0 -0
smftools/preprocessing/{mark_duplicates.py → archives/mark_duplicates.py}
@@ -1,6 +1,6 @@
  ## mark_duplicates
 
- def mark_duplicates(adata, layers, obs_column='Reference', sample_col='Sample_names', hamming_distance_thresholds={}):
+ def mark_duplicates(adata, layers, obs_column='Reference', sample_col='Sample_names', method='N_masked_distances', distance_thresholds={}):
  """
  Marks duplicates in the adata object.
 
@@ -8,8 +8,9 @@ def mark_duplicates(adata, layers, obs_column='Reference', sample_col='Sample_na
  adata (AnnData): An adata object.
  layers (list): A list of strings representing the layers to use.
  obs_column (str): A string representing the obs column name to first subset on. Default is 'Reference'.
- sample_col (str):L A string representing the obs column name to second subset on. Default is 'Sample_names'.
- hamming_distance_thresholds (dict): A dictionary keyed by obs_column categories that points to a float corresponding to the distance threshold to apply. Default is an empty dict.
+ sample_col (str): A string representing the obs column name to second subset on. Default is 'Sample_names'.
+ method (str): method to use for calculating the distance metric
+ distance_thresholds (dict): A dictionary keyed by obs_column categories that points to a float corresponding to the distance threshold to apply. Default is an empty dict.
 
  Returns:
  None
@@ -21,7 +22,7 @@ def mark_duplicates(adata, layers, obs_column='Reference', sample_col='Sample_na
  from scipy.signal import find_peaks
  import networkx as nx
  from .binary_layers_to_ohe import binary_layers_to_ohe
- from .calculate_pairwise_hamming_distances import calculate_pairwise_hamming_distances
+ from .calculate_pairwise_differences import calculate_pairwise_differences
  from .min_non_diagonal import min_non_diagonal
 
  categories = adata.obs[obs_column].cat.categories
@@ -29,49 +30,59 @@ def mark_duplicates(adata, layers, obs_column='Reference', sample_col='Sample_na
 
  # Calculate the pairwise Hamming distances within each reference/sample set. Determine distance thresholds for each reference/sample pair
  adata.obs['Nearest_neighbor_Hamming_distance'] = pd.Series(np.nan, index=adata.obs_names, dtype=float)
+ cat_sample_dict = {}
  for cat in categories:
  cat_subset = adata[adata.obs[obs_column] == cat].copy()
  for sample in sample_names:
  sample_subset = cat_subset[cat_subset.obs[sample_col] == sample].copy()
+ sample_subset = sample_subset[:, sample_subset.var[f'{cat}_any_C_site'] == True].copy() # only uses C sites from the converted strand
  # Encode sequencing reads as a one-hot-encodings
- adata.uns[f'{cat}_{sample}_read_OHE_dict'] = binary_layers_to_ohe(sample_subset, layers, stack='hstack')
+ print(f'One-hot encoding reads from {sample} on {cat}')
+ cat_sample_dict[f'{cat}_{sample}_read_OHE_dict'] = binary_layers_to_ohe(sample_subset, layers, stack='hstack')
  # Unpack the read names and one hot encodings into lists
  read_names = []
  ohe_list = []
- for read_name, ohe in adata.uns[f'{cat}_{sample}_read_OHE_dict'].items():
+ for read_name, ohe in cat_sample_dict[f'{cat}_{sample}_read_OHE_dict'].items():
  read_names.append(read_name)
  ohe_list.append(ohe)
  # Calculate the pairwise hamming distances
- print(f'Calculating hamming distances for {sample} on {cat} allele')
- distance_matrix = calculate_pairwise_hamming_distances(ohe_list)
+ if method == 'N_masked_distances':
+ print(f'Calculating N_masked_distances for {sample} on {cat} allele')
+ distance_matrix = calculate_pairwise_differences(ohe_list)
+ else:
+ print(f'{method} for calculating differences is not available')
  n_reads = distance_matrix.shape[0]
  # Load the hamming matrix into a dataframe with index and column names as the read_names
  distance_df = pd.DataFrame(distance_matrix, index=read_names, columns=read_names)
- # Save the distance dataframe into an unstructured component of the adata object
- adata.uns[f'Pairwise_Hamming_distance_within_{cat}_{sample}'] = distance_df
+ cat_sample_dict[f'Pairwise_Hamming_distance_within_{cat}_{sample}'] = distance_df
+
  if n_reads > 1:
  # Calculate the minimum non-self distance for every read in the reference and sample
  min_distance_values = min_non_diagonal(distance_matrix)
  min_distance_df = pd.DataFrame({'Nearest_neighbor_Hamming_distance': min_distance_values}, index=read_names)
  adata.obs.update(min_distance_df)
- # Generate a histogram of minimum non-self distances for each read
- if n_reads > 3:
- n_bins = n_reads // 4
- else:
- n_bins = 1
- min_distance_bins = plt.hist(min_distance_values, bins=n_bins)
- if cat in hamming_distance_thresholds:
- adata.uns[f'Hamming_distance_threshold_for_{cat}_{sample}'] = hamming_distance_thresholds[cat]
+
+ if cat in distance_thresholds:
+ adata.uns[f'Hamming_distance_threshold_for_{cat}_{sample}'] = distance_thresholds[cat]
  else: # eventually this should be written to use known PCR duplicate controls for thresholding.
+ # Generate a histogram of minimum non-self distances for each read
+ if n_reads > 3:
+ n_bins = n_reads // 4
+ else:
+ n_bins = 1
+ min_distance_bins = plt.hist(min_distance_values, bins=n_bins)
  # Normalize the max value in any histogram bin to 1
  normalized_min_distance_counts = min_distance_bins[0] / np.max(min_distance_bins[0])
  # Extract the bin index of peak centers in the histogram
- peak_centers, _ = find_peaks(normalized_min_distance_counts, prominence=0.2, distance=5)
- first_peak_index = peak_centers[0]
- offset_index = first_peak_index-1
- # Use the distance corresponding to the first peak as the threshold distance in graph construction
- first_peak_distance = min_distance_bins[1][first_peak_index]
- offset_distance = min_distance_bins[1][offset_index]
+ try:
+ peak_centers, _ = find_peaks(normalized_min_distance_counts, prominence=0.2, distance=5)
+ first_peak_index = peak_centers[0]
+ offset_index = first_peak_index-1
+ # Use the distance corresponding to the first peak as the threshold distance in graph construction
+ first_peak_distance = min_distance_bins[1][first_peak_index]
+ offset_distance = min_distance_bins[1][offset_index]
+ except:
+ offset_distance = normalized_min_distance_counts[0]
  adata.uns[f'Hamming_distance_threshold_for_{cat}_{sample}'] = offset_distance
  else:
  adata.uns[f'Hamming_distance_threshold_for_{cat}_{sample}'] = 0
@@ -83,7 +94,7 @@ def mark_duplicates(adata, layers, obs_column='Reference', sample_col='Sample_na
 
  for cat in categories:
  for sample in sample_names:
- distance_df = adata.uns[f'Pairwise_Hamming_distance_within_{cat}_{sample}']
+ distance_df = cat_sample_dict[f'Pairwise_Hamming_distance_within_{cat}_{sample}']
  read_names = distance_df.index
  distance_matrix = distance_df.values
  n_reads = distance_matrix.shape[0]
@@ -106,7 +117,8 @@ def mark_duplicates(adata, layers, obs_column='Reference', sample_col='Sample_na
  fraction_unique = cluster_count / n_reads
  else:
  fraction_unique = 0
- adata.uns[f'Hamming_distance_clusters_within_{cat}_{sample}'] = [cluster_count, n_reads, fraction_unique, clusters]
+ adata.uns[f'Hamming_distance_cluster_count_within_{cat}_{sample}'] = cluster_count
+ adata.uns[f'total_reads_within_{cat}_{sample}'] = n_reads
  # Update the adata object
  read_cluster_map = {}
  read_duplicate_map = {}
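For orientation, a minimal usage sketch of the updated mark_duplicates signature (the input file, layer names, and threshold value below are illustrative assumptions, not taken from the package):

import anndata as ad

adata = ad.read_h5ad("experiment.h5ad")  # hypothetical input
binary_layers = ["A_binary_encoding", "C_binary_encoding", "G_binary_encoding",
                 "T_binary_encoding", "N_binary_encoding"]  # assumed layer names
mark_duplicates(
    adata,
    layers=binary_layers,
    obs_column="Reference",
    sample_col="Sample_names",
    method="N_masked_distances",                 # only method handled in the diff above
    distance_thresholds={"my_amplicon": 0.05},   # hypothetical per-reference threshold
)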
smftools/preprocessing/binarize_on_Youden.py
@@ -1,42 +1,45 @@
- ## binarize_on_Youden
-
  def binarize_on_Youden(adata, obs_column='Reference'):
  """
- Add a new layer to the adata object that has binarized SMF values based on the position thresholds determined by calculate_position_Youden
+ Binarize SMF values based on position thresholds determined by calculate_position_Youden.
 
  Parameters:
- adata (AnnData): The anndata object to binarize. pp.calculate_position_Youden function has to be run first.
- obs_column (str): The obs_column to stratify on. Needs to be the same as passed in pp.calculate_position_Youden.
- Input: adata object that has had calculate_position_Youden called on it.
- Output:
+ adata (AnnData): The anndata object to binarize. `calculate_position_Youden` must have been run first.
+ obs_column (str): The obs column to stratify on. Needs to match what was passed in `calculate_position_Youden`.
+
+ Modifies:
+ Adds a new layer to `adata.layers['binarized_methylation']` containing the binarized methylation matrix.
  """
  import numpy as np
- import anndata as ad
- temp_adata = None
- categories = adata.obs[obs_column].cat.categories
+ import anndata as ad
+
+ # Initialize an empty matrix to store the binarized methylation values
+ binarized_methylation = np.full_like(adata.X, np.nan, dtype=float) # Keeps same shape as adata.X
+
+ # Get unique categories
+ categories = adata.obs[obs_column].cat.categories
+
  for cat in categories:
- # Get the category subset
- cat_subset = adata[adata.obs[obs_column] == cat].copy()
- # extract the probability matrix for the category subset
- original_matrix = cat_subset.X
- # extract the learned methylation call thresholds for each position in the category.
- thresholds = [cat_subset.var[f'{cat}_position_methylation_thresholding_Youden_stats'][i][0] for i in range(cat_subset.shape[1])]
- # In the original matrix, get all positions that are nan values
+ # Select subset for this category
+ cat_mask = adata.obs[obs_column] == cat
+ cat_subset = adata[cat_mask]
+
+ # Extract the probability matrix
+ original_matrix = cat_subset.X.copy()
+
+ # Extract the thresholds for each position efficiently
+ thresholds = np.array(cat_subset.var[f'{cat}_position_methylation_thresholding_Youden_stats'].apply(lambda x: x[0]))
+
+ # Identify NaN values
  nan_mask = np.isnan(original_matrix)
- # Binarize the matrix on the new thresholds
+
+ # Binarize based on threshold
  binarized_matrix = (original_matrix > thresholds).astype(float)
- # At the original positions that had nan values, replace the values with nans again
+
+ # Restore NaN values
  binarized_matrix[nan_mask] = np.nan
- # Make a new layer for the reference that contains the binarized methylation calls
- cat_subset.layers['binarized_methylation'] = binarized_matrix
- if temp_adata:
- # If temp_data already exists, concatenate
- temp_adata = ad.concat([temp_adata, cat_subset], join='outer', index_unique=None).copy()
- else:
- # If temp_adata is still None, initialize temp_adata with reference_subset
- temp_adata = cat_subset.copy()
-
- # Sort the temp adata on the index names of the primary adata
- temp_adata = temp_adata[adata.obs_names].copy()
- # Pull back the new binarized layers into the original adata object
- adata.layers['binarized_methylation'] = temp_adata.layers['binarized_methylation']
+
+ # Assign the binarized values back into the preallocated storage
+ binarized_methylation[cat_mask, :] = binarized_matrix
+
+ # Store the binarized matrix in a new layer
+ adata.layers['binarized_methylation'] = binarized_methylation
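A minimal sketch of how binarize_on_Youden is expected to be used together with calculate_position_Youden (the control sample names here are assumptions):

calculate_position_Youden(adata, positive_control_sample="plus_MTase",
                          negative_control_sample="minus_MTase",
                          J_threshold=0.5, obs_column="Reference")
binarize_on_Youden(adata, obs_column="Reference")
binary_calls = adata.layers["binarized_methylation"]  # NaN wherever the input probability was NaN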
smftools/preprocessing/binary_layers_to_ohe.py
@@ -1,11 +1,11 @@
  ## binary_layers_to_ohe
 
  ## Conversion SMF Specific
- def binary_layers_to_ohe(adata, layers, stack='hstack'):
+ def binary_layers_to_ohe(adata, binary_layers, stack='hstack'):
  """
  Parameters:
  adata (AnnData): Anndata object.
- layers (list): a list of strings. Each string represents a layer in the adata object. The layer should encode a binary matrix
+ binary_layers (list): a list of strings. Each string represents a layer in the adata object. The layer should encode a binary matrix.
  stack (str): Dimension to stack the one-hot-encoding. Options include 'hstack' and 'vstack'. Default is 'hstack', since this is more efficient.
 
  Returns:
@@ -14,8 +14,18 @@ def binary_layers_to_ohe(adata, layers, stack='hstack'):
  """
  import numpy as np
  import anndata as ad
+
+ # Ensure that the N layer is last!
+ # Grab all binary layers that are not encoding N
+ ACGT_binary_layers = [layer for layer in binary_layers if 'binary' in layer and layer != 'N_binary_encoding']
+ # If there is a binary layer encoding N, hold it in N_binary_layer
+ N_binary_layer = [layer for layer in binary_layers if layer == 'N_binary_encoding']
+ # Add the N_binary_encoding layer to the end of the list of binary layers
+ all_binary_layers = ACGT_binary_layers + N_binary_layer
+ print(f'Found {all_binary_layers} layers in adata')
+
  # Extract the layers
- layers = [adata.layers[layer_name] for layer_name in layers]
+ layers = [adata.layers[layer_name] for layer_name in all_binary_layers]
  n_reads = layers[0].shape[0]
  ohe_dict = {}
  for i in range(n_reads):
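The layer-ordering rule added above can be checked in isolation; a small sketch with assumed layer names:

binary_layers = ["N_binary_encoding", "A_binary_encoding", "C_binary_encoding"]
ACGT = [l for l in binary_layers if "binary" in l and l != "N_binary_encoding"]
N = [l for l in binary_layers if l == "N_binary_encoding"]
print(ACGT + N)  # ['A_binary_encoding', 'C_binary_encoding', 'N_binary_encoding']; N is always last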
smftools/preprocessing/calculate_complexity.py
@@ -32,7 +32,8 @@ def calculate_complexity(adata, output_directory='', obs_column='Reference', sam
 
  for cat in categories:
  for sample in sample_names:
- unique_reads, total_reads = adata.uns[f'Hamming_distance_clusters_within_{cat}_{sample}'][0:2]
+ unique_reads = adata.uns[f'Hamming_distance_cluster_count_within_{cat}_{sample}']
+ total_reads = adata.uns[f'total_reads_within_{cat}_{sample}']
  reads = np.concatenate((np.arange(unique_reads), np.random.choice(unique_reads, total_reads - unique_reads, replace=True)))
  # Subsampling depths
  subsampling_depths = [total_reads // (i+1) for i in range(10)]
@@ -49,7 +50,7 @@
  # Generate data for the complexity curve
  x_data = np.linspace(0, 5000, 100)
  y_data = lander_waterman(x_data, *popt)
- adata.uns[f'Library_complexity_{sample}_on_{cat}'] = popt[0]
+ adata.uns[f'Library_complexity_of_{sample}_on_{cat}'] = popt[0]
  if plot:
  import matplotlib.pyplot as plt
  # Plot the complexity curve
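The hunk above fits a Lander-Waterman curve and stores popt[0] as the library complexity. The exact functional form used by smftools is defined elsewhere in the module; a common parametrization, shown here only for orientation, is:

import numpy as np

def lander_waterman(x, c0):
    # Expected number of distinct molecules observed after drawing x reads
    # from a library containing c0 unique molecules (assumed form, not copied from smftools).
    return c0 * (1 - np.exp(-x / c0))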
smftools/preprocessing/calculate_converted_read_methylation_stats.py
@@ -3,7 +3,7 @@
  ## Conversion SMF Specific
  # Read methylation QC
 
- def calculate_converted_read_methylation_stats(adata, reference_column, sample_names_col, output_directory, show_methylation_histogram=False, save_methylation_histogram=False):
+ def calculate_converted_read_methylation_stats(adata, reference_column, sample_names_col):
  """
  Adds methylation statistics for each read. Indicates whether the read GpC methylation exceeded other_C methylation (background false positives).
 
@@ -11,9 +11,6 @@ def calculate_converted_read_methylation_stats(adata, reference_column, sample_n
  adata (AnnData): An adata object
  reference_column (str): String representing the name of the Reference column to use
  sample_names_col (str): String representing the name of the sample name column to use
- output_directory (str): String representing the output directory to make and write out the histograms.
- show_methylation_histogram (bool): Whether to display the histograms.
- save_methylation_histogram (bool): Whether to save the histograms.
 
  Returns:
  None
@@ -21,8 +18,8 @@ def calculate_converted_read_methylation_stats(adata, reference_column, sample_n
  import numpy as np
  import anndata as ad
  import pandas as pd
- import matplotlib.pyplot as plt
- from .. import readwrite
+
+ print('Calculating read level methylation statistics')
 
  references = set(adata.obs[reference_column])
  sample_names = set(adata.obs[sample_names_col])
@@ -53,44 +50,45 @@ def calculate_converted_read_methylation_stats(adata, reference_column, sample_n
  pass_array = np.array(adata.obs[f'GpC_site_row_methylation_means'] > adata.obs[f'other_C_row_methylation_means'])
  adata.obs['GpC_above_other_C'] = pd.Series(pass_array, index=adata.obs.index, dtype=bool)
 
- adata.uns['methylation_dict'] = {}
- n_bins = 50
- site_types_to_analyze = ['GpC_site', 'CpG_site', 'ambiguous_GpC_CpG_site', 'other_C']
+ # Below should be a plotting function
+ # adata.uns['methylation_dict'] = {}
+ # n_bins = 50
+ # site_types_to_analyze = ['GpC_site', 'CpG_site', 'ambiguous_GpC_CpG_site', 'other_C']
 
- for reference in references:
- reference_adata = adata[adata.obs[reference_column] == reference].copy()
- split_reference = reference.split('_')[0][1:]
- for sample in sample_names:
- sample_adata = reference_adata[reference_adata.obs[sample_names_col] == sample].copy()
- for site_type in site_types_to_analyze:
- methylation_data = sample_adata.obs[f'{site_type}_row_methylation_means']
- max_meth = np.max(sample_adata.obs[f'{site_type}_row_methylation_sums'])
- if not np.isnan(max_meth):
- n_bins = int(max_meth // 2)
- else:
- n_bins = 1
- mean = np.mean(methylation_data)
- median = np.median(methylation_data)
- stdev = np.std(methylation_data)
- adata.uns['methylation_dict'][f'{reference}_{sample}_{site_type}'] = [mean, median, stdev]
- if show_methylation_histogram or save_methylation_histogram:
- fig, ax = plt.subplots(figsize=(6, 4))
- count, bins, patches = plt.hist(methylation_data, bins=n_bins, weights=np.ones(len(methylation_data)) / len(methylation_data), alpha=0.7, color='blue', edgecolor='black')
- plt.axvline(median, color='red', linestyle='dashed', linewidth=1)
- plt.text(median + stdev, max(count)*0.8, f'Median: {median:.2f}', color='red')
- plt.axvline(median - stdev, color='green', linestyle='dashed', linewidth=1, label=f'Stdev: {stdev:.2f}')
- plt.axvline(median + stdev, color='green', linestyle='dashed', linewidth=1)
- plt.text(median + stdev + 0.05, max(count) / 3, f'+1 Stdev: {stdev:.2f}', color='green')
- plt.xlabel('Fraction methylated')
- plt.ylabel('Proportion')
- title = f'Distribution of {methylation_data.shape[0]} read {site_type} methylation means \nfor {sample} sample on {split_reference} after filtering'
- plt.title(title, pad=20)
- plt.xlim(-0.05, 1.05) # Set x-axis range from 0 to 1
- ax.spines['right'].set_visible(False)
- ax.spines['top'].set_visible(False)
- save_name = output_directory + f'/{readwrite.date_string()} {title}'
- if save_methylation_histogram:
- plt.savefig(save_name, bbox_inches='tight', pad_inches=0.1)
- plt.close()
- else:
- plt.show()
+ # for reference in references:
+ # reference_adata = adata[adata.obs[reference_column] == reference].copy()
+ # split_reference = reference.split('_')[0][1:]
+ # for sample in sample_names:
+ # sample_adata = reference_adata[reference_adata.obs[sample_names_col] == sample].copy()
+ # for site_type in site_types_to_analyze:
+ # methylation_data = sample_adata.obs[f'{site_type}_row_methylation_means']
+ # max_meth = np.max(sample_adata.obs[f'{site_type}_row_methylation_sums'])
+ # if not np.isnan(max_meth):
+ # n_bins = int(max_meth // 2)
+ # else:
+ # n_bins = 1
+ # mean = np.mean(methylation_data)
+ # median = np.median(methylation_data)
+ # stdev = np.std(methylation_data)
+ # adata.uns['methylation_dict'][f'{reference}_{sample}_{site_type}'] = [mean, median, stdev]
+ # if show_methylation_histogram or save_methylation_histogram:
+ # fig, ax = plt.subplots(figsize=(6, 4))
+ # count, bins, patches = plt.hist(methylation_data, bins=n_bins, weights=np.ones(len(methylation_data)) / len(methylation_data), alpha=0.7, color='blue', edgecolor='black')
+ # plt.axvline(median, color='red', linestyle='dashed', linewidth=1)
+ # plt.text(median + stdev, max(count)*0.8, f'Median: {median:.2f}', color='red')
+ # plt.axvline(median - stdev, color='green', linestyle='dashed', linewidth=1, label=f'Stdev: {stdev:.2f}')
+ # plt.axvline(median + stdev, color='green', linestyle='dashed', linewidth=1)
+ # plt.text(median + stdev + 0.05, max(count) / 3, f'+1 Stdev: {stdev:.2f}', color='green')
+ # plt.xlabel('Fraction methylated')
+ # plt.ylabel('Proportion')
+ # title = f'Distribution of {methylation_data.shape[0]} read {site_type} methylation means \nfor {sample} sample on {split_reference} after filtering'
+ # plt.title(title, pad=20)
+ # plt.xlim(-0.05, 1.05) # Set x-axis range from 0 to 1
+ # ax.spines['right'].set_visible(False)
+ # ax.spines['top'].set_visible(False)
+ # save_name = output_directory + f'/{readwrite.date_string()} {title}'
+ # if save_methylation_histogram:
+ # plt.savefig(save_name, bbox_inches='tight', pad_inches=0.1)
+ # plt.close()
+ # else:
+ # plt.show()
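With the plotting code commented out, the slimmed-down function only annotates adata.obs; a minimal usage sketch (column names assumed to match the defaults used elsewhere in the package):

calculate_converted_read_methylation_stats(adata,
                                            reference_column="Reference",
                                            sample_names_col="Sample_names")
print(adata.obs["GpC_above_other_C"].value_counts())  # reads whose GpC methylation exceeds the other-C background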
smftools/preprocessing/calculate_coverage.py
@@ -1,41 +1,42 @@
- ## calculate_coverage
-
- def calculate_coverage(adata, obs_column='Reference', position_nan_threshold=0.05):
+ def calculate_coverage(adata, obs_column='Reference_strand', position_nan_threshold=0.05):
  """
- Append position level metadata regarding whether the position is informative within the given observation category.
+ Append position-level metadata regarding whether the position is informative within the given observation category.
 
  Parameters:
  adata (AnnData): An AnnData object
  obs_column (str): Observation column value to subset on prior to calculating position statistics for that category.
  position_nan_threshold (float): A minimal fractional threshold of coverage within the obs_column category to call the position as valid.
 
- Returns:
- None
+ Modifies:
+ - Adds new columns to `adata.var` containing coverage statistics.
  """
  import numpy as np
- import anndata as ad
  import pandas as pd
-
+ import anndata as ad
+
  categories = adata.obs[obs_column].cat.categories
  n_categories_with_position = np.zeros(adata.shape[1])
+
  # Loop over categories
  for cat in categories:
- # Look at positional information for each reference
- temp_cat_adata = adata[adata.obs[obs_column] == cat].copy()
- # Look at read coverage on the given category strand
+ print(f'Assessing positional coverage across samples for {cat} reference')
+
+ # Subset to current category
+ cat_mask = adata.obs[obs_column] == cat
+ temp_cat_adata = adata[cat_mask]
+
+ # Compute fraction of valid coverage
  cat_valid_coverage = np.sum(~np.isnan(temp_cat_adata.X), axis=0)
- cat_invalid_coverage = np.sum(np.isnan(temp_cat_adata.X), axis=0)
- cat_valid_fraction = cat_valid_coverage / (cat_valid_coverage + cat_invalid_coverage)
- # Append metadata for category to the anndata object
+ cat_valid_fraction = cat_valid_coverage / temp_cat_adata.shape[0] # Avoid extra computation
+
+ # Store coverage stats
  adata.var[f'{cat}_valid_fraction'] = pd.Series(cat_valid_fraction, index=adata.var.index)
- # Characterize if the position is in the given category or not
- conditions = [
- (adata.var[f'{cat}_valid_fraction'] >= position_nan_threshold),
- (adata.var[f'{cat}_valid_fraction'] < position_nan_threshold)
- ]
- choices = [True, False]
- adata.var[f'position_in_{cat}'] = np.select(conditions, choices, default=False)
- n_categories_with_position += np.array(adata.var[f'position_in_{cat}'])
-
- # Final array with the sum at each position of the number of categories covering that position
- adata.var[f'N_{obs_column}_with_position'] = n_categories_with_position.astype(int)
+
+ # Assign whether the position is covered based on threshold
+ adata.var[f'position_in_{cat}'] = cat_valid_fraction >= position_nan_threshold
+
+ # Sum the number of categories covering each position
+ n_categories_with_position += adata.var[f'position_in_{cat}'].values
+
+ # Store final category count
+ adata.var[f'N_{obs_column}_with_position'] = n_categories_with_position.astype(int)
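A minimal usage sketch of the revised calculate_coverage (note the default obs column is now 'Reference_strand'; the inspected columns follow the names created in the diff):

calculate_coverage(adata, obs_column="Reference_strand", position_nan_threshold=0.05)
print(adata.var.filter(like="_valid_fraction").head())       # per-category fraction of non-NaN reads
print(adata.var["N_Reference_strand_with_position"].head())  # number of categories covering each position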
smftools/preprocessing/calculate_pairwise_differences.py (new file)
@@ -0,0 +1,49 @@
+ # calculate_pairwise_differences
+
+ def calculate_pairwise_differences(arrays):
+ """
+ Calculate the pairwise differences for a list of h-stacked ndarrays. Ignore N-positions
+
+ Parameters:
+ arrays (str): A list of ndarrays.
+
+ Returns:
+ distance_matrix (ndarray): a 2D array containing the pairwise differences between all arrays.
+ """
+ import numpy as np
+ from tqdm import tqdm
+
+ num_arrays = len(arrays)
+
+ n_rows = 5
+ reshaped_arrays = [array.reshape(n_rows, -1) for array in arrays]
+ N_masks = [array[-1].astype(bool) for array in reshaped_arrays]
+ reshaped_arrays_minus_N = [array[:-1].flatten() for array in reshaped_arrays]
+
+ # Precompute the repeated N masks to avoid repeated computations
+ repeated_N_masks = [np.tile(N_mask, (n_rows - 1)) for N_mask in N_masks]
+
+ # Initialize the distance matrix
+ distance_matrix = np.zeros((num_arrays, num_arrays), dtype=np.float32)
+
+ # Calculate pairwise distances with progress bar
+ for i in tqdm(range(num_arrays), desc="Calculating Pairwise Differences"):
+ array_i = reshaped_arrays_minus_N[i]
+ N_mask_i = repeated_N_masks[i]
+
+ for j in range(i + 1, num_arrays):
+ array_j = reshaped_arrays_minus_N[j]
+ N_mask_j = repeated_N_masks[j]
+
+ # Combined mask to ignore N positions
+ combined_mask = N_mask_i | N_mask_j
+
+ # Calculate the hamming distance directly with boolean operations
+ differences = (array_i != array_j) & ~combined_mask
+ distance = np.sum(differences) / np.sum(~combined_mask)
+
+ # Store the symmetric distances
+ distance_matrix[i, j] = distance
+ distance_matrix[j, i] = distance
+
+ return distance_matrix
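A tiny worked example of the N-masked distance on two hypothetical one-hot reads (five rows per read, matching the n_rows = 5 layout above; the last row is the N mask):

import numpy as np

a = np.array([[1, 0, 0, 0],   # A layer
              [0, 1, 0, 0],   # C layer
              [0, 0, 1, 0],   # G layer
              [0, 0, 0, 0],   # T layer
              [0, 0, 0, 1]])  # N mask: position 4 is unknown
b = np.array([[1, 0, 0, 0],
              [0, 0, 0, 0],
              [0, 1, 1, 0],
              [0, 0, 0, 0],
              [0, 0, 0, 1]])
dist = calculate_pairwise_differences([a.flatten(), b.flatten()])
print(dist[0, 1])  # 2 differing entries / 12 unmasked entries, roughly 0.17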
smftools/preprocessing/calculate_position_Youden.py
@@ -1,7 +1,7 @@
  ## calculate_position_Youden
 
  ## Calculating and applying position level thresholds for methylation calls to binarize the SMF data
- def calculate_position_Youden(adata, positive_control_sample, negative_control_sample, J_threshold=0.4, obs_column='Reference', save=False, output_directory=''):
+ def calculate_position_Youden(adata, positive_control_sample='positive', negative_control_sample='negative', J_threshold=0.5, obs_column='Reference', infer_on_percentile=False, inference_variable='', save=False, output_directory=''):
  """
  Adds new variable metadata to each position indicating whether the position provides reliable SMF methylation calls. Also outputs plots of the positional ROC curves.
 
@@ -11,6 +11,8 @@ def calculate_position_Youden(adata, positive_control_sample, negative_control_s
  negative_control_sample (str): string representing the sample name corresponding to the Minus MTase control sample.
  J_threshold (float): A float indicating the J-statistic used to indicate whether a position passes QC for methylation calls.
  obs_column (str): The category to iterate over.
+ infer_on_perdentile (bool | int): If False, use defined postive and negative control samples. If an int (0 < int < 100) is passed, this uses the top and bottom int percentile of methylated reads based on metric in inference_variable column.
+ inference_variable (str): If infer_on_percentile has an integer value passed, use the AnnData observation column name passed by this string as the metric.
  save (bool): Whether to save the ROC plots.
  output_directory (str): String representing the path to the output directory to output the ROC curves.
 
@@ -27,15 +29,25 @@ def calculate_position_Youden(adata, positive_control_sample, negative_control_s
  categories = adata.obs[obs_column].cat.categories
  # Iterate over each category in the specified obs_column
  for cat in categories:
+ print(f"Calculating position Youden statistics for {cat}")
  # Subset to keep only reads associated with the category
- cat_subset = adata[adata.obs[obs_column] == cat].copy()
+ cat_subset = adata[adata.obs[obs_column] == cat]
  # Iterate over positive and negative control samples
  for control in control_samples:
  # Initialize a dictionary for the given control sample. This will be keyed by dataset and position to point to a tuple of coordinate position and an array of methylation probabilities
  adata.uns[f'{cat}_position_methylation_dict_{control}'] = {}
- # get the current control subset on the given category
- filtered_obs = cat_subset.obs[cat_subset.obs['Sample_names'].str.contains(control, na=False, regex=True)]
- control_subset = cat_subset[filtered_obs.index].copy()
+ if infer_on_percentile:
+ sorted_column = cat_subset.obs[inference_variable].sort_values(ascending=False)
+ if control == "positive":
+ threshold = np.percentile(sorted_column, 100 - infer_on_percentile)
+ control_subset = cat_subset[cat_subset.obs[inference_variable] >= threshold, :]
+ else:
+ threshold = np.percentile(sorted_column, infer_on_percentile)
+ control_subset = cat_subset[cat_subset.obs[inference_variable] <= threshold, :]
+ else:
+ # get the current control subset on the given category
+ filtered_obs = cat_subset.obs[cat_subset.obs['Sample_names'].str.contains(control, na=False, regex=True)]
+ control_subset = cat_subset[filtered_obs.index]
  # Iterate through every position in the control subset
  for position in range(control_subset.shape[1]):
  # Get the coordinate name associated with that position
@@ -91,8 +103,7 @@ def calculate_position_Youden(adata, positive_control_sample, negative_control_s
  probability_thresholding_list[position] = (0.8, np.nan)
  title = f'ROC Curve for {n_passed_positions} positions with J-stat greater than {J_threshold}\n out of {n_total_positions} total positions on {cat}'
  plt.title(title)
- date_string = date_string()
- save_name = output_directory + f'/{date_string} {title}'
+ save_name = output_directory + f'/{title}'
  if save:
  plt.savefig(save_name)
  plt.close()
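For reference, Youden's J at a position is J = sensitivity + specificity - 1 = TPR - FPR, maximized over candidate probability thresholds; positions whose best J exceeds J_threshold are kept. A minimal sketch of the new percentile-based inference mode (the obs column passed to inference_variable is an assumption):

calculate_position_Youden(
    adata,
    infer_on_percentile=10,                               # top/bottom 10% of reads act as pseudo-controls
    inference_variable="GpC_site_row_methylation_means",  # assumed obs column holding the ranking metric
    J_threshold=0.5,
    obs_column="Reference",
)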