smftools 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137)
  1. smftools/__init__.py +29 -0
  2. smftools/_settings.py +20 -0
  3. smftools/_version.py +1 -0
  4. smftools/datasets/F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz +0 -0
  5. smftools/datasets/F1_sample_sheet.csv +5 -0
  6. smftools/datasets/__init__.py +9 -0
  7. smftools/datasets/dCas9_m6A_invitro_kinetics.h5ad.gz +0 -0
  8. smftools/datasets/datasets.py +28 -0
  9. smftools/informatics/__init__.py +16 -0
  10. smftools/informatics/archived/bam_conversion.py +59 -0
  11. smftools/informatics/archived/bam_direct.py +63 -0
  12. smftools/informatics/archived/basecalls_to_adata.py +71 -0
  13. smftools/informatics/archived/print_bam_query_seq.py +29 -0
  14. smftools/informatics/basecall_pod5s.py +80 -0
  15. smftools/informatics/conversion_smf.py +132 -0
  16. smftools/informatics/direct_smf.py +137 -0
  17. smftools/informatics/fast5_to_pod5.py +21 -0
  18. smftools/informatics/helpers/LoadExperimentConfig.py +75 -0
  19. smftools/informatics/helpers/__init__.py +74 -0
  20. smftools/informatics/helpers/align_and_sort_BAM.py +59 -0
  21. smftools/informatics/helpers/aligned_BAM_to_bed.py +74 -0
  22. smftools/informatics/helpers/archived/informatics.py +260 -0
  23. smftools/informatics/helpers/archived/load_adata.py +516 -0
  24. smftools/informatics/helpers/bam_qc.py +66 -0
  25. smftools/informatics/helpers/bed_to_bigwig.py +39 -0
  26. smftools/informatics/helpers/binarize_converted_base_identities.py +79 -0
  27. smftools/informatics/helpers/canoncall.py +34 -0
  28. smftools/informatics/helpers/complement_base_list.py +21 -0
  29. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +55 -0
  30. smftools/informatics/helpers/converted_BAM_to_adata.py +245 -0
  31. smftools/informatics/helpers/converted_BAM_to_adata_II.py +369 -0
  32. smftools/informatics/helpers/count_aligned_reads.py +43 -0
  33. smftools/informatics/helpers/demux_and_index_BAM.py +52 -0
  34. smftools/informatics/helpers/extract_base_identities.py +44 -0
  35. smftools/informatics/helpers/extract_mods.py +83 -0
  36. smftools/informatics/helpers/extract_read_features_from_bam.py +31 -0
  37. smftools/informatics/helpers/extract_read_lengths_from_bed.py +25 -0
  38. smftools/informatics/helpers/extract_readnames_from_BAM.py +22 -0
  39. smftools/informatics/helpers/find_conversion_sites.py +50 -0
  40. smftools/informatics/helpers/generate_converted_FASTA.py +99 -0
  41. smftools/informatics/helpers/get_chromosome_lengths.py +32 -0
  42. smftools/informatics/helpers/get_native_references.py +28 -0
  43. smftools/informatics/helpers/index_fasta.py +12 -0
  44. smftools/informatics/helpers/make_dirs.py +21 -0
  45. smftools/informatics/helpers/make_modbed.py +27 -0
  46. smftools/informatics/helpers/modQC.py +27 -0
  47. smftools/informatics/helpers/modcall.py +36 -0
  48. smftools/informatics/helpers/modkit_extract_to_adata.py +884 -0
  49. smftools/informatics/helpers/ohe_batching.py +76 -0
  50. smftools/informatics/helpers/ohe_layers_decode.py +32 -0
  51. smftools/informatics/helpers/one_hot_decode.py +27 -0
  52. smftools/informatics/helpers/one_hot_encode.py +57 -0
  53. smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +53 -0
  54. smftools/informatics/helpers/run_multiqc.py +28 -0
  55. smftools/informatics/helpers/separate_bam_by_bc.py +43 -0
  56. smftools/informatics/helpers/split_and_index_BAM.py +36 -0
  57. smftools/informatics/load_adata.py +182 -0
  58. smftools/informatics/readwrite.py +106 -0
  59. smftools/informatics/subsample_fasta_from_bed.py +47 -0
  60. smftools/informatics/subsample_pod5.py +104 -0
  61. smftools/plotting/__init__.py +15 -0
  62. smftools/plotting/classifiers.py +355 -0
  63. smftools/plotting/general_plotting.py +205 -0
  64. smftools/plotting/position_stats.py +462 -0
  65. smftools/preprocessing/__init__.py +33 -0
  66. smftools/preprocessing/append_C_context.py +82 -0
  67. smftools/preprocessing/archives/mark_duplicates.py +146 -0
  68. smftools/preprocessing/archives/preprocessing.py +614 -0
  69. smftools/preprocessing/archives/remove_duplicates.py +21 -0
  70. smftools/preprocessing/binarize_on_Youden.py +45 -0
  71. smftools/preprocessing/binary_layers_to_ohe.py +40 -0
  72. smftools/preprocessing/calculate_complexity.py +72 -0
  73. smftools/preprocessing/calculate_consensus.py +47 -0
  74. smftools/preprocessing/calculate_converted_read_methylation_stats.py +94 -0
  75. smftools/preprocessing/calculate_coverage.py +42 -0
  76. smftools/preprocessing/calculate_pairwise_differences.py +49 -0
  77. smftools/preprocessing/calculate_pairwise_hamming_distances.py +27 -0
  78. smftools/preprocessing/calculate_position_Youden.py +115 -0
  79. smftools/preprocessing/calculate_read_length_stats.py +79 -0
  80. smftools/preprocessing/clean_NaN.py +46 -0
  81. smftools/preprocessing/filter_adata_by_nan_proportion.py +31 -0
  82. smftools/preprocessing/filter_converted_reads_on_methylation.py +44 -0
  83. smftools/preprocessing/filter_reads_on_length.py +51 -0
  84. smftools/preprocessing/flag_duplicate_reads.py +149 -0
  85. smftools/preprocessing/invert_adata.py +30 -0
  86. smftools/preprocessing/load_sample_sheet.py +38 -0
  87. smftools/preprocessing/make_dirs.py +21 -0
  88. smftools/preprocessing/min_non_diagonal.py +25 -0
  89. smftools/preprocessing/recipes.py +127 -0
  90. smftools/preprocessing/subsample_adata.py +58 -0
  91. smftools/readwrite.py +198 -0
  92. smftools/tools/__init__.py +49 -0
  93. smftools/tools/apply_hmm.py +202 -0
  94. smftools/tools/apply_hmm_batched.py +241 -0
  95. smftools/tools/archived/classify_methylated_features.py +66 -0
  96. smftools/tools/archived/classify_non_methylated_features.py +75 -0
  97. smftools/tools/archived/subset_adata_v1.py +32 -0
  98. smftools/tools/archived/subset_adata_v2.py +46 -0
  99. smftools/tools/calculate_distances.py +18 -0
  100. smftools/tools/calculate_umap.py +62 -0
  101. smftools/tools/call_hmm_peaks.py +105 -0
  102. smftools/tools/classifiers.py +787 -0
  103. smftools/tools/cluster_adata_on_methylation.py +105 -0
  104. smftools/tools/data/__init__.py +2 -0
  105. smftools/tools/data/anndata_data_module.py +90 -0
  106. smftools/tools/data/preprocessing.py +6 -0
  107. smftools/tools/display_hmm.py +18 -0
  108. smftools/tools/evaluation/__init__.py +0 -0
  109. smftools/tools/general_tools.py +69 -0
  110. smftools/tools/hmm_readwrite.py +16 -0
  111. smftools/tools/inference/__init__.py +1 -0
  112. smftools/tools/inference/lightning_inference.py +41 -0
  113. smftools/tools/models/__init__.py +9 -0
  114. smftools/tools/models/base.py +14 -0
  115. smftools/tools/models/cnn.py +34 -0
  116. smftools/tools/models/lightning_base.py +41 -0
  117. smftools/tools/models/mlp.py +17 -0
  118. smftools/tools/models/positional.py +17 -0
  119. smftools/tools/models/rnn.py +16 -0
  120. smftools/tools/models/sklearn_models.py +40 -0
  121. smftools/tools/models/transformer.py +133 -0
  122. smftools/tools/models/wrappers.py +20 -0
  123. smftools/tools/nucleosome_hmm_refinement.py +104 -0
  124. smftools/tools/position_stats.py +239 -0
  125. smftools/tools/read_stats.py +70 -0
  126. smftools/tools/subset_adata.py +28 -0
  127. smftools/tools/train_hmm.py +78 -0
  128. smftools/tools/training/__init__.py +1 -0
  129. smftools/tools/training/train_lightning_model.py +47 -0
  130. smftools/tools/utils/__init__.py +2 -0
  131. smftools/tools/utils/device.py +10 -0
  132. smftools/tools/utils/grl.py +14 -0
  133. {smftools-0.1.6.dist-info → smftools-0.1.7.dist-info}/METADATA +5 -2
  134. smftools-0.1.7.dist-info/RECORD +136 -0
  135. smftools-0.1.6.dist-info/RECORD +0 -4
  136. {smftools-0.1.6.dist-info → smftools-0.1.7.dist-info}/WHEEL +0 -0
  137. {smftools-0.1.6.dist-info → smftools-0.1.7.dist-info}/licenses/LICENSE +0 -0
smftools/preprocessing/__init__.py
@@ -0,0 +1,33 @@
+ from .append_C_context import append_C_context
+ from .binarize_on_Youden import binarize_on_Youden
+ from .calculate_complexity import calculate_complexity
+ from .calculate_converted_read_methylation_stats import calculate_converted_read_methylation_stats
+ from .calculate_coverage import calculate_coverage
+ from .calculate_position_Youden import calculate_position_Youden
+ from .calculate_read_length_stats import calculate_read_length_stats
+ from .clean_NaN import clean_NaN
+ from .filter_adata_by_nan_proportion import filter_adata_by_nan_proportion
+ from .filter_converted_reads_on_methylation import filter_converted_reads_on_methylation
+ from .filter_reads_on_length import filter_reads_on_length
+ from .invert_adata import invert_adata
+ from .load_sample_sheet import load_sample_sheet
+ from .flag_duplicate_reads import flag_duplicate_reads
+ from .subsample_adata import subsample_adata
+
+ __all__ = [
+     "append_C_context",
+     "binarize_on_Youden",
+     "calculate_complexity",
+     "calculate_converted_read_methylation_stats",
+     "calculate_coverage",
+     "calculate_position_Youden",
+     "calculate_read_length_stats",
+     "clean_NaN",
+     "filter_adata_by_nan_proportion",
+     "filter_converted_reads_on_methylation",
+     "filter_reads_on_length",
+     "invert_adata",
+     "load_sample_sheet",
+     "flag_duplicate_reads",
+     "subsample_adata"
+ ]
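
As a quick orientation, the sketch below shows what this flattened namespace buys a caller: every helper is importable directly from smftools.preprocessing rather than from its defining submodule. This is a minimal illustration, not package documentation.

import smftools.preprocessing as pp
from smftools.preprocessing.append_C_context import append_C_context

print(pp.__all__)                                # the fifteen names listed above
assert pp.append_C_context is append_C_context   # a re-export, not a copy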
smftools/preprocessing/append_C_context.py
@@ -0,0 +1,82 @@
+ ## append_C_context
+
+ ## Conversion SMF specific
+ # Read methylation QC
+ def append_C_context(adata, obs_column='Reference', use_consensus=False, native=False):
+     """
+     Adds cytosine-context annotations for each position within the given category. When use_consensus is True, the consensus sequence of the mapped reads is used; otherwise the reference FASTA sequence is used.
+
+     Parameters:
+         adata (AnnData): The input AnnData object.
+         obs_column (str): The observation column to stratify on. Default is 'Reference', which should not be changed for most purposes.
+         use_consensus (bool): Whether to use the consensus sequence of the reads mapped to the reference. If False, the reference FASTA sequence is used instead.
+         native (bool): If False, apply conversion SMF assumptions; if True, apply native SMF assumptions.
+
+     Returns:
+         None
+     """
+     import numpy as np
+
+     print('Adding cytosine context based on the reference sequence for each sample')
+
+     site_types = ['GpC_site', 'CpG_site', 'ambiguous_GpC_CpG_site', 'other_C', 'any_C_site']
+     categories = adata.obs[obs_column].cat.categories
+     for cat in categories:
+         # Assess whether the top or bottom strand was converted
+         if 'top' in cat:
+             strand = 'top'
+         elif 'bottom' in cat:
+             strand = 'bottom'
+         else:
+             print(f'Error: could not determine the converted strand for {cat}. Ensure "top" or "bottom" appears in the Reference name.')
+             continue
+
+         basename = cat.split(f"_{strand}")[0]
+         # Native and conversion SMF currently resolve the sequence the same way.
+         if use_consensus:
+             sequence = adata.uns[f'{basename}_consensus_sequence']
+         else:
+             # The unconverted sequence of the original input FASTA for the locus
+             sequence = adata.uns[f'{basename}_FASTA_sequence']
+
+         # Init a dict keyed by reference site type that points to a boolean array marking whether each position is that site type.
+         boolean_dict = {}
+         for site_type in site_types:
+             boolean_dict[f'{cat}_{site_type}'] = np.full(len(sequence), False, dtype=bool)
+
+         if strand == 'top':
+             # Iterate through the sequence and classify each cytosine by its neighbors
+             for i in range(1, len(sequence) - 1):
+                 if sequence[i] == 'C':
+                     boolean_dict[f'{cat}_any_C_site'][i] = True
+                     if sequence[i - 1] == 'G' and sequence[i + 1] != 'G':
+                         boolean_dict[f'{cat}_GpC_site'][i] = True
+                     elif sequence[i - 1] == 'G' and sequence[i + 1] == 'G':
+                         boolean_dict[f'{cat}_ambiguous_GpC_CpG_site'][i] = True
+                     elif sequence[i - 1] != 'G' and sequence[i + 1] == 'G':
+                         boolean_dict[f'{cat}_CpG_site'][i] = True
+                     else:
+                         boolean_dict[f'{cat}_other_C'][i] = True
+         else:  # strand == 'bottom'
+             # On the bottom strand, a top-strand G marks a cytosine on the complement
+             for i in range(1, len(sequence) - 1):
+                 if sequence[i] == 'G':
+                     boolean_dict[f'{cat}_any_C_site'][i] = True
+                     if sequence[i + 1] == 'C' and sequence[i - 1] != 'C':
+                         boolean_dict[f'{cat}_GpC_site'][i] = True
+                     elif sequence[i - 1] == 'C' and sequence[i + 1] == 'C':
+                         boolean_dict[f'{cat}_ambiguous_GpC_CpG_site'][i] = True
+                     elif sequence[i - 1] == 'C' and sequence[i + 1] != 'C':
+                         boolean_dict[f'{cat}_CpG_site'][i] = True
+                     else:
+                         boolean_dict[f'{cat}_other_C'][i] = True
+
+         for site_type in site_types:
+             adata.var[f'{cat}_{site_type}'] = boolean_dict[f'{cat}_{site_type}'].astype(bool)
+             adata.obsm[f'{cat}_{site_type}'] = adata[:, adata.var[f'{cat}_{site_type}']].X
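
A short, self-contained sketch of the expected inputs and outputs follows. The reference name 'locus1_top', the 10 bp sequence, and the uns key are illustrative, chosen only to match the patterns the function reads (f'{basename}_FASTA_sequence', 'top' in the reference name); they are not taken from the package's data.

import numpy as np
import pandas as pd
import anndata as ad
from smftools.preprocessing import append_C_context

# Four reads over a 10 bp locus; X holds placeholder methylation calls.
adata = ad.AnnData(np.zeros((4, 10)))
adata.obs['Reference'] = pd.Categorical(['locus1_top'] * 4)
adata.uns['locus1_FASTA_sequence'] = 'AGCGCATGCC'

append_C_context(adata)
# Positions 4 and 8 have a G before and no G after, so they are GpC sites;
# position 2 sits between two Gs, so it is ambiguous GpC/CpG.
print(np.where(adata.var['locus1_top_GpC_site'])[0])                 # [4 8]
print(np.where(adata.var['locus1_top_ambiguous_GpC_CpG_site'])[0])   # [2]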
smftools/preprocessing/archives/mark_duplicates.py
@@ -0,0 +1,146 @@
+ ## mark_duplicates
+
+ def mark_duplicates(adata, layers, obs_column='Reference', sample_col='Sample_names', method='N_masked_distances', distance_thresholds={}):
+     """
+     Marks duplicate reads in the adata object.
+
+     Parameters:
+         adata (AnnData): An AnnData object.
+         layers (list): A list of strings naming the layers to use.
+         obs_column (str): The obs column name to subset on first. Default is 'Reference'.
+         sample_col (str): The obs column name to subset on second. Default is 'Sample_names'.
+         method (str): The method to use for calculating the distance metric.
+         distance_thresholds (dict): A dictionary keyed by obs_column categories that maps to the float distance threshold to apply. Default is an empty dict.
+
+     Returns:
+         None
+     """
+
+     import numpy as np
+     import pandas as pd
+     import matplotlib.pyplot as plt
+     from scipy.signal import find_peaks
+     import networkx as nx
+     from .binary_layers_to_ohe import binary_layers_to_ohe
+     from .calculate_pairwise_differences import calculate_pairwise_differences
+     from .min_non_diagonal import min_non_diagonal
+
+     categories = adata.obs[obs_column].cat.categories
+     sample_names = adata.obs[sample_col].cat.categories
+
+     # Calculate the pairwise Hamming distances within each reference/sample set and determine a distance threshold for each reference/sample pair
+     adata.obs['Nearest_neighbor_Hamming_distance'] = pd.Series(np.nan, index=adata.obs_names, dtype=float)
+     cat_sample_dict = {}
+     for cat in categories:
+         cat_subset = adata[adata.obs[obs_column] == cat].copy()
+         for sample in sample_names:
+             sample_subset = cat_subset[cat_subset.obs[sample_col] == sample].copy()
+             sample_subset = sample_subset[:, sample_subset.var[f'{cat}_any_C_site']].copy()  # only use C sites from the converted strand
+             # Encode sequencing reads as one-hot encodings
+             print(f'One-hot encoding reads from {sample} on {cat}')
+             cat_sample_dict[f'{cat}_{sample}_read_OHE_dict'] = binary_layers_to_ohe(sample_subset, layers, stack='hstack')
+             # Unpack the read names and one-hot encodings into lists
+             read_names = []
+             ohe_list = []
+             for read_name, ohe in cat_sample_dict[f'{cat}_{sample}_read_OHE_dict'].items():
+                 read_names.append(read_name)
+                 ohe_list.append(ohe)
+             # Calculate the pairwise Hamming distances
+             if method == 'N_masked_distances':
+                 print(f'Calculating N_masked_distances for {sample} on {cat} allele')
+                 distance_matrix = calculate_pairwise_differences(ohe_list)
+             else:
+                 raise ValueError(f'{method} for calculating differences is not available')
+             n_reads = distance_matrix.shape[0]
+             # Load the distance matrix into a dataframe indexed by read names on both axes
+             distance_df = pd.DataFrame(distance_matrix, index=read_names, columns=read_names)
+             cat_sample_dict[f'Pairwise_Hamming_distance_within_{cat}_{sample}'] = distance_df
+
+             if n_reads > 1:
+                 # Calculate the minimum non-self distance for every read in the reference and sample
+                 min_distance_values = min_non_diagonal(distance_matrix)
+                 min_distance_df = pd.DataFrame({'Nearest_neighbor_Hamming_distance': min_distance_values}, index=read_names)
+                 adata.obs.update(min_distance_df)
+
+                 if cat in distance_thresholds:
+                     adata.uns[f'Hamming_distance_threshold_for_{cat}_{sample}'] = distance_thresholds[cat]
+                 else:  # eventually this should be rewritten to use known PCR duplicate controls for thresholding
+                     # Generate a histogram of minimum non-self distances across reads
+                     if n_reads > 3:
+                         n_bins = n_reads // 4
+                     else:
+                         n_bins = 1
+                     min_distance_bins = plt.hist(min_distance_values, bins=n_bins)
+                     # Normalize the max value in any histogram bin to 1
+                     normalized_min_distance_counts = min_distance_bins[0] / np.max(min_distance_bins[0])
+                     # Extract the bin index of peak centers in the histogram
+                     try:
+                         peak_centers, _ = find_peaks(normalized_min_distance_counts, prominence=0.2, distance=5)
+                         first_peak_index = peak_centers[0]
+                         offset_index = first_peak_index - 1
+                         # Use the distance of the bin just below the first peak as the threshold distance in graph construction
+                         offset_distance = min_distance_bins[1][offset_index]
+                     except IndexError:
+                         # No peak was found; fall back to the first normalized bin count
+                         offset_distance = normalized_min_distance_counts[0]
+                     adata.uns[f'Hamming_distance_threshold_for_{cat}_{sample}'] = offset_distance
+             else:
+                 adata.uns[f'Hamming_distance_threshold_for_{cat}_{sample}'] = 0
+
+     ## Detect likely duplicate reads and mark them in the adata object
+     adata.obs['Marked_duplicate'] = pd.Series(False, index=adata.obs_names, dtype=bool)
+     adata.obs['Unique_in_final_read_set'] = pd.Series(False, index=adata.obs_names, dtype=bool)
+     adata.obs[f'Hamming_distance_cluster_within_{obs_column}_and_sample'] = pd.Series(-1, index=adata.obs_names, dtype=int)
+
+     for cat in categories:
+         for sample in sample_names:
+             distance_df = cat_sample_dict[f'Pairwise_Hamming_distance_within_{cat}_{sample}']
+             read_names = distance_df.index
+             distance_matrix = distance_df.values
+             n_reads = distance_matrix.shape[0]
+             distance_threshold = adata.uns[f'Hamming_distance_threshold_for_{cat}_{sample}']
+             # Initialize the read distance graph
+             G = nx.Graph()
+             # Add each read as a node to the graph
+             G.add_nodes_from(range(n_reads))
+             # Add edges between reads whose distance falls at or below the threshold
+             for i in range(n_reads):
+                 for j in range(i + 1, n_reads):
+                     if distance_matrix[i, j] <= distance_threshold:
+                         G.add_edge(i, j)
+             # Determine distinct clusters using connected components
+             clusters = [list(cluster) for cluster in nx.connected_components(G)]
+             # Get the number of clusters
+             cluster_count = len(clusters)
+             if n_reads > 0:
+                 fraction_unique = cluster_count / n_reads
+             else:
+                 fraction_unique = 0
+             adata.uns[f'Hamming_distance_cluster_count_within_{cat}_{sample}'] = cluster_count
+             adata.uns[f'total_reads_within_{cat}_{sample}'] = n_reads
+             # Update the adata object: keep the first read of each cluster and mark the rest as duplicates
+             read_cluster_map = {}
+             read_duplicate_map = {}
+             read_keep_map = {}
+             for i, cluster in enumerate(clusters):
+                 for j, read_index in enumerate(cluster):
+                     read_name = read_names[read_index]
+                     read_cluster_map[read_name] = i
+                     if len(cluster) > 1:
+                         read_duplicate_map[read_name] = True
+                         read_keep_map[read_name] = (j == 0)
+                     else:
+                         read_duplicate_map[read_name] = False
+                         read_keep_map[read_name] = True
+             cluster_df = pd.DataFrame.from_dict(read_cluster_map, orient='index', columns=[f'Hamming_distance_cluster_within_{obs_column}_and_sample'], dtype=int)
+             duplicate_df = pd.DataFrame.from_dict(read_duplicate_map, orient='index', columns=['Marked_duplicate'], dtype=bool)
+             keep_df = pd.DataFrame.from_dict(read_keep_map, orient='index', columns=['Unique_in_final_read_set'], dtype=bool)
+             df_combined = pd.concat([cluster_df, duplicate_df, keep_df], axis=1)
+             adata.obs.update(df_combined)
+             adata.obs['Marked_duplicate'] = adata.obs['Marked_duplicate'].astype(bool)
+             adata.obs['Unique_in_final_read_set'] = adata.obs['Unique_in_final_read_set'].astype(bool)
+             print(f'Hamming clusters for {sample} on {cat}\nThreshold: {distance_threshold}\nNumber of clusters: {cluster_count}\nNumber of reads: {n_reads}\nFraction unique: {fraction_unique}')
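
This archived helper expects an AnnData that has already been through the earlier preprocessing steps: binarized methylation layers and the f'{cat}_any_C_site' var columns written by append_C_context. A hedged usage sketch follows; the layer name, reference name, and threshold are illustrative, and the import path assumes the archives folder is importable as a package.

from smftools.preprocessing.archives.mark_duplicates import mark_duplicates

# 'binarized_GpC' is an illustrative layer name; the real names depend on
# how the binarization step was run.
mark_duplicates(
    adata,
    layers=['binarized_GpC'],
    obs_column='Reference',
    sample_col='Sample_names',
    method='N_masked_distances',
    distance_thresholds={'locus1_top': 5.0},  # optional per-reference override
)
# One representative read per Hamming cluster is kept.
unique_reads = adata[adata.obs['Unique_in_final_read_set']].copy()

Because clusters are connected components, a read is folded into a duplicate cluster if it is within the threshold of any member, not necessarily of all members, so thresholds should be set conservatively.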