smftools 0.1.3__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. {smftools-0.1.3.dist-info → smftools-0.1.6.dist-info}/METADATA +44 -11
  2. smftools-0.1.6.dist-info/RECORD +4 -0
  3. smftools/__init__.py +0 -25
  4. smftools/_settings.py +0 -20
  5. smftools/_version.py +0 -1
  6. smftools/datasets/F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz +0 -0
  7. smftools/datasets/F1_sample_sheet.csv +0 -5
  8. smftools/datasets/__init__.py +0 -9
  9. smftools/datasets/dCas9_m6A_invitro_kinetics.h5ad.gz +0 -0
  10. smftools/datasets/datasets.py +0 -28
  11. smftools/informatics/__init__.py +0 -14
  12. smftools/informatics/archived/bam_conversion.py +0 -59
  13. smftools/informatics/archived/bam_direct.py +0 -63
  14. smftools/informatics/archived/basecalls_to_adata.py +0 -71
  15. smftools/informatics/conversion_smf.py +0 -79
  16. smftools/informatics/direct_smf.py +0 -89
  17. smftools/informatics/fast5_to_pod5.py +0 -21
  18. smftools/informatics/helpers/LoadExperimentConfig.py +0 -74
  19. smftools/informatics/helpers/__init__.py +0 -60
  20. smftools/informatics/helpers/align_and_sort_BAM.py +0 -48
  21. smftools/informatics/helpers/aligned_BAM_to_bed.py +0 -73
  22. smftools/informatics/helpers/archived/informatics.py +0 -260
  23. smftools/informatics/helpers/archived/load_adata.py +0 -516
  24. smftools/informatics/helpers/bed_to_bigwig.py +0 -39
  25. smftools/informatics/helpers/binarize_converted_base_identities.py +0 -31
  26. smftools/informatics/helpers/canoncall.py +0 -25
  27. smftools/informatics/helpers/complement_base_list.py +0 -21
  28. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -54
  29. smftools/informatics/helpers/converted_BAM_to_adata.py +0 -233
  30. smftools/informatics/helpers/count_aligned_reads.py +0 -43
  31. smftools/informatics/helpers/extract_base_identities.py +0 -57
  32. smftools/informatics/helpers/extract_mods.py +0 -51
  33. smftools/informatics/helpers/extract_readnames_from_BAM.py +0 -22
  34. smftools/informatics/helpers/find_conversion_sites.py +0 -61
  35. smftools/informatics/helpers/generate_converted_FASTA.py +0 -98
  36. smftools/informatics/helpers/get_chromosome_lengths.py +0 -32
  37. smftools/informatics/helpers/get_native_references.py +0 -28
  38. smftools/informatics/helpers/index_fasta.py +0 -12
  39. smftools/informatics/helpers/make_dirs.py +0 -21
  40. smftools/informatics/helpers/make_modbed.py +0 -27
  41. smftools/informatics/helpers/modQC.py +0 -27
  42. smftools/informatics/helpers/modcall.py +0 -28
  43. smftools/informatics/helpers/modkit_extract_to_adata.py +0 -518
  44. smftools/informatics/helpers/ohe_batching.py +0 -52
  45. smftools/informatics/helpers/one_hot_encode.py +0 -21
  46. smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +0 -52
  47. smftools/informatics/helpers/separate_bam_by_bc.py +0 -43
  48. smftools/informatics/helpers/split_and_index_BAM.py +0 -41
  49. smftools/informatics/load_adata.py +0 -127
  50. smftools/informatics/readwrite.py +0 -106
  51. smftools/informatics/subsample_fasta_from_bed.py +0 -47
  52. smftools/informatics/subsample_pod5.py +0 -104
  53. smftools/plotting/__init__.py +0 -0
  54. smftools/preprocessing/__init__.py +0 -34
  55. smftools/preprocessing/append_C_context.py +0 -69
  56. smftools/preprocessing/archives/preprocessing.py +0 -614
  57. smftools/preprocessing/binarize_on_Youden.py +0 -42
  58. smftools/preprocessing/binary_layers_to_ohe.py +0 -30
  59. smftools/preprocessing/calculate_complexity.py +0 -71
  60. smftools/preprocessing/calculate_consensus.py +0 -47
  61. smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -96
  62. smftools/preprocessing/calculate_coverage.py +0 -41
  63. smftools/preprocessing/calculate_pairwise_hamming_distances.py +0 -27
  64. smftools/preprocessing/calculate_position_Youden.py +0 -104
  65. smftools/preprocessing/calculate_read_length_stats.py +0 -86
  66. smftools/preprocessing/clean_NaN.py +0 -38
  67. smftools/preprocessing/filter_converted_reads_on_methylation.py +0 -29
  68. smftools/preprocessing/filter_reads_on_length.py +0 -41
  69. smftools/preprocessing/invert_adata.py +0 -23
  70. smftools/preprocessing/load_sample_sheet.py +0 -24
  71. smftools/preprocessing/make_dirs.py +0 -21
  72. smftools/preprocessing/mark_duplicates.py +0 -134
  73. smftools/preprocessing/min_non_diagonal.py +0 -25
  74. smftools/preprocessing/recipes.py +0 -125
  75. smftools/preprocessing/remove_duplicates.py +0 -21
  76. smftools/readwrite.py +0 -106
  77. smftools/tools/__init__.py +0 -0
  78. smftools/tools/apply_HMM.py +0 -1
  79. smftools/tools/cluster.py +0 -0
  80. smftools/tools/read_HMM.py +0 -1
  81. smftools/tools/subset_adata.py +0 -32
  82. smftools/tools/train_HMM.py +0 -43
  83. smftools-0.1.3.dist-info/RECORD +0 -84
  84. {smftools-0.1.3.dist-info → smftools-0.1.6.dist-info}/WHEEL +0 -0
  85. {smftools-0.1.3.dist-info → smftools-0.1.6.dist-info}/licenses/LICENSE +0 -0
smftools/preprocessing/load_sample_sheet.py DELETED
@@ -1,24 +0,0 @@
- # load_sample_sheet
-
- def load_sample_sheet(adata, sample_sheet_path, mapping_key_column):
-     """
-     Loads a sample sheet csv and uses one of the columns to map sample information into the AnnData object.
-
-     Parameters:
-         adata (AnnData): The AnnData object to append sample information to.
-         sample_sheet_path (str): Path to the sample sheet csv containing the sample metadata.
-         mapping_key_column (str): Name of the sample sheet column to use as the key for mapping metadata onto adata.obs.
-
-     Returns:
-         None
-     """
-     import pandas as pd
-     import anndata as ad
-     df = pd.read_csv(sample_sheet_path)
-     key_column = mapping_key_column
-     df[key_column] = df[key_column].astype(str)
-     value_columns = [column for column in df.columns if column != key_column]
-     mapping_dict = df.set_index(key_column)[value_columns].to_dict(orient='index')
-     for column in value_columns:
-         column_map = {key: value[column] for key, value in mapping_dict.items()}
-         adata.obs[column] = adata.obs[key_column].map(column_map)
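To make the mapping behavior concrete, here is a minimal, hypothetical usage sketch; it assumes load_sample_sheet is in scope (e.g. imported from smftools.preprocessing in 0.1.3) and that adata.obs already carries a 'Sample' column matching the sample sheet's key column:

import anndata as ad
import numpy as np
import pandas as pd

# Toy AnnData whose reads already carry the mapping key column.
adata = ad.AnnData(
    X=np.zeros((4, 3)),
    obs=pd.DataFrame({"Sample": ["s1", "s1", "s2", "s2"]}, index=[f"read{i}" for i in range(4)]),
)
# Toy sample sheet: one row per sample, extra columns become per-read metadata.
pd.DataFrame({"Sample": ["s1", "s2"], "Condition": ["control", "treated"]}).to_csv("sample_sheet.csv", index=False)

load_sample_sheet(adata, "sample_sheet.csv", mapping_key_column="Sample")
print(adata.obs["Condition"].tolist())  # ['control', 'control', 'treated', 'treated']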
smftools/preprocessing/make_dirs.py DELETED
@@ -1,21 +0,0 @@
- ## make_dirs
-
- # General
- def make_dirs(directories):
-     """
-     Takes a list of directory paths and creates each directory if it does not already exist.
-
-     Parameters:
-         directories (list): A list of directories to make
-
-     Returns:
-         None
-     """
-     import os
-
-     for directory in directories:
-         if not os.path.isdir(directory):
-             os.mkdir(directory)
-             print(f"Directory '{directory}' created successfully.")
-         else:
-             print(f"Directory '{directory}' already exists.")
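Note that the helper calls os.mkdir, which only creates the leaf directory, so parent directories must already exist (nested paths would need os.makedirs). A quick sketch, assuming make_dirs is in scope; the directory names are arbitrary:

import os
import tempfile

base = tempfile.mkdtemp()
make_dirs([os.path.join(base, "plots"), os.path.join(base, "h5ads")])
make_dirs([os.path.join(base, "plots")])  # second call only reports that the directory already exists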
smftools/preprocessing/mark_duplicates.py DELETED
@@ -1,134 +0,0 @@
- ## mark_duplicates
-
- def mark_duplicates(adata, layers, obs_column='Reference', sample_col='Sample_names', hamming_distance_thresholds={}):
-     """
-     Marks duplicates in the adata object.
-
-     Parameters:
-         adata (AnnData): An adata object.
-         layers (list): A list of strings representing the layers to use.
-         obs_column (str): A string representing the obs column name to first subset on. Default is 'Reference'.
-         sample_col (str): A string representing the obs column name to second subset on. Default is 'Sample_names'.
-         hamming_distance_thresholds (dict): A dictionary keyed by obs_column categories that points to a float corresponding to the distance threshold to apply. Default is an empty dict.
-
-     Returns:
-         None
-     """
-
-     import numpy as np
-     import pandas as pd
-     import matplotlib.pyplot as plt
-     from scipy.signal import find_peaks
-     import networkx as nx
-     from .binary_layers_to_ohe import binary_layers_to_ohe
-     from .calculate_pairwise_hamming_distances import calculate_pairwise_hamming_distances
-     from .min_non_diagonal import min_non_diagonal
-
-     categories = adata.obs[obs_column].cat.categories
-     sample_names = adata.obs[sample_col].cat.categories
-
-     # Calculate the pairwise Hamming distances within each reference/sample set. Determine distance thresholds for each reference/sample pair
-     adata.obs['Nearest_neighbor_Hamming_distance'] = pd.Series(np.nan, index=adata.obs_names, dtype=float)
-     for cat in categories:
-         cat_subset = adata[adata.obs[obs_column] == cat].copy()
-         for sample in sample_names:
-             sample_subset = cat_subset[cat_subset.obs[sample_col] == sample].copy()
-             # Encode sequencing reads as one-hot encodings
-             adata.uns[f'{cat}_{sample}_read_OHE_dict'] = binary_layers_to_ohe(sample_subset, layers, stack='hstack')
-             # Unpack the read names and one hot encodings into lists
-             read_names = []
-             ohe_list = []
-             for read_name, ohe in adata.uns[f'{cat}_{sample}_read_OHE_dict'].items():
-                 read_names.append(read_name)
-                 ohe_list.append(ohe)
-             # Calculate the pairwise hamming distances
-             print(f'Calculating hamming distances for {sample} on {cat} allele')
-             distance_matrix = calculate_pairwise_hamming_distances(ohe_list)
-             n_reads = distance_matrix.shape[0]
-             # Load the hamming matrix into a dataframe with index and column names as the read_names
-             distance_df = pd.DataFrame(distance_matrix, index=read_names, columns=read_names)
-             # Save the distance dataframe into an unstructured component of the adata object
-             adata.uns[f'Pairwise_Hamming_distance_within_{cat}_{sample}'] = distance_df
-             if n_reads > 1:
-                 # Calculate the minimum non-self distance for every read in the reference and sample
-                 min_distance_values = min_non_diagonal(distance_matrix)
-                 min_distance_df = pd.DataFrame({'Nearest_neighbor_Hamming_distance': min_distance_values}, index=read_names)
-                 adata.obs.update(min_distance_df)
-                 # Generate a histogram of minimum non-self distances for each read
-                 if n_reads > 3:
-                     n_bins = n_reads // 4
-                 else:
-                     n_bins = 1
-                 min_distance_bins = plt.hist(min_distance_values, bins=n_bins)
-                 if cat in hamming_distance_thresholds:
-                     adata.uns[f'Hamming_distance_threshold_for_{cat}_{sample}'] = hamming_distance_thresholds[cat]
-                 else: # eventually this should be written to use known PCR duplicate controls for thresholding.
-                     # Normalize the max value in any histogram bin to 1
-                     normalized_min_distance_counts = min_distance_bins[0] / np.max(min_distance_bins[0])
-                     # Extract the bin index of peak centers in the histogram
-                     peak_centers, _ = find_peaks(normalized_min_distance_counts, prominence=0.2, distance=5)
-                     first_peak_index = peak_centers[0]
-                     offset_index = first_peak_index-1
-                     # Use the distance corresponding to the first peak as the threshold distance in graph construction
-                     first_peak_distance = min_distance_bins[1][first_peak_index]
-                     offset_distance = min_distance_bins[1][offset_index]
-                     adata.uns[f'Hamming_distance_threshold_for_{cat}_{sample}'] = offset_distance
-             else:
-                 adata.uns[f'Hamming_distance_threshold_for_{cat}_{sample}'] = 0
-
-     ## Detect likely duplicate reads and mark them in the adata object.
-     adata.obs['Marked_duplicate'] = pd.Series(False, index=adata.obs_names, dtype=bool)
-     adata.obs['Unique_in_final_read_set'] = pd.Series(False, index=adata.obs_names, dtype=bool)
-     adata.obs[f'Hamming_distance_cluster_within_{obs_column}_and_sample'] = pd.Series(-1, index=adata.obs_names, dtype=int)
-
-     for cat in categories:
-         for sample in sample_names:
-             distance_df = adata.uns[f'Pairwise_Hamming_distance_within_{cat}_{sample}']
-             read_names = distance_df.index
-             distance_matrix = distance_df.values
-             n_reads = distance_matrix.shape[0]
-             distance_threshold = adata.uns[f'Hamming_distance_threshold_for_{cat}_{sample}']
-             # Initialize the read distance graph
-             G = nx.Graph()
-             # Add each read as a node to the graph
-             G.add_nodes_from(range(n_reads))
-             # Add edges based on the threshold
-             for i in range(n_reads):
-                 for j in range(i + 1, n_reads):
-                     if distance_matrix[i, j] <= distance_threshold:
-                         G.add_edge(i, j)
-             # Determine distinct clusters using connected components
-             clusters = list(nx.connected_components(G))
-             clusters = [list(cluster) for cluster in clusters]
-             # Get the number of clusters
-             cluster_count = len(clusters)
-             if n_reads > 0:
-                 fraction_unique = cluster_count / n_reads
-             else:
-                 fraction_unique = 0
-             adata.uns[f'Hamming_distance_clusters_within_{cat}_{sample}'] = [cluster_count, n_reads, fraction_unique, clusters]
-             # Update the adata object
-             read_cluster_map = {}
-             read_duplicate_map = {}
-             read_keep_map = {}
-             for i, cluster in enumerate(clusters):
-                 for j, read_index in enumerate(cluster):
-                     read_name = read_names[read_index]
-                     read_cluster_map[read_name] = i
-                     if len(cluster) > 1:
-                         read_duplicate_map[read_name] = True
-                         if j == 0:
-                             read_keep_map[read_name] = True
-                         else:
-                             read_keep_map[read_name] = False
-                     elif len(cluster) == 1:
-                         read_duplicate_map[read_name] = False
-                         read_keep_map[read_name] = True
-             cluster_df = pd.DataFrame.from_dict(read_cluster_map, orient='index', columns=[f'Hamming_distance_cluster_within_{obs_column}_and_sample'], dtype=int)
-             duplicate_df = pd.DataFrame.from_dict(read_duplicate_map, orient='index', columns=['Marked_duplicate'], dtype=bool)
-             keep_df = pd.DataFrame.from_dict(read_keep_map, orient='index', columns=['Unique_in_final_read_set'], dtype=bool)
-             df_combined = pd.concat([cluster_df, duplicate_df, keep_df], axis=1)
-             adata.obs.update(df_combined)
-             adata.obs['Marked_duplicate'] = adata.obs['Marked_duplicate'].astype(bool)
-             adata.obs['Unique_in_final_read_set'] = adata.obs['Unique_in_final_read_set'].astype(bool)
-             print(f'Hamming clusters for {sample} on {cat}\nThreshold: {distance_threshold}\nNumber clusters: {cluster_count}\nNumber reads: {n_reads}\nFraction unique: {fraction_unique}')
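The core of the duplicate marking is simple: threshold the pairwise Hamming-distance matrix and take connected components, so that each component is one putative original molecule. A self-contained sketch of just that step (the toy matrix and threshold are made up):

import networkx as nx
import numpy as np

# Toy pairwise Hamming distances for 4 reads; reads 0 and 1 are near-identical.
distance_matrix = np.array([
    [0, 1, 9, 8],
    [1, 0, 9, 8],
    [9, 9, 0, 7],
    [8, 8, 7, 0],
])
threshold = 2

G = nx.Graph()
G.add_nodes_from(range(distance_matrix.shape[0]))
for i in range(distance_matrix.shape[0]):
    for j in range(i + 1, distance_matrix.shape[0]):
        if distance_matrix[i, j] <= threshold:
            G.add_edge(i, j)

clusters = [sorted(c) for c in nx.connected_components(G)]
print(clusters)                                  # [[0, 1], [2], [3]] -> read 1 duplicates read 0
print(len(clusters) / distance_matrix.shape[0])  # fraction unique = 0.75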
smftools/preprocessing/min_non_diagonal.py DELETED
@@ -1,25 +0,0 @@
- ## min_non_diagonal
-
- def min_non_diagonal(matrix):
-     """
-     Takes a matrix and returns the smallest value from each row with the diagonal masked.
-
-     Parameters:
-         matrix (ndarray): A 2D ndarray.
-
-     Returns:
-         min_values (list): A list of minimum values from each row of the matrix
-     """
-     import numpy as np
-
-     n = matrix.shape[0]
-     min_values = []
-     for i in range(n):
-         # Mask to exclude the diagonal element
-         row_mask = np.ones(n, dtype=bool)
-         row_mask[i] = False
-         # Extract the row excluding the diagonal element
-         row = matrix[i, row_mask]
-         # Find the minimum value in the row
-         min_values.append(np.min(row))
-     return min_values
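A quick worked example of the helper above on a 3x3 distance matrix (values are made up; min_non_diagonal is assumed to be in scope):

import numpy as np

m = np.array([
    [0, 5, 2],
    [5, 0, 7],
    [2, 7, 0],
])
# Nearest non-self neighbor per row: row 0 -> 2, row 1 -> 5, row 2 -> 2.
print([int(v) for v in min_non_diagonal(m)])  # [2, 5, 2]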
smftools/preprocessing/recipes.py DELETED
@@ -1,125 +0,0 @@
- # recipes
-
- def recipe_1_Kissiov_and_McKenna_2025(adata, sample_sheet_path, output_directory, mapping_key_column='Sample', reference_column='Reference', sample_names_col='Sample_names', invert=False):
-     """
-     The first part of the preprocessing workflow applied to the smf.inform.pod_to_adata() output derived from Kissiov_and_McKenna_2025.
-
-     Performs the following tasks:
-         1) Loads a sample CSV to append metadata mappings to the adata object.
-         2) Appends a boolean indicating whether each position in var_names is within a given reference.
-         3) Appends the cytosine context to each position from each reference.
-         4) Calculates read level methylation statistics.
-         5) Optionally inverts the adata to flip the position coordinate orientation.
-         6) Calculates read length statistics (start position, end position, read length).
-         7) Returns a dictionary to pass the variable namespace to the parent scope.
-
-     Parameters:
-         adata (AnnData): The AnnData object to use as input.
-         sample_sheet_path (str): String representing the path to the sample sheet csv containing the sample metadata.
-         output_directory (str): String representing the path to the output directory for plots.
-         mapping_key_column (str): The column name to use as the mapping keys for applying the sample sheet metadata.
-         reference_column (str): The name of the reference column to use.
-         sample_names_col (str): The name of the sample name column to use.
-         invert (bool): Whether to invert the positional coordinates of the adata object.
-
-     Returns:
-         variables (dict): A dictionary of variables to append to the parent scope.
-     """
-     import anndata as ad
-     import pandas as pd
-     import numpy as np
-     from .load_sample_sheet import load_sample_sheet
-     from .calculate_coverage import calculate_coverage
-     from .append_C_context import append_C_context
-     from .calculate_converted_read_methylation_stats import calculate_converted_read_methylation_stats
-     from .invert_adata import invert_adata
-     from .calculate_read_length_stats import calculate_read_length_stats
-
-     # Clean up some of the Reference metadata and save variable names that point to sets of values in the column.
-     adata.obs[reference_column] = adata.obs[reference_column].astype('category')
-     references = adata.obs[reference_column].cat.categories
-     split_references = [(reference, reference.split('_')[0][1:]) for reference in references]
-     reference_mapping = {k: v for k, v in split_references}
-     adata.obs[f'{reference_column}_short'] = adata.obs[reference_column].map(reference_mapping)
-     short_references = set(adata.obs[f'{reference_column}_short'])
-     binary_layers = adata.layers.keys()
-
-     # Load sample sheet metadata
-     load_sample_sheet(adata, sample_sheet_path, mapping_key_column)
-
-     # Hold the sample names set
-     adata.obs[sample_names_col] = adata.obs[sample_names_col].astype('category')
-     sample_names = adata.obs[sample_names_col].cat.categories
-
-     # Add position level metadata
-     calculate_coverage(adata, obs_column=reference_column)
-     adata.var['SNP_position'] = (adata.var[f'N_{reference_column}_with_position'] > 0) & (adata.var[f'N_{reference_column}_with_position'] < len(references)).astype(bool)
-
-     # Append cytosine context to the reference positions based on the conversion strand.
-     append_C_context(adata, obs_column=reference_column, use_consensus=False)
-
-     # Calculate read level methylation statistics. Assess if GpC methylation level is above other_C methylation level as a QC.
-     calculate_converted_read_methylation_stats(adata, reference_column, sample_names_col, output_directory, show_methylation_histogram=False, save_methylation_histogram=False)
-
-     # Invert the adata object (ie flip the strand orientation for visualization)
-     if invert:
-         invert_adata(adata)
-     else:
-         pass
-
-     # Calculate read length statistics, with options to display or save the read length histograms
-     upper_bound, lower_bound = calculate_read_length_stats(adata, reference_column, sample_names_col, output_directory, show_read_length_histogram=False, save_read_length_histogram=False)
-
-     variables = {
-         "short_references": short_references,
-         "binary_layers": binary_layers,
-         "sample_names": sample_names,
-         "upper_bound": upper_bound,
-         "lower_bound": lower_bound,
-         "references": references
-     }
-     return variables
-
- def recipe_2_Kissiov_and_McKenna_2025(adata, output_directory, binary_layers, hamming_distance_thresholds={}, reference_column='Reference', sample_names_col='Sample_names'):
-     """
-     The second part of the preprocessing workflow applied to the adata that has already been preprocessed by recipe_1_Kissiov_and_McKenna_2025.
-
-     Performs the following tasks:
-         1) Adds new layers containing NaN replaced variants of adata.X (fill_closest, nan0_0minus1, nan1_12).
-         2) Marks putative PCR duplicates using pairwise hamming distance metrics.
-         3) Performs a complexity analysis of the library based on the PCR duplicate detection rate.
-         4) Removes PCR duplicates from the adata.
-         5) Returns two adata objects: one for the filtered adata and one for the duplicate adata.
-
-     Parameters:
-         adata (AnnData): The AnnData object to use as input.
-         output_directory (str): String representing the path to the output directory for plots.
-         binary_layers (list): A list of layers to use for the binary encoding of read sequences. Used for duplicate detection.
-         hamming_distance_thresholds (dict): A dictionary keyed by obs_column categories that points to a float corresponding to the distance threshold to apply. Default is an empty dict.
-         reference_column (str): The name of the reference column to use.
-         sample_names_col (str): The name of the sample name column to use.
-
-     Returns:
-         filtered_adata (AnnData): An AnnData object containing the filtered reads
-         duplicates (AnnData): An AnnData object containing the duplicate reads
-     """
-     import anndata as ad
-     import pandas as pd
-     import numpy as np
-     from .clean_NaN import clean_NaN
-     from .mark_duplicates import mark_duplicates
-     from .calculate_complexity import calculate_complexity
-     from .remove_duplicates import remove_duplicates
-
-     # NaN replacement strategies stored in additional layers. Having layer=None uses adata.X
-     clean_NaN(adata, layer=None)
-
-     # Duplicate detection using pairwise hamming distance across reads
-     mark_duplicates(adata, binary_layers, obs_column=reference_column, sample_col=sample_names_col, hamming_distance_thresholds=hamming_distance_thresholds)
-
-     # Complexity analysis using the marked duplicates and the Lander-Waterman algorithm
-     calculate_complexity(adata, output_directory, obs_column=reference_column, sample_col=sample_names_col, plot=True, save_plot=False)
-
-     # Remove duplicate reads and store the duplicate reads in a new AnnData object named duplicates.
-     filtered_adata, duplicates = remove_duplicates(adata)
-     return filtered_adata, duplicates
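The two recipes were written to be run back to back. A hedged sketch of a driver script against the 0.1.3 module layout shown in the file list above; the input h5ad, sample sheet path, and output directory are assumptions:

import anndata as ad
from smftools.preprocessing.recipes import (
    recipe_1_Kissiov_and_McKenna_2025,
    recipe_2_Kissiov_and_McKenna_2025,
)

adata = ad.read_h5ad("conversion_smf.h5ad.gz")  # hypothetical output of the load_adata pipeline
variables = recipe_1_Kissiov_and_McKenna_2025(
    adata, "F1_sample_sheet.csv", "plots/", mapping_key_column="Sample", invert=False
)
filtered_adata, duplicates = recipe_2_Kissiov_and_McKenna_2025(
    adata, "plots/", list(variables["binary_layers"])
)
filtered_adata.write_h5ad("filtered.h5ad.gz", compression="gzip")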
smftools/preprocessing/remove_duplicates.py DELETED
@@ -1,21 +0,0 @@
- # remove_duplicates
-
- def remove_duplicates(adata):
-     """
-     Remove duplicates from the adata object.
-
-     Parameters:
-         adata (AnnData): An adata object.
-
-     Returns:
-         filtered_adata (AnnData): An AnnData object of the filtered reads
-         duplicates (AnnData): An AnnData object of the duplicate reads
-     """
-     import anndata as ad
-
-     initial_size = adata.shape[0]
-     filtered_adata = adata[adata.obs['Unique_in_final_read_set'] == True].copy()
-     final_size = filtered_adata.shape[0]
-     print(f'Removed {initial_size-final_size} reads from the dataset')
-     duplicates = adata[adata.obs['Unique_in_final_read_set'] == False].copy()
-     return filtered_adata, duplicates
smftools/readwrite.py DELETED
@@ -1,106 +0,0 @@
- ## readwrite ##
-
- ######################################################################################################
- ## Datetime functionality
- def date_string():
-     """
-     Each time this is called, it returns the current date string
-     """
-     from datetime import datetime
-     current_date = datetime.now()
-     date_string = current_date.strftime("%Y%m%d")
-     date_string = date_string[2:]
-     return date_string
-
- def time_string():
-     """
-     Each time this is called, it returns the current time string
-     """
-     from datetime import datetime
-     current_time = datetime.now()
-     return current_time.strftime("%H:%M:%S")
- ######################################################################################################
-
- ######################################################################################################
- ## Numpy, Pandas, AnnData functionality
- def adata_to_df(adata, layer=None):
-     """
-     Input: An adata object with a specified layer.
-     Output: A dataframe for the specific layer.
-     """
-     import pandas as pd
-     import anndata as ad
-
-     # Extract the data matrix from the given layer
-     if layer:
-         data_matrix = adata.layers[layer]
-     else:
-         data_matrix = adata.X
-     # Extract observation (read) annotations
-     obs_df = adata.obs
-     # Extract variable (position) annotations
-     var_df = adata.var
-     # Convert data matrix and annotations to pandas DataFrames
-     df = pd.DataFrame(data_matrix, index=obs_df.index, columns=var_df.index)
-     return df
-
- def save_matrix(matrix, save_name):
-     """
-     Input: A numpy matrix and a save_name
-     Output: A txt file representation of the data matrix
-     """
-     import numpy as np
-     np.savetxt(f'{save_name}.txt', matrix)
-
- def concatenate_h5ads(output_file, file_suffix='h5ad.gz', delete_inputs=True):
-     """
-     Concatenate all h5ad files in a directory and delete them after the final adata is written out.
-     Input: an output file path relative to the directory in which the function is called
-     """
-     import os
-     import anndata as ad
-     # Runtime warnings
-     import warnings
-     warnings.filterwarnings('ignore', category=UserWarning, module='anndata')
-     warnings.filterwarnings('ignore', category=FutureWarning, module='anndata')
-
-     # List all files in the directory
-     files = os.listdir(os.getcwd())
-     # Get current working directory
-     cwd = os.getcwd()
-     suffix = file_suffix
-     # Filter file names that contain the search string in their filename and keep them in a list
-     hdfs = [hdf for hdf in files if suffix in hdf]
-     # Sort file list by names and print the list of file names
-     hdfs.sort()
-     print('{0} sample files found: {1}'.format(len(hdfs), hdfs))
-     # Iterate over all of the hdf5 files and concatenate them.
-     final_adata = None
-     for hdf in hdfs:
-         print('{0}: Reading in {1} hdf5 file'.format(time_string(), hdf))
-         temp_adata = ad.read_h5ad(hdf)
-         if final_adata:
-             print('{0}: Concatenating final adata object with {1} hdf5 file'.format(time_string(), hdf))
-             final_adata = ad.concat([final_adata, temp_adata], join='outer', index_unique=None)
-         else:
-             print('{0}: Initializing final adata object with {1} hdf5 file'.format(time_string(), hdf))
-             final_adata = temp_adata
-     print('{0}: Writing final concatenated hdf5 file'.format(time_string()))
-     final_adata.write_h5ad(output_file, compression='gzip')
-
-     # Delete the individual h5ad files and only keep the final concatenated file
-     if delete_inputs:
-         files = os.listdir(os.getcwd())
-         hdfs = [hdf for hdf in files if suffix in hdf]
-         if output_file in hdfs:
-             hdfs.remove(output_file)
-         # Iterate over the files and delete them
-         for hdf in hdfs:
-             try:
-                 os.remove(hdf)
-                 print(f"Deleted file: {hdf}")
-             except OSError as e:
-                 print(f"Error deleting file {hdf}: {e}")
-     else:
-         print('Keeping input files')
- ######################################################################################################
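A brief sketch of how these helpers compose, using a toy AnnData; the 'binary' layer name is made up and the functions are assumed to be in scope from the module above:

import anndata as ad
import numpy as np
import pandas as pd

adata = ad.AnnData(
    X=np.arange(6, dtype=float).reshape(2, 3),
    obs=pd.DataFrame(index=["read1", "read2"]),
    var=pd.DataFrame(index=["pos1", "pos2", "pos3"]),
)
adata.layers["binary"] = (adata.X > 2).astype(float)

df = adata_to_df(adata, layer="binary")                    # reads as rows, positions as columns
save_matrix(df.values, f"{date_string()}_binary_matrix")   # writes e.g. 250101_binary_matrix.txt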
smftools/tools/__init__.py DELETED
File without changes
smftools/tools/apply_HMM.py DELETED
@@ -1 +0,0 @@
- # apply_HMM
smftools/tools/cluster.py DELETED
File without changes
smftools/tools/read_HMM.py DELETED
@@ -1 +0,0 @@
- # read_HMM
smftools/tools/subset_adata.py DELETED
@@ -1,32 +0,0 @@
- # subset_adata
-
- def subset_adata(adata, obs_columns):
-     """
-     Subsets an AnnData object based on categorical values in specified `.obs` columns.
-
-     Parameters:
-         adata (AnnData): The AnnData object to subset.
-         obs_columns (list of str): List of `.obs` column names to subset by. The order matters.
-
-     Returns:
-         dict: A dictionary where keys are tuples of category values and values are corresponding AnnData subsets.
-     """
-
-     def subset_recursive(adata_subset, columns):
-         if not columns:
-             return {(): adata_subset}
-
-         current_column = columns[0]
-         categories = adata_subset.obs[current_column].cat.categories
-
-         subsets = {}
-         for cat in categories:
-             subset = adata_subset[adata_subset.obs[current_column] == cat]
-             subsets.update({(cat,) + key: value for key, value in subset_recursive(subset, columns[1:]).items()})
-
-         return subsets
-
-     # Start the recursive subset process
-     subsets_dict = subset_recursive(adata, obs_columns)
-
-     return subsets_dict
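A hypothetical usage sketch: split reads by reference and then by sample (column names and categories are made up; subset_adata is assumed to be in scope):

import anndata as ad
import numpy as np
import pandas as pd

obs = pd.DataFrame({
    "Reference": pd.Categorical(["chr1", "chr1", "chr2", "chr2"]),
    "Sample_names": pd.Categorical(["s1", "s2", "s1", "s2"]),
}, index=[f"read{i}" for i in range(4)])
adata = ad.AnnData(X=np.zeros((4, 2)), obs=obs)

subsets = subset_adata(adata, ["Reference", "Sample_names"])
print(sorted(subsets))                # [('chr1', 's1'), ('chr1', 's2'), ('chr2', 's1'), ('chr2', 's2')]
print(subsets[("chr1", "s1")].n_obs)  # 1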
smftools/tools/train_HMM.py DELETED
@@ -1,43 +0,0 @@
- # train_HMM
-
- def train_HMM(adata, model_name='trained_HMM', save_hmm=False):
-     """
-     Trains a two-state (bound/unbound) hidden Markov model on the reads in adata.X.
-     Parameters:
-         adata (AnnData): Input AnnData object
-         model_name (str): Name of the model
-         save_hmm (bool): Whether to save the model
-
-     """
-     import numpy as np
-     import anndata as ad
-     from pomegranate.distributions import Categorical
-     from pomegranate.hmm import DenseHMM
-
-     bound = Categorical([[0.95, 0.05]])
-     unbound = Categorical([[0.05, 0.95]])
-
-     edges = [[0.9, 0.1], [0.1, 0.9]]
-     starts = [0.5, 0.5]
-     ends = [0.5, 0.5]
-
-     model = DenseHMM([bound, unbound], edges=edges, starts=starts, ends=ends, max_iter=5, verbose=True)
-
-     # Define training sets and labels
-     # Determine the number of reads to sample
-     n_sample = round(0.7 * adata.X.shape[0])
-     # Generate random indices
-     np.random.seed(0)
-     random_indices = np.random.choice(adata.shape[0], size=n_sample, replace=False)
-     # Subset the AnnData object using the random indices
-     training_adata_subsampled = adata[random_indices, :]
-     training_sequences = training_adata_subsampled.X
-
-     # Train the HMM without labeled data
-     model.fit(training_sequences, algorithm='baum-welch')
-
-     if save_hmm:
-         # Save the model to a file
-         model_json = model.to_json()
-         with open(f'{model_name}.json', 'w') as f:
-             f.write(model_json)
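For orientation, a minimal self-contained sketch of the same two-state bound/unbound model with the pomegranate >= 1.0 API, decoding a single toy binarized read; this is an assumption-laden illustration, not the package's own training routine (which also subsamples reads and persists the model):

import torch
from pomegranate.distributions import Categorical
from pomegranate.hmm import DenseHMM

bound = Categorical([[0.95, 0.05]])    # P(observation = 0 or 1 | bound/protected state)
unbound = Categorical([[0.05, 0.95]])  # P(observation = 0 or 1 | unbound/accessible state)
model = DenseHMM([bound, unbound], edges=[[0.9, 0.1], [0.1, 0.9]], starts=[0.5, 0.5], ends=[0.5, 0.5], max_iter=5)

# One toy read: a protected footprint followed by accessible positions; shape (n_reads, length, 1).
X = torch.tensor([[0, 0, 0, 1, 1, 1, 1]]).reshape(1, -1, 1)
model.fit(X)
print(model.predict(X))  # per-position state assignments, e.g. [[0, 0, 0, 1, 1, 1, 1]]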
smftools-0.1.3.dist-info/RECORD DELETED
@@ -1,84 +0,0 @@
- smftools/__init__.py,sha256=zy4ckT7hKrLrlm6NiZQoupvc6oSN7wJsyOBCYdzukcQ,401
- smftools/_settings.py,sha256=Ed8lzKUA5ncq5ZRfSp0t6_rphEEjMxts6guttwTZP5Y,409
- smftools/_version.py,sha256=R5TtpJu7Qu6sOarfDpp-5Oyy8Pi2Ir3VewCvsCQiAgo,21
- smftools/readwrite.py,sha256=DgVisHYdkjzaO7suPbUvluImeTc3jqGDlioNveHUxPc,4158
- smftools/datasets/F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz,sha256=q6wJtgFRDln0o20XNCx1qad3lwcdCoylqPN7wskTfI8,2926497
- smftools/datasets/F1_sample_sheet.csv,sha256=9PodIIOXK2eamYPbC6DGnXdzgi9bRDovf296j1aM0ak,259
- smftools/datasets/__init__.py,sha256=xkSTlPuakVYVCuRurif9BceNBDt6bsngJvvjI8757QI,142
- smftools/datasets/dCas9_m6A_invitro_kinetics.h5ad.gz,sha256=niOcVHaYY7h3XyvwSkN-V_NMBaRt2vTP5TrJO0CwMCs,8385050
- smftools/datasets/datasets.py,sha256=0y597Ntp707bOgDwN6O-JEt9yxgplj66p0aj6Zs_IB4,779
- smftools/informatics/__init__.py,sha256=WQiMBr1yjDrlmHg8UNgW2MJsq4fPrVfh-UBr5tYI9x4,326
- smftools/informatics/conversion_smf.py,sha256=PS-TjgMttr3VRrT0zg5L_L01xMOewB_OXSsQyoM7DWI,4333
- smftools/informatics/direct_smf.py,sha256=ue7p7deuRwaZtEh9EFV1YTE8HKRAmOsx9oaRJdjCrbY,4697
- smftools/informatics/fast5_to_pod5.py,sha256=xfdZU3QluaAcR-q2uBRz8hcBwYt73nCnrFeahvi0OKQ,704
- smftools/informatics/load_adata.py,sha256=i-2YCSaeLzbPfNtKPrLwfkv-9u_TrTAZrbtNAj3FRWY,7271
- smftools/informatics/readwrite.py,sha256=DgVisHYdkjzaO7suPbUvluImeTc3jqGDlioNveHUxPc,4158
- smftools/informatics/subsample_fasta_from_bed.py,sha256=YqYV09rvEQdeiS5hTTrKa8xYmJfeM3Vk-UUqwpw0qBk,1983
- smftools/informatics/subsample_pod5.py,sha256=zDw9tRcrFRmPI62xkcy9dh8IfsJcuYm7R-FVeBC_g3s,4701
- smftools/informatics/archived/bam_conversion.py,sha256=I8EzXjQixMmqx2oWnoNSH5NURBhfT-krbWHkoi_M964,3330
- smftools/informatics/archived/bam_direct.py,sha256=jbEFtUIiUR8Wlp3po_sWkr19AUNS9WZjglojb9j28vo,3606
- smftools/informatics/archived/basecalls_to_adata.py,sha256=-Nag6lr_NAtU4t8jo0GSMdgIAIfmDge-5VEUPQbEatE,3692
- smftools/informatics/helpers/LoadExperimentConfig.py,sha256=gsWGoa9cydwY4Kd-hTXF2gtmxc8glRRD2V1JB88e9js,2822
- smftools/informatics/helpers/__init__.py,sha256=KrfyM08_RgDf3Ajvb4KNTvcOqZiWYSIVhEznCr01Gcc,2255
- smftools/informatics/helpers/align_and_sort_BAM.py,sha256=DouG6nGWXtz2ulZD5p0sEShE-4dbPudHaWcHFm4-oJA,2184
- smftools/informatics/helpers/aligned_BAM_to_bed.py,sha256=eYkGQFSM2gPEauASkY_-9Yvy6727vP8Q4wx_st85Dpc,2638
- smftools/informatics/helpers/bed_to_bigwig.py,sha256=AazYEZzKgKgukSFwCpeiApzxh1kbt11X4RFqRIiBIaY,1466
- smftools/informatics/helpers/binarize_converted_base_identities.py,sha256=iJlDah-YJ0zx0UrlHdtgvrALVNSA0TTTdDoKmNCVg0Q,1846
- smftools/informatics/helpers/canoncall.py,sha256=M7HEqhYsWMUB0tLP3hzMM0L7PhcOTXgetl5lV3GgIaw,1062
- smftools/informatics/helpers/complement_base_list.py,sha256=k6EkLtxFoajaIufxw1p0pShJ2nPHyGLTbzZmIFFjB4o,532
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py,sha256=RXPn7e6Dcwol9tnUsfXJu3EuZcMSOJJo5LNWouovvZs,2715
- smftools/informatics/helpers/converted_BAM_to_adata.py,sha256=Rsnydzpf9lMS3TQjXpbXJSSfCzhVTPn3rBDLiK-8utA,13991
- smftools/informatics/helpers/count_aligned_reads.py,sha256=uYyUYglF1asiaoxr-LKxPMUEbfyD7FS-dumTg2hJHzQ,2170
- smftools/informatics/helpers/extract_base_identities.py,sha256=E-_m9W82N52NjX5kz9Af5YH0S2k58hnq9KTrm4S5vgM,4370
- smftools/informatics/helpers/extract_mods.py,sha256=UBFjXDKz_A6ivjcocYT1_pKjvygY2Fdg0RjQmMS8UuA,2269
- smftools/informatics/helpers/extract_readnames_from_BAM.py,sha256=3FxSNqbZ1VsOK2RfHrvevQTzhWATf5E8bZ5yVOqayvk,759
- smftools/informatics/helpers/find_conversion_sites.py,sha256=5AghDQzEoSvE2Og98VsKoeWUFSLnIGY1LnRu1BtQavM,3700
- smftools/informatics/helpers/generate_converted_FASTA.py,sha256=ueaAsFnBuc7zKwkBivBR3DJg4DtkxkHHIQcVVSWzv-w,5161
- smftools/informatics/helpers/get_chromosome_lengths.py,sha256=sLumLrGsU_Xg_oJcdOpQyjUGpJoT2HbcmxWwbwzXUlE,1036
- smftools/informatics/helpers/get_native_references.py,sha256=fRuyEm9UJkfd5DwHmFb1bxEtNvtSI1_BxGRmrCymGkw,981
- smftools/informatics/helpers/index_fasta.py,sha256=N3IErfSiavYldeaat8xcQgA1MpykoQHcE0gHUeWuClE,267
- smftools/informatics/helpers/make_dirs.py,sha256=lWHXpwC76MFM5sSme9i_WeYUaxutzybendokhny03ds,537
- smftools/informatics/helpers/make_modbed.py,sha256=cOQ97gPfRiCcw_fqboxousXIiOYjp78IFYLbu749U1Y,939
- smftools/informatics/helpers/modQC.py,sha256=LeOBObG8gAVVdgESIMceYhd5AW1gfN7ABo91OQtOzTM,1041
- smftools/informatics/helpers/modcall.py,sha256=9PH7Peq4y-VBqQcMkbv0TwgePBlD5aM4_FmI7H4hbQQ,1142
- smftools/informatics/helpers/modkit_extract_to_adata.py,sha256=duPlRAIz4VWM-jm9iaLY7N6JHQcun_L0nhr2VyUjNTI,38184
- smftools/informatics/helpers/ohe_batching.py,sha256=_Mz2p1We5PVIb8S6Hbq_hREKJ9mGQiADwfFK_NgMGhA,1909
- smftools/informatics/helpers/one_hot_encode.py,sha256=hpZAuwa9ndkhyCm9sO65KVHE0lbFDKqRylfliEKyD4o,632
- smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py,sha256=tAnXFleGzXJNjHRAgZ0NUJuZ0P3aKmUYIrK-V9VoJKY,1860
- smftools/informatics/helpers/separate_bam_by_bc.py,sha256=Fsi8OEmv5Ny13cWoHVV9JmEjVFEXT_ZxbBOlRdmyPbE,1742
- smftools/informatics/helpers/split_and_index_BAM.py,sha256=_TFJ8fcLbIf37JG83hSc1zgs1yxX70-NhA8y-PbhTpo,1966
- smftools/informatics/helpers/archived/informatics.py,sha256=gKb2ZJ_LcAeEXuQqn9e-QDF_sS4tMpMTr2vZlqa7n54,14572
- smftools/informatics/helpers/archived/load_adata.py,sha256=DhvYYqO9VLsZqhL1WjN9sd-e3fgvdXGlgTP18z1h0L0,33654
- smftools/plotting/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- smftools/preprocessing/__init__.py,sha256=5FQNrj51KmaDLeAGGBA8iWMkYiSOe7O91ES8mT4aVtE,1399
- smftools/preprocessing/append_C_context.py,sha256=pP5u9o5U4JmHras0PK6yas65u4-U5KlX3sKLb-duo80,3728
- smftools/preprocessing/binarize_on_Youden.py,sha256=slkkt56DZ1FZWy8Un5mNJEZ49JlPnPKow2zU4GoHEr8,2303
- smftools/preprocessing/binary_layers_to_ohe.py,sha256=931eHuVda6pMZTvC7jVTKkY2a_KQWpSfgi-nkA5NmaI,1238
- smftools/preprocessing/calculate_complexity.py,sha256=ut60et8bmIswtiLhctJWHNseIV4ZRQultYdtJPHcRPs,3224
- smftools/preprocessing/calculate_consensus.py,sha256=6zRpRmb2xdfDu5hctZrReALRb7Pjn8sy8xJZTm3o0nU,2442
- smftools/preprocessing/calculate_converted_read_methylation_stats.py,sha256=Si0DcES0lLMvg3XgdKpedxfPnXQ14tEFKrOAFRn3fHs,6059
- smftools/preprocessing/calculate_coverage.py,sha256=ZgRxQGpydxQg1exkvSiy8nHmzDIPGGqL5vL9XQ2PZQ4,2068
- smftools/preprocessing/calculate_pairwise_hamming_distances.py,sha256=e5Mzyex7pT29H2PY014uU4Fi_eewbut1JkzC1ffBbCg,961
- smftools/preprocessing/calculate_position_Youden.py,sha256=mfQ6nFfUaEaKg_icyHA1zZlhh0wHjpLE56BZDXOdP_4,6364
- smftools/preprocessing/calculate_read_length_stats.py,sha256=6m362JaCKlD0QoBUMnM2qsB6Jo_4shl7xFzqU1uZccU,4945
- smftools/preprocessing/clean_NaN.py,sha256=1vieT026p0gDJCbqB_CiLvAGGxlc-5xufoKJgZuBFFk,1150
- smftools/preprocessing/filter_converted_reads_on_methylation.py,sha256=SN5q0rqYtYW9j3i0sVSyTv9EmR_uLKI7GkjmJixeOU0,1307
- smftools/preprocessing/filter_reads_on_length.py,sha256=sAT66bjuI8ZtXyQc9SuPzq1dPIB1CNVx6VfWqVng4Dg,2191
- smftools/preprocessing/invert_adata.py,sha256=u6Y70EH0B5mXb9-HuukIlzpMgZ6rhzcJuy3YZZTx3SA,684
- smftools/preprocessing/load_sample_sheet.py,sha256=uGjzG9x-1t_1lCooH85P8Tfg80GdvVx8Jv1LPl9XNFM,915
- smftools/preprocessing/make_dirs.py,sha256=lWHXpwC76MFM5sSme9i_WeYUaxutzybendokhny03ds,537
- smftools/preprocessing/mark_duplicates.py,sha256=sQuPcTw8JsQoONOk-kMlAF965sIk2Pu-M7rIyfbyGGs,8145
- smftools/preprocessing/min_non_diagonal.py,sha256=hx1asW8CEmLaIroZISW8EcAf_RnBEC_nofGD8QG0b1E,711
- smftools/preprocessing/recipes.py,sha256=KzSw5JW0WJGzSis5Fm7moQY5PxOYl6-uYYf1NDj6nOE,7117
- smftools/preprocessing/remove_duplicates.py,sha256=Erooi5_1VOUNfWpzddzmMNYMCl1U1jJryt7ZtMhabAs,699
- smftools/preprocessing/archives/preprocessing.py,sha256=4mLT09A7vwRZ78FHmuwtv38mH9TQ9qrZc_WjHRhhkIw,34379
- smftools/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- smftools/tools/apply_HMM.py,sha256=AuVtOki69-Xs4mhjhTXJzd49KCVXwixFyWSUgDjtR6s,11
- smftools/tools/cluster.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- smftools/tools/read_HMM.py,sha256=N0MGG494VjlxYJcCVz1jN4OasGtRITZS98SJ2xB_j8k,10
- smftools/tools/subset_adata.py,sha256=qyU9iCal03edb5aUS3AZ2U4TlL3uQ42jGI9hX3QF7Fc,1047
- smftools/tools/train_HMM.py,sha256=x5ZcXj-heWQqDOX86nuuDoj1tPkYKl04fYA1fCKNQ0c,1380
- smftools-0.1.3.dist-info/METADATA,sha256=u26Og8tpAF2TgXZztotk3Q4EuP7Fvf73s1tlIjBDD-A,6410
- smftools-0.1.3.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
- smftools-0.1.3.dist-info/licenses/LICENSE,sha256=F8LwmL6vMPddaCt1z1S83Kh_OZv50alTlY7BvVx1RXw,1066
- smftools-0.1.3.dist-info/RECORD,,