smftools 0.1.7__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (174)
  1. smftools/__init__.py +7 -6
  2. smftools/_version.py +1 -1
  3. smftools/cli/cli_flows.py +94 -0
  4. smftools/cli/hmm_adata.py +338 -0
  5. smftools/cli/load_adata.py +577 -0
  6. smftools/cli/preprocess_adata.py +363 -0
  7. smftools/cli/spatial_adata.py +564 -0
  8. smftools/cli_entry.py +435 -0
  9. smftools/config/__init__.py +1 -0
  10. smftools/config/conversion.yaml +38 -0
  11. smftools/config/deaminase.yaml +61 -0
  12. smftools/config/default.yaml +264 -0
  13. smftools/config/direct.yaml +41 -0
  14. smftools/config/discover_input_files.py +115 -0
  15. smftools/config/experiment_config.py +1288 -0
  16. smftools/hmm/HMM.py +1576 -0
  17. smftools/hmm/__init__.py +20 -0
  18. smftools/{tools → hmm}/apply_hmm_batched.py +8 -7
  19. smftools/hmm/call_hmm_peaks.py +106 -0
  20. smftools/{tools → hmm}/display_hmm.py +3 -3
  21. smftools/{tools → hmm}/nucleosome_hmm_refinement.py +2 -2
  22. smftools/{tools → hmm}/train_hmm.py +1 -1
  23. smftools/informatics/__init__.py +13 -9
  24. smftools/informatics/archived/deaminase_smf.py +132 -0
  25. smftools/informatics/archived/fast5_to_pod5.py +43 -0
  26. smftools/informatics/archived/helpers/archived/__init__.py +71 -0
  27. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
  28. smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +87 -0
  29. smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
  30. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
  31. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
  32. smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
  33. smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
  34. smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +30 -4
  35. smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
  36. smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +4 -2
  37. smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +5 -4
  38. smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
  39. smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
  40. smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
  41. smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
  42. smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
  43. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +250 -0
  44. smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +8 -7
  45. smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +8 -12
  46. smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
  47. smftools/informatics/bam_functions.py +812 -0
  48. smftools/informatics/basecalling.py +67 -0
  49. smftools/informatics/bed_functions.py +366 -0
  50. smftools/informatics/binarize_converted_base_identities.py +172 -0
  51. smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +198 -50
  52. smftools/informatics/fasta_functions.py +255 -0
  53. smftools/informatics/h5ad_functions.py +197 -0
  54. smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +147 -61
  55. smftools/informatics/modkit_functions.py +129 -0
  56. smftools/informatics/ohe.py +160 -0
  57. smftools/informatics/pod5_functions.py +224 -0
  58. smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
  59. smftools/machine_learning/__init__.py +12 -0
  60. smftools/machine_learning/data/__init__.py +2 -0
  61. smftools/machine_learning/data/anndata_data_module.py +234 -0
  62. smftools/machine_learning/evaluation/__init__.py +2 -0
  63. smftools/machine_learning/evaluation/eval_utils.py +31 -0
  64. smftools/machine_learning/evaluation/evaluators.py +223 -0
  65. smftools/machine_learning/inference/__init__.py +3 -0
  66. smftools/machine_learning/inference/inference_utils.py +27 -0
  67. smftools/machine_learning/inference/lightning_inference.py +68 -0
  68. smftools/machine_learning/inference/sklearn_inference.py +55 -0
  69. smftools/machine_learning/inference/sliding_window_inference.py +114 -0
  70. smftools/machine_learning/models/base.py +295 -0
  71. smftools/machine_learning/models/cnn.py +138 -0
  72. smftools/machine_learning/models/lightning_base.py +345 -0
  73. smftools/machine_learning/models/mlp.py +26 -0
  74. smftools/{tools → machine_learning}/models/positional.py +3 -2
  75. smftools/{tools → machine_learning}/models/rnn.py +2 -1
  76. smftools/machine_learning/models/sklearn_models.py +273 -0
  77. smftools/machine_learning/models/transformer.py +303 -0
  78. smftools/machine_learning/training/__init__.py +2 -0
  79. smftools/machine_learning/training/train_lightning_model.py +135 -0
  80. smftools/machine_learning/training/train_sklearn_model.py +114 -0
  81. smftools/plotting/__init__.py +4 -1
  82. smftools/plotting/autocorrelation_plotting.py +609 -0
  83. smftools/plotting/general_plotting.py +1292 -140
  84. smftools/plotting/hmm_plotting.py +260 -0
  85. smftools/plotting/qc_plotting.py +270 -0
  86. smftools/preprocessing/__init__.py +15 -8
  87. smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
  88. smftools/preprocessing/append_base_context.py +122 -0
  89. smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
  90. smftools/preprocessing/binarize.py +17 -0
  91. smftools/preprocessing/binarize_on_Youden.py +2 -2
  92. smftools/preprocessing/calculate_complexity_II.py +248 -0
  93. smftools/preprocessing/calculate_coverage.py +10 -1
  94. smftools/preprocessing/calculate_position_Youden.py +1 -1
  95. smftools/preprocessing/calculate_read_modification_stats.py +101 -0
  96. smftools/preprocessing/clean_NaN.py +17 -1
  97. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
  98. smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
  99. smftools/preprocessing/flag_duplicate_reads.py +1326 -124
  100. smftools/preprocessing/invert_adata.py +12 -5
  101. smftools/preprocessing/load_sample_sheet.py +19 -4
  102. smftools/readwrite.py +1021 -89
  103. smftools/tools/__init__.py +3 -32
  104. smftools/tools/calculate_umap.py +5 -5
  105. smftools/tools/general_tools.py +3 -3
  106. smftools/tools/position_stats.py +468 -106
  107. smftools/tools/read_stats.py +115 -1
  108. smftools/tools/spatial_autocorrelation.py +562 -0
  109. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/METADATA +14 -9
  110. smftools-0.2.3.dist-info/RECORD +173 -0
  111. smftools-0.2.3.dist-info/entry_points.txt +2 -0
  112. smftools/informatics/fast5_to_pod5.py +0 -21
  113. smftools/informatics/helpers/LoadExperimentConfig.py +0 -75
  114. smftools/informatics/helpers/__init__.py +0 -74
  115. smftools/informatics/helpers/align_and_sort_BAM.py +0 -59
  116. smftools/informatics/helpers/aligned_BAM_to_bed.py +0 -74
  117. smftools/informatics/helpers/bam_qc.py +0 -66
  118. smftools/informatics/helpers/bed_to_bigwig.py +0 -39
  119. smftools/informatics/helpers/binarize_converted_base_identities.py +0 -79
  120. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -55
  121. smftools/informatics/helpers/index_fasta.py +0 -12
  122. smftools/informatics/helpers/make_dirs.py +0 -21
  123. smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +0 -53
  124. smftools/informatics/load_adata.py +0 -182
  125. smftools/informatics/readwrite.py +0 -106
  126. smftools/informatics/subsample_fasta_from_bed.py +0 -47
  127. smftools/preprocessing/append_C_context.py +0 -82
  128. smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -94
  129. smftools/preprocessing/filter_converted_reads_on_methylation.py +0 -44
  130. smftools/preprocessing/filter_reads_on_length.py +0 -51
  131. smftools/tools/call_hmm_peaks.py +0 -105
  132. smftools/tools/data/__init__.py +0 -2
  133. smftools/tools/data/anndata_data_module.py +0 -90
  134. smftools/tools/inference/__init__.py +0 -1
  135. smftools/tools/inference/lightning_inference.py +0 -41
  136. smftools/tools/models/base.py +0 -14
  137. smftools/tools/models/cnn.py +0 -34
  138. smftools/tools/models/lightning_base.py +0 -41
  139. smftools/tools/models/mlp.py +0 -17
  140. smftools/tools/models/sklearn_models.py +0 -40
  141. smftools/tools/models/transformer.py +0 -133
  142. smftools/tools/training/__init__.py +0 -1
  143. smftools/tools/training/train_lightning_model.py +0 -47
  144. smftools-0.1.7.dist-info/RECORD +0 -136
  145. /smftools/{tools/evaluation → cli}/__init__.py +0 -0
  146. /smftools/{tools → hmm}/calculate_distances.py +0 -0
  147. /smftools/{tools → hmm}/hmm_readwrite.py +0 -0
  148. /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
  149. /smftools/informatics/{conversion_smf.py → archived/conversion_smf.py} +0 -0
  150. /smftools/informatics/{direct_smf.py → archived/direct_smf.py} +0 -0
  151. /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
  152. /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
  153. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
  154. /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
  155. /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
  156. /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
  157. /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
  158. /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
  159. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
  160. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
  161. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
  162. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
  163. /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
  164. /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
  165. /smftools/{tools → machine_learning}/data/preprocessing.py +0 -0
  166. /smftools/{tools → machine_learning}/models/__init__.py +0 -0
  167. /smftools/{tools → machine_learning}/models/wrappers.py +0 -0
  168. /smftools/{tools → machine_learning}/utils/__init__.py +0 -0
  169. /smftools/{tools → machine_learning}/utils/device.py +0 -0
  170. /smftools/{tools → machine_learning}/utils/grl.py +0 -0
  171. /smftools/tools/{apply_hmm.py → archived/apply_hmm.py} +0 -0
  172. /smftools/tools/{classifiers.py → archived/classifiers.py} +0 -0
  173. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/WHEEL +0 -0
  174. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/licenses/LICENSE +0 -0
@@ -1,82 +0,0 @@
- ## append_C_context
-
- ## Conversion SMF Specific
- # Read methylation QC
- def append_C_context(adata, obs_column='Reference', use_consensus=False, native=False):
-     """
-     Adds Cytosine context to the position within the given category. When use_consensus is True, it uses the consensus sequence, otherwise it defaults to the FASTA sequence.
-
-     Parameters:
-         adata (AnnData): The input adata object.
-         obs_column (str): The observation column to stratify on. Default is 'Reference', which should not be changed for most purposes.
-         use_consensus (bool): Whether to use the consensus sequence from the reads mapped to the reference. If False, the reference FASTA is used instead.
-         native (bool): If False, perform conversion SMF assumptions. If True, perform native SMF assumptions.
-
-     Returns:
-         None
-     """
-     import numpy as np
-     import anndata as ad
-
-     print('Adding Cytosine context based on reference FASTA sequence for sample')
-
-     site_types = ['GpC_site', 'CpG_site', 'ambiguous_GpC_CpG_site', 'other_C', 'any_C_site']
-     categories = adata.obs[obs_column].cat.categories
-     for cat in categories:
-         # Assess if the strand is the top or bottom strand converted
-         if 'top' in cat:
-             strand = 'top'
-         elif 'bottom' in cat:
-             strand = 'bottom'
-
-         if native:
-             basename = cat.split(f"_{strand}")[0]
-             if use_consensus:
-                 sequence = adata.uns[f'{basename}_consensus_sequence']
-             else:
-                 # This sequence is the unconverted FASTA sequence of the original input FASTA for the locus
-                 sequence = adata.uns[f'{basename}_FASTA_sequence']
-         else:
-             basename = cat.split(f"_{strand}")[0]
-             if use_consensus:
-                 sequence = adata.uns[f'{basename}_consensus_sequence']
-             else:
-                 # This sequence is the unconverted FASTA sequence of the original input FASTA for the locus
-                 sequence = adata.uns[f'{basename}_FASTA_sequence']
-         # Init a dict keyed by reference site type that points to a bool of whether the position is that site type.
-         boolean_dict = {}
-         for site_type in site_types:
-             boolean_dict[f'{cat}_{site_type}'] = np.full(len(sequence), False, dtype=bool)
-
-         if strand == 'top':
-             # Iterate through the sequence and apply the criteria
-             for i in range(1, len(sequence) - 1):
-                 if sequence[i] == 'C':
-                     boolean_dict[f'{cat}_any_C_site'][i] = True
-                     if sequence[i - 1] == 'G' and sequence[i + 1] != 'G':
-                         boolean_dict[f'{cat}_GpC_site'][i] = True
-                     elif sequence[i - 1] == 'G' and sequence[i + 1] == 'G':
-                         boolean_dict[f'{cat}_ambiguous_GpC_CpG_site'][i] = True
-                     elif sequence[i - 1] != 'G' and sequence[i + 1] == 'G':
-                         boolean_dict[f'{cat}_CpG_site'][i] = True
-                     elif sequence[i - 1] != 'G' and sequence[i + 1] != 'G':
-                         boolean_dict[f'{cat}_other_C'][i] = True
-         elif strand == 'bottom':
-             # Iterate through the sequence and apply the criteria
-             for i in range(1, len(sequence) - 1):
-                 if sequence[i] == 'G':
-                     boolean_dict[f'{cat}_any_C_site'][i] = True
-                     if sequence[i + 1] == 'C' and sequence[i - 1] != 'C':
-                         boolean_dict[f'{cat}_GpC_site'][i] = True
-                     elif sequence[i - 1] == 'C' and sequence[i + 1] == 'C':
-                         boolean_dict[f'{cat}_ambiguous_GpC_CpG_site'][i] = True
-                     elif sequence[i - 1] == 'C' and sequence[i + 1] != 'C':
-                         boolean_dict[f'{cat}_CpG_site'][i] = True
-                     elif sequence[i - 1] != 'C' and sequence[i + 1] != 'C':
-                         boolean_dict[f'{cat}_other_C'][i] = True
-         else:
-             print('Error: top or bottom strand of conversion could not be determined. Ensure this value is in the Reference name.')
-
-         for site_type in site_types:
-             adata.var[f'{cat}_{site_type}'] = boolean_dict[f'{cat}_{site_type}'].astype(bool)
-             adata.obsm[f'{cat}_{site_type}'] = adata[:, adata.var[f'{cat}_{site_type}'] == True].X
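For context on how the removed helper was invoked, a minimal usage sketch (illustrative only; the file path and key names are placeholders, and it assumes an AnnData whose `obs['Reference']` categories contain 'top'/'bottom' and whose `uns` holds the `*_FASTA_sequence` entries the function reads):

    import anndata as ad
    adata = ad.read_h5ad("conversion_smf.h5ad")  # placeholder path
    append_C_context(adata, obs_column='Reference', use_consensus=False, native=False)
    # Per-category boolean site masks land in adata.var and per-read matrices in adata.obsm,
    # e.g. adata.var['<reference>_top_GpC_site'] and adata.obsm['<reference>_top_GpC_site']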
@@ -1,94 +0,0 @@
- ## calculate_converted_read_methylation_stats
-
- ## Conversion SMF Specific
- # Read methylation QC
-
- def calculate_converted_read_methylation_stats(adata, reference_column, sample_names_col):
-     """
-     Adds methylation statistics for each read. Indicates whether the read GpC methylation exceeded other_C methylation (background false positives).
-
-     Parameters:
-         adata (AnnData): An adata object
-         reference_column (str): String representing the name of the Reference column to use
-         sample_names_col (str): String representing the name of the sample name column to use
-
-     Returns:
-         None
-     """
-     import numpy as np
-     import anndata as ad
-     import pandas as pd
-
-     print('Calculating read level methylation statistics')
-
-     references = set(adata.obs[reference_column])
-     sample_names = set(adata.obs[sample_names_col])
-
-     site_types = ['GpC_site', 'CpG_site', 'ambiguous_GpC_CpG_site', 'other_C']
-
-     for site_type in site_types:
-         adata.obs[f'{site_type}_row_methylation_sums'] = pd.Series(0, index=adata.obs_names, dtype=int)
-         adata.obs[f'{site_type}_row_methylation_means'] = pd.Series(np.nan, index=adata.obs_names, dtype=float)
-         adata.obs[f'number_valid_{site_type}_in_read'] = pd.Series(0, index=adata.obs_names, dtype=int)
-         adata.obs[f'fraction_valid_{site_type}_in_range'] = pd.Series(np.nan, index=adata.obs_names, dtype=float)
-     for cat in references:
-         cat_subset = adata[adata.obs[reference_column] == cat].copy()
-         for site_type in site_types:
-             print(f'Iterating over {cat}_{site_type}')
-             observation_matrix = cat_subset.obsm[f'{cat}_{site_type}']
-             number_valid_positions_in_read = np.nansum(~np.isnan(observation_matrix), axis=1)
-             row_methylation_sums = np.nansum(observation_matrix, axis=1)
-             number_valid_positions_in_read[number_valid_positions_in_read == 0] = 1
-             fraction_valid_positions_in_range = number_valid_positions_in_read / np.max(number_valid_positions_in_read)
-             row_methylation_means = np.divide(row_methylation_sums, number_valid_positions_in_read)
-             temp_obs_data = pd.DataFrame({f'number_valid_{site_type}_in_read': number_valid_positions_in_read,
-                                           f'fraction_valid_{site_type}_in_range': fraction_valid_positions_in_range,
-                                           f'{site_type}_row_methylation_sums': row_methylation_sums,
-                                           f'{site_type}_row_methylation_means': row_methylation_means}, index=cat_subset.obs.index)
-             adata.obs.update(temp_obs_data)
-     # Indicate whether the read-level GpC methylation rate exceeds the false methylation rate of the read
-     pass_array = np.array(adata.obs[f'GpC_site_row_methylation_means'] > adata.obs[f'other_C_row_methylation_means'])
-     adata.obs['GpC_above_other_C'] = pd.Series(pass_array, index=adata.obs.index, dtype=bool)
-
-     # Below should be a plotting function
-     # adata.uns['methylation_dict'] = {}
-     # n_bins = 50
-     # site_types_to_analyze = ['GpC_site', 'CpG_site', 'ambiguous_GpC_CpG_site', 'other_C']
-
-     # for reference in references:
-     #     reference_adata = adata[adata.obs[reference_column] == reference].copy()
-     #     split_reference = reference.split('_')[0][1:]
-     #     for sample in sample_names:
-     #         sample_adata = reference_adata[reference_adata.obs[sample_names_col] == sample].copy()
-     #         for site_type in site_types_to_analyze:
-     #             methylation_data = sample_adata.obs[f'{site_type}_row_methylation_means']
-     #             max_meth = np.max(sample_adata.obs[f'{site_type}_row_methylation_sums'])
-     #             if not np.isnan(max_meth):
-     #                 n_bins = int(max_meth // 2)
-     #             else:
-     #                 n_bins = 1
-     #             mean = np.mean(methylation_data)
-     #             median = np.median(methylation_data)
-     #             stdev = np.std(methylation_data)
-     #             adata.uns['methylation_dict'][f'{reference}_{sample}_{site_type}'] = [mean, median, stdev]
-     #             if show_methylation_histogram or save_methylation_histogram:
-     #                 fig, ax = plt.subplots(figsize=(6, 4))
-     #                 count, bins, patches = plt.hist(methylation_data, bins=n_bins, weights=np.ones(len(methylation_data)) / len(methylation_data), alpha=0.7, color='blue', edgecolor='black')
-     #                 plt.axvline(median, color='red', linestyle='dashed', linewidth=1)
-     #                 plt.text(median + stdev, max(count)*0.8, f'Median: {median:.2f}', color='red')
-     #                 plt.axvline(median - stdev, color='green', linestyle='dashed', linewidth=1, label=f'Stdev: {stdev:.2f}')
-     #                 plt.axvline(median + stdev, color='green', linestyle='dashed', linewidth=1)
-     #                 plt.text(median + stdev + 0.05, max(count) / 3, f'+1 Stdev: {stdev:.2f}', color='green')
-     #                 plt.xlabel('Fraction methylated')
-     #                 plt.ylabel('Proportion')
-     #                 title = f'Distribution of {methylation_data.shape[0]} read {site_type} methylation means \nfor {sample} sample on {split_reference} after filtering'
-     #                 plt.title(title, pad=20)
-     #                 plt.xlim(-0.05, 1.05)  # Set x-axis range from 0 to 1
-     #                 ax.spines['right'].set_visible(False)
-     #                 ax.spines['top'].set_visible(False)
-     #                 save_name = output_directory + f'/{readwrite.date_string()} {title}'
-     #                 if save_methylation_histogram:
-     #                     plt.savefig(save_name, bbox_inches='tight', pad_inches=0.1)
-     #                     plt.close()
-     #                 else:
-     #                     plt.show()
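A hedged sketch of the expected call order, since this helper reads the per-category obsm matrices written by append_C_context (the sample-name column is a placeholder):

    append_C_context(adata, obs_column='Reference')
    calculate_converted_read_methylation_stats(adata, reference_column='Reference',
                                               sample_names_col='Sample_names')  # placeholder column name
    # Adds per-read QC columns such as adata.obs['GpC_site_row_methylation_means']
    # and the boolean adata.obs['GpC_above_other_C'] used by the filtering step below.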
@@ -1,44 +0,0 @@
- ## filter_converted_reads_on_methylation
-
- ## Conversion SMF Specific
- def filter_converted_reads_on_methylation(adata, valid_SMF_site_threshold=0.8, min_SMF_threshold=0.025, max_SMF_threshold=0.975):
-     """
-     Filter adata object using minimum thresholds for valid SMF site fraction in read, as well as minimum methylation content in read.
-
-     Parameters:
-         adata (AnnData): An adata object.
-         valid_SMF_site_threshold (float): A minimum proportion of valid SMF sites that must be present in the read. Default is 0.8
-         min_SMF_threshold (float): A minimum read methylation level. Default is 0.025
-         max_SMF_threshold (float): A maximum read methylation level. Default is 0.975
-
-     Returns:
-         AnnData
-     """
-     import numpy as np
-     import anndata as ad
-     import pandas as pd
-
-     if valid_SMF_site_threshold:
-         # Keep reads that have over a given valid GpC site content
-         adata = adata[adata.obs['fraction_valid_GpC_site_in_range'] > valid_SMF_site_threshold].copy()
-
-     if min_SMF_threshold:
-         # Keep reads with SMF methylation over background methylation.
-         below_background = (~adata.obs['GpC_above_other_C']).sum()
-         print(f'Removing {below_background} reads that have GpC conversion below background conversion rate')
-         adata = adata[adata.obs['GpC_above_other_C'] == True].copy()
-         # Keep reads over a defined methylation threshold
-         s0 = adata.shape[0]
-         adata = adata[adata.obs['GpC_site_row_methylation_means'] > min_SMF_threshold].copy()
-         s1 = adata.shape[0]
-         below_threshold = s0 - s1
-         print(f'Removing {below_threshold} reads that have GpC conversion below a minimum threshold conversion rate')
-
-     if max_SMF_threshold:
-         # Keep reads below a defined methylation threshold
-         s0 = adata.shape[0]
-         adata = adata[adata.obs['GpC_site_row_methylation_means'] < max_SMF_threshold].copy()
-         s1 = adata.shape[0]
-         above_threshold = s0 - s1
-         print(f'Removing {above_threshold} reads that have GpC conversion above a maximum threshold conversion rate')
-
-     return adata
-
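Illustrative usage, assuming the QC columns from calculate_converted_read_methylation_stats are already present; the thresholds shown are simply the defaults from the signature:

    adata = filter_converted_reads_on_methylation(
        adata,
        valid_SMF_site_threshold=0.8,   # keep reads with >80% of GpC sites observed
        min_SMF_threshold=0.025,        # drop reads with near-zero GpC methylation
        max_SMF_threshold=0.975,        # drop reads that are essentially fully methylated
    )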
@@ -1,51 +0,0 @@
- ## filter_reads_on_length
-
- def filter_reads_on_length(adata, filter_on_coordinates=False, min_read_length=2700, max_read_length=3200):
-     """
-     Filters the adata object to keep a defined coordinate window, as well as reads that are over a minimum threshold in length.
-
-     Parameters:
-         adata (AnnData): An adata object.
-         filter_on_coordinates (bool | list): If False, skips filtering. Otherwise, provide a list containing integers representing the lower and upper bound coordinates to filter on. Default is False.
-         min_read_length (int): The minimum read length to keep in the filtered dataset. Default is 2700.
-         max_read_length (int): The maximum query read length to keep in the filtered dataset. Default is 3200.
-
-     Returns:
-         adata
-     """
-     import numpy as np
-     import anndata as ad
-     import pandas as pd
-
-     if filter_on_coordinates:
-         lower_bound, upper_bound = filter_on_coordinates
-         # Extract the position information from the adata object as an np array
-         var_names_arr = adata.var_names.astype(int).to_numpy()
-         # Find the upper bound coordinate that is closest to the specified value
-         closest_end_index = np.argmin(np.abs(var_names_arr - upper_bound))
-         upper_bound = int(adata.var_names[closest_end_index])
-         # Find the lower bound coordinate that is closest to the specified value
-         closest_start_index = np.argmin(np.abs(var_names_arr - lower_bound))
-         lower_bound = int(adata.var_names[closest_start_index])
-         # Get a list of positional indexes that encompass the lower and upper bounds of the dataset
-         position_list = list(range(lower_bound, upper_bound + 1))
-         position_list = [str(pos) for pos in position_list]
-         position_set = set(position_list)
-         print(f'Subsetting adata to keep data between coordinates {lower_bound} and {upper_bound}')
-         adata = adata[:, adata.var_names.isin(position_set)].copy()
-
-     if min_read_length:
-         print(f'Subsetting adata to keep reads longer than {min_read_length}')
-         s0 = adata.shape[0]
-         adata = adata[adata.obs['read_length'] > min_read_length].copy()
-         s1 = adata.shape[0]
-         print(f'Removed {s0-s1} reads less than {min_read_length} basepairs in length')
-
-     if max_read_length:
-         print(f'Subsetting adata to keep reads shorter than {max_read_length}')
-         s0 = adata.shape[0]
-         adata = adata[adata.obs['read_length'] < max_read_length].copy()
-         s1 = adata.shape[0]
-         print(f'Removed {s0-s1} reads greater than {max_read_length} basepairs in length')
-
-     return adata
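A short usage sketch; the coordinate window and length bounds below are example values, not recommendations:

    # Keep a ~3 kb window and reads between 2700 and 3200 bp (example values)
    adata = filter_reads_on_length(
        adata,
        filter_on_coordinates=[1000, 4000],
        min_read_length=2700,
        max_read_length=3200,
    )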
@@ -1,105 +0,0 @@
- def call_hmm_peaks(adata, feature_configs, obs_column='Reference_strand', site_types=['GpC_site', 'CpG_site'], save_plot=False, output_dir=None, date_tag=None):
-     """
-     Calls peaks from HMM feature layers and annotates them into the AnnData object.
-
-     Parameters:
-         adata : AnnData object with HMM layers (from apply_hmm)
-         feature_configs : dict
-             min_distance : minimum distance between peaks
-             peak_width : window size around peak centers
-             peak_prominence : required peak prominence
-             peak_threshold : threshold for labeling a read as "present" at a peak
-         site_types : list of var site types to aggregate
-         save_plot : whether to save the plot
-         output_dir : path to save the figure if save_plot=True
-         date_tag : optional tag for filename
-     """
-     import matplotlib.pyplot as plt
-     from scipy.signal import find_peaks
-     import os
-     import numpy as np
-
-     peak_columns = []
-
-     for feature_layer, config in feature_configs.items():
-         min_distance = config.get('min_distance', 200)
-         peak_width = config.get('peak_width', 200)
-         peak_prominence = config.get('peak_prominence', 0.2)
-         peak_threshold = config.get('peak_threshold', 0.8)
-
-         # 1️⃣ Calculate mean intensity profile
-         matrix = adata.layers[feature_layer]
-         means = np.mean(matrix, axis=0)
-         feature_peak_columns = []
-
-         # 2️⃣ Peak calling
-         peak_centers, _ = find_peaks(means, prominence=peak_prominence, distance=min_distance)
-         adata.uns[f'{feature_layer} peak_centers'] = peak_centers
-
-         # 3️⃣ Plot
-         plt.figure(figsize=(6, 3))
-         plt.plot(range(len(means)), means)
-         plt.title(f"{feature_layer} density with peak calls")
-         plt.xlabel("Genomic position")
-         plt.ylabel("Mean feature density")
-         y = max(means) / 2
-         for i, center in enumerate(peak_centers):
-             plus_minus_width = peak_width // 2
-             start = center - plus_minus_width
-             end = center + plus_minus_width
-             plt.axvspan(start, end, color='purple', alpha=0.2)
-             plt.axvline(center, color='red', linestyle='--')
-             if i % 2:
-                 aligned = [end, 'left']
-             else:
-                 aligned = [start, 'right']
-             plt.text(aligned[0], 0, f"Peak {i}\n{center}", color='red', ha=aligned[1])
-
-         if save_plot and output_dir:
-             filename = f"{output_dir}/{date_tag or 'output'}_{feature_layer}_peaks.png"
-             plt.savefig(filename, bbox_inches='tight')
-             print(f"Saved plot to {filename}")
-         else:
-             plt.show()
-
-         # 4️⃣ Annotate peaks back into adata.obs
-         for center in peak_centers:
-             half_width = peak_width // 2
-             start, end = center - half_width, center + half_width
-             colname = f'{feature_layer}_peak_{center}'
-             peak_columns.append(colname)
-             feature_peak_columns.append(colname)
-
-             adata.var[colname] = (
-                 (adata.var_names.astype(int) >= start) &
-                 (adata.var_names.astype(int) <= end)
-             )
-
-             # Feature layer intensity around peak
-             mean_values = np.mean(matrix[:, start:end+1], axis=1)
-             sum_values = np.sum(matrix[:, start:end+1], axis=1)
-             adata.obs[f'mean_{feature_layer}_around_{center}'] = mean_values
-             adata.obs[f'sum_{feature_layer}_around_{center}'] = sum_values
-             adata.obs[f'{feature_layer}_present_at_{center}'] = mean_values > peak_threshold
-
-             # Site-type based aggregation
-             for site_type in site_types:
-                 adata.obs[f'{site_type}_sum_around_{center}'] = 0
-                 adata.obs[f'{site_type}_mean_around_{center}'] = np.nan
-
-             references = adata.obs[obs_column].cat.categories
-             for ref in adata.obs[obs_column].cat.categories:
-                 subset = adata[adata.obs[obs_column] == ref]
-                 for site_type in site_types:
-                     mask = subset.var.get(f'{ref}_{site_type}', None)
-                     if mask is not None:
-                         region_mask = (subset.var_names[mask].astype(int) >= start) & (subset.var_names[mask].astype(int) <= end)
-                         region = subset[:, mask].X[:, region_mask]
-                         adata.obs.loc[subset.obs.index, f'{site_type}_sum_around_{center}'] = np.nansum(region, axis=1)
-                         adata.obs.loc[subset.obs.index, f'{site_type}_mean_around_{center}'] = np.nanmean(region, axis=1)
-
-         adata.var[f'is_in_any_{feature_layer}_peak'] = adata.var[feature_peak_columns].any(axis=1)
-         print(f"✅ Peak annotation completed for {feature_layer} with {len(peak_centers)} peaks.")
-
-     # Combine all peaks into a single "is_in_any_peak" column
-     adata.var['is_in_any_peak'] = adata.var[peak_columns].any(axis=1)
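The feature_configs argument is a dict keyed by HMM layer name; each value supplies the peak-calling knobs read via config.get above. A minimal sketch (the layer name and output directory are placeholders, and the numeric values are just the function defaults):

    feature_configs = {
        "hmm_accessible_patches": {     # placeholder name of a layer written by apply_hmm
            "min_distance": 200,
            "peak_width": 200,
            "peak_prominence": 0.2,
            "peak_threshold": 0.8,
        },
    }
    call_hmm_peaks(adata, feature_configs, obs_column='Reference_strand',
                   site_types=['GpC_site', 'CpG_site'],
                   save_plot=True, output_dir='figures', date_tag='20240101')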
@@ -1,2 +0,0 @@
- from .anndata_data_module import AnnDataModule
- from .preprocessing import random_fill_nans
@@ -1,90 +0,0 @@
- import torch
- from torch.utils.data import DataLoader, TensorDataset, random_split
- import pytorch_lightning as pl
- import numpy as np
- import pandas as pd
-
- class AnnDataModule(pl.LightningDataModule):
-     def __init__(self, adata, tensor_source="X", tensor_key=None, label_col="labels",
-                  batch_size=64, train_frac=0.7, random_seed=42, split_col='train_val_split', split_save_path=None, load_existing_split=False,
-                  inference_mode=False):
-         super().__init__()
-         self.adata = adata  # The adata object
-         self.tensor_source = tensor_source  # X, layers, obsm
-         self.tensor_key = tensor_key  # name of the layer or obsm key
-         self.label_col = label_col  # name of the label column in obs
-         self.batch_size = batch_size
-         self.train_frac = train_frac
-         self.random_seed = random_seed
-         self.split_col = split_col  # Name of obs column to store "train"/"val"
-         self.split_save_path = split_save_path  # Where to save the obs_names and train/test split logging
-         self.load_existing_split = load_existing_split  # Whether to load from an existing split
-         self.inference_mode = inference_mode  # Whether to load the AnnDataModule in inference mode.
-
-     def setup(self, stage=None):
-         # Load feature matrix
-         if self.tensor_source == "X":
-             X = self.adata.X
-         elif self.tensor_source == "layers":
-             assert self.tensor_key in self.adata.layers, f"Layer '{self.tensor_key}' not found."
-             X = self.adata.layers[self.tensor_key]
-         elif self.tensor_source == "obsm":
-             assert self.tensor_key in self.adata.obsm, f"obsm key '{self.tensor_key}' not found."
-             X = self.adata.obsm[self.tensor_key]
-         else:
-             raise ValueError(f"Invalid tensor_source: {self.tensor_source}")
-
-         # Convert to tensor
-         X_tensor = torch.tensor(X, dtype=torch.float32)
-
-         if self.inference_mode:
-             self.infer_dataset = TensorDataset(X_tensor)
-
-         else:
-             # Load and encode labels
-             y = self.adata.obs[self.label_col]
-             if y.dtype.name == 'category':
-                 y = y.cat.codes
-             y_tensor = torch.tensor(y.values, dtype=torch.long)
-
-             # Use existing split
-             if self.load_existing_split:
-                 split_df = pd.read_csv(self.split_save_path, index_col=0)
-                 assert self.split_col in split_df.columns, f"'{self.split_col}' column missing in split file."
-                 self.adata.obs[self.split_col] = split_df.loc[self.adata.obs_names][self.split_col].values
-
-             # If no split exists, create one
-             if self.split_col not in self.adata.obs:
-                 full_dataset = TensorDataset(X_tensor, y_tensor)
-                 n_train = int(self.train_frac * len(full_dataset))
-                 n_val = len(full_dataset) - n_train
-                 self.train_set, self.val_set = random_split(
-                     full_dataset, [n_train, n_val],
-                     generator=torch.Generator().manual_seed(self.random_seed)
-                 )
-                 # Assign split labels
-                 split_array = np.full(len(self.adata), "val", dtype=object)
-                 train_idx = self.train_set.indices if hasattr(self.train_set, "indices") else self.train_set._indices
-                 split_array[train_idx] = "train"
-                 self.adata.obs[self.split_col] = split_array
-
-                 # Save to disk
-                 if self.split_save_path:
-                     self.adata.obs[[self.split_col]].to_csv(self.split_save_path)
-             else:
-                 split_labels = self.adata.obs[self.split_col].values
-                 train_mask = split_labels == "train"
-                 val_mask = split_labels == "val"
-                 self.train_set = TensorDataset(X_tensor[train_mask], y_tensor[train_mask])
-                 self.val_set = TensorDataset(X_tensor[val_mask], y_tensor[val_mask])
-
-     def train_dataloader(self):
-         return DataLoader(self.train_set, batch_size=self.batch_size, shuffle=True)
-
-     def val_dataloader(self):
-         return DataLoader(self.val_set, batch_size=self.batch_size)
-
-     def predict_dataloader(self):
-         if not self.inference_mode:
-             raise RuntimeError("predict_dataloader only available in inference mode.")
-         return DataLoader(self.infer_dataset, batch_size=self.batch_size)
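A hedged training-setup sketch for the removed data module; the layer name, label column, and CSV path are placeholders:

    dm = AnnDataModule(
        adata,
        tensor_source="layers", tensor_key="nan_filled",   # placeholder layer name
        label_col="labels", batch_size=64, train_frac=0.7,
        split_save_path="train_val_split.csv",
    )
    dm.setup()
    train_loader = dm.train_dataloader()
    val_loader = dm.val_dataloader()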
@@ -1 +0,0 @@
- from .lightning_inference import run_lightning_inference
@@ -1,41 +0,0 @@
- import torch
- import pandas as pd
- import numpy as np
- from pytorch_lightning import Trainer
-
- def run_lightning_inference(
-     adata,
-     model,
-     datamodule,
-     label_col="labels",
-     prefix="model"
- ):
-
-     # Get class labels
-     if label_col in adata.obs and pd.api.types.is_categorical_dtype(adata.obs[label_col]):
-         class_labels = adata.obs[label_col].cat.categories.tolist()
-     else:
-         raise ValueError("label_col must be a categorical column in adata.obs")
-
-     # Run predictions
-     trainer = Trainer(accelerator="auto", devices=1, logger=False, enable_checkpointing=False)
-     preds = trainer.predict(model, datamodule=datamodule)
-     probs = torch.cat(preds, dim=0).cpu().numpy()  # (N, C)
-     pred_class_idx = probs.argmax(axis=1)
-     pred_class_labels = [class_labels[i] for i in pred_class_idx]
-     pred_class_probs = probs[np.arange(len(probs)), pred_class_idx]
-
-     # Construct full prefix with label_col
-     full_prefix = f"{prefix}_{label_col}"
-
-     # Store predictions in obs
-     adata.obs[f"{full_prefix}_pred"] = pred_class_idx
-     adata.obs[f"{full_prefix}_pred_label"] = pd.Categorical(pred_class_labels, categories=class_labels)
-     adata.obs[f"{full_prefix}_pred_prob"] = pred_class_probs
-
-     # Per-class probabilities
-     for i, class_name in enumerate(class_labels):
-         adata.obs[f"{full_prefix}_prob_{class_name}"] = probs[:, i]
-
-     # Full probability matrix in obsm
-     adata.obsm[f"{full_prefix}_pred_prob_all"] = probs
@@ -1,14 +0,0 @@
- import torch.nn as nn
- from ..utils.device import detect_device
-
- class BaseTorchModel(nn.Module):
-     """
-     Minimal base class for torch models that:
-     - Stores device
-     - Moves model to detected device on init
-     """
-     def __init__(self, dropout_rate=0.2):
-         super().__init__()
-         self.device = detect_device()  # detects available devices
-         self.dropout_rate = dropout_rate  # default dropout rate to be used in regularization.
-         self.to(self.device)  # move model to device
@@ -1,34 +0,0 @@
- import torch
- import torch.nn as nn
- from .base import BaseTorchModel
-
- class CNNClassifier(BaseTorchModel):
-     def __init__(self, input_size, num_classes, **kwargs):
-         super().__init__(**kwargs)
-         # Define convolutional layers
-         self.conv1 = nn.Conv1d(1, 16, kernel_size=3, padding=1)
-         self.conv2 = nn.Conv1d(16, 32, kernel_size=3, padding=1)
-         # Define activation function
-         self.relu = nn.ReLU()
-
-         # Determine the flattened size dynamically
-         dummy_input = torch.zeros(1, 1, input_size).to(self.device)
-         with torch.no_grad():
-             dummy_output = self._forward_conv(dummy_input)
-         flattened_size = dummy_output.view(1, -1).shape[1]
-
-         # Define fully connected layers
-         self.fc1 = nn.Linear(flattened_size, 64)
-         self.fc2 = nn.Linear(64, num_classes)
-
-     def _forward_conv(self, x):
-         x = self.relu(self.conv1(x))
-         x = self.relu(self.conv2(x))
-         return x
-
-     def forward(self, x):
-         x = x.unsqueeze(1)  # [B, 1, L]
-         x = self._forward_conv(x)
-         x = x.view(x.size(0), -1)  # flatten
-         x = self.relu(self.fc1(x))
-         return self.fc2(x)
@@ -1,41 +0,0 @@
- import torch
- import pytorch_lightning as pl
-
- class TorchClassifierWrapper(pl.LightningModule):
-     def __init__(
-         self,
-         model: torch.nn.Module,
-         optimizer_cls=torch.optim.AdamW,
-         optimizer_kwargs=None,
-         criterion_cls=torch.nn.CrossEntropyLoss,
-         criterion_kwargs=None,
-         lr: float = 1e-3,
-     ):
-         super().__init__()
-         self.model = model
-         self.save_hyperparameters(ignore=['model'])  # logs all except actual model instance
-         self.optimizer_cls = optimizer_cls
-         self.optimizer_kwargs = optimizer_kwargs or {}
-         self.criterion = criterion_cls(**(criterion_kwargs or {}))
-         self.lr = lr
-
-     def forward(self, x):
-         return self.model(x)
-
-     def training_step(self, batch, batch_idx):
-         x, y = batch
-         logits = self(x)
-         loss = self.criterion(logits, y)
-         self.log("train_loss", loss, prog_bar=True)
-         return loss
-
-     def validation_step(self, batch, batch_idx):
-         x, y = batch
-         logits = self(x)
-         loss = self.criterion(logits, y)
-         acc = (logits.argmax(dim=1) == y).float().mean()
-         self.log_dict({"val_loss": loss, "val_acc": acc}, prog_bar=True)
-         return loss
-
-     def configure_optimizers(self):
-         return self.optimizer_cls(self.parameters(), lr=self.lr, **self.optimizer_kwargs)
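A minimal sketch of how the wrapper was meant to combine with the models and data module above (object names and hyperparameters are illustrative, not prescribed by the package):

    import pytorch_lightning as pl
    net = CNNClassifier(input_size=adata.shape[1], num_classes=adata.obs['labels'].nunique())
    wrapper = TorchClassifierWrapper(net, lr=1e-3)
    trainer = pl.Trainer(max_epochs=10, accelerator="auto", devices=1)
    trainer.fit(wrapper, datamodule=dm)  # dm: the AnnDataModule from the earlier sketch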
@@ -1,17 +0,0 @@
- import torch
- import torch.nn as nn
- from .base import BaseTorchModel
-
- class MLPClassifier(BaseTorchModel):
-     def __init__(self, input_dim, num_classes, hidden_sizes=(128, 64), **kwargs):
-         super().__init__(**kwargs)
-         layers = []
-         prev = input_dim
-         for h in hidden_sizes:
-             layers.extend([nn.Linear(prev, h), nn.ReLU(), nn.Dropout(self.dropout_rate)])
-             prev = h
-         layers.append(nn.Linear(prev, num_classes))
-         self.model = nn.Sequential(*layers)
-
-     def forward(self, x):
-         return self.model(x)