smftools 0.1.7__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (174)
  1. smftools/__init__.py +7 -6
  2. smftools/_version.py +1 -1
  3. smftools/cli/cli_flows.py +94 -0
  4. smftools/cli/hmm_adata.py +338 -0
  5. smftools/cli/load_adata.py +577 -0
  6. smftools/cli/preprocess_adata.py +363 -0
  7. smftools/cli/spatial_adata.py +564 -0
  8. smftools/cli_entry.py +435 -0
  9. smftools/config/__init__.py +1 -0
  10. smftools/config/conversion.yaml +38 -0
  11. smftools/config/deaminase.yaml +61 -0
  12. smftools/config/default.yaml +264 -0
  13. smftools/config/direct.yaml +41 -0
  14. smftools/config/discover_input_files.py +115 -0
  15. smftools/config/experiment_config.py +1288 -0
  16. smftools/hmm/HMM.py +1576 -0
  17. smftools/hmm/__init__.py +20 -0
  18. smftools/{tools → hmm}/apply_hmm_batched.py +8 -7
  19. smftools/hmm/call_hmm_peaks.py +106 -0
  20. smftools/{tools → hmm}/display_hmm.py +3 -3
  21. smftools/{tools → hmm}/nucleosome_hmm_refinement.py +2 -2
  22. smftools/{tools → hmm}/train_hmm.py +1 -1
  23. smftools/informatics/__init__.py +13 -9
  24. smftools/informatics/archived/deaminase_smf.py +132 -0
  25. smftools/informatics/archived/fast5_to_pod5.py +43 -0
  26. smftools/informatics/archived/helpers/archived/__init__.py +71 -0
  27. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
  28. smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +87 -0
  29. smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
  30. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
  31. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
  32. smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
  33. smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
  34. smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +30 -4
  35. smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
  36. smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +4 -2
  37. smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +5 -4
  38. smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
  39. smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
  40. smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
  41. smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
  42. smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
  43. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +250 -0
  44. smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +8 -7
  45. smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +8 -12
  46. smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
  47. smftools/informatics/bam_functions.py +812 -0
  48. smftools/informatics/basecalling.py +67 -0
  49. smftools/informatics/bed_functions.py +366 -0
  50. smftools/informatics/binarize_converted_base_identities.py +172 -0
  51. smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +198 -50
  52. smftools/informatics/fasta_functions.py +255 -0
  53. smftools/informatics/h5ad_functions.py +197 -0
  54. smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +147 -61
  55. smftools/informatics/modkit_functions.py +129 -0
  56. smftools/informatics/ohe.py +160 -0
  57. smftools/informatics/pod5_functions.py +224 -0
  58. smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
  59. smftools/machine_learning/__init__.py +12 -0
  60. smftools/machine_learning/data/__init__.py +2 -0
  61. smftools/machine_learning/data/anndata_data_module.py +234 -0
  62. smftools/machine_learning/evaluation/__init__.py +2 -0
  63. smftools/machine_learning/evaluation/eval_utils.py +31 -0
  64. smftools/machine_learning/evaluation/evaluators.py +223 -0
  65. smftools/machine_learning/inference/__init__.py +3 -0
  66. smftools/machine_learning/inference/inference_utils.py +27 -0
  67. smftools/machine_learning/inference/lightning_inference.py +68 -0
  68. smftools/machine_learning/inference/sklearn_inference.py +55 -0
  69. smftools/machine_learning/inference/sliding_window_inference.py +114 -0
  70. smftools/machine_learning/models/base.py +295 -0
  71. smftools/machine_learning/models/cnn.py +138 -0
  72. smftools/machine_learning/models/lightning_base.py +345 -0
  73. smftools/machine_learning/models/mlp.py +26 -0
  74. smftools/{tools → machine_learning}/models/positional.py +3 -2
  75. smftools/{tools → machine_learning}/models/rnn.py +2 -1
  76. smftools/machine_learning/models/sklearn_models.py +273 -0
  77. smftools/machine_learning/models/transformer.py +303 -0
  78. smftools/machine_learning/training/__init__.py +2 -0
  79. smftools/machine_learning/training/train_lightning_model.py +135 -0
  80. smftools/machine_learning/training/train_sklearn_model.py +114 -0
  81. smftools/plotting/__init__.py +4 -1
  82. smftools/plotting/autocorrelation_plotting.py +609 -0
  83. smftools/plotting/general_plotting.py +1292 -140
  84. smftools/plotting/hmm_plotting.py +260 -0
  85. smftools/plotting/qc_plotting.py +270 -0
  86. smftools/preprocessing/__init__.py +15 -8
  87. smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
  88. smftools/preprocessing/append_base_context.py +122 -0
  89. smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
  90. smftools/preprocessing/binarize.py +17 -0
  91. smftools/preprocessing/binarize_on_Youden.py +2 -2
  92. smftools/preprocessing/calculate_complexity_II.py +248 -0
  93. smftools/preprocessing/calculate_coverage.py +10 -1
  94. smftools/preprocessing/calculate_position_Youden.py +1 -1
  95. smftools/preprocessing/calculate_read_modification_stats.py +101 -0
  96. smftools/preprocessing/clean_NaN.py +17 -1
  97. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
  98. smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
  99. smftools/preprocessing/flag_duplicate_reads.py +1326 -124
  100. smftools/preprocessing/invert_adata.py +12 -5
  101. smftools/preprocessing/load_sample_sheet.py +19 -4
  102. smftools/readwrite.py +1021 -89
  103. smftools/tools/__init__.py +3 -32
  104. smftools/tools/calculate_umap.py +5 -5
  105. smftools/tools/general_tools.py +3 -3
  106. smftools/tools/position_stats.py +468 -106
  107. smftools/tools/read_stats.py +115 -1
  108. smftools/tools/spatial_autocorrelation.py +562 -0
  109. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/METADATA +14 -9
  110. smftools-0.2.3.dist-info/RECORD +173 -0
  111. smftools-0.2.3.dist-info/entry_points.txt +2 -0
  112. smftools/informatics/fast5_to_pod5.py +0 -21
  113. smftools/informatics/helpers/LoadExperimentConfig.py +0 -75
  114. smftools/informatics/helpers/__init__.py +0 -74
  115. smftools/informatics/helpers/align_and_sort_BAM.py +0 -59
  116. smftools/informatics/helpers/aligned_BAM_to_bed.py +0 -74
  117. smftools/informatics/helpers/bam_qc.py +0 -66
  118. smftools/informatics/helpers/bed_to_bigwig.py +0 -39
  119. smftools/informatics/helpers/binarize_converted_base_identities.py +0 -79
  120. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -55
  121. smftools/informatics/helpers/index_fasta.py +0 -12
  122. smftools/informatics/helpers/make_dirs.py +0 -21
  123. smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +0 -53
  124. smftools/informatics/load_adata.py +0 -182
  125. smftools/informatics/readwrite.py +0 -106
  126. smftools/informatics/subsample_fasta_from_bed.py +0 -47
  127. smftools/preprocessing/append_C_context.py +0 -82
  128. smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -94
  129. smftools/preprocessing/filter_converted_reads_on_methylation.py +0 -44
  130. smftools/preprocessing/filter_reads_on_length.py +0 -51
  131. smftools/tools/call_hmm_peaks.py +0 -105
  132. smftools/tools/data/__init__.py +0 -2
  133. smftools/tools/data/anndata_data_module.py +0 -90
  134. smftools/tools/inference/__init__.py +0 -1
  135. smftools/tools/inference/lightning_inference.py +0 -41
  136. smftools/tools/models/base.py +0 -14
  137. smftools/tools/models/cnn.py +0 -34
  138. smftools/tools/models/lightning_base.py +0 -41
  139. smftools/tools/models/mlp.py +0 -17
  140. smftools/tools/models/sklearn_models.py +0 -40
  141. smftools/tools/models/transformer.py +0 -133
  142. smftools/tools/training/__init__.py +0 -1
  143. smftools/tools/training/train_lightning_model.py +0 -47
  144. smftools-0.1.7.dist-info/RECORD +0 -136
  145. /smftools/{tools/evaluation → cli}/__init__.py +0 -0
  146. /smftools/{tools → hmm}/calculate_distances.py +0 -0
  147. /smftools/{tools → hmm}/hmm_readwrite.py +0 -0
  148. /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
  149. /smftools/informatics/{conversion_smf.py → archived/conversion_smf.py} +0 -0
  150. /smftools/informatics/{direct_smf.py → archived/direct_smf.py} +0 -0
  151. /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
  152. /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
  153. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
  154. /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
  155. /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
  156. /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
  157. /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
  158. /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
  159. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
  160. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
  161. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
  162. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
  163. /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
  164. /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
  165. /smftools/{tools → machine_learning}/data/preprocessing.py +0 -0
  166. /smftools/{tools → machine_learning}/models/__init__.py +0 -0
  167. /smftools/{tools → machine_learning}/models/wrappers.py +0 -0
  168. /smftools/{tools → machine_learning}/utils/__init__.py +0 -0
  169. /smftools/{tools → machine_learning}/utils/device.py +0 -0
  170. /smftools/{tools → machine_learning}/utils/grl.py +0 -0
  171. /smftools/tools/{apply_hmm.py → archived/apply_hmm.py} +0 -0
  172. /smftools/tools/{classifiers.py → archived/classifiers.py} +0 -0
  173. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/WHEEL +0 -0
  174. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/licenses/LICENSE +0 -0
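Note the module reorganization in the list above: HMM code moved from smftools/tools to smftools/hmm, model code to smftools/machine_learning, and most smftools/informatics/helpers utilities were either archived or promoted to top-level smftools/informatics modules. A minimal sketch of the corresponding import updates (the exact re-exported function names are an assumption inferred from the file moves, not confirmed by this diff):

    # 0.1.7 (old paths)
    # from smftools.tools.train_hmm import train_hmm
    # from smftools.informatics.helpers.find_conversion_sites import find_conversion_sites

    # 0.2.3 (new paths, per the moves listed above)
    from smftools.hmm.train_hmm import train_hmm
    from smftools.informatics.fasta_functions import find_conversion_sites

The hunks below detail two of these informatics changes.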
smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py}

@@ -11,29 +11,48 @@ import traceback
 import gzip
 import torch
 
-from .. import readwrite
+import shutil
+from pathlib import Path
+from typing import Union, Iterable, Optional
+
+from ..readwrite import make_dirs, safe_write_h5ad
 from .binarize_converted_base_identities import binarize_converted_base_identities
-from .find_conversion_sites import find_conversion_sites
-from .count_aligned_reads import count_aligned_reads
-from .extract_base_identities import extract_base_identities
-from .make_dirs import make_dirs
-from .ohe_batching import ohe_batching
+from .fasta_functions import find_conversion_sites
+from .bam_functions import count_aligned_reads, extract_base_identities
+from .ohe import ohe_batching
 
 if __name__ == "__main__":
     multiprocessing.set_start_method("forkserver", force=True)
 
-def converted_BAM_to_adata_II(converted_FASTA, split_dir, mapping_threshold, experiment_name, conversion_types, bam_suffix, device='cpu', num_threads=8):
+def converted_BAM_to_adata(converted_FASTA,
+                           split_dir,
+                           output_dir,
+                           input_already_demuxed,
+                           mapping_threshold,
+                           experiment_name,
+                           conversions,
+                           bam_suffix,
+                           device='cpu',
+                           num_threads=8,
+                           deaminase_footprinting=False,
+                           delete_intermediates=True,
+                           double_barcoded_path=None,
+                           ):
     """
     Converts BAM files into an AnnData object by binarizing modified base identities.
 
     Parameters:
-        converted_FASTA (str): Path to the converted FASTA reference.
-        split_dir (str): Directory containing converted BAM files.
+        converted_FASTA (Path): Path to the converted FASTA reference.
+        split_dir (Path): Directory containing converted BAM files.
+        output_dir (Path): Path to the output directory.
+        input_already_demuxed (bool): Whether the input reads were already demultiplexed.
         mapping_threshold (float): Minimum fraction of aligned reads required for inclusion.
         experiment_name (str): Name for the output AnnData object.
-        conversion_types (list): List of modification types (e.g., ['unconverted', '5mC', '6mA']).
+        conversions (list): List of modification types (e.g., ['unconverted', '5mC', '6mA']).
        bam_suffix (str): File suffix for BAM files.
        num_threads (int): Number of parallel processing threads.
+        deaminase_footprinting (bool): Whether the footprinting was done with a direct deamination chemistry.
+        double_barcoded_path (Path): Path to the dorado demux summary file for double-ended barcodes.
 
     Returns:
        str: Path to the final AnnData object.
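The new keyword surface is easiest to see in a call. A minimal invocation sketch (all paths and values here are hypothetical):

    from pathlib import Path
    from smftools.informatics.converted_BAM_to_adata import converted_BAM_to_adata

    adata, adata_path = converted_BAM_to_adata(
        converted_FASTA=Path("refs/converted.fasta"),
        split_dir=Path("out/split_bams"),
        output_dir=Path("out"),
        input_already_demuxed=True,   # skips the dorado double-barcode annotation step
        mapping_threshold=0.01,
        experiment_name="exp1",
        conversions=["unconverted", "5mC"],
        bam_suffix=".bam",
        device="cpu",
        num_threads=8,
    )

As the hunks below show, the function now returns both the in-memory AnnData and its path, although the docstring's Returns entry still describes only the path.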
@@ -48,50 +67,73 @@ def converted_BAM_to_adata_II(converted_FASTA, split_dir, mapping_threshold, exp
     print(f"Using device: {device}")
 
     ## Set Up Directories and File Paths
-    parent_dir = os.path.dirname(split_dir)
-    h5_dir = os.path.join(parent_dir, 'h5ads')
-    tmp_dir = os.path.join(parent_dir, 'tmp')
-    final_adata_path = os.path.join(h5_dir, f'{experiment_name}_{os.path.basename(split_dir)}.h5ad.gz')
+    h5_dir = output_dir / 'h5ads'
+    tmp_dir = output_dir / 'tmp'
+    final_adata = None
+    final_adata_path = h5_dir / f'{experiment_name}.h5ad.gz'
 
-    if os.path.exists(final_adata_path):
+    if final_adata_path.exists():
        print(f"{final_adata_path} already exists. Using existing AnnData object.")
-        return final_adata_path
+        return final_adata, final_adata_path
 
     make_dirs([h5_dir, tmp_dir])
 
-    ## Get BAM Files ##
-    bam_files = [f for f in os.listdir(split_dir) if f.endswith(bam_suffix) and not f.endswith('.bai') and 'unclassified' not in f]
-    bam_files.sort()
-    bam_path_list = [os.path.join(split_dir, f) for f in bam_files]
+    bam_files = sorted(
+        p for p in split_dir.iterdir()
+        if p.is_file()
+        and p.suffix == ".bam"
+        and "unclassified" not in p.name
+    )
+
+    bam_path_list = [split_dir / f for f in bam_files]
     print(f"Found {len(bam_files)} BAM files: {bam_files}")
 
     ## Process Conversion Sites
-    max_reference_length, record_FASTA_dict, chromosome_FASTA_dict = process_conversion_sites(converted_FASTA, conversion_types)
+    max_reference_length, record_FASTA_dict, chromosome_FASTA_dict = process_conversion_sites(converted_FASTA, conversions, deaminase_footprinting)
 
     ## Filter BAM Files by Mapping Threshold
     records_to_analyze = filter_bams_by_mapping_threshold(bam_path_list, bam_files, mapping_threshold)
 
     ## Process BAMs in Parallel
-    final_adata = process_bams_parallel(bam_path_list, records_to_analyze, record_FASTA_dict, tmp_dir, h5_dir, num_threads, max_reference_length, device)
+    final_adata = process_bams_parallel(bam_path_list, records_to_analyze, record_FASTA_dict, chromosome_FASTA_dict, tmp_dir, h5_dir, num_threads, max_reference_length, device, deaminase_footprinting)
 
+    final_adata.uns['References'] = {}
     for chromosome, [seq, comp] in chromosome_FASTA_dict.items():
        final_adata.var[f'{chromosome}_top_strand_FASTA_base'] = list(seq)
        final_adata.var[f'{chromosome}_bottom_strand_FASTA_base'] = list(comp)
        final_adata.uns[f'{chromosome}_FASTA_sequence'] = seq
+        final_adata.uns['References'][f'{chromosome}_FASTA_sequence'] = seq
+
+    final_adata.obs_names_make_unique()
+    cols = final_adata.obs.columns
 
-    ## Save Final AnnData
-    # print(f"Saving AnnData to {final_adata_path}")
-    # final_adata.write_h5ad(final_adata_path, compression='gzip')
+    # Make obs cols categorical
+    for col in cols:
+        final_adata.obs[col] = final_adata.obs[col].astype('category')
+
+    if input_already_demuxed:
+        final_adata.obs["demux_type"] = ["already"] * final_adata.shape[0]
+        final_adata.obs["demux_type"] = final_adata.obs["demux_type"].astype("category")
+    else:
+        from .h5ad_functions import add_demux_type_annotation
+        double_barcoded_reads = double_barcoded_path / "barcoding_summary.txt"
+        add_demux_type_annotation(final_adata, double_barcoded_reads)
+
+    ## Delete intermediate h5ad files and temp directories
+    if delete_intermediates:
+        delete_intermediate_h5ads_and_tmpdir(h5_dir, tmp_dir)
+
     return final_adata, final_adata_path
 
 
-def process_conversion_sites(converted_FASTA, conversion_types):
+def process_conversion_sites(converted_FASTA, conversions=['unconverted', '5mC'], deaminase_footprinting=False):
     """
     Extracts conversion sites and determines the max reference length.
 
     Parameters:
        converted_FASTA (str): Path to the converted reference FASTA.
-        conversion_types (list): List of modification types (e.g., ['unconverted', '5mC', '6mA']).
+        conversions (list): List of modification types (e.g., ['unconverted', '5mC', '6mA']).
+        deaminase_footprinting (bool): Whether the footprinting was done with a direct deamination chemistry.
 
     Returns:
        max_reference_length (int): The length of the longest sequence.
@@ -101,11 +143,11 @@ def process_conversion_sites(converted_FASTA, conversion_types):
     record_FASTA_dict = {}
     chromosome_FASTA_dict = {}
     max_reference_length = 0
-    unconverted = conversion_types[0]
-    conversions = conversion_types[1:]
+    unconverted = conversions[0]
+    conversion_types = conversions[1:]
 
     # Process the unconverted sequence once
-    modification_dict[unconverted] = find_conversion_sites(converted_FASTA, unconverted, conversion_types)
+    modification_dict[unconverted] = find_conversion_sites(converted_FASTA, unconverted, conversions, deaminase_footprinting)
     # Above points to record_dict[record.id] = [sequence_length, [], [], sequence, complement] with only unconverted record.id keys
 
     # Get **max sequence length** from unconverted records
@@ -114,7 +156,11 @@ def process_conversion_sites(converted_FASTA, conversion_types):
     # Add **unconverted records** to `record_FASTA_dict`
     for record, values in modification_dict[unconverted].items():
        sequence_length, top_coords, bottom_coords, sequence, complement = values
-        chromosome = record.replace(f"_{unconverted}_top", "")
+
+        if not deaminase_footprinting:
+            chromosome = record.replace(f"_{unconverted}_top", "")
+        else:
+            chromosome = record
 
        # Store **original sequence**
        record_FASTA_dict[record] = [
@@ -127,13 +173,17 @@ def process_conversion_sites(converted_FASTA, conversion_types):
        chromosome_FASTA_dict[chromosome] = [sequence + "N" * (max_reference_length - sequence_length), complement + "N" * (max_reference_length - sequence_length)]
 
     # Process converted records
-    for conversion in conversions:
-        modification_dict[conversion] = find_conversion_sites(converted_FASTA, conversion, conversion_types)
+    for conversion in conversion_types:
+        modification_dict[conversion] = find_conversion_sites(converted_FASTA, conversion, conversions, deaminase_footprinting)
        # Above points to record_dict[record.id] = [sequence_length, top_strand_coordinates, bottom_strand_coordinates, sequence, complement] with only unconverted record.id keys
 
        for record, values in modification_dict[conversion].items():
            sequence_length, top_coords, bottom_coords, sequence, complement = values
-            chromosome = record.split(f"_{unconverted}_")[0]  # Extract chromosome name
+
+            if not deaminase_footprinting:
+                chromosome = record.split(f"_{unconverted}_")[0]  # Extract chromosome name
+            else:
+                chromosome = record
 
            # Add **both strands** for converted records
            for strand in ["top", "bottom"]:
@@ -168,18 +218,20 @@ def filter_bams_by_mapping_threshold(bam_path_list, bam_files, mapping_threshold
     return records_to_analyze
 
 
-def process_single_bam(bam_index, bam, records_to_analyze, record_FASTA_dict, tmp_dir, max_reference_length, device):
+def process_single_bam(bam_index, bam, records_to_analyze, record_FASTA_dict, chromosome_FASTA_dict, tmp_dir, max_reference_length, device, deaminase_footprinting):
     """Worker function to process a single BAM file (must be at top-level for multiprocessing)."""
     adata_list = []
 
     for record in records_to_analyze:
-        sample = os.path.basename(bam).split(sep=".bam")[0]
+        sample = bam.stem
        chromosome = record_FASTA_dict[record][2]
        current_length = record_FASTA_dict[record][4]
        mod_type, strand = record_FASTA_dict[record][6], record_FASTA_dict[record][7]
+        sequence = chromosome_FASTA_dict[chromosome][0]
 
        # Extract Base Identities
-        fwd_bases, rev_bases = extract_base_identities(bam, record, range(current_length), max_reference_length)
+        fwd_bases, rev_bases, mismatch_counts_per_read, mismatch_trend_per_read = extract_base_identities(bam, record, range(current_length), max_reference_length, sequence)
+        mismatch_trend_series = pd.Series(mismatch_trend_per_read)
 
        # Skip processing if both forward and reverse base identities are empty
        if not fwd_bases and not rev_bases:
@@ -190,11 +242,11 @@ def process_single_bam(bam_index, bam, records_to_analyze, record_FASTA_dict, tm
 
        # Binarize the Base Identities if they exist
        if fwd_bases:
-            fwd_bin = binarize_converted_base_identities(fwd_bases, strand, mod_type, bam, device)
+            fwd_bin = binarize_converted_base_identities(fwd_bases, strand, mod_type, bam, device, deaminase_footprinting, mismatch_trend_per_read)
            merged_bin.update(fwd_bin)
 
        if rev_bases:
-            rev_bin = binarize_converted_base_identities(rev_bases, strand, mod_type, bam, device)
+            rev_bin = binarize_converted_base_identities(rev_bases, strand, mod_type, bam, device, deaminase_footprinting, mismatch_trend_per_read)
            merged_bin.update(rev_bin)
 
        # Skip if merged_bin is empty (no valid binarized data)
@@ -257,11 +309,18 @@ def process_single_bam(bam_index, bam, records_to_analyze, record_FASTA_dict, tm
        adata.obs_names = bin_df.index.astype(str)
        adata.var_names = bin_df.columns.astype(str)
        adata.obs["Sample"] = [sample] * len(adata)
+        try:
+            barcode = sample.split('barcode')[1]
+        except:
+            barcode = np.nan
+        adata.obs["Barcode"] = [int(barcode)] * len(adata)
+        adata.obs["Barcode"] = adata.obs["Barcode"].astype(str)
        adata.obs["Reference"] = [chromosome] * len(adata)
        adata.obs["Strand"] = [strand] * len(adata)
        adata.obs["Dataset"] = [mod_type] * len(adata)
        adata.obs["Reference_dataset_strand"] = [f"{chromosome}_{mod_type}_{strand}"] * len(adata)
        adata.obs["Reference_strand"] = [f"{chromosome}_{strand}"] * len(adata)
+        adata.obs["Read_mismatch_trend"] = adata.obs_names.map(mismatch_trend_series)
 
        # Attach One-Hot Encodings to Layers
        adata.layers["A_binary_encoding"] = df_A
@@ -279,16 +338,16 @@ def timestamp():
     return time.strftime("[%Y-%m-%d %H:%M:%S]")
 
 
-def worker_function(bam_index, bam, records_to_analyze, shared_record_FASTA_dict, tmp_dir, h5_dir, max_reference_length, device, progress_queue):
+def worker_function(bam_index, bam, records_to_analyze, shared_record_FASTA_dict, chromosome_FASTA_dict, tmp_dir, h5_dir, max_reference_length, device, deaminase_footprinting, progress_queue):
     """Worker function that processes a single BAM and writes the output to an H5AD file."""
     worker_id = current_process().pid  # Get worker process ID
-    sample = os.path.basename(bam).split(sep=".bam")[0]
+    sample = bam.stem
 
     try:
        print(f"{timestamp()} [Worker {worker_id}] Processing BAM: {sample}")
 
-        h5ad_path = os.path.join(h5_dir, f"{sample}.h5ad")
-        if os.path.exists(h5ad_path):
+        h5ad_path = h5_dir / bam.with_suffix(".h5ad").name
+        if h5ad_path.exists():
            print(f"{timestamp()} [Worker {worker_id}] Skipping {sample}: Already processed.")
            progress_queue.put(sample)
            return
@@ -302,10 +361,10 @@ def worker_function(bam_index, bam, records_to_analyze, shared_record_FASTA_dict
            return
 
        # Process BAM
-        adata = process_single_bam(bam_index, bam, bam_records_to_analyze, shared_record_FASTA_dict, tmp_dir, max_reference_length, device)
+        adata = process_single_bam(bam_index, bam, bam_records_to_analyze, shared_record_FASTA_dict, chromosome_FASTA_dict, tmp_dir, max_reference_length, device, deaminase_footprinting)
 
        if adata is not None:
-            adata.write_h5ad(h5ad_path)
+            adata.write_h5ad(str(h5ad_path))
            print(f"{timestamp()} [Worker {worker_id}] Completed processing for BAM: {sample}")
 
            # Free memory
@@ -318,9 +377,9 @@ def worker_function(bam_index, bam, records_to_analyze, shared_record_FASTA_dict
        print(f"{timestamp()} [Worker {worker_id}] ERROR while processing {sample}:\n{traceback.format_exc()}")
        progress_queue.put(sample)  # Still signal completion to prevent deadlock
 
-def process_bams_parallel(bam_path_list, records_to_analyze, record_FASTA_dict, tmp_dir, h5_dir, num_threads, max_reference_length, device):
+def process_bams_parallel(bam_path_list, records_to_analyze, record_FASTA_dict, chromosome_FASTA_dict, tmp_dir, h5_dir, num_threads, max_reference_length, device, deaminase_footprinting):
     """Processes BAM files in parallel, writes each H5AD to disk, and concatenates them at the end."""
-    os.makedirs(h5_dir, exist_ok=True)  # Ensure h5_dir exists
+    make_dirs(h5_dir)  # Ensure h5_dir exists
 
     print(f"{timestamp()} Starting parallel BAM processing with {num_threads} threads...")
 
@@ -337,7 +396,7 @@ def process_bams_parallel(bam_path_list, records_to_analyze, record_FASTA_dict,
 
     with Pool(processes=num_threads) as pool:
        results = [
-            pool.apply_async(worker_function, (i, bam, records_to_analyze, shared_record_FASTA_dict, tmp_dir, h5_dir, max_reference_length, device, progress_queue))
+            pool.apply_async(worker_function, (i, bam, records_to_analyze, shared_record_FASTA_dict, chromosome_FASTA_dict, tmp_dir, h5_dir, max_reference_length, device, deaminase_footprinting, progress_queue))
            for i, bam in enumerate(bam_path_list)
        ]
 
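The shared_record_FASTA_dict and progress_queue presumably come from a multiprocessing.Manager set up earlier in this function (not shown in these hunks). A self-contained sketch of the same report-on-completion pattern, with a hypothetical worker and inputs:

    from multiprocessing import Manager, Pool

    def _worker(item, progress_queue):
        # ... per-item work would happen here ...
        progress_queue.put(item)  # always signal completion, even on failure

    if __name__ == "__main__":
        items = ["barcode01.bam", "barcode02.bam"]
        with Manager() as manager:
            progress_queue = manager.Queue()
            with Pool(processes=2) as pool:
                for item in items:
                    pool.apply_async(_worker, (item, progress_queue))
                for _ in items:  # block until every worker has reported in
                    print(f"finished {progress_queue.get()}")
                pool.close()
                pool.join()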
@@ -356,7 +415,7 @@ def process_bams_parallel(bam_path_list, records_to_analyze, record_FASTA_dict,
        pool.join()  # Ensure all workers finish
 
     # Final Concatenation Step
-    h5ad_files = [os.path.join(h5_dir, f) for f in os.listdir(h5_dir) if f.endswith(".h5ad")]
+    h5ad_files = [h5_dir / f for f in h5_dir.iterdir() if f.suffix == ".h5ad"]
 
     if not h5ad_files:
        print(f"{timestamp()} No valid H5AD files generated. Exiting.")
@@ -366,4 +425,93 @@ def process_bams_parallel(bam_path_list, records_to_analyze, record_FASTA_dict,
     final_adata = ad.concat([ad.read_h5ad(f) for f in h5ad_files], join="outer")
 
     print(f"{timestamp()} Successfully generated final AnnData object.")
-    return final_adata
+    return final_adata
+
+def delete_intermediate_h5ads_and_tmpdir(
+    h5_dir: Union[str, Path, Iterable[str], None],
+    tmp_dir: Optional[Union[str, Path]] = None,
+    *,
+    dry_run: bool = False,
+    verbose: bool = True,
+):
+    """
+    Delete intermediate .h5ad files and a temporary directory.
+
+    Parameters
+    ----------
+    h5_dir : str | Path | iterable[str] | None
+        If a directory path is given, all files directly inside it will be considered.
+        If an iterable of file paths is given, those files will be considered.
+        Only files ending with '.h5ad' (and not ending with '.gz') are removed.
+    tmp_dir : str | Path | None
+        Path to a directory to remove recursively (e.g. a temp dir created earlier).
+    dry_run : bool
+        If True, print what *would* be removed but do not actually delete.
+    verbose : bool
+        Print progress / warnings.
+    """
+    # Helper: remove a single file path (Path-like or string)
+    def _maybe_unlink(p: Path):
+        if not p.exists():
+            if verbose:
+                print(f"[skip] not found: {p}")
+            return
+        if not p.is_file():
+            if verbose:
+                print(f"[skip] not a file: {p}")
+            return
+        if dry_run:
+            print(f"[dry-run] would remove file: {p}")
+            return
+        try:
+            p.unlink()
+            if verbose:
+                print(f"Removed file: {p}")
+        except Exception as e:
+            print(f"[error] failed to remove file {p}: {e}")
+
+    # Handle h5_dir input (directory OR iterable of file paths)
+    if h5_dir is not None:
+        # If it's a path to a directory, iterate its children
+        if isinstance(h5_dir, (str, Path)) and Path(h5_dir).is_dir():
+            dpath = Path(h5_dir)
+            for p in dpath.iterdir():
+                # only target top-level files (not recursing); require '.h5ad' suffix and exclude gz
+                name = p.name.lower()
+                if name.endswith(".h5ad") and not name.endswith(".gz"):
+                    _maybe_unlink(p)
+                else:
+                    if verbose:
+                        # optional: comment this out if too noisy
+                        print(f"[skip] not matching pattern: {p.name}")
+        else:
+            # treat as iterable of file paths
+            for f in h5_dir:
+                p = Path(f)
+                name = p.name.lower()
+                if name.endswith(".h5ad") and not name.endswith(".gz"):
+                    _maybe_unlink(p)
+                else:
+                    if verbose:
+                        print(f"[skip] not matching pattern or not a file: {p}")
+
+    # Remove tmp_dir recursively (if provided)
+    if tmp_dir is not None:
+        td = Path(tmp_dir)
+        if not td.exists():
+            if verbose:
+                print(f"[skip] tmp_dir not found: {td}")
+        else:
+            if not td.is_dir():
+                if verbose:
+                    print(f"[skip] tmp_dir is not a directory: {td}")
+            else:
+                if dry_run:
+                    print(f"[dry-run] would remove directory tree: {td}")
+                else:
+                    try:
+                        shutil.rmtree(td)
+                        if verbose:
+                            print(f"Removed directory tree: {td}")
+                    except Exception as e:
+                        print(f"[error] failed to remove tmp dir {td}: {e}")
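The cleanup helper can be exercised safely before committing to deletion; a short usage sketch (hypothetical paths):

    from pathlib import Path

    # Preview what would be removed, without deleting anything.
    delete_intermediate_h5ads_and_tmpdir(Path("out/h5ads"), Path("out/tmp"), dry_run=True)

    # Actual cleanup: per-sample intermediate '.h5ad' files are unlinked, the
    # 'tmp' tree is removed recursively, and the final '.h5ad.gz' is untouched.
    delete_intermediate_h5ads_and_tmpdir(Path("out/h5ads"), Path("out/tmp"))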
smftools/informatics/fasta_functions.py (new file)

@@ -0,0 +1,255 @@
+from ..readwrite import make_dirs, time_string
+
+import os
+import subprocess
+from pathlib import Path
+
+from typing import Union, List, Dict, Tuple
+
+import numpy as np
+import gzip
+
+from Bio import SeqIO
+from Bio.SeqRecord import SeqRecord
+from Bio.Seq import Seq
+from pyfaidx import Fasta
+import pysam
+
+from concurrent.futures import ProcessPoolExecutor
+from itertools import chain
+
+def _convert_FASTA_record(record, modification_type, strand, unconverted):
+    """Converts a FASTA record based on modification type and strand."""
+    conversion_maps = {
+        ('5mC', 'top'): ('C', 'T'),
+        ('5mC', 'bottom'): ('G', 'A'),
+        ('6mA', 'top'): ('A', 'G'),
+        ('6mA', 'bottom'): ('T', 'C')
+    }
+
+    sequence = str(record.seq).upper()
+
+    if modification_type == unconverted:
+        return SeqRecord(Seq(sequence), id=f"{record.id}_{modification_type}_top", description=record.description)
+
+    if (modification_type, strand) not in conversion_maps:
+        raise ValueError(f"Invalid combination: {modification_type}, {strand}")
+
+    original_base, converted_base = conversion_maps[(modification_type, strand)]
+    new_seq = sequence.replace(original_base, converted_base)
+
+    return SeqRecord(Seq(new_seq), id=f"{record.id}_{modification_type}_{strand}", description=record.description)
+
+def _process_fasta_record(args):
+    """
+    Processes a single FASTA record for parallel execution.
+    Args:
+        args (tuple): (record, modification_types, strands, unconverted)
+    Returns:
+        list of modified SeqRecord objects.
+    """
+    record, modification_types, strands, unconverted = args
+    modified_records = []
+
+    for modification_type in modification_types:
+        for i, strand in enumerate(strands):
+            if i > 0 and modification_type == unconverted:
+                continue  # Ensure unconverted is added only once
+
+            modified_records.append(_convert_FASTA_record(record, modification_type, strand, unconverted))
+
+    return modified_records
+
+def generate_converted_FASTA(input_fasta, modification_types, strands, output_fasta, num_threads=4, chunk_size=500):
+    """
+    Converts an input FASTA file and writes a new converted FASTA file efficiently.
+
+    Parameters:
+        input_fasta (str): Path to the unconverted FASTA file.
+        modification_types (list): List of modification types ('5mC', '6mA', or unconverted).
+        strands (list): List of strands ('top', 'bottom').
+        output_fasta (str): Path to the converted FASTA output file.
+        num_threads (int): Number of parallel threads to use.
+        chunk_size (int): Number of records to process per write batch.
+
+    Returns:
+        None (Writes the converted FASTA file).
+    """
+    unconverted = modification_types[0]
+    input_fasta = str(input_fasta)
+    output_fasta = str(output_fasta)
+
+    # Detect if input is gzipped
+    open_func = gzip.open if input_fasta.endswith('.gz') else open
+    file_mode = 'rt' if input_fasta.endswith('.gz') else 'r'
+
+    def _fasta_record_generator():
+        """Lazily yields FASTA records from file."""
+        with open_func(input_fasta, file_mode) as handle:
+            for record in SeqIO.parse(handle, 'fasta'):
+                yield record
+
+    with open(output_fasta, 'w') as output_handle, ProcessPoolExecutor(max_workers=num_threads) as executor:
+        # Process records in parallel using a named function (avoiding lambda)
+        results = executor.map(
+            _process_fasta_record,
+            ((record, modification_types, strands, unconverted) for record in _fasta_record_generator())
+        )
+
+        buffer = []
+        for modified_records in results:
+            buffer.extend(modified_records)
+
+            # Write out in chunks to save memory
+            if len(buffer) >= chunk_size:
+                SeqIO.write(buffer, output_handle, 'fasta')
+                buffer.clear()
+
+        # Write any remaining records
+        if buffer:
+            SeqIO.write(buffer, output_handle, 'fasta')
+
+ def index_fasta(fasta: str | Path, write_chrom_sizes: bool = True) -> Path:
113
+ fasta = Path(fasta)
114
+ pysam.faidx(str(fasta)) # creates <fasta>.fai
115
+
116
+ fai = fasta.with_suffix(fasta.suffix + ".fai")
117
+ if write_chrom_sizes:
118
+ chrom_sizes = fasta.with_suffix(".chrom.sizes")
119
+ with fai.open() as f_in, chrom_sizes.open("w") as out:
120
+ for line in f_in:
121
+ chrom, size = line.split()[:2]
122
+ out.write(f"{chrom}\t{size}\n")
123
+ return chrom_sizes
124
+ return fai
125
+
126
+ def get_chromosome_lengths(fasta: str | Path) -> Path:
127
+ """
128
+ Create (or reuse) <fasta>.chrom.sizes, derived from the FASTA index.
129
+ """
130
+ fasta = Path(fasta)
131
+ fai = fasta.with_suffix(fasta.suffix + ".fai")
132
+ if not fai.exists():
133
+ index_fasta(fasta, write_chrom_sizes=True) # will also create .chrom.sizes
134
+ chrom_sizes = fasta.with_suffix(".chrom.sizes")
135
+ if chrom_sizes.exists():
136
+ print(f"Using existing chrom length file: {chrom_sizes}")
137
+ return chrom_sizes
138
+
139
+ # Build chrom.sizes from .fai
140
+ with fai.open() as f_in, chrom_sizes.open("w") as out:
141
+ for line in f_in:
142
+ chrom, size = line.split()[:2]
143
+ out.write(f"{chrom}\t{size}\n")
144
+ return chrom_sizes
145
+
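The two indexing helpers chain together: index_fasta calls pysam.faidx and optionally derives a chrom.sizes table, which get_chromosome_lengths reuses on later calls. A sketch (hypothetical path):

    from pathlib import Path

    # First call builds refs/genome.fa.fai and refs/genome.chrom.sizes.
    chrom_sizes = index_fasta(Path("refs/genome.fa"))

    # Subsequent calls find the existing chrom.sizes file and return it directly.
    chrom_sizes = get_chromosome_lengths(Path("refs/genome.fa"))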
+def get_native_references(fasta_file: str | Path) -> Dict[str, Tuple[int, str]]:
+    """
+    Return {record_id: (length, sequence)} from a FASTA.
+    Direct methylation specific.
+    """
+    fasta_file = Path(fasta_file)
+    print(f"{time_string()}: Opening FASTA file {fasta_file}")
+    record_dict: Dict[str, Tuple[int, str]] = {}
+    with fasta_file.open("r") as f:
+        for rec in SeqIO.parse(f, "fasta"):
+            seq = str(rec.seq).upper()
+            record_dict[rec.id] = (len(seq), seq)
+    return record_dict
+
+def find_conversion_sites(fasta_file, modification_type, conversions, deaminase_footprinting=False):
+    """
+    Finds genomic coordinates of modified bases (5mC or 6mA) in a reference FASTA file.
+
+    Parameters:
+        fasta_file (str): Path to the converted reference FASTA.
+        modification_type (str): Modification type ('5mC' or '6mA') or 'unconverted'.
+        conversions (list): List of conversion types. The first element is the unconverted record type.
+        deaminase_footprinting (bool): Whether the footprinting was done with a direct deamination chemistry.
+
+    Returns:
+        dict: Dictionary where keys are **both unconverted & converted record names**.
+            Values contain:
+            [sequence length, top strand coordinates, bottom strand coordinates, sequence, complement sequence].
+    """
+    unconverted = conversions[0]
+    record_dict = {}
+
+    # Define base mapping based on modification type
+    base_mappings = {
+        '5mC': ('C', 'G'),  # Cytosine and Guanine
+        '6mA': ('A', 'T')   # Adenine and Thymine
+    }
+
+    # Read FASTA file and process records
+    with open(fasta_file, "r") as f:
+        for record in SeqIO.parse(f, "fasta"):
+            if unconverted in record.id or deaminase_footprinting:
+                sequence = str(record.seq).upper()
+                complement = str(record.seq.complement()).upper()
+                sequence_length = len(sequence)
+
+                # Unconverted case: store the full sequence without coordinate filtering
+                if modification_type == unconverted:
+                    record_dict[record.id] = [sequence_length, [], [], sequence, complement]
+
+                # Process converted records: extract modified base positions
+                elif modification_type in base_mappings:
+                    top_base, bottom_base = base_mappings[modification_type]
+                    seq_array = np.array(list(sequence))
+                    top_strand_coordinates = np.where(seq_array == top_base)[0].tolist()
+                    bottom_strand_coordinates = np.where(seq_array == bottom_base)[0].tolist()
+
+                    record_dict[record.id] = [sequence_length, top_strand_coordinates, bottom_strand_coordinates, sequence, complement]
+
+                else:
+                    raise ValueError(f"Invalid modification_type: {modification_type}. Choose '5mC', '6mA', or 'unconverted'.")
+
+    return record_dict
+
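Each entry of the returned dict bundles length, per-strand coordinates, and both sequences, so callers unpack it positionally; a consuming sketch (hypothetical reference):

    sites = find_conversion_sites(
        "refs/genome_converted.fa",
        modification_type="5mC",
        conversions=["unconverted", "5mC"],
    )
    for record_id, (length, top_coords, bottom_coords, seq, comp) in sites.items():
        # top_coords: 0-based positions of 'C' on the top strand;
        # bottom_coords: positions of 'G', i.e. bottom-strand cytosines.
        print(record_id, length, len(top_coords), len(bottom_coords))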
+def subsample_fasta_from_bed(
+    input_FASTA: str | Path,
+    input_bed: str | Path,
+    output_directory: str | Path,
+    output_FASTA: str | Path
+) -> None:
+    """
+    Take a genome-wide FASTA file and a BED file containing
+    coordinate windows of interest. Outputs a subsampled FASTA.
+    """
+
+    # Normalize everything to Path
+    input_FASTA = Path(input_FASTA)
+    input_bed = Path(input_bed)
+    output_directory = Path(output_directory)
+    output_FASTA = Path(output_FASTA)
+
+    # Ensure output directory exists
+    output_directory.mkdir(parents=True, exist_ok=True)
+
+    output_FASTA_path = output_directory / output_FASTA
+
+    # Load the FASTA file using pyfaidx
+    fasta = Fasta(str(input_FASTA))  # pyfaidx requires string paths
+
+    # Open BED + output FASTA
+    with input_bed.open("r") as bed, output_FASTA_path.open("w") as out_fasta:
+        for line in bed:
+            fields = line.strip().split()
+            chrom = fields[0]
+            start = int(fields[1])  # BED is 0-based
+            end = int(fields[2])    # BED is 0-based and end is exclusive
+            desc = " ".join(fields[3:]) if len(fields) > 3 else ""
+
+            if chrom not in fasta:
+                print(f"Warning: {chrom} not found in FASTA")
+                continue
+
+            # pyfaidx is 1-based indexing internally, but [start:end] works with BED coords
+            sequence = fasta[chrom][start:end].seq
+
+            header = f">{chrom}:{start}-{end}"
+            if desc:
+                header += f" {desc}"
+
+            out_fasta.write(f"{header}\n{sequence}\n")
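A closing usage sketch (hypothetical inputs); each BED interval becomes one output record whose header encodes its coordinates:

    from pathlib import Path

    # A windows.bed line such as: chr1<TAB>1000<TAB>2000<TAB>promoter
    subsample_fasta_from_bed(
        input_FASTA=Path("refs/genome.fa"),
        input_bed=Path("regions/windows.bed"),
        output_directory=Path("refs/subsampled"),
        output_FASTA=Path("windows.fa"),
    )
    # Writes '>chr1:1000-2000 promoter' followed by the 1 kb sequence.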