smftools 0.1.7__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +9 -4
- smftools/_version.py +1 -1
- smftools/cli.py +184 -0
- smftools/config/__init__.py +1 -0
- smftools/config/conversion.yaml +33 -0
- smftools/config/deaminase.yaml +56 -0
- smftools/config/default.yaml +253 -0
- smftools/config/direct.yaml +17 -0
- smftools/config/experiment_config.py +1191 -0
- smftools/hmm/HMM.py +1576 -0
- smftools/hmm/__init__.py +20 -0
- smftools/{tools → hmm}/apply_hmm_batched.py +8 -7
- smftools/hmm/call_hmm_peaks.py +106 -0
- smftools/{tools → hmm}/display_hmm.py +3 -3
- smftools/{tools → hmm}/nucleosome_hmm_refinement.py +2 -2
- smftools/{tools → hmm}/train_hmm.py +1 -1
- smftools/informatics/__init__.py +0 -2
- smftools/informatics/archived/deaminase_smf.py +132 -0
- smftools/informatics/fast5_to_pod5.py +4 -1
- smftools/informatics/helpers/__init__.py +3 -4
- smftools/informatics/helpers/align_and_sort_BAM.py +34 -7
- smftools/informatics/helpers/aligned_BAM_to_bed.py +35 -24
- smftools/informatics/helpers/binarize_converted_base_identities.py +116 -23
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py +365 -42
- smftools/informatics/helpers/converted_BAM_to_adata_II.py +165 -29
- smftools/informatics/helpers/discover_input_files.py +100 -0
- smftools/informatics/helpers/extract_base_identities.py +29 -3
- smftools/informatics/helpers/extract_read_features_from_bam.py +4 -2
- smftools/informatics/helpers/find_conversion_sites.py +5 -4
- smftools/informatics/helpers/modkit_extract_to_adata.py +6 -3
- smftools/informatics/helpers/plot_bed_histograms.py +269 -0
- smftools/informatics/helpers/separate_bam_by_bc.py +2 -2
- smftools/informatics/helpers/split_and_index_BAM.py +1 -5
- smftools/load_adata.py +1346 -0
- smftools/machine_learning/__init__.py +12 -0
- smftools/machine_learning/data/__init__.py +2 -0
- smftools/machine_learning/data/anndata_data_module.py +234 -0
- smftools/machine_learning/evaluation/__init__.py +2 -0
- smftools/machine_learning/evaluation/eval_utils.py +31 -0
- smftools/machine_learning/evaluation/evaluators.py +223 -0
- smftools/machine_learning/inference/__init__.py +3 -0
- smftools/machine_learning/inference/inference_utils.py +27 -0
- smftools/machine_learning/inference/lightning_inference.py +68 -0
- smftools/machine_learning/inference/sklearn_inference.py +55 -0
- smftools/machine_learning/inference/sliding_window_inference.py +114 -0
- smftools/machine_learning/models/base.py +295 -0
- smftools/machine_learning/models/cnn.py +138 -0
- smftools/machine_learning/models/lightning_base.py +345 -0
- smftools/machine_learning/models/mlp.py +26 -0
- smftools/{tools → machine_learning}/models/positional.py +3 -2
- smftools/{tools → machine_learning}/models/rnn.py +2 -1
- smftools/machine_learning/models/sklearn_models.py +273 -0
- smftools/machine_learning/models/transformer.py +303 -0
- smftools/machine_learning/training/__init__.py +2 -0
- smftools/machine_learning/training/train_lightning_model.py +135 -0
- smftools/machine_learning/training/train_sklearn_model.py +114 -0
- smftools/plotting/__init__.py +4 -1
- smftools/plotting/autocorrelation_plotting.py +611 -0
- smftools/plotting/general_plotting.py +566 -89
- smftools/plotting/hmm_plotting.py +260 -0
- smftools/plotting/qc_plotting.py +270 -0
- smftools/preprocessing/__init__.py +13 -8
- smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
- smftools/preprocessing/append_base_context.py +122 -0
- smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
- smftools/preprocessing/calculate_complexity_II.py +248 -0
- smftools/preprocessing/calculate_coverage.py +10 -1
- smftools/preprocessing/calculate_read_modification_stats.py +101 -0
- smftools/preprocessing/clean_NaN.py +17 -1
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
- smftools/preprocessing/flag_duplicate_reads.py +1326 -124
- smftools/preprocessing/invert_adata.py +12 -5
- smftools/preprocessing/load_sample_sheet.py +19 -4
- smftools/readwrite.py +849 -43
- smftools/tools/__init__.py +3 -32
- smftools/tools/calculate_umap.py +5 -5
- smftools/tools/general_tools.py +3 -3
- smftools/tools/position_stats.py +468 -106
- smftools/tools/read_stats.py +115 -1
- smftools/tools/spatial_autocorrelation.py +562 -0
- {smftools-0.1.7.dist-info → smftools-0.2.1.dist-info}/METADATA +5 -1
- smftools-0.2.1.dist-info/RECORD +161 -0
- smftools-0.2.1.dist-info/entry_points.txt +2 -0
- smftools/informatics/helpers/LoadExperimentConfig.py +0 -75
- smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +0 -53
- smftools/informatics/load_adata.py +0 -182
- smftools/preprocessing/append_C_context.py +0 -82
- smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -94
- smftools/preprocessing/filter_converted_reads_on_methylation.py +0 -44
- smftools/preprocessing/filter_reads_on_length.py +0 -51
- smftools/tools/call_hmm_peaks.py +0 -105
- smftools/tools/data/__init__.py +0 -2
- smftools/tools/data/anndata_data_module.py +0 -90
- smftools/tools/evaluation/__init__.py +0 -0
- smftools/tools/inference/__init__.py +0 -1
- smftools/tools/inference/lightning_inference.py +0 -41
- smftools/tools/models/base.py +0 -14
- smftools/tools/models/cnn.py +0 -34
- smftools/tools/models/lightning_base.py +0 -41
- smftools/tools/models/mlp.py +0 -17
- smftools/tools/models/sklearn_models.py +0 -40
- smftools/tools/models/transformer.py +0 -133
- smftools/tools/training/__init__.py +0 -1
- smftools/tools/training/train_lightning_model.py +0 -47
- smftools-0.1.7.dist-info/RECORD +0 -136
- /smftools/{tools → hmm}/calculate_distances.py +0 -0
- /smftools/{tools → hmm}/hmm_readwrite.py +0 -0
- /smftools/informatics/{conversion_smf.py → archived/conversion_smf.py} +0 -0
- /smftools/informatics/{direct_smf.py → archived/direct_smf.py} +0 -0
- /smftools/{tools → machine_learning}/data/preprocessing.py +0 -0
- /smftools/{tools → machine_learning}/models/__init__.py +0 -0
- /smftools/{tools → machine_learning}/models/wrappers.py +0 -0
- /smftools/{tools → machine_learning}/utils/__init__.py +0 -0
- /smftools/{tools → machine_learning}/utils/device.py +0 -0
- /smftools/{tools → machine_learning}/utils/grl.py +0 -0
- /smftools/tools/{apply_hmm.py → archived/apply_hmm.py} +0 -0
- /smftools/tools/{classifiers.py → archived/classifiers.py} +0 -0
- {smftools-0.1.7.dist-info → smftools-0.2.1.dist-info}/WHEEL +0 -0
- {smftools-0.1.7.dist-info → smftools-0.2.1.dist-info}/licenses/LICENSE +0 -0
smftools/hmm/__init__.py
ADDED
@@ -0,0 +1,20 @@
+from .apply_hmm_batched import apply_hmm_batched
+from .calculate_distances import calculate_distances
+from .call_hmm_peaks import call_hmm_peaks
+from .display_hmm import display_hmm
+from .hmm_readwrite import load_hmm, save_hmm
+from .nucleosome_hmm_refinement import refine_nucleosome_calls, infer_nucleosomes_in_large_bound
+from .train_hmm import train_hmm
+
+
+__all__ = [
+    "apply_hmm_batched",
+    "calculate_distances",
+    "call_hmm_peaks",
+    "display_hmm",
+    "load_hmm",
+    "refine_nucleosome_calls",
+    "infer_nucleosomes_in_large_bound",
+    "save_hmm",
+    "train_hmm"
+]
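The HMM utilities that lived under smftools.tools in 0.1.7 are now grouped in a dedicated smftools.hmm subpackage. A minimal import sketch, assuming smftools 0.2.1 is installed; the names come from the __all__ above, the comments are editorial:

# Hypothetical usage sketch; only the import paths come from this diff.
from smftools.hmm import (
    apply_hmm_batched,   # batched HMM annotation of an AnnData object
    call_hmm_peaks,      # peak calling on HMM-derived feature layers
    load_hmm,            # deserialize a saved model
    save_hmm,            # serialize a trained model
    train_hmm,           # fit a 2-state DenseHMM on binary data
)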
smftools/{tools → hmm}/apply_hmm_batched.py
RENAMED
@@ -3,14 +3,11 @@ import pandas as pd
 import torch
 from tqdm import tqdm
 
-def apply_hmm_batched(adata, model, obs_column, layer=None, footprints=True, accessible_patches=False, cpg=False, methbases=["GpC", "CpG", "A"], device="cpu", threshold=0.7):
+def apply_hmm_batched(adata, model, obs_column, layer=None, footprints=True, accessible_patches=False, cpg=False, methbases=["GpC", "CpG", "A", "C"], device="cpu", threshold=0.7, deaminase_footprinting=False):
     """
     Applies an HMM model to an AnnData object using tensor-based sequence inputs.
     If multiple methbases are passed, generates a combined feature set.
     """
-    import numpy as np
-    import torch
-    from tqdm import tqdm
 
     model.to(device)
 
@@ -74,6 +71,7 @@ def apply_hmm_batched(adata, model, obs_column, layer=None, footprints=True, acc
     for methbase in methbases:
         mask = {
             "a": ref_subset.var[f"{ref}_strand_FASTA_base"] == "A",
+            "c": ref_subset.var[f"{ref}_any_C_site"] == True,
             "gpc": ref_subset.var[f"{ref}_GpC_site"] == True,
             "cpg": ref_subset.var[f"{ref}_CpG_site"] == True
         }[methbase.lower()]
@@ -150,6 +148,8 @@ def apply_hmm_batched(adata, model, obs_column, layer=None, footprints=True, acc
             adata.obs.at[idx, f"CpG_all_cpg_features"].append([start, length, prob])
 
     # --- Binarization + Distance ---
+    coordinates = adata.var_names.astype(int).values
+
     for feature in tqdm(all_features, desc="Finalizing Layers"):
         bin_matrix = np.zeros((adata.shape[0], adata.shape[1]), dtype=int)
         counts = np.zeros(adata.shape[0], dtype=int)
@@ -158,9 +158,11 @@ def apply_hmm_batched(adata, model, obs_column, layer=None, footprints=True, acc
             intervals = []
         for start, length, prob in intervals:
             if prob > threshold:
-
+                start_idx = np.searchsorted(coordinates, start, side="left")
+                end_idx = np.searchsorted(coordinates, start + length - 1, side="right")
+                bin_matrix[row_idx, start_idx:end_idx] = 1
                 counts[row_idx] += 1
-        adata.layers[
+        adata.layers[feature] = bin_matrix
         adata.obs[f"n_{feature}"] = counts
         adata.obs[f"{feature}_distances"] = calculate_batch_distances(adata.obs[feature].tolist(), threshold)
 
@@ -202,7 +204,6 @@ def classify_batch(predicted_states_batch, probabilities_batch, coordinates, cla
     Returns:
         List of classifications for each sequence.
     """
-    import numpy as np
 
     state_labels = ["Non-Methylated", "Methylated"]
     target_idx = state_labels.index(target_state)
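The added searchsorted lines are the substantive part of this hunk: each (start, length, probability) interval is mapped onto the integer var_names grid and painted into the binary layer. A self-contained sketch of that mapping, with toy coordinates rather than package code:

import numpy as np

coordinates = np.array([100, 105, 110, 115, 120, 125])  # adata.var_names as ints
bin_row = np.zeros(coordinates.size, dtype=int)

start, length = 104, 12  # a feature spanning genomic positions 104..115
start_idx = np.searchsorted(coordinates, start, side="left")
end_idx = np.searchsorted(coordinates, start + length - 1, side="right")
bin_row[start_idx:end_idx] = 1

print(bin_row)  # [0 1 1 1 0 0]: columns 105, 110, 115 fall inside the interval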
smftools/hmm/call_hmm_peaks.py
ADDED
@@ -0,0 +1,106 @@
+def call_hmm_peaks(
+    adata,
+    feature_configs,
+    obs_column='Reference_strand',
+    site_types=['GpC_site', 'CpG_site'],
+    save_plot=False,
+    output_dir=None,
+    date_tag=None,
+    inplace=False
+):
+    import numpy as np
+    import pandas as pd
+    import matplotlib.pyplot as plt
+    from scipy.signal import find_peaks
+
+    if not inplace:
+        adata = adata.copy()
+
+    # Ensure obs_column is categorical
+    if not isinstance(adata.obs[obs_column].dtype, pd.CategoricalDtype):
+        adata.obs[obs_column] = pd.Categorical(adata.obs[obs_column])
+
+    coordinates = adata.var_names.astype(int).values
+    peak_columns = []
+
+    obs_updates = {}
+
+    for feature_layer, config in feature_configs.items():
+        min_distance = config.get('min_distance', 200)
+        peak_width = config.get('peak_width', 200)
+        peak_prominence = config.get('peak_prominence', 0.2)
+        peak_threshold = config.get('peak_threshold', 0.8)
+
+        matrix = adata.layers[feature_layer]
+        means = np.mean(matrix, axis=0)
+        peak_indices, _ = find_peaks(means, prominence=peak_prominence, distance=min_distance)
+        peak_centers = coordinates[peak_indices]
+        adata.uns[f'{feature_layer} peak_centers'] = peak_centers.tolist()
+
+        # Plot
+        plt.figure(figsize=(6, 3))
+        plt.plot(coordinates, means)
+        plt.title(f"{feature_layer} with peak calls")
+        plt.xlabel("Genomic position")
+        plt.ylabel("Mean intensity")
+        for i, center in enumerate(peak_centers):
+            start, end = center - peak_width // 2, center + peak_width // 2
+            plt.axvspan(start, end, color='purple', alpha=0.2)
+            plt.axvline(center, color='red', linestyle='--')
+            aligned = [end if i % 2 else start, 'left' if i % 2 else 'right']
+            plt.text(aligned[0], 0, f"Peak {i}\n{center}", color='red', ha=aligned[1])
+        if save_plot and output_dir:
+            filename = f"{output_dir}/{date_tag or 'output'}_{feature_layer}_peaks.png"
+            plt.savefig(filename, bbox_inches='tight')
+            print(f"Saved plot to {filename}")
+        else:
+            plt.show()
+
+        feature_peak_columns = []
+        for center in peak_centers:
+            start, end = center - peak_width // 2, center + peak_width // 2
+            colname = f'{feature_layer}_peak_{center}'
+            peak_columns.append(colname)
+            feature_peak_columns.append(colname)
+
+            peak_mask = (coordinates >= start) & (coordinates <= end)
+            adata.var[colname] = peak_mask
+
+            region = matrix[:, peak_mask]
+            obs_updates[f'mean_{feature_layer}_around_{center}'] = np.mean(region, axis=1)
+            obs_updates[f'sum_{feature_layer}_around_{center}'] = np.sum(region, axis=1)
+            obs_updates[f'{feature_layer}_present_at_{center}'] = np.mean(region, axis=1) > peak_threshold
+
+            for site_type in site_types:
+                adata.obs[f'{site_type}_sum_around_{center}'] = 0
+                adata.obs[f'{site_type}_mean_around_{center}'] = np.nan
+
+            for ref in adata.obs[obs_column].cat.categories:
+                ref_idx = adata.obs[obs_column] == ref
+                for site_type in site_types:
+                    mask_key = f"{ref}_{site_type}"
+                    if mask_key not in adata.var:
+                        continue
+                    site_mask = adata.var[mask_key].values
+                    site_coords = coordinates[site_mask]
+                    region_mask = (site_coords >= start) & (site_coords <= end)
+                    if not region_mask.any():
+                        continue
+                    full_mask = site_mask.copy()
+                    full_mask[site_mask] = region_mask
+                    site_region = adata[ref_idx, full_mask].X
+                    if hasattr(site_region, "A"):
+                        site_region = site_region.A
+                    if site_region.shape[1] > 0:
+                        adata.obs.loc[ref_idx, f'{site_type}_sum_around_{center}'] = np.nansum(site_region, axis=1)
+                        adata.obs.loc[ref_idx, f'{site_type}_mean_around_{center}'] = np.nanmean(site_region, axis=1)
+                    else:
+                        pass
+
+        adata.var[f'is_in_any_{feature_layer}_peak'] = adata.var[feature_peak_columns].any(axis=1)
+        print(f"Annotated {len(peak_centers)} peaks for {feature_layer}")
+
+    adata.var['is_in_any_peak'] = adata.var[peak_columns].any(axis=1)
+    adata.obs = pd.concat([adata.obs, pd.DataFrame(obs_updates, index=adata.obs.index)], axis=1)
+
+    return adata if not inplace else None
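call_hmm_peaks delegates the detection itself to scipy.signal.find_peaks on the column-wise mean of a feature layer, then maps peak indices back to genomic coordinates. A runnable toy version of that core step, using the default prominence (0.2) from the diff; note that find_peaks' distance argument counts var columns, not base pairs, so min_distance=200 assumes densely spaced columns:

import numpy as np
from scipy.signal import find_peaks

coordinates = np.arange(1000, 3000, 10)  # one var column per 10 bp
means = (np.exp(-0.5 * ((coordinates - 1500) / 60.0) ** 2)
         + 0.8 * np.exp(-0.5 * ((coordinates - 2400) / 60.0) ** 2))

# distance is in samples; 20 columns at 10 bp spacing ≈ 200 bp
peak_indices, _ = find_peaks(means, prominence=0.2, distance=20)
peak_centers = coordinates[peak_indices]
print(peak_centers)  # [1500 2400]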
smftools/{tools → hmm}/display_hmm.py
RENAMED
@@ -1,16 +1,16 @@
 def display_hmm(hmm, state_labels=["Non-Methylated", "Methylated"], obs_labels=["0", "1"]):
     import torch
-    print("\n
+    print("\n**HMM Model Overview**")
     print(hmm)
 
-    print("\n
+    print("\n**Transition Matrix**")
     transition_matrix = torch.exp(hmm.edges).detach().cpu().numpy()
     for i, row in enumerate(transition_matrix):
         label = state_labels[i] if state_labels else f"State {i}"
         formatted_row = ", ".join(f"{p:.6f}" for p in row)
         print(f"{label}: [{formatted_row}]")
 
-    print("\n
+    print("\n**Emission Probabilities**")
     for i, dist in enumerate(hmm.distributions):
         label = state_labels[i] if state_labels else f"State {i}"
         probs = dist.probs.detach().cpu().numpy()
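The torch.exp(hmm.edges) call works because pomegranate-style HMMs store transition weights as log-probabilities; exponentiating recovers a row-stochastic matrix. A toy check, independent of any trained model:

import torch

log_edges = torch.log(torch.tensor([[0.9, 0.1],
                                    [0.2, 0.8]]))
transition_matrix = torch.exp(log_edges).detach().cpu().numpy()
print(transition_matrix)              # recovers the original probabilities
print(transition_matrix.sum(axis=1))  # each row sums to 1.0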
smftools/{tools → hmm}/nucleosome_hmm_refinement.py
RENAMED
@@ -56,7 +56,7 @@ def refine_nucleosome_calls(adata, layer_name, nan_mask_layer, hexamer_size=120,
     adata.layers[f"{layer_name}_hexamers"] = hexamer_layer
     adata.layers[f"{layer_name}_octamers"] = octamer_layer
 
-    print(f"
+    print(f"Added layers: {layer_name}_hexamers and {layer_name}_octamers")
     return adata
 
 def infer_nucleosomes_in_large_bound(adata, large_bound_layer, combined_nuc_layer, nan_mask_layer, nuc_size=147, linker_size=50, exclusion_buffer=30, device="cpu"):
@@ -100,5 +100,5 @@ def infer_nucleosomes_in_large_bound(adata, large_bound_layer, combined_nuc_laye
            pos_cursor += 1
 
     adata.layers[f"{large_bound_layer}_phased_nucleosomes"] = inferred_layer
-    print(f"
+    print(f"Added layer: {large_bound_layer}_phased_nucleosomes")
     return adata
smftools/{tools → hmm}/train_hmm.py
RENAMED
@@ -11,7 +11,7 @@ def train_hmm(
     pad_value=0,
 ):
     """
-    Trains a 2-state DenseHMM model on binary methylation data.
+    Trains a 2-state DenseHMM model on binary methylation/deamination data.
 
     Parameters:
         data (list or np.ndarray): List of sequences (lists) with 0, 1, or NaN.
smftools/informatics/__init__.py
CHANGED
@@ -1,6 +1,5 @@
 from . import helpers
 from .basecall_pod5s import basecall_pod5s
-from .load_adata import load_adata
 from .subsample_fasta_from_bed import subsample_fasta_from_bed
 from .subsample_pod5 import subsample_pod5
 from .fast5_to_pod5 import fast5_to_pod5
@@ -8,7 +7,6 @@ from .fast5_to_pod5 import fast5_to_pod5
 
 __all__ = [
     "basecall_pod5s",
-    "load_adata",
     "subsample_fasta_from_bed",
     "subsample_pod5",
     "fast5_to_pod5",
smftools/informatics/archived/deaminase_smf.py
ADDED
@@ -0,0 +1,132 @@
+
+def deaminase_smf(fasta, output_directory, conversion_types, strands, model_dir, model, input_data_path, split_dir, barcode_kit, mapping_threshold, experiment_name, bam_suffix, basecall, barcode_both_ends, trim, device, make_bigwigs, threads, input_already_demuxed):
+    """
+    Processes sequencing data from a conversion SMF experiment to an adata object.
+
+    Parameters:
+        fasta (str): File path to the reference genome to align to.
+        output_directory (str): A file path to the directory to output all the analyses.
+        conversion_types (list): A list of strings of the conversion types to use in the analysis.
+        strands (list): A list of conversion strands to use in the experiment.
+        model_dir (str): A string representing the file path to the dorado basecalling model directory.
+        model (str): A string representing the dorado basecalling model.
+        input_data_path (str): A string representing the file path to the experiment directory/file containing sequencing data.
+        split_dir (str): A string representing the file path to the directory to split the BAMs into.
+        barcode_kit (str): A string representing the barcoding kit used in the experiment.
+        mapping_threshold (float): A value between 0 and 1 to threshold the minimal fraction of aligned reads which map to the reference region. References with values above the threshold are included in the output adata.
+        experiment_name (str): A string to provide an experiment name to the output adata file.
+        bam_suffix (str): A suffix to add to the bam file.
+        basecall (bool): Whether to run basecalling.
+        barcode_both_ends (bool): Whether to require barcode detection on both ends for demultiplexing.
+        trim (bool): Whether to trim barcodes, adapters, and primers from read ends.
+        device (str): Device to use for basecalling: auto, metal, cpu, or cuda.
+        make_bigwigs (bool): Whether to make bigwigs.
+        threads (int): CPU threads available for processing.
+        input_already_demuxed (bool): Whether the input files were already demultiplexed.
+
+    Returns:
+        final_adata (AnnData) and final_adata_path (str): The final adata object and its path.
+        sorted_output (str) and bam_files (list): The aligned, sorted BAM path and the demultiplexed BAM paths.
+    """
+    from .helpers import align_and_sort_BAM, aligned_BAM_to_bed, canoncall, converted_BAM_to_adata_II, generate_converted_FASTA, get_chromosome_lengths, demux_and_index_BAM, make_dirs, bam_qc, run_multiqc, split_and_index_BAM
+    import os
+    import shutil
+    import glob
+
+    if basecall:
+        model_basename = os.path.basename(model)
+        model_basename = model_basename.replace('.', '_')
+        bam = f"{output_directory}/{model_basename}_canonical_basecalls"
+    else:
+        bam_base = os.path.basename(input_data_path).split('.bam')[0]
+        bam = os.path.join(output_directory, bam_base)
+    aligned_BAM = f"{bam}_aligned"
+    aligned_sorted_BAM = f"{aligned_BAM}_sorted"
+
+    os.chdir(output_directory)
+
+    # 1) Convert FASTA file
+    fasta_basename = os.path.basename(fasta)
+    converted_FASTA_basename = fasta_basename.split('.fa')[0] + '_converted.fasta'
+    converted_FASTA = os.path.join(output_directory, converted_FASTA_basename)
+    if 'converted.fa' in fasta:
+        print(fasta + ' is already converted. Using existing converted FASTA.')
+        converted_FASTA = fasta
+    elif os.path.exists(converted_FASTA):
+        print(converted_FASTA + ' already exists. Using existing converted FASTA.')
+    else:
+        generate_converted_FASTA(fasta, conversion_types, strands, converted_FASTA)
+
+    # Make a FAI and .chrom.names file for the converted fasta
+    get_chromosome_lengths(converted_FASTA)
+
+    # 2) Basecall from the input POD5 to generate a singular output BAM
+    if basecall:
+        canoncall_output = bam + bam_suffix
+        if os.path.exists(canoncall_output):
+            print(canoncall_output + ' already exists. Using existing basecalled BAM.')
+        else:
+            canoncall(model_dir, model, input_data_path, barcode_kit, bam, bam_suffix, barcode_both_ends, trim, device)
+    else:
+        canoncall_output = input_data_path
+
+    # 3) Align the BAM to the reference FASTA and sort the BAM on positional coordinates. Also make an index and a bed file of mapped reads
+    aligned_output = aligned_BAM + bam_suffix
+    sorted_output = aligned_sorted_BAM + bam_suffix
+    if os.path.exists(aligned_output) and os.path.exists(sorted_output):
+        print(sorted_output + ' already exists. Using existing aligned/sorted BAM.')
+    else:
+        align_and_sort_BAM(converted_FASTA, canoncall_output, bam_suffix, output_directory, make_bigwigs, threads, deaminase_alignment=True)
+
+    # Make beds and provide basic histograms
+    bed_dir = os.path.join(output_directory, 'beds')
+    if os.path.isdir(bed_dir):
+        print(bed_dir + ' already exists. Skipping BAM -> BED conversion for ' + sorted_output)
+    else:
+        aligned_BAM_to_bed(aligned_output, output_directory, converted_FASTA, make_bigwigs, threads)
+
+    ### 4) Split the aligned and sorted BAM files by barcode (BC tag) into the split_BAM directory ###
+    if barcode_both_ends:
+        split_dir = split_dir + '_both_ends_barcoded'
+    else:
+        split_dir = split_dir + '_at_least_one_end_barcoded'
+
+    if os.path.isdir(split_dir):
+        print(split_dir + ' already exists. Using existing demultiplexed BAMs.')
+        bam_pattern = '*' + bam_suffix
+        bam_files = glob.glob(os.path.join(split_dir, bam_pattern))
+        bam_files = [bam for bam in bam_files if '.bai' not in bam and 'unclassified' not in bam]
+        bam_files.sort()
+    else:
+        make_dirs([split_dir])
+        if input_already_demuxed:
+            bam_files = split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, output_directory)  # custom for non-nanopore
+        else:
+            bam_files = demux_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit, barcode_both_ends, trim, fasta, make_bigwigs, threads)
+
+    # Make beds and provide basic histograms
+    bed_dir = os.path.join(split_dir, 'beds')
+    if os.path.isdir(bed_dir):
+        print(bed_dir + ' already exists. Skipping BAM -> BED conversion for demultiplexed bams')
+    else:
+        for bam in bam_files:
+            aligned_BAM_to_bed(bam, split_dir, converted_FASTA, make_bigwigs, threads)
+
+    # 5) Samtools QC metrics on split BAM files
+    bam_qc_dir = f"{split_dir}/bam_qc"
+    if os.path.isdir(bam_qc_dir):
+        print(bam_qc_dir + ' already exists. Using existing BAM QC calculations.')
+    else:
+        make_dirs([bam_qc_dir])
+        bam_qc(bam_files, bam_qc_dir, threads, modality='conversion')
+
+    # multiqc ###
+    if os.path.isdir(f"{split_dir}/multiqc"):
+        print(f"{split_dir}/multiqc" + ' already exists, skipping multiqc')
+    else:
+        run_multiqc(split_dir, f"{split_dir}/multiqc")
+
+    # 6) Take the converted BAM and load it into an adata object.
+    final_adata, final_adata_path = converted_BAM_to_adata_II(converted_FASTA, split_dir, mapping_threshold, experiment_name, conversion_types, bam_suffix, device, deaminase_footprinting=True)
+
+    return final_adata, final_adata_path, sorted_output, bam_files
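Step 1's reuse logic keys entirely off the file name: a reference whose path contains 'converted.fa' is treated as already converted, otherwise a sibling *_converted.fasta path is derived in the output directory. A runnable sketch of that naming rule (the helper function is hypothetical, not package code):

import os

def converted_fasta_path(fasta, output_directory):
    # Mirrors the naming logic in deaminase_smf step 1
    if 'converted.fa' in fasta:
        return fasta  # input is already a converted reference
    fasta_basename = os.path.basename(fasta)
    converted_basename = fasta_basename.split('.fa')[0] + '_converted.fasta'
    return os.path.join(output_directory, converted_basename)

print(converted_fasta_path('/refs/genome.fasta', '/out'))            # /out/genome_converted.fasta
print(converted_fasta_path('/refs/genome_converted.fasta', '/out'))  # /refs/genome_converted.fasta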
smftools/informatics/fast5_to_pod5.py
CHANGED
@@ -15,7 +15,10 @@ def fast5_to_pod5(fast5_dir, output_pod5='FAST5s_to_POD5.pod5'):
     import subprocess
     from pathlib import Path
 
-    if
+    if isinstance(fast5_dir, (list, tuple)):
+        cmd = ["pod5", "convert", "fast5"] + fast5_dir + ["--output", output_pod5]
+        subprocess.run(cmd)
+    elif Path(fast5_dir).is_file():
         subprocess.run(["pod5", "convert", "fast5", fast5_dir, "--output", output_pod5])
     elif Path(fast5_dir).is_dir():
         subprocess.run(["pod5", "convert", "fast5", f".{fast5_dir}*.fast5", "--output", output_pod5])
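The new first branch splices a list of FAST5 paths directly into the pod5 argument vector, since the CLI accepts multiple inputs. A sketch that only assembles and prints the command; the paths are hypothetical, and actually running it requires the pod5 tool:

fast5_inputs = ["run1/reads_0.fast5", "run1/reads_1.fast5"]  # hypothetical paths
output_pod5 = "FAST5s_to_POD5.pod5"

cmd = ["pod5", "convert", "fast5"] + fast5_inputs + ["--output", output_pod5]
print(cmd)  # pass to subprocess.run(cmd) to execute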
smftools/informatics/helpers/__init__.py
CHANGED
@@ -9,6 +9,7 @@ from .converted_BAM_to_adata_II import converted_BAM_to_adata_II
 from .concatenate_fastqs_to_bam import concatenate_fastqs_to_bam
 from .count_aligned_reads import count_aligned_reads
 from .demux_and_index_BAM import demux_and_index_BAM
+from .discover_input_files import *
 from .extract_base_identities import extract_base_identities
 from .extract_mods import extract_mods
 from .extract_read_features_from_bam import extract_read_features_from_bam
@@ -19,7 +20,6 @@ from .generate_converted_FASTA import convert_FASTA_record, generate_converted_F
 from .get_chromosome_lengths import get_chromosome_lengths
 from .get_native_references import get_native_references
 from .index_fasta import index_fasta
-from .LoadExperimentConfig import LoadExperimentConfig
 from .make_dirs import make_dirs
 from .make_modbed import make_modbed
 from .modcall import modcall
@@ -29,7 +29,7 @@ from .one_hot_encode import one_hot_encode
 from .ohe_batching import ohe_batching
 from .one_hot_decode import one_hot_decode
 from .ohe_layers_decode import ohe_layers_decode
-from .
+from .plot_bed_histograms import plot_bed_histograms
 from .run_multiqc import run_multiqc
 from .separate_bam_by_bc import separate_bam_by_bc
 from .split_and_index_BAM import split_and_index_BAM
@@ -57,7 +57,6 @@ __all__ = [
     "get_chromosome_lengths",
     "get_native_references",
     "index_fasta",
-    "LoadExperimentConfig",
     "make_dirs",
     "make_modbed",
     "modcall",
@@ -67,7 +66,7 @@ __all__ = [
     "ohe_batching",
     "one_hot_decode",
     "ohe_layers_decode",
-    "
+    "plot_bed_histograms",
     "run_multiqc",
     "separate_bam_by_bc",
     "split_and_index_BAM"
smftools/informatics/helpers/align_and_sort_BAM.py
CHANGED
@@ -1,6 +1,13 @@
 ## align_and_sort_BAM
 
-def align_and_sort_BAM(fasta,
+def align_and_sort_BAM(fasta,
+                       input,
+                       bam_suffix='.bam',
+                       output_directory='aligned_outputs',
+                       make_bigwigs=False,
+                       threads=None,
+                       aligner='minimap2',
+                       aligner_args=['-a', '-x', 'map-ont', '--MD', '-Y', '-y', '-N', '5', '--secondary=no']):
     """
     A wrapper for running dorado aligner and samtools functions
 
@@ -11,6 +18,8 @@ def align_and_sort_BAM(fasta, input, bam_suffix='.bam', output_directory='aligne
         output_directory (str): A file path to the directory to output all the analyses.
         make_bigwigs (bool): Whether to make bigwigs
         threads (int): Number of additional threads to use
+        aligner (str): Aligner to use. minimap2 and dorado options
+        aligner_args (list): list of optional parameters to use for the alignment
 
     Returns:
         None
@@ -21,6 +30,7 @@ def align_and_sort_BAM(fasta, input, bam_suffix='.bam', output_directory='aligne
 
     input_basename = os.path.basename(input)
     input_suffix = '.' + input_basename.split('.')[1]
+    input_as_fastq = input_basename.split('.')[0] + '.fastq'
 
     output_path_minus_suffix = os.path.join(output_directory, input_basename.split(input_suffix)[0])
 
@@ -34,13 +44,30 @@ def align_and_sort_BAM(fasta, input, bam_suffix='.bam', output_directory='aligne
     else:
         pass
 
-
-
-
-
+    if aligner == 'minimap2':
+        print(f"Converting BAM to FASTQ: {input}")
+        bam_to_fastq_command = ['samtools', 'fastq', input]
+        subprocess.run(bam_to_fastq_command, stdout=open(input_as_fastq, "w"))
+        print(f"Aligning FASTQ to Reference: {input_as_fastq}")
+        if threads:
+            minimap_command = ['minimap2'] + aligner_args + ['-t', threads, fasta, input_as_fastq]
+        else:
+            minimap_command = ['minimap2'] + aligner_args + [fasta, input_as_fastq]
+        subprocess.run(minimap_command, stdout=open(aligned_output, "w"))
+        os.remove(input_as_fastq)
+
+    elif aligner == 'dorado':
+        # Run dorado aligner
+        print(f"Aligning BAM to Reference: {input}")
+        if threads:
+            alignment_command = ["dorado", "aligner", "-t", threads] + aligner_args + [fasta, input]
+        else:
+            alignment_command = ["dorado", "aligner"] + aligner_args + [fasta, input]
+        subprocess.run(alignment_command, stdout=open(aligned_output, "w"))
+
     else:
-
-
+        print(f'Aligner not recognized: {aligner}. Choose from minimap2 and dorado')
+        return
 
     # Sort the BAM on positional coordinates
     print(f"Sorting BAM: {aligned_output}")
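A sketch of how the minimap2 invocation above is assembled (the helper is hypothetical, not package code). One caveat worth flagging: subprocess.run requires string arguments, and the diff passes the integer threads value through unchanged, so a str() cast is safer:

DEFAULT_ARGS = ['-a', '-x', 'map-ont', '--MD', '-Y', '-y', '-N', '5', '--secondary=no']

def build_minimap2_command(fasta, fastq, threads=None, aligner_args=DEFAULT_ARGS):
    # Assemble the argument vector the same way the diff does
    cmd = ['minimap2'] + list(aligner_args)
    if threads:
        cmd += ['-t', str(threads)]  # str() avoids a TypeError in subprocess
    return cmd + [fasta, fastq]

print(build_minimap2_command('ref_converted.fasta', 'reads.fastq', threads=8))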
smftools/informatics/helpers/aligned_BAM_to_bed.py
CHANGED
@@ -1,7 +1,7 @@
 def aligned_BAM_to_bed(aligned_BAM, out_dir, fasta, make_bigwigs, threads=None):
     """
     Takes an aligned BAM as input and writes a BED file of reads as output.
-    Bed columns are: Record name, start position, end position, read length, read name.
+    Bed columns are: Record name, start position, end position, read length, read name, mapping quality, read quality.
 
     Parameters:
         aligned_BAM (str): Path to an input aligned_BAM to extract to a BED file.
@@ -15,11 +15,13 @@ def aligned_BAM_to_bed(aligned_BAM, out_dir, fasta, make_bigwigs, threads=None):
     """
     import subprocess
     import os
+    import pysam
+    import numpy as np
     import concurrent.futures
     from concurrent.futures import ProcessPoolExecutor
     from .bed_to_bigwig import bed_to_bigwig
     from . import make_dirs
-    from .
+    from .plot_bed_histograms import plot_bed_histograms
 
     threads = threads or os.cpu_count()  # Use max available cores if not specified
 
@@ -30,45 +32,54 @@ def aligned_BAM_to_bed(aligned_BAM, out_dir, fasta, make_bigwigs, threads=None):
 
     bed_output = os.path.join(bed_dir, os.path.basename(aligned_BAM).replace(".bam", "_bed.bed"))
 
-    print(f"Creating BED from BAM: {aligned_BAM}
+    print(f"Creating BED-like file from BAM (with MAPQ and avg base quality): {aligned_BAM}")
 
-
-
-
-
-
-
-
+    with pysam.AlignmentFile(aligned_BAM, "rb") as bam, open(bed_output, "w") as out:
+        for read in bam.fetch(until_eof=True):
+            if read.is_unmapped:
+                chrom = "*"
+                start1 = 1
+                rl = read.query_length or 0
+                mapq = 0
+            else:
+                chrom = bam.get_reference_name(read.reference_id)
+                # pysam reference_start is 0-based → +1 for 1-based SAM-like start
+                start1 = int(read.reference_start) + 1
+                rl = read.query_length or 0
+                mapq = int(read.mapping_quality)
 
-
-
-            samtools_view.wait()
+            # End position in 1-based inclusive coords
+            end1 = start1 + (rl or 0) - 1
 
-
+            qname = read.query_name
+            quals = read.query_qualities
+            if quals is None or rl == 0:
+                avg_q = float("nan")
+            else:
+                avg_q = float(np.mean(quals))
+
+            out.write(f"{chrom}\t{start1}\t{end1}\t{rl}\t{qname}\t{mapq}\t{avg_q:.3f}\n")
+
+    print(f"BED-like file created: {bed_output}")
 
     def split_bed(bed):
-        """Splits
+        """Splits into aligned and unaligned reads (chrom == '*')."""
         aligned = bed.replace(".bed", "_aligned.bed")
         unaligned = bed.replace(".bed", "_unaligned.bed")
-
         with open(bed, "r") as infile, open(aligned, "w") as aligned_out, open(unaligned, "w") as unaligned_out:
             for line in infile:
-                (unaligned_out if line.startswith("
-
+                (unaligned_out if line.startswith("*\t") else aligned_out).write(line)
         os.remove(bed)
         return aligned
 
-    print(f"Splitting
+    print(f"Splitting: {bed_output}")
    aligned_bed = split_bed(bed_output)
 
-    with ProcessPoolExecutor() as executor:
+    with ProcessPoolExecutor() as executor:
        futures = []
-        futures.append(executor.submit(
+        futures.append(executor.submit(plot_bed_histograms, aligned_bed, plotting_dir, fasta))
        if make_bigwigs:
            futures.append(executor.submit(bed_to_bigwig, fasta, aligned_bed))
-
-        # Wait for all tasks to complete
        concurrent.futures.wait(futures)
 
    print("Processing completed successfully.")
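The per-read arithmetic in the new pysam loop, condensed into a self-contained example: pysam's 0-based reference_start becomes a 1-based start, the inclusive end follows from the read length, and the mean base quality is reported to three decimals:

import numpy as np

reference_start = 1041  # as pysam reports it (0-based)
query_length = 250
query_qualities = np.random.default_rng(0).integers(10, 40, size=query_length)

start1 = reference_start + 1      # 1-based SAM-like start
end1 = start1 + query_length - 1  # 1-based inclusive end
avg_q = float(np.mean(query_qualities))

print(start1, end1, f"{avg_q:.3f}")  # 1042 1291 and the mean Phred score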