PyPI - smftools - Versions diffs - 0.1.3__py3-none-any.whl → 0.1.7__py3-none-any.whl - Mend

smftools 0.1.3__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (109) hide show

smftools/__init__.py +5 -1
smftools/_version.py +1 -1
smftools/informatics/__init__.py +2 -0
smftools/informatics/archived/print_bam_query_seq.py +29 -0
smftools/informatics/basecall_pod5s.py +80 -0
smftools/informatics/conversion_smf.py +63 -10
smftools/informatics/direct_smf.py +66 -18
smftools/informatics/helpers/LoadExperimentConfig.py +1 -0
smftools/informatics/helpers/__init__.py +16 -2
smftools/informatics/helpers/align_and_sort_BAM.py +27 -16
smftools/informatics/helpers/aligned_BAM_to_bed.py +49 -48
smftools/informatics/helpers/bam_qc.py +66 -0
smftools/informatics/helpers/binarize_converted_base_identities.py +69 -21
smftools/informatics/helpers/canoncall.py +12 -3
smftools/informatics/helpers/concatenate_fastqs_to_bam.py +5 -4
smftools/informatics/helpers/converted_BAM_to_adata.py +34 -22
smftools/informatics/helpers/converted_BAM_to_adata_II.py +369 -0
smftools/informatics/helpers/demux_and_index_BAM.py +52 -0
smftools/informatics/helpers/extract_base_identities.py +33 -46
smftools/informatics/helpers/extract_mods.py +55 -23
smftools/informatics/helpers/extract_read_features_from_bam.py +31 -0
smftools/informatics/helpers/extract_read_lengths_from_bed.py +25 -0
smftools/informatics/helpers/find_conversion_sites.py +33 -44
smftools/informatics/helpers/generate_converted_FASTA.py +87 -86
smftools/informatics/helpers/modcall.py +13 -5
smftools/informatics/helpers/modkit_extract_to_adata.py +762 -396
smftools/informatics/helpers/ohe_batching.py +65 -41
smftools/informatics/helpers/ohe_layers_decode.py +32 -0
smftools/informatics/helpers/one_hot_decode.py +27 -0
smftools/informatics/helpers/one_hot_encode.py +45 -9
smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +1 -0
smftools/informatics/helpers/run_multiqc.py +28 -0
smftools/informatics/helpers/split_and_index_BAM.py +3 -8
smftools/informatics/load_adata.py +58 -3
smftools/plotting/__init__.py +15 -0
smftools/plotting/classifiers.py +355 -0
smftools/plotting/general_plotting.py +205 -0
smftools/plotting/position_stats.py +462 -0
smftools/preprocessing/__init__.py +6 -7
smftools/preprocessing/append_C_context.py +22 -9
smftools/preprocessing/{mark_duplicates.py → archives/mark_duplicates.py} +38 -26
smftools/preprocessing/binarize_on_Youden.py +35 -32
smftools/preprocessing/binary_layers_to_ohe.py +13 -3
smftools/preprocessing/calculate_complexity.py +3 -2
smftools/preprocessing/calculate_converted_read_methylation_stats.py +44 -46
smftools/preprocessing/calculate_coverage.py +26 -25
smftools/preprocessing/calculate_pairwise_differences.py +49 -0
smftools/preprocessing/calculate_position_Youden.py +18 -7
smftools/preprocessing/calculate_read_length_stats.py +39 -46
smftools/preprocessing/clean_NaN.py +33 -25
smftools/preprocessing/filter_adata_by_nan_proportion.py +31 -0
smftools/preprocessing/filter_converted_reads_on_methylation.py +20 -5
smftools/preprocessing/filter_reads_on_length.py +14 -4
smftools/preprocessing/flag_duplicate_reads.py +149 -0
smftools/preprocessing/invert_adata.py +18 -11
smftools/preprocessing/load_sample_sheet.py +30 -16
smftools/preprocessing/recipes.py +22 -20
smftools/preprocessing/subsample_adata.py +58 -0
smftools/readwrite.py +105 -13
smftools/tools/__init__.py +49 -0
smftools/tools/apply_hmm.py +202 -0
smftools/tools/apply_hmm_batched.py +241 -0
smftools/tools/archived/classify_methylated_features.py +66 -0
smftools/tools/archived/classify_non_methylated_features.py +75 -0
smftools/tools/archived/subset_adata_v1.py +32 -0
smftools/tools/archived/subset_adata_v2.py +46 -0
smftools/tools/calculate_distances.py +18 -0
smftools/tools/calculate_umap.py +62 -0
smftools/tools/call_hmm_peaks.py +105 -0
smftools/tools/classifiers.py +787 -0
smftools/tools/cluster_adata_on_methylation.py +105 -0
smftools/tools/data/__init__.py +2 -0
smftools/tools/data/anndata_data_module.py +90 -0
smftools/tools/data/preprocessing.py +6 -0
smftools/tools/display_hmm.py +18 -0
smftools/tools/general_tools.py +69 -0
smftools/tools/hmm_readwrite.py +16 -0
smftools/tools/inference/__init__.py +1 -0
smftools/tools/inference/lightning_inference.py +41 -0
smftools/tools/models/__init__.py +9 -0
smftools/tools/models/base.py +14 -0
smftools/tools/models/cnn.py +34 -0
smftools/tools/models/lightning_base.py +41 -0
smftools/tools/models/mlp.py +17 -0
smftools/tools/models/positional.py +17 -0
smftools/tools/models/rnn.py +16 -0
smftools/tools/models/sklearn_models.py +40 -0
smftools/tools/models/transformer.py +133 -0
smftools/tools/models/wrappers.py +20 -0
smftools/tools/nucleosome_hmm_refinement.py +104 -0
smftools/tools/position_stats.py +239 -0
smftools/tools/read_stats.py +70 -0
smftools/tools/subset_adata.py +19 -23
smftools/tools/train_hmm.py +78 -0
smftools/tools/training/__init__.py +1 -0
smftools/tools/training/train_lightning_model.py +47 -0
smftools/tools/utils/__init__.py +2 -0
smftools/tools/utils/device.py +10 -0
smftools/tools/utils/grl.py +14 -0
{smftools-0.1.3.dist-info → smftools-0.1.7.dist-info}/METADATA +47 -11
smftools-0.1.7.dist-info/RECORD +136 -0
smftools/tools/apply_HMM.py +0 -1
smftools/tools/read_HMM.py +0 -1
smftools/tools/train_HMM.py +0 -43
smftools-0.1.3.dist-info/RECORD +0 -84
/smftools/preprocessing/{remove_duplicates.py → archives/remove_duplicates.py} +0 -0
/smftools/tools/{cluster.py → evaluation/__init__.py} +0 -0
{smftools-0.1.3.dist-info → smftools-0.1.7.dist-info}/WHEEL +0 -0
{smftools-0.1.3.dist-info → smftools-0.1.7.dist-info}/licenses/LICENSE +0 -0

smftools/__init__.py CHANGED Viewed

@@ -8,6 +8,7 @@ from . import preprocessing as pp
 from . import tools as tl
 from . import plotting as pl
 from . import readwrite, datasets
+from .readwrite import adata_to_df, safe_write_h5ad, merge_barcoded_anndatas
 from importlib.metadata import version
@@ -16,10 +17,13 @@ package_name = "smftools"
 __version__ = version(package_name)
 __all__ = [
+    "adata_to_df",
     "inform",
     "pp",
     "tl",
     "pl",
     "readwrite",
-    "datasets"
+    "datasets",
+    "safe_write_h5ad",
+    "merge_barcoded_anndatas"
 ]

smftools/_version.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = "0.1.3"
1	+ __version__ = "0.1.7"

smftools/informatics/__init__.py CHANGED Viewed

@@ -1,4 +1,5 @@
 from . import helpers
+from .basecall_pod5s import basecall_pod5s
 from .load_adata import load_adata
 from .subsample_fasta_from_bed import subsample_fasta_from_bed
 from .subsample_pod5 import subsample_pod5
@@ -6,6 +7,7 @@ from .fast5_to_pod5 import fast5_to_pod5
 __all__ = [
+    "basecall_pod5s",
     "load_adata",
     "subsample_fasta_from_bed",
     "subsample_pod5",

smftools/informatics/archived/print_bam_query_seq.py ADDED Viewed

@@ -0,0 +1,29 @@
+import pysam
+import sys
+def extract_reads(bam_file_path, num_reads=10):
+    # Open the BAM file
+    bam_file = pysam.AlignmentFile(bam_file_path, "rb")
+    # Iterate through the first 'num_reads' reads and print the sequences
+    count = 0
+    for read in bam_file:
+        print(f"Read {count + 1}: {read.query_sequence}")
+        count += 1
+        if count >= num_reads:
+            break
+    # Close the BAM file
+    bam_file.close()
+if __name__ == "__main__":
+    # Ensure a BAM file path is provided as a command line argument
+    if len(sys.argv) < 2:
+        print("Usage: python extract_reads.py <path_to_bam_file>")
+        sys.exit(1)
+    # Get the BAM file path from command line arguments
+    bam_file_path = sys.argv[1]
+    # Call the function to extract the first 10 reads
+    extract_reads(bam_file_path)

smftools/informatics/basecall_pod5s.py ADDED Viewed

@@ -0,0 +1,80 @@
+# basecall_pod5s
+def basecall_pod5s(config_path):
+    """
+    Basecall from pod5s given a config file.
+    Parameters:
+        config_path (str): File path to the basecall configuration file
+    Returns:
+        None
+    """
+    # Lazy importing of packages
+    from .helpers import LoadExperimentConfig, make_dirs, canoncall, modcall
+    from .fast5_to_pod5 import fast5_to_pod5
+    import os
+    from pathlib import Path
+    # Default params
+    bam_suffix = '.bam' # If different, change from here.
+    # Load experiment config parameters into global variables
+    experiment_config = LoadExperimentConfig(config_path)
+    var_dict = experiment_config.var_dict
+    # These below variables will point to default_value if they are empty in the experiment_config.csv or if the variable is fully omitted from the csv.
+    default_value = None
+    # General config variable init
+    input_data_path = var_dict.get('input_data_path', default_value) # Path to a directory of POD5s/FAST5s or to a BAM/FASTQ file. Necessary.
+    output_directory = var_dict.get('output_directory', default_value) # Path to the output directory to make for the analysis. Necessary.
+    model = var_dict.get('model', default_value) # needed for dorado basecaller
+    barcode_kit = var_dict.get('barcode_kit', default_value) # needed for dorado basecaller
+    barcode_both_ends = var_dict.get('barcode_both_ends', default_value) # dorado demultiplexing
+    trim = var_dict.get('trim', default_value) # dorado adapter and barcode removal
+    device = var_dict.get('device', 'auto')
+    # Modified basecalling specific variable init
+    filter_threshold = var_dict.get('filter_threshold', default_value)
+    m6A_threshold = var_dict.get('m6A_threshold', default_value)
+    m5C_threshold = var_dict.get('m5C_threshold', default_value)
+    hm5C_threshold = var_dict.get('hm5C_threshold', default_value)
+    thresholds = [filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold]
+    mod_list = var_dict.get('mod_list', default_value)
+    # Make initial output directory
+    make_dirs([output_directory])
+    os.chdir(output_directory)
+    # Get the input filetype
+    if Path(input_data_path).is_file():
+        input_data_filetype = '.' + os.path.basename(input_data_path).split('.')[1].lower()
+        input_is_pod5 = input_data_filetype in ['.pod5','.p5']
+        input_is_fast5 = input_data_filetype in ['.fast5','.f5']
+    elif Path(input_data_path).is_dir():
+        # Get the file names in the input data dir
+        input_files = os.listdir(input_data_path)
+        input_is_pod5 = sum([True for file in input_files if '.pod5' in file or '.p5' in file])
+        input_is_fast5 = sum([True for file in input_files if '.fast5' in file or '.f5' in file])
+    # If the input files are not pod5 files, and they are fast5 files, convert the files to a pod5 file before proceeding.
+    if input_is_fast5 and not input_is_pod5:
+        # take the input directory of fast5 files and write out a single pod5 file into the output directory.
+        output_pod5 = os.path.join(output_directory, 'FAST5s_to_POD5.pod5')
+        print(f'Input directory contains fast5 files, converting them and concatenating into a single pod5 file in the {output_pod5}')
+        fast5_to_pod5(input_data_path, output_pod5)
+        # Reassign the pod5_dir variable to point to the new pod5 file.
+        input_data_path = output_pod5
+    model_basename = os.path.basename(model)
+    model_basename = model_basename.replace('.', '_')
+    if mod_list:
+        mod_string = "_".join(mod_list)
+        bam=f"{output_directory}/{model_basename}_{mod_string}_calls"
+        modcall(model, input_data_path, barcode_kit, mod_list, bam, bam_suffix, barcode_both_ends, trim, device)
+    else:
+        bam=f"{output_directory}/{model_basename}_canonical_basecalls"
+        canoncall(model, input_data_path, barcode_kit, bam, bam_suffix, barcode_both_ends, trim, device)

smftools/informatics/conversion_smf.py CHANGED Viewed

@@ -1,6 +1,6 @@
 ## conversion_smf
-def conversion_smf(fasta, output_directory, conversion_types, strands, model, input_data_path, split_dir, barcode_kit, mapping_threshold, experiment_name, bam_suffix, basecall):
+def conversion_smf(fasta, output_directory, conversion_types, strands, model_dir, model, input_data_path, split_dir, barcode_kit, mapping_threshold, experiment_name, bam_suffix, basecall, barcode_both_ends, trim, device, make_bigwigs, threads, input_already_demuxed):
     """
     Processes sequencing data from a conversion SMF experiment to an adata object.
@@ -9,7 +9,8 @@ def conversion_smf(fasta, output_directory, conversion_types, strands, model, in
         output_directory (str): A file path to the directory to output all the analyses.
         conversion_type (list): A list of strings of the conversion types to use in the analysis.
         strands (list): A list of converstion strands to use in the experiment.
-        model (str): a string representing the file path to the dorado basecalling model.
+        model_dir (str): a string representing the file path to the dorado basecalling model directory.
+        model (str): a string representing the dorado basecalling model.
         input_data_path (str): a string representing the file path to the experiment directory/file containing sequencing data
         split_dir (str): A string representing the file path to the directory to split the BAMs into.
         barcode_kit (str): A string representing the barcoding kit used in the experiment.
@@ -17,12 +18,21 @@ def conversion_smf(fasta, output_directory, conversion_types, strands, model, in
         experiment_name (str): A string to provide an experiment name to the output adata file.
         bam_suffix (str): A suffix to add to the bam file.
         basecall (bool): Whether to go through basecalling or not.
+        barcode_both_ends (bool): Whether to require a barcode detection on both ends for demultiplexing.
+        trim (bool): Whether to trim barcodes, adapters, and primers from read ends.
+        device (str): Device to use for basecalling. auto, metal, cpu, cuda
+        make_bigwigs (bool): Whether to make bigwigs
+        threads (int): cpu threads available for processing.
+        input_already_demuxed (bool): Whether the input files were already demultiplexed
     Returns:
-        None
+        final_adata_path (str): Path to the final adata object
+        sorted_output (str): Path to the aligned, sorted BAM
     """
-    from .helpers import align_and_sort_BAM, canoncall, converted_BAM_to_adata, generate_converted_FASTA, get_chromosome_lengths, split_and_index_BAM, make_dirs
+    from .helpers import align_and_sort_BAM, aligned_BAM_to_bed, canoncall, converted_BAM_to_adata_II, generate_converted_FASTA, get_chromosome_lengths, demux_and_index_BAM, make_dirs, bam_qc, run_multiqc, split_and_index_BAM
     import os
+    import glob
     if basecall:
         model_basename = os.path.basename(model)
         model_basename = model_basename.replace('.', '_')
@@ -56,7 +66,7 @@ def conversion_smf(fasta, output_directory, conversion_types, strands, model, in
         if os.path.exists(canoncall_output):
             print(canoncall_output + ' already exists. Using existing basecalled BAM.')
         else:
-            canoncall(model, input_data_path, barcode_kit, bam, bam_suffix)
+            canoncall(model_dir, model, input_data_path, barcode_kit, bam, bam_suffix, barcode_both_ends, trim, device)
     else:
         canoncall_output = input_data_path
@@ -66,14 +76,57 @@ def conversion_smf(fasta, output_directory, conversion_types, strands, model, in
     if os.path.exists(aligned_output) and os.path.exists(sorted_output):
         print(sorted_output + ' already exists. Using existing aligned/sorted BAM.')
     else:
-        align_and_sort_BAM(converted_FASTA, canoncall_output, bam_suffix, output_directory)
+        align_and_sort_BAM(converted_FASTA, canoncall_output, bam_suffix, output_directory, make_bigwigs, threads)
+    # Make beds and provide basic histograms
+    bed_dir = os.path.join(output_directory, 'beds')
+    if os.path.isdir(bed_dir):
+        print(bed_dir + ' already exists. Skipping BAM -> BED conversion for ' + sorted_output)
+    else:
+        aligned_BAM_to_bed(aligned_output, output_directory, converted_FASTA, make_bigwigs, threads)
     ### 4) Split the aligned and sorted BAM files by barcode (BC Tag) into the split_BAM directory###
+    if barcode_both_ends:
+        split_dir = split_dir + '_both_ends_barcoded'
+    else:
+        split_dir = split_dir + '_at_least_one_end_barcoded'
     if os.path.isdir(split_dir):
-        print(split_dir + ' already exists. Using existing aligned/sorted/split BAMs.')
+        print(split_dir + ' already exists. Using existing demultiplexed BAMs.')
+        bam_pattern = '*' + bam_suffix
+        bam_files = glob.glob(os.path.join(split_dir, bam_pattern))
+        bam_files = [bam for bam in bam_files if '.bai' not in bam and 'unclassified' not in bam]
+        bam_files.sort()
     else:
         make_dirs([split_dir])
-        split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, output_directory, converted_FASTA)
+        if input_already_demuxed:
+            bam_files = split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, output_directory) # custom for non-nanopore
+        else:
+            bam_files = demux_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit, barcode_both_ends, trim, fasta, make_bigwigs, threads)
+    # Make beds and provide basic histograms
+    bed_dir = os.path.join(split_dir, 'beds')
+    if os.path.isdir(bed_dir):
+        print(bed_dir + ' already exists. Skipping BAM -> BED conversion for demultiplexed bams')
+    else:
+        for bam in bam_files:
+            aligned_BAM_to_bed(bam, split_dir, converted_FASTA, make_bigwigs, threads)
+    # 5) Samtools QC metrics on split BAM files
+    bam_qc_dir = f"{split_dir}/bam_qc"
+    if os.path.isdir(bam_qc_dir):
+        print(bam_qc_dir + ' already exists. Using existing BAM QC calculations.')
+    else:
+        make_dirs([bam_qc_dir])
+        bam_qc(bam_files, bam_qc_dir, threads, modality='conversion')
+    # multiqc ###
+    if os.path.isdir(f"{split_dir}/multiqc"):
+        print(f"{split_dir}/multiqc" + ' already exists, skipping multiqc')
+    else:
+        run_multiqc(split_dir, f"{split_dir}/multiqc")
+    # 6) Take the converted BAM and load it into an adata object.
+    final_adata, final_adata_path = converted_BAM_to_adata_II(converted_FASTA, split_dir, mapping_threshold, experiment_name, conversion_types, bam_suffix, device)
-    # 5) Take the converted BAM and load it into an adata object.
-    converted_BAM_to_adata(converted_FASTA, split_dir, mapping_threshold, experiment_name, conversion_types, bam_suffix)
+    return final_adata, final_adata_path, sorted_output, bam_files

smftools/informatics/direct_smf.py CHANGED Viewed

@@ -1,6 +1,6 @@
 ## direct_smf
-def direct_smf(fasta, output_directory, mod_list, model, thresholds, input_data_path, split_dir, barcode_kit, mapping_threshold, experiment_name, bam_suffix, batch_size, basecall):
+def direct_smf(fasta, output_directory, mod_list, model_dir, model, thresholds, input_data_path, split_dir, barcode_kit, mapping_threshold, experiment_name, bam_suffix, batch_size, basecall, barcode_both_ends, trim, device, make_bigwigs, skip_unclassified, delete_batch_hdfs, threads):
     """
     Processes sequencing data from a direct methylation detection Nanopore SMF experiment to an AnnData object.
@@ -8,7 +8,8 @@ def direct_smf(fasta, output_directory, mod_list, model, thresholds, input_data_
         fasta (str): File path to the reference genome to align to.
         output_directory (str): A file path to the directory to output all the analyses.
         mod_list (list): A list of strings of the modification types to use in the analysis.
-        model (str): a string representing the file path to the dorado basecalling model.
+        model_dir (str): a string representing the file path to the dorado basecalling model directory.
+        model (str): a string representing the dorado basecalling model.
         thresholds (list): A list of floats to pass for call thresholds.
         input_data_path (str): a string representing the file path to the experiment directory containing the input sequencing files.
         split_dir (str): A string representing the file path to the directory to split the BAMs into.
@@ -18,11 +19,19 @@ def direct_smf(fasta, output_directory, mod_list, model, thresholds, input_data_
         bam_suffix (str): A suffix to add to the bam file.
         batch_size (int): An integer number of TSV files to analyze in memory at once while loading the final adata object.
         basecall (bool): Whether to basecall
+        barcode_both_ends (bool): Whether to require a barcode detection on both ends for demultiplexing.
+        trim (bool): Whether to trim barcodes, adapters, and primers from read ends
+        device (str): Device to use for basecalling. auto, metal, cpu, cuda
+        make_bigwigs (bool): Whether to make bigwigs
+        skip_unclassified (bool): Whether to skip unclassified reads when extracting mods and loading anndata
+        delete_batch_hdfs (bool): Whether to delete intermediate hdf5 files.
+        threads (int): cpu threads available for processing.
     Returns:
-        None
+        final_adata_path (str): Path to the final adata object
+        sorted_output (str): Path to the aligned, sorted BAM
     """
-    from .helpers import align_and_sort_BAM, extract_mods, get_chromosome_lengths, make_modbed, modcall, modkit_extract_to_adata, modQC, split_and_index_BAM, make_dirs
+    from .helpers import align_and_sort_BAM, aligned_BAM_to_bed, extract_mods, get_chromosome_lengths, make_modbed, modcall, modkit_extract_to_adata, modQC, demux_and_index_BAM, make_dirs, bam_qc, run_multiqc
     import os
     if basecall:
@@ -35,8 +44,15 @@ def direct_smf(fasta, output_directory, mod_list, model, thresholds, input_data_
         bam=os.path.join(output_directory, bam_base)
     aligned_BAM=f"{bam}_aligned"
     aligned_sorted_BAM=f"{aligned_BAM}_sorted"
-    mod_bed_dir=f"{output_directory}/split_mod_beds"
-    mod_tsv_dir=f"{output_directory}/split_mod_tsvs"
+    if barcode_both_ends:
+        split_dir = split_dir + '_both_ends_barcoded'
+    else:
+        split_dir = split_dir + '_at_least_one_end_barcoded'
+    mod_bed_dir=f"{split_dir}/split_mod_beds"
+    mod_tsv_dir=f"{split_dir}/split_mod_tsvs"
+    bam_qc_dir = f"{split_dir}/bam_qc"
     aligned_sorted_output = aligned_sorted_BAM + bam_suffix
     mod_map = {'6mA': '6mA', '5mC_5hmC': '5mC'}
@@ -53,7 +69,7 @@ def direct_smf(fasta, output_directory, mod_list, model, thresholds, input_data_
         if os.path.exists(modcall_output):
             print(modcall_output + ' already exists. Using existing basecalled BAM.')
         else:
-            modcall(model, input_data_path, barcode_kit, mod_list, bam, bam_suffix)
+            modcall(model_dir, model, input_data_path, barcode_kit, mod_list, bam, bam_suffix, barcode_both_ends, trim, device)
     else:
         modcall_output = input_data_path
@@ -63,27 +79,59 @@ def direct_smf(fasta, output_directory, mod_list, model, thresholds, input_data_
     if os.path.exists(aligned_output) and os.path.exists(sorted_output):
         print(sorted_output + ' already exists. Using existing aligned/sorted BAM.')
     else:
-        align_and_sort_BAM(fasta, modcall_output, bam_suffix, output_directory)
+        align_and_sort_BAM(fasta, modcall_output, bam_suffix, output_directory, make_bigwigs, threads)
+    # Make beds and provide basic histograms
+    bed_dir = os.path.join(output_directory, 'beds')
+    if os.path.isdir(bed_dir):
+        print(bed_dir + ' already exists. Skipping BAM -> BED conversion for ' + sorted_output)
+    else:
+        aligned_BAM_to_bed(aligned_output, output_directory, fasta, make_bigwigs, threads)
     # 3) Split the aligned and sorted BAM files by barcode (BC Tag) into the split_BAM directory
     if os.path.isdir(split_dir):
-        print(split_dir + ' already exists. Using existing aligned/sorted/split BAMs.')
+        print(split_dir + ' already exists. Using existing demultiplexed BAMs.')
+        bam_files = os.listdir(split_dir)
+        bam_files = [os.path.join(split_dir, file) for file in bam_files if '.bam' in file and '.bai' not in file and 'unclassified' not in file]
+        bam_files.sort()
     else:
         make_dirs([split_dir])
-        split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, output_directory, fasta)
+        bam_files = demux_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit, barcode_both_ends, trim, fasta, make_bigwigs, threads)
+        # split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, output_directory, converted_FASTA) # deprecated, just use dorado demux
+    # Make beds and provide basic histograms
+    bed_dir = os.path.join(split_dir, 'beds')
+    if os.path.isdir(bed_dir):
+        print(bed_dir + ' already exists. Skipping BAM -> BED conversion for demultiplexed bams')
+    else:
+        for bam in bam_files:
+            aligned_BAM_to_bed(bam, split_dir, fasta, make_bigwigs, threads)
+    # 4) Samtools QC metrics on split BAM files
+    if os.path.isdir(bam_qc_dir):
+        print(bam_qc_dir + ' already exists. Using existing BAM QC calculations.')
+    else:
+        make_dirs([bam_qc_dir])
+        bam_qc(bam_files, bam_qc_dir, threads, modality='direct')
-    # 4) Using nanopore modkit to work with modified BAM files ###
+    # 5) Using nanopore modkit to work with modified BAM files ###
     if os.path.isdir(mod_bed_dir):
-        print(mod_bed_dir + ' already exists')
+        print(mod_bed_dir + ' already exists, skipping making modbeds')
     else:
         make_dirs([mod_bed_dir])
         modQC(aligned_sorted_output, thresholds) # get QC metrics for mod calls
         make_modbed(aligned_sorted_output, thresholds, mod_bed_dir) # Generate bed files of position methylation summaries for every sample
-    if os.path.isdir(mod_tsv_dir):
-        print(mod_tsv_dir + ' already exists')
+    # multiqc ###
+    if os.path.isdir(f"{split_dir}/multiqc"):
+        print(f"{split_dir}/multiqc" + ' already exists, skipping multiqc')
     else:
-        make_dirs([mod_tsv_dir])
-        extract_mods(thresholds, mod_tsv_dir, split_dir, bam_suffix) # Extract methylations calls for split BAM files into split TSV files
+        run_multiqc(split_dir, f"{split_dir}/multiqc")
+    make_dirs([mod_tsv_dir])
+    extract_mods(thresholds, mod_tsv_dir, split_dir, bam_suffix, skip_unclassified, threads) # Extract methylations calls for split BAM files into split TSV files
+    #6 Load the modification data from TSVs into an adata object
+    final_adata, final_adata_path = modkit_extract_to_adata(fasta, split_dir, mapping_threshold, experiment_name, mods, batch_size, mod_tsv_dir, delete_batch_hdfs, threads)
-    #5 Load the modification data from TSVs into an adata object
-    modkit_extract_to_adata(fasta, split_dir, mapping_threshold, experiment_name, mods, batch_size, mod_tsv_dir)
+    return final_adata, final_adata_path, sorted_output, bam_files

smftools/informatics/helpers/LoadExperimentConfig.py CHANGED Viewed

@@ -42,6 +42,7 @@ class LoadExperimentConfig:
     """
     def __init__(self, experiment_config):
         import pandas as pd
+        print(f"Loading experiment config from {experiment_config}")
         # Read the CSV into a pandas DataFrame
         df = pd.read_csv(experiment_config)
         # Initialize an empty dictionary to store variables

smftools/informatics/helpers/__init__.py CHANGED Viewed

@@ -1,14 +1,18 @@
 from .align_and_sort_BAM import align_and_sort_BAM
 from .aligned_BAM_to_bed import aligned_BAM_to_bed
+from .bam_qc import bam_qc
 from .bed_to_bigwig import bed_to_bigwig
 from .binarize_converted_base_identities import binarize_converted_base_identities
 from .canoncall import canoncall
 from .complement_base_list import complement_base_list
-from .converted_BAM_to_adata import converted_BAM_to_adata
+from .converted_BAM_to_adata_II import converted_BAM_to_adata_II
 from .concatenate_fastqs_to_bam import concatenate_fastqs_to_bam
 from .count_aligned_reads import count_aligned_reads
+from .demux_and_index_BAM import demux_and_index_BAM
 from .extract_base_identities import extract_base_identities
 from .extract_mods import extract_mods
+from .extract_read_features_from_bam import extract_read_features_from_bam
+from .extract_read_lengths_from_bed import extract_read_lengths_from_bed
 from .extract_readnames_from_BAM import extract_readnames_from_BAM
 from .find_conversion_sites import find_conversion_sites
 from .generate_converted_FASTA import convert_FASTA_record, generate_converted_FASTA
@@ -23,22 +27,29 @@ from .modkit_extract_to_adata import modkit_extract_to_adata
 from .modQC import modQC
 from .one_hot_encode import one_hot_encode
 from .ohe_batching import ohe_batching
+from .one_hot_decode import one_hot_decode
+from .ohe_layers_decode import ohe_layers_decode
 from .plot_read_length_and_coverage_histograms import plot_read_length_and_coverage_histograms
+from .run_multiqc import run_multiqc
 from .separate_bam_by_bc import separate_bam_by_bc
 from .split_and_index_BAM import split_and_index_BAM
 __all__ = [
     "align_and_sort_BAM",
     "aligned_BAM_to_bed",
+    "bam_qc",
     "bed_to_bigwig",
     "binarize_converted_base_identities",
     "canoncall",
     "complement_base_list",
-    "converted_BAM_to_adata",
+    "converted_BAM_to_adata_II",
     "concatenate_fastqs_to_bam",
     "count_aligned_reads",
+    "demux_and_index_BAM",
     "extract_base_identities",
     "extract_mods",
+    "extract_read_features_from_bam",
+    "extract_read_lengths_from_bed",
     "extract_readnames_from_BAM",
     "find_conversion_sites",
     "convert_FASTA_record",
@@ -54,7 +65,10 @@ __all__ = [
     "modQC",
     "one_hot_encode",
     "ohe_batching",
+    "one_hot_decode",
+    "ohe_layers_decode",
     "plot_read_length_and_coverage_histograms",
+    "run_multiqc",
     "separate_bam_by_bc",
     "split_and_index_BAM"
 ]

smftools/informatics/helpers/align_and_sort_BAM.py CHANGED Viewed

@@ -1,6 +1,6 @@
 ## align_and_sort_BAM
-def align_and_sort_BAM(fasta, input, bam_suffix, output_directory):
+def align_and_sort_BAM(fasta, input, bam_suffix='.bam', output_directory='aligned_outputs', make_bigwigs=False, threads=None):
     """
     A wrapper for running dorado aligner and samtools functions
@@ -9,6 +9,8 @@ def align_and_sort_BAM(fasta, input, bam_suffix, output_directory):
         input (str): File path to the basecalled file to align. Works for .bam and .fastq files
         bam_suffix (str): The suffix to use for the BAM file.
         output_directory (str): A file path to the directory to output all the analyses.
+        make_bigwigs (bool): Whether to make bigwigs
+        threads (int): Number of additional threads to use
     Returns:
         None
@@ -16,9 +18,7 @@ def align_and_sort_BAM(fasta, input, bam_suffix, output_directory):
     """
     import subprocess
     import os
-    from .aligned_BAM_to_bed import aligned_BAM_to_bed
-    from .extract_readnames_from_BAM import extract_readnames_from_BAM
-    from .make_dirs import make_dirs
     input_basename = os.path.basename(input)
     input_suffix = '.' + input_basename.split('.')[1]
@@ -28,21 +28,32 @@ def align_and_sort_BAM(fasta, input, bam_suffix, output_directory):
     aligned_sorted_BAM=f"{aligned_BAM}_sorted"
     aligned_output = aligned_BAM + bam_suffix
     aligned_sorted_output = aligned_sorted_BAM + bam_suffix
+    if threads:
+        threads = str(threads)
+    else:
+        pass
     # Run dorado aligner
-    subprocess.run(["dorado", "aligner", "--secondary", "no", fasta, input], stdout=open(aligned_output, "w"))
+    print(f"Aligning BAM to Reference: {input}")
+    if threads:
+        alignment_command = ["dorado", "aligner", "-t", threads, '--mm2-opts', "-N 1", fasta, input]
+    else:
+        alignment_command = ["dorado", "aligner", '--mm2-opts', "-N 1", fasta, input]
+    subprocess.run(alignment_command, stdout=open(aligned_output, "w"))
     # Sort the BAM on positional coordinates
-    subprocess.run(["samtools", "sort", "-o", aligned_sorted_output, aligned_output])
+    print(f"Sorting BAM: {aligned_output}")
+    if threads:
+        sort_command = ["samtools", "sort", "-@", threads, "-o", aligned_sorted_output, aligned_output]
+    else:
+        sort_command = ["samtools", "sort", "-o", aligned_sorted_output, aligned_output]
+    subprocess.run(sort_command)
     # Create a BAM index file
-    subprocess.run(["samtools", "index", aligned_sorted_output])
-    # Make a bed file of coordinates for the BAM
-    plotting_dir = os.path.join(output_directory, 'coverage_and_readlength_histograms')
-    bed_dir = os.path.join(output_directory, 'read_alignment_coordinates')
-    make_dirs([plotting_dir, bed_dir])
-    aligned_BAM_to_bed(aligned_sorted_output, plotting_dir, bed_dir, fasta)
-    # Make a text file of reads for the BAM
-    extract_readnames_from_BAM(aligned_sorted_output)
+    print(f"Indexing BAM: {aligned_sorted_output}")
+    if threads:
+        index_command = ["samtools", "index", "-@", threads, aligned_sorted_output]
+    else:
+        index_command = ["samtools", "index", aligned_sorted_output]
+    subprocess.run(index_command)

smftools 0.1.3__py3-none-any.whl → 0.1.7__py3-none-any.whl

smftools 0.1.3__py3-none-any.whl → 0.1.7__py3-none-any.whl