PyPI - smftools - Versions diffs - 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl - Mend

smftools 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (63) hide show

smftools/__init__.py +0 -2
smftools/_settings.py +1 -1
smftools/_version.py +1 -0
smftools/datasets/datasets.py +11 -9
smftools/informatics/__init__.py +8 -7
smftools/informatics/bam_conversion.py +47 -0
smftools/informatics/bam_direct.py +49 -0
smftools/informatics/basecalls_to_adata.py +42 -0
smftools/informatics/fast5_to_pod5.py +19 -0
smftools/informatics/helpers/LoadExperimentConfig.py +74 -0
smftools/informatics/helpers/__init__.py +4 -4
smftools/informatics/helpers/align_and_sort_BAM.py +52 -0
smftools/informatics/helpers/binarize_converted_base_identities.py +10 -3
smftools/informatics/helpers/canoncall.py +12 -1
smftools/informatics/helpers/converted_BAM_to_adata.py +30 -13
smftools/informatics/helpers/count_aligned_reads.py +12 -5
smftools/informatics/helpers/extract_base_identities.py +13 -6
smftools/informatics/helpers/extract_mods.py +17 -5
smftools/informatics/helpers/find_conversion_sites.py +15 -9
smftools/informatics/helpers/generate_converted_FASTA.py +49 -29
smftools/informatics/helpers/get_native_references.py +10 -7
smftools/informatics/helpers/make_dirs.py +9 -3
smftools/informatics/helpers/make_modbed.py +10 -4
smftools/informatics/helpers/modQC.py +10 -2
smftools/informatics/helpers/modcall.py +13 -1
smftools/informatics/helpers/modkit_extract_to_adata.py +25 -13
smftools/informatics/helpers/one_hot_encode.py +8 -3
smftools/informatics/helpers/separate_bam_by_bc.py +18 -5
smftools/informatics/helpers/split_and_index_BAM.py +18 -10
smftools/informatics/pod5_conversion.py +34 -7
smftools/informatics/pod5_direct.py +31 -5
smftools/informatics/pod5_to_adata.py +31 -8
smftools/informatics/readwrite.py +13 -16
smftools/informatics/subsample_pod5.py +48 -0
smftools/preprocessing/__init__.py +0 -6
smftools/preprocessing/append_C_context.py +15 -8
smftools/preprocessing/binarize_on_Youden.py +8 -4
smftools/preprocessing/binary_layers_to_ohe.py +9 -4
smftools/preprocessing/calculate_complexity.py +26 -14
smftools/preprocessing/calculate_converted_read_methylation_stats.py +12 -5
smftools/preprocessing/calculate_coverage.py +13 -7
smftools/preprocessing/calculate_pairwise_hamming_distances.py +11 -6
smftools/preprocessing/calculate_position_Youden.py +21 -12
smftools/preprocessing/calculate_read_length_stats.py +11 -6
smftools/preprocessing/clean_NaN.py +12 -5
smftools/preprocessing/filter_converted_reads_on_methylation.py +12 -5
smftools/preprocessing/filter_reads_on_length.py +13 -5
smftools/preprocessing/invert_adata.py +9 -5
smftools/preprocessing/mark_duplicates.py +20 -11
smftools/preprocessing/min_non_diagonal.py +9 -4
smftools/preprocessing/remove_duplicates.py +9 -3
smftools/readwrite.py +13 -16
smftools-0.1.1.dist-info/METADATA +88 -0
smftools-0.1.1.dist-info/RECORD +64 -0
smftools/informatics/helpers/align_BAM.py +0 -49
smftools/informatics/helpers/load_experiment_config.py +0 -17
smftools-0.1.0.dist-info/METADATA +0 -75
smftools-0.1.0.dist-info/RECORD +0 -58
/smftools/informatics/helpers/{informatics.py → archived/informatics.py} +0 -0
/smftools/informatics/helpers/{load_adata.py → archived/load_adata.py} +0 -0
/smftools/preprocessing/{preprocessing.py → archives/preprocessing.py} +0 -0
{smftools-0.1.0.dist-info → smftools-0.1.1.dist-info}/WHEEL +0 -0
{smftools-0.1.0.dist-info → smftools-0.1.1.dist-info}/licenses/LICENSE +0 -0

smftools/informatics/helpers/find_conversion_sites.py CHANGED Viewed

@@ -1,22 +1,28 @@
 ## find_conversion_sites
-from .. import readwrite
-# bioinformatic operations
-from Bio import SeqIO
-from Bio.SeqRecord import SeqRecord
-from Bio.Seq import Seq
-def find_conversion_sites(fasta_file, modification_type):
+def find_conversion_sites(fasta_file, modification_type, conversion_types):
     """
     A function to find genomic coordinates in every unconverted record contained within a FASTA file of every cytosine.
     If searching for adenine conversions, it will find coordinates of all adenines.
-    Input: A FASTA file and the modification_types of interest
+    Parameters:
+        fasta_file (str): A string representing the file path to the unconverted reference FASTA.
+        modification_type (str): A string representing the modification type of interest (options are '5mC' and '6mA').
+        conversion_types (list): A list of strings of the conversion types to use in the analysis. Used here to pass the unconverted record name.
     Returns:
-    A dictionary called record_dict, which is keyed by unconverted record ids contained within the FASTA. Points to a list containing: 1) sequence length of the record, 2) top strand coordinate list, 3) bottom strand coorinate list, 4) sequence string
+        record_dict (dict): A dictionary keyed by unconverted record ids contained within the FASTA. Points to a list containing: 1) sequence length of the record, 2) top strand coordinate list, 3) bottom strand coorinate list, 4) sequence string
     """
+    from .. import readwrite
+    from Bio import SeqIO
+    from Bio.SeqRecord import SeqRecord
+    from Bio.Seq import Seq
     print('{0}: Finding positions of interest in reference FASTA > {1}'.format(readwrite.time_string(), fasta_file))
     # Initialize lists to hold top and bottom strand positional coordinates of interest
     top_strand_coordinates = []
     bottom_strand_coordinates = []
+    unconverted = conversion_types[0]
     record_dict = {}
     print('{0}: Opening FASTA file {1}'.format(readwrite.time_string(), fasta_file))
     # Open the FASTA record as read only
@@ -24,7 +30,7 @@ def find_conversion_sites(fasta_file, modification_type):
         # Iterate over records in the FASTA
         for record in SeqIO.parse(f, "fasta"):
             # Only iterate over the unconverted records for the reference
-            if 'unconverted' in record.id:
+            if unconverted in record.id:
                 print('{0}: Iterating over record {1} in FASTA file {2}'.format(readwrite.time_string(), record, fasta_file))
                 # Extract the sequence string of the record
                 sequence = str(record.seq).upper()

smftools/informatics/helpers/generate_converted_FASTA.py CHANGED Viewed

@@ -1,14 +1,16 @@
 ## generate_converted_FASTA
-from .. import readwrite
-# bioinformatic operations
-from Bio import SeqIO
-from Bio.SeqRecord import SeqRecord
-from Bio.Seq import Seq
-def convert_FASTA_record(record, modification_type, strand):
+def convert_FASTA_record(record, modification_type, strand, unconverted):
     """
-    Input: Takes a FASTA record, modification type, and strand as input
-    Output: Returns a new seqrecord object with the conversions of interest
+    Takes a FASTA record and converts every instance of a base to the converted state.
+    Parameters:
+        record (str): The name of the record instance within the FASTA.
+        modification_type (str): The modification type to convert for (options are '5mC' and '6mA').
+        strand (str): The strand that is being converted in the experiment (options are 'top' and 'bottom').
+    Returns:
+        new_seq (str): Converted sequence string.
+        new_id (str): Record id for the converted sequence string.
     """
     if modification_type == '5mC':
         if strand == 'top':
@@ -18,7 +20,8 @@ def convert_FASTA_record(record, modification_type, strand):
             # Replace every 'G' with 'A' in the sequence
             new_seq = record.seq.upper().replace('G', 'A')
         else:
-            print('need to provide a valid strand string: top or bottom')
+            print('need to provide a valid strand string: top or bottom')
+        new_id = '{0}_{1}_{2}'.format(record.id, modification_type, strand)
     elif modification_type == '6mA':
         if strand == 'top':
             # Replace every 'A' with 'G' in the sequence
@@ -28,32 +31,49 @@ def convert_FASTA_record(record, modification_type, strand):
             new_seq = record.seq.upper().replace('T', 'C')
         else:
             print('need to provide a valid strand string: top or bottom')
-    elif modification_type == 'unconverted':
+        new_id = '{0}_{1}_{2}'.format(record.id, modification_type, strand)
+    elif modification_type == unconverted:
         new_seq = record.seq.upper()
+        new_id = '{0}_{1}_top'.format(record.id, modification_type)
     else:
-        print('need to provide a valid modification_type string: 5mC, 6mA, or unconverted')
-    new_id = '{0}_{1}_{2}'.format(record.id, modification_type, strand)
-    # Return a new SeqRecord with modified sequence and ID
+        print(f'need to provide a valid modification_type string: 5mC, 6mA, or {unconverted}')
+    return new_seq, new_id
 def generate_converted_FASTA(input_fasta, modification_types, strands, output_fasta):
     """
-    Input: Takes an input FASTA, modification types of interest, strands of interest, and an output FASTA name
-    Output: Writes out a new fasta with all stranded conversions
-    Notes: Uses modify_sequence_and_id function on every record within the FASTA
+    Uses modify_sequence_and_id function on every record within the FASTA to write out a converted FASTA.
+    Parameters:
+        input_FASTA (str): A string representing the path to the unconverted FASTA file.
+        modification_types (list): A list of modification types to use in the experiment.
+        strands (list): A list of converstion strands to use in the experiment.
+        output_FASTA (str): A string representing the path to the converted FASTA output file.
+    Returns:
+        None
+        Writes out a converted FASTA reference for the experiment.
     """
+    from .. import readwrite
+    from Bio import SeqIO
+    from Bio.SeqRecord import SeqRecord
+    from Bio.Seq import Seq
+    modified_records = []
+    unconverted = modification_types[0]
+    # Iterate over each record in the input FASTA
+    for record in SeqIO.parse(input_fasta, 'fasta'):
+        record_description = record.description
+        # Iterate over each modification type of interest
+        for modification_type in modification_types:
+            # Iterate over the strands of interest
+            for i, strand in enumerate(strands):
+                if i > 0 and modification_type == unconverted: # This ensures that the unconverted is only added once.
+                    pass
+                else:
+                    # Add the modified record to the list of modified records
+                    print(f'converting {modification_type} on the {strand} strand of record {record}')
+                    new_seq, new_id = convert_FASTA_record(record, modification_type, strand, unconverted)
+                    new_record = SeqRecord(Seq(new_seq), id=new_id, description=record_description)
+                    modified_records.append(new_record)
     with open(output_fasta, 'w') as output_handle:
-        modified_records = []
-        # Iterate over each record in the input FASTA
-        for record in SeqIO.parse(input_fasta, 'fasta'):
-            # Iterate over each modification type of interest
-            for modification_type in modification_types:
-                # Iterate over the strands of interest
-                for i, strand in enumerate(strands):
-                    if i > 0 and modification_type == 'unconverted': # This ensures that the unconverted only is added once and takes on the strand that is provided at the 0 index on strands.
-                        pass
-                    else:
-                        # Add the modified record to the list of modified records
-                        print(f'converting {modification_type} on the {strand} strand of record {record}')
-                        modified_records.append(convert_FASTA_record(record, modification_type, strand))
         # write out the concatenated FASTA file of modified sequences
         SeqIO.write(modified_records, output_handle, 'fasta')

smftools/informatics/helpers/get_native_references.py CHANGED Viewed

@@ -1,17 +1,20 @@
 ## get_native_references
-from .. import readwrite
-# bioinformatic operations
-from Bio import SeqIO
-from Bio.SeqRecord import SeqRecord
-from Bio.Seq import Seq
 # Direct methylation specific
 def get_native_references(fasta_file):
     """
-    Input: A FASTA file
+    Makes a dictionary keyed by record id which points to the record length and record sequence.
+    Paramaters:
+        fasta_file (str): A string representing the path to the FASTA file for the experiment.
     Returns:
-    A dictionary called record_dict, which is keyed by record ids contained within the FASTA. Points to a list containing: 1) sequence length of the record, 2) sequence of the record
+        None
     """
+    from .. import readwrite
+    from Bio import SeqIO
+    from Bio.SeqRecord import SeqRecord
+    from Bio.Seq import Seq
     record_dict = {}
     print('{0}: Opening FASTA file {1}'.format(readwrite.time_string(), fasta_file))
     # Open the FASTA record as read only

smftools/informatics/helpers/make_dirs.py CHANGED Viewed

@@ -1,12 +1,18 @@
 ## make_dirs
-import os
 # General
 def make_dirs(directories):
     """
-    Input: Takes a list of file paths to make directories for
-    Output: Makes each directory in the list if the directory doesn't already exist.
+    Takes a list of file paths and makes new directories if the directory does not already exist.
+    Parameters:
+        directories (list): A list of directories to make
+    Returns:
+        None
     """
+    import os
     for directory in directories:
         if not os.path.isdir(directory):
             os.mkdir(directory)

smftools/informatics/helpers/make_modbed.py CHANGED Viewed

@@ -1,19 +1,25 @@
 ## make_modbed
-import os
-import subprocess
 # Direct SMF
 def make_modbed(aligned_sorted_output, thresholds, mod_bed_dir):
     """
-    Generating Barcode position methylation summaries starting from the overall BAM file that was direct output of dorado aligner
+    Generating position methylation summaries for each barcoded sample starting from the overall BAM file that was direct output of dorado aligner.
+    Parameters:
+        aligned_sorted_output (str): A string representing the file path to the aligned_sorted non-split BAM file.
+    Returns:
+        None
     """
+    import os
+    import subprocess
     os.chdir(mod_bed_dir)
     filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold = thresholds
     command = [
         "modkit", "pileup", aligned_sorted_output, mod_bed_dir,
         "--partition-tag", "BC",
         "--only-tabs",
-        "--filter-threshold", filter_threshold,
+        "--filter-threshold", f'{filter_threshold}',
         "--mod-thresholds", f"m:{m5C_threshold}",
         "--mod-thresholds", f"a:{m6A_threshold}",
         "--mod-thresholds", f"h:{hm5C_threshold}"

smftools/informatics/helpers/modQC.py CHANGED Viewed

@@ -1,17 +1,25 @@
 ## modQC
-import subprocess
 # Direct SMF
 def modQC(aligned_sorted_output, thresholds):
     """
     Output the percentile of bases falling at a call threshold (threshold is a probability between 0-1) for the overall BAM file.
     It is generally good to look at these parameters on positive and negative controls.
+    Parameters:
+        aligned_sorted_output (str): A string representing the file path of the aligned_sorted non-split BAM file output by the dorado aligned.
+        thresholds (list): A list of floats to pass for call thresholds.
+    Returns:
+        None
     """
+    import subprocess
     filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold = thresholds
     subprocess.run(["modkit", "sample-probs", aligned_sorted_output])
     command = [
         "modkit", "summary", aligned_sorted_output,
-        "--filter-threshold", filter_threshold,
+        "--filter-threshold", f"{filter_threshold}",
         "--mod-thresholds", f"m:{m5C_threshold}",
         "--mod-thresholds", f"a:{m6A_threshold}",
         "--mod-thresholds", f"h:{hm5C_threshold}"

smftools/informatics/helpers/modcall.py CHANGED Viewed

@@ -1,11 +1,23 @@
 ## modcall
-import subprocess
 # Direct methylation specific
 def modcall(model, pod5_dir, barcode_kit, mod_list, bam, bam_suffix):
     """
     Wrapper function for dorado modified base calling.
+    Parameters:
+        model (str): a string representing the file path to the dorado basecalling model.
+        pod5_dir (str): a string representing the file path to the experiment directory containing the POD5 files.
+        barcode_kit (str): A string representing the barcoding kit used in the experiment.
+        mod_list (list): A list of modification types to use in the analysis.
+        bam (str): File path to the BAM file to output.
+        bam_suffix (str): The suffix to use for the BAM file.
+    Returns:
+        None
+            Outputs a BAM file holding the modified base calls output by the dorado basecaller.
     """
+    import subprocess
     output = bam + bam_suffix
     command = [
     "dorado", "basecaller", model, pod5_dir, "--kit-name", barcode_kit, "-Y",

smftools/informatics/helpers/modkit_extract_to_adata.py CHANGED Viewed

@@ -1,20 +1,31 @@
 ## modkit_extract_to_adata
-from .. import readwrite
-from .get_native_references import get_native_references
-from .count_aligned_reads import count_aligned_reads
-from .extract_base_identities import extract_base_identities
-from .one_hot_encode import one_hot_encode
-import pandas as pd
-import anndata as ad
-import os
-import gc
-import math
-import numpy as np
 def modkit_extract_to_adata(fasta, bam, mapping_threshold, experiment_name, mods, batch_size):
     """
+    Takes modkit extract outputs and organizes it into an adata object
+    Parameters:
+        fasta (str): File path to the reference genome to align to.
+        bam (str): File path to the aligned_sorted non-split modified BAM file
+        mapping_threshold (float): A value in between 0 and 1 to threshold the minimal fraction of aligned reads which map to the reference region. References with values above the threshold are included in the output adata.
+        experiment_name (str): A string to provide an experiment name to the output adata file.
+        mods (list): A list of strings of the modification types to use in the analysis.
+        batch_size (int): An integer number of TSV files to analyze in memory at once while loading the final adata object.
+    Returns:
+        None
     """
+    from .. import readwrite
+    from .get_native_references import get_native_references
+    from .count_aligned_reads import count_aligned_reads
+    from .extract_base_identities import extract_base_identities
+    from .one_hot_encode import one_hot_encode
+    import pandas as pd
+    import anndata as ad
+    import os
+    import gc
+    import math
+    import numpy as np
     ###################################################
     ### Get input tsv file names into a sorted list ###
     # List all files in the directory
@@ -56,7 +67,8 @@ def modkit_extract_to_adata(fasta, bam, mapping_threshold, experiment_name, mods
         delta_max_length = max_reference_length - current_reference_length
         sequence = reference_dict[record][1] + 'N'*delta_max_length
         # Get a dictionary of positional base identities keyed by read id
-        base_identities = extract_base_identities(bam, record, current_reference_length, max_reference_length)
+        positions = range(current_reference_length)
+        base_identities = extract_base_identities(bam, record, positions, max_reference_length)
         # One hot encode the sequence string of the reads
         one_hot_reads = {read_name: one_hot_encode(seq) for read_name, seq in base_identities.items()}
         record_seq_dict[record] = (one_hot_reads, sequence)

smftools/informatics/helpers/one_hot_encode.py CHANGED Viewed

@@ -1,12 +1,17 @@
 # one_hot_encode
-from .. import readwrite
 # String encodings
 def one_hot_encode(sequence):
     """
-    Input: A sequence string of a read.
-    Output: One hot encoding of the sequence string.
+    One hot encodes a sequence string.
+    Parameters:
+        sequence (str): A DNA sequence string.
+    Returns:
+        one_hot_matrix (ndarray): A numpy ndarray holding a vstacked one hot encoding of the input sequence string.
     """
+    import numpy as np
     mapping = {'A': 0, 'C': 1, 'G': 2, 'T': 3, 'N': 4}
     one_hot_matrix = np.zeros((len(sequence), 5), dtype=int)
     for i, nucleotide in enumerate(sequence):

smftools/informatics/helpers/separate_bam_by_bc.py CHANGED Viewed

@@ -1,12 +1,25 @@
 ## separate_bam_by_bc
-import pysam
 # General
-def separate_bam_by_bc(input_bam, output_prefix):
+def separate_bam_by_bc(input_bam, output_prefix, bam_suffix):
     """
-    Input: Takes a single BAM input. Also takes an output prefix to append to the output file.
-    Output: Splits the BAM based on the BC SAM tag value.
+    Separates an input BAM file on the BC SAM tag values.
+    Parameters:
+        input_bam (str): File path to the BAM file to split.
+        output_prefix (str): A prefix to append to the output BAM.
+        bam_suffix (str): A suffix to add to the bam file.
+    Returns:
+        None
+            Writes out split BAM files.
     """
+    import pysam
+    import os
+    bam_base = os.path.basename(input_bam)
+    bam_base_minus_suffix = bam_base.split(bam_suffix)[0]
     # Open the input BAM file for reading
     with pysam.AlignmentFile(input_bam, "rb") as bam:
         # Create a dictionary to store output BAM files
@@ -18,7 +31,7 @@ def separate_bam_by_bc(input_bam, output_prefix):
                 bc_tag = read.get_tag("BC", with_value_type=True)[0].split('barcode')[1]
                 # Open the output BAM file corresponding to the barcode
                 if bc_tag not in output_files:
-                    output_files[bc_tag] = pysam.AlignmentFile(f"{output_prefix}_{bc_tag}.bam", "wb", header=bam.header)
+                    output_files[bc_tag] = pysam.AlignmentFile(f"{output_prefix}_{bam_base_minus_suffix}_{bc_tag}{bam_suffix}", "wb", header=bam.header)
                 # Write the read to the corresponding output BAM file
                 output_files[bc_tag].write(read)
             except KeyError:

smftools/informatics/helpers/split_and_index_BAM.py CHANGED Viewed

@@ -1,21 +1,29 @@
 ## split_and_index_BAM
-from .. import readwrite
-import os
-import subprocess
-import glob
-from .separate_bam_by_bc import separate_bam_by_bc
 def split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix):
     """
-    A wrapper function for splitting BAMS and indexing them
+    A wrapper function for splitting BAMS and indexing them.
+    Parameters:
+        aligned_sorted_BAM (str): A string representing the file path of the aligned_sorted BAM file.
+        split_dir (str): A string representing the file path to the directory to split the BAMs into.
+        bam_suffix (str): A suffix to add to the bam file.
+    Returns:
+        None
+            Splits an input BAM file on barcode value and makes a BAM index file.
     """
+    from .. import readwrite
+    import os
+    import subprocess
+    import glob
+    from .separate_bam_by_bc import separate_bam_by_bc
     os.chdir(split_dir)
     aligned_sorted_output = aligned_sorted_BAM + bam_suffix
-    file_prefix = readwrite.datestring()
-    separate_bam_by_bc(aligned_sorted_output, file_prefix)
+    file_prefix = readwrite.date_string()
+    separate_bam_by_bc(aligned_sorted_output, file_prefix, bam_suffix)
     # Make a BAM index file for the BAMs in that directory
     bam_pattern = '*' + bam_suffix
     bam_files = glob.glob(os.path.join(split_dir, bam_pattern))
     for input_file in bam_files:
-        subprocess.run(["samtools", "index", input_file])
-        print(f"Indexed {input_file}")
+        subprocess.run(["samtools", "index", input_file])

smftools/informatics/pod5_conversion.py CHANGED Viewed

@@ -1,23 +1,50 @@
 ## pod5_conversion
-from .helpers import align_BAM, canoncall, converted_BAM_to_adata, generate_converted_FASTA, split_and_index_BAM
-import subprocess
 def pod5_conversion(fasta, output_directory, conversion_types, strands, model, pod5_dir, split_dir, barcode_kit, mapping_threshold, experiment_name, bam_suffix):
     """
-    Converts a POD5 file from a nanopore conversion SMF experiment to an adata object
+    Converts a POD5 file from a nanopore conversion SMF experiment to an adata object.
+    Parameters:
+        fasta (str): File path to the reference genome to align to.
+        output_directory (str): A file path to the directory to output all the analyses.
+        conversion_type (list): A list of strings of the conversion types to use in the analysis.
+        strands (list): A list of converstion strands to use in the experiment.
+        model (str): a string representing the file path to the dorado basecalling model.
+        pod5_dir (str): a string representing the file path to the experiment directory containing the POD5 files.
+        split_dir (str): A string representing the file path to the directory to split the BAMs into.
+        barcode_kit (str): A string representing the barcoding kit used in the experiment.
+        mapping_threshold (float): A value in between 0 and 1 to threshold the minimal fraction of aligned reads which map to the reference region. References with values above the threshold are included in the output adata.
+        experiment_name (str): A string to provide an experiment name to the output adata file.
+        bam_suffix (str): A suffix to add to the bam file.
+    Returns:
+        None
     """
-    bam=f"{output_directory}/HAC_basecalls"
+    from .helpers import align_and_sort_BAM, canoncall, converted_BAM_to_adata, generate_converted_FASTA, split_and_index_BAM
+    import os
+    model_basename = os.path.basename(model)
+    model_basename = model_basename.replace('.', '_')
+    bam=f"{output_directory}/{model_basename}_canonical_basecalls"
     aligned_BAM=f"{bam}_aligned"
     aligned_sorted_BAM=f"{aligned_BAM}_sorted"
+    os.chdir(output_directory)
     # 1) Convert FASTA file
-    converted_FASTA=fasta.split('.fa')[0]+'_converted.fasta'
-    generate_converted_FASTA(fasta, conversion_types, strands, converted_FASTA)
+    fasta_basename = os.path.basename(fasta)
+    converted_FASTA_basename = fasta_basename.split('.fa')[0]+'_converted.fasta'
+    converted_FASTA = os.path.join(output_directory, converted_FASTA_basename)
+    if os.path.exists(converted_FASTA):
+        print(converted_FASTA + ' already exists. Using existing converted FASTA.')
+    else:
+        generate_converted_FASTA(fasta, conversion_types, strands, converted_FASTA)
     # 2) Basecall from the input POD5 to generate a singular output BAM
     canoncall(model, pod5_dir, barcode_kit, bam, bam_suffix)
     # 3) Align the BAM to the converted reference FASTA and sort the bam on positional coordinates. Also make an index and a bed file of mapped reads
-    align_BAM(converted_FASTA, bam, bam_suffix)
+    input_BAM = bam + bam_suffix
+    align_and_sort_BAM(converted_FASTA, input_BAM, bam_suffix, output_directory)
     ### 4) Split the aligned and sorted BAM files by barcode (BC Tag) into the split_BAM directory###
     split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix)

smftools/informatics/pod5_direct.py CHANGED Viewed

@@ -1,24 +1,50 @@
 ## pod5_direct
-from .helpers import align_BAM, extract_mods, make_modbed, modcall, modkit_extract_to_adata, modQC, split_and_index_BAM
 def pod5_direct(fasta, output_directory, mod_list, model, thresholds, pod5_dir, split_dir, barcode_kit, mapping_threshold, experiment_name, bam_suffix, batch_size):
     """
+    Converts a POD5 file from a nanopore native SMF experiment to an adata object.
+    Parameters:
+        fasta (str): File path to the reference genome to align to.
+        output_directory (str): A file path to the directory to output all the analyses.
+        mod_list (list): A list of strings of the modification types to use in the analysis.
+        model (str): a string representing the file path to the dorado basecalling model.
+        thresholds (list): A list of floats to pass for call thresholds.
+        pod5_dir (str): a string representing the file path to the experiment directory containing the POD5 files.
+        split_dir (str): A string representing the file path to the directory to split the BAMs into.
+        barcode_kit (str): A string representing the barcoding kit used in the experiment.
+        mapping_threshold (float): A value in between 0 and 1 to threshold the minimal fraction of aligned reads which map to the reference region. References with values above the threshold are included in the output adata.
+        experiment_name (str): A string to provide an experiment name to the output adata file.
+        bam_suffix (str): A suffix to add to the bam file.
+        batch_size (int): An integer number of TSV files to analyze in memory at once while loading the final adata object.
+    Returns:
+        None
     """
-    bam=f"{output_directory}/HAC_mod_calls"
+    from .helpers import align_and_sort_BAM, extract_mods, make_modbed, modcall, modkit_extract_to_adata, modQC, split_and_index_BAM, make_dirs
+    import os
+    model_basename = os.path.basename(model)
+    model_basename = model_basename.replace('.', '_')
+    mod_string = "_".join(mod_list)
+    bam=f"{output_directory}/{model_basename}_{mod_string}_calls"
     aligned_BAM=f"{bam}_aligned"
     aligned_sorted_BAM=f"{aligned_BAM}_sorted"
     mod_bed_dir=f"{output_directory}/split_mod_beds"
     mod_tsv_dir=f"{output_directory}/split_mod_tsvs"
+    make_dirs([mod_bed_dir, mod_tsv_dir])
     aligned_sorted_output = aligned_sorted_BAM + bam_suffix
     mod_map = {'6mA': '6mA', '5mC_5hmC': '5mC'}
     mods = [mod_map[mod] for mod in mod_list]
+    os.chdir(output_directory)
     # 1) Basecall using dorado
     modcall(model, pod5_dir, barcode_kit, mod_list, bam, bam_suffix)
-    # 2) Align the BAM to the converted reference FASTA. Also make an index and a bed file of mapped reads
-    align_BAM(fasta, bam, bam_suffix)
+    # 2) Align the BAM to the reference FASTA. Also make an index and a bed file of mapped reads
+    input_BAM = bam + bam_suffix
+    align_and_sort_BAM(fasta, input_BAM, bam_suffix, output_directory)
     # 3) Split the aligned and sorted BAM files by barcode (BC Tag) into the split_BAM directory
     split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix)
     # 4) Using nanopore modkit to work with modified BAM files ###

smftools/informatics/pod5_to_adata.py CHANGED Viewed

@@ -1,17 +1,40 @@
 ## pod5_to_adata
-from .helpers import load_experiment_config
-from.pod5_direct import pod5_direct
-from.pod5_conversion import pod5_conversion
-def pod5_to_adata(config_path, ):
+def pod5_to_adata(config_path):
     """
+    High-level function to call for converting raw sequencing data to an adata object.
+    Parameters:
+        config_path (str): A string representing the file path to the experiment configuration csv file.
+    Returns:
+        None
     """
+    from .helpers import LoadExperimentConfig, make_dirs
+    import os
+    bam_suffix = '.bam' # If different, change from here.
+    split_dir = 'split_BAMs' # If different, change from here.
+    strands = ['bottom', 'top'] # If different, change from here. Having both listed generally doesn't slow things down too much.
+    conversions = ['unconverted'] # The name to use for the unconverted files. If different, change from here.
     # Load experiment config parameters into global variables
-    load_experiment_config(config_path)
+    experiment_config = LoadExperimentConfig(config_path)
+    var_dict = experiment_config.var_dict
+    for key, value in var_dict.items():
+        globals()[key] = value
+    conversions += conversion_types
+    split_path = os.path.join(output_directory, split_dir)
+    make_dirs([output_directory, split_path])
+    os.chdir(output_directory)
     if smf_modality == 'conversion':
-        (fasta, output_directory, conversion_types, strands, model, pod5_dir, split_dir, barcode_kit, mapping_threshold, experiment_name, bam_suffix)
+        from .pod5_conversion import pod5_conversion
+        pod5_conversion(fasta, output_directory, conversions, strands, model, pod5_dir, split_path, barcode_kit, mapping_threshold, experiment_name, bam_suffix)
     elif smf_modality == 'direct':
-        pod5_direct(fasta, output_directory, mod_list, model, thresholds, pod5_dir, split_dir, barcode_kit, mapping_threshold, experiment_name, bam_suffix, batch_size)
+        from .pod5_direct import pod5_direct
+        thresholds = [filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold]
+        pod5_direct(fasta, output_directory, mod_list, model, thresholds, pod5_dir, split_path, barcode_kit, mapping_threshold, experiment_name, bam_suffix, batch_size)
     else:
         print("Error")

smftools 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

smftools 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl