smftools 0.1.0__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. smftools/__init__.py +0 -2
  2. smftools/_settings.py +3 -2
  3. smftools/_version.py +1 -0
  4. smftools/datasets/F1_sample_sheet.csv +5 -0
  5. smftools/datasets/datasets.py +14 -11
  6. smftools/informatics/__init__.py +10 -7
  7. smftools/informatics/archived/bam_conversion.py +59 -0
  8. smftools/informatics/archived/bam_direct.py +63 -0
  9. smftools/informatics/archived/basecalls_to_adata.py +71 -0
  10. smftools/informatics/conversion_smf.py +79 -0
  11. smftools/informatics/direct_smf.py +89 -0
  12. smftools/informatics/fast5_to_pod5.py +21 -0
  13. smftools/informatics/helpers/LoadExperimentConfig.py +74 -0
  14. smftools/informatics/helpers/__init__.py +22 -4
  15. smftools/informatics/helpers/align_and_sort_BAM.py +48 -0
  16. smftools/informatics/helpers/aligned_BAM_to_bed.py +73 -0
  17. smftools/informatics/helpers/bed_to_bigwig.py +39 -0
  18. smftools/informatics/helpers/binarize_converted_base_identities.py +11 -4
  19. smftools/informatics/helpers/canoncall.py +14 -1
  20. smftools/informatics/helpers/complement_base_list.py +21 -0
  21. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +54 -0
  22. smftools/informatics/helpers/converted_BAM_to_adata.py +183 -97
  23. smftools/informatics/helpers/count_aligned_reads.py +25 -14
  24. smftools/informatics/helpers/extract_base_identities.py +44 -23
  25. smftools/informatics/helpers/extract_mods.py +17 -5
  26. smftools/informatics/helpers/extract_readnames_from_BAM.py +22 -0
  27. smftools/informatics/helpers/find_conversion_sites.py +24 -16
  28. smftools/informatics/helpers/generate_converted_FASTA.py +60 -21
  29. smftools/informatics/helpers/get_chromosome_lengths.py +32 -0
  30. smftools/informatics/helpers/get_native_references.py +10 -7
  31. smftools/informatics/helpers/index_fasta.py +12 -0
  32. smftools/informatics/helpers/make_dirs.py +9 -3
  33. smftools/informatics/helpers/make_modbed.py +10 -4
  34. smftools/informatics/helpers/modQC.py +10 -2
  35. smftools/informatics/helpers/modcall.py +16 -2
  36. smftools/informatics/helpers/modkit_extract_to_adata.py +486 -323
  37. smftools/informatics/helpers/ohe_batching.py +52 -0
  38. smftools/informatics/helpers/one_hot_encode.py +15 -8
  39. smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +52 -0
  40. smftools/informatics/helpers/separate_bam_by_bc.py +20 -5
  41. smftools/informatics/helpers/split_and_index_BAM.py +31 -11
  42. smftools/informatics/load_adata.py +127 -0
  43. smftools/informatics/readwrite.py +13 -16
  44. smftools/informatics/subsample_fasta_from_bed.py +47 -0
  45. smftools/informatics/subsample_pod5.py +104 -0
  46. smftools/preprocessing/__init__.py +6 -7
  47. smftools/preprocessing/append_C_context.py +52 -22
  48. smftools/preprocessing/binarize_on_Youden.py +8 -4
  49. smftools/preprocessing/binary_layers_to_ohe.py +9 -4
  50. smftools/preprocessing/calculate_complexity.py +26 -14
  51. smftools/preprocessing/calculate_consensus.py +47 -0
  52. smftools/preprocessing/calculate_converted_read_methylation_stats.py +69 -11
  53. smftools/preprocessing/calculate_coverage.py +14 -8
  54. smftools/preprocessing/calculate_pairwise_hamming_distances.py +11 -6
  55. smftools/preprocessing/calculate_position_Youden.py +21 -12
  56. smftools/preprocessing/calculate_read_length_stats.py +67 -8
  57. smftools/preprocessing/clean_NaN.py +13 -6
  58. smftools/preprocessing/filter_converted_reads_on_methylation.py +15 -6
  59. smftools/preprocessing/filter_reads_on_length.py +16 -6
  60. smftools/preprocessing/invert_adata.py +10 -5
  61. smftools/preprocessing/load_sample_sheet.py +24 -0
  62. smftools/preprocessing/make_dirs.py +21 -0
  63. smftools/preprocessing/mark_duplicates.py +54 -30
  64. smftools/preprocessing/min_non_diagonal.py +9 -4
  65. smftools/preprocessing/recipes.py +125 -0
  66. smftools/preprocessing/remove_duplicates.py +15 -6
  67. smftools/readwrite.py +13 -16
  68. smftools/tools/apply_HMM.py +1 -0
  69. smftools/tools/cluster.py +0 -0
  70. smftools/tools/read_HMM.py +1 -0
  71. smftools/tools/subset_adata.py +32 -0
  72. smftools/tools/train_HMM.py +43 -0
  73. smftools-0.1.3.dist-info/METADATA +94 -0
  74. smftools-0.1.3.dist-info/RECORD +84 -0
  75. smftools/informatics/helpers/align_BAM.py +0 -49
  76. smftools/informatics/helpers/load_experiment_config.py +0 -17
  77. smftools/informatics/pod5_conversion.py +0 -26
  78. smftools/informatics/pod5_direct.py +0 -29
  79. smftools/informatics/pod5_to_adata.py +0 -17
  80. smftools-0.1.0.dist-info/METADATA +0 -75
  81. smftools-0.1.0.dist-info/RECORD +0 -58
  82. /smftools/informatics/helpers/{informatics.py → archived/informatics.py} +0 -0
  83. /smftools/informatics/helpers/{load_adata.py → archived/load_adata.py} +0 -0
  84. /smftools/preprocessing/{preprocessing.py → archives/preprocessing.py} +0 -0
  85. {smftools-0.1.0.dist-info → smftools-0.1.3.dist-info}/WHEEL +0 -0
  86. {smftools-0.1.0.dist-info → smftools-0.1.3.dist-info}/licenses/LICENSE +0 -0
smftools/informatics/helpers/align_and_sort_BAM.py
@@ -0,0 +1,48 @@
+ ## align_and_sort_BAM
+
+ def align_and_sort_BAM(fasta, input, bam_suffix, output_directory):
+     """
+     A wrapper for running the dorado aligner and samtools functions.
+
+     Parameters:
+         fasta (str): File path to the reference genome to align to.
+         input (str): File path to the basecalled file to align. Works for .bam and .fastq files.
+         bam_suffix (str): The suffix to use for the BAM file.
+         output_directory (str): A file path to the directory to output all the analyses.
+
+     Returns:
+         None
+         The function writes out files for: 1) an aligned BAM, 2) an aligned_sorted BAM, 3) an index file for the aligned_sorted BAM, 4) a BED file for the aligned_sorted BAM, 5) a text file containing read names in the aligned_sorted BAM.
+     """
+     import subprocess
+     import os
+     from .aligned_BAM_to_bed import aligned_BAM_to_bed
+     from .extract_readnames_from_BAM import extract_readnames_from_BAM
+     from .make_dirs import make_dirs
+     input_basename = os.path.basename(input)
+     input_suffix = '.' + input_basename.split('.')[1]
+
+     output_path_minus_suffix = os.path.join(output_directory, input_basename.split(input_suffix)[0])
+
+     aligned_BAM = f"{output_path_minus_suffix}_aligned"
+     aligned_sorted_BAM = f"{aligned_BAM}_sorted"
+     aligned_output = aligned_BAM + bam_suffix
+     aligned_sorted_output = aligned_sorted_BAM + bam_suffix
+
+     # Run the dorado aligner
+     subprocess.run(["dorado", "aligner", "--secondary", "no", fasta, input], stdout=open(aligned_output, "w"))
+
+     # Sort the BAM on positional coordinates
+     subprocess.run(["samtools", "sort", "-o", aligned_sorted_output, aligned_output])
+
+     # Create a BAM index file
+     subprocess.run(["samtools", "index", aligned_sorted_output])
+
+     # Make a BED file of coordinates for the BAM
+     plotting_dir = os.path.join(output_directory, 'coverage_and_readlength_histograms')
+     bed_dir = os.path.join(output_directory, 'read_alignment_coordinates')
+     make_dirs([plotting_dir, bed_dir])
+     aligned_BAM_to_bed(aligned_sorted_output, plotting_dir, bed_dir, fasta)
+
+     # Make a text file of read names for the BAM
+     extract_readnames_from_BAM(aligned_sorted_output)
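For orientation, a minimal sketch of how this new helper might be invoked; the paths below are hypothetical, and dorado plus samtools are assumed to be on the PATH:

from smftools.informatics.helpers.align_and_sort_BAM import align_and_sort_BAM

# Hypothetical invocation; align_and_sort_BAM shells out to `dorado aligner` and samtools.
align_and_sort_BAM(
    fasta="/refs/genome.fa",            # reference genome to align against
    input="/runs/basecalls.bam",        # basecalled BAM (or FASTQ) to align
    bam_suffix=".bam",                  # suffix appended to the output BAM names
    output_directory="/runs/analysis",  # receives BAMs, BED files, and histogram plots
)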
smftools/informatics/helpers/aligned_BAM_to_bed.py
@@ -0,0 +1,73 @@
+ # aligned_BAM_to_bed
+
+ def aligned_BAM_to_bed(aligned_BAM, plotting_dir, bed_dir, fasta):
+     """
+     Takes an aligned BAM as input and writes a BED file of reads as output.
+     BED columns are: record name, start position, end position, read length, read name.
+
+     Parameters:
+         aligned_BAM (str): Path to an input aligned BAM to extract to a BED file.
+         plotting_dir (str): Path to write out read alignment length and coverage histograms.
+         bed_dir (str): Path to write out read alignment coordinates.
+         fasta (str): File path to the reference genome to align to.
+
+     Returns:
+         None
+     """
+     import subprocess
+     import os
+     from .bed_to_bigwig import bed_to_bigwig
+     from .plot_read_length_and_coverage_histograms import plot_read_length_and_coverage_histograms
+
+     bed_output_basename = os.path.basename(aligned_BAM).split('.bam')[0] + '_bed.bed'
+     bed_output = os.path.join(bed_dir, bed_output_basename)
+
+     samtools_view = subprocess.Popen(["samtools", "view", aligned_BAM], stdout=subprocess.PIPE)
+     with open(bed_output, "w") as output_file:
+         awk_process = subprocess.Popen(["awk", '{print $3 "\t" $4 "\t" $4+length($10)-1 "\t" length($10)-1 "\t" $1}'], stdin=samtools_view.stdout, stdout=output_file)
+         samtools_view.stdout.close()
+         awk_process.wait()
+         samtools_view.wait()
+
+     def split_bed(bed, delete_input=True):
+         """
+         Reads in a BED file and splits it into two separate BED files based on alignment status.
+
+         Parameters:
+             bed (str): Path to the input BED file.
+             delete_input (bool): Whether to delete the input BED file.
+
+         Returns:
+             aligned (str): Path to the aligned BED file.
+         """
+         unaligned = bed.split('.bed')[0] + '_unaligned.bed'
+         aligned = bed.split('.bed')[0] + '_aligned.bed'
+
+         with open(bed, 'r') as infile, \
+              open(unaligned, 'w') as unaligned_outfile, \
+              open(aligned, 'w') as aligned_outfile:
+
+             for line in infile:
+                 fields = line.strip().split('\t')
+
+                 if fields[0] == '*':
+                     unaligned_outfile.write(line)
+                 else:
+                     aligned_outfile.write(line)
+
+         if delete_input:
+             os.remove(bed)
+
+         return aligned
+
+     aligned_bed = split_bed(bed_output)
+
+     # Write out basic plots of reference coverage and read lengths
+     plot_read_length_and_coverage_histograms(aligned_bed, plotting_dir)
+
+     # Make a bedgraph and bigwig for the aligned reads
+     bed_to_bigwig(fasta, aligned_bed)
+
+
+
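The samtools view | awk pipe above emits one BED row per alignment. As a sketch of the same column mapping in pure pysam (not part of the package; note that awk's $4 is the 1-based SAM POS field, while pysam coordinates are 0-based):

import pysam

# Sketch: reproduce the awk-emitted BED row with pysam instead of a shell pipe.
with pysam.AlignmentFile("aligned_sorted.bam", "rb") as bam, open("reads.bed", "w") as out:
    for aln in bam.fetch(until_eof=True):
        rname = aln.reference_name or "*"  # awk $3 (RNAME; '*' if unmapped)
        pos = aln.reference_start + 1      # awk $4 (POS; convert 0-based to 1-based)
        read_len = aln.query_length        # awk length($10)
        out.write(f"{rname}\t{pos}\t{pos + read_len - 1}\t{read_len - 1}\t{aln.query_name}\n")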
smftools/informatics/helpers/bed_to_bigwig.py
@@ -0,0 +1,39 @@
+ # bed_to_bigwig
+
+ def bed_to_bigwig(fasta, bed):
+     """
+     Takes a BED file of reads and makes a bedgraph plus a bigwig.
+
+     Parameters:
+         fasta (str): File path to the reference genome to align to.
+         bed (str): File path to the input BED.
+
+     Returns:
+         None
+     """
+     import os
+     import subprocess
+
+     bed_basename = os.path.basename(bed)
+     parent_dir = os.path.dirname(bed)
+     bed_basename_minus_suffix = bed_basename.split('.bed')[0]
+     fasta_basename = os.path.basename(fasta)
+     fasta_dir = os.path.dirname(fasta)
+     fasta_basename_minus_suffix = fasta_basename.split('.fa')[0]
+     chrom_basename = fasta_basename_minus_suffix + '.chrom.sizes'
+     chrom_path = os.path.join(fasta_dir, chrom_basename)
+     bedgraph_basename = bed_basename_minus_suffix + '_bedgraph.bedgraph'
+     bedgraph_output = os.path.join(parent_dir, bedgraph_basename)
+     bigwig_basename = bed_basename_minus_suffix + '_bigwig.bw'
+     bigwig_output = os.path.join(parent_dir, bigwig_basename)
+
+     # Make the bedgraph
+     with open(bedgraph_output, 'w') as outfile:
+         # Command as a list
+         command = ["bedtools", "genomecov", "-i", bed, "-g", chrom_path, "-bg"]
+         print(f'Making bedgraph from {bed_basename}')
+         subprocess.run(command, stdout=outfile)
+
+     # Make the bigwig
+     command = ["bedGraphToBigWig", bedgraph_output, chrom_path, bigwig_output]
+     print(f'Making bigwig from {bedgraph_basename}')
+     subprocess.run(command)
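Note that bed_to_bigwig expects a <reference>.chrom.sizes file to already sit beside the FASTA (the new get_chromosome_lengths helper presumably writes it). If one needed to produce that file by hand, a common approach is to derive it from a samtools .fai index; a sketch with illustrative paths:

import subprocess

fasta = "/refs/genome.fa"  # illustrative path
subprocess.run(["samtools", "faidx", fasta], check=True)  # writes /refs/genome.fa.fai

# The first two .fai columns are exactly what a .chrom.sizes file needs: name<TAB>length.
with open(fasta + ".fai") as fai, open("/refs/genome.chrom.sizes", "w") as out:
    for line in fai:
        name, length = line.split("\t")[:2]
        out.write(f"{name}\t{length}\n")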
smftools/informatics/helpers/binarize_converted_base_identities.py
@@ -1,11 +1,18 @@
  ## binarize_converted_base_identities
- import numpy as np
  # Conversion SMF specific
  def binarize_converted_base_identities(base_identities, strand, modification_type):
      """
-     Input: The base identities dictionary returned by extract_base_identity_at_coordinates.
-     Output: A binarized format of the dictionary, where 1 represents a methylated site. 0 represents an unmethylated site. NaN represents a site that does not carry SMF information.
+     Binarizes conversion SMF data within a sequence string.
+
+     Parameters:
+         base_identities (dict): A dictionary returned by extract_base_identities. Keyed by read name. Points to a list of base identities.
+         strand (str): A string indicating which strand was converted in the experiment (options are 'top' and 'bottom').
+         modification_type (str): A string indicating the modification type of interest (options are '5mC' and '6mA').
+
+     Returns:
+         binarized_base_identities (dict): A binarized dictionary, where 1 represents a methylated site, 0 represents an unmethylated site, and NaN represents a site that does not carry methylation information.
      """
+     import numpy as np
      binarized_base_identities = {}
      # Iterate over base identity keys to binarize the base identities
      for key in base_identities.keys():
@@ -20,5 +27,5 @@ def binarize_converted_base_identities(base_identities, strand, modification_type):
          elif modification_type == '6mA':
              binarized_base_identities[key] = [1 if x == 'T' else 0 if x == 'C' else np.nan for x in base_identities[key]]
      else:
-         pass
+         print(f"{strand} not recognized")
      return binarized_base_identities
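To make the binarization concrete: the visible '6mA' branch maps 'T' to 1, 'C' to 0, and every other base to NaN. Applied to a hypothetical read, that comprehension behaves like this sketch:

import numpy as np

# The '6mA' branch shown above, applied to a hypothetical read:
read = ["T", "C", "A", "G", "T"]
binarized = [1 if x == "T" else 0 if x == "C" else np.nan for x in read]
# -> [1, 0, nan, nan, 1]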
smftools/informatics/helpers/canoncall.py
@@ -1,12 +1,25 @@
  ## canoncall
- import subprocess

  # Conversion SMF specific
  def canoncall(model, pod5_dir, barcode_kit, bam, bam_suffix):
      """
      Wrapper function for dorado canonical base calling.
+
+     Parameters:
+         model (str): A string representing the file path to the dorado basecalling model.
+         pod5_dir (str): A string representing the file path to the experiment directory containing the POD5 files.
+         barcode_kit (str): A string representing the barcoding kit used in the experiment.
+         bam (str): File path to the BAM file to output.
+         bam_suffix (str): The suffix to use for the BAM file.
+
+     Returns:
+         None
+         Outputs a BAM file holding the canonical base calls output by the dorado basecaller.
      """
+     import subprocess
      output = bam + bam_suffix
      command = ["dorado", "basecaller", model, pod5_dir, "--kit-name", barcode_kit, "-Y"]
+     command_string = " ".join(command)
+     print(f"Running {command_string}\nto generate {output}")
      with open(output, "w") as outfile:
          subprocess.run(command, stdout=outfile)
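A hypothetical invocation of canoncall; the model path, POD5 directory, and kit name below are illustrative placeholders, not values from the diff:

from smftools.informatics.helpers.canoncall import canoncall

canoncall(
    model="/models/dna_r10.4.1_e8.2_400bps_sup@v4.2.0",  # dorado basecalling model (illustrative)
    pod5_dir="/runs/pod5",                               # directory of POD5 files
    barcode_kit="SQK-NBD114-24",                         # barcoding kit used in the run
    bam="/runs/canonical_basecalls",                     # output path stem
    bam_suffix=".bam",                                   # forms /runs/canonical_basecalls.bam
)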
smftools/informatics/helpers/complement_base_list.py
@@ -0,0 +1,21 @@
+ # complement_base_list
+
+ def complement_base_list(sequence):
+     """
+     Takes a list of DNA base identities and returns their complement.
+
+     Parameters:
+         sequence (list): A list of DNA bases (e.g., ['A', 'C', 'G', 'T']).
+
+     Returns:
+         complement (list): A list of complementary DNA bases.
+     """
+     complement_mapping = {
+         'A': 'T',
+         'T': 'A',
+         'C': 'G',
+         'G': 'C',
+         'N': 'N'  # Handling ambiguous bases like 'N'
+     }
+
+     return [complement_mapping[base] for base in sequence]
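For example:

complement_base_list(['A', 'C', 'G', 'T', 'N'])  # -> ['T', 'G', 'C', 'A', 'N']

Bases outside the mapping (for example lowercase letters) would raise a KeyError.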
smftools/informatics/helpers/concatenate_fastqs_to_bam.py
@@ -0,0 +1,54 @@
+ # concatenate_fastqs_to_bam
+
+ def concatenate_fastqs_to_bam(fastq_files, output_bam, barcode_tag='BC', gzip_suffix='.gz'):
+     """
+     Concatenate multiple demultiplexed FASTQ (.fastq or .fq) files into an unaligned BAM and add the FASTQ barcode suffix to the BC tag.
+
+     Parameters:
+         fastq_files (list): List of paths to demultiplexed FASTQ files.
+         output_bam (str): Path to the output BAM file.
+         barcode_tag (str): The SAM tag for storing the barcode (default: 'BC').
+         gzip_suffix (str): Suffix to use for input gzip files (default: '.gz').
+
+     Returns:
+         None
+     """
+     import os
+     import pysam
+     import gzip
+     from Bio import SeqIO
+     from tqdm import tqdm
+
+     n_fastqs = len(fastq_files)
+
+     with pysam.AlignmentFile(output_bam, "wb", header={"HD": {"VN": "1.0"}, "SQ": []}) as bam_out:
+         for fastq_file in tqdm(fastq_files, desc="Processing FASTQ files"):
+             # Extract barcode from the FASTQ filename (handles .fq, .fastq, .fq.gz, and .fastq.gz extensions)
+             base_name = os.path.basename(fastq_file)
+             if n_fastqs > 1:
+                 if base_name.endswith('.fastq.gz'):
+                     barcode = base_name.split('_')[-1].replace(f'.fastq{gzip_suffix}', '')
+                 elif base_name.endswith('.fq.gz'):
+                     barcode = base_name.split('_')[-1].replace(f'.fq{gzip_suffix}', '')
+                 elif base_name.endswith('.fastq'):
+                     barcode = base_name.split('_')[-1].replace('.fastq', '')
+                 elif base_name.endswith('.fq'):
+                     barcode = base_name.split('_')[-1].replace('.fq', '')
+                 else:
+                     raise ValueError(f"Unexpected file extension for {fastq_file}. Only .fq, .fastq, .fq{gzip_suffix}, and .fastq{gzip_suffix} are supported.")
+
+             # Read the FASTQ file (handle gzipped and non-gzipped files)
+             open_func = gzip.open if fastq_file.endswith(gzip_suffix) else open
+             with open_func(fastq_file, 'rt') as fq_in:
+                 for record in SeqIO.parse(fq_in, 'fastq'):
+                     # Create an unaligned BAM entry for each FASTQ record
+                     aln = pysam.AlignedSegment()
+                     aln.query_name = record.id
+                     aln.query_sequence = str(record.seq)
+                     aln.flag = 4  # Unmapped
+                     aln.query_qualities = record.letter_annotations["phred_quality"]  # Biopython yields integer Phred scores, which query_qualities accepts directly
+                     if n_fastqs > 1:
+                         # Add the barcode to the BC tag
+                         aln.set_tag(barcode_tag, barcode)
+                     # Write to BAM file
+                     bam_out.write(aln)
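A sketch of how this new helper might be called on demultiplexed output; the file names are hypothetical, and the barcode is taken from the last '_'-separated field of each name:

from smftools.informatics.helpers.concatenate_fastqs_to_bam import concatenate_fastqs_to_bam

fastqs = [
    "/runs/demux/sample_barcode01.fastq.gz",  # BC tag becomes 'barcode01'
    "/runs/demux/sample_barcode02.fastq.gz",  # BC tag becomes 'barcode02'
]
concatenate_fastqs_to_bam(fastqs, "/runs/unaligned.bam")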
smftools/informatics/helpers/converted_BAM_to_adata.py
@@ -1,147 +1,233 @@
  ## converted_BAM_to_adata
- from .. import readwrite
- from .binarize_converted_base_identities import binarize_converted_base_identities
- from .find_conversion_sites import find_conversion_sites
- from .count_aligned_reads import count_aligned_reads
- from .extract_base_identities import extract_base_identities
- from .one_hot_encode import one_hot_encode
- import pandas as pd
- import numpy as np
- import anndata as ad
- import os

  def converted_BAM_to_adata(converted_FASTA, split_dir, mapping_threshold, experiment_name, conversion_types, bam_suffix):
      """
+     A wrapper function to take converted aligned_sorted_split BAM files and format the data into an anndata object.
+
+     Parameters:
+         converted_FASTA (str): A string representing the file path to the converted FASTA reference.
+         split_dir (str): A string representing the file path to the directory containing the converted aligned_sorted_split BAM files.
+         mapping_threshold (float): A value between 0 and 1 giving the minimal fraction of aligned reads that must map to a reference region. References with values above the threshold are included in the output adata.
+         experiment_name (str): A string to provide an experiment name to the output adata file.
+         conversion_types (list): A list of strings of the conversion types to use in the analysis.
+         bam_suffix (str): The suffix to use for the BAM file.

+     Returns:
+         None
+         Outputs a single gzipped adata object for the experiment.
      """
+     from .. import readwrite
+     from .binarize_converted_base_identities import binarize_converted_base_identities
+     from .find_conversion_sites import find_conversion_sites
+     from .count_aligned_reads import count_aligned_reads
+     from .extract_base_identities import extract_base_identities
+     from .make_dirs import make_dirs
+     from .ohe_batching import ohe_batching
+     import pandas as pd
+     import numpy as np
+     import anndata as ad
+     import os
+     from tqdm import tqdm
+     import gc
+
+     ##########################################################################################
+     ## Get file paths and make necessary directories. ##
      # Get all of the input BAM files
      files = os.listdir(split_dir)
-     # Change directory to the BAM directory
-     os.chdir(split_dir)
+     # Make output dirs
+     parent_dir = os.path.dirname(split_dir)
+     h5_dir = os.path.join(parent_dir, 'h5ads')
+     tmp_dir = os.path.join(parent_dir, 'tmp')
+     make_dirs([h5_dir, tmp_dir])
      # Filter file names that contain the search string in their filename and keep them in a list
      bams = [bam for bam in files if bam_suffix in bam and '.bai' not in bam]
      # Sort file list by names and print the list of file names
      bams.sort()
+     bam_path_list = [os.path.join(split_dir, bam) for bam in bams]
      print(f'Found the following BAMS: {bams}')
      final_adata = None
+     ##########################################################################################
+
+     ##########################################################################################

-     # Make a dictionary, keyed by modification type, that points to another dictionary of unconverted_record_ids. This points to a list of: 1) record length, 2) top strand conversion coordinates, 3) bottom strand conversion coordinates, 4) record sequence
+     ## need to fix this section
+     # Make a dictionary, keyed by modification type, that points to another dictionary of unconverted_record_ids. This points to a list of: 1) record length, 2) top strand conversion coordinates, 3) bottom strand conversion coordinates, 4) unconverted sequence string, 5) unconverted complement sequence
      modification_dict = {}
+     # Init a dict to be keyed by FASTA record that points to the sequence string of the unconverted record
+     record_FASTA_dict = {}
      # While populating the dictionary, also extract the longest sequence record in the input references
      max_reference_length = 0
      for conversion_type in conversion_types:
-         modification_dict[conversion_type] = find_conversion_sites(converted_FASTA, conversion_type)
+         # Points to a list containing: 1) sequence length of the record, 2) top strand coordinate list, 3) bottom strand coordinate list, 4) unconverted sequence string, 5) unconverted complement sequence
+         modification_dict[conversion_type] = find_conversion_sites(converted_FASTA, conversion_type, conversion_types)
+         # Get the max reference length
          for record in modification_dict[conversion_type].keys():
              if modification_dict[conversion_type][record][0] > max_reference_length:
                  max_reference_length = modification_dict[conversion_type][record][0]

-     # Iterate over the experiment BAM files
-     for bam_index, bam in enumerate(bams):
-         # Give each bam a sample name
-         sample = bam.split(sep=bam_suffix)[0]
+             mod_type, strand = record.split('_')[-2:]
+
+             chromosome = record.split('_{0}_{1}'.format(mod_type, strand))[0]
+             unconverted_chromosome_name = f'{chromosome}_{conversion_types[0]}_top'
+             current_reference_length = modification_dict[mod_type][unconverted_chromosome_name][0]
+             delta_max_length = max_reference_length - current_reference_length
+             sequence = modification_dict[mod_type][unconverted_chromosome_name][3] + 'N'*delta_max_length
+             complement = modification_dict[mod_type][unconverted_chromosome_name][4] + 'N'*delta_max_length
+             record_FASTA_dict[record] = [sequence, complement, chromosome, unconverted_chromosome_name, current_reference_length, delta_max_length, conversion_type, strand]
+     ##########################################################################################
+
+     ##########################################################################################
+     bam_alignment_stats_dict = {}
+     records_to_analyze = []
+     for bam_index, bam in enumerate(bam_path_list):
+         bam_alignment_stats_dict[bam_index] = {}
          # look at aligned read proportions in the bam
          aligned_reads_count, unaligned_reads_count, record_counts = count_aligned_reads(bam)
          percent_aligned = aligned_reads_count*100 / (aligned_reads_count+unaligned_reads_count)
-         print(f'{percent_aligned} percent of total reads in {bam} aligned successfully')
-         records_to_analyze = []
+         print(f'{percent_aligned} percent of total reads in {bams[bam_index]} aligned successfully')
+         bam_alignment_stats_dict[bam_index]['Total'] = (aligned_reads_count, percent_aligned)
          # Iterate over converted reference strands and decide which to use in the analysis based on the mapping_threshold
          for record in record_counts:
              print(f'{record_counts[record][0]} reads mapped to reference record {record}. This is {record_counts[record][1]*100} percent of all mapped reads in the sample.')
              if record_counts[record][1] >= mapping_threshold:
                  records_to_analyze.append(record)
-         print(f'Records to analyze: {records_to_analyze}')
-         # Iterate over records to analyze (ie all conversions detected)
-         record_FASTA_dict = {}
-         for record in records_to_analyze:
-             mod_type, strand = record.split('_')[-2:]
-             if strand == 'top':
-                 strand_index = 1
-             elif strand == 'bottom':
-                 strand_index = 2
+                 bam_alignment_stats_dict[bam_index][record] = (record_counts[record][0], record_counts[record][1]*100)
+     records_to_analyze = set(records_to_analyze)
+     ##########################################################################################

-             chromosome = record.split('_{0}_{1}'.format(mod_type, strand))[0]
-             unconverted_chromosome_name = chromosome + '_unconverted_top'
-             positions = modification_dict[mod_type][unconverted_chromosome_name][strand_index]
-             current_reference_length = modification_dict[mod_type][unconverted_chromosome_name][0]
-             delta_max_length = max_reference_length - current_reference_length
-             sequence = modification_dict[mod_type][unconverted_chromosome_name][3] + 'N'*delta_max_length
-             record_FASTA_dict[f'{record}'] = sequence
-             print(f'Chromosome: {chromosome}\nUnconverted Sequence: {sequence}')
+     ##########################################################################################
+     # One hot encode read sequences and write them out into the tmp_dir as h5ad files.
+     # Save the file paths in the bam_record_ohe_files dict.
+     bam_record_ohe_files = {}
+
+     # Iterate over split bams
+     for bam_index, bam in enumerate(bam_path_list):
+         # Iterate over references to process
+         for record in records_to_analyze:
+             unconverted_record_name = "_".join(record.split('_')[:-2]) + '_unconverted_top'
+             sample = bams[bam_index].split(sep=bam_suffix)[0]
+             chromosome = record_FASTA_dict[unconverted_record_name][2]
+             current_reference_length = record_FASTA_dict[unconverted_record_name][4]
+             mod_type = record_FASTA_dict[unconverted_record_name][6]
+             strand = record_FASTA_dict[unconverted_record_name][7]
+
+             # Extract the base identities of reads aligned to the record
+             fwd_base_identities, rev_base_identities = extract_base_identities(bam, record, range(current_reference_length), max_reference_length)

-             # Get a dictionary of positional identities keyed by read id
-             print(f'Extracting base identities of target positions')
-             target_base_identities = extract_base_identities(bam, record, positions, max_reference_length)
              # binarize the dictionary of positional identities
-             print(f'Binarizing base identities of target positions')
-             binarized_base_identities = binarize_converted_base_identities(target_base_identities, strand, mod_type)
+             print(f'Binarizing base identities')
+             fwd_binarized_base_identities = binarize_converted_base_identities(fwd_base_identities, strand, mod_type)
+             rev_binarized_base_identities = binarize_converted_base_identities(rev_base_identities, strand, mod_type)
+             merged_binarized_base_identities = {**fwd_binarized_base_identities, **rev_binarized_base_identities}
              # converts the base identity dictionary to a dataframe.
-             binarized_base_identities_df = pd.DataFrame.from_dict(binarized_base_identities, orient='index')
+             binarized_base_identities_df = pd.DataFrame.from_dict(merged_binarized_base_identities, orient='index')
              sorted_index = sorted(binarized_base_identities_df.index)
              binarized_base_identities_df = binarized_base_identities_df.reindex(sorted_index)
-             # Get the sequence string of every read
-             print(f'Extracting base identities of all positions in each read')
-             all_base_identities = extract_base_identities(bam, record, range(current_reference_length), max_reference_length)
-             # One hot encode the sequence string of the reads
-             print(f'One hot encoding base identities of all positions in each read')
-             one_hot_reads = {read_name: one_hot_encode(seq) for read_name, seq in all_base_identities.items()}
-
-             # Initialize empty DataFrames for each base
-             read_names = list(one_hot_reads.keys())
-             sequence_length = one_hot_reads[read_names[0]].shape[0]
-             df_A = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
-             df_C = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
-             df_G = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
-             df_T = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
-             df_N = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
-
-             # Iterate through the dictionary and populate the DataFrames
-             for read_name, one_hot_array in one_hot_reads.items():
-                 df_A.loc[read_name] = one_hot_array[:, 0]
-                 df_C.loc[read_name] = one_hot_array[:, 1]
-                 df_G.loc[read_name] = one_hot_array[:, 2]
-                 df_T.loc[read_name] = one_hot_array[:, 3]
-                 df_N.loc[read_name] = one_hot_array[:, 4]
-
-             ohe_df_map = {0: df_A, 1: df_C, 2: df_G, 3: df_T, 4: df_N}

              # Load an anndata object with the sample data
              X = binarized_base_identities_df.values
              adata = ad.AnnData(X, dtype=X.dtype)
-             adata.obs_names = binarized_base_identities_df.index
-             adata.obs_names = adata.obs_names.astype(str)
-             adata.var_names = binarized_base_identities_df.columns
-             adata.var_names = adata.var_names.astype(str)
-             adata.obs['Sample'] = [sample] * len(adata)
-             adata.obs['Strand'] = [strand] * len(adata)
-             adata.obs['Dataset'] = [mod_type] * len(adata)
-             adata.obs['Reference'] = [record] * len(adata)
-             adata.obs['Reference_chromosome'] = [chromosome] * len(adata)
-
-             for j, base in enumerate(['A', 'C', 'G', 'T', 'N']):
-                 adata.layers[f'{base}_binary_encoding'] = ohe_df_map[j].values
+             if adata.shape[0] > 0:
+                 adata.obs_names = binarized_base_identities_df.index.astype(str)
+                 adata.var_names = binarized_base_identities_df.columns.astype(str)
+                 adata.obs['Sample'] = [sample] * len(adata)
+                 adata.obs['Strand'] = [strand] * len(adata)
+                 adata.obs['Dataset'] = [mod_type] * len(adata)
+                 adata.obs['Reference'] = [record] * len(adata)
+                 adata.obs['Reference_chromosome'] = [chromosome] * len(adata)
+
+                 read_mapping_direction = []
+                 for read_id in adata.obs_names:
+                     if read_id in fwd_base_identities.keys():
+                         read_mapping_direction.append('fwd')
+                     elif read_id in rev_base_identities.keys():
+                         read_mapping_direction.append('rev')
+                     else:
+                         read_mapping_direction.append('unk')
+
+                 adata.obs['Read_mapping_direction'] = read_mapping_direction
+
+                 # One hot encode the sequence string of the reads
+                 fwd_ohe_files = ohe_batching(fwd_base_identities, tmp_dir, record, f"{bam_index}_fwd", batch_size=100000)
+                 rev_ohe_files = ohe_batching(rev_base_identities, tmp_dir, record, f"{bam_index}_rev", batch_size=100000)
+                 bam_record_ohe_files[f'{bam_index}_{record}'] = fwd_ohe_files + rev_ohe_files
+                 del fwd_base_identities, rev_base_identities
+
+                 one_hot_reads = {}
+                 n_rows_OHE = 5
+                 for ohe_file in tqdm(bam_record_ohe_files[f'{bam_index}_{record}'], desc="Reading in OHE reads"):
+                     tmp_ohe_dict = ad.read_h5ad(ohe_file).uns
+                     one_hot_reads.update(tmp_ohe_dict)
+                     del tmp_ohe_dict
+
+                 read_names = list(one_hot_reads.keys())
+                 dict_A, dict_C, dict_G, dict_T, dict_N = {}, {}, {}, {}, {}
+
+                 sequence_length = one_hot_reads[read_names[0]].reshape(n_rows_OHE, -1).shape[1]
+                 df_A = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
+                 df_C = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
+                 df_G = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
+                 df_T = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
+                 df_N = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
+
+                 for read_name, one_hot_array in one_hot_reads.items():
+                     one_hot_array = one_hot_array.reshape(n_rows_OHE, -1)
+                     dict_A[read_name] = one_hot_array[0, :]
+                     dict_C[read_name] = one_hot_array[1, :]
+                     dict_G[read_name] = one_hot_array[2, :]
+                     dict_T[read_name] = one_hot_array[3, :]
+                     dict_N[read_name] = one_hot_array[4, :]
+
+                 del one_hot_reads
+                 gc.collect()
+
+                 for j, read_name in tqdm(enumerate(sorted_index), desc='Loading dataframes of OHE reads', total=len(sorted_index)):
+                     df_A.iloc[j] = dict_A[read_name]
+                     df_C.iloc[j] = dict_C[read_name]
+                     df_G.iloc[j] = dict_G[read_name]
+                     df_T.iloc[j] = dict_T[read_name]
+                     df_N.iloc[j] = dict_N[read_name]
+
+                 del dict_A, dict_C, dict_G, dict_T, dict_N
+                 gc.collect()
+
+                 ohe_df_map = {0: df_A, 1: df_C, 2: df_G, 3: df_T, 4: df_N}
+
+                 for j, base in enumerate(['A', 'C', 'G', 'T', 'N']):
+                     adata.layers[f'{base}_binary_encoding'] = ohe_df_map[j].values
+                     ohe_df_map[j] = None  # Reassign pointer for memory usage purposes
+
+                 if final_adata:
+                     if adata.shape[0] > 0:
+                         final_adata = ad.concat([final_adata, adata], join='outer', index_unique=None)
+                     else:
+                         print(f"{sample} did not have any mapped reads on {record}, omitting from final adata")
+                 else:
+                     if adata.shape[0] > 0:
+                         final_adata = adata
+                     else:
+                         print(f"{sample} did not have any mapped reads on {record}, omitting from final adata")

-             if final_adata:
-                 final_adata = ad.concat([final_adata, adata], join='outer', index_unique=None)
              else:
-                 final_adata = adata
+                 print(f"{sample} did not have any mapped reads on {record}, omitting from final adata")
+
+     # Set obs columns to type 'category'
+     for col in final_adata.obs.columns:
+         final_adata.obs[col] = final_adata.obs[col].astype('category')

-     for record in record_FASTA_dict.keys():
-         chromosome = record.split('_')[0]
-         sequence = record_FASTA_dict[record]
+     for record in records_to_analyze:
+         unconverted_record_name = "_".join(record.split('_')[:-2]) + '_unconverted_top'
+         sequence = record_FASTA_dict[unconverted_record_name][0]
+         complement = record_FASTA_dict[unconverted_record_name][1]
+         chromosome = record_FASTA_dict[unconverted_record_name][2]
+         final_adata.var[f'{chromosome}_unconverted_top_strand_FASTA_base'] = list(sequence)
+         final_adata.var[f'{chromosome}_unconverted_bottom_strand_FASTA_base'] = list(complement)
          final_adata.uns[f'{record}_FASTA_sequence'] = sequence
-         final_adata.var[f'{record}_FASTA_sequence'] = list(sequence)
-         record_subset = final_adata[final_adata.obs['Reference'] == record].copy()
-         layer_map, layer_counts = {}, []
-         for i, layer in enumerate(record_subset.layers):
-             layer_map[i] = layer.split('_')[0]
-             layer_counts.append(np.sum(record_subset.layers[layer], axis=0))
-         count_array = np.array(layer_counts)
-         nucleotide_indexes = np.argmax(count_array, axis=0)
-         consensus_sequence_list = [layer_map[i] for i in nucleotide_indexes]
-         final_adata.var[f'{record}_consensus_across_samples'] = consensus_sequence_list

      ######################################################################################################

      ######################################################################################################
      ## Export the final adata object
-     final_adata.write_h5ad('{0}_{1}.h5ad.gz'.format(readwrite.date_string(), experiment_name), compression='gzip')
+     final_output = os.path.join(h5_dir, f'{readwrite.date_string()}_{experiment_name}.h5ad.gz')
+     final_adata.write_h5ad(final_output, compression='gzip')
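Finally, a hypothetical end-of-pipeline call for the reworked converted_BAM_to_adata; the argument values are illustrative (the record naming above suggests conversion_types leads with 'unconverted'), and the .h5ad.gz output lands in the new h5ads directory beside split_dir:

from smftools.informatics.helpers.converted_BAM_to_adata import converted_BAM_to_adata

converted_BAM_to_adata(
    converted_FASTA="/refs/genome_converted.fa",  # converted FASTA reference
    split_dir="/runs/aligned_sorted_split",       # directory of aligned_sorted_split BAMs
    mapping_threshold=0.05,                       # keep references with >= 5% of mapped reads
    experiment_name="F1_pilot",                   # used in the output file name
    conversion_types=["unconverted", "5mC"],      # illustrative; first entry names the unconverted records
    bam_suffix=".bam",
)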