smftools 0.1.3__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff compares the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as published.
Files changed (109)
  1. smftools/__init__.py +5 -1
  2. smftools/_version.py +1 -1
  3. smftools/informatics/__init__.py +2 -0
  4. smftools/informatics/archived/print_bam_query_seq.py +29 -0
  5. smftools/informatics/basecall_pod5s.py +80 -0
  6. smftools/informatics/conversion_smf.py +63 -10
  7. smftools/informatics/direct_smf.py +66 -18
  8. smftools/informatics/helpers/LoadExperimentConfig.py +1 -0
  9. smftools/informatics/helpers/__init__.py +16 -2
  10. smftools/informatics/helpers/align_and_sort_BAM.py +27 -16
  11. smftools/informatics/helpers/aligned_BAM_to_bed.py +49 -48
  12. smftools/informatics/helpers/bam_qc.py +66 -0
  13. smftools/informatics/helpers/binarize_converted_base_identities.py +69 -21
  14. smftools/informatics/helpers/canoncall.py +12 -3
  15. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +5 -4
  16. smftools/informatics/helpers/converted_BAM_to_adata.py +34 -22
  17. smftools/informatics/helpers/converted_BAM_to_adata_II.py +369 -0
  18. smftools/informatics/helpers/demux_and_index_BAM.py +52 -0
  19. smftools/informatics/helpers/extract_base_identities.py +33 -46
  20. smftools/informatics/helpers/extract_mods.py +55 -23
  21. smftools/informatics/helpers/extract_read_features_from_bam.py +31 -0
  22. smftools/informatics/helpers/extract_read_lengths_from_bed.py +25 -0
  23. smftools/informatics/helpers/find_conversion_sites.py +33 -44
  24. smftools/informatics/helpers/generate_converted_FASTA.py +87 -86
  25. smftools/informatics/helpers/modcall.py +13 -5
  26. smftools/informatics/helpers/modkit_extract_to_adata.py +762 -396
  27. smftools/informatics/helpers/ohe_batching.py +65 -41
  28. smftools/informatics/helpers/ohe_layers_decode.py +32 -0
  29. smftools/informatics/helpers/one_hot_decode.py +27 -0
  30. smftools/informatics/helpers/one_hot_encode.py +45 -9
  31. smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +1 -0
  32. smftools/informatics/helpers/run_multiqc.py +28 -0
  33. smftools/informatics/helpers/split_and_index_BAM.py +3 -8
  34. smftools/informatics/load_adata.py +58 -3
  35. smftools/plotting/__init__.py +15 -0
  36. smftools/plotting/classifiers.py +355 -0
  37. smftools/plotting/general_plotting.py +205 -0
  38. smftools/plotting/position_stats.py +462 -0
  39. smftools/preprocessing/__init__.py +6 -7
  40. smftools/preprocessing/append_C_context.py +22 -9
  41. smftools/preprocessing/{mark_duplicates.py → archives/mark_duplicates.py} +38 -26
  42. smftools/preprocessing/binarize_on_Youden.py +35 -32
  43. smftools/preprocessing/binary_layers_to_ohe.py +13 -3
  44. smftools/preprocessing/calculate_complexity.py +3 -2
  45. smftools/preprocessing/calculate_converted_read_methylation_stats.py +44 -46
  46. smftools/preprocessing/calculate_coverage.py +26 -25
  47. smftools/preprocessing/calculate_pairwise_differences.py +49 -0
  48. smftools/preprocessing/calculate_position_Youden.py +18 -7
  49. smftools/preprocessing/calculate_read_length_stats.py +39 -46
  50. smftools/preprocessing/clean_NaN.py +33 -25
  51. smftools/preprocessing/filter_adata_by_nan_proportion.py +31 -0
  52. smftools/preprocessing/filter_converted_reads_on_methylation.py +20 -5
  53. smftools/preprocessing/filter_reads_on_length.py +14 -4
  54. smftools/preprocessing/flag_duplicate_reads.py +149 -0
  55. smftools/preprocessing/invert_adata.py +18 -11
  56. smftools/preprocessing/load_sample_sheet.py +30 -16
  57. smftools/preprocessing/recipes.py +22 -20
  58. smftools/preprocessing/subsample_adata.py +58 -0
  59. smftools/readwrite.py +105 -13
  60. smftools/tools/__init__.py +49 -0
  61. smftools/tools/apply_hmm.py +202 -0
  62. smftools/tools/apply_hmm_batched.py +241 -0
  63. smftools/tools/archived/classify_methylated_features.py +66 -0
  64. smftools/tools/archived/classify_non_methylated_features.py +75 -0
  65. smftools/tools/archived/subset_adata_v1.py +32 -0
  66. smftools/tools/archived/subset_adata_v2.py +46 -0
  67. smftools/tools/calculate_distances.py +18 -0
  68. smftools/tools/calculate_umap.py +62 -0
  69. smftools/tools/call_hmm_peaks.py +105 -0
  70. smftools/tools/classifiers.py +787 -0
  71. smftools/tools/cluster_adata_on_methylation.py +105 -0
  72. smftools/tools/data/__init__.py +2 -0
  73. smftools/tools/data/anndata_data_module.py +90 -0
  74. smftools/tools/data/preprocessing.py +6 -0
  75. smftools/tools/display_hmm.py +18 -0
  76. smftools/tools/general_tools.py +69 -0
  77. smftools/tools/hmm_readwrite.py +16 -0
  78. smftools/tools/inference/__init__.py +1 -0
  79. smftools/tools/inference/lightning_inference.py +41 -0
  80. smftools/tools/models/__init__.py +9 -0
  81. smftools/tools/models/base.py +14 -0
  82. smftools/tools/models/cnn.py +34 -0
  83. smftools/tools/models/lightning_base.py +41 -0
  84. smftools/tools/models/mlp.py +17 -0
  85. smftools/tools/models/positional.py +17 -0
  86. smftools/tools/models/rnn.py +16 -0
  87. smftools/tools/models/sklearn_models.py +40 -0
  88. smftools/tools/models/transformer.py +133 -0
  89. smftools/tools/models/wrappers.py +20 -0
  90. smftools/tools/nucleosome_hmm_refinement.py +104 -0
  91. smftools/tools/position_stats.py +239 -0
  92. smftools/tools/read_stats.py +70 -0
  93. smftools/tools/subset_adata.py +19 -23
  94. smftools/tools/train_hmm.py +78 -0
  95. smftools/tools/training/__init__.py +1 -0
  96. smftools/tools/training/train_lightning_model.py +47 -0
  97. smftools/tools/utils/__init__.py +2 -0
  98. smftools/tools/utils/device.py +10 -0
  99. smftools/tools/utils/grl.py +14 -0
  100. {smftools-0.1.3.dist-info → smftools-0.1.7.dist-info}/METADATA +47 -11
  101. smftools-0.1.7.dist-info/RECORD +136 -0
  102. smftools/tools/apply_HMM.py +0 -1
  103. smftools/tools/read_HMM.py +0 -1
  104. smftools/tools/train_HMM.py +0 -43
  105. smftools-0.1.3.dist-info/RECORD +0 -84
  106. /smftools/preprocessing/{remove_duplicates.py → archives/remove_duplicates.py} +0 -0
  107. /smftools/tools/{cluster.py → evaluation/__init__.py} +0 -0
  108. {smftools-0.1.3.dist-info → smftools-0.1.7.dist-info}/WHEEL +0 -0
  109. {smftools-0.1.3.dist-info → smftools-0.1.7.dist-info}/licenses/LICENSE +0 -0
smftools/informatics/helpers/aligned_BAM_to_bed.py
@@ -1,73 +1,74 @@
-# aligned_BAM_to_bed
-
-def aligned_BAM_to_bed(aligned_BAM, plotting_dir, bed_dir, fasta):
+def aligned_BAM_to_bed(aligned_BAM, out_dir, fasta, make_bigwigs, threads=None):
     """
-    Takes an aligned BAM as input and writes a bed file of reads as output.
-    Bed columns are: Record name, start position, end position, read length, read name
+    Takes an aligned BAM as input and writes a BED file of reads as output.
+    Bed columns are: Record name, start position, end position, read length, read name.
 
     Parameters:
         aligned_BAM (str): Path to an input aligned_BAM to extract to a BED file.
-        plotting_dir (str): Path to write out read alignment length and coverage histograms
-        bed_dir (str): Path to write out read alignment coordinates
-        fasta (str): File path to the reference genome to align to.
+        out_dir (str): Directory to output files.
+        fasta (str): File path to the reference genome.
+        make_bigwigs (bool): Whether to generate bigwig files.
+        threads (int): Number of threads to use.
 
     Returns:
         None
-
     """
     import subprocess
     import os
+    import concurrent.futures
+    from concurrent.futures import ProcessPoolExecutor
     from .bed_to_bigwig import bed_to_bigwig
+    from . import make_dirs
     from .plot_read_length_and_coverage_histograms import plot_read_length_and_coverage_histograms
 
-    bed_output_basename = os.path.basename(aligned_BAM).split('.bam')[0] + '_bed.bed'
-    bed_output = os.path.join(bed_dir, bed_output_basename)
+    threads = threads or os.cpu_count()  # Use max available cores if not specified
+
+    # Create necessary directories
+    plotting_dir = os.path.join(out_dir, "bed_cov_histograms")
+    bed_dir = os.path.join(out_dir, "beds")
+    make_dirs([plotting_dir, bed_dir])
 
-    samtools_view = subprocess.Popen(["samtools", "view", aligned_BAM], stdout=subprocess.PIPE)
+    bed_output = os.path.join(bed_dir, os.path.basename(aligned_BAM).replace(".bam", "_bed.bed"))
+
+    print(f"Creating BED from BAM: {aligned_BAM} using {threads} threads...")
+
+    # Convert BAM to BED format
     with open(bed_output, "w") as output_file:
-        awk_process = subprocess.Popen(["awk", '{print $3 "\t" $4 "\t" $4+length($10)-1 "\t" length($10)-1 "\t" $1}'], stdin=samtools_view.stdout, stdout=output_file)
+        samtools_view = subprocess.Popen(["samtools", "view", "-@", str(threads), aligned_BAM], stdout=subprocess.PIPE)
+        awk_process = subprocess.Popen(
+            ["awk", '{print $3 "\t" $4 "\t" $4+length($10)-1 "\t" length($10)-1 "\t" $1}'],
+            stdin=samtools_view.stdout,
+            stdout=output_file
+        )
+
     samtools_view.stdout.close()
     awk_process.wait()
     samtools_view.wait()
 
-    def split_bed(bed, delete_input=True):
-        """
-        Reads in a BED file and splits it into two separate BED files based on alignment status.
-
-        Parameters:
-            bed (str): Path to the input BED file.
-            delete_input (bool): Whether to delete the input bed file
-
-        Returns:
-            aligned (str): Path to the aligned bed file
-        """
-        unaligned = bed.split('.bed')[0] + '_unaligned.bed'
-        aligned = bed.split('.bed')[0] + '_aligned.bed'
-
-        with open(bed, 'r') as infile, \
-            open(unaligned, 'w') as unaligned_outfile, \
-            open(aligned, 'w') as aligned_outfile:
-
+    print(f"BED file created: {bed_output}")
+
+    def split_bed(bed):
+        """Splits BED into aligned and unaligned reads."""
+        aligned = bed.replace(".bed", "_aligned.bed")
+        unaligned = bed.replace(".bed", "_unaligned.bed")
+
+        with open(bed, "r") as infile, open(aligned, "w") as aligned_out, open(unaligned, "w") as unaligned_out:
             for line in infile:
-                fields = line.strip().split('\t')
-
-                if fields[0] == '*':
-                    unaligned_outfile.write(line)
-                else:
-                    aligned_outfile.write(line)
-
-        if delete_input:
-            os.remove(bed)
-
-        return aligned
-
-    aligned_bed = split_bed(bed_output)
+                (unaligned_out if line.startswith("*") else aligned_out).write(line)
 
-    # Write out basic plots of reference coverage and read lengths
-    plot_read_length_and_coverage_histograms(aligned_bed, plotting_dir)
+        os.remove(bed)
+        return aligned
 
-    # Make a bedgraph and bigwig for the aligned reads
-    bed_to_bigwig(fasta, aligned_bed)
+    print(f"Splitting BED: {bed_output}")
+    aligned_bed = split_bed(bed_output)
 
+    with ProcessPoolExecutor() as executor:  # Use processes instead of threads
+        futures = []
+        futures.append(executor.submit(plot_read_length_and_coverage_histograms, aligned_bed, plotting_dir))
+        if make_bigwigs:
+            futures.append(executor.submit(bed_to_bigwig, fasta, aligned_bed))
 
+        # Wait for all tasks to complete
+        concurrent.futures.wait(futures)
 
+    print("Processing completed successfully.")
smftools/informatics/helpers/bam_qc.py
@@ -0,0 +1,66 @@
+## bam_qc
+
+def bam_qc(bam_files, bam_qc_dir, threads, modality, stats=True, flagstats=True, idxstats=True):
+    """
+    Performs QC on BAM files by running samtools stats, flagstat, and idxstats.
+
+    Parameters:
+        - bam_files: List of BAM file paths.
+        - bam_qc_dir: Directory to save QC reports.
+        - threads: Number threads to use.
+        - modality: 'conversion' or 'direct' (affects processing mode).
+        - stats: Run `samtools stats` if True.
+        - flagstats: Run `samtools flagstat` if True.
+        - idxstats: Run `samtools idxstats` if True.
+    """
+    import os
+    import subprocess
+
+    # Ensure the QC output directory exists
+    os.makedirs(bam_qc_dir, exist_ok=True)
+
+    if threads:
+        threads = str(threads)
+    else:
+        pass
+
+    for bam in bam_files:
+        bam_name = os.path.basename(bam).replace(".bam", "")  # Extract filename without extension
+
+        # Run samtools QC commands based on selected options
+        if stats:
+            stats_out = os.path.join(bam_qc_dir, f"{bam_name}_stats.txt")
+            if threads:
+                command = ["samtools", "stats", "-@", threads, bam]
+            else:
+                command = ["samtools", "stats", bam]
+            print(f"Running: {' '.join(command)} > {stats_out}")
+            with open(stats_out, "w") as out_file:
+                subprocess.run(command, stdout=out_file)
+
+        if flagstats:
+            flagstats_out = os.path.join(bam_qc_dir, f"{bam_name}_flagstat.txt")
+            if threads:
+                command = ["samtools", "flagstat", "-@", threads, bam]
+            else:
+                command = ["samtools", "flagstat", bam]
+            print(f"Running: {' '.join(command)} > {flagstats_out}")
+            with open(flagstats_out, "w") as out_file:
+                subprocess.run(command, stdout=out_file)
+
+        if idxstats:
+            idxstats_out = os.path.join(bam_qc_dir, f"{bam_name}_idxstats.txt")
+            if threads:
+                command = ["samtools", "idxstats", "-@", threads, bam]
+            else:
+                command = ["samtools", "idxstats", bam]
+            print(f"Running: {' '.join(command)} > {idxstats_out}")
+            with open(idxstats_out, "w") as out_file:
+                subprocess.run(command, stdout=out_file)
+
+    if modality == 'conversion':
+        pass
+    elif modality == 'direct':
+        pass
+
+    print("QC processing completed.")
smftools/informatics/helpers/binarize_converted_base_identities.py
@@ -1,31 +1,79 @@
-## binarize_converted_base_identities
-# Conversion SMF specific
-def binarize_converted_base_identities(base_identities, strand, modification_type):
+def binarize_converted_base_identities(base_identities, strand, modification_type, bam, device='cpu'):
     """
-    Binarizes conversion SMF data within a sequence string
+    Efficiently binarizes conversion SMF data within a sequence string using NumPy arrays.
 
     Parameters:
         base_identities (dict): A dictionary returned by extract_base_identities. Keyed by read name. Points to a list of base identities.
         strand (str): A string indicating which strand was converted in the experiment (options are 'top' and 'bottom').
         modification_type (str): A string indicating the modification type of interest (options are '5mC' and '6mA').
-
+        bam (str): The bam file path
+
     Returns:
-        binarized_base_identities (dict): A binarized dictionary, where 1 represents a methylated site. 0 represents an unmethylated site. NaN represents a site that does not carry methylation information.
+        dict: A dictionary where 1 represents a methylated site, 0 represents an unmethylated site, and NaN represents a site without methylation info.
     """
     import numpy as np
+
+    # If the modification type is 'unconverted', return NaN for all positions
+    if modification_type == "unconverted":
+        #print(f"Skipping binarization for unconverted {strand} reads on bam: {bam}.")
+        return {key: np.full(len(bases), np.nan) for key, bases in base_identities.items()}
+
+    # Define mappings for binarization based on strand and modification type
+    binarization_maps = {
+        ('top', '5mC'): {'C': 1, 'T': 0},
+        ('top', '6mA'): {'A': 1, 'G': 0},
+        ('bottom', '5mC'): {'G': 1, 'A': 0},
+        ('bottom', '6mA'): {'T': 1, 'C': 0}
+    }
+
+    if (strand, modification_type) not in binarization_maps:
+        raise ValueError(f"Invalid combination of strand='{strand}' and modification_type='{modification_type}'")
+
+    # Fetch the appropriate mapping
+    base_map = binarization_maps[(strand, modification_type)]
+
     binarized_base_identities = {}
-    # Iterate over base identity keys to binarize the base identities
-    for key in base_identities.keys():
-        if strand == 'top':
-            if modification_type == '5mC':
-                binarized_base_identities[key] = [1 if x == 'C' else 0 if x == 'T' else np.nan for x in base_identities[key]]
-            elif modification_type == '6mA':
-                binarized_base_identities[key] = [1 if x == 'A' else 0 if x == 'G' else np.nan for x in base_identities[key]]
-        elif strand == 'bottom':
-            if modification_type == '5mC':
-                binarized_base_identities[key] = [1 if x == 'G' else 0 if x == 'A' else np.nan for x in base_identities[key]]
-            elif modification_type == '6mA':
-                binarized_base_identities[key] = [1 if x == 'T' else 0 if x == 'C' else np.nan for x in base_identities[key]]
-        else:
-            print(f"{strand} not recognized")
-    return binarized_base_identities
+    for key, bases in base_identities.items():
+        arr = np.array(bases, dtype='<U1')
+        binarized = np.vectorize(lambda x: base_map.get(x, np.nan))(arr)  # Apply mapping with fallback to NaN
+        binarized_base_identities[key] = binarized
+
+    return binarized_base_identities
+    # import torch
+
+    # # If the modification type is 'unconverted', return NaN for all positions
+    # if modification_type == "unconverted":
+    #     print(f"Skipping binarization for unconverted {strand} reads on bam: {bam}.")
+    #     return {key: torch.full((len(bases),), float('nan'), device=device) for key, bases in base_identities.items()}
+
+    # # Define mappings for binarization based on strand and modification type
+    # binarization_maps = {
+    #     ('top', '5mC'): {'C': 1, 'T': 0},
+    #     ('top', '6mA'): {'A': 1, 'G': 0},
+    #     ('bottom', '5mC'): {'G': 1, 'A': 0},
+    #     ('bottom', '6mA'): {'T': 1, 'C': 0}
+    # }
+
+    # if (strand, modification_type) not in binarization_maps:
+    #     raise ValueError(f"Invalid combination of strand='{strand}' and modification_type='{modification_type}'")
+
+    # # Fetch the appropriate mapping
+    # base_map = binarization_maps[(strand, modification_type)]
+
+    # # Convert mapping to tensor
+    # base_keys = list(base_map.keys())
+    # base_values = torch.tensor(list(base_map.values()), dtype=torch.float32, device=device)
+
+    # # Create a lookup dictionary (ASCII-based for fast mapping)
+    # lookup_table = torch.full((256,), float('nan'), dtype=torch.float32, device=device)
+    # for k, v in zip(base_keys, base_values):
+    #     lookup_table[ord(k)] = v
+
+    # # Process reads
+    # binarized_base_identities = {}
+    # for key, bases in base_identities.items():
+    #     bases_tensor = torch.tensor([ord(c) for c in bases], dtype=torch.uint8, device=device)  # Convert chars to ASCII
+    #     binarized = lookup_table[bases_tensor]  # Efficient lookup
+    #     binarized_base_identities[key] = binarized
+
+    # return binarized_base_identities
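To make the lookup table concrete, here is a self-contained sketch of the ('top', '5mC') case that re-implements the mapping outside the package (toy read; a float map is used so NaN fits the output array):

    import numpy as np

    base_map = {'C': 1.0, 'T': 0.0}  # top-strand 5mC: retained C = methylated, converted T = unmethylated
    bases = np.array(list("CCTGA"), dtype='<U1')
    binarized = np.array([base_map.get(b, np.nan) for b in bases])
    print(binarized)  # [ 1.  1.  0. nan nan] -- A and G carry no 5mC signal on this strand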
smftools/informatics/helpers/canoncall.py
@@ -1,16 +1,20 @@
 ## canoncall
 
 # Conversion SMF specific
-def canoncall(model, pod5_dir, barcode_kit, bam, bam_suffix):
+def canoncall(model_dir, model, pod5_dir, barcode_kit, bam, bam_suffix, barcode_both_ends=True, trim=False, device='auto'):
     """
     Wrapper function for dorado canonical base calling.
 
     Parameters:
-        model (str): a string representing the file path to the dorado basecalling model.
+        model_dir (str): a string representing the file path to the dorado basecalling model directory.
+        model (str): a string representing the the dorado basecalling model.
         pod5_dir (str): a string representing the file path to the experiment directory containing the POD5 files.
         barcode_kit (str): A string reppresenting the barcoding kit used in the experiment.
         bam (str): File path to the BAM file to output.
         bam_suffix (str): The suffix to use for the BAM file.
+        barcode_both_ends (bool): Whether to require a barcode detection on both ends for demultiplexing.
+        trim (bool): Whether to trim barcodes, adapters, and primers from read ends.
+        device (str): The device to use. 'auto' is default, which can detect device to use. Can also specify metal, cpu, cuda.
 
     Returns:
         None
@@ -18,7 +22,12 @@ def canoncall(model, pod5_dir, barcode_kit, bam, bam_suffix):
     """
     import subprocess
     output = bam + bam_suffix
-    command = ["dorado", "basecaller", model, pod5_dir, "--kit-name", barcode_kit, "-Y"]
+    command = ["dorado", "basecaller", "--models-directory", model_dir, "--kit-name", barcode_kit, "--device", device, "--batchsize", "0"]
+    if barcode_both_ends:
+        command.append("--barcode-both-ends")
+    if not trim:
+        command.append("--no-trim")
+    command += [model, pod5_dir]
     command_string = " ".join(command)
     print(f"Running {command_string}\n to generate {output}")
     with open(output, "w") as outfile:
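With the new defaults (barcode_both_ends=True, trim=False), the list assembled above comes out roughly as follows before being handed to subprocess (model name, kit, and paths are placeholder values):

    command = ["dorado", "basecaller",
               "--models-directory", "/path/to/models",
               "--kit-name", "SQK-NBD114-24",
               "--device", "auto", "--batchsize", "0",
               "--barcode-both-ends", "--no-trim",
               "hac", "/path/to/pod5s"]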
smftools/informatics/helpers/concatenate_fastqs_to_bam.py
@@ -36,7 +36,9 @@ def concatenate_fastqs_to_bam(fastq_files, output_bam, barcode_tag='BC', gzip_su
                 barcode = base_name.split('_')[-1].replace('.fq', '')
             else:
                 raise ValueError(f"Unexpected file extension for {fastq_file}. Only .fq, .fastq, .fq{gzip_suffix}, and .fastq{gzip_suffix} are supported.")
-
+        else:
+            barcode = 'barcode0'
+
         # Read the FASTQ file (handle gzipped and non-gzipped files)
         open_func = gzip.open if fastq_file.endswith(gzip_suffix) else open
         with open_func(fastq_file, 'rt') as fq_in:
@@ -47,8 +49,7 @@ def concatenate_fastqs_to_bam(fastq_files, output_bam, barcode_tag='BC', gzip_su
                 aln.query_sequence = str(record.seq)
                 aln.flag = 4  # Unmapped
                 aln.query_qualities = pysam.qualitystring_to_array(record.letter_annotations["phred_quality"])
-                if n_fastqs > 1:
-                    # Add the barcode to the BC tag
-                    aln.set_tag(barcode_tag, barcode)
+                # Add the barcode to the BC tag
+                aln.set_tag(barcode_tag, barcode)
                 # Write to BAM file
                 bam_out.write(aln)
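After this change every record carries the barcode tag, with 'barcode0' as the fallback for a single un-barcoded FASTQ. A quick way to spot-check the resulting unmapped BAM (sketch; the output path is a placeholder):

    import pysam

    with pysam.AlignmentFile("concatenated.bam", "rb", check_sq=False) as bam_in:
        for aln in bam_in:
            print(aln.query_name, aln.get_tag("BC"))
            break  # inspect the first record only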
smftools/informatics/helpers/converted_BAM_to_adata.py
@@ -13,7 +13,7 @@ def converted_BAM_to_adata(converted_FASTA, split_dir, mapping_threshold, experi
         bam_suffix (str): The suffix to use for the BAM file.
 
     Returns:
-        None
+        final_adata_path (str): File path to the final adata object
     Outputs a single gzipped adata object for the experiment.
     """
     from .. import readwrite
@@ -36,7 +36,14 @@ def converted_BAM_to_adata(converted_FASTA, split_dir, mapping_threshold, experi
     files = os.listdir(split_dir)
     # Make output dir
     parent_dir = os.path.dirname(split_dir)
+    split_dir_base = os.path.basename(split_dir)
     h5_dir = os.path.join(parent_dir, 'h5ads')
+    final_adata_path = os.path.join(h5_dir, f'{experiment_name}_{split_dir_base}.h5ad')
+
+    if os.path.exists(f"{final_adata_path}.gz"):
+        print(f'{final_adata_path}.gz already exists, using existing adata object')  # Stops here if the final_adata file already exists
+        return final_adata_path
+
     tmp_dir = os.path.join(parent_dir, 'tmp')
     make_dirs([h5_dir, tmp_dir])
     # Filter file names that contain the search string in their filename and keep them in a list
@@ -57,7 +64,8 @@ def converted_BAM_to_adata(converted_FASTA, split_dir, mapping_threshold, experi
     record_FASTA_dict = {}
     # While populating the dictionary, also extract the longest sequence record in the input references
     max_reference_length = 0
-    for conversion_type in conversion_types:
+    conversions = conversion_types[1:]
+    for conversion_type in conversions:
         # Points to a list containing: 1) sequence length of the record, 2) top strand coordinate list, 3) bottom strand coorinate list, 4) sequence string unconverted , 5) Complement sequence unconverted
         modification_dict[conversion_type] = find_conversion_sites(converted_FASTA, conversion_type, conversion_types)
         # Get the max reference length
@@ -132,10 +140,11 @@ def converted_BAM_to_adata(converted_FASTA, split_dir, mapping_threshold, experi
         adata.obs_names = binarized_base_identities_df.index.astype(str)
         adata.var_names = binarized_base_identities_df.columns.astype(str)
         adata.obs['Sample'] = [sample] * len(adata)
+        adata.obs['Reference'] = [chromosome] * len(adata)
         adata.obs['Strand'] = [strand] * len(adata)
         adata.obs['Dataset'] = [mod_type] * len(adata)
-        adata.obs['Reference'] = [record] * len(adata)
-        adata.obs['Reference_chromosome'] = [chromosome] * len(adata)
+        adata.obs['Reference_dataset_strand'] = [f'{chromosome}_{mod_type}_{strand}'] * len(adata)
+        adata.obs['Reference_strand'] = [f'{record}'] * len(adata)
 
         read_mapping_direction = []
         for read_id in adata.obs_names:
@@ -162,15 +171,16 @@ def converted_BAM_to_adata(converted_FASTA, split_dir, mapping_threshold, experi
         del tmp_ohe_dict
 
         read_names = list(one_hot_reads.keys())
-        dict_A, dict_C, dict_G, dict_T, dict_N = {}, {}, {}, {}, {}
 
         sequence_length = one_hot_reads[read_names[0]].reshape(n_rows_OHE, -1).shape[1]
-        df_A = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
-        df_C = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
-        df_G = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
-        df_T = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
-        df_N = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
+        df_A = np.zeros((len(sorted_index), sequence_length), dtype=int)
+        df_C = np.zeros((len(sorted_index), sequence_length), dtype=int)
+        df_G = np.zeros((len(sorted_index), sequence_length), dtype=int)
+        df_T = np.zeros((len(sorted_index), sequence_length), dtype=int)
+        df_N = np.zeros((len(sorted_index), sequence_length), dtype=int)
 
+        # Process one-hot data into dictionaries
+        dict_A, dict_C, dict_G, dict_T, dict_N = {}, {}, {}, {}, {}
         for read_name, one_hot_array in one_hot_reads.items():
             one_hot_array = one_hot_array.reshape(n_rows_OHE, -1)
             dict_A[read_name] = one_hot_array[0, :]
@@ -182,21 +192,22 @@ def converted_BAM_to_adata(converted_FASTA, split_dir, mapping_threshold, experi
         del one_hot_reads
         gc.collect()
 
-        for j, read_name in tqdm(enumerate(sorted_index), desc='Loading dataframes of OHE reads', total=len(sorted_index)):
-            df_A.iloc[j] = dict_A[read_name]
-            df_C.iloc[j] = dict_C[read_name]
-            df_G.iloc[j] = dict_G[read_name]
-            df_T.iloc[j] = dict_T[read_name]
-            df_N.iloc[j] = dict_N[read_name]
+        # Fill the arrays
+        for j, read_name in tqdm(enumerate(sorted_index), desc='Loading arrays of OHE reads', total=len(sorted_index)):
+            df_A[j, :] = dict_A[read_name]
+            df_C[j, :] = dict_C[read_name]
+            df_G[j, :] = dict_G[read_name]
+            df_T[j, :] = dict_T[read_name]
+            df_N[j, :] = dict_N[read_name]
 
         del dict_A, dict_C, dict_G, dict_T, dict_N
        gc.collect()
 
+        # Store the results in AnnData layers
         ohe_df_map = {0: df_A, 1: df_C, 2: df_G, 3: df_T, 4: df_N}
-
         for j, base in enumerate(['A', 'C', 'G', 'T', 'N']):
-            adata.layers[f'{base}_binary_encoding'] = ohe_df_map[j].values
-            ohe_df_map[j] = None  # Reassign pointer for memory usage purposes
+            adata.layers[f'{base}_binary_encoding'] = ohe_df_map[j]
+            ohe_df_map[j] = None  # Reassign pointer for memory usage purposes
 
         if final_adata:
             if adata.shape[0] > 0:
@@ -223,11 +234,12 @@ def converted_BAM_to_adata(converted_FASTA, split_dir, mapping_threshold, experi
             chromosome = record_FASTA_dict[unconverted_record_name][2]
             final_adata.var[f'{chromosome}_unconverted_top_strand_FASTA_base'] = list(sequence)
             final_adata.var[f'{chromosome}_unconverted_bottom_strand_FASTA_base'] = list(complement)
-            final_adata.uns[f'{record}_FASTA_sequence'] = sequence
+            final_adata.uns[f'{chromosome}_FASTA_sequence'] = sequence
 
     ######################################################################################################
 
     ######################################################################################################
     ## Export the final adata object
-    final_output = os.path.join(h5_dir, f'{readwrite.date_string()}_{experiment_name}.h5ad.gz')
-    final_adata.write_h5ad(final_output, compression='gzip')
+    print('Saving initial draft of final adata')
+    final_adata.write_h5ad(final_adata_path)
+    return final_adata_path
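Since converted_BAM_to_adata now returns the h5ad path (and returns early when a gzipped copy already exists), callers can reload the draft object directly; a sketch assuming anndata is installed (the call's full argument list is elided here, as in the hunk header above):

    import anndata as ad

    final_adata_path = converted_BAM_to_adata(...)  # arguments as documented above
    adata = ad.read_h5ad(final_adata_path)
    print(adata.obs[['Sample', 'Reference', 'Strand', 'Dataset']].head())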