smftools 0.2.1__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +2 -6
- smftools/_version.py +1 -1
- smftools/cli/__init__.py +0 -0
- smftools/cli/archived/cli_flows.py +94 -0
- smftools/cli/helpers.py +48 -0
- smftools/cli/hmm_adata.py +361 -0
- smftools/cli/load_adata.py +637 -0
- smftools/cli/preprocess_adata.py +455 -0
- smftools/cli/spatial_adata.py +697 -0
- smftools/cli_entry.py +434 -0
- smftools/config/conversion.yaml +18 -6
- smftools/config/deaminase.yaml +18 -11
- smftools/config/default.yaml +151 -36
- smftools/config/direct.yaml +28 -1
- smftools/config/discover_input_files.py +115 -0
- smftools/config/experiment_config.py +225 -27
- smftools/hmm/HMM.py +12 -1
- smftools/hmm/__init__.py +0 -6
- smftools/hmm/archived/call_hmm_peaks.py +106 -0
- smftools/hmm/call_hmm_peaks.py +318 -90
- smftools/informatics/__init__.py +13 -7
- smftools/informatics/archived/fast5_to_pod5.py +43 -0
- smftools/informatics/archived/helpers/archived/__init__.py +71 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
- smftools/informatics/{helpers → archived/helpers/archived}/aligned_BAM_to_bed.py +6 -4
- smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
- smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
- smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
- smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +1 -1
- smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
- smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
- smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
- smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
- smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
- smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
- smftools/informatics/{helpers → archived/helpers/archived}/plot_bed_histograms.py +0 -19
- smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +6 -5
- smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +7 -7
- smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
- smftools/informatics/bam_functions.py +811 -0
- smftools/informatics/basecalling.py +67 -0
- smftools/informatics/bed_functions.py +366 -0
- smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +42 -30
- smftools/informatics/fasta_functions.py +255 -0
- smftools/informatics/h5ad_functions.py +197 -0
- smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +142 -59
- smftools/informatics/modkit_functions.py +129 -0
- smftools/informatics/ohe.py +160 -0
- smftools/informatics/pod5_functions.py +224 -0
- smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
- smftools/plotting/autocorrelation_plotting.py +1 -3
- smftools/plotting/general_plotting.py +1084 -363
- smftools/plotting/position_stats.py +3 -3
- smftools/preprocessing/__init__.py +4 -4
- smftools/preprocessing/append_base_context.py +35 -26
- smftools/preprocessing/append_binary_layer_by_base_context.py +6 -6
- smftools/preprocessing/binarize.py +17 -0
- smftools/preprocessing/binarize_on_Youden.py +11 -9
- smftools/preprocessing/calculate_complexity_II.py +1 -1
- smftools/preprocessing/calculate_coverage.py +16 -13
- smftools/preprocessing/calculate_position_Youden.py +42 -26
- smftools/preprocessing/calculate_read_modification_stats.py +2 -2
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +1 -1
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +20 -20
- smftools/preprocessing/flag_duplicate_reads.py +2 -2
- smftools/preprocessing/invert_adata.py +1 -1
- smftools/preprocessing/load_sample_sheet.py +1 -1
- smftools/preprocessing/reindex_references_adata.py +37 -0
- smftools/readwrite.py +360 -140
- {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/METADATA +26 -19
- smftools-0.2.4.dist-info/RECORD +176 -0
- smftools-0.2.4.dist-info/entry_points.txt +2 -0
- smftools/cli.py +0 -184
- smftools/informatics/fast5_to_pod5.py +0 -24
- smftools/informatics/helpers/__init__.py +0 -73
- smftools/informatics/helpers/align_and_sort_BAM.py +0 -86
- smftools/informatics/helpers/bam_qc.py +0 -66
- smftools/informatics/helpers/bed_to_bigwig.py +0 -39
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -378
- smftools/informatics/helpers/discover_input_files.py +0 -100
- smftools/informatics/helpers/index_fasta.py +0 -12
- smftools/informatics/helpers/make_dirs.py +0 -21
- smftools/informatics/readwrite.py +0 -106
- smftools/informatics/subsample_fasta_from_bed.py +0 -47
- smftools/load_adata.py +0 -1346
- smftools-0.2.1.dist-info/RECORD +0 -161
- smftools-0.2.1.dist-info/entry_points.txt +0 -2
- /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
- /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
- /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
- /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
- /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
- /smftools/informatics/{helpers/binarize_converted_base_identities.py → binarize_converted_base_identities.py} +0 -0
- /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
- /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archives/add_read_length_and_mapping_qc.py} +0 -0
- /smftools/preprocessing/{calculate_complexity.py → archives/calculate_complexity.py} +0 -0
- {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/WHEEL +0 -0
- {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/licenses/LICENSE +0 -0
smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py}

@@ -1,11 +1,12 @@
-## modkit_extract_to_adata
-
 import concurrent.futures
 import gc
-from .
+from .bam_functions import count_aligned_reads
 import pandas as pd
 from tqdm import tqdm
 import numpy as np
+from pathlib import Path
+from typing import Union, Iterable, Optional
+import shutil

 def filter_bam_records(bam, mapping_threshold):
     """Processes a single BAM file, counts reads, and determines records to analyze."""
@@ -336,29 +337,122 @@ def parallel_extract_stranded_methylation(dict_list, dict_to_skip, max_reference
        dict_list[dict_index][record][sample] = processed_data
     return dict_list

-def
+def delete_intermediate_h5ads_and_tmpdir(
+    h5_dir: Union[str, Path, Iterable[str], None],
+    tmp_dir: Optional[Union[str, Path]] = None,
+    *,
+    dry_run: bool = False,
+    verbose: bool = True,
+):
+    """
+    Delete intermediate .h5ad files and a temporary directory.
+
+    Parameters
+    ----------
+    h5_dir : str | Path | iterable[str] | None
+        If a directory path is given, all files directly inside it will be considered.
+        If an iterable of file paths is given, those files will be considered.
+        Only files ending with '.h5ad' (and not ending with '.gz') are removed.
+    tmp_dir : str | Path | None
+        Path to a directory to remove recursively (e.g. a temp dir created earlier).
+    dry_run : bool
+        If True, print what *would* be removed but do not actually delete.
+    verbose : bool
+        Print progress / warnings.
+    """
+    # Helper: remove a single file path (Path-like or string)
+    def _maybe_unlink(p: Path):
+        if not p.exists():
+            if verbose:
+                print(f"[skip] not found: {p}")
+            return
+        if not p.is_file():
+            if verbose:
+                print(f"[skip] not a file: {p}")
+            return
+        if dry_run:
+            print(f"[dry-run] would remove file: {p}")
+            return
+        try:
+            p.unlink()
+            if verbose:
+                print(f"Removed file: {p}")
+        except Exception as e:
+            print(f"[error] failed to remove file {p}: {e}")
+
+    # Handle h5_dir input (directory OR iterable of file paths)
+    if h5_dir is not None:
+        # If it's a path to a directory, iterate its children
+        if isinstance(h5_dir, (str, Path)) and Path(h5_dir).is_dir():
+            dpath = Path(h5_dir)
+            for p in dpath.iterdir():
+                # only target top-level files (not recursing); require '.h5ad' suffix and exclude gz
+                name = p.name.lower()
+                if "h5ad" in name:
+                    _maybe_unlink(p)
+                else:
+                    if verbose:
+                        # optional: comment this out if too noisy
+                        print(f"[skip] not matching pattern: {p.name}")
+        else:
+            # treat as iterable of file paths
+            for f in h5_dir:
+                p = Path(f)
+                name = p.name.lower()
+                if name.endswith(".h5ad") and not name.endswith(".gz"):
+                    _maybe_unlink(p)
+                else:
+                    if verbose:
+                        print(f"[skip] not matching pattern or not a file: {p}")
+
+    # Remove tmp_dir recursively (if provided)
+    if tmp_dir is not None:
+        td = Path(tmp_dir)
+        if not td.exists():
+            if verbose:
+                print(f"[skip] tmp_dir not found: {td}")
+        else:
+            if not td.is_dir():
+                if verbose:
+                    print(f"[skip] tmp_dir is not a directory: {td}")
+            else:
+                if dry_run:
+                    print(f"[dry-run] would remove directory tree: {td}")
+                else:
+                    try:
+                        shutil.rmtree(td)
+                        if verbose:
+                            print(f"Removed directory tree: {td}")
+                    except Exception as e:
+                        print(f"[error] failed to remove tmp dir {td}: {e}")
+
+def modkit_extract_to_adata(fasta, bam_dir, out_dir, input_already_demuxed, mapping_threshold, experiment_name, mods, batch_size, mod_tsv_dir, delete_batch_hdfs=False, threads=None, double_barcoded_path = None):
     """
     Takes modkit extract outputs and organizes it into an adata object

     Parameters:
-        fasta (
-        bam_dir (
+        fasta (Path): File path to the reference genome to align to.
+        bam_dir (Path): File path to the directory containing the aligned_sorted split modified BAM files
+        out_dir (Path): File path to output directory
+        input_already_demuxed (bool): Whether input reads were originally demuxed
         mapping_threshold (float): A value in between 0 and 1 to threshold the minimal fraction of aligned reads which map to the reference region. References with values above the threshold are included in the output adata.
         experiment_name (str): A string to provide an experiment name to the output adata file.
         mods (list): A list of strings of the modification types to use in the analysis.
         batch_size (int): An integer number of TSV files to analyze in memory at once while loading the final adata object.
-        mod_tsv_dir (
+        mod_tsv_dir (Path): path to the mod TSV directory
         delete_batch_hdfs (bool): Whether to delete the batch hdfs after writing out the final concatenated hdf. Default is False
+        double_barcoded_path (Path): Path to dorado demux summary file of double ended barcodes

     Returns:
-        final_adata_path (
+        final_adata_path (Path): Path to the final adata
     """
     ###################################################
     # Package imports
     from .. import readwrite
-    from
-    from .
-    from .
+    from ..readwrite import safe_write_h5ad, make_dirs
+    from .fasta_functions import get_native_references
+    from .bam_functions import extract_base_identities
+    from .ohe import ohe_batching
     import pandas as pd
     import anndata as ad
     import os
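For orientation, here is a minimal sketch of how the new cleanup helper could be exercised on its own. The directory layout mirrors what `modkit_extract_to_adata` creates, but the paths below are hypothetical, and the import assumes the function is reachable from the relocated module:

```python
from pathlib import Path
from smftools.informatics.modkit_extract_to_adata import delete_intermediate_h5ads_and_tmpdir

out_dir = Path("results")        # hypothetical output directory
h5_dir = out_dir / "h5ads"       # batch h5ad files written during extraction
tmp_dir = out_dir / "tmp"        # temporary one-hot-encoding files

# Preview what would be deleted without touching the filesystem
delete_intermediate_h5ads_and_tmpdir(h5_dir, tmp_dir, dry_run=True)

# Actually remove the intermediate .h5ad files and the tmp directory tree
delete_intermediate_h5ads_and_tmpdir(h5_dir, tmp_dir, verbose=True)
```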
@@ -368,42 +462,34 @@ def modkit_extract_to_adata(fasta, bam_dir, mapping_threshold, experiment_name,
     from Bio.Seq import Seq
     from tqdm import tqdm
     import h5py
-    from .make_dirs import make_dirs
     ###################################################

     ################## Get input tsv and bam file names into a sorted list ################
-    # List all files in the directory
-    tsv_files = os.listdir(mod_tsv_dir)
-    bam_files = os.listdir(bam_dir)
-    # get current working directory
-    parent_dir = os.path.dirname(mod_tsv_dir)
-
     # Make output dirs
-    h5_dir =
-    tmp_dir =
+    h5_dir = out_dir / 'h5ads'
+    tmp_dir = out_dir / 'tmp'
     make_dirs([h5_dir, tmp_dir])
-    existing_h5s = os.listdir(h5_dir)
-    existing_h5s = [h5 for h5 in existing_h5s if '.h5ad.gz' in h5]
-    final_hdf = f'{experiment_name}_final_experiment_hdf5.h5ad'
-    final_adata_path = os.path.join(h5_dir, final_hdf)
-    final_adata = None

-
-
-
+    existing_h5s = h5_dir.iterdir()
+    existing_h5s = [h5 for h5 in existing_h5s if '.h5ad.gz' in str(h5)]
+    final_hdf = f'{experiment_name}.h5ad.gz'
+    final_adata_path = h5_dir / final_hdf
+    final_adata = None

-
+    if final_adata_path.exists():
         print(f'{final_adata_path} already exists. Using existing adata')
         return final_adata, final_adata_path

-    #
-    tsvs =
-
-
-
-
-
-
+    # List all files in the directory
+    tsvs = sorted(
+        p for p in mod_tsv_dir.iterdir()
+        if p.is_file() and 'unclassified' not in p.name and 'extract.tsv' in p.name)
+    bams = sorted(
+        p for p in bam_dir.iterdir()
+        if p.is_file() and p.suffix == '.bam' and 'unclassified' not in p.name and '.bai' not in p.name)
+
+    tsv_path_list = [mod_tsv_dir / tsv for tsv in tsvs]
+    bam_path_list = [bam_dir / bam for bam in bams]
     print(f'{len(tsvs)} sample tsv files found: {tsvs}')
     print(f'{len(bams)} sample bams found: {bams}')
     ##########################################################################################
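In the new layout, output locations are built with `pathlib` under a single `out_dir` instead of `os.listdir`/`os.path.join` string handling, and the final file name drops the old `_final_experiment_hdf5.h5ad` suffix. A rough sketch of the resulting structure (directory names taken from the diff above, the experiment name itself is hypothetical):

```python
from pathlib import Path

out_dir = Path("results")                          # hypothetical top-level output directory
h5_dir = out_dir / "h5ads"                         # per-batch and final h5ad files
tmp_dir = out_dir / "tmp"                          # temporary one-hot-encoding files
final_adata_path = h5_dir / "my_SMF_run.h5ad.gz"   # f"{experiment_name}.h5ad.gz"
```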
@@ -417,7 +503,7 @@ def modkit_extract_to_adata(fasta, bam_dir, mapping_threshold, experiment_name,
     ########### Determine the maximum record length to analyze in the dataset ################
     # Get all references within the FASTA and indicate the length and identity of the record sequence
     max_reference_length = 0
-    reference_dict = get_native_references(fasta) # returns a dict keyed by record name. Points to a tuple of (reference length, reference sequence)
+    reference_dict = get_native_references(str(fasta)) # returns a dict keyed by record name. Points to a tuple of (reference length, reference sequence)
     # Get the max record length in the dataset.
     for record in records_to_analyze:
         if reference_dict[record][0] > max_reference_length:
@@ -431,11 +517,11 @@ def modkit_extract_to_adata(fasta, bam_dir, mapping_threshold, experiment_name,
     # One hot encode read sequences and write them out into the tmp_dir as h5ad files.
     # Save the file paths in the bam_record_ohe_files dict.
     bam_record_ohe_files = {}
-    bam_record_save =
+    bam_record_save = tmp_dir / 'tmp_file_dict.h5ad'
     fwd_mapped_reads = set()
     rev_mapped_reads = set()
     # If this step has already been performed, read in the tmp_dile_dict
-    if
+    if bam_record_save.exists():
         bam_record_ohe_files = ad.read_h5ad(bam_record_save).uns
         print('Found existing OHE reads, using these')
     else:
@@ -489,7 +575,7 @@ def modkit_extract_to_adata(fasta, bam_dir, mapping_threshold, experiment_name,
         bam_path_list = bam_path_list[batch_size:]
         print('{0}: tsvs in batch {1} '.format(readwrite.time_string(), tsv_batch))

-        batch_already_processed = sum([1 for h5 in existing_h5s if f'_{batch}_' in h5])
+        batch_already_processed = sum([1 for h5 in existing_h5s if f'_{batch}_' in h5.name])
         ###################################################
         if batch_already_processed:
             print(f'Batch {batch} has already been processed into h5ads. Skipping batch and using existing files')
@@ -677,7 +763,6 @@ def modkit_extract_to_adata(fasta, bam_dir, mapping_threshold, experiment_name,


         # Save the sample files in the batch as gzipped hdf5 files
-        os.chdir(h5_dir)
         print('{0}: Converting batch {1} dictionaries to anndata objects'.format(readwrite.time_string(), batch))
         for dict_index, dict_type in enumerate(dict_list):
             if dict_index not in dict_to_skip:
@@ -807,7 +892,7 @@ def modkit_extract_to_adata(fasta, bam_dir, mapping_threshold, experiment_name,

                 try:
                     print('{0}: Writing {1} anndata out as a hdf5 file'.format(readwrite.time_string(), sample_types[dict_index]))
-                    adata.write_h5ad('{0}_{1}_{2}_SMF_binarized_sample_hdf5.h5ad.gz'.format(readwrite.date_string(), batch, sample_types[dict_index]), compression='gzip')
+                    adata.write_h5ad(h5_dir / '{0}_{1}_{2}_SMF_binarized_sample_hdf5.h5ad.gz'.format(readwrite.date_string(), batch, sample_types[dict_index]), compression='gzip')
                 except:
                     print(f"Skipping writing anndata for sample")

@@ -816,11 +901,10 @@ def modkit_extract_to_adata(fasta, bam_dir, mapping_threshold, experiment_name,
     gc.collect()

     # Iterate over all of the batched hdf5 files and concatenate them.
-
-    files = os.listdir(h5_dir)
+    files = h5_dir.iterdir()
     # Filter file names that contain the search string in their filename and keep them in a list
-    hdfs = [hdf for hdf in files if 'hdf5.h5ad' in hdf and hdf != final_hdf]
-    combined_hdfs = [hdf for hdf in hdfs if "combined" in hdf]
+    hdfs = [hdf for hdf in files if 'hdf5.h5ad' in hdf.name and hdf != final_hdf]
+    combined_hdfs = [hdf for hdf in hdfs if "combined" in hdf.name]
     if len(combined_hdfs) > 0:
         hdfs = combined_hdfs
     else:
@@ -828,7 +912,7 @@ def modkit_extract_to_adata(fasta, bam_dir, mapping_threshold, experiment_name,
     # Sort file list by names and print the list of file names
     hdfs.sort()
     print('{0} sample files found: {1}'.format(len(hdfs), hdfs))
-    hdf_paths = [
+    hdf_paths = [h5_dir / hd5 for hd5 in hdfs]
     final_adata = None
     for hdf_index, hdf in enumerate(hdf_paths):
         print('{0}: Reading in {1} hdf5 file'.format(readwrite.time_string(), hdfs[hdf_index]))
@@ -847,6 +931,7 @@ def modkit_extract_to_adata(fasta, bam_dir, mapping_threshold, experiment_name,

     ohe_bases = ['A', 'C', 'G', 'T'] # ignore N bases for consensus
     ohe_layers = [f"{ohe_base}_binary_encoding" for ohe_base in ohe_bases]
+    final_adata.uns['References'] = {}
     for record in records_to_analyze:
         # Add FASTA sequence to the object
         sequence = record_seq_dict[record][0]
@@ -854,6 +939,7 @@ def modkit_extract_to_adata(fasta, bam_dir, mapping_threshold, experiment_name,
         final_adata.var[f'{record}_top_strand_FASTA_base'] = list(sequence)
         final_adata.var[f'{record}_bottom_strand_FASTA_base'] = list(complement)
         final_adata.uns[f'{record}_FASTA_sequence'] = sequence
+        final_adata.uns['References'][f'{record}_FASTA_sequence'] = sequence
         # Add consensus sequence of samples mapped to the record to the object
         record_subset = final_adata[final_adata.obs['Reference'] == record]
         for strand in record_subset.obs['Strand'].cat.categories:
@@ -869,19 +955,16 @@ def modkit_extract_to_adata(fasta, bam_dir, mapping_threshold, experiment_name,
                 consensus_sequence_list = [layer_map[i] for i in nucleotide_indexes]
                 final_adata.var[f'{record}_{strand}_{mapping_dir}_consensus_sequence_from_all_samples'] = consensus_sequence_list

-
+    if input_already_demuxed:
+        final_adata.obs["demux_type"] = ["already"] * final_adata.shape[0]
+        final_adata.obs["demux_type"] = final_adata.obs["demux_type"].astype("category")
+    else:
+        from .h5ad_functions import add_demux_type_annotation
+        double_barcoded_reads = double_barcoded_path / "barcoding_summary.txt"
+        add_demux_type_annotation(final_adata, double_barcoded_reads)

     # Delete the individual h5ad files and only keep the final concatenated file
     if delete_batch_hdfs:
-
-        hdfs_to_delete = [hdf for hdf in files if 'hdf5.h5ad' in hdf and hdf != final_hdf]
-        hdf_paths_to_delete = [os.path.join(h5_dir, hdf) for hdf in hdfs_to_delete]
-        # Iterate over the files and delete them
-        for hdf in hdf_paths_to_delete:
-            try:
-                os.remove(hdf)
-                print(f"Deleted file: {hdf}")
-            except OSError as e:
-                print(f"Error deleting file {hdf}: {e}")
+        delete_intermediate_h5ads_and_tmpdir(h5_dir, tmp_dir)

     return final_adata, final_adata_path
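For context, a hedged sketch of how the updated signature might be called end to end. All paths and values are hypothetical, the modification strings are placeholders, and the import assumes the function is re-exported from `smftools.informatics` (otherwise import it from the module directly):

```python
from pathlib import Path
from smftools.informatics.modkit_extract_to_adata import modkit_extract_to_adata

final_adata, final_adata_path = modkit_extract_to_adata(
    fasta=Path("refs/amplicons.fa"),        # reference FASTA (hypothetical)
    bam_dir=Path("results/split_bams"),     # aligned, sorted, split modified BAMs
    out_dir=Path("results"),                # h5ads/ and tmp/ are created under here
    input_already_demuxed=True,             # skip the dorado double-barcode annotation branch
    mapping_threshold=0.01,
    experiment_name="my_SMF_run",
    mods=["6mA", "5mC"],                    # placeholder modification labels
    batch_size=4,
    mod_tsv_dir=Path("results/mod_tsvs"),
    delete_batch_hdfs=True,
)
```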
smftools/informatics/modkit_functions.py (new file)

@@ -0,0 +1,129 @@
+import os
+import subprocess
+import glob
+import zipfile
+from pathlib import Path
+
+def extract_mods(thresholds, mod_tsv_dir, split_dir, bam_suffix, skip_unclassified=True, modkit_summary=False, threads=None):
+    """
+    Takes all of the aligned, sorted, split modified BAM files and runs Nanopore Modkit Extract to load the modification data into zipped TSV files
+
+    Parameters:
+        thresholds (list): A list of thresholds to use for marking each basecalled base as passing or failing on canonical and modification call status.
+        mod_tsv_dir (str): A string representing the file path to the directory to hold the modkit extract outputs.
+        split_dit (str): A string representing the file path to the directory containing the converted aligned_sorted_split BAM files.
+        bam_suffix (str): The suffix to use for the BAM file.
+        skip_unclassified (bool): Whether to skip unclassified bam file for modkit extract command
+        modkit_summary (bool): Whether to run and display modkit summary
+        threads (int): Number of threads to use
+
+    Returns:
+        None
+        Runs modkit extract on input aligned_sorted_split modified BAM files to output zipped TSVs containing modification calls.
+
+    """
+    filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold = thresholds
+    bam_files = sorted(p for p in split_dir.iterdir() if bam_suffix in p.name and '.bai' not in p.name)
+    if skip_unclassified:
+        bam_files = [p for p in bam_files if "unclassified" not in p.name]
+    print(f"Running modkit extract for the following bam files: {bam_files}")
+
+    if threads:
+        threads = str(threads)
+    else:
+        pass
+
+    for input_file in bam_files:
+        print(input_file)
+        # Construct the output TSV file path
+        output_tsv = mod_tsv_dir / (input_file.stem + "_extract.tsv")
+        output_tsv_gz = output_tsv.parent / (output_tsv.name + '.gz')
+        if output_tsv_gz.exists():
+            print(f"{output_tsv_gz} already exists, skipping modkit extract")
+        else:
+            print(f"Extracting modification data from {input_file}")
+            if modkit_summary:
+                # Run modkit summary
+                subprocess.run(["modkit", "summary", str(input_file)])
+            else:
+                pass
+            # Run modkit extract
+            if threads:
+                extract_command = [
+                    "modkit", "extract",
+                    "calls", "--mapped-only",
+                    "--filter-threshold", f'{filter_threshold}',
+                    "--mod-thresholds", f"m:{m5C_threshold}",
+                    "--mod-thresholds", f"a:{m6A_threshold}",
+                    "--mod-thresholds", f"h:{hm5C_threshold}",
+                    "-t", threads,
+                    str(input_file), str(output_tsv)
+                ]
+            else:
+                extract_command = [
+                    "modkit", "extract",
+                    "calls", "--mapped-only",
+                    "--filter-threshold", f'{filter_threshold}',
+                    "--mod-thresholds", f"m:{m5C_threshold}",
+                    "--mod-thresholds", f"a:{m6A_threshold}",
+                    "--mod-thresholds", f"h:{hm5C_threshold}",
+                    str(input_file), str(output_tsv)
+                ]
+            subprocess.run(extract_command)
+            # Zip the output TSV
+            print(f'zipping {output_tsv}')
+            if threads:
+                zip_command = ["pigz", "-f", "-p", threads, str(output_tsv)]
+            else:
+                zip_command = ["pigz", "-f", str(output_tsv)]
+            subprocess.run(zip_command, check=True)
+    return
+
+def make_modbed(aligned_sorted_output, thresholds, mod_bed_dir):
+    """
+    Generating position methylation summaries for each barcoded sample starting from the overall BAM file that was direct output of dorado aligner.
+    Parameters:
+        aligned_sorted_output (str): A string representing the file path to the aligned_sorted non-split BAM file.
+
+    Returns:
+        None
+    """
+    import os
+    import subprocess
+
+    filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold = thresholds
+    command = [
+        "modkit", "pileup", str(aligned_sorted_output), str(mod_bed_dir),
+        "--partition-tag", "BC",
+        "--only-tabs",
+        "--filter-threshold", f'{filter_threshold}',
+        "--mod-thresholds", f"m:{m5C_threshold}",
+        "--mod-thresholds", f"a:{m6A_threshold}",
+        "--mod-thresholds", f"h:{hm5C_threshold}"
+    ]
+    subprocess.run(command)
+
+def modQC(aligned_sorted_output, thresholds):
+    """
+    Output the percentile of bases falling at a call threshold (threshold is a probability between 0-1) for the overall BAM file.
+    It is generally good to look at these parameters on positive and negative controls.
+
+    Parameters:
+        aligned_sorted_output (str): A string representing the file path of the aligned_sorted non-split BAM file output by the dorado aligned.
+        thresholds (list): A list of floats to pass for call thresholds.
+
+    Returns:
+        None
+    """
+    import subprocess
+
+    filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold = thresholds
+    subprocess.run(["modkit", "sample-probs", str(aligned_sorted_output)])
+    command = [
+        "modkit", "summary", str(aligned_sorted_output),
+        "--filter-threshold", f"{filter_threshold}",
+        "--mod-thresholds", f"m:{m5C_threshold}",
+        "--mod-thresholds", f"a:{m6A_threshold}",
+        "--mod-thresholds", f"h:{hm5C_threshold}"
+    ]
+    subprocess.run(command)
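A small sketch of how `extract_mods` might be driven, based on the signature and the threshold unpacking shown above. The paths and threshold values are hypothetical, the threshold order follows the unpacking (`filter, m6A, m5C, hm5C`), and `modkit` plus `pigz` are assumed to be on the PATH:

```python
from pathlib import Path
from smftools.informatics.modkit_functions import extract_mods  # assumed import path for the new module

# Order matches the unpacking inside extract_mods: filter, m6A, m5C, hm5C
thresholds = [0.8, 0.8, 0.8, 0.8]

extract_mods(
    thresholds,
    mod_tsv_dir=Path("results/mod_tsvs"),   # where <bam stem>_extract.tsv.gz files land
    split_dir=Path("results/split_bams"),   # aligned, sorted, split modified BAMs
    bam_suffix=".bam",
    skip_unclassified=True,
    threads=8,
)
```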
smftools/informatics/ohe.py (new file)

@@ -0,0 +1,160 @@
+import numpy as np
+import anndata as ad
+
+import os
+import concurrent.futures
+
+def one_hot_encode(sequence, device='auto'):
+    """
+    One-hot encodes a DNA sequence.
+
+    Parameters:
+        sequence (str or list): DNA sequence (e.g., "ACGTN" or ['A', 'C', 'G', 'T', 'N']).
+
+    Returns:
+        ndarray: Flattened one-hot encoded representation of the input sequence.
+    """
+    mapping = np.array(['A', 'C', 'G', 'T', 'N'])
+
+    # Ensure input is a list of characters
+    if not isinstance(sequence, list):
+        sequence = list(sequence) # Convert string to list of characters
+
+    # Handle empty sequences
+    if len(sequence) == 0:
+        print("Warning: Empty sequence encountered in one_hot_encode()")
+        return np.zeros(len(mapping)) # Return empty encoding instead of failing
+
+    # Convert sequence to NumPy array
+    seq_array = np.array(sequence, dtype='<U1')
+
+    # Replace invalid bases with 'N'
+    seq_array = np.where(np.isin(seq_array, mapping), seq_array, 'N')
+
+    # Create one-hot encoding matrix
+    one_hot_matrix = (seq_array[:, None] == mapping).astype(int)
+
+    # Flatten and return
+    return one_hot_matrix.flatten()
+
+def one_hot_decode(ohe_array):
+    """
+    Takes a flattened one hot encoded array and returns the sequence string from that array.
+    Parameters:
+        ohe_array (np.array): A one hot encoded array
+
+    Returns:
+        sequence (str): Sequence string of the one hot encoded array
+    """
+    # Define the mapping of one-hot encoded indices to DNA bases
+    mapping = ['A', 'C', 'G', 'T', 'N']
+
+    # Reshape the flattened array into a 2D matrix with 5 columns (one for each base)
+    one_hot_matrix = ohe_array.reshape(-1, 5)
+
+    # Get the index of the maximum value (which will be 1) in each row
+    decoded_indices = np.argmax(one_hot_matrix, axis=1)
+
+    # Map the indices back to the corresponding bases
+    sequence_list = [mapping[i] for i in decoded_indices]
+    sequence = ''.join(sequence_list)
+
+    return sequence
+
+def ohe_layers_decode(adata, obs_names):
+    """
+    Takes an anndata object and a list of observation names. Returns a list of sequence strings for the reads of interest.
+    Parameters:
+        adata (AnnData): An anndata object.
+        obs_names (list): A list of observation name strings to retrieve sequences for.
+
+    Returns:
+        sequences (list of str): List of strings of the one hot encoded array
+    """
+    # Define the mapping of one-hot encoded indices to DNA bases
+    mapping = ['A', 'C', 'G', 'T', 'N']
+
+    ohe_layers = [f"{base}_binary_encoding" for base in mapping]
+    sequences = []
+
+    for obs_name in obs_names:
+        obs_subset = adata[obs_name]
+        ohe_list = []
+        for layer in ohe_layers:
+            ohe_list += list(obs_subset.layers[layer])
+        ohe_array = np.array(ohe_list)
+        sequence = one_hot_decode(ohe_array)
+        sequences.append(sequence)
+
+    return sequences
+
+def _encode_sequence(args):
+    """Parallel helper function for one-hot encoding."""
+    read_name, seq, device = args
+    try:
+        one_hot_matrix = one_hot_encode(seq, device)
+        return read_name, one_hot_matrix
+    except Exception:
+        return None # Skip invalid sequences
+
+def _encode_and_save_batch(batch_data, tmp_dir, prefix, record, batch_number):
+    """Encodes a batch and writes to disk immediately."""
+    batch = {read_name: matrix for read_name, matrix in batch_data if matrix is not None}
+
+    if batch:
+        save_name = os.path.join(tmp_dir, f'tmp_{prefix}_{record}_{batch_number}.h5ad')
+        tmp_ad = ad.AnnData(X=np.zeros((1, 1)), uns=batch) # Placeholder X
+        tmp_ad.write_h5ad(save_name)
+        return save_name
+    return None
+
+def ohe_batching(base_identities, tmp_dir, record, prefix='', batch_size=100000, progress_bar=None, device='auto', threads=None):
+    """
+    Efficient version of ohe_batching: one-hot encodes sequences in parallel and writes batches immediately.
+
+    Parameters:
+        base_identities (dict): Dictionary mapping read names to sequences.
+        tmp_dir (str): Directory for storing temporary files.
+        record (str): Record name.
+        prefix (str): Prefix for file naming.
+        batch_size (int): Number of reads per batch.
+        progress_bar (tqdm instance, optional): Shared progress bar.
+        device (str): Device for encoding.
+        threads (int, optional): Number of parallel workers.
+
+    Returns:
+        list: List of valid H5AD file paths.
+    """
+    threads = threads or os.cpu_count() # Default to max available CPU cores
+    batch_data = []
+    batch_number = 0
+    file_names = []
+
+    # Step 1: Prepare Data for Parallel Encoding
+    encoding_args = [(read_name, seq, device) for read_name, seq in base_identities.items() if seq is not None]
+
+    # Step 2: Parallel One-Hot Encoding using threads (to avoid nested processes)
+    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
+        for result in executor.map(_encode_sequence, encoding_args):
+            if result:
+                batch_data.append(result)
+
+            if len(batch_data) >= batch_size:
+                # Step 3: Process and Write Batch Immediately
+                file_name = _encode_and_save_batch(batch_data.copy(), tmp_dir, prefix, record, batch_number)
+                if file_name:
+                    file_names.append(file_name)
+
+                batch_data.clear()
+                batch_number += 1
+
+            if progress_bar:
+                progress_bar.update(1)
+
+    # Step 4: Process Remaining Batch
+    if batch_data:
+        file_name = _encode_and_save_batch(batch_data, tmp_dir, prefix, record, batch_number)
+        if file_name:
+            file_names.append(file_name)
+
+    return file_names
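To make the encoding concrete: `one_hot_encode` flattens a (sequence length × 5) matrix over the alphabet A, C, G, T, N, and `one_hot_decode` reverses it. A short round trip looks like this (a sketch under an assumed import path, not part of the package's tests):

```python
import numpy as np
from smftools.informatics.ohe import one_hot_encode, one_hot_decode  # assumed import path

encoded = one_hot_encode("ACGTN")
print(encoded.shape)            # (25,) -- 5 positions x 5 possible bases
print(encoded.reshape(-1, 5))   # one row per base, columns ordered A, C, G, T, N

decoded = one_hot_decode(encoded)
print(decoded)                  # "ACGTN"; characters outside the alphabet are encoded as 'N'
```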