smftools 0.2.1__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +2 -6
- smftools/_version.py +1 -1
- smftools/cli/__init__.py +0 -0
- smftools/cli/archived/cli_flows.py +94 -0
- smftools/cli/helpers.py +48 -0
- smftools/cli/hmm_adata.py +361 -0
- smftools/cli/load_adata.py +637 -0
- smftools/cli/preprocess_adata.py +455 -0
- smftools/cli/spatial_adata.py +697 -0
- smftools/cli_entry.py +434 -0
- smftools/config/conversion.yaml +18 -6
- smftools/config/deaminase.yaml +18 -11
- smftools/config/default.yaml +151 -36
- smftools/config/direct.yaml +28 -1
- smftools/config/discover_input_files.py +115 -0
- smftools/config/experiment_config.py +225 -27
- smftools/hmm/HMM.py +12 -1
- smftools/hmm/__init__.py +0 -6
- smftools/hmm/archived/call_hmm_peaks.py +106 -0
- smftools/hmm/call_hmm_peaks.py +318 -90
- smftools/informatics/__init__.py +13 -7
- smftools/informatics/archived/fast5_to_pod5.py +43 -0
- smftools/informatics/archived/helpers/archived/__init__.py +71 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
- smftools/informatics/{helpers → archived/helpers/archived}/aligned_BAM_to_bed.py +6 -4
- smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
- smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
- smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
- smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +1 -1
- smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
- smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
- smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
- smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
- smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
- smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
- smftools/informatics/{helpers → archived/helpers/archived}/plot_bed_histograms.py +0 -19
- smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +6 -5
- smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +7 -7
- smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
- smftools/informatics/bam_functions.py +811 -0
- smftools/informatics/basecalling.py +67 -0
- smftools/informatics/bed_functions.py +366 -0
- smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +42 -30
- smftools/informatics/fasta_functions.py +255 -0
- smftools/informatics/h5ad_functions.py +197 -0
- smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +142 -59
- smftools/informatics/modkit_functions.py +129 -0
- smftools/informatics/ohe.py +160 -0
- smftools/informatics/pod5_functions.py +224 -0
- smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
- smftools/plotting/autocorrelation_plotting.py +1 -3
- smftools/plotting/general_plotting.py +1084 -363
- smftools/plotting/position_stats.py +3 -3
- smftools/preprocessing/__init__.py +4 -4
- smftools/preprocessing/append_base_context.py +35 -26
- smftools/preprocessing/append_binary_layer_by_base_context.py +6 -6
- smftools/preprocessing/binarize.py +17 -0
- smftools/preprocessing/binarize_on_Youden.py +11 -9
- smftools/preprocessing/calculate_complexity_II.py +1 -1
- smftools/preprocessing/calculate_coverage.py +16 -13
- smftools/preprocessing/calculate_position_Youden.py +42 -26
- smftools/preprocessing/calculate_read_modification_stats.py +2 -2
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +1 -1
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +20 -20
- smftools/preprocessing/flag_duplicate_reads.py +2 -2
- smftools/preprocessing/invert_adata.py +1 -1
- smftools/preprocessing/load_sample_sheet.py +1 -1
- smftools/preprocessing/reindex_references_adata.py +37 -0
- smftools/readwrite.py +360 -140
- {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/METADATA +26 -19
- smftools-0.2.4.dist-info/RECORD +176 -0
- smftools-0.2.4.dist-info/entry_points.txt +2 -0
- smftools/cli.py +0 -184
- smftools/informatics/fast5_to_pod5.py +0 -24
- smftools/informatics/helpers/__init__.py +0 -73
- smftools/informatics/helpers/align_and_sort_BAM.py +0 -86
- smftools/informatics/helpers/bam_qc.py +0 -66
- smftools/informatics/helpers/bed_to_bigwig.py +0 -39
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -378
- smftools/informatics/helpers/discover_input_files.py +0 -100
- smftools/informatics/helpers/index_fasta.py +0 -12
- smftools/informatics/helpers/make_dirs.py +0 -21
- smftools/informatics/readwrite.py +0 -106
- smftools/informatics/subsample_fasta_from_bed.py +0 -47
- smftools/load_adata.py +0 -1346
- smftools-0.2.1.dist-info/RECORD +0 -161
- smftools-0.2.1.dist-info/entry_points.txt +0 -2
- /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
- /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
- /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
- /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
- /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
- /smftools/informatics/{helpers/binarize_converted_base_identities.py → binarize_converted_base_identities.py} +0 -0
- /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
- /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archives/add_read_length_and_mapping_qc.py} +0 -0
- /smftools/preprocessing/{calculate_complexity.py → archives/calculate_complexity.py} +0 -0
- {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/WHEEL +0 -0
- {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/licenses/LICENSE +0 -0
--- /dev/null
+++ b/smftools/informatics/basecalling.py
@@ -0,0 +1,67 @@
+import subprocess
+from pathlib import Path
+
+def canoncall(model_dir, model, pod5_dir, barcode_kit, bam, bam_suffix, barcode_both_ends=True, trim=False, device='auto'):
+    """
+    Wrapper function for dorado canonical base calling.
+
+    Parameters:
+        model_dir (str): File path to the dorado basecalling model directory.
+        model (str): The dorado basecalling model to use.
+        pod5_dir (str): File path to the experiment directory containing the POD5 files.
+        barcode_kit (str): The barcoding kit used in the experiment.
+        bam (str): File path to the BAM file to output.
+        bam_suffix (str): The suffix to use for the BAM file.
+        barcode_both_ends (bool): Whether to require barcode detection on both ends for demultiplexing.
+        trim (bool): Whether to trim barcodes, adapters, and primers from read ends.
+        device (str): The device to use. Defaults to 'auto', which autodetects the device; can also be 'metal', 'cpu', or 'cuda'.
+
+    Returns:
+        None
+        Outputs a BAM file holding the canonical base calls output by the dorado basecaller.
+    """
+    output = bam + bam_suffix
+    command = ["dorado", "basecaller", "--models-directory", model_dir, "--kit-name", barcode_kit, "--device", device, "--batchsize", "0"]
+    if barcode_both_ends:
+        command.append("--barcode-both-ends")
+    if not trim:
+        command.append("--no-trim")
+    command += [model, pod5_dir]
+    command_string = " ".join(command)
+    print(f"Running {command_string}\n to generate {output}")
+    with open(output, "w") as outfile:
+        subprocess.run(command, stdout=outfile)
+
+def modcall(model_dir, model, pod5_dir, barcode_kit, mod_list, bam, bam_suffix, barcode_both_ends=True, trim=False, device='auto'):
+    """
+    Wrapper function for dorado modified base calling.
+
+    Parameters:
+        model_dir (str): File path to the dorado basecalling model directory.
+        model (str): The dorado basecalling model to use.
+        pod5_dir (str): File path to the experiment directory containing the POD5 files.
+        barcode_kit (str): The barcoding kit used in the experiment.
+        mod_list (list): A list of modification types to use in the analysis.
+        bam (str): File path to the BAM file to output.
+        bam_suffix (str): The suffix to use for the BAM file.
+        barcode_both_ends (bool): Whether to require barcode detection on both ends for demultiplexing.
+        trim (bool): Whether to trim barcodes, adapters, and primers from read ends.
+        device (str): Device to use for basecalling. One of 'auto', 'metal', 'cpu', 'cuda'.
+
+    Returns:
+        None
+        Outputs a BAM file holding the modified base calls output by the dorado basecaller.
+    """
+    import subprocess
+    output = bam + bam_suffix
+    command = ["dorado", "basecaller", "--models-directory", model_dir, "--kit-name", barcode_kit, "--modified-bases"]
+    command += mod_list
+    command += ["--device", device, "--batchsize", "0"]
+    if barcode_both_ends:
+        command.append("--barcode-both-ends")
+    if not trim:
+        command.append("--no-trim")
+    command += [model, pod5_dir]
+    print(f'Running: {" ".join(command)}')
+    with open(output, "w") as outfile:
+        subprocess.run(command, stdout=outfile)
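For orientation, a minimal usage sketch of the new `canoncall` wrapper. Only the signature comes from the diff above; every argument value (paths, model name, kit name) is illustrative:

```python
from smftools.informatics.basecalling import canoncall  # module path per the file list above

# Illustrative values only. The wrapper writes dorado's stdout BAM to
# bam + bam_suffix, i.e. "runs/exp1/calls.bam" here.
canoncall(
    model_dir="/opt/dorado/models",              # hypothetical model directory
    model="dna_r10.4.1_e8.2_400bps_sup@v5.0.0",  # hypothetical dorado model
    pod5_dir="runs/exp1/pod5",
    barcode_kit="SQK-NBD114-24",                 # hypothetical barcoding kit
    bam="runs/exp1/calls",
    bam_suffix=".bam",
    barcode_both_ends=True,  # passes --barcode-both-ends
    trim=False,              # passes --no-trim
    device="auto",
)
```

`modcall` behaves the same way but inserts `--modified-bases` plus the entries of `mod_list` into the dorado command.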
--- /dev/null
+++ b/smftools/informatics/bed_functions.py
@@ -0,0 +1,366 @@
+from pathlib import Path
+import os
+import subprocess
+from typing import List, Optional, Union
+import pysam
+import pybedtools
+import pyBigWig
+
+import numpy as np
+import pandas as pd
+import concurrent.futures
+from concurrent.futures import ProcessPoolExecutor
+
+import matplotlib.pyplot as plt
+
+from ..readwrite import make_dirs
+
+def _bed_to_bigwig(fasta: str, bed: str) -> str:
+    """
+    BED → bedGraph → bigWig
+    Requires:
+      - FASTA must have a .fai index present
+    """
+
+    bed = Path(bed)
+    fa = Path(fasta)  # path to .fa
+    parent = bed.parent
+    stem = bed.stem
+    fa_stem = fa.stem
+    fai = parent / f"{fa_stem}.fai"
+
+    bedgraph = parent / f"{stem}.bedgraph"
+    bigwig = parent / f"{stem}.bw"
+
+    # 1) Compute coverage → bedGraph
+    print(f"[pybedtools] generating coverage bedgraph from {bed}")
+    bt = pybedtools.BedTool(str(bed))
+    # bedtools genomecov -bg
+    coverage = bt.genome_coverage(bg=True, genome=str(fai))
+    coverage.saveas(str(bedgraph))
+
+    # 2) Convert bedGraph → BigWig via pyBigWig
+    print(f"[pyBigWig] converting bedgraph → bigwig: {bigwig}")
+
+    # read chrom sizes from the FASTA .fai index
+    chrom_sizes = {}
+    with open(fai) as f:
+        for line in f:
+            fields = line.strip().split("\t")
+            chrom = fields[0]
+            size = int(fields[1])
+            chrom_sizes[chrom] = size
+
+    bw = pyBigWig.open(str(bigwig), "w")
+    bw.addHeader(list(chrom_sizes.items()))
+
+    with open(bedgraph) as f:
+        for line in f:
+            chrom, start, end, coverage = line.strip().split()
+            bw.addEntries(chrom, int(start), ends=int(end), values=float(coverage))
+
+    bw.close()
+
+    print(f"BigWig written: {bigwig}")
+    return str(bigwig)
+
+def _plot_bed_histograms(
+    bed_file,
+    plotting_directory,
+    fasta,
+    *,
+    bins=60,
+    clip_quantiles=(0.0, 0.995),
+    cov_bin_size=1000,            # coverage bin size in bp
+    rows_per_fig=6,               # paginate if many chromosomes
+    include_mapq_quality=True,    # add MAPQ + avg read quality columns to grid
+    coordinate_mode="one_based",  # "one_based" (the BED-like files written here) or "zero_based"
+):
+    """
+    Plot per-chromosome QC grids from a BED-like file.
+
+    Expects columns:
+        chrom, start, end, read_len, qname, mapq, avg_base_qual
+
+    For each chromosome:
+      - Column 1: Read length histogram
+      - Column 2: Coverage across the chromosome (binned)
+      - (optional) Column 3: MAPQ histogram
+      - (optional) Column 4: Avg base quality histogram
+
+    The figure is paginated: rows = chromosomes (up to rows_per_fig); columns depend on include_mapq_quality.
+    Saves one PNG per page under `plotting_directory`.
+
+    Parameters
+    ----------
+    bed_file : str
+    plotting_directory : str
+    fasta : str
+        Reference FASTA (used to get chromosome lengths).
+    bins : int
+        Histogram bins for read length / MAPQ / quality.
+    clip_quantiles : (float, float)
+        Clip hist tails for readability (e.g., (0, 0.995)).
+    cov_bin_size : int
+        Bin size (bp) for the coverage plot; bigger = faster/coarser.
+    rows_per_fig : int
+        Number of chromosomes per page.
+    include_mapq_quality : bool
+        If True, add MAPQ and avg base quality histograms as extra columns.
+    coordinate_mode : {"one_based","zero_based"}
+        One-based, inclusive (as written by aligned_BAM_to_bed) vs BED-standard zero-based, half-open.
+    """
+    os.makedirs(plotting_directory, exist_ok=True)
+
+    bed_basename = os.path.basename(bed_file).rsplit(".bed", 1)[0]
+    print(f"[plot_bed_histograms] Loading: {bed_file}")
+
+    # Load BED-like table
+    cols = ['chrom', 'start', 'end', 'read_len', 'qname', 'mapq', 'avg_q']
+    df = pd.read_csv(bed_file, sep="\t", header=None, names=cols, dtype={
+        'chrom': str, 'start': int, 'end': int, 'read_len': int, 'qname': str,
+        'mapq': float, 'avg_q': float
+    })
+
+    # Drop unaligned records (chrom == '*') if present
+    df = df[df['chrom'] != '*'].copy()
+    if df.empty:
+        print("[plot_bed_histograms] No aligned reads found; nothing to plot.")
+        return
+
+    # Keep the coordinate mode consistent; convert to 0-based half-open for bin math internally.
+    # Input is typically one_based inclusive (as written by aligned_BAM_to_bed).
+    if coordinate_mode not in {"one_based", "zero_based"}:
+        raise ValueError("coordinate_mode must be 'one_based' or 'zero_based'")
+
+    if coordinate_mode == "one_based":
+        # convert to 0-based half-open [start0, end0)
+        start0 = df['start'].to_numpy() - 1
+        end0 = df['end'].to_numpy()  # a 1-based inclusive end equals the 0-based exclusive end, so no shift
+    else:
+        # already 0-based half-open (assumption)
+        start0 = df['start'].to_numpy()
+        end0 = df['end'].to_numpy()
+
+    # Clip helper for hist tails
+    def _clip_series(s, q=(0.0, 0.995)):
+        if q is None:
+            return s.to_numpy()
+        lo = s.quantile(q[0]) if q[0] is not None else s.min()
+        hi = s.quantile(q[1]) if q[1] is not None else s.max()
+        x = s.to_numpy(dtype=float)
+        return np.clip(x, lo, hi)
+
+    # Load chromosome order/lengths from FASTA
+    with pysam.FastaFile(fasta) as fa:
+        ref_names = list(fa.references)
+        ref_lengths = dict(zip(ref_names, fa.lengths))
+
+    # Keep only chroms present in FASTA and with at least one read
+    chroms = [c for c in df['chrom'].unique() if c in ref_lengths]
+    # Order chromosomes by FASTA order
+    chrom_order = [c for c in ref_names if c in chroms]
+
+    if not chrom_order:
+        print("[plot_bed_histograms] No chromosomes from BED are present in FASTA; aborting.")
+        return
+
+    # Pagination
+    def _sanitize(name: str) -> str:
+        return "".join(ch if ch.isalnum() or ch in "-._" else "_" for ch in name)
+
+    cols_per_fig = 4 if include_mapq_quality else 2
+
+    for start_idx in range(0, len(chrom_order), rows_per_fig):
+        chunk = chrom_order[start_idx:start_idx + rows_per_fig]
+        nrows = len(chunk)
+        ncols = cols_per_fig
+
+        fig, axes = plt.subplots(
+            nrows=nrows, ncols=ncols,
+            figsize=(4.0 * ncols, 2.6 * nrows),
+            dpi=160,
+            squeeze=False
+        )
+
+        for r, chrom in enumerate(chunk):
+            chrom_len = ref_lengths[chrom]
+            mask = (df['chrom'].to_numpy() == chrom)
+
+            # Slice per-chrom arrays for speed
+            s0 = start0[mask]
+            e0 = end0[mask]
+            len_arr = df.loc[mask, 'read_len']
+            mapq_arr = df.loc[mask, 'mapq']
+            q_arr = df.loc[mask, 'avg_q']
+
+            # --- Col 1: Read length histogram (clipped) ---
+            ax = axes[r, 0]
+            ax.hist(_clip_series(len_arr, clip_quantiles), bins=bins, edgecolor="black", alpha=0.7)
+            if r == 0:
+                ax.set_title("Read length")
+            ax.set_ylabel(f"{chrom}\n(n={mask.sum()})")
+            ax.set_xlabel("bp")
+            ax.grid(alpha=0.25)
+
+            # --- Col 2: Coverage (binned over genome) ---
+            ax = axes[r, 1]
+            nb = max(1, int(np.ceil(chrom_len / cov_bin_size)))
+            # Bin edges in 0-based coords
+            edges = np.linspace(0, chrom_len, nb + 1, dtype=int)
+
+            # Compute per-bin "read count coverage": number of reads overlapping each bin.
+            # Approximate by incrementing all bins touched by the interval.
+            # (Fast and memory-light; for exact base coverage use a smaller cov_bin_size.)
+            cov = np.zeros(nb, dtype=np.int32)
+            # bin indices overlapped by each read (0-based half-open)
+            b0 = np.minimum(np.searchsorted(edges, s0, side="right") - 1, nb - 1)
+            b1 = np.maximum(np.searchsorted(edges, np.maximum(e0 - 1, 0), side="right") - 1, 0)
+            # ensure valid ordering
+            b_lo = np.minimum(b0, b1)
+            b_hi = np.maximum(b0, b1)
+
+            # Increment all bins in range; a loop, but at bin resolution (fast for reasonable cov_bin_size).
+            for lo, hi in zip(b_lo, b_hi):
+                cov[lo:hi + 1] += 1
+
+            x_mid = (edges[:-1] + edges[1:]) / 2.0
+            ax.plot(x_mid, cov)
+            if r == 0:
+                ax.set_title(f"Coverage (~{cov_bin_size} bp bins)")
+            ax.set_xlim(0, chrom_len)
+            ax.set_xlabel("Position (bp)")
+            ax.set_ylabel("")  # chrom is already shown on col 1
+            ax.grid(alpha=0.25)
+
+            if include_mapq_quality:
+                # --- Col 3: MAPQ ---
+                ax = axes[r, 2]
+                # Clip MAPQ upper tail if needed (usually 60)
+                ax.hist(_clip_series(mapq_arr.fillna(0), clip_quantiles), bins=bins, edgecolor="black", alpha=0.7)
+                if r == 0:
+                    ax.set_title("MAPQ")
+                ax.set_xlabel("MAPQ")
+                ax.grid(alpha=0.25)
+
+                # --- Col 4: Avg base quality ---
+                ax = axes[r, 3]
+                ax.hist(_clip_series(q_arr.fillna(np.nan), clip_quantiles), bins=bins, edgecolor="black", alpha=0.7)
+                if r == 0:
+                    ax.set_title("Avg base qual")
+                ax.set_xlabel("Phred")
+                ax.grid(alpha=0.25)
+
+        fig.suptitle(
+            f"{bed_basename} — per-chromosome QC "
+            f"({'len,cov,MAPQ,qual' if include_mapq_quality else 'len,cov'})",
+            y=0.995, fontsize=11
+        )
+        fig.tight_layout(rect=[0, 0, 1, 0.98])
+
+        page = start_idx // rows_per_fig + 1
+        out_png = os.path.join(plotting_directory, f"{_sanitize(bed_basename)}_qc_page{page}.png")
+        plt.savefig(out_png, bbox_inches="tight")
+        plt.close(fig)
+
+    print("[plot_bed_histograms] Done.")
+
+def aligned_BAM_to_bed(aligned_BAM, out_dir, fasta, make_bigwigs, threads=None):
+    """
+    Takes an aligned BAM as input and writes a BED-like file of reads as output.
+    Columns are: record name, start position, end position, read length, read name, mapping quality, read quality.
+
+    Parameters:
+        aligned_BAM (Path): Path to an input aligned BAM to extract to a BED file.
+        out_dir (Path): Directory to output files.
+        fasta (str): File path to the reference genome.
+        make_bigwigs (bool): Whether to generate bigwig files.
+        threads (int): Number of threads to use.
+
+    Returns:
+        None
+    """
+    threads = threads or os.cpu_count()  # Use max available cores if not specified
+
+    # Create necessary directories
+    plotting_dir = out_dir / "bed_cov_histograms"
+    bed_dir = out_dir / "beds"
+    make_dirs([plotting_dir, bed_dir])
+
+    bed_output = bed_dir / str(aligned_BAM.name).replace(".bam", "_bed.bed")
+
+    print(f"Creating BED-like file from BAM (with MAPQ and avg base quality): {aligned_BAM}")
+
+    with pysam.AlignmentFile(aligned_BAM, "rb") as bam, open(bed_output, "w") as out:
+        for read in bam.fetch(until_eof=True):
+            if read.is_unmapped:
+                chrom = "*"
+                start1 = 1
+                rl = read.query_length or 0
+                mapq = 0
+            else:
+                chrom = bam.get_reference_name(read.reference_id)
+                # pysam reference_start is 0-based → +1 for a 1-based SAM-like start
+                start1 = int(read.reference_start) + 1
+                rl = read.query_length or 0
+                mapq = int(read.mapping_quality)
+
+            # End position in 1-based inclusive coords
+            end1 = start1 + (rl or 0) - 1
+
+            qname = read.query_name
+            quals = read.query_qualities
+            if quals is None or rl == 0:
+                avg_q = float("nan")
+            else:
+                avg_q = float(np.mean(quals))
+
+            out.write(f"{chrom}\t{start1}\t{end1}\t{rl}\t{qname}\t{mapq}\t{avg_q:.3f}\n")
+
+    print(f"BED-like file created: {bed_output}")
+
+    def split_bed(bed):
+        """Split the BED into aligned and unaligned reads (chrom == '*')."""
+        bed = str(bed)
+        aligned = bed.replace(".bed", "_aligned.bed")
+        unaligned = bed.replace(".bed", "_unaligned.bed")
+        with open(bed, "r") as infile, open(aligned, "w") as aligned_out, open(unaligned, "w") as unaligned_out:
+            for line in infile:
+                (unaligned_out if line.startswith("*\t") else aligned_out).write(line)
+        os.remove(bed)
+        return aligned
+
+    print(f"Splitting: {bed_output}")
+    aligned_bed = split_bed(bed_output)
+
+    with ProcessPoolExecutor() as executor:
+        futures = []
+        futures.append(executor.submit(_plot_bed_histograms, aligned_bed, plotting_dir, fasta))
+        if make_bigwigs:
+            futures.append(executor.submit(_bed_to_bigwig, fasta, aligned_bed))
+        concurrent.futures.wait(futures)
+
+    print("Processing completed successfully.")
+
+def extract_read_lengths_from_bed(file_path):
+    """
+    Load a dict mapping read names to read lengths.
+
+    Params:
+        file_path (str): file path to a bed file
+    Returns:
+        read_dict (dict)
+    """
+    import pandas as pd
+    columns = ['chrom', 'start', 'end', 'length', 'name']
+    df = pd.read_csv(file_path, sep='\t', header=None, names=columns, comment='#')
+    read_dict = {}
+    for _, row in df.iterrows():
+        chrom = row['chrom']
+        start = row['start']
+        end = row['end']
+        name = row['name']
+        length = row['length']
+        read_dict[name] = length
+
+    return read_dict
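A minimal usage sketch for the new `aligned_BAM_to_bed` entry point (argument values illustrative). Two practical details are visible in the code above: `aligned_BAM` and `out_dir` must be `pathlib.Path` objects, since the function uses the `/` operator and `.name` on them directly, and `_bed_to_bigwig` resolves the FASTA's `.fai` index relative to the BED's directory (`out_dir/beds/<fasta_stem>.fai`), so the index must be reachable there when `make_bigwigs=True`:

```python
from pathlib import Path
from smftools.informatics.bed_functions import aligned_BAM_to_bed  # module path per the file list above

aligned_BAM_to_bed(
    aligned_BAM=Path("out/aligned/sample1.bam"),  # illustrative paths
    out_dir=Path("out"),
    fasta="refs/genome.fa",
    make_bigwigs=True,
    threads=4,
)
# Expected outputs: out/beds/sample1_bed_aligned.bed (and _unaligned.bed),
# QC pages under out/bed_cov_histograms/, and, with make_bigwigs,
# out/beds/sample1_bed_aligned.bw.
```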
--- a/smftools/informatics/helpers/converted_BAM_to_adata_II.py
+++ b/smftools/informatics/converted_BAM_to_adata.py
@@ -15,19 +15,19 @@ import shutil
 from pathlib import Path
 from typing import Union, Iterable, Optional
 
-from
+from ..readwrite import make_dirs, safe_write_h5ad
 from .binarize_converted_base_identities import binarize_converted_base_identities
-from .
-from .
-from .
-from .make_dirs import make_dirs
-from .ohe_batching import ohe_batching
+from .fasta_functions import find_conversion_sites
+from .bam_functions import count_aligned_reads, extract_base_identities
+from .ohe import ohe_batching
 
 if __name__ == "__main__":
     multiprocessing.set_start_method("forkserver", force=True)
 
-def
+def converted_BAM_to_adata(converted_FASTA,
                            split_dir,
+                           output_dir,
+                           input_already_demuxed,
                            mapping_threshold,
                            experiment_name,
                            conversions,
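Downstream code that imported from the old `helpers` package (now moved under `archived`, per the file list above) needs updated import paths. A sketch of the before/after, with the 0.2.4 module paths inferred from the relative imports in this hunk (verify against the installed wheel):

```python
# smftools 0.2.1
# from smftools.informatics.helpers.make_dirs import make_dirs
# from smftools.informatics.helpers.ohe_batching import ohe_batching

# smftools 0.2.4 (inferred)
from smftools.readwrite import make_dirs
from smftools.informatics.ohe import ohe_batching
from smftools.informatics.fasta_functions import find_conversion_sites
from smftools.informatics.bam_functions import count_aligned_reads, extract_base_identities
```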
@@ -35,20 +35,24 @@ def converted_BAM_to_adata_II(converted_FASTA,
                            device='cpu',
                            num_threads=8,
                            deaminase_footprinting=False,
-                           delete_intermediates=True
+                           delete_intermediates=True,
+                           double_barcoded_path=None,
                            ):
     """
     Converts BAM files into an AnnData object by binarizing modified base identities.
 
     Parameters:
-        converted_FASTA (
-        split_dir (
+        converted_FASTA (Path): Path to the converted FASTA reference.
+        split_dir (Path): Directory containing converted BAM files.
+        output_dir (Path): Path to the output directory.
+        input_already_demuxed (bool): Whether the input reads were already demultiplexed.
         mapping_threshold (float): Minimum fraction of aligned reads required for inclusion.
         experiment_name (str): Name for the output AnnData object.
         conversions (list): List of modification types (e.g., ['unconverted', '5mC', '6mA']).
         bam_suffix (str): File suffix for BAM files.
         num_threads (int): Number of parallel processing threads.
         deaminase_footprinting (bool): Whether the footprinting was done with a direct deamination chemistry.
+        double_barcoded_path (Path): Path to the dorado demux summary file for double-ended barcodes.
 
     Returns:
         str: Path to the final AnnData object.
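A sketch of a call under the widened signature. Only the arguments visible in this diff are shown; values are illustrative, and the remaining keyword arguments keep their defaults:

```python
from pathlib import Path

final_adata, final_adata_path = converted_BAM_to_adata(
    converted_FASTA=Path("refs/converted.fa"),  # illustrative paths
    split_dir=Path("out/split_bams"),
    output_dir=Path("out"),                     # new in 0.2.4
    input_already_demuxed=False,                # new in 0.2.4
    mapping_threshold=0.01,
    experiment_name="exp1",
    conversions=['unconverted', '5mC', '6mA'],
    bam_suffix=".bam",
    double_barcoded_path=Path("out/demux"),     # new in 0.2.4; directory holding
)                                               # dorado's barcoding_summary.txt
```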
@@ -63,22 +67,25 @@ def converted_BAM_to_adata_II(converted_FASTA,
     print(f"Using device: {device}")
 
     ## Set Up Directories and File Paths
-
-
-    tmp_dir = os.path.join(split_dir, 'tmp')
+    h5_dir = output_dir / 'h5ads'
+    tmp_dir = output_dir / 'tmp'
     final_adata = None
-    final_adata_path =
+    final_adata_path = h5_dir / f'{experiment_name}.h5ad.gz'
 
-    if
+    if final_adata_path.exists():
         print(f"{final_adata_path} already exists. Using existing AnnData object.")
         return final_adata, final_adata_path
 
     make_dirs([h5_dir, tmp_dir])
 
-
-
-
-
+    bam_files = sorted(
+        p for p in split_dir.iterdir()
+        if p.is_file()
+        and p.suffix == ".bam"
+        and "unclassified" not in p.name
+    )
+
+    bam_path_list = [split_dir / f for f in bam_files]
     print(f"Found {len(bam_files)} BAM files: {bam_files}")
 
     ## Process Conversion Sites
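The hunk above relocates intermediates from `split_dir` to `output_dir` and makes reruns cheap: if the gzipped AnnData already exists, the function returns early (note that `final_adata` is still `None` on that path). The resulting layout, inferred from this hunk:

```python
# Layout under output_dir after this change (inferred; not spelled out in the diff):
#
#   output_dir/
#   ├── h5ads/
#   │   └── {experiment_name}.h5ad.gz   # final AnnData; if present, the function
#   │                                   # returns (None, path) without reprocessing
#   └── tmp/                            # per-BAM intermediates (removed when
#                                       # delete_intermediates=True)
```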
@@ -90,10 +97,12 @@ def converted_BAM_to_adata_II(converted_FASTA,
     ## Process BAMs in Parallel
     final_adata = process_bams_parallel(bam_path_list, records_to_analyze, record_FASTA_dict, chromosome_FASTA_dict, tmp_dir, h5_dir, num_threads, max_reference_length, device, deaminase_footprinting)
 
+    final_adata.uns['References'] = {}
     for chromosome, [seq, comp] in chromosome_FASTA_dict.items():
         final_adata.var[f'{chromosome}_top_strand_FASTA_base'] = list(seq)
         final_adata.var[f'{chromosome}_bottom_strand_FASTA_base'] = list(comp)
         final_adata.uns[f'{chromosome}_FASTA_sequence'] = seq
+        final_adata.uns['References'][f'{chromosome}_FASTA_sequence'] = seq
 
     final_adata.obs_names_make_unique()
     cols = final_adata.obs.columns
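After this change each reference sequence is recorded twice in `uns`: under the legacy flat key and under the new consolidated `References` mapping. A sketch of reading it back:

```python
chromosome = "my_record"  # illustrative record name

# New consolidated location:
seq = final_adata.uns["References"][f"{chromosome}_FASTA_sequence"]

# Legacy flat key, still written for backward compatibility:
assert seq == final_adata.uns[f"{chromosome}_FASTA_sequence"]
```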
@@ -102,10 +111,13 @@ def converted_BAM_to_adata_II(converted_FASTA,
     for col in cols:
         final_adata.obs[col] = final_adata.obs[col].astype('category')
 
-
-
-
-
+    if input_already_demuxed:
+        final_adata.obs["demux_type"] = ["already"] * final_adata.shape[0]
+        final_adata.obs["demux_type"] = final_adata.obs["demux_type"].astype("category")
+    else:
+        from .h5ad_functions import add_demux_type_annotation
+        double_barcoded_reads = double_barcoded_path / "barcoding_summary.txt"
+        add_demux_type_annotation(final_adata, double_barcoded_reads)
 
     ## Delete intermediate h5ad files and temp directories
     if delete_intermediates:
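The new `demux_type` column gives every read a categorical demultiplexing annotation: uniformly "already" for pre-demultiplexed inputs, otherwise derived from dorado's `barcoding_summary.txt` via `add_demux_type_annotation`. A quick inspection sketch:

```python
# Per-category read counts for the new annotation. Note that when
# input_already_demuxed is False, double_barcoded_path must be set, or the
# "/" join above fails on None.
print(final_adata.obs["demux_type"].value_counts())
```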
@@ -211,7 +223,7 @@ def process_single_bam(bam_index, bam, records_to_analyze, record_FASTA_dict, ch
     adata_list = []
 
     for record in records_to_analyze:
-        sample =
+        sample = bam.stem
         chromosome = record_FASTA_dict[record][2]
         current_length = record_FASTA_dict[record][4]
         mod_type, strand = record_FASTA_dict[record][6], record_FASTA_dict[record][7]
@@ -329,13 +341,13 @@ def timestamp():
 def worker_function(bam_index, bam, records_to_analyze, shared_record_FASTA_dict, chromosome_FASTA_dict, tmp_dir, h5_dir, max_reference_length, device, deaminase_footprinting, progress_queue):
     """Worker function that processes a single BAM and writes the output to an H5AD file."""
     worker_id = current_process().pid  # Get worker process ID
-    sample =
+    sample = bam.stem
 
     try:
         print(f"{timestamp()} [Worker {worker_id}] Processing BAM: {sample}")
 
-        h5ad_path =
-        if
+        h5ad_path = h5_dir / bam.with_suffix(".h5ad").name
+        if h5ad_path.exists():
             print(f"{timestamp()} [Worker {worker_id}] Skipping {sample}: Already processed.")
             progress_queue.put(sample)
             return
@@ -352,7 +364,7 @@ def worker_function(bam_index, bam, records_to_analyze, shared_record_FASTA_dict
         adata = process_single_bam(bam_index, bam, bam_records_to_analyze, shared_record_FASTA_dict, chromosome_FASTA_dict, tmp_dir, max_reference_length, device, deaminase_footprinting)
 
         if adata is not None:
-            adata.write_h5ad(h5ad_path)
+            adata.write_h5ad(str(h5ad_path))
             print(f"{timestamp()} [Worker {worker_id}] Completed processing for BAM: {sample}")
 
             # Free memory
@@ -367,7 +379,7 @@ def worker_function(bam_index, bam, records_to_analyze, shared_record_FASTA_dict
 
 def process_bams_parallel(bam_path_list, records_to_analyze, record_FASTA_dict, chromosome_FASTA_dict, tmp_dir, h5_dir, num_threads, max_reference_length, device, deaminase_footprinting):
     """Processes BAM files in parallel, writes each H5AD to disk, and concatenates them at the end."""
-
+    make_dirs(h5_dir)  # Ensure h5_dir exists
 
     print(f"{timestamp()} Starting parallel BAM processing with {num_threads} threads...")
 
@@ -403,7 +415,7 @@ def process_bams_parallel(bam_path_list, records_to_analyze, record_FASTA_dict,
     pool.join()  # Ensure all workers finish
 
     # Final Concatenation Step
-    h5ad_files = [
+    h5ad_files = [h5_dir / f for f in h5_dir.iterdir() if f.suffix == ".h5ad"]
 
     if not h5ad_files:
         print(f"{timestamp()} No valid H5AD files generated. Exiting.")