PyPI - smftools - Versions diffs - 0.2.4__py3-none-any.whl → 0.2.5__py3-none-any.whl - Mend

smftools-0.2.4-py3-none-any.whl → smftools-0.2.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (133) hide show

smftools/__init__.py +6 -8
smftools/_settings.py +4 -6
smftools/_version.py +1 -1
smftools/cli/helpers.py +7 -1
smftools/cli/hmm_adata.py +902 -244
smftools/cli/load_adata.py +318 -198
smftools/cli/preprocess_adata.py +285 -171
smftools/cli/spatial_adata.py +137 -53
smftools/cli_entry.py +94 -178
smftools/config/__init__.py +1 -1
smftools/config/conversion.yaml +5 -1
smftools/config/deaminase.yaml +1 -1
smftools/config/default.yaml +22 -17
smftools/config/direct.yaml +8 -3
smftools/config/discover_input_files.py +19 -5
smftools/config/experiment_config.py +505 -276
smftools/constants.py +37 -0
smftools/datasets/__init__.py +2 -8
smftools/datasets/datasets.py +32 -18
smftools/hmm/HMM.py +2125 -1426
smftools/hmm/__init__.py +2 -3
smftools/hmm/archived/call_hmm_peaks.py +16 -1
smftools/hmm/call_hmm_peaks.py +173 -193
smftools/hmm/display_hmm.py +19 -6
smftools/hmm/hmm_readwrite.py +13 -4
smftools/hmm/nucleosome_hmm_refinement.py +102 -14
smftools/informatics/__init__.py +30 -7
smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +14 -1
smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
smftools/informatics/archived/helpers/archived/load_adata.py +3 -3
smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +3 -1
smftools/informatics/archived/print_bam_query_seq.py +7 -1
smftools/informatics/bam_functions.py +379 -156
smftools/informatics/basecalling.py +51 -9
smftools/informatics/bed_functions.py +90 -57
smftools/informatics/binarize_converted_base_identities.py +18 -7
smftools/informatics/complement_base_list.py +7 -6
smftools/informatics/converted_BAM_to_adata.py +265 -122
smftools/informatics/fasta_functions.py +161 -83
smftools/informatics/h5ad_functions.py +195 -29
smftools/informatics/modkit_extract_to_adata.py +609 -270
smftools/informatics/modkit_functions.py +85 -44
smftools/informatics/ohe.py +44 -21
smftools/informatics/pod5_functions.py +112 -73
smftools/informatics/run_multiqc.py +20 -14
smftools/logging_utils.py +51 -0
smftools/machine_learning/__init__.py +2 -7
smftools/machine_learning/data/anndata_data_module.py +143 -50
smftools/machine_learning/data/preprocessing.py +2 -1
smftools/machine_learning/evaluation/__init__.py +1 -1
smftools/machine_learning/evaluation/eval_utils.py +11 -14
smftools/machine_learning/evaluation/evaluators.py +46 -33
smftools/machine_learning/inference/__init__.py +1 -1
smftools/machine_learning/inference/inference_utils.py +7 -4
smftools/machine_learning/inference/lightning_inference.py +9 -13
smftools/machine_learning/inference/sklearn_inference.py +6 -8
smftools/machine_learning/inference/sliding_window_inference.py +35 -25
smftools/machine_learning/models/__init__.py +10 -5
smftools/machine_learning/models/base.py +28 -42
smftools/machine_learning/models/cnn.py +15 -11
smftools/machine_learning/models/lightning_base.py +71 -40
smftools/machine_learning/models/mlp.py +13 -4
smftools/machine_learning/models/positional.py +3 -2
smftools/machine_learning/models/rnn.py +3 -2
smftools/machine_learning/models/sklearn_models.py +39 -22
smftools/machine_learning/models/transformer.py +68 -53
smftools/machine_learning/models/wrappers.py +2 -1
smftools/machine_learning/training/__init__.py +2 -2
smftools/machine_learning/training/train_lightning_model.py +29 -20
smftools/machine_learning/training/train_sklearn_model.py +9 -15
smftools/machine_learning/utils/__init__.py +1 -1
smftools/machine_learning/utils/device.py +7 -4
smftools/machine_learning/utils/grl.py +3 -1
smftools/metadata.py +443 -0
smftools/plotting/__init__.py +19 -5
smftools/plotting/autocorrelation_plotting.py +145 -44
smftools/plotting/classifiers.py +162 -72
smftools/plotting/general_plotting.py +347 -168
smftools/plotting/hmm_plotting.py +42 -13
smftools/plotting/position_stats.py +145 -85
smftools/plotting/qc_plotting.py +20 -12
smftools/preprocessing/__init__.py +8 -8
smftools/preprocessing/append_base_context.py +105 -79
smftools/preprocessing/append_binary_layer_by_base_context.py +75 -37
smftools/preprocessing/{archives → archived}/calculate_complexity.py +3 -1
smftools/preprocessing/{archives → archived}/preprocessing.py +8 -6
smftools/preprocessing/binarize.py +21 -4
smftools/preprocessing/binarize_on_Youden.py +127 -31
smftools/preprocessing/binary_layers_to_ohe.py +17 -11
smftools/preprocessing/calculate_complexity_II.py +86 -59
smftools/preprocessing/calculate_consensus.py +28 -19
smftools/preprocessing/calculate_coverage.py +44 -22
smftools/preprocessing/calculate_pairwise_differences.py +2 -1
smftools/preprocessing/calculate_pairwise_hamming_distances.py +4 -3
smftools/preprocessing/calculate_position_Youden.py +103 -55
smftools/preprocessing/calculate_read_length_stats.py +52 -23
smftools/preprocessing/calculate_read_modification_stats.py +91 -57
smftools/preprocessing/clean_NaN.py +38 -28
smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
smftools/preprocessing/filter_reads_on_length_quality_mapping.py +70 -37
smftools/preprocessing/filter_reads_on_modification_thresholds.py +181 -73
smftools/preprocessing/flag_duplicate_reads.py +688 -271
smftools/preprocessing/invert_adata.py +26 -11
smftools/preprocessing/load_sample_sheet.py +40 -22
smftools/preprocessing/make_dirs.py +8 -3
smftools/preprocessing/min_non_diagonal.py +2 -1
smftools/preprocessing/recipes.py +56 -23
smftools/preprocessing/reindex_references_adata.py +93 -27
smftools/preprocessing/subsample_adata.py +33 -16
smftools/readwrite.py +264 -109
smftools/schema/__init__.py +11 -0
smftools/schema/anndata_schema_v1.yaml +227 -0
smftools/tools/__init__.py +3 -4
smftools/tools/archived/classifiers.py +163 -0
smftools/tools/archived/subset_adata_v1.py +10 -1
smftools/tools/archived/subset_adata_v2.py +12 -1
smftools/tools/calculate_umap.py +54 -15
smftools/tools/cluster_adata_on_methylation.py +115 -46
smftools/tools/general_tools.py +70 -25
smftools/tools/position_stats.py +229 -98
smftools/tools/read_stats.py +50 -29
smftools/tools/spatial_autocorrelation.py +365 -192
smftools/tools/subset_adata.py +23 -21
{smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/METADATA +15 -43
smftools-0.2.5.dist-info/RECORD +181 -0
smftools-0.2.4.dist-info/RECORD +0 -176
/smftools/preprocessing/{archives → archived}/add_read_length_and_mapping_qc.py +0 -0
/smftools/preprocessing/{archives → archived}/mark_duplicates.py +0 -0
/smftools/preprocessing/{archives → archived}/remove_duplicates.py +0 -0
{smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/WHEEL +0 -0
{smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/entry_points.txt +0 -0
{smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/licenses/LICENSE +0 -0

smftools/informatics/bam_functions.py CHANGED Viewed

@@ -1,24 +1,55 @@
 from __future__ import annotations
-from pathlib import Path
+import glob
 import os
+import re
 import subprocess
-import glob
 import time
-from typing import Dict, List, Any, Tuple, Union, Optional, Iterable
-import re
+from collections import Counter, defaultdict, deque
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from itertools import zip_longest
-import pysam
+from pathlib import Path
+from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
 import numpy as np
-import concurrent.futures
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from concurrent.futures import ProcessPoolExecutor
+import pysam
 from tqdm import tqdm
-from collections import defaultdict, Counter
-from ..readwrite import make_dirs, time_string, date_string
+from smftools.logging_utils import get_logger
+from ..readwrite import date_string, time_string
+logger = get_logger(__name__)
+_PROGRESS_RE = re.compile(r"Output records written:\s*(\d+)")
+_EMPTY_RE = re.compile(r"^\s*$")
+def _stream_dorado_logs(stderr_iter) -> None:
+    """Stream dorado stderr and emit structured log messages.
+    Args:
+        stderr_iter: Iterable of stderr lines.
+    """
+    last_n: int | None = None
+    for raw in stderr_iter:
+        line = raw.rstrip("\n")
+        if _EMPTY_RE.match(line):
+            continue
+        m = _PROGRESS_RE.search(line)
+        if m:
+            n = int(m.group(1))
+            logger.debug("[dorado] Output records written: %d", n)
+            last_n = n
+            continue
+        logger.info("[dorado] %s", line)
+    if last_n is not None:
+        logger.info("[dorado] Final output records written: %d", last_n)
 def _bam_to_fastq_with_pysam(bam_path: Union[str, Path], fastq_path: Union[str, Path]) -> None:
     """
@@ -26,7 +57,13 @@ def _bam_to_fastq_with_pysam(bam_path: Union[str, Path], fastq_path: Union[str,
     """
     bam_path = str(bam_path)
     fastq_path = str(fastq_path)
-    with pysam.AlignmentFile(bam_path, "rb", check_sq=False) as bam, open(fastq_path, "w", encoding="utf-8") as fq:
+    logger.debug(f"Converting BAM to FASTQ using _bam_to_fastq_with_pysam")
+    with (
+        pysam.AlignmentFile(bam_path, "rb", check_sq=False) as bam,
+        open(fastq_path, "w", encoding="utf-8") as fq,
+    ):
         for r in bam.fetch(until_eof=True):
             # Optionally skip secondary/supplementary:
             # if r.is_secondary or r.is_supplementary:
@@ -45,14 +82,22 @@ def _bam_to_fastq_with_pysam(bam_path: Union[str, Path], fastq_path: Union[str,
                 # q is an array/list of ints (Phred scores).
                 # Convert to FASTQ string with Phred+33 encoding,
                 # clamping to sane range [0, 93] to stay in printable ASCII.
-                qual_str = "".join(
-                    chr(min(max(int(qv), 0), 93) + 33)
-                    for qv in q
-                )
+                qual_str = "".join(chr(min(max(int(qv), 0), 93) + 33) for qv in q)
             fq.write(f"@{name}\n{seq}\n+\n{qual_str}\n")
-def _sort_bam_with_pysam(in_bam: Union[str, Path], out_bam: Union[str, Path], threads: Optional[int] = None) -> None:
+def _sort_bam_with_pysam(
+    in_bam: Union[str, Path], out_bam: Union[str, Path], threads: Optional[int] = None
+) -> None:
+    """Sort a BAM file using pysam.
+    Args:
+        in_bam: Input BAM path.
+        out_bam: Output BAM path.
+        threads: Optional thread count.
+    """
+    logger.debug(f"Sorting BAM using _sort_bam_with_pysam")
     in_bam, out_bam = str(in_bam), str(out_bam)
     args = []
     if threads:
@@ -60,21 +105,31 @@ def _sort_bam_with_pysam(in_bam: Union[str, Path], out_bam: Union[str, Path], th
     args += ["-o", out_bam, in_bam]
     pysam.sort(*args)
 def _index_bam_with_pysam(bam_path: Union[str, Path], threads: Optional[int] = None) -> None:
+    """Index a BAM file using pysam.
+    Args:
+        bam_path: BAM path to index.
+        threads: Optional thread count.
+    """
     bam_path = str(bam_path)
+    logger.debug(f"Indexing BAM using _index_bam_with_pysam")
     # pysam.index supports samtools-style args
     if threads:
         pysam.index("-@", str(threads), bam_path)
     else:
         pysam.index(bam_path)
-def align_and_sort_BAM(fasta,
-                       input,
-                       cfg,
+def align_and_sort_BAM(
+    fasta,
+    input,
+    cfg,
 ):
     """
     A wrapper for running dorado aligner and samtools functions
     Parameters:
         fasta (str): File path to the reference genome to align to.
         input (str): File path to the basecalled file to align. Works for .bam and .fastq files
@@ -84,61 +139,95 @@ def align_and_sort_BAM(fasta,
         None
             The function writes out files for: 1) An aligned BAM, 2) and aligned_sorted BAM, 3) an index file for the aligned_sorted BAM, 4) A bed file for the aligned_sorted BAM, 5) A text file containing read names in the aligned_sorted BAM
     """
+    logger.debug("Aligning and sorting BAM using align_and_sort_BAM")
     input_basename = input.name
     input_suffix = input.suffix
-    input_as_fastq = input.with_name(input.stem + '.fastq')
+    input_as_fastq = input.with_name(input.stem + ".fastq")
     output_path_minus_suffix = cfg.output_directory / input.stem
     aligned_BAM = output_path_minus_suffix.with_name(output_path_minus_suffix.stem + "_aligned")
     aligned_output = aligned_BAM.with_suffix(cfg.bam_suffix)
-    aligned_sorted_BAM =aligned_BAM.with_name(aligned_BAM.stem + "_sorted")
+    aligned_sorted_BAM = aligned_BAM.with_name(aligned_BAM.stem + "_sorted")
     aligned_sorted_output = aligned_sorted_BAM.with_suffix(cfg.bam_suffix)
     if cfg.threads:
         threads = str(cfg.threads)
     else:
         threads = None
-    if cfg.aligner == 'minimap2':
+    if cfg.aligner == "minimap2":
         if not cfg.align_from_bam:
-            print(f"Converting BAM to FASTQ: {input}")
+            logger.debug(f"Converting BAM to FASTQ: {input}")
             _bam_to_fastq_with_pysam(input, input_as_fastq)
-            print(f"Aligning FASTQ to Reference: {input_as_fastq}")
+            logger.debug(f"Aligning FASTQ to Reference: {input_as_fastq}")
             mm_input = input_as_fastq
-        else:
-            print(f"Aligning BAM to Reference: {input}")
+        else:
+            logger.debug(f"Aligning BAM to Reference: {input}")
             mm_input = input
         if threads:
-            minimap_command = ['minimap2'] + cfg.aligner_args + ['-t', threads, str(fasta), str(mm_input)]
+            minimap_command = (
+                ["minimap2"] + cfg.aligner_args + ["-t", threads, str(fasta), str(mm_input)]
+            )
         else:
-            minimap_command = ['minimap2'] + cfg.aligner_args + [str(fasta), str(mm_input)]
-        subprocess.run(minimap_command, stdout=open(aligned_output, "wb"))
+            minimap_command = ["minimap2"] + cfg.aligner_args + [str(fasta), str(mm_input)]
+        with open(aligned_output, "wb") as out:
+            proc = subprocess.Popen(
+                minimap_command,
+                stdout=out,
+                stderr=subprocess.PIPE,
+                text=True,
+            )
+            assert proc.stderr is not None
+            for line in proc.stderr:
+                logger.info("[minimap2] %s", line.rstrip())
+            ret = proc.wait()
+            if ret != 0:
+                raise RuntimeError(f"minimap2 failed with exit code {ret}")
         if not cfg.align_from_bam:
             os.remove(input_as_fastq)
-    elif cfg.aligner == 'dorado':
+    elif cfg.aligner == "dorado":
         # Run dorado aligner
         print(f"Aligning BAM to Reference: {input}")
         if threads:
-            alignment_command = ["dorado", "aligner", "-t", threads] + cfg.aligner_args + [str(fasta), str(input)]
+            alignment_command = (
+                ["dorado", "aligner", "-t", threads] + cfg.aligner_args + [str(fasta), str(input)]
+            )
         else:
             alignment_command = ["dorado", "aligner"] + cfg.aligner_args + [str(fasta), str(input)]
-        subprocess.run(alignment_command, stdout=open(aligned_output, "wb"))
+        with open(aligned_output, "wb") as out:
+            proc = subprocess.Popen(
+                alignment_command,
+                stdout=out,
+                stderr=subprocess.PIPE,
+                text=True,
+            )
+            assert proc.stderr is not None
+            _stream_dorado_logs(proc.stderr)
+            ret = proc.wait()
+            if ret != 0:
+                raise RuntimeError(f"dorado failed with exit code {ret}")
     else:
-        print(f'Aligner not recognized: {cfg.aligner}. Choose from minimap2 and dorado')
+        logger.error(f"Aligner not recognized: {cfg.aligner}. Choose from minimap2 and dorado")
         return
     # --- Sort & Index with pysam ---
-    print(f"[pysam] Sorting: {aligned_output} -> {aligned_sorted_output}")
+    logger.debug(f"Sorting: {aligned_output} -> {aligned_sorted_output}")
     _sort_bam_with_pysam(aligned_output, aligned_sorted_output, threads=threads)
-    print(f"[pysam] Indexing: {aligned_sorted_output}")
+    logger.debug(f"Indexing: {aligned_sorted_output}")
     _index_bam_with_pysam(aligned_sorted_output, threads=threads)
 def bam_qc(
     bam_files: Iterable[str | Path],
     bam_qc_dir: str | Path,
@@ -153,133 +242,154 @@ def bam_qc(
     Prefers pysam; falls back to `samtools` if needed.
     Runs BAMs in parallel (up to `threads`, default serial).
     """
-    import subprocess
     import shutil
+    import subprocess
+    logger.debug("Performing BAM QC using bam_qc")
     # Try to import pysam once
     try:
-        import pysam
-        HAVE_PYSAM = True
+        import pysam  # type: ignore
+        have_pysam = True
     except Exception:
-        HAVE_PYSAM = False
+        pysam = None  # type: ignore
+        have_pysam = False
     bam_qc_dir = Path(bam_qc_dir)
     bam_qc_dir.mkdir(parents=True, exist_ok=True)
-    bam_files = [Path(b) for b in bam_files]
+    bam_paths = [Path(b) for b in bam_files]
     def _has_index(p: Path) -> bool:
-        if p.suffix.lower() == ".bam":
-            bai = p.with_suffix(p.suffix + ".bai")
-            bai_alt = Path(str(p) + ".bai")
-            return bai.exists() or bai_alt.exists()
-        if p.suffix.lower() == ".cram":
-            crai = Path(str(p) + ".crai")
-            return crai.exists()
+        """Return True if a BAM/CRAM index exists for the path."""
+        suf = p.suffix.lower()
+        if suf == ".bam":
+            return p.with_suffix(p.suffix + ".bai").exists() or Path(str(p) + ".bai").exists()
+        if suf == ".cram":
+            return Path(str(p) + ".crai").exists()
         return False
     def _ensure_index(p: Path) -> None:
+        """Ensure a BAM/CRAM index exists, creating one if needed."""
         if _has_index(p):
             return
-        if HAVE_PYSAM:
-            # pysam.index supports both BAM & CRAM
-            pysam.index(str(p))
+        if have_pysam:
+            assert pysam is not None
+            pysam.index(str(p))  # supports BAM & CRAM
         else:
+            if not shutil.which("samtools"):
+                raise RuntimeError("Neither pysam nor samtools is available in PATH.")
             cmd = ["samtools", "index", str(p)]
-            subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+            # capture text so errors are readable; raise on failure
+            cp = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, text=True)
+            if cp.returncode != 0:
+                raise RuntimeError(f"samtools index failed (exit {cp.returncode}):\n{cp.stderr}")
+    def _run_samtools_to_file(cmd: list[str], out_path: Path, bam: Path, tag: str) -> int:
+        """
+        Stream stderr to logger; write stdout to out_path; return rc; raise with stderr tail on failure.
+        """
+        last_err = deque(maxlen=80)
+        out_path.parent.mkdir(parents=True, exist_ok=True)
+        with open(out_path, "w") as fh:
+            proc = subprocess.Popen(cmd, stdout=fh, stderr=subprocess.PIPE, text=True)
+            assert proc.stderr is not None
+            for line in proc.stderr:
+                line = line.rstrip()
+                if line:
+                    last_err.append(line)
+                    logger.info("[%s][%s] %s", tag, bam.name, line)
+            rc = proc.wait()
+        if rc != 0:
+            tail = "\n".join(last_err)
+            raise RuntimeError(f"{tag} failed for {bam} (exit {rc}). Stderr tail:\n{tail}")
+        return rc
+    def _run_one(bam: Path) -> tuple[Path, list[tuple[str, int]]]:
+        """Run stats/flagstat/idxstats for a single BAM.
+        Args:
+            bam: Path to the BAM file.
+        Returns:
+            Tuple of (bam_path, list of (stage, return_code)).
+        """
+        import subprocess
-    def _run_one(bam: Path) -> Tuple[Path, List[Tuple[str, int]]]:
-        # outputs + return (file, [(task_name, returncode)])
-        results: List[Tuple[str, int]] = []
-        base = bam.stem  # filename without .bam
+        results: list[tuple[str, int]] = []
+        base = bam.stem  # e.g. sample.bam -> sample
         out_stats = bam_qc_dir / f"{base}_stats.txt"
         out_flag = bam_qc_dir / f"{base}_flagstat.txt"
-        out_idx  = bam_qc_dir / f"{base}_idxstats.txt"
+        out_idx = bam_qc_dir / f"{base}_idxstats.txt"
-        # Make sure index exists (samtools stats/flagstat don’t require, idxstats does)
+        # Make sure index exists (idxstats requires; stats/flagstat usually don't, but indexing is cheap/useful)
         try:
             _ensure_index(bam)
         except Exception as e:
-            # Still attempt stats/flagstat if requested
-            print(f"[warn] Indexing failed for {bam}: {e}")
-        # Choose runner per task
-        def run_stats():
-            if not stats:
-                return
-            if HAVE_PYSAM and hasattr(pysam, "stats"):
+            # Still attempt stats/flagstat if requested; idxstats may fail later if index is required.
+            logger.warning("Indexing failed for %s: %s", bam, e)
+        if not have_pysam:
+            import shutil
+            if not shutil.which("samtools"):
+                raise RuntimeError("Neither pysam nor samtools is available in PATH.")
+        # --- stats ---
+        if stats:
+            if have_pysam and pysam is not None and hasattr(pysam, "stats"):
                 txt = pysam.stats(str(bam))
                 out_stats.write_text(txt)
                 results.append(("stats(pysam)", 0))
             else:
                 cmd = ["samtools", "stats", str(bam)]
-                with open(out_stats, "w") as fh:
-                    cp = subprocess.run(cmd, stdout=fh, stderr=subprocess.PIPE)
-                results.append(("stats(samtools)", cp.returncode))
-                if cp.returncode != 0:
-                    raise RuntimeError(cp.stderr.decode(errors="replace"))
-        def run_flagstat():
-            if not flagstats:
-                return
-            if HAVE_PYSAM and hasattr(pysam, "flagstat"):
+                rc = _run_samtools_to_file(cmd, out_stats, bam, "samtools stats")
+                results.append(("stats(samtools)", rc))
+        # --- flagstat ---
+        if flagstats:
+            if have_pysam and pysam is not None and hasattr(pysam, "flagstat"):
                 txt = pysam.flagstat(str(bam))
                 out_flag.write_text(txt)
                 results.append(("flagstat(pysam)", 0))
             else:
                 cmd = ["samtools", "flagstat", str(bam)]
-                with open(out_flag, "w") as fh:
-                    cp = subprocess.run(cmd, stdout=fh, stderr=subprocess.PIPE)
-                results.append(("flagstat(samtools)", cp.returncode))
-                if cp.returncode != 0:
-                    raise RuntimeError(cp.stderr.decode(errors="replace"))
-        def run_idxstats():
-            if not idxstats:
-                return
-            if HAVE_PYSAM and hasattr(pysam, "idxstats"):
+                rc = _run_samtools_to_file(cmd, out_flag, bam, "samtools flagstat")
+                results.append(("flagstat(samtools)", rc))
+        # --- idxstats ---
+        if idxstats:
+            if have_pysam and pysam is not None and hasattr(pysam, "idxstats"):
                 txt = pysam.idxstats(str(bam))
                 out_idx.write_text(txt)
                 results.append(("idxstats(pysam)", 0))
             else:
                 cmd = ["samtools", "idxstats", str(bam)]
-                with open(out_idx, "w") as fh:
-                    cp = subprocess.run(cmd, stdout=fh, stderr=subprocess.PIPE)
-                results.append(("idxstats(samtools)", cp.returncode))
-                if cp.returncode != 0:
-                    raise RuntimeError(cp.stderr.decode(errors="replace"))
-        # Sanity: ensure samtools exists if pysam missing
-        if not HAVE_PYSAM:
-            if not shutil.which("samtools"):
-                raise RuntimeError("Neither pysam nor samtools is available in PATH.")
+                rc = _run_samtools_to_file(cmd, out_idx, bam, "samtools idxstats")
+                results.append(("idxstats(samtools)", rc))
-        # Execute tasks (serial per file; parallelized across files)
-        run_stats()
-        run_flagstat()
-        run_idxstats()
         return bam, results
-    # Parallel across BAMs
     max_workers = int(threads) if threads and int(threads) > 0 else 1
-    futures = []
-    with ThreadPoolExecutor(max_workers=max_workers) as ex:
-        for b in bam_files:
-            futures.append(ex.submit(_run_one, b))
-        for fut in as_completed(futures):
+    with ThreadPoolExecutor(max_workers=max_workers) as ex:
+        futs = [ex.submit(_run_one, b) for b in bam_paths]
+        for fut in as_completed(futs):
             try:
                 bam, res = fut.result()
                 summary = ", ".join(f"{name}:{rc}" for name, rc in res) or "no-op"
-                print(f"[qc] {bam.name}: {summary}")
+                logger.info("[qc] %s: %s", bam.name, summary)
             except Exception as e:
-                print(f"[error] QC failed: {e}")
+                logger.exception("QC failed: %s", e)
+    if modality not in {"conversion", "direct", "deaminase"}:
+        logger.warning("Unknown modality '%s', continuing.", modality)
-    # Placeholders to keep your signature stable
-    if modality not in {"conversion", "direct"}:
-        print(f"[warn] Unknown modality '{modality}', continuing.")
+    logger.info("QC processing completed.")
-    print("QC processing completed.")
 def concatenate_fastqs_to_bam(
     fastq_files: List[Union[str, Tuple[str, str], Path, Tuple[Path, Path]]],
@@ -326,12 +436,29 @@ def concatenate_fastqs_to_bam(
         """
         name = p.name
         lowers = name.lower()
-        for ext in (".fastq.gz", ".fq.gz", ".fastq.bz2", ".fq.bz2", ".fastq.xz", ".fq.xz", ".fastq", ".fq"):
+        for ext in (
+            ".fastq.gz",
+            ".fq.gz",
+            ".fastq.bz2",
+            ".fq.bz2",
+            ".fastq.xz",
+            ".fq.xz",
+            ".fastq",
+            ".fq",
+        ):
             if lowers.endswith(ext):
                 return name[: -len(ext)]
         return p.stem  # fallback: remove last suffix only
     def _extract_barcode_from_filename(p: Path) -> str:
+        """Extract a barcode token from a FASTQ filename.
+        Args:
+            p: FASTQ path.
+        Returns:
+            Barcode token string.
+        """
         stem = _strip_fastq_ext(p)
         if "_" in stem:
             token = stem.split("_")[-1]
@@ -340,10 +467,18 @@ def concatenate_fastqs_to_bam(
         return stem
     def _classify_read_token(stem: str) -> Tuple[Optional[str], Optional[int]]:
+        """Classify a FASTQ filename stem into (prefix, read_number).
+        Args:
+            stem: Filename stem.
+        Returns:
+            Tuple of (prefix, read_number) or (None, None) if not matched.
+        """
         # return (prefix, readnum) if matches; else (None, None)
         patterns = [
-            r"(?i)(.*?)[._-]r?([12])$",        # prefix_R1 / prefix.r2 / prefix-1
-            r"(?i)(.*?)[._-]read[_-]?([12])$", # prefix_read1
+            r"(?i)(.*?)[._-]r?([12])$",  # prefix_R1 / prefix.r2 / prefix-1
+            r"(?i)(.*?)[._-]read[_-]?([12])$",  # prefix_read1
         ]
         for pat in patterns:
             m = re.match(pat, stem)
@@ -352,6 +487,14 @@ def concatenate_fastqs_to_bam(
         return None, None
     def _pair_by_filename(paths: List[Path]) -> Tuple[List[Tuple[Path, Path]], List[Path]]:
+        """Pair FASTQ files based on filename conventions.
+        Args:
+            paths: FASTQ paths to pair.
+        Returns:
+            Tuple of (paired list, leftover list).
+        """
         pref_map: Dict[str, Dict[int, Path]] = {}
         unpaired: List[Path] = []
         for pth in paths:
@@ -373,6 +516,14 @@ def concatenate_fastqs_to_bam(
         return pairs, leftovers
     def _fastq_iter(p: Path):
+        """Yield FASTQ records using pysam.FastxFile.
+        Args:
+            p: FASTQ path.
+        Yields:
+            Pysam Fastx records.
+        """
         # pysam.FastxFile handles compressed extensions transparently
         with pysam.FastxFile(str(p)) as fx:
             for rec in fx:
@@ -386,6 +537,19 @@ def concatenate_fastqs_to_bam(
         read1: bool,
         read2: bool,
     ) -> pysam.AlignedSegment:
+        """Construct an unaligned pysam.AlignedSegment.
+        Args:
+            name: Read name.
+            seq: Read sequence.
+            qual: FASTQ quality string.
+            bc: Barcode string.
+            read1: Whether this is read 1.
+            read2: Whether this is read 2.
+        Returns:
+            Unaligned pysam.AlignedSegment.
+        """
         a = pysam.AlignedSegment()
         a.query_name = name
         a.query_sequence = seq
@@ -408,6 +572,7 @@ def concatenate_fastqs_to_bam(
     # ---------- normalize inputs to Path ----------
     def _to_path_pair(x) -> Tuple[Path, Path]:
+        """Convert a tuple of path-like objects to Path instances."""
         a, b = x
         return Path(a), Path(b)
@@ -450,7 +615,10 @@ def concatenate_fastqs_to_bam(
     # ---------- BAM header ----------
     header = {"HD": {"VN": "1.6", "SO": "unknown"}, "SQ": []}
     if add_read_group:
-        header["RG"] = [{"ID": bc, **({"SM": rg_sample_field} if rg_sample_field else {})} for bc in barcodes_in_order]
+        header["RG"] = [
+            {"ID": bc, **({"SM": rg_sample_field} if rg_sample_field else {})}
+            for bc in barcodes_in_order
+        ]
     header.setdefault("PG", []).append(
         {"ID": "concat-fastq", "PN": "concatenate_fastqs_to_bam", "VN": "1"}
     )
@@ -476,7 +644,9 @@ def concatenate_fastqs_to_bam(
             it2 = _fastq_iter(r2_path)
             for rec1, rec2 in zip_longest(it1, it2, fillvalue=None):
                 def _clean(n: Optional[str]) -> Optional[str]:
+                    """Normalize FASTQ read names by trimming read suffixes."""
                     if n is None:
                         return None
                     return re.sub(r"(?:/1$|/2$|\s[12]$)", "", n)
@@ -489,12 +659,16 @@ def concatenate_fastqs_to_bam(
                 )
                 if rec1 is not None:
-                    a1 = _make_unaligned_segment(name, rec1.sequence, rec1.quality, bc, read1=True, read2=False)
+                    a1 = _make_unaligned_segment(
+                        name, rec1.sequence, rec1.quality, bc, read1=True, read2=False
+                    )
                     bam_out.write(a1)
                     per_file_counts[r1_path] = per_file_counts.get(r1_path, 0) + 1
                     total_written += 1
                 if rec2 is not None:
-                    a2 = _make_unaligned_segment(name, rec2.sequence, rec2.quality, bc, read1=False, read2=True)
+                    a2 = _make_unaligned_segment(
+                        name, rec2.sequence, rec2.quality, bc, read1=False, read2=True
+                    )
                     bam_out.write(a2)
                     per_file_counts[r2_path] = per_file_counts.get(r2_path, 0) + 1
                     total_written += 1
@@ -516,7 +690,9 @@ def concatenate_fastqs_to_bam(
                 raise FileNotFoundError(pth)
             bc = per_path_barcode.get(pth, "barcode")
             for rec in _fastq_iter(pth):
-                a = _make_unaligned_segment(rec.name, rec.sequence, rec.quality, bc, read1=False, read2=False)
+                a = _make_unaligned_segment(
+                    rec.name, rec.sequence, rec.quality, bc, read1=False, read2=False
+                )
                 bam_out.write(a)
                 per_file_counts[pth] = per_file_counts.get(pth, 0) + 1
                 total_written += 1
@@ -530,20 +706,21 @@ def concatenate_fastqs_to_bam(
         "barcodes": barcodes_in_order,
     }
 def count_aligned_reads(bam_file):
     """
     Counts the number of aligned reads in a bam file that map to each reference record.
     Parameters:
         bam_file (str): A string representing the path to an aligned BAM file.
     Returns:
        aligned_reads_count (int): The total number of reads aligned in the BAM.
        unaligned_reads_count (int): The total number of reads not aligned in the BAM.
        record_counts (dict): A dictionary keyed by reference record name that points to a tuple containing the total reads mapped to the record and the fraction of mapped reads which map to the record.
     """
-    print('{0}: Counting aligned reads in BAM > {1}'.format(time_string(), bam_file))
+    print("{0}: Counting aligned reads in BAM > {1}".format(time_string(), bam_file))
     aligned_reads_count = 0
     unaligned_reads_count = 0
     # Make a dictionary, keyed by the reference_name of reference chromosome that points to an integer number of read counts mapped to the chromosome, as well as the proportion of mapped reads in that chromosome
@@ -552,12 +729,14 @@ def count_aligned_reads(bam_file):
     with pysam.AlignmentFile(str(bam_file), "rb") as bam:
         total_reads = bam.mapped + bam.unmapped
         # Iterate over reads to get the total mapped read counts and the reads that map to each reference
-        for read in tqdm(bam, desc='Counting aligned reads in BAM', total=total_reads):
+        for read in tqdm(bam, desc="Counting aligned reads in BAM", total=total_reads):
             if read.is_unmapped:
                 unaligned_reads_count += 1
             else:
                 aligned_reads_count += 1
-                record_counts[read.reference_name] += 1  # Automatically increments if key exists, adds if not
+                record_counts[read.reference_name] += (
+                    1  # Automatically increments if key exists, adds if not
+                )
         # reformat the dictionary to contain read counts mapped to the reference, as well as the proportion of mapped reads in reference
         for reference in record_counts:
@@ -566,7 +745,10 @@ def count_aligned_reads(bam_file):
     return aligned_reads_count, unaligned_reads_count, dict(record_counts)
-def demux_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit, barcode_both_ends, trim, threads):
+def demux_and_index_BAM(
+    aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit, barcode_both_ends, trim, threads
+):
     """
     A wrapper function for splitting BAMS and indexing them.
     Parameters:
@@ -577,11 +759,12 @@ def demux_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit,
         barcode_both_ends (bool): Whether to require both ends to be barcoded.
         trim (bool): Whether to trim off barcodes after demultiplexing.
         threads (int): Number of threads to use.
     Returns:
         bam_files (list): List of split BAM file path strings
             Splits an input BAM file on barcode value and makes a BAM index file.
     """
     input_bam = aligned_sorted_BAM.with_suffix(bam_suffix)
     command = ["dorado", "demux", "--kit-name", barcode_kit]
     if barcode_both_ends:
@@ -594,25 +777,37 @@ def demux_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit,
         pass
     command += ["--emit-summary", "--sort-bam", "--output-dir", str(split_dir)]
     command.append(str(input_bam))
-    command_string = ' '.join(command)
-    print(f"Running: {command_string}")
-    subprocess.run(command)
+    command_string = " ".join(command)
+    logger.info("Running dorado demux: %s", " ".join(command))
+    proc = subprocess.Popen(
+        command,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        text=True,
+    )
+    assert proc.stderr is not None
+    _stream_dorado_logs(proc.stderr)
+    rc = proc.wait()
+    if rc != 0:
+        raise RuntimeError(f"dorado demux failed with exit code {rc}")
     bam_files = sorted(
-        p for p in split_dir.glob(f"*{bam_suffix}")
-        if p.is_file() and p.suffix == bam_suffix
+        p for p in split_dir.glob(f"*{bam_suffix}") if p.is_file() and p.suffix == bam_suffix
     )
     if not bam_files:
         raise FileNotFoundError(f"No BAM files found in {split_dir} with suffix {bam_suffix}")
     # ---- Optional renaming with prefix ----
     renamed_bams = []
     prefix = "de" if barcode_both_ends else "se"
     for bam in bam_files:
         bam = Path(bam)
-        bai = bam.with_suffix(bam_suffix + ".bai")   # dorado’s sorting produces .bam.bai
+        bai = bam.with_suffix(bam_suffix + ".bai")  # dorado’s sorting produces .bam.bai
         if prefix:
             new_name = f"{prefix}_{bam.name}"
@@ -628,9 +823,10 @@ def demux_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit,
             bai.rename(new_bai)
         renamed_bams.append(new_bam)
     return renamed_bams
 def extract_base_identities(bam_file, chromosome, positions, max_reference_length, sequence):
     """
     Efficiently extracts base identities from mapped reads with reference coordinates.
@@ -646,14 +842,15 @@ def extract_base_identities(bam_file, chromosome, positions, max_reference_lengt
         dict: Base identities from forward mapped reads.
         dict: Base identities from reverse mapped reads.
     """
+    logger.debug("Extracting nucleotide identities for each read using extract_base_identities")
     timestamp = time.strftime("[%Y-%m-%d %H:%M:%S]")
     positions = set(positions)
-    fwd_base_identities = defaultdict(lambda: np.full(max_reference_length, 'N', dtype='<U1'))
-    rev_base_identities = defaultdict(lambda: np.full(max_reference_length, 'N', dtype='<U1'))
+    fwd_base_identities = defaultdict(lambda: np.full(max_reference_length, "N", dtype="<U1"))
+    rev_base_identities = defaultdict(lambda: np.full(max_reference_length, "N", dtype="<U1"))
     mismatch_counts_per_read = defaultdict(lambda: defaultdict(Counter))
-    #print(f"{timestamp} Reading reads from {chromosome} BAM file: {bam_file}")
+    # print(f"{timestamp} Reading reads from {chromosome} BAM file: {bam_file}")
     with pysam.AlignmentFile(str(bam_file), "rb") as bam:
         total_reads = bam.mapped
         ref_seq = sequence.upper()
@@ -676,7 +873,7 @@ def extract_base_identities(bam_file, chromosome, positions, max_reference_lengt
                     base_dict[read_name][reference_position] = read_base
                 # Track mismatches (excluding Ns)
-                if read_base != ref_base and read_base != 'N' and ref_base != 'N':
+                if read_base != ref_base and read_base != "N" and ref_base != "N":
                     mismatch_counts_per_read[read_name][ref_base][read_base] += 1
     # Determine C→T vs G→A dominance per read
@@ -694,7 +891,13 @@ def extract_base_identities(bam_file, chromosome, positions, max_reference_lengt
         else:
             mismatch_trend_per_read[read_name] = "none"
-    return dict(fwd_base_identities), dict(rev_base_identities), dict(mismatch_counts_per_read), mismatch_trend_per_read
+    return (
+        dict(fwd_base_identities),
+        dict(rev_base_identities),
+        dict(mismatch_counts_per_read),
+        mismatch_trend_per_read,
+    )
 def extract_read_features_from_bam(bam_file_path):
     """
@@ -705,7 +908,9 @@ def extract_read_features_from_bam(bam_file_path):
         read_metrics (dict)
     """
     # Open the BAM file
-    print(f'Extracting read features from BAM: {bam_file_path}')
+    logger.debug(
+        f"Extracting read metrics from BAM using extract_read_features_from_bam: {bam_file_path}"
+    )
     with pysam.AlignmentFile(bam_file_path, "rb") as bam_file:
         read_metrics = {}
         reference_lengths = bam_file.lengths  # List of lengths for each reference (chromosome)
@@ -722,10 +927,17 @@ def extract_read_features_from_bam(bam_file_path):
             reference_length = reference_lengths[reference_index]
             mapped_length = sum(end - start for start, end in read.get_blocks())
             mapping_quality = read.mapping_quality  # Phred-scaled MAPQ
-            read_metrics[read.query_name] = [read.query_length, median_read_quality, reference_length, mapped_length, mapping_quality]
+            read_metrics[read.query_name] = [
+                read.query_length,
+                median_read_quality,
+                reference_length,
+                mapped_length,
+                mapping_quality,
+            ]
     return read_metrics
 def extract_readnames_from_bam(aligned_BAM):
     """
     Takes a BAM and writes out a txt file containing read names from the BAM
@@ -738,15 +950,19 @@ def extract_readnames_from_bam(aligned_BAM):
     """
     import subprocess
     # Make a text file of reads for the BAM
-    txt_output = aligned_BAM.split('.bam')[0] + '_read_names.txt'
+    txt_output = aligned_BAM.split(".bam")[0] + "_read_names.txt"
     samtools_view = subprocess.Popen(["samtools", "view", aligned_BAM], stdout=subprocess.PIPE)
     with open(txt_output, "w") as output_file:
-        cut_process = subprocess.Popen(["cut", "-f1"], stdin=samtools_view.stdout, stdout=output_file)
+        cut_process = subprocess.Popen(
+            ["cut", "-f1"], stdin=samtools_view.stdout, stdout=output_file
+        )
     samtools_view.stdout.close()
     cut_process.wait()
     samtools_view.wait()
 def separate_bam_by_bc(input_bam, output_prefix, bam_suffix, split_dir):
     """
     Separates an input BAM file on the BC SAM tag values.
@@ -756,11 +972,12 @@ def separate_bam_by_bc(input_bam, output_prefix, bam_suffix, split_dir):
         output_prefix (str): A prefix to prepend to the output BAM file name.
         bam_suffix (str): A suffix to add to the bam file.
         split_dir (str): String indicating path to directory to split BAMs into
     Returns:
         None
             Writes out split BAM files.
     """
+    logger.debug("Demultiplexing BAM based on the BC tag")
     bam_base = input_bam.name
     bam_base_minus_suffix = input_bam.stem
@@ -773,19 +990,24 @@ def separate_bam_by_bc(input_bam, output_prefix, bam_suffix, split_dir):
             try:
                 # Get the barcode tag value
                 bc_tag = read.get_tag("BC", with_value_type=True)[0]
-                #bc_tag = read.get_tag("BC", with_value_type=True)[0].split('barcode')[1]
+                # bc_tag = read.get_tag("BC", with_value_type=True)[0].split('barcode')[1]
                 # Open the output BAM file corresponding to the barcode
                 if bc_tag not in output_files:
-                    output_path = split_dir / f"{output_prefix}_{bam_base_minus_suffix}_{bc_tag}{bam_suffix}"
-                    output_files[bc_tag] = pysam.AlignmentFile(str(output_path), "wb", header=bam.header)
+                    output_path = (
+                        split_dir / f"{output_prefix}_{bam_base_minus_suffix}_{bc_tag}{bam_suffix}"
+                    )
+                    output_files[bc_tag] = pysam.AlignmentFile(
+                        str(output_path), "wb", header=bam.header
+                    )
                 # Write the read to the corresponding output BAM file
                 output_files[bc_tag].write(read)
             except KeyError:
-                 print(f"BC tag not present for read: {read.query_name}")
+                logger.warning(f"BC tag not present for read: {read.query_name}")
     # Close all output BAM files
     for output_file in output_files.values():
         output_file.close()
 def split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix):
     """
     A wrapper function for splitting BAMS and indexing them.
@@ -793,19 +1015,20 @@ def split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix):
         aligned_sorted_BAM (str): A string representing the file path of the aligned_sorted BAM file.
         split_dir (str): A string representing the file path to the directory to split the BAMs into.
         bam_suffix (str): A suffix to add to the bam file.
     Returns:
         None
             Splits an input BAM file on barcode value and makes a BAM index file.
     """
+    logger.debug("Demultiplexing and indexing BAMS based on BC tag using split_and_index_BAM")
     aligned_sorted_output = aligned_sorted_BAM + bam_suffix
     file_prefix = date_string()
     separate_bam_by_bc(aligned_sorted_output, file_prefix, bam_suffix, split_dir)
     # Make a BAM index file for the BAMs in that directory
-    bam_pattern = '*' + bam_suffix
+    bam_pattern = "*" + bam_suffix
     bam_files = glob.glob(split_dir / bam_pattern)
-    bam_files = [str(bam) for bam in bam_files if '.bai' not in str(bam)]
+    bam_files = [str(bam) for bam in bam_files if ".bai" not in str(bam)]
     for input_file in bam_files:
         pysam.index(input_file)
-    return bam_files
+    return bam_files

smftools 0.2.4__py3-none-any.whl → 0.2.5__py3-none-any.whl

smftools 0.2.4py3-none-any.whl → 0.2.5py3-none-any.whl