smftools 0.1.7__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (174)
  1. smftools/__init__.py +7 -6
  2. smftools/_version.py +1 -1
  3. smftools/cli/cli_flows.py +94 -0
  4. smftools/cli/hmm_adata.py +338 -0
  5. smftools/cli/load_adata.py +577 -0
  6. smftools/cli/preprocess_adata.py +363 -0
  7. smftools/cli/spatial_adata.py +564 -0
  8. smftools/cli_entry.py +435 -0
  9. smftools/config/__init__.py +1 -0
  10. smftools/config/conversion.yaml +38 -0
  11. smftools/config/deaminase.yaml +61 -0
  12. smftools/config/default.yaml +264 -0
  13. smftools/config/direct.yaml +41 -0
  14. smftools/config/discover_input_files.py +115 -0
  15. smftools/config/experiment_config.py +1288 -0
  16. smftools/hmm/HMM.py +1576 -0
  17. smftools/hmm/__init__.py +20 -0
  18. smftools/{tools → hmm}/apply_hmm_batched.py +8 -7
  19. smftools/hmm/call_hmm_peaks.py +106 -0
  20. smftools/{tools → hmm}/display_hmm.py +3 -3
  21. smftools/{tools → hmm}/nucleosome_hmm_refinement.py +2 -2
  22. smftools/{tools → hmm}/train_hmm.py +1 -1
  23. smftools/informatics/__init__.py +13 -9
  24. smftools/informatics/archived/deaminase_smf.py +132 -0
  25. smftools/informatics/archived/fast5_to_pod5.py +43 -0
  26. smftools/informatics/archived/helpers/archived/__init__.py +71 -0
  27. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
  28. smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +87 -0
  29. smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
  30. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
  31. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
  32. smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
  33. smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
  34. smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +30 -4
  35. smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
  36. smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +4 -2
  37. smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +5 -4
  38. smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
  39. smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
  40. smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
  41. smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
  42. smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
  43. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +250 -0
  44. smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +8 -7
  45. smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +8 -12
  46. smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
  47. smftools/informatics/bam_functions.py +812 -0
  48. smftools/informatics/basecalling.py +67 -0
  49. smftools/informatics/bed_functions.py +366 -0
  50. smftools/informatics/binarize_converted_base_identities.py +172 -0
  51. smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +198 -50
  52. smftools/informatics/fasta_functions.py +255 -0
  53. smftools/informatics/h5ad_functions.py +197 -0
  54. smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +147 -61
  55. smftools/informatics/modkit_functions.py +129 -0
  56. smftools/informatics/ohe.py +160 -0
  57. smftools/informatics/pod5_functions.py +224 -0
  58. smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
  59. smftools/machine_learning/__init__.py +12 -0
  60. smftools/machine_learning/data/__init__.py +2 -0
  61. smftools/machine_learning/data/anndata_data_module.py +234 -0
  62. smftools/machine_learning/evaluation/__init__.py +2 -0
  63. smftools/machine_learning/evaluation/eval_utils.py +31 -0
  64. smftools/machine_learning/evaluation/evaluators.py +223 -0
  65. smftools/machine_learning/inference/__init__.py +3 -0
  66. smftools/machine_learning/inference/inference_utils.py +27 -0
  67. smftools/machine_learning/inference/lightning_inference.py +68 -0
  68. smftools/machine_learning/inference/sklearn_inference.py +55 -0
  69. smftools/machine_learning/inference/sliding_window_inference.py +114 -0
  70. smftools/machine_learning/models/base.py +295 -0
  71. smftools/machine_learning/models/cnn.py +138 -0
  72. smftools/machine_learning/models/lightning_base.py +345 -0
  73. smftools/machine_learning/models/mlp.py +26 -0
  74. smftools/{tools → machine_learning}/models/positional.py +3 -2
  75. smftools/{tools → machine_learning}/models/rnn.py +2 -1
  76. smftools/machine_learning/models/sklearn_models.py +273 -0
  77. smftools/machine_learning/models/transformer.py +303 -0
  78. smftools/machine_learning/training/__init__.py +2 -0
  79. smftools/machine_learning/training/train_lightning_model.py +135 -0
  80. smftools/machine_learning/training/train_sklearn_model.py +114 -0
  81. smftools/plotting/__init__.py +4 -1
  82. smftools/plotting/autocorrelation_plotting.py +609 -0
  83. smftools/plotting/general_plotting.py +1292 -140
  84. smftools/plotting/hmm_plotting.py +260 -0
  85. smftools/plotting/qc_plotting.py +270 -0
  86. smftools/preprocessing/__init__.py +15 -8
  87. smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
  88. smftools/preprocessing/append_base_context.py +122 -0
  89. smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
  90. smftools/preprocessing/binarize.py +17 -0
  91. smftools/preprocessing/binarize_on_Youden.py +2 -2
  92. smftools/preprocessing/calculate_complexity_II.py +248 -0
  93. smftools/preprocessing/calculate_coverage.py +10 -1
  94. smftools/preprocessing/calculate_position_Youden.py +1 -1
  95. smftools/preprocessing/calculate_read_modification_stats.py +101 -0
  96. smftools/preprocessing/clean_NaN.py +17 -1
  97. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
  98. smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
  99. smftools/preprocessing/flag_duplicate_reads.py +1326 -124
  100. smftools/preprocessing/invert_adata.py +12 -5
  101. smftools/preprocessing/load_sample_sheet.py +19 -4
  102. smftools/readwrite.py +1021 -89
  103. smftools/tools/__init__.py +3 -32
  104. smftools/tools/calculate_umap.py +5 -5
  105. smftools/tools/general_tools.py +3 -3
  106. smftools/tools/position_stats.py +468 -106
  107. smftools/tools/read_stats.py +115 -1
  108. smftools/tools/spatial_autocorrelation.py +562 -0
  109. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/METADATA +14 -9
  110. smftools-0.2.3.dist-info/RECORD +173 -0
  111. smftools-0.2.3.dist-info/entry_points.txt +2 -0
  112. smftools/informatics/fast5_to_pod5.py +0 -21
  113. smftools/informatics/helpers/LoadExperimentConfig.py +0 -75
  114. smftools/informatics/helpers/__init__.py +0 -74
  115. smftools/informatics/helpers/align_and_sort_BAM.py +0 -59
  116. smftools/informatics/helpers/aligned_BAM_to_bed.py +0 -74
  117. smftools/informatics/helpers/bam_qc.py +0 -66
  118. smftools/informatics/helpers/bed_to_bigwig.py +0 -39
  119. smftools/informatics/helpers/binarize_converted_base_identities.py +0 -79
  120. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -55
  121. smftools/informatics/helpers/index_fasta.py +0 -12
  122. smftools/informatics/helpers/make_dirs.py +0 -21
  123. smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +0 -53
  124. smftools/informatics/load_adata.py +0 -182
  125. smftools/informatics/readwrite.py +0 -106
  126. smftools/informatics/subsample_fasta_from_bed.py +0 -47
  127. smftools/preprocessing/append_C_context.py +0 -82
  128. smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -94
  129. smftools/preprocessing/filter_converted_reads_on_methylation.py +0 -44
  130. smftools/preprocessing/filter_reads_on_length.py +0 -51
  131. smftools/tools/call_hmm_peaks.py +0 -105
  132. smftools/tools/data/__init__.py +0 -2
  133. smftools/tools/data/anndata_data_module.py +0 -90
  134. smftools/tools/inference/__init__.py +0 -1
  135. smftools/tools/inference/lightning_inference.py +0 -41
  136. smftools/tools/models/base.py +0 -14
  137. smftools/tools/models/cnn.py +0 -34
  138. smftools/tools/models/lightning_base.py +0 -41
  139. smftools/tools/models/mlp.py +0 -17
  140. smftools/tools/models/sklearn_models.py +0 -40
  141. smftools/tools/models/transformer.py +0 -133
  142. smftools/tools/training/__init__.py +0 -1
  143. smftools/tools/training/train_lightning_model.py +0 -47
  144. smftools-0.1.7.dist-info/RECORD +0 -136
  145. /smftools/{tools/evaluation → cli}/__init__.py +0 -0
  146. /smftools/{tools → hmm}/calculate_distances.py +0 -0
  147. /smftools/{tools → hmm}/hmm_readwrite.py +0 -0
  148. /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
  149. /smftools/informatics/{conversion_smf.py → archived/conversion_smf.py} +0 -0
  150. /smftools/informatics/{direct_smf.py → archived/direct_smf.py} +0 -0
  151. /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
  152. /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
  153. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
  154. /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
  155. /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
  156. /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
  157. /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
  158. /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
  159. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
  160. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
  161. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
  162. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
  163. /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
  164. /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
  165. /smftools/{tools → machine_learning}/data/preprocessing.py +0 -0
  166. /smftools/{tools → machine_learning}/models/__init__.py +0 -0
  167. /smftools/{tools → machine_learning}/models/wrappers.py +0 -0
  168. /smftools/{tools → machine_learning}/utils/__init__.py +0 -0
  169. /smftools/{tools → machine_learning}/utils/device.py +0 -0
  170. /smftools/{tools → machine_learning}/utils/grl.py +0 -0
  171. /smftools/tools/{apply_hmm.py → archived/apply_hmm.py} +0 -0
  172. /smftools/tools/{classifiers.py → archived/classifiers.py} +0 -0
  173. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/WHEEL +0 -0
  174. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/licenses/LICENSE +0 -0
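The move entries above amount to a package reorganization: HMM utilities leave smftools.tools for a new smftools.hmm subpackage, the model/training stack moves from smftools.tools to smftools.machine_learning, and the old informatics/helpers tree is retired under informatics/archived. A hypothetical before/after import sketch — the module paths follow the moves listed above, but the specific symbol names are assumptions, not confirmed exports:

# smftools 0.1.7 (hypothetical imports under the old layout)
# from smftools.tools.train_hmm import train_hmm
# from smftools.tools.models.cnn import ...

# smftools 0.2.3 (the same modules at their new paths)
# from smftools.hmm.train_hmm import train_hmm
# from smftools.machine_learning.models.cnn import ...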
smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py
@@ -0,0 +1,87 @@
+ def aligned_BAM_to_bed(aligned_BAM, out_dir, fasta, make_bigwigs, threads=None):
+     """
+     Takes an aligned BAM as input and writes a BED file of reads as output.
+     BED columns are: record name, start position, end position, read length, read name, mapping quality, read quality.
+
+     Parameters:
+         aligned_BAM (Path): Path to an input aligned BAM to extract to a BED file.
+         out_dir (Path): Directory to output files.
+         fasta (str): File path to the reference genome.
+         make_bigwigs (bool): Whether to generate bigwig files.
+         threads (int): Number of threads to use.
+
+     Returns:
+         None
+     """
+     import subprocess
+     import os
+     from pathlib import Path
+     import pysam
+     import numpy as np
+     import concurrent.futures
+     from concurrent.futures import ProcessPoolExecutor
+     from .bed_to_bigwig import bed_to_bigwig
+     from ...readwrite import make_dirs
+     from .plot_bed_histograms import plot_bed_histograms
+
+     threads = threads or os.cpu_count()  # Use max available cores if not specified
+
+     # Create necessary directories
+     plotting_dir = out_dir / "bed_cov_histograms"
+     bed_dir = out_dir / "beds"
+     make_dirs([plotting_dir, bed_dir])
+
+     bed_output = bed_dir / str(aligned_BAM.name).replace(".bam", "_bed.bed")
+
+     print(f"Creating BED-like file from BAM (with MAPQ and avg base quality): {aligned_BAM}")
+
+     with pysam.AlignmentFile(aligned_BAM, "rb") as bam, open(bed_output, "w") as out:
+         for read in bam.fetch(until_eof=True):
+             if read.is_unmapped:
+                 chrom = "*"
+                 start1 = 1
+                 rl = read.query_length or 0
+                 mapq = 0
+             else:
+                 chrom = bam.get_reference_name(read.reference_id)
+                 # pysam reference_start is 0-based → +1 for 1-based SAM-like start
+                 start1 = int(read.reference_start) + 1
+                 rl = read.query_length or 0
+                 mapq = int(read.mapping_quality)
+
+             # End position in 1-based inclusive coords
+             end1 = start1 + (rl or 0) - 1
+
+             qname = read.query_name
+             quals = read.query_qualities
+             if quals is None or rl == 0:
+                 avg_q = float("nan")
+             else:
+                 avg_q = float(np.mean(quals))
+
+             out.write(f"{chrom}\t{start1}\t{end1}\t{rl}\t{qname}\t{mapq}\t{avg_q:.3f}\n")
+
+     print(f"BED-like file created: {bed_output}")
+
+     def split_bed(bed):
+         """Splits into aligned and unaligned reads (chrom == '*')."""
+         bed = str(bed)
+         aligned = bed.replace(".bed", "_aligned.bed")
+         unaligned = bed.replace(".bed", "_unaligned.bed")
+         with open(bed, "r") as infile, open(aligned, "w") as aligned_out, open(unaligned, "w") as unaligned_out:
+             for line in infile:
+                 (unaligned_out if line.startswith("*\t") else aligned_out).write(line)
+         os.remove(bed)
+         return aligned
+
+     print(f"Splitting: {bed_output}")
+     aligned_bed = split_bed(bed_output)
+
+     with ProcessPoolExecutor() as executor:
+         futures = []
+         futures.append(executor.submit(plot_bed_histograms, aligned_bed, plotting_dir, fasta))
+         if make_bigwigs:
+             futures.append(executor.submit(bed_to_bigwig, fasta, aligned_bed))
+         concurrent.futures.wait(futures)
+
+     print("Processing completed successfully.")
smftools/informatics/archived/helpers/archived/bam_qc.py
@@ -0,0 +1,213 @@
+ from __future__ import annotations
+
+ import os
+ from pathlib import Path
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from typing import Iterable, Optional, Tuple, List
+
+ def bam_qc(
+     bam_files: Iterable[str | Path],
+     bam_qc_dir: str | Path,
+     threads: Optional[int],
+     modality: str,
+     stats: bool = True,
+     flagstats: bool = True,
+     idxstats: bool = True,
+ ) -> None:
+     """
+     QC for BAM/CRAMs: stats, flagstat, idxstats.
+     Prefers pysam; falls back to `samtools` if needed.
+     Runs BAMs in parallel (up to `threads`, default serial).
+     """
+     import subprocess
+     import shutil
+
+     # Try to import pysam once
+     try:
+         import pysam
+         HAVE_PYSAM = True
+     except Exception:
+         HAVE_PYSAM = False
+
+     bam_qc_dir = Path(bam_qc_dir)
+     bam_qc_dir.mkdir(parents=True, exist_ok=True)
+
+     bam_files = [Path(b) for b in bam_files]
+
+     def _has_index(p: Path) -> bool:
+         if p.suffix.lower() == ".bam":
+             bai = p.with_suffix(p.suffix + ".bai")
+             bai_alt = Path(str(p) + ".bai")
+             return bai.exists() or bai_alt.exists()
+         if p.suffix.lower() == ".cram":
+             crai = Path(str(p) + ".crai")
+             return crai.exists()
+         return False
+
+     def _ensure_index(p: Path) -> None:
+         if _has_index(p):
+             return
+         if HAVE_PYSAM:
+             # pysam.index supports both BAM & CRAM
+             pysam.index(str(p))
+         else:
+             cmd = ["samtools", "index", str(p)]
+             subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+     def _run_one(bam: Path) -> Tuple[Path, List[Tuple[str, int]]]:
+         # outputs + return (file, [(task_name, returncode)])
+         results: List[Tuple[str, int]] = []
+         base = bam.stem  # filename without .bam
+         out_stats = bam_qc_dir / f"{base}_stats.txt"
+         out_flag = bam_qc_dir / f"{base}_flagstat.txt"
+         out_idx = bam_qc_dir / f"{base}_idxstats.txt"
+
+         # Make sure index exists (samtools stats/flagstat don't require one, idxstats does)
+         try:
+             _ensure_index(bam)
+         except Exception as e:
+             # Still attempt stats/flagstat if requested
+             print(f"[warn] Indexing failed for {bam}: {e}")
+
+         # Choose runner per task
+         def run_stats():
+             if not stats:
+                 return
+             if HAVE_PYSAM and hasattr(pysam, "stats"):
+                 txt = pysam.stats(str(bam))
+                 out_stats.write_text(txt)
+                 results.append(("stats(pysam)", 0))
+             else:
+                 cmd = ["samtools", "stats", str(bam)]
+                 with open(out_stats, "w") as fh:
+                     cp = subprocess.run(cmd, stdout=fh, stderr=subprocess.PIPE)
+                 results.append(("stats(samtools)", cp.returncode))
+                 if cp.returncode != 0:
+                     raise RuntimeError(cp.stderr.decode(errors="replace"))
+
+         def run_flagstat():
+             if not flagstats:
+                 return
+             if HAVE_PYSAM and hasattr(pysam, "flagstat"):
+                 txt = pysam.flagstat(str(bam))
+                 out_flag.write_text(txt)
+                 results.append(("flagstat(pysam)", 0))
+             else:
+                 cmd = ["samtools", "flagstat", str(bam)]
+                 with open(out_flag, "w") as fh:
+                     cp = subprocess.run(cmd, stdout=fh, stderr=subprocess.PIPE)
+                 results.append(("flagstat(samtools)", cp.returncode))
+                 if cp.returncode != 0:
+                     raise RuntimeError(cp.stderr.decode(errors="replace"))
+
+         def run_idxstats():
+             if not idxstats:
+                 return
+             if HAVE_PYSAM and hasattr(pysam, "idxstats"):
+                 txt = pysam.idxstats(str(bam))
+                 out_idx.write_text(txt)
+                 results.append(("idxstats(pysam)", 0))
+             else:
+                 cmd = ["samtools", "idxstats", str(bam)]
+                 with open(out_idx, "w") as fh:
+                     cp = subprocess.run(cmd, stdout=fh, stderr=subprocess.PIPE)
+                 results.append(("idxstats(samtools)", cp.returncode))
+                 if cp.returncode != 0:
+                     raise RuntimeError(cp.stderr.decode(errors="replace"))
+
+         # Sanity: ensure samtools exists if pysam missing
+         if not HAVE_PYSAM:
+             if not shutil.which("samtools"):
+                 raise RuntimeError("Neither pysam nor samtools is available in PATH.")
+
+         # Execute tasks (serial per file; parallelized across files)
+         run_stats()
+         run_flagstat()
+         run_idxstats()
+         return bam, results
+
+     # Parallel across BAMs
+     max_workers = int(threads) if threads and int(threads) > 0 else 1
+     futures = []
+     with ThreadPoolExecutor(max_workers=max_workers) as ex:
+         for b in bam_files:
+             futures.append(ex.submit(_run_one, b))
+
+         for fut in as_completed(futures):
+             try:
+                 bam, res = fut.result()
+                 summary = ", ".join(f"{name}:{rc}" for name, rc in res) or "no-op"
+                 print(f"[qc] {bam.name}: {summary}")
+             except Exception as e:
+                 print(f"[error] QC failed: {e}")
+
+     # Placeholders to keep your signature stable
+     if modality not in {"conversion", "direct"}:
+         print(f"[warn] Unknown modality '{modality}', continuing.")
+
+     print("QC processing completed.")
+
+ # def bam_qc(bam_files, bam_qc_dir, threads, modality, stats=True, flagstats=True, idxstats=True):
+ #     """
+ #     Performs QC on BAM files by running samtools stats, flagstat, and idxstats.
+
+ #     Parameters:
+ #         - bam_files: List of BAM file paths.
+ #         - bam_qc_dir: Directory to save QC reports.
+ #         - threads: Number of threads to use.
+ #         - modality: 'conversion' or 'direct' (affects processing mode).
+ #         - stats: Run `samtools stats` if True.
+ #         - flagstats: Run `samtools flagstat` if True.
+ #         - idxstats: Run `samtools idxstats` if True.
+ #     """
+ #     import os
+ #     import subprocess
+
+ #     # Ensure the QC output directory exists
+ #     os.makedirs(bam_qc_dir, exist_ok=True)
+
+ #     if threads:
+ #         threads = str(threads)
+ #     else:
+ #         pass
+
+ #     for bam in bam_files:
+ #         bam_name = os.path.basename(bam).replace(".bam", "")  # Extract filename without extension
+
+ #         # Run samtools QC commands based on selected options
+ #         if stats:
+ #             stats_out = os.path.join(bam_qc_dir, f"{bam_name}_stats.txt")
+ #             if threads:
+ #                 command = ["samtools", "stats", "-@", threads, bam]
+ #             else:
+ #                 command = ["samtools", "stats", bam]
+ #             print(f"Running: {' '.join(command)} > {stats_out}")
+ #             with open(stats_out, "w") as out_file:
+ #                 subprocess.run(command, stdout=out_file)
+
+ #         if flagstats:
+ #             flagstats_out = os.path.join(bam_qc_dir, f"{bam_name}_flagstat.txt")
+ #             if threads:
+ #                 command = ["samtools", "flagstat", "-@", threads, bam]
+ #             else:
+ #                 command = ["samtools", "flagstat", bam]
+ #             print(f"Running: {' '.join(command)} > {flagstats_out}")
+ #             with open(flagstats_out, "w") as out_file:
+ #                 subprocess.run(command, stdout=out_file)
+
+ #         if idxstats:
+ #             idxstats_out = os.path.join(bam_qc_dir, f"{bam_name}_idxstats.txt")
+ #             if threads:
+ #                 command = ["samtools", "idxstats", "-@", threads, bam]
+ #             else:
+ #                 command = ["samtools", "idxstats", bam]
+ #             print(f"Running: {' '.join(command)} > {idxstats_out}")
+ #             with open(idxstats_out, "w") as out_file:
+ #                 subprocess.run(command, stdout=out_file)
+
+ #     if modality == 'conversion':
+ #         pass
+ #     elif modality == 'direct':
+ #         pass
+
+ #     print("QC processing completed.")
smftools/informatics/archived/helpers/archived/bed_to_bigwig.py
@@ -0,0 +1,90 @@
+ from pathlib import Path
+ import pybedtools
+ import pyBigWig
+
+ def bed_to_bigwig(fasta: str, bed: str) -> str:
+     """
+     BED → bedGraph → bigWig
+     Requires:
+       - FASTA must have a .fai index present (samtools faidx)
+     """
+
+     bed = Path(bed)
+     fa = Path(fasta)  # path to .fa
+     parent = bed.parent
+     stem = bed.stem
+     # samtools faidx writes the index as <fasta>.fai alongside the FASTA
+     fai = fa.parent / f"{fa.name}.fai"
+
+     bedgraph = parent / f"{stem}.bedgraph"
+     bigwig = parent / f"{stem}.bw"
+
+     # 1) Compute coverage → bedGraph
+     print(f"[pybedtools] generating coverage bedgraph from {bed}")
+     bt = pybedtools.BedTool(str(bed))
+     # bedtools genomecov -bg -g <fai>; the .fai doubles as a chrom-sizes file
+     coverage = bt.genome_coverage(bg=True, g=str(fai))
+     coverage.saveas(str(bedgraph))
+
+     # 2) Convert bedGraph → BigWig via pyBigWig
+     print(f"[pyBigWig] converting bedgraph → bigwig: {bigwig}")
+
+     # read chrom sizes from the FASTA .fai index
+     chrom_sizes = {}
+     with open(fai) as f:
+         for line in f:
+             fields = line.strip().split("\t")
+             chrom = fields[0]
+             size = int(fields[1])
+             chrom_sizes[chrom] = size
+
+     bw = pyBigWig.open(str(bigwig), "w")
+     bw.addHeader(list(chrom_sizes.items()))
+
+     with open(bedgraph) as f:
+         for line in f:
+             chrom, start, end, cov = line.strip().split()
+             # pyBigWig's interval form expects list arguments
+             bw.addEntries([chrom], [int(start)], ends=[int(end)], values=[float(cov)])
+
+     bw.close()
+
+     print(f"BigWig written: {bigwig}")
+     return str(bigwig)
+
+ # def bed_to_bigwig(fasta, bed):
+ #     """
+ #     Takes a bed file of reads and makes a bedgraph plus a bigwig.
+
+ #     Parameters:
+ #         fasta (str): File path to the reference genome to align to.
+ #         bed (str): File path to the input bed.
+ #     Returns:
+ #         None
+ #     """
+ #     import os
+ #     import subprocess
+
+ #     bed_basename = os.path.basename(bed)
+ #     parent_dir = os.path.dirname(bed)
+ #     bed_basename_minus_suffix = bed_basename.split('.bed')[0]
+ #     fasta_basename = os.path.basename(fasta)
+ #     fasta_dir = os.path.dirname(fasta)
+ #     fasta_basename_minus_suffix = fasta_basename.split('.fa')[0]
+ #     chrom_basename = fasta_basename_minus_suffix + '.chrom.sizes'
+ #     chrom_path = os.path.join(fasta_dir, chrom_basename)
+ #     bedgraph_basename = bed_basename_minus_suffix + '_bedgraph.bedgraph'
+ #     bedgraph_output = os.path.join(parent_dir, bedgraph_basename)
+ #     bigwig_basename = bed_basename_minus_suffix + '_bigwig.bw'
+ #     bigwig_output = os.path.join(parent_dir, bigwig_basename)
+
+ #     # Make the bedgraph
+ #     with open(bedgraph_output, 'w') as outfile:
+ #         # Command as a list
+ #         command = ["bedtools", "genomecov", "-i", bed, "-g", chrom_path, "-bg"]
+ #         print(f'Making bedgraph from {bed_basename}')
+ #         subprocess.run(command, stdout=outfile)
+
+ #     # Make the bigwig
+ #     command = ["bedGraphToBigWig", bedgraph_output, chrom_path, bigwig_output]
+ #     print(f'Making bigwig from {bedgraph_basename}')
+ #     subprocess.run(command)
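A usage sketch for bed_to_bigwig, assuming the reference has been indexed first so that genome.fa.fai exists; paths are hypothetical:

import pysam

pysam.faidx("ref/genome.fa")  # creates ref/genome.fa.fai if missing
bw_path = bed_to_bigwig("ref/genome.fa", "results/beds/sample1_bed_aligned.bed")
# produces sample1_bed_aligned.bedgraph and sample1_bed_aligned.bw next to the BED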
smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py
@@ -0,0 +1,259 @@
+ from __future__ import annotations
+
+ from pathlib import Path
+ from typing import Dict, List, Any, Tuple, Union, Optional
+ import re
+ from itertools import zip_longest
+
+ import pysam
+ from tqdm import tqdm
+
+
+ def concatenate_fastqs_to_bam(
+     fastq_files: List[Union[str, Tuple[str, str], Path, Tuple[Path, Path]]],
+     output_bam: Union[str, Path],
+     barcode_tag: str = "BC",
+     barcode_map: Optional[Dict[Union[str, Path], str]] = None,
+     add_read_group: bool = True,
+     rg_sample_field: Optional[str] = None,
+     progress: bool = True,
+     auto_pair: bool = True,
+ ) -> Dict[str, Any]:
+     """
+     Concatenate FASTQ(s) into an **unaligned** BAM. Supports single-end and paired-end.
+
+     Parameters
+     ----------
+     fastq_files : list[Path|str] or list[(Path|str, Path|str)]
+         Either explicit pairs (R1,R2) or a flat list of FASTQs (auto-paired if auto_pair=True).
+     output_bam : Path|str
+         Output BAM path (parent directory will be created).
+     barcode_tag : str
+         SAM tag used to store barcode on each read (default 'BC').
+     barcode_map : dict or None
+         Optional mapping {path: barcode} to override automatic filename-based barcode extraction.
+     add_read_group : bool
+         If True, add @RG header lines (ID = barcode) and set each read's RG tag.
+     rg_sample_field : str or None
+         If set, include SM=<value> in @RG.
+     progress : bool
+         Show tqdm progress bars.
+     auto_pair : bool
+         Auto-pair R1/R2 based on filename patterns if given a flat list.
+
+     Returns
+     -------
+     dict
+         {'total_reads','per_file','paired_pairs_written','singletons_written','barcodes'}
+     """
+
+     # ---------- helpers (Pathlib-only) ----------
+     def _strip_fastq_ext(p: Path) -> str:
+         """Remove common FASTQ multi-suffixes; return stem-like name."""
+         name = p.name
+         lowers = name.lower()
+         for ext in (".fastq.gz", ".fq.gz", ".fastq.bz2", ".fq.bz2", ".fastq.xz", ".fq.xz", ".fastq", ".fq"):
+             if lowers.endswith(ext):
+                 return name[: -len(ext)]
+         return p.stem  # fallback: remove last suffix only
+
+     def _extract_barcode_from_filename(p: Path) -> str:
+         stem = _strip_fastq_ext(p)
+         if "_" in stem:
+             token = stem.split("_")[-1]
+             if token:
+                 return token
+         return stem
+
+     def _classify_read_token(stem: str) -> Tuple[Optional[str], Optional[int]]:
+         # return (prefix, readnum) if matches; else (None, None)
+         patterns = [
+             r"(?i)(.*?)[._-]r?([12])$",         # prefix_R1 / prefix.r2 / prefix-1
+             r"(?i)(.*?)[._-]read[_-]?([12])$",  # prefix_read1
+         ]
+         for pat in patterns:
+             m = re.match(pat, stem)
+             if m:
+                 return m.group(1), int(m.group(2))
+         return None, None
+
+     def _pair_by_filename(paths: List[Path]) -> Tuple[List[Tuple[Path, Path]], List[Path]]:
+         pref_map: Dict[str, Dict[int, Path]] = {}
+         unpaired: List[Path] = []
+         for pth in paths:
+             stem = _strip_fastq_ext(pth)
+             pref, num = _classify_read_token(stem)
+             if pref is None:
+                 unpaired.append(pth)
+             else:
+                 entry = pref_map.setdefault(pref, {})
+                 entry[num] = pth
+         pairs: List[Tuple[Path, Path]] = []
+         leftovers: List[Path] = []
+         for d in pref_map.values():
+             if 1 in d and 2 in d:
+                 pairs.append((d[1], d[2]))
+             else:
+                 leftovers.extend(d.values())
+         leftovers.extend(unpaired)
+         return pairs, leftovers
+
+     def _fastq_iter(p: Path):
+         # pysam.FastxFile handles compressed extensions transparently
+         with pysam.FastxFile(str(p)) as fx:
+             for rec in fx:
+                 yield rec  # rec.name, rec.sequence, rec.quality
+
+     def _make_unaligned_segment(
+         name: str,
+         seq: str,
+         qual: Optional[str],
+         bc: str,
+         read1: bool,
+         read2: bool,
+     ) -> pysam.AlignedSegment:
+         a = pysam.AlignedSegment()
+         a.query_name = name
+         a.query_sequence = seq
+         if qual is not None:
+             a.query_qualities = pysam.qualitystring_to_array(qual)
+         a.is_unmapped = True
+         a.is_paired = read1 or read2
+         a.is_read1 = read1
+         a.is_read2 = read2
+         a.mate_is_unmapped = a.is_paired
+         a.reference_id = -1
+         a.reference_start = -1
+         a.next_reference_id = -1
+         a.next_reference_start = -1
+         a.template_length = 0
+         a.set_tag(barcode_tag, str(bc), value_type="Z")
+         if add_read_group:
+             a.set_tag("RG", str(bc), value_type="Z")
+         return a
+
+     # ---------- normalize inputs to Path ----------
+     def _to_path_pair(x) -> Tuple[Path, Path]:
+         a, b = x
+         return Path(a), Path(b)
+
+     explicit_pairs: List[Tuple[Path, Path]] = []
+     singles: List[Path] = []
+
+     if not isinstance(fastq_files, (list, tuple)):
+         raise ValueError("fastq_files must be a list of paths or list of (R1,R2) tuples.")
+
+     if all(isinstance(x, (list, tuple)) and len(x) == 2 for x in fastq_files):
+         explicit_pairs = [_to_path_pair(x) for x in fastq_files]
+     else:
+         flat_paths = [Path(x) for x in fastq_files if x is not None]
+         if auto_pair:
+             explicit_pairs, leftovers = _pair_by_filename(flat_paths)
+             singles = leftovers
+         else:
+             singles = flat_paths
+
+     output_bam = Path(output_bam)
+     output_bam.parent.mkdir(parents=True, exist_ok=True)
+
+     # ---------- barcodes ----------
+     barcode_map = {Path(k): v for k, v in (barcode_map or {}).items()}
+     per_path_barcode: Dict[Path, str] = {}
+     barcodes_in_order: List[str] = []
+
+     for r1, r2 in explicit_pairs:
+         bc = barcode_map.get(r1) or barcode_map.get(r2) or _extract_barcode_from_filename(r1)
+         per_path_barcode[r1] = bc
+         per_path_barcode[r2] = bc
+         if bc not in barcodes_in_order:
+             barcodes_in_order.append(bc)
+     for pth in singles:
+         bc = barcode_map.get(pth) or _extract_barcode_from_filename(pth)
+         per_path_barcode[pth] = bc
+         if bc not in barcodes_in_order:
+             barcodes_in_order.append(bc)
+
+     # ---------- BAM header ----------
+     header = {"HD": {"VN": "1.6", "SO": "unknown"}, "SQ": []}
+     if add_read_group:
+         header["RG"] = [{"ID": bc, **({"SM": rg_sample_field} if rg_sample_field else {})} for bc in barcodes_in_order]
+     header.setdefault("PG", []).append(
+         {"ID": "concat-fastq", "PN": "concatenate_fastqs_to_bam", "VN": "1"}
+     )
+
+     # ---------- counters ----------
+     per_file_counts: Dict[Path, int] = {}
+     total_written = 0
+     paired_pairs_written = 0
+     singletons_written = 0
+
+     # ---------- write BAM ----------
+     with pysam.AlignmentFile(str(output_bam), "wb", header=header) as bam_out:
+         # Paired
+         it_pairs = explicit_pairs
+         if progress and it_pairs:
+             it_pairs = tqdm(it_pairs, desc="Paired FASTQ→BAM")
+         for r1_path, r2_path in it_pairs:
+             if not (r1_path.exists() and r2_path.exists()):
+                 raise FileNotFoundError(f"Paired file missing: {r1_path} or {r2_path}")
+             bc = per_path_barcode.get(r1_path) or per_path_barcode.get(r2_path) or "barcode"
+
+             it1 = _fastq_iter(r1_path)
+             it2 = _fastq_iter(r2_path)
+
+             for rec1, rec2 in zip_longest(it1, it2, fillvalue=None):
+                 def _clean(n: Optional[str]) -> Optional[str]:
+                     if n is None:
+                         return None
+                     return re.sub(r"(?:/1$|/2$|\s[12]$)", "", n)
+
+                 name = (
+                     _clean(getattr(rec1, "name", None))
+                     or _clean(getattr(rec2, "name", None))
+                     or getattr(rec1, "name", None)
+                     or getattr(rec2, "name", None)
+                 )
+
+                 if rec1 is not None:
+                     a1 = _make_unaligned_segment(name, rec1.sequence, rec1.quality, bc, read1=True, read2=False)
+                     bam_out.write(a1)
+                     per_file_counts[r1_path] = per_file_counts.get(r1_path, 0) + 1
+                     total_written += 1
+                 if rec2 is not None:
+                     a2 = _make_unaligned_segment(name, rec2.sequence, rec2.quality, bc, read1=False, read2=True)
+                     bam_out.write(a2)
+                     per_file_counts[r2_path] = per_file_counts.get(r2_path, 0) + 1
+                     total_written += 1
+
+                 if rec1 is not None and rec2 is not None:
+                     paired_pairs_written += 1
+                 else:
+                     if rec1 is not None:
+                         singletons_written += 1
+                     if rec2 is not None:
+                         singletons_written += 1
+
+         # Singles
+         it_singles = singles
+         if progress and it_singles:
+             it_singles = tqdm(it_singles, desc="Single FASTQ→BAM")
+         for pth in it_singles:
+             if not pth.exists():
+                 raise FileNotFoundError(pth)
+             bc = per_path_barcode.get(pth, "barcode")
+             for rec in _fastq_iter(pth):
+                 a = _make_unaligned_segment(rec.name, rec.sequence, rec.quality, bc, read1=False, read2=False)
+                 bam_out.write(a)
+                 per_file_counts[pth] = per_file_counts.get(pth, 0) + 1
+                 total_written += 1
+                 singletons_written += 1
+
+     return {
+         "total_reads": total_written,
+         "per_file": {str(k): v for k, v in per_file_counts.items()},
+         "paired_pairs_written": paired_pairs_written,
+         "singletons_written": singletons_written,
+         "barcodes": barcodes_in_order,
+     }
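A usage sketch with hypothetical files, relying on the auto-pairing described in the docstring (sampleA_R1/_R2 pair up by filename); an explicit barcode_map is passed here because the automatic fallback takes the last underscore-delimited token of the filename, which for a pair would be the read token:

summary = concatenate_fastqs_to_bam(
    fastq_files=["run1/sampleA_R1.fastq.gz", "run1/sampleA_R2.fastq.gz"],  # auto-paired
    output_bam="unaligned/sampleA.bam",
    barcode_map={
        "run1/sampleA_R1.fastq.gz": "bc01",
        "run1/sampleA_R2.fastq.gz": "bc01",
    },
    rg_sample_field="sampleA",  # adds SM=sampleA to the @RG line
)
print(summary["total_reads"], summary["barcodes"])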
smftools/informatics/archived/helpers/archived/count_aligned_reads.py
@@ -14,7 +14,7 @@ def count_aligned_reads(bam_file):
         record_counts (dict): A dictionary keyed by reference record instance that points to a tuple containing the total reads mapped to the record and the fraction of mapped reads which map to the record.
 
     """
-    from .. import readwrite
+    from ... import readwrite
     import pysam
     from tqdm import tqdm
     from collections import defaultdict
@@ -25,7 +25,7 @@ def count_aligned_reads(bam_file):
     # Make a dictionary, keyed by the reference_name of the reference chromosome, that points to an integer number of read counts mapped to the chromosome, as well as the proportion of mapped reads in that chromosome
     record_counts = defaultdict(int)
 
-    with pysam.AlignmentFile(bam_file, "rb") as bam:
+    with pysam.AlignmentFile(str(bam_file), "rb") as bam:
        total_reads = bam.mapped + bam.unmapped
        # Iterate over reads to get the total mapped read counts and the reads that map to each reference
        for read in tqdm(bam, desc='Counting aligned reads in BAM', total=total_reads):
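A hypothetical call reflecting the change above, which wraps bam_file in str() so that Path objects work as input; the return structure is assumed from the docstring fragment, not confirmed by the diff:

from pathlib import Path

# str(bam_file) in the patched line means a Path now works here
record_counts = count_aligned_reads(Path("results/aligned/sample1.bam"))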