smftools 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. smftools/__init__.py +2 -6
  2. smftools/_version.py +1 -1
  3. smftools/cli/__init__.py +0 -0
  4. smftools/cli/cli_flows.py +94 -0
  5. smftools/cli/hmm_adata.py +338 -0
  6. smftools/cli/load_adata.py +577 -0
  7. smftools/cli/preprocess_adata.py +363 -0
  8. smftools/cli/spatial_adata.py +564 -0
  9. smftools/cli_entry.py +435 -0
  10. smftools/config/conversion.yaml +11 -6
  11. smftools/config/deaminase.yaml +12 -7
  12. smftools/config/default.yaml +36 -25
  13. smftools/config/direct.yaml +25 -1
  14. smftools/config/discover_input_files.py +115 -0
  15. smftools/config/experiment_config.py +109 -12
  16. smftools/informatics/__init__.py +13 -7
  17. smftools/informatics/archived/fast5_to_pod5.py +43 -0
  18. smftools/informatics/archived/helpers/archived/__init__.py +71 -0
  19. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
  20. smftools/informatics/{helpers → archived/helpers/archived}/aligned_BAM_to_bed.py +6 -4
  21. smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
  22. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
  23. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
  24. smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
  25. smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
  26. smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +1 -1
  27. smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
  28. smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
  29. smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
  30. smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
  31. smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
  32. smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
  33. smftools/informatics/{helpers → archived/helpers/archived}/plot_bed_histograms.py +0 -19
  34. smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +6 -5
  35. smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +7 -7
  36. smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
  37. smftools/informatics/bam_functions.py +812 -0
  38. smftools/informatics/basecalling.py +67 -0
  39. smftools/informatics/bed_functions.py +366 -0
  40. smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +42 -30
  41. smftools/informatics/fasta_functions.py +255 -0
  42. smftools/informatics/h5ad_functions.py +197 -0
  43. smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +142 -59
  44. smftools/informatics/modkit_functions.py +129 -0
  45. smftools/informatics/ohe.py +160 -0
  46. smftools/informatics/pod5_functions.py +224 -0
  47. smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
  48. smftools/plotting/autocorrelation_plotting.py +1 -3
  49. smftools/plotting/general_plotting.py +1037 -362
  50. smftools/preprocessing/__init__.py +2 -0
  51. smftools/preprocessing/append_base_context.py +3 -3
  52. smftools/preprocessing/append_binary_layer_by_base_context.py +4 -4
  53. smftools/preprocessing/binarize.py +17 -0
  54. smftools/preprocessing/binarize_on_Youden.py +2 -2
  55. smftools/preprocessing/calculate_position_Youden.py +1 -1
  56. smftools/preprocessing/calculate_read_modification_stats.py +1 -1
  57. smftools/preprocessing/filter_reads_on_modification_thresholds.py +19 -19
  58. smftools/preprocessing/flag_duplicate_reads.py +1 -1
  59. smftools/readwrite.py +266 -140
  60. {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/METADATA +10 -9
  61. {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/RECORD +82 -70
  62. smftools-0.2.3.dist-info/entry_points.txt +2 -0
  63. smftools/cli.py +0 -184
  64. smftools/informatics/fast5_to_pod5.py +0 -24
  65. smftools/informatics/helpers/__init__.py +0 -73
  66. smftools/informatics/helpers/align_and_sort_BAM.py +0 -86
  67. smftools/informatics/helpers/bam_qc.py +0 -66
  68. smftools/informatics/helpers/bed_to_bigwig.py +0 -39
  69. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -378
  70. smftools/informatics/helpers/discover_input_files.py +0 -100
  71. smftools/informatics/helpers/index_fasta.py +0 -12
  72. smftools/informatics/helpers/make_dirs.py +0 -21
  73. smftools/informatics/readwrite.py +0 -106
  74. smftools/informatics/subsample_fasta_from_bed.py +0 -47
  75. smftools/load_adata.py +0 -1346
  76. smftools-0.2.1.dist-info/entry_points.txt +0 -2
  77. /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
  78. /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
  79. /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
  80. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +0 -0
  81. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
  82. /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
  83. /smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +0 -0
  84. /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
  85. /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
  86. /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
  87. /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
  88. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
  89. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
  90. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
  91. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
  92. /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
  93. /smftools/informatics/{helpers/binarize_converted_base_identities.py → binarize_converted_base_identities.py} +0 -0
  94. /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
  95. {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/WHEEL +0 -0
  96. {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/licenses/LICENSE +0 -0
@@ -1,5 +1,7 @@
1
1
  # Direct (Nanopore modified base calling) footprinting defaults
2
2
  extends: default
3
+
4
+ ######## smftools load params #########
3
5
  filter_threshold: 0.8 # min threshold to call a canonical base
4
6
  m6A_threshold: 0.7 # min threshold to call a modified m6a base
5
7
  m5C_threshold: 0.7 # min threshold to call a modified 5mC base
@@ -12,6 +14,28 @@ thresholds:
12
14
  mod_list:
13
15
  - '5mC_5hmC'
14
16
  - '6mA' # mods to detect
17
+ mod_target_bases:
18
+ - "A"
19
+ enzyme_target_bases:
20
+ - "A"
15
21
  batch_size: 4 # How many mod TSVs to load into memory at a time when making anndata batches
16
22
  skip_unclassified: True # Whether to skip unclassified barcodes
17
- delete_batch_hdfs: True # Whether to delete intermediate barcode level hdfs after making final anndata
23
+ delete_batch_hdfs: True # Whether to delete intermediate barcode level hdfs after making final anndata
24
+
25
+ ######## smftools preprocess params ########
26
+ fit_position_methylation_thresholds: False # Whether to use Youden J-stat to determine position-by-position thresholds for modification binarization.
27
+ binarize_on_fixed_methlyation_threshold: 0.7 # The threshold used to binarize the anndata using a fixed value if fitting parameter above is False.
28
+ positive_control_sample_methylation_fitting: null # A positive control Sample_name to use for fully modified template data
29
+ negative_control_sample_methylation_fitting: null # A negative control Sample_name to use for fully unmodified template data
30
+ infer_on_percentile_sample_methylation_fitting: 10 # If a positive/negative control are not provided and fitting the data is requested, use the indicated percentile windows from the top and bottom of the dataset.
31
+ inference_variable_sample_methylation_fitting: "Raw_modification_signal" # The obs column value used for the percentile metric above.
32
+ fit_j_threshold: 0.5 # The J-statistic threshold to use for determining which positions pass qc for mod detection thresholding
33
+ output_binary_layer_name: "binarized_methylation" # The layer to store the binarized methylation data in
34
+
35
+ ######## smftools spatial params #########
36
+ autocorr_site_types:
37
+ - "A"
38
+
39
+ ######## smftools hmm params #########
40
+ hmm_methbases:
41
+ - "A"
@@ -0,0 +1,115 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+ from typing import Dict, List, Any, Iterable, Union
5
+
6
+ def discover_input_files(
7
+ input_data_path: Union[str, Path],
8
+ bam_suffix: str = ".bam",
9
+ recursive: bool = False,
10
+ follow_symlinks: bool = False,
11
+ ) -> Dict[str, Any]:
12
+ """
13
+ Discover input files under `input_data_path`.
14
+
15
+ Returns a dict with:
16
+ - pod5_paths, fast5_paths, fastq_paths, bam_paths, other_paths (lists of Path)
17
+ - input_is_pod5, input_is_fast5, input_is_fastq, input_is_bam (bools)
18
+ - all_files_searched (int)
19
+
20
+ Behavior:
21
+ - If `input_data_path` is a file, returns that single file categorized.
22
+ - If a directory, scans immediate children (recursive=False) or entire tree (recursive=True).
23
+ - Handles multi-suffix files like .fastq.gz, .fq.xz, etc.
24
+ """
25
+ p = Path(input_data_path)
26
+
27
+ # normalize bam suffix with a leading dot and lower
28
+ if not bam_suffix.startswith("."):
29
+ bam_suffix = "." + bam_suffix
30
+ bam_suffix = bam_suffix.lower()
31
+
32
+ # Sets of canonical extension keys we’ll compare against
33
+ pod5_exts = {".pod5", ".p5"}
34
+ fast5_exts = {".fast5", ".f5"}
35
+ fastq_exts = {".fastq", ".fq", ".fastq.gz", ".fq.gz", ".fastq.bz2", ".fq.bz2", ".fastq.xz", ".fq.xz", ".fastq.zst", ".fq.zst"}
36
+ h5ad_exts = {".h5ad", ".h5"}
37
+ compressed_exts = {".gz", ".bz2", ".xz", ".zst"}
38
+
39
+ def ext_key(pp: Path) -> str:
40
+ """
41
+ A robust extension key: last suffix, or last two if the final one is a compressor (.gz/.bz2/.xz/.zst).
42
+ Examples:
43
+ a.fastq.gz -> ".fastq.gz"
44
+ a.fq.xz -> ".fq.xz"
45
+ a.bam -> ".bam"
46
+ a -> ""
47
+ """
48
+ suff = [s.lower() for s in pp.suffixes]
49
+ if not suff:
50
+ return ""
51
+ if suff[-1] in compressed_exts and len(suff) >= 2:
52
+ return suff[-2] + suff[-1]
53
+ return suff[-1]
54
+
55
+ pod5_paths: List[Path] = []
56
+ fast5_paths: List[Path] = []
57
+ fastq_paths: List[Path] = []
58
+ bam_paths: List[Path] = []
59
+ h5ad_paths: List[Path] = []
60
+ other_paths: List[Path] = []
61
+
62
+ def categorize_file(fp: Path) -> None:
63
+ key = ext_key(fp)
64
+ if key in pod5_exts:
65
+ pod5_paths.append(fp)
66
+ elif key in fast5_exts:
67
+ fast5_paths.append(fp)
68
+ elif key in fastq_exts:
69
+ fastq_paths.append(fp)
70
+ elif key in h5ad_exts:
71
+ h5ad_paths.append(fp)
72
+ elif key == bam_suffix:
73
+ bam_paths.append(fp)
74
+ else:
75
+ other_paths.append(fp)
76
+
77
+ if not p.exists():
78
+ raise FileNotFoundError(f"input_data_path does not exist: {input_data_path}")
79
+
80
+ total_searched = 0
81
+
82
+ if p.is_file():
83
+ total_searched = 1
84
+ categorize_file(p)
85
+ else:
86
+ # Directory scan
87
+ if recursive:
88
+ # Python 3.12+ supports follow_symlinks in glob/rglob. Fallback for older versions.
89
+ try:
90
+ iterator = p.rglob("*", follow_symlinks=follow_symlinks) # type: ignore[call-arg]
91
+ except TypeError:
92
+ iterator = p.rglob("*") # follow_symlinks not supported
93
+ else:
94
+ iterator = p.iterdir()
95
+
96
+ for fp in iterator:
97
+ if not fp.is_file():
98
+ continue
99
+ total_searched += 1
100
+ categorize_file(fp)
101
+
102
+ return {
103
+ "pod5_paths": sorted(pod5_paths),
104
+ "fast5_paths": sorted(fast5_paths),
105
+ "fastq_paths": sorted(fastq_paths),
106
+ "bam_paths": sorted(bam_paths),
107
+ "h5ad_paths": sorted(h5ad_paths),
108
+ "other_paths": sorted(other_paths),
109
+ "input_is_pod5": len(pod5_paths) > 0,
110
+ "input_is_fast5": len(fast5_paths) > 0,
111
+ "input_is_fastq": len(fastq_paths) > 0,
112
+ "input_is_bam": len(bam_paths) > 0,
113
+ "input_is_h5ad": len(h5ad_paths) > 0,
114
+ "all_files_searched": total_searched,
115
+ }
@@ -6,6 +6,7 @@ import warnings
6
6
  from dataclasses import dataclass, field, asdict
7
7
  from pathlib import Path
8
8
  from typing import Any, Dict, List, Optional, Tuple, Union, IO, Sequence
9
+ from .discover_input_files import discover_input_files
9
10
 
10
11
  # Optional dependency for YAML handling
11
12
  try:
@@ -593,7 +594,10 @@ class ExperimentConfig:
593
594
  fasta: Optional[str] = None
594
595
  bam_suffix: str = ".bam"
595
596
  recursive_input_search: bool = True
597
+ input_type: Optional[str] = None
598
+ input_files: Optional[List[Path]] = None
596
599
  split_dir: str = "demultiplexed_BAMs"
600
+ split_path: Optional[str] = None
597
601
  strands: List[str] = field(default_factory=lambda: ["bottom", "top"])
598
602
  conversions: List[str] = field(default_factory=lambda: ["unconverted"])
599
603
  fasta_regions_of_interest: Optional[str] = None
@@ -601,11 +605,16 @@ class ExperimentConfig:
601
605
  sample_sheet_mapping_column: Optional[str] = 'Barcode'
602
606
  experiment_name: Optional[str] = None
603
607
  input_already_demuxed: bool = False
608
+ summary_file: Optional[Path] = None
604
609
 
605
610
  # FASTQ input specific
606
611
  fastq_barcode_map: Optional[Dict[str, str]] = None
607
612
  fastq_auto_pairing: bool = True
608
613
 
614
+ # Remove intermediate file options
615
+ delete_intermediate_bams: bool = True
616
+ delete_intermediate_tsvs: bool = True
617
+
609
618
  # Conversion/Deamination file handling
610
619
  delete_intermediate_hdfs: bool = True
611
620
 
@@ -645,6 +654,7 @@ class ExperimentConfig:
645
654
  aligner: str = "minimap2"
646
655
  aligner_args: Optional[List[str]] = None
647
656
  make_bigwigs: bool = False
657
+ make_beds: bool = False
648
658
 
649
659
  # Anndata structure
650
660
  reference_column: Optional[str] = 'Reference_strand'
@@ -656,11 +666,21 @@ class ExperimentConfig:
656
666
 
657
667
  # Preprocessing - Read length and quality filter params
658
668
  read_coord_filter: Optional[Sequence[float]] = field(default_factory=lambda: [None, None])
659
- read_len_filter_thresholds: Optional[Sequence[float]] = field(default_factory=lambda: [200, None])
660
- read_len_to_ref_ratio_filter_thresholds: Optional[Sequence[float]] = field(default_factory=lambda: [0.4, 1.1])
661
- read_quality_filter_thresholds: Optional[Sequence[float]] = field(default_factory=lambda: [20, None])
669
+ read_len_filter_thresholds: Optional[Sequence[float]] = field(default_factory=lambda: [100, None])
670
+ read_len_to_ref_ratio_filter_thresholds: Optional[Sequence[float]] = field(default_factory=lambda: [0.4, 1.5])
671
+ read_quality_filter_thresholds: Optional[Sequence[float]] = field(default_factory=lambda: [15, None])
662
672
  read_mapping_quality_filter_thresholds: Optional[Sequence[float]] = field(default_factory=lambda: [None, None])
663
673
 
674
+ # Preprocessing - Direct mod detection binarization params
675
+ fit_position_methylation_thresholds: Optional[bool] = False # Whether to use Youden J-stat to determine position by positions thresholds for modification binarization.
676
+ binarize_on_fixed_methlyation_threshold: Optional[float] = 0.7 # The threshold used to binarize the anndata using a fixed value if fitting parameter above is False.
677
+ positive_control_sample_methylation_fitting: Optional[str] = None # A positive control Sample_name to use for fully modified template data
678
+ negative_control_sample_methylation_fitting: Optional[str] = None # A negative control Sample_name to use for fully unmodified template data
679
+ infer_on_percentile_sample_methylation_fitting: Optional[int] = 10 # If a positive/negative control are not provided and fitting the data is requested, use the indicated percentile windows from the top and bottom of the dataset.
680
+ inference_variable_sample_methylation_fitting: Optional[str] = "Raw_modification_signal" # The obs column value used for the percentile metric above.
681
+ fit_j_threshold: Optional[float] = 0.5 # The J-statistic threhold to use for determining which positions pass qc for mod detection thresholding
682
+ output_binary_layer_name: Optional[str] = "binarized_methylation"
683
+
664
684
  # Preprocessing - Read modification filter params
665
685
  read_mod_filtering_gpc_thresholds: List[float] = field(default_factory=lambda: [0.025, 0.975])
666
686
  read_mod_filtering_cpg_thresholds: List[float] = field(default_factory=lambda: [0.00, 1])
@@ -680,7 +700,8 @@ class ExperimentConfig:
680
700
  duplicate_detection_hierarchical_linkage: str = "average"
681
701
  duplicate_detection_do_pca: bool = False
682
702
 
683
- # Preprocessing - Complexity analysis params
703
+ # Preprocessing - Position QC
704
+ position_max_nan_threshold: float = 0.1
684
705
 
685
706
  # Basic Analysis - Clustermap params
686
707
  layer_for_clustermap_plotting: Optional[str] = 'nan0_0minus1'
@@ -718,6 +739,9 @@ class ExperimentConfig:
718
739
  hmm_feature_sets: Dict[str, Any] = field(default_factory=dict)
719
740
  hmm_merge_layer_features: Optional[List[Tuple]] = field(default_factory=lambda: [(None, 80)])
720
741
 
742
+ # Pipeline control flow - load adata
743
+ force_redo_load_adata: bool = False
744
+
721
745
  # Pipeline control flow - preprocessing and QC
722
746
  force_redo_preprocessing: bool = False
723
747
  force_reload_sample_sheet: bool = True
@@ -860,6 +884,63 @@ class ExperimentConfig:
860
884
  if merged.get("experiment_name") is None and date_str:
861
885
  merged["experiment_name"] = f"{date_str}_SMF_experiment"
862
886
 
887
+ # Input file types and path handling
888
+ input_data_path = Path(merged['input_data_path'])
889
+
890
+ # Detect the input filetype
891
+ if input_data_path.is_file():
892
+ suffix = input_data_path.suffix.lower()
893
+ suffixes = [s.lower() for s in input_data_path.suffixes] # handles multi-part extensions
894
+
895
+ # recognize multi-suffix cases like .fastq.gz or .fq.gz
896
+ if any(s in ['.pod5', '.p5'] for s in suffixes):
897
+ input_type = "pod5"
898
+ input_files = [Path(input_data_path)]
899
+ elif any(s in ['.fast5', '.f5'] for s in suffixes):
900
+ input_type = "fast5"
901
+ input_files = [Path(input_data_path)]
902
+ elif any(s in ['.fastq', '.fq'] for s in suffixes):
903
+ input_type = "fastq"
904
+ input_files = [Path(input_data_path)]
905
+ elif any(s in ['.bam'] for s in suffixes):
906
+ input_type = "bam"
907
+ input_files = [Path(input_data_path)]
908
+ elif any(s in ['.h5ad', ".h5"] for s in suffixes):
909
+ input_type = "h5ad"
910
+ input_files = [Path(input_data_path)]
911
+ else:
912
+ print("Error detecting input file type")
913
+
914
+ elif input_data_path.is_dir():
915
+ found = discover_input_files(input_data_path, bam_suffix=merged["bam_suffix"], recursive=merged["recursive_input_search"])
916
+
917
+ if found["input_is_pod5"]:
918
+ input_type = "pod5"
919
+ input_files = found["pod5_paths"]
920
+ elif found["input_is_fast5"]:
921
+ input_type = "fast5"
922
+ input_files = found["fast5_paths"]
923
+ elif found["input_is_fastq"]:
924
+ input_type = "fastq"
925
+ input_files = found["fastq_paths"]
926
+ elif found["input_is_bam"]:
927
+ input_type = "bam"
928
+ input_files = found["bam_paths"]
929
+ elif found["input_is_h5ad"]:
930
+ input_type = "h5ad"
931
+ input_files = found["h5ad_paths"]
932
+
933
+ print(f"Found {found['all_files_searched']} files; fastq={len(found["fastq_paths"])}, bam={len(found["bam_paths"])}, pod5={len(found["pod5_paths"])}, fast5={len(found["fast5_paths"])}, , h5ad={len(found["h5ad_paths"])}")
934
+
935
+ # summary file output path
936
+ output_dir = Path(merged['output_directory'])
937
+ summary_file_basename = merged["experiment_name"] + '_output_summary.csv'
938
+ summary_file = output_dir / summary_file_basename
939
+
940
+ # Demultiplexing output path
941
+ split_dir = merged.get("split_dir", "demultiplexed_BAMs")
942
+ split_path = output_dir / split_dir
943
+
863
944
  # final normalization
864
945
  if "strands" in merged:
865
946
  merged["strands"] = _parse_list(merged["strands"])
@@ -936,13 +1017,15 @@ class ExperimentConfig:
936
1017
  hmm_methbases = list(hmm_methbases)
937
1018
  hmm_merge_layer_features = _parse_list(merged.get("hmm_merge_layer_features", None))
938
1019
 
939
-
940
1020
  # instantiate dataclass
941
1021
  instance = cls(
942
1022
  smf_modality = merged.get("smf_modality"),
943
- input_data_path = merged.get("input_data_path"),
1023
+ input_data_path = input_data_path,
944
1024
  recursive_input_search = merged.get("recursive_input_search"),
945
- output_directory = merged.get("output_directory"),
1025
+ input_type = input_type,
1026
+ input_files = input_files,
1027
+ output_directory = output_dir,
1028
+ summary_file = summary_file,
946
1029
  fasta = merged.get("fasta"),
947
1030
  sequencer = merged.get("sequencer"),
948
1031
  model_dir = merged.get("model_dir"),
@@ -950,7 +1033,8 @@ class ExperimentConfig:
950
1033
  fastq_barcode_map = merged.get("fastq_barcode_map"),
951
1034
  fastq_auto_pairing = merged.get("fastq_auto_pairing"),
952
1035
  bam_suffix = merged.get("bam_suffix", ".bam"),
953
- split_dir = merged.get("split_dir", "demultiplexed_BAMs"),
1036
+ split_dir = split_dir,
1037
+ split_path = split_path,
954
1038
  strands = merged.get("strands", ["bottom","top"]),
955
1039
  conversions = merged.get("conversions", ["unconverted"]),
956
1040
  fasta_regions_of_interest = merged.get("fasta_regions_of_interest"),
@@ -963,14 +1047,17 @@ class ExperimentConfig:
963
1047
  threads = merged.get("threads"),
964
1048
  sample_sheet_path = merged.get("sample_sheet_path"),
965
1049
  sample_sheet_mapping_column = merged.get("sample_sheet_mapping_column"),
1050
+ delete_intermediate_bams = merged.get("delete_intermediate_bams", True),
1051
+ delete_intermediate_tsvs = merged.get("delete_intermediate_tsvs", True),
966
1052
  aligner = merged.get("aligner", "minimap2"),
967
1053
  aligner_args = merged.get("aligner_args", None),
968
1054
  device = merged.get("device", "auto"),
969
1055
  make_bigwigs = merged.get("make_bigwigs", False),
1056
+ make_beds = merged.get("make_beds", False),
970
1057
  delete_intermediate_hdfs = merged.get("delete_intermediate_hdfs", True),
971
1058
  mod_target_bases = merged.get("mod_target_bases", ["GpC","CpG"]),
972
1059
  enzyme_target_bases = merged.get("enzyme_target_bases", ["GpC"]),
973
- conversion_types = merged.get("conversion_types", ["5mC"]),
1060
+ conversion_types = merged.get("conversions", ["unconverted"]) + merged.get("conversion_types", ["5mC"]),
974
1061
  filter_threshold = merged.get("filter_threshold", 0.8),
975
1062
  m6A_threshold = merged.get("m6A_threshold", 0.7),
976
1063
  m5C_threshold = merged.get("m5C_threshold", 0.7),
@@ -983,6 +1070,14 @@ class ExperimentConfig:
983
1070
  reference_column = merged.get("reference_column", 'Reference_strand'),
984
1071
  sample_column = merged.get("sample_column", 'Barcode'),
985
1072
  sample_name_col_for_plotting = merged.get("sample_name_col_for_plotting", 'Barcode'),
1073
+ fit_position_methylation_thresholds = merged.get("fit_position_methylation_thresholds", False),
1074
+ binarize_on_fixed_methlyation_threshold = merged.get("binarize_on_fixed_methlyation_threshold", 0.7),
1075
+ positive_control_sample_methylation_fitting = merged.get("positive_control_sample_methylation_fitting", None),
1076
+ negative_control_sample_methylation_fitting = merged.get("negative_control_sample_methylation_fitting", None),
1077
+ infer_on_percentile_sample_methylation_fitting = merged.get("infer_on_percentile_sample_methylation_fitting", 10),
1078
+ inference_variable_sample_methylation_fitting = merged.get("inference_variable_sample_methylation_fitting", "Raw_modification_signal"),
1079
+ fit_j_threshold = merged.get("fit_j_threshold", 0.5),
1080
+ output_binary_layer_name = merged.get("output_binary_layer_name", "binarized_methylation"),
986
1081
  layer_for_clustermap_plotting = merged.get("layer_for_clustermap_plotting", 'nan0_0minus1'),
987
1082
  layer_for_umap_plotting = merged.get("layer_for_umap_plotting", 'nan_half'),
988
1083
  umap_layers_to_plot = merged.get("umap_layers_to_plot",["mapped_length", 'Raw_modification_signal']),
@@ -1008,9 +1103,9 @@ class ExperimentConfig:
1008
1103
  accessible_patches = merged.get("accessible_patches", None),
1009
1104
  cpg = merged.get("cpg", None),
1010
1105
  read_coord_filter = merged.get("read_coord_filter", [None, None]),
1011
- read_len_filter_thresholds = merged.get("read_len_filter_thresholds", [200, None]),
1012
- read_len_to_ref_ratio_filter_thresholds = merged.get("read_len_to_ref_ratio_filter_thresholds", [0.4, 1.1]),
1013
- read_quality_filter_thresholds = merged.get("read_quality_filter_thresholds", [20, None]),
1106
+ read_len_filter_thresholds = merged.get("read_len_filter_thresholds", [100, None]),
1107
+ read_len_to_ref_ratio_filter_thresholds = merged.get("read_len_to_ref_ratio_filter_thresholds", [0.3, None]),
1108
+ read_quality_filter_thresholds = merged.get("read_quality_filter_thresholds", [15, None]),
1014
1109
  read_mapping_quality_filter_thresholds = merged.get("read_mapping_quality_filter_thresholds", [None, None]),
1015
1110
  read_mod_filtering_gpc_thresholds = merged.get("read_mod_filtering_gpc_thresholds", [0.025, 0.975]),
1016
1111
  read_mod_filtering_cpg_thresholds = merged.get("read_mod_filtering_cpg_thresholds", [0.0, 1.0]),
@@ -1026,10 +1121,12 @@ class ExperimentConfig:
1026
1121
  duplicate_detection_do_hierarchical = merged.get("duplicate_detection_do_hierarchical", True),
1027
1122
  duplicate_detection_hierarchical_linkage = merged.get("duplicate_detection_hierarchical_linkage", "average"),
1028
1123
  duplicate_detection_do_pca = merged.get("duplicate_detection_do_pca", False),
1124
+ position_max_nan_threshold = merged.get("position_max_nan_threshold", 0.1),
1029
1125
  correlation_matrix_types = merged.get("correlation_matrix_types", ["pearson", "binary_covariance"]),
1030
1126
  correlation_matrix_cmaps = merged.get("correlation_matrix_cmaps", ["seismic", "viridis"]),
1031
1127
  correlation_matrix_site_types = merged.get("correlation_matrix_site_types", ["GpC_site"]),
1032
1128
  hamming_vs_metric_keys = merged.get("hamming_vs_metric_keys", ['Fraction_any_C_site_modified']),
1129
+ force_redo_load_adata = merged.get("force_redo_load_adata", False),
1033
1130
  force_redo_preprocessing = merged.get("force_redo_preprocessing", False),
1034
1131
  force_reload_sample_sheet = merged.get("force_reload_sample_sheet", True),
1035
1132
  bypass_add_read_length_and_mapping_qc = merged.get("bypass_add_read_length_and_mapping_qc", False),
@@ -1,14 +1,20 @@
from .bam_functions import align_and_sort_BAM, bam_qc, concatenate_fastqs_to_bam, count_aligned_reads, demux_and_index_BAM, extract_base_identities, extract_read_features_from_bam, extract_readnames_from_bam, separate_bam_by_bc, split_and_index_BAM
from .basecalling import canoncall, modcall
from .bed_functions import aligned_BAM_to_bed, _bed_to_bigwig, extract_read_lengths_from_bed, _plot_bed_histograms
from .converted_BAM_to_adata import converted_BAM_to_adata
from .fasta_functions import find_conversion_sites, generate_converted_FASTA, get_chromosome_lengths, get_native_references, index_fasta, subsample_fasta_from_bed
from .h5ad_functions import add_demux_type_annotation, add_read_length_and_mapping_qc
from .modkit_functions import extract_mods, make_modbed, modQC
from .modkit_extract_to_adata import modkit_extract_to_adata
from .ohe import one_hot_encode, one_hot_decode, ohe_layers_decode, ohe_batching
from .pod5_functions import basecall_pod5s, fast5_to_pod5, subsample_pod5
from .run_multiqc import run_multiqc

# Public API: every re-exported helper above (underscore-prefixed names stay
# private).  Previously __all__ listed only a handful of names, so
# `from smftools.informatics import *` silently dropped most of the API.
__all__ = [
    "align_and_sort_BAM",
    "bam_qc",
    "concatenate_fastqs_to_bam",
    "count_aligned_reads",
    "demux_and_index_BAM",
    "extract_base_identities",
    "extract_read_features_from_bam",
    "extract_readnames_from_bam",
    "separate_bam_by_bc",
    "split_and_index_BAM",
    "canoncall",
    "modcall",
    "aligned_BAM_to_bed",
    "extract_read_lengths_from_bed",
    "converted_BAM_to_adata",
    "find_conversion_sites",
    "generate_converted_FASTA",
    "get_chromosome_lengths",
    "get_native_references",
    "index_fasta",
    "subsample_fasta_from_bed",
    "add_demux_type_annotation",
    "add_read_length_and_mapping_qc",
    "extract_mods",
    "make_modbed",
    "modQC",
    "modkit_extract_to_adata",
    "one_hot_encode",
    "one_hot_decode",
    "ohe_layers_decode",
    "ohe_batching",
    "basecall_pod5s",
    "fast5_to_pod5",
    "subsample_pod5",
    "run_multiqc",
]
@@ -0,0 +1,43 @@
from pathlib import Path
import subprocess
from typing import Union, List

def fast5_to_pod5(
    fast5_dir: Union[str, Path, List[Union[str, Path]]],
    output_pod5: Union[str, Path] = "FAST5s_to_POD5.pod5"
) -> None:
    """
    Convert Nanopore FAST5 files (single file, list of files, or directory)
    into a single .pod5 output using the 'pod5 convert fast5' CLI tool.

    Parameters
    ----------
    fast5_dir:
        A FAST5 file, a list/tuple of FAST5 files, or a directory whose
        immediate ``*.fast5`` children are converted (non-recursive).
    output_pod5:
        Destination path passed to ``pod5 convert fast5 --output``.

    Raises
    ------
    FileNotFoundError
        If the input path does not exist, an empty list/tuple is given, or a
        directory contains no FAST5 files.
    subprocess.CalledProcessError
        If the ``pod5`` CLI exits with a non-zero status (check=True).
    """
    output = str(output_pod5)  # ensure string for the CLI argument

    def _convert(fast5_paths: List[str]) -> None:
        # shell=False list form; check=True surfaces CLI failures as exceptions.
        cmd = ["pod5", "convert", "fast5", *fast5_paths, "--output", output]
        subprocess.run(cmd, check=True)

    # 1) User gives a list/tuple of FAST5 files.
    if isinstance(fast5_dir, (list, tuple)):
        if not fast5_dir:
            # Fail fast: previously an empty list invoked the CLI with no
            # input files, producing an opaque CLI error.
            raise FileNotFoundError("No FAST5 files provided")
        _convert([str(Path(f)) for f in fast5_dir])
        return

    # Ensure Path object
    p = Path(fast5_dir)

    # 2) User gives a single file.
    if p.is_file():
        _convert([str(p)])
        return

    # 3) User gives a directory -> collect its FAST5 children.
    if p.is_dir():
        fast5_paths = sorted(str(f) for f in p.glob("*.fast5"))
        if not fast5_paths:
            raise FileNotFoundError(f"No FAST5 files found in {p}")
        _convert(fast5_paths)
        return

    raise FileNotFoundError(f"Input path invalid: {fast5_dir}")
@@ -0,0 +1,71 @@
1
+ # from .align_and_sort_BAM import align_and_sort_BAM
2
+ # from .aligned_BAM_to_bed import aligned_BAM_to_bed
3
+ # from .bam_qc import bam_qc
4
+ # from .bed_to_bigwig import bed_to_bigwig
5
+ # from .binarize_converted_base_identities import binarize_converted_base_identities
6
+ # from .canoncall import canoncall
7
+ # from .complement_base_list import complement_base_list
8
+ # from .converted_BAM_to_adata_II import converted_BAM_to_adata_II
9
+ # from .concatenate_fastqs_to_bam import concatenate_fastqs_to_bam
10
+ # from .count_aligned_reads import count_aligned_reads
11
+ # from .demux_and_index_BAM import demux_and_index_BAM
12
+ # from .discover_input_files import *
13
+ # from .extract_base_identities import extract_base_identities
14
+ # from .extract_mods import extract_mods
15
+ # from .extract_read_features_from_bam import extract_read_features_from_bam
16
+ # from .extract_read_lengths_from_bed import extract_read_lengths_from_bed
17
+ # from .extract_readnames_from_BAM import extract_readnames_from_BAM
18
+ # from .find_conversion_sites import find_conversion_sites
19
+ # from .generate_converted_FASTA import convert_FASTA_record, generate_converted_FASTA
20
+ # from .get_chromosome_lengths import get_chromosome_lengths
21
+ # from .get_native_references import get_native_references
22
+ # from .index_fasta import index_fasta
23
+ # from .make_modbed import make_modbed
24
+ # from .modcall import modcall
25
+ # from .modkit_extract_to_adata import modkit_extract_to_adata
26
+ # from .modQC import modQC
27
+ # from .one_hot_encode import one_hot_encode
28
+ # from .ohe_batching import ohe_batching
29
+ # from .one_hot_decode import one_hot_decode
30
+ # from .ohe_layers_decode import ohe_layers_decode
31
+ # from .plot_bed_histograms import plot_bed_histograms
32
+ # from .run_multiqc import run_multiqc
33
+ # from .separate_bam_by_bc import separate_bam_by_bc
34
+ # from .split_and_index_BAM import split_and_index_BAM
35
+
36
+ # __all__ = [
37
+ # "align_and_sort_BAM",
38
+ # "aligned_BAM_to_bed",
39
+ # "bam_qc",
40
+ # "bed_to_bigwig",
41
+ # "binarize_converted_base_identities",
42
+ # "canoncall",
43
+ # "complement_base_list",
44
+ # "converted_BAM_to_adata_II",
45
+ # "concatenate_fastqs_to_bam",
46
+ # "count_aligned_reads",
47
+ # "demux_and_index_BAM",
48
+ # "extract_base_identities",
49
+ # "extract_mods",
50
+ # "extract_read_features_from_bam",
51
+ # "extract_read_lengths_from_bed",
52
+ # "extract_readnames_from_BAM",
53
+ # "find_conversion_sites",
54
+ # "convert_FASTA_record",
55
+ # "generate_converted_FASTA",
56
+ # "get_chromosome_lengths",
57
+ # "get_native_references",
58
+ # "index_fasta",
59
+ # "make_modbed",
60
+ # "modcall",
61
+ # "modkit_extract_to_adata",
62
+ # "modQC",
63
+ # "one_hot_encode",
64
+ # "ohe_batching",
65
+ # "one_hot_decode",
66
+ # "ohe_layers_decode",
67
+ # "plot_bed_histograms",
68
+ # "run_multiqc",
69
+ # "separate_bam_by_bc",
70
+ # "split_and_index_BAM"
71
+ # ]
@@ -0,0 +1,126 @@
1
+ from pathlib import Path
2
+ import os
3
+ import subprocess
4
+ from typing import List, Optional, Union
5
+ import pysam
6
+
7
def _bam_to_fastq_with_pysam(bam_path: Union[str, Path], fastq_path: Union[str, Path]) -> None:
    """
    Minimal BAM->FASTQ using pysam. Writes every record in file order
    (until_eof=True, so unmapped/unaligned reads are included).

    Parameters:
        bam_path: Input BAM (SQ header optional: check_sq=False).
        fastq_path: Output FASTQ path (overwritten if it exists).
    """
    bam_path = str(bam_path)
    fastq_path = str(fastq_path)
    with pysam.AlignmentFile(bam_path, "rb", check_sq=False) as bam, open(fastq_path, "w") as fq:
        for r in bam.fetch(until_eof=True):
            # Skip secondary/supplementary if you want (optional):
            # if r.is_secondary or r.is_supplementary: continue
            name = r.query_name
            seq = r.query_sequence or ""
            # r.qual is None when the BAM stores '*' qualities; an empty
            # quality line with a non-empty sequence makes the FASTQ record
            # malformed, so pad with the minimum Phred character instead.
            qual = r.qual or ("!" * len(seq))
            fq.write(f"@{name}\n{seq}\n+\n{qual}\n")
21
+
22
def _sort_bam_with_pysam(in_bam: Union[str, Path], out_bam: Union[str, Path], threads: Optional[int] = None) -> None:
    """Coordinate-sort a BAM via pysam's samtools-style sort wrapper."""
    sort_args = ["-@", str(threads)] if threads else []
    sort_args.extend(["-o", str(out_bam), str(in_bam)])
    pysam.sort(*sort_args)
29
+
30
def _index_bam_with_pysam(bam_path: Union[str, Path], threads: Optional[int] = None) -> None:
    """Index a BAM via pysam.index (accepts samtools-style flags)."""
    # pysam.index supports samtools-style args
    extra = ("-@", str(threads)) if threads else ()
    pysam.index(*extra, str(bam_path))
37
+
38
def align_and_sort_BAM(fasta,
                       input,
                       bam_suffix='.bam',
                       output_directory='aligned_outputs',
                       make_bigwigs=False,
                       threads=None,
                       aligner='minimap2',
                       aligner_args=['-a', '-x', 'map-ont', '--MD', '-Y', '-y', '-N', '5', '--secondary=no']):
    """
    A wrapper for aligning a basecalled BAM to a reference, then
    coordinate-sorting and indexing the result with pysam.

    Parameters:
        fasta (Path): File path to the reference genome to align to.
        input (Path): File path to the basecalled BAM to align.
        bam_suffix (str): The suffix to use for the BAM files.
        output_directory (Path): Directory to write the aligned outputs into.
        make_bigwigs (bool): Accepted for interface compatibility; not used here.
        threads (int | None): Number of additional threads for aligner/sort/index.
        aligner (str): Aligner to use. minimap2 and dorado options.
        aligner_args (list): Optional CLI parameters passed to the aligner.

    Returns:
        None
        Writes: 1) an aligned BAM (<stem>_aligned<bam_suffix>), 2) an
        aligned+sorted BAM (<stem>_aligned_sorted<bam_suffix>), 3) an index
        for the sorted BAM.

    Raises:
        subprocess.CalledProcessError: If the aligner exits non-zero
            (previously such failures were silently ignored).
    """
    input_as_fastq = input.with_name(input.stem + '.fastq')
    output_path_minus_suffix = output_directory / input.stem

    aligned_BAM = output_path_minus_suffix.with_name(output_path_minus_suffix.stem + "_aligned")
    aligned_output = aligned_BAM.with_suffix(bam_suffix)
    aligned_sorted_BAM = aligned_BAM.with_name(aligned_BAM.stem + "_sorted")
    aligned_sorted_output = aligned_sorted_BAM.with_suffix(bam_suffix)

    # CLI tools expect the thread count as a string.
    if threads:
        threads = str(threads)

    if aligner == 'minimap2':
        # minimap2 cannot read BAM directly, so round-trip through FASTQ.
        print(f"Converting BAM to FASTQ: {input}")
        _bam_to_fastq_with_pysam(input, input_as_fastq)
        print(f"Aligning FASTQ to Reference: {input_as_fastq}")
        minimap_command = ['minimap2'] + aligner_args
        if threads:
            minimap_command += ['-t', threads]
        minimap_command += [str(fasta), str(input_as_fastq)]
        # Context manager closes the output handle (previously leaked);
        # check=True surfaces aligner failures instead of ignoring them.
        with open(aligned_output, "w") as sam_out:
            subprocess.run(minimap_command, stdout=sam_out, check=True)
        os.remove(input_as_fastq)  # temporary FASTQ no longer needed

    elif aligner == 'dorado':
        # Run dorado aligner directly on the BAM.
        print(f"Aligning BAM to Reference: {input}")
        alignment_command = ["dorado", "aligner"]
        if threads:
            alignment_command += ["-t", threads]
        alignment_command += aligner_args + [str(fasta), str(input)]
        with open(aligned_output, "wb") as bam_out:
            subprocess.run(alignment_command, stdout=bam_out, check=True)

    else:
        print(f'Aligner not recognized: {aligner}. Choose from minimap2 and dorado')
        return

    # --- Sort & Index with pysam ---
    print(f"[pysam] Sorting: {aligned_output} -> {aligned_sorted_output}")
    _sort_bam_with_pysam(aligned_output, aligned_sorted_output, threads=threads)

    print(f"[pysam] Indexing: {aligned_sorted_output}")
    _index_bam_with_pysam(aligned_sorted_output, threads=threads)