smftools 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +2 -6
- smftools/_version.py +1 -1
- smftools/cli/__init__.py +0 -0
- smftools/cli/cli_flows.py +94 -0
- smftools/cli/hmm_adata.py +338 -0
- smftools/cli/load_adata.py +577 -0
- smftools/cli/preprocess_adata.py +363 -0
- smftools/cli/spatial_adata.py +564 -0
- smftools/cli_entry.py +435 -0
- smftools/config/conversion.yaml +11 -6
- smftools/config/deaminase.yaml +12 -7
- smftools/config/default.yaml +36 -25
- smftools/config/direct.yaml +25 -1
- smftools/config/discover_input_files.py +115 -0
- smftools/config/experiment_config.py +109 -12
- smftools/informatics/__init__.py +13 -7
- smftools/informatics/archived/fast5_to_pod5.py +43 -0
- smftools/informatics/archived/helpers/archived/__init__.py +71 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
- smftools/informatics/{helpers → archived/helpers/archived}/aligned_BAM_to_bed.py +6 -4
- smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
- smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
- smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
- smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +1 -1
- smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
- smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
- smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
- smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
- smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
- smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
- smftools/informatics/{helpers → archived/helpers/archived}/plot_bed_histograms.py +0 -19
- smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +6 -5
- smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +7 -7
- smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
- smftools/informatics/bam_functions.py +812 -0
- smftools/informatics/basecalling.py +67 -0
- smftools/informatics/bed_functions.py +366 -0
- smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +42 -30
- smftools/informatics/fasta_functions.py +255 -0
- smftools/informatics/h5ad_functions.py +197 -0
- smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +142 -59
- smftools/informatics/modkit_functions.py +129 -0
- smftools/informatics/ohe.py +160 -0
- smftools/informatics/pod5_functions.py +224 -0
- smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
- smftools/plotting/autocorrelation_plotting.py +1 -3
- smftools/plotting/general_plotting.py +1037 -362
- smftools/preprocessing/__init__.py +2 -0
- smftools/preprocessing/append_base_context.py +3 -3
- smftools/preprocessing/append_binary_layer_by_base_context.py +4 -4
- smftools/preprocessing/binarize.py +17 -0
- smftools/preprocessing/binarize_on_Youden.py +2 -2
- smftools/preprocessing/calculate_position_Youden.py +1 -1
- smftools/preprocessing/calculate_read_modification_stats.py +1 -1
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +19 -19
- smftools/preprocessing/flag_duplicate_reads.py +1 -1
- smftools/readwrite.py +266 -140
- {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/METADATA +10 -9
- {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/RECORD +82 -70
- smftools-0.2.3.dist-info/entry_points.txt +2 -0
- smftools/cli.py +0 -184
- smftools/informatics/fast5_to_pod5.py +0 -24
- smftools/informatics/helpers/__init__.py +0 -73
- smftools/informatics/helpers/align_and_sort_BAM.py +0 -86
- smftools/informatics/helpers/bam_qc.py +0 -66
- smftools/informatics/helpers/bed_to_bigwig.py +0 -39
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -378
- smftools/informatics/helpers/discover_input_files.py +0 -100
- smftools/informatics/helpers/index_fasta.py +0 -12
- smftools/informatics/helpers/make_dirs.py +0 -21
- smftools/informatics/readwrite.py +0 -106
- smftools/informatics/subsample_fasta_from_bed.py +0 -47
- smftools/load_adata.py +0 -1346
- smftools-0.2.1.dist-info/entry_points.txt +0 -2
- /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
- /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
- /smftools/informatics/{helpers/binarize_converted_base_identities.py → binarize_converted_base_identities.py} +0 -0
- /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
- {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/WHEEL +0 -0
- {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/licenses/LICENSE +0 -0
smftools/config/direct.yaml
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
# Direct (Nanopore modified base calling) footprinting defaults
|
|
2
2
|
extends: default
|
|
3
|
+
|
|
4
|
+
######## smftools load params #########
|
|
3
5
|
filter_threshold: 0.8 # min threshold to call a canonical base
|
|
4
6
|
m6A_threshold: 0.7 # min threshold to call a modified m6a base
|
|
5
7
|
m5C_threshold: 0.7 # min threshold to call a modified 5mC base
|
|
@@ -12,6 +14,28 @@ thresholds:
|
|
|
12
14
|
mod_list:
|
|
13
15
|
- '5mC_5hmC'
|
|
14
16
|
- '6mA' # mods to detect
|
|
17
|
+
mod_target_bases:
|
|
18
|
+
- "A"
|
|
19
|
+
enzyme_target_bases:
|
|
20
|
+
- "A"
|
|
15
21
|
batch_size: 4 # How many mod TSVs to load into memory at a time when making anndata batches
|
|
16
22
|
skip_unclassified: True # Whether to skip unclassified barcodes
|
|
17
|
-
delete_batch_hdfs: True # Whether to delete intermediate barcode level hdfs after making final anndata
|
|
23
|
+
delete_batch_hdfs: True # Whether to delete intermediate barcode level hdfs after making final anndata
|
|
24
|
+
|
|
25
|
+
######## smftools preprocess params ########
|
|
26
|
+
fit_position_methylation_thresholds: False # Whether to use Youden J-stat to determine position by positions thresholds for modification binarization.
|
|
27
|
+
binarize_on_fixed_methlyation_threshold: 0.7 # The threshold used to binarize the anndata using a fixed value if fitting parameter above is False.
|
|
28
|
+
positive_control_sample_methylation_fitting: null # A positive control Sample_name to use for fully modified template data
|
|
29
|
+
negative_control_sample_methylation_fitting: null # A negative control Sample_name to use for fully unmodified template data
|
|
30
|
+
infer_on_percentile_sample_methylation_fitting: 10 # If a positive/negative control are not provided and fitting the data is requested, use the indicated percentile windows from the top and bottom of the dataset.
|
|
31
|
+
inference_variable_sample_methylation_fitting: "Raw_modification_signal" # The obs column value used for the percentile metric above.
|
|
32
|
+
fit_j_threshold: 0.5 # The J-statistic threshold to use for determining which positions pass qc for mod detection thresholding
|
|
33
|
+
output_binary_layer_name: "binarized_methylation" # The layer to store the binarized methylation data in
|
|
34
|
+
|
|
35
|
+
######## smftools spatial params #########
|
|
36
|
+
autocorr_site_types:
|
|
37
|
+
- "A"
|
|
38
|
+
|
|
39
|
+
######## smftools hmm params #########
|
|
40
|
+
hmm_methbases:
|
|
41
|
+
- "A"
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Dict, List, Any, Iterable, Union
|
|
5
|
+
|
|
6
|
+
def discover_input_files(
    input_data_path: Union[str, Path],
    bam_suffix: str = ".bam",
    recursive: bool = False,
    follow_symlinks: bool = False,
) -> Dict[str, Any]:
    """
    Discover input files under `input_data_path`.

    Parameters
    ----------
    input_data_path : str | Path
        A single file or a directory to search.
    bam_suffix : str
        Suffix treated as a BAM file (leading dot optional; compared case-insensitively).
    recursive : bool
        When `input_data_path` is a directory, scan the entire tree instead of
        only its immediate children.
    follow_symlinks : bool
        Follow symlinks during recursive scans (honored on Python 3.13+ only;
        silently ignored on older interpreters).

    Returns a dict with:
      - pod5_paths, fast5_paths, fastq_paths, bam_paths, h5ad_paths, other_paths (lists of Path)
      - input_is_pod5, input_is_fast5, input_is_fastq, input_is_bam, input_is_h5ad (bools)
      - all_files_searched (int)

    Raises
    ------
    FileNotFoundError
        If `input_data_path` does not exist.

    Behavior:
      - If `input_data_path` is a file, returns that single file categorized.
      - If a directory, scans immediate children (recursive=False) or entire tree (recursive=True).
      - Handles multi-suffix files like .fastq.gz, .fq.xz, etc.
    """
    p = Path(input_data_path)

    # normalize bam suffix with a leading dot and lower-case for comparison
    if not bam_suffix.startswith("."):
        bam_suffix = "." + bam_suffix
    bam_suffix = bam_suffix.lower()

    # Sets of canonical extension keys we'll compare against
    pod5_exts = {".pod5", ".p5"}
    fast5_exts = {".fast5", ".f5"}
    fastq_exts = {".fastq", ".fq", ".fastq.gz", ".fq.gz", ".fastq.bz2", ".fq.bz2", ".fastq.xz", ".fq.xz", ".fastq.zst", ".fq.zst"}
    h5ad_exts = {".h5ad", ".h5"}
    compressed_exts = {".gz", ".bz2", ".xz", ".zst"}

    def ext_key(pp: Path) -> str:
        """
        A robust extension key: last suffix, or last two if the final one is a
        compressor (.gz/.bz2/.xz/.zst).
        Examples:
            a.fastq.gz -> ".fastq.gz"
            a.fq.xz    -> ".fq.xz"
            a.bam      -> ".bam"
            a          -> ""
        """
        suff = [s.lower() for s in pp.suffixes]
        if not suff:
            return ""
        if suff[-1] in compressed_exts and len(suff) >= 2:
            return suff[-2] + suff[-1]
        return suff[-1]

    pod5_paths: List[Path] = []
    fast5_paths: List[Path] = []
    fastq_paths: List[Path] = []
    bam_paths: List[Path] = []
    h5ad_paths: List[Path] = []
    other_paths: List[Path] = []

    def categorize_file(fp: Path) -> None:
        # Route a single file into exactly one bucket; bam_suffix is checked
        # after the fixed extension sets so a custom suffix never shadows them.
        key = ext_key(fp)
        if key in pod5_exts:
            pod5_paths.append(fp)
        elif key in fast5_exts:
            fast5_paths.append(fp)
        elif key in fastq_exts:
            fastq_paths.append(fp)
        elif key in h5ad_exts:
            h5ad_paths.append(fp)
        elif key == bam_suffix:
            bam_paths.append(fp)
        else:
            other_paths.append(fp)

    if not p.exists():
        raise FileNotFoundError(f"input_data_path does not exist: {input_data_path}")

    total_searched = 0

    if p.is_file():
        total_searched = 1
        categorize_file(p)
    else:
        # Directory scan
        if recursive:
            # Python 3.13+ supports follow_symlinks in glob/rglob. Fallback for older versions.
            try:
                iterator = p.rglob("*", follow_symlinks=follow_symlinks)  # type: ignore[call-arg]
            except TypeError:
                iterator = p.rglob("*")  # follow_symlinks not supported
        else:
            iterator = p.iterdir()

        for fp in iterator:
            if not fp.is_file():
                continue
            total_searched += 1
            categorize_file(fp)

    return {
        "pod5_paths": sorted(pod5_paths),
        "fast5_paths": sorted(fast5_paths),
        "fastq_paths": sorted(fastq_paths),
        "bam_paths": sorted(bam_paths),
        "h5ad_paths": sorted(h5ad_paths),
        "other_paths": sorted(other_paths),
        "input_is_pod5": len(pod5_paths) > 0,
        "input_is_fast5": len(fast5_paths) > 0,
        "input_is_fastq": len(fastq_paths) > 0,
        "input_is_bam": len(bam_paths) > 0,
        "input_is_h5ad": len(h5ad_paths) > 0,
        "all_files_searched": total_searched,
    }
|
|
@@ -6,6 +6,7 @@ import warnings
|
|
|
6
6
|
from dataclasses import dataclass, field, asdict
|
|
7
7
|
from pathlib import Path
|
|
8
8
|
from typing import Any, Dict, List, Optional, Tuple, Union, IO, Sequence
|
|
9
|
+
from .discover_input_files import discover_input_files
|
|
9
10
|
|
|
10
11
|
# Optional dependency for YAML handling
|
|
11
12
|
try:
|
|
@@ -593,7 +594,10 @@ class ExperimentConfig:
|
|
|
593
594
|
fasta: Optional[str] = None
|
|
594
595
|
bam_suffix: str = ".bam"
|
|
595
596
|
recursive_input_search: bool = True
|
|
597
|
+
input_type: Optional[str] = None
|
|
598
|
+
input_files: Optional[List[Path]] = None
|
|
596
599
|
split_dir: str = "demultiplexed_BAMs"
|
|
600
|
+
split_path: Optional[str] = None
|
|
597
601
|
strands: List[str] = field(default_factory=lambda: ["bottom", "top"])
|
|
598
602
|
conversions: List[str] = field(default_factory=lambda: ["unconverted"])
|
|
599
603
|
fasta_regions_of_interest: Optional[str] = None
|
|
@@ -601,11 +605,16 @@ class ExperimentConfig:
|
|
|
601
605
|
sample_sheet_mapping_column: Optional[str] = 'Barcode'
|
|
602
606
|
experiment_name: Optional[str] = None
|
|
603
607
|
input_already_demuxed: bool = False
|
|
608
|
+
summary_file: Optional[Path] = None
|
|
604
609
|
|
|
605
610
|
# FASTQ input specific
|
|
606
611
|
fastq_barcode_map: Optional[Dict[str, str]] = None
|
|
607
612
|
fastq_auto_pairing: bool = True
|
|
608
613
|
|
|
614
|
+
# Remove intermediate file options
|
|
615
|
+
delete_intermediate_bams: bool = True
|
|
616
|
+
delete_intermediate_tsvs: bool = True
|
|
617
|
+
|
|
609
618
|
# Conversion/Deamination file handling
|
|
610
619
|
delete_intermediate_hdfs: bool = True
|
|
611
620
|
|
|
@@ -645,6 +654,7 @@ class ExperimentConfig:
|
|
|
645
654
|
aligner: str = "minimap2"
|
|
646
655
|
aligner_args: Optional[List[str]] = None
|
|
647
656
|
make_bigwigs: bool = False
|
|
657
|
+
make_beds: bool = False
|
|
648
658
|
|
|
649
659
|
# Anndata structure
|
|
650
660
|
reference_column: Optional[str] = 'Reference_strand'
|
|
@@ -656,11 +666,21 @@ class ExperimentConfig:
|
|
|
656
666
|
|
|
657
667
|
# Preprocessing - Read length and quality filter params
|
|
658
668
|
read_coord_filter: Optional[Sequence[float]] = field(default_factory=lambda: [None, None])
|
|
659
|
-
read_len_filter_thresholds: Optional[Sequence[float]] = field(default_factory=lambda: [
|
|
660
|
-
read_len_to_ref_ratio_filter_thresholds: Optional[Sequence[float]] = field(default_factory=lambda: [0.4, 1.
|
|
661
|
-
read_quality_filter_thresholds: Optional[Sequence[float]] = field(default_factory=lambda: [
|
|
669
|
+
read_len_filter_thresholds: Optional[Sequence[float]] = field(default_factory=lambda: [100, None])
|
|
670
|
+
read_len_to_ref_ratio_filter_thresholds: Optional[Sequence[float]] = field(default_factory=lambda: [0.4, 1.5])
|
|
671
|
+
read_quality_filter_thresholds: Optional[Sequence[float]] = field(default_factory=lambda: [15, None])
|
|
662
672
|
read_mapping_quality_filter_thresholds: Optional[Sequence[float]] = field(default_factory=lambda: [None, None])
|
|
663
673
|
|
|
674
|
+
# Preprocessing - Direct mod detection binarization params
|
|
675
|
+
fit_position_methylation_thresholds: Optional[bool] = False # Whether to use Youden J-stat to determine position by positions thresholds for modification binarization.
|
|
676
|
+
binarize_on_fixed_methlyation_threshold: Optional[float] = 0.7 # The threshold used to binarize the anndata using a fixed value if fitting parameter above is False.
|
|
677
|
+
positive_control_sample_methylation_fitting: Optional[str] = None # A positive control Sample_name to use for fully modified template data
|
|
678
|
+
negative_control_sample_methylation_fitting: Optional[str] = None # A negative control Sample_name to use for fully unmodified template data
|
|
679
|
+
infer_on_percentile_sample_methylation_fitting: Optional[int] = 10 # If a positive/negative control are not provided and fitting the data is requested, use the indicated percentile windows from the top and bottom of the dataset.
|
|
680
|
+
inference_variable_sample_methylation_fitting: Optional[str] = "Raw_modification_signal" # The obs column value used for the percentile metric above.
|
|
681
|
+
fit_j_threshold: Optional[float] = 0.5 # The J-statistic threshold to use for determining which positions pass qc for mod detection thresholding
|
|
682
|
+
output_binary_layer_name: Optional[str] = "binarized_methylation"
|
|
683
|
+
|
|
664
684
|
# Preprocessing - Read modification filter params
|
|
665
685
|
read_mod_filtering_gpc_thresholds: List[float] = field(default_factory=lambda: [0.025, 0.975])
|
|
666
686
|
read_mod_filtering_cpg_thresholds: List[float] = field(default_factory=lambda: [0.00, 1])
|
|
@@ -680,7 +700,8 @@ class ExperimentConfig:
|
|
|
680
700
|
duplicate_detection_hierarchical_linkage: str = "average"
|
|
681
701
|
duplicate_detection_do_pca: bool = False
|
|
682
702
|
|
|
683
|
-
# Preprocessing -
|
|
703
|
+
# Preprocessing - Position QC
|
|
704
|
+
position_max_nan_threshold: float = 0.1
|
|
684
705
|
|
|
685
706
|
# Basic Analysis - Clustermap params
|
|
686
707
|
layer_for_clustermap_plotting: Optional[str] = 'nan0_0minus1'
|
|
@@ -718,6 +739,9 @@ class ExperimentConfig:
|
|
|
718
739
|
hmm_feature_sets: Dict[str, Any] = field(default_factory=dict)
|
|
719
740
|
hmm_merge_layer_features: Optional[List[Tuple]] = field(default_factory=lambda: [(None, 80)])
|
|
720
741
|
|
|
742
|
+
# Pipeline control flow - load adata
|
|
743
|
+
force_redo_load_adata: bool = False
|
|
744
|
+
|
|
721
745
|
# Pipeline control flow - preprocessing and QC
|
|
722
746
|
force_redo_preprocessing: bool = False
|
|
723
747
|
force_reload_sample_sheet: bool = True
|
|
@@ -860,6 +884,63 @@ class ExperimentConfig:
|
|
|
860
884
|
if merged.get("experiment_name") is None and date_str:
|
|
861
885
|
merged["experiment_name"] = f"{date_str}_SMF_experiment"
|
|
862
886
|
|
|
887
|
+
# Input file types and path handling
|
|
888
|
+
input_data_path = Path(merged['input_data_path'])
|
|
889
|
+
|
|
890
|
+
# Detect the input filetype
|
|
891
|
+
if input_data_path.is_file():
|
|
892
|
+
suffix = input_data_path.suffix.lower()
|
|
893
|
+
suffixes = [s.lower() for s in input_data_path.suffixes] # handles multi-part extensions
|
|
894
|
+
|
|
895
|
+
# recognize multi-suffix cases like .fastq.gz or .fq.gz
|
|
896
|
+
if any(s in ['.pod5', '.p5'] for s in suffixes):
|
|
897
|
+
input_type = "pod5"
|
|
898
|
+
input_files = [Path(input_data_path)]
|
|
899
|
+
elif any(s in ['.fast5', '.f5'] for s in suffixes):
|
|
900
|
+
input_type = "fast5"
|
|
901
|
+
input_files = [Path(input_data_path)]
|
|
902
|
+
elif any(s in ['.fastq', '.fq'] for s in suffixes):
|
|
903
|
+
input_type = "fastq"
|
|
904
|
+
input_files = [Path(input_data_path)]
|
|
905
|
+
elif any(s in ['.bam'] for s in suffixes):
|
|
906
|
+
input_type = "bam"
|
|
907
|
+
input_files = [Path(input_data_path)]
|
|
908
|
+
elif any(s in ['.h5ad', ".h5"] for s in suffixes):
|
|
909
|
+
input_type = "h5ad"
|
|
910
|
+
input_files = [Path(input_data_path)]
|
|
911
|
+
else:
|
|
912
|
+
print("Error detecting input file type")
|
|
913
|
+
|
|
914
|
+
elif input_data_path.is_dir():
|
|
915
|
+
found = discover_input_files(input_data_path, bam_suffix=merged["bam_suffix"], recursive=merged["recursive_input_search"])
|
|
916
|
+
|
|
917
|
+
if found["input_is_pod5"]:
|
|
918
|
+
input_type = "pod5"
|
|
919
|
+
input_files = found["pod5_paths"]
|
|
920
|
+
elif found["input_is_fast5"]:
|
|
921
|
+
input_type = "fast5"
|
|
922
|
+
input_files = found["fast5_paths"]
|
|
923
|
+
elif found["input_is_fastq"]:
|
|
924
|
+
input_type = "fastq"
|
|
925
|
+
input_files = found["fastq_paths"]
|
|
926
|
+
elif found["input_is_bam"]:
|
|
927
|
+
input_type = "bam"
|
|
928
|
+
input_files = found["bam_paths"]
|
|
929
|
+
elif found["input_is_h5ad"]:
|
|
930
|
+
input_type = "h5ad"
|
|
931
|
+
input_files = found["h5ad_paths"]
|
|
932
|
+
|
|
933
|
+
print(f"Found {found['all_files_searched']} files; fastq={len(found['fastq_paths'])}, bam={len(found['bam_paths'])}, pod5={len(found['pod5_paths'])}, fast5={len(found['fast5_paths'])}, h5ad={len(found['h5ad_paths'])}")
|
|
934
|
+
|
|
935
|
+
# summary file output path
|
|
936
|
+
output_dir = Path(merged['output_directory'])
|
|
937
|
+
summary_file_basename = merged["experiment_name"] + '_output_summary.csv'
|
|
938
|
+
summary_file = output_dir / summary_file_basename
|
|
939
|
+
|
|
940
|
+
# Demultiplexing output path
|
|
941
|
+
split_dir = merged.get("split_dir", "demultiplexed_BAMs")
|
|
942
|
+
split_path = output_dir / split_dir
|
|
943
|
+
|
|
863
944
|
# final normalization
|
|
864
945
|
if "strands" in merged:
|
|
865
946
|
merged["strands"] = _parse_list(merged["strands"])
|
|
@@ -936,13 +1017,15 @@ class ExperimentConfig:
|
|
|
936
1017
|
hmm_methbases = list(hmm_methbases)
|
|
937
1018
|
hmm_merge_layer_features = _parse_list(merged.get("hmm_merge_layer_features", None))
|
|
938
1019
|
|
|
939
|
-
|
|
940
1020
|
# instantiate dataclass
|
|
941
1021
|
instance = cls(
|
|
942
1022
|
smf_modality = merged.get("smf_modality"),
|
|
943
|
-
input_data_path =
|
|
1023
|
+
input_data_path = input_data_path,
|
|
944
1024
|
recursive_input_search = merged.get("recursive_input_search"),
|
|
945
|
-
|
|
1025
|
+
input_type = input_type,
|
|
1026
|
+
input_files = input_files,
|
|
1027
|
+
output_directory = output_dir,
|
|
1028
|
+
summary_file = summary_file,
|
|
946
1029
|
fasta = merged.get("fasta"),
|
|
947
1030
|
sequencer = merged.get("sequencer"),
|
|
948
1031
|
model_dir = merged.get("model_dir"),
|
|
@@ -950,7 +1033,8 @@ class ExperimentConfig:
|
|
|
950
1033
|
fastq_barcode_map = merged.get("fastq_barcode_map"),
|
|
951
1034
|
fastq_auto_pairing = merged.get("fastq_auto_pairing"),
|
|
952
1035
|
bam_suffix = merged.get("bam_suffix", ".bam"),
|
|
953
|
-
split_dir =
|
|
1036
|
+
split_dir = split_dir,
|
|
1037
|
+
split_path = split_path,
|
|
954
1038
|
strands = merged.get("strands", ["bottom","top"]),
|
|
955
1039
|
conversions = merged.get("conversions", ["unconverted"]),
|
|
956
1040
|
fasta_regions_of_interest = merged.get("fasta_regions_of_interest"),
|
|
@@ -963,14 +1047,17 @@ class ExperimentConfig:
|
|
|
963
1047
|
threads = merged.get("threads"),
|
|
964
1048
|
sample_sheet_path = merged.get("sample_sheet_path"),
|
|
965
1049
|
sample_sheet_mapping_column = merged.get("sample_sheet_mapping_column"),
|
|
1050
|
+
delete_intermediate_bams = merged.get("delete_intermediate_bams", True),
|
|
1051
|
+
delete_intermediate_tsvs = merged.get("delete_intermediate_tsvs", True),
|
|
966
1052
|
aligner = merged.get("aligner", "minimap2"),
|
|
967
1053
|
aligner_args = merged.get("aligner_args", None),
|
|
968
1054
|
device = merged.get("device", "auto"),
|
|
969
1055
|
make_bigwigs = merged.get("make_bigwigs", False),
|
|
1056
|
+
make_beds = merged.get("make_beds", False),
|
|
970
1057
|
delete_intermediate_hdfs = merged.get("delete_intermediate_hdfs", True),
|
|
971
1058
|
mod_target_bases = merged.get("mod_target_bases", ["GpC","CpG"]),
|
|
972
1059
|
enzyme_target_bases = merged.get("enzyme_target_bases", ["GpC"]),
|
|
973
|
-
conversion_types = merged.get("conversion_types", ["5mC"]),
|
|
1060
|
+
conversion_types = merged.get("conversions", ["unconverted"]) + merged.get("conversion_types", ["5mC"]),
|
|
974
1061
|
filter_threshold = merged.get("filter_threshold", 0.8),
|
|
975
1062
|
m6A_threshold = merged.get("m6A_threshold", 0.7),
|
|
976
1063
|
m5C_threshold = merged.get("m5C_threshold", 0.7),
|
|
@@ -983,6 +1070,14 @@ class ExperimentConfig:
|
|
|
983
1070
|
reference_column = merged.get("reference_column", 'Reference_strand'),
|
|
984
1071
|
sample_column = merged.get("sample_column", 'Barcode'),
|
|
985
1072
|
sample_name_col_for_plotting = merged.get("sample_name_col_for_plotting", 'Barcode'),
|
|
1073
|
+
fit_position_methylation_thresholds = merged.get("fit_position_methylation_thresholds", False),
|
|
1074
|
+
binarize_on_fixed_methlyation_threshold = merged.get("binarize_on_fixed_methlyation_threshold", 0.7),
|
|
1075
|
+
positive_control_sample_methylation_fitting = merged.get("positive_control_sample_methylation_fitting", None),
|
|
1076
|
+
negative_control_sample_methylation_fitting = merged.get("negative_control_sample_methylation_fitting", None),
|
|
1077
|
+
infer_on_percentile_sample_methylation_fitting = merged.get("infer_on_percentile_sample_methylation_fitting", 10),
|
|
1078
|
+
inference_variable_sample_methylation_fitting = merged.get("inference_variable_sample_methylation_fitting", "Raw_modification_signal"),
|
|
1079
|
+
fit_j_threshold = merged.get("fit_j_threshold", 0.5),
|
|
1080
|
+
output_binary_layer_name = merged.get("output_binary_layer_name", "binarized_methylation"),
|
|
986
1081
|
layer_for_clustermap_plotting = merged.get("layer_for_clustermap_plotting", 'nan0_0minus1'),
|
|
987
1082
|
layer_for_umap_plotting = merged.get("layer_for_umap_plotting", 'nan_half'),
|
|
988
1083
|
umap_layers_to_plot = merged.get("umap_layers_to_plot",["mapped_length", 'Raw_modification_signal']),
|
|
@@ -1008,9 +1103,9 @@ class ExperimentConfig:
|
|
|
1008
1103
|
accessible_patches = merged.get("accessible_patches", None),
|
|
1009
1104
|
cpg = merged.get("cpg", None),
|
|
1010
1105
|
read_coord_filter = merged.get("read_coord_filter", [None, None]),
|
|
1011
|
-
read_len_filter_thresholds = merged.get("read_len_filter_thresholds", [
|
|
1012
|
-
read_len_to_ref_ratio_filter_thresholds = merged.get("read_len_to_ref_ratio_filter_thresholds", [0.
|
|
1013
|
-
read_quality_filter_thresholds = merged.get("read_quality_filter_thresholds", [
|
|
1106
|
+
read_len_filter_thresholds = merged.get("read_len_filter_thresholds", [100, None]),
|
|
1107
|
+
read_len_to_ref_ratio_filter_thresholds = merged.get("read_len_to_ref_ratio_filter_thresholds", [0.3, None]),
|
|
1108
|
+
read_quality_filter_thresholds = merged.get("read_quality_filter_thresholds", [15, None]),
|
|
1014
1109
|
read_mapping_quality_filter_thresholds = merged.get("read_mapping_quality_filter_thresholds", [None, None]),
|
|
1015
1110
|
read_mod_filtering_gpc_thresholds = merged.get("read_mod_filtering_gpc_thresholds", [0.025, 0.975]),
|
|
1016
1111
|
read_mod_filtering_cpg_thresholds = merged.get("read_mod_filtering_cpg_thresholds", [0.0, 1.0]),
|
|
@@ -1026,10 +1121,12 @@ class ExperimentConfig:
|
|
|
1026
1121
|
duplicate_detection_do_hierarchical = merged.get("duplicate_detection_do_hierarchical", True),
|
|
1027
1122
|
duplicate_detection_hierarchical_linkage = merged.get("duplicate_detection_hierarchical_linkage", "average"),
|
|
1028
1123
|
duplicate_detection_do_pca = merged.get("duplicate_detection_do_pca", False),
|
|
1124
|
+
position_max_nan_threshold = merged.get("position_max_nan_threshold", 0.1),
|
|
1029
1125
|
correlation_matrix_types = merged.get("correlation_matrix_types", ["pearson", "binary_covariance"]),
|
|
1030
1126
|
correlation_matrix_cmaps = merged.get("correlation_matrix_cmaps", ["seismic", "viridis"]),
|
|
1031
1127
|
correlation_matrix_site_types = merged.get("correlation_matrix_site_types", ["GpC_site"]),
|
|
1032
1128
|
hamming_vs_metric_keys = merged.get("hamming_vs_metric_keys", ['Fraction_any_C_site_modified']),
|
|
1129
|
+
force_redo_load_adata = merged.get("force_redo_load_adata", False),
|
|
1033
1130
|
force_redo_preprocessing = merged.get("force_redo_preprocessing", False),
|
|
1034
1131
|
force_reload_sample_sheet = merged.get("force_reload_sample_sheet", True),
|
|
1035
1132
|
bypass_add_read_length_and_mapping_qc = merged.get("bypass_add_read_length_and_mapping_qc", False),
|
smftools/informatics/__init__.py
CHANGED
|
@@ -1,14 +1,20 @@
|
|
|
1
|
-
from . import
|
|
2
|
-
from .
|
|
3
|
-
from .
|
|
4
|
-
from .
|
|
5
|
-
from .
|
|
6
|
-
|
|
1
|
+
from .bam_functions import align_and_sort_BAM, bam_qc, concatenate_fastqs_to_bam, count_aligned_reads, demux_and_index_BAM, extract_base_identities, extract_read_features_from_bam, extract_readnames_from_bam, separate_bam_by_bc, split_and_index_BAM
|
|
2
|
+
from .basecalling import canoncall, modcall
|
|
3
|
+
from .bed_functions import aligned_BAM_to_bed, _bed_to_bigwig, extract_read_lengths_from_bed, _plot_bed_histograms
|
|
4
|
+
from .converted_BAM_to_adata import converted_BAM_to_adata
|
|
5
|
+
from .fasta_functions import find_conversion_sites, generate_converted_FASTA, get_chromosome_lengths, get_native_references, index_fasta, subsample_fasta_from_bed
|
|
6
|
+
from .h5ad_functions import add_demux_type_annotation, add_read_length_and_mapping_qc
|
|
7
|
+
from .modkit_functions import extract_mods, make_modbed, modQC
|
|
8
|
+
from .modkit_extract_to_adata import modkit_extract_to_adata
|
|
9
|
+
from .ohe import one_hot_encode, one_hot_decode, ohe_layers_decode, ohe_batching
|
|
10
|
+
from .pod5_functions import basecall_pod5s, fast5_to_pod5, subsample_pod5
|
|
11
|
+
from .run_multiqc import run_multiqc
|
|
7
12
|
|
|
8
13
|
__all__ = [
|
|
9
14
|
"basecall_pod5s",
|
|
15
|
+
"converted_BAM_to_adata",
|
|
10
16
|
"subsample_fasta_from_bed",
|
|
11
17
|
"subsample_pod5",
|
|
12
18
|
"fast5_to_pod5",
|
|
13
|
-
"
|
|
19
|
+
"run_multiqc"
|
|
14
20
|
]
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
import subprocess
|
|
3
|
+
from typing import Union, List
|
|
4
|
+
|
|
5
|
+
def fast5_to_pod5(
    fast5_dir: Union[str, Path, List[Union[str, Path]]],
    output_pod5: Union[str, Path] = "FAST5s_to_POD5.pod5"
) -> None:
    """
    Convert Nanopore FAST5 files (single file, list of files, or directory)
    into a single .pod5 output using the 'pod5 convert fast5' CLI tool.

    Parameters
    ----------
    fast5_dir:
        A FAST5 file, a list/tuple of FAST5 files, or a directory containing
        '*.fast5' files (immediate children only; not recursive).
    output_pod5:
        Path of the single .pod5 file to write.

    Raises
    ------
    FileNotFoundError
        If the input path does not exist, or a directory holds no FAST5 files.
    subprocess.CalledProcessError
        If the 'pod5' CLI exits with a non-zero status (check=True).
    """
    output_pod5 = str(output_pod5)  # ensure string

    def _convert(fast5_paths: List[str]) -> None:
        # Single CLI invocation for all inputs; argv-list form avoids shell quoting.
        cmd = ["pod5", "convert", "fast5", *fast5_paths, "--output", output_pod5]
        subprocess.run(cmd, check=True)

    # 1) If user gives a list of FAST5 files
    if isinstance(fast5_dir, (list, tuple)):
        _convert([str(Path(f)) for f in fast5_dir])
        return

    # Ensure Path object
    p = Path(fast5_dir)

    # 2) If user gives a single file
    if p.is_file():
        _convert([str(p)])
        return

    # 3) If user gives a directory -> collect FAST5s
    if p.is_dir():
        fast5_paths = sorted(str(f) for f in p.glob("*.fast5"))
        if not fast5_paths:
            raise FileNotFoundError(f"No FAST5 files found in {p}")
        _convert(fast5_paths)
        return

    raise FileNotFoundError(f"Input path invalid: {fast5_dir}")
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
# from .align_and_sort_BAM import align_and_sort_BAM
|
|
2
|
+
# from .aligned_BAM_to_bed import aligned_BAM_to_bed
|
|
3
|
+
# from .bam_qc import bam_qc
|
|
4
|
+
# from .bed_to_bigwig import bed_to_bigwig
|
|
5
|
+
# from .binarize_converted_base_identities import binarize_converted_base_identities
|
|
6
|
+
# from .canoncall import canoncall
|
|
7
|
+
# from .complement_base_list import complement_base_list
|
|
8
|
+
# from .converted_BAM_to_adata_II import converted_BAM_to_adata_II
|
|
9
|
+
# from .concatenate_fastqs_to_bam import concatenate_fastqs_to_bam
|
|
10
|
+
# from .count_aligned_reads import count_aligned_reads
|
|
11
|
+
# from .demux_and_index_BAM import demux_and_index_BAM
|
|
12
|
+
# from .discover_input_files import *
|
|
13
|
+
# from .extract_base_identities import extract_base_identities
|
|
14
|
+
# from .extract_mods import extract_mods
|
|
15
|
+
# from .extract_read_features_from_bam import extract_read_features_from_bam
|
|
16
|
+
# from .extract_read_lengths_from_bed import extract_read_lengths_from_bed
|
|
17
|
+
# from .extract_readnames_from_BAM import extract_readnames_from_BAM
|
|
18
|
+
# from .find_conversion_sites import find_conversion_sites
|
|
19
|
+
# from .generate_converted_FASTA import convert_FASTA_record, generate_converted_FASTA
|
|
20
|
+
# from .get_chromosome_lengths import get_chromosome_lengths
|
|
21
|
+
# from .get_native_references import get_native_references
|
|
22
|
+
# from .index_fasta import index_fasta
|
|
23
|
+
# from .make_modbed import make_modbed
|
|
24
|
+
# from .modcall import modcall
|
|
25
|
+
# from .modkit_extract_to_adata import modkit_extract_to_adata
|
|
26
|
+
# from .modQC import modQC
|
|
27
|
+
# from .one_hot_encode import one_hot_encode
|
|
28
|
+
# from .ohe_batching import ohe_batching
|
|
29
|
+
# from .one_hot_decode import one_hot_decode
|
|
30
|
+
# from .ohe_layers_decode import ohe_layers_decode
|
|
31
|
+
# from .plot_bed_histograms import plot_bed_histograms
|
|
32
|
+
# from .run_multiqc import run_multiqc
|
|
33
|
+
# from .separate_bam_by_bc import separate_bam_by_bc
|
|
34
|
+
# from .split_and_index_BAM import split_and_index_BAM
|
|
35
|
+
|
|
36
|
+
# __all__ = [
|
|
37
|
+
# "align_and_sort_BAM",
|
|
38
|
+
# "aligned_BAM_to_bed",
|
|
39
|
+
# "bam_qc",
|
|
40
|
+
# "bed_to_bigwig",
|
|
41
|
+
# "binarize_converted_base_identities",
|
|
42
|
+
# "canoncall",
|
|
43
|
+
# "complement_base_list",
|
|
44
|
+
# "converted_BAM_to_adata_II",
|
|
45
|
+
# "concatenate_fastqs_to_bam",
|
|
46
|
+
# "count_aligned_reads",
|
|
47
|
+
# "demux_and_index_BAM",
|
|
48
|
+
# "extract_base_identities",
|
|
49
|
+
# "extract_mods",
|
|
50
|
+
# "extract_read_features_from_bam",
|
|
51
|
+
# "extract_read_lengths_from_bed",
|
|
52
|
+
# "extract_readnames_from_BAM",
|
|
53
|
+
# "find_conversion_sites",
|
|
54
|
+
# "convert_FASTA_record",
|
|
55
|
+
# "generate_converted_FASTA",
|
|
56
|
+
# "get_chromosome_lengths",
|
|
57
|
+
# "get_native_references",
|
|
58
|
+
# "index_fasta",
|
|
59
|
+
# "make_modbed",
|
|
60
|
+
# "modcall",
|
|
61
|
+
# "modkit_extract_to_adata",
|
|
62
|
+
# "modQC",
|
|
63
|
+
# "one_hot_encode",
|
|
64
|
+
# "ohe_batching",
|
|
65
|
+
# "one_hot_decode",
|
|
66
|
+
# "ohe_layers_decode",
|
|
67
|
+
# "plot_bed_histograms",
|
|
68
|
+
# "run_multiqc",
|
|
69
|
+
# "separate_bam_by_bc",
|
|
70
|
+
# "split_and_index_BAM"
|
|
71
|
+
# ]
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
import os
|
|
3
|
+
import subprocess
|
|
4
|
+
from typing import List, Optional, Union
|
|
5
|
+
import pysam
|
|
6
|
+
|
|
7
|
+
def _bam_to_fastq_with_pysam(bam_path: Union[str, Path], fastq_path: Union[str, Path]) -> None:
    """
    Write every record of a BAM out as FASTQ using pysam.

    Reads are emitted as stored in the BAM (unmapped/unaligned included);
    missing sequence or quality strings are written as empty fields.
    """
    src = str(bam_path)
    dst = str(fastq_path)
    with pysam.AlignmentFile(src, "rb", check_sq=False) as bam, open(dst, "w") as fq:
        for read in bam.fetch(until_eof=True):
            # Secondary/supplementary records are intentionally kept.
            # To drop them instead:
            # if read.is_secondary or read.is_supplementary: continue
            record = "@{}\n{}\n+\n{}\n".format(
                read.query_name,
                read.query_sequence or "",
                read.qual or "",
            )
            fq.write(record)
|
|
21
|
+
|
|
22
|
+
def _sort_bam_with_pysam(in_bam: Union[str, Path], out_bam: Union[str, Path], threads: Optional[int] = None) -> None:
    """Coordinate-sort *in_bam* into *out_bam* via pysam's samtools-sort wrapper."""
    sort_args = ["-@", str(threads)] if threads else []
    sort_args.extend(["-o", str(out_bam), str(in_bam)])
    pysam.sort(*sort_args)
|
|
29
|
+
|
|
30
|
+
def _index_bam_with_pysam(bam_path: Union[str, Path], threads: Optional[int] = None) -> None:
    """Build a BAM index for *bam_path*; forwards -@ when a thread count is given."""
    # pysam.index accepts samtools-style CLI arguments.
    index_args = (["-@", str(threads)] if threads else []) + [str(bam_path)]
    pysam.index(*index_args)
|
|
37
|
+
|
|
38
|
+
def align_and_sort_BAM(fasta,
                       input,
                       bam_suffix='.bam',
                       output_directory='aligned_outputs',
                       make_bigwigs=False,
                       threads=None,
                       aligner='minimap2',
                       aligner_args=['-a', '-x', 'map-ont', '--MD', '-Y', '-y', '-N', '5', '--secondary=no']):
    """
    Align basecalled reads to a reference and produce a coordinate-sorted,
    indexed BAM.

    Parameters:
        fasta (str | Path): Reference FASTA to align against.
        input (str | Path): Basecalled input. A BAM for either aligner; for
            minimap2, a FASTQ (.fastq/.fq) is used directly without conversion.
        bam_suffix (str): Suffix to use for the output BAM files.
        output_directory (str | Path): Directory for the aligned outputs.
        make_bigwigs (bool): Accepted for interface compatibility; not used
            by this function.
        threads (int | None): Thread count forwarded to the aligner and to
            the sort/index steps.
        aligner (str): Aligner to use: 'minimap2' or 'dorado'.
        aligner_args (list): Extra CLI arguments for the chosen aligner.
            NOTE(review): mutable default — never mutated here; kept for
            backward compatibility.

    Returns:
        None. Writes <stem>_aligned<bam_suffix> and
        <stem>_aligned_sorted<bam_suffix> (plus its index) into
        output_directory. Returns early (None) for an unrecognized aligner.
    """
    input = Path(input)
    # Default output_directory is a str; coerce so the '/' join below works.
    output_directory = Path(output_directory)

    output_path_minus_suffix = output_directory / input.stem
    aligned_BAM = output_path_minus_suffix.with_name(output_path_minus_suffix.stem + "_aligned")
    aligned_output = aligned_BAM.with_suffix(bam_suffix)
    aligned_sorted_BAM = aligned_BAM.with_name(aligned_BAM.stem + "_sorted")
    aligned_sorted_output = aligned_sorted_BAM.with_suffix(bam_suffix)

    # CLI tools take the thread count as a string.
    threads = str(threads) if threads else None

    if aligner == 'minimap2':
        if input.suffix.lower() in ('.fastq', '.fq'):
            # Input is already FASTQ: no BAM->FASTQ conversion needed.
            fastq_input = input
            cleanup_fastq = False
        else:
            fastq_input = input.with_name(input.stem + '.fastq')
            print(f"Converting BAM to FASTQ: {input}")
            _bam_to_fastq_with_pysam(input, fastq_input)
            cleanup_fastq = True

        print(f"Aligning FASTQ to Reference: {fastq_input}")
        minimap_command = ['minimap2'] + aligner_args
        if threads:
            minimap_command += ['-t', threads]
        minimap_command += [str(fasta), str(fastq_input)]
        # SAM output is captured via stdout redirection; the context manager
        # guarantees the handle is closed (the previous open() leaked it).
        with open(aligned_output, "w") as out_fh:
            subprocess.run(minimap_command, stdout=out_fh)
        if cleanup_fastq:
            # Remove only the temporary FASTQ we created, never user input.
            os.remove(fastq_input)

    elif aligner == 'dorado':
        print(f"Aligning BAM to Reference: {input}")
        alignment_command = ["dorado", "aligner"]
        if threads:
            alignment_command += ["-t", threads]
        alignment_command += aligner_args + [str(fasta), str(input)]
        with open(aligned_output, "wb") as out_fh:
            subprocess.run(alignment_command, stdout=out_fh)

    else:
        print(f'Aligner not recognized: {aligner}. Choose from minimap2 and dorado')
        return

    # --- Sort & Index with pysam ---
    print(f"[pysam] Sorting: {aligned_output} -> {aligned_sorted_output}")
    _sort_bam_with_pysam(aligned_output, aligned_sorted_output, threads=threads)

    print(f"[pysam] Indexing: {aligned_sorted_output}")
    _index_bam_with_pysam(aligned_sorted_output, threads=threads)
|