smftools 0.2.1__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +2 -6
- smftools/_version.py +1 -1
- smftools/cli/__init__.py +0 -0
- smftools/cli/archived/cli_flows.py +94 -0
- smftools/cli/helpers.py +48 -0
- smftools/cli/hmm_adata.py +361 -0
- smftools/cli/load_adata.py +637 -0
- smftools/cli/preprocess_adata.py +455 -0
- smftools/cli/spatial_adata.py +697 -0
- smftools/cli_entry.py +434 -0
- smftools/config/conversion.yaml +18 -6
- smftools/config/deaminase.yaml +18 -11
- smftools/config/default.yaml +151 -36
- smftools/config/direct.yaml +28 -1
- smftools/config/discover_input_files.py +115 -0
- smftools/config/experiment_config.py +225 -27
- smftools/hmm/HMM.py +12 -1
- smftools/hmm/__init__.py +0 -6
- smftools/hmm/archived/call_hmm_peaks.py +106 -0
- smftools/hmm/call_hmm_peaks.py +318 -90
- smftools/informatics/__init__.py +13 -7
- smftools/informatics/archived/fast5_to_pod5.py +43 -0
- smftools/informatics/archived/helpers/archived/__init__.py +71 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
- smftools/informatics/{helpers → archived/helpers/archived}/aligned_BAM_to_bed.py +6 -4
- smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
- smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
- smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
- smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +1 -1
- smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
- smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
- smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
- smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
- smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
- smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
- smftools/informatics/{helpers → archived/helpers/archived}/plot_bed_histograms.py +0 -19
- smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +6 -5
- smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +7 -7
- smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
- smftools/informatics/bam_functions.py +811 -0
- smftools/informatics/basecalling.py +67 -0
- smftools/informatics/bed_functions.py +366 -0
- smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +42 -30
- smftools/informatics/fasta_functions.py +255 -0
- smftools/informatics/h5ad_functions.py +197 -0
- smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +142 -59
- smftools/informatics/modkit_functions.py +129 -0
- smftools/informatics/ohe.py +160 -0
- smftools/informatics/pod5_functions.py +224 -0
- smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
- smftools/plotting/autocorrelation_plotting.py +1 -3
- smftools/plotting/general_plotting.py +1084 -363
- smftools/plotting/position_stats.py +3 -3
- smftools/preprocessing/__init__.py +4 -4
- smftools/preprocessing/append_base_context.py +35 -26
- smftools/preprocessing/append_binary_layer_by_base_context.py +6 -6
- smftools/preprocessing/binarize.py +17 -0
- smftools/preprocessing/binarize_on_Youden.py +11 -9
- smftools/preprocessing/calculate_complexity_II.py +1 -1
- smftools/preprocessing/calculate_coverage.py +16 -13
- smftools/preprocessing/calculate_position_Youden.py +42 -26
- smftools/preprocessing/calculate_read_modification_stats.py +2 -2
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +1 -1
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +20 -20
- smftools/preprocessing/flag_duplicate_reads.py +2 -2
- smftools/preprocessing/invert_adata.py +1 -1
- smftools/preprocessing/load_sample_sheet.py +1 -1
- smftools/preprocessing/reindex_references_adata.py +37 -0
- smftools/readwrite.py +360 -140
- {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/METADATA +26 -19
- smftools-0.2.4.dist-info/RECORD +176 -0
- smftools-0.2.4.dist-info/entry_points.txt +2 -0
- smftools/cli.py +0 -184
- smftools/informatics/fast5_to_pod5.py +0 -24
- smftools/informatics/helpers/__init__.py +0 -73
- smftools/informatics/helpers/align_and_sort_BAM.py +0 -86
- smftools/informatics/helpers/bam_qc.py +0 -66
- smftools/informatics/helpers/bed_to_bigwig.py +0 -39
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -378
- smftools/informatics/helpers/discover_input_files.py +0 -100
- smftools/informatics/helpers/index_fasta.py +0 -12
- smftools/informatics/helpers/make_dirs.py +0 -21
- smftools/informatics/readwrite.py +0 -106
- smftools/informatics/subsample_fasta_from_bed.py +0 -47
- smftools/load_adata.py +0 -1346
- smftools-0.2.1.dist-info/RECORD +0 -161
- smftools-0.2.1.dist-info/entry_points.txt +0 -2
- /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
- /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
- /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
- /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
- /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
- /smftools/informatics/{helpers/binarize_converted_base_identities.py → binarize_converted_base_identities.py} +0 -0
- /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
- /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archives/add_read_length_and_mapping_qc.py} +0 -0
- /smftools/preprocessing/{calculate_complexity.py → archives/calculate_complexity.py} +0 -0
- {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/WHEEL +0 -0
- {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
import os
|
|
3
|
+
import subprocess
|
|
4
|
+
from typing import List, Optional, Union
|
|
5
|
+
import pysam
|
|
6
|
+
|
|
7
|
+
def _bam_to_fastq_with_pysam(bam_path: Union[str, Path], fastq_path: Union[str, Path]) -> None:
    """
    Minimal BAM->FASTQ using pysam.

    Streams every primary record of *bam_path* into *fastq_path* as a
    4-line FASTQ entry. Secondary and supplementary alignments are skipped:
    they duplicate the primary read's name (and often carry a clipped or
    empty sequence), which would corrupt the FASTQ for re-alignment.

    Parameters:
        bam_path: Input BAM (may be unaligned; header SQ lines not required).
        fastq_path: Output FASTQ path (overwritten).

    Returns:
        None. Writes *fastq_path*.
    """
    bam_path = str(bam_path)
    fastq_path = str(fastq_path)
    # check_sq=False allows unaligned BAMs with no @SQ header lines.
    with pysam.AlignmentFile(bam_path, "rb", check_sq=False) as bam, open(fastq_path, "w") as fq:
        for r in bam.fetch(until_eof=True):
            # Skip duplicate records of the same read.
            if r.is_secondary or r.is_supplementary:
                continue
            name = r.query_name
            seq = r.query_sequence or ""
            qual = r.qual or ""
            # NOTE(review): reverse-strand reads are emitted in reference
            # orientation here; downstream aligners handle both strands,
            # so this only matters if the FASTQ is used as raw reads.
            fq.write(f"@{name}\n{seq}\n+\n{qual}\n")
|
|
21
|
+
|
|
22
|
+
def _sort_bam_with_pysam(in_bam: Union[str, Path], out_bam: Union[str, Path], threads: Optional[int] = None) -> None:
    """
    Coordinate-sort *in_bam* into *out_bam* via pysam's samtools wrapper.

    Parameters:
        in_bam: BAM to sort.
        out_bam: Destination path for the sorted BAM.
        threads: Optional extra threads, forwarded as ``samtools sort -@``.
    """
    sort_argv: List[str] = []
    if threads:
        sort_argv.extend(["-@", str(threads)])
    sort_argv.extend(["-o", str(out_bam), str(in_bam)])
    pysam.sort(*sort_argv)
|
|
29
|
+
|
|
30
|
+
def _index_bam_with_pysam(bam_path: Union[str, Path], threads: Optional[int] = None) -> None:
    """
    Build a BAM index for *bam_path* using ``pysam.index``.

    Parameters:
        bam_path: BAM to index.
        threads: Optional extra threads (``samtools index -@`` style).
    """
    # pysam.index accepts samtools-style positional arguments.
    index_argv: List[str] = ["-@", str(threads)] if threads else []
    index_argv.append(str(bam_path))
    pysam.index(*index_argv)
|
|
37
|
+
|
|
38
|
+
def align_and_sort_BAM(fasta,
                       input,
                       bam_suffix='.bam',
                       output_directory='aligned_outputs',
                       make_bigwigs=False,
                       threads=None,
                       aligner='minimap2',
                       aligner_args=None):
    """
    Align a basecalled BAM to a reference, then coordinate-sort and index it.

    For ``aligner='minimap2'`` the input BAM is first converted to a temporary
    FASTQ (removed afterwards); for ``aligner='dorado'`` the BAM is passed to
    ``dorado aligner`` directly. The aligner's stdout is captured into
    ``<stem>_aligned<bam_suffix>``, which is then sorted and indexed with pysam.

    Parameters:
        fasta (str | Path): Reference genome FASTA to align to.
        input (str | Path): Basecalled BAM file to align.
        bam_suffix (str): Suffix for the output BAM files.
        output_directory (str | Path): Directory for all outputs (created if missing).
        make_bigwigs (bool): Accepted for interface compatibility; not used here.
        threads (int | None): Extra threads for the aligner / samtools steps.
        aligner (str): 'minimap2' or 'dorado'.
        aligner_args (list | None): Extra aligner CLI arguments. Defaults to the
            minimap2 map-ont preset used previously.

    Raises:
        subprocess.CalledProcessError: If the aligner exits non-zero.

    Returns:
        None. Writes: 1) an aligned BAM, 2) an aligned+sorted BAM,
        3) an index for the aligned+sorted BAM.
    """
    if aligner_args is None:
        # Avoid a shared mutable default argument; same flags as before.
        aligner_args = ['-a', '-x', 'map-ont', '--MD', '-Y', '-y', '-N', '5', '--secondary=no']

    # Accept str or Path for the path-like parameters, as documented.
    input = Path(input)
    fasta = Path(fasta)
    output_directory = Path(output_directory)
    output_directory.mkdir(parents=True, exist_ok=True)

    input_as_fastq = input.with_name(input.stem + '.fastq')
    output_path_minus_suffix = output_directory / input.stem

    aligned_BAM = output_path_minus_suffix.with_name(output_path_minus_suffix.stem + "_aligned")
    aligned_output = aligned_BAM.with_suffix(bam_suffix)
    aligned_sorted_BAM = aligned_BAM.with_name(aligned_BAM.stem + "_sorted")
    aligned_sorted_output = aligned_sorted_BAM.with_suffix(bam_suffix)

    if aligner == 'minimap2':
        print(f"Converting BAM to FASTQ: {input}")
        _bam_to_fastq_with_pysam(input, input_as_fastq)
        print(f"Aligning FASTQ to Reference: {input_as_fastq}")
        minimap_command = ['minimap2'] + list(aligner_args)
        if threads:
            minimap_command += ['-t', str(threads)]
        minimap_command += [str(fasta), str(input_as_fastq)]
        # minimap2 writes SAM (text) to stdout; check=True surfaces failures
        # instead of silently sorting an empty/partial file.
        with open(aligned_output, "w") as out_fh:
            subprocess.run(minimap_command, stdout=out_fh, check=True)
        os.remove(input_as_fastq)

    elif aligner == 'dorado':
        print(f"Aligning BAM to Reference: {input}")
        alignment_command = ["dorado", "aligner"]
        if threads:
            alignment_command += ["-t", str(threads)]
        alignment_command += list(aligner_args) + [str(fasta), str(input)]
        # dorado aligner emits BAM (binary) on stdout.
        with open(aligned_output, "wb") as out_fh:
            subprocess.run(alignment_command, stdout=out_fh, check=True)

    else:
        print(f'Aligner not recognized: {aligner}. Choose from minimap2 and dorado')
        return

    # --- Sort & Index with pysam ---
    print(f"[pysam] Sorting: {aligned_output} -> {aligned_sorted_output}")
    _sort_bam_with_pysam(aligned_output, aligned_sorted_output, threads=threads)

    print(f"[pysam] Indexing: {aligned_sorted_output}")
    _index_bam_with_pysam(aligned_sorted_output, threads=threads)
|
|
@@ -15,22 +15,23 @@ def aligned_BAM_to_bed(aligned_BAM, out_dir, fasta, make_bigwigs, threads=None):
|
|
|
15
15
|
"""
|
|
16
16
|
import subprocess
|
|
17
17
|
import os
|
|
18
|
+
from pathlib import Path
|
|
18
19
|
import pysam
|
|
19
20
|
import numpy as np
|
|
20
21
|
import concurrent.futures
|
|
21
22
|
from concurrent.futures import ProcessPoolExecutor
|
|
22
23
|
from .bed_to_bigwig import bed_to_bigwig
|
|
23
|
-
from
|
|
24
|
+
from ...readwrite import make_dirs
|
|
24
25
|
from .plot_bed_histograms import plot_bed_histograms
|
|
25
26
|
|
|
26
27
|
threads = threads or os.cpu_count() # Use max available cores if not specified
|
|
27
28
|
|
|
28
29
|
# Create necessary directories
|
|
29
|
-
plotting_dir =
|
|
30
|
-
bed_dir =
|
|
30
|
+
plotting_dir = out_dir / "bed_cov_histograms"
|
|
31
|
+
bed_dir = out_dir / "beds"
|
|
31
32
|
make_dirs([plotting_dir, bed_dir])
|
|
32
33
|
|
|
33
|
-
bed_output =
|
|
34
|
+
bed_output = bed_dir / str(aligned_BAM.name).replace(".bam", "_bed.bed")
|
|
34
35
|
|
|
35
36
|
print(f"Creating BED-like file from BAM (with MAPQ and avg base quality): {aligned_BAM}")
|
|
36
37
|
|
|
@@ -64,6 +65,7 @@ def aligned_BAM_to_bed(aligned_BAM, out_dir, fasta, make_bigwigs, threads=None):
|
|
|
64
65
|
|
|
65
66
|
def split_bed(bed):
|
|
66
67
|
"""Splits into aligned and unaligned reads (chrom == '*')."""
|
|
68
|
+
bed = str(bed)
|
|
67
69
|
aligned = bed.replace(".bed", "_aligned.bed")
|
|
68
70
|
unaligned = bed.replace(".bed", "_unaligned.bed")
|
|
69
71
|
with open(bed, "r") as infile, open(aligned, "w") as aligned_out, open(unaligned, "w") as unaligned_out:
|
|
@@ -0,0 +1,213 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
6
|
+
from typing import Iterable, Optional, Tuple, List
|
|
7
|
+
|
|
8
|
+
def bam_qc(
    bam_files: Iterable[str | Path],
    bam_qc_dir: str | Path,
    threads: Optional[int],
    modality: str,
    stats: bool = True,
    flagstats: bool = True,
    idxstats: bool = True,
) -> None:
    """
    QC for BAM/CRAMs: samtools stats, flagstat, and idxstats reports.

    Prefers pysam; falls back to the ``samtools`` executable. Files are
    processed in parallel (up to ``threads`` workers; serial by default).
    Reports are written to ``bam_qc_dir`` as ``<stem>_stats.txt``,
    ``<stem>_flagstat.txt`` and ``<stem>_idxstats.txt``.

    Parameters:
        bam_files: BAM/CRAM paths to QC.
        bam_qc_dir: Output directory (created if missing).
        threads: Max files processed concurrently; None/0 means serial.
        modality: 'conversion' or 'direct'; currently informational only.
        stats / flagstats / idxstats: Which reports to generate.

    Raises:
        RuntimeError: If there is work to do but neither pysam nor a
            ``samtools`` executable is available.
    """
    import subprocess
    import shutil

    # Probe for pysam once; everything below falls back to samtools.
    try:
        import pysam
        have_pysam = True
    except Exception:
        have_pysam = False

    bam_qc_dir = Path(bam_qc_dir)
    bam_qc_dir.mkdir(parents=True, exist_ok=True)

    bam_files = [Path(b) for b in bam_files]

    # Fail fast, BEFORE spawning workers: previously this check lived inside
    # the per-file worker, so the RuntimeError was swallowed by the result
    # loop's generic except and merely printed.
    if bam_files and not have_pysam and not shutil.which("samtools"):
        raise RuntimeError("Neither pysam nor samtools is available in PATH.")

    def _has_index(p: Path) -> bool:
        # BAM indexes may be '<name>.bam.bai' (samtools default) or '<name>.bai'.
        # (The original checked '<name>.bam.bai' twice and never '<name>.bai'.)
        if p.suffix.lower() == ".bam":
            return Path(str(p) + ".bai").exists() or p.with_suffix(".bai").exists()
        if p.suffix.lower() == ".cram":
            return Path(str(p) + ".crai").exists()
        return False

    def _ensure_index(p: Path) -> None:
        # idxstats requires an index; stats/flagstat do not.
        if _has_index(p):
            return
        if have_pysam:
            # pysam.index supports both BAM & CRAM.
            pysam.index(str(p))
        else:
            subprocess.run(["samtools", "index", str(p)], check=True,
                           stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    def _write_report(tool: str, bam: Path, out_path: Path,
                      results: List[Tuple[str, int]]) -> None:
        # One QC report via the pysam wrapper when available, else samtools.
        pysam_fn = getattr(pysam, tool, None) if have_pysam else None
        if pysam_fn is not None:
            out_path.write_text(pysam_fn(str(bam)))
            results.append((f"{tool}(pysam)", 0))
        else:
            with open(out_path, "w") as fh:
                cp = subprocess.run(["samtools", tool, str(bam)],
                                    stdout=fh, stderr=subprocess.PIPE)
            results.append((f"{tool}(samtools)", cp.returncode))
            if cp.returncode != 0:
                raise RuntimeError(cp.stderr.decode(errors="replace"))

    def _run_one(bam: Path) -> Tuple[Path, List[Tuple[str, int]]]:
        # Returns (file, [(task_name, returncode), ...]) for the summary line.
        results: List[Tuple[str, int]] = []
        base = bam.stem  # filename without extension
        try:
            _ensure_index(bam)
        except Exception as e:
            # Still attempt the requested reports.
            print(f"[warn] Indexing failed for {bam}: {e}")
        if stats:
            _write_report("stats", bam, bam_qc_dir / f"{base}_stats.txt", results)
        if flagstats:
            _write_report("flagstat", bam, bam_qc_dir / f"{base}_flagstat.txt", results)
        if idxstats:
            _write_report("idxstats", bam, bam_qc_dir / f"{base}_idxstats.txt", results)
        return bam, results

    # Parallel across files; tasks within one file run serially.
    max_workers = int(threads) if threads and int(threads) > 0 else 1
    with ThreadPoolExecutor(max_workers=max_workers) as ex:
        futures = [ex.submit(_run_one, b) for b in bam_files]
        for fut in as_completed(futures):
            try:
                bam, res = fut.result()
                summary = ", ".join(f"{name}:{rc}" for name, rc in res) or "no-op"
                print(f"[qc] {bam.name}: {summary}")
            except Exception as e:
                print(f"[error] QC failed: {e}")

    # Modality currently does not change behavior; warn on unknown values.
    if modality not in {"conversion", "direct"}:
        print(f"[warn] Unknown modality '{modality}', continuing.")

    print("QC processing completed.")
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
import pybedtools
|
|
3
|
+
import pyBigWig
|
|
4
|
+
|
|
5
|
+
def bed_to_bigwig(fasta: str, bed: str) -> str:
    """
    BED → bedGraph → bigWig coverage track.

    Requires:
        - The FASTA's samtools index (``<fasta>.fai``) must exist next to
          the FASTA file.

    Parameters:
        fasta: Path to the reference FASTA.
        bed: Path to the input BED of reads.

    Returns:
        Path (str) of the bigWig, written next to the BED as ``<stem>.bw``.
    """

    bed = Path(bed)
    fa = Path(fasta)  # path to .fa
    parent = bed.parent
    stem = bed.stem

    # samtools faidx writes the index as '<fasta>.fai' ALONGSIDE the FASTA.
    # The previous code looked for '<fasta_stem>.fai' in the BED's directory,
    # which never matches the standard layout.
    fai = Path(str(fa) + ".fai")

    bedgraph = parent / f"{stem}.bedgraph"
    bigwig = parent / f"{stem}.bw"

    # 1) Compute coverage → bedGraph (bedtools genomecov -bg).
    print(f"[pybedtools] generating coverage bedgraph from {bed}")
    bt = pybedtools.BedTool(str(bed))
    bt.genome_coverage(bg=True, genome=str(fai)).saveas(str(bedgraph))

    # 2) Convert bedGraph → BigWig via pyBigWig.
    print(f"[pyBigWig] converting bedgraph → bigwig: {bigwig}")

    # Chrom sizes come from the first two .fai columns (name, length).
    chrom_sizes = {}
    with open(fai) as f:
        for line in f:
            fields = line.strip().split("\t")
            chrom_sizes[fields[0]] = int(fields[1])

    # Accumulate entries and add them in one documented list-form call;
    # genomecov -bg output is already sorted as pyBigWig requires.
    chroms, starts, ends, values = [], [], [], []
    with open(bedgraph) as f:
        for line in f:
            chrom, start, end, value = line.strip().split()
            chroms.append(chrom)
            starts.append(int(start))
            ends.append(int(end))
            values.append(float(value))

    bw = pyBigWig.open(str(bigwig), "w")
    try:
        bw.addHeader(list(chrom_sizes.items()))
        if chroms:
            bw.addEntries(chroms, starts, ends=ends, values=values)
    finally:
        # Always release the handle so a partial file isn't left locked.
        bw.close()

    print(f"BigWig written: {bigwig}")
    return str(bigwig)
|