smftools 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +2 -6
- smftools/_version.py +1 -1
- smftools/cli/__init__.py +0 -0
- smftools/cli/cli_flows.py +94 -0
- smftools/cli/hmm_adata.py +338 -0
- smftools/cli/load_adata.py +577 -0
- smftools/cli/preprocess_adata.py +363 -0
- smftools/cli/spatial_adata.py +564 -0
- smftools/cli_entry.py +435 -0
- smftools/config/conversion.yaml +11 -6
- smftools/config/deaminase.yaml +12 -7
- smftools/config/default.yaml +36 -25
- smftools/config/direct.yaml +25 -1
- smftools/config/discover_input_files.py +115 -0
- smftools/config/experiment_config.py +109 -12
- smftools/informatics/__init__.py +13 -7
- smftools/informatics/archived/fast5_to_pod5.py +43 -0
- smftools/informatics/archived/helpers/archived/__init__.py +71 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
- smftools/informatics/{helpers → archived/helpers/archived}/aligned_BAM_to_bed.py +6 -4
- smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
- smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
- smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
- smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +1 -1
- smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
- smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
- smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
- smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
- smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
- smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
- smftools/informatics/{helpers → archived/helpers/archived}/plot_bed_histograms.py +0 -19
- smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +6 -5
- smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +7 -7
- smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
- smftools/informatics/bam_functions.py +812 -0
- smftools/informatics/basecalling.py +67 -0
- smftools/informatics/bed_functions.py +366 -0
- smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +42 -30
- smftools/informatics/fasta_functions.py +255 -0
- smftools/informatics/h5ad_functions.py +197 -0
- smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +142 -59
- smftools/informatics/modkit_functions.py +129 -0
- smftools/informatics/ohe.py +160 -0
- smftools/informatics/pod5_functions.py +224 -0
- smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
- smftools/plotting/autocorrelation_plotting.py +1 -3
- smftools/plotting/general_plotting.py +1037 -362
- smftools/preprocessing/__init__.py +2 -0
- smftools/preprocessing/append_base_context.py +3 -3
- smftools/preprocessing/append_binary_layer_by_base_context.py +4 -4
- smftools/preprocessing/binarize.py +17 -0
- smftools/preprocessing/binarize_on_Youden.py +2 -2
- smftools/preprocessing/calculate_position_Youden.py +1 -1
- smftools/preprocessing/calculate_read_modification_stats.py +1 -1
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +19 -19
- smftools/preprocessing/flag_duplicate_reads.py +1 -1
- smftools/readwrite.py +266 -140
- {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/METADATA +10 -9
- {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/RECORD +82 -70
- smftools-0.2.3.dist-info/entry_points.txt +2 -0
- smftools/cli.py +0 -184
- smftools/informatics/fast5_to_pod5.py +0 -24
- smftools/informatics/helpers/__init__.py +0 -73
- smftools/informatics/helpers/align_and_sort_BAM.py +0 -86
- smftools/informatics/helpers/bam_qc.py +0 -66
- smftools/informatics/helpers/bed_to_bigwig.py +0 -39
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -378
- smftools/informatics/helpers/discover_input_files.py +0 -100
- smftools/informatics/helpers/index_fasta.py +0 -12
- smftools/informatics/helpers/make_dirs.py +0 -21
- smftools/informatics/readwrite.py +0 -106
- smftools/informatics/subsample_fasta_from_bed.py +0 -47
- smftools/load_adata.py +0 -1346
- smftools-0.2.1.dist-info/entry_points.txt +0 -2
- /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
- /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
- /smftools/informatics/{helpers/binarize_converted_base_identities.py → binarize_converted_base_identities.py} +0 -0
- /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
- {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/WHEEL +0 -0
- {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/licenses/LICENSE +0 -0
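
Selected hunks from the changed files follow. To reproduce the full comparison locally, a minimal sketch (assumes `pip` and network access; wheel files are ordinary zip archives, so the standard library suffices):

    # Download both wheels and diff the unpacked trees.
    import subprocess, zipfile, filecmp
    from pathlib import Path

    for ver in ("0.2.1", "0.2.3"):
        dest = Path(f"wheels/{ver}")
        dest.mkdir(parents=True, exist_ok=True)
        subprocess.run(
            ["pip", "download", f"smftools=={ver}", "--no-deps", "-d", str(dest)],
            check=True,
        )
        zipfile.ZipFile(next(dest.glob("*.whl"))).extractall(dest / "src")

    # Report files added, removed, or changed between the versions
    filecmp.dircmp("wheels/0.2.1/src", "wheels/0.2.3/src").report_full_closure()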
--- smftools/informatics/helpers/aligned_BAM_to_bed.py
+++ smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py
@@ -15,22 +15,23 @@ def aligned_BAM_to_bed(aligned_BAM, out_dir, fasta, make_bigwigs, threads=None):
     """
     import subprocess
     import os
+    from pathlib import Path
     import pysam
     import numpy as np
     import concurrent.futures
     from concurrent.futures import ProcessPoolExecutor
     from .bed_to_bigwig import bed_to_bigwig
-    from
+    from ...readwrite import make_dirs
     from .plot_bed_histograms import plot_bed_histograms
 
     threads = threads or os.cpu_count() # Use max available cores if not specified
 
     # Create necessary directories
-    plotting_dir =
-    bed_dir =
+    plotting_dir = out_dir / "bed_cov_histograms"
+    bed_dir = out_dir / "beds"
     make_dirs([plotting_dir, bed_dir])
 
-    bed_output =
+    bed_output = bed_dir / str(aligned_BAM.name).replace(".bam", "_bed.bed")
 
     print(f"Creating BED-like file from BAM (with MAPQ and avg base quality): {aligned_BAM}")
 
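The rewritten paths assume `out_dir` and `aligned_BAM` now arrive as `pathlib.Path` objects rather than strings; a minimal sketch of the operations the new lines rely on (names are hypothetical):

    from pathlib import Path

    out_dir = Path("results")                  # hypothetical output root
    aligned_BAM = Path("results/sample1.bam")  # hypothetical input BAM

    bed_dir = out_dir / "beds"                 # the / operator joins path components
    bed_output = bed_dir / str(aligned_BAM.name).replace(".bam", "_bed.bed")
    print(bed_output)                          # results/beds/sample1_bed.bed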
@@ -64,6 +65,7 @@ def aligned_BAM_to_bed(aligned_BAM, out_dir, fasta, make_bigwigs, threads=None):
 
     def split_bed(bed):
         """Splits into aligned and unaligned reads (chrom == '*')."""
+        bed = str(bed)
         aligned = bed.replace(".bed", "_aligned.bed")
         unaligned = bed.replace(".bed", "_unaligned.bed")
         with open(bed, "r") as infile, open(aligned, "w") as aligned_out, open(unaligned, "w") as unaligned_out:
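The added `bed = str(bed)` is not cosmetic: `pathlib.Path` also defines a `replace()` method, but it renames a file on disk and takes a single target argument, so the two-argument calls below would raise `TypeError` on a `Path`. A sketch of the pitfall:

    from pathlib import Path

    bed = Path("sample.bed")
    # bed.replace(".bed", "_aligned.bed")  # TypeError: Path.replace() renames a
    #                                      # file; it is not string substitution
    bed = str(bed)                         # restore str.replace semantics
    aligned = bed.replace(".bed", "_aligned.bed")  # 'sample_aligned.bed'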
--- /dev/null
+++ smftools/informatics/archived/helpers/archived/bam_qc.py
@@ -0,0 +1,213 @@
+from __future__ import annotations
+
+import os
+from pathlib import Path
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import Iterable, Optional, Tuple, List
+
+def bam_qc(
+    bam_files: Iterable[str | Path],
+    bam_qc_dir: str | Path,
+    threads: Optional[int],
+    modality: str,
+    stats: bool = True,
+    flagstats: bool = True,
+    idxstats: bool = True,
+) -> None:
+    """
+    QC for BAM/CRAMs: stats, flagstat, idxstats.
+    Prefers pysam; falls back to `samtools` if needed.
+    Runs BAMs in parallel (up to `threads`, default serial).
+    """
+    import subprocess
+    import shutil
+
+    # Try to import pysam once
+    try:
+        import pysam
+        HAVE_PYSAM = True
+    except Exception:
+        HAVE_PYSAM = False
+
+    bam_qc_dir = Path(bam_qc_dir)
+    bam_qc_dir.mkdir(parents=True, exist_ok=True)
+
+    bam_files = [Path(b) for b in bam_files]
+
+    def _has_index(p: Path) -> bool:
+        if p.suffix.lower() == ".bam":
+            bai = p.with_suffix(p.suffix + ".bai")
+            bai_alt = Path(str(p) + ".bai")
+            return bai.exists() or bai_alt.exists()
+        if p.suffix.lower() == ".cram":
+            crai = Path(str(p) + ".crai")
+            return crai.exists()
+        return False
+
+    def _ensure_index(p: Path) -> None:
+        if _has_index(p):
+            return
+        if HAVE_PYSAM:
+            # pysam.index supports both BAM & CRAM
+            pysam.index(str(p))
+        else:
+            cmd = ["samtools", "index", str(p)]
+            subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+    def _run_one(bam: Path) -> Tuple[Path, List[Tuple[str, int]]]:
+        # outputs + return (file, [(task_name, returncode)])
+        results: List[Tuple[str, int]] = []
+        base = bam.stem  # filename without .bam
+        out_stats = bam_qc_dir / f"{base}_stats.txt"
+        out_flag = bam_qc_dir / f"{base}_flagstat.txt"
+        out_idx = bam_qc_dir / f"{base}_idxstats.txt"
+
+        # Make sure index exists (samtools stats/flagstat don't require, idxstats does)
+        try:
+            _ensure_index(bam)
+        except Exception as e:
+            # Still attempt stats/flagstat if requested
+            print(f"[warn] Indexing failed for {bam}: {e}")
+
+        # Choose runner per task
+        def run_stats():
+            if not stats:
+                return
+            if HAVE_PYSAM and hasattr(pysam, "stats"):
+                txt = pysam.stats(str(bam))
+                out_stats.write_text(txt)
+                results.append(("stats(pysam)", 0))
+            else:
+                cmd = ["samtools", "stats", str(bam)]
+                with open(out_stats, "w") as fh:
+                    cp = subprocess.run(cmd, stdout=fh, stderr=subprocess.PIPE)
+                results.append(("stats(samtools)", cp.returncode))
+                if cp.returncode != 0:
+                    raise RuntimeError(cp.stderr.decode(errors="replace"))
+
+        def run_flagstat():
+            if not flagstats:
+                return
+            if HAVE_PYSAM and hasattr(pysam, "flagstat"):
+                txt = pysam.flagstat(str(bam))
+                out_flag.write_text(txt)
+                results.append(("flagstat(pysam)", 0))
+            else:
+                cmd = ["samtools", "flagstat", str(bam)]
+                with open(out_flag, "w") as fh:
+                    cp = subprocess.run(cmd, stdout=fh, stderr=subprocess.PIPE)
+                results.append(("flagstat(samtools)", cp.returncode))
+                if cp.returncode != 0:
+                    raise RuntimeError(cp.stderr.decode(errors="replace"))
+
+        def run_idxstats():
+            if not idxstats:
+                return
+            if HAVE_PYSAM and hasattr(pysam, "idxstats"):
+                txt = pysam.idxstats(str(bam))
+                out_idx.write_text(txt)
+                results.append(("idxstats(pysam)", 0))
+            else:
+                cmd = ["samtools", "idxstats", str(bam)]
+                with open(out_idx, "w") as fh:
+                    cp = subprocess.run(cmd, stdout=fh, stderr=subprocess.PIPE)
+                results.append(("idxstats(samtools)", cp.returncode))
+                if cp.returncode != 0:
+                    raise RuntimeError(cp.stderr.decode(errors="replace"))
+
+        # Sanity: ensure samtools exists if pysam missing
+        if not HAVE_PYSAM:
+            if not shutil.which("samtools"):
+                raise RuntimeError("Neither pysam nor samtools is available in PATH.")
+
+        # Execute tasks (serial per file; parallelized across files)
+        run_stats()
+        run_flagstat()
+        run_idxstats()
+        return bam, results
+
+    # Parallel across BAMs
+    max_workers = int(threads) if threads and int(threads) > 0 else 1
+    futures = []
+    with ThreadPoolExecutor(max_workers=max_workers) as ex:
+        for b in bam_files:
+            futures.append(ex.submit(_run_one, b))
+
+        for fut in as_completed(futures):
+            try:
+                bam, res = fut.result()
+                summary = ", ".join(f"{name}:{rc}" for name, rc in res) or "no-op"
+                print(f"[qc] {bam.name}: {summary}")
+            except Exception as e:
+                print(f"[error] QC failed: {e}")
+
+    # Placeholders to keep your signature stable
+    if modality not in {"conversion", "direct"}:
+        print(f"[warn] Unknown modality '{modality}', continuing.")
+
+    print("QC processing completed.")
+
+# def bam_qc(bam_files, bam_qc_dir, threads, modality, stats=True, flagstats=True, idxstats=True):
+#     """
+#     Performs QC on BAM files by running samtools stats, flagstat, and idxstats.
+
+#     Parameters:
+#     - bam_files: List of BAM file paths.
+#     - bam_qc_dir: Directory to save QC reports.
+#     - threads: Number threads to use.
+#     - modality: 'conversion' or 'direct' (affects processing mode).
+#     - stats: Run `samtools stats` if True.
+#     - flagstats: Run `samtools flagstat` if True.
+#     - idxstats: Run `samtools idxstats` if True.
+#     """
+#     import os
+#     import subprocess
+
+#     # Ensure the QC output directory exists
+#     os.makedirs(bam_qc_dir, exist_ok=True)
+
+#     if threads:
+#         threads = str(threads)
+#     else:
+#         pass
+
+#     for bam in bam_files:
+#         bam_name = os.path.basename(bam).replace(".bam", "")  # Extract filename without extension
+
+#         # Run samtools QC commands based on selected options
+#         if stats:
+#             stats_out = os.path.join(bam_qc_dir, f"{bam_name}_stats.txt")
+#             if threads:
+#                 command = ["samtools", "stats", "-@", threads, bam]
+#             else:
+#                 command = ["samtools", "stats", bam]
+#             print(f"Running: {' '.join(command)} > {stats_out}")
+#             with open(stats_out, "w") as out_file:
+#                 subprocess.run(command, stdout=out_file)
+
+#         if flagstats:
+#             flagstats_out = os.path.join(bam_qc_dir, f"{bam_name}_flagstat.txt")
+#             if threads:
+#                 command = ["samtools", "flagstat", "-@", threads, bam]
+#             else:
+#                 command = ["samtools", "flagstat", bam]
+#             print(f"Running: {' '.join(command)} > {flagstats_out}")
+#             with open(flagstats_out, "w") as out_file:
+#                 subprocess.run(command, stdout=out_file)
+
+#         if idxstats:
+#             idxstats_out = os.path.join(bam_qc_dir, f"{bam_name}_idxstats.txt")
+#             if threads:
+#                 command = ["samtools", "idxstats", "-@", threads, bam]
+#             else:
+#                 command = ["samtools", "idxstats", bam]
+#             print(f"Running: {' '.join(command)} > {idxstats_out}")
+#             with open(idxstats_out, "w") as out_file:
+#                 subprocess.run(command, stdout=out_file)
+
+#     if modality == 'conversion':
+#         pass
+#     elif modality == 'direct':
+#         pass
+
+#     print("QC processing completed.")
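A hedged usage sketch for the new helper (input paths are hypothetical; the import path is assumed from the archived location in the file list above):

    from pathlib import Path
    from smftools.informatics.archived.helpers.archived.bam_qc import bam_qc  # assumed path

    bam_qc(
        bam_files=[Path("aligned/sample1.bam"), Path("aligned/sample2.bam")],
        bam_qc_dir="qc_reports",   # created if missing
        threads=4,                 # files are processed in a 4-worker thread pool
        modality="conversion",     # anything outside {'conversion', 'direct'} only warns
    )
    # Writes sample1_stats.txt, sample1_flagstat.txt, sample1_idxstats.txt, ... to qc_reports/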
--- /dev/null
+++ smftools/informatics/archived/helpers/archived/bed_to_bigwig.py
@@ -0,0 +1,90 @@
+from pathlib import Path
+import pybedtools
+import pyBigWig
+
+def bed_to_bigwig(fasta: str, bed: str) -> str:
+    """
+    BED → bedGraph → bigWig
+    Requires:
+      - FASTA must have .fai index present
+    """
+
+    bed = Path(bed)
+    fa = Path(fasta)  # path to .fa
+    parent = bed.parent
+    stem = bed.stem
+    fa_stem = fa.stem
+    fai = parent / f"{fa_stem}.fai"
+
+    bedgraph = parent / f"{stem}.bedgraph"
+    bigwig = parent / f"{stem}.bw"
+
+    # 1) Compute coverage → bedGraph
+    print(f"[pybedtools] generating coverage bedgraph from {bed}")
+    bt = pybedtools.BedTool(str(bed))
+    # bedtools genomecov -bg
+    coverage = bt.genome_coverage(bg=True, genome=str(fai))
+    coverage.saveas(str(bedgraph))
+
+    # 2) Convert bedGraph → BigWig via pyBigWig
+    print(f"[pyBigWig] converting bedgraph → bigwig: {bigwig}")
+
+    # read chrom sizes from the FASTA .fai index
+    chrom_sizes = {}
+    with open(fai) as f:
+        for line in f:
+            fields = line.strip().split("\t")
+            chrom = fields[0]
+            size = int(fields[1])
+            chrom_sizes[chrom] = size
+
+    bw = pyBigWig.open(str(bigwig), "w")
+    bw.addHeader(list(chrom_sizes.items()))
+
+    with open(bedgraph) as f:
+        for line in f:
+            chrom, start, end, coverage = line.strip().split()
+            bw.addEntries(chrom, int(start), ends=int(end), values=float(coverage))
+
+    bw.close()
+
+    print(f"BigWig written: {bigwig}")
+    return str(bigwig)
+
+# def bed_to_bigwig(fasta, bed):
+#     """
+#     Takes a bed file of reads and makes a bedgraph plus a bigwig
+
+#     Parameters:
+#         fasta (str): File path to the reference genome to align to.
+#         bed (str): File path to the input bed.
+#     Returns:
+#         None
+#     """
+#     import os
+#     import subprocess
+
+#     bed_basename = os.path.basename(bed)
+#     parent_dir = os.path.dirname(bed)
+#     bed_basename_minus_suffix = bed_basename.split('.bed')[0]
+#     fasta_basename = os.path.basename(fasta)
+#     fasta_dir = os.path.dirname(fasta)
+#     fasta_basename_minus_suffix = fasta_basename.split('.fa')[0]
+#     chrom_basename = fasta_basename_minus_suffix + '.chrom.sizes'
+#     chrom_path = os.path.join(fasta_dir, chrom_basename)
+#     bedgraph_basename = bed_basename_minus_suffix + '_bedgraph.bedgraph'
+#     bedgraph_output = os.path.join(parent_dir, bedgraph_basename)
+#     bigwig_basename = bed_basename_minus_suffix + '_bigwig.bw'
+#     bigwig_output = os.path.join(parent_dir, bigwig_basename)
+
+#     # Make the bedgraph
+#     with open(bedgraph_output, 'w') as outfile:
+#         # Command as a list
+#         command = ["bedtools", "genomecov", "-i", bed, "-g", chrom_path, "-bg"]
+#         print(f'Making bedgraph from {bed_basename}')
+#         subprocess.run(command, stdout=outfile)
+
+#     # Make the bigwig
+#     command = ["bedGraphToBigWig", bedgraph_output, chrom_path, bigwig_output]
+#     print(f'Making bigwig from {bedgraph_basename}')
+#     subprocess.run(command)
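A usage sketch with hypothetical paths. Note that, as written, the helper resolves the index as `<bed_dir>/<fasta_stem>.fai`, i.e. next to the BED rather than next to the FASTA, so the index may need to be placed there first; `pysam.faidx` can generate it:

    import shutil
    import pysam
    from pathlib import Path

    fasta = "ref/genome.fa"               # hypothetical reference
    bed = "beds/sample1_bed_aligned.bed"  # hypothetical aligned BED

    pysam.faidx(fasta)                    # writes ref/genome.fa.fai
    # the helper reads <bed_dir>/<fasta_stem>.fai, so place a copy there
    shutil.copy(fasta + ".fai", Path(bed).parent / "genome.fai")

    bw_path = bed_to_bigwig(fasta, bed)   # returns the .bw path as a str
    print(bw_path)                        # beds/sample1_bed_aligned.bw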
--- /dev/null
+++ smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py
@@ -0,0 +1,259 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Dict, List, Any, Tuple, Union, Optional
+import re
+from itertools import zip_longest
+
+import pysam
+from tqdm import tqdm
+
+
+def concatenate_fastqs_to_bam(
+    fastq_files: List[Union[str, Tuple[str, str], Path, Tuple[Path, Path]]],
+    output_bam: Union[str, Path],
+    barcode_tag: str = "BC",
+    barcode_map: Optional[Dict[Union[str, Path], str]] = None,
+    add_read_group: bool = True,
+    rg_sample_field: Optional[str] = None,
+    progress: bool = True,
+    auto_pair: bool = True,
+) -> Dict[str, Any]:
+    """
+    Concatenate FASTQ(s) into an **unaligned** BAM. Supports single-end and paired-end.
+
+    Parameters
+    ----------
+    fastq_files : list[Path|str] or list[(Path|str, Path|str)]
+        Either explicit pairs (R1,R2) or a flat list of FASTQs (auto-paired if auto_pair=True).
+    output_bam : Path|str
+        Output BAM path (parent directory will be created).
+    barcode_tag : str
+        SAM tag used to store barcode on each read (default 'BC').
+    barcode_map : dict or None
+        Optional mapping {path: barcode} to override automatic filename-based barcode extraction.
+    add_read_group : bool
+        If True, add @RG header lines (ID = barcode) and set each read's RG tag.
+    rg_sample_field : str or None
+        If set, include SM=<value> in @RG.
+    progress : bool
+        Show tqdm progress bars.
+    auto_pair : bool
+        Auto-pair R1/R2 based on filename patterns if given a flat list.
+
+    Returns
+    -------
+    dict
+        {'total_reads','per_file','paired_pairs_written','singletons_written','barcodes'}
+    """
+
+    # ---------- helpers (Pathlib-only) ----------
+    def _strip_fastq_ext(p: Path) -> str:
+        """
+        Remove common FASTQ multi-suffixes; return stem-like name.
+        """
+        name = p.name
+        lowers = name.lower()
+        for ext in (".fastq.gz", ".fq.gz", ".fastq.bz2", ".fq.bz2", ".fastq.xz", ".fq.xz", ".fastq", ".fq"):
+            if lowers.endswith(ext):
+                return name[: -len(ext)]
+        return p.stem  # fallback: remove last suffix only
+
+    def _extract_barcode_from_filename(p: Path) -> str:
+        stem = _strip_fastq_ext(p)
+        if "_" in stem:
+            token = stem.split("_")[-1]
+            if token:
+                return token
+        return stem
+
+    def _classify_read_token(stem: str) -> Tuple[Optional[str], Optional[int]]:
+        # return (prefix, readnum) if matches; else (None, None)
+        patterns = [
+            r"(?i)(.*?)[._-]r?([12])$",  # prefix_R1 / prefix.r2 / prefix-1
+            r"(?i)(.*?)[._-]read[_-]?([12])$",  # prefix_read1
+        ]
+        for pat in patterns:
+            m = re.match(pat, stem)
+            if m:
+                return m.group(1), int(m.group(2))
+        return None, None
+
+    def _pair_by_filename(paths: List[Path]) -> Tuple[List[Tuple[Path, Path]], List[Path]]:
+        pref_map: Dict[str, Dict[int, Path]] = {}
+        unpaired: List[Path] = []
+        for pth in paths:
+            stem = _strip_fastq_ext(pth)
+            pref, num = _classify_read_token(stem)
+            if pref is None:
+                unpaired.append(pth)
+            else:
+                entry = pref_map.setdefault(pref, {})
+                entry[num] = pth
+        pairs: List[Tuple[Path, Path]] = []
+        leftovers: List[Path] = []
+        for d in pref_map.values():
+            if 1 in d and 2 in d:
+                pairs.append((d[1], d[2]))
+            else:
+                leftovers.extend(d.values())
+        leftovers.extend(unpaired)
+        return pairs, leftovers
+
+    def _fastq_iter(p: Path):
+        # pysam.FastxFile handles compressed extensions transparently
+        with pysam.FastxFile(str(p)) as fx:
+            for rec in fx:
+                yield rec  # rec.name, rec.sequence, rec.quality
+
+    def _make_unaligned_segment(
+        name: str,
+        seq: str,
+        qual: Optional[str],
+        bc: str,
+        read1: bool,
+        read2: bool,
+    ) -> pysam.AlignedSegment:
+        a = pysam.AlignedSegment()
+        a.query_name = name
+        a.query_sequence = seq
+        if qual is not None:
+            a.query_qualities = pysam.qualitystring_to_array(qual)
+        a.is_unmapped = True
+        a.is_paired = read1 or read2
+        a.is_read1 = read1
+        a.is_read2 = read2
+        a.mate_is_unmapped = a.is_paired
+        a.reference_id = -1
+        a.reference_start = -1
+        a.next_reference_id = -1
+        a.next_reference_start = -1
+        a.template_length = 0
+        a.set_tag(barcode_tag, str(bc), value_type="Z")
+        if add_read_group:
+            a.set_tag("RG", str(bc), value_type="Z")
+        return a
+
+    # ---------- normalize inputs to Path ----------
+    def _to_path_pair(x) -> Tuple[Path, Path]:
+        a, b = x
+        return Path(a), Path(b)
+
+    explicit_pairs: List[Tuple[Path, Path]] = []
+    singles: List[Path] = []
+
+    if not isinstance(fastq_files, (list, tuple)):
+        raise ValueError("fastq_files must be a list of paths or list of (R1,R2) tuples.")
+
+    if all(isinstance(x, (list, tuple)) and len(x) == 2 for x in fastq_files):
+        explicit_pairs = [_to_path_pair(x) for x in fastq_files]
+    else:
+        flat_paths = [Path(x) for x in fastq_files if x is not None]
+        if auto_pair:
+            explicit_pairs, leftovers = _pair_by_filename(flat_paths)
+            singles = leftovers
+        else:
+            singles = flat_paths
+
+    output_bam = Path(output_bam)
+    output_bam.parent.mkdir(parents=True, exist_ok=True)
+
+    # ---------- barcodes ----------
+    barcode_map = {Path(k): v for k, v in (barcode_map or {}).items()}
+    per_path_barcode: Dict[Path, str] = {}
+    barcodes_in_order: List[str] = []
+
+    for r1, r2 in explicit_pairs:
+        bc = barcode_map.get(r1) or barcode_map.get(r2) or _extract_barcode_from_filename(r1)
+        per_path_barcode[r1] = bc
+        per_path_barcode[r2] = bc
+        if bc not in barcodes_in_order:
+            barcodes_in_order.append(bc)
+    for pth in singles:
+        bc = barcode_map.get(pth) or _extract_barcode_from_filename(pth)
+        per_path_barcode[pth] = bc
+        if bc not in barcodes_in_order:
+            barcodes_in_order.append(bc)
+
+    # ---------- BAM header ----------
+    header = {"HD": {"VN": "1.6", "SO": "unknown"}, "SQ": []}
+    if add_read_group:
+        header["RG"] = [{"ID": bc, **({"SM": rg_sample_field} if rg_sample_field else {})} for bc in barcodes_in_order]
+    header.setdefault("PG", []).append(
+        {"ID": "concat-fastq", "PN": "concatenate_fastqs_to_bam", "VN": "1"}
+    )
+
+    # ---------- counters ----------
+    per_file_counts: Dict[Path, int] = {}
+    total_written = 0
+    paired_pairs_written = 0
+    singletons_written = 0
+
+    # ---------- write BAM ----------
+    with pysam.AlignmentFile(str(output_bam), "wb", header=header) as bam_out:
+        # Paired
+        it_pairs = explicit_pairs
+        if progress and it_pairs:
+            it_pairs = tqdm(it_pairs, desc="Paired FASTQ→BAM")
+        for r1_path, r2_path in it_pairs:
+            if not (r1_path.exists() and r2_path.exists()):
+                raise FileNotFoundError(f"Paired file missing: {r1_path} or {r2_path}")
+            bc = per_path_barcode.get(r1_path) or per_path_barcode.get(r2_path) or "barcode"
+
+            it1 = _fastq_iter(r1_path)
+            it2 = _fastq_iter(r2_path)
+
+            for rec1, rec2 in zip_longest(it1, it2, fillvalue=None):
+                def _clean(n: Optional[str]) -> Optional[str]:
+                    if n is None:
+                        return None
+                    return re.sub(r"(?:/1$|/2$|\s[12]$)", "", n)
+
+                name = (
+                    _clean(getattr(rec1, "name", None))
+                    or _clean(getattr(rec2, "name", None))
+                    or getattr(rec1, "name", None)
+                    or getattr(rec2, "name", None)
+                )
+
+                if rec1 is not None:
+                    a1 = _make_unaligned_segment(name, rec1.sequence, rec1.quality, bc, read1=True, read2=False)
+                    bam_out.write(a1)
+                    per_file_counts[r1_path] = per_file_counts.get(r1_path, 0) + 1
+                    total_written += 1
+                if rec2 is not None:
+                    a2 = _make_unaligned_segment(name, rec2.sequence, rec2.quality, bc, read1=False, read2=True)
+                    bam_out.write(a2)
+                    per_file_counts[r2_path] = per_file_counts.get(r2_path, 0) + 1
+                    total_written += 1
+
+                if rec1 is not None and rec2 is not None:
+                    paired_pairs_written += 1
+                else:
+                    if rec1 is not None:
+                        singletons_written += 1
+                    if rec2 is not None:
+                        singletons_written += 1
+
+        # Singles
+        it_singles = singles
+        if progress and it_singles:
+            it_singles = tqdm(it_singles, desc="Single FASTQ→BAM")
+        for pth in it_singles:
+            if not pth.exists():
+                raise FileNotFoundError(pth)
+            bc = per_path_barcode.get(pth, "barcode")
+            for rec in _fastq_iter(pth):
+                a = _make_unaligned_segment(rec.name, rec.sequence, rec.quality, bc, read1=False, read2=False)
+                bam_out.write(a)
+                per_file_counts[pth] = per_file_counts.get(pth, 0) + 1
+                total_written += 1
+                singletons_written += 1
+
+    return {
+        "total_reads": total_written,
+        "per_file": {str(k): v for k, v in per_file_counts.items()},
+        "paired_pairs_written": paired_pairs_written,
+        "singletons_written": singletons_written,
+        "barcodes": barcodes_in_order,
+    }
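A hedged usage sketch (hypothetical FASTQ names that match the auto-pairing patterns above):

    stats = concatenate_fastqs_to_bam(
        fastq_files=[
            "runs/sampleA_R1.fastq.gz",  # auto-paired with the _R2 file below
            "runs/sampleA_R2.fastq.gz",
            "runs/sampleB.fastq.gz",     # no R1/R2 token, written as singletons
        ],
        output_bam="unaligned/combined.bam",
        rg_sample_field="exp1",          # adds SM=exp1 to each @RG line
    )
    print(stats["total_reads"], stats["paired_pairs_written"], stats["barcodes"])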
--- smftools/informatics/helpers/count_aligned_reads.py
+++ smftools/informatics/archived/helpers/archived/count_aligned_reads.py
@@ -14,7 +14,7 @@ def count_aligned_reads(bam_file):
         record_counts (dict): A dictionary keyed by reference record instance that points toa tuple containing the total reads mapped to the record and the fraction of mapped reads which map to the record.
 
     """
-    from
+    from ... import readwrite
     import pysam
     from tqdm import tqdm
    from collections import defaultdict
@@ -25,7 +25,7 @@ def count_aligned_reads(bam_file):
     # Make a dictionary, keyed by the reference_name of reference chromosome that points to an integer number of read counts mapped to the chromosome, as well as the proportion of mapped reads in that chromosome
     record_counts = defaultdict(int)
 
-    with pysam.AlignmentFile(bam_file, "rb") as bam:
+    with pysam.AlignmentFile(str(bam_file), "rb") as bam:
         total_reads = bam.mapped + bam.unmapped
         # Iterate over reads to get the total mapped read counts and the reads that map to each reference
         for read in tqdm(bam, desc='Counting aligned reads in BAM', total=total_reads):
--- smftools/informatics/helpers/demux_and_index_BAM.py
+++ smftools/informatics/archived/helpers/archived/demux_and_index_BAM.py
@@ -18,13 +18,12 @@ def demux_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit,
         bam_files (list): List of split BAM file path strings
     Splits an input BAM file on barcode value and makes a BAM index file.
     """
-    from
+    from ...readwrite import make_dirs
     import os
     import subprocess
     import glob
-    from .make_dirs import make_dirs
 
-    input_bam = aligned_sorted_BAM
+    input_bam = aligned_sorted_BAM.with_suffix(bam_suffix)
     command = ["dorado", "demux", "--kit-name", barcode_kit]
     if barcode_both_ends:
         command.append("--barcode-both-ends")
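For reference, the argv list this function assembles corresponds to a command of the following shape; the kit name, thread count, and paths below are illustrative placeholders, not values from the diff:

    # Mirrors the list built by demux_and_index_BAM with all options enabled.
    command = [
        "dorado", "demux",
        "--kit-name", "SQK-NBD114-24",  # example barcode kit
        "--barcode-both-ends",          # only when barcode_both_ends is True
        "-t", "8",                      # only when threads is set
        "--emit-summary", "--sort-bam",
        "--output-dir", "split/",
        "aligned_sorted.bam",
    ]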
@@ -34,17 +33,16 @@ def demux_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit,
         command += ["-t", str(threads)]
     else:
         pass
-    command += ["--emit-summary", "--sort-bam", "--output-dir", split_dir]
-    command.append(input_bam)
+    command += ["--emit-summary", "--sort-bam", "--output-dir", str(split_dir)]
+    command.append(str(input_bam))
     command_string = ' '.join(command)
     print(f"Running: {command_string}")
     subprocess.run(command)
 
-
-
-
-
-    bam_files.sort()
+    bam_files = sorted(
+        p for p in split_dir.glob(f"*{bam_suffix}")
+        if p.is_file() and p.suffix == bam_suffix and "unclassified" not in p.name
+    )
 
     if not bam_files:
         raise FileNotFoundError(f"No BAM files found in {split_dir} with suffix {bam_suffix}")
--- smftools/informatics/helpers/extract_base_identities.py
+++ smftools/informatics/archived/helpers/archived/extract_base_identities.py
@@ -27,7 +27,7 @@ def extract_base_identities(bam_file, chromosome, positions, max_reference_lengt
     mismatch_counts_per_read = defaultdict(lambda: defaultdict(Counter))
 
     #print(f"{timestamp} Reading reads from {chromosome} BAM file: {bam_file}")
-    with pysam.AlignmentFile(bam_file, "rb") as bam:
+    with pysam.AlignmentFile(str(bam_file), "rb") as bam:
         total_reads = bam.mapped
         ref_seq = sequence.upper()
         for read in bam.fetch(chromosome):